diff options
103 files changed, 2532 insertions, 1829 deletions
diff --git a/Documentation/ABI/testing/sysfs-kernel-slab b/Documentation/ABI/testing/sysfs-kernel-slab index 6dcf75e594f..8b093f8222d 100644 --- a/Documentation/ABI/testing/sysfs-kernel-slab +++ b/Documentation/ABI/testing/sysfs-kernel-slab @@ -45,8 +45,9 @@ KernelVersion: 2.6.25 Contact: Pekka Enberg <penberg@cs.helsinki.fi>, Christoph Lameter <cl@linux-foundation.org> Description: - The alloc_fastpath file is read-only and specifies how many - objects have been allocated using the fast path. + The alloc_fastpath file shows how many objects have been + allocated using the fast path. It can be written to clear the + current count. Available when CONFIG_SLUB_STATS is enabled. What: /sys/kernel/slab/cache/alloc_from_partial @@ -55,9 +56,10 @@ KernelVersion: 2.6.25 Contact: Pekka Enberg <penberg@cs.helsinki.fi>, Christoph Lameter <cl@linux-foundation.org> Description: - The alloc_from_partial file is read-only and specifies how - many times a cpu slab has been full and it has been refilled - by using a slab from the list of partially used slabs. + The alloc_from_partial file shows how many times a cpu slab has + been full and it has been refilled by using a slab from the list + of partially used slabs. It can be written to clear the current + count. Available when CONFIG_SLUB_STATS is enabled. What: /sys/kernel/slab/cache/alloc_refill @@ -66,9 +68,9 @@ KernelVersion: 2.6.25 Contact: Pekka Enberg <penberg@cs.helsinki.fi>, Christoph Lameter <cl@linux-foundation.org> Description: - The alloc_refill file is read-only and specifies how many - times the per-cpu freelist was empty but there were objects - available as the result of remote cpu frees. + The alloc_refill file shows how many times the per-cpu freelist + was empty but there were objects available as the result of + remote cpu frees. It can be written to clear the current count. Available when CONFIG_SLUB_STATS is enabled. What: /sys/kernel/slab/cache/alloc_slab @@ -77,8 +79,9 @@ KernelVersion: 2.6.25 Contact: Pekka Enberg <penberg@cs.helsinki.fi>, Christoph Lameter <cl@linux-foundation.org> Description: - The alloc_slab file is read-only and specifies how many times - a new slab had to be allocated from the page allocator. + The alloc_slab file is shows how many times a new slab had to + be allocated from the page allocator. It can be written to + clear the current count. Available when CONFIG_SLUB_STATS is enabled. What: /sys/kernel/slab/cache/alloc_slowpath @@ -87,9 +90,10 @@ KernelVersion: 2.6.25 Contact: Pekka Enberg <penberg@cs.helsinki.fi>, Christoph Lameter <cl@linux-foundation.org> Description: - The alloc_slowpath file is read-only and specifies how many - objects have been allocated using the slow path because of a - refill or allocation from a partial or new slab. + The alloc_slowpath file shows how many objects have been + allocated using the slow path because of a refill or + allocation from a partial or new slab. It can be written to + clear the current count. Available when CONFIG_SLUB_STATS is enabled. What: /sys/kernel/slab/cache/cache_dma @@ -117,10 +121,11 @@ KernelVersion: 2.6.31 Contact: Pekka Enberg <penberg@cs.helsinki.fi>, Christoph Lameter <cl@linux-foundation.org> Description: - The file cpuslab_flush is read-only and specifies how many - times a cache's cpu slabs have been flushed as the result of - destroying or shrinking a cache, a cpu going offline, or as - the result of forcing an allocation from a certain node. + The file cpuslab_flush shows how many times a cache's cpu slabs + have been flushed as the result of destroying or shrinking a + cache, a cpu going offline, or as the result of forcing an + allocation from a certain node. It can be written to clear the + current count. Available when CONFIG_SLUB_STATS is enabled. What: /sys/kernel/slab/cache/ctor @@ -139,8 +144,8 @@ KernelVersion: 2.6.25 Contact: Pekka Enberg <penberg@cs.helsinki.fi>, Christoph Lameter <cl@linux-foundation.org> Description: - The file deactivate_empty is read-only and specifies how many - times an empty cpu slab was deactivated. + The deactivate_empty file shows how many times an empty cpu slab + was deactivated. It can be written to clear the current count. Available when CONFIG_SLUB_STATS is enabled. What: /sys/kernel/slab/cache/deactivate_full @@ -149,8 +154,8 @@ KernelVersion: 2.6.25 Contact: Pekka Enberg <penberg@cs.helsinki.fi>, Christoph Lameter <cl@linux-foundation.org> Description: - The file deactivate_full is read-only and specifies how many - times a full cpu slab was deactivated. + The deactivate_full file shows how many times a full cpu slab + was deactivated. It can be written to clear the current count. Available when CONFIG_SLUB_STATS is enabled. What: /sys/kernel/slab/cache/deactivate_remote_frees @@ -159,9 +164,9 @@ KernelVersion: 2.6.25 Contact: Pekka Enberg <penberg@cs.helsinki.fi>, Christoph Lameter <cl@linux-foundation.org> Description: - The file deactivate_remote_frees is read-only and specifies how - many times a cpu slab has been deactivated and contained free - objects that were freed remotely. + The deactivate_remote_frees file shows how many times a cpu slab + has been deactivated and contained free objects that were freed + remotely. It can be written to clear the current count. Available when CONFIG_SLUB_STATS is enabled. What: /sys/kernel/slab/cache/deactivate_to_head @@ -170,9 +175,9 @@ KernelVersion: 2.6.25 Contact: Pekka Enberg <penberg@cs.helsinki.fi>, Christoph Lameter <cl@linux-foundation.org> Description: - The file deactivate_to_head is read-only and specifies how - many times a partial cpu slab was deactivated and added to the - head of its node's partial list. + The deactivate_to_head file shows how many times a partial cpu + slab was deactivated and added to the head of its node's partial + list. It can be written to clear the current count. Available when CONFIG_SLUB_STATS is enabled. What: /sys/kernel/slab/cache/deactivate_to_tail @@ -181,9 +186,9 @@ KernelVersion: 2.6.25 Contact: Pekka Enberg <penberg@cs.helsinki.fi>, Christoph Lameter <cl@linux-foundation.org> Description: - The file deactivate_to_tail is read-only and specifies how - many times a partial cpu slab was deactivated and added to the - tail of its node's partial list. + The deactivate_to_tail file shows how many times a partial cpu + slab was deactivated and added to the tail of its node's partial + list. It can be written to clear the current count. Available when CONFIG_SLUB_STATS is enabled. What: /sys/kernel/slab/cache/destroy_by_rcu @@ -201,9 +206,9 @@ KernelVersion: 2.6.25 Contact: Pekka Enberg <penberg@cs.helsinki.fi>, Christoph Lameter <cl@linux-foundation.org> Description: - The file free_add_partial is read-only and specifies how many - times an object has been freed in a full slab so that it had to - added to its node's partial list. + The free_add_partial file shows how many times an object has + been freed in a full slab so that it had to added to its node's + partial list. It can be written to clear the current count. Available when CONFIG_SLUB_STATS is enabled. What: /sys/kernel/slab/cache/free_calls @@ -222,9 +227,9 @@ KernelVersion: 2.6.25 Contact: Pekka Enberg <penberg@cs.helsinki.fi>, Christoph Lameter <cl@linux-foundation.org> Description: - The free_fastpath file is read-only and specifies how many - objects have been freed using the fast path because it was an - object from the cpu slab. + The free_fastpath file shows how many objects have been freed + using the fast path because it was an object from the cpu slab. + It can be written to clear the current count. Available when CONFIG_SLUB_STATS is enabled. What: /sys/kernel/slab/cache/free_frozen @@ -233,9 +238,9 @@ KernelVersion: 2.6.25 Contact: Pekka Enberg <penberg@cs.helsinki.fi>, Christoph Lameter <cl@linux-foundation.org> Description: - The free_frozen file is read-only and specifies how many - objects have been freed to a frozen slab (i.e. a remote cpu - slab). + The free_frozen file shows how many objects have been freed to + a frozen slab (i.e. a remote cpu slab). It can be written to + clear the current count. Available when CONFIG_SLUB_STATS is enabled. What: /sys/kernel/slab/cache/free_remove_partial @@ -244,9 +249,10 @@ KernelVersion: 2.6.25 Contact: Pekka Enberg <penberg@cs.helsinki.fi>, Christoph Lameter <cl@linux-foundation.org> Description: - The file free_remove_partial is read-only and specifies how - many times an object has been freed to a now-empty slab so - that it had to be removed from its node's partial list. + The free_remove_partial file shows how many times an object has + been freed to a now-empty slab so that it had to be removed from + its node's partial list. It can be written to clear the current + count. Available when CONFIG_SLUB_STATS is enabled. What: /sys/kernel/slab/cache/free_slab @@ -255,8 +261,9 @@ KernelVersion: 2.6.25 Contact: Pekka Enberg <penberg@cs.helsinki.fi>, Christoph Lameter <cl@linux-foundation.org> Description: - The free_slab file is read-only and specifies how many times an - empty slab has been freed back to the page allocator. + The free_slab file shows how many times an empty slab has been + freed back to the page allocator. It can be written to clear + the current count. Available when CONFIG_SLUB_STATS is enabled. What: /sys/kernel/slab/cache/free_slowpath @@ -265,9 +272,9 @@ KernelVersion: 2.6.25 Contact: Pekka Enberg <penberg@cs.helsinki.fi>, Christoph Lameter <cl@linux-foundation.org> Description: - The free_slowpath file is read-only and specifies how many - objects have been freed using the slow path (i.e. to a full or - partial slab). + The free_slowpath file shows how many objects have been freed + using the slow path (i.e. to a full or partial slab). It can + be written to clear the current count. Available when CONFIG_SLUB_STATS is enabled. What: /sys/kernel/slab/cache/hwcache_align @@ -346,10 +353,10 @@ KernelVersion: 2.6.26 Contact: Pekka Enberg <penberg@cs.helsinki.fi>, Christoph Lameter <cl@linux-foundation.org> Description: - The file order_fallback is read-only and specifies how many - times an allocation of a new slab has not been possible at the - cache's order and instead fallen back to its minimum possible - order. + The order_fallback file shows how many times an allocation of a + new slab has not been possible at the cache's order and instead + fallen back to its minimum possible order. It can be written to + clear the current count. Available when CONFIG_SLUB_STATS is enabled. What: /sys/kernel/slab/cache/partial diff --git a/Documentation/lockstat.txt b/Documentation/lockstat.txt index 9cb9138f7a7..65f4c795015 100644 --- a/Documentation/lockstat.txt +++ b/Documentation/lockstat.txt @@ -62,8 +62,20 @@ applicable). It also tracks 4 contention points per class. A contention point is a call site that had to wait on lock acquisition. + - CONFIGURATION + +Lock statistics are enabled via CONFIG_LOCK_STATS. + - USAGE +Enable collection of statistics: + +# echo 1 >/proc/sys/kernel/lock_stat + +Disable collection of statistics: + +# echo 0 >/proc/sys/kernel/lock_stat + Look at the current lock statistics: ( line numbers not part of actual output, done for clarity in the explanation diff --git a/MAINTAINERS b/MAINTAINERS index 93a07433078..d58fa703ec1 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -801,6 +801,19 @@ L: openmoko-kernel@lists.openmoko.org (subscribers-only) W: http://wiki.openmoko.org/wiki/Neo_FreeRunner S: Supported +ARM/QUALCOMM MSM MACHINE SUPPORT +M: David Brown <davidb@codeaurora.org> +M: Daniel Walker <dwalker@codeaurora.org> +M: Bryan Huntsman <bryanh@codeaurora.org> +F: arch/arm/mach-msm/ +F: drivers/video/msm/ +F: drivers/mmc/host/msm_sdcc.c +F: drivers/mmc/host/msm_sdcc.h +F: drivers/serial/msm_serial.h +F: drivers/serial/msm_serial.c +T: git git://codeaurora.org/quic/kernel/dwalker/linux-msm.git +S: Maintained + ARM/TOSA MACHINE SUPPORT M: Dmitry Eremin-Solenikov <dbaryshkov@gmail.com> M: Dirk Opfer <dirk@opfer-online.de> diff --git a/arch/arm/mach-msm/Kconfig b/arch/arm/mach-msm/Kconfig index d140abca690..f780086befd 100644 --- a/arch/arm/mach-msm/Kconfig +++ b/arch/arm/mach-msm/Kconfig @@ -3,6 +3,30 @@ if ARCH_MSM comment "MSM Board Type" depends on ARCH_MSM +config MSM_DEBUG_UART + int + default 1 if MSM_DEBUG_UART1 + default 2 if MSM_DEBUG_UART2 + default 3 if MSM_DEBUG_UART3 + +choice + prompt "Debug UART" + + default MSM_DEBUG_UART_NONE + + config MSM_DEBUG_UART_NONE + bool "None" + + config MSM_DEBUG_UART1 + bool "UART1" + + config MSM_DEBUG_UART2 + bool "UART2" + + config MSM_DEBUG_UART3 + bool "UART3" +endchoice + config MACH_HALIBUT depends on ARCH_MSM default y @@ -10,4 +34,10 @@ config MACH_HALIBUT help Support for the Qualcomm SURF7201A eval board. +config MACH_TROUT + default y + bool "HTC Dream (aka trout)" + help + Support for the HTC Dream, T-Mobile G1, Android ADP1 devices. + endif diff --git a/arch/arm/mach-msm/Makefile b/arch/arm/mach-msm/Makefile index 1aa47001aa3..91e6f5c95dc 100644 --- a/arch/arm/mach-msm/Makefile +++ b/arch/arm/mach-msm/Makefile @@ -6,3 +6,4 @@ obj-y += clock.o clock-7x01a.o obj-$(CONFIG_MACH_HALIBUT) += board-halibut.o +obj-$(CONFIG_MACH_TROUT) += board-dream.o diff --git a/arch/arm/mach-msm/board-dream.c b/arch/arm/mach-msm/board-dream.c new file mode 100644 index 00000000000..21afa851316 --- /dev/null +++ b/arch/arm/mach-msm/board-dream.c @@ -0,0 +1,93 @@ +/* linux/arch/arm/mach-msm/board-dream.c + * + * Copyright (C) 2009 Google, Inc. + * Author: Brian Swetland <swetland@google.com> + * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/platform_device.h> + +#include <asm/mach-types.h> +#include <asm/mach/arch.h> +#include <asm/mach/map.h> +#include <asm/setup.h> + +#include <mach/board.h> +#include <mach/hardware.h> +#include <mach/msm_iomap.h> + +#include "devices.h" +#include "board-dream.h" + +static struct platform_device *devices[] __initdata = { + &msm_device_uart3, + &msm_device_smd, + &msm_device_nand, + &msm_device_hsusb, + &msm_device_i2c, +}; + +extern struct sys_timer msm_timer; + +static void __init trout_init_irq(void) +{ + msm_init_irq(); +} + +static void __init trout_fixup(struct machine_desc *desc, struct tag *tags, + char **cmdline, struct meminfo *mi) +{ + mi->nr_banks = 1; + mi->bank[0].start = PHYS_OFFSET; + mi->bank[0].node = PHYS_TO_NID(PHYS_OFFSET); + mi->bank[0].size = (101*1024*1024); +} + +static void __init trout_init(void) +{ + platform_add_devices(devices, ARRAY_SIZE(devices)); +} + +static struct map_desc trout_io_desc[] __initdata = { + { + .virtual = TROUT_CPLD_BASE, + .pfn = __phys_to_pfn(TROUT_CPLD_START), + .length = TROUT_CPLD_SIZE, + .type = MT_DEVICE_NONSHARED + } +}; + +static void __init trout_map_io(void) +{ + msm_map_common_io(); + iotable_init(trout_io_desc, ARRAY_SIZE(trout_io_desc)); + +#ifdef CONFIG_MSM_DEBUG_UART3 + /* route UART3 to the "H2W" extended usb connector */ + writeb(0x80, TROUT_CPLD_BASE + 0x00); +#endif + + msm_clock_init(); +} + +MACHINE_START(TROUT, "HTC Dream") + .phys_io = MSM_DEBUG_UART_PHYS, + .io_pg_offst = ((MSM_DEBUG_UART_BASE) >> 18) & 0xfffc, + .boot_params = 0x10000100, + .fixup = trout_fixup, + .map_io = trout_map_io, + .init_irq = trout_init_irq, + .init_machine = trout_init, + .timer = &msm_timer, +MACHINE_END diff --git a/arch/arm/mach-msm/board-dream.h b/arch/arm/mach-msm/board-dream.h new file mode 100644 index 00000000000..4f345a5a0a6 --- /dev/null +++ b/arch/arm/mach-msm/board-dream.h @@ -0,0 +1,5 @@ + +#define TROUT_CPLD_BASE 0xE8100000 +#define TROUT_CPLD_START 0x98000000 +#define TROUT_CPLD_SIZE SZ_4K + diff --git a/arch/arm/mach-msm/include/mach/debug-macro.S b/arch/arm/mach-msm/include/mach/debug-macro.S index 1db3c97dbc4..d48747ebcd3 100644 --- a/arch/arm/mach-msm/include/mach/debug-macro.S +++ b/arch/arm/mach-msm/include/mach/debug-macro.S @@ -14,15 +14,18 @@ * */ + + #include <mach/hardware.h> #include <mach/msm_iomap.h> +#ifdef CONFIG_MSM_DEBUG_UART .macro addruart,rx @ see if the MMU is enabled and select appropriate base address mrc p15, 0, \rx, c1, c0 tst \rx, #1 - ldreq \rx, =MSM_UART1_PHYS - movne \rx, #0 + ldreq \rx, =MSM_DEBUG_UART_PHYS + ldrne \rx, =MSM_DEBUG_UART_BASE .endm .macro senduart,rd,rx @@ -32,13 +35,20 @@ .macro waituart,rd,rx @ wait for TX_READY - teq \rx, #0 - bne 2f -1: ldr \rd, [\rx, #0x08] +1001: ldr \rd, [\rx, #0x08] tst \rd, #0x04 - beq 1b -2: + beq 1001b + .endm +#else + .macro addruart,rx + .endm + + .macro senduart,rd,rx + .endm + + .macro waituart,rd,rx .endm +#endif .macro busyuart,rd,rx .endm diff --git a/arch/arm/mach-msm/include/mach/mmc.h b/arch/arm/mach-msm/include/mach/mmc.h new file mode 100644 index 00000000000..0ecf2542628 --- /dev/null +++ b/arch/arm/mach-msm/include/mach/mmc.h @@ -0,0 +1,26 @@ +/* + * arch/arm/include/asm/mach/mmc.h + */ +#ifndef ASMARM_MACH_MMC_H +#define ASMARM_MACH_MMC_H + +#include <linux/mmc/host.h> +#include <linux/mmc/card.h> +#include <linux/mmc/sdio_func.h> + +struct embedded_sdio_data { + struct sdio_cis cis; + struct sdio_cccr cccr; + struct sdio_embedded_func *funcs; + int num_funcs; +}; + +struct mmc_platform_data { + unsigned int ocr_mask; /* available voltages */ + u32 (*translate_vdd)(struct device *, unsigned int); + unsigned int (*status)(struct device *); + struct embedded_sdio_data *embedded_sdio; + int (*register_status_notify)(void (*callback)(int card_present, void *dev_id), void *dev_id); +}; + +#endif diff --git a/arch/arm/mach-msm/include/mach/msm_iomap.h b/arch/arm/mach-msm/include/mach/msm_iomap.h index 2f7b4c8620d..9dae1a98c77 100644 --- a/arch/arm/mach-msm/include/mach/msm_iomap.h +++ b/arch/arm/mach-msm/include/mach/msm_iomap.h @@ -84,6 +84,18 @@ #define MSM_UART3_PHYS 0xA9C00000 #define MSM_UART3_SIZE SZ_4K +#ifdef CONFIG_MSM_DEBUG_UART +#define MSM_DEBUG_UART_BASE 0xE1000000 +#if CONFIG_MSM_DEBUG_UART == 1 +#define MSM_DEBUG_UART_PHYS MSM_UART1_PHYS +#elif CONFIG_MSM_DEBUG_UART == 2 +#define MSM_DEBUG_UART_PHYS MSM_UART2_PHYS +#elif CONFIG_MSM_DEBUG_UART == 3 +#define MSM_DEBUG_UART_PHYS MSM_UART3_PHYS +#endif +#define MSM_DEBUG_UART_SIZE SZ_4K +#endif + #define MSM_SDC1_PHYS 0xA0400000 #define MSM_SDC1_SIZE SZ_4K diff --git a/arch/arm/mach-msm/include/mach/uncompress.h b/arch/arm/mach-msm/include/mach/uncompress.h index 026e8955ace..d94292c29d8 100644 --- a/arch/arm/mach-msm/include/mach/uncompress.h +++ b/arch/arm/mach-msm/include/mach/uncompress.h @@ -16,9 +16,16 @@ #ifndef __ASM_ARCH_MSM_UNCOMPRESS_H #include "hardware.h" +#include "linux/io.h" +#include "mach/msm_iomap.h" static void putc(int c) { +#if defined(MSM_DEBUG_UART_PHYS) + unsigned base = MSM_DEBUG_UART_PHYS; + while (!(readl(base + 0x08) & 0x04)) ; + writel(c, base + 0x0c); +#endif } static inline void flush(void) diff --git a/arch/arm/mach-msm/io.c b/arch/arm/mach-msm/io.c index 6e7692ff6f2..1c5e7dac086 100644 --- a/arch/arm/mach-msm/io.c +++ b/arch/arm/mach-msm/io.c @@ -42,6 +42,9 @@ static struct map_desc msm_io_desc[] __initdata = { MSM_DEVICE(GPIO1), MSM_DEVICE(GPIO2), MSM_DEVICE(CLK_CTL), +#ifdef CONFIG_MSM_DEBUG_UART + MSM_DEVICE(DEBUG_UART), +#endif { .virtual = (unsigned long) MSM_SHARED_RAM_BASE, .pfn = __phys_to_pfn(MSM_SHARED_RAM_PHYS), diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 731318e5ac1..bc01e3ebfeb 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -187,8 +187,8 @@ config HAVE_MMIOTRACE_SUPPORT def_bool y config X86_DECODER_SELFTEST - bool "x86 instruction decoder selftest" - depends on DEBUG_KERNEL + bool "x86 instruction decoder selftest" + depends on DEBUG_KERNEL && KPROBES ---help--- Perform x86 instruction decoder selftests at build time. This option is useful for checking the sanity of x86 instruction diff --git a/arch/x86/include/asm/amd_iommu_proto.h b/arch/x86/include/asm/amd_iommu_proto.h index 84786fb9a23..4d817f9e6e7 100644 --- a/arch/x86/include/asm/amd_iommu_proto.h +++ b/arch/x86/include/asm/amd_iommu_proto.h @@ -28,7 +28,9 @@ extern void amd_iommu_flush_all_domains(void); extern void amd_iommu_flush_all_devices(void); extern void amd_iommu_apply_erratum_63(u16 devid); extern void amd_iommu_reset_cmd_buffer(struct amd_iommu *iommu); - +extern int amd_iommu_init_devices(void); +extern void amd_iommu_uninit_devices(void); +extern void amd_iommu_init_notifier(void); #ifndef CONFIG_AMD_IOMMU_STATS static inline void amd_iommu_stats_init(void) { } diff --git a/arch/x86/include/asm/system.h b/arch/x86/include/asm/system.h index 022a84386de..ecb544e6538 100644 --- a/arch/x86/include/asm/system.h +++ b/arch/x86/include/asm/system.h @@ -23,6 +23,7 @@ struct task_struct *__switch_to(struct task_struct *prev, struct tss_struct; void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, struct tss_struct *tss); +extern void show_regs_common(void); #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 1c0fb4d4ad5..b990b5cc954 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -166,6 +166,43 @@ static void iommu_uninit_device(struct device *dev) { kfree(dev->archdata.iommu); } + +void __init amd_iommu_uninit_devices(void) +{ + struct pci_dev *pdev = NULL; + + for_each_pci_dev(pdev) { + + if (!check_device(&pdev->dev)) + continue; + + iommu_uninit_device(&pdev->dev); + } +} + +int __init amd_iommu_init_devices(void) +{ + struct pci_dev *pdev = NULL; + int ret = 0; + + for_each_pci_dev(pdev) { + + if (!check_device(&pdev->dev)) + continue; + + ret = iommu_init_device(&pdev->dev); + if (ret) + goto out_free; + } + + return 0; + +out_free: + + amd_iommu_uninit_devices(); + + return ret; +} #ifdef CONFIG_AMD_IOMMU_STATS /* @@ -1587,6 +1624,11 @@ static struct notifier_block device_nb = { .notifier_call = device_change_notifier, }; +void amd_iommu_init_notifier(void) +{ + bus_register_notifier(&pci_bus_type, &device_nb); +} + /***************************************************************************** * * The next functions belong to the dma_ops mapping/unmapping code. @@ -2145,8 +2187,6 @@ static void prealloc_protection_domains(void) if (!check_device(&dev->dev)) continue; - iommu_init_device(&dev->dev); - /* Is there already any domain for it? */ if (domain_for_device(&dev->dev)) continue; @@ -2215,8 +2255,6 @@ int __init amd_iommu_init_dma_ops(void) register_iommu(&amd_iommu_ops); - bus_register_notifier(&pci_bus_type, &device_nb); - amd_iommu_stats_init(); return 0; diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 9c4a6f74755..1dca9c34eae 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -1274,6 +1274,10 @@ static int __init amd_iommu_init(void) if (ret) goto free; + ret = amd_iommu_init_devices(); + if (ret) + goto free; + if (iommu_pass_through) ret = amd_iommu_init_passthrough(); else @@ -1281,6 +1285,8 @@ static int __init amd_iommu_init(void) if (ret) goto free; + amd_iommu_init_notifier(); + enable_iommus(); if (iommu_pass_through) @@ -1296,6 +1302,9 @@ out: return ret; free: + + amd_iommu_uninit_devices(); + free_pages((unsigned long)amd_iommu_pd_alloc_bitmap, get_order(MAX_DOMAIN_ID/8)); diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c index d9acc3bee0f..e31b9ffe25f 100644 --- a/arch/x86/kernel/apic/apic_noop.c +++ b/arch/x86/kernel/apic/apic_noop.c @@ -127,7 +127,7 @@ static u32 noop_apic_read(u32 reg) static void noop_apic_write(u32 reg, u32 v) { - WARN_ON_ONCE((cpu_has_apic || !disable_apic)); + WARN_ON_ONCE(cpu_has_apic && !disable_apic); } struct apic apic_noop = { diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c index e85f8fb7f8e..dd2b5f26464 100644 --- a/arch/x86/kernel/apic/es7000_32.c +++ b/arch/x86/kernel/apic/es7000_32.c @@ -27,6 +27,9 @@ * * http://www.unisys.com */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/notifier.h> #include <linux/spinlock.h> #include <linux/cpumask.h> @@ -223,9 +226,9 @@ static int parse_unisys_oem(char *oemptr) mip_addr = val; mip = (struct mip_reg *)val; mip_reg = __va(mip); - pr_debug("es7000_mipcfg: host_reg = 0x%lx \n", + pr_debug("host_reg = 0x%lx\n", (unsigned long)host_reg); - pr_debug("es7000_mipcfg: mip_reg = 0x%lx \n", + pr_debug("mip_reg = 0x%lx\n", (unsigned long)mip_reg); success++; break; @@ -401,7 +404,7 @@ static void es7000_enable_apic_mode(void) if (!es7000_plat) return; - printk(KERN_INFO "ES7000: Enabling APIC mode.\n"); + pr_info("Enabling APIC mode.\n"); memset(&es7000_mip_reg, 0, sizeof(struct mip_reg)); es7000_mip_reg.off_0x00 = MIP_SW_APIC; es7000_mip_reg.off_0x38 = MIP_VALID; @@ -514,8 +517,7 @@ static void es7000_setup_apic_routing(void) { int apic = per_cpu(x86_bios_cpu_apicid, smp_processor_id()); - printk(KERN_INFO - "Enabling APIC mode: %s. Using %d I/O APICs, target cpus %lx\n", + pr_info("Enabling APIC mode: %s. Using %d I/O APICs, target cpus %lx\n", (apic_version[apic] == 0x14) ? "Physical Cluster" : "Logical Cluster", nr_ioapics, cpumask_bits(es7000_target_cpus())[0]); diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index d7ebf25d10e..a8aacd4b513 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1388,13 +1388,14 @@ static void __mcheck_cpu_init_timer(void) struct timer_list *t = &__get_cpu_var(mce_timer); int *n = &__get_cpu_var(mce_next_interval); + setup_timer(t, mce_start_timer, smp_processor_id()); + if (mce_ignore_ce) return; *n = check_interval * HZ; if (!*n) return; - setup_timer(t, mce_start_timer, smp_processor_id()); t->expires = round_jiffies(jiffies + *n); add_timer_on(t, smp_processor_id()); } @@ -1928,7 +1929,7 @@ error2: sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[j].attr); error: while (--i >= 0) - sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[i].attr); + sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]); sysdev_unregister(&per_cpu(mce_dev, cpu)); diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index ab1a8a89b98..45506d5dd8d 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1632,6 +1632,7 @@ static void intel_pmu_drain_bts_buffer(struct cpu_hw_events *cpuc) data.period = event->hw.last_period; data.addr = 0; + data.raw = NULL; regs.ip = 0; /* @@ -1749,6 +1750,7 @@ static int p6_pmu_handle_irq(struct pt_regs *regs) u64 val; data.addr = 0; + data.raw = NULL; cpuc = &__get_cpu_var(cpu_hw_events); @@ -1794,6 +1796,7 @@ static int intel_pmu_handle_irq(struct pt_regs *regs) u64 ack, status; data.addr = 0; + data.raw = NULL; cpuc = &__get_cpu_var(cpu_hw_events); @@ -1857,6 +1860,7 @@ static int amd_pmu_handle_irq(struct pt_regs *regs) u64 val; data.addr = 0; + data.raw = NULL; cpuc = &__get_cpu_var(cpu_hw_events); @@ -2062,12 +2066,6 @@ static __init int p6_pmu_init(void) x86_pmu = p6_pmu; - if (!cpu_has_apic) { - pr_info("no APIC, boot with the \"lapic\" boot parameter to force-enable it.\n"); - pr_info("no hardware sampling interrupt available.\n"); - x86_pmu.apic = 0; - } - return 0; } @@ -2159,6 +2157,16 @@ static __init int amd_pmu_init(void) return 0; } +static void __init pmu_check_apic(void) +{ + if (cpu_has_apic) + return; + + x86_pmu.apic = 0; + pr_info("no APIC, boot with the \"lapic\" boot parameter to force-enable it.\n"); + pr_info("no hardware sampling interrupt available.\n"); +} + void __init init_hw_perf_events(void) { int err; @@ -2180,6 +2188,8 @@ void __init init_hw_perf_events(void) return; } + pmu_check_apic(); + pr_cont("%s PMU driver.\n", x86_pmu.name); if (x86_pmu.num_events > X86_PMC_MAX_GENERIC) { @@ -2287,7 +2297,7 @@ void callchain_store(struct perf_callchain_entry *entry, u64 ip) static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_irq_entry); static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_nmi_entry); -static DEFINE_PER_CPU(int, in_nmi_frame); +static DEFINE_PER_CPU(int, in_ignored_frame); static void @@ -2303,8 +2313,9 @@ static void backtrace_warning(void *data, char *msg) static int backtrace_stack(void *data, char *name) { - per_cpu(in_nmi_frame, smp_processor_id()) = - x86_is_stack_id(NMI_STACK, name); + per_cpu(in_ignored_frame, smp_processor_id()) = + x86_is_stack_id(NMI_STACK, name) || + x86_is_stack_id(DEBUG_STACK, name); return 0; } @@ -2313,7 +2324,7 @@ static void backtrace_address(void *data, unsigned long addr, int reliable) { struct perf_callchain_entry *entry = data; - if (per_cpu(in_nmi_frame, smp_processor_id())) + if (per_cpu(in_ignored_frame, smp_processor_id())) return; if (reliable) diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 8e740934bd1..b13af53883a 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -103,6 +103,35 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, return NULL; } +static inline int +in_irq_stack(unsigned long *stack, unsigned long *irq_stack, + unsigned long *irq_stack_end) +{ + return (stack >= irq_stack && stack < irq_stack_end); +} + +/* + * We are returning from the irq stack and go to the previous one. + * If the previous stack is also in the irq stack, then bp in the first + * frame of the irq stack points to the previous, interrupted one. + * Otherwise we have another level of indirection: We first save + * the bp of the previous stack, then we switch the stack to the irq one + * and save a new bp that links to the previous one. + * (See save_args()) + */ +static inline unsigned long +fixup_bp_irq_link(unsigned long bp, unsigned long *stack, + unsigned long *irq_stack, unsigned long *irq_stack_end) +{ +#ifdef CONFIG_FRAME_POINTER + struct stack_frame *frame = (struct stack_frame *)bp; + + if (!in_irq_stack(stack, irq_stack, irq_stack_end)) + return (unsigned long)frame->next_frame; +#endif + return bp; +} + /* * x86-64 can have up to three kernel stacks: * process stack @@ -175,7 +204,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, irq_stack = irq_stack_end - (IRQ_STACK_SIZE - 64) / sizeof(*irq_stack); - if (stack >= irq_stack && stack < irq_stack_end) { + if (in_irq_stack(stack, irq_stack, irq_stack_end)) { if (ops->stack(data, "IRQ") < 0) break; bp = print_context_stack(tinfo, stack, bp, @@ -186,6 +215,8 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, * pointer (index -1 to end) in the IRQ stack: */ stack = (unsigned long *) (irq_stack_end[-1]); + bp = fixup_bp_irq_link(bp, stack, irq_stack, + irq_stack_end); irq_stack_end = NULL; ops->stack(data, "EOI"); continue; diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 63bca794c8f..673f693fb45 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1076,10 +1076,10 @@ ENTRY(\sym) TRACE_IRQS_OFF movq %rsp,%rdi /* pt_regs pointer */ xorl %esi,%esi /* no error code */ - PER_CPU(init_tss, %rbp) - subq $EXCEPTION_STKSZ, TSS_ist + (\ist - 1) * 8(%rbp) + PER_CPU(init_tss, %r12) + subq $EXCEPTION_STKSZ, TSS_ist + (\ist - 1) * 8(%r12) call \do_sym - addq $EXCEPTION_STKSZ, TSS_ist + (\ist - 1) * 8(%rbp) + addq $EXCEPTION_STKSZ, TSS_ist + (\ist - 1) * 8(%r12) jmp paranoid_exit /* %ebx: no swapgs flag */ CFI_ENDPROC END(\sym) diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c index d42f65ac492..05d5fec64a9 100644 --- a/arch/x86/kernel/hw_breakpoint.c +++ b/arch/x86/kernel/hw_breakpoint.c @@ -362,8 +362,7 @@ int arch_validate_hwbkpt_settings(struct perf_event *bp, return ret; } - if (bp->callback) - ret = arch_store_info(bp); + ret = arch_store_info(bp); if (ret < 0) return ret; @@ -519,7 +518,7 @@ static int __kprobes hw_breakpoint_handler(struct die_args *args) break; } - (bp->callback)(bp, args->regs); + perf_bp_event(bp, args->regs); rcu_read_unlock(); } diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index 63123d90210..37542b67c57 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -13,6 +13,9 @@ * Licensed under the terms of the GNU General Public * License version 2. See file COPYING for details. */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/firmware.h> #include <linux/pci_ids.h> #include <linux/uaccess.h> @@ -81,7 +84,7 @@ static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig) memset(csig, 0, sizeof(*csig)); rdmsr(MSR_AMD64_PATCH_LEVEL, csig->rev, dummy); - pr_info("microcode: CPU%d: patch_level=0x%x\n", cpu, csig->rev); + pr_info("CPU%d: patch_level=0x%x\n", cpu, csig->rev); return 0; } @@ -111,8 +114,8 @@ static int get_matching_microcode(int cpu, void *mc, int rev) /* ucode might be chipset specific -- currently we don't support this */ if (mc_header->nb_dev_id || mc_header->sb_dev_id) { - pr_err(KERN_ERR "microcode: CPU%d: loading of chipset " - "specific code not yet supported\n", cpu); + pr_err("CPU%d: loading of chipset specific code not yet supported\n", + cpu); return 0; } @@ -141,12 +144,12 @@ static int apply_microcode_amd(int cpu) /* check current patch id and patch's id for match */ if (rev != mc_amd->hdr.patch_id) { - pr_err("microcode: CPU%d: update failed " - "(for patch_level=0x%x)\n", cpu, mc_amd->hdr.patch_id); + pr_err("CPU%d: update failed (for patch_level=0x%x)\n", + cpu, mc_amd->hdr.patch_id); return -1; } - pr_info("microcode: CPU%d: updated (new patch_level=0x%x)\n", cpu, rev); + pr_info("CPU%d: updated (new patch_level=0x%x)\n", cpu, rev); uci->cpu_sig.rev = rev; return 0; @@ -169,15 +172,14 @@ get_next_ucode(const u8 *buf, unsigned int size, unsigned int *mc_size) return NULL; if (section_hdr[0] != UCODE_UCODE_TYPE) { - pr_err("microcode: error: invalid type field in " - "container file section header\n"); + pr_err("error: invalid type field in container file section header\n"); return NULL; } total_size = (unsigned long) (section_hdr[4] + (section_hdr[5] << 8)); if (total_size > size || total_size > UCODE_MAX_SIZE) { - pr_err("microcode: error: size mismatch\n"); + pr_err("error: size mismatch\n"); return NULL; } @@ -206,14 +208,13 @@ static int install_equiv_cpu_table(const u8 *buf) size = buf_pos[2]; if (buf_pos[1] != UCODE_EQUIV_CPU_TABLE_TYPE || !size) { - pr_err("microcode: error: invalid type field in " - "container file section header\n"); + pr_err("error: invalid type field in container file section header\n"); return 0; } equiv_cpu_table = (struct equiv_cpu_entry *) vmalloc(size); if (!equiv_cpu_table) { - pr_err("microcode: failed to allocate equivalent CPU table\n"); + pr_err("failed to allocate equivalent CPU table\n"); return 0; } @@ -246,7 +247,7 @@ generic_load_microcode(int cpu, const u8 *data, size_t size) offset = install_equiv_cpu_table(ucode_ptr); if (!offset) { - pr_err("microcode: failed to create equivalent cpu table\n"); + pr_err("failed to create equivalent cpu table\n"); return UCODE_ERROR; } @@ -277,8 +278,7 @@ generic_load_microcode(int cpu, const u8 *data, size_t size) if (!leftover) { vfree(uci->mc); uci->mc = new_mc; - pr_debug("microcode: CPU%d found a matching microcode " - "update with version 0x%x (current=0x%x)\n", + pr_debug("CPU%d found a matching microcode update with version 0x%x (current=0x%x)\n", cpu, new_rev, uci->cpu_sig.rev); } else { vfree(new_mc); @@ -300,7 +300,7 @@ static enum ucode_state request_microcode_fw(int cpu, struct device *device) return UCODE_NFOUND; if (*(u32 *)firmware->data != UCODE_MAGIC) { - pr_err("microcode: invalid UCODE_MAGIC (0x%08x)\n", + pr_err("invalid UCODE_MAGIC (0x%08x)\n", *(u32 *)firmware->data); return UCODE_ERROR; } @@ -313,8 +313,7 @@ static enum ucode_state request_microcode_fw(int cpu, struct device *device) static enum ucode_state request_microcode_user(int cpu, const void __user *buf, size_t size) { - pr_info("microcode: AMD microcode update via " - "/dev/cpu/microcode not supported\n"); + pr_info("AMD microcode update via /dev/cpu/microcode not supported\n"); return UCODE_ERROR; } @@ -334,14 +333,13 @@ void init_microcode_amd(struct device *device) WARN_ON(c->x86_vendor != X86_VENDOR_AMD); if (c->x86 < 0x10) { - pr_warning("microcode: AMD CPU family 0x%x not supported\n", - c->x86); + pr_warning("AMD CPU family 0x%x not supported\n", c->x86); return; } supported_cpu = 1; if (request_firmware(&firmware, fw_name, device)) - pr_err("microcode: failed to load file %s\n", fw_name); + pr_err("failed to load file %s\n", fw_name); } void fini_microcode_amd(void) diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c index e68aae39786..844c02c65fc 100644 --- a/arch/x86/kernel/microcode_core.c +++ b/arch/x86/kernel/microcode_core.c @@ -70,6 +70,9 @@ * Fix sigmatch() macro to handle old CPUs with pf == 0. * Thanks to Stuart Swales for pointing out this bug. */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/platform_device.h> #include <linux/miscdevice.h> #include <linux/capability.h> @@ -209,7 +212,7 @@ static ssize_t microcode_write(struct file *file, const char __user *buf, ssize_t ret = -EINVAL; if ((len >> PAGE_SHIFT) > totalram_pages) { - pr_err("microcode: too much data (max %ld pages)\n", totalram_pages); + pr_err("too much data (max %ld pages)\n", totalram_pages); return ret; } @@ -244,7 +247,7 @@ static int __init microcode_dev_init(void) error = misc_register(µcode_dev); if (error) { - pr_err("microcode: can't misc_register on minor=%d\n", MICROCODE_MINOR); + pr_err("can't misc_register on minor=%d\n", MICROCODE_MINOR); return error; } @@ -359,7 +362,7 @@ static enum ucode_state microcode_resume_cpu(int cpu) if (!uci->mc) return UCODE_NFOUND; - pr_debug("microcode: CPU%d updated upon resume\n", cpu); + pr_debug("CPU%d updated upon resume\n", cpu); apply_microcode_on_target(cpu); return UCODE_OK; @@ -379,7 +382,7 @@ static enum ucode_state microcode_init_cpu(int cpu) ustate = microcode_ops->request_microcode_fw(cpu, µcode_pdev->dev); if (ustate == UCODE_OK) { - pr_debug("microcode: CPU%d updated upon init\n", cpu); + pr_debug("CPU%d updated upon init\n", cpu); apply_microcode_on_target(cpu); } @@ -406,7 +409,7 @@ static int mc_sysdev_add(struct sys_device *sys_dev) if (!cpu_online(cpu)) return 0; - pr_debug("microcode: CPU%d added\n", cpu); + pr_debug("CPU%d added\n", cpu); err = sysfs_create_group(&sys_dev->kobj, &mc_attr_group); if (err) @@ -425,7 +428,7 @@ static int mc_sysdev_remove(struct sys_device *sys_dev) if (!cpu_online(cpu)) return 0; - pr_debug("microcode: CPU%d removed\n", cpu); + pr_debug("CPU%d removed\n", cpu); microcode_fini_cpu(cpu); sysfs_remove_group(&sys_dev->kobj, &mc_attr_group); return 0; @@ -473,15 +476,15 @@ mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu) microcode_update_cpu(cpu); case CPU_DOWN_FAILED: case CPU_DOWN_FAILED_FROZEN: - pr_debug("microcode: CPU%d added\n", cpu); + pr_debug("CPU%d added\n", cpu); if (sysfs_create_group(&sys_dev->kobj, &mc_attr_group)) - pr_err("microcode: Failed to create group for CPU%d\n", cpu); + pr_err("Failed to create group for CPU%d\n", cpu); break; case CPU_DOWN_PREPARE: case CPU_DOWN_PREPARE_FROZEN: /* Suspend is in progress, only remove the interface */ sysfs_remove_group(&sys_dev->kobj, &mc_attr_group); - pr_debug("microcode: CPU%d removed\n", cpu); + pr_debug("CPU%d removed\n", cpu); break; case CPU_DEAD: case CPU_UP_CANCELED_FROZEN: @@ -507,7 +510,7 @@ static int __init microcode_init(void) microcode_ops = init_amd_microcode(); if (!microcode_ops) { - pr_err("microcode: no support for this CPU vendor\n"); + pr_err("no support for this CPU vendor\n"); return -ENODEV; } @@ -541,8 +544,7 @@ static int __init microcode_init(void) register_hotcpu_notifier(&mc_cpu_notifier); pr_info("Microcode Update Driver: v" MICROCODE_VERSION - " <tigran@aivazian.fsnet.co.uk>," - " Peter Oruba\n"); + " <tigran@aivazian.fsnet.co.uk>, Peter Oruba\n"); return 0; } diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c index 0d334ddd0a9..ebd193e476c 100644 --- a/arch/x86/kernel/microcode_intel.c +++ b/arch/x86/kernel/microcode_intel.c @@ -70,6 +70,9 @@ * Fix sigmatch() macro to handle old CPUs with pf == 0. * Thanks to Stuart Swales for pointing out this bug. */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/firmware.h> #include <linux/uaccess.h> #include <linux/kernel.h> @@ -146,8 +149,7 @@ static int collect_cpu_info(int cpu_num, struct cpu_signature *csig) if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 || cpu_has(c, X86_FEATURE_IA64)) { - printk(KERN_ERR "microcode: CPU%d not a capable Intel " - "processor\n", cpu_num); + pr_err("CPU%d not a capable Intel processor\n", cpu_num); return -1; } @@ -165,8 +167,8 @@ static int collect_cpu_info(int cpu_num, struct cpu_signature *csig) /* get the current revision from MSR 0x8B */ rdmsr(MSR_IA32_UCODE_REV, val[0], csig->rev); - printk(KERN_INFO "microcode: CPU%d sig=0x%x, pf=0x%x, revision=0x%x\n", - cpu_num, csig->sig, csig->pf, csig->rev); + pr_info("CPU%d sig=0x%x, pf=0x%x, revision=0x%x\n", + cpu_num, csig->sig, csig->pf, csig->rev); return 0; } @@ -194,28 +196,24 @@ static int microcode_sanity_check(void *mc) data_size = get_datasize(mc_header); if (data_size + MC_HEADER_SIZE > total_size) { - printk(KERN_ERR "microcode: error! " - "Bad data size in microcode data file\n"); + pr_err("error! Bad data size in microcode data file\n"); return -EINVAL; } if (mc_header->ldrver != 1 || mc_header->hdrver != 1) { - printk(KERN_ERR "microcode: error! " - "Unknown microcode update format\n"); + pr_err("error! Unknown microcode update format\n"); return -EINVAL; } ext_table_size = total_size - (MC_HEADER_SIZE + data_size); if (ext_table_size) { if ((ext_table_size < EXT_HEADER_SIZE) || ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) { - printk(KERN_ERR "microcode: error! " - "Small exttable size in microcode data file\n"); + pr_err("error! Small exttable size in microcode data file\n"); return -EINVAL; } ext_header = mc + MC_HEADER_SIZE + data_size; if (ext_table_size != exttable_size(ext_header)) { - printk(KERN_ERR "microcode: error! " - "Bad exttable size in microcode data file\n"); + pr_err("error! Bad exttable size in microcode data file\n"); return -EFAULT; } ext_sigcount = ext_header->count; @@ -230,8 +228,7 @@ static int microcode_sanity_check(void *mc) while (i--) ext_table_sum += ext_tablep[i]; if (ext_table_sum) { - printk(KERN_WARNING "microcode: aborting, " - "bad extended signature table checksum\n"); + pr_warning("aborting, bad extended signature table checksum\n"); return -EINVAL; } } @@ -242,7 +239,7 @@ static int microcode_sanity_check(void *mc) while (i--) orig_sum += ((int *)mc)[i]; if (orig_sum) { - printk(KERN_ERR "microcode: aborting, bad checksum\n"); + pr_err("aborting, bad checksum\n"); return -EINVAL; } if (!ext_table_size) @@ -255,7 +252,7 @@ static int microcode_sanity_check(void *mc) - (mc_header->sig + mc_header->pf + mc_header->cksum) + (ext_sig->sig + ext_sig->pf + ext_sig->cksum); if (sum) { - printk(KERN_ERR "microcode: aborting, bad checksum\n"); + pr_err("aborting, bad checksum\n"); return -EINVAL; } } @@ -327,13 +324,11 @@ static int apply_microcode(int cpu) rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]); if (val[1] != mc_intel->hdr.rev) { - printk(KERN_ERR "microcode: CPU%d update " - "to revision 0x%x failed\n", - cpu_num, mc_intel->hdr.rev); + pr_err("CPU%d update to revision 0x%x failed\n", + cpu_num, mc_intel->hdr.rev); return -1; } - printk(KERN_INFO "microcode: CPU%d updated to revision " - "0x%x, date = %04x-%02x-%02x \n", + pr_info("CPU%d updated to revision 0x%x, date = %04x-%02x-%02x \n", cpu_num, val[1], mc_intel->hdr.date & 0xffff, mc_intel->hdr.date >> 24, @@ -362,8 +357,7 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size, mc_size = get_totalsize(&mc_header); if (!mc_size || mc_size > leftover) { - printk(KERN_ERR "microcode: error!" - "Bad data in microcode data file\n"); + pr_err("error! Bad data in microcode data file\n"); break; } @@ -405,9 +399,8 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size, vfree(uci->mc); uci->mc = (struct microcode_intel *)new_mc; - pr_debug("microcode: CPU%d found a matching microcode update with" - " version 0x%x (current=0x%x)\n", - cpu, new_rev, uci->cpu_sig.rev); + pr_debug("CPU%d found a matching microcode update with version 0x%x (current=0x%x)\n", + cpu, new_rev, uci->cpu_sig.rev); out: return state; } @@ -429,7 +422,7 @@ static enum ucode_state request_microcode_fw(int cpu, struct device *device) c->x86, c->x86_model, c->x86_mask); if (request_firmware(&firmware, name, device)) { - pr_debug("microcode: data file %s load failed\n", name); + pr_debug("data file %s load failed\n", name); return UCODE_NFOUND; } diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 5e2ba634ea1..7a7bd4e3ec4 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -10,6 +10,8 @@ #include <linux/clockchips.h> #include <linux/random.h> #include <linux/user-return-notifier.h> +#include <linux/dmi.h> +#include <linux/utsname.h> #include <trace/events/power.h> #include <linux/hw_breakpoint.h> #include <asm/system.h> @@ -90,6 +92,25 @@ void exit_thread(void) } } +void show_regs_common(void) +{ + const char *board, *product; + + board = dmi_get_system_info(DMI_BOARD_NAME); + if (!board) + board = ""; + product = dmi_get_system_info(DMI_PRODUCT_NAME); + if (!product) + product = ""; + + printk("\n"); + printk(KERN_INFO "Pid: %d, comm: %.20s %s %s %.*s %s/%s\n", + current->pid, current->comm, print_tainted(), + init_utsname()->release, + (int)strcspn(init_utsname()->version, " "), + init_utsname()->version, board, product); +} + void flush_thread(void) { struct task_struct *tsk = current; diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 075580b3568..120b88797a7 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -23,7 +23,6 @@ #include <linux/vmalloc.h> #include <linux/user.h> #include <linux/interrupt.h> -#include <linux/utsname.h> #include <linux/delay.h> #include <linux/reboot.h> #include <linux/init.h> @@ -35,7 +34,6 @@ #include <linux/tick.h> #include <linux/percpu.h> #include <linux/prctl.h> -#include <linux/dmi.h> #include <linux/ftrace.h> #include <linux/uaccess.h> #include <linux/io.h> @@ -128,7 +126,6 @@ void __show_regs(struct pt_regs *regs, int all) unsigned long d0, d1, d2, d3, d6, d7; unsigned long sp; unsigned short ss, gs; - const char *board; if (user_mode_vm(regs)) { sp = regs->sp; @@ -140,16 +137,7 @@ void __show_regs(struct pt_regs *regs, int all) savesegment(gs, gs); } - printk("\n"); - - board = dmi_get_system_info(DMI_PRODUCT_NAME); - if (!board) - board = ""; - printk("Pid: %d, comm: %s %s (%s %.*s) %s\n", - task_pid_nr(current), current->comm, - print_tainted(), init_utsname()->release, - (int)strcspn(init_utsname()->version, " "), - init_utsname()->version, board); + show_regs_common(); printk("EIP: %04x:[<%08lx>] EFLAGS: %08lx CPU: %d\n", (u16)regs->cs, regs->ip, regs->flags, diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index c95c8f4e790..e5ab0cd0ef3 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -26,7 +26,6 @@ #include <linux/slab.h> #include <linux/user.h> #include <linux/interrupt.h> -#include <linux/utsname.h> #include <linux/delay.h> #include <linux/module.h> #include <linux/ptrace.h> @@ -38,7 +37,6 @@ #include <linux/uaccess.h> #include <linux/io.h> #include <linux/ftrace.h> -#include <linux/dmi.h> #include <asm/pgtable.h> #include <asm/system.h> @@ -163,18 +161,8 @@ void __show_regs(struct pt_regs *regs, int all) unsigned long d0, d1, d2, d3, d6, d7; unsigned int fsindex, gsindex; unsigned int ds, cs, es; - const char *board; - - printk("\n"); - print_modules(); - board = dmi_get_system_info(DMI_PRODUCT_NAME); - if (!board) - board = ""; - printk(KERN_INFO "Pid: %d, comm: %.20s %s %s %.*s %s\n", - current->pid, current->comm, print_tainted(), - init_utsname()->release, - (int)strcspn(init_utsname()->version, " "), - init_utsname()->version, board); + + show_regs_common(); printk(KERN_INFO "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip); printk_address(regs->ip, 1); printk(KERN_INFO "RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 04d182a7cfd..7079ddaf073 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -555,7 +555,9 @@ static int genregs_set(struct task_struct *target, return ret; } -static void ptrace_triggered(struct perf_event *bp, void *data) +static void ptrace_triggered(struct perf_event *bp, int nmi, + struct perf_sample_data *data, + struct pt_regs *regs) { int i; struct thread_struct *thread = &(current->thread); @@ -593,13 +595,13 @@ static unsigned long ptrace_get_dr7(struct perf_event *bp[]) return dr7; } -static struct perf_event * +static int ptrace_modify_breakpoint(struct perf_event *bp, int len, int type, struct task_struct *tsk, int disabled) { int err; int gen_len, gen_type; - DEFINE_BREAKPOINT_ATTR(attr); + struct perf_event_attr attr; /* * We shoud have at least an inactive breakpoint at this @@ -607,18 +609,18 @@ ptrace_modify_breakpoint(struct perf_event *bp, int len, int type, * written the address register first */ if (!bp) - return ERR_PTR(-EINVAL); + return -EINVAL; err = arch_bp_generic_fields(len, type, &gen_len, &gen_type); if (err) - return ERR_PTR(err); + return err; attr = bp->attr; attr.bp_len = gen_len; attr.bp_type = gen_type; attr.disabled = disabled; - return modify_user_hw_breakpoint(bp, &attr, bp->callback, tsk); + return modify_user_hw_breakpoint(bp, &attr); } /* @@ -656,28 +658,17 @@ restore: if (!second_pass) continue; - thread->ptrace_bps[i] = NULL; - bp = ptrace_modify_breakpoint(bp, len, type, + rc = ptrace_modify_breakpoint(bp, len, type, tsk, 1); - if (IS_ERR(bp)) { - rc = PTR_ERR(bp); - thread->ptrace_bps[i] = NULL; + if (rc) break; - } - thread->ptrace_bps[i] = bp; } continue; } - bp = ptrace_modify_breakpoint(bp, len, type, tsk, 0); - - /* Incorrect bp, or we have a bug in bp API */ - if (IS_ERR(bp)) { - rc = PTR_ERR(bp); - thread->ptrace_bps[i] = NULL; + rc = ptrace_modify_breakpoint(bp, len, type, tsk, 0); + if (rc) break; - } - thread->ptrace_bps[i] = bp; } /* * Make a second pass to free the remaining unused breakpoints @@ -721,9 +712,10 @@ static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr, { struct perf_event *bp; struct thread_struct *t = &tsk->thread; - DEFINE_BREAKPOINT_ATTR(attr); + struct perf_event_attr attr; if (!t->ptrace_bps[nr]) { + hw_breakpoint_init(&attr); /* * Put stub len and type to register (reserve) an inactive but * correct bp @@ -734,26 +726,32 @@ static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr, attr.disabled = 1; bp = register_user_hw_breakpoint(&attr, ptrace_triggered, tsk); + + /* + * CHECKME: the previous code returned -EIO if the addr wasn't + * a valid task virtual addr. The new one will return -EINVAL in + * this case. + * -EINVAL may be what we want for in-kernel breakpoints users, + * but -EIO looks better for ptrace, since we refuse a register + * writing for the user. And anyway this is the previous + * behaviour. + */ + if (IS_ERR(bp)) + return PTR_ERR(bp); + + t->ptrace_bps[nr] = bp; } else { + int err; + bp = t->ptrace_bps[nr]; - t->ptrace_bps[nr] = NULL; attr = bp->attr; attr.bp_addr = addr; - bp = modify_user_hw_breakpoint(bp, &attr, bp->callback, tsk); + err = modify_user_hw_breakpoint(bp, &attr); + if (err) + return err; } - /* - * CHECKME: the previous code returned -EIO if the addr wasn't a - * valid task virtual addr. The new one will return -EINVAL in this - * case. - * -EINVAL may be what we want for in-kernel breakpoints users, but - * -EIO looks better for ptrace, since we refuse a register writing - * for the user. And anyway this is the previous behaviour. - */ - if (IS_ERR(bp)) - return PTR_ERR(bp); - t->ptrace_bps[nr] = bp; return 0; } diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 2b97fc5b124..1545bc0c984 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -259,6 +259,14 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { DMI_MATCH(DMI_PRODUCT_NAME, "SBC-FITPC2"), }, }, + { /* Handle problems with rebooting on ASUS P4S800 */ + .callback = set_bios_reboot, + .ident = "ASUS P4S800", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."), + DMI_MATCH(DMI_BOARD_NAME, "P4S800"), + }, + }, { } }; diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index d559af913e1..35abcb8b00e 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -1,3 +1,5 @@ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/kernel.h> #include <linux/module.h> #include <linux/init.h> @@ -20,9 +22,9 @@ #include <asm/stackprotector.h> #ifdef CONFIG_DEBUG_PER_CPU_MAPS -# define DBG(x...) printk(KERN_DEBUG x) +# define DBG(fmt, ...) pr_dbg(fmt, ##__VA_ARGS__) #else -# define DBG(x...) +# define DBG(fmt, ...) do { if (0) pr_dbg(fmt, ##__VA_ARGS__); } while (0) #endif DEFINE_PER_CPU(int, cpu_number); @@ -116,8 +118,8 @@ static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size, } else { ptr = __alloc_bootmem_node_nopanic(NODE_DATA(node), size, align, goal); - pr_debug("per cpu data for cpu%d %lu bytes on node%d at " - "%016lx\n", cpu, size, node, __pa(ptr)); + pr_debug("per cpu data for cpu%d %lu bytes on node%d at %016lx\n", + cpu, size, node, __pa(ptr)); } return ptr; #else @@ -198,8 +200,7 @@ void __init setup_per_cpu_areas(void) pcpu_cpu_distance, pcpu_fc_alloc, pcpu_fc_free); if (rc < 0) - pr_warning("PERCPU: %s allocator failed (%d), " - "falling back to page size\n", + pr_warning("%s allocator failed (%d), falling back to page size\n", pcpu_fc_names[pcpu_chosen_fc], rc); } if (rc < 0) diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index fab7440c9bb..296aba49472 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -29,6 +29,8 @@ * Based on QEMU and Xen. */ +#define pr_fmt(fmt) "pit: " fmt + #include <linux/kvm_host.h> #include "irq.h" @@ -262,7 +264,7 @@ void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu) static void destroy_pit_timer(struct kvm_timer *pt) { - pr_debug("pit: execute del timer!\n"); + pr_debug("execute del timer!\n"); hrtimer_cancel(&pt->timer); } @@ -284,7 +286,7 @@ static void create_pit_timer(struct kvm_kpit_state *ps, u32 val, int is_period) interval = muldiv64(val, NSEC_PER_SEC, KVM_PIT_FREQ); - pr_debug("pit: create pit timer, interval is %llu nsec\n", interval); + pr_debug("create pit timer, interval is %llu nsec\n", interval); /* TODO The new value only affected after the retriggered */ hrtimer_cancel(&pt->timer); @@ -309,7 +311,7 @@ static void pit_load_count(struct kvm *kvm, int channel, u32 val) WARN_ON(!mutex_is_locked(&ps->lock)); - pr_debug("pit: load_count val is %d, channel is %d\n", val, channel); + pr_debug("load_count val is %d, channel is %d\n", val, channel); /* * The largest possible initial count is 0; this is equivalent @@ -395,8 +397,8 @@ static int pit_ioport_write(struct kvm_io_device *this, mutex_lock(&pit_state->lock); if (val != 0) - pr_debug("pit: write addr is 0x%x, len is %d, val is 0x%x\n", - (unsigned int)addr, len, val); + pr_debug("write addr is 0x%x, len is %d, val is 0x%x\n", + (unsigned int)addr, len, val); if (addr == 3) { channel = val >> 6; diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index a2d6472895f..45b20e486c2 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -5,7 +5,7 @@ inat_tables_script = $(srctree)/arch/x86/tools/gen-insn-attr-x86.awk inat_tables_maps = $(srctree)/arch/x86/lib/x86-opcode-map.txt quiet_cmd_inat_tables = GEN $@ - cmd_inat_tables = $(AWK) -f $(inat_tables_script) $(inat_tables_maps) > $@ + cmd_inat_tables = $(AWK) -f $(inat_tables_script) $(inat_tables_maps) > $@ || rm -f $@ $(obj)/inat-tables.c: $(inat_tables_script) $(inat_tables_maps) $(call cmd,inat_tables) @@ -20,7 +20,7 @@ lib-y := delay.o lib-y += thunk_$(BITS).o lib-y += usercopy_$(BITS).o getuser.o putuser.o lib-y += memcpy_$(BITS).o -lib-y += insn.o inat.o +lib-$(CONFIG_KPROBES) += insn.o inat.o obj-y += msr-reg.o msr-reg-export.o diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c index 07bcc309cfd..c0f6198565e 100644 --- a/arch/x86/mm/kmmio.c +++ b/arch/x86/mm/kmmio.c @@ -5,6 +5,8 @@ * 2008 Pekka Paalanen <pq@iki.fi> */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/list.h> #include <linux/rculist.h> #include <linux/spinlock.h> @@ -136,7 +138,7 @@ static int clear_page_presence(struct kmmio_fault_page *f, bool clear) pte_t *pte = lookup_address(f->page, &level); if (!pte) { - pr_err("kmmio: no pte for page 0x%08lx\n", f->page); + pr_err("no pte for page 0x%08lx\n", f->page); return -1; } @@ -148,7 +150,7 @@ static int clear_page_presence(struct kmmio_fault_page *f, bool clear) clear_pte_presence(pte, clear, &f->old_presence); break; default: - pr_err("kmmio: unexpected page level 0x%x.\n", level); + pr_err("unexpected page level 0x%x.\n", level); return -1; } @@ -170,13 +172,14 @@ static int clear_page_presence(struct kmmio_fault_page *f, bool clear) static int arm_kmmio_fault_page(struct kmmio_fault_page *f) { int ret; - WARN_ONCE(f->armed, KERN_ERR "kmmio page already armed.\n"); + WARN_ONCE(f->armed, KERN_ERR pr_fmt("kmmio page already armed.\n")); if (f->armed) { - pr_warning("kmmio double-arm: page 0x%08lx, ref %d, old %d\n", - f->page, f->count, !!f->old_presence); + pr_warning("double-arm: page 0x%08lx, ref %d, old %d\n", + f->page, f->count, !!f->old_presence); } ret = clear_page_presence(f, true); - WARN_ONCE(ret < 0, KERN_ERR "kmmio arming 0x%08lx failed.\n", f->page); + WARN_ONCE(ret < 0, KERN_ERR pr_fmt("arming 0x%08lx failed.\n"), + f->page); f->armed = true; return ret; } @@ -240,24 +243,21 @@ int kmmio_handler(struct pt_regs *regs, unsigned long addr) * condition needs handling by do_page_fault(), the * page really not being present is the most common. */ - pr_debug("kmmio: secondary hit for 0x%08lx CPU %d.\n", - addr, smp_processor_id()); + pr_debug("secondary hit for 0x%08lx CPU %d.\n", + addr, smp_processor_id()); if (!faultpage->old_presence) - pr_info("kmmio: unexpected secondary hit for " - "address 0x%08lx on CPU %d.\n", addr, - smp_processor_id()); + pr_info("unexpected secondary hit for address 0x%08lx on CPU %d.\n", + addr, smp_processor_id()); } else { /* * Prevent overwriting already in-flight context. * This should not happen, let's hope disarming at * least prevents a panic. */ - pr_emerg("kmmio: recursive probe hit on CPU %d, " - "for address 0x%08lx. Ignoring.\n", - smp_processor_id(), addr); - pr_emerg("kmmio: previous hit was at 0x%08lx.\n", - ctx->addr); + pr_emerg("recursive probe hit on CPU %d, for address 0x%08lx. Ignoring.\n", + smp_processor_id(), addr); + pr_emerg("previous hit was at 0x%08lx.\n", ctx->addr); disarm_kmmio_fault_page(faultpage); } goto no_kmmio_ctx; @@ -316,8 +316,8 @@ static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs) * something external causing them (f.e. using a debugger while * mmio tracing enabled), or erroneous behaviour */ - pr_warning("kmmio: unexpected debug trap on CPU %d.\n", - smp_processor_id()); + pr_warning("unexpected debug trap on CPU %d.\n", + smp_processor_id()); goto out; } @@ -425,7 +425,7 @@ int register_kmmio_probe(struct kmmio_probe *p) list_add_rcu(&p->list, &kmmio_probes); while (size < size_lim) { if (add_kmmio_fault_page(p->addr + size)) - pr_err("kmmio: Unable to set page fault.\n"); + pr_err("Unable to set page fault.\n"); size += PAGE_SIZE; } out: @@ -490,7 +490,7 @@ static void remove_kmmio_fault_pages(struct rcu_head *head) * 2. remove_kmmio_fault_pages() * Remove the pages from kmmio_page_table. * 3. rcu_free_kmmio_fault_pages() - * Actally free the kmmio_fault_page structs as with RCU. + * Actually free the kmmio_fault_page structs as with RCU. */ void unregister_kmmio_probe(struct kmmio_probe *p) { @@ -511,7 +511,7 @@ void unregister_kmmio_probe(struct kmmio_probe *p) drelease = kmalloc(sizeof(*drelease), GFP_ATOMIC); if (!drelease) { - pr_crit("kmmio: leaking kmmio_fault_page objects.\n"); + pr_crit("leaking kmmio_fault_page objects.\n"); return; } drelease->release_list = release_list; diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c index 132772a8ec5..4c765e9c466 100644 --- a/arch/x86/mm/mmio-mod.c +++ b/arch/x86/mm/mmio-mod.c @@ -19,6 +19,9 @@ * * Derived from the read-mod example from relay-examples by Tom Zanussi. */ + +#define pr_fmt(fmt) "mmiotrace: " + #define DEBUG 1 #include <linux/module.h> @@ -36,8 +39,6 @@ #include "pf_in.h" -#define NAME "mmiotrace: " - struct trap_reason { unsigned long addr; unsigned long ip; @@ -96,17 +97,18 @@ static void print_pte(unsigned long address) pte_t *pte = lookup_address(address, &level); if (!pte) { - pr_err(NAME "Error in %s: no pte for page 0x%08lx\n", - __func__, address); + pr_err("Error in %s: no pte for page 0x%08lx\n", + __func__, address); return; } if (level == PG_LEVEL_2M) { - pr_emerg(NAME "4MB pages are not currently supported: " - "0x%08lx\n", address); + pr_emerg("4MB pages are not currently supported: 0x%08lx\n", + address); BUG(); } - pr_info(NAME "pte for 0x%lx: 0x%llx 0x%llx\n", address, + pr_info("pte for 0x%lx: 0x%llx 0x%llx\n", + address, (unsigned long long)pte_val(*pte), (unsigned long long)pte_val(*pte) & _PAGE_PRESENT); } @@ -118,22 +120,21 @@ static void print_pte(unsigned long address) static void die_kmmio_nesting_error(struct pt_regs *regs, unsigned long addr) { const struct trap_reason *my_reason = &get_cpu_var(pf_reason); - pr_emerg(NAME "unexpected fault for address: 0x%08lx, " - "last fault for address: 0x%08lx\n", - addr, my_reason->addr); + pr_emerg("unexpected fault for address: 0x%08lx, last fault for address: 0x%08lx\n", + addr, my_reason->addr); print_pte(addr); print_symbol(KERN_EMERG "faulting IP is at %s\n", regs->ip); print_symbol(KERN_EMERG "last faulting IP was at %s\n", my_reason->ip); #ifdef __i386__ pr_emerg("eax: %08lx ebx: %08lx ecx: %08lx edx: %08lx\n", - regs->ax, regs->bx, regs->cx, regs->dx); + regs->ax, regs->bx, regs->cx, regs->dx); pr_emerg("esi: %08lx edi: %08lx ebp: %08lx esp: %08lx\n", - regs->si, regs->di, regs->bp, regs->sp); + regs->si, regs->di, regs->bp, regs->sp); #else pr_emerg("rax: %016lx rcx: %016lx rdx: %016lx\n", - regs->ax, regs->cx, regs->dx); + regs->ax, regs->cx, regs->dx); pr_emerg("rsi: %016lx rdi: %016lx rbp: %016lx rsp: %016lx\n", - regs->si, regs->di, regs->bp, regs->sp); + regs->si, regs->di, regs->bp, regs->sp); #endif put_cpu_var(pf_reason); BUG(); @@ -213,7 +214,7 @@ static void post(struct kmmio_probe *p, unsigned long condition, /* this should always return the active_trace count to 0 */ my_reason->active_traces--; if (my_reason->active_traces) { - pr_emerg(NAME "unexpected post handler"); + pr_emerg("unexpected post handler"); BUG(); } @@ -244,7 +245,7 @@ static void ioremap_trace_core(resource_size_t offset, unsigned long size, }; if (!trace) { - pr_err(NAME "kmalloc failed in ioremap\n"); + pr_err("kmalloc failed in ioremap\n"); return; } @@ -282,8 +283,8 @@ void mmiotrace_ioremap(resource_size_t offset, unsigned long size, if (!is_enabled()) /* recheck and proper locking in *_core() */ return; - pr_debug(NAME "ioremap_*(0x%llx, 0x%lx) = %p\n", - (unsigned long long)offset, size, addr); + pr_debug("ioremap_*(0x%llx, 0x%lx) = %p\n", + (unsigned long long)offset, size, addr); if ((filter_offset) && (offset != filter_offset)) return; ioremap_trace_core(offset, size, addr); @@ -301,7 +302,7 @@ static void iounmap_trace_core(volatile void __iomem *addr) struct remap_trace *tmp; struct remap_trace *found_trace = NULL; - pr_debug(NAME "Unmapping %p.\n", addr); + pr_debug("Unmapping %p.\n", addr); spin_lock_irq(&trace_lock); if (!is_enabled()) @@ -363,9 +364,8 @@ static void clear_trace_list(void) * Caller also ensures is_enabled() cannot change. */ list_for_each_entry(trace, &trace_list, list) { - pr_notice(NAME "purging non-iounmapped " - "trace @0x%08lx, size 0x%lx.\n", - trace->probe.addr, trace->probe.len); + pr_notice("purging non-iounmapped trace @0x%08lx, size 0x%lx.\n", + trace->probe.addr, trace->probe.len); if (!nommiotrace) unregister_kmmio_probe(&trace->probe); } @@ -387,7 +387,7 @@ static void enter_uniprocessor(void) if (downed_cpus == NULL && !alloc_cpumask_var(&downed_cpus, GFP_KERNEL)) { - pr_notice(NAME "Failed to allocate mask\n"); + pr_notice("Failed to allocate mask\n"); goto out; } @@ -395,20 +395,19 @@ static void enter_uniprocessor(void) cpumask_copy(downed_cpus, cpu_online_mask); cpumask_clear_cpu(cpumask_first(cpu_online_mask), downed_cpus); if (num_online_cpus() > 1) - pr_notice(NAME "Disabling non-boot CPUs...\n"); + pr_notice("Disabling non-boot CPUs...\n"); put_online_cpus(); for_each_cpu(cpu, downed_cpus) { err = cpu_down(cpu); if (!err) - pr_info(NAME "CPU%d is down.\n", cpu); + pr_info("CPU%d is down.\n", cpu); else - pr_err(NAME "Error taking CPU%d down: %d\n", cpu, err); + pr_err("Error taking CPU%d down: %d\n", cpu, err); } out: if (num_online_cpus() > 1) - pr_warning(NAME "multiple CPUs still online, " - "may miss events.\n"); + pr_warning("multiple CPUs still online, may miss events.\n"); } /* __ref because leave_uniprocessor calls cpu_up which is __cpuinit, @@ -420,13 +419,13 @@ static void __ref leave_uniprocessor(void) if (downed_cpus == NULL || cpumask_weight(downed_cpus) == 0) return; - pr_notice(NAME "Re-enabling CPUs...\n"); + pr_notice("Re-enabling CPUs...\n"); for_each_cpu(cpu, downed_cpus) { err = cpu_up(cpu); if (!err) - pr_info(NAME "enabled CPU%d.\n", cpu); + pr_info("enabled CPU%d.\n", cpu); else - pr_err(NAME "cannot re-enable CPU%d: %d\n", cpu, err); + pr_err("cannot re-enable CPU%d: %d\n", cpu, err); } } @@ -434,8 +433,8 @@ static void __ref leave_uniprocessor(void) static void enter_uniprocessor(void) { if (num_online_cpus() > 1) - pr_warning(NAME "multiple CPUs are online, may miss events. " - "Suggest booting with maxcpus=1 kernel argument.\n"); + pr_warning("multiple CPUs are online, may miss events. " + "Suggest booting with maxcpus=1 kernel argument.\n"); } static void leave_uniprocessor(void) @@ -450,13 +449,13 @@ void enable_mmiotrace(void) goto out; if (nommiotrace) - pr_info(NAME "MMIO tracing disabled.\n"); + pr_info("MMIO tracing disabled.\n"); kmmio_init(); enter_uniprocessor(); spin_lock_irq(&trace_lock); atomic_inc(&mmiotrace_enabled); spin_unlock_irq(&trace_lock); - pr_info(NAME "enabled.\n"); + pr_info("enabled.\n"); out: mutex_unlock(&mmiotrace_mutex); } @@ -475,7 +474,7 @@ void disable_mmiotrace(void) clear_trace_list(); /* guarantees: no more kmmio callbacks */ leave_uniprocessor(); kmmio_cleanup(); - pr_info(NAME "disabled.\n"); + pr_info("disabled.\n"); out: mutex_unlock(&mmiotrace_mutex); } diff --git a/arch/x86/tools/test_get_len.c b/arch/x86/tools/test_get_len.c index d8214dc03fa..bee8d6ac269 100644 --- a/arch/x86/tools/test_get_len.c +++ b/arch/x86/tools/test_get_len.c @@ -113,7 +113,7 @@ int main(int argc, char **argv) char line[BUFSIZE], sym[BUFSIZE] = "<unknown>"; unsigned char insn_buf[16]; struct insn insn; - int insns = 0, c; + int insns = 0; int warnings = 0; parse_args(argc, argv); diff --git a/drivers/char/lp.c b/drivers/char/lp.c index e444c2dba16..938a3a27388 100644 --- a/drivers/char/lp.c +++ b/drivers/char/lp.c @@ -127,6 +127,7 @@ #include <linux/wait.h> #include <linux/jiffies.h> #include <linux/smp_lock.h> +#include <linux/compat.h> #include <linux/parport.h> #undef LP_STATS @@ -571,13 +572,11 @@ static int lp_release(struct inode * inode, struct file * file) return 0; } -static int lp_ioctl(struct inode *inode, struct file *file, - unsigned int cmd, unsigned long arg) +static int lp_do_ioctl(unsigned int minor, unsigned int cmd, + unsigned long arg, void __user *argp) { - unsigned int minor = iminor(inode); int status; int retval = 0; - void __user *argp = (void __user *)arg; #ifdef LP_DEBUG printk(KERN_DEBUG "lp%d ioctl, cmd: 0x%x, arg: 0x%lx\n", minor, cmd, arg); @@ -587,9 +586,6 @@ static int lp_ioctl(struct inode *inode, struct file *file, if ((LP_F(minor) & LP_EXIST) == 0) return -ENODEV; switch ( cmd ) { - struct timeval par_timeout; - long to_jiffies; - case LPTIME: LP_TIME(minor) = arg * HZ/100; break; @@ -652,34 +648,101 @@ static int lp_ioctl(struct inode *inode, struct file *file, return -EFAULT; break; - case LPSETTIMEOUT: - if (copy_from_user (&par_timeout, argp, - sizeof (struct timeval))) { - return -EFAULT; - } - /* Convert to jiffies, place in lp_table */ - if ((par_timeout.tv_sec < 0) || - (par_timeout.tv_usec < 0)) { - return -EINVAL; - } - to_jiffies = DIV_ROUND_UP(par_timeout.tv_usec, 1000000/HZ); - to_jiffies += par_timeout.tv_sec * (long) HZ; - if (to_jiffies <= 0) { - return -EINVAL; - } - lp_table[minor].timeout = to_jiffies; - break; - default: retval = -EINVAL; } return retval; } +static int lp_set_timeout(unsigned int minor, struct timeval *par_timeout) +{ + long to_jiffies; + + /* Convert to jiffies, place in lp_table */ + if ((par_timeout->tv_sec < 0) || + (par_timeout->tv_usec < 0)) { + return -EINVAL; + } + to_jiffies = DIV_ROUND_UP(par_timeout->tv_usec, 1000000/HZ); + to_jiffies += par_timeout->tv_sec * (long) HZ; + if (to_jiffies <= 0) { + return -EINVAL; + } + lp_table[minor].timeout = to_jiffies; + return 0; +} + +static long lp_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + unsigned int minor; + struct timeval par_timeout; + int ret; + + minor = iminor(file->f_path.dentry->d_inode); + lock_kernel(); + switch (cmd) { + case LPSETTIMEOUT: + if (copy_from_user(&par_timeout, (void __user *)arg, + sizeof (struct timeval))) { + ret = -EFAULT; + break; + } + ret = lp_set_timeout(minor, &par_timeout); + break; + default: + ret = lp_do_ioctl(minor, cmd, arg, (void __user *)arg); + break; + } + unlock_kernel(); + + return ret; +} + +#ifdef CONFIG_COMPAT +static long lp_compat_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + unsigned int minor; + struct timeval par_timeout; + struct compat_timeval __user *tc; + int ret; + + minor = iminor(file->f_path.dentry->d_inode); + lock_kernel(); + switch (cmd) { + case LPSETTIMEOUT: + tc = compat_ptr(arg); + if (get_user(par_timeout.tv_sec, &tc->tv_sec) || + get_user(par_timeout.tv_usec, &tc->tv_usec)) { + ret = -EFAULT; + break; + } + ret = lp_set_timeout(minor, &par_timeout); + break; +#ifdef LP_STATS + case LPGETSTATS: + /* FIXME: add an implementation if you set LP_STATS */ + ret = -EINVAL; + break; +#endif + default: + ret = lp_do_ioctl(minor, cmd, arg, compat_ptr(arg)); + break; + } + unlock_kernel(); + + return ret; +} +#endif + static const struct file_operations lp_fops = { .owner = THIS_MODULE, .write = lp_write, - .ioctl = lp_ioctl, + .unlocked_ioctl = lp_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = lp_compat_ioctl, +#endif .open = lp_open, .release = lp_release, #ifdef CONFIG_PARPORT_1284 diff --git a/drivers/mmc/host/msm_sdcc.c b/drivers/mmc/host/msm_sdcc.c index dba4600bcdb..b31946e0b4c 100644 --- a/drivers/mmc/host/msm_sdcc.c +++ b/drivers/mmc/host/msm_sdcc.c @@ -38,10 +38,9 @@ #include <asm/div64.h> #include <asm/sizes.h> -#include <asm/mach/mmc.h> +#include <mach/mmc.h> #include <mach/msm_iomap.h> #include <mach/dma.h> -#include <mach/htc_pwrsink.h> #include "msm_sdcc.h" @@ -775,13 +774,11 @@ msmsdcc_set_ios(struct mmc_host *mmc, struct mmc_ios *ios) switch (ios->power_mode) { case MMC_POWER_OFF: - htc_pwrsink_set(PWRSINK_SDCARD, 0); break; case MMC_POWER_UP: pwr |= MCI_PWR_UP; break; case MMC_POWER_ON: - htc_pwrsink_set(PWRSINK_SDCARD, 100); pwr |= MCI_PWR_ON; break; } diff --git a/drivers/usb/core/devio.c b/drivers/usb/core/devio.c index 181f78c8410..6e8bcdfd23b 100644 --- a/drivers/usb/core/devio.c +++ b/drivers/usb/core/devio.c @@ -1388,6 +1388,46 @@ static int proc_reapurbnonblock(struct dev_state *ps, void __user *arg) } #ifdef CONFIG_COMPAT +static int proc_control_compat(struct dev_state *ps, + struct usbdevfs_ctrltransfer32 __user *p32) +{ + struct usbdevfs_ctrltransfer __user *p; + __u32 udata; + p = compat_alloc_user_space(sizeof(*p)); + if (copy_in_user(p, p32, (sizeof(*p32) - sizeof(compat_caddr_t))) || + get_user(udata, &p32->data) || + put_user(compat_ptr(udata), &p->data)) + return -EFAULT; + return proc_control(ps, p); +} + +static int proc_bulk_compat(struct dev_state *ps, + struct usbdevfs_bulktransfer32 __user *p32) +{ + struct usbdevfs_bulktransfer __user *p; + compat_uint_t n; + compat_caddr_t addr; + + p = compat_alloc_user_space(sizeof(*p)); + + if (get_user(n, &p32->ep) || put_user(n, &p->ep) || + get_user(n, &p32->len) || put_user(n, &p->len) || + get_user(n, &p32->timeout) || put_user(n, &p->timeout) || + get_user(addr, &p32->data) || put_user(compat_ptr(addr), &p->data)) + return -EFAULT; + + return proc_bulk(ps, p); +} +static int proc_disconnectsignal_compat(struct dev_state *ps, void __user *arg) +{ + struct usbdevfs_disconnectsignal32 ds; + + if (copy_from_user(&ds, arg, sizeof(ds))) + return -EFAULT; + ps->discsignr = ds.signr; + ps->disccontext = compat_ptr(ds.context); + return 0; +} static int get_urb32(struct usbdevfs_urb *kurb, struct usbdevfs_urb32 __user *uurb) @@ -1482,6 +1522,7 @@ static int proc_reapurbnonblock_compat(struct dev_state *ps, void __user *arg) return processcompl_compat(as, (void __user * __user *)arg); } + #endif static int proc_disconnectsignal(struct dev_state *ps, void __user *arg) @@ -1648,12 +1689,12 @@ static int proc_release_port(struct dev_state *ps, void __user *arg) * are assuming that somehow the configuration has been prevented from * changing. But there's no mechanism to ensure that... */ -static int usbdev_ioctl(struct inode *inode, struct file *file, - unsigned int cmd, unsigned long arg) +static long usbdev_do_ioctl(struct file *file, unsigned int cmd, + void __user *p) { struct dev_state *ps = file->private_data; + struct inode *inode = file->f_path.dentry->d_inode; struct usb_device *dev = ps->dev; - void __user *p = (void __user *)arg; int ret = -ENOTTY; if (!(file->f_mode & FMODE_WRITE)) @@ -1726,6 +1767,24 @@ static int usbdev_ioctl(struct inode *inode, struct file *file, break; #ifdef CONFIG_COMPAT + case USBDEVFS_CONTROL32: + snoop(&dev->dev, "%s: CONTROL32\n", __func__); + ret = proc_control_compat(ps, p); + if (ret >= 0) + inode->i_mtime = CURRENT_TIME; + break; + + case USBDEVFS_BULK32: + snoop(&dev->dev, "%s: BULK32\n", __func__); + ret = proc_bulk_compat(ps, p); + if (ret >= 0) + inode->i_mtime = CURRENT_TIME; + break; + + case USBDEVFS_DISCSIGNAL32: + snoop(&dev->dev, "%s: DISCSIGNAL32\n", __func__); + ret = proc_disconnectsignal_compat(ps, p); + break; case USBDEVFS_SUBMITURB32: snoop(&dev->dev, "%s: SUBMITURB32\n", __func__); @@ -1745,7 +1804,7 @@ static int usbdev_ioctl(struct inode *inode, struct file *file, break; case USBDEVFS_IOCTL32: - snoop(&dev->dev, "%s: IOCTL\n", __func__); + snoop(&dev->dev, "%s: IOCTL32\n", __func__); ret = proc_ioctl_compat(ps, ptr_to_compat(p)); break; #endif @@ -1801,6 +1860,32 @@ static int usbdev_ioctl(struct inode *inode, struct file *file, return ret; } +static long usbdev_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + int ret; + + lock_kernel(); + ret = usbdev_do_ioctl(file, cmd, (void __user *)arg); + unlock_kernel(); + + return ret; +} + +#ifdef CONFIG_COMPAT +static long usbdev_compat_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + int ret; + + lock_kernel(); + ret = usbdev_do_ioctl(file, cmd, compat_ptr(arg)); + unlock_kernel(); + + return ret; +} +#endif + /* No kernel lock - fine */ static unsigned int usbdev_poll(struct file *file, struct poll_table_struct *wait) @@ -1817,13 +1902,16 @@ static unsigned int usbdev_poll(struct file *file, } const struct file_operations usbdev_file_operations = { - .owner = THIS_MODULE, - .llseek = usbdev_lseek, - .read = usbdev_read, - .poll = usbdev_poll, - .ioctl = usbdev_ioctl, - .open = usbdev_open, - .release = usbdev_release, + .owner = THIS_MODULE, + .llseek = usbdev_lseek, + .read = usbdev_read, + .poll = usbdev_poll, + .unlocked_ioctl = usbdev_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = usbdev_compat_ioctl, +#endif + .open = usbdev_open, + .release = usbdev_release, }; static void usbdev_remove(struct usb_device *udev) diff --git a/drivers/video/Kconfig b/drivers/video/Kconfig index bb5fbed89e7..99c0df1c7eb 100644 --- a/drivers/video/Kconfig +++ b/drivers/video/Kconfig @@ -2131,7 +2131,7 @@ config FB_PRE_INIT_FB the bootloader. config FB_MSM - tristate + tristate "MSM Framebuffer support" depends on FB && ARCH_MSM select FB_CFB_FILLRECT select FB_CFB_COPYAREA diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index 2346895b3a7..278020d2449 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -111,43 +111,40 @@ #include <linux/dvb/frontend.h> #include <linux/dvb/video.h> +#include <linux/sort.h> + #ifdef CONFIG_SPARC #include <asm/fbio.h> #endif -static int do_ioctl32_pointer(unsigned int fd, unsigned int cmd, - unsigned long arg, struct file *f) -{ - return sys_ioctl(fd, cmd, (unsigned long)compat_ptr(arg)); -} - -static int w_long(unsigned int fd, unsigned int cmd, unsigned long arg) +static int w_long(unsigned int fd, unsigned int cmd, + compat_ulong_t __user *argp) { mm_segment_t old_fs = get_fs(); int err; unsigned long val; - + set_fs (KERNEL_DS); err = sys_ioctl(fd, cmd, (unsigned long)&val); set_fs (old_fs); - if (!err && put_user(val, (u32 __user *)compat_ptr(arg))) + if (!err && put_user(val, argp)) return -EFAULT; return err; } - -static int rw_long(unsigned int fd, unsigned int cmd, unsigned long arg) + +static int rw_long(unsigned int fd, unsigned int cmd, + compat_ulong_t __user *argp) { mm_segment_t old_fs = get_fs(); - u32 __user *argptr = compat_ptr(arg); int err; unsigned long val; - - if(get_user(val, argptr)) + + if(get_user(val, argp)) return -EFAULT; set_fs (KERNEL_DS); err = sys_ioctl(fd, cmd, (unsigned long)&val); set_fs (old_fs); - if (!err && put_user(val, argptr)) + if (!err && put_user(val, argp)) return -EFAULT; return err; } @@ -161,7 +158,8 @@ struct compat_video_event { } u; }; -static int do_video_get_event(unsigned int fd, unsigned int cmd, unsigned long arg) +static int do_video_get_event(unsigned int fd, unsigned int cmd, + struct compat_video_event __user *up) { struct video_event kevent; mm_segment_t old_fs = get_fs(); @@ -172,8 +170,6 @@ static int do_video_get_event(unsigned int fd, unsigned int cmd, unsigned long a set_fs(old_fs); if (!err) { - struct compat_video_event __user *up = compat_ptr(arg); - err = put_user(kevent.type, &up->type); err |= put_user(kevent.timestamp, &up->timestamp); err |= put_user(kevent.u.size.w, &up->u.size.w); @@ -192,15 +188,14 @@ struct compat_video_still_picture { int32_t size; }; -static int do_video_stillpicture(unsigned int fd, unsigned int cmd, unsigned long arg) +static int do_video_stillpicture(unsigned int fd, unsigned int cmd, + struct compat_video_still_picture __user *up) { - struct compat_video_still_picture __user *up; struct video_still_picture __user *up_native; compat_uptr_t fp; int32_t size; int err; - up = (struct compat_video_still_picture __user *) arg; err = get_user(fp, &up->iFrame); err |= get_user(size, &up->size); if (err) @@ -224,14 +219,13 @@ struct compat_video_spu_palette { compat_uptr_t palette; }; -static int do_video_set_spu_palette(unsigned int fd, unsigned int cmd, unsigned long arg) +static int do_video_set_spu_palette(unsigned int fd, unsigned int cmd, + struct compat_video_spu_palette __user *up) { - struct compat_video_spu_palette __user *up; struct video_spu_palette __user *up_native; compat_uptr_t palp; int length, err; - up = (struct compat_video_spu_palette __user *) arg; err = get_user(palp, &up->palette); err |= get_user(length, &up->length); @@ -299,16 +293,15 @@ static int sg_build_iovec(sg_io_hdr_t __user *sgio, void __user *dxferp, u16 iov return 0; } -static int sg_ioctl_trans(unsigned int fd, unsigned int cmd, unsigned long arg) +static int sg_ioctl_trans(unsigned int fd, unsigned int cmd, + sg_io_hdr32_t __user *sgio32) { sg_io_hdr_t __user *sgio; - sg_io_hdr32_t __user *sgio32; u16 iovec_count; u32 data; void __user *dxferp; int err; - sgio32 = compat_ptr(arg); if (get_user(iovec_count, &sgio32->iovec_count)) return -EFAULT; @@ -398,11 +391,11 @@ struct compat_sg_req_info { /* used by SG_GET_REQUEST_TABLE ioctl() */ int unused; }; -static int sg_grt_trans(unsigned int fd, unsigned int cmd, unsigned long arg) +static int sg_grt_trans(unsigned int fd, unsigned int cmd, struct + compat_sg_req_info __user *o) { int err, i; sg_req_info_t __user *r; - struct compat_sg_req_info __user *o = (void __user *)arg; r = compat_alloc_user_space(sizeof(sg_req_info_t)*SG_MAX_QUEUE); err = sys_ioctl(fd,cmd,(unsigned long)r); if (err < 0) @@ -430,9 +423,9 @@ struct sock_fprog32 { #define PPPIOCSPASS32 _IOW('t', 71, struct sock_fprog32) #define PPPIOCSACTIVE32 _IOW('t', 70, struct sock_fprog32) -static int ppp_sock_fprog_ioctl_trans(unsigned int fd, unsigned int cmd, unsigned long arg) +static int ppp_sock_fprog_ioctl_trans(unsigned int fd, unsigned int cmd, + struct sock_fprog32 __user *u_fprog32) { - struct sock_fprog32 __user *u_fprog32 = compat_ptr(arg); struct sock_fprog __user *u_fprog64 = compat_alloc_user_space(sizeof(struct sock_fprog)); void __user *fptr64; u32 fptr32; @@ -469,15 +462,14 @@ struct ppp_idle32 { }; #define PPPIOCGIDLE32 _IOR('t', 63, struct ppp_idle32) -static int ppp_gidle(unsigned int fd, unsigned int cmd, unsigned long arg) +static int ppp_gidle(unsigned int fd, unsigned int cmd, + struct ppp_idle32 __user *idle32) { struct ppp_idle __user *idle; - struct ppp_idle32 __user *idle32; __kernel_time_t xmit, recv; int err; idle = compat_alloc_user_space(sizeof(*idle)); - idle32 = compat_ptr(arg); err = sys_ioctl(fd, PPPIOCGIDLE, (unsigned long) idle); @@ -491,15 +483,14 @@ static int ppp_gidle(unsigned int fd, unsigned int cmd, unsigned long arg) return err; } -static int ppp_scompress(unsigned int fd, unsigned int cmd, unsigned long arg) +static int ppp_scompress(unsigned int fd, unsigned int cmd, + struct ppp_option_data32 __user *odata32) { struct ppp_option_data __user *odata; - struct ppp_option_data32 __user *odata32; __u32 data; void __user *datap; odata = compat_alloc_user_space(sizeof(*odata)); - odata32 = compat_ptr(arg); if (get_user(data, &odata32->ptr)) return -EFAULT; @@ -515,35 +506,6 @@ static int ppp_scompress(unsigned int fd, unsigned int cmd, unsigned long arg) return sys_ioctl(fd, PPPIOCSCOMPRESS, (unsigned long) odata); } -static int ppp_ioctl_trans(unsigned int fd, unsigned int cmd, unsigned long arg) -{ - int err; - - switch (cmd) { - case PPPIOCGIDLE32: - err = ppp_gidle(fd, cmd, arg); - break; - - case PPPIOCSCOMPRESS32: - err = ppp_scompress(fd, cmd, arg); - break; - - default: - do { - static int count; - if (++count <= 20) - printk("ppp_ioctl: Unknown cmd fd(%d) " - "cmd(%08x) arg(%08x)\n", - (int)fd, (unsigned int)cmd, (unsigned int)arg); - } while(0); - err = -EINVAL; - break; - }; - - return err; -} - - #ifdef CONFIG_BLOCK struct mtget32 { compat_long_t mt_type; @@ -561,7 +523,7 @@ struct mtpos32 { }; #define MTIOCPOS32 _IOR('m', 3, struct mtpos32) -static int mt_ioctl_trans(unsigned int fd, unsigned int cmd, unsigned long arg) +static int mt_ioctl_trans(unsigned int fd, unsigned int cmd, void __user *argp) { mm_segment_t old_fs = get_fs(); struct mtget get; @@ -581,15 +543,6 @@ static int mt_ioctl_trans(unsigned int fd, unsigned int cmd, unsigned long arg) kcmd = MTIOCGET; karg = &get; break; - default: - do { - static int count; - if (++count <= 20) - printk("mt_ioctl: Unknown cmd fd(%d) " - "cmd(%08x) arg(%08x)\n", - (int)fd, (unsigned int)cmd, (unsigned int)arg); - } while(0); - return -EINVAL; } set_fs (KERNEL_DS); err = sys_ioctl (fd, kcmd, (unsigned long)karg); @@ -598,11 +551,11 @@ static int mt_ioctl_trans(unsigned int fd, unsigned int cmd, unsigned long arg) return err; switch (cmd) { case MTIOCPOS32: - upos32 = compat_ptr(arg); + upos32 = argp; err = __put_user(pos.mt_blkno, &upos32->mt_blkno); break; case MTIOCGET32: - umget32 = compat_ptr(arg); + umget32 = argp; err = __put_user(get.mt_type, &umget32->mt_type); err |= __put_user(get.mt_resid, &umget32->mt_resid); err |= __put_user(get.mt_dsreg, &umget32->mt_dsreg); @@ -617,162 +570,8 @@ static int mt_ioctl_trans(unsigned int fd, unsigned int cmd, unsigned long arg) #endif /* CONFIG_BLOCK */ -#ifdef CONFIG_VT - -static int vt_check(struct file *file) -{ - struct tty_struct *tty; - struct inode *inode = file->f_path.dentry->d_inode; - struct vc_data *vc; - - if (file->f_op->unlocked_ioctl != tty_ioctl) - return -EINVAL; - - tty = (struct tty_struct *)file->private_data; - if (tty_paranoia_check(tty, inode, "tty_ioctl")) - return -EINVAL; - - if (tty->ops->ioctl != vt_ioctl) - return -EINVAL; - - vc = (struct vc_data *)tty->driver_data; - if (!vc_cons_allocated(vc->vc_num)) /* impossible? */ - return -ENOIOCTLCMD; - - /* - * To have permissions to do most of the vt ioctls, we either have - * to be the owner of the tty, or have CAP_SYS_TTY_CONFIG. - */ - if (current->signal->tty == tty || capable(CAP_SYS_TTY_CONFIG)) - return 1; - return 0; -} - -struct consolefontdesc32 { - unsigned short charcount; /* characters in font (256 or 512) */ - unsigned short charheight; /* scan lines per character (1-32) */ - compat_caddr_t chardata; /* font data in expanded form */ -}; - -static int do_fontx_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg, struct file *file) -{ - struct consolefontdesc32 __user *user_cfd = compat_ptr(arg); - struct console_font_op op; - compat_caddr_t data; - int i, perm; - - perm = vt_check(file); - if (perm < 0) return perm; - - switch (cmd) { - case PIO_FONTX: - if (!perm) - return -EPERM; - op.op = KD_FONT_OP_SET; - op.flags = 0; - op.width = 8; - if (get_user(op.height, &user_cfd->charheight) || - get_user(op.charcount, &user_cfd->charcount) || - get_user(data, &user_cfd->chardata)) - return -EFAULT; - op.data = compat_ptr(data); - return con_font_op(vc_cons[fg_console].d, &op); - case GIO_FONTX: - op.op = KD_FONT_OP_GET; - op.flags = 0; - op.width = 8; - if (get_user(op.height, &user_cfd->charheight) || - get_user(op.charcount, &user_cfd->charcount) || - get_user(data, &user_cfd->chardata)) - return -EFAULT; - if (!data) - return 0; - op.data = compat_ptr(data); - i = con_font_op(vc_cons[fg_console].d, &op); - if (i) - return i; - if (put_user(op.height, &user_cfd->charheight) || - put_user(op.charcount, &user_cfd->charcount) || - put_user((compat_caddr_t)(unsigned long)op.data, - &user_cfd->chardata)) - return -EFAULT; - return 0; - } - return -EINVAL; -} - -struct console_font_op32 { - compat_uint_t op; /* operation code KD_FONT_OP_* */ - compat_uint_t flags; /* KD_FONT_FLAG_* */ - compat_uint_t width, height; /* font size */ - compat_uint_t charcount; - compat_caddr_t data; /* font data with height fixed to 32 */ -}; - -static int do_kdfontop_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg, struct file *file) -{ - struct console_font_op op; - struct console_font_op32 __user *fontop = compat_ptr(arg); - int perm = vt_check(file), i; - struct vc_data *vc; - - if (perm < 0) return perm; - - if (copy_from_user(&op, fontop, sizeof(struct console_font_op32))) - return -EFAULT; - if (!perm && op.op != KD_FONT_OP_GET) - return -EPERM; - op.data = compat_ptr(((struct console_font_op32 *)&op)->data); - op.flags |= KD_FONT_FLAG_OLD; - vc = ((struct tty_struct *)file->private_data)->driver_data; - i = con_font_op(vc, &op); - if (i) - return i; - ((struct console_font_op32 *)&op)->data = (unsigned long)op.data; - if (copy_to_user(fontop, &op, sizeof(struct console_font_op32))) - return -EFAULT; - return 0; -} - -struct unimapdesc32 { - unsigned short entry_ct; - compat_caddr_t entries; -}; - -static int do_unimap_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg, struct file *file) -{ - struct unimapdesc32 tmp; - struct unimapdesc32 __user *user_ud = compat_ptr(arg); - int perm = vt_check(file); - struct vc_data *vc; - - if (perm < 0) - return perm; - if (copy_from_user(&tmp, user_ud, sizeof tmp)) - return -EFAULT; - if (tmp.entries) - if (!access_ok(VERIFY_WRITE, compat_ptr(tmp.entries), - tmp.entry_ct*sizeof(struct unipair))) - return -EFAULT; - vc = ((struct tty_struct *)file->private_data)->driver_data; - switch (cmd) { - case PIO_UNIMAP: - if (!perm) - return -EPERM; - return con_set_unimap(vc, tmp.entry_ct, - compat_ptr(tmp.entries)); - case GIO_UNIMAP: - if (!perm && fg_console != vc->vc_num) - return -EPERM; - return con_get_unimap(vc, tmp.entry_ct, &(user_ud->entry_ct), - compat_ptr(tmp.entries)); - } - return 0; -} - -#endif /* CONFIG_VT */ - -static int do_smb_getmountuid(unsigned int fd, unsigned int cmd, unsigned long arg) +static int do_smb_getmountuid(unsigned int fd, unsigned int cmd, + compat_uid_t __user *argp) { mm_segment_t old_fs = get_fs(); __kernel_uid_t kuid; @@ -785,20 +584,15 @@ static int do_smb_getmountuid(unsigned int fd, unsigned int cmd, unsigned long a set_fs(old_fs); if (err >= 0) - err = put_user(kuid, (compat_uid_t __user *)compat_ptr(arg)); + err = put_user(kuid, argp); return err; } -static __used int -ret_einval(unsigned int fd, unsigned int cmd, unsigned long arg) -{ - return -EINVAL; -} - -static int ioc_settimeout(unsigned int fd, unsigned int cmd, unsigned long arg) +static int ioc_settimeout(unsigned int fd, unsigned int cmd, + compat_ulong_t __user *argp) { - return rw_long(fd, AUTOFS_IOC_SETTIMEOUT, arg); + return rw_long(fd, AUTOFS_IOC_SETTIMEOUT, argp); } /* Bluetooth ioctls */ @@ -856,7 +650,8 @@ static int set_raw32_request(struct raw_config_request *req, struct raw32_config return ret ? -EFAULT : 0; } -static int raw_ioctl(unsigned fd, unsigned cmd, unsigned long arg) +static int raw_ioctl(unsigned fd, unsigned cmd, + struct raw32_config_request __user *user_req) { int ret; @@ -864,7 +659,6 @@ static int raw_ioctl(unsigned fd, unsigned cmd, unsigned long arg) case RAW_SETBIND: case RAW_GETBIND: { struct raw_config_request req; - struct raw32_config_request __user *user_req = compat_ptr(arg); mm_segment_t oldfs = get_fs(); if ((ret = get_raw32_request(&req, user_req))) @@ -879,9 +673,6 @@ static int raw_ioctl(unsigned fd, unsigned cmd, unsigned long arg) } break; } - default: - ret = sys_ioctl(fd, cmd, arg); - break; } return ret; } @@ -909,11 +700,11 @@ struct serial_struct32 { compat_int_t reserved[1]; }; -static int serial_struct_ioctl(unsigned fd, unsigned cmd, unsigned long arg) +static int serial_struct_ioctl(unsigned fd, unsigned cmd, + struct serial_struct32 __user *ss32) { typedef struct serial_struct SS; typedef struct serial_struct32 SS32; - struct serial_struct32 __user *ss32 = compat_ptr(arg); int err; struct serial_struct ss; mm_segment_t oldseg = get_fs(); @@ -951,96 +742,6 @@ static int serial_struct_ioctl(unsigned fd, unsigned cmd, unsigned long arg) return err; } -struct usbdevfs_ctrltransfer32 { - u8 bRequestType; - u8 bRequest; - u16 wValue; - u16 wIndex; - u16 wLength; - u32 timeout; /* in milliseconds */ - compat_caddr_t data; -}; - -#define USBDEVFS_CONTROL32 _IOWR('U', 0, struct usbdevfs_ctrltransfer32) - -static int do_usbdevfs_control(unsigned int fd, unsigned int cmd, unsigned long arg) -{ - struct usbdevfs_ctrltransfer32 __user *p32 = compat_ptr(arg); - struct usbdevfs_ctrltransfer __user *p; - __u32 udata; - p = compat_alloc_user_space(sizeof(*p)); - if (copy_in_user(p, p32, (sizeof(*p32) - sizeof(compat_caddr_t))) || - get_user(udata, &p32->data) || - put_user(compat_ptr(udata), &p->data)) - return -EFAULT; - return sys_ioctl(fd, USBDEVFS_CONTROL, (unsigned long)p); -} - - -struct usbdevfs_bulktransfer32 { - compat_uint_t ep; - compat_uint_t len; - compat_uint_t timeout; /* in milliseconds */ - compat_caddr_t data; -}; - -#define USBDEVFS_BULK32 _IOWR('U', 2, struct usbdevfs_bulktransfer32) - -static int do_usbdevfs_bulk(unsigned int fd, unsigned int cmd, unsigned long arg) -{ - struct usbdevfs_bulktransfer32 __user *p32 = compat_ptr(arg); - struct usbdevfs_bulktransfer __user *p; - compat_uint_t n; - compat_caddr_t addr; - - p = compat_alloc_user_space(sizeof(*p)); - - if (get_user(n, &p32->ep) || put_user(n, &p->ep) || - get_user(n, &p32->len) || put_user(n, &p->len) || - get_user(n, &p32->timeout) || put_user(n, &p->timeout) || - get_user(addr, &p32->data) || put_user(compat_ptr(addr), &p->data)) - return -EFAULT; - - return sys_ioctl(fd, USBDEVFS_BULK, (unsigned long)p); -} - - -/* - * USBDEVFS_SUBMITURB, USBDEVFS_REAPURB and USBDEVFS_REAPURBNDELAY - * are handled in usbdevfs core. -Christopher Li - */ - -struct usbdevfs_disconnectsignal32 { - compat_int_t signr; - compat_caddr_t context; -}; - -#define USBDEVFS_DISCSIGNAL32 _IOR('U', 14, struct usbdevfs_disconnectsignal32) - -static int do_usbdevfs_discsignal(unsigned int fd, unsigned int cmd, unsigned long arg) -{ - struct usbdevfs_disconnectsignal kdis; - struct usbdevfs_disconnectsignal32 __user *udis; - mm_segment_t old_fs; - u32 uctx; - int err; - - udis = compat_ptr(arg); - - if (get_user(kdis.signr, &udis->signr) || - __get_user(uctx, &udis->context)) - return -EFAULT; - - kdis.context = compat_ptr(uctx); - - old_fs = get_fs(); - set_fs(KERNEL_DS); - err = sys_ioctl(fd, USBDEVFS_DISCSIGNAL, (unsigned long) &kdis); - set_fs(old_fs); - - return err; -} - /* * I2C layer ioctls */ @@ -1069,9 +770,9 @@ struct i2c_rdwr_aligned { struct i2c_msg msgs[0]; }; -static int do_i2c_rdwr_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg) +static int do_i2c_rdwr_ioctl(unsigned int fd, unsigned int cmd, + struct i2c_rdwr_ioctl_data32 __user *udata) { - struct i2c_rdwr_ioctl_data32 __user *udata = compat_ptr(arg); struct i2c_rdwr_aligned __user *tdata; struct i2c_msg __user *tmsgs; struct i2c_msg32 __user *umsgs; @@ -1105,10 +806,10 @@ static int do_i2c_rdwr_ioctl(unsigned int fd, unsigned int cmd, unsigned long ar return sys_ioctl(fd, cmd, (unsigned long)tdata); } -static int do_i2c_smbus_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg) +static int do_i2c_smbus_ioctl(unsigned int fd, unsigned int cmd, + struct i2c_smbus_ioctl_data32 __user *udata) { struct i2c_smbus_ioctl_data __user *tdata; - struct i2c_smbus_ioctl_data32 __user *udata; compat_caddr_t datap; tdata = compat_alloc_user_space(sizeof(*tdata)); @@ -1117,7 +818,6 @@ static int do_i2c_smbus_ioctl(unsigned int fd, unsigned int cmd, unsigned long a if (!access_ok(VERIFY_WRITE, tdata, sizeof(*tdata))) return -EFAULT; - udata = compat_ptr(arg); if (!access_ok(VERIFY_READ, udata, sizeof(*udata))) return -EFAULT; @@ -1137,7 +837,7 @@ static int do_i2c_smbus_ioctl(unsigned int fd, unsigned int cmd, unsigned long a #define RTC_EPOCH_READ32 _IOR('p', 0x0d, compat_ulong_t) #define RTC_EPOCH_SET32 _IOW('p', 0x0e, compat_ulong_t) -static int rtc_ioctl(unsigned fd, unsigned cmd, unsigned long arg) +static int rtc_ioctl(unsigned fd, unsigned cmd, void __user *argp) { mm_segment_t oldfs = get_fs(); compat_ulong_t val32; @@ -1155,29 +855,14 @@ static int rtc_ioctl(unsigned fd, unsigned cmd, unsigned long arg) if (ret) return ret; val32 = kval; - return put_user(val32, (unsigned int __user *)arg); + return put_user(val32, (unsigned int __user *)argp); case RTC_IRQP_SET32: - return sys_ioctl(fd, RTC_IRQP_SET, arg); + return sys_ioctl(fd, RTC_IRQP_SET, (unsigned long)argp); case RTC_EPOCH_SET32: - return sys_ioctl(fd, RTC_EPOCH_SET, arg); - default: - /* unreached */ - return -ENOIOCTLCMD; + return sys_ioctl(fd, RTC_EPOCH_SET, (unsigned long)argp); } -} -static int -lp_timeout_trans(unsigned int fd, unsigned int cmd, unsigned long arg) -{ - struct compat_timeval __user *tc = (struct compat_timeval __user *)arg; - struct timeval __user *tn = compat_alloc_user_space(sizeof(struct timeval)); - struct timeval ts; - if (get_user(ts.tv_sec, &tc->tv_sec) || - get_user(ts.tv_usec, &tc->tv_usec) || - put_user(ts.tv_sec, &tn->tv_sec) || - put_user(ts.tv_usec, &tn->tv_usec)) - return -EFAULT; - return sys_ioctl(fd, cmd, (unsigned long)tn); + return -ENOIOCTLCMD; } /* on ia32 l_start is on a 32-bit boundary */ @@ -1197,9 +882,9 @@ struct space_resv_32 { #define FS_IOC_RESVSP64_32 _IOW ('X', 42, struct space_resv_32) /* just account for different alignment */ -static int compat_ioctl_preallocate(struct file *file, unsigned long arg) +static int compat_ioctl_preallocate(struct file *file, + struct space_resv_32 __user *p32) { - struct space_resv_32 __user *p32 = compat_ptr(arg); struct space_resv __user *p = compat_alloc_user_space(sizeof(*p)); if (copy_in_user(&p->l_type, &p32->l_type, sizeof(s16)) || @@ -1215,27 +900,13 @@ static int compat_ioctl_preallocate(struct file *file, unsigned long arg) } #endif +/* + * simple reversible transform to make our table more evenly + * distributed after sorting. + */ +#define XFORM(i) (((i) ^ ((i) << 27) ^ ((i) << 17)) & 0xffffffff) -typedef int (*ioctl_trans_handler_t)(unsigned int, unsigned int, - unsigned long, struct file *); - -struct ioctl_trans { - unsigned long cmd; - ioctl_trans_handler_t handler; - struct ioctl_trans *next; -}; - -#define HANDLE_IOCTL(cmd,handler) \ - { (cmd), (ioctl_trans_handler_t)(handler) }, - -/* pointer to compatible structure or no argument */ -#define COMPATIBLE_IOCTL(cmd) \ - { (cmd), do_ioctl32_pointer }, - -/* argument is an unsigned long integer, not a pointer */ -#define ULONG_IOCTL(cmd) \ - { (cmd), (ioctl_trans_handler_t)sys_ioctl }, - +#define COMPATIBLE_IOCTL(cmd) XFORM(cmd), /* ioctl should not be warned about even if it's not implemented. Valid reasons to use this: - It is implemented with ->compat_ioctl on some device, but programs @@ -1245,7 +916,7 @@ struct ioctl_trans { Most other reasons are not valid. */ #define IGNORE_IOCTL(cmd) COMPATIBLE_IOCTL(cmd) -static struct ioctl_trans ioctl_start[] = { +static unsigned int ioctl_pointer[] = { /* compatible ioctls first */ COMPATIBLE_IOCTL(0x4B50) /* KDGHWCLK - not in the kernel, but don't complain */ COMPATIBLE_IOCTL(0x4B51) /* KDSHWCLK - not in the kernel, but don't complain */ @@ -1256,7 +927,6 @@ COMPATIBLE_IOCTL(TCSETA) COMPATIBLE_IOCTL(TCSETAW) COMPATIBLE_IOCTL(TCSETAF) COMPATIBLE_IOCTL(TCSBRK) -ULONG_IOCTL(TCSBRKP) COMPATIBLE_IOCTL(TCXONC) COMPATIBLE_IOCTL(TCFLSH) COMPATIBLE_IOCTL(TCGETS) @@ -1266,7 +936,6 @@ COMPATIBLE_IOCTL(TCSETSF) COMPATIBLE_IOCTL(TIOCLINUX) COMPATIBLE_IOCTL(TIOCSBRK) COMPATIBLE_IOCTL(TIOCCBRK) -ULONG_IOCTL(TIOCMIWAIT) COMPATIBLE_IOCTL(TIOCGICOUNT) /* Little t */ COMPATIBLE_IOCTL(TIOCGETD) @@ -1288,7 +957,6 @@ COMPATIBLE_IOCTL(TIOCSTI) COMPATIBLE_IOCTL(TIOCOUTQ) COMPATIBLE_IOCTL(TIOCSPGRP) COMPATIBLE_IOCTL(TIOCGPGRP) -ULONG_IOCTL(TIOCSCTTY) COMPATIBLE_IOCTL(TIOCGPTN) COMPATIBLE_IOCTL(TIOCSPTLCK) COMPATIBLE_IOCTL(TIOCSERGETLSR) @@ -1319,36 +987,21 @@ COMPATIBLE_IOCTL(PRINT_RAID_DEBUG) COMPATIBLE_IOCTL(RAID_AUTORUN) COMPATIBLE_IOCTL(CLEAR_ARRAY) COMPATIBLE_IOCTL(ADD_NEW_DISK) -ULONG_IOCTL(HOT_REMOVE_DISK) COMPATIBLE_IOCTL(SET_ARRAY_INFO) COMPATIBLE_IOCTL(SET_DISK_INFO) COMPATIBLE_IOCTL(WRITE_RAID_INFO) COMPATIBLE_IOCTL(UNPROTECT_ARRAY) COMPATIBLE_IOCTL(PROTECT_ARRAY) -ULONG_IOCTL(HOT_ADD_DISK) -ULONG_IOCTL(SET_DISK_FAULTY) COMPATIBLE_IOCTL(RUN_ARRAY) COMPATIBLE_IOCTL(STOP_ARRAY) COMPATIBLE_IOCTL(STOP_ARRAY_RO) COMPATIBLE_IOCTL(RESTART_ARRAY_RW) COMPATIBLE_IOCTL(GET_BITMAP_FILE) -ULONG_IOCTL(SET_BITMAP_FILE) -/* Big K */ -COMPATIBLE_IOCTL(PIO_FONT) -COMPATIBLE_IOCTL(GIO_FONT) -COMPATIBLE_IOCTL(PIO_CMAP) -COMPATIBLE_IOCTL(GIO_CMAP) -ULONG_IOCTL(KDSIGACCEPT) COMPATIBLE_IOCTL(KDGETKEYCODE) COMPATIBLE_IOCTL(KDSETKEYCODE) -ULONG_IOCTL(KIOCSOUND) -ULONG_IOCTL(KDMKTONE) COMPATIBLE_IOCTL(KDGKBTYPE) -ULONG_IOCTL(KDSETMODE) COMPATIBLE_IOCTL(KDGETMODE) -ULONG_IOCTL(KDSKBMODE) COMPATIBLE_IOCTL(KDGKBMODE) -ULONG_IOCTL(KDSKBMETA) COMPATIBLE_IOCTL(KDGKBMETA) COMPATIBLE_IOCTL(KDGKBENT) COMPATIBLE_IOCTL(KDSKBENT) @@ -1358,15 +1011,7 @@ COMPATIBLE_IOCTL(KDGKBDIACR) COMPATIBLE_IOCTL(KDSKBDIACR) COMPATIBLE_IOCTL(KDKBDREP) COMPATIBLE_IOCTL(KDGKBLED) -ULONG_IOCTL(KDSKBLED) COMPATIBLE_IOCTL(KDGETLED) -ULONG_IOCTL(KDSETLED) -COMPATIBLE_IOCTL(GIO_SCRNMAP) -COMPATIBLE_IOCTL(PIO_SCRNMAP) -COMPATIBLE_IOCTL(GIO_UNISCRNMAP) -COMPATIBLE_IOCTL(PIO_UNISCRNMAP) -COMPATIBLE_IOCTL(PIO_FONTRESET) -COMPATIBLE_IOCTL(PIO_UNIMAPCLR) #ifdef CONFIG_BLOCK /* Big S */ COMPATIBLE_IOCTL(SCSI_IOCTL_GET_IDLUN) @@ -1378,20 +1023,6 @@ COMPATIBLE_IOCTL(SCSI_IOCTL_SEND_COMMAND) COMPATIBLE_IOCTL(SCSI_IOCTL_PROBE_HOST) COMPATIBLE_IOCTL(SCSI_IOCTL_GET_PCI) #endif -/* Big V */ -COMPATIBLE_IOCTL(VT_SETMODE) -COMPATIBLE_IOCTL(VT_GETMODE) -COMPATIBLE_IOCTL(VT_GETSTATE) -COMPATIBLE_IOCTL(VT_OPENQRY) -ULONG_IOCTL(VT_ACTIVATE) -ULONG_IOCTL(VT_WAITACTIVE) -ULONG_IOCTL(VT_RELDISP) -ULONG_IOCTL(VT_DISALLOCATE) -COMPATIBLE_IOCTL(VT_RESIZE) -COMPATIBLE_IOCTL(VT_RESIZEX) -COMPATIBLE_IOCTL(VT_LOCKSWITCH) -COMPATIBLE_IOCTL(VT_UNLOCKSWITCH) -COMPATIBLE_IOCTL(VT_GETHIFONTMASK) /* Little p (/dev/rtc, /dev/envctrl, etc.) */ COMPATIBLE_IOCTL(RTC_AIE_ON) COMPATIBLE_IOCTL(RTC_AIE_OFF) @@ -1420,11 +1051,12 @@ COMPATIBLE_IOCTL(MTIOCTOP) /* Socket level stuff */ COMPATIBLE_IOCTL(FIOQSIZE) #ifdef CONFIG_BLOCK +/* loop */ +IGNORE_IOCTL(LOOP_CLR_FD) /* SG stuff */ COMPATIBLE_IOCTL(SG_SET_TIMEOUT) COMPATIBLE_IOCTL(SG_GET_TIMEOUT) COMPATIBLE_IOCTL(SG_EMULATED_HOST) -ULONG_IOCTL(SG_SET_TRANSFORM) COMPATIBLE_IOCTL(SG_GET_TRANSFORM) COMPATIBLE_IOCTL(SG_SET_RESERVED_SIZE) COMPATIBLE_IOCTL(SG_GET_RESERVED_SIZE) @@ -1478,8 +1110,6 @@ COMPATIBLE_IOCTL(PPPIOCGCHAN) /* PPPOX */ COMPATIBLE_IOCTL(PPPOEIOCSFWD) COMPATIBLE_IOCTL(PPPOEIOCDFWD) -/* LP */ -COMPATIBLE_IOCTL(LPGETSTATUS) /* ppdev */ COMPATIBLE_IOCTL(PPSETMODE) COMPATIBLE_IOCTL(PPRSTATUS) @@ -1661,8 +1291,6 @@ COMPATIBLE_IOCTL(SOUND_MIXER_GETLEVELS) COMPATIBLE_IOCTL(SOUND_MIXER_SETLEVELS) COMPATIBLE_IOCTL(OSS_GETVERSION) /* AUTOFS */ -ULONG_IOCTL(AUTOFS_IOC_READY) -ULONG_IOCTL(AUTOFS_IOC_FAIL) COMPATIBLE_IOCTL(AUTOFS_IOC_CATATONIC) COMPATIBLE_IOCTL(AUTOFS_IOC_PROTOVER) COMPATIBLE_IOCTL(AUTOFS_IOC_EXPIRE) @@ -1755,30 +1383,11 @@ COMPATIBLE_IOCTL(PCIIOC_CONTROLLER) COMPATIBLE_IOCTL(PCIIOC_MMAP_IS_IO) COMPATIBLE_IOCTL(PCIIOC_MMAP_IS_MEM) COMPATIBLE_IOCTL(PCIIOC_WRITE_COMBINE) -/* USB */ -COMPATIBLE_IOCTL(USBDEVFS_RESETEP) -COMPATIBLE_IOCTL(USBDEVFS_SETINTERFACE) -COMPATIBLE_IOCTL(USBDEVFS_SETCONFIGURATION) -COMPATIBLE_IOCTL(USBDEVFS_GETDRIVER) -COMPATIBLE_IOCTL(USBDEVFS_DISCARDURB) -COMPATIBLE_IOCTL(USBDEVFS_CLAIMINTERFACE) -COMPATIBLE_IOCTL(USBDEVFS_RELEASEINTERFACE) -COMPATIBLE_IOCTL(USBDEVFS_CONNECTINFO) -COMPATIBLE_IOCTL(USBDEVFS_HUB_PORTINFO) -COMPATIBLE_IOCTL(USBDEVFS_RESET) -COMPATIBLE_IOCTL(USBDEVFS_SUBMITURB32) -COMPATIBLE_IOCTL(USBDEVFS_REAPURB32) -COMPATIBLE_IOCTL(USBDEVFS_REAPURBNDELAY32) -COMPATIBLE_IOCTL(USBDEVFS_CLEAR_HALT) /* NBD */ -ULONG_IOCTL(NBD_SET_SOCK) -ULONG_IOCTL(NBD_SET_BLKSIZE) -ULONG_IOCTL(NBD_SET_SIZE) COMPATIBLE_IOCTL(NBD_DO_IT) COMPATIBLE_IOCTL(NBD_CLEAR_SOCK) COMPATIBLE_IOCTL(NBD_CLEAR_QUE) COMPATIBLE_IOCTL(NBD_PRINT_DEBUG) -ULONG_IOCTL(NBD_SET_SIZE_BLOCKS) COMPATIBLE_IOCTL(NBD_DISCONNECT) /* i2c */ COMPATIBLE_IOCTL(I2C_SLAVE) @@ -1878,42 +1487,6 @@ COMPATIBLE_IOCTL(JSIOCGAXES) COMPATIBLE_IOCTL(JSIOCGBUTTONS) COMPATIBLE_IOCTL(JSIOCGNAME(0)) -/* now things that need handlers */ -#ifdef CONFIG_BLOCK -HANDLE_IOCTL(SG_IO,sg_ioctl_trans) -HANDLE_IOCTL(SG_GET_REQUEST_TABLE, sg_grt_trans) -#endif -HANDLE_IOCTL(PPPIOCGIDLE32, ppp_ioctl_trans) -HANDLE_IOCTL(PPPIOCSCOMPRESS32, ppp_ioctl_trans) -HANDLE_IOCTL(PPPIOCSPASS32, ppp_sock_fprog_ioctl_trans) -HANDLE_IOCTL(PPPIOCSACTIVE32, ppp_sock_fprog_ioctl_trans) -#ifdef CONFIG_BLOCK -HANDLE_IOCTL(MTIOCGET32, mt_ioctl_trans) -HANDLE_IOCTL(MTIOCPOS32, mt_ioctl_trans) -#endif -#define AUTOFS_IOC_SETTIMEOUT32 _IOWR(0x93,0x64,unsigned int) -HANDLE_IOCTL(AUTOFS_IOC_SETTIMEOUT32, ioc_settimeout) -#ifdef CONFIG_VT -HANDLE_IOCTL(PIO_FONTX, do_fontx_ioctl) -HANDLE_IOCTL(GIO_FONTX, do_fontx_ioctl) -HANDLE_IOCTL(PIO_UNIMAP, do_unimap_ioctl) -HANDLE_IOCTL(GIO_UNIMAP, do_unimap_ioctl) -HANDLE_IOCTL(KDFONTOP, do_kdfontop_ioctl) -#endif -/* One SMB ioctl needs translations. */ -#define SMB_IOC_GETMOUNTUID_32 _IOR('u', 1, compat_uid_t) -HANDLE_IOCTL(SMB_IOC_GETMOUNTUID_32, do_smb_getmountuid) -/* block stuff */ -#ifdef CONFIG_BLOCK -/* loop */ -IGNORE_IOCTL(LOOP_CLR_FD) -/* Raw devices */ -HANDLE_IOCTL(RAW_SETBIND, raw_ioctl) -HANDLE_IOCTL(RAW_GETBIND, raw_ioctl) -#endif -/* Serial */ -HANDLE_IOCTL(TIOCGSERIAL, serial_struct_ioctl) -HANDLE_IOCTL(TIOCSSERIAL, serial_struct_ioctl) #ifdef TIOCGLTC COMPATIBLE_IOCTL(TIOCGLTC) COMPATIBLE_IOCTL(TIOCSLTC) @@ -1928,39 +1501,6 @@ COMPATIBLE_IOCTL(TIOCSLTC) COMPATIBLE_IOCTL(TIOCSTART) COMPATIBLE_IOCTL(TIOCSTOP) #endif -/* Usbdevfs */ -HANDLE_IOCTL(USBDEVFS_CONTROL32, do_usbdevfs_control) -HANDLE_IOCTL(USBDEVFS_BULK32, do_usbdevfs_bulk) -HANDLE_IOCTL(USBDEVFS_DISCSIGNAL32, do_usbdevfs_discsignal) -COMPATIBLE_IOCTL(USBDEVFS_IOCTL32) -/* i2c */ -HANDLE_IOCTL(I2C_FUNCS, w_long) -HANDLE_IOCTL(I2C_RDWR, do_i2c_rdwr_ioctl) -HANDLE_IOCTL(I2C_SMBUS, do_i2c_smbus_ioctl) -/* Not implemented in the native kernel */ -HANDLE_IOCTL(RTC_IRQP_READ32, rtc_ioctl) -HANDLE_IOCTL(RTC_IRQP_SET32, rtc_ioctl) -HANDLE_IOCTL(RTC_EPOCH_READ32, rtc_ioctl) -HANDLE_IOCTL(RTC_EPOCH_SET32, rtc_ioctl) - -/* dvb */ -HANDLE_IOCTL(VIDEO_GET_EVENT, do_video_get_event) -HANDLE_IOCTL(VIDEO_STILLPICTURE, do_video_stillpicture) -HANDLE_IOCTL(VIDEO_SET_SPU_PALETTE, do_video_set_spu_palette) - -/* parport */ -COMPATIBLE_IOCTL(LPTIME) -COMPATIBLE_IOCTL(LPCHAR) -COMPATIBLE_IOCTL(LPABORTOPEN) -COMPATIBLE_IOCTL(LPCAREFUL) -COMPATIBLE_IOCTL(LPWAIT) -COMPATIBLE_IOCTL(LPSETIRQ) -COMPATIBLE_IOCTL(LPGETSTATUS) -COMPATIBLE_IOCTL(LPGETSTATUS) -COMPATIBLE_IOCTL(LPRESET) -/*LPGETSTATS not implemented, but no kernels seem to compile it in anyways*/ -COMPATIBLE_IOCTL(LPGETFLAGS) -HANDLE_IOCTL(LPSETTIMEOUT, lp_timeout_trans) /* fat 'r' ioctls. These are handled by fat with ->compat_ioctl, but we don't want warnings on other file systems. So declare @@ -1988,12 +1528,110 @@ IGNORE_IOCTL(FBIOGCURSOR32) #endif }; -#define IOCTL_HASHSIZE 256 -static struct ioctl_trans *ioctl32_hash_table[IOCTL_HASHSIZE]; - -static inline unsigned long ioctl32_hash(unsigned long cmd) +/* + * Convert common ioctl arguments based on their command number + * + * Please do not add any code in here. Instead, implement + * a compat_ioctl operation in the place that handleÑ• the + * ioctl for the native case. + */ +static long do_ioctl_trans(int fd, unsigned int cmd, + unsigned long arg, struct file *file) { - return (((cmd >> 6) ^ (cmd >> 4) ^ cmd)) % IOCTL_HASHSIZE; + void __user *argp = compat_ptr(arg); + + switch (cmd) { + case PPPIOCGIDLE32: + return ppp_gidle(fd, cmd, argp); + case PPPIOCSCOMPRESS32: + return ppp_scompress(fd, cmd, argp); + case PPPIOCSPASS32: + case PPPIOCSACTIVE32: + return ppp_sock_fprog_ioctl_trans(fd, cmd, argp); +#ifdef CONFIG_BLOCK + case SG_IO: + return sg_ioctl_trans(fd, cmd, argp); + case SG_GET_REQUEST_TABLE: + return sg_grt_trans(fd, cmd, argp); + case MTIOCGET32: + case MTIOCPOS32: + return mt_ioctl_trans(fd, cmd, argp); + /* Raw devices */ + case RAW_SETBIND: + case RAW_GETBIND: + return raw_ioctl(fd, cmd, argp); +#endif +#define AUTOFS_IOC_SETTIMEOUT32 _IOWR(0x93,0x64,unsigned int) + case AUTOFS_IOC_SETTIMEOUT32: + return ioc_settimeout(fd, cmd, argp); + /* One SMB ioctl needs translations. */ +#define SMB_IOC_GETMOUNTUID_32 _IOR('u', 1, compat_uid_t) + case SMB_IOC_GETMOUNTUID_32: + return do_smb_getmountuid(fd, cmd, argp); + /* Serial */ + case TIOCGSERIAL: + case TIOCSSERIAL: + return serial_struct_ioctl(fd, cmd, argp); + /* i2c */ + case I2C_FUNCS: + return w_long(fd, cmd, argp); + case I2C_RDWR: + return do_i2c_rdwr_ioctl(fd, cmd, argp); + case I2C_SMBUS: + return do_i2c_smbus_ioctl(fd, cmd, argp); + /* Not implemented in the native kernel */ + case RTC_IRQP_READ32: + case RTC_IRQP_SET32: + case RTC_EPOCH_READ32: + case RTC_EPOCH_SET32: + return rtc_ioctl(fd, cmd, argp); + + /* dvb */ + case VIDEO_GET_EVENT: + return do_video_get_event(fd, cmd, argp); + case VIDEO_STILLPICTURE: + return do_video_stillpicture(fd, cmd, argp); + case VIDEO_SET_SPU_PALETTE: + return do_video_set_spu_palette(fd, cmd, argp); + } + + /* + * These take an integer instead of a pointer as 'arg', + * so we must not do a compat_ptr() translation. + */ + switch (cmd) { + /* Big T */ + case TCSBRKP: + case TIOCMIWAIT: + case TIOCSCTTY: + /* RAID */ + case HOT_REMOVE_DISK: + case HOT_ADD_DISK: + case SET_DISK_FAULTY: + case SET_BITMAP_FILE: + /* Big K */ + case KDSIGACCEPT: + case KIOCSOUND: + case KDMKTONE: + case KDSETMODE: + case KDSKBMODE: + case KDSKBMETA: + case KDSKBLED: + case KDSETLED: + /* SG stuff */ + case SG_SET_TRANSFORM: + /* AUTOFS */ + case AUTOFS_IOC_READY: + case AUTOFS_IOC_FAIL: + /* NBD */ + case NBD_SET_SOCK: + case NBD_SET_BLKSIZE: + case NBD_SET_SIZE: + case NBD_SET_SIZE_BLOCKS: + return do_vfs_ioctl(file, fd, cmd, arg); + } + + return -ENOIOCTLCMD; } static void compat_ioctl_error(struct file *filp, unsigned int fd, @@ -2025,12 +1663,33 @@ static void compat_ioctl_error(struct file *filp, unsigned int fd, free_page((unsigned long)path); } +static int compat_ioctl_check_table(unsigned int xcmd) +{ + int i; + const int max = ARRAY_SIZE(ioctl_pointer) - 1; + + BUILD_BUG_ON(max >= (1 << 16)); + + /* guess initial offset into table, assuming a + normalized distribution */ + i = ((xcmd >> 16) * max) >> 16; + + /* do linear search up first, until greater or equal */ + while (ioctl_pointer[i] < xcmd && i < max) + i++; + + /* then do linear search down */ + while (ioctl_pointer[i] > xcmd && i > 0) + i--; + + return ioctl_pointer[i] == xcmd; +} + asmlinkage long compat_sys_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg) { struct file *filp; int error = -EBADF; - struct ioctl_trans *t; int fput_needed; filp = fget_light(fd, &fput_needed); @@ -2058,7 +1717,7 @@ asmlinkage long compat_sys_ioctl(unsigned int fd, unsigned int cmd, #if defined(CONFIG_IA64) || defined(CONFIG_X86_64) case FS_IOC_RESVSP_32: case FS_IOC_RESVSP64_32: - error = compat_ioctl_preallocate(filp, arg); + error = compat_ioctl_preallocate(filp, compat_ptr(arg)); goto out_fput; #else case FS_IOC_RESVSP: @@ -2087,12 +1746,11 @@ asmlinkage long compat_sys_ioctl(unsigned int fd, unsigned int cmd, break; } - for (t = ioctl32_hash_table[ioctl32_hash(cmd)]; t; t = t->next) { - if (t->cmd == cmd) - goto found_handler; - } + if (compat_ioctl_check_table(XFORM(cmd))) + goto found_handler; - { + error = do_ioctl_trans(fd, cmd, arg, filp); + if (error == -ENOIOCTLCMD) { static int count; if (++count <= 50) @@ -2103,13 +1761,7 @@ asmlinkage long compat_sys_ioctl(unsigned int fd, unsigned int cmd, goto out_fput; found_handler: - if (t->handler) { - lock_kernel(); - error = t->handler(fd, cmd, arg, filp); - unlock_kernel(); - goto out_fput; - } - + arg = (unsigned long)compat_ptr(arg); do_ioctl: error = do_vfs_ioctl(filp, fd, cmd, arg); out_fput: @@ -2118,35 +1770,22 @@ asmlinkage long compat_sys_ioctl(unsigned int fd, unsigned int cmd, return error; } -static void ioctl32_insert_translation(struct ioctl_trans *trans) +static int __init init_sys32_ioctl_cmp(const void *p, const void *q) { - unsigned long hash; - struct ioctl_trans *t; - - hash = ioctl32_hash (trans->cmd); - if (!ioctl32_hash_table[hash]) - ioctl32_hash_table[hash] = trans; - else { - t = ioctl32_hash_table[hash]; - while (t->next) - t = t->next; - trans->next = NULL; - t->next = trans; - } + unsigned int a, b; + a = *(unsigned int *)p; + b = *(unsigned int *)q; + if (a > b) + return 1; + if (a < b) + return -1; + return 0; } static int __init init_sys32_ioctl(void) { - int i; - - for (i = 0; i < ARRAY_SIZE(ioctl_start); i++) { - if (ioctl_start[i].next) { - printk("ioctl translation %d bad\n",i); - return -1; - } - - ioctl32_insert_translation(&ioctl_start[i]); - } + sort(ioctl_pointer, ARRAY_SIZE(ioctl_pointer), sizeof(*ioctl_pointer), + init_sys32_ioctl_cmp, NULL); return 0; } __initcall(init_sys32_ioctl); diff --git a/include/asm-generic/gpio.h b/include/asm-generic/gpio.h index 66d6106a206..204bed37e82 100644 --- a/include/asm-generic/gpio.h +++ b/include/asm-generic/gpio.h @@ -28,6 +28,7 @@ static inline int gpio_is_valid(int number) return ((unsigned)number) < ARCH_NR_GPIOS; } +struct device; struct seq_file; struct module; @@ -181,6 +182,8 @@ static inline void gpio_set_value_cansleep(unsigned gpio, int value) #ifndef CONFIG_GPIO_SYSFS +struct device; + /* sysfs support is only available with gpiolib, where it's optional */ static inline int gpio_export(unsigned gpio, bool direction_may_change) diff --git a/include/asm-generic/unistd.h b/include/asm-generic/unistd.h index 7c38c147e5e..6a0b30f78a6 100644 --- a/include/asm-generic/unistd.h +++ b/include/asm-generic/unistd.h @@ -622,9 +622,13 @@ __SYSCALL(__NR_move_pages, sys_move_pages) __SYSCALL(__NR_rt_tgsigqueueinfo, sys_rt_tgsigqueueinfo) #define __NR_perf_event_open 241 __SYSCALL(__NR_perf_event_open, sys_perf_event_open) +#define __NR_accept4 242 +__SYSCALL(__NR_accept4, sys_accept4) +#define __NR_recvmmsg 243 +__SYSCALL(__NR_recvmmsg, sys_recvmmsg) #undef __NR_syscalls -#define __NR_syscalls 242 +#define __NR_syscalls 244 /* * All syscalls below here should go away really, @@ -802,7 +806,7 @@ __SYSCALL(__NR_fork, sys_ni_syscall) #define __NR_statfs __NR3264_statfs #define __NR_fstatfs __NR3264_fstatfs #define __NR_truncate __NR3264_truncate -#define __NR_ftruncate __NR3264_truncate +#define __NR_ftruncate __NR3264_ftruncate #define __NR_lseek __NR3264_lseek #define __NR_sendfile __NR3264_sendfile #define __NR_newfstatat __NR3264_fstatat @@ -818,7 +822,7 @@ __SYSCALL(__NR_fork, sys_ni_syscall) #define __NR_statfs64 __NR3264_statfs #define __NR_fstatfs64 __NR3264_fstatfs #define __NR_truncate64 __NR3264_truncate -#define __NR_ftruncate64 __NR3264_truncate +#define __NR_ftruncate64 __NR3264_ftruncate #define __NR_llseek __NR3264_lseek #define __NR_sendfile64 __NR3264_sendfile #define __NR_fstatat64 __NR3264_fstatat diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index 789cf5f920c..d77b54733c5 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -84,6 +84,7 @@ extern const struct cpumask *const cpu_active_mask; #define num_online_cpus() cpumask_weight(cpu_online_mask) #define num_possible_cpus() cpumask_weight(cpu_possible_mask) #define num_present_cpus() cpumask_weight(cpu_present_mask) +#define num_active_cpus() cpumask_weight(cpu_active_mask) #define cpu_online(cpu) cpumask_test_cpu((cpu), cpu_online_mask) #define cpu_possible(cpu) cpumask_test_cpu((cpu), cpu_possible_mask) #define cpu_present(cpu) cpumask_test_cpu((cpu), cpu_present_mask) @@ -92,6 +93,7 @@ extern const struct cpumask *const cpu_active_mask; #define num_online_cpus() 1 #define num_possible_cpus() 1 #define num_present_cpus() 1 +#define num_active_cpus() 1 #define cpu_online(cpu) ((cpu) == 0) #define cpu_possible(cpu) ((cpu) == 0) #define cpu_present(cpu) ((cpu) == 0) diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 47bbdf9c38d..38f8d655383 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -57,6 +57,7 @@ struct trace_iterator { /* The below is zeroed out in pipe_read */ struct trace_seq seq; struct trace_entry *ent; + int leftover; int cpu; u64 ts; diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 9bace4b9f4f..af634e95871 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -162,10 +162,11 @@ struct hrtimer_clock_base { * @expires_next: absolute time of the next event which was scheduled * via clock_set_next_event() * @hres_active: State of high resolution mode - * @check_clocks: Indictator, when set evaluate time source and clock - * event devices whether high resolution mode can be - * activated. - * @nr_events: Total number of timer interrupt events + * @hang_detected: The last hrtimer interrupt detected a hang + * @nr_events: Total number of hrtimer interrupt events + * @nr_retries: Total number of hrtimer interrupt retries + * @nr_hangs: Total number of hrtimer interrupt hangs + * @max_hang_time: Maximum time spent in hrtimer_interrupt */ struct hrtimer_cpu_base { spinlock_t lock; @@ -173,7 +174,11 @@ struct hrtimer_cpu_base { #ifdef CONFIG_HIGH_RES_TIMERS ktime_t expires_next; int hres_active; + int hang_detected; unsigned long nr_events; + unsigned long nr_retries; + unsigned long nr_hangs; + ktime_t max_hang_time; #endif }; @@ -435,47 +440,4 @@ extern u64 ktime_divns(const ktime_t kt, s64 div); /* Show pending timers: */ extern void sysrq_timer_list_show(void); -/* - * Timer-statistics info: - */ -#ifdef CONFIG_TIMER_STATS - -extern void timer_stats_update_stats(void *timer, pid_t pid, void *startf, - void *timerf, char *comm, - unsigned int timer_flag); - -static inline void timer_stats_account_hrtimer(struct hrtimer *timer) -{ - if (likely(!timer_stats_active)) - return; - timer_stats_update_stats(timer, timer->start_pid, timer->start_site, - timer->function, timer->start_comm, 0); -} - -extern void __timer_stats_hrtimer_set_start_info(struct hrtimer *timer, - void *addr); - -static inline void timer_stats_hrtimer_set_start_info(struct hrtimer *timer) -{ - __timer_stats_hrtimer_set_start_info(timer, __builtin_return_address(0)); -} - -static inline void timer_stats_hrtimer_clear_start_info(struct hrtimer *timer) -{ - timer->start_site = NULL; -} -#else -static inline void timer_stats_account_hrtimer(struct hrtimer *timer) -{ -} - -static inline void timer_stats_hrtimer_set_start_info(struct hrtimer *timer) -{ -} - -static inline void timer_stats_hrtimer_clear_start_info(struct hrtimer *timer) -{ -} -#endif - #endif diff --git a/include/linux/hw_breakpoint.h b/include/linux/hw_breakpoint.h index a03daed08c5..69f07a9f127 100644 --- a/include/linux/hw_breakpoint.h +++ b/include/linux/hw_breakpoint.h @@ -20,19 +20,18 @@ enum { #ifdef CONFIG_HAVE_HW_BREAKPOINT -/* As it's for in-kernel or ptrace use, we want it to be pinned */ -#define DEFINE_BREAKPOINT_ATTR(name) \ -struct perf_event_attr name = { \ - .type = PERF_TYPE_BREAKPOINT, \ - .size = sizeof(name), \ - .pinned = 1, \ -}; - static inline void hw_breakpoint_init(struct perf_event_attr *attr) { + memset(attr, 0, sizeof(*attr)); + attr->type = PERF_TYPE_BREAKPOINT; attr->size = sizeof(*attr); + /* + * As it's for in-kernel or ptrace use, we want it to be pinned + * and to call its callback every hits. + */ attr->pinned = 1; + attr->sample_period = 1; } static inline unsigned long hw_breakpoint_addr(struct perf_event *bp) @@ -52,27 +51,24 @@ static inline int hw_breakpoint_len(struct perf_event *bp) extern struct perf_event * register_user_hw_breakpoint(struct perf_event_attr *attr, - perf_callback_t triggered, + perf_overflow_handler_t triggered, struct task_struct *tsk); /* FIXME: only change from the attr, and don't unregister */ -extern struct perf_event * -modify_user_hw_breakpoint(struct perf_event *bp, - struct perf_event_attr *attr, - perf_callback_t triggered, - struct task_struct *tsk); +extern int +modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *attr); /* * Kernel breakpoints are not associated with any particular thread. */ extern struct perf_event * register_wide_hw_breakpoint_cpu(struct perf_event_attr *attr, - perf_callback_t triggered, + perf_overflow_handler_t triggered, int cpu); extern struct perf_event ** register_wide_hw_breakpoint(struct perf_event_attr *attr, - perf_callback_t triggered); + perf_overflow_handler_t triggered); extern int register_perf_hw_breakpoint(struct perf_event *bp); extern int __register_perf_hw_breakpoint(struct perf_event *bp); @@ -93,20 +89,18 @@ static inline struct arch_hw_breakpoint *counter_arch_bp(struct perf_event *bp) static inline struct perf_event * register_user_hw_breakpoint(struct perf_event_attr *attr, - perf_callback_t triggered, + perf_overflow_handler_t triggered, struct task_struct *tsk) { return NULL; } -static inline struct perf_event * +static inline int modify_user_hw_breakpoint(struct perf_event *bp, - struct perf_event_attr *attr, - perf_callback_t triggered, - struct task_struct *tsk) { return NULL; } + struct perf_event_attr *attr) { return NULL; } static inline struct perf_event * register_wide_hw_breakpoint_cpu(struct perf_event_attr *attr, - perf_callback_t triggered, + perf_overflow_handler_t triggered, int cpu) { return NULL; } static inline struct perf_event ** register_wide_hw_breakpoint(struct perf_event_attr *attr, - perf_callback_t triggered) { return NULL; } + perf_overflow_handler_t triggered) { return NULL; } static inline int register_perf_hw_breakpoint(struct perf_event *bp) { return -ENOSYS; } static inline int diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 43adbd7f001..64a53f74c9a 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -18,10 +18,6 @@ #include <linux/ioctl.h> #include <asm/byteorder.h> -#ifdef CONFIG_HAVE_HW_BREAKPOINT -#include <asm/hw_breakpoint.h> -#endif - /* * User-space ABI bits: */ @@ -215,12 +211,12 @@ struct perf_event_attr { __u32 wakeup_watermark; /* bytes before wakeup */ }; - union { - struct { /* Hardware breakpoint info */ - __u64 bp_addr; - __u32 bp_type; - __u32 bp_len; - }; + struct { /* Hardware breakpoint info */ + __u64 bp_addr; + __u32 bp_type; + __u32 bp_len; + __u64 __bp_reserved_1; + __u64 __bp_reserved_2; }; __u32 __reserved_2; @@ -451,6 +447,10 @@ enum perf_callchain_context { # include <asm/perf_event.h> #endif +#ifdef CONFIG_HAVE_HW_BREAKPOINT +#include <asm/hw_breakpoint.h> +#endif + #include <linux/list.h> #include <linux/mutex.h> #include <linux/rculist.h> @@ -565,10 +565,12 @@ struct perf_pending_entry { void (*func)(struct perf_pending_entry *); }; -typedef void (*perf_callback_t)(struct perf_event *, void *); - struct perf_sample_data; +typedef void (*perf_overflow_handler_t)(struct perf_event *, int, + struct perf_sample_data *, + struct pt_regs *regs); + /** * struct perf_event - performance event kernel representation: */ @@ -660,18 +662,12 @@ struct perf_event { struct pid_namespace *ns; u64 id; - void (*overflow_handler)(struct perf_event *event, - int nmi, struct perf_sample_data *data, - struct pt_regs *regs); + perf_overflow_handler_t overflow_handler; #ifdef CONFIG_EVENT_PROFILE struct event_filter *filter; #endif - perf_callback_t callback; - - perf_callback_t event_callback; - #endif /* CONFIG_PERF_EVENTS */ }; @@ -781,7 +777,7 @@ extern struct perf_event * perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, pid_t pid, - perf_callback_t callback); + perf_overflow_handler_t callback); extern u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running); @@ -876,6 +872,8 @@ extern void perf_output_copy(struct perf_output_handle *handle, const void *buf, unsigned int len); extern int perf_swevent_get_recursion_context(void); extern void perf_swevent_put_recursion_context(int rctx); +extern void perf_event_enable(struct perf_event *event); +extern void perf_event_disable(struct perf_event *event); #else static inline void perf_event_task_sched_in(struct task_struct *task, int cpu) { } @@ -906,7 +904,8 @@ static inline void perf_event_fork(struct task_struct *tsk) { } static inline void perf_event_init(void) { } static inline int perf_swevent_get_recursion_context(void) { return -1; } static inline void perf_swevent_put_recursion_context(int rctx) { } - +static inline void perf_event_enable(struct perf_event *event) { } +static inline void perf_event_disable(struct perf_event *event) { } #endif #define perf_output_put(handle, x) \ diff --git a/include/linux/sched.h b/include/linux/sched.h index 89115ec7d43..294eb2f8014 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1102,7 +1102,7 @@ struct sched_class { void (*set_curr_task) (struct rq *rq); void (*task_tick) (struct rq *rq, struct task_struct *p, int queued); - void (*task_new) (struct rq *rq, struct task_struct *p); + void (*task_fork) (struct task_struct *p); void (*switched_from) (struct rq *this_rq, struct task_struct *task, int running); @@ -1111,7 +1111,8 @@ struct sched_class { void (*prio_changed) (struct rq *this_rq, struct task_struct *task, int oldprio, int running); - unsigned int (*get_rr_interval) (struct task_struct *task); + unsigned int (*get_rr_interval) (struct rq *rq, + struct task_struct *task); #ifdef CONFIG_FAIR_GROUP_SCHED void (*moved_group) (struct task_struct *p); @@ -1151,8 +1152,6 @@ struct sched_entity { u64 start_runtime; u64 avg_wakeup; - u64 avg_running; - #ifdef CONFIG_SCHEDSTATS u64 wait_start; u64 wait_max; @@ -1175,7 +1174,6 @@ struct sched_entity { u64 nr_failed_migrations_running; u64 nr_failed_migrations_hot; u64 nr_forced_migrations; - u64 nr_forced2_migrations; u64 nr_wakeups; u64 nr_wakeups_sync; @@ -1840,7 +1838,8 @@ static inline int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask) extern int sched_clock_stable; #endif -extern unsigned long long sched_clock(void); +/* ftrace calls sched_clock() directly */ +extern unsigned long long notrace sched_clock(void); extern void sched_clock_init(void); extern u64 sched_clock_cpu(int cpu); @@ -1903,14 +1902,22 @@ extern unsigned int sysctl_sched_wakeup_granularity; extern unsigned int sysctl_sched_shares_ratelimit; extern unsigned int sysctl_sched_shares_thresh; extern unsigned int sysctl_sched_child_runs_first; + +enum sched_tunable_scaling { + SCHED_TUNABLESCALING_NONE, + SCHED_TUNABLESCALING_LOG, + SCHED_TUNABLESCALING_LINEAR, + SCHED_TUNABLESCALING_END, +}; +extern enum sched_tunable_scaling sysctl_sched_tunable_scaling; + #ifdef CONFIG_SCHED_DEBUG -extern unsigned int sysctl_sched_features; extern unsigned int sysctl_sched_migration_cost; extern unsigned int sysctl_sched_nr_migrate; extern unsigned int sysctl_sched_time_avg; extern unsigned int sysctl_timer_migration; -int sched_nr_latency_handler(struct ctl_table *table, int write, +int sched_proc_update_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos); #endif diff --git a/include/linux/trace_seq.h b/include/linux/trace_seq.h index 09077f6ed12..5cf397ceb72 100644 --- a/include/linux/trace_seq.h +++ b/include/linux/trace_seq.h @@ -14,6 +14,7 @@ struct trace_seq { unsigned char buffer[PAGE_SIZE]; unsigned int len; unsigned int readpos; + int full; }; static inline void @@ -21,6 +22,7 @@ trace_seq_init(struct trace_seq *s) { s->len = 0; s->readpos = 0; + s->full = 0; } /* @@ -33,7 +35,7 @@ extern int trace_seq_vprintf(struct trace_seq *s, const char *fmt, va_list args) __attribute__ ((format (printf, 2, 0))); extern int trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary); -extern void trace_print_seq(struct seq_file *m, struct trace_seq *s); +extern int trace_print_seq(struct seq_file *m, struct trace_seq *s); extern ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt); extern int trace_seq_puts(struct trace_seq *s, const char *str); @@ -55,8 +57,9 @@ trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary) return 0; } -static inline void trace_print_seq(struct seq_file *m, struct trace_seq *s) +static inline int trace_print_seq(struct seq_file *m, struct trace_seq *s) { + return 0; } static inline ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt) diff --git a/include/linux/usbdevice_fs.h b/include/linux/usbdevice_fs.h index b2a7d8ba6ee..15591d2ea40 100644 --- a/include/linux/usbdevice_fs.h +++ b/include/linux/usbdevice_fs.h @@ -128,6 +128,29 @@ struct usbdevfs_hub_portinfo { #ifdef __KERNEL__ #ifdef CONFIG_COMPAT #include <linux/compat.h> + +struct usbdevfs_ctrltransfer32 { + u8 bRequestType; + u8 bRequest; + u16 wValue; + u16 wIndex; + u16 wLength; + u32 timeout; /* in milliseconds */ + compat_caddr_t data; +}; + +struct usbdevfs_bulktransfer32 { + compat_uint_t ep; + compat_uint_t len; + compat_uint_t timeout; /* in milliseconds */ + compat_caddr_t data; +}; + +struct usbdevfs_disconnectsignal32 { + compat_int_t signr; + compat_caddr_t context; +}; + struct usbdevfs_urb32 { unsigned char type; unsigned char endpoint; @@ -153,7 +176,9 @@ struct usbdevfs_ioctl32 { #endif /* __KERNEL__ */ #define USBDEVFS_CONTROL _IOWR('U', 0, struct usbdevfs_ctrltransfer) +#define USBDEVFS_CONTROL32 _IOWR('U', 0, struct usbdevfs_ctrltransfer32) #define USBDEVFS_BULK _IOWR('U', 2, struct usbdevfs_bulktransfer) +#define USBDEVFS_BULK32 _IOWR('U', 2, struct usbdevfs_bulktransfer32) #define USBDEVFS_RESETEP _IOR('U', 3, unsigned int) #define USBDEVFS_SETINTERFACE _IOR('U', 4, struct usbdevfs_setinterface) #define USBDEVFS_SETCONFIGURATION _IOR('U', 5, unsigned int) @@ -166,6 +191,7 @@ struct usbdevfs_ioctl32 { #define USBDEVFS_REAPURBNDELAY _IOW('U', 13, void *) #define USBDEVFS_REAPURBNDELAY32 _IOW('U', 13, __u32) #define USBDEVFS_DISCSIGNAL _IOR('U', 14, struct usbdevfs_disconnectsignal) +#define USBDEVFS_DISCSIGNAL32 _IOR('U', 14, struct usbdevfs_disconnectsignal32) #define USBDEVFS_CLAIMINTERFACE _IOR('U', 15, unsigned int) #define USBDEVFS_RELEASEINTERFACE _IOR('U', 16, unsigned int) #define USBDEVFS_CONNECTINFO _IOW('U', 17, struct usbdevfs_connectinfo) diff --git a/include/trace/events/timer.h b/include/trace/events/timer.h index e5ce87a0498..9496b965d62 100644 --- a/include/trace/events/timer.h +++ b/include/trace/events/timer.h @@ -301,8 +301,8 @@ TRACE_EVENT(itimer_state, __entry->interval_usec = value->it_interval.tv_usec; ), - TP_printk("which=%d expires=%lu it_value=%lu.%lu it_interval=%lu.%lu", - __entry->which, __entry->expires, + TP_printk("which=%d expires=%llu it_value=%ld.%ld it_interval=%ld.%ld", + __entry->which, (unsigned long long)__entry->expires, __entry->value_sec, __entry->value_usec, __entry->interval_sec, __entry->interval_usec) ); @@ -331,8 +331,8 @@ TRACE_EVENT(itimer_expire, __entry->pid = pid_nr(pid); ), - TP_printk("which=%d pid=%d now=%lu", __entry->which, - (int) __entry->pid, __entry->now) + TP_printk("which=%d pid=%d now=%llu", __entry->which, + (int) __entry->pid, (unsigned long long)__entry->now) ); #endif /* _TRACE_TIMER_H */ diff --git a/kernel/cpu.c b/kernel/cpu.c index 7c4e2713df0..291ac586f37 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -212,6 +212,8 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls); if (err == NOTIFY_BAD) { + set_cpu_active(cpu, true); + nr_calls--; __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod, hcpu, nr_calls, NULL); @@ -223,11 +225,11 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) /* Ensure that we are not runnable on dying cpu */ cpumask_copy(old_allowed, ¤t->cpus_allowed); - set_cpus_allowed_ptr(current, - cpumask_of(cpumask_any_but(cpu_online_mask, cpu))); + set_cpus_allowed_ptr(current, cpu_active_mask); err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu)); if (err) { + set_cpu_active(cpu, true); /* CPU didn't die: tell everyone. Can't complain. */ if (raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod, hcpu) == NOTIFY_BAD) @@ -292,9 +294,6 @@ int __ref cpu_down(unsigned int cpu) err = _cpu_down(cpu, 0); - if (cpu_online(cpu)) - set_cpu_active(cpu, true); - out: cpu_maps_update_done(); stop_machine_destroy(); @@ -387,6 +386,15 @@ int disable_nonboot_cpus(void) * with the userspace trying to use the CPU hotplug at the same time */ cpumask_clear(frozen_cpus); + + for_each_online_cpu(cpu) { + if (cpu == first_cpu) + continue; + set_cpu_active(cpu, false); + } + + synchronize_sched(); + printk("Disabling non-boot CPUs ...\n"); for_each_online_cpu(cpu) { if (cpu == first_cpu) diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 3cf2183b472..ba401fab459 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -737,7 +737,7 @@ static void do_rebuild_sched_domains(struct work_struct *unused) { } -static int generate_sched_domains(struct cpumask **domains, +static int generate_sched_domains(cpumask_var_t **domains, struct sched_domain_attr **attributes) { *domains = NULL; @@ -872,7 +872,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, if (retval < 0) return retval; - if (!cpumask_subset(trialcs->cpus_allowed, cpu_online_mask)) + if (!cpumask_subset(trialcs->cpus_allowed, cpu_active_mask)) return -EINVAL; } retval = validate_change(cs, trialcs); @@ -2010,7 +2010,7 @@ static void scan_for_empty_cpusets(struct cpuset *root) } /* Continue past cpusets with all cpus, mems online */ - if (cpumask_subset(cp->cpus_allowed, cpu_online_mask) && + if (cpumask_subset(cp->cpus_allowed, cpu_active_mask) && nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY])) continue; @@ -2019,7 +2019,7 @@ static void scan_for_empty_cpusets(struct cpuset *root) /* Remove offline cpus and mems from this cpuset. */ mutex_lock(&callback_mutex); cpumask_and(cp->cpus_allowed, cp->cpus_allowed, - cpu_online_mask); + cpu_active_mask); nodes_and(cp->mems_allowed, cp->mems_allowed, node_states[N_HIGH_MEMORY]); mutex_unlock(&callback_mutex); @@ -2057,8 +2057,10 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb, switch (phase) { case CPU_ONLINE: case CPU_ONLINE_FROZEN: - case CPU_DEAD: - case CPU_DEAD_FROZEN: + case CPU_DOWN_PREPARE: + case CPU_DOWN_PREPARE_FROZEN: + case CPU_DOWN_FAILED: + case CPU_DOWN_FAILED_FROZEN: break; default: @@ -2067,7 +2069,7 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb, cgroup_lock(); mutex_lock(&callback_mutex); - cpumask_copy(top_cpuset.cpus_allowed, cpu_online_mask); + cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask); mutex_unlock(&callback_mutex); scan_for_empty_cpusets(&top_cpuset); ndoms = generate_sched_domains(&doms, &attr); @@ -2114,7 +2116,7 @@ static int cpuset_track_online_nodes(struct notifier_block *self, void __init cpuset_init_smp(void) { - cpumask_copy(top_cpuset.cpus_allowed, cpu_online_mask); + cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask); top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; hotcpu_notifier(cpuset_track_online_cpus, 0); diff --git a/kernel/futex.c b/kernel/futex.c index fb65e822fc4..d73ef1f3e55 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -304,8 +304,14 @@ void put_futex_key(int fshared, union futex_key *key) */ static int fault_in_user_writeable(u32 __user *uaddr) { - int ret = get_user_pages(current, current->mm, (unsigned long)uaddr, - 1, 1, 0, NULL, NULL); + struct mm_struct *mm = current->mm; + int ret; + + down_read(&mm->mmap_sem); + ret = get_user_pages(current, mm, (unsigned long)uaddr, + 1, 1, 0, NULL, NULL); + up_read(&mm->mmap_sem); + return ret < 0 ? ret : 0; } diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index ede52770812..d2f9239dc6b 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -557,7 +557,7 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal) static int hrtimer_reprogram(struct hrtimer *timer, struct hrtimer_clock_base *base) { - ktime_t *expires_next = &__get_cpu_var(hrtimer_bases).expires_next; + struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset); int res; @@ -582,7 +582,16 @@ static int hrtimer_reprogram(struct hrtimer *timer, if (expires.tv64 < 0) return -ETIME; - if (expires.tv64 >= expires_next->tv64) + if (expires.tv64 >= cpu_base->expires_next.tv64) + return 0; + + /* + * If a hang was detected in the last timer interrupt then we + * do not schedule a timer which is earlier than the expiry + * which we enforced in the hang detection. We want the system + * to make progress. + */ + if (cpu_base->hang_detected) return 0; /* @@ -590,7 +599,7 @@ static int hrtimer_reprogram(struct hrtimer *timer, */ res = tick_program_event(expires, 0); if (!IS_ERR_VALUE(res)) - *expires_next = expires; + cpu_base->expires_next = expires; return res; } @@ -747,17 +756,33 @@ static inline void hrtimer_init_timer_hres(struct hrtimer *timer) { } #endif /* CONFIG_HIGH_RES_TIMERS */ -#ifdef CONFIG_TIMER_STATS -void __timer_stats_hrtimer_set_start_info(struct hrtimer *timer, void *addr) +static inline void timer_stats_hrtimer_set_start_info(struct hrtimer *timer) { +#ifdef CONFIG_TIMER_STATS if (timer->start_site) return; - - timer->start_site = addr; + timer->start_site = __builtin_return_address(0); memcpy(timer->start_comm, current->comm, TASK_COMM_LEN); timer->start_pid = current->pid; +#endif } + +static inline void timer_stats_hrtimer_clear_start_info(struct hrtimer *timer) +{ +#ifdef CONFIG_TIMER_STATS + timer->start_site = NULL; +#endif +} + +static inline void timer_stats_account_hrtimer(struct hrtimer *timer) +{ +#ifdef CONFIG_TIMER_STATS + if (likely(!timer_stats_active)) + return; + timer_stats_update_stats(timer, timer->start_pid, timer->start_site, + timer->function, timer->start_comm, 0); #endif +} /* * Counterpart to lock_hrtimer_base above: @@ -1217,30 +1242,6 @@ static void __run_hrtimer(struct hrtimer *timer, ktime_t *now) #ifdef CONFIG_HIGH_RES_TIMERS -static int force_clock_reprogram; - -/* - * After 5 iteration's attempts, we consider that hrtimer_interrupt() - * is hanging, which could happen with something that slows the interrupt - * such as the tracing. Then we force the clock reprogramming for each future - * hrtimer interrupts to avoid infinite loops and use the min_delta_ns - * threshold that we will overwrite. - * The next tick event will be scheduled to 3 times we currently spend on - * hrtimer_interrupt(). This gives a good compromise, the cpus will spend - * 1/4 of their time to process the hrtimer interrupts. This is enough to - * let it running without serious starvation. - */ - -static inline void -hrtimer_interrupt_hanging(struct clock_event_device *dev, - ktime_t try_time) -{ - force_clock_reprogram = 1; - dev->min_delta_ns = (unsigned long)try_time.tv64 * 3; - printk(KERN_WARNING "hrtimer: interrupt too slow, " - "forcing clock min delta to %llu ns\n", - (unsigned long long) dev->min_delta_ns); -} /* * High resolution timer interrupt * Called with interrupts disabled @@ -1249,21 +1250,15 @@ void hrtimer_interrupt(struct clock_event_device *dev) { struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); struct hrtimer_clock_base *base; - ktime_t expires_next, now; - int nr_retries = 0; - int i; + ktime_t expires_next, now, entry_time, delta; + int i, retries = 0; BUG_ON(!cpu_base->hres_active); cpu_base->nr_events++; dev->next_event.tv64 = KTIME_MAX; - retry: - /* 5 retries is enough to notice a hang */ - if (!(++nr_retries % 5)) - hrtimer_interrupt_hanging(dev, ktime_sub(ktime_get(), now)); - - now = ktime_get(); - + entry_time = now = ktime_get(); +retry: expires_next.tv64 = KTIME_MAX; spin_lock(&cpu_base->lock); @@ -1325,10 +1320,48 @@ void hrtimer_interrupt(struct clock_event_device *dev) spin_unlock(&cpu_base->lock); /* Reprogramming necessary ? */ - if (expires_next.tv64 != KTIME_MAX) { - if (tick_program_event(expires_next, force_clock_reprogram)) - goto retry; + if (expires_next.tv64 == KTIME_MAX || + !tick_program_event(expires_next, 0)) { + cpu_base->hang_detected = 0; + return; } + + /* + * The next timer was already expired due to: + * - tracing + * - long lasting callbacks + * - being scheduled away when running in a VM + * + * We need to prevent that we loop forever in the hrtimer + * interrupt routine. We give it 3 attempts to avoid + * overreacting on some spurious event. + */ + now = ktime_get(); + cpu_base->nr_retries++; + if (++retries < 3) + goto retry; + /* + * Give the system a chance to do something else than looping + * here. We stored the entry time, so we know exactly how long + * we spent here. We schedule the next event this amount of + * time away. + */ + cpu_base->nr_hangs++; + cpu_base->hang_detected = 1; + delta = ktime_sub(now, entry_time); + if (delta.tv64 > cpu_base->max_hang_time.tv64) + cpu_base->max_hang_time = delta; + /* + * Limit it to a sensible value as we enforce a longer + * delay. Give the CPU at least 100ms to catch up. + */ + if (delta.tv64 > 100 * NSEC_PER_MSEC) + expires_next = ktime_add_ns(now, 100 * NSEC_PER_MSEC); + else + expires_next = ktime_add(now, delta); + tick_program_event(expires_next, 1); + printk_once(KERN_WARNING "hrtimer: interrupt took %llu ns\n", + ktime_to_ns(delta)); } /* diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c index cf5ee162841..366eedf949c 100644 --- a/kernel/hw_breakpoint.c +++ b/kernel/hw_breakpoint.c @@ -52,7 +52,7 @@ static DEFINE_PER_CPU(unsigned int, nr_cpu_bp_pinned); /* Number of pinned task breakpoints in a cpu */ -static DEFINE_PER_CPU(unsigned int, task_bp_pinned[HBP_NUM]); +static DEFINE_PER_CPU(unsigned int, nr_task_bp_pinned[HBP_NUM]); /* Number of non-pinned cpu/task breakpoints in a cpu */ static DEFINE_PER_CPU(unsigned int, nr_bp_flexible); @@ -73,7 +73,7 @@ static DEFINE_MUTEX(nr_bp_mutex); static unsigned int max_task_bp_pinned(int cpu) { int i; - unsigned int *tsk_pinned = per_cpu(task_bp_pinned, cpu); + unsigned int *tsk_pinned = per_cpu(nr_task_bp_pinned, cpu); for (i = HBP_NUM -1; i >= 0; i--) { if (tsk_pinned[i] > 0) @@ -83,15 +83,51 @@ static unsigned int max_task_bp_pinned(int cpu) return 0; } +static int task_bp_pinned(struct task_struct *tsk) +{ + struct perf_event_context *ctx = tsk->perf_event_ctxp; + struct list_head *list; + struct perf_event *bp; + unsigned long flags; + int count = 0; + + if (WARN_ONCE(!ctx, "No perf context for this task")) + return 0; + + list = &ctx->event_list; + + spin_lock_irqsave(&ctx->lock, flags); + + /* + * The current breakpoint counter is not included in the list + * at the open() callback time + */ + list_for_each_entry(bp, list, event_entry) { + if (bp->attr.type == PERF_TYPE_BREAKPOINT) + count++; + } + + spin_unlock_irqrestore(&ctx->lock, flags); + + return count; +} + /* * Report the number of pinned/un-pinned breakpoints we have in * a given cpu (cpu > -1) or in all of them (cpu = -1). */ -static void fetch_bp_busy_slots(struct bp_busy_slots *slots, int cpu) +static void +fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp) { + int cpu = bp->cpu; + struct task_struct *tsk = bp->ctx->task; + if (cpu >= 0) { slots->pinned = per_cpu(nr_cpu_bp_pinned, cpu); - slots->pinned += max_task_bp_pinned(cpu); + if (!tsk) + slots->pinned += max_task_bp_pinned(cpu); + else + slots->pinned += task_bp_pinned(tsk); slots->flexible = per_cpu(nr_bp_flexible, cpu); return; @@ -101,7 +137,10 @@ static void fetch_bp_busy_slots(struct bp_busy_slots *slots, int cpu) unsigned int nr; nr = per_cpu(nr_cpu_bp_pinned, cpu); - nr += max_task_bp_pinned(cpu); + if (!tsk) + nr += max_task_bp_pinned(cpu); + else + nr += task_bp_pinned(tsk); if (nr > slots->pinned) slots->pinned = nr; @@ -118,35 +157,12 @@ static void fetch_bp_busy_slots(struct bp_busy_slots *slots, int cpu) */ static void toggle_bp_task_slot(struct task_struct *tsk, int cpu, bool enable) { - int count = 0; - struct perf_event *bp; - struct perf_event_context *ctx = tsk->perf_event_ctxp; unsigned int *tsk_pinned; - struct list_head *list; - unsigned long flags; - - if (WARN_ONCE(!ctx, "No perf context for this task")) - return; - - list = &ctx->event_list; - - spin_lock_irqsave(&ctx->lock, flags); - - /* - * The current breakpoint counter is not included in the list - * at the open() callback time - */ - list_for_each_entry(bp, list, event_entry) { - if (bp->attr.type == PERF_TYPE_BREAKPOINT) - count++; - } + int count = 0; - spin_unlock_irqrestore(&ctx->lock, flags); + count = task_bp_pinned(tsk); - if (WARN_ONCE(count < 0, "No breakpoint counter found in the counter list")) - return; - - tsk_pinned = per_cpu(task_bp_pinned, cpu); + tsk_pinned = per_cpu(nr_task_bp_pinned, cpu); if (enable) { tsk_pinned[count]++; if (count > 0) @@ -193,7 +209,7 @@ static void toggle_bp_slot(struct perf_event *bp, bool enable) * - If attached to a single cpu, check: * * (per_cpu(nr_bp_flexible, cpu) || (per_cpu(nr_cpu_bp_pinned, cpu) - * + max(per_cpu(task_bp_pinned, cpu)))) < HBP_NUM + * + max(per_cpu(nr_task_bp_pinned, cpu)))) < HBP_NUM * * -> If there are already non-pinned counters in this cpu, it means * there is already a free slot for them. @@ -204,7 +220,7 @@ static void toggle_bp_slot(struct perf_event *bp, bool enable) * - If attached to every cpus, check: * * (per_cpu(nr_bp_flexible, *) || (max(per_cpu(nr_cpu_bp_pinned, *)) - * + max(per_cpu(task_bp_pinned, *)))) < HBP_NUM + * + max(per_cpu(nr_task_bp_pinned, *)))) < HBP_NUM * * -> This is roughly the same, except we check the number of per cpu * bp for every cpu and we keep the max one. Same for the per tasks @@ -216,7 +232,7 @@ static void toggle_bp_slot(struct perf_event *bp, bool enable) * - If attached to a single cpu, check: * * ((per_cpu(nr_bp_flexible, cpu) > 1) + per_cpu(nr_cpu_bp_pinned, cpu) - * + max(per_cpu(task_bp_pinned, cpu))) < HBP_NUM + * + max(per_cpu(nr_task_bp_pinned, cpu))) < HBP_NUM * * -> Same checks as before. But now the nr_bp_flexible, if any, must keep * one register at least (or they will never be fed). @@ -224,7 +240,7 @@ static void toggle_bp_slot(struct perf_event *bp, bool enable) * - If attached to every cpus, check: * * ((per_cpu(nr_bp_flexible, *) > 1) + max(per_cpu(nr_cpu_bp_pinned, *)) - * + max(per_cpu(task_bp_pinned, *))) < HBP_NUM + * + max(per_cpu(nr_task_bp_pinned, *))) < HBP_NUM */ int reserve_bp_slot(struct perf_event *bp) { @@ -233,7 +249,7 @@ int reserve_bp_slot(struct perf_event *bp) mutex_lock(&nr_bp_mutex); - fetch_bp_busy_slots(&slots, bp->cpu); + fetch_bp_busy_slots(&slots, bp); /* Flexible counters need to keep at least one slot */ if (slots.pinned + (!!slots.flexible) == HBP_NUM) { @@ -259,7 +275,7 @@ void release_bp_slot(struct perf_event *bp) } -int __register_perf_hw_breakpoint(struct perf_event *bp) +int register_perf_hw_breakpoint(struct perf_event *bp) { int ret; @@ -276,19 +292,12 @@ int __register_perf_hw_breakpoint(struct perf_event *bp) * This is a quick hack that will be removed soon, once we remove * the tmp breakpoints from ptrace */ - if (!bp->attr.disabled || bp->callback == perf_bp_event) + if (!bp->attr.disabled || !bp->overflow_handler) ret = arch_validate_hwbkpt_settings(bp, bp->ctx->task); return ret; } -int register_perf_hw_breakpoint(struct perf_event *bp) -{ - bp->callback = perf_bp_event; - - return __register_perf_hw_breakpoint(bp); -} - /** * register_user_hw_breakpoint - register a hardware breakpoint for user space * @attr: breakpoint attributes @@ -297,7 +306,7 @@ int register_perf_hw_breakpoint(struct perf_event *bp) */ struct perf_event * register_user_hw_breakpoint(struct perf_event_attr *attr, - perf_callback_t triggered, + perf_overflow_handler_t triggered, struct task_struct *tsk) { return perf_event_create_kernel_counter(attr, -1, tsk->pid, triggered); @@ -311,19 +320,40 @@ EXPORT_SYMBOL_GPL(register_user_hw_breakpoint); * @triggered: callback to trigger when we hit the breakpoint * @tsk: pointer to 'task_struct' of the process to which the address belongs */ -struct perf_event * -modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *attr, - perf_callback_t triggered, - struct task_struct *tsk) +int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *attr) { - /* - * FIXME: do it without unregistering - * - We don't want to lose our slot - * - If the new bp is incorrect, don't lose the older one - */ - unregister_hw_breakpoint(bp); + u64 old_addr = bp->attr.bp_addr; + int old_type = bp->attr.bp_type; + int old_len = bp->attr.bp_len; + int err = 0; - return perf_event_create_kernel_counter(attr, -1, tsk->pid, triggered); + perf_event_disable(bp); + + bp->attr.bp_addr = attr->bp_addr; + bp->attr.bp_type = attr->bp_type; + bp->attr.bp_len = attr->bp_len; + + if (attr->disabled) + goto end; + + err = arch_validate_hwbkpt_settings(bp, bp->ctx->task); + if (!err) + perf_event_enable(bp); + + if (err) { + bp->attr.bp_addr = old_addr; + bp->attr.bp_type = old_type; + bp->attr.bp_len = old_len; + if (!bp->attr.disabled) + perf_event_enable(bp); + + return err; + } + +end: + bp->attr.disabled = attr->disabled; + + return 0; } EXPORT_SYMBOL_GPL(modify_user_hw_breakpoint); @@ -348,7 +378,7 @@ EXPORT_SYMBOL_GPL(unregister_hw_breakpoint); */ struct perf_event ** register_wide_hw_breakpoint(struct perf_event_attr *attr, - perf_callback_t triggered) + perf_overflow_handler_t triggered) { struct perf_event **cpu_events, **pevent, *bp; long err; diff --git a/kernel/lockdep.c b/kernel/lockdep.c index f5dcd36d315..4f8df01dbe5 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -168,7 +168,7 @@ static void lock_time_inc(struct lock_time *lt, u64 time) if (time > lt->max) lt->max = time; - if (time < lt->min || !lt->min) + if (time < lt->min || !lt->nr) lt->min = time; lt->total += time; @@ -177,8 +177,15 @@ static void lock_time_inc(struct lock_time *lt, u64 time) static inline void lock_time_add(struct lock_time *src, struct lock_time *dst) { - dst->min += src->min; - dst->max += src->max; + if (!src->nr) + return; + + if (src->max > dst->max) + dst->max = src->max; + + if (src->min < dst->min || !dst->nr) + dst->min = src->min; + dst->total += src->total; dst->nr += src->nr; } @@ -379,7 +386,8 @@ static int save_trace(struct stack_trace *trace) * complete trace that maxes out the entries provided will be reported * as incomplete, friggin useless </rant> */ - if (trace->entries[trace->nr_entries-1] == ULONG_MAX) + if (trace->nr_entries != 0 && + trace->entries[trace->nr_entries-1] == ULONG_MAX) trace->nr_entries--; trace->max_entries = trace->nr_entries; diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 40a996ec39f..e73e53c7582 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -36,7 +36,7 @@ /* * Each CPU has a list of per CPU events: */ -DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context); +static DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context); int perf_max_events __read_mostly = 1; static int perf_reserved_percpu __read_mostly; @@ -567,7 +567,7 @@ static void __perf_event_disable(void *info) * is the current context on this CPU and preemption is disabled, * hence we can't get into perf_event_task_sched_out for this context. */ -static void perf_event_disable(struct perf_event *event) +void perf_event_disable(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; struct task_struct *task = ctx->task; @@ -971,7 +971,7 @@ static void __perf_event_enable(void *info) * perf_event_for_each_child or perf_event_for_each as described * for perf_event_disable. */ -static void perf_event_enable(struct perf_event *event) +void perf_event_enable(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; struct task_struct *task = ctx->task; @@ -1579,7 +1579,6 @@ static void __perf_event_init_context(struct perf_event_context *ctx, struct task_struct *task) { - memset(ctx, 0, sizeof(*ctx)); spin_lock_init(&ctx->lock); mutex_init(&ctx->mutex); INIT_LIST_HEAD(&ctx->group_list); @@ -1654,7 +1653,7 @@ static struct perf_event_context *find_get_context(pid_t pid, int cpu) } if (!ctx) { - ctx = kmalloc(sizeof(struct perf_event_context), GFP_KERNEL); + ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL); err = -ENOMEM; if (!ctx) goto errout; @@ -4011,6 +4010,7 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer) event->pmu->read(event); data.addr = 0; + data.raw = NULL; data.period = event->hw.last_period; regs = get_irq_regs(); /* @@ -4080,8 +4080,7 @@ static void cpu_clock_perf_event_update(struct perf_event *event) u64 now; now = cpu_clock(cpu); - prev = atomic64_read(&event->hw.prev_count); - atomic64_set(&event->hw.prev_count, now); + prev = atomic64_xchg(&event->hw.prev_count, now); atomic64_add(now - prev, &event->count); } @@ -4286,15 +4285,8 @@ static void bp_perf_event_destroy(struct perf_event *event) static const struct pmu *bp_perf_event_init(struct perf_event *bp) { int err; - /* - * The breakpoint is already filled if we haven't created the counter - * through perf syscall - * FIXME: manage to get trigerred to NULL if it comes from syscalls - */ - if (!bp->callback) - err = register_perf_hw_breakpoint(bp); - else - err = __register_perf_hw_breakpoint(bp); + + err = register_perf_hw_breakpoint(bp); if (err) return ERR_PTR(err); @@ -4308,6 +4300,7 @@ void perf_bp_event(struct perf_event *bp, void *data) struct perf_sample_data sample; struct pt_regs *regs = data; + sample.raw = NULL; sample.addr = bp->attr.bp_addr; if (!perf_exclude_event(bp, regs)) @@ -4390,7 +4383,7 @@ perf_event_alloc(struct perf_event_attr *attr, struct perf_event_context *ctx, struct perf_event *group_leader, struct perf_event *parent_event, - perf_callback_t callback, + perf_overflow_handler_t overflow_handler, gfp_t gfpflags) { const struct pmu *pmu; @@ -4433,10 +4426,10 @@ perf_event_alloc(struct perf_event_attr *attr, event->state = PERF_EVENT_STATE_INACTIVE; - if (!callback && parent_event) - callback = parent_event->callback; + if (!overflow_handler && parent_event) + overflow_handler = parent_event->overflow_handler; - event->callback = callback; + event->overflow_handler = overflow_handler; if (attr->disabled) event->state = PERF_EVENT_STATE_OFF; @@ -4776,7 +4769,8 @@ err_put_context: */ struct perf_event * perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, - pid_t pid, perf_callback_t callback) + pid_t pid, + perf_overflow_handler_t overflow_handler) { struct perf_event *event; struct perf_event_context *ctx; @@ -4793,7 +4787,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, } event = perf_event_alloc(attr, cpu, ctx, NULL, - NULL, callback, GFP_KERNEL); + NULL, overflow_handler, GFP_KERNEL); if (IS_ERR(event)) { err = PTR_ERR(event); goto err_put_context; @@ -5090,7 +5084,7 @@ again: */ int perf_event_init_task(struct task_struct *child) { - struct perf_event_context *child_ctx, *parent_ctx; + struct perf_event_context *child_ctx = NULL, *parent_ctx; struct perf_event_context *cloned_ctx; struct perf_event *event; struct task_struct *parent = current; @@ -5106,20 +5100,6 @@ int perf_event_init_task(struct task_struct *child) return 0; /* - * This is executed from the parent task context, so inherit - * events that have been marked for cloning. - * First allocate and initialize a context for the child. - */ - - child_ctx = kmalloc(sizeof(struct perf_event_context), GFP_KERNEL); - if (!child_ctx) - return -ENOMEM; - - __perf_event_init_context(child_ctx, child); - child->perf_event_ctxp = child_ctx; - get_task_struct(child); - - /* * If the parent's context is a clone, pin it so it won't get * swapped under us. */ @@ -5149,6 +5129,26 @@ int perf_event_init_task(struct task_struct *child) continue; } + if (!child->perf_event_ctxp) { + /* + * This is executed from the parent task context, so + * inherit events that have been marked for cloning. + * First allocate and initialize a context for the + * child. + */ + + child_ctx = kzalloc(sizeof(struct perf_event_context), + GFP_KERNEL); + if (!child_ctx) { + ret = -ENOMEM; + goto exit; + } + + __perf_event_init_context(child_ctx, child); + child->perf_event_ctxp = child_ctx; + get_task_struct(child); + } + ret = inherit_group(event, parent, parent_ctx, child, child_ctx); if (ret) { @@ -5177,6 +5177,7 @@ int perf_event_init_task(struct task_struct *child) get_ctx(child_ctx->parent_ctx); } +exit: mutex_unlock(&parent_ctx->mutex); perf_unpin_context(parent_ctx); diff --git a/kernel/sched.c b/kernel/sched.c index e7f2cfa6a25..ff39cadf621 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -814,6 +814,7 @@ const_debug unsigned int sysctl_sched_nr_migrate = 32; * default: 0.25ms */ unsigned int sysctl_sched_shares_ratelimit = 250000; +unsigned int normalized_sysctl_sched_shares_ratelimit = 250000; /* * Inject some fuzzyness into changing the per-cpu group shares @@ -1614,7 +1615,7 @@ static void update_group_shares_cpu(struct task_group *tg, int cpu, */ static int tg_shares_up(struct task_group *tg, void *data) { - unsigned long weight, rq_weight = 0, shares = 0; + unsigned long weight, rq_weight = 0, sum_weight = 0, shares = 0; unsigned long *usd_rq_weight; struct sched_domain *sd = data; unsigned long flags; @@ -1630,6 +1631,7 @@ static int tg_shares_up(struct task_group *tg, void *data) weight = tg->cfs_rq[i]->load.weight; usd_rq_weight[i] = weight; + rq_weight += weight; /* * If there are currently no tasks on the cpu pretend there * is one of average load so that when a new task gets to @@ -1638,10 +1640,13 @@ static int tg_shares_up(struct task_group *tg, void *data) if (!weight) weight = NICE_0_LOAD; - rq_weight += weight; + sum_weight += weight; shares += tg->cfs_rq[i]->shares; } + if (!rq_weight) + rq_weight = sum_weight; + if ((!shares && rq_weight) || shares > tg->shares) shares = tg->shares; @@ -1810,6 +1815,22 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) #endif static void calc_load_account_active(struct rq *this_rq); +static void update_sysctl(void); +static int get_update_sysctl_factor(void); + +static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) +{ + set_task_rq(p, cpu); +#ifdef CONFIG_SMP + /* + * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be + * successfuly executed on another CPU. We must ensure that updates of + * per-task data have been completed by this moment. + */ + smp_wmb(); + task_thread_info(p)->cpu = cpu; +#endif +} #include "sched_stats.h" #include "sched_idletask.c" @@ -1967,20 +1988,6 @@ inline int task_curr(const struct task_struct *p) return cpu_curr(task_cpu(p)) == p; } -static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) -{ - set_task_rq(p, cpu); -#ifdef CONFIG_SMP - /* - * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be - * successfuly executed on another CPU. We must ensure that updates of - * per-task data have been completed by this moment. - */ - smp_wmb(); - task_thread_info(p)->cpu = cpu; -#endif -} - static inline void check_class_changed(struct rq *rq, struct task_struct *p, const struct sched_class *prev_class, int oldprio, int running) @@ -2060,29 +2067,13 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) void set_task_cpu(struct task_struct *p, unsigned int new_cpu) { int old_cpu = task_cpu(p); - struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu); struct cfs_rq *old_cfsrq = task_cfs_rq(p), *new_cfsrq = cpu_cfs_rq(old_cfsrq, new_cpu); - u64 clock_offset; - - clock_offset = old_rq->clock - new_rq->clock; trace_sched_migrate_task(p, new_cpu); -#ifdef CONFIG_SCHEDSTATS - if (p->se.wait_start) - p->se.wait_start -= clock_offset; - if (p->se.sleep_start) - p->se.sleep_start -= clock_offset; - if (p->se.block_start) - p->se.block_start -= clock_offset; -#endif if (old_cpu != new_cpu) { p->se.nr_migrations++; -#ifdef CONFIG_SCHEDSTATS - if (task_hot(p, old_rq->clock, NULL)) - schedstat_inc(p, se.nr_forced2_migrations); -#endif perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 1, NULL, 0); } @@ -2323,6 +2314,14 @@ void task_oncpu_function_call(struct task_struct *p, preempt_enable(); } +#ifdef CONFIG_SMP +static inline +int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags) +{ + return p->sched_class->select_task_rq(p, sd_flags, wake_flags); +} +#endif + /*** * try_to_wake_up - wake up a thread * @p: the to-be-woken-up thread @@ -2374,17 +2373,14 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, if (task_contributes_to_load(p)) rq->nr_uninterruptible--; p->state = TASK_WAKING; - task_rq_unlock(rq, &flags); + __task_rq_unlock(rq); - cpu = p->sched_class->select_task_rq(p, SD_BALANCE_WAKE, wake_flags); - if (cpu != orig_cpu) { - local_irq_save(flags); - rq = cpu_rq(cpu); - update_rq_clock(rq); + cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags); + if (cpu != orig_cpu) set_task_cpu(p, cpu); - local_irq_restore(flags); - } - rq = task_rq_lock(p, &flags); + + rq = __task_rq_lock(p); + update_rq_clock(rq); WARN_ON(p->state != TASK_WAKING); cpu = task_cpu(p); @@ -2499,7 +2495,6 @@ static void __sched_fork(struct task_struct *p) p->se.avg_overlap = 0; p->se.start_runtime = 0; p->se.avg_wakeup = sysctl_sched_wakeup_granularity; - p->se.avg_running = 0; #ifdef CONFIG_SCHEDSTATS p->se.wait_start = 0; @@ -2521,7 +2516,6 @@ static void __sched_fork(struct task_struct *p) p->se.nr_failed_migrations_running = 0; p->se.nr_failed_migrations_hot = 0; p->se.nr_forced_migrations = 0; - p->se.nr_forced2_migrations = 0; p->se.nr_wakeups = 0; p->se.nr_wakeups_sync = 0; @@ -2558,7 +2552,6 @@ static void __sched_fork(struct task_struct *p) void sched_fork(struct task_struct *p, int clone_flags) { int cpu = get_cpu(); - unsigned long flags; __sched_fork(p); @@ -2592,13 +2585,13 @@ void sched_fork(struct task_struct *p, int clone_flags) if (!rt_prio(p->prio)) p->sched_class = &fair_sched_class; + if (p->sched_class->task_fork) + p->sched_class->task_fork(p); + #ifdef CONFIG_SMP - cpu = p->sched_class->select_task_rq(p, SD_BALANCE_FORK, 0); + cpu = select_task_rq(p, SD_BALANCE_FORK, 0); #endif - local_irq_save(flags); - update_rq_clock(cpu_rq(cpu)); set_task_cpu(p, cpu); - local_irq_restore(flags); #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) if (likely(sched_info_on())) @@ -2631,17 +2624,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) rq = task_rq_lock(p, &flags); BUG_ON(p->state != TASK_RUNNING); update_rq_clock(rq); - - if (!p->sched_class->task_new || !current->se.on_rq) { - activate_task(rq, p, 0); - } else { - /* - * Let the scheduling class do new task startup - * management (if any): - */ - p->sched_class->task_new(rq, p); - inc_nr_running(rq); - } + activate_task(rq, p, 0); trace_sched_wakeup_new(rq, p, 1); check_preempt_curr(rq, p, WF_FORK); #ifdef CONFIG_SMP @@ -3156,7 +3139,7 @@ out: void sched_exec(void) { int new_cpu, this_cpu = get_cpu(); - new_cpu = current->sched_class->select_task_rq(current, SD_BALANCE_EXEC, 0); + new_cpu = select_task_rq(current, SD_BALANCE_EXEC, 0); put_cpu(); if (new_cpu != this_cpu) sched_migrate_task(current, new_cpu); @@ -3172,10 +3155,6 @@ static void pull_task(struct rq *src_rq, struct task_struct *p, deactivate_task(src_rq, p, 0); set_task_cpu(p, this_cpu); activate_task(this_rq, p, 0); - /* - * Note that idle threads have a prio of MAX_PRIO, for this test - * to be always true for them. - */ check_preempt_curr(this_rq, p, 0); } @@ -4134,7 +4113,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, unsigned long flags; struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); - cpumask_copy(cpus, cpu_online_mask); + cpumask_copy(cpus, cpu_active_mask); /* * When power savings policy is enabled for the parent domain, idle @@ -4297,7 +4276,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd) int all_pinned = 0; struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); - cpumask_copy(cpus, cpu_online_mask); + cpumask_copy(cpus, cpu_active_mask); /* * When power savings policy is enabled for the parent domain, idle @@ -4694,7 +4673,7 @@ int select_nohz_load_balancer(int stop_tick) cpumask_set_cpu(cpu, nohz.cpu_mask); /* time for ilb owner also to sleep */ - if (cpumask_weight(nohz.cpu_mask) == num_online_cpus()) { + if (cpumask_weight(nohz.cpu_mask) == num_active_cpus()) { if (atomic_read(&nohz.load_balancer) == cpu) atomic_set(&nohz.load_balancer, -1); return 0; @@ -5396,13 +5375,14 @@ static inline void schedule_debug(struct task_struct *prev) #endif } -static void put_prev_task(struct rq *rq, struct task_struct *p) +static void put_prev_task(struct rq *rq, struct task_struct *prev) { - u64 runtime = p->se.sum_exec_runtime - p->se.prev_sum_exec_runtime; + if (prev->state == TASK_RUNNING) { + u64 runtime = prev->se.sum_exec_runtime; - update_avg(&p->se.avg_running, runtime); + runtime -= prev->se.prev_sum_exec_runtime; + runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost); - if (p->state == TASK_RUNNING) { /* * In order to avoid avg_overlap growing stale when we are * indeed overlapping and hence not getting put to sleep, grow @@ -5412,12 +5392,9 @@ static void put_prev_task(struct rq *rq, struct task_struct *p) * correlates to the amount of cache footprint a task can * build up. */ - runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost); - update_avg(&p->se.avg_overlap, runtime); - } else { - update_avg(&p->se.avg_running, 0); + update_avg(&prev->se.avg_overlap, runtime); } - p->sched_class->put_prev_task(rq, p); + prev->sched_class->put_prev_task(rq, prev); } /* @@ -6631,6 +6608,8 @@ SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len, long sched_getaffinity(pid_t pid, struct cpumask *mask) { struct task_struct *p; + unsigned long flags; + struct rq *rq; int retval; get_online_cpus(); @@ -6645,7 +6624,9 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask) if (retval) goto out_unlock; + rq = task_rq_lock(p, &flags); cpumask_and(mask, &p->cpus_allowed, cpu_online_mask); + task_rq_unlock(rq, &flags); out_unlock: read_unlock(&tasklist_lock); @@ -6883,6 +6864,8 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, { struct task_struct *p; unsigned int time_slice; + unsigned long flags; + struct rq *rq; int retval; struct timespec t; @@ -6899,7 +6882,9 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, if (retval) goto out_unlock; - time_slice = p->sched_class->get_rr_interval(p); + rq = task_rq_lock(p, &flags); + time_slice = p->sched_class->get_rr_interval(rq, p); + task_rq_unlock(rq, &flags); read_unlock(&tasklist_lock); jiffies_to_timespec(time_slice, &t); @@ -7000,7 +6985,6 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) __sched_fork(idle); idle->se.exec_start = sched_clock(); - idle->prio = idle->normal_prio = MAX_PRIO; cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu)); __set_task_cpu(idle, cpu); @@ -7041,22 +7025,43 @@ cpumask_var_t nohz_cpu_mask; * * This idea comes from the SD scheduler of Con Kolivas: */ -static inline void sched_init_granularity(void) +static int get_update_sysctl_factor(void) { - unsigned int factor = 1 + ilog2(num_online_cpus()); - const unsigned long limit = 200000000; + unsigned int cpus = min_t(int, num_online_cpus(), 8); + unsigned int factor; + + switch (sysctl_sched_tunable_scaling) { + case SCHED_TUNABLESCALING_NONE: + factor = 1; + break; + case SCHED_TUNABLESCALING_LINEAR: + factor = cpus; + break; + case SCHED_TUNABLESCALING_LOG: + default: + factor = 1 + ilog2(cpus); + break; + } - sysctl_sched_min_granularity *= factor; - if (sysctl_sched_min_granularity > limit) - sysctl_sched_min_granularity = limit; + return factor; +} - sysctl_sched_latency *= factor; - if (sysctl_sched_latency > limit) - sysctl_sched_latency = limit; +static void update_sysctl(void) +{ + unsigned int factor = get_update_sysctl_factor(); - sysctl_sched_wakeup_granularity *= factor; +#define SET_SYSCTL(name) \ + (sysctl_##name = (factor) * normalized_sysctl_##name) + SET_SYSCTL(sched_min_granularity); + SET_SYSCTL(sched_latency); + SET_SYSCTL(sched_wakeup_granularity); + SET_SYSCTL(sched_shares_ratelimit); +#undef SET_SYSCTL +} - sysctl_sched_shares_ratelimit *= factor; +static inline void sched_init_granularity(void) +{ + update_sysctl(); } #ifdef CONFIG_SMP @@ -7093,7 +7098,7 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) int ret = 0; rq = task_rq_lock(p, &flags); - if (!cpumask_intersects(new_mask, cpu_online_mask)) { + if (!cpumask_intersects(new_mask, cpu_active_mask)) { ret = -EINVAL; goto out; } @@ -7115,7 +7120,7 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) if (cpumask_test_cpu(task_cpu(p), new_mask)) goto out; - if (migrate_task(p, cpumask_any_and(cpu_online_mask, new_mask), &req)) { + if (migrate_task(p, cpumask_any_and(cpu_active_mask, new_mask), &req)) { /* Need help from migration thread: drop lock and wait. */ struct task_struct *mt = rq->migration_thread; @@ -7269,19 +7274,19 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) again: /* Look for allowed, online CPU in same node. */ - for_each_cpu_and(dest_cpu, nodemask, cpu_online_mask) + for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask) if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) goto move; /* Any allowed, online CPU? */ - dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_online_mask); + dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_active_mask); if (dest_cpu < nr_cpu_ids) goto move; /* No more Mr. Nice Guy. */ if (dest_cpu >= nr_cpu_ids) { cpuset_cpus_allowed_locked(p, &p->cpus_allowed); - dest_cpu = cpumask_any_and(cpu_online_mask, &p->cpus_allowed); + dest_cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed); /* * Don't tell them about moving exiting tasks or @@ -7310,7 +7315,7 @@ move: */ static void migrate_nr_uninterruptible(struct rq *rq_src) { - struct rq *rq_dest = cpu_rq(cpumask_any(cpu_online_mask)); + struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask)); unsigned long flags; local_irq_save(flags); @@ -7563,7 +7568,7 @@ static ctl_table *sd_alloc_ctl_cpu_table(int cpu) static struct ctl_table_header *sd_sysctl_header; static void register_sched_domain_sysctl(void) { - int i, cpu_num = num_online_cpus(); + int i, cpu_num = num_possible_cpus(); struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1); char buf[32]; @@ -7573,7 +7578,7 @@ static void register_sched_domain_sysctl(void) if (entry == NULL) return; - for_each_online_cpu(i) { + for_each_possible_cpu(i) { snprintf(buf, 32, "cpu%d", i); entry->procname = kstrdup(buf, GFP_KERNEL); entry->mode = 0555; @@ -7703,7 +7708,6 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) spin_lock_irq(&rq->lock); update_rq_clock(rq); deactivate_task(rq, rq->idle, 0); - rq->idle->static_prio = MAX_PRIO; __setscheduler(rq, rq->idle, SCHED_NORMAL, 0); rq->idle->sched_class = &idle_sched_class; migrate_dead_tasks(cpu); @@ -9099,7 +9103,7 @@ match1: if (doms_new == NULL) { ndoms_cur = 0; doms_new = &fallback_doms; - cpumask_andnot(doms_new[0], cpu_online_mask, cpu_isolated_map); + cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map); WARN_ON_ONCE(dattr_new); } @@ -9230,8 +9234,10 @@ static int update_sched_domains(struct notifier_block *nfb, switch (action) { case CPU_ONLINE: case CPU_ONLINE_FROZEN: - case CPU_DEAD: - case CPU_DEAD_FROZEN: + case CPU_DOWN_PREPARE: + case CPU_DOWN_PREPARE_FROZEN: + case CPU_DOWN_FAILED: + case CPU_DOWN_FAILED_FROZEN: partition_sched_domains(1, NULL, NULL); return NOTIFY_OK; @@ -9278,7 +9284,7 @@ void __init sched_init_smp(void) #endif get_online_cpus(); mutex_lock(&sched_domains_mutex); - arch_init_sched_domains(cpu_online_mask); + arch_init_sched_domains(cpu_active_mask); cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); if (cpumask_empty(non_isolated_cpus)) cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); @@ -9842,13 +9848,15 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) se = kzalloc_node(sizeof(struct sched_entity), GFP_KERNEL, cpu_to_node(i)); if (!se) - goto err; + goto err_free_rq; init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]); } return 1; + err_free_rq: + kfree(cfs_rq); err: return 0; } @@ -9930,13 +9938,15 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) rt_se = kzalloc_node(sizeof(struct sched_rt_entity), GFP_KERNEL, cpu_to_node(i)); if (!rt_se) - goto err; + goto err_free_rq; init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]); } return 1; + err_free_rq: + kfree(rt_rq); err: return 0; } diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 6988cf08f70..5ae24fc65d7 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -309,6 +309,12 @@ static void print_cpu(struct seq_file *m, int cpu) print_rq(m, rq, cpu); } +static const char *sched_tunable_scaling_names[] = { + "none", + "logaritmic", + "linear" +}; + static int sched_debug_show(struct seq_file *m, void *v) { u64 now = ktime_to_ns(ktime_get()); @@ -334,6 +340,10 @@ static int sched_debug_show(struct seq_file *m, void *v) #undef PN #undef P + SEQ_printf(m, " .%-40s: %d (%s)\n", "sysctl_sched_tunable_scaling", + sysctl_sched_tunable_scaling, + sched_tunable_scaling_names[sysctl_sched_tunable_scaling]); + for_each_online_cpu(cpu) print_cpu(m, cpu); @@ -399,7 +409,6 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) PN(se.sum_exec_runtime); PN(se.avg_overlap); PN(se.avg_wakeup); - PN(se.avg_running); nr_switches = p->nvcsw + p->nivcsw; @@ -423,7 +432,6 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) P(se.nr_failed_migrations_running); P(se.nr_failed_migrations_hot); P(se.nr_forced_migrations); - P(se.nr_forced2_migrations); P(se.nr_wakeups); P(se.nr_wakeups_sync); P(se.nr_wakeups_migrate); @@ -499,7 +507,6 @@ void proc_sched_set_task(struct task_struct *p) p->se.nr_failed_migrations_running = 0; p->se.nr_failed_migrations_hot = 0; p->se.nr_forced_migrations = 0; - p->se.nr_forced2_migrations = 0; p->se.nr_wakeups = 0; p->se.nr_wakeups_sync = 0; p->se.nr_wakeups_migrate = 0; diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index f61837ad336..804a411838f 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -21,6 +21,7 @@ */ #include <linux/latencytop.h> +#include <linux/sched.h> /* * Targeted preemption latency for CPU-bound tasks: @@ -35,12 +36,26 @@ * run vmstat and monitor the context-switches (cs) field) */ unsigned int sysctl_sched_latency = 5000000ULL; +unsigned int normalized_sysctl_sched_latency = 5000000ULL; + +/* + * The initial- and re-scaling of tunables is configurable + * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)) + * + * Options are: + * SCHED_TUNABLESCALING_NONE - unscaled, always *1 + * SCHED_TUNABLESCALING_LOG - scaled logarithmical, *1+ilog(ncpus) + * SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus + */ +enum sched_tunable_scaling sysctl_sched_tunable_scaling + = SCHED_TUNABLESCALING_LOG; /* * Minimal preemption granularity for CPU-bound tasks: * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds) */ unsigned int sysctl_sched_min_granularity = 1000000ULL; +unsigned int normalized_sysctl_sched_min_granularity = 1000000ULL; /* * is kept at sysctl_sched_latency / sysctl_sched_min_granularity @@ -70,6 +85,7 @@ unsigned int __read_mostly sysctl_sched_compat_yield; * have immediate wakeup/sleep latencies. */ unsigned int sysctl_sched_wakeup_granularity = 1000000UL; +unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL; const_debug unsigned int sysctl_sched_migration_cost = 500000UL; @@ -383,11 +399,12 @@ static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) */ #ifdef CONFIG_SCHED_DEBUG -int sched_nr_latency_handler(struct ctl_table *table, int write, +int sched_proc_update_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + int factor = get_update_sysctl_factor(); if (ret || !write) return ret; @@ -395,6 +412,14 @@ int sched_nr_latency_handler(struct ctl_table *table, int write, sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency, sysctl_sched_min_granularity); +#define WRT_SYSCTL(name) \ + (normalized_sysctl_##name = sysctl_##name / (factor)) + WRT_SYSCTL(sched_min_granularity); + WRT_SYSCTL(sched_latency); + WRT_SYSCTL(sched_wakeup_granularity); + WRT_SYSCTL(sched_shares_ratelimit); +#undef WRT_SYSCTL + return 0; } #endif @@ -1403,7 +1428,6 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag new_cpu = prev_cpu; } - rcu_read_lock(); for_each_domain(cpu, tmp) { /* * If power savings logic is enabled for a domain, see if we @@ -1484,10 +1508,8 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag update_shares(tmp); } - if (affine_sd && wake_affine(affine_sd, p, sync)) { - new_cpu = cpu; - goto out; - } + if (affine_sd && wake_affine(affine_sd, p, sync)) + return cpu; while (sd) { int load_idx = sd->forkexec_idx; @@ -1528,8 +1550,6 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag /* while loop will break here if sd == NULL */ } -out: - rcu_read_unlock(); return new_cpu; } #endif /* CONFIG_SMP */ @@ -1651,12 +1671,8 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ int sync = wake_flags & WF_SYNC; int scale = cfs_rq->nr_running >= sched_nr_latency; - update_curr(cfs_rq); - - if (unlikely(rt_prio(p->prio))) { - resched_task(curr); - return; - } + if (unlikely(rt_prio(p->prio))) + goto preempt; if (unlikely(p->sched_class != &fair_sched_class)) return; @@ -1682,50 +1698,44 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ return; /* Idle tasks are by definition preempted by everybody. */ - if (unlikely(curr->policy == SCHED_IDLE)) { - resched_task(curr); - return; - } + if (unlikely(curr->policy == SCHED_IDLE)) + goto preempt; - if ((sched_feat(WAKEUP_SYNC) && sync) || - (sched_feat(WAKEUP_OVERLAP) && - (se->avg_overlap < sysctl_sched_migration_cost && - pse->avg_overlap < sysctl_sched_migration_cost))) { - resched_task(curr); - return; - } + if (sched_feat(WAKEUP_SYNC) && sync) + goto preempt; - if (sched_feat(WAKEUP_RUNNING)) { - if (pse->avg_running < se->avg_running) { - set_next_buddy(pse); - resched_task(curr); - return; - } - } + if (sched_feat(WAKEUP_OVERLAP) && + se->avg_overlap < sysctl_sched_migration_cost && + pse->avg_overlap < sysctl_sched_migration_cost) + goto preempt; if (!sched_feat(WAKEUP_PREEMPT)) return; + update_curr(cfs_rq); find_matching_se(&se, &pse); - BUG_ON(!pse); + if (wakeup_preempt_entity(se, pse) == 1) + goto preempt; - if (wakeup_preempt_entity(se, pse) == 1) { - resched_task(curr); - /* - * Only set the backward buddy when the current task is still - * on the rq. This can happen when a wakeup gets interleaved - * with schedule on the ->pre_schedule() or idle_balance() - * point, either of which can * drop the rq lock. - * - * Also, during early boot the idle thread is in the fair class, - * for obvious reasons its a bad idea to schedule back to it. - */ - if (unlikely(!se->on_rq || curr == rq->idle)) - return; - if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se)) - set_last_buddy(se); - } + return; + +preempt: + resched_task(curr); + /* + * Only set the backward buddy when the current task is still + * on the rq. This can happen when a wakeup gets interleaved + * with schedule on the ->pre_schedule() or idle_balance() + * point, either of which can * drop the rq lock. + * + * Also, during early boot the idle thread is in the fair class, + * for obvious reasons its a bad idea to schedule back to it. + */ + if (unlikely(!se->on_rq || curr == rq->idle)) + return; + + if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se)) + set_last_buddy(se); } static struct task_struct *pick_next_task_fair(struct rq *rq) @@ -1905,6 +1915,17 @@ move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, return 0; } + +static void rq_online_fair(struct rq *rq) +{ + update_sysctl(); +} + +static void rq_offline_fair(struct rq *rq) +{ + update_sysctl(); +} + #endif /* CONFIG_SMP */ /* @@ -1922,28 +1943,30 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) } /* - * Share the fairness runtime between parent and child, thus the - * total amount of pressure for CPU stays equal - new tasks - * get a chance to run but frequent forkers are not allowed to - * monopolize the CPU. Note: the parent runqueue is locked, - * the child is not running yet. + * called on fork with the child task as argument from the parent's context + * - child not yet on the tasklist + * - preemption disabled */ -static void task_new_fair(struct rq *rq, struct task_struct *p) +static void task_fork_fair(struct task_struct *p) { - struct cfs_rq *cfs_rq = task_cfs_rq(p); + struct cfs_rq *cfs_rq = task_cfs_rq(current); struct sched_entity *se = &p->se, *curr = cfs_rq->curr; int this_cpu = smp_processor_id(); + struct rq *rq = this_rq(); + unsigned long flags; + + spin_lock_irqsave(&rq->lock, flags); - sched_info_queued(p); + if (unlikely(task_cpu(p) != this_cpu)) + __set_task_cpu(p, this_cpu); update_curr(cfs_rq); + if (curr) se->vruntime = curr->vruntime; place_entity(cfs_rq, se, 1); - /* 'curr' will be NULL if the child belongs to a different group */ - if (sysctl_sched_child_runs_first && this_cpu == task_cpu(p) && - curr && entity_before(curr, se)) { + if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) { /* * Upon rescheduling, sched_class::put_prev_task() will place * 'current' within the tree based on its new key value. @@ -1952,7 +1975,7 @@ static void task_new_fair(struct rq *rq, struct task_struct *p) resched_task(rq->curr); } - enqueue_task_fair(rq, p, 0); + spin_unlock_irqrestore(&rq->lock, flags); } /* @@ -2014,21 +2037,17 @@ static void moved_group_fair(struct task_struct *p) } #endif -unsigned int get_rr_interval_fair(struct task_struct *task) +unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task) { struct sched_entity *se = &task->se; - unsigned long flags; - struct rq *rq; unsigned int rr_interval = 0; /* * Time slice is 0 for SCHED_OTHER tasks that are on an otherwise * idle runqueue: */ - rq = task_rq_lock(task, &flags); if (rq->cfs.load.weight) rr_interval = NS_TO_JIFFIES(sched_slice(&rq->cfs, se)); - task_rq_unlock(rq, &flags); return rr_interval; } @@ -2052,11 +2071,13 @@ static const struct sched_class fair_sched_class = { .load_balance = load_balance_fair, .move_one_task = move_one_task_fair, + .rq_online = rq_online_fair, + .rq_offline = rq_offline_fair, #endif .set_curr_task = set_curr_task_fair, .task_tick = task_tick_fair, - .task_new = task_new_fair, + .task_fork = task_fork_fair, .prio_changed = prio_changed_fair, .switched_to = switched_to_fair, diff --git a/kernel/sched_features.h b/kernel/sched_features.h index 0d94083582c..d5059fd761d 100644 --- a/kernel/sched_features.h +++ b/kernel/sched_features.h @@ -54,11 +54,6 @@ SCHED_FEAT(WAKEUP_SYNC, 0) SCHED_FEAT(WAKEUP_OVERLAP, 0) /* - * Wakeup preemption towards tasks that run short - */ -SCHED_FEAT(WAKEUP_RUNNING, 0) - -/* * Use the SYNC wakeup hint, pipes and the likes use this to indicate * the remote end is likely to consume the data we just wrote, and * therefore has cache benefit from being placed on the same cpu, see diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c index b133a28fcde..33d5384a73a 100644 --- a/kernel/sched_idletask.c +++ b/kernel/sched_idletask.c @@ -97,7 +97,7 @@ static void prio_changed_idle(struct rq *rq, struct task_struct *p, check_preempt_curr(rq, p, 0); } -unsigned int get_rr_interval_idle(struct task_struct *task) +unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task) { return 0; } diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 5c5fef37841..aecbd9c6b20 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -1721,7 +1721,7 @@ static void set_curr_task_rt(struct rq *rq) dequeue_pushable_task(rq, p); } -unsigned int get_rr_interval_rt(struct task_struct *task) +unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task) { /* * Time slice is 0 for SCHED_FIFO tasks diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 9327a26765c..554ac4894f0 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -244,6 +244,10 @@ static int min_sched_granularity_ns = 100000; /* 100 usecs */ static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */ static int min_wakeup_granularity_ns; /* 0 usecs */ static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */ +static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE; +static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1; +static int min_sched_shares_ratelimit = 100000; /* 100 usec */ +static int max_sched_shares_ratelimit = NSEC_PER_SEC; /* 1 second */ #endif static struct ctl_table kern_table[] = { @@ -260,7 +264,7 @@ static struct ctl_table kern_table[] = { .data = &sysctl_sched_min_granularity, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = sched_nr_latency_handler, + .proc_handler = sched_proc_update_handler, .extra1 = &min_sched_granularity_ns, .extra2 = &max_sched_granularity_ns, }, @@ -269,7 +273,7 @@ static struct ctl_table kern_table[] = { .data = &sysctl_sched_latency, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = sched_nr_latency_handler, + .proc_handler = sched_proc_update_handler, .extra1 = &min_sched_granularity_ns, .extra2 = &max_sched_granularity_ns, }, @@ -278,7 +282,7 @@ static struct ctl_table kern_table[] = { .data = &sysctl_sched_wakeup_granularity, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = sched_proc_update_handler, .extra1 = &min_wakeup_granularity_ns, .extra2 = &max_wakeup_granularity_ns, }, @@ -287,7 +291,18 @@ static struct ctl_table kern_table[] = { .data = &sysctl_sched_shares_ratelimit, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = sched_proc_update_handler, + .extra1 = &min_sched_shares_ratelimit, + .extra2 = &max_sched_shares_ratelimit, + }, + { + .procname = "sched_tunable_scaling", + .data = &sysctl_sched_tunable_scaling, + .maxlen = sizeof(enum sched_tunable_scaling), + .mode = 0644, + .proc_handler = sched_proc_update_handler, + .extra1 = &min_sched_tunable_scaling, + .extra2 = &max_sched_tunable_scaling, }, { .procname = "sched_shares_thresh", @@ -298,13 +313,6 @@ static struct ctl_table kern_table[] = { .extra1 = &zero, }, { - .procname = "sched_features", - .data = &sysctl_sched_features, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { .procname = "sched_migration_cost", .data = &sysctl_sched_migration_cost, .maxlen = sizeof(unsigned int), diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index 665c76edbf1..9d80db4747d 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -150,6 +150,9 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now) P_ns(expires_next); P(hres_active); P(nr_events); + P(nr_retries); + P(nr_hangs); + P_ns(max_hang_time); #endif #undef P #undef P_ns @@ -254,7 +257,7 @@ static int timer_list_show(struct seq_file *m, void *v) u64 now = ktime_to_ns(ktime_get()); int cpu; - SEQ_printf(m, "Timer List Version: v0.4\n"); + SEQ_printf(m, "Timer List Version: v0.5\n"); SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES); SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 874f2893cff..88bd9ae2a9e 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1361,11 +1361,7 @@ int trace_array_vprintk(struct trace_array *tr, pause_graph_tracing(); raw_local_irq_save(irq_flags); __raw_spin_lock(&trace_buf_lock); - if (args == NULL) { - strncpy(trace_buf, fmt, TRACE_BUF_SIZE); - len = strlen(trace_buf); - } else - len = vsnprintf(trace_buf, TRACE_BUF_SIZE, fmt, args); + len = vsnprintf(trace_buf, TRACE_BUF_SIZE, fmt, args); size = sizeof(*entry) + len + 1; buffer = tr->buffer; @@ -1516,6 +1512,8 @@ static void *s_next(struct seq_file *m, void *v, loff_t *pos) int i = (int)*pos; void *ent; + WARN_ON_ONCE(iter->leftover); + (*pos)++; /* can't go backwards */ @@ -1614,8 +1612,16 @@ static void *s_start(struct seq_file *m, loff_t *pos) ; } else { - l = *pos - 1; - p = s_next(m, p, &l); + /* + * If we overflowed the seq_file before, then we want + * to just reuse the trace_seq buffer again. + */ + if (iter->leftover) + p = iter; + else { + l = *pos - 1; + p = s_next(m, p, &l); + } } trace_event_read_lock(); @@ -1923,6 +1929,7 @@ static enum print_line_t print_trace_line(struct trace_iterator *iter) static int s_show(struct seq_file *m, void *v) { struct trace_iterator *iter = v; + int ret; if (iter->ent == NULL) { if (iter->tr) { @@ -1942,9 +1949,27 @@ static int s_show(struct seq_file *m, void *v) if (!(trace_flags & TRACE_ITER_VERBOSE)) print_func_help_header(m); } + } else if (iter->leftover) { + /* + * If we filled the seq_file buffer earlier, we + * want to just show it now. + */ + ret = trace_print_seq(m, &iter->seq); + + /* ret should this time be zero, but you never know */ + iter->leftover = ret; + } else { print_trace_line(iter); - trace_print_seq(m, &iter->seq); + ret = trace_print_seq(m, &iter->seq); + /* + * If we overflow the seq_file buffer, then it will + * ask us for this data again at start up. + * Use that instead. + * ret is 0 if seq_file write succeeded. + * -1 otherwise. + */ + iter->leftover = ret; } return 0; @@ -2898,6 +2923,10 @@ static int tracing_release_pipe(struct inode *inode, struct file *file) else cpumask_clear_cpu(iter->cpu_file, tracing_reader_cpumask); + + if (iter->trace->pipe_close) + iter->trace->pipe_close(iter); + mutex_unlock(&trace_types_lock); free_cpumask_var(iter->started); @@ -3320,6 +3349,16 @@ tracing_entries_write(struct file *filp, const char __user *ubuf, return cnt; } +static int mark_printk(const char *fmt, ...) +{ + int ret; + va_list args; + va_start(args, fmt); + ret = trace_vprintk(0, fmt, args); + va_end(args); + return ret; +} + static ssize_t tracing_mark_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *fpos) @@ -3346,7 +3385,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, } else buf[cnt] = '\0'; - cnt = trace_vprintk(0, buf, NULL); + cnt = mark_printk("%s", buf); kfree(buf); *fpos += cnt; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 1d7f4830a80..7fa33cab696 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -272,6 +272,7 @@ struct tracer_flags { * @pipe_open: called when the trace_pipe file is opened * @wait_pipe: override how the user waits for traces on trace_pipe * @close: called when the trace file is released + * @pipe_close: called when the trace_pipe file is released * @read: override the default read callback on trace_pipe * @splice_read: override the default splice_read callback on trace_pipe * @selftest: selftest to run on boot (see trace_selftest.c) @@ -290,6 +291,7 @@ struct tracer { void (*pipe_open)(struct trace_iterator *iter); void (*wait_pipe)(struct trace_iterator *iter); void (*close)(struct trace_iterator *iter); + void (*pipe_close)(struct trace_iterator *iter); ssize_t (*read)(struct trace_iterator *iter, struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos); diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 45e6c01b2e4..a43d009c561 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -14,9 +14,20 @@ #include "trace.h" #include "trace_output.h" -struct fgraph_data { +struct fgraph_cpu_data { pid_t last_pid; int depth; + int ignore; +}; + +struct fgraph_data { + struct fgraph_cpu_data *cpu_data; + + /* Place to preserve last processed entry. */ + struct ftrace_graph_ent_entry ent; + struct ftrace_graph_ret_entry ret; + int failed; + int cpu; }; #define TRACE_GRAPH_INDENT 2 @@ -384,7 +395,7 @@ verif_pid(struct trace_seq *s, pid_t pid, int cpu, struct fgraph_data *data) if (!data) return TRACE_TYPE_HANDLED; - last_pid = &(per_cpu_ptr(data, cpu)->last_pid); + last_pid = &(per_cpu_ptr(data->cpu_data, cpu)->last_pid); if (*last_pid == pid) return TRACE_TYPE_HANDLED; @@ -435,26 +446,49 @@ static struct ftrace_graph_ret_entry * get_return_for_leaf(struct trace_iterator *iter, struct ftrace_graph_ent_entry *curr) { - struct ring_buffer_iter *ring_iter; + struct fgraph_data *data = iter->private; + struct ring_buffer_iter *ring_iter = NULL; struct ring_buffer_event *event; struct ftrace_graph_ret_entry *next; - ring_iter = iter->buffer_iter[iter->cpu]; + /* + * If the previous output failed to write to the seq buffer, + * then we just reuse the data from before. + */ + if (data && data->failed) { + curr = &data->ent; + next = &data->ret; + } else { - /* First peek to compare current entry and the next one */ - if (ring_iter) - event = ring_buffer_iter_peek(ring_iter, NULL); - else { - /* We need to consume the current entry to see the next one */ - ring_buffer_consume(iter->tr->buffer, iter->cpu, NULL); - event = ring_buffer_peek(iter->tr->buffer, iter->cpu, - NULL); - } + ring_iter = iter->buffer_iter[iter->cpu]; + + /* First peek to compare current entry and the next one */ + if (ring_iter) + event = ring_buffer_iter_peek(ring_iter, NULL); + else { + /* + * We need to consume the current entry to see + * the next one. + */ + ring_buffer_consume(iter->tr->buffer, iter->cpu, NULL); + event = ring_buffer_peek(iter->tr->buffer, iter->cpu, + NULL); + } - if (!event) - return NULL; + if (!event) + return NULL; + + next = ring_buffer_event_data(event); - next = ring_buffer_event_data(event); + if (data) { + /* + * Save current and next entries for later reference + * if the output fails. + */ + data->ent = *curr; + data->ret = *next; + } + } if (next->ent.type != TRACE_GRAPH_RET) return NULL; @@ -640,7 +674,7 @@ print_graph_entry_leaf(struct trace_iterator *iter, if (data) { int cpu = iter->cpu; - int *depth = &(per_cpu_ptr(data, cpu)->depth); + int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth); /* * Comments display at + 1 to depth. Since @@ -688,7 +722,7 @@ print_graph_entry_nested(struct trace_iterator *iter, if (data) { int cpu = iter->cpu; - int *depth = &(per_cpu_ptr(data, cpu)->depth); + int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth); *depth = call->depth; } @@ -782,19 +816,34 @@ static enum print_line_t print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s, struct trace_iterator *iter) { - int cpu = iter->cpu; + struct fgraph_data *data = iter->private; struct ftrace_graph_ent *call = &field->graph_ent; struct ftrace_graph_ret_entry *leaf_ret; + static enum print_line_t ret; + int cpu = iter->cpu; if (print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func)) return TRACE_TYPE_PARTIAL_LINE; leaf_ret = get_return_for_leaf(iter, field); if (leaf_ret) - return print_graph_entry_leaf(iter, field, leaf_ret, s); + ret = print_graph_entry_leaf(iter, field, leaf_ret, s); else - return print_graph_entry_nested(iter, field, s, cpu); + ret = print_graph_entry_nested(iter, field, s, cpu); + if (data) { + /* + * If we failed to write our output, then we need to make + * note of it. Because we already consumed our entry. + */ + if (s->full) { + data->failed = 1; + data->cpu = cpu; + } else + data->failed = 0; + } + + return ret; } static enum print_line_t @@ -810,7 +859,7 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, if (data) { int cpu = iter->cpu; - int *depth = &(per_cpu_ptr(data, cpu)->depth); + int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth); /* * Comments display at + 1 to depth. This is the @@ -873,7 +922,7 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent, int i; if (data) - depth = per_cpu_ptr(data, iter->cpu)->depth; + depth = per_cpu_ptr(data->cpu_data, iter->cpu)->depth; if (print_graph_prologue(iter, s, 0, 0)) return TRACE_TYPE_PARTIAL_LINE; @@ -941,8 +990,33 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent, enum print_line_t print_graph_function(struct trace_iterator *iter) { + struct ftrace_graph_ent_entry *field; + struct fgraph_data *data = iter->private; struct trace_entry *entry = iter->ent; struct trace_seq *s = &iter->seq; + int cpu = iter->cpu; + int ret; + + if (data && per_cpu_ptr(data->cpu_data, cpu)->ignore) { + per_cpu_ptr(data->cpu_data, cpu)->ignore = 0; + return TRACE_TYPE_HANDLED; + } + + /* + * If the last output failed, there's a possibility we need + * to print out the missing entry which would never go out. + */ + if (data && data->failed) { + field = &data->ent; + iter->cpu = data->cpu; + ret = print_graph_entry(field, s, iter); + if (ret == TRACE_TYPE_HANDLED && iter->cpu != cpu) { + per_cpu_ptr(data->cpu_data, iter->cpu)->ignore = 1; + ret = TRACE_TYPE_NO_CONSUME; + } + iter->cpu = cpu; + return ret; + } switch (entry->type) { case TRACE_GRAPH_ENT: { @@ -952,7 +1026,7 @@ print_graph_function(struct trace_iterator *iter) * sizeof(struct ftrace_graph_ent_entry) is very small, * it can be safely saved at the stack. */ - struct ftrace_graph_ent_entry *field, saved; + struct ftrace_graph_ent_entry saved; trace_assign_type(field, entry); saved = *field; return print_graph_entry(&saved, s, iter); @@ -1030,31 +1104,54 @@ static void print_graph_headers(struct seq_file *s) static void graph_trace_open(struct trace_iterator *iter) { /* pid and depth on the last trace processed */ - struct fgraph_data *data = alloc_percpu(struct fgraph_data); + struct fgraph_data *data; int cpu; + iter->private = NULL; + + data = kzalloc(sizeof(*data), GFP_KERNEL); if (!data) - pr_warning("function graph tracer: not enough memory\n"); - else - for_each_possible_cpu(cpu) { - pid_t *pid = &(per_cpu_ptr(data, cpu)->last_pid); - int *depth = &(per_cpu_ptr(data, cpu)->depth); - *pid = -1; - *depth = 0; - } + goto out_err; + + data->cpu_data = alloc_percpu(struct fgraph_cpu_data); + if (!data->cpu_data) + goto out_err_free; + + for_each_possible_cpu(cpu) { + pid_t *pid = &(per_cpu_ptr(data->cpu_data, cpu)->last_pid); + int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth); + int *ignore = &(per_cpu_ptr(data->cpu_data, cpu)->ignore); + *pid = -1; + *depth = 0; + *ignore = 0; + } iter->private = data; + + return; + + out_err_free: + kfree(data); + out_err: + pr_warning("function graph tracer: not enough memory\n"); } static void graph_trace_close(struct trace_iterator *iter) { - free_percpu(iter->private); + struct fgraph_data *data = iter->private; + + if (data) { + free_percpu(data->cpu_data); + kfree(data); + } } static struct tracer graph_trace __read_mostly = { .name = "function_graph", .open = graph_trace_open, + .pipe_open = graph_trace_open, .close = graph_trace_close, + .pipe_close = graph_trace_close, .wait_pipe = poll_wait_pipe, .init = graph_trace_init, .reset = graph_trace_reset, diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index aff5f80b59b..b52d397e57e 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -606,23 +606,22 @@ static int create_trace_probe(int argc, char **argv) */ struct trace_probe *tp; int i, ret = 0; - int is_return = 0; + int is_return = 0, is_delete = 0; char *symbol = NULL, *event = NULL, *arg = NULL, *group = NULL; unsigned long offset = 0; void *addr = NULL; char buf[MAX_EVENT_NAME_LEN]; - if (argc < 2) { - pr_info("Probe point is not specified.\n"); - return -EINVAL; - } - + /* argc must be >= 1 */ if (argv[0][0] == 'p') is_return = 0; else if (argv[0][0] == 'r') is_return = 1; + else if (argv[0][0] == '-') + is_delete = 1; else { - pr_info("Probe definition must be started with 'p' or 'r'.\n"); + pr_info("Probe definition must be started with 'p', 'r' or" + " '-'.\n"); return -EINVAL; } @@ -642,7 +641,29 @@ static int create_trace_probe(int argc, char **argv) return -EINVAL; } } + if (!group) + group = KPROBE_EVENT_SYSTEM; + if (is_delete) { + if (!event) { + pr_info("Delete command needs an event name.\n"); + return -EINVAL; + } + tp = find_probe_event(event, group); + if (!tp) { + pr_info("Event %s/%s doesn't exist.\n", group, event); + return -ENOENT; + } + /* delete an event */ + unregister_trace_probe(tp); + free_trace_probe(tp); + return 0; + } + + if (argc < 2) { + pr_info("Probe point is not specified.\n"); + return -EINVAL; + } if (isdigit(argv[1][0])) { if (is_return) { pr_info("Return probe point must be a symbol.\n"); @@ -671,8 +692,6 @@ static int create_trace_probe(int argc, char **argv) argc -= 2; argv += 2; /* setup a probe */ - if (!group) - group = KPROBE_EVENT_SYSTEM; if (!event) { /* Make a new event name */ if (symbol) @@ -1114,7 +1133,7 @@ static int kprobe_event_define_fields(struct ftrace_event_call *event_call) struct trace_probe *tp = (struct trace_probe *)event_call->data; ret = trace_define_common_fields(event_call); - if (!ret) + if (ret) return ret; DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0); @@ -1132,7 +1151,7 @@ static int kretprobe_event_define_fields(struct ftrace_event_call *event_call) struct trace_probe *tp = (struct trace_probe *)event_call->data; ret = trace_define_common_fields(event_call); - if (!ret) + if (ret) return ret; DEFINE_FIELD(unsigned long, func, FIELD_STRING_FUNC, 0); diff --git a/kernel/trace/trace_ksym.c b/kernel/trace/trace_ksym.c index ddfa0fd43bc..acb87d4a4ac 100644 --- a/kernel/trace/trace_ksym.c +++ b/kernel/trace/trace_ksym.c @@ -79,11 +79,12 @@ void ksym_collect_stats(unsigned long hbp_hit_addr) } #endif /* CONFIG_PROFILE_KSYM_TRACER */ -void ksym_hbp_handler(struct perf_event *hbp, void *data) +void ksym_hbp_handler(struct perf_event *hbp, int nmi, + struct perf_sample_data *data, + struct pt_regs *regs) { struct ring_buffer_event *event; struct ksym_trace_entry *entry; - struct pt_regs *regs = data; struct ring_buffer *buffer; int pc; diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index b6c12c6a1bc..8e46b3323cd 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -23,13 +23,21 @@ static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly; static int next_event_type = __TRACE_LAST_TYPE + 1; -void trace_print_seq(struct seq_file *m, struct trace_seq *s) +int trace_print_seq(struct seq_file *m, struct trace_seq *s) { int len = s->len >= PAGE_SIZE ? PAGE_SIZE - 1 : s->len; + int ret; + + ret = seq_write(m, s->buffer, len); - seq_write(m, s->buffer, len); + /* + * Only reset this buffer if we successfully wrote to the + * seq_file buffer. + */ + if (!ret) + trace_seq_init(s); - trace_seq_init(s); + return ret; } enum print_line_t trace_print_bprintk_msg_only(struct trace_iterator *iter) @@ -85,7 +93,7 @@ trace_seq_printf(struct trace_seq *s, const char *fmt, ...) va_list ap; int ret; - if (!len) + if (s->full || !len) return 0; va_start(ap, fmt); @@ -93,8 +101,10 @@ trace_seq_printf(struct trace_seq *s, const char *fmt, ...) va_end(ap); /* If we can't write it all, don't bother writing anything */ - if (ret >= len) + if (ret >= len) { + s->full = 1; return 0; + } s->len += ret; @@ -119,14 +129,16 @@ trace_seq_vprintf(struct trace_seq *s, const char *fmt, va_list args) int len = (PAGE_SIZE - 1) - s->len; int ret; - if (!len) + if (s->full || !len) return 0; ret = vsnprintf(s->buffer + s->len, len, fmt, args); /* If we can't write it all, don't bother writing anything */ - if (ret >= len) + if (ret >= len) { + s->full = 1; return 0; + } s->len += ret; @@ -139,14 +151,16 @@ int trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary) int len = (PAGE_SIZE - 1) - s->len; int ret; - if (!len) + if (s->full || !len) return 0; ret = bstr_printf(s->buffer + s->len, len, fmt, binary); /* If we can't write it all, don't bother writing anything */ - if (ret >= len) + if (ret >= len) { + s->full = 1; return 0; + } s->len += ret; @@ -167,8 +181,13 @@ int trace_seq_puts(struct trace_seq *s, const char *str) { int len = strlen(str); - if (len > ((PAGE_SIZE - 1) - s->len)) + if (s->full) + return 0; + + if (len > ((PAGE_SIZE - 1) - s->len)) { + s->full = 1; return 0; + } memcpy(s->buffer + s->len, str, len); s->len += len; @@ -178,9 +197,14 @@ int trace_seq_puts(struct trace_seq *s, const char *str) int trace_seq_putc(struct trace_seq *s, unsigned char c) { - if (s->len >= (PAGE_SIZE - 1)) + if (s->full) return 0; + if (s->len >= (PAGE_SIZE - 1)) { + s->full = 1; + return 0; + } + s->buffer[s->len++] = c; return 1; @@ -188,9 +212,14 @@ int trace_seq_putc(struct trace_seq *s, unsigned char c) int trace_seq_putmem(struct trace_seq *s, const void *mem, size_t len) { - if (len > ((PAGE_SIZE - 1) - s->len)) + if (s->full) return 0; + if (len > ((PAGE_SIZE - 1) - s->len)) { + s->full = 1; + return 0; + } + memcpy(s->buffer + s->len, mem, len); s->len += len; @@ -203,6 +232,9 @@ int trace_seq_putmem_hex(struct trace_seq *s, const void *mem, size_t len) const unsigned char *data = mem; int i, j; + if (s->full) + return 0; + #ifdef __BIG_ENDIAN for (i = 0, j = 0; i < len; i++) { #else @@ -220,8 +252,13 @@ void *trace_seq_reserve(struct trace_seq *s, size_t len) { void *ret; - if (len > ((PAGE_SIZE - 1) - s->len)) + if (s->full) + return 0; + + if (len > ((PAGE_SIZE - 1) - s->len)) { + s->full = 1; return NULL; + } ret = s->buffer + s->len; s->len += len; @@ -233,8 +270,14 @@ int trace_seq_path(struct trace_seq *s, struct path *path) { unsigned char *p; - if (s->len >= (PAGE_SIZE - 1)) + if (s->full) + return 0; + + if (s->len >= (PAGE_SIZE - 1)) { + s->full = 1; return 0; + } + p = d_path(path, s->buffer + s->len, PAGE_SIZE - s->len); if (!IS_ERR(p)) { p = mangle_path(s->buffer + s->len, p, "\n"); @@ -247,6 +290,7 @@ int trace_seq_path(struct trace_seq *s, struct path *path) return 1; } + s->full = 1; return 0; } @@ -373,6 +417,9 @@ int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm, unsigned long vmstart = 0; int ret = 1; + if (s->full) + return 0; + if (mm) { const struct vm_area_struct *vma; diff --git a/lib/checksum.c b/lib/checksum.c index b2e2fd46846..097508732f3 100644 --- a/lib/checksum.c +++ b/lib/checksum.c @@ -37,7 +37,8 @@ #include <asm/byteorder.h> -static inline unsigned short from32to16(unsigned long x) +#ifndef do_csum +static inline unsigned short from32to16(unsigned int x) { /* add up 16-bit and 16-bit for 16+c bit */ x = (x & 0xffff) + (x >> 16); @@ -49,16 +50,16 @@ static inline unsigned short from32to16(unsigned long x) static unsigned int do_csum(const unsigned char *buff, int len) { int odd, count; - unsigned long result = 0; + unsigned int result = 0; if (len <= 0) goto out; odd = 1 & (unsigned long) buff; if (odd) { #ifdef __LITTLE_ENDIAN - result = *buff; -#else result += (*buff << 8); +#else + result = *buff; #endif len--; buff++; @@ -73,9 +74,9 @@ static unsigned int do_csum(const unsigned char *buff, int len) } count >>= 1; /* nr of 32-bit words.. */ if (count) { - unsigned long carry = 0; + unsigned int carry = 0; do { - unsigned long w = *(unsigned int *) buff; + unsigned int w = *(unsigned int *) buff; count--; buff += 4; result += carry; @@ -102,6 +103,7 @@ static unsigned int do_csum(const unsigned char *buff, int len) out: return result; } +#endif /* * This is a version of ip_compute_csum() optimized for IP headers, diff --git a/mm/slab.c b/mm/slab.c index 7dfa481c96b..a6c9166996a 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -604,6 +604,26 @@ static struct kmem_cache cache_cache = { #define BAD_ALIEN_MAGIC 0x01020304ul +/* + * chicken and egg problem: delay the per-cpu array allocation + * until the general caches are up. + */ +static enum { + NONE, + PARTIAL_AC, + PARTIAL_L3, + EARLY, + FULL +} g_cpucache_up; + +/* + * used by boot code to determine if it can use slab based allocator + */ +int slab_is_available(void) +{ + return g_cpucache_up >= EARLY; +} + #ifdef CONFIG_LOCKDEP /* @@ -620,40 +640,52 @@ static struct kmem_cache cache_cache = { static struct lock_class_key on_slab_l3_key; static struct lock_class_key on_slab_alc_key; -static inline void init_lock_keys(void) - +static void init_node_lock_keys(int q) { - int q; struct cache_sizes *s = malloc_sizes; - while (s->cs_size != ULONG_MAX) { - for_each_node(q) { - struct array_cache **alc; - int r; - struct kmem_list3 *l3 = s->cs_cachep->nodelists[q]; - if (!l3 || OFF_SLAB(s->cs_cachep)) - continue; - lockdep_set_class(&l3->list_lock, &on_slab_l3_key); - alc = l3->alien; - /* - * FIXME: This check for BAD_ALIEN_MAGIC - * should go away when common slab code is taught to - * work even without alien caches. - * Currently, non NUMA code returns BAD_ALIEN_MAGIC - * for alloc_alien_cache, - */ - if (!alc || (unsigned long)alc == BAD_ALIEN_MAGIC) - continue; - for_each_node(r) { - if (alc[r]) - lockdep_set_class(&alc[r]->lock, - &on_slab_alc_key); - } + if (g_cpucache_up != FULL) + return; + + for (s = malloc_sizes; s->cs_size != ULONG_MAX; s++) { + struct array_cache **alc; + struct kmem_list3 *l3; + int r; + + l3 = s->cs_cachep->nodelists[q]; + if (!l3 || OFF_SLAB(s->cs_cachep)) + return; + lockdep_set_class(&l3->list_lock, &on_slab_l3_key); + alc = l3->alien; + /* + * FIXME: This check for BAD_ALIEN_MAGIC + * should go away when common slab code is taught to + * work even without alien caches. + * Currently, non NUMA code returns BAD_ALIEN_MAGIC + * for alloc_alien_cache, + */ + if (!alc || (unsigned long)alc == BAD_ALIEN_MAGIC) + return; + for_each_node(r) { + if (alc[r]) + lockdep_set_class(&alc[r]->lock, + &on_slab_alc_key); } - s++; } } + +static inline void init_lock_keys(void) +{ + int node; + + for_each_node(node) + init_node_lock_keys(node); +} #else +static void init_node_lock_keys(int q) +{ +} + static inline void init_lock_keys(void) { } @@ -665,26 +697,6 @@ static inline void init_lock_keys(void) static DEFINE_MUTEX(cache_chain_mutex); static struct list_head cache_chain; -/* - * chicken and egg problem: delay the per-cpu array allocation - * until the general caches are up. - */ -static enum { - NONE, - PARTIAL_AC, - PARTIAL_L3, - EARLY, - FULL -} g_cpucache_up; - -/* - * used by boot code to determine if it can use slab based allocator - */ -int slab_is_available(void) -{ - return g_cpucache_up >= EARLY; -} - static DEFINE_PER_CPU(struct delayed_work, reap_work); static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep) @@ -1254,6 +1266,8 @@ static int __cpuinit cpuup_prepare(long cpu) kfree(shared); free_alien_cache(alien); } + init_node_lock_keys(node); + return 0; bad: cpuup_canceled(cpu); @@ -3103,13 +3117,19 @@ static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags) } else { STATS_INC_ALLOCMISS(cachep); objp = cache_alloc_refill(cachep, flags); + /* + * the 'ac' may be updated by cache_alloc_refill(), + * and kmemleak_erase() requires its correct value. + */ + ac = cpu_cache_get(cachep); } /* * To avoid a false negative, if an object that is in one of the * per-CPU caches is leaked, we need to make sure kmemleak doesn't * treat the array pointers as a reference to the object. */ - kmemleak_erase(&ac->entry[ac->avail]); + if (objp) + kmemleak_erase(&ac->entry[ac->avail]); return objp; } @@ -3306,7 +3326,7 @@ __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, cache_alloc_debugcheck_before(cachep, flags); local_irq_save(save_flags); - if (unlikely(nodeid == -1)) + if (nodeid == -1) nodeid = numa_node_id(); if (unlikely(!cachep->nodelists[nodeid])) { diff --git a/mm/slub.c b/mm/slub.c index 4996fc71955..da0ce55965d 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1735,7 +1735,7 @@ static __always_inline void *slab_alloc(struct kmem_cache *s, } local_irq_restore(flags); - if (unlikely((gfpflags & __GFP_ZERO) && object)) + if (unlikely(gfpflags & __GFP_ZERO) && object) memset(object, 0, objsize); kmemcheck_slab_alloc(s, gfpflags, object, c->objsize); @@ -4371,12 +4371,28 @@ static int show_stat(struct kmem_cache *s, char *buf, enum stat_item si) return len + sprintf(buf + len, "\n"); } +static void clear_stat(struct kmem_cache *s, enum stat_item si) +{ + int cpu; + + for_each_online_cpu(cpu) + get_cpu_slab(s, cpu)->stat[si] = 0; +} + #define STAT_ATTR(si, text) \ static ssize_t text##_show(struct kmem_cache *s, char *buf) \ { \ return show_stat(s, buf, si); \ } \ -SLAB_ATTR_RO(text); \ +static ssize_t text##_store(struct kmem_cache *s, \ + const char *buf, size_t length) \ +{ \ + if (buf[0] != '0') \ + return -EINVAL; \ + clear_stat(s, si); \ + return length; \ +} \ +SLAB_ATTR(text); \ STAT_ATTR(ALLOC_FASTPATH, alloc_fastpath); STAT_ATTR(ALLOC_SLOWPATH, alloc_slowpath); diff --git a/samples/hw_breakpoint/data_breakpoint.c b/samples/hw_breakpoint/data_breakpoint.c index 29525500df0..c69cbe9b242 100644 --- a/samples/hw_breakpoint/data_breakpoint.c +++ b/samples/hw_breakpoint/data_breakpoint.c @@ -41,7 +41,9 @@ module_param_string(ksym, ksym_name, KSYM_NAME_LEN, S_IRUGO); MODULE_PARM_DESC(ksym, "Kernel symbol to monitor; this module will report any" " write operations on the kernel symbol"); -static void sample_hbp_handler(struct perf_event *temp, void *data) +static void sample_hbp_handler(struct perf_event *bp, int nmi, + struct perf_sample_data *data, + struct pt_regs *regs) { printk(KERN_INFO "%s value is changed\n", ksym_name); dump_stack(); @@ -51,8 +53,9 @@ static void sample_hbp_handler(struct perf_event *temp, void *data) static int __init hw_break_module_init(void) { int ret; - DEFINE_BREAKPOINT_ATTR(attr); + struct perf_event_attr attr; + hw_breakpoint_init(&attr); attr.bp_addr = kallsyms_lookup_name(ksym_name); attr.bp_len = HW_BREAKPOINT_LEN_4; attr.bp_type = HW_BREAKPOINT_W | HW_BREAKPOINT_R; diff --git a/tools/perf/Documentation/perf-kmem.txt b/tools/perf/Documentation/perf-kmem.txt index 44b0ce35c28..eac4d852e7c 100644 --- a/tools/perf/Documentation/perf-kmem.txt +++ b/tools/perf/Documentation/perf-kmem.txt @@ -8,16 +8,16 @@ perf-kmem - Tool to trace/measure kernel memory(slab) properties SYNOPSIS -------- [verse] -'perf kmem' {record} [<options>] +'perf kmem' {record|stat} [<options>] DESCRIPTION ----------- -There's two variants of perf kmem: +There are two variants of perf kmem: 'perf kmem record <command>' to record the kmem events of an arbitrary workload. - 'perf kmem' to report kernel memory statistics. + 'perf kmem stat' to report kernel memory statistics. OPTIONS ------- @@ -25,8 +25,11 @@ OPTIONS --input=<file>:: Select the input file (default: perf.data) ---stat=<caller|alloc>:: - Select per callsite or per allocation statistics +--caller:: + Show per-callsite statistics + +--alloc:: + Show per-allocation statistics -s <key[,key2...]>:: --sort=<key[,key2...]>:: diff --git a/tools/perf/Documentation/perf-probe.txt b/tools/perf/Documentation/perf-probe.txt index 9270594e6df..8fa6bf99fcb 100644 --- a/tools/perf/Documentation/perf-probe.txt +++ b/tools/perf/Documentation/perf-probe.txt @@ -8,10 +8,13 @@ perf-probe - Define new dynamic tracepoints SYNOPSIS -------- [verse] -'perf probe' [options] --add 'PROBE' [--add 'PROBE' ...] +'perf probe' [options] --add='PROBE' [...] or -'perf probe' [options] 'PROBE' ['PROBE' ...] - +'perf probe' [options] PROBE +or +'perf probe' [options] --del='[GROUP:]EVENT' [...] +or +'perf probe' --list DESCRIPTION ----------- @@ -31,8 +34,16 @@ OPTIONS Be more verbose (show parsed arguments, etc). -a:: ---add:: - Define a probe point (see PROBE SYNTAX for detail) +--add=:: + Define a probe event (see PROBE SYNTAX for detail). + +-d:: +--del=:: + Delete a probe event. + +-l:: +--list:: + List up current probe events. PROBE SYNTAX ------------ diff --git a/tools/perf/builtin-buildid-list.c b/tools/perf/builtin-buildid-list.c index 7dee9d19ab7..dcb6143a000 100644 --- a/tools/perf/builtin-buildid-list.c +++ b/tools/perf/builtin-buildid-list.c @@ -19,7 +19,7 @@ static char const *input_name = "perf.data"; static int force; static const char *const buildid_list_usage[] = { - "perf report [<options>]", + "perf buildid-list [<options>]", NULL }; diff --git a/tools/perf/builtin-kmem.c b/tools/perf/builtin-kmem.c index 047fef74bd5..5f209514f65 100644 --- a/tools/perf/builtin-kmem.c +++ b/tools/perf/builtin-kmem.c @@ -57,11 +57,6 @@ static struct rb_root root_caller_sorted; static unsigned long total_requested, total_allocated; static unsigned long nr_allocs, nr_cross_allocs; -struct raw_event_sample { - u32 size; - char data[0]; -}; - #define PATH_SYS_NODE "/sys/devices/system/node" static void init_cpunode_map(void) @@ -201,7 +196,7 @@ static void insert_caller_stat(unsigned long call_site, } } -static void process_alloc_event(struct raw_event_sample *raw, +static void process_alloc_event(void *data, struct event *event, int cpu, u64 timestamp __used, @@ -214,10 +209,10 @@ static void process_alloc_event(struct raw_event_sample *raw, int bytes_alloc; int node1, node2; - ptr = raw_field_value(event, "ptr", raw->data); - call_site = raw_field_value(event, "call_site", raw->data); - bytes_req = raw_field_value(event, "bytes_req", raw->data); - bytes_alloc = raw_field_value(event, "bytes_alloc", raw->data); + ptr = raw_field_value(event, "ptr", data); + call_site = raw_field_value(event, "call_site", data); + bytes_req = raw_field_value(event, "bytes_req", data); + bytes_alloc = raw_field_value(event, "bytes_alloc", data); insert_alloc_stat(call_site, ptr, bytes_req, bytes_alloc, cpu); insert_caller_stat(call_site, bytes_req, bytes_alloc); @@ -227,7 +222,7 @@ static void process_alloc_event(struct raw_event_sample *raw, if (node) { node1 = cpunode_map[cpu]; - node2 = raw_field_value(event, "node", raw->data); + node2 = raw_field_value(event, "node", data); if (node1 != node2) nr_cross_allocs++; } @@ -262,7 +257,7 @@ static struct alloc_stat *search_alloc_stat(unsigned long ptr, return NULL; } -static void process_free_event(struct raw_event_sample *raw, +static void process_free_event(void *data, struct event *event, int cpu, u64 timestamp __used, @@ -271,7 +266,7 @@ static void process_free_event(struct raw_event_sample *raw, unsigned long ptr; struct alloc_stat *s_alloc, *s_caller; - ptr = raw_field_value(event, "ptr", raw->data); + ptr = raw_field_value(event, "ptr", data); s_alloc = search_alloc_stat(ptr, 0, &root_alloc_stat, ptr_cmp); if (!s_alloc) @@ -289,66 +284,53 @@ static void process_free_event(struct raw_event_sample *raw, } static void -process_raw_event(event_t *raw_event __used, void *more_data, +process_raw_event(event_t *raw_event __used, void *data, int cpu, u64 timestamp, struct thread *thread) { - struct raw_event_sample *raw = more_data; struct event *event; int type; - type = trace_parse_common_type(raw->data); + type = trace_parse_common_type(data); event = trace_find_event(type); if (!strcmp(event->name, "kmalloc") || !strcmp(event->name, "kmem_cache_alloc")) { - process_alloc_event(raw, event, cpu, timestamp, thread, 0); + process_alloc_event(data, event, cpu, timestamp, thread, 0); return; } if (!strcmp(event->name, "kmalloc_node") || !strcmp(event->name, "kmem_cache_alloc_node")) { - process_alloc_event(raw, event, cpu, timestamp, thread, 1); + process_alloc_event(data, event, cpu, timestamp, thread, 1); return; } if (!strcmp(event->name, "kfree") || !strcmp(event->name, "kmem_cache_free")) { - process_free_event(raw, event, cpu, timestamp, thread); + process_free_event(data, event, cpu, timestamp, thread); return; } } static int process_sample_event(event_t *event) { - u64 ip = event->ip.ip; - u64 timestamp = -1; - u32 cpu = -1; - u64 period = 1; - void *more_data = event->ip.__more_data; - struct thread *thread = threads__findnew(event->ip.pid); + struct sample_data data; + struct thread *thread; - if (sample_type & PERF_SAMPLE_TIME) { - timestamp = *(u64 *)more_data; - more_data += sizeof(u64); - } - - if (sample_type & PERF_SAMPLE_CPU) { - cpu = *(u32 *)more_data; - more_data += sizeof(u32); - more_data += sizeof(u32); /* reserved */ - } + memset(&data, 0, sizeof(data)); + data.time = -1; + data.cpu = -1; + data.period = 1; - if (sample_type & PERF_SAMPLE_PERIOD) { - period = *(u64 *)more_data; - more_data += sizeof(u64); - } + event__parse_sample(event, sample_type, &data); dump_printf("(IP, %d): %d/%d: %p period: %Ld\n", event->header.misc, - event->ip.pid, event->ip.tid, - (void *)(long)ip, - (long long)period); + data.pid, data.tid, + (void *)(long)data.ip, + (long long)data.period); + thread = threads__findnew(event->ip.pid); if (thread == NULL) { pr_debug("problem processing %d event, skipping it.\n", event->header.type); @@ -357,7 +339,8 @@ static int process_sample_event(event_t *event) dump_printf(" ... thread: %s:%d\n", thread->comm, thread->pid); - process_raw_event(event, more_data, cpu, timestamp, thread); + process_raw_event(event, data.raw_data, data.cpu, + data.time, thread); return 0; } @@ -543,7 +526,7 @@ static int __cmd_kmem(void) } static const char * const kmem_usage[] = { - "perf kmem [<options>] {record}", + "perf kmem [<options>] {record|stat}", NULL }; @@ -703,18 +686,17 @@ static int parse_sort_opt(const struct option *opt __used, return 0; } -static int parse_stat_opt(const struct option *opt __used, - const char *arg, int unset __used) +static int parse_caller_opt(const struct option *opt __used, + const char *arg __used, int unset __used) { - if (!arg) - return -1; + caller_flag = (alloc_flag + 1); + return 0; +} - if (strcmp(arg, "alloc") == 0) - alloc_flag = (caller_flag + 1); - else if (strcmp(arg, "caller") == 0) - caller_flag = (alloc_flag + 1); - else - return -1; +static int parse_alloc_opt(const struct option *opt __used, + const char *arg __used, int unset __used) +{ + alloc_flag = (caller_flag + 1); return 0; } @@ -739,14 +721,17 @@ static int parse_line_opt(const struct option *opt __used, static const struct option kmem_options[] = { OPT_STRING('i', "input", &input_name, "file", "input file name"), - OPT_CALLBACK(0, "stat", NULL, "<alloc>|<caller>", - "stat selector, Pass 'alloc' or 'caller'.", - parse_stat_opt), + OPT_CALLBACK_NOOPT(0, "caller", NULL, NULL, + "show per-callsite statistics", + parse_caller_opt), + OPT_CALLBACK_NOOPT(0, "alloc", NULL, NULL, + "show per-allocation statistics", + parse_alloc_opt), OPT_CALLBACK('s', "sort", NULL, "key[,key2...]", "sort by keys: ptr, call_site, bytes, hit, pingpong, frag", parse_sort_opt), OPT_CALLBACK('l', "line", NULL, "num", - "show n lins", + "show n lines", parse_line_opt), OPT_BOOLEAN(0, "raw-ip", &raw_ip, "show raw ip instead of symbol"), OPT_END() @@ -790,18 +775,22 @@ int cmd_kmem(int argc, const char **argv, const char *prefix __used) argc = parse_options(argc, argv, kmem_options, kmem_usage, 0); - if (argc && !strncmp(argv[0], "rec", 3)) - return __cmd_record(argc, argv); - else if (argc) + if (!argc) usage_with_options(kmem_usage, kmem_options); - if (list_empty(&caller_sort)) - setup_sorting(&caller_sort, default_sort_order); - if (list_empty(&alloc_sort)) - setup_sorting(&alloc_sort, default_sort_order); + if (!strncmp(argv[0], "rec", 3)) { + return __cmd_record(argc, argv); + } else if (!strcmp(argv[0], "stat")) { + setup_cpunode_map(); + + if (list_empty(&caller_sort)) + setup_sorting(&caller_sort, default_sort_order); + if (list_empty(&alloc_sort)) + setup_sorting(&alloc_sort, default_sort_order); - setup_cpunode_map(); + return __cmd_kmem(); + } - return __cmd_kmem(); + return 0; } diff --git a/tools/perf/builtin-probe.c b/tools/perf/builtin-probe.c index a58e11b7ea8..5a47c1e11f7 100644 --- a/tools/perf/builtin-probe.c +++ b/tools/perf/builtin-probe.c @@ -35,6 +35,7 @@ #include "perf.h" #include "builtin.h" #include "util/util.h" +#include "util/strlist.h" #include "util/event.h" #include "util/debug.h" #include "util/parse-options.h" @@ -43,11 +44,12 @@ #include "util/probe-event.h" /* Default vmlinux search paths */ -#define NR_SEARCH_PATH 3 +#define NR_SEARCH_PATH 4 const char *default_search_path[NR_SEARCH_PATH] = { "/lib/modules/%s/build/vmlinux", /* Custom build kernel */ "/usr/lib/debug/lib/modules/%s/vmlinux", /* Red Hat debuginfo */ "/boot/vmlinux-debug-%s", /* Ubuntu */ +"./vmlinux", /* CWD */ }; #define MAX_PATH_LEN 256 @@ -60,6 +62,7 @@ static struct { int need_dwarf; int nr_probe; struct probe_point probes[MAX_PROBES]; + struct strlist *dellist; } session; static bool listing; @@ -79,6 +82,25 @@ static void parse_probe_event(const char *str) pr_debug("%d arguments\n", pp->nr_args); } +static void parse_probe_event_argv(int argc, const char **argv) +{ + int i, len; + char *buf; + + /* Bind up rest arguments */ + len = 0; + for (i = 0; i < argc; i++) + len += strlen(argv[i]) + 1; + buf = zalloc(len + 1); + if (!buf) + die("Failed to allocate memory for binding arguments."); + len = 0; + for (i = 0; i < argc; i++) + len += sprintf(&buf[len], "%s ", argv[i]); + parse_probe_event(buf); + free(buf); +} + static int opt_add_probe_event(const struct option *opt __used, const char *str, int unset __used) { @@ -87,6 +109,17 @@ static int opt_add_probe_event(const struct option *opt __used, return 0; } +static int opt_del_probe_event(const struct option *opt __used, + const char *str, int unset __used) +{ + if (str) { + if (!session.dellist) + session.dellist = strlist__new(true, NULL); + strlist__add(session.dellist, str); + } + return 0; +} + #ifndef NO_LIBDWARF static int open_default_vmlinux(void) { @@ -121,6 +154,7 @@ static int open_default_vmlinux(void) static const char * const probe_usage[] = { "perf probe [<options>] 'PROBEDEF' ['PROBEDEF' ...]", "perf probe [<options>] --add 'PROBEDEF' [--add 'PROBEDEF' ...]", + "perf probe [<options>] --del '[GROUP:]EVENT' ...", "perf probe --list", NULL }; @@ -132,7 +166,9 @@ static const struct option options[] = { OPT_STRING('k', "vmlinux", &session.vmlinux, "file", "vmlinux/module pathname"), #endif - OPT_BOOLEAN('l', "list", &listing, "list up current probes"), + OPT_BOOLEAN('l', "list", &listing, "list up current probe events"), + OPT_CALLBACK('d', "del", NULL, "[GROUP:]EVENT", "delete a probe event.", + opt_del_probe_event), OPT_CALLBACK('a', "add", NULL, #ifdef NO_LIBDWARF "FUNC[+OFFS|%return] [ARG ...]", @@ -160,7 +196,7 @@ static const struct option options[] = { int cmd_probe(int argc, const char **argv, const char *prefix __used) { - int i, j, ret; + int i, ret; #ifndef NO_LIBDWARF int fd; #endif @@ -168,40 +204,52 @@ int cmd_probe(int argc, const char **argv, const char *prefix __used) argc = parse_options(argc, argv, options, probe_usage, PARSE_OPT_STOP_AT_NON_OPTION); - for (i = 0; i < argc; i++) - parse_probe_event(argv[i]); + if (argc > 0) + parse_probe_event_argv(argc, argv); - if ((session.nr_probe == 0 && !listing) || - (session.nr_probe != 0 && listing)) + if ((session.nr_probe == 0 && !session.dellist && !listing)) usage_with_options(probe_usage, options); if (listing) { + if (session.nr_probe != 0 || session.dellist) { + pr_warning(" Error: Don't use --list with" + " --add/--del.\n"); + usage_with_options(probe_usage, options); + } show_perf_probe_events(); return 0; } + if (session.dellist) { + del_trace_kprobe_events(session.dellist); + strlist__delete(session.dellist); + if (session.nr_probe == 0) + return 0; + } + if (session.need_dwarf) #ifdef NO_LIBDWARF die("Debuginfo-analysis is not supported"); #else /* !NO_LIBDWARF */ pr_debug("Some probes require debuginfo.\n"); - if (session.vmlinux) + if (session.vmlinux) { + pr_debug("Try to open %s.", session.vmlinux); fd = open(session.vmlinux, O_RDONLY); - else + } else fd = open_default_vmlinux(); if (fd < 0) { if (session.need_dwarf) - die("Could not open vmlinux/module file."); + die("Could not open debuginfo file."); - pr_warning("Could not open vmlinux/module file." - " Try to use symbols.\n"); + pr_debug("Could not open vmlinux/module file." + " Try to use symbols.\n"); goto end_dwarf; } /* Searching probe points */ - for (j = 0; j < session.nr_probe; j++) { - pp = &session.probes[j]; + for (i = 0; i < session.nr_probe; i++) { + pp = &session.probes[i]; if (pp->found) continue; @@ -223,8 +271,8 @@ end_dwarf: #endif /* !NO_LIBDWARF */ /* Synthesize probes without dwarf */ - for (j = 0; j < session.nr_probe; j++) { - pp = &session.probes[j]; + for (i = 0; i < session.nr_probe; i++) { + pp = &session.probes[i]; if (pp->found) /* This probe is already found. */ continue; diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index 383c4ab4f9a..2b9eb3a553e 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -605,44 +605,41 @@ static int validate_chain(struct ip_callchain *chain, event_t *event) static int process_sample_event(event_t *event) { - u64 ip = event->ip.ip; - u64 period = 1; - void *more_data = event->ip.__more_data; - struct ip_callchain *chain = NULL; + struct sample_data data; int cpumode; struct addr_location al; - struct thread *thread = threads__findnew(event->ip.pid); + struct thread *thread; - if (sample_type & PERF_SAMPLE_PERIOD) { - period = *(u64 *)more_data; - more_data += sizeof(u64); - } + memset(&data, 0, sizeof(data)); + data.period = 1; + + event__parse_sample(event, sample_type, &data); dump_printf("(IP, %d): %d/%d: %p period: %Ld\n", event->header.misc, - event->ip.pid, event->ip.tid, - (void *)(long)ip, - (long long)period); + data.pid, data.tid, + (void *)(long)data.ip, + (long long)data.period); if (sample_type & PERF_SAMPLE_CALLCHAIN) { unsigned int i; - chain = (void *)more_data; - - dump_printf("... chain: nr:%Lu\n", chain->nr); + dump_printf("... chain: nr:%Lu\n", data.callchain->nr); - if (validate_chain(chain, event) < 0) { + if (validate_chain(data.callchain, event) < 0) { pr_debug("call-chain problem with event, " "skipping it.\n"); return 0; } if (dump_trace) { - for (i = 0; i < chain->nr; i++) - dump_printf("..... %2d: %016Lx\n", i, chain->ips[i]); + for (i = 0; i < data.callchain->nr; i++) + dump_printf("..... %2d: %016Lx\n", + i, data.callchain->ips[i]); } } + thread = threads__findnew(data.pid); if (thread == NULL) { pr_debug("problem processing %d event, skipping it.\n", event->header.type); @@ -657,7 +654,7 @@ static int process_sample_event(event_t *event) cpumode = event->header.misc & PERF_RECORD_MISC_CPUMODE_MASK; thread__find_addr_location(thread, cpumode, - MAP__FUNCTION, ip, &al, NULL); + MAP__FUNCTION, data.ip, &al, NULL); /* * We have to do this here as we may have a dso with no symbol hit that * has a name longer than the ones with symbols sampled. @@ -675,12 +672,12 @@ static int process_sample_event(event_t *event) if (sym_list && al.sym && !strlist__has_entry(sym_list, al.sym->name)) return 0; - if (hist_entry__add(&al, chain, period)) { + if (hist_entry__add(&al, data.callchain, data.period)) { pr_debug("problem incrementing symbol count, skipping event\n"); return -1; } - event__stats.total += period; + event__stats.total += data.period; return 0; } diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c index 26b782f26ee..7cca7c15b40 100644 --- a/tools/perf/builtin-sched.c +++ b/tools/perf/builtin-sched.c @@ -13,7 +13,6 @@ #include "util/debug.h" #include "util/data_map.h" -#include <sys/types.h> #include <sys/prctl.h> #include <semaphore.h> @@ -141,6 +140,7 @@ struct work_atoms { struct thread *thread; struct rb_node node; u64 max_lat; + u64 max_lat_at; u64 total_lat; u64 nb_atoms; u64 total_runtime; @@ -414,34 +414,33 @@ static u64 get_cpu_usage_nsec_parent(void) return sum; } -static u64 get_cpu_usage_nsec_self(void) +static int self_open_counters(void) { - char filename [] = "/proc/1234567890/sched"; - unsigned long msecs, nsecs; - char *line = NULL; - u64 total = 0; - size_t len = 0; - ssize_t chars; - FILE *file; - int ret; + struct perf_event_attr attr; + int fd; - sprintf(filename, "/proc/%d/sched", getpid()); - file = fopen(filename, "r"); - BUG_ON(!file); + memset(&attr, 0, sizeof(attr)); - while ((chars = getline(&line, &len, file)) != -1) { - ret = sscanf(line, "se.sum_exec_runtime : %ld.%06ld\n", - &msecs, &nsecs); - if (ret == 2) { - total = msecs*1e6 + nsecs; - break; - } - } - if (line) - free(line); - fclose(file); + attr.type = PERF_TYPE_SOFTWARE; + attr.config = PERF_COUNT_SW_TASK_CLOCK; + + fd = sys_perf_event_open(&attr, 0, -1, -1, 0); - return total; + if (fd < 0) + die("Error: sys_perf_event_open() syscall returned" + "with %d (%s)\n", fd, strerror(errno)); + return fd; +} + +static u64 get_cpu_usage_nsec_self(int fd) +{ + u64 runtime; + int ret; + + ret = read(fd, &runtime, sizeof(runtime)); + BUG_ON(ret != sizeof(runtime)); + + return runtime; } static void *thread_func(void *ctx) @@ -450,9 +449,11 @@ static void *thread_func(void *ctx) u64 cpu_usage_0, cpu_usage_1; unsigned long i, ret; char comm2[22]; + int fd; sprintf(comm2, ":%s", this_task->comm); prctl(PR_SET_NAME, comm2); + fd = self_open_counters(); again: ret = sem_post(&this_task->ready_for_work); @@ -462,16 +463,15 @@ again: ret = pthread_mutex_unlock(&start_work_mutex); BUG_ON(ret); - cpu_usage_0 = get_cpu_usage_nsec_self(); + cpu_usage_0 = get_cpu_usage_nsec_self(fd); for (i = 0; i < this_task->nr_events; i++) { this_task->curr_event = i; process_sched_event(this_task, this_task->atoms[i]); } - cpu_usage_1 = get_cpu_usage_nsec_self(); + cpu_usage_1 = get_cpu_usage_nsec_self(fd); this_task->cpu_usage = cpu_usage_1 - cpu_usage_0; - ret = sem_post(&this_task->work_done_sem); BUG_ON(ret); @@ -628,11 +628,6 @@ static void test_calibrations(void) printf("the sleep test took %Ld nsecs\n", T1-T0); } -struct raw_event_sample { - u32 size; - char data[0]; -}; - #define FILL_FIELD(ptr, field, event, data) \ ptr.field = (typeof(ptr.field)) raw_field_value(event, #field, data) @@ -1019,8 +1014,10 @@ add_sched_in_event(struct work_atoms *atoms, u64 timestamp) delta = atom->sched_in_time - atom->wake_up_time; atoms->total_lat += delta; - if (delta > atoms->max_lat) + if (delta > atoms->max_lat) { atoms->max_lat = delta; + atoms->max_lat_at = timestamp; + } atoms->nb_atoms++; } @@ -1216,10 +1213,11 @@ static void output_lat_thread(struct work_atoms *work_list) avg = work_list->total_lat / work_list->nb_atoms; - printf("|%11.3f ms |%9llu | avg:%9.3f ms | max:%9.3f ms |\n", + printf("|%11.3f ms |%9llu | avg:%9.3f ms | max:%9.3f ms | max at: %9.6f s\n", (double)work_list->total_runtime / 1e6, work_list->nb_atoms, (double)avg / 1e6, - (double)work_list->max_lat / 1e6); + (double)work_list->max_lat / 1e6, + (double)work_list->max_lat_at / 1e9); } static int pid_cmp(struct work_atoms *l, struct work_atoms *r) @@ -1356,7 +1354,7 @@ static void sort_lat(void) static struct trace_sched_handler *trace_handler; static void -process_sched_wakeup_event(struct raw_event_sample *raw, +process_sched_wakeup_event(void *data, struct event *event, int cpu __used, u64 timestamp __used, @@ -1364,13 +1362,13 @@ process_sched_wakeup_event(struct raw_event_sample *raw, { struct trace_wakeup_event wakeup_event; - FILL_COMMON_FIELDS(wakeup_event, event, raw->data); + FILL_COMMON_FIELDS(wakeup_event, event, data); - FILL_ARRAY(wakeup_event, comm, event, raw->data); - FILL_FIELD(wakeup_event, pid, event, raw->data); - FILL_FIELD(wakeup_event, prio, event, raw->data); - FILL_FIELD(wakeup_event, success, event, raw->data); - FILL_FIELD(wakeup_event, cpu, event, raw->data); + FILL_ARRAY(wakeup_event, comm, event, data); + FILL_FIELD(wakeup_event, pid, event, data); + FILL_FIELD(wakeup_event, prio, event, data); + FILL_FIELD(wakeup_event, success, event, data); + FILL_FIELD(wakeup_event, cpu, event, data); if (trace_handler->wakeup_event) trace_handler->wakeup_event(&wakeup_event, event, cpu, timestamp, thread); @@ -1469,7 +1467,7 @@ map_switch_event(struct trace_switch_event *switch_event, static void -process_sched_switch_event(struct raw_event_sample *raw, +process_sched_switch_event(void *data, struct event *event, int this_cpu, u64 timestamp __used, @@ -1477,15 +1475,15 @@ process_sched_switch_event(struct raw_event_sample *raw, { struct trace_switch_event switch_event; - FILL_COMMON_FIELDS(switch_event, event, raw->data); + FILL_COMMON_FIELDS(switch_event, event, data); - FILL_ARRAY(switch_event, prev_comm, event, raw->data); - FILL_FIELD(switch_event, prev_pid, event, raw->data); - FILL_FIELD(switch_event, prev_prio, event, raw->data); - FILL_FIELD(switch_event, prev_state, event, raw->data); - FILL_ARRAY(switch_event, next_comm, event, raw->data); - FILL_FIELD(switch_event, next_pid, event, raw->data); - FILL_FIELD(switch_event, next_prio, event, raw->data); + FILL_ARRAY(switch_event, prev_comm, event, data); + FILL_FIELD(switch_event, prev_pid, event, data); + FILL_FIELD(switch_event, prev_prio, event, data); + FILL_FIELD(switch_event, prev_state, event, data); + FILL_ARRAY(switch_event, next_comm, event, data); + FILL_FIELD(switch_event, next_pid, event, data); + FILL_FIELD(switch_event, next_prio, event, data); if (curr_pid[this_cpu] != (u32)-1) { /* @@ -1502,7 +1500,7 @@ process_sched_switch_event(struct raw_event_sample *raw, } static void -process_sched_runtime_event(struct raw_event_sample *raw, +process_sched_runtime_event(void *data, struct event *event, int cpu __used, u64 timestamp __used, @@ -1510,17 +1508,17 @@ process_sched_runtime_event(struct raw_event_sample *raw, { struct trace_runtime_event runtime_event; - FILL_ARRAY(runtime_event, comm, event, raw->data); - FILL_FIELD(runtime_event, pid, event, raw->data); - FILL_FIELD(runtime_event, runtime, event, raw->data); - FILL_FIELD(runtime_event, vruntime, event, raw->data); + FILL_ARRAY(runtime_event, comm, event, data); + FILL_FIELD(runtime_event, pid, event, data); + FILL_FIELD(runtime_event, runtime, event, data); + FILL_FIELD(runtime_event, vruntime, event, data); if (trace_handler->runtime_event) trace_handler->runtime_event(&runtime_event, event, cpu, timestamp, thread); } static void -process_sched_fork_event(struct raw_event_sample *raw, +process_sched_fork_event(void *data, struct event *event, int cpu __used, u64 timestamp __used, @@ -1528,12 +1526,12 @@ process_sched_fork_event(struct raw_event_sample *raw, { struct trace_fork_event fork_event; - FILL_COMMON_FIELDS(fork_event, event, raw->data); + FILL_COMMON_FIELDS(fork_event, event, data); - FILL_ARRAY(fork_event, parent_comm, event, raw->data); - FILL_FIELD(fork_event, parent_pid, event, raw->data); - FILL_ARRAY(fork_event, child_comm, event, raw->data); - FILL_FIELD(fork_event, child_pid, event, raw->data); + FILL_ARRAY(fork_event, parent_comm, event, data); + FILL_FIELD(fork_event, parent_pid, event, data); + FILL_ARRAY(fork_event, child_comm, event, data); + FILL_FIELD(fork_event, child_pid, event, data); if (trace_handler->fork_event) trace_handler->fork_event(&fork_event, event, cpu, timestamp, thread); @@ -1550,7 +1548,7 @@ process_sched_exit_event(struct event *event, } static void -process_sched_migrate_task_event(struct raw_event_sample *raw, +process_sched_migrate_task_event(void *data, struct event *event, int cpu __used, u64 timestamp __used, @@ -1558,80 +1556,66 @@ process_sched_migrate_task_event(struct raw_event_sample *raw, { struct trace_migrate_task_event migrate_task_event; - FILL_COMMON_FIELDS(migrate_task_event, event, raw->data); + FILL_COMMON_FIELDS(migrate_task_event, event, data); - FILL_ARRAY(migrate_task_event, comm, event, raw->data); - FILL_FIELD(migrate_task_event, pid, event, raw->data); - FILL_FIELD(migrate_task_event, prio, event, raw->data); - FILL_FIELD(migrate_task_event, cpu, event, raw->data); + FILL_ARRAY(migrate_task_event, comm, event, data); + FILL_FIELD(migrate_task_event, pid, event, data); + FILL_FIELD(migrate_task_event, prio, event, data); + FILL_FIELD(migrate_task_event, cpu, event, data); if (trace_handler->migrate_task_event) trace_handler->migrate_task_event(&migrate_task_event, event, cpu, timestamp, thread); } static void -process_raw_event(event_t *raw_event __used, void *more_data, +process_raw_event(event_t *raw_event __used, void *data, int cpu, u64 timestamp, struct thread *thread) { - struct raw_event_sample *raw = more_data; struct event *event; int type; - type = trace_parse_common_type(raw->data); + + type = trace_parse_common_type(data); event = trace_find_event(type); if (!strcmp(event->name, "sched_switch")) - process_sched_switch_event(raw, event, cpu, timestamp, thread); + process_sched_switch_event(data, event, cpu, timestamp, thread); if (!strcmp(event->name, "sched_stat_runtime")) - process_sched_runtime_event(raw, event, cpu, timestamp, thread); + process_sched_runtime_event(data, event, cpu, timestamp, thread); if (!strcmp(event->name, "sched_wakeup")) - process_sched_wakeup_event(raw, event, cpu, timestamp, thread); + process_sched_wakeup_event(data, event, cpu, timestamp, thread); if (!strcmp(event->name, "sched_wakeup_new")) - process_sched_wakeup_event(raw, event, cpu, timestamp, thread); + process_sched_wakeup_event(data, event, cpu, timestamp, thread); if (!strcmp(event->name, "sched_process_fork")) - process_sched_fork_event(raw, event, cpu, timestamp, thread); + process_sched_fork_event(data, event, cpu, timestamp, thread); if (!strcmp(event->name, "sched_process_exit")) process_sched_exit_event(event, cpu, timestamp, thread); if (!strcmp(event->name, "sched_migrate_task")) - process_sched_migrate_task_event(raw, event, cpu, timestamp, thread); + process_sched_migrate_task_event(data, event, cpu, timestamp, thread); } static int process_sample_event(event_t *event) { + struct sample_data data; struct thread *thread; - u64 ip = event->ip.ip; - u64 timestamp = -1; - u32 cpu = -1; - u64 period = 1; - void *more_data = event->ip.__more_data; if (!(sample_type & PERF_SAMPLE_RAW)) return 0; - thread = threads__findnew(event->ip.pid); + memset(&data, 0, sizeof(data)); + data.time = -1; + data.cpu = -1; + data.period = -1; - if (sample_type & PERF_SAMPLE_TIME) { - timestamp = *(u64 *)more_data; - more_data += sizeof(u64); - } - - if (sample_type & PERF_SAMPLE_CPU) { - cpu = *(u32 *)more_data; - more_data += sizeof(u32); - more_data += sizeof(u32); /* reserved */ - } - - if (sample_type & PERF_SAMPLE_PERIOD) { - period = *(u64 *)more_data; - more_data += sizeof(u64); - } + event__parse_sample(event, sample_type, &data); dump_printf("(IP, %d): %d/%d: %p period: %Ld\n", event->header.misc, - event->ip.pid, event->ip.tid, - (void *)(long)ip, - (long long)period); + data.pid, data.tid, + (void *)(long)data.ip, + (long long)data.period); + thread = threads__findnew(data.pid); if (thread == NULL) { pr_debug("problem processing %d event, skipping it.\n", event->header.type); @@ -1640,10 +1624,10 @@ static int process_sample_event(event_t *event) dump_printf(" ... thread: %s:%d\n", thread->comm, thread->pid); - if (profile_cpu != -1 && profile_cpu != (int) cpu) + if (profile_cpu != -1 && profile_cpu != (int)data.cpu) return 0; - process_raw_event(event, more_data, cpu, timestamp, thread); + process_raw_event(event, data.raw_data, data.cpu, data.time, thread); return 0; } @@ -1724,9 +1708,9 @@ static void __cmd_lat(void) read_events(); sort_lat(); - printf("\n -----------------------------------------------------------------------------------------\n"); - printf(" Task | Runtime ms | Switches | Average delay ms | Maximum delay ms |\n"); - printf(" -----------------------------------------------------------------------------------------\n"); + printf("\n ---------------------------------------------------------------------------------------------------------------\n"); + printf(" Task | Runtime ms | Switches | Average delay ms | Maximum delay ms | Maximum delay at |\n"); + printf(" ---------------------------------------------------------------------------------------------------------------\n"); next = rb_first(&sorted_atom_root); @@ -1902,13 +1886,18 @@ static int __cmd_record(int argc, const char **argv) int cmd_sched(int argc, const char **argv, const char *prefix __used) { - symbol__init(0); - argc = parse_options(argc, argv, sched_options, sched_usage, PARSE_OPT_STOP_AT_NON_OPTION); if (!argc) usage_with_options(sched_usage, sched_options); + /* + * Aliased to 'perf trace' for now: + */ + if (!strcmp(argv[0], "trace")) + return cmd_trace(argc, argv, prefix); + + symbol__init(0); if (!strncmp(argv[0], "rec", 3)) { return __cmd_record(argc, argv); } else if (!strncmp(argv[0], "lat", 3)) { @@ -1932,11 +1921,6 @@ int cmd_sched(int argc, const char **argv, const char *prefix __used) usage_with_options(replay_usage, replay_options); } __cmd_replay(); - } else if (!strcmp(argv[0], "trace")) { - /* - * Aliased to 'perf trace' for now: - */ - return cmd_trace(argc, argv, prefix); } else { usage_with_options(sched_usage, sched_options); } diff --git a/tools/perf/builtin-timechart.c b/tools/perf/builtin-timechart.c index cb58b6605fc..f472df9561e 100644 --- a/tools/perf/builtin-timechart.c +++ b/tools/perf/builtin-timechart.c @@ -302,12 +302,11 @@ process_exit_event(event_t *event) } struct trace_entry { - u32 size; unsigned short type; unsigned char flags; unsigned char preempt_count; int pid; - int tgid; + int lock_depth; }; struct power_entry { @@ -484,43 +483,22 @@ static void sched_switch(int cpu, u64 timestamp, struct trace_entry *te) static int process_sample_event(event_t *event) { - int cursor = 0; - u64 addr = 0; - u64 stamp = 0; - u32 cpu = 0; - u32 pid = 0; + struct sample_data data; struct trace_entry *te; - if (sample_type & PERF_SAMPLE_IP) - cursor++; - - if (sample_type & PERF_SAMPLE_TID) { - pid = event->sample.array[cursor]>>32; - cursor++; - } - if (sample_type & PERF_SAMPLE_TIME) { - stamp = event->sample.array[cursor++]; + memset(&data, 0, sizeof(data)); - if (!first_time || first_time > stamp) - first_time = stamp; - if (last_time < stamp) - last_time = stamp; + event__parse_sample(event, sample_type, &data); + if (sample_type & PERF_SAMPLE_TIME) { + if (!first_time || first_time > data.time) + first_time = data.time; + if (last_time < data.time) + last_time = data.time; } - if (sample_type & PERF_SAMPLE_ADDR) - addr = event->sample.array[cursor++]; - if (sample_type & PERF_SAMPLE_ID) - cursor++; - if (sample_type & PERF_SAMPLE_STREAM_ID) - cursor++; - if (sample_type & PERF_SAMPLE_CPU) - cpu = event->sample.array[cursor++] & 0xFFFFFFFF; - if (sample_type & PERF_SAMPLE_PERIOD) - cursor++; - - te = (void *)&event->sample.array[cursor]; - if (sample_type & PERF_SAMPLE_RAW && te->size > 0) { + te = (void *)data.raw_data; + if (sample_type & PERF_SAMPLE_RAW && data.raw_size > 0) { char *event_str; struct power_entry *pe; @@ -532,19 +510,19 @@ process_sample_event(event_t *event) return 0; if (strcmp(event_str, "power:power_start") == 0) - c_state_start(cpu, stamp, pe->value); + c_state_start(data.cpu, data.time, pe->value); if (strcmp(event_str, "power:power_end") == 0) - c_state_end(cpu, stamp); + c_state_end(data.cpu, data.time); if (strcmp(event_str, "power:power_frequency") == 0) - p_state_change(cpu, stamp, pe->value); + p_state_change(data.cpu, data.time, pe->value); if (strcmp(event_str, "sched:sched_wakeup") == 0) - sched_wakeup(cpu, stamp, pid, te); + sched_wakeup(data.cpu, data.time, data.pid, te); if (strcmp(event_str, "sched:sched_switch") == 0) - sched_switch(cpu, stamp, te); + sched_switch(data.cpu, data.time, te); } return 0; } diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index abb914aa7be..c2fcc34486f 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -66,58 +66,40 @@ static u64 sample_type; static int process_sample_event(event_t *event) { - u64 ip = event->ip.ip; - u64 timestamp = -1; - u32 cpu = -1; - u64 period = 1; - void *more_data = event->ip.__more_data; - struct thread *thread = threads__findnew(event->ip.pid); - - if (sample_type & PERF_SAMPLE_TIME) { - timestamp = *(u64 *)more_data; - more_data += sizeof(u64); - } + struct sample_data data; + struct thread *thread; - if (sample_type & PERF_SAMPLE_CPU) { - cpu = *(u32 *)more_data; - more_data += sizeof(u32); - more_data += sizeof(u32); /* reserved */ - } + memset(&data, 0, sizeof(data)); + data.time = -1; + data.cpu = -1; + data.period = 1; - if (sample_type & PERF_SAMPLE_PERIOD) { - period = *(u64 *)more_data; - more_data += sizeof(u64); - } + event__parse_sample(event, sample_type, &data); dump_printf("(IP, %d): %d/%d: %p period: %Ld\n", event->header.misc, - event->ip.pid, event->ip.tid, - (void *)(long)ip, - (long long)period); + data.pid, data.tid, + (void *)(long)data.ip, + (long long)data.period); + thread = threads__findnew(event->ip.pid); if (thread == NULL) { pr_debug("problem processing %d event, skipping it.\n", event->header.type); return -1; } - dump_printf(" ... thread: %s:%d\n", thread->comm, thread->pid); - if (sample_type & PERF_SAMPLE_RAW) { - struct { - u32 size; - char data[0]; - } *raw = more_data; - /* * FIXME: better resolve from pid from the struct trace_entry * field, although it should be the same than this perf * event pid */ - scripting_ops->process_event(cpu, raw->data, raw->size, - timestamp, thread->comm); + scripting_ops->process_event(data.cpu, data.raw_data, + data.raw_size, + data.time, thread->comm); } - event__stats.total += period; + event__stats.total += data.period; return 0; } diff --git a/tools/perf/util/data_map.c b/tools/perf/util/data_map.c index ca0bedf637c..59b65d0bd7c 100644 --- a/tools/perf/util/data_map.c +++ b/tools/perf/util/data_map.c @@ -100,11 +100,11 @@ process_event(event_t *event, unsigned long offset, unsigned long head) } } -int perf_header__read_build_ids(int input, off_t offset, off_t size) +int perf_header__read_build_ids(int input, u64 offset, u64 size) { struct build_id_event bev; char filename[PATH_MAX]; - off_t limit = offset + size; + u64 limit = offset + size; int err = -1; while (offset < limit) { diff --git a/tools/perf/util/data_map.h b/tools/perf/util/data_map.h index 3180ff7e363..258a87bcc4f 100644 --- a/tools/perf/util/data_map.h +++ b/tools/perf/util/data_map.h @@ -27,6 +27,6 @@ int mmap_dispatch_perf_file(struct perf_header **pheader, int full_paths, int *cwdlen, char **cwd); -int perf_header__read_build_ids(int input, off_t offset, off_t file_size); +int perf_header__read_build_ids(int input, u64 offset, u64 file_size); #endif diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c index 414b89d1bde..4dcecafa85d 100644 --- a/tools/perf/util/event.c +++ b/tools/perf/util/event.c @@ -310,3 +310,70 @@ int event__preprocess_sample(const event_t *self, struct addr_location *al, al->level == 'H' ? "[hypervisor]" : "<not found>"); return 0; } + +int event__parse_sample(event_t *event, u64 type, struct sample_data *data) +{ + u64 *array = event->sample.array; + + if (type & PERF_SAMPLE_IP) { + data->ip = event->ip.ip; + array++; + } + + if (type & PERF_SAMPLE_TID) { + u32 *p = (u32 *)array; + data->pid = p[0]; + data->tid = p[1]; + array++; + } + + if (type & PERF_SAMPLE_TIME) { + data->time = *array; + array++; + } + + if (type & PERF_SAMPLE_ADDR) { + data->addr = *array; + array++; + } + + if (type & PERF_SAMPLE_ID) { + data->id = *array; + array++; + } + + if (type & PERF_SAMPLE_STREAM_ID) { + data->stream_id = *array; + array++; + } + + if (type & PERF_SAMPLE_CPU) { + u32 *p = (u32 *)array; + data->cpu = *p; + array++; + } + + if (type & PERF_SAMPLE_PERIOD) { + data->period = *array; + array++; + } + + if (type & PERF_SAMPLE_READ) { + pr_debug("PERF_SAMPLE_READ is unsuported for now\n"); + return -1; + } + + if (type & PERF_SAMPLE_CALLCHAIN) { + data->callchain = (struct ip_callchain *)array; + array += 1 + data->callchain->nr; + } + + if (type & PERF_SAMPLE_RAW) { + u32 *p = (u32 *)array; + data->raw_size = *p; + p++; + data->raw_data = p; + } + + return 0; +} diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h index a4cc8105cf6..c7a78eef8e5 100644 --- a/tools/perf/util/event.h +++ b/tools/perf/util/event.h @@ -56,11 +56,25 @@ struct read_event { u64 id; }; -struct sample_event{ +struct sample_event { struct perf_event_header header; u64 array[]; }; +struct sample_data { + u64 ip; + u32 pid, tid; + u64 time; + u64 addr; + u64 id; + u64 stream_id; + u32 cpu; + u64 period; + struct ip_callchain *callchain; + u32 raw_size; + void *raw_data; +}; + #define BUILD_ID_SIZE 20 struct build_id_event { @@ -155,5 +169,6 @@ int event__process_task(event_t *self); struct addr_location; int event__preprocess_sample(const event_t *self, struct addr_location *al, symbol_filter_t filter); +int event__parse_sample(event_t *event, u64 type, struct sample_data *data); #endif /* __PERF_RECORD_H */ diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index 4805e6dfd23..59a9c0b3033 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -187,7 +187,9 @@ static int do_write(int fd, const void *buf, size_t size) static int __dsos__write_buildid_table(struct list_head *head, int fd) { +#define NAME_ALIGN 64 struct dso *pos; + static const char zero_buf[NAME_ALIGN]; list_for_each_entry(pos, head, node) { int err; @@ -197,14 +199,17 @@ static int __dsos__write_buildid_table(struct list_head *head, int fd) if (!pos->has_build_id) continue; len = pos->long_name_len + 1; - len = ALIGN(len, 64); + len = ALIGN(len, NAME_ALIGN); memset(&b, 0, sizeof(b)); memcpy(&b.build_id, pos->build_id, sizeof(pos->build_id)); b.header.size = sizeof(b) + len; err = do_write(fd, &b, sizeof(b)); if (err < 0) return err; - err = do_write(fd, pos->long_name, len); + err = do_write(fd, pos->long_name, pos->long_name_len + 1); + if (err < 0) + return err; + err = do_write(fd, zero_buf, len - pos->long_name_len - 1); if (err < 0) return err; } diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index 9e5dbd66d34..e5bc0fb016b 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -197,7 +197,7 @@ struct tracepoint_path *tracepoint_id_to_path(u64 config) if (id == config) { closedir(evt_dir); closedir(sys_dir); - path = zalloc(sizeof(path)); + path = zalloc(sizeof(*path)); path->system = malloc(MAX_EVENT_LENGTH); if (!path->system) { free(path); @@ -467,7 +467,6 @@ parse_subsystem_tracepoint_event(char *sys_name, char *flags) while ((evt_ent = readdir(evt_dir))) { char event_opt[MAX_EVOPT_LEN + 1]; int len; - unsigned int rem = MAX_EVOPT_LEN; if (!strcmp(evt_ent->d_name, ".") || !strcmp(evt_ent->d_name, "..") @@ -475,20 +474,12 @@ parse_subsystem_tracepoint_event(char *sys_name, char *flags) || !strcmp(evt_ent->d_name, "filter")) continue; - len = snprintf(event_opt, MAX_EVOPT_LEN, "%s:%s", sys_name, - evt_ent->d_name); + len = snprintf(event_opt, MAX_EVOPT_LEN, "%s:%s%s%s", sys_name, + evt_ent->d_name, flags ? ":" : "", + flags ?: ""); if (len < 0) return EVT_FAILED; - rem -= len; - if (flags) { - if (rem < strlen(flags) + 1) - return EVT_FAILED; - - strcat(event_opt, ":"); - strcat(event_opt, flags); - } - if (parse_events(NULL, event_opt, 0)) return EVT_FAILED; } diff --git a/tools/perf/util/parse-options.c b/tools/perf/util/parse-options.c index 6d8af48c925..efebd5b476b 100644 --- a/tools/perf/util/parse-options.c +++ b/tools/perf/util/parse-options.c @@ -430,6 +430,9 @@ int usage_with_options_internal(const char * const *usagestr, pos = fprintf(stderr, " "); if (opts->short_name) pos += fprintf(stderr, "-%c", opts->short_name); + else + pos += fprintf(stderr, " "); + if (opts->long_name && opts->short_name) pos += fprintf(stderr, ", "); if (opts->long_name) diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c index cd7fbda5e2a..d14a4585bca 100644 --- a/tools/perf/util/probe-event.c +++ b/tools/perf/util/probe-event.c @@ -48,6 +48,9 @@ /* If there is no space to write, returns -E2BIG. */ static int e_snprintf(char *str, size_t size, const char *format, ...) + __attribute__((format(printf, 3, 4))); + +static int e_snprintf(char *str, size_t size, const char *format, ...) { int ret; va_list ap; @@ -258,7 +261,7 @@ int synthesize_perf_probe_event(struct probe_point *pp) ret = e_snprintf(buf, MAX_CMDLEN, "%s%s%s%s", pp->function, offs, pp->retprobe ? "%return" : "", line); else - ret = e_snprintf(buf, MAX_CMDLEN, "%s%s%s%s", pp->file, line); + ret = e_snprintf(buf, MAX_CMDLEN, "%s%s", pp->file, line); if (ret <= 0) goto error; len = ret; @@ -373,14 +376,32 @@ static void clear_probe_point(struct probe_point *pp) free(pp->args); for (i = 0; i < pp->found; i++) free(pp->probes[i]); - memset(pp, 0, sizeof(pp)); + memset(pp, 0, sizeof(*pp)); +} + +/* Show an event */ +static void show_perf_probe_event(const char *group, const char *event, + const char *place, struct probe_point *pp) +{ + int i; + char buf[128]; + + e_snprintf(buf, 128, "%s:%s", group, event); + printf(" %-40s (on %s", buf, place); + + if (pp->nr_args > 0) { + printf(" with"); + for (i = 0; i < pp->nr_args; i++) + printf(" %s", pp->args[i]); + } + printf(")\n"); } /* List up current perf-probe events */ void show_perf_probe_events(void) { unsigned int i; - int fd; + int fd, nr; char *group, *event; struct probe_point pp; struct strlist *rawlist; @@ -393,8 +414,13 @@ void show_perf_probe_events(void) for (i = 0; i < strlist__nr_entries(rawlist); i++) { ent = strlist__entry(rawlist, i); parse_trace_kprobe_event(ent->s, &group, &event, &pp); + /* Synthesize only event probe point */ + nr = pp.nr_args; + pp.nr_args = 0; synthesize_perf_probe_event(&pp); - printf("[%s:%s]\t%s\n", group, event, pp.probes[0]); + pp.nr_args = nr; + /* Show an event */ + show_perf_probe_event(group, event, pp.probes[0], &pp); free(group); free(event); clear_probe_point(&pp); @@ -404,21 +430,28 @@ void show_perf_probe_events(void) } /* Get current perf-probe event names */ -static struct strlist *get_perf_event_names(int fd) +static struct strlist *get_perf_event_names(int fd, bool include_group) { unsigned int i; char *group, *event; + char buf[128]; struct strlist *sl, *rawlist; struct str_node *ent; rawlist = get_trace_kprobe_event_rawlist(fd); - sl = strlist__new(false, NULL); + sl = strlist__new(true, NULL); for (i = 0; i < strlist__nr_entries(rawlist); i++) { ent = strlist__entry(rawlist, i); parse_trace_kprobe_event(ent->s, &group, &event, NULL); - strlist__add(sl, event); + if (include_group) { + if (e_snprintf(buf, 128, "%s:%s", group, event) < 0) + die("Failed to copy group:event name."); + strlist__add(sl, buf); + } else + strlist__add(sl, event); free(group); + free(event); } strlist__delete(rawlist); @@ -426,24 +459,30 @@ static struct strlist *get_perf_event_names(int fd) return sl; } -static int write_trace_kprobe_event(int fd, const char *buf) +static void write_trace_kprobe_event(int fd, const char *buf) { int ret; + pr_debug("Writing event: %s\n", buf); ret = write(fd, buf, strlen(buf)); if (ret <= 0) - die("Failed to create event."); - else - printf("Added new event: %s\n", buf); - - return ret; + die("Failed to write event: %s", strerror(errno)); } static void get_new_event_name(char *buf, size_t len, const char *base, struct strlist *namelist) { int i, ret; - for (i = 0; i < MAX_EVENT_INDEX; i++) { + + /* Try no suffix */ + ret = e_snprintf(buf, len, "%s", base); + if (ret < 0) + die("snprintf() failed: %s", strerror(-ret)); + if (!strlist__has_entry(namelist, buf)) + return; + + /* Try to add suffix */ + for (i = 1; i < MAX_EVENT_INDEX; i++) { ret = e_snprintf(buf, len, "%s_%d", base, i); if (ret < 0) die("snprintf() failed: %s", strerror(-ret)); @@ -464,7 +503,7 @@ void add_trace_kprobe_events(struct probe_point *probes, int nr_probes) fd = open_kprobe_events(O_RDWR, O_APPEND); /* Get current event names */ - namelist = get_perf_event_names(fd); + namelist = get_perf_event_names(fd, false); for (j = 0; j < nr_probes; j++) { pp = probes + j; @@ -476,9 +515,73 @@ void add_trace_kprobe_events(struct probe_point *probes, int nr_probes) PERFPROBE_GROUP, event, pp->probes[i]); write_trace_kprobe_event(fd, buf); + printf("Added new event:\n"); + /* Get the first parameter (probe-point) */ + sscanf(pp->probes[i], "%s", buf); + show_perf_probe_event(PERFPROBE_GROUP, event, + buf, pp); /* Add added event name to namelist */ strlist__add(namelist, event); } } + /* Show how to use the event. */ + printf("\nYou can now use it on all perf tools, such as:\n\n"); + printf("\tperf record -e %s:%s -a sleep 1\n\n", PERFPROBE_GROUP, event); + + strlist__delete(namelist); + close(fd); +} + +static void del_trace_kprobe_event(int fd, const char *group, + const char *event, struct strlist *namelist) +{ + char buf[128]; + + if (e_snprintf(buf, 128, "%s:%s", group, event) < 0) + die("Failed to copy event."); + if (!strlist__has_entry(namelist, buf)) { + pr_warning("Warning: event \"%s\" is not found.\n", buf); + return; + } + /* Convert from perf-probe event to trace-kprobe event */ + if (e_snprintf(buf, 128, "-:%s/%s", group, event) < 0) + die("Failed to copy event."); + + write_trace_kprobe_event(fd, buf); + printf("Remove event: %s:%s\n", group, event); +} + +void del_trace_kprobe_events(struct strlist *dellist) +{ + int fd; + unsigned int i; + const char *group, *event; + char *p, *str; + struct str_node *ent; + struct strlist *namelist; + + fd = open_kprobe_events(O_RDWR, O_APPEND); + /* Get current event names */ + namelist = get_perf_event_names(fd, true); + + for (i = 0; i < strlist__nr_entries(dellist); i++) { + ent = strlist__entry(dellist, i); + str = strdup(ent->s); + if (!str) + die("Failed to copy event."); + p = strchr(str, ':'); + if (p) { + group = str; + *p = '\0'; + event = p + 1; + } else { + group = PERFPROBE_GROUP; + event = str; + } + del_trace_kprobe_event(fd, group, event, namelist); + free(str); + } + strlist__delete(namelist); close(fd); } + diff --git a/tools/perf/util/probe-event.h b/tools/perf/util/probe-event.h index 0c6fe56fe38..f752159124a 100644 --- a/tools/perf/util/probe-event.h +++ b/tools/perf/util/probe-event.h @@ -10,6 +10,7 @@ extern void parse_trace_kprobe_event(const char *str, char **group, char **event, struct probe_point *pp); extern int synthesize_trace_kprobe_event(struct probe_point *pp); extern void add_trace_kprobe_events(struct probe_point *probes, int nr_probes); +extern void del_trace_kprobe_events(struct strlist *dellist); extern void show_perf_probe_events(void); /* Maximum index number of event-name postfix */ diff --git a/tools/perf/util/probe-finder.c b/tools/perf/util/probe-finder.c index 293cdfc1b8c..4585f1d8679 100644 --- a/tools/perf/util/probe-finder.c +++ b/tools/perf/util/probe-finder.c @@ -106,7 +106,7 @@ static int strtailcmp(const char *s1, const char *s2) { int i1 = strlen(s1); int i2 = strlen(s2); - while (--i1 > 0 && --i2 > 0) { + while (--i1 >= 0 && --i2 >= 0) { if (s1[i1] != s2[i2]) return s1[i1] - s2[i2]; } diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c index fffcb937cdc..e7508ad3450 100644 --- a/tools/perf/util/symbol.c +++ b/tools/perf/util/symbol.c @@ -938,8 +938,9 @@ static bool __dsos__read_build_ids(struct list_head *head) bool dsos__read_build_ids(void) { - return __dsos__read_build_ids(&dsos__kernel) || - __dsos__read_build_ids(&dsos__user); + bool kbuildids = __dsos__read_build_ids(&dsos__kernel), + ubuildids = __dsos__read_build_ids(&dsos__user); + return kbuildids || ubuildids; } /* diff --git a/tools/perf/util/trace-event-parse.c b/tools/perf/util/trace-event-parse.c index 0302405aa2c..c5c32be040b 100644 --- a/tools/perf/util/trace-event-parse.c +++ b/tools/perf/util/trace-event-parse.c @@ -177,7 +177,7 @@ void parse_proc_kallsyms(char *file, unsigned int size __unused) func_count++; } - func_list = malloc_or_die(sizeof(*func_list) * func_count + 1); + func_list = malloc_or_die(sizeof(*func_list) * (func_count + 1)); i = 0; while (list) { @@ -1477,7 +1477,7 @@ process_fields(struct event *event, struct print_flag_sym **list, char **tok) goto out_free; field = malloc_or_die(sizeof(*field)); - memset(field, 0, sizeof(field)); + memset(field, 0, sizeof(*field)); value = arg_eval(arg); field->value = strdup(value); diff --git a/tools/perf/util/trace-event-perl.c b/tools/perf/util/trace-event-perl.c index 51e833fd58c..a5ffe60db5d 100644 --- a/tools/perf/util/trace-event-perl.c +++ b/tools/perf/util/trace-event-perl.c @@ -32,9 +32,6 @@ void xs_init(pTHX); -void boot_Perf__Trace__Context(pTHX_ CV *cv); -void boot_DynaLoader(pTHX_ CV *cv); - void xs_init(pTHX) { const char *file = __FILE__; @@ -573,26 +570,72 @@ struct scripting_ops perl_scripting_ops = { .generate_script = perl_generate_script, }; -#ifdef NO_LIBPERL -void setup_perl_scripting(void) +static void print_unsupported_msg(void) { fprintf(stderr, "Perl scripting not supported." - " Install libperl and rebuild perf to enable it. e.g. " - "apt-get install libperl-dev (ubuntu), yum install " - "perl-ExtUtils-Embed (Fedora), etc.\n"); + " Install libperl and rebuild perf to enable it.\n" + "For example:\n # apt-get install libperl-dev (ubuntu)" + "\n # yum install perl-ExtUtils-Embed (Fedora)" + "\n etc.\n"); } -#else -void setup_perl_scripting(void) + +static int perl_start_script_unsupported(const char *script __unused) +{ + print_unsupported_msg(); + + return -1; +} + +static int perl_stop_script_unsupported(void) +{ + return 0; +} + +static void perl_process_event_unsupported(int cpu __unused, + void *data __unused, + int size __unused, + unsigned long long nsecs __unused, + char *comm __unused) +{ +} + +static int perl_generate_script_unsupported(const char *outfile __unused) +{ + print_unsupported_msg(); + + return -1; +} + +struct scripting_ops perl_scripting_unsupported_ops = { + .name = "Perl", + .start_script = perl_start_script_unsupported, + .stop_script = perl_stop_script_unsupported, + .process_event = perl_process_event_unsupported, + .generate_script = perl_generate_script_unsupported, +}; + +static void register_perl_scripting(struct scripting_ops *scripting_ops) { int err; - err = script_spec_register("Perl", &perl_scripting_ops); + err = script_spec_register("Perl", scripting_ops); if (err) die("error registering Perl script extension"); - err = script_spec_register("pl", &perl_scripting_ops); + err = script_spec_register("pl", scripting_ops); if (err) die("error registering pl script extension"); scripting_context = malloc(sizeof(struct scripting_context)); } + +#ifdef NO_LIBPERL +void setup_perl_scripting(void) +{ + register_perl_scripting(&perl_scripting_unsupported_ops); +} +#else +void setup_perl_scripting(void) +{ + register_perl_scripting(&perl_scripting_ops); +} #endif diff --git a/tools/perf/util/trace-event-perl.h b/tools/perf/util/trace-event-perl.h index 8fe0d866fe1..e88fb26137b 100644 --- a/tools/perf/util/trace-event-perl.h +++ b/tools/perf/util/trace-event-perl.h @@ -34,9 +34,13 @@ typedef int INTERP; #define dXSUB_SYS #define pTHX_ static inline void newXS(const char *a, void *b, const char *c) {} +static void boot_Perf__Trace__Context(pTHX_ CV *cv) {} +static void boot_DynaLoader(pTHX_ CV *cv) {} #else #include <EXTERN.h> #include <perl.h> +void boot_Perf__Trace__Context(pTHX_ CV *cv); +void boot_DynaLoader(pTHX_ CV *cv); typedef PerlInterpreter * INTERP; #endif diff --git a/tools/perf/util/trace-event-read.c b/tools/perf/util/trace-event-read.c index 342dfdd43f8..1744422cafc 100644 --- a/tools/perf/util/trace-event-read.c +++ b/tools/perf/util/trace-event-read.c @@ -145,8 +145,9 @@ static void read_proc_kallsyms(void) if (!size) return; - buf = malloc_or_die(size); + buf = malloc_or_die(size + 1); read_or_die(buf, size); + buf[size] = '\0'; parse_proc_kallsyms(buf, size); |