From 1cbd8b3fdcf56a3c39a7596512095c9e33221fa1 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Thu, 30 Oct 2008 10:45:36 +0000 Subject: x86: add two missing unwind annotations Impact: improve debuginfo Signed-off-by: Jan Beulich Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_64.S | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index b86f332c96a..ddeeb105258 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -255,6 +255,7 @@ ENTRY(ret_from_fork) call schedule_tail GET_THREAD_INFO(%rcx) testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),TI_flags(%rcx) + CFI_REMEMBER_STATE jnz rff_trace rff_action: RESTORE_REST @@ -264,6 +265,7 @@ rff_action: jnz int_ret_from_sys_call RESTORE_TOP_OF_STACK %rdi,ARGOFFSET jmp ret_from_sys_call + CFI_RESTORE_STATE rff_trace: movq %rsp,%rdi call syscall_trace_leave -- cgit v1.2.3 From 6cf87efbc7a3676e0ad7c9622ec6aec244a593bc Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 4 Nov 2008 10:42:23 +0100 Subject: x86 debug: mark early_printk.o as notrace Impact: do not do function-tracing in the early-printk code this is useful when earlyprintk=vga,keep is used to debug tracer plugins. Signed-off-by: Ingo Molnar --- arch/x86/kernel/Makefile | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index e489ff9cb3e..943fe6026c6 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -12,6 +12,7 @@ CFLAGS_REMOVE_tsc.o = -pg CFLAGS_REMOVE_rtc.o = -pg CFLAGS_REMOVE_paravirt-spinlocks.o = -pg CFLAGS_REMOVE_ftrace.o = -pg +CFLAGS_REMOVE_early_printk.o = -pg endif # -- cgit v1.2.3 From 8652cb4b0d87accbe78725fd2a13be2787059649 Mon Sep 17 00:00:00 2001 From: Prarit Bhargava Date: Wed, 12 Nov 2008 13:35:00 -0500 Subject: x86: warn of incorrect cpu_khz on AMD systems Impact: add debug check If none of the perfctrs are free when calculating cpu_khz we default to using ctr 3 (ie, we just choose 3). This may lead to an incorrect tsc freq value which can cause the system to be unstable. To aid in future debugging, WARN the user of a potential problem. Signed-off-by: Prarit Bhargava Signed-off-by: Ingo Molnar --- arch/x86/kernel/time_64.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/time_64.c b/arch/x86/kernel/time_64.c index cb19d650c21..418a095c579 100644 --- a/arch/x86/kernel/time_64.c +++ b/arch/x86/kernel/time_64.c @@ -80,6 +80,8 @@ unsigned long __init calibrate_cpu(void) break; no_ctr_free = (i == 4); if (no_ctr_free) { + WARN(1, KERN_WARNING "Warning: AMD perfctrs busy ... " + "cpu_khz value may be incorrect.\n"); i = 3; rdmsrl(MSR_K7_EVNTSEL3, evntsel3); wrmsrl(MSR_K7_EVNTSEL3, 0); -- cgit v1.2.3 From 97a70e548bd97d5a46ae9d44f24aafcc013fd701 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 12 Nov 2008 23:22:35 +0100 Subject: x86, hibernate: fix breakage on x86_32 with CONFIG_NUMA set Impact: fix crash during hibernation on 32-bit NUMA The NUMA code on x86_32 creates special memory mapping that allows each node's pgdat to be located in this node's memory. For this purpose it allocates a memory area at the end of each node's memory and maps this area so that it is accessible with virtual addresses belonging to low memory. As a result, if there is high memory, these NUMA-allocated areas are physically located in high memory, although they are mapped to low memory addresses. Our hibernation code does not take that into account and for this reason hibernation fails on all x86_32 systems with CONFIG_NUMA=y and with high memory present. Fix this by adding a special mapping for the NUMA-allocated memory areas to the temporary page tables created during the last phase of resume. Signed-off-by: Rafael J. Wysocki Signed-off-by: Ingo Molnar --- arch/x86/include/asm/mmzone_32.h | 4 ++++ arch/x86/mm/numa_32.c | 35 +++++++++++++++++++++++++++++++++++ arch/x86/power/hibernate_32.c | 4 ++++ 3 files changed, 43 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mmzone_32.h b/arch/x86/include/asm/mmzone_32.h index 485bdf059ff..07f1af494ca 100644 --- a/arch/x86/include/asm/mmzone_32.h +++ b/arch/x86/include/asm/mmzone_32.h @@ -34,10 +34,14 @@ static inline void get_memcfg_numa(void) extern int early_pfn_to_nid(unsigned long pfn); +extern void resume_map_numa_kva(pgd_t *pgd); + #else /* !CONFIG_NUMA */ #define get_memcfg_numa get_memcfg_numa_flat +static inline void resume_map_numa_kva(pgd_t *pgd) {} + #endif /* CONFIG_NUMA */ #ifdef CONFIG_DISCONTIGMEM diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c index 847c164725f..8518c678d83 100644 --- a/arch/x86/mm/numa_32.c +++ b/arch/x86/mm/numa_32.c @@ -222,6 +222,41 @@ static void __init remap_numa_kva(void) } } +#ifdef CONFIG_HIBERNATION +/** + * resume_map_numa_kva - add KVA mapping to the temporary page tables created + * during resume from hibernation + * @pgd_base - temporary resume page directory + */ +void resume_map_numa_kva(pgd_t *pgd_base) +{ + int node; + + for_each_online_node(node) { + unsigned long start_va, start_pfn, size, pfn; + + start_va = (unsigned long)node_remap_start_vaddr[node]; + start_pfn = node_remap_start_pfn[node]; + size = node_remap_size[node]; + + printk(KERN_DEBUG "%s: node %d\n", __FUNCTION__, node); + + for (pfn = 0; pfn < size; pfn += PTRS_PER_PTE) { + unsigned long vaddr = start_va + (pfn << PAGE_SHIFT); + pgd_t *pgd = pgd_base + pgd_index(vaddr); + pud_t *pud = pud_offset(pgd, vaddr); + pmd_t *pmd = pmd_offset(pud, vaddr); + + set_pmd(pmd, pfn_pmd(start_pfn + pfn, + PAGE_KERNEL_LARGE_EXEC)); + + printk(KERN_DEBUG "%s: %08lx -> pfn %08lx\n", + __FUNCTION__, vaddr, start_pfn + pfn); + } + } +} +#endif + static unsigned long calculate_numa_remap_pages(void) { int nid; diff --git a/arch/x86/power/hibernate_32.c b/arch/x86/power/hibernate_32.c index f2b6e3f11bf..81197c62d5b 100644 --- a/arch/x86/power/hibernate_32.c +++ b/arch/x86/power/hibernate_32.c @@ -12,6 +12,7 @@ #include #include #include +#include /* Defined in hibernate_asm_32.S */ extern int restore_image(void); @@ -127,6 +128,9 @@ static int resume_physical_mapping_init(pgd_t *pgd_base) } } } + + resume_map_numa_kva(pgd_base); + return 0; } -- cgit v1.2.3 From 604d20554883cf03f888440d58ea7c6d36899839 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 12 Nov 2008 23:26:14 +0100 Subject: x86: make NUMA on 32-bit depend on EXPERIMENTAL again My previous patch to make CONFIG_NUMA on x86_32 depend on BROKEN turned out to be unnecessary, after all, since the source of the hibernation vs CONFIG_NUMA problem turned out to be the fact that we didn't take the NUMA KVA remapping into account in the hibernation code. Signed-off-by: Rafael J. Wysocki Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 93224b56918..4cf0ab13d18 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -957,7 +957,7 @@ config ARCH_PHYS_ADDR_T_64BIT config NUMA bool "Numa Memory Allocation and Scheduler Support (EXPERIMENTAL)" depends on SMP - depends on X86_64 || (X86_32 && HIGHMEM64G && (X86_NUMAQ || X86_BIGSMP || X86_SUMMIT && ACPI) && BROKEN) + depends on X86_64 || (X86_32 && HIGHMEM64G && (X86_NUMAQ || X86_BIGSMP || X86_SUMMIT && ACPI) && EXPERIMENTAL) default n if X86_PC default y if (X86_NUMAQ || X86_SUMMIT || X86_BIGSMP) help -- cgit v1.2.3 From e5e1f606ecbf67e52ebe2df5d14f8b94ec6544d0 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 17 Nov 2008 15:07:17 +0100 Subject: AMD IOMMU: add parameter to disable device isolation Impact: add a new AMD IOMMU kernel command line parameter Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu_init.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 0cdcda35a05..838a2e1d5bb 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -1213,6 +1213,8 @@ static int __init parse_amd_iommu_options(char *str) for (; *str; ++str) { if (strncmp(str, "isolate", 7) == 0) amd_iommu_isolate = 1; + if (strncmp(str, "share", 5) == 0) + amd_iommu_isolate = 0; if (strncmp(str, "fullflush", 11) == 0) amd_iommu_unmap_flush = true; } -- cgit v1.2.3 From 3ce1f93c6d53c3f91c3846cf66b018276c8ac2e7 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 17 Nov 2008 15:09:20 +0100 Subject: AMD IOMMU: enable device isolation per default Impact: makes device isolation the default for AMD IOMMU Some device drivers showed double-free bugs of DMA memory while testing them with AMD IOMMU. If all devices share the same protection domain this can lead to data corruption and data loss. Prevent this by putting each device into its own protection domain per default. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu_init.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 838a2e1d5bb..595edd2befc 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -121,7 +121,7 @@ u16 amd_iommu_last_bdf; /* largest PCI device id we have LIST_HEAD(amd_iommu_unity_map); /* a list of required unity mappings we find in ACPI */ unsigned amd_iommu_aperture_order = 26; /* size of aperture in power of 2 */ -int amd_iommu_isolate; /* if 1, device isolation is enabled */ +int amd_iommu_isolate = 1; /* if 1, device isolation is enabled */ bool amd_iommu_unmap_flush; /* if true, flush on every unmap */ LIST_HEAD(amd_iommu_list); /* list of all AMD IOMMUs in the -- cgit v1.2.3 From 695b5676c727d80921a2dc8737d5b3322222db85 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 17 Nov 2008 15:16:43 +0100 Subject: AMD IOMMU: fix fullflush comparison length Impact: fix comparison length for 'fullflush' Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu_init.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 595edd2befc..30ae2701b3d 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -1215,7 +1215,7 @@ static int __init parse_amd_iommu_options(char *str) amd_iommu_isolate = 1; if (strncmp(str, "share", 5) == 0) amd_iommu_isolate = 0; - if (strncmp(str, "fullflush", 11) == 0) + if (strncmp(str, "fullflush", 9) == 0) amd_iommu_unmap_flush = true; } -- cgit v1.2.3 From 8501c45cc32c311ae755a2d5ac8c4a5f04908d42 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 17 Nov 2008 19:11:46 +0100 Subject: AMD IOMMU: check for next_bit also in unmapped area Impact: fix possible use of stale IO/TLB entries Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 331b318304e..e4899e0e878 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -537,7 +537,7 @@ static void dma_ops_free_addresses(struct dma_ops_domain *dom, address >>= PAGE_SHIFT; iommu_area_free(dom->bitmap, address, pages); - if (address + pages >= dom->next_bit) + if (address >= dom->next_bit) dom->need_flush = true; } -- cgit v1.2.3 From 0af40a4b1050c050e62eb1dc30b82d5ab22bf221 Mon Sep 17 00:00:00 2001 From: Philipp Kohlbecher Date: Sun, 16 Nov 2008 12:11:01 +0100 Subject: x86: more general identifier for Phoenix BIOS Impact: widen the reach of the low-memory-protect DMI quirk Phoenix BIOSes variously identify their vendor as "Phoenix Technologies, LTD" or "Phoenix Technologies LTD" (without the comma.) This patch makes the identification string in the bad_bios_dmi_table more general (following a suggestion by Ingo Molnar), so that both versions are handled. Again, the patched file compiles cleanly and the patch has been tested successfully on my machine. Signed-off-by: Philipp Kohlbecher Cc: Signed-off-by: Ingo Molnar --- arch/x86/kernel/setup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 0fa6790c1dd..9d5674f7b6c 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -764,7 +764,7 @@ static struct dmi_system_id __initdata bad_bios_dmi_table[] = { .callback = dmi_low_memory_corruption, .ident = "Phoenix BIOS", .matches = { - DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies, LTD"), + DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies"), }, }, #endif -- cgit v1.2.3 From 093bac154c142fa1fb31a3ac69ae1bc08930231b Mon Sep 17 00:00:00 2001 From: Steve Conklin Date: Fri, 14 Nov 2008 00:55:51 -0600 Subject: x86: quirk for reboot stalls on a Dell Optiplex 330 Dell Optiplex 330 appears to hang on reboot. This is resolved by adding a quirk to set bios reboot. Signed-off-by: Leann Ogasawara Signed-off-by: Steve Conklin Signed-off-by: Ingo Molnar --- arch/x86/kernel/reboot.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 724adfc63cb..cc5a2545dd4 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -169,6 +169,15 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { DMI_MATCH(DMI_BOARD_NAME, "0KW626"), }, }, + { /* Handle problems with rebooting on Dell Optiplex 330 with 0KP561 */ + .callback = set_bios_reboot, + .ident = "Dell OptiPlex 330", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 330"), + DMI_MATCH(DMI_BOARD_NAME, "0KP561"), + }, + }, { /* Handle problems with rebooting on Dell 2400's */ .callback = set_bios_reboot, .ident = "Dell PowerEdge 2400", -- cgit v1.2.3 From 20a4a236c7de5c915551cdc562482aa53eaff40e Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Thu, 13 Nov 2008 18:06:04 -0800 Subject: x86: uaccess_64: fix return value in __copy_from_user() __copy_from_user() will return invalid value 16 when it fails to access user space and the size is 10. Signed-off-by: Hiroshi Shimamoto Signed-off-by: Ingo Molnar --- arch/x86/include/asm/uaccess_64.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h index 664f15280f1..f8cfd00db45 100644 --- a/arch/x86/include/asm/uaccess_64.h +++ b/arch/x86/include/asm/uaccess_64.h @@ -46,7 +46,7 @@ int __copy_from_user(void *dst, const void __user *src, unsigned size) return ret; case 10: __get_user_asm(*(u64 *)dst, (u64 __user *)src, - ret, "q", "", "=r", 16); + ret, "q", "", "=r", 10); if (unlikely(ret)) return ret; __get_user_asm(*(u16 *)(8 + (char *)dst), -- cgit v1.2.3 From de11defebf00007677fb7ee91d9b089b78786fbb Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Wed, 19 Nov 2008 15:36:14 -0800 Subject: reintroduce accept4 Introduce a new accept4() system call. The addition of this system call matches analogous changes in 2.6.27 (dup3(), evenfd2(), signalfd4(), inotify_init1(), epoll_create1(), pipe2()) which added new system calls that differed from analogous traditional system calls in adding a flags argument that can be used to access additional functionality. The accept4() system call is exactly the same as accept(), except that it adds a flags bit-mask argument. Two flags are initially implemented. (Most of the new system calls in 2.6.27 also had both of these flags.) SOCK_CLOEXEC causes the close-on-exec (FD_CLOEXEC) flag to be enabled for the new file descriptor returned by accept4(). This is a useful security feature to avoid leaking information in a multithreaded program where one thread is doing an accept() at the same time as another thread is doing a fork() plus exec(). More details here: http://udrepper.livejournal.com/20407.html "Secure File Descriptor Handling", Ulrich Drepper). The other flag is SOCK_NONBLOCK, which causes the O_NONBLOCK flag to be enabled on the new open file description created by accept4(). (This flag is merely a convenience, saving the use of additional calls fcntl(F_GETFL) and fcntl (F_SETFL) to achieve the same result. Here's a test program. Works on x86-32. Should work on x86-64, but I (mtk) don't have a system to hand to test with. It tests accept4() with each of the four possible combinations of SOCK_CLOEXEC and SOCK_NONBLOCK set/clear in 'flags', and verifies that the appropriate flags are set on the file descriptor/open file description returned by accept4(). I tested Ulrich's patch in this thread by applying against 2.6.28-rc2, and it passes according to my test program. /* test_accept4.c Copyright (C) 2008, Linux Foundation, written by Michael Kerrisk Licensed under the GNU GPLv2 or later. */ #define _GNU_SOURCE #include #include #include #include #include #include #include #include #define PORT_NUM 33333 #define die(msg) do { perror(msg); exit(EXIT_FAILURE); } while (0) /**********************************************************************/ /* The following is what we need until glibc gets a wrapper for accept4() */ /* Flags for socket(), socketpair(), accept4() */ #ifndef SOCK_CLOEXEC #define SOCK_CLOEXEC O_CLOEXEC #endif #ifndef SOCK_NONBLOCK #define SOCK_NONBLOCK O_NONBLOCK #endif #ifdef __x86_64__ #define SYS_accept4 288 #elif __i386__ #define USE_SOCKETCALL 1 #define SYS_ACCEPT4 18 #else #error "Sorry -- don't know the syscall # on this architecture" #endif static int accept4(int fd, struct sockaddr *sockaddr, socklen_t *addrlen, int flags) { printf("Calling accept4(): flags = %x", flags); if (flags != 0) { printf(" ("); if (flags & SOCK_CLOEXEC) printf("SOCK_CLOEXEC"); if ((flags & SOCK_CLOEXEC) && (flags & SOCK_NONBLOCK)) printf(" "); if (flags & SOCK_NONBLOCK) printf("SOCK_NONBLOCK"); printf(")"); } printf("\n"); #if USE_SOCKETCALL long args[6]; args[0] = fd; args[1] = (long) sockaddr; args[2] = (long) addrlen; args[3] = flags; return syscall(SYS_socketcall, SYS_ACCEPT4, args); #else return syscall(SYS_accept4, fd, sockaddr, addrlen, flags); #endif } /**********************************************************************/ static int do_test(int lfd, struct sockaddr_in *conn_addr, int closeonexec_flag, int nonblock_flag) { int connfd, acceptfd; int fdf, flf, fdf_pass, flf_pass; struct sockaddr_in claddr; socklen_t addrlen; printf("=======================================\n"); connfd = socket(AF_INET, SOCK_STREAM, 0); if (connfd == -1) die("socket"); if (connect(connfd, (struct sockaddr *) conn_addr, sizeof(struct sockaddr_in)) == -1) die("connect"); addrlen = sizeof(struct sockaddr_in); acceptfd = accept4(lfd, (struct sockaddr *) &claddr, &addrlen, closeonexec_flag | nonblock_flag); if (acceptfd == -1) { perror("accept4()"); close(connfd); return 0; } fdf = fcntl(acceptfd, F_GETFD); if (fdf == -1) die("fcntl:F_GETFD"); fdf_pass = ((fdf & FD_CLOEXEC) != 0) == ((closeonexec_flag & SOCK_CLOEXEC) != 0); printf("Close-on-exec flag is %sset (%s); ", (fdf & FD_CLOEXEC) ? "" : "not ", fdf_pass ? "OK" : "failed"); flf = fcntl(acceptfd, F_GETFL); if (flf == -1) die("fcntl:F_GETFD"); flf_pass = ((flf & O_NONBLOCK) != 0) == ((nonblock_flag & SOCK_NONBLOCK) !=0); printf("nonblock flag is %sset (%s)\n", (flf & O_NONBLOCK) ? "" : "not ", flf_pass ? "OK" : "failed"); close(acceptfd); close(connfd); printf("Test result: %s\n", (fdf_pass && flf_pass) ? "PASS" : "FAIL"); return fdf_pass && flf_pass; } static int create_listening_socket(int port_num) { struct sockaddr_in svaddr; int lfd; int optval; memset(&svaddr, 0, sizeof(struct sockaddr_in)); svaddr.sin_family = AF_INET; svaddr.sin_addr.s_addr = htonl(INADDR_ANY); svaddr.sin_port = htons(port_num); lfd = socket(AF_INET, SOCK_STREAM, 0); if (lfd == -1) die("socket"); optval = 1; if (setsockopt(lfd, SOL_SOCKET, SO_REUSEADDR, &optval, sizeof(optval)) == -1) die("setsockopt"); if (bind(lfd, (struct sockaddr *) &svaddr, sizeof(struct sockaddr_in)) == -1) die("bind"); if (listen(lfd, 5) == -1) die("listen"); return lfd; } int main(int argc, char *argv[]) { struct sockaddr_in conn_addr; int lfd; int port_num; int passed; passed = 1; port_num = (argc > 1) ? atoi(argv[1]) : PORT_NUM; memset(&conn_addr, 0, sizeof(struct sockaddr_in)); conn_addr.sin_family = AF_INET; conn_addr.sin_addr.s_addr = htonl(INADDR_LOOPBACK); conn_addr.sin_port = htons(port_num); lfd = create_listening_socket(port_num); if (!do_test(lfd, &conn_addr, 0, 0)) passed = 0; if (!do_test(lfd, &conn_addr, SOCK_CLOEXEC, 0)) passed = 0; if (!do_test(lfd, &conn_addr, 0, SOCK_NONBLOCK)) passed = 0; if (!do_test(lfd, &conn_addr, SOCK_CLOEXEC, SOCK_NONBLOCK)) passed = 0; close(lfd); exit(passed ? EXIT_SUCCESS : EXIT_FAILURE); } [mtk.manpages@gmail.com: rewrote changelog, updated test program] Signed-off-by: Ulrich Drepper Tested-by: Michael Kerrisk Acked-by: Michael Kerrisk Cc: Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/unistd_64.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h index 834b2c1d89f..d2e415e6666 100644 --- a/arch/x86/include/asm/unistd_64.h +++ b/arch/x86/include/asm/unistd_64.h @@ -639,8 +639,8 @@ __SYSCALL(__NR_fallocate, sys_fallocate) __SYSCALL(__NR_timerfd_settime, sys_timerfd_settime) #define __NR_timerfd_gettime 287 __SYSCALL(__NR_timerfd_gettime, sys_timerfd_gettime) -#define __NR_paccept 288 -__SYSCALL(__NR_paccept, sys_paccept) +#define __NR_accept4 288 +__SYSCALL(__NR_accept4, sys_accept4) #define __NR_signalfd4 289 __SYSCALL(__NR_signalfd4, sys_signalfd4) #define __NR_eventfd2 290 -- cgit v1.2.3 From 0ca4b6b00113b064c080d26d803d0d7c80fb5dc8 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Thu, 20 Nov 2008 14:09:33 -0700 Subject: x86: Fix interrupt leak due to migration When we migrate an interrupt from one CPU to another, we set the move_in_progress flag and clean up the vectors later once they're not being used. If you're unlucky and call destroy_irq() before the vectors become un-used, the move_in_progress flag is never cleared, which causes the interrupt to become unusable. This was discovered by Jesse Brandeburg for whom it manifested as an MSI-X device refusing to use MSI-X mode when the driver was unloaded and reloaded repeatedly. Signed-off-by: Matthew Wilcox Signed-off-by: Linus Torvalds --- arch/x86/kernel/io_apic.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/io_apic.c b/arch/x86/kernel/io_apic.c index 7a3f2028e2e..c9513e1ff28 100644 --- a/arch/x86/kernel/io_apic.c +++ b/arch/x86/kernel/io_apic.c @@ -1140,6 +1140,20 @@ static void __clear_irq_vector(int irq) cfg->vector = 0; cpus_clear(cfg->domain); + + if (likely(!cfg->move_in_progress)) + return; + cpus_and(mask, cfg->old_domain, cpu_online_map); + for_each_cpu_mask_nr(cpu, mask) { + for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; + vector++) { + if (per_cpu(vector_irq, cpu)[vector] != irq) + continue; + per_cpu(vector_irq, cpu)[vector] = -1; + break; + } + } + cfg->move_in_progress = 0; } void __setup_vector_irq(int cpu) -- cgit v1.2.3 From 3aeb95d5b7839708a8d8e11aa274ee4d0d4042cc Mon Sep 17 00:00:00 2001 From: jia zhang Date: Sun, 23 Nov 2008 09:51:41 +0800 Subject: x86_64: fix the check in stack_overflow_check Impact: make stack overflow debug check and printout narrower stack_overflow_check() should consider the stack usage of pt_regs, and thus it could warn us in advance. Additionally, it looks better for the warning time to start at INITIAL_JIFFIES. Assuming that rsp gets close to the check point before interrupt arrives: when interrupt really happens, thread_info will be partly overrode. Signed-off-by: jia zhang Signed-off-by: Ingo Molnar --- arch/x86/kernel/irq_64.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index 60eb84eb77a..b842fc82be1 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -29,11 +29,12 @@ static inline void stack_overflow_check(struct pt_regs *regs) { u64 curbase = (u64)task_stack_page(current); - static unsigned long warned = -60*HZ; + static unsigned long warned = INITIAL_JIFFIES - 60*HZ; if (regs->sp >= curbase && regs->sp <= curbase + THREAD_SIZE && - regs->sp < curbase + sizeof(struct thread_info) + 128 && - time_after(jiffies, warned + 60*HZ)) { + regs->sp < curbase + sizeof(struct thread_info) + + sizeof(struct pt_regs) + 128 && + time_after(jiffies, warned + 60*HZ)) { printk("do_IRQ: %s near stack overflow (cur:%Lx,sp:%lx)\n", current->comm, curbase, regs->sp); show_stack(NULL,NULL); -- cgit v1.2.3 From f377fa123d0ec621e8e361ecc3f2a8ee70e81a2e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 23 Nov 2008 09:02:26 +0100 Subject: x86: clean up stack overflow debug check Impact: cleanup Simplify the irq-sampled stack overflow debug check: - eliminate an #idef - use WARN_ONCE() to emit a single warning (all bets are off after the first such warning anyway) Signed-off-by: Ingo Molnar --- arch/x86/kernel/irq_64.c | 25 ++++++++++--------------- 1 file changed, 10 insertions(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index b842fc82be1..1d3d0e71b04 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -18,7 +18,6 @@ #include #include -#ifdef CONFIG_DEBUG_STACKOVERFLOW /* * Probabilistic stack overflow check: * @@ -28,20 +27,18 @@ */ static inline void stack_overflow_check(struct pt_regs *regs) { +#ifdef CONFIG_DEBUG_STACKOVERFLOW u64 curbase = (u64)task_stack_page(current); - static unsigned long warned = INITIAL_JIFFIES - 60*HZ; - - if (regs->sp >= curbase && regs->sp <= curbase + THREAD_SIZE && - regs->sp < curbase + sizeof(struct thread_info) + - sizeof(struct pt_regs) + 128 && - time_after(jiffies, warned + 60*HZ)) { - printk("do_IRQ: %s near stack overflow (cur:%Lx,sp:%lx)\n", - current->comm, curbase, regs->sp); - show_stack(NULL,NULL); - warned = jiffies; - } -} + + WARN_ONCE(regs->sp >= curbase && + regs->sp <= curbase + THREAD_SIZE && + regs->sp < curbase + sizeof(struct thread_info) + + sizeof(struct pt_regs) + 128, + + "do_IRQ: %s near stack overflow (cur:%Lx,sp:%lx)\n", + current->comm, curbase, regs->sp); #endif +} /* * do_IRQ handles all normal device IRQ's (the special @@ -61,9 +58,7 @@ asmlinkage unsigned int do_IRQ(struct pt_regs *regs) irq_enter(); irq = __get_cpu_var(vector_irq)[vector]; -#ifdef CONFIG_DEBUG_STACKOVERFLOW stack_overflow_check(regs); -#endif desc = irq_to_desc(irq); if (likely(desc)) -- cgit v1.2.3 From 5f5db591326779a80cfe490c5d6b6ce9fac08b31 Mon Sep 17 00:00:00 2001 From: jia zhang Date: Sun, 23 Nov 2008 22:47:10 +0800 Subject: x86, debug: remove the confusing entry in call trace Impact: improve backtrace quality avoid the confusion in call trace because of the lack of padding at the tail of function. When do_exit gets called, the return address behind call instruction is pushed into stack. If something get wrong in do_exit, for x86_64, the entry "kernel_execve +0x00/0xXX" rather than "child_rip +0xYY/0xZZ" is in the call trace. That looks confusing, so add a u2d to make the return address still part of the original call site. (This also catches any instances of us returning from that function somehow.) Signed-off-by: jia zhang Acked-by: Alexander van Heukelum Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_32.S | 1 + arch/x86/kernel/entry_64.S | 1 + 2 files changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 28b597ef9ca..f6402c4ba10 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -1051,6 +1051,7 @@ ENTRY(kernel_thread_helper) push %eax CFI_ADJUST_CFA_OFFSET 4 call do_exit + ud2 # padding for call trace CFI_ENDPROC ENDPROC(kernel_thread_helper) diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index ddeeb105258..4a16bf31c78 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1172,6 +1172,7 @@ child_rip: # exit mov %eax, %edi call do_exit + ud2 # padding for call trace CFI_ENDPROC ENDPROC(child_rip) -- cgit v1.2.3