diff options
49 files changed, 755 insertions, 376 deletions
diff --git a/Documentation/debugging-via-ohci1394.txt b/Documentation/debugging-via-ohci1394.txt index de4804e8b39..c360d4e91b4 100644 --- a/Documentation/debugging-via-ohci1394.txt +++ b/Documentation/debugging-via-ohci1394.txt @@ -36,14 +36,15 @@ available (notebooks) or too slow for extensive debug information (like ACPI). Drivers ------- -The OHCI-1394 drivers in drivers/firewire and drivers/ieee1394 initialize -the OHCI-1394 controllers to a working state and can be used to enable -physical DMA. By default you only have to load the driver, and physical -DMA access will be granted to all remote nodes, but it can be turned off -when using the ohci1394 driver. - -Because these drivers depend on the PCI enumeration to be completed, an -initialization routine which can runs pretty early (long before console_init(), +The ohci1394 driver in drivers/ieee1394 initializes the OHCI-1394 controllers +to a working state and enables physical DMA by default for all remote nodes. +This can be turned off by ohci1394's module parameter phys_dma=0. + +The alternative firewire-ohci driver in drivers/firewire uses filtered physical +DMA, hence is not yet suitable for remote debugging. + +Because ohci1394 depends on the PCI enumeration to be completed, an +initialization routine which runs pretty early (long before console_init() which makes the printk buffer appear on the console can be called) was written. To activate it, enable CONFIG_PROVIDE_OHCI1394_DMA_INIT (Kernel hacking menu: diff --git a/arch/um/kernel/process.c b/arch/um/kernel/process.c index fc50d2f959d..e8cb9ff183e 100644 --- a/arch/um/kernel/process.c +++ b/arch/um/kernel/process.c @@ -128,8 +128,6 @@ void *get_current(void) return current; } -extern void schedule_tail(struct task_struct *prev); - /* * This is called magically, by its address being stuffed in a jmp_buf * and being longjmp-d to. diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu index e09a6b73a1a..6d50064db18 100644 --- a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu @@ -377,6 +377,19 @@ config X86_OOSTORE def_bool y depends on (MWINCHIP3D || MWINCHIP2 || MWINCHIPC6) && MTRR +# +# P6_NOPs are a relatively minor optimization that require a family >= +# 6 processor, except that it is broken on certain VIA chips. +# Furthermore, AMD chips prefer a totally different sequence of NOPs +# (which work on all CPUs). As a result, disallow these if we're +# compiling X86_GENERIC but not X86_64 (these NOPs do work on all +# x86-64 capable chips); the list of processors in the right-hand clause +# are the cores that benefit from this optimization. +# +config X86_P6_NOP + def_bool y + depends on (X86_64 || !X86_GENERIC) && (M686 || MPENTIUMII || MPENTIUMIII || MPENTIUMM || MCORE2 || PENTIUM4) + config X86_TSC def_bool y depends on ((MWINCHIP3D || MWINCHIP2 || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MVIAC7 || MGEODEGX1 || MGEODE_LX || MCORE2) && !X86_NUMAQ) || X86_64 @@ -390,6 +403,7 @@ config X86_CMOV config X86_MINIMUM_CPU_FAMILY int default "64" if X86_64 + default "6" if X86_32 && X86_P6_NOP default "4" if X86_32 && (X86_XADD || X86_CMPXCHG || X86_BSWAP || X86_WP_WORKS_OK) default "3" diff --git a/arch/x86/boot/memory.c b/arch/x86/boot/memory.c index 378353956b5..e77d89f9e8a 100644 --- a/arch/x86/boot/memory.c +++ b/arch/x86/boot/memory.c @@ -37,6 +37,12 @@ static int detect_memory_e820(void) "=m" (*desc) : "D" (desc), "d" (SMAP), "a" (0xe820)); + /* BIOSes which terminate the chain with CF = 1 as opposed + to %ebx = 0 don't always report the SMAP signature on + the final, failing, probe. */ + if (err) + break; + /* Some BIOSes stop returning SMAP in the middle of the search loop. We don't know exactly how the BIOS screwed up the map at that point, we might have a @@ -47,9 +53,6 @@ static int detect_memory_e820(void) break; } - if (err) - break; - count++; desc++; } while (next && count < E820MAX); diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index a33d5301799..8ea040124f7 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c @@ -128,13 +128,11 @@ void foo(void) OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending); #endif -#ifdef CONFIG_LGUEST_GUEST +#if defined(CONFIG_LGUEST) || defined(CONFIG_LGUEST_GUEST) || defined(CONFIG_LGUEST_MODULE) BLANK(); OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled); OFFSET(LGUEST_DATA_pgdir, lguest_data, pgdir); -#endif -#ifdef CONFIG_LGUEST BLANK(); OFFSET(LGUEST_PAGES_host_gdt_desc, lguest_pages, state.host_gdt_desc); OFFSET(LGUEST_PAGES_host_idt_desc, lguest_pages, state.host_idt_desc); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index f86a3c4a266..a38aafaefc2 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -504,7 +504,7 @@ void __cpuinit identify_cpu(struct cpuinfo_x86 *c) /* Clear all flags overriden by options */ for (i = 0; i < NCAPINTS; i++) - c->x86_capability[i] ^= cleared_cpu_caps[i]; + c->x86_capability[i] &= ~cleared_cpu_caps[i]; /* Init Machine Check Exception if available. */ mcheck_init(c); diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index b6e136f23d3..be83336fddb 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -43,6 +43,7 @@ #include <asm/uaccess.h> #include <asm/processor.h> #include <asm/msr.h> +#include <asm/kvm_para.h> #include "mtrr.h" u32 num_var_ranges = 0; @@ -649,6 +650,7 @@ static __init int amd_special_default_mtrr(void) /** * mtrr_trim_uncached_memory - trim RAM not covered by MTRRs + * @end_pfn: ending page frame number * * Some buggy BIOSes don't setup the MTRRs properly for systems with certain * memory configurations. This routine checks that the highest MTRR matches @@ -688,8 +690,11 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn) /* kvm/qemu doesn't have mtrr set right, don't trim them all */ if (!highest_pfn) { - printk(KERN_WARNING "WARNING: strange, CPU MTRRs all blank?\n"); - WARN_ON(1); + if (!kvm_para_available()) { + printk(KERN_WARNING + "WARNING: strange, CPU MTRRs all blank?\n"); + WARN_ON(1); + } return 0; } diff --git a/arch/x86/kernel/cpu/transmeta.c b/arch/x86/kernel/cpu/transmeta.c index 200fb3f9ebf..e8b422c1c51 100644 --- a/arch/x86/kernel/cpu/transmeta.c +++ b/arch/x86/kernel/cpu/transmeta.c @@ -76,13 +76,6 @@ static void __cpuinit init_transmeta(struct cpuinfo_x86 *c) /* All Transmeta CPUs have a constant TSC */ set_bit(X86_FEATURE_CONSTANT_TSC, c->x86_capability); - /* If we can run i686 user-space code, call us an i686 */ -#define USER686 ((1 << X86_FEATURE_TSC)|\ - (1 << X86_FEATURE_CX8)|\ - (1 << X86_FEATURE_CMOV)) - if (c->x86 == 5 && (c->x86_capability[0] & USER686) == USER686) - c->x86 = 6; - #ifdef CONFIG_SYSCTL /* randomize_va_space slows us down enormously; it probably triggers retranslation of x86->native bytecode */ diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 2ad9a1bc6a7..c20c9e7e08d 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -453,6 +453,7 @@ ENTRY(stub_execve) CFI_REGISTER rip, r11 SAVE_REST FIXUP_TOP_OF_STACK %r11 + movq %rsp, %rcx call sys_execve RESTORE_TOP_OF_STACK %r11 movq %rax,RAX(%rsp) @@ -1036,15 +1037,16 @@ ENDPROC(child_rip) * rdi: name, rsi: argv, rdx: envp * * We want to fallback into: - * extern long sys_execve(char *name, char **argv,char **envp, struct pt_regs regs) + * extern long sys_execve(char *name, char **argv,char **envp, struct pt_regs *regs) * * do_sys_execve asm fallback arguments: - * rdi: name, rsi: argv, rdx: envp, fake frame on the stack + * rdi: name, rsi: argv, rdx: envp, rcx: fake frame on the stack */ ENTRY(kernel_execve) CFI_STARTPROC FAKE_STACK_FRAME $0 SAVE_ALL + movq %rsp,%rcx call sys_execve movq %rax, RAX(%rsp) RESTORE_REST diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 25eb98540a4..fd8ca53943a 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -606,7 +606,7 @@ ENTRY(_stext) .section ".bss.page_aligned","wa" .align PAGE_SIZE_asm #ifdef CONFIG_X86_PAE -ENTRY(swapper_pg_pmd) +swapper_pg_pmd: .fill 1024*KPMDS,4,0 #else ENTRY(swapper_pg_dir) diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index eb415043a92..a007454133a 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -379,18 +379,24 @@ NEXT_PAGE(level2_ident_pgt) /* Since I easily can, map the first 1G. * Don't set NX because code runs from these pages. */ - PMDS(0x0000000000000000, __PAGE_KERNEL_LARGE_EXEC, PTRS_PER_PMD) + PMDS(0, __PAGE_KERNEL_LARGE_EXEC, PTRS_PER_PMD) NEXT_PAGE(level2_kernel_pgt) - /* 40MB kernel mapping. The kernel code cannot be bigger than that. - When you change this change KERNEL_TEXT_SIZE in page.h too. */ - /* (2^48-(2*1024*1024*1024)-((2^39)*511)-((2^30)*510)) = 0 */ - PMDS(0x0000000000000000, __PAGE_KERNEL_LARGE_EXEC|_PAGE_GLOBAL, KERNEL_TEXT_SIZE/PMD_SIZE) - /* Module mapping starts here */ - .fill (PTRS_PER_PMD - (KERNEL_TEXT_SIZE/PMD_SIZE)),8,0 + /* + * 128 MB kernel mapping. We spend a full page on this pagetable + * anyway. + * + * The kernel code+data+bss must not be bigger than that. + * + * (NOTE: at +128MB starts the module area, see MODULES_VADDR. + * If you want to increase this then increase MODULES_VADDR + * too.) + */ + PMDS(0, __PAGE_KERNEL_LARGE_EXEC|_PAGE_GLOBAL, + KERNEL_IMAGE_SIZE/PMD_SIZE) NEXT_PAGE(level2_spare_pgt) - .fill 512,8,0 + .fill 512, 8, 0 #undef PMDS #undef NEXT_PAGE diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 429d084e014..235fd6c7750 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -368,8 +368,8 @@ static int hpet_clocksource_register(void) return 0; } -/* - * Try to setup the HPET timer +/** + * hpet_enable - Try to setup the HPET timer. Returns 1 on success. */ int __init hpet_enable(void) { diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index b0cc8f0136d..43f287744f9 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -730,16 +730,16 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) */ asmlinkage long sys_execve(char __user *name, char __user * __user *argv, - char __user * __user *envp, struct pt_regs regs) + char __user * __user *envp, struct pt_regs *regs) { long error; char * filename; filename = getname(name); error = PTR_ERR(filename); - if (IS_ERR(filename)) + if (IS_ERR(filename)) return error; - error = do_execve(filename, argv, envp, ®s); + error = do_execve(filename, argv, envp, regs); putname(filename); return error; } diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c index 6fd804f0782..7637dc91c79 100644 --- a/arch/x86/kernel/setup_64.c +++ b/arch/x86/kernel/setup_64.c @@ -1021,7 +1021,7 @@ void __cpuinit identify_cpu(struct cpuinfo_x86 *c) /* Clear all flags overriden by options */ for (i = 0; i < NCAPINTS; i++) - c->x86_capability[i] ^= cleared_cpu_caps[i]; + c->x86_capability[i] &= ~cleared_cpu_caps[i]; #ifdef CONFIG_X86_MCE mcheck_init(c); diff --git a/arch/x86/kernel/smpboot_64.c b/arch/x86/kernel/smpboot_64.c index d53bd6fcb42..0880f2c388a 100644 --- a/arch/x86/kernel/smpboot_64.c +++ b/arch/x86/kernel/smpboot_64.c @@ -554,10 +554,10 @@ static int __cpuinit do_boot_cpu(int cpu, int apicid) int timeout; unsigned long start_rip; struct create_idle c_idle = { - .work = __WORK_INITIALIZER(c_idle.work, do_fork_idle), .cpu = cpu, .done = COMPLETION_INITIALIZER_ONSTACK(c_idle.done), }; + INIT_WORK(&c_idle.work, do_fork_idle); /* allocate memory for gdts of secondary cpus. Hotplug is considered */ if (!cpu_gdt_descr[cpu].address && diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index 02f0f61f5b1..c28c342c162 100644 --- a/arch/x86/kernel/stacktrace.c +++ b/arch/x86/kernel/stacktrace.c @@ -25,6 +25,8 @@ static int save_stack_stack(void *data, char *name) static void save_stack_address(void *data, unsigned long addr, int reliable) { struct stack_trace *trace = data; + if (!reliable) + return; if (trace->skip > 0) { trace->skip--; return; @@ -37,6 +39,8 @@ static void save_stack_address_nosched(void *data, unsigned long addr, int reliable) { struct stack_trace *trace = (struct stack_trace *)data; + if (!reliable) + return; if (in_sched_functions(addr)) return; if (trace->skip > 0) { diff --git a/arch/x86/kernel/tsc_32.c b/arch/x86/kernel/tsc_32.c index 43517e324be..f14cfd9d1f9 100644 --- a/arch/x86/kernel/tsc_32.c +++ b/arch/x86/kernel/tsc_32.c @@ -28,7 +28,8 @@ EXPORT_SYMBOL_GPL(tsc_khz); static int __init tsc_setup(char *str) { printk(KERN_WARNING "notsc: Kernel compiled with CONFIG_X86_TSC, " - "cannot disable TSC.\n"); + "cannot disable TSC completely.\n"); + mark_tsc_unstable("user disabled TSC"); return 1; } #else diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index 3f824277458..b6be812fac0 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -44,11 +44,6 @@ #define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr))) #define __syscall_clobber "r11","cx","memory" -#define __pa_vsymbol(x) \ - ({unsigned long v; \ - extern char __vsyscall_0; \ - asm("" : "=r" (v) : "0" (x)); \ - ((v - VSYSCALL_START) + __pa_symbol(&__vsyscall_0)); }) /* * vsyscall_gtod_data contains data that is : @@ -102,7 +97,7 @@ static __always_inline void do_get_tz(struct timezone * tz) static __always_inline int gettimeofday(struct timeval *tv, struct timezone *tz) { int ret; - asm volatile("vsysc2: syscall" + asm volatile("syscall" : "=a" (ret) : "0" (__NR_gettimeofday),"D" (tv),"S" (tz) : __syscall_clobber ); @@ -112,7 +107,7 @@ static __always_inline int gettimeofday(struct timeval *tv, struct timezone *tz) static __always_inline long time_syscall(long *t) { long secs; - asm volatile("vsysc1: syscall" + asm volatile("syscall" : "=a" (secs) : "0" (__NR_time),"D" (t) : __syscall_clobber); return secs; @@ -227,50 +222,10 @@ long __vsyscall(3) venosys_1(void) } #ifdef CONFIG_SYSCTL - -#define SYSCALL 0x050f -#define NOP2 0x9090 - -/* - * NOP out syscall in vsyscall page when not needed. - */ -static int vsyscall_sysctl_change(ctl_table *ctl, int write, struct file * filp, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - extern u16 vsysc1, vsysc2; - u16 __iomem *map1; - u16 __iomem *map2; - int ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos); - if (!write) - return ret; - /* gcc has some trouble with __va(__pa()), so just do it this - way. */ - map1 = ioremap(__pa_vsymbol(&vsysc1), 2); - if (!map1) - return -ENOMEM; - map2 = ioremap(__pa_vsymbol(&vsysc2), 2); - if (!map2) { - ret = -ENOMEM; - goto out; - } - if (!vsyscall_gtod_data.sysctl_enabled) { - writew(SYSCALL, map1); - writew(SYSCALL, map2); - } else { - writew(NOP2, map1); - writew(NOP2, map2); - } - iounmap(map2); -out: - iounmap(map1); - return ret; -} - static ctl_table kernel_table2[] = { { .procname = "vsyscall64", .data = &vsyscall_gtod_data.sysctl_enabled, .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = vsyscall_sysctl_change }, + .mode = 0644 }, {} }; @@ -279,7 +234,6 @@ static ctl_table kernel_root_table2[] = { .child = kernel_table2 }, {} }; - #endif /* Assume __initcall executes before all user space. Hopefully kmod diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index 5afdde4895d..cccb38a5965 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -57,6 +57,7 @@ #include <linux/lguest_launcher.h> #include <linux/virtio_console.h> #include <linux/pm.h> +#include <asm/lguest.h> #include <asm/paravirt.h> #include <asm/param.h> #include <asm/page.h> @@ -75,15 +76,6 @@ * behaving in simplified but equivalent ways. In particular, the Guest is the * same kernel as the Host (or at least, built from the same source code). :*/ -/* Declarations for definitions in lguest_guest.S */ -extern char lguest_noirq_start[], lguest_noirq_end[]; -extern const char lgstart_cli[], lgend_cli[]; -extern const char lgstart_sti[], lgend_sti[]; -extern const char lgstart_popf[], lgend_popf[]; -extern const char lgstart_pushf[], lgend_pushf[]; -extern const char lgstart_iret[], lgend_iret[]; -extern void lguest_iret(void); - struct lguest_data lguest_data = { .hcall_status = { [0 ... LHCALL_RING_SIZE-1] = 0xFF }, .noirq_start = (u32)lguest_noirq_start, @@ -489,7 +481,7 @@ static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval) { *pmdp = pmdval; lazy_hcall(LHCALL_SET_PMD, __pa(pmdp)&PAGE_MASK, - (__pa(pmdp)&(PAGE_SIZE-1))/4, 0); + (__pa(pmdp)&(PAGE_SIZE-1)), 0); } /* There are a couple of legacy places where the kernel sets a PTE, but we diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index bb652f5a93f..a02a14f0f32 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -172,8 +172,9 @@ set_pte_phys(unsigned long vaddr, unsigned long phys, pgprot_t prot) } /* - * The head.S code sets up the kernel high mapping from: - * __START_KERNEL_map to __START_KERNEL_map + KERNEL_TEXT_SIZE + * The head.S code sets up the kernel high mapping: + * + * from __START_KERNEL_map to __START_KERNEL_map + size (== _end-_text) * * phys_addr holds the negative offset to the kernel, which is added * to the compile time generated pmds. This results in invalid pmds up @@ -515,14 +516,6 @@ void __init mem_init(void) /* clear_bss() already clear the empty_zero_page */ - /* temporary debugging - double check it's true: */ - { - int i; - - for (i = 0; i < 1024; i++) - WARN_ON_ONCE(empty_zero_page[i]); - } - reservedpages = 0; /* this will put all low memory onto the freelists */ diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 464d8fc21ce..14e48b5a94b 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -44,6 +44,12 @@ static inline unsigned long highmap_end_pfn(void) #endif +#ifdef CONFIG_DEBUG_PAGEALLOC +# define debug_pagealloc 1 +#else +# define debug_pagealloc 0 +#endif + static inline int within(unsigned long addr, unsigned long start, unsigned long end) { @@ -355,45 +361,48 @@ out_unlock: static LIST_HEAD(page_pool); static unsigned long pool_size, pool_pages, pool_low; -static unsigned long pool_used, pool_failed, pool_refill; +static unsigned long pool_used, pool_failed; -static void cpa_fill_pool(void) +static void cpa_fill_pool(struct page **ret) { - struct page *p; gfp_t gfp = GFP_KERNEL; + unsigned long flags; + struct page *p; - /* Do not allocate from interrupt context */ - if (in_irq() || irqs_disabled()) - return; /* - * Check unlocked. I does not matter when we have one more - * page in the pool. The bit lock avoids recursive pool - * allocations: + * Avoid recursion (on debug-pagealloc) and also signal + * our priority to get to these pagetables: */ - if (pool_pages >= pool_size || test_and_set_bit_lock(0, &pool_refill)) + if (current->flags & PF_MEMALLOC) return; + current->flags |= PF_MEMALLOC; -#ifdef CONFIG_DEBUG_PAGEALLOC /* - * We could do: - * gfp = in_atomic() ? GFP_ATOMIC : GFP_KERNEL; - * but this fails on !PREEMPT kernels + * Allocate atomically from atomic contexts: */ - gfp = GFP_ATOMIC | __GFP_NORETRY | __GFP_NOWARN; -#endif + if (in_atomic() || irqs_disabled() || debug_pagealloc) + gfp = GFP_ATOMIC | __GFP_NORETRY | __GFP_NOWARN; - while (pool_pages < pool_size) { + while (pool_pages < pool_size || (ret && !*ret)) { p = alloc_pages(gfp, 0); if (!p) { pool_failed++; break; } - spin_lock_irq(&pgd_lock); + /* + * If the call site needs a page right now, provide it: + */ + if (ret && !*ret) { + *ret = p; + continue; + } + spin_lock_irqsave(&pgd_lock, flags); list_add(&p->lru, &page_pool); pool_pages++; - spin_unlock_irq(&pgd_lock); + spin_unlock_irqrestore(&pgd_lock, flags); } - clear_bit_unlock(0, &pool_refill); + + current->flags &= ~PF_MEMALLOC; } #define SHIFT_MB (20 - PAGE_SHIFT) @@ -414,11 +423,15 @@ void __init cpa_init(void) * GiB. Shift MiB to Gib and multiply the result by * POOL_PAGES_PER_GB: */ - gb = ((si.totalram >> SHIFT_MB) + ROUND_MB_GB) >> SHIFT_MB_GB; - pool_size = POOL_PAGES_PER_GB * gb; + if (debug_pagealloc) { + gb = ((si.totalram >> SHIFT_MB) + ROUND_MB_GB) >> SHIFT_MB_GB; + pool_size = POOL_PAGES_PER_GB * gb; + } else { + pool_size = 1; + } pool_low = pool_size; - cpa_fill_pool(); + cpa_fill_pool(NULL); printk(KERN_DEBUG "CPA: page pool initialized %lu of %lu pages preallocated\n", pool_pages, pool_size); @@ -440,16 +453,20 @@ static int split_large_page(pte_t *kpte, unsigned long address) spin_lock_irqsave(&pgd_lock, flags); if (list_empty(&page_pool)) { spin_unlock_irqrestore(&pgd_lock, flags); - return -ENOMEM; + base = NULL; + cpa_fill_pool(&base); + if (!base) + return -ENOMEM; + spin_lock_irqsave(&pgd_lock, flags); + } else { + base = list_first_entry(&page_pool, struct page, lru); + list_del(&base->lru); + pool_pages--; + + if (pool_pages < pool_low) + pool_low = pool_pages; } - base = list_first_entry(&page_pool, struct page, lru); - list_del(&base->lru); - pool_pages--; - - if (pool_pages < pool_low) - pool_low = pool_pages; - /* * Check for races, another CPU might have split this page * up for us already: @@ -734,7 +751,8 @@ static int change_page_attr_set_clr(unsigned long addr, int numpages, cpa_flush_all(cache); out: - cpa_fill_pool(); + cpa_fill_pool(NULL); + return ret; } @@ -897,7 +915,7 @@ void kernel_map_pages(struct page *page, int numpages, int enable) * Try to refill the page pool here. We can do this only after * the tlb flush. */ - cpa_fill_pool(); + cpa_fill_pool(NULL); } #ifdef CONFIG_HIBERNATION diff --git a/arch/x86/vdso/Makefile b/arch/x86/vdso/Makefile index f385a4b4a48..b8bd0c4aa02 100644 --- a/arch/x86/vdso/Makefile +++ b/arch/x86/vdso/Makefile @@ -48,7 +48,7 @@ obj-$(VDSO64-y) += vdso-syms.lds # Match symbols in the DSO that look like VDSO*; produce a file of constants. # sed-vdsosym := -e 's/^00*/0/' \ - -e 's/^\([0-9a-fA-F]*\) . \(VDSO[a-zA-Z0-9_]*\)$$/\2 = 0x\1;/p' + -e 's/^\([[:xdigit:]]*\) . \(VDSO[[:alnum:]_]*\)$$/\2 = 0x\1;/p' quiet_cmd_vdsosym = VDSOSYM $@ cmd_vdsosym = $(NM) $< | sed -n $(sed-vdsosym) | LC_ALL=C sort > $@ diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index fbc24358ada..4fbcce758b0 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -113,7 +113,7 @@ int atapi_enabled = 1; module_param(atapi_enabled, int, 0444); MODULE_PARM_DESC(atapi_enabled, "Enable discovery of ATAPI devices (0=off, 1=on)"); -int atapi_dmadir = 0; +static int atapi_dmadir = 0; module_param(atapi_dmadir, int, 0444); MODULE_PARM_DESC(atapi_dmadir, "Enable ATAPI DMADIR bridge support (0=off, 1=on)"); @@ -6567,6 +6567,8 @@ int ata_host_suspend(struct ata_host *host, pm_message_t mesg) ata_lpm_enable(host); rc = ata_host_request_pm(host, mesg, 0, ATA_EHI_QUIET, 1); + if (rc == 0) + host->dev->power.power_state = mesg; return rc; } @@ -6585,6 +6587,7 @@ void ata_host_resume(struct ata_host *host) { ata_host_request_pm(host, PMSG_ON, ATA_EH_SOFTRESET, ATA_EHI_NO_AUTOPSY | ATA_EHI_QUIET, 0); + host->dev->power.power_state = PMSG_ON; /* reenable link pm */ ata_lpm_disable(host); diff --git a/drivers/ata/libata.h b/drivers/ata/libata.h index 6036dedfe37..aa884f71a12 100644 --- a/drivers/ata/libata.h +++ b/drivers/ata/libata.h @@ -56,7 +56,6 @@ enum { extern unsigned int ata_print_id; extern struct workqueue_struct *ata_aux_wq; extern int atapi_enabled; -extern int atapi_dmadir; extern int atapi_passthru16; extern int libata_fua; extern int libata_noacpi; diff --git a/drivers/firewire/fw-cdev.c b/drivers/firewire/fw-cdev.c index 7e73cbaa412..46bc197a047 100644 --- a/drivers/firewire/fw-cdev.c +++ b/drivers/firewire/fw-cdev.c @@ -109,15 +109,17 @@ static int fw_device_op_open(struct inode *inode, struct file *file) struct client *client; unsigned long flags; - device = fw_device_from_devt(inode->i_rdev); + device = fw_device_get_by_devt(inode->i_rdev); if (device == NULL) return -ENODEV; client = kzalloc(sizeof(*client), GFP_KERNEL); - if (client == NULL) + if (client == NULL) { + fw_device_put(device); return -ENOMEM; + } - client->device = fw_device_get(device); + client->device = device; INIT_LIST_HEAD(&client->event_list); INIT_LIST_HEAD(&client->resource_list); spin_lock_init(&client->lock); @@ -644,6 +646,10 @@ static int ioctl_create_iso_context(struct client *client, void *buffer) struct fw_cdev_create_iso_context *request = buffer; struct fw_iso_context *context; + /* We only support one context at this time. */ + if (client->iso_context != NULL) + return -EBUSY; + if (request->channel > 63) return -EINVAL; @@ -790,8 +796,9 @@ static int ioctl_start_iso(struct client *client, void *buffer) { struct fw_cdev_start_iso *request = buffer; - if (request->handle != 0) + if (client->iso_context == NULL || request->handle != 0) return -EINVAL; + if (client->iso_context->type == FW_ISO_CONTEXT_RECEIVE) { if (request->tags == 0 || request->tags > 15) return -EINVAL; @@ -808,7 +815,7 @@ static int ioctl_stop_iso(struct client *client, void *buffer) { struct fw_cdev_stop_iso *request = buffer; - if (request->handle != 0) + if (client->iso_context == NULL || request->handle != 0) return -EINVAL; return fw_iso_context_stop(client->iso_context); diff --git a/drivers/firewire/fw-device.c b/drivers/firewire/fw-device.c index de9066e69ad..2ab13e0f346 100644 --- a/drivers/firewire/fw-device.c +++ b/drivers/firewire/fw-device.c @@ -358,12 +358,9 @@ static ssize_t guid_show(struct device *dev, struct device_attribute *attr, char *buf) { struct fw_device *device = fw_device(dev); - u64 guid; - guid = ((u64)device->config_rom[3] << 32) | device->config_rom[4]; - - return snprintf(buf, PAGE_SIZE, "0x%016llx\n", - (unsigned long long)guid); + return snprintf(buf, PAGE_SIZE, "0x%08x%08x\n", + device->config_rom[3], device->config_rom[4]); } static struct device_attribute fw_device_attributes[] = { @@ -610,12 +607,14 @@ static DECLARE_RWSEM(idr_rwsem); static DEFINE_IDR(fw_device_idr); int fw_cdev_major; -struct fw_device *fw_device_from_devt(dev_t devt) +struct fw_device *fw_device_get_by_devt(dev_t devt) { struct fw_device *device; down_read(&idr_rwsem); device = idr_find(&fw_device_idr, MINOR(devt)); + if (device) + fw_device_get(device); up_read(&idr_rwsem); return device; @@ -627,13 +626,14 @@ static void fw_device_shutdown(struct work_struct *work) container_of(work, struct fw_device, work.work); int minor = MINOR(device->device.devt); - down_write(&idr_rwsem); - idr_remove(&fw_device_idr, minor); - up_write(&idr_rwsem); - fw_device_cdev_remove(device); device_for_each_child(&device->device, NULL, shutdown_unit); device_unregister(&device->device); + + down_write(&idr_rwsem); + idr_remove(&fw_device_idr, minor); + up_write(&idr_rwsem); + fw_device_put(device); } static struct device_type fw_device_type = { @@ -682,10 +682,13 @@ static void fw_device_init(struct work_struct *work) } err = -ENOMEM; + + fw_device_get(device); down_write(&idr_rwsem); if (idr_pre_get(&fw_device_idr, GFP_KERNEL)) err = idr_get_new(&fw_device_idr, device, &minor); up_write(&idr_rwsem); + if (err < 0) goto error; @@ -717,13 +720,22 @@ static void fw_device_init(struct work_struct *work) */ if (atomic_cmpxchg(&device->state, FW_DEVICE_INITIALIZING, - FW_DEVICE_RUNNING) == FW_DEVICE_SHUTDOWN) + FW_DEVICE_RUNNING) == FW_DEVICE_SHUTDOWN) { fw_device_shutdown(&device->work.work); - else - fw_notify("created new fw device %s " - "(%d config rom retries, S%d00)\n", - device->device.bus_id, device->config_rom_retries, - 1 << device->max_speed); + } else { + if (device->config_rom_retries) + fw_notify("created device %s: GUID %08x%08x, S%d00, " + "%d config ROM retries\n", + device->device.bus_id, + device->config_rom[3], device->config_rom[4], + 1 << device->max_speed, + device->config_rom_retries); + else + fw_notify("created device %s: GUID %08x%08x, S%d00\n", + device->device.bus_id, + device->config_rom[3], device->config_rom[4], + 1 << device->max_speed); + } /* * Reschedule the IRM work if we just finished reading the @@ -741,7 +753,9 @@ static void fw_device_init(struct work_struct *work) idr_remove(&fw_device_idr, minor); up_write(&idr_rwsem); error: - put_device(&device->device); + fw_device_put(device); /* fw_device_idr's reference */ + + put_device(&device->device); /* our reference */ } static int update_unit(struct device *dev, void *data) diff --git a/drivers/firewire/fw-device.h b/drivers/firewire/fw-device.h index 0854fe2bc11..43808c02793 100644 --- a/drivers/firewire/fw-device.h +++ b/drivers/firewire/fw-device.h @@ -77,13 +77,13 @@ fw_device_is_shutdown(struct fw_device *device) } struct fw_device *fw_device_get(struct fw_device *device); +struct fw_device *fw_device_get_by_devt(dev_t devt); void fw_device_put(struct fw_device *device); int fw_device_enable_phys_dma(struct fw_device *device); void fw_device_cdev_update(struct fw_device *device); void fw_device_cdev_remove(struct fw_device *device); -struct fw_device *fw_device_from_devt(dev_t devt); extern int fw_cdev_major; struct fw_unit { diff --git a/drivers/firewire/fw-sbp2.c b/drivers/firewire/fw-sbp2.c index 19ece9b6d74..5259491580f 100644 --- a/drivers/firewire/fw-sbp2.c +++ b/drivers/firewire/fw-sbp2.c @@ -28,14 +28,15 @@ * and many others. */ +#include <linux/blkdev.h> +#include <linux/delay.h> +#include <linux/device.h> +#include <linux/dma-mapping.h> #include <linux/kernel.h> +#include <linux/mod_devicetable.h> #include <linux/module.h> #include <linux/moduleparam.h> -#include <linux/mod_devicetable.h> -#include <linux/device.h> #include <linux/scatterlist.h> -#include <linux/dma-mapping.h> -#include <linux/blkdev.h> #include <linux/string.h> #include <linux/stringify.h> #include <linux/timer.h> @@ -47,9 +48,9 @@ #include <scsi/scsi_device.h> #include <scsi/scsi_host.h> -#include "fw-transaction.h" -#include "fw-topology.h" #include "fw-device.h" +#include "fw-topology.h" +#include "fw-transaction.h" /* * So far only bridges from Oxford Semiconductor are known to support @@ -82,6 +83,9 @@ MODULE_PARM_DESC(exclusive_login, "Exclusive login to sbp2 device " * Avoids access beyond actual disk limits on devices with an off-by-one bug. * Don't use this with devices which don't have this bug. * + * - delay inquiry + * Wait extra SBP2_INQUIRY_DELAY seconds after login before SCSI inquiry. + * * - override internal blacklist * Instead of adding to the built-in blacklist, use only the workarounds * specified in the module load parameter. @@ -91,6 +95,8 @@ MODULE_PARM_DESC(exclusive_login, "Exclusive login to sbp2 device " #define SBP2_WORKAROUND_INQUIRY_36 0x2 #define SBP2_WORKAROUND_MODE_SENSE_8 0x4 #define SBP2_WORKAROUND_FIX_CAPACITY 0x8 +#define SBP2_WORKAROUND_DELAY_INQUIRY 0x10 +#define SBP2_INQUIRY_DELAY 12 #define SBP2_WORKAROUND_OVERRIDE 0x100 static int sbp2_param_workarounds; @@ -100,6 +106,7 @@ MODULE_PARM_DESC(workarounds, "Work around device bugs (default = 0" ", 36 byte inquiry = " __stringify(SBP2_WORKAROUND_INQUIRY_36) ", skip mode page 8 = " __stringify(SBP2_WORKAROUND_MODE_SENSE_8) ", fix capacity = " __stringify(SBP2_WORKAROUND_FIX_CAPACITY) + ", delay inquiry = " __stringify(SBP2_WORKAROUND_DELAY_INQUIRY) ", override internal blacklist = " __stringify(SBP2_WORKAROUND_OVERRIDE) ", or a combination)"); @@ -132,6 +139,7 @@ struct sbp2_logical_unit { int generation; int retries; struct delayed_work work; + bool blocked; }; /* @@ -141,16 +149,18 @@ struct sbp2_logical_unit { struct sbp2_target { struct kref kref; struct fw_unit *unit; + const char *bus_id; + struct list_head lu_list; u64 management_agent_address; int directory_id; int node_id; int address_high; - - unsigned workarounds; - struct list_head lu_list; - + unsigned int workarounds; unsigned int mgt_orb_timeout; + + int dont_block; /* counter for each logical unit */ + int blocked; /* ditto */ }; /* @@ -160,7 +170,7 @@ struct sbp2_target { */ #define SBP2_MIN_LOGIN_ORB_TIMEOUT 5000U /* Timeout in ms */ #define SBP2_MAX_LOGIN_ORB_TIMEOUT 40000U /* Timeout in ms */ -#define SBP2_ORB_TIMEOUT 2000 /* Timeout in ms */ +#define SBP2_ORB_TIMEOUT 2000U /* Timeout in ms */ #define SBP2_ORB_NULL 0x80000000 #define SBP2_MAX_SG_ELEMENT_LENGTH 0xf000 @@ -297,7 +307,7 @@ struct sbp2_command_orb { static const struct { u32 firmware_revision; u32 model; - unsigned workarounds; + unsigned int workarounds; } sbp2_workarounds_table[] = { /* DViCO Momobay CX-1 with TSB42AA9 bridge */ { .firmware_revision = 0x002800, @@ -305,6 +315,11 @@ static const struct { .workarounds = SBP2_WORKAROUND_INQUIRY_36 | SBP2_WORKAROUND_MODE_SENSE_8, }, + /* DViCO Momobay FX-3A with TSB42AA9A bridge */ { + .firmware_revision = 0x002800, + .model = 0x000000, + .workarounds = SBP2_WORKAROUND_DELAY_INQUIRY, + }, /* Initio bridges, actually only needed for some older ones */ { .firmware_revision = 0x000200, .model = ~0, @@ -501,6 +516,9 @@ sbp2_send_management_orb(struct sbp2_logical_unit *lu, int node_id, unsigned int timeout; int retval = -ENOMEM; + if (function == SBP2_LOGOUT_REQUEST && fw_device_is_shutdown(device)) + return 0; + orb = kzalloc(sizeof(*orb), GFP_ATOMIC); if (orb == NULL) return -ENOMEM; @@ -553,20 +571,20 @@ sbp2_send_management_orb(struct sbp2_logical_unit *lu, int node_id, retval = -EIO; if (sbp2_cancel_orbs(lu) == 0) { - fw_error("orb reply timed out, rcode=0x%02x\n", - orb->base.rcode); + fw_error("%s: orb reply timed out, rcode=0x%02x\n", + lu->tgt->bus_id, orb->base.rcode); goto out; } if (orb->base.rcode != RCODE_COMPLETE) { - fw_error("management write failed, rcode 0x%02x\n", - orb->base.rcode); + fw_error("%s: management write failed, rcode 0x%02x\n", + lu->tgt->bus_id, orb->base.rcode); goto out; } if (STATUS_GET_RESPONSE(orb->status) != 0 || STATUS_GET_SBP_STATUS(orb->status) != 0) { - fw_error("error status: %d:%d\n", + fw_error("%s: error status: %d:%d\n", lu->tgt->bus_id, STATUS_GET_RESPONSE(orb->status), STATUS_GET_SBP_STATUS(orb->status)); goto out; @@ -590,29 +608,147 @@ sbp2_send_management_orb(struct sbp2_logical_unit *lu, int node_id, static void complete_agent_reset_write(struct fw_card *card, int rcode, - void *payload, size_t length, void *data) + void *payload, size_t length, void *done) +{ + complete(done); +} + +static void sbp2_agent_reset(struct sbp2_logical_unit *lu) { - struct fw_transaction *t = data; + struct fw_device *device = fw_device(lu->tgt->unit->device.parent); + DECLARE_COMPLETION_ONSTACK(done); + struct fw_transaction t; + static u32 z; + + fw_send_request(device->card, &t, TCODE_WRITE_QUADLET_REQUEST, + lu->tgt->node_id, lu->generation, device->max_speed, + lu->command_block_agent_address + SBP2_AGENT_RESET, + &z, sizeof(z), complete_agent_reset_write, &done); + wait_for_completion(&done); +} - kfree(t); +static void +complete_agent_reset_write_no_wait(struct fw_card *card, int rcode, + void *payload, size_t length, void *data) +{ + kfree(data); } -static int sbp2_agent_reset(struct sbp2_logical_unit *lu) +static void sbp2_agent_reset_no_wait(struct sbp2_logical_unit *lu) { struct fw_device *device = fw_device(lu->tgt->unit->device.parent); struct fw_transaction *t; - static u32 zero; + static u32 z; - t = kzalloc(sizeof(*t), GFP_ATOMIC); + t = kmalloc(sizeof(*t), GFP_ATOMIC); if (t == NULL) - return -ENOMEM; + return; fw_send_request(device->card, t, TCODE_WRITE_QUADLET_REQUEST, lu->tgt->node_id, lu->generation, device->max_speed, lu->command_block_agent_address + SBP2_AGENT_RESET, - &zero, sizeof(zero), complete_agent_reset_write, t); + &z, sizeof(z), complete_agent_reset_write_no_wait, t); +} - return 0; +static void sbp2_set_generation(struct sbp2_logical_unit *lu, int generation) +{ + struct fw_card *card = fw_device(lu->tgt->unit->device.parent)->card; + unsigned long flags; + + /* serialize with comparisons of lu->generation and card->generation */ + spin_lock_irqsave(&card->lock, flags); + lu->generation = generation; + spin_unlock_irqrestore(&card->lock, flags); +} + +static inline void sbp2_allow_block(struct sbp2_logical_unit *lu) +{ + /* + * We may access dont_block without taking card->lock here: + * All callers of sbp2_allow_block() and all callers of sbp2_unblock() + * are currently serialized against each other. + * And a wrong result in sbp2_conditionally_block()'s access of + * dont_block is rather harmless, it simply misses its first chance. + */ + --lu->tgt->dont_block; +} + +/* + * Blocks lu->tgt if all of the following conditions are met: + * - Login, INQUIRY, and high-level SCSI setup of all of the target's + * logical units have been finished (indicated by dont_block == 0). + * - lu->generation is stale. + * + * Note, scsi_block_requests() must be called while holding card->lock, + * otherwise it might foil sbp2_[conditionally_]unblock()'s attempt to + * unblock the target. + */ +static void sbp2_conditionally_block(struct sbp2_logical_unit *lu) +{ + struct sbp2_target *tgt = lu->tgt; + struct fw_card *card = fw_device(tgt->unit->device.parent)->card; + struct Scsi_Host *shost = + container_of((void *)tgt, struct Scsi_Host, hostdata[0]); + unsigned long flags; + + spin_lock_irqsave(&card->lock, flags); + if (!tgt->dont_block && !lu->blocked && + lu->generation != card->generation) { + lu->blocked = true; + if (++tgt->blocked == 1) { + scsi_block_requests(shost); + fw_notify("blocked %s\n", lu->tgt->bus_id); + } + } + spin_unlock_irqrestore(&card->lock, flags); +} + +/* + * Unblocks lu->tgt as soon as all its logical units can be unblocked. + * Note, it is harmless to run scsi_unblock_requests() outside the + * card->lock protected section. On the other hand, running it inside + * the section might clash with shost->host_lock. + */ +static void sbp2_conditionally_unblock(struct sbp2_logical_unit *lu) +{ + struct sbp2_target *tgt = lu->tgt; + struct fw_card *card = fw_device(tgt->unit->device.parent)->card; + struct Scsi_Host *shost = + container_of((void *)tgt, struct Scsi_Host, hostdata[0]); + unsigned long flags; + bool unblock = false; + + spin_lock_irqsave(&card->lock, flags); + if (lu->blocked && lu->generation == card->generation) { + lu->blocked = false; + unblock = --tgt->blocked == 0; + } + spin_unlock_irqrestore(&card->lock, flags); + + if (unblock) { + scsi_unblock_requests(shost); + fw_notify("unblocked %s\n", lu->tgt->bus_id); + } +} + +/* + * Prevents future blocking of tgt and unblocks it. + * Note, it is harmless to run scsi_unblock_requests() outside the + * card->lock protected section. On the other hand, running it inside + * the section might clash with shost->host_lock. + */ +static void sbp2_unblock(struct sbp2_target *tgt) +{ + struct fw_card *card = fw_device(tgt->unit->device.parent)->card; + struct Scsi_Host *shost = + container_of((void *)tgt, struct Scsi_Host, hostdata[0]); + unsigned long flags; + + spin_lock_irqsave(&card->lock, flags); + ++tgt->dont_block; + spin_unlock_irqrestore(&card->lock, flags); + + scsi_unblock_requests(shost); } static void sbp2_release_target(struct kref *kref) @@ -621,23 +757,24 @@ static void sbp2_release_target(struct kref *kref) struct sbp2_logical_unit *lu, *next; struct Scsi_Host *shost = container_of((void *)tgt, struct Scsi_Host, hostdata[0]); - struct fw_device *device = fw_device(tgt->unit->device.parent); + + /* prevent deadlocks */ + sbp2_unblock(tgt); list_for_each_entry_safe(lu, next, &tgt->lu_list, link) { - if (lu->sdev) + if (lu->sdev) { scsi_remove_device(lu->sdev); - - if (!fw_device_is_shutdown(device)) - sbp2_send_management_orb(lu, tgt->node_id, - lu->generation, SBP2_LOGOUT_REQUEST, - lu->login_id, NULL); + scsi_device_put(lu->sdev); + } + sbp2_send_management_orb(lu, tgt->node_id, lu->generation, + SBP2_LOGOUT_REQUEST, lu->login_id, NULL); fw_core_remove_address_handler(&lu->address_handler); list_del(&lu->link); kfree(lu); } scsi_remove_host(shost); - fw_notify("released %s\n", tgt->unit->device.bus_id); + fw_notify("released %s\n", tgt->bus_id); put_device(&tgt->unit->device); scsi_host_put(shost); @@ -666,33 +803,43 @@ static void sbp2_login(struct work_struct *work) { struct sbp2_logical_unit *lu = container_of(work, struct sbp2_logical_unit, work.work); - struct Scsi_Host *shost = - container_of((void *)lu->tgt, struct Scsi_Host, hostdata[0]); + struct sbp2_target *tgt = lu->tgt; + struct fw_device *device = fw_device(tgt->unit->device.parent); + struct Scsi_Host *shost; struct scsi_device *sdev; struct scsi_lun eight_bytes_lun; - struct fw_unit *unit = lu->tgt->unit; - struct fw_device *device = fw_device(unit->device.parent); struct sbp2_login_response response; int generation, node_id, local_node_id; + if (fw_device_is_shutdown(device)) + goto out; + generation = device->generation; smp_rmb(); /* node_id must not be older than generation */ node_id = device->node_id; local_node_id = device->card->node_id; + /* If this is a re-login attempt, log out, or we might be rejected. */ + if (lu->sdev) + sbp2_send_management_orb(lu, device->node_id, generation, + SBP2_LOGOUT_REQUEST, lu->login_id, NULL); + if (sbp2_send_management_orb(lu, node_id, generation, SBP2_LOGIN_REQUEST, lu->lun, &response) < 0) { - if (lu->retries++ < 5) + if (lu->retries++ < 5) { sbp2_queue_work(lu, DIV_ROUND_UP(HZ, 5)); - else - fw_error("failed to login to %s LUN %04x\n", - unit->device.bus_id, lu->lun); + } else { + fw_error("%s: failed to login to LUN %04x\n", + tgt->bus_id, lu->lun); + /* Let any waiting I/O fail from now on. */ + sbp2_unblock(lu->tgt); + } goto out; } - lu->generation = generation; - lu->tgt->node_id = node_id; - lu->tgt->address_high = local_node_id << 16; + tgt->node_id = node_id; + tgt->address_high = local_node_id << 16; + sbp2_set_generation(lu, generation); /* Get command block agent offset and login id. */ lu->command_block_agent_address = @@ -700,8 +847,8 @@ static void sbp2_login(struct work_struct *work) response.command_block_agent.low; lu->login_id = LOGIN_RESPONSE_GET_LOGIN_ID(response); - fw_notify("logged in to %s LUN %04x (%d retries)\n", - unit->device.bus_id, lu->lun, lu->retries); + fw_notify("%s: logged in to LUN %04x (%d retries)\n", + tgt->bus_id, lu->lun, lu->retries); #if 0 /* FIXME: The linux1394 sbp2 does this last step. */ @@ -711,26 +858,62 @@ static void sbp2_login(struct work_struct *work) PREPARE_DELAYED_WORK(&lu->work, sbp2_reconnect); sbp2_agent_reset(lu); + /* This was a re-login. */ + if (lu->sdev) { + sbp2_cancel_orbs(lu); + sbp2_conditionally_unblock(lu); + goto out; + } + + if (lu->tgt->workarounds & SBP2_WORKAROUND_DELAY_INQUIRY) + ssleep(SBP2_INQUIRY_DELAY); + memset(&eight_bytes_lun, 0, sizeof(eight_bytes_lun)); eight_bytes_lun.scsi_lun[0] = (lu->lun >> 8) & 0xff; eight_bytes_lun.scsi_lun[1] = lu->lun & 0xff; + shost = container_of((void *)tgt, struct Scsi_Host, hostdata[0]); sdev = __scsi_add_device(shost, 0, 0, scsilun_to_int(&eight_bytes_lun), lu); - if (IS_ERR(sdev)) { - sbp2_send_management_orb(lu, node_id, generation, - SBP2_LOGOUT_REQUEST, lu->login_id, NULL); - /* - * Set this back to sbp2_login so we fall back and - * retry login on bus reset. - */ - PREPARE_DELAYED_WORK(&lu->work, sbp2_login); - } else { - lu->sdev = sdev; + /* + * FIXME: We are unable to perform reconnects while in sbp2_login(). + * Therefore __scsi_add_device() will get into trouble if a bus reset + * happens in parallel. It will either fail or leave us with an + * unusable sdev. As a workaround we check for this and retry the + * whole login and SCSI probing. + */ + + /* Reported error during __scsi_add_device() */ + if (IS_ERR(sdev)) + goto out_logout_login; + + /* Unreported error during __scsi_add_device() */ + smp_rmb(); /* get current card generation */ + if (generation != device->card->generation) { + scsi_remove_device(sdev); scsi_device_put(sdev); + goto out_logout_login; } + + /* No error during __scsi_add_device() */ + lu->sdev = sdev; + sbp2_allow_block(lu); + goto out; + + out_logout_login: + smp_rmb(); /* generation may have changed */ + generation = device->generation; + smp_rmb(); /* node_id must not be older than generation */ + + sbp2_send_management_orb(lu, device->node_id, generation, + SBP2_LOGOUT_REQUEST, lu->login_id, NULL); + /* + * If a bus reset happened, sbp2_update will have requeued + * lu->work already. Reset the work from reconnect to login. + */ + PREPARE_DELAYED_WORK(&lu->work, sbp2_login); out: - sbp2_target_put(lu->tgt); + sbp2_target_put(tgt); } static int sbp2_add_logical_unit(struct sbp2_target *tgt, int lun_entry) @@ -755,6 +938,8 @@ static int sbp2_add_logical_unit(struct sbp2_target *tgt, int lun_entry) lu->sdev = NULL; lu->lun = lun_entry & 0xffff; lu->retries = 0; + lu->blocked = false; + ++tgt->dont_block; INIT_LIST_HEAD(&lu->orb_list); INIT_DELAYED_WORK(&lu->work, sbp2_login); @@ -813,7 +998,7 @@ static int sbp2_scan_unit_dir(struct sbp2_target *tgt, u32 *directory, if (timeout > tgt->mgt_orb_timeout) fw_notify("%s: config rom contains %ds " "management ORB timeout, limiting " - "to %ds\n", tgt->unit->device.bus_id, + "to %ds\n", tgt->bus_id, timeout / 1000, tgt->mgt_orb_timeout / 1000); break; @@ -836,12 +1021,12 @@ static void sbp2_init_workarounds(struct sbp2_target *tgt, u32 model, u32 firmware_revision) { int i; - unsigned w = sbp2_param_workarounds; + unsigned int w = sbp2_param_workarounds; if (w) fw_notify("Please notify linux1394-devel@lists.sourceforge.net " "if you need the workarounds parameter for %s\n", - tgt->unit->device.bus_id); + tgt->bus_id); if (w & SBP2_WORKAROUND_OVERRIDE) goto out; @@ -863,8 +1048,7 @@ static void sbp2_init_workarounds(struct sbp2_target *tgt, u32 model, if (w) fw_notify("Workarounds for %s: 0x%x " "(firmware_revision 0x%06x, model_id 0x%06x)\n", - tgt->unit->device.bus_id, - w, firmware_revision, model); + tgt->bus_id, w, firmware_revision, model); tgt->workarounds = w; } @@ -888,6 +1072,7 @@ static int sbp2_probe(struct device *dev) tgt->unit = unit; kref_init(&tgt->kref); INIT_LIST_HEAD(&tgt->lu_list); + tgt->bus_id = unit->device.bus_id; if (fw_device_enable_phys_dma(device) < 0) goto fail_shost_put; @@ -938,10 +1123,13 @@ static void sbp2_reconnect(struct work_struct *work) { struct sbp2_logical_unit *lu = container_of(work, struct sbp2_logical_unit, work.work); - struct fw_unit *unit = lu->tgt->unit; - struct fw_device *device = fw_device(unit->device.parent); + struct sbp2_target *tgt = lu->tgt; + struct fw_device *device = fw_device(tgt->unit->device.parent); int generation, node_id, local_node_id; + if (fw_device_is_shutdown(device)) + goto out; + generation = device->generation; smp_rmb(); /* node_id must not be older than generation */ node_id = device->node_id; @@ -950,10 +1138,17 @@ static void sbp2_reconnect(struct work_struct *work) if (sbp2_send_management_orb(lu, node_id, generation, SBP2_RECONNECT_REQUEST, lu->login_id, NULL) < 0) { - if (lu->retries++ >= 5) { - fw_error("failed to reconnect to %s\n", - unit->device.bus_id); - /* Fall back and try to log in again. */ + /* + * If reconnect was impossible even though we are in the + * current generation, fall back and try to log in again. + * + * We could check for "Function rejected" status, but + * looking at the bus generation as simpler and more general. + */ + smp_rmb(); /* get current card generation */ + if (generation == device->card->generation || + lu->retries++ >= 5) { + fw_error("%s: failed to reconnect\n", tgt->bus_id); lu->retries = 0; PREPARE_DELAYED_WORK(&lu->work, sbp2_login); } @@ -961,17 +1156,18 @@ static void sbp2_reconnect(struct work_struct *work) goto out; } - lu->generation = generation; - lu->tgt->node_id = node_id; - lu->tgt->address_high = local_node_id << 16; + tgt->node_id = node_id; + tgt->address_high = local_node_id << 16; + sbp2_set_generation(lu, generation); - fw_notify("reconnected to %s LUN %04x (%d retries)\n", - unit->device.bus_id, lu->lun, lu->retries); + fw_notify("%s: reconnected to LUN %04x (%d retries)\n", + tgt->bus_id, lu->lun, lu->retries); sbp2_agent_reset(lu); sbp2_cancel_orbs(lu); + sbp2_conditionally_unblock(lu); out: - sbp2_target_put(lu->tgt); + sbp2_target_put(tgt); } static void sbp2_update(struct fw_unit *unit) @@ -986,6 +1182,7 @@ static void sbp2_update(struct fw_unit *unit) * Iteration over tgt->lu_list is therefore safe here. */ list_for_each_entry(lu, &tgt->lu_list, link) { + sbp2_conditionally_block(lu); lu->retries = 0; sbp2_queue_work(lu, 0); } @@ -1063,7 +1260,7 @@ complete_command_orb(struct sbp2_orb *base_orb, struct sbp2_status *status) if (status != NULL) { if (STATUS_GET_DEAD(*status)) - sbp2_agent_reset(orb->lu); + sbp2_agent_reset_no_wait(orb->lu); switch (STATUS_GET_RESPONSE(*status)) { case SBP2_STATUS_REQUEST_COMPLETE: @@ -1089,6 +1286,7 @@ complete_command_orb(struct sbp2_orb *base_orb, struct sbp2_status *status) * or when sending the write (less likely). */ result = DID_BUS_BUSY << 16; + sbp2_conditionally_block(orb->lu); } dma_unmap_single(device->card->device, orb->base.request_bus, @@ -1197,7 +1395,7 @@ static int sbp2_scsi_queuecommand(struct scsi_cmnd *cmd, scsi_done_fn_t done) struct sbp2_logical_unit *lu = cmd->device->hostdata; struct fw_device *device = fw_device(lu->tgt->unit->device.parent); struct sbp2_command_orb *orb; - unsigned max_payload; + unsigned int max_payload; int retval = SCSI_MLQUEUE_HOST_BUSY; /* @@ -1275,6 +1473,10 @@ static int sbp2_scsi_slave_alloc(struct scsi_device *sdev) { struct sbp2_logical_unit *lu = sdev->hostdata; + /* (Re-)Adding logical units via the SCSI stack is not supported. */ + if (!lu) + return -ENOSYS; + sdev->allow_restart = 1; /* @@ -1319,7 +1521,7 @@ static int sbp2_scsi_abort(struct scsi_cmnd *cmd) { struct sbp2_logical_unit *lu = cmd->device->hostdata; - fw_notify("sbp2_scsi_abort\n"); + fw_notify("%s: sbp2_scsi_abort\n", lu->tgt->bus_id); sbp2_agent_reset(lu); sbp2_cancel_orbs(lu); diff --git a/drivers/ieee1394/sbp2.c b/drivers/ieee1394/sbp2.c index 28e155a9e2a..9e2b1964d71 100644 --- a/drivers/ieee1394/sbp2.c +++ b/drivers/ieee1394/sbp2.c @@ -183,6 +183,9 @@ MODULE_PARM_DESC(exclusive_login, "Exclusive login to sbp2 device " * Avoids access beyond actual disk limits on devices with an off-by-one bug. * Don't use this with devices which don't have this bug. * + * - delay inquiry + * Wait extra SBP2_INQUIRY_DELAY seconds after login before SCSI inquiry. + * * - override internal blacklist * Instead of adding to the built-in blacklist, use only the workarounds * specified in the module load parameter. @@ -195,6 +198,7 @@ MODULE_PARM_DESC(workarounds, "Work around device bugs (default = 0" ", 36 byte inquiry = " __stringify(SBP2_WORKAROUND_INQUIRY_36) ", skip mode page 8 = " __stringify(SBP2_WORKAROUND_MODE_SENSE_8) ", fix capacity = " __stringify(SBP2_WORKAROUND_FIX_CAPACITY) + ", delay inquiry = " __stringify(SBP2_WORKAROUND_DELAY_INQUIRY) ", override internal blacklist = " __stringify(SBP2_WORKAROUND_OVERRIDE) ", or a combination)"); @@ -357,6 +361,11 @@ static const struct { .workarounds = SBP2_WORKAROUND_INQUIRY_36 | SBP2_WORKAROUND_MODE_SENSE_8, }, + /* DViCO Momobay FX-3A with TSB42AA9A bridge */ { + .firmware_revision = 0x002800, + .model_id = 0x000000, + .workarounds = SBP2_WORKAROUND_DELAY_INQUIRY, + }, /* Initio bridges, actually only needed for some older ones */ { .firmware_revision = 0x000200, .model_id = SBP2_ROM_VALUE_WILDCARD, @@ -914,6 +923,9 @@ static int sbp2_start_device(struct sbp2_lu *lu) sbp2_agent_reset(lu, 1); sbp2_max_speed_and_size(lu); + if (lu->workarounds & SBP2_WORKAROUND_DELAY_INQUIRY) + ssleep(SBP2_INQUIRY_DELAY); + error = scsi_add_device(lu->shost, 0, lu->ud->id, 0); if (error) { SBP2_ERR("scsi_add_device failed"); @@ -1962,6 +1974,9 @@ static int sbp2scsi_slave_alloc(struct scsi_device *sdev) { struct sbp2_lu *lu = (struct sbp2_lu *)sdev->host->hostdata[0]; + if (sdev->lun != 0 || sdev->id != lu->ud->id || sdev->channel != 0) + return -ENODEV; + lu->sdev = sdev; sdev->allow_restart = 1; diff --git a/drivers/ieee1394/sbp2.h b/drivers/ieee1394/sbp2.h index d2ecb0d8a1b..80d8e097b06 100644 --- a/drivers/ieee1394/sbp2.h +++ b/drivers/ieee1394/sbp2.h @@ -343,6 +343,8 @@ enum sbp2lu_state_types { #define SBP2_WORKAROUND_INQUIRY_36 0x2 #define SBP2_WORKAROUND_MODE_SENSE_8 0x4 #define SBP2_WORKAROUND_FIX_CAPACITY 0x8 +#define SBP2_WORKAROUND_DELAY_INQUIRY 0x10 +#define SBP2_INQUIRY_DELAY 12 #define SBP2_WORKAROUND_OVERRIDE 0x100 #endif /* SBP2_H */ diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index 33888bb5814..2c23bade9aa 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -46,7 +46,7 @@ const struct file_operations ext4_dir_operations = { #ifdef CONFIG_COMPAT .compat_ioctl = ext4_compat_ioctl, #endif - .fsync = ext4_sync_file, /* BKL held */ + .fsync = ext4_sync_file, .release = ext4_release_dir, }; diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index bc7081f1fbe..9ae6e67090c 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -148,6 +148,7 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode, { struct ext4_inode_info *ei = EXT4_I(inode); ext4_fsblk_t bg_start; + ext4_fsblk_t last_block; ext4_grpblk_t colour; int depth; @@ -169,8 +170,13 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode, /* OK. use inode's group */ bg_start = (ei->i_block_group * EXT4_BLOCKS_PER_GROUP(inode->i_sb)) + le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_first_data_block); - colour = (current->pid % 16) * + last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1; + + if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block) + colour = (current->pid % 16) * (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16); + else + colour = (current->pid % 16) * ((last_block - bg_start) / 16); return bg_start + colour + block; } @@ -349,7 +355,7 @@ static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path) #define ext4_ext_show_leaf(inode,path) #endif -static void ext4_ext_drop_refs(struct ext4_ext_path *path) +void ext4_ext_drop_refs(struct ext4_ext_path *path) { int depth = path->p_depth; int i; @@ -2168,6 +2174,10 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, newblock = iblock - ee_block + ext_pblock(ex); ex2 = ex; + err = ext4_ext_get_access(handle, inode, path + depth); + if (err) + goto out; + /* ex1: ee_block to iblock - 1 : uninitialized */ if (iblock > ee_block) { ex1 = ex; @@ -2200,16 +2210,20 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, newdepth = ext_depth(inode); if (newdepth != depth) { depth = newdepth; - path = ext4_ext_find_extent(inode, iblock, NULL); + ext4_ext_drop_refs(path); + path = ext4_ext_find_extent(inode, iblock, path); if (IS_ERR(path)) { err = PTR_ERR(path); - path = NULL; goto out; } eh = path[depth].p_hdr; ex = path[depth].p_ext; if (ex2 != &newex) ex2 = ex; + + err = ext4_ext_get_access(handle, inode, path + depth); + if (err) + goto out; } allocated = max_blocks; } @@ -2230,9 +2244,6 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, ex2->ee_len = cpu_to_le16(allocated); if (ex2 != ex) goto insert; - err = ext4_ext_get_access(handle, inode, path + depth); - if (err) - goto out; /* * New (initialized) extent starts from the first block * in the current extent. i.e., ex2 == ex @@ -2276,9 +2287,22 @@ out: } /* + * Block allocation/map/preallocation routine for extents based files + * + * * Need to be called with * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system block * (ie, create is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem) + * + * return > 0, number of of blocks already mapped/allocated + * if create == 0 and these are pre-allocated blocks + * buffer head is unmapped + * otherwise blocks are mapped + * + * return = 0, if plain look up failed (blocks have not been allocated) + * buffer head is unmapped + * + * return < 0, error case. */ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, ext4_lblk_t iblock, @@ -2623,7 +2647,7 @@ long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len) * modify 1 super block, 1 block bitmap and 1 group descriptor. */ credits = EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + 3; - down_write((&EXT4_I(inode)->i_data_sem)); + mutex_lock(&inode->i_mutex); retry: while (ret >= 0 && ret < max_blocks) { block = block + ret; @@ -2634,16 +2658,17 @@ retry: break; } - ret = ext4_ext_get_blocks(handle, inode, block, + ret = ext4_get_blocks_wrap(handle, inode, block, max_blocks, &map_bh, EXT4_CREATE_UNINITIALIZED_EXT, 0); - WARN_ON(ret <= 0); if (ret <= 0) { - ext4_error(inode->i_sb, "ext4_fallocate", - "ext4_ext_get_blocks returned error: " - "inode#%lu, block=%u, max_blocks=%lu", +#ifdef EXT4FS_DEBUG + WARN_ON(ret <= 0); + printk(KERN_ERR "%s: ext4_ext_get_blocks " + "returned error inode#%lu, block=%u, " + "max_blocks=%lu", __func__, inode->i_ino, block, max_blocks); - ret = -EIO; +#endif ext4_mark_inode_dirty(handle, inode); ret2 = ext4_journal_stop(handle); break; @@ -2680,7 +2705,6 @@ retry: if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) goto retry; - up_write((&EXT4_I(inode)->i_data_sem)); /* * Time to update the file size. * Update only when preallocation was requested beyond the file size. @@ -2692,21 +2716,18 @@ retry: * if no error, we assume preallocation succeeded * completely */ - mutex_lock(&inode->i_mutex); i_size_write(inode, offset + len); EXT4_I(inode)->i_disksize = i_size_read(inode); - mutex_unlock(&inode->i_mutex); } else if (ret < 0 && nblocks) { /* Handle partial allocation scenario */ loff_t newsize; - mutex_lock(&inode->i_mutex); newsize = (nblocks << blkbits) + i_size_read(inode); i_size_write(inode, EXT4_BLOCK_ALIGN(newsize, blkbits)); EXT4_I(inode)->i_disksize = i_size_read(inode); - mutex_unlock(&inode->i_mutex); } } + mutex_unlock(&inode->i_mutex); return ret > 0 ? ret2 : ret; } diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index da18a74b966..8036b9b5376 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -702,7 +702,12 @@ got: ei->i_dir_start_lookup = 0; ei->i_disksize = 0; - ei->i_flags = EXT4_I(dir)->i_flags & ~EXT4_INDEX_FL; + /* + * Don't inherit extent flag from directory. We set extent flag on + * newly created directory and file only if -o extent mount option is + * specified + */ + ei->i_flags = EXT4_I(dir)->i_flags & ~(EXT4_INDEX_FL|EXT4_EXTENTS_FL); if (S_ISLNK(mode)) ei->i_flags &= ~(EXT4_IMMUTABLE_FL|EXT4_APPEND_FL); /* dirsync only applies to directories */ @@ -745,12 +750,15 @@ got: goto fail_free_drop; } if (test_opt(sb, EXTENTS)) { - EXT4_I(inode)->i_flags |= EXT4_EXTENTS_FL; - ext4_ext_tree_init(handle, inode); - err = ext4_update_incompat_feature(handle, sb, - EXT4_FEATURE_INCOMPAT_EXTENTS); - if (err) - goto fail; + /* set extent flag only for directory and file */ + if (S_ISDIR(mode) || S_ISREG(mode)) { + EXT4_I(inode)->i_flags |= EXT4_EXTENTS_FL; + ext4_ext_tree_init(handle, inode); + err = ext4_update_incompat_feature(handle, sb, + EXT4_FEATURE_INCOMPAT_EXTENTS); + if (err) + goto fail; + } } ext4_debug("allocating inode %lu\n", inode->i_ino); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 7dd9b50d5eb..945cbf6cb1f 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -403,6 +403,7 @@ static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind) __le32 *start = ind->bh ? (__le32*) ind->bh->b_data : ei->i_data; __le32 *p; ext4_fsblk_t bg_start; + ext4_fsblk_t last_block; ext4_grpblk_t colour; /* Try to find previous block */ @@ -420,8 +421,13 @@ static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind) * into the same cylinder group then. */ bg_start = ext4_group_first_block_no(inode->i_sb, ei->i_block_group); - colour = (current->pid % 16) * + last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1; + + if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block) + colour = (current->pid % 16) * (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16); + else + colour = (current->pid % 16) * ((last_block - bg_start) / 16); return bg_start + colour; } @@ -768,7 +774,6 @@ err_out: * * `handle' can be NULL if create == 0. * - * The BKL may not be held on entry here. Be sure to take it early. * return > 0, # of blocks mapped or allocated. * return = 0, if plain lookup failed. * return < 0, error case. @@ -903,11 +908,38 @@ out: */ #define DIO_CREDITS 25 + +/* + * + * + * ext4_ext4 get_block() wrapper function + * It will do a look up first, and returns if the blocks already mapped. + * Otherwise it takes the write lock of the i_data_sem and allocate blocks + * and store the allocated blocks in the result buffer head and mark it + * mapped. + * + * If file type is extents based, it will call ext4_ext_get_blocks(), + * Otherwise, call with ext4_get_blocks_handle() to handle indirect mapping + * based files + * + * On success, it returns the number of blocks being mapped or allocate. + * if create==0 and the blocks are pre-allocated and uninitialized block, + * the result buffer head is unmapped. If the create ==1, it will make sure + * the buffer head is mapped. + * + * It returns 0 if plain look up failed (blocks have not been allocated), in + * that casem, buffer head is unmapped + * + * It returns the error in case of allocation failure. + */ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block, unsigned long max_blocks, struct buffer_head *bh, int create, int extend_disksize) { int retval; + + clear_buffer_mapped(bh); + /* * Try to see if we can get the block without requesting * for new file system block. @@ -921,12 +953,26 @@ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block, inode, block, max_blocks, bh, 0, 0); } up_read((&EXT4_I(inode)->i_data_sem)); - if (!create || (retval > 0)) + + /* If it is only a block(s) look up */ + if (!create) + return retval; + + /* + * Returns if the blocks have already allocated + * + * Note that if blocks have been preallocated + * ext4_ext_get_block() returns th create = 0 + * with buffer head unmapped. + */ + if (retval > 0 && buffer_mapped(bh)) return retval; /* - * We need to allocate new blocks which will result - * in i_data update + * New blocks allocate and/or writing to uninitialized extent + * will possibly result in updating i_data, so we take + * the write lock of i_data_sem, and call get_blocks() + * with create == 1 flag. */ down_write((&EXT4_I(inode)->i_data_sem)); /* diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index dd0fcfcb35c..ef97f19c2f9 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -627,21 +627,19 @@ static ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb, return block; } +static inline void *mb_correct_addr_and_bit(int *bit, void *addr) +{ #if BITS_PER_LONG == 64 -#define mb_correct_addr_and_bit(bit, addr) \ -{ \ - bit += ((unsigned long) addr & 7UL) << 3; \ - addr = (void *) ((unsigned long) addr & ~7UL); \ -} + *bit += ((unsigned long) addr & 7UL) << 3; + addr = (void *) ((unsigned long) addr & ~7UL); #elif BITS_PER_LONG == 32 -#define mb_correct_addr_and_bit(bit, addr) \ -{ \ - bit += ((unsigned long) addr & 3UL) << 3; \ - addr = (void *) ((unsigned long) addr & ~3UL); \ -} + *bit += ((unsigned long) addr & 3UL) << 3; + addr = (void *) ((unsigned long) addr & ~3UL); #else #error "how many bits you are?!" #endif + return addr; +} static inline int mb_test_bit(int bit, void *addr) { @@ -649,34 +647,54 @@ static inline int mb_test_bit(int bit, void *addr) * ext4_test_bit on architecture like powerpc * needs unsigned long aligned address */ - mb_correct_addr_and_bit(bit, addr); + addr = mb_correct_addr_and_bit(&bit, addr); return ext4_test_bit(bit, addr); } static inline void mb_set_bit(int bit, void *addr) { - mb_correct_addr_and_bit(bit, addr); + addr = mb_correct_addr_and_bit(&bit, addr); ext4_set_bit(bit, addr); } static inline void mb_set_bit_atomic(spinlock_t *lock, int bit, void *addr) { - mb_correct_addr_and_bit(bit, addr); + addr = mb_correct_addr_and_bit(&bit, addr); ext4_set_bit_atomic(lock, bit, addr); } static inline void mb_clear_bit(int bit, void *addr) { - mb_correct_addr_and_bit(bit, addr); + addr = mb_correct_addr_and_bit(&bit, addr); ext4_clear_bit(bit, addr); } static inline void mb_clear_bit_atomic(spinlock_t *lock, int bit, void *addr) { - mb_correct_addr_and_bit(bit, addr); + addr = mb_correct_addr_and_bit(&bit, addr); ext4_clear_bit_atomic(lock, bit, addr); } +static inline int mb_find_next_zero_bit(void *addr, int max, int start) +{ + int fix = 0; + addr = mb_correct_addr_and_bit(&fix, addr); + max += fix; + start += fix; + + return ext4_find_next_zero_bit(addr, max, start) - fix; +} + +static inline int mb_find_next_bit(void *addr, int max, int start) +{ + int fix = 0; + addr = mb_correct_addr_and_bit(&fix, addr); + max += fix; + start += fix; + + return ext4_find_next_bit(addr, max, start) - fix; +} + static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max) { char *bb; @@ -906,7 +924,7 @@ static void ext4_mb_mark_free_simple(struct super_block *sb, unsigned short chunk; unsigned short border; - BUG_ON(len >= EXT4_BLOCKS_PER_GROUP(sb)); + BUG_ON(len > EXT4_BLOCKS_PER_GROUP(sb)); border = 2 << sb->s_blocksize_bits; @@ -946,12 +964,12 @@ static void ext4_mb_generate_buddy(struct super_block *sb, /* initialize buddy from bitmap which is aggregation * of on-disk bitmap and preallocations */ - i = ext4_find_next_zero_bit(bitmap, max, 0); + i = mb_find_next_zero_bit(bitmap, max, 0); grp->bb_first_free = i; while (i < max) { fragments++; first = i; - i = ext4_find_next_bit(bitmap, max, i); + i = mb_find_next_bit(bitmap, max, i); len = i - first; free += len; if (len > 1) @@ -959,7 +977,7 @@ static void ext4_mb_generate_buddy(struct super_block *sb, else grp->bb_counters[0]++; if (i < max) - i = ext4_find_next_zero_bit(bitmap, max, i); + i = mb_find_next_zero_bit(bitmap, max, i); } grp->bb_fragments = fragments; @@ -967,6 +985,10 @@ static void ext4_mb_generate_buddy(struct super_block *sb, ext4_error(sb, __FUNCTION__, "EXT4-fs: group %lu: %u blocks in bitmap, %u in gd\n", group, free, grp->bb_free); + /* + * If we intent to continue, we consider group descritor + * corrupt and update bb_free using bitmap value + */ grp->bb_free = free; } @@ -1778,7 +1800,7 @@ static void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac, buddy = mb_find_buddy(e4b, i, &max); BUG_ON(buddy == NULL); - k = ext4_find_next_zero_bit(buddy, max, 0); + k = mb_find_next_zero_bit(buddy, max, 0); BUG_ON(k >= max); ac->ac_found++; @@ -1818,11 +1840,11 @@ static void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac, i = e4b->bd_info->bb_first_free; while (free && ac->ac_status == AC_STATUS_CONTINUE) { - i = ext4_find_next_zero_bit(bitmap, + i = mb_find_next_zero_bit(bitmap, EXT4_BLOCKS_PER_GROUP(sb), i); if (i >= EXT4_BLOCKS_PER_GROUP(sb)) { /* - * IF we corrupt the bitmap we won't find any + * IF we have corrupt bitmap, we won't find any * free blocks even though group info says we * we have free blocks */ @@ -1838,6 +1860,12 @@ static void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac, ext4_error(sb, __FUNCTION__, "%d free blocks as per " "group info. But got %d blocks\n", free, ex.fe_len); + /* + * The number of free blocks differs. This mostly + * indicate that the bitmap is corrupt. So exit + * without claiming the space. + */ + break; } ext4_mb_measure_extent(ac, &ex, e4b); @@ -3740,10 +3768,10 @@ static int ext4_mb_release_inode_pa(struct ext4_buddy *e4b, } while (bit < end) { - bit = ext4_find_next_zero_bit(bitmap_bh->b_data, end, bit); + bit = mb_find_next_zero_bit(bitmap_bh->b_data, end, bit); if (bit >= end) break; - next = ext4_find_next_bit(bitmap_bh->b_data, end, bit); + next = mb_find_next_bit(bitmap_bh->b_data, end, bit); if (next > end) next = end; start = group * EXT4_BLOCKS_PER_GROUP(sb) + bit + @@ -3771,6 +3799,10 @@ static int ext4_mb_release_inode_pa(struct ext4_buddy *e4b, (unsigned long) pa->pa_len); ext4_error(sb, __FUNCTION__, "free %u, pa_free %u\n", free, pa->pa_free); + /* + * pa is already deleted so we use the value obtained + * from the bitmap and continue. + */ } atomic_add(free, &sbi->s_mb_discarded); if (ac) diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index 8c6c685b9d2..5c1e27de775 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -43,6 +43,7 @@ static int finish_range(handle_t *handle, struct inode *inode, if (IS_ERR(path)) { retval = PTR_ERR(path); + path = NULL; goto err_out; } @@ -74,6 +75,10 @@ static int finish_range(handle_t *handle, struct inode *inode, } retval = ext4_ext_insert_extent(handle, inode, path, &newext); err_out: + if (path) { + ext4_ext_drop_refs(path); + kfree(path); + } lb->first_pblock = 0; return retval; } diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index a9347fb43bc..28aa2ed4297 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1804,12 +1804,8 @@ retry: inode->i_fop = &ext4_dir_operations; inode->i_size = EXT4_I(inode)->i_disksize = inode->i_sb->s_blocksize; dir_block = ext4_bread (handle, inode, 0, 1, &err); - if (!dir_block) { - ext4_dec_count(handle, inode); /* is this nlink == 0? */ - ext4_mark_inode_dirty(handle, inode); - iput (inode); - goto out_stop; - } + if (!dir_block) + goto out_clear_inode; BUFFER_TRACE(dir_block, "get_write_access"); ext4_journal_get_write_access(handle, dir_block); de = (struct ext4_dir_entry_2 *) dir_block->b_data; @@ -1832,7 +1828,8 @@ retry: ext4_mark_inode_dirty(handle, inode); err = ext4_add_entry (handle, dentry, inode); if (err) { - inode->i_nlink = 0; +out_clear_inode: + clear_nlink(inode); ext4_mark_inode_dirty(handle, inode); iput (inode); goto out_stop; @@ -2164,7 +2161,7 @@ static int ext4_unlink(struct inode * dir, struct dentry *dentry) dir->i_ctime = dir->i_mtime = ext4_current_time(dir); ext4_update_dx_flag(dir); ext4_mark_inode_dirty(handle, dir); - ext4_dec_count(handle, inode); + drop_nlink(inode); if (!inode->i_nlink) ext4_orphan_add(handle, inode); inode->i_ctime = ext4_current_time(inode); @@ -2214,7 +2211,7 @@ retry: err = __page_symlink(inode, symname, l, mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS); if (err) { - ext4_dec_count(handle, inode); + clear_nlink(inode); ext4_mark_inode_dirty(handle, inode); iput (inode); goto out_stop; @@ -2223,7 +2220,6 @@ retry: inode->i_op = &ext4_fast_symlink_inode_operations; memcpy((char*)&EXT4_I(inode)->i_data,symname,l); inode->i_size = l-1; - EXT4_I(inode)->i_flags &= ~EXT4_EXTENTS_FL; } EXT4_I(inode)->i_disksize = inode->i_size; err = ext4_add_nondir(handle, dentry, inode); @@ -2407,7 +2403,7 @@ static int ext4_rename (struct inode * old_dir, struct dentry *old_dentry, ext4_dec_count(handle, old_dir); if (new_inode) { /* checked empty_dir above, can't have another parent, - * ext3_dec_count() won't work for many-linked dirs */ + * ext4_dec_count() won't work for many-linked dirs */ new_inode->i_nlink = 0; } else { ext4_inc_count(handle, new_dir); diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 9477a2bd6ff..e29efa0f9d6 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -1037,6 +1037,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, ext4_warning(sb, __FUNCTION__, "multiple resizers run on filesystem!"); unlock_super(sb); + ext4_journal_stop(handle); err = -EBUSY; goto exit_put; } diff --git a/fs/proc/base.c b/fs/proc/base.c index 96ee899d650..91a1bd67ac1 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -314,9 +314,12 @@ static int proc_pid_schedstat(struct task_struct *task, char *buffer) static int lstats_show_proc(struct seq_file *m, void *v) { int i; - struct task_struct *task = m->private; - seq_puts(m, "Latency Top version : v0.1\n"); + struct inode *inode = m->private; + struct task_struct *task = get_proc_task(inode); + if (!task) + return -ESRCH; + seq_puts(m, "Latency Top version : v0.1\n"); for (i = 0; i < 32; i++) { if (task->latency_record[i].backtrace[0]) { int q; @@ -341,32 +344,24 @@ static int lstats_show_proc(struct seq_file *m, void *v) } } + put_task_struct(task); return 0; } static int lstats_open(struct inode *inode, struct file *file) { - int ret; - struct seq_file *m; - struct task_struct *task = get_proc_task(inode); - - ret = single_open(file, lstats_show_proc, NULL); - if (!ret) { - m = file->private_data; - m->private = task; - } - return ret; + return single_open(file, lstats_show_proc, inode); } static ssize_t lstats_write(struct file *file, const char __user *buf, size_t count, loff_t *offs) { - struct seq_file *m; - struct task_struct *task; + struct task_struct *task = get_proc_task(file->f_dentry->d_inode); - m = file->private_data; - task = m->private; + if (!task) + return -ESRCH; clear_all_latency_tracing(task); + put_task_struct(task); return count; } diff --git a/include/asm-x86/futex.h b/include/asm-x86/futex.h index cd9f894dd2d..c9952ea9f69 100644 --- a/include/asm-x86/futex.h +++ b/include/asm-x86/futex.h @@ -102,6 +102,13 @@ futex_atomic_op_inuser(int encoded_op, int __user *uaddr) static inline int futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval, int newval) { + +#if defined(CONFIG_X86_32) && !defined(CONFIG_X86_BSWAP) + /* Real i386 machines have no cmpxchg instruction */ + if (boot_cpu_data.x86 == 3) + return -ENOSYS; +#endif + if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int))) return -EFAULT; diff --git a/include/asm-x86/lguest.h b/include/asm-x86/lguest.h index 4d9367b7297..9b17571e9bc 100644 --- a/include/asm-x86/lguest.h +++ b/include/asm-x86/lguest.h @@ -23,6 +23,17 @@ /* Found in switcher.S */ extern unsigned long default_idt_entries[]; +/* Declarations for definitions in lguest_guest.S */ +extern char lguest_noirq_start[], lguest_noirq_end[]; +extern const char lgstart_cli[], lgend_cli[]; +extern const char lgstart_sti[], lgend_sti[]; +extern const char lgstart_popf[], lgend_popf[]; +extern const char lgstart_pushf[], lgend_pushf[]; +extern const char lgstart_iret[], lgend_iret[]; + +extern void lguest_iret(void); +extern void lguest_init(void); + struct lguest_regs { /* Manually saved part. */ diff --git a/include/asm-x86/nops.h b/include/asm-x86/nops.h index fec025c7f58..e3b2bce0aff 100644 --- a/include/asm-x86/nops.h +++ b/include/asm-x86/nops.h @@ -3,17 +3,29 @@ /* Define nops for use with alternative() */ -/* generic versions from gas */ -#define GENERIC_NOP1 ".byte 0x90\n" -#define GENERIC_NOP2 ".byte 0x89,0xf6\n" -#define GENERIC_NOP3 ".byte 0x8d,0x76,0x00\n" -#define GENERIC_NOP4 ".byte 0x8d,0x74,0x26,0x00\n" -#define GENERIC_NOP5 GENERIC_NOP1 GENERIC_NOP4 -#define GENERIC_NOP6 ".byte 0x8d,0xb6,0x00,0x00,0x00,0x00\n" -#define GENERIC_NOP7 ".byte 0x8d,0xb4,0x26,0x00,0x00,0x00,0x00\n" -#define GENERIC_NOP8 GENERIC_NOP1 GENERIC_NOP7 +/* generic versions from gas + 1: nop + 2: movl %esi,%esi + 3: leal 0x00(%esi),%esi + 4: leal 0x00(,%esi,1),%esi + 6: leal 0x00000000(%esi),%esi + 7: leal 0x00000000(,%esi,1),%esi +*/ +#define GENERIC_NOP1 ".byte 0x90\n" +#define GENERIC_NOP2 ".byte 0x89,0xf6\n" +#define GENERIC_NOP3 ".byte 0x8d,0x76,0x00\n" +#define GENERIC_NOP4 ".byte 0x8d,0x74,0x26,0x00\n" +#define GENERIC_NOP5 GENERIC_NOP1 GENERIC_NOP4 +#define GENERIC_NOP6 ".byte 0x8d,0xb6,0x00,0x00,0x00,0x00\n" +#define GENERIC_NOP7 ".byte 0x8d,0xb4,0x26,0x00,0x00,0x00,0x00\n" +#define GENERIC_NOP8 GENERIC_NOP1 GENERIC_NOP7 -/* Opteron 64bit nops */ +/* Opteron 64bit nops + 1: nop + 2: osp nop + 3: osp osp nop + 4: osp osp osp nop +*/ #define K8_NOP1 GENERIC_NOP1 #define K8_NOP2 ".byte 0x66,0x90\n" #define K8_NOP3 ".byte 0x66,0x66,0x90\n" @@ -23,19 +35,35 @@ #define K8_NOP7 K8_NOP4 K8_NOP3 #define K8_NOP8 K8_NOP4 K8_NOP4 -/* K7 nops */ -/* uses eax dependencies (arbitary choice) */ -#define K7_NOP1 GENERIC_NOP1 +/* K7 nops + uses eax dependencies (arbitary choice) + 1: nop + 2: movl %eax,%eax + 3: leal (,%eax,1),%eax + 4: leal 0x00(,%eax,1),%eax + 6: leal 0x00000000(%eax),%eax + 7: leal 0x00000000(,%eax,1),%eax +*/ +#define K7_NOP1 GENERIC_NOP1 #define K7_NOP2 ".byte 0x8b,0xc0\n" #define K7_NOP3 ".byte 0x8d,0x04,0x20\n" #define K7_NOP4 ".byte 0x8d,0x44,0x20,0x00\n" #define K7_NOP5 K7_NOP4 ASM_NOP1 #define K7_NOP6 ".byte 0x8d,0x80,0,0,0,0\n" -#define K7_NOP7 ".byte 0x8D,0x04,0x05,0,0,0,0\n" -#define K7_NOP8 K7_NOP7 ASM_NOP1 +#define K7_NOP7 ".byte 0x8D,0x04,0x05,0,0,0,0\n" +#define K7_NOP8 K7_NOP7 ASM_NOP1 -/* P6 nops */ -/* uses eax dependencies (Intel-recommended choice) */ +/* P6 nops + uses eax dependencies (Intel-recommended choice) + 1: nop + 2: osp nop + 3: nopl (%eax) + 4: nopl 0x00(%eax) + 5: nopl 0x00(%eax,%eax,1) + 6: osp nopl 0x00(%eax,%eax,1) + 7: nopl 0x00000000(%eax) + 8: nopl 0x00000000(%eax,%eax,1) +*/ #define P6_NOP1 GENERIC_NOP1 #define P6_NOP2 ".byte 0x66,0x90\n" #define P6_NOP3 ".byte 0x0f,0x1f,0x00\n" @@ -63,9 +91,7 @@ #define ASM_NOP6 K7_NOP6 #define ASM_NOP7 K7_NOP7 #define ASM_NOP8 K7_NOP8 -#elif defined(CONFIG_M686) || defined(CONFIG_MPENTIUMII) || \ - defined(CONFIG_MPENTIUMIII) || defined(CONFIG_MPENTIUMM) || \ - defined(CONFIG_MCORE2) || defined(CONFIG_PENTIUM4) +#elif defined(CONFIG_X86_P6_NOP) #define ASM_NOP1 P6_NOP1 #define ASM_NOP2 P6_NOP2 #define ASM_NOP3 P6_NOP3 diff --git a/include/asm-x86/page_64.h b/include/asm-x86/page_64.h index f7393bc516e..143546073b9 100644 --- a/include/asm-x86/page_64.h +++ b/include/asm-x86/page_64.h @@ -47,8 +47,12 @@ #define __PHYSICAL_MASK_SHIFT 46 #define __VIRTUAL_MASK_SHIFT 48 -#define KERNEL_TEXT_SIZE (40*1024*1024) -#define KERNEL_TEXT_START _AC(0xffffffff80000000, UL) +/* + * Kernel image size is limited to 128 MB (see level2_kernel_pgt in + * arch/x86/kernel/head_64.S), and it is mapped here: + */ +#define KERNEL_IMAGE_SIZE (128*1024*1024) +#define KERNEL_IMAGE_START _AC(0xffffffff80000000, UL) #ifndef __ASSEMBLY__ void clear_page(void *page); diff --git a/include/linux/ext4_fs_extents.h b/include/linux/ext4_fs_extents.h index 697da4bce6c..1285c583b2d 100644 --- a/include/linux/ext4_fs_extents.h +++ b/include/linux/ext4_fs_extents.h @@ -227,5 +227,6 @@ extern int ext4_ext_search_left(struct inode *, struct ext4_ext_path *, ext4_lblk_t *, ext4_fsblk_t *); extern int ext4_ext_search_right(struct inode *, struct ext4_ext_path *, ext4_lblk_t *, ext4_fsblk_t *); +extern void ext4_ext_drop_refs(struct ext4_ext_path *); #endif /* _LINUX_EXT4_EXTENTS */ diff --git a/include/linux/sched.h b/include/linux/sched.h index e217d188a10..2c9621f8bf8 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -242,6 +242,7 @@ struct task_struct; extern void sched_init(void); extern void sched_init_smp(void); +extern asmlinkage void schedule_tail(struct task_struct *prev); extern void init_idle(struct task_struct *idle, int cpu); extern void init_idle_bootup_task(struct task_struct *idle); @@ -1189,7 +1190,7 @@ struct task_struct { int softirq_context; #endif #ifdef CONFIG_LOCKDEP -# define MAX_LOCK_DEPTH 30UL +# define MAX_LOCK_DEPTH 48UL u64 curr_chain_key; int lockdep_depth; struct held_lock held_locks[MAX_LOCK_DEPTH]; diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 3574379f4d6..81a4e4a3f08 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -779,6 +779,10 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) * parallel walking of the hash-list safe: */ list_add_tail_rcu(&class->hash_entry, hash_head); + /* + * Add it to the global list of classes: + */ + list_add_tail_rcu(&class->lock_entry, &all_lock_classes); if (verbose(class)) { graph_unlock(); @@ -2282,10 +2286,6 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this, return 0; break; case LOCK_USED: - /* - * Add it to the global list of classes: - */ - list_add_tail_rcu(&this->class->lock_entry, &all_lock_classes); debug_atomic_dec(&nr_unused_locks); break; default: diff --git a/kernel/printk.c b/kernel/printk.c index bee36100f11..9adc2a473e6 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -666,7 +666,7 @@ asmlinkage int vprintk(const char *fmt, va_list args) } /* Emit the output into the temporary buffer */ printed_len += vscnprintf(printk_buf + printed_len, - sizeof(printk_buf), fmt, args); + sizeof(printk_buf) - printed_len, fmt, args); /* * Copy the output into log_buf. If the caller didn't provide diff --git a/kernel/sched.c b/kernel/sched.c index b387a8de26a..f06950c8a6c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -668,6 +668,8 @@ const_debug unsigned int sysctl_sched_nr_migrate = 32; */ unsigned int sysctl_sched_rt_period = 1000000; +static __read_mostly int scheduler_running; + /* * part of the period that we allow rt tasks to run in us. * default: 0.95s @@ -689,14 +691,16 @@ unsigned long long cpu_clock(int cpu) unsigned long flags; struct rq *rq; - local_irq_save(flags); - rq = cpu_rq(cpu); /* * Only call sched_clock() if the scheduler has already been * initialized (some code might call cpu_clock() very early): */ - if (rq->idle) - update_rq_clock(rq); + if (unlikely(!scheduler_running)) + return 0; + + local_irq_save(flags); + rq = cpu_rq(cpu); + update_rq_clock(rq); now = rq->clock; local_irq_restore(flags); @@ -3885,7 +3889,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev) asmlinkage void __sched schedule(void) { struct task_struct *prev, *next; - long *switch_count; + unsigned long *switch_count; struct rq *rq; int cpu; @@ -7284,6 +7288,8 @@ void __init sched_init(void) * During early bootup we pretend to be a normal task: */ current->sched_class = &fair_sched_class; + + scheduler_running = 1; } #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 6c091d6e159..c8e6492c592 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -202,17 +202,12 @@ static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq) static inline struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) { - struct rb_node **link = &cfs_rq->tasks_timeline.rb_node; - struct sched_entity *se = NULL; - struct rb_node *parent; + struct rb_node *last = rb_last(&cfs_rq->tasks_timeline); - while (*link) { - parent = *link; - se = rb_entry(parent, struct sched_entity, run_node); - link = &parent->rb_right; - } + if (!last) + return NULL; - return se; + return rb_entry(last, struct sched_entity, run_node); } /************************************************************** |