Diffstat (limited to 'kernel')
61 files changed, 5841 insertions, 1787 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index 82fb182f6f6..d62ec66c1af 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -8,10 +8,15 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \ signal.o sys.o kmod.o workqueue.o pid.o \ rcupdate.o extable.o params.o posix-timers.o \ kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ - hrtimer.o + hrtimer.o rwsem.o +obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-y += time/ obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o +obj-$(CONFIG_LOCKDEP) += lockdep.o +ifeq ($(CONFIG_PROC_FS),y) +obj-$(CONFIG_LOCKDEP) += lockdep_proc.o +endif obj-$(CONFIG_FUTEX) += futex.o ifeq ($(CONFIG_COMPAT),y) obj-$(CONFIG_FUTEX) += futex_compat.o @@ -22,6 +27,7 @@ obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o obj-$(CONFIG_SMP) += cpu.o spinlock.o obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o +obj-$(CONFIG_PROVE_LOCKING) += spinlock.o obj-$(CONFIG_UID16) += uid16.o obj-$(CONFIG_MODULES) += module.o obj-$(CONFIG_KALLSYMS) += kallsyms.o @@ -42,6 +48,8 @@ obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ obj-$(CONFIG_SECCOMP) += seccomp.o obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o obj-$(CONFIG_RELAY) += relay.o +obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o +obj-$(CONFIG_TASKSTATS) += taskstats.o ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y) # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is diff --git a/kernel/acct.c b/kernel/acct.c index f18e0b8df3e..2a7c933651c 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -488,7 +488,7 @@ static void do_acct_process(struct file *file) old_encode_dev(tty_devnum(current->signal->tty)) : 0; read_unlock(&tasklist_lock); - spin_lock(¤t->sighand->siglock); + spin_lock_irq(¤t->sighand->siglock); ac.ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime))); ac.ac_stime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_stime))); ac.ac_flag = pacct->ac_flag; @@ -496,7 +496,7 @@ static void do_acct_process(struct file *file) ac.ac_minflt = encode_comp_t(pacct->ac_minflt); ac.ac_majflt = encode_comp_t(pacct->ac_majflt); ac.ac_exitcode = pacct->ac_exitcode; - spin_unlock(¤t->sighand->siglock); + spin_unlock_irq(¤t->sighand->siglock); ac.ac_io = encode_comp_t(0 /* current->io_usage */); /* %% */ ac.ac_rw = encode_comp_t(ac.ac_io / 1024); ac.ac_swaps = encode_comp_t(0); diff --git a/kernel/audit.c b/kernel/audit.c index d417ca1db79..963fd15c962 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -690,9 +690,7 @@ static const struct inotify_operations audit_inotify_ops = { /* Initialize audit support at boot time. */ static int __init audit_init(void) { -#ifdef CONFIG_AUDITSYSCALL int i; -#endif printk(KERN_INFO "audit: initializing netlink socket (%s)\n", audit_default ? 
"enabled" : "disabled"); @@ -717,10 +715,10 @@ static int __init audit_init(void) audit_ih = inotify_init(&audit_inotify_ops); if (IS_ERR(audit_ih)) audit_panic("cannot initialize inotify handle"); +#endif for (i = 0; i < AUDIT_INODE_BUCKETS; i++) INIT_LIST_HEAD(&audit_inode_hash[i]); -#endif return 0; } @@ -1030,6 +1028,9 @@ void audit_log_hex(struct audit_buffer *ab, const unsigned char *buf, struct sk_buff *skb; static const unsigned char *hex = "0123456789ABCDEF"; + if (!ab) + return; + BUG_ON(!ab->skb); skb = ab->skb; avail = skb_tailroom(skb); @@ -1062,6 +1063,9 @@ static void audit_log_n_string(struct audit_buffer *ab, size_t slen, unsigned char *ptr; struct sk_buff *skb; + if (!ab) + return; + BUG_ON(!ab->skb); skb = ab->skb; avail = skb_tailroom(skb); diff --git a/kernel/audit.h b/kernel/audit.h index 6aa33b848cf..a3370232a39 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -104,6 +104,7 @@ static inline int audit_hash_ino(u32 ino) return (ino & (AUDIT_INODE_BUCKETS-1)); } +extern int audit_match_class(int class, unsigned syscall); extern int audit_comparator(const u32 left, const u32 op, const u32 right); extern int audit_compare_dname_path(const char *dname, const char *path, int *dirlen); diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 5b4e16276ca..a44879b0c72 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -302,6 +302,15 @@ int __init audit_register_class(int class, unsigned *list) return 0; } +int audit_match_class(int class, unsigned syscall) +{ + if (unlikely(syscall >= AUDIT_BITMASK_SIZE * sizeof(__u32))) + return 0; + if (unlikely(class >= AUDIT_SYSCALL_CLASSES || !classes[class])) + return 0; + return classes[class][AUDIT_WORD(syscall)] & AUDIT_BIT(syscall); +} + /* Common user-space to kernel rule translation. 
*/ static inline struct audit_entry *audit_to_entry_common(struct audit_rule *rule) { @@ -404,6 +413,7 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule) case AUDIT_PERS: case AUDIT_ARCH: case AUDIT_MSGTYPE: + case AUDIT_PPID: case AUDIT_DEVMAJOR: case AUDIT_DEVMINOR: case AUDIT_EXIT: @@ -413,6 +423,10 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule) case AUDIT_ARG2: case AUDIT_ARG3: break; + case AUDIT_PERM: + if (f->val & ~15) + goto exit_free; + break; case AUDIT_INODE: err = audit_to_inode(&entry->rule, f); if (err) @@ -442,6 +456,7 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule) case AUDIT_EQUAL: break; default: + err = -EINVAL; goto exit_free; } } @@ -566,6 +581,10 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, entry->rule.buflen += f->val; entry->rule.filterkey = str; break; + case AUDIT_PERM: + if (f->val & ~15) + goto exit_free; + break; default: goto exit_free; } @@ -579,6 +598,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, case AUDIT_EQUAL: break; default: + err = -EINVAL; goto exit_free; } } @@ -911,7 +931,7 @@ static void audit_update_watch(struct audit_parent *parent, } ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); - audit_log_format(ab, "audit updated rules specifying watch="); + audit_log_format(ab, "audit updated rules specifying path="); audit_log_untrustedstring(ab, owatch->path); audit_log_format(ab, " with dev=%u ino=%lu\n", dev, ino); audit_log_end(ab); @@ -934,19 +954,28 @@ static void audit_remove_parent_watches(struct audit_parent *parent) struct audit_watch *w, *nextw; struct audit_krule *r, *nextr; struct audit_entry *e; + struct audit_buffer *ab; mutex_lock(&audit_filter_mutex); parent->flags |= AUDIT_PARENT_INVALID; list_for_each_entry_safe(w, nextw, &parent->watches, wlist) { list_for_each_entry_safe(r, nextr, &w->rules, rlist) { e = container_of(r, struct audit_entry, rule); + + ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); + audit_log_format(ab, "audit implicitly removed rule path="); + audit_log_untrustedstring(ab, w->path); + if (r->filterkey) { + audit_log_format(ab, " key="); + audit_log_untrustedstring(ab, r->filterkey); + } else + audit_log_format(ab, " key=(null)"); + audit_log_format(ab, " list=%d", r->listnr); + audit_log_end(ab); + list_del(&r->rlist); list_del_rcu(&e->list); call_rcu(&e->rcu, audit_free_rule_rcu); - - audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, - "audit implicitly removed rule from list=%d\n", - AUDIT_FILTER_EXIT); } audit_remove_watch(w); } @@ -1134,6 +1163,14 @@ static inline int audit_add_rule(struct audit_entry *entry, struct audit_watch *watch = entry->rule.watch; struct nameidata *ndp, *ndw; int h, err, putnd_needed = 0; +#ifdef CONFIG_AUDITSYSCALL + int dont_count = 0; + + /* If either of these, don't count towards total */ + if (entry->rule.listnr == AUDIT_FILTER_USER || + entry->rule.listnr == AUDIT_FILTER_TYPE) + dont_count = 1; +#endif if (inode_f) { h = audit_hash_ino(inode_f->val); @@ -1174,6 +1211,10 @@ static inline int audit_add_rule(struct audit_entry *entry, } else { list_add_tail_rcu(&entry->list, list); } +#ifdef CONFIG_AUDITSYSCALL + if (!dont_count) + audit_n_rules++; +#endif mutex_unlock(&audit_filter_mutex); if (putnd_needed) @@ -1198,6 +1239,14 @@ static inline int audit_del_rule(struct audit_entry *entry, struct audit_watch *watch, *tmp_watch = entry->rule.watch; LIST_HEAD(inotify_list); int h, ret = 0; +#ifdef 
CONFIG_AUDITSYSCALL + int dont_count = 0; + + /* If either of these, don't count towards total */ + if (entry->rule.listnr == AUDIT_FILTER_USER || + entry->rule.listnr == AUDIT_FILTER_TYPE) + dont_count = 1; +#endif if (inode_f) { h = audit_hash_ino(inode_f->val); @@ -1235,6 +1284,10 @@ static inline int audit_del_rule(struct audit_entry *entry, list_del_rcu(&e->list); call_rcu(&e->rcu, audit_free_rule_rcu); +#ifdef CONFIG_AUDITSYSCALL + if (!dont_count) + audit_n_rules--; +#endif mutex_unlock(&audit_filter_mutex); if (!list_empty(&inotify_list)) diff --git a/kernel/auditsc.c b/kernel/auditsc.c index ae40ac8c39e..1bd8827a010 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -85,6 +85,9 @@ extern int audit_enabled; /* Indicates that audit should log the full pathname. */ #define AUDIT_NAME_FULL -1 +/* number of audit rules */ +int audit_n_rules; + /* When fs/namei.c:getname() is called, we store the pointer in name and * we don't let putname() free it (instead we free all of the saved * pointers at syscall exit time). @@ -174,6 +177,7 @@ struct audit_aux_data_path { /* The per-task audit context. */ struct audit_context { + int dummy; /* must be the first element */ int in_syscall; /* 1 if task is in a syscall */ enum audit_state state; unsigned int serial; /* serial number for record */ @@ -205,6 +209,54 @@ struct audit_context { #endif }; +#define ACC_MODE(x) ("\004\002\006\006"[(x)&O_ACCMODE]) +static inline int open_arg(int flags, int mask) +{ + int n = ACC_MODE(flags); + if (flags & (O_TRUNC | O_CREAT)) + n |= AUDIT_PERM_WRITE; + return n & mask; +} + +static int audit_match_perm(struct audit_context *ctx, int mask) +{ + unsigned n = ctx->major; + switch (audit_classify_syscall(ctx->arch, n)) { + case 0: /* native */ + if ((mask & AUDIT_PERM_WRITE) && + audit_match_class(AUDIT_CLASS_WRITE, n)) + return 1; + if ((mask & AUDIT_PERM_READ) && + audit_match_class(AUDIT_CLASS_READ, n)) + return 1; + if ((mask & AUDIT_PERM_ATTR) && + audit_match_class(AUDIT_CLASS_CHATTR, n)) + return 1; + return 0; + case 1: /* 32bit on biarch */ + if ((mask & AUDIT_PERM_WRITE) && + audit_match_class(AUDIT_CLASS_WRITE_32, n)) + return 1; + if ((mask & AUDIT_PERM_READ) && + audit_match_class(AUDIT_CLASS_READ_32, n)) + return 1; + if ((mask & AUDIT_PERM_ATTR) && + audit_match_class(AUDIT_CLASS_CHATTR_32, n)) + return 1; + return 0; + case 2: /* open */ + return mask & ACC_MODE(ctx->argv[1]); + case 3: /* openat */ + return mask & ACC_MODE(ctx->argv[2]); + case 4: /* socketcall */ + return ((mask & AUDIT_PERM_WRITE) && ctx->argv[0] == SYS_BIND); + case 5: /* execve */ + return mask & AUDIT_PERM_EXEC; + default: + return 0; + } +} + /* Determine if any context name data matches a rule's watch data */ /* Compare a task_struct with an audit_rule. Return 1 on match, 0 * otherwise. 
*/ @@ -393,6 +445,9 @@ static int audit_filter_rules(struct task_struct *tsk, /* ignore this field for filtering */ result = 1; break; + case AUDIT_PERM: + result = audit_match_perm(ctx, f->val); + break; } if (!result) @@ -514,7 +569,7 @@ static inline struct audit_context *audit_get_context(struct task_struct *tsk, context->return_valid = return_valid; context->return_code = return_code; - if (context->in_syscall && !context->auditable) { + if (context->in_syscall && !context->dummy && !context->auditable) { enum audit_state state; state = audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_EXIT]); @@ -530,17 +585,7 @@ static inline struct audit_context *audit_get_context(struct task_struct *tsk, } get_context: - context->pid = tsk->pid; - context->ppid = sys_getppid(); /* sic. tsk == current in all cases */ - context->uid = tsk->uid; - context->gid = tsk->gid; - context->euid = tsk->euid; - context->suid = tsk->suid; - context->fsuid = tsk->fsuid; - context->egid = tsk->egid; - context->sgid = tsk->sgid; - context->fsgid = tsk->fsgid; - context->personality = tsk->personality; + tsk->audit_context = NULL; return context; } @@ -749,6 +794,17 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts const char *tty; /* tsk == current */ + context->pid = tsk->pid; + context->ppid = sys_getppid(); /* sic. tsk == current in all cases */ + context->uid = tsk->uid; + context->gid = tsk->gid; + context->euid = tsk->euid; + context->suid = tsk->suid; + context->fsuid = tsk->fsuid; + context->egid = tsk->egid; + context->sgid = tsk->sgid; + context->fsgid = tsk->fsgid; + context->personality = tsk->personality; ab = audit_log_start(context, GFP_KERNEL, AUDIT_SYSCALL); if (!ab) @@ -1066,7 +1122,8 @@ void audit_syscall_entry(int arch, int major, context->argv[3] = a4; state = context->state; - if (state == AUDIT_SETUP_CONTEXT || state == AUDIT_BUILD_CONTEXT) + context->dummy = !audit_n_rules; + if (!context->dummy && (state == AUDIT_SETUP_CONTEXT || state == AUDIT_BUILD_CONTEXT)) state = audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_ENTRY]); if (likely(state == AUDIT_DISABLED)) return; @@ -1199,14 +1256,18 @@ void audit_putname(const char *name) #endif } -static void audit_inode_context(int idx, const struct inode *inode) +/* Copy inode data into an audit_names. 
*/ +static void audit_copy_inode(struct audit_names *name, const struct inode *inode) { - struct audit_context *context = current->audit_context; - - selinux_get_inode_sid(inode, &context->names[idx].osid); + name->ino = inode->i_ino; + name->dev = inode->i_sb->s_dev; + name->mode = inode->i_mode; + name->uid = inode->i_uid; + name->gid = inode->i_gid; + name->rdev = inode->i_rdev; + selinux_get_inode_sid(inode, &name->osid); } - /** * audit_inode - store the inode and device from a lookup * @name: name being audited @@ -1240,20 +1301,14 @@ void __audit_inode(const char *name, const struct inode *inode) ++context->ino_count; #endif } - context->names[idx].ino = inode->i_ino; - context->names[idx].dev = inode->i_sb->s_dev; - context->names[idx].mode = inode->i_mode; - context->names[idx].uid = inode->i_uid; - context->names[idx].gid = inode->i_gid; - context->names[idx].rdev = inode->i_rdev; - audit_inode_context(idx, inode); + audit_copy_inode(&context->names[idx], inode); } /** * audit_inode_child - collect inode info for created/removed objects * @dname: inode's dentry name * @inode: inode being audited - * @pino: inode number of dentry parent + * @parent: inode of dentry parent * * For syscalls that create or remove filesystem objects, audit_inode * can only collect information for the filesystem object's parent. @@ -1264,7 +1319,7 @@ void __audit_inode(const char *name, const struct inode *inode) * unsuccessful attempts. */ void __audit_inode_child(const char *dname, const struct inode *inode, - unsigned long pino) + const struct inode *parent) { int idx; struct audit_context *context = current->audit_context; @@ -1278,7 +1333,7 @@ void __audit_inode_child(const char *dname, const struct inode *inode, if (!dname) goto update_context; for (idx = 0; idx < context->name_count; idx++) - if (context->names[idx].ino == pino) { + if (context->names[idx].ino == parent->i_ino) { const char *name = context->names[idx].name; if (!name) @@ -1302,16 +1357,47 @@ update_context: context->names[idx].name_len = AUDIT_NAME_FULL; context->names[idx].name_put = 0; /* don't call __putname() */ - if (inode) { - context->names[idx].ino = inode->i_ino; - context->names[idx].dev = inode->i_sb->s_dev; - context->names[idx].mode = inode->i_mode; - context->names[idx].uid = inode->i_uid; - context->names[idx].gid = inode->i_gid; - context->names[idx].rdev = inode->i_rdev; - audit_inode_context(idx, inode); - } else - context->names[idx].ino = (unsigned long)-1; + if (!inode) + context->names[idx].ino = (unsigned long)-1; + else + audit_copy_inode(&context->names[idx], inode); + + /* A parent was not found in audit_names, so copy the inode data for the + * provided parent. */ + if (!found_name) { + idx = context->name_count++; +#if AUDIT_DEBUG + context->ino_count++; +#endif + audit_copy_inode(&context->names[idx], parent); + } +} + +/** + * audit_inode_update - update inode info for last collected name + * @inode: inode being audited + * + * When open() is called on an existing object with the O_CREAT flag, the inode + * data audit initially collects is incorrect. This additional hook ensures + * audit has the inode data for the actual object to be opened. 
+ */ +void __audit_inode_update(const struct inode *inode) +{ + struct audit_context *context = current->audit_context; + int idx; + + if (!context->in_syscall || !inode) + return; + + if (context->name_count == 0) { + context->name_count++; +#if AUDIT_DEBUG + context->ino_count++; +#endif + } + idx = context->name_count - 1; + + audit_copy_inode(&context->names[idx], inode); } /** @@ -1642,7 +1728,7 @@ int audit_bprm(struct linux_binprm *bprm) unsigned long p, next; void *to; - if (likely(!audit_enabled || !context)) + if (likely(!audit_enabled || !context || context->dummy)) return 0; ax = kmalloc(sizeof(*ax) + PAGE_SIZE * MAX_ARG_PAGES - bprm->p, @@ -1680,7 +1766,7 @@ int audit_socketcall(int nargs, unsigned long *args) struct audit_aux_data_socketcall *ax; struct audit_context *context = current->audit_context; - if (likely(!context)) + if (likely(!context || context->dummy)) return 0; ax = kmalloc(sizeof(*ax) + nargs * sizeof(unsigned long), GFP_KERNEL); @@ -1708,7 +1794,7 @@ int audit_sockaddr(int len, void *a) struct audit_aux_data_sockaddr *ax; struct audit_context *context = current->audit_context; - if (likely(!context)) + if (likely(!context || context->dummy)) return 0; ax = kmalloc(sizeof(*ax) + len, GFP_KERNEL); diff --git a/kernel/capability.c b/kernel/capability.c index 1a4d8a40d3f..c7685ad00a9 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -46,7 +46,7 @@ asmlinkage long sys_capget(cap_user_header_t header, cap_user_data_t dataptr) int ret = 0; pid_t pid; __u32 version; - task_t *target; + struct task_struct *target; struct __user_cap_data_struct data; if (get_user(version, &header->version)) @@ -96,7 +96,7 @@ static inline int cap_set_pg(int pgrp, kernel_cap_t *effective, kernel_cap_t *inheritable, kernel_cap_t *permitted) { - task_t *g, *target; + struct task_struct *g, *target; int ret = -EPERM; int found = 0; @@ -128,7 +128,7 @@ static inline int cap_set_all(kernel_cap_t *effective, kernel_cap_t *inheritable, kernel_cap_t *permitted) { - task_t *g, *target; + struct task_struct *g, *target; int ret = -EPERM; int found = 0; @@ -172,7 +172,7 @@ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data) { kernel_cap_t inheritable, permitted, effective; __u32 version; - task_t *target; + struct task_struct *target; int ret; pid_t pid; diff --git a/kernel/cpu.c b/kernel/cpu.c index 70fbf2e8376..f230f9ae01c 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -16,56 +16,48 @@ #include <linux/mutex.h> /* This protects CPUs going up and down... */ -static DEFINE_MUTEX(cpucontrol); +static DEFINE_MUTEX(cpu_add_remove_lock); +static DEFINE_MUTEX(cpu_bitmask_lock); static __cpuinitdata BLOCKING_NOTIFIER_HEAD(cpu_chain); #ifdef CONFIG_HOTPLUG_CPU -static struct task_struct *lock_cpu_hotplug_owner; -static int lock_cpu_hotplug_depth; -static int __lock_cpu_hotplug(int interruptible) -{ - int ret = 0; - - if (lock_cpu_hotplug_owner != current) { - if (interruptible) - ret = mutex_lock_interruptible(&cpucontrol); - else - mutex_lock(&cpucontrol); - } - - /* - * Set only if we succeed in locking - */ - if (!ret) { - lock_cpu_hotplug_depth++; - lock_cpu_hotplug_owner = current; - } - - return ret; -} +/* Crappy recursive lock-takers in cpufreq! 
Complain loudly about idiots */ +static struct task_struct *recursive; +static int recursive_depth; void lock_cpu_hotplug(void) { - __lock_cpu_hotplug(0); + struct task_struct *tsk = current; + + if (tsk == recursive) { + static int warnings = 10; + if (warnings) { + printk(KERN_ERR "Lukewarm IQ detected in hotplug locking\n"); + WARN_ON(1); + warnings--; + } + recursive_depth++; + return; + } + mutex_lock(&cpu_bitmask_lock); + recursive = tsk; } EXPORT_SYMBOL_GPL(lock_cpu_hotplug); void unlock_cpu_hotplug(void) { - if (--lock_cpu_hotplug_depth == 0) { - lock_cpu_hotplug_owner = NULL; - mutex_unlock(&cpucontrol); + WARN_ON(recursive != current); + if (recursive_depth) { + recursive_depth--; + return; } + mutex_unlock(&cpu_bitmask_lock); + recursive = NULL; } EXPORT_SYMBOL_GPL(unlock_cpu_hotplug); -int lock_cpu_hotplug_interruptible(void) -{ - return __lock_cpu_hotplug(1); -} -EXPORT_SYMBOL_GPL(lock_cpu_hotplug_interruptible); #endif /* CONFIG_HOTPLUG_CPU */ /* Need to know about CPUs going up/down? */ @@ -122,9 +114,7 @@ int cpu_down(unsigned int cpu) struct task_struct *p; cpumask_t old_allowed, tmp; - if ((err = lock_cpu_hotplug_interruptible()) != 0) - return err; - + mutex_lock(&cpu_add_remove_lock); if (num_online_cpus() == 1) { err = -EBUSY; goto out; @@ -150,7 +140,10 @@ int cpu_down(unsigned int cpu) cpu_clear(cpu, tmp); set_cpus_allowed(current, tmp); + mutex_lock(&cpu_bitmask_lock); p = __stop_machine_run(take_cpu_down, NULL, cpu); + mutex_unlock(&cpu_bitmask_lock); + if (IS_ERR(p)) { /* CPU didn't die: tell everyone. Can't complain. */ if (blocking_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED, @@ -187,7 +180,7 @@ out_thread: out_allowed: set_cpus_allowed(current, old_allowed); out: - unlock_cpu_hotplug(); + mutex_unlock(&cpu_add_remove_lock); return err; } #endif /*CONFIG_HOTPLUG_CPU*/ @@ -197,9 +190,7 @@ int __devinit cpu_up(unsigned int cpu) int ret; void *hcpu = (void *)(long)cpu; - if ((ret = lock_cpu_hotplug_interruptible()) != 0) - return ret; - + mutex_lock(&cpu_add_remove_lock); if (cpu_online(cpu) || !cpu_present(cpu)) { ret = -EINVAL; goto out; @@ -214,7 +205,9 @@ int __devinit cpu_up(unsigned int cpu) } /* Arch-specific enabling code. */ + mutex_lock(&cpu_bitmask_lock); ret = __cpu_up(cpu); + mutex_unlock(&cpu_bitmask_lock); if (ret != 0) goto out_notify; BUG_ON(!cpu_online(cpu)); @@ -227,6 +220,6 @@ out_notify: blocking_notifier_call_chain(&cpu_chain, CPU_UP_CANCELED, hcpu); out: - unlock_cpu_hotplug(); + mutex_unlock(&cpu_add_remove_lock); return ret; } diff --git a/kernel/cpuset.c b/kernel/cpuset.c index c232dc07743..4ea6f0dc2fc 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -762,6 +762,8 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial) * * Call with manage_mutex held. May nest a call to the * lock_cpu_hotplug()/unlock_cpu_hotplug() pair. + * Must not be called holding callback_mutex, because we must + * not call lock_cpu_hotplug() while holding callback_mutex. 
*/ static void update_cpu_domains(struct cpuset *cur) @@ -781,7 +783,7 @@ static void update_cpu_domains(struct cpuset *cur) if (is_cpu_exclusive(c)) cpus_andnot(pspan, pspan, c->cpus_allowed); } - if (is_removed(cur) || !is_cpu_exclusive(cur)) { + if (!is_cpu_exclusive(cur)) { cpus_or(pspan, pspan, cur->cpus_allowed); if (cpus_equal(pspan, cur->cpus_allowed)) return; @@ -814,6 +816,10 @@ static int update_cpumask(struct cpuset *cs, char *buf) struct cpuset trialcs; int retval, cpus_unchanged; + /* top_cpuset.cpus_allowed tracks cpu_online_map; it's read-only */ + if (cs == &top_cpuset) + return -EACCES; + trialcs = *cs; retval = cpulist_parse(buf, trialcs.cpus_allowed); if (retval < 0) @@ -1917,6 +1923,17 @@ static int cpuset_mkdir(struct inode *dir, struct dentry *dentry, int mode) return cpuset_create(c_parent, dentry->d_name.name, mode | S_IFDIR); } +/* + * Locking note on the strange update_flag() call below: + * + * If the cpuset being removed is marked cpu_exclusive, then simulate + * turning cpu_exclusive off, which will call update_cpu_domains(). + * The lock_cpu_hotplug() call in update_cpu_domains() must not be + * made while holding callback_mutex. Elsewhere the kernel nests + * callback_mutex inside lock_cpu_hotplug() calls. So the reverse + * nesting would risk an ABBA deadlock. + */ + static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry) { struct cpuset *cs = dentry->d_fsdata; @@ -1936,11 +1953,16 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry) mutex_unlock(&manage_mutex); return -EBUSY; } + if (is_cpu_exclusive(cs)) { + int retval = update_flag(CS_CPU_EXCLUSIVE, cs, "0"); + if (retval < 0) { + mutex_unlock(&manage_mutex); + return retval; + } + } parent = cs->parent; mutex_lock(&callback_mutex); set_bit(CS_REMOVED, &cs->flags); - if (is_cpu_exclusive(cs)) - update_cpu_domains(cs); list_del(&cs->sibling); /* delete my sibling from parent->children */ spin_lock(&cs->dentry->d_lock); d = dget(cs->dentry); @@ -2015,6 +2037,33 @@ out: return err; } +/* + * The top_cpuset tracks what CPUs and Memory Nodes are online, + * period. This is necessary in order to make cpusets transparent + * (of no affect) on systems that are actively using CPU hotplug + * but making no active use of cpusets. + * + * This handles CPU hotplug (cpuhp) events. If someday Memory + * Nodes can be hotplugged (dynamically changing node_online_map) + * then we should handle that too, perhaps in a similar way. + */ + +#ifdef CONFIG_HOTPLUG_CPU +static int cpuset_handle_cpuhp(struct notifier_block *nb, + unsigned long phase, void *cpu) +{ + mutex_lock(&manage_mutex); + mutex_lock(&callback_mutex); + + top_cpuset.cpus_allowed = cpu_online_map; + + mutex_unlock(&callback_mutex); + mutex_unlock(&manage_mutex); + + return 0; +} +#endif + /** * cpuset_init_smp - initialize cpus_allowed * @@ -2025,6 +2074,8 @@ void __init cpuset_init_smp(void) { top_cpuset.cpus_allowed = cpu_online_map; top_cpuset.mems_allowed = node_online_map; + + hotcpu_notifier(cpuset_handle_cpuhp, 0); } /** @@ -2369,7 +2420,7 @@ EXPORT_SYMBOL_GPL(cpuset_mem_spread_node); int cpuset_excl_nodes_overlap(const struct task_struct *p) { const struct cpuset *cs1, *cs2; /* my and p's cpuset ancestors */ - int overlap = 0; /* do cpusets overlap? */ + int overlap = 1; /* do cpusets overlap? 
*/ task_lock(current); if (current->flags & PF_EXITING) { diff --git a/kernel/delayacct.c b/kernel/delayacct.c new file mode 100644 index 00000000000..36752f124c6 --- /dev/null +++ b/kernel/delayacct.c @@ -0,0 +1,162 @@ +/* delayacct.c - per-task delay accounting + * + * Copyright (C) Shailabh Nagar, IBM Corp. 2006 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + */ + +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/time.h> +#include <linux/sysctl.h> +#include <linux/delayacct.h> + +int delayacct_on __read_mostly = 1; /* Delay accounting turned on/off */ +kmem_cache_t *delayacct_cache; + +static int __init delayacct_setup_disable(char *str) +{ + delayacct_on = 0; + return 1; +} +__setup("nodelayacct", delayacct_setup_disable); + +void delayacct_init(void) +{ + delayacct_cache = kmem_cache_create("delayacct_cache", + sizeof(struct task_delay_info), + 0, + SLAB_PANIC, + NULL, NULL); + delayacct_tsk_init(&init_task); +} + +void __delayacct_tsk_init(struct task_struct *tsk) +{ + tsk->delays = kmem_cache_zalloc(delayacct_cache, SLAB_KERNEL); + if (tsk->delays) + spin_lock_init(&tsk->delays->lock); +} + +/* + * Start accounting for a delay statistic using + * its starting timestamp (@start) + */ + +static inline void delayacct_start(struct timespec *start) +{ + do_posix_clock_monotonic_gettime(start); +} + +/* + * Finish delay accounting for a statistic using + * its timestamps (@start, @end), accumalator (@total) and @count + */ + +static void delayacct_end(struct timespec *start, struct timespec *end, + u64 *total, u32 *count) +{ + struct timespec ts; + s64 ns; + + do_posix_clock_monotonic_gettime(end); + ts = timespec_sub(*end, *start); + ns = timespec_to_ns(&ts); + if (ns < 0) + return; + + spin_lock(¤t->delays->lock); + *total += ns; + (*count)++; + spin_unlock(¤t->delays->lock); +} + +void __delayacct_blkio_start(void) +{ + delayacct_start(¤t->delays->blkio_start); +} + +void __delayacct_blkio_end(void) +{ + if (current->delays->flags & DELAYACCT_PF_SWAPIN) + /* Swapin block I/O */ + delayacct_end(¤t->delays->blkio_start, + ¤t->delays->blkio_end, + ¤t->delays->swapin_delay, + ¤t->delays->swapin_count); + else /* Other block I/O */ + delayacct_end(¤t->delays->blkio_start, + ¤t->delays->blkio_end, + ¤t->delays->blkio_delay, + ¤t->delays->blkio_count); +} + +int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) +{ + s64 tmp; + struct timespec ts; + unsigned long t1,t2,t3; + + /* Though tsk->delays accessed later, early exit avoids + * unnecessary returning of other data + */ + if (!tsk->delays) + goto done; + + tmp = (s64)d->cpu_run_real_total; + cputime_to_timespec(tsk->utime + tsk->stime, &ts); + tmp += timespec_to_ns(&ts); + d->cpu_run_real_total = (tmp < (s64)d->cpu_run_real_total) ? 
0 : tmp; + + /* + * No locking available for sched_info (and too expensive to add one) + * Mitigate by taking snapshot of values + */ + t1 = tsk->sched_info.pcnt; + t2 = tsk->sched_info.run_delay; + t3 = tsk->sched_info.cpu_time; + + d->cpu_count += t1; + + jiffies_to_timespec(t2, &ts); + tmp = (s64)d->cpu_delay_total + timespec_to_ns(&ts); + d->cpu_delay_total = (tmp < (s64)d->cpu_delay_total) ? 0 : tmp; + + tmp = (s64)d->cpu_run_virtual_total + (s64)jiffies_to_usecs(t3) * 1000; + d->cpu_run_virtual_total = + (tmp < (s64)d->cpu_run_virtual_total) ? 0 : tmp; + + /* zero XXX_total, non-zero XXX_count implies XXX stat overflowed */ + + spin_lock(&tsk->delays->lock); + tmp = d->blkio_delay_total + tsk->delays->blkio_delay; + d->blkio_delay_total = (tmp < d->blkio_delay_total) ? 0 : tmp; + tmp = d->swapin_delay_total + tsk->delays->swapin_delay; + d->swapin_delay_total = (tmp < d->swapin_delay_total) ? 0 : tmp; + d->blkio_count += tsk->delays->blkio_count; + d->swapin_count += tsk->delays->swapin_count; + spin_unlock(&tsk->delays->lock); + +done: + return 0; +} + +__u64 __delayacct_blkio_ticks(struct task_struct *tsk) +{ + __u64 ret; + + spin_lock(&tsk->delays->lock); + ret = nsec_to_clock_t(tsk->delays->blkio_delay + + tsk->delays->swapin_delay); + spin_unlock(&tsk->delays->lock); + return ret; +} + diff --git a/kernel/exit.c b/kernel/exit.c index 7f7ef225855..d891883420f 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -25,6 +25,8 @@ #include <linux/mount.h> #include <linux/proc_fs.h> #include <linux/mempolicy.h> +#include <linux/taskstats_kern.h> +#include <linux/delayacct.h> #include <linux/cpuset.h> #include <linux/syscalls.h> #include <linux/signal.h> @@ -134,8 +136,8 @@ static void delayed_put_task_struct(struct rcu_head *rhp) void release_task(struct task_struct * p) { + struct task_struct *leader; int zap_leader; - task_t *leader; repeat: atomic_dec(&p->user->processes); write_lock_irq(&tasklist_lock); @@ -209,7 +211,7 @@ out: * * "I ask you, have you ever known what it is to be an orphan?" */ -static int will_become_orphaned_pgrp(int pgrp, task_t *ignored_task) +static int will_become_orphaned_pgrp(int pgrp, struct task_struct *ignored_task) { struct task_struct *p; int ret = 1; @@ -582,7 +584,8 @@ static void exit_mm(struct task_struct * tsk) mmput(mm); } -static inline void choose_new_parent(task_t *p, task_t *reaper) +static inline void +choose_new_parent(struct task_struct *p, struct task_struct *reaper) { /* * Make sure we're not reparenting to ourselves and that @@ -592,7 +595,8 @@ static inline void choose_new_parent(task_t *p, task_t *reaper) p->real_parent = reaper; } -static void reparent_thread(task_t *p, task_t *father, int traced) +static void +reparent_thread(struct task_struct *p, struct task_struct *father, int traced) { /* We don't want people slaying init. 
*/ if (p->exit_signal != -1) @@ -656,8 +660,8 @@ static void reparent_thread(task_t *p, task_t *father, int traced) * group, and if no such member exists, give it to * the global child reaper process (ie "init") */ -static void forget_original_parent(struct task_struct * father, - struct list_head *to_release) +static void +forget_original_parent(struct task_struct *father, struct list_head *to_release) { struct task_struct *p, *reaper = father; struct list_head *_p, *_n; @@ -680,7 +684,7 @@ static void forget_original_parent(struct task_struct * father, */ list_for_each_safe(_p, _n, &father->children) { int ptrace; - p = list_entry(_p,struct task_struct,sibling); + p = list_entry(_p, struct task_struct, sibling); ptrace = p->ptrace; @@ -709,7 +713,7 @@ static void forget_original_parent(struct task_struct * father, list_add(&p->ptrace_list, to_release); } list_for_each_safe(_p, _n, &father->ptrace_children) { - p = list_entry(_p,struct task_struct,ptrace_list); + p = list_entry(_p, struct task_struct, ptrace_list); choose_new_parent(p, reaper); reparent_thread(p, father, 1); } @@ -829,7 +833,7 @@ static void exit_notify(struct task_struct *tsk) list_for_each_safe(_p, _n, &ptrace_dead) { list_del_init(_p); - t = list_entry(_p,struct task_struct,ptrace_list); + t = list_entry(_p, struct task_struct, ptrace_list); release_task(t); } @@ -841,7 +845,9 @@ static void exit_notify(struct task_struct *tsk) fastcall NORET_TYPE void do_exit(long code) { struct task_struct *tsk = current; + struct taskstats *tidstats; int group_dead; + unsigned int mycpu; profile_task_exit(tsk); @@ -879,6 +885,8 @@ fastcall NORET_TYPE void do_exit(long code) current->comm, current->pid, preempt_count()); + taskstats_exit_alloc(&tidstats, &mycpu); + acct_update_integrals(tsk); if (tsk->mm) { update_hiwater_rss(tsk->mm); @@ -898,6 +906,9 @@ fastcall NORET_TYPE void do_exit(long code) #endif if (unlikely(tsk->audit_context)) audit_free(tsk); + taskstats_exit_send(tsk, tidstats, group_dead, mycpu); + taskstats_exit_free(tidstats); + exit_mm(tsk); if (group_dead) @@ -933,10 +944,9 @@ fastcall NORET_TYPE void do_exit(long code) if (unlikely(current->pi_state_cache)) kfree(current->pi_state_cache); /* - * If DEBUG_MUTEXES is on, make sure we are holding no locks: + * Make sure we are holding no locks: */ - mutex_debug_check_no_locks_held(tsk); - rt_mutex_debug_check_no_locks_held(tsk); + debug_check_no_locks_held(tsk); if (tsk->io_context) exit_io_context(); @@ -1011,7 +1021,7 @@ asmlinkage void sys_exit_group(int error_code) do_group_exit((error_code & 0xff) << 8); } -static int eligible_child(pid_t pid, int options, task_t *p) +static int eligible_child(pid_t pid, int options, struct task_struct *p) { if (pid > 0) { if (p->pid != pid) @@ -1043,7 +1053,7 @@ static int eligible_child(pid_t pid, int options, task_t *p) * Do not consider thread group leaders that are * in a non-empty thread group: */ - if (current->tgid != p->tgid && delay_group_leader(p)) + if (delay_group_leader(p)) return 2; if (security_task_wait(p)) @@ -1052,12 +1062,13 @@ static int eligible_child(pid_t pid, int options, task_t *p) return 1; } -static int wait_noreap_copyout(task_t *p, pid_t pid, uid_t uid, +static int wait_noreap_copyout(struct task_struct *p, pid_t pid, uid_t uid, int why, int status, struct siginfo __user *infop, struct rusage __user *rusagep) { int retval = rusagep ? 
getrusage(p, RUSAGE_BOTH, rusagep) : 0; + put_task_struct(p); if (!retval) retval = put_user(SIGCHLD, &infop->si_signo); @@ -1082,7 +1093,7 @@ static int wait_noreap_copyout(task_t *p, pid_t pid, uid_t uid, * the lock and this task is uninteresting. If we return nonzero, we have * released the lock and the system call should return. */ -static int wait_task_zombie(task_t *p, int noreap, +static int wait_task_zombie(struct task_struct *p, int noreap, struct siginfo __user *infop, int __user *stat_addr, struct rusage __user *ru) { @@ -1244,8 +1255,8 @@ static int wait_task_zombie(task_t *p, int noreap, * the lock and this task is uninteresting. If we return nonzero, we have * released the lock and the system call should return. */ -static int wait_task_stopped(task_t *p, int delayed_group_leader, int noreap, - struct siginfo __user *infop, +static int wait_task_stopped(struct task_struct *p, int delayed_group_leader, + int noreap, struct siginfo __user *infop, int __user *stat_addr, struct rusage __user *ru) { int retval, exit_code; @@ -1359,7 +1370,7 @@ bail_ref: * the lock and this task is uninteresting. If we return nonzero, we have * released the lock and the system call should return. */ -static int wait_task_continued(task_t *p, int noreap, +static int wait_task_continued(struct task_struct *p, int noreap, struct siginfo __user *infop, int __user *stat_addr, struct rusage __user *ru) { @@ -1445,7 +1456,7 @@ repeat: int ret; list_for_each(_p,&tsk->children) { - p = list_entry(_p,struct task_struct,sibling); + p = list_entry(_p, struct task_struct, sibling); ret = eligible_child(pid, options, p); if (!ret) diff --git a/kernel/fork.c b/kernel/fork.c index 9064bf9e131..f9b014e3e70 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -43,6 +43,8 @@ #include <linux/rmap.h> #include <linux/acct.h> #include <linux/cn_proc.h> +#include <linux/delayacct.h> +#include <linux/taskstats_kern.h> #include <asm/pgtable.h> #include <asm/pgalloc.h> @@ -61,9 +63,7 @@ int max_threads; /* tunable limit on nr_threads */ DEFINE_PER_CPU(unsigned long, process_counts) = 0; - __cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */ - -EXPORT_SYMBOL(tasklist_lock); +__cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */ int nr_processes(void) { @@ -117,6 +117,7 @@ void __put_task_struct(struct task_struct *tsk) security_task_free(tsk); free_uid(tsk->user); put_group_info(tsk->group_info); + delayacct_tsk_free(tsk); if (!profile_handoff_task(tsk)) free_task(tsk); @@ -193,7 +194,10 @@ static inline int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) down_write(&oldmm->mmap_sem); flush_cache_mm(oldmm); - down_write(&mm->mmap_sem); + /* + * Not linked in yet - no deadlock potential: + */ + down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING); mm->locked_vm = 0; mm->mmap = NULL; @@ -817,6 +821,7 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts if (clone_flags & CLONE_THREAD) { atomic_inc(¤t->signal->count); atomic_inc(¤t->signal->live); + taskstats_tgid_alloc(current->signal); return 0; } sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL); @@ -861,6 +866,7 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts INIT_LIST_HEAD(&sig->cpu_timers[0]); INIT_LIST_HEAD(&sig->cpu_timers[1]); INIT_LIST_HEAD(&sig->cpu_timers[2]); + taskstats_tgid_init(sig); task_lock(current->group_leader); memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim); @@ -882,6 +888,7 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts 
void __cleanup_signal(struct signal_struct *sig) { exit_thread_group_keys(sig); + taskstats_tgid_free(sig); kmem_cache_free(signal_cachep, sig); } @@ -919,10 +926,6 @@ static inline void rt_mutex_init_task(struct task_struct *p) spin_lock_init(&p->pi_lock); plist_head_init(&p->pi_waiters, &p->pi_lock); p->pi_blocked_on = NULL; -# ifdef CONFIG_DEBUG_RT_MUTEXES - spin_lock_init(&p->held_list_lock); - INIT_LIST_HEAD(&p->held_list_head); -# endif #endif } @@ -934,13 +937,13 @@ static inline void rt_mutex_init_task(struct task_struct *p) * parts of the process environment (as per the clone * flags). The actual kick-off is left to the caller. */ -static task_t *copy_process(unsigned long clone_flags, - unsigned long stack_start, - struct pt_regs *regs, - unsigned long stack_size, - int __user *parent_tidptr, - int __user *child_tidptr, - int pid) +static struct task_struct *copy_process(unsigned long clone_flags, + unsigned long stack_start, + struct pt_regs *regs, + unsigned long stack_size, + int __user *parent_tidptr, + int __user *child_tidptr, + int pid) { int retval; struct task_struct *p = NULL; @@ -972,6 +975,10 @@ static task_t *copy_process(unsigned long clone_flags, if (!p) goto fork_out; +#ifdef CONFIG_TRACE_IRQFLAGS + DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled); + DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled); +#endif retval = -EAGAIN; if (atomic_read(&p->user->processes) >= p->signal->rlim[RLIMIT_NPROC].rlim_cur) { @@ -999,12 +1006,13 @@ static task_t *copy_process(unsigned long clone_flags, goto bad_fork_cleanup_put_domain; p->did_exec = 0; + delayacct_tsk_init(p); /* Must remain after dup_task_struct() */ copy_flags(clone_flags, p); p->pid = pid; retval = -EFAULT; if (clone_flags & CLONE_PARENT_SETTID) if (put_user(p->pid, parent_tidptr)) - goto bad_fork_cleanup; + goto bad_fork_cleanup_delays_binfmt; INIT_LIST_HEAD(&p->children); INIT_LIST_HEAD(&p->sibling); @@ -1046,6 +1054,26 @@ static task_t *copy_process(unsigned long clone_flags, } mpol_fix_fork_child_flag(p); #endif +#ifdef CONFIG_TRACE_IRQFLAGS + p->irq_events = 0; + p->hardirqs_enabled = 0; + p->hardirq_enable_ip = 0; + p->hardirq_enable_event = 0; + p->hardirq_disable_ip = _THIS_IP_; + p->hardirq_disable_event = 0; + p->softirqs_enabled = 1; + p->softirq_enable_ip = _THIS_IP_; + p->softirq_enable_event = 0; + p->softirq_disable_ip = 0; + p->softirq_disable_event = 0; + p->hardirq_context = 0; + p->softirq_context = 0; +#endif +#ifdef CONFIG_LOCKDEP + p->lockdep_depth = 0; /* no locks held yet */ + p->curr_chain_key = 0; + p->lockdep_recursion = 0; +#endif rt_mutex_init_task(p); @@ -1250,7 +1278,8 @@ bad_fork_cleanup_policy: bad_fork_cleanup_cpuset: #endif cpuset_exit(p); -bad_fork_cleanup: +bad_fork_cleanup_delays_binfmt: + delayacct_tsk_free(p); if (p->binfmt) module_put(p->binfmt->module); bad_fork_cleanup_put_domain: @@ -1271,9 +1300,9 @@ struct pt_regs * __devinit __attribute__((weak)) idle_regs(struct pt_regs *regs) return regs; } -task_t * __devinit fork_idle(int cpu) +struct task_struct * __devinit fork_idle(int cpu) { - task_t *task; + struct task_struct *task; struct pt_regs regs; task = copy_process(CLONE_VM, 0, idle_regs(®s), 0, NULL, NULL, 0); @@ -1360,8 +1389,10 @@ long do_fork(unsigned long clone_flags, if (clone_flags & CLONE_VFORK) { wait_for_completion(&vfork); - if (unlikely (current->ptrace & PT_TRACE_VFORK_DONE)) + if (unlikely (current->ptrace & PT_TRACE_VFORK_DONE)) { + current->ptrace_message = nr; ptrace_notify ((PTRACE_EVENT_VFORK_DONE << 8) | SIGTRAP); + } } } else { free_pid(pid); diff --git 
a/kernel/futex.c b/kernel/futex.c index 15caf93e4a4..9d260e838cf 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -297,7 +297,7 @@ static int futex_handle_fault(unsigned long address, int attempt) struct vm_area_struct * vma; struct mm_struct *mm = current->mm; - if (attempt >= 2 || !(vma = find_vma(mm, address)) || + if (attempt > 2 || !(vma = find_vma(mm, address)) || vma->vm_start > address || !(vma->vm_flags & VM_WRITE)) return -EFAULT; @@ -397,7 +397,7 @@ static struct task_struct * futex_find_get_task(pid_t pid) p = NULL; goto out_unlock; } - if (p->state == EXIT_ZOMBIE || p->exit_state == EXIT_ZOMBIE) { + if (p->exit_state != 0) { p = NULL; goto out_unlock; } @@ -415,15 +415,15 @@ out_unlock: */ void exit_pi_state_list(struct task_struct *curr) { - struct futex_hash_bucket *hb; struct list_head *next, *head = &curr->pi_state_list; struct futex_pi_state *pi_state; + struct futex_hash_bucket *hb; union futex_key key; /* * We are a ZOMBIE and nobody can enqueue itself on * pi_state_list anymore, but we have to be careful - * versus waiters unqueueing themselfs + * versus waiters unqueueing themselves: */ spin_lock_irq(&curr->pi_lock); while (!list_empty(head)) { @@ -431,21 +431,24 @@ void exit_pi_state_list(struct task_struct *curr) next = head->next; pi_state = list_entry(next, struct futex_pi_state, list); key = pi_state->key; + hb = hash_futex(&key); spin_unlock_irq(&curr->pi_lock); - hb = hash_futex(&key); spin_lock(&hb->lock); spin_lock_irq(&curr->pi_lock); + /* + * We dropped the pi-lock, so re-check whether this + * task still owns the PI-state: + */ if (head->next != next) { spin_unlock(&hb->lock); continue; } - list_del_init(&pi_state->list); - WARN_ON(pi_state->owner != curr); - + WARN_ON(list_empty(&pi_state->list)); + list_del_init(&pi_state->list); pi_state->owner = NULL; spin_unlock_irq(&curr->pi_lock); @@ -470,12 +473,20 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me) head = &hb->chain; list_for_each_entry_safe(this, next, head, list) { - if (match_futex (&this->key, &me->key)) { + if (match_futex(&this->key, &me->key)) { /* * Another waiter already exists - bump up * the refcount and return its pi_state: */ pi_state = this->pi_state; + /* + * Userspace might have messed up non PI and PI futexes + */ + if (unlikely(!pi_state)) + return -EINVAL; + + WARN_ON(!atomic_read(&pi_state->refcount)); + atomic_inc(&pi_state->refcount); me->pi_state = pi_state; @@ -484,10 +495,13 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me) } /* - * We are the first waiter - try to look up the real owner and - * attach the new pi_state to it: + * We are the first waiter - try to look up the real owner and attach + * the new pi_state to it, but bail out when the owner died bit is set + * and TID = 0: */ pid = uval & FUTEX_TID_MASK; + if (!pid && (uval & FUTEX_OWNER_DIED)) + return -ESRCH; p = futex_find_get_task(pid); if (!p) return -ESRCH; @@ -504,6 +518,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me) pi_state->key = me->key; spin_lock_irq(&p->pi_lock); + WARN_ON(!list_empty(&pi_state->list)); list_add(&pi_state->list, &p->pi_state_list); pi_state->owner = p; spin_unlock_irq(&p->pi_lock); @@ -567,20 +582,29 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) * kept enabled while there is PI state around. We must also * preserve the owner died bit.) 
*/ - newval = (uval & FUTEX_OWNER_DIED) | FUTEX_WAITERS | new_owner->pid; + if (!(uval & FUTEX_OWNER_DIED)) { + newval = FUTEX_WAITERS | new_owner->pid; - inc_preempt_count(); - curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval); - dec_preempt_count(); + inc_preempt_count(); + curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval); + dec_preempt_count(); + if (curval == -EFAULT) + return -EFAULT; + if (curval != uval) + return -EINVAL; + } - if (curval == -EFAULT) - return -EFAULT; - if (curval != uval) - return -EINVAL; + spin_lock_irq(&pi_state->owner->pi_lock); + WARN_ON(list_empty(&pi_state->list)); + list_del_init(&pi_state->list); + spin_unlock_irq(&pi_state->owner->pi_lock); - list_del_init(&pi_state->owner->pi_state_list); + spin_lock_irq(&new_owner->pi_lock); + WARN_ON(!list_empty(&pi_state->list)); list_add(&pi_state->list, &new_owner->pi_state_list); pi_state->owner = new_owner; + spin_unlock_irq(&new_owner->pi_lock); + rt_mutex_unlock(&pi_state->pi_mutex); return 0; @@ -607,6 +631,22 @@ static int unlock_futex_pi(u32 __user *uaddr, u32 uval) } /* + * Express the locking dependencies for lockdep: + */ +static inline void +double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2) +{ + if (hb1 <= hb2) { + spin_lock(&hb1->lock); + if (hb1 < hb2) + spin_lock_nested(&hb2->lock, SINGLE_DEPTH_NESTING); + } else { /* hb1 > hb2 */ + spin_lock(&hb2->lock); + spin_lock_nested(&hb1->lock, SINGLE_DEPTH_NESTING); + } +} + +/* * Wake up all waiters hashed on the physical page that is mapped * to this virtual address: */ @@ -674,11 +714,7 @@ retryfull: hb2 = hash_futex(&key2); retry: - if (hb1 < hb2) - spin_lock(&hb1->lock); - spin_lock(&hb2->lock); - if (hb1 > hb2) - spin_lock(&hb1->lock); + double_lock_hb(hb1, hb2); op_ret = futex_atomic_op_inuser(op, uaddr2); if (unlikely(op_ret < 0)) { @@ -711,8 +747,10 @@ retry: */ if (attempt++) { if (futex_handle_fault((unsigned long)uaddr2, - attempt)) + attempt)) { + ret = -EFAULT; goto out; + } goto retry; } @@ -787,11 +825,7 @@ static int futex_requeue(u32 __user *uaddr1, u32 __user *uaddr2, hb1 = hash_futex(&key1); hb2 = hash_futex(&key2); - if (hb1 < hb2) - spin_lock(&hb1->lock); - spin_lock(&hb2->lock); - if (hb1 > hb2) - spin_lock(&hb1->lock); + double_lock_hb(hb1, hb2); if (likely(cmpval != NULL)) { u32 curval; @@ -916,6 +950,7 @@ static int unqueue_me(struct futex_q *q) /* In the common case we don't take the spinlock, which is nice. */ retry: lock_ptr = q->lock_ptr; + barrier(); if (lock_ptr != 0) { spin_lock(lock_ptr); /* @@ -1085,9 +1120,10 @@ static int futex_wait(u32 __user *uaddr, u32 val, unsigned long time) * if there are waiters then it will block, it does PI, etc. (Due to * races the kernel might see a 0 value of the futex too.) 
*/ -static int do_futex_lock_pi(u32 __user *uaddr, int detect, int trylock, - struct hrtimer_sleeper *to) +static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec, + long nsec, int trylock) { + struct hrtimer_sleeper timeout, *to = NULL; struct task_struct *curr = current; struct futex_hash_bucket *hb; u32 uval, newval, curval; @@ -1097,6 +1133,13 @@ static int do_futex_lock_pi(u32 __user *uaddr, int detect, int trylock, if (refill_pi_state_cache()) return -ENOMEM; + if (sec != MAX_SCHEDULE_TIMEOUT) { + to = &timeout; + hrtimer_init(&to->timer, CLOCK_REALTIME, HRTIMER_ABS); + hrtimer_init_sleeper(to, current); + to->timer.expires = ktime_set(sec, nsec); + } + q.pi_state = NULL; retry: down_read(&curr->mm->mmap_sem); @@ -1222,6 +1265,7 @@ static int do_futex_lock_pi(u32 __user *uaddr, int detect, int trylock, /* Owner died? */ if (q.pi_state->owner != NULL) { spin_lock_irq(&q.pi_state->owner->pi_lock); + WARN_ON(list_empty(&q.pi_state->list)); list_del_init(&q.pi_state->list); spin_unlock_irq(&q.pi_state->owner->pi_lock); } else @@ -1230,6 +1274,7 @@ static int do_futex_lock_pi(u32 __user *uaddr, int detect, int trylock, q.pi_state->owner = current; spin_lock_irq(¤t->pi_lock); + WARN_ON(!list_empty(&q.pi_state->list)); list_add(&q.pi_state->list, ¤t->pi_state_list); spin_unlock_irq(¤t->pi_lock); @@ -1270,7 +1315,7 @@ static int do_futex_lock_pi(u32 __user *uaddr, int detect, int trylock, if (!detect && ret == -EDEADLK && 0) force_sig(SIGKILL, current); - return ret; + return ret != -EINTR ? ret : -ERESTARTNOINTR; out_unlock_release_sem: queue_unlock(&q, hb); @@ -1287,9 +1332,10 @@ static int do_futex_lock_pi(u32 __user *uaddr, int detect, int trylock, * still holding the mmap_sem. */ if (attempt++) { - if (futex_handle_fault((unsigned long)uaddr, attempt)) + if (futex_handle_fault((unsigned long)uaddr, attempt)) { + ret = -EFAULT; goto out_unlock_release_sem; - + } goto retry_locked; } @@ -1304,76 +1350,6 @@ static int do_futex_lock_pi(u32 __user *uaddr, int detect, int trylock, } /* - * Restart handler - */ -static long futex_lock_pi_restart(struct restart_block *restart) -{ - struct hrtimer_sleeper timeout, *to = NULL; - int ret; - - restart->fn = do_no_restart_syscall; - - if (restart->arg2 || restart->arg3) { - to = &timeout; - hrtimer_init(&to->timer, CLOCK_REALTIME, HRTIMER_ABS); - hrtimer_init_sleeper(to, current); - to->timer.expires.tv64 = ((u64)restart->arg1 << 32) | - (u64) restart->arg0; - } - - pr_debug("lock_pi restart: %p, %d (%d)\n", - (u32 __user *)restart->arg0, current->pid); - - ret = do_futex_lock_pi((u32 __user *)restart->arg0, restart->arg1, - 0, to); - - if (ret != -EINTR) - return ret; - - restart->fn = futex_lock_pi_restart; - - /* The other values are filled in */ - return -ERESTART_RESTARTBLOCK; -} - -/* - * Called from the syscall entry below. 
- */ -static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec, - long nsec, int trylock) -{ - struct hrtimer_sleeper timeout, *to = NULL; - struct restart_block *restart; - int ret; - - if (sec != MAX_SCHEDULE_TIMEOUT) { - to = &timeout; - hrtimer_init(&to->timer, CLOCK_REALTIME, HRTIMER_ABS); - hrtimer_init_sleeper(to, current); - to->timer.expires = ktime_set(sec, nsec); - } - - ret = do_futex_lock_pi(uaddr, detect, trylock, to); - - if (ret != -EINTR) - return ret; - - pr_debug("lock_pi interrupted: %p, %d (%d)\n", uaddr, current->pid); - - restart = ¤t_thread_info()->restart_block; - restart->fn = futex_lock_pi_restart; - restart->arg0 = (unsigned long) uaddr; - restart->arg1 = detect; - if (to) { - restart->arg2 = to->timer.expires.tv64 & 0xFFFFFFFF; - restart->arg3 = to->timer.expires.tv64 >> 32; - } else - restart->arg2 = restart->arg3 = 0; - - return -ERESTART_RESTARTBLOCK; -} - -/* * Userspace attempted a TID -> 0 atomic transition, and failed. * This is the in-kernel slowpath: we look up the PI state (if any), * and do the rt-mutex unlock. @@ -1413,9 +1389,11 @@ retry_locked: * again. If it succeeds then we can return without waking * anyone else up: */ - inc_preempt_count(); - uval = futex_atomic_cmpxchg_inatomic(uaddr, current->pid, 0); - dec_preempt_count(); + if (!(uval & FUTEX_OWNER_DIED)) { + inc_preempt_count(); + uval = futex_atomic_cmpxchg_inatomic(uaddr, current->pid, 0); + dec_preempt_count(); + } if (unlikely(uval == -EFAULT)) goto pi_faulted; @@ -1448,9 +1426,11 @@ retry_locked: /* * No waiters - kernel unlocks the futex: */ - ret = unlock_futex_pi(uaddr, uval); - if (ret == -EFAULT) - goto pi_faulted; + if (!(uval & FUTEX_OWNER_DIED)) { + ret = unlock_futex_pi(uaddr, uval); + if (ret == -EFAULT) + goto pi_faulted; + } out_unlock: spin_unlock(&hb->lock); @@ -1467,9 +1447,10 @@ pi_faulted: * still holding the mmap_sem. */ if (attempt++) { - if (futex_handle_fault((unsigned long)uaddr, attempt)) + if (futex_handle_fault((unsigned long)uaddr, attempt)) { + ret = -EFAULT; goto out_unlock; - + } goto retry_locked; } @@ -1669,9 +1650,9 @@ err_unlock: * Process a futex-list entry, check whether it's owned by the * dying task, and do notification if so: */ -int handle_futex_death(u32 __user *uaddr, struct task_struct *curr) +int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi) { - u32 uval, nval; + u32 uval, nval, mval; retry: if (get_user(uval, uaddr)) @@ -1688,21 +1669,45 @@ retry: * thread-death.) The rest of the cleanup is done in * userspace. */ - nval = futex_atomic_cmpxchg_inatomic(uaddr, uval, - uval | FUTEX_OWNER_DIED); + mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED; + nval = futex_atomic_cmpxchg_inatomic(uaddr, uval, mval); + if (nval == -EFAULT) return -1; if (nval != uval) goto retry; - if (uval & FUTEX_WAITERS) - futex_wake(uaddr, 1); + /* + * Wake robust non-PI futexes here. The wakeup of + * PI futexes happens in exit_pi_state(): + */ + if (!pi) { + if (uval & FUTEX_WAITERS) + futex_wake(uaddr, 1); + } } return 0; } /* + * Fetch a robust-list pointer. Bit 0 signals PI futexes: + */ +static inline int fetch_robust_entry(struct robust_list __user **entry, + struct robust_list __user **head, int *pi) +{ + unsigned long uentry; + + if (get_user(uentry, (unsigned long *)head)) + return -EFAULT; + + *entry = (void *)(uentry & ~1UL); + *pi = uentry & 1; + + return 0; +} + +/* * Walk curr->robust_list (very carefully, it's a userspace list!) * and mark any locks found there dead, and notify any waiters. 
* @@ -1712,14 +1717,14 @@ void exit_robust_list(struct task_struct *curr) { struct robust_list_head __user *head = curr->robust_list; struct robust_list __user *entry, *pending; - unsigned int limit = ROBUST_LIST_LIMIT; + unsigned int limit = ROBUST_LIST_LIMIT, pi, pip; unsigned long futex_offset; /* * Fetch the list head (which was registered earlier, via * sys_set_robust_list()): */ - if (get_user(entry, &head->list.next)) + if (fetch_robust_entry(&entry, &head->list.next, &pi)) return; /* * Fetch the relative futex offset: @@ -1730,10 +1735,11 @@ void exit_robust_list(struct task_struct *curr) * Fetch any possibly pending lock-add first, and handle it * if it exists: */ - if (get_user(pending, &head->list_op_pending)) + if (fetch_robust_entry(&pending, &head->list_op_pending, &pip)) return; + if (pending) - handle_futex_death((void *)pending + futex_offset, curr); + handle_futex_death((void *)pending + futex_offset, curr, pip); while (entry != &head->list) { /* @@ -1742,12 +1748,12 @@ void exit_robust_list(struct task_struct *curr) */ if (entry != pending) if (handle_futex_death((void *)entry + futex_offset, - curr)) + curr, pi)) return; /* * Fetch the next entry in the list: */ - if (get_user(entry, &entry->next)) + if (fetch_robust_entry(&entry, &entry->next, &pi)) return; /* * Avoid excessively long or circular lists: diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c index d1d92b441fb..c5cca3f65cb 100644 --- a/kernel/futex_compat.c +++ b/kernel/futex_compat.c @@ -12,6 +12,23 @@ #include <asm/uaccess.h> + +/* + * Fetch a robust-list pointer. Bit 0 signals PI futexes: + */ +static inline int +fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry, + compat_uptr_t *head, int *pi) +{ + if (get_user(*uentry, head)) + return -EFAULT; + + *entry = compat_ptr((*uentry) & ~1); + *pi = (unsigned int)(*uentry) & 1; + + return 0; +} + /* * Walk curr->robust_list (very carefully, it's a userspace list!) * and mark any locks found there dead, and notify any waiters. 
@@ -22,17 +39,16 @@ void compat_exit_robust_list(struct task_struct *curr) { struct compat_robust_list_head __user *head = curr->compat_robust_list; struct robust_list __user *entry, *pending; + unsigned int limit = ROBUST_LIST_LIMIT, pi, pip; compat_uptr_t uentry, upending; - unsigned int limit = ROBUST_LIST_LIMIT; compat_long_t futex_offset; /* * Fetch the list head (which was registered earlier, via * sys_set_robust_list()): */ - if (get_user(uentry, &head->list.next)) + if (fetch_robust_entry(&uentry, &entry, &head->list.next, &pi)) return; - entry = compat_ptr(uentry); /* * Fetch the relative futex offset: */ @@ -42,11 +58,11 @@ void compat_exit_robust_list(struct task_struct *curr) * Fetch any possibly pending lock-add first, and handle it * if it exists: */ - if (get_user(upending, &head->list_op_pending)) + if (fetch_robust_entry(&upending, &pending, + &head->list_op_pending, &pip)) return; - pending = compat_ptr(upending); if (upending) - handle_futex_death((void *)pending + futex_offset, curr); + handle_futex_death((void *)pending + futex_offset, curr, pip); while (compat_ptr(uentry) != &head->list) { /* @@ -55,15 +71,15 @@ void compat_exit_robust_list(struct task_struct *curr) */ if (entry != pending) if (handle_futex_death((void *)entry + futex_offset, - curr)) + curr, pi)) return; /* * Fetch the next entry in the list: */ - if (get_user(uentry, (compat_uptr_t *)&entry->next)) + if (fetch_robust_entry(&uentry, &entry, + (compat_uptr_t *)&entry->next, &pi)) return; - entry = compat_ptr(uentry); /* * Avoid excessively long or circular lists: */ diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 8d3dc29ef41..21c38a7e666 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -187,7 +187,7 @@ switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_base *base) { struct hrtimer_base *new_base; - new_base = &__get_cpu_var(hrtimer_bases[base->index]); + new_base = &__get_cpu_var(hrtimer_bases)[base->index]; if (base != new_base) { /* @@ -669,7 +669,7 @@ static int hrtimer_wakeup(struct hrtimer *timer) return HRTIMER_NORESTART; } -void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, task_t *task) +void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task) { sl->timer.function = hrtimer_wakeup; sl->task = task; @@ -782,8 +782,10 @@ static void __devinit init_hrtimers_cpu(int cpu) struct hrtimer_base *base = per_cpu(hrtimer_bases, cpu); int i; - for (i = 0; i < MAX_HRTIMER_BASES; i++, base++) + for (i = 0; i < MAX_HRTIMER_BASES; i++, base++) { spin_lock_init(&base->lock); + lockdep_set_class(&base->lock, &base->lock_key); + } } #ifdef CONFIG_HOTPLUG_CPU @@ -833,7 +835,7 @@ static void migrate_hrtimers(int cpu) } #endif /* CONFIG_HOTPLUG_CPU */ -static int __devinit hrtimer_cpu_notify(struct notifier_block *self, +static int __cpuinit hrtimer_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { long cpu = (long)hcpu; @@ -857,7 +859,7 @@ static int __devinit hrtimer_cpu_notify(struct notifier_block *self, return NOTIFY_OK; } -static struct notifier_block __devinitdata hrtimers_nb = { +static struct notifier_block __cpuinitdata hrtimers_nb = { .notifier_call = hrtimer_cpu_notify, }; diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 54105bdfe20..9336f2e89e4 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -261,10 +261,13 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc, struct pt_regs *regs) * keep it masked and get out of here */ action = desc->action; - if (unlikely(!action || (desc->status & IRQ_DISABLED))) + if 
(unlikely(!action || (desc->status & IRQ_DISABLED))) { + desc->status |= IRQ_PENDING; goto out; + } desc->status |= IRQ_INPROGRESS; + desc->status &= ~IRQ_PENDING; spin_unlock(&desc->lock); action_ret = handle_IRQ_event(irq, regs, action); diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index aeb6e391276..48a53f68af9 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -20,6 +20,11 @@ /** * handle_bad_irq - handle spurious and unhandled irqs + * @irq: the interrupt number + * @desc: description of the interrupt + * @regs: pointer to a register structure + * + * Handles spurious and unhandled IRQ's. It also prints a debugmessage. */ void fastcall handle_bad_irq(unsigned int irq, struct irq_desc *desc, struct pt_regs *regs) @@ -132,7 +137,7 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct pt_regs *regs, handle_dynamic_tick(action); if (!(action->flags & IRQF_DISABLED)) - local_irq_enable(); + local_irq_enable_in_hardirq(); do { ret = action->handler(irq, action->dev_id, regs); @@ -249,3 +254,19 @@ out: return 1; } +#ifdef CONFIG_TRACE_IRQFLAGS + +/* + * lockdep: we want to handle all irq_desc locks as a single lock-class: + */ +static struct lock_class_key irq_desc_lock_class; + +void early_init_irq_lock_class(void) +{ + int i; + + for (i = 0; i < NR_IRQS; i++) + lockdep_set_class(&irq_desc[i].lock, &irq_desc_lock_class); +} + +#endif diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index c911c6ec4dd..92be519eff2 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -137,16 +137,40 @@ EXPORT_SYMBOL(enable_irq); * @irq: interrupt to control * @on: enable/disable power management wakeup * - * Enable/disable power management wakeup mode + * Enable/disable power management wakeup mode, which is + * disabled by default. Enables and disables must match, + * just as they match for non-wakeup mode support. + * + * Wakeup mode lets this IRQ wake the system from sleep + * states like "suspend to RAM". */ int set_irq_wake(unsigned int irq, unsigned int on) { struct irq_desc *desc = irq_desc + irq; unsigned long flags; int ret = -ENXIO; + int (*set_wake)(unsigned, unsigned) = desc->chip->set_wake; + /* wakeup-capable irqs can be shared between drivers that + * don't need to have the same sleep mode behaviors. 
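The reworked set_irq_wake() just below keeps a per-descriptor wake_depth so that drivers sharing a wakeup-capable IRQ can enable and disable wake mode independently: only the first enable and the matching last disable reach the chip's set_wake hook, and an unbalanced disable is warned about instead of underflowing the count. A userspace model of that accounting, with illustrative struct and function names:

        #include <stdio.h>

        struct irq_state {
                unsigned int wake_depth;
                int hw_wake_enabled;
        };

        static void irq_set_wake(struct irq_state *d, int on)
        {
                if (on) {
                        if (d->wake_depth++ == 0)
                                d->hw_wake_enabled = 1; /* first user: touch the hardware */
                } else {
                        if (d->wake_depth == 0)
                                fprintf(stderr, "unbalanced wake disable\n");
                        else if (--d->wake_depth == 0)
                                d->hw_wake_enabled = 0; /* last user gone: undo it */
                }
        }

        int main(void)
        {
                struct irq_state d = { 0, 0 };

                irq_set_wake(&d, 1);    /* driver A enables wake */
                irq_set_wake(&d, 1);    /* driver B enables wake, no hardware access */
                irq_set_wake(&d, 0);    /* driver A disables, still enabled for B */
                printf("depth=%u hw=%d\n", d.wake_depth, d.hw_wake_enabled);   /* 1 1 */
                irq_set_wake(&d, 0);
                printf("depth=%u hw=%d\n", d.wake_depth, d.hw_wake_enabled);   /* 0 0 */
                return 0;
        }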
+ */ spin_lock_irqsave(&desc->lock, flags); - if (desc->chip->set_wake) + if (on) { + if (desc->wake_depth++ == 0) + desc->status |= IRQ_WAKEUP; + else + set_wake = NULL; + } else { + if (desc->wake_depth == 0) { + printk(KERN_WARNING "Unbalanced IRQ %d " + "wake disable\n", irq); + WARN_ON(1); + } else if (--desc->wake_depth == 0) + desc->status &= ~IRQ_WAKEUP; + else + set_wake = NULL; + } + if (set_wake) ret = desc->chip->set_wake(irq, on); spin_unlock_irqrestore(&desc->lock, flags); return ret; @@ -410,6 +434,12 @@ int request_irq(unsigned int irq, struct irqaction *action; int retval; +#ifdef CONFIG_LOCKDEP + /* + * Lockdep wants atomic interrupt handlers: + */ + irqflags |= SA_INTERRUPT; +#endif /* * Sanity-check: shared interrupts must pass in a real dev-ID, * otherwise we'll have trouble later trying to figure out diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c index 872f91ba2ce..35f10f7ff94 100644 --- a/kernel/irq/resend.c +++ b/kernel/irq/resend.c @@ -63,8 +63,7 @@ void check_irq_resend(struct irq_desc *desc, unsigned int irq) desc->chip->enable(irq); if ((status & (IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) { - desc->status &= ~IRQ_PENDING; - desc->status = status | IRQ_REPLAY; + desc->status = (status & ~IRQ_PENDING) | IRQ_REPLAY; if (!desc->chip || !desc->chip->retrigger || !desc->chip->retrigger(irq)) { diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 39277dd6bf9..ab16a5a4cfe 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -275,8 +275,8 @@ static void upcase_if_global(struct kallsym_iter *iter) static int get_ksymbol_mod(struct kallsym_iter *iter) { iter->owner = module_get_kallsym(iter->pos - kallsyms_num_syms, - &iter->value, - &iter->type, iter->name); + &iter->value, &iter->type, + iter->name, sizeof(iter->name)); if (iter->owner == NULL) return 0; diff --git a/kernel/kmod.c b/kernel/kmod.c index 1b7157af051..5c470c57fb5 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -197,11 +197,12 @@ static void __call_usermodehelper(void *data) { struct subprocess_info *sub_info = data; pid_t pid; + int wait = sub_info->wait; /* CLONE_VFORK: wait until the usermode helper has execve'd * successfully We need the data structures to stay around * until that is done. 
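The kmod.c hunk here also caches sub_info->wait in a local at the top of __call_usermodehelper(), so the structure is not re-read after the point where the submitter's completion can already have fired; the submitter typically keeps sub_info on its own stack and may have moved on by then. A minimal sketch of that read-before-signal discipline, with illustrative names:

        #include <stdio.h>

        struct request {
                int wait;               /* set by the submitter */
                int *done;              /* completion flag owned by the submitter */
        };

        /* Read req->wait into a local *before* signalling completion: once
         * *req->done is set, the submitter may return and its request can
         * disappear under us. */
        static void handle_request(struct request *req)
        {
                int wait = req->wait;

                *req->done = 1;         /* after this point req must not be touched */

                if (!wait)
                        printf("fire-and-forget request finished\n");
        }

        int main(void)
        {
                int done = 0;
                struct request req = { .wait = 0, .done = &done };

                handle_request(&req);
                printf("done=%d\n", done);
                return 0;
        }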
*/ - if (sub_info->wait) + if (wait) pid = kernel_thread(wait_for_helper, sub_info, CLONE_FS | CLONE_FILES | SIGCHLD); else @@ -211,7 +212,7 @@ static void __call_usermodehelper(void *data) if (pid < 0) { sub_info->retval = pid; complete(sub_info->complete); - } else if (!sub_info->wait) + } else if (!wait) complete(sub_info->complete); } @@ -233,7 +234,7 @@ static void __call_usermodehelper(void *data) int call_usermodehelper_keys(char *path, char **argv, char **envp, struct key *session_keyring, int wait) { - DECLARE_COMPLETION(done); + DECLARE_COMPLETION_ONSTACK(done); struct subprocess_info sub_info = { .complete = &done, .path = path, diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 64aab081153..3f57dfdc8f9 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -393,6 +393,7 @@ static int __kprobes add_new_kprobe(struct kprobe *old_p, struct kprobe *p) static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p) { copy_kprobe(p, ap); + flush_insn_slot(ap); ap->addr = p->addr; ap->pre_handler = aggr_pre_handler; ap->fault_handler = aggr_fault_handler; diff --git a/kernel/kthread.c b/kernel/kthread.c index 24be714b04c..4f9c60ef95e 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -216,23 +216,6 @@ EXPORT_SYMBOL(kthread_bind); */ int kthread_stop(struct task_struct *k) { - return kthread_stop_sem(k, NULL); -} -EXPORT_SYMBOL(kthread_stop); - -/** - * kthread_stop_sem - stop a thread created by kthread_create(). - * @k: thread created by kthread_create(). - * @s: semaphore that @k waits on while idle. - * - * Does essentially the same thing as kthread_stop() above, but wakes - * @k by calling up(@s). - * - * Returns the result of threadfn(), or %-EINTR if wake_up_process() - * was never called. - */ -int kthread_stop_sem(struct task_struct *k, struct semaphore *s) -{ int ret; mutex_lock(&kthread_stop_lock); @@ -246,10 +229,7 @@ int kthread_stop_sem(struct task_struct *k, struct semaphore *s) /* Now set kthread_should_stop() to true, and wake it up. */ kthread_stop_info.k = k; - if (s) - up(s); - else - wake_up_process(k); + wake_up_process(k); put_task_struct(k); /* Once it dies, reset stop ptr, gather result and we're done. */ @@ -260,7 +240,7 @@ int kthread_stop_sem(struct task_struct *k, struct semaphore *s) return ret; } -EXPORT_SYMBOL(kthread_stop_sem); +EXPORT_SYMBOL(kthread_stop); static __init int helper_init(void) { diff --git a/kernel/lockdep.c b/kernel/lockdep.c new file mode 100644 index 00000000000..9bad1788451 --- /dev/null +++ b/kernel/lockdep.c @@ -0,0 +1,2704 @@ +/* + * kernel/lockdep.c + * + * Runtime locking correctness validator + * + * Started by Ingo Molnar: + * + * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> + * + * this code maps all the lock dependencies as they occur in a live kernel + * and will warn about the following classes of locking bugs: + * + * - lock inversion scenarios + * - circular lock dependencies + * - hardirq/softirq safe/unsafe locking bugs + * + * Bugs are reported even if the current locking scenario does not cause + * any deadlock at this point. + * + * I.e. if anytime in the past two locks were taken in a different order, + * even if it happened for another task, even if those were different + * locks (but of the same class as this lock), this code will detect it. + * + * Thanks to Arjan van de Ven for coming up with the initial idea of + * mapping lock dependencies runtime. 
+ */ +#include <linux/mutex.h> +#include <linux/sched.h> +#include <linux/delay.h> +#include <linux/module.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> +#include <linux/spinlock.h> +#include <linux/kallsyms.h> +#include <linux/interrupt.h> +#include <linux/stacktrace.h> +#include <linux/debug_locks.h> +#include <linux/irqflags.h> + +#include <asm/sections.h> + +#include "lockdep_internals.h" + +/* + * hash_lock: protects the lockdep hashes and class/list/hash allocators. + * + * This is one of the rare exceptions where it's justified + * to use a raw spinlock - we really dont want the spinlock + * code to recurse back into the lockdep code. + */ +static raw_spinlock_t hash_lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; + +static int lockdep_initialized; + +unsigned long nr_list_entries; +static struct lock_list list_entries[MAX_LOCKDEP_ENTRIES]; + +/* + * Allocate a lockdep entry. (assumes hash_lock held, returns + * with NULL on failure) + */ +static struct lock_list *alloc_list_entry(void) +{ + if (nr_list_entries >= MAX_LOCKDEP_ENTRIES) { + __raw_spin_unlock(&hash_lock); + debug_locks_off(); + printk("BUG: MAX_LOCKDEP_ENTRIES too low!\n"); + printk("turning off the locking correctness validator.\n"); + return NULL; + } + return list_entries + nr_list_entries++; +} + +/* + * All data structures here are protected by the global debug_lock. + * + * Mutex key structs only get allocated, once during bootup, and never + * get freed - this significantly simplifies the debugging code. + */ +unsigned long nr_lock_classes; +static struct lock_class lock_classes[MAX_LOCKDEP_KEYS]; + +/* + * We keep a global list of all lock classes. The list only grows, + * never shrinks. The list is only accessed with the lockdep + * spinlock lock held. + */ +LIST_HEAD(all_lock_classes); + +/* + * The lockdep classes are in a hash-table as well, for fast lookup: + */ +#define CLASSHASH_BITS (MAX_LOCKDEP_KEYS_BITS - 1) +#define CLASSHASH_SIZE (1UL << CLASSHASH_BITS) +#define CLASSHASH_MASK (CLASSHASH_SIZE - 1) +#define __classhashfn(key) ((((unsigned long)key >> CLASSHASH_BITS) + (unsigned long)key) & CLASSHASH_MASK) +#define classhashentry(key) (classhash_table + __classhashfn((key))) + +static struct list_head classhash_table[CLASSHASH_SIZE]; + +unsigned long nr_lock_chains; +static struct lock_chain lock_chains[MAX_LOCKDEP_CHAINS]; + +/* + * We put the lock dependency chains into a hash-table as well, to cache + * their existence: + */ +#define CHAINHASH_BITS (MAX_LOCKDEP_CHAINS_BITS-1) +#define CHAINHASH_SIZE (1UL << CHAINHASH_BITS) +#define CHAINHASH_MASK (CHAINHASH_SIZE - 1) +#define __chainhashfn(chain) \ + (((chain >> CHAINHASH_BITS) + chain) & CHAINHASH_MASK) +#define chainhashentry(chain) (chainhash_table + __chainhashfn((chain))) + +static struct list_head chainhash_table[CHAINHASH_SIZE]; + +/* + * The hash key of the lock dependency chains is a hash itself too: + * it's a hash of all locks taken up to that lock, including that lock. + * It's a 64-bit hash, because it's important for the keys to be + * unique. 
+ */ +#define iterate_chain_key(key1, key2) \ + (((key1) << MAX_LOCKDEP_KEYS_BITS/2) ^ \ + ((key1) >> (64-MAX_LOCKDEP_KEYS_BITS/2)) ^ \ + (key2)) + +void lockdep_off(void) +{ + current->lockdep_recursion++; +} + +EXPORT_SYMBOL(lockdep_off); + +void lockdep_on(void) +{ + current->lockdep_recursion--; +} + +EXPORT_SYMBOL(lockdep_on); + +int lockdep_internal(void) +{ + return current->lockdep_recursion != 0; +} + +EXPORT_SYMBOL(lockdep_internal); + +/* + * Debugging switches: + */ + +#define VERBOSE 0 +#ifdef VERBOSE +# define VERY_VERBOSE 0 +#endif + +#if VERBOSE +# define HARDIRQ_VERBOSE 1 +# define SOFTIRQ_VERBOSE 1 +#else +# define HARDIRQ_VERBOSE 0 +# define SOFTIRQ_VERBOSE 0 +#endif + +#if VERBOSE || HARDIRQ_VERBOSE || SOFTIRQ_VERBOSE +/* + * Quick filtering for interesting events: + */ +static int class_filter(struct lock_class *class) +{ +#if 0 + /* Example */ + if (class->name_version == 1 && + !strcmp(class->name, "lockname")) + return 1; + if (class->name_version == 1 && + !strcmp(class->name, "&struct->lockfield")) + return 1; +#endif + /* Allow everything else. 0 would be filter everything else */ + return 1; +} +#endif + +static int verbose(struct lock_class *class) +{ +#if VERBOSE + return class_filter(class); +#endif + return 0; +} + +#ifdef CONFIG_TRACE_IRQFLAGS + +static int hardirq_verbose(struct lock_class *class) +{ +#if HARDIRQ_VERBOSE + return class_filter(class); +#endif + return 0; +} + +static int softirq_verbose(struct lock_class *class) +{ +#if SOFTIRQ_VERBOSE + return class_filter(class); +#endif + return 0; +} + +#endif + +/* + * Stack-trace: tightly packed array of stack backtrace + * addresses. Protected by the hash_lock. + */ +unsigned long nr_stack_trace_entries; +static unsigned long stack_trace[MAX_STACK_TRACE_ENTRIES]; + +static int save_trace(struct stack_trace *trace) +{ + trace->nr_entries = 0; + trace->max_entries = MAX_STACK_TRACE_ENTRIES - nr_stack_trace_entries; + trace->entries = stack_trace + nr_stack_trace_entries; + + save_stack_trace(trace, NULL, 0, 3); + + trace->max_entries = trace->nr_entries; + + nr_stack_trace_entries += trace->nr_entries; + if (DEBUG_LOCKS_WARN_ON(nr_stack_trace_entries > MAX_STACK_TRACE_ENTRIES)) + return 0; + + if (nr_stack_trace_entries == MAX_STACK_TRACE_ENTRIES) { + __raw_spin_unlock(&hash_lock); + if (debug_locks_off()) { + printk("BUG: MAX_STACK_TRACE_ENTRIES too low!\n"); + printk("turning off the locking correctness validator.\n"); + dump_stack(); + } + return 0; + } + + return 1; +} + +unsigned int nr_hardirq_chains; +unsigned int nr_softirq_chains; +unsigned int nr_process_chains; +unsigned int max_lockdep_depth; +unsigned int max_recursion_depth; + +#ifdef CONFIG_DEBUG_LOCKDEP +/* + * We cannot printk in early bootup code. Not even early_printk() + * might work. So we mark any initialization errors and printk + * about it later on, in lockdep_info(). 
+ */ +static int lockdep_init_error; + +/* + * Various lockdep statistics: + */ +atomic_t chain_lookup_hits; +atomic_t chain_lookup_misses; +atomic_t hardirqs_on_events; +atomic_t hardirqs_off_events; +atomic_t redundant_hardirqs_on; +atomic_t redundant_hardirqs_off; +atomic_t softirqs_on_events; +atomic_t softirqs_off_events; +atomic_t redundant_softirqs_on; +atomic_t redundant_softirqs_off; +atomic_t nr_unused_locks; +atomic_t nr_cyclic_checks; +atomic_t nr_cyclic_check_recursions; +atomic_t nr_find_usage_forwards_checks; +atomic_t nr_find_usage_forwards_recursions; +atomic_t nr_find_usage_backwards_checks; +atomic_t nr_find_usage_backwards_recursions; +# define debug_atomic_inc(ptr) atomic_inc(ptr) +# define debug_atomic_dec(ptr) atomic_dec(ptr) +# define debug_atomic_read(ptr) atomic_read(ptr) +#else +# define debug_atomic_inc(ptr) do { } while (0) +# define debug_atomic_dec(ptr) do { } while (0) +# define debug_atomic_read(ptr) 0 +#endif + +/* + * Locking printouts: + */ + +static const char *usage_str[] = +{ + [LOCK_USED] = "initial-use ", + [LOCK_USED_IN_HARDIRQ] = "in-hardirq-W", + [LOCK_USED_IN_SOFTIRQ] = "in-softirq-W", + [LOCK_ENABLED_SOFTIRQS] = "softirq-on-W", + [LOCK_ENABLED_HARDIRQS] = "hardirq-on-W", + [LOCK_USED_IN_HARDIRQ_READ] = "in-hardirq-R", + [LOCK_USED_IN_SOFTIRQ_READ] = "in-softirq-R", + [LOCK_ENABLED_SOFTIRQS_READ] = "softirq-on-R", + [LOCK_ENABLED_HARDIRQS_READ] = "hardirq-on-R", +}; + +const char * __get_key_name(struct lockdep_subclass_key *key, char *str) +{ + unsigned long offs, size; + char *modname; + + return kallsyms_lookup((unsigned long)key, &size, &offs, &modname, str); +} + +void +get_usage_chars(struct lock_class *class, char *c1, char *c2, char *c3, char *c4) +{ + *c1 = '.', *c2 = '.', *c3 = '.', *c4 = '.'; + + if (class->usage_mask & LOCKF_USED_IN_HARDIRQ) + *c1 = '+'; + else + if (class->usage_mask & LOCKF_ENABLED_HARDIRQS) + *c1 = '-'; + + if (class->usage_mask & LOCKF_USED_IN_SOFTIRQ) + *c2 = '+'; + else + if (class->usage_mask & LOCKF_ENABLED_SOFTIRQS) + *c2 = '-'; + + if (class->usage_mask & LOCKF_ENABLED_HARDIRQS_READ) + *c3 = '-'; + if (class->usage_mask & LOCKF_USED_IN_HARDIRQ_READ) { + *c3 = '+'; + if (class->usage_mask & LOCKF_ENABLED_HARDIRQS_READ) + *c3 = '?'; + } + + if (class->usage_mask & LOCKF_ENABLED_SOFTIRQS_READ) + *c4 = '-'; + if (class->usage_mask & LOCKF_USED_IN_SOFTIRQ_READ) { + *c4 = '+'; + if (class->usage_mask & LOCKF_ENABLED_SOFTIRQS_READ) + *c4 = '?'; + } +} + +static void print_lock_name(struct lock_class *class) +{ + char str[128], c1, c2, c3, c4; + const char *name; + + get_usage_chars(class, &c1, &c2, &c3, &c4); + + name = class->name; + if (!name) { + name = __get_key_name(class->key, str); + printk(" (%s", name); + } else { + printk(" (%s", name); + if (class->name_version > 1) + printk("#%d", class->name_version); + if (class->subclass) + printk("/%d", class->subclass); + } + printk("){%c%c%c%c}", c1, c2, c3, c4); +} + +static void print_lockdep_cache(struct lockdep_map *lock) +{ + const char *name; + char str[128]; + + name = lock->name; + if (!name) + name = __get_key_name(lock->key->subkeys, str); + + printk("%s", name); +} + +static void print_lock(struct held_lock *hlock) +{ + print_lock_name(hlock->class); + printk(", at: "); + print_ip_sym(hlock->acquire_ip); +} + +static void lockdep_print_held_locks(struct task_struct *curr) +{ + int i, depth = curr->lockdep_depth; + + if (!depth) { + printk("no locks held by %s/%d.\n", curr->comm, curr->pid); + return; + } + printk("%d lock%s held by %s/%d:\n", + depth, 
depth > 1 ? "s" : "", curr->comm, curr->pid); + + for (i = 0; i < depth; i++) { + printk(" #%d: ", i); + print_lock(curr->held_locks + i); + } +} + +static void print_lock_class_header(struct lock_class *class, int depth) +{ + int bit; + + printk("%*s->", depth, ""); + print_lock_name(class); + printk(" ops: %lu", class->ops); + printk(" {\n"); + + for (bit = 0; bit < LOCK_USAGE_STATES; bit++) { + if (class->usage_mask & (1 << bit)) { + int len = depth; + + len += printk("%*s %s", depth, "", usage_str[bit]); + len += printk(" at:\n"); + print_stack_trace(class->usage_traces + bit, len); + } + } + printk("%*s }\n", depth, ""); + + printk("%*s ... key at: ",depth,""); + print_ip_sym((unsigned long)class->key); +} + +/* + * printk all lock dependencies starting at <entry>: + */ +static void print_lock_dependencies(struct lock_class *class, int depth) +{ + struct lock_list *entry; + + if (DEBUG_LOCKS_WARN_ON(depth >= 20)) + return; + + print_lock_class_header(class, depth); + + list_for_each_entry(entry, &class->locks_after, entry) { + DEBUG_LOCKS_WARN_ON(!entry->class); + print_lock_dependencies(entry->class, depth + 1); + + printk("%*s ... acquired at:\n",depth,""); + print_stack_trace(&entry->trace, 2); + printk("\n"); + } +} + +/* + * Add a new dependency to the head of the list: + */ +static int add_lock_to_list(struct lock_class *class, struct lock_class *this, + struct list_head *head, unsigned long ip) +{ + struct lock_list *entry; + /* + * Lock not present yet - get a new dependency struct and + * add it to the list: + */ + entry = alloc_list_entry(); + if (!entry) + return 0; + + entry->class = this; + save_trace(&entry->trace); + + /* + * Since we never remove from the dependency list, the list can + * be walked lockless by other CPUs, it's only allocation + * that must be protected by the spinlock. But this also means + * we must make new entries visible only once writes to the + * entry become visible - hence the RCU op: + */ + list_add_tail_rcu(&entry->entry, head); + + return 1; +} + +/* + * Recursive, forwards-direction lock-dependency checking, used for + * both noncyclic checking and for hardirq-unsafe/softirq-unsafe + * checking. + * + * (to keep the stackframe of the recursive functions small we + * use these global variables, and we also mark various helper + * functions as noinline.) 
+ */ +static struct held_lock *check_source, *check_target; + +/* + * Print a dependency chain entry (this is only done when a deadlock + * has been detected): + */ +static noinline int +print_circular_bug_entry(struct lock_list *target, unsigned int depth) +{ + if (debug_locks_silent) + return 0; + printk("\n-> #%u", depth); + print_lock_name(target->class); + printk(":\n"); + print_stack_trace(&target->trace, 6); + + return 0; +} + +/* + * When a circular dependency is detected, print the + * header first: + */ +static noinline int +print_circular_bug_header(struct lock_list *entry, unsigned int depth) +{ + struct task_struct *curr = current; + + __raw_spin_unlock(&hash_lock); + debug_locks_off(); + if (debug_locks_silent) + return 0; + + printk("\n=======================================================\n"); + printk( "[ INFO: possible circular locking dependency detected ]\n"); + printk( "-------------------------------------------------------\n"); + printk("%s/%d is trying to acquire lock:\n", + curr->comm, curr->pid); + print_lock(check_source); + printk("\nbut task is already holding lock:\n"); + print_lock(check_target); + printk("\nwhich lock already depends on the new lock.\n\n"); + printk("\nthe existing dependency chain (in reverse order) is:\n"); + + print_circular_bug_entry(entry, depth); + + return 0; +} + +static noinline int print_circular_bug_tail(void) +{ + struct task_struct *curr = current; + struct lock_list this; + + if (debug_locks_silent) + return 0; + + this.class = check_source->class; + save_trace(&this.trace); + print_circular_bug_entry(&this, 0); + + printk("\nother info that might help us debug this:\n\n"); + lockdep_print_held_locks(curr); + + printk("\nstack backtrace:\n"); + dump_stack(); + + return 0; +} + +static int noinline print_infinite_recursion_bug(void) +{ + __raw_spin_unlock(&hash_lock); + DEBUG_LOCKS_WARN_ON(1); + + return 0; +} + +/* + * Prove that the dependency graph starting at <entry> can not + * lead to <target>. Print an error and return 0 if it does. + */ +static noinline int +check_noncircular(struct lock_class *source, unsigned int depth) +{ + struct lock_list *entry; + + debug_atomic_inc(&nr_cyclic_check_recursions); + if (depth > max_recursion_depth) + max_recursion_depth = depth; + if (depth >= 20) + return print_infinite_recursion_bug(); + /* + * Check this lock's dependency list: + */ + list_for_each_entry(entry, &source->locks_after, entry) { + if (entry->class == check_target->class) + return print_circular_bug_header(entry, depth+1); + debug_atomic_inc(&nr_cyclic_checks); + if (!check_noncircular(entry->class, depth+1)) + return print_circular_bug_entry(entry, depth+1); + } + return 1; +} + +static int very_verbose(struct lock_class *class) +{ +#if VERY_VERBOSE + return class_filter(class); +#endif + return 0; +} +#ifdef CONFIG_TRACE_IRQFLAGS + +/* + * Forwards and backwards subgraph searching, for the purposes of + * proving that two subgraphs can be connected by a new dependency + * without creating any illegal irq-safe -> irq-unsafe lock dependency. + */ +static enum lock_usage_bit find_usage_bit; +static struct lock_class *forwards_match, *backwards_match; + +/* + * Find a node in the forwards-direction dependency sub-graph starting + * at <source> that matches <find_usage_bit>. + * + * Return 2 if such a node exists in the subgraph, and put that node + * into <forwards_match>. + * + * Return 1 otherwise and keep <forwards_match> unchanged. + * Return 0 on error. 
+ */ +static noinline int +find_usage_forwards(struct lock_class *source, unsigned int depth) +{ + struct lock_list *entry; + int ret; + + if (depth > max_recursion_depth) + max_recursion_depth = depth; + if (depth >= 20) + return print_infinite_recursion_bug(); + + debug_atomic_inc(&nr_find_usage_forwards_checks); + if (source->usage_mask & (1 << find_usage_bit)) { + forwards_match = source; + return 2; + } + + /* + * Check this lock's dependency list: + */ + list_for_each_entry(entry, &source->locks_after, entry) { + debug_atomic_inc(&nr_find_usage_forwards_recursions); + ret = find_usage_forwards(entry->class, depth+1); + if (ret == 2 || ret == 0) + return ret; + } + return 1; +} + +/* + * Find a node in the backwards-direction dependency sub-graph starting + * at <source> that matches <find_usage_bit>. + * + * Return 2 if such a node exists in the subgraph, and put that node + * into <backwards_match>. + * + * Return 1 otherwise and keep <backwards_match> unchanged. + * Return 0 on error. + */ +static noinline int +find_usage_backwards(struct lock_class *source, unsigned int depth) +{ + struct lock_list *entry; + int ret; + + if (depth > max_recursion_depth) + max_recursion_depth = depth; + if (depth >= 20) + return print_infinite_recursion_bug(); + + debug_atomic_inc(&nr_find_usage_backwards_checks); + if (source->usage_mask & (1 << find_usage_bit)) { + backwards_match = source; + return 2; + } + + /* + * Check this lock's dependency list: + */ + list_for_each_entry(entry, &source->locks_before, entry) { + debug_atomic_inc(&nr_find_usage_backwards_recursions); + ret = find_usage_backwards(entry->class, depth+1); + if (ret == 2 || ret == 0) + return ret; + } + return 1; +} + +static int +print_bad_irq_dependency(struct task_struct *curr, + struct held_lock *prev, + struct held_lock *next, + enum lock_usage_bit bit1, + enum lock_usage_bit bit2, + const char *irqclass) +{ + __raw_spin_unlock(&hash_lock); + debug_locks_off(); + if (debug_locks_silent) + return 0; + + printk("\n======================================================\n"); + printk( "[ INFO: %s-safe -> %s-unsafe lock order detected ]\n", + irqclass, irqclass); + printk( "------------------------------------------------------\n"); + printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n", + curr->comm, curr->pid, + curr->hardirq_context, hardirq_count() >> HARDIRQ_SHIFT, + curr->softirq_context, softirq_count() >> SOFTIRQ_SHIFT, + curr->hardirqs_enabled, + curr->softirqs_enabled); + print_lock(next); + + printk("\nand this task is already holding:\n"); + print_lock(prev); + printk("which would create a new lock dependency:\n"); + print_lock_name(prev->class); + printk(" ->"); + print_lock_name(next->class); + printk("\n"); + + printk("\nbut this new dependency connects a %s-irq-safe lock:\n", + irqclass); + print_lock_name(backwards_match); + printk("\n... which became %s-irq-safe at:\n", irqclass); + + print_stack_trace(backwards_match->usage_traces + bit1, 1); + + printk("\nto a %s-irq-unsafe lock:\n", irqclass); + print_lock_name(forwards_match); + printk("\n... 
which became %s-irq-unsafe at:\n", irqclass); + printk("..."); + + print_stack_trace(forwards_match->usage_traces + bit2, 1); + + printk("\nother info that might help us debug this:\n\n"); + lockdep_print_held_locks(curr); + + printk("\nthe %s-irq-safe lock's dependencies:\n", irqclass); + print_lock_dependencies(backwards_match, 0); + + printk("\nthe %s-irq-unsafe lock's dependencies:\n", irqclass); + print_lock_dependencies(forwards_match, 0); + + printk("\nstack backtrace:\n"); + dump_stack(); + + return 0; +} + +static int +check_usage(struct task_struct *curr, struct held_lock *prev, + struct held_lock *next, enum lock_usage_bit bit_backwards, + enum lock_usage_bit bit_forwards, const char *irqclass) +{ + int ret; + + find_usage_bit = bit_backwards; + /* fills in <backwards_match> */ + ret = find_usage_backwards(prev->class, 0); + if (!ret || ret == 1) + return ret; + + find_usage_bit = bit_forwards; + ret = find_usage_forwards(next->class, 0); + if (!ret || ret == 1) + return ret; + /* ret == 2 */ + return print_bad_irq_dependency(curr, prev, next, + bit_backwards, bit_forwards, irqclass); +} + +#endif + +static int +print_deadlock_bug(struct task_struct *curr, struct held_lock *prev, + struct held_lock *next) +{ + debug_locks_off(); + __raw_spin_unlock(&hash_lock); + if (debug_locks_silent) + return 0; + + printk("\n=============================================\n"); + printk( "[ INFO: possible recursive locking detected ]\n"); + printk( "---------------------------------------------\n"); + printk("%s/%d is trying to acquire lock:\n", + curr->comm, curr->pid); + print_lock(next); + printk("\nbut task is already holding lock:\n"); + print_lock(prev); + + printk("\nother info that might help us debug this:\n"); + lockdep_print_held_locks(curr); + + printk("\nstack backtrace:\n"); + dump_stack(); + + return 0; +} + +/* + * Check whether we are holding such a class already. + * + * (Note that this has to be done separately, because the graph cannot + * detect such classes of deadlocks.) + * + * Returns: 0 on deadlock detected, 1 on OK, 2 on recursive read + */ +static int +check_deadlock(struct task_struct *curr, struct held_lock *next, + struct lockdep_map *next_instance, int read) +{ + struct held_lock *prev; + int i; + + for (i = 0; i < curr->lockdep_depth; i++) { + prev = curr->held_locks + i; + if (prev->class != next->class) + continue; + /* + * Allow read-after-read recursion of the same + * lock class (i.e. read_lock(lock)+read_lock(lock)): + */ + if ((read == 2) && prev->read) + return 2; + return print_deadlock_bug(curr, prev, next); + } + return 1; +} + +/* + * There was a chain-cache miss, and we are about to add a new dependency + * to a previous lock. We recursively validate the following rules: + * + * - would the adding of the <prev> -> <next> dependency create a + * circular dependency in the graph? [== circular deadlock] + * + * - does the new prev->next dependency connect any hardirq-safe lock + * (in the full backwards-subgraph starting at <prev>) with any + * hardirq-unsafe lock (in the full forwards-subgraph starting at + * <next>)? [== illegal lock inversion with hardirq contexts] + * + * - does the new prev->next dependency connect any softirq-safe lock + * (in the full backwards-subgraph starting at <prev>) with any + * softirq-unsafe lock (in the full forwards-subgraph starting at + * <next>)? [== illegal lock inversion with softirq contexts] + * + * any of these scenarios could lead to a deadlock. 
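Of the three checks listed here, the irq ones are the least obvious, so a concrete shape of the forbidden dependency may help. The fragment below is illustrative kernel-style code, not part of this patch and not a buildable module: one lock becomes hardirq-safe by being taken from an interrupt handler, and process context then acquires a hardirq-unsafe lock while holding it, which is exactly the hardirq-safe -> hardirq-unsafe edge that check_usage() rejects.

        #include <linux/spinlock.h>
        #include <linux/interrupt.h>

        static DEFINE_SPINLOCK(irq_safe_lock);          /* taken from a hardirq handler */
        static DEFINE_SPINLOCK(irq_unsafe_lock);        /* only ever taken with irqs on */

        /* Hardirq handler: makes irq_safe_lock hardirq-safe. */
        static irqreturn_t demo_handler(int irq, void *dev_id, struct pt_regs *regs)
        {
                spin_lock(&irq_safe_lock);
                /* ... */
                spin_unlock(&irq_safe_lock);
                return IRQ_HANDLED;
        }

        /* Process context: creates the irq_safe_lock -> irq_unsafe_lock
         * dependency. If another CPU holds irq_unsafe_lock with interrupts
         * enabled when demo_handler() fires there, that CPU spins on
         * irq_safe_lock in the handler while we spin here on
         * irq_unsafe_lock, and neither side can make progress. */
        static void demo_process_context(void)
        {
                unsigned long flags;

                spin_lock_irqsave(&irq_safe_lock, flags);
                spin_lock(&irq_unsafe_lock);    /* hardirq-safe -> hardirq-unsafe */
                spin_unlock(&irq_unsafe_lock);
                spin_unlock_irqrestore(&irq_safe_lock, flags);
        }

The deadlock needs two CPUs and unlucky timing to trigger, which is why the validator flags the dependency the moment it is created instead of waiting for it to bite.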
+ * + * Then if all the validations pass, we add the forwards and backwards + * dependency. + */ +static int +check_prev_add(struct task_struct *curr, struct held_lock *prev, + struct held_lock *next) +{ + struct lock_list *entry; + int ret; + + /* + * Prove that the new <prev> -> <next> dependency would not + * create a circular dependency in the graph. (We do this by + * forward-recursing into the graph starting at <next>, and + * checking whether we can reach <prev>.) + * + * We are using global variables to control the recursion, to + * keep the stackframe size of the recursive functions low: + */ + check_source = next; + check_target = prev; + if (!(check_noncircular(next->class, 0))) + return print_circular_bug_tail(); + +#ifdef CONFIG_TRACE_IRQFLAGS + /* + * Prove that the new dependency does not connect a hardirq-safe + * lock with a hardirq-unsafe lock - to achieve this we search + * the backwards-subgraph starting at <prev>, and the + * forwards-subgraph starting at <next>: + */ + if (!check_usage(curr, prev, next, LOCK_USED_IN_HARDIRQ, + LOCK_ENABLED_HARDIRQS, "hard")) + return 0; + + /* + * Prove that the new dependency does not connect a hardirq-safe-read + * lock with a hardirq-unsafe lock - to achieve this we search + * the backwards-subgraph starting at <prev>, and the + * forwards-subgraph starting at <next>: + */ + if (!check_usage(curr, prev, next, LOCK_USED_IN_HARDIRQ_READ, + LOCK_ENABLED_HARDIRQS, "hard-read")) + return 0; + + /* + * Prove that the new dependency does not connect a softirq-safe + * lock with a softirq-unsafe lock - to achieve this we search + * the backwards-subgraph starting at <prev>, and the + * forwards-subgraph starting at <next>: + */ + if (!check_usage(curr, prev, next, LOCK_USED_IN_SOFTIRQ, + LOCK_ENABLED_SOFTIRQS, "soft")) + return 0; + /* + * Prove that the new dependency does not connect a softirq-safe-read + * lock with a softirq-unsafe lock - to achieve this we search + * the backwards-subgraph starting at <prev>, and the + * forwards-subgraph starting at <next>: + */ + if (!check_usage(curr, prev, next, LOCK_USED_IN_SOFTIRQ_READ, + LOCK_ENABLED_SOFTIRQS, "soft")) + return 0; +#endif + /* + * For recursive read-locks we do all the dependency checks, + * but we dont store read-triggered dependencies (only + * write-triggered dependencies). This ensures that only the + * write-side dependencies matter, and that if for example a + * write-lock never takes any other locks, then the reads are + * equivalent to a NOP. + */ + if (next->read == 2 || prev->read == 2) + return 1; + /* + * Is the <prev> -> <next> dependency already present? + * + * (this may occur even though this is a new chain: consider + * e.g. the L1 -> L2 -> L3 -> L4 and the L5 -> L1 -> L2 -> L3 + * chains - the second one will be new, but L1 already has + * L2 added to its dependency list, due to the first chain.) + */ + list_for_each_entry(entry, &prev->class->locks_after, entry) { + if (entry->class == next->class) + return 2; + } + + /* + * Ok, all validations passed, add the new lock + * to the previous lock's dependency list: + */ + ret = add_lock_to_list(prev->class, next->class, + &prev->class->locks_after, next->acquire_ip); + if (!ret) + return 0; + /* + * Return value of 2 signals 'dependency already added', + * in that case we dont have to add the backlink either. 
+ */ + if (ret == 2) + return 2; + ret = add_lock_to_list(next->class, prev->class, + &next->class->locks_before, next->acquire_ip); + + /* + * Debugging printouts: + */ + if (verbose(prev->class) || verbose(next->class)) { + __raw_spin_unlock(&hash_lock); + printk("\n new dependency: "); + print_lock_name(prev->class); + printk(" => "); + print_lock_name(next->class); + printk("\n"); + dump_stack(); + __raw_spin_lock(&hash_lock); + } + return 1; +} + +/* + * Add the dependency to all directly-previous locks that are 'relevant'. + * The ones that are relevant are (in increasing distance from curr): + * all consecutive trylock entries and the final non-trylock entry - or + * the end of this context's lock-chain - whichever comes first. + */ +static int +check_prevs_add(struct task_struct *curr, struct held_lock *next) +{ + int depth = curr->lockdep_depth; + struct held_lock *hlock; + + /* + * Debugging checks. + * + * Depth must not be zero for a non-head lock: + */ + if (!depth) + goto out_bug; + /* + * At least two relevant locks must exist for this + * to be a head: + */ + if (curr->held_locks[depth].irq_context != + curr->held_locks[depth-1].irq_context) + goto out_bug; + + for (;;) { + hlock = curr->held_locks + depth-1; + /* + * Only non-recursive-read entries get new dependencies + * added: + */ + if (hlock->read != 2) { + check_prev_add(curr, hlock, next); + /* + * Stop after the first non-trylock entry, + * as non-trylock entries have added their + * own direct dependencies already, so this + * lock is connected to them indirectly: + */ + if (!hlock->trylock) + break; + } + depth--; + /* + * End of lock-stack? + */ + if (!depth) + break; + /* + * Stop the search if we cross into another context: + */ + if (curr->held_locks[depth].irq_context != + curr->held_locks[depth-1].irq_context) + break; + } + return 1; +out_bug: + __raw_spin_unlock(&hash_lock); + DEBUG_LOCKS_WARN_ON(1); + + return 0; +} + + +/* + * Is this the address of a static object: + */ +static int static_obj(void *obj) +{ + unsigned long start = (unsigned long) &_stext, + end = (unsigned long) &_end, + addr = (unsigned long) obj; +#ifdef CONFIG_SMP + int i; +#endif + + /* + * static variable? + */ + if ((addr >= start) && (addr < end)) + return 1; + +#ifdef CONFIG_SMP + /* + * percpu var? + */ + for_each_possible_cpu(i) { + start = (unsigned long) &__per_cpu_start + per_cpu_offset(i); + end = (unsigned long) &__per_cpu_end + per_cpu_offset(i); + + if ((addr >= start) && (addr < end)) + return 1; + } +#endif + + /* + * module var? + */ + return is_module_address(addr); +} + +/* + * To make lock name printouts unique, we calculate a unique + * class->name_version generation counter: + */ +static int count_matching_names(struct lock_class *new_class) +{ + struct lock_class *class; + int count = 0; + + if (!new_class->name) + return 0; + + list_for_each_entry(class, &all_lock_classes, lock_entry) { + if (new_class->key - new_class->subclass == class->key) + return class->name_version; + if (class->name && !strcmp(class->name, new_class->name)) + count = max(count, class->name_version); + } + + return count + 1; +} + +extern void __error_too_big_MAX_LOCKDEP_SUBCLASSES(void); + +/* + * Register a lock's class in the hash-table, if the class is not present + * yet. Otherwise we look it up. We cache the result in the lock object + * itself, so actual lookup of the hash should be once per lock object. 
+ */ +static inline struct lock_class * +look_up_lock_class(struct lockdep_map *lock, unsigned int subclass) +{ + struct lockdep_subclass_key *key; + struct list_head *hash_head; + struct lock_class *class; + +#ifdef CONFIG_DEBUG_LOCKDEP + /* + * If the architecture calls into lockdep before initializing + * the hashes then we'll warn about it later. (we cannot printk + * right now) + */ + if (unlikely(!lockdep_initialized)) { + lockdep_init(); + lockdep_init_error = 1; + } +#endif + + /* + * Static locks do not have their class-keys yet - for them the key + * is the lock object itself: + */ + if (unlikely(!lock->key)) + lock->key = (void *)lock; + + /* + * NOTE: the class-key must be unique. For dynamic locks, a static + * lock_class_key variable is passed in through the mutex_init() + * (or spin_lock_init()) call - which acts as the key. For static + * locks we use the lock object itself as the key. + */ + if (sizeof(struct lock_class_key) > sizeof(struct lock_class)) + __error_too_big_MAX_LOCKDEP_SUBCLASSES(); + + key = lock->key->subkeys + subclass; + + hash_head = classhashentry(key); + + /* + * We can walk the hash lockfree, because the hash only + * grows, and we are careful when adding entries to the end: + */ + list_for_each_entry(class, hash_head, hash_entry) + if (class->key == key) + return class; + + return NULL; +} + +/* + * Register a lock's class in the hash-table, if the class is not present + * yet. Otherwise we look it up. We cache the result in the lock object + * itself, so actual lookup of the hash should be once per lock object. + */ +static inline struct lock_class * +register_lock_class(struct lockdep_map *lock, unsigned int subclass) +{ + struct lockdep_subclass_key *key; + struct list_head *hash_head; + struct lock_class *class; + + class = look_up_lock_class(lock, subclass); + if (likely(class)) + return class; + + /* + * Debug-check: all keys must be persistent! 
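That persistence requirement is what static_obj() enforces: a lock-class key must live in kernel or module static storage, never inside the (possibly heap-allocated) object that contains the lock. The fragment below shows the usual per-call-site static key pattern with lockdep_set_class(), the same helper the hrtimer and irq_desc hunks earlier in this diff use; it is an illustrative sketch with made-up names, not code from this patch.

        #include <linux/spinlock.h>
        #include <linux/lockdep.h>
        #include <linux/slab.h>

        /* One static key shared by every lock initialized at this site keeps
         * all the per-object locks in a single lockdep class. */
        static struct lock_class_key demo_lock_key;

        struct demo_object {
                spinlock_t lock;
                int value;
        };

        static struct demo_object *demo_object_alloc(void)
        {
                struct demo_object *obj = kmalloc(sizeof(*obj), GFP_KERNEL);

                if (!obj)
                        return NULL;
                spin_lock_init(&obj->lock);
                lockdep_set_class(&obj->lock, &demo_lock_key);
                obj->value = 0;
                return obj;
        }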
+ */ + if (!static_obj(lock->key)) { + debug_locks_off(); + printk("INFO: trying to register non-static key.\n"); + printk("the code is fine but needs lockdep annotation.\n"); + printk("turning off the locking correctness validator.\n"); + dump_stack(); + + return NULL; + } + + key = lock->key->subkeys + subclass; + hash_head = classhashentry(key); + + __raw_spin_lock(&hash_lock); + /* + * We have to do the hash-walk again, to avoid races + * with another CPU: + */ + list_for_each_entry(class, hash_head, hash_entry) + if (class->key == key) + goto out_unlock_set; + /* + * Allocate a new key from the static array, and add it to + * the hash: + */ + if (nr_lock_classes >= MAX_LOCKDEP_KEYS) { + __raw_spin_unlock(&hash_lock); + debug_locks_off(); + printk("BUG: MAX_LOCKDEP_KEYS too low!\n"); + printk("turning off the locking correctness validator.\n"); + return NULL; + } + class = lock_classes + nr_lock_classes++; + debug_atomic_inc(&nr_unused_locks); + class->key = key; + class->name = lock->name; + class->subclass = subclass; + INIT_LIST_HEAD(&class->lock_entry); + INIT_LIST_HEAD(&class->locks_before); + INIT_LIST_HEAD(&class->locks_after); + class->name_version = count_matching_names(class); + /* + * We use RCU's safe list-add method to make + * parallel walking of the hash-list safe: + */ + list_add_tail_rcu(&class->hash_entry, hash_head); + + if (verbose(class)) { + __raw_spin_unlock(&hash_lock); + printk("\nnew class %p: %s", class->key, class->name); + if (class->name_version > 1) + printk("#%d", class->name_version); + printk("\n"); + dump_stack(); + __raw_spin_lock(&hash_lock); + } +out_unlock_set: + __raw_spin_unlock(&hash_lock); + + if (!subclass) + lock->class_cache = class; + + DEBUG_LOCKS_WARN_ON(class->subclass != subclass); + + return class; +} + +/* + * Look up a dependency chain. If the key is not present yet then + * add it and return 0 - in this case the new dependency chain is + * validated. If the key is already hashed, return 1. 
+ */ +static inline int lookup_chain_cache(u64 chain_key) +{ + struct list_head *hash_head = chainhashentry(chain_key); + struct lock_chain *chain; + + DEBUG_LOCKS_WARN_ON(!irqs_disabled()); + /* + * We can walk it lock-free, because entries only get added + * to the hash: + */ + list_for_each_entry(chain, hash_head, entry) { + if (chain->chain_key == chain_key) { +cache_hit: + debug_atomic_inc(&chain_lookup_hits); + /* + * In the debugging case, force redundant checking + * by returning 1: + */ +#ifdef CONFIG_DEBUG_LOCKDEP + __raw_spin_lock(&hash_lock); + return 1; +#endif + return 0; + } + } + /* + * Allocate a new chain entry from the static array, and add + * it to the hash: + */ + __raw_spin_lock(&hash_lock); + /* + * We have to walk the chain again locked - to avoid duplicates: + */ + list_for_each_entry(chain, hash_head, entry) { + if (chain->chain_key == chain_key) { + __raw_spin_unlock(&hash_lock); + goto cache_hit; + } + } + if (unlikely(nr_lock_chains >= MAX_LOCKDEP_CHAINS)) { + __raw_spin_unlock(&hash_lock); + debug_locks_off(); + printk("BUG: MAX_LOCKDEP_CHAINS too low!\n"); + printk("turning off the locking correctness validator.\n"); + return 0; + } + chain = lock_chains + nr_lock_chains++; + chain->chain_key = chain_key; + list_add_tail_rcu(&chain->entry, hash_head); + debug_atomic_inc(&chain_lookup_misses); +#ifdef CONFIG_TRACE_IRQFLAGS + if (current->hardirq_context) + nr_hardirq_chains++; + else { + if (current->softirq_context) + nr_softirq_chains++; + else + nr_process_chains++; + } +#else + nr_process_chains++; +#endif + + return 1; +} + +/* + * We are building curr_chain_key incrementally, so double-check + * it from scratch, to make sure that it's done correctly: + */ +static void check_chain_key(struct task_struct *curr) +{ +#ifdef CONFIG_DEBUG_LOCKDEP + struct held_lock *hlock, *prev_hlock = NULL; + unsigned int i, id; + u64 chain_key = 0; + + for (i = 0; i < curr->lockdep_depth; i++) { + hlock = curr->held_locks + i; + if (chain_key != hlock->prev_chain_key) { + debug_locks_off(); + printk("hm#1, depth: %u [%u], %016Lx != %016Lx\n", + curr->lockdep_depth, i, + (unsigned long long)chain_key, + (unsigned long long)hlock->prev_chain_key); + WARN_ON(1); + return; + } + id = hlock->class - lock_classes; + DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS); + if (prev_hlock && (prev_hlock->irq_context != + hlock->irq_context)) + chain_key = 0; + chain_key = iterate_chain_key(chain_key, id); + prev_hlock = hlock; + } + if (chain_key != curr->curr_chain_key) { + debug_locks_off(); + printk("hm#2, depth: %u [%u], %016Lx != %016Lx\n", + curr->lockdep_depth, i, + (unsigned long long)chain_key, + (unsigned long long)curr->curr_chain_key); + WARN_ON(1); + } +#endif +} + +#ifdef CONFIG_TRACE_IRQFLAGS + +/* + * print irq inversion bug: + */ +static int +print_irq_inversion_bug(struct task_struct *curr, struct lock_class *other, + struct held_lock *this, int forwards, + const char *irqclass) +{ + __raw_spin_unlock(&hash_lock); + debug_locks_off(); + if (debug_locks_silent) + return 0; + + printk("\n=========================================================\n"); + printk( "[ INFO: possible irq lock inversion dependency detected ]\n"); + printk( "---------------------------------------------------------\n"); + printk("%s/%d just changed the state of lock:\n", + curr->comm, curr->pid); + print_lock(this); + if (forwards) + printk("but this lock took another, %s-irq-unsafe lock in the past:\n", irqclass); + else + printk("but this lock was taken by another, %s-irq-safe lock in the past:\n", 
irqclass); + print_lock_name(other); + printk("\n\nand interrupts could create inverse lock ordering between them.\n\n"); + + printk("\nother info that might help us debug this:\n"); + lockdep_print_held_locks(curr); + + printk("\nthe first lock's dependencies:\n"); + print_lock_dependencies(this->class, 0); + + printk("\nthe second lock's dependencies:\n"); + print_lock_dependencies(other, 0); + + printk("\nstack backtrace:\n"); + dump_stack(); + + return 0; +} + +/* + * Prove that in the forwards-direction subgraph starting at <this> + * there is no lock matching <mask>: + */ +static int +check_usage_forwards(struct task_struct *curr, struct held_lock *this, + enum lock_usage_bit bit, const char *irqclass) +{ + int ret; + + find_usage_bit = bit; + /* fills in <forwards_match> */ + ret = find_usage_forwards(this->class, 0); + if (!ret || ret == 1) + return ret; + + return print_irq_inversion_bug(curr, forwards_match, this, 1, irqclass); +} + +/* + * Prove that in the backwards-direction subgraph starting at <this> + * there is no lock matching <mask>: + */ +static int +check_usage_backwards(struct task_struct *curr, struct held_lock *this, + enum lock_usage_bit bit, const char *irqclass) +{ + int ret; + + find_usage_bit = bit; + /* fills in <backwards_match> */ + ret = find_usage_backwards(this->class, 0); + if (!ret || ret == 1) + return ret; + + return print_irq_inversion_bug(curr, backwards_match, this, 0, irqclass); +} + +static inline void print_irqtrace_events(struct task_struct *curr) +{ + printk("irq event stamp: %u\n", curr->irq_events); + printk("hardirqs last enabled at (%u): ", curr->hardirq_enable_event); + print_ip_sym(curr->hardirq_enable_ip); + printk("hardirqs last disabled at (%u): ", curr->hardirq_disable_event); + print_ip_sym(curr->hardirq_disable_ip); + printk("softirqs last enabled at (%u): ", curr->softirq_enable_event); + print_ip_sym(curr->softirq_enable_ip); + printk("softirqs last disabled at (%u): ", curr->softirq_disable_event); + print_ip_sym(curr->softirq_disable_ip); +} + +#else +static inline void print_irqtrace_events(struct task_struct *curr) +{ +} +#endif + +static int +print_usage_bug(struct task_struct *curr, struct held_lock *this, + enum lock_usage_bit prev_bit, enum lock_usage_bit new_bit) +{ + __raw_spin_unlock(&hash_lock); + debug_locks_off(); + if (debug_locks_silent) + return 0; + + printk("\n=================================\n"); + printk( "[ INFO: inconsistent lock state ]\n"); + printk( "---------------------------------\n"); + + printk("inconsistent {%s} -> {%s} usage.\n", + usage_str[prev_bit], usage_str[new_bit]); + + printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] takes:\n", + curr->comm, curr->pid, + trace_hardirq_context(curr), hardirq_count() >> HARDIRQ_SHIFT, + trace_softirq_context(curr), softirq_count() >> SOFTIRQ_SHIFT, + trace_hardirqs_enabled(curr), + trace_softirqs_enabled(curr)); + print_lock(this); + + printk("{%s} state was registered at:\n", usage_str[prev_bit]); + print_stack_trace(this->class->usage_traces + prev_bit, 1); + + print_irqtrace_events(curr); + printk("\nother info that might help us debug this:\n"); + lockdep_print_held_locks(curr); + + printk("\nstack backtrace:\n"); + dump_stack(); + + return 0; +} + +/* + * Print out an error if an invalid bit is set: + */ +static inline int +valid_state(struct task_struct *curr, struct held_lock *this, + enum lock_usage_bit new_bit, enum lock_usage_bit bad_bit) +{ + if (unlikely(this->class->usage_mask & (1 << bad_bit))) + return print_usage_bug(curr, this, bad_bit, 
new_bit); + return 1; +} + +#define STRICT_READ_CHECKS 1 + +/* + * Mark a lock with a usage bit, and validate the state transition: + */ +static int mark_lock(struct task_struct *curr, struct held_lock *this, + enum lock_usage_bit new_bit, unsigned long ip) +{ + unsigned int new_mask = 1 << new_bit, ret = 1; + + /* + * If already set then do not dirty the cacheline, + * nor do any checks: + */ + if (likely(this->class->usage_mask & new_mask)) + return 1; + + __raw_spin_lock(&hash_lock); + /* + * Make sure we didnt race: + */ + if (unlikely(this->class->usage_mask & new_mask)) { + __raw_spin_unlock(&hash_lock); + return 1; + } + + this->class->usage_mask |= new_mask; + +#ifdef CONFIG_TRACE_IRQFLAGS + if (new_bit == LOCK_ENABLED_HARDIRQS || + new_bit == LOCK_ENABLED_HARDIRQS_READ) + ip = curr->hardirq_enable_ip; + else if (new_bit == LOCK_ENABLED_SOFTIRQS || + new_bit == LOCK_ENABLED_SOFTIRQS_READ) + ip = curr->softirq_enable_ip; +#endif + if (!save_trace(this->class->usage_traces + new_bit)) + return 0; + + switch (new_bit) { +#ifdef CONFIG_TRACE_IRQFLAGS + case LOCK_USED_IN_HARDIRQ: + if (!valid_state(curr, this, new_bit, LOCK_ENABLED_HARDIRQS)) + return 0; + if (!valid_state(curr, this, new_bit, + LOCK_ENABLED_HARDIRQS_READ)) + return 0; + /* + * just marked it hardirq-safe, check that this lock + * took no hardirq-unsafe lock in the past: + */ + if (!check_usage_forwards(curr, this, + LOCK_ENABLED_HARDIRQS, "hard")) + return 0; +#if STRICT_READ_CHECKS + /* + * just marked it hardirq-safe, check that this lock + * took no hardirq-unsafe-read lock in the past: + */ + if (!check_usage_forwards(curr, this, + LOCK_ENABLED_HARDIRQS_READ, "hard-read")) + return 0; +#endif + if (hardirq_verbose(this->class)) + ret = 2; + break; + case LOCK_USED_IN_SOFTIRQ: + if (!valid_state(curr, this, new_bit, LOCK_ENABLED_SOFTIRQS)) + return 0; + if (!valid_state(curr, this, new_bit, + LOCK_ENABLED_SOFTIRQS_READ)) + return 0; + /* + * just marked it softirq-safe, check that this lock + * took no softirq-unsafe lock in the past: + */ + if (!check_usage_forwards(curr, this, + LOCK_ENABLED_SOFTIRQS, "soft")) + return 0; +#if STRICT_READ_CHECKS + /* + * just marked it softirq-safe, check that this lock + * took no softirq-unsafe-read lock in the past: + */ + if (!check_usage_forwards(curr, this, + LOCK_ENABLED_SOFTIRQS_READ, "soft-read")) + return 0; +#endif + if (softirq_verbose(this->class)) + ret = 2; + break; + case LOCK_USED_IN_HARDIRQ_READ: + if (!valid_state(curr, this, new_bit, LOCK_ENABLED_HARDIRQS)) + return 0; + /* + * just marked it hardirq-read-safe, check that this lock + * took no hardirq-unsafe lock in the past: + */ + if (!check_usage_forwards(curr, this, + LOCK_ENABLED_HARDIRQS, "hard")) + return 0; + if (hardirq_verbose(this->class)) + ret = 2; + break; + case LOCK_USED_IN_SOFTIRQ_READ: + if (!valid_state(curr, this, new_bit, LOCK_ENABLED_SOFTIRQS)) + return 0; + /* + * just marked it softirq-read-safe, check that this lock + * took no softirq-unsafe lock in the past: + */ + if (!check_usage_forwards(curr, this, + LOCK_ENABLED_SOFTIRQS, "soft")) + return 0; + if (softirq_verbose(this->class)) + ret = 2; + break; + case LOCK_ENABLED_HARDIRQS: + if (!valid_state(curr, this, new_bit, LOCK_USED_IN_HARDIRQ)) + return 0; + if (!valid_state(curr, this, new_bit, + LOCK_USED_IN_HARDIRQ_READ)) + return 0; + /* + * just marked it hardirq-unsafe, check that no hardirq-safe + * lock in the system ever took it in the past: + */ + if (!check_usage_backwards(curr, this, + LOCK_USED_IN_HARDIRQ, "hard")) + 
return 0; +#if STRICT_READ_CHECKS + /* + * just marked it hardirq-unsafe, check that no + * hardirq-safe-read lock in the system ever took + * it in the past: + */ + if (!check_usage_backwards(curr, this, + LOCK_USED_IN_HARDIRQ_READ, "hard-read")) + return 0; +#endif + if (hardirq_verbose(this->class)) + ret = 2; + break; + case LOCK_ENABLED_SOFTIRQS: + if (!valid_state(curr, this, new_bit, LOCK_USED_IN_SOFTIRQ)) + return 0; + if (!valid_state(curr, this, new_bit, + LOCK_USED_IN_SOFTIRQ_READ)) + return 0; + /* + * just marked it softirq-unsafe, check that no softirq-safe + * lock in the system ever took it in the past: + */ + if (!check_usage_backwards(curr, this, + LOCK_USED_IN_SOFTIRQ, "soft")) + return 0; +#if STRICT_READ_CHECKS + /* + * just marked it softirq-unsafe, check that no + * softirq-safe-read lock in the system ever took + * it in the past: + */ + if (!check_usage_backwards(curr, this, + LOCK_USED_IN_SOFTIRQ_READ, "soft-read")) + return 0; +#endif + if (softirq_verbose(this->class)) + ret = 2; + break; + case LOCK_ENABLED_HARDIRQS_READ: + if (!valid_state(curr, this, new_bit, LOCK_USED_IN_HARDIRQ)) + return 0; +#if STRICT_READ_CHECKS + /* + * just marked it hardirq-read-unsafe, check that no + * hardirq-safe lock in the system ever took it in the past: + */ + if (!check_usage_backwards(curr, this, + LOCK_USED_IN_HARDIRQ, "hard")) + return 0; +#endif + if (hardirq_verbose(this->class)) + ret = 2; + break; + case LOCK_ENABLED_SOFTIRQS_READ: + if (!valid_state(curr, this, new_bit, LOCK_USED_IN_SOFTIRQ)) + return 0; +#if STRICT_READ_CHECKS + /* + * just marked it softirq-read-unsafe, check that no + * softirq-safe lock in the system ever took it in the past: + */ + if (!check_usage_backwards(curr, this, + LOCK_USED_IN_SOFTIRQ, "soft")) + return 0; +#endif + if (softirq_verbose(this->class)) + ret = 2; + break; +#endif + case LOCK_USED: + /* + * Add it to the global list of classes: + */ + list_add_tail_rcu(&this->class->lock_entry, &all_lock_classes); + debug_atomic_dec(&nr_unused_locks); + break; + default: + debug_locks_off(); + WARN_ON(1); + return 0; + } + + __raw_spin_unlock(&hash_lock); + + /* + * We must printk outside of the hash_lock: + */ + if (ret == 2) { + printk("\nmarked lock as {%s}:\n", usage_str[new_bit]); + print_lock(this); + print_irqtrace_events(curr); + dump_stack(); + } + + return ret; +} + +#ifdef CONFIG_TRACE_IRQFLAGS +/* + * Mark all held locks with a usage bit: + */ +static int +mark_held_locks(struct task_struct *curr, int hardirq, unsigned long ip) +{ + enum lock_usage_bit usage_bit; + struct held_lock *hlock; + int i; + + for (i = 0; i < curr->lockdep_depth; i++) { + hlock = curr->held_locks + i; + + if (hardirq) { + if (hlock->read) + usage_bit = LOCK_ENABLED_HARDIRQS_READ; + else + usage_bit = LOCK_ENABLED_HARDIRQS; + } else { + if (hlock->read) + usage_bit = LOCK_ENABLED_SOFTIRQS_READ; + else + usage_bit = LOCK_ENABLED_SOFTIRQS; + } + if (!mark_lock(curr, hlock, usage_bit, ip)) + return 0; + } + + return 1; +} + +/* + * Debugging helper: via this flag we know that we are in + * 'early bootup code', and will warn about any invalid irqs-on event: + */ +static int early_boot_irqs_enabled; + +void early_boot_irqs_off(void) +{ + early_boot_irqs_enabled = 0; +} + +void early_boot_irqs_on(void) +{ + early_boot_irqs_enabled = 1; +} + +/* + * Hardirqs will be enabled: + */ +void trace_hardirqs_on(void) +{ + struct task_struct *curr = current; + unsigned long ip; + + if (unlikely(!debug_locks || current->lockdep_recursion)) + return; + + if 
(DEBUG_LOCKS_WARN_ON(unlikely(!early_boot_irqs_enabled))) + return; + + if (unlikely(curr->hardirqs_enabled)) { + debug_atomic_inc(&redundant_hardirqs_on); + return; + } + /* we'll do an OFF -> ON transition: */ + curr->hardirqs_enabled = 1; + ip = (unsigned long) __builtin_return_address(0); + + if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) + return; + if (DEBUG_LOCKS_WARN_ON(current->hardirq_context)) + return; + /* + * We are going to turn hardirqs on, so set the + * usage bit for all held locks: + */ + if (!mark_held_locks(curr, 1, ip)) + return; + /* + * If we have softirqs enabled, then set the usage + * bit for all held locks. (disabled hardirqs prevented + * this bit from being set before) + */ + if (curr->softirqs_enabled) + if (!mark_held_locks(curr, 0, ip)) + return; + + curr->hardirq_enable_ip = ip; + curr->hardirq_enable_event = ++curr->irq_events; + debug_atomic_inc(&hardirqs_on_events); +} + +EXPORT_SYMBOL(trace_hardirqs_on); + +/* + * Hardirqs were disabled: + */ +void trace_hardirqs_off(void) +{ + struct task_struct *curr = current; + + if (unlikely(!debug_locks || current->lockdep_recursion)) + return; + + if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) + return; + + if (curr->hardirqs_enabled) { + /* + * We have done an ON -> OFF transition: + */ + curr->hardirqs_enabled = 0; + curr->hardirq_disable_ip = _RET_IP_; + curr->hardirq_disable_event = ++curr->irq_events; + debug_atomic_inc(&hardirqs_off_events); + } else + debug_atomic_inc(&redundant_hardirqs_off); +} + +EXPORT_SYMBOL(trace_hardirqs_off); + +/* + * Softirqs will be enabled: + */ +void trace_softirqs_on(unsigned long ip) +{ + struct task_struct *curr = current; + + if (unlikely(!debug_locks)) + return; + + if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) + return; + + if (curr->softirqs_enabled) { + debug_atomic_inc(&redundant_softirqs_on); + return; + } + + /* + * We'll do an OFF -> ON transition: + */ + curr->softirqs_enabled = 1; + curr->softirq_enable_ip = ip; + curr->softirq_enable_event = ++curr->irq_events; + debug_atomic_inc(&softirqs_on_events); + /* + * We are going to turn softirqs on, so set the + * usage bit for all held locks, if hardirqs are + * enabled too: + */ + if (curr->hardirqs_enabled) + mark_held_locks(curr, 0, ip); +} + +/* + * Softirqs were disabled: + */ +void trace_softirqs_off(unsigned long ip) +{ + struct task_struct *curr = current; + + if (unlikely(!debug_locks)) + return; + + if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) + return; + + if (curr->softirqs_enabled) { + /* + * We have done an ON -> OFF transition: + */ + curr->softirqs_enabled = 0; + curr->softirq_disable_ip = ip; + curr->softirq_disable_event = ++curr->irq_events; + debug_atomic_inc(&softirqs_off_events); + DEBUG_LOCKS_WARN_ON(!softirq_count()); + } else + debug_atomic_inc(&redundant_softirqs_off); +} + +#endif + +/* + * Initialize a lock instance's lock-class mapping info: + */ +void lockdep_init_map(struct lockdep_map *lock, const char *name, + struct lock_class_key *key) +{ + if (unlikely(!debug_locks)) + return; + + if (DEBUG_LOCKS_WARN_ON(!key)) + return; + if (DEBUG_LOCKS_WARN_ON(!name)) + return; + /* + * Sanity check, the lock-class key must be persistent: + */ + if (!static_obj(key)) { + printk("BUG: key %p not in .data!\n", key); + DEBUG_LOCKS_WARN_ON(1); + return; + } + lock->name = name; + lock->key = key; + lock->class_cache = NULL; +} + +EXPORT_SYMBOL_GPL(lockdep_init_map); + +/* + * This gets called for every mutex_lock*()/spin_lock*() operation. 
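
The usage bits that mark_lock() validates above are easiest to see on a concrete pattern. A hypothetical illustration, not part of this patch and with all names invented: lock_a becomes hardirq-safe because an interrupt handler takes it, lock_b becomes hardirq-unsafe because it is taken with hardirqs enabled, and the moment lock_a is acquired while lock_b is held the validator reports a possible irq-lock inversion.

        #include <linux/spinlock.h>
        #include <linux/interrupt.h>

        /* Hypothetical locks and functions, for illustration only. */
        static DEFINE_SPINLOCK(lock_a);         /* taken from a hardirq handler below */
        static DEFINE_SPINLOCK(lock_b);         /* taken with hardirqs enabled below  */

        static irqreturn_t my_isr(int irq, void *dev_id, struct pt_regs *regs)
        {
                spin_lock(&lock_a);             /* lock_a becomes LOCK_USED_IN_HARDIRQ */
                spin_unlock(&lock_a);
                return IRQ_HANDLED;
        }

        static void my_process_path(void)
        {
                unsigned long flags;

                spin_lock(&lock_b);             /* irqs still on here: lock_b becomes
                                                 * LOCK_ENABLED_HARDIRQS              */
                spin_lock_irqsave(&lock_a, flags);
                                                /* hardirq-safe lock_a now depends on
                                                 * hardirq-unsafe lock_b: reported as
                                                 * a possible irq-lock inversion      */
                spin_unlock_irqrestore(&lock_a, flags);
                spin_unlock(&lock_b);
        }

This is the kind of bug that only deadlocks when the interrupt fires at exactly the wrong moment, which is why the state is recorded per lock class instead of waiting for the bad interleaving to happen at runtime.
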
+ * We maintain the dependency maps and validate the locking attempt: + */ +static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, + int trylock, int read, int check, int hardirqs_off, + unsigned long ip) +{ + struct task_struct *curr = current; + struct lock_class *class = NULL; + struct held_lock *hlock; + unsigned int depth, id; + int chain_head = 0; + u64 chain_key; + + if (unlikely(!debug_locks)) + return 0; + + if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) + return 0; + + if (unlikely(subclass >= MAX_LOCKDEP_SUBCLASSES)) { + debug_locks_off(); + printk("BUG: MAX_LOCKDEP_SUBCLASSES too low!\n"); + printk("turning off the locking correctness validator.\n"); + return 0; + } + + if (!subclass) + class = lock->class_cache; + /* + * Not cached yet or subclass? + */ + if (unlikely(!class)) { + class = register_lock_class(lock, subclass); + if (!class) + return 0; + } + debug_atomic_inc((atomic_t *)&class->ops); + if (very_verbose(class)) { + printk("\nacquire class [%p] %s", class->key, class->name); + if (class->name_version > 1) + printk("#%d", class->name_version); + printk("\n"); + dump_stack(); + } + + /* + * Add the lock to the list of currently held locks. + * (we don't increase the depth just yet, up until the + * dependency checks are done) + */ + depth = curr->lockdep_depth; + if (DEBUG_LOCKS_WARN_ON(depth >= MAX_LOCK_DEPTH)) + return 0; + + hlock = curr->held_locks + depth; + + hlock->class = class; + hlock->acquire_ip = ip; + hlock->instance = lock; + hlock->trylock = trylock; + hlock->read = read; + hlock->check = check; + hlock->hardirqs_off = hardirqs_off; + + if (check != 2) + goto out_calc_hash; +#ifdef CONFIG_TRACE_IRQFLAGS + /* + * If non-trylock use in a hardirq or softirq context, then + * mark the lock as used in these contexts: + */ + if (!trylock) { + if (read) { + if (curr->hardirq_context) + if (!mark_lock(curr, hlock, + LOCK_USED_IN_HARDIRQ_READ, ip)) + return 0; + if (curr->softirq_context) + if (!mark_lock(curr, hlock, + LOCK_USED_IN_SOFTIRQ_READ, ip)) + return 0; + } else { + if (curr->hardirq_context) + if (!mark_lock(curr, hlock, LOCK_USED_IN_HARDIRQ, ip)) + return 0; + if (curr->softirq_context) + if (!mark_lock(curr, hlock, LOCK_USED_IN_SOFTIRQ, ip)) + return 0; + } + } + if (!hardirqs_off) { + if (read) { + if (!mark_lock(curr, hlock, + LOCK_ENABLED_HARDIRQS_READ, ip)) + return 0; + if (curr->softirqs_enabled) + if (!mark_lock(curr, hlock, + LOCK_ENABLED_SOFTIRQS_READ, ip)) + return 0; + } else { + if (!mark_lock(curr, hlock, + LOCK_ENABLED_HARDIRQS, ip)) + return 0; + if (curr->softirqs_enabled) + if (!mark_lock(curr, hlock, + LOCK_ENABLED_SOFTIRQS, ip)) + return 0; + } + } +#endif + /* mark it as used: */ + if (!mark_lock(curr, hlock, LOCK_USED, ip)) + return 0; +out_calc_hash: + /* + * Calculate the chain hash: it's the combined hash of all the + * lock keys along the dependency chain. We save the hash value + * at every step so that we can get the current hash easily + * after unlock. The chain hash is then used to cache dependency + * results. + * + * The 'key ID' is the most compact key value to drive + * the hash, not class->key. 
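
For reference, this is how a home-grown locking primitive could be wired into the validator so that its acquisitions flow through __lock_acquire() above. A sketch only: my_lock and friends are invented, and only lockdep_init_map(), lock_acquire() and lock_release() are interfaces from this patch; the argument meanings (subclass, trylock, read, check) follow the __lock_acquire() parameters.

        #include <linux/lockdep.h>
        #include <asm/atomic.h>

        /* Hypothetical home-grown lock; everything except the lockdep calls
         * is made up for illustration. */
        struct my_lock {
                atomic_t                taken;
                struct lockdep_map      dep_map;
        };

        #define my_lock_init(l)                                         \
        do {                                                            \
                static struct lock_class_key __key;                     \
                                                                        \
                atomic_set(&(l)->taken, 0);                             \
                lockdep_init_map(&(l)->dep_map, #l, &__key);            \
        } while (0)

        static inline void my_lock(struct my_lock *l)
        {
                /* subclass 0, not a trylock, not a read-lock, full checks (2): */
                lock_acquire(&l->dep_map, 0, 0, 0, 2, _RET_IP_);
                while (atomic_cmpxchg(&l->taken, 0, 1) != 0)
                        cpu_relax();
        }

        static inline void my_unlock(struct my_lock *l)
        {
                lock_release(&l->dep_map, 1, _RET_IP_);  /* 1: nested (LIFO) release */
                atomic_set(&l->taken, 0);
        }
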
+ */ + id = class - lock_classes; + if (DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS)) + return 0; + + chain_key = curr->curr_chain_key; + if (!depth) { + if (DEBUG_LOCKS_WARN_ON(chain_key != 0)) + return 0; + chain_head = 1; + } + + hlock->prev_chain_key = chain_key; + +#ifdef CONFIG_TRACE_IRQFLAGS + /* + * Keep track of points where we cross into an interrupt context: + */ + hlock->irq_context = 2*(curr->hardirq_context ? 1 : 0) + + curr->softirq_context; + if (depth) { + struct held_lock *prev_hlock; + + prev_hlock = curr->held_locks + depth-1; + /* + * If we cross into another context, reset the + * hash key (this also prevents the checking and the + * adding of the dependency to 'prev'): + */ + if (prev_hlock->irq_context != hlock->irq_context) { + chain_key = 0; + chain_head = 1; + } + } +#endif + chain_key = iterate_chain_key(chain_key, id); + curr->curr_chain_key = chain_key; + + /* + * Trylock needs to maintain the stack of held locks, but it + * does not add new dependencies, because trylock can be done + * in any order. + * + * We look up the chain_key and do the O(N^2) check and update of + * the dependencies only if this is a new dependency chain. + * (If lookup_chain_cache() returns with 1 it acquires + * hash_lock for us) + */ + if (!trylock && (check == 2) && lookup_chain_cache(chain_key)) { + /* + * Check whether last held lock: + * + * - is irq-safe, if this lock is irq-unsafe + * - is softirq-safe, if this lock is hardirq-unsafe + * + * And check whether the new lock's dependency graph + * could lead back to the previous lock. + * + * Any of these scenarios could lead to a deadlock. If + * all validations pass, the new dependency is added below: + */ + int ret = check_deadlock(curr, hlock, lock, read); + + if (!ret) + return 0; + /* + * Mark recursive read, as we jump over it when + * building dependencies (just like we jump over + * trylock entries): + */ + if (ret == 2) + hlock->read = 2; + /* + * Add dependency only if this lock is not the head + * of the chain, and if it's not a secondary read-lock: + */ + if (!chain_head && ret != 2) + if (!check_prevs_add(curr, hlock)) + return 0; + __raw_spin_unlock(&hash_lock); + } + curr->lockdep_depth++; + check_chain_key(curr); + if (unlikely(curr->lockdep_depth >= MAX_LOCK_DEPTH)) { + debug_locks_off(); + printk("BUG: MAX_LOCK_DEPTH too low!\n"); + printk("turning off the locking correctness validator.\n"); + return 0; + } + if (unlikely(curr->lockdep_depth > max_lockdep_depth)) + max_lockdep_depth = curr->lockdep_depth; + + return 1; +} + +static int +print_unlock_inbalance_bug(struct task_struct *curr, struct lockdep_map *lock, + unsigned long ip) +{ + if (!debug_locks_off()) + return 0; + if (debug_locks_silent) + return 0; + + printk("\n=====================================\n"); + printk( "[ BUG: bad unlock balance detected! 
]\n"); + printk( "-------------------------------------\n"); + printk("%s/%d is trying to release lock (", + curr->comm, curr->pid); + print_lockdep_cache(lock); + printk(") at:\n"); + print_ip_sym(ip); + printk("but there are no more locks to release!\n"); + printk("\nother info that might help us debug this:\n"); + lockdep_print_held_locks(curr); + + printk("\nstack backtrace:\n"); + dump_stack(); + + return 0; +} + +/* + * Common debugging checks for both nested and non-nested unlock: + */ +static int check_unlock(struct task_struct *curr, struct lockdep_map *lock, + unsigned long ip) +{ + if (unlikely(!debug_locks)) + return 0; + if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) + return 0; + + if (curr->lockdep_depth <= 0) + return print_unlock_inbalance_bug(curr, lock, ip); + + return 1; +} + +/* + * Remove the lock to the list of currently held locks in a + * potentially non-nested (out of order) manner. This is a + * relatively rare operation, as all the unlock APIs default + * to nested mode (which uses lock_release()): + */ +static int +lock_release_non_nested(struct task_struct *curr, + struct lockdep_map *lock, unsigned long ip) +{ + struct held_lock *hlock, *prev_hlock; + unsigned int depth; + int i; + + /* + * Check whether the lock exists in the current stack + * of held locks: + */ + depth = curr->lockdep_depth; + if (DEBUG_LOCKS_WARN_ON(!depth)) + return 0; + + prev_hlock = NULL; + for (i = depth-1; i >= 0; i--) { + hlock = curr->held_locks + i; + /* + * We must not cross into another context: + */ + if (prev_hlock && prev_hlock->irq_context != hlock->irq_context) + break; + if (hlock->instance == lock) + goto found_it; + prev_hlock = hlock; + } + return print_unlock_inbalance_bug(curr, lock, ip); + +found_it: + /* + * We have the right lock to unlock, 'hlock' points to it. + * Now we remove it from the stack, and add back the other + * entries (if any), recalculating the hash along the way: + */ + curr->lockdep_depth = i; + curr->curr_chain_key = hlock->prev_chain_key; + + for (i++; i < depth; i++) { + hlock = curr->held_locks + i; + if (!__lock_acquire(hlock->instance, + hlock->class->subclass, hlock->trylock, + hlock->read, hlock->check, hlock->hardirqs_off, + hlock->acquire_ip)) + return 0; + } + + if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth - 1)) + return 0; + return 1; +} + +/* + * Remove the lock to the list of currently held locks - this gets + * called on mutex_unlock()/spin_unlock*() (or on a failed + * mutex_lock_interruptible()). This is done for unlocks that nest + * perfectly. (i.e. the current top of the lock-stack is unlocked) + */ +static int lock_release_nested(struct task_struct *curr, + struct lockdep_map *lock, unsigned long ip) +{ + struct held_lock *hlock; + unsigned int depth; + + /* + * Pop off the top of the lock stack: + */ + depth = curr->lockdep_depth - 1; + hlock = curr->held_locks + depth; + + /* + * Is the unlock non-nested: + */ + if (hlock->instance != lock) + return lock_release_non_nested(curr, lock, ip); + curr->lockdep_depth--; + + if (DEBUG_LOCKS_WARN_ON(!depth && (hlock->prev_chain_key != 0))) + return 0; + + curr->curr_chain_key = hlock->prev_chain_key; + +#ifdef CONFIG_DEBUG_LOCKDEP + hlock->prev_chain_key = 0; + hlock->class = NULL; + hlock->acquire_ip = 0; + hlock->irq_context = 0; +#endif + return 1; +} + +/* + * Remove the lock to the list of currently held locks - this gets + * called on mutex_unlock()/spin_unlock*() (or on a failed + * mutex_lock_interruptible()). This is done for unlocks that nest + * perfectly. (i.e. 
the current top of the lock-stack is unlocked) + */ +static void +__lock_release(struct lockdep_map *lock, int nested, unsigned long ip) +{ + struct task_struct *curr = current; + + if (!check_unlock(curr, lock, ip)) + return; + + if (nested) { + if (!lock_release_nested(curr, lock, ip)) + return; + } else { + if (!lock_release_non_nested(curr, lock, ip)) + return; + } + + check_chain_key(curr); +} + +/* + * Check whether we follow the irq-flags state precisely: + */ +static void check_flags(unsigned long flags) +{ +#if defined(CONFIG_DEBUG_LOCKDEP) && defined(CONFIG_TRACE_IRQFLAGS) + if (!debug_locks) + return; + + if (irqs_disabled_flags(flags)) + DEBUG_LOCKS_WARN_ON(current->hardirqs_enabled); + else + DEBUG_LOCKS_WARN_ON(!current->hardirqs_enabled); + + /* + * We dont accurately track softirq state in e.g. + * hardirq contexts (such as on 4KSTACKS), so only + * check if not in hardirq contexts: + */ + if (!hardirq_count()) { + if (softirq_count()) + DEBUG_LOCKS_WARN_ON(current->softirqs_enabled); + else + DEBUG_LOCKS_WARN_ON(!current->softirqs_enabled); + } + + if (!debug_locks) + print_irqtrace_events(current); +#endif +} + +/* + * We are not always called with irqs disabled - do that here, + * and also avoid lockdep recursion: + */ +void lock_acquire(struct lockdep_map *lock, unsigned int subclass, + int trylock, int read, int check, unsigned long ip) +{ + unsigned long flags; + + if (unlikely(current->lockdep_recursion)) + return; + + raw_local_irq_save(flags); + check_flags(flags); + + current->lockdep_recursion = 1; + __lock_acquire(lock, subclass, trylock, read, check, + irqs_disabled_flags(flags), ip); + current->lockdep_recursion = 0; + raw_local_irq_restore(flags); +} + +EXPORT_SYMBOL_GPL(lock_acquire); + +void lock_release(struct lockdep_map *lock, int nested, unsigned long ip) +{ + unsigned long flags; + + if (unlikely(current->lockdep_recursion)) + return; + + raw_local_irq_save(flags); + check_flags(flags); + current->lockdep_recursion = 1; + __lock_release(lock, nested, ip); + current->lockdep_recursion = 0; + raw_local_irq_restore(flags); +} + +EXPORT_SYMBOL_GPL(lock_release); + +/* + * Used by the testsuite, sanitize the validator state + * after a simulated failure: + */ + +void lockdep_reset(void) +{ + unsigned long flags; + + raw_local_irq_save(flags); + current->curr_chain_key = 0; + current->lockdep_depth = 0; + current->lockdep_recursion = 0; + memset(current->held_locks, 0, MAX_LOCK_DEPTH*sizeof(struct held_lock)); + nr_hardirq_chains = 0; + nr_softirq_chains = 0; + nr_process_chains = 0; + debug_locks = 1; + raw_local_irq_restore(flags); +} + +static void zap_class(struct lock_class *class) +{ + int i; + + /* + * Remove all dependencies this lock is + * involved in: + */ + for (i = 0; i < nr_list_entries; i++) { + if (list_entries[i].class == class) + list_del_rcu(&list_entries[i].entry); + } + /* + * Unhash the class and remove it from the all_lock_classes list: + */ + list_del_rcu(&class->hash_entry); + list_del_rcu(&class->lock_entry); + +} + +static inline int within(void *addr, void *start, unsigned long size) +{ + return addr >= start && addr < start + size; +} + +void lockdep_free_key_range(void *start, unsigned long size) +{ + struct lock_class *class, *next; + struct list_head *head; + unsigned long flags; + int i; + + raw_local_irq_save(flags); + __raw_spin_lock(&hash_lock); + + /* + * Unhash all classes that were created by this module: + */ + for (i = 0; i < CLASSHASH_SIZE; i++) { + head = classhash_table + i; + if (list_empty(head)) + continue; + 
list_for_each_entry_safe(class, next, head, hash_entry) + if (within(class->key, start, size)) + zap_class(class); + } + + __raw_spin_unlock(&hash_lock); + raw_local_irq_restore(flags); +} + +void lockdep_reset_lock(struct lockdep_map *lock) +{ + struct lock_class *class, *next; + struct list_head *head; + unsigned long flags; + int i, j; + + raw_local_irq_save(flags); + + /* + * Remove all classes this lock might have: + */ + for (j = 0; j < MAX_LOCKDEP_SUBCLASSES; j++) { + /* + * If the class exists we look it up and zap it: + */ + class = look_up_lock_class(lock, j); + if (class) + zap_class(class); + } + /* + * Debug check: in the end all mapped classes should + * be gone. + */ + __raw_spin_lock(&hash_lock); + for (i = 0; i < CLASSHASH_SIZE; i++) { + head = classhash_table + i; + if (list_empty(head)) + continue; + list_for_each_entry_safe(class, next, head, hash_entry) { + if (unlikely(class == lock->class_cache)) { + __raw_spin_unlock(&hash_lock); + DEBUG_LOCKS_WARN_ON(1); + goto out_restore; + } + } + } + __raw_spin_unlock(&hash_lock); + +out_restore: + raw_local_irq_restore(flags); +} + +void __init lockdep_init(void) +{ + int i; + + /* + * Some architectures have their own start_kernel() + * code which calls lockdep_init(), while we also + * call lockdep_init() from the start_kernel() itself, + * and we want to initialize the hashes only once: + */ + if (lockdep_initialized) + return; + + for (i = 0; i < CLASSHASH_SIZE; i++) + INIT_LIST_HEAD(classhash_table + i); + + for (i = 0; i < CHAINHASH_SIZE; i++) + INIT_LIST_HEAD(chainhash_table + i); + + lockdep_initialized = 1; +} + +void __init lockdep_info(void) +{ + printk("Lock dependency validator: Copyright (c) 2006 Red Hat, Inc., Ingo Molnar\n"); + + printk("... MAX_LOCKDEP_SUBCLASSES: %lu\n", MAX_LOCKDEP_SUBCLASSES); + printk("... MAX_LOCK_DEPTH: %lu\n", MAX_LOCK_DEPTH); + printk("... MAX_LOCKDEP_KEYS: %lu\n", MAX_LOCKDEP_KEYS); + printk("... CLASSHASH_SIZE: %lu\n", CLASSHASH_SIZE); + printk("... MAX_LOCKDEP_ENTRIES: %lu\n", MAX_LOCKDEP_ENTRIES); + printk("... MAX_LOCKDEP_CHAINS: %lu\n", MAX_LOCKDEP_CHAINS); + printk("... CHAINHASH_SIZE: %lu\n", CHAINHASH_SIZE); + + printk(" memory used by lock dependency info: %lu kB\n", + (sizeof(struct lock_class) * MAX_LOCKDEP_KEYS + + sizeof(struct list_head) * CLASSHASH_SIZE + + sizeof(struct lock_list) * MAX_LOCKDEP_ENTRIES + + sizeof(struct lock_chain) * MAX_LOCKDEP_CHAINS + + sizeof(struct list_head) * CHAINHASH_SIZE) / 1024); + + printk(" per task-struct memory footprint: %lu bytes\n", + sizeof(struct held_lock) * MAX_LOCK_DEPTH); + +#ifdef CONFIG_DEBUG_LOCKDEP + if (lockdep_init_error) + printk("WARNING: lockdep init error! Arch code didnt call lockdep_init() early enough?\n"); +#endif +} + +static inline int in_range(const void *start, const void *addr, const void *end) +{ + return addr >= start && addr <= end; +} + +static void +print_freed_lock_bug(struct task_struct *curr, const void *mem_from, + const void *mem_to, struct held_lock *hlock) +{ + if (!debug_locks_off()) + return; + if (debug_locks_silent) + return; + + printk("\n=========================\n"); + printk( "[ BUG: held lock freed! 
]\n"); + printk( "-------------------------\n"); + printk("%s/%d is freeing memory %p-%p, with a lock still held there!\n", + curr->comm, curr->pid, mem_from, mem_to-1); + print_lock(hlock); + lockdep_print_held_locks(curr); + + printk("\nstack backtrace:\n"); + dump_stack(); +} + +/* + * Called when kernel memory is freed (or unmapped), or if a lock + * is destroyed or reinitialized - this code checks whether there is + * any held lock in the memory range of <from> to <to>: + */ +void debug_check_no_locks_freed(const void *mem_from, unsigned long mem_len) +{ + const void *mem_to = mem_from + mem_len, *lock_from, *lock_to; + struct task_struct *curr = current; + struct held_lock *hlock; + unsigned long flags; + int i; + + if (unlikely(!debug_locks)) + return; + + local_irq_save(flags); + for (i = 0; i < curr->lockdep_depth; i++) { + hlock = curr->held_locks + i; + + lock_from = (void *)hlock->instance; + lock_to = (void *)(hlock->instance + 1); + + if (!in_range(mem_from, lock_from, mem_to) && + !in_range(mem_from, lock_to, mem_to)) + continue; + + print_freed_lock_bug(curr, mem_from, mem_to, hlock); + break; + } + local_irq_restore(flags); +} + +static void print_held_locks_bug(struct task_struct *curr) +{ + if (!debug_locks_off()) + return; + if (debug_locks_silent) + return; + + printk("\n=====================================\n"); + printk( "[ BUG: lock held at task exit time! ]\n"); + printk( "-------------------------------------\n"); + printk("%s/%d is exiting with locks still held!\n", + curr->comm, curr->pid); + lockdep_print_held_locks(curr); + + printk("\nstack backtrace:\n"); + dump_stack(); +} + +void debug_check_no_locks_held(struct task_struct *task) +{ + if (unlikely(task->lockdep_depth > 0)) + print_held_locks_bug(task); +} + +void debug_show_all_locks(void) +{ + struct task_struct *g, *p; + int count = 10; + int unlock = 1; + + printk("\nShowing all locks held in the system:\n"); + + /* + * Here we try to get the tasklist_lock as hard as possible, + * if not successful after 2 seconds we ignore it (but keep + * trying). This is to enable a debug printout even if a + * tasklist_lock-holding task deadlocks or crashes. + */ +retry: + if (!read_trylock(&tasklist_lock)) { + if (count == 10) + printk("hm, tasklist_lock locked, retrying... "); + if (count) { + count--; + printk(" #%d", 10-count); + mdelay(200); + goto retry; + } + printk(" ignoring it.\n"); + unlock = 0; + } + if (count != 10) + printk(" locked it.\n"); + + do_each_thread(g, p) { + if (p->lockdep_depth) + lockdep_print_held_locks(p); + if (!unlock) + if (read_trylock(&tasklist_lock)) + unlock = 1; + } while_each_thread(g, p); + + printk("\n"); + printk("=============================================\n\n"); + + if (unlock) + read_unlock(&tasklist_lock); +} + +EXPORT_SYMBOL_GPL(debug_show_all_locks); + +void debug_show_held_locks(struct task_struct *task) +{ + lockdep_print_held_locks(task); +} + +EXPORT_SYMBOL_GPL(debug_show_held_locks); + diff --git a/kernel/lockdep_internals.h b/kernel/lockdep_internals.h new file mode 100644 index 00000000000..eab043c83bb --- /dev/null +++ b/kernel/lockdep_internals.h @@ -0,0 +1,78 @@ +/* + * kernel/lockdep_internals.h + * + * Runtime locking correctness validator + * + * lockdep subsystem internal functions and variables. + */ + +/* + * MAX_LOCKDEP_ENTRIES is the maximum number of lock dependencies + * we track. 
+ * + * We use the per-lock dependency maps in two ways: we grow it by adding + * every to-be-taken lock to all currently held lock's own dependency + * table (if it's not there yet), and we check it for lock order + * conflicts and deadlocks. + */ +#define MAX_LOCKDEP_ENTRIES 8192UL + +#define MAX_LOCKDEP_KEYS_BITS 11 +#define MAX_LOCKDEP_KEYS (1UL << MAX_LOCKDEP_KEYS_BITS) + +#define MAX_LOCKDEP_CHAINS_BITS 13 +#define MAX_LOCKDEP_CHAINS (1UL << MAX_LOCKDEP_CHAINS_BITS) + +/* + * Stack-trace: tightly packed array of stack backtrace + * addresses. Protected by the hash_lock. + */ +#define MAX_STACK_TRACE_ENTRIES 262144UL + +extern struct list_head all_lock_classes; + +extern void +get_usage_chars(struct lock_class *class, char *c1, char *c2, char *c3, char *c4); + +extern const char * __get_key_name(struct lockdep_subclass_key *key, char *str); + +extern unsigned long nr_lock_classes; +extern unsigned long nr_list_entries; +extern unsigned long nr_lock_chains; +extern unsigned long nr_stack_trace_entries; + +extern unsigned int nr_hardirq_chains; +extern unsigned int nr_softirq_chains; +extern unsigned int nr_process_chains; +extern unsigned int max_lockdep_depth; +extern unsigned int max_recursion_depth; + +#ifdef CONFIG_DEBUG_LOCKDEP +/* + * Various lockdep statistics: + */ +extern atomic_t chain_lookup_hits; +extern atomic_t chain_lookup_misses; +extern atomic_t hardirqs_on_events; +extern atomic_t hardirqs_off_events; +extern atomic_t redundant_hardirqs_on; +extern atomic_t redundant_hardirqs_off; +extern atomic_t softirqs_on_events; +extern atomic_t softirqs_off_events; +extern atomic_t redundant_softirqs_on; +extern atomic_t redundant_softirqs_off; +extern atomic_t nr_unused_locks; +extern atomic_t nr_cyclic_checks; +extern atomic_t nr_cyclic_check_recursions; +extern atomic_t nr_find_usage_forwards_checks; +extern atomic_t nr_find_usage_forwards_recursions; +extern atomic_t nr_find_usage_backwards_checks; +extern atomic_t nr_find_usage_backwards_recursions; +# define debug_atomic_inc(ptr) atomic_inc(ptr) +# define debug_atomic_dec(ptr) atomic_dec(ptr) +# define debug_atomic_read(ptr) atomic_read(ptr) +#else +# define debug_atomic_inc(ptr) do { } while (0) +# define debug_atomic_dec(ptr) do { } while (0) +# define debug_atomic_read(ptr) 0 +#endif diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c new file mode 100644 index 00000000000..f6e72eaab3f --- /dev/null +++ b/kernel/lockdep_proc.c @@ -0,0 +1,345 @@ +/* + * kernel/lockdep_proc.c + * + * Runtime locking correctness validator + * + * Started by Ingo Molnar: + * + * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> + * + * Code for /proc/lockdep and /proc/lockdep_stats: + * + */ +#include <linux/sched.h> +#include <linux/module.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> +#include <linux/kallsyms.h> +#include <linux/debug_locks.h> + +#include "lockdep_internals.h" + +static void *l_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct lock_class *class = v; + + (*pos)++; + + if (class->lock_entry.next != &all_lock_classes) + class = list_entry(class->lock_entry.next, struct lock_class, + lock_entry); + else + class = NULL; + m->private = class; + + return class; +} + +static void *l_start(struct seq_file *m, loff_t *pos) +{ + struct lock_class *class = m->private; + + if (&class->lock_entry == all_lock_classes.next) + seq_printf(m, "all lock classes:\n"); + + return class; +} + +static void l_stop(struct seq_file *m, void *v) +{ +} + +static unsigned long count_forward_deps(struct 
lock_class *class) +{ + struct lock_list *entry; + unsigned long ret = 1; + + /* + * Recurse this class's dependency list: + */ + list_for_each_entry(entry, &class->locks_after, entry) + ret += count_forward_deps(entry->class); + + return ret; +} + +static unsigned long count_backward_deps(struct lock_class *class) +{ + struct lock_list *entry; + unsigned long ret = 1; + + /* + * Recurse this class's dependency list: + */ + list_for_each_entry(entry, &class->locks_before, entry) + ret += count_backward_deps(entry->class); + + return ret; +} + +static int l_show(struct seq_file *m, void *v) +{ + unsigned long nr_forward_deps, nr_backward_deps; + struct lock_class *class = m->private; + char str[128], c1, c2, c3, c4; + const char *name; + + seq_printf(m, "%p", class->key); +#ifdef CONFIG_DEBUG_LOCKDEP + seq_printf(m, " OPS:%8ld", class->ops); +#endif + nr_forward_deps = count_forward_deps(class); + seq_printf(m, " FD:%5ld", nr_forward_deps); + + nr_backward_deps = count_backward_deps(class); + seq_printf(m, " BD:%5ld", nr_backward_deps); + + get_usage_chars(class, &c1, &c2, &c3, &c4); + seq_printf(m, " %c%c%c%c", c1, c2, c3, c4); + + name = class->name; + if (!name) { + name = __get_key_name(class->key, str); + seq_printf(m, ": %s", name); + } else{ + seq_printf(m, ": %s", name); + if (class->name_version > 1) + seq_printf(m, "#%d", class->name_version); + if (class->subclass) + seq_printf(m, "/%d", class->subclass); + } + seq_puts(m, "\n"); + + return 0; +} + +static struct seq_operations lockdep_ops = { + .start = l_start, + .next = l_next, + .stop = l_stop, + .show = l_show, +}; + +static int lockdep_open(struct inode *inode, struct file *file) +{ + int res = seq_open(file, &lockdep_ops); + if (!res) { + struct seq_file *m = file->private_data; + + if (!list_empty(&all_lock_classes)) + m->private = list_entry(all_lock_classes.next, + struct lock_class, lock_entry); + else + m->private = NULL; + } + return res; +} + +static struct file_operations proc_lockdep_operations = { + .open = lockdep_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static void lockdep_stats_debug_show(struct seq_file *m) +{ +#ifdef CONFIG_DEBUG_LOCKDEP + unsigned int hi1 = debug_atomic_read(&hardirqs_on_events), + hi2 = debug_atomic_read(&hardirqs_off_events), + hr1 = debug_atomic_read(&redundant_hardirqs_on), + hr2 = debug_atomic_read(&redundant_hardirqs_off), + si1 = debug_atomic_read(&softirqs_on_events), + si2 = debug_atomic_read(&softirqs_off_events), + sr1 = debug_atomic_read(&redundant_softirqs_on), + sr2 = debug_atomic_read(&redundant_softirqs_off); + + seq_printf(m, " chain lookup misses: %11u\n", + debug_atomic_read(&chain_lookup_misses)); + seq_printf(m, " chain lookup hits: %11u\n", + debug_atomic_read(&chain_lookup_hits)); + seq_printf(m, " cyclic checks: %11u\n", + debug_atomic_read(&nr_cyclic_checks)); + seq_printf(m, " cyclic-check recursions: %11u\n", + debug_atomic_read(&nr_cyclic_check_recursions)); + seq_printf(m, " find-mask forwards checks: %11u\n", + debug_atomic_read(&nr_find_usage_forwards_checks)); + seq_printf(m, " find-mask forwards recursions: %11u\n", + debug_atomic_read(&nr_find_usage_forwards_recursions)); + seq_printf(m, " find-mask backwards checks: %11u\n", + debug_atomic_read(&nr_find_usage_backwards_checks)); + seq_printf(m, " find-mask backwards recursions:%11u\n", + debug_atomic_read(&nr_find_usage_backwards_recursions)); + + seq_printf(m, " hardirq on events: %11u\n", hi1); + seq_printf(m, " hardirq off events: %11u\n", hi2); + seq_printf(m, " 
redundant hardirq ons: %11u\n", hr1); + seq_printf(m, " redundant hardirq offs: %11u\n", hr2); + seq_printf(m, " softirq on events: %11u\n", si1); + seq_printf(m, " softirq off events: %11u\n", si2); + seq_printf(m, " redundant softirq ons: %11u\n", sr1); + seq_printf(m, " redundant softirq offs: %11u\n", sr2); +#endif +} + +static int lockdep_stats_show(struct seq_file *m, void *v) +{ + struct lock_class *class; + unsigned long nr_unused = 0, nr_uncategorized = 0, + nr_irq_safe = 0, nr_irq_unsafe = 0, + nr_softirq_safe = 0, nr_softirq_unsafe = 0, + nr_hardirq_safe = 0, nr_hardirq_unsafe = 0, + nr_irq_read_safe = 0, nr_irq_read_unsafe = 0, + nr_softirq_read_safe = 0, nr_softirq_read_unsafe = 0, + nr_hardirq_read_safe = 0, nr_hardirq_read_unsafe = 0, + sum_forward_deps = 0, factor = 0; + + list_for_each_entry(class, &all_lock_classes, lock_entry) { + + if (class->usage_mask == 0) + nr_unused++; + if (class->usage_mask == LOCKF_USED) + nr_uncategorized++; + if (class->usage_mask & LOCKF_USED_IN_IRQ) + nr_irq_safe++; + if (class->usage_mask & LOCKF_ENABLED_IRQS) + nr_irq_unsafe++; + if (class->usage_mask & LOCKF_USED_IN_SOFTIRQ) + nr_softirq_safe++; + if (class->usage_mask & LOCKF_ENABLED_SOFTIRQS) + nr_softirq_unsafe++; + if (class->usage_mask & LOCKF_USED_IN_HARDIRQ) + nr_hardirq_safe++; + if (class->usage_mask & LOCKF_ENABLED_HARDIRQS) + nr_hardirq_unsafe++; + if (class->usage_mask & LOCKF_USED_IN_IRQ_READ) + nr_irq_read_safe++; + if (class->usage_mask & LOCKF_ENABLED_IRQS_READ) + nr_irq_read_unsafe++; + if (class->usage_mask & LOCKF_USED_IN_SOFTIRQ_READ) + nr_softirq_read_safe++; + if (class->usage_mask & LOCKF_ENABLED_SOFTIRQS_READ) + nr_softirq_read_unsafe++; + if (class->usage_mask & LOCKF_USED_IN_HARDIRQ_READ) + nr_hardirq_read_safe++; + if (class->usage_mask & LOCKF_ENABLED_HARDIRQS_READ) + nr_hardirq_read_unsafe++; + + sum_forward_deps += count_forward_deps(class); + } +#ifdef CONFIG_LOCKDEP_DEBUG + DEBUG_LOCKS_WARN_ON(debug_atomic_read(&nr_unused_locks) != nr_unused); +#endif + seq_printf(m, " lock-classes: %11lu [max: %lu]\n", + nr_lock_classes, MAX_LOCKDEP_KEYS); + seq_printf(m, " direct dependencies: %11lu [max: %lu]\n", + nr_list_entries, MAX_LOCKDEP_ENTRIES); + seq_printf(m, " indirect dependencies: %11lu\n", + sum_forward_deps); + + /* + * Total number of dependencies: + * + * All irq-safe locks may nest inside irq-unsafe locks, + * plus all the other known dependencies: + */ + seq_printf(m, " all direct dependencies: %11lu\n", + nr_irq_unsafe * nr_irq_safe + + nr_hardirq_unsafe * nr_hardirq_safe + + nr_list_entries); + + /* + * Estimated factor between direct and indirect + * dependencies: + */ + if (nr_list_entries) + factor = sum_forward_deps / nr_list_entries; + + seq_printf(m, " dependency chains: %11lu [max: %lu]\n", + nr_lock_chains, MAX_LOCKDEP_CHAINS); + +#ifdef CONFIG_TRACE_IRQFLAGS + seq_printf(m, " in-hardirq chains: %11u\n", + nr_hardirq_chains); + seq_printf(m, " in-softirq chains: %11u\n", + nr_softirq_chains); +#endif + seq_printf(m, " in-process chains: %11u\n", + nr_process_chains); + seq_printf(m, " stack-trace entries: %11lu [max: %lu]\n", + nr_stack_trace_entries, MAX_STACK_TRACE_ENTRIES); + seq_printf(m, " combined max dependencies: %11u\n", + (nr_hardirq_chains + 1) * + (nr_softirq_chains + 1) * + (nr_process_chains + 1) + ); + seq_printf(m, " hardirq-safe locks: %11lu\n", + nr_hardirq_safe); + seq_printf(m, " hardirq-unsafe locks: %11lu\n", + nr_hardirq_unsafe); + seq_printf(m, " softirq-safe locks: %11lu\n", + nr_softirq_safe); + seq_printf(m, " 
softirq-unsafe locks: %11lu\n", + nr_softirq_unsafe); + seq_printf(m, " irq-safe locks: %11lu\n", + nr_irq_safe); + seq_printf(m, " irq-unsafe locks: %11lu\n", + nr_irq_unsafe); + + seq_printf(m, " hardirq-read-safe locks: %11lu\n", + nr_hardirq_read_safe); + seq_printf(m, " hardirq-read-unsafe locks: %11lu\n", + nr_hardirq_read_unsafe); + seq_printf(m, " softirq-read-safe locks: %11lu\n", + nr_softirq_read_safe); + seq_printf(m, " softirq-read-unsafe locks: %11lu\n", + nr_softirq_read_unsafe); + seq_printf(m, " irq-read-safe locks: %11lu\n", + nr_irq_read_safe); + seq_printf(m, " irq-read-unsafe locks: %11lu\n", + nr_irq_read_unsafe); + + seq_printf(m, " uncategorized locks: %11lu\n", + nr_uncategorized); + seq_printf(m, " unused locks: %11lu\n", + nr_unused); + seq_printf(m, " max locking depth: %11u\n", + max_lockdep_depth); + seq_printf(m, " max recursion depth: %11u\n", + max_recursion_depth); + lockdep_stats_debug_show(m); + seq_printf(m, " debug_locks: %11u\n", + debug_locks); + + return 0; +} + +static int lockdep_stats_open(struct inode *inode, struct file *file) +{ + return single_open(file, lockdep_stats_show, NULL); +} + +static struct file_operations proc_lockdep_stats_operations = { + .open = lockdep_stats_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static int __init lockdep_proc_init(void) +{ + struct proc_dir_entry *entry; + + entry = create_proc_entry("lockdep", S_IRUSR, NULL); + if (entry) + entry->proc_fops = &proc_lockdep_operations; + + entry = create_proc_entry("lockdep_stats", S_IRUSR, NULL); + if (entry) + entry->proc_fops = &proc_lockdep_stats_operations; + + return 0; +} + +__initcall(lockdep_proc_init); + diff --git a/kernel/module.c b/kernel/module.c index 281172f01e9..2a19cd47c04 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1121,6 +1121,9 @@ static void free_module(struct module *mod) if (mod->percpu) percpu_modfree(mod->percpu); + /* Free lock-classes: */ + lockdep_free_key_range(mod->module_core, mod->core_size); + /* Finally, free the core (containing the module structure) */ module_free(mod, mod->module_core); } @@ -2016,10 +2019,8 @@ const char *module_address_lookup(unsigned long addr, return NULL; } -struct module *module_get_kallsym(unsigned int symnum, - unsigned long *value, - char *type, - char namebuf[128]) +struct module *module_get_kallsym(unsigned int symnum, unsigned long *value, + char *type, char *name, size_t namelen) { struct module *mod; @@ -2028,9 +2029,8 @@ struct module *module_get_kallsym(unsigned int symnum, if (symnum < mod->num_symtab) { *value = mod->symtab[symnum].st_value; *type = mod->symtab[symnum].st_info; - strncpy(namebuf, - mod->strtab + mod->symtab[symnum].st_name, - 127); + strlcpy(name, mod->strtab + mod->symtab[symnum].st_name, + namelen); mutex_unlock(&module_mutex); return mod; } @@ -2159,6 +2159,29 @@ const struct exception_table_entry *search_module_extables(unsigned long addr) return e; } +/* + * Is this a valid module address? + */ +int is_module_address(unsigned long addr) +{ + unsigned long flags; + struct module *mod; + + spin_lock_irqsave(&modlist_lock, flags); + + list_for_each_entry(mod, &modules, list) { + if (within(addr, mod->module_core, mod->core_size)) { + spin_unlock_irqrestore(&modlist_lock, flags); + return 1; + } + } + + spin_unlock_irqrestore(&modlist_lock, flags); + + return 0; +} + + /* Is this a valid kernel address? We don't grab the lock: we are oopsing. 
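
The new lockdep_free_key_range() call in free_module() above matters because the key identifying a lock class can live inside module memory, whether it is an explicit static lock_class_key or a statically initialized lock object used as its own key. A sketch of the situation with an invented module:

        #include <linux/module.h>
        #include <linux/mutex.h>
        #include <linux/init.h>

        /* foo.ko - invented module. foo_lock (and, with DEBUG_LOCK_ALLOC, its
         * class key) lives in the module core image, so its lock classes must
         * be zapped by lockdep_free_key_range() when the module is unloaded. */
        static DEFINE_MUTEX(foo_lock);

        static int __init foo_init(void)
        {
                mutex_lock(&foo_lock);
                mutex_unlock(&foo_lock);
                return 0;
        }

        static void __exit foo_exit(void)
        {
        }

        module_init(foo_init);
        module_exit(foo_exit);
        MODULE_LICENSE("GPL");
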
*/ struct module *__module_text_address(unsigned long addr) { diff --git a/kernel/mutex-debug.c b/kernel/mutex-debug.c index e38e4bac97c..e3203c654dd 100644 --- a/kernel/mutex-debug.c +++ b/kernel/mutex-debug.c @@ -20,367 +20,19 @@ #include <linux/spinlock.h> #include <linux/kallsyms.h> #include <linux/interrupt.h> +#include <linux/debug_locks.h> #include "mutex-debug.h" /* - * We need a global lock when we walk through the multi-process - * lock tree. Only used in the deadlock-debugging case. - */ -DEFINE_SPINLOCK(debug_mutex_lock); - -/* - * All locks held by all tasks, in a single global list: - */ -LIST_HEAD(debug_mutex_held_locks); - -/* - * In the debug case we carry the caller's instruction pointer into - * other functions, but we dont want the function argument overhead - * in the nondebug case - hence these macros: - */ -#define __IP_DECL__ , unsigned long ip -#define __IP__ , ip -#define __RET_IP__ , (unsigned long)__builtin_return_address(0) - -/* - * "mutex debugging enabled" flag. We turn it off when we detect - * the first problem because we dont want to recurse back - * into the tracing code when doing error printk or - * executing a BUG(): - */ -int debug_mutex_on = 1; - -static void printk_task(struct task_struct *p) -{ - if (p) - printk("%16s:%5d [%p, %3d]", p->comm, p->pid, p, p->prio); - else - printk("<none>"); -} - -static void printk_ti(struct thread_info *ti) -{ - if (ti) - printk_task(ti->task); - else - printk("<none>"); -} - -static void printk_task_short(struct task_struct *p) -{ - if (p) - printk("%s/%d [%p, %3d]", p->comm, p->pid, p, p->prio); - else - printk("<none>"); -} - -static void printk_lock(struct mutex *lock, int print_owner) -{ - printk(" [%p] {%s}\n", lock, lock->name); - - if (print_owner && lock->owner) { - printk(".. held by: "); - printk_ti(lock->owner); - printk("\n"); - } - if (lock->owner) { - printk("... acquired at: "); - print_symbol("%s\n", lock->acquire_ip); - } -} - -/* - * printk locks held by a task: - */ -static void show_task_locks(struct task_struct *p) -{ - switch (p->state) { - case TASK_RUNNING: printk("R"); break; - case TASK_INTERRUPTIBLE: printk("S"); break; - case TASK_UNINTERRUPTIBLE: printk("D"); break; - case TASK_STOPPED: printk("T"); break; - case EXIT_ZOMBIE: printk("Z"); break; - case EXIT_DEAD: printk("X"); break; - default: printk("?"); break; - } - printk_task(p); - if (p->blocked_on) { - struct mutex *lock = p->blocked_on->lock; - - printk(" blocked on mutex:"); - printk_lock(lock, 1); - } else - printk(" (not blocked on mutex)\n"); -} - -/* - * printk all locks held in the system (if filter == NULL), - * or all locks belonging to a single task (if filter != NULL): - */ -void show_held_locks(struct task_struct *filter) -{ - struct list_head *curr, *cursor = NULL; - struct mutex *lock; - struct thread_info *t; - unsigned long flags; - int count = 0; - - if (filter) { - printk("------------------------------\n"); - printk("| showing all locks held by: | ("); - printk_task_short(filter); - printk("):\n"); - printk("------------------------------\n"); - } else { - printk("---------------------------\n"); - printk("| showing all locks held: |\n"); - printk("---------------------------\n"); - } - - /* - * Play safe and acquire the global trace lock. 
We - * cannot printk with that lock held so we iterate - * very carefully: - */ -next: - debug_spin_lock_save(&debug_mutex_lock, flags); - list_for_each(curr, &debug_mutex_held_locks) { - if (cursor && curr != cursor) - continue; - lock = list_entry(curr, struct mutex, held_list); - t = lock->owner; - if (filter && (t != filter->thread_info)) - continue; - count++; - cursor = curr->next; - debug_spin_unlock_restore(&debug_mutex_lock, flags); - - printk("\n#%03d: ", count); - printk_lock(lock, filter ? 0 : 1); - goto next; - } - debug_spin_unlock_restore(&debug_mutex_lock, flags); - printk("\n"); -} - -void mutex_debug_show_all_locks(void) -{ - struct task_struct *g, *p; - int count = 10; - int unlock = 1; - - printk("\nShowing all blocking locks in the system:\n"); - - /* - * Here we try to get the tasklist_lock as hard as possible, - * if not successful after 2 seconds we ignore it (but keep - * trying). This is to enable a debug printout even if a - * tasklist_lock-holding task deadlocks or crashes. - */ -retry: - if (!read_trylock(&tasklist_lock)) { - if (count == 10) - printk("hm, tasklist_lock locked, retrying... "); - if (count) { - count--; - printk(" #%d", 10-count); - mdelay(200); - goto retry; - } - printk(" ignoring it.\n"); - unlock = 0; - } - if (count != 10) - printk(" locked it.\n"); - - do_each_thread(g, p) { - show_task_locks(p); - if (!unlock) - if (read_trylock(&tasklist_lock)) - unlock = 1; - } while_each_thread(g, p); - - printk("\n"); - show_held_locks(NULL); - printk("=============================================\n\n"); - - if (unlock) - read_unlock(&tasklist_lock); -} - -static void report_deadlock(struct task_struct *task, struct mutex *lock, - struct mutex *lockblk, unsigned long ip) -{ - printk("\n%s/%d is trying to acquire this lock:\n", - current->comm, current->pid); - printk_lock(lock, 1); - printk("... trying at: "); - print_symbol("%s\n", ip); - show_held_locks(current); - - if (lockblk) { - printk("but %s/%d is deadlocking current task %s/%d!\n\n", - task->comm, task->pid, current->comm, current->pid); - printk("\n%s/%d is blocked on this lock:\n", - task->comm, task->pid); - printk_lock(lockblk, 1); - - show_held_locks(task); - - printk("\n%s/%d's [blocked] stackdump:\n\n", - task->comm, task->pid); - show_stack(task, NULL); - } - - printk("\n%s/%d's [current] stackdump:\n\n", - current->comm, current->pid); - dump_stack(); - mutex_debug_show_all_locks(); - printk("[ turning off deadlock detection. Please report this. ]\n\n"); - local_irq_disable(); -} - -/* - * Recursively check for mutex deadlocks: - */ -static int check_deadlock(struct mutex *lock, int depth, - struct thread_info *ti, unsigned long ip) -{ - struct mutex *lockblk; - struct task_struct *task; - - if (!debug_mutex_on) - return 0; - - ti = lock->owner; - if (!ti) - return 0; - - task = ti->task; - lockblk = NULL; - if (task->blocked_on) - lockblk = task->blocked_on->lock; - - /* Self-deadlock: */ - if (current == task) { - DEBUG_OFF(); - if (depth) - return 1; - printk("\n==========================================\n"); - printk( "[ BUG: lock recursion deadlock detected! |\n"); - printk( "------------------------------------------\n"); - report_deadlock(task, lock, NULL, ip); - return 0; - } - - /* Ugh, something corrupted the lock data structure? */ - if (depth > 20) { - DEBUG_OFF(); - printk("\n===========================================\n"); - printk( "[ BUG: infinite lock dependency detected!? 
|\n"); - printk( "-------------------------------------------\n"); - report_deadlock(task, lock, lockblk, ip); - return 0; - } - - /* Recursively check for dependencies: */ - if (lockblk && check_deadlock(lockblk, depth+1, ti, ip)) { - printk("\n============================================\n"); - printk( "[ BUG: circular locking deadlock detected! ]\n"); - printk( "--------------------------------------------\n"); - report_deadlock(task, lock, lockblk, ip); - return 0; - } - return 0; -} - -/* - * Called when a task exits, this function checks whether the - * task is holding any locks, and reports the first one if so: - */ -void mutex_debug_check_no_locks_held(struct task_struct *task) -{ - struct list_head *curr, *next; - struct thread_info *t; - unsigned long flags; - struct mutex *lock; - - if (!debug_mutex_on) - return; - - debug_spin_lock_save(&debug_mutex_lock, flags); - list_for_each_safe(curr, next, &debug_mutex_held_locks) { - lock = list_entry(curr, struct mutex, held_list); - t = lock->owner; - if (t != task->thread_info) - continue; - list_del_init(curr); - DEBUG_OFF(); - debug_spin_unlock_restore(&debug_mutex_lock, flags); - - printk("BUG: %s/%d, lock held at task exit time!\n", - task->comm, task->pid); - printk_lock(lock, 1); - if (lock->owner != task->thread_info) - printk("exiting task is not even the owner??\n"); - return; - } - debug_spin_unlock_restore(&debug_mutex_lock, flags); -} - -/* - * Called when kernel memory is freed (or unmapped), or if a mutex - * is destroyed or reinitialized - this code checks whether there is - * any held lock in the memory range of <from> to <to>: - */ -void mutex_debug_check_no_locks_freed(const void *from, unsigned long len) -{ - struct list_head *curr, *next; - const void *to = from + len; - unsigned long flags; - struct mutex *lock; - void *lock_addr; - - if (!debug_mutex_on) - return; - - debug_spin_lock_save(&debug_mutex_lock, flags); - list_for_each_safe(curr, next, &debug_mutex_held_locks) { - lock = list_entry(curr, struct mutex, held_list); - lock_addr = lock; - if (lock_addr < from || lock_addr >= to) - continue; - list_del_init(curr); - DEBUG_OFF(); - debug_spin_unlock_restore(&debug_mutex_lock, flags); - - printk("BUG: %s/%d, active lock [%p(%p-%p)] freed!\n", - current->comm, current->pid, lock, from, to); - dump_stack(); - printk_lock(lock, 1); - if (lock->owner != current_thread_info()) - printk("freeing task is not even the owner??\n"); - return; - } - debug_spin_unlock_restore(&debug_mutex_lock, flags); -} - -/* * Must be called with lock->wait_lock held. 
*/ -void debug_mutex_set_owner(struct mutex *lock, - struct thread_info *new_owner __IP_DECL__) +void debug_mutex_set_owner(struct mutex *lock, struct thread_info *new_owner) { lock->owner = new_owner; - DEBUG_WARN_ON(!list_empty(&lock->held_list)); - if (debug_mutex_on) { - list_add_tail(&lock->held_list, &debug_mutex_held_locks); - lock->acquire_ip = ip; - } } -void debug_mutex_init_waiter(struct mutex_waiter *waiter) +void debug_mutex_lock_common(struct mutex *lock, struct mutex_waiter *waiter) { memset(waiter, MUTEX_DEBUG_INIT, sizeof(*waiter)); waiter->magic = waiter; @@ -389,23 +41,23 @@ void debug_mutex_init_waiter(struct mutex_waiter *waiter) void debug_mutex_wake_waiter(struct mutex *lock, struct mutex_waiter *waiter) { - SMP_DEBUG_WARN_ON(!spin_is_locked(&lock->wait_lock)); - DEBUG_WARN_ON(list_empty(&lock->wait_list)); - DEBUG_WARN_ON(waiter->magic != waiter); - DEBUG_WARN_ON(list_empty(&waiter->list)); + SMP_DEBUG_LOCKS_WARN_ON(!spin_is_locked(&lock->wait_lock)); + DEBUG_LOCKS_WARN_ON(list_empty(&lock->wait_list)); + DEBUG_LOCKS_WARN_ON(waiter->magic != waiter); + DEBUG_LOCKS_WARN_ON(list_empty(&waiter->list)); } void debug_mutex_free_waiter(struct mutex_waiter *waiter) { - DEBUG_WARN_ON(!list_empty(&waiter->list)); + DEBUG_LOCKS_WARN_ON(!list_empty(&waiter->list)); memset(waiter, MUTEX_DEBUG_FREE, sizeof(*waiter)); } void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter, - struct thread_info *ti __IP_DECL__) + struct thread_info *ti) { - SMP_DEBUG_WARN_ON(!spin_is_locked(&lock->wait_lock)); - check_deadlock(lock, 0, ti, ip); + SMP_DEBUG_LOCKS_WARN_ON(!spin_is_locked(&lock->wait_lock)); + /* Mark the current thread as blocked on the lock: */ ti->task->blocked_on = waiter; waiter->lock = lock; @@ -414,9 +66,9 @@ void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter, void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, struct thread_info *ti) { - DEBUG_WARN_ON(list_empty(&waiter->list)); - DEBUG_WARN_ON(waiter->task != ti->task); - DEBUG_WARN_ON(ti->task->blocked_on != waiter); + DEBUG_LOCKS_WARN_ON(list_empty(&waiter->list)); + DEBUG_LOCKS_WARN_ON(waiter->task != ti->task); + DEBUG_LOCKS_WARN_ON(ti->task->blocked_on != waiter); ti->task->blocked_on = NULL; list_del_init(&waiter->list); @@ -425,24 +77,23 @@ void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, void debug_mutex_unlock(struct mutex *lock) { - DEBUG_WARN_ON(lock->magic != lock); - DEBUG_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next); - DEBUG_WARN_ON(lock->owner != current_thread_info()); - if (debug_mutex_on) { - DEBUG_WARN_ON(list_empty(&lock->held_list)); - list_del_init(&lock->held_list); - } + DEBUG_LOCKS_WARN_ON(lock->owner != current_thread_info()); + DEBUG_LOCKS_WARN_ON(lock->magic != lock); + DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next); + DEBUG_LOCKS_WARN_ON(lock->owner != current_thread_info()); } -void debug_mutex_init(struct mutex *lock, const char *name) +void debug_mutex_init(struct mutex *lock, const char *name, + struct lock_class_key *key) { +#ifdef CONFIG_DEBUG_LOCK_ALLOC /* * Make sure we are not reinitializing a held lock: */ - mutex_debug_check_no_locks_freed((void *)lock, sizeof(*lock)); + debug_check_no_locks_freed((void *)lock, sizeof(*lock)); + lockdep_init_map(&lock->dep_map, name, key); +#endif lock->owner = NULL; - INIT_LIST_HEAD(&lock->held_list); - lock->name = name; lock->magic = lock; } @@ -456,7 +107,7 @@ void debug_mutex_init(struct mutex *lock, const char *name) */ 
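
debug_mutex_init() now receives a lock_class_key along with the name. The header side of this (include/linux/mutex.h, outside this 'kernel/' diffstat) is expected to generate one static key per mutex_init() call site, roughly along these lines; a sketch, not quoted from the patch:

        /* Sketch of the call-site key generation that feeds __mutex_init();
         * the real macro lives in include/linux/mutex.h. */
        #define mutex_init(mutex)                                       \
        do {                                                            \
                static struct lock_class_key __key;                     \
                                                                        \
                __mutex_init((mutex), #mutex, &__key);                  \
        } while (0)

Because __key has static storage, every textual mutex_init() site yields its own lock class, which is what lets the validator tell otherwise identical mutexes apart.
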
void fastcall mutex_destroy(struct mutex *lock) { - DEBUG_WARN_ON(mutex_is_locked(lock)); + DEBUG_LOCKS_WARN_ON(mutex_is_locked(lock)); lock->magic = NULL; } diff --git a/kernel/mutex-debug.h b/kernel/mutex-debug.h index a5196c36a5f..babfbdfc534 100644 --- a/kernel/mutex-debug.h +++ b/kernel/mutex-debug.h @@ -10,110 +10,44 @@ * More details are in kernel/mutex-debug.c. */ -extern spinlock_t debug_mutex_lock; -extern struct list_head debug_mutex_held_locks; -extern int debug_mutex_on; - -/* - * In the debug case we carry the caller's instruction pointer into - * other functions, but we dont want the function argument overhead - * in the nondebug case - hence these macros: - */ -#define __IP_DECL__ , unsigned long ip -#define __IP__ , ip -#define __RET_IP__ , (unsigned long)__builtin_return_address(0) - /* * This must be called with lock->wait_lock held. */ -extern void debug_mutex_set_owner(struct mutex *lock, - struct thread_info *new_owner __IP_DECL__); +extern void +debug_mutex_set_owner(struct mutex *lock, struct thread_info *new_owner); static inline void debug_mutex_clear_owner(struct mutex *lock) { lock->owner = NULL; } -extern void debug_mutex_init_waiter(struct mutex_waiter *waiter); +extern void debug_mutex_lock_common(struct mutex *lock, + struct mutex_waiter *waiter); extern void debug_mutex_wake_waiter(struct mutex *lock, struct mutex_waiter *waiter); extern void debug_mutex_free_waiter(struct mutex_waiter *waiter); extern void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter, - struct thread_info *ti __IP_DECL__); + struct thread_info *ti); extern void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, struct thread_info *ti); extern void debug_mutex_unlock(struct mutex *lock); -extern void debug_mutex_init(struct mutex *lock, const char *name); - -#define debug_spin_lock_save(lock, flags) \ - do { \ - local_irq_save(flags); \ - if (debug_mutex_on) \ - spin_lock(lock); \ - } while (0) - -#define debug_spin_unlock_restore(lock, flags) \ - do { \ - if (debug_mutex_on) \ - spin_unlock(lock); \ - local_irq_restore(flags); \ - preempt_check_resched(); \ - } while (0) +extern void debug_mutex_init(struct mutex *lock, const char *name, + struct lock_class_key *key); #define spin_lock_mutex(lock, flags) \ do { \ struct mutex *l = container_of(lock, struct mutex, wait_lock); \ \ - DEBUG_WARN_ON(in_interrupt()); \ - debug_spin_lock_save(&debug_mutex_lock, flags); \ - spin_lock(lock); \ - DEBUG_WARN_ON(l->magic != l); \ + DEBUG_LOCKS_WARN_ON(in_interrupt()); \ + local_irq_save(flags); \ + __raw_spin_lock(&(lock)->raw_lock); \ + DEBUG_LOCKS_WARN_ON(l->magic != l); \ } while (0) #define spin_unlock_mutex(lock, flags) \ do { \ - spin_unlock(lock); \ - debug_spin_unlock_restore(&debug_mutex_lock, flags); \ + __raw_spin_unlock(&(lock)->raw_lock); \ + local_irq_restore(flags); \ + preempt_check_resched(); \ } while (0) - -#define DEBUG_OFF() \ -do { \ - if (debug_mutex_on) { \ - debug_mutex_on = 0; \ - console_verbose(); \ - if (spin_is_locked(&debug_mutex_lock)) \ - spin_unlock(&debug_mutex_lock); \ - } \ -} while (0) - -#define DEBUG_BUG() \ -do { \ - if (debug_mutex_on) { \ - DEBUG_OFF(); \ - BUG(); \ - } \ -} while (0) - -#define DEBUG_WARN_ON(c) \ -do { \ - if (unlikely(c && debug_mutex_on)) { \ - DEBUG_OFF(); \ - WARN_ON(1); \ - } \ -} while (0) - -# define DEBUG_BUG_ON(c) \ -do { \ - if (unlikely(c)) \ - DEBUG_BUG(); \ -} while (0) - -#ifdef CONFIG_SMP -# define SMP_DEBUG_WARN_ON(c) DEBUG_WARN_ON(c) -# define SMP_DEBUG_BUG_ON(c) DEBUG_BUG_ON(c) 
-#else -# define SMP_DEBUG_WARN_ON(c) do { } while (0) -# define SMP_DEBUG_BUG_ON(c) do { } while (0) -#endif - diff --git a/kernel/mutex.c b/kernel/mutex.c index 7043db21bbc..8c71cf72a49 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c @@ -17,6 +17,7 @@ #include <linux/module.h> #include <linux/spinlock.h> #include <linux/interrupt.h> +#include <linux/debug_locks.h> /* * In the DEBUG case we are using the "NULL fastpath" for mutexes, @@ -38,13 +39,14 @@ * * It is not allowed to initialize an already locked mutex. */ -void fastcall __mutex_init(struct mutex *lock, const char *name) +void +__mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key) { atomic_set(&lock->count, 1); spin_lock_init(&lock->wait_lock); INIT_LIST_HEAD(&lock->wait_list); - debug_mutex_init(lock, name); + debug_mutex_init(lock, name, key); } EXPORT_SYMBOL(__mutex_init); @@ -56,7 +58,7 @@ EXPORT_SYMBOL(__mutex_init); * branch is predicted by the CPU as default-untaken. */ static void fastcall noinline __sched -__mutex_lock_slowpath(atomic_t *lock_count __IP_DECL__); +__mutex_lock_slowpath(atomic_t *lock_count); /*** * mutex_lock - acquire the mutex @@ -79,7 +81,7 @@ __mutex_lock_slowpath(atomic_t *lock_count __IP_DECL__); * * This function is similar to (but not equivalent to) down(). */ -void fastcall __sched mutex_lock(struct mutex *lock) +void inline fastcall __sched mutex_lock(struct mutex *lock) { might_sleep(); /* @@ -92,7 +94,7 @@ void fastcall __sched mutex_lock(struct mutex *lock) EXPORT_SYMBOL(mutex_lock); static void fastcall noinline __sched -__mutex_unlock_slowpath(atomic_t *lock_count __IP_DECL__); +__mutex_unlock_slowpath(atomic_t *lock_count); /*** * mutex_unlock - release the mutex @@ -120,18 +122,18 @@ EXPORT_SYMBOL(mutex_unlock); * Lock a mutex (possibly interruptible), slowpath: */ static inline int __sched -__mutex_lock_common(struct mutex *lock, long state __IP_DECL__) +__mutex_lock_common(struct mutex *lock, long state, unsigned int subclass) { struct task_struct *task = current; struct mutex_waiter waiter; unsigned int old_val; unsigned long flags; - debug_mutex_init_waiter(&waiter); - spin_lock_mutex(&lock->wait_lock, flags); - debug_mutex_add_waiter(lock, &waiter, task->thread_info, ip); + debug_mutex_lock_common(lock, &waiter); + mutex_acquire(&lock->dep_map, subclass, 0, _RET_IP_); + debug_mutex_add_waiter(lock, &waiter, task->thread_info); /* add waiting tasks to the end of the waitqueue (FIFO): */ list_add_tail(&waiter.list, &lock->wait_list); @@ -158,6 +160,7 @@ __mutex_lock_common(struct mutex *lock, long state __IP_DECL__) if (unlikely(state == TASK_INTERRUPTIBLE && signal_pending(task))) { mutex_remove_waiter(lock, &waiter, task->thread_info); + mutex_release(&lock->dep_map, 1, _RET_IP_); spin_unlock_mutex(&lock->wait_lock, flags); debug_mutex_free_waiter(&waiter); @@ -173,7 +176,7 @@ __mutex_lock_common(struct mutex *lock, long state __IP_DECL__) /* got the lock - rejoice! 
*/ mutex_remove_waiter(lock, &waiter, task->thread_info); - debug_mutex_set_owner(lock, task->thread_info __IP__); + debug_mutex_set_owner(lock, task->thread_info); /* set it to 0 if there are no waiters left: */ if (likely(list_empty(&lock->wait_list))) @@ -183,32 +186,40 @@ __mutex_lock_common(struct mutex *lock, long state __IP_DECL__) debug_mutex_free_waiter(&waiter); - DEBUG_WARN_ON(list_empty(&lock->held_list)); - DEBUG_WARN_ON(lock->owner != task->thread_info); - return 0; } static void fastcall noinline __sched -__mutex_lock_slowpath(atomic_t *lock_count __IP_DECL__) +__mutex_lock_slowpath(atomic_t *lock_count) { struct mutex *lock = container_of(lock_count, struct mutex, count); - __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE __IP__); + __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0); +} + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +void __sched +mutex_lock_nested(struct mutex *lock, unsigned int subclass) +{ + might_sleep(); + __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, subclass); } +EXPORT_SYMBOL_GPL(mutex_lock_nested); +#endif + /* * Release the lock, slowpath: */ -static fastcall noinline void -__mutex_unlock_slowpath(atomic_t *lock_count __IP_DECL__) +static fastcall inline void +__mutex_unlock_common_slowpath(atomic_t *lock_count, int nested) { struct mutex *lock = container_of(lock_count, struct mutex, count); unsigned long flags; - DEBUG_WARN_ON(lock->owner != current_thread_info()); - spin_lock_mutex(&lock->wait_lock, flags); + mutex_release(&lock->dep_map, nested, _RET_IP_); + debug_mutex_unlock(lock); /* * some architectures leave the lock unlocked in the fastpath failure @@ -218,8 +229,6 @@ __mutex_unlock_slowpath(atomic_t *lock_count __IP_DECL__) if (__mutex_slowpath_needs_to_unlock()) atomic_set(&lock->count, 1); - debug_mutex_unlock(lock); - if (!list_empty(&lock->wait_list)) { /* get the first entry from the wait-list: */ struct mutex_waiter *waiter = @@ -237,11 +246,20 @@ __mutex_unlock_slowpath(atomic_t *lock_count __IP_DECL__) } /* + * Release the lock, slowpath: + */ +static fastcall noinline void +__mutex_unlock_slowpath(atomic_t *lock_count) +{ + __mutex_unlock_common_slowpath(lock_count, 1); +} + +/* * Here come the less common (and hence less performance-critical) APIs: * mutex_lock_interruptible() and mutex_trylock(). 
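
mutex_lock_nested(), added above under CONFIG_DEBUG_LOCK_ALLOC, covers the case where two mutexes of the same lock class legitimately nest. A hedged example; node and lock_pair are invented, and subclass 1 is the value conventionally named SINGLE_DEPTH_NESTING in the lockdep headers:

        #include <linux/mutex.h>

        /* Hypothetical parent/child objects whose mutexes share one lock
         * class but legitimately nest parent -> child. */
        struct node {
                struct mutex     lock;
                struct node     *child;
        };

        static void lock_pair(struct node *parent)
        {
                mutex_lock(&parent->lock);
                /* Same class as parent->lock: annotate the nesting level so
                 * the validator does not report a false self-deadlock. */
                mutex_lock_nested(&parent->child->lock, 1);
        }
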
*/ static int fastcall noinline __sched -__mutex_lock_interruptible_slowpath(atomic_t *lock_count __IP_DECL__); +__mutex_lock_interruptible_slowpath(atomic_t *lock_count); /*** * mutex_lock_interruptible - acquire the mutex, interruptable @@ -264,11 +282,11 @@ int fastcall __sched mutex_lock_interruptible(struct mutex *lock) EXPORT_SYMBOL(mutex_lock_interruptible); static int fastcall noinline __sched -__mutex_lock_interruptible_slowpath(atomic_t *lock_count __IP_DECL__) +__mutex_lock_interruptible_slowpath(atomic_t *lock_count) { struct mutex *lock = container_of(lock_count, struct mutex, count); - return __mutex_lock_common(lock, TASK_INTERRUPTIBLE __IP__); + return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0); } /* @@ -284,8 +302,10 @@ static inline int __mutex_trylock_slowpath(atomic_t *lock_count) spin_lock_mutex(&lock->wait_lock, flags); prev = atomic_xchg(&lock->count, -1); - if (likely(prev == 1)) - debug_mutex_set_owner(lock, current_thread_info() __RET_IP__); + if (likely(prev == 1)) { + debug_mutex_set_owner(lock, current_thread_info()); + mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_); + } /* Set it back to 0 if there are no waiters: */ if (likely(list_empty(&lock->wait_list))) atomic_set(&lock->count, 0); @@ -309,7 +329,7 @@ static inline int __mutex_trylock_slowpath(atomic_t *lock_count) * This function must not be used in interrupt context. The * mutex must be released by the same task that acquired it. */ -int fastcall mutex_trylock(struct mutex *lock) +int fastcall __sched mutex_trylock(struct mutex *lock) { return __mutex_fastpath_trylock(&lock->count, __mutex_trylock_slowpath); diff --git a/kernel/mutex.h b/kernel/mutex.h index 06918994725..a075dafbb29 100644 --- a/kernel/mutex.h +++ b/kernel/mutex.h @@ -16,22 +16,15 @@ #define mutex_remove_waiter(lock, waiter, ti) \ __list_del((waiter)->list.prev, (waiter)->list.next) -#define DEBUG_WARN_ON(c) do { } while (0) #define debug_mutex_set_owner(lock, new_owner) do { } while (0) #define debug_mutex_clear_owner(lock) do { } while (0) -#define debug_mutex_init_waiter(waiter) do { } while (0) #define debug_mutex_wake_waiter(lock, waiter) do { } while (0) #define debug_mutex_free_waiter(waiter) do { } while (0) -#define debug_mutex_add_waiter(lock, waiter, ti, ip) do { } while (0) +#define debug_mutex_add_waiter(lock, waiter, ti) do { } while (0) #define debug_mutex_unlock(lock) do { } while (0) -#define debug_mutex_init(lock, name) do { } while (0) - -/* - * Return-address parameters/declarations. 
They are very useful for - * debugging, but add overhead in the !DEBUG case - so we go the - * trouble of using this not too elegant but zero-cost solution: - */ -#define __IP_DECL__ -#define __IP__ -#define __RET_IP__ +#define debug_mutex_init(lock, name, key) do { } while (0) +static inline void +debug_mutex_lock_common(struct mutex *lock, struct mutex_waiter *waiter) +{ +} diff --git a/kernel/panic.c b/kernel/panic.c index ab13f0f668b..8010b9b17ac 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -18,6 +18,7 @@ #include <linux/interrupt.h> #include <linux/nmi.h> #include <linux/kexec.h> +#include <linux/debug_locks.h> int panic_on_oops; int tainted; @@ -172,6 +173,7 @@ const char *print_tainted(void) void add_taint(unsigned flag) { + debug_locks = 0; /* can't trust the integrity of the kernel anymore */ tainted |= flag; } EXPORT_SYMBOL(add_taint); @@ -256,6 +258,7 @@ int oops_may_print(void) */ void oops_enter(void) { + debug_locks_off(); /* can't trust the integrity of the kernel anymore */ do_oops_enter_exit(); } diff --git a/kernel/pid.c b/kernel/pid.c index eeb836b65ca..93e212f2067 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -218,7 +218,7 @@ struct pid * fastcall find_pid(int nr) return NULL; } -int fastcall attach_pid(task_t *task, enum pid_type type, int nr) +int fastcall attach_pid(struct task_struct *task, enum pid_type type, int nr) { struct pid_link *link; struct pid *pid; @@ -233,7 +233,7 @@ int fastcall attach_pid(task_t *task, enum pid_type type, int nr) return 0; } -void fastcall detach_pid(task_t *task, enum pid_type type) +void fastcall detach_pid(struct task_struct *task, enum pid_type type) { struct pid_link *link; struct pid *pid; @@ -267,7 +267,7 @@ struct task_struct * fastcall pid_task(struct pid *pid, enum pid_type type) /* * Must be called under rcu_read_lock() or with tasklist_lock read-held. */ -task_t *find_task_by_pid_type(int type, int nr) +struct task_struct *find_task_by_pid_type(int type, int nr) { return pid_task(find_pid(nr), type); } diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index ae44a70aae8..619ecabf7c5 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -56,7 +56,7 @@ config PM_TRACE config SOFTWARE_SUSPEND bool "Software Suspend" - depends on PM && SWAP && (X86 && (!SMP || SUSPEND_SMP)) || ((FRV || PPC32) && !SMP) + depends on PM && SWAP && ((X86 && (!SMP || SUSPEND_SMP) && !X86_PAE) || ((FRV || PPC32) && !SMP)) ---help--- Enable the possibility of suspending the machine. It doesn't need ACPI or APM. @@ -78,6 +78,10 @@ config SOFTWARE_SUSPEND For more information take a look at <file:Documentation/power/swsusp.txt>. + (For now, swsusp is incompatible with PAE aka HIGHMEM_64G on i386. + we need identity mapping for resume to work, and that is trivial + to get with 4MB pages, but less than trivial on PAE). + config PM_STD_PARTITION string "Default resume partition" depends on SOFTWARE_SUSPEND diff --git a/kernel/power/pm.c b/kernel/power/pm.c index 84063ac8fcf..c50d15266c1 100644 --- a/kernel/power/pm.c +++ b/kernel/power/pm.c @@ -75,42 +75,6 @@ struct pm_dev *pm_register(pm_dev_t type, return dev; } -static void __pm_unregister(struct pm_dev *dev) -{ - if (dev) { - list_del(&dev->entry); - kfree(dev); - } -} - -/** - * pm_unregister_all - unregister all devices with matching callback - * @callback: callback function pointer - * - * Unregister every device that would call the callback passed. This - * is primarily meant as a helper function for loadable modules. 
It - * enables a module to give up all its managed devices without keeping - * its own private list. - */ - -void pm_unregister_all(pm_callback callback) -{ - struct list_head *entry; - - if (!callback) - return; - - mutex_lock(&pm_devs_lock); - entry = pm_devs.next; - while (entry != &pm_devs) { - struct pm_dev *dev = list_entry(entry, struct pm_dev, entry); - entry = entry->next; - if (dev->callback == callback) - __pm_unregister(dev); - } - mutex_unlock(&pm_devs_lock); -} - /** * pm_send - send request to a single device * @dev: device to send to @@ -239,7 +203,6 @@ int pm_send_all(pm_request_t rqst, void *data) } EXPORT_SYMBOL(pm_register); -EXPORT_SYMBOL(pm_unregister_all); EXPORT_SYMBOL(pm_send_all); EXPORT_SYMBOL(pm_active); diff --git a/kernel/power/process.c b/kernel/power/process.c index b2a5f671d6c..72e72d2c61e 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -66,13 +66,25 @@ static inline void freeze_process(struct task_struct *p) } } +static void cancel_freezing(struct task_struct *p) +{ + unsigned long flags; + + if (freezing(p)) { + pr_debug(" clean up: %s\n", p->comm); + do_not_freeze(p); + spin_lock_irqsave(&p->sighand->siglock, flags); + recalc_sigpending_tsk(p); + spin_unlock_irqrestore(&p->sighand->siglock, flags); + } +} + /* 0 = success, else # of processes that we failed to stop */ int freeze_processes(void) { int todo, nr_user, user_frozen; unsigned long start_time; struct task_struct *g, *p; - unsigned long flags; printk( "Stopping tasks: " ); start_time = jiffies; @@ -85,6 +97,10 @@ int freeze_processes(void) continue; if (frozen(p)) continue; + if (p->state == TASK_TRACED && frozen(p->parent)) { + cancel_freezing(p); + continue; + } if (p->mm && !(p->flags & PF_BORROWED_MM)) { /* The task is a user-space one. * Freeze it unless there's a vfork completion @@ -126,13 +142,7 @@ int freeze_processes(void) do_each_thread(g, p) { if (freezeable(p) && !frozen(p)) printk(KERN_ERR " %s\n", p->comm); - if (freezing(p)) { - pr_debug(" clean up: %s\n", p->comm); - p->flags &= ~PF_FREEZE; - spin_lock_irqsave(&p->sighand->siglock, flags); - recalc_sigpending_tsk(p); - spin_unlock_irqrestore(&p->sighand->siglock, flags); - } + cancel_freezing(p); } while_each_thread(g, p); read_unlock(&tasklist_lock); return todo; diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 24c96f35423..75d4886e648 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -227,11 +227,17 @@ static void copy_data_pages(struct pbe *pblist) for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) { if (saveable(zone, &zone_pfn)) { struct page *page; + long *src, *dst; + int n; + page = pfn_to_page(zone_pfn + zone->zone_start_pfn); BUG_ON(!pbe); pbe->orig_address = (unsigned long)page_address(page); - /* copy_page is not usable for copying task structs. */ - memcpy((void *)pbe->address, (void *)pbe->orig_address, PAGE_SIZE); + /* copy_page and memcpy are not usable for copying task structs. 
*/ + dst = (long *)pbe->address; + src = (long *)pbe->orig_address; + for (n = PAGE_SIZE / sizeof(long); n; n--) + *dst++ = *src++; pbe = pbe->next; } } diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 044b8e0c102..f1dd146bd64 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -263,7 +263,6 @@ int swsusp_write(void) struct swap_map_handle handle; struct snapshot_handle snapshot; struct swsusp_info *header; - unsigned long start; int error; if ((error = swsusp_swap_check())) { @@ -281,16 +280,17 @@ int swsusp_write(void) } error = get_swap_writer(&handle); if (!error) { - start = handle.cur_swap; + unsigned long start = handle.cur_swap; error = swap_write_page(&handle, header); - } - if (!error) - error = save_image(&handle, &snapshot, header->pages - 1); - if (!error) { - flush_swap_writer(&handle); - printk("S"); - error = mark_swapfiles(swp_entry(root_swap, start)); - printk("|\n"); + if (!error) + error = save_image(&handle, &snapshot, + header->pages - 1); + if (!error) { + flush_swap_writer(&handle); + printk("S"); + error = mark_swapfiles(swp_entry(root_swap, start)); + printk("|\n"); + } } if (error) free_all_swap_pages(root_swap, handle.bitmap); @@ -311,8 +311,10 @@ static atomic_t io_done = ATOMIC_INIT(0); static int end_io(struct bio *bio, unsigned int num, int err) { - if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) - panic("I/O error reading memory image"); + if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) { + printk(KERN_ERR "I/O error reading swsusp image.\n"); + return -EIO; + } atomic_set(&io_done, 0); return 0; } diff --git a/kernel/printk.c b/kernel/printk.c index 39ae24d2a41..1149365e989 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -52,7 +52,7 @@ int console_printk[4] = { DEFAULT_CONSOLE_LOGLEVEL, /* default_console_loglevel */ }; -EXPORT_SYMBOL(console_printk); +EXPORT_UNUSED_SYMBOL(console_printk); /* June 2006 */ /* * Low lever drivers may need that to know if they can schedule in @@ -518,7 +518,9 @@ asmlinkage int vprintk(const char *fmt, va_list args) zap_locks(); /* This stops the holder of console_sem just where we want him */ - spin_lock_irqsave(&logbuf_lock, flags); + local_irq_save(flags); + lockdep_off(); + spin_lock(&logbuf_lock); printk_cpu = smp_processor_id(); /* Emit the output into the temporary buffer */ @@ -588,7 +590,7 @@ asmlinkage int vprintk(const char *fmt, va_list args) */ console_locked = 1; printk_cpu = UINT_MAX; - spin_unlock_irqrestore(&logbuf_lock, flags); + spin_unlock(&logbuf_lock); /* * Console drivers may assume that per-cpu resources have @@ -604,6 +606,8 @@ asmlinkage int vprintk(const char *fmt, va_list args) console_locked = 0; up(&console_sem); } + lockdep_on(); + local_irq_restore(flags); } else { /* * Someone else owns the drivers. We drop the spinlock, which @@ -611,7 +615,9 @@ asmlinkage int vprintk(const char *fmt, va_list args) * console drivers with the output which we just produced. 
*/ printk_cpu = UINT_MAX; - spin_unlock_irqrestore(&logbuf_lock, flags); + spin_unlock(&logbuf_lock); + lockdep_on(); + local_irq_restore(flags); } preempt_enable(); @@ -767,7 +773,7 @@ int is_console_locked(void) { return console_locked; } -EXPORT_SYMBOL(is_console_locked); +EXPORT_UNUSED_SYMBOL(is_console_locked); /* June 2006 */ /** * release_console_sem - unlock the console system @@ -793,6 +799,9 @@ void release_console_sem(void) up(&secondary_console_sem); return; } + + console_may_schedule = 0; + for ( ; ; ) { spin_lock_irqsave(&logbuf_lock, flags); wake_klogd |= log_start - log_end; @@ -806,11 +815,17 @@ void release_console_sem(void) local_irq_restore(flags); } console_locked = 0; - console_may_schedule = 0; up(&console_sem); spin_unlock_irqrestore(&logbuf_lock, flags); - if (wake_klogd && !oops_in_progress && waitqueue_active(&log_wait)) - wake_up_interruptible(&log_wait); + if (wake_klogd && !oops_in_progress && waitqueue_active(&log_wait)) { + /* + * If we printk from within the lock dependency code, + * from within the scheduler code, then do not lock + * up due to self-recursion: + */ + if (!lockdep_internal()) + wake_up_interruptible(&log_wait); + } } EXPORT_SYMBOL(release_console_sem); diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 335c5b932e1..9a111f70145 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -28,7 +28,7 @@ * * Must be called with the tasklist lock write-held. */ -void __ptrace_link(task_t *child, task_t *new_parent) +void __ptrace_link(struct task_struct *child, struct task_struct *new_parent) { BUG_ON(!list_empty(&child->ptrace_list)); if (child->parent == new_parent) @@ -46,7 +46,7 @@ void __ptrace_link(task_t *child, task_t *new_parent) * TASK_TRACED, resume it now. * Requires that irqs be disabled. */ -void ptrace_untrace(task_t *child) +void ptrace_untrace(struct task_struct *child) { spin_lock(&child->sighand->siglock); if (child->state == TASK_TRACED) { @@ -65,7 +65,7 @@ void ptrace_untrace(task_t *child) * * Must be called with the tasklist lock write-held. 
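Illustrative sketch (not part of the patch): the vprintk() hunks above all follow one pattern. logbuf_lock has to stay invisible to lockdep, because lockdep's own reports are emitted via printk() and would recurse, so the interrupt flags are handled by hand around a lockdep_off()/lockdep_on() window. Reduced to a hypothetical helper (emit_to_logbuf() is a made-up name):

static void emit_to_logbuf(const char *text)
{
        unsigned long flags;

        local_irq_save(flags);
        lockdep_off();
        spin_lock(&logbuf_lock);

        /* ... copy "text" into the log ring buffer ... */

        spin_unlock(&logbuf_lock);
        lockdep_on();
        local_irq_restore(flags);
}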
*/ -void __ptrace_unlink(task_t *child) +void __ptrace_unlink(struct task_struct *child) { BUG_ON(!child->ptrace); diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index f464f5ae3f1..523e46483b9 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -53,13 +53,13 @@ static struct rcu_ctrlblk rcu_ctrlblk = { .cur = -300, .completed = -300, - .lock = SPIN_LOCK_UNLOCKED, + .lock = __SPIN_LOCK_UNLOCKED(&rcu_ctrlblk.lock), .cpumask = CPU_MASK_NONE, }; static struct rcu_ctrlblk rcu_bh_ctrlblk = { .cur = -300, .completed = -300, - .lock = SPIN_LOCK_UNLOCKED, + .lock = __SPIN_LOCK_UNLOCKED(&rcu_bh_ctrlblk.lock), .cpumask = CPU_MASK_NONE, }; @@ -241,12 +241,16 @@ static void rcu_do_batch(struct rcu_data *rdp) next = rdp->donelist = list->next; list->func(list); list = next; - rdp->qlen--; if (++count >= rdp->blimit) break; } + + local_irq_disable(); + rdp->qlen -= count; + local_irq_enable(); if (rdp->blimit == INT_MAX && rdp->qlen <= qlowmark) rdp->blimit = blimit; + if (!rdp->donelist) rdp->donetail = &rdp->donelist; else @@ -548,7 +552,7 @@ static void __devinit rcu_online_cpu(int cpu) tasklet_init(&per_cpu(rcu_tasklet, cpu), rcu_process_callbacks, 0UL); } -static int __devinit rcu_cpu_notify(struct notifier_block *self, +static int __cpuinit rcu_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { long cpu = (long)hcpu; @@ -565,7 +569,7 @@ static int __devinit rcu_cpu_notify(struct notifier_block *self, return NOTIFY_OK; } -static struct notifier_block __devinitdata rcu_nb = { +static struct notifier_block __cpuinitdata rcu_nb = { .notifier_call = rcu_cpu_notify, }; diff --git a/kernel/resource.c b/kernel/resource.c index 129cf046e56..46286434af8 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -244,6 +244,7 @@ int find_next_system_ram(struct resource *res) start = res->start; end = res->end; + BUG_ON(start >= end); read_lock(&resource_lock); for (p = iomem_resource.child; p ; p = p->sibling) { @@ -254,15 +255,17 @@ int find_next_system_ram(struct resource *res) p = NULL; break; } - if (p->start >= start) + if ((p->end >= start) && (p->start < end)) break; } read_unlock(&resource_lock); if (!p) return -1; /* copy data */ - res->start = p->start; - res->end = p->end; + if (res->start < p->start) + res->start = p->start; + if (res->end > p->end) + res->end = p->end; return 0; } #endif @@ -404,8 +407,6 @@ int insert_resource(struct resource *parent, struct resource *new) return result; } -EXPORT_SYMBOL(insert_resource); - /* * Given an existing resource, change its start and size to match the * arguments. Returns -EBUSY if it can't fit. 
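Illustrative sketch (not part of the patch): the find_next_system_ram() fix above replaces "first resource starting at or above start" with a genuine overlap test and then clamps the request to the intersection. The same logic as a stand-alone helper, using the diff's comparisons and inclusive bounds as in struct resource (clamp_to_overlap() is a made-up name):

static int clamp_to_overlap(unsigned long *start, unsigned long *end,
                            unsigned long res_start, unsigned long res_end)
{
        /* Same overlap test as the patched loop: (p->end >= start) && (p->start < end). */
        if (!(res_end >= *start && res_start < *end))
                return 0;       /* no overlap, keep searching */

        /* Clamp the request to the part covered by this resource. */
        if (*start < res_start)
                *start = res_start;
        if (*end > res_end)
                *end = res_end;
        return 1;
}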
Existing children of diff --git a/kernel/rtmutex-debug.c b/kernel/rtmutex-debug.c index 4aa8a2c9f45..0c1faa950af 100644 --- a/kernel/rtmutex-debug.c +++ b/kernel/rtmutex-debug.c @@ -26,6 +26,7 @@ #include <linux/interrupt.h> #include <linux/plist.h> #include <linux/fs.h> +#include <linux/debug_locks.h> #include "rtmutex_common.h" @@ -45,8 +46,6 @@ do { \ console_verbose(); \ if (spin_is_locked(¤t->pi_lock)) \ spin_unlock(¤t->pi_lock); \ - if (spin_is_locked(¤t->held_list_lock)) \ - spin_unlock(¤t->held_list_lock); \ } \ } while (0) @@ -97,7 +96,7 @@ void deadlock_trace_off(void) rt_trace_on = 0; } -static void printk_task(task_t *p) +static void printk_task(struct task_struct *p) { if (p) printk("%16s:%5d [%p, %3d]", p->comm, p->pid, p, p->prio); @@ -105,14 +104,6 @@ static void printk_task(task_t *p) printk("<none>"); } -static void printk_task_short(task_t *p) -{ - if (p) - printk("%s/%d [%p, %3d]", p->comm, p->pid, p, p->prio); - else - printk("<none>"); -} - static void printk_lock(struct rt_mutex *lock, int print_owner) { if (lock->name) @@ -128,222 +119,6 @@ static void printk_lock(struct rt_mutex *lock, int print_owner) printk_task(rt_mutex_owner(lock)); printk("\n"); } - if (rt_mutex_owner(lock)) { - printk("... acquired at: "); - print_symbol("%s\n", lock->acquire_ip); - } -} - -static void printk_waiter(struct rt_mutex_waiter *w) -{ - printk("-------------------------\n"); - printk("| waiter struct %p:\n", w); - printk("| w->list_entry: [DP:%p/%p|SP:%p/%p|PRI:%d]\n", - w->list_entry.plist.prio_list.prev, w->list_entry.plist.prio_list.next, - w->list_entry.plist.node_list.prev, w->list_entry.plist.node_list.next, - w->list_entry.prio); - printk("| w->pi_list_entry: [DP:%p/%p|SP:%p/%p|PRI:%d]\n", - w->pi_list_entry.plist.prio_list.prev, w->pi_list_entry.plist.prio_list.next, - w->pi_list_entry.plist.node_list.prev, w->pi_list_entry.plist.node_list.next, - w->pi_list_entry.prio); - printk("\n| lock:\n"); - printk_lock(w->lock, 1); - printk("| w->ti->task:\n"); - printk_task(w->task); - printk("| blocked at: "); - print_symbol("%s\n", w->ip); - printk("-------------------------\n"); -} - -static void show_task_locks(task_t *p) -{ - switch (p->state) { - case TASK_RUNNING: printk("R"); break; - case TASK_INTERRUPTIBLE: printk("S"); break; - case TASK_UNINTERRUPTIBLE: printk("D"); break; - case TASK_STOPPED: printk("T"); break; - case EXIT_ZOMBIE: printk("Z"); break; - case EXIT_DEAD: printk("X"); break; - default: printk("?"); break; - } - printk_task(p); - if (p->pi_blocked_on) { - struct rt_mutex *lock = p->pi_blocked_on->lock; - - printk(" blocked on:"); - printk_lock(lock, 1); - } else - printk(" (not blocked)\n"); -} - -void rt_mutex_show_held_locks(task_t *task, int verbose) -{ - struct list_head *curr, *cursor = NULL; - struct rt_mutex *lock; - task_t *t; - unsigned long flags; - int count = 0; - - if (!rt_trace_on) - return; - - if (verbose) { - printk("------------------------------\n"); - printk("| showing all locks held by: | ("); - printk_task_short(task); - printk("):\n"); - printk("------------------------------\n"); - } - -next: - spin_lock_irqsave(&task->held_list_lock, flags); - list_for_each(curr, &task->held_list_head) { - if (cursor && curr != cursor) - continue; - lock = list_entry(curr, struct rt_mutex, held_list_entry); - t = rt_mutex_owner(lock); - WARN_ON(t != task); - count++; - cursor = curr->next; - spin_unlock_irqrestore(&task->held_list_lock, flags); - - printk("\n#%03d: ", count); - printk_lock(lock, 0); - goto next; - } - 
spin_unlock_irqrestore(&task->held_list_lock, flags); - - printk("\n"); -} - -void rt_mutex_show_all_locks(void) -{ - task_t *g, *p; - int count = 10; - int unlock = 1; - - printk("\n"); - printk("----------------------\n"); - printk("| showing all tasks: |\n"); - printk("----------------------\n"); - - /* - * Here we try to get the tasklist_lock as hard as possible, - * if not successful after 2 seconds we ignore it (but keep - * trying). This is to enable a debug printout even if a - * tasklist_lock-holding task deadlocks or crashes. - */ -retry: - if (!read_trylock(&tasklist_lock)) { - if (count == 10) - printk("hm, tasklist_lock locked, retrying... "); - if (count) { - count--; - printk(" #%d", 10-count); - mdelay(200); - goto retry; - } - printk(" ignoring it.\n"); - unlock = 0; - } - if (count != 10) - printk(" locked it.\n"); - - do_each_thread(g, p) { - show_task_locks(p); - if (!unlock) - if (read_trylock(&tasklist_lock)) - unlock = 1; - } while_each_thread(g, p); - - printk("\n"); - - printk("-----------------------------------------\n"); - printk("| showing all locks held in the system: |\n"); - printk("-----------------------------------------\n"); - - do_each_thread(g, p) { - rt_mutex_show_held_locks(p, 0); - if (!unlock) - if (read_trylock(&tasklist_lock)) - unlock = 1; - } while_each_thread(g, p); - - - printk("=============================================\n\n"); - - if (unlock) - read_unlock(&tasklist_lock); -} - -void rt_mutex_debug_check_no_locks_held(task_t *task) -{ - struct rt_mutex_waiter *w; - struct list_head *curr; - struct rt_mutex *lock; - - if (!rt_trace_on) - return; - if (!rt_prio(task->normal_prio) && rt_prio(task->prio)) { - printk("BUG: PI priority boost leaked!\n"); - printk_task(task); - printk("\n"); - } - if (list_empty(&task->held_list_head)) - return; - - spin_lock(&task->pi_lock); - plist_for_each_entry(w, &task->pi_waiters, pi_list_entry) { - TRACE_OFF(); - - printk("hm, PI interest held at exit time? Task:\n"); - printk_task(task); - printk_waiter(w); - return; - } - spin_unlock(&task->pi_lock); - - list_for_each(curr, &task->held_list_head) { - lock = list_entry(curr, struct rt_mutex, held_list_entry); - - printk("BUG: %s/%d, lock held at task exit time!\n", - task->comm, task->pid); - printk_lock(lock, 1); - if (rt_mutex_owner(lock) != task) - printk("exiting task is not even the owner??\n"); - } -} - -int rt_mutex_debug_check_no_locks_freed(const void *from, unsigned long len) -{ - const void *to = from + len; - struct list_head *curr; - struct rt_mutex *lock; - unsigned long flags; - void *lock_addr; - - if (!rt_trace_on) - return 0; - - spin_lock_irqsave(¤t->held_list_lock, flags); - list_for_each(curr, ¤t->held_list_head) { - lock = list_entry(curr, struct rt_mutex, held_list_entry); - lock_addr = lock; - if (lock_addr < from || lock_addr >= to) - continue; - TRACE_OFF(); - - printk("BUG: %s/%d, active lock [%p(%p-%p)] freed!\n", - current->comm, current->pid, lock, from, to); - dump_stack(); - printk_lock(lock, 1); - if (rt_mutex_owner(lock) != current) - printk("freeing task is not even the owner??\n"); - return 1; - } - spin_unlock_irqrestore(¤t->held_list_lock, flags); - - return 0; } void rt_mutex_debug_task_free(struct task_struct *task) @@ -395,85 +170,41 @@ void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter) current->comm, current->pid); printk_lock(waiter->lock, 1); - printk("... 
trying at: "); - print_symbol("%s\n", waiter->ip); - printk("\n2) %s/%d is blocked on this lock:\n", task->comm, task->pid); printk_lock(waiter->deadlock_lock, 1); - rt_mutex_show_held_locks(current, 1); - rt_mutex_show_held_locks(task, 1); + debug_show_held_locks(current); + debug_show_held_locks(task); printk("\n%s/%d's [blocked] stackdump:\n\n", task->comm, task->pid); show_stack(task, NULL); printk("\n%s/%d's [current] stackdump:\n\n", current->comm, current->pid); dump_stack(); - rt_mutex_show_all_locks(); + debug_show_all_locks(); + printk("[ turning off deadlock detection." "Please report this trace. ]\n\n"); local_irq_disable(); } -void debug_rt_mutex_lock(struct rt_mutex *lock __IP_DECL__) +void debug_rt_mutex_lock(struct rt_mutex *lock) { - unsigned long flags; - - if (rt_trace_on) { - TRACE_WARN_ON_LOCKED(!list_empty(&lock->held_list_entry)); - - spin_lock_irqsave(¤t->held_list_lock, flags); - list_add_tail(&lock->held_list_entry, ¤t->held_list_head); - spin_unlock_irqrestore(¤t->held_list_lock, flags); - - lock->acquire_ip = ip; - } } void debug_rt_mutex_unlock(struct rt_mutex *lock) { - unsigned long flags; - - if (rt_trace_on) { - TRACE_WARN_ON_LOCKED(rt_mutex_owner(lock) != current); - TRACE_WARN_ON_LOCKED(list_empty(&lock->held_list_entry)); - - spin_lock_irqsave(¤t->held_list_lock, flags); - list_del_init(&lock->held_list_entry); - spin_unlock_irqrestore(¤t->held_list_lock, flags); - } + TRACE_WARN_ON_LOCKED(rt_mutex_owner(lock) != current); } -void debug_rt_mutex_proxy_lock(struct rt_mutex *lock, - struct task_struct *powner __IP_DECL__) +void +debug_rt_mutex_proxy_lock(struct rt_mutex *lock, struct task_struct *powner) { - unsigned long flags; - - if (rt_trace_on) { - TRACE_WARN_ON_LOCKED(!list_empty(&lock->held_list_entry)); - - spin_lock_irqsave(&powner->held_list_lock, flags); - list_add_tail(&lock->held_list_entry, &powner->held_list_head); - spin_unlock_irqrestore(&powner->held_list_lock, flags); - - lock->acquire_ip = ip; - } } void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock) { - unsigned long flags; - - if (rt_trace_on) { - struct task_struct *owner = rt_mutex_owner(lock); - - TRACE_WARN_ON_LOCKED(!owner); - TRACE_WARN_ON_LOCKED(list_empty(&lock->held_list_entry)); - - spin_lock_irqsave(&owner->held_list_lock, flags); - list_del_init(&lock->held_list_entry); - spin_unlock_irqrestore(&owner->held_list_lock, flags); - } + TRACE_WARN_ON_LOCKED(!rt_mutex_owner(lock)); } void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter) @@ -493,17 +224,15 @@ void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter) void debug_rt_mutex_init(struct rt_mutex *lock, const char *name) { - void *addr = lock; - - if (rt_trace_on) { - rt_mutex_debug_check_no_locks_freed(addr, - sizeof(struct rt_mutex)); - INIT_LIST_HEAD(&lock->held_list_entry); - lock->name = name; - } + /* + * Make sure we are not reinitializing a held lock: + */ + debug_check_no_locks_freed((void *)lock, sizeof(*lock)); + lock->name = name; } -void rt_mutex_deadlock_account_lock(struct rt_mutex *lock, task_t *task) +void +rt_mutex_deadlock_account_lock(struct rt_mutex *lock, struct task_struct *task) { } diff --git a/kernel/rtmutex-debug.h b/kernel/rtmutex-debug.h index 7612fbc62d7..14193d596d7 100644 --- a/kernel/rtmutex-debug.h +++ b/kernel/rtmutex-debug.h @@ -9,20 +9,16 @@ * This file contains macros used solely by rtmutex.c. Debug version. 
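Illustrative sketch (not part of the patch): with the rtmutex-private held_list gone, the debug code above leans on the generic debug_locks facilities instead (debug_show_held_locks(), debug_show_all_locks(), debug_check_no_locks_freed()). The check now used in debug_rt_mutex_init() generalizes to any object that embeds a lock and may be recycled; a hypothetical caller (recycle_object() is a made-up name):

static void recycle_object(void *obj, size_t size)
{
        /* Complain if a tracked, still-held lock lives inside this range. */
        debug_check_no_locks_freed(obj, size);
        memset(obj, 0, size);
}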
*/ -#define __IP_DECL__ , unsigned long ip -#define __IP__ , ip -#define __RET_IP__ , (unsigned long)__builtin_return_address(0) - extern void rt_mutex_deadlock_account_lock(struct rt_mutex *lock, struct task_struct *task); extern void rt_mutex_deadlock_account_unlock(struct task_struct *task); extern void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter); extern void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter); extern void debug_rt_mutex_init(struct rt_mutex *lock, const char *name); -extern void debug_rt_mutex_lock(struct rt_mutex *lock __IP_DECL__); +extern void debug_rt_mutex_lock(struct rt_mutex *lock); extern void debug_rt_mutex_unlock(struct rt_mutex *lock); extern void debug_rt_mutex_proxy_lock(struct rt_mutex *lock, - struct task_struct *powner __IP_DECL__); + struct task_struct *powner); extern void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock); extern void debug_rt_mutex_deadlock(int detect, struct rt_mutex_waiter *waiter, struct rt_mutex *lock); diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c index e82c2f84824..948bd8f643e 100644 --- a/kernel/rtmutex-tester.c +++ b/kernel/rtmutex-tester.c @@ -33,7 +33,7 @@ struct test_thread_data { }; static struct test_thread_data thread_data[MAX_RT_TEST_THREADS]; -static task_t *threads[MAX_RT_TEST_THREADS]; +static struct task_struct *threads[MAX_RT_TEST_THREADS]; static struct rt_mutex mutexes[MAX_RT_TEST_MUTEXES]; enum test_opcodes { @@ -275,6 +275,7 @@ static int test_func(void *data) /* Wait for the next command to be executed */ schedule(); + try_to_freeze(); if (signal_pending(current)) flush_signals(current); @@ -361,8 +362,8 @@ static ssize_t sysfs_test_command(struct sys_device *dev, const char *buf, static ssize_t sysfs_test_status(struct sys_device *dev, char *buf) { struct test_thread_data *td; + struct task_struct *tsk; char *curr = buf; - task_t *tsk; int i; td = container_of(dev, struct test_thread_data, sysdev); diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c index 45d61016da5..3e13a1e5856 100644 --- a/kernel/rtmutex.c +++ b/kernel/rtmutex.c @@ -7,6 +7,8 @@ * Copyright (C) 2005-2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com> * Copyright (C) 2005 Kihon Technologies Inc., Steven Rostedt * Copyright (C) 2006 Esben Nielsen + * + * See Documentation/rt-mutex-design.txt for details. */ #include <linux/spinlock.h> #include <linux/module.h> @@ -157,12 +159,11 @@ int max_lock_depth = 1024; * Decreases task's usage by one - may thus free the task. * Returns 0 or -EDEADLK. */ -static int rt_mutex_adjust_prio_chain(task_t *task, +static int rt_mutex_adjust_prio_chain(struct task_struct *task, int deadlock_detect, struct rt_mutex *orig_lock, struct rt_mutex_waiter *orig_waiter, - struct task_struct *top_task - __IP_DECL__) + struct task_struct *top_task) { struct rt_mutex *lock; struct rt_mutex_waiter *waiter, *top_waiter = orig_waiter; @@ -283,6 +284,7 @@ static int rt_mutex_adjust_prio_chain(task_t *task, spin_unlock_irqrestore(&task->pi_lock, flags); out_put_task: put_task_struct(task); + return ret; } @@ -357,7 +359,7 @@ static inline int try_to_steal_lock(struct rt_mutex *lock) * * Must be called with lock->wait_lock held. */ -static int try_to_take_rt_mutex(struct rt_mutex *lock __IP_DECL__) +static int try_to_take_rt_mutex(struct rt_mutex *lock) { /* * We have to be careful here if the atomic speedups are @@ -384,7 +386,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock __IP_DECL__) return 0; /* We got the lock. 
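Illustrative sketch (not part of the patch): the one-line rtmutex-tester change above, adding try_to_freeze() after schedule(), is the standard way a looping kernel thread cooperates with the process freezer during suspend; without it, freeze_processes() reports the thread as failing to stop. Generic shape of such a loop (worker_fn() is a made-up name):

static int worker_fn(void *unused)
{
        while (!kthread_should_stop()) {
                set_current_state(TASK_INTERRUPTIBLE);
                schedule();

                /* Enter the refrigerator if a suspend is in progress. */
                try_to_freeze();

                if (signal_pending(current))
                        flush_signals(current);

                /* ... handle the next queued command ... */
        }
        return 0;
}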
*/ - debug_rt_mutex_lock(lock __IP__); + debug_rt_mutex_lock(lock); rt_mutex_set_owner(lock, current, 0); @@ -402,13 +404,12 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock __IP_DECL__) */ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, struct rt_mutex_waiter *waiter, - int detect_deadlock - __IP_DECL__) + int detect_deadlock) { + struct task_struct *owner = rt_mutex_owner(lock); struct rt_mutex_waiter *top_waiter = waiter; - task_t *owner = rt_mutex_owner(lock); - int boost = 0, res; unsigned long flags; + int boost = 0, res; spin_lock_irqsave(¤t->pi_lock, flags); __rt_mutex_adjust_prio(current); @@ -454,7 +455,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, spin_unlock(&lock->wait_lock); res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock, waiter, - current __IP__); + current); spin_lock(&lock->wait_lock); @@ -526,12 +527,12 @@ static void wakeup_next_waiter(struct rt_mutex *lock) * Must be called with lock->wait_lock held */ static void remove_waiter(struct rt_mutex *lock, - struct rt_mutex_waiter *waiter __IP_DECL__) + struct rt_mutex_waiter *waiter) { int first = (waiter == rt_mutex_top_waiter(lock)); - int boost = 0; - task_t *owner = rt_mutex_owner(lock); + struct task_struct *owner = rt_mutex_owner(lock); unsigned long flags; + int boost = 0; spin_lock_irqsave(¤t->pi_lock, flags); plist_del(&waiter->list_entry, &lock->wait_list); @@ -568,7 +569,7 @@ static void remove_waiter(struct rt_mutex *lock, spin_unlock(&lock->wait_lock); - rt_mutex_adjust_prio_chain(owner, 0, lock, NULL, current __IP__); + rt_mutex_adjust_prio_chain(owner, 0, lock, NULL, current); spin_lock(&lock->wait_lock); } @@ -595,7 +596,7 @@ void rt_mutex_adjust_pi(struct task_struct *task) get_task_struct(task); spin_unlock_irqrestore(&task->pi_lock, flags); - rt_mutex_adjust_prio_chain(task, 0, NULL, NULL, task __RET_IP__); + rt_mutex_adjust_prio_chain(task, 0, NULL, NULL, task); } /* @@ -604,7 +605,7 @@ void rt_mutex_adjust_pi(struct task_struct *task) static int __sched rt_mutex_slowlock(struct rt_mutex *lock, int state, struct hrtimer_sleeper *timeout, - int detect_deadlock __IP_DECL__) + int detect_deadlock) { struct rt_mutex_waiter waiter; int ret = 0; @@ -615,7 +616,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, spin_lock(&lock->wait_lock); /* Try to acquire the lock again: */ - if (try_to_take_rt_mutex(lock __IP__)) { + if (try_to_take_rt_mutex(lock)) { spin_unlock(&lock->wait_lock); return 0; } @@ -629,7 +630,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, for (;;) { /* Try to acquire the lock: */ - if (try_to_take_rt_mutex(lock __IP__)) + if (try_to_take_rt_mutex(lock)) break; /* @@ -653,7 +654,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, */ if (!waiter.task) { ret = task_blocks_on_rt_mutex(lock, &waiter, - detect_deadlock __IP__); + detect_deadlock); /* * If we got woken up by the owner then start loop * all over without going into schedule to try @@ -680,7 +681,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, set_current_state(TASK_RUNNING); if (unlikely(waiter.task)) - remove_waiter(lock, &waiter __IP__); + remove_waiter(lock, &waiter); /* * try_to_take_rt_mutex() sets the waiter bit @@ -711,7 +712,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, * Slow path try-lock function: */ static inline int -rt_mutex_slowtrylock(struct rt_mutex *lock __IP_DECL__) +rt_mutex_slowtrylock(struct rt_mutex *lock) { int ret = 0; @@ -719,7 +720,7 @@ rt_mutex_slowtrylock(struct rt_mutex *lock __IP_DECL__) if 
(likely(rt_mutex_owner(lock) != current)) { - ret = try_to_take_rt_mutex(lock __IP__); + ret = try_to_take_rt_mutex(lock); /* * try_to_take_rt_mutex() sets the lock waiters * bit unconditionally. Clean this up. @@ -769,13 +770,13 @@ rt_mutex_fastlock(struct rt_mutex *lock, int state, int detect_deadlock, int (*slowfn)(struct rt_mutex *lock, int state, struct hrtimer_sleeper *timeout, - int detect_deadlock __IP_DECL__)) + int detect_deadlock)) { if (!detect_deadlock && likely(rt_mutex_cmpxchg(lock, NULL, current))) { rt_mutex_deadlock_account_lock(lock, current); return 0; } else - return slowfn(lock, state, NULL, detect_deadlock __RET_IP__); + return slowfn(lock, state, NULL, detect_deadlock); } static inline int @@ -783,24 +784,24 @@ rt_mutex_timed_fastlock(struct rt_mutex *lock, int state, struct hrtimer_sleeper *timeout, int detect_deadlock, int (*slowfn)(struct rt_mutex *lock, int state, struct hrtimer_sleeper *timeout, - int detect_deadlock __IP_DECL__)) + int detect_deadlock)) { if (!detect_deadlock && likely(rt_mutex_cmpxchg(lock, NULL, current))) { rt_mutex_deadlock_account_lock(lock, current); return 0; } else - return slowfn(lock, state, timeout, detect_deadlock __RET_IP__); + return slowfn(lock, state, timeout, detect_deadlock); } static inline int rt_mutex_fasttrylock(struct rt_mutex *lock, - int (*slowfn)(struct rt_mutex *lock __IP_DECL__)) + int (*slowfn)(struct rt_mutex *lock)) { if (likely(rt_mutex_cmpxchg(lock, NULL, current))) { rt_mutex_deadlock_account_lock(lock, current); return 1; } - return slowfn(lock __RET_IP__); + return slowfn(lock); } static inline void @@ -948,7 +949,7 @@ void rt_mutex_init_proxy_locked(struct rt_mutex *lock, struct task_struct *proxy_owner) { __rt_mutex_init(lock, NULL); - debug_rt_mutex_proxy_lock(lock, proxy_owner __RET_IP__); + debug_rt_mutex_proxy_lock(lock, proxy_owner); rt_mutex_set_owner(lock, proxy_owner, 0); rt_mutex_deadlock_account_lock(lock, proxy_owner); } diff --git a/kernel/rtmutex.h b/kernel/rtmutex.h index 1e0fca13ff7..a1a1dd06421 100644 --- a/kernel/rtmutex.h +++ b/kernel/rtmutex.h @@ -10,9 +10,6 @@ * Non-debug version. */ -#define __IP_DECL__ -#define __IP__ -#define __RET_IP__ #define rt_mutex_deadlock_check(l) (0) #define rt_mutex_deadlock_account_lock(m, t) do { } while (0) #define rt_mutex_deadlock_account_unlock(l) do { } while (0) diff --git a/kernel/rwsem.c b/kernel/rwsem.c new file mode 100644 index 00000000000..291ded556aa --- /dev/null +++ b/kernel/rwsem.c @@ -0,0 +1,147 @@ +/* kernel/rwsem.c: R/W semaphores, public implementation + * + * Written by David Howells (dhowells@redhat.com). 
+ * Derived from asm-i386/semaphore.h + */ + +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/rwsem.h> + +#include <asm/system.h> +#include <asm/atomic.h> + +/* + * lock for reading + */ +void down_read(struct rw_semaphore *sem) +{ + might_sleep(); + rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_); + + __down_read(sem); +} + +EXPORT_SYMBOL(down_read); + +/* + * trylock for reading -- returns 1 if successful, 0 if contention + */ +int down_read_trylock(struct rw_semaphore *sem) +{ + int ret = __down_read_trylock(sem); + + if (ret == 1) + rwsem_acquire_read(&sem->dep_map, 0, 1, _RET_IP_); + return ret; +} + +EXPORT_SYMBOL(down_read_trylock); + +/* + * lock for writing + */ +void down_write(struct rw_semaphore *sem) +{ + might_sleep(); + rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_); + + __down_write(sem); +} + +EXPORT_SYMBOL(down_write); + +/* + * trylock for writing -- returns 1 if successful, 0 if contention + */ +int down_write_trylock(struct rw_semaphore *sem) +{ + int ret = __down_write_trylock(sem); + + if (ret == 1) + rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_); + return ret; +} + +EXPORT_SYMBOL(down_write_trylock); + +/* + * release a read lock + */ +void up_read(struct rw_semaphore *sem) +{ + rwsem_release(&sem->dep_map, 1, _RET_IP_); + + __up_read(sem); +} + +EXPORT_SYMBOL(up_read); + +/* + * release a write lock + */ +void up_write(struct rw_semaphore *sem) +{ + rwsem_release(&sem->dep_map, 1, _RET_IP_); + + __up_write(sem); +} + +EXPORT_SYMBOL(up_write); + +/* + * downgrade write lock to read lock + */ +void downgrade_write(struct rw_semaphore *sem) +{ + /* + * lockdep: a downgraded write will live on as a write + * dependency. + */ + __downgrade_write(sem); +} + +EXPORT_SYMBOL(downgrade_write); + +#ifdef CONFIG_DEBUG_LOCK_ALLOC + +void down_read_nested(struct rw_semaphore *sem, int subclass) +{ + might_sleep(); + rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_); + + __down_read(sem); +} + +EXPORT_SYMBOL(down_read_nested); + +void down_read_non_owner(struct rw_semaphore *sem) +{ + might_sleep(); + + __down_read(sem); +} + +EXPORT_SYMBOL(down_read_non_owner); + +void down_write_nested(struct rw_semaphore *sem, int subclass) +{ + might_sleep(); + rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_); + + __down_write_nested(sem, subclass); +} + +EXPORT_SYMBOL(down_write_nested); + +void up_read_non_owner(struct rw_semaphore *sem) +{ + __up_read(sem); +} + +EXPORT_SYMBOL(up_read_non_owner); + +#endif + + diff --git a/kernel/sched.c b/kernel/sched.c index d5e37072ea5..a234fbee123 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -30,6 +30,7 @@ #include <linux/capability.h> #include <linux/completion.h> #include <linux/kernel_stat.h> +#include <linux/debug_locks.h> #include <linux/security.h> #include <linux/notifier.h> #include <linux/profile.h> @@ -50,6 +51,7 @@ #include <linux/times.h> #include <linux/acct.h> #include <linux/kprobes.h> +#include <linux/delayacct.h> #include <asm/tlb.h> #include <asm/unistd.h> @@ -178,20 +180,15 @@ static unsigned int static_prio_timeslice(int static_prio) return SCALE_PRIO(DEF_TIMESLICE, static_prio); } -static inline unsigned int task_timeslice(task_t *p) +static inline unsigned int task_timeslice(struct task_struct *p) { return static_prio_timeslice(p->static_prio); } -#define task_hot(p, now, sd) ((long long) ((now) - (p)->last_ran) \ - < (long long) (sd)->cache_hot_time) - /* * These are the runqueue data structures: */ -typedef struct runqueue runqueue_t; - struct prio_array { unsigned 
int nr_active; DECLARE_BITMAP(bitmap, MAX_PRIO+1); /* include 1 bit for delimiter */ @@ -205,7 +202,7 @@ struct prio_array { * (such as the load balancing or the thread migration code), lock * acquire operations must be ordered by ascending &runqueue. */ -struct runqueue { +struct rq { spinlock_t lock; /* @@ -229,9 +226,9 @@ struct runqueue { unsigned long expired_timestamp; unsigned long long timestamp_last_tick; - task_t *curr, *idle; + struct task_struct *curr, *idle; struct mm_struct *prev_mm; - prio_array_t *active, *expired, arrays[2]; + struct prio_array *active, *expired, arrays[2]; int best_expired_prio; atomic_t nr_iowait; @@ -242,7 +239,7 @@ struct runqueue { int active_balance; int push_cpu; - task_t *migration_thread; + struct task_struct *migration_thread; struct list_head migration_queue; #endif @@ -265,9 +262,10 @@ struct runqueue { unsigned long ttwu_cnt; unsigned long ttwu_local; #endif + struct lock_class_key rq_lock_key; }; -static DEFINE_PER_CPU(struct runqueue, runqueues); +static DEFINE_PER_CPU(struct rq, runqueues); /* * The domain tree (rq->sd) is protected by RCU's quiescent state transition. @@ -276,8 +274,8 @@ static DEFINE_PER_CPU(struct runqueue, runqueues); * The domain tree of any CPU may only be accessed from within * preempt-disabled sections. */ -#define for_each_domain(cpu, domain) \ -for (domain = rcu_dereference(cpu_rq(cpu)->sd); domain; domain = domain->parent) +#define for_each_domain(cpu, __sd) \ + for (__sd = rcu_dereference(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent) #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) #define this_rq() (&__get_cpu_var(runqueues)) @@ -292,26 +290,33 @@ for (domain = rcu_dereference(cpu_rq(cpu)->sd); domain; domain = domain->parent) #endif #ifndef __ARCH_WANT_UNLOCKED_CTXSW -static inline int task_running(runqueue_t *rq, task_t *p) +static inline int task_running(struct rq *rq, struct task_struct *p) { return rq->curr == p; } -static inline void prepare_lock_switch(runqueue_t *rq, task_t *next) +static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) { } -static inline void finish_lock_switch(runqueue_t *rq, task_t *prev) +static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) { #ifdef CONFIG_DEBUG_SPINLOCK /* this is a valid case when another task releases the spinlock */ rq->lock.owner = current; #endif + /* + * If we are tracking spinlock dependencies then we have to + * fix up the runqueue lock - which gets 'carried over' from + * prev into current: + */ + spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); + spin_unlock_irq(&rq->lock); } #else /* __ARCH_WANT_UNLOCKED_CTXSW */ -static inline int task_running(runqueue_t *rq, task_t *p) +static inline int task_running(struct rq *rq, struct task_struct *p) { #ifdef CONFIG_SMP return p->oncpu; @@ -320,7 +325,7 @@ static inline int task_running(runqueue_t *rq, task_t *p) #endif } -static inline void prepare_lock_switch(runqueue_t *rq, task_t *next) +static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) { #ifdef CONFIG_SMP /* @@ -337,7 +342,7 @@ static inline void prepare_lock_switch(runqueue_t *rq, task_t *next) #endif } -static inline void finish_lock_switch(runqueue_t *rq, task_t *prev) +static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) { #ifdef CONFIG_SMP /* @@ -358,10 +363,10 @@ static inline void finish_lock_switch(runqueue_t *rq, task_t *prev) * __task_rq_lock - lock the runqueue a given task resides on. * Must be called interrupts disabled. 
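Illustrative sketch (not part of the patch): the finish_lock_switch() hunk above deals with a lock whose acquirer and releaser are different tasks — the outgoing task takes rq->lock before the context switch and the incoming task releases it afterwards. Because lockdep tracks held locks per task, the incoming task first registers the acquisition on the lock's dep_map and only then unlocks. The handover in isolation (adopt_and_unlock() is a made-up name):

static void adopt_and_unlock(spinlock_t *lock)
{
        /* Record in lockdep that the current task now owns this lock... */
        spin_acquire(&lock->dep_map, 0, 0, _THIS_IP_);

        /* ...so that the release below balances correctly. */
        spin_unlock_irq(lock);
}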
*/ -static inline runqueue_t *__task_rq_lock(task_t *p) +static inline struct rq *__task_rq_lock(struct task_struct *p) __acquires(rq->lock) { - struct runqueue *rq; + struct rq *rq; repeat_lock_task: rq = task_rq(p); @@ -378,10 +383,10 @@ repeat_lock_task: * interrupts. Note the ordering: we can safely lookup the task_rq without * explicitly disabling preemption. */ -static runqueue_t *task_rq_lock(task_t *p, unsigned long *flags) +static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) __acquires(rq->lock) { - struct runqueue *rq; + struct rq *rq; repeat_lock_task: local_irq_save(*flags); @@ -394,13 +399,13 @@ repeat_lock_task: return rq; } -static inline void __task_rq_unlock(runqueue_t *rq) +static inline void __task_rq_unlock(struct rq *rq) __releases(rq->lock) { spin_unlock(&rq->lock); } -static inline void task_rq_unlock(runqueue_t *rq, unsigned long *flags) +static inline void task_rq_unlock(struct rq *rq, unsigned long *flags) __releases(rq->lock) { spin_unlock_irqrestore(&rq->lock, *flags); @@ -420,7 +425,7 @@ static int show_schedstat(struct seq_file *seq, void *v) seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION); seq_printf(seq, "timestamp %lu\n", jiffies); for_each_online_cpu(cpu) { - runqueue_t *rq = cpu_rq(cpu); + struct rq *rq = cpu_rq(cpu); #ifdef CONFIG_SMP struct sched_domain *sd; int dcnt = 0; @@ -497,9 +502,36 @@ struct file_operations proc_schedstat_operations = { .release = single_release, }; +/* + * Expects runqueue lock to be held for atomicity of update + */ +static inline void +rq_sched_info_arrive(struct rq *rq, unsigned long delta_jiffies) +{ + if (rq) { + rq->rq_sched_info.run_delay += delta_jiffies; + rq->rq_sched_info.pcnt++; + } +} + +/* + * Expects runqueue lock to be held for atomicity of update + */ +static inline void +rq_sched_info_depart(struct rq *rq, unsigned long delta_jiffies) +{ + if (rq) + rq->rq_sched_info.cpu_time += delta_jiffies; +} # define schedstat_inc(rq, field) do { (rq)->field++; } while (0) # define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0) #else /* !CONFIG_SCHEDSTATS */ +static inline void +rq_sched_info_arrive(struct rq *rq, unsigned long delta_jiffies) +{} +static inline void +rq_sched_info_depart(struct rq *rq, unsigned long delta_jiffies) +{} # define schedstat_inc(rq, field) do { } while (0) # define schedstat_add(rq, field, amt) do { } while (0) #endif @@ -507,10 +539,10 @@ struct file_operations proc_schedstat_operations = { /* * rq_lock - lock a given runqueue and disable interrupts. */ -static inline runqueue_t *this_rq_lock(void) +static inline struct rq *this_rq_lock(void) __acquires(rq->lock) { - runqueue_t *rq; + struct rq *rq; local_irq_disable(); rq = this_rq(); @@ -519,7 +551,7 @@ static inline runqueue_t *this_rq_lock(void) return rq; } -#ifdef CONFIG_SCHEDSTATS +#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) /* * Called when a process is dequeued from the active array and given * the cpu. We should note that with the exception of interactive @@ -535,7 +567,7 @@ static inline runqueue_t *this_rq_lock(void) * long it was from the *first* time it was queued to the time that it * finally hit a cpu. */ -static inline void sched_info_dequeued(task_t *t) +static inline void sched_info_dequeued(struct task_struct *t) { t->sched_info.last_queued = 0; } @@ -545,23 +577,18 @@ static inline void sched_info_dequeued(task_t *t) * long it was waiting to run. We also note when it began so that we * can keep stats on how long its timeslice is. 
*/ -static void sched_info_arrive(task_t *t) +static void sched_info_arrive(struct task_struct *t) { - unsigned long now = jiffies, diff = 0; - struct runqueue *rq = task_rq(t); + unsigned long now = jiffies, delta_jiffies = 0; if (t->sched_info.last_queued) - diff = now - t->sched_info.last_queued; + delta_jiffies = now - t->sched_info.last_queued; sched_info_dequeued(t); - t->sched_info.run_delay += diff; + t->sched_info.run_delay += delta_jiffies; t->sched_info.last_arrival = now; t->sched_info.pcnt++; - if (!rq) - return; - - rq->rq_sched_info.run_delay += diff; - rq->rq_sched_info.pcnt++; + rq_sched_info_arrive(task_rq(t), delta_jiffies); } /* @@ -579,25 +606,23 @@ static void sched_info_arrive(task_t *t) * the timestamp if it is already not set. It's assumed that * sched_info_dequeued() will clear that stamp when appropriate. */ -static inline void sched_info_queued(task_t *t) +static inline void sched_info_queued(struct task_struct *t) { - if (!t->sched_info.last_queued) - t->sched_info.last_queued = jiffies; + if (unlikely(sched_info_on())) + if (!t->sched_info.last_queued) + t->sched_info.last_queued = jiffies; } /* * Called when a process ceases being the active-running process, either * voluntarily or involuntarily. Now we can calculate how long we ran. */ -static inline void sched_info_depart(task_t *t) +static inline void sched_info_depart(struct task_struct *t) { - struct runqueue *rq = task_rq(t); - unsigned long diff = jiffies - t->sched_info.last_arrival; + unsigned long delta_jiffies = jiffies - t->sched_info.last_arrival; - t->sched_info.cpu_time += diff; - - if (rq) - rq->rq_sched_info.cpu_time += diff; + t->sched_info.cpu_time += delta_jiffies; + rq_sched_info_depart(task_rq(t), delta_jiffies); } /* @@ -605,9 +630,10 @@ static inline void sched_info_depart(task_t *t) * their time slice. (This may also be called when switching to or from * the idle task.) We are only called when prev != next. */ -static inline void sched_info_switch(task_t *prev, task_t *next) +static inline void +__sched_info_switch(struct task_struct *prev, struct task_struct *next) { - struct runqueue *rq = task_rq(prev); + struct rq *rq = task_rq(prev); /* * prev now departs the cpu. It's not interesting to record @@ -620,15 +646,21 @@ static inline void sched_info_switch(task_t *prev, task_t *next) if (next != rq->idle) sched_info_arrive(next); } +static inline void +sched_info_switch(struct task_struct *prev, struct task_struct *next) +{ + if (unlikely(sched_info_on())) + __sched_info_switch(prev, next); +} #else #define sched_info_queued(t) do { } while (0) #define sched_info_switch(t, next) do { } while (0) -#endif /* CONFIG_SCHEDSTATS */ +#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */ /* * Adding/removing a task to/from a priority array: */ -static void dequeue_task(struct task_struct *p, prio_array_t *array) +static void dequeue_task(struct task_struct *p, struct prio_array *array) { array->nr_active--; list_del(&p->run_list); @@ -636,7 +668,7 @@ static void dequeue_task(struct task_struct *p, prio_array_t *array) __clear_bit(p->prio, array->bitmap); } -static void enqueue_task(struct task_struct *p, prio_array_t *array) +static void enqueue_task(struct task_struct *p, struct prio_array *array) { sched_info_queued(p); list_add_tail(&p->run_list, array->queue + p->prio); @@ -649,12 +681,13 @@ static void enqueue_task(struct task_struct *p, prio_array_t *array) * Put task to the end of the run list without the overhead of dequeue * followed by enqueue. 
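Illustrative sketch (not part of the patch): the sched_info_arrive()/sched_info_depart() bookkeeping above, now shared by schedstats and per-task delay accounting, boils down to two counters per task. A miniature model (names are made up; everything is in jiffies, as in the real code):

struct sched_info_sketch {
        unsigned long last_queued;      /* when the task was made runnable */
        unsigned long last_arrival;     /* when it last got a CPU */
        unsigned long run_delay;        /* total time runnable but waiting */
        unsigned long cpu_time;         /* total time actually on a CPU */
};

static void sketch_arrive(struct sched_info_sketch *si, unsigned long now)
{
        if (si->last_queued)
                si->run_delay += now - si->last_queued;
        si->last_queued = 0;
        si->last_arrival = now;
}

static void sketch_depart(struct sched_info_sketch *si, unsigned long now)
{
        si->cpu_time += now - si->last_arrival;
}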
*/ -static void requeue_task(struct task_struct *p, prio_array_t *array) +static void requeue_task(struct task_struct *p, struct prio_array *array) { list_move_tail(&p->run_list, array->queue + p->prio); } -static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array) +static inline void +enqueue_task_head(struct task_struct *p, struct prio_array *array) { list_add(&p->run_list, array->queue + p->prio); __set_bit(p->prio, array->bitmap); @@ -677,7 +710,7 @@ static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array) * Both properties are important to certain workloads. */ -static inline int __normal_prio(task_t *p) +static inline int __normal_prio(struct task_struct *p) { int bonus, prio; @@ -713,7 +746,7 @@ static inline int __normal_prio(task_t *p) #define RTPRIO_TO_LOAD_WEIGHT(rp) \ (PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) + LOAD_WEIGHT(rp)) -static void set_load_weight(task_t *p) +static void set_load_weight(struct task_struct *p) { if (has_rt_policy(p)) { #ifdef CONFIG_SMP @@ -731,23 +764,25 @@ static void set_load_weight(task_t *p) p->load_weight = PRIO_TO_LOAD_WEIGHT(p->static_prio); } -static inline void inc_raw_weighted_load(runqueue_t *rq, const task_t *p) +static inline void +inc_raw_weighted_load(struct rq *rq, const struct task_struct *p) { rq->raw_weighted_load += p->load_weight; } -static inline void dec_raw_weighted_load(runqueue_t *rq, const task_t *p) +static inline void +dec_raw_weighted_load(struct rq *rq, const struct task_struct *p) { rq->raw_weighted_load -= p->load_weight; } -static inline void inc_nr_running(task_t *p, runqueue_t *rq) +static inline void inc_nr_running(struct task_struct *p, struct rq *rq) { rq->nr_running++; inc_raw_weighted_load(rq, p); } -static inline void dec_nr_running(task_t *p, runqueue_t *rq) +static inline void dec_nr_running(struct task_struct *p, struct rq *rq) { rq->nr_running--; dec_raw_weighted_load(rq, p); @@ -760,7 +795,7 @@ static inline void dec_nr_running(task_t *p, runqueue_t *rq) * setprio syscalls, and whenever the interactivity * estimator recalculates. */ -static inline int normal_prio(task_t *p) +static inline int normal_prio(struct task_struct *p) { int prio; @@ -778,7 +813,7 @@ static inline int normal_prio(task_t *p) * interactivity modifiers. Will be RT if the task got * RT-boosted. If not then it returns p->normal_prio. */ -static int effective_prio(task_t *p) +static int effective_prio(struct task_struct *p) { p->normal_prio = normal_prio(p); /* @@ -794,9 +829,9 @@ static int effective_prio(task_t *p) /* * __activate_task - move a task to the runqueue. */ -static void __activate_task(task_t *p, runqueue_t *rq) +static void __activate_task(struct task_struct *p, struct rq *rq) { - prio_array_t *target = rq->active; + struct prio_array *target = rq->active; if (batch_task(p)) target = rq->expired; @@ -807,7 +842,7 @@ static void __activate_task(task_t *p, runqueue_t *rq) /* * __activate_idle_task - move idle task to the _front_ of runqueue. 
*/ -static inline void __activate_idle_task(task_t *p, runqueue_t *rq) +static inline void __activate_idle_task(struct task_struct *p, struct rq *rq) { enqueue_task_head(p, rq->active); inc_nr_running(p, rq); @@ -817,7 +852,7 @@ static inline void __activate_idle_task(task_t *p, runqueue_t *rq) * Recalculate p->normal_prio and p->prio after having slept, * updating the sleep-average too: */ -static int recalc_task_prio(task_t *p, unsigned long long now) +static int recalc_task_prio(struct task_struct *p, unsigned long long now) { /* Caller must always ensure 'now >= p->timestamp' */ unsigned long sleep_time = now - p->timestamp; @@ -889,7 +924,7 @@ static int recalc_task_prio(task_t *p, unsigned long long now) * Update all the scheduling statistics stuff. (sleep average * calculation, priority modifiers, etc.) */ -static void activate_task(task_t *p, runqueue_t *rq, int local) +static void activate_task(struct task_struct *p, struct rq *rq, int local) { unsigned long long now; @@ -897,7 +932,7 @@ static void activate_task(task_t *p, runqueue_t *rq, int local) #ifdef CONFIG_SMP if (!local) { /* Compensate for drifting sched_clock */ - runqueue_t *this_rq = this_rq(); + struct rq *this_rq = this_rq(); now = (now - this_rq->timestamp_last_tick) + rq->timestamp_last_tick; } @@ -936,7 +971,7 @@ static void activate_task(task_t *p, runqueue_t *rq, int local) /* * deactivate_task - remove a task from the runqueue. */ -static void deactivate_task(struct task_struct *p, runqueue_t *rq) +static void deactivate_task(struct task_struct *p, struct rq *rq) { dec_nr_running(p, rq); dequeue_task(p, p->array); @@ -956,7 +991,7 @@ static void deactivate_task(struct task_struct *p, runqueue_t *rq) #define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) #endif -static void resched_task(task_t *p) +static void resched_task(struct task_struct *p) { int cpu; @@ -977,7 +1012,7 @@ static void resched_task(task_t *p) smp_send_reschedule(cpu); } #else -static inline void resched_task(task_t *p) +static inline void resched_task(struct task_struct *p) { assert_spin_locked(&task_rq(p)->lock); set_tsk_need_resched(p); @@ -988,7 +1023,7 @@ static inline void resched_task(task_t *p) * task_curr - is this task currently executing on a CPU? * @p: the task in question. */ -inline int task_curr(const task_t *p) +inline int task_curr(const struct task_struct *p) { return cpu_curr(task_cpu(p)) == p; } @@ -1000,22 +1035,23 @@ unsigned long weighted_cpuload(const int cpu) } #ifdef CONFIG_SMP -typedef struct { +struct migration_req { struct list_head list; - task_t *task; + struct task_struct *task; int dest_cpu; struct completion done; -} migration_req_t; +}; /* * The task's runqueue lock must be held. * Returns true if you have to wait for migration thread. */ -static int migrate_task(task_t *p, int dest_cpu, migration_req_t *req) +static int +migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req) { - runqueue_t *rq = task_rq(p); + struct rq *rq = task_rq(p); /* * If the task is not on a runqueue (and not running), then @@ -1030,6 +1066,7 @@ static int migrate_task(task_t *p, int dest_cpu, migration_req_t *req) req->task = p; req->dest_cpu = dest_cpu; list_add(&req->list, &rq->migration_queue); + return 1; } @@ -1042,10 +1079,10 @@ static int migrate_task(task_t *p, int dest_cpu, migration_req_t *req) * smp_call_function() if an IPI is sent by the same process we are * waiting to become inactive. 
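Illustrative sketch (not part of the patch): migrate_task() above only queues a struct migration_req on the task's runqueue; the per-cpu migration thread performs the actual move and signals req.done. Callers elsewhere in sched.c therefore follow roughly this shape (migrate_task_and_wait() is a made-up wrapper; error handling and cpumask checks omitted):

static void migrate_task_and_wait(struct task_struct *p, int dest_cpu)
{
        struct migration_req req;
        unsigned long flags;
        struct rq *rq;

        rq = task_rq_lock(p, &flags);
        if (migrate_task(p, dest_cpu, &req)) {
                /* Task is queued or running: hand the move to the migration thread. */
                task_rq_unlock(rq, &flags);
                wake_up_process(rq->migration_thread);
                wait_for_completion(&req.done);
                return;
        }
        /* Task was neither queued nor running: migrate_task() already re-homed it. */
        task_rq_unlock(rq, &flags);
}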
*/ -void wait_task_inactive(task_t *p) +void wait_task_inactive(struct task_struct *p) { unsigned long flags; - runqueue_t *rq; + struct rq *rq; int preempted; repeat: @@ -1076,7 +1113,7 @@ repeat: * to another CPU then no harm is done and the purpose has been * achieved as well. */ -void kick_process(task_t *p) +void kick_process(struct task_struct *p) { int cpu; @@ -1096,7 +1133,7 @@ void kick_process(task_t *p) */ static inline unsigned long source_load(int cpu, int type) { - runqueue_t *rq = cpu_rq(cpu); + struct rq *rq = cpu_rq(cpu); if (type == 0) return rq->raw_weighted_load; @@ -1110,7 +1147,7 @@ static inline unsigned long source_load(int cpu, int type) */ static inline unsigned long target_load(int cpu, int type) { - runqueue_t *rq = cpu_rq(cpu); + struct rq *rq = cpu_rq(cpu); if (type == 0) return rq->raw_weighted_load; @@ -1123,10 +1160,10 @@ static inline unsigned long target_load(int cpu, int type) */ static inline unsigned long cpu_avg_load_per_task(int cpu) { - runqueue_t *rq = cpu_rq(cpu); + struct rq *rq = cpu_rq(cpu); unsigned long n = rq->nr_running; - return n ? rq->raw_weighted_load / n : SCHED_LOAD_SCALE; + return n ? rq->raw_weighted_load / n : SCHED_LOAD_SCALE; } /* @@ -1279,7 +1316,7 @@ nextlevel: * Returns the CPU we should wake onto. */ #if defined(ARCH_HAS_SCHED_WAKE_IDLE) -static int wake_idle(int cpu, task_t *p) +static int wake_idle(int cpu, struct task_struct *p) { cpumask_t tmp; struct sched_domain *sd; @@ -1302,7 +1339,7 @@ static int wake_idle(int cpu, task_t *p) return cpu; } #else -static inline int wake_idle(int cpu, task_t *p) +static inline int wake_idle(int cpu, struct task_struct *p) { return cpu; } @@ -1322,15 +1359,15 @@ static inline int wake_idle(int cpu, task_t *p) * * returns failure only if the task is already active. */ -static int try_to_wake_up(task_t *p, unsigned int state, int sync) +static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) { int cpu, this_cpu, success = 0; unsigned long flags; long old_state; - runqueue_t *rq; + struct rq *rq; #ifdef CONFIG_SMP - unsigned long load, this_load; struct sched_domain *sd, *this_sd = NULL; + unsigned long load, this_load; int new_cpu; #endif @@ -1480,15 +1517,14 @@ out: return success; } -int fastcall wake_up_process(task_t *p) +int fastcall wake_up_process(struct task_struct *p) { return try_to_wake_up(p, TASK_STOPPED | TASK_TRACED | TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0); } - EXPORT_SYMBOL(wake_up_process); -int fastcall wake_up_state(task_t *p, unsigned int state) +int fastcall wake_up_state(struct task_struct *p, unsigned int state) { return try_to_wake_up(p, state, 0); } @@ -1497,7 +1533,7 @@ int fastcall wake_up_state(task_t *p, unsigned int state) * Perform scheduler related setup for a newly forked process p. * p is forked by current. 
*/ -void fastcall sched_fork(task_t *p, int clone_flags) +void fastcall sched_fork(struct task_struct *p, int clone_flags) { int cpu = get_cpu(); @@ -1521,8 +1557,9 @@ void fastcall sched_fork(task_t *p, int clone_flags) INIT_LIST_HEAD(&p->run_list); p->array = NULL; -#ifdef CONFIG_SCHEDSTATS - memset(&p->sched_info, 0, sizeof(p->sched_info)); +#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) + if (unlikely(sched_info_on())) + memset(&p->sched_info, 0, sizeof(p->sched_info)); #endif #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) p->oncpu = 0; @@ -1565,11 +1602,11 @@ void fastcall sched_fork(task_t *p, int clone_flags) * that must be done for every newly created context, then puts the task * on the runqueue and wakes it. */ -void fastcall wake_up_new_task(task_t *p, unsigned long clone_flags) +void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags) { + struct rq *rq, *this_rq; unsigned long flags; int this_cpu, cpu; - runqueue_t *rq, *this_rq; rq = task_rq_lock(p, &flags); BUG_ON(p->state != TASK_RUNNING); @@ -1649,10 +1686,10 @@ void fastcall wake_up_new_task(task_t *p, unsigned long clone_flags) * artificially, because any timeslice recovered here * was given away by the parent in the first place.) */ -void fastcall sched_exit(task_t *p) +void fastcall sched_exit(struct task_struct *p) { unsigned long flags; - runqueue_t *rq; + struct rq *rq; /* * If the child was a (relative-) CPU hog then decrease @@ -1683,7 +1720,7 @@ void fastcall sched_exit(task_t *p) * prepare_task_switch sets up locking and calls architecture specific * hooks. */ -static inline void prepare_task_switch(runqueue_t *rq, task_t *next) +static inline void prepare_task_switch(struct rq *rq, struct task_struct *next) { prepare_lock_switch(rq, next); prepare_arch_switch(next); @@ -1704,7 +1741,7 @@ static inline void prepare_task_switch(runqueue_t *rq, task_t *next) * with the lock held can cause deadlocks; see schedule() for * details.) */ -static inline void finish_task_switch(runqueue_t *rq, task_t *prev) +static inline void finish_task_switch(struct rq *rq, struct task_struct *prev) __releases(rq->lock) { struct mm_struct *mm = rq->prev_mm; @@ -1742,10 +1779,11 @@ static inline void finish_task_switch(runqueue_t *rq, task_t *prev) * schedule_tail - first thing a freshly forked thread must call. * @prev: the thread we just switched away from. */ -asmlinkage void schedule_tail(task_t *prev) +asmlinkage void schedule_tail(struct task_struct *prev) __releases(rq->lock) { - runqueue_t *rq = this_rq(); + struct rq *rq = this_rq(); + finish_task_switch(rq, prev); #ifdef __ARCH_WANT_UNLOCKED_CTXSW /* In this case, finish_task_switch does not reenable preemption */ @@ -1759,8 +1797,9 @@ asmlinkage void schedule_tail(task_t *prev) * context_switch - switch to the new MM and the new * thread's register state. 
*/ -static inline -task_t * context_switch(runqueue_t *rq, task_t *prev, task_t *next) +static inline struct task_struct * +context_switch(struct rq *rq, struct task_struct *prev, + struct task_struct *next) { struct mm_struct *mm = next->mm; struct mm_struct *oldmm = prev->active_mm; @@ -1777,6 +1816,15 @@ task_t * context_switch(runqueue_t *rq, task_t *prev, task_t *next) WARN_ON(rq->prev_mm); rq->prev_mm = oldmm; } + /* + * Since the runqueue lock will be released by the next + * task (which is an invalid locking op but in the case + * of the scheduler it's an obvious special-case), so we + * do an early lockdep release here: + */ +#ifndef __ARCH_WANT_UNLOCKED_CTXSW + spin_release(&rq->lock.dep_map, 1, _THIS_IP_); +#endif /* Here we just switch the register state and the stack. */ switch_to(prev, next, prev); @@ -1857,12 +1905,21 @@ unsigned long nr_active(void) #ifdef CONFIG_SMP /* + * Is this task likely cache-hot: + */ +static inline int +task_hot(struct task_struct *p, unsigned long long now, struct sched_domain *sd) +{ + return (long long)(now - p->last_ran) < (long long)sd->cache_hot_time; +} + +/* * double_rq_lock - safely lock two runqueues * * Note this does not disable interrupts like task_rq_lock, * you need to do so manually before calling. */ -static void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2) +static void double_rq_lock(struct rq *rq1, struct rq *rq2) __acquires(rq1->lock) __acquires(rq2->lock) { @@ -1886,7 +1943,7 @@ static void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2) * Note this does not restore interrupts like task_rq_unlock, * you need to do so manually after calling. */ -static void double_rq_unlock(runqueue_t *rq1, runqueue_t *rq2) +static void double_rq_unlock(struct rq *rq1, struct rq *rq2) __releases(rq1->lock) __releases(rq2->lock) { @@ -1900,7 +1957,7 @@ static void double_rq_unlock(runqueue_t *rq1, runqueue_t *rq2) /* * double_lock_balance - lock the busiest runqueue, this_rq is locked already. */ -static void double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest) +static void double_lock_balance(struct rq *this_rq, struct rq *busiest) __releases(this_rq->lock) __acquires(busiest->lock) __acquires(this_rq->lock) @@ -1921,11 +1978,11 @@ static void double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest) * allow dest_cpu, which will force the cpu onto dest_cpu. Then * the cpu_allowed mask is restored. */ -static void sched_migrate_task(task_t *p, int dest_cpu) +static void sched_migrate_task(struct task_struct *p, int dest_cpu) { - migration_req_t req; - runqueue_t *rq; + struct migration_req req; unsigned long flags; + struct rq *rq; rq = task_rq_lock(p, &flags); if (!cpu_isset(dest_cpu, p->cpus_allowed) @@ -1936,11 +1993,13 @@ static void sched_migrate_task(task_t *p, int dest_cpu) if (migrate_task(p, dest_cpu, &req)) { /* Need to wait for migration thread (might exit: take ref). */ struct task_struct *mt = rq->migration_thread; + get_task_struct(mt); task_rq_unlock(rq, &flags); wake_up_process(mt); put_task_struct(mt); wait_for_completion(&req.done); + return; } out: @@ -1964,9 +2023,9 @@ void sched_exec(void) * pull_task - move a task from a remote runqueue to the local runqueue. * Both runqueues must be locked. 
*/ -static -void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p, - runqueue_t *this_rq, prio_array_t *this_array, int this_cpu) +static void pull_task(struct rq *src_rq, struct prio_array *src_array, + struct task_struct *p, struct rq *this_rq, + struct prio_array *this_array, int this_cpu) { dequeue_task(p, src_array); dec_nr_running(p, src_rq); @@ -1987,7 +2046,7 @@ void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p, * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? */ static -int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu, +int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, struct sched_domain *sd, enum idle_type idle, int *all_pinned) { @@ -2019,6 +2078,7 @@ int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu, } #define rq_best_prio(rq) min((rq)->curr->prio, (rq)->best_expired_prio) + /* * move_tasks tries to move up to max_nr_move tasks and max_load_move weighted * load from busiest to this_rq, as part of a balancing operation within @@ -2026,18 +2086,17 @@ int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu, * * Called with both runqueues locked. */ -static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest, +static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, unsigned long max_nr_move, unsigned long max_load_move, struct sched_domain *sd, enum idle_type idle, int *all_pinned) { - prio_array_t *array, *dst_array; + int idx, pulled = 0, pinned = 0, this_best_prio, best_prio, + best_prio_seen, skip_for_load; + struct prio_array *array, *dst_array; struct list_head *head, *curr; - int idx, pulled = 0, pinned = 0, this_best_prio, busiest_best_prio; - int busiest_best_prio_seen; - int skip_for_load; /* skip the task based on weighted load issues */ + struct task_struct *tmp; long rem_load_move; - task_t *tmp; if (max_nr_move == 0 || max_load_move == 0) goto out; @@ -2045,15 +2104,15 @@ static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest, rem_load_move = max_load_move; pinned = 1; this_best_prio = rq_best_prio(this_rq); - busiest_best_prio = rq_best_prio(busiest); + best_prio = rq_best_prio(busiest); /* * Enable handling of the case where there is more than one task * with the best priority. If the current running task is one - * of those with prio==busiest_best_prio we know it won't be moved + * of those with prio==best_prio we know it won't be moved * and therefore it's safe to override the skip (based on load) of * any task we find with that prio. */ - busiest_best_prio_seen = busiest_best_prio == busiest->curr->prio; + best_prio_seen = best_prio == busiest->curr->prio; /* * We first consider expired tasks. 
Those will likely not be @@ -2089,7 +2148,7 @@ skip_bitmap: head = array->queue + idx; curr = head->prev; skip_queue: - tmp = list_entry(curr, task_t, run_list); + tmp = list_entry(curr, struct task_struct, run_list); curr = curr->prev; @@ -2100,10 +2159,11 @@ skip_queue: */ skip_for_load = tmp->load_weight > rem_load_move; if (skip_for_load && idx < this_best_prio) - skip_for_load = !busiest_best_prio_seen && idx == busiest_best_prio; + skip_for_load = !best_prio_seen && idx == best_prio; if (skip_for_load || !can_migrate_task(tmp, busiest, this_cpu, sd, idle, &pinned)) { - busiest_best_prio_seen |= idx == busiest_best_prio; + + best_prio_seen |= idx == best_prio; if (curr != head) goto skip_queue; idx++; @@ -2146,8 +2206,8 @@ out: /* * find_busiest_group finds and returns the busiest CPU group within the - * domain. It calculates and returns the amount of weighted load which should be - * moved to restore balance via the imbalance parameter. + * domain. It calculates and returns the amount of weighted load which + * should be moved to restore balance via the imbalance parameter. */ static struct sched_group * find_busiest_group(struct sched_domain *sd, int this_cpu, @@ -2188,7 +2248,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, sum_weighted_load = sum_nr_running = avg_load = 0; for_each_cpu_mask(i, group->cpumask) { - runqueue_t *rq = cpu_rq(i); + struct rq *rq = cpu_rq(i); if (*sd_idle && !idle_cpu(i)) *sd_idle = 0; @@ -2269,7 +2329,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, * capacity but still has some space to pick up some load * from other group and save more power */ - if (sum_nr_running <= group_capacity - 1) + if (sum_nr_running <= group_capacity - 1) { if (sum_nr_running > leader_nr_running || (sum_nr_running == leader_nr_running && first_cpu(group->cpumask) > @@ -2277,7 +2337,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, group_leader = group; leader_nr_running = sum_nr_running; } - + } group_next: #endif group = group->next; @@ -2332,8 +2392,7 @@ group_next: * moved */ if (*imbalance < busiest_load_per_task) { - unsigned long pwr_now, pwr_move; - unsigned long tmp; + unsigned long tmp, pwr_now, pwr_move; unsigned int imbn; small_imbalance: @@ -2405,22 +2464,23 @@ ret: /* * find_busiest_queue - find the busiest runqueue among the cpus in group. */ -static runqueue_t *find_busiest_queue(struct sched_group *group, - enum idle_type idle, unsigned long imbalance) +static struct rq * +find_busiest_queue(struct sched_group *group, enum idle_type idle, + unsigned long imbalance) { + struct rq *busiest = NULL, *rq; unsigned long max_load = 0; - runqueue_t *busiest = NULL, *rqi; int i; for_each_cpu_mask(i, group->cpumask) { - rqi = cpu_rq(i); + rq = cpu_rq(i); - if (rqi->nr_running == 1 && rqi->raw_weighted_load > imbalance) + if (rq->nr_running == 1 && rq->raw_weighted_load > imbalance) continue; - if (rqi->raw_weighted_load > max_load) { - max_load = rqi->raw_weighted_load; - busiest = rqi; + if (rq->raw_weighted_load > max_load) { + max_load = rq->raw_weighted_load; + busiest = rq; } } @@ -2433,22 +2493,24 @@ static runqueue_t *find_busiest_queue(struct sched_group *group, */ #define MAX_PINNED_INTERVAL 512 -#define minus_1_or_zero(n) ((n) > 0 ? (n) - 1 : 0) +static inline unsigned long minus_1_or_zero(unsigned long n) +{ + return n > 0 ? n - 1 : 0; +} + /* * Check this_cpu to ensure it is balanced within domain. Attempt to move * tasks if there is an imbalance. * * Called with this_rq unlocked. 
*/ -static int load_balance(int this_cpu, runqueue_t *this_rq, +static int load_balance(int this_cpu, struct rq *this_rq, struct sched_domain *sd, enum idle_type idle) { + int nr_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; struct sched_group *group; - runqueue_t *busiest; unsigned long imbalance; - int nr_moved, all_pinned = 0; - int active_balance = 0; - int sd_idle = 0; + struct rq *busiest; if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER && !sched_smt_power_savings) @@ -2482,8 +2544,8 @@ static int load_balance(int this_cpu, runqueue_t *this_rq, */ double_rq_lock(this_rq, busiest); nr_moved = move_tasks(this_rq, this_cpu, busiest, - minus_1_or_zero(busiest->nr_running), - imbalance, sd, idle, &all_pinned); + minus_1_or_zero(busiest->nr_running), + imbalance, sd, idle, &all_pinned); double_rq_unlock(this_rq, busiest); /* All tasks on this runqueue were pinned by CPU affinity */ @@ -2556,7 +2618,8 @@ out_one_pinned: (sd->balance_interval < sd->max_interval)) sd->balance_interval *= 2; - if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && !sched_smt_power_savings) + if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && + !sched_smt_power_savings) return -1; return 0; } @@ -2568,11 +2631,11 @@ out_one_pinned: * Called from schedule when this_rq is about to become idle (NEWLY_IDLE). * this_rq is locked. */ -static int load_balance_newidle(int this_cpu, runqueue_t *this_rq, - struct sched_domain *sd) +static int +load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd) { struct sched_group *group; - runqueue_t *busiest = NULL; + struct rq *busiest = NULL; unsigned long imbalance; int nr_moved = 0; int sd_idle = 0; @@ -2618,9 +2681,11 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq, out_balanced: schedstat_inc(sd, lb_balanced[NEWLY_IDLE]); - if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && !sched_smt_power_savings) + if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && + !sched_smt_power_savings) return -1; sd->nr_balance_failed = 0; + return 0; } @@ -2628,16 +2693,15 @@ out_balanced: * idle_balance is called by schedule() if this_cpu is about to become * idle. Attempts to pull tasks from other CPUs. */ -static void idle_balance(int this_cpu, runqueue_t *this_rq) +static void idle_balance(int this_cpu, struct rq *this_rq) { struct sched_domain *sd; for_each_domain(this_cpu, sd) { if (sd->flags & SD_BALANCE_NEWIDLE) { - if (load_balance_newidle(this_cpu, this_rq, sd)) { - /* We've pulled tasks over so stop searching */ + /* If we've pulled tasks over stop searching: */ + if (load_balance_newidle(this_cpu, this_rq, sd)) break; - } } } } @@ -2650,14 +2714,14 @@ static void idle_balance(int this_cpu, runqueue_t *this_rq) * * Called with busiest_rq locked. */ -static void active_load_balance(runqueue_t *busiest_rq, int busiest_cpu) +static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) { - struct sched_domain *sd; - runqueue_t *target_rq; int target_cpu = busiest_rq->push_cpu; + struct sched_domain *sd; + struct rq *target_rq; + /* Is there any task to move? */ if (busiest_rq->nr_running <= 1) - /* no task to move */ return; target_rq = cpu_rq(target_cpu); @@ -2675,21 +2739,20 @@ static void active_load_balance(runqueue_t *busiest_rq, int busiest_cpu) /* Search for an sd spanning us and the target CPU. 
*/ for_each_domain(target_cpu, sd) { if ((sd->flags & SD_LOAD_BALANCE) && - cpu_isset(busiest_cpu, sd->span)) + cpu_isset(busiest_cpu, sd->span)) break; } - if (unlikely(sd == NULL)) - goto out; - - schedstat_inc(sd, alb_cnt); + if (likely(sd)) { + schedstat_inc(sd, alb_cnt); - if (move_tasks(target_rq, target_cpu, busiest_rq, 1, - RTPRIO_TO_LOAD_WEIGHT(100), sd, SCHED_IDLE, NULL)) - schedstat_inc(sd, alb_pushed); - else - schedstat_inc(sd, alb_failed); -out: + if (move_tasks(target_rq, target_cpu, busiest_rq, 1, + RTPRIO_TO_LOAD_WEIGHT(100), sd, SCHED_IDLE, + NULL)) + schedstat_inc(sd, alb_pushed); + else + schedstat_inc(sd, alb_failed); + } spin_unlock(&target_rq->lock); } @@ -2702,23 +2765,27 @@ out: * Balancing parameters are set up in arch_init_sched_domains. */ -/* Don't have all balancing operations going off at once */ -#define CPU_OFFSET(cpu) (HZ * cpu / NR_CPUS) +/* Don't have all balancing operations going off at once: */ +static inline unsigned long cpu_offset(int cpu) +{ + return jiffies + cpu * HZ / NR_CPUS; +} -static void rebalance_tick(int this_cpu, runqueue_t *this_rq, - enum idle_type idle) +static void +rebalance_tick(int this_cpu, struct rq *this_rq, enum idle_type idle) { - unsigned long old_load, this_load; - unsigned long j = jiffies + CPU_OFFSET(this_cpu); + unsigned long this_load, interval, j = cpu_offset(this_cpu); struct sched_domain *sd; - int i; + int i, scale; this_load = this_rq->raw_weighted_load; - /* Update our load */ - for (i = 0; i < 3; i++) { - unsigned long new_load = this_load; - int scale = 1 << i; + + /* Update our load: */ + for (i = 0, scale = 1; i < 3; i++, scale <<= 1) { + unsigned long old_load, new_load; + old_load = this_rq->cpu_load[i]; + new_load = this_load; /* * Round up the averaging division if load is increasing. This * prevents us from getting stuck on 9 if the load is 10, for @@ -2730,8 +2797,6 @@ static void rebalance_tick(int this_cpu, runqueue_t *this_rq, } for_each_domain(this_cpu, sd) { - unsigned long interval; - if (!(sd->flags & SD_LOAD_BALANCE)) continue; @@ -2761,17 +2826,18 @@ static void rebalance_tick(int this_cpu, runqueue_t *this_rq, /* * on UP we do not need to balance between CPUs: */ -static inline void rebalance_tick(int cpu, runqueue_t *rq, enum idle_type idle) +static inline void rebalance_tick(int cpu, struct rq *rq, enum idle_type idle) { } -static inline void idle_balance(int cpu, runqueue_t *rq) +static inline void idle_balance(int cpu, struct rq *rq) { } #endif -static inline int wake_priority_sleeper(runqueue_t *rq) +static inline int wake_priority_sleeper(struct rq *rq) { int ret = 0; + #ifdef CONFIG_SCHED_SMT spin_lock(&rq->lock); /* @@ -2795,25 +2861,26 @@ EXPORT_PER_CPU_SYMBOL(kstat); * This is called on clock ticks and on context switches. * Bank in p->sched_time the ns elapsed since the last tick or switch. */ -static inline void update_cpu_clock(task_t *p, runqueue_t *rq, - unsigned long long now) +static inline void +update_cpu_clock(struct task_struct *p, struct rq *rq, unsigned long long now) { - unsigned long long last = max(p->timestamp, rq->timestamp_last_tick); - p->sched_time += now - last; + p->sched_time += now - max(p->timestamp, rq->timestamp_last_tick); } /* * Return current->sched_time plus any more ns on the sched_clock * that have not yet been banked. 
*/ -unsigned long long current_sched_time(const task_t *tsk) +unsigned long long current_sched_time(const struct task_struct *p) { unsigned long long ns; unsigned long flags; + local_irq_save(flags); - ns = max(tsk->timestamp, task_rq(tsk)->timestamp_last_tick); - ns = tsk->sched_time + (sched_clock() - ns); + ns = max(p->timestamp, task_rq(p)->timestamp_last_tick); + ns = p->sched_time + sched_clock() - ns; local_irq_restore(flags); + return ns; } @@ -2827,11 +2894,16 @@ unsigned long long current_sched_time(const task_t *tsk) * increasing number of running tasks. We also ignore the interactivity * if a better static_prio task has expired: */ -#define EXPIRED_STARVING(rq) \ - ((STARVATION_LIMIT && ((rq)->expired_timestamp && \ - (jiffies - (rq)->expired_timestamp >= \ - STARVATION_LIMIT * ((rq)->nr_running) + 1))) || \ - ((rq)->curr->static_prio > (rq)->best_expired_prio)) +static inline int expired_starving(struct rq *rq) +{ + if (rq->curr->static_prio > rq->best_expired_prio) + return 1; + if (!STARVATION_LIMIT || !rq->expired_timestamp) + return 0; + if (jiffies - rq->expired_timestamp > STARVATION_LIMIT * rq->nr_running) + return 1; + return 0; +} /* * Account user cpu time to a process. @@ -2864,7 +2936,7 @@ void account_system_time(struct task_struct *p, int hardirq_offset, cputime_t cputime) { struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; - runqueue_t *rq = this_rq(); + struct rq *rq = this_rq(); cputime64_t tmp; p->stime = cputime_add(p->stime, cputime); @@ -2894,7 +2966,7 @@ void account_steal_time(struct task_struct *p, cputime_t steal) { struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; cputime64_t tmp = cputime_to_cputime64(steal); - runqueue_t *rq = this_rq(); + struct rq *rq = this_rq(); if (p == rq->idle) { p->stime = cputime_add(p->stime, steal); @@ -2915,10 +2987,10 @@ void account_steal_time(struct task_struct *p, cputime_t steal) */ void scheduler_tick(void) { - int cpu = smp_processor_id(); - runqueue_t *rq = this_rq(); - task_t *p = current; unsigned long long now = sched_clock(); + struct task_struct *p = current; + int cpu = smp_processor_id(); + struct rq *rq = cpu_rq(cpu); update_cpu_clock(p, rq, now); @@ -2968,7 +3040,7 @@ void scheduler_tick(void) if (!rq->expired_timestamp) rq->expired_timestamp = jiffies; - if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) { + if (!TASK_INTERACTIVE(p) || expired_starving(rq)) { enqueue_task(p, rq->expired); if (p->static_prio < rq->best_expired_prio) rq->best_expired_prio = p->static_prio; @@ -3007,7 +3079,7 @@ out: } #ifdef CONFIG_SCHED_SMT -static inline void wakeup_busy_runqueue(runqueue_t *rq) +static inline void wakeup_busy_runqueue(struct rq *rq) { /* If an SMT runqueue is sleeping due to priority reasons wake it up */ if (rq->curr == rq->idle && rq->nr_running) @@ -3033,7 +3105,7 @@ static void wake_sleeping_dependent(int this_cpu) return; for_each_cpu_mask(i, sd->span) { - runqueue_t *smt_rq = cpu_rq(i); + struct rq *smt_rq = cpu_rq(i); if (i == this_cpu) continue; @@ -3050,7 +3122,8 @@ static void wake_sleeping_dependent(int this_cpu) * utilize, if another task runs on a sibling. This models the * slowdown effect of other tasks running on siblings: */ -static inline unsigned long smt_slice(task_t *p, struct sched_domain *sd) +static inline unsigned long +smt_slice(struct task_struct *p, struct sched_domain *sd) { return p->time_slice * (100 - sd->per_cpu_gain) / 100; } @@ -3061,7 +3134,8 @@ static inline unsigned long smt_slice(task_t *p, struct sched_domain *sd) * acquire their lock. 
As we only trylock the normal locking order does not * need to be obeyed. */ -static int dependent_sleeper(int this_cpu, runqueue_t *this_rq, task_t *p) +static int +dependent_sleeper(int this_cpu, struct rq *this_rq, struct task_struct *p) { struct sched_domain *tmp, *sd = NULL; int ret = 0, i; @@ -3081,8 +3155,8 @@ static int dependent_sleeper(int this_cpu, runqueue_t *this_rq, task_t *p) return 0; for_each_cpu_mask(i, sd->span) { - runqueue_t *smt_rq; - task_t *smt_curr; + struct task_struct *smt_curr; + struct rq *smt_rq; if (i == this_cpu) continue; @@ -3127,9 +3201,8 @@ unlock: static inline void wake_sleeping_dependent(int this_cpu) { } - -static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq, - task_t *p) +static inline int +dependent_sleeper(int this_cpu, struct rq *this_rq, struct task_struct *p) { return 0; } @@ -3142,12 +3215,13 @@ void fastcall add_preempt_count(int val) /* * Underflow? */ - BUG_ON((preempt_count() < 0)); + if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) + return; preempt_count() += val; /* * Spinlock count overflowing soon? */ - BUG_ON((preempt_count() & PREEMPT_MASK) >= PREEMPT_MASK-10); + DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= PREEMPT_MASK-10); } EXPORT_SYMBOL(add_preempt_count); @@ -3156,11 +3230,15 @@ void fastcall sub_preempt_count(int val) /* * Underflow? */ - BUG_ON(val > preempt_count()); + if (DEBUG_LOCKS_WARN_ON(val > preempt_count())) + return; /* * Is the spinlock portion underflowing? */ - BUG_ON((val < PREEMPT_MASK) && !(preempt_count() & PREEMPT_MASK)); + if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && + !(preempt_count() & PREEMPT_MASK))) + return; + preempt_count() -= val; } EXPORT_SYMBOL(sub_preempt_count); @@ -3178,14 +3256,14 @@ static inline int interactive_sleep(enum sleep_type sleep_type) */ asmlinkage void __sched schedule(void) { - long *switch_count; - task_t *prev, *next; - runqueue_t *rq; - prio_array_t *array; + struct task_struct *prev, *next; + struct prio_array *array; struct list_head *queue; unsigned long long now; unsigned long run_time; int cpu, idx, new_prio; + long *switch_count; + struct rq *rq; /* * Test if we are atomic. Since do_exit() needs to call into @@ -3275,7 +3353,7 @@ need_resched_nonpreemptible: idx = sched_find_first_bit(array->bitmap); queue = array->queue + idx; - next = list_entry(queue->next, task_t, run_list); + next = list_entry(queue->next, struct task_struct, run_list); if (!rt_task(next) && interactive_sleep(next->sleep_type)) { unsigned long long delta = now - next->timestamp; @@ -3338,12 +3416,11 @@ switch_tasks: if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) goto need_resched; } - EXPORT_SYMBOL(schedule); #ifdef CONFIG_PREEMPT /* - * this is is the entry point to schedule() from in-kernel preemption + * this is the entry point to schedule() from in-kernel preemption * off of preempt_enable. Kernel preemptions off return from interrupt * occur there and call schedule directly. */ @@ -3383,11 +3460,10 @@ need_resched: if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) goto need_resched; } - EXPORT_SYMBOL(preempt_schedule); /* - * this is is the entry point to schedule() from kernel preemption + * this is the entry point to schedule() from kernel preemption * off of irq context. * Note, that this is called and return with irqs disabled. This will * protect us against recursive calling from irq. 
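(Editor's aside, not part of the patch: the add_preempt_count()/sub_preempt_count() hunks above replace the hard BUG_ON() checks with DEBUG_LOCKS_WARN_ON(), so an unbalanced preempt count now warns and bails out instead of killing the machine. A minimal user-space sketch of those underflow/overflow checks follows; the names and the single global counter are hypothetical simplifications, not the kernel implementation.)

#include <stdio.h>

#define PREEMPT_BITS  8
#define PREEMPT_MASK  ((1 << PREEMPT_BITS) - 1)   /* low byte = preemption depth */

static int preempt_count;   /* stand-in for the per-thread counter */

/* rough stand-in for DEBUG_LOCKS_WARN_ON(): report and return the condition */
static int warn_on(int cond, const char *what)
{
	if (cond)
		fprintf(stderr, "preempt debug: %s\n", what);
	return cond;
}

static void add_preempt_count(int val)
{
	if (warn_on(preempt_count < 0, "underflow before add"))
		return;
	preempt_count += val;
	/* spinlock depth about to overflow into the higher count bits? */
	warn_on((preempt_count & PREEMPT_MASK) >= PREEMPT_MASK - 10,
		"preemption depth near overflow");
}

static void sub_preempt_count(int val)
{
	if (warn_on(val > preempt_count, "underflow"))
		return;
	/* is the spinlock portion underflowing? */
	if (warn_on(val < PREEMPT_MASK && !(preempt_count & PREEMPT_MASK),
		    "spinlock portion underflow"))
		return;
	preempt_count -= val;
}

int main(void)
{
	add_preempt_count(1);
	sub_preempt_count(1);
	sub_preempt_count(1);	/* triggers the underflow warning, count left unchanged */
	printf("final count: %d\n", preempt_count);
	return 0;
}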
@@ -3399,7 +3475,7 @@ asmlinkage void __sched preempt_schedule_irq(void) struct task_struct *task = current; int saved_lock_depth; #endif - /* Catch callers which need to be fixed*/ + /* Catch callers which need to be fixed */ BUG_ON(ti->preempt_count || !irqs_disabled()); need_resched: @@ -3432,10 +3508,8 @@ need_resched: int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, void *key) { - task_t *p = curr->private; - return try_to_wake_up(p, mode, sync); + return try_to_wake_up(curr->private, mode, sync); } - EXPORT_SYMBOL(default_wake_function); /* @@ -3453,13 +3527,11 @@ static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, struct list_head *tmp, *next; list_for_each_safe(tmp, next, &q->task_list) { - wait_queue_t *curr; - unsigned flags; - curr = list_entry(tmp, wait_queue_t, task_list); - flags = curr->flags; + wait_queue_t *curr = list_entry(tmp, wait_queue_t, task_list); + unsigned flags = curr->flags; + if (curr->func(curr, mode, sync, key) && - (flags & WQ_FLAG_EXCLUSIVE) && - !--nr_exclusive) + (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) break; } } @@ -3480,7 +3552,6 @@ void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode, __wake_up_common(q, mode, nr_exclusive, 0, key); spin_unlock_irqrestore(&q->lock, flags); } - EXPORT_SYMBOL(__wake_up); /* @@ -3549,6 +3620,7 @@ EXPORT_SYMBOL(complete_all); void fastcall __sched wait_for_completion(struct completion *x) { might_sleep(); + spin_lock_irq(&x->wait.lock); if (!x->done) { DECLARE_WAITQUEUE(wait, current); @@ -3693,7 +3765,6 @@ void fastcall __sched interruptible_sleep_on(wait_queue_head_t *q) schedule(); SLEEP_ON_TAIL } - EXPORT_SYMBOL(interruptible_sleep_on); long fastcall __sched @@ -3709,7 +3780,6 @@ interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout) return timeout; } - EXPORT_SYMBOL(interruptible_sleep_on_timeout); void fastcall __sched sleep_on(wait_queue_head_t *q) @@ -3722,7 +3792,6 @@ void fastcall __sched sleep_on(wait_queue_head_t *q) schedule(); SLEEP_ON_TAIL } - EXPORT_SYMBOL(sleep_on); long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout) @@ -3752,11 +3821,11 @@ EXPORT_SYMBOL(sleep_on_timeout); * * Used by the rt_mutex code to implement priority inheritance logic. */ -void rt_mutex_setprio(task_t *p, int prio) +void rt_mutex_setprio(struct task_struct *p, int prio) { + struct prio_array *array; unsigned long flags; - prio_array_t *array; - runqueue_t *rq; + struct rq *rq; int oldprio; BUG_ON(prio < 0 || prio > MAX_PRIO); @@ -3793,12 +3862,12 @@ void rt_mutex_setprio(task_t *p, int prio) #endif -void set_user_nice(task_t *p, long nice) +void set_user_nice(struct task_struct *p, long nice) { - unsigned long flags; - prio_array_t *array; - runqueue_t *rq; + struct prio_array *array; int old_prio, delta; + unsigned long flags; + struct rq *rq; if (TASK_NICE(p) == nice || nice < -20 || nice > 19) return; @@ -3849,10 +3918,11 @@ EXPORT_SYMBOL(set_user_nice); * @p: task * @nice: nice value */ -int can_nice(const task_t *p, const int nice) +int can_nice(const struct task_struct *p, const int nice) { /* convert nice value [19,-20] to rlimit style value [1,40] */ int nice_rlim = 20 - nice; + return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur || capable(CAP_SYS_NICE)); } @@ -3868,8 +3938,7 @@ int can_nice(const task_t *p, const int nice) */ asmlinkage long sys_nice(int increment) { - int retval; - long nice; + long nice, retval; /* * Setpriority might change our priority at the same moment. 
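(Editor's aside, not part of the patch: can_nice() above maps the nice range [19, -20] onto the RLIMIT_NICE style range [1, 40] via 20 - nice, so for example nice -10 requires an rlimit of at least 30 unless the caller has CAP_SYS_NICE. A tiny self-contained sketch of just that arithmetic, with hypothetical names and no capability handling:)

#include <stdio.h>

/*
 * Convert a nice value in [19, -20] to the RLIMIT_NICE style value in [1, 40]:
 * nice 19 -> 1, nice 0 -> 20, nice -20 -> 40.
 */
static int nice_to_rlimit(int nice)
{
	return 20 - nice;
}

/* may a task with the given RLIMIT_NICE current limit request this nice value? */
static int can_nice(int rlim_cur, int nice)
{
	return nice_to_rlimit(nice) <= rlim_cur;
}

int main(void)
{
	printf("nice -10 needs rlimit >= %d\n", nice_to_rlimit(-10));  /* 30 */
	printf("rlimit 20, nice -5 allowed: %d\n", can_nice(20, -5));  /* 0 */
	printf("rlimit 25, nice -5 allowed: %d\n", can_nice(25, -5));  /* 1 */
	return 0;
}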
@@ -3908,7 +3977,7 @@ asmlinkage long sys_nice(int increment) * RT tasks are offset by -200. Normal tasks are centered * around 0, value goes from -16 to +15. */ -int task_prio(const task_t *p) +int task_prio(const struct task_struct *p) { return p->prio - MAX_RT_PRIO; } @@ -3917,7 +3986,7 @@ int task_prio(const task_t *p) * task_nice - return the nice value of a given task. * @p: the task in question. */ -int task_nice(const task_t *p) +int task_nice(const struct task_struct *p) { return TASK_NICE(p); } @@ -3936,7 +4005,7 @@ int idle_cpu(int cpu) * idle_task - return the idle task for a given cpu. * @cpu: the processor in question. */ -task_t *idle_task(int cpu) +struct task_struct *idle_task(int cpu) { return cpu_rq(cpu)->idle; } @@ -3945,7 +4014,7 @@ task_t *idle_task(int cpu) * find_process_by_pid - find a process with a matching PID value. * @pid: the pid in question. */ -static inline task_t *find_process_by_pid(pid_t pid) +static inline struct task_struct *find_process_by_pid(pid_t pid) { return pid ? find_task_by_pid(pid) : current; } @@ -3954,6 +4023,7 @@ static inline task_t *find_process_by_pid(pid_t pid) static void __setscheduler(struct task_struct *p, int policy, int prio) { BUG_ON(p->array); + p->policy = policy; p->rt_priority = prio; p->normal_prio = normal_prio(p); @@ -3977,11 +4047,10 @@ static void __setscheduler(struct task_struct *p, int policy, int prio) int sched_setscheduler(struct task_struct *p, int policy, struct sched_param *param) { - int retval; - int oldprio, oldpolicy = -1; - prio_array_t *array; + int retval, oldprio, oldpolicy = -1; + struct prio_array *array; unsigned long flags; - runqueue_t *rq; + struct rq *rq; /* may grab non-irq protected spin_locks */ BUG_ON(in_interrupt()); @@ -4079,9 +4148,9 @@ EXPORT_SYMBOL_GPL(sched_setscheduler); static int do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) { - int retval; struct sched_param lparam; struct task_struct *p; + int retval; if (!param || pid < 0) return -EINVAL; @@ -4093,10 +4162,9 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) read_unlock_irq(&tasklist_lock); return -ESRCH; } - get_task_struct(p); - read_unlock_irq(&tasklist_lock); retval = sched_setscheduler(p, policy, &lparam); - put_task_struct(p); + read_unlock_irq(&tasklist_lock); + return retval; } @@ -4132,8 +4200,8 @@ asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param) */ asmlinkage long sys_sched_getscheduler(pid_t pid) { + struct task_struct *p; int retval = -EINVAL; - task_t *p; if (pid < 0) goto out_nounlock; @@ -4160,8 +4228,8 @@ out_nounlock: asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param) { struct sched_param lp; + struct task_struct *p; int retval = -EINVAL; - task_t *p; if (!param || pid < 0) goto out_nounlock; @@ -4194,9 +4262,9 @@ out_unlock: long sched_setaffinity(pid_t pid, cpumask_t new_mask) { - task_t *p; - int retval; cpumask_t cpus_allowed; + struct task_struct *p; + int retval; lock_cpu_hotplug(); read_lock(&tasklist_lock); @@ -4282,8 +4350,8 @@ cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL; long sched_getaffinity(pid_t pid, cpumask_t *mask) { + struct task_struct *p; int retval; - task_t *p; lock_cpu_hotplug(); read_lock(&tasklist_lock); @@ -4342,9 +4410,8 @@ asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len, */ asmlinkage long sys_sched_yield(void) { - runqueue_t *rq = this_rq_lock(); - prio_array_t *array = current->array; - prio_array_t *target = rq->expired; + struct rq 
*rq = this_rq_lock(); + struct prio_array *array = current->array, *target = rq->expired; schedstat_inc(rq, yld_cnt); /* @@ -4378,6 +4445,7 @@ asmlinkage long sys_sched_yield(void) * no need to preempt or enable interrupts: */ __release(rq->lock); + spin_release(&rq->lock.dep_map, 1, _THIS_IP_); _raw_spin_unlock(&rq->lock); preempt_enable_no_resched(); @@ -4386,9 +4454,9 @@ asmlinkage long sys_sched_yield(void) return 0; } -static inline int __resched_legal(void) +static inline int __resched_legal(int expected_preempt_count) { - if (unlikely(preempt_count())) + if (unlikely(preempt_count() != expected_preempt_count)) return 0; if (unlikely(system_state != SYSTEM_RUNNING)) return 0; @@ -4414,7 +4482,7 @@ static void __cond_resched(void) int __sched cond_resched(void) { - if (need_resched() && __resched_legal()) { + if (need_resched() && __resched_legal(0)) { __cond_resched(); return 1; } @@ -4440,7 +4508,8 @@ int cond_resched_lock(spinlock_t *lock) ret = 1; spin_lock(lock); } - if (need_resched() && __resched_legal()) { + if (need_resched() && __resched_legal(1)) { + spin_release(&lock->dep_map, 1, _THIS_IP_); _raw_spin_unlock(lock); preempt_enable_no_resched(); __cond_resched(); @@ -4455,8 +4524,10 @@ int __sched cond_resched_softirq(void) { BUG_ON(!in_softirq()); - if (need_resched() && __resched_legal()) { - __local_bh_enable(); + if (need_resched() && __resched_legal(0)) { + raw_local_irq_disable(); + _local_bh_enable(); + raw_local_irq_enable(); __cond_resched(); local_bh_disable(); return 1; @@ -4476,7 +4547,6 @@ void __sched yield(void) set_current_state(TASK_RUNNING); sys_sched_yield(); } - EXPORT_SYMBOL(yield); /* @@ -4488,23 +4558,26 @@ EXPORT_SYMBOL(yield); */ void __sched io_schedule(void) { - struct runqueue *rq = &__raw_get_cpu_var(runqueues); + struct rq *rq = &__raw_get_cpu_var(runqueues); + delayacct_blkio_start(); atomic_inc(&rq->nr_iowait); schedule(); atomic_dec(&rq->nr_iowait); + delayacct_blkio_end(); } - EXPORT_SYMBOL(io_schedule); long __sched io_schedule_timeout(long timeout) { - struct runqueue *rq = &__raw_get_cpu_var(runqueues); + struct rq *rq = &__raw_get_cpu_var(runqueues); long ret; + delayacct_blkio_start(); atomic_inc(&rq->nr_iowait); ret = schedule_timeout(timeout); atomic_dec(&rq->nr_iowait); + delayacct_blkio_end(); return ret; } @@ -4566,9 +4639,9 @@ asmlinkage long sys_sched_get_priority_min(int policy) asmlinkage long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval) { + struct task_struct *p; int retval = -EINVAL; struct timespec t; - task_t *p; if (pid < 0) goto out_nounlock; @@ -4596,35 +4669,36 @@ out_unlock: static inline struct task_struct *eldest_child(struct task_struct *p) { - if (list_empty(&p->children)) return NULL; + if (list_empty(&p->children)) + return NULL; return list_entry(p->children.next,struct task_struct,sibling); } static inline struct task_struct *older_sibling(struct task_struct *p) { - if (p->sibling.prev==&p->parent->children) return NULL; + if (p->sibling.prev==&p->parent->children) + return NULL; return list_entry(p->sibling.prev,struct task_struct,sibling); } static inline struct task_struct *younger_sibling(struct task_struct *p) { - if (p->sibling.next==&p->parent->children) return NULL; + if (p->sibling.next==&p->parent->children) + return NULL; return list_entry(p->sibling.next,struct task_struct,sibling); } -static void show_task(task_t *p) +static const char stat_nam[] = "RSDTtZX"; + +static void show_task(struct task_struct *p) { - task_t *relative; - unsigned state; + struct task_struct 
*relative; unsigned long free = 0; - static const char *stat_nam[] = { "R", "S", "D", "T", "t", "Z", "X" }; + unsigned state; - printk("%-13.13s ", p->comm); state = p->state ? __ffs(p->state) + 1 : 0; - if (state < ARRAY_SIZE(stat_nam)) - printk(stat_nam[state]); - else - printk("?"); + printk("%-13.13s %c", p->comm, + state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); #if (BITS_PER_LONG == 32) if (state == TASK_RUNNING) printk(" running "); @@ -4668,7 +4742,7 @@ static void show_task(task_t *p) void show_state(void) { - task_t *g, *p; + struct task_struct *g, *p; #if (BITS_PER_LONG == 32) printk("\n" @@ -4690,7 +4764,7 @@ void show_state(void) } while_each_thread(g, p); read_unlock(&tasklist_lock); - mutex_debug_show_all_locks(); + debug_show_all_locks(); } /** @@ -4701,9 +4775,9 @@ void show_state(void) * NOTE: this function does not set the idle thread's NEED_RESCHED * flag, to make booting more robust. */ -void __devinit init_idle(task_t *idle, int cpu) +void __devinit init_idle(struct task_struct *idle, int cpu) { - runqueue_t *rq = cpu_rq(cpu); + struct rq *rq = cpu_rq(cpu); unsigned long flags; idle->timestamp = sched_clock(); @@ -4742,7 +4816,7 @@ cpumask_t nohz_cpu_mask = CPU_MASK_NONE; /* * This is how migration works: * - * 1) we queue a migration_req_t structure in the source CPU's + * 1) we queue a struct migration_req structure in the source CPU's * runqueue and wake up that CPU's migration thread. * 2) we down() the locked semaphore => thread blocks. * 3) migration thread wakes up (implicitly it forces the migrated @@ -4764,12 +4838,12 @@ cpumask_t nohz_cpu_mask = CPU_MASK_NONE; * task must not exit() & deallocate itself prematurely. The * call is not atomic; no spinlocks may be held. */ -int set_cpus_allowed(task_t *p, cpumask_t new_mask) +int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask) { + struct migration_req req; unsigned long flags; + struct rq *rq; int ret = 0; - migration_req_t req; - runqueue_t *rq; rq = task_rq_lock(p, &flags); if (!cpus_intersects(new_mask, cpu_online_map)) { @@ -4792,9 +4866,9 @@ int set_cpus_allowed(task_t *p, cpumask_t new_mask) } out: task_rq_unlock(rq, &flags); + return ret; } - EXPORT_SYMBOL_GPL(set_cpus_allowed); /* @@ -4810,7 +4884,7 @@ EXPORT_SYMBOL_GPL(set_cpus_allowed); */ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) { - runqueue_t *rq_dest, *rq_src; + struct rq *rq_dest, *rq_src; int ret = 0; if (unlikely(cpu_is_offline(dest_cpu))) @@ -4838,7 +4912,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) p->timestamp = p->timestamp - rq_src->timestamp_last_tick + rq_dest->timestamp_last_tick; deactivate_task(p, rq_src); - activate_task(p, rq_dest, 0); + __activate_task(p, rq_dest); if (TASK_PREEMPTS_CURR(p, rq_dest)) resched_task(rq_dest->curr); } @@ -4855,16 +4929,16 @@ out: */ static int migration_thread(void *data) { - runqueue_t *rq; int cpu = (long)data; + struct rq *rq; rq = cpu_rq(cpu); BUG_ON(rq->migration_thread != current); set_current_state(TASK_INTERRUPTIBLE); while (!kthread_should_stop()) { + struct migration_req *req; struct list_head *head; - migration_req_t *req; try_to_freeze(); @@ -4888,7 +4962,7 @@ static int migration_thread(void *data) set_current_state(TASK_INTERRUPTIBLE); continue; } - req = list_entry(head->next, migration_req_t, list); + req = list_entry(head->next, struct migration_req, list); list_del_init(head->next); spin_unlock(&rq->lock); @@ -4913,28 +4987,28 @@ wait_to_die: #ifdef CONFIG_HOTPLUG_CPU /* Figure out where task on 
dead CPU should go, use force if neccessary. */ -static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *tsk) +static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) { - runqueue_t *rq; unsigned long flags; - int dest_cpu; cpumask_t mask; + struct rq *rq; + int dest_cpu; restart: /* On same node? */ mask = node_to_cpumask(cpu_to_node(dead_cpu)); - cpus_and(mask, mask, tsk->cpus_allowed); + cpus_and(mask, mask, p->cpus_allowed); dest_cpu = any_online_cpu(mask); /* On any allowed CPU? */ if (dest_cpu == NR_CPUS) - dest_cpu = any_online_cpu(tsk->cpus_allowed); + dest_cpu = any_online_cpu(p->cpus_allowed); /* No more Mr. Nice Guy. */ if (dest_cpu == NR_CPUS) { - rq = task_rq_lock(tsk, &flags); - cpus_setall(tsk->cpus_allowed); - dest_cpu = any_online_cpu(tsk->cpus_allowed); + rq = task_rq_lock(p, &flags); + cpus_setall(p->cpus_allowed); + dest_cpu = any_online_cpu(p->cpus_allowed); task_rq_unlock(rq, &flags); /* @@ -4942,12 +5016,12 @@ restart: * kernel threads (both mm NULL), since they never * leave kernel. */ - if (tsk->mm && printk_ratelimit()) + if (p->mm && printk_ratelimit()) printk(KERN_INFO "process %d (%s) no " "longer affine to cpu%d\n", - tsk->pid, tsk->comm, dead_cpu); + p->pid, p->comm, dead_cpu); } - if (!__migrate_task(tsk, dead_cpu, dest_cpu)) + if (!__migrate_task(p, dead_cpu, dest_cpu)) goto restart; } @@ -4958,9 +5032,9 @@ restart: * their home CPUs. So we just add the counter to another CPU's counter, * to keep the global sum constant after CPU-down: */ -static void migrate_nr_uninterruptible(runqueue_t *rq_src) +static void migrate_nr_uninterruptible(struct rq *rq_src) { - runqueue_t *rq_dest = cpu_rq(any_online_cpu(CPU_MASK_ALL)); + struct rq *rq_dest = cpu_rq(any_online_cpu(CPU_MASK_ALL)); unsigned long flags; local_irq_save(flags); @@ -4974,48 +5048,51 @@ static void migrate_nr_uninterruptible(runqueue_t *rq_src) /* Run through task list and migrate tasks from the dead cpu. */ static void migrate_live_tasks(int src_cpu) { - struct task_struct *tsk, *t; + struct task_struct *p, *t; write_lock_irq(&tasklist_lock); - do_each_thread(t, tsk) { - if (tsk == current) + do_each_thread(t, p) { + if (p == current) continue; - if (task_cpu(tsk) == src_cpu) - move_task_off_dead_cpu(src_cpu, tsk); - } while_each_thread(t, tsk); + if (task_cpu(p) == src_cpu) + move_task_off_dead_cpu(src_cpu, p); + } while_each_thread(t, p); write_unlock_irq(&tasklist_lock); } /* Schedules idle task to be the next runnable task on current CPU. * It does so by boosting its priority to highest possible and adding it to - * the _front_ of runqueue. Used by CPU offline code. + * the _front_ of the runqueue. Used by CPU offline code. */ void sched_idle_next(void) { - int cpu = smp_processor_id(); - runqueue_t *rq = this_rq(); + int this_cpu = smp_processor_id(); + struct rq *rq = cpu_rq(this_cpu); struct task_struct *p = rq->idle; unsigned long flags; /* cpu has to be offline */ - BUG_ON(cpu_online(cpu)); + BUG_ON(cpu_online(this_cpu)); - /* Strictly not necessary since rest of the CPUs are stopped by now - * and interrupts disabled on current cpu. + /* + * Strictly not necessary since rest of the CPUs are stopped by now + * and interrupts disabled on the current cpu. 
*/ spin_lock_irqsave(&rq->lock, flags); __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1); - /* Add idle task to _front_ of it's priority queue */ + + /* Add idle task to the _front_ of its priority queue: */ __activate_idle_task(p, rq); spin_unlock_irqrestore(&rq->lock, flags); } -/* Ensures that the idle task is using init_mm right before its cpu goes +/* + * Ensures that the idle task is using init_mm right before its cpu goes * offline. */ void idle_task_exit(void) @@ -5029,17 +5106,17 @@ void idle_task_exit(void) mmdrop(mm); } -static void migrate_dead(unsigned int dead_cpu, task_t *tsk) +static void migrate_dead(unsigned int dead_cpu, struct task_struct *p) { - struct runqueue *rq = cpu_rq(dead_cpu); + struct rq *rq = cpu_rq(dead_cpu); /* Must be exiting, otherwise would be on tasklist. */ - BUG_ON(tsk->exit_state != EXIT_ZOMBIE && tsk->exit_state != EXIT_DEAD); + BUG_ON(p->exit_state != EXIT_ZOMBIE && p->exit_state != EXIT_DEAD); /* Cannot have done final schedule yet: would have vanished. */ - BUG_ON(tsk->flags & PF_DEAD); + BUG_ON(p->flags & PF_DEAD); - get_task_struct(tsk); + get_task_struct(p); /* * Drop lock around migration; if someone else moves it, @@ -5047,25 +5124,25 @@ static void migrate_dead(unsigned int dead_cpu, task_t *tsk) * fine. */ spin_unlock_irq(&rq->lock); - move_task_off_dead_cpu(dead_cpu, tsk); + move_task_off_dead_cpu(dead_cpu, p); spin_lock_irq(&rq->lock); - put_task_struct(tsk); + put_task_struct(p); } /* release_task() removes task from tasklist, so we won't find dead tasks. */ static void migrate_dead_tasks(unsigned int dead_cpu) { - unsigned arr, i; - struct runqueue *rq = cpu_rq(dead_cpu); + struct rq *rq = cpu_rq(dead_cpu); + unsigned int arr, i; for (arr = 0; arr < 2; arr++) { for (i = 0; i < MAX_PRIO; i++) { struct list_head *list = &rq->arrays[arr].queue[i]; + while (!list_empty(list)) - migrate_dead(dead_cpu, - list_entry(list->next, task_t, - run_list)); + migrate_dead(dead_cpu, list_entry(list->next, + struct task_struct, run_list)); } } } @@ -5075,14 +5152,13 @@ static void migrate_dead_tasks(unsigned int dead_cpu) * migration_call - callback that gets triggered when a CPU is added. * Here we can start up the necessary migration thread for the new CPU. */ -static int __cpuinit migration_call(struct notifier_block *nfb, - unsigned long action, - void *hcpu) +static int __cpuinit +migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) { - int cpu = (long)hcpu; struct task_struct *p; - struct runqueue *rq; + int cpu = (long)hcpu; unsigned long flags; + struct rq *rq; switch (action) { case CPU_UP_PREPARE: @@ -5097,10 +5173,12 @@ static int __cpuinit migration_call(struct notifier_block *nfb, task_rq_unlock(rq, &flags); cpu_rq(cpu)->migration_thread = p; break; + case CPU_ONLINE: /* Strictly unneccessary, as first user will wake it. */ wake_up_process(cpu_rq(cpu)->migration_thread); break; + #ifdef CONFIG_HOTPLUG_CPU case CPU_UP_CANCELED: if (!cpu_rq(cpu)->migration_thread) @@ -5111,6 +5189,7 @@ static int __cpuinit migration_call(struct notifier_block *nfb, kthread_stop(cpu_rq(cpu)->migration_thread); cpu_rq(cpu)->migration_thread = NULL; break; + case CPU_DEAD: migrate_live_tasks(cpu); rq = cpu_rq(cpu); @@ -5131,9 +5210,10 @@ static int __cpuinit migration_call(struct notifier_block *nfb, * the requestors. 
*/ spin_lock_irq(&rq->lock); while (!list_empty(&rq->migration_queue)) { - migration_req_t *req; + struct migration_req *req; + req = list_entry(rq->migration_queue.next, - migration_req_t, list); + struct migration_req, list); list_del_init(&req->list); complete(&req->done); } @@ -5155,10 +5235,12 @@ static struct notifier_block __cpuinitdata migration_notifier = { int __init migration_init(void) { void *cpu = (void *)(long)smp_processor_id(); - /* Start one for boot CPU. */ + + /* Start one for the boot CPU: */ migration_call(&migration_notifier, CPU_UP_PREPARE, cpu); migration_call(&migration_notifier, CPU_ONLINE, cpu); register_cpu_notifier(&migration_notifier); + return 0; } #endif @@ -5254,7 +5336,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) } while (sd); } #else -#define sched_domain_debug(sd, cpu) {} +# define sched_domain_debug(sd, cpu) do { } while (0) #endif static int sd_degenerate(struct sched_domain *sd) @@ -5280,8 +5362,8 @@ static int sd_degenerate(struct sched_domain *sd) return 1; } -static int sd_parent_degenerate(struct sched_domain *sd, - struct sched_domain *parent) +static int +sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) { unsigned long cflags = sd->flags, pflags = parent->flags; @@ -5314,7 +5396,7 @@ static int sd_parent_degenerate(struct sched_domain *sd, */ static void cpu_attach_domain(struct sched_domain *sd, int cpu) { - runqueue_t *rq = cpu_rq(cpu); + struct rq *rq = cpu_rq(cpu); struct sched_domain *tmp; /* Remove the sched domains which do not contribute to scheduling. */ @@ -5576,8 +5658,8 @@ static void touch_cache(void *__cache, unsigned long __size) /* * Measure the cache-cost of one task migration. Returns in units of nsec. */ -static unsigned long long measure_one(void *cache, unsigned long size, - int source, int target) +static unsigned long long +measure_one(void *cache, unsigned long size, int source, int target) { cpumask_t mask, saved_mask; unsigned long long t0, t1, t2, t3, cost; @@ -5729,7 +5811,7 @@ static unsigned long long measure_migration_cost(int cpu1, int cpu2) cache = vmalloc(max_size); if (!cache) { printk("could not vmalloc %d bytes for cache!\n", 2*max_size); - return 1000000; // return 1 msec on very small boxen + return 1000000; /* return 1 msec on very small boxen */ } while (size <= max_size) { @@ -5927,9 +6009,9 @@ static int find_next_best_node(int node, unsigned long *used_nodes) */ static cpumask_t sched_domain_node_span(int node) { - int i; - cpumask_t span, nodemask; DECLARE_BITMAP(used_nodes, MAX_NUMNODES); + cpumask_t span, nodemask; + int i; cpus_clear(span); bitmap_zero(used_nodes, MAX_NUMNODES); @@ -5940,6 +6022,7 @@ static cpumask_t sched_domain_node_span(int node) for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { int next_node = find_next_best_node(node, used_nodes); + nodemask = node_to_cpumask(next_node); cpus_or(span, span, nodemask); } @@ -5949,19 +6032,23 @@ static cpumask_t sched_domain_node_span(int node) #endif int sched_smt_power_savings = 0, sched_mc_power_savings = 0; + /* - * At the moment, CONFIG_SCHED_SMT is never defined, but leave it in so we - * can switch it on easily if needed. 
+ * SMT sched-domains: */ #ifdef CONFIG_SCHED_SMT static DEFINE_PER_CPU(struct sched_domain, cpu_domains); static struct sched_group sched_group_cpus[NR_CPUS]; + static int cpu_to_cpu_group(int cpu) { return cpu; } #endif +/* + * multi-core sched-domains: + */ #ifdef CONFIG_SCHED_MC static DEFINE_PER_CPU(struct sched_domain, core_domains); static struct sched_group *sched_group_core_bycpu[NR_CPUS]; @@ -5981,9 +6068,10 @@ static int cpu_to_core_group(int cpu) static DEFINE_PER_CPU(struct sched_domain, phys_domains); static struct sched_group *sched_group_phys_bycpu[NR_CPUS]; + static int cpu_to_phys_group(int cpu) { -#if defined(CONFIG_SCHED_MC) +#ifdef CONFIG_SCHED_MC cpumask_t mask = cpu_coregroup_map(cpu); return first_cpu(mask); #elif defined(CONFIG_SCHED_SMT) @@ -6404,7 +6492,12 @@ static int build_sched_domains(const cpumask_t *cpu_map) for (i = 0; i < MAX_NUMNODES; i++) init_numa_sched_groups_power(sched_group_nodes[i]); - init_numa_sched_groups_power(sched_group_allnodes); + if (sched_group_allnodes) { + int group = cpu_to_allnodes_group(first_cpu(*cpu_map)); + struct sched_group *sg = &sched_group_allnodes[group]; + + init_numa_sched_groups_power(sg); + } #endif /* Attach the domains */ @@ -6529,6 +6622,7 @@ static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls) { int err = 0; + #ifdef CONFIG_SCHED_SMT if (smt_capable()) err = sysfs_create_file(&cls->kset.kobj, @@ -6548,7 +6642,8 @@ static ssize_t sched_mc_power_savings_show(struct sys_device *dev, char *page) { return sprintf(page, "%u\n", sched_mc_power_savings); } -static ssize_t sched_mc_power_savings_store(struct sys_device *dev, const char *buf, size_t count) +static ssize_t sched_mc_power_savings_store(struct sys_device *dev, + const char *buf, size_t count) { return sched_power_savings_store(buf, count, 0); } @@ -6561,7 +6656,8 @@ static ssize_t sched_smt_power_savings_show(struct sys_device *dev, char *page) { return sprintf(page, "%u\n", sched_smt_power_savings); } -static ssize_t sched_smt_power_savings_store(struct sys_device *dev, const char *buf, size_t count) +static ssize_t sched_smt_power_savings_store(struct sys_device *dev, + const char *buf, size_t count) { return sched_power_savings_store(buf, count, 1); } @@ -6623,6 +6719,7 @@ int in_sched_functions(unsigned long addr) { /* Linker adds these: start and end of __sched functions */ extern char __sched_text_start[], __sched_text_end[]; + return in_lock_functions(addr) || (addr >= (unsigned long)__sched_text_start && addr < (unsigned long)__sched_text_end); @@ -6630,14 +6727,15 @@ int in_sched_functions(unsigned long addr) void __init sched_init(void) { - runqueue_t *rq; int i, j, k; for_each_possible_cpu(i) { - prio_array_t *array; + struct prio_array *array; + struct rq *rq; rq = cpu_rq(i); spin_lock_init(&rq->lock); + lockdep_set_class(&rq->lock, &rq->rq_lock_key); rq->nr_running = 0; rq->active = rq->arrays; rq->expired = rq->arrays + 1; @@ -6666,6 +6764,11 @@ void __init sched_init(void) } set_load_weight(&init_task); + +#ifdef CONFIG_RT_MUTEXES + plist_head_init(&init_task.pi_waiters, &init_task.pi_lock); +#endif + /* * The boot idle thread does lazy MMU switching as well: */ @@ -6684,7 +6787,7 @@ void __init sched_init(void) #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP void __might_sleep(char *file, int line) { -#if defined(in_atomic) +#ifdef in_atomic static unsigned long prev_jiffy; /* ratelimiting */ if ((in_atomic() || irqs_disabled()) && @@ -6706,10 +6809,10 @@ 
EXPORT_SYMBOL(__might_sleep); #ifdef CONFIG_MAGIC_SYSRQ void normalize_rt_tasks(void) { + struct prio_array *array; struct task_struct *p; - prio_array_t *array; unsigned long flags; - runqueue_t *rq; + struct rq *rq; read_lock_irq(&tasklist_lock); for_each_process(p) { @@ -6753,7 +6856,7 @@ void normalize_rt_tasks(void) * * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! */ -task_t *curr_task(int cpu) +struct task_struct *curr_task(int cpu) { return cpu_curr(cpu); } @@ -6773,7 +6876,7 @@ task_t *curr_task(int cpu) * * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! */ -void set_curr_task(int cpu, task_t *p) +void set_curr_task(int cpu, struct task_struct *p) { cpu_curr(cpu) = p; } diff --git a/kernel/signal.c b/kernel/signal.c index 7fe874d12fa..bfdb5686fa3 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -791,22 +791,31 @@ out: /* * Force a signal that the process can't ignore: if necessary * we unblock the signal and change any SIG_IGN to SIG_DFL. + * + * Note: If we unblock the signal, we always reset it to SIG_DFL, + * since we do not want to have a signal handler that was blocked + * be invoked when user space had explicitly blocked it. + * + * We don't want to have recursive SIGSEGV's etc, for example. */ - int force_sig_info(int sig, struct siginfo *info, struct task_struct *t) { unsigned long int flags; - int ret; + int ret, blocked, ignored; + struct k_sigaction *action; spin_lock_irqsave(&t->sighand->siglock, flags); - if (t->sighand->action[sig-1].sa.sa_handler == SIG_IGN) { - t->sighand->action[sig-1].sa.sa_handler = SIG_DFL; - } - if (sigismember(&t->blocked, sig)) { - sigdelset(&t->blocked, sig); + action = &t->sighand->action[sig-1]; + ignored = action->sa.sa_handler == SIG_IGN; + blocked = sigismember(&t->blocked, sig); + if (blocked || ignored) { + action->sa.sa_handler = SIG_DFL; + if (blocked) { + sigdelset(&t->blocked, sig); + recalc_sigpending_tsk(t); + } } - recalc_sigpending_tsk(t); ret = specific_send_sig_info(sig, info, t); spin_unlock_irqrestore(&t->sighand->siglock, flags); diff --git a/kernel/softirq.c b/kernel/softirq.c index 8f03e3b89b5..3789ca98197 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -62,6 +62,137 @@ static inline void wakeup_softirqd(void) } /* + * This one is for softirq.c-internal use, + * where hardirqs are disabled legitimately: + */ +#ifdef CONFIG_TRACE_IRQFLAGS +static void __local_bh_disable(unsigned long ip) +{ + unsigned long flags; + + WARN_ON_ONCE(in_irq()); + + raw_local_irq_save(flags); + add_preempt_count(SOFTIRQ_OFFSET); + /* + * Were softirqs turned off above: + */ + if (softirq_count() == SOFTIRQ_OFFSET) + trace_softirqs_off(ip); + raw_local_irq_restore(flags); +} +#else /* !CONFIG_TRACE_IRQFLAGS */ +static inline void __local_bh_disable(unsigned long ip) +{ + add_preempt_count(SOFTIRQ_OFFSET); + barrier(); +} +#endif /* CONFIG_TRACE_IRQFLAGS */ + +void local_bh_disable(void) +{ + __local_bh_disable((unsigned long)__builtin_return_address(0)); +} + +EXPORT_SYMBOL(local_bh_disable); + +void __local_bh_enable(void) +{ + WARN_ON_ONCE(in_irq()); + + /* + * softirqs should never be enabled by __local_bh_enable(), + * it always nests inside local_bh_enable() sections: + */ + WARN_ON_ONCE(softirq_count() == SOFTIRQ_OFFSET); + + sub_preempt_count(SOFTIRQ_OFFSET); +} +EXPORT_SYMBOL_GPL(__local_bh_enable); + +/* + * Special-case - softirqs can safely be enabled in + * cond_resched_softirq(), or by __do_softirq(), + * without processing still-pending softirqs: + */ +void _local_bh_enable(void) +{ + WARN_ON_ONCE(in_irq()); + 
WARN_ON_ONCE(!irqs_disabled()); + + if (softirq_count() == SOFTIRQ_OFFSET) + trace_softirqs_on((unsigned long)__builtin_return_address(0)); + sub_preempt_count(SOFTIRQ_OFFSET); +} + +EXPORT_SYMBOL(_local_bh_enable); + +void local_bh_enable(void) +{ +#ifdef CONFIG_TRACE_IRQFLAGS + unsigned long flags; + + WARN_ON_ONCE(in_irq()); +#endif + WARN_ON_ONCE(irqs_disabled()); + +#ifdef CONFIG_TRACE_IRQFLAGS + local_irq_save(flags); +#endif + /* + * Are softirqs going to be turned on now: + */ + if (softirq_count() == SOFTIRQ_OFFSET) + trace_softirqs_on((unsigned long)__builtin_return_address(0)); + /* + * Keep preemption disabled until we are done with + * softirq processing: + */ + sub_preempt_count(SOFTIRQ_OFFSET - 1); + + if (unlikely(!in_interrupt() && local_softirq_pending())) + do_softirq(); + + dec_preempt_count(); +#ifdef CONFIG_TRACE_IRQFLAGS + local_irq_restore(flags); +#endif + preempt_check_resched(); +} +EXPORT_SYMBOL(local_bh_enable); + +void local_bh_enable_ip(unsigned long ip) +{ +#ifdef CONFIG_TRACE_IRQFLAGS + unsigned long flags; + + WARN_ON_ONCE(in_irq()); + + local_irq_save(flags); +#endif + /* + * Are softirqs going to be turned on now: + */ + if (softirq_count() == SOFTIRQ_OFFSET) + trace_softirqs_on(ip); + /* + * Keep preemption disabled until we are done with + * softirq processing: + */ + sub_preempt_count(SOFTIRQ_OFFSET - 1); + + if (unlikely(!in_interrupt() && local_softirq_pending())) + do_softirq(); + + dec_preempt_count(); +#ifdef CONFIG_TRACE_IRQFLAGS + local_irq_restore(flags); +#endif + preempt_check_resched(); +} +EXPORT_SYMBOL(local_bh_enable_ip); + +/* * We restart softirq processing MAX_SOFTIRQ_RESTART times, * and we fall back to softirqd after that. * @@ -80,8 +211,11 @@ asmlinkage void __do_softirq(void) int cpu; pending = local_softirq_pending(); + account_system_vtime(current); + + __local_bh_disable((unsigned long)__builtin_return_address(0)); + trace_softirq_enter(); - local_bh_disable(); cpu = smp_processor_id(); restart: /* Reset the pending bitmask before enabling irqs */ @@ -109,7 +243,10 @@ restart: if (pending) wakeup_softirqd(); - __local_bh_enable(); + trace_softirq_exit(); + + account_system_vtime(current); + _local_bh_enable(); } #ifndef __ARCH_HAS_DO_SOFTIRQ @@ -136,23 +273,6 @@ EXPORT_SYMBOL(do_softirq); #endif -void local_bh_enable(void) -{ - WARN_ON(irqs_disabled()); - /* - * Keep preemption disabled until we are done with - * softirq processing: - */ - sub_preempt_count(SOFTIRQ_OFFSET - 1); - - if (unlikely(!in_interrupt() && local_softirq_pending())) - do_softirq(); - - dec_preempt_count(); - preempt_check_resched(); -} -EXPORT_SYMBOL(local_bh_enable); - #ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED # define invoke_softirq() __do_softirq() #else @@ -165,6 +285,7 @@ EXPORT_SYMBOL(local_bh_enable); void irq_exit(void) { account_system_vtime(current); + trace_hardirq_exit(); sub_preempt_count(IRQ_EXIT_OFFSET); if (!in_interrupt() && local_softirq_pending()) invoke_softirq(); @@ -208,8 +329,6 @@ void open_softirq(int nr, void (*action)(struct softirq_action*), void *data) softirq_vec[nr].action = action; } -EXPORT_SYMBOL(open_softirq); - /* Tasklets */ struct tasklet_head { @@ -446,7 +565,7 @@ static void takeover_tasklets(unsigned int cpu) } #endif /* CONFIG_HOTPLUG_CPU */ -static int __devinit cpu_callback(struct notifier_block *nfb, +static int __cpuinit cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { @@ -486,7 +605,7 @@ static int __devinit cpu_callback(struct notifier_block *nfb, return NOTIFY_OK; } -static struct 
notifier_block __devinitdata cpu_nfb = { +static struct notifier_block __cpuinitdata cpu_nfb = { .notifier_call = cpu_callback }; diff --git a/kernel/softlockup.c b/kernel/softlockup.c index 6b76caa2298..03e6a2b0b78 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c @@ -104,7 +104,7 @@ static int watchdog(void * __bind_cpu) /* * Create/destroy watchdog threads as CPUs come and go: */ -static int __devinit +static int __cpuinit cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { int hotcpu = (unsigned long)hcpu; @@ -142,7 +142,7 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) return NOTIFY_OK; } -static struct notifier_block __devinitdata cpu_nfb = { +static struct notifier_block __cpuinitdata cpu_nfb = { .notifier_call = cpu_callback }; diff --git a/kernel/spinlock.c b/kernel/spinlock.c index b31e54eadf5..fb524b009ee 100644 --- a/kernel/spinlock.c +++ b/kernel/spinlock.c @@ -13,6 +13,7 @@ #include <linux/preempt.h> #include <linux/spinlock.h> #include <linux/interrupt.h> +#include <linux/debug_locks.h> #include <linux/module.h> /* @@ -29,8 +30,10 @@ EXPORT_SYMBOL(generic__raw_read_trylock); int __lockfunc _spin_trylock(spinlock_t *lock) { preempt_disable(); - if (_raw_spin_trylock(lock)) + if (_raw_spin_trylock(lock)) { + spin_acquire(&lock->dep_map, 0, 1, _RET_IP_); return 1; + } preempt_enable(); return 0; @@ -40,8 +43,10 @@ EXPORT_SYMBOL(_spin_trylock); int __lockfunc _read_trylock(rwlock_t *lock) { preempt_disable(); - if (_raw_read_trylock(lock)) + if (_raw_read_trylock(lock)) { + rwlock_acquire_read(&lock->dep_map, 0, 1, _RET_IP_); return 1; + } preempt_enable(); return 0; @@ -51,19 +56,28 @@ EXPORT_SYMBOL(_read_trylock); int __lockfunc _write_trylock(rwlock_t *lock) { preempt_disable(); - if (_raw_write_trylock(lock)) + if (_raw_write_trylock(lock)) { + rwlock_acquire(&lock->dep_map, 0, 1, _RET_IP_); return 1; + } preempt_enable(); return 0; } EXPORT_SYMBOL(_write_trylock); -#if !defined(CONFIG_PREEMPT) || !defined(CONFIG_SMP) +/* + * If lockdep is enabled then we use the non-preemption spin-ops + * even on CONFIG_PREEMPT, because lockdep assumes that interrupts are + * not re-enabled during lock-acquire (which the preempt-spin-ops do): + */ +#if !defined(CONFIG_PREEMPT) || !defined(CONFIG_SMP) || \ + defined(CONFIG_DEBUG_LOCK_ALLOC) void __lockfunc _read_lock(rwlock_t *lock) { preempt_disable(); + rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_); _raw_read_lock(lock); } EXPORT_SYMBOL(_read_lock); @@ -74,7 +88,17 @@ unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *lock) local_irq_save(flags); preempt_disable(); + spin_acquire(&lock->dep_map, 0, 0, _RET_IP_); + /* + * On lockdep we dont want the hand-coded irq-enable of + * _raw_spin_lock_flags() code, because lockdep assumes + * that interrupts are not re-enabled during lock-acquire: + */ +#ifdef CONFIG_PROVE_LOCKING + _raw_spin_lock(lock); +#else _raw_spin_lock_flags(lock, &flags); +#endif return flags; } EXPORT_SYMBOL(_spin_lock_irqsave); @@ -83,6 +107,7 @@ void __lockfunc _spin_lock_irq(spinlock_t *lock) { local_irq_disable(); preempt_disable(); + spin_acquire(&lock->dep_map, 0, 0, _RET_IP_); _raw_spin_lock(lock); } EXPORT_SYMBOL(_spin_lock_irq); @@ -91,6 +116,7 @@ void __lockfunc _spin_lock_bh(spinlock_t *lock) { local_bh_disable(); preempt_disable(); + spin_acquire(&lock->dep_map, 0, 0, _RET_IP_); _raw_spin_lock(lock); } EXPORT_SYMBOL(_spin_lock_bh); @@ -101,6 +127,7 @@ unsigned long __lockfunc _read_lock_irqsave(rwlock_t *lock) local_irq_save(flags); 
preempt_disable(); + rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_); _raw_read_lock(lock); return flags; } @@ -110,6 +137,7 @@ void __lockfunc _read_lock_irq(rwlock_t *lock) { local_irq_disable(); preempt_disable(); + rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_); _raw_read_lock(lock); } EXPORT_SYMBOL(_read_lock_irq); @@ -118,6 +146,7 @@ void __lockfunc _read_lock_bh(rwlock_t *lock) { local_bh_disable(); preempt_disable(); + rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_); _raw_read_lock(lock); } EXPORT_SYMBOL(_read_lock_bh); @@ -128,6 +157,7 @@ unsigned long __lockfunc _write_lock_irqsave(rwlock_t *lock) local_irq_save(flags); preempt_disable(); + rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_); _raw_write_lock(lock); return flags; } @@ -137,6 +167,7 @@ void __lockfunc _write_lock_irq(rwlock_t *lock) { local_irq_disable(); preempt_disable(); + rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_); _raw_write_lock(lock); } EXPORT_SYMBOL(_write_lock_irq); @@ -145,6 +176,7 @@ void __lockfunc _write_lock_bh(rwlock_t *lock) { local_bh_disable(); preempt_disable(); + rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_); _raw_write_lock(lock); } EXPORT_SYMBOL(_write_lock_bh); @@ -152,6 +184,7 @@ EXPORT_SYMBOL(_write_lock_bh); void __lockfunc _spin_lock(spinlock_t *lock) { preempt_disable(); + spin_acquire(&lock->dep_map, 0, 0, _RET_IP_); _raw_spin_lock(lock); } @@ -160,6 +193,7 @@ EXPORT_SYMBOL(_spin_lock); void __lockfunc _write_lock(rwlock_t *lock) { preempt_disable(); + rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_); _raw_write_lock(lock); } @@ -255,8 +289,22 @@ BUILD_LOCK_OPS(write, rwlock); #endif /* CONFIG_PREEMPT */ +#ifdef CONFIG_DEBUG_LOCK_ALLOC + +void __lockfunc _spin_lock_nested(spinlock_t *lock, int subclass) +{ + preempt_disable(); + spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_); + _raw_spin_lock(lock); +} + +EXPORT_SYMBOL(_spin_lock_nested); + +#endif + void __lockfunc _spin_unlock(spinlock_t *lock) { + spin_release(&lock->dep_map, 1, _RET_IP_); _raw_spin_unlock(lock); preempt_enable(); } @@ -264,6 +312,7 @@ EXPORT_SYMBOL(_spin_unlock); void __lockfunc _write_unlock(rwlock_t *lock) { + rwlock_release(&lock->dep_map, 1, _RET_IP_); _raw_write_unlock(lock); preempt_enable(); } @@ -271,6 +320,7 @@ EXPORT_SYMBOL(_write_unlock); void __lockfunc _read_unlock(rwlock_t *lock) { + rwlock_release(&lock->dep_map, 1, _RET_IP_); _raw_read_unlock(lock); preempt_enable(); } @@ -278,6 +328,7 @@ EXPORT_SYMBOL(_read_unlock); void __lockfunc _spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags) { + spin_release(&lock->dep_map, 1, _RET_IP_); _raw_spin_unlock(lock); local_irq_restore(flags); preempt_enable(); @@ -286,6 +337,7 @@ EXPORT_SYMBOL(_spin_unlock_irqrestore); void __lockfunc _spin_unlock_irq(spinlock_t *lock) { + spin_release(&lock->dep_map, 1, _RET_IP_); _raw_spin_unlock(lock); local_irq_enable(); preempt_enable(); @@ -294,14 +346,16 @@ EXPORT_SYMBOL(_spin_unlock_irq); void __lockfunc _spin_unlock_bh(spinlock_t *lock) { + spin_release(&lock->dep_map, 1, _RET_IP_); _raw_spin_unlock(lock); preempt_enable_no_resched(); - local_bh_enable(); + local_bh_enable_ip((unsigned long)__builtin_return_address(0)); } EXPORT_SYMBOL(_spin_unlock_bh); void __lockfunc _read_unlock_irqrestore(rwlock_t *lock, unsigned long flags) { + rwlock_release(&lock->dep_map, 1, _RET_IP_); _raw_read_unlock(lock); local_irq_restore(flags); preempt_enable(); @@ -310,6 +364,7 @@ EXPORT_SYMBOL(_read_unlock_irqrestore); void __lockfunc _read_unlock_irq(rwlock_t *lock) { + rwlock_release(&lock->dep_map, 1, 
_RET_IP_); _raw_read_unlock(lock); local_irq_enable(); preempt_enable(); @@ -318,14 +373,16 @@ EXPORT_SYMBOL(_read_unlock_irq); void __lockfunc _read_unlock_bh(rwlock_t *lock) { + rwlock_release(&lock->dep_map, 1, _RET_IP_); _raw_read_unlock(lock); preempt_enable_no_resched(); - local_bh_enable(); + local_bh_enable_ip((unsigned long)__builtin_return_address(0)); } EXPORT_SYMBOL(_read_unlock_bh); void __lockfunc _write_unlock_irqrestore(rwlock_t *lock, unsigned long flags) { + rwlock_release(&lock->dep_map, 1, _RET_IP_); _raw_write_unlock(lock); local_irq_restore(flags); preempt_enable(); @@ -334,6 +391,7 @@ EXPORT_SYMBOL(_write_unlock_irqrestore); void __lockfunc _write_unlock_irq(rwlock_t *lock) { + rwlock_release(&lock->dep_map, 1, _RET_IP_); _raw_write_unlock(lock); local_irq_enable(); preempt_enable(); @@ -342,9 +400,10 @@ EXPORT_SYMBOL(_write_unlock_irq); void __lockfunc _write_unlock_bh(rwlock_t *lock) { + rwlock_release(&lock->dep_map, 1, _RET_IP_); _raw_write_unlock(lock); preempt_enable_no_resched(); - local_bh_enable(); + local_bh_enable_ip((unsigned long)__builtin_return_address(0)); } EXPORT_SYMBOL(_write_unlock_bh); @@ -352,11 +411,13 @@ int __lockfunc _spin_trylock_bh(spinlock_t *lock) { local_bh_disable(); preempt_disable(); - if (_raw_spin_trylock(lock)) + if (_raw_spin_trylock(lock)) { + spin_acquire(&lock->dep_map, 0, 1, _RET_IP_); return 1; + } preempt_enable_no_resched(); - local_bh_enable(); + local_bh_enable_ip((unsigned long)__builtin_return_address(0)); return 0; } EXPORT_SYMBOL(_spin_trylock_bh); diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c new file mode 100644 index 00000000000..b71816e47a3 --- /dev/null +++ b/kernel/stacktrace.c @@ -0,0 +1,24 @@ +/* + * kernel/stacktrace.c + * + * Stack trace management functions + * + * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> + */ +#include <linux/sched.h> +#include <linux/kallsyms.h> +#include <linux/stacktrace.h> + +void print_stack_trace(struct stack_trace *trace, int spaces) +{ + int i, j; + + for (i = 0; i < trace->nr_entries; i++) { + unsigned long ip = trace->entries[i]; + + for (j = 0; j < spaces + 1; j++) + printk(" "); + print_ip_sym(ip); + } +} + diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 2c0aacc37c5..51cacd111db 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -4,7 +4,6 @@ #include <linux/cpu.h> #include <linux/err.h> #include <linux/syscalls.h> -#include <linux/kthread.h> #include <asm/atomic.h> #include <asm/semaphore.h> #include <asm/uaccess.h> @@ -26,11 +25,13 @@ static unsigned int stopmachine_num_threads; static atomic_t stopmachine_thread_ack; static DECLARE_MUTEX(stopmachine_mutex); -static int stopmachine(void *unused) +static int stopmachine(void *cpu) { int irqs_disabled = 0; int prepared = 0; + set_cpus_allowed(current, cpumask_of_cpu((int)(long)cpu)); + /* Ack: we are alive */ smp_mb(); /* Theoretically the ack = 0 might not be on this CPU yet. */ atomic_inc(&stopmachine_thread_ack); @@ -84,8 +85,7 @@ static void stopmachine_set_state(enum stopmachine_state state) static int stop_machine(void) { - int ret = 0; - unsigned int i; + int i, ret = 0; struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; /* One high-prio thread per cpu. We'll do this one. 
*/ @@ -96,16 +96,11 @@ static int stop_machine(void) stopmachine_state = STOPMACHINE_WAIT; for_each_online_cpu(i) { - struct task_struct *tsk; if (i == raw_smp_processor_id()) continue; - tsk = kthread_create(stopmachine, NULL, "stopmachine"); - if (IS_ERR(tsk)) { - ret = PTR_ERR(tsk); + ret = kernel_thread(stopmachine, (void *)(long)i,CLONE_KERNEL); + if (ret < 0) break; - } - kthread_bind(tsk, i); - wake_up_process(tsk); stopmachine_num_threads++; } @@ -116,7 +111,6 @@ static int stop_machine(void) /* If some failed, kill them all. */ if (ret < 0) { stopmachine_set_state(STOPMACHINE_EXIT); - up(&stopmachine_mutex); return ret; } diff --git a/kernel/sys.c b/kernel/sys.c index dbb3b9c7ea6..e236f98f7ec 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1983,7 +1983,7 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3, error = current->mm->dumpable; break; case PR_SET_DUMPABLE: - if (arg2 < 0 || arg2 > 2) { + if (arg2 < 0 || arg2 > 1) { error = -EINVAL; break; } diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 99a58f27907..362a0cc3713 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -932,6 +932,17 @@ static ctl_table vm_table[] = { .strategy = &sysctl_intvec, .extra1 = &zero, }, + { + .ctl_name = VM_MIN_UNMAPPED, + .procname = "min_unmapped_ratio", + .data = &sysctl_min_unmapped_ratio, + .maxlen = sizeof(sysctl_min_unmapped_ratio), + .mode = 0644, + .proc_handler = &sysctl_min_unmapped_ratio_sysctl_handler, + .strategy = &sysctl_intvec, + .extra1 = &zero, + .extra2 = &one_hundred, + }, #endif #ifdef CONFIG_X86_32 { diff --git a/kernel/taskstats.c b/kernel/taskstats.c new file mode 100644 index 00000000000..e7818765733 --- /dev/null +++ b/kernel/taskstats.c @@ -0,0 +1,564 @@ +/* + * taskstats.c - Export per-task statistics to userland + * + * Copyright (C) Shailabh Nagar, IBM Corp. 2006 + * (C) Balbir Singh, IBM Corp. 2006 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + */ + +#include <linux/kernel.h> +#include <linux/taskstats_kern.h> +#include <linux/delayacct.h> +#include <linux/cpumask.h> +#include <linux/percpu.h> +#include <net/genetlink.h> +#include <asm/atomic.h> + +/* + * Maximum length of a cpumask that can be specified in + * the TASKSTATS_CMD_ATTR_REGISTER/DEREGISTER_CPUMASK attribute + */ +#define TASKSTATS_CPUMASK_MAXLEN (100+6*NR_CPUS) + +static DEFINE_PER_CPU(__u32, taskstats_seqnum) = { 0 }; +static int family_registered; +kmem_cache_t *taskstats_cache; + +static struct genl_family family = { + .id = GENL_ID_GENERATE, + .name = TASKSTATS_GENL_NAME, + .version = TASKSTATS_GENL_VERSION, + .maxattr = TASKSTATS_CMD_ATTR_MAX, +}; + +static struct nla_policy taskstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1] +__read_mostly = { + [TASKSTATS_CMD_ATTR_PID] = { .type = NLA_U32 }, + [TASKSTATS_CMD_ATTR_TGID] = { .type = NLA_U32 }, + [TASKSTATS_CMD_ATTR_REGISTER_CPUMASK] = { .type = NLA_STRING }, + [TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK] = { .type = NLA_STRING },}; + +struct listener { + struct list_head list; + pid_t pid; + char valid; +}; + +struct listener_list { + struct rw_semaphore sem; + struct list_head list; +}; +static DEFINE_PER_CPU(struct listener_list, listener_array); + +enum actions { + REGISTER, + DEREGISTER, + CPU_DONT_CARE +}; + +static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp, + void **replyp, size_t size) +{ + struct sk_buff *skb; + void *reply; + + /* + * If new attributes are added, please revisit this allocation + */ + skb = nlmsg_new(size); + if (!skb) + return -ENOMEM; + + if (!info) { + int seq = get_cpu_var(taskstats_seqnum)++; + put_cpu_var(taskstats_seqnum); + + reply = genlmsg_put(skb, 0, seq, + family.id, 0, 0, + cmd, family.version); + } else + reply = genlmsg_put(skb, info->snd_pid, info->snd_seq, + family.id, 0, 0, + cmd, family.version); + if (reply == NULL) { + nlmsg_free(skb); + return -EINVAL; + } + + *skbp = skb; + *replyp = reply; + return 0; +} + +/* + * Send taskstats data in @skb to listener with nl_pid @pid + */ +static int send_reply(struct sk_buff *skb, pid_t pid) +{ + struct genlmsghdr *genlhdr = nlmsg_data((struct nlmsghdr *)skb->data); + void *reply = genlmsg_data(genlhdr); + int rc; + + rc = genlmsg_end(skb, reply); + if (rc < 0) { + nlmsg_free(skb); + return rc; + } + + return genlmsg_unicast(skb, pid); +} + +/* + * Send taskstats data in @skb to listeners registered for @cpu's exit data + */ +static void send_cpu_listeners(struct sk_buff *skb, unsigned int cpu) +{ + struct genlmsghdr *genlhdr = nlmsg_data((struct nlmsghdr *)skb->data); + struct listener_list *listeners; + struct listener *s, *tmp; + struct sk_buff *skb_next, *skb_cur = skb; + void *reply = genlmsg_data(genlhdr); + int rc, delcount = 0; + + rc = genlmsg_end(skb, reply); + if (rc < 0) { + nlmsg_free(skb); + return; + } + + rc = 0; + listeners = &per_cpu(listener_array, cpu); + down_read(&listeners->sem); + list_for_each_entry(s, &listeners->list, list) { + skb_next = NULL; + if (!list_is_last(&s->list, &listeners->list)) { + skb_next = skb_clone(skb_cur, GFP_KERNEL); + if (!skb_next) + break; + } + rc = genlmsg_unicast(skb_cur, s->pid); + if (rc == -ECONNREFUSED) { + s->valid = 0; + delcount++; + } + skb_cur = skb_next; + } + up_read(&listeners->sem); + + if (skb_cur) + nlmsg_free(skb_cur); + + if (!delcount) + return; + + /* Delete invalidated entries */ + down_write(&listeners->sem); + list_for_each_entry_safe(s, tmp, &listeners->list, list) { + if (!s->valid) { + list_del(&s->list); + kfree(s); 
+ } + } + up_write(&listeners->sem); +} + +static int fill_pid(pid_t pid, struct task_struct *pidtsk, + struct taskstats *stats) +{ + int rc = 0; + struct task_struct *tsk = pidtsk; + + if (!pidtsk) { + read_lock(&tasklist_lock); + tsk = find_task_by_pid(pid); + if (!tsk) { + read_unlock(&tasklist_lock); + return -ESRCH; + } + get_task_struct(tsk); + read_unlock(&tasklist_lock); + } else + get_task_struct(tsk); + + /* + * Each accounting subsystem adds calls to its functions to + * fill in relevant parts of struct taskstsats as follows + * + * per-task-foo(stats, tsk); + */ + + delayacct_add_tsk(stats, tsk); + stats->version = TASKSTATS_VERSION; + + /* Define err: label here if needed */ + put_task_struct(tsk); + return rc; + +} + +static int fill_tgid(pid_t tgid, struct task_struct *tgidtsk, + struct taskstats *stats) +{ + struct task_struct *tsk, *first; + unsigned long flags; + + /* + * Add additional stats from live tasks except zombie thread group + * leaders who are already counted with the dead tasks + */ + first = tgidtsk; + if (!first) { + read_lock(&tasklist_lock); + first = find_task_by_pid(tgid); + if (!first) { + read_unlock(&tasklist_lock); + return -ESRCH; + } + get_task_struct(first); + read_unlock(&tasklist_lock); + } else + get_task_struct(first); + + /* Start with stats from dead tasks */ + spin_lock_irqsave(&first->signal->stats_lock, flags); + if (first->signal->stats) + memcpy(stats, first->signal->stats, sizeof(*stats)); + spin_unlock_irqrestore(&first->signal->stats_lock, flags); + + tsk = first; + read_lock(&tasklist_lock); + do { + if (tsk->exit_state == EXIT_ZOMBIE && thread_group_leader(tsk)) + continue; + /* + * Accounting subsystem can call its functions here to + * fill in relevant parts of struct taskstsats as follows + * + * per-task-foo(stats, tsk); + */ + delayacct_add_tsk(stats, tsk); + + } while_each_thread(first, tsk); + read_unlock(&tasklist_lock); + stats->version = TASKSTATS_VERSION; + + /* + * Accounting subsytems can also add calls here to modify + * fields of taskstats. 
+ */ + + return 0; +} + + +static void fill_tgid_exit(struct task_struct *tsk) +{ + unsigned long flags; + + spin_lock_irqsave(&tsk->signal->stats_lock, flags); + if (!tsk->signal->stats) + goto ret; + + /* + * Each accounting subsystem calls its functions here to + * accumalate its per-task stats for tsk, into the per-tgid structure + * + * per-task-foo(tsk->signal->stats, tsk); + */ + delayacct_add_tsk(tsk->signal->stats, tsk); +ret: + spin_unlock_irqrestore(&tsk->signal->stats_lock, flags); + return; +} + +static int add_del_listener(pid_t pid, cpumask_t *maskp, int isadd) +{ + struct listener_list *listeners; + struct listener *s, *tmp; + unsigned int cpu; + cpumask_t mask = *maskp; + + if (!cpus_subset(mask, cpu_possible_map)) + return -EINVAL; + + if (isadd == REGISTER) { + for_each_cpu_mask(cpu, mask) { + s = kmalloc_node(sizeof(struct listener), GFP_KERNEL, + cpu_to_node(cpu)); + if (!s) + goto cleanup; + s->pid = pid; + INIT_LIST_HEAD(&s->list); + s->valid = 1; + + listeners = &per_cpu(listener_array, cpu); + down_write(&listeners->sem); + list_add(&s->list, &listeners->list); + up_write(&listeners->sem); + } + return 0; + } + + /* Deregister or cleanup */ +cleanup: + for_each_cpu_mask(cpu, mask) { + listeners = &per_cpu(listener_array, cpu); + down_write(&listeners->sem); + list_for_each_entry_safe(s, tmp, &listeners->list, list) { + if (s->pid == pid) { + list_del(&s->list); + kfree(s); + break; + } + } + up_write(&listeners->sem); + } + return 0; +} + +static int parse(struct nlattr *na, cpumask_t *mask) +{ + char *data; + int len; + int ret; + + if (na == NULL) + return 1; + len = nla_len(na); + if (len > TASKSTATS_CPUMASK_MAXLEN) + return -E2BIG; + if (len < 1) + return -EINVAL; + data = kmalloc(len, GFP_KERNEL); + if (!data) + return -ENOMEM; + nla_strlcpy(data, na, len); + ret = cpulist_parse(data, *mask); + kfree(data); + return ret; +} + +static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info) +{ + int rc = 0; + struct sk_buff *rep_skb; + struct taskstats stats; + void *reply; + size_t size; + struct nlattr *na; + cpumask_t mask; + + rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], &mask); + if (rc < 0) + return rc; + if (rc == 0) + return add_del_listener(info->snd_pid, &mask, REGISTER); + + rc = parse(info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK], &mask); + if (rc < 0) + return rc; + if (rc == 0) + return add_del_listener(info->snd_pid, &mask, DEREGISTER); + + /* + * Size includes space for nested attributes + */ + size = nla_total_size(sizeof(u32)) + + nla_total_size(sizeof(struct taskstats)) + nla_total_size(0); + + memset(&stats, 0, sizeof(stats)); + rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, &reply, size); + if (rc < 0) + return rc; + + if (info->attrs[TASKSTATS_CMD_ATTR_PID]) { + u32 pid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_PID]); + rc = fill_pid(pid, NULL, &stats); + if (rc < 0) + goto err; + + na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_PID); + NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_PID, pid); + NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS, + stats); + } else if (info->attrs[TASKSTATS_CMD_ATTR_TGID]) { + u32 tgid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_TGID]); + rc = fill_tgid(tgid, NULL, &stats); + if (rc < 0) + goto err; + + na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_TGID); + NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_TGID, tgid); + NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS, + stats); + } else { + rc = -EINVAL; + goto err; + } + + nla_nest_end(rep_skb, na); + + 
return send_reply(rep_skb, info->snd_pid); + +nla_put_failure: + return genlmsg_cancel(rep_skb, reply); +err: + nlmsg_free(rep_skb); + return rc; +} + +void taskstats_exit_alloc(struct taskstats **ptidstats, unsigned int *mycpu) +{ + struct listener_list *listeners; + struct taskstats *tmp; + /* + * This is the cpu on which the task is exiting currently and will + * be the one for which the exit event is sent, even if the cpu + * on which this function is running changes later. + */ + *mycpu = raw_smp_processor_id(); + + *ptidstats = NULL; + tmp = kmem_cache_zalloc(taskstats_cache, SLAB_KERNEL); + if (!tmp) + return; + + listeners = &per_cpu(listener_array, *mycpu); + down_read(&listeners->sem); + if (!list_empty(&listeners->list)) { + *ptidstats = tmp; + tmp = NULL; + } + up_read(&listeners->sem); + kfree(tmp); +} + +/* Send pid data out on exit */ +void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats, + int group_dead, unsigned int mycpu) +{ + int rc; + struct sk_buff *rep_skb; + void *reply; + size_t size; + int is_thread_group; + struct nlattr *na; + unsigned long flags; + + if (!family_registered || !tidstats) + return; + + spin_lock_irqsave(&tsk->signal->stats_lock, flags); + is_thread_group = tsk->signal->stats ? 1 : 0; + spin_unlock_irqrestore(&tsk->signal->stats_lock, flags); + + rc = 0; + /* + * Size includes space for nested attributes + */ + size = nla_total_size(sizeof(u32)) + + nla_total_size(sizeof(struct taskstats)) + nla_total_size(0); + + if (is_thread_group) + size = 2 * size; /* PID + STATS + TGID + STATS */ + + rc = prepare_reply(NULL, TASKSTATS_CMD_NEW, &rep_skb, &reply, size); + if (rc < 0) + goto ret; + + rc = fill_pid(tsk->pid, tsk, tidstats); + if (rc < 0) + goto err_skb; + + na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_PID); + NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_PID, (u32)tsk->pid); + NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS, + *tidstats); + nla_nest_end(rep_skb, na); + + if (!is_thread_group) + goto send; + + /* + * tsk has/had a thread group so fill the tsk->signal->stats structure + * Doesn't matter if tsk is the leader or the last group member leaving + */ + + fill_tgid_exit(tsk); + if (!group_dead) + goto send; + + na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_TGID); + NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_TGID, (u32)tsk->tgid); + /* No locking needed for tsk->signal->stats since group is dead */ + NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS, + *tsk->signal->stats); + nla_nest_end(rep_skb, na); + +send: + send_cpu_listeners(rep_skb, mycpu); + return; + +nla_put_failure: + genlmsg_cancel(rep_skb, reply); + goto ret; +err_skb: + nlmsg_free(rep_skb); +ret: + return; +} + +static struct genl_ops taskstats_ops = { + .cmd = TASKSTATS_CMD_GET, + .doit = taskstats_user_cmd, + .policy = taskstats_cmd_get_policy, +}; + +/* Needed early in initialization */ +void __init taskstats_init_early(void) +{ + unsigned int i; + + taskstats_cache = kmem_cache_create("taskstats_cache", + sizeof(struct taskstats), + 0, SLAB_PANIC, NULL, NULL); + for_each_possible_cpu(i) { + INIT_LIST_HEAD(&(per_cpu(listener_array, i).list)); + init_rwsem(&(per_cpu(listener_array, i).sem)); + } +} + +static int __init taskstats_init(void) +{ + int rc; + + rc = genl_register_family(&family); + if (rc) + return rc; + + rc = genl_register_ops(&family, &taskstats_ops); + if (rc < 0) + goto err; + + family_registered = 1; + return 0; +err: + genl_unregister_family(&family); + return rc; +} + +/* + * late initcall ensures initialization of 
statistics collection + * mechanisms precedes initialization of the taskstats interface + */ +late_initcall(taskstats_init); diff --git a/kernel/timer.c b/kernel/timer.c index 5a896025306..1d7dd6267c2 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -84,7 +84,7 @@ typedef struct tvec_t_base_s tvec_base_t; tvec_base_t boot_tvec_bases; EXPORT_SYMBOL(boot_tvec_bases); -static DEFINE_PER_CPU(tvec_base_t *, tvec_bases) = { &boot_tvec_bases }; +static DEFINE_PER_CPU(tvec_base_t *, tvec_bases) = &boot_tvec_bases; static inline void set_running_timer(tvec_base_t *base, struct timer_list *timer) @@ -374,6 +374,7 @@ int del_timer_sync(struct timer_list *timer) int ret = try_to_del_timer_sync(timer); if (ret >= 0) return ret; + cpu_relax(); } } @@ -407,7 +408,7 @@ static int cascade(tvec_base_t *base, tvec_t *tv, int index) * This function cascades all vectors and executes all expired timer * vectors. */ -#define INDEX(N) (base->timer_jiffies >> (TVR_BITS + N * TVN_BITS)) & TVN_MASK +#define INDEX(N) ((base->timer_jiffies >> (TVR_BITS + (N) * TVN_BITS)) & TVN_MASK) static inline void __run_timers(tvec_base_t *base) { @@ -891,6 +892,7 @@ int do_settimeofday(struct timespec *tv) set_normalized_timespec(&xtime, sec, nsec); set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec); + clock->error = 0; ntp_clear(); write_sequnlock_irqrestore(&xtime_lock, flags); @@ -967,6 +969,7 @@ void __init timekeeping_init(void) } +static int timekeeping_suspended; /* * timekeeping_resume - Resumes the generic timekeeping subsystem. * @dev: unused @@ -982,6 +985,18 @@ static int timekeeping_resume(struct sys_device *dev) write_seqlock_irqsave(&xtime_lock, flags); /* restart the last cycle value */ clock->cycle_last = clocksource_read(clock); + clock->error = 0; + timekeeping_suspended = 0; + write_sequnlock_irqrestore(&xtime_lock, flags); + return 0; +} + +static int timekeeping_suspend(struct sys_device *dev, pm_message_t state) +{ + unsigned long flags; + + write_seqlock_irqsave(&xtime_lock, flags); + timekeeping_suspended = 1; write_sequnlock_irqrestore(&xtime_lock, flags); return 0; } @@ -989,6 +1004,7 @@ static int timekeeping_resume(struct sys_device *dev) /* sysfs resume/suspend bits for timekeeping */ static struct sysdev_class timekeeping_sysclass = { .resume = timekeeping_resume, + .suspend = timekeeping_suspend, set_kset_name("timekeeping"), }; @@ -1008,52 +1024,52 @@ static int __init timekeeping_init_device(void) device_initcall(timekeeping_init_device); /* - * If the error is already larger, we look ahead another tick, + * If the error is already larger, we look ahead even further * to compensate for late or lost adjustments. */ -static __always_inline int clocksource_bigadjust(int sign, s64 error, s64 *interval, s64 *offset) +static __always_inline int clocksource_bigadjust(s64 error, s64 *interval, s64 *offset) { - int adj; + s64 tick_error, i; + u32 look_ahead, adj; + s32 error2, mult; /* - * As soon as the machine is synchronized to the external time - * source this should be the common case. + * Use the current error value to determine how much to look ahead. + * The larger the error the slower we adjust for it to avoid problems + * with losing too many ticks, otherwise we would overadjust and + * produce an even larger error. The smaller the adjustment the + * faster we try to adjust for it, as lost ticks can do less harm + * here. This is tuned so that an error of about 1 msec is adusted + * within about 1 sec (or 2^20 nsec in 2^SHIFT_HZ ticks). */ - error >>= 2; - if (likely(sign > 0 ? 
error <= *interval : error >= *interval)) - return sign; + error2 = clock->error >> (TICK_LENGTH_SHIFT + 22 - 2 * SHIFT_HZ); + error2 = abs(error2); + for (look_ahead = 0; error2 > 0; look_ahead++) + error2 >>= 2; /* - * An extra look ahead dampens the effect of the current error, - * which can grow quite large with continously late updates, as - * it would dominate the adjustment value and can lead to - * oscillation. + * Now calculate the error in (1 << look_ahead) ticks, but first + * remove the single look ahead already included in the error. */ - error += current_tick_length() >> (TICK_LENGTH_SHIFT - clock->shift + 1); - error -= clock->xtime_interval >> 1; - - adj = 0; - while (1) { - error >>= 1; - if (sign > 0 ? error <= *interval : error >= *interval) - break; - adj++; + tick_error = current_tick_length() >> (TICK_LENGTH_SHIFT - clock->shift + 1); + tick_error -= clock->xtime_interval >> 1; + error = ((error - tick_error) >> look_ahead) + tick_error; + + /* Finally calculate the adjustment shift value. */ + i = *interval; + mult = 1; + if (error < 0) { + error = -error; + *interval = -*interval; + *offset = -*offset; + mult = -1; } - - /* - * Add the current adjustments to the error and take the offset - * into account, the latter can cause the error to be hardly - * reduced at the next tick. Check the error again if there's - * room for another adjustment, thus further reducing the error - * which otherwise had to be corrected at the next update. - */ - error = (error << 1) - *interval + *offset; - if (sign > 0 ? error > *interval : error < *interval) - adj++; + for (adj = 0; error > i; adj++) + error >>= 1; *interval <<= adj; *offset <<= adj; - return sign << adj; + return mult << adj; } /* @@ -1068,11 +1084,19 @@ static void clocksource_adjust(struct clocksource *clock, s64 offset) error = clock->error >> (TICK_LENGTH_SHIFT - clock->shift - 1); if (error > interval) { - adj = clocksource_bigadjust(1, error, &interval, &offset); + error >>= 2; + if (likely(error <= interval)) + adj = 1; + else + adj = clocksource_bigadjust(error, &interval, &offset); } else if (error < -interval) { - interval = -interval; - offset = -offset; - adj = clocksource_bigadjust(-1, error, &interval, &offset); + error >>= 2; + if (likely(error >= -interval)) { + adj = -1; + interval = -interval; + offset = -offset; + } else + adj = clocksource_bigadjust(error, &interval, &offset); } else return; @@ -1091,13 +1115,16 @@ static void update_wall_time(void) { cycle_t offset; - clock->xtime_nsec += (s64)xtime.tv_nsec << clock->shift; + /* Make sure we're fully resumed: */ + if (unlikely(timekeeping_suspended)) + return; #ifdef CONFIG_GENERIC_TIME offset = (clocksource_read(clock) - clock->cycle_last) & clock->mask; #else offset = clock->cycle_interval; #endif + clock->xtime_nsec += (s64)xtime.tv_nsec << clock->shift; /* normally this loop will run just once, however in the * case of lost or late ticks, it will accumulate correctly. @@ -1129,7 +1156,7 @@ static void update_wall_time(void) clocksource_adjust(clock, offset); /* store full nanoseconds into xtime */ - xtime.tv_nsec = clock->xtime_nsec >> clock->shift; + xtime.tv_nsec = (s64)clock->xtime_nsec >> clock->shift; clock->xtime_nsec -= (s64)xtime.tv_nsec << clock->shift; /* check to see if there is a new clocksource to use */ @@ -1208,7 +1235,7 @@ unsigned long wall_jiffies = INITIAL_JIFFIES; * playing with xtime and avenrun. 
*/ #ifndef ARCH_HAVE_XTIME_LOCK -seqlock_t xtime_lock __cacheline_aligned_in_smp = SEQLOCK_UNLOCKED; +__cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock); EXPORT_SYMBOL(xtime_lock); #endif @@ -1297,46 +1324,19 @@ asmlinkage long sys_getpid(void) } /* - * Accessing ->group_leader->real_parent is not SMP-safe, it could - * change from under us. However, rather than getting any lock - * we can use an optimistic algorithm: get the parent - * pid, and go back and check that the parent is still - * the same. If it has changed (which is extremely unlikely - * indeed), we just try again.. - * - * NOTE! This depends on the fact that even if we _do_ - * get an old value of "parent", we can happily dereference - * the pointer (it was and remains a dereferencable kernel pointer - * no matter what): we just can't necessarily trust the result - * until we know that the parent pointer is valid. - * - * NOTE2: ->group_leader never changes from under us. + * Accessing ->real_parent is not SMP-safe, it could + * change from under us. However, we can use a stale + * value of ->real_parent under rcu_read_lock(), see + * release_task()->call_rcu(delayed_put_task_struct). */ asmlinkage long sys_getppid(void) { int pid; - struct task_struct *me = current; - struct task_struct *parent; - parent = me->group_leader->real_parent; - for (;;) { - pid = parent->tgid; -#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT) -{ - struct task_struct *old = parent; + rcu_read_lock(); + pid = rcu_dereference(current->real_parent)->tgid; + rcu_read_unlock(); - /* - * Make sure we read the pid before re-reading the - * parent pointer: - */ - smp_rmb(); - parent = me->group_leader->real_parent; - if (old != parent) - continue; -} -#endif - break; - } return pid; } @@ -1368,7 +1368,7 @@ asmlinkage long sys_getegid(void) static void process_timeout(unsigned long __data) { - wake_up_process((task_t *)__data); + wake_up_process((struct task_struct *)__data); } /** @@ -1559,6 +1559,13 @@ asmlinkage long sys_sysinfo(struct sysinfo __user *info) return 0; } +/* + * lockdep: we want to track each per-CPU base as a separate lock-class, + * but timer-bases are kmalloc()-ed, so we need to attach separate + * keys to them: + */ +static struct lock_class_key base_lock_keys[NR_CPUS]; + static int __devinit init_timers_cpu(int cpu) { int j; @@ -1594,6 +1601,8 @@ static int __devinit init_timers_cpu(int cpu) } spin_lock_init(&base->lock); + lockdep_set_class(&base->lock, base_lock_keys + cpu); + for (j = 0; j < TVN_SIZE; j++) { INIT_LIST_HEAD(base->tv5.vec + j); INIT_LIST_HEAD(base->tv4.vec + j); @@ -1652,7 +1661,7 @@ static void __devinit migrate_timers(int cpu) } #endif /* CONFIG_HOTPLUG_CPU */ -static int __devinit timer_cpu_notify(struct notifier_block *self, +static int __cpuinit timer_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { long cpu = (long)hcpu; @@ -1672,7 +1681,7 @@ static int __devinit timer_cpu_notify(struct notifier_block *self, return NOTIFY_OK; } -static struct notifier_block __devinitdata timers_nb = { +static struct notifier_block __cpuinitdata timers_nb = { .notifier_call = timer_cpu_notify, }; diff --git a/kernel/wait.c b/kernel/wait.c index 5985d866531..59a82f63275 100644 --- a/kernel/wait.c +++ b/kernel/wait.c @@ -10,6 +10,14 @@ #include <linux/wait.h> #include <linux/hash.h> +void init_waitqueue_head(wait_queue_head_t *q) +{ + spin_lock_init(&q->lock); + INIT_LIST_HEAD(&q->task_list); +} + +EXPORT_SYMBOL(init_waitqueue_head); + void fastcall add_wait_queue(wait_queue_head_t *q, wait_queue_t 
*wait) { unsigned long flags; diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 59f0b42bd89..835fe28b87a 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -51,7 +51,7 @@ struct cpu_workqueue_struct { wait_queue_head_t work_done; struct workqueue_struct *wq; - task_t *thread; + struct task_struct *thread; int run_depth; /* Detect run_workqueue() recursion depth */ } ____cacheline_aligned; @@ -68,7 +68,7 @@ struct workqueue_struct { /* All the per-cpu workqueues on the system, for hotplug cpu to add/remove threads to each one as cpus come/go. */ -static DEFINE_SPINLOCK(workqueue_lock); +static DEFINE_MUTEX(workqueue_mutex); static LIST_HEAD(workqueues); static int singlethread_cpu; @@ -93,9 +93,12 @@ static void __queue_work(struct cpu_workqueue_struct *cwq, spin_unlock_irqrestore(&cwq->lock, flags); } -/* - * Queue work on a workqueue. Return non-zero if it was successfully - * added. +/** + * queue_work - queue work on a workqueue + * @wq: workqueue to use + * @work: work to queue + * + * Returns non-zero if it was successfully added. * * We queue the work to the CPU it was submitted, but there is no * guarantee that it will be processed by that CPU. @@ -114,6 +117,7 @@ int fastcall queue_work(struct workqueue_struct *wq, struct work_struct *work) put_cpu(); return ret; } +EXPORT_SYMBOL_GPL(queue_work); static void delayed_work_timer_fn(unsigned long __data) { @@ -127,6 +131,14 @@ static void delayed_work_timer_fn(unsigned long __data) __queue_work(per_cpu_ptr(wq->cpu_wq, cpu), work); } +/** + * queue_delayed_work - queue work on a workqueue after delay + * @wq: workqueue to use + * @work: work to queue + * @delay: number of jiffies to wait before queueing + * + * Returns non-zero if it was successfully added. + */ int fastcall queue_delayed_work(struct workqueue_struct *wq, struct work_struct *work, unsigned long delay) { @@ -147,6 +159,38 @@ int fastcall queue_delayed_work(struct workqueue_struct *wq, } return ret; } +EXPORT_SYMBOL_GPL(queue_delayed_work); + +/** + * queue_delayed_work_on - queue work on specific CPU after delay + * @cpu: CPU number to execute work on + * @wq: workqueue to use + * @work: work to queue + * @delay: number of jiffies to wait before queueing + * + * Returns non-zero if it was successfully added. + */ +int queue_delayed_work_on(int cpu, struct workqueue_struct *wq, + struct work_struct *work, unsigned long delay) +{ + int ret = 0; + struct timer_list *timer = &work->timer; + + if (!test_and_set_bit(0, &work->pending)) { + BUG_ON(timer_pending(timer)); + BUG_ON(!list_empty(&work->entry)); + + /* This stores wq for the moment, for the timer_fn */ + work->wq_data = wq; + timer->expires = jiffies + delay; + timer->data = (unsigned long)work; + timer->function = delayed_work_timer_fn; + add_timer_on(timer, cpu); + ret = 1; + } + return ret; +} +EXPORT_SYMBOL_GPL(queue_delayed_work_on); static void run_workqueue(struct cpu_workqueue_struct *cwq) { @@ -251,8 +295,9 @@ static void flush_cpu_workqueue(struct cpu_workqueue_struct *cwq) } } -/* +/** * flush_workqueue - ensure that any scheduled work has run to completion. + * @wq: workqueue to flush * * Forces execution of the workqueue and blocks until its completion. * This is typically used in driver shutdown handlers. 
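
The kerneldoc being added in the hunk above describes flush_workqueue() as something typically called from driver shutdown handlers. As a minimal, hypothetical teardown sketch (foo_wq and foo_work are illustrative names, not part of this patch; the calls match the workqueue API as it exists in this tree):

	#include <linux/workqueue.h>

	static struct workqueue_struct *foo_wq;	/* assumed created with create_workqueue() */
	static struct work_struct foo_work;	/* assumed set up with INIT_WORK() */

	static void foo_shutdown(void)
	{
		/* Wait for any already-queued foo_work instance to finish... */
		flush_workqueue(foo_wq);
		/* ...before the queue (and anything foo_work touches) goes away. */
		destroy_workqueue(foo_wq);
	}

The explicit flush is strictly redundant here, since destroy_workqueue() flushes pending work itself, but it makes the ordering constraint explicit.
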
@@ -275,12 +320,13 @@ void fastcall flush_workqueue(struct workqueue_struct *wq) } else { int cpu; - lock_cpu_hotplug(); + mutex_lock(&workqueue_mutex); for_each_online_cpu(cpu) flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, cpu)); - unlock_cpu_hotplug(); + mutex_unlock(&workqueue_mutex); } } +EXPORT_SYMBOL_GPL(flush_workqueue); static struct task_struct *create_workqueue_thread(struct workqueue_struct *wq, int cpu) @@ -325,8 +371,7 @@ struct workqueue_struct *__create_workqueue(const char *name, } wq->name = name; - /* We don't need the distraction of CPUs appearing and vanishing. */ - lock_cpu_hotplug(); + mutex_lock(&workqueue_mutex); if (singlethread) { INIT_LIST_HEAD(&wq->list); p = create_workqueue_thread(wq, singlethread_cpu); @@ -335,9 +380,7 @@ struct workqueue_struct *__create_workqueue(const char *name, else wake_up_process(p); } else { - spin_lock(&workqueue_lock); list_add(&wq->list, &workqueues); - spin_unlock(&workqueue_lock); for_each_online_cpu(cpu) { p = create_workqueue_thread(wq, cpu); if (p) { @@ -347,7 +390,7 @@ struct workqueue_struct *__create_workqueue(const char *name, destroy = 1; } } - unlock_cpu_hotplug(); + mutex_unlock(&workqueue_mutex); /* * Was there any error during startup? If yes then clean up: @@ -358,6 +401,7 @@ struct workqueue_struct *__create_workqueue(const char *name, } return wq; } +EXPORT_SYMBOL_GPL(__create_workqueue); static void cleanup_workqueue_thread(struct workqueue_struct *wq, int cpu) { @@ -374,6 +418,12 @@ static void cleanup_workqueue_thread(struct workqueue_struct *wq, int cpu) kthread_stop(p); } +/** + * destroy_workqueue - safely terminate a workqueue + * @wq: target workqueue + * + * Safely destroy a workqueue. All work currently pending will be done first. + */ void destroy_workqueue(struct workqueue_struct *wq) { int cpu; @@ -381,52 +431,63 @@ void destroy_workqueue(struct workqueue_struct *wq) flush_workqueue(wq); /* We don't need the distraction of CPUs appearing and vanishing. */ - lock_cpu_hotplug(); + mutex_lock(&workqueue_mutex); if (is_single_threaded(wq)) cleanup_workqueue_thread(wq, singlethread_cpu); else { for_each_online_cpu(cpu) cleanup_workqueue_thread(wq, cpu); - spin_lock(&workqueue_lock); list_del(&wq->list); - spin_unlock(&workqueue_lock); } - unlock_cpu_hotplug(); + mutex_unlock(&workqueue_mutex); free_percpu(wq->cpu_wq); kfree(wq); } +EXPORT_SYMBOL_GPL(destroy_workqueue); static struct workqueue_struct *keventd_wq; +/** + * schedule_work - put work task in global workqueue + * @work: job to be done + * + * This puts a job in the kernel-global workqueue. + */ int fastcall schedule_work(struct work_struct *work) { return queue_work(keventd_wq, work); } +EXPORT_SYMBOL(schedule_work); +/** + * schedule_delayed_work - put work task in global workqueue after delay + * @work: job to be done + * @delay: number of jiffies to wait + * + * After waiting for a given time this puts a job in the kernel-global + * workqueue. + */ int fastcall schedule_delayed_work(struct work_struct *work, unsigned long delay) { return queue_delayed_work(keventd_wq, work, delay); } +EXPORT_SYMBOL(schedule_delayed_work); +/** + * schedule_delayed_work_on - queue work in global workqueue on CPU after delay + * @cpu: cpu to use + * @work: job to be done + * @delay: number of jiffies to wait + * + * After waiting for a given time this puts a job in the kernel-global + * workqueue on the specified CPU. 
+ */ int schedule_delayed_work_on(int cpu, struct work_struct *work, unsigned long delay) { - int ret = 0; - struct timer_list *timer = &work->timer; - - if (!test_and_set_bit(0, &work->pending)) { - BUG_ON(timer_pending(timer)); - BUG_ON(!list_empty(&work->entry)); - /* This stores keventd_wq for the moment, for the timer_fn */ - work->wq_data = keventd_wq; - timer->expires = jiffies + delay; - timer->data = (unsigned long)work; - timer->function = delayed_work_timer_fn; - add_timer_on(timer, cpu); - ret = 1; - } - return ret; + return queue_delayed_work_on(cpu, keventd_wq, work, delay); } +EXPORT_SYMBOL(schedule_delayed_work_on); /** * schedule_on_each_cpu - call a function on each online CPU from keventd @@ -449,11 +510,13 @@ int schedule_on_each_cpu(void (*func)(void *info), void *info) if (!works) return -ENOMEM; + mutex_lock(&workqueue_mutex); for_each_online_cpu(cpu) { INIT_WORK(per_cpu_ptr(works, cpu), func, info); __queue_work(per_cpu_ptr(keventd_wq->cpu_wq, cpu), per_cpu_ptr(works, cpu)); } + mutex_unlock(&workqueue_mutex); flush_workqueue(keventd_wq); free_percpu(works); return 0; @@ -463,6 +526,7 @@ void flush_scheduled_work(void) { flush_workqueue(keventd_wq); } +EXPORT_SYMBOL(flush_scheduled_work); /** * cancel_rearming_delayed_workqueue - reliably kill off a delayed @@ -568,6 +632,7 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, switch (action) { case CPU_UP_PREPARE: + mutex_lock(&workqueue_mutex); /* Create a new workqueue thread for it. */ list_for_each_entry(wq, &workqueues, list) { if (!create_workqueue_thread(wq, hotcpu)) { @@ -586,6 +651,7 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, kthread_bind(cwq->thread, hotcpu); wake_up_process(cwq->thread); } + mutex_unlock(&workqueue_mutex); break; case CPU_UP_CANCELED: @@ -597,6 +663,15 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, any_online_cpu(cpu_online_map)); cleanup_workqueue_thread(wq, hotcpu); } + mutex_unlock(&workqueue_mutex); + break; + + case CPU_DOWN_PREPARE: + mutex_lock(&workqueue_mutex); + break; + + case CPU_DOWN_FAILED: + mutex_unlock(&workqueue_mutex); break; case CPU_DEAD: @@ -604,6 +679,7 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, cleanup_workqueue_thread(wq, hotcpu); list_for_each_entry(wq, &workqueues, list) take_over_work(wq, hotcpu); + mutex_unlock(&workqueue_mutex); break; } @@ -619,13 +695,3 @@ void init_workqueues(void) BUG_ON(!keventd_wq); } -EXPORT_SYMBOL_GPL(__create_workqueue); -EXPORT_SYMBOL_GPL(queue_work); -EXPORT_SYMBOL_GPL(queue_delayed_work); -EXPORT_SYMBOL_GPL(flush_workqueue); -EXPORT_SYMBOL_GPL(destroy_workqueue); - -EXPORT_SYMBOL(schedule_work); -EXPORT_SYMBOL(schedule_delayed_work); -EXPORT_SYMBOL(schedule_delayed_work_on); -EXPORT_SYMBOL(flush_scheduled_work); |
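
The workqueue.c changes above introduce queue_delayed_work_on() and turn schedule_delayed_work_on() into a thin wrapper around it. A short, hypothetical caller written against the signatures added here (the foo_* names are illustrative only; INIT_WORK uses the three-argument form current in this tree):

	#include <linux/workqueue.h>
	#include <linux/jiffies.h>

	static struct workqueue_struct *foo_wq;
	static struct work_struct foo_poll_work;

	/* Work function: runs later in process context on the chosen CPU. */
	static void foo_poll(void *data)
	{
		/* periodic per-CPU polling body would go here */
	}

	static int foo_start(void)
	{
		foo_wq = create_workqueue("foo");
		if (!foo_wq)
			return -ENOMEM;

		INIT_WORK(&foo_poll_work, foo_poll, NULL);

		/* Queue the work on CPU 0, one second from now. */
		return queue_delayed_work_on(0, foo_wq, &foo_poll_work, HZ) ? 0 : -EBUSY;
	}

Callers that are happy with the kernel-global keventd queue would use schedule_delayed_work_on(cpu, &foo_poll_work, HZ) instead, which after this patch simply forwards to the helper above.
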