diff options
Diffstat (limited to 'kernel')
42 files changed, 1670 insertions, 691 deletions
diff --git a/kernel/async.c b/kernel/async.c index 968ef9457d4..27235f5de19 100644 --- a/kernel/async.c +++ b/kernel/async.c @@ -92,19 +92,18 @@ extern int initcall_debug; static async_cookie_t __lowest_in_progress(struct list_head *running) { struct async_entry *entry; + if (!list_empty(running)) { entry = list_first_entry(running, struct async_entry, list); return entry->cookie; - } else if (!list_empty(&async_pending)) { - entry = list_first_entry(&async_pending, - struct async_entry, list); - return entry->cookie; - } else { - /* nothing in progress... next_cookie is "infinity" */ - return next_cookie; } + list_for_each_entry(entry, &async_pending, list) + if (entry->running == running) + return entry->cookie; + + return next_cookie; /* "infinity" value */ } static async_cookie_t lowest_in_progress(struct list_head *running) diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index a6fe71fd5d1..713098ee5a0 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -1028,7 +1028,7 @@ static void audit_update_watch(struct audit_parent *parent, if (audit_enabled) { struct audit_buffer *ab; - ab = audit_log_start(NULL, GFP_KERNEL, + ab = audit_log_start(NULL, GFP_NOFS, AUDIT_CONFIG_CHANGE); audit_log_format(ab, "auid=%u ses=%u", audit_get_loginuid(current), @@ -1067,7 +1067,7 @@ static void audit_remove_parent_watches(struct audit_parent *parent) e = container_of(r, struct audit_entry, rule); if (audit_enabled) { struct audit_buffer *ab; - ab = audit_log_start(NULL, GFP_KERNEL, + ab = audit_log_start(NULL, GFP_NOFS, AUDIT_CONFIG_CHANGE); audit_log_format(ab, "auid=%u ses=%u", audit_get_loginuid(current), diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 382109b5bae..a7267bfd376 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1133,8 +1133,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type, free_cg_links: free_cg_links(&tmp_cg_links); drop_new_super: - up_write(&sb->s_umount); - deactivate_super(sb); + deactivate_locked_super(sb); return ret; } diff --git a/kernel/fork.c b/kernel/fork.c index b9e2edd0072..875ffbdd96d 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1409,7 +1409,7 @@ long do_fork(unsigned long clone_flags, } audit_finish_fork(p); - tracehook_report_clone(trace, regs, clone_flags, nr, p); + tracehook_report_clone(regs, clone_flags, nr, p); /* * We set PF_STARTING at creation in case tracing wants to diff --git a/kernel/futex.c b/kernel/futex.c index eef8cd26b5e..80b5ce71659 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -19,6 +19,10 @@ * PRIVATE futexes by Eric Dumazet * Copyright (C) 2007 Eric Dumazet <dada1@cosmosbay.com> * + * Requeue-PI support by Darren Hart <dvhltc@us.ibm.com> + * Copyright (C) IBM Corporation, 2009 + * Thanks to Thomas Gleixner for conceptual design and careful reviews. + * * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly * enough at me, Linus for the original (flawed) idea, Matthew * Kirkwood for proof-of-concept implementation. @@ -96,8 +100,8 @@ struct futex_pi_state { */ struct futex_q { struct plist_node list; - /* There can only be a single waiter */ - wait_queue_head_t waiter; + /* Waiter reference */ + struct task_struct *task; /* Which hash list lock to use: */ spinlock_t *lock_ptr; @@ -107,7 +111,9 @@ struct futex_q { /* Optional priority inheritance state: */ struct futex_pi_state *pi_state; - struct task_struct *task; + + /* rt_waiter storage for requeue_pi: */ + struct rt_mutex_waiter *rt_waiter; /* Bitset for the optional bitmasked wakeup */ u32 bitset; @@ -193,6 +199,7 @@ static void drop_futex_key_refs(union futex_key *key) * @uaddr: virtual address of the futex * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED * @key: address where result is stored. + * @rw: mapping needs to be read/write (values: VERIFY_READ, VERIFY_WRITE) * * Returns a negative error code or 0 * The key words are stored in *key on success. @@ -203,7 +210,8 @@ static void drop_futex_key_refs(union futex_key *key) * * lock_page() might sleep, the caller should not hold a spinlock. */ -static int get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key) +static int +get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw) { unsigned long address = (unsigned long)uaddr; struct mm_struct *mm = current->mm; @@ -226,7 +234,7 @@ static int get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key) * but access_ok() should be faster than find_vma() */ if (!fshared) { - if (unlikely(!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))) + if (unlikely(!access_ok(rw, uaddr, sizeof(u32)))) return -EFAULT; key->private.mm = mm; key->private.address = address; @@ -235,7 +243,7 @@ static int get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key) } again: - err = get_user_pages_fast(address, 1, 0, &page); + err = get_user_pages_fast(address, 1, rw == VERIFY_WRITE, &page); if (err < 0) return err; @@ -276,6 +284,25 @@ void put_futex_key(int fshared, union futex_key *key) drop_futex_key_refs(key); } +/** + * futex_top_waiter() - Return the highest priority waiter on a futex + * @hb: the hash bucket the futex_q's reside in + * @key: the futex key (to distinguish it from other futex futex_q's) + * + * Must be called with the hb lock held. + */ +static struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb, + union futex_key *key) +{ + struct futex_q *this; + + plist_for_each_entry(this, &hb->chain, list) { + if (match_futex(&this->key, key)) + return this; + } + return NULL; +} + static u32 cmpxchg_futex_value_locked(u32 __user *uaddr, u32 uval, u32 newval) { u32 curval; @@ -537,28 +564,160 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, return 0; } +/** + * futex_lock_pi_atomic() - atomic work required to acquire a pi aware futex + * @uaddr: the pi futex user address + * @hb: the pi futex hash bucket + * @key: the futex key associated with uaddr and hb + * @ps: the pi_state pointer where we store the result of the + * lookup + * @task: the task to perform the atomic lock work for. This will + * be "current" except in the case of requeue pi. + * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0) + * + * Returns: + * 0 - ready to wait + * 1 - acquired the lock + * <0 - error + * + * The hb->lock and futex_key refs shall be held by the caller. + */ +static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb, + union futex_key *key, + struct futex_pi_state **ps, + struct task_struct *task, int set_waiters) +{ + int lock_taken, ret, ownerdied = 0; + u32 uval, newval, curval; + +retry: + ret = lock_taken = 0; + + /* + * To avoid races, we attempt to take the lock here again + * (by doing a 0 -> TID atomic cmpxchg), while holding all + * the locks. It will most likely not succeed. + */ + newval = task_pid_vnr(task); + if (set_waiters) + newval |= FUTEX_WAITERS; + + curval = cmpxchg_futex_value_locked(uaddr, 0, newval); + + if (unlikely(curval == -EFAULT)) + return -EFAULT; + + /* + * Detect deadlocks. + */ + if ((unlikely((curval & FUTEX_TID_MASK) == task_pid_vnr(task)))) + return -EDEADLK; + + /* + * Surprise - we got the lock. Just return to userspace: + */ + if (unlikely(!curval)) + return 1; + + uval = curval; + + /* + * Set the FUTEX_WAITERS flag, so the owner will know it has someone + * to wake at the next unlock. + */ + newval = curval | FUTEX_WAITERS; + + /* + * There are two cases, where a futex might have no owner (the + * owner TID is 0): OWNER_DIED. We take over the futex in this + * case. We also do an unconditional take over, when the owner + * of the futex died. + * + * This is safe as we are protected by the hash bucket lock ! + */ + if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) { + /* Keep the OWNER_DIED bit */ + newval = (curval & ~FUTEX_TID_MASK) | task_pid_vnr(task); + ownerdied = 0; + lock_taken = 1; + } + + curval = cmpxchg_futex_value_locked(uaddr, uval, newval); + + if (unlikely(curval == -EFAULT)) + return -EFAULT; + if (unlikely(curval != uval)) + goto retry; + + /* + * We took the lock due to owner died take over. + */ + if (unlikely(lock_taken)) + return 1; + + /* + * We dont have the lock. Look up the PI state (or create it if + * we are the first waiter): + */ + ret = lookup_pi_state(uval, hb, key, ps); + + if (unlikely(ret)) { + switch (ret) { + case -ESRCH: + /* + * No owner found for this futex. Check if the + * OWNER_DIED bit is set to figure out whether + * this is a robust futex or not. + */ + if (get_futex_value_locked(&curval, uaddr)) + return -EFAULT; + + /* + * We simply start over in case of a robust + * futex. The code above will take the futex + * and return happy. + */ + if (curval & FUTEX_OWNER_DIED) { + ownerdied = 1; + goto retry; + } + default: + break; + } + } + + return ret; +} + /* * The hash bucket lock must be held when this is called. * Afterwards, the futex_q must not be accessed. */ static void wake_futex(struct futex_q *q) { - plist_del(&q->list, &q->list.plist); + struct task_struct *p = q->task; + /* - * The lock in wake_up_all() is a crucial memory barrier after the - * plist_del() and also before assigning to q->lock_ptr. + * We set q->lock_ptr = NULL _before_ we wake up the task. If + * a non futex wake up happens on another CPU then the task + * might exit and p would dereference a non existing task + * struct. Prevent this by holding a reference on p across the + * wake up. */ - wake_up(&q->waiter); + get_task_struct(p); + + plist_del(&q->list, &q->list.plist); /* - * The waiting task can free the futex_q as soon as this is written, - * without taking any locks. This must come last. - * - * A memory barrier is required here to prevent the following store to - * lock_ptr from getting ahead of the wakeup. Clearing the lock at the - * end of wake_up() does not prevent this store from moving. + * The waiting task can free the futex_q as soon as + * q->lock_ptr = NULL is written, without taking any locks. A + * memory barrier is required here to prevent the following + * store to lock_ptr from getting ahead of the plist_del. */ smp_wmb(); q->lock_ptr = NULL; + + wake_up_state(p, TASK_NORMAL); + put_task_struct(p); } static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) @@ -677,7 +836,7 @@ static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset) if (!bitset) return -EINVAL; - ret = get_futex_key(uaddr, fshared, &key); + ret = get_futex_key(uaddr, fshared, &key, VERIFY_READ); if (unlikely(ret != 0)) goto out; @@ -687,7 +846,7 @@ static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset) plist_for_each_entry_safe(this, next, head, list) { if (match_futex (&this->key, &key)) { - if (this->pi_state) { + if (this->pi_state || this->rt_waiter) { ret = -EINVAL; break; } @@ -723,10 +882,10 @@ futex_wake_op(u32 __user *uaddr1, int fshared, u32 __user *uaddr2, int ret, op_ret; retry: - ret = get_futex_key(uaddr1, fshared, &key1); + ret = get_futex_key(uaddr1, fshared, &key1, VERIFY_READ); if (unlikely(ret != 0)) goto out; - ret = get_futex_key(uaddr2, fshared, &key2); + ret = get_futex_key(uaddr2, fshared, &key2, VERIFY_WRITE); if (unlikely(ret != 0)) goto out_put_key1; @@ -800,24 +959,185 @@ out: return ret; } -/* - * Requeue all waiters hashed on one physical page to another - * physical page. +/** + * requeue_futex() - Requeue a futex_q from one hb to another + * @q: the futex_q to requeue + * @hb1: the source hash_bucket + * @hb2: the target hash_bucket + * @key2: the new key for the requeued futex_q + */ +static inline +void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1, + struct futex_hash_bucket *hb2, union futex_key *key2) +{ + + /* + * If key1 and key2 hash to the same bucket, no need to + * requeue. + */ + if (likely(&hb1->chain != &hb2->chain)) { + plist_del(&q->list, &hb1->chain); + plist_add(&q->list, &hb2->chain); + q->lock_ptr = &hb2->lock; +#ifdef CONFIG_DEBUG_PI_LIST + q->list.plist.lock = &hb2->lock; +#endif + } + get_futex_key_refs(key2); + q->key = *key2; +} + +/** + * requeue_pi_wake_futex() - Wake a task that acquired the lock during requeue + * q: the futex_q + * key: the key of the requeue target futex + * + * During futex_requeue, with requeue_pi=1, it is possible to acquire the + * target futex if it is uncontended or via a lock steal. Set the futex_q key + * to the requeue target futex so the waiter can detect the wakeup on the right + * futex, but remove it from the hb and NULL the rt_waiter so it can detect + * atomic lock acquisition. Must be called with the q->lock_ptr held. + */ +static inline +void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key) +{ + drop_futex_key_refs(&q->key); + get_futex_key_refs(key); + q->key = *key; + + WARN_ON(plist_node_empty(&q->list)); + plist_del(&q->list, &q->list.plist); + + WARN_ON(!q->rt_waiter); + q->rt_waiter = NULL; + + wake_up_state(q->task, TASK_NORMAL); +} + +/** + * futex_proxy_trylock_atomic() - Attempt an atomic lock for the top waiter + * @pifutex: the user address of the to futex + * @hb1: the from futex hash bucket, must be locked by the caller + * @hb2: the to futex hash bucket, must be locked by the caller + * @key1: the from futex key + * @key2: the to futex key + * @ps: address to store the pi_state pointer + * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0) + * + * Try and get the lock on behalf of the top waiter if we can do it atomically. + * Wake the top waiter if we succeed. If the caller specified set_waiters, + * then direct futex_lock_pi_atomic() to force setting the FUTEX_WAITERS bit. + * hb1 and hb2 must be held by the caller. + * + * Returns: + * 0 - failed to acquire the lock atomicly + * 1 - acquired the lock + * <0 - error + */ +static int futex_proxy_trylock_atomic(u32 __user *pifutex, + struct futex_hash_bucket *hb1, + struct futex_hash_bucket *hb2, + union futex_key *key1, union futex_key *key2, + struct futex_pi_state **ps, int set_waiters) +{ + struct futex_q *top_waiter = NULL; + u32 curval; + int ret; + + if (get_futex_value_locked(&curval, pifutex)) + return -EFAULT; + + /* + * Find the top_waiter and determine if there are additional waiters. + * If the caller intends to requeue more than 1 waiter to pifutex, + * force futex_lock_pi_atomic() to set the FUTEX_WAITERS bit now, + * as we have means to handle the possible fault. If not, don't set + * the bit unecessarily as it will force the subsequent unlock to enter + * the kernel. + */ + top_waiter = futex_top_waiter(hb1, key1); + + /* There are no waiters, nothing for us to do. */ + if (!top_waiter) + return 0; + + /* + * Try to take the lock for top_waiter. Set the FUTEX_WAITERS bit in + * the contended case or if set_waiters is 1. The pi_state is returned + * in ps in contended cases. + */ + ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task, + set_waiters); + if (ret == 1) + requeue_pi_wake_futex(top_waiter, key2); + + return ret; +} + +/** + * futex_requeue() - Requeue waiters from uaddr1 to uaddr2 + * uaddr1: source futex user address + * uaddr2: target futex user address + * nr_wake: number of waiters to wake (must be 1 for requeue_pi) + * nr_requeue: number of waiters to requeue (0-INT_MAX) + * requeue_pi: if we are attempting to requeue from a non-pi futex to a + * pi futex (pi to pi requeue is not supported) + * + * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire + * uaddr2 atomically on behalf of the top waiter. + * + * Returns: + * >=0 - on success, the number of tasks requeued or woken + * <0 - on error */ static int futex_requeue(u32 __user *uaddr1, int fshared, u32 __user *uaddr2, - int nr_wake, int nr_requeue, u32 *cmpval) + int nr_wake, int nr_requeue, u32 *cmpval, + int requeue_pi) { union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; + int drop_count = 0, task_count = 0, ret; + struct futex_pi_state *pi_state = NULL; struct futex_hash_bucket *hb1, *hb2; struct plist_head *head1; struct futex_q *this, *next; - int ret, drop_count = 0; + u32 curval2; + + if (requeue_pi) { + /* + * requeue_pi requires a pi_state, try to allocate it now + * without any locks in case it fails. + */ + if (refill_pi_state_cache()) + return -ENOMEM; + /* + * requeue_pi must wake as many tasks as it can, up to nr_wake + * + nr_requeue, since it acquires the rt_mutex prior to + * returning to userspace, so as to not leave the rt_mutex with + * waiters and no owner. However, second and third wake-ups + * cannot be predicted as they involve race conditions with the + * first wake and a fault while looking up the pi_state. Both + * pthread_cond_signal() and pthread_cond_broadcast() should + * use nr_wake=1. + */ + if (nr_wake != 1) + return -EINVAL; + } retry: - ret = get_futex_key(uaddr1, fshared, &key1); + if (pi_state != NULL) { + /* + * We will have to lookup the pi_state again, so free this one + * to keep the accounting correct. + */ + free_pi_state(pi_state); + pi_state = NULL; + } + + ret = get_futex_key(uaddr1, fshared, &key1, VERIFY_READ); if (unlikely(ret != 0)) goto out; - ret = get_futex_key(uaddr2, fshared, &key2); + ret = get_futex_key(uaddr2, fshared, &key2, + requeue_pi ? VERIFY_WRITE : VERIFY_READ); if (unlikely(ret != 0)) goto out_put_key1; @@ -852,32 +1172,99 @@ retry_private: } } + if (requeue_pi && (task_count - nr_wake < nr_requeue)) { + /* + * Attempt to acquire uaddr2 and wake the top waiter. If we + * intend to requeue waiters, force setting the FUTEX_WAITERS + * bit. We force this here where we are able to easily handle + * faults rather in the requeue loop below. + */ + ret = futex_proxy_trylock_atomic(uaddr2, hb1, hb2, &key1, + &key2, &pi_state, nr_requeue); + + /* + * At this point the top_waiter has either taken uaddr2 or is + * waiting on it. If the former, then the pi_state will not + * exist yet, look it up one more time to ensure we have a + * reference to it. + */ + if (ret == 1) { + WARN_ON(pi_state); + task_count++; + ret = get_futex_value_locked(&curval2, uaddr2); + if (!ret) + ret = lookup_pi_state(curval2, hb2, &key2, + &pi_state); + } + + switch (ret) { + case 0: + break; + case -EFAULT: + double_unlock_hb(hb1, hb2); + put_futex_key(fshared, &key2); + put_futex_key(fshared, &key1); + ret = get_user(curval2, uaddr2); + if (!ret) + goto retry; + goto out; + case -EAGAIN: + /* The owner was exiting, try again. */ + double_unlock_hb(hb1, hb2); + put_futex_key(fshared, &key2); + put_futex_key(fshared, &key1); + cond_resched(); + goto retry; + default: + goto out_unlock; + } + } + head1 = &hb1->chain; plist_for_each_entry_safe(this, next, head1, list) { - if (!match_futex (&this->key, &key1)) + if (task_count - nr_wake >= nr_requeue) + break; + + if (!match_futex(&this->key, &key1)) continue; - if (++ret <= nr_wake) { + + WARN_ON(!requeue_pi && this->rt_waiter); + WARN_ON(requeue_pi && !this->rt_waiter); + + /* + * Wake nr_wake waiters. For requeue_pi, if we acquired the + * lock, we already woke the top_waiter. If not, it will be + * woken by futex_unlock_pi(). + */ + if (++task_count <= nr_wake && !requeue_pi) { wake_futex(this); - } else { - /* - * If key1 and key2 hash to the same bucket, no need to - * requeue. - */ - if (likely(head1 != &hb2->chain)) { - plist_del(&this->list, &hb1->chain); - plist_add(&this->list, &hb2->chain); - this->lock_ptr = &hb2->lock; -#ifdef CONFIG_DEBUG_PI_LIST - this->list.plist.lock = &hb2->lock; -#endif - } - this->key = key2; - get_futex_key_refs(&key2); - drop_count++; + continue; + } - if (ret - nr_wake >= nr_requeue) - break; + /* + * Requeue nr_requeue waiters and possibly one more in the case + * of requeue_pi if we couldn't acquire the lock atomically. + */ + if (requeue_pi) { + /* Prepare the waiter to take the rt_mutex. */ + atomic_inc(&pi_state->refcount); + this->pi_state = pi_state; + ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex, + this->rt_waiter, + this->task, 1); + if (ret == 1) { + /* We got the lock. */ + requeue_pi_wake_futex(this, &key2); + continue; + } else if (ret) { + /* -EDEADLK */ + this->pi_state = NULL; + free_pi_state(pi_state); + goto out_unlock; + } } + requeue_futex(this, hb1, hb2, &key2); + drop_count++; } out_unlock: @@ -897,7 +1284,9 @@ out_put_keys: out_put_key1: put_futex_key(fshared, &key1); out: - return ret; + if (pi_state != NULL) + free_pi_state(pi_state); + return ret ? ret : task_count; } /* The key must be already stored in q->key. */ @@ -905,8 +1294,6 @@ static inline struct futex_hash_bucket *queue_lock(struct futex_q *q) { struct futex_hash_bucket *hb; - init_waitqueue_head(&q->waiter); - get_futex_key_refs(&q->key); hb = hash_futex(&q->key); q->lock_ptr = &hb->lock; @@ -1117,35 +1504,149 @@ handle_fault: */ #define FLAGS_SHARED 0x01 #define FLAGS_CLOCKRT 0x02 +#define FLAGS_HAS_TIMEOUT 0x04 static long futex_wait_restart(struct restart_block *restart); -static int futex_wait(u32 __user *uaddr, int fshared, - u32 val, ktime_t *abs_time, u32 bitset, int clockrt) +/** + * fixup_owner() - Post lock pi_state and corner case management + * @uaddr: user address of the futex + * @fshared: whether the futex is shared (1) or not (0) + * @q: futex_q (contains pi_state and access to the rt_mutex) + * @locked: if the attempt to take the rt_mutex succeeded (1) or not (0) + * + * After attempting to lock an rt_mutex, this function is called to cleanup + * the pi_state owner as well as handle race conditions that may allow us to + * acquire the lock. Must be called with the hb lock held. + * + * Returns: + * 1 - success, lock taken + * 0 - success, lock not taken + * <0 - on error (-EFAULT) + */ +static int fixup_owner(u32 __user *uaddr, int fshared, struct futex_q *q, + int locked) { - struct task_struct *curr = current; - struct restart_block *restart; - DECLARE_WAITQUEUE(wait, curr); - struct futex_hash_bucket *hb; - struct futex_q q; - u32 uval; - int ret; - struct hrtimer_sleeper t; - int rem = 0; + struct task_struct *owner; + int ret = 0; - if (!bitset) - return -EINVAL; + if (locked) { + /* + * Got the lock. We might not be the anticipated owner if we + * did a lock-steal - fix up the PI-state in that case: + */ + if (q->pi_state->owner != current) + ret = fixup_pi_state_owner(uaddr, q, current, fshared); + goto out; + } - q.pi_state = NULL; - q.bitset = bitset; -retry: - q.key = FUTEX_KEY_INIT; - ret = get_futex_key(uaddr, fshared, &q.key); - if (unlikely(ret != 0)) + /* + * Catch the rare case, where the lock was released when we were on the + * way back before we locked the hash bucket. + */ + if (q->pi_state->owner == current) { + /* + * Try to get the rt_mutex now. This might fail as some other + * task acquired the rt_mutex after we removed ourself from the + * rt_mutex waiters list. + */ + if (rt_mutex_trylock(&q->pi_state->pi_mutex)) { + locked = 1; + goto out; + } + + /* + * pi_state is incorrect, some other task did a lock steal and + * we returned due to timeout or signal without taking the + * rt_mutex. Too late. We can access the rt_mutex_owner without + * locking, as the other task is now blocked on the hash bucket + * lock. Fix the state up. + */ + owner = rt_mutex_owner(&q->pi_state->pi_mutex); + ret = fixup_pi_state_owner(uaddr, q, owner, fshared); goto out; + } -retry_private: - hb = queue_lock(&q); + /* + * Paranoia check. If we did not take the lock, then we should not be + * the owner, nor the pending owner, of the rt_mutex. + */ + if (rt_mutex_owner(&q->pi_state->pi_mutex) == current) + printk(KERN_ERR "fixup_owner: ret = %d pi-mutex: %p " + "pi-state %p\n", ret, + q->pi_state->pi_mutex.owner, + q->pi_state->owner); + +out: + return ret ? ret : locked; +} + +/** + * futex_wait_queue_me() - queue_me() and wait for wakeup, timeout, or signal + * @hb: the futex hash bucket, must be locked by the caller + * @q: the futex_q to queue up on + * @timeout: the prepared hrtimer_sleeper, or null for no timeout + */ +static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q, + struct hrtimer_sleeper *timeout) +{ + queue_me(q, hb); + + /* + * There might have been scheduling since the queue_me(), as we + * cannot hold a spinlock across the get_user() in case it + * faults, and we cannot just set TASK_INTERRUPTIBLE state when + * queueing ourselves into the futex hash. This code thus has to + * rely on the futex_wake() code removing us from hash when it + * wakes us up. + */ + set_current_state(TASK_INTERRUPTIBLE); + + /* Arm the timer */ + if (timeout) { + hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS); + if (!hrtimer_active(&timeout->timer)) + timeout->task = NULL; + } + + /* + * !plist_node_empty() is safe here without any lock. + * q.lock_ptr != 0 is not safe, because of ordering against wakeup. + */ + if (likely(!plist_node_empty(&q->list))) { + /* + * If the timer has already expired, current will already be + * flagged for rescheduling. Only call schedule if there + * is no timeout, or if it has yet to expire. + */ + if (!timeout || timeout->task) + schedule(); + } + __set_current_state(TASK_RUNNING); +} + +/** + * futex_wait_setup() - Prepare to wait on a futex + * @uaddr: the futex userspace address + * @val: the expected value + * @fshared: whether the futex is shared (1) or not (0) + * @q: the associated futex_q + * @hb: storage for hash_bucket pointer to be returned to caller + * + * Setup the futex_q and locate the hash_bucket. Get the futex value and + * compare it with the expected value. Handle atomic faults internally. + * Return with the hb lock held and a q.key reference on success, and unlocked + * with no q.key reference on failure. + * + * Returns: + * 0 - uaddr contains val and hb has been locked + * <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlcoked + */ +static int futex_wait_setup(u32 __user *uaddr, u32 val, int fshared, + struct futex_q *q, struct futex_hash_bucket **hb) +{ + u32 uval; + int ret; /* * Access the page AFTER the hash-bucket is locked. @@ -1163,95 +1664,83 @@ retry_private: * A consequence is that futex_wait() can return zero and absorb * a wakeup when *uaddr != val on entry to the syscall. This is * rare, but normal. - * - * For shared futexes, we hold the mmap semaphore, so the mapping - * cannot have changed since we looked it up in get_futex_key. */ +retry: + q->key = FUTEX_KEY_INIT; + ret = get_futex_key(uaddr, fshared, &q->key, VERIFY_READ); + if (unlikely(ret != 0)) + return ret; + +retry_private: + *hb = queue_lock(q); + ret = get_futex_value_locked(&uval, uaddr); - if (unlikely(ret)) { - queue_unlock(&q, hb); + if (ret) { + queue_unlock(q, *hb); ret = get_user(uval, uaddr); if (ret) - goto out_put_key; + goto out; if (!fshared) goto retry_private; - put_futex_key(fshared, &q.key); + put_futex_key(fshared, &q->key); goto retry; } - ret = -EWOULDBLOCK; - if (unlikely(uval != val)) { - queue_unlock(&q, hb); - goto out_put_key; - } - /* Only actually queue if *uaddr contained val. */ - queue_me(&q, hb); + if (uval != val) { + queue_unlock(q, *hb); + ret = -EWOULDBLOCK; + } - /* - * There might have been scheduling since the queue_me(), as we - * cannot hold a spinlock across the get_user() in case it - * faults, and we cannot just set TASK_INTERRUPTIBLE state when - * queueing ourselves into the futex hash. This code thus has to - * rely on the futex_wake() code removing us from hash when it - * wakes us up. - */ +out: + if (ret) + put_futex_key(fshared, &q->key); + return ret; +} - /* add_wait_queue is the barrier after __set_current_state. */ - __set_current_state(TASK_INTERRUPTIBLE); - add_wait_queue(&q.waiter, &wait); - /* - * !plist_node_empty() is safe here without any lock. - * q.lock_ptr != 0 is not safe, because of ordering against wakeup. - */ - if (likely(!plist_node_empty(&q.list))) { - if (!abs_time) - schedule(); - else { - hrtimer_init_on_stack(&t.timer, - clockrt ? CLOCK_REALTIME : - CLOCK_MONOTONIC, - HRTIMER_MODE_ABS); - hrtimer_init_sleeper(&t, current); - hrtimer_set_expires_range_ns(&t.timer, *abs_time, - current->timer_slack_ns); - - hrtimer_start_expires(&t.timer, HRTIMER_MODE_ABS); - if (!hrtimer_active(&t.timer)) - t.task = NULL; +static int futex_wait(u32 __user *uaddr, int fshared, + u32 val, ktime_t *abs_time, u32 bitset, int clockrt) +{ + struct hrtimer_sleeper timeout, *to = NULL; + struct restart_block *restart; + struct futex_hash_bucket *hb; + struct futex_q q; + int ret; - /* - * the timer could have already expired, in which - * case current would be flagged for rescheduling. - * Don't bother calling schedule. - */ - if (likely(t.task)) - schedule(); + if (!bitset) + return -EINVAL; - hrtimer_cancel(&t.timer); + q.pi_state = NULL; + q.bitset = bitset; + q.rt_waiter = NULL; - /* Flag if a timeout occured */ - rem = (t.task == NULL); + if (abs_time) { + to = &timeout; - destroy_hrtimer_on_stack(&t.timer); - } + hrtimer_init_on_stack(&to->timer, clockrt ? CLOCK_REALTIME : + CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + hrtimer_init_sleeper(to, current); + hrtimer_set_expires_range_ns(&to->timer, *abs_time, + current->timer_slack_ns); } - __set_current_state(TASK_RUNNING); - /* - * NOTE: we don't remove ourselves from the waitqueue because - * we are the only user of it. - */ + /* Prepare to wait on uaddr. */ + ret = futex_wait_setup(uaddr, val, fshared, &q, &hb); + if (ret) + goto out; + + /* queue_me and wait for wakeup, timeout, or a signal. */ + futex_wait_queue_me(hb, &q, to); /* If we were woken (and unqueued), we succeeded, whatever. */ ret = 0; if (!unqueue_me(&q)) goto out_put_key; ret = -ETIMEDOUT; - if (rem) + if (to && !to->task) goto out_put_key; /* @@ -1268,7 +1757,7 @@ retry_private: restart->futex.val = val; restart->futex.time = abs_time->tv64; restart->futex.bitset = bitset; - restart->futex.flags = 0; + restart->futex.flags = FLAGS_HAS_TIMEOUT; if (fshared) restart->futex.flags |= FLAGS_SHARED; @@ -1280,6 +1769,10 @@ retry_private: out_put_key: put_futex_key(fshared, &q.key); out: + if (to) { + hrtimer_cancel(&to->timer); + destroy_hrtimer_on_stack(&to->timer); + } return ret; } @@ -1288,13 +1781,16 @@ static long futex_wait_restart(struct restart_block *restart) { u32 __user *uaddr = (u32 __user *)restart->futex.uaddr; int fshared = 0; - ktime_t t; + ktime_t t, *tp = NULL; - t.tv64 = restart->futex.time; + if (restart->futex.flags & FLAGS_HAS_TIMEOUT) { + t.tv64 = restart->futex.time; + tp = &t; + } restart->fn = do_no_restart_syscall; if (restart->futex.flags & FLAGS_SHARED) fshared = 1; - return (long)futex_wait(uaddr, fshared, restart->futex.val, &t, + return (long)futex_wait(uaddr, fshared, restart->futex.val, tp, restart->futex.bitset, restart->futex.flags & FLAGS_CLOCKRT); } @@ -1310,11 +1806,10 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared, int detect, ktime_t *time, int trylock) { struct hrtimer_sleeper timeout, *to = NULL; - struct task_struct *curr = current; struct futex_hash_bucket *hb; - u32 uval, newval, curval; + u32 uval; struct futex_q q; - int ret, lock_taken, ownerdied = 0; + int res, ret; if (refill_pi_state_cache()) return -ENOMEM; @@ -1328,90 +1823,25 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared, } q.pi_state = NULL; + q.rt_waiter = NULL; retry: q.key = FUTEX_KEY_INIT; - ret = get_futex_key(uaddr, fshared, &q.key); + ret = get_futex_key(uaddr, fshared, &q.key, VERIFY_WRITE); if (unlikely(ret != 0)) goto out; retry_private: hb = queue_lock(&q); -retry_locked: - ret = lock_taken = 0; - - /* - * To avoid races, we attempt to take the lock here again - * (by doing a 0 -> TID atomic cmpxchg), while holding all - * the locks. It will most likely not succeed. - */ - newval = task_pid_vnr(current); - - curval = cmpxchg_futex_value_locked(uaddr, 0, newval); - - if (unlikely(curval == -EFAULT)) - goto uaddr_faulted; - - /* - * Detect deadlocks. In case of REQUEUE_PI this is a valid - * situation and we return success to user space. - */ - if (unlikely((curval & FUTEX_TID_MASK) == task_pid_vnr(current))) { - ret = -EDEADLK; - goto out_unlock_put_key; - } - - /* - * Surprise - we got the lock. Just return to userspace: - */ - if (unlikely(!curval)) - goto out_unlock_put_key; - - uval = curval; - - /* - * Set the WAITERS flag, so the owner will know it has someone - * to wake at next unlock - */ - newval = curval | FUTEX_WAITERS; - - /* - * There are two cases, where a futex might have no owner (the - * owner TID is 0): OWNER_DIED. We take over the futex in this - * case. We also do an unconditional take over, when the owner - * of the futex died. - * - * This is safe as we are protected by the hash bucket lock ! - */ - if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) { - /* Keep the OWNER_DIED bit */ - newval = (curval & ~FUTEX_TID_MASK) | task_pid_vnr(current); - ownerdied = 0; - lock_taken = 1; - } - - curval = cmpxchg_futex_value_locked(uaddr, uval, newval); - - if (unlikely(curval == -EFAULT)) - goto uaddr_faulted; - if (unlikely(curval != uval)) - goto retry_locked; - - /* - * We took the lock due to owner died take over. - */ - if (unlikely(lock_taken)) - goto out_unlock_put_key; - - /* - * We dont have the lock. Look up the PI state (or create it if - * we are the first waiter): - */ - ret = lookup_pi_state(uval, hb, &q.key, &q.pi_state); - + ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current, 0); if (unlikely(ret)) { switch (ret) { - + case 1: + /* We got the lock. */ + ret = 0; + goto out_unlock_put_key; + case -EFAULT: + goto uaddr_faulted; case -EAGAIN: /* * Task is exiting and we just wait for the @@ -1421,25 +1851,6 @@ retry_locked: put_futex_key(fshared, &q.key); cond_resched(); goto retry; - - case -ESRCH: - /* - * No owner found for this futex. Check if the - * OWNER_DIED bit is set to figure out whether - * this is a robust futex or not. - */ - if (get_futex_value_locked(&curval, uaddr)) - goto uaddr_faulted; - - /* - * We simply start over in case of a robust - * futex. The code above will take the futex - * and return happy. - */ - if (curval & FUTEX_OWNER_DIED) { - ownerdied = 1; - goto retry_locked; - } default: goto out_unlock_put_key; } @@ -1463,71 +1874,21 @@ retry_locked: } spin_lock(q.lock_ptr); - - if (!ret) { - /* - * Got the lock. We might not be the anticipated owner - * if we did a lock-steal - fix up the PI-state in - * that case: - */ - if (q.pi_state->owner != curr) - ret = fixup_pi_state_owner(uaddr, &q, curr, fshared); - } else { - /* - * Catch the rare case, where the lock was released - * when we were on the way back before we locked the - * hash bucket. - */ - if (q.pi_state->owner == curr) { - /* - * Try to get the rt_mutex now. This might - * fail as some other task acquired the - * rt_mutex after we removed ourself from the - * rt_mutex waiters list. - */ - if (rt_mutex_trylock(&q.pi_state->pi_mutex)) - ret = 0; - else { - /* - * pi_state is incorrect, some other - * task did a lock steal and we - * returned due to timeout or signal - * without taking the rt_mutex. Too - * late. We can access the - * rt_mutex_owner without locking, as - * the other task is now blocked on - * the hash bucket lock. Fix the state - * up. - */ - struct task_struct *owner; - int res; - - owner = rt_mutex_owner(&q.pi_state->pi_mutex); - res = fixup_pi_state_owner(uaddr, &q, owner, - fshared); - - /* propagate -EFAULT, if the fixup failed */ - if (res) - ret = res; - } - } else { - /* - * Paranoia check. If we did not take the lock - * in the trylock above, then we should not be - * the owner of the rtmutex, neither the real - * nor the pending one: - */ - if (rt_mutex_owner(&q.pi_state->pi_mutex) == curr) - printk(KERN_ERR "futex_lock_pi: ret = %d " - "pi-mutex: %p pi-state %p\n", ret, - q.pi_state->pi_mutex.owner, - q.pi_state->owner); - } - } + /* + * Fixup the pi_state owner and possibly acquire the lock if we + * haven't already. + */ + res = fixup_owner(uaddr, fshared, &q, !ret); + /* + * If fixup_owner() returned an error, proprogate that. If it acquired + * the lock, clear our -ETIMEDOUT or -EINTR. + */ + if (res) + ret = (res < 0) ? res : 0; /* - * If fixup_pi_state_owner() faulted and was unable to handle the - * fault, unlock it and return the fault to userspace. + * If fixup_owner() faulted and was unable to handle the fault, unlock + * it and return the fault to userspace. */ if (ret && (rt_mutex_owner(&q.pi_state->pi_mutex) == current)) rt_mutex_unlock(&q.pi_state->pi_mutex); @@ -1535,9 +1896,7 @@ retry_locked: /* Unqueue and drop the lock */ unqueue_me_pi(&q); - if (to) - destroy_hrtimer_on_stack(&to->timer); - return ret != -EINTR ? ret : -ERESTARTNOINTR; + goto out; out_unlock_put_key: queue_unlock(&q, hb); @@ -1547,7 +1906,7 @@ out_put_key: out: if (to) destroy_hrtimer_on_stack(&to->timer); - return ret; + return ret != -EINTR ? ret : -ERESTARTNOINTR; uaddr_faulted: /* @@ -1570,7 +1929,6 @@ uaddr_faulted: goto retry; } - /* * Userspace attempted a TID -> 0 atomic transition, and failed. * This is the in-kernel slowpath: we look up the PI state (if any), @@ -1594,7 +1952,7 @@ retry: if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current)) return -EPERM; - ret = get_futex_key(uaddr, fshared, &key); + ret = get_futex_key(uaddr, fshared, &key, VERIFY_WRITE); if (unlikely(ret != 0)) goto out; @@ -1672,6 +2030,229 @@ pi_faulted: return ret; } +/** + * handle_early_requeue_pi_wakeup() - Detect early wakeup on the initial futex + * @hb: the hash_bucket futex_q was original enqueued on + * @q: the futex_q woken while waiting to be requeued + * @key2: the futex_key of the requeue target futex + * @timeout: the timeout associated with the wait (NULL if none) + * + * Detect if the task was woken on the initial futex as opposed to the requeue + * target futex. If so, determine if it was a timeout or a signal that caused + * the wakeup and return the appropriate error code to the caller. Must be + * called with the hb lock held. + * + * Returns + * 0 - no early wakeup detected + * <0 - -ETIMEDOUT or -ERESTARTNOINTR + */ +static inline +int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, + struct futex_q *q, union futex_key *key2, + struct hrtimer_sleeper *timeout) +{ + int ret = 0; + + /* + * With the hb lock held, we avoid races while we process the wakeup. + * We only need to hold hb (and not hb2) to ensure atomicity as the + * wakeup code can't change q.key from uaddr to uaddr2 if we hold hb. + * It can't be requeued from uaddr2 to something else since we don't + * support a PI aware source futex for requeue. + */ + if (!match_futex(&q->key, key2)) { + WARN_ON(q->lock_ptr && (&hb->lock != q->lock_ptr)); + /* + * We were woken prior to requeue by a timeout or a signal. + * Unqueue the futex_q and determine which it was. + */ + plist_del(&q->list, &q->list.plist); + drop_futex_key_refs(&q->key); + + if (timeout && !timeout->task) + ret = -ETIMEDOUT; + else + ret = -ERESTARTNOINTR; + } + return ret; +} + +/** + * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2 + * @uaddr: the futex we initialyl wait on (non-pi) + * @fshared: whether the futexes are shared (1) or not (0). They must be + * the same type, no requeueing from private to shared, etc. + * @val: the expected value of uaddr + * @abs_time: absolute timeout + * @bitset: 32 bit wakeup bitset set by userspace, defaults to all. + * @clockrt: whether to use CLOCK_REALTIME (1) or CLOCK_MONOTONIC (0) + * @uaddr2: the pi futex we will take prior to returning to user-space + * + * The caller will wait on uaddr and will be requeued by futex_requeue() to + * uaddr2 which must be PI aware. Normal wakeup will wake on uaddr2 and + * complete the acquisition of the rt_mutex prior to returning to userspace. + * This ensures the rt_mutex maintains an owner when it has waiters; without + * one, the pi logic wouldn't know which task to boost/deboost, if there was a + * need to. + * + * We call schedule in futex_wait_queue_me() when we enqueue and return there + * via the following: + * 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue() + * 2) wakeup on uaddr2 after a requeue and subsequent unlock + * 3) signal (before or after requeue) + * 4) timeout (before or after requeue) + * + * If 3, we setup a restart_block with futex_wait_requeue_pi() as the function. + * + * If 2, we may then block on trying to take the rt_mutex and return via: + * 5) successful lock + * 6) signal + * 7) timeout + * 8) other lock acquisition failure + * + * If 6, we setup a restart_block with futex_lock_pi() as the function. + * + * If 4 or 7, we cleanup and return with -ETIMEDOUT. + * + * Returns: + * 0 - On success + * <0 - On error + */ +static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared, + u32 val, ktime_t *abs_time, u32 bitset, + int clockrt, u32 __user *uaddr2) +{ + struct hrtimer_sleeper timeout, *to = NULL; + struct rt_mutex_waiter rt_waiter; + struct rt_mutex *pi_mutex = NULL; + struct futex_hash_bucket *hb; + union futex_key key2; + struct futex_q q; + int res, ret; + + if (!bitset) + return -EINVAL; + + if (abs_time) { + to = &timeout; + hrtimer_init_on_stack(&to->timer, clockrt ? CLOCK_REALTIME : + CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + hrtimer_init_sleeper(to, current); + hrtimer_set_expires_range_ns(&to->timer, *abs_time, + current->timer_slack_ns); + } + + /* + * The waiter is allocated on our stack, manipulated by the requeue + * code while we sleep on uaddr. + */ + debug_rt_mutex_init_waiter(&rt_waiter); + rt_waiter.task = NULL; + + q.pi_state = NULL; + q.bitset = bitset; + q.rt_waiter = &rt_waiter; + + key2 = FUTEX_KEY_INIT; + ret = get_futex_key(uaddr2, fshared, &key2, VERIFY_WRITE); + if (unlikely(ret != 0)) + goto out; + + /* Prepare to wait on uaddr. */ + ret = futex_wait_setup(uaddr, val, fshared, &q, &hb); + if (ret) + goto out_key2; + + /* Queue the futex_q, drop the hb lock, wait for wakeup. */ + futex_wait_queue_me(hb, &q, to); + + spin_lock(&hb->lock); + ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to); + spin_unlock(&hb->lock); + if (ret) + goto out_put_keys; + + /* + * In order for us to be here, we know our q.key == key2, and since + * we took the hb->lock above, we also know that futex_requeue() has + * completed and we no longer have to concern ourselves with a wakeup + * race with the atomic proxy lock acquition by the requeue code. + */ + + /* Check if the requeue code acquired the second futex for us. */ + if (!q.rt_waiter) { + /* + * Got the lock. We might not be the anticipated owner if we + * did a lock-steal - fix up the PI-state in that case. + */ + if (q.pi_state && (q.pi_state->owner != current)) { + spin_lock(q.lock_ptr); + ret = fixup_pi_state_owner(uaddr2, &q, current, + fshared); + spin_unlock(q.lock_ptr); + } + } else { + /* + * We have been woken up by futex_unlock_pi(), a timeout, or a + * signal. futex_unlock_pi() will not destroy the lock_ptr nor + * the pi_state. + */ + WARN_ON(!&q.pi_state); + pi_mutex = &q.pi_state->pi_mutex; + ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter, 1); + debug_rt_mutex_free_waiter(&rt_waiter); + + spin_lock(q.lock_ptr); + /* + * Fixup the pi_state owner and possibly acquire the lock if we + * haven't already. + */ + res = fixup_owner(uaddr2, fshared, &q, !ret); + /* + * If fixup_owner() returned an error, proprogate that. If it + * acquired the lock, clear our -ETIMEDOUT or -EINTR. + */ + if (res) + ret = (res < 0) ? res : 0; + + /* Unqueue and drop the lock. */ + unqueue_me_pi(&q); + } + + /* + * If fixup_pi_state_owner() faulted and was unable to handle the + * fault, unlock the rt_mutex and return the fault to userspace. + */ + if (ret == -EFAULT) { + if (rt_mutex_owner(pi_mutex) == current) + rt_mutex_unlock(pi_mutex); + } else if (ret == -EINTR) { + /* + * We've already been requeued, but we have no way to + * restart by calling futex_lock_pi() directly. We + * could restart the syscall, but that will look at + * the user space value and return right away. So we + * drop back with EWOULDBLOCK to tell user space that + * "val" has been changed. That's the same what the + * restart of the syscall would do in + * futex_wait_setup(). + */ + ret = -EWOULDBLOCK; + } + +out_put_keys: + put_futex_key(fshared, &q.key); +out_key2: + put_futex_key(fshared, &key2); + +out: + if (to) { + hrtimer_cancel(&to->timer); + destroy_hrtimer_on_stack(&to->timer); + } + return ret; +} + /* * Support for robust futexes: the kernel cleans up held futexes at * thread exit time. @@ -1894,7 +2475,7 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, fshared = 1; clockrt = op & FUTEX_CLOCK_REALTIME; - if (clockrt && cmd != FUTEX_WAIT_BITSET) + if (clockrt && cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI) return -ENOSYS; switch (cmd) { @@ -1909,10 +2490,11 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, ret = futex_wake(uaddr, fshared, val, val3); break; case FUTEX_REQUEUE: - ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL); + ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL, 0); break; case FUTEX_CMP_REQUEUE: - ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3); + ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3, + 0); break; case FUTEX_WAKE_OP: ret = futex_wake_op(uaddr, fshared, uaddr2, val, val2, val3); @@ -1929,6 +2511,15 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, if (futex_cmpxchg_enabled) ret = futex_lock_pi(uaddr, fshared, 0, timeout, 1); break; + case FUTEX_WAIT_REQUEUE_PI: + val3 = FUTEX_BITSET_MATCH_ANY; + ret = futex_wait_requeue_pi(uaddr, fshared, val, timeout, val3, + clockrt, uaddr2); + break; + case FUTEX_CMP_REQUEUE_PI: + ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3, + 1); + break; default: ret = -ENOSYS; } @@ -1946,7 +2537,8 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, int cmd = op & FUTEX_CMD_MASK; if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI || - cmd == FUTEX_WAIT_BITSET)) { + cmd == FUTEX_WAIT_BITSET || + cmd == FUTEX_WAIT_REQUEUE_PI)) { if (copy_from_user(&ts, utime, sizeof(ts)) != 0) return -EFAULT; if (!timespec_valid(&ts)) @@ -1958,11 +2550,11 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, tp = &t; } /* - * requeue parameter in 'utime' if cmd == FUTEX_REQUEUE. + * requeue parameter in 'utime' if cmd == FUTEX_*_REQUEUE_*. * number of waiters to wake in 'utime' if cmd == FUTEX_WAKE_OP. */ if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE || - cmd == FUTEX_WAKE_OP) + cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP) val2 = (u32) (unsigned long) utime; return do_futex(uaddr, op, val, tp, uaddr2, val2, val3); diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile index 3394f8f5296..7d047808419 100644 --- a/kernel/irq/Makefile +++ b/kernel/irq/Makefile @@ -3,5 +3,5 @@ obj-y := handle.o manage.o spurious.o resend.o chip.o devres.o obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o obj-$(CONFIG_PROC_FS) += proc.o obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o -obj-$(CONFIG_NUMA_MIGRATE_IRQ_DESC) += numa_migrate.o +obj-$(CONFIG_NUMA_IRQ_DESC) += numa_migrate.o obj-$(CONFIG_PM_SLEEP) += pm.o diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index c687ba4363f..13c68e71b72 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -359,7 +359,6 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc) spin_lock(&desc->lock); mask_ack_irq(desc, irq); - desc = irq_remap_to_desc(irq, desc); if (unlikely(desc->status & IRQ_INPROGRESS)) goto out_unlock; @@ -438,7 +437,6 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) desc->status &= ~IRQ_INPROGRESS; out: desc->chip->eoi(irq); - desc = irq_remap_to_desc(irq, desc); spin_unlock(&desc->lock); } @@ -475,7 +473,6 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) !desc->action)) { desc->status |= (IRQ_PENDING | IRQ_MASKED); mask_ack_irq(desc, irq); - desc = irq_remap_to_desc(irq, desc); goto out_unlock; } kstat_incr_irqs_this_cpu(irq, desc); @@ -483,7 +480,6 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) /* Start handling the irq */ if (desc->chip->ack) desc->chip->ack(irq); - desc = irq_remap_to_desc(irq, desc); /* Mark the IRQ currently in progress.*/ desc->status |= IRQ_INPROGRESS; @@ -544,10 +540,8 @@ handle_percpu_irq(unsigned int irq, struct irq_desc *desc) if (!noirqdebug) note_interrupt(irq, desc, action_ret); - if (desc->chip->eoi) { + if (desc->chip->eoi) desc->chip->eoi(irq); - desc = irq_remap_to_desc(irq, desc); - } } void @@ -582,10 +576,8 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, /* Uninstall? */ if (handle == handle_bad_irq) { - if (desc->chip != &no_irq_chip) { + if (desc->chip != &no_irq_chip) mask_ack_irq(desc, irq); - desc = irq_remap_to_desc(irq, desc); - } desc->status |= IRQ_DISABLED; desc->depth = 1; } diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index d82142be8dd..18041a254d3 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -11,6 +11,7 @@ */ #include <linux/irq.h> +#include <linux/slab.h> #include <linux/module.h> #include <linux/random.h> #include <linux/interrupt.h> @@ -81,45 +82,48 @@ static struct irq_desc irq_desc_init = { .lock = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock), }; -void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr) +void __ref init_kstat_irqs(struct irq_desc *desc, int node, int nr) { - int node; void *ptr; - node = cpu_to_node(cpu); - ptr = kzalloc_node(nr * sizeof(*desc->kstat_irqs), GFP_ATOMIC, node); + if (slab_is_available()) + ptr = kzalloc_node(nr * sizeof(*desc->kstat_irqs), + GFP_ATOMIC, node); + else + ptr = alloc_bootmem_node(NODE_DATA(node), + nr * sizeof(*desc->kstat_irqs)); /* * don't overwite if can not get new one * init_copy_kstat_irqs() could still use old one */ if (ptr) { - printk(KERN_DEBUG " alloc kstat_irqs on cpu %d node %d\n", - cpu, node); + printk(KERN_DEBUG " alloc kstat_irqs on node %d\n", node); desc->kstat_irqs = ptr; } } -static void init_one_irq_desc(int irq, struct irq_desc *desc, int cpu) +static void init_one_irq_desc(int irq, struct irq_desc *desc, int node) { memcpy(desc, &irq_desc_init, sizeof(struct irq_desc)); spin_lock_init(&desc->lock); desc->irq = irq; #ifdef CONFIG_SMP - desc->cpu = cpu; + desc->node = node; #endif lockdep_set_class(&desc->lock, &irq_desc_lock_class); - init_kstat_irqs(desc, cpu, nr_cpu_ids); + init_kstat_irqs(desc, node, nr_cpu_ids); if (!desc->kstat_irqs) { printk(KERN_ERR "can not alloc kstat_irqs\n"); BUG_ON(1); } - if (!init_alloc_desc_masks(desc, cpu, false)) { + if (!alloc_desc_masks(desc, node, false)) { printk(KERN_ERR "can not alloc irq_desc cpumasks\n"); BUG_ON(1); } - arch_init_chip_data(desc, cpu); + init_desc_masks(desc); + arch_init_chip_data(desc, node); } /* @@ -169,7 +173,8 @@ int __init early_irq_init(void) desc[i].irq = i; desc[i].kstat_irqs = kstat_irqs_legacy + i * nr_cpu_ids; lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); - init_alloc_desc_masks(&desc[i], 0, true); + alloc_desc_masks(&desc[i], 0, true); + init_desc_masks(&desc[i]); irq_desc_ptrs[i] = desc + i; } @@ -187,11 +192,10 @@ struct irq_desc *irq_to_desc(unsigned int irq) return NULL; } -struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu) +struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node) { struct irq_desc *desc; unsigned long flags; - int node; if (irq >= nr_irqs) { WARN(1, "irq (%d) >= nr_irqs (%d) in irq_to_desc_alloc\n", @@ -210,15 +214,17 @@ struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu) if (desc) goto out_unlock; - node = cpu_to_node(cpu); - desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node); - printk(KERN_DEBUG " alloc irq_desc for %d on cpu %d node %d\n", - irq, cpu, node); + if (slab_is_available()) + desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node); + else + desc = alloc_bootmem_node(NODE_DATA(node), sizeof(*desc)); + + printk(KERN_DEBUG " alloc irq_desc for %d on node %d\n", irq, node); if (!desc) { printk(KERN_ERR "can not alloc irq_desc\n"); BUG_ON(1); } - init_one_irq_desc(irq, desc, cpu); + init_one_irq_desc(irq, desc, node); irq_desc_ptrs[irq] = desc; @@ -256,7 +262,8 @@ int __init early_irq_init(void) for (i = 0; i < count; i++) { desc[i].irq = i; - init_alloc_desc_masks(&desc[i], 0, true); + alloc_desc_masks(&desc[i], 0, true); + init_desc_masks(&desc[i]); desc[i].kstat_irqs = kstat_irqs_all[i]; } return arch_early_irq_init(); @@ -267,7 +274,7 @@ struct irq_desc *irq_to_desc(unsigned int irq) return (irq < NR_IRQS) ? irq_desc + irq : NULL; } -struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu) +struct irq_desc *irq_to_desc_alloc_node(unsigned int irq, int node) { return irq_to_desc(irq); } @@ -363,8 +370,6 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action) irqreturn_t ret, retval = IRQ_NONE; unsigned int status = 0; - WARN_ONCE(!in_irq(), "BUG: IRQ handler called from non-hardirq context!"); - if (!(action->flags & IRQF_DISABLED)) local_irq_enable_in_hardirq(); @@ -455,11 +460,8 @@ unsigned int __do_IRQ(unsigned int irq) /* * No locking required for CPU-local interrupts: */ - if (desc->chip->ack) { + if (desc->chip->ack) desc->chip->ack(irq); - /* get new one */ - desc = irq_remap_to_desc(irq, desc); - } if (likely(!(desc->status & IRQ_DISABLED))) { action_ret = handle_IRQ_event(irq, desc->action); if (!noirqdebug) @@ -470,10 +472,8 @@ unsigned int __do_IRQ(unsigned int irq) } spin_lock(&desc->lock); - if (desc->chip->ack) { + if (desc->chip->ack) desc->chip->ack(irq); - desc = irq_remap_to_desc(irq, desc); - } /* * REPLAY is when Linux resends an IRQ that was dropped earlier * WAITING is used by probe to mark irqs that are being tested diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 01ce20eab38..73468253143 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -16,7 +16,7 @@ extern void __disable_irq(struct irq_desc *desc, unsigned int irq, bool susp); extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume); extern struct lock_class_key irq_desc_lock_class; -extern void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr); +extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr); extern void clear_kstat_irqs(struct irq_desc *desc); extern spinlock_t sparse_irq_lock; @@ -42,6 +42,9 @@ static inline void unregister_handler_proc(unsigned int irq, extern int irq_select_affinity_usr(unsigned int irq); +extern void +irq_set_thread_affinity(struct irq_desc *desc, const struct cpumask *cpumask); + /* * Debugging printout: */ diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 2734eca5924..aaf5c9d0577 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -80,7 +80,7 @@ int irq_can_set_affinity(unsigned int irq) return 1; } -static void +void irq_set_thread_affinity(struct irq_desc *desc, const struct cpumask *cpumask) { struct irqaction *action = desc->action; @@ -109,17 +109,22 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask) spin_lock_irqsave(&desc->lock, flags); #ifdef CONFIG_GENERIC_PENDING_IRQ - if (desc->status & IRQ_MOVE_PCNTXT) - desc->chip->set_affinity(irq, cpumask); + if (desc->status & IRQ_MOVE_PCNTXT) { + if (!desc->chip->set_affinity(irq, cpumask)) { + cpumask_copy(desc->affinity, cpumask); + irq_set_thread_affinity(desc, cpumask); + } + } else { desc->status |= IRQ_MOVE_PENDING; cpumask_copy(desc->pending_mask, cpumask); } #else - cpumask_copy(desc->affinity, cpumask); - desc->chip->set_affinity(irq, cpumask); + if (!desc->chip->set_affinity(irq, cpumask)) { + cpumask_copy(desc->affinity, cpumask); + irq_set_thread_affinity(desc, cpumask); + } #endif - irq_set_thread_affinity(desc, cpumask); desc->status |= IRQ_AFFINITY_SET; spin_unlock_irqrestore(&desc->lock, flags); return 0; diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index e05ad9be43b..cfe767ca154 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c @@ -1,5 +1,8 @@ #include <linux/irq.h> +#include <linux/interrupt.h> + +#include "internals.h" void move_masked_irq(int irq) { @@ -39,11 +42,12 @@ void move_masked_irq(int irq) * masking the irqs. */ if (likely(cpumask_any_and(desc->pending_mask, cpu_online_mask) - < nr_cpu_ids)) { - cpumask_and(desc->affinity, - desc->pending_mask, cpu_online_mask); - desc->chip->set_affinity(irq, desc->affinity); - } + < nr_cpu_ids)) + if (!desc->chip->set_affinity(irq, desc->pending_mask)) { + cpumask_copy(desc->affinity, desc->pending_mask); + irq_set_thread_affinity(desc, desc->pending_mask); + } + cpumask_clear(desc->pending_mask); } diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c index 44bbdcbaf8d..2f69bee57bf 100644 --- a/kernel/irq/numa_migrate.c +++ b/kernel/irq/numa_migrate.c @@ -15,9 +15,9 @@ static void init_copy_kstat_irqs(struct irq_desc *old_desc, struct irq_desc *desc, - int cpu, int nr) + int node, int nr) { - init_kstat_irqs(desc, cpu, nr); + init_kstat_irqs(desc, node, nr); if (desc->kstat_irqs != old_desc->kstat_irqs) memcpy(desc->kstat_irqs, old_desc->kstat_irqs, @@ -34,20 +34,20 @@ static void free_kstat_irqs(struct irq_desc *old_desc, struct irq_desc *desc) } static bool init_copy_one_irq_desc(int irq, struct irq_desc *old_desc, - struct irq_desc *desc, int cpu) + struct irq_desc *desc, int node) { memcpy(desc, old_desc, sizeof(struct irq_desc)); - if (!init_alloc_desc_masks(desc, cpu, false)) { + if (!alloc_desc_masks(desc, node, false)) { printk(KERN_ERR "irq %d: can not get new irq_desc cpumask " "for migration.\n", irq); return false; } spin_lock_init(&desc->lock); - desc->cpu = cpu; + desc->node = node; lockdep_set_class(&desc->lock, &irq_desc_lock_class); - init_copy_kstat_irqs(old_desc, desc, cpu, nr_cpu_ids); + init_copy_kstat_irqs(old_desc, desc, node, nr_cpu_ids); init_copy_desc_masks(old_desc, desc); - arch_init_copy_chip_data(old_desc, desc, cpu); + arch_init_copy_chip_data(old_desc, desc, node); return true; } @@ -59,12 +59,11 @@ static void free_one_irq_desc(struct irq_desc *old_desc, struct irq_desc *desc) } static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc, - int cpu) + int node) { struct irq_desc *desc; unsigned int irq; unsigned long flags; - int node; irq = old_desc->irq; @@ -76,7 +75,6 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc, if (desc && old_desc != desc) goto out_unlock; - node = cpu_to_node(cpu); desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node); if (!desc) { printk(KERN_ERR "irq %d: can not get new irq_desc " @@ -85,7 +83,7 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc, desc = old_desc; goto out_unlock; } - if (!init_copy_one_irq_desc(irq, old_desc, desc, cpu)) { + if (!init_copy_one_irq_desc(irq, old_desc, desc, node)) { /* still use old one */ kfree(desc); desc = old_desc; @@ -97,9 +95,7 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc, /* free the old one */ free_one_irq_desc(old_desc, desc); - spin_unlock(&old_desc->lock); kfree(old_desc); - spin_lock(&desc->lock); return desc; @@ -109,24 +105,14 @@ out_unlock: return desc; } -struct irq_desc *move_irq_desc(struct irq_desc *desc, int cpu) +struct irq_desc *move_irq_desc(struct irq_desc *desc, int node) { - int old_cpu; - int node, old_node; - /* those all static, do move them */ if (desc->irq < NR_IRQS_LEGACY) return desc; - old_cpu = desc->cpu; - if (old_cpu != cpu) { - node = cpu_to_node(cpu); - old_node = cpu_to_node(old_cpu); - if (old_node != node) - desc = __real_move_irq_desc(desc, cpu); - else - desc->cpu = cpu; - } + if (desc->node != node) + desc = __real_move_irq_desc(desc, node); return desc; } diff --git a/kernel/kexec.c b/kernel/kexec.c index 5a758c6e495..e4983770913 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1451,7 +1451,6 @@ int kernel_kexec(void) error = device_suspend(PMSG_FREEZE); if (error) goto Resume_console; - device_pm_lock(); /* At this point, device_suspend() has been called, * but *not* device_power_down(). We *must* * device_power_down() now. Otherwise, drivers for @@ -1489,7 +1488,6 @@ int kernel_kexec(void) enable_nonboot_cpus(); device_power_up(PMSG_RESTORE); Resume_devices: - device_pm_unlock(); device_resume(PMSG_RESTORE); Resume_console: resume_console(); diff --git a/kernel/kgdb.c b/kernel/kgdb.c index e4dcfb2272a..9147a3190c9 100644 --- a/kernel/kgdb.c +++ b/kernel/kgdb.c @@ -1583,8 +1583,8 @@ static void sysrq_handle_gdb(int key, struct tty_struct *tty) static struct sysrq_key_op sysrq_gdb_op = { .handler = sysrq_handle_gdb, - .help_msg = "Gdb", - .action_msg = "GDB", + .help_msg = "debug(G)", + .action_msg = "DEBUG", }; #endif diff --git a/kernel/kmod.c b/kernel/kmod.c index b750675251e..7e95bedb2bf 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -370,8 +370,10 @@ struct subprocess_info *call_usermodehelper_setup(char *path, char **argv, sub_info->argv = argv; sub_info->envp = envp; sub_info->cred = prepare_usermodehelper_creds(); - if (!sub_info->cred) + if (!sub_info->cred) { + kfree(sub_info); return NULL; + } out: return sub_info; diff --git a/kernel/kprobes.c b/kernel/kprobes.c index a5e74ddee0e..c0fa54b276d 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -319,6 +319,22 @@ struct kprobe __kprobes *get_kprobe(void *addr) return NULL; } +/* Arm a kprobe with text_mutex */ +static void __kprobes arm_kprobe(struct kprobe *kp) +{ + mutex_lock(&text_mutex); + arch_arm_kprobe(kp); + mutex_unlock(&text_mutex); +} + +/* Disarm a kprobe with text_mutex */ +static void __kprobes disarm_kprobe(struct kprobe *kp) +{ + mutex_lock(&text_mutex); + arch_disarm_kprobe(kp); + mutex_unlock(&text_mutex); +} + /* * Aggregate handlers for multiple kprobes support - these handlers * take care of invoking the individual kprobe handlers on p->list @@ -538,7 +554,7 @@ static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p) ap->flags &= ~KPROBE_FLAG_DISABLED; if (!kprobes_all_disarmed) /* Arm the breakpoint again. */ - arch_arm_kprobe(ap); + arm_kprobe(ap); } return 0; } @@ -789,11 +805,8 @@ static int __kprobes __unregister_kprobe_top(struct kprobe *p) * enabled and not gone - otherwise, the breakpoint would * already have been removed. We save on flushing icache. */ - if (!kprobes_all_disarmed && !kprobe_disabled(old_p)) { - mutex_lock(&text_mutex); - arch_disarm_kprobe(p); - mutex_unlock(&text_mutex); - } + if (!kprobes_all_disarmed && !kprobe_disabled(old_p)) + disarm_kprobe(p); hlist_del_rcu(&old_p->hlist); } else { if (p->break_handler && !kprobe_gone(p)) @@ -810,7 +823,7 @@ noclean: if (!kprobe_disabled(old_p)) { try_to_disable_aggr_kprobe(old_p); if (!kprobes_all_disarmed && kprobe_disabled(old_p)) - arch_disarm_kprobe(old_p); + disarm_kprobe(old_p); } } return 0; @@ -1364,7 +1377,7 @@ int __kprobes disable_kprobe(struct kprobe *kp) try_to_disable_aggr_kprobe(p); if (!kprobes_all_disarmed && kprobe_disabled(p)) - arch_disarm_kprobe(p); + disarm_kprobe(p); out: mutex_unlock(&kprobe_mutex); return ret; @@ -1393,7 +1406,7 @@ int __kprobes enable_kprobe(struct kprobe *kp) } if (!kprobes_all_disarmed && kprobe_disabled(p)) - arch_arm_kprobe(p); + arm_kprobe(p); p->flags &= ~KPROBE_FLAG_DISABLED; if (p != kp) diff --git a/kernel/lockdep_internals.h b/kernel/lockdep_internals.h index a2cc7e9a6e8..699a2ac3a0d 100644 --- a/kernel/lockdep_internals.h +++ b/kernel/lockdep_internals.h @@ -54,9 +54,9 @@ enum { * table (if it's not there yet), and we check it for lock order * conflicts and deadlocks. */ -#define MAX_LOCKDEP_ENTRIES 8192UL +#define MAX_LOCKDEP_ENTRIES 16384UL -#define MAX_LOCKDEP_CHAINS_BITS 14 +#define MAX_LOCKDEP_CHAINS_BITS 15 #define MAX_LOCKDEP_CHAINS (1UL << MAX_LOCKDEP_CHAINS_BITS) #define MAX_LOCKDEP_CHAIN_HLOCKS (MAX_LOCKDEP_CHAINS*5) diff --git a/kernel/mutex.c b/kernel/mutex.c index 507cf2b5e9f..e5cc0cd28d5 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c @@ -249,7 +249,9 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, /* didnt get the lock, go to sleep: */ spin_unlock_mutex(&lock->wait_lock, flags); - __schedule(); + preempt_enable_no_resched(); + schedule(); + preempt_disable(); spin_lock_mutex(&lock->wait_lock, flags); } @@ -471,5 +473,28 @@ int __sched mutex_trylock(struct mutex *lock) return ret; } - EXPORT_SYMBOL(mutex_trylock); + +/** + * atomic_dec_and_mutex_lock - return holding mutex if we dec to 0 + * @cnt: the atomic which we are to dec + * @lock: the mutex to return holding if we dec to 0 + * + * return true and hold lock if we dec to 0, return false otherwise + */ +int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock) +{ + /* dec if we can't possibly hit 0 */ + if (atomic_add_unless(cnt, -1, 1)) + return 0; + /* we might hit 0, so take the lock */ + mutex_lock(lock); + if (!atomic_dec_and_test(cnt)) { + /* when we actually did the dec, we didn't hit 0 */ + mutex_unlock(lock); + return 0; + } + /* we hit 0, and we hold the lock */ + return 1; +} +EXPORT_SYMBOL(atomic_dec_and_mutex_lock); diff --git a/kernel/panic.c b/kernel/panic.c index 3dcaa166135..984b3ecbd72 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -340,34 +340,46 @@ void oops_exit(void) } #ifdef WANT_WARN_ON_SLOWPATH -void warn_slowpath(const char *file, int line, const char *fmt, ...) -{ +struct slowpath_args { + const char *fmt; va_list args; - char function[KSYM_SYMBOL_LEN]; - unsigned long caller = (unsigned long)__builtin_return_address(0); - const char *board; +}; - sprint_symbol(function, caller); +static void warn_slowpath_common(const char *file, int line, void *caller, struct slowpath_args *args) +{ + const char *board; printk(KERN_WARNING "------------[ cut here ]------------\n"); - printk(KERN_WARNING "WARNING: at %s:%d %s()\n", file, - line, function); + printk(KERN_WARNING "WARNING: at %s:%d %pS()\n", file, line, caller); board = dmi_get_system_info(DMI_PRODUCT_NAME); if (board) printk(KERN_WARNING "Hardware name: %s\n", board); - if (fmt) { - va_start(args, fmt); - vprintk(fmt, args); - va_end(args); - } + if (args) + vprintk(args->fmt, args->args); print_modules(); dump_stack(); print_oops_end_marker(); add_taint(TAINT_WARN); } -EXPORT_SYMBOL(warn_slowpath); + +void warn_slowpath_fmt(const char *file, int line, const char *fmt, ...) +{ + struct slowpath_args args; + + args.fmt = fmt; + va_start(args.args, fmt); + warn_slowpath_common(file, line, __builtin_return_address(0), &args); + va_end(args.args); +} +EXPORT_SYMBOL(warn_slowpath_fmt); + +void warn_slowpath_null(const char *file, int line) +{ + warn_slowpath_common(file, line, __builtin_return_address(0), NULL); +} +EXPORT_SYMBOL(warn_slowpath_null); #endif #ifdef CONFIG_CC_STACKPROTECTOR diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index c9dcf98b446..bece7c0b67b 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -1420,19 +1420,19 @@ void run_posix_cpu_timers(struct task_struct *tsk) * timer call will interfere. */ list_for_each_entry_safe(timer, next, &firing, it.cpu.entry) { - int firing; + int cpu_firing; + spin_lock(&timer->it_lock); list_del_init(&timer->it.cpu.entry); - firing = timer->it.cpu.firing; + cpu_firing = timer->it.cpu.firing; timer->it.cpu.firing = 0; /* * The firing flag is -1 if we collided with a reset * of the timer, which already reported this * almost-firing as an overrun. So don't generate an event. */ - if (likely(firing >= 0)) { + if (likely(cpu_firing >= 0)) cpu_timer_fire(timer); - } spin_unlock(&timer->it_lock); } } diff --git a/kernel/power/disk.c b/kernel/power/disk.c index e71ca9cd81b..5cb080e7eeb 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -215,8 +215,6 @@ static int create_image(int platform_mode) if (error) return error; - device_pm_lock(); - /* At this point, device_suspend() has been called, but *not* * device_power_down(). We *must* call device_power_down() now. * Otherwise, drivers for some devices (e.g. interrupt controllers) @@ -227,7 +225,7 @@ static int create_image(int platform_mode) if (error) { printk(KERN_ERR "PM: Some devices failed to power down, " "aborting hibernation\n"); - goto Unlock; + return error; } error = platform_pre_snapshot(platform_mode); @@ -241,9 +239,9 @@ static int create_image(int platform_mode) local_irq_disable(); - sysdev_suspend(PMSG_FREEZE); + error = sysdev_suspend(PMSG_FREEZE); if (error) { - printk(KERN_ERR "PM: Some devices failed to power down, " + printk(KERN_ERR "PM: Some system devices failed to power down, " "aborting hibernation\n"); goto Enable_irqs; } @@ -280,9 +278,6 @@ static int create_image(int platform_mode) device_power_up(in_suspend ? (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); - Unlock: - device_pm_unlock(); - return error; } @@ -344,13 +339,11 @@ static int resume_target_kernel(bool platform_mode) { int error; - device_pm_lock(); - error = device_power_down(PMSG_QUIESCE); if (error) { printk(KERN_ERR "PM: Some devices failed to power down, " "aborting resume\n"); - goto Unlock; + return error; } error = platform_pre_restore(platform_mode); @@ -403,9 +396,6 @@ static int resume_target_kernel(bool platform_mode) device_power_up(PMSG_RECOVER); - Unlock: - device_pm_unlock(); - return error; } @@ -464,11 +454,9 @@ int hibernation_platform_enter(void) goto Resume_devices; } - device_pm_lock(); - error = device_power_down(PMSG_HIBERNATE); if (error) - goto Unlock; + goto Resume_devices; error = hibernation_ops->prepare(); if (error) @@ -493,9 +481,6 @@ int hibernation_platform_enter(void) device_power_up(PMSG_RESTORE); - Unlock: - device_pm_unlock(); - Resume_devices: entering_platform_hibernation = false; device_resume(PMSG_RESTORE); diff --git a/kernel/power/main.c b/kernel/power/main.c index f99ed6a75ea..868028280d1 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -289,12 +289,10 @@ static int suspend_enter(suspend_state_t state) { int error; - device_pm_lock(); - if (suspend_ops->prepare) { error = suspend_ops->prepare(); if (error) - goto Done; + return error; } error = device_power_down(PMSG_SUSPEND); @@ -343,9 +341,6 @@ static int suspend_enter(suspend_state_t state) if (suspend_ops->finish) suspend_ops->finish(); - Done: - device_pm_unlock(); - return error; } diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 0692ab5a0d6..42c317874cf 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -304,6 +304,8 @@ int ptrace_detach(struct task_struct *child, unsigned int data) if (child->ptrace) { child->exit_code = data; dead = __ptrace_detach(current, child); + if (!child->exit_state) + wake_up_process(child); } write_unlock_irq(&tasklist_lock); diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c index ce97a4df64d..beb0e659adc 100644 --- a/kernel/rcupreempt.c +++ b/kernel/rcupreempt.c @@ -1356,17 +1356,11 @@ static int rcu_sched_grace_period(void *arg) rcu_ctrlblk.sched_sleep = rcu_sched_sleeping; spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags); - ret = 0; + ret = 0; /* unused */ __wait_event_interruptible(rcu_ctrlblk.sched_wq, rcu_ctrlblk.sched_sleep != rcu_sched_sleeping, ret); - /* - * Signals would prevent us from sleeping, and we cannot - * do much with them in any case. So flush them. - */ - if (ret) - flush_signals(current); couldsleepnext = 0; } while (!kthread_should_stop()); diff --git a/kernel/rcutree.c b/kernel/rcutree.c index d2a372fb0b9..0dccfbba6d2 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1259,31 +1259,44 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) check_cpu_stall(rsp, rdp); /* Is the RCU core waiting for a quiescent state from this CPU? */ - if (rdp->qs_pending) + if (rdp->qs_pending) { + rdp->n_rp_qs_pending++; return 1; + } /* Does this CPU have callbacks ready to invoke? */ - if (cpu_has_callbacks_ready_to_invoke(rdp)) + if (cpu_has_callbacks_ready_to_invoke(rdp)) { + rdp->n_rp_cb_ready++; return 1; + } /* Has RCU gone idle with this CPU needing another grace period? */ - if (cpu_needs_another_gp(rsp, rdp)) + if (cpu_needs_another_gp(rsp, rdp)) { + rdp->n_rp_cpu_needs_gp++; return 1; + } /* Has another RCU grace period completed? */ - if (ACCESS_ONCE(rsp->completed) != rdp->completed) /* outside of lock */ + if (ACCESS_ONCE(rsp->completed) != rdp->completed) { /* outside lock */ + rdp->n_rp_gp_completed++; return 1; + } /* Has a new RCU grace period started? */ - if (ACCESS_ONCE(rsp->gpnum) != rdp->gpnum) /* outside of lock */ + if (ACCESS_ONCE(rsp->gpnum) != rdp->gpnum) { /* outside lock */ + rdp->n_rp_gp_started++; return 1; + } /* Has an RCU GP gone long enough to send resched IPIs &c? */ if (ACCESS_ONCE(rsp->completed) != ACCESS_ONCE(rsp->gpnum) && - ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0)) + ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0)) { + rdp->n_rp_need_fqs++; return 1; + } /* nothing to do */ + rdp->n_rp_need_nothing++; return 0; } diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index 4b1875ba940..fe1dcdbf1ca 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -213,7 +213,63 @@ static struct file_operations rcugp_fops = { .release = single_release, }; -static struct dentry *rcudir, *datadir, *datadir_csv, *hierdir, *gpdir; +static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp) +{ + seq_printf(m, "%3d%cnp=%ld " + "qsp=%ld cbr=%ld cng=%ld gpc=%ld gps=%ld nf=%ld nn=%ld\n", + rdp->cpu, + cpu_is_offline(rdp->cpu) ? '!' : ' ', + rdp->n_rcu_pending, + rdp->n_rp_qs_pending, + rdp->n_rp_cb_ready, + rdp->n_rp_cpu_needs_gp, + rdp->n_rp_gp_completed, + rdp->n_rp_gp_started, + rdp->n_rp_need_fqs, + rdp->n_rp_need_nothing); +} + +static void print_rcu_pendings(struct seq_file *m, struct rcu_state *rsp) +{ + int cpu; + struct rcu_data *rdp; + + for_each_possible_cpu(cpu) { + rdp = rsp->rda[cpu]; + if (rdp->beenonline) + print_one_rcu_pending(m, rdp); + } +} + +static int show_rcu_pending(struct seq_file *m, void *unused) +{ + seq_puts(m, "rcu:\n"); + print_rcu_pendings(m, &rcu_state); + seq_puts(m, "rcu_bh:\n"); + print_rcu_pendings(m, &rcu_bh_state); + return 0; +} + +static int rcu_pending_open(struct inode *inode, struct file *file) +{ + return single_open(file, show_rcu_pending, NULL); +} + +static struct file_operations rcu_pending_fops = { + .owner = THIS_MODULE, + .open = rcu_pending_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static struct dentry *rcudir; +static struct dentry *datadir; +static struct dentry *datadir_csv; +static struct dentry *gpdir; +static struct dentry *hierdir; +static struct dentry *rcu_pendingdir; + static int __init rcuclassic_trace_init(void) { rcudir = debugfs_create_dir("rcu", NULL); @@ -238,6 +294,11 @@ static int __init rcuclassic_trace_init(void) NULL, &rcuhier_fops); if (!hierdir) goto free_out; + + rcu_pendingdir = debugfs_create_file("rcu_pending", 0444, rcudir, + NULL, &rcu_pending_fops); + if (!rcu_pendingdir) + goto free_out; return 0; free_out: if (datadir) @@ -257,6 +318,7 @@ static void __exit rcuclassic_trace_cleanup(void) debugfs_remove(datadir_csv); debugfs_remove(gpdir); debugfs_remove(hierdir); + debugfs_remove(rcu_pendingdir); debugfs_remove(rcudir); } diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c index 69d9cb921ff..820c5af44f3 100644 --- a/kernel/rtmutex.c +++ b/kernel/rtmutex.c @@ -300,7 +300,8 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, * assigned pending owner [which might not have taken the * lock yet]: */ -static inline int try_to_steal_lock(struct rt_mutex *lock) +static inline int try_to_steal_lock(struct rt_mutex *lock, + struct task_struct *task) { struct task_struct *pendowner = rt_mutex_owner(lock); struct rt_mutex_waiter *next; @@ -309,11 +310,11 @@ static inline int try_to_steal_lock(struct rt_mutex *lock) if (!rt_mutex_owner_pending(lock)) return 0; - if (pendowner == current) + if (pendowner == task) return 1; spin_lock_irqsave(&pendowner->pi_lock, flags); - if (current->prio >= pendowner->prio) { + if (task->prio >= pendowner->prio) { spin_unlock_irqrestore(&pendowner->pi_lock, flags); return 0; } @@ -338,21 +339,21 @@ static inline int try_to_steal_lock(struct rt_mutex *lock) * We are going to steal the lock and a waiter was * enqueued on the pending owners pi_waiters queue. So * we have to enqueue this waiter into - * current->pi_waiters list. This covers the case, - * where current is boosted because it holds another + * task->pi_waiters list. This covers the case, + * where task is boosted because it holds another * lock and gets unboosted because the booster is * interrupted, so we would delay a waiter with higher - * priority as current->normal_prio. + * priority as task->normal_prio. * * Note: in the rare case of a SCHED_OTHER task changing * its priority and thus stealing the lock, next->task - * might be current: + * might be task: */ - if (likely(next->task != current)) { - spin_lock_irqsave(¤t->pi_lock, flags); - plist_add(&next->pi_list_entry, ¤t->pi_waiters); - __rt_mutex_adjust_prio(current); - spin_unlock_irqrestore(¤t->pi_lock, flags); + if (likely(next->task != task)) { + spin_lock_irqsave(&task->pi_lock, flags); + plist_add(&next->pi_list_entry, &task->pi_waiters); + __rt_mutex_adjust_prio(task); + spin_unlock_irqrestore(&task->pi_lock, flags); } return 1; } @@ -389,7 +390,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock) */ mark_rt_mutex_waiters(lock); - if (rt_mutex_owner(lock) && !try_to_steal_lock(lock)) + if (rt_mutex_owner(lock) && !try_to_steal_lock(lock, current)) return 0; /* We got the lock. */ @@ -411,6 +412,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock) */ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, struct rt_mutex_waiter *waiter, + struct task_struct *task, int detect_deadlock) { struct task_struct *owner = rt_mutex_owner(lock); @@ -418,21 +420,21 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, unsigned long flags; int chain_walk = 0, res; - spin_lock_irqsave(¤t->pi_lock, flags); - __rt_mutex_adjust_prio(current); - waiter->task = current; + spin_lock_irqsave(&task->pi_lock, flags); + __rt_mutex_adjust_prio(task); + waiter->task = task; waiter->lock = lock; - plist_node_init(&waiter->list_entry, current->prio); - plist_node_init(&waiter->pi_list_entry, current->prio); + plist_node_init(&waiter->list_entry, task->prio); + plist_node_init(&waiter->pi_list_entry, task->prio); /* Get the top priority waiter on the lock */ if (rt_mutex_has_waiters(lock)) top_waiter = rt_mutex_top_waiter(lock); plist_add(&waiter->list_entry, &lock->wait_list); - current->pi_blocked_on = waiter; + task->pi_blocked_on = waiter; - spin_unlock_irqrestore(¤t->pi_lock, flags); + spin_unlock_irqrestore(&task->pi_lock, flags); if (waiter == rt_mutex_top_waiter(lock)) { spin_lock_irqsave(&owner->pi_lock, flags); @@ -460,7 +462,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, spin_unlock(&lock->wait_lock); res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock, waiter, - current); + task); spin_lock(&lock->wait_lock); @@ -605,37 +607,25 @@ void rt_mutex_adjust_pi(struct task_struct *task) rt_mutex_adjust_prio_chain(task, 0, NULL, NULL, task); } -/* - * Slow path lock function: +/** + * __rt_mutex_slowlock() - Perform the wait-wake-try-to-take loop + * @lock: the rt_mutex to take + * @state: the state the task should block in (TASK_INTERRUPTIBLE + * or TASK_UNINTERRUPTIBLE) + * @timeout: the pre-initialized and started timer, or NULL for none + * @waiter: the pre-initialized rt_mutex_waiter + * @detect_deadlock: passed to task_blocks_on_rt_mutex + * + * lock->wait_lock must be held by the caller. */ static int __sched -rt_mutex_slowlock(struct rt_mutex *lock, int state, - struct hrtimer_sleeper *timeout, - int detect_deadlock) +__rt_mutex_slowlock(struct rt_mutex *lock, int state, + struct hrtimer_sleeper *timeout, + struct rt_mutex_waiter *waiter, + int detect_deadlock) { - struct rt_mutex_waiter waiter; int ret = 0; - debug_rt_mutex_init_waiter(&waiter); - waiter.task = NULL; - - spin_lock(&lock->wait_lock); - - /* Try to acquire the lock again: */ - if (try_to_take_rt_mutex(lock)) { - spin_unlock(&lock->wait_lock); - return 0; - } - - set_current_state(state); - - /* Setup the timer, when timeout != NULL */ - if (unlikely(timeout)) { - hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS); - if (!hrtimer_active(&timeout->timer)) - timeout->task = NULL; - } - for (;;) { /* Try to acquire the lock: */ if (try_to_take_rt_mutex(lock)) @@ -656,19 +646,19 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, } /* - * waiter.task is NULL the first time we come here and + * waiter->task is NULL the first time we come here and * when we have been woken up by the previous owner * but the lock got stolen by a higher prio task. */ - if (!waiter.task) { - ret = task_blocks_on_rt_mutex(lock, &waiter, + if (!waiter->task) { + ret = task_blocks_on_rt_mutex(lock, waiter, current, detect_deadlock); /* * If we got woken up by the owner then start loop * all over without going into schedule to try * to get the lock now: */ - if (unlikely(!waiter.task)) { + if (unlikely(!waiter->task)) { /* * Reset the return value. We might * have returned with -EDEADLK and the @@ -684,15 +674,52 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, spin_unlock(&lock->wait_lock); - debug_rt_mutex_print_deadlock(&waiter); + debug_rt_mutex_print_deadlock(waiter); - if (waiter.task) + if (waiter->task) schedule_rt_mutex(lock); spin_lock(&lock->wait_lock); set_current_state(state); } + return ret; +} + +/* + * Slow path lock function: + */ +static int __sched +rt_mutex_slowlock(struct rt_mutex *lock, int state, + struct hrtimer_sleeper *timeout, + int detect_deadlock) +{ + struct rt_mutex_waiter waiter; + int ret = 0; + + debug_rt_mutex_init_waiter(&waiter); + waiter.task = NULL; + + spin_lock(&lock->wait_lock); + + /* Try to acquire the lock again: */ + if (try_to_take_rt_mutex(lock)) { + spin_unlock(&lock->wait_lock); + return 0; + } + + set_current_state(state); + + /* Setup the timer, when timeout != NULL */ + if (unlikely(timeout)) { + hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS); + if (!hrtimer_active(&timeout->timer)) + timeout->task = NULL; + } + + ret = __rt_mutex_slowlock(lock, state, timeout, &waiter, + detect_deadlock); + set_current_state(TASK_RUNNING); if (unlikely(waiter.task)) @@ -864,9 +891,9 @@ int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock, EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible); /** - * rt_mutex_lock_interruptible_ktime - lock a rt_mutex interruptible - * the timeout structure is provided - * by the caller + * rt_mutex_timed_lock - lock a rt_mutex interruptible + * the timeout structure is provided + * by the caller * * @lock: the rt_mutex to be locked * @timeout: timeout structure or NULL (no timeout) @@ -913,7 +940,7 @@ void __sched rt_mutex_unlock(struct rt_mutex *lock) } EXPORT_SYMBOL_GPL(rt_mutex_unlock); -/*** +/** * rt_mutex_destroy - mark a mutex unusable * @lock: the mutex to be destroyed * @@ -986,6 +1013,59 @@ void rt_mutex_proxy_unlock(struct rt_mutex *lock, } /** + * rt_mutex_start_proxy_lock() - Start lock acquisition for another task + * @lock: the rt_mutex to take + * @waiter: the pre-initialized rt_mutex_waiter + * @task: the task to prepare + * @detect_deadlock: perform deadlock detection (1) or not (0) + * + * Returns: + * 0 - task blocked on lock + * 1 - acquired the lock for task, caller should wake it up + * <0 - error + * + * Special API call for FUTEX_REQUEUE_PI support. + */ +int rt_mutex_start_proxy_lock(struct rt_mutex *lock, + struct rt_mutex_waiter *waiter, + struct task_struct *task, int detect_deadlock) +{ + int ret; + + spin_lock(&lock->wait_lock); + + mark_rt_mutex_waiters(lock); + + if (!rt_mutex_owner(lock) || try_to_steal_lock(lock, task)) { + /* We got the lock for task. */ + debug_rt_mutex_lock(lock); + + rt_mutex_set_owner(lock, task, 0); + + rt_mutex_deadlock_account_lock(lock, task); + return 1; + } + + ret = task_blocks_on_rt_mutex(lock, waiter, task, detect_deadlock); + + + if (ret && !waiter->task) { + /* + * Reset the return value. We might have + * returned with -EDEADLK and the owner + * released the lock while we were walking the + * pi chain. Let the waiter sort it out. + */ + ret = 0; + } + spin_unlock(&lock->wait_lock); + + debug_rt_mutex_print_deadlock(waiter); + + return ret; +} + +/** * rt_mutex_next_owner - return the next owner of the lock * * @lock: the rt lock query @@ -1004,3 +1084,57 @@ struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock) return rt_mutex_top_waiter(lock)->task; } + +/** + * rt_mutex_finish_proxy_lock() - Complete lock acquisition + * @lock: the rt_mutex we were woken on + * @to: the timeout, null if none. hrtimer should already have + * been started. + * @waiter: the pre-initialized rt_mutex_waiter + * @detect_deadlock: perform deadlock detection (1) or not (0) + * + * Complete the lock acquisition started our behalf by another thread. + * + * Returns: + * 0 - success + * <0 - error, one of -EINTR, -ETIMEDOUT, or -EDEADLK + * + * Special API call for PI-futex requeue support + */ +int rt_mutex_finish_proxy_lock(struct rt_mutex *lock, + struct hrtimer_sleeper *to, + struct rt_mutex_waiter *waiter, + int detect_deadlock) +{ + int ret; + + spin_lock(&lock->wait_lock); + + set_current_state(TASK_INTERRUPTIBLE); + + ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter, + detect_deadlock); + + set_current_state(TASK_RUNNING); + + if (unlikely(waiter->task)) + remove_waiter(lock, waiter); + + /* + * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might + * have to fix that up. + */ + fixup_rt_mutex_waiters(lock); + + spin_unlock(&lock->wait_lock); + + /* + * Readjust priority, when we did not get the lock. We might have been + * the pending owner and boosted. Since we did not take the lock, the + * PI boost has to go. + */ + if (unlikely(ret)) + rt_mutex_adjust_prio(current); + + return ret; +} diff --git a/kernel/rtmutex_common.h b/kernel/rtmutex_common.h index e124bf5800e..97a2f81866a 100644 --- a/kernel/rtmutex_common.h +++ b/kernel/rtmutex_common.h @@ -120,6 +120,14 @@ extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock, struct task_struct *proxy_owner); extern void rt_mutex_proxy_unlock(struct rt_mutex *lock, struct task_struct *proxy_owner); +extern int rt_mutex_start_proxy_lock(struct rt_mutex *lock, + struct rt_mutex_waiter *waiter, + struct task_struct *task, + int detect_deadlock); +extern int rt_mutex_finish_proxy_lock(struct rt_mutex *lock, + struct hrtimer_sleeper *to, + struct rt_mutex_waiter *waiter, + int detect_deadlock); #ifdef CONFIG_DEBUG_RT_MUTEXES # include "rtmutex-debug.h" diff --git a/kernel/sched.c b/kernel/sched.c index b902e587a3a..076e403b9c8 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -630,6 +630,10 @@ struct rq { struct list_head migration_queue; #endif + /* calc_load related fields */ + unsigned long calc_load_update; + long calc_load_active; + #ifdef CONFIG_SCHED_HRTICK #ifdef CONFIG_SMP int hrtick_csd_pending; @@ -1728,6 +1732,8 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) } #endif +static void calc_load_account_active(struct rq *this_rq); + #include "sched_stats.h" #include "sched_idletask.c" #include "sched_fair.c" @@ -2458,6 +2464,17 @@ out: return success; } +/** + * wake_up_process - Wake up a specific process + * @p: The process to be woken up. + * + * Attempt to wake up the nominated process and move it to the set of runnable + * processes. Returns 1 if the process was woken up, 0 if it was already + * running. + * + * It may be assumed that this function implies a write memory barrier before + * changing the task state if and only if any tasks are woken up. + */ int wake_up_process(struct task_struct *p) { return try_to_wake_up(p, TASK_ALL, 0); @@ -2766,7 +2783,7 @@ context_switch(struct rq *rq, struct task_struct *prev, * combine the page table reload and the switch backend into * one hypercall. */ - arch_enter_lazy_cpu_mode(); + arch_start_context_switch(prev); if (unlikely(!mm)) { next->active_mm = oldmm; @@ -2856,19 +2873,72 @@ unsigned long nr_iowait(void) return sum; } -unsigned long nr_active(void) +/* Variables and functions for calc_load */ +static atomic_long_t calc_load_tasks; +static unsigned long calc_load_update; +unsigned long avenrun[3]; +EXPORT_SYMBOL(avenrun); + +/** + * get_avenrun - get the load average array + * @loads: pointer to dest load array + * @offset: offset to add + * @shift: shift count to shift the result left + * + * These values are estimates at best, so no need for locking. + */ +void get_avenrun(unsigned long *loads, unsigned long offset, int shift) { - unsigned long i, running = 0, uninterruptible = 0; + loads[0] = (avenrun[0] + offset) << shift; + loads[1] = (avenrun[1] + offset) << shift; + loads[2] = (avenrun[2] + offset) << shift; +} - for_each_online_cpu(i) { - running += cpu_rq(i)->nr_running; - uninterruptible += cpu_rq(i)->nr_uninterruptible; - } +static unsigned long +calc_load(unsigned long load, unsigned long exp, unsigned long active) +{ + load *= exp; + load += active * (FIXED_1 - exp); + return load >> FSHIFT; +} - if (unlikely((long)uninterruptible < 0)) - uninterruptible = 0; +/* + * calc_load - update the avenrun load estimates 10 ticks after the + * CPUs have updated calc_load_tasks. + */ +void calc_global_load(void) +{ + unsigned long upd = calc_load_update + 10; + long active; + + if (time_before(jiffies, upd)) + return; + + active = atomic_long_read(&calc_load_tasks); + active = active > 0 ? active * FIXED_1 : 0; - return running + uninterruptible; + avenrun[0] = calc_load(avenrun[0], EXP_1, active); + avenrun[1] = calc_load(avenrun[1], EXP_5, active); + avenrun[2] = calc_load(avenrun[2], EXP_15, active); + + calc_load_update += LOAD_FREQ; +} + +/* + * Either called from update_cpu_load() or from a cpu going idle + */ +static void calc_load_account_active(struct rq *this_rq) +{ + long nr_active, delta; + + nr_active = this_rq->nr_running; + nr_active += (long) this_rq->nr_uninterruptible; + + if (nr_active != this_rq->calc_load_active) { + delta = nr_active - this_rq->calc_load_active; + this_rq->calc_load_active = nr_active; + atomic_long_add(delta, &calc_load_tasks); + } } /* @@ -2899,6 +2969,11 @@ static void update_cpu_load(struct rq *this_rq) new_load += scale-1; this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i; } + + if (time_after_eq(jiffies, this_rq->calc_load_update)) { + this_rq->calc_load_update += LOAD_FREQ; + calc_load_account_active(this_rq); + } } #ifdef CONFIG_SMP @@ -4240,10 +4315,126 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) static struct { atomic_t load_balancer; cpumask_var_t cpu_mask; + cpumask_var_t ilb_grp_nohz_mask; } nohz ____cacheline_aligned = { .load_balancer = ATOMIC_INIT(-1), }; +#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) +/** + * lowest_flag_domain - Return lowest sched_domain containing flag. + * @cpu: The cpu whose lowest level of sched domain is to + * be returned. + * @flag: The flag to check for the lowest sched_domain + * for the given cpu. + * + * Returns the lowest sched_domain of a cpu which contains the given flag. + */ +static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) +{ + struct sched_domain *sd; + + for_each_domain(cpu, sd) + if (sd && (sd->flags & flag)) + break; + + return sd; +} + +/** + * for_each_flag_domain - Iterates over sched_domains containing the flag. + * @cpu: The cpu whose domains we're iterating over. + * @sd: variable holding the value of the power_savings_sd + * for cpu. + * @flag: The flag to filter the sched_domains to be iterated. + * + * Iterates over all the scheduler domains for a given cpu that has the 'flag' + * set, starting from the lowest sched_domain to the highest. + */ +#define for_each_flag_domain(cpu, sd, flag) \ + for (sd = lowest_flag_domain(cpu, flag); \ + (sd && (sd->flags & flag)); sd = sd->parent) + +/** + * is_semi_idle_group - Checks if the given sched_group is semi-idle. + * @ilb_group: group to be checked for semi-idleness + * + * Returns: 1 if the group is semi-idle. 0 otherwise. + * + * We define a sched_group to be semi idle if it has atleast one idle-CPU + * and atleast one non-idle CPU. This helper function checks if the given + * sched_group is semi-idle or not. + */ +static inline int is_semi_idle_group(struct sched_group *ilb_group) +{ + cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask, + sched_group_cpus(ilb_group)); + + /* + * A sched_group is semi-idle when it has atleast one busy cpu + * and atleast one idle cpu. + */ + if (cpumask_empty(nohz.ilb_grp_nohz_mask)) + return 0; + + if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group))) + return 0; + + return 1; +} +/** + * find_new_ilb - Finds the optimum idle load balancer for nomination. + * @cpu: The cpu which is nominating a new idle_load_balancer. + * + * Returns: Returns the id of the idle load balancer if it exists, + * Else, returns >= nr_cpu_ids. + * + * This algorithm picks the idle load balancer such that it belongs to a + * semi-idle powersavings sched_domain. The idea is to try and avoid + * completely idle packages/cores just for the purpose of idle load balancing + * when there are other idle cpu's which are better suited for that job. + */ +static int find_new_ilb(int cpu) +{ + struct sched_domain *sd; + struct sched_group *ilb_group; + + /* + * Have idle load balancer selection from semi-idle packages only + * when power-aware load balancing is enabled + */ + if (!(sched_smt_power_savings || sched_mc_power_savings)) + goto out_done; + + /* + * Optimize for the case when we have no idle CPUs or only one + * idle CPU. Don't walk the sched_domain hierarchy in such cases + */ + if (cpumask_weight(nohz.cpu_mask) < 2) + goto out_done; + + for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) { + ilb_group = sd->groups; + + do { + if (is_semi_idle_group(ilb_group)) + return cpumask_first(nohz.ilb_grp_nohz_mask); + + ilb_group = ilb_group->next; + + } while (ilb_group != sd->groups); + } + +out_done: + return cpumask_first(nohz.cpu_mask); +} +#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */ +static inline int find_new_ilb(int call_cpu) +{ + return cpumask_first(nohz.cpu_mask); +} +#endif + /* * This routine will try to nominate the ilb (idle load balancing) * owner among the cpus whose ticks are stopped. ilb owner will do the idle @@ -4298,8 +4489,24 @@ int select_nohz_load_balancer(int stop_tick) /* make me the ilb owner */ if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1) return 1; - } else if (atomic_read(&nohz.load_balancer) == cpu) + } else if (atomic_read(&nohz.load_balancer) == cpu) { + int new_ilb; + + if (!(sched_smt_power_savings || + sched_mc_power_savings)) + return 1; + /* + * Check to see if there is a more power-efficient + * ilb. + */ + new_ilb = find_new_ilb(cpu); + if (new_ilb < nr_cpu_ids && new_ilb != cpu) { + atomic_set(&nohz.load_balancer, -1); + resched_cpu(new_ilb); + return 0; + } return 1; + } } else { if (!cpumask_test_cpu(cpu, nohz.cpu_mask)) return 0; @@ -4468,15 +4675,7 @@ static inline void trigger_load_balance(struct rq *rq, int cpu) } if (atomic_read(&nohz.load_balancer) == -1) { - /* - * simple selection for now: Nominate the - * first cpu in the nohz list to be the next - * ilb owner. - * - * TBD: Traverse the sched domains and nominate - * the nearest cpu in the nohz.cpu_mask. - */ - int ilb = cpumask_first(nohz.cpu_mask); + int ilb = find_new_ilb(cpu); if (ilb < nr_cpu_ids) resched_cpu(ilb); @@ -4732,7 +4931,7 @@ void account_process_tick(struct task_struct *p, int user_tick) if (user_tick) account_user_time(p, one_jiffy, one_jiffy_scaled); - else if (p != rq->idle) + else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) account_system_time(p, HARDIRQ_OFFSET, one_jiffy, one_jiffy_scaled); else @@ -5007,13 +5206,15 @@ pick_next_task(struct rq *rq) /* * schedule() is the main scheduler function. */ -asmlinkage void __sched __schedule(void) +asmlinkage void __sched schedule(void) { struct task_struct *prev, *next; unsigned long *switch_count; struct rq *rq; int cpu; +need_resched: + preempt_disable(); cpu = smp_processor_id(); rq = cpu_rq(cpu); rcu_qsctr_inc(cpu); @@ -5070,15 +5271,9 @@ need_resched_nonpreemptible: if (unlikely(reacquire_kernel_lock(current) < 0)) goto need_resched_nonpreemptible; -} -asmlinkage void __sched schedule(void) -{ -need_resched: - preempt_disable(); - __schedule(); preempt_enable_no_resched(); - if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) + if (need_resched()) goto need_resched; } EXPORT_SYMBOL(schedule); @@ -5221,7 +5416,7 @@ EXPORT_SYMBOL(default_wake_function); * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns * zero in this (rare) case, and we handle it by continuing to scan the queue. */ -void __wake_up_common(wait_queue_head_t *q, unsigned int mode, +static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, int nr_exclusive, int sync, void *key) { wait_queue_t *curr, *next; @@ -5241,6 +5436,9 @@ void __wake_up_common(wait_queue_head_t *q, unsigned int mode, * @mode: which threads * @nr_exclusive: how many wake-one or wake-many threads to wake up * @key: is directly passed to the wakeup function + * + * It may be assumed that this function implies a write memory barrier before + * changing the task state if and only if any tasks are woken up. */ void __wake_up(wait_queue_head_t *q, unsigned int mode, int nr_exclusive, void *key) @@ -5279,6 +5477,9 @@ void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key) * with each other. This can prevent needless bouncing between CPUs. * * On UP it can prevent extra preemption. + * + * It may be assumed that this function implies a write memory barrier before + * changing the task state if and only if any tasks are woken up. */ void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, int nr_exclusive, void *key) @@ -5315,6 +5516,9 @@ EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ * awakened in the same order in which they were queued. * * See also complete_all(), wait_for_completion() and related routines. + * + * It may be assumed that this function implies a write memory barrier before + * changing the task state if and only if any tasks are woken up. */ void complete(struct completion *x) { @@ -5332,6 +5536,9 @@ EXPORT_SYMBOL(complete); * @x: holds the state of this particular completion * * This will wake up all threads waiting on this particular completion event. + * + * It may be assumed that this function implies a write memory barrier before + * changing the task state if and only if any tasks are woken up. */ void complete_all(struct completion *x) { @@ -6490,8 +6697,9 @@ void sched_show_task(struct task_struct *p) #ifdef CONFIG_DEBUG_STACK_USAGE free = stack_not_used(p); #endif - printk(KERN_CONT "%5lu %5d %6d\n", free, - task_pid_nr(p), task_pid_nr(p->real_parent)); + printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, + task_pid_nr(p), task_pid_nr(p->real_parent), + (unsigned long)task_thread_info(p)->flags); show_stack(p, NULL); } @@ -6970,6 +7178,14 @@ static void migrate_dead_tasks(unsigned int dead_cpu) } } + +/* + * remove the tasks which were accounted by rq from calc_load_tasks. + */ +static void calc_global_load_remove(struct rq *rq) +{ + atomic_long_sub(rq->calc_load_active, &calc_load_tasks); +} #endif /* CONFIG_HOTPLUG_CPU */ #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) @@ -7204,6 +7420,8 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) /* Update our root-domain */ rq = cpu_rq(cpu); spin_lock_irqsave(&rq->lock, flags); + rq->calc_load_update = calc_load_update; + rq->calc_load_active = 0; if (rq->rd) { BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); @@ -7243,7 +7461,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) cpuset_unlock(); migrate_nr_uninterruptible(rq); BUG_ON(rq->nr_running != 0); - + calc_global_load_remove(rq); /* * No need to migrate the tasks: it was best-effort if * they didn't take sched_hotcpu_mutex. Just wake up @@ -7753,8 +7971,9 @@ int sched_smt_power_savings = 0, sched_mc_power_savings = 0; /* * The cpus mask in sched_group and sched_domain hangs off the end. - * FIXME: use cpumask_var_t or dynamic percpu alloc to avoid wasting space - * for nr_cpu_ids < CONFIG_NR_CPUS. + * + * ( See the the comments in include/linux/sched.h:struct sched_group + * and struct sched_domain. ) */ struct static_sched_group { struct sched_group sg; @@ -7875,7 +8094,7 @@ static void init_numa_sched_groups_power(struct sched_group *group_head) struct sched_domain *sd; sd = &per_cpu(phys_domains, j).sd; - if (j != cpumask_first(sched_group_cpus(sd->groups))) { + if (j != group_first_cpu(sd->groups)) { /* * Only add "power" once for each * physical package. @@ -7953,7 +8172,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) WARN_ON(!sd || !sd->groups); - if (cpu != cpumask_first(sched_group_cpus(sd->groups))) + if (cpu != group_first_cpu(sd->groups)) return; child = sd->child; @@ -8938,6 +9157,8 @@ void __init sched_init(void) rq = cpu_rq(i); spin_lock_init(&rq->lock); rq->nr_running = 0; + rq->calc_load_active = 0; + rq->calc_load_update = jiffies + LOAD_FREQ; init_cfs_rq(&rq->cfs, rq); init_rt_rq(&rq->rt, rq); #ifdef CONFIG_FAIR_GROUP_SCHED @@ -9045,6 +9266,9 @@ void __init sched_init(void) * when this runqueue becomes "idle". */ init_idle(current, smp_processor_id()); + + calc_load_update = jiffies + LOAD_FREQ; + /* * During early bootup we pretend to be a normal task: */ @@ -9055,6 +9279,7 @@ void __init sched_init(void) #ifdef CONFIG_SMP #ifdef CONFIG_NO_HZ alloc_bootmem_cpumask_var(&nohz.cpu_mask); + alloc_bootmem_cpumask_var(&nohz.ilb_grp_nohz_mask); #endif alloc_bootmem_cpumask_var(&cpu_isolated_map); #endif /* SMP */ @@ -9800,6 +10025,13 @@ static int sched_rt_global_constraints(void) if (sysctl_sched_rt_period <= 0) return -EINVAL; + /* + * There's always some RT tasks in the root group + * -- migration, kstopmachine etc.. + */ + if (sysctl_sched_rt_runtime == 0) + return -EBUSY; + spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); for_each_possible_cpu(i) { struct rt_rq *rt_rq = &cpu_rq(i)->rt; diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c index 819f17ac796..e1d16c9a768 100644 --- a/kernel/sched_clock.c +++ b/kernel/sched_clock.c @@ -38,7 +38,8 @@ */ unsigned long long __attribute__((weak)) sched_clock(void) { - return (unsigned long long)jiffies * (NSEC_PER_SEC / HZ); + return (unsigned long long)(jiffies - INITIAL_JIFFIES) + * (NSEC_PER_SEC / HZ); } static __read_mostly int sched_clock_running; diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c index cdd3c89574c..344712a5e3e 100644 --- a/kernel/sched_cpupri.c +++ b/kernel/sched_cpupri.c @@ -165,7 +165,7 @@ int __init_refok cpupri_init(struct cpupri *cp, bool bootmem) vec->count = 0; if (bootmem) alloc_bootmem_cpumask_var(&vec->mask); - else if (!alloc_cpumask_var(&vec->mask, GFP_KERNEL)) + else if (!zalloc_cpumask_var(&vec->mask, GFP_KERNEL)) goto cleanup; } diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 3816f217f11..5f9650e8fe7 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1487,17 +1487,10 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync) find_matching_se(&se, &pse); - while (se) { - BUG_ON(!pse); + BUG_ON(!pse); - if (wakeup_preempt_entity(se, pse) == 1) { - resched_task(curr); - break; - } - - se = parent_entity(se); - pse = parent_entity(pse); - } + if (wakeup_preempt_entity(se, pse) == 1) + resched_task(curr); } static struct task_struct *pick_next_task_fair(struct rq *rq) diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c index 8a21a2e28c1..499672c10cb 100644 --- a/kernel/sched_idletask.c +++ b/kernel/sched_idletask.c @@ -22,7 +22,8 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int sy static struct task_struct *pick_next_task_idle(struct rq *rq) { schedstat_inc(rq, sched_goidle); - + /* adjust the active tasks as we might go into a long sleep */ + calc_load_account_active(rq); return rq->idle; } diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index f2c66f8f971..9bf0d2a7304 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -1591,7 +1591,7 @@ static inline void init_sched_rt_class(void) unsigned int i; for_each_possible_cpu(i) - alloc_cpumask_var_node(&per_cpu(local_cpu_mask, i), + zalloc_cpumask_var_node(&per_cpu(local_cpu_mask, i), GFP_KERNEL, cpu_to_node(i)); } #endif /* CONFIG_SMP */ diff --git a/kernel/smp.c b/kernel/smp.c index 858baac568e..ad63d850120 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -52,7 +52,7 @@ hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu) switch (action) { case CPU_UP_PREPARE: case CPU_UP_PREPARE_FROZEN: - if (!alloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL, + if (!zalloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL, cpu_to_node(cpu))) return NOTIFY_BAD; break; diff --git a/kernel/softirq.c b/kernel/softirq.c index b525dd34851..f674f332a02 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -828,7 +828,7 @@ int __init __weak arch_early_irq_init(void) return 0; } -int __weak arch_init_chip_data(struct irq_desc *desc, int cpu) +int __weak arch_init_chip_data(struct irq_desc *desc, int node) { return 0; } diff --git a/kernel/sysctl.c b/kernel/sysctl.c index e3d2c7dd59b..6a463716ecb 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -101,7 +101,9 @@ static int __maybe_unused one = 1; static int __maybe_unused two = 2; static unsigned long one_ul = 1; static int one_hundred = 100; -static int one_thousand = 1000; + +/* this is needed for the proc_doulongvec_minmax of vm_dirty_bytes */ +static unsigned long dirty_bytes_min = 2 * PAGE_SIZE; /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ static int maxolduid = 65535; @@ -729,6 +731,14 @@ static struct ctl_table kern_table[] = { }, { .ctl_name = CTL_UNNUMBERED, + .procname = "bootloader_version", + .data = &bootloader_version, + .maxlen = sizeof (int), + .mode = 0444, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = CTL_UNNUMBERED, .procname = "kstack_depth_to_print", .data = &kstack_depth_to_print, .maxlen = sizeof(int), @@ -1006,7 +1016,7 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = &dirty_bytes_handler, .strategy = &sysctl_intvec, - .extra1 = &one_ul, + .extra1 = &dirty_bytes_min, }, { .procname = "dirty_writeback_centisecs", @@ -1031,28 +1041,6 @@ static struct ctl_table vm_table[] = { .proc_handler = &proc_dointvec, }, { - .ctl_name = CTL_UNNUMBERED, - .procname = "nr_pdflush_threads_min", - .data = &nr_pdflush_threads_min, - .maxlen = sizeof nr_pdflush_threads_min, - .mode = 0644 /* read-write */, - .proc_handler = &proc_dointvec_minmax, - .strategy = &sysctl_intvec, - .extra1 = &one, - .extra2 = &nr_pdflush_threads_max, - }, - { - .ctl_name = CTL_UNNUMBERED, - .procname = "nr_pdflush_threads_max", - .data = &nr_pdflush_threads_max, - .maxlen = sizeof nr_pdflush_threads_max, - .mode = 0644 /* read-write */, - .proc_handler = &proc_dointvec_minmax, - .strategy = &sysctl_intvec, - .extra1 = &nr_pdflush_threads_min, - .extra2 = &one_thousand, - }, - { .ctl_name = VM_SWAPPINESS, .procname = "swappiness", .data = &vm_swappiness, diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 21a5ca84951..83c4417b6a3 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -93,7 +93,17 @@ void tick_handle_periodic(struct clock_event_device *dev) for (;;) { if (!clockevents_program_event(dev, next, ktime_get())) return; - tick_periodic(cpu); + /* + * Have to be careful here. If we're in oneshot mode, + * before we call tick_periodic() in a loop, we need + * to be sure we're using a real hardware clocksource. + * Otherwise we could get trapped in an infinite + * loop, as the tick_periodic() increments jiffies, + * when then will increment time, posibly causing + * the loop to trigger again and again. + */ + if (timekeeping_valid_for_hres()) + tick_periodic(cpu); next = ktime_add(next, tick_period); } } diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 687dff49f6e..52a8bf8931f 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -22,7 +22,7 @@ /* * This read-write spinlock protects us from races in SMP while - * playing with xtime and avenrun. + * playing with xtime. */ __cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock); diff --git a/kernel/timer.c b/kernel/timer.c index cffffad01c3..a26ed294f93 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1123,47 +1123,6 @@ void update_process_times(int user_tick) } /* - * Nr of active tasks - counted in fixed-point numbers - */ -static unsigned long count_active_tasks(void) -{ - return nr_active() * FIXED_1; -} - -/* - * Hmm.. Changed this, as the GNU make sources (load.c) seems to - * imply that avenrun[] is the standard name for this kind of thing. - * Nothing else seems to be standardized: the fractional size etc - * all seem to differ on different machines. - * - * Requires xtime_lock to access. - */ -unsigned long avenrun[3]; - -EXPORT_SYMBOL(avenrun); - -/* - * calc_load - given tick count, update the avenrun load estimates. - * This is called while holding a write_lock on xtime_lock. - */ -static inline void calc_load(unsigned long ticks) -{ - unsigned long active_tasks; /* fixed-point */ - static int count = LOAD_FREQ; - - count -= ticks; - if (unlikely(count < 0)) { - active_tasks = count_active_tasks(); - do { - CALC_LOAD(avenrun[0], EXP_1, active_tasks); - CALC_LOAD(avenrun[1], EXP_5, active_tasks); - CALC_LOAD(avenrun[2], EXP_15, active_tasks); - count += LOAD_FREQ; - } while (count < 0); - } -} - -/* * This function runs timers and the timer-tq in bottom half context. */ static void run_timer_softirq(struct softirq_action *h) @@ -1187,16 +1146,6 @@ void run_local_timers(void) } /* - * Called by the timer interrupt. xtime_lock must already be taken - * by the timer IRQ! - */ -static inline void update_times(unsigned long ticks) -{ - update_wall_time(); - calc_load(ticks); -} - -/* * The 64-bit jiffies value is not atomic - you MUST NOT read it * without sampling the sequence number in xtime_lock. * jiffies is defined in the linker script... @@ -1205,7 +1154,8 @@ static inline void update_times(unsigned long ticks) void do_timer(unsigned long ticks) { jiffies_64 += ticks; - update_times(ticks); + update_wall_time(); + calc_global_load(); } #ifdef __ARCH_WANT_SYS_ALARM @@ -1406,37 +1356,17 @@ int do_sysinfo(struct sysinfo *info) { unsigned long mem_total, sav_total; unsigned int mem_unit, bitcount; - unsigned long seq; + struct timespec tp; memset(info, 0, sizeof(struct sysinfo)); - do { - struct timespec tp; - seq = read_seqbegin(&xtime_lock); - - /* - * This is annoying. The below is the same thing - * posix_get_clock_monotonic() does, but it wants to - * take the lock which we want to cover the loads stuff - * too. - */ - - getnstimeofday(&tp); - tp.tv_sec += wall_to_monotonic.tv_sec; - tp.tv_nsec += wall_to_monotonic.tv_nsec; - monotonic_to_bootbased(&tp); - if (tp.tv_nsec - NSEC_PER_SEC >= 0) { - tp.tv_nsec = tp.tv_nsec - NSEC_PER_SEC; - tp.tv_sec++; - } - info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0); + ktime_get_ts(&tp); + monotonic_to_bootbased(&tp); + info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0); - info->loads[0] = avenrun[0] << (SI_LOAD_SHIFT - FSHIFT); - info->loads[1] = avenrun[1] << (SI_LOAD_SHIFT - FSHIFT); - info->loads[2] = avenrun[2] << (SI_LOAD_SHIFT - FSHIFT); + get_avenrun(info->loads, 0, SI_LOAD_SHIFT - FSHIFT); - info->procs = nr_threads; - } while (read_seqretry(&xtime_lock, seq)); + info->procs = nr_threads; si_meminfo(info); si_swapinfo(info); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 1ce5dc6372b..cda81ec58d9 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2380,7 +2380,7 @@ static const char readme_msg[] = "# echo print-parent > /debug/tracing/trace_options\n" "# echo 1 > /debug/tracing/tracing_enabled\n" "# cat /debug/tracing/trace > /tmp/trace.txt\n" - "echo 0 > /debug/tracing/tracing_enabled\n" + "# echo 0 > /debug/tracing/tracing_enabled\n" ; static ssize_t @@ -3448,6 +3448,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, if (!ref) break; + ref->ref = 1; ref->buffer = info->tr->buffer; ref->page = ring_buffer_alloc_read_page(ref->buffer); if (!ref->page) { diff --git a/kernel/wait.c b/kernel/wait.c index 42a2dbc181c..ea7c3b4275c 100644 --- a/kernel/wait.c +++ b/kernel/wait.c @@ -154,7 +154,7 @@ void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait, if (!list_empty(&wait->task_list)) list_del_init(&wait->task_list); else if (waitqueue_active(q)) - __wake_up_common(q, mode, 1, 0, key); + __wake_up_locked_key(q, mode, key); spin_unlock_irqrestore(&q->lock, flags); } EXPORT_SYMBOL(abort_exclusive_wait); |