aboutsummaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/cgroup.c1
-rw-r--r--kernel/cgroup_freezer.c19
-rw-r--r--kernel/cpu.c3
-rw-r--r--kernel/exit.c5
-rw-r--r--kernel/hrtimer.c26
-rw-r--r--kernel/kprobes.c23
-rw-r--r--kernel/sched.c28
-rw-r--r--kernel/sched_debug.c41
-rw-r--r--kernel/sched_fair.c83
-rw-r--r--kernel/sched_features.h1
-rw-r--r--kernel/smp.c18
-rw-r--r--kernel/softirq.c7
-rw-r--r--kernel/time/tick-sched.c4
-rw-r--r--kernel/timer.c129
-rw-r--r--kernel/trace/ring_buffer.c2
-rw-r--r--kernel/trace/trace.c19
-rw-r--r--kernel/workqueue.c45
17 files changed, 339 insertions, 115 deletions
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index a512a75a556..8fe8c0cb137 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -2500,7 +2500,6 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
list_del(&cgrp->sibling);
spin_lock(&cgrp->dentry->d_lock);
d = dget(cgrp->dentry);
- cgrp->dentry = NULL;
spin_unlock(&d->d_lock);
cgroup_d_remove_dir(d);
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index 7fa476f01d0..fb249e2bcad 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -184,9 +184,20 @@ static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task)
{
struct freezer *freezer;
- task_lock(task);
+ /*
+ * No lock is needed, since the task isn't on tasklist yet,
+ * so it can't be moved to another cgroup, which means the
+ * freezer won't be removed and will be valid during this
+ * function call.
+ */
freezer = task_freezer(task);
- task_unlock(task);
+
+ /*
+ * The root cgroup is non-freezable, so we can skip the
+ * following check.
+ */
+ if (!freezer->css.cgroup->parent)
+ return;
spin_lock_irq(&freezer->lock);
BUG_ON(freezer->state == CGROUP_FROZEN);
@@ -331,7 +342,7 @@ static int freezer_write(struct cgroup *cgroup,
else if (strcmp(buffer, freezer_state_strs[CGROUP_FROZEN]) == 0)
goal_state = CGROUP_FROZEN;
else
- return -EIO;
+ return -EINVAL;
if (!cgroup_lock_live_group(cgroup))
return -ENODEV;
@@ -350,6 +361,8 @@ static struct cftype files[] = {
static int freezer_populate(struct cgroup_subsys *ss, struct cgroup *cgroup)
{
+ if (!cgroup->parent)
+ return 0;
return cgroup_add_files(cgroup, ss, files, ARRAY_SIZE(files));
}
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 86d49045dae..5a732c5ef08 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -499,3 +499,6 @@ const unsigned long cpu_bit_bitmap[BITS_PER_LONG+1][BITS_TO_LONGS(NR_CPUS)] = {
#endif
};
EXPORT_SYMBOL_GPL(cpu_bit_bitmap);
+
+const DECLARE_BITMAP(cpu_all_bits, NR_CPUS) = CPU_BITS_ALL;
+EXPORT_SYMBOL(cpu_all_bits);
diff --git a/kernel/exit.c b/kernel/exit.c
index c0711da1548..16eda9b39f8 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -143,6 +143,11 @@ static void __exit_signal(struct task_struct *tsk)
if (sig) {
flush_sigqueue(&sig->shared_pending);
taskstats_tgid_free(sig);
+ /*
+ * Make sure ->signal can't go away under rq->lock,
+ * see account_group_exec_runtime().
+ */
+ task_rq_unlock_wait(tsk);
__cleanup_signal(sig);
}
}
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 2b465dfde42..47e63349d1b 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -664,14 +664,6 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
/* Timer is expired, act upon the callback mode */
switch(timer->cb_mode) {
- case HRTIMER_CB_IRQSAFE_NO_RESTART:
- debug_hrtimer_deactivate(timer);
- /*
- * We can call the callback from here. No restart
- * happens, so no danger of recursion
- */
- BUG_ON(timer->function(timer) != HRTIMER_NORESTART);
- return 1;
case HRTIMER_CB_IRQSAFE_PERCPU:
case HRTIMER_CB_IRQSAFE_UNLOCKED:
/*
@@ -683,7 +675,6 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
*/
debug_hrtimer_deactivate(timer);
return 1;
- case HRTIMER_CB_IRQSAFE:
case HRTIMER_CB_SOFTIRQ:
/*
* Move everything else into the softirq pending list !
@@ -1209,6 +1200,7 @@ static void run_hrtimer_pending(struct hrtimer_cpu_base *cpu_base)
enum hrtimer_restart (*fn)(struct hrtimer *);
struct hrtimer *timer;
int restart;
+ int emulate_hardirq_ctx = 0;
timer = list_entry(cpu_base->cb_pending.next,
struct hrtimer, cb_entry);
@@ -1217,10 +1209,24 @@ static void run_hrtimer_pending(struct hrtimer_cpu_base *cpu_base)
timer_stats_account_hrtimer(timer);
fn = timer->function;
+ /*
+ * A timer might have been added to the cb_pending list
+ * when it was migrated during a cpu-offline operation.
+ * Emulate hardirq context for such timers.
+ */
+ if (timer->cb_mode == HRTIMER_CB_IRQSAFE_PERCPU ||
+ timer->cb_mode == HRTIMER_CB_IRQSAFE_UNLOCKED)
+ emulate_hardirq_ctx = 1;
+
__remove_hrtimer(timer, timer->base, HRTIMER_STATE_CALLBACK, 0);
spin_unlock_irq(&cpu_base->lock);
- restart = fn(timer);
+ if (unlikely(emulate_hardirq_ctx)) {
+ local_irq_disable();
+ restart = fn(timer);
+ local_irq_enable();
+ } else
+ restart = fn(timer);
spin_lock_irq(&cpu_base->lock);
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 8b57a2597f2..9f8a3f25259 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -72,7 +72,7 @@ static bool kprobe_enabled;
DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */
static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL;
static struct {
- spinlock_t lock ____cacheline_aligned;
+ spinlock_t lock ____cacheline_aligned_in_smp;
} kretprobe_table_locks[KPROBE_TABLE_SIZE];
static spinlock_t *kretprobe_table_lock_ptr(unsigned long hash)
@@ -613,30 +613,37 @@ static int __kprobes __register_kprobe(struct kprobe *p,
return -EINVAL;
p->addr = addr;
- if (!kernel_text_address((unsigned long) p->addr) ||
- in_kprobes_functions((unsigned long) p->addr))
+ preempt_disable();
+ if (!__kernel_text_address((unsigned long) p->addr) ||
+ in_kprobes_functions((unsigned long) p->addr)) {
+ preempt_enable();
return -EINVAL;
+ }
p->mod_refcounted = 0;
/*
* Check if are we probing a module.
*/
- probed_mod = module_text_address((unsigned long) p->addr);
+ probed_mod = __module_text_address((unsigned long) p->addr);
if (probed_mod) {
- struct module *calling_mod = module_text_address(called_from);
+ struct module *calling_mod;
+ calling_mod = __module_text_address(called_from);
/*
* We must allow modules to probe themself and in this case
* avoid incrementing the module refcount, so as to allow
* unloading of self probing modules.
*/
if (calling_mod && calling_mod != probed_mod) {
- if (unlikely(!try_module_get(probed_mod)))
+ if (unlikely(!try_module_get(probed_mod))) {
+ preempt_enable();
return -EINVAL;
+ }
p->mod_refcounted = 1;
} else
probed_mod = NULL;
}
+ preempt_enable();
p->nmissed = 0;
INIT_LIST_HEAD(&p->list);
@@ -718,6 +725,10 @@ static void __kprobes __unregister_kprobe_bottom(struct kprobe *p)
struct kprobe *old_p;
if (p->mod_refcounted) {
+ /*
+ * Since we've already incremented refcount,
+ * we don't need to disable preemption.
+ */
mod = module_text_address((unsigned long)p->addr);
if (mod)
module_put(mod);
diff --git a/kernel/sched.c b/kernel/sched.c
index 92992e287b1..204d0662b43 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -399,9 +399,9 @@ struct cfs_rq {
* 'curr' points to currently running entity on this cfs_rq.
* It is set to NULL otherwise (i.e when none are currently running).
*/
- struct sched_entity *curr, *next;
+ struct sched_entity *curr, *next, *last;
- unsigned long nr_spread_over;
+ unsigned int nr_spread_over;
#ifdef CONFIG_FAIR_GROUP_SCHED
struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */
@@ -971,6 +971,14 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
}
}
+void task_rq_unlock_wait(struct task_struct *p)
+{
+ struct rq *rq = task_rq(p);
+
+ smp_mb(); /* spin-unlock-wait is not a full memory barrier */
+ spin_unlock_wait(&rq->lock);
+}
+
static void __task_rq_unlock(struct rq *rq)
__releases(rq->lock)
{
@@ -1450,6 +1458,8 @@ static unsigned long cpu_avg_load_per_task(int cpu)
if (rq->nr_running)
rq->avg_load_per_task = rq->load.weight / rq->nr_running;
+ else
+ rq->avg_load_per_task = 0;
return rq->avg_load_per_task;
}
@@ -1807,7 +1817,9 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
/*
* Buddy candidates are cache hot:
*/
- if (sched_feat(CACHE_HOT_BUDDY) && (&p->se == cfs_rq_of(&p->se)->next))
+ if (sched_feat(CACHE_HOT_BUDDY) &&
+ (&p->se == cfs_rq_of(&p->se)->next ||
+ &p->se == cfs_rq_of(&p->se)->last))
return 1;
if (p->sched_class != &fair_sched_class)
@@ -5874,6 +5886,8 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
struct rq *rq = cpu_rq(cpu);
unsigned long flags;
+ spin_lock_irqsave(&rq->lock, flags);
+
__sched_fork(idle);
idle->se.exec_start = sched_clock();
@@ -5881,7 +5895,6 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
idle->cpus_allowed = cpumask_of_cpu(cpu);
__set_task_cpu(idle, cpu);
- spin_lock_irqsave(&rq->lock, flags);
rq->curr = rq->idle = idle;
#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
idle->oncpu = 1;
@@ -6891,15 +6904,17 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
struct sched_domain *tmp;
/* Remove the sched domains which do not contribute to scheduling. */
- for (tmp = sd; tmp; tmp = tmp->parent) {
+ for (tmp = sd; tmp; ) {
struct sched_domain *parent = tmp->parent;
if (!parent)
break;
+
if (sd_parent_degenerate(tmp, parent)) {
tmp->parent = parent->parent;
if (parent->parent)
parent->parent->child = tmp;
- }
+ } else
+ tmp = tmp->parent;
}
if (sd && sd_degenerate(sd)) {
@@ -7688,6 +7703,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
error:
free_sched_groups(cpu_map, tmpmask);
SCHED_CPUMASK_FREE((void *)allmasks);
+ kfree(rd);
return -ENOMEM;
#endif
}
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 5ae17762ec3..48ecc51e770 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -144,7 +144,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
last = __pick_last_entity(cfs_rq);
if (last)
max_vruntime = last->vruntime;
- min_vruntime = rq->cfs.min_vruntime;
+ min_vruntime = cfs_rq->min_vruntime;
rq0_min_vruntime = per_cpu(runqueues, 0).cfs.min_vruntime;
spin_unlock_irqrestore(&rq->lock, flags);
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "MIN_vruntime",
@@ -161,26 +161,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
SPLIT_NS(spread0));
SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running);
SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight);
-#ifdef CONFIG_SCHEDSTATS
-#define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, rq->n);
-
- P(yld_exp_empty);
- P(yld_act_empty);
- P(yld_both_empty);
- P(yld_count);
- P(sched_switch);
- P(sched_count);
- P(sched_goidle);
-
- P(ttwu_count);
- P(ttwu_local);
-
- P(bkl_count);
-
-#undef P
-#endif
- SEQ_printf(m, " .%-30s: %ld\n", "nr_spread_over",
+ SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over",
cfs_rq->nr_spread_over);
#ifdef CONFIG_FAIR_GROUP_SCHED
#ifdef CONFIG_SMP
@@ -260,6 +242,25 @@ static void print_cpu(struct seq_file *m, int cpu)
#undef P
#undef PN
+#ifdef CONFIG_SCHEDSTATS
+#define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, rq->n);
+
+ P(yld_exp_empty);
+ P(yld_act_empty);
+ P(yld_both_empty);
+ P(yld_count);
+
+ P(sched_switch);
+ P(sched_count);
+ P(sched_goidle);
+
+ P(ttwu_count);
+ P(ttwu_local);
+
+ P(bkl_count);
+
+#undef P
+#endif
print_cfs_stats(m, cpu);
print_rt_stats(m, cpu);
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index ce514afd78f..98345e45b05 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -341,23 +341,20 @@ static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
cfs_rq->rb_leftmost = next_node;
}
- if (cfs_rq->next == se)
- cfs_rq->next = NULL;
-
rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
}
-static inline struct rb_node *first_fair(struct cfs_rq *cfs_rq)
-{
- return cfs_rq->rb_leftmost;
-}
-
static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq)
{
- return rb_entry(first_fair(cfs_rq), struct sched_entity, run_node);
+ struct rb_node *left = cfs_rq->rb_leftmost;
+
+ if (!left)
+ return NULL;
+
+ return rb_entry(left, struct sched_entity, run_node);
}
-static inline struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
+static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
{
struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);
@@ -719,6 +716,15 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup)
__enqueue_entity(cfs_rq, se);
}
+static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ if (cfs_rq->last == se)
+ cfs_rq->last = NULL;
+
+ if (cfs_rq->next == se)
+ cfs_rq->next = NULL;
+}
+
static void
dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
{
@@ -741,6 +747,8 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
#endif
}
+ clear_buddies(cfs_rq, se);
+
if (se != cfs_rq->curr)
__dequeue_entity(cfs_rq, se);
account_entity_dequeue(cfs_rq, se);
@@ -794,24 +802,15 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
static int
wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
-static struct sched_entity *
-pick_next(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
- if (!cfs_rq->next || wakeup_preempt_entity(cfs_rq->next, se) == 1)
- return se;
-
- return cfs_rq->next;
-}
-
static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
{
- struct sched_entity *se = NULL;
+ struct sched_entity *se = __pick_next_entity(cfs_rq);
- if (first_fair(cfs_rq)) {
- se = __pick_next_entity(cfs_rq);
- se = pick_next(cfs_rq, se);
- set_next_entity(cfs_rq, se);
- }
+ if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, se) < 1)
+ return cfs_rq->next;
+
+ if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, se) < 1)
+ return cfs_rq->last;
return se;
}
@@ -983,6 +982,8 @@ static void yield_task_fair(struct rq *rq)
if (unlikely(cfs_rq->nr_running == 1))
return;
+ clear_buddies(cfs_rq, se);
+
if (likely(!sysctl_sched_compat_yield) && curr->policy != SCHED_BATCH) {
update_rq_clock(rq);
/*
@@ -1325,26 +1326,53 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
return 0;
}
+static void set_last_buddy(struct sched_entity *se)
+{
+ for_each_sched_entity(se)
+ cfs_rq_of(se)->last = se;
+}
+
+static void set_next_buddy(struct sched_entity *se)
+{
+ for_each_sched_entity(se)
+ cfs_rq_of(se)->next = se;
+}
+
/*
* Preempt the current task with a newly woken task if needed:
*/
static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
{
struct task_struct *curr = rq->curr;
- struct cfs_rq *cfs_rq = task_cfs_rq(curr);
struct sched_entity *se = &curr->se, *pse = &p->se;
if (unlikely(rt_prio(p->prio))) {
+ struct cfs_rq *cfs_rq = task_cfs_rq(curr);
+
update_rq_clock(rq);
update_curr(cfs_rq);
resched_task(curr);
return;
}
+ if (unlikely(p->sched_class != &fair_sched_class))
+ return;
+
if (unlikely(se == pse))
return;
- cfs_rq_of(pse)->next = pse;
+ /*
+ * Only set the backward buddy when the current task is still on the
+ * rq. This can happen when a wakeup gets interleaved with schedule on
+ * the ->pre_schedule() or idle_balance() point, either of which can
+ * drop the rq lock.
+ *
+ * Also, during early boot the idle thread is in the fair class, for
+ * obvious reasons its a bad idea to schedule back to the idle thread.
+ */
+ if (sched_feat(LAST_BUDDY) && likely(se->on_rq && curr != rq->idle))
+ set_last_buddy(se);
+ set_next_buddy(pse);
/*
* We can come here with TIF_NEED_RESCHED already set from new task
@@ -1396,6 +1424,7 @@ static struct task_struct *pick_next_task_fair(struct rq *rq)
do {
se = pick_next_entity(cfs_rq);
+ set_next_entity(cfs_rq, se);
cfs_rq = group_cfs_rq(se);
} while (cfs_rq);
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index fda01621829..da5d93b5d2c 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -12,3 +12,4 @@ SCHED_FEAT(LB_BIAS, 1)
SCHED_FEAT(LB_WAKEUP_UPDATE, 1)
SCHED_FEAT(ASYM_EFF_LOAD, 1)
SCHED_FEAT(WAKEUP_OVERLAP, 0)
+SCHED_FEAT(LAST_BUDDY, 1)
diff --git a/kernel/smp.c b/kernel/smp.c
index f362a855377..75c8dde58c5 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -51,10 +51,6 @@ static void csd_flag_wait(struct call_single_data *data)
{
/* Wait for response */
do {
- /*
- * We need to see the flags store in the IPI handler
- */
- smp_mb();
if (!(data->flags & CSD_FLAG_WAIT))
break;
cpu_relax();
@@ -76,6 +72,11 @@ static void generic_exec_single(int cpu, struct call_single_data *data)
list_add_tail(&data->list, &dst->list);
spin_unlock_irqrestore(&dst->lock, flags);
+ /*
+ * Make the list addition visible before sending the ipi.
+ */
+ smp_mb();
+
if (ipi)
arch_send_call_function_single_ipi(cpu);
@@ -157,7 +158,7 @@ void generic_smp_call_function_single_interrupt(void)
* Need to see other stores to list head for checking whether
* list is empty without holding q->lock
*/
- smp_mb();
+ smp_read_barrier_depends();
while (!list_empty(&q->list)) {
unsigned int data_flags;
@@ -191,7 +192,7 @@ void generic_smp_call_function_single_interrupt(void)
/*
* See comment on outer loop
*/
- smp_mb();
+ smp_read_barrier_depends();
}
}
@@ -370,6 +371,11 @@ int smp_call_function_mask(cpumask_t mask, void (*func)(void *), void *info,
list_add_tail_rcu(&data->csd.list, &call_function_queue);
spin_unlock_irqrestore(&call_function_lock, flags);
+ /*
+ * Make the list addition visible before sending the ipi.
+ */
+ smp_mb();
+
/* Send a message to all CPUs in the map */
arch_send_call_function_ipi(mask);
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 7110daeb9a9..e7c69a720d6 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -269,10 +269,11 @@ void irq_enter(void)
{
int cpu = smp_processor_id();
- if (idle_cpu(cpu) && !in_interrupt())
+ if (idle_cpu(cpu) && !in_interrupt()) {
+ __irq_enter();
tick_check_idle(cpu);
-
- __irq_enter();
+ } else
+ __irq_enter();
}
#ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 5bbb1044f84..342fc9ccab4 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -568,6 +568,9 @@ static void tick_nohz_switch_to_nohz(void)
*/
static void tick_nohz_kick_tick(int cpu)
{
+#if 0
+ /* Switch back to 2.6.27 behaviour */
+
struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
ktime_t delta, now;
@@ -584,6 +587,7 @@ static void tick_nohz_kick_tick(int cpu)
return;
tick_nohz_restart(ts, now);
+#endif
}
#else
diff --git a/kernel/timer.c b/kernel/timer.c
index b54e4646cee..566257d1dc1 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -112,27 +112,8 @@ timer_set_base(struct timer_list *timer, struct tvec_base *new_base)
tbase_get_deferrable(timer->base));
}
-/**
- * __round_jiffies - function to round jiffies to a full second
- * @j: the time in (absolute) jiffies that should be rounded
- * @cpu: the processor number on which the timeout will happen
- *
- * __round_jiffies() rounds an absolute time in the future (in jiffies)
- * up or down to (approximately) full seconds. This is useful for timers
- * for which the exact time they fire does not matter too much, as long as
- * they fire approximately every X seconds.
- *
- * By rounding these timers to whole seconds, all such timers will fire
- * at the same time, rather than at various times spread out. The goal
- * of this is to have the CPU wake up less, which saves power.
- *
- * The exact rounding is skewed for each processor to avoid all
- * processors firing at the exact same time, which could lead
- * to lock contention or spurious cache line bouncing.
- *
- * The return value is the rounded version of the @j parameter.
- */
-unsigned long __round_jiffies(unsigned long j, int cpu)
+static unsigned long round_jiffies_common(unsigned long j, int cpu,
+ bool force_up)
{
int rem;
unsigned long original = j;
@@ -154,8 +135,9 @@ unsigned long __round_jiffies(unsigned long j, int cpu)
* due to delays of the timer irq, long irq off times etc etc) then
* we should round down to the whole second, not up. Use 1/4th second
* as cutoff for this rounding as an extreme upper bound for this.
+ * But never round down if @force_up is set.
*/
- if (rem < HZ/4) /* round down */
+ if (rem < HZ/4 && !force_up) /* round down */
j = j - rem;
else /* round up */
j = j - rem + HZ;
@@ -167,6 +149,31 @@ unsigned long __round_jiffies(unsigned long j, int cpu)
return original;
return j;
}
+
+/**
+ * __round_jiffies - function to round jiffies to a full second
+ * @j: the time in (absolute) jiffies that should be rounded
+ * @cpu: the processor number on which the timeout will happen
+ *
+ * __round_jiffies() rounds an absolute time in the future (in jiffies)
+ * up or down to (approximately) full seconds. This is useful for timers
+ * for which the exact time they fire does not matter too much, as long as
+ * they fire approximately every X seconds.
+ *
+ * By rounding these timers to whole seconds, all such timers will fire
+ * at the same time, rather than at various times spread out. The goal
+ * of this is to have the CPU wake up less, which saves power.
+ *
+ * The exact rounding is skewed for each processor to avoid all
+ * processors firing at the exact same time, which could lead
+ * to lock contention or spurious cache line bouncing.
+ *
+ * The return value is the rounded version of the @j parameter.
+ */
+unsigned long __round_jiffies(unsigned long j, int cpu)
+{
+ return round_jiffies_common(j, cpu, false);
+}
EXPORT_SYMBOL_GPL(__round_jiffies);
/**
@@ -191,13 +198,10 @@ EXPORT_SYMBOL_GPL(__round_jiffies);
*/
unsigned long __round_jiffies_relative(unsigned long j, int cpu)
{
- /*
- * In theory the following code can skip a jiffy in case jiffies
- * increments right between the addition and the later subtraction.
- * However since the entire point of this function is to use approximate
- * timeouts, it's entirely ok to not handle that.
- */
- return __round_jiffies(j + jiffies, cpu) - jiffies;
+ unsigned long j0 = jiffies;
+
+ /* Use j0 because jiffies might change while we run */
+ return round_jiffies_common(j + j0, cpu, false) - j0;
}
EXPORT_SYMBOL_GPL(__round_jiffies_relative);
@@ -218,7 +222,7 @@ EXPORT_SYMBOL_GPL(__round_jiffies_relative);
*/
unsigned long round_jiffies(unsigned long j)
{
- return __round_jiffies(j, raw_smp_processor_id());
+ return round_jiffies_common(j, raw_smp_processor_id(), false);
}
EXPORT_SYMBOL_GPL(round_jiffies);
@@ -243,6 +247,71 @@ unsigned long round_jiffies_relative(unsigned long j)
}
EXPORT_SYMBOL_GPL(round_jiffies_relative);
+/**
+ * __round_jiffies_up - function to round jiffies up to a full second
+ * @j: the time in (absolute) jiffies that should be rounded
+ * @cpu: the processor number on which the timeout will happen
+ *
+ * This is the same as __round_jiffies() except that it will never
+ * round down. This is useful for timeouts for which the exact time
+ * of firing does not matter too much, as long as they don't fire too
+ * early.
+ */
+unsigned long __round_jiffies_up(unsigned long j, int cpu)
+{
+ return round_jiffies_common(j, cpu, true);
+}
+EXPORT_SYMBOL_GPL(__round_jiffies_up);
+
+/**
+ * __round_jiffies_up_relative - function to round jiffies up to a full second
+ * @j: the time in (relative) jiffies that should be rounded
+ * @cpu: the processor number on which the timeout will happen
+ *
+ * This is the same as __round_jiffies_relative() except that it will never
+ * round down. This is useful for timeouts for which the exact time
+ * of firing does not matter too much, as long as they don't fire too
+ * early.
+ */
+unsigned long __round_jiffies_up_relative(unsigned long j, int cpu)
+{
+ unsigned long j0 = jiffies;
+
+ /* Use j0 because jiffies might change while we run */
+ return round_jiffies_common(j + j0, cpu, true) - j0;
+}
+EXPORT_SYMBOL_GPL(__round_jiffies_up_relative);
+
+/**
+ * round_jiffies_up - function to round jiffies up to a full second
+ * @j: the time in (absolute) jiffies that should be rounded
+ *
+ * This is the same as round_jiffies() except that it will never
+ * round down. This is useful for timeouts for which the exact time
+ * of firing does not matter too much, as long as they don't fire too
+ * early.
+ */
+unsigned long round_jiffies_up(unsigned long j)
+{
+ return round_jiffies_common(j, raw_smp_processor_id(), true);
+}
+EXPORT_SYMBOL_GPL(round_jiffies_up);
+
+/**
+ * round_jiffies_up_relative - function to round jiffies up to a full second
+ * @j: the time in (relative) jiffies that should be rounded
+ *
+ * This is the same as round_jiffies_relative() except that it will never
+ * round down. This is useful for timeouts for which the exact time
+ * of firing does not matter too much, as long as they don't fire too
+ * early.
+ */
+unsigned long round_jiffies_up_relative(unsigned long j)
+{
+ return __round_jiffies_up_relative(j, raw_smp_processor_id());
+}
+EXPORT_SYMBOL_GPL(round_jiffies_up_relative);
+
static inline void set_running_timer(struct tvec_base *base,
struct timer_list *timer)
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 3f338063864..2f76193c348 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1060,7 +1060,7 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
/* Did the write stamp get updated already? */
if (unlikely(ts < cpu_buffer->write_stamp))
- goto again;
+ delta = 0;
if (test_time_stamp(delta)) {
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 5c97c5b4ea8..ffe7c96fa09 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1755,7 +1755,7 @@ static enum print_line_t print_bin_fmt(struct trace_iterator *iter)
return TRACE_TYPE_HANDLED;
SEQ_PUT_FIELD_RET(s, entry->pid);
- SEQ_PUT_FIELD_RET(s, iter->cpu);
+ SEQ_PUT_FIELD_RET(s, entry->cpu);
SEQ_PUT_FIELD_RET(s, iter->ts);
switch (entry->type) {
@@ -2676,7 +2676,7 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
{
unsigned long val;
char buf[64];
- int ret;
+ int ret, cpu;
struct trace_array *tr = filp->private_data;
if (cnt >= sizeof(buf))
@@ -2704,6 +2704,14 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
goto out;
}
+ /* disable all cpu buffers */
+ for_each_tracing_cpu(cpu) {
+ if (global_trace.data[cpu])
+ atomic_inc(&global_trace.data[cpu]->disabled);
+ if (max_tr.data[cpu])
+ atomic_inc(&max_tr.data[cpu]->disabled);
+ }
+
if (val != global_trace.entries) {
ret = ring_buffer_resize(global_trace.buffer, val);
if (ret < 0) {
@@ -2735,6 +2743,13 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
if (tracing_disabled)
cnt = -ENOMEM;
out:
+ for_each_tracing_cpu(cpu) {
+ if (global_trace.data[cpu])
+ atomic_dec(&global_trace.data[cpu]->disabled);
+ if (max_tr.data[cpu])
+ atomic_dec(&max_tr.data[cpu]->disabled);
+ }
+
max_tr.entries = global_trace.entries;
mutex_unlock(&trace_types_lock);
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index f12ab5c4dec..4952322cba4 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -970,6 +970,51 @@ undo:
return ret;
}
+#ifdef CONFIG_SMP
+struct work_for_cpu {
+ struct work_struct work;
+ long (*fn)(void *);
+ void *arg;
+ long ret;
+};
+
+static void do_work_for_cpu(struct work_struct *w)
+{
+ struct work_for_cpu *wfc = container_of(w, struct work_for_cpu, work);
+
+ wfc->ret = wfc->fn(wfc->arg);
+}
+
+/**
+ * work_on_cpu - run a function in user context on a particular cpu
+ * @cpu: the cpu to run on
+ * @fn: the function to run
+ * @arg: the function arg
+ *
+ * This will return -EINVAL in the cpu is not online, or the return value
+ * of @fn otherwise.
+ */
+long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg)
+{
+ struct work_for_cpu wfc;
+
+ INIT_WORK(&wfc.work, do_work_for_cpu);
+ wfc.fn = fn;
+ wfc.arg = arg;
+ get_online_cpus();
+ if (unlikely(!cpu_online(cpu)))
+ wfc.ret = -EINVAL;
+ else {
+ schedule_work_on(cpu, &wfc.work);
+ flush_work(&wfc.work);
+ }
+ put_online_cpus();
+
+ return wfc.ret;
+}
+EXPORT_SYMBOL_GPL(work_on_cpu);
+#endif /* CONFIG_SMP */
+
void __init init_workqueues(void)
{
cpu_populated_map = cpu_online_map;