diff options
author | Jeff Garzik <jeff@garzik.org> | 2007-02-17 15:11:43 -0500 |
---|---|---|
committer | Jeff Garzik <jeff@garzik.org> | 2007-02-17 15:11:43 -0500 |
commit | f630fe2817601314b2eb7ca5ddc23c7834646731 (patch) | |
tree | 3bfb4939b7bbc3859575ca8b58fa3f929b015941 /kernel/time | |
parent | 48c871c1f6a7c7044dd76774fb469e65c7e2e4e8 (diff) | |
parent | 8a03d9a498eaf02c8a118752050a5154852c13bf (diff) |
Merge branch 'master' into upstream
Diffstat (limited to 'kernel/time')
-rw-r--r-- | kernel/time/Kconfig | 25 | ||||
-rw-r--r-- | kernel/time/Makefile | 9 | ||||
-rw-r--r-- | kernel/time/clockevents.c | 345 | ||||
-rw-r--r-- | kernel/time/clocksource.c | 246 | ||||
-rw-r--r-- | kernel/time/jiffies.c | 1 | ||||
-rw-r--r-- | kernel/time/ntp.c | 30 | ||||
-rw-r--r-- | kernel/time/tick-broadcast.c | 480 | ||||
-rw-r--r-- | kernel/time/tick-common.c | 346 | ||||
-rw-r--r-- | kernel/time/tick-internal.h | 110 | ||||
-rw-r--r-- | kernel/time/tick-oneshot.c | 84 | ||||
-rw-r--r-- | kernel/time/tick-sched.c | 563 | ||||
-rw-r--r-- | kernel/time/timer_list.c | 287 | ||||
-rw-r--r-- | kernel/time/timer_stats.c | 411 |
13 files changed, 2862 insertions, 75 deletions
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig new file mode 100644 index 00000000000..f6635112654 --- /dev/null +++ b/kernel/time/Kconfig @@ -0,0 +1,25 @@ +# +# Timer subsystem related configuration options +# +config TICK_ONESHOT + bool + default n + +config NO_HZ + bool "Tickless System (Dynamic Ticks)" + depends on GENERIC_TIME && GENERIC_CLOCKEVENTS + select TICK_ONESHOT + help + This option enables a tickless system: timer interrupts will + only trigger on an as-needed basis both when the system is + busy and when the system is idle. + +config HIGH_RES_TIMERS + bool "High Resolution Timer Support" + depends on GENERIC_TIME && GENERIC_CLOCKEVENTS + select TICK_ONESHOT + help + This option enables high resolution timer support. If your + hardware is not capable then this option only increases + the size of the kernel image. + diff --git a/kernel/time/Makefile b/kernel/time/Makefile index 61a3907d16f..93bccba1f26 100644 --- a/kernel/time/Makefile +++ b/kernel/time/Makefile @@ -1 +1,8 @@ -obj-y += ntp.o clocksource.o jiffies.o +obj-y += ntp.o clocksource.o jiffies.o timer_list.o + +obj-$(CONFIG_GENERIC_CLOCKEVENTS) += clockevents.o +obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o +obj-$(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) += tick-broadcast.o +obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o +obj-$(CONFIG_TICK_ONESHOT) += tick-sched.o +obj-$(CONFIG_TIMER_STATS) += timer_stats.o diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c new file mode 100644 index 00000000000..67932ea78c1 --- /dev/null +++ b/kernel/time/clockevents.c @@ -0,0 +1,345 @@ +/* + * linux/kernel/time/clockevents.c + * + * This file contains functions which manage clock event devices. + * + * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de> + * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar + * Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner + * + * This code is licenced under the GPL version 2. For details see + * kernel-base/COPYING. + */ + +#include <linux/clockchips.h> +#include <linux/hrtimer.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/notifier.h> +#include <linux/smp.h> +#include <linux/sysdev.h> + +/* The registered clock event devices */ +static LIST_HEAD(clockevent_devices); +static LIST_HEAD(clockevents_released); + +/* Notification for clock events */ +static RAW_NOTIFIER_HEAD(clockevents_chain); + +/* Protection for the above */ +static DEFINE_SPINLOCK(clockevents_lock); + +/** + * clockevents_delta2ns - Convert a latch value (device ticks) to nanoseconds + * @latch: value to convert + * @evt: pointer to clock event device descriptor + * + * Math helper, returns latch value converted to nanoseconds (bound checked) + */ +unsigned long clockevent_delta2ns(unsigned long latch, + struct clock_event_device *evt) +{ + u64 clc = ((u64) latch << evt->shift); + + do_div(clc, evt->mult); + if (clc < 1000) + clc = 1000; + if (clc > LONG_MAX) + clc = LONG_MAX; + + return (unsigned long) clc; +} + +/** + * clockevents_set_mode - set the operating mode of a clock event device + * @dev: device to modify + * @mode: new mode + * + * Must be called with interrupts disabled ! + */ +void clockevents_set_mode(struct clock_event_device *dev, + enum clock_event_mode mode) +{ + if (dev->mode != mode) { + dev->set_mode(mode, dev); + dev->mode = mode; + } +} + +/** + * clockevents_program_event - Reprogram the clock event device. + * @expires: absolute expiry time (monotonic clock) + * + * Returns 0 on success, -ETIME when the event is in the past. + */ +int clockevents_program_event(struct clock_event_device *dev, ktime_t expires, + ktime_t now) +{ + unsigned long long clc; + int64_t delta; + + delta = ktime_to_ns(ktime_sub(expires, now)); + + if (delta <= 0) + return -ETIME; + + dev->next_event = expires; + + if (dev->mode == CLOCK_EVT_MODE_SHUTDOWN) + return 0; + + if (delta > dev->max_delta_ns) + delta = dev->max_delta_ns; + if (delta < dev->min_delta_ns) + delta = dev->min_delta_ns; + + clc = delta * dev->mult; + clc >>= dev->shift; + + return dev->set_next_event((unsigned long) clc, dev); +} + +/** + * clockevents_register_notifier - register a clock events change listener + */ +int clockevents_register_notifier(struct notifier_block *nb) +{ + int ret; + + spin_lock(&clockevents_lock); + ret = raw_notifier_chain_register(&clockevents_chain, nb); + spin_unlock(&clockevents_lock); + + return ret; +} + +/** + * clockevents_unregister_notifier - unregister a clock events change listener + */ +void clockevents_unregister_notifier(struct notifier_block *nb) +{ + spin_lock(&clockevents_lock); + raw_notifier_chain_unregister(&clockevents_chain, nb); + spin_unlock(&clockevents_lock); +} + +/* + * Notify about a clock event change. Called with clockevents_lock + * held. + */ +static void clockevents_do_notify(unsigned long reason, void *dev) +{ + raw_notifier_call_chain(&clockevents_chain, reason, dev); +} + +/* + * Called after a notify add to make devices availble which were + * released from the notifier call. + */ +static void clockevents_notify_released(void) +{ + struct clock_event_device *dev; + + while (!list_empty(&clockevents_released)) { + dev = list_entry(clockevents_released.next, + struct clock_event_device, list); + list_del(&dev->list); + list_add(&dev->list, &clockevent_devices); + clockevents_do_notify(CLOCK_EVT_NOTIFY_ADD, dev); + } +} + +/** + * clockevents_register_device - register a clock event device + * @dev: device to register + */ +void clockevents_register_device(struct clock_event_device *dev) +{ + BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED); + + spin_lock(&clockevents_lock); + + list_add(&dev->list, &clockevent_devices); + clockevents_do_notify(CLOCK_EVT_NOTIFY_ADD, dev); + clockevents_notify_released(); + + spin_unlock(&clockevents_lock); +} + +/* + * Noop handler when we shut down an event device + */ +static void clockevents_handle_noop(struct clock_event_device *dev) +{ +} + +/** + * clockevents_exchange_device - release and request clock devices + * @old: device to release (can be NULL) + * @new: device to request (can be NULL) + * + * Called from the notifier chain. clockevents_lock is held already + */ +void clockevents_exchange_device(struct clock_event_device *old, + struct clock_event_device *new) +{ + unsigned long flags; + + local_irq_save(flags); + /* + * Caller releases a clock event device. We queue it into the + * released list and do a notify add later. + */ + if (old) { + old->event_handler = clockevents_handle_noop; + clockevents_set_mode(old, CLOCK_EVT_MODE_UNUSED); + list_del(&old->list); + list_add(&old->list, &clockevents_released); + } + + if (new) { + BUG_ON(new->mode != CLOCK_EVT_MODE_UNUSED); + clockevents_set_mode(new, CLOCK_EVT_MODE_SHUTDOWN); + } + local_irq_restore(flags); +} + +/** + * clockevents_request_device + */ +struct clock_event_device *clockevents_request_device(unsigned int features, + cpumask_t cpumask) +{ + struct clock_event_device *cur, *dev = NULL; + struct list_head *tmp; + + spin_lock(&clockevents_lock); + + list_for_each(tmp, &clockevent_devices) { + cur = list_entry(tmp, struct clock_event_device, list); + + if ((cur->features & features) == features && + cpus_equal(cpumask, cur->cpumask)) { + if (!dev || dev->rating < cur->rating) + dev = cur; + } + } + + clockevents_exchange_device(NULL, dev); + + spin_unlock(&clockevents_lock); + + return dev; +} + +/** + * clockevents_release_device + */ +void clockevents_release_device(struct clock_event_device *dev) +{ + spin_lock(&clockevents_lock); + + clockevents_exchange_device(dev, NULL); + clockevents_notify_released(); + + spin_unlock(&clockevents_lock); +} + +/** + * clockevents_notify - notification about relevant events + */ +void clockevents_notify(unsigned long reason, void *arg) +{ + spin_lock(&clockevents_lock); + clockevents_do_notify(reason, arg); + + switch (reason) { + case CLOCK_EVT_NOTIFY_CPU_DEAD: + /* + * Unregister the clock event devices which were + * released from the users in the notify chain. + */ + while (!list_empty(&clockevents_released)) { + struct clock_event_device *dev; + + dev = list_entry(clockevents_released.next, + struct clock_event_device, list); + list_del(&dev->list); + } + break; + default: + break; + } + spin_unlock(&clockevents_lock); +} +EXPORT_SYMBOL_GPL(clockevents_notify); + +#ifdef CONFIG_SYSFS + +/** + * clockevents_show_registered - sysfs interface for listing clockevents + * @dev: unused + * @buf: char buffer to be filled with clock events list + * + * Provides sysfs interface for listing registered clock event devices + */ +static ssize_t clockevents_show_registered(struct sys_device *dev, char *buf) +{ + struct list_head *tmp; + char *p = buf; + int cpu; + + spin_lock(&clockevents_lock); + + list_for_each(tmp, &clockevent_devices) { + struct clock_event_device *ce; + + ce = list_entry(tmp, struct clock_event_device, list); + p += sprintf(p, "%-20s F:%04x M:%d", ce->name, + ce->features, ce->mode); + p += sprintf(p, " C:"); + if (!cpus_equal(ce->cpumask, cpu_possible_map)) { + for_each_cpu_mask(cpu, ce->cpumask) + p += sprintf(p, " %d", cpu); + } else { + /* + * FIXME: Add the cpu which is handling this sucker + */ + } + p += sprintf(p, "\n"); + } + + spin_unlock(&clockevents_lock); + + return p - buf; +} + +/* + * Sysfs setup bits: + */ +static SYSDEV_ATTR(registered, 0600, + clockevents_show_registered, NULL); + +static struct sysdev_class clockevents_sysclass = { + set_kset_name("clockevents"), +}; + +static struct sys_device clockevents_sys_device = { + .id = 0, + .cls = &clockevents_sysclass, +}; + +static int __init clockevents_sysfs_init(void) +{ + int error = sysdev_class_register(&clockevents_sysclass); + + if (!error) + error = sysdev_register(&clockevents_sys_device); + if (!error) + error = sysdev_create_file( + &clockevents_sys_device, + &attr_registered); + return error; +} +device_initcall(clockevents_sysfs_init); +#endif diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index d9ef176c4e0..193a0793af9 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -29,6 +29,7 @@ #include <linux/init.h> #include <linux/module.h> #include <linux/sched.h> /* for spin_unlock_irq() using preempt_count() m68k */ +#include <linux/tick.h> /* XXX - Would like a better way for initializing curr_clocksource */ extern struct clocksource clocksource_jiffies; @@ -48,6 +49,7 @@ extern struct clocksource clocksource_jiffies; */ static struct clocksource *curr_clocksource = &clocksource_jiffies; static struct clocksource *next_clocksource; +static struct clocksource *clocksource_override; static LIST_HEAD(clocksource_list); static DEFINE_SPINLOCK(clocksource_lock); static char override_name[32]; @@ -62,9 +64,123 @@ static int __init clocksource_done_booting(void) finished_booting = 1; return 0; } - late_initcall(clocksource_done_booting); +#ifdef CONFIG_CLOCKSOURCE_WATCHDOG +static LIST_HEAD(watchdog_list); +static struct clocksource *watchdog; +static struct timer_list watchdog_timer; +static DEFINE_SPINLOCK(watchdog_lock); +static cycle_t watchdog_last; +/* + * Interval: 0.5sec Treshold: 0.0625s + */ +#define WATCHDOG_INTERVAL (HZ >> 1) +#define WATCHDOG_TRESHOLD (NSEC_PER_SEC >> 4) + +static void clocksource_ratewd(struct clocksource *cs, int64_t delta) +{ + if (delta > -WATCHDOG_TRESHOLD && delta < WATCHDOG_TRESHOLD) + return; + + printk(KERN_WARNING "Clocksource %s unstable (delta = %Ld ns)\n", + cs->name, delta); + cs->flags &= ~(CLOCK_SOURCE_VALID_FOR_HRES | CLOCK_SOURCE_WATCHDOG); + clocksource_change_rating(cs, 0); + cs->flags &= ~CLOCK_SOURCE_WATCHDOG; + list_del(&cs->wd_list); +} + +static void clocksource_watchdog(unsigned long data) +{ + struct clocksource *cs, *tmp; + cycle_t csnow, wdnow; + int64_t wd_nsec, cs_nsec; + + spin_lock(&watchdog_lock); + + wdnow = watchdog->read(); + wd_nsec = cyc2ns(watchdog, (wdnow - watchdog_last) & watchdog->mask); + watchdog_last = wdnow; + + list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list) { + csnow = cs->read(); + /* Initialized ? */ + if (!(cs->flags & CLOCK_SOURCE_WATCHDOG)) { + if ((cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) && + (watchdog->flags & CLOCK_SOURCE_IS_CONTINUOUS)) { + cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES; + /* + * We just marked the clocksource as + * highres-capable, notify the rest of the + * system as well so that we transition + * into high-res mode: + */ + tick_clock_notify(); + } + cs->flags |= CLOCK_SOURCE_WATCHDOG; + cs->wd_last = csnow; + } else { + cs_nsec = cyc2ns(cs, (csnow - cs->wd_last) & cs->mask); + cs->wd_last = csnow; + /* Check the delta. Might remove from the list ! */ + clocksource_ratewd(cs, cs_nsec - wd_nsec); + } + } + + if (!list_empty(&watchdog_list)) { + __mod_timer(&watchdog_timer, + watchdog_timer.expires + WATCHDOG_INTERVAL); + } + spin_unlock(&watchdog_lock); +} +static void clocksource_check_watchdog(struct clocksource *cs) +{ + struct clocksource *cse; + unsigned long flags; + + spin_lock_irqsave(&watchdog_lock, flags); + if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) { + int started = !list_empty(&watchdog_list); + + list_add(&cs->wd_list, &watchdog_list); + if (!started && watchdog) { + watchdog_last = watchdog->read(); + watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL; + add_timer(&watchdog_timer); + } + } else if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) { + cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES; + + if (!watchdog || cs->rating > watchdog->rating) { + if (watchdog) + del_timer(&watchdog_timer); + watchdog = cs; + init_timer(&watchdog_timer); + watchdog_timer.function = clocksource_watchdog; + + /* Reset watchdog cycles */ + list_for_each_entry(cse, &watchdog_list, wd_list) + cse->flags &= ~CLOCK_SOURCE_WATCHDOG; + /* Start if list is not empty */ + if (!list_empty(&watchdog_list)) { + watchdog_last = watchdog->read(); + watchdog_timer.expires = + jiffies + WATCHDOG_INTERVAL; + add_timer(&watchdog_timer); + } + } + } + spin_unlock_irqrestore(&watchdog_lock, flags); +} +#else +static void clocksource_check_watchdog(struct clocksource *cs) +{ + if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) + cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES; +} +#endif + /** * clocksource_get_next - Returns the selected clocksource * @@ -84,60 +200,54 @@ struct clocksource *clocksource_get_next(void) } /** - * select_clocksource - Finds the best registered clocksource. + * select_clocksource - Selects the best registered clocksource. * * Private function. Must hold clocksource_lock when called. * - * Looks through the list of registered clocksources, returning - * the one with the highest rating value. If there is a clocksource - * name that matches the override string, it returns that clocksource. + * Select the clocksource with the best rating, or the clocksource, + * which is selected by userspace override. */ static struct clocksource *select_clocksource(void) { - struct clocksource *best = NULL; - struct list_head *tmp; + struct clocksource *next; - list_for_each(tmp, &clocksource_list) { - struct clocksource *src; + if (list_empty(&clocksource_list)) + return NULL; - src = list_entry(tmp, struct clocksource, list); - if (!best) - best = src; - - /* check for override: */ - if (strlen(src->name) == strlen(override_name) && - !strcmp(src->name, override_name)) { - best = src; - break; - } - /* pick the highest rating: */ - if (src->rating > best->rating) - best = src; - } + if (clocksource_override) + next = clocksource_override; + else + next = list_entry(clocksource_list.next, struct clocksource, + list); + + if (next == curr_clocksource) + return NULL; - return best; + return next; } -/** - * is_registered_source - Checks if clocksource is registered - * @c: pointer to a clocksource - * - * Private helper function. Must hold clocksource_lock when called. - * - * Returns one if the clocksource is already registered, zero otherwise. +/* + * Enqueue the clocksource sorted by rating */ -static int is_registered_source(struct clocksource *c) +static int clocksource_enqueue(struct clocksource *c) { - int len = strlen(c->name); - struct list_head *tmp; + struct list_head *tmp, *entry = &clocksource_list; list_for_each(tmp, &clocksource_list) { - struct clocksource *src; - - src = list_entry(tmp, struct clocksource, list); - if (strlen(src->name) == len && !strcmp(src->name, c->name)) - return 1; + struct clocksource *cs; + + cs = list_entry(tmp, struct clocksource, list); + if (cs == c) + return -EBUSY; + /* Keep track of the place, where to insert */ + if (cs->rating >= c->rating) + entry = tmp; } + list_add(&c->list, entry); + + if (strlen(c->name) == strlen(override_name) && + !strcmp(c->name, override_name)) + clocksource_override = c; return 0; } @@ -150,42 +260,35 @@ static int is_registered_source(struct clocksource *c) */ int clocksource_register(struct clocksource *c) { - int ret = 0; unsigned long flags; + int ret; spin_lock_irqsave(&clocksource_lock, flags); - /* check if clocksource is already registered */ - if (is_registered_source(c)) { - printk("register_clocksource: Cannot register %s. " - "Already registered!", c->name); - ret = -EBUSY; - } else { - /* register it */ - list_add(&c->list, &clocksource_list); - /* scan the registered clocksources, and pick the best one */ + ret = clocksource_enqueue(c); + if (!ret) next_clocksource = select_clocksource(); - } spin_unlock_irqrestore(&clocksource_lock, flags); + if (!ret) + clocksource_check_watchdog(c); return ret; } EXPORT_SYMBOL(clocksource_register); /** - * clocksource_reselect - Rescan list for next clocksource + * clocksource_change_rating - Change the rating of a registered clocksource * - * A quick helper function to be used if a clocksource changes its - * rating. Forces the clocksource list to be re-scanned for the best - * clocksource. */ -void clocksource_reselect(void) +void clocksource_change_rating(struct clocksource *cs, int rating) { unsigned long flags; spin_lock_irqsave(&clocksource_lock, flags); + list_del(&cs->list); + cs->rating = rating; + clocksource_enqueue(cs); next_clocksource = select_clocksource(); spin_unlock_irqrestore(&clocksource_lock, flags); } -EXPORT_SYMBOL(clocksource_reselect); #ifdef CONFIG_SYSFS /** @@ -221,7 +324,11 @@ sysfs_show_current_clocksources(struct sys_device *dev, char *buf) static ssize_t sysfs_override_clocksource(struct sys_device *dev, const char *buf, size_t count) { + struct clocksource *ovr = NULL; + struct list_head *tmp; size_t ret = count; + int len; + /* strings from sysfs write are not 0 terminated! */ if (count >= sizeof(override_name)) return -EINVAL; @@ -229,17 +336,32 @@ static ssize_t sysfs_override_clocksource(struct sys_device *dev, /* strip of \n: */ if (buf[count-1] == '\n') count--; - if (count < 1) - return -EINVAL; spin_lock_irq(&clocksource_lock); - /* copy the name given: */ - memcpy(override_name, buf, count); + if (count > 0) + memcpy(override_name, buf, count); override_name[count] = 0; - /* try to select it: */ - next_clocksource = select_clocksource(); + len = strlen(override_name); + if (len) { + ovr = clocksource_override; + /* try to select it: */ + list_for_each(tmp, &clocksource_list) { + struct clocksource *cs; + + cs = list_entry(tmp, struct clocksource, list); + if (strlen(cs->name) == len && + !strcmp(cs->name, override_name)) + ovr = cs; + } + } + + /* Reselect, when the override name has changed */ + if (ovr != clocksource_override) { + clocksource_override = ovr; + next_clocksource = select_clocksource(); + } spin_unlock_irq(&clocksource_lock); diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index a99b2a6e6a0..3be8da8fed7 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c @@ -62,7 +62,6 @@ struct clocksource clocksource_jiffies = { .mask = 0xffffffff, /*32bits*/ .mult = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */ .shift = JIFFIES_SHIFT, - .is_continuous = 0, /* tick based, not free running */ }; static int __init init_jiffies_clocksource(void) diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 3afeaa3a73f..eb12509e00b 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -24,7 +24,7 @@ static u64 tick_length, tick_length_base; #define MAX_TICKADJ 500 /* microsecs */ #define MAX_TICKADJ_SCALED (((u64)(MAX_TICKADJ * NSEC_PER_USEC) << \ - TICK_LENGTH_SHIFT) / HZ) + TICK_LENGTH_SHIFT) / NTP_INTERVAL_FREQ) /* * phase-lock loop variables @@ -46,13 +46,17 @@ long time_adjust; static void ntp_update_frequency(void) { - tick_length_base = (u64)(tick_usec * NSEC_PER_USEC * USER_HZ) << TICK_LENGTH_SHIFT; - tick_length_base += (s64)CLOCK_TICK_ADJUST << TICK_LENGTH_SHIFT; - tick_length_base += (s64)time_freq << (TICK_LENGTH_SHIFT - SHIFT_NSEC); + u64 second_length = (u64)(tick_usec * NSEC_PER_USEC * USER_HZ) + << TICK_LENGTH_SHIFT; + second_length += (s64)CLOCK_TICK_ADJUST << TICK_LENGTH_SHIFT; + second_length += (s64)time_freq << (TICK_LENGTH_SHIFT - SHIFT_NSEC); - do_div(tick_length_base, HZ); + tick_length_base = second_length; - tick_nsec = tick_length_base >> TICK_LENGTH_SHIFT; + do_div(second_length, HZ); + tick_nsec = second_length >> TICK_LENGTH_SHIFT; + + do_div(tick_length_base, NTP_INTERVAL_FREQ); } /** @@ -162,7 +166,7 @@ void second_overflow(void) tick_length -= MAX_TICKADJ_SCALED; } else { tick_length += (s64)(time_adjust * NSEC_PER_USEC / - HZ) << TICK_LENGTH_SHIFT; + NTP_INTERVAL_FREQ) << TICK_LENGTH_SHIFT; time_adjust = 0; } } @@ -239,7 +243,8 @@ int do_adjtimex(struct timex *txc) result = -EINVAL; goto leave; } - time_freq = ((s64)txc->freq * NSEC_PER_USEC) >> (SHIFT_USEC - SHIFT_NSEC); + time_freq = ((s64)txc->freq * NSEC_PER_USEC) + >> (SHIFT_USEC - SHIFT_NSEC); } if (txc->modes & ADJ_MAXERROR) { @@ -309,7 +314,8 @@ int do_adjtimex(struct timex *txc) freq_adj += time_freq; freq_adj = min(freq_adj, (s64)MAXFREQ_NSEC); time_freq = max(freq_adj, (s64)-MAXFREQ_NSEC); - time_offset = (time_offset / HZ) << SHIFT_UPDATE; + time_offset = (time_offset / NTP_INTERVAL_FREQ) + << SHIFT_UPDATE; } /* STA_PLL */ } /* txc->modes & ADJ_OFFSET */ if (txc->modes & ADJ_TICK) @@ -324,8 +330,10 @@ leave: if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0) if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT) txc->offset = save_adjust; else - txc->offset = shift_right(time_offset, SHIFT_UPDATE) * HZ / 1000; - txc->freq = (time_freq / NSEC_PER_USEC) << (SHIFT_USEC - SHIFT_NSEC); + txc->offset = shift_right(time_offset, SHIFT_UPDATE) + * NTP_INTERVAL_FREQ / 1000; + txc->freq = (time_freq / NSEC_PER_USEC) + << (SHIFT_USEC - SHIFT_NSEC); txc->maxerror = time_maxerror; txc->esterror = time_esterror; txc->status = time_status; diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c new file mode 100644 index 00000000000..12b3efeb9f6 --- /dev/null +++ b/kernel/time/tick-broadcast.c @@ -0,0 +1,480 @@ +/* + * linux/kernel/time/tick-broadcast.c + * + * This file contains functions which emulate a local clock-event + * device via a broadcast event source. + * + * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de> + * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar + * Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner + * + * This code is licenced under the GPL version 2. For details see + * kernel-base/COPYING. + */ +#include <linux/cpu.h> +#include <linux/err.h> +#include <linux/hrtimer.h> +#include <linux/irq.h> +#include <linux/percpu.h> +#include <linux/profile.h> +#include <linux/sched.h> +#include <linux/tick.h> + +#include "tick-internal.h" + +/* + * Broadcast support for broken x86 hardware, where the local apic + * timer stops in C3 state. + */ + +struct tick_device tick_broadcast_device; +static cpumask_t tick_broadcast_mask; +static DEFINE_SPINLOCK(tick_broadcast_lock); + +/* + * Debugging: see timer_list.c + */ +struct tick_device *tick_get_broadcast_device(void) +{ + return &tick_broadcast_device; +} + +cpumask_t *tick_get_broadcast_mask(void) +{ + return &tick_broadcast_mask; +} + +/* + * Start the device in periodic mode + */ +static void tick_broadcast_start_periodic(struct clock_event_device *bc) +{ + if (bc && bc->mode == CLOCK_EVT_MODE_SHUTDOWN) + tick_setup_periodic(bc, 1); +} + +/* + * Check, if the device can be utilized as broadcast device: + */ +int tick_check_broadcast_device(struct clock_event_device *dev) +{ + if (tick_broadcast_device.evtdev || + (dev->features & CLOCK_EVT_FEAT_C3STOP)) + return 0; + + clockevents_exchange_device(NULL, dev); + tick_broadcast_device.evtdev = dev; + if (!cpus_empty(tick_broadcast_mask)) + tick_broadcast_start_periodic(dev); + return 1; +} + +/* + * Check, if the device is the broadcast device + */ +int tick_is_broadcast_device(struct clock_event_device *dev) +{ + return (dev && tick_broadcast_device.evtdev == dev); +} + +/* + * Check, if the device is disfunctional and a place holder, which + * needs to be handled by the broadcast device. + */ +int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu) +{ + unsigned long flags; + int ret = 0; + + spin_lock_irqsave(&tick_broadcast_lock, flags); + + /* + * Devices might be registered with both periodic and oneshot + * mode disabled. This signals, that the device needs to be + * operated from the broadcast device and is a placeholder for + * the cpu local device. + */ + if (!tick_device_is_functional(dev)) { + dev->event_handler = tick_handle_periodic; + cpu_set(cpu, tick_broadcast_mask); + tick_broadcast_start_periodic(tick_broadcast_device.evtdev); + ret = 1; + } + + spin_unlock_irqrestore(&tick_broadcast_lock, flags); + return ret; +} + +/* + * Broadcast the event to the cpus, which are set in the mask + */ +int tick_do_broadcast(cpumask_t mask) +{ + int ret = 0, cpu = smp_processor_id(); + struct tick_device *td; + + /* + * Check, if the current cpu is in the mask + */ + if (cpu_isset(cpu, mask)) { + cpu_clear(cpu, mask); + td = &per_cpu(tick_cpu_device, cpu); + td->evtdev->event_handler(td->evtdev); + ret = 1; + } + + if (!cpus_empty(mask)) { + /* + * It might be necessary to actually check whether the devices + * have different broadcast functions. For now, just use the + * one of the first device. This works as long as we have this + * misfeature only on x86 (lapic) + */ + cpu = first_cpu(mask); + td = &per_cpu(tick_cpu_device, cpu); + td->evtdev->broadcast(mask); + ret = 1; + } + return ret; +} + +/* + * Periodic broadcast: + * - invoke the broadcast handlers + */ +static void tick_do_periodic_broadcast(void) +{ + cpumask_t mask; + + spin_lock(&tick_broadcast_lock); + + cpus_and(mask, cpu_online_map, tick_broadcast_mask); + tick_do_broadcast(mask); + + spin_unlock(&tick_broadcast_lock); +} + +/* + * Event handler for periodic broadcast ticks + */ +static void tick_handle_periodic_broadcast(struct clock_event_device *dev) +{ + dev->next_event.tv64 = KTIME_MAX; + + tick_do_periodic_broadcast(); + + /* + * The device is in periodic mode. No reprogramming necessary: + */ + if (dev->mode == CLOCK_EVT_MODE_PERIODIC) + return; + + /* + * Setup the next period for devices, which do not have + * periodic mode: + */ + for (;;) { + ktime_t next = ktime_add(dev->next_event, tick_period); + + if (!clockevents_program_event(dev, next, ktime_get())) + return; + tick_do_periodic_broadcast(); + } +} + +/* + * Powerstate information: The system enters/leaves a state, where + * affected devices might stop + */ +static void tick_do_broadcast_on_off(void *why) +{ + struct clock_event_device *bc, *dev; + struct tick_device *td; + unsigned long flags, *reason = why; + int cpu; + + spin_lock_irqsave(&tick_broadcast_lock, flags); + + cpu = smp_processor_id(); + td = &per_cpu(tick_cpu_device, cpu); + dev = td->evtdev; + bc = tick_broadcast_device.evtdev; + + /* + * Is the device in broadcast mode forever or is it not + * affected by the powerstate ? + */ + if (!dev || !tick_device_is_functional(dev) || + !(dev->features & CLOCK_EVT_FEAT_C3STOP)) + goto out; + + if (*reason == CLOCK_EVT_NOTIFY_BROADCAST_ON) { + if (!cpu_isset(cpu, tick_broadcast_mask)) { + cpu_set(cpu, tick_broadcast_mask); + if (td->mode == TICKDEV_MODE_PERIODIC) + clockevents_set_mode(dev, + CLOCK_EVT_MODE_SHUTDOWN); + } + } else { + if (cpu_isset(cpu, tick_broadcast_mask)) { + cpu_clear(cpu, tick_broadcast_mask); + if (td->mode == TICKDEV_MODE_PERIODIC) + tick_setup_periodic(dev, 0); + } + } + + if (cpus_empty(tick_broadcast_mask)) + clockevents_set_mode(bc, CLOCK_EVT_MODE_SHUTDOWN); + else { + if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) + tick_broadcast_start_periodic(bc); + else + tick_broadcast_setup_oneshot(bc); + } +out: + spin_unlock_irqrestore(&tick_broadcast_lock, flags); +} + +/* + * Powerstate information: The system enters/leaves a state, where + * affected devices might stop. + */ +void tick_broadcast_on_off(unsigned long reason, int *oncpu) +{ + int cpu = get_cpu(); + + if (cpu == *oncpu) + tick_do_broadcast_on_off(&reason); + else + smp_call_function_single(*oncpu, tick_do_broadcast_on_off, + &reason, 1, 1); + put_cpu(); +} + +/* + * Set the periodic handler depending on broadcast on/off + */ +void tick_set_periodic_handler(struct clock_event_device *dev, int broadcast) +{ + if (!broadcast) + dev->event_handler = tick_handle_periodic; + else + dev->event_handler = tick_handle_periodic_broadcast; +} + +/* + * Remove a CPU from broadcasting + */ +void tick_shutdown_broadcast(unsigned int *cpup) +{ + struct clock_event_device *bc; + unsigned long flags; + unsigned int cpu = *cpup; + + spin_lock_irqsave(&tick_broadcast_lock, flags); + + bc = tick_broadcast_device.evtdev; + cpu_clear(cpu, tick_broadcast_mask); + + if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) { + if (bc && cpus_empty(tick_broadcast_mask)) + clockevents_set_mode(bc, CLOCK_EVT_MODE_SHUTDOWN); + } + + spin_unlock_irqrestore(&tick_broadcast_lock, flags); +} + +#ifdef CONFIG_TICK_ONESHOT + +static cpumask_t tick_broadcast_oneshot_mask; + +/* + * Debugging: see timer_list.c + */ +cpumask_t *tick_get_broadcast_oneshot_mask(void) +{ + return &tick_broadcast_oneshot_mask; +} + +static int tick_broadcast_set_event(ktime_t expires, int force) +{ + struct clock_event_device *bc = tick_broadcast_device.evtdev; + ktime_t now = ktime_get(); + int res; + + for(;;) { + res = clockevents_program_event(bc, expires, now); + if (!res || !force) + return res; + now = ktime_get(); + expires = ktime_add(now, ktime_set(0, bc->min_delta_ns)); + } +} + +/* + * Reprogram the broadcast device: + * + * Called with tick_broadcast_lock held and interrupts disabled. + */ +static int tick_broadcast_reprogram(void) +{ + ktime_t expires = { .tv64 = KTIME_MAX }; + struct tick_device *td; + int cpu; + + /* + * Find the event which expires next: + */ + for (cpu = first_cpu(tick_broadcast_oneshot_mask); cpu != NR_CPUS; + cpu = next_cpu(cpu, tick_broadcast_oneshot_mask)) { + td = &per_cpu(tick_cpu_device, cpu); + if (td->evtdev->next_event.tv64 < expires.tv64) + expires = td->evtdev->next_event; + } + + if (expires.tv64 == KTIME_MAX) + return 0; + + return tick_broadcast_set_event(expires, 0); +} + +/* + * Handle oneshot mode broadcasting + */ +static void tick_handle_oneshot_broadcast(struct clock_event_device *dev) +{ + struct tick_device *td; + cpumask_t mask; + ktime_t now; + int cpu; + + spin_lock(&tick_broadcast_lock); +again: + dev->next_event.tv64 = KTIME_MAX; + mask = CPU_MASK_NONE; + now = ktime_get(); + /* Find all expired events */ + for (cpu = first_cpu(tick_broadcast_oneshot_mask); cpu != NR_CPUS; + cpu = next_cpu(cpu, tick_broadcast_oneshot_mask)) { + td = &per_cpu(tick_cpu_device, cpu); + if (td->evtdev->next_event.tv64 <= now.tv64) + cpu_set(cpu, mask); + } + + /* + * Wakeup the cpus which have an expired event. The broadcast + * device is reprogrammed in the return from idle code. + */ + if (!tick_do_broadcast(mask)) { + /* + * The global event did not expire any CPU local + * events. This happens in dyntick mode, as the + * maximum PIT delta is quite small. + */ + if (tick_broadcast_reprogram()) + goto again; + } + spin_unlock(&tick_broadcast_lock); +} + +/* + * Powerstate information: The system enters/leaves a state, where + * affected devices might stop + */ +void tick_broadcast_oneshot_control(unsigned long reason) +{ + struct clock_event_device *bc, *dev; + struct tick_device *td; + unsigned long flags; + int cpu; + + spin_lock_irqsave(&tick_broadcast_lock, flags); + + /* + * Periodic mode does not care about the enter/exit of power + * states + */ + if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) + goto out; + + bc = tick_broadcast_device.evtdev; + cpu = smp_processor_id(); + td = &per_cpu(tick_cpu_device, cpu); + dev = td->evtdev; + + if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) + goto out; + + if (reason == CLOCK_EVT_NOTIFY_BROADCAST_ENTER) { + if (!cpu_isset(cpu, tick_broadcast_oneshot_mask)) { + cpu_set(cpu, tick_broadcast_oneshot_mask); + clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN); + if (dev->next_event.tv64 < bc->next_event.tv64) + tick_broadcast_set_event(dev->next_event, 1); + } + } else { + if (cpu_isset(cpu, tick_broadcast_oneshot_mask)) { + cpu_clear(cpu, tick_broadcast_oneshot_mask); + clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); + if (dev->next_event.tv64 != KTIME_MAX) + tick_program_event(dev->next_event, 1); + } + } + +out: + spin_unlock_irqrestore(&tick_broadcast_lock, flags); +} + +/** + * tick_broadcast_setup_highres - setup the broadcast device for highres + */ +void tick_broadcast_setup_oneshot(struct clock_event_device *bc) +{ + if (bc->mode != CLOCK_EVT_MODE_ONESHOT) { + bc->event_handler = tick_handle_oneshot_broadcast; + clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); + bc->next_event.tv64 = KTIME_MAX; + } +} + +/* + * Select oneshot operating mode for the broadcast device + */ +void tick_broadcast_switch_to_oneshot(void) +{ + struct clock_event_device *bc; + unsigned long flags; + + spin_lock_irqsave(&tick_broadcast_lock, flags); + + tick_broadcast_device.mode = TICKDEV_MODE_ONESHOT; + bc = tick_broadcast_device.evtdev; + if (bc) + tick_broadcast_setup_oneshot(bc); + spin_unlock_irqrestore(&tick_broadcast_lock, flags); +} + + +/* + * Remove a dead CPU from broadcasting + */ +void tick_shutdown_broadcast_oneshot(unsigned int *cpup) +{ + struct clock_event_device *bc; + unsigned long flags; + unsigned int cpu = *cpup; + + spin_lock_irqsave(&tick_broadcast_lock, flags); + + bc = tick_broadcast_device.evtdev; + cpu_clear(cpu, tick_broadcast_oneshot_mask); + + if (tick_broadcast_device.mode == TICKDEV_MODE_ONESHOT) { + if (bc && cpus_empty(tick_broadcast_oneshot_mask)) + clockevents_set_mode(bc, CLOCK_EVT_MODE_SHUTDOWN); + } + + spin_unlock_irqrestore(&tick_broadcast_lock, flags); +} + +#endif diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c new file mode 100644 index 00000000000..4500e347f1b --- /dev/null +++ b/kernel/time/tick-common.c @@ -0,0 +1,346 @@ +/* + * linux/kernel/time/tick-common.c + * + * This file contains the base functions to manage periodic tick + * related events. + * + * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de> + * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar + * Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner + * + * This code is licenced under the GPL version 2. For details see + * kernel-base/COPYING. + */ +#include <linux/cpu.h> +#include <linux/err.h> +#include <linux/hrtimer.h> +#include <linux/irq.h> +#include <linux/percpu.h> +#include <linux/profile.h> +#include <linux/sched.h> +#include <linux/tick.h> + +#include "tick-internal.h" + +/* + * Tick devices + */ +DEFINE_PER_CPU(struct tick_device, tick_cpu_device); +/* + * Tick next event: keeps track of the tick time + */ +ktime_t tick_next_period; +ktime_t tick_period; +static int tick_do_timer_cpu = -1; +DEFINE_SPINLOCK(tick_device_lock); + +/* + * Debugging: see timer_list.c + */ +struct tick_device *tick_get_device(int cpu) +{ + return &per_cpu(tick_cpu_device, cpu); +} + +/** + * tick_is_oneshot_available - check for a oneshot capable event device + */ +int tick_is_oneshot_available(void) +{ + struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; + + return dev && (dev->features & CLOCK_EVT_FEAT_ONESHOT); +} + +/* + * Periodic tick + */ +static void tick_periodic(int cpu) +{ + if (tick_do_timer_cpu == cpu) { + write_seqlock(&xtime_lock); + + /* Keep track of the next tick event */ + tick_next_period = ktime_add(tick_next_period, tick_period); + + do_timer(1); + write_sequnlock(&xtime_lock); + } + + update_process_times(user_mode(get_irq_regs())); + profile_tick(CPU_PROFILING); +} + +/* + * Event handler for periodic ticks + */ +void tick_handle_periodic(struct clock_event_device *dev) +{ + int cpu = smp_processor_id(); + + tick_periodic(cpu); + + if (dev->mode != CLOCK_EVT_MODE_ONESHOT) + return; + /* + * Setup the next period for devices, which do not have + * periodic mode: + */ + for (;;) { + ktime_t next = ktime_add(dev->next_event, tick_period); + + if (!clockevents_program_event(dev, next, ktime_get())) + return; + tick_periodic(cpu); + } +} + +/* + * Setup the device for a periodic tick + */ +void tick_setup_periodic(struct clock_event_device *dev, int broadcast) +{ + tick_set_periodic_handler(dev, broadcast); + + /* Broadcast setup ? */ + if (!tick_device_is_functional(dev)) + return; + + if (dev->features & CLOCK_EVT_FEAT_PERIODIC) { + clockevents_set_mode(dev, CLOCK_EVT_MODE_PERIODIC); + } else { + unsigned long seq; + ktime_t next; + + do { + seq = read_seqbegin(&xtime_lock); + next = tick_next_period; + } while (read_seqretry(&xtime_lock, seq)); + + clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); + + for (;;) { + if (!clockevents_program_event(dev, next, ktime_get())) + return; + next = ktime_add(next, tick_period); + } + } +} + +/* + * Setup the tick device + */ +static void tick_setup_device(struct tick_device *td, + struct clock_event_device *newdev, int cpu, + cpumask_t cpumask) +{ + ktime_t next_event; + void (*handler)(struct clock_event_device *) = NULL; + + /* + * First device setup ? + */ + if (!td->evtdev) { + /* + * If no cpu took the do_timer update, assign it to + * this cpu: + */ + if (tick_do_timer_cpu == -1) { + tick_do_timer_cpu = cpu; + tick_next_period = ktime_get(); + tick_period = ktime_set(0, NSEC_PER_SEC / HZ); + } + + /* + * Startup in periodic mode first. + */ + td->mode = TICKDEV_MODE_PERIODIC; + } else { + handler = td->evtdev->event_handler; + next_event = td->evtdev->next_event; + } + + td->evtdev = newdev; + + /* + * When the device is not per cpu, pin the interrupt to the + * current cpu: + */ + if (!cpus_equal(newdev->cpumask, cpumask)) + irq_set_affinity(newdev->irq, cpumask); + + /* + * When global broadcasting is active, check if the current + * device is registered as a placeholder for broadcast mode. + * This allows us to handle this x86 misfeature in a generic + * way. + */ + if (tick_device_uses_broadcast(newdev, cpu)) + return; + + if (td->mode == TICKDEV_MODE_PERIODIC) + tick_setup_periodic(newdev, 0); + else + tick_setup_oneshot(newdev, handler, next_event); +} + +/* + * Check, if the new registered device should be used. + */ +static int tick_check_new_device(struct clock_event_device *newdev) +{ + struct clock_event_device *curdev; + struct tick_device *td; + int cpu, ret = NOTIFY_OK; + unsigned long flags; + cpumask_t cpumask; + + spin_lock_irqsave(&tick_device_lock, flags); + + cpu = smp_processor_id(); + if (!cpu_isset(cpu, newdev->cpumask)) + goto out; + + td = &per_cpu(tick_cpu_device, cpu); + curdev = td->evtdev; + cpumask = cpumask_of_cpu(cpu); + + /* cpu local device ? */ + if (!cpus_equal(newdev->cpumask, cpumask)) { + + /* + * If the cpu affinity of the device interrupt can not + * be set, ignore it. + */ + if (!irq_can_set_affinity(newdev->irq)) + goto out_bc; + + /* + * If we have a cpu local device already, do not replace it + * by a non cpu local device + */ + if (curdev && cpus_equal(curdev->cpumask, cpumask)) + goto out_bc; + } + + /* + * If we have an active device, then check the rating and the oneshot + * feature. + */ + if (curdev) { + /* + * Prefer one shot capable devices ! + */ + if ((curdev->features & CLOCK_EVT_FEAT_ONESHOT) && + !(newdev->features & CLOCK_EVT_FEAT_ONESHOT)) + goto out_bc; + /* + * Check the rating + */ + if (curdev->rating >= newdev->rating) + goto out_bc; + } + + /* + * Replace the eventually existing device by the new + * device. If the current device is the broadcast device, do + * not give it back to the clockevents layer ! + */ + if (tick_is_broadcast_device(curdev)) { + clockevents_set_mode(curdev, CLOCK_EVT_MODE_SHUTDOWN); + curdev = NULL; + } + clockevents_exchange_device(curdev, newdev); + tick_setup_device(td, newdev, cpu, cpumask); + if (newdev->features & CLOCK_EVT_FEAT_ONESHOT) + tick_oneshot_notify(); + + spin_unlock_irqrestore(&tick_device_lock, flags); + return NOTIFY_STOP; + +out_bc: + /* + * Can the new device be used as a broadcast device ? + */ + if (tick_check_broadcast_device(newdev)) + ret = NOTIFY_STOP; +out: + spin_unlock_irqrestore(&tick_device_lock, flags); + + return ret; +} + +/* + * Shutdown an event device on a given cpu: + * + * This is called on a life CPU, when a CPU is dead. So we cannot + * access the hardware device itself. + * We just set the mode and remove it from the lists. + */ +static void tick_shutdown(unsigned int *cpup) +{ + struct tick_device *td = &per_cpu(tick_cpu_device, *cpup); + struct clock_event_device *dev = td->evtdev; + unsigned long flags; + + spin_lock_irqsave(&tick_device_lock, flags); + td->mode = TICKDEV_MODE_PERIODIC; + if (dev) { + /* + * Prevent that the clock events layer tries to call + * the set mode function! + */ + dev->mode = CLOCK_EVT_MODE_UNUSED; + clockevents_exchange_device(dev, NULL); + td->evtdev = NULL; + } + spin_unlock_irqrestore(&tick_device_lock, flags); +} + +/* + * Notification about clock event devices + */ +static int tick_notify(struct notifier_block *nb, unsigned long reason, + void *dev) +{ + switch (reason) { + + case CLOCK_EVT_NOTIFY_ADD: + return tick_check_new_device(dev); + + case CLOCK_EVT_NOTIFY_BROADCAST_ON: + case CLOCK_EVT_NOTIFY_BROADCAST_OFF: + tick_broadcast_on_off(reason, dev); + break; + + case CLOCK_EVT_NOTIFY_BROADCAST_ENTER: + case CLOCK_EVT_NOTIFY_BROADCAST_EXIT: + tick_broadcast_oneshot_control(reason); + break; + + case CLOCK_EVT_NOTIFY_CPU_DEAD: + tick_shutdown_broadcast_oneshot(dev); + tick_shutdown_broadcast(dev); + tick_shutdown(dev); + break; + + default: + break; + } + + return NOTIFY_OK; +} + +static struct notifier_block tick_notifier = { + .notifier_call = tick_notify, +}; + +/** + * tick_init - initialize the tick control + * + * Register the notifier with the clockevents framework + */ +void __init tick_init(void) +{ + clockevents_register_notifier(&tick_notifier); +} diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h new file mode 100644 index 00000000000..54861a0f29f --- /dev/null +++ b/kernel/time/tick-internal.h @@ -0,0 +1,110 @@ +/* + * tick internal variable and functions used by low/high res code + */ +DECLARE_PER_CPU(struct tick_device, tick_cpu_device); +extern spinlock_t tick_device_lock; +extern ktime_t tick_next_period; +extern ktime_t tick_period; + +extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast); +extern void tick_handle_periodic(struct clock_event_device *dev); + +/* + * NO_HZ / high resolution timer shared code + */ +#ifdef CONFIG_TICK_ONESHOT +extern void tick_setup_oneshot(struct clock_event_device *newdev, + void (*handler)(struct clock_event_device *), + ktime_t nextevt); +extern int tick_program_event(ktime_t expires, int force); +extern void tick_oneshot_notify(void); +extern int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *)); + +# ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST +extern void tick_broadcast_setup_oneshot(struct clock_event_device *bc); +extern void tick_broadcast_oneshot_control(unsigned long reason); +extern void tick_broadcast_switch_to_oneshot(void); +extern void tick_shutdown_broadcast_oneshot(unsigned int *cpup); +# else /* BROADCAST */ +static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) +{ + BUG(); +} +static inline void tick_broadcast_oneshot_control(unsigned long reason) { } +static inline void tick_broadcast_switch_to_oneshot(void) { } +static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { } +# endif /* !BROADCAST */ + +#else /* !ONESHOT */ +static inline +void tick_setup_oneshot(struct clock_event_device *newdev, + void (*handler)(struct clock_event_device *), + ktime_t nextevt) +{ + BUG(); +} +static inline int tick_program_event(ktime_t expires, int force) +{ + return 0; +} +static inline void tick_oneshot_notify(void) { } +static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) +{ + BUG(); +} +static inline void tick_broadcast_oneshot_control(unsigned long reason) { } +static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { } +#endif /* !TICK_ONESHOT */ + +/* + * Broadcasting support + */ +#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST +extern int tick_do_broadcast(cpumask_t mask); + +extern int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu); +extern int tick_check_broadcast_device(struct clock_event_device *dev); +extern int tick_is_broadcast_device(struct clock_event_device *dev); +extern void tick_broadcast_on_off(unsigned long reason, int *oncpu); +extern void tick_shutdown_broadcast(unsigned int *cpup); + +extern void +tick_set_periodic_handler(struct clock_event_device *dev, int broadcast); + +#else /* !BROADCAST */ + +static inline int tick_check_broadcast_device(struct clock_event_device *dev) +{ + return 0; +} + +static inline int tick_is_broadcast_device(struct clock_event_device *dev) +{ + return 0; +} +static inline int tick_device_uses_broadcast(struct clock_event_device *dev, + int cpu) +{ + return 0; +} +static inline void tick_do_periodic_broadcast(struct clock_event_device *d) { } +static inline void tick_broadcast_on_off(unsigned long reason, int *oncpu) { } +static inline void tick_shutdown_broadcast(unsigned int *cpup) { } + +/* + * Set the periodic handler in non broadcast mode + */ +static inline void tick_set_periodic_handler(struct clock_event_device *dev, + int broadcast) +{ + dev->event_handler = tick_handle_periodic; +} +#endif /* !BROADCAST */ + +/* + * Check, if the device is functional or a dummy for broadcast + */ +static inline int tick_device_is_functional(struct clock_event_device *dev) +{ + return !(dev->features & CLOCK_EVT_FEAT_DUMMY); +} diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c new file mode 100644 index 00000000000..2e8b7ff863c --- /dev/null +++ b/kernel/time/tick-oneshot.c @@ -0,0 +1,84 @@ +/* + * linux/kernel/time/tick-oneshot.c + * + * This file contains functions which manage high resolution tick + * related events. + * + * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de> + * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar + * Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner + * + * This code is licenced under the GPL version 2. For details see + * kernel-base/COPYING. + */ +#include <linux/cpu.h> +#include <linux/err.h> +#include <linux/hrtimer.h> +#include <linux/irq.h> +#include <linux/percpu.h> +#include <linux/profile.h> +#include <linux/sched.h> +#include <linux/tick.h> + +#include "tick-internal.h" + +/** + * tick_program_event + */ +int tick_program_event(ktime_t expires, int force) +{ + struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; + ktime_t now = ktime_get(); + + while (1) { + int ret = clockevents_program_event(dev, expires, now); + + if (!ret || !force) + return ret; + now = ktime_get(); + expires = ktime_add(now, ktime_set(0, dev->min_delta_ns)); + } +} + +/** + * tick_setup_oneshot - setup the event device for oneshot mode (hres or nohz) + */ +void tick_setup_oneshot(struct clock_event_device *newdev, + void (*handler)(struct clock_event_device *), + ktime_t next_event) +{ + newdev->event_handler = handler; + clockevents_set_mode(newdev, CLOCK_EVT_MODE_ONESHOT); + clockevents_program_event(newdev, next_event, ktime_get()); +} + +/** + * tick_switch_to_oneshot - switch to oneshot mode + */ +int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *)) +{ + struct tick_device *td = &__get_cpu_var(tick_cpu_device); + struct clock_event_device *dev = td->evtdev; + + if (!dev || !(dev->features & CLOCK_EVT_FEAT_ONESHOT) || + !tick_device_is_functional(dev)) + return -EINVAL; + + td->mode = TICKDEV_MODE_ONESHOT; + dev->event_handler = handler; + clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); + tick_broadcast_switch_to_oneshot(); + return 0; +} + +#ifdef CONFIG_HIGH_RES_TIMERS +/** + * tick_init_highres - switch to high resolution mode + * + * Called with interrupts disabled. + */ +int tick_init_highres(void) +{ + return tick_switch_to_oneshot(hrtimer_interrupt); +} +#endif diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c new file mode 100644 index 00000000000..95e41f7f850 --- /dev/null +++ b/kernel/time/tick-sched.c @@ -0,0 +1,563 @@ +/* + * linux/kernel/time/tick-sched.c + * + * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de> + * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar + * Copyright(C) 2006-2007 Timesys Corp., Thomas Gleixner + * + * No idle tick implementation for low and high resolution timers + * + * Started by: Thomas Gleixner and Ingo Molnar + * + * For licencing details see kernel-base/COPYING + */ +#include <linux/cpu.h> +#include <linux/err.h> +#include <linux/hrtimer.h> +#include <linux/interrupt.h> +#include <linux/kernel_stat.h> +#include <linux/percpu.h> +#include <linux/profile.h> +#include <linux/sched.h> +#include <linux/tick.h> + +#include "tick-internal.h" + +/* + * Per cpu nohz control structure + */ +static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched); + +/* + * The time, when the last jiffy update happened. Protected by xtime_lock. + */ +static ktime_t last_jiffies_update; + +struct tick_sched *tick_get_tick_sched(int cpu) +{ + return &per_cpu(tick_cpu_sched, cpu); +} + +/* + * Must be called with interrupts disabled ! + */ +static void tick_do_update_jiffies64(ktime_t now) +{ + unsigned long ticks = 0; + ktime_t delta; + + /* Reevalute with xtime_lock held */ + write_seqlock(&xtime_lock); + + delta = ktime_sub(now, last_jiffies_update); + if (delta.tv64 >= tick_period.tv64) { + + delta = ktime_sub(delta, tick_period); + last_jiffies_update = ktime_add(last_jiffies_update, + tick_period); + + /* Slow path for long timeouts */ + if (unlikely(delta.tv64 >= tick_period.tv64)) { + s64 incr = ktime_to_ns(tick_period); + + ticks = ktime_divns(delta, incr); + + last_jiffies_update = ktime_add_ns(last_jiffies_update, + incr * ticks); + } + do_timer(++ticks); + } + write_sequnlock(&xtime_lock); +} + +/* + * Initialize and return retrieve the jiffies update. + */ +static ktime_t tick_init_jiffy_update(void) +{ + ktime_t period; + + write_seqlock(&xtime_lock); + /* Did we start the jiffies update yet ? */ + if (last_jiffies_update.tv64 == 0) + last_jiffies_update = tick_next_period; + period = last_jiffies_update; + write_sequnlock(&xtime_lock); + return period; +} + +/* + * NOHZ - aka dynamic tick functionality + */ +#ifdef CONFIG_NO_HZ +/* + * NO HZ enabled ? + */ +static int tick_nohz_enabled __read_mostly = 1; + +/* + * Enable / Disable tickless mode + */ +static int __init setup_tick_nohz(char *str) +{ + if (!strcmp(str, "off")) + tick_nohz_enabled = 0; + else if (!strcmp(str, "on")) + tick_nohz_enabled = 1; + else + return 0; + return 1; +} + +__setup("nohz=", setup_tick_nohz); + +/** + * tick_nohz_update_jiffies - update jiffies when idle was interrupted + * + * Called from interrupt entry when the CPU was idle + * + * In case the sched_tick was stopped on this CPU, we have to check if jiffies + * must be updated. Otherwise an interrupt handler could use a stale jiffy + * value. We do this unconditionally on any cpu, as we don't know whether the + * cpu, which has the update task assigned is in a long sleep. + */ +void tick_nohz_update_jiffies(void) +{ + int cpu = smp_processor_id(); + struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); + unsigned long flags; + ktime_t now; + + if (!ts->tick_stopped) + return; + + cpu_clear(cpu, nohz_cpu_mask); + now = ktime_get(); + + local_irq_save(flags); + tick_do_update_jiffies64(now); + local_irq_restore(flags); +} + +/** + * tick_nohz_stop_sched_tick - stop the idle tick from the idle task + * + * When the next event is more than a tick into the future, stop the idle tick + * Called either from the idle loop or from irq_exit() when an idle period was + * just interrupted by an interrupt which did not cause a reschedule. + */ +void tick_nohz_stop_sched_tick(void) +{ + unsigned long seq, last_jiffies, next_jiffies, delta_jiffies, flags; + struct tick_sched *ts; + ktime_t last_update, expires, now, delta; + int cpu; + + local_irq_save(flags); + + cpu = smp_processor_id(); + ts = &per_cpu(tick_cpu_sched, cpu); + + if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) + goto end; + + if (need_resched()) + goto end; + + cpu = smp_processor_id(); + BUG_ON(local_softirq_pending()); + + now = ktime_get(); + /* + * When called from irq_exit we need to account the idle sleep time + * correctly. + */ + if (ts->tick_stopped) { + delta = ktime_sub(now, ts->idle_entrytime); + ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta); + } + + ts->idle_entrytime = now; + ts->idle_calls++; + + /* Read jiffies and the time when jiffies were updated last */ + do { + seq = read_seqbegin(&xtime_lock); + last_update = last_jiffies_update; + last_jiffies = jiffies; + } while (read_seqretry(&xtime_lock, seq)); + + /* Get the next timer wheel timer */ + next_jiffies = get_next_timer_interrupt(last_jiffies); + delta_jiffies = next_jiffies - last_jiffies; + + /* + * Do not stop the tick, if we are only one off + * or if the cpu is required for rcu + */ + if (!ts->tick_stopped && (delta_jiffies == 1 || rcu_needs_cpu(cpu))) + goto out; + + /* Schedule the tick, if we are at least one jiffie off */ + if ((long)delta_jiffies >= 1) { + + if (rcu_needs_cpu(cpu)) + delta_jiffies = 1; + else + cpu_set(cpu, nohz_cpu_mask); + /* + * nohz_stop_sched_tick can be called several times before + * the nohz_restart_sched_tick is called. This happens when + * interrupts arrive which do not cause a reschedule. In the + * first call we save the current tick time, so we can restart + * the scheduler tick in nohz_restart_sched_tick. + */ + if (!ts->tick_stopped) { + ts->idle_tick = ts->sched_timer.expires; + ts->tick_stopped = 1; + ts->idle_jiffies = last_jiffies; + } + /* + * calculate the expiry time for the next timer wheel + * timer + */ + expires = ktime_add_ns(last_update, tick_period.tv64 * + delta_jiffies); + ts->idle_expires = expires; + ts->idle_sleeps++; + + if (ts->nohz_mode == NOHZ_MODE_HIGHRES) { + hrtimer_start(&ts->sched_timer, expires, + HRTIMER_MODE_ABS); + /* Check, if the timer was already in the past */ + if (hrtimer_active(&ts->sched_timer)) + goto out; + } else if(!tick_program_event(expires, 0)) + goto out; + /* + * We are past the event already. So we crossed a + * jiffie boundary. Update jiffies and raise the + * softirq. + */ + tick_do_update_jiffies64(ktime_get()); + cpu_clear(cpu, nohz_cpu_mask); + } + raise_softirq_irqoff(TIMER_SOFTIRQ); +out: + ts->next_jiffies = next_jiffies; + ts->last_jiffies = last_jiffies; +end: + local_irq_restore(flags); +} + +/** + * nohz_restart_sched_tick - restart the idle tick from the idle task + * + * Restart the idle tick when the CPU is woken up from idle + */ +void tick_nohz_restart_sched_tick(void) +{ + int cpu = smp_processor_id(); + struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); + unsigned long ticks; + ktime_t now, delta; + + if (!ts->tick_stopped) + return; + + /* Update jiffies first */ + now = ktime_get(); + + local_irq_disable(); + tick_do_update_jiffies64(now); + cpu_clear(cpu, nohz_cpu_mask); + + /* Account the idle time */ + delta = ktime_sub(now, ts->idle_entrytime); + ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta); + + /* + * We stopped the tick in idle. Update process times would miss the + * time we slept as update_process_times does only a 1 tick + * accounting. Enforce that this is accounted to idle ! + */ + ticks = jiffies - ts->idle_jiffies; + /* + * We might be one off. Do not randomly account a huge number of ticks! + */ + if (ticks && ticks < LONG_MAX) { + add_preempt_count(HARDIRQ_OFFSET); + account_system_time(current, HARDIRQ_OFFSET, + jiffies_to_cputime(ticks)); + sub_preempt_count(HARDIRQ_OFFSET); + } + + /* + * Cancel the scheduled timer and restore the tick + */ + ts->tick_stopped = 0; + hrtimer_cancel(&ts->sched_timer); + ts->sched_timer.expires = ts->idle_tick; + + while (1) { + /* Forward the time to expire in the future */ + hrtimer_forward(&ts->sched_timer, now, tick_period); + + if (ts->nohz_mode == NOHZ_MODE_HIGHRES) { + hrtimer_start(&ts->sched_timer, + ts->sched_timer.expires, + HRTIMER_MODE_ABS); + /* Check, if the timer was already in the past */ + if (hrtimer_active(&ts->sched_timer)) + break; + } else { + if (!tick_program_event(ts->sched_timer.expires, 0)) + break; + } + /* Update jiffies and reread time */ + tick_do_update_jiffies64(now); + now = ktime_get(); + } + local_irq_enable(); +} + +static int tick_nohz_reprogram(struct tick_sched *ts, ktime_t now) +{ + hrtimer_forward(&ts->sched_timer, now, tick_period); + return tick_program_event(ts->sched_timer.expires, 0); +} + +/* + * The nohz low res interrupt handler + */ +static void tick_nohz_handler(struct clock_event_device *dev) +{ + struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); + struct pt_regs *regs = get_irq_regs(); + ktime_t now = ktime_get(); + + dev->next_event.tv64 = KTIME_MAX; + + /* Check, if the jiffies need an update */ + tick_do_update_jiffies64(now); + + /* + * When we are idle and the tick is stopped, we have to touch + * the watchdog as we might not schedule for a really long + * time. This happens on complete idle SMP systems while + * waiting on the login prompt. We also increment the "start + * of idle" jiffy stamp so the idle accounting adjustment we + * do when we go busy again does not account too much ticks. + */ + if (ts->tick_stopped) { + touch_softlockup_watchdog(); + ts->idle_jiffies++; + } + + update_process_times(user_mode(regs)); + profile_tick(CPU_PROFILING); + + /* Do not restart, when we are in the idle loop */ + if (ts->tick_stopped) + return; + + while (tick_nohz_reprogram(ts, now)) { + now = ktime_get(); + tick_do_update_jiffies64(now); + } +} + +/** + * tick_nohz_switch_to_nohz - switch to nohz mode + */ +static void tick_nohz_switch_to_nohz(void) +{ + struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); + ktime_t next; + + if (!tick_nohz_enabled) + return; + + local_irq_disable(); + if (tick_switch_to_oneshot(tick_nohz_handler)) { + local_irq_enable(); + return; + } + + ts->nohz_mode = NOHZ_MODE_LOWRES; + + /* + * Recycle the hrtimer in ts, so we can share the + * hrtimer_forward with the highres code. + */ + hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + /* Get the next period */ + next = tick_init_jiffy_update(); + + for (;;) { + ts->sched_timer.expires = next; + if (!tick_program_event(next, 0)) + break; + next = ktime_add(next, tick_period); + } + local_irq_enable(); + + printk(KERN_INFO "Switched to NOHz mode on CPU #%d\n", + smp_processor_id()); +} + +#else + +static inline void tick_nohz_switch_to_nohz(void) { } + +#endif /* NO_HZ */ + +/* + * High resolution timer specific code + */ +#ifdef CONFIG_HIGH_RES_TIMERS +/* + * We rearm the timer until we get disabled by the idle code + * Called with interrupts disabled and timer->base->cpu_base->lock held. + */ +static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) +{ + struct tick_sched *ts = + container_of(timer, struct tick_sched, sched_timer); + struct hrtimer_cpu_base *base = timer->base->cpu_base; + struct pt_regs *regs = get_irq_regs(); + ktime_t now = ktime_get(); + + /* Check, if the jiffies need an update */ + tick_do_update_jiffies64(now); + + /* + * Do not call, when we are not in irq context and have + * no valid regs pointer + */ + if (regs) { + /* + * When we are idle and the tick is stopped, we have to touch + * the watchdog as we might not schedule for a really long + * time. This happens on complete idle SMP systems while + * waiting on the login prompt. We also increment the "start of + * idle" jiffy stamp so the idle accounting adjustment we do + * when we go busy again does not account too much ticks. + */ + if (ts->tick_stopped) { + touch_softlockup_watchdog(); + ts->idle_jiffies++; + } + /* + * update_process_times() might take tasklist_lock, hence + * drop the base lock. sched-tick hrtimers are per-CPU and + * never accessible by userspace APIs, so this is safe to do. + */ + spin_unlock(&base->lock); + update_process_times(user_mode(regs)); + profile_tick(CPU_PROFILING); + spin_lock(&base->lock); + } + + /* Do not restart, when we are in the idle loop */ + if (ts->tick_stopped) + return HRTIMER_NORESTART; + + hrtimer_forward(timer, now, tick_period); + + return HRTIMER_RESTART; +} + +/** + * tick_setup_sched_timer - setup the tick emulation timer + */ +void tick_setup_sched_timer(void) +{ + struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); + ktime_t now = ktime_get(); + + /* + * Emulate tick processing via per-CPU hrtimers: + */ + hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + ts->sched_timer.function = tick_sched_timer; + ts->sched_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; + + /* Get the next period */ + ts->sched_timer.expires = tick_init_jiffy_update(); + + for (;;) { + hrtimer_forward(&ts->sched_timer, now, tick_period); + hrtimer_start(&ts->sched_timer, ts->sched_timer.expires, + HRTIMER_MODE_ABS); + /* Check, if the timer was already in the past */ + if (hrtimer_active(&ts->sched_timer)) + break; + now = ktime_get(); + } + +#ifdef CONFIG_NO_HZ + if (tick_nohz_enabled) + ts->nohz_mode = NOHZ_MODE_HIGHRES; +#endif +} + +void tick_cancel_sched_timer(int cpu) +{ + struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); + + if (ts->sched_timer.base) + hrtimer_cancel(&ts->sched_timer); + ts->tick_stopped = 0; + ts->nohz_mode = NOHZ_MODE_INACTIVE; +} +#endif /* HIGH_RES_TIMERS */ + +/** + * Async notification about clocksource changes + */ +void tick_clock_notify(void) +{ + int cpu; + + for_each_possible_cpu(cpu) + set_bit(0, &per_cpu(tick_cpu_sched, cpu).check_clocks); +} + +/* + * Async notification about clock event changes + */ +void tick_oneshot_notify(void) +{ + struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); + + set_bit(0, &ts->check_clocks); +} + +/** + * Check, if a change happened, which makes oneshot possible. + * + * Called cyclic from the hrtimer softirq (driven by the timer + * softirq) allow_nohz signals, that we can switch into low-res nohz + * mode, because high resolution timers are disabled (either compile + * or runtime). + */ +int tick_check_oneshot_change(int allow_nohz) +{ + struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); + + if (!test_and_clear_bit(0, &ts->check_clocks)) + return 0; + + if (ts->nohz_mode != NOHZ_MODE_INACTIVE) + return 0; + + if (!timekeeping_is_continuous() || !tick_is_oneshot_available()) + return 0; + + if (!allow_nohz) + return 1; + + tick_nohz_switch_to_nohz(); + return 0; +} diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c new file mode 100644 index 00000000000..f82c635c3d5 --- /dev/null +++ b/kernel/time/timer_list.c @@ -0,0 +1,287 @@ +/* + * kernel/time/timer_list.c + * + * List pending timers + * + * Copyright(C) 2006, Red Hat, Inc., Ingo Molnar + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/proc_fs.h> +#include <linux/module.h> +#include <linux/spinlock.h> +#include <linux/sched.h> +#include <linux/seq_file.h> +#include <linux/kallsyms.h> +#include <linux/tick.h> + +#include <asm/uaccess.h> + +typedef void (*print_fn_t)(struct seq_file *m, unsigned int *classes); + +DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases); + +/* + * This allows printing both to /proc/timer_list and + * to the console (on SysRq-Q): + */ +#define SEQ_printf(m, x...) \ + do { \ + if (m) \ + seq_printf(m, x); \ + else \ + printk(x); \ + } while (0) + +static void print_name_offset(struct seq_file *m, void *sym) +{ + unsigned long addr = (unsigned long)sym; + char namebuf[KSYM_NAME_LEN+1]; + unsigned long size, offset; + const char *sym_name; + char *modname; + + sym_name = kallsyms_lookup(addr, &size, &offset, &modname, namebuf); + if (sym_name) + SEQ_printf(m, "%s", sym_name); + else + SEQ_printf(m, "<%p>", sym); +} + +static void +print_timer(struct seq_file *m, struct hrtimer *timer, int idx, u64 now) +{ +#ifdef CONFIG_TIMER_STATS + char tmp[TASK_COMM_LEN + 1]; +#endif + SEQ_printf(m, " #%d: ", idx); + print_name_offset(m, timer); + SEQ_printf(m, ", "); + print_name_offset(m, timer->function); + SEQ_printf(m, ", S:%02lx", timer->state); +#ifdef CONFIG_TIMER_STATS + SEQ_printf(m, ", "); + print_name_offset(m, timer->start_site); + memcpy(tmp, timer->start_comm, TASK_COMM_LEN); + tmp[TASK_COMM_LEN] = 0; + SEQ_printf(m, ", %s/%d", tmp, timer->start_pid); +#endif + SEQ_printf(m, "\n"); + SEQ_printf(m, " # expires at %Ld nsecs [in %Ld nsecs]\n", + (unsigned long long)ktime_to_ns(timer->expires), + (unsigned long long)(ktime_to_ns(timer->expires) - now)); +} + +static void +print_active_timers(struct seq_file *m, struct hrtimer_clock_base *base, + u64 now) +{ + struct hrtimer *timer, tmp; + unsigned long next = 0, i; + struct rb_node *curr; + unsigned long flags; + +next_one: + i = 0; + spin_lock_irqsave(&base->cpu_base->lock, flags); + + curr = base->first; + /* + * Crude but we have to do this O(N*N) thing, because + * we have to unlock the base when printing: + */ + while (curr && i < next) { + curr = rb_next(curr); + i++; + } + + if (curr) { + + timer = rb_entry(curr, struct hrtimer, node); + tmp = *timer; + spin_unlock_irqrestore(&base->cpu_base->lock, flags); + + print_timer(m, &tmp, i, now); + next++; + goto next_one; + } + spin_unlock_irqrestore(&base->cpu_base->lock, flags); +} + +static void +print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now) +{ + SEQ_printf(m, " .index: %d\n", + base->index); + SEQ_printf(m, " .resolution: %Ld nsecs\n", + (unsigned long long)ktime_to_ns(base->resolution)); + SEQ_printf(m, " .get_time: "); + print_name_offset(m, base->get_time); + SEQ_printf(m, "\n"); +#ifdef CONFIG_HIGH_RES_TIMERS + SEQ_printf(m, " .offset: %Ld nsecs\n", + ktime_to_ns(base->offset)); +#endif + SEQ_printf(m, "active timers:\n"); + print_active_timers(m, base, now); +} + +static void print_cpu(struct seq_file *m, int cpu, u64 now) +{ + struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu); + int i; + + SEQ_printf(m, "\ncpu: %d\n", cpu); + for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { + SEQ_printf(m, " clock %d:\n", i); + print_base(m, cpu_base->clock_base + i, now); + } +#define P(x) \ + SEQ_printf(m, " .%-15s: %Ld\n", #x, (u64)(cpu_base->x)) +#define P_ns(x) \ + SEQ_printf(m, " .%-15s: %Ld nsecs\n", #x, \ + (u64)(ktime_to_ns(cpu_base->x))) + +#ifdef CONFIG_HIGH_RES_TIMERS + P_ns(expires_next); + P(hres_active); + P(nr_events); +#endif +#undef P +#undef P_ns + +#ifdef CONFIG_TICK_ONESHOT +# define P(x) \ + SEQ_printf(m, " .%-15s: %Ld\n", #x, (u64)(ts->x)) +# define P_ns(x) \ + SEQ_printf(m, " .%-15s: %Ld nsecs\n", #x, \ + (u64)(ktime_to_ns(ts->x))) + { + struct tick_sched *ts = tick_get_tick_sched(cpu); + P(nohz_mode); + P_ns(idle_tick); + P(tick_stopped); + P(idle_jiffies); + P(idle_calls); + P(idle_sleeps); + P_ns(idle_entrytime); + P_ns(idle_sleeptime); + P(last_jiffies); + P(next_jiffies); + P_ns(idle_expires); + SEQ_printf(m, "jiffies: %Ld\n", (u64)jiffies); + } +#endif + +#undef P +#undef P_ns +} + +#ifdef CONFIG_GENERIC_CLOCKEVENTS +static void +print_tickdevice(struct seq_file *m, struct tick_device *td) +{ + struct clock_event_device *dev = td->evtdev; + + SEQ_printf(m, "\nTick Device: mode: %d\n", td->mode); + + SEQ_printf(m, "Clock Event Device: "); + if (!dev) { + SEQ_printf(m, "<NULL>\n"); + return; + } + SEQ_printf(m, "%s\n", dev->name); + SEQ_printf(m, " max_delta_ns: %ld\n", dev->max_delta_ns); + SEQ_printf(m, " min_delta_ns: %ld\n", dev->min_delta_ns); + SEQ_printf(m, " mult: %ld\n", dev->mult); + SEQ_printf(m, " shift: %d\n", dev->shift); + SEQ_printf(m, " mode: %d\n", dev->mode); + SEQ_printf(m, " next_event: %Ld nsecs\n", + (unsigned long long) ktime_to_ns(dev->next_event)); + + SEQ_printf(m, " set_next_event: "); + print_name_offset(m, dev->set_next_event); + SEQ_printf(m, "\n"); + + SEQ_printf(m, " set_mode: "); + print_name_offset(m, dev->set_mode); + SEQ_printf(m, "\n"); + + SEQ_printf(m, " event_handler: "); + print_name_offset(m, dev->event_handler); + SEQ_printf(m, "\n"); +} + +static void timer_list_show_tickdevices(struct seq_file *m) +{ + int cpu; + +#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST + print_tickdevice(m, tick_get_broadcast_device()); + SEQ_printf(m, "tick_broadcast_mask: %08lx\n", + tick_get_broadcast_mask()->bits[0]); +#ifdef CONFIG_TICK_ONESHOT + SEQ_printf(m, "tick_broadcast_oneshot_mask: %08lx\n", + tick_get_broadcast_oneshot_mask()->bits[0]); +#endif + SEQ_printf(m, "\n"); +#endif + for_each_online_cpu(cpu) + print_tickdevice(m, tick_get_device(cpu)); + SEQ_printf(m, "\n"); +} +#else +static void timer_list_show_tickdevices(struct seq_file *m) { } +#endif + +static int timer_list_show(struct seq_file *m, void *v) +{ + u64 now = ktime_to_ns(ktime_get()); + int cpu; + + SEQ_printf(m, "Timer List Version: v0.3\n"); + SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES); + SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now); + + for_each_online_cpu(cpu) + print_cpu(m, cpu, now); + + SEQ_printf(m, "\n"); + timer_list_show_tickdevices(m); + + return 0; +} + +void sysrq_timer_list_show(void) +{ + timer_list_show(NULL, NULL); +} + +static int timer_list_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, timer_list_show, NULL); +} + +static struct file_operations timer_list_fops = { + .open = timer_list_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static int __init init_timer_list_procfs(void) +{ + struct proc_dir_entry *pe; + + pe = create_proc_entry("timer_list", 0644, NULL); + if (!pe) + return -ENOMEM; + + pe->proc_fops = &timer_list_fops; + + return 0; +} +__initcall(init_timer_list_procfs); diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c new file mode 100644 index 00000000000..1bc4882e28e --- /dev/null +++ b/kernel/time/timer_stats.c @@ -0,0 +1,411 @@ +/* + * kernel/time/timer_stats.c + * + * Collect timer usage statistics. + * + * Copyright(C) 2006, Red Hat, Inc., Ingo Molnar + * Copyright(C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com> + * + * timer_stats is based on timer_top, a similar functionality which was part of + * Con Kolivas dyntick patch set. It was developed by Daniel Petrini at the + * Instituto Nokia de Tecnologia - INdT - Manaus. timer_top's design was based + * on dynamic allocation of the statistics entries and linear search based + * lookup combined with a global lock, rather than the static array, hash + * and per-CPU locking which is used by timer_stats. It was written for the + * pre hrtimer kernel code and therefore did not take hrtimers into account. + * Nevertheless it provided the base for the timer_stats implementation and + * was a helpful source of inspiration. Kudos to Daniel and the Nokia folks + * for this effort. + * + * timer_top.c is + * Copyright (C) 2005 Instituto Nokia de Tecnologia - INdT - Manaus + * Written by Daniel Petrini <d.pensator@gmail.com> + * timer_top.c was released under the GNU General Public License version 2 + * + * We export the addresses and counting of timer functions being called, + * the pid and cmdline from the owner process if applicable. + * + * Start/stop data collection: + * # echo 1[0] >/proc/timer_stats + * + * Display the information collected so far: + * # cat /proc/timer_stats + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/proc_fs.h> +#include <linux/module.h> +#include <linux/spinlock.h> +#include <linux/sched.h> +#include <linux/seq_file.h> +#include <linux/kallsyms.h> + +#include <asm/uaccess.h> + +/* + * This is our basic unit of interest: a timer expiry event identified + * by the timer, its start/expire functions and the PID of the task that + * started the timer. We count the number of times an event happens: + */ +struct entry { + /* + * Hash list: + */ + struct entry *next; + + /* + * Hash keys: + */ + void *timer; + void *start_func; + void *expire_func; + pid_t pid; + + /* + * Number of timeout events: + */ + unsigned long count; + + /* + * We save the command-line string to preserve + * this information past task exit: + */ + char comm[TASK_COMM_LEN + 1]; + +} ____cacheline_aligned_in_smp; + +/* + * Spinlock protecting the tables - not taken during lookup: + */ +static DEFINE_SPINLOCK(table_lock); + +/* + * Per-CPU lookup locks for fast hash lookup: + */ +static DEFINE_PER_CPU(spinlock_t, lookup_lock); + +/* + * Mutex to serialize state changes with show-stats activities: + */ +static DEFINE_MUTEX(show_mutex); + +/* + * Collection status, active/inactive: + */ +static int __read_mostly active; + +/* + * Beginning/end timestamps of measurement: + */ +static ktime_t time_start, time_stop; + +/* + * tstat entry structs only get allocated while collection is + * active and never freed during that time - this simplifies + * things quite a bit. + * + * They get freed when a new collection period is started. + */ +#define MAX_ENTRIES_BITS 10 +#define MAX_ENTRIES (1UL << MAX_ENTRIES_BITS) + +static unsigned long nr_entries; +static struct entry entries[MAX_ENTRIES]; + +static atomic_t overflow_count; + +static void reset_entries(void) +{ + nr_entries = 0; + memset(entries, 0, sizeof(entries)); + atomic_set(&overflow_count, 0); +} + +static struct entry *alloc_entry(void) +{ + if (nr_entries >= MAX_ENTRIES) + return NULL; + + return entries + nr_entries++; +} + +/* + * The entries are in a hash-table, for fast lookup: + */ +#define TSTAT_HASH_BITS (MAX_ENTRIES_BITS - 1) +#define TSTAT_HASH_SIZE (1UL << TSTAT_HASH_BITS) +#define TSTAT_HASH_MASK (TSTAT_HASH_SIZE - 1) + +#define __tstat_hashfn(entry) \ + (((unsigned long)(entry)->timer ^ \ + (unsigned long)(entry)->start_func ^ \ + (unsigned long)(entry)->expire_func ^ \ + (unsigned long)(entry)->pid ) & TSTAT_HASH_MASK) + +#define tstat_hashentry(entry) (tstat_hash_table + __tstat_hashfn(entry)) + +static struct entry *tstat_hash_table[TSTAT_HASH_SIZE] __read_mostly; + +static int match_entries(struct entry *entry1, struct entry *entry2) +{ + return entry1->timer == entry2->timer && + entry1->start_func == entry2->start_func && + entry1->expire_func == entry2->expire_func && + entry1->pid == entry2->pid; +} + +/* + * Look up whether an entry matching this item is present + * in the hash already. Must be called with irqs off and the + * lookup lock held: + */ +static struct entry *tstat_lookup(struct entry *entry, char *comm) +{ + struct entry **head, *curr, *prev; + + head = tstat_hashentry(entry); + curr = *head; + + /* + * The fastpath is when the entry is already hashed, + * we do this with the lookup lock held, but with the + * table lock not held: + */ + while (curr) { + if (match_entries(curr, entry)) + return curr; + + curr = curr->next; + } + /* + * Slowpath: allocate, set up and link a new hash entry: + */ + prev = NULL; + curr = *head; + + spin_lock(&table_lock); + /* + * Make sure we have not raced with another CPU: + */ + while (curr) { + if (match_entries(curr, entry)) + goto out_unlock; + + prev = curr; + curr = curr->next; + } + + curr = alloc_entry(); + if (curr) { + *curr = *entry; + curr->count = 0; + memcpy(curr->comm, comm, TASK_COMM_LEN); + if (prev) + prev->next = curr; + else + *head = curr; + curr->next = NULL; + } + out_unlock: + spin_unlock(&table_lock); + + return curr; +} + +/** + * timer_stats_update_stats - Update the statistics for a timer. + * @timer: pointer to either a timer_list or a hrtimer + * @pid: the pid of the task which set up the timer + * @startf: pointer to the function which did the timer setup + * @timerf: pointer to the timer callback function of the timer + * @comm: name of the process which set up the timer + * + * When the timer is already registered, then the event counter is + * incremented. Otherwise the timer is registered in a free slot. + */ +void timer_stats_update_stats(void *timer, pid_t pid, void *startf, + void *timerf, char * comm) +{ + /* + * It doesnt matter which lock we take: + */ + spinlock_t *lock = &per_cpu(lookup_lock, raw_smp_processor_id()); + struct entry *entry, input; + unsigned long flags; + + input.timer = timer; + input.start_func = startf; + input.expire_func = timerf; + input.pid = pid; + + spin_lock_irqsave(lock, flags); + if (!active) + goto out_unlock; + + entry = tstat_lookup(&input, comm); + if (likely(entry)) + entry->count++; + else + atomic_inc(&overflow_count); + + out_unlock: + spin_unlock_irqrestore(lock, flags); +} + +static void print_name_offset(struct seq_file *m, unsigned long addr) +{ + char namebuf[KSYM_NAME_LEN+1]; + unsigned long size, offset; + const char *sym_name; + char *modname; + + sym_name = kallsyms_lookup(addr, &size, &offset, &modname, namebuf); + if (sym_name) + seq_printf(m, "%s", sym_name); + else + seq_printf(m, "<%p>", (void *)addr); +} + +static int tstats_show(struct seq_file *m, void *v) +{ + struct timespec period; + struct entry *entry; + unsigned long ms; + long events = 0; + ktime_t time; + int i; + + mutex_lock(&show_mutex); + /* + * If still active then calculate up to now: + */ + if (active) + time_stop = ktime_get(); + + time = ktime_sub(time_stop, time_start); + + period = ktime_to_timespec(time); + ms = period.tv_nsec / 1000000; + + seq_puts(m, "Timer Stats Version: v0.1\n"); + seq_printf(m, "Sample period: %ld.%03ld s\n", period.tv_sec, ms); + if (atomic_read(&overflow_count)) + seq_printf(m, "Overflow: %d entries\n", + atomic_read(&overflow_count)); + + for (i = 0; i < nr_entries; i++) { + entry = entries + i; + seq_printf(m, "%4lu, %5d %-16s ", + entry->count, entry->pid, entry->comm); + + print_name_offset(m, (unsigned long)entry->start_func); + seq_puts(m, " ("); + print_name_offset(m, (unsigned long)entry->expire_func); + seq_puts(m, ")\n"); + + events += entry->count; + } + + ms += period.tv_sec * 1000; + if (!ms) + ms = 1; + + if (events && period.tv_sec) + seq_printf(m, "%ld total events, %ld.%ld events/sec\n", events, + events / period.tv_sec, events * 1000 / ms); + else + seq_printf(m, "%ld total events\n", events); + + mutex_unlock(&show_mutex); + + return 0; +} + +/* + * After a state change, make sure all concurrent lookup/update + * activities have stopped: + */ +static void sync_access(void) +{ + unsigned long flags; + int cpu; + + for_each_online_cpu(cpu) { + spin_lock_irqsave(&per_cpu(lookup_lock, cpu), flags); + /* nothing */ + spin_unlock_irqrestore(&per_cpu(lookup_lock, cpu), flags); + } +} + +static ssize_t tstats_write(struct file *file, const char __user *buf, + size_t count, loff_t *offs) +{ + char ctl[2]; + + if (count != 2 || *offs) + return -EINVAL; + + if (copy_from_user(ctl, buf, count)) + return -EFAULT; + + mutex_lock(&show_mutex); + switch (ctl[0]) { + case '0': + if (active) { + active = 0; + time_stop = ktime_get(); + sync_access(); + } + break; + case '1': + if (!active) { + reset_entries(); + time_start = ktime_get(); + active = 1; + } + break; + default: + count = -EINVAL; + } + mutex_unlock(&show_mutex); + + return count; +} + +static int tstats_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, tstats_show, NULL); +} + +static struct file_operations tstats_fops = { + .open = tstats_open, + .read = seq_read, + .write = tstats_write, + .llseek = seq_lseek, + .release = seq_release, +}; + +void __init init_timer_stats(void) +{ + int cpu; + + for_each_possible_cpu(cpu) + spin_lock_init(&per_cpu(lookup_lock, cpu)); +} + +static int __init init_tstats_procfs(void) +{ + struct proc_dir_entry *pe; + + pe = create_proc_entry("timer_stats", 0644, NULL); + if (!pe) + return -ENOMEM; + + pe->proc_fops = &tstats_fops; + + return 0; +} +__initcall(init_tstats_procfs); |