From 8d00a6c8f6b08e7167bc03bf955cdc7e47c5132e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 22 Jul 2008 08:39:57 +0200 Subject: genirq: remove last NO_IDLE_HZ leftovers Signed-off-by: Thomas Gleixner --- include/linux/irq.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irq.h b/include/linux/irq.h index 8ccb462ea42..f3047df2d23 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -197,10 +197,6 @@ extern int setup_irq(unsigned int irq, struct irqaction *new); #ifdef CONFIG_GENERIC_HARDIRQS -#ifndef handle_dynamic_tick -# define handle_dynamic_tick(a) do { } while (0) -#endif - #ifdef CONFIG_SMP #if defined(CONFIG_GENERIC_PENDING_IRQ) || defined(CONFIG_IRQBALANCE) -- cgit v1.2.3 From 1aa5dfb751d275ae7117d3b73ac423b4a46f2a73 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Wed, 20 Aug 2008 16:37:28 -0700 Subject: clocksource: keep track of original clocksource frequency The clocksource frequency is represented by clocksource->mult/2^(clocksource->shift). Currently, when NTP makes adjustments to the clock frequency, they are made directly to the mult value. This has the drawback that once changed, we cannot know what the orignal mult value was, or how much adjustment has been applied. This property causes problems in calculating proper ntp intervals when switching back and forth between clocksources. This patch separates the current mult value into a mult and mult_orig pair. The mult_orig value stays constant, while the ntp clocksource adjustments are done only to the mult value. This allows for correct ntp interval calculation and additionally lays the groundwork for a new notion of time, what I'm calling the monotonic-raw time, which is introduced in a following patch. Signed-off-by: John Stultz Signed-off-by: Roman Zippel Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar --- include/linux/clocksource.h | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h index 55e434feec9..f0a7fb98441 100644 --- a/include/linux/clocksource.h +++ b/include/linux/clocksource.h @@ -45,7 +45,8 @@ struct clocksource; * @read: returns a cycle value * @mask: bitmask for two's complement * subtraction of non 64 bit counters - * @mult: cycle to nanosecond multiplier + * @mult: cycle to nanosecond multiplier (adjusted by NTP) + * @mult_orig: cycle to nanosecond multiplier (unadjusted by NTP) * @shift: cycle to nanosecond divisor (power of two) * @flags: flags describing special properties * @vread: vsyscall based read @@ -63,6 +64,7 @@ struct clocksource { cycle_t (*read)(void); cycle_t mask; u32 mult; + u32 mult_orig; u32 shift; unsigned long flags; cycle_t (*vread)(void); @@ -201,16 +203,17 @@ static inline void clocksource_calculate_interval(struct clocksource *c, { u64 tmp; - /* XXX - All of this could use a whole lot of optimization */ + /* Do the ns -> cycle conversion first, using original mult */ tmp = length_nsec; tmp <<= c->shift; - tmp += c->mult/2; - do_div(tmp, c->mult); + tmp += c->mult_orig/2; + do_div(tmp, c->mult_orig); c->cycle_interval = (cycle_t)tmp; if (c->cycle_interval == 0) c->cycle_interval = 1; + /* Go back from cycles -> shifted ns, this time use ntp adjused mult */ c->xtime_interval = (u64)c->cycle_interval * c->mult; } -- cgit v1.2.3 From 2d42244ae71d6c7b0884b5664cf2eda30fb2ae68 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Wed, 20 Aug 2008 16:37:30 -0700 Subject: clocksource: introduce CLOCK_MONOTONIC_RAW In talking with Josip Loncaric, and his work on clock synchronization (see btime.sf.net), he mentioned that for really close synchronization, it is useful to have access to "hardware time", that is a notion of time that is not in any way adjusted by the clock slewing done to keep close time sync. Part of the issue is if we are using the kernel's ntp adjusted representation of time in order to measure how we should correct time, we can run into what Paul McKenney aptly described as "Painting a road using the lines we're painting as the guide". I had been thinking of a similar problem, and was trying to come up with a way to give users access to a purely hardware based time representation that avoided users having to know the underlying frequency and mask values needed to deal with the wide variety of possible underlying hardware counters. My solution is to introduce CLOCK_MONOTONIC_RAW. This exposes a nanosecond based time value, that increments starting at bootup and has no frequency adjustments made to it what so ever. The time is accessed from userspace via the posix_clock_gettime() syscall, passing CLOCK_MONOTONIC_RAW as the clock_id. Signed-off-by: John Stultz Signed-off-by: Roman Zippel Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar --- include/linux/clocksource.h | 3 +++ include/linux/time.h | 2 ++ 2 files changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h index f0a7fb98441..f88d32f8ff7 100644 --- a/include/linux/clocksource.h +++ b/include/linux/clocksource.h @@ -79,6 +79,7 @@ struct clocksource { /* timekeeping specific data, ignore */ cycle_t cycle_interval; u64 xtime_interval; + u32 raw_interval; /* * Second part is written at each timer interrupt * Keep it in a different cache line to dirty no @@ -87,6 +88,7 @@ struct clocksource { cycle_t cycle_last ____cacheline_aligned_in_smp; u64 xtime_nsec; s64 error; + struct timespec raw_time; #ifdef CONFIG_CLOCKSOURCE_WATCHDOG /* Watchdog related data, used by the framework */ @@ -215,6 +217,7 @@ static inline void clocksource_calculate_interval(struct clocksource *c, /* Go back from cycles -> shifted ns, this time use ntp adjused mult */ c->xtime_interval = (u64)c->cycle_interval * c->mult; + c->raw_interval = ((u64)c->cycle_interval * c->mult_orig) >> c->shift; } diff --git a/include/linux/time.h b/include/linux/time.h index e15206a7e82..205f974b9eb 100644 --- a/include/linux/time.h +++ b/include/linux/time.h @@ -117,6 +117,7 @@ extern int do_setitimer(int which, struct itimerval *value, extern unsigned int alarm_setitimer(unsigned int seconds); extern int do_getitimer(int which, struct itimerval *value); extern void getnstimeofday(struct timespec *tv); +extern void getrawmonotonic(struct timespec *ts); extern void getboottime(struct timespec *ts); extern void monotonic_to_bootbased(struct timespec *ts); @@ -214,6 +215,7 @@ struct itimerval { #define CLOCK_MONOTONIC 1 #define CLOCK_PROCESS_CPUTIME_ID 2 #define CLOCK_THREAD_CPUTIME_ID 3 +#define CLOCK_MONOTONIC_RAW 4 /* * The IDs of various hardware clocks: -- cgit v1.2.3 From 916c7a855174e3b53d182b97a26b2e27a29726a1 Mon Sep 17 00:00:00 2001 From: Roman Zippel Date: Wed, 20 Aug 2008 16:46:08 -0700 Subject: ntp: fix ADJ_OFFSET_SS_READ bug and do_adjtimex() cleanup Thanks to the review by Michael Kerrisk a bug in the recent ADJ_OFFSET_SS_READ option was discovered, where the ntp time_offset was inadvertently set by it. This fixes this by making the adjtime code more separate from the ntp_adjtime code (both of which really want to be separate syscalls). Signed-off-by: Roman Zippel Signed-off-by: Andrew Morton Acked-by: John Stultz Signed-off-by: Ingo Molnar --- include/linux/timex.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/timex.h b/include/linux/timex.h index fc6035d29d5..c00bcdd3ae4 100644 --- a/include/linux/timex.h +++ b/include/linux/timex.h @@ -141,8 +141,15 @@ struct timex { #define ADJ_MICRO 0x1000 /* select microsecond resolution */ #define ADJ_NANO 0x2000 /* select nanosecond resolution */ #define ADJ_TICK 0x4000 /* tick value */ + +#ifdef __KERNEL__ +#define ADJ_ADJTIME 0x8000 /* switch between adjtime/adjtimex modes */ +#define ADJ_OFFSET_SINGLESHOT 0x0001 /* old-fashioned adjtime */ +#define ADJ_OFFSET_READONLY 0x2000 /* read-only adjtime */ +#else #define ADJ_OFFSET_SINGLESHOT 0x8001 /* old-fashioned adjtime */ -#define ADJ_OFFSET_SS_READ 0xa001 /* read-only adjtime */ +#define ADJ_OFFSET_SS_READ 0xa001 /* read-only adjtime */ +#endif /* xntp 3.4 compatibility names */ #define MOD_OFFSET ADJ_OFFSET -- cgit v1.2.3 From f06febc96ba8e0af80bcc3eaec0a109e88275fac Mon Sep 17 00:00:00 2001 From: Frank Mayhar Date: Fri, 12 Sep 2008 09:54:39 -0700 Subject: timers: fix itimer/many thread hang Overview This patch reworks the handling of POSIX CPU timers, including the ITIMER_PROF, ITIMER_VIRT timers and rlimit handling. It was put together with the help of Roland McGrath, the owner and original writer of this code. The problem we ran into, and the reason for this rework, has to do with using a profiling timer in a process with a large number of threads. It appears that the performance of the old implementation of run_posix_cpu_timers() was at least O(n*3) (where "n" is the number of threads in a process) or worse. Everything is fine with an increasing number of threads until the time taken for that routine to run becomes the same as or greater than the tick time, at which point things degrade rather quickly. This patch fixes bug 9906, "Weird hang with NPTL and SIGPROF." Code Changes This rework corrects the implementation of run_posix_cpu_timers() to make it run in constant time for a particular machine. (Performance may vary between one machine and another depending upon whether the kernel is built as single- or multiprocessor and, in the latter case, depending upon the number of running processors.) To do this, at each tick we now update fields in signal_struct as well as task_struct. The run_posix_cpu_timers() function uses those fields to make its decisions. We define a new structure, "task_cputime," to contain user, system and scheduler times and use these in appropriate places: struct task_cputime { cputime_t utime; cputime_t stime; unsigned long long sum_exec_runtime; }; This is included in the structure "thread_group_cputime," which is a new substructure of signal_struct and which varies for uniprocessor versus multiprocessor kernels. For uniprocessor kernels, it uses "task_cputime" as a simple substructure, while for multiprocessor kernels it is a pointer: struct thread_group_cputime { struct task_cputime totals; }; struct thread_group_cputime { struct task_cputime *totals; }; We also add a new task_cputime substructure directly to signal_struct, to cache the earliest expiration of process-wide timers, and task_cputime also replaces the it_*_expires fields of task_struct (used for earliest expiration of thread timers). The "thread_group_cputime" structure contains process-wide timers that are updated via account_user_time() and friends. In the non-SMP case the structure is a simple aggregator; unfortunately in the SMP case that simplicity was not achievable due to cache-line contention between CPUs (in one measured case performance was actually _worse_ on a 16-cpu system than the same test on a 4-cpu system, due to this contention). For SMP, the thread_group_cputime counters are maintained as a per-cpu structure allocated using alloc_percpu(). The timer functions update only the timer field in the structure corresponding to the running CPU, obtained using per_cpu_ptr(). We define a set of inline functions in sched.h that we use to maintain the thread_group_cputime structure and hide the differences between UP and SMP implementations from the rest of the kernel. The thread_group_cputime_init() function initializes the thread_group_cputime structure for the given task. The thread_group_cputime_alloc() is a no-op for UP; for SMP it calls the out-of-line function thread_group_cputime_alloc_smp() to allocate and fill in the per-cpu structures and fields. The thread_group_cputime_free() function, also a no-op for UP, in SMP frees the per-cpu structures. The thread_group_cputime_clone_thread() function (also a UP no-op) for SMP calls thread_group_cputime_alloc() if the per-cpu structures haven't yet been allocated. The thread_group_cputime() function fills the task_cputime structure it is passed with the contents of the thread_group_cputime fields; in UP it's that simple but in SMP it must also safely check that tsk->signal is non-NULL (if it is it just uses the appropriate fields of task_struct) and, if so, sums the per-cpu values for each online CPU. Finally, the three functions account_group_user_time(), account_group_system_time() and account_group_exec_runtime() are used by timer functions to update the respective fields of the thread_group_cputime structure. Non-SMP operation is trivial and will not be mentioned further. The per-cpu structure is always allocated when a task creates its first new thread, via a call to thread_group_cputime_clone_thread() from copy_signal(). It is freed at process exit via a call to thread_group_cputime_free() from cleanup_signal(). All functions that formerly summed utime/stime/sum_sched_runtime values from from all threads in the thread group now use thread_group_cputime() to snapshot the values in the thread_group_cputime structure or the values in the task structure itself if the per-cpu structure hasn't been allocated. Finally, the code in kernel/posix-cpu-timers.c has changed quite a bit. The run_posix_cpu_timers() function has been split into a fast path and a slow path; the former safely checks whether there are any expired thread timers and, if not, just returns, while the slow path does the heavy lifting. With the dedicated thread group fields, timers are no longer "rebalanced" and the process_timer_rebalance() function and related code has gone away. All summing loops are gone and all code that used them now uses the thread_group_cputime() inline. When process-wide timers are set, the new task_cputime structure in signal_struct is used to cache the earliest expiration; this is checked in the fast path. Performance The fix appears not to add significant overhead to existing operations. It generally performs the same as the current code except in two cases, one in which it performs slightly worse (Case 5 below) and one in which it performs very significantly better (Case 2 below). Overall it's a wash except in those two cases. I've since done somewhat more involved testing on a dual-core Opteron system. Case 1: With no itimer running, for a test with 100,000 threads, the fixed kernel took 1428.5 seconds, 513 seconds more than the unfixed system, all of which was spent in the system. There were twice as many voluntary context switches with the fix as without it. Case 2: With an itimer running at .01 second ticks and 4000 threads (the most an unmodified kernel can handle), the fixed kernel ran the test in eight percent of the time (5.8 seconds as opposed to 70 seconds) and had better tick accuracy (.012 seconds per tick as opposed to .023 seconds per tick). Case 3: A 4000-thread test with an initial timer tick of .01 second and an interval of 10,000 seconds (i.e. a timer that ticks only once) had very nearly the same performance in both cases: 6.3 seconds elapsed for the fixed kernel versus 5.5 seconds for the unfixed kernel. With fewer threads (eight in these tests), the Case 1 test ran in essentially the same time on both the modified and unmodified kernels (5.2 seconds versus 5.8 seconds). The Case 2 test ran in about the same time as well, 5.9 seconds versus 5.4 seconds but again with much better tick accuracy, .013 seconds per tick versus .025 seconds per tick for the unmodified kernel. Since the fix affected the rlimit code, I also tested soft and hard CPU limits. Case 4: With a hard CPU limit of 20 seconds and eight threads (and an itimer running), the modified kernel was very slightly favored in that while it killed the process in 19.997 seconds of CPU time (5.002 seconds of wall time), only .003 seconds of that was system time, the rest was user time. The unmodified kernel killed the process in 20.001 seconds of CPU (5.014 seconds of wall time) of which .016 seconds was system time. Really, though, the results were too close to call. The results were essentially the same with no itimer running. Case 5: With a soft limit of 20 seconds and a hard limit of 2000 seconds (where the hard limit would never be reached) and an itimer running, the modified kernel exhibited worse tick accuracy than the unmodified kernel: .050 seconds/tick versus .028 seconds/tick. Otherwise, performance was almost indistinguishable. With no itimer running this test exhibited virtually identical behavior and times in both cases. In times past I did some limited performance testing. those results are below. On a four-cpu Opteron system without this fix, a sixteen-thread test executed in 3569.991 seconds, of which user was 3568.435s and system was 1.556s. On the same system with the fix, user and elapsed time were about the same, but system time dropped to 0.007 seconds. Performance with eight, four and one thread were comparable. Interestingly, the timer ticks with the fix seemed more accurate: The sixteen-thread test with the fix received 149543 ticks for 0.024 seconds per tick, while the same test without the fix received 58720 for 0.061 seconds per tick. Both cases were configured for an interval of 0.01 seconds. Again, the other tests were comparable. Each thread in this test computed the primes up to 25,000,000. I also did a test with a large number of threads, 100,000 threads, which is impossible without the fix. In this case each thread computed the primes only up to 10,000 (to make the runtime manageable). System time dominated, at 1546.968 seconds out of a total 2176.906 seconds (giving a user time of 629.938s). It received 147651 ticks for 0.015 seconds per tick, still quite accurate. There is obviously no comparable test without the fix. Signed-off-by: Frank Mayhar Cc: Roland McGrath Cc: Alexey Dobriyan Cc: Andrew Morton Signed-off-by: Ingo Molnar --- include/linux/posix-timers.h | 2 + include/linux/sched.h | 257 ++++++++++++++++++++++++++++++++++++++++--- include/linux/time.h | 3 + 3 files changed, 249 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h index a7dd38f30ad..f9d8e9e94e9 100644 --- a/include/linux/posix-timers.h +++ b/include/linux/posix-timers.h @@ -115,4 +115,6 @@ void set_process_cpu_timer(struct task_struct *task, unsigned int clock_idx, long clock_nanosleep_restart(struct restart_block *restart_block); +void update_rlimit_cpu(unsigned long rlim_new); + #endif diff --git a/include/linux/sched.h b/include/linux/sched.h index 3d9120c5ad1..26d7a5f2d0b 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -425,6 +425,45 @@ struct pacct_struct { unsigned long ac_minflt, ac_majflt; }; +/** + * struct task_cputime - collected CPU time counts + * @utime: time spent in user mode, in &cputime_t units + * @stime: time spent in kernel mode, in &cputime_t units + * @sum_exec_runtime: total time spent on the CPU, in nanoseconds + * + * This structure groups together three kinds of CPU time that are + * tracked for threads and thread groups. Most things considering + * CPU time want to group these counts together and treat all three + * of them in parallel. + */ +struct task_cputime { + cputime_t utime; + cputime_t stime; + unsigned long long sum_exec_runtime; +}; +/* Alternate field names when used to cache expirations. */ +#define prof_exp stime +#define virt_exp utime +#define sched_exp sum_exec_runtime + +/** + * struct thread_group_cputime - thread group interval timer counts + * @totals: thread group interval timers; substructure for + * uniprocessor kernel, per-cpu for SMP kernel. + * + * This structure contains the version of task_cputime, above, that is + * used for thread group CPU clock calculations. + */ +#ifdef CONFIG_SMP +struct thread_group_cputime { + struct task_cputime *totals; +}; +#else +struct thread_group_cputime { + struct task_cputime totals; +}; +#endif + /* * NOTE! "signal_struct" does not have it's own * locking, because a shared signal_struct always @@ -470,6 +509,17 @@ struct signal_struct { cputime_t it_prof_expires, it_virt_expires; cputime_t it_prof_incr, it_virt_incr; + /* + * Thread group totals for process CPU clocks. + * See thread_group_cputime(), et al, for details. + */ + struct thread_group_cputime cputime; + + /* Earliest-expiration cache. */ + struct task_cputime cputime_expires; + + struct list_head cpu_timers[3]; + /* job control IDs */ /* @@ -500,7 +550,7 @@ struct signal_struct { * Live threads maintain their own counters and add to these * in __exit_signal, except for the group leader. */ - cputime_t utime, stime, cutime, cstime; + cputime_t cutime, cstime; cputime_t gtime; cputime_t cgtime; unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw; @@ -508,14 +558,6 @@ struct signal_struct { unsigned long inblock, oublock, cinblock, coublock; struct task_io_accounting ioac; - /* - * Cumulative ns of scheduled CPU time for dead threads in the - * group, not including a zombie group leader. (This only differs - * from jiffies_to_ns(utime + stime) if sched_clock uses something - * other than jiffies.) - */ - unsigned long long sum_sched_runtime; - /* * We don't bother to synchronize most readers of this at all, * because there is no reader checking a limit that actually needs @@ -527,8 +569,6 @@ struct signal_struct { */ struct rlimit rlim[RLIM_NLIMITS]; - struct list_head cpu_timers[3]; - /* keep the process-shared keyrings here so that they do the right * thing in threads created with CLONE_THREAD */ #ifdef CONFIG_KEYS @@ -1134,8 +1174,7 @@ struct task_struct { /* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */ unsigned long min_flt, maj_flt; - cputime_t it_prof_expires, it_virt_expires; - unsigned long long it_sched_expires; + struct task_cputime cputime_expires; struct list_head cpu_timers[3]; /* process credentials */ @@ -1585,6 +1624,7 @@ extern unsigned long long cpu_clock(int cpu); extern unsigned long long task_sched_runtime(struct task_struct *task); +extern unsigned long long thread_group_sched_runtime(struct task_struct *task); /* sched_exec is called by processes performing an exec */ #ifdef CONFIG_SMP @@ -2081,6 +2121,197 @@ static inline int spin_needbreak(spinlock_t *lock) #endif } +/* + * Thread group CPU time accounting. + */ +#ifdef CONFIG_SMP + +extern int thread_group_cputime_alloc_smp(struct task_struct *); +extern void thread_group_cputime_smp(struct task_struct *, struct task_cputime *); + +static inline void thread_group_cputime_init(struct signal_struct *sig) +{ + sig->cputime.totals = NULL; +} + +static inline int thread_group_cputime_clone_thread(struct task_struct *curr, + struct task_struct *new) +{ + if (curr->signal->cputime.totals) + return 0; + return thread_group_cputime_alloc_smp(curr); +} + +static inline void thread_group_cputime_free(struct signal_struct *sig) +{ + free_percpu(sig->cputime.totals); +} + +/** + * thread_group_cputime - Sum the thread group time fields across all CPUs. + * + * This is a wrapper for the real routine, thread_group_cputime_smp(). See + * that routine for details. + */ +static inline void thread_group_cputime( + struct task_struct *tsk, + struct task_cputime *times) +{ + thread_group_cputime_smp(tsk, times); +} + +/** + * thread_group_cputime_account_user - Maintain utime for a thread group. + * + * @tgtimes: Pointer to thread_group_cputime structure. + * @cputime: Time value by which to increment the utime field of that + * structure. + * + * If thread group time is being maintained, get the structure for the + * running CPU and update the utime field there. + */ +static inline void thread_group_cputime_account_user( + struct thread_group_cputime *tgtimes, + cputime_t cputime) +{ + if (tgtimes->totals) { + struct task_cputime *times; + + times = per_cpu_ptr(tgtimes->totals, get_cpu()); + times->utime = cputime_add(times->utime, cputime); + put_cpu_no_resched(); + } +} + +/** + * thread_group_cputime_account_system - Maintain stime for a thread group. + * + * @tgtimes: Pointer to thread_group_cputime structure. + * @cputime: Time value by which to increment the stime field of that + * structure. + * + * If thread group time is being maintained, get the structure for the + * running CPU and update the stime field there. + */ +static inline void thread_group_cputime_account_system( + struct thread_group_cputime *tgtimes, + cputime_t cputime) +{ + if (tgtimes->totals) { + struct task_cputime *times; + + times = per_cpu_ptr(tgtimes->totals, get_cpu()); + times->stime = cputime_add(times->stime, cputime); + put_cpu_no_resched(); + } +} + +/** + * thread_group_cputime_account_exec_runtime - Maintain exec runtime for a + * thread group. + * + * @tgtimes: Pointer to thread_group_cputime structure. + * @ns: Time value by which to increment the sum_exec_runtime field + * of that structure. + * + * If thread group time is being maintained, get the structure for the + * running CPU and update the sum_exec_runtime field there. + */ +static inline void thread_group_cputime_account_exec_runtime( + struct thread_group_cputime *tgtimes, + unsigned long long ns) +{ + if (tgtimes->totals) { + struct task_cputime *times; + + times = per_cpu_ptr(tgtimes->totals, get_cpu()); + times->sum_exec_runtime += ns; + put_cpu_no_resched(); + } +} + +#else /* CONFIG_SMP */ + +static inline void thread_group_cputime_init(struct signal_struct *sig) +{ + sig->cputime.totals.utime = cputime_zero; + sig->cputime.totals.stime = cputime_zero; + sig->cputime.totals.sum_exec_runtime = 0; +} + +static inline int thread_group_cputime_alloc(struct task_struct *tsk) +{ + return 0; +} + +static inline void thread_group_cputime_free(struct signal_struct *sig) +{ +} + +static inline int thread_group_cputime_clone_thread(struct task_struct *curr, + struct task_struct *tsk) +{ +} + +static inline void thread_group_cputime(struct task_struct *tsk, + struct task_cputime *cputime) +{ + *cputime = tsk->signal->cputime.totals; +} + +static inline void thread_group_cputime_account_user( + struct thread_group_cputime *tgtimes, + cputime_t cputime) +{ + tgtimes->totals->utime = cputime_add(tgtimes->totals->utime, cputime); +} + +static inline void thread_group_cputime_account_system( + struct thread_group_cputime *tgtimes, + cputime_t cputime) +{ + tgtimes->totals->stime = cputime_add(tgtimes->totals->stime, cputime); +} + +static inline void thread_group_cputime_account_exec_runtime( + struct thread_group_cputime *tgtimes, + unsigned long long ns) +{ + tgtimes->totals->sum_exec_runtime += ns; +} + +#endif /* CONFIG_SMP */ + +static inline void account_group_user_time(struct task_struct *tsk, + cputime_t cputime) +{ + struct signal_struct *sig; + + sig = tsk->signal; + if (likely(sig)) + thread_group_cputime_account_user(&sig->cputime, cputime); +} + +static inline void account_group_system_time(struct task_struct *tsk, + cputime_t cputime) +{ + struct signal_struct *sig; + + sig = tsk->signal; + if (likely(sig)) + thread_group_cputime_account_system(&sig->cputime, cputime); +} + +static inline void account_group_exec_runtime(struct task_struct *tsk, + unsigned long long ns) +{ + struct signal_struct *sig; + + sig = tsk->signal; + if (likely(sig)) + thread_group_cputime_account_exec_runtime(&sig->cputime, ns); +} + /* * Reevaluate whether the task has signals pending delivery. * Wake the task if so. diff --git a/include/linux/time.h b/include/linux/time.h index e15206a7e82..1b70b3c293e 100644 --- a/include/linux/time.h +++ b/include/linux/time.h @@ -125,6 +125,9 @@ extern int timekeeping_valid_for_hres(void); extern void update_wall_time(void); extern void update_xtime_cache(u64 nsec); +struct tms; +extern void do_sys_times(struct tms *); + /** * timespec_to_ns - Convert timespec to nanoseconds * @ts: pointer to the timespec variable to be converted -- cgit v1.2.3 From 0a8eaa4f9b58759595a1bfe13a1295fdc25ba026 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 14 Sep 2008 17:03:52 +0200 Subject: timers: fix itimer/many thread hang, fix #2 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit fix the UP build: In file included from arch/x86/kernel/asm-offsets_32.c:9, from arch/x86/kernel/asm-offsets.c:3: include/linux/sched.h: In function ‘thread_group_cputime_clone_thread’: include/linux/sched.h:2272: warning: no return statement in function returning non-void include/linux/sched.h: In function ‘thread_group_cputime_account_user’: include/linux/sched.h:2284: error: invalid type argument of ‘->’ (have ‘struct task_cputime’) include/linux/sched.h:2284: error: invalid type argument of ‘->’ (have ‘struct task_cputime’) include/linux/sched.h: In function ‘thread_group_cputime_account_system’: include/linux/sched.h:2291: error: invalid type argument of ‘->’ (have ‘struct task_cputime’) include/linux/sched.h:2291: error: invalid type argument of ‘->’ (have ‘struct task_cputime’) include/linux/sched.h: In function ‘thread_group_cputime_account_exec_runtime’: include/linux/sched.h:2298: error: invalid type argument of ‘->’ (have ‘struct task_cputime’) distcc[14501] ERROR: compile arch/x86/kernel/asm-offsets.c on a/30 failed make[1]: *** [arch/x86/kernel/asm-offsets.s] Error 1 Signed-off-by: Ingo Molnar --- include/linux/sched.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 26d7a5f2d0b..ed355f02d32 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2251,6 +2251,7 @@ static inline void thread_group_cputime_free(struct signal_struct *sig) static inline int thread_group_cputime_clone_thread(struct task_struct *curr, struct task_struct *tsk) { + return 0; } static inline void thread_group_cputime(struct task_struct *tsk, @@ -2263,21 +2264,21 @@ static inline void thread_group_cputime_account_user( struct thread_group_cputime *tgtimes, cputime_t cputime) { - tgtimes->totals->utime = cputime_add(tgtimes->totals->utime, cputime); + tgtimes->totals.utime = cputime_add(tgtimes->totals.utime, cputime); } static inline void thread_group_cputime_account_system( struct thread_group_cputime *tgtimes, cputime_t cputime) { - tgtimes->totals->stime = cputime_add(tgtimes->totals->stime, cputime); + tgtimes->totals.stime = cputime_add(tgtimes->totals.stime, cputime); } static inline void thread_group_cputime_account_exec_runtime( struct thread_group_cputime *tgtimes, unsigned long long ns) { - tgtimes->totals->sum_exec_runtime += ns; + tgtimes->totals.sum_exec_runtime += ns; } #endif /* CONFIG_SMP */ -- cgit v1.2.3 From 5ce73a4a5a4893a1aa4cdeed1b1a5a6de42c43b6 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 14 Sep 2008 17:11:46 +0200 Subject: timers: fix itimer/many thread hang, cleanups Signed-off-by: Ingo Molnar --- include/linux/sched.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index ed355f02d32..7ce8d4e5356 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -430,7 +430,7 @@ struct pacct_struct { * @utime: time spent in user mode, in &cputime_t units * @stime: time spent in kernel mode, in &cputime_t units * @sum_exec_runtime: total time spent on the CPU, in nanoseconds - * + * * This structure groups together three kinds of CPU time that are * tracked for threads and thread groups. Most things considering * CPU time want to group these counts together and treat all three -- cgit v1.2.3 From d7cfb60c5cf904ecf1e0ae23ec178175b86f0d4a Mon Sep 17 00:00:00 2001 From: Mark McLoughlin Date: Fri, 19 Sep 2008 13:13:44 +0100 Subject: hrtimer: remove hrtimer_clock_base::get_softirq_time() Peter Zijlstra noticed this 8 months ago and I just noticed it again. hrtimer_clock_base::get_softirq_time() is currently unused in the entire tree. In fact, looking at the logs, it appears as if it was never used. Remove it. Signed-off-by: Mark McLoughlin Signed-off-by: Ingo Molnar --- include/linux/hrtimer.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 6d93dce61cb..1b079bd29c3 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -145,7 +145,6 @@ struct hrtimer_sleeper { * @first: pointer to the timer node which expires first * @resolution: the resolution of the clock, in nanoseconds * @get_time: function to retrieve the current time of the clock - * @get_softirq_time: function to retrieve the current time from the softirq * @softirq_time: the time when running the hrtimer queue in the softirq * @offset: offset of this clock to the monotonic base * @reprogram: function to reprogram the timer event @@ -157,7 +156,6 @@ struct hrtimer_clock_base { struct rb_node *first; ktime_t resolution; ktime_t (*get_time)(void); - ktime_t (*get_softirq_time)(void); ktime_t softirq_time; #ifdef CONFIG_HIGH_RES_TIMERS ktime_t offset; -- cgit v1.2.3 From b91c4996df56fcd201f85c392a1de7bc3f6641f5 Mon Sep 17 00:00:00 2001 From: Mark McLoughlin Date: Fri, 19 Sep 2008 13:13:48 +0100 Subject: hrtimer: remove hrtimer_clock_base::reprogram() hrtimer_clock_base::reprogram() also appears to never have been used, so remove it. Signed-off-by: Mark McLoughlin Signed-off-by: Ingo Molnar --- include/linux/hrtimer.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 1b079bd29c3..68b0196d869 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -147,7 +147,6 @@ struct hrtimer_sleeper { * @get_time: function to retrieve the current time of the clock * @softirq_time: the time when running the hrtimer queue in the softirq * @offset: offset of this clock to the monotonic base - * @reprogram: function to reprogram the timer event */ struct hrtimer_clock_base { struct hrtimer_cpu_base *cpu_base; @@ -159,9 +158,6 @@ struct hrtimer_clock_base { ktime_t softirq_time; #ifdef CONFIG_HIGH_RES_TIMERS ktime_t offset; - int (*reprogram)(struct hrtimer *t, - struct hrtimer_clock_base *b, - ktime_t n); #endif }; -- cgit v1.2.3 From bb34d92f643086d546b49cef680f6f305ed84414 Mon Sep 17 00:00:00 2001 From: Frank Mayhar Date: Fri, 12 Sep 2008 09:54:39 -0700 Subject: timers: fix itimer/many thread hang, v2 This is the second resubmission of the posix timer rework patch, posted a few days ago. This includes the changes from the previous resubmittion, which addressed Oleg Nesterov's comments, removing the RCU stuff from the patch and un-inlining the thread_group_cputime() function for SMP. In addition, per Ingo Molnar it simplifies the UP code, consolidating much of it with the SMP version and depending on lower-level SMP/UP handling to take care of the differences. It also cleans up some UP compile errors, moves the scheduler stats-related macros into kernel/sched_stats.h, cleans up a merge error in kernel/fork.c and has a few other minor fixes and cleanups as suggested by Oleg and Ingo. Thanks for the review, guys. Signed-off-by: Frank Mayhar Cc: Roland McGrath Cc: Alexey Dobriyan Cc: Andrew Morton Signed-off-by: Ingo Molnar --- include/linux/kernel_stat.h | 1 + include/linux/sched.h | 183 ++------------------------------------------ 2 files changed, 6 insertions(+), 178 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h index cf9f40a91c9..cac3750cd65 100644 --- a/include/linux/kernel_stat.h +++ b/include/linux/kernel_stat.h @@ -52,6 +52,7 @@ static inline int kstat_irqs(int irq) return sum; } +extern unsigned long long task_delta_exec(struct task_struct *); extern void account_user_time(struct task_struct *, cputime_t); extern void account_user_time_scaled(struct task_struct *, cputime_t); extern void account_system_time(struct task_struct *, int, cputime_t); diff --git a/include/linux/sched.h b/include/linux/sched.h index 7ce8d4e5356..b982fb48c8f 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -454,15 +454,9 @@ struct task_cputime { * This structure contains the version of task_cputime, above, that is * used for thread group CPU clock calculations. */ -#ifdef CONFIG_SMP struct thread_group_cputime { struct task_cputime *totals; }; -#else -struct thread_group_cputime { - struct task_cputime totals; -}; -#endif /* * NOTE! "signal_struct" does not have it's own @@ -2124,193 +2118,26 @@ static inline int spin_needbreak(spinlock_t *lock) /* * Thread group CPU time accounting. */ -#ifdef CONFIG_SMP -extern int thread_group_cputime_alloc_smp(struct task_struct *); -extern void thread_group_cputime_smp(struct task_struct *, struct task_cputime *); +extern int thread_group_cputime_alloc(struct task_struct *); +extern void thread_group_cputime(struct task_struct *, struct task_cputime *); static inline void thread_group_cputime_init(struct signal_struct *sig) { sig->cputime.totals = NULL; } -static inline int thread_group_cputime_clone_thread(struct task_struct *curr, - struct task_struct *new) +static inline int thread_group_cputime_clone_thread(struct task_struct *curr) { if (curr->signal->cputime.totals) return 0; - return thread_group_cputime_alloc_smp(curr); + return thread_group_cputime_alloc(curr); } -static inline void thread_group_cputime_free(struct signal_struct *sig) -{ - free_percpu(sig->cputime.totals); -} - -/** - * thread_group_cputime - Sum the thread group time fields across all CPUs. - * - * This is a wrapper for the real routine, thread_group_cputime_smp(). See - * that routine for details. - */ -static inline void thread_group_cputime( - struct task_struct *tsk, - struct task_cputime *times) -{ - thread_group_cputime_smp(tsk, times); -} - -/** - * thread_group_cputime_account_user - Maintain utime for a thread group. - * - * @tgtimes: Pointer to thread_group_cputime structure. - * @cputime: Time value by which to increment the utime field of that - * structure. - * - * If thread group time is being maintained, get the structure for the - * running CPU and update the utime field there. - */ -static inline void thread_group_cputime_account_user( - struct thread_group_cputime *tgtimes, - cputime_t cputime) -{ - if (tgtimes->totals) { - struct task_cputime *times; - - times = per_cpu_ptr(tgtimes->totals, get_cpu()); - times->utime = cputime_add(times->utime, cputime); - put_cpu_no_resched(); - } -} - -/** - * thread_group_cputime_account_system - Maintain stime for a thread group. - * - * @tgtimes: Pointer to thread_group_cputime structure. - * @cputime: Time value by which to increment the stime field of that - * structure. - * - * If thread group time is being maintained, get the structure for the - * running CPU and update the stime field there. - */ -static inline void thread_group_cputime_account_system( - struct thread_group_cputime *tgtimes, - cputime_t cputime) -{ - if (tgtimes->totals) { - struct task_cputime *times; - - times = per_cpu_ptr(tgtimes->totals, get_cpu()); - times->stime = cputime_add(times->stime, cputime); - put_cpu_no_resched(); - } -} - -/** - * thread_group_cputime_account_exec_runtime - Maintain exec runtime for a - * thread group. - * - * @tgtimes: Pointer to thread_group_cputime structure. - * @ns: Time value by which to increment the sum_exec_runtime field - * of that structure. - * - * If thread group time is being maintained, get the structure for the - * running CPU and update the sum_exec_runtime field there. - */ -static inline void thread_group_cputime_account_exec_runtime( - struct thread_group_cputime *tgtimes, - unsigned long long ns) -{ - if (tgtimes->totals) { - struct task_cputime *times; - - times = per_cpu_ptr(tgtimes->totals, get_cpu()); - times->sum_exec_runtime += ns; - put_cpu_no_resched(); - } -} - -#else /* CONFIG_SMP */ - -static inline void thread_group_cputime_init(struct signal_struct *sig) -{ - sig->cputime.totals.utime = cputime_zero; - sig->cputime.totals.stime = cputime_zero; - sig->cputime.totals.sum_exec_runtime = 0; -} - -static inline int thread_group_cputime_alloc(struct task_struct *tsk) -{ - return 0; -} static inline void thread_group_cputime_free(struct signal_struct *sig) { -} - -static inline int thread_group_cputime_clone_thread(struct task_struct *curr, - struct task_struct *tsk) -{ - return 0; -} - -static inline void thread_group_cputime(struct task_struct *tsk, - struct task_cputime *cputime) -{ - *cputime = tsk->signal->cputime.totals; -} - -static inline void thread_group_cputime_account_user( - struct thread_group_cputime *tgtimes, - cputime_t cputime) -{ - tgtimes->totals.utime = cputime_add(tgtimes->totals.utime, cputime); -} - -static inline void thread_group_cputime_account_system( - struct thread_group_cputime *tgtimes, - cputime_t cputime) -{ - tgtimes->totals.stime = cputime_add(tgtimes->totals.stime, cputime); -} - -static inline void thread_group_cputime_account_exec_runtime( - struct thread_group_cputime *tgtimes, - unsigned long long ns) -{ - tgtimes->totals.sum_exec_runtime += ns; -} - -#endif /* CONFIG_SMP */ - -static inline void account_group_user_time(struct task_struct *tsk, - cputime_t cputime) -{ - struct signal_struct *sig; - - sig = tsk->signal; - if (likely(sig)) - thread_group_cputime_account_user(&sig->cputime, cputime); -} - -static inline void account_group_system_time(struct task_struct *tsk, - cputime_t cputime) -{ - struct signal_struct *sig; - - sig = tsk->signal; - if (likely(sig)) - thread_group_cputime_account_system(&sig->cputime, cputime); -} - -static inline void account_group_exec_runtime(struct task_struct *tsk, - unsigned long long ns) -{ - struct signal_struct *sig; - - sig = tsk->signal; - if (likely(sig)) - thread_group_cputime_account_exec_runtime(&sig->cputime, ns); + free_percpu(sig->cputime.totals); } /* -- cgit v1.2.3 From 5a9fa73072854981a5c05eb7ba18a96d49c2804f Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 22 Sep 2008 14:42:50 -0700 Subject: posix-timers: kill ->it_sigev_signo and ->it_sigev_value With the recent changes ->it_sigev_signo and ->it_sigev_value are only used in sys_timer_create(), kill them. Signed-off-by: Oleg Nesterov Cc: mingo@elte.hu Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Thomas Gleixner --- include/linux/posix-timers.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h index f9d8e9e94e9..a7c72135554 100644 --- a/include/linux/posix-timers.h +++ b/include/linux/posix-timers.h @@ -45,8 +45,6 @@ struct k_itimer { int it_requeue_pending; /* waiting to requeue this timer */ #define REQUEUE_PENDING 1 int it_sigev_notify; /* notify word of sigevent struct */ - int it_sigev_signo; /* signo word of sigevent struct */ - sigval_t it_sigev_value; /* value word of sigevent struct */ struct task_struct *it_process; /* process to send signal to */ struct sigqueue *sigq; /* signal queue entry. */ union { -- cgit v1.2.3 From 1b02469088ac7a13d7e622b618b7410d0f1ce5ec Mon Sep 17 00:00:00 2001 From: Richard Kennedy Date: Mon, 22 Sep 2008 14:42:43 -0700 Subject: hrtimer: reorder struct hrtimer to save 8 bytes on 64bit builds reorder struct hrtimer to save 8 bytes on 64 bit builds when CONFIG_TIMER_STATS selected. (also removes 8 bytes from signal_struct) Signed-off-by: Richard Kennedy Signed-off-by: Andrew Morton Signed-off-by: Thomas Gleixner --- include/linux/hrtimer.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 68b0196d869..8730b60c943 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -115,12 +115,12 @@ struct hrtimer { enum hrtimer_restart (*function)(struct hrtimer *); struct hrtimer_clock_base *base; unsigned long state; - enum hrtimer_cb_mode cb_mode; struct list_head cb_entry; + enum hrtimer_cb_mode cb_mode; #ifdef CONFIG_TIMER_STATS + int start_pid; void *start_site; char start_comm[16]; - int start_pid; #endif }; -- cgit v1.2.3 From d40e944c25fb4642adb2a4c580a48218a9f3f824 Mon Sep 17 00:00:00 2001 From: Roman Zippel Date: Mon, 22 Sep 2008 14:42:44 -0700 Subject: ntp: improve adjtimex frequency rounding Change PPM_SCALE_INV_SHIFT so that it doesn't throw away any input bits (19 is the amount of the factor 2 in PPM_SCALE), the output frequency can then be calculated back to its input value, as the inverse divide produce a slightly larger value, which is then correctly rounded by the final shift. Reported-by: Martin Ziegler Signed-off-by: Roman Zippel Cc: John Stultz Signed-off-by: Andrew Morton Signed-off-by: Thomas Gleixner --- include/linux/timex.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/timex.h b/include/linux/timex.h index c00bcdd3ae4..9007313b5b7 100644 --- a/include/linux/timex.h +++ b/include/linux/timex.h @@ -82,7 +82,7 @@ */ #define SHIFT_USEC 16 /* frequency offset scale (shift) */ #define PPM_SCALE (NSEC_PER_USEC << (NTP_SCALE_SHIFT - SHIFT_USEC)) -#define PPM_SCALE_INV_SHIFT 20 +#define PPM_SCALE_INV_SHIFT 19 #define PPM_SCALE_INV ((1ll << (PPM_SCALE_INV_SHIFT + NTP_SCALE_SHIFT)) / \ PPM_SCALE + 1) -- cgit v1.2.3 From 7086efe1c1536f6bc160e7d60a9bfd645b91f279 Mon Sep 17 00:00:00 2001 From: Frank Mayhar Date: Fri, 12 Sep 2008 09:54:39 -0700 Subject: timers: fix itimer/many thread hang, v3 - fix UP lockup - another set of UP/SMP cleanups and simplifications Signed-off-by: Frank Mayhar Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index b982fb48c8f..23d9d546454 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2134,7 +2134,6 @@ static inline int thread_group_cputime_clone_thread(struct task_struct *curr) return thread_group_cputime_alloc(curr); } - static inline void thread_group_cputime_free(struct signal_struct *sig) { free_percpu(sig->cputime.totals); -- cgit v1.2.3 From 97e1c18e8d17bd87e1e383b2e9d9fc740332c8e2 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Fri, 18 Jul 2008 12:16:16 -0400 Subject: tracing: Kernel Tracepoints Implementation of kernel tracepoints. Inspired from the Linux Kernel Markers. Allows complete typing verification by declaring both tracing statement inline functions and probe registration/unregistration static inline functions within the same macro "DEFINE_TRACE". No format string is required. See the tracepoint Documentation and Samples patches for usage examples. Taken from the documentation patch : "A tracepoint placed in code provides a hook to call a function (probe) that you can provide at runtime. A tracepoint can be "on" (a probe is connected to it) or "off" (no probe is attached). When a tracepoint is "off" it has no effect, except for adding a tiny time penalty (checking a condition for a branch) and space penalty (adding a few bytes for the function call at the end of the instrumented function and adds a data structure in a separate section). When a tracepoint is "on", the function you provide is called each time the tracepoint is executed, in the execution context of the caller. When the function provided ends its execution, it returns to the caller (continuing from the tracepoint site). You can put tracepoints at important locations in the code. They are lightweight hooks that can pass an arbitrary number of parameters, which prototypes are described in a tracepoint declaration placed in a header file." Addition and removal of tracepoints is synchronized by RCU using the scheduler (and preempt_disable) as guarantees to find a quiescent state (this is really RCU "classic"). The update side uses rcu_barrier_sched() with call_rcu_sched() and the read/execute side uses "preempt_disable()/preempt_enable()". We make sure the previous array containing probes, which has been scheduled for deletion by the rcu callback, is indeed freed before we proceed to the next update. It therefore limits the rate of modification of a single tracepoint to one update per RCU period. The objective here is to permit fast batch add/removal of probes on _different_ tracepoints. Changelog : - Use #name ":" #proto as string to identify the tracepoint in the tracepoint table. This will make sure not type mismatch happens due to connexion of a probe with the wrong type to a tracepoint declared with the same name in a different header. - Add tracepoint_entry_free_old. - Change __TO_TRACE to get rid of the 'i' iterator. Masami Hiramatsu : Tested on x86-64. Performance impact of a tracepoint : same as markers, except that it adds about 70 bytes of instructions in an unlikely branch of each instrumented function (the for loop, the stack setup and the function call). It currently adds a memory read, a test and a conditional branch at the instrumentation site (in the hot path). Immediate values will eventually change this into a load immediate, test and branch, which removes the memory read which will make the i-cache impact smaller (changing the memory read for a load immediate removes 3-4 bytes per site on x86_32 (depending on mov prefixes), or 7-8 bytes on x86_64, it also saves the d-cache hit). About the performance impact of tracepoints (which is comparable to markers), even without immediate values optimizations, tests done by Hideo Aoki on ia64 show no regression. His test case was using hackbench on a kernel where scheduler instrumentation (about 5 events in code scheduler code) was added. Quoting Hideo Aoki about Markers : I evaluated overhead of kernel marker using linux-2.6-sched-fixes git tree, which includes several markers for LTTng, using an ia64 server. While the immediate trace mark feature isn't implemented on ia64, there is no major performance regression. So, I think that we don't have any issues to propose merging marker point patches into Linus's tree from the viewpoint of performance impact. I prepared two kernels to evaluate. The first one was compiled without CONFIG_MARKERS. The second one was enabled CONFIG_MARKERS. I downloaded the original hackbench from the following URL: http://devresources.linux-foundation.org/craiger/hackbench/src/hackbench.c I ran hackbench 5 times in each condition and calculated the average and difference between the kernels. The parameter of hackbench: every 50 from 50 to 800 The number of CPUs of the server: 2, 4, and 8 Below is the results. As you can see, major performance regression wasn't found in any case. Even if number of processes increases, differences between marker-enabled kernel and marker- disabled kernel doesn't increase. Moreover, if number of CPUs increases, the differences doesn't increase either. Curiously, marker-enabled kernel is better than marker-disabled kernel in more than half cases, although I guess it comes from the difference of memory access pattern. * 2 CPUs Number of | without | with | diff | diff | processes | Marker [Sec] | Marker [Sec] | [Sec] | [%] | -------------------------------------------------------------- 50 | 4.811 | 4.872 | +0.061 | +1.27 | 100 | 9.854 | 10.309 | +0.454 | +4.61 | 150 | 15.602 | 15.040 | -0.562 | -3.6 | 200 | 20.489 | 20.380 | -0.109 | -0.53 | 250 | 25.798 | 25.652 | -0.146 | -0.56 | 300 | 31.260 | 30.797 | -0.463 | -1.48 | 350 | 36.121 | 35.770 | -0.351 | -0.97 | 400 | 42.288 | 42.102 | -0.186 | -0.44 | 450 | 47.778 | 47.253 | -0.526 | -1.1 | 500 | 51.953 | 52.278 | +0.325 | +0.63 | 550 | 58.401 | 57.700 | -0.701 | -1.2 | 600 | 63.334 | 63.222 | -0.112 | -0.18 | 650 | 68.816 | 68.511 | -0.306 | -0.44 | 700 | 74.667 | 74.088 | -0.579 | -0.78 | 750 | 78.612 | 79.582 | +0.970 | +1.23 | 800 | 85.431 | 85.263 | -0.168 | -0.2 | -------------------------------------------------------------- * 4 CPUs Number of | without | with | diff | diff | processes | Marker [Sec] | Marker [Sec] | [Sec] | [%] | -------------------------------------------------------------- 50 | 2.586 | 2.584 | -0.003 | -0.1 | 100 | 5.254 | 5.283 | +0.030 | +0.56 | 150 | 8.012 | 8.074 | +0.061 | +0.76 | 200 | 11.172 | 11.000 | -0.172 | -1.54 | 250 | 13.917 | 14.036 | +0.119 | +0.86 | 300 | 16.905 | 16.543 | -0.362 | -2.14 | 350 | 19.901 | 20.036 | +0.135 | +0.68 | 400 | 22.908 | 23.094 | +0.186 | +0.81 | 450 | 26.273 | 26.101 | -0.172 | -0.66 | 500 | 29.554 | 29.092 | -0.461 | -1.56 | 550 | 32.377 | 32.274 | -0.103 | -0.32 | 600 | 35.855 | 35.322 | -0.533 | -1.49 | 650 | 39.192 | 38.388 | -0.804 | -2.05 | 700 | 41.744 | 41.719 | -0.025 | -0.06 | 750 | 45.016 | 44.496 | -0.520 | -1.16 | 800 | 48.212 | 47.603 | -0.609 | -1.26 | -------------------------------------------------------------- * 8 CPUs Number of | without | with | diff | diff | processes | Marker [Sec] | Marker [Sec] | [Sec] | [%] | -------------------------------------------------------------- 50 | 2.094 | 2.072 | -0.022 | -1.07 | 100 | 4.162 | 4.273 | +0.111 | +2.66 | 150 | 6.485 | 6.540 | +0.055 | +0.84 | 200 | 8.556 | 8.478 | -0.078 | -0.91 | 250 | 10.458 | 10.258 | -0.200 | -1.91 | 300 | 12.425 | 12.750 | +0.325 | +2.62 | 350 | 14.807 | 14.839 | +0.032 | +0.22 | 400 | 16.801 | 16.959 | +0.158 | +0.94 | 450 | 19.478 | 19.009 | -0.470 | -2.41 | 500 | 21.296 | 21.504 | +0.208 | +0.98 | 550 | 23.842 | 23.979 | +0.137 | +0.57 | 600 | 26.309 | 26.111 | -0.198 | -0.75 | 650 | 28.705 | 28.446 | -0.259 | -0.9 | 700 | 31.233 | 31.394 | +0.161 | +0.52 | 750 | 34.064 | 33.720 | -0.344 | -1.01 | 800 | 36.320 | 36.114 | -0.206 | -0.57 | -------------------------------------------------------------- Signed-off-by: Mathieu Desnoyers Acked-by: Masami Hiramatsu Acked-by: 'Peter Zijlstra' Signed-off-by: Ingo Molnar --- include/linux/module.h | 17 ++++++ include/linux/tracepoint.h | 127 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 144 insertions(+) create mode 100644 include/linux/tracepoint.h (limited to 'include/linux') diff --git a/include/linux/module.h b/include/linux/module.h index 68e09557c95..8b611350386 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -16,6 +16,7 @@ #include #include #include +#include #include #include @@ -331,6 +332,10 @@ struct module struct marker *markers; unsigned int num_markers; #endif +#ifdef CONFIG_TRACEPOINTS + struct tracepoint *tracepoints; + unsigned int num_tracepoints; +#endif #ifdef CONFIG_MODULE_UNLOAD /* What modules depend on me? */ @@ -454,6 +459,9 @@ extern void print_modules(void); extern void module_update_markers(void); +extern void module_update_tracepoints(void); +extern int module_get_iter_tracepoints(struct tracepoint_iter *iter); + #else /* !CONFIG_MODULES... */ #define EXPORT_SYMBOL(sym) #define EXPORT_SYMBOL_GPL(sym) @@ -558,6 +566,15 @@ static inline void module_update_markers(void) { } +static inline void module_update_tracepoints(void) +{ +} + +static inline int module_get_iter_tracepoints(struct tracepoint_iter *iter) +{ + return 0; +} + #endif /* CONFIG_MODULES */ struct device_driver; diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h new file mode 100644 index 00000000000..e623a6fca5c --- /dev/null +++ b/include/linux/tracepoint.h @@ -0,0 +1,127 @@ +#ifndef _LINUX_TRACEPOINT_H +#define _LINUX_TRACEPOINT_H + +/* + * Kernel Tracepoint API. + * + * See Documentation/tracepoint.txt. + * + * (C) Copyright 2008 Mathieu Desnoyers + * + * Heavily inspired from the Linux Kernel Markers. + * + * This file is released under the GPLv2. + * See the file COPYING for more details. + */ + +#include +#include + +struct module; +struct tracepoint; + +struct tracepoint { + const char *name; /* Tracepoint name */ + int state; /* State. */ + void **funcs; +} __attribute__((aligned(8))); + + +#define TPPROTO(args...) args +#define TPARGS(args...) args + +#ifdef CONFIG_TRACEPOINTS + +/* + * it_func[0] is never NULL because there is at least one element in the array + * when the array itself is non NULL. + */ +#define __DO_TRACE(tp, proto, args) \ + do { \ + void **it_func; \ + \ + rcu_read_lock_sched(); \ + it_func = rcu_dereference((tp)->funcs); \ + if (it_func) { \ + do { \ + ((void(*)(proto))(*it_func))(args); \ + } while (*(++it_func)); \ + } \ + rcu_read_unlock_sched(); \ + } while (0) + +/* + * Make sure the alignment of the structure in the __tracepoints section will + * not add unwanted padding between the beginning of the section and the + * structure. Force alignment to the same alignment as the section start. + */ +#define DEFINE_TRACE(name, proto, args) \ + static inline void trace_##name(proto) \ + { \ + static const char __tpstrtab_##name[] \ + __attribute__((section("__tracepoints_strings"))) \ + = #name ":" #proto; \ + static struct tracepoint __tracepoint_##name \ + __attribute__((section("__tracepoints"), aligned(8))) = \ + { __tpstrtab_##name, 0, NULL }; \ + if (unlikely(__tracepoint_##name.state)) \ + __DO_TRACE(&__tracepoint_##name, \ + TPPROTO(proto), TPARGS(args)); \ + } \ + static inline int register_trace_##name(void (*probe)(proto)) \ + { \ + return tracepoint_probe_register(#name ":" #proto, \ + (void *)probe); \ + } \ + static inline void unregister_trace_##name(void (*probe)(proto))\ + { \ + tracepoint_probe_unregister(#name ":" #proto, \ + (void *)probe); \ + } + +extern void tracepoint_update_probe_range(struct tracepoint *begin, + struct tracepoint *end); + +#else /* !CONFIG_TRACEPOINTS */ +#define DEFINE_TRACE(name, proto, args) \ + static inline void _do_trace_##name(struct tracepoint *tp, proto) \ + { } \ + static inline void trace_##name(proto) \ + { } \ + static inline int register_trace_##name(void (*probe)(proto)) \ + { \ + return -ENOSYS; \ + } \ + static inline void unregister_trace_##name(void (*probe)(proto))\ + { } + +static inline void tracepoint_update_probe_range(struct tracepoint *begin, + struct tracepoint *end) +{ } +#endif /* CONFIG_TRACEPOINTS */ + +/* + * Connect a probe to a tracepoint. + * Internal API, should not be used directly. + */ +extern int tracepoint_probe_register(const char *name, void *probe); + +/* + * Disconnect a probe from a tracepoint. + * Internal API, should not be used directly. + */ +extern int tracepoint_probe_unregister(const char *name, void *probe); + +struct tracepoint_iter { + struct module *module; + struct tracepoint *tracepoint; +}; + +extern void tracepoint_iter_start(struct tracepoint_iter *iter); +extern void tracepoint_iter_next(struct tracepoint_iter *iter); +extern void tracepoint_iter_stop(struct tracepoint_iter *iter); +extern void tracepoint_iter_reset(struct tracepoint_iter *iter); +extern int tracepoint_get_iter_range(struct tracepoint **tracepoint, + struct tracepoint *begin, struct tracepoint *end); + +#endif -- cgit v1.2.3 From 36dcd67ae994fece615b7c700958d215e884b9ae Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 29 Jul 2008 12:00:59 +0200 Subject: ftrace: ignore functions that cannot be kprobe-ed kprobes already has an extensive list of annotations for functions that should not be instrumented. Add notrace annotations to these functions as well. This is particularly useful for functions called by the NMI path. Signed-off-by: Ingo Molnar --- include/linux/kprobes.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 0be7795655f..497b1d1f7a0 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -29,6 +29,7 @@ * and Prasanna S Panchamukhi * added function-return probes. */ +#include #include #include #include @@ -47,7 +48,7 @@ #define KPROBE_HIT_SSDONE 0x00000008 /* Attach to insert probes on any functions which should be ignored*/ -#define __kprobes __attribute__((__section__(".kprobes.text"))) +#define __kprobes __attribute__((__section__(".kprobes.text"))) notrace struct kprobe; struct pt_regs; @@ -256,7 +257,7 @@ void recycle_rp_inst(struct kretprobe_instance *ri, struct hlist_head *head); #else /* CONFIG_KPROBES */ -#define __kprobes /**/ +#define __kprobes notrace struct jprobe; struct kretprobe; -- cgit v1.2.3 From 68bf21aa15c85d2e9b623dcda2b1ed8893275fa1 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 14 Aug 2008 15:45:08 -0400 Subject: ftrace: mcount call site on boot nops core This is the infrastructure to the converting the mcount call sites recorded by the __mcount_loc section into nops on boot. It also allows for using these sites to enable tracing as normal. When the __mcount_loc section is used, the "ftraced" kernel thread is disabled. This uses the current infrastructure to record the mcount call sites as well as convert them to nops. The mcount function is kept as a stub on boot up and not converted to the ftrace_record_ip function. We use the ftrace_record_ip to only record from the table. This patch does not handle modules. That comes with a later patch. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- include/linux/ftrace.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index bb384068272..d4d6ab453b7 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -162,4 +162,10 @@ static inline void ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) { } #endif +#ifdef CONFIG_FTRACE_MCOUNT_RECORD +extern void ftrace_init(void); +#else +static inline void ftrace_init(void) { } +#endif + #endif /* _LINUX_FTRACE_H */ -- cgit v1.2.3 From 90d595fe5ca4b685465c068907e6e554760abea8 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 14 Aug 2008 15:45:09 -0400 Subject: ftrace: enable mcount recording for modules This patch enables the loading of the __mcount_section of modules and changing all the callers of mcount into nops. The modification is done before the init_module function is called, so again, we do not need to use kstop_machine to make these changes. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- include/linux/ftrace.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index d4d6ab453b7..4936489f9ed 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -164,8 +164,11 @@ ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) { } #ifdef CONFIG_FTRACE_MCOUNT_RECORD extern void ftrace_init(void); +extern void ftrace_init_module(unsigned long *start, unsigned long *end); #else static inline void ftrace_init(void) { } +static inline void +ftrace_init_module(unsigned long *start, unsigned long *end) { } #endif #endif /* _LINUX_FTRACE_H */ -- cgit v1.2.3 From 29e71abf56cebc5c5a4e184a6eb4360cc58554ad Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 14 Aug 2008 15:45:10 -0400 Subject: ftrace: rebuild everything on change to FTRACE_MCOUNT_RECORD When enabling or disabling CONFIG_FTRACE_MCOUNT_RECORD, we want a full kernel compile to handle the adding of the __mcount_loc sections. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- include/linux/kernel.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 75d81f157d2..ecce4a4ccd5 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -486,4 +486,9 @@ struct sysinfo { #define NUMA_BUILD 0 #endif +/* Rebuild everything on CONFIG_FTRACE_MCOUNT_RECORD */ +#ifdef CONFIG_FTRACE_MCOUNT_RECORD +# define REBUILD_DUE_TO_FTRACE_MCOUNT_RECORD +#endif + #endif -- cgit v1.2.3 From 28614889bcb2558a47d02d52394b7fd9795a9547 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 14 Aug 2008 22:47:18 -0400 Subject: ftrace: move notrace to compiler.h The notrace define belongs in compiler.h so that it can be used in init.h Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- include/linux/compiler.h | 2 ++ include/linux/linkage.h | 2 -- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 8322141ee48..98115d9d04d 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -44,6 +44,8 @@ extern void __chk_io_ptr(const volatile void __iomem *); # error Sorry, your compiler is too old/not recognized. #endif +#define notrace __attribute__((no_instrument_function)) + /* Intel compiler defines __GNUC__. So we will overwrite implementations * coming from above header files here */ diff --git a/include/linux/linkage.h b/include/linux/linkage.h index 56ba3739465..9fd1f859021 100644 --- a/include/linux/linkage.h +++ b/include/linux/linkage.h @@ -4,8 +4,6 @@ #include #include -#define notrace __attribute__((no_instrument_function)) - #ifdef __cplusplus #define CPP_ASMLINKAGE extern "C" #else -- cgit v1.2.3 From fed1939c64d2288938fdc1c367d49082da65e195 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 14 Aug 2008 22:47:19 -0400 Subject: ftrace: remove old pointers to mcount When a mcount pointer is recorded into a table, it is used to add or remove calls to mcount (replacing them with nops). If the code is removed via removing a module, the pointers still exist. At modifying the code a check is always made to make sure the code being replaced is the code expected. In-other-words, the code being replaced is compared to what it is expected to be before being replaced. There is a very small chance that the code being replaced just happens to look like code that calls mcount (very small since the call to mcount is relative). To remove this chance, this patch adds ftrace_release to allow module unloading to remove the pointers to mcount within the module. Another change for init calls is made to not trace calls marked with __init. The tracing can not be started until after init is done anyway. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- include/linux/ftrace.h | 2 ++ include/linux/init.h | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 4936489f9ed..6b232a2460c 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -165,10 +165,12 @@ ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) { } #ifdef CONFIG_FTRACE_MCOUNT_RECORD extern void ftrace_init(void); extern void ftrace_init_module(unsigned long *start, unsigned long *end); +extern void ftrace_release(void *start, unsigned long size); #else static inline void ftrace_init(void) { } static inline void ftrace_init_module(unsigned long *start, unsigned long *end) { } +static inline void ftrace_release(void *start, unsigned long size) { } #endif #endif /* _LINUX_FTRACE_H */ diff --git a/include/linux/init.h b/include/linux/init.h index 93538b696e3..27f61f6b3cb 100644 --- a/include/linux/init.h +++ b/include/linux/init.h @@ -40,7 +40,7 @@ /* These are for everybody (although not all archs will actually discard it in modules) */ -#define __init __section(.init.text) __cold +#define __init __section(.init.text) __cold notrace #define __initdata __section(.init.data) #define __initconst __section(.init.rodata) #define __exitdata __section(.exit.data) -- cgit v1.2.3 From dd0e545f061f90099a3dcc13aa77e29c6295cf23 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 1 Aug 2008 12:26:41 -0400 Subject: ftrace: printk formatting infrastructure This patch adds a feature that can help kernel developers debug their code using ftrace. int ftrace_printk(const char *fmt, ...); This records into the ftrace buffer using printf formatting. The entry size in the buffers are still a fixed length. A new type has been added that allows for more entries to be used for a single recording. The start of the print is still the same as the other entries. It returns the number of characters written to the ftrace buffer. For example: Having a module with the following code: static int __init ftrace_print_test(void) { ftrace_printk("jiffies are %ld\n", jiffies); return 0; } Gives me: insmod-5441 3...1 7569us : ftrace_print_test: jiffies are 4296626666 for the latency_trace file and: insmod-5441 [03] 1959.370498: ftrace_print_test jiffies are 4296626666 for the trace file. Note: Only the infrastructure should go into the kernel. It is to help facilitate debugging for other kernel developers. Calls to ftrace_printk is not intended to be left in the kernel, and should be frowned upon just like scattering printks around in the code. But having this easily at your fingertips helps the debugging go faster and bugs be solved quicker. Maybe later on, we can hook this with markers and have their printf format be sucked into ftrace output. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- include/linux/ftrace.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 6b232a2460c..f53b975e32f 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -157,9 +157,18 @@ static inline void __ftrace_enabled_restore(int enabled) #ifdef CONFIG_TRACING extern void ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3); +# define ftrace_printk(x...) __ftrace_printk(_THIS_IP_, x) +extern int +__ftrace_printk(unsigned long ip, const char *fmt, ...) + __attribute__ ((format (printf, 2, 3))); #else static inline void ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) { } +static inline int +ftrace_printk(const char *fmt, ...) __attribute__ ((format (printf, 1, 0))) +{ + return 0; +} #endif #ifdef CONFIG_FTRACE_MCOUNT_RECORD @@ -173,4 +182,5 @@ ftrace_init_module(unsigned long *start, unsigned long *end) { } static inline void ftrace_release(void *start, unsigned long size) { } #endif + #endif /* _LINUX_FTRACE_H */ -- cgit v1.2.3 From 2f2c99dba2398ef7d9c21f7c793180a50e68b1f0 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 1 Aug 2008 16:45:49 -0400 Subject: ftrace: ftrace_printk doc moved Based on Randy Dunlap's suggestion, the ftrace_printk kernel-doc belongs with the ftrace_printk macro that should be used. Not with the __ftrace_printk internal function. Signed-off-by: Steven Rostedt Acked-by: Randy Dunlap Signed-off-by: Ingo Molnar --- include/linux/ftrace.h | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index f53b975e32f..018af16bce5 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -157,7 +157,24 @@ static inline void __ftrace_enabled_restore(int enabled) #ifdef CONFIG_TRACING extern void ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3); -# define ftrace_printk(x...) __ftrace_printk(_THIS_IP_, x) + +/** + * ftrace_printk - printf formatting in the ftrace buffer + * @fmt: the printf format for printing + * + * Note: __ftrace_printk is an internal function for ftrace_printk and + * the @ip is passed in via the ftrace_printk macro. + * + * This function allows a kernel developer to debug fast path sections + * that printk is not appropriate for. By scattering in various + * printk like tracing in the code, a developer can quickly see + * where problems are occurring. + * + * This is intended as a debugging tool for the developer only. + * Please refrain from leaving ftrace_printks scattered around in + * your code. + */ +# define ftrace_printk(fmt...) __ftrace_printk(_THIS_IP_, fmt) extern int __ftrace_printk(unsigned long ip, const char *fmt, ...) __attribute__ ((format (printf, 2, 3))); -- cgit v1.2.3 From 3f5a54e371ca20b119b73704f6c01b71295c1714 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 30 Jul 2008 22:36:46 -0400 Subject: ftrace: dump out ftrace buffers to console on panic At OLS I had a lot of interest to be able to have the ftrace buffers dumped on panic. Usually one would expect to uses kexec and examine the buffers after a new kernel is loaded. But sometimes the resources do not permit kdump and kexec, so having an option to still see the sequence of events up to the crash is very advantageous. This patch adds the option to have the ftrace buffers dumped to the console in the latency_trace format on a panic. When the option is set, the default entries per CPU buffer are lowered to 16384, since the writing to the serial (if that is the console) may take an awful long time otherwise. [ Changes since -v1: Got alpine to send correctly (as well as spell check working). Removed config option. Moved the static variables into ftrace_dump itself. Gave printk a log level. ] Signed-off-by: Steven Rostedt Cc: Peter Zijlstra Signed-off-by: Ingo Molnar --- include/linux/ftrace.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 018af16bce5..f7fb92045bf 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -178,6 +178,7 @@ ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3); extern int __ftrace_printk(unsigned long ip, const char *fmt, ...) __attribute__ ((format (printf, 2, 3))); +extern void ftrace_dump(void); #else static inline void ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) { } @@ -186,6 +187,7 @@ ftrace_printk(const char *fmt, ...) __attribute__ ((format (printf, 1, 0))) { return 0; } +static inline void ftrace_dump(void) { } #endif #ifdef CONFIG_FTRACE_MCOUNT_RECORD -- cgit v1.2.3 From 7b928c23fa3e9fa37d1d4ba52ba963f41ee5aae0 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 15 Aug 2008 17:48:02 +0200 Subject: ftrace: build fix MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit fix: In file included from init/main.c:65: include/linux/ftrace.h:166: error: expected ‘,' or ‘;' before ‘{' token make[1]: *** [init/main.o] Error 1 make: *** [init/main.o] Error 2 Signed-off-by: Ingo Molnar --- include/linux/ftrace.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index f7fb92045bf..ce929cb5543 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -183,7 +183,10 @@ extern void ftrace_dump(void); static inline void ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) { } static inline int -ftrace_printk(const char *fmt, ...) __attribute__ ((format (printf, 1, 0))) +ftrace_printk(const char *fmt, ...) __attribute__ ((format (printf, 1, 0))); + +static inline int +ftrace_printk(const char *fmt, ...) { return 0; } -- cgit v1.2.3 From c5131ad6c3cbe8f6674993e29a76cecf8deb4384 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 15 Aug 2008 18:22:09 +0200 Subject: ftrace: ftrace_kill_atomic() build fix fix: kernel/built-in.o: In function `ftrace_dump': (.text+0x2e2ea): undefined reference to `ftrace_kill_atomic' Signed-off-by: Ingo Molnar --- include/linux/ftrace.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index ce929cb5543..36c439927ff 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -36,6 +36,7 @@ extern void ftrace_stub(unsigned long a0, unsigned long a1); # define register_ftrace_function(ops) do { } while (0) # define unregister_ftrace_function(ops) do { } while (0) # define clear_ftrace_function(ops) do { } while (0) +static inline void ftrace_kill_atomic(void) { } #endif /* CONFIG_FTRACE */ #ifdef CONFIG_DYNAMIC_FTRACE -- cgit v1.2.3 From 3700273586ee6a58b95dd07d9f8a02db4a9b476f Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Mon, 18 Aug 2008 16:24:56 +0800 Subject: ftrace: fix incorrect comment style of __ftrace_enabled_save() This patch fixes incorrect comment style of __ftrace_enabled_save(). Signed-off-by: Huang Ying Signed-off-by: Ingo Molnar --- include/linux/ftrace.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 36c439927ff..8b4cf38c80d 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -99,9 +99,11 @@ static inline void tracer_disable(void) #endif } -/* Ftrace disable/restore without lock. Some synchronization mechanism +/* + * Ftrace disable/restore without lock. Some synchronization mechanism * must be used to prevent ftrace_enabled to be changed between - * disable/restore. */ + * disable/restore. + */ static inline int __ftrace_enabled_save(void) { #ifdef CONFIG_FTRACE -- cgit v1.2.3 From c0719e5a4b1ccc04180b7a7b71095c9fb7131919 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Sat, 6 Sep 2008 01:06:03 -0400 Subject: ftrace: use ftrace_release for all dynamic ftrace functions ftrace_release is necessary for all uses of dynamic ftrace and not just the archs that have CONFIG_FTRACE_MCOUNT_RECORD defined. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- include/linux/ftrace.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 8b4cf38c80d..5de9903645d 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -77,8 +77,10 @@ extern void mcount_call(void); extern int skip_trace(unsigned long ip); -void ftrace_disable_daemon(void); -void ftrace_enable_daemon(void); +extern void ftrace_release(void *start, unsigned long size); + +extern void ftrace_disable_daemon(void); +extern void ftrace_enable_daemon(void); #else # define skip_trace(ip) ({ 0; }) @@ -86,6 +88,7 @@ void ftrace_enable_daemon(void); # define ftrace_set_filter(buf, len, reset) do { } while (0) # define ftrace_disable_daemon() do { } while (0) # define ftrace_enable_daemon() do { } while (0) +static inline void ftrace_release(void *start, unsigned long size) { } #endif /* CONFIG_DYNAMIC_FTRACE */ /* totally disable ftrace - can not re-enable after this */ @@ -199,12 +202,10 @@ static inline void ftrace_dump(void) { } #ifdef CONFIG_FTRACE_MCOUNT_RECORD extern void ftrace_init(void); extern void ftrace_init_module(unsigned long *start, unsigned long *end); -extern void ftrace_release(void *start, unsigned long size); #else static inline void ftrace_init(void) { } static inline void ftrace_init_module(unsigned long *start, unsigned long *end) { } -static inline void ftrace_release(void *start, unsigned long size) { } #endif -- cgit v1.2.3 From 9e57fb35d711331a9b1410c5c56ebeb3733428a0 Mon Sep 17 00:00:00 2001 From: Pekka Paalanen Date: Tue, 16 Sep 2008 22:00:34 +0300 Subject: x86 mmiotrace: implement mmiotrace_printk() Offer mmiotrace users a function to inject markers from inside the kernel. This depends on the trace_vprintk() patch. Signed-off-by: Pekka Paalanen Acked-by: Steven Rostedt Signed-off-by: Ingo Molnar --- include/linux/mmiotrace.h | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmiotrace.h b/include/linux/mmiotrace.h index 61d19e1b7a0..60cc3bf5c53 100644 --- a/include/linux/mmiotrace.h +++ b/include/linux/mmiotrace.h @@ -34,11 +34,15 @@ extern void unregister_kmmio_probe(struct kmmio_probe *p); /* Called from page fault handler. */ extern int kmmio_handler(struct pt_regs *regs, unsigned long addr); -/* Called from ioremap.c */ #ifdef CONFIG_MMIOTRACE +/* Called from ioremap.c */ extern void mmiotrace_ioremap(resource_size_t offset, unsigned long size, void __iomem *addr); extern void mmiotrace_iounmap(volatile void __iomem *addr); + +/* For anyone to insert markers. Remember trailing newline. */ +extern int mmiotrace_printk(const char *fmt, ...) + __attribute__ ((format (printf, 1, 2))); #else static inline void mmiotrace_ioremap(resource_size_t offset, unsigned long size, void __iomem *addr) @@ -48,7 +52,15 @@ static inline void mmiotrace_ioremap(resource_size_t offset, static inline void mmiotrace_iounmap(volatile void __iomem *addr) { } -#endif /* CONFIG_MMIOTRACE_HOOKS */ + +static inline int mmiotrace_printk(const char *fmt, ...) + __attribute__ ((format (printf, 1, 0))); + +static inline int mmiotrace_printk(const char *fmt, ...) +{ + return 0; +} +#endif /* CONFIG_MMIOTRACE */ enum mm_io_opcode { MMIO_READ = 0x1, /* struct mmiotrace_rw */ @@ -81,5 +93,6 @@ extern void enable_mmiotrace(void); extern void disable_mmiotrace(void); extern void mmio_trace_rw(struct mmiotrace_rw *rw); extern void mmio_trace_mapping(struct mmiotrace_map *map); +extern int mmio_trace_printk(const char *fmt, va_list args); #endif /* MMIOTRACE_H */ -- cgit v1.2.3 From 4427414170a63331a9cc36b9598502c5cdfe453b Mon Sep 17 00:00:00 2001 From: Pekka Paalanen Date: Tue, 16 Sep 2008 22:03:56 +0300 Subject: mmiotrace: remove left-over marker cruft Signed-off-by: Pekka Paalanen Acked-by: Steven Rostedt Signed-off-by: Ingo Molnar --- include/linux/mmiotrace.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmiotrace.h b/include/linux/mmiotrace.h index 60cc3bf5c53..139d7c88d9c 100644 --- a/include/linux/mmiotrace.h +++ b/include/linux/mmiotrace.h @@ -67,8 +67,7 @@ enum mm_io_opcode { MMIO_WRITE = 0x2, /* struct mmiotrace_rw */ MMIO_PROBE = 0x3, /* struct mmiotrace_map */ MMIO_UNPROBE = 0x4, /* struct mmiotrace_map */ - MMIO_MARKER = 0x5, /* raw char data */ - MMIO_UNKNOWN_OP = 0x6, /* struct mmiotrace_rw */ + MMIO_UNKNOWN_OP = 0x5, /* struct mmiotrace_rw */ }; struct mmiotrace_rw { -- cgit v1.2.3 From e98d0eabef2748d88fa58760d104e8e68517406b Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Mon, 29 Sep 2008 11:05:13 -0400 Subject: markers: marker_synchronize_unregister() Create marker_synchronize_unregister() which must be called before the end of exit() to make sure every probe callers have exited the non preemptible section and thus are not executing the probe code anymore. Signed-off-by: Mathieu Desnoyers Signed-off-by: Ingo Molnar --- include/linux/marker.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/marker.h b/include/linux/marker.h index 1290653f924..889196c7fbb 100644 --- a/include/linux/marker.h +++ b/include/linux/marker.h @@ -160,4 +160,11 @@ extern int marker_probe_unregister_private_data(marker_probe_func *probe, extern void *marker_get_private_data(const char *name, marker_probe_func *probe, int num); +/* + * marker_synchronize_unregister must be called between the last marker probe + * unregistration and the end of module exit to make sure there is no caller + * executing a probe when it is freed. + */ +#define marker_synchronize_unregister() synchronize_sched() + #endif -- cgit v1.2.3 From 53c8c8fdfd2d2d515bdcb3d0f2a11d1f3f42ece1 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Fri, 3 Oct 2008 11:52:54 -0400 Subject: markers: turn marker_synchronize_unregister() into an inline Turn marker synchronize unregister into a static inline. There is no reason to keep it as a macro over a static inline. Signed-off-by: Mathieu Desnoyers Signed-off-by: Ingo Molnar --- include/linux/marker.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/marker.h b/include/linux/marker.h index 889196c7fbb..38e32e781ed 100644 --- a/include/linux/marker.h +++ b/include/linux/marker.h @@ -13,6 +13,7 @@ */ #include +#include struct module; struct marker; @@ -165,6 +166,9 @@ extern void *marker_get_private_data(const char *name, marker_probe_func *probe, * unregistration and the end of module exit to make sure there is no caller * executing a probe when it is freed. */ -#define marker_synchronize_unregister() synchronize_sched() +static inline void marker_synchronize_unregister(void) +{ + synchronize_sched(); +} #endif -- cgit v1.2.3 From d13744cd6e3fef373a3fe656ac349b4e7c49ff79 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fr=C3=A9d=C3=A9ric=20Weisbecker?= Date: Tue, 23 Sep 2008 11:32:08 +0100 Subject: tracing/ftrace: add the boot tracer Add the boot/initcall tracer. It's primary purpose is to be able to trace the initcalls. It is intended to be used with scripts/bootgraph.pl after some small improvements. Note that it is not active after its init. To avoid tracing (and so crashing) before the whole tracing engine init, you have to explicitly call start_boot_trace() after do_pre_smp_initcalls() to enable it. Signed-off-by: Frederic Weisbecker Signed-off-by: Ingo Molnar --- include/linux/ftrace.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 5de9903645d..91954eb6460 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -5,6 +5,8 @@ #include #include +#include +#include extern int ftrace_enabled; extern int @@ -209,4 +211,21 @@ ftrace_init_module(unsigned long *start, unsigned long *end) { } #endif +struct boot_trace { + pid_t caller; + initcall_t func; + int result; + unsigned long long duration; +}; + +#ifdef CONFIG_BOOT_TRACER +extern void trace_boot(struct boot_trace *it); +extern void start_boot_trace(void); +#else +static inline void trace_boot(struct boot_trace *it) { } +static inline void start_boot_trace(void) { } +#endif + + + #endif /* _LINUX_FTRACE_H */ -- cgit v1.2.3 From 7a8e76a3829f1067b70f715771ff88baf2fbf3c3 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 29 Sep 2008 23:02:38 -0400 Subject: tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: - array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: - array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- include/linux/ring_buffer.h | 130 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 130 insertions(+) create mode 100644 include/linux/ring_buffer.h (limited to 'include/linux') diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h new file mode 100644 index 00000000000..c52375b8330 --- /dev/null +++ b/include/linux/ring_buffer.h @@ -0,0 +1,130 @@ +#ifndef _LINUX_RING_BUFFER_H +#define _LINUX_RING_BUFFER_H + +#include +#include + +struct ring_buffer; +struct ring_buffer_iter; + +/* + * Don't reference this struct directly, use functions below. + */ +struct ring_buffer_event { + u32 type:2, len:3, time_delta:27; + u32 array[]; +}; + +/** + * enum ring_buffer_type - internal ring buffer types + * + * @RINGBUF_TYPE_PADDING: Left over page padding + * array is ignored + * size is variable depending on how much + * padding is needed + * + * @RINGBUF_TYPE_TIME_EXTEND: Extend the time delta + * array[0] = time delta (28 .. 59) + * size = 8 bytes + * + * @RINGBUF_TYPE_TIME_STAMP: Sync time stamp with external clock + * array[0] = tv_nsec + * array[1] = tv_sec + * size = 16 bytes + * + * @RINGBUF_TYPE_DATA: Data record + * If len is zero: + * array[0] holds the actual length + * array[1..(length+3)/4-1] holds data + * else + * length = len << 2 + * array[0..(length+3)/4] holds data + */ +enum ring_buffer_type { + RINGBUF_TYPE_PADDING, + RINGBUF_TYPE_TIME_EXTEND, + /* FIXME: RINGBUF_TYPE_TIME_STAMP not implemented */ + RINGBUF_TYPE_TIME_STAMP, + RINGBUF_TYPE_DATA, +}; + +unsigned ring_buffer_event_length(struct ring_buffer_event *event); +void *ring_buffer_event_data(struct ring_buffer_event *event); + +/** + * ring_buffer_event_time_delta - return the delta timestamp of the event + * @event: the event to get the delta timestamp of + * + * The delta timestamp is the 27 bit timestamp since the last event. + */ +static inline unsigned +ring_buffer_event_time_delta(struct ring_buffer_event *event) +{ + return event->time_delta; +} + +void ring_buffer_lock(struct ring_buffer *buffer, unsigned long *flags); +void ring_buffer_unlock(struct ring_buffer *buffer, unsigned long flags); + +/* + * size is in bytes for each per CPU buffer. + */ +struct ring_buffer * +ring_buffer_alloc(unsigned long size, unsigned flags); +void ring_buffer_free(struct ring_buffer *buffer); + +int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size); + +struct ring_buffer_event * +ring_buffer_lock_reserve(struct ring_buffer *buffer, + unsigned long length, + unsigned long *flags); +int ring_buffer_unlock_commit(struct ring_buffer *buffer, + struct ring_buffer_event *event, + unsigned long flags); +int ring_buffer_write(struct ring_buffer *buffer, + unsigned long length, void *data); + +struct ring_buffer_event * +ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts); +struct ring_buffer_event * +ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts); + +struct ring_buffer_iter * +ring_buffer_read_start(struct ring_buffer *buffer, int cpu); +void ring_buffer_read_finish(struct ring_buffer_iter *iter); + +struct ring_buffer_event * +ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts); +struct ring_buffer_event * +ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts); +void ring_buffer_iter_reset(struct ring_buffer_iter *iter); +int ring_buffer_iter_empty(struct ring_buffer_iter *iter); + +unsigned long ring_buffer_size(struct ring_buffer *buffer); + +void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu); +void ring_buffer_reset(struct ring_buffer *buffer); + +int ring_buffer_swap_cpu(struct ring_buffer *buffer_a, + struct ring_buffer *buffer_b, int cpu); + +int ring_buffer_empty(struct ring_buffer *buffer); +int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu); + +void ring_buffer_record_disable(struct ring_buffer *buffer); +void ring_buffer_record_enable(struct ring_buffer *buffer); +void ring_buffer_record_disable_cpu(struct ring_buffer *buffer, int cpu); +void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu); + +unsigned long ring_buffer_entries(struct ring_buffer *buffer); +unsigned long ring_buffer_overruns(struct ring_buffer *buffer); + +u64 ring_buffer_time_stamp(int cpu); +void ring_buffer_normalize_time_stamp(int cpu, u64 *ts); + +enum ring_buffer_flags { + RB_FL_OVERWRITE = 1 << 0, +}; + +#endif /* _LINUX_RING_BUFFER_H */ -- cgit v1.2.3 From d769041f865330034131525ee6a7f72eb4af2a24 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 1 Oct 2008 00:29:53 -0400 Subject: ring_buffer: implement new locking The old "lock always" scheme had issues with lockdep, and was not very efficient anyways. This patch does a new design to be partially lockless on writes. Writes will add new entries to the per cpu pages by simply disabling interrupts. When a write needs to go to another page than it will grab the lock. A new "read page" has been added so that the reader can pull out a page from the ring buffer to read without worrying about the writer writing over it. This allows us to not take the lock for all reads. The lock is now only taken when a read needs to go to a new page. This is far from lockless, and interrupts still need to be disabled, but it is a step towards a more lockless solution, and it also solves a lot of the issues that were noticed by the first conversion of ftrace to the ring buffers. Note: the ring_buffer_{un}lock API has been removed. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- include/linux/ring_buffer.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index c52375b8330..536b0ca46a0 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -63,9 +63,6 @@ ring_buffer_event_time_delta(struct ring_buffer_event *event) return event->time_delta; } -void ring_buffer_lock(struct ring_buffer *buffer, unsigned long *flags); -void ring_buffer_unlock(struct ring_buffer *buffer, unsigned long flags); - /* * size is in bytes for each per CPU buffer. */ -- cgit v1.2.3 From cb5ab74204a6e2579d1119bf1348eb806526b12b Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 2 Oct 2008 12:59:20 +0200 Subject: tracing/fastboot: change the printing of boot tracer according to bootgraph.pl Change the boot tracer printing to make it parsable for the scripts/bootgraph.pl script. We have now to output two lines for each initcall, according to the printk in do_one_initcall() in init/main.c We need now the call's time and the return's time. Signed-off-by: Frederic Weisbecker Signed-off-by: Ingo Molnar --- include/linux/ftrace.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 91954eb6460..4455490d91b 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -216,6 +216,8 @@ struct boot_trace { initcall_t func; int result; unsigned long long duration; + ktime_t calltime; + ktime_t rettime; }; #ifdef CONFIG_BOOT_TRACER -- cgit v1.2.3 From 5601020feb0c3010e9e3e0131e9697ac6a06777b Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 2 Oct 2008 13:26:05 +0200 Subject: tracing/fastboot: get the initcall name before it disappears After some initcall traces, some initcall names may be inconsistent. That's because these functions will disappear from the .init section and also their name from the symbols table. So we have to copy the name of the function in a buffer large enough during the trace appending. It is not costly for the ring_buffer because the number of initcall entries is commonly not really large. Signed-off-by: Frederic Weisbecker Signed-off-by: Ingo Molnar --- include/linux/ftrace.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 4455490d91b..e672e51c40a 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -7,6 +7,7 @@ #include #include #include +#include extern int ftrace_enabled; extern int @@ -213,7 +214,7 @@ ftrace_init_module(unsigned long *start, unsigned long *end) { } struct boot_trace { pid_t caller; - initcall_t func; + char func[KSYM_NAME_LEN]; int result; unsigned long long duration; ktime_t calltime; @@ -221,10 +222,10 @@ struct boot_trace { }; #ifdef CONFIG_BOOT_TRACER -extern void trace_boot(struct boot_trace *it); +extern void trace_boot(struct boot_trace *it, initcall_t fn); extern void start_boot_trace(void); #else -static inline void trace_boot(struct boot_trace *it) { } +static inline void trace_boot(struct boot_trace *it, initcall_t fn) { } static inline void start_boot_trace(void) { } #endif -- cgit v1.2.3 From 3e1932ad59726d794a865cc159c0593d54bf0cb6 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 2 Oct 2008 17:45:47 +0200 Subject: tracing/fastboot: build fix fix: In file included from kernel/sysctl.c:52: include/linux/ftrace.h:217: error: 'KSYM_NAME_LEN' undeclared here (not in a function) Signed-off-by: Ingo Molnar --- include/linux/ftrace.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index e672e51c40a..deded114dff 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -1,14 +1,14 @@ #ifndef _LINUX_FTRACE_H #define _LINUX_FTRACE_H -#ifdef CONFIG_FTRACE - #include #include #include #include #include +#ifdef CONFIG_FTRACE + extern int ftrace_enabled; extern int ftrace_enable_sysctl(struct ctl_table *table, int write, -- cgit v1.2.3 From eb7fa935274bb233686fdf7a53f40c5d9ee76ed6 Mon Sep 17 00:00:00 2001 From: Steven Noonan Date: Thu, 2 Oct 2008 12:00:07 -0700 Subject: ftrace: ktime.h not included in ftrace.h Including eliminates the following error: include/linux/ftrace.h:220: error: expected specifier-qualifier-list before 'ktime_t' Signed-off-by: Steven Noonan Signed-off-by: Ingo Molnar --- include/linux/ftrace.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index deded114dff..ed53265d1f6 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -3,6 +3,7 @@ #include #include +#include #include #include #include -- cgit v1.2.3 From 097d036a2f25eecc42435c57e010aaf4a2eed2d9 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 3 Oct 2008 15:39:21 +0200 Subject: tracing/fastboot: only trace non-module initcalls At this time, only built-in initcalls interest us. We can't really produce a relevant graph if we include the modules initcall too. I had good results after this patch (see svg in attachment). Signed-off-by: Frederic Weisbecker Signed-off-by: Ingo Molnar --- include/linux/ftrace.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index ed53265d1f6..5812dba4ee2 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -225,9 +225,11 @@ struct boot_trace { #ifdef CONFIG_BOOT_TRACER extern void trace_boot(struct boot_trace *it, initcall_t fn); extern void start_boot_trace(void); +extern void stop_boot_trace(void); #else static inline void trace_boot(struct boot_trace *it, initcall_t fn) { } static inline void start_boot_trace(void) { } +static inline void stop_boot_trace(void) { } #endif -- cgit v1.2.3 From ca538f6bbe583406f941f3041d40c41f9a13d1de Mon Sep 17 00:00:00 2001 From: Tim Bird Date: Thu, 9 Oct 2008 15:23:05 -0700 Subject: tracing/fastboot: add better resolution to initcall debug/tracing Change the time resolution for initcall_debug to microseconds, from milliseconds. This is handy to determine which initcalls you want to work on for faster booting. One one of my test machines, over 90% of the initcalls are less than a millisecond and (without this patch) these are all reported as 0 msecs. Working on the 900 us ones is more important than the 4 us ones. With 'quiet' on the kernel command line, this adds no significant overhead to kernel boot time. Signed-off-by: Tim Bird Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar --- include/linux/ftrace.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 5812dba4ee2..a3d46151be1 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -215,9 +215,9 @@ ftrace_init_module(unsigned long *start, unsigned long *end) { } struct boot_trace { pid_t caller; - char func[KSYM_NAME_LEN]; + char func[KSYM_NAME_LEN]; int result; - unsigned long long duration; + unsigned long long duration; /* usecs */ ktime_t calltime; ktime_t rettime; }; -- cgit v1.2.3 From bfadadfccc19e36f7d600c5ce7b3e5ba5197fbf0 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Fri, 10 Oct 2008 03:48:25 -0400 Subject: markers: fix synchronize marker unregister static inline Use a #define for synchronize marker unregister to fix include dependencies. Fixes the slab circular inclusion which triggers when slab.git is combined with tracing.git, where rcupdate includes slab, which includes markers which includes rcupdate. Signed-off-by: Mathieu Desnoyers Acked-by: Pekka Enberg Signed-off-by: Ingo Molnar --- include/linux/marker.h | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/marker.h b/include/linux/marker.h index 38e32e781ed..889196c7fbb 100644 --- a/include/linux/marker.h +++ b/include/linux/marker.h @@ -13,7 +13,6 @@ */ #include -#include struct module; struct marker; @@ -166,9 +165,6 @@ extern void *marker_get_private_data(const char *name, marker_probe_func *probe, * unregistration and the end of module exit to make sure there is no caller * executing a probe when it is freed. */ -static inline void marker_synchronize_unregister(void) -{ - synchronize_sched(); -} +#define marker_synchronize_unregister() synchronize_sched() #endif -- cgit v1.2.3 From f2461fc82a083dd60062e05e704c5fcc1c658ba1 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Mon, 6 Oct 2008 10:33:00 -0400 Subject: tracepoints: tracepoint_synchronize_unregister() Create tracepoint_synchronize_unregister() which must be called before the end of exit() to make sure every probe callers have exited the non preemptible section and thus are not executing the probe code anymore. Signed-off-by: Mathieu Desnoyers Signed-off-by: Ingo Molnar --- include/linux/tracepoint.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index e623a6fca5c..199f4c207c1 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -124,4 +124,11 @@ extern void tracepoint_iter_reset(struct tracepoint_iter *iter); extern int tracepoint_get_iter_range(struct tracepoint **tracepoint, struct tracepoint *begin, struct tracepoint *end); +/* + * tracepoint_synchronize_unregister must be called between the last tracepoint + * probe unregistration and the end of module exit to make sure there is no + * caller executing a probe when it is freed. + */ +#define tracepoint_synchronize_unregister() synchronize_sched() + #endif -- cgit v1.2.3 From 231375cc5cc3549bb413f94a164bdcbd5f9ce943 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Fri, 3 Oct 2008 15:01:33 -0400 Subject: tracepoints: synchronize unregister static inline Turn tracepoint synchronize unregister into a static inline. There is no reason to keep it as a macro over a static inline. Signed-off-by: Mathieu Desnoyers Signed-off-by: Ingo Molnar --- include/linux/tracepoint.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index 199f4c207c1..c5bb39c7a77 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -129,6 +129,9 @@ extern int tracepoint_get_iter_range(struct tracepoint **tracepoint, * probe unregistration and the end of module exit to make sure there is no * caller executing a probe when it is freed. */ -#define tracepoint_synchronize_unregister() synchronize_sched() +static inline void tracepoint_synchronize_unregister(void) +{ + synchronize_sched(); +} #endif -- cgit v1.2.3 From 29d434b39c807320fbe4bcdce0ab98a0b9fcb285 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 16 Oct 2008 16:08:57 +0200 Subject: fuse: add include protectors Add include protectors to include/linux/fuse.h and fs/fuse/fuse_i.h. Signed-off-by: Tejun Heo Signed-off-by: Miklos Szeredi --- include/linux/fuse.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fuse.h b/include/linux/fuse.h index 265635dc990..8bc1101e9b3 100644 --- a/include/linux/fuse.h +++ b/include/linux/fuse.h @@ -19,6 +19,9 @@ * - add file flags field to fuse_read_in and fuse_write_in */ +#ifndef _LINUX_FUSE_H +#define _LINUX_FUSE_H + #include #include @@ -409,3 +412,5 @@ struct fuse_dirent { #define FUSE_DIRENT_ALIGN(x) (((x) + sizeof(__u64) - 1) & ~(sizeof(__u64) - 1)) #define FUSE_DIRENT_SIZE(d) \ FUSE_DIRENT_ALIGN(FUSE_NAME_OFFSET + (d)->namelen) + +#endif /* _LINUX_FUSE_H */ -- cgit v1.2.3 From a7c1b990f71574e077b94ce4582e2cf11cb891fe Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 16 Oct 2008 16:08:57 +0200 Subject: fuse: implement nonseekable open Let the client request nonseekable open using FOPEN_NONSEEKABLE and call nonseekable_open() on the file if requested. Signed-off-by: Tejun Heo Signed-off-by: Miklos Szeredi --- include/linux/fuse.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fuse.h b/include/linux/fuse.h index 8bc1101e9b3..350fe9767bb 100644 --- a/include/linux/fuse.h +++ b/include/linux/fuse.h @@ -17,6 +17,9 @@ * - add lock_owner field to fuse_setattr_in, fuse_read_in and fuse_write_in * - add blksize field to fuse_attr * - add file flags field to fuse_read_in and fuse_write_in + * + * 7.10 + * - add nonseekable open flag */ #ifndef _LINUX_FUSE_H @@ -29,7 +32,7 @@ #define FUSE_KERNEL_VERSION 7 /** Minor version number of this interface */ -#define FUSE_KERNEL_MINOR_VERSION 9 +#define FUSE_KERNEL_MINOR_VERSION 10 /** The node ID of the root inode */ #define FUSE_ROOT_ID 1 @@ -101,9 +104,11 @@ struct fuse_file_lock { * * FOPEN_DIRECT_IO: bypass page cache for this open file * FOPEN_KEEP_CACHE: don't invalidate the data cache on open + * FOPEN_NONSEEKABLE: the file is not seekable */ #define FOPEN_DIRECT_IO (1 << 0) #define FOPEN_KEEP_CACHE (1 << 1) +#define FOPEN_NONSEEKABLE (1 << 2) /** * INIT request/reply flags -- cgit v1.2.3 From 3ddfda11861d305b02ed810b522dcf48b74ca808 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 19 Aug 2008 20:49:43 -0700 Subject: generic: add dyn_array support Allow crazy big arrays via bootmem at init stage. Architectures use CONFIG_HAVE_DYN_ARRAY to enable it. usage: | static struct irq_desc irq_desc_init __initdata = { | .status = IRQ_DISABLED, | .chip = &no_irq_chip, | .handle_irq = handle_bad_irq, | .depth = 1, | .lock = __SPIN_LOCK_UNLOCKED(irq_desc->lock), | #ifdef CONFIG_SMP | .affinity = CPU_MASK_ALL | #endif | }; | | static void __init init_work(void *data) | { | struct dyn_array *da = data; | struct irq_desc *desc; | int i; | | desc = *da->name; | | for (i = 0; i < *da->nr; i++) | memcpy(&desc[i], &irq_desc_init, sizeof(struct irq_desc)); | } | | struct irq_desc *irq_desc; | DEFINE_DYN_ARRAY(irq_desc, sizeof(struct irq_desc), nr_irqs, PAGE_SIZE, init_work); after pre_alloc_dyn_array() after setup_arch(), the array is ready to be used. Via this facility we can replace irq_desc[NR_IRQS] array with dyn_array irq_desc[nr_irqs]. v2: remove _nopanic in pre_alloc_dyn_array() Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- include/linux/init.h | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'include/linux') diff --git a/include/linux/init.h b/include/linux/init.h index 93538b696e3..cf9fa7f174a 100644 --- a/include/linux/init.h +++ b/include/linux/init.h @@ -246,6 +246,29 @@ struct obs_kernel_param { /* Relies on boot_command_line being set */ void __init parse_early_param(void); + +struct dyn_array { + void **name; + unsigned long size; + unsigned int *nr; + unsigned long align; + void (*init_work)(void *); +}; +extern struct dyn_array *__dyn_array_start[], *__dyn_array_end[]; + +#define DEFINE_DYN_ARRAY(nameX, sizeX, nrX, alignX, init_workX) \ + static struct dyn_array __dyn_array_##nameX __initdata = \ + { .name = (void **)&nameX,\ + .size = sizeX,\ + .nr = &nrX,\ + .align = alignX,\ + .init_work = init_workX,\ + }; \ + static struct dyn_array *__dyn_array_ptr_##nameX __used \ + __attribute__((__section__(".dyn_array.init"))) = \ + &__dyn_array_##nameX + +extern void pre_alloc_dyn_array(void); #endif /* __ASSEMBLY__ */ /** -- cgit v1.2.3 From 1f3fcd4b1adc972d5c6a34cfed98931c46575b49 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 19 Aug 2008 20:49:44 -0700 Subject: add per_cpu_dyn_array support allow dyn-array in per_cpu area, allocated dynamically. usage: | /* in .h */ | struct kernel_stat { | struct cpu_usage_stat cpustat; | unsigned int *irqs; | }; | | /* in .c */ | DEFINE_PER_CPU(struct kernel_stat, kstat); | | DEFINE_PER_CPU_DYN_ARRAY_ADDR(per_cpu__kstat_irqs, per_cpu__kstat.irqs, sizeof(unsigned int), nr_irqs, sizeof(unsigned long), NULL); after setup_percpu()/per_cpu_alloc_dyn_array(), the dyn_array in per_cpu area is ready to use. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- include/linux/init.h | 27 ++++++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/init.h b/include/linux/init.h index cf9fa7f174a..332806826b8 100644 --- a/include/linux/init.h +++ b/include/linux/init.h @@ -255,12 +255,13 @@ struct dyn_array { void (*init_work)(void *); }; extern struct dyn_array *__dyn_array_start[], *__dyn_array_end[]; +extern struct dyn_array *__per_cpu_dyn_array_start[], *__per_cpu_dyn_array_end[]; -#define DEFINE_DYN_ARRAY(nameX, sizeX, nrX, alignX, init_workX) \ +#define DEFINE_DYN_ARRAY_ADDR(nameX, addrX, sizeX, nrX, alignX, init_workX) \ static struct dyn_array __dyn_array_##nameX __initdata = \ - { .name = (void **)&nameX,\ + { .name = (void **)&(nameX),\ .size = sizeX,\ - .nr = &nrX,\ + .nr = &(nrX),\ .align = alignX,\ .init_work = init_workX,\ }; \ @@ -268,7 +269,27 @@ extern struct dyn_array *__dyn_array_start[], *__dyn_array_end[]; __attribute__((__section__(".dyn_array.init"))) = \ &__dyn_array_##nameX +#define DEFINE_DYN_ARRAY(nameX, sizeX, nrX, alignX, init_workX) \ + DEFINE_DYN_ARRAY_ADDR(nameX, nameX, sizeX, nrX, alignX, init_workX) + +#define DEFINE_PER_CPU_DYN_ARRAY_ADDR(nameX, addrX, sizeX, nrX, alignX, init_workX) \ + static struct dyn_array __per_cpu_dyn_array_##nameX __initdata = \ + { .name = (void **)&(addrX),\ + .size = sizeX,\ + .nr = &(nrX),\ + .align = alignX,\ + .init_work = init_workX,\ + }; \ + static struct dyn_array *__per_cpu_dyn_array_ptr_##nameX __used \ + __attribute__((__section__(".per_cpu_dyn_array.init"))) = \ + &__per_cpu_dyn_array_##nameX + +#define DEFINE_PER_CPU_DYN_ARRAY(nameX, sizeX, nrX, alignX, init_workX) \ + DEFINE_PER_CPU_DYN_ARRAY_ADDR(nameX, nameX, nrX, alignX, init_workX) + extern void pre_alloc_dyn_array(void); +extern unsigned long per_cpu_dyn_array_size(void); +extern void per_cpu_alloc_dyn_array(int cpu, char *ptr); #endif /* __ASSEMBLY__ */ /** -- cgit v1.2.3 From 1f8ff037a871690c762d267d8a052529d3102fc9 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 19 Aug 2008 20:49:45 -0700 Subject: x86: alloc dyn_array all together so could spare some memory with small alignment in bootmem also tighten the alignment checking, and make print out less debug info. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- include/linux/init.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/init.h b/include/linux/init.h index 332806826b8..59fbb4aaba6 100644 --- a/include/linux/init.h +++ b/include/linux/init.h @@ -288,7 +288,7 @@ extern struct dyn_array *__per_cpu_dyn_array_start[], *__per_cpu_dyn_array_end[] DEFINE_PER_CPU_DYN_ARRAY_ADDR(nameX, nameX, nrX, alignX, init_workX) extern void pre_alloc_dyn_array(void); -extern unsigned long per_cpu_dyn_array_size(void); +extern unsigned long per_cpu_dyn_array_size(unsigned long *align); extern void per_cpu_alloc_dyn_array(int cpu, char *ptr); #endif /* __ASSEMBLY__ */ -- cgit v1.2.3 From 85c0f90978bf50596dbd23854648020f1f9b5bfd Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 19 Aug 2008 20:49:47 -0700 Subject: irq: introduce nr_irqs at this point nr_irqs is equal NR_IRQS convert a few easy users from NR_IRQS to dynamic nr_irqs. v2: according to Eric, we need to take care of arch without generic_hardirqs Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- include/linux/interrupt.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 58ff4e74b2f..511803853a5 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -15,6 +15,8 @@ #include #include +extern int nr_irqs; + /* * These correspond to the IORESOURCE_IRQ_* defines in * linux/ioport.h to select the interrupt line behaviour. When -- cgit v1.2.3 From d60458b224d6b997a582a05cb8c4b9bed9e17a1d Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 19 Aug 2008 20:50:00 -0700 Subject: irq: make irq_desc to use dyn_array Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- include/linux/irq.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/irq.h b/include/linux/irq.h index 1d73d1abb83..5f4b013624d 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -179,7 +179,11 @@ struct irq_desc { const char *name; } ____cacheline_internodealigned_in_smp; +#ifdef CONFIG_HAVE_DYN_ARRAY +extern struct irq_desc *irq_desc; +#else extern struct irq_desc irq_desc[NR_IRQS]; +#endif /* * Migration helpers for obsolete names, they will go away: -- cgit v1.2.3 From d17a55ded3393ad3878010bb3a8243a15a8d8df5 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 19 Aug 2008 20:50:01 -0700 Subject: irq: make irqs in kernel stat use per_cpu_dyn_array Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- include/linux/kernel_stat.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h index cf9f40a91c9..fe1f7fe534b 100644 --- a/include/linux/kernel_stat.h +++ b/include/linux/kernel_stat.h @@ -28,7 +28,11 @@ struct cpu_usage_stat { struct kernel_stat { struct cpu_usage_stat cpustat; +#ifdef CONFIG_HAVE_DYN_ARRAY + unsigned int *irqs; +#else unsigned int irqs[NR_IRQS]; +#endif }; DECLARE_PER_CPU(struct kernel_stat, kstat); -- cgit v1.2.3 From 08678b0841267c1d00d771fe01548d86043d065e Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 19 Aug 2008 20:50:05 -0700 Subject: generic: sparse irqs: use irq_desc() together with dyn_array, instead of irq_desc[] add CONFIG_HAVE_SPARSE_IRQ to for use condensed array. Get rid of irq_desc[] array assumptions. Preallocate 32 irq_desc, and irq_desc() will try to get more. ( No change in functionality is expected anywhere, except the odd build failure where we missed a code site or where a crossing commit itroduces new irq_desc[] usage. ) v2: according to Eric, change get_irq_desc() to irq_desc() Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- include/linux/irq.h | 32 +++++++++++++++++++++----------- 1 file changed, 21 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irq.h b/include/linux/irq.h index 5f4b013624d..80b8200f2ad 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -152,6 +152,10 @@ struct irq_chip { * @name: flow handler name for /proc/interrupts output */ struct irq_desc { + unsigned int irq; +#ifdef CONFIG_HAVE_SPARSE_IRQ + struct irq_desc *next; +#endif irq_flow_handler_t handle_irq; struct irq_chip *chip; struct msi_desc *msi_desc; @@ -179,9 +183,9 @@ struct irq_desc { const char *name; } ____cacheline_internodealigned_in_smp; -#ifdef CONFIG_HAVE_DYN_ARRAY -extern struct irq_desc *irq_desc; -#else +extern struct irq_desc *irq_to_desc(unsigned int irq); +#ifndef CONFIG_HAVE_DYN_ARRAY +/* could be removed if we get rid of all irq_desc reference */ extern struct irq_desc irq_desc[NR_IRQS]; #endif @@ -249,7 +253,10 @@ extern int no_irq_affinity; static inline int irq_balancing_disabled(unsigned int irq) { - return irq_desc[irq].status & IRQ_NO_BALANCING_MASK; + struct irq_desc *desc; + + desc = irq_to_desc(irq); + return desc->status & IRQ_NO_BALANCING_MASK; } /* Handle irq action chains: */ @@ -281,7 +288,7 @@ extern unsigned int __do_IRQ(unsigned int irq); */ static inline void generic_handle_irq(unsigned int irq) { - struct irq_desc *desc = irq_desc + irq; + struct irq_desc *desc = irq_to_desc(irq); #ifdef CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ desc->handle_irq(irq, desc); @@ -325,7 +332,10 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, static inline void __set_irq_handler_unlocked(int irq, irq_flow_handler_t handler) { - irq_desc[irq].handle_irq = handler; + struct irq_desc *desc; + + desc = irq_to_desc(irq); + desc->handle_irq = handler; } /* @@ -359,7 +369,7 @@ extern void destroy_irq(unsigned int irq); /* Test to see if a driver has successfully requested an irq */ static inline int irq_has_action(unsigned int irq) { - struct irq_desc *desc = irq_desc + irq; + struct irq_desc *desc = irq_to_desc(irq); return desc->action != NULL; } @@ -374,10 +384,10 @@ extern int set_irq_chip_data(unsigned int irq, void *data); extern int set_irq_type(unsigned int irq, unsigned int type); extern int set_irq_msi(unsigned int irq, struct msi_desc *entry); -#define get_irq_chip(irq) (irq_desc[irq].chip) -#define get_irq_chip_data(irq) (irq_desc[irq].chip_data) -#define get_irq_data(irq) (irq_desc[irq].handler_data) -#define get_irq_msi(irq) (irq_desc[irq].msi_desc) +#define get_irq_chip(irq) (irq_to_desc(irq)->chip) +#define get_irq_chip_data(irq) (irq_to_desc(irq)->chip_data) +#define get_irq_data(irq) (irq_to_desc(irq)->handler_data) +#define get_irq_msi(irq) (irq_to_desc(irq)->msi_desc) #endif /* CONFIG_GENERIC_HARDIRQS */ -- cgit v1.2.3 From 3060d6fe28570640c2d7d66d38b9eaa848c3b9e3 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 19 Aug 2008 20:50:08 -0700 Subject: x86: put timer_rand_state pointer into irq_desc irq_timer_state[] is a NR_IRQS sized array that is a side-by array to the real irq_desc[] array. Integrate that field into the (now dynamic) irq_desc dynamic array and save some RAM. v2: keep the old way to support arch not support irq_desc Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- include/linux/irq.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/irq.h b/include/linux/irq.h index 80b8200f2ad..60c856aaac0 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -127,6 +127,7 @@ struct irq_chip { const char *typename; }; +struct timer_rand_state; /** * struct irq_desc - interrupt descriptor * @@ -155,6 +156,7 @@ struct irq_desc { unsigned int irq; #ifdef CONFIG_HAVE_SPARSE_IRQ struct irq_desc *next; + struct timer_rand_state *timer_rand_state; #endif irq_flow_handler_t handle_irq; struct irq_chip *chip; -- cgit v1.2.3 From 7f95ec9e4c12fd067febfd57532da1166d75d858 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 19 Aug 2008 20:50:09 -0700 Subject: x86: move kstat_irqs from kstat to irq_desc based on Eric's patch ... together mold it with dyn_array for irq_desc, will allcate kstat_irqs for nr_irq_desc alltogether if needed. -- at that point nr_cpus is known already. v2: make sure system without generic_hardirqs works they don't have irq_desc v3: fix merging v4: [mingo@elte.hu] fix typo [ mingo@elte.hu ] irq: build fix fix: arch/x86/xen/spinlock.c: In function 'xen_spin_lock_slow': arch/x86/xen/spinlock.c:90: error: 'struct kernel_stat' has no member named 'irqs' Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- include/linux/irq.h | 7 +++++++ include/linux/kernel_stat.h | 22 +++++++++++++++------- 2 files changed, 22 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irq.h b/include/linux/irq.h index 60c856aaac0..cbf471aee1c 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -157,6 +157,11 @@ struct irq_desc { #ifdef CONFIG_HAVE_SPARSE_IRQ struct irq_desc *next; struct timer_rand_state *timer_rand_state; +#endif +#ifdef CONFIG_HAVE_DYN_ARRAY + unsigned int *kstat_irqs; +#else + unsigned int kstat_irqs[NR_CPUS]; #endif irq_flow_handler_t handle_irq; struct irq_chip *chip; @@ -190,6 +195,8 @@ extern struct irq_desc *irq_to_desc(unsigned int irq); /* could be removed if we get rid of all irq_desc reference */ extern struct irq_desc irq_desc[NR_IRQS]; #endif +#define kstat_irqs_this_cpu(DESC) \ + ((DESC)->kstat_irqs[smp_processor_id()]) /* * Migration helpers for obsolete names, they will go away: diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h index fe1f7fe534b..f10616712de 100644 --- a/include/linux/kernel_stat.h +++ b/include/linux/kernel_stat.h @@ -28,10 +28,8 @@ struct cpu_usage_stat { struct kernel_stat { struct cpu_usage_stat cpustat; -#ifdef CONFIG_HAVE_DYN_ARRAY - unsigned int *irqs; -#else - unsigned int irqs[NR_IRQS]; +#ifndef CONFIG_GENERIC_HARDIRQS + unsigned int irqs[NR_IRQS]; #endif }; @@ -43,15 +41,25 @@ DECLARE_PER_CPU(struct kernel_stat, kstat); extern unsigned long long nr_context_switches(void); +#ifndef CONFIG_GENERIC_HARDIRQS +static inline unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) +{ + return kstat_cpu(cpu).irqs[irq]; +} +#else +extern unsigned int kstat_irqs_cpu(unsigned int irq, int cpu); +#endif + /* * Number of interrupts per specific IRQ source, since bootup */ -static inline int kstat_irqs(int irq) +static inline unsigned int kstat_irqs(unsigned int irq) { - int cpu, sum = 0; + unsigned int sum = 0; + int cpu; for_each_possible_cpu(cpu) - sum += kstat_cpu(cpu).irqs[irq]; + sum += kstat_irqs_cpu(irq, cpu); return sum; } -- cgit v1.2.3 From 9059d8fa4a3a9153da53da890039f7f956cc9d19 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 19 Aug 2008 20:50:10 -0700 Subject: irq: add irq_desc_without_new add an irq_desc accessor that will not allocate any sparse entry but returns failure if there's no entry present. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- include/linux/irq.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/linux') diff --git a/include/linux/irq.h b/include/linux/irq.h index cbf471aee1c..c9ffef7c3b4 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -191,10 +191,23 @@ struct irq_desc { } ____cacheline_internodealigned_in_smp; extern struct irq_desc *irq_to_desc(unsigned int irq); +extern struct irq_desc *__irq_to_desc(unsigned int irq); + +#ifndef CONFIG_HAVE_SPARSE_IRQ + #ifndef CONFIG_HAVE_DYN_ARRAY /* could be removed if we get rid of all irq_desc reference */ extern struct irq_desc irq_desc[NR_IRQS]; +#else +extern struct irq_desc *irq_desc; #endif + +#else + +extern struct irq_desc *sparse_irqs; + +#endif + #define kstat_irqs_this_cpu(DESC) \ ((DESC)->kstat_irqs[smp_processor_id()]) -- cgit v1.2.3 From 2c6927a38f65b53b62f86158fba29a068c4e8b6a Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 19 Aug 2008 20:50:11 -0700 Subject: irq: replace loop with nr_irqs with for_each_irq_desc There are a handful of loops that go from 0 to nr_irqs and use get_irq_desc() on them. These would allocate all the irq_desc entries, regardless of the need for them. Use the smarter for_each_irq_desc() iterator that will only iterate over the present ones. v2: make sure arch without GENERIC_HARDIRQS work too Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- include/linux/irq.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/irq.h b/include/linux/irq.h index c9ffef7c3b4..9de16ca8b8e 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -202,9 +202,16 @@ extern struct irq_desc irq_desc[NR_IRQS]; extern struct irq_desc *irq_desc; #endif +#ifdef CONFIG_GENERIC_HARDIRQS +#define for_each_irq_desc(irq, desc) \ + for (irq = 0, desc = irq_desc; irq < nr_irqs; irq++, desc = &irq_desc[irq]) +#endif + #else extern struct irq_desc *sparse_irqs; +#define for_each_irq_desc(irqX, desc) \ + for (desc = sparse_irqs, irqX = desc->irq; desc && irqX != -1U; desc = desc->next, irqX = desc ? desc->irq : -1U) #endif -- cgit v1.2.3 From c7fb03a475bd80c642c1345d85c7c550f63514b8 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 19 Aug 2008 20:50:12 -0700 Subject: irq, fs/proc: replace loop with nr_irqs for proc/stat Replace another nr_irqs loop to avoid the allocation of all sparse irq entries - use for_each_irq_desc instead. v2: make sure arch without GENERIC_HARDIRQS works too Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- include/linux/interrupt.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 511803853a5..d4039a0b23f 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -17,6 +17,11 @@ extern int nr_irqs; +#ifndef CONFIG_GENERIC_HARDIRQS +#define for_each_irq_desc(irq, desc) \ + for (irq = 0; irq < nr_irqs; irq++) +#endif + /* * These correspond to the IORESOURCE_IRQ_* defines in * linux/ioport.h to select the interrupt line behaviour. When -- cgit v1.2.3 From 46926b67fc663d357a1a8174328998a9e49da0b8 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 19 Aug 2008 20:50:15 -0700 Subject: generic: add irq_desc in function in parameter So we could remove some duplicated calling to irq_desc v2: make sure irq_desc in init/main.c is not used without generic_hardirqs Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- include/linux/irq.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irq.h b/include/linux/irq.h index 9de16ca8b8e..7b59e193a11 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -315,10 +315,8 @@ extern unsigned int __do_IRQ(unsigned int irq); * irqchip-style controller then we call the ->handle_irq() handler, * and it calls __do_IRQ() if it's attached to an irqtype-style controller. */ -static inline void generic_handle_irq(unsigned int irq) +static inline void generic_handle_irq_desc(unsigned int irq, struct irq_desc *desc) { - struct irq_desc *desc = irq_to_desc(irq); - #ifdef CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ desc->handle_irq(irq, desc); #else @@ -329,6 +327,11 @@ static inline void generic_handle_irq(unsigned int irq) #endif } +static inline void generic_handle_irq(unsigned int irq) +{ + generic_handle_irq_desc(irq, irq_to_desc(irq)); +} + /* Handling of unhandled and spurious interrupts: */ extern void note_interrupt(unsigned int irq, struct irq_desc *desc, int action_ret); -- cgit v1.2.3 From cb5bc83225a86ca53bbb889ed8439e4fd6cf44ac Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 19 Aug 2008 20:50:17 -0700 Subject: x86_64: rename irq_desc/irq_desc_alloc change names: irq_desc() ==> irq_desc_alloc __irq_desc() ==> irq_desc Also split a few of the uses in lowlevel x86 code. v2: need to check if desc is null in smp_irq_move_cleanup Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- include/linux/irq.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/irq.h b/include/linux/irq.h index 7b59e193a11..5fe1b01c11f 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -191,7 +191,7 @@ struct irq_desc { } ____cacheline_internodealigned_in_smp; extern struct irq_desc *irq_to_desc(unsigned int irq); -extern struct irq_desc *__irq_to_desc(unsigned int irq); +extern struct irq_desc *irq_to_desc_alloc(unsigned int irq); #ifndef CONFIG_HAVE_SPARSE_IRQ -- cgit v1.2.3 From 67fb283e148e9bd761f73691d3173b6eab9ba8db Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 19 Aug 2008 20:50:18 -0700 Subject: irq: separate sparse_irqs from sparse_irqs_free so later don't need compare with -1U Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- include/linux/irq.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/irq.h b/include/linux/irq.h index 5fe1b01c11f..d5749852ee6 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -211,7 +211,7 @@ extern struct irq_desc *irq_desc; extern struct irq_desc *sparse_irqs; #define for_each_irq_desc(irqX, desc) \ - for (desc = sparse_irqs, irqX = desc->irq; desc && irqX != -1U; desc = desc->next, irqX = desc ? desc->irq : -1U) + for (desc = sparse_irqs, irqX = desc->irq; desc; desc = desc->next, irqX = desc ? desc->irq : -1U) #endif -- cgit v1.2.3 From e420dfb40c453a9760b86c7f338052bdb4dfa755 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 19 Aug 2008 20:50:21 -0700 Subject: x86: put irq_2_iommu pointer into irq_desc when CONFIG_HAVE_SPARSE_IRQ preallocate some irq_2_iommu entries, and use get_one_free_irq_2_iomm to get new one and link to irq_desc if needed. else will use dyn_array or static array. v2: <= nr_irqs fix Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- include/linux/irq.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/irq.h b/include/linux/irq.h index d5749852ee6..788d5a35a58 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -128,6 +128,7 @@ struct irq_chip { }; struct timer_rand_state; +struct irq_2_iommu; /** * struct irq_desc - interrupt descriptor * @@ -162,6 +163,9 @@ struct irq_desc { unsigned int *kstat_irqs; #else unsigned int kstat_irqs[NR_CPUS]; +#endif +#if defined(CONFIG_INTR_REMAP) && defined(CONFIG_HAVE_SPARSE_IRQ) + struct irq_2_iommu *irq_2_iommu; #endif irq_flow_handler_t handle_irq; struct irq_chip *chip; -- cgit v1.2.3 From 6d50bc26836e16a9589e0b128d527c29e30d722a Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 19 Aug 2008 20:50:22 -0700 Subject: x86: use 28 bits irq NR for pci msi/msix and ht also print out irq no in /proc/interrups and /proc/stat in hex, so could tell bus/dev/func. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- include/linux/irq.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/irq.h b/include/linux/irq.h index 788d5a35a58..704136138dc 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -399,6 +399,7 @@ extern void set_irq_noprobe(unsigned int irq); extern void set_irq_probe(unsigned int irq); /* Handle dynamic irq creation and destruction */ +extern unsigned int create_irq_nr(unsigned int irq_want); extern int create_irq(void); extern void destroy_irq(unsigned int irq); -- cgit v1.2.3 From 8b8e8c1bf7275eca859fe551dfa484134eaf013b Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 19 Aug 2008 20:50:23 -0700 Subject: x86: remove irqbalance in kernel for 32 bit This has been deprecated for years, the user space irqbalanced utility works better with numa, has configurable policies, etc... Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- include/linux/irq.h | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irq.h b/include/linux/irq.h index 704136138dc..2445d2b3d5d 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -185,7 +185,7 @@ struct irq_desc { cpumask_t affinity; unsigned int cpu; #endif -#if defined(CONFIG_GENERIC_PENDING_IRQ) || defined(CONFIG_IRQBALANCE) +#ifdef CONFIG_GENERIC_PENDING_IRQ cpumask_t pending_mask; #endif #ifdef CONFIG_PROC_FS @@ -241,13 +241,13 @@ extern int setup_irq(unsigned int irq, struct irqaction *new); #ifdef CONFIG_SMP -#if defined(CONFIG_GENERIC_PENDING_IRQ) || defined(CONFIG_IRQBALANCE) +#ifdef CONFIG_GENERIC_PENDING_IRQ void set_pending_irq(unsigned int irq, cpumask_t mask); void move_native_irq(int irq); void move_masked_irq(int irq); -#else /* CONFIG_GENERIC_PENDING_IRQ || CONFIG_IRQBALANCE */ +#else /* CONFIG_GENERIC_PENDING_IRQ */ static inline void move_irq(int irq) { @@ -274,14 +274,6 @@ static inline void set_pending_irq(unsigned int irq, cpumask_t mask) #endif /* CONFIG_SMP */ -#ifdef CONFIG_IRQBALANCE -extern void set_balance_irq_affinity(unsigned int irq, cpumask_t mask); -#else -static inline void set_balance_irq_affinity(unsigned int irq, cpumask_t mask) -{ -} -#endif - extern int no_irq_affinity; static inline int irq_balancing_disabled(unsigned int irq) -- cgit v1.2.3 From 42379b1122bab7f9aefdbd4b7004a6fa89dfbae5 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 19 Aug 2008 20:50:45 -0700 Subject: pci: change msi-x vector to 32bit we are using 28bit pci (bus/dev/fn + 12 bits) as irq number, so the cache for irq number should be 32 bit too. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- include/linux/pci.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index 98dc6243a70..1f8db240ca4 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -723,7 +723,7 @@ enum pci_dma_burst_strategy { }; struct msix_entry { - u16 vector; /* kernel uses to write allocated vector */ + u32 vector; /* kernel uses to write allocated vector */ u16 entry; /* driver uses to specify entry, OS writes */ }; -- cgit v1.2.3 From 8c464a4b23ca283b414022ebc77787f3c7040fa7 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 25 Aug 2008 12:41:19 -0700 Subject: sparseirq: move kstat_irqs from kstat to irq_desc - fix fix non-sparseirq architectures. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- include/linux/irq.h | 4 ++-- include/linux/kernel_stat.h | 10 ++++++++-- 2 files changed, 10 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irq.h b/include/linux/irq.h index 2445d2b3d5d..93fe9a943e7 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -161,8 +161,6 @@ struct irq_desc { #endif #ifdef CONFIG_HAVE_DYN_ARRAY unsigned int *kstat_irqs; -#else - unsigned int kstat_irqs[NR_CPUS]; #endif #if defined(CONFIG_INTR_REMAP) && defined(CONFIG_HAVE_SPARSE_IRQ) struct irq_2_iommu *irq_2_iommu; @@ -219,8 +217,10 @@ extern struct irq_desc *sparse_irqs; #endif +#ifdef CONFIG_HAVE_DYN_ARRAY #define kstat_irqs_this_cpu(DESC) \ ((DESC)->kstat_irqs[smp_processor_id()]) +#endif /* * Migration helpers for obsolete names, they will go away: diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h index f10616712de..21249d8c129 100644 --- a/include/linux/kernel_stat.h +++ b/include/linux/kernel_stat.h @@ -28,7 +28,7 @@ struct cpu_usage_stat { struct kernel_stat { struct cpu_usage_stat cpustat; -#ifndef CONFIG_GENERIC_HARDIRQS +#ifndef CONFIG_HAVE_DYN_ARRAY unsigned int irqs[NR_IRQS]; #endif }; @@ -41,7 +41,13 @@ DECLARE_PER_CPU(struct kernel_stat, kstat); extern unsigned long long nr_context_switches(void); -#ifndef CONFIG_GENERIC_HARDIRQS +#ifndef CONFIG_HAVE_DYN_ARRAY +#define kstat_irqs_this_cpu(irq) \ + (kstat_this_cpu.irqs[irq]) +#endif + + +#ifndef CONFIG_HAVE_DYN_ARRAY static inline unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) { return kstat_cpu(cpu).irqs[irq]; -- cgit v1.2.3 From f6dd5c3106fb283e37d915eeb33019ef40510f85 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 3 Sep 2008 16:58:32 -0700 Subject: dmar: fix using early fixmap mapping for DMAR table parsing Very early detection of the DMAR tables will setup fixmap mapping. For parsing these tables later (while enabling dma and/or interrupt remapping), early fixmap mapping shouldn't be used. Fix it by calling table detection routines again, which will call generic apci_get_table() for setting up the correct mapping. Signed-off-by: Yinghai Lu Signed-off-by: Suresh Siddha Signed-off-by: Ingo Molnar --- include/linux/dmar.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/dmar.h b/include/linux/dmar.h index c360c558e59..f1984fc3e06 100644 --- a/include/linux/dmar.h +++ b/include/linux/dmar.h @@ -45,7 +45,6 @@ extern struct list_head dmar_drhd_units; list_for_each_entry(drhd, &dmar_drhd_units, list) extern int dmar_table_init(void); -extern int early_dmar_detect(void); extern int dmar_dev_scope_init(void); /* Intel IOMMU detection */ -- cgit v1.2.3 From a50f70b17541c0060967c6df61133e968bad3652 Mon Sep 17 00:00:00 2001 From: Russ Anderson Date: Fri, 3 Oct 2008 11:58:54 -0500 Subject: x86: Add UV EFI table entry v4 Look for a UV entry in the EFI tables. Signed-off-by: Russ Anderson Signed-off-by: Paul Jackson Acked-by: Huang Ying Signed-off-by: H. Peter Anvin Signed-off-by: Ingo Molnar --- include/linux/efi.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/efi.h b/include/linux/efi.h index 807373d467f..bb66feb164b 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -208,6 +208,9 @@ typedef efi_status_t efi_set_virtual_address_map_t (unsigned long memory_map_siz #define EFI_GLOBAL_VARIABLE_GUID \ EFI_GUID( 0x8be4df61, 0x93ca, 0x11d2, 0xaa, 0x0d, 0x00, 0xe0, 0x98, 0x03, 0x2b, 0x8c ) +#define UV_SYSTEM_TABLE_GUID \ + EFI_GUID( 0x3b13a7d4, 0x633e, 0x11dd, 0x93, 0xec, 0xda, 0x25, 0x56, 0xd8, 0x95, 0x93 ) + typedef struct { efi_guid_t guid; unsigned long table; @@ -255,6 +258,7 @@ extern struct efi { unsigned long boot_info; /* boot info table */ unsigned long hcdp; /* HCDP table */ unsigned long uga; /* UGA table */ + unsigned long uv_systab; /* UV system table */ efi_get_time_t *get_time; efi_set_time_t *set_time; efi_get_wakeup_time_t *get_wakeup_time; -- cgit v1.2.3 From 7ef0c30dbf96a8d9a234e90c248eb19df3c031be Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 15 Oct 2008 13:07:35 +0200 Subject: genirq: define nr_irqs for architectures with GENERIC_HARDIRQS=n Revert the sparse irq changes in m68k/s390/sparc and just define nr_irqs as NR_IRQS for those architectures. Signed-off-by: Thomas Gleixner --- include/linux/interrupt.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index d4039a0b23f..5a57df2ee92 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -15,11 +15,13 @@ #include #include -extern int nr_irqs; - #ifndef CONFIG_GENERIC_HARDIRQS -#define for_each_irq_desc(irq, desc) \ +# define for_each_irq_desc(irq, desc) \ for (irq = 0; irq < nr_irqs; irq++) + +# define nr_irqs NR_IRQS +#else +extern int nr_irqs; #endif /* -- cgit v1.2.3 From 70dd4d992ab324a59cdcd6bedc3f4e729863d514 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 15 Oct 2008 15:39:27 +0200 Subject: genirq: consolidate nr_irqs and for_each_irq_desc() Move all of those to linux/irq.h where they belong. Signed-off-by: Thomas Gleixner --- include/linux/interrupt.h | 9 --------- include/linux/irq.h | 17 ++++++++++++----- 2 files changed, 12 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 5a57df2ee92..58ff4e74b2f 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -15,15 +15,6 @@ #include #include -#ifndef CONFIG_GENERIC_HARDIRQS -# define for_each_irq_desc(irq, desc) \ - for (irq = 0; irq < nr_irqs; irq++) - -# define nr_irqs NR_IRQS -#else -extern int nr_irqs; -#endif - /* * These correspond to the IORESOURCE_IRQ_* defines in * linux/ioport.h to select the interrupt line behaviour. When diff --git a/include/linux/irq.h b/include/linux/irq.h index 93fe9a943e7..dbe8734ae86 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -11,6 +11,18 @@ #include +#ifndef CONFIG_GENERIC_HARDIRQS +# define nr_irqs NR_IRQS + +# define for_each_irq_desc(irq, desc) \ + for (irq = 0; irq < nr_irqs; irq++) +#else +extern int nr_irqs; + +# define for_each_irq_desc(irq, desc) \ + for (irq = 0, desc = irq_desc; irq < nr_irqs; irq++, desc++) +#endif + #ifndef CONFIG_S390 #include @@ -204,11 +216,6 @@ extern struct irq_desc irq_desc[NR_IRQS]; extern struct irq_desc *irq_desc; #endif -#ifdef CONFIG_GENERIC_HARDIRQS -#define for_each_irq_desc(irq, desc) \ - for (irq = 0, desc = irq_desc; irq < nr_irqs; irq++, desc = &irq_desc[irq]) -#endif - #else extern struct irq_desc *sparse_irqs; -- cgit v1.2.3 From c6b7674f323622d86316bf7951ad9cae1ce24642 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 15 Oct 2008 14:31:29 +0200 Subject: genirq: use inline function for irq_to_desc For the non sparse irq case an inline function is perfectly fine. Signed-off-by: Thomas Gleixner --- include/linux/irq.h | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irq.h b/include/linux/irq.h index dbe8734ae86..7d1adacaadb 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -204,8 +204,6 @@ struct irq_desc { const char *name; } ____cacheline_internodealigned_in_smp; -extern struct irq_desc *irq_to_desc(unsigned int irq); -extern struct irq_desc *irq_to_desc_alloc(unsigned int irq); #ifndef CONFIG_HAVE_SPARSE_IRQ @@ -216,8 +214,21 @@ extern struct irq_desc irq_desc[NR_IRQS]; extern struct irq_desc *irq_desc; #endif +static inline struct irq_desc *irq_to_desc(unsigned int irq) +{ + return (irq < nr_irqs) ? irq_desc + irq : NULL; +} + +static inline struct irq_desc *irq_to_desc_alloc(unsigned int irq) +{ + return irq_to_desc(irq); +} + #else +extern struct irq_desc *irq_to_desc(unsigned int irq); +extern struct irq_desc *irq_to_desc_alloc(unsigned int irq); + extern struct irq_desc *sparse_irqs; #define for_each_irq_desc(irqX, desc) \ for (desc = sparse_irqs, irqX = desc->irq; desc; desc = desc->next, irqX = desc ? desc->irq : -1U) -- cgit v1.2.3 From 2cc21ef843d4fb7da122239b644a1f6f0aca60a6 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 15 Oct 2008 14:16:55 +0200 Subject: genirq: remove sparse irq code This code is not ready, but we need to rip it out instead of rebasing as we would lose the APIC/IO_APIC unification otherwise. Signed-off-by: Thomas Gleixner --- include/linux/irq.h | 20 -------------------- 1 file changed, 20 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irq.h b/include/linux/irq.h index 7d1adacaadb..68e0f3f9df3 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -167,15 +167,8 @@ struct irq_2_iommu; */ struct irq_desc { unsigned int irq; -#ifdef CONFIG_HAVE_SPARSE_IRQ - struct irq_desc *next; - struct timer_rand_state *timer_rand_state; -#endif #ifdef CONFIG_HAVE_DYN_ARRAY unsigned int *kstat_irqs; -#endif -#if defined(CONFIG_INTR_REMAP) && defined(CONFIG_HAVE_SPARSE_IRQ) - struct irq_2_iommu *irq_2_iommu; #endif irq_flow_handler_t handle_irq; struct irq_chip *chip; @@ -205,8 +198,6 @@ struct irq_desc { } ____cacheline_internodealigned_in_smp; -#ifndef CONFIG_HAVE_SPARSE_IRQ - #ifndef CONFIG_HAVE_DYN_ARRAY /* could be removed if we get rid of all irq_desc reference */ extern struct irq_desc irq_desc[NR_IRQS]; @@ -224,17 +215,6 @@ static inline struct irq_desc *irq_to_desc_alloc(unsigned int irq) return irq_to_desc(irq); } -#else - -extern struct irq_desc *irq_to_desc(unsigned int irq); -extern struct irq_desc *irq_to_desc_alloc(unsigned int irq); - -extern struct irq_desc *sparse_irqs; -#define for_each_irq_desc(irqX, desc) \ - for (desc = sparse_irqs, irqX = desc->irq; desc; desc = desc->next, irqX = desc ? desc->irq : -1U) - -#endif - #ifdef CONFIG_HAVE_DYN_ARRAY #define kstat_irqs_this_cpu(DESC) \ ((DESC)->kstat_irqs[smp_processor_id()]) -- cgit v1.2.3 From ee32c9732244bde4b9b59eeac2814c23e2b71f8d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 15 Oct 2008 14:34:09 +0200 Subject: genirq: remove irq_to_desc_alloc Remove the leftover of sparseirqs. Signed-off-by: Thomas Gleixner --- include/linux/irq.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irq.h b/include/linux/irq.h index 68e0f3f9df3..3f33c779030 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -210,11 +210,6 @@ static inline struct irq_desc *irq_to_desc(unsigned int irq) return (irq < nr_irqs) ? irq_desc + irq : NULL; } -static inline struct irq_desc *irq_to_desc_alloc(unsigned int irq) -{ - return irq_to_desc(irq); -} - #ifdef CONFIG_HAVE_DYN_ARRAY #define kstat_irqs_this_cpu(DESC) \ ((DESC)->kstat_irqs[smp_processor_id()]) -- cgit v1.2.3 From d6c88a507ef0b6afdb013cba4e7804ba7324d99a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 15 Oct 2008 15:27:23 +0200 Subject: genirq: revert dynarray Revert the dynarray changes. They need more thought and polishing. Signed-off-by: Thomas Gleixner --- include/linux/init.h | 43 ------------------------------------------- include/linux/irq.h | 15 --------------- include/linux/kernel_stat.h | 16 ++++++---------- 3 files changed, 6 insertions(+), 68 deletions(-) (limited to 'include/linux') diff --git a/include/linux/init.h b/include/linux/init.h index 59fbb4aaba6..70ad53e1eab 100644 --- a/include/linux/init.h +++ b/include/linux/init.h @@ -247,49 +247,6 @@ struct obs_kernel_param { /* Relies on boot_command_line being set */ void __init parse_early_param(void); -struct dyn_array { - void **name; - unsigned long size; - unsigned int *nr; - unsigned long align; - void (*init_work)(void *); -}; -extern struct dyn_array *__dyn_array_start[], *__dyn_array_end[]; -extern struct dyn_array *__per_cpu_dyn_array_start[], *__per_cpu_dyn_array_end[]; - -#define DEFINE_DYN_ARRAY_ADDR(nameX, addrX, sizeX, nrX, alignX, init_workX) \ - static struct dyn_array __dyn_array_##nameX __initdata = \ - { .name = (void **)&(nameX),\ - .size = sizeX,\ - .nr = &(nrX),\ - .align = alignX,\ - .init_work = init_workX,\ - }; \ - static struct dyn_array *__dyn_array_ptr_##nameX __used \ - __attribute__((__section__(".dyn_array.init"))) = \ - &__dyn_array_##nameX - -#define DEFINE_DYN_ARRAY(nameX, sizeX, nrX, alignX, init_workX) \ - DEFINE_DYN_ARRAY_ADDR(nameX, nameX, sizeX, nrX, alignX, init_workX) - -#define DEFINE_PER_CPU_DYN_ARRAY_ADDR(nameX, addrX, sizeX, nrX, alignX, init_workX) \ - static struct dyn_array __per_cpu_dyn_array_##nameX __initdata = \ - { .name = (void **)&(addrX),\ - .size = sizeX,\ - .nr = &(nrX),\ - .align = alignX,\ - .init_work = init_workX,\ - }; \ - static struct dyn_array *__per_cpu_dyn_array_ptr_##nameX __used \ - __attribute__((__section__(".per_cpu_dyn_array.init"))) = \ - &__per_cpu_dyn_array_##nameX - -#define DEFINE_PER_CPU_DYN_ARRAY(nameX, sizeX, nrX, alignX, init_workX) \ - DEFINE_PER_CPU_DYN_ARRAY_ADDR(nameX, nameX, nrX, alignX, init_workX) - -extern void pre_alloc_dyn_array(void); -extern unsigned long per_cpu_dyn_array_size(unsigned long *align); -extern void per_cpu_alloc_dyn_array(int cpu, char *ptr); #endif /* __ASSEMBLY__ */ /** diff --git a/include/linux/irq.h b/include/linux/irq.h index 3f33c779030..38bf89f2ade 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -139,8 +139,6 @@ struct irq_chip { const char *typename; }; -struct timer_rand_state; -struct irq_2_iommu; /** * struct irq_desc - interrupt descriptor * @@ -167,9 +165,6 @@ struct irq_2_iommu; */ struct irq_desc { unsigned int irq; -#ifdef CONFIG_HAVE_DYN_ARRAY - unsigned int *kstat_irqs; -#endif irq_flow_handler_t handle_irq; struct irq_chip *chip; struct msi_desc *msi_desc; @@ -198,23 +193,13 @@ struct irq_desc { } ____cacheline_internodealigned_in_smp; -#ifndef CONFIG_HAVE_DYN_ARRAY -/* could be removed if we get rid of all irq_desc reference */ extern struct irq_desc irq_desc[NR_IRQS]; -#else -extern struct irq_desc *irq_desc; -#endif static inline struct irq_desc *irq_to_desc(unsigned int irq) { return (irq < nr_irqs) ? irq_desc + irq : NULL; } -#ifdef CONFIG_HAVE_DYN_ARRAY -#define kstat_irqs_this_cpu(DESC) \ - ((DESC)->kstat_irqs[smp_processor_id()]) -#endif - /* * Migration helpers for obsolete names, they will go away: */ diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h index 21249d8c129..a9d0d360b77 100644 --- a/include/linux/kernel_stat.h +++ b/include/linux/kernel_stat.h @@ -28,9 +28,7 @@ struct cpu_usage_stat { struct kernel_stat { struct cpu_usage_stat cpustat; -#ifndef CONFIG_HAVE_DYN_ARRAY unsigned int irqs[NR_IRQS]; -#endif }; DECLARE_PER_CPU(struct kernel_stat, kstat); @@ -41,20 +39,18 @@ DECLARE_PER_CPU(struct kernel_stat, kstat); extern unsigned long long nr_context_switches(void); -#ifndef CONFIG_HAVE_DYN_ARRAY -#define kstat_irqs_this_cpu(irq) \ - (kstat_this_cpu.irqs[irq]) -#endif +struct irq_desc; +static inline void kstat_incr_irqs_this_cpu(unsigned int irq, + struct irq_desc *desc) +{ + kstat_this_cpu.irqs[irq]++; +} -#ifndef CONFIG_HAVE_DYN_ARRAY static inline unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) { return kstat_cpu(cpu).irqs[irq]; } -#else -extern unsigned int kstat_irqs_cpu(unsigned int irq, int cpu); -#endif /* * Number of interrupts per specific IRQ source, since bootup -- cgit v1.2.3 From a1aca5de08a0cb840a90fb3f729a5940f8d21185 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 15 Oct 2008 19:29:15 +0200 Subject: genirq: remove artifacts from sparseirq removal Signed-off-by: Ingo Molnar --- include/linux/init.h | 1 - include/linux/kernel_stat.h | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/init.h b/include/linux/init.h index 70ad53e1eab..93538b696e3 100644 --- a/include/linux/init.h +++ b/include/linux/init.h @@ -246,7 +246,6 @@ struct obs_kernel_param { /* Relies on boot_command_line being set */ void __init parse_early_param(void); - #endif /* __ASSEMBLY__ */ /** diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h index a9d0d360b77..89b6ecd4147 100644 --- a/include/linux/kernel_stat.h +++ b/include/linux/kernel_stat.h @@ -28,7 +28,7 @@ struct cpu_usage_stat { struct kernel_stat { struct cpu_usage_stat cpustat; - unsigned int irqs[NR_IRQS]; + unsigned int irqs[NR_IRQS]; }; DECLARE_PER_CPU(struct kernel_stat, kstat); -- cgit v1.2.3 From 811410fdb6b9d82a518542289efe9b2a51e3cbfb Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 16 Oct 2008 14:16:11 +0200 Subject: genirq: add reverse iterator for irq_desc Signed-off-by: Thomas Gleixner --- include/linux/irq.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/irq.h b/include/linux/irq.h index 38bf89f2ade..31632aa65d1 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -21,6 +21,10 @@ extern int nr_irqs; # define for_each_irq_desc(irq, desc) \ for (irq = 0, desc = irq_desc; irq < nr_irqs; irq++, desc++) + +# define for_each_irq_desc_reverse(irq, desc) \ + for (irq = nr_irqs -1, desc = irq_desc + (nr_irqs -1 ); \ + irq > 0; irq--, desc--) #endif #ifndef CONFIG_S390 -- cgit v1.2.3 From 2be3b52a5785a6a5c5349fbd315f57595f7074be Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 16 Oct 2008 14:50:27 +0200 Subject: proc: fixup irq iterator There is no need for irq_desc here. Even for sparse_irq we can handle this clever in for_each_irq_nr(). Signed-off-by: Thomas Gleixner --- include/linux/irq.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/irq.h b/include/linux/irq.h index 31632aa65d1..0618fb362cb 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -27,6 +27,9 @@ extern int nr_irqs; irq > 0; irq--, desc--) #endif +#define for_each_irq_nr(irq) \ + for (irq = 0; irq < nr_irqs; irq++) + #ifndef CONFIG_S390 #include -- cgit v1.2.3 From fe11edfaabf1787c05d782a7b33e6497d1118b1d Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Fri, 17 Oct 2008 18:09:11 +0200 Subject: ide: IDE_AFLAG_MEDIA_CHANGED -> IDE_DFLAG_MEDIA_CHANGED There should be no functional changes caused by this patch. Acked-by: Borislav Petkov Signed-off-by: Bartlomiej Zolnierkiewicz --- include/linux/ide.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ide.h b/include/linux/ide.h index c47e371554c..155a57f55c6 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -464,7 +464,6 @@ struct ide_acpi_hwif_link; /* ATAPI device flags */ enum { IDE_AFLAG_DRQ_INTERRUPT = (1 << 0), - IDE_AFLAG_MEDIA_CHANGED = (1 << 1), /* Drive cannot lock the door. */ IDE_AFLAG_NO_DOORLOCK = (1 << 2), @@ -578,7 +577,8 @@ enum { /* don't unload heads */ IDE_DFLAG_NO_UNLOAD = (1 << 27), /* heads unloaded, please don't reset port */ - IDE_DFLAG_PARKED = (1 << 28) + IDE_DFLAG_PARKED = (1 << 28), + IDE_DFLAG_MEDIA_CHANGED = (1 << 29), }; struct ide_drive_s { -- cgit v1.2.3 From da167876bd0f71f1c646e5dd98997544d8d90e8e Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Fri, 17 Oct 2008 18:09:11 +0200 Subject: ide: IDE_AFLAG_WP -> IDE_DFLAG_WP There should be no functional changes caused by this patch. Acked-by: Borislav Petkov Signed-off-by: Bartlomiej Zolnierkiewicz --- include/linux/ide.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ide.h b/include/linux/ide.h index 155a57f55c6..bd0a4d36b6d 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -503,8 +503,6 @@ enum { IDE_AFLAG_CLIK_DRIVE = (1 << 19), /* Requires BH algorithm for packets */ IDE_AFLAG_ZIP_DRIVE = (1 << 20), - /* Write protect */ - IDE_AFLAG_WP = (1 << 21), /* Supports format progress report */ IDE_AFLAG_SRFP = (1 << 22), @@ -579,6 +577,8 @@ enum { /* heads unloaded, please don't reset port */ IDE_DFLAG_PARKED = (1 << 28), IDE_DFLAG_MEDIA_CHANGED = (1 << 29), + /* write protect */ + IDE_DFLAG_WP = (1 << 30), }; struct ide_drive_s { -- cgit v1.2.3 From e01286282eef85e4783b06fb2e0ed84fc111eb32 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Fri, 17 Oct 2008 18:09:11 +0200 Subject: ide: IDE_AFLAG_FORMAT_IN_PROGRESS -> IDE_DFLAG_FORMAT_IN_PROGRESS There should be no functional changes caused by this patch. Acked-by: Borislav Petkov Signed-off-by: Bartlomiej Zolnierkiewicz --- include/linux/ide.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ide.h b/include/linux/ide.h index bd0a4d36b6d..d111c3ebbba 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -497,8 +497,6 @@ enum { IDE_AFLAG_LE_SPEED_FIELDS = (1 << 17), /* ide-floppy */ - /* Format in progress */ - IDE_AFLAG_FORMAT_IN_PROGRESS = (1 << 18), /* Avoid commands not supported in Clik drive */ IDE_AFLAG_CLIK_DRIVE = (1 << 19), /* Requires BH algorithm for packets */ @@ -579,6 +577,7 @@ enum { IDE_DFLAG_MEDIA_CHANGED = (1 << 29), /* write protect */ IDE_DFLAG_WP = (1 << 30), + IDE_DFLAG_FORMAT_IN_PROGRESS = (1 << 31), }; struct ide_drive_s { -- cgit v1.2.3 From 42619d35c7af2f88cad56425fe3981f1f65ff0bd Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Fri, 17 Oct 2008 18:09:11 +0200 Subject: ide: remove IDE_AFLAG_NO_DOORLOCKING Just use IDE_DFLAG_DOORLOCKING instead. There should be no functional changes caused by this patch. Acked-by: Borislav Petkov Signed-off-by: Bartlomiej Zolnierkiewicz --- include/linux/ide.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ide.h b/include/linux/ide.h index d111c3ebbba..ba51a93fa54 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -464,8 +464,6 @@ struct ide_acpi_hwif_link; /* ATAPI device flags */ enum { IDE_AFLAG_DRQ_INTERRUPT = (1 << 0), - /* Drive cannot lock the door. */ - IDE_AFLAG_NO_DOORLOCK = (1 << 2), /* ide-cd */ /* Drive cannot eject the disc. */ -- cgit v1.2.3 From 79cb380397c834a35952d8497651d93b543ef968 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Fri, 17 Oct 2008 18:09:13 +0200 Subject: ide: allow device drivers to specify per-device type /proc settings Turn ide_driver_t's 'proc' field into ->proc_entries method (and also 'settings' field into ->proc_devsets method). Then update all device drivers accordingly. There should be no functional changes caused by this patch. Acked-by: Borislav Petkov Signed-off-by: Bartlomiej Zolnierkiewicz --- include/linux/ide.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ide.h b/include/linux/ide.h index ba51a93fa54..488808891ac 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -1120,8 +1120,8 @@ struct ide_driver_s { void (*resume)(ide_drive_t *); void (*shutdown)(ide_drive_t *); #ifdef CONFIG_IDE_PROC_FS - ide_proc_entry_t *proc; - const struct ide_proc_devset *settings; + ide_proc_entry_t * (*proc_entries)(ide_drive_t *); + const struct ide_proc_devset * (*proc_devsets)(ide_drive_t *); #endif }; -- cgit v1.2.3 From 806f80a6fc203ad0bde84e5a9e94572617d2ae45 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Fri, 17 Oct 2008 18:09:14 +0200 Subject: ide: add generic ATA/ATAPI disk driver * Add struct ide_disk_ops containing protocol specific methods. * Add 'struct ide_disk_ops *' to ide_drive_t. * Convert ide-{disk,floppy} drivers to use struct ide_disk_ops. * Merge ide-{disk,floppy} drivers into generic ide-gd driver. While at it: - ide_disk_init_capacity() -> ide_disk_get_capacity() Acked-by: Borislav Petkov Signed-off-by: Bartlomiej Zolnierkiewicz --- include/linux/ide.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ide.h b/include/linux/ide.h index 488808891ac..89e53cfbc78 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -461,6 +461,23 @@ struct ide_acpi_drive_link; struct ide_acpi_hwif_link; #endif +struct ide_drive_s; + +struct ide_disk_ops { + int (*check)(struct ide_drive_s *, const char *); + int (*get_capacity)(struct ide_drive_s *); + void (*setup)(struct ide_drive_s *); + void (*flush)(struct ide_drive_s *); + int (*init_media)(struct ide_drive_s *, struct gendisk *); + int (*set_doorlock)(struct ide_drive_s *, struct gendisk *, + int); + ide_startstop_t (*do_request)(struct ide_drive_s *, struct request *, + sector_t); + int (*end_request)(struct ide_drive_s *, int, int); + int (*ioctl)(struct ide_drive_s *, struct inode *, + struct file *, unsigned int, unsigned long); +}; + /* ATAPI device flags */ enum { IDE_AFLAG_DRQ_INTERRUPT = (1 << 0), @@ -594,6 +611,8 @@ struct ide_drive_s { #endif struct hwif_s *hwif; /* actually (ide_hwif_t *) */ + const struct ide_disk_ops *disk_ops; + unsigned long dev_flags; unsigned long sleep; /* sleep until this time */ -- cgit v1.2.3 From 719254faa17ffedc87ba0fadb9b34e535c9758d5 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 17 Oct 2008 09:59:47 +0200 Subject: NOHZ: unify the nohz function calls in irq_enter() We have two separate nohz function calls in irq_enter() for no good reason. Just call a single NOHZ function from irq_enter() and call the bits in the tick code. Signed-off-by: Thomas Gleixner --- include/linux/tick.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tick.h b/include/linux/tick.h index 98921a3e1aa..b6ec8189ac0 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -96,9 +96,11 @@ extern cpumask_t *tick_get_broadcast_oneshot_mask(void); extern void tick_clock_notify(void); extern int tick_check_oneshot_change(int allow_nohz); extern struct tick_sched *tick_get_tick_sched(int cpu); +extern void tick_check_idle(int cpu); # else static inline void tick_clock_notify(void) { } static inline int tick_check_oneshot_change(int allow_nohz) { return 0; } +static inline void tick_check_idle(int cpu) { } # endif #else /* CONFIG_GENERIC_CLOCKEVENTS */ @@ -106,26 +108,23 @@ static inline void tick_init(void) { } static inline void tick_cancel_sched_timer(int cpu) { } static inline void tick_clock_notify(void) { } static inline int tick_check_oneshot_change(int allow_nohz) { return 0; } +static inline void tick_check_idle(int cpu) { } #endif /* !CONFIG_GENERIC_CLOCKEVENTS */ # ifdef CONFIG_NO_HZ extern void tick_nohz_stop_sched_tick(int inidle); extern void tick_nohz_restart_sched_tick(void); -extern void tick_nohz_update_jiffies(void); extern ktime_t tick_nohz_get_sleep_length(void); -extern void tick_nohz_stop_idle(int cpu); extern u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time); # else static inline void tick_nohz_stop_sched_tick(int inidle) { } static inline void tick_nohz_restart_sched_tick(void) { } -static inline void tick_nohz_update_jiffies(void) { } static inline ktime_t tick_nohz_get_sleep_length(void) { ktime_t len = { .tv64 = NSEC_PER_SEC/HZ }; return len; } -static inline void tick_nohz_stop_idle(int cpu) { } static inline u64 get_cpu_idle_time_us(int cpu, u64 *unused) { return -1; } # endif /* !NO_HZ */ -- cgit v1.2.3 From dd3a1db900f2a215a7d7dd71b836e149a6cf5fed Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 16 Oct 2008 18:20:58 +0200 Subject: genirq: improve include files Move the irq_desc related iterators out of irq.h, into irqnr.h, also available via interrupt.h. This way non-genirq (and even non-hardirq) architectures get the common definitions and iterators. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- include/linux/interrupt.h | 1 + include/linux/irq.h | 20 +------------------- include/linux/irqnr.h | 24 ++++++++++++++++++++++++ 3 files changed, 26 insertions(+), 19 deletions(-) create mode 100644 include/linux/irqnr.h (limited to 'include/linux') diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 58ff4e74b2f..72fcfcff563 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include diff --git a/include/linux/irq.h b/include/linux/irq.h index 0618fb362cb..d058c57be02 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -11,25 +11,6 @@ #include -#ifndef CONFIG_GENERIC_HARDIRQS -# define nr_irqs NR_IRQS - -# define for_each_irq_desc(irq, desc) \ - for (irq = 0; irq < nr_irqs; irq++) -#else -extern int nr_irqs; - -# define for_each_irq_desc(irq, desc) \ - for (irq = 0, desc = irq_desc; irq < nr_irqs; irq++, desc++) - -# define for_each_irq_desc_reverse(irq, desc) \ - for (irq = nr_irqs -1, desc = irq_desc + (nr_irqs -1 ); \ - irq > 0; irq--, desc--) -#endif - -#define for_each_irq_nr(irq) \ - for (irq = 0; irq < nr_irqs; irq++) - #ifndef CONFIG_S390 #include @@ -37,6 +18,7 @@ extern int nr_irqs; #include #include #include +#include #include #include diff --git a/include/linux/irqnr.h b/include/linux/irqnr.h new file mode 100644 index 00000000000..3171ddc3b39 --- /dev/null +++ b/include/linux/irqnr.h @@ -0,0 +1,24 @@ +#ifndef _LINUX_IRQNR_H +#define _LINUX_IRQNR_H + +#ifndef CONFIG_GENERIC_HARDIRQS +#include +# define nr_irqs NR_IRQS + +# define for_each_irq_desc(irq, desc) \ + for (irq = 0; irq < nr_irqs; irq++) +#else +extern int nr_irqs; + +# define for_each_irq_desc(irq, desc) \ + for (irq = 0, desc = irq_desc; irq < nr_irqs; irq++, desc++) + +# define for_each_irq_desc_reverse(irq, desc) \ + for (irq = nr_irqs -1, desc = irq_desc + (nr_irqs -1 ); \ + irq > 0; irq--, desc--) +#endif + +#define for_each_irq_nr(irq) \ + for (irq = 0; irq < nr_irqs; irq++) + +#endif -- cgit v1.2.3 From edbc25caaa492a82e19baa915f1f6b0a0db6554d Mon Sep 17 00:00:00 2001 From: Milton Miller Date: Thu, 10 Jul 2008 16:29:37 -0500 Subject: PCI: remove dynids.use_driver_data The driver flag dynids.use_driver_data is almost consistently not set, and causes more problems than it solves. It was initially intended as a flag to indicate whether a driver's usage of driver_data had been carefully inspected and was ready for values from userspace. That audit was never done, so most drivers just get a 0 for driver_data when new IDs are added from userspace via sysfs. So remove the flag, allowing drivers to see the data directly (a followon patch validates the passed driver_data value against what the drivers expect). Acked-by: Greg Kroah-Hartman Acked-by: Jean Delvare Signed-off-by: Milton Miller Signed-off-by: Jesse Barnes --- include/linux/pci.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index acf8f24037c..c989f58d09b 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -347,7 +347,6 @@ struct pci_bus_region { struct pci_dynids { spinlock_t lock; /* protects list, index */ struct list_head list; /* for IDs added at runtime */ - unsigned int use_driver_data:1; /* pci_device_id->driver_data is used */ }; /* ---------------------------------------------------------------- */ -- cgit v1.2.3 From 0235c4fc7fc6f621dc0dd89eba102ad5aa373390 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 18 Aug 2008 21:38:00 +0200 Subject: PCI PM: Introduce function pci_wake_from_d3 Many device drivers use the following sequence of statements to enable the device to wake up the system while being in the D3_hot or D3_cold low power state: pci_enable_wake(pdev, PCI_D3hot, 1); pci_enable_wake(pdev, PCI_D3cold, 1); However, the second call is not necessary if the first one succeeds (the ordering of the statements above doesn't matter here) and it may even be harmful, because we are not supposed to enable PME# after the wake-up power has been enabled for the device. To allow drivers to overcome this problem, introduce function pci_wake_from_d3() that will enable the device to wake up the system from any of D3_hot and D3_cold as long as the wake-up from at least one of them is supported. Acked-by: Pavel Machek Signed-off-by: Rafael J. Wysocki Signed-off-by: Jesse Barnes --- include/linux/pci.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index c989f58d09b..f7e7dbc0919 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -644,6 +644,7 @@ pci_power_t pci_choose_state(struct pci_dev *dev, pm_message_t state); bool pci_pme_capable(struct pci_dev *dev, pci_power_t state); void pci_pme_active(struct pci_dev *dev, bool enable); int pci_enable_wake(struct pci_dev *dev, pci_power_t state, int enable); +int pci_wake_from_d3(struct pci_dev *dev, bool enable); pci_power_t pci_target_state(struct pci_dev *dev); int pci_prepare_to_sleep(struct pci_dev *dev); int pci_back_from_sleep(struct pci_dev *dev); -- cgit v1.2.3 From 16dbef4a831782466b10d4ae56837c5ba17d1948 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 15 Aug 2008 19:36:45 -0700 Subject: PCI: change MSI-x vector to 32bit We are using 28bit pci (bus/dev/fn + 12 bits) as irq number, so the cache for irq number should be 32 bit too. Signed-off-by: Yinghai Lu Cc: Andrew Vasquez Signed-off-by: Jesse Barnes --- include/linux/pci.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index f7e7dbc0919..8a4d0bebc31 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -725,7 +725,7 @@ enum pci_dma_burst_strategy { }; struct msix_entry { - u16 vector; /* kernel uses to write allocated vector */ + u32 vector; /* kernel uses to write allocated vector */ u16 entry; /* driver uses to specify entry, OS writes */ }; -- cgit v1.2.3 From 37a84ec668ba251ae02cf2c2c664baf6b247ae1f Mon Sep 17 00:00:00 2001 From: Seth Heasley Date: Thu, 28 Aug 2008 15:40:59 -0700 Subject: x86/PCI: irq and pci_ids patch for Intel Ibex Peak DeviceIDs This patch updates the Intel Ibex Peak (PCH) LPC and SMBus Controller DeviceIDs. The LPC Controller ID is set by Firmware within the range of 0x3b00-3b1f. This range is included in pci_ids.h using min and max values, and irq.c now has code to handle the range (in lieu of 32 additions to a SWITCH statement). The SMBus Controller ID is a fixed-value and will not change. Signed-off-by: Seth Heasley Acked-by: Jean Delvare Signed-off-by: Jesse Barnes --- include/linux/pci_ids.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index 8edddc240e4..e5d344bfcb7 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -2454,9 +2454,9 @@ #define PCI_DEVICE_ID_INTEL_ICH10_3 0x3a1a #define PCI_DEVICE_ID_INTEL_ICH10_4 0x3a30 #define PCI_DEVICE_ID_INTEL_ICH10_5 0x3a60 -#define PCI_DEVICE_ID_INTEL_PCH_0 0x3b10 -#define PCI_DEVICE_ID_INTEL_PCH_1 0x3b11 -#define PCI_DEVICE_ID_INTEL_PCH_2 0x3b30 +#define PCI_DEVICE_ID_INTEL_PCH_LPC_MIN 0x3b00 +#define PCI_DEVICE_ID_INTEL_PCH_LPC_MAX 0x3b1f +#define PCI_DEVICE_ID_INTEL_PCH_SMBUS 0x3b30 #define PCI_DEVICE_ID_INTEL_IOAT_SNB 0x402f #define PCI_DEVICE_ID_INTEL_5100_16 0x65f0 #define PCI_DEVICE_ID_INTEL_5100_21 0x65f5 -- cgit v1.2.3 From c322b28a04c084a467a862766f74c40c917a721c Mon Sep 17 00:00:00 2001 From: "Zhao, Yu" Date: Mon, 13 Oct 2008 19:36:05 +0800 Subject: PCI: use same arg names in PCI_VDEVICE comment This cleanup makes the argument names in PCI_VDEVICE comment consistent with those used in its definition. Signed-off-by: Yu Zhao Signed-off-by: Jesse Barnes --- include/linux/pci.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index 8a4d0bebc31..008005674b6 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -455,8 +455,8 @@ struct pci_driver { /** * PCI_VDEVICE - macro used to describe a specific pci device in short form - * @vend: the vendor name - * @dev: the 16 bit PCI Device ID + * @vendor: the vendor name + * @device: the 16 bit PCI Device ID * * This macro is used to create a struct pci_device_id that matches a * specific PCI device. The subvendor, and subdevice fields will be set -- cgit v1.2.3 From 58c3a727cb73b75a9104d295f096cca12959a5a5 Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Tue, 14 Oct 2008 14:02:53 +0800 Subject: PCI: support PCIe ARI capability This patch adds support for PCI Express Alternative Routing-ID Interpretation (ARI) capability. The ARI capability extends the Function Number field of the PCI Express Endpoint by reusing the Device Number which is otherwise hardwired to 0. With ARI, an Endpoint can have up to 256 functions. Signed-off-by: Yu Zhao Signed-off-by: Jesse Barnes --- include/linux/pci.h | 1 + include/linux/pci_regs.h | 14 ++++++++++++++ 2 files changed, 15 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index 008005674b6..7e9a1f0715e 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -214,6 +214,7 @@ struct pci_dev { unsigned int broken_parity_status:1; /* Device generates false positive parity */ unsigned int msi_enabled:1; unsigned int msix_enabled:1; + unsigned int ari_enabled:1; /* ARI forwarding */ unsigned int is_managed:1; unsigned int is_pcie:1; pci_dev_flags_t dev_flags; diff --git a/include/linux/pci_regs.h b/include/linux/pci_regs.h index 450684f7eaa..eb6686b88f9 100644 --- a/include/linux/pci_regs.h +++ b/include/linux/pci_regs.h @@ -419,6 +419,10 @@ #define PCI_EXP_RTCTL_CRSSVE 0x10 /* CRS Software Visibility Enable */ #define PCI_EXP_RTCAP 30 /* Root Capabilities */ #define PCI_EXP_RTSTA 32 /* Root Status */ +#define PCI_EXP_DEVCAP2 36 /* Device Capabilities 2 */ +#define PCI_EXP_DEVCAP2_ARI 0x20 /* Alternative Routing-ID */ +#define PCI_EXP_DEVCTL2 40 /* Device Control 2 */ +#define PCI_EXP_DEVCTL2_ARI 0x20 /* Alternative Routing-ID */ /* Extended Capabilities (PCI-X 2.0 and Express) */ #define PCI_EXT_CAP_ID(header) (header & 0x0000ffff) @@ -429,6 +433,7 @@ #define PCI_EXT_CAP_ID_VC 2 #define PCI_EXT_CAP_ID_DSN 3 #define PCI_EXT_CAP_ID_PWR 4 +#define PCI_EXT_CAP_ID_ARI 14 /* Advanced Error Reporting */ #define PCI_ERR_UNCOR_STATUS 4 /* Uncorrectable Error Status */ @@ -536,5 +541,14 @@ #define HT_CAPTYPE_GEN3 0xD0 /* Generation 3 hypertransport configuration */ #define HT_CAPTYPE_PM 0xE0 /* Hypertransport powermanagement configuration */ +/* Alternative Routing-ID Interpretation */ +#define PCI_ARI_CAP 0x04 /* ARI Capability Register */ +#define PCI_ARI_CAP_MFVC 0x0001 /* MFVC Function Groups Capability */ +#define PCI_ARI_CAP_ACS 0x0002 /* ACS Function Groups Capability */ +#define PCI_ARI_CAP_NFN(x) (((x) >> 8) & 0xff) /* Next Function Number */ +#define PCI_ARI_CTRL 0x06 /* ARI Control Register */ +#define PCI_ARI_CTRL_MFVC 0x0001 /* MFVC Function Groups Enable */ +#define PCI_ARI_CTRL_ACS 0x0002 /* ACS Function Groups Enable */ +#define PCI_ARI_CTRL_FG(x) (((x) >> 4) & 7) /* Function Group */ #endif /* LINUX_PCI_REGS_H */ -- cgit v1.2.3 From aa42d7c6138afdc54f74e971456a0fbfec16b77b Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sun, 28 Sep 2008 16:36:11 -0700 Subject: PCI: introduce an pci_ioremap(pdev, barnr) function A common thing in many PCI drivers is to ioremap() an entire bar. This is a slightly fragile thing right now, needing both an address and a size, and many driver writers do.. various things there. This patch introduces an pci_ioremap() function taking just a PCI device struct and the bar number as arguments, and figures this all out itself, in one place. In addition, we can add various sanity checks to this function (the patch already checks to make sure that the bar in question really is a MEM bar; few to no drivers do that sort of thing). Hopefully with this type of API we get less chance of mistakes in drivers with ioremap() operations. Signed-off-by: Arjan van de Ven Signed-off-by: Jesse Barnes --- include/linux/pci.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index 7e9a1f0715e..46ad282ffe4 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1119,5 +1119,18 @@ static inline void pci_mmcfg_early_init(void) { } static inline void pci_mmcfg_late_init(void) { } #endif +static inline void * pci_ioremap_bar(struct pci_dev *pdev, int bar) +{ + /* + * Make sure the BAR is actually a memory resource, not an IO resource + */ + if (!(pci_resource_flags(pdev, bar) & IORESOURCE_MEM)) { + WARN_ON(1); + return NULL; + } + return ioremap_nocache(pci_resource_start(pdev, bar), + pci_resource_len(pdev, bar)); +} + #endif /* __KERNEL__ */ #endif /* LINUX_PCI_H */ -- cgit v1.2.3 From 0927678f55c9a50c296f7e6dae85e87b8236e155 Mon Sep 17 00:00:00 2001 From: Jesse Barnes Date: Sat, 18 Oct 2008 17:33:19 -0700 Subject: PCI: use pci_find_ext_capability everywhere Remove some open coded (and buggy) versions of pci_find_ext_capability in favor of the real routine in the PCI core. Tested-by: Tomasz Czernecki Acked-by: Andrew Vasquez Reviewed-by: Matthew Wilcox Signed-off-by: Jesse Barnes --- include/linux/aer.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/aer.h b/include/linux/aer.h index f2518141de8..a2383a72356 100644 --- a/include/linux/aer.h +++ b/include/linux/aer.h @@ -18,10 +18,6 @@ static inline int pci_enable_pcie_error_reporting(struct pci_dev *dev) { return -EINVAL; } -static inline int pci_find_aer_capability(struct pci_dev *dev) -{ - return 0; -} static inline int pci_disable_pcie_error_reporting(struct pci_dev *dev) { return -EINVAL; -- cgit v1.2.3 From 270c66be9b4a6f2be53ef3aec5dc8e7b07782ec9 Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Sun, 19 Oct 2008 20:35:20 +0800 Subject: PCI: fix AER capability check The 'use pci_find_ext_capability everywhere' cleanup brought a new bug, which makes the AER stop working. Fix it by actually using find_ext_cap instead of just find_cap. Drop the unused config space size define while we're at it. Signed-off-by: Yu Zhao Signed-off-by: Jesse Barnes --- include/linux/aer.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/aer.h b/include/linux/aer.h index a2383a72356..f7df1eefc10 100644 --- a/include/linux/aer.h +++ b/include/linux/aer.h @@ -10,7 +10,6 @@ #if defined(CONFIG_PCIEAER) /* pci-e port driver needs this function to enable aer */ extern int pci_enable_pcie_error_reporting(struct pci_dev *dev); -extern int pci_find_aer_capability(struct pci_dev *dev); extern int pci_disable_pcie_error_reporting(struct pci_dev *dev); extern int pci_cleanup_aer_uncorrect_error_status(struct pci_dev *dev); #else -- cgit v1.2.3 From 96499871f45b9126157b1a5c512d6e30f1635225 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 20 Oct 2008 19:45:43 +0200 Subject: PCI: fix pci_ioremap_bar() on s390 s390 doesn't have ioremap_*, so protect the definition of the new pci_ioremap_bar function with CONFIG_HAS_IOMEM to avoid build breakage. Acked-by: Arjan van de Ven Signed-off-by: Heiko Carstens Signed-off-by: Jesse Barnes --- include/linux/pci.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index 46ad282ffe4..085187be29c 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1119,6 +1119,7 @@ static inline void pci_mmcfg_early_init(void) { } static inline void pci_mmcfg_late_init(void) { } #endif +#ifdef CONFIG_HAS_IOMEM static inline void * pci_ioremap_bar(struct pci_dev *pdev, int bar) { /* @@ -1131,6 +1132,7 @@ static inline void * pci_ioremap_bar(struct pci_dev *pdev, int bar) return ioremap_nocache(pci_resource_start(pdev, bar), pci_resource_len(pdev, bar)); } +#endif #endif /* __KERNEL__ */ #endif /* LINUX_PCI_H */ -- cgit v1.2.3