diff options
Diffstat (limited to 'Documentation/trace')
-rw-r--r-- | Documentation/trace/events-kmem.txt | 107 | ||||
-rw-r--r-- | Documentation/trace/events.txt | 219 | ||||
-rw-r--r-- | Documentation/trace/ftrace-design.txt | 233 | ||||
-rw-r--r-- | Documentation/trace/ftrace.txt | 76 | ||||
-rw-r--r-- | Documentation/trace/function-graph-fold.vim | 42 | ||||
-rw-r--r-- | Documentation/trace/postprocess/trace-pagealloc-postprocess.pl | 418 | ||||
-rw-r--r-- | Documentation/trace/power.txt | 17 | ||||
-rw-r--r-- | Documentation/trace/ring-buffer-design.txt | 955 | ||||
-rw-r--r-- | Documentation/trace/tracepoint-analysis.txt | 327 |
9 files changed, 2330 insertions, 64 deletions
diff --git a/Documentation/trace/events-kmem.txt b/Documentation/trace/events-kmem.txt new file mode 100644 index 00000000000..6ef2a8652e1 --- /dev/null +++ b/Documentation/trace/events-kmem.txt @@ -0,0 +1,107 @@ + Subsystem Trace Points: kmem + +The tracing system kmem captures events related to object and page allocation +within the kernel. Broadly speaking there are four major subheadings. + + o Slab allocation of small objects of unknown type (kmalloc) + o Slab allocation of small objects of known type + o Page allocation + o Per-CPU Allocator Activity + o External Fragmentation + +This document will describe what each of the tracepoints are and why they +might be useful. + +1. Slab allocation of small objects of unknown type +=================================================== +kmalloc call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%s +kmalloc_node call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%s node=%d +kfree call_site=%lx ptr=%p + +Heavy activity for these events may indicate that a specific cache is +justified, particularly if kmalloc slab pages are getting significantly +internal fragmented as a result of the allocation pattern. By correlating +kmalloc with kfree, it may be possible to identify memory leaks and where +the allocation sites were. + + +2. Slab allocation of small objects of known type +================================================= +kmem_cache_alloc call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%s +kmem_cache_alloc_node call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%s node=%d +kmem_cache_free call_site=%lx ptr=%p + +These events are similar in usage to the kmalloc-related events except that +it is likely easier to pin the event down to a specific cache. At the time +of writing, no information is available on what slab is being allocated from, +but the call_site can usually be used to extrapolate that information + +3. Page allocation +================== +mm_page_alloc page=%p pfn=%lu order=%d migratetype=%d gfp_flags=%s +mm_page_alloc_zone_locked page=%p pfn=%lu order=%u migratetype=%d cpu=%d percpu_refill=%d +mm_page_free_direct page=%p pfn=%lu order=%d +mm_pagevec_free page=%p pfn=%lu order=%d cold=%d + +These four events deal with page allocation and freeing. mm_page_alloc is +a simple indicator of page allocator activity. Pages may be allocated from +the per-CPU allocator (high performance) or the buddy allocator. + +If pages are allocated directly from the buddy allocator, the +mm_page_alloc_zone_locked event is triggered. This event is important as high +amounts of activity imply high activity on the zone->lock. Taking this lock +impairs performance by disabling interrupts, dirtying cache lines between +CPUs and serialising many CPUs. + +When a page is freed directly by the caller, the mm_page_free_direct event +is triggered. Significant amounts of activity here could indicate that the +callers should be batching their activities. + +When pages are freed using a pagevec, the mm_pagevec_free is +triggered. Broadly speaking, pages are taken off the LRU lock in bulk and +freed in batch with a pagevec. Significant amounts of activity here could +indicate that the system is under memory pressure and can also indicate +contention on the zone->lru_lock. + +4. Per-CPU Allocator Activity +============================= +mm_page_alloc_zone_locked page=%p pfn=%lu order=%u migratetype=%d cpu=%d percpu_refill=%d +mm_page_pcpu_drain page=%p pfn=%lu order=%d cpu=%d migratetype=%d + +In front of the page allocator is a per-cpu page allocator. It exists only +for order-0 pages, reduces contention on the zone->lock and reduces the +amount of writing on struct page. + +When a per-CPU list is empty or pages of the wrong type are allocated, +the zone->lock will be taken once and the per-CPU list refilled. The event +triggered is mm_page_alloc_zone_locked for each page allocated with the +event indicating whether it is for a percpu_refill or not. + +When the per-CPU list is too full, a number of pages are freed, each one +which triggers a mm_page_pcpu_drain event. + +The individual nature of the events are so that pages can be tracked +between allocation and freeing. A number of drain or refill pages that occur +consecutively imply the zone->lock being taken once. Large amounts of PCP +refills and drains could imply an imbalance between CPUs where too much work +is being concentrated in one place. It could also indicate that the per-CPU +lists should be a larger size. Finally, large amounts of refills on one CPU +and drains on another could be a factor in causing large amounts of cache +line bounces due to writes between CPUs and worth investigating if pages +can be allocated and freed on the same CPU through some algorithm change. + +5. External Fragmentation +========================= +mm_page_alloc_extfrag page=%p pfn=%lu alloc_order=%d fallback_order=%d pageblock_order=%d alloc_migratetype=%d fallback_migratetype=%d fragmenting=%d change_ownership=%d + +External fragmentation affects whether a high-order allocation will be +successful or not. For some types of hardware, this is important although +it is avoided where possible. If the system is using huge pages and needs +to be able to resize the pool over the lifetime of the system, this value +is important. + +Large numbers of this event implies that memory is fragmenting and +high-order allocations will start failing at some time in the future. One +means of reducing the occurange of this event is to increase the size of +min_free_kbytes in increments of 3*pageblock_size*nr_online_nodes where +pageblock_size is usually the size of the default hugepage size. diff --git a/Documentation/trace/events.txt b/Documentation/trace/events.txt index f157d7594ea..02ac6ed38b2 100644 --- a/Documentation/trace/events.txt +++ b/Documentation/trace/events.txt @@ -1,7 +1,7 @@ Event Tracing Documentation written by Theodore Ts'o - Updated by Li Zefan + Updated by Li Zefan and Tom Zanussi 1. Introduction =============== @@ -22,12 +22,12 @@ tracing information should be printed. --------------------------------- The events which are available for tracing can be found in the file -/debug/tracing/available_events. +/sys/kernel/debug/tracing/available_events. To enable a particular event, such as 'sched_wakeup', simply echo it -to /debug/tracing/set_event. For example: +to /sys/kernel/debug/tracing/set_event. For example: - # echo sched_wakeup >> /debug/tracing/set_event + # echo sched_wakeup >> /sys/kernel/debug/tracing/set_event [ Note: '>>' is necessary, otherwise it will firstly disable all the events. ] @@ -35,15 +35,15 @@ to /debug/tracing/set_event. For example: To disable an event, echo the event name to the set_event file prefixed with an exclamation point: - # echo '!sched_wakeup' >> /debug/tracing/set_event + # echo '!sched_wakeup' >> /sys/kernel/debug/tracing/set_event To disable all events, echo an empty line to the set_event file: - # echo > /debug/tracing/set_event + # echo > /sys/kernel/debug/tracing/set_event To enable all events, echo '*:*' or '*:' to the set_event file: - # echo *:* > /debug/tracing/set_event + # echo *:* > /sys/kernel/debug/tracing/set_event The events are organized into subsystems, such as ext4, irq, sched, etc., and a full event name looks like this: <subsystem>:<event>. The @@ -52,29 +52,29 @@ file. All of the events in a subsystem can be specified via the syntax "<subsystem>:*"; for example, to enable all irq events, you can use the command: - # echo 'irq:*' > /debug/tracing/set_event + # echo 'irq:*' > /sys/kernel/debug/tracing/set_event 2.2 Via the 'enable' toggle --------------------------- -The events available are also listed in /debug/tracing/events/ hierarchy +The events available are also listed in /sys/kernel/debug/tracing/events/ hierarchy of directories. To enable event 'sched_wakeup': - # echo 1 > /debug/tracing/events/sched/sched_wakeup/enable + # echo 1 > /sys/kernel/debug/tracing/events/sched/sched_wakeup/enable To disable it: - # echo 0 > /debug/tracing/events/sched/sched_wakeup/enable + # echo 0 > /sys/kernel/debug/tracing/events/sched/sched_wakeup/enable To enable all events in sched subsystem: - # echo 1 > /debug/tracing/events/sched/enable + # echo 1 > /sys/kernel/debug/tracing/events/sched/enable -To eanble all events: +To enable all events: - # echo 1 > /debug/tracing/events/enable + # echo 1 > /sys/kernel/debug/tracing/events/enable When reading one of these enable files, there are four results: @@ -83,8 +83,199 @@ When reading one of these enable files, there are four results: X - there is a mixture of events enabled and disabled ? - this file does not affect any event +2.3 Boot option +--------------- + +In order to facilitate early boot debugging, use boot option: + + trace_event=[event-list] + +The format of this boot option is the same as described in section 2.1. + 3. Defining an event-enabled tracepoint ======================================= See The example provided in samples/trace_events +4. Event formats +================ + +Each trace event has a 'format' file associated with it that contains +a description of each field in a logged event. This information can +be used to parse the binary trace stream, and is also the place to +find the field names that can be used in event filters (see section 5). + +It also displays the format string that will be used to print the +event in text mode, along with the event name and ID used for +profiling. + +Every event has a set of 'common' fields associated with it; these are +the fields prefixed with 'common_'. The other fields vary between +events and correspond to the fields defined in the TRACE_EVENT +definition for that event. + +Each field in the format has the form: + + field:field-type field-name; offset:N; size:N; + +where offset is the offset of the field in the trace record and size +is the size of the data item, in bytes. + +For example, here's the information displayed for the 'sched_wakeup' +event: + +# cat /debug/tracing/events/sched/sched_wakeup/format + +name: sched_wakeup +ID: 60 +format: + field:unsigned short common_type; offset:0; size:2; + field:unsigned char common_flags; offset:2; size:1; + field:unsigned char common_preempt_count; offset:3; size:1; + field:int common_pid; offset:4; size:4; + field:int common_tgid; offset:8; size:4; + + field:char comm[TASK_COMM_LEN]; offset:12; size:16; + field:pid_t pid; offset:28; size:4; + field:int prio; offset:32; size:4; + field:int success; offset:36; size:4; + field:int cpu; offset:40; size:4; + +print fmt: "task %s:%d [%d] success=%d [%03d]", REC->comm, REC->pid, + REC->prio, REC->success, REC->cpu + +This event contains 10 fields, the first 5 common and the remaining 5 +event-specific. All the fields for this event are numeric, except for +'comm' which is a string, a distinction important for event filtering. + +5. Event filtering +================== + +Trace events can be filtered in the kernel by associating boolean +'filter expressions' with them. As soon as an event is logged into +the trace buffer, its fields are checked against the filter expression +associated with that event type. An event with field values that +'match' the filter will appear in the trace output, and an event whose +values don't match will be discarded. An event with no filter +associated with it matches everything, and is the default when no +filter has been set for an event. + +5.1 Expression syntax +--------------------- + +A filter expression consists of one or more 'predicates' that can be +combined using the logical operators '&&' and '||'. A predicate is +simply a clause that compares the value of a field contained within a +logged event with a constant value and returns either 0 or 1 depending +on whether the field value matched (1) or didn't match (0): + + field-name relational-operator value + +Parentheses can be used to provide arbitrary logical groupings and +double-quotes can be used to prevent the shell from interpreting +operators as shell metacharacters. + +The field-names available for use in filters can be found in the +'format' files for trace events (see section 4). + +The relational-operators depend on the type of the field being tested: + +The operators available for numeric fields are: + +==, !=, <, <=, >, >= + +And for string fields they are: + +==, != + +Currently, only exact string matches are supported. + +Currently, the maximum number of predicates in a filter is 16. + +5.2 Setting filters +------------------- + +A filter for an individual event is set by writing a filter expression +to the 'filter' file for the given event. + +For example: + +# cd /debug/tracing/events/sched/sched_wakeup +# echo "common_preempt_count > 4" > filter + +A slightly more involved example: + +# cd /debug/tracing/events/sched/sched_signal_send +# echo "((sig >= 10 && sig < 15) || sig == 17) && comm != bash" > filter + +If there is an error in the expression, you'll get an 'Invalid +argument' error when setting it, and the erroneous string along with +an error message can be seen by looking at the filter e.g.: + +# cd /debug/tracing/events/sched/sched_signal_send +# echo "((sig >= 10 && sig < 15) || dsig == 17) && comm != bash" > filter +-bash: echo: write error: Invalid argument +# cat filter +((sig >= 10 && sig < 15) || dsig == 17) && comm != bash +^ +parse_error: Field not found + +Currently the caret ('^') for an error always appears at the beginning of +the filter string; the error message should still be useful though +even without more accurate position info. + +5.3 Clearing filters +-------------------- + +To clear the filter for an event, write a '0' to the event's filter +file. + +To clear the filters for all events in a subsystem, write a '0' to the +subsystem's filter file. + +5.3 Subsystem filters +--------------------- + +For convenience, filters for every event in a subsystem can be set or +cleared as a group by writing a filter expression into the filter file +at the root of the subsytem. Note however, that if a filter for any +event within the subsystem lacks a field specified in the subsystem +filter, or if the filter can't be applied for any other reason, the +filter for that event will retain its previous setting. This can +result in an unintended mixture of filters which could lead to +confusing (to the user who might think different filters are in +effect) trace output. Only filters that reference just the common +fields can be guaranteed to propagate successfully to all events. + +Here are a few subsystem filter examples that also illustrate the +above points: + +Clear the filters on all events in the sched subsytem: + +# cd /sys/kernel/debug/tracing/events/sched +# echo 0 > filter +# cat sched_switch/filter +none +# cat sched_wakeup/filter +none + +Set a filter using only common fields for all events in the sched +subsytem (all events end up with the same filter): + +# cd /sys/kernel/debug/tracing/events/sched +# echo common_pid == 0 > filter +# cat sched_switch/filter +common_pid == 0 +# cat sched_wakeup/filter +common_pid == 0 + +Attempt to set a filter using a non-common field for all events in the +sched subsytem (all events but those that have a prev_pid field retain +their old filters): + +# cd /sys/kernel/debug/tracing/events/sched +# echo prev_pid == 0 > filter +# cat sched_switch/filter +prev_pid == 0 +# cat sched_wakeup/filter +common_pid == 0 diff --git a/Documentation/trace/ftrace-design.txt b/Documentation/trace/ftrace-design.txt new file mode 100644 index 00000000000..7003e10f10f --- /dev/null +++ b/Documentation/trace/ftrace-design.txt @@ -0,0 +1,233 @@ + function tracer guts + ==================== + +Introduction +------------ + +Here we will cover the architecture pieces that the common function tracing +code relies on for proper functioning. Things are broken down into increasing +complexity so that you can start simple and at least get basic functionality. + +Note that this focuses on architecture implementation details only. If you +want more explanation of a feature in terms of common code, review the common +ftrace.txt file. + + +Prerequisites +------------- + +Ftrace relies on these features being implemented: + STACKTRACE_SUPPORT - implement save_stack_trace() + TRACE_IRQFLAGS_SUPPORT - implement include/asm/irqflags.h + + +HAVE_FUNCTION_TRACER +-------------------- + +You will need to implement the mcount and the ftrace_stub functions. + +The exact mcount symbol name will depend on your toolchain. Some call it +"mcount", "_mcount", or even "__mcount". You can probably figure it out by +running something like: + $ echo 'main(){}' | gcc -x c -S -o - - -pg | grep mcount + call mcount +We'll make the assumption below that the symbol is "mcount" just to keep things +nice and simple in the examples. + +Keep in mind that the ABI that is in effect inside of the mcount function is +*highly* architecture/toolchain specific. We cannot help you in this regard, +sorry. Dig up some old documentation and/or find someone more familiar than +you to bang ideas off of. Typically, register usage (argument/scratch/etc...) +is a major issue at this point, especially in relation to the location of the +mcount call (before/after function prologue). You might also want to look at +how glibc has implemented the mcount function for your architecture. It might +be (semi-)relevant. + +The mcount function should check the function pointer ftrace_trace_function +to see if it is set to ftrace_stub. If it is, there is nothing for you to do, +so return immediately. If it isn't, then call that function in the same way +the mcount function normally calls __mcount_internal -- the first argument is +the "frompc" while the second argument is the "selfpc" (adjusted to remove the +size of the mcount call that is embedded in the function). + +For example, if the function foo() calls bar(), when the bar() function calls +mcount(), the arguments mcount() will pass to the tracer are: + "frompc" - the address bar() will use to return to foo() + "selfpc" - the address bar() (with _mcount() size adjustment) + +Also keep in mind that this mcount function will be called *a lot*, so +optimizing for the default case of no tracer will help the smooth running of +your system when tracing is disabled. So the start of the mcount function is +typically the bare min with checking things before returning. That also means +the code flow should usually kept linear (i.e. no branching in the nop case). +This is of course an optimization and not a hard requirement. + +Here is some pseudo code that should help (these functions should actually be +implemented in assembly): + +void ftrace_stub(void) +{ + return; +} + +void mcount(void) +{ + /* save any bare state needed in order to do initial checking */ + + extern void (*ftrace_trace_function)(unsigned long, unsigned long); + if (ftrace_trace_function != ftrace_stub) + goto do_trace; + + /* restore any bare state */ + + return; + +do_trace: + + /* save all state needed by the ABI (see paragraph above) */ + + unsigned long frompc = ...; + unsigned long selfpc = <return address> - MCOUNT_INSN_SIZE; + ftrace_trace_function(frompc, selfpc); + + /* restore all state needed by the ABI */ +} + +Don't forget to export mcount for modules ! +extern void mcount(void); +EXPORT_SYMBOL(mcount); + + +HAVE_FUNCTION_TRACE_MCOUNT_TEST +------------------------------- + +This is an optional optimization for the normal case when tracing is turned off +in the system. If you do not enable this Kconfig option, the common ftrace +code will take care of doing the checking for you. + +To support this feature, you only need to check the function_trace_stop +variable in the mcount function. If it is non-zero, there is no tracing to be +done at all, so you can return. + +This additional pseudo code would simply be: +void mcount(void) +{ + /* save any bare state needed in order to do initial checking */ + ++ if (function_trace_stop) ++ return; + + extern void (*ftrace_trace_function)(unsigned long, unsigned long); + if (ftrace_trace_function != ftrace_stub) +... + + +HAVE_FUNCTION_GRAPH_TRACER +-------------------------- + +Deep breath ... time to do some real work. Here you will need to update the +mcount function to check ftrace graph function pointers, as well as implement +some functions to save (hijack) and restore the return address. + +The mcount function should check the function pointers ftrace_graph_return +(compare to ftrace_stub) and ftrace_graph_entry (compare to +ftrace_graph_entry_stub). If either of those are not set to the relevant stub +function, call the arch-specific function ftrace_graph_caller which in turn +calls the arch-specific function prepare_ftrace_return. Neither of these +function names are strictly required, but you should use them anyways to stay +consistent across the architecture ports -- easier to compare & contrast +things. + +The arguments to prepare_ftrace_return are slightly different than what are +passed to ftrace_trace_function. The second argument "selfpc" is the same, +but the first argument should be a pointer to the "frompc". Typically this is +located on the stack. This allows the function to hijack the return address +temporarily to have it point to the arch-specific function return_to_handler. +That function will simply call the common ftrace_return_to_handler function and +that will return the original return address with which, you can return to the +original call site. + +Here is the updated mcount pseudo code: +void mcount(void) +{ +... + if (ftrace_trace_function != ftrace_stub) + goto do_trace; + ++#ifdef CONFIG_FUNCTION_GRAPH_TRACER ++ extern void (*ftrace_graph_return)(...); ++ extern void (*ftrace_graph_entry)(...); ++ if (ftrace_graph_return != ftrace_stub || ++ ftrace_graph_entry != ftrace_graph_entry_stub) ++ ftrace_graph_caller(); ++#endif + + /* restore any bare state */ +... + +Here is the pseudo code for the new ftrace_graph_caller assembly function: +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +void ftrace_graph_caller(void) +{ + /* save all state needed by the ABI */ + + unsigned long *frompc = &...; + unsigned long selfpc = <return address> - MCOUNT_INSN_SIZE; + prepare_ftrace_return(frompc, selfpc); + + /* restore all state needed by the ABI */ +} +#endif + +For information on how to implement prepare_ftrace_return(), simply look at +the x86 version. The only architecture-specific piece in it is the setup of +the fault recovery table (the asm(...) code). The rest should be the same +across architectures. + +Here is the pseudo code for the new return_to_handler assembly function. Note +that the ABI that applies here is different from what applies to the mcount +code. Since you are returning from a function (after the epilogue), you might +be able to skimp on things saved/restored (usually just registers used to pass +return values). + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +void return_to_handler(void) +{ + /* save all state needed by the ABI (see paragraph above) */ + + void (*original_return_point)(void) = ftrace_return_to_handler(); + + /* restore all state needed by the ABI */ + + /* this is usually either a return or a jump */ + original_return_point(); +} +#endif + + +HAVE_FTRACE_NMI_ENTER +--------------------- + +If you can't trace NMI functions, then skip this option. + +<details to be filled> + + +HAVE_FTRACE_SYSCALLS +--------------------- + +<details to be filled> + + +HAVE_FTRACE_MCOUNT_RECORD +------------------------- + +See scripts/recordmcount.pl for more info. + +<details to be filled> + + +HAVE_DYNAMIC_FTRACE +--------------------- + +<details to be filled> diff --git a/Documentation/trace/ftrace.txt b/Documentation/trace/ftrace.txt index a39b3c749de..957b22fde2d 100644 --- a/Documentation/trace/ftrace.txt +++ b/Documentation/trace/ftrace.txt @@ -26,6 +26,12 @@ disabled, and more (ftrace allows for tracer plugins, which means that the list of tracers can always grow). +Implementation Details +---------------------- + +See ftrace-design.txt for details for arch porters and such. + + The File System --------------- @@ -85,26 +91,19 @@ of ftrace. Here is a list of some of the key files: This file holds the output of the trace in a human readable format (described below). - latency_trace: - - This file shows the same trace but the information - is organized more to display possible latencies - in the system (described below). - trace_pipe: The output is the same as the "trace" file but this file is meant to be streamed with live tracing. - Reads from this file will block until new data - is retrieved. Unlike the "trace" and "latency_trace" - files, this file is a consumer. This means reading - from this file causes sequential reads to display - more current data. Once data is read from this - file, it is consumed, and will not be read - again with a sequential read. The "trace" and - "latency_trace" files are static, and if the - tracer is not adding more data, they will display - the same information every time they are read. + Reads from this file will block until new data is + retrieved. Unlike the "trace" file, this file is a + consumer. This means reading from this file causes + sequential reads to display more current data. Once + data is read from this file, it is consumed, and + will not be read again with a sequential read. The + "trace" file is static, and if the tracer is not + adding more data,they will display the same + information every time they are read. trace_options: @@ -117,10 +116,10 @@ of ftrace. Here is a list of some of the key files: Some of the tracers record the max latency. For example, the time interrupts are disabled. This time is saved in this file. The max trace - will also be stored, and displayed by either - "trace" or "latency_trace". A new max trace will - only be recorded if the latency is greater than - the value in this file. (in microseconds) + will also be stored, and displayed by "trace". + A new max trace will only be recorded if the + latency is greater than the value in this + file. (in microseconds) buffer_size_kb: @@ -134,7 +133,7 @@ of ftrace. Here is a list of some of the key files: than requested, the rest of the page will be used, making the actual allocation bigger than requested. ( Note, the size may not be a multiple of the page size - due to buffer managment overhead. ) + due to buffer management overhead. ) This can only be updated when the current_tracer is set to "nop". @@ -210,7 +209,7 @@ Here is the list of current tracers that may be configured. the trace with the longest max latency. See tracing_max_latency. When a new max is recorded, it replaces the old trace. It is best to view this - trace via the latency_trace file. + trace with the latency-format option enabled. "preemptoff" @@ -307,8 +306,8 @@ the lowest priority thread (pid 0). Latency trace format -------------------- -For traces that display latency times, the latency_trace file -gives somewhat more information to see why a latency happened. +When the latency-format option is enabled, the trace file gives +somewhat more information to see why a latency happened. Here is a typical trace. # tracer: irqsoff @@ -380,9 +379,10 @@ explains which is which. The above is mostly meaningful for kernel developers. - time: This differs from the trace file output. The trace file output - includes an absolute timestamp. The timestamp used by the - latency_trace file is relative to the start of the trace. + time: When the latency-format option is enabled, the trace file + output includes a timestamp relative to the start of the + trace. This differs from the output when latency-format + is disabled, which includes an absolute timestamp. delay: This is just to help catch your eye a bit better. And needs to be fixed to be only relative to the same CPU. @@ -440,7 +440,8 @@ Here are the available options: sym-addr: bash-4000 [01] 1477.606694: simple_strtoul <c0339346> - verbose - This deals with the latency_trace file. + verbose - This deals with the trace file when the + latency-format option is enabled. bash 4000 1 0 00000000 00010a95 [58127d26] 1720.415ms \ (+0.000ms): simple_strtoul (strict_strtoul) @@ -472,7 +473,7 @@ Here are the available options: the app is no longer running The lookup is performed when you read - trace,trace_pipe,latency_trace. Example: + trace,trace_pipe. Example: a.out-1623 [000] 40874.465068: /root/a.out[+0x480] <-/root/a.out[+0 x494] <- /root/a.out[+0x4a8] <- /lib/libc-2.7.so[+0x1e1a6] @@ -481,6 +482,11 @@ x494] <- /root/a.out[+0x4a8] <- /lib/libc-2.7.so[+0x1e1a6] every scheduling event. Will add overhead if there's a lot of tasks running at once. + latency-format - This option changes the trace. When + it is enabled, the trace displays + additional information about the + latencies, as described in "Latency + trace format". sched_switch ------------ @@ -596,12 +602,13 @@ To reset the maximum, echo 0 into tracing_max_latency. Here is an example: # echo irqsoff > current_tracer + # echo latency-format > trace_options # echo 0 > tracing_max_latency # echo 1 > tracing_enabled # ls -ltr [...] # echo 0 > tracing_enabled - # cat latency_trace + # cat trace # tracer: irqsoff # irqsoff latency trace v1.1.5 on 2.6.26 @@ -703,12 +710,13 @@ which preemption was disabled. The control of preemptoff tracer is much like the irqsoff tracer. # echo preemptoff > current_tracer + # echo latency-format > trace_options # echo 0 > tracing_max_latency # echo 1 > tracing_enabled # ls -ltr [...] # echo 0 > tracing_enabled - # cat latency_trace + # cat trace # tracer: preemptoff # preemptoff latency trace v1.1.5 on 2.6.26-rc8 @@ -850,12 +858,13 @@ Again, using this trace is much like the irqsoff and preemptoff tracers. # echo preemptirqsoff > current_tracer + # echo latency-format > trace_options # echo 0 > tracing_max_latency # echo 1 > tracing_enabled # ls -ltr [...] # echo 0 > tracing_enabled - # cat latency_trace + # cat trace # tracer: preemptirqsoff # preemptirqsoff latency trace v1.1.5 on 2.6.26-rc8 @@ -1012,11 +1021,12 @@ Instead of performing an 'ls', we will run 'sleep 1' under 'chrt' which changes the priority of the task. # echo wakeup > current_tracer + # echo latency-format > trace_options # echo 0 > tracing_max_latency # echo 1 > tracing_enabled # chrt -f 5 sleep 1 # echo 0 > tracing_enabled - # cat latency_trace + # cat trace # tracer: wakeup # wakeup latency trace v1.1.5 on 2.6.26-rc8 diff --git a/Documentation/trace/function-graph-fold.vim b/Documentation/trace/function-graph-fold.vim new file mode 100644 index 00000000000..0544b504c8b --- /dev/null +++ b/Documentation/trace/function-graph-fold.vim @@ -0,0 +1,42 @@ +" Enable folding for ftrace function_graph traces. +" +" To use, :source this file while viewing a function_graph trace, or use vim's +" -S option to load from the command-line together with a trace. You can then +" use the usual vim fold commands, such as "za", to open and close nested +" functions. While closed, a fold will show the total time taken for a call, +" as would normally appear on the line with the closing brace. Folded +" functions will not include finish_task_switch(), so folding should remain +" relatively sane even through a context switch. +" +" Note that this will almost certainly only work well with a +" single-CPU trace (e.g. trace-cmd report --cpu 1). + +function! FunctionGraphFoldExpr(lnum) + let line = getline(a:lnum) + if line[-1:] == '{' + if line =~ 'finish_task_switch() {$' + return '>1' + endif + return 'a1' + elseif line[-1:] == '}' + return 's1' + else + return '=' + endif +endfunction + +function! FunctionGraphFoldText() + let s = split(getline(v:foldstart), '|', 1) + if getline(v:foldend+1) =~ 'finish_task_switch() {$' + let s[2] = ' task switch ' + else + let e = split(getline(v:foldend), '|', 1) + let s[2] = e[2] + endif + return join(s, '|') +endfunction + +setlocal foldexpr=FunctionGraphFoldExpr(v:lnum) +setlocal foldtext=FunctionGraphFoldText() +setlocal foldcolumn=12 +setlocal foldmethod=expr diff --git a/Documentation/trace/postprocess/trace-pagealloc-postprocess.pl b/Documentation/trace/postprocess/trace-pagealloc-postprocess.pl new file mode 100644 index 00000000000..7df50e8cf4d --- /dev/null +++ b/Documentation/trace/postprocess/trace-pagealloc-postprocess.pl @@ -0,0 +1,418 @@ +#!/usr/bin/perl +# This is a POC (proof of concept or piece of crap, take your pick) for reading the +# text representation of trace output related to page allocation. It makes an attempt +# to extract some high-level information on what is going on. The accuracy of the parser +# may vary considerably +# +# Example usage: trace-pagealloc-postprocess.pl < /sys/kernel/debug/tracing/trace_pipe +# other options +# --prepend-parent Report on the parent proc and PID +# --read-procstat If the trace lacks process info, get it from /proc +# --ignore-pid Aggregate processes of the same name together +# +# Copyright (c) IBM Corporation 2009 +# Author: Mel Gorman <mel@csn.ul.ie> +use strict; +use Getopt::Long; + +# Tracepoint events +use constant MM_PAGE_ALLOC => 1; +use constant MM_PAGE_FREE_DIRECT => 2; +use constant MM_PAGEVEC_FREE => 3; +use constant MM_PAGE_PCPU_DRAIN => 4; +use constant MM_PAGE_ALLOC_ZONE_LOCKED => 5; +use constant MM_PAGE_ALLOC_EXTFRAG => 6; +use constant EVENT_UNKNOWN => 7; + +# Constants used to track state +use constant STATE_PCPU_PAGES_DRAINED => 8; +use constant STATE_PCPU_PAGES_REFILLED => 9; + +# High-level events extrapolated from tracepoints +use constant HIGH_PCPU_DRAINS => 10; +use constant HIGH_PCPU_REFILLS => 11; +use constant HIGH_EXT_FRAGMENT => 12; +use constant HIGH_EXT_FRAGMENT_SEVERE => 13; +use constant HIGH_EXT_FRAGMENT_MODERATE => 14; +use constant HIGH_EXT_FRAGMENT_CHANGED => 15; + +my %perprocesspid; +my %perprocess; +my $opt_ignorepid; +my $opt_read_procstat; +my $opt_prepend_parent; + +# Catch sigint and exit on request +my $sigint_report = 0; +my $sigint_exit = 0; +my $sigint_pending = 0; +my $sigint_received = 0; +sub sigint_handler { + my $current_time = time; + if ($current_time - 2 > $sigint_received) { + print "SIGINT received, report pending. Hit ctrl-c again to exit\n"; + $sigint_report = 1; + } else { + if (!$sigint_exit) { + print "Second SIGINT received quickly, exiting\n"; + } + $sigint_exit++; + } + + if ($sigint_exit > 3) { + print "Many SIGINTs received, exiting now without report\n"; + exit; + } + + $sigint_received = $current_time; + $sigint_pending = 1; +} +$SIG{INT} = "sigint_handler"; + +# Parse command line options +GetOptions( + 'ignore-pid' => \$opt_ignorepid, + 'read-procstat' => \$opt_read_procstat, + 'prepend-parent' => \$opt_prepend_parent, +); + +# Defaults for dynamically discovered regex's +my $regex_fragdetails_default = 'page=([0-9a-f]*) pfn=([0-9]*) alloc_order=([-0-9]*) fallback_order=([-0-9]*) pageblock_order=([-0-9]*) alloc_migratetype=([-0-9]*) fallback_migratetype=([-0-9]*) fragmenting=([-0-9]) change_ownership=([-0-9])'; + +# Dyanically discovered regex +my $regex_fragdetails; + +# Static regex used. Specified like this for readability and for use with /o +# (process_pid) (cpus ) ( time ) (tpoint ) (details) +my $regex_traceevent = '\s*([a-zA-Z0-9-]*)\s*(\[[0-9]*\])\s*([0-9.]*):\s*([a-zA-Z_]*):\s*(.*)'; +my $regex_statname = '[-0-9]*\s\((.*)\).*'; +my $regex_statppid = '[-0-9]*\s\(.*\)\s[A-Za-z]\s([0-9]*).*'; + +sub generate_traceevent_regex { + my $event = shift; + my $default = shift; + my $regex; + + # Read the event format or use the default + if (!open (FORMAT, "/sys/kernel/debug/tracing/events/$event/format")) { + $regex = $default; + } else { + my $line; + while (!eof(FORMAT)) { + $line = <FORMAT>; + if ($line =~ /^print fmt:\s"(.*)",.*/) { + $regex = $1; + $regex =~ s/%p/\([0-9a-f]*\)/g; + $regex =~ s/%d/\([-0-9]*\)/g; + $regex =~ s/%lu/\([0-9]*\)/g; + } + } + } + + # Verify fields are in the right order + my $tuple; + foreach $tuple (split /\s/, $regex) { + my ($key, $value) = split(/=/, $tuple); + my $expected = shift; + if ($key ne $expected) { + print("WARNING: Format not as expected '$key' != '$expected'"); + $regex =~ s/$key=\((.*)\)/$key=$1/; + } + } + + if (defined shift) { + die("Fewer fields than expected in format"); + } + + return $regex; +} +$regex_fragdetails = generate_traceevent_regex("kmem/mm_page_alloc_extfrag", + $regex_fragdetails_default, + "page", "pfn", + "alloc_order", "fallback_order", "pageblock_order", + "alloc_migratetype", "fallback_migratetype", + "fragmenting", "change_ownership"); + +sub read_statline($) { + my $pid = $_[0]; + my $statline; + + if (open(STAT, "/proc/$pid/stat")) { + $statline = <STAT>; + close(STAT); + } + + if ($statline eq '') { + $statline = "-1 (UNKNOWN_PROCESS_NAME) R 0"; + } + + return $statline; +} + +sub guess_process_pid($$) { + my $pid = $_[0]; + my $statline = $_[1]; + + if ($pid == 0) { + return "swapper-0"; + } + + if ($statline !~ /$regex_statname/o) { + die("Failed to math stat line for process name :: $statline"); + } + return "$1-$pid"; +} + +sub parent_info($$) { + my $pid = $_[0]; + my $statline = $_[1]; + my $ppid; + + if ($pid == 0) { + return "NOPARENT-0"; + } + + if ($statline !~ /$regex_statppid/o) { + die("Failed to match stat line process ppid:: $statline"); + } + + # Read the ppid stat line + $ppid = $1; + return guess_process_pid($ppid, read_statline($ppid)); +} + +sub process_events { + my $traceevent; + my $process_pid; + my $cpus; + my $timestamp; + my $tracepoint; + my $details; + my $statline; + + # Read each line of the event log +EVENT_PROCESS: + while ($traceevent = <STDIN>) { + if ($traceevent =~ /$regex_traceevent/o) { + $process_pid = $1; + $tracepoint = $4; + + if ($opt_read_procstat || $opt_prepend_parent) { + $process_pid =~ /(.*)-([0-9]*)$/; + my $process = $1; + my $pid = $2; + + $statline = read_statline($pid); + + if ($opt_read_procstat && $process eq '') { + $process_pid = guess_process_pid($pid, $statline); + } + + if ($opt_prepend_parent) { + $process_pid = parent_info($pid, $statline) . " :: $process_pid"; + } + } + + # Unnecessary in this script. Uncomment if required + # $cpus = $2; + # $timestamp = $3; + } else { + next; + } + + # Perl Switch() sucks majorly + if ($tracepoint eq "mm_page_alloc") { + $perprocesspid{$process_pid}->{MM_PAGE_ALLOC}++; + } elsif ($tracepoint eq "mm_page_free_direct") { + $perprocesspid{$process_pid}->{MM_PAGE_FREE_DIRECT}++; + } elsif ($tracepoint eq "mm_pagevec_free") { + $perprocesspid{$process_pid}->{MM_PAGEVEC_FREE}++; + } elsif ($tracepoint eq "mm_page_pcpu_drain") { + $perprocesspid{$process_pid}->{MM_PAGE_PCPU_DRAIN}++; + $perprocesspid{$process_pid}->{STATE_PCPU_PAGES_DRAINED}++; + } elsif ($tracepoint eq "mm_page_alloc_zone_locked") { + $perprocesspid{$process_pid}->{MM_PAGE_ALLOC_ZONE_LOCKED}++; + $perprocesspid{$process_pid}->{STATE_PCPU_PAGES_REFILLED}++; + } elsif ($tracepoint eq "mm_page_alloc_extfrag") { + + # Extract the details of the event now + $details = $5; + + my ($page, $pfn); + my ($alloc_order, $fallback_order, $pageblock_order); + my ($alloc_migratetype, $fallback_migratetype); + my ($fragmenting, $change_ownership); + + if ($details !~ /$regex_fragdetails/o) { + print "WARNING: Failed to parse mm_page_alloc_extfrag as expected\n"; + next; + } + + $perprocesspid{$process_pid}->{MM_PAGE_ALLOC_EXTFRAG}++; + $page = $1; + $pfn = $2; + $alloc_order = $3; + $fallback_order = $4; + $pageblock_order = $5; + $alloc_migratetype = $6; + $fallback_migratetype = $7; + $fragmenting = $8; + $change_ownership = $9; + + if ($fragmenting) { + $perprocesspid{$process_pid}->{HIGH_EXT_FRAG}++; + if ($fallback_order <= 3) { + $perprocesspid{$process_pid}->{HIGH_EXT_FRAGMENT_SEVERE}++; + } else { + $perprocesspid{$process_pid}->{HIGH_EXT_FRAGMENT_MODERATE}++; + } + } + if ($change_ownership) { + $perprocesspid{$process_pid}->{HIGH_EXT_FRAGMENT_CHANGED}++; + } + } else { + $perprocesspid{$process_pid}->{EVENT_UNKNOWN}++; + } + + # Catch a full pcpu drain event + if ($perprocesspid{$process_pid}->{STATE_PCPU_PAGES_DRAINED} && + $tracepoint ne "mm_page_pcpu_drain") { + + $perprocesspid{$process_pid}->{HIGH_PCPU_DRAINS}++; + $perprocesspid{$process_pid}->{STATE_PCPU_PAGES_DRAINED} = 0; + } + + # Catch a full pcpu refill event + if ($perprocesspid{$process_pid}->{STATE_PCPU_PAGES_REFILLED} && + $tracepoint ne "mm_page_alloc_zone_locked") { + $perprocesspid{$process_pid}->{HIGH_PCPU_REFILLS}++; + $perprocesspid{$process_pid}->{STATE_PCPU_PAGES_REFILLED} = 0; + } + + if ($sigint_pending) { + last EVENT_PROCESS; + } + } +} + +sub dump_stats { + my $hashref = shift; + my %stats = %$hashref; + + # Dump per-process stats + my $process_pid; + my $max_strlen = 0; + + # Get the maximum process name + foreach $process_pid (keys %perprocesspid) { + my $len = length($process_pid); + if ($len > $max_strlen) { + $max_strlen = $len; + } + } + $max_strlen += 2; + + printf("\n"); + printf("%-" . $max_strlen . "s %8s %10s %8s %8s %8s %8s %8s %8s %8s %8s %8s %8s %8s\n", + "Process", "Pages", "Pages", "Pages", "Pages", "PCPU", "PCPU", "PCPU", "Fragment", "Fragment", "MigType", "Fragment", "Fragment", "Unknown"); + printf("%-" . $max_strlen . "s %8s %10s %8s %8s %8s %8s %8s %8s %8s %8s %8s %8s %8s\n", + "details", "allocd", "allocd", "freed", "freed", "pages", "drains", "refills", "Fallback", "Causing", "Changed", "Severe", "Moderate", ""); + + printf("%-" . $max_strlen . "s %8s %10s %8s %8s %8s %8s %8s %8s %8s %8s %8s %8s %8s\n", + "", "", "under lock", "direct", "pagevec", "drain", "", "", "", "", "", "", "", ""); + + foreach $process_pid (keys %stats) { + # Dump final aggregates + if ($stats{$process_pid}->{STATE_PCPU_PAGES_DRAINED}) { + $stats{$process_pid}->{HIGH_PCPU_DRAINS}++; + $stats{$process_pid}->{STATE_PCPU_PAGES_DRAINED} = 0; + } + if ($stats{$process_pid}->{STATE_PCPU_PAGES_REFILLED}) { + $stats{$process_pid}->{HIGH_PCPU_REFILLS}++; + $stats{$process_pid}->{STATE_PCPU_PAGES_REFILLED} = 0; + } + + printf("%-" . $max_strlen . "s %8d %10d %8d %8d %8d %8d %8d %8d %8d %8d %8d %8d %8d\n", + $process_pid, + $stats{$process_pid}->{MM_PAGE_ALLOC}, + $stats{$process_pid}->{MM_PAGE_ALLOC_ZONE_LOCKED}, + $stats{$process_pid}->{MM_PAGE_FREE_DIRECT}, + $stats{$process_pid}->{MM_PAGEVEC_FREE}, + $stats{$process_pid}->{MM_PAGE_PCPU_DRAIN}, + $stats{$process_pid}->{HIGH_PCPU_DRAINS}, + $stats{$process_pid}->{HIGH_PCPU_REFILLS}, + $stats{$process_pid}->{MM_PAGE_ALLOC_EXTFRAG}, + $stats{$process_pid}->{HIGH_EXT_FRAG}, + $stats{$process_pid}->{HIGH_EXT_FRAGMENT_CHANGED}, + $stats{$process_pid}->{HIGH_EXT_FRAGMENT_SEVERE}, + $stats{$process_pid}->{HIGH_EXT_FRAGMENT_MODERATE}, + $stats{$process_pid}->{EVENT_UNKNOWN}); + } +} + +sub aggregate_perprocesspid() { + my $process_pid; + my $process; + undef %perprocess; + + foreach $process_pid (keys %perprocesspid) { + $process = $process_pid; + $process =~ s/-([0-9])*$//; + if ($process eq '') { + $process = "NO_PROCESS_NAME"; + } + + $perprocess{$process}->{MM_PAGE_ALLOC} += $perprocesspid{$process_pid}->{MM_PAGE_ALLOC}; + $perprocess{$process}->{MM_PAGE_ALLOC_ZONE_LOCKED} += $perprocesspid{$process_pid}->{MM_PAGE_ALLOC_ZONE_LOCKED}; + $perprocess{$process}->{MM_PAGE_FREE_DIRECT} += $perprocesspid{$process_pid}->{MM_PAGE_FREE_DIRECT}; + $perprocess{$process}->{MM_PAGEVEC_FREE} += $perprocesspid{$process_pid}->{MM_PAGEVEC_FREE}; + $perprocess{$process}->{MM_PAGE_PCPU_DRAIN} += $perprocesspid{$process_pid}->{MM_PAGE_PCPU_DRAIN}; + $perprocess{$process}->{HIGH_PCPU_DRAINS} += $perprocesspid{$process_pid}->{HIGH_PCPU_DRAINS}; + $perprocess{$process}->{HIGH_PCPU_REFILLS} += $perprocesspid{$process_pid}->{HIGH_PCPU_REFILLS}; + $perprocess{$process}->{MM_PAGE_ALLOC_EXTFRAG} += $perprocesspid{$process_pid}->{MM_PAGE_ALLOC_EXTFRAG}; + $perprocess{$process}->{HIGH_EXT_FRAG} += $perprocesspid{$process_pid}->{HIGH_EXT_FRAG}; + $perprocess{$process}->{HIGH_EXT_FRAGMENT_CHANGED} += $perprocesspid{$process_pid}->{HIGH_EXT_FRAGMENT_CHANGED}; + $perprocess{$process}->{HIGH_EXT_FRAGMENT_SEVERE} += $perprocesspid{$process_pid}->{HIGH_EXT_FRAGMENT_SEVERE}; + $perprocess{$process}->{HIGH_EXT_FRAGMENT_MODERATE} += $perprocesspid{$process_pid}->{HIGH_EXT_FRAGMENT_MODERATE}; + $perprocess{$process}->{EVENT_UNKNOWN} += $perprocesspid{$process_pid}->{EVENT_UNKNOWN}; + } +} + +sub report() { + if (!$opt_ignorepid) { + dump_stats(\%perprocesspid); + } else { + aggregate_perprocesspid(); + dump_stats(\%perprocess); + } +} + +# Process events or signals until neither is available +sub signal_loop() { + my $sigint_processed; + do { + $sigint_processed = 0; + process_events(); + + # Handle pending signals if any + if ($sigint_pending) { + my $current_time = time; + + if ($sigint_exit) { + print "Received exit signal\n"; + $sigint_pending = 0; + } + if ($sigint_report) { + if ($current_time >= $sigint_received + 2) { + report(); + $sigint_report = 0; + $sigint_pending = 0; + $sigint_processed = 1; + } + } + } + } while ($sigint_pending || $sigint_processed); +} + +signal_loop(); +report(); diff --git a/Documentation/trace/power.txt b/Documentation/trace/power.txt deleted file mode 100644 index cd805e16dc2..00000000000 --- a/Documentation/trace/power.txt +++ /dev/null @@ -1,17 +0,0 @@ -The power tracer collects detailed information about C-state and P-state -transitions, instead of just looking at the high-level "average" -information. - -There is a helper script found in scrips/tracing/power.pl in the kernel -sources which can be used to parse this information and create a -Scalable Vector Graphics (SVG) picture from the trace data. - -To use this tracer: - - echo 0 > /sys/kernel/debug/tracing/tracing_enabled - echo power > /sys/kernel/debug/tracing/current_tracer - echo 1 > /sys/kernel/debug/tracing/tracing_enabled - sleep 1 - echo 0 > /sys/kernel/debug/tracing/tracing_enabled - cat /sys/kernel/debug/tracing/trace | \ - perl scripts/tracing/power.pl > out.sv diff --git a/Documentation/trace/ring-buffer-design.txt b/Documentation/trace/ring-buffer-design.txt new file mode 100644 index 00000000000..5b1d23d604c --- /dev/null +++ b/Documentation/trace/ring-buffer-design.txt @@ -0,0 +1,955 @@ + Lockless Ring Buffer Design + =========================== + +Copyright 2009 Red Hat Inc. + Author: Steven Rostedt <srostedt@redhat.com> + License: The GNU Free Documentation License, Version 1.2 + (dual licensed under the GPL v2) +Reviewers: Mathieu Desnoyers, Huang Ying, Hidetoshi Seto, + and Frederic Weisbecker. + + +Written for: 2.6.31 + +Terminology used in this Document +--------------------------------- + +tail - where new writes happen in the ring buffer. + +head - where new reads happen in the ring buffer. + +producer - the task that writes into the ring buffer (same as writer) + +writer - same as producer + +consumer - the task that reads from the buffer (same as reader) + +reader - same as consumer. + +reader_page - A page outside the ring buffer used solely (for the most part) + by the reader. + +head_page - a pointer to the page that the reader will use next + +tail_page - a pointer to the page that will be written to next + +commit_page - a pointer to the page with the last finished non nested write. + +cmpxchg - hardware assisted atomic transaction that performs the following: + + A = B iff previous A == C + + R = cmpxchg(A, C, B) is saying that we replace A with B if and only if + current A is equal to C, and we put the old (current) A into R + + R gets the previous A regardless if A is updated with B or not. + + To see if the update was successful a compare of R == C may be used. + +The Generic Ring Buffer +----------------------- + +The ring buffer can be used in either an overwrite mode or in +producer/consumer mode. + +Producer/consumer mode is where the producer were to fill up the +buffer before the consumer could free up anything, the producer +will stop writing to the buffer. This will lose most recent events. + +Overwrite mode is where the produce were to fill up the buffer +before the consumer could free up anything, the producer will +overwrite the older data. This will lose the oldest events. + +No two writers can write at the same time (on the same per cpu buffer), +but a writer may interrupt another writer, but it must finish writing +before the previous writer may continue. This is very important to the +algorithm. The writers act like a "stack". The way interrupts works +enforces this behavior. + + + writer1 start + <preempted> writer2 start + <preempted> writer3 start + writer3 finishes + writer2 finishes + writer1 finishes + +This is very much like a writer being preempted by an interrupt and +the interrupt doing a write as well. + +Readers can happen at any time. But no two readers may run at the +same time, nor can a reader preempt/interrupt another reader. A reader +can not preempt/interrupt a writer, but it may read/consume from the +buffer at the same time as a writer is writing, but the reader must be +on another processor to do so. A reader may read on its own processor +and can be preempted by a writer. + +A writer can preempt a reader, but a reader can not preempt a writer. +But a reader can read the buffer at the same time (on another processor) +as a writer. + +The ring buffer is made up of a list of pages held together by a link list. + +At initialization a reader page is allocated for the reader that is not +part of the ring buffer. + +The head_page, tail_page and commit_page are all initialized to point +to the same page. + +The reader page is initialized to have its next pointer pointing to +the head page, and its previous pointer pointing to a page before +the head page. + +The reader has its own page to use. At start up time, this page is +allocated but is not attached to the list. When the reader wants +to read from the buffer, if its page is empty (like it is on start up) +it will swap its page with the head_page. The old reader page will +become part of the ring buffer and the head_page will be removed. +The page after the inserted page (old reader_page) will become the +new head page. + +Once the new page is given to the reader, the reader could do what +it wants with it, as long as a writer has left that page. + +A sample of how the reader page is swapped: Note this does not +show the head page in the buffer, it is for demonstrating a swap +only. + + +------+ + |reader| RING BUFFER + |page | + +------+ + +---+ +---+ +---+ + | |-->| |-->| | + | |<--| |<--| | + +---+ +---+ +---+ + ^ | ^ | + | +-------------+ | + +-----------------+ + + + +------+ + |reader| RING BUFFER + |page |-------------------+ + +------+ v + | +---+ +---+ +---+ + | | |-->| |-->| | + | | |<--| |<--| |<-+ + | +---+ +---+ +---+ | + | ^ | ^ | | + | | +-------------+ | | + | +-----------------+ | + +------------------------------------+ + + +------+ + |reader| RING BUFFER + |page |-------------------+ + +------+ <---------------+ v + | ^ +---+ +---+ +---+ + | | | |-->| |-->| | + | | | | | |<--| |<-+ + | | +---+ +---+ +---+ | + | | | ^ | | + | | +-------------+ | | + | +-----------------------------+ | + +------------------------------------+ + + +------+ + |buffer| RING BUFFER + |page |-------------------+ + +------+ <---------------+ v + | ^ +---+ +---+ +---+ + | | | | | |-->| | + | | New | | | |<--| |<-+ + | | Reader +---+ +---+ +---+ | + | | page ----^ | | + | | | | + | +-----------------------------+ | + +------------------------------------+ + + + +It is possible that the page swapped is the commit page and the tail page, +if what is in the ring buffer is less than what is held in a buffer page. + + + reader page commit page tail page + | | | + v | | + +---+ | | + | |<----------+ | + | |<------------------------+ + | |------+ + +---+ | + | + v + +---+ +---+ +---+ +---+ +<---| |--->| |--->| |--->| |---> +--->| |<---| |<---| |<---| |<--- + +---+ +---+ +---+ +---+ + +This case is still valid for this algorithm. +When the writer leaves the page, it simply goes into the ring buffer +since the reader page still points to the next location in the ring +buffer. + + +The main pointers: + + reader page - The page used solely by the reader and is not part + of the ring buffer (may be swapped in) + + head page - the next page in the ring buffer that will be swapped + with the reader page. + + tail page - the page where the next write will take place. + + commit page - the page that last finished a write. + +The commit page only is updated by the outer most writer in the +writer stack. A writer that preempts another writer will not move the +commit page. + +When data is written into the ring buffer, a position is reserved +in the ring buffer and passed back to the writer. When the writer +is finished writing data into that position, it commits the write. + +Another write (or a read) may take place at anytime during this +transaction. If another write happens it must finish before continuing +with the previous write. + + + Write reserve: + + Buffer page + +---------+ + |written | + +---------+ <--- given back to writer (current commit) + |reserved | + +---------+ <--- tail pointer + | empty | + +---------+ + + Write commit: + + Buffer page + +---------+ + |written | + +---------+ + |written | + +---------+ <--- next positon for write (current commit) + | empty | + +---------+ + + + If a write happens after the first reserve: + + Buffer page + +---------+ + |written | + +---------+ <-- current commit + |reserved | + +---------+ <--- given back to second writer + |reserved | + +---------+ <--- tail pointer + + After second writer commits: + + + Buffer page + +---------+ + |written | + +---------+ <--(last full commit) + |reserved | + +---------+ + |pending | + |commit | + +---------+ <--- tail pointer + + When the first writer commits: + + Buffer page + +---------+ + |written | + +---------+ + |written | + +---------+ + |written | + +---------+ <--(last full commit and tail pointer) + + +The commit pointer points to the last write location that was +committed without preempting another write. When a write that +preempted another write is committed, it only becomes a pending commit +and will not be a full commit till all writes have been committed. + +The commit page points to the page that has the last full commit. +The tail page points to the page with the last write (before +committing). + +The tail page is always equal to or after the commit page. It may +be several pages ahead. If the tail page catches up to the commit +page then no more writes may take place (regardless of the mode +of the ring buffer: overwrite and produce/consumer). + +The order of pages are: + + head page + commit page + tail page + +Possible scenario: + tail page + head page commit page | + | | | + v v v + +---+ +---+ +---+ +---+ +<---| |--->| |--->| |--->| |---> +--->| |<---| |<---| |<---| |<--- + +---+ +---+ +---+ +---+ + +There is a special case that the head page is after either the commit page +and possibly the tail page. That is when the commit (and tail) page has been +swapped with the reader page. This is because the head page is always +part of the ring buffer, but the reader page is not. When ever there +has been less than a full page that has been committed inside the ring buffer, +and a reader swaps out a page, it will be swapping out the commit page. + + + reader page commit page tail page + | | | + v | | + +---+ | | + | |<----------+ | + | |<------------------------+ + | |------+ + +---+ | + | + v + +---+ +---+ +---+ +---+ +<---| |--->| |--->| |--->| |---> +--->| |<---| |<---| |<---| |<--- + +---+ +---+ +---+ +---+ + ^ + | + head page + + +In this case, the head page will not move when the tail and commit +move back into the ring buffer. + +The reader can not swap a page into the ring buffer if the commit page +is still on that page. If the read meets the last commit (real commit +not pending or reserved), then there is nothing more to read. +The buffer is considered empty until another full commit finishes. + +When the tail meets the head page, if the buffer is in overwrite mode, +the head page will be pushed ahead one. If the buffer is in producer/consumer +mode, the write will fail. + +Overwrite mode: + + tail page + | + v + +---+ +---+ +---+ +---+ +<---| |--->| |--->| |--->| |---> +--->| |<---| |<---| |<---| |<--- + +---+ +---+ +---+ +---+ + ^ + | + head page + + + tail page + | + v + +---+ +---+ +---+ +---+ +<---| |--->| |--->| |--->| |---> +--->| |<---| |<---| |<---| |<--- + +---+ +---+ +---+ +---+ + ^ + | + head page + + + tail page + | + v + +---+ +---+ +---+ +---+ +<---| |--->| |--->| |--->| |---> +--->| |<---| |<---| |<---| |<--- + +---+ +---+ +---+ +---+ + ^ + | + head page + +Note, the reader page will still point to the previous head page. +But when a swap takes place, it will use the most recent head page. + + +Making the Ring Buffer Lockless: +-------------------------------- + +The main idea behind the lockless algorithm is to combine the moving +of the head_page pointer with the swapping of pages with the reader. +State flags are placed inside the pointer to the page. To do this, +each page must be aligned in memory by 4 bytes. This will allow the 2 +least significant bits of the address to be used as flags. Since +they will always be zero for the address. To get the address, +simply mask out the flags. + + MASK = ~3 + + address & MASK + +Two flags will be kept by these two bits: + + HEADER - the page being pointed to is a head page + + UPDATE - the page being pointed to is being updated by a writer + and was or is about to be a head page. + + + reader page + | + v + +---+ + | |------+ + +---+ | + | + v + +---+ +---+ +---+ +---+ +<---| |--->| |-H->| |--->| |---> +--->| |<---| |<---| |<---| |<--- + +---+ +---+ +---+ +---+ + + +The above pointer "-H->" would have the HEADER flag set. That is +the next page is the next page to be swapped out by the reader. +This pointer means the next page is the head page. + +When the tail page meets the head pointer, it will use cmpxchg to +change the pointer to the UPDATE state: + + + tail page + | + v + +---+ +---+ +---+ +---+ +<---| |--->| |-H->| |--->| |---> +--->| |<---| |<---| |<---| |<--- + +---+ +---+ +---+ +---+ + + tail page + | + v + +---+ +---+ +---+ +---+ +<---| |--->| |-U->| |--->| |---> +--->| |<---| |<---| |<---| |<--- + +---+ +---+ +---+ +---+ + +"-U->" represents a pointer in the UPDATE state. + +Any access to the reader will need to take some sort of lock to serialize +the readers. But the writers will never take a lock to write to the +ring buffer. This means we only need to worry about a single reader, +and writes only preempt in "stack" formation. + +When the reader tries to swap the page with the ring buffer, it +will also use cmpxchg. If the flag bit in the pointer to the +head page does not have the HEADER flag set, the compare will fail +and the reader will need to look for the new head page and try again. +Note, the flag UPDATE and HEADER are never set at the same time. + +The reader swaps the reader page as follows: + + +------+ + |reader| RING BUFFER + |page | + +------+ + +---+ +---+ +---+ + | |--->| |--->| | + | |<---| |<---| | + +---+ +---+ +---+ + ^ | ^ | + | +---------------+ | + +-----H-------------+ + +The reader sets the reader page next pointer as HEADER to the page after +the head page. + + + +------+ + |reader| RING BUFFER + |page |-------H-----------+ + +------+ v + | +---+ +---+ +---+ + | | |--->| |--->| | + | | |<---| |<---| |<-+ + | +---+ +---+ +---+ | + | ^ | ^ | | + | | +---------------+ | | + | +-----H-------------+ | + +--------------------------------------+ + +It does a cmpxchg with the pointer to the previous head page to make it +point to the reader page. Note that the new pointer does not have the HEADER +flag set. This action atomically moves the head page forward. + + +------+ + |reader| RING BUFFER + |page |-------H-----------+ + +------+ v + | ^ +---+ +---+ +---+ + | | | |-->| |-->| | + | | | |<--| |<--| |<-+ + | | +---+ +---+ +---+ | + | | | ^ | | + | | +-------------+ | | + | +-----------------------------+ | + +------------------------------------+ + +After the new head page is set, the previous pointer of the head page is +updated to the reader page. + + +------+ + |reader| RING BUFFER + |page |-------H-----------+ + +------+ <---------------+ v + | ^ +---+ +---+ +---+ + | | | |-->| |-->| | + | | | | | |<--| |<-+ + | | +---+ +---+ +---+ | + | | | ^ | | + | | +-------------+ | | + | +-----------------------------+ | + +------------------------------------+ + + +------+ + |buffer| RING BUFFER + |page |-------H-----------+ <--- New head page + +------+ <---------------+ v + | ^ +---+ +---+ +---+ + | | | | | |-->| | + | | New | | | |<--| |<-+ + | | Reader +---+ +---+ +---+ | + | | page ----^ | | + | | | | + | +-----------------------------+ | + +------------------------------------+ + +Another important point. The page that the reader page points back to +by its previous pointer (the one that now points to the new head page) +never points back to the reader page. That is because the reader page is +not part of the ring buffer. Traversing the ring buffer via the next pointers +will always stay in the ring buffer. Traversing the ring buffer via the +prev pointers may not. + +Note, the way to determine a reader page is simply by examining the previous +pointer of the page. If the next pointer of the previous page does not +point back to the original page, then the original page is a reader page: + + + +--------+ + | reader | next +----+ + | page |-------->| |<====== (buffer page) + +--------+ +----+ + | | ^ + | v | next + prev | +----+ + +------------->| | + +----+ + +The way the head page moves forward: + +When the tail page meets the head page and the buffer is in overwrite mode +and more writes take place, the head page must be moved forward before the +writer may move the tail page. The way this is done is that the writer +performs a cmpxchg to convert the pointer to the head page from the HEADER +flag to have the UPDATE flag set. Once this is done, the reader will +not be able to swap the head page from the buffer, nor will it be able to +move the head page, until the writer is finished with the move. + +This eliminates any races that the reader can have on the writer. The reader +must spin, and this is why the reader can not preempt the writer. + + tail page + | + v + +---+ +---+ +---+ +---+ +<---| |--->| |-H->| |--->| |---> +--->| |<---| |<---| |<---| |<--- + +---+ +---+ +---+ +---+ + + tail page + | + v + +---+ +---+ +---+ +---+ +<---| |--->| |-U->| |--->| |---> +--->| |<---| |<---| |<---| |<--- + +---+ +---+ +---+ +---+ + +The following page will be made into the new head page. + + tail page + | + v + +---+ +---+ +---+ +---+ +<---| |--->| |-U->| |-H->| |---> +--->| |<---| |<---| |<---| |<--- + +---+ +---+ +---+ +---+ + +After the new head page has been set, we can set the old head page +pointer back to NORMAL. + + tail page + | + v + +---+ +---+ +---+ +---+ +<---| |--->| |--->| |-H->| |---> +--->| |<---| |<---| |<---| |<--- + +---+ +---+ +---+ +---+ + +After the head page has been moved, the tail page may now move forward. + + tail page + | + v + +---+ +---+ +---+ +---+ +<---| |--->| |--->| |-H->| |---> +--->| |<---| |<---| |<---| |<--- + +---+ +---+ +---+ +---+ + + +The above are the trivial updates. Now for the more complex scenarios. + + +As stated before, if enough writes preempt the first write, the +tail page may make it all the way around the buffer and meet the commit +page. At this time, we must start dropping writes (usually with some kind +of warning to the user). But what happens if the commit was still on the +reader page? The commit page is not part of the ring buffer. The tail page +must account for this. + + + reader page commit page + | | + v | + +---+ | + | |<----------+ + | | + | |------+ + +---+ | + | + v + +---+ +---+ +---+ +---+ +<---| |--->| |-H->| |--->| |---> +--->| |<---| |<---| |<---| |<--- + +---+ +---+ +---+ +---+ + ^ + | + tail page + +If the tail page were to simply push the head page forward, the commit when +leaving the reader page would not be pointing to the correct page. + +The solution to this is to test if the commit page is on the reader page +before pushing the head page. If it is, then it can be assumed that the +tail page wrapped the buffer, and we must drop new writes. + +This is not a race condition, because the commit page can only be moved +by the outter most writer (the writer that was preempted). +This means that the commit will not move while a writer is moving the +tail page. The reader can not swap the reader page if it is also being +used as the commit page. The reader can simply check that the commit +is off the reader page. Once the commit page leaves the reader page +it will never go back on it unless a reader does another swap with the +buffer page that is also the commit page. + + +Nested writes +------------- + +In the pushing forward of the tail page we must first push forward +the head page if the head page is the next page. If the head page +is not the next page, the tail page is simply updated with a cmpxchg. + +Only writers move the tail page. This must be done atomically to protect +against nested writers. + + temp_page = tail_page + next_page = temp_page->next + cmpxchg(tail_page, temp_page, next_page) + +The above will update the tail page if it is still pointing to the expected +page. If this fails, a nested write pushed it forward, the the current write +does not need to push it. + + + temp page + | + v + tail page + | + v + +---+ +---+ +---+ +---+ +<---| |--->| |--->| |--->| |---> +--->| |<---| |<---| |<---| |<--- + +---+ +---+ +---+ +---+ + +Nested write comes in and moves the tail page forward: + + tail page (moved by nested writer) + temp page | + | | + v v + +---+ +---+ +---+ +---+ +<---| |--->| |--->| |--->| |---> +--->| |<---| |<---| |<---| |<--- + +---+ +---+ +---+ +---+ + +The above would fail the cmpxchg, but since the tail page has already +been moved forward, the writer will just try again to reserve storage +on the new tail page. + +But the moving of the head page is a bit more complex. + + tail page + | + v + +---+ +---+ +---+ +---+ +<---| |--->| |-H->| |--->| |---> +--->| |<---| |<---| |<---| |<--- + +---+ +---+ +---+ +---+ + +The write converts the head page pointer to UPDATE. + + tail page + | + v + +---+ +---+ +---+ +---+ +<---| |--->| |-U->| |--->| |---> +--->| |<---| |<---| |<---| |<--- + +---+ +---+ +---+ +---+ + +But if a nested writer preempts here. It will see that the next +page is a head page, but it is also nested. It will detect that +it is nested and will save that information. The detection is the +fact that it sees the UPDATE flag instead of a HEADER or NORMAL +pointer. + +The nested writer will set the new head page pointer. + + tail page + | + v + +---+ +---+ +---+ +---+ +<---| |--->| |-U->| |-H->| |---> +--->| |<---| |<---| |<---| |<--- + +---+ +---+ +---+ +---+ + +But it will not reset the update back to normal. Only the writer +that converted a pointer from HEAD to UPDATE will convert it back +to NORMAL. + + tail page + | + v + +---+ +---+ +---+ +---+ +<---| |--->| |-U->| |-H->| |---> +--->| |<---| |<---| |<---| |<--- + +---+ +---+ +---+ +---+ + +After the nested writer finishes, the outer most writer will convert +the UPDATE pointer to NORMAL. + + + tail page + | + v + +---+ +---+ +---+ +---+ +<---| |--->| |--->| |-H->| |---> +--->| |<---| |<---| |<---| |<--- + +---+ +---+ +---+ +---+ + + +It can be even more complex if several nested writes came in and moved +the tail page ahead several pages: + + +(first writer) + + tail page + | + v + +---+ +---+ +---+ +---+ +<---| |--->| |-H->| |--->| |---> +--->| |<---| |<---| |<---| |<--- + +---+ +---+ +---+ +---+ + +The write converts the head page pointer to UPDATE. + + tail page + | + v + +---+ +---+ +---+ +---+ +<---| |--->| |-U->| |--->| |---> +--->| |<---| |<---| |<---| |<--- + +---+ +---+ +---+ +---+ + +Next writer comes in, and sees the update and sets up the new +head page. + +(second writer) + + tail page + | + v + +---+ +---+ +---+ +---+ +<---| |--->| |-U->| |-H->| |---> +--->| |<---| |<---| |<---| |<--- + +---+ +---+ +---+ +---+ + +The nested writer moves the tail page forward. But does not set the old +update page to NORMAL because it is not the outer most writer. + + tail page + | + v + +---+ +---+ +---+ +---+ +<---| |--->| |-U->| |-H->| |---> +--->| |<---| |<---| |<---| |<--- + +---+ +---+ +---+ +---+ + +Another writer preempts and sees the page after the tail page is a head page. +It changes it from HEAD to UPDATE. + +(third writer) + + tail page + | + v + +---+ +---+ +---+ +---+ +<---| |--->| |-U->| |-U->| |---> +--->| |<---| |<---| |<---| |<--- + +---+ +---+ +---+ +---+ + +The writer will move the head page forward: + + +(third writer) + + tail page + | + v + +---+ +---+ +---+ +---+ +<---| |--->| |-U->| |-U->| |-H-> +--->| |<---| |<---| |<---| |<--- + +---+ +---+ +---+ +---+ + +But now that the third writer did change the HEAD flag to UPDATE it +will convert it to normal: + + +(third writer) + + tail page + | + v + +---+ +---+ +---+ +---+ +<---| |--->| |-U->| |--->| |-H-> +--->| |<---| |<---| |<---| |<--- + +---+ +---+ +---+ +---+ + + +Then it will move the tail page, and return back to the second writer. + + +(second writer) + + tail page + | + v + +---+ +---+ +---+ +---+ +<---| |--->| |-U->| |--->| |-H-> +--->| |<---| |<---| |<---| |<--- + +---+ +---+ +---+ +---+ + + +The second writer will fail to move the tail page because it was already +moved, so it will try again and add its data to the new tail page. +It will return to the first writer. + + +(first writer) + + tail page + | + v + +---+ +---+ +---+ +---+ +<---| |--->| |-U->| |--->| |-H-> +--->| |<---| |<---| |<---| |<--- + +---+ +---+ +---+ +---+ + +The first writer can not know atomically test if the tail page moved +while it updates the HEAD page. It will then update the head page to +what it thinks is the new head page. + + +(first writer) + + tail page + | + v + +---+ +---+ +---+ +---+ +<---| |--->| |-U->| |-H->| |-H-> +--->| |<---| |<---| |<---| |<--- + +---+ +---+ +---+ +---+ + +Since the cmpxchg returns the old value of the pointer the first writer +will see it succeeded in updating the pointer from NORMAL to HEAD. +But as we can see, this is not good enough. It must also check to see +if the tail page is either where it use to be or on the next page: + + +(first writer) + + A B tail page + | | | + v v v + +---+ +---+ +---+ +---+ +<---| |--->| |-U->| |-H->| |-H-> +--->| |<---| |<---| |<---| |<--- + +---+ +---+ +---+ +---+ + +If tail page != A and tail page does not equal B, then it must reset the +pointer back to NORMAL. The fact that it only needs to worry about +nested writers, it only needs to check this after setting the HEAD page. + + +(first writer) + + A B tail page + | | | + v v v + +---+ +---+ +---+ +---+ +<---| |--->| |-U->| |--->| |-H-> +--->| |<---| |<---| |<---| |<--- + +---+ +---+ +---+ +---+ + +Now the writer can update the head page. This is also why the head page must +remain in UPDATE and only reset by the outer most writer. This prevents +the reader from seeing the incorrect head page. + + +(first writer) + + A B tail page + | | | + v v v + +---+ +---+ +---+ +---+ +<---| |--->| |--->| |--->| |-H-> +--->| |<---| |<---| |<---| |<--- + +---+ +---+ +---+ +---+ + diff --git a/Documentation/trace/tracepoint-analysis.txt b/Documentation/trace/tracepoint-analysis.txt new file mode 100644 index 00000000000..5eb4e487e66 --- /dev/null +++ b/Documentation/trace/tracepoint-analysis.txt @@ -0,0 +1,327 @@ + Notes on Analysing Behaviour Using Events and Tracepoints + + Documentation written by Mel Gorman + PCL information heavily based on email from Ingo Molnar + +1. Introduction +=============== + +Tracepoints (see Documentation/trace/tracepoints.txt) can be used without +creating custom kernel modules to register probe functions using the event +tracing infrastructure. + +Simplistically, tracepoints will represent an important event that when can +be taken in conjunction with other tracepoints to build a "Big Picture" of +what is going on within the system. There are a large number of methods for +gathering and interpreting these events. Lacking any current Best Practises, +this document describes some of the methods that can be used. + +This document assumes that debugfs is mounted on /sys/kernel/debug and that +the appropriate tracing options have been configured into the kernel. It is +assumed that the PCL tool tools/perf has been installed and is in your path. + +2. Listing Available Events +=========================== + +2.1 Standard Utilities +---------------------- + +All possible events are visible from /sys/kernel/debug/tracing/events. Simply +calling + + $ find /sys/kernel/debug/tracing/events -type d + +will give a fair indication of the number of events available. + +2.2 PCL +------- + +Discovery and enumeration of all counters and events, including tracepoints +are available with the perf tool. Getting a list of available events is a +simple case of + + $ perf list 2>&1 | grep Tracepoint + ext4:ext4_free_inode [Tracepoint event] + ext4:ext4_request_inode [Tracepoint event] + ext4:ext4_allocate_inode [Tracepoint event] + ext4:ext4_write_begin [Tracepoint event] + ext4:ext4_ordered_write_end [Tracepoint event] + [ .... remaining output snipped .... ] + + +2. Enabling Events +================== + +2.1 System-Wide Event Enabling +------------------------------ + +See Documentation/trace/events.txt for a proper description on how events +can be enabled system-wide. A short example of enabling all events related +to page allocation would look something like + + $ for i in `find /sys/kernel/debug/tracing/events -name "enable" | grep mm_`; do echo 1 > $i; done + +2.2 System-Wide Event Enabling with SystemTap +--------------------------------------------- + +In SystemTap, tracepoints are accessible using the kernel.trace() function +call. The following is an example that reports every 5 seconds what processes +were allocating the pages. + + global page_allocs + + probe kernel.trace("mm_page_alloc") { + page_allocs[execname()]++ + } + + function print_count() { + printf ("%-25s %-s\n", "#Pages Allocated", "Process Name") + foreach (proc in page_allocs-) + printf("%-25d %s\n", page_allocs[proc], proc) + printf ("\n") + delete page_allocs + } + + probe timer.s(5) { + print_count() + } + +2.3 System-Wide Event Enabling with PCL +--------------------------------------- + +By specifying the -a switch and analysing sleep, the system-wide events +for a duration of time can be examined. + + $ perf stat -a \ + -e kmem:mm_page_alloc -e kmem:mm_page_free_direct \ + -e kmem:mm_pagevec_free \ + sleep 10 + Performance counter stats for 'sleep 10': + + 9630 kmem:mm_page_alloc + 2143 kmem:mm_page_free_direct + 7424 kmem:mm_pagevec_free + + 10.002577764 seconds time elapsed + +Similarly, one could execute a shell and exit it as desired to get a report +at that point. + +2.4 Local Event Enabling +------------------------ + +Documentation/trace/ftrace.txt describes how to enable events on a per-thread +basis using set_ftrace_pid. + +2.5 Local Event Enablement with PCL +----------------------------------- + +Events can be activate and tracked for the duration of a process on a local +basis using PCL such as follows. + + $ perf stat -e kmem:mm_page_alloc -e kmem:mm_page_free_direct \ + -e kmem:mm_pagevec_free ./hackbench 10 + Time: 0.909 + + Performance counter stats for './hackbench 10': + + 17803 kmem:mm_page_alloc + 12398 kmem:mm_page_free_direct + 4827 kmem:mm_pagevec_free + + 0.973913387 seconds time elapsed + +3. Event Filtering +================== + +Documentation/trace/ftrace.txt covers in-depth how to filter events in +ftrace. Obviously using grep and awk of trace_pipe is an option as well +as any script reading trace_pipe. + +4. Analysing Event Variances with PCL +===================================== + +Any workload can exhibit variances between runs and it can be important +to know what the standard deviation in. By and large, this is left to the +performance analyst to do it by hand. In the event that the discrete event +occurrences are useful to the performance analyst, then perf can be used. + + $ perf stat --repeat 5 -e kmem:mm_page_alloc -e kmem:mm_page_free_direct + -e kmem:mm_pagevec_free ./hackbench 10 + Time: 0.890 + Time: 0.895 + Time: 0.915 + Time: 1.001 + Time: 0.899 + + Performance counter stats for './hackbench 10' (5 runs): + + 16630 kmem:mm_page_alloc ( +- 3.542% ) + 11486 kmem:mm_page_free_direct ( +- 4.771% ) + 4730 kmem:mm_pagevec_free ( +- 2.325% ) + + 0.982653002 seconds time elapsed ( +- 1.448% ) + +In the event that some higher-level event is required that depends on some +aggregation of discrete events, then a script would need to be developed. + +Using --repeat, it is also possible to view how events are fluctuating over +time on a system wide basis using -a and sleep. + + $ perf stat -e kmem:mm_page_alloc -e kmem:mm_page_free_direct \ + -e kmem:mm_pagevec_free \ + -a --repeat 10 \ + sleep 1 + Performance counter stats for 'sleep 1' (10 runs): + + 1066 kmem:mm_page_alloc ( +- 26.148% ) + 182 kmem:mm_page_free_direct ( +- 5.464% ) + 890 kmem:mm_pagevec_free ( +- 30.079% ) + + 1.002251757 seconds time elapsed ( +- 0.005% ) + +5. Higher-Level Analysis with Helper Scripts +============================================ + +When events are enabled the events that are triggering can be read from +/sys/kernel/debug/tracing/trace_pipe in human-readable format although binary +options exist as well. By post-processing the output, further information can +be gathered on-line as appropriate. Examples of post-processing might include + + o Reading information from /proc for the PID that triggered the event + o Deriving a higher-level event from a series of lower-level events. + o Calculate latencies between two events + +Documentation/trace/postprocess/trace-pagealloc-postprocess.pl is an example +script that can read trace_pipe from STDIN or a copy of a trace. When used +on-line, it can be interrupted once to generate a report without existing +and twice to exit. + +Simplistically, the script just reads STDIN and counts up events but it +also can do more such as + + o Derive high-level events from many low-level events. If a number of pages + are freed to the main allocator from the per-CPU lists, it recognises + that as one per-CPU drain even though there is no specific tracepoint + for that event + o It can aggregate based on PID or individual process number + o In the event memory is getting externally fragmented, it reports + on whether the fragmentation event was severe or moderate. + o When receiving an event about a PID, it can record who the parent was so + that if large numbers of events are coming from very short-lived + processes, the parent process responsible for creating all the helpers + can be identified + +6. Lower-Level Analysis with PCL +================================ + +There may also be a requirement to identify what functions with a program +were generating events within the kernel. To begin this sort of analysis, the +data must be recorded. At the time of writing, this required root + + $ perf record -c 1 \ + -e kmem:mm_page_alloc -e kmem:mm_page_free_direct \ + -e kmem:mm_pagevec_free \ + ./hackbench 10 + Time: 0.894 + [ perf record: Captured and wrote 0.733 MB perf.data (~32010 samples) ] + +Note the use of '-c 1' to set the event period to sample. The default sample +period is quite high to minimise overhead but the information collected can be +very coarse as a result. + +This record outputted a file called perf.data which can be analysed using +perf report. + + $ perf report + # Samples: 30922 + # + # Overhead Command Shared Object + # ........ ......... ................................ + # + 87.27% hackbench [vdso] + 6.85% hackbench /lib/i686/cmov/libc-2.9.so + 2.62% hackbench /lib/ld-2.9.so + 1.52% perf [vdso] + 1.22% hackbench ./hackbench + 0.48% hackbench [kernel] + 0.02% perf /lib/i686/cmov/libc-2.9.so + 0.01% perf /usr/bin/perf + 0.01% perf /lib/ld-2.9.so + 0.00% hackbench /lib/i686/cmov/libpthread-2.9.so + # + # (For more details, try: perf report --sort comm,dso,symbol) + # + +According to this, the vast majority of events occured triggered on events +within the VDSO. With simple binaries, this will often be the case so lets +take a slightly different example. In the course of writing this, it was +noticed that X was generating an insane amount of page allocations so lets look +at it + + $ perf record -c 1 -f \ + -e kmem:mm_page_alloc -e kmem:mm_page_free_direct \ + -e kmem:mm_pagevec_free \ + -p `pidof X` + +This was interrupted after a few seconds and + + $ perf report + # Samples: 27666 + # + # Overhead Command Shared Object + # ........ ....... ....................................... + # + 51.95% Xorg [vdso] + 47.95% Xorg /opt/gfx-test/lib/libpixman-1.so.0.13.1 + 0.09% Xorg /lib/i686/cmov/libc-2.9.so + 0.01% Xorg [kernel] + # + # (For more details, try: perf report --sort comm,dso,symbol) + # + +So, almost half of the events are occuring in a library. To get an idea which +symbol. + + $ perf report --sort comm,dso,symbol + # Samples: 27666 + # + # Overhead Command Shared Object Symbol + # ........ ....... ....................................... ...... + # + 51.95% Xorg [vdso] [.] 0x000000ffffe424 + 47.93% Xorg /opt/gfx-test/lib/libpixman-1.so.0.13.1 [.] pixmanFillsse2 + 0.09% Xorg /lib/i686/cmov/libc-2.9.so [.] _int_malloc + 0.01% Xorg /opt/gfx-test/lib/libpixman-1.so.0.13.1 [.] pixman_region32_copy_f + 0.01% Xorg [kernel] [k] read_hpet + 0.01% Xorg /opt/gfx-test/lib/libpixman-1.so.0.13.1 [.] get_fast_path + 0.00% Xorg [kernel] [k] ftrace_trace_userstack + +To see where within the function pixmanFillsse2 things are going wrong + + $ perf annotate pixmanFillsse2 + [ ... ] + 0.00 : 34eeb: 0f 18 08 prefetcht0 (%eax) + : } + : + : extern __inline void __attribute__((__gnu_inline__, __always_inline__, _ + : _mm_store_si128 (__m128i *__P, __m128i __B) : { + : *__P = __B; + 12.40 : 34eee: 66 0f 7f 80 40 ff ff movdqa %xmm0,-0xc0(%eax) + 0.00 : 34ef5: ff + 12.40 : 34ef6: 66 0f 7f 80 50 ff ff movdqa %xmm0,-0xb0(%eax) + 0.00 : 34efd: ff + 12.39 : 34efe: 66 0f 7f 80 60 ff ff movdqa %xmm0,-0xa0(%eax) + 0.00 : 34f05: ff + 12.67 : 34f06: 66 0f 7f 80 70 ff ff movdqa %xmm0,-0x90(%eax) + 0.00 : 34f0d: ff + 12.58 : 34f0e: 66 0f 7f 40 80 movdqa %xmm0,-0x80(%eax) + 12.31 : 34f13: 66 0f 7f 40 90 movdqa %xmm0,-0x70(%eax) + 12.40 : 34f18: 66 0f 7f 40 a0 movdqa %xmm0,-0x60(%eax) + 12.31 : 34f1d: 66 0f 7f 40 b0 movdqa %xmm0,-0x50(%eax) + +At a glance, it looks like the time is being spent copying pixmaps to +the card. Further investigation would be needed to determine why pixmaps +are being copied around so much but a starting point would be to take an +ancient build of libpixmap out of the library path where it was totally +forgotten about from months ago! |