diff options
Diffstat (limited to 'kernel/trace')
28 files changed, 2299 insertions, 762 deletions
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 95a0ad191f1..2246141bda4 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -63,7 +63,11 @@ config TRACING # config TRACING_SUPPORT bool - depends on TRACE_IRQFLAGS_SUPPORT + # PPC32 has no irqflags tracing support, but it can use most of the + # tracers anyway, they were tested to build and work. Note that new + # exceptions to this list aren't welcomed, better implement the + # irqflags tracing for your architecture. + depends on TRACE_IRQFLAGS_SUPPORT || PPC32 depends on STACKTRACE_SUPPORT default y @@ -95,11 +99,10 @@ config FUNCTION_GRAPH_TRACER help Enable the kernel to trace a function at both its return and its entry. - It's first purpose is to trace the duration of functions and - draw a call graph for each thread with some informations like - the return value. - This is done by setting the current return address on the current - task structure into a stack of calls. + Its first purpose is to trace the duration of functions and + draw a call graph for each thread with some information like + the return value. This is done by setting the current return + address on the current task structure into a stack of calls. config IRQSOFF_TRACER bool "Interrupts-off Latency Tracer" @@ -182,6 +185,7 @@ config FTRACE_SYSCALLS bool "Trace syscalls" depends on HAVE_FTRACE_SYSCALLS select TRACING + select KALLSYMS help Basic tracer to catch the syscall entry and exit events. diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index c3feea01c3e..2630f5121ec 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -44,5 +44,7 @@ obj-$(CONFIG_EVENT_TRACER) += trace_events.o obj-$(CONFIG_EVENT_TRACER) += events.o obj-$(CONFIG_EVENT_TRACER) += trace_export.o obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o +obj-$(CONFIG_EVENT_PROFILE) += trace_event_profile.o +obj-$(CONFIG_EVENT_TRACER) += trace_events_filter.o libftrace-y := ftrace.o diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 1f32e4edf49..947c5b3f90c 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -30,7 +30,7 @@ static unsigned int blktrace_seq __read_mostly = 1; static struct trace_array *blk_tr; -static int __read_mostly blk_tracer_enabled; +static bool blk_tracer_enabled __read_mostly; /* Select an alternative, minimalistic output than the original one */ #define TRACE_BLK_OPT_CLASSIC 0x1 @@ -47,10 +47,9 @@ static struct tracer_flags blk_tracer_flags = { }; /* Global reference count of probes */ -static DEFINE_MUTEX(blk_probe_mutex); static atomic_t blk_probes_ref = ATOMIC_INIT(0); -static int blk_register_tracepoints(void); +static void blk_register_tracepoints(void); static void blk_unregister_tracepoints(void); /* @@ -60,22 +59,39 @@ static void trace_note(struct blk_trace *bt, pid_t pid, int action, const void *data, size_t len) { struct blk_io_trace *t; + struct ring_buffer_event *event = NULL; + int pc = 0; + int cpu = smp_processor_id(); + bool blk_tracer = blk_tracer_enabled; + + if (blk_tracer) { + pc = preempt_count(); + event = trace_buffer_lock_reserve(blk_tr, TRACE_BLK, + sizeof(*t) + len, + 0, pc); + if (!event) + return; + t = ring_buffer_event_data(event); + goto record_it; + } if (!bt->rchan) return; t = relay_reserve(bt->rchan, sizeof(*t) + len); if (t) { - const int cpu = smp_processor_id(); - t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION; t->time = ktime_to_ns(ktime_get()); +record_it: t->device = bt->dev; t->action = action; t->pid = pid; t->cpu = cpu; t->pdu_len = len; memcpy((void *) t + sizeof(*t), data, len); + + if (blk_tracer) + trace_buffer_unlock_commit(blk_tr, event, 0, pc); } } @@ -111,14 +127,8 @@ void __trace_note_message(struct blk_trace *bt, const char *fmt, ...) unsigned long flags; char *buf; - if (blk_tr) { - va_start(args, fmt); - ftrace_vprintk(fmt, args); - va_end(args); - return; - } - - if (!bt->msg_data) + if (unlikely(bt->trace_state != Blktrace_running && + !blk_tracer_enabled)) return; local_irq_save(flags); @@ -148,8 +158,8 @@ static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector, /* * Data direction bit lookup */ -static u32 ddir_act[2] __read_mostly = { BLK_TC_ACT(BLK_TC_READ), - BLK_TC_ACT(BLK_TC_WRITE) }; +static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ), + BLK_TC_ACT(BLK_TC_WRITE) }; /* The ilog2() calls fall out because they're constant */ #define MASK_TC_BIT(rw, __name) ((rw & (1 << BIO_RW_ ## __name)) << \ @@ -169,9 +179,9 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, unsigned long *sequence; pid_t pid; int cpu, pc = 0; + bool blk_tracer = blk_tracer_enabled; - if (unlikely(bt->trace_state != Blktrace_running || - !blk_tracer_enabled)) + if (unlikely(bt->trace_state != Blktrace_running && !blk_tracer)) return; what |= ddir_act[rw & WRITE]; @@ -186,7 +196,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, return; cpu = raw_smp_processor_id(); - if (blk_tr) { + if (blk_tracer) { tracing_record_cmdline(current); pc = preempt_count(); @@ -236,7 +246,7 @@ record_it: if (pdu_len) memcpy((void *) t + sizeof(*t), pdu_data, pdu_len); - if (blk_tr) { + if (blk_tracer) { trace_buffer_unlock_commit(blk_tr, event, 0, pc); return; } @@ -248,7 +258,7 @@ record_it: static struct dentry *blk_tree_root; static DEFINE_MUTEX(blk_tree_mutex); -static void blk_trace_cleanup(struct blk_trace *bt) +static void blk_trace_free(struct blk_trace *bt) { debugfs_remove(bt->msg_file); debugfs_remove(bt->dropped_file); @@ -256,10 +266,13 @@ static void blk_trace_cleanup(struct blk_trace *bt) free_percpu(bt->sequence); free_percpu(bt->msg_data); kfree(bt); - mutex_lock(&blk_probe_mutex); +} + +static void blk_trace_cleanup(struct blk_trace *bt) +{ + blk_trace_free(bt); if (atomic_dec_and_test(&blk_probes_ref)) blk_unregister_tracepoints(); - mutex_unlock(&blk_probe_mutex); } int blk_trace_remove(struct request_queue *q) @@ -270,8 +283,7 @@ int blk_trace_remove(struct request_queue *q) if (!bt) return -EINVAL; - if (bt->trace_state == Blktrace_setup || - bt->trace_state == Blktrace_stopped) + if (bt->trace_state != Blktrace_running) blk_trace_cleanup(bt); return 0; @@ -414,11 +426,11 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, if (buts->name[i] == '/') buts->name[i] = '_'; - ret = -ENOMEM; bt = kzalloc(sizeof(*bt), GFP_KERNEL); if (!bt) - goto err; + return -ENOMEM; + ret = -ENOMEM; bt->sequence = alloc_percpu(unsigned long); if (!bt->sequence) goto err; @@ -429,11 +441,15 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, ret = -ENOENT; + mutex_lock(&blk_tree_mutex); if (!blk_tree_root) { blk_tree_root = debugfs_create_dir("block", NULL); - if (!blk_tree_root) - return -ENOMEM; + if (!blk_tree_root) { + mutex_unlock(&blk_tree_mutex); + goto err; + } } + mutex_unlock(&blk_tree_mutex); dir = debugfs_create_dir(buts->name, blk_tree_root); @@ -471,14 +487,6 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, bt->pid = buts->pid; bt->trace_state = Blktrace_setup; - mutex_lock(&blk_probe_mutex); - if (atomic_add_return(1, &blk_probes_ref) == 1) { - ret = blk_register_tracepoints(); - if (ret) - goto probe_err; - } - mutex_unlock(&blk_probe_mutex); - ret = -EBUSY; old_bt = xchg(&q->blk_trace, bt); if (old_bt) { @@ -486,22 +494,12 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, goto err; } + if (atomic_inc_return(&blk_probes_ref) == 1) + blk_register_tracepoints(); + return 0; -probe_err: - atomic_dec(&blk_probes_ref); - mutex_unlock(&blk_probe_mutex); err: - if (bt) { - if (bt->msg_file) - debugfs_remove(bt->msg_file); - if (bt->dropped_file) - debugfs_remove(bt->dropped_file); - free_percpu(bt->sequence); - free_percpu(bt->msg_data); - if (bt->rchan) - relay_close(bt->rchan); - kfree(bt); - } + blk_trace_free(bt); return ret; } @@ -863,7 +861,7 @@ void blk_add_driver_data(struct request_queue *q, } EXPORT_SYMBOL_GPL(blk_add_driver_data); -static int blk_register_tracepoints(void) +static void blk_register_tracepoints(void) { int ret; @@ -901,7 +899,6 @@ static int blk_register_tracepoints(void) WARN_ON(ret); ret = register_trace_block_remap(blk_add_trace_remap); WARN_ON(ret); - return 0; } static void blk_unregister_tracepoints(void) @@ -934,25 +931,31 @@ static void blk_unregister_tracepoints(void) static void fill_rwbs(char *rwbs, const struct blk_io_trace *t) { int i = 0; + int tc = t->action >> BLK_TC_SHIFT; + + if (t->action == BLK_TN_MESSAGE) { + rwbs[i++] = 'N'; + goto out; + } - if (t->action & BLK_TC_DISCARD) + if (tc & BLK_TC_DISCARD) rwbs[i++] = 'D'; - else if (t->action & BLK_TC_WRITE) + else if (tc & BLK_TC_WRITE) rwbs[i++] = 'W'; else if (t->bytes) rwbs[i++] = 'R'; else rwbs[i++] = 'N'; - if (t->action & BLK_TC_AHEAD) + if (tc & BLK_TC_AHEAD) rwbs[i++] = 'A'; - if (t->action & BLK_TC_BARRIER) + if (tc & BLK_TC_BARRIER) rwbs[i++] = 'B'; - if (t->action & BLK_TC_SYNC) + if (tc & BLK_TC_SYNC) rwbs[i++] = 'S'; - if (t->action & BLK_TC_META) + if (tc & BLK_TC_META) rwbs[i++] = 'M'; - +out: rwbs[i] = '\0'; } @@ -979,7 +982,7 @@ static inline unsigned long long t_sector(const struct trace_entry *ent) static inline __u16 t_error(const struct trace_entry *ent) { - return te_blk_io_trace(ent)->sector; + return te_blk_io_trace(ent)->error; } static __u64 get_pdu_int(const struct trace_entry *ent) @@ -999,35 +1002,39 @@ static void get_pdu_remap(const struct trace_entry *ent, r->sector = be64_to_cpu(sector); } -static int blk_log_action_iter(struct trace_iterator *iter, const char *act) +typedef int (blk_log_action_t) (struct trace_iterator *iter, const char *act); + +static int blk_log_action_classic(struct trace_iterator *iter, const char *act) { char rwbs[6]; - unsigned long long ts = ns2usecs(iter->ts); - unsigned long usec_rem = do_div(ts, USEC_PER_SEC); + unsigned long long ts = iter->ts; + unsigned long nsec_rem = do_div(ts, NSEC_PER_SEC); unsigned secs = (unsigned long)ts; - const struct trace_entry *ent = iter->ent; - const struct blk_io_trace *t = (const struct blk_io_trace *)ent; + const struct blk_io_trace *t = te_blk_io_trace(iter->ent); fill_rwbs(rwbs, t); return trace_seq_printf(&iter->seq, - "%3d,%-3d %2d %5d.%06lu %5u %2s %3s ", + "%3d,%-3d %2d %5d.%09lu %5u %2s %3s ", MAJOR(t->device), MINOR(t->device), iter->cpu, - secs, usec_rem, ent->pid, act, rwbs); + secs, nsec_rem, iter->ent->pid, act, rwbs); } -static int blk_log_action_seq(struct trace_seq *s, const struct blk_io_trace *t, - const char *act) +static int blk_log_action(struct trace_iterator *iter, const char *act) { char rwbs[6]; + const struct blk_io_trace *t = te_blk_io_trace(iter->ent); + fill_rwbs(rwbs, t); - return trace_seq_printf(s, "%3d,%-3d %2s %3s ", + return trace_seq_printf(&iter->seq, "%3d,%-3d %2s %3s ", MAJOR(t->device), MINOR(t->device), act, rwbs); } static int blk_log_generic(struct trace_seq *s, const struct trace_entry *ent) { - const char *cmd = trace_find_cmdline(ent->pid); + char cmd[TASK_COMM_LEN]; + + trace_find_cmdline(ent->pid, cmd); if (t_sec(ent)) return trace_seq_printf(s, "%llu + %u [%s]\n", @@ -1057,19 +1064,41 @@ static int blk_log_remap(struct trace_seq *s, const struct trace_entry *ent) static int blk_log_plug(struct trace_seq *s, const struct trace_entry *ent) { - return trace_seq_printf(s, "[%s]\n", trace_find_cmdline(ent->pid)); + char cmd[TASK_COMM_LEN]; + + trace_find_cmdline(ent->pid, cmd); + + return trace_seq_printf(s, "[%s]\n", cmd); } static int blk_log_unplug(struct trace_seq *s, const struct trace_entry *ent) { - return trace_seq_printf(s, "[%s] %llu\n", trace_find_cmdline(ent->pid), - get_pdu_int(ent)); + char cmd[TASK_COMM_LEN]; + + trace_find_cmdline(ent->pid, cmd); + + return trace_seq_printf(s, "[%s] %llu\n", cmd, get_pdu_int(ent)); } static int blk_log_split(struct trace_seq *s, const struct trace_entry *ent) { + char cmd[TASK_COMM_LEN]; + + trace_find_cmdline(ent->pid, cmd); + return trace_seq_printf(s, "%llu / %llu [%s]\n", t_sector(ent), - get_pdu_int(ent), trace_find_cmdline(ent->pid)); + get_pdu_int(ent), cmd); +} + +static int blk_log_msg(struct trace_seq *s, const struct trace_entry *ent) +{ + int ret; + const struct blk_io_trace *t = te_blk_io_trace(ent); + + ret = trace_seq_putmem(s, t + 1, t->pdu_len); + if (ret) + return trace_seq_putc(s, '\n'); + return ret; } /* @@ -1086,11 +1115,7 @@ static void blk_tracer_print_header(struct seq_file *m) static void blk_tracer_start(struct trace_array *tr) { - mutex_lock(&blk_probe_mutex); - if (atomic_add_return(1, &blk_probes_ref) == 1) - if (blk_register_tracepoints()) - atomic_dec(&blk_probes_ref); - mutex_unlock(&blk_probe_mutex); + blk_tracer_enabled = true; trace_flags &= ~TRACE_ITER_CONTEXT_INFO; } @@ -1098,38 +1123,24 @@ static int blk_tracer_init(struct trace_array *tr) { blk_tr = tr; blk_tracer_start(tr); - mutex_lock(&blk_probe_mutex); - blk_tracer_enabled++; - mutex_unlock(&blk_probe_mutex); return 0; } static void blk_tracer_stop(struct trace_array *tr) { + blk_tracer_enabled = false; trace_flags |= TRACE_ITER_CONTEXT_INFO; - mutex_lock(&blk_probe_mutex); - if (atomic_dec_and_test(&blk_probes_ref)) - blk_unregister_tracepoints(); - mutex_unlock(&blk_probe_mutex); } static void blk_tracer_reset(struct trace_array *tr) { - if (!atomic_read(&blk_probes_ref)) - return; - - mutex_lock(&blk_probe_mutex); - blk_tracer_enabled--; - WARN_ON(blk_tracer_enabled < 0); - mutex_unlock(&blk_probe_mutex); - blk_tracer_stop(tr); } -static struct { +static const struct { const char *act[2]; int (*print)(struct trace_seq *s, const struct trace_entry *ent); -} what2act[] __read_mostly = { +} what2act[] = { [__BLK_TA_QUEUE] = {{ "Q", "queue" }, blk_log_generic }, [__BLK_TA_BACKMERGE] = {{ "M", "backmerge" }, blk_log_generic }, [__BLK_TA_FRONTMERGE] = {{ "F", "frontmerge" }, blk_log_generic }, @@ -1147,29 +1158,48 @@ static struct { [__BLK_TA_REMAP] = {{ "A", "remap" }, blk_log_remap }, }; -static enum print_line_t blk_trace_event_print(struct trace_iterator *iter, - int flags) +static enum print_line_t print_one_line(struct trace_iterator *iter, + bool classic) { struct trace_seq *s = &iter->seq; - const struct blk_io_trace *t = (struct blk_io_trace *)iter->ent; - const u16 what = t->action & ((1 << BLK_TC_SHIFT) - 1); + const struct blk_io_trace *t; + u16 what; int ret; + bool long_act; + blk_log_action_t *log_action; - if (!trace_print_context(iter)) - return TRACE_TYPE_PARTIAL_LINE; + t = te_blk_io_trace(iter->ent); + what = t->action & ((1 << BLK_TC_SHIFT) - 1); + long_act = !!(trace_flags & TRACE_ITER_VERBOSE); + log_action = classic ? &blk_log_action_classic : &blk_log_action; - if (unlikely(what == 0 || what > ARRAY_SIZE(what2act))) + if (t->action == BLK_TN_MESSAGE) { + ret = log_action(iter, long_act ? "message" : "m"); + if (ret) + ret = blk_log_msg(s, iter->ent); + goto out; + } + + if (unlikely(what == 0 || what >= ARRAY_SIZE(what2act))) ret = trace_seq_printf(s, "Bad pc action %x\n", what); else { - const bool long_act = !!(trace_flags & TRACE_ITER_VERBOSE); - ret = blk_log_action_seq(s, t, what2act[what].act[long_act]); + ret = log_action(iter, what2act[what].act[long_act]); if (ret) ret = what2act[what].print(s, iter->ent); } - +out: return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; } +static enum print_line_t blk_trace_event_print(struct trace_iterator *iter, + int flags) +{ + if (!trace_print_context(iter)) + return TRACE_TYPE_PARTIAL_LINE; + + return print_one_line(iter, false); +} + static int blk_trace_synthesize_old_trace(struct trace_iterator *iter) { struct trace_seq *s = &iter->seq; @@ -1177,7 +1207,7 @@ static int blk_trace_synthesize_old_trace(struct trace_iterator *iter) const int offset = offsetof(struct blk_io_trace, sector); struct blk_io_trace old = { .magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION, - .time = ns2usecs(iter->ts), + .time = iter->ts, }; if (!trace_seq_putmem(s, &old, offset)) @@ -1195,26 +1225,10 @@ blk_trace_event_print_binary(struct trace_iterator *iter, int flags) static enum print_line_t blk_tracer_print_line(struct trace_iterator *iter) { - const struct blk_io_trace *t; - u16 what; - int ret; - if (!(blk_tracer_flags.val & TRACE_BLK_OPT_CLASSIC)) return TRACE_TYPE_UNHANDLED; - t = (const struct blk_io_trace *)iter->ent; - what = t->action & ((1 << BLK_TC_SHIFT) - 1); - - if (unlikely(what == 0 || what > ARRAY_SIZE(what2act))) - ret = trace_seq_printf(&iter->seq, "Bad pc action %x\n", what); - else { - const bool long_act = !!(trace_flags & TRACE_ITER_VERBOSE); - ret = blk_log_action_iter(iter, what2act[what].act[long_act]); - if (ret) - ret = what2act[what].print(&iter->seq, iter->ent); - } - - return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; + return print_one_line(iter, true); } static struct tracer blk_tracer __read_mostly = { @@ -1260,7 +1274,10 @@ static int blk_trace_remove_queue(struct request_queue *q) if (bt == NULL) return -EINVAL; - kfree(bt); + if (atomic_dec_and_test(&blk_probes_ref)) + blk_unregister_tracepoints(); + + blk_trace_free(bt); return 0; } @@ -1270,26 +1287,33 @@ static int blk_trace_remove_queue(struct request_queue *q) static int blk_trace_setup_queue(struct request_queue *q, dev_t dev) { struct blk_trace *old_bt, *bt = NULL; - int ret; + int ret = -ENOMEM; - ret = -ENOMEM; bt = kzalloc(sizeof(*bt), GFP_KERNEL); if (!bt) - goto err; + return -ENOMEM; + + bt->msg_data = __alloc_percpu(BLK_TN_MAX_MSG, __alignof__(char)); + if (!bt->msg_data) + goto free_bt; bt->dev = dev; bt->act_mask = (u16)-1; bt->end_lba = -1ULL; - bt->trace_state = Blktrace_running; old_bt = xchg(&q->blk_trace, bt); if (old_bt != NULL) { (void)xchg(&q->blk_trace, old_bt); - kfree(bt); ret = -EBUSY; + goto free_bt; } + + if (atomic_inc_return(&blk_probes_ref) == 1) + blk_register_tracepoints(); return 0; -err: + +free_bt: + blk_trace_free(bt); return ret; } @@ -1297,72 +1321,6 @@ err: * sysfs interface to enable and configure tracing */ -static ssize_t sysfs_blk_trace_enable_show(struct device *dev, - struct device_attribute *attr, - char *buf) -{ - struct hd_struct *p = dev_to_part(dev); - struct block_device *bdev; - ssize_t ret = -ENXIO; - - lock_kernel(); - bdev = bdget(part_devt(p)); - if (bdev != NULL) { - struct request_queue *q = bdev_get_queue(bdev); - - if (q != NULL) { - mutex_lock(&bdev->bd_mutex); - ret = sprintf(buf, "%u\n", !!q->blk_trace); - mutex_unlock(&bdev->bd_mutex); - } - - bdput(bdev); - } - - unlock_kernel(); - return ret; -} - -static ssize_t sysfs_blk_trace_enable_store(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t count) -{ - struct block_device *bdev; - struct request_queue *q; - struct hd_struct *p; - int value; - ssize_t ret = -ENXIO; - - if (count == 0 || sscanf(buf, "%d", &value) != 1) - goto out; - - lock_kernel(); - p = dev_to_part(dev); - bdev = bdget(part_devt(p)); - if (bdev == NULL) - goto out_unlock_kernel; - - q = bdev_get_queue(bdev); - if (q == NULL) - goto out_bdput; - - mutex_lock(&bdev->bd_mutex); - if (value) - ret = blk_trace_setup_queue(q, bdev->bd_dev); - else - ret = blk_trace_remove_queue(q); - mutex_unlock(&bdev->bd_mutex); - - if (ret == 0) - ret = count; -out_bdput: - bdput(bdev); -out_unlock_kernel: - unlock_kernel(); -out: - return ret; -} - static ssize_t sysfs_blk_trace_attr_show(struct device *dev, struct device_attribute *attr, char *buf); @@ -1374,8 +1332,7 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev, sysfs_blk_trace_attr_show, \ sysfs_blk_trace_attr_store) -static DEVICE_ATTR(enable, S_IRUGO | S_IWUSR, - sysfs_blk_trace_enable_show, sysfs_blk_trace_enable_store); +static BLK_TRACE_DEVICE_ATTR(enable); static BLK_TRACE_DEVICE_ATTR(act_mask); static BLK_TRACE_DEVICE_ATTR(pid); static BLK_TRACE_DEVICE_ATTR(start_lba); @@ -1395,53 +1352,85 @@ struct attribute_group blk_trace_attr_group = { .attrs = blk_trace_attrs, }; -static int blk_str2act_mask(const char *str) +static const struct { + int mask; + const char *str; +} mask_maps[] = { + { BLK_TC_READ, "read" }, + { BLK_TC_WRITE, "write" }, + { BLK_TC_BARRIER, "barrier" }, + { BLK_TC_SYNC, "sync" }, + { BLK_TC_QUEUE, "queue" }, + { BLK_TC_REQUEUE, "requeue" }, + { BLK_TC_ISSUE, "issue" }, + { BLK_TC_COMPLETE, "complete" }, + { BLK_TC_FS, "fs" }, + { BLK_TC_PC, "pc" }, + { BLK_TC_AHEAD, "ahead" }, + { BLK_TC_META, "meta" }, + { BLK_TC_DISCARD, "discard" }, + { BLK_TC_DRV_DATA, "drv_data" }, +}; + +static int blk_trace_str2mask(const char *str) { + int i; int mask = 0; - char *copy = kstrdup(str, GFP_KERNEL), *s; + char *s, *token; - if (copy == NULL) + s = kstrdup(str, GFP_KERNEL); + if (s == NULL) return -ENOMEM; - - s = strstrip(copy); + s = strstrip(s); while (1) { - char *sep = strchr(s, ','); - - if (sep != NULL) - *sep = '\0'; - - if (strcasecmp(s, "barrier") == 0) - mask |= BLK_TC_BARRIER; - else if (strcasecmp(s, "complete") == 0) - mask |= BLK_TC_COMPLETE; - else if (strcasecmp(s, "fs") == 0) - mask |= BLK_TC_FS; - else if (strcasecmp(s, "issue") == 0) - mask |= BLK_TC_ISSUE; - else if (strcasecmp(s, "pc") == 0) - mask |= BLK_TC_PC; - else if (strcasecmp(s, "queue") == 0) - mask |= BLK_TC_QUEUE; - else if (strcasecmp(s, "read") == 0) - mask |= BLK_TC_READ; - else if (strcasecmp(s, "requeue") == 0) - mask |= BLK_TC_REQUEUE; - else if (strcasecmp(s, "sync") == 0) - mask |= BLK_TC_SYNC; - else if (strcasecmp(s, "write") == 0) - mask |= BLK_TC_WRITE; - - if (sep == NULL) + token = strsep(&s, ","); + if (token == NULL) break; - s = sep + 1; + if (*token == '\0') + continue; + + for (i = 0; i < ARRAY_SIZE(mask_maps); i++) { + if (strcasecmp(token, mask_maps[i].str) == 0) { + mask |= mask_maps[i].mask; + break; + } + } + if (i == ARRAY_SIZE(mask_maps)) { + mask = -EINVAL; + break; + } } - kfree(copy); + kfree(s); return mask; } +static ssize_t blk_trace_mask2str(char *buf, int mask) +{ + int i; + char *p = buf; + + for (i = 0; i < ARRAY_SIZE(mask_maps); i++) { + if (mask & mask_maps[i].mask) { + p += sprintf(p, "%s%s", + (p == buf) ? "" : ",", mask_maps[i].str); + } + } + *p++ = '\n'; + + return p - buf; +} + +static struct request_queue *blk_trace_get_queue(struct block_device *bdev) +{ + if (bdev->bd_disk == NULL) + return NULL; + + return bdev_get_queue(bdev); +} + static ssize_t sysfs_blk_trace_attr_show(struct device *dev, struct device_attribute *attr, char *buf) @@ -1456,20 +1445,29 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev, if (bdev == NULL) goto out_unlock_kernel; - q = bdev_get_queue(bdev); + q = blk_trace_get_queue(bdev); if (q == NULL) goto out_bdput; + mutex_lock(&bdev->bd_mutex); + + if (attr == &dev_attr_enable) { + ret = sprintf(buf, "%u\n", !!q->blk_trace); + goto out_unlock_bdev; + } + if (q->blk_trace == NULL) ret = sprintf(buf, "disabled\n"); else if (attr == &dev_attr_act_mask) - ret = sprintf(buf, "%#x\n", q->blk_trace->act_mask); + ret = blk_trace_mask2str(buf, q->blk_trace->act_mask); else if (attr == &dev_attr_pid) ret = sprintf(buf, "%u\n", q->blk_trace->pid); else if (attr == &dev_attr_start_lba) ret = sprintf(buf, "%llu\n", q->blk_trace->start_lba); else if (attr == &dev_attr_end_lba) ret = sprintf(buf, "%llu\n", q->blk_trace->end_lba); + +out_unlock_bdev: mutex_unlock(&bdev->bd_mutex); out_bdput: bdput(bdev); @@ -1486,7 +1484,7 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev, struct request_queue *q; struct hd_struct *p; u64 value; - ssize_t ret = -ENXIO; + ssize_t ret = -EINVAL; if (count == 0) goto out; @@ -1494,24 +1492,36 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev, if (attr == &dev_attr_act_mask) { if (sscanf(buf, "%llx", &value) != 1) { /* Assume it is a list of trace category names */ - value = blk_str2act_mask(buf); - if (value < 0) + ret = blk_trace_str2mask(buf); + if (ret < 0) goto out; + value = ret; } } else if (sscanf(buf, "%llu", &value) != 1) goto out; + ret = -ENXIO; + lock_kernel(); p = dev_to_part(dev); bdev = bdget(part_devt(p)); if (bdev == NULL) goto out_unlock_kernel; - q = bdev_get_queue(bdev); + q = blk_trace_get_queue(bdev); if (q == NULL) goto out_bdput; mutex_lock(&bdev->bd_mutex); + + if (attr == &dev_attr_enable) { + if (value) + ret = blk_trace_setup_queue(q, bdev->bd_dev); + else + ret = blk_trace_remove_queue(q); + goto out_unlock_bdev; + } + ret = 0; if (q->blk_trace == NULL) ret = blk_trace_setup_queue(q, bdev->bd_dev); @@ -1525,13 +1535,15 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev, q->blk_trace->start_lba = value; else if (attr == &dev_attr_end_lba) q->blk_trace->end_lba = value; - ret = count; } + +out_unlock_bdev: mutex_unlock(&bdev->bd_mutex); out_bdput: bdput(bdev); out_unlock_kernel: unlock_kernel(); out: - return ret; + return ret ? ret : count; } + diff --git a/kernel/trace/events.c b/kernel/trace/events.c index 9fc918da404..246f2aa6dc4 100644 --- a/kernel/trace/events.c +++ b/kernel/trace/events.c @@ -12,4 +12,3 @@ #include "trace_events_stage_2.h" #include "trace_events_stage_3.h" -#include <trace/trace_event_types.h> diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index d33d306bdcf..f1ed080406c 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -29,6 +29,8 @@ #include <linux/list.h> #include <linux/hash.h> +#include <trace/sched.h> + #include <asm/ftrace.h> #include "trace.h" @@ -272,7 +274,7 @@ enum { static int ftrace_filtered; -static LIST_HEAD(ftrace_new_addrs); +static struct dyn_ftrace *ftrace_new_addrs; static DEFINE_MUTEX(ftrace_regex_lock); @@ -339,7 +341,7 @@ static inline int record_frozen(struct dyn_ftrace *rec) static void ftrace_free_rec(struct dyn_ftrace *rec) { - rec->ip = (unsigned long)ftrace_free_records; + rec->freelist = ftrace_free_records; ftrace_free_records = rec; rec->flags |= FTRACE_FL_FREE; } @@ -356,8 +358,14 @@ void ftrace_release(void *start, unsigned long size) mutex_lock(&ftrace_lock); do_for_each_ftrace_rec(pg, rec) { - if ((rec->ip >= s) && (rec->ip < e)) + if ((rec->ip >= s) && (rec->ip < e)) { + /* + * rec->ip is changed in ftrace_free_rec() + * It should not between s and e if record was freed. + */ + FTRACE_WARN_ON(rec->flags & FTRACE_FL_FREE); ftrace_free_rec(rec); + } } while_for_each_ftrace_rec(); mutex_unlock(&ftrace_lock); } @@ -376,7 +384,7 @@ static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip) return NULL; } - ftrace_free_records = (void *)rec->ip; + ftrace_free_records = rec->freelist; memset(rec, 0, sizeof(*rec)); return rec; } @@ -408,8 +416,8 @@ ftrace_record_ip(unsigned long ip) return NULL; rec->ip = ip; - - list_add(&rec->list, &ftrace_new_addrs); + rec->newlist = ftrace_new_addrs; + ftrace_new_addrs = rec; return rec; } @@ -531,11 +539,12 @@ static void ftrace_replace_code(int enable) do_for_each_ftrace_rec(pg, rec) { /* - * Skip over free records and records that have - * failed. + * Skip over free records, records that have + * failed and not converted. */ if (rec->flags & FTRACE_FL_FREE || - rec->flags & FTRACE_FL_FAILED) + rec->flags & FTRACE_FL_FAILED || + !(rec->flags & FTRACE_FL_CONVERTED)) continue; /* ignore updates to this record's mcount site */ @@ -547,7 +556,7 @@ static void ftrace_replace_code(int enable) } failed = __ftrace_replace_code(rec, enable); - if (failed && (rec->flags & FTRACE_FL_CONVERTED)) { + if (failed) { rec->flags |= FTRACE_FL_FAILED; if ((system_state == SYSTEM_BOOTING) || !core_kernel_text(rec->ip)) { @@ -714,19 +723,21 @@ unsigned long ftrace_update_tot_cnt; static int ftrace_update_code(struct module *mod) { - struct dyn_ftrace *p, *t; + struct dyn_ftrace *p; cycle_t start, stop; start = ftrace_now(raw_smp_processor_id()); ftrace_update_cnt = 0; - list_for_each_entry_safe(p, t, &ftrace_new_addrs, list) { + while (ftrace_new_addrs) { /* If something went wrong, bail without enabling anything */ if (unlikely(ftrace_disabled)) return -1; - list_del_init(&p->list); + p = ftrace_new_addrs; + ftrace_new_addrs = p->newlist; + p->flags = 0L; /* convert record (i.e, patch mcount-call with NOP) */ if (ftrace_code_disable(mod, p)) { @@ -1118,16 +1129,6 @@ ftrace_notrace_open(struct inode *inode, struct file *file) return ftrace_regex_open(inode, file, 0); } -static ssize_t -ftrace_regex_read(struct file *file, char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - if (file->f_mode & FMODE_READ) - return seq_read(file, ubuf, cnt, ppos); - else - return -EPERM; -} - static loff_t ftrace_regex_lseek(struct file *file, loff_t offset, int origin) { @@ -1880,7 +1881,7 @@ static const struct file_operations ftrace_failures_fops = { static const struct file_operations ftrace_filter_fops = { .open = ftrace_filter_open, - .read = ftrace_regex_read, + .read = seq_read, .write = ftrace_filter_write, .llseek = ftrace_regex_lseek, .release = ftrace_filter_release, @@ -1888,7 +1889,7 @@ static const struct file_operations ftrace_filter_fops = { static const struct file_operations ftrace_notrace_fops = { .open = ftrace_notrace_open, - .read = ftrace_regex_read, + .read = seq_read, .write = ftrace_notrace_write, .llseek = ftrace_regex_lseek, .release = ftrace_notrace_release, @@ -1990,16 +1991,6 @@ ftrace_graph_open(struct inode *inode, struct file *file) return ret; } -static ssize_t -ftrace_graph_read(struct file *file, char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - if (file->f_mode & FMODE_READ) - return seq_read(file, ubuf, cnt, ppos); - else - return -EPERM; -} - static int ftrace_set_func(unsigned long *array, int *idx, char *buffer) { @@ -2130,7 +2121,7 @@ ftrace_graph_write(struct file *file, const char __user *ubuf, static const struct file_operations ftrace_graph_fops = { .open = ftrace_graph_open, - .read = ftrace_graph_read, + .read = seq_read, .write = ftrace_graph_write, }; #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ @@ -2278,7 +2269,7 @@ ftrace_pid_read(struct file *file, char __user *ubuf, if (ftrace_pid_trace == ftrace_swapper_pid) r = sprintf(buf, "swapper tasks\n"); else if (ftrace_pid_trace) - r = sprintf(buf, "%u\n", pid_nr(ftrace_pid_trace)); + r = sprintf(buf, "%u\n", pid_vnr(ftrace_pid_trace)); else r = sprintf(buf, "no pid\n"); @@ -2606,6 +2597,38 @@ free: return ret; } +static void +ftrace_graph_probe_sched_switch(struct rq *__rq, struct task_struct *prev, + struct task_struct *next) +{ + unsigned long long timestamp; + int index; + + /* + * Does the user want to count the time a function was asleep. + * If so, do not update the time stamps. + */ + if (trace_flags & TRACE_ITER_SLEEP_TIME) + return; + + timestamp = trace_clock_local(); + + prev->ftrace_timestamp = timestamp; + + /* only process tasks that we timestamped */ + if (!next->ftrace_timestamp) + return; + + /* + * Update all the counters in next to make up for the + * time next was sleeping. + */ + timestamp -= next->ftrace_timestamp; + + for (index = next->curr_ret_stack; index >= 0; index--) + next->ret_stack[index].calltime += timestamp; +} + /* Allocate a return stack for each task */ static int start_graph_tracing(void) { @@ -2627,6 +2650,13 @@ static int start_graph_tracing(void) ret = alloc_retstack_tasklist(ret_stack_list); } while (ret == -EAGAIN); + if (!ret) { + ret = register_trace_sched_switch(ftrace_graph_probe_sched_switch); + if (ret) + pr_info("ftrace_graph: Couldn't activate tracepoint" + " probe to kernel_sched_switch\n"); + } + kfree(ret_stack_list); return ret; } @@ -2659,6 +2689,12 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc, mutex_lock(&ftrace_lock); + /* we currently allow only one tracer registered at a time */ + if (atomic_read(&ftrace_graph_active)) { + ret = -EBUSY; + goto out; + } + ftrace_suspend_notifier.notifier_call = ftrace_suspend_notifier_call; register_pm_notifier(&ftrace_suspend_notifier); @@ -2683,12 +2719,17 @@ void unregister_ftrace_graph(void) { mutex_lock(&ftrace_lock); + if (!unlikely(atomic_read(&ftrace_graph_active))) + goto out; + atomic_dec(&ftrace_graph_active); + unregister_trace_sched_switch(ftrace_graph_probe_sched_switch); ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; ftrace_graph_entry = ftrace_graph_entry_stub; ftrace_shutdown(FTRACE_STOP_FUNC_RET); unregister_pm_notifier(&ftrace_suspend_notifier); + out: mutex_unlock(&ftrace_lock); } @@ -2704,6 +2745,7 @@ void ftrace_graph_init_task(struct task_struct *t) t->curr_ret_stack = -1; atomic_set(&t->tracing_graph_pause, 0); atomic_set(&t->trace_overrun, 0); + t->ftrace_timestamp = 0; } else t->ret_stack = NULL; } diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c index ae201b3eda8..5011f4d91e3 100644 --- a/kernel/trace/kmemtrace.c +++ b/kernel/trace/kmemtrace.c @@ -6,14 +6,16 @@ * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com> */ -#include <linux/dcache.h> +#include <linux/tracepoint.h> +#include <linux/seq_file.h> #include <linux/debugfs.h> +#include <linux/dcache.h> #include <linux/fs.h> -#include <linux/seq_file.h> + #include <trace/kmemtrace.h> -#include "trace.h" #include "trace_output.h" +#include "trace.h" /* Select an alternative, minimalistic output than the original one */ #define TRACE_KMEM_OPT_MINIMAL 0x1 @@ -25,14 +27,156 @@ static struct tracer_opt kmem_opts[] = { }; static struct tracer_flags kmem_tracer_flags = { - .val = 0, - .opts = kmem_opts + .val = 0, + .opts = kmem_opts }; - -static bool kmem_tracing_enabled __read_mostly; static struct trace_array *kmemtrace_array; +/* Trace allocations */ +static inline void kmemtrace_alloc(enum kmemtrace_type_id type_id, + unsigned long call_site, + const void *ptr, + size_t bytes_req, + size_t bytes_alloc, + gfp_t gfp_flags, + int node) +{ + struct trace_array *tr = kmemtrace_array; + struct kmemtrace_alloc_entry *entry; + struct ring_buffer_event *event; + + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry)); + if (!event) + return; + + entry = ring_buffer_event_data(event); + tracing_generic_entry_update(&entry->ent, 0, 0); + + entry->ent.type = TRACE_KMEM_ALLOC; + entry->type_id = type_id; + entry->call_site = call_site; + entry->ptr = ptr; + entry->bytes_req = bytes_req; + entry->bytes_alloc = bytes_alloc; + entry->gfp_flags = gfp_flags; + entry->node = node; + + ring_buffer_unlock_commit(tr->buffer, event); + + trace_wake_up(); +} + +static inline void kmemtrace_free(enum kmemtrace_type_id type_id, + unsigned long call_site, + const void *ptr) +{ + struct trace_array *tr = kmemtrace_array; + struct kmemtrace_free_entry *entry; + struct ring_buffer_event *event; + + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry)); + if (!event) + return; + entry = ring_buffer_event_data(event); + tracing_generic_entry_update(&entry->ent, 0, 0); + + entry->ent.type = TRACE_KMEM_FREE; + entry->type_id = type_id; + entry->call_site = call_site; + entry->ptr = ptr; + + ring_buffer_unlock_commit(tr->buffer, event); + + trace_wake_up(); +} + +static void kmemtrace_kmalloc(unsigned long call_site, + const void *ptr, + size_t bytes_req, + size_t bytes_alloc, + gfp_t gfp_flags) +{ + kmemtrace_alloc(KMEMTRACE_TYPE_KMALLOC, call_site, ptr, + bytes_req, bytes_alloc, gfp_flags, -1); +} + +static void kmemtrace_kmem_cache_alloc(unsigned long call_site, + const void *ptr, + size_t bytes_req, + size_t bytes_alloc, + gfp_t gfp_flags) +{ + kmemtrace_alloc(KMEMTRACE_TYPE_CACHE, call_site, ptr, + bytes_req, bytes_alloc, gfp_flags, -1); +} + +static void kmemtrace_kmalloc_node(unsigned long call_site, + const void *ptr, + size_t bytes_req, + size_t bytes_alloc, + gfp_t gfp_flags, + int node) +{ + kmemtrace_alloc(KMEMTRACE_TYPE_KMALLOC, call_site, ptr, + bytes_req, bytes_alloc, gfp_flags, node); +} + +static void kmemtrace_kmem_cache_alloc_node(unsigned long call_site, + const void *ptr, + size_t bytes_req, + size_t bytes_alloc, + gfp_t gfp_flags, + int node) +{ + kmemtrace_alloc(KMEMTRACE_TYPE_CACHE, call_site, ptr, + bytes_req, bytes_alloc, gfp_flags, node); +} + +static void kmemtrace_kfree(unsigned long call_site, const void *ptr) +{ + kmemtrace_free(KMEMTRACE_TYPE_KMALLOC, call_site, ptr); +} + +static void kmemtrace_kmem_cache_free(unsigned long call_site, const void *ptr) +{ + kmemtrace_free(KMEMTRACE_TYPE_CACHE, call_site, ptr); +} + +static int kmemtrace_start_probes(void) +{ + int err; + + err = register_trace_kmalloc(kmemtrace_kmalloc); + if (err) + return err; + err = register_trace_kmem_cache_alloc(kmemtrace_kmem_cache_alloc); + if (err) + return err; + err = register_trace_kmalloc_node(kmemtrace_kmalloc_node); + if (err) + return err; + err = register_trace_kmem_cache_alloc_node(kmemtrace_kmem_cache_alloc_node); + if (err) + return err; + err = register_trace_kfree(kmemtrace_kfree); + if (err) + return err; + err = register_trace_kmem_cache_free(kmemtrace_kmem_cache_free); + + return err; +} + +static void kmemtrace_stop_probes(void) +{ + unregister_trace_kmalloc(kmemtrace_kmalloc); + unregister_trace_kmem_cache_alloc(kmemtrace_kmem_cache_alloc); + unregister_trace_kmalloc_node(kmemtrace_kmalloc_node); + unregister_trace_kmem_cache_alloc_node(kmemtrace_kmem_cache_alloc_node); + unregister_trace_kfree(kmemtrace_kfree); + unregister_trace_kmem_cache_free(kmemtrace_kmem_cache_free); +} + static int kmem_trace_init(struct trace_array *tr) { int cpu; @@ -41,14 +185,14 @@ static int kmem_trace_init(struct trace_array *tr) for_each_cpu_mask(cpu, cpu_possible_map) tracing_reset(tr, cpu); - kmem_tracing_enabled = true; + kmemtrace_start_probes(); return 0; } static void kmem_trace_reset(struct trace_array *tr) { - kmem_tracing_enabled = false; + kmemtrace_stop_probes(); } static void kmemtrace_headers(struct seq_file *s) @@ -66,47 +210,84 @@ static void kmemtrace_headers(struct seq_file *s) } /* - * The two following functions give the original output from kmemtrace, - * or something close to....perhaps they need some missing things + * The following functions give the original output from kmemtrace, + * plus the origin CPU, since reordering occurs in-kernel now. */ + +#define KMEMTRACE_USER_ALLOC 0 +#define KMEMTRACE_USER_FREE 1 + +struct kmemtrace_user_event { + u8 event_id; + u8 type_id; + u16 event_size; + u32 cpu; + u64 timestamp; + unsigned long call_site; + unsigned long ptr; +}; + +struct kmemtrace_user_event_alloc { + size_t bytes_req; + size_t bytes_alloc; + unsigned gfp_flags; + int node; +}; + static enum print_line_t -kmemtrace_print_alloc_original(struct trace_iterator *iter, - struct kmemtrace_alloc_entry *entry) +kmemtrace_print_alloc_user(struct trace_iterator *iter, + struct kmemtrace_alloc_entry *entry) { + struct kmemtrace_user_event_alloc *ev_alloc; struct trace_seq *s = &iter->seq; - int ret; + struct kmemtrace_user_event *ev; + + ev = trace_seq_reserve(s, sizeof(*ev)); + if (!ev) + return TRACE_TYPE_PARTIAL_LINE; - /* Taken from the old linux/kmemtrace.h */ - ret = trace_seq_printf(s, "type_id %d call_site %lu ptr %lu " - "bytes_req %lu bytes_alloc %lu gfp_flags %lu node %d\n", - entry->type_id, entry->call_site, (unsigned long) entry->ptr, - (unsigned long) entry->bytes_req, (unsigned long) entry->bytes_alloc, - (unsigned long) entry->gfp_flags, entry->node); + ev->event_id = KMEMTRACE_USER_ALLOC; + ev->type_id = entry->type_id; + ev->event_size = sizeof(*ev) + sizeof(*ev_alloc); + ev->cpu = iter->cpu; + ev->timestamp = iter->ts; + ev->call_site = entry->call_site; + ev->ptr = (unsigned long)entry->ptr; - if (!ret) + ev_alloc = trace_seq_reserve(s, sizeof(*ev_alloc)); + if (!ev_alloc) return TRACE_TYPE_PARTIAL_LINE; + ev_alloc->bytes_req = entry->bytes_req; + ev_alloc->bytes_alloc = entry->bytes_alloc; + ev_alloc->gfp_flags = entry->gfp_flags; + ev_alloc->node = entry->node; + return TRACE_TYPE_HANDLED; } static enum print_line_t -kmemtrace_print_free_original(struct trace_iterator *iter, - struct kmemtrace_free_entry *entry) +kmemtrace_print_free_user(struct trace_iterator *iter, + struct kmemtrace_free_entry *entry) { struct trace_seq *s = &iter->seq; - int ret; + struct kmemtrace_user_event *ev; - /* Taken from the old linux/kmemtrace.h */ - ret = trace_seq_printf(s, "type_id %d call_site %lu ptr %lu\n", - entry->type_id, entry->call_site, (unsigned long) entry->ptr); - - if (!ret) + ev = trace_seq_reserve(s, sizeof(*ev)); + if (!ev) return TRACE_TYPE_PARTIAL_LINE; + ev->event_id = KMEMTRACE_USER_FREE; + ev->type_id = entry->type_id; + ev->event_size = sizeof(*ev); + ev->cpu = iter->cpu; + ev->timestamp = iter->ts; + ev->call_site = entry->call_site; + ev->ptr = (unsigned long)entry->ptr; + return TRACE_TYPE_HANDLED; } - /* The two other following provide a more minimalistic output */ static enum print_line_t kmemtrace_print_alloc_compress(struct trace_iterator *iter, @@ -178,7 +359,7 @@ kmemtrace_print_alloc_compress(struct trace_iterator *iter, static enum print_line_t kmemtrace_print_free_compress(struct trace_iterator *iter, - struct kmemtrace_free_entry *entry) + struct kmemtrace_free_entry *entry) { struct trace_seq *s = &iter->seq; int ret; @@ -239,20 +420,22 @@ static enum print_line_t kmemtrace_print_line(struct trace_iterator *iter) switch (entry->type) { case TRACE_KMEM_ALLOC: { struct kmemtrace_alloc_entry *field; + trace_assign_type(field, entry); if (kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL) return kmemtrace_print_alloc_compress(iter, field); else - return kmemtrace_print_alloc_original(iter, field); + return kmemtrace_print_alloc_user(iter, field); } case TRACE_KMEM_FREE: { struct kmemtrace_free_entry *field; + trace_assign_type(field, entry); if (kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL) return kmemtrace_print_free_compress(iter, field); else - return kmemtrace_print_free_original(iter, field); + return kmemtrace_print_free_user(iter, field); } default: @@ -260,70 +443,13 @@ static enum print_line_t kmemtrace_print_line(struct trace_iterator *iter) } } -/* Trace allocations */ -void kmemtrace_mark_alloc_node(enum kmemtrace_type_id type_id, - unsigned long call_site, - const void *ptr, - size_t bytes_req, - size_t bytes_alloc, - gfp_t gfp_flags, - int node) -{ - struct ring_buffer_event *event; - struct kmemtrace_alloc_entry *entry; - struct trace_array *tr = kmemtrace_array; - - if (!kmem_tracing_enabled) - return; - - event = trace_buffer_lock_reserve(tr, TRACE_KMEM_ALLOC, - sizeof(*entry), 0, 0); - if (!event) - return; - entry = ring_buffer_event_data(event); - - entry->call_site = call_site; - entry->ptr = ptr; - entry->bytes_req = bytes_req; - entry->bytes_alloc = bytes_alloc; - entry->gfp_flags = gfp_flags; - entry->node = node; - - trace_buffer_unlock_commit(tr, event, 0, 0); -} -EXPORT_SYMBOL(kmemtrace_mark_alloc_node); - -void kmemtrace_mark_free(enum kmemtrace_type_id type_id, - unsigned long call_site, - const void *ptr) -{ - struct ring_buffer_event *event; - struct kmemtrace_free_entry *entry; - struct trace_array *tr = kmemtrace_array; - - if (!kmem_tracing_enabled) - return; - - event = trace_buffer_lock_reserve(tr, TRACE_KMEM_FREE, - sizeof(*entry), 0, 0); - if (!event) - return; - entry = ring_buffer_event_data(event); - entry->type_id = type_id; - entry->call_site = call_site; - entry->ptr = ptr; - - trace_buffer_unlock_commit(tr, event, 0, 0); -} -EXPORT_SYMBOL(kmemtrace_mark_free); - static struct tracer kmem_tracer __read_mostly = { - .name = "kmemtrace", - .init = kmem_trace_init, - .reset = kmem_trace_reset, - .print_line = kmemtrace_print_line, - .print_header = kmemtrace_headers, - .flags = &kmem_tracer_flags + .name = "kmemtrace", + .init = kmem_trace_init, + .reset = kmem_trace_reset, + .print_line = kmemtrace_print_line, + .print_header = kmemtrace_headers, + .flags = &kmem_tracer_flags }; void kmemtrace_init(void) @@ -335,5 +461,4 @@ static int __init init_kmem_tracer(void) { return register_tracer(&kmem_tracer); } - device_initcall(init_kmem_tracer); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 58128ad2fde..960cbf44c84 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -180,48 +180,74 @@ EXPORT_SYMBOL_GPL(tracing_is_on); #include "trace.h" -/* Up this if you want to test the TIME_EXTENTS and normalization */ -#define DEBUG_SHIFT 0 +#define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array)) +#define RB_ALIGNMENT 4U +#define RB_MAX_SMALL_DATA 28 -u64 ring_buffer_time_stamp(int cpu) +enum { + RB_LEN_TIME_EXTEND = 8, + RB_LEN_TIME_STAMP = 16, +}; + +static inline int rb_null_event(struct ring_buffer_event *event) { - u64 time; + return event->type == RINGBUF_TYPE_PADDING && event->time_delta == 0; +} - preempt_disable_notrace(); - /* shift to debug/test normalization and TIME_EXTENTS */ - time = trace_clock_local() << DEBUG_SHIFT; - preempt_enable_no_resched_notrace(); +static inline int rb_discarded_event(struct ring_buffer_event *event) +{ + return event->type == RINGBUF_TYPE_PADDING && event->time_delta; +} - return time; +static void rb_event_set_padding(struct ring_buffer_event *event) +{ + event->type = RINGBUF_TYPE_PADDING; + event->time_delta = 0; } -EXPORT_SYMBOL_GPL(ring_buffer_time_stamp); -void ring_buffer_normalize_time_stamp(int cpu, u64 *ts) +/** + * ring_buffer_event_discard - discard an event in the ring buffer + * @buffer: the ring buffer + * @event: the event to discard + * + * Sometimes a event that is in the ring buffer needs to be ignored. + * This function lets the user discard an event in the ring buffer + * and then that event will not be read later. + * + * Note, it is up to the user to be careful with this, and protect + * against races. If the user discards an event that has been consumed + * it is possible that it could corrupt the ring buffer. + */ +void ring_buffer_event_discard(struct ring_buffer_event *event) { - /* Just stupid testing the normalize function and deltas */ - *ts >>= DEBUG_SHIFT; + event->type = RINGBUF_TYPE_PADDING; + /* time delta must be non zero */ + if (!event->time_delta) + event->time_delta = 1; } -EXPORT_SYMBOL_GPL(ring_buffer_normalize_time_stamp); -#define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array)) -#define RB_ALIGNMENT 4U -#define RB_MAX_SMALL_DATA 28 +static unsigned +rb_event_data_length(struct ring_buffer_event *event) +{ + unsigned length; -enum { - RB_LEN_TIME_EXTEND = 8, - RB_LEN_TIME_STAMP = 16, -}; + if (event->len) + length = event->len * RB_ALIGNMENT; + else + length = event->array[0]; + return length + RB_EVNT_HDR_SIZE; +} /* inline for ring buffer fast paths */ static unsigned rb_event_length(struct ring_buffer_event *event) { - unsigned length; - switch (event->type) { case RINGBUF_TYPE_PADDING: - /* undefined */ - return -1; + if (rb_null_event(event)) + /* undefined */ + return -1; + return rb_event_data_length(event); case RINGBUF_TYPE_TIME_EXTEND: return RB_LEN_TIME_EXTEND; @@ -230,11 +256,7 @@ rb_event_length(struct ring_buffer_event *event) return RB_LEN_TIME_STAMP; case RINGBUF_TYPE_DATA: - if (event->len) - length = event->len * RB_ALIGNMENT; - else - length = event->array[0]; - return length + RB_EVNT_HDR_SIZE; + return rb_event_data_length(event); default: BUG(); } @@ -374,6 +396,7 @@ struct ring_buffer { #ifdef CONFIG_HOTPLUG_CPU struct notifier_block cpu_notify; #endif + u64 (*clock)(void); }; struct ring_buffer_iter { @@ -394,6 +417,30 @@ struct ring_buffer_iter { _____ret; \ }) +/* Up this if you want to test the TIME_EXTENTS and normalization */ +#define DEBUG_SHIFT 0 + +u64 ring_buffer_time_stamp(struct ring_buffer *buffer, int cpu) +{ + u64 time; + + preempt_disable_notrace(); + /* shift to debug/test normalization and TIME_EXTENTS */ + time = buffer->clock() << DEBUG_SHIFT; + preempt_enable_no_resched_notrace(); + + return time; +} +EXPORT_SYMBOL_GPL(ring_buffer_time_stamp); + +void ring_buffer_normalize_time_stamp(struct ring_buffer *buffer, + int cpu, u64 *ts) +{ + /* Just stupid testing the normalize function and deltas */ + *ts >>= DEBUG_SHIFT; +} +EXPORT_SYMBOL_GPL(ring_buffer_normalize_time_stamp); + /** * check_pages - integrity check of buffer pages * @cpu_buffer: CPU buffer with pages to test @@ -516,7 +563,6 @@ static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer) struct list_head *head = &cpu_buffer->pages; struct buffer_page *bpage, *tmp; - list_del_init(&cpu_buffer->reader_page->list); free_buffer_page(cpu_buffer->reader_page); list_for_each_entry_safe(bpage, tmp, head, list) { @@ -533,8 +579,8 @@ static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer) extern int ring_buffer_page_too_big(void); #ifdef CONFIG_HOTPLUG_CPU -static int __cpuinit rb_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu); +static int rb_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu); #endif /** @@ -569,13 +615,23 @@ struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags) buffer->pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE); buffer->flags = flags; + buffer->clock = trace_clock_local; /* need at least two pages */ if (buffer->pages == 1) buffer->pages++; + /* + * In case of non-hotplug cpu, if the ring-buffer is allocated + * in early initcall, it will not be notified of secondary cpus. + * In that off case, we need to allocate for all possible cpus. + */ +#ifdef CONFIG_HOTPLUG_CPU get_online_cpus(); cpumask_copy(buffer->cpumask, cpu_online_mask); +#else + cpumask_copy(buffer->cpumask, cpu_possible_mask); +#endif buffer->cpus = nr_cpu_ids; bsize = sizeof(void *) * nr_cpu_ids; @@ -645,6 +701,12 @@ ring_buffer_free(struct ring_buffer *buffer) } EXPORT_SYMBOL_GPL(ring_buffer_free); +void ring_buffer_set_clock(struct ring_buffer *buffer, + u64 (*clock)(void)) +{ + buffer->clock = clock; +} + static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer); static void @@ -827,11 +889,6 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size) } EXPORT_SYMBOL_GPL(ring_buffer_resize); -static inline int rb_null_event(struct ring_buffer_event *event) -{ - return event->type == RINGBUF_TYPE_PADDING; -} - static inline void * __rb_data_page_index(struct buffer_data_page *bpage, unsigned index) { @@ -1191,7 +1248,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, cpu_buffer->tail_page = next_page; /* reread the time stamp */ - *ts = ring_buffer_time_stamp(cpu_buffer->cpu); + *ts = ring_buffer_time_stamp(buffer, cpu_buffer->cpu); cpu_buffer->tail_page->page->time_stamp = *ts; } @@ -1201,7 +1258,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, if (tail < BUF_PAGE_SIZE) { /* Mark the rest of the page with padding */ event = __rb_page_index(tail_page, tail); - event->type = RINGBUF_TYPE_PADDING; + rb_event_set_padding(event); } if (tail <= BUF_PAGE_SIZE) @@ -1334,7 +1391,7 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer, if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000)) return NULL; - ts = ring_buffer_time_stamp(cpu_buffer->cpu); + ts = ring_buffer_time_stamp(cpu_buffer->buffer, cpu_buffer->cpu); /* * Only the first commit can update the timestamp. @@ -1951,7 +2008,7 @@ static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer) event = rb_reader_event(cpu_buffer); - if (event->type == RINGBUF_TYPE_DATA) + if (event->type == RINGBUF_TYPE_DATA || rb_discarded_event(event)) cpu_buffer->entries--; rb_update_read_stamp(cpu_buffer, event); @@ -2034,9 +2091,18 @@ rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) switch (event->type) { case RINGBUF_TYPE_PADDING: - RB_WARN_ON(cpu_buffer, 1); + if (rb_null_event(event)) + RB_WARN_ON(cpu_buffer, 1); + /* + * Because the writer could be discarding every + * event it creates (which would probably be bad) + * if we were to go back to "again" then we may never + * catch up, and will trigger the warn on, or lock + * the box. Return the padding, and we will release + * the current locks, and try again. + */ rb_advance_reader(cpu_buffer); - return NULL; + return event; case RINGBUF_TYPE_TIME_EXTEND: /* Internal data, OK to advance */ @@ -2051,7 +2117,8 @@ rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) case RINGBUF_TYPE_DATA: if (ts) { *ts = cpu_buffer->read_stamp + event->time_delta; - ring_buffer_normalize_time_stamp(cpu_buffer->cpu, ts); + ring_buffer_normalize_time_stamp(buffer, + cpu_buffer->cpu, ts); } return event; @@ -2096,8 +2163,12 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts) switch (event->type) { case RINGBUF_TYPE_PADDING: - rb_inc_iter(iter); - goto again; + if (rb_null_event(event)) { + rb_inc_iter(iter); + goto again; + } + rb_advance_iter(iter); + return event; case RINGBUF_TYPE_TIME_EXTEND: /* Internal data, OK to advance */ @@ -2112,7 +2183,8 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts) case RINGBUF_TYPE_DATA: if (ts) { *ts = iter->read_stamp + event->time_delta; - ring_buffer_normalize_time_stamp(cpu_buffer->cpu, ts); + ring_buffer_normalize_time_stamp(buffer, + cpu_buffer->cpu, ts); } return event; @@ -2143,10 +2215,16 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) if (!cpumask_test_cpu(cpu, buffer->cpumask)) return NULL; + again: spin_lock_irqsave(&cpu_buffer->reader_lock, flags); event = rb_buffer_peek(buffer, cpu, ts); spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + if (event && event->type == RINGBUF_TYPE_PADDING) { + cpu_relax(); + goto again; + } + return event; } @@ -2165,10 +2243,16 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts) struct ring_buffer_event *event; unsigned long flags; + again: spin_lock_irqsave(&cpu_buffer->reader_lock, flags); event = rb_iter_peek(iter, ts); spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + if (event && event->type == RINGBUF_TYPE_PADDING) { + cpu_relax(); + goto again; + } + return event; } @@ -2187,6 +2271,7 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts) struct ring_buffer_event *event = NULL; unsigned long flags; + again: /* might be called in atomic */ preempt_disable(); @@ -2208,6 +2293,11 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts) out: preempt_enable(); + if (event && event->type == RINGBUF_TYPE_PADDING) { + cpu_relax(); + goto again; + } + return event; } EXPORT_SYMBOL_GPL(ring_buffer_consume); @@ -2286,6 +2376,7 @@ ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts) struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; unsigned long flags; + again: spin_lock_irqsave(&cpu_buffer->reader_lock, flags); event = rb_iter_peek(iter, ts); if (!event) @@ -2295,6 +2386,11 @@ ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts) out: spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + if (event && event->type == RINGBUF_TYPE_PADDING) { + cpu_relax(); + goto again; + } + return event; } EXPORT_SYMBOL_GPL(ring_buffer_read); @@ -2764,8 +2860,8 @@ static __init int rb_init_debugfs(void) fs_initcall(rb_init_debugfs); #ifdef CONFIG_HOTPLUG_CPU -static int __cpuinit rb_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu) +static int rb_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) { struct ring_buffer *buffer = container_of(self, struct ring_buffer, cpu_notify); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index efe3202c020..a0174a40c56 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -155,13 +155,6 @@ ns2usecs(cycle_t nsec) return nsec; } -cycle_t ftrace_now(int cpu) -{ - u64 ts = ring_buffer_time_stamp(cpu); - ring_buffer_normalize_time_stamp(cpu, &ts); - return ts; -} - /* * The global_trace is the descriptor that holds the tracing * buffers for the live tracing. For each CPU, it contains @@ -178,6 +171,20 @@ static struct trace_array global_trace; static DEFINE_PER_CPU(struct trace_array_cpu, global_trace_cpu); +cycle_t ftrace_now(int cpu) +{ + u64 ts; + + /* Early boot up does not have a buffer yet */ + if (!global_trace.buffer) + return trace_clock_local(); + + ts = ring_buffer_time_stamp(global_trace.buffer, cpu); + ring_buffer_normalize_time_stamp(global_trace.buffer, cpu, &ts); + + return ts; +} + /* * The max_tr is used to snapshot the global_trace when a maximum * latency is reached. Some tracers will use this to store a maximum @@ -248,7 +255,7 @@ static DECLARE_WAIT_QUEUE_HEAD(trace_wait); /* trace_flags holds trace_options default values */ unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | - TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO; + TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME; /** * trace_wake_up - wake up tasks waiting for trace input @@ -308,6 +315,8 @@ static const char *trace_options[] = { "printk-msg-only", "context-info", "latency-format", + "global-clock", + "sleep-time", NULL }; @@ -374,7 +383,7 @@ ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt) return cnt; } -ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt) +static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt) { int len; void *ret; @@ -633,6 +642,7 @@ void tracing_reset_online_cpus(struct trace_array *tr) } #define SAVED_CMDLINES 128 +#define NO_CMDLINE_MAP UINT_MAX static unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1]; static unsigned map_cmdline_to_pid[SAVED_CMDLINES]; static char saved_cmdlines[SAVED_CMDLINES][TASK_COMM_LEN]; @@ -644,8 +654,8 @@ static atomic_t trace_record_cmdline_disabled __read_mostly; static void trace_init_cmdlines(void) { - memset(&map_pid_to_cmdline, -1, sizeof(map_pid_to_cmdline)); - memset(&map_cmdline_to_pid, -1, sizeof(map_cmdline_to_pid)); + memset(&map_pid_to_cmdline, NO_CMDLINE_MAP, sizeof(map_pid_to_cmdline)); + memset(&map_cmdline_to_pid, NO_CMDLINE_MAP, sizeof(map_cmdline_to_pid)); cmdline_idx = 0; } @@ -737,8 +747,7 @@ void trace_stop_cmdline_recording(void); static void trace_save_cmdline(struct task_struct *tsk) { - unsigned map; - unsigned idx; + unsigned pid, idx; if (!tsk->pid || unlikely(tsk->pid > PID_MAX_DEFAULT)) return; @@ -753,13 +762,20 @@ static void trace_save_cmdline(struct task_struct *tsk) return; idx = map_pid_to_cmdline[tsk->pid]; - if (idx >= SAVED_CMDLINES) { + if (idx == NO_CMDLINE_MAP) { idx = (cmdline_idx + 1) % SAVED_CMDLINES; - map = map_cmdline_to_pid[idx]; - if (map <= PID_MAX_DEFAULT) - map_pid_to_cmdline[map] = (unsigned)-1; + /* + * Check whether the cmdline buffer at idx has a pid + * mapped. We are going to overwrite that entry so we + * need to clear the map_pid_to_cmdline. Otherwise we + * would read the new comm for the old pid. + */ + pid = map_cmdline_to_pid[idx]; + if (pid != NO_CMDLINE_MAP) + map_pid_to_cmdline[pid] = NO_CMDLINE_MAP; + map_cmdline_to_pid[idx] = tsk->pid; map_pid_to_cmdline[tsk->pid] = idx; cmdline_idx = idx; @@ -770,30 +786,34 @@ static void trace_save_cmdline(struct task_struct *tsk) __raw_spin_unlock(&trace_cmdline_lock); } -char *trace_find_cmdline(int pid) +void trace_find_cmdline(int pid, char comm[]) { - char *cmdline = "<...>"; unsigned map; - if (!pid) - return "<idle>"; + if (!pid) { + strcpy(comm, "<idle>"); + return; + } - if (pid > PID_MAX_DEFAULT) - goto out; + if (pid > PID_MAX_DEFAULT) { + strcpy(comm, "<...>"); + return; + } + __raw_spin_lock(&trace_cmdline_lock); map = map_pid_to_cmdline[pid]; - if (map >= SAVED_CMDLINES) - goto out; - - cmdline = saved_cmdlines[map]; + if (map != NO_CMDLINE_MAP) + strcpy(comm, saved_cmdlines[map]); + else + strcpy(comm, "<...>"); - out: - return cmdline; + __raw_spin_unlock(&trace_cmdline_lock); } void tracing_record_cmdline(struct task_struct *tsk) { - if (atomic_read(&trace_record_cmdline_disabled)) + if (atomic_read(&trace_record_cmdline_disabled) || !tracer_enabled || + !tracing_is_on()) return; trace_save_cmdline(tsk); @@ -841,15 +861,25 @@ static void ftrace_trace_stack(struct trace_array *tr, static void ftrace_trace_userstack(struct trace_array *tr, unsigned long flags, int pc); -void trace_buffer_unlock_commit(struct trace_array *tr, - struct ring_buffer_event *event, - unsigned long flags, int pc) +static inline void __trace_buffer_unlock_commit(struct trace_array *tr, + struct ring_buffer_event *event, + unsigned long flags, int pc, + int wake) { ring_buffer_unlock_commit(tr->buffer, event); ftrace_trace_stack(tr, flags, 6, pc); ftrace_trace_userstack(tr, flags, pc); - trace_wake_up(); + + if (wake) + trace_wake_up(); +} + +void trace_buffer_unlock_commit(struct trace_array *tr, + struct ring_buffer_event *event, + unsigned long flags, int pc) +{ + __trace_buffer_unlock_commit(tr, event, flags, pc, 1); } struct ring_buffer_event * @@ -863,7 +893,13 @@ trace_current_buffer_lock_reserve(unsigned char type, unsigned long len, void trace_current_buffer_unlock_commit(struct ring_buffer_event *event, unsigned long flags, int pc) { - return trace_buffer_unlock_commit(&global_trace, event, flags, pc); + return __trace_buffer_unlock_commit(&global_trace, event, flags, pc, 1); +} + +void trace_nowake_buffer_unlock_commit(struct ring_buffer_event *event, + unsigned long flags, int pc) +{ + return __trace_buffer_unlock_commit(&global_trace, event, flags, pc, 0); } void @@ -889,7 +925,7 @@ trace_function(struct trace_array *tr, } #ifdef CONFIG_FUNCTION_GRAPH_TRACER -static void __trace_graph_entry(struct trace_array *tr, +static int __trace_graph_entry(struct trace_array *tr, struct ftrace_graph_ent *trace, unsigned long flags, int pc) @@ -898,15 +934,17 @@ static void __trace_graph_entry(struct trace_array *tr, struct ftrace_graph_ent_entry *entry; if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled)))) - return; + return 0; event = trace_buffer_lock_reserve(&global_trace, TRACE_GRAPH_ENT, sizeof(*entry), flags, pc); if (!event) - return; + return 0; entry = ring_buffer_event_data(event); entry->graph_ent = *trace; ring_buffer_unlock_commit(global_trace.buffer, event); + + return 1; } static void __trace_graph_return(struct trace_array *tr, @@ -1127,6 +1165,7 @@ int trace_graph_entry(struct ftrace_graph_ent *trace) struct trace_array_cpu *data; unsigned long flags; long disabled; + int ret; int cpu; int pc; @@ -1142,15 +1181,18 @@ int trace_graph_entry(struct ftrace_graph_ent *trace) disabled = atomic_inc_return(&data->disabled); if (likely(disabled == 1)) { pc = preempt_count(); - __trace_graph_entry(tr, trace, flags, pc); + ret = __trace_graph_entry(tr, trace, flags, pc); + } else { + ret = 0; } /* Only do the atomic if it is not already set */ if (!test_tsk_trace_graph(current)) set_tsk_trace_graph(current); + atomic_dec(&data->disabled); local_irq_restore(flags); - return 1; + return ret; } void trace_graph_return(struct ftrace_graph_ret *trace) @@ -1182,7 +1224,7 @@ void trace_graph_return(struct ftrace_graph_ret *trace) * trace_vbprintk - write binary msg to tracing buffer * */ -int trace_vbprintk(unsigned long ip, int depth, const char *fmt, va_list args) +int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) { static raw_spinlock_t trace_buf_lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; @@ -1224,7 +1266,6 @@ int trace_vbprintk(unsigned long ip, int depth, const char *fmt, va_list args) goto out_unlock; entry = ring_buffer_event_data(event); entry->ip = ip; - entry->depth = depth; entry->fmt = fmt; memcpy(entry->buf, trace_buf, sizeof(u32) * len); @@ -1242,7 +1283,7 @@ out: } EXPORT_SYMBOL_GPL(trace_vbprintk); -int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args) +int trace_vprintk(unsigned long ip, const char *fmt, va_list args) { static raw_spinlock_t trace_buf_lock = __RAW_SPIN_LOCK_UNLOCKED; static char trace_buf[TRACE_BUF_SIZE]; @@ -1279,7 +1320,6 @@ int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args) goto out_unlock; entry = ring_buffer_event_data(event); entry->ip = ip; - entry->depth = depth; memcpy(&entry->buf, trace_buf, len); entry->buf[len] = 0; @@ -1682,38 +1722,6 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter) return TRACE_TYPE_HANDLED; } -static enum print_line_t print_bprintk_msg_only(struct trace_iterator *iter) -{ - struct trace_seq *s = &iter->seq; - struct trace_entry *entry = iter->ent; - struct bprint_entry *field; - int ret; - - trace_assign_type(field, entry); - - ret = trace_seq_bprintf(s, field->fmt, field->buf); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - return TRACE_TYPE_HANDLED; -} - -static enum print_line_t print_printk_msg_only(struct trace_iterator *iter) -{ - struct trace_seq *s = &iter->seq; - struct trace_entry *entry = iter->ent; - struct print_entry *field; - int ret; - - trace_assign_type(field, entry); - - ret = trace_seq_printf(s, "%s", field->buf); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - return TRACE_TYPE_HANDLED; -} - static enum print_line_t print_bin_fmt(struct trace_iterator *iter) { struct trace_seq *s = &iter->seq; @@ -1775,12 +1783,12 @@ static enum print_line_t print_trace_line(struct trace_iterator *iter) if (iter->ent->type == TRACE_BPRINT && trace_flags & TRACE_ITER_PRINTK && trace_flags & TRACE_ITER_PRINTK_MSGONLY) - return print_bprintk_msg_only(iter); + return trace_print_bprintk_msg_only(iter); if (iter->ent->type == TRACE_PRINT && trace_flags & TRACE_ITER_PRINTK && trace_flags & TRACE_ITER_PRINTK_MSGONLY) - return print_printk_msg_only(iter); + return trace_print_printk_msg_only(iter); if (trace_flags & TRACE_ITER_BIN) return print_bin_fmt(iter); @@ -1929,9 +1937,14 @@ int tracing_open_generic(struct inode *inode, struct file *filp) static int tracing_release(struct inode *inode, struct file *file) { struct seq_file *m = (struct seq_file *)file->private_data; - struct trace_iterator *iter = m->private; + struct trace_iterator *iter; int cpu; + if (!(file->f_mode & FMODE_READ)) + return 0; + + iter = m->private; + mutex_lock(&trace_types_lock); for_each_tracing_cpu(cpu) { if (iter->buffer_iter[cpu]) @@ -1957,12 +1970,24 @@ static int tracing_open(struct inode *inode, struct file *file) struct trace_iterator *iter; int ret = 0; - iter = __tracing_open(inode, file); - if (IS_ERR(iter)) - ret = PTR_ERR(iter); - else if (trace_flags & TRACE_ITER_LATENCY_FMT) - iter->iter_flags |= TRACE_FILE_LAT_FMT; + /* If this file was open for write, then erase contents */ + if ((file->f_mode & FMODE_WRITE) && + !(file->f_flags & O_APPEND)) { + long cpu = (long) inode->i_private; + if (cpu == TRACE_PIPE_ALL_CPU) + tracing_reset_online_cpus(&global_trace); + else + tracing_reset(&global_trace, cpu); + } + + if (file->f_mode & FMODE_READ) { + iter = __tracing_open(inode, file); + if (IS_ERR(iter)) + ret = PTR_ERR(iter); + else if (trace_flags & TRACE_ITER_LATENCY_FMT) + iter->iter_flags |= TRACE_FILE_LAT_FMT; + } return ret; } @@ -2037,9 +2062,17 @@ static int show_traces_open(struct inode *inode, struct file *file) return ret; } +static ssize_t +tracing_write_stub(struct file *filp, const char __user *ubuf, + size_t count, loff_t *ppos) +{ + return count; +} + static const struct file_operations tracing_fops = { .open = tracing_open, .read = seq_read, + .write = tracing_write_stub, .llseek = seq_lseek, .release = tracing_release, }; @@ -2240,6 +2273,34 @@ static int set_tracer_option(struct tracer *trace, char *cmp, int neg) return 0; } +static void set_tracer_flags(unsigned int mask, int enabled) +{ + /* do nothing if flag is already set */ + if (!!(trace_flags & mask) == !!enabled) + return; + + if (enabled) + trace_flags |= mask; + else + trace_flags &= ~mask; + + if (mask == TRACE_ITER_GLOBAL_CLK) { + u64 (*func)(void); + + if (enabled) + func = trace_clock_global; + else + func = trace_clock_local; + + mutex_lock(&trace_types_lock); + ring_buffer_set_clock(global_trace.buffer, func); + + if (max_tr.buffer) + ring_buffer_set_clock(max_tr.buffer, func); + mutex_unlock(&trace_types_lock); + } +} + static ssize_t tracing_trace_options_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) @@ -2267,10 +2328,7 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf, int len = strlen(trace_options[i]); if (strncmp(cmp, trace_options[i], len) == 0) { - if (neg) - trace_flags &= ~(1 << i); - else - trace_flags |= (1 << i); + set_tracer_flags(1 << i, !neg); break; } } @@ -2494,7 +2552,7 @@ static int tracing_set_tracer(const char *buf) if (!ring_buffer_expanded) { ret = tracing_resize_ring_buffer(trace_buf_size); if (ret < 0) - return ret; + goto out; ret = 0; } @@ -3110,7 +3168,7 @@ static int mark_printk(const char *fmt, ...) int ret; va_list args; va_start(args, fmt); - ret = trace_vprintk(0, -1, fmt, args); + ret = trace_vprintk(0, fmt, args); va_end(args); return ret; } @@ -3478,6 +3536,9 @@ struct dentry *tracing_init_dentry(void) if (d_tracer) return d_tracer; + if (!debugfs_initialized()) + return NULL; + d_tracer = debugfs_create_dir("tracing", NULL); if (!d_tracer && !once) { @@ -3539,7 +3600,7 @@ static void tracing_init_debugfs_percpu(long cpu) pr_warning("Could not create debugfs 'trace_pipe' entry\n"); /* per cpu trace */ - entry = debugfs_create_file("trace", 0444, d_cpu, + entry = debugfs_create_file("trace", 0644, d_cpu, (void *) cpu, &tracing_fops); if (!entry) pr_warning("Could not create debugfs 'trace' entry\n"); @@ -3853,7 +3914,7 @@ static __init int tracer_init_debugfs(void) if (!entry) pr_warning("Could not create debugfs 'tracing_cpumask' entry\n"); - entry = debugfs_create_file("trace", 0444, d_tracer, + entry = debugfs_create_file("trace", 0644, d_tracer, (void *) TRACE_PIPE_ALL_CPU, &tracing_fops); if (!entry) pr_warning("Could not create debugfs 'trace' entry\n"); @@ -3983,11 +4044,12 @@ trace_printk_seq(struct trace_seq *s) trace_seq_init(s); } -void ftrace_dump(void) +static void __ftrace_dump(bool disable_tracing) { static DEFINE_SPINLOCK(ftrace_dump_lock); /* use static because iter can be a bit big for the stack */ static struct trace_iterator iter; + unsigned int old_userobj; static int dump_ran; unsigned long flags; int cnt = 0, cpu; @@ -3999,14 +4061,17 @@ void ftrace_dump(void) dump_ran = 1; - /* No turning back! */ tracing_off(); - ftrace_kill(); + + if (disable_tracing) + ftrace_kill(); for_each_tracing_cpu(cpu) { atomic_inc(&global_trace.data[cpu]->disabled); } + old_userobj = trace_flags & TRACE_ITER_SYM_USEROBJ; + /* don't look at user memory in panic mode */ trace_flags &= ~TRACE_ITER_SYM_USEROBJ; @@ -4051,10 +4116,26 @@ void ftrace_dump(void) else printk(KERN_TRACE "---------------------------------\n"); + /* Re-enable tracing if requested */ + if (!disable_tracing) { + trace_flags |= old_userobj; + + for_each_tracing_cpu(cpu) { + atomic_dec(&global_trace.data[cpu]->disabled); + } + tracing_on(); + } + out: spin_unlock_irqrestore(&ftrace_dump_lock, flags); } +/* By default: disable tracing after the dump */ +void ftrace_dump(void) +{ + __ftrace_dump(true); +} + __init static int tracer_alloc_buffers(void) { struct trace_array_cpu *data; @@ -4125,7 +4206,8 @@ __init static int tracer_alloc_buffers(void) &trace_panic_notifier); register_die_notifier(&trace_die_notifier); - ret = 0; + + return 0; out_free_cpumask: free_cpumask_var(tracing_reader_cpumask); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index e7fbc826f1e..9e15802cca9 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -123,7 +123,6 @@ struct userstack_entry { struct bprint_entry { struct trace_entry ent; unsigned long ip; - int depth; const char *fmt; u32 buf[]; }; @@ -131,7 +130,6 @@ struct bprint_entry { struct print_entry { struct trace_entry ent; unsigned long ip; - int depth; char buf[]; }; @@ -184,6 +182,12 @@ struct trace_power { struct power_trace state_data; }; +enum kmemtrace_type_id { + KMEMTRACE_TYPE_KMALLOC = 0, /* kmalloc() or kfree(). */ + KMEMTRACE_TYPE_CACHE, /* kmem_cache_*(). */ + KMEMTRACE_TYPE_PAGES, /* __get_free_pages() and friends. */ +}; + struct kmemtrace_alloc_entry { struct trace_entry ent; enum kmemtrace_type_id type_id; @@ -202,6 +206,19 @@ struct kmemtrace_free_entry { const void *ptr; }; +struct syscall_trace_enter { + struct trace_entry ent; + int nr; + unsigned long args[]; +}; + +struct syscall_trace_exit { + struct trace_entry ent; + int nr; + unsigned long ret; +}; + + /* * trace_flag_type is an enumeration that holds different * states when a trace occurs. These are: @@ -315,6 +332,10 @@ extern void __ftrace_bad_type(void); TRACE_KMEM_ALLOC); \ IF_ASSIGN(var, ent, struct kmemtrace_free_entry, \ TRACE_KMEM_FREE); \ + IF_ASSIGN(var, ent, struct syscall_trace_enter, \ + TRACE_SYSCALL_ENTER); \ + IF_ASSIGN(var, ent, struct syscall_trace_exit, \ + TRACE_SYSCALL_EXIT); \ __ftrace_bad_type(); \ } while (0) @@ -468,6 +489,8 @@ trace_current_buffer_lock_reserve(unsigned char type, unsigned long len, unsigned long flags, int pc); void trace_current_buffer_unlock_commit(struct ring_buffer_event *event, unsigned long flags, int pc); +void trace_nowake_buffer_unlock_commit(struct ring_buffer_event *event, + unsigned long flags, int pc); struct trace_entry *tracing_get_trace_entry(struct trace_array *tr, struct trace_array_cpu *data); @@ -547,7 +570,7 @@ struct tracer_switch_ops { }; #endif /* CONFIG_CONTEXT_SWITCH_TRACER */ -extern char *trace_find_cmdline(int pid); +extern void trace_find_cmdline(int pid, char comm[]); #ifdef CONFIG_DYNAMIC_FTRACE extern unsigned long ftrace_update_tot_cnt; @@ -583,9 +606,9 @@ extern int trace_selftest_startup_hw_branches(struct tracer *trace, extern void *head_page(struct trace_array_cpu *data); extern long ns2usecs(cycle_t nsec); extern int -trace_vbprintk(unsigned long ip, int depth, const char *fmt, va_list args); +trace_vbprintk(unsigned long ip, const char *fmt, va_list args); extern int -trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args); +trace_vprintk(unsigned long ip, const char *fmt, va_list args); extern unsigned long trace_flags; @@ -669,6 +692,8 @@ enum trace_iterator_flags { TRACE_ITER_PRINTK_MSGONLY = 0x10000, TRACE_ITER_CONTEXT_INFO = 0x20000, /* Print pid/cpu/time */ TRACE_ITER_LATENCY_FMT = 0x40000, + TRACE_ITER_GLOBAL_CLK = 0x80000, + TRACE_ITER_SLEEP_TIME = 0x100000, }; /* @@ -761,22 +786,89 @@ enum { TRACE_EVENT_TYPE_RAW = 2, }; +struct ftrace_event_field { + struct list_head link; + char *name; + char *type; + int offset; + int size; +}; + struct ftrace_event_call { - char *name; - char *system; - struct dentry *dir; - int enabled; - int (*regfunc)(void); - void (*unregfunc)(void); - int id; - int (*raw_init)(void); - int (*show_format)(struct trace_seq *s); + char *name; + char *system; + struct dentry *dir; + int enabled; + int (*regfunc)(void); + void (*unregfunc)(void); + int id; + int (*raw_init)(void); + int (*show_format)(struct trace_seq *s); + int (*define_fields)(void); + struct list_head fields; + struct filter_pred **preds; + +#ifdef CONFIG_EVENT_PROFILE + atomic_t profile_count; + int (*profile_enable)(struct ftrace_event_call *); + void (*profile_disable)(struct ftrace_event_call *); +#endif }; +struct event_subsystem { + struct list_head list; + const char *name; + struct dentry *entry; + struct filter_pred **preds; +}; + +#define events_for_each(event) \ + for (event = __start_ftrace_events; \ + (unsigned long)event < (unsigned long)__stop_ftrace_events; \ + event++) + +#define MAX_FILTER_PRED 8 + +struct filter_pred; + +typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event); + +struct filter_pred { + filter_pred_fn_t fn; + u64 val; + char *str_val; + int str_len; + char *field_name; + int offset; + int not; + int or; + int compound; + int clear; +}; + +int trace_define_field(struct ftrace_event_call *call, char *type, + char *name, int offset, int size); +extern void filter_free_pred(struct filter_pred *pred); +extern void filter_print_preds(struct filter_pred **preds, + struct trace_seq *s); +extern int filter_parse(char **pbuf, struct filter_pred *pred); +extern int filter_add_pred(struct ftrace_event_call *call, + struct filter_pred *pred); +extern void filter_free_preds(struct ftrace_event_call *call); +extern int filter_match_preds(struct ftrace_event_call *call, void *rec); +extern void filter_free_subsystem_preds(struct event_subsystem *system); +extern int filter_add_subsystem_pred(struct event_subsystem *system, + struct filter_pred *pred); + void event_trace_printk(unsigned long ip, const char *fmt, ...); extern struct ftrace_event_call __start_ftrace_events[]; extern struct ftrace_event_call __stop_ftrace_events[]; +#define for_each_event(event) \ + for (event = __start_ftrace_events; \ + (unsigned long)event < (unsigned long)__stop_ftrace_events; \ + event++) + extern const char *__start___trace_bprintk_fmt[]; extern const char *__stop___trace_bprintk_fmt[]; diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c index 05b176abfd3..b588fd81f7f 100644 --- a/kernel/trace/trace_clock.c +++ b/kernel/trace/trace_clock.c @@ -18,6 +18,7 @@ #include <linux/percpu.h> #include <linux/sched.h> #include <linux/ktime.h> +#include <linux/trace_clock.h> /* * trace_clock_local(): the simplest and least coherent tracing clock. diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c new file mode 100644 index 00000000000..22cba997077 --- /dev/null +++ b/kernel/trace/trace_event_profile.c @@ -0,0 +1,31 @@ +/* + * trace event based perf counter profiling + * + * Copyright (C) 2009 Red Hat Inc, Peter Zijlstra <pzijlstr@redhat.com> + * + */ + +#include "trace.h" + +int ftrace_profile_enable(int event_id) +{ + struct ftrace_event_call *event; + + for_each_event(event) { + if (event->id == event_id) + return event->profile_enable(event); + } + + return -EINVAL; +} + +void ftrace_profile_disable(int event_id) +{ + struct ftrace_event_call *event; + + for_each_event(event) { + if (event->id == event_id) + return event->profile_disable(event); + } +} + diff --git a/kernel/trace/trace_event_types.h b/kernel/trace/trace_event_types.h index 019915063fe..fd78bee71dd 100644 --- a/kernel/trace/trace_event_types.h +++ b/kernel/trace/trace_event_types.h @@ -105,7 +105,6 @@ TRACE_EVENT_FORMAT(user_stack, TRACE_USER_STACK, userstack_entry, ignore, TRACE_EVENT_FORMAT(bprint, TRACE_BPRINT, bprint_entry, ignore, TRACE_STRUCT( TRACE_FIELD(unsigned long, ip, ip) - TRACE_FIELD(unsigned int, depth, depth) TRACE_FIELD(char *, fmt, fmt) TRACE_FIELD_ZERO_CHAR(buf) ), @@ -115,7 +114,6 @@ TRACE_EVENT_FORMAT(bprint, TRACE_BPRINT, bprint_entry, ignore, TRACE_EVENT_FORMAT(print, TRACE_PRINT, print_entry, ignore, TRACE_STRUCT( TRACE_FIELD(unsigned long, ip, ip) - TRACE_FIELD(unsigned int, depth, depth) TRACE_FIELD_ZERO_CHAR(buf) ), TP_RAW_FMT("%08lx (%d) fmt:%p %s") diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 238ea95a411..64ec4d278ff 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -19,10 +19,38 @@ static DEFINE_MUTEX(event_mutex); -#define events_for_each(event) \ - for (event = __start_ftrace_events; \ - (unsigned long)event < (unsigned long)__stop_ftrace_events; \ - event++) +int trace_define_field(struct ftrace_event_call *call, char *type, + char *name, int offset, int size) +{ + struct ftrace_event_field *field; + + field = kzalloc(sizeof(*field), GFP_KERNEL); + if (!field) + goto err; + + field->name = kstrdup(name, GFP_KERNEL); + if (!field->name) + goto err; + + field->type = kstrdup(type, GFP_KERNEL); + if (!field->type) + goto err; + + field->offset = offset; + field->size = size; + list_add(&field->link, &call->fields); + + return 0; + +err: + if (field) { + kfree(field->name); + kfree(field->type); + } + kfree(field); + + return -ENOMEM; +} static void ftrace_clear_events(void) { @@ -90,7 +118,7 @@ static int ftrace_set_clr_event(char *buf, int set) } mutex_lock(&event_mutex); - events_for_each(call) { + for_each_event(call) { if (!call->name || !call->regfunc) continue; @@ -348,7 +376,8 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, #undef FIELD #define FIELD(type, name) \ - #type, #name, offsetof(typeof(field), name), sizeof(field.name) + #type, "common_" #name, offsetof(typeof(field), name), \ + sizeof(field.name) static int trace_write_header(struct trace_seq *s) { @@ -378,15 +407,15 @@ event_format_read(struct file *filp, char __user *ubuf, size_t cnt, char *buf; int r; + if (*ppos) + return 0; + s = kmalloc(sizeof(*s), GFP_KERNEL); if (!s) return -ENOMEM; trace_seq_init(s); - if (*ppos) - return 0; - /* If any of the first writes fail, so will the show_format. */ trace_seq_printf(s, "name: %s\n", call->name); @@ -412,6 +441,162 @@ event_format_read(struct file *filp, char __user *ubuf, size_t cnt, return r; } +static ssize_t +event_id_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) +{ + struct ftrace_event_call *call = filp->private_data; + struct trace_seq *s; + int r; + + if (*ppos) + return 0; + + s = kmalloc(sizeof(*s), GFP_KERNEL); + if (!s) + return -ENOMEM; + + trace_seq_init(s); + trace_seq_printf(s, "%d\n", call->id); + + r = simple_read_from_buffer(ubuf, cnt, ppos, + s->buffer, s->len); + kfree(s); + return r; +} + +static ssize_t +event_filter_read(struct file *filp, char __user *ubuf, size_t cnt, + loff_t *ppos) +{ + struct ftrace_event_call *call = filp->private_data; + struct trace_seq *s; + int r; + + if (*ppos) + return 0; + + s = kmalloc(sizeof(*s), GFP_KERNEL); + if (!s) + return -ENOMEM; + + trace_seq_init(s); + + filter_print_preds(call->preds, s); + r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len); + + kfree(s); + + return r; +} + +static ssize_t +event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, + loff_t *ppos) +{ + struct ftrace_event_call *call = filp->private_data; + char buf[64], *pbuf = buf; + struct filter_pred *pred; + int err; + + if (cnt >= sizeof(buf)) + return -EINVAL; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + pred = kzalloc(sizeof(*pred), GFP_KERNEL); + if (!pred) + return -ENOMEM; + + err = filter_parse(&pbuf, pred); + if (err < 0) { + filter_free_pred(pred); + return err; + } + + if (pred->clear) { + filter_free_preds(call); + filter_free_pred(pred); + return cnt; + } + + if (filter_add_pred(call, pred)) { + filter_free_pred(pred); + return -EINVAL; + } + + *ppos += cnt; + + return cnt; +} + +static ssize_t +subsystem_filter_read(struct file *filp, char __user *ubuf, size_t cnt, + loff_t *ppos) +{ + struct event_subsystem *system = filp->private_data; + struct trace_seq *s; + int r; + + if (*ppos) + return 0; + + s = kmalloc(sizeof(*s), GFP_KERNEL); + if (!s) + return -ENOMEM; + + trace_seq_init(s); + + filter_print_preds(system->preds, s); + r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len); + + kfree(s); + + return r; +} + +static ssize_t +subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, + loff_t *ppos) +{ + struct event_subsystem *system = filp->private_data; + char buf[64], *pbuf = buf; + struct filter_pred *pred; + int err; + + if (cnt >= sizeof(buf)) + return -EINVAL; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + pred = kzalloc(sizeof(*pred), GFP_KERNEL); + if (!pred) + return -ENOMEM; + + err = filter_parse(&pbuf, pred); + if (err < 0) { + filter_free_pred(pred); + return err; + } + + if (pred->clear) { + filter_free_subsystem_preds(system); + filter_free_pred(pred); + return cnt; + } + + if (filter_add_subsystem_pred(system, pred)) { + filter_free_subsystem_preds(system); + filter_free_pred(pred); + return -EINVAL; + } + + *ppos += cnt; + + return cnt; +} + static const struct seq_operations show_event_seq_ops = { .start = t_start, .next = t_next, @@ -452,6 +637,23 @@ static const struct file_operations ftrace_event_format_fops = { .read = event_format_read, }; +static const struct file_operations ftrace_event_id_fops = { + .open = tracing_open_generic, + .read = event_id_read, +}; + +static const struct file_operations ftrace_event_filter_fops = { + .open = tracing_open_generic, + .read = event_filter_read, + .write = event_filter_write, +}; + +static const struct file_operations ftrace_subsystem_filter_fops = { + .open = tracing_open_generic, + .read = subsystem_filter_read, + .write = subsystem_filter_write, +}; + static struct dentry *event_trace_events_dir(void) { static struct dentry *d_tracer; @@ -472,12 +674,6 @@ static struct dentry *event_trace_events_dir(void) return d_events; } -struct event_subsystem { - struct list_head list; - const char *name; - struct dentry *entry; -}; - static LIST_HEAD(event_subsystems); static struct dentry * @@ -510,6 +706,8 @@ event_subsystem_dir(const char *name, struct dentry *d_events) system->name = name; list_add(&system->list, &event_subsystems); + system->preds = NULL; + return system->entry; } @@ -550,6 +748,28 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events) "'%s/enable' entry\n", call->name); } + if (call->id) { + entry = debugfs_create_file("id", 0444, call->dir, call, + &ftrace_event_id_fops); + if (!entry) + pr_warning("Could not create debugfs '%s/id' entry\n", + call->name); + } + + if (call->define_fields) { + ret = call->define_fields(); + if (ret < 0) { + pr_warning("Could not initialize trace point" + " events/%s\n", call->name); + return ret; + } + entry = debugfs_create_file("filter", 0644, call->dir, call, + &ftrace_event_filter_fops); + if (!entry) + pr_warning("Could not create debugfs " + "'%s/filter' entry\n", call->name); + } + /* A trace may not want to export its format */ if (!call->show_format) return 0; @@ -592,7 +812,7 @@ static __init int event_trace_init(void) if (!d_events) return 0; - events_for_each(call) { + for_each_event(call) { /* The linker may leave blanks */ if (!call->name) continue; diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c new file mode 100644 index 00000000000..026be412f35 --- /dev/null +++ b/kernel/trace/trace_events_filter.c @@ -0,0 +1,427 @@ +/* + * trace_events_filter - generic event filtering + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) 2009 Tom Zanussi <tzanussi@gmail.com> + */ + +#include <linux/debugfs.h> +#include <linux/uaccess.h> +#include <linux/module.h> +#include <linux/ctype.h> + +#include "trace.h" +#include "trace_output.h" + +static int filter_pred_64(struct filter_pred *pred, void *event) +{ + u64 *addr = (u64 *)(event + pred->offset); + u64 val = (u64)pred->val; + int match; + + match = (val == *addr) ^ pred->not; + + return match; +} + +static int filter_pred_32(struct filter_pred *pred, void *event) +{ + u32 *addr = (u32 *)(event + pred->offset); + u32 val = (u32)pred->val; + int match; + + match = (val == *addr) ^ pred->not; + + return match; +} + +static int filter_pred_16(struct filter_pred *pred, void *event) +{ + u16 *addr = (u16 *)(event + pred->offset); + u16 val = (u16)pred->val; + int match; + + match = (val == *addr) ^ pred->not; + + return match; +} + +static int filter_pred_8(struct filter_pred *pred, void *event) +{ + u8 *addr = (u8 *)(event + pred->offset); + u8 val = (u8)pred->val; + int match; + + match = (val == *addr) ^ pred->not; + + return match; +} + +static int filter_pred_string(struct filter_pred *pred, void *event) +{ + char *addr = (char *)(event + pred->offset); + int cmp, match; + + cmp = strncmp(addr, pred->str_val, pred->str_len); + + match = (!cmp) ^ pred->not; + + return match; +} + +/* return 1 if event matches, 0 otherwise (discard) */ +int filter_match_preds(struct ftrace_event_call *call, void *rec) +{ + int i, matched, and_failed = 0; + struct filter_pred *pred; + + for (i = 0; i < MAX_FILTER_PRED; i++) { + if (call->preds[i]) { + pred = call->preds[i]; + if (and_failed && !pred->or) + continue; + matched = pred->fn(pred, rec); + if (!matched && !pred->or) { + and_failed = 1; + continue; + } else if (matched && pred->or) + return 1; + } else + break; + } + + if (and_failed) + return 0; + + return 1; +} + +void filter_print_preds(struct filter_pred **preds, struct trace_seq *s) +{ + char *field_name; + struct filter_pred *pred; + int i; + + if (!preds) { + trace_seq_printf(s, "none\n"); + return; + } + + for (i = 0; i < MAX_FILTER_PRED; i++) { + if (preds[i]) { + pred = preds[i]; + field_name = pred->field_name; + if (i) + trace_seq_printf(s, pred->or ? "|| " : "&& "); + trace_seq_printf(s, "%s ", field_name); + trace_seq_printf(s, pred->not ? "!= " : "== "); + if (pred->str_val) + trace_seq_printf(s, "%s\n", pred->str_val); + else + trace_seq_printf(s, "%llu\n", pred->val); + } else + break; + } +} + +static struct ftrace_event_field * +find_event_field(struct ftrace_event_call *call, char *name) +{ + struct ftrace_event_field *field; + + list_for_each_entry(field, &call->fields, link) { + if (!strcmp(field->name, name)) + return field; + } + + return NULL; +} + +void filter_free_pred(struct filter_pred *pred) +{ + if (!pred) + return; + + kfree(pred->field_name); + kfree(pred->str_val); + kfree(pred); +} + +void filter_free_preds(struct ftrace_event_call *call) +{ + int i; + + if (call->preds) { + for (i = 0; i < MAX_FILTER_PRED; i++) + filter_free_pred(call->preds[i]); + kfree(call->preds); + call->preds = NULL; + } +} + +void filter_free_subsystem_preds(struct event_subsystem *system) +{ + struct ftrace_event_call *call = __start_ftrace_events; + int i; + + if (system->preds) { + for (i = 0; i < MAX_FILTER_PRED; i++) + filter_free_pred(system->preds[i]); + kfree(system->preds); + system->preds = NULL; + } + + events_for_each(call) { + if (!call->name || !call->regfunc) + continue; + + if (!strcmp(call->system, system->name)) + filter_free_preds(call); + } +} + +static int __filter_add_pred(struct ftrace_event_call *call, + struct filter_pred *pred) +{ + int i; + + if (call->preds && !pred->compound) + filter_free_preds(call); + + if (!call->preds) { + call->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), + GFP_KERNEL); + if (!call->preds) + return -ENOMEM; + } + + for (i = 0; i < MAX_FILTER_PRED; i++) { + if (!call->preds[i]) { + call->preds[i] = pred; + return 0; + } + } + + return -ENOMEM; +} + +static int is_string_field(const char *type) +{ + if (strchr(type, '[') && strstr(type, "char")) + return 1; + + return 0; +} + +int filter_add_pred(struct ftrace_event_call *call, struct filter_pred *pred) +{ + struct ftrace_event_field *field; + + field = find_event_field(call, pred->field_name); + if (!field) + return -EINVAL; + + pred->offset = field->offset; + + if (is_string_field(field->type)) { + if (!pred->str_val) + return -EINVAL; + pred->fn = filter_pred_string; + pred->str_len = field->size; + return __filter_add_pred(call, pred); + } else { + if (pred->str_val) + return -EINVAL; + } + + switch (field->size) { + case 8: + pred->fn = filter_pred_64; + break; + case 4: + pred->fn = filter_pred_32; + break; + case 2: + pred->fn = filter_pred_16; + break; + case 1: + pred->fn = filter_pred_8; + break; + default: + return -EINVAL; + } + + return __filter_add_pred(call, pred); +} + +static struct filter_pred *copy_pred(struct filter_pred *pred) +{ + struct filter_pred *new_pred = kmalloc(sizeof(*pred), GFP_KERNEL); + if (!new_pred) + return NULL; + + memcpy(new_pred, pred, sizeof(*pred)); + + if (pred->field_name) { + new_pred->field_name = kstrdup(pred->field_name, GFP_KERNEL); + if (!new_pred->field_name) { + kfree(new_pred); + return NULL; + } + } + + if (pred->str_val) { + new_pred->str_val = kstrdup(pred->str_val, GFP_KERNEL); + if (!new_pred->str_val) { + filter_free_pred(new_pred); + return NULL; + } + } + + return new_pred; +} + +int filter_add_subsystem_pred(struct event_subsystem *system, + struct filter_pred *pred) +{ + struct ftrace_event_call *call = __start_ftrace_events; + struct filter_pred *event_pred; + int i; + + if (system->preds && !pred->compound) + filter_free_subsystem_preds(system); + + if (!system->preds) { + system->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), + GFP_KERNEL); + if (!system->preds) + return -ENOMEM; + } + + for (i = 0; i < MAX_FILTER_PRED; i++) { + if (!system->preds[i]) { + system->preds[i] = pred; + break; + } + } + + if (i == MAX_FILTER_PRED) + return -EINVAL; + + events_for_each(call) { + int err; + + if (!call->name || !call->regfunc) + continue; + + if (strcmp(call->system, system->name)) + continue; + + if (!find_event_field(call, pred->field_name)) + continue; + + event_pred = copy_pred(pred); + if (!event_pred) + goto oom; + + err = filter_add_pred(call, event_pred); + if (err) + filter_free_pred(event_pred); + if (err == -ENOMEM) + goto oom; + } + + return 0; + +oom: + system->preds[i] = NULL; + return -ENOMEM; +} + +int filter_parse(char **pbuf, struct filter_pred *pred) +{ + char *tmp, *tok, *val_str = NULL; + int tok_n = 0; + + /* field ==/!= number, or/and field ==/!= number, number */ + while ((tok = strsep(pbuf, " \n"))) { + if (tok_n == 0) { + if (!strcmp(tok, "0")) { + pred->clear = 1; + return 0; + } else if (!strcmp(tok, "&&")) { + pred->or = 0; + pred->compound = 1; + } else if (!strcmp(tok, "||")) { + pred->or = 1; + pred->compound = 1; + } else + pred->field_name = tok; + tok_n = 1; + continue; + } + if (tok_n == 1) { + if (!pred->field_name) + pred->field_name = tok; + else if (!strcmp(tok, "!=")) + pred->not = 1; + else if (!strcmp(tok, "==")) + pred->not = 0; + else { + pred->field_name = NULL; + return -EINVAL; + } + tok_n = 2; + continue; + } + if (tok_n == 2) { + if (pred->compound) { + if (!strcmp(tok, "!=")) + pred->not = 1; + else if (!strcmp(tok, "==")) + pred->not = 0; + else { + pred->field_name = NULL; + return -EINVAL; + } + } else { + val_str = tok; + break; /* done */ + } + tok_n = 3; + continue; + } + if (tok_n == 3) { + val_str = tok; + break; /* done */ + } + } + + pred->field_name = kstrdup(pred->field_name, GFP_KERNEL); + if (!pred->field_name) + return -ENOMEM; + + pred->val = simple_strtoull(val_str, &tmp, 10); + if (tmp == val_str) { + pred->str_val = kstrdup(val_str, GFP_KERNEL); + if (!pred->str_val) + return -ENOMEM; + } + + return 0; +} + + diff --git a/kernel/trace/trace_events_stage_2.h b/kernel/trace/trace_events_stage_2.h index 5117c43f5c6..30743f7d411 100644 --- a/kernel/trace/trace_events_stage_2.h +++ b/kernel/trace/trace_events_stage_2.h @@ -129,3 +129,48 @@ ftrace_format_##call(struct trace_seq *s) \ } #include <trace/trace_event_types.h> + +#undef __field +#define __field(type, item) \ + ret = trace_define_field(event_call, #type, #item, \ + offsetof(typeof(field), item), \ + sizeof(field.item)); \ + if (ret) \ + return ret; + +#undef __array +#define __array(type, item, len) \ + ret = trace_define_field(event_call, #type "[" #len "]", #item, \ + offsetof(typeof(field), item), \ + sizeof(field.item)); \ + if (ret) \ + return ret; + +#define __common_field(type, item) \ + ret = trace_define_field(event_call, #type, "common_" #item, \ + offsetof(typeof(field.ent), item), \ + sizeof(field.ent.item)); \ + if (ret) \ + return ret; + +#undef TRACE_EVENT +#define TRACE_EVENT(call, proto, args, tstruct, func, print) \ +int \ +ftrace_define_fields_##call(void) \ +{ \ + struct ftrace_raw_##call field; \ + struct ftrace_event_call *event_call = &event_##call; \ + int ret; \ + \ + __common_field(unsigned char, type); \ + __common_field(unsigned char, flags); \ + __common_field(unsigned char, preempt_count); \ + __common_field(int, pid); \ + __common_field(int, tgid); \ + \ + tstruct; \ + \ + return ret; \ +} + +#include <trace/trace_event_types.h> diff --git a/kernel/trace/trace_events_stage_3.h b/kernel/trace/trace_events_stage_3.h index ae2e323df0c..9d2fa78cecc 100644 --- a/kernel/trace/trace_events_stage_3.h +++ b/kernel/trace/trace_events_stage_3.h @@ -109,6 +109,40 @@ #undef TP_FMT #define TP_FMT(fmt, args...) fmt "\n", ##args +#ifdef CONFIG_EVENT_PROFILE +#define _TRACE_PROFILE(call, proto, args) \ +static void ftrace_profile_##call(proto) \ +{ \ + extern void perf_tpcounter_event(int); \ + perf_tpcounter_event(event_##call.id); \ +} \ + \ +static int ftrace_profile_enable_##call(struct ftrace_event_call *call) \ +{ \ + int ret = 0; \ + \ + if (!atomic_inc_return(&call->profile_count)) \ + ret = register_trace_##call(ftrace_profile_##call); \ + \ + return ret; \ +} \ + \ +static void ftrace_profile_disable_##call(struct ftrace_event_call *call) \ +{ \ + if (atomic_add_negative(-1, &call->profile_count)) \ + unregister_trace_##call(ftrace_profile_##call); \ +} + +#define _TRACE_PROFILE_INIT(call) \ + .profile_count = ATOMIC_INIT(-1), \ + .profile_enable = ftrace_profile_enable_##call, \ + .profile_disable = ftrace_profile_disable_##call, + +#else +#define _TRACE_PROFILE(call, proto, args) +#define _TRACE_PROFILE_INIT(call) +#endif + #define _TRACE_FORMAT(call, proto, args, fmt) \ static void ftrace_event_##call(proto) \ { \ @@ -130,18 +164,33 @@ static void ftrace_unreg_event_##call(void) \ { \ unregister_trace_##call(ftrace_event_##call); \ } \ - + \ +static struct ftrace_event_call event_##call; \ + \ +static int ftrace_init_event_##call(void) \ +{ \ + int id; \ + \ + id = register_ftrace_event(NULL); \ + if (!id) \ + return -ENODEV; \ + event_##call.id = id; \ + return 0; \ +} #undef TRACE_FORMAT #define TRACE_FORMAT(call, proto, args, fmt) \ _TRACE_FORMAT(call, PARAMS(proto), PARAMS(args), PARAMS(fmt)) \ +_TRACE_PROFILE(call, PARAMS(proto), PARAMS(args)) \ static struct ftrace_event_call __used \ __attribute__((__aligned__(4))) \ __attribute__((section("_ftrace_events"))) event_##call = { \ .name = #call, \ .system = __stringify(TRACE_SYSTEM), \ + .raw_init = ftrace_init_event_##call, \ .regfunc = ftrace_reg_event_##call, \ .unregfunc = ftrace_unreg_event_##call, \ + _TRACE_PROFILE_INIT(call) \ } #undef __entry @@ -149,11 +198,13 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ #undef TRACE_EVENT #define TRACE_EVENT(call, proto, args, tstruct, assign, print) \ +_TRACE_PROFILE(call, PARAMS(proto), PARAMS(args)) \ \ static struct ftrace_event_call event_##call; \ \ static void ftrace_raw_event_##call(proto) \ { \ + struct ftrace_event_call *call = &event_##call; \ struct ring_buffer_event *event; \ struct ftrace_raw_##call *entry; \ unsigned long irq_flags; \ @@ -171,7 +222,11 @@ static void ftrace_raw_event_##call(proto) \ \ assign; \ \ - trace_current_buffer_unlock_commit(event, irq_flags, pc); \ + if (call->preds && !filter_match_preds(call, entry)) \ + ring_buffer_event_discard(event); \ + \ + trace_nowake_buffer_unlock_commit(event, irq_flags, pc); \ + \ } \ \ static int ftrace_raw_reg_event_##call(void) \ @@ -202,6 +257,7 @@ static int ftrace_raw_init_event_##call(void) \ if (!id) \ return -ENODEV; \ event_##call.id = id; \ + INIT_LIST_HEAD(&event_##call.fields); \ return 0; \ } \ \ @@ -214,4 +270,12 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ .regfunc = ftrace_raw_reg_event_##call, \ .unregfunc = ftrace_raw_unreg_event_##call, \ .show_format = ftrace_format_##call, \ + .define_fields = ftrace_define_fields_##call, \ + _TRACE_PROFILE_INIT(call) \ } + +#include <trace/trace_event_types.h> + +#undef _TRACE_PROFILE +#undef _TRACE_PROFILE_INIT + diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 4c388607ed6..d28687e7b3a 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -14,6 +14,11 @@ #include "trace.h" #include "trace_output.h" +struct fgraph_data { + pid_t last_pid; + int depth; +}; + #define TRACE_GRAPH_INDENT 2 /* Flag options */ @@ -52,9 +57,9 @@ static struct tracer_flags tracer_flags = { /* Add a function return address to the trace stack on thread info.*/ int -ftrace_push_return_trace(unsigned long ret, unsigned long long time, - unsigned long func, int *depth) +ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth) { + unsigned long long calltime; int index; if (!current->ret_stack) @@ -66,11 +71,13 @@ ftrace_push_return_trace(unsigned long ret, unsigned long long time, return -EBUSY; } + calltime = trace_clock_local(); + index = ++current->curr_ret_stack; barrier(); current->ret_stack[index].ret = ret; current->ret_stack[index].func = func; - current->ret_stack[index].calltime = time; + current->ret_stack[index].calltime = calltime; *depth = index; return 0; @@ -190,15 +197,15 @@ print_graph_cpu(struct trace_seq *s, int cpu) static enum print_line_t print_graph_proc(struct trace_seq *s, pid_t pid) { - int i; - int ret; - int len; - char comm[8]; - int spaces = 0; + char comm[TASK_COMM_LEN]; /* sign + log10(MAX_INT) + '\0' */ char pid_str[11]; + int spaces = 0; + int ret; + int len; + int i; - strncpy(comm, trace_find_cmdline(pid), 7); + trace_find_cmdline(pid, comm); comm[7] = '\0'; sprintf(pid_str, "%d", pid); @@ -231,16 +238,16 @@ print_graph_proc(struct trace_seq *s, pid_t pid) /* If the pid changed since the last trace, output this event */ static enum print_line_t -verif_pid(struct trace_seq *s, pid_t pid, int cpu, pid_t *last_pids_cpu) +verif_pid(struct trace_seq *s, pid_t pid, int cpu, struct fgraph_data *data) { pid_t prev_pid; pid_t *last_pid; int ret; - if (!last_pids_cpu) + if (!data) return TRACE_TYPE_HANDLED; - last_pid = per_cpu_ptr(last_pids_cpu, cpu); + last_pid = &(per_cpu_ptr(data, cpu)->last_pid); if (*last_pid == pid) return TRACE_TYPE_HANDLED; @@ -471,6 +478,7 @@ print_graph_entry_leaf(struct trace_iterator *iter, struct ftrace_graph_ent_entry *entry, struct ftrace_graph_ret_entry *ret_entry, struct trace_seq *s) { + struct fgraph_data *data = iter->private; struct ftrace_graph_ret *graph_ret; struct ftrace_graph_ent *call; unsigned long long duration; @@ -481,6 +489,18 @@ print_graph_entry_leaf(struct trace_iterator *iter, call = &entry->graph_ent; duration = graph_ret->rettime - graph_ret->calltime; + if (data) { + int cpu = iter->cpu; + int *depth = &(per_cpu_ptr(data, cpu)->depth); + + /* + * Comments display at + 1 to depth. Since + * this is a leaf function, keep the comments + * equal to this depth. + */ + *depth = call->depth - 1; + } + /* Overhead */ ret = print_graph_overhead(duration, s); if (!ret) @@ -512,12 +532,21 @@ print_graph_entry_leaf(struct trace_iterator *iter, } static enum print_line_t -print_graph_entry_nested(struct ftrace_graph_ent_entry *entry, - struct trace_seq *s, pid_t pid, int cpu) +print_graph_entry_nested(struct trace_iterator *iter, + struct ftrace_graph_ent_entry *entry, + struct trace_seq *s, int cpu) { - int i; - int ret; struct ftrace_graph_ent *call = &entry->graph_ent; + struct fgraph_data *data = iter->private; + int ret; + int i; + + if (data) { + int cpu = iter->cpu; + int *depth = &(per_cpu_ptr(data, cpu)->depth); + + *depth = call->depth; + } /* No overhead */ ret = print_graph_overhead(-1, s); @@ -554,24 +583,24 @@ print_graph_entry_nested(struct ftrace_graph_ent_entry *entry, } static enum print_line_t -print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s, - struct trace_iterator *iter) +print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s, + int type, unsigned long addr) { - int ret; - int cpu = iter->cpu; - pid_t *last_entry = iter->private; + struct fgraph_data *data = iter->private; struct trace_entry *ent = iter->ent; - struct ftrace_graph_ent *call = &field->graph_ent; - struct ftrace_graph_ret_entry *leaf_ret; + int cpu = iter->cpu; + int ret; /* Pid */ - if (verif_pid(s, ent->pid, cpu, last_entry) == TRACE_TYPE_PARTIAL_LINE) + if (verif_pid(s, ent->pid, cpu, data) == TRACE_TYPE_PARTIAL_LINE) return TRACE_TYPE_PARTIAL_LINE; - /* Interrupt */ - ret = print_graph_irq(iter, call->func, TRACE_GRAPH_ENT, cpu, ent->pid); - if (ret == TRACE_TYPE_PARTIAL_LINE) - return TRACE_TYPE_PARTIAL_LINE; + if (type) { + /* Interrupt */ + ret = print_graph_irq(iter, addr, type, cpu, ent->pid); + if (ret == TRACE_TYPE_PARTIAL_LINE) + return TRACE_TYPE_PARTIAL_LINE; + } /* Absolute time */ if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) { @@ -598,11 +627,25 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s, return TRACE_TYPE_PARTIAL_LINE; } + return 0; +} + +static enum print_line_t +print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s, + struct trace_iterator *iter) +{ + int cpu = iter->cpu; + struct ftrace_graph_ent *call = &field->graph_ent; + struct ftrace_graph_ret_entry *leaf_ret; + + if (print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func)) + return TRACE_TYPE_PARTIAL_LINE; + leaf_ret = get_return_for_leaf(iter, field); if (leaf_ret) return print_graph_entry_leaf(iter, field, leaf_ret, s); else - return print_graph_entry_nested(field, s, iter->ent->pid, cpu); + return print_graph_entry_nested(iter, field, s, cpu); } @@ -610,40 +653,27 @@ static enum print_line_t print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, struct trace_entry *ent, struct trace_iterator *iter) { - int i; - int ret; - int cpu = iter->cpu; - pid_t *last_pid = iter->private, pid = ent->pid; unsigned long long duration = trace->rettime - trace->calltime; + struct fgraph_data *data = iter->private; + pid_t pid = ent->pid; + int cpu = iter->cpu; + int ret; + int i; - /* Pid */ - if (verif_pid(s, pid, cpu, last_pid) == TRACE_TYPE_PARTIAL_LINE) - return TRACE_TYPE_PARTIAL_LINE; + if (data) { + int cpu = iter->cpu; + int *depth = &(per_cpu_ptr(data, cpu)->depth); - /* Absolute time */ - if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) { - ret = print_graph_abs_time(iter->ts, s); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; + /* + * Comments display at + 1 to depth. This is the + * return from a function, we now want the comments + * to display at the same level of the bracket. + */ + *depth = trace->depth - 1; } - /* Cpu */ - if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) { - ret = print_graph_cpu(s, cpu); - if (ret == TRACE_TYPE_PARTIAL_LINE) - return TRACE_TYPE_PARTIAL_LINE; - } - - /* Proc */ - if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) { - ret = print_graph_proc(s, ent->pid); - if (ret == TRACE_TYPE_PARTIAL_LINE) - return TRACE_TYPE_PARTIAL_LINE; - - ret = trace_seq_printf(s, " | "); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - } + if (print_graph_prologue(iter, s, 0, 0)) + return TRACE_TYPE_PARTIAL_LINE; /* Overhead */ ret = print_graph_overhead(duration, s); @@ -684,42 +714,21 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, } static enum print_line_t -print_graph_comment(struct bprint_entry *trace, struct trace_seq *s, - struct trace_entry *ent, struct trace_iterator *iter) +print_graph_comment(struct trace_seq *s, struct trace_entry *ent, + struct trace_iterator *iter) { - int i; + unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); + struct fgraph_data *data = iter->private; + struct trace_event *event; + int depth = 0; int ret; - int cpu = iter->cpu; - pid_t *last_pid = iter->private; - - /* Pid */ - if (verif_pid(s, ent->pid, cpu, last_pid) == TRACE_TYPE_PARTIAL_LINE) - return TRACE_TYPE_PARTIAL_LINE; - - /* Absolute time */ - if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) { - ret = print_graph_abs_time(iter->ts, s); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - } + int i; - /* Cpu */ - if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) { - ret = print_graph_cpu(s, cpu); - if (ret == TRACE_TYPE_PARTIAL_LINE) - return TRACE_TYPE_PARTIAL_LINE; - } + if (data) + depth = per_cpu_ptr(data, iter->cpu)->depth; - /* Proc */ - if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) { - ret = print_graph_proc(s, ent->pid); - if (ret == TRACE_TYPE_PARTIAL_LINE) - return TRACE_TYPE_PARTIAL_LINE; - - ret = trace_seq_printf(s, " | "); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - } + if (print_graph_prologue(iter, s, 0, 0)) + return TRACE_TYPE_PARTIAL_LINE; /* No overhead */ ret = print_graph_overhead(-1, s); @@ -734,8 +743,8 @@ print_graph_comment(struct bprint_entry *trace, struct trace_seq *s, } /* Indentation */ - if (trace->depth > 0) - for (i = 0; i < (trace->depth + 1) * TRACE_GRAPH_INDENT; i++) { + if (depth > 0) + for (i = 0; i < (depth + 1) * TRACE_GRAPH_INDENT; i++) { ret = trace_seq_printf(s, " "); if (!ret) return TRACE_TYPE_PARTIAL_LINE; @@ -746,9 +755,26 @@ print_graph_comment(struct bprint_entry *trace, struct trace_seq *s, if (!ret) return TRACE_TYPE_PARTIAL_LINE; - ret = trace_seq_bprintf(s, trace->fmt, trace->buf); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; + switch (iter->ent->type) { + case TRACE_BPRINT: + ret = trace_print_bprintk_msg_only(iter); + if (ret != TRACE_TYPE_HANDLED) + return ret; + break; + case TRACE_PRINT: + ret = trace_print_printk_msg_only(iter); + if (ret != TRACE_TYPE_HANDLED) + return ret; + break; + default: + event = ftrace_find_event(ent->type); + if (!event) + return TRACE_TYPE_UNHANDLED; + + ret = event->trace(iter, sym_flags); + if (ret != TRACE_TYPE_HANDLED) + return ret; + } /* Strip ending newline */ if (s->buffer[s->len - 1] == '\n') { @@ -767,8 +793,8 @@ print_graph_comment(struct bprint_entry *trace, struct trace_seq *s, enum print_line_t print_graph_function(struct trace_iterator *iter) { - struct trace_seq *s = &iter->seq; struct trace_entry *entry = iter->ent; + struct trace_seq *s = &iter->seq; switch (entry->type) { case TRACE_GRAPH_ENT: { @@ -781,14 +807,11 @@ print_graph_function(struct trace_iterator *iter) trace_assign_type(field, entry); return print_graph_return(&field->ret, s, entry, iter); } - case TRACE_BPRINT: { - struct bprint_entry *field; - trace_assign_type(field, entry); - return print_graph_comment(field, s, entry, iter); - } default: - return TRACE_TYPE_UNHANDLED; + return print_graph_comment(s, entry, iter); } + + return TRACE_TYPE_HANDLED; } static void print_graph_headers(struct seq_file *s) @@ -820,19 +843,21 @@ static void print_graph_headers(struct seq_file *s) static void graph_trace_open(struct trace_iterator *iter) { - /* pid on the last trace processed */ - pid_t *last_pid = alloc_percpu(pid_t); + /* pid and depth on the last trace processed */ + struct fgraph_data *data = alloc_percpu(struct fgraph_data); int cpu; - if (!last_pid) + if (!data) pr_warning("function graph tracer: not enough memory\n"); else for_each_possible_cpu(cpu) { - pid_t *pid = per_cpu_ptr(last_pid, cpu); + pid_t *pid = &(per_cpu_ptr(data, cpu)->last_pid); + int *depth = &(per_cpu_ptr(data, cpu)->depth); *pid = -1; + *depth = 0; } - iter->private = last_pid; + iter->private = data; } static void graph_trace_close(struct trace_iterator *iter) diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index f095916e477..8e37fcddd8b 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -359,5 +359,5 @@ void mmio_trace_mapping(struct mmiotrace_map *map) int mmio_trace_printk(const char *fmt, va_list args) { - return trace_vprintk(0, -1, fmt, args); + return trace_vprintk(0, fmt, args); } diff --git a/kernel/trace/trace_nop.c b/kernel/trace/trace_nop.c index 9aa84bde23c..394f94417e2 100644 --- a/kernel/trace/trace_nop.c +++ b/kernel/trace/trace_nop.c @@ -91,6 +91,7 @@ struct tracer nop_trace __read_mostly = .name = "nop", .init = nop_trace_init, .reset = nop_trace_reset, + .wait_pipe = poll_wait_pipe, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_nop, #endif diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index ea9d3b410c7..d72b9a63b24 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -19,6 +19,38 @@ static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly; static int next_event_type = __TRACE_LAST_TYPE + 1; +enum print_line_t trace_print_bprintk_msg_only(struct trace_iterator *iter) +{ + struct trace_seq *s = &iter->seq; + struct trace_entry *entry = iter->ent; + struct bprint_entry *field; + int ret; + + trace_assign_type(field, entry); + + ret = trace_seq_bprintf(s, field->fmt, field->buf); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + return TRACE_TYPE_HANDLED; +} + +enum print_line_t trace_print_printk_msg_only(struct trace_iterator *iter) +{ + struct trace_seq *s = &iter->seq; + struct trace_entry *entry = iter->ent; + struct print_entry *field; + int ret; + + trace_assign_type(field, entry); + + ret = trace_seq_printf(s, "%s", field->buf); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + return TRACE_TYPE_HANDLED; +} + /** * trace_seq_printf - sequence printing of trace information * @s: trace sequence descriptor @@ -105,7 +137,7 @@ int trace_seq_putc(struct trace_seq *s, unsigned char c) return 1; } -int trace_seq_putmem(struct trace_seq *s, void *mem, size_t len) +int trace_seq_putmem(struct trace_seq *s, const void *mem, size_t len) { if (len > ((PAGE_SIZE - 1) - s->len)) return 0; @@ -116,10 +148,10 @@ int trace_seq_putmem(struct trace_seq *s, void *mem, size_t len) return len; } -int trace_seq_putmem_hex(struct trace_seq *s, void *mem, size_t len) +int trace_seq_putmem_hex(struct trace_seq *s, const void *mem, size_t len) { unsigned char hex[HEX_CHARS]; - unsigned char *data = mem; + const unsigned char *data = mem; int i, j; #ifdef __BIG_ENDIAN @@ -135,6 +167,19 @@ int trace_seq_putmem_hex(struct trace_seq *s, void *mem, size_t len) return trace_seq_putmem(s, hex, j); } +void *trace_seq_reserve(struct trace_seq *s, size_t len) +{ + void *ret; + + if (len > ((PAGE_SIZE - 1) - s->len)) + return NULL; + + ret = s->buffer + s->len; + s->len += len; + + return ret; +} + int trace_seq_path(struct trace_seq *s, struct path *path) { unsigned char *p; @@ -309,9 +354,9 @@ static int lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu) { int hardirq, softirq; - char *comm; + char comm[TASK_COMM_LEN]; - comm = trace_find_cmdline(entry->pid); + trace_find_cmdline(entry->pid, comm); hardirq = entry->flags & TRACE_FLAG_HARDIRQ; softirq = entry->flags & TRACE_FLAG_SOFTIRQ; @@ -346,10 +391,12 @@ int trace_print_context(struct trace_iterator *iter) { struct trace_seq *s = &iter->seq; struct trace_entry *entry = iter->ent; - char *comm = trace_find_cmdline(entry->pid); unsigned long long t = ns2usecs(iter->ts); unsigned long usec_rem = do_div(t, USEC_PER_SEC); unsigned long secs = (unsigned long)t; + char comm[TASK_COMM_LEN]; + + trace_find_cmdline(entry->pid, comm); return trace_seq_printf(s, "%16s-%-5d [%03d] %5lu.%06lu: ", comm, entry->pid, iter->cpu, secs, usec_rem); @@ -372,7 +419,10 @@ int trace_print_lat_context(struct trace_iterator *iter) rel_usecs = ns2usecs(next_ts - iter->ts); if (verbose) { - char *comm = trace_find_cmdline(entry->pid); + char comm[TASK_COMM_LEN]; + + trace_find_cmdline(entry->pid, comm); + ret = trace_seq_printf(s, "%16s %5d %3d %d %08x %08lx [%08lx]" " %ld.%03ldms (+%ld.%03ldms): ", comm, entry->pid, iter->cpu, entry->flags, @@ -444,6 +494,11 @@ int register_ftrace_event(struct trace_event *event) mutex_lock(&trace_event_mutex); + if (!event) { + ret = next_event_type++; + goto out; + } + if (!event->type) event->type = next_event_type++; else if (event->type > __TRACE_LAST_TYPE) { @@ -577,14 +632,15 @@ static enum print_line_t trace_ctxwake_print(struct trace_iterator *iter, char *delim) { struct ctx_switch_entry *field; - char *comm; + char comm[TASK_COMM_LEN]; int S, T; + trace_assign_type(field, iter->ent); T = task_state_char(field->next_state); S = task_state_char(field->prev_state); - comm = trace_find_cmdline(field->next_pid); + trace_find_cmdline(field->next_pid, comm); if (!trace_seq_printf(&iter->seq, " %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n", field->prev_pid, diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h index 3b90e6ade1a..e0bde39c2dd 100644 --- a/kernel/trace/trace_output.h +++ b/kernel/trace/trace_output.h @@ -15,6 +15,11 @@ struct trace_event { trace_print_func binary; }; +extern enum print_line_t +trace_print_bprintk_msg_only(struct trace_iterator *iter); +extern enum print_line_t +trace_print_printk_msg_only(struct trace_iterator *iter); + extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...) __attribute__ ((format (printf, 2, 3))); extern int @@ -24,24 +29,27 @@ seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags); extern ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt); -int trace_seq_puts(struct trace_seq *s, const char *str); -int trace_seq_putc(struct trace_seq *s, unsigned char c); -int trace_seq_putmem(struct trace_seq *s, void *mem, size_t len); -int trace_seq_putmem_hex(struct trace_seq *s, void *mem, size_t len); -int trace_seq_path(struct trace_seq *s, struct path *path); -int seq_print_userip_objs(const struct userstack_entry *entry, - struct trace_seq *s, unsigned long sym_flags); -int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm, - unsigned long ip, unsigned long sym_flags); - -int trace_print_context(struct trace_iterator *iter); -int trace_print_lat_context(struct trace_iterator *iter); - -struct trace_event *ftrace_find_event(int type); -int register_ftrace_event(struct trace_event *event); -int unregister_ftrace_event(struct trace_event *event); - -enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags); +extern int trace_seq_puts(struct trace_seq *s, const char *str); +extern int trace_seq_putc(struct trace_seq *s, unsigned char c); +extern int trace_seq_putmem(struct trace_seq *s, const void *mem, size_t len); +extern int trace_seq_putmem_hex(struct trace_seq *s, const void *mem, + size_t len); +extern void *trace_seq_reserve(struct trace_seq *s, size_t len); +extern int trace_seq_path(struct trace_seq *s, struct path *path); +extern int seq_print_userip_objs(const struct userstack_entry *entry, + struct trace_seq *s, unsigned long sym_flags); +extern int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm, + unsigned long ip, unsigned long sym_flags); + +extern int trace_print_context(struct trace_iterator *iter); +extern int trace_print_lat_context(struct trace_iterator *iter); + +extern struct trace_event *ftrace_find_event(int type); +extern int register_ftrace_event(struct trace_event *event); +extern int unregister_ftrace_event(struct trace_event *event); + +extern enum print_line_t trace_nop_print(struct trace_iterator *iter, + int flags); #define MAX_MEMHEX_BYTES 8 #define HEX_CHARS (MAX_MEMHEX_BYTES*2 + 1) diff --git a/kernel/trace/trace_power.c b/kernel/trace/trace_power.c index 91ce672fb03..bae791ebcc5 100644 --- a/kernel/trace/trace_power.c +++ b/kernel/trace/trace_power.c @@ -122,12 +122,16 @@ fail_start: static void start_power_trace(struct trace_array *tr) { trace_power_enabled = 1; - tracing_power_register(); } static void stop_power_trace(struct trace_array *tr) { trace_power_enabled = 0; +} + +static void power_trace_reset(struct trace_array *tr) +{ + trace_power_enabled = 0; unregister_trace_power_start(probe_power_start); unregister_trace_power_end(probe_power_end); unregister_trace_power_mark(probe_power_mark); @@ -188,7 +192,7 @@ static struct tracer power_tracer __read_mostly = .init = power_trace_init, .start = start_power_trace, .stop = stop_power_trace, - .reset = stop_power_trace, + .reset = power_trace_reset, .print_line = power_print_line, }; diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c index 486785214e3..eb81556107f 100644 --- a/kernel/trace/trace_printk.c +++ b/kernel/trace/trace_printk.c @@ -112,7 +112,7 @@ int __trace_bprintk(unsigned long ip, const char *fmt, ...) return 0; va_start(ap, fmt); - ret = trace_vbprintk(ip, task_curr_ret_stack(current), fmt, ap); + ret = trace_vbprintk(ip, fmt, ap); va_end(ap); return ret; } @@ -126,7 +126,7 @@ int __ftrace_vbprintk(unsigned long ip, const char *fmt, va_list ap) if (!(trace_flags & TRACE_ITER_PRINTK)) return 0; - return trace_vbprintk(ip, task_curr_ret_stack(current), fmt, ap); + return trace_vbprintk(ip, fmt, ap); } EXPORT_SYMBOL_GPL(__ftrace_vbprintk); @@ -139,7 +139,7 @@ int __trace_printk(unsigned long ip, const char *fmt, ...) return 0; va_start(ap, fmt); - ret = trace_vprintk(ip, task_curr_ret_stack(current), fmt, ap); + ret = trace_vprintk(ip, fmt, ap); va_end(ap); return ret; } @@ -150,7 +150,7 @@ int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap) if (!(trace_flags & TRACE_ITER_PRINTK)) return 0; - return trace_vprintk(ip, task_curr_ret_stack(current), fmt, ap); + return trace_vprintk(ip, fmt, ap); } EXPORT_SYMBOL_GPL(__ftrace_vprintk); diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index 77132c2cf3d..de35f200abd 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -18,6 +18,7 @@ static struct trace_array *ctx_trace; static int __read_mostly tracer_enabled; static int sched_ref; static DEFINE_MUTEX(sched_register_mutex); +static int sched_stopped; static void probe_sched_switch(struct rq *__rq, struct task_struct *prev, @@ -28,7 +29,7 @@ probe_sched_switch(struct rq *__rq, struct task_struct *prev, int cpu; int pc; - if (!sched_ref) + if (!sched_ref || sched_stopped) return; tracing_record_cmdline(prev); @@ -193,6 +194,7 @@ static void stop_sched_trace(struct trace_array *tr) static int sched_switch_trace_init(struct trace_array *tr) { ctx_trace = tr; + tracing_reset_online_cpus(tr); tracing_start_sched_switch_record(); return 0; } @@ -205,13 +207,12 @@ static void sched_switch_trace_reset(struct trace_array *tr) static void sched_switch_trace_start(struct trace_array *tr) { - tracing_reset_online_cpus(tr); - tracing_start_sched_switch(); + sched_stopped = 0; } static void sched_switch_trace_stop(struct trace_array *tr) { - tracing_stop_sched_switch(); + sched_stopped = 1; } static struct tracer sched_switch_trace __read_mostly = diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index b9109126706..499d01c44cd 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -250,6 +250,28 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr) #ifdef CONFIG_FUNCTION_GRAPH_TRACER + +/* Maximum number of functions to trace before diagnosing a hang */ +#define GRAPH_MAX_FUNC_TEST 100000000 + +static void __ftrace_dump(bool disable_tracing); +static unsigned int graph_hang_thresh; + +/* Wrap the real function entry probe to avoid possible hanging */ +static int trace_graph_entry_watchdog(struct ftrace_graph_ent *trace) +{ + /* This is harmlessly racy, we want to approximately detect a hang */ + if (unlikely(++graph_hang_thresh > GRAPH_MAX_FUNC_TEST)) { + ftrace_graph_stop(); + printk(KERN_WARNING "BUG: Function graph tracer hang!\n"); + if (ftrace_dump_on_oops) + __ftrace_dump(false); + return 0; + } + + return trace_graph_entry(trace); +} + /* * Pretty much the same than for the function tracer from which the selftest * has been borrowed. @@ -261,15 +283,29 @@ trace_selftest_startup_function_graph(struct tracer *trace, int ret; unsigned long count; - ret = tracer_init(trace, tr); + /* + * Simulate the init() callback but we attach a watchdog callback + * to detect and recover from possible hangs + */ + tracing_reset_online_cpus(tr); + ret = register_ftrace_graph(&trace_graph_return, + &trace_graph_entry_watchdog); if (ret) { warn_failed_init_tracer(trace, ret); goto out; } + tracing_start_cmdline_record(); /* Sleep for a 1/10 of a second */ msleep(100); + /* Have we just recovered from a hang? */ + if (graph_hang_thresh > GRAPH_MAX_FUNC_TEST) { + tracing_selftest_disabled = true; + ret = -1; + goto out; + } + tracing_stop(); /* check the trace buffer */ @@ -317,6 +353,14 @@ trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr) local_irq_disable(); udelay(100); local_irq_enable(); + + /* + * Stop the tracer to avoid a warning subsequent + * to buffer flipping failure because tracing_stop() + * disables the tr and max buffers, making flipping impossible + * in case of parallels max irqs off latencies. + */ + trace->stop(tr); /* stop the tracing. */ tracing_stop(); /* check both trace buffers */ @@ -371,6 +415,14 @@ trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr) preempt_disable(); udelay(100); preempt_enable(); + + /* + * Stop the tracer to avoid a warning subsequent + * to buffer flipping failure because tracing_stop() + * disables the tr and max buffers, making flipping impossible + * in case of parallels max preempt off latencies. + */ + trace->stop(tr); /* stop the tracing. */ tracing_stop(); /* check both trace buffers */ @@ -416,7 +468,7 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array * ret = tracer_init(trace, tr); if (ret) { warn_failed_init_tracer(trace, ret); - goto out; + goto out_no_start; } /* reset the max latency */ @@ -430,31 +482,35 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array * /* reverse the order of preempt vs irqs */ local_irq_enable(); + /* + * Stop the tracer to avoid a warning subsequent + * to buffer flipping failure because tracing_stop() + * disables the tr and max buffers, making flipping impossible + * in case of parallels max irqs/preempt off latencies. + */ + trace->stop(tr); /* stop the tracing. */ tracing_stop(); /* check both trace buffers */ ret = trace_test_buffer(tr, NULL); - if (ret) { - tracing_start(); + if (ret) goto out; - } ret = trace_test_buffer(&max_tr, &count); - if (ret) { - tracing_start(); + if (ret) goto out; - } if (!ret && !count) { printk(KERN_CONT ".. no entries found .."); ret = -1; - tracing_start(); goto out; } /* do the test by disabling interrupts first this time */ tracing_max_latency = 0; tracing_start(); + trace->start(tr); + preempt_disable(); local_irq_disable(); udelay(100); @@ -462,6 +518,7 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array * /* reverse the order of preempt vs irqs */ local_irq_enable(); + trace->stop(tr); /* stop the tracing. */ tracing_stop(); /* check both trace buffers */ @@ -477,9 +534,10 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array * goto out; } - out: - trace->reset(tr); +out: tracing_start(); +out_no_start: + trace->reset(tr); tracing_max_latency = save_max; return ret; diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c index 39310e3434e..acdebd771a9 100644 --- a/kernel/trace/trace_stat.c +++ b/kernel/trace/trace_stat.c @@ -75,7 +75,7 @@ static int stat_seq_init(struct tracer_stat_session *session) { struct trace_stat_list *iter_entry, *new_entry; struct tracer_stat *ts = session->ts; - void *prev_stat; + void *stat; int ret = 0; int i; @@ -85,6 +85,10 @@ static int stat_seq_init(struct tracer_stat_session *session) if (!ts->stat_cmp) ts->stat_cmp = dummy_cmp; + stat = ts->stat_start(); + if (!stat) + goto exit; + /* * The first entry. Actually this is the second, but the first * one (the stat_list head) is pointless. @@ -99,14 +103,19 @@ static int stat_seq_init(struct tracer_stat_session *session) list_add(&new_entry->list, &session->stat_list); - new_entry->stat = ts->stat_start(); - prev_stat = new_entry->stat; + new_entry->stat = stat; /* * Iterate over the tracer stat entries and store them in a sorted * list. */ for (i = 1; ; i++) { + stat = ts->stat_next(stat, i); + + /* End of insertion */ + if (!stat) + break; + new_entry = kmalloc(sizeof(struct trace_stat_list), GFP_KERNEL); if (!new_entry) { ret = -ENOMEM; @@ -114,31 +123,23 @@ static int stat_seq_init(struct tracer_stat_session *session) } INIT_LIST_HEAD(&new_entry->list); - new_entry->stat = ts->stat_next(prev_stat, i); + new_entry->stat = stat; - /* End of insertion */ - if (!new_entry->stat) - break; - - list_for_each_entry(iter_entry, &session->stat_list, list) { + list_for_each_entry_reverse(iter_entry, &session->stat_list, + list) { /* Insertion with a descendent sorting */ - if (ts->stat_cmp(new_entry->stat, - iter_entry->stat) > 0) { - - list_add_tail(&new_entry->list, - &iter_entry->list); - break; + if (ts->stat_cmp(iter_entry->stat, + new_entry->stat) >= 0) { - /* The current smaller value */ - } else if (list_is_last(&iter_entry->list, - &session->stat_list)) { list_add(&new_entry->list, &iter_entry->list); break; } } - prev_stat = new_entry->stat; + /* The current larger value */ + if (list_empty(&new_entry->list)) + list_add(&new_entry->list, &session->stat_list); } exit: mutex_unlock(&session->stat_mutex); @@ -160,7 +161,7 @@ static void *stat_seq_start(struct seq_file *s, loff_t *pos) /* If we are in the beginning of the file, print the headers */ if (!*pos && session->ts->stat_headers) - session->ts->stat_headers(s); + return SEQ_START_TOKEN; return seq_list_start(&session->stat_list, *pos); } @@ -169,6 +170,9 @@ static void *stat_seq_next(struct seq_file *s, void *p, loff_t *pos) { struct tracer_stat_session *session = s->private; + if (p == SEQ_START_TOKEN) + return seq_list_start(&session->stat_list, *pos); + return seq_list_next(p, &session->stat_list, pos); } @@ -183,6 +187,9 @@ static int stat_seq_show(struct seq_file *s, void *v) struct tracer_stat_session *session = s->private; struct trace_stat_list *l = list_entry(v, struct trace_stat_list, list); + if (v == SEQ_START_TOKEN) + return session->ts->stat_headers(s); + return session->ts->stat_show(s, l->stat); } diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 66cf97449af..a2a3af29c94 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -1,21 +1,112 @@ -#include <linux/ftrace.h> #include <linux/kernel.h> - +#include <linux/ftrace.h> #include <asm/syscall.h> #include "trace_output.h" #include "trace.h" -static atomic_t refcount; +/* Keep a counter of the syscall tracing users */ +static int refcount; + +/* Prevent from races on thread flags toggling */ +static DEFINE_MUTEX(syscall_trace_lock); + +/* Option to display the parameters types */ +enum { + TRACE_SYSCALLS_OPT_TYPES = 0x1, +}; + +static struct tracer_opt syscalls_opts[] = { + { TRACER_OPT(syscall_arg_type, TRACE_SYSCALLS_OPT_TYPES) }, + { } +}; + +static struct tracer_flags syscalls_flags = { + .val = 0, /* By default: no parameters types */ + .opts = syscalls_opts +}; + +enum print_line_t +print_syscall_enter(struct trace_iterator *iter, int flags) +{ + struct trace_seq *s = &iter->seq; + struct trace_entry *ent = iter->ent; + struct syscall_trace_enter *trace; + struct syscall_metadata *entry; + int i, ret, syscall; + + trace_assign_type(trace, ent); + + syscall = trace->nr; + + entry = syscall_nr_to_meta(syscall); + if (!entry) + goto end; + + ret = trace_seq_printf(s, "%s(", entry->name); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + for (i = 0; i < entry->nb_args; i++) { + /* parameter types */ + if (syscalls_flags.val & TRACE_SYSCALLS_OPT_TYPES) { + ret = trace_seq_printf(s, "%s ", entry->types[i]); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } + /* parameter values */ + ret = trace_seq_printf(s, "%s: %lx%s ", entry->args[i], + trace->args[i], + i == entry->nb_args - 1 ? ")" : ","); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } + +end: + trace_seq_printf(s, "\n"); + return TRACE_TYPE_HANDLED; +} + +enum print_line_t +print_syscall_exit(struct trace_iterator *iter, int flags) +{ + struct trace_seq *s = &iter->seq; + struct trace_entry *ent = iter->ent; + struct syscall_trace_exit *trace; + int syscall; + struct syscall_metadata *entry; + int ret; + + trace_assign_type(trace, ent); + + syscall = trace->nr; + + entry = syscall_nr_to_meta(syscall); + if (!entry) { + trace_seq_printf(s, "\n"); + return TRACE_TYPE_HANDLED; + } + + ret = trace_seq_printf(s, "%s -> 0x%lx\n", entry->name, + trace->ret); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + return TRACE_TYPE_HANDLED; +} void start_ftrace_syscalls(void) { unsigned long flags; struct task_struct *g, *t; - if (atomic_inc_return(&refcount) != 1) - goto out; + mutex_lock(&syscall_trace_lock); + /* Don't enable the flag on the tasks twice */ + if (++refcount != 1) + goto unlock; + + arch_init_ftrace_syscalls(); read_lock_irqsave(&tasklist_lock, flags); do_each_thread(g, t) { @@ -23,8 +114,9 @@ void start_ftrace_syscalls(void) } while_each_thread(g, t); read_unlock_irqrestore(&tasklist_lock, flags); -out: - atomic_dec(&refcount); + +unlock: + mutex_unlock(&syscall_trace_lock); } void stop_ftrace_syscalls(void) @@ -32,8 +124,11 @@ void stop_ftrace_syscalls(void) unsigned long flags; struct task_struct *g, *t; - if (atomic_dec_return(&refcount)) - goto out; + mutex_lock(&syscall_trace_lock); + + /* There are perhaps still some users */ + if (--refcount) + goto unlock; read_lock_irqsave(&tasklist_lock, flags); @@ -42,26 +137,64 @@ void stop_ftrace_syscalls(void) } while_each_thread(g, t); read_unlock_irqrestore(&tasklist_lock, flags); -out: - atomic_inc(&refcount); + +unlock: + mutex_unlock(&syscall_trace_lock); } void ftrace_syscall_enter(struct pt_regs *regs) { + struct syscall_trace_enter *entry; + struct syscall_metadata *sys_data; + struct ring_buffer_event *event; + int size; int syscall_nr; syscall_nr = syscall_get_nr(current, regs); - trace_printk("syscall %d enter\n", syscall_nr); + sys_data = syscall_nr_to_meta(syscall_nr); + if (!sys_data) + return; + + size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args; + + event = trace_current_buffer_lock_reserve(TRACE_SYSCALL_ENTER, size, + 0, 0); + if (!event) + return; + + entry = ring_buffer_event_data(event); + entry->nr = syscall_nr; + syscall_get_arguments(current, regs, 0, sys_data->nb_args, entry->args); + + trace_current_buffer_unlock_commit(event, 0, 0); + trace_wake_up(); } void ftrace_syscall_exit(struct pt_regs *regs) { + struct syscall_trace_exit *entry; + struct syscall_metadata *sys_data; + struct ring_buffer_event *event; int syscall_nr; syscall_nr = syscall_get_nr(current, regs); - trace_printk("syscall %d exit\n", syscall_nr); + sys_data = syscall_nr_to_meta(syscall_nr); + if (!sys_data) + return; + + event = trace_current_buffer_lock_reserve(TRACE_SYSCALL_EXIT, + sizeof(*entry), 0, 0); + if (!event) + return; + + entry = ring_buffer_event_data(event); + entry->nr = syscall_nr; + entry->ret = syscall_get_return_value(current, regs); + + trace_current_buffer_unlock_commit(event, 0, 0); + trace_wake_up(); } static int init_syscall_tracer(struct trace_array *tr) @@ -74,20 +207,24 @@ static int init_syscall_tracer(struct trace_array *tr) static void reset_syscall_tracer(struct trace_array *tr) { stop_ftrace_syscalls(); + tracing_reset_online_cpus(tr); } static struct trace_event syscall_enter_event = { - .type = TRACE_SYSCALL_ENTER, + .type = TRACE_SYSCALL_ENTER, + .trace = print_syscall_enter, }; static struct trace_event syscall_exit_event = { - .type = TRACE_SYSCALL_EXIT, + .type = TRACE_SYSCALL_EXIT, + .trace = print_syscall_exit, }; static struct tracer syscall_tracer __read_mostly = { - .name = "syscall", + .name = "syscall", .init = init_syscall_tracer, - .reset = reset_syscall_tracer + .reset = reset_syscall_tracer, + .flags = &syscalls_flags, }; __init int register_ftrace_syscalls(void) diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c index 9ab035b58cf..797201e4a13 100644 --- a/kernel/trace/trace_workqueue.c +++ b/kernel/trace/trace_workqueue.c @@ -196,6 +196,11 @@ static int workqueue_stat_show(struct seq_file *s, void *p) struct pid *pid; struct task_struct *tsk; + spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); + if (&cws->list == workqueue_cpu_stat(cpu)->list.next) + seq_printf(s, "\n"); + spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); + pid = find_get_pid(cws->pid); if (pid) { tsk = get_pid_task(pid, PIDTYPE_PID); @@ -208,18 +213,13 @@ static int workqueue_stat_show(struct seq_file *s, void *p) put_pid(pid); } - spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); - if (&cws->list == workqueue_cpu_stat(cpu)->list.next) - seq_printf(s, "\n"); - spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); - return 0; } static int workqueue_stat_headers(struct seq_file *s) { seq_printf(s, "# CPU INSERTED EXECUTED NAME\n"); - seq_printf(s, "# | | | |\n\n"); + seq_printf(s, "# | | | |\n"); return 0; } |