From 96a2c464de07d7c72988db851c029b204fc59108 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 1 Aug 2009 01:34:24 +0200 Subject: [PATCH 01/40] tracing/bkl: Add bkl ftrace events Add two events lock_kernel and unlock_kernel() to trace the bkl uses. This opens the door for userspace tools to perform statistics about the callsites that use it, dependencies with other locks (by pairing the trace with lock events), use with recursivity and so on... The {__reacquire,release}_kernel_lock() events are not traced because these are called from schedule, thus the sched events are sufficient to trace them. Example of a trace: hald-addon-stor-4152 [000] 165.875501: unlock_kernel: depth: 0, fs/block_dev.c:1358 __blkdev_put() hald-addon-stor-4152 [000] 167.832974: lock_kernel: depth: 0, fs/block_dev.c:1167 __blkdev_get() How to get the callsites that acquire it recursively: cd /debug/tracing/events/bkl echo "lock_depth > 0" > filter firefox-4951 [001] 206.276967: unlock_kernel: depth: 1, fs/reiserfs/super.c:575 reiserfs_dirty_inode() You can also filter by file and/or line. v2: Use of FILTER_PTR_STRING attribute for files and lines fields to make them traceable. Signed-off-by: Frederic Weisbecker Cc: Steven Rostedt Cc: Li Zefan --- include/linux/smp_lock.h | 19 +++++++++--- include/trace/events/bkl.h | 61 ++++++++++++++++++++++++++++++++++++++ lib/kernel_lock.c | 11 +++---- 3 files changed, 82 insertions(+), 9 deletions(-) create mode 100644 include/trace/events/bkl.h diff --git a/include/linux/smp_lock.h b/include/linux/smp_lock.h index 813be59bf34..d48cc77ba70 100644 --- a/include/linux/smp_lock.h +++ b/include/linux/smp_lock.h @@ -3,6 +3,7 @@ #ifdef CONFIG_LOCK_KERNEL #include +#include #define kernel_locked() (current->lock_depth >= 0) @@ -24,8 +25,18 @@ static inline int reacquire_kernel_lock(struct task_struct *task) return 0; } -extern void __lockfunc lock_kernel(void) __acquires(kernel_lock); -extern void __lockfunc unlock_kernel(void) __releases(kernel_lock); +extern void __lockfunc _lock_kernel(void) __acquires(kernel_lock); +extern void __lockfunc _unlock_kernel(void) __releases(kernel_lock); + +#define lock_kernel() { \ + trace_lock_kernel(__func__, __FILE__, __LINE__); \ + _lock_kernel(); \ +} + +#define unlock_kernel() { \ + trace_unlock_kernel(__func__, __FILE__, __LINE__); \ + _unlock_kernel(); \ +} /* * Various legacy drivers don't really need the BKL in a specific @@ -41,8 +52,8 @@ static inline void cycle_kernel_lock(void) #else -#define lock_kernel() do { } while(0) -#define unlock_kernel() do { } while(0) +#define lock_kernel() trace_lock_kernel(__func__, __FILE__, __LINE__); +#define unlock_kernel() trace_unlock_kernel(__func__, __FILE__, __LINE__); #define release_kernel_lock(task) do { } while(0) #define cycle_kernel_lock() do { } while(0) #define reacquire_kernel_lock(task) 0 diff --git a/include/trace/events/bkl.h b/include/trace/events/bkl.h new file mode 100644 index 00000000000..8abd620a490 --- /dev/null +++ b/include/trace/events/bkl.h @@ -0,0 +1,61 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM bkl + +#if !defined(_TRACE_BKL_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_BKL_H + +#include + +TRACE_EVENT(lock_kernel, + + TP_PROTO(const char *func, const char *file, int line), + + TP_ARGS(func, file, line), + + TP_STRUCT__entry( + __field( int, lock_depth ) + __field_ext( const char *, func, FILTER_PTR_STRING ) + __field_ext( const char *, file, FILTER_PTR_STRING ) + __field( int, line ) + ), + + TP_fast_assign( + /* We want to record the lock_depth after lock is acquired */ + __entry->lock_depth = current->lock_depth + 1; + __entry->func = func; + __entry->file = file; + __entry->line = line; + ), + + TP_printk("depth: %d, %s:%d %s()", __entry->lock_depth, + __entry->file, __entry->line, __entry->func) +); + +TRACE_EVENT(unlock_kernel, + + TP_PROTO(const char *func, const char *file, int line), + + TP_ARGS(func, file, line), + + TP_STRUCT__entry( + __field(int, lock_depth) + __field(const char *, func) + __field(const char *, file) + __field(int, line) + ), + + TP_fast_assign( + __entry->lock_depth = current->lock_depth; + __entry->func = func; + __entry->file = file; + __entry->line = line; + ), + + TP_printk("depth: %d, %s:%d %s()", __entry->lock_depth, + __entry->file, __entry->line, __entry->func) +); + +#endif /* _TRACE_BKL_H */ + +/* This part must be outside protection */ +#include diff --git a/lib/kernel_lock.c b/lib/kernel_lock.c index 39f1029e352..5c10b2e1fd0 100644 --- a/lib/kernel_lock.c +++ b/lib/kernel_lock.c @@ -5,10 +5,11 @@ * relegated to obsolescence, but used by various less * important (or lazy) subsystems. */ -#include #include #include #include +#define CREATE_TRACE_POINTS +#include /* * The 'big kernel lock' @@ -113,7 +114,7 @@ static inline void __unlock_kernel(void) * This cannot happen asynchronously, so we only need to * worry about other CPU's. */ -void __lockfunc lock_kernel(void) +void __lockfunc _lock_kernel(void) { int depth = current->lock_depth+1; if (likely(!depth)) @@ -121,13 +122,13 @@ void __lockfunc lock_kernel(void) current->lock_depth = depth; } -void __lockfunc unlock_kernel(void) +void __lockfunc _unlock_kernel(void) { BUG_ON(current->lock_depth < 0); if (likely(--current->lock_depth < 0)) __unlock_kernel(); } -EXPORT_SYMBOL(lock_kernel); -EXPORT_SYMBOL(unlock_kernel); +EXPORT_SYMBOL(_lock_kernel); +EXPORT_SYMBOL(_unlock_kernel); From 737f453fd115ea0c9642ed6b30e37e296a4e3ed7 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 1 Aug 2009 03:42:44 +0200 Subject: [PATCH 02/40] tracing/filters: Cleanup useless headers Cleanup remaining headers inclusion that were only useful when the filter framework and its tracing related filesystem user interface weren't yet separated. v2: Keep module.h, needed for EXPORT_SYMBOL_GPL Signed-off-by: Frederic Weisbecker Cc: Tom Zanussi Cc: Steven Rostedt Cc: Li Zefan --- kernel/trace/trace_events_filter.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 23245785927..189663d82aa 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -18,8 +18,6 @@ * Copyright (C) 2009 Tom Zanussi */ -#include -#include #include #include #include From f3f3f0092477d0165f3f1bf0fd518550b2abd097 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 24 Sep 2009 15:27:41 +0200 Subject: [PATCH 03/40] tracing/event: Cleanup the useless dentry variable Cleanup the useless dentry variable while creating a kernel event set of files. trace_create_file() warns if it fails to create the file anyway, and we don't store the dentry anywhere. v2: Fix a small conflict in kernel/trace/trace_events.c Signed-off-by: Frederic Weisbecker Cc: Steven Rostedt Cc: Li Zefan --- kernel/trace/trace_events.c | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 56c260b83a9..8c91b7c8f04 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -898,9 +898,9 @@ event_subsystem_dir(const char *name, struct dentry *d_events) "'%s/filter' entry\n", name); } - entry = trace_create_file("enable", 0644, system->entry, - (void *)system->name, - &ftrace_system_enable_fops); + trace_create_file("enable", 0644, system->entry, + (void *)system->name, + &ftrace_system_enable_fops); return system->entry; } @@ -912,7 +912,6 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events, const struct file_operations *filter, const struct file_operations *format) { - struct dentry *entry; int ret; /* @@ -930,12 +929,12 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events, } if (call->regfunc) - entry = trace_create_file("enable", 0644, call->dir, call, - enable); + trace_create_file("enable", 0644, call->dir, call, + enable); if (call->id && call->profile_enable) - entry = trace_create_file("id", 0444, call->dir, call, - id); + trace_create_file("id", 0444, call->dir, call, + id); if (call->define_fields) { ret = call->define_fields(call); @@ -944,16 +943,16 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events, " events/%s\n", call->name); return ret; } - entry = trace_create_file("filter", 0644, call->dir, call, - filter); + trace_create_file("filter", 0644, call->dir, call, + filter); } /* A trace may not want to export its format */ if (!call->show_format) return 0; - entry = trace_create_file("format", 0444, call->dir, call, - format); + trace_create_file("format", 0444, call->dir, call, + format); return 0; } From 1889d20922d14a97b2099fa4d47587217c0ba48b Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 24 Sep 2009 21:10:44 +0200 Subject: [PATCH 04/40] tracing/filters: Provide basic regex support This patch provides basic support for regular expressions in filters. It supports the following types of regexp: - *match_beginning - *match_middle* - match_end* - !don't match Example: cd /debug/tracing/events/bkl/lock_kernel echo 'file == "*reiserfs*"' > filter echo 1 > enable gedit-4941 [000] 457.735437: lock_kernel: depth: 0, fs/reiserfs/namei.c:334 reiserfs_lookup() sync_supers-227 [001] 461.379985: lock_kernel: depth: 0, fs/reiserfs/super.c:69 reiserfs_sync_fs() sync_supers-227 [000] 461.383096: lock_kernel: depth: 0, fs/reiserfs/journal.c:1069 flush_commit_list() reiserfs/1-1369 [001] 461.479885: lock_kernel: depth: 0, fs/reiserfs/journal.c:3509 flush_async_commits() Every string is now handled as a regexp in the filter framework, which helps to factorize the code for handling both simple strings and regexp comparisons. (The regexp parsing code has been wildly cherry picked from ftrace.c written by Steve.) v2: Simplify the whole and drop the filter_regex file Signed-off-by: Frederic Weisbecker Cc: Steven Rostedt Cc: Tom Zanussi Cc: Li Zefan --- kernel/trace/trace.h | 27 +++-- kernel/trace/trace_events_filter.c | 155 ++++++++++++++++++++++++++--- 2 files changed, 157 insertions(+), 25 deletions(-) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 86bcff94791..8d0db6018fe 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -702,20 +702,29 @@ struct event_subsystem { }; struct filter_pred; +struct regex; typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event, int val1, int val2); +typedef int (*regex_match_func)(char *str, struct regex *r, int len); + +struct regex { + char pattern[MAX_FILTER_STR_VAL]; + int len; + int field_len; + regex_match_func match; +}; + struct filter_pred { - filter_pred_fn_t fn; - u64 val; - char str_val[MAX_FILTER_STR_VAL]; - int str_len; - char *field_name; - int offset; - int not; - int op; - int pop_n; + filter_pred_fn_t fn; + u64 val; + struct regex regex; + char *field_name; + int offset; + int not; + int op; + int pop_n; }; extern void print_event_filter(struct ftrace_event_call *call, diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 189663d82aa..d3c94c13956 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -195,9 +195,9 @@ static int filter_pred_string(struct filter_pred *pred, void *event, char *addr = (char *)(event + pred->offset); int cmp, match; - cmp = strncmp(addr, pred->str_val, pred->str_len); + cmp = pred->regex.match(addr, &pred->regex, pred->regex.field_len); - match = (!cmp) ^ pred->not; + match = cmp ^ pred->not; return match; } @@ -209,9 +209,9 @@ static int filter_pred_pchar(struct filter_pred *pred, void *event, char **addr = (char **)(event + pred->offset); int cmp, match; - cmp = strncmp(*addr, pred->str_val, pred->str_len); + cmp = pred->regex.match(*addr, &pred->regex, pred->regex.field_len); - match = (!cmp) ^ pred->not; + match = cmp ^ pred->not; return match; } @@ -235,9 +235,9 @@ static int filter_pred_strloc(struct filter_pred *pred, void *event, char *addr = (char *)(event + str_loc); int cmp, match; - cmp = strncmp(addr, pred->str_val, str_len); + cmp = pred->regex.match(addr, &pred->regex, str_len); - match = (!cmp) ^ pred->not; + match = cmp ^ pred->not; return match; } @@ -248,6 +248,126 @@ static int filter_pred_none(struct filter_pred *pred, void *event, return 0; } +/* Basic regex callbacks */ +static int regex_match_full(char *str, struct regex *r, int len) +{ + if (strncmp(str, r->pattern, len) == 0) + return 1; + return 0; +} + +static int regex_match_front(char *str, struct regex *r, int len) +{ + if (strncmp(str, r->pattern, len) == 0) + return 1; + return 0; +} + +static int regex_match_middle(char *str, struct regex *r, int len) +{ + if (strstr(str, r->pattern)) + return 1; + return 0; +} + +static int regex_match_end(char *str, struct regex *r, int len) +{ + char *ptr = strstr(str, r->pattern); + + if (ptr && (ptr[r->len] == 0)) + return 1; + return 0; +} + +enum regex_type { + MATCH_FULL, + MATCH_FRONT_ONLY, + MATCH_MIDDLE_ONLY, + MATCH_END_ONLY, +}; + +/* + * Pass in a buffer containing a regex and this function will + * set search to point to the search part of the buffer and + * return the type of search it is (see enum above). + * This does modify buff. + * + * Returns enum type. + * search returns the pointer to use for comparison. + * not returns 1 if buff started with a '!' + * 0 otherwise. + */ +static enum regex_type +filter_parse_regex(char *buff, int len, char **search, int *not) +{ + int type = MATCH_FULL; + int i; + + if (buff[0] == '!') { + *not = 1; + buff++; + len--; + } else + *not = 0; + + *search = buff; + + for (i = 0; i < len; i++) { + if (buff[i] == '*') { + if (!i) { + *search = buff + 1; + type = MATCH_END_ONLY; + } else { + if (type == MATCH_END_ONLY) + type = MATCH_MIDDLE_ONLY; + else + type = MATCH_FRONT_ONLY; + buff[i] = 0; + break; + } + } + } + + return type; +} + +static int filter_build_regex(struct filter_pred *pred) +{ + struct regex *r = &pred->regex; + char *search, *dup; + enum regex_type type; + int not; + + type = filter_parse_regex(r->pattern, r->len, &search, ¬); + dup = kstrdup(search, GFP_KERNEL); + if (!dup) + return -ENOMEM; + + strcpy(r->pattern, dup); + kfree(dup); + + r->len = strlen(r->pattern); + + switch (type) { + case MATCH_FULL: + r->match = regex_match_full; + break; + case MATCH_FRONT_ONLY: + r->match = regex_match_front; + break; + case MATCH_MIDDLE_ONLY: + r->match = regex_match_middle; + break; + case MATCH_END_ONLY: + r->match = regex_match_end; + break; + } + + pred->not ^= not; + + return 0; +} + /* return 1 if event matches, 0 otherwise (discard) */ int filter_match_preds(struct ftrace_event_call *call, void *rec) { @@ -394,7 +514,7 @@ static void filter_clear_pred(struct filter_pred *pred) { kfree(pred->field_name); pred->field_name = NULL; - pred->str_len = 0; + pred->regex.len = 0; } static int filter_set_pred(struct filter_pred *dest, @@ -658,21 +778,24 @@ static int filter_add_pred(struct filter_parse_state *ps, } if (is_string_field(field)) { - pred->str_len = field->size; + ret = filter_build_regex(pred); + if (ret) + return ret; - if (field->filter_type == FILTER_STATIC_STRING) + if (field->filter_type == FILTER_STATIC_STRING) { fn = filter_pred_string; - else if (field->filter_type == FILTER_DYN_STRING) - fn = filter_pred_strloc; + pred->regex.field_len = field->size; + } else if (field->filter_type == FILTER_DYN_STRING) + fn = filter_pred_strloc; else { fn = filter_pred_pchar; - pred->str_len = strlen(pred->str_val); + pred->regex.field_len = strlen(pred->regex.pattern); } } else { if (field->is_signed) - ret = strict_strtoll(pred->str_val, 0, &val); + ret = strict_strtoll(pred->regex.pattern, 0, &val); else - ret = strict_strtoull(pred->str_val, 0, &val); + ret = strict_strtoull(pred->regex.pattern, 0, &val); if (ret) { parse_error(ps, FILT_ERR_ILLEGAL_INTVAL, 0); return -EINVAL; @@ -1042,8 +1165,8 @@ static struct filter_pred *create_pred(int op, char *operand1, char *operand2) return NULL; } - strcpy(pred->str_val, operand2); - pred->str_len = strlen(operand2); + strcpy(pred->regex.pattern, operand2); + pred->regex.len = strlen(pred->regex.pattern); pred->op = op; From 3f6fe06dbf67b46d36fedec502300e04dffeb67a Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 24 Sep 2009 21:31:51 +0200 Subject: [PATCH 05/40] tracing/filters: Unify the regex parsing helpers The filter code has stolen the regex parsing function from ftrace to get the regex support. We have duplicated this code, so factorize it in the filter area and make it generally available, as the filter code is the most suited to host this feature. Signed-off-by: Frederic Weisbecker Cc: Steven Rostedt Cc: Tom Zanussi Cc: Li Zefan --- kernel/trace/ftrace.c | 64 +++--------------------------- kernel/trace/trace.h | 9 +++++ kernel/trace/trace_events_filter.c | 20 +++++----- 3 files changed, 23 insertions(+), 70 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index cc615f84751..ddf23a225b5 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1655,60 +1655,6 @@ ftrace_regex_lseek(struct file *file, loff_t offset, int origin) return ret; } -enum { - MATCH_FULL, - MATCH_FRONT_ONLY, - MATCH_MIDDLE_ONLY, - MATCH_END_ONLY, -}; - -/* - * (static function - no need for kernel doc) - * - * Pass in a buffer containing a glob and this function will - * set search to point to the search part of the buffer and - * return the type of search it is (see enum above). - * This does modify buff. - * - * Returns enum type. - * search returns the pointer to use for comparison. - * not returns 1 if buff started with a '!' - * 0 otherwise. - */ -static int -ftrace_setup_glob(char *buff, int len, char **search, int *not) -{ - int type = MATCH_FULL; - int i; - - if (buff[0] == '!') { - *not = 1; - buff++; - len--; - } else - *not = 0; - - *search = buff; - - for (i = 0; i < len; i++) { - if (buff[i] == '*') { - if (!i) { - *search = buff + 1; - type = MATCH_END_ONLY; - } else { - if (type == MATCH_END_ONLY) - type = MATCH_MIDDLE_ONLY; - else - type = MATCH_FRONT_ONLY; - buff[i] = 0; - break; - } - } - } - - return type; -} - static int ftrace_match(char *str, char *regex, int len, int type) { int matched = 0; @@ -1757,7 +1703,7 @@ static void ftrace_match_records(char *buff, int len, int enable) int not; flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE; - type = ftrace_setup_glob(buff, len, &search, ¬); + type = filter_parse_regex(buff, len, &search, ¬); search_len = strlen(search); @@ -1825,7 +1771,7 @@ static void ftrace_match_module_records(char *buff, char *mod, int enable) } if (strlen(buff)) { - type = ftrace_setup_glob(buff, strlen(buff), &search, ¬); + type = filter_parse_regex(buff, strlen(buff), &search, ¬); search_len = strlen(search); } @@ -1990,7 +1936,7 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, int count = 0; char *search; - type = ftrace_setup_glob(glob, strlen(glob), &search, ¬); + type = filter_parse_regex(glob, strlen(glob), &search, ¬); len = strlen(search); /* we do not support '!' for function probes */ @@ -2067,7 +2013,7 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, else if (glob) { int not; - type = ftrace_setup_glob(glob, strlen(glob), &search, ¬); + type = filter_parse_regex(glob, strlen(glob), &search, ¬); len = strlen(search); /* we do not support '!' for function probes */ @@ -2520,7 +2466,7 @@ ftrace_set_func(unsigned long *array, int *idx, char *buffer) return -ENODEV; /* decode regex */ - type = ftrace_setup_glob(buffer, strlen(buffer), &search, ¬); + type = filter_parse_regex(buffer, strlen(buffer), &search, ¬); if (not) return -EINVAL; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 8d0db6018fe..db6b83edd49 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -709,6 +709,13 @@ typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event, typedef int (*regex_match_func)(char *str, struct regex *r, int len); +enum regex_type { + MATCH_FULL, + MATCH_FRONT_ONLY, + MATCH_MIDDLE_ONLY, + MATCH_END_ONLY, +}; + struct regex { char pattern[MAX_FILTER_STR_VAL]; int len; @@ -727,6 +734,8 @@ struct filter_pred { int pop_n; }; +extern enum regex_type +filter_parse_regex(char *buff, int len, char **search, int *not); extern void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s); extern int apply_event_filter(struct ftrace_event_call *call, diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index d3c94c13956..8c194de675b 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -279,15 +279,14 @@ static int regex_match_end(char *str, struct regex *r, int len) return 0; } -enum regex_type { - MATCH_FULL, - MATCH_FRONT_ONLY, - MATCH_MIDDLE_ONLY, - MATCH_END_ONLY, -}; - -/* - * Pass in a buffer containing a regex and this function will +/** + * filter_parse_regex - parse a basic regex + * @buff: the raw regex + * @len: length of the regex + * @search: will point to the beginning of the string to compare + * @not: tell whether the match will have to be inverted + * + * This passes in a buffer containing a regex and this function will * set search to point to the search part of the buffer and * return the type of search it is (see enum above). * This does modify buff. @@ -297,8 +296,7 @@ enum regex_type { * not returns 1 if buff started with a '!' * 0 otherwise. */ -static enum regex_type -filter_parse_regex(char *buff, int len, char **search, int *not) +enum regex_type filter_parse_regex(char *buff, int len, char **search, int *not) { int type = MATCH_FULL; int i; From 925936ebf35a95c290e010b784c962164e6728f3 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 28 Sep 2009 17:12:49 +0200 Subject: [PATCH 06/40] tracing: Pushdown the bkl tracepoints calls Currently we are calling the bkl tracepoint callbacks just before the bkl lock/unlock operations, ie the tracepoint call is not inside a lock_kernel() function but inside a lock_kernel() macro. Hence the bkl trace event header must be included from smp_lock.h. This raises some nasty circular header dependencies: linux/smp_lock.h -> trace/events/bkl.h -> trace/define_trace.h -> trace/ftrace.h -> linux/ftrace_event.h -> linux/hardirq.h -> linux/smp_lock.h This results in incomplete event declarations, spurious event definitions and other kind of funny behaviours. This is hardly fixable without ugly workarounds. So instead, we push the file name, line number and function name as lock_kernel() parameters, so that we only deal with the trace event header from lib/kernel_lock.c This adds two parameters to lock_kernel() and unlock_kernel() but it should be fine wrt to performances because this pair dos not seem to be called in fast paths. Signed-off-by: Frederic Weisbecker Cc: Steven Rostedt Cc: Ingo Molnar Cc: Li Zefan --- include/linux/smp_lock.h | 28 +++++++++++++++------------- lib/kernel_lock.c | 15 +++++++++++---- 2 files changed, 26 insertions(+), 17 deletions(-) diff --git a/include/linux/smp_lock.h b/include/linux/smp_lock.h index d48cc77ba70..2ea1dd1ba21 100644 --- a/include/linux/smp_lock.h +++ b/include/linux/smp_lock.h @@ -3,7 +3,6 @@ #ifdef CONFIG_LOCK_KERNEL #include -#include #define kernel_locked() (current->lock_depth >= 0) @@ -25,18 +24,21 @@ static inline int reacquire_kernel_lock(struct task_struct *task) return 0; } -extern void __lockfunc _lock_kernel(void) __acquires(kernel_lock); -extern void __lockfunc _unlock_kernel(void) __releases(kernel_lock); +extern void __lockfunc +_lock_kernel(const char *func, const char *file, int line) +__acquires(kernel_lock); -#define lock_kernel() { \ - trace_lock_kernel(__func__, __FILE__, __LINE__); \ - _lock_kernel(); \ -} +extern void __lockfunc +_unlock_kernel(const char *func, const char *file, int line) +__releases(kernel_lock); -#define unlock_kernel() { \ - trace_unlock_kernel(__func__, __FILE__, __LINE__); \ - _unlock_kernel(); \ -} +#define lock_kernel() do { \ + _lock_kernel(__func__, __FILE__, __LINE__); \ +} while (0) + +#define unlock_kernel() do { \ + _unlock_kernel(__func__, __FILE__, __LINE__); \ +} while (0) /* * Various legacy drivers don't really need the BKL in a specific @@ -52,8 +54,8 @@ static inline void cycle_kernel_lock(void) #else -#define lock_kernel() trace_lock_kernel(__func__, __FILE__, __LINE__); -#define unlock_kernel() trace_unlock_kernel(__func__, __FILE__, __LINE__); +#define lock_kernel() +#define unlock_kernel() #define release_kernel_lock(task) do { } while(0) #define cycle_kernel_lock() do { } while(0) #define reacquire_kernel_lock(task) 0 diff --git a/lib/kernel_lock.c b/lib/kernel_lock.c index 5c10b2e1fd0..4ebfa5a164d 100644 --- a/lib/kernel_lock.c +++ b/lib/kernel_lock.c @@ -8,9 +8,11 @@ #include #include #include -#define CREATE_TRACE_POINTS #include +#define CREATE_TRACE_POINTS +#include + /* * The 'big kernel lock' * @@ -114,19 +116,24 @@ static inline void __unlock_kernel(void) * This cannot happen asynchronously, so we only need to * worry about other CPU's. */ -void __lockfunc _lock_kernel(void) +void __lockfunc _lock_kernel(const char *func, const char *file, int line) { - int depth = current->lock_depth+1; + int depth = current->lock_depth + 1; + + trace_lock_kernel(func, file, line); + if (likely(!depth)) __lock_kernel(); current->lock_depth = depth; } -void __lockfunc _unlock_kernel(void) +void __lockfunc _unlock_kernel(const char *func, const char *file, int line) { BUG_ON(current->lock_depth < 0); if (likely(--current->lock_depth < 0)) __unlock_kernel(); + + trace_unlock_kernel(func, file, line); } EXPORT_SYMBOL(_lock_kernel); From 3bb258bf430d29a24350fe4f44f8bf07b7b7a8f6 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Sun, 4 Oct 2009 17:53:29 -0700 Subject: [PATCH 07/40] ftrace.c: Add #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - Remove prefixes from pr_, use pr_fmt(fmt). No change in output. Signed-off-by: Joe Perches Acked-by: Steven Rostedt Cc: Frederic Weisbecker LKML-Reference: <9b377eefae9e28c599dd4a17bdc81172965e9931.1254701151.git.joe@perches.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/ftrace.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 9dbb527e165..25e6f5fc4b1 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -9,6 +9,8 @@ * the dangers of modifying code on the run. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include @@ -336,15 +338,15 @@ int __init ftrace_dyn_arch_init(void *data) switch (faulted) { case 0: - pr_info("ftrace: converting mcount calls to 0f 1f 44 00 00\n"); + pr_info("converting mcount calls to 0f 1f 44 00 00\n"); memcpy(ftrace_nop, ftrace_test_p6nop, MCOUNT_INSN_SIZE); break; case 1: - pr_info("ftrace: converting mcount calls to 66 66 66 66 90\n"); + pr_info("converting mcount calls to 66 66 66 66 90\n"); memcpy(ftrace_nop, ftrace_test_nop5, MCOUNT_INSN_SIZE); break; case 2: - pr_info("ftrace: converting mcount calls to jmp . + 5\n"); + pr_info("converting mcount calls to jmp . + 5\n"); memcpy(ftrace_nop, ftrace_test_jmp, MCOUNT_INSN_SIZE); break; } From 3c355863fb32070a2800f41106519c5c3038623a Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Sun, 4 Oct 2009 17:53:40 -0700 Subject: [PATCH 08/40] testmmiotrace.c: Add and use pr_fmt(fmt) - Add #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt. - Strip MODULE_NAME from pr_s. - Remove MODULE_NAME definition. Signed-off-by: Joe Perches LKML-Reference: <3bb66cc7f85f77b9416902e1be7076f7e3f4ad48.1254701151.git.joe@perches.com> Signed-off-by: Ingo Molnar --- arch/x86/mm/testmmiotrace.c | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/arch/x86/mm/testmmiotrace.c b/arch/x86/mm/testmmiotrace.c index 427fd1b56df..8565d944f7c 100644 --- a/arch/x86/mm/testmmiotrace.c +++ b/arch/x86/mm/testmmiotrace.c @@ -1,12 +1,13 @@ /* * Written by Pekka Paalanen, 2008-2009 */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include -#define MODULE_NAME "testmmiotrace" - static unsigned long mmio_address; module_param(mmio_address, ulong, 0); MODULE_PARM_DESC(mmio_address, " Start address of the mapping of 16 kB " @@ -30,7 +31,7 @@ static unsigned v32(unsigned i) static void do_write_test(void __iomem *p) { unsigned int i; - pr_info(MODULE_NAME ": write test.\n"); + pr_info("write test.\n"); mmiotrace_printk("Write test.\n"); for (i = 0; i < 256; i++) @@ -47,7 +48,7 @@ static void do_read_test(void __iomem *p) { unsigned int i; unsigned errs[3] = { 0 }; - pr_info(MODULE_NAME ": read test.\n"); + pr_info("read test.\n"); mmiotrace_printk("Read test.\n"); for (i = 0; i < 256; i++) @@ -68,7 +69,7 @@ static void do_read_test(void __iomem *p) static void do_read_far_test(void __iomem *p) { - pr_info(MODULE_NAME ": read far test.\n"); + pr_info("read far test.\n"); mmiotrace_printk("Read far test.\n"); ioread32(p + read_far); @@ -78,7 +79,7 @@ static void do_test(unsigned long size) { void __iomem *p = ioremap_nocache(mmio_address, size); if (!p) { - pr_err(MODULE_NAME ": could not ioremap, aborting.\n"); + pr_err("could not ioremap, aborting.\n"); return; } mmiotrace_printk("ioremap returned %p.\n", p); @@ -94,24 +95,22 @@ static int __init init(void) unsigned long size = (read_far) ? (8 << 20) : (16 << 10); if (mmio_address == 0) { - pr_err(MODULE_NAME ": you have to use the module argument " - "mmio_address.\n"); - pr_err(MODULE_NAME ": DO NOT LOAD THIS MODULE UNLESS" - " YOU REALLY KNOW WHAT YOU ARE DOING!\n"); + pr_err("you have to use the module argument mmio_address.\n"); + pr_err("DO NOT LOAD THIS MODULE UNLESS YOU REALLY KNOW WHAT YOU ARE DOING!\n"); return -ENXIO; } - pr_warning(MODULE_NAME ": WARNING: mapping %lu kB @ 0x%08lx in PCI " - "address space, and writing 16 kB of rubbish in there.\n", - size >> 10, mmio_address); + pr_warning("WARNING: mapping %lu kB @ 0x%08lx in PCI address space, " + "and writing 16 kB of rubbish in there.\n", + size >> 10, mmio_address); do_test(size); - pr_info(MODULE_NAME ": All done.\n"); + pr_info("All done.\n"); return 0; } static void __exit cleanup(void) { - pr_debug(MODULE_NAME ": unloaded.\n"); + pr_debug("unloaded.\n"); } module_init(init); From 369bc18f9a6c4e2686204c1d7476ab684a720968 Mon Sep 17 00:00:00 2001 From: Stefan Assmann Date: Mon, 12 Oct 2009 22:17:21 +0200 Subject: [PATCH 09/40] ftrace: add kernel command line graph function filtering Add a command line parameter to allow limiting the function graphs that are traced on boot up from the given top-level callers , when ftrace=function_graph is specified. This patch adds the following command line option: ftrace_graph_filter=function-list Where function-list is a comma separated list of functions to filter. [fweisbec@gmail.com: picked the documentation changes from the v2 patch] Signed-off-by: Stefan Assmann Acked-by: Steven Rostedt LKML-Reference: <4AD2DEB9.2@redhat.com> Signed-off-by: Frederic Weisbecker --- Documentation/kernel-parameters.txt | 7 ++++++ kernel/trace/ftrace.c | 34 +++++++++++++++++++++++++++++ 2 files changed, 41 insertions(+) diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 6fa7292947e..1dc4b9cc20e 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -778,6 +778,13 @@ and is between 256 and 4096 characters. It is defined in the file by the set_ftrace_notrace file in the debugfs tracing directory. + ftrace_graph_filter=[function-list] + [FTRACE] Limit the top level callers functions traced + by the function graph tracer at boot up. + function-list is a comma separated list of functions + that can be changed at run time by the + set_graph_function file in the debugfs tracing directory. + gamecon.map[2|3]= [HW,JOY] Multisystem joystick and NES/SNES/PSX pad support via parallel port (up to 5 devices per port) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 9a72853a8f0..91283d40821 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -78,6 +78,10 @@ ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub; ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub; +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +static int ftrace_set_func(unsigned long *array, int *idx, char *buffer); +#endif + static void ftrace_list_func(unsigned long ip, unsigned long parent_ip) { struct ftrace_ops *op = ftrace_list; @@ -2248,6 +2252,7 @@ void ftrace_set_notrace(unsigned char *buf, int len, int reset) #define FTRACE_FILTER_SIZE COMMAND_LINE_SIZE static char ftrace_notrace_buf[FTRACE_FILTER_SIZE] __initdata; static char ftrace_filter_buf[FTRACE_FILTER_SIZE] __initdata; +static char ftrace_graph_buf[FTRACE_FILTER_SIZE] __initdata; static int __init set_ftrace_notrace(char *str) { @@ -2263,6 +2268,31 @@ static int __init set_ftrace_filter(char *str) } __setup("ftrace_filter=", set_ftrace_filter); +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +static int __init set_graph_function(char *str) +{ + strncpy(ftrace_graph_buf, str, FTRACE_FILTER_SIZE); + return 1; +} +__setup("ftrace_graph_filter=", set_graph_function); + +static void __init set_ftrace_early_graph(char *buf) +{ + int ret; + char *func; + + while (buf) { + func = strsep(&buf, ","); + /* we allow only one expression at a time */ + ret = ftrace_set_func(ftrace_graph_funcs, &ftrace_graph_count, + func); + if (ret) + printk(KERN_DEBUG "ftrace: function %s not " + "traceable\n", func); + } +} +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ + static void __init set_ftrace_early_filter(char *buf, int enable) { char *func; @@ -2279,6 +2309,10 @@ static void __init set_ftrace_early_filters(void) set_ftrace_early_filter(ftrace_filter_buf, 1); if (ftrace_notrace_buf[0]) set_ftrace_early_filter(ftrace_notrace_buf, 0); +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + if (ftrace_graph_buf[0]) + set_ftrace_early_graph(ftrace_graph_buf); +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ } static int From aef6f81b55f462082699c06e8e67e6eb5630ed45 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 12 Oct 2009 22:23:24 +0200 Subject: [PATCH 10/40] tracing: Rename set_ftrace to set_bootup_ftrace Do this rename because set_ftrace is too much generic and not enough self-explainable as a name. Signed-off-by: Frederic Weisbecker Cc: Steven Rostedt Cc: Li Zefan --- kernel/trace/trace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 45068269ebb..866daf8497f 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -129,7 +129,7 @@ static int tracing_set_tracer(const char *buf); static char bootup_tracer_buf[MAX_TRACER_SIZE] __initdata; static char *default_bootup_tracer; -static int __init set_ftrace(char *str) +static int __init set_bootup_ftrace(char *str) { strncpy(bootup_tracer_buf, str, MAX_TRACER_SIZE); default_bootup_tracer = bootup_tracer_buf; @@ -137,7 +137,7 @@ static int __init set_ftrace(char *str) ring_buffer_expanded = 1; return 1; } -__setup("ftrace=", set_ftrace); +__setup("ftrace=", set_bootup_ftrace); static int __init set_ftrace_dump_on_oops(char *str) { From bf7c5b43a12614847b83f507fb169ad30640e406 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 12 Oct 2009 22:31:32 +0200 Subject: [PATCH 11/40] tracing: Remove unused ftrace_trace_addr helper Remove the ftrace_trace_addr() function as only its off-case is implemented and there are no users of it currently. But we keep ftrace_graph_addr() off-case, in case someone come to use the function graph tracer to profit from top-level callers filtering. Signed-off-by: Frederic Weisbecker Cc: Steven Rostedt Cc: Li Zefan --- kernel/trace/trace.h | 4 ---- 1 file changed, 4 deletions(-) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 365fb19d9e1..f22a7ac3238 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -483,10 +483,6 @@ static inline int ftrace_graph_addr(unsigned long addr) return 0; } #else -static inline int ftrace_trace_addr(unsigned long addr) -{ - return 1; -} static inline int ftrace_graph_addr(unsigned long addr) { return 1; From 194ec34184869f0de1cf255c924fc5299e1b3d27 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 13 Oct 2009 16:33:50 -0400 Subject: [PATCH 12/40] function-graph/x86: Replace unbalanced ret with jmp The function graph tracer replaces the return address with a hook to trace the exit of the function call. This hook will finish by returning to the real location the function should return to. But the current implementation uses a ret to jump to the real return location. This causes a imbalance between calls and ret. That is the original function does a call, the ret goes to the handler and then the handler does a ret without a matching call. Although the function graph tracer itself still breaks the branch predictor by replacing the original ret, by using a second ret and causing an imbalance, it breaks the predictor even more. This patch replaces the ret with a jmp to keep the calls and ret balanced. I tested this on one box and it showed a 1.7% increase in performance. Another box only showed a small 0.3% increase. But no box that I tested this on showed a decrease in performance by making this change. Signed-off-by: Steven Rostedt Acked-by: Mathieu Desnoyers Cc: Frederic Weisbecker LKML-Reference: <20091013203425.042034383@goodmis.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_32.S | 7 ++----- arch/x86/kernel/entry_64.S | 6 +++--- 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index c097e7d607c..7d52e9da5e0 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -1185,17 +1185,14 @@ END(ftrace_graph_caller) .globl return_to_handler return_to_handler: - pushl $0 pushl %eax - pushl %ecx pushl %edx movl %ebp, %eax call ftrace_return_to_handler - movl %eax, 0xc(%esp) + movl %eax, %ecx popl %edx - popl %ecx popl %eax - ret + jmp *%ecx #endif .section .rodata,"a" diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index b5c061f8f35..bd5bbddddf9 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -155,11 +155,11 @@ GLOBAL(return_to_handler) call ftrace_return_to_handler - movq %rax, 16(%rsp) + movq %rax, %rdi movq 8(%rsp), %rdx movq (%rsp), %rax - addq $16, %rsp - retq + addq $24, %rsp + jmp *%rdi #endif From 756d17ee7ee4fbc8238bdf97100af63e6ac441ef Mon Sep 17 00:00:00 2001 From: "jolsa@redhat.com" Date: Tue, 13 Oct 2009 16:33:52 -0400 Subject: [PATCH 13/40] tracing: Support multiple pids in set_pid_ftrace file Adding the possibility to set more than 1 pid in the set_pid_ftrace file, thus allowing to trace more than 1 independent processes. Usage: sh-4.0# echo 284 > ./set_ftrace_pid sh-4.0# cat ./set_ftrace_pid 284 sh-4.0# echo 1 >> ./set_ftrace_pid sh-4.0# echo 0 >> ./set_ftrace_pid sh-4.0# cat ./set_ftrace_pid swapper tasks 1 284 sh-4.0# echo 4 > ./set_ftrace_pid sh-4.0# cat ./set_ftrace_pid 4 sh-4.0# echo > ./set_ftrace_pid sh-4.0# cat ./set_ftrace_pid no pid sh-4.0# Signed-off-by: Jiri Olsa Cc: Frederic Weisbecker LKML-Reference: <20091013203425.565454612@goodmis.org> Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/ftrace.c | 234 +++++++++++++++++++++++++++++------------- kernel/trace/trace.h | 4 +- 2 files changed, 167 insertions(+), 71 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 45c965919cf..0c799d1af70 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -60,6 +60,13 @@ static int last_ftrace_enabled; /* Quick disabling of function tracer. */ int function_trace_stop; +/* List for set_ftrace_pid's pids. */ +LIST_HEAD(ftrace_pids); +struct ftrace_pid { + struct list_head list; + struct pid *pid; +}; + /* * ftrace_disabled is set when an anomaly is discovered. * ftrace_disabled is much stronger than ftrace_enabled. @@ -159,7 +166,7 @@ static int __register_ftrace_function(struct ftrace_ops *ops) else func = ftrace_list_func; - if (ftrace_pid_trace) { + if (!list_empty(&ftrace_pids)) { set_ftrace_pid_function(func); func = ftrace_pid_func; } @@ -207,7 +214,7 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops) if (ftrace_list->next == &ftrace_list_end) { ftrace_func_t func = ftrace_list->func; - if (ftrace_pid_trace) { + if (!list_empty(&ftrace_pids)) { set_ftrace_pid_function(func); func = ftrace_pid_func; } @@ -235,7 +242,7 @@ static void ftrace_update_pid_func(void) func = __ftrace_trace_function; #endif - if (ftrace_pid_trace) { + if (!list_empty(&ftrace_pids)) { set_ftrace_pid_function(func); func = ftrace_pid_func; } else { @@ -825,8 +832,6 @@ static __init void ftrace_profile_debugfs(struct dentry *d_tracer) } #endif /* CONFIG_FUNCTION_PROFILER */ -/* set when tracing only a pid */ -struct pid *ftrace_pid_trace; static struct pid * const ftrace_swapper_pid = &init_struct_pid; #ifdef CONFIG_DYNAMIC_FTRACE @@ -2758,23 +2763,6 @@ static inline void ftrace_startup_enable(int command) { } # define ftrace_shutdown_sysctl() do { } while (0) #endif /* CONFIG_DYNAMIC_FTRACE */ -static ssize_t -ftrace_pid_read(struct file *file, char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - char buf[64]; - int r; - - if (ftrace_pid_trace == ftrace_swapper_pid) - r = sprintf(buf, "swapper tasks\n"); - else if (ftrace_pid_trace) - r = sprintf(buf, "%u\n", pid_vnr(ftrace_pid_trace)); - else - r = sprintf(buf, "no pid\n"); - - return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); -} - static void clear_ftrace_swapper(void) { struct task_struct *p; @@ -2825,14 +2813,12 @@ static void set_ftrace_pid(struct pid *pid) rcu_read_unlock(); } -static void clear_ftrace_pid_task(struct pid **pid) +static void clear_ftrace_pid_task(struct pid *pid) { - if (*pid == ftrace_swapper_pid) + if (pid == ftrace_swapper_pid) clear_ftrace_swapper(); else - clear_ftrace_pid(*pid); - - *pid = NULL; + clear_ftrace_pid(pid); } static void set_ftrace_pid_task(struct pid *pid) @@ -2843,11 +2829,140 @@ static void set_ftrace_pid_task(struct pid *pid) set_ftrace_pid(pid); } +static int ftrace_pid_add(int p) +{ + struct pid *pid; + struct ftrace_pid *fpid; + int ret = -EINVAL; + + mutex_lock(&ftrace_lock); + + if (!p) + pid = ftrace_swapper_pid; + else + pid = find_get_pid(p); + + if (!pid) + goto out; + + ret = 0; + + list_for_each_entry(fpid, &ftrace_pids, list) + if (fpid->pid == pid) + goto out_put; + + ret = -ENOMEM; + + fpid = kmalloc(sizeof(*fpid), GFP_KERNEL); + if (!fpid) + goto out_put; + + list_add(&fpid->list, &ftrace_pids); + fpid->pid = pid; + + set_ftrace_pid_task(pid); + + ftrace_update_pid_func(); + ftrace_startup_enable(0); + + mutex_unlock(&ftrace_lock); + return 0; + +out_put: + if (pid != ftrace_swapper_pid) + put_pid(pid); + +out: + mutex_unlock(&ftrace_lock); + return ret; +} + +static void ftrace_pid_reset(void) +{ + struct ftrace_pid *fpid, *safe; + + mutex_lock(&ftrace_lock); + list_for_each_entry_safe(fpid, safe, &ftrace_pids, list) { + struct pid *pid = fpid->pid; + + clear_ftrace_pid_task(pid); + + list_del(&fpid->list); + kfree(fpid); + } + + ftrace_update_pid_func(); + ftrace_startup_enable(0); + + mutex_unlock(&ftrace_lock); +} + +static void *fpid_start(struct seq_file *m, loff_t *pos) +{ + mutex_lock(&ftrace_lock); + + if (list_empty(&ftrace_pids) && (!*pos)) + return (void *) 1; + + return seq_list_start(&ftrace_pids, *pos); +} + +static void *fpid_next(struct seq_file *m, void *v, loff_t *pos) +{ + if (v == (void *)1) + return NULL; + + return seq_list_next(v, &ftrace_pids, pos); +} + +static void fpid_stop(struct seq_file *m, void *p) +{ + mutex_unlock(&ftrace_lock); +} + +static int fpid_show(struct seq_file *m, void *v) +{ + const struct ftrace_pid *fpid = list_entry(v, struct ftrace_pid, list); + + if (v == (void *)1) { + seq_printf(m, "no pid\n"); + return 0; + } + + if (fpid->pid == ftrace_swapper_pid) + seq_printf(m, "swapper tasks\n"); + else + seq_printf(m, "%u\n", pid_vnr(fpid->pid)); + + return 0; +} + +static const struct seq_operations ftrace_pid_sops = { + .start = fpid_start, + .next = fpid_next, + .stop = fpid_stop, + .show = fpid_show, +}; + +static int +ftrace_pid_open(struct inode *inode, struct file *file) +{ + int ret = 0; + + if ((file->f_mode & FMODE_WRITE) && + (file->f_flags & O_TRUNC)) + ftrace_pid_reset(); + + if (file->f_mode & FMODE_READ) + ret = seq_open(file, &ftrace_pid_sops); + + return ret; +} + static ssize_t ftrace_pid_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { - struct pid *pid; char buf[64]; long val; int ret; @@ -2860,57 +2975,38 @@ ftrace_pid_write(struct file *filp, const char __user *ubuf, buf[cnt] = 0; + /* + * Allow "echo > set_ftrace_pid" or "echo -n '' > set_ftrace_pid" + * to clean the filter quietly. + */ + strstrip(buf); + if (strlen(buf) == 0) + return 1; + ret = strict_strtol(buf, 10, &val); if (ret < 0) return ret; - mutex_lock(&ftrace_lock); - if (val < 0) { - /* disable pid tracing */ - if (!ftrace_pid_trace) - goto out; + ret = ftrace_pid_add(val); - clear_ftrace_pid_task(&ftrace_pid_trace); + return ret ? ret : cnt; +} - } else { - /* swapper task is special */ - if (!val) { - pid = ftrace_swapper_pid; - if (pid == ftrace_pid_trace) - goto out; - } else { - pid = find_get_pid(val); +static int +ftrace_pid_release(struct inode *inode, struct file *file) +{ + if (file->f_mode & FMODE_READ) + seq_release(inode, file); - if (pid == ftrace_pid_trace) { - put_pid(pid); - goto out; - } - } - - if (ftrace_pid_trace) - clear_ftrace_pid_task(&ftrace_pid_trace); - - if (!pid) - goto out; - - ftrace_pid_trace = pid; - - set_ftrace_pid_task(ftrace_pid_trace); - } - - /* update the function call */ - ftrace_update_pid_func(); - ftrace_startup_enable(0); - - out: - mutex_unlock(&ftrace_lock); - - return cnt; + return 0; } static const struct file_operations ftrace_pid_fops = { - .read = ftrace_pid_read, - .write = ftrace_pid_write, + .open = ftrace_pid_open, + .write = ftrace_pid_write, + .read = seq_read, + .llseek = seq_lseek, + .release = ftrace_pid_release, }; static __init int ftrace_init_debugfs(void) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index f22a7ac3238..acef8b4636f 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -496,12 +496,12 @@ print_graph_function(struct trace_iterator *iter) } #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ -extern struct pid *ftrace_pid_trace; +extern struct list_head ftrace_pids; #ifdef CONFIG_FUNCTION_TRACER static inline int ftrace_trace_task(struct task_struct *task) { - if (!ftrace_pid_trace) + if (list_empty(&ftrace_pids)) return 1; return test_tsk_trace_trace(task); From 5cb084bb1f3fd4dcdaf7e4cf564994346ec8f783 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 13 Oct 2009 16:33:53 -0400 Subject: [PATCH 14/40] tracing: Enable records during the module load I was debuging some module using "function" and "function_graph" tracers and noticed, that if you load module after you enabled tracing, the module's hooks will convert only to NOP instructions. The attached patch enables modules' hooks if there's function trace allready on, thus allowing to trace module functions. Signed-off-by: Jiri Olsa Cc: Frederic Weisbecker Signed-off-by: Steven Rostedt LKML-Reference: <20091013203425.896285120@goodmis.org> Signed-off-by: Ingo Molnar --- kernel/trace/ftrace.c | 38 ++++++++++++++++++++++++++++++-------- 1 file changed, 30 insertions(+), 8 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 0c799d1af70..aaea9cda878 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1270,12 +1270,34 @@ static int ftrace_update_code(struct module *mod) ftrace_new_addrs = p->newlist; p->flags = 0L; - /* convert record (i.e, patch mcount-call with NOP) */ - if (ftrace_code_disable(mod, p)) { - p->flags |= FTRACE_FL_CONVERTED; - ftrace_update_cnt++; - } else + /* + * Do the initial record convertion from mcount jump + * to the NOP instructions. + */ + if (!ftrace_code_disable(mod, p)) { ftrace_free_rec(p); + continue; + } + + p->flags |= FTRACE_FL_CONVERTED; + ftrace_update_cnt++; + + /* + * If the tracing is enabled, go ahead and enable the record. + * + * The reason not to enable the record immediatelly is the + * inherent check of ftrace_make_nop/ftrace_make_call for + * correct previous instructions. Making first the NOP + * conversion puts the module to the correct state, thus + * passing the ftrace_make_call check. + */ + if (ftrace_start_up) { + int failed = __ftrace_replace_code(p, 1); + if (failed) { + ftrace_bug(failed, p->ip); + ftrace_free_rec(p); + } + } } stop = ftrace_now(raw_smp_processor_id()); @@ -2609,7 +2631,7 @@ static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer) return 0; } -static int ftrace_convert_nops(struct module *mod, +static int ftrace_process_locs(struct module *mod, unsigned long *start, unsigned long *end) { @@ -2669,7 +2691,7 @@ static void ftrace_init_module(struct module *mod, { if (ftrace_disabled || start == end) return; - ftrace_convert_nops(mod, start, end); + ftrace_process_locs(mod, start, end); } static int ftrace_module_notify(struct notifier_block *self, @@ -2730,7 +2752,7 @@ void __init ftrace_init(void) last_ftrace_enabled = ftrace_enabled = 1; - ret = ftrace_convert_nops(NULL, + ret = ftrace_process_locs(NULL, __start_mcount_loc, __stop_mcount_loc); From 4d8289494a37e19cd7f3beacea9c957ad3debad6 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 13 Oct 2009 16:33:54 -0400 Subject: [PATCH 15/40] tracing: Enable "__cold" functions Based on the commit: a586df06 "x86: Support __attribute__((__cold__)) in gcc 4.3" some of the functions goes to the ".text.unlikely" section. Looks like there's not many of them (I found printk, panic, __ssb_dma_not_implemented, fat_fs_error), but still worth to include I think. Signed-off-by: Jiri Olsa Cc: Frederic Weisbecker Signed-off-by: Steven Rostedt LKML-Reference: <20091013203426.175845614@goodmis.org> Signed-off-by: Ingo Molnar --- scripts/recordmcount.pl | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/recordmcount.pl b/scripts/recordmcount.pl index 090d300d739..bfb8b2cdd92 100755 --- a/scripts/recordmcount.pl +++ b/scripts/recordmcount.pl @@ -119,6 +119,7 @@ my %text_sections = ( ".sched.text" => 1, ".spinlock.text" => 1, ".irqentry.text" => 1, + ".text.unlikely" => 1, ); $objdump = "objdump" if ((length $objdump) == 0); From c44fc770845163f8d9e573f37f92a7b7a7ade14e Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 19 Sep 2009 06:50:42 +0200 Subject: [PATCH 16/40] tracing: Move syscalls metadata handling from arch to core Most of the syscalls metadata processing is done from arch. But these operations are mostly generic accross archs. Especially now that we have a common variable name that expresses the number of syscalls supported by an arch: NR_syscalls, the only remaining bits that need to reside in arch is the syscall nr to addr translation. v2: Compare syscalls symbols only after the "sys" prefix so that we avoid spurious mismatches with archs that have syscalls wrappers, in which case syscalls symbols have "SyS" prefixed aliases. (Reported by: Heiko Carstens) Signed-off-by: Frederic Weisbecker Acked-by: Heiko Carstens Cc: Ingo Molnar Cc: Steven Rostedt Cc: Li Zefan Cc: Masami Hiramatsu Cc: Jason Baron Cc: Lai Jiangshan Cc: Martin Schwidefsky Cc: Paul Mundt --- arch/s390/kernel/ftrace.c | 67 +-------------------------- arch/x86/kernel/ftrace.c | 76 +------------------------------ include/trace/syscall.h | 2 +- kernel/trace/trace_syscalls.c | 86 +++++++++++++++++++++++++++++++++++ 4 files changed, 91 insertions(+), 140 deletions(-) diff --git a/arch/s390/kernel/ftrace.c b/arch/s390/kernel/ftrace.c index 57bdcb1e3cd..7c5752c3423 100644 --- a/arch/s390/kernel/ftrace.c +++ b/arch/s390/kernel/ftrace.c @@ -206,73 +206,10 @@ out: #ifdef CONFIG_FTRACE_SYSCALLS -extern unsigned long __start_syscalls_metadata[]; -extern unsigned long __stop_syscalls_metadata[]; extern unsigned int sys_call_table[]; -static struct syscall_metadata **syscalls_metadata; - -struct syscall_metadata *syscall_nr_to_meta(int nr) +unsigned long __init arch_syscall_addr(int nr) { - if (!syscalls_metadata || nr >= NR_syscalls || nr < 0) - return NULL; - - return syscalls_metadata[nr]; + return (unsigned long)sys_call_table[nr]; } - -int syscall_name_to_nr(char *name) -{ - int i; - - if (!syscalls_metadata) - return -1; - for (i = 0; i < NR_syscalls; i++) - if (syscalls_metadata[i]) - if (!strcmp(syscalls_metadata[i]->name, name)) - return i; - return -1; -} - -void set_syscall_enter_id(int num, int id) -{ - syscalls_metadata[num]->enter_id = id; -} - -void set_syscall_exit_id(int num, int id) -{ - syscalls_metadata[num]->exit_id = id; -} - -static struct syscall_metadata *find_syscall_meta(unsigned long syscall) -{ - struct syscall_metadata *start; - struct syscall_metadata *stop; - char str[KSYM_SYMBOL_LEN]; - - start = (struct syscall_metadata *)__start_syscalls_metadata; - stop = (struct syscall_metadata *)__stop_syscalls_metadata; - kallsyms_lookup(syscall, NULL, NULL, NULL, str); - - for ( ; start < stop; start++) { - if (start->name && !strcmp(start->name + 3, str + 3)) - return start; - } - return NULL; -} - -static int __init arch_init_ftrace_syscalls(void) -{ - struct syscall_metadata *meta; - int i; - syscalls_metadata = kzalloc(sizeof(*syscalls_metadata) * NR_syscalls, - GFP_KERNEL); - if (!syscalls_metadata) - return -ENOMEM; - for (i = 0; i < NR_syscalls; i++) { - meta = find_syscall_meta((unsigned long)sys_call_table[i]); - syscalls_metadata[i] = meta; - } - return 0; -} -arch_initcall(arch_init_ftrace_syscalls); #endif diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 25e6f5fc4b1..5a1b9758fd6 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -470,82 +470,10 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr, #ifdef CONFIG_FTRACE_SYSCALLS -extern unsigned long __start_syscalls_metadata[]; -extern unsigned long __stop_syscalls_metadata[]; extern unsigned long *sys_call_table; -static struct syscall_metadata **syscalls_metadata; - -static struct syscall_metadata *find_syscall_meta(unsigned long *syscall) +unsigned long __init arch_syscall_addr(int nr) { - struct syscall_metadata *start; - struct syscall_metadata *stop; - char str[KSYM_SYMBOL_LEN]; - - - start = (struct syscall_metadata *)__start_syscalls_metadata; - stop = (struct syscall_metadata *)__stop_syscalls_metadata; - kallsyms_lookup((unsigned long) syscall, NULL, NULL, NULL, str); - - for ( ; start < stop; start++) { - if (start->name && !strcmp(start->name, str)) - return start; - } - return NULL; + return (unsigned long)(&sys_call_table)[nr]; } - -struct syscall_metadata *syscall_nr_to_meta(int nr) -{ - if (!syscalls_metadata || nr >= NR_syscalls || nr < 0) - return NULL; - - return syscalls_metadata[nr]; -} - -int syscall_name_to_nr(char *name) -{ - int i; - - if (!syscalls_metadata) - return -1; - - for (i = 0; i < NR_syscalls; i++) { - if (syscalls_metadata[i]) { - if (!strcmp(syscalls_metadata[i]->name, name)) - return i; - } - } - return -1; -} - -void set_syscall_enter_id(int num, int id) -{ - syscalls_metadata[num]->enter_id = id; -} - -void set_syscall_exit_id(int num, int id) -{ - syscalls_metadata[num]->exit_id = id; -} - -static int __init arch_init_ftrace_syscalls(void) -{ - int i; - struct syscall_metadata *meta; - unsigned long **psys_syscall_table = &sys_call_table; - - syscalls_metadata = kzalloc(sizeof(*syscalls_metadata) * - NR_syscalls, GFP_KERNEL); - if (!syscalls_metadata) { - WARN_ON(1); - return -ENOMEM; - } - - for (i = 0; i < NR_syscalls; i++) { - meta = find_syscall_meta(psys_syscall_table[i]); - syscalls_metadata[i] = meta; - } - return 0; -} -arch_initcall(arch_init_ftrace_syscalls); #endif diff --git a/include/trace/syscall.h b/include/trace/syscall.h index 5dc283ba5ae..e972f0a40f8 100644 --- a/include/trace/syscall.h +++ b/include/trace/syscall.h @@ -33,7 +33,7 @@ struct syscall_metadata { }; #ifdef CONFIG_FTRACE_SYSCALLS -extern struct syscall_metadata *syscall_nr_to_meta(int nr); +extern unsigned long arch_syscall_addr(int nr); extern int syscall_name_to_nr(char *name); void set_syscall_enter_id(int num, int id); void set_syscall_exit_id(int num, int id); diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 9fbce6c9d2e..8bda4bff228 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -14,6 +14,69 @@ static int sys_refcount_exit; static DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls); static DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls); +extern unsigned long __start_syscalls_metadata[]; +extern unsigned long __stop_syscalls_metadata[]; + +static struct syscall_metadata **syscalls_metadata; + +static struct syscall_metadata *find_syscall_meta(unsigned long syscall) +{ + struct syscall_metadata *start; + struct syscall_metadata *stop; + char str[KSYM_SYMBOL_LEN]; + + + start = (struct syscall_metadata *)__start_syscalls_metadata; + stop = (struct syscall_metadata *)__stop_syscalls_metadata; + kallsyms_lookup(syscall, NULL, NULL, NULL, str); + + for ( ; start < stop; start++) { + /* + * Only compare after the "sys" prefix. Archs that use + * syscall wrappers may have syscalls symbols aliases prefixed + * with "SyS" instead of "sys", leading to an unwanted + * mismatch. + */ + if (start->name && !strcmp(start->name + 3, str + 3)) + return start; + } + return NULL; +} + +static struct syscall_metadata *syscall_nr_to_meta(int nr) +{ + if (!syscalls_metadata || nr >= NR_syscalls || nr < 0) + return NULL; + + return syscalls_metadata[nr]; +} + +int syscall_name_to_nr(char *name) +{ + int i; + + if (!syscalls_metadata) + return -1; + + for (i = 0; i < NR_syscalls; i++) { + if (syscalls_metadata[i]) { + if (!strcmp(syscalls_metadata[i]->name, name)) + return i; + } + } + return -1; +} + +void set_syscall_enter_id(int num, int id) +{ + syscalls_metadata[num]->enter_id = id; +} + +void set_syscall_exit_id(int num, int id) +{ + syscalls_metadata[num]->exit_id = id; +} + enum print_line_t print_syscall_enter(struct trace_iterator *iter, int flags) { @@ -375,6 +438,29 @@ struct trace_event event_syscall_exit = { .trace = print_syscall_exit, }; +int __init init_ftrace_syscalls(void) +{ + struct syscall_metadata *meta; + unsigned long addr; + int i; + + syscalls_metadata = kzalloc(sizeof(*syscalls_metadata) * + NR_syscalls, GFP_KERNEL); + if (!syscalls_metadata) { + WARN_ON(1); + return -ENOMEM; + } + + for (i = 0; i < NR_syscalls; i++) { + addr = arch_syscall_addr(i); + meta = find_syscall_meta(addr); + syscalls_metadata[i] = meta; + } + + return 0; +} +core_initcall(init_ftrace_syscalls); + #ifdef CONFIG_EVENT_PROFILE static DECLARE_BITMAP(enabled_prof_enter_syscalls, NR_syscalls); From 459c6d15a0c52bae43842ff2cd0dd41aa7de9b7f Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 19 Sep 2009 07:14:15 +0200 Subject: [PATCH 17/40] tracing: Document HAVE_SYSCALL_TRACEPOINTS needs Document the arch needed requirements to get the support for syscalls tracing. v2: HAVE_FTRACE_SYSCALLS have been changed to HAVE_SYSCALL_TRACEPOINTS recently. Update this config name in the documentation then. Signed-off-by: Frederic Weisbecker Acked-by: Heiko Carstens Cc: Ingo Molnar Cc: Steven Rostedt Cc: Li Zefan Cc: Masami Hiramatsu Cc: Jason Baron Cc: Lai Jiangshan Cc: Martin Schwidefsky Cc: Paul Mundt --- Documentation/trace/ftrace-design.txt | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/Documentation/trace/ftrace-design.txt b/Documentation/trace/ftrace-design.txt index 7003e10f10f..641a1ef2a7f 100644 --- a/Documentation/trace/ftrace-design.txt +++ b/Documentation/trace/ftrace-design.txt @@ -213,10 +213,19 @@ If you can't trace NMI functions, then skip this option.
-HAVE_FTRACE_SYSCALLS +HAVE_SYSCALL_TRACEPOINTS --------------------- -
+You need very few things to get the syscalls tracing in an arch. + +- Have a NR_syscalls variable in that provides the number + of syscalls supported by the arch. +- Implement arch_syscall_addr() that resolves a syscall address from a + syscall number. +- Support the TIF_SYSCALL_TRACEPOINT thread flags +- Put the trace_sys_enter() and trace_sys_exit() tracepoints calls from ptrace + in the ptrace syscalls tracing path. +- Tag this arch as HAVE_SYSCALL_TRACEPOINTS. HAVE_FTRACE_MCOUNT_RECORD From 06f43d66ec36388056f5c697bf1e67c0e0a1645c Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 14 Oct 2009 20:43:39 +0200 Subject: [PATCH 18/40] ftrace: Copy ftrace_graph_filter boot param using strlcpy We are using strncpy in the wrong way to copy the ftrace_graph_filter boot param because we pass the buffer size instead of the max string size it can contain (buffer size - 1). The end result might not be NULL terminated as we are abusing the max string size. Lets use strlcpy() instead. Reported-by: Li Zefan Signed-off-by: Frederic Weisbecker Cc: Steven Rostedt --- kernel/trace/ftrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index aaea9cda878..b10c0d90a6f 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2293,7 +2293,7 @@ __setup("ftrace_filter=", set_ftrace_filter); #ifdef CONFIG_FUNCTION_GRAPH_TRACER static int __init set_graph_function(char *str) { - strncpy(ftrace_graph_buf, str, FTRACE_FILTER_SIZE); + strlcpy(ftrace_graph_buf, str, FTRACE_FILTER_SIZE); return 1; } __setup("ftrace_graph_filter=", set_graph_function); From 1beee96bae0daf7f491356777c3080cc436950f5 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 14 Oct 2009 20:50:32 +0200 Subject: [PATCH 19/40] ftrace: Rename set_bootup_ftrace into set_cmdline_ftrace set_cmdline_ftrace is a better match against what does this function: apply a tracer name from the kernel command line. Reported-by: Steven Rostedt Signed-off-by: Frederic Weisbecker Cc: Li Zefan --- kernel/trace/trace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 4311ec3062f..026e715a0c7 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -129,7 +129,7 @@ static int tracing_set_tracer(const char *buf); static char bootup_tracer_buf[MAX_TRACER_SIZE] __initdata; static char *default_bootup_tracer; -static int __init set_bootup_ftrace(char *str) +static int __init set_cmdline_ftrace(char *str) { strncpy(bootup_tracer_buf, str, MAX_TRACER_SIZE); default_bootup_tracer = bootup_tracer_buf; @@ -137,7 +137,7 @@ static int __init set_bootup_ftrace(char *str) ring_buffer_expanded = 1; return 1; } -__setup("ftrace=", set_bootup_ftrace); +__setup("ftrace=", set_cmdline_ftrace); static int __init set_ftrace_dump_on_oops(char *str) { From d49f6aa76d24c60a52530474cb662e8ad9f09471 Mon Sep 17 00:00:00 2001 From: Li Hong Date: Wed, 28 Oct 2009 13:01:38 +0800 Subject: [PATCH 20/40] tracing: Amend documentation in recordmcount.pl to reflect implementation The documentation currently says we will use the first function in a section as a reference. The actual algorithm is: choose the first global function we meet as a reference. If there is none, choose the first local one. Change the documentation to be consistent with the code. Also add several other clarifications. Signed-off-by: Li Hong LKML-Reference: <20091028050138.GA30758@uhli> Signed-off-by: Steven Rostedt --- scripts/recordmcount.pl | 84 ++++++++++++++++++++++++----------------- 1 file changed, 49 insertions(+), 35 deletions(-) diff --git a/scripts/recordmcount.pl b/scripts/recordmcount.pl index bfb8b2cdd92..a569be72f3b 100755 --- a/scripts/recordmcount.pl +++ b/scripts/recordmcount.pl @@ -6,73 +6,89 @@ # all the offsets to the calls to mcount. # # -# What we want to end up with is a section in vmlinux called -# __mcount_loc that contains a list of pointers to all the -# call sites in the kernel that call mcount. Later on boot up, the kernel -# will read this list, save the locations and turn them into nops. -# When tracing or profiling is later enabled, these locations will then -# be converted back to pointers to some function. +# What we want to end up with this is that each object file will have a +# section called __mcount_loc that will hold the list of pointers to mcount +# callers. After final linking, the vmlinux will have within .init.data the +# list of all callers to mcount between __start_mcount_loc and __stop_mcount_loc. +# Later on boot up, the kernel will read this list, save the locations and turn +# them into nops. When tracing or profiling is later enabled, these locations +# will then be converted back to pointers to some function. # # This is no easy feat. This script is called just after the original # object is compiled and before it is linked. # -# The references to the call sites are offsets from the section of text -# that the call site is in. Hence, all functions in a section that -# has a call site to mcount, will have the offset from the beginning of -# the section and not the beginning of the function. +# When parse this object file using 'objdump', the references to the call +# sites are offsets from the section that the call site is in. Hence, all +# functions in a section that has a call site to mcount, will have the +# offset from the beginning of the section and not the beginning of the +# function. +# +# But where this section will reside finally in vmlinx is undetermined at +# this point. So we can't use this kind of offsets to record the final +# address of this call site. +# +# The trick is to change the call offset referring the start of a section to +# referring a function symbol in this section. During the link step, 'ld' will +# compute the final address according to the information we record. # -# The trick is to find a way to record the beginning of the section. -# The way we do this is to look at the first function in the section -# which will also be the location of that section after final link. # e.g. # # .section ".sched.text", "ax" -# .globl my_func -# my_func: # [...] -# call mcount (offset: 0x5) +# func1: +# [...] +# call mcount (offset: 0x10) # [...] # ret -# other_func: +# .globl fun2 +# func2: (offset: 0x20) # [...] -# call mcount (offset: 0x1b) +# [...] +# ret +# func3: +# [...] +# call mcount (offset: 0x30) # [...] # # Both relocation offsets for the mcounts in the above example will be -# offset from .sched.text. If we make another file called tmp.s with: +# offset from .sched.text. If we choose global symbol func2 as a reference and +# make another file called tmp.s with the new offsets: # # .section __mcount_loc -# .quad my_func + 0x5 -# .quad my_func + 0x1b +# .quad func2 - 0x10 +# .quad func2 + 0x10 # -# We can then compile this tmp.s into tmp.o, and link it to the original +# We can then compile this tmp.s into tmp.o, and link it back to the original # object. # -# But this gets hard if my_func is not globl (a static function). -# In such a case we have: +# In our algorithm, we will choose the first global function we meet in this +# section as the reference. But this gets hard if there is no global functions +# in this section. In such a case we have to select a local one. E.g. func1: # # .section ".sched.text", "ax" -# my_func: +# func1: # [...] -# call mcount (offset: 0x5) +# call mcount (offset: 0x10) # [...] # ret -# other_func: +# func2: # [...] -# call mcount (offset: 0x1b) +# call mcount (offset: 0x20) # [...] +# .section "other.section" # # If we make the tmp.s the same as above, when we link together with -# the original object, we will end up with two symbols for my_func: +# the original object, we will end up with two symbols for func1: # one local, one global. After final compile, we will end up with -# an undefined reference to my_func. +# an undefined reference to func1 or a wrong reference to another global +# func1 in other files. # # Since local objects can reference local variables, we need to find # a way to make tmp.o reference the local objects of the original object -# file after it is linked together. To do this, we convert the my_func +# file after it is linked together. To do this, we convert func1 # into a global symbol before linking tmp.o. Then after we link tmp.o -# we will only have a single symbol for my_func that is global. -# We can convert my_func back into a local symbol and we are done. +# we will only have a single symbol for func1 that is global. +# We can convert func1 back into a local symbol and we are done. # # Here are the steps we take: # @@ -86,10 +102,8 @@ # 6) Link together this new object with the list object. # 7) Convert the local functions back to local symbols and rename # the result as the original object. -# End. # 8) Link the object with the list object. # 9) Move the result back to the original object. -# End. # use strict; From e2d753fac5b3954a3b6001f98479f0435fe7c868 Mon Sep 17 00:00:00 2001 From: Li Hong Date: Tue, 27 Oct 2009 14:57:33 +0800 Subject: [PATCH 21/40] tracing: Correct the check for number of arguments in recordmcount.pl The number of arguments passed into recordmcount.pl is 10, but the code checks if only 7 are passed in. Signed-off-by: Li Hong LKML-Reference: <20091027065733.GB22032@uhli> Signed-off-by: Steven Rostedt --- scripts/recordmcount.pl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/recordmcount.pl b/scripts/recordmcount.pl index a569be72f3b..a512af1514b 100755 --- a/scripts/recordmcount.pl +++ b/scripts/recordmcount.pl @@ -113,7 +113,7 @@ $P =~ s@.*/@@g; my $V = '0.1'; -if ($#ARGV < 7) { +if ($#ARGV != 10) { print "usage: $P arch bits objdump objcopy cc ld nm rm mv is_module inputfile\n"; print "version: $V\n"; exit(1); From bdd3b052c63b2c19a0118937f500985c01a19956 Mon Sep 17 00:00:00 2001 From: Li Hong Date: Wed, 28 Oct 2009 13:03:32 +0800 Subject: [PATCH 22/40] tracing: Check absolute path of input file in recordmcount.pl The ftrace.c file may reference the mcount function and this may interfere with the recordmcount.pl processing. To avoid this, the code does not process the kernel/trace/ftrace.o. But currently the check is against a relative path. This patch modifies the check to succeed if the path is an absolute path. Signed-off-by: Li Hong LKML-Reference: <20091028050332.GC30758@uhli> Signed-off-by: Steven Rostedt --- scripts/recordmcount.pl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/recordmcount.pl b/scripts/recordmcount.pl index a512af1514b..b80e5d04416 100755 --- a/scripts/recordmcount.pl +++ b/scripts/recordmcount.pl @@ -123,7 +123,7 @@ my ($arch, $bits, $objdump, $objcopy, $cc, $ld, $nm, $rm, $mv, $is_module, $inputfile) = @ARGV; # This file refers to mcount and shouldn't be ftraced, so lets' ignore it -if ($inputfile eq "kernel/trace/ftrace.o") { +if ($inputfile =~ m,kernel/trace/ftrace\.o$,) { exit(0); } From 7b7edc27683e20624f4daf17c76041719184201c Mon Sep 17 00:00:00 2001 From: Li Hong Date: Wed, 28 Oct 2009 13:04:21 +0800 Subject: [PATCH 23/40] tracing: Fix objcopy revision check in recordmcount.pl The current logic to check objcopy's version is incorrect. This patch fixes the algorithm and disables the use of local functions as a reference if the objcopy version does not support static to global conversions. Also remove some usused variables. Signed-off-by: Li Hong LKML-Reference: <20091028050421.GD30758@uhli> Signed-off-by: Steven Rostedt --- scripts/recordmcount.pl | 56 ++++++++++++++++++++--------------------- 1 file changed, 27 insertions(+), 29 deletions(-) diff --git a/scripts/recordmcount.pl b/scripts/recordmcount.pl index b80e5d04416..d6199fc4870 100755 --- a/scripts/recordmcount.pl +++ b/scripts/recordmcount.pl @@ -159,6 +159,31 @@ my $function_regex; # Find the name of a function my $mcount_regex; # Find the call site to mcount (return offset) my $alignment; # The .align value to use for $mcount_section my $section_type; # Section header plus possible alignment command +my $can_use_local = 0; # If we can use local function references + +## +# check_objcopy - whether objcopy supports --globalize-symbols +# +# --globalize-symbols came out in 2.17, we must test the version +# of objcopy, and if it is less than 2.17, then we can not +# record local functions. +sub check_objcopy +{ + open (IN, "$objcopy --version |") or die "error running $objcopy"; + while () { + if (/objcopy.*\s(\d+)\.(\d+)/) { + $can_use_local = 1 if ($1 > 2 || ($1 == 2 && $2 >= 17)); + last; + } + } + close (IN); + + if (!$can_use_local) { + print STDERR "WARNING: could not find objcopy version or version " . + "is less than 2.17.\n" . + "\tLocal function references is disabled.\n"; + } +} if ($arch eq "x86") { if ($bits == 64) { @@ -293,34 +318,7 @@ if ($filename =~ m,^(.*)(\.\S),) { my $mcount_s = $dirname . "/.tmp_mc_" . $prefix . ".s"; my $mcount_o = $dirname . "/.tmp_mc_" . $prefix . ".o"; -# -# --globalize-symbols came out in 2.17, we must test the version -# of objcopy, and if it is less than 2.17, then we can not -# record local functions. -my $use_locals = 01; -my $local_warn_once = 0; -my $found_version = 0; - -open (IN, "$objcopy --version |") || die "error running $objcopy"; -while () { - if (/objcopy.*\s(\d+)\.(\d+)/) { - my $major = $1; - my $minor = $2; - - $found_version = 1; - if ($major < 2 || - ($major == 2 && $minor < 17)) { - $use_locals = 0; - } - last; - } -} -close (IN); - -if (!$found_version) { - print STDERR "WARNING: could not find objcopy version.\n" . - "\tDisabling local function references.\n"; -} +check_objcopy(); # # Step 1: find all the local (static functions) and weak symbols. @@ -367,7 +365,7 @@ sub update_funcs if (defined $locals{$ref_func}) { # only use locals if objcopy supports globalize-symbols - if (!$use_locals) { + if (!$can_use_local) { return; } $convert{$ref_func} = 1; From db24c7dcf42f78629d89b34e5d5a98ed56ea2ff5 Mon Sep 17 00:00:00 2001 From: Li Hong Date: Wed, 28 Oct 2009 13:05:23 +0800 Subject: [PATCH 24/40] tracing: Move mcount section search to front of loop in recordmcount.pl Move the mcount section check to the beginning of the objdump read loop. This makes the code easier to follow since the search for the mcount section is performed first before the mcount callers are processed. Signed-off-by: Li Hong LKML-Reference: <20091028050523.GE30758@uhli> Signed-off-by: Steven Rostedt --- scripts/recordmcount.pl | 32 ++++++++++++++++++-------------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/scripts/recordmcount.pl b/scripts/recordmcount.pl index d6199fc4870..02c80552b07 100755 --- a/scripts/recordmcount.pl +++ b/scripts/recordmcount.pl @@ -391,9 +391,27 @@ open(IN, "$objdump -hdr $inputfile|") || die "error running $objdump"; my $text; + +# read headers first my $read_headers = 1; while () { + + if ($read_headers && /$mcount_section/) { + # + # Somehow the make process can execute this script on an + # object twice. If it does, we would duplicate the mcount + # section and it will cause the function tracer self test + # to fail. Check if the mcount section exists, and if it does, + # warn and exit. + # + print STDERR "ERROR: $mcount_section already in $inputfile\n" . + "\tThis may be an indication that your build is corrupted.\n" . + "\tDelete $inputfile and try again. If the same object file\n" . + "\tstill causes an issue, then disable CONFIG_DYNAMIC_FTRACE.\n"; + exit(-1); + } + # is it a section? if (/$section_regex/) { $read_headers = 0; @@ -434,21 +452,7 @@ while () { $offset = hex $1; } } - } elsif ($read_headers && /$mcount_section/) { - # - # Somehow the make process can execute this script on an - # object twice. If it does, we would duplicate the mcount - # section and it will cause the function tracer self test - # to fail. Check if the mcount section exists, and if it does, - # warn and exit. - # - print STDERR "ERROR: $mcount_section already in $inputfile\n" . - "\tThis may be an indication that your build is corrupted.\n" . - "\tDelete $inputfile and try again. If the same object file\n" . - "\tstill causes an issue, then disable CONFIG_DYNAMIC_FTRACE.\n"; - exit(-1); } - # is this a call site to mcount? If so, record it to print later if ($text_found && /$mcount_regex/) { $offsets[$#offsets + 1] = hex $1; From 306dcf47d28aaf9aedfafb17a602768584cfc0f2 Mon Sep 17 00:00:00 2001 From: Li Hong Date: Wed, 28 Oct 2009 13:06:19 +0800 Subject: [PATCH 25/40] tracing: Add regex for weak functions in recordmcount.pl Add a variable to contain the regex needed to find weak functions in the 'nm' output. This will allow other archs to easily override it. Also rename the regex variable $nm_regex to $local_regex to be more descriptive. Signed-off-by: Li Hong LKML-Reference: <20091028050619.GF30758@uhli> Signed-off-by: Steven Rostedt --- scripts/recordmcount.pl | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/scripts/recordmcount.pl b/scripts/recordmcount.pl index 02c80552b07..7265a7dcac4 100755 --- a/scripts/recordmcount.pl +++ b/scripts/recordmcount.pl @@ -92,7 +92,7 @@ # # Here are the steps we take: # -# 1) Record all the local symbols by using 'nm' +# 1) Record all the local and weak symbols by using 'nm' # 2) Use objdump to find all the call site offsets and sections for # mcount. # 3) Compile the list into its own object. @@ -152,7 +152,8 @@ my %weak; # List of weak functions my %convert; # List of local functions used that needs conversion my $type; -my $nm_regex; # Find the local functions (return function) +my $local_regex; # Match a local function (return function) +my $weak_regex; # Match a weak function (return function) my $section_regex; # Find the start of a section my $function_regex; # Find the name of a function # (return offset and func name) @@ -197,7 +198,8 @@ if ($arch eq "x86") { # We base the defaults off of i386, the other archs may # feel free to change them in the below if statements. # -$nm_regex = "^[0-9a-fA-F]+\\s+t\\s+(\\S+)"; +$local_regex = "^[0-9a-fA-F]+\\s+t\\s+(\\S+)"; +$weak_regex = "^[0-9a-fA-F]+\\s+([wW])\\s+(\\S+)"; $section_regex = "Disassembly of section\\s+(\\S+):"; $function_regex = "^([0-9a-fA-F]+)\\s+<(.*?)>:"; $mcount_regex = "^\\s*([0-9a-fA-F]+):.*\\smcount\$"; @@ -246,7 +248,7 @@ if ($arch eq "x86_64") { $cc .= " -m32"; } elsif ($arch eq "powerpc") { - $nm_regex = "^[0-9a-fA-F]+\\s+t\\s+(\\.?\\S+)"; + $local_regex = "^[0-9a-fA-F]+\\s+t\\s+(\\.?\\S+)"; $function_regex = "^([0-9a-fA-F]+)\\s+<(\\.?.*?)>:"; $mcount_regex = "^\\s*([0-9a-fA-F]+):.*\\s\\.?_mcount\$"; @@ -322,13 +324,13 @@ check_objcopy(); # # Step 1: find all the local (static functions) and weak symbols. -# 't' is local, 'w/W' is weak (we never use a weak function) +# 't' is local, 'w/W' is weak # open (IN, "$nm $inputfile|") || die "error running $nm"; while () { - if (/$nm_regex/) { + if (/$local_regex/) { $locals{$1} = 1; - } elsif (/^[0-9a-fA-F]+\s+([wW])\s+(\S+)/) { + } elsif (/$weak_regex/) { $weak{$2} = $1; } } From 6092858c60f168c1950f8ad73880d54271696ec5 Mon Sep 17 00:00:00 2001 From: Li Hong Date: Wed, 28 Oct 2009 13:07:03 +0800 Subject: [PATCH 26/40] tracing: Move conditional into update_funcs() in recordmcount.pl Move all the condition validations into the function update_funcs(). Also update_funcs should not die if $ref_func is undefined for there may be more than one valid section in an object file. Signed-off-by: Li Hong LKML-Reference: <20091028050703.GG30758@uhli> Signed-off-by: Steven Rostedt --- scripts/recordmcount.pl | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/scripts/recordmcount.pl b/scripts/recordmcount.pl index 7265a7dcac4..884776a6e01 100755 --- a/scripts/recordmcount.pl +++ b/scripts/recordmcount.pl @@ -348,9 +348,7 @@ my $offset = 0; # offset of ref_func to section beginning # sub update_funcs { - return if ($#offsets < 0); - - defined($ref_func) || die "No function to reference"; + return unless ($ref_func and @offsets); # A section only had a weak function, to represent it. # Unfortunately, a weak function may be overwritten by another @@ -425,7 +423,7 @@ while () { $read_function = 0; } # print out any recorded offsets - update_funcs() if (defined($ref_func)); + update_funcs(); # reset all markers and arrays $text_found = 0; @@ -462,7 +460,7 @@ while () { } # dump out anymore offsets that may have been found -update_funcs() if (defined($ref_func)); +update_funcs(); # If we did not find any mcount callers, we are done (do nothing). if (!$opened) { From c4b8ac2c1aee1398b9378b8730bac56294b3410b Mon Sep 17 00:00:00 2001 From: Li Hong Date: Wed, 28 Oct 2009 13:07:43 +0800 Subject: [PATCH 27/40] tracing: Exit with error if a weak function is used in recordmcount.pl If a weak function is used as a relocation reference for mcount callers and that function is overridden, it will cause ftrace to fail at run time. The current code should prevent a weak function from being used, but if one is, the code should exit with an error to fail at compile time. Signed-off-by: Li Hong LKML-Reference: <20091028050743.GH30758@uhli> Signed-off-by: Steven Rostedt --- scripts/recordmcount.pl | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/scripts/recordmcount.pl b/scripts/recordmcount.pl index 884776a6e01..a4e2435d482 100755 --- a/scripts/recordmcount.pl +++ b/scripts/recordmcount.pl @@ -350,15 +350,11 @@ sub update_funcs { return unless ($ref_func and @offsets); - # A section only had a weak function, to represent it. - # Unfortunately, a weak function may be overwritten by another - # function of the same name, making all these offsets incorrect. - # To be safe, we simply print a warning and bail. + # Sanity check on weak function. A weak function may be overwritten by + # another function of the same name, making all these offsets incorrect. if (defined $weak{$ref_func}) { - print STDERR - "$inputfile: WARNING: referencing weak function" . + die "$inputfile: ERROR: referencing weak function" . " $ref_func for mcount\n"; - return; } # is this function static? If so, note this fact. From 7b2a35132ad0a70902dcd2844c27ed64cda0ce9b Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 2 Nov 2009 08:50:52 +0800 Subject: [PATCH 28/40] compiler: Introduce __always_unused I wrote some code which is used as compile-time checker, and the code should be elided after compile. So I need to annotate the code as "always unused", compared to "maybe unused". Signed-off-by: Li Zefan Cc: Steven Rostedt Cc: Frederic Weisbecker Cc: Linus Torvalds LKML-Reference: <4AEE2CEC.8040206@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- include/linux/compiler-gcc.h | 1 + include/linux/compiler.h | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index a3ed7cb8ca3..73dcf804bc9 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -79,6 +79,7 @@ #define noinline __attribute__((noinline)) #define __attribute_const__ __attribute__((__const__)) #define __maybe_unused __attribute__((unused)) +#define __always_unused __attribute__((unused)) #define __gcc_header(x) #x #define _gcc_header(x) __gcc_header(linux/compiler-gcc##x.h) diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 04fb5135b4e..7947f4f6fa5 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -213,6 +213,10 @@ void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect); # define __maybe_unused /* unimplemented */ #endif +#ifndef __always_unused +# define __always_unused /* unimplemented */ +#endif + #ifndef noinline #define noinline #endif From 5e9b397292ca0b9409dced33e3a22ec993377064 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 2 Nov 2009 08:51:13 +0800 Subject: [PATCH 29/40] tracing: Fix to use __always_unused attribute ____ftrace_check_##name() is used for compile-time check on F_printk() only, so it should be marked as __unused instead of __used. Signed-off-by: Li Zefan Cc: Steven Rostedt Cc: Frederic Weisbecker Cc: Linus Torvalds LKML-Reference: <4AEE2D01.4010305@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_export.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index 9753fcc61bc..c74848ddb85 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -48,11 +48,11 @@ struct ____ftrace_##name { \ tstruct \ }; \ -static void __used ____ftrace_check_##name(void) \ +static void __always_unused ____ftrace_check_##name(void) \ { \ struct ____ftrace_##name *__entry = NULL; \ \ - /* force cmpile-time check on F_printk() */ \ + /* force compile-time check on F_printk() */ \ printk(print); \ } From 4dae560f97fa438f373b53e14b30149c9e44a600 Mon Sep 17 00:00:00 2001 From: Ananth N Mavinakayanahalli Date: Fri, 30 Oct 2009 19:23:10 +0530 Subject: [PATCH 30/40] kprobes: Sanitize struct kretprobe_instance allocations For as long as kretprobes have existed, we've allocated NR_CPUS instances of kretprobe_instance structures. With the default value of CONFIG_NR_CPUS increasing on certain architectures, we are potentially wasting kernel memory. See http://sourceware.org/bugzilla/show_bug.cgi?id=10839#c3 for more details. Use a saner num_possible_cpus() instead of NR_CPUS for allocation. Signed-off-by: Ananth N Mavinakayanahalli Acked-by: Masami Hiramatsu Cc: Jim Keniston Cc: fweisbec@gmail.com LKML-Reference: <20091030135310.GA22230@in.ibm.com> Signed-off-by: Ingo Molnar --- kernel/kprobes.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 5240d75f4c6..1494e85b35f 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1014,9 +1014,9 @@ int __kprobes register_kretprobe(struct kretprobe *rp) /* Pre-allocate memory for max kretprobe instances */ if (rp->maxactive <= 0) { #ifdef CONFIG_PREEMPT - rp->maxactive = max(10, 2 * NR_CPUS); + rp->maxactive = max(10, 2 * num_possible_cpus()); #else - rp->maxactive = NR_CPUS; + rp->maxactive = num_possible_cpus(); #endif } spin_lock_init(&rp->lock); From a6f0eb6adc42e5eed3f35af99c61c0e411b16f8e Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 11 Nov 2009 17:14:07 -0500 Subject: [PATCH 31/40] ring-buffer: Add multiple iterations between benchmark timestamps The ring_buffer_benchmark does a gettimeofday after every write to the ring buffer in its measurements. This adds the overhead of the call to gettimeofday to the measurements and does not give an accurate picture of the length of time it takes to record a trace. This was first noticed with perf top: ------------------------------------------------------------------------------ PerfTop: 679 irqs/sec kernel:99.9% [1000Hz cpu-clock-msecs], (all, 4 CPUs) ------------------------------------------------------------------------------ samples pcnt kernel function _______ _____ _______________ 1673.00 - 27.8% : trace_clock_local 806.00 - 13.4% : do_gettimeofday 590.00 - 9.8% : rb_reserve_next_event 554.00 - 9.2% : native_read_tsc 431.00 - 7.2% : ring_buffer_lock_reserve 365.00 - 6.1% : __rb_reserve_next 355.00 - 5.9% : rb_end_commit 322.00 - 5.4% : getnstimeofday 268.00 - 4.5% : ring_buffer_unlock_commit 262.00 - 4.4% : ring_buffer_producer_thread [ring_buffer_benchmark] 113.00 - 1.9% : read_tsc 91.00 - 1.5% : debug_smp_processor_id 69.00 - 1.1% : trace_recursive_unlock 66.00 - 1.1% : ring_buffer_event_data 25.00 - 0.4% : _spin_unlock_irq And the length of each write to the ring buffer measured at 310ns. This patch adds a new module parameter called "write_interval" which is defaulted to 50. This is the number of writes performed between timestamps. After this patch perf top shows: ------------------------------------------------------------------------------ PerfTop: 244 irqs/sec kernel:100.0% [1000Hz cpu-clock-msecs], (all, 4 CPUs) ------------------------------------------------------------------------------ samples pcnt kernel function _______ _____ _______________ 2842.00 - 40.4% : trace_clock_local 1043.00 - 14.8% : rb_reserve_next_event 784.00 - 11.1% : ring_buffer_lock_reserve 600.00 - 8.5% : __rb_reserve_next 579.00 - 8.2% : rb_end_commit 440.00 - 6.3% : ring_buffer_unlock_commit 290.00 - 4.1% : ring_buffer_producer_thread [ring_buffer_benchmark] 155.00 - 2.2% : debug_smp_processor_id 117.00 - 1.7% : trace_recursive_unlock 103.00 - 1.5% : ring_buffer_event_data 28.00 - 0.4% : do_gettimeofday 22.00 - 0.3% : _spin_unlock_irq 14.00 - 0.2% : native_read_tsc 11.00 - 0.2% : getnstimeofday do_gettimeofday dropped from 13% usage to a mere 0.4%! (using the default 50 interval) The measurement for each timestamp went from 310ns to 210ns. That's 100ns (1/3rd) overhead that the gettimeofday call was introducing. Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer_benchmark.c | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c index 573d3cc762c..70df73e4ff2 100644 --- a/kernel/trace/ring_buffer_benchmark.c +++ b/kernel/trace/ring_buffer_benchmark.c @@ -35,6 +35,10 @@ static int disable_reader; module_param(disable_reader, uint, 0644); MODULE_PARM_DESC(disable_reader, "only run producer"); +static int write_iteration = 50; +module_param(write_iteration, uint, 0644); +MODULE_PARM_DESC(write_iteration, "# of writes between timestamp readings"); + static int read_events; static int kill_test; @@ -208,15 +212,18 @@ static void ring_buffer_producer(void) do { struct ring_buffer_event *event; int *entry; + int i; - event = ring_buffer_lock_reserve(buffer, 10); - if (!event) { - missed++; - } else { - hit++; - entry = ring_buffer_event_data(event); - *entry = smp_processor_id(); - ring_buffer_unlock_commit(buffer, event); + for (i = 0; i < write_iteration; i++) { + event = ring_buffer_lock_reserve(buffer, 10); + if (!event) { + missed++; + } else { + hit++; + entry = ring_buffer_event_data(event); + *entry = smp_processor_id(); + ring_buffer_unlock_commit(buffer, event); + } } do_gettimeofday(&end_tv); From 8b2a5dac7859dd1954095fce8b6445c3ceb36ef6 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 11 Nov 2009 19:36:03 -0500 Subject: [PATCH 32/40] tracing: do not disable interrupts for trace_clock_local Disabling interrupts in trace_clock_local takes quite a performance hit to the recording of traces. Using perf top we see: ------------------------------------------------------------------------------ PerfTop: 244 irqs/sec kernel:100.0% [1000Hz cpu-clock-msecs], (all, 4 CPUs) ------------------------------------------------------------------------------ samples pcnt kernel function _______ _____ _______________ 2842.00 - 40.4% : trace_clock_local 1043.00 - 14.8% : rb_reserve_next_event 784.00 - 11.1% : ring_buffer_lock_reserve 600.00 - 8.5% : __rb_reserve_next 579.00 - 8.2% : rb_end_commit 440.00 - 6.3% : ring_buffer_unlock_commit 290.00 - 4.1% : ring_buffer_producer_thread [ring_buffer_benchmark] 155.00 - 2.2% : debug_smp_processor_id 117.00 - 1.7% : trace_recursive_unlock 103.00 - 1.5% : ring_buffer_event_data 28.00 - 0.4% : do_gettimeofday 22.00 - 0.3% : _spin_unlock_irq 14.00 - 0.2% : native_read_tsc 11.00 - 0.2% : getnstimeofday Where trace_clock_local is 40% of the tracing, and the time for recording a trace according to ring_buffer_benchmark is 210ns. After converting the interrupts to preemption disabling we have from perf top: ------------------------------------------------------------------------------ PerfTop: 1084 irqs/sec kernel:99.9% [1000Hz cpu-clock-msecs], (all, 4 CPUs) ------------------------------------------------------------------------------ samples pcnt kernel function _______ _____ _______________ 1277.00 - 16.8% : native_read_tsc 1148.00 - 15.1% : rb_reserve_next_event 896.00 - 11.8% : ring_buffer_lock_reserve 688.00 - 9.1% : __rb_reserve_next 664.00 - 8.8% : rb_end_commit 563.00 - 7.4% : ring_buffer_unlock_commit 508.00 - 6.7% : _spin_unlock_irq 365.00 - 4.8% : debug_smp_processor_id 321.00 - 4.2% : trace_clock_local 303.00 - 4.0% : ring_buffer_producer_thread [ring_buffer_benchmark] 273.00 - 3.6% : native_sched_clock 122.00 - 1.6% : trace_recursive_unlock 113.00 - 1.5% : sched_clock 101.00 - 1.3% : ring_buffer_event_data 53.00 - 0.7% : tick_nohz_stop_sched_tick Where trace_clock_local drops from 40% to only taking 4% of the total time. The trace time also goes from 210ns down to 179ns (31ns). I talked with Peter Zijlstra about the impact that sched_clock may have without having interrupts disabled, and he told me that if a timer interrupt comes in, sched_clock may report a wrong time. Balancing a seldom incorrect timestamp with a 15% performance boost, I'll take the performance boost. Acked-by: Peter Zijlstra Signed-off-by: Steven Rostedt --- kernel/trace/trace_clock.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c index 20c5f92e28a..878c03f386b 100644 --- a/kernel/trace/trace_clock.c +++ b/kernel/trace/trace_clock.c @@ -20,6 +20,8 @@ #include #include +#include "trace.h" + /* * trace_clock_local(): the simplest and least coherent tracing clock. * @@ -28,17 +30,17 @@ */ u64 notrace trace_clock_local(void) { - unsigned long flags; u64 clock; + int resched; /* * sched_clock() is an architecture implemented, fast, scalable, * lockless clock. It is not guaranteed to be coherent across * CPUs, nor across CPU idle events. */ - raw_local_irq_save(flags); + resched = ftrace_preempt_disable(); clock = sched_clock(); - raw_local_irq_restore(flags); + ftrace_preempt_enable(resched); return clock; } From 5a50e33cc916f6a81cb96f0f24f6a88c9ab78b79 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 17 Nov 2009 08:43:01 -0500 Subject: [PATCH 33/40] ring-buffer: Move access to commit_page up into function used With the change of the way we process commits. Where a commit only happens at the outer most level, and that we don't need to worry about a commit ending after the rb_start_commit() has been called, the code use to grab the commit page before the tail page to prevent a possible race. But this race no longer exists with the rb_start_commit() rb_end_commit() interface. Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 3ffa502fb24..4b8293fa545 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1785,9 +1785,9 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer, static struct ring_buffer_event * rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, unsigned long length, unsigned long tail, - struct buffer_page *commit_page, struct buffer_page *tail_page, u64 *ts) { + struct buffer_page *commit_page = cpu_buffer->commit_page; struct ring_buffer *buffer = cpu_buffer->buffer; struct buffer_page *next_page; int ret; @@ -1890,13 +1890,10 @@ static struct ring_buffer_event * __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, unsigned type, unsigned long length, u64 *ts) { - struct buffer_page *tail_page, *commit_page; + struct buffer_page *tail_page; struct ring_buffer_event *event; unsigned long tail, write; - commit_page = cpu_buffer->commit_page; - /* we just need to protect against interrupts */ - barrier(); tail_page = cpu_buffer->tail_page; write = local_add_return(length, &tail_page->write); @@ -1907,7 +1904,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, /* See if we shot pass the end of this buffer page */ if (write > BUF_PAGE_SIZE) return rb_move_tail(cpu_buffer, length, tail, - commit_page, tail_page, ts); + tail_page, ts); /* We reserved something on the buffer */ From f6060f46819f313d34a8c8151390cda509c23389 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Thu, 5 Nov 2009 11:16:17 +0800 Subject: [PATCH 34/40] tracing: Prevent build warning: 'ftrace_graph_buf' defined but not used Prevent build warning when CONFIG_FUNCTION_GRAPH_TRACER is not set. Signed-off-by: Lai Jiangshan LKML-Reference: <4AF24381.5060307@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 1ed514fe3a3..7f9b51e8184 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2274,7 +2274,6 @@ void ftrace_set_notrace(unsigned char *buf, int len, int reset) #define FTRACE_FILTER_SIZE COMMAND_LINE_SIZE static char ftrace_notrace_buf[FTRACE_FILTER_SIZE] __initdata; static char ftrace_filter_buf[FTRACE_FILTER_SIZE] __initdata; -static char ftrace_graph_buf[FTRACE_FILTER_SIZE] __initdata; static int __init set_ftrace_notrace(char *str) { @@ -2291,6 +2290,7 @@ static int __init set_ftrace_filter(char *str) __setup("ftrace_filter=", set_ftrace_filter); #ifdef CONFIG_FUNCTION_GRAPH_TRACER +static char ftrace_graph_buf[FTRACE_FILTER_SIZE] __initdata; static int __init set_graph_function(char *str) { strlcpy(ftrace_graph_buf, str, FTRACE_FILTER_SIZE); From 638adb0561264a3360a53e93def62288c85d8373 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 17 Nov 2009 10:48:25 -0500 Subject: [PATCH 35/40] tracing: Only print objcopy version warning once from recordmcount If the user has an older version of objcopy, that can not handle converting local symbols to global and vice versa, then some functions will not be part of the dynamic function tracer. The current code in recordmcount.pl will print a warning in this case. Unfortunately, there exists lots of files that may have this issue with older objcopys and this will cause a warning for every file compiled with this issue. This patch solves this overwhelming output by creating a .tmp_quiet_recordmcount file on the first instance the warning is encountered. The warning will not print if this file exists. The temp file is deleted at the beginning of the compile to ensure that the warning will happen once again on new compiles (because the issue is still present). Reported-by: Andrew Morton Cc: Sam Ravnborg Signed-off-by: Steven Rostedt --- Makefile | 1 + scripts/recordmcount.pl | 12 ++++++++++-- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 9425d1de54c..1c949ac8ce6 100644 --- a/Makefile +++ b/Makefile @@ -379,6 +379,7 @@ export RCS_TAR_IGNORE := --exclude SCCS --exclude BitKeeper --exclude .svn --exc PHONY += scripts_basic scripts_basic: $(Q)$(MAKE) $(build)=scripts/basic + $(Q)rm -f .tmp_quiet_recordmcount # To avoid any implicit rule to kick in, define an empty command. scripts/basic/%: scripts_basic ; diff --git a/scripts/recordmcount.pl b/scripts/recordmcount.pl index a4e2435d482..f0d14452632 100755 --- a/scripts/recordmcount.pl +++ b/scripts/recordmcount.pl @@ -162,6 +162,11 @@ my $alignment; # The .align value to use for $mcount_section my $section_type; # Section header plus possible alignment command my $can_use_local = 0; # If we can use local function references +# Shut up recordmcount if user has older objcopy +my $quiet_recordmcount = ".tmp_quiet_recordmcount"; +my $print_warning = 1; +$print_warning = 0 if ( -f $quiet_recordmcount); + ## # check_objcopy - whether objcopy supports --globalize-symbols # @@ -179,10 +184,13 @@ sub check_objcopy } close (IN); - if (!$can_use_local) { + if (!$can_use_local && $print_warning) { print STDERR "WARNING: could not find objcopy version or version " . "is less than 2.17.\n" . - "\tLocal function references is disabled.\n"; + "\tLocal function references are disabled.\n"; + open (QUIET, ">$quiet_recordmcount"); + printf QUIET "Disables the warning from recordmcount.pl\n"; + close QUIET; } } From 192dcf1d1775736627280a5dd4cb0f605b21857a Mon Sep 17 00:00:00 2001 From: Josh Stone Date: Wed, 18 Nov 2009 13:06:55 -0800 Subject: [PATCH 36/40] tracing: Remove the stale include/trace/power.h Commit 6161352 moved the power tracing to include/trace/events/, but left the old header behind. No one is using the old header, and its declarations are now incorrect, so it should be removed. Signed-off-by: Josh Stone Acked-by: Arjan van de Ven Cc: Frank Ch. Eigler Cc: Peter Zijlstra Cc: Paul Mackerras Cc: Frederic Weisbecker LKML-Reference: <1258578415-14752-1-git-send-email-jistone@redhat.com> Signed-off-by: Ingo Molnar --- include/trace/power.h | 32 -------------------------------- 1 file changed, 32 deletions(-) delete mode 100644 include/trace/power.h diff --git a/include/trace/power.h b/include/trace/power.h deleted file mode 100644 index ef204666e98..00000000000 --- a/include/trace/power.h +++ /dev/null @@ -1,32 +0,0 @@ -#ifndef _TRACE_POWER_H -#define _TRACE_POWER_H - -#include -#include - -enum { - POWER_NONE = 0, - POWER_CSTATE = 1, - POWER_PSTATE = 2, -}; - -struct power_trace { - ktime_t stamp; - ktime_t end; - int type; - int state; -}; - -DECLARE_TRACE(power_start, - TP_PROTO(struct power_trace *it, unsigned int type, unsigned int state), - TP_ARGS(it, type, state)); - -DECLARE_TRACE(power_mark, - TP_PROTO(struct power_trace *it, unsigned int type, unsigned int state), - TP_ARGS(it, type, state)); - -DECLARE_TRACE(power_end, - TP_PROTO(struct power_trace *it), - TP_ARGS(it)); - -#endif /* _TRACE_POWER_H */ From 98e4833ba3c314c99dc364012fba6ac894230ad0 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 23 Nov 2009 08:03:09 +0100 Subject: [PATCH 37/40] ring-buffer benchmark: Run producer/consumer threads at nice +19 The ring-buffer benchmark threads run on nice 0 by default, using up a lot of CPU time and slowing down the system: PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND 1024 root 20 0 0 0 0 D 95.3 0.0 4:01.67 rb_producer 1023 root 20 0 0 0 0 R 93.5 0.0 2:54.33 rb_consumer 21569 mingo 40 0 14852 1048 772 R 3.6 0.1 0:00.05 top 1 root 40 0 4080 928 668 S 0.0 0.0 0:23.98 init Renice them to +19 to make them less intrusive. Cc: Steven Rostedt Cc: Frederic Weisbecker Cc: Peter Zijlstra Cc: Mike Galbraith LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/trace/ring_buffer_benchmark.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c index 70df73e4ff2..3875d49da99 100644 --- a/kernel/trace/ring_buffer_benchmark.c +++ b/kernel/trace/ring_buffer_benchmark.c @@ -399,6 +399,12 @@ static int __init ring_buffer_benchmark_init(void) if (IS_ERR(producer)) goto out_kill; + /* + * Run them as low-prio background tasks by default: + */ + set_user_nice(consumer, 19); + set_user_nice(producer, 19); + return 0; out_kill: From 457dc928f586f3f4b930206965e6db270034e97e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 23 Nov 2009 11:03:28 +0100 Subject: [PATCH 38/40] tracing, function tracer: Clean up strstrip() usage Clean up strstrip() usage - which also addresses this build warning: kernel/trace/ftrace.c: In function 'ftrace_pid_write': kernel/trace/ftrace.c:3004: warning: ignoring return value of 'strstrip', declared with attribute warn_unused_result Cc: Steven Rostedt Cc: Frederic Weisbecker Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/trace/ftrace.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 7f9b51e8184..1dc101d0976 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2985,7 +2985,7 @@ static ssize_t ftrace_pid_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { - char buf[64]; + char buf[64], *tmp; long val; int ret; @@ -3001,11 +3001,11 @@ ftrace_pid_write(struct file *filp, const char __user *ubuf, * Allow "echo > set_ftrace_pid" or "echo -n '' > set_ftrace_pid" * to clean the filter quietly. */ - strstrip(buf); - if (strlen(buf) == 0) + tmp = strstrip(buf); + if (strlen(tmp) == 0) return 1; - ret = strict_strtol(buf, 10, &val); + ret = strict_strtol(tmp, 10, &val); if (ret < 0) return ret; @@ -3391,4 +3391,3 @@ void ftrace_graph_stop(void) ftrace_stop(); } #endif - From 7ac074340480018681a0d72b324d4487543bdc0e Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 25 Nov 2009 13:22:21 -0500 Subject: [PATCH 39/40] ring-buffer-benchmark: Add parameters to set produce/consumer priorities Running the ring-buffer-benchmark's threads at the lowest priority may work well for keeping it in the background, but it is not appropriate for the benchmarks. This patch adds 4 parameters to the module: consumer_fifo consumer_nice producer_fifo producer_nice By default the consumer and producer still run at nice +19. If the *_fifo options are set, they will override the *_nice values. modprobe ring_buffer_benchmark consumer_nice=0 producer_fifo=10 The above will set the consumer thread to a nice value of 0, and the producer thread to a RT SCHED_FIFO priority of 10. Note, this patch also fixes a bug where calling set_user_nice on the consumer thread would oops the kernel when the parameter "disable_reader" is set. Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer_benchmark.c | 58 +++++++++++++++++++++++++++- 1 file changed, 56 insertions(+), 2 deletions(-) diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c index 3875d49da99..b2477caf09c 100644 --- a/kernel/trace/ring_buffer_benchmark.c +++ b/kernel/trace/ring_buffer_benchmark.c @@ -39,6 +39,24 @@ static int write_iteration = 50; module_param(write_iteration, uint, 0644); MODULE_PARM_DESC(write_iteration, "# of writes between timestamp readings"); +static int producer_nice = 19; +static int consumer_nice = 19; + +static int producer_fifo = -1; +static int consumer_fifo = -1; + +module_param(producer_nice, uint, 0644); +MODULE_PARM_DESC(producer_nice, "nice prio for producer"); + +module_param(consumer_nice, uint, 0644); +MODULE_PARM_DESC(consumer_nice, "nice prio for consumer"); + +module_param(producer_fifo, uint, 0644); +MODULE_PARM_DESC(producer_fifo, "fifo prio for producer"); + +module_param(consumer_fifo, uint, 0644); +MODULE_PARM_DESC(consumer_fifo, "fifo prio for consumer"); + static int read_events; static int kill_test; @@ -270,6 +288,27 @@ static void ring_buffer_producer(void) if (kill_test) trace_printk("ERROR!\n"); + + if (!disable_reader) { + if (consumer_fifo < 0) + trace_printk("Running Consumer at nice: %d\n", + consumer_nice); + else + trace_printk("Running Consumer at SCHED_FIFO %d\n", + consumer_fifo); + } + if (producer_fifo < 0) + trace_printk("Running Producer at nice: %d\n", + producer_nice); + else + trace_printk("Running Producer at SCHED_FIFO %d\n", + producer_fifo); + + /* Let the user know that the test is running at low priority */ + if (producer_fifo < 0 && consumer_fifo < 0 && + producer_nice == 19 && consumer_nice == 19) + trace_printk("WARNING!!! This test is running at lowest priority.\n"); + trace_printk("Time: %lld (usecs)\n", time); trace_printk("Overruns: %lld\n", overruns); if (disable_reader) @@ -402,8 +441,23 @@ static int __init ring_buffer_benchmark_init(void) /* * Run them as low-prio background tasks by default: */ - set_user_nice(consumer, 19); - set_user_nice(producer, 19); + if (!disable_reader) { + if (consumer_fifo >= 0) { + struct sched_param param = { + .sched_priority = consumer_fifo + }; + sched_setscheduler(consumer, SCHED_FIFO, ¶m); + } else + set_user_nice(consumer, consumer_nice); + } + + if (producer_fifo >= 0) { + struct sched_param param = { + .sched_priority = consumer_fifo + }; + sched_setscheduler(producer, SCHED_FIFO, ¶m); + } else + set_user_nice(producer, producer_nice); return 0; From b8007ef7422270864eae523cb38d7522a53a94d3 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 3 Nov 2009 13:45:32 +0800 Subject: [PATCH 40/40] tracing: Separate raw syscall from syscall tracer The current syscall tracer mixes raw syscalls and real syscalls. echo 1 > events/syscalls/enable And we get these from the output: (XXXX insteads " grep-20914 [001] 588211.446347" .. etc) XXXX: sys_read(fd: 3, buf: 80609a8, count: 7000) XXXX: sys_enter: NR 3 (3, 80609a8, 7000, a, 1000, bfce8ef8) XXXX: sys_read -> 0x138 XXXX: sys_exit: NR 3 = 312 XXXX: sys_read(fd: 3, buf: 8060ae0, count: 7000) XXXX: sys_enter: NR 3 (3, 8060ae0, 7000, a, 1000, bfce8ef8) XXXX: sys_read -> 0x138 XXXX: sys_exit: NR 3 = 312 There are 2 drawbacks here. A) two almost identical records are saved in ringbuffer when a syscall enters or exits. (4 records for every syscall) This wastes precious space in the ring buffer. B) the lines including "sys_enter/sys_exit" produces hardly any useful information for the output (no labels). The user can use this method to prevent these drawbacks: echo 1 > events/syscalls/enable echo 0 > events/syscalls/sys_enter/enable echo 0 > events/syscalls/sys_exit/enable But this is not user friendly. So we separate raw syscall from syscall tracer. After this fix applied: syscall tracer's output (echo 1 > events/syscalls/enable): XXXX: sys_read(fd: 3, buf: bfe87d88, count: 200) XXXX: sys_read -> 0x200 XXXX: sys_fstat64(fd: 3, statbuf: bfe87c98) XXXX: sys_fstat64 -> 0x0 XXXX: sys_close(fd: 3) raw syscall tracer's output (echo 1 > events/raw_syscalls/enable): XXXX: sys_enter: NR 175 (0, bf92bf18, bf92bf98, 8, b748cff4, bf92bef8) XXXX: sys_exit: NR 175 = 0 XXXX: sys_enter: NR 175 (2, bf92bf98, 0, 8, b748cff4, bf92bef8) XXXX: sys_exit: NR 175 = 0 XXXX: sys_enter: NR 3 (9, bf927f9c, 4000, b77e2518, b77dce60, bf92bff8) Signed-off-by: Lai Jiangshan LKML-Reference: <4AEFC37C.5080609@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- include/trace/events/syscalls.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/trace/events/syscalls.h b/include/trace/events/syscalls.h index 397dff2dbd5..fb726ac7cae 100644 --- a/include/trace/events/syscalls.h +++ b/include/trace/events/syscalls.h @@ -1,5 +1,6 @@ #undef TRACE_SYSTEM -#define TRACE_SYSTEM syscalls +#define TRACE_SYSTEM raw_syscalls +#define TRACE_INCLUDE_FILE syscalls #if !defined(_TRACE_EVENTS_SYSCALLS_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_EVENTS_SYSCALLS_H