Commit 0fc1b09

tracing: Use temp buffer when filtering events
Filtering of events requires the data to be written to the ring buffer before it can be decided to filter or not. This is because the parameters of the filter are based on the result that is written to the ring buffer and not on the parameters that are passed into the trace functions.

The ftrace ring buffer is optimized for writing into the ring buffer and committing. The discard procedure, used when the filter decides the event should be dropped, is much more heavyweight. Thus, using a temporary buffer when filtering events can speed things up drastically.

Without a temp buffer we have:

 # trace-cmd start -p nop
 # perf stat -r 10 hackbench 50
       0.790706626 seconds time elapsed ( +- 0.71% )

 # trace-cmd start -e all
 # perf stat -r 10 hackbench 50
       1.566904059 seconds time elapsed ( +- 0.27% )

 # trace-cmd start -e all -f 'common_preempt_count==20'
 # perf stat -r 10 hackbench 50
       1.690598511 seconds time elapsed ( +- 0.19% )

 # trace-cmd start -e all -f 'common_preempt_count!=20'
 # perf stat -r 10 hackbench 50
       1.707486364 seconds time elapsed ( +- 0.30% )

The first run above is without any tracing, just to get a base figure. hackbench takes ~0.79 seconds to run on the system.

The second run enables tracing of all events with nothing filtered. This increases the time by 100%: hackbench takes 1.57 seconds to run.

The third run filters all events where the preempt count equals "20" (which should never happen), so all events are discarded. This takes 1.69 seconds to run, which is 10% slower than just committing the events!

The last run enables all events with a filter that commits every event, and this takes 1.70 seconds to run. The filtering overhead is approximately 10%. Thus, the discard and the commit of an event from the ring buffer may take about the same time.

With this patch, the numbers change:

 # trace-cmd start -p nop
 # perf stat -r 10 hackbench 50
       0.778233033 seconds time elapsed ( +- 0.38% )

 # trace-cmd start -e all
 # perf stat -r 10 hackbench 50
       1.582102692 seconds time elapsed ( +- 0.28% )

 # trace-cmd start -e all -f 'common_preempt_count==20'
 # perf stat -r 10 hackbench 50
       1.309230710 seconds time elapsed ( +- 0.22% )

 # trace-cmd start -e all -f 'common_preempt_count!=20'
 # perf stat -r 10 hackbench 50
       1.786001924 seconds time elapsed ( +- 0.20% )

The first run is again the base with no tracing.

The second run is all tracing with no filtering. It is a little slower, but that may be well within the noise.

The third run shows that discarding all events only took 1.3 seconds. This is a speed up of 23%! The discard is much faster than even the commit.

The one downside is shown in the last run. Events that are not discarded by the filter take longer to add; this is due to the extra copy of the event.

Cc: Alexei Starovoitov <[email protected]>
Signed-off-by: Steven Rostedt <[email protected]>
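The mechanism the patch introduces can be modeled outside the kernel: stage the event data in a cheap scratch buffer, run the filter on the staged copy, and only copy the data into the real ring buffer if it survives; discarding then touches no ring-buffer state at all. The stand-alone C sketch below illustrates that flow under simplified assumptions (single-threaded, one scratch slot, a flat byte array standing in for the ring buffer); names such as event_ring, scratch_event and event_reserve() are invented for the illustration and are not kernel APIs.

#include <stdio.h>
#include <string.h>

#define RING_SIZE 1024

/* Toy stand-in for the real ring buffer: just a flat byte array. */
struct event_ring {
        char data[RING_SIZE];
        size_t used;
};

/* One scratch slot, standing in for the per-CPU page in the patch. */
static char scratch_event[128];

/* "Reserve" stages the event in the scratch slot; nothing in the ring is touched. */
static void *event_reserve(size_t len)
{
        return len <= sizeof(scratch_event) ? scratch_event : NULL;
}

/* "Commit" copies the staged data into the ring in one shot. */
static void event_commit(struct event_ring *ring, const void *ev, size_t len)
{
        if (ring->used + len <= RING_SIZE) {
                memcpy(ring->data + ring->used, ev, len);
                ring->used += len;
        }
}

/* "Discard" is a no-op: the ring buffer was never written to. */
static void event_discard(void)
{
}

int main(void)
{
        struct event_ring ring = { .used = 0 };
        const char *msgs[] = { "keep: open()", "drop: close()", "keep: read()" };

        for (int i = 0; i < 3; i++) {
                size_t len = strlen(msgs[i]) + 1;
                char *ev = event_reserve(len);

                if (!ev)
                        continue;
                memcpy(ev, msgs[i], len);
                /* The "filter": only events starting with "keep" are committed. */
                if (strncmp(ev, "keep", 4) == 0)
                        event_commit(&ring, ev, len);
                else
                        event_discard();
        }
        printf("ring buffer holds %zu bytes after filtering\n", ring.used);
        return 0;
}

In the real patch the scratch slot is the per-CPU page allocated by trace_buffered_event_enable(), the one-shot copy is done by ring_buffer_write(), and the discard is just a decrement of trace_buffered_event_cnt.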
1 parent dcb0b55 commit 0fc1b09

File tree

4 files changed: +185 -8 lines changed

kernel/trace/trace.c

Lines changed: 147 additions & 7 deletions

@@ -312,7 +312,7 @@ int call_filter_check_discard(struct trace_event_call *call, void *rec,
 {
        if (unlikely(call->flags & TRACE_EVENT_FL_FILTERED) &&
            !filter_match_preds(call->filter, rec)) {
-               ring_buffer_discard_commit(buffer, event);
+               __trace_event_discard_commit(buffer, event);
                return 1;
        }

@@ -1660,6 +1660,16 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
 }
 EXPORT_SYMBOL_GPL(tracing_generic_entry_update);

+static __always_inline void
+trace_event_setup(struct ring_buffer_event *event,
+                 int type, unsigned long flags, int pc)
+{
+       struct trace_entry *ent = ring_buffer_event_data(event);
+
+       tracing_generic_entry_update(ent, flags, pc);
+       ent->type = type;
+}
+
 struct ring_buffer_event *
 trace_buffer_lock_reserve(struct ring_buffer *buffer,
                          int type,
@@ -1669,21 +1679,136 @@ trace_buffer_lock_reserve(struct ring_buffer *buffer,
        struct ring_buffer_event *event;

        event = ring_buffer_lock_reserve(buffer, len);
-       if (event != NULL) {
-               struct trace_entry *ent = ring_buffer_event_data(event);
+       if (event != NULL)
+               trace_event_setup(event, type, flags, pc);
+
+       return event;
+}
+
+DEFINE_PER_CPU(struct ring_buffer_event *, trace_buffered_event);
+DEFINE_PER_CPU(int, trace_buffered_event_cnt);
+static int trace_buffered_event_ref;
+
+/**
+ * trace_buffered_event_enable - enable buffering events
+ *
+ * When events are being filtered, it is quicker to use a temporary
+ * buffer to write the event data into if there's a likely chance
+ * that it will not be committed. The discard of the ring buffer
+ * is not as fast as committing, and is much slower than copying
+ * a commit.
+ *
+ * When an event is to be filtered, allocate per cpu buffers to
+ * write the event data into, and if the event is filtered and discarded
+ * it is simply dropped, otherwise, the entire data is to be committed
+ * in one shot.
+ */
+void trace_buffered_event_enable(void)
+{
+       struct ring_buffer_event *event;
+       struct page *page;
+       int cpu;

-               tracing_generic_entry_update(ent, flags, pc);
-               ent->type = type;
+       WARN_ON_ONCE(!mutex_is_locked(&event_mutex));
+
+       if (trace_buffered_event_ref++)
+               return;
+
+       for_each_tracing_cpu(cpu) {
+               page = alloc_pages_node(cpu_to_node(cpu),
+                                       GFP_KERNEL | __GFP_NORETRY, 0);
+               if (!page)
+                       goto failed;
+
+               event = page_address(page);
+               memset(event, 0, sizeof(*event));
+
+               per_cpu(trace_buffered_event, cpu) = event;
+
+               preempt_disable();
+               if (cpu == smp_processor_id() &&
+                   this_cpu_read(trace_buffered_event) !=
+                   per_cpu(trace_buffered_event, cpu))
+                       WARN_ON_ONCE(1);
+               preempt_enable();
        }

-       return event;
+       return;
+ failed:
+       trace_buffered_event_disable();
+}
+
+static void enable_trace_buffered_event(void *data)
+{
+       /* Probably not needed, but do it anyway */
+       smp_rmb();
+       this_cpu_dec(trace_buffered_event_cnt);
+}
+
+static void disable_trace_buffered_event(void *data)
+{
+       this_cpu_inc(trace_buffered_event_cnt);
+}
+
+/**
+ * trace_buffered_event_disable - disable buffering events
+ *
+ * When a filter is removed, it is faster to not use the buffered
+ * events, and to commit directly into the ring buffer. Free up
+ * the temp buffers when there are no more users. This requires
+ * special synchronization with current events.
+ */
+void trace_buffered_event_disable(void)
+{
+       int cpu;
+
+       WARN_ON_ONCE(!mutex_is_locked(&event_mutex));
+
+       if (WARN_ON_ONCE(!trace_buffered_event_ref))
+               return;
+
+       if (--trace_buffered_event_ref)
+               return;
+
+       preempt_disable();
+       /* For each CPU, set the buffer as used. */
+       smp_call_function_many(tracing_buffer_mask,
+                              disable_trace_buffered_event, NULL, 1);
+       preempt_enable();
+
+       /* Wait for all current users to finish */
+       synchronize_sched();
+
+       for_each_tracing_cpu(cpu) {
+               free_page((unsigned long)per_cpu(trace_buffered_event, cpu));
+               per_cpu(trace_buffered_event, cpu) = NULL;
+       }
+       /*
+        * Make sure trace_buffered_event is NULL before clearing
+        * trace_buffered_event_cnt.
+        */
+       smp_wmb();
+
+       preempt_disable();
+       /* Do the work on each cpu */
+       smp_call_function_many(tracing_buffer_mask,
+                              enable_trace_buffered_event, NULL, 1);
+       preempt_enable();
 }

 void
 __buffer_unlock_commit(struct ring_buffer *buffer, struct ring_buffer_event *event)
 {
        __this_cpu_write(trace_cmdline_save, true);
-       ring_buffer_unlock_commit(buffer, event);
+
+       /* If this is the temp buffer, we need to commit fully */
+       if (this_cpu_read(trace_buffered_event) == event) {
+               /* Length is in event->array[0] */
+               ring_buffer_write(buffer, event->array[0], &event->array[1]);
+               /* Release the temp buffer */
+               this_cpu_dec(trace_buffered_event_cnt);
+       } else
+               ring_buffer_unlock_commit(buffer, event);
 }

 static struct ring_buffer *temp_buffer;
@@ -1695,8 +1820,23 @@ trace_event_buffer_lock_reserve(struct ring_buffer **current_rb,
                          unsigned long flags, int pc)
 {
        struct ring_buffer_event *entry;
+       int val;

        *current_rb = trace_file->tr->trace_buffer.buffer;
+
+       if ((trace_file->flags &
+            (EVENT_FILE_FL_SOFT_DISABLED | EVENT_FILE_FL_FILTERED)) &&
+           (entry = this_cpu_read(trace_buffered_event))) {
+               /* Try to use the per cpu buffer first */
+               val = this_cpu_inc_return(trace_buffered_event_cnt);
+               if (val == 1) {
+                       trace_event_setup(entry, type, flags, pc);
+                       entry->array[0] = len;
+                       return entry;
+               }
+               this_cpu_dec(trace_buffered_event_cnt);
+       }
+
        entry = trace_buffer_lock_reserve(*current_rb,
                                          type, len, flags, pc);
        /*

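One subtlety in trace_event_buffer_lock_reserve() above: the temp buffer is a single page per CPU, so the val == 1 check hands it only to the first, non-nested user; an event generated while the page is already in use (for instance from an interrupt) backs off and reserves directly from the ring buffer. A stand-alone sketch of that guard, with invented names and no real per-CPU machinery, could look like this:

#include <stdio.h>

/* Stand-ins for trace_buffered_event_cnt and the per-CPU scratch page. */
static int buffered_event_cnt;
static char scratch_page[4096];

/* Hand the scratch page only to the first (non-nested) user. */
static char *try_get_scratch(void)
{
        if (++buffered_event_cnt == 1)
                return scratch_page;
        /* Nested use: back off so the caller reserves from the ring buffer. */
        buffered_event_cnt--;
        return NULL;
}

/* Release the scratch page once the event is committed or discarded. */
static void put_scratch(void)
{
        buffered_event_cnt--;
}

int main(void)
{
        char *outer = try_get_scratch();   /* first event in this context */
        char *nested = try_get_scratch();  /* event arriving while outer is live */

        printf("outer got scratch: %s, nested got scratch: %s\n",
               outer ? "yes" : "no", nested ? "yes" : "no");
        put_scratch();
        return 0;
}

The same counter doubles as the synchronization point for trace_buffered_event_disable(): bumping it on every CPU via smp_call_function_many() marks the pages as busy so no new user can grab them while they are being freed.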
kernel/trace/trace.h

Lines changed: 18 additions & 1 deletion

@@ -1083,6 +1083,23 @@ static inline void trace_buffer_unlock_commit(struct trace_array *tr,
        trace_buffer_unlock_commit_regs(tr, buffer, event, flags, pc, NULL);
 }

+DECLARE_PER_CPU(struct ring_buffer_event *, trace_buffered_event);
+DECLARE_PER_CPU(int, trace_buffered_event_cnt);
+void trace_buffered_event_disable(void);
+void trace_buffered_event_enable(void);
+
+static inline void
+__trace_event_discard_commit(struct ring_buffer *buffer,
+                            struct ring_buffer_event *event)
+{
+       if (this_cpu_read(trace_buffered_event) == event) {
+               /* Simply release the temp buffer */
+               this_cpu_dec(trace_buffered_event_cnt);
+               return;
+       }
+       ring_buffer_discard_commit(buffer, event);
+}
+
 /*
  * Helper function for event_trigger_unlock_commit{_regs}().
  * If there are event triggers attached to this event that requires
@@ -1111,7 +1128,7 @@ __event_trigger_test_discard(struct trace_event_file *file,
        if (test_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags) ||
            (unlikely(file->flags & EVENT_FILE_FL_FILTERED) &&
             !filter_match_preds(file->filter, entry))) {
-               ring_buffer_discard_commit(buffer, event);
+               __trace_event_discard_commit(buffer, event);
                return true;
        }

kernel/trace/trace_events.c

Lines changed: 10 additions & 0 deletions

@@ -363,6 +363,7 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file,
 {
        struct trace_event_call *call = file->event_call;
        struct trace_array *tr = file->tr;
+       unsigned long file_flags = file->flags;
        int ret = 0;
        int disable;

@@ -445,6 +446,15 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file,
                break;
        }

+       /* Enable or disable use of trace_buffered_event */
+       if ((file_flags & EVENT_FILE_FL_SOFT_DISABLED) !=
+           (file->flags & EVENT_FILE_FL_SOFT_DISABLED)) {
+               if (file->flags & EVENT_FILE_FL_SOFT_DISABLED)
+                       trace_buffered_event_enable();
+               else
+                       trace_buffered_event_disable();
+       }
+
        return ret;
 }

kernel/trace/trace_events_filter.c

Lines changed: 10 additions & 0 deletions

@@ -823,7 +823,12 @@ static void __free_preds(struct event_filter *filter)

 static void filter_disable(struct trace_event_file *file)
 {
+       unsigned long old_flags = file->flags;
+
        file->flags &= ~EVENT_FILE_FL_FILTERED;
+
+       if (old_flags != file->flags)
+               trace_buffered_event_disable();
 }

 static void __free_filter(struct event_filter *filter)
@@ -1698,7 +1703,12 @@ static int replace_preds(struct trace_event_call *call,

 static inline void event_set_filtered_flag(struct trace_event_file *file)
 {
+       unsigned long old_flags = file->flags;
+
        file->flags |= EVENT_FILE_FL_FILTERED;
+
+       if (old_flags != file->flags)
+               trace_buffered_event_enable();
 }

 static inline void event_set_filter(struct trace_event_file *file,

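The enable/disable hooks in this file pair up with the reference count (trace_buffered_event_ref) in trace.c: the per-CPU pages are allocated when the first event gains a filter and freed only when the last one drops it, and the old_flags comparisons ensure each file contributes at most one reference. A minimal stand-alone model of that refcounted enable/disable pattern, with invented names:

#include <stdio.h>
#include <stdlib.h>

static int scratch_ref;   /* models trace_buffered_event_ref */
static char *scratch;     /* models the per-CPU scratch pages */

static void scratch_enable(void)
{
        if (scratch_ref++)              /* already allocated for another user */
                return;
        scratch = malloc(4096);         /* first user: allocate */
        printf("allocated scratch\n");
}

static void scratch_disable(void)
{
        if (--scratch_ref)              /* other users still need it */
                return;
        free(scratch);                  /* last user gone: free */
        scratch = NULL;
        printf("freed scratch\n");
}

int main(void)
{
        scratch_enable();   /* first event gets a filter -> allocate */
        scratch_enable();   /* second filtered event -> refcount only */
        scratch_disable();  /* one filter removed -> still in use */
        scratch_disable();  /* last filter removed -> free */
        return 0;
}

__ftrace_event_enable_disable() in trace_events.c takes and drops the same reference when an event's SOFT_DISABLED state changes, which is why both paths funnel through trace_buffered_event_enable()/disable().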