Skip to content

Commit f3f47a6

Browse files
Arjan van de Ven authored and Ingo Molnar committed
tracing: add "power-tracer": C/P state tracer to help power optimization
Impact: new "power-tracer" ftrace plugin

This patch adds a C/P-state ftrace plugin that will generate detailed statistics about the C/P-states that are being used, so that we can look at detailed decisions that the C/P-state code is making, rather than the too high level "average" that we have today.

An example way of using this is:

    mount -t debugfs none /sys/kernel/debug
    echo power > /sys/kernel/debug/tracing/current_tracer
    echo 1 > /sys/kernel/debug/tracing/tracing_enabled
    sleep 1
    echo 0 > /sys/kernel/debug/tracing/tracing_enabled
    cat /sys/kernel/debug/tracing/trace | perl scripts/trace/cstate.pl > out.svg

Signed-off-by: Arjan van de Ven <[email protected]>
Signed-off-by: Ingo Molnar <[email protected]>
1 parent 509dcee commit f3f47a6

File tree

8 files changed

+355
-0
lines changed

8 files changed

+355
-0
lines changed

arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@
3333
#include <linux/cpufreq.h>
3434
#include <linux/compiler.h>
3535
#include <linux/dmi.h>
36+
#include <linux/ftrace.h>
3637

3738
#include <linux/acpi.h>
3839
#include <acpi/processor.h>
@@ -391,6 +392,7 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy,
391392
unsigned int next_perf_state = 0; /* Index into perf table */
392393
unsigned int i;
393394
int result = 0;
395+
struct power_trace it;
394396

395397
dprintk("acpi_cpufreq_target %d (%d)\n", target_freq, policy->cpu);
396398

@@ -427,6 +429,8 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy,
427429
}
428430
}
429431

432+
trace_power_mark(&it, POWER_PSTATE, next_perf_state);
433+
430434
switch (data->cpu_feature) {
431435
case SYSTEM_INTEL_MSR_CAPABLE:
432436
cmd.type = SYSTEM_INTEL_MSR_CAPABLE;

arch/x86/kernel/process.c

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
#include <linux/module.h>
88
#include <linux/pm.h>
99
#include <linux/clockchips.h>
10+
#include <linux/ftrace.h>
1011
#include <asm/system.h>
1112

1213
unsigned long idle_halt;
@@ -100,6 +101,9 @@ static inline int hlt_use_halt(void)
100101
void default_idle(void)
101102
{
102103
if (hlt_use_halt()) {
104+
struct power_trace it;
105+
106+
trace_power_start(&it, POWER_CSTATE, 1);
103107
current_thread_info()->status &= ~TS_POLLING;
104108
/*
105109
* TS_POLLING-cleared state must be visible before we
@@ -112,6 +116,7 @@ void default_idle(void)
112116
else
113117
local_irq_enable();
114118
current_thread_info()->status |= TS_POLLING;
119+
trace_power_end(&it);
115120
} else {
116121
local_irq_enable();
117122
/* loop is done by the caller */
@@ -154,24 +159,31 @@ EXPORT_SYMBOL_GPL(cpu_idle_wait);
154159
*/
155160
void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
156161
{
162+
struct power_trace it;
163+
164+
trace_power_start(&it, POWER_CSTATE, (ax>>4)+1);
157165
if (!need_resched()) {
158166
__monitor((void *)&current_thread_info()->flags, 0, 0);
159167
smp_mb();
160168
if (!need_resched())
161169
__mwait(ax, cx);
162170
}
171+
trace_power_end(&it);
163172
}
164173

165174
/* Default MONITOR/MWAIT with no hints, used for default C1 state */
166175
static void mwait_idle(void)
167176
{
177+
struct power_trace it;
168178
if (!need_resched()) {
179+
trace_power_start(&it, POWER_CSTATE, 1);
169180
__monitor((void *)&current_thread_info()->flags, 0, 0);
170181
smp_mb();
171182
if (!need_resched())
172183
__sti_mwait(0, 0);
173184
else
174185
local_irq_enable();
186+
trace_power_end(&it);
175187
} else
176188
local_irq_enable();
177189
}
@@ -183,9 +195,13 @@ static void mwait_idle(void)
183195
*/
184196
static void poll_idle(void)
185197
{
198+
struct power_trace it;
199+
200+
trace_power_start(&it, POWER_CSTATE, 0);
186201
local_irq_enable();
187202
while (!need_resched())
188203
cpu_relax();
204+
trace_power_end(&it);
189205
}
190206

191207
/*

include/linux/ftrace.h

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -311,6 +311,35 @@ ftrace_init_module(struct module *mod,
311311
unsigned long *start, unsigned long *end) { }
312312
#endif
313313

314+
/* Kind of power-management event stored in power_trace.type. */
enum {
	POWER_NONE	= 0,	/* zero value: no event kind */
	POWER_CSTATE	= 1,	/* C-state event */
	POWER_PSTATE	= 2,	/* P-state event */
};
319+
320+
/*
 * One power event, filled in by the trace_power_*() hooks.
 * The structure has no members when CONFIG_POWER_TRACER is disabled.
 */
struct power_trace {
#ifdef CONFIG_POWER_TRACER
	ktime_t			stamp;	/* time the event started */
	ktime_t			end;	/* time the event ended */
	int			type;	/* one of the POWER_* constants */
	int			state;	/* C-state or P-state number */
#endif
};
328+
329+
/*
 * Power tracer hooks.  With CONFIG_POWER_TRACER disabled they are empty
 * inline functions, so call sites need no #ifdef of their own.
 */
#ifdef CONFIG_POWER_TRACER
extern void trace_power_start(struct power_trace *it, unsigned int type,
			      unsigned int state);
extern void trace_power_mark(struct power_trace *it, unsigned int type,
			     unsigned int state);
extern void trace_power_end(struct power_trace *it);
#else
static inline void trace_power_start(struct power_trace *it, unsigned int type,
				     unsigned int state)
{
}

static inline void trace_power_mark(struct power_trace *it, unsigned int type,
				    unsigned int state)
{
}

static inline void trace_power_end(struct power_trace *it)
{
}
#endif
342+
314343

315344
/*
316345
* Structure that defines a return function trace.

kernel/trace/Kconfig

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -217,6 +217,17 @@ config BRANCH_TRACER
217217

218218
Say N if unsure.
219219

220+
config POWER_TRACER
221+
bool "Trace power consumption behavior"
222+
depends on DEBUG_KERNEL
223+
depends on X86
224+
select TRACING
225+
help
226+
This tracer helps developers to analyze and optimize the kernels
227+
power management decisions, specifically the C-state and P-state
228+
behavior.
229+
230+
220231
config STACK_TRACER
221232
bool "Trace max stack"
222233
depends on HAVE_FUNCTION_TRACER

kernel/trace/Makefile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,5 +32,6 @@ obj-$(CONFIG_BOOT_TRACER) += trace_boot.o
3232
obj-$(CONFIG_FUNCTION_RET_TRACER) += trace_functions_return.o
3333
obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o
3434
obj-$(CONFIG_BTS_TRACER) += trace_bts.o
35+
obj-$(CONFIG_POWER_TRACER) += trace_power.o
3536

3637
libftrace-y := ftrace.o

kernel/trace/trace.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ enum trace_type {
2828
TRACE_FN_RET,
2929
TRACE_USER_STACK,
3030
TRACE_BTS,
31+
TRACE_POWER,
3132

3233
__TRACE_LAST_TYPE
3334
};
@@ -160,6 +161,11 @@ struct bts_entry {
160161
unsigned long to;
161162
};
162163

164+
struct trace_power {
165+
struct trace_entry ent;
166+
struct power_trace state_data;
167+
};
168+
163169
/*
164170
* trace_flag_type is an enumeration that holds different
165171
* states when a trace occurs. These are:
@@ -266,6 +272,7 @@ extern void __ftrace_bad_type(void);
266272
IF_ASSIGN(var, ent, struct trace_branch, TRACE_BRANCH); \
267273
IF_ASSIGN(var, ent, struct ftrace_ret_entry, TRACE_FN_RET);\
268274
IF_ASSIGN(var, ent, struct bts_entry, TRACE_BTS);\
275+
IF_ASSIGN(var, ent, struct trace_power, TRACE_POWER); \
269276
__ftrace_bad_type(); \
270277
} while (0)
271278

kernel/trace/trace_power.c

Lines changed: 179 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,179 @@
1+
/*
2+
 * ring buffer based C-state and P-state tracer
3+
*
4+
* Arjan van de Ven <[email protected]>
5+
* Copyright (C) 2008 Intel Corporation
6+
*
7+
* Much is borrowed from trace_boot.c which is
8+
* Copyright (C) 2008 Frederic Weisbecker <[email protected]>
9+
*
10+
*/
11+
12+
#include <linux/init.h>
13+
#include <linux/debugfs.h>
14+
#include <linux/ftrace.h>
15+
#include <linux/kallsyms.h>
16+
#include <linux/module.h>
17+
18+
#include "trace.h"
19+
20+
static struct trace_array *power_trace;
21+
static int __read_mostly trace_power_enabled;
22+
23+
24+
static void start_power_trace(struct trace_array *tr)
25+
{
26+
trace_power_enabled = 1;
27+
}
28+
29+
static void stop_power_trace(struct trace_array *tr)
30+
{
31+
trace_power_enabled = 0;
32+
}
33+
34+
35+
static int power_trace_init(struct trace_array *tr)
36+
{
37+
int cpu;
38+
power_trace = tr;
39+
40+
trace_power_enabled = 1;
41+
42+
for_each_cpu_mask(cpu, cpu_possible_map)
43+
tracing_reset(tr, cpu);
44+
return 0;
45+
}
46+
47+
static enum print_line_t power_print_line(struct trace_iterator *iter)
48+
{
49+
int ret = 0;
50+
struct trace_entry *entry = iter->ent;
51+
struct trace_power *field ;
52+
struct power_trace *it;
53+
struct trace_seq *s = &iter->seq;
54+
struct timespec stamp;
55+
struct timespec duration;
56+
57+
trace_assign_type(field, entry);
58+
it = &field->state_data;
59+
stamp = ktime_to_timespec(it->stamp);
60+
duration = ktime_to_timespec(ktime_sub(it->end, it->stamp));
61+
62+
if (entry->type == TRACE_POWER) {
63+
if (it->type == POWER_CSTATE)
64+
ret = trace_seq_printf(s, "[%5ld.%09ld] CSTATE: Going to C%i on cpu %i for %ld.%09ld\n",
65+
stamp.tv_sec,
66+
stamp.tv_nsec,
67+
it->state, iter->cpu,
68+
duration.tv_sec,
69+
duration.tv_nsec);
70+
if (it->type == POWER_PSTATE)
71+
ret = trace_seq_printf(s, "[%5ld.%09ld] PSTATE: Going to P%i on cpu %i\n",
72+
stamp.tv_sec,
73+
stamp.tv_nsec,
74+
it->state, iter->cpu);
75+
if (!ret)
76+
return TRACE_TYPE_PARTIAL_LINE;
77+
return TRACE_TYPE_HANDLED;
78+
}
79+
return TRACE_TYPE_UNHANDLED;
80+
}
81+
82+
/* Tracer description handed to register_tracer(); its name is "power". */
static struct tracer power_tracer __read_mostly = {
	.name		= "power",
	.init		= power_trace_init,
	.start		= start_power_trace,
	.stop		= stop_power_trace,
	.reset		= stop_power_trace,
	.print_line	= power_print_line,
};
91+
92+
static int init_power_trace(void)
93+
{
94+
return register_tracer(&power_tracer);
95+
}
96+
device_initcall(init_power_trace);
97+
98+
void trace_power_start(struct power_trace *it, unsigned int type,
99+
unsigned int level)
100+
{
101+
if (!trace_power_enabled)
102+
return;
103+
104+
memset(it, 0, sizeof(struct power_trace));
105+
it->state = level;
106+
it->type = type;
107+
it->stamp = ktime_get();
108+
}
109+
EXPORT_SYMBOL_GPL(trace_power_start);
110+
111+
112+
void trace_power_end(struct power_trace *it)
113+
{
114+
struct ring_buffer_event *event;
115+
struct trace_power *entry;
116+
struct trace_array_cpu *data;
117+
unsigned long irq_flags;
118+
struct trace_array *tr = power_trace;
119+
120+
if (!trace_power_enabled)
121+
return;
122+
123+
preempt_disable();
124+
it->end = ktime_get();
125+
data = tr->data[smp_processor_id()];
126+
127+
event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
128+
&irq_flags);
129+
if (!event)
130+
goto out;
131+
entry = ring_buffer_event_data(event);
132+
tracing_generic_entry_update(&entry->ent, 0, 0);
133+
entry->ent.type = TRACE_POWER;
134+
entry->state_data = *it;
135+
ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
136+
137+
trace_wake_up();
138+
139+
out:
140+
preempt_enable();
141+
}
142+
EXPORT_SYMBOL_GPL(trace_power_end);
143+
144+
void trace_power_mark(struct power_trace *it, unsigned int type,
145+
unsigned int level)
146+
{
147+
struct ring_buffer_event *event;
148+
struct trace_power *entry;
149+
struct trace_array_cpu *data;
150+
unsigned long irq_flags;
151+
struct trace_array *tr = power_trace;
152+
153+
if (!trace_power_enabled)
154+
return;
155+
156+
memset(it, 0, sizeof(struct power_trace));
157+
it->state = level;
158+
it->type = type;
159+
it->stamp = ktime_get();
160+
preempt_disable();
161+
it->end = it->stamp;
162+
data = tr->data[smp_processor_id()];
163+
164+
event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
165+
&irq_flags);
166+
if (!event)
167+
goto out;
168+
entry = ring_buffer_event_data(event);
169+
tracing_generic_entry_update(&entry->ent, 0, 0);
170+
entry->ent.type = TRACE_POWER;
171+
entry->state_data = *it;
172+
ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
173+
174+
trace_wake_up();
175+
176+
out:
177+
preempt_enable();
178+
}
179+
EXPORT_SYMBOL_GPL(trace_power_mark);

0 commit comments

Comments
 (0)