Commit c84897c

ring-buffer: Remove 32bit timestamp logic
Each event has a 27-bit timestamp delta that holds the delta from the last event. If the time between events is greater than 2^27, then an extra timestamp is added that holds a 59-bit absolute timestamp.

Until a389d86 ("ring-buffer: Have nested events still record running time stamp"), if an interrupt interrupted an event in progress, all the interrupting events would have a delta of zero, so as not to deal with the races that would otherwise need to be handled. Commit a389d86 changed that to handle those races, so that all events, even those that preempt other events, still get an accurate timestamp.

Handling those races requires performing 64-bit cmpxchg on the timestamps. But doing a 64-bit cmpxchg on 32-bit architectures is considered very slow. To deal with this, the timestamp logic was broken into two, and then three, 32-bit cmpxchgs, on the theory that two (or three) 32-bit cmpxchgs are still faster than a single 64-bit cmpxchg on 32-bit architectures.

Part of the problem with this is that I did not have any 32-bit architectures to test on. After hitting several subtle bugs in this code, an effort was made to see whether three 32-bit cmpxchgs are indeed faster than a single 64-bit one. After a few people brushed the dust off their old 32-bit machines, tests were done, and even though a 32-bit cmpxchg was faster than a single 64-bit one, it was on the order of 50% faster at best, not 300%.

After some more refactoring of the code, all four 64-bit cmpxchg operations were removed:

  https://lore.kernel.org/linux-trace-kernel/[email protected]
  https://lore.kernel.org/linux-trace-kernel/[email protected]
  https://lore.kernel.org/linux-trace-kernel/[email protected]
  https://lore.kernel.org/linux-trace-kernel/[email protected]/

With all the 64-bit cmpxchg operations removed, the complex 32-bit workaround can be removed as well. The 32-bit and 64-bit logic is now exactly the same.

Link: https://lore.kernel.org/all/[email protected]/
Link: https://lore.kernel.org/linux-trace-kernel/[email protected]

Cc: Masami Hiramatsu <[email protected]>
Cc: Mark Rutland <[email protected]>
Cc: Mathieu Desnoyers <[email protected]>
Cc: Linus Torvalds <[email protected]>
Signed-off-by: Steven Rostedt (Google) <[email protected]>
1 parent 3bf7009 commit c84897c
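As a rough illustration of the delta scheme the commit message describes (a 27-bit delta per event, with a fallback to a larger absolute timestamp when the gap between events does not fit), here is a minimal userspace sketch. It is not the ring-buffer code: the demo_* names are made up, and the real ring buffer emits the fallback as a separate time-stamp record rather than as fields on the event.

/*
 * Userspace sketch of the 27-bit delta vs. absolute-timestamp decision.
 * Hypothetical names; this is not the ring-buffer implementation.
 */
#include <stdint.h>
#include <stdbool.h>
#include <stdio.h>

#define DEMO_DELTA_BITS 27
#define DEMO_DELTA_MAX  ((1ULL << DEMO_DELTA_BITS) - 1)

struct demo_event {
        uint32_t delta;      /* delta from the previous event (fits in 27 bits) */
        bool     abs_stamp;  /* true if a full timestamp had to be emitted */
        uint64_t ts;         /* absolute timestamp, used only when abs_stamp is set */
};

/* Encode the timestamp of a new event relative to the previous one. */
static struct demo_event demo_encode_ts(uint64_t prev_ts, uint64_t now)
{
        struct demo_event ev = { 0 };
        uint64_t delta = now - prev_ts;

        if (delta > DEMO_DELTA_MAX) {
                /* Delta overflows 27 bits: record the absolute time instead. */
                ev.abs_stamp = true;
                ev.ts = now;
        } else {
                ev.delta = (uint32_t)delta;
        }
        return ev;
}

int main(void)
{
        uint64_t prev = 1000;
        struct demo_event small = demo_encode_ts(prev, prev + 12345);
        struct demo_event large = demo_encode_ts(prev, prev + (1ULL << 30));

        printf("small gap: delta=%u abs_stamp=%d\n", small.delta, small.abs_stamp);
        printf("large gap: delta=%u abs_stamp=%d ts=%llu\n",
               large.delta, large.abs_stamp, (unsigned long long)large.ts);
        return 0;
}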

kernel/trace/ring_buffer.c

Lines changed: 9 additions & 167 deletions
@@ -27,6 +27,7 @@
 #include <linux/cpu.h>
 #include <linux/oom.h>
 
+#include <asm/local64.h>
 #include <asm/local.h>
 
 /*
@@ -463,27 +464,9 @@ enum {
         RB_CTX_MAX
 };
 
-#if BITS_PER_LONG == 32
-#define RB_TIME_32
-#endif
-
-/* To test on 64 bit machines */
-//#define RB_TIME_32
-
-#ifdef RB_TIME_32
-
-struct rb_time_struct {
-        local_t cnt;
-        local_t top;
-        local_t bottom;
-        local_t msb;
-};
-#else
-#include <asm/local64.h>
 struct rb_time_struct {
         local64_t time;
 };
-#endif
 typedef struct rb_time_struct rb_time_t;
 
 #define MAX_NEST 5
@@ -573,147 +556,14 @@ struct ring_buffer_iter {
         int missed_events;
 };
 
-#ifdef RB_TIME_32
-
-/*
- * On 32 bit machines, local64_t is very expensive. As the ring
- * buffer doesn't need all the features of a true 64 bit atomic,
- * on 32 bit, it uses these functions (64 still uses local64_t).
- *
- * For the ring buffer, 64 bit required operations for the time is
- * the following:
- *
- *  - Reads may fail if it interrupted a modification of the time stamp.
- *      It will succeed if it did not interrupt another write even if
- *      the read itself is interrupted by a write.
- *      It returns whether it was successful or not.
- *
- *  - Writes always succeed and will overwrite other writes and writes
- *      that were done by events interrupting the current write.
- *
- *  - A write followed by a read of the same time stamp will always succeed,
- *      but may not contain the same value.
- *
- *  - A cmpxchg will fail if it interrupted another write or cmpxchg.
- *      Other than that, it acts like a normal cmpxchg.
- *
- * The 60 bit time stamp is broken up by 30 bits in a top and bottom half
- *  (bottom being the least significant 30 bits of the 60 bit time stamp).
- *
- * The two most significant bits of each half holds a 2 bit counter (0-3).
- * Each update will increment this counter by one.
- * When reading the top and bottom, if the two counter bits match then the
- *  top and bottom together make a valid 60 bit number.
- */
-#define RB_TIME_SHIFT 30
-#define RB_TIME_VAL_MASK ((1 << RB_TIME_SHIFT) - 1)
-#define RB_TIME_MSB_SHIFT 60
-
-static inline int rb_time_cnt(unsigned long val)
-{
-        return (val >> RB_TIME_SHIFT) & 3;
-}
-
-static inline u64 rb_time_val(unsigned long top, unsigned long bottom)
-{
-        u64 val;
-
-        val = top & RB_TIME_VAL_MASK;
-        val <<= RB_TIME_SHIFT;
-        val |= bottom & RB_TIME_VAL_MASK;
-
-        return val;
-}
-
-static inline bool __rb_time_read(rb_time_t *t, u64 *ret, unsigned long *cnt)
-{
-        unsigned long top, bottom, msb;
-        unsigned long c;
-
-        /*
-         * If the read is interrupted by a write, then the cnt will
-         * be different. Loop until both top and bottom have been read
-         * without interruption.
-         */
-        do {
-                c = local_read(&t->cnt);
-                top = local_read(&t->top);
-                bottom = local_read(&t->bottom);
-                msb = local_read(&t->msb);
-        } while (c != local_read(&t->cnt));
-
-        *cnt = rb_time_cnt(top);
-
-        /* If top, msb or bottom counts don't match, this interrupted a write */
-        if (*cnt != rb_time_cnt(msb) || *cnt != rb_time_cnt(bottom))
-                return false;
-
-        /* The shift to msb will lose its cnt bits */
-        *ret = rb_time_val(top, bottom) | ((u64)msb << RB_TIME_MSB_SHIFT);
-        return true;
-}
-
-static bool rb_time_read(rb_time_t *t, u64 *ret)
-{
-        unsigned long cnt;
-
-        return __rb_time_read(t, ret, &cnt);
-}
-
-static inline unsigned long rb_time_val_cnt(unsigned long val, unsigned long cnt)
-{
-        return (val & RB_TIME_VAL_MASK) | ((cnt & 3) << RB_TIME_SHIFT);
-}
-
-static inline void rb_time_split(u64 val, unsigned long *top, unsigned long *bottom,
-                                 unsigned long *msb)
-{
-        *top = (unsigned long)((val >> RB_TIME_SHIFT) & RB_TIME_VAL_MASK);
-        *bottom = (unsigned long)(val & RB_TIME_VAL_MASK);
-        *msb = (unsigned long)(val >> RB_TIME_MSB_SHIFT);
-}
-
-static inline void rb_time_val_set(local_t *t, unsigned long val, unsigned long cnt)
-{
-        val = rb_time_val_cnt(val, cnt);
-        local_set(t, val);
-}
-
-static void rb_time_set(rb_time_t *t, u64 val)
-{
-        unsigned long cnt, top, bottom, msb;
-
-        rb_time_split(val, &top, &bottom, &msb);
-
-        /* Writes always succeed with a valid number even if it gets interrupted. */
-        do {
-                cnt = local_inc_return(&t->cnt);
-                rb_time_val_set(&t->top, top, cnt);
-                rb_time_val_set(&t->bottom, bottom, cnt);
-                rb_time_val_set(&t->msb, val >> RB_TIME_MSB_SHIFT, cnt);
-        } while (cnt != local_read(&t->cnt));
-}
-
-static inline bool
-rb_time_read_cmpxchg(local_t *l, unsigned long expect, unsigned long set)
-{
-        return local_try_cmpxchg(l, &expect, set);
-}
-
-#else /* 64 bits */
-
-/* local64_t always succeeds */
-
-static inline bool rb_time_read(rb_time_t *t, u64 *ret)
+static inline void rb_time_read(rb_time_t *t, u64 *ret)
 {
         *ret = local64_read(&t->time);
-        return true;
 }
 static void rb_time_set(rb_time_t *t, u64 val)
 {
         local64_set(&t->time, val);
 }
-#endif
 
 /*
  * Enable this to make sure that the event passed to
@@ -820,10 +670,7 @@ u64 ring_buffer_event_time_stamp(struct trace_buffer *buffer,
         WARN_ONCE(1, "nest (%d) greater than max", nest);
 
  fail:
-        /* Can only fail on 32 bit */
-        if (!rb_time_read(&cpu_buffer->write_stamp, &ts))
-                /* Screw it, just read the current time */
-                ts = rb_time_stamp(cpu_buffer->buffer);
+        rb_time_read(&cpu_buffer->write_stamp, &ts);
 
         return ts;
 }
@@ -2820,7 +2667,7 @@ rb_check_timestamp(struct ring_buffer_per_cpu *cpu_buffer,
                   (unsigned long long)info->ts,
                   (unsigned long long)info->before,
                   (unsigned long long)info->after,
-                  (unsigned long long)(rb_time_read(&cpu_buffer->write_stamp, &write_stamp) ? write_stamp : 0),
+                  (unsigned long long)({rb_time_read(&cpu_buffer->write_stamp, &write_stamp); write_stamp;}),
                   sched_clock_stable() ? "" :
                   "If you just came from a suspend/resume,\n"
                   "please switch to the trace global clock:\n"
@@ -3497,16 +3344,14 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
         struct ring_buffer_event *event;
         struct buffer_page *tail_page;
         unsigned long tail, write, w;
-        bool a_ok;
-        bool b_ok;
 
         /* Don't let the compiler play games with cpu_buffer->tail_page */
         tail_page = info->tail_page = READ_ONCE(cpu_buffer->tail_page);
 
  /*A*/  w = local_read(&tail_page->write) & RB_WRITE_MASK;
         barrier();
-        b_ok = rb_time_read(&cpu_buffer->before_stamp, &info->before);
-        a_ok = rb_time_read(&cpu_buffer->write_stamp, &info->after);
+        rb_time_read(&cpu_buffer->before_stamp, &info->before);
+        rb_time_read(&cpu_buffer->write_stamp, &info->after);
         barrier();
         info->ts = rb_time_stamp(cpu_buffer->buffer);
 
@@ -3521,7 +3366,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
         if (!w) {
                 /* Use the sub-buffer timestamp */
                 info->delta = 0;
-        } else if (unlikely(!a_ok || !b_ok || info->before != info->after)) {
+        } else if (unlikely(info->before != info->after)) {
                 info->add_timestamp |= RB_ADD_STAMP_FORCE | RB_ADD_STAMP_EXTEND;
                 info->length += RB_LEN_TIME_EXTEND;
         } else {
@@ -3570,8 +3415,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
                 /* SLOW PATH - Interrupted between A and C */
 
                 /* Save the old before_stamp */
-                a_ok = rb_time_read(&cpu_buffer->before_stamp, &info->before);
-                RB_WARN_ON(cpu_buffer, !a_ok);
+                rb_time_read(&cpu_buffer->before_stamp, &info->before);
 
                 /*
                  * Read a new timestamp and update the before_stamp to make
@@ -3583,9 +3427,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
                 rb_time_set(&cpu_buffer->before_stamp, ts);
 
                 barrier();
- /*E*/          a_ok = rb_time_read(&cpu_buffer->write_stamp, &info->after);
-                /* Was interrupted before here, write_stamp must be valid */
-                RB_WARN_ON(cpu_buffer, !a_ok);
+ /*E*/          rb_time_read(&cpu_buffer->write_stamp, &info->after);
                 barrier();
  /*F*/  if (write == (local_read(&tail_page->write) & RB_WRITE_MASK) &&
             info->after == info->before && info->after < ts) {
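To see the encoding that the removed 32-bit path used in isolation, here is a minimal userspace sketch of the 30-bit top/bottom split with a 2-bit update counter, mirroring the deleted rb_time_split()/rb_time_val() helpers. It is illustration only: the demo_* names are invented, plain variables stand in for local_t, the separate msb word is omitted, and none of the atomicity of the real code is reproduced.

/*
 * Userspace sketch of the 30-bit top/bottom split used by the removed
 * 32-bit path: each half carries the value in its low 30 bits and a
 * 2-bit update counter in bits 30-31. A reader only trusts the pair
 * when both counters match; a mismatch means it raced with a writer.
 */
#include <stdint.h>
#include <stdbool.h>
#include <stdio.h>
#include <assert.h>

#define DEMO_TIME_SHIFT    30
#define DEMO_TIME_VAL_MASK ((1UL << DEMO_TIME_SHIFT) - 1)

/* Tag a 30-bit value with a 2-bit counter in the top bits. */
static unsigned long demo_pack(unsigned long val, unsigned long cnt)
{
        return (val & DEMO_TIME_VAL_MASK) | ((cnt & 3) << DEMO_TIME_SHIFT);
}

/* Split a (up to 60-bit) timestamp into tagged top/bottom halves. */
static void demo_split(uint64_t ts, unsigned long cnt,
                       unsigned long *top, unsigned long *bottom)
{
        *top    = demo_pack((unsigned long)(ts >> DEMO_TIME_SHIFT), cnt);
        *bottom = demo_pack((unsigned long)(ts & DEMO_TIME_VAL_MASK), cnt);
}

/* Reassemble; fails (returns false) if the two counters do not match. */
static bool demo_join(unsigned long top, unsigned long bottom, uint64_t *ts)
{
        if ((top >> DEMO_TIME_SHIFT) != (bottom >> DEMO_TIME_SHIFT))
                return false;   /* read raced with an update */

        *ts = ((uint64_t)(top & DEMO_TIME_VAL_MASK) << DEMO_TIME_SHIFT) |
              (bottom & DEMO_TIME_VAL_MASK);
        return true;
}

int main(void)
{
        unsigned long top, bottom;
        uint64_t ts = 123456789012ULL;  /* fits in 60 bits */
        uint64_t out = 0;

        /* Consistent pair: both halves carry counter 1. */
        demo_split(ts, 1, &top, &bottom);
        assert(demo_join(top, bottom, &out) && out == ts);

        /* Simulate a torn read: top already updated (counter 2), bottom stale. */
        demo_split(ts + 5, 2, &top, &bottom);
        bottom = demo_pack((unsigned long)(ts & DEMO_TIME_VAL_MASK), 1);
        printf("torn read detected: %s\n",
               demo_join(top, bottom, &out) ? "no" : "yes");
        return 0;
}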
