
Commit fa4b851

Merge tag 'trace-ring-buffer-v6.8-rc7' of git://git.kernel.org/pub/scm/linux/kernel/git/trace/linux-trace
Pull tracing fixes from Steven Rostedt:

 - Do not allow large strings (> 4096) as a single write to trace_marker

   The size of a string written into trace_marker was determined by the
   size of the sub-buffer in the ring buffer. That size is dependent on
   the PAGE_SIZE of the architecture as it can be mapped into user
   space. But on PowerPC, where PAGE_SIZE is 64K, that made the limit of
   the string written into trace_marker 64K.

   One of the selftests looks at the size of the ring buffer sub-buffers
   and writes that plus more into the trace_marker. The write will take
   what it can and report back what it consumed so that the user space
   application (like echo) will write the rest of the string. The string
   is stored in the ring buffer and can be read via the "trace" or
   "trace_pipe" files.

   The reading of the ring buffer uses vsnprintf(), which uses a
   precision "%.*s" to make sure it only reads what is stored in the
   buffer, as a bug could cause the string to be non terminated.

   With the combination of the precision change and the PAGE_SIZE of 64K
   allowing huge strings to be added into the ring buffer, plus the test
   that would actually stress that limit, a bug was reported that the
   precision used was too big for "%.*s" as the string was close to 64K
   in size and the max precision of vsnprintf is 32K.

   Linus suggested not to have that precision, as it could hide a bug if
   the string was again stored without a nul byte.

   Another issue that was brought up is that the trace_seq buffer is
   also based on PAGE_SIZE even though it is not tied to the
   architecture limit the way the ring buffer sub-buffer is. Having it
   be 64K * 2 is simply too big and wastes memory on systems with 64K
   page sizes. It is now hardcoded to 8K, which is what all other
   architectures with a 4K PAGE_SIZE have.

   Finally, the write to trace_marker is now limited to 4K as there is
   no reason to write larger strings into trace_marker.

 - ring_buffer_wait() should not loop

   ring_buffer_wait() does not have the full context (yet) on whether it
   should loop or not. Just exit the loop as soon as it is woken up and
   let the callers decide whether to loop (they already do, so the
   internal loop is redundant).

 - Fix the shortest_full field to be the smallest amount in the ring
   buffer that a waiter is waiting for

   The "shortest_full" field is updated when a new waiter comes in and
   wants to wait for a smaller amount of data in the ring buffer than
   other waiters. But after all waiters are woken up, it is not reset,
   so if another waiter comes in wanting to wait for more data, it will
   be woken up when the ring buffer has a smaller amount than what the
   previous waiters were waiting for.

 - The wake-up of all waiters on close is incorrectly done from
   .release() and not from .flush(), so it will never wake up any
   waiters, because .release() will not get called until all .read()
   calls are finished, and the wakeup is for the waiters in those
   .read() calls.

* tag 'trace-ring-buffer-v6.8-rc7' of git://git.kernel.org/pub/scm/linux/kernel/git/trace/linux-trace:
  tracing: Use .flush() call to wake up readers
  ring-buffer: Fix resetting of shortest_full
  ring-buffer: Fix waking up ring buffer readers
  tracing: Limit trace_marker writes to just 4K
  tracing: Limit trace_seq size to just 8K and not depend on architecture PAGE_SIZE
  tracing: Remove precision vsnprintf() check from print event
2 parents 210ee63 + e5d7c19 commit fa4b851
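
As background for the trace_marker changes above, the partial-write protocol works like an ordinary short write(2): the kernel consumes what it can (now at most 4K per call) and returns the number of bytes it took, and the caller advances and retries. A minimal user-space sketch of that loop follows; it is illustrative only, and the tracefs path and the 16K test string are assumptions, not part of this commit.

/*
 * Illustrative user-space sketch, not part of this commit: feed a long
 * string to trace_marker the way echo does, letting partial writes drive
 * the loop. With this series each write() consumes at most 4K.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	char msg[16384];		/* deliberately larger than the 4K limit */
	size_t len, off = 0;
	int fd;

	memset(msg, 'x', sizeof(msg) - 1);
	msg[sizeof(msg) - 1] = '\0';
	len = strlen(msg);

	fd = open("/sys/kernel/tracing/trace_marker", O_WRONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	while (off < len) {
		/* The kernel reports how much it consumed per call */
		ssize_t ret = write(fd, msg + off, len - off);

		if (ret < 0) {
			perror("write");
			break;
		}
		off += (size_t)ret;
	}
	close(fd);
	return 0;
}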

4 files changed: 120 additions and 94 deletions


include/linux/trace_seq.h

Lines changed: 7 additions & 1 deletion
@@ -9,9 +9,15 @@
 /*
  * Trace sequences are used to allow a function to call several other functions
  * to create a string of data to use.
+ *
+ * Have the trace seq to be 8K which is typically PAGE_SIZE * 2 on
+ * most architectures. The TRACE_SEQ_BUFFER_SIZE (which is
+ * TRACE_SEQ_SIZE minus the other fields of trace_seq), is the
+ * max size the output of a trace event may be.
  */
 
-#define TRACE_SEQ_BUFFER_SIZE	(PAGE_SIZE * 2 - \
+#define TRACE_SEQ_SIZE		8192
+#define TRACE_SEQ_BUFFER_SIZE	(TRACE_SEQ_SIZE - \
 	(sizeof(struct seq_buf) + sizeof(size_t) + sizeof(int)))
 
 struct trace_seq {
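
For orientation, TRACE_SEQ_BUFFER_SIZE above is what remains of the 8K TRACE_SEQ_SIZE after the other trace_seq fields, so the maximum output of a single trace event is a little under 8K. A stand-alone sketch of that arithmetic follows; struct seq_buf_stub is a made-up placeholder, not the real struct seq_buf layout.

/* Stand-alone sketch of the TRACE_SEQ_BUFFER_SIZE arithmetic. The
 * seq_buf_stub below is a placeholder assumption, not kernel code. */
#include <stddef.h>
#include <stdio.h>

struct seq_buf_stub {
	char *buffer;
	size_t size;
	size_t len;
};

#define TRACE_SEQ_SIZE		8192
#define TRACE_SEQ_BUFFER_SIZE	(TRACE_SEQ_SIZE - \
	(sizeof(struct seq_buf_stub) + sizeof(size_t) + sizeof(int)))

int main(void)
{
	/* Prints a value a little under 8192 on a typical 64-bit build */
	printf("max trace event output: %zu bytes\n",
	       (size_t)TRACE_SEQ_BUFFER_SIZE);
	return 0;
}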

kernel/trace/ring_buffer.c

Lines changed: 91 additions & 78 deletions
@@ -384,7 +384,6 @@ struct rb_irq_work {
 	struct irq_work			work;
 	wait_queue_head_t		waiters;
 	wait_queue_head_t		full_waiters;
-	long				wait_index;
 	bool				waiters_pending;
 	bool				full_waiters_pending;
 	bool				wakeup_full;
@@ -756,8 +755,19 @@ static void rb_wake_up_waiters(struct irq_work *work)
 
 	wake_up_all(&rbwork->waiters);
 	if (rbwork->full_waiters_pending || rbwork->wakeup_full) {
+		/* Only cpu_buffer sets the above flags */
+		struct ring_buffer_per_cpu *cpu_buffer =
+			container_of(rbwork, struct ring_buffer_per_cpu, irq_work);
+
+		/* Called from interrupt context */
+		raw_spin_lock(&cpu_buffer->reader_lock);
 		rbwork->wakeup_full = false;
 		rbwork->full_waiters_pending = false;
+
+		/* Waking up all waiters, they will reset the shortest full */
+		cpu_buffer->shortest_full = 0;
+		raw_spin_unlock(&cpu_buffer->reader_lock);
+
 		wake_up_all(&rbwork->full_waiters);
 	}
 }
@@ -798,14 +808,40 @@ void ring_buffer_wake_waiters(struct trace_buffer *buffer, int cpu)
 		rbwork = &cpu_buffer->irq_work;
 	}
 
-	rbwork->wait_index++;
-	/* make sure the waiters see the new index */
-	smp_wmb();
-
 	/* This can be called in any context */
 	irq_work_queue(&rbwork->work);
 }
 
+static bool rb_watermark_hit(struct trace_buffer *buffer, int cpu, int full)
+{
+	struct ring_buffer_per_cpu *cpu_buffer;
+	bool ret = false;
+
+	/* Reads of all CPUs always waits for any data */
+	if (cpu == RING_BUFFER_ALL_CPUS)
+		return !ring_buffer_empty(buffer);
+
+	cpu_buffer = buffer->buffers[cpu];
+
+	if (!ring_buffer_empty_cpu(buffer, cpu)) {
+		unsigned long flags;
+		bool pagebusy;
+
+		if (!full)
+			return true;
+
+		raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+		pagebusy = cpu_buffer->reader_page == cpu_buffer->commit_page;
+		ret = !pagebusy && full_hit(buffer, cpu, full);
+
+		if (!cpu_buffer->shortest_full ||
+		    cpu_buffer->shortest_full > full)
+			cpu_buffer->shortest_full = full;
+		raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+	}
+	return ret;
+}
+
 /**
  * ring_buffer_wait - wait for input to the ring buffer
  * @buffer: buffer to wait on
@@ -821,7 +857,6 @@ int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full)
 	struct ring_buffer_per_cpu *cpu_buffer;
 	DEFINE_WAIT(wait);
 	struct rb_irq_work *work;
-	long wait_index;
 	int ret = 0;
 
 	/*
@@ -840,81 +875,54 @@ int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full)
 		work = &cpu_buffer->irq_work;
 	}
 
-	wait_index = READ_ONCE(work->wait_index);
-
-	while (true) {
-		if (full)
-			prepare_to_wait(&work->full_waiters, &wait, TASK_INTERRUPTIBLE);
-		else
-			prepare_to_wait(&work->waiters, &wait, TASK_INTERRUPTIBLE);
-
-		/*
-		 * The events can happen in critical sections where
-		 * checking a work queue can cause deadlocks.
-		 * After adding a task to the queue, this flag is set
-		 * only to notify events to try to wake up the queue
-		 * using irq_work.
-		 *
-		 * We don't clear it even if the buffer is no longer
-		 * empty. The flag only causes the next event to run
-		 * irq_work to do the work queue wake up. The worse
-		 * that can happen if we race with !trace_empty() is that
-		 * an event will cause an irq_work to try to wake up
-		 * an empty queue.
-		 *
-		 * There's no reason to protect this flag either, as
-		 * the work queue and irq_work logic will do the necessary
-		 * synchronization for the wake ups. The only thing
-		 * that is necessary is that the wake up happens after
-		 * a task has been queued. It's OK for spurious wake ups.
-		 */
-		if (full)
-			work->full_waiters_pending = true;
-		else
-			work->waiters_pending = true;
-
-		if (signal_pending(current)) {
-			ret = -EINTR;
-			break;
-		}
-
-		if (cpu == RING_BUFFER_ALL_CPUS && !ring_buffer_empty(buffer))
-			break;
-
-		if (cpu != RING_BUFFER_ALL_CPUS &&
-		    !ring_buffer_empty_cpu(buffer, cpu)) {
-			unsigned long flags;
-			bool pagebusy;
-			bool done;
-
-			if (!full)
-				break;
-
-			raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
-			pagebusy = cpu_buffer->reader_page == cpu_buffer->commit_page;
-			done = !pagebusy && full_hit(buffer, cpu, full);
+	if (full)
+		prepare_to_wait(&work->full_waiters, &wait, TASK_INTERRUPTIBLE);
+	else
+		prepare_to_wait(&work->waiters, &wait, TASK_INTERRUPTIBLE);
 
-			if (!cpu_buffer->shortest_full ||
-			    cpu_buffer->shortest_full > full)
-				cpu_buffer->shortest_full = full;
-			raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
-			if (done)
-				break;
-		}
+	/*
+	 * The events can happen in critical sections where
+	 * checking a work queue can cause deadlocks.
+	 * After adding a task to the queue, this flag is set
+	 * only to notify events to try to wake up the queue
+	 * using irq_work.
+	 *
+	 * We don't clear it even if the buffer is no longer
+	 * empty. The flag only causes the next event to run
+	 * irq_work to do the work queue wake up. The worse
+	 * that can happen if we race with !trace_empty() is that
+	 * an event will cause an irq_work to try to wake up
+	 * an empty queue.
+	 *
+	 * There's no reason to protect this flag either, as
+	 * the work queue and irq_work logic will do the necessary
+	 * synchronization for the wake ups. The only thing
+	 * that is necessary is that the wake up happens after
+	 * a task has been queued. It's OK for spurious wake ups.
+	 */
+	if (full)
+		work->full_waiters_pending = true;
+	else
+		work->waiters_pending = true;
 
-		schedule();
+	if (rb_watermark_hit(buffer, cpu, full))
+		goto out;
 
-		/* Make sure to see the new wait index */
-		smp_rmb();
-		if (wait_index != work->wait_index)
-			break;
+	if (signal_pending(current)) {
+		ret = -EINTR;
+		goto out;
 	}
 
+	schedule();
+ out:
 	if (full)
 		finish_wait(&work->full_waiters, &wait);
 	else
 		finish_wait(&work->waiters, &wait);
 
+	if (!ret && !rb_watermark_hit(buffer, cpu, full) && signal_pending(current))
+		ret = -EINTR;
+
 	return ret;
 }
 
@@ -937,28 +945,33 @@ __poll_t ring_buffer_poll_wait(struct trace_buffer *buffer, int cpu,
 			  struct file *filp, poll_table *poll_table, int full)
 {
 	struct ring_buffer_per_cpu *cpu_buffer;
-	struct rb_irq_work *work;
+	struct rb_irq_work *rbwork;
 
 	if (cpu == RING_BUFFER_ALL_CPUS) {
-		work = &buffer->irq_work;
+		rbwork = &buffer->irq_work;
 		full = 0;
 	} else {
 		if (!cpumask_test_cpu(cpu, buffer->cpumask))
 			return EPOLLERR;
 
 		cpu_buffer = buffer->buffers[cpu];
-		work = &cpu_buffer->irq_work;
+		rbwork = &cpu_buffer->irq_work;
 	}
 
 	if (full) {
-		poll_wait(filp, &work->full_waiters, poll_table);
-		work->full_waiters_pending = true;
+		unsigned long flags;
+
+		poll_wait(filp, &rbwork->full_waiters, poll_table);
+
+		raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+		rbwork->full_waiters_pending = true;
 		if (!cpu_buffer->shortest_full ||
 		    cpu_buffer->shortest_full > full)
 			cpu_buffer->shortest_full = full;
+		raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
 	} else {
-		poll_wait(filp, &work->waiters, poll_table);
-		work->waiters_pending = true;
+		poll_wait(filp, &rbwork->waiters, poll_table);
+		rbwork->waiters_pending = true;
 	}
 
 	/*
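
With the rework above, ring_buffer_wait() blocks at most once per call and returns; deciding whether to wait again is left to its callers, which already loop. A hedged sketch of that caller-side pattern follows (demo_wait_for_data() is an illustrative function, not code from the tracing subsystem):

/* Illustrative caller-side loop, not copied from the tracing code:
 * ring_buffer_wait() now blocks once and returns, and the caller
 * re-checks its own condition to decide whether to wait again. */
#include <linux/ring_buffer.h>

static int demo_wait_for_data(struct trace_buffer *buffer, int cpu, int full)
{
	int ret;

	/* Loop until the per-CPU buffer has data or a signal interrupts us */
	while (ring_buffer_empty_cpu(buffer, cpu)) {
		ret = ring_buffer_wait(buffer, cpu, full);
		if (ret)	/* e.g. -EINTR from a pending signal */
			return ret;
	}
	return 0;
}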

kernel/trace/trace.c

Lines changed: 20 additions & 11 deletions
@@ -7293,6 +7293,8 @@ tracing_free_buffer_release(struct inode *inode, struct file *filp)
 	return 0;
 }
 
+#define TRACE_MARKER_MAX_SIZE		4096
+
 static ssize_t
 tracing_mark_write(struct file *filp, const char __user *ubuf,
 					size_t cnt, loff_t *fpos)
@@ -7320,6 +7322,9 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
 	if ((ssize_t)cnt < 0)
 		return -EINVAL;
 
+	if (cnt > TRACE_MARKER_MAX_SIZE)
+		cnt = TRACE_MARKER_MAX_SIZE;
+
 	meta_size = sizeof(*entry) + 2;	/* add '\0' and possible '\n' */
  again:
 	size = cnt + meta_size;
@@ -7328,11 +7333,6 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
 	if (cnt < FAULTED_SIZE)
 		size += FAULTED_SIZE - cnt;
 
-	if (size > TRACE_SEQ_BUFFER_SIZE) {
-		cnt -= size - TRACE_SEQ_BUFFER_SIZE;
-		goto again;
-	}
-
 	buffer = tr->array_buffer.buffer;
 	event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, size,
 					    tracing_gen_ctx());
@@ -8393,6 +8393,20 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
 	return size;
 }
 
+static int tracing_buffers_flush(struct file *file, fl_owner_t id)
+{
+	struct ftrace_buffer_info *info = file->private_data;
+	struct trace_iterator *iter = &info->iter;
+
+	iter->wait_index++;
+	/* Make sure the waiters see the new wait_index */
+	smp_wmb();
+
+	ring_buffer_wake_waiters(iter->array_buffer->buffer, iter->cpu_file);
+
+	return 0;
+}
+
 static int tracing_buffers_release(struct inode *inode, struct file *file)
 {
 	struct ftrace_buffer_info *info = file->private_data;
@@ -8404,12 +8418,6 @@ static int tracing_buffers_release(struct inode *inode, struct file *file)
 
 	__trace_array_put(iter->tr);
 
-	iter->wait_index++;
-	/* Make sure the waiters see the new wait_index */
-	smp_wmb();
-
-	ring_buffer_wake_waiters(iter->array_buffer->buffer, iter->cpu_file);
-
 	if (info->spare)
 		ring_buffer_free_read_page(iter->array_buffer->buffer,
 					   info->spare_cpu, info->spare);
@@ -8625,6 +8633,7 @@ static const struct file_operations tracing_buffers_fops = {
 	.read		= tracing_buffers_read,
 	.poll		= tracing_buffers_poll,
 	.release	= tracing_buffers_release,
+	.flush		= tracing_buffers_flush,
 	.splice_read	= tracing_buffers_splice_read,
 	.unlocked_ioctl	= tracing_buffers_ioctl,
 	.llseek		= no_llseek,
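
The hunks above move the reader wake-up from .release() to a new .flush() callback: .flush() runs on every close() of the file descriptor, even while other threads are still blocked in .read(), whereas .release() only runs after the last reference is dropped, which is after those readers have returned. A minimal sketch of that distinction follows (the demo_* names are made up for illustration):

/* Minimal sketch of the .flush() vs .release() distinction; demo_* names
 * are invented and this is not code from the patch. */
#include <linux/fs.h>

static int demo_flush(struct file *file, fl_owner_t id)
{
	/* Called at every close(): wake blocked readers here, as
	 * tracing_buffers_flush() now does. */
	return 0;
}

static int demo_release(struct inode *inode, struct file *file)
{
	/* Called only after the last reference is gone, i.e. after the
	 * readers this close was meant to wake have already returned. */
	return 0;
}

static const struct file_operations demo_fops = {
	.flush		= demo_flush,
	.release	= demo_release,
};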

kernel/trace/trace_output.c

Lines changed: 2 additions & 4 deletions
@@ -1587,12 +1587,11 @@ static enum print_line_t trace_print_print(struct trace_iterator *iter,
 {
 	struct print_entry *field;
 	struct trace_seq *s = &iter->seq;
-	int max = iter->ent_size - offsetof(struct print_entry, buf);
 
 	trace_assign_type(field, iter->ent);
 
 	seq_print_ip_sym(s, field->ip, flags);
-	trace_seq_printf(s, ": %.*s", max, field->buf);
+	trace_seq_printf(s, ": %s", field->buf);
 
 	return trace_handle_return(s);
 }
@@ -1601,11 +1600,10 @@ static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags,
 					 struct trace_event *event)
 {
 	struct print_entry *field;
-	int max = iter->ent_size - offsetof(struct print_entry, buf);
 
 	trace_assign_type(field, iter->ent);
 
-	trace_seq_printf(&iter->seq, "# %lx %.*s", field->ip, max, field->buf);
+	trace_seq_printf(&iter->seq, "# %lx %s", field->ip, field->buf);
 
 	return trace_handle_return(&iter->seq);
 }
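
For context on the removal above: the "%.*s" precision is the printf mechanism that bounds how many bytes are read from a possibly unterminated buffer, and the kernel's vsnprintf() caps that precision at 32K, which is why a trace_marker string close to 64K triggered a warning. Dropping the precision means a missing nul byte would now become visible instead of being silently truncated. A small stand-alone illustration of the precision behaviour (user space, values chosen for the example):

/* User-space illustration of "%.*s": the precision bounds how many bytes
 * are read from a buffer that has no terminating nul. */
#include <stdio.h>

int main(void)
{
	char buf[4] = { 'h', 'i', '!', '?' };	/* deliberately not nul-terminated */
	int max = sizeof(buf);

	/* Reads at most 'max' bytes, so the missing '\0' is never reached */
	printf("%.*s\n", max, buf);
	return 0;
}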
