Skip to content

Commit f4d34a8

Browse files
committed
tracing: Use pid bitmap instead of a pid array for set_event_pid
In order to add the ability to let tasks that are filtered by the events have their children also be traced on fork (and then not traced on exit), convert the array into a pid bitmask. Most of the time the number of pids is only 32768, which fits in a 4k bitmask — the same size as the default list currently is — and that list could grow if more pids are listed. This also greatly simplifies the code. Suggested-by: "H. Peter Anvin" <[email protected]> Signed-off-by: Steven Rostedt <[email protected]>
1 parent 9ebc57c commit f4d34a8

File tree

2 files changed

+102
-124
lines changed

2 files changed

+102
-124
lines changed

kernel/trace/trace.h

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -177,9 +177,8 @@ struct trace_options {
177177
};
178178

179179
struct trace_pid_list {
180-
unsigned int nr_pids;
181-
int order;
182-
pid_t *pids;
180+
int pid_max;
181+
unsigned long *pids;
183182
};
184183

185184
/*

kernel/trace/trace_events.c

Lines changed: 100 additions & 121 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
#include <linux/kthread.h>
1616
#include <linux/tracefs.h>
1717
#include <linux/uaccess.h>
18-
#include <linux/bsearch.h>
18+
#include <linux/vmalloc.h>
1919
#include <linux/module.h>
2020
#include <linux/ctype.h>
2121
#include <linux/sort.h>
@@ -471,23 +471,13 @@ static void ftrace_clear_events(struct trace_array *tr)
471471
mutex_unlock(&event_mutex);
472472
}
473473

474-
static int cmp_pid(const void *key, const void *elt)
475-
{
476-
const pid_t *search_pid = key;
477-
const pid_t *pid = elt;
478-
479-
if (*search_pid == *pid)
480-
return 0;
481-
if (*search_pid < *pid)
482-
return -1;
483-
return 1;
484-
}
474+
/* Shouldn't this be in a header? */
475+
extern int pid_max;
485476

486477
static bool
487478
ignore_this_task(struct trace_pid_list *filtered_pids, struct task_struct *task)
488479
{
489-
pid_t search_pid;
490-
pid_t *pid;
480+
pid_t pid;
491481

492482
/*
493483
* Return false, because if filtered_pids does not exist,
@@ -496,15 +486,16 @@ ignore_this_task(struct trace_pid_list *filtered_pids, struct task_struct *task)
496486
if (!filtered_pids)
497487
return false;
498488

499-
search_pid = task->pid;
489+
pid = task->pid;
500490

501-
pid = bsearch(&search_pid, filtered_pids->pids,
502-
filtered_pids->nr_pids, sizeof(pid_t),
503-
cmp_pid);
504-
if (!pid)
491+
/*
492+
* If pid_max changed after filtered_pids was created, we
493+
* by default ignore all pids greater than the previous pid_max.
494+
*/
495+
if (task->pid >= filtered_pids->pid_max)
505496
return true;
506497

507-
return false;
498+
return !test_bit(task->pid, filtered_pids->pids);
508499
}
509500

510501
static void
@@ -602,7 +593,7 @@ static void __ftrace_clear_event_pids(struct trace_array *tr)
602593
/* Wait till all users are no longer using pid filtering */
603594
synchronize_sched();
604595

605-
free_pages((unsigned long)pid_list->pids, pid_list->order);
596+
vfree(pid_list->pids);
606597
kfree(pid_list);
607598
}
608599

@@ -946,11 +937,32 @@ static void t_stop(struct seq_file *m, void *p)
946937
mutex_unlock(&event_mutex);
947938
}
948939

940+
static void *
941+
p_next(struct seq_file *m, void *v, loff_t *pos)
942+
{
943+
struct trace_array *tr = m->private;
944+
struct trace_pid_list *pid_list = rcu_dereference_sched(tr->filtered_pids);
945+
unsigned long pid = (unsigned long)v;
946+
947+
(*pos)++;
948+
949+
/* pid already is +1 of the actual previous bit */
950+
pid = find_next_bit(pid_list->pids, pid_list->pid_max, pid);
951+
952+
/* Return pid + 1 to allow zero to be represented */
953+
if (pid < pid_list->pid_max)
954+
return (void *)(pid + 1);
955+
956+
return NULL;
957+
}
958+
949959
static void *p_start(struct seq_file *m, loff_t *pos)
950960
__acquires(RCU)
951961
{
952962
struct trace_pid_list *pid_list;
953963
struct trace_array *tr = m->private;
964+
unsigned long pid;
965+
loff_t l = 0;
954966

955967
/*
956968
* Grab the mutex, to keep calls to p_next() having the same
@@ -963,10 +975,18 @@ static void *p_start(struct seq_file *m, loff_t *pos)
963975

964976
pid_list = rcu_dereference_sched(tr->filtered_pids);
965977

966-
if (!pid_list || *pos >= pid_list->nr_pids)
978+
if (!pid_list)
979+
return NULL;
980+
981+
pid = find_first_bit(pid_list->pids, pid_list->pid_max);
982+
if (pid >= pid_list->pid_max)
967983
return NULL;
968984

969-
return (void *)&pid_list->pids[*pos];
985+
/* Return pid + 1 so that zero can be the exit value */
986+
for (pid++; pid && l < *pos;
987+
pid = (unsigned long)p_next(m, (void *)pid, &l))
988+
;
989+
return (void *)pid;
970990
}
971991

972992
static void p_stop(struct seq_file *m, void *p)
@@ -976,25 +996,11 @@ static void p_stop(struct seq_file *m, void *p)
976996
mutex_unlock(&event_mutex);
977997
}
978998

979-
static void *
980-
p_next(struct seq_file *m, void *v, loff_t *pos)
981-
{
982-
struct trace_array *tr = m->private;
983-
struct trace_pid_list *pid_list = rcu_dereference_sched(tr->filtered_pids);
984-
985-
(*pos)++;
986-
987-
if (*pos >= pid_list->nr_pids)
988-
return NULL;
989-
990-
return (void *)&pid_list->pids[*pos];
991-
}
992-
993999
static int p_show(struct seq_file *m, void *v)
9941000
{
995-
pid_t *pid = v;
1001+
unsigned long pid = (unsigned long)v - 1;
9961002

997-
seq_printf(m, "%d\n", *pid);
1003+
seq_printf(m, "%lu\n", pid);
9981004
return 0;
9991005
}
10001006

@@ -1543,11 +1549,6 @@ show_header(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
15431549
return r;
15441550
}
15451551

1546-
static int max_pids(struct trace_pid_list *pid_list)
1547-
{
1548-
return (PAGE_SIZE << pid_list->order) / sizeof(pid_t);
1549-
}
1550-
15511552
static void ignore_task_cpu(void *data)
15521553
{
15531554
struct trace_array *tr = data;
@@ -1571,15 +1572,15 @@ ftrace_event_pid_write(struct file *filp, const char __user *ubuf,
15711572
struct seq_file *m = filp->private_data;
15721573
struct trace_array *tr = m->private;
15731574
struct trace_pid_list *filtered_pids = NULL;
1574-
struct trace_pid_list *pid_list = NULL;
1575+
struct trace_pid_list *pid_list;
15751576
struct trace_event_file *file;
15761577
struct trace_parser parser;
15771578
unsigned long val;
15781579
loff_t this_pos;
15791580
ssize_t read = 0;
15801581
ssize_t ret = 0;
15811582
pid_t pid;
1582-
int i;
1583+
int nr_pids = 0;
15831584

15841585
if (!cnt)
15851586
return 0;
@@ -1592,10 +1593,43 @@ ftrace_event_pid_write(struct file *filp, const char __user *ubuf,
15921593
return -ENOMEM;
15931594

15941595
mutex_lock(&event_mutex);
1596+
filtered_pids = rcu_dereference_protected(tr->filtered_pids,
1597+
lockdep_is_held(&event_mutex));
1598+
15951599
/*
1596-
* Load as many pids into the array before doing a
1597-
* swap from the tr->filtered_pids to the new list.
1600+
* Always recreate a new array. The write is an all or nothing
1601+
* operation. Always create a new array when adding new pids by
1602+
* the user. If the operation fails, then the current list is
1603+
* not modified.
15981604
*/
1605+
pid_list = kmalloc(sizeof(*pid_list), GFP_KERNEL);
1606+
if (!pid_list) {
1607+
read = -ENOMEM;
1608+
goto out;
1609+
}
1610+
pid_list->pid_max = READ_ONCE(pid_max);
1611+
/* Only truncating will shrink pid_max */
1612+
if (filtered_pids && filtered_pids->pid_max > pid_list->pid_max)
1613+
pid_list->pid_max = filtered_pids->pid_max;
1614+
pid_list->pids = vzalloc((pid_list->pid_max + 7) >> 3);
1615+
if (!pid_list->pids) {
1616+
kfree(pid_list);
1617+
read = -ENOMEM;
1618+
goto out;
1619+
}
1620+
if (filtered_pids) {
1621+
/* copy the current bits to the new max */
1622+
pid = find_first_bit(filtered_pids->pids,
1623+
filtered_pids->pid_max);
1624+
while (pid < filtered_pids->pid_max) {
1625+
set_bit(pid, pid_list->pids);
1626+
pid = find_next_bit(filtered_pids->pids,
1627+
filtered_pids->pid_max,
1628+
pid + 1);
1629+
nr_pids++;
1630+
}
1631+
}
1632+
15991633
while (cnt > 0) {
16001634

16011635
this_pos = 0;
@@ -1613,92 +1647,35 @@ ftrace_event_pid_write(struct file *filp, const char __user *ubuf,
16131647
ret = -EINVAL;
16141648
if (kstrtoul(parser.buffer, 0, &val))
16151649
break;
1616-
if (val > INT_MAX)
1650+
if (val >= pid_list->pid_max)
16171651
break;
16181652

16191653
pid = (pid_t)val;
16201654

1621-
ret = -ENOMEM;
1622-
if (!pid_list) {
1623-
pid_list = kmalloc(sizeof(*pid_list), GFP_KERNEL);
1624-
if (!pid_list)
1625-
break;
1626-
1627-
filtered_pids = rcu_dereference_protected(tr->filtered_pids,
1628-
lockdep_is_held(&event_mutex));
1629-
if (filtered_pids)
1630-
pid_list->order = filtered_pids->order;
1631-
else
1632-
pid_list->order = 0;
1633-
1634-
pid_list->pids = (void *)__get_free_pages(GFP_KERNEL,
1635-
pid_list->order);
1636-
if (!pid_list->pids)
1637-
break;
1638-
1639-
if (filtered_pids) {
1640-
pid_list->nr_pids = filtered_pids->nr_pids;
1641-
memcpy(pid_list->pids, filtered_pids->pids,
1642-
pid_list->nr_pids * sizeof(pid_t));
1643-
} else
1644-
pid_list->nr_pids = 0;
1645-
}
1646-
1647-
if (pid_list->nr_pids >= max_pids(pid_list)) {
1648-
pid_t *pid_page;
1649-
1650-
pid_page = (void *)__get_free_pages(GFP_KERNEL,
1651-
pid_list->order + 1);
1652-
if (!pid_page)
1653-
break;
1654-
memcpy(pid_page, pid_list->pids,
1655-
pid_list->nr_pids * sizeof(pid_t));
1656-
free_pages((unsigned long)pid_list->pids, pid_list->order);
1657-
1658-
pid_list->order++;
1659-
pid_list->pids = pid_page;
1660-
}
1655+
set_bit(pid, pid_list->pids);
1656+
nr_pids++;
16611657

1662-
pid_list->pids[pid_list->nr_pids++] = pid;
16631658
trace_parser_clear(&parser);
16641659
ret = 0;
16651660
}
16661661
trace_parser_put(&parser);
16671662

16681663
if (ret < 0) {
1669-
if (pid_list)
1670-
free_pages((unsigned long)pid_list->pids, pid_list->order);
1664+
vfree(pid_list->pids);
16711665
kfree(pid_list);
1672-
mutex_unlock(&event_mutex);
1673-
return ret;
1674-
}
1675-
1676-
if (!pid_list) {
1677-
mutex_unlock(&event_mutex);
1678-
return ret;
1666+
read = ret;
1667+
goto out;
16791668
}
16801669

1681-
sort(pid_list->pids, pid_list->nr_pids, sizeof(pid_t), cmp_pid, NULL);
1682-
1683-
/* Remove duplicates */
1684-
for (i = 1; i < pid_list->nr_pids; i++) {
1685-
int start = i;
1686-
1687-
while (i < pid_list->nr_pids &&
1688-
pid_list->pids[i - 1] == pid_list->pids[i])
1689-
i++;
1690-
1691-
if (start != i) {
1692-
if (i < pid_list->nr_pids) {
1693-
memmove(&pid_list->pids[start], &pid_list->pids[i],
1694-
(pid_list->nr_pids - i) * sizeof(pid_t));
1695-
pid_list->nr_pids -= i - start;
1696-
i = start;
1697-
} else
1698-
pid_list->nr_pids = start;
1699-
}
1670+
if (!nr_pids) {
1671+
/* Cleared the list of pids */
1672+
vfree(pid_list->pids);
1673+
kfree(pid_list);
1674+
read = ret;
1675+
if (!filtered_pids)
1676+
goto out;
1677+
pid_list = NULL;
17001678
}
1701-
17021679
rcu_assign_pointer(tr->filtered_pids, pid_list);
17031680

17041681
list_for_each_entry(file, &tr->events, list) {
@@ -1708,7 +1685,7 @@ ftrace_event_pid_write(struct file *filp, const char __user *ubuf,
17081685
if (filtered_pids) {
17091686
synchronize_sched();
17101687

1711-
free_pages((unsigned long)filtered_pids->pids, filtered_pids->order);
1688+
vfree(filtered_pids->pids);
17121689
kfree(filtered_pids);
17131690
} else {
17141691
/*
@@ -1745,10 +1722,12 @@ ftrace_event_pid_write(struct file *filp, const char __user *ubuf,
17451722
*/
17461723
on_each_cpu(ignore_task_cpu, tr, 1);
17471724

1725+
out:
17481726
mutex_unlock(&event_mutex);
17491727

17501728
ret = read;
1751-
*ppos += read;
1729+
if (read > 0)
1730+
*ppos += read;
17521731

17531732
return ret;
17541733
}

0 commit comments

Comments
 (0)