
Commit 0ffca60

captain5050 authored and acmel committed
perf pmu intel: Adjust cpumasks for sub-NUMA clusters on graniterapids
On graniterapids the cache home agent (CHA) and memory controller (IMC) PMUs all have their cpumask set to per-socket information. In order for per-NUMA-node aggregation to work correctly, the PMUs' cpumasks need to be set to the CPUs of the relevant sub-NUMA grouping.

For example, on a 2-socket graniterapids machine with sub-NUMA clustering of 3, the cpumask of the uncore_cha and uncore_imc PMUs is "0,120", leading to aggregation only on NUMA nodes 0 and 3:

```
$ perf stat --per-node -e 'UNC_CHA_CLOCKTICKS,UNC_M_CLOCKTICKS' -a sleep 1

 Performance counter stats for 'system wide':

N0        1      277,835,681,344      UNC_CHA_CLOCKTICKS
N0        1       19,242,894,228      UNC_M_CLOCKTICKS
N3        1      277,803,448,124      UNC_CHA_CLOCKTICKS
N3        1       19,240,741,498      UNC_M_CLOCKTICKS

       1.002113847 seconds time elapsed
```

By updating the PMUs' cpumasks to "0,120", "40,160" and "80,200", the correct 6 NUMA node aggregations are achieved:

```
$ perf stat --per-node -e 'UNC_CHA_CLOCKTICKS,UNC_M_CLOCKTICKS' -a sleep 1

 Performance counter stats for 'system wide':

N0        1       92,748,667,796      UNC_CHA_CLOCKTICKS
N0        0        6,424,021,142      UNC_M_CLOCKTICKS
N1        0       92,753,504,424      UNC_CHA_CLOCKTICKS
N1        1        6,424,308,338      UNC_M_CLOCKTICKS
N2        0       92,751,170,084      UNC_CHA_CLOCKTICKS
N2        0        6,424,227,402      UNC_M_CLOCKTICKS
N3        1       92,745,944,144      UNC_CHA_CLOCKTICKS
N3        0        6,423,752,086      UNC_M_CLOCKTICKS
N4        0       92,725,793,788      UNC_CHA_CLOCKTICKS
N4        1        6,422,393,266      UNC_M_CLOCKTICKS
N5        0       92,717,504,388      UNC_CHA_CLOCKTICKS
N5        0        6,421,842,618      UNC_M_CLOCKTICKS

       1.003406645 seconds time elapsed
```

In general, having the perf tool adjust cpumasks isn't desirable; ideally the PMU driver would advertise the correct cpumask.

Signed-off-by: Ian Rogers <[email protected]>
Tested-by: Kan Liang <[email protected]>
Tested-by: Weilin Wang <[email protected]>
Cc: Adrian Hunter <[email protected]>
Cc: Alexander Shishkin <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: Jiri Olsa <[email protected]>
Cc: Mark Rutland <[email protected]>
Cc: Namhyung Kim <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Ravi Bangoria <[email protected]>
Link: https://lore.kernel.org/r/[email protected]
Signed-off-by: Arnaldo Carvalho de Melo <[email protected]>
1 parent 9e893da commit 0ffca60
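
For intuition, the CHA adjustment in this patch reduces to integer arithmetic: a CHA's sub-NUMA cluster (SNC) index is its number divided by the number of CHAs per SNC node, and the cpumask is then shifted by the first CPU of the matching NUMA node. Below is a minimal standalone sketch of that arithmetic using the 2-socket, 3-way SNC example from the commit message; the per-socket CHA count of 120 is assumed purely for illustration:

```c
#include <stdio.h>

int main(void)
{
	/* Illustrative topology (assumed): 2 sockets, 3-way SNC. */
	int num_cha = 120;   /* CHA PMUs per socket, assumed for this example */
	int snc_nodes = 3;   /* sub-NUMA clusters per socket */
	int cha_num = 60;    /* parsed from the PMU name "uncore_cha_60" */

	int chas_per_node = num_cha / snc_nodes; /* 40 CHAs per SNC node */
	int cha_snc = cha_num / chas_per_node;   /* 60 / 40 = 1, the 2nd SNC */

	/*
	 * Each per-socket cpumask entry is then shifted by the first CPU of
	 * NUMA node <cha_snc>: "0,120" becomes "40,160" when node 1 starts
	 * at CPU 40.
	 */
	printf("uncore_cha_%d maps to SNC %d\n", cha_num, cha_snc);
	return 0;
}
```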

File tree

  • tools/perf/arch/x86/util/pmu.c

1 file changed: +263 −5 lines changed

tools/perf/arch/x86/util/pmu.c

Lines changed: 263 additions & 5 deletions
```diff
@@ -8,6 +8,8 @@
 #include <linux/perf_event.h>
 #include <linux/zalloc.h>
 #include <api/fs/fs.h>
+#include <api/io_dir.h>
+#include <internal/cpumap.h>
 #include <errno.h>
 
 #include "../../../util/intel-pt.h"
@@ -16,7 +18,256 @@
 #include "../../../util/fncache.h"
 #include "../../../util/pmus.h"
 #include "mem-events.h"
+#include "util/debug.h"
 #include "util/env.h"
+#include "util/header.h"
+
+static bool x86__is_intel_graniterapids(void)
+{
+	static bool checked_if_graniterapids;
+	static bool is_graniterapids;
+
+	if (!checked_if_graniterapids) {
+		const char *graniterapids_cpuid = "GenuineIntel-6-A[DE]";
+		char *cpuid = get_cpuid_str((struct perf_cpu){0});
+
+		is_graniterapids = cpuid && strcmp_cpuid_str(graniterapids_cpuid, cpuid) == 0;
+		free(cpuid);
+		checked_if_graniterapids = true;
+	}
+	return is_graniterapids;
+}
+
+static struct perf_cpu_map *read_sysfs_cpu_map(const char *sysfs_path)
+{
+	struct perf_cpu_map *cpus;
+	char *buf = NULL;
+	size_t buf_len;
+
+	if (sysfs__read_str(sysfs_path, &buf, &buf_len) < 0)
+		return NULL;
+
+	cpus = perf_cpu_map__new(buf);
+	free(buf);
+	return cpus;
+}
+
+static int snc_nodes_per_l3_cache(void)
+{
+	static bool checked_snc;
+	static int snc_nodes;
+
+	if (!checked_snc) {
+		struct perf_cpu_map *node_cpus =
+			read_sysfs_cpu_map("devices/system/node/node0/cpulist");
+		struct perf_cpu_map *cache_cpus =
+			read_sysfs_cpu_map("devices/system/cpu/cpu0/cache/index3/shared_cpu_list");
+
+		snc_nodes = perf_cpu_map__nr(cache_cpus) / perf_cpu_map__nr(node_cpus);
+		perf_cpu_map__put(cache_cpus);
+		perf_cpu_map__put(node_cpus);
+		checked_snc = true;
+	}
+	return snc_nodes;
+}
+
+static bool starts_with(const char *str, const char *prefix)
+{
+	return !strncmp(prefix, str, strlen(prefix));
+}
+
+static int num_chas(void)
+{
+	static bool checked_chas;
+	static int num_chas;
+
+	if (!checked_chas) {
+		int fd = perf_pmu__event_source_devices_fd();
+		struct io_dir dir;
+		struct io_dirent64 *dent;
+
+		if (fd < 0)
+			return -1;
+
+		io_dir__init(&dir, fd);
+
+		while ((dent = io_dir__readdir(&dir)) != NULL) {
+			/* Note, dent->d_type will be DT_LNK and so isn't a useful filter. */
+			if (starts_with(dent->d_name, "uncore_cha_"))
+				num_chas++;
+		}
+		close(fd);
+		checked_chas = true;
+	}
+	return num_chas;
+}
+
+#define MAX_SNCS 6
+
+static int uncore_cha_snc(struct perf_pmu *pmu)
+{
+	// CHA SNC numbers are ordered to correspond to the CHA numbers.
+	unsigned int cha_num;
+	int num_cha, chas_per_node, cha_snc;
+	int snc_nodes = snc_nodes_per_l3_cache();
+
+	if (snc_nodes <= 1)
+		return 0;
+
+	num_cha = num_chas();
+	if (num_cha <= 0) {
+		pr_warning("Unexpected: no CHAs found\n");
+		return 0;
+	}
+
+	/* Compute SNC for PMU. */
+	if (sscanf(pmu->name, "uncore_cha_%u", &cha_num) != 1) {
+		pr_warning("Unexpected: unable to compute CHA number '%s'\n", pmu->name);
+		return 0;
+	}
+	chas_per_node = num_cha / snc_nodes;
+	cha_snc = cha_num / chas_per_node;
+
+	/* Range check cha_snc for unexpected out of bounds. */
+	return cha_snc >= MAX_SNCS ? 0 : cha_snc;
+}
+
+static int uncore_imc_snc(struct perf_pmu *pmu)
+{
+	// Compute the IMC SNC using lookup tables.
+	unsigned int imc_num;
+	int snc_nodes = snc_nodes_per_l3_cache();
+	const u8 snc2_map[] = {1, 1, 0, 0, 1, 1, 0, 0};
+	const u8 snc3_map[] = {1, 1, 0, 0, 2, 2, 1, 1, 0, 0, 2, 2};
+	const u8 *snc_map;
+	size_t snc_map_len;
+
+	switch (snc_nodes) {
+	case 2:
+		snc_map = snc2_map;
+		snc_map_len = ARRAY_SIZE(snc2_map);
+		break;
+	case 3:
+		snc_map = snc3_map;
+		snc_map_len = ARRAY_SIZE(snc3_map);
+		break;
+	default:
+		/* Error or no lookup support for SNC with >3 nodes. */
+		return 0;
+	}
+
+	/* Compute SNC for PMU. */
+	if (sscanf(pmu->name, "uncore_imc_%u", &imc_num) != 1) {
+		pr_warning("Unexpected: unable to compute IMC number '%s'\n", pmu->name);
+		return 0;
+	}
+	if (imc_num >= snc_map_len) {
+		pr_warning("Unexpected IMC %d for SNC%d mapping\n", imc_num, snc_nodes);
+		return 0;
+	}
+	return snc_map[imc_num];
+}
+
+static int uncore_cha_imc_compute_cpu_adjust(int pmu_snc)
+{
+	static bool checked_cpu_adjust[MAX_SNCS];
+	static int cpu_adjust[MAX_SNCS];
+	struct perf_cpu_map *node_cpus;
+	char node_path[] = "devices/system/node/node0/cpulist";
+
+	/* Was adjust already computed? */
+	if (checked_cpu_adjust[pmu_snc])
+		return cpu_adjust[pmu_snc];
+
+	/* SNC0 doesn't need an adjust. */
+	if (pmu_snc == 0) {
+		cpu_adjust[0] = 0;
+		checked_cpu_adjust[0] = true;
+		return 0;
+	}
+
+	/*
+	 * Use NUMA topology to compute the first CPU of the NUMA node. We
+	 * want to adjust CPU 0 to be this, and similarly for other CPUs if
+	 * there is >1 socket.
+	 */
+	assert(pmu_snc >= 0 && pmu_snc <= 9);
+	node_path[24] += pmu_snc; // Shift node0 to be node<pmu_snc>.
+	node_cpus = read_sysfs_cpu_map(node_path);
+	cpu_adjust[pmu_snc] = perf_cpu_map__cpu(node_cpus, 0).cpu;
+	if (cpu_adjust[pmu_snc] < 0) {
+		pr_debug("Failed to read valid CPU list from <sysfs>/%s\n", node_path);
+		cpu_adjust[pmu_snc] = 0;
+	} else {
+		checked_cpu_adjust[pmu_snc] = true;
+	}
+	perf_cpu_map__put(node_cpus);
+	return cpu_adjust[pmu_snc];
+}
+
+static void gnr_uncore_cha_imc_adjust_cpumask_for_snc(struct perf_pmu *pmu, bool cha)
+{
+	// With sub-NUMA clustering (SNC) there is a NUMA node per SNC in the
+	// topology. For example, a two socket graniterapids machine may be
+	// set up with 3-way SNC meaning there are 6 NUMA nodes that should be
+	// displayed with --per-node. The cpumask of the CHA and IMC PMUs
+	// reflects per-socket information meaning, for example, uncore_cha_60
+	// on a two socket graniterapids machine with 120 cores per socket
+	// will have a cpumask of "0,120". This cpumask needs adjusting to
+	// "40,160" to reflect that uncore_cha_60 is used for the 2nd SNC of
+	// each socket. Without the adjustment, events on uncore_cha_60 will
+	// appear in node 0 and node 3 (in our example 2 socket 3-way set up),
+	// but with the adjustment they will appear in node 1 and node 4. The
+	// number of CHAs is typically larger than the number of cores. The
+	// CHA numbers are assumed to split evenly and in order wrt core
+	// numbers. There are fewer memory IMC PMUs than cores and the mapping
+	// is handled using lookup tables.
+	static struct perf_cpu_map *cha_adjusted[MAX_SNCS];
+	static struct perf_cpu_map *imc_adjusted[MAX_SNCS];
+	struct perf_cpu_map **adjusted = cha ? cha_adjusted : imc_adjusted;
+	int idx, pmu_snc, cpu_adjust;
+	struct perf_cpu cpu;
+	bool alloc;
+
+	// The cpumask from the kernel holds the first CPU of each socket, e.g. 0,120.
+	if (perf_cpu_map__cpu(pmu->cpus, 0).cpu != 0) {
+		pr_debug("Ignoring cpumask adjust for %s as unexpected first CPU\n", pmu->name);
+		return;
+	}
+
+	pmu_snc = cha ? uncore_cha_snc(pmu) : uncore_imc_snc(pmu);
+	if (pmu_snc == 0) {
+		// No adjustment necessary for the first SNC.
+		return;
+	}
+
+	alloc = adjusted[pmu_snc] == NULL;
+	if (alloc) {
+		// Hold onto the perf_cpu_map globally to avoid recomputation.
+		cpu_adjust = uncore_cha_imc_compute_cpu_adjust(pmu_snc);
+		adjusted[pmu_snc] = perf_cpu_map__empty_new(perf_cpu_map__nr(pmu->cpus));
+		if (!adjusted[pmu_snc])
+			return;
+	}
+
+	perf_cpu_map__for_each_cpu(cpu, idx, pmu->cpus) {
+		// Compute the new cpu map values or, if not allocating, assert
+		// that they match expectations. asserts will be removed to
+		// avoid overhead in NDEBUG builds.
+		if (alloc) {
+			RC_CHK_ACCESS(adjusted[pmu_snc])->map[idx].cpu = cpu.cpu + cpu_adjust;
+		} else if (idx == 0) {
+			cpu_adjust = perf_cpu_map__cpu(adjusted[pmu_snc], idx).cpu - cpu.cpu;
+			assert(uncore_cha_imc_compute_cpu_adjust(pmu_snc) == cpu_adjust);
+		} else {
+			assert(perf_cpu_map__cpu(adjusted[pmu_snc], idx).cpu ==
+			       cpu.cpu + cpu_adjust);
+		}
+	}
+
+	perf_cpu_map__put(pmu->cpus);
+	pmu->cpus = perf_cpu_map__get(adjusted[pmu_snc]);
+}
 
 void perf_pmu__arch_init(struct perf_pmu *pmu)
 {
@@ -49,10 +300,17 @@ void perf_pmu__arch_init(struct perf_pmu *pmu)
 
 		perf_mem_events__loads_ldlat = 0;
 		pmu->mem_events = perf_mem_events_amd_ldlat;
-	} else if (pmu->is_core) {
-		if (perf_pmu__have_event(pmu, "mem-loads-aux"))
-			pmu->mem_events = perf_mem_events_intel_aux;
-		else
-			pmu->mem_events = perf_mem_events_intel;
+	} else {
+		if (pmu->is_core) {
+			if (perf_pmu__have_event(pmu, "mem-loads-aux"))
+				pmu->mem_events = perf_mem_events_intel_aux;
+			else
+				pmu->mem_events = perf_mem_events_intel;
+		} else if (x86__is_intel_graniterapids()) {
+			if (starts_with(pmu->name, "uncore_cha_"))
+				gnr_uncore_cha_imc_adjust_cpumask_for_snc(pmu, /*cha=*/true);
+			else if (starts_with(pmu->name, "uncore_imc_"))
+				gnr_uncore_cha_imc_adjust_cpumask_for_snc(pmu, /*cha=*/false);
+		}
 	}
 }
```
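
The SNC node count detection used by snc_nodes_per_l3_cache() above depends only on standard sysfs topology files, so the ratio can be reproduced outside the perf tree. The following is a minimal C sketch of the same computation (error handling reduced for brevity; the sysfs paths are the standard kernel ones):

```c
#include <stdio.h>
#include <string.h>

/* Count the CPUs named by a sysfs cpulist file such as "0-39,120-159". */
static int count_cpulist(const char *path)
{
	FILE *f = fopen(path, "r");
	char buf[4096];
	int count = 0;

	if (!f)
		return -1;
	if (!fgets(buf, sizeof(buf), f)) {
		fclose(f);
		return -1;
	}
	fclose(f);

	for (char *tok = strtok(buf, ","); tok; tok = strtok(NULL, ",")) {
		int lo, hi;

		if (sscanf(tok, "%d-%d", &lo, &hi) == 2)
			count += hi - lo + 1;
		else if (sscanf(tok, "%d", &lo) == 1)
			count++;
	}
	return count;
}

int main(void)
{
	/* CPUs in NUMA node 0 versus CPUs sharing cpu0's L3 cache. */
	int node = count_cpulist("/sys/devices/system/node/node0/cpulist");
	int l3 = count_cpulist("/sys/devices/system/cpu/cpu0/cache/index3/shared_cpu_list");

	if (node > 0 && l3 > 0)
		printf("SNC nodes per L3 cache: %d\n", l3 / node);
	return 0;
}
```

With SNC disabled the two lists cover the same CPUs and the ratio is 1, which the patch treats as "no adjustment needed".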
