|
8 | 8 | #include <linux/perf_event.h>
|
9 | 9 | #include <linux/zalloc.h>
|
10 | 10 | #include <api/fs/fs.h>
|
| 11 | +#include <api/io_dir.h> |
| 12 | +#include <internal/cpumap.h> |
11 | 13 | #include <errno.h>
|
12 | 14 |
|
13 | 15 | #include "../../../util/intel-pt.h"
|
|
16 | 18 | #include "../../../util/fncache.h"
|
17 | 19 | #include "../../../util/pmus.h"
|
18 | 20 | #include "mem-events.h"
|
| 21 | +#include "util/debug.h" |
19 | 22 | #include "util/env.h"
|
| 23 | +#include "util/header.h" |
| 24 | + |
| 25 | +static bool x86__is_intel_graniterapids(void) |
| 26 | +{ |
| 27 | + static bool checked_if_graniterapids; |
| 28 | + static bool is_graniterapids; |
| 29 | + |
| 30 | + if (!checked_if_graniterapids) { |
| 31 | + const char *graniterapids_cpuid = "GenuineIntel-6-A[DE]"; |
| 32 | + char *cpuid = get_cpuid_str((struct perf_cpu){0}); |
| 33 | + |
| 34 | + is_graniterapids = cpuid && strcmp_cpuid_str(graniterapids_cpuid, cpuid) == 0; |
| 35 | + free(cpuid); |
| 36 | + checked_if_graniterapids = true; |
| 37 | + } |
| 38 | + return is_graniterapids; |
| 39 | +} |
| 40 | + |
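The "GenuineIntel-6-A[DE]" string is treated as a pattern by strcmp_cpuid_str(), so both Granite Rapids variants are accepted. A minimal, stand-alone sketch of roughly the same check, assuming cpuid strings shaped like "GenuineIntel-6-AD-1" (the real helper is more general than this hand-rolled comparison):

    #include <stdbool.h>
    #include <string.h>

    /* Rough approximation of matching "GenuineIntel-6-A[DE]". */
    static bool cpuid_is_graniterapids(const char *cpuid)
    {
            return cpuid &&
                   strncmp(cpuid, "GenuineIntel-6-A", 16) == 0 &&
                   (cpuid[16] == 'D' || cpuid[16] == 'E');
    }

    int main(void)
    {
            return cpuid_is_graniterapids("GenuineIntel-6-AD-1") ? 0 : 1;
    }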
| 41 | +static struct perf_cpu_map *read_sysfs_cpu_map(const char *sysfs_path) |
| 42 | +{ |
| 43 | + struct perf_cpu_map *cpus; |
| 44 | + char *buf = NULL; |
| 45 | + size_t buf_len; |
| 46 | + |
| 47 | + if (sysfs__read_str(sysfs_path, &buf, &buf_len) < 0) |
| 48 | + return NULL; |
| 49 | + |
| 50 | + cpus = perf_cpu_map__new(buf); |
| 51 | + free(buf); |
| 52 | + return cpus; |
| 53 | +} |
| 54 | + |
| 55 | +static int snc_nodes_per_l3_cache(void) |
| 56 | +{ |
| 57 | + static bool checked_snc; |
| 58 | + static int snc_nodes; |
| 59 | + |
| 60 | + if (!checked_snc) { |
| 61 | + struct perf_cpu_map *node_cpus = |
| 62 | + read_sysfs_cpu_map("devices/system/node/node0/cpulist"); |
| 63 | + struct perf_cpu_map *cache_cpus = |
| 64 | + read_sysfs_cpu_map("devices/system/cpu/cpu0/cache/index3/shared_cpu_list"); |
| 65 | + |
| 66 | + snc_nodes = perf_cpu_map__nr(cache_cpus) / perf_cpu_map__nr(node_cpus); |
| 67 | + perf_cpu_map__put(cache_cpus); |
| 68 | + perf_cpu_map__put(node_cpus); |
| 69 | + checked_snc = true; |
| 70 | + } |
| 71 | + return snc_nodes; |
| 72 | +} |
| 73 | + |
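The SNC factor falls out of a single division: the number of CPUs sharing cpu0's L3 divided by the number of CPUs in NUMA node 0. A worked sketch with hypothetical counts for a 3-way SNC Granite Rapids socket (numbers assumed for illustration, not taken from real sysfs output):

    #include <stdio.h>

    int main(void)
    {
            /* Assumed: 120 CPUs share cpu0's L3 cache, while node0's
             * cpulist holds only 40 of them under 3-way SNC.
             */
            int cache_cpus = 120, node_cpus = 40;

            printf("snc_nodes = %d\n", cache_cpus / node_cpus); /* prints 3 */
            return 0;
    }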
| 74 | +static bool starts_with(const char *str, const char *prefix) |
| 75 | +{ |
| 76 | + return !strncmp(prefix, str, strlen(prefix)); |
| 77 | +} |
| 78 | + |
| 79 | +static int num_chas(void) |
| 80 | +{ |
| 81 | + static bool checked_chas; |
| 82 | + static int num_chas; |
| 83 | + |
| 84 | + if (!checked_chas) { |
| 85 | + int fd = perf_pmu__event_source_devices_fd(); |
| 86 | + struct io_dir dir; |
| 87 | + struct io_dirent64 *dent; |
| 88 | + |
| 89 | + if (fd < 0) |
| 90 | + return -1; |
| 91 | + |
| 92 | + io_dir__init(&dir, fd); |
| 93 | + |
| 94 | + while ((dent = io_dir__readdir(&dir)) != NULL) { |
| 95 | + /* Note, dent->d_type will be DT_LNK and so isn't a useful filter. */ |
| 96 | + if (starts_with(dent->d_name, "uncore_cha_")) |
| 97 | + num_chas++; |
| 98 | + } |
| 99 | + close(fd); |
| 100 | + checked_chas = true; |
| 101 | + } |
| 102 | + return num_chas; |
| 103 | +} |
| 104 | + |
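The count is simply the number of uncore_cha_<N> entries under the event_source bus in sysfs. A stand-alone equivalent using plain opendir()/readdir() (the path below is the standard sysfs location that perf_pmu__event_source_devices_fd() is expected to open):

    #include <dirent.h>
    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
            /* Count PMU directories whose name starts with "uncore_cha_". */
            DIR *dir = opendir("/sys/bus/event_source/devices");
            struct dirent *dent;
            int num = 0;

            if (!dir)
                    return 1;
            while ((dent = readdir(dir)) != NULL) {
                    if (!strncmp(dent->d_name, "uncore_cha_", strlen("uncore_cha_")))
                            num++;
            }
            closedir(dir);
            printf("%d CHA PMUs\n", num);
            return 0;
    }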
| 105 | +#define MAX_SNCS 6 |
| 106 | + |
| 107 | +static int uncore_cha_snc(struct perf_pmu *pmu) |
| 108 | +{ |
| 109 | + // CHA SNC numbers are ordered to correspond to the CHA numbers. |
| 110 | + unsigned int cha_num; |
| 111 | + int num_cha, chas_per_node, cha_snc; |
| 112 | + int snc_nodes = snc_nodes_per_l3_cache(); |
| 113 | + |
| 114 | + if (snc_nodes <= 1) |
| 115 | + return 0; |
| 116 | + |
| 117 | + num_cha = num_chas(); |
| 118 | + if (num_cha <= 0) { |
| 119 | + pr_warning("Unexpected: no CHAs found\n"); |
| 120 | + return 0; |
| 121 | + } |
| 122 | + |
| 123 | + /* Compute SNC for PMU. */ |
| 124 | + if (sscanf(pmu->name, "uncore_cha_%u", &cha_num) != 1) { |
| 125 | + pr_warning("Unexpected: unable to compute CHA number '%s'\n", pmu->name); |
| 126 | + return 0; |
| 127 | + } |
| 128 | + chas_per_node = num_cha / snc_nodes; |
| 129 | + cha_snc = cha_num / chas_per_node; |
| 130 | + |
| 131 | + /* Range check cha_snc in case of unexpected out-of-bounds values. */ |
| 132 | + return cha_snc >= MAX_SNCS ? 0 : cha_snc; |
| 133 | +} |
| 134 | + |
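The CHA-to-SNC mapping is plain integer arithmetic once the CHA and SNC counts are known. Worked through with the numbers used in the comment further down (120 CHAs per socket and 3-way SNC, assumed for illustration):

    #include <stdio.h>

    int main(void)
    {
            /* Assumed: 120 CHAs per socket, 3-way SNC. uncore_cha_60 then
             * lands in SNC domain 1, the second SNC of its socket.
             */
            int num_cha = 120, snc_nodes = 3, cha_num = 60;
            int chas_per_node = num_cha / snc_nodes;           /* 40 */

            printf("cha_snc = %d\n", cha_num / chas_per_node); /* prints 1 */
            return 0;
    }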
| 135 | +static int uncore_imc_snc(struct perf_pmu *pmu) |
| 136 | +{ |
| 137 | + // Compute the IMC SNC using lookup tables. |
| 138 | + unsigned int imc_num; |
| 139 | + int snc_nodes = snc_nodes_per_l3_cache(); |
| 140 | + const u8 snc2_map[] = {1, 1, 0, 0, 1, 1, 0, 0}; |
| 141 | + const u8 snc3_map[] = {1, 1, 0, 0, 2, 2, 1, 1, 0, 0, 2, 2}; |
| 142 | + const u8 *snc_map; |
| 143 | + size_t snc_map_len; |
| 144 | + |
| 145 | + switch (snc_nodes) { |
| 146 | + case 2: |
| 147 | + snc_map = snc2_map; |
| 148 | + snc_map_len = ARRAY_SIZE(snc2_map); |
| 149 | + break; |
| 150 | + case 3: |
| 151 | + snc_map = snc3_map; |
| 152 | + snc_map_len = ARRAY_SIZE(snc3_map); |
| 153 | + break; |
| 154 | + default: |
| 155 | + /* Error or no lookup support for SNC with >3 nodes. */ |
| 156 | + return 0; |
| 157 | + } |
| 158 | + |
| 159 | + /* Compute SNC for PMU. */ |
| 160 | + if (sscanf(pmu->name, "uncore_imc_%u", &imc_num) != 1) { |
| 161 | + pr_warning("Unexpected: unable to compute IMC number '%s'\n", pmu->name); |
| 162 | + return 0; |
| 163 | + } |
| 164 | + if (imc_num >= snc_map_len) { |
| 165 | + pr_warning("Unexpected IMC %d for SNC%d mapping\n", imc_num, snc_nodes); |
| 166 | + return 0; |
| 167 | + } |
| 168 | + return snc_map[imc_num]; |
| 169 | +} |
| 170 | + |
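Unlike the CHAs, IMC numbering does not grow monotonically with the SNC domain, hence the lookup tables instead of a division. Reading the 3-way table above for a couple of hypothetical PMU names:

    #include <stdio.h>

    int main(void)
    {
            /* Same values as snc3_map above: the index is the IMC number. */
            const unsigned char snc3_map[] = {1, 1, 0, 0, 2, 2, 1, 1, 0, 0, 2, 2};

            printf("uncore_imc_0 -> SNC %d\n", snc3_map[0]); /* 1 */
            printf("uncore_imc_4 -> SNC %d\n", snc3_map[4]); /* 2 */
            return 0;
    }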
| 171 | +static int uncore_cha_imc_compute_cpu_adjust(int pmu_snc) |
| 172 | +{ |
| 173 | + static bool checked_cpu_adjust[MAX_SNCS]; |
| 174 | + static int cpu_adjust[MAX_SNCS]; |
| 175 | + struct perf_cpu_map *node_cpus; |
| 176 | + char node_path[] = "devices/system/node/node0/cpulist"; |
| 177 | + |
| 178 | + /* Was adjust already computed? */ |
| 179 | + if (checked_cpu_adjust[pmu_snc]) |
| 180 | + return cpu_adjust[pmu_snc]; |
| 181 | + |
| 182 | + /* SNC0 doesn't need an adjust. */ |
| 183 | + if (pmu_snc == 0) { |
| 184 | + cpu_adjust[0] = 0; |
| 185 | + checked_cpu_adjust[0] = true; |
| 186 | + return 0; |
| 187 | + } |
| 188 | + |
| 189 | + /* |
| 190 | + * Use NUMA topology to compute first CPU of the NUMA node, we want to |
| 191 | + * adjust CPU 0 to be this and similarly for other CPUs if there is >1 |
| 192 | + * socket. |
| 193 | + */ |
| 194 | + assert(pmu_snc >= 0 && pmu_snc <= 9); |
| 195 | + node_path[24] += pmu_snc; // Shift node0 to be node<pmu_snc>. |
| 196 | + node_cpus = read_sysfs_cpu_map(node_path); |
| 197 | + cpu_adjust[pmu_snc] = perf_cpu_map__cpu(node_cpus, 0).cpu; |
| 198 | + if (cpu_adjust[pmu_snc] < 0) { |
| 199 | + pr_debug("Failed to read valid CPU list from <sysfs>/%s\n", node_path); |
| 200 | + cpu_adjust[pmu_snc] = 0; |
| 201 | + } else { |
| 202 | + checked_cpu_adjust[pmu_snc] = true; |
| 203 | + } |
| 204 | + perf_cpu_map__put(node_cpus); |
| 205 | + return cpu_adjust[pmu_snc]; |
| 206 | +} |
| 207 | + |
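The in-place edit of node_path[] works because index 24 is the '0' in "node0", which is also why the assert restricts pmu_snc to a single decimal digit. A small stand-alone sketch of the same trick:

    #include <assert.h>
    #include <stdio.h>

    int main(void)
    {
            char node_path[] = "devices/system/node/node0/cpulist";
            int pmu_snc = 2; /* hypothetical SNC domain */

            assert(pmu_snc >= 0 && pmu_snc <= 9); /* single-digit rewrite only */
            node_path[24] += pmu_snc;             /* "node0" -> "node2" */
            printf("%s\n", node_path);            /* devices/system/node/node2/cpulist */
            return 0;
    }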
| 208 | +static void gnr_uncore_cha_imc_adjust_cpumask_for_snc(struct perf_pmu *pmu, bool cha) |
| 209 | +{ |
| 210 | + // With sub-NUMA clustering (SNC) there is a NUMA node per SNC in the |
| 211 | + // topology. For example, a two socket graniterapids machine may be set |
| 212 | + // up with 3-way SNC meaning there are 6 NUMA nodes that should be |
| 213 | + // displayed with --per-node. The cpumask of the CHA and IMC PMUs |
| 214 | + // reflects per-socket information meaning, for example, uncore_cha_60 |
| 215 | + // on a two socket graniterapids machine with 120 cores per socket will |
| 216 | + // have a cpumask of "0,120". This cpumask needs adjusting to "40,160" |
| 217 | + // to reflect that uncore_cha_60 is used for the 2nd SNC of each |
| 218 | + // socket. Without the adjustment events on uncore_cha_60 will appear in |
| 219 | + // node 0 and node 3 (in our example 2 socket 3-way set up), but with |
| 220 | + // the adjustment they will appear in node 1 and node 4. The number of |
| 221 | + // CHAs is typically larger than the number of cores. The CHA numbers |
| 222 | + // are assumed to split evenly and in order wrt core numbers. There are |
| 223 | + // fewer memory IMC PMUs than cores and mapping is handled using lookup |
| 224 | + // tables. |
| 225 | + static struct perf_cpu_map *cha_adjusted[MAX_SNCS]; |
| 226 | + static struct perf_cpu_map *imc_adjusted[MAX_SNCS]; |
| 227 | + struct perf_cpu_map **adjusted = cha ? cha_adjusted : imc_adjusted; |
| 228 | + int idx, pmu_snc, cpu_adjust; |
| 229 | + struct perf_cpu cpu; |
| 230 | + bool alloc; |
| 231 | + |
| 232 | + // The cpumask from the kernel holds the first CPU of each socket, e.g. 0,120. |
| 233 | + if (perf_cpu_map__cpu(pmu->cpus, 0).cpu != 0) { |
| 234 | + pr_debug("Ignoring cpumask adjust for %s as unexpected first CPU\n", pmu->name); |
| 235 | + return; |
| 236 | + } |
| 237 | + |
| 238 | + pmu_snc = cha ? uncore_cha_snc(pmu) : uncore_imc_snc(pmu); |
| 239 | + if (pmu_snc == 0) { |
| 240 | + // No adjustment necessary for the first SNC. |
| 241 | + return; |
| 242 | + } |
| 243 | + |
| 244 | + alloc = adjusted[pmu_snc] == NULL; |
| 245 | + if (alloc) { |
| 246 | + // Hold onto the perf_cpu_map globally to avoid recomputation. |
| 247 | + cpu_adjust = uncore_cha_imc_compute_cpu_adjust(pmu_snc); |
| 248 | + adjusted[pmu_snc] = perf_cpu_map__empty_new(perf_cpu_map__nr(pmu->cpus)); |
| 249 | + if (!adjusted[pmu_snc]) |
| 250 | + return; |
| 251 | + } |
| 252 | + |
| 253 | + perf_cpu_map__for_each_cpu(cpu, idx, pmu->cpus) { |
| 254 | + // Compute the new cpu map values or, if not allocating, assert |
| 255 | + // that they match expectations. The asserts are compiled out in |
| 256 | + // NDEBUG builds to avoid their overhead. |
| 257 | + if (alloc) { |
| 258 | + RC_CHK_ACCESS(adjusted[pmu_snc])->map[idx].cpu = cpu.cpu + cpu_adjust; |
| 259 | + } else if (idx == 0) { |
| 260 | + cpu_adjust = perf_cpu_map__cpu(adjusted[pmu_snc], idx).cpu - cpu.cpu; |
| 261 | + assert(uncore_cha_imc_compute_cpu_adjust(pmu_snc) == cpu_adjust); |
| 262 | + } else { |
| 263 | + assert(perf_cpu_map__cpu(adjusted[pmu_snc], idx).cpu == |
| 264 | + cpu.cpu + cpu_adjust); |
| 265 | + } |
| 266 | + } |
| 267 | + |
| 268 | + perf_cpu_map__put(pmu->cpus); |
| 269 | + pmu->cpus = perf_cpu_map__get(adjusted[pmu_snc]); |
| 270 | +} |
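Putting it together for the example in the comment: on an assumed 2-socket, 120-core-per-socket machine in 3-way SNC mode, uncore_cha_60 sits in SNC 1, whose first CPU is 40, so the kernel's per-socket cpumask "0,120" becomes "40,160":

    #include <stdio.h>

    int main(void)
    {
            /* Assumed kernel cpumask (first CPU of each socket) and the
             * adjustment for SNC 1 (first CPU of NUMA node 1).
             */
            int kernel_cpus[] = {0, 120};
            int cpu_adjust = 40;

            for (size_t i = 0; i < sizeof(kernel_cpus) / sizeof(kernel_cpus[0]); i++)
                    printf("%d -> %d\n", kernel_cpus[i], kernel_cpus[i] + cpu_adjust);
            return 0;
    }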
20 | 271 |
|
21 | 272 | void perf_pmu__arch_init(struct perf_pmu *pmu)
|
22 | 273 | {
|
@@ -49,10 +300,17 @@ void perf_pmu__arch_init(struct perf_pmu *pmu)
|
49 | 300 |
|
50 | 301 | perf_mem_events__loads_ldlat = 0;
|
51 | 302 | pmu->mem_events = perf_mem_events_amd_ldlat;
|
52 | | - } else if (pmu->is_core) { |
53 | | - if (perf_pmu__have_event(pmu, "mem-loads-aux")) |
54 | | - pmu->mem_events = perf_mem_events_intel_aux; |
55 | | - else |
56 | | - pmu->mem_events = perf_mem_events_intel; |
| 303 | + } else { |
| 304 | + if (pmu->is_core) { |
| 305 | + if (perf_pmu__have_event(pmu, "mem-loads-aux")) |
| 306 | + pmu->mem_events = perf_mem_events_intel_aux; |
| 307 | + else |
| 308 | + pmu->mem_events = perf_mem_events_intel; |
| 309 | + } else if (x86__is_intel_graniterapids()) { |
| 310 | + if (starts_with(pmu->name, "uncore_cha_")) |
| 311 | + gnr_uncore_cha_imc_adjust_cpumask_for_snc(pmu, /*cha=*/true); |
| 312 | + else if (starts_with(pmu->name, "uncore_imc_")) |
| 313 | + gnr_uncore_cha_imc_adjust_cpumask_for_snc(pmu, /*cha=*/false); |
| 314 | + } |
57 | 315 | }
|
58 | 316 | }
|