|
| 1 | +.. SPDX-License-Identifier: GPL-2.0 |
| 2 | +
|
| 3 | +.. _cpumasks-header-label: |
| 4 | + |
| 5 | +================== |
| 6 | +BPF cpumask kfuncs |
| 7 | +================== |
| 8 | + |
| 9 | +1. Introduction |
| 10 | +=============== |
| 11 | + |
| 12 | +``struct cpumask`` is a bitmap data structure in the kernel whose indices |
| 13 | +reflect the CPUs on the system. Commonly, cpumasks are used to track which CPUs |
| 14 | +a task is affinitized to, but they can also be used to e.g. track which cores |
| 15 | +are associated with a scheduling domain, which cores on a machine are idle, |
| 16 | +etc. |
| 17 | + |
| 18 | +BPF provides programs with a set of :ref:`kfuncs-header-label` that can be |
| 19 | +used to allocate, mutate, query, and free cpumasks. |
| 20 | + |
| 21 | +2. BPF cpumask objects |
| 22 | +====================== |
| 23 | + |
| 24 | +There are two different types of cpumasks that can be used by BPF programs. |
| 25 | + |
| 26 | +2.1 ``struct bpf_cpumask *`` |
| 27 | +---------------------------- |
| 28 | + |
| 29 | +``struct bpf_cpumask *`` is a cpumask that is allocated by BPF, on behalf of a |
| 30 | +BPF program, and whose lifecycle is entirely controlled by BPF. These cpumasks |
| 31 | +are RCU-protected, can be mutated, can be used as kptrs, and can be safely cast |
| 32 | +to a ``struct cpumask *``. |
| 33 | + |
| 34 | +2.1.1 ``struct bpf_cpumask *`` lifecycle |
| 35 | +---------------------------------------- |
| 36 | + |
| 37 | +A ``struct bpf_cpumask *`` is allocated, acquired, and released, using the |
| 38 | +following functions: |
| 39 | + |
| 40 | +.. kernel-doc:: kernel/bpf/cpumask.c |
| 41 | + :identifiers: bpf_cpumask_create |
| 42 | + |
| 43 | +.. kernel-doc:: kernel/bpf/cpumask.c |
| 44 | + :identifiers: bpf_cpumask_acquire |
| 45 | + |
| 46 | +.. kernel-doc:: kernel/bpf/cpumask.c |
| 47 | + :identifiers: bpf_cpumask_release |
| 48 | + |
| 49 | +For example: |
| 50 | + |
| 51 | +.. code-block:: c |
| 52 | +
|
| 53 | + struct cpumask_map_value { |
| 54 | + struct bpf_cpumask __kptr_ref * cpumask; |
| 55 | + }; |
| 56 | +
|
| 57 | + struct array_map { |
| 58 | + __uint(type, BPF_MAP_TYPE_ARRAY); |
| 59 | + __type(key, int); |
| 60 | + __type(value, struct cpumask_map_value); |
| 61 | + __uint(max_entries, 65536); |
| 62 | + } cpumask_map SEC(".maps"); |
| 63 | +
|
| 64 | + static int cpumask_map_insert(struct bpf_cpumask *mask, u32 pid) |
| 65 | + { |
| 66 | + struct cpumask_map_value local, *v; |
| 67 | + long status; |
| 68 | + struct bpf_cpumask *old; |
| 69 | + u32 key = pid; |
| 70 | +
|
| 71 | + local.cpumask = NULL; |
| 72 | + status = bpf_map_update_elem(&cpumask_map, &key, &local, 0); |
| 73 | + if (status) { |
| 74 | + bpf_cpumask_release(mask); |
| 75 | + return status; |
| 76 | + } |
| 77 | +
|
| 78 | + v = bpf_map_lookup_elem(&cpumask_map, &key); |
| 79 | + if (!v) { |
| 80 | + bpf_cpumask_release(mask); |
| 81 | + return -ENOENT; |
| 82 | + } |
| 83 | +
|
| 84 | + old = bpf_kptr_xchg(&v->cpumask, mask); |
| 85 | + if (old) |
| 86 | + bpf_cpumask_release(old); |
| 87 | +
|
| 88 | + return 0; |
| 89 | + } |
| 90 | +
|
| 91 | + /** |
| 92 | + * A sample tracepoint showing how a task's cpumask can be queried and |
| 93 | + * recorded as a kptr. |
| 94 | + */ |
| 95 | + SEC("tp_btf/task_newtask") |
| 96 | + int BPF_PROG(record_task_cpumask, struct task_struct *task, u64 clone_flags) |
| 97 | + { |
| 98 | + struct bpf_cpumask *cpumask; |
| 99 | + int ret; |
| 100 | +
|
| 101 | + cpumask = bpf_cpumask_create(); |
| 102 | + if (!cpumask) |
| 103 | + return -ENOMEM; |
| 104 | +
|
| 105 | + if (!bpf_cpumask_full(task->cpus_ptr)) |
| 106 | + bpf_printk("task %s has CPU affinity", task->comm); |
| 107 | +
|
| 108 | + bpf_cpumask_copy(cpumask, task->cpus_ptr); |
| 109 | + return cpumask_map_insert(cpumask, task->pid); |
| 110 | + } |
| 111 | +
|
| 112 | +---- |
| 113 | + |
| 114 | +2.1.2 ``struct bpf_cpumask *`` as kptrs |
| 115 | +--------------------------------------- |
| 116 | + |
| 117 | +As mentioned and illustrated above, these ``struct bpf_cpumask *`` objects can |
| 118 | +also be stored in a map and used as kptrs. If a ``struct bpf_cpumask *`` is in |
| 119 | +a map, the reference can be removed from the map with bpf_kptr_xchg(), or |
| 120 | +opportunistically acquired with bpf_cpumask_kptr_get(): |
| 121 | + |
| 122 | +.. kernel-doc:: kernel/bpf/cpumask.c |
| 123 | + :identifiers: bpf_cpumask_kptr_get |
| 124 | + |
| 125 | +Here is an example of a ``struct bpf_cpumask *`` being retrieved from a map: |
| 126 | + |
| 127 | +.. code-block:: c |
| 128 | +
|
| 129 | + /* struct containing the struct bpf_cpumask kptr which is stored in the map. */ |
| 130 | + struct cpumasks_kfunc_map_value { |
| 131 | + struct bpf_cpumask __kptr_ref * bpf_cpumask; |
| 132 | + }; |
| 133 | +
|
| 134 | + /* The map containing struct cpumasks_kfunc_map_value entries. */ |
| 135 | + struct { |
| 136 | + __uint(type, BPF_MAP_TYPE_ARRAY); |
| 137 | + __type(key, int); |
| 138 | + __type(value, struct cpumasks_kfunc_map_value); |
| 139 | + __uint(max_entries, 1); |
| 140 | + } cpumasks_kfunc_map SEC(".maps"); |
| 141 | +
|
| 142 | + /* ... */ |
| 143 | +
|
| 144 | + /** |
| 145 | + * A simple example tracepoint program showing how a |
| 146 | + * struct bpf_cpumask * kptr that is stored in a map can |
| 147 | + * be acquired using the bpf_cpumask_kptr_get() kfunc. |
| 148 | + */ |
| 149 | + SEC("tp_btf/cgroup_mkdir") |
| 150 | + int BPF_PROG(cgrp_ancestor_example, struct cgroup *cgrp, const char *path) |
| 151 | + { |
| 152 | + struct bpf_cpumask *kptr; |
| 153 | + struct cpumasks_kfunc_map_value *v; |
| 154 | + u32 key = 0; |
| 155 | +
|
| 156 | + /* Assume a bpf_cpumask * kptr was previously stored in the map. */ |
| 157 | + v = bpf_map_lookup_elem(&cpumasks_kfunc_map, &key); |
| 158 | + if (!v) |
| 159 | + return -ENOENT; |
| 160 | +
|
| 161 | + /* Acquire a reference to the bpf_cpumask * kptr that's already stored in the map. */ |
| 162 | + kptr = bpf_cpumask_kptr_get(&v->bpf_cpumask); |
| 163 | + if (!kptr) |
| 164 | + /* If no bpf_cpumask was present in the map, it's because |
| 165 | + * we're racing with another CPU that removed it with |
| 166 | + * bpf_kptr_xchg() between the bpf_map_lookup_elem() |
| 167 | + * above, and our call to bpf_cpumask_kptr_get(). |
| 168 | + * bpf_cpumask_kptr_get() internally safely handles this |
| 169 | + * race, and will return NULL if the cpumask is no longer |
| 170 | + * present in the map by the time we invoke the kfunc. |
| 171 | + */ |
| 172 | + return -EBUSY; |
| 173 | +
|
| 174 | + /* Free the reference we just took above. Note that the |
| 175 | + * original struct bpf_cpumask * kptr is still in the map. It will |
| 176 | + * be freed either at a later time if another context deletes |
| 177 | + * it from the map, or automatically by the BPF subsystem if |
| 178 | + * it's still present when the map is destroyed. |
| 179 | + */ |
| 180 | + bpf_cpumask_release(kptr); |
| 181 | +
|
| 182 | + return 0; |
| 183 | + } |
| 184 | +
|
| 185 | +---- |
| 186 | + |
| 187 | +2.2 ``struct cpumask`` |
| 188 | +---------------------- |
| 189 | + |
| 190 | +``struct cpumask`` is the object that actually contains the cpumask bitmap |
| 191 | +being queried, mutated, etc. A ``struct bpf_cpumask`` wraps a ``struct |
| 192 | +cpumask``, which is why it's safe to cast it as such (note however that it is |
| 193 | +**not** safe to cast a ``struct cpumask *`` to a ``struct bpf_cpumask *``, and |
| 194 | +the verifier will reject any program that tries to do so). |
| 195 | + |
| 196 | +As we'll see below, any kfunc that mutates its cpumask argument will take a |
| 197 | +``struct bpf_cpumask *`` as that argument. Any argument that is only queried |
| 198 | +will instead be a ``struct cpumask *``. |
| 199 | + |
| 200 | +3. cpumask kfuncs |
| 201 | +================= |
| 202 | + |
| 203 | +Above, we described the kfuncs that can be used to allocate, acquire, release, |
| 204 | +etc. a ``struct bpf_cpumask *``. This section of the document will describe the |
| 205 | +kfuncs for mutating and querying cpumasks. |
| 206 | + |
| 207 | +3.1 Mutating cpumasks |
| 208 | +--------------------- |
| 209 | + |
| 210 | +Some cpumask kfuncs are "read-only" in that they don't mutate any of their |
| 211 | +arguments, whereas others mutate at least one argument (which means that the |
| 212 | +argument must be a ``struct bpf_cpumask *``, as described above). |
| 213 | + |
| 214 | +This section will describe all of the cpumask kfuncs which mutate at least one |
| 215 | +argument. :ref:`cpumasks-querying-label` below describes the read-only kfuncs. |
| 216 | + |
| 217 | +3.1.1 Setting and clearing CPUs |
| 218 | +------------------------------- |
| 219 | + |
| 220 | +bpf_cpumask_set_cpu() and bpf_cpumask_clear_cpu() can be used to set and clear |
| 221 | +a CPU in a ``struct bpf_cpumask`` respectively: |
| 222 | + |
| 223 | +.. kernel-doc:: kernel/bpf/cpumask.c |
| 224 | + :identifiers: bpf_cpumask_set_cpu bpf_cpumask_clear_cpu |
| 225 | + |
| 226 | +These kfuncs are pretty straightforward, and can be used, for example, as |
| 227 | +follows: |
| 228 | + |
| 229 | +.. code-block:: c |
| 230 | +
|
| 231 | + /** |
| 232 | + * A sample tracepoint showing how a cpumask can be queried. |
| 233 | + */ |
| 234 | + SEC("tp_btf/task_newtask") |
| 235 | + int BPF_PROG(test_set_clear_cpu, struct task_struct *task, u64 clone_flags) |
| 236 | + { |
| 237 | + struct bpf_cpumask *cpumask; |
| 238 | +
|
| 239 | + cpumask = bpf_cpumask_create(); |
| 240 | + if (!cpumask) |
| 241 | + return -ENOMEM; |
| 242 | +
|
| 243 | + bpf_cpumask_set_cpu(0, cpumask); |
| 244 | + if (!bpf_cpumask_test_cpu(0, cast(cpumask))) |
| 245 | + /* Should never happen. */ |
| 246 | + goto release_exit; |
| 247 | +
|
| 248 | + bpf_cpumask_clear_cpu(0, cpumask); |
| 249 | + if (bpf_cpumask_test_cpu(0, cast(cpumask))) |
| 250 | + /* Should never happen. */ |
| 251 | + goto release_exit; |
| 252 | +
|
| 253 | + /* struct cpumask * pointers such as task->cpus_ptr can also be queried. */ |
| 254 | + if (bpf_cpumask_test_cpu(0, task->cpus_ptr)) |
| 255 | + bpf_printk("task %s can use CPU %d", task->comm, 0); |
| 256 | +
|
| 257 | + release_exit: |
| 258 | + bpf_cpumask_release(cpumask); |
| 259 | + return 0; |
| 260 | + } |
| 261 | +
|
| 262 | +---- |
| 263 | + |
| 264 | +bpf_cpumask_test_and_set_cpu() and bpf_cpumask_test_and_clear_cpu() are |
| 265 | +complementary kfuncs that allow callers to atomically test and set (or clear) |
| 266 | +CPUs: |
| 267 | + |
| 268 | +.. kernel-doc:: kernel/bpf/cpumask.c |
| 269 | + :identifiers: bpf_cpumask_test_and_set_cpu bpf_cpumask_test_and_clear_cpu |
| 270 | + |
| 271 | +---- |
| 272 | + |
| 273 | +We can also set and clear entire ``struct bpf_cpumask *`` objects in one |
| 274 | +operation using bpf_cpumask_setall() and bpf_cpumask_clear(): |
| 275 | + |
| 276 | +.. kernel-doc:: kernel/bpf/cpumask.c |
| 277 | + :identifiers: bpf_cpumask_setall bpf_cpumask_clear |
| 278 | + |
| 279 | +3.1.2 Operations between cpumasks |
| 280 | +--------------------------------- |
| 281 | + |
| 282 | +In addition to setting and clearing individual CPUs in a single cpumask, |
| 283 | +callers can also perform bitwise operations between multiple cpumasks using |
| 284 | +bpf_cpumask_and(), bpf_cpumask_or(), and bpf_cpumask_xor(): |
| 285 | + |
| 286 | +.. kernel-doc:: kernel/bpf/cpumask.c |
| 287 | + :identifiers: bpf_cpumask_and bpf_cpumask_or bpf_cpumask_xor |
| 288 | + |
| 289 | +The following is an example of how they may be used. Note that some of the |
| 290 | +kfuncs shown in this example will be covered in more detail below. |
| 291 | + |
| 292 | +.. code-block:: c |
| 293 | +
|
| 294 | + /** |
| 295 | + * A sample tracepoint showing how a cpumask can be mutated using |
| 296 | + * bitwise operators (and queried). |
| 297 | + */ |
| 298 | + SEC("tp_btf/task_newtask") |
| 299 | + int BPF_PROG(test_and_or_xor, struct task_struct *task, u64 clone_flags) |
| 300 | + { |
| 301 | + struct bpf_cpumask *mask1, *mask2, *dst1, *dst2; |
| 302 | +
|
| 303 | + mask1 = bpf_cpumask_create(); |
| 304 | + if (!mask1) |
| 305 | + return -ENOMEM; |
| 306 | +
|
| 307 | + mask2 = bpf_cpumask_create(); |
| 308 | + if (!mask2) { |
| 309 | + bpf_cpumask_release(mask1); |
| 310 | + return -ENOMEM; |
| 311 | + } |
| 312 | +
|
| 313 | + /* ...Safely create the other two masks... */ |
| 314 | +
|
| 315 | + bpf_cpumask_set_cpu(0, mask1); |
| 316 | + bpf_cpumask_set_cpu(1, mask2); |
| 317 | + bpf_cpumask_and(dst1, (const struct cpumask *)mask1, (const struct cpumask *)mask2); |
| 318 | + if (!bpf_cpumask_empty((const struct cpumask *)dst1)) |
| 319 | + /* Should never happen. */ |
| 320 | + goto release_exit; |
| 321 | +
|
| 322 | + bpf_cpumask_or(dst1, (const struct cpumask *)mask1, (const struct cpumask *)mask2); |
| 323 | + if (!bpf_cpumask_test_cpu(0, (const struct cpumask *)dst1)) |
| 324 | + /* Should never happen. */ |
| 325 | + goto release_exit; |
| 326 | +
|
| 327 | + if (!bpf_cpumask_test_cpu(1, (const struct cpumask *)dst1)) |
| 328 | + /* Should never happen. */ |
| 329 | + goto release_exit; |
| 330 | +
|
| 331 | + bpf_cpumask_xor(dst2, (const struct cpumask *)mask1, (const struct cpumask *)mask2); |
| 332 | + if (!bpf_cpumask_equal((const struct cpumask *)dst1, |
| 333 | + (const struct cpumask *)dst2)) |
| 334 | + /* Should never happen. */ |
| 335 | + goto release_exit; |
| 336 | +
|
| 337 | + release_exit: |
| 338 | + bpf_cpumask_release(mask1); |
| 339 | + bpf_cpumask_release(mask2); |
| 340 | + bpf_cpumask_release(dst1); |
| 341 | + bpf_cpumask_release(dst2); |
| 342 | + return 0; |
| 343 | + } |
| 344 | +
|
| 345 | +---- |
| 346 | + |
| 347 | +The contents of an entire cpumask may be copied to another using |
| 348 | +bpf_cpumask_copy(): |
| 349 | + |
| 350 | +.. kernel-doc:: kernel/bpf/cpumask.c |
| 351 | + :identifiers: bpf_cpumask_copy |
| 352 | + |
| 353 | +---- |
| 354 | + |
| 355 | +.. _cpumasks-querying-label: |
| 356 | + |
| 357 | +3.2 Querying cpumasks |
| 358 | +--------------------- |
| 359 | + |
| 360 | +In addition to the above kfuncs, there is also a set of read-only kfuncs that |
| 361 | +can be used to query the contents of cpumasks. |
| 362 | + |
| 363 | +.. kernel-doc:: kernel/bpf/cpumask.c |
| 364 | + :identifiers: bpf_cpumask_first bpf_cpumask_first_zero bpf_cpumask_test_cpu |
| 365 | + |
| 366 | +.. kernel-doc:: kernel/bpf/cpumask.c |
| 367 | + :identifiers: bpf_cpumask_equal bpf_cpumask_intersects bpf_cpumask_subset |
| 368 | + bpf_cpumask_empty bpf_cpumask_full |
| 369 | + |
| 370 | +.. kernel-doc:: kernel/bpf/cpumask.c |
| 371 | + :identifiers: bpf_cpumask_any bpf_cpumask_any_and |
| 372 | + |
| 373 | +---- |
| 374 | + |
| 375 | +Some example usages of these querying kfuncs were shown above. We will not |
| 376 | +replicate those examples here. Note, however, that all of the aforementioned |
| 377 | +kfuncs are tested in `tools/testing/selftests/bpf/progs/cpumask_success.c`_, so |
| 378 | +please take a look there if you're looking for more examples of how they can be |
| 379 | +used. |
| 380 | + |
| 381 | +.. _tools/testing/selftests/bpf/progs/cpumask_success.c: |
| 382 | + https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/tree/tools/testing/selftests/bpf/progs/cpumask_success.c |
| 383 | + |
| 384 | + |
| 385 | +4. Adding BPF cpumask kfuncs |
| 386 | +============================ |
| 387 | + |
| 388 | +The set of supported BPF cpumask kfuncs is not (yet) a 1-1 match with the |
| 389 | +cpumask operations in include/linux/cpumask.h. Any of those cpumask operations |
| 390 | +could easily be encapsulated in a new kfunc if and when required. If you'd like |
| 391 | +to support a new cpumask operation, please feel free to submit a patch. If you |
| 392 | +do add a new cpumask kfunc, please document it here, and add any relevant |
| 393 | +selftest testcases to the cpumask selftest suite. |
0 commit comments