@@ -229,6 +229,125 @@ do { \
        } \
})

+ /*
+  * Add return operation
+  */
+ #define percpu_add_return_op(var, val) \
+ ({ \
+         typeof(var) paro_ret__ = val; \
+         switch (sizeof(var)) { \
+         case 1: \
+                 asm("xaddb %0, "__percpu_arg(1) \
+                         : "+q" (paro_ret__), "+m" (var) \
+                         : : "memory"); \
+                 break; \
+         case 2: \
+                 asm("xaddw %0, "__percpu_arg(1) \
+                         : "+r" (paro_ret__), "+m" (var) \
+                         : : "memory"); \
+                 break; \
+         case 4: \
+                 asm("xaddl %0, "__percpu_arg(1) \
+                         : "+r" (paro_ret__), "+m" (var) \
+                         : : "memory"); \
+                 break; \
+         case 8: \
+                 asm("xaddq %0, "__percpu_arg(1) \
+                         : "+re" (paro_ret__), "+m" (var) \
+                         : : "memory"); \
+                 break; \
+         default: __bad_percpu_size(); \
+         } \
+         paro_ret__ += val; \
+         paro_ret__; \
+ })
+
+ /*
+  * xchg is implemented using cmpxchg without a lock prefix. xchg is
+  * expensive due to the implied lock prefix. The processor cannot prefetch
+  * cachelines if xchg is used.
+  */
+ #define percpu_xchg_op(var, nval) \
+ ({ \
+         typeof(var) pxo_ret__; \
+         typeof(var) pxo_new__ = (nval); \
+         switch (sizeof(var)) { \
+         case 1: \
+                 asm("\n1:mov "__percpu_arg(1)",%%al" \
+                     "\n\tcmpxchgb %2, "__percpu_arg(1) \
+                     "\n\tjnz 1b" \
+                         : "=a" (pxo_ret__), "+m" (var) \
+                         : "q" (pxo_new__) \
+                         : "memory"); \
+                 break; \
+         case 2: \
+                 asm("\n1:mov "__percpu_arg(1)",%%ax" \
+                     "\n\tcmpxchgw %2, "__percpu_arg(1) \
+                     "\n\tjnz 1b" \
+                         : "=a" (pxo_ret__), "+m" (var) \
+                         : "r" (pxo_new__) \
+                         : "memory"); \
+                 break; \
+         case 4: \
+                 asm("\n1:mov "__percpu_arg(1)",%%eax" \
+                     "\n\tcmpxchgl %2, "__percpu_arg(1) \
+                     "\n\tjnz 1b" \
+                         : "=a" (pxo_ret__), "+m" (var) \
+                         : "r" (pxo_new__) \
+                         : "memory"); \
+                 break; \
+         case 8: \
+                 asm("\n1:mov "__percpu_arg(1)",%%rax" \
+                     "\n\tcmpxchgq %2, "__percpu_arg(1) \
+                     "\n\tjnz 1b" \
+                         : "=a" (pxo_ret__), "+m" (var) \
+                         : "r" (pxo_new__) \
+                         : "memory"); \
+                 break; \
+         default: __bad_percpu_size(); \
+         } \
+         pxo_ret__; \
+ })
+
+ /*
+  * cmpxchg has no such implied lock semantics as a result it is much
+  * more efficient for cpu local operations.
+  */
+ #define percpu_cmpxchg_op(var, oval, nval) \
+ ({ \
+         typeof(var) pco_ret__; \
+         typeof(var) pco_old__ = (oval); \
+         typeof(var) pco_new__ = (nval); \
+         switch (sizeof(var)) { \
+         case 1: \
+                 asm("cmpxchgb %2, "__percpu_arg(1) \
+                         : "=a" (pco_ret__), "+m" (var) \
+                         : "q" (pco_new__), "0" (pco_old__) \
+                         : "memory"); \
+                 break; \
+         case 2: \
+                 asm("cmpxchgw %2, "__percpu_arg(1) \
+                         : "=a" (pco_ret__), "+m" (var) \
+                         : "r" (pco_new__), "0" (pco_old__) \
+                         : "memory"); \
+                 break; \
+         case 4: \
+                 asm("cmpxchgl %2, "__percpu_arg(1) \
+                         : "=a" (pco_ret__), "+m" (var) \
+                         : "r" (pco_new__), "0" (pco_old__) \
+                         : "memory"); \
+                 break; \
+         case 8: \
+                 asm("cmpxchgq %2, "__percpu_arg(1) \
+                         : "=a" (pco_ret__), "+m" (var) \
+                         : "r" (pco_new__), "0" (pco_old__) \
+                         : "memory"); \
+                 break; \
+         default: __bad_percpu_size(); \
+         } \
+         pco_ret__; \
+ })
+
/*
 * percpu_read() makes gcc load the percpu variable every time it is
 * accessed while percpu_read_stable() allows the value to be cached.
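A note on the retry loop in percpu_xchg_op above: a plain x86 xchg with a memory operand always behaves as if it carried a lock prefix, so the macro instead emits a load followed by a cmpxchg that retries until it succeeds. The stand-alone C sketch below mirrors that structure; it uses GCC's __atomic_compare_exchange_n builtin as a stand-in for the inline assembly and the %gs-relative per-cpu addressing, and the function name is an invented example, not part of the patch.

#include <stdio.h>

/*
 * Illustrative only: exchange *ptr with newval via a compare-and-swap
 * retry loop, mirroring percpu_xchg_op (load the current value, try to
 * replace it, retry if something changed it in between).  The builtin
 * below still emits a locked cmpxchg in userspace; the kernel macro can
 * drop the lock prefix because only the owning CPU updates its per-cpu
 * copy.
 */
static unsigned long xchg_via_cmpxchg(unsigned long *ptr, unsigned long newval)
{
        unsigned long old = *ptr;               /* the "mov" in the asm */

        /* cmpxchg; on failure 'old' is refreshed and we retry ("jnz 1b") */
        while (!__atomic_compare_exchange_n(ptr, &old, newval, 0,
                                            __ATOMIC_RELAXED, __ATOMIC_RELAXED))
                ;
        return old;                             /* previous value */
}

int main(void)
{
        unsigned long v = 5;
        unsigned long prev = xchg_via_cmpxchg(&v, 9);

        printf("prev=%lu now=%lu\n", prev, v);  /* prints prev=5 now=9 */
        return 0;
}

In the kernel macro the loop only has to cope with interleaving on the same CPU (for example an interrupt firing between the mov and the cmpxchg), which is exactly why the lock prefix can be omitted.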
@@ -267,6 +386,12 @@ do { \
#define __this_cpu_xor_1(pcp, val) percpu_to_op("xor", (pcp), val)
#define __this_cpu_xor_2(pcp, val) percpu_to_op("xor", (pcp), val)
#define __this_cpu_xor_4(pcp, val) percpu_to_op("xor", (pcp), val)
+ /*
+  * Generic fallback operations for __this_cpu_xchg_[1-4] are okay and much
+  * faster than an xchg with forced lock semantics.
+  */
+ #define __this_cpu_xchg_8(pcp, nval) percpu_xchg_op(pcp, nval)
+ #define __this_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)

#define this_cpu_read_1(pcp) percpu_from_op("mov", (pcp), "m"(pcp))
#define this_cpu_read_2(pcp) percpu_from_op("mov", (pcp), "m"(pcp))
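The comment about generic fallbacks refers to the arch-independent __this_cpu_xchg implementation: because __this_cpu_* callers already guarantee they cannot be preempted, an exchange needs no atomic instruction at all, just a read followed by a write. The snippet below is a rough sketch of that idea under those assumptions; the macro name is made up and this is not quoted from include/linux/percpu.h.

#include <stdio.h>

/*
 * Rough sketch of a generic, non-atomic per-cpu exchange fallback.
 * It is only correct because __this_cpu_* callers guarantee they cannot
 * be preempted, so nothing else can touch this CPU's copy between the
 * read and the write.  Names are illustrative, not the kernel's.
 */
#define generic_this_cpu_xchg(pcp, nval)                        \
({                                                              \
        typeof(pcp) old__ = (pcp);  /* read the current value */\
        (pcp) = (nval);             /* store the new value    */\
        old__;                      /* return the old value   */\
})

int main(void)
{
        static unsigned int fake_percpu_counter = 3;    /* stand-in for a per-cpu variable */
        unsigned int prev = generic_this_cpu_xchg(fake_percpu_counter, 7);

        printf("prev=%u now=%u\n", prev, fake_percpu_counter);  /* prints prev=3 now=7 */
        return 0;
}

For the 8-byte case above, by contrast, the 64-bit build wires __this_cpu_xchg_8 straight to the cmpxchg-based percpu_xchg_op, since a single unlocked instruction beats the read/write pair there.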
@@ -286,6 +411,11 @@ do { \
#define this_cpu_xor_1(pcp, val) percpu_to_op("xor", (pcp), val)
#define this_cpu_xor_2(pcp, val) percpu_to_op("xor", (pcp), val)
#define this_cpu_xor_4(pcp, val) percpu_to_op("xor", (pcp), val)
+ #define this_cpu_xchg_1(pcp, nval) percpu_xchg_op(pcp, nval)
+ #define this_cpu_xchg_2(pcp, nval) percpu_xchg_op(pcp, nval)
+ #define this_cpu_xchg_4(pcp, nval) percpu_xchg_op(pcp, nval)
+ #define this_cpu_xchg_8(pcp, nval) percpu_xchg_op(pcp, nval)
+ #define this_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)

#define irqsafe_cpu_add_1(pcp, val) percpu_add_op((pcp), val)
#define irqsafe_cpu_add_2(pcp, val) percpu_add_op((pcp), val)
@@ -299,6 +429,31 @@ do { \
#define irqsafe_cpu_xor_1(pcp, val) percpu_to_op("xor", (pcp), val)
#define irqsafe_cpu_xor_2(pcp, val) percpu_to_op("xor", (pcp), val)
#define irqsafe_cpu_xor_4(pcp, val) percpu_to_op("xor", (pcp), val)
+ #define irqsafe_cpu_xchg_1(pcp, nval) percpu_xchg_op(pcp, nval)
+ #define irqsafe_cpu_xchg_2(pcp, nval) percpu_xchg_op(pcp, nval)
+ #define irqsafe_cpu_xchg_4(pcp, nval) percpu_xchg_op(pcp, nval)
+ #define irqsafe_cpu_xchg_8(pcp, nval) percpu_xchg_op(pcp, nval)
+ #define irqsafe_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
+
+ #ifndef CONFIG_M386
+ #define __this_cpu_add_return_1(pcp, val) percpu_add_return_op(pcp, val)
+ #define __this_cpu_add_return_2(pcp, val) percpu_add_return_op(pcp, val)
+ #define __this_cpu_add_return_4(pcp, val) percpu_add_return_op(pcp, val)
+ #define __this_cpu_cmpxchg_1(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
+ #define __this_cpu_cmpxchg_2(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
+ #define __this_cpu_cmpxchg_4(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
+
+ #define this_cpu_add_return_1(pcp, val) percpu_add_return_op(pcp, val)
+ #define this_cpu_add_return_2(pcp, val) percpu_add_return_op(pcp, val)
+ #define this_cpu_add_return_4(pcp, val) percpu_add_return_op(pcp, val)
+ #define this_cpu_cmpxchg_1(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
+ #define this_cpu_cmpxchg_2(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
+ #define this_cpu_cmpxchg_4(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
+
+ #define irqsafe_cpu_cmpxchg_1(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
+ #define irqsafe_cpu_cmpxchg_2(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
+ #define irqsafe_cpu_cmpxchg_4(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
+ #endif /* !CONFIG_M386 */

/*
 * Per cpu atomic 64 bit operations are only available under 64 bit.
@@ -311,19 +466,20 @@ do { \
#define __this_cpu_and_8(pcp, val) percpu_to_op("and", (pcp), val)
#define __this_cpu_or_8(pcp, val) percpu_to_op("or", (pcp), val)
#define __this_cpu_xor_8(pcp, val) percpu_to_op("xor", (pcp), val)
+ #define __this_cpu_add_return_8(pcp, val) percpu_add_return_op(pcp, val)

#define this_cpu_read_8(pcp) percpu_from_op("mov", (pcp), "m"(pcp))
#define this_cpu_write_8(pcp, val) percpu_to_op("mov", (pcp), val)
#define this_cpu_add_8(pcp, val) percpu_add_op((pcp), val)
#define this_cpu_and_8(pcp, val) percpu_to_op("and", (pcp), val)
#define this_cpu_or_8(pcp, val) percpu_to_op("or", (pcp), val)
#define this_cpu_xor_8(pcp, val) percpu_to_op("xor", (pcp), val)
+ #define this_cpu_add_return_8(pcp, val) percpu_add_return_op(pcp, val)

#define irqsafe_cpu_add_8(pcp, val) percpu_add_op((pcp), val)
#define irqsafe_cpu_and_8(pcp, val) percpu_to_op("and", (pcp), val)
#define irqsafe_cpu_or_8(pcp, val) percpu_to_op("or", (pcp), val)
#define irqsafe_cpu_xor_8(pcp, val) percpu_to_op("xor", (pcp), val)
-
#endif

/* This is not atomic against other CPUs -- CPU preemption needs to be off */
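Finally, a sketch of what the new operations are typically used for. The consumer below is invented for illustration (none of these symbols appear in the commit): a per-cpu sequence counter bumped with this_cpu_add_return(), which returns the value after the addition, and a per-cpu state word updated with this_cpu_cmpxchg(). On x86 both compile down to a single unlocked xadd or cmpxchg on the per-cpu address, so no lock prefix and no preempt_disable()/atomic_t pairing is needed.

#include <linux/percpu.h>
#include <linux/types.h>

/* Illustrative consumers of the new operations; not part of the patch. */

static DEFINE_PER_CPU(unsigned long, demo_seq);   /* per-cpu sequence counter */
static DEFINE_PER_CPU(int, demo_state);           /* per-cpu state word */

/* Hand out the next per-cpu sequence number: a single xadd on x86. */
static unsigned long demo_next_seq(void)
{
        return this_cpu_add_return(demo_seq, 1);
}

/*
 * Move this CPU's state from 'from' to 'to' only if it still holds the
 * expected value: a single unlocked cmpxchg on the per-cpu address.
 * Returns true if the switch happened.
 */
static bool demo_try_switch_state(int from, int to)
{
        return this_cpu_cmpxchg(demo_state, from, to) == from;
}

On a 386 build (CONFIG_M386) the cmpxchg and xadd instructions are unavailable, which is why the add_return and cmpxchg size variants above are guarded by #ifndef CONFIG_M386 and fall back to the generic implementations there.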