@@ -2011,41 +2011,41 @@ multiclass F_ATOMIC_2_NEG<ValueType regT, NVPTXRegClass regclass, string SpaceSt
2011
2011
2012
2012
// has 3 operands
2013
2013
// F_ATOMIC_3_imp emits four NVPTXInst records for an atomic that takes an
// address plus two value operands, $b and $c. The records cover every
// register/immediate combination of the two values:
//   reg  - $b and $c are registers     imm1 - $b is an immediate
//   imm2 - $c is an immediate          imm3 - $b and $c are immediates
// In the updated (+) lines the printed mnemonic is atom followed by SemStr,
// SpaceStr, OpcStr and TypeStr, in that order; the removed (-) lines had no
// SemStr. Every record is flagged mayLoad, mayStore and hasSideEffects,
// selects IntOp applied to (addr, b, c), and is gated by Requires<Pred>.
// The trailing empty-string argument in imm2's !strconcat adds nothing.
multiclass F_ATOMIC_3_imp<ValueType ptrT, NVPTXRegClass ptrclass,
2014
- ValueType regT, NVPTXRegClass regclass,
2014
+ ValueType regT, NVPTXRegClass regclass, string SemStr,
2015
2015
string SpaceStr, string TypeStr, string OpcStr, PatFrag IntOp,
2016
2016
Operand IMMType, list<Predicate> Pred> {
2017
2017
let mayLoad = 1, mayStore = 1, hasSideEffects = 1 in {
2018
2018
def reg : NVPTXInst<(outs regclass:$dst),
2019
2019
(ins ptrclass:$addr, regclass:$b, regclass:$c),
2020
- !strconcat("atom", SpaceStr, OpcStr, TypeStr, " \t$dst, [$addr], $b, $c;"),
2020
+ !strconcat("atom", SemStr, SpaceStr, OpcStr, TypeStr, " \t$dst, [$addr], $b, $c;"),
2021
2021
[(set (regT regclass:$dst), (IntOp (ptrT ptrclass:$addr), (regT regclass:$b), (regT regclass:$c)))]>,
2022
2022
Requires<Pred>;
2023
2023
2024
2024
def imm1 : NVPTXInst<(outs regclass:$dst),
2025
2025
(ins ptrclass:$addr, IMMType:$b, regclass:$c),
2026
- !strconcat("atom", SpaceStr, OpcStr, TypeStr, " \t$dst, [$addr], $b, $c;"),
2026
+ !strconcat("atom", SemStr, SpaceStr, OpcStr, TypeStr, " \t$dst, [$addr], $b, $c;"),
2027
2027
[(set (regT regclass:$dst), (IntOp (ptrT ptrclass:$addr), imm:$b, (regT regclass:$c)))]>,
2028
2028
Requires<Pred>;
2029
2029
2030
2030
def imm2 : NVPTXInst<(outs regclass:$dst),
2031
2031
(ins ptrclass:$addr, regclass:$b, IMMType:$c),
2032
- !strconcat("atom", SpaceStr, OpcStr, TypeStr, " \t$dst, [$addr], $b, $c;", ""),
2032
+ !strconcat("atom", SemStr, SpaceStr, OpcStr, TypeStr, " \t$dst, [$addr], $b, $c;", ""),
2033
2033
[(set (regT regclass:$dst), (IntOp (ptrT ptrclass:$addr), (regT regclass:$b), imm:$c))]>,
2034
2034
Requires<Pred>;
2035
2035
2036
2036
def imm3 : NVPTXInst<(outs regclass:$dst),
2037
2037
(ins ptrclass:$addr, IMMType:$b, IMMType:$c),
2038
- !strconcat("atom", SpaceStr, OpcStr, TypeStr, " \t$dst, [$addr], $b, $c;"),
2038
+ !strconcat("atom", SemStr, SpaceStr, OpcStr, TypeStr, " \t$dst, [$addr], $b, $c;"),
2039
2039
[(set (regT regclass:$dst), (IntOp (ptrT ptrclass:$addr), imm:$b, imm:$c))]>,
2040
2040
Requires<Pred>;
2041
2041
}
2042
2042
}
2043
// F_ATOMIC_3 instantiates F_ATOMIC_3_imp twice: p32 with i32/Int32Regs
// addresses and p64 with i64/Int64Regs addresses. All other parameters are
// forwarded unchanged, and Pred defaults to an empty predicate list.
// The updated (+) lines add a SemStr parameter ahead of SpaceStr and pass it
// through to F_ATOMIC_3_imp; the removed (-) lines are the previous signature.
- multiclass F_ATOMIC_3<ValueType regT, NVPTXRegClass regclass, string SpaceStr , string TypeStr ,
2044
- string OpcStr, PatFrag IntOp, Operand IMMType, list<Predicate> Pred = []> {
2045
- defm p32 : F_ATOMIC_3_imp<i32, Int32Regs, regT, regclass, SpaceStr, TypeStr, OpcStr ,
2046
- IntOp, IMMType, Pred>;
2047
- defm p64 : F_ATOMIC_3_imp<i64, Int64Regs, regT, regclass, SpaceStr, TypeStr, OpcStr ,
2048
- IntOp, IMMType, Pred>;
2043
+ multiclass F_ATOMIC_3<ValueType regT, NVPTXRegClass regclass, string SemStr , string SpaceStr ,
2044
+ string TypeStr, string OpcStr, PatFrag IntOp, Operand IMMType, list<Predicate> Pred = []> {
2045
+ defm p32 : F_ATOMIC_3_imp<i32, Int32Regs, regT, regclass, SemStr, SpaceStr, TypeStr ,
2046
+ OpcStr, IntOp, IMMType, Pred>;
2047
+ defm p64 : F_ATOMIC_3_imp<i64, Int64Regs, regT, regclass, SemStr, SpaceStr, TypeStr ,
2048
+ OpcStr, IntOp, IMMType, Pred>;
2049
2049
}
2050
2050
2051
2051
// atom_add
@@ -2427,51 +2427,76 @@ defm INT_PTX_ATOM_XOR_GEN_64 : F_ATOMIC_2<i64, Int64Regs, "", ".b64", ".xor",
2427
2427
defm INT_PTX_ATOM_XOR_GEN_64_USE_G : F_ATOMIC_2<i64, Int64Regs, ".global", ".b64",
2428
2428
".xor", atomic_load_xor_i64_gen, i64imm, imm, [hasSM<32>]>;
2429
2429
2430
- // atom_cas
2431
-
2432
- def atomic_cmp_swap_i16_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b, node:$c),
2433
- (atomic_cmp_swap_i16 node:$a, node:$b, node:$c)>;
2434
- def atomic_cmp_swap_i16_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b, node:$c),
2435
- (atomic_cmp_swap_i16 node:$a, node:$b, node:$c)>;
2436
- def atomic_cmp_swap_i16_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b, node:$c),
2437
- (atomic_cmp_swap_i16 node:$a, node:$b, node:$c)>;
2438
- def atomic_cmp_swap_i32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b, node:$c),
2439
- (atomic_cmp_swap_i32 node:$a, node:$b, node:$c)>;
2440
- def atomic_cmp_swap_i32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b, node:$c),
2441
- (atomic_cmp_swap_i32 node:$a, node:$b, node:$c)>;
2442
- def atomic_cmp_swap_i32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b, node:$c),
2443
- (atomic_cmp_swap_i32 node:$a, node:$b, node:$c)>;
2444
- def atomic_cmp_swap_i64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b, node:$c),
2445
- (atomic_cmp_swap_i64 node:$a, node:$b, node:$c)>;
2446
- def atomic_cmp_swap_i64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b, node:$c),
2447
- (atomic_cmp_swap_i64 node:$a, node:$b, node:$c)>;
2448
- def atomic_cmp_swap_i64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b, node:$c),
2449
- (atomic_cmp_swap_i64 node:$a, node:$b, node:$c)>;
2450
-
2451
- defm INT_PTX_ATOM_CAS_G_16 : F_ATOMIC_3<i16, Int16Regs, ".global", ".b16", ".cas",
2452
- atomic_cmp_swap_i16_g, i16imm, [hasSM<70>, hasPTX<63>]>;
2453
- defm INT_PTX_ATOM_CAS_S_16 : F_ATOMIC_3<i16, Int16Regs, ".shared", ".b16", ".cas",
2454
- atomic_cmp_swap_i16_s, i16imm, [hasSM<70>, hasPTX<63>]>;
2455
- defm INT_PTX_ATOM_CAS_GEN_16 : F_ATOMIC_3<i16, Int16Regs, "", ".b16", ".cas",
2456
- atomic_cmp_swap_i16_gen, i16imm, [hasSM<70>, hasPTX<63>]>;
2457
- defm INT_PTX_ATOM_CAS_GEN_16_USE_G : F_ATOMIC_3<i16, Int16Regs, ".global", ".b16", ".cas",
2458
- atomic_cmp_swap_i16_gen, i16imm, [hasSM<70>, hasPTX<63>]>;
2459
- defm INT_PTX_ATOM_CAS_G_32 : F_ATOMIC_3<i32, Int32Regs, ".global", ".b32", ".cas",
2460
- atomic_cmp_swap_i32_g, i32imm>;
2461
- defm INT_PTX_ATOM_CAS_S_32 : F_ATOMIC_3<i32, Int32Regs, ".shared", ".b32", ".cas",
2462
- atomic_cmp_swap_i32_s, i32imm>;
2463
- defm INT_PTX_ATOM_CAS_GEN_32 : F_ATOMIC_3<i32, Int32Regs, "", ".b32", ".cas",
2464
- atomic_cmp_swap_i32_gen, i32imm>;
2465
- defm INT_PTX_ATOM_CAS_GEN_32_USE_G : F_ATOMIC_3<i32, Int32Regs, ".global", ".b32",
2466
- ".cas", atomic_cmp_swap_i32_gen, i32imm>;
2467
- defm INT_PTX_ATOM_CAS_G_64 : F_ATOMIC_3<i64, Int64Regs, ".global", ".b64", ".cas",
2468
- atomic_cmp_swap_i64_g, i64imm>;
2469
- defm INT_PTX_ATOM_CAS_S_64 : F_ATOMIC_3<i64, Int64Regs, ".shared", ".b64", ".cas",
2470
- atomic_cmp_swap_i64_s, i64imm>;
2471
- defm INT_PTX_ATOM_CAS_GEN_64 : F_ATOMIC_3<i64, Int64Regs, "", ".b64", ".cas",
2472
- atomic_cmp_swap_i64_gen, i64imm>;
2473
- defm INT_PTX_ATOM_CAS_GEN_64_USE_G : F_ATOMIC_3<i64, Int64Regs, ".global", ".b64",
2474
- ".cas", atomic_cmp_swap_i64_gen, i64imm>;
2430
+ multiclass ternary_atomic_op_as {
2431
+ // Emits one PatFrag per address space (NAME_generic, NAME_global, NAME_shared); each wraps the operator named NAME over (ptr, cmp, val) and passes the matching AS_match field as its third argument (presumably the address-space predicate; AS_match is defined outside this view).
2432
+ def NAME#_generic: PatFrag<(ops node:$ptr, node:$cmp, node:$val),
2433
+ (!cast<SDPatternOperator>(NAME) node:$ptr, node:$cmp, node:$val),
2434
+ AS_match.generic>;
2435
+
2436
+ def NAME#_global: PatFrag<(ops node:$ptr, node:$cmp, node:$val),
2437
+ (!cast<SDPatternOperator>(NAME) node:$ptr, node:$cmp, node:$val),
2438
+ AS_match.global>;
2439
+
2440
+ def NAME#_shared: PatFrag<(ops node:$ptr, node:$cmp, node:$val),
2441
+ (!cast<SDPatternOperator>(NAME) node:$ptr, node:$cmp, node:$val),
2442
+ AS_match.shared>;
2443
+ }
2444
+
2445
+ // Generate the per-address-space pattern fragments for every size x memory order combination (the empty order yields the unsuffixed base name, e.g. atomic_cmp_swap_i16).
2446
+ // NOTE: i8 cmpxchg is not supported in PTX; AtomicExpandPass emulates every i8 cmpxchg
2447
+ // using a larger-bitwidth CAS, so no i8 fragments are generated here.
2448
+ foreach size = ["i16", "i32", "i64"] in {
2449
+ foreach order = ["", "_monotonic", "_acquire", "_release", "_acq_rel", "_seq_cst"] in {
2450
+ defm atomic_cmp_swap#_#size#order: ternary_atomic_op_as;
2451
+ }
2452
+ }
2453
+
2454
+ // E.g. with type = "32", order = ".acquire", addrspace = ".global",
2455
+ // atomic_cmp_swap_pat = "atomic_cmp_swap_i32_acquire_global" and
2456
+ // preds = [hasSM<70>, hasPTX<63>], this multiclass expands to
2457
+ // F_ATOMIC_3<i32, Int32Regs, ".acquire", ".global", ".b32",
2458
+ // ".cas", atomic_cmp_swap_i32_acquire_global, i32imm,
2459
+ // [hasSM<70>, hasPTX<63>]>
2460
+ multiclass INT_PTX_ATOM_CAS<string atomic_cmp_swap_pat, string type,
2461
+ string order, string addrspace, list<Predicate> preds>
2462
+ : F_ATOMIC_3<!cast<ValueType>("i"#type),
2463
+ !cast<NVPTXRegClass>("Int"#type#"Regs"),
2464
+ order,
2465
+ addrspace,
2466
+ ".b"#type,
2467
+ ".cas",
2468
+ !cast<PatFrag>(atomic_cmp_swap_pat),
2469
+ !cast<Operand>("i"#type#"imm"),
2470
+ preds>;
2471
+
2472
+ // Define atom.cas for all combinations of size x addrspace x memory order
2473
+ // supported in PTX *and* on the hardware.
2474
+ foreach size = ["32", "64"] in {
2475
+ foreach addrspace = ["generic", "global", "shared"] in {
2476
+ defvar cas_addrspace_string = !if(!eq(addrspace, "generic"), "", "."#addrspace);
2477
+ foreach order = ["acquire", "release", "acq_rel", "monotonic"] in {
2478
+ defvar cas_order_string = !if(!eq(order, "monotonic"), ".relaxed", "."#order);
2479
+ // Note that AtomicExpand will convert cmpxchg seq_cst to a cmpxchg monotonic with fences around it.
2480
+ // Memory orders are only supported for SM70+ with PTX63+, so there are two sets of instruction definitions:
2481
+ // one for SM70+, and "old" ones, which lower to plain "atom.cas", for earlier archs. NOTE(review): the old definitions carry an empty predicate list - confirm how selection prefers the SM70+ forms.
2482
+ defm INT_PTX_ATOM_CAS_#size#_#order#addrspace
2483
+ : INT_PTX_ATOM_CAS<"atomic_cmp_swap_i"#size#_#order#_#addrspace, size,
2484
+ cas_order_string, cas_addrspace_string,
2485
+ [hasSM<70>, hasPTX<63>]>;
2486
+ defm INT_PTX_ATOM_CAS_#size#_#order#_old#addrspace
2487
+ : INT_PTX_ATOM_CAS<"atomic_cmp_swap_i"#size#_#order#_#addrspace, size,
2488
+ "", cas_addrspace_string, []>;
2489
+ }
2490
+ }
2491
+ }
2492
+
2493
+ // Note that 16-bit CAS support in PTX is emulated. These definitions pass an empty SemStr, so no memory-order qualifier is printed for them.
2494
+ defm INT_PTX_ATOM_CAS_G_16 : F_ATOMIC_3<i16, Int16Regs, "", ".global", ".b16", ".cas",
2495
+ atomic_cmp_swap_i16_global, i16imm, [hasSM<70>, hasPTX<63>]>;
2496
+ defm INT_PTX_ATOM_CAS_S_16 : F_ATOMIC_3<i16, Int16Regs, "", ".shared", ".b16", ".cas",
2497
+ atomic_cmp_swap_i16_shared, i16imm, [hasSM<70>, hasPTX<63>]>;
2498
+ defm INT_PTX_ATOM_CAS_GEN_16 : F_ATOMIC_3<i16, Int16Regs, "", "", ".b16", ".cas",
2499
+ atomic_cmp_swap_i16_generic, i16imm, [hasSM<70>, hasPTX<63>]>;
2475
2500
2476
2501
// Support for scoped atomic operations. Matches
2477
2502
// int_nvvm_atomic_{op}_{space}_{type}_{scope}
0 commit comments