@@ -319,4 +319,86 @@ extern "C" void acc_combined(int N) {
319
319
// CHECK-NEXT: acc.yield
320
320
// CHECK-NEXT: } loc
321
321
322
+ #pragma acc parallel loop gang
323
+ for (unsigned I = 0 ; I < N; ++I);
324
+ // CHECK-NEXT: acc.parallel combined(loop) {
325
+ // CHECK-NEXT: acc.loop combined(parallel) gang {
326
+ // CHECK: acc.yield
327
+ // CHECK-NEXT: } loc
328
+ // CHECK-NEXT: acc.yield
329
+ // CHECK-NEXT: } loc
330
+ #pragma acc parallel loop gang device_type(nvidia) gang
331
+ for (unsigned I = 0 ; I < N; ++I);
332
+ // CHECK-NEXT: acc.parallel combined(loop) {
333
+ // CHECK-NEXT: acc.loop combined(parallel) gang([#acc.device_type<none>, #acc.device_type<nvidia>]) {
334
+ // CHECK: acc.yield
335
+ // CHECK-NEXT: } loc
336
+ // CHECK-NEXT: acc.yield
337
+ // CHECK-NEXT: } loc
338
+ #pragma acc parallel loop gang(dim:1) device_type(nvidia) gang(dim:2)
339
+ for (unsigned I = 0 ; I < N; ++I);
340
+ // CHECK-NEXT: acc.parallel combined(loop) {
341
+ // CHECK-NEXT: %[[ONE_CONST:.*]] = arith.constant 1 : i64
342
+ // CHECK-NEXT: %[[TWO_CONST:.*]] = arith.constant 2 : i64
343
+ // CHECK-NEXT: acc.loop combined(parallel) gang({dim=%[[ONE_CONST]] : i64}, {dim=%[[TWO_CONST]] : i64} [#acc.device_type<nvidia>]) {
344
+ // CHECK: acc.yield
345
+ // CHECK-NEXT: } loc
346
+ // CHECK-NEXT: acc.yield
347
+ // CHECK-NEXT: } loc
348
+ #pragma acc parallel loop gang(static:N, dim: 1) device_type(nvidia, radeon) gang(static:*, dim : 2)
349
+ for (unsigned I = 0 ; I < N; ++I);
350
+ // CHECK-NEXT: acc.parallel combined(loop) {
351
+ // CHECK-NEXT: %[[N_LOAD:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
352
+ // CHECK-NEXT: %[[N_CONV:.*]] = builtin.unrealized_conversion_cast %[[N_LOAD]] : !s32i to si32
353
+ // CHECK-NEXT: %[[ONE_CONST:.*]] = arith.constant 1 : i64
354
+ // CHECK-NEXT: %[[STAR_CONST:.*]] = arith.constant -1 : i64
355
+ // CHECK-NEXT: %[[TWO_CONST:.*]] = arith.constant 2 : i64
356
+ // CHECK-NEXT: acc.loop combined(parallel) gang({static=%[[N_CONV]] : si32, dim=%[[ONE_CONST]] : i64}, {static=%[[STAR_CONST]] : i64, dim=%[[TWO_CONST]] : i64} [#acc.device_type<nvidia>], {static=%[[STAR_CONST]] : i64, dim=%[[TWO_CONST]] : i64} [#acc.device_type<radeon>]) {
357
+ // CHECK: acc.yield
358
+ // CHECK-NEXT: } loc
359
+ // CHECK-NEXT: acc.yield
360
+ // CHECK-NEXT: } loc
361
+
362
+ #pragma acc kernels loop gang(num:N) device_type(nvidia, radeon) gang(num:N)
363
+ for (unsigned I = 0 ; I < N; ++I);
364
+ // CHECK-NEXT: acc.kernels combined(loop) {
365
+ // CHECK-NEXT: %[[N_LOAD:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
366
+ // CHECK-NEXT: %[[N_CONV:.*]] = builtin.unrealized_conversion_cast %[[N_LOAD]] : !s32i to si32
367
+ // CHECK-NEXT: %[[N_LOAD2:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
368
+ // CHECK-NEXT: %[[N_CONV2:.*]] = builtin.unrealized_conversion_cast %[[N_LOAD2]] : !s32i to si32
369
+ // CHECK-NEXT: acc.loop combined(kernels) gang({num=%[[N_CONV]] : si32}, {num=%[[N_CONV2]] : si32} [#acc.device_type<nvidia>], {num=%[[N_CONV2]] : si32} [#acc.device_type<radeon>]) {
370
+ // CHECK: acc.yield
371
+ // CHECK-NEXT: } loc
372
+ // CHECK-NEXT: acc.terminator
373
+ // CHECK-NEXT: } loc
374
+ #pragma acc kernels loop gang(static:N) device_type(nvidia) gang(static:*)
375
+ for (unsigned I = 0 ; I < N; ++I);
376
+ // CHECK-NEXT: acc.kernels combined(loop) {
377
+ // CHECK-NEXT: %[[N_LOAD:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
378
+ // CHECK-NEXT: %[[N_CONV:.*]] = builtin.unrealized_conversion_cast %[[N_LOAD]] : !s32i to si32
379
+ // CHECK-NEXT: %[[STAR_CONST:.*]] = arith.constant -1 : i64
380
+ // CHECK-NEXT: acc.loop combined(kernels) gang({static=%[[N_CONV]] : si32}, {static=%[[STAR_CONST]] : i64} [#acc.device_type<nvidia>]) {
381
+ // CHECK: acc.yield
382
+ // CHECK-NEXT: } loc
383
+ // CHECK-NEXT: acc.terminator
384
+ // CHECK-NEXT: } loc
385
+ #pragma acc kernels loop gang(static:N, num: N + 1) device_type(nvidia) gang(static:*, num : N + 2)
386
+ for (unsigned I = 0 ; I < N; ++I);
387
+ // CHECK-NEXT: acc.kernels combined(loop) {
388
+ // CHECK-NEXT: %[[N_LOAD:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
389
+ // CHECK-NEXT: %[[N_CONV:.*]] = builtin.unrealized_conversion_cast %[[N_LOAD]] : !s32i to si32
390
+ // CHECK-NEXT: %[[N_LOAD2:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
391
+ // CHECK-NEXT: %[[CIR_ONE_CONST:.*]] = cir.const #cir.int<1> : !s32i
392
+ // CHECK-NEXT: %[[N_PLUS_ONE:.*]] = cir.binop(add, %[[N_LOAD2]], %[[CIR_ONE_CONST]]) nsw : !s32i
393
+ // CHECK-NEXT: %[[N_PLUS_ONE_CONV:.*]] = builtin.unrealized_conversion_cast %[[N_PLUS_ONE]] : !s32i to si32
394
+ // CHECK-NEXT: %[[STAR_CONST:.*]] = arith.constant -1 : i64
395
+ // CHECK-NEXT: %[[N_LOAD3:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
396
+ // CHECK-NEXT: %[[CIR_TWO_CONST:.*]] = cir.const #cir.int<2> : !s32i
397
+ // CHECK-NEXT: %[[N_PLUS_TWO:.*]] = cir.binop(add, %[[N_LOAD3]], %[[CIR_TWO_CONST]]) nsw : !s32i
398
+ // CHECK-NEXT: %[[N_PLUS_TWO_CONV:.*]] = builtin.unrealized_conversion_cast %[[N_PLUS_TWO]] : !s32i to si32
399
+ // CHECK-NEXT: acc.loop combined(kernels) gang({static=%[[N_CONV]] : si32, num=%[[N_PLUS_ONE_CONV]] : si32}, {static=%[[STAR_CONST]] : i64, num=%[[N_PLUS_TWO_CONV]] : si32} [#acc.device_type<nvidia>]) {
400
+ // CHECK: acc.yield
401
+ // CHECK-NEXT: } loc
402
+ // CHECK-NEXT: acc.terminator
403
+ // CHECK-NEXT: } loc
322
404
}
0 commit comments