@@ -416,8 +416,6 @@ int64_t GCNTTIImpl::getMaxMemIntrinsicInlineSizeThreshold() const {
   return 1024;
 }
 
-// FIXME: Should we use narrower types for local/region, or account for when
-// unaligned access is legal?
 Type *GCNTTIImpl::getMemcpyLoopLoweringType(
     LLVMContext &Context, Value *Length, unsigned SrcAddrSpace,
     unsigned DestAddrSpace, Align SrcAlign, Align DestAlign,
@@ -426,29 +424,12 @@ Type *GCNTTIImpl::getMemcpyLoopLoweringType(
   if (AtomicElementSize)
     return Type::getIntNTy(Context, *AtomicElementSize * 8);
 
-  Align MinAlign = std::min(SrcAlign, DestAlign);
-
-  // A (multi-)dword access at an address == 2 (mod 4) will be decomposed by the
-  // hardware into byte accesses. If you assume all alignments are equally
-  // probable, it's more efficient on average to use short accesses for this
-  // case.
-  if (MinAlign == Align(2))
-    return Type::getInt16Ty(Context);
-
-  // Not all subtargets have 128-bit DS instructions, and we currently don't
-  // form them by default.
-  if (SrcAddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
-      SrcAddrSpace == AMDGPUAS::REGION_ADDRESS ||
-      DestAddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
-      DestAddrSpace == AMDGPUAS::REGION_ADDRESS) {
-    return FixedVectorType::get(Type::getInt32Ty(Context), 2);
-  }
-
-  // Global memory works best with 16-byte accesses.
+  // 16-byte accesses achieve the highest copy throughput.
   // If the operation has a fixed known length that is large enough, it is
   // worthwhile to return an even wider type and let legalization lower it into
-  // multiple accesses, effectively unrolling the memcpy loop. Private memory
-  // also hits this, although accesses may be decomposed.
+  // multiple accesses, effectively unrolling the memcpy loop.
+  // We also rely on legalization to decompose into smaller accesses for
+  // subtargets and address spaces where it is necessary.
   //
   // Don't unroll if Length is not a constant, since unrolling leads to worse
   // performance for length values that are smaller or slightly larger than the
@@ -473,26 +454,22 @@ void GCNTTIImpl::getMemcpyLoopResidualLoweringType(
       OpsOut, Context, RemainingBytes, SrcAddrSpace, DestAddrSpace, SrcAlign,
       DestAlign, AtomicCpySize);
 
-  Align MinAlign = std::min(SrcAlign, DestAlign);
-
-  if (MinAlign != Align(2)) {
-    Type *I32x4Ty = FixedVectorType::get(Type::getInt32Ty(Context), 4);
-    while (RemainingBytes >= 16) {
-      OpsOut.push_back(I32x4Ty);
-      RemainingBytes -= 16;
-    }
+  Type *I32x4Ty = FixedVectorType::get(Type::getInt32Ty(Context), 4);
+  while (RemainingBytes >= 16) {
+    OpsOut.push_back(I32x4Ty);
+    RemainingBytes -= 16;
+  }
 
-    Type *I64Ty = Type::getInt64Ty(Context);
-    while (RemainingBytes >= 8) {
-      OpsOut.push_back(I64Ty);
-      RemainingBytes -= 8;
-    }
+  Type *I64Ty = Type::getInt64Ty(Context);
+  while (RemainingBytes >= 8) {
+    OpsOut.push_back(I64Ty);
+    RemainingBytes -= 8;
+  }
 
-    Type *I32Ty = Type::getInt32Ty(Context);
-    while (RemainingBytes >= 4) {
-      OpsOut.push_back(I32Ty);
-      RemainingBytes -= 4;
-    }
+  Type *I32Ty = Type::getInt32Ty(Context);
+  while (RemainingBytes >= 4) {
+    OpsOut.push_back(I32Ty);
+    RemainingBytes -= 4;
+  }
   }
 
   Type *I16Ty = Type::getInt16Ty(Context);
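
For review purposes, a minimal standalone sketch of the greedy residual decomposition this hunk now applies unconditionally. It is an illustration, not the patch itself: the helper name decomposeResidual is invented, and the i16/i8 tail loops are assumed to follow the same pattern as the visible i32 one (the hunk's trailing context only shows the i16 loop beginning).

#include <cstdint>
#include <iostream>
#include <vector>

// Greedy tail decomposition in the spirit of the patched
// getMemcpyLoopResidualLoweringType: try 16-byte (v4i32) accesses first,
// then i64, i32, i16, i8, with no MinAlign == 2 special case.
static std::vector<unsigned> decomposeResidual(uint64_t RemainingBytes) {
  std::vector<unsigned> Widths; // access sizes in bytes
  for (unsigned Size : {16u, 8u, 4u, 2u, 1u}) {
    while (RemainingBytes >= Size) {
      Widths.push_back(Size);
      RemainingBytes -= Size;
    }
  }
  return Widths;
}

int main() {
  // A 29-byte residue becomes one v4i32, one i64, one i32, and one i8.
  for (unsigned W : decomposeResidual(29))
    std::cout << W << ' '; // prints: 16 8 4 1
  std::cout << '\n';
  return 0;
}

With the MinAlign == 2 guard gone, a 2-aligned residue now also takes this wide path and relies on legalization to split any access the target cannot perform directly, matching the new comment in getMemcpyLoopLoweringType.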