@@ -1682,6 +1682,11 @@ bool HWConformity::fixDstAlignment(INST_LIST_ITER i, G4_BB *bb, G4_Type extype,
1682
1682
return true ;
1683
1683
}
1684
1684
1685
+ if (byteDst && builder.getNativeExecSize () >= g4::SIMD16) {
1686
+ // For pvc+, leave byte dst to be fixed in fixByteXBarRestriction() later
1687
+ return false ;
1688
+ }
1689
+
1685
1690
if (builder.hasBFMixMode () && extype == Type_F &&
1686
1691
inst->getDst ()->getType () == Type_BF && !inst->isDpas ()) {
1687
1692
// For now, BF mixed mode should not need this check.
@@ -6516,15 +6521,30 @@ bool HWConformity::splitInstListForByteDst(INST_LIST_ITER it, G4_BB *bb,
6516
6521
G4_INST *inst = *it;
6517
6522
G4_opcode inst_op = inst->opcode ();
6518
6523
G4_DstRegRegion *dst = inst->getDst ();
6519
- // check if we can split the inst
6520
- if (!canSplitByteDst (inst_op) || inst->getExecSize () == g4::SIMD1 ||
6521
- (!bb->isAllLaneActive () && !inst->isWriteEnableInst ()) ||
6522
- dst->getByteOffset () % extypesize != 0 || dst->getHorzStride () != 1 ||
6523
- extypesize != TypeSize (Type_W)) {
6524
- return false ;
6524
+
6525
+ bool hasDstSrcOverlap = false ;
6526
+ if (dst && !inst->hasNULLDst ()) {
6527
+ auto srcNum = inst->getNumSrc ();
6528
+ for (int i = 0 ; i < srcNum; i++) {
6529
+ G4_CmpRelation rel = dst->compareOperand (inst->getSrc (i), builder);
6530
+ if (rel != Rel_disjoint) {
6531
+ hasDstSrcOverlap = true ;
6532
+ break ;
6533
+ }
6534
+ }
6525
6535
}
6526
6536
6527
- if (inst->getPredicate () || inst->getCondMod ()) {
6537
+ // check if we can split the inst
6538
+ if (!canSplitByteDst (inst_op) ||
6539
+ inst->getExecSize () == g4::SIMD1 ||
6540
+ (!bb->isAllLaneActive () && !inst->isWriteEnableInst ()) ||
6541
+ dst->getByteOffset () % extypesize != 0 ||
6542
+ dst->getHorzStride () != 1 ||
6543
+ extypesize != TypeSize (Type_W) ||
6544
+ inst->getPredicate () ||
6545
+ inst->getCondMod () ||
6546
+ // Do not split the instruction if dst has overlap with sources
6547
+ hasDstSrcOverlap) {
6528
6548
return false ;
6529
6549
}
6530
6550
@@ -9076,7 +9096,6 @@ void HWConformity::fixByteXBarRestriction(INST_LIST_ITER it, G4_BB *bb) {
9076
9096
}
9077
9097
unsigned int new_option = inst->getMaskOption ();
9078
9098
auto pos = it;
9079
- pos++;
9080
9099
auto dstride = dst->getHorzStride ();
9081
9100
const RegionDesc *shiftRegion = builder.createRegionDesc (dstride, 1 , 0 );
9082
9101
G4_Declare *shiftDcl = builder.createTempVar (
@@ -9088,12 +9107,12 @@ void HWConformity::fixByteXBarRestriction(INST_LIST_ITER it, G4_BB *bb) {
9088
9107
G4_INST *packInst = builder.createMov (inst->getExecSize (), packTmp,
9089
9108
unpackSrc, new_option, false );
9090
9109
packInst->setPredicate (pred);
9091
- bb->insertBefore (pos, packInst);
9110
+ pos = bb->insertAfter (pos, packInst);
9092
9111
// then shift the bytes and words location
9093
9112
G4_INST *shiftInst = builder.createMov (inst->getExecSize (), dst, shiftSrc,
9094
9113
new_option, false );
9095
9114
shiftInst->setPredicate (pred);
9096
- bb->insertBefore (pos, shiftInst);
9115
+ pos = bb->insertAfter (pos, shiftInst);
9097
9116
// update propagation info
9098
9117
maintainDU4TempMov (inst, shiftInst);
9099
9118
// change the destination of the original instruction
0 commit comments