@@ -3690,7 +3690,7 @@ bool Optimizer::foldPseudoNot(G4_BB *bb, INST_LIST_ITER &iter) {
3690
3690
}
3691
3691
3692
3692
/* **
3693
- this function optmize the following cases:
3693
+ this function optimizes the following cases:
3694
3694
3695
3695
case 1:
3696
3696
cmp.gt.P0 s0 s1
@@ -3723,7 +3723,7 @@ mov (1) P0 Imm (NoMask)
3723
3723
smov (8) r[A0, 0] src0 src1 Imm
3724
3724
3725
3725
case 5:
3726
- psuedo_not (1) P2 P1
3726
+ pseudo_not (1) P2 P1
3727
3727
and (1) P4 P3 P2
3728
3728
==>
3729
3729
and (1) P4 P3 ~P1
@@ -3818,7 +3818,7 @@ void Optimizer::optimizeLogicOperation() {
3818
3818
merged = foldPseudoAndOr (bb, ii);
3819
3819
}
3820
3820
3821
- // translate the psuedo op
3821
+ // translate the pseudo op
3822
3822
if (!merged) {
3823
3823
expandPseudoLogic (builder, bb, ii);
3824
3824
}
@@ -3835,7 +3835,9 @@ bool Optimizer::foldPseudoAndOr(G4_BB *bb, INST_LIST_ITER &ii) {
3835
3835
3836
3836
// optimization should apply even when the dst of the pseudo-and/pseudo-or is
3837
3837
// global, since we are just hoisting it up, and WAR/WAW checks should be
3838
- // performed as we search for the src0 and src1 inst.
3838
+ // performed as we search for the src0 and src1 inst. Also need to check if
3839
+ // the mask option of the pseudo-and/pseudo-or matches with the options of
3840
+ // the defining instructions when dst is global.
3839
3841
3840
3842
G4_INST *inst = *ii;
3841
3843
// look for def of srcs
@@ -3852,7 +3854,7 @@ bool Optimizer::foldPseudoAndOr(G4_BB *bb, INST_LIST_ITER &ii) {
3852
3854
3853
3855
The new code uses defInstList directly, and aborts if there are more than
3854
3856
two definitions. Which means there is more than one instruction writing to
3855
- source. Disadvantage of that is that it is less precisise . For example if we
3857
+ source. Disadvantage of that is that it is less precise. For example if we
3856
3858
are folding into the closest definition, then before it was OK, but now it will be
3857
3859
disallowed.
3858
3860
*/
@@ -3889,13 +3891,13 @@ bool Optimizer::foldPseudoAndOr(G4_BB *bb, INST_LIST_ITER &ii) {
3889
3891
std::swap (defInstructions[0 ], defInstructions[1 ]);
3890
3892
std::swap (maxSrc1, maxSrc2);
3891
3893
}
3892
- // Doing backward scan until earlist src to make sure dst of and/or is not
3894
+ // Doing backward scan until earliest src to make sure dst of and/or is not
3893
3895
// being written to or being read
3894
3896
/*
3895
3897
handling case like in spmv_csr
3896
- cmp.lt (M1, 1) P15 V40(0,0)<0;1,0> 0x10:w /// $191 cmp.lt (M1, 1) P16
3897
- V110(0,0)<0;1,0> V34(0,0)<0;1,0> /// $192 and (M1,
3898
- 1) P16 P16 P15 /// $193
3898
+ cmp.lt (M1, 1) P15 V40(0,0)<0;1,0> 0x10:w /// $191
3899
+ cmp.lt (M1, 1) P16 V110(0,0)<0;1,0> V34(0,0)<0;1,0> /// $192
3900
+ and (M1, 1) P16 P16 P15 /// $193
3899
3901
*/
3900
3902
if (chkBwdOutputHazard (defInstructions[1 ], ii, defInstructions[0 ])) {
3901
3903
return false ;
@@ -3950,6 +3952,13 @@ bool Optimizer::foldPseudoAndOr(G4_BB *bb, INST_LIST_ITER &ii) {
3950
3952
return false ;
3951
3953
}
3952
3954
3955
+ // Check if mask options are mismatched between the pseudo-and/pseudo-or and
3956
+ // its defining instructions.
3957
+ if ((inst->getMaskOption () != src0DefInst->getMaskOption () ||
3958
+ inst->getMaskOption () != src1DefInst->getMaskOption ()) &&
3959
+ fg.globalOpndHT .isOpndGlobal (inst->getDst ()))
3960
+ return false ;
3961
+
3953
3962
// do the case 3 optimization
3954
3963
3955
3964
G4_PredState ps =
0 commit comments