@@ -173,15 +173,30 @@ static void scalarizeMaskedLoad(CallInst *CI, bool &ModifiedDT) {
173
173
return ;
174
174
}
175
175
176
+ // If the mask is not v1i1, use scalar bit test operations. This generates
177
+ // better results on X86 at least.
178
+ Value *SclrMask;
179
+ if (VectorWidth != 1 ) {
180
+ Type *SclrMaskTy = Builder.getIntNTy (VectorWidth);
181
+ SclrMask = Builder.CreateBitCast (Mask, SclrMaskTy, " scalar_mask" );
182
+ }
183
+
176
184
for (unsigned Idx = 0 ; Idx < VectorWidth; ++Idx) {
177
185
// Fill the "else" block, created in the previous iteration
178
186
//
179
187
// %res.phi.else3 = phi <16 x i32> [ %11, %cond.load1 ], [ %res.phi.else, %else ]
180
- // %mask_1 = extractelement <16 x i1> %mask, i32 Idx
188
+ // %mask_1 = and i16 %scalar_mask, (1 << Idx)
189
+ // %cond = icmp ne i16 %mask_1, 0
181
190
// br i1 %cond, label %cond.load, label %else
182
191
//
183
-
184
- Value *Predicate = Builder.CreateExtractElement (Mask, Idx);
192
+ Value *Predicate;
193
+ if (VectorWidth != 1 ) {
194
+ Value *Mask = Builder.getInt (APInt::getOneBitSet (VectorWidth, Idx));
195
+ Predicate = Builder.CreateICmpNE (Builder.CreateAnd (SclrMask, Mask),
196
+ Builder.getIntN (VectorWidth, 0 ));
197
+ } else {
198
+ Predicate = Builder.CreateExtractElement (Mask, Idx);
199
+ }
185
200
186
201
// Create "cond" block
187
202
//
@@ -290,13 +305,29 @@ static void scalarizeMaskedStore(CallInst *CI, bool &ModifiedDT) {
290
305
return ;
291
306
}
292
307
308
+ // If the mask is not v1i1, use scalar bit test operations. This generates
309
+ // better results on X86 at least.
310
+ Value *SclrMask;
311
+ if (VectorWidth != 1 ) {
312
+ Type *SclrMaskTy = Builder.getIntNTy (VectorWidth);
313
+ SclrMask = Builder.CreateBitCast (Mask, SclrMaskTy, " scalar_mask" );
314
+ }
315
+
293
316
for (unsigned Idx = 0 ; Idx < VectorWidth; ++Idx) {
294
317
// Fill the "else" block, created in the previous iteration
295
318
//
296
- // %mask_1 = extractelement <16 x i1> %mask, i32 Idx
319
+ // %mask_1 = and i16 %scalar_mask, (1 << Idx)
320
+ // %cond = icmp ne i16 %mask_1, 0
297
321
// br i1 %cond, label %cond.store, label %else
298
322
//
299
- Value *Predicate = Builder.CreateExtractElement (Mask, Idx);
323
+ Value *Predicate;
324
+ if (VectorWidth != 1 ) {
325
+ Value *Mask = Builder.getInt (APInt::getOneBitSet (VectorWidth, Idx));
326
+ Predicate = Builder.CreateICmpNE (Builder.CreateAnd (SclrMask, Mask),
327
+ Builder.getIntN (VectorWidth, 0 ));
328
+ } else {
329
+ Predicate = Builder.CreateExtractElement (Mask, Idx);
330
+ }
300
331
301
332
// Create "cond" block
302
333
//
@@ -392,15 +423,30 @@ static void scalarizeMaskedGather(CallInst *CI, bool &ModifiedDT) {
392
423
return ;
393
424
}
394
425
426
+ // If the mask is not v1i1, use scalar bit test operations. This generates
427
+ // better results on X86 at least.
428
+ Value *SclrMask;
429
+ if (VectorWidth != 1 ) {
430
+ Type *SclrMaskTy = Builder.getIntNTy (VectorWidth);
431
+ SclrMask = Builder.CreateBitCast (Mask, SclrMaskTy, " scalar_mask" );
432
+ }
433
+
395
434
for (unsigned Idx = 0 ; Idx < VectorWidth; ++Idx) {
396
435
// Fill the "else" block, created in the previous iteration
397
436
//
398
- // %Mask1 = extractelement <16 x i1> %Mask, i32 1
437
+ // %Mask1 = and i16 %scalar_mask, (1 << Idx)
438
+ // %cond = icmp ne i16 %Mask1, 0
399
439
// br i1 %cond, label %cond.load, label %else
400
440
//
401
441
402
- Value *Predicate =
403
- Builder.CreateExtractElement (Mask, Idx, " Mask" + Twine (Idx));
442
+ Value *Predicate;
443
+ if (VectorWidth != 1 ) {
444
+ Value *Mask = Builder.getInt (APInt::getOneBitSet (VectorWidth, Idx));
445
+ Predicate = Builder.CreateICmpNE (Builder.CreateAnd (SclrMask, Mask),
446
+ Builder.getIntN (VectorWidth, 0 ));
447
+ } else {
448
+ Predicate = Builder.CreateExtractElement (Mask, Idx, " Mask" + Twine (Idx));
449
+ }
404
450
405
451
// Create "cond" block
406
452
//
@@ -499,14 +545,29 @@ static void scalarizeMaskedScatter(CallInst *CI, bool &ModifiedDT) {
499
545
return ;
500
546
}
501
547
548
+ // If the mask is not v1i1, use scalar bit test operations. This generates
549
+ // better results on X86 at least.
550
+ Value *SclrMask;
551
+ if (VectorWidth != 1 ) {
552
+ Type *SclrMaskTy = Builder.getIntNTy (VectorWidth);
553
+ SclrMask = Builder.CreateBitCast (Mask, SclrMaskTy, " scalar_mask" );
554
+ }
555
+
502
556
for (unsigned Idx = 0 ; Idx < VectorWidth; ++Idx) {
503
557
// Fill the "else" block, created in the previous iteration
504
558
//
505
- // %Mask1 = extractelement <16 x i1> %Mask, i32 Idx
559
+ // %Mask1 = and i16 %scalar_mask, (1 << Idx)
560
+ // %cond = icmp ne i16 %Mask1, 0
506
561
// br i1 %cond, label %cond.store, label %else
507
562
//
508
- Value *Predicate =
509
- Builder.CreateExtractElement (Mask, Idx, " Mask" + Twine (Idx));
563
+ Value *Predicate;
564
+ if (VectorWidth != 1 ) {
565
+ Value *Mask = Builder.getInt (APInt::getOneBitSet (VectorWidth, Idx));
566
+ Predicate = Builder.CreateICmpNE (Builder.CreateAnd (SclrMask, Mask),
567
+ Builder.getIntN (VectorWidth, 0 ));
568
+ } else {
569
+ Predicate = Builder.CreateExtractElement (Mask, Idx, " Mask" + Twine (Idx));
570
+ }
510
571
511
572
// Create "cond" block
512
573
//
0 commit comments