@@ -279,8 +279,8 @@ define <8 x i16> @gep01_load_i16_insert_v8i16_deref_minalign(ptr align 2 derefer
 ; must be a multiple of element size.
 ; TODO: Could bitcast around this limitation.
 
-define <4 x i32> @gep01_bitcast_load_i32_insert_v4i32(ptr align 1 dereferenceable(16) %p) {
-; CHECK-LABEL: @gep01_bitcast_load_i32_insert_v4i32(
+define <4 x i32> @gep01_bitcast_load_i32_from_v16i8_insert_v4i32(ptr align 1 dereferenceable(16) %p) {
+; CHECK-LABEL: @gep01_bitcast_load_i32_from_v16i8_insert_v4i32(
 ; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <16 x i8>, ptr [[P:%.*]], i64 0, i64 1
 ; CHECK-NEXT: [[S:%.*]] = load i32, ptr [[GEP]], align 1
 ; CHECK-NEXT: [[R:%.*]] = insertelement <4 x i32> undef, i32 [[S]], i64 0
@@ -292,6 +292,84 @@ define <4 x i32> @gep01_bitcast_load_i32_insert_v4i32(ptr align 1 dereferenceabl
   ret <4 x i32> %r
 }
 
+define <2 x i64> @gep01_bitcast_load_i64_from_v16i8_insert_v2i64(ptr align 1 dereferenceable(16) %p) {
+; CHECK-LABEL: @gep01_bitcast_load_i64_from_v16i8_insert_v2i64(
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <16 x i8>, ptr [[P:%.*]], i64 0, i64 1
+; CHECK-NEXT: [[S:%.*]] = load i64, ptr [[GEP]], align 1
+; CHECK-NEXT: [[R:%.*]] = insertelement <2 x i64> undef, i64 [[S]], i64 0
+; CHECK-NEXT: ret <2 x i64> [[R]]
+;
+  %gep = getelementptr inbounds <16 x i8>, ptr %p, i64 0, i64 1
+  %s = load i64, ptr %gep, align 1
+  %r = insertelement <2 x i64> undef, i64 %s, i64 0
+  ret <2 x i64> %r
+}
+
+define <4 x i32> @gep11_bitcast_load_i32_from_v16i8_insert_v4i32(ptr align 1 dereferenceable(16) %p) {
+; CHECK-LABEL: @gep11_bitcast_load_i32_from_v16i8_insert_v4i32(
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <16 x i8>, ptr [[P:%.*]], i64 0, i64 11
+; CHECK-NEXT: [[S:%.*]] = load i32, ptr [[GEP]], align 1
+; CHECK-NEXT: [[R:%.*]] = insertelement <4 x i32> undef, i32 [[S]], i64 0
+; CHECK-NEXT: ret <4 x i32> [[R]]
+;
+  %gep = getelementptr inbounds <16 x i8>, ptr %p, i64 0, i64 11
+  %s = load i32, ptr %gep, align 1
+  %r = insertelement <4 x i32> undef, i32 %s, i64 0
+  ret <4 x i32> %r
+}
+
+define <4 x i32> @gep01_bitcast_load_i32_from_v8i16_insert_v4i32(ptr align 1 dereferenceable(16) %p) {
+; CHECK-LABEL: @gep01_bitcast_load_i32_from_v8i16_insert_v4i32(
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 0, i64 1
+; CHECK-NEXT: [[S:%.*]] = load i32, ptr [[GEP]], align 1
+; CHECK-NEXT: [[R:%.*]] = insertelement <4 x i32> undef, i32 [[S]], i64 0
+; CHECK-NEXT: ret <4 x i32> [[R]]
+;
+  %gep = getelementptr inbounds <8 x i16>, ptr %p, i64 0, i64 1
+  %s = load i32, ptr %gep, align 1
+  %r = insertelement <4 x i32> undef, i32 %s, i64 0
+  ret <4 x i32> %r
+}
+
+define <2 x i64> @gep01_bitcast_load_i64_from_v8i16_insert_v2i64(ptr align 1 dereferenceable(16) %p) {
+; CHECK-LABEL: @gep01_bitcast_load_i64_from_v8i16_insert_v2i64(
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 0, i64 1
+; CHECK-NEXT: [[S:%.*]] = load i64, ptr [[GEP]], align 1
+; CHECK-NEXT: [[R:%.*]] = insertelement <2 x i64> undef, i64 [[S]], i64 0
+; CHECK-NEXT: ret <2 x i64> [[R]]
+;
+  %gep = getelementptr inbounds <8 x i16>, ptr %p, i64 0, i64 1
+  %s = load i64, ptr %gep, align 1
+  %r = insertelement <2 x i64> undef, i64 %s, i64 0
+  ret <2 x i64> %r
+}
+
+define <4 x i32> @gep05_bitcast_load_i32_from_v8i16_insert_v4i32(ptr align 1 dereferenceable(16) %p) {
+; CHECK-LABEL: @gep05_bitcast_load_i32_from_v8i16_insert_v4i32(
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 0, i64 5
+; CHECK-NEXT: [[S:%.*]] = load i32, ptr [[GEP]], align 1
+; CHECK-NEXT: [[R:%.*]] = insertelement <4 x i32> undef, i32 [[S]], i64 0
+; CHECK-NEXT: ret <4 x i32> [[R]]
+;
+  %gep = getelementptr inbounds <8 x i16>, ptr %p, i64 0, i64 5
+  %s = load i32, ptr %gep, align 1
+  %r = insertelement <4 x i32> undef, i32 %s, i64 0
+  ret <4 x i32> %r
+}
+
+define <2 x i64> @gep01_bitcast_load_i32_from_v4i32_insert_v2i64(ptr align 1 dereferenceable(16) %p) {
+; CHECK-LABEL: @gep01_bitcast_load_i32_from_v4i32_insert_v2i64(
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <4 x i32>, ptr [[P:%.*]], i64 0, i64 1
+; CHECK-NEXT: [[S:%.*]] = load i64, ptr [[GEP]], align 1
+; CHECK-NEXT: [[R:%.*]] = insertelement <2 x i64> undef, i64 [[S]], i64 0
+; CHECK-NEXT: ret <2 x i64> [[R]]
+;
+  %gep = getelementptr inbounds <4 x i32>, ptr %p, i64 0, i64 1
+  %s = load i64, ptr %gep, align 1
+  %r = insertelement <2 x i64> undef, i64 %s, i64 0
+  ret <2 x i64> %r
+}
+
 define <4 x i32> @gep012_bitcast_load_i32_insert_v4i32(ptr align 1 dereferenceable(20) %p) nofree nosync {
 ; CHECK-LABEL: @gep012_bitcast_load_i32_insert_v4i32(
 ; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <16 x i8>, ptr [[P:%.*]], i64 0, i64 12
@@ -322,6 +400,58 @@ define <4 x i32> @gep013_bitcast_load_i32_insert_v4i32(ptr align 1 dereferenceab
   ret <4 x i32> %r
 }
 
+define <4 x i32> @gep07_bitcast_load_i32_from_v8i16_insert_v4i32(ptr align 1 dereferenceable(16) %p) {
+; CHECK-LABEL: @gep07_bitcast_load_i32_from_v8i16_insert_v4i32(
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 0, i64 7
+; CHECK-NEXT: [[S:%.*]] = load i32, ptr [[GEP]], align 1
+; CHECK-NEXT: [[R:%.*]] = insertelement <4 x i32> undef, i32 [[S]], i64 0
+; CHECK-NEXT: ret <4 x i32> [[R]]
+;
+  %gep = getelementptr inbounds <8 x i16>, ptr %p, i64 0, i64 7
+  %s = load i32, ptr %gep, align 1
+  %r = insertelement <4 x i32> undef, i32 %s, i64 0
+  ret <4 x i32> %r
+}
+
+define <2 x i64> @gep03_bitcast_load_i32_from_v4i32_insert_v2i64(ptr align 1 dereferenceable(16) %p) {
+; CHECK-LABEL: @gep03_bitcast_load_i32_from_v4i32_insert_v2i64(
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <4 x i32>, ptr [[P:%.*]], i64 0, i64 3
+; CHECK-NEXT: [[S:%.*]] = load i64, ptr [[GEP]], align 1
+; CHECK-NEXT: [[R:%.*]] = insertelement <2 x i64> undef, i64 [[S]], i64 0
+; CHECK-NEXT: ret <2 x i64> [[R]]
+;
+  %gep = getelementptr inbounds <4 x i32>, ptr %p, i64 0, i64 3
+  %s = load i64, ptr %gep, align 1
+  %r = insertelement <2 x i64> undef, i64 %s, i64 0
+  ret <2 x i64> %r
+}
+
+define <2 x i64> @gep09_bitcast_load_i64_from_v16i8_insert_v2i64(ptr align 1 dereferenceable(16) %p) #0 {
+; CHECK-LABEL: @gep09_bitcast_load_i64_from_v16i8_insert_v2i64(
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <16 x i8>, ptr [[P:%.*]], i64 0, i64 9
+; CHECK-NEXT: [[S:%.*]] = load i64, ptr [[GEP]], align 1
+; CHECK-NEXT: [[R:%.*]] = insertelement <2 x i64> undef, i64 [[S]], i64 0
+; CHECK-NEXT: ret <2 x i64> [[R]]
+;
+  %gep = getelementptr inbounds <16 x i8>, ptr %p, i64 0, i64 9
+  %s = load i64, ptr %gep, align 1
+  %r = insertelement <2 x i64> undef, i64 %s, i64 0
+  ret <2 x i64> %r
+}
+
+define <2 x i64> @gep05_bitcast_load_i64_from_v8i16_insert_v2i64(ptr align 1 dereferenceable(16) %p) {
+; CHECK-LABEL: @gep05_bitcast_load_i64_from_v8i16_insert_v2i64(
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 0, i64 5
+; CHECK-NEXT: [[S:%.*]] = load i64, ptr [[GEP]], align 1
+; CHECK-NEXT: [[R:%.*]] = insertelement <2 x i64> undef, i64 [[S]], i64 0
+; CHECK-NEXT: ret <2 x i64> [[R]]
+;
+  %gep = getelementptr inbounds <8 x i16>, ptr %p, i64 0, i64 5
+  %s = load i64, ptr %gep, align 1
+  %r = insertelement <2 x i64> undef, i64 %s, i64 0
+  ret <2 x i64> %r
+}
+
 ; If there are enough dereferenceable bytes, we can offset the vector load.
 
 define <8 x i16> @gep10_load_i16_insert_v8i16(ptr align 16 dereferenceable(32) %p) nofree nosync {