-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[DAG] SimplifyDemandedVectorElts - add handling for INT<->FP conversions #117884
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -141,56 +141,61 @@ declare <8 x half> @llvm.ldexp.v8f16.v8i16(<8 x half>, <8 x i16>) | |
define <8 x half> @fmul_pow2_8xhalf(<8 x i16> %i) { | ||
; CHECK-SSE-LABEL: fmul_pow2_8xhalf: | ||
; CHECK-SSE: # %bb.0: | ||
; CHECK-SSE-NEXT: subq $88, %rsp | ||
; CHECK-SSE-NEXT: .cfi_def_cfa_offset 96 | ||
; CHECK-SSE-NEXT: subq $104, %rsp | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Minor regression: the scalarisation of the f16 conversions causes an extra spill (and the use of instructions that can't fold loads) - I'm not convinced this is a showstopper, to be honest. |
||
; CHECK-SSE-NEXT: .cfi_def_cfa_offset 112 | ||
; CHECK-SSE-NEXT: movdqa %xmm0, %xmm1 | ||
; CHECK-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] | ||
; CHECK-SSE-NEXT: pslld $23, %xmm1 | ||
; CHECK-SSE-NEXT: movdqa {{.*#+}} xmm2 = [1065353216,1065353216,1065353216,1065353216] | ||
; CHECK-SSE-NEXT: paddd %xmm2, %xmm1 | ||
; CHECK-SSE-NEXT: cvttps2dq %xmm1, %xmm1 | ||
; CHECK-SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill | ||
; CHECK-SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill | ||
; CHECK-SSE-NEXT: pslld $16, %xmm1 | ||
; CHECK-SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill | ||
; CHECK-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] | ||
; CHECK-SSE-NEXT: pslld $23, %xmm0 | ||
; CHECK-SSE-NEXT: paddd %xmm2, %xmm0 | ||
; CHECK-SSE-NEXT: cvttps2dq %xmm0, %xmm0 | ||
; CHECK-SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill | ||
; CHECK-SSE-NEXT: pslld $16, %xmm0 | ||
; CHECK-SSE-NEXT: psrld $16, %xmm0 | ||
; CHECK-SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill | ||
; CHECK-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] | ||
; CHECK-SSE-NEXT: psrld $16, %xmm0 | ||
; CHECK-SSE-NEXT: cvtdq2ps %xmm0, %xmm0 | ||
; CHECK-SSE-NEXT: callq __truncsfhf2@PLT | ||
; CHECK-SSE-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill | ||
; CHECK-SSE-NEXT: cvtdq2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload | ||
; CHECK-SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload | ||
; CHECK-SSE-NEXT: psrlq $48, %xmm0 | ||
; CHECK-SSE-NEXT: cvtdq2ps %xmm0, %xmm0 | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The bigger problem appears to be all these cvtdq2ps(shuffle) patterns, which could be replaced with shuffle(cvtdq2ps) so that they all share the same cvtdq2ps. |
||
; CHECK-SSE-NEXT: callq __truncsfhf2@PLT | ||
; CHECK-SSE-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill | ||
; CHECK-SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload | ||
; CHECK-SSE-NEXT: # xmm0 = mem[2,3,2,3] | ||
; CHECK-SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload | ||
; CHECK-SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero | ||
; CHECK-SSE-NEXT: cvtdq2ps %xmm0, %xmm0 | ||
; CHECK-SSE-NEXT: callq __truncsfhf2@PLT | ||
; CHECK-SSE-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill | ||
; CHECK-SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload | ||
; CHECK-SSE-NEXT: # xmm0 = mem[3,3,3,3] | ||
; CHECK-SSE-NEXT: cvtdq2ps %xmm0, %xmm0 | ||
; CHECK-SSE-NEXT: xorps %xmm0, %xmm0 | ||
; CHECK-SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload | ||
; CHECK-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] | ||
; CHECK-SSE-NEXT: cvtdq2ps %xmm1, %xmm0 | ||
; CHECK-SSE-NEXT: callq __truncsfhf2@PLT | ||
; CHECK-SSE-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill | ||
; CHECK-SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload | ||
; CHECK-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 | ||
; CHECK-SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill | ||
; CHECK-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] | ||
; CHECK-SSE-NEXT: psrld $16, %xmm0 | ||
; CHECK-SSE-NEXT: cvtdq2ps %xmm0, %xmm0 | ||
; CHECK-SSE-NEXT: callq __truncsfhf2@PLT | ||
; CHECK-SSE-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill | ||
; CHECK-SSE-NEXT: cvtdq2ps (%rsp), %xmm0 # 16-byte Folded Reload | ||
; CHECK-SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload | ||
; CHECK-SSE-NEXT: psrlq $48, %xmm0 | ||
; CHECK-SSE-NEXT: cvtdq2ps %xmm0, %xmm0 | ||
; CHECK-SSE-NEXT: callq __truncsfhf2@PLT | ||
; CHECK-SSE-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill | ||
; CHECK-SSE-NEXT: pshufd $238, (%rsp), %xmm0 # 16-byte Folded Reload | ||
; CHECK-SSE-NEXT: # xmm0 = mem[2,3,2,3] | ||
; CHECK-SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload | ||
; CHECK-SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero | ||
; CHECK-SSE-NEXT: cvtdq2ps %xmm0, %xmm0 | ||
; CHECK-SSE-NEXT: callq __truncsfhf2@PLT | ||
; CHECK-SSE-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill | ||
; CHECK-SSE-NEXT: pshufd $255, (%rsp), %xmm0 # 16-byte Folded Reload | ||
; CHECK-SSE-NEXT: # xmm0 = mem[3,3,3,3] | ||
; CHECK-SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload | ||
; CHECK-SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] | ||
; CHECK-SSE-NEXT: cvtdq2ps %xmm0, %xmm0 | ||
; CHECK-SSE-NEXT: callq __truncsfhf2@PLT | ||
; CHECK-SSE-NEXT: callq __extendhfsf2@PLT | ||
|
@@ -202,39 +207,39 @@ define <8 x half> @fmul_pow2_8xhalf(<8 x i16> %i) { | |
; CHECK-SSE-NEXT: callq __extendhfsf2@PLT | ||
; CHECK-SSE-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 | ||
; CHECK-SSE-NEXT: callq __truncsfhf2@PLT | ||
; CHECK-SSE-NEXT: punpcklwd (%rsp), %xmm0 # 16-byte Folded Reload | ||
; CHECK-SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] | ||
; CHECK-SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill | ||
; CHECK-SSE-NEXT: movdqa (%rsp), %xmm1 # 16-byte Reload | ||
; CHECK-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] | ||
; CHECK-SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill | ||
; CHECK-SSE-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload | ||
; CHECK-SSE-NEXT: # xmm0 = mem[0],zero,zero,zero | ||
; CHECK-SSE-NEXT: callq __extendhfsf2@PLT | ||
; CHECK-SSE-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 | ||
; CHECK-SSE-NEXT: callq __truncsfhf2@PLT | ||
; CHECK-SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill | ||
; CHECK-SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill | ||
; CHECK-SSE-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload | ||
; CHECK-SSE-NEXT: # xmm0 = mem[0],zero,zero,zero | ||
; CHECK-SSE-NEXT: callq __extendhfsf2@PLT | ||
; CHECK-SSE-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 | ||
; CHECK-SSE-NEXT: callq __truncsfhf2@PLT | ||
; CHECK-SSE-NEXT: movdqa (%rsp), %xmm1 # 16-byte Reload | ||
; CHECK-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] | ||
; CHECK-SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload | ||
; CHECK-SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] | ||
; CHECK-SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill | ||
; CHECK-SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload | ||
; CHECK-SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] | ||
; CHECK-SSE-NEXT: punpckldq (%rsp), %xmm0 # 16-byte Folded Reload | ||
; CHECK-SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] | ||
; CHECK-SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill | ||
; CHECK-SSE-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload | ||
; CHECK-SSE-NEXT: # xmm0 = mem[0],zero,zero,zero | ||
; CHECK-SSE-NEXT: callq __extendhfsf2@PLT | ||
; CHECK-SSE-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 | ||
; CHECK-SSE-NEXT: callq __truncsfhf2@PLT | ||
; CHECK-SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill | ||
; CHECK-SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill | ||
; CHECK-SSE-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload | ||
; CHECK-SSE-NEXT: # xmm0 = mem[0],zero,zero,zero | ||
; CHECK-SSE-NEXT: callq __extendhfsf2@PLT | ||
; CHECK-SSE-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 | ||
; CHECK-SSE-NEXT: callq __truncsfhf2@PLT | ||
; CHECK-SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload | ||
; CHECK-SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] | ||
; CHECK-SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill | ||
; CHECK-SSE-NEXT: movdqa (%rsp), %xmm1 # 16-byte Reload | ||
; CHECK-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] | ||
; CHECK-SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill | ||
; CHECK-SSE-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload | ||
; CHECK-SSE-NEXT: # xmm0 = mem[0],zero,zero,zero | ||
; CHECK-SSE-NEXT: callq __extendhfsf2@PLT | ||
|
@@ -246,14 +251,13 @@ define <8 x half> @fmul_pow2_8xhalf(<8 x i16> %i) { | |
; CHECK-SSE-NEXT: callq __extendhfsf2@PLT | ||
; CHECK-SSE-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 | ||
; CHECK-SSE-NEXT: callq __truncsfhf2@PLT | ||
; CHECK-SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload | ||
; CHECK-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] | ||
; CHECK-SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload | ||
; CHECK-SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] | ||
; CHECK-SSE-NEXT: punpcklqdq (%rsp), %xmm1 # 16-byte Folded Reload | ||
; CHECK-SSE-NEXT: # xmm1 = xmm1[0],mem[0] | ||
; CHECK-SSE-NEXT: movdqa %xmm1, %xmm0 | ||
; CHECK-SSE-NEXT: addq $88, %rsp | ||
; CHECK-SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload | ||
; CHECK-SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] | ||
; CHECK-SSE-NEXT: punpckldq (%rsp), %xmm0 # 16-byte Folded Reload | ||
; CHECK-SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] | ||
; CHECK-SSE-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload | ||
; CHECK-SSE-NEXT: # xmm0 = xmm0[0],mem[0] | ||
; CHECK-SSE-NEXT: addq $104, %rsp | ||
; CHECK-SSE-NEXT: .cfi_def_cfa_offset 8 | ||
; CHECK-SSE-NEXT: retq | ||
; | ||
|
@@ -1028,17 +1032,17 @@ define <2 x half> @fmul_pow_shl_cnt_vec_fail_to_large(<2 x i16> %cnt) nounwind { | |
; CHECK-SSE-NEXT: pslld $23, %xmm0 | ||
; CHECK-SSE-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 | ||
; CHECK-SSE-NEXT: cvttps2dq %xmm0, %xmm0 | ||
; CHECK-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7] | ||
; CHECK-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [2,2,u,u,u,u,u,u] | ||
; CHECK-SSE-NEXT: pxor %xmm0, %xmm0 | ||
; CHECK-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] | ||
; CHECK-SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill | ||
; CHECK-SSE-NEXT: cvtdq2ps %xmm1, %xmm0 | ||
; CHECK-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] | ||
; CHECK-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2,2,u,u,u,u,u,u] | ||
; CHECK-SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill | ||
; CHECK-SSE-NEXT: psrld $16, %xmm0 | ||
; CHECK-SSE-NEXT: cvtdq2ps %xmm0, %xmm0 | ||
; CHECK-SSE-NEXT: callq __truncsfhf2@PLT | ||
; CHECK-SSE-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill | ||
; CHECK-SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload | ||
; CHECK-SSE-NEXT: # xmm0 = mem[1,1,1,1] | ||
; CHECK-SSE-NEXT: cvtdq2ps %xmm0, %xmm0 | ||
; CHECK-SSE-NEXT: xorps %xmm0, %xmm0 | ||
; CHECK-SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload | ||
; CHECK-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] | ||
; CHECK-SSE-NEXT: cvtdq2ps %xmm1, %xmm0 | ||
; CHECK-SSE-NEXT: callq __truncsfhf2@PLT | ||
; CHECK-SSE-NEXT: callq __extendhfsf2@PLT | ||
; CHECK-SSE-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 | ||
|
@@ -1049,8 +1053,9 @@ define <2 x half> @fmul_pow_shl_cnt_vec_fail_to_large(<2 x i16> %cnt) nounwind { | |
; CHECK-SSE-NEXT: callq __extendhfsf2@PLT | ||
; CHECK-SSE-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 | ||
; CHECK-SSE-NEXT: callq __truncsfhf2@PLT | ||
; CHECK-SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload | ||
; CHECK-SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] | ||
; CHECK-SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload | ||
; CHECK-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] | ||
; CHECK-SSE-NEXT: movdqa %xmm1, %xmm0 | ||
; CHECK-SSE-NEXT: addq $40, %rsp | ||
; CHECK-SSE-NEXT: retq | ||
; | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@stefanp-ibm I'm assuming this change is neutral - is that correct?