Skip to content

[AArch64][GlobalISel] Clean up CTLZ vector type legalization. #131514

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into llvm:main
Mar 19, 2025

Conversation

davemgreen
Copy link
Collaborator

Similar to other operations, vectors with s8, s16 and s32 elements are clamped to legal vector sizes, but in this case vectors with s64 elements are scalarized so that they use the GPR instructions. This allows vector types to be split rather than scalarized.

@llvmbot
Copy link
Member

llvmbot commented Mar 16, 2025

@llvm/pr-subscribers-llvm-globalisel

@llvm/pr-subscribers-backend-aarch64

Author: David Green (davemgreen)

Changes

Similar to other operations, vectors with s8, s16 and s32 elements are clamped to legal vector sizes, but in this case vectors with s64 elements are scalarized so that they use the GPR instructions. This allows vector types to be split rather than scalarized.


Full diff: https://github.com/llvm/llvm-project/pull/131514.diff

3 Files Affected:

  • (modified) llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp (+1)
  • (modified) llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp (+14-3)
  • (modified) llvm/test/CodeGen/AArch64/ctlz.ll (+62-329)
diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index ed8bd25698c03..dace204b1251f 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -6139,6 +6139,7 @@ LegalizerHelper::moreElementsVector(MachineInstr &MI, unsigned TypeIdx,
   case TargetOpcode::G_FCANONICALIZE:
   case TargetOpcode::G_SEXT_INREG:
   case TargetOpcode::G_ABS:
+  case TargetOpcode::G_CTLZ:
     if (TypeIdx != 0)
       return UnableToLegalize;
     Observer.changingInstr(MI);
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index 0da3c73b6926d..ebb723003326d 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -326,12 +326,23 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
       .maxScalarEltSameAsIf(always, 1, 0);
 
   getActionDefinitionsBuilder(G_CTLZ)
-      .legalForCartesianProduct(
-          {s32, s64, v8s8, v16s8, v4s16, v8s16, v2s32, v4s32})
-      .scalarize(1)
+      .legalFor({{s32, s32},
+                  {s64, s64},
+                  {v8s8, v8s8},
+                  {v16s8, v16s8},
+                  {v4s16, v4s16},
+                  {v8s16, v8s16},
+                  {v2s32, v2s32},
+                  {v4s32, v4s32}})
       .widenScalarToNextPow2(1, /*Min=*/32)
       .clampScalar(1, s32, s64)
+      .clampNumElements(0, v8s8, v16s8)
+      .clampNumElements(0, v4s16, v8s16)
+      .clampNumElements(0, v2s32, v4s32)
+      .moreElementsToNextPow2(0)
+      .scalarizeIf(scalarOrEltWiderThan(0, 32), 0)
       .scalarSameSizeAs(0, 1);
+
   getActionDefinitionsBuilder(G_CTLZ_ZERO_UNDEF).lower();
 
   getActionDefinitionsBuilder(G_CTTZ)
diff --git a/llvm/test/CodeGen/AArch64/ctlz.ll b/llvm/test/CodeGen/AArch64/ctlz.ll
index 437e3d5ff75c6..59a845044e164 100644
--- a/llvm/test/CodeGen/AArch64/ctlz.ll
+++ b/llvm/test/CodeGen/AArch64/ctlz.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc -mtriple=aarch64-none-eabi -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD
-; RUN: llc -mtriple=aarch64-none-eabi -global-isel -global-isel-abort=2 -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI
+; RUN: llc -mtriple=aarch64-none-eabi -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI
 
 
 define void @v2i8(ptr %p1) {
@@ -21,14 +21,14 @@ define void @v2i8(ptr %p1) {
 ;
 ; CHECK-GI-LABEL: v2i8:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ldrb w8, [x0]
-; CHECK-GI-NEXT:    ldrb w9, [x0, #1]
-; CHECK-GI-NEXT:    clz w8, w8
-; CHECK-GI-NEXT:    clz w9, w9
-; CHECK-GI-NEXT:    sub w8, w8, #24
-; CHECK-GI-NEXT:    sub w9, w9, #24
-; CHECK-GI-NEXT:    strb w8, [x0]
-; CHECK-GI-NEXT:    strb w9, [x0, #1]
+; CHECK-GI-NEXT:    ldr b0, [x0]
+; CHECK-GI-NEXT:    ldr b1, [x0, #1]
+; CHECK-GI-NEXT:    add x8, x0, #1
+; CHECK-GI-NEXT:    mov v0.b[0], v0.b[0]
+; CHECK-GI-NEXT:    mov v0.b[1], v1.b[0]
+; CHECK-GI-NEXT:    clz v0.8b, v0.8b
+; CHECK-GI-NEXT:    st1 { v0.b }[0], [x0]
+; CHECK-GI-NEXT:    st1 { v0.b }[1], [x8]
 ; CHECK-GI-NEXT:    ret
 entry:
   %d = load <2 x i8>, ptr %p1
@@ -59,18 +59,18 @@ define void @v3i8(ptr %p1) {
 ;
 ; CHECK-GI-LABEL: v3i8:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ldrb w8, [x0]
-; CHECK-GI-NEXT:    ldrb w9, [x0, #1]
-; CHECK-GI-NEXT:    ldrb w10, [x0, #2]
-; CHECK-GI-NEXT:    clz w8, w8
-; CHECK-GI-NEXT:    clz w9, w9
-; CHECK-GI-NEXT:    clz w10, w10
-; CHECK-GI-NEXT:    sub w8, w8, #24
-; CHECK-GI-NEXT:    sub w9, w9, #24
-; CHECK-GI-NEXT:    strb w8, [x0]
-; CHECK-GI-NEXT:    sub w8, w10, #24
-; CHECK-GI-NEXT:    strb w9, [x0, #1]
-; CHECK-GI-NEXT:    strb w8, [x0, #2]
+; CHECK-GI-NEXT:    ldr b0, [x0]
+; CHECK-GI-NEXT:    ldr b1, [x0, #1]
+; CHECK-GI-NEXT:    add x8, x0, #1
+; CHECK-GI-NEXT:    add x9, x0, #2
+; CHECK-GI-NEXT:    mov v0.b[0], v0.b[0]
+; CHECK-GI-NEXT:    mov v0.b[1], v1.b[0]
+; CHECK-GI-NEXT:    ldr b1, [x0, #2]
+; CHECK-GI-NEXT:    mov v0.b[2], v1.b[0]
+; CHECK-GI-NEXT:    clz v0.8b, v0.8b
+; CHECK-GI-NEXT:    st1 { v0.b }[0], [x0]
+; CHECK-GI-NEXT:    st1 { v0.b }[1], [x8]
+; CHECK-GI-NEXT:    st1 { v0.b }[2], [x9]
 ; CHECK-GI-NEXT:    ret
 entry:
   %d = load <3 x i8>, ptr %p1
@@ -95,29 +95,15 @@ define void @v4i8(ptr %p1) {
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    ldr w8, [x0]
 ; CHECK-GI-NEXT:    fmov s0, w8
-; CHECK-GI-NEXT:    uxtb w8, w8
-; CHECK-GI-NEXT:    clz w8, w8
 ; CHECK-GI-NEXT:    mov b1, v0.b[1]
-; CHECK-GI-NEXT:    mov b2, v0.b[2]
-; CHECK-GI-NEXT:    sub w8, w8, #24
+; CHECK-GI-NEXT:    mov v2.b[0], v0.b[0]
+; CHECK-GI-NEXT:    mov b3, v0.b[2]
 ; CHECK-GI-NEXT:    mov b0, v0.b[3]
-; CHECK-GI-NEXT:    fmov w9, s1
-; CHECK-GI-NEXT:    fmov w10, s2
-; CHECK-GI-NEXT:    fmov s1, w8
-; CHECK-GI-NEXT:    uxtb w9, w9
-; CHECK-GI-NEXT:    uxtb w8, w10
-; CHECK-GI-NEXT:    clz w9, w9
-; CHECK-GI-NEXT:    clz w8, w8
-; CHECK-GI-NEXT:    sub w9, w9, #24
-; CHECK-GI-NEXT:    sub w8, w8, #24
-; CHECK-GI-NEXT:    mov v1.b[1], w9
-; CHECK-GI-NEXT:    fmov w9, s0
-; CHECK-GI-NEXT:    uxtb w9, w9
-; CHECK-GI-NEXT:    mov v1.b[2], w8
-; CHECK-GI-NEXT:    clz w8, w9
-; CHECK-GI-NEXT:    sub w8, w8, #24
-; CHECK-GI-NEXT:    mov v1.b[3], w8
-; CHECK-GI-NEXT:    fmov w8, s1
+; CHECK-GI-NEXT:    mov v2.b[1], v1.b[0]
+; CHECK-GI-NEXT:    mov v2.b[2], v3.b[0]
+; CHECK-GI-NEXT:    mov v2.b[3], v0.b[0]
+; CHECK-GI-NEXT:    clz v0.8b, v2.8b
+; CHECK-GI-NEXT:    fmov w8, s0
 ; CHECK-GI-NEXT:    str w8, [x0]
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -148,145 +134,11 @@ entry:
 }
 
 define <32 x i8> @v32i8(<32 x i8> %d) {
-; CHECK-SD-LABEL: v32i8:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    clz v0.16b, v0.16b
-; CHECK-SD-NEXT:    clz v1.16b, v1.16b
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: v32i8:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    umov w9, v0.b[0]
-; CHECK-GI-NEXT:    umov w11, v1.b[0]
-; CHECK-GI-NEXT:    umov w10, v0.b[1]
-; CHECK-GI-NEXT:    umov w13, v1.b[1]
-; CHECK-GI-NEXT:    umov w8, v0.b[2]
-; CHECK-GI-NEXT:    clz w9, w9
-; CHECK-GI-NEXT:    clz w11, w11
-; CHECK-GI-NEXT:    clz w10, w10
-; CHECK-GI-NEXT:    sub w14, w9, #24
-; CHECK-GI-NEXT:    sub w12, w11, #24
-; CHECK-GI-NEXT:    clz w11, w13
-; CHECK-GI-NEXT:    fmov s2, w14
-; CHECK-GI-NEXT:    fmov s3, w12
-; CHECK-GI-NEXT:    umov w9, v1.b[2]
-; CHECK-GI-NEXT:    sub w10, w10, #24
-; CHECK-GI-NEXT:    sub w11, w11, #24
-; CHECK-GI-NEXT:    clz w8, w8
-; CHECK-GI-NEXT:    sub w8, w8, #24
-; CHECK-GI-NEXT:    mov v2.b[1], w10
-; CHECK-GI-NEXT:    mov v3.b[1], w11
-; CHECK-GI-NEXT:    umov w10, v0.b[3]
-; CHECK-GI-NEXT:    clz w9, w9
-; CHECK-GI-NEXT:    umov w11, v1.b[3]
-; CHECK-GI-NEXT:    sub w9, w9, #24
-; CHECK-GI-NEXT:    mov v2.b[2], w8
-; CHECK-GI-NEXT:    mov v3.b[2], w9
-; CHECK-GI-NEXT:    clz w8, w10
-; CHECK-GI-NEXT:    umov w9, v0.b[4]
-; CHECK-GI-NEXT:    clz w10, w11
-; CHECK-GI-NEXT:    umov w11, v1.b[4]
-; CHECK-GI-NEXT:    sub w8, w8, #24
-; CHECK-GI-NEXT:    sub w10, w10, #24
-; CHECK-GI-NEXT:    mov v2.b[3], w8
-; CHECK-GI-NEXT:    mov v3.b[3], w10
-; CHECK-GI-NEXT:    umov w8, v0.b[5]
-; CHECK-GI-NEXT:    clz w9, w9
-; CHECK-GI-NEXT:    clz w10, w11
-; CHECK-GI-NEXT:    umov w11, v1.b[5]
-; CHECK-GI-NEXT:    sub w9, w9, #24
-; CHECK-GI-NEXT:    sub w10, w10, #24
-; CHECK-GI-NEXT:    mov v2.b[4], w9
-; CHECK-GI-NEXT:    mov v3.b[4], w10
-; CHECK-GI-NEXT:    clz w8, w8
-; CHECK-GI-NEXT:    umov w9, v0.b[6]
-; CHECK-GI-NEXT:    clz w10, w11
-; CHECK-GI-NEXT:    umov w11, v1.b[6]
-; CHECK-GI-NEXT:    sub w8, w8, #24
-; CHECK-GI-NEXT:    sub w10, w10, #24
-; CHECK-GI-NEXT:    mov v2.b[5], w8
-; CHECK-GI-NEXT:    mov v3.b[5], w10
-; CHECK-GI-NEXT:    umov w8, v0.b[7]
-; CHECK-GI-NEXT:    clz w9, w9
-; CHECK-GI-NEXT:    clz w10, w11
-; CHECK-GI-NEXT:    umov w11, v1.b[7]
-; CHECK-GI-NEXT:    sub w9, w9, #24
-; CHECK-GI-NEXT:    sub w10, w10, #24
-; CHECK-GI-NEXT:    mov v2.b[6], w9
-; CHECK-GI-NEXT:    mov v3.b[6], w10
-; CHECK-GI-NEXT:    clz w8, w8
-; CHECK-GI-NEXT:    umov w9, v0.b[8]
-; CHECK-GI-NEXT:    clz w10, w11
-; CHECK-GI-NEXT:    umov w11, v1.b[8]
-; CHECK-GI-NEXT:    sub w8, w8, #24
-; CHECK-GI-NEXT:    sub w10, w10, #24
-; CHECK-GI-NEXT:    mov v2.b[7], w8
-; CHECK-GI-NEXT:    mov v3.b[7], w10
-; CHECK-GI-NEXT:    umov w8, v0.b[9]
-; CHECK-GI-NEXT:    clz w9, w9
-; CHECK-GI-NEXT:    clz w10, w11
-; CHECK-GI-NEXT:    umov w11, v1.b[9]
-; CHECK-GI-NEXT:    sub w9, w9, #24
-; CHECK-GI-NEXT:    sub w10, w10, #24
-; CHECK-GI-NEXT:    mov v2.b[8], w9
-; CHECK-GI-NEXT:    mov v3.b[8], w10
-; CHECK-GI-NEXT:    clz w8, w8
-; CHECK-GI-NEXT:    umov w9, v0.b[10]
-; CHECK-GI-NEXT:    clz w10, w11
-; CHECK-GI-NEXT:    umov w11, v1.b[10]
-; CHECK-GI-NEXT:    sub w8, w8, #24
-; CHECK-GI-NEXT:    sub w10, w10, #24
-; CHECK-GI-NEXT:    mov v2.b[9], w8
-; CHECK-GI-NEXT:    mov v3.b[9], w10
-; CHECK-GI-NEXT:    umov w8, v0.b[11]
-; CHECK-GI-NEXT:    clz w9, w9
-; CHECK-GI-NEXT:    clz w10, w11
-; CHECK-GI-NEXT:    umov w11, v1.b[11]
-; CHECK-GI-NEXT:    sub w9, w9, #24
-; CHECK-GI-NEXT:    sub w10, w10, #24
-; CHECK-GI-NEXT:    mov v2.b[10], w9
-; CHECK-GI-NEXT:    mov v3.b[10], w10
-; CHECK-GI-NEXT:    clz w8, w8
-; CHECK-GI-NEXT:    umov w9, v0.b[12]
-; CHECK-GI-NEXT:    clz w10, w11
-; CHECK-GI-NEXT:    umov w11, v1.b[12]
-; CHECK-GI-NEXT:    sub w8, w8, #24
-; CHECK-GI-NEXT:    sub w10, w10, #24
-; CHECK-GI-NEXT:    mov v2.b[11], w8
-; CHECK-GI-NEXT:    mov v3.b[11], w10
-; CHECK-GI-NEXT:    umov w8, v0.b[13]
-; CHECK-GI-NEXT:    clz w9, w9
-; CHECK-GI-NEXT:    clz w10, w11
-; CHECK-GI-NEXT:    umov w11, v1.b[13]
-; CHECK-GI-NEXT:    sub w9, w9, #24
-; CHECK-GI-NEXT:    sub w10, w10, #24
-; CHECK-GI-NEXT:    mov v2.b[12], w9
-; CHECK-GI-NEXT:    mov v3.b[12], w10
-; CHECK-GI-NEXT:    clz w8, w8
-; CHECK-GI-NEXT:    umov w9, v0.b[14]
-; CHECK-GI-NEXT:    clz w10, w11
-; CHECK-GI-NEXT:    umov w11, v1.b[14]
-; CHECK-GI-NEXT:    sub w8, w8, #24
-; CHECK-GI-NEXT:    sub w10, w10, #24
-; CHECK-GI-NEXT:    mov v2.b[13], w8
-; CHECK-GI-NEXT:    mov v3.b[13], w10
-; CHECK-GI-NEXT:    umov w8, v0.b[15]
-; CHECK-GI-NEXT:    umov w10, v1.b[15]
-; CHECK-GI-NEXT:    clz w9, w9
-; CHECK-GI-NEXT:    clz w11, w11
-; CHECK-GI-NEXT:    sub w9, w9, #24
-; CHECK-GI-NEXT:    sub w11, w11, #24
-; CHECK-GI-NEXT:    mov v2.b[14], w9
-; CHECK-GI-NEXT:    mov v3.b[14], w11
-; CHECK-GI-NEXT:    clz w8, w8
-; CHECK-GI-NEXT:    clz w9, w10
-; CHECK-GI-NEXT:    sub w8, w8, #24
-; CHECK-GI-NEXT:    sub w9, w9, #24
-; CHECK-GI-NEXT:    mov v2.b[15], w8
-; CHECK-GI-NEXT:    mov v3.b[15], w9
-; CHECK-GI-NEXT:    mov v0.16b, v2.16b
-; CHECK-GI-NEXT:    mov v1.16b, v3.16b
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: v32i8:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    clz v0.16b, v0.16b
+; CHECK-NEXT:    clz v1.16b, v1.16b
+; CHECK-NEXT:    ret
 entry:
   %s = call <32 x i8> @llvm.ctlz(<32 x i8> %d, i1 false)
   ret <32 x i8> %s
@@ -310,14 +162,12 @@ define void @v2i16(ptr %p1) {
 ;
 ; CHECK-GI-LABEL: v2i16:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ldrh w8, [x0]
-; CHECK-GI-NEXT:    ldrh w9, [x0, #2]
-; CHECK-GI-NEXT:    clz w8, w8
-; CHECK-GI-NEXT:    clz w9, w9
-; CHECK-GI-NEXT:    sub w8, w8, #16
-; CHECK-GI-NEXT:    sub w9, w9, #16
-; CHECK-GI-NEXT:    strh w8, [x0]
-; CHECK-GI-NEXT:    strh w9, [x0, #2]
+; CHECK-GI-NEXT:    ldr h0, [x0]
+; CHECK-GI-NEXT:    add x8, x0, #2
+; CHECK-GI-NEXT:    ld1 { v0.h }[1], [x8]
+; CHECK-GI-NEXT:    clz v0.4h, v0.4h
+; CHECK-GI-NEXT:    str h0, [x0]
+; CHECK-GI-NEXT:    st1 { v0.h }[1], [x8]
 ; CHECK-GI-NEXT:    ret
 entry:
   %d = load <2 x i16>, ptr %p1
@@ -338,18 +188,15 @@ define void @v3i16(ptr %p1) {
 ;
 ; CHECK-GI-LABEL: v3i16:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ldrh w8, [x0]
-; CHECK-GI-NEXT:    ldrh w9, [x0, #2]
-; CHECK-GI-NEXT:    ldrh w10, [x0, #4]
-; CHECK-GI-NEXT:    clz w8, w8
-; CHECK-GI-NEXT:    clz w9, w9
-; CHECK-GI-NEXT:    clz w10, w10
-; CHECK-GI-NEXT:    sub w8, w8, #16
-; CHECK-GI-NEXT:    sub w9, w9, #16
-; CHECK-GI-NEXT:    strh w8, [x0]
-; CHECK-GI-NEXT:    sub w8, w10, #16
-; CHECK-GI-NEXT:    strh w9, [x0, #2]
-; CHECK-GI-NEXT:    strh w8, [x0, #4]
+; CHECK-GI-NEXT:    ldr h0, [x0]
+; CHECK-GI-NEXT:    add x8, x0, #2
+; CHECK-GI-NEXT:    add x9, x0, #4
+; CHECK-GI-NEXT:    ld1 { v0.h }[1], [x8]
+; CHECK-GI-NEXT:    ld1 { v0.h }[2], [x9]
+; CHECK-GI-NEXT:    clz v0.4h, v0.4h
+; CHECK-GI-NEXT:    str h0, [x0]
+; CHECK-GI-NEXT:    st1 { v0.h }[1], [x8]
+; CHECK-GI-NEXT:    st1 { v0.h }[2], [x9]
 ; CHECK-GI-NEXT:    ret
 entry:
   %d = load <3 x i16>, ptr %p1
@@ -379,81 +226,11 @@ entry:
 }
 
 define <16 x i16> @v16i16(<16 x i16> %d) {
-; CHECK-SD-LABEL: v16i16:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    clz v0.8h, v0.8h
-; CHECK-SD-NEXT:    clz v1.8h, v1.8h
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: v16i16:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    umov w8, v0.h[0]
-; CHECK-GI-NEXT:    umov w10, v1.h[0]
-; CHECK-GI-NEXT:    umov w9, v0.h[1]
-; CHECK-GI-NEXT:    umov w11, v1.h[1]
-; CHECK-GI-NEXT:    umov w12, v0.h[2]
-; CHECK-GI-NEXT:    umov w13, v1.h[2]
-; CHECK-GI-NEXT:    clz w8, w8
-; CHECK-GI-NEXT:    clz w10, w10
-; CHECK-GI-NEXT:    clz w9, w9
-; CHECK-GI-NEXT:    sub w8, w8, #16
-; CHECK-GI-NEXT:    sub w10, w10, #16
-; CHECK-GI-NEXT:    clz w11, w11
-; CHECK-GI-NEXT:    fmov s2, w8
-; CHECK-GI-NEXT:    fmov s3, w10
-; CHECK-GI-NEXT:    sub w9, w9, #16
-; CHECK-GI-NEXT:    sub w11, w11, #16
-; CHECK-GI-NEXT:    umov w8, v0.h[3]
-; CHECK-GI-NEXT:    clz w10, w13
-; CHECK-GI-NEXT:    sub w10, w10, #16
-; CHECK-GI-NEXT:    mov v2.h[1], w9
-; CHECK-GI-NEXT:    mov v3.h[1], w11
-; CHECK-GI-NEXT:    clz w9, w12
-; CHECK-GI-NEXT:    umov w11, v1.h[3]
-; CHECK-GI-NEXT:    sub w9, w9, #16
-; CHECK-GI-NEXT:    clz w8, w8
-; CHECK-GI-NEXT:    sub w8, w8, #16
-; CHECK-GI-NEXT:    mov v2.h[2], w9
-; CHECK-GI-NEXT:    mov v3.h[2], w10
-; CHECK-GI-NEXT:    umov w9, v0.h[4]
-; CHECK-GI-NEXT:    clz w10, w11
-; CHECK-GI-NEXT:    umov w11, v1.h[4]
-; CHECK-GI-NEXT:    sub w10, w10, #16
-; CHECK-GI-NEXT:    mov v2.h[3], w8
-; CHECK-GI-NEXT:    mov v3.h[3], w10
-; CHECK-GI-NEXT:    umov w8, v0.h[5]
-; CHECK-GI-NEXT:    clz w9, w9
-; CHECK-GI-NEXT:    clz w10, w11
-; CHECK-GI-NEXT:    umov w11, v1.h[5]
-; CHECK-GI-NEXT:    sub w9, w9, #16
-; CHECK-GI-NEXT:    sub w10, w10, #16
-; CHECK-GI-NEXT:    mov v2.h[4], w9
-; CHECK-GI-NEXT:    mov v3.h[4], w10
-; CHECK-GI-NEXT:    clz w8, w8
-; CHECK-GI-NEXT:    umov w9, v0.h[6]
-; CHECK-GI-NEXT:    clz w10, w11
-; CHECK-GI-NEXT:    umov w11, v1.h[6]
-; CHECK-GI-NEXT:    sub w8, w8, #16
-; CHECK-GI-NEXT:    sub w10, w10, #16
-; CHECK-GI-NEXT:    mov v2.h[5], w8
-; CHECK-GI-NEXT:    mov v3.h[5], w10
-; CHECK-GI-NEXT:    umov w8, v0.h[7]
-; CHECK-GI-NEXT:    umov w10, v1.h[7]
-; CHECK-GI-NEXT:    clz w9, w9
-; CHECK-GI-NEXT:    clz w11, w11
-; CHECK-GI-NEXT:    sub w9, w9, #16
-; CHECK-GI-NEXT:    sub w11, w11, #16
-; CHECK-GI-NEXT:    mov v2.h[6], w9
-; CHECK-GI-NEXT:    mov v3.h[6], w11
-; CHECK-GI-NEXT:    clz w8, w8
-; CHECK-GI-NEXT:    clz w9, w10
-; CHECK-GI-NEXT:    sub w8, w8, #16
-; CHECK-GI-NEXT:    sub w9, w9, #16
-; CHECK-GI-NEXT:    mov v2.h[7], w8
-; CHECK-GI-NEXT:    mov v3.h[7], w9
-; CHECK-GI-NEXT:    mov v0.16b, v2.16b
-; CHECK-GI-NEXT:    mov v1.16b, v3.16b
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: v16i16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    clz v0.8h, v0.8h
+; CHECK-NEXT:    clz v1.8h, v1.8h
+; CHECK-NEXT:    ret
 entry:
   %s = call <16 x i16> @llvm.ctlz(<16 x i16> %d, i1 false)
   ret <16 x i16> %s
@@ -470,24 +247,10 @@ entry:
 }
 
 define <3 x i32> @v3i32(<3 x i32> %d) {
-; CHECK-SD-LABEL: v3i32:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    clz v0.4s, v0.4s
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: v3i32:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    fmov w8, s0
-; CHECK-GI-NEXT:    mov w9, v0.s[1]
-; CHECK-GI-NEXT:    clz w8, w8
-; CHECK-GI-NEXT:    mov v1.s[0], w8
-; CHECK-GI-NEXT:    mov w8, v0.s[2]
-; CHECK-GI-NEXT:    clz w9, w9
-; CHECK-GI-NEXT:    mov v1.s[1], w9
-; CHECK-GI-NEXT:    clz w8, w8
-; CHECK-GI-NEXT:    mov v1.s[2], w8
-; CHECK-GI-NEXT:    mov v0.16b, v1.16b
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: v3i32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    clz v0.4s, v0.4s
+; CHECK-NEXT:    ret
 entry:
   %s = call <3 x i32> @llvm.ctlz(<3 x i32> %d, i1 false)
   ret <3 x i32> %s
@@ -504,41 +267,11 @@ entry:
 }
 
 define <8 x i32> @v8i32(<8 x i32> %d) {
-; CHECK-SD-LABEL: v8i32:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    clz v0.4s, v0.4s
-; CHECK-SD-NEXT:    clz v1.4s, v1.4s
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: v8i32:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    fmov w9, s0
-; CHECK-GI-NEXT:    fmov w10, s1
-; CHECK-GI-NEXT:    mov w8, v0.s[1]
-; CHECK-GI-NEXT:    mov w11, v1.s[1]
-; CHECK-GI-NEXT:    clz w9, w9
-; CHECK-GI-NEXT:    clz w10, w10
-; CHECK-GI-NEXT:    mov v2.s[0], w9
-; CHECK-GI-NEXT:    mov v3.s[0], w10
-; CHECK-GI-NEXT:    mov w9, v0.s[2]
-; CHECK-GI-NEXT:    mov w10, v1.s[2]
-; CHECK-GI-NEXT:    clz w8, w8
-; CHECK-GI-NEXT:    clz w11, w11
-; CHECK-GI-NEXT:    mov v2.s[1], w8
-; CHECK-GI-NEXT:    mov v3.s[1], w11
-; CHECK-GI-NEXT:    mov w8, v0.s[3]
-; CHECK-GI-NEXT:    mov w11, v1.s[3]
-; CHECK-GI-NEXT:    clz w9, w9
-; CHECK-GI-NEXT:    clz w10, w10
-; CHECK-GI-NEXT:    mov v2.s[2], w9
-; CHECK-GI-NEXT:    mov v3.s[2], w10
-; CHECK-GI-NEXT:    clz w8, w8
-; CHECK-GI-NEXT:    clz w9, w11
-; CHECK-GI-NEXT:    mov v2.s[3], w8
-; CHECK-GI-NEXT:    mov v3.s[3], w9
-; CHECK-GI-NEXT:    mov v0.16b, v2.16b
-; CHECK-GI-NEXT:    mov v1.16b, v3.16b
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: v8i32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    clz v0.4s, v0.4s
+; CHECK-NEXT:    clz v1.4s, v1.4s
+; CHECK-NEXT:    ret
 entry:
   %s = call <8 x i32> @llvm.ctlz(<8 x i32> %d, i1 false)
   ret <8 x i32> %s

Copy link

github-actions bot commented Mar 16, 2025

✅ With the latest revision this PR passed the C/C++ code formatter.

Similar to other operations, vectors with s8, s16 and s32 elements are clamped
to legal vector sizes, but in this case vectors with s64 elements are scalarized
so that they use the GPR instructions. This allows vector types to be split
rather than scalarized.
@davemgreen davemgreen merged commit b087699 into llvm:main Mar 19, 2025
11 checks passed
@llvm-ci
Copy link
Collaborator

llvm-ci commented Mar 19, 2025

LLVM Buildbot has detected a new failure on builder openmp-offload-amdgpu-runtime running on omp-vega20-0 while building llvm at step 7 "Add check check-offload".

Full details are available at: https://lab.llvm.org/buildbot/#/builders/30/builds/17991

Here is the relevant piece of the build log for reference:
Step 7 (Add check check-offload) failure: test (failure)
...
PASS: libomptarget :: x86_64-unknown-linux-gnu-LTO :: offloading/test_libc.cpp (1026 of 1035)
PASS: libomptarget :: x86_64-unknown-linux-gnu-LTO :: offloading/bug47654.cpp (1027 of 1035)
PASS: libomptarget :: x86_64-unknown-linux-gnu-LTO :: offloading/bug49779.cpp (1028 of 1035)
PASS: libomptarget :: x86_64-unknown-linux-gnu-LTO :: offloading/wtime.c (1029 of 1035)
PASS: libomptarget :: x86_64-unknown-linux-gnu :: offloading/bug49021.cpp (1030 of 1035)
PASS: libomptarget :: x86_64-unknown-linux-gnu :: offloading/std_complex_arithmetic.cpp (1031 of 1035)
PASS: libomptarget :: x86_64-unknown-linux-gnu-LTO :: offloading/complex_reduction.cpp (1032 of 1035)
PASS: libomptarget :: x86_64-unknown-linux-gnu-LTO :: offloading/bug49021.cpp (1033 of 1035)
PASS: libomptarget :: x86_64-unknown-linux-gnu-LTO :: offloading/std_complex_arithmetic.cpp (1034 of 1035)
TIMEOUT: libomptarget :: amdgcn-amd-amdhsa :: offloading/parallel_offloading_map.cpp (1035 of 1035)
******************** TEST 'libomptarget :: amdgcn-amd-amdhsa :: offloading/parallel_offloading_map.cpp' FAILED ********************
Exit Code: -9
Timeout: Reached timeout of 100 seconds

Command Output (stdout):
--
# RUN: at line 1
/home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/./bin/clang++ -fopenmp    -I /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.src/offload/test -I /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/openmp/runtime/src -L /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/offload -L /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/./lib -L /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/openmp/runtime/src  -nogpulib -Wl,-rpath,/home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/offload -Wl,-rpath,/home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/openmp/runtime/src -Wl,-rpath,/home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/./lib  -fopenmp-targets=amdgcn-amd-amdhsa /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.src/offload/test/offloading/parallel_offloading_map.cpp -o /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/offload/test/amdgcn-amd-amdhsa/offloading/Output/parallel_offloading_map.cpp.tmp /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/./lib/libomptarget.devicertl.a && /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/offload/test/amdgcn-amd-amdhsa/offloading/Output/parallel_offloading_map.cpp.tmp | /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/./bin/FileCheck /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.src/offload/test/offloading/parallel_offloading_map.cpp
# executed command: /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/./bin/clang++ -fopenmp -I /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.src/offload/test -I /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/openmp/runtime/src -L /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/offload -L /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/./lib -L /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/openmp/runtime/src -nogpulib -Wl,-rpath,/home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/offload -Wl,-rpath,/home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/openmp/runtime/src -Wl,-rpath,/home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/./lib -fopenmp-targets=amdgcn-amd-amdhsa /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.src/offload/test/offloading/parallel_offloading_map.cpp -o /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/offload/test/amdgcn-amd-amdhsa/offloading/Output/parallel_offloading_map.cpp.tmp /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/./lib/libomptarget.devicertl.a
# note: command had no output on stdout or stderr
# executed command: /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/offload/test/amdgcn-amd-amdhsa/offloading/Output/parallel_offloading_map.cpp.tmp
# note: command had no output on stdout or stderr
# error: command failed with exit status: -9
# error: command reached timeout: True
# executed command: /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/./bin/FileCheck /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.src/offload/test/offloading/parallel_offloading_map.cpp
# note: command had no output on stdout or stderr
# error: command failed with exit status: -9
# error: command reached timeout: True

--

********************
Slowest Tests:
--------------------------------------------------------------------------
100.06s: libomptarget :: amdgcn-amd-amdhsa :: offloading/parallel_offloading_map.cpp
16.19s: libomptarget :: amdgcn-amd-amdhsa :: offloading/bug49021.cpp
12.43s: libomptarget :: amdgcn-amd-amdhsa :: offloading/parallel_target_teams_reduction_max.cpp
12.33s: libomptarget :: amdgcn-amd-amdhsa :: offloading/parallel_target_teams_reduction_min.cpp
10.83s: libomptarget :: amdgcn-amd-amdhsa :: offloading/complex_reduction.cpp
9.35s: libomptarget :: amdgcn-amd-amdhsa :: jit/empty_kernel_lvl2.c
8.85s: libomptarget :: x86_64-unknown-linux-gnu :: offloading/bug49021.cpp
7.87s: libomptarget :: amdgcn-amd-amdhsa :: offloading/ompx_saxpy_mixed.c
7.51s: libomptarget :: amdgcn-amd-amdhsa :: offloading/barrier_fence.c
7.24s: libomptarget :: x86_64-unknown-linux-gnu :: offloading/std_complex_arithmetic.cpp
6.98s: libomptarget :: x86_64-unknown-linux-gnu :: offloading/complex_reduction.cpp
6.50s: libomptarget :: x86_64-unknown-linux-gnu-LTO :: offloading/bug49021.cpp
6.14s: libomptarget :: amdgcn-amd-amdhsa :: offloading/parallel_target_teams_reduction.cpp
5.25s: libomptarget :: amdgcn-amd-amdhsa :: offloading/default_thread_limit.c
5.25s: libomptarget :: x86_64-unknown-linux-gnu-LTO :: offloading/std_complex_arithmetic.cpp

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

Successfully merging this pull request may close these issues.

4 participants