|
| 1 | +; RUN: opt %loadPolly -polly-import-jscop -polly-import-jscop-dir=%S -polly-vectorizer=polly -polly-codegen < %s -S | FileCheck %s |
| 2 | + |
| 3 | +; #pragma known-parallel |
| 4 | +; for (int c0 = 0; c0 <= 31; c0 += 1) |
| 5 | +; for (int c1 = 0; c1 <= floord(nk - 1, 32); c1 += 1) |
| 6 | +; for (int c2 = 0; c2 <= 7; c2 += 1) |
| 7 | +; for (int c3 = 0; c3 <= min(31, nk - 32 * c1 - 1); c3 += 1) |
| 8 | +; #pragma simd |
| 9 | +; for (int c4 = 0; c4 <= 3; c4 += 1) |
| 10 | +; Stmt_for_body_3(32 * c0 + 4 * c2 + c4, 32 * c1 + c3); |
| 11 | + |
| 12 | +; CHECK: polly.stmt.for.body.3: ; preds = %polly.loop_header18 |
| 13 | +; CHECK: %scevgep = getelementptr [1024 x double], [1024 x double]* %A, i64 0, i64 %21 |
| 14 | +; CHECK: %_p_vec_p = bitcast double* %scevgep to <1 x double>* |
| 15 | +; CHECK: %_p_splat_one = load <1 x double>, <1 x double>* %_p_vec_p, align 8, !alias.scope !1, !noalias !3, !llvm.mem.parallel_loop_access !0 |
| 16 | +; CHECK: %_p_splat = shufflevector <1 x double> %_p_splat_one, <1 x double> %_p_splat_one, <4 x i32> zeroinitializer |
| 17 | +; CHECK: %scevgep26 = getelementptr [1024 x double], [1024 x double]* %C, i64 0, i64 %19 |
| 18 | +; CHECK: %vector_ptr = bitcast double* %scevgep26 to <4 x double>* |
| 19 | +; CHECK: %_p_vec_full = load <4 x double>, <4 x double>* %vector_ptr, align 8, !alias.scope !4, !noalias !5, !llvm.mem.parallel_loop_access !0 |
| 20 | +; CHECK: %addp_vec = fadd <4 x double> %_p_splat, %_p_vec_full |
| 21 | +; CHECK: %40 = extractelement <4 x double> %addp_vec, i32 0 |
| 22 | +; CHECK: %41 = extractelement <4 x double> %addp_vec, i32 1 |
| 23 | +; CHECK: %42 = extractelement <4 x double> %addp_vec, i32 2 |
| 24 | +; CHECK: %43 = extractelement <4 x double> %addp_vec, i32 3 |
| 25 | +; CHECK: %vector_ptr27 = bitcast double* %scevgep26 to <4 x double>* |
| 26 | +; CHECK: store <4 x double> %addp_vec, <4 x double>* %vector_ptr27, align 8, !alias.scope !4, !noalias !5, !llvm.mem.parallel_loop_access !0 |
| 27 | + |
| 28 | +define void @kernel_gemm(i32 %ni, i32 %nj, i32 %nk, [1024 x double]* %C, [1024 x double]* %A) #0 { ; test input: C[0][i] += A[0][k] for i in [0, 1024) and k in [0, nk); %ni and %nj are not used in this body |
| 29 | +entry: |
| 30 | + br label %for.cond.1.preheader |
| 31 | + |
| 32 | +for.cond.1.preheader: ; preds = %entry, %for.inc.10 ; outer loop header: %indvars.iv16 is the outer index i, starting at 0 |
| 33 | + %indvars.iv16 = phi i64 [ 0, %entry ], [ %indvars.iv.next17, %for.inc.10 ] |
| 34 | + %cmp2.13 = icmp sgt i32 %nk, 0 ; the inner loop is entered only when nk > 0 |
| 35 | + br i1 %cmp2.13, label %for.body.3.lr.ph, label %for.inc.10 |
| 36 | + |
| 37 | +for.body.3.lr.ph: ; preds = %for.cond.1.preheader ; contains only the branch into the inner loop body |
| 38 | + br label %for.body.3 |
| 39 | + |
| 40 | +for.body.3: ; preds = %for.body.3.lr.ph, %for.body.3 ; inner loop body: %indvars.iv is the inner index k, starting at 0 |
| 41 | + %indvars.iv = phi i64 [ 0, %for.body.3.lr.ph ], [ %indvars.iv.next, %for.body.3 ] |
| 42 | + %arrayidx5 = getelementptr inbounds [1024 x double], [1024 x double]* %A, i64 0, i64 %indvars.iv ; &A[0][k]: address depends only on the inner index |
| 43 | + %0 = load double, double* %arrayidx5, align 8 |
| 44 | + %arrayidx9 = getelementptr inbounds [1024 x double], [1024 x double]* %C, i64 0, i64 %indvars.iv16 ; &C[0][i]: address depends only on the outer index |
| 45 | + %1 = load double, double* %arrayidx9, align 8 |
| 46 | + %add = fadd double %0, %1 |
| 47 | + store double %add, double* %arrayidx9, align 8 ; C[0][i] = A[0][k] + C[0][i] |
| 48 | + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 |
| 49 | + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 ; the exit test compares the low 32 bits of k + 1 against nk |
| 50 | + %exitcond = icmp ne i32 %lftr.wideiv, %nk |
| 51 | + br i1 %exitcond, label %for.body.3, label %for.cond.1.for.inc.10_crit_edge |
| 52 | + |
| 53 | +for.cond.1.for.inc.10_crit_edge: ; preds = %for.body.3 ; inner loop exit: forwards to the outer loop latch |
| 54 | + br label %for.inc.10 |
| 55 | + |
| 56 | +for.inc.10: ; preds = %for.cond.1.for.inc.10_crit_edge, %for.cond.1.preheader ; outer loop latch: increments i and tests for exit |
| 57 | + %indvars.iv.next17 = add nuw nsw i64 %indvars.iv16, 1 |
| 58 | + %exitcond18 = icmp ne i64 %indvars.iv.next17, 1024 ; the outer loop repeats until i + 1 == 1024, i.e. 1024 iterations |
| 59 | + br i1 %exitcond18, label %for.cond.1.preheader, label %for.end.12 |
| 60 | + |
| 61 | +for.end.12: ; preds = %for.inc.10 ; function exit |
| 62 | + ret void |
| 63 | +} |
0 commit comments