@@ -14196,19 +14196,10 @@ void EmitPass::emitReductionTree( e_opcode op, VISA_Type type, CVariable* src, C
14196
14196
for( unsigned int i = 0; i < numIterations; i++ )
14197
14197
{
14198
14198
// Get alias for src0, src1, and dst based on offsets and SIMD size
14199
- auto* layerSrc0 = m_currShader->GetNewAlias( src, type, i * 2 * layerMaxSimdLanes * m_encoder->GetCISADataTypeSize( type ), layerMaxSimdLanes, false );
14200
- auto* layerSrc1 = m_currShader->GetNewAlias( src, type, ( i * 2 * layerMaxSimdLanes + src1Offset ) * m_encoder->GetCISADataTypeSize( type ), layerMaxSimdLanes, false );
14201
- CVariable* layerDst;
14202
- if( (srcElementCount >> 1 <= dst->GetNumberElement()) && (i + 1 == numIterations ))
14203
- {
14204
- // Final layer, use destination of WaveAll vector intrinsic inst (passed in with correct offset)
14205
- layerDst = dst;
14206
- }
14207
- else
14208
- {
14209
- // Use src as workspace to store intermediate values
14210
- layerDst = m_currShader->GetNewAlias( src, type, i * layerMaxSimdLanes * m_encoder->GetCISADataTypeSize( type ), layerMaxSimdLanes, false );
14211
- }
14199
+ auto* layerSrc0 = m_currShader->GetNewAlias( src, type, i * 2 * layerMaxSimdLanes * m_encoder->GetCISADataTypeSize( type ), layerMaxSimdLanes );
14200
+ auto* layerSrc1 = m_currShader->GetNewAlias( src, type, ( i * 2 * layerMaxSimdLanes + src1Offset ) * m_encoder->GetCISADataTypeSize( type ), layerMaxSimdLanes );
14201
+ auto* layerDst = m_currShader->GetNewAlias( src, type, i * layerMaxSimdLanes * m_encoder->GetCISADataTypeSize( type ), layerMaxSimdLanes );
14202
+
14212
14203
if( !int64EmulationNeeded )
14213
14204
{
14214
14205
m_encoder->SetNoMask();
@@ -14237,6 +14228,13 @@ void EmitPass::emitReductionTree( e_opcode op, VISA_Type type, CVariable* src, C
14237
14228
srcElementCount >>= 1;
14238
14229
reductionElementCount >>= 1;
14239
14230
}
14231
+
14232
+ // copy fully reduced elements from src to dst
14233
+ auto* finalLayerDst = m_currShader->GetNewAlias( src, type, 0, dst->GetNumberElement() );
14234
+ m_encoder->SetNoMask();
14235
+ m_encoder->SetSimdSize( lanesToSIMDMode( dst->GetNumberElement() ) );
14236
+ m_encoder->Copy( dst, finalLayerDst );
14237
+ m_encoder->Push();
14240
14238
}
14241
14239
14242
14240
// Recursive function that emits one or more joint reduction trees based on the joint output width
@@ -14250,8 +14248,8 @@ void EmitPass::emitReductionTrees( e_opcode op, VISA_Type type, SIMDMode simdMod
14250
14248
// Do full tree reduction
14251
14249
unsigned int reductionElements = src->GetNumberElement() / dst->GetNumberElement();
14252
14250
unsigned int groupReductionElementCount = reductionElements * simdLanes;
14253
- CVariable* srcAlias = m_currShader->GetNewAlias( src, type, startIdx * reductionElements * m_encoder->GetCISADataTypeSize( type ), groupReductionElementCount, false );
14254
- CVariable* dstAlias = m_currShader->GetNewAlias( dst, type, startIdx * m_encoder->GetCISADataTypeSize( type ), simdLanes, false );
14251
+ CVariable* srcAlias = m_currShader->GetNewAlias( src, type, startIdx * reductionElements * m_encoder->GetCISADataTypeSize( type ), groupReductionElementCount );
14252
+ CVariable* dstAlias = m_currShader->GetNewAlias( dst, type, startIdx * m_encoder->GetCISADataTypeSize( type ), simdLanes);
14255
14253
emitReductionTree( op, type, srcAlias, dstAlias );
14256
14254
// Start new recursive tree if any elements are left
14257
14255
if ( numGroups > simdLanes )
@@ -22569,13 +22567,13 @@ void EmitPass::emitWaveAll(llvm::GenIntrinsicInst* inst)
22569
22567
for( uint16_t i = 0; i < dst->GetNumberElement(); i++ )
22570
22568
{
22571
22569
// Prepare reduceSrc
22572
- CVariable* srcAlias = m_currShader->GetNewAlias( src, type, i * numLanes( m_currShader->m_SIMDSize ) * m_encoder->GetCISADataTypeSize( type ), numLanes( m_currShader->m_SIMDSize ), false );
22573
- CVariable* reduceSrcAlias = m_currShader->GetNewAlias( reduceSrc, type, i * numLanes( m_currShader->m_SIMDSize ) * m_encoder->GetCISADataTypeSize( type ), numLanes( m_currShader->m_SIMDSize ), false );
22570
+ CVariable* srcAlias = m_currShader->GetNewAlias( src, type, i * numLanes( m_currShader->m_SIMDSize ) * m_encoder->GetCISADataTypeSize( type ), numLanes( m_currShader->m_SIMDSize ) );
22571
+ CVariable* reduceSrcAlias = m_currShader->GetNewAlias( reduceSrc, type, i * numLanes( m_currShader->m_SIMDSize ) * m_encoder->GetCISADataTypeSize( type ), numLanes( m_currShader->m_SIMDSize ) );
22574
22572
ScanReducePrepareSrc( type, identity, false, false, srcAlias, reduceSrcAlias );
22575
22573
22576
22574
// Prepare reduceSrcSecondHalf
22577
- CVariable* srcSecondHalfAlias = m_currShader->GetNewAlias( src, type, i * numLanes( m_currShader->m_SIMDSize ) * m_encoder->GetCISADataTypeSize( type ), numLanes( m_currShader->m_SIMDSize ), false );
22578
- CVariable* reduceSrcSecondHalfAlias = m_currShader->GetNewAlias( reduceSrcSecondHalf, type, i * numLanes( m_currShader->m_SIMDSize ) * m_encoder->GetCISADataTypeSize( type ), numLanes( m_currShader->m_SIMDSize ), false );
22575
+ CVariable* srcSecondHalfAlias = m_currShader->GetNewAlias( src, type, i * numLanes( m_currShader->m_SIMDSize ) * m_encoder->GetCISADataTypeSize( type ), numLanes( m_currShader->m_SIMDSize ) );
22576
+ CVariable* reduceSrcSecondHalfAlias = m_currShader->GetNewAlias( reduceSrcSecondHalf, type, i * numLanes( m_currShader->m_SIMDSize ) * m_encoder->GetCISADataTypeSize( type ), numLanes( m_currShader->m_SIMDSize ) );
22579
22577
ScanReducePrepareSrc( type, identity, false, true, srcSecondHalfAlias, reduceSrcSecondHalfAlias );
22580
22578
22581
22579
// Emit correct operations
0 commit comments