@@ -14188,10 +14188,19 @@ void EmitPass::emitReductionTree( e_opcode op, VISA_Type type, CVariable* src, C
14188
14188
for( unsigned int i = 0; i < numIterations; i++ )
14189
14189
{
14190
14190
// Get alias for src0, src1, and dst based on offsets and SIMD size
14191
- auto* layerSrc0 = m_currShader->GetNewAlias( src, type, i * 2 * layerMaxSimdLanes * m_encoder->GetCISADataTypeSize( type ), layerMaxSimdLanes );
14192
- auto* layerSrc1 = m_currShader->GetNewAlias( src, type, ( i * 2 * layerMaxSimdLanes + src1Offset ) * m_encoder->GetCISADataTypeSize( type ), layerMaxSimdLanes );
14193
- auto* layerDst = m_currShader->GetNewAlias( src, type, i * layerMaxSimdLanes * m_encoder->GetCISADataTypeSize( type ), layerMaxSimdLanes );
14194
-
14191
+ auto* layerSrc0 = m_currShader->GetNewAlias( src, type, i * 2 * layerMaxSimdLanes * m_encoder->GetCISADataTypeSize( type ), layerMaxSimdLanes, false );
14192
+ auto* layerSrc1 = m_currShader->GetNewAlias( src, type, ( i * 2 * layerMaxSimdLanes + src1Offset ) * m_encoder->GetCISADataTypeSize( type ), layerMaxSimdLanes, false );
14193
+ CVariable* layerDst;
14194
+ if( (srcElementCount >> 1 <= dst->GetNumberElement()) && (i + 1 == numIterations ))
14195
+ {
14196
+ // Final layer, use destination of WaveAll vector intrinsic inst (passed in with correct offset)
14197
+ layerDst = dst;
14198
+ }
14199
+ else
14200
+ {
14201
+ // Use src as workspace to store intermediate values
14202
+ layerDst = m_currShader->GetNewAlias( src, type, i * layerMaxSimdLanes * m_encoder->GetCISADataTypeSize( type ), layerMaxSimdLanes, false );
14203
+ }
14195
14204
if( !int64EmulationNeeded )
14196
14205
{
14197
14206
m_encoder->SetNoMask();
@@ -14220,13 +14229,6 @@ void EmitPass::emitReductionTree( e_opcode op, VISA_Type type, CVariable* src, C
14220
14229
srcElementCount >>= 1;
14221
14230
reductionElementCount >>= 1;
14222
14231
}
14223
-
14224
- // copy fully reduced elements from src to dst
14225
- auto* finalLayerDst = m_currShader->GetNewAlias( src, type, 0, dst->GetNumberElement() );
14226
- m_encoder->SetNoMask();
14227
- m_encoder->SetSimdSize( lanesToSIMDMode( dst->GetNumberElement() ) );
14228
- m_encoder->Copy( dst, finalLayerDst );
14229
- m_encoder->Push();
14230
14232
}
14231
14233
14232
14234
// Recursive function that emits one or more joint reduction trees based on the joint output width
@@ -14240,8 +14242,8 @@ void EmitPass::emitReductionTrees( e_opcode op, VISA_Type type, SIMDMode simdMod
14240
14242
// Do full tree reduction
14241
14243
unsigned int reductionElements = src->GetNumberElement() / dst->GetNumberElement();
14242
14244
unsigned int groupReductionElementCount = reductionElements * simdLanes;
14243
- CVariable* srcAlias = m_currShader->GetNewAlias( src, type, startIdx * reductionElements * m_encoder->GetCISADataTypeSize( type ), groupReductionElementCount );
14244
- CVariable* dstAlias = m_currShader->GetNewAlias( dst, type, startIdx * m_encoder->GetCISADataTypeSize( type ), simdLanes);
14245
+ CVariable* srcAlias = m_currShader->GetNewAlias( src, type, startIdx * reductionElements * m_encoder->GetCISADataTypeSize( type ), groupReductionElementCount, false );
14246
+ CVariable* dstAlias = m_currShader->GetNewAlias( dst, type, startIdx * m_encoder->GetCISADataTypeSize( type ), simdLanes, false );
14245
14247
emitReductionTree( op, type, srcAlias, dstAlias );
14246
14248
// Start new recursive tree if any elements are left
14247
14249
if ( numGroups > simdLanes )
@@ -22559,13 +22561,13 @@ void EmitPass::emitWaveAll(llvm::GenIntrinsicInst* inst)
22559
22561
for( uint16_t i = 0; i < dst->GetNumberElement(); i++ )
22560
22562
{
22561
22563
// Prepare reduceSrc
22562
- CVariable* srcAlias = m_currShader->GetNewAlias( src, type, i * numLanes( m_currShader->m_SIMDSize ) * m_encoder->GetCISADataTypeSize( type ), numLanes( m_currShader->m_SIMDSize ) );
22563
- CVariable* reduceSrcAlias = m_currShader->GetNewAlias( reduceSrc, type, i * numLanes( m_currShader->m_SIMDSize ) * m_encoder->GetCISADataTypeSize( type ), numLanes( m_currShader->m_SIMDSize ) );
22564
+ CVariable* srcAlias = m_currShader->GetNewAlias( src, type, i * numLanes( m_currShader->m_SIMDSize ) * m_encoder->GetCISADataTypeSize( type ), numLanes( m_currShader->m_SIMDSize ), false );
22565
+ CVariable* reduceSrcAlias = m_currShader->GetNewAlias( reduceSrc, type, i * numLanes( m_currShader->m_SIMDSize ) * m_encoder->GetCISADataTypeSize( type ), numLanes( m_currShader->m_SIMDSize ), false );
22564
22566
ScanReducePrepareSrc( type, identity, false, false, srcAlias, reduceSrcAlias );
22565
22567
22566
22568
// Prepare reduceSrcSecondHalf
22567
- CVariable* srcSecondHalfAlias = m_currShader->GetNewAlias( src, type, i * numLanes( m_currShader->m_SIMDSize ) * m_encoder->GetCISADataTypeSize( type ), numLanes( m_currShader->m_SIMDSize ) );
22568
- CVariable* reduceSrcSecondHalfAlias = m_currShader->GetNewAlias( reduceSrcSecondHalf, type, i * numLanes( m_currShader->m_SIMDSize ) * m_encoder->GetCISADataTypeSize( type ), numLanes( m_currShader->m_SIMDSize ) );
22569
+ CVariable* srcSecondHalfAlias = m_currShader->GetNewAlias( src, type, i * numLanes( m_currShader->m_SIMDSize ) * m_encoder->GetCISADataTypeSize( type ), numLanes( m_currShader->m_SIMDSize ), false );
22570
+ CVariable* reduceSrcSecondHalfAlias = m_currShader->GetNewAlias( reduceSrcSecondHalf, type, i * numLanes( m_currShader->m_SIMDSize ) * m_encoder->GetCISADataTypeSize( type ), numLanes( m_currShader->m_SIMDSize ), false );
22569
22571
ScanReducePrepareSrc( type, identity, false, true, srcSecondHalfAlias, reduceSrcSecondHalfAlias );
22570
22572
22571
22573
// Emit correct operations
0 commit comments