Add blocked convolution #768

Open
wants to merge 79 commits into base: master
79 commits
89574a9
Fix current progress
Aug 26, 2022
f1e664b
Fix a couple more bugs
Aug 26, 2022
75bccb1
Organize tests in a better way
Aug 26, 2022
05ac4d9
Fix Trivial/PaddingHeight test
Aug 26, 2022
c52427d
HELLL YEAHHHH It passes all the given test!!!
Aug 26, 2022
5f68ebe
Add tests from yolox
Aug 26, 2022
93550c2
Add simple performance check
Aug 29, 2022
affcfe1
Use COMPUTE_BLOCK macro
Aug 29, 2022
96b189c
Move ClearBlock to macro
Aug 29, 2022
497175b
Move ProcessFilterCountN to macro
Aug 29, 2022
40601bd
Move PROCESS_OUTPUT_COUNT_N to macro
Aug 29, 2022
4bc7630
Optimize PackData and UnpackData functions
Aug 31, 2022
5d43103
Fix convDesc leak
Aug 31, 2022
6b019b2
Refactor measurements
Aug 31, 2022
ecf9efc
Add temp code
Sep 1, 2022
21eab6e
DEBUG (expand ISimdMathEngine interface)
Sep 24, 2022
f76fbf0
DEBUG add AvxTestDesktop
Sep 24, 2022
8d3cda2
Test commit
Sep 25, 2022
3c4b03d
Add blocked convolution with macro to NeoMathEngineAvx
Sep 27, 2022
6ae35b5
Fix compilation warning in NeoMathEngine test
Sep 27, 2022
aa50778
Add blocked convolution test for NeoMathEngineAvx
Sep 27, 2022
6af3f0d
EXPERIMENTAL: start moving to JIT
Oct 9, 2022
9aac8e6
Use single generator for all of the broadcast values
Oct 17, 2022
d20c990
Remove unused code
Oct 17, 2022
69e8d29
Pass arguments to JIT via struct
Oct 18, 2022
0082a87
Move kernelHeight and kernelWidth loops to JIT
Oct 19, 2022
2299071
Move CLEAR_BLOCK macro to JIT
Oct 19, 2022
76a7519
Move postProcessing to JIT
Oct 19, 2022
b730e7d
Use loops during JIT
Oct 21, 2022
f56f27c
Even less lines
Oct 21, 2022
f303189
Manually assign registers
Oct 22, 2022
0983e8f
Use RSP instead of Param1
Oct 23, 2022
e973333
Shorten the macro (a little bit)
Oct 23, 2022
c7cdfae
Wrap generator into C++ class (will be useful in future)
Oct 23, 2022
610d32a
Move filterCount and outputCount to generation parameters
Oct 23, 2022
e920418
Move PROCESS_FILTER_COUNT_N to JIT
Oct 23, 2022
d92b687
Move all we need to JIT!!!
Oct 23, 2022
f7b74e0
Merge branch 'master' into BlockedConvExperiments
Oct 24, 2022
7497091
Disable NeoAvxTestDesktop for 32-bit and for FineObjects
Oct 24, 2022
0456104
Disable BlockedConv tests for FineObjects
Oct 24, 2022
74e15b2
Use blocked conv when possible
Oct 24, 2022
874ecfc
DEBUG: temporary disable blocked conv
Oct 25, 2022
f5be1d8
Extend test set
Oct 25, 2022
f97f47f
Enable blocked convolution when heuristics are OK
Oct 25, 2022
fc7df2e
Add more statistics to BlockedConv Real tests
Oct 26, 2022
93a5ac6
Overflow protection
Oct 26, 2022
98dfc10
Reduce the number of ZEROUPPER calls
Oct 26, 2022
b5b0c3a
Better tuning
Oct 26, 2022
1726218
Reduce memory consumption
Oct 26, 2022
85c40b5
Add even more tests to NeoAvxTestDesktop
Oct 26, 2022
067e1c8
Start cleaning up this mess...
Oct 28, 2022
011332d
Use calcOutputPad for width
Oct 28, 2022
64bcd1f
Don't depend on the order of fields in CBlockedConvGen::CParams
Oct 28, 2022
1d3d647
More parameter renamings and optimizations
Oct 28, 2022
247521c
Fix comment
Oct 28, 2022
a8d9b79
Use more clear indexes for Ymm accumulators
Oct 28, 2022
9621ba2
Clarify bias Ymms
Oct 28, 2022
9fa5a15
Clarify a bit more
Oct 29, 2022
572e277
Improve naming
Oct 29, 2022
3963962
Optimize passing arguments runConv
Oct 29, 2022
cf1917d
Add multithreading
Oct 29, 2022
2385013
Remove fixed TODO comment
Oct 29, 2022
008a9fa
Turn on all the heuristics
Oct 31, 2022
f75dffb
Use NeoML terms
Oct 31, 2022
7090124
Reorder fields
Oct 31, 2022
6c5700f
More renamings
Oct 31, 2022
63fe462
Add more comments
Oct 31, 2022
9a9272e
Fix some naming and add some comments
Nov 14, 2022
d0b54ca
Minor optimization and even more comments
Nov 14, 2022
857d06c
Fix test skip condition
Nov 14, 2022
ede4132
Fix non-MSVC compilation
Nov 14, 2022
25fc832
Reduce diff size
Nov 14, 2022
af96c8b
Refactor tests
Nov 15, 2022
591a047
Add more heuristics for blocked convolution
Nov 15, 2022
75aa7ac
Grammar fix
Nov 15, 2022
ab0638c
Disable AvxTestDesktop everywhere but Windows
Nov 15, 2022
f48de7c
Merge branch 'master' into BlockedConvExperiments
Dec 10, 2022
69dc1f1
Fix compilation
Dec 10, 2022
1db000d
Disable blocked conv for AVX512
Dec 19, 2022
3 changes: 3 additions & 0 deletions NeoMathEngine/CMakeLists.txt
@@ -47,4 +47,7 @@ add_subdirectory(src)
if(NeoMathEngine_BUILD_TESTS AND NOT IOS AND NOT ANDROID)
enable_testing()
add_subdirectory(test/FullTestDesktop)
if(WIN32 AND NOT USE_FINE_OBJECTS AND CMAKE_SIZEOF_VOID_P EQUAL 8)
add_subdirectory(test/AvxTestDesktop)
endif()
endif()
9 changes: 9 additions & 0 deletions NeoMathEngine/include/NeoMathEngine/SimdMathEngine.h
@@ -41,6 +41,15 @@ class ISimdMathEngine : public CCrtAllocatedObject {
virtual void BlobConvolution( const CConvolutionDesc& convDesc, const float* source,
const float* filter, const float* freeTerm, float* result ) const = 0;

virtual CConvolutionDesc* InitBlockedConvolution( const CBlobDesc& source, int paddingHeight, int paddingWidth,
int strideHeight, int strideWidth, int dilationHeight, int dilationWidth, const CBlobDesc& filter,
const CBlobDesc& result ) const = 0;
virtual void PackBlockedData(const CBlobDesc& desc, const float* source, float* result) const = 0;
virtual void UnpackBlockedData( const CBlobDesc& desc, const float* source, float* result ) const = 0;
virtual void PackBlockedFilter( const CBlobDesc& desc, const float* source, float* result ) const = 0;
virtual void BlockedConvolution( const CConvolutionDesc& convDesc, const float* packedSource,
const float* packedFilter, const float* freeTerm, float* packedResult ) const = 0;

virtual SgemmFunc GetSgemmFunction() const = 0;

virtual void Tanh( float* dst, const float* src, size_t dataSize, bool isMultithread = true ) = 0;
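The interface additions above suggest a channel-blocked memory layout, where channels are grouped into fixed-size blocks sized to one AVX ymm register (8 floats). As a rough illustration of what `PackBlockedData` might do — the actual block size and layout are internal to the AVX engine and assumed here, and the function below is a hypothetical stand-in, not NeoML's implementation — a repacking from an HWC tensor into `[ceil(C/8)][H][W][8]` blocks looks like:

```cpp
#include <cassert>
#include <cstddef>
#include <vector>

// Hypothetical sketch: repack an HWC tensor into a channel-blocked layout
// [ceil(C/8)][H][W][8], so that the 8 channels of one block load into a
// single AVX ymm register. Tail channels are zero-padded.
constexpr int kBlock = 8;

std::vector<float> PackBlocked( const float* src, int h, int w, int c )
{
	const int blocks = ( c + kBlock - 1 ) / kBlock;
	std::vector<float> dst( static_cast<size_t>( blocks ) * h * w * kBlock, 0.f );
	for( int y = 0; y < h; ++y ) {
		for( int x = 0; x < w; ++x ) {
			for( int ch = 0; ch < c; ++ch ) {
				const int b = ch / kBlock; // channel block index
				const int r = ch % kBlock; // position inside the block
				dst[( ( static_cast<size_t>( b ) * h + y ) * w + x ) * kBlock + r] =
					src[( static_cast<size_t>( y ) * w + x ) * c + ch];
			}
		}
	}
	return dst;
}
```

`UnpackBlockedData` would be the inverse transform, and `PackBlockedFilter` an analogous repacking of filter weights.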
36 changes: 30 additions & 6 deletions NeoMathEngine/src/CPU/CpuMathEngineDnnConv.cpp
@@ -45,13 +45,16 @@ struct CCpuConvolutionDesc : public CCommonConvolutionDesc {
TConvAlgo ForwardAlgo;
TConvAlgo BackwardAlgo;
std::unique_ptr<CConvolutionDesc> SimdConvolutionDesc;
std::unique_ptr<CConvolutionDesc> BlockedConvolutionDesc;

CCpuConvolutionDesc( std::unique_ptr<CConvolutionDesc>& simdConvolutionDesc, const CBlobDesc& source, const CBlobDesc& result, const CBlobDesc& filter,
int paddingHeight, int paddingWidth, int strideHeight, int strideWidth, int dilationHeight, int dilationWidth ) :
CCpuConvolutionDesc( std::unique_ptr<CConvolutionDesc>& simdConvolutionDesc, std::unique_ptr<CConvolutionDesc>& blockedConvolutionDesc,
const CBlobDesc& source, const CBlobDesc& result, const CBlobDesc& filter, int paddingHeight, int paddingWidth,
int strideHeight, int strideWidth, int dilationHeight, int dilationWidth ) :
CCommonConvolutionDesc( source, result, filter, paddingHeight, paddingWidth, strideHeight, strideWidth, dilationHeight, dilationWidth ),
ForwardAlgo( getActualForwardAlgo() ),
BackwardAlgo( getActualBackwardAlgo() ),
SimdConvolutionDesc( std::move( simdConvolutionDesc ) )
SimdConvolutionDesc( std::move( simdConvolutionDesc ) ),
BlockedConvolutionDesc( std::move( blockedConvolutionDesc ) )
{
}

@@ -131,14 +134,20 @@ CConvolutionDesc* CCpuMathEngine::InitBlobConvolution( const CBlobDesc& source,
ASSERT_EXPR( result.Channels() == filter.BatchWidth() );
ASSERT_EXPR( result.Depth() == 1 );

std::unique_ptr<CConvolutionDesc> simdConvolutionDesc;
std::unique_ptr<CConvolutionDesc> blockedConvolutionDesc;
if( simdMathEngine != nullptr ) {
blockedConvolutionDesc.reset( simdMathEngine->InitBlockedConvolution( source, paddingHeight, paddingWidth,
strideHeight, strideWidth, dilationHeight, dilationWidth, filter, result ) );
}

std::unique_ptr<CConvolutionDesc> simdConvolutionDesc;
if( simdMathEngine != nullptr && blockedConvolutionDesc == nullptr ) {
simdConvolutionDesc = std::unique_ptr<CConvolutionDesc>( simdMathEngine->InitBlobConvolution( source, paddingHeight, paddingWidth,
strideHeight, strideWidth, dilationHeight, dilationWidth, filter, result ) );
}

CCpuConvolutionDesc* desc = new CCpuConvolutionDesc( simdConvolutionDesc, source, result, filter,
paddingHeight, paddingWidth, strideHeight, strideWidth, dilationHeight, dilationWidth );
CCpuConvolutionDesc* desc = new CCpuConvolutionDesc( simdConvolutionDesc, blockedConvolutionDesc, source, result,
filter, paddingHeight, paddingWidth, strideHeight, strideWidth, dilationHeight, dilationWidth );
return desc;
}

@@ -517,6 +526,21 @@ void CCpuMathEngine::BlobConvolution( const CConvolutionDesc& convDesc, const CC

const CCpuConvolutionDesc& desc = static_cast<const CCpuConvolutionDesc&>( convDesc );

if( desc.BlockedConvolutionDesc != nullptr ) {
CFloatHandleStackVar packBuff( *this,
std::max<int>( desc.Source.BlobSize(), desc.Result.BlobSize() ) + desc.Filter.BlobSize() );
float* packedFilter = GetRaw( packBuff.GetHandle() );
float* packedIO = packedFilter + desc.Filter.BlobSize();
float* rawResult = GetRaw( result );
simdMathEngine->PackBlockedData( desc.Source, GetRaw( source ), packedIO );
simdMathEngine->PackBlockedFilter( desc.Filter, GetRaw( filter ), packedFilter );
simdMathEngine->BlockedConvolution( *desc.BlockedConvolutionDesc, packedIO, packedFilter,
freeTerm != nullptr ? GetRaw( *freeTerm ) : nullptr, rawResult );
simdMathEngine->UnpackBlockedData( desc.Result, rawResult, packedIO );
dataCopy( rawResult, packedIO, desc.Result.BlobSize() );
return;
}

if( desc.SimdConvolutionDesc != nullptr ) {
simdMathEngine->BlobConvolution( *desc.SimdConvolutionDesc, sourceRaw, filterRaw, freeTermRaw, resultRaw );
return;
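The blocked branch above allocates one stack buffer holding the packed filter followed by a shared area that is reused first for the packed input and then for the packed output — hence `std::max` over the source and result blob sizes. A small sketch of that sizing arithmetic (struct and names hypothetical, for illustration only):

```cpp
#include <algorithm>
#include <cassert>
#include <cstddef>

// Sketch of the single-buffer carve-up used by the blocked-convolution
// branch: [ packed filter | shared packed input/output area ].
struct CPackBufferLayout {
	size_t Total;        // total elements to allocate
	size_t FilterOffset; // packed filter starts here (always 0)
	size_t IOOffset;     // shared packed input/output area starts here
};

CPackBufferLayout PlanPackBuffer( size_t sourceSize, size_t filterSize, size_t resultSize )
{
	CPackBufferLayout layout;
	layout.FilterOffset = 0;
	layout.IOOffset = filterSize;
	// The I/O area must fit whichever of the packed input and output is larger.
	layout.Total = filterSize + std::max( sourceSize, resultSize );
	return layout;
}
```

This is why the branch can unpack the result into `packedIO` and then copy it back over `rawResult`: once the convolution has run, the packed input is no longer needed and its area is free to hold the unpacked output.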
2 changes: 2 additions & 0 deletions NeoMathEngine/src/CPU/x86/avx/CMakeLists.txt
@@ -25,8 +25,10 @@ target_sources(${PROJECT_NAME}
./src/BlobConvolution_jit_FltCnt_18.inl
./src/BlobConvolution_jit_FltCnt_24.inl
./src/BlobConvolution_jit_FltCnt_32.inl
./src/BlobBlockedConvolution.cpp
./src/PrimitivesJit.h
./src/AvxCommon.h
./src/AvxMathEngine.h
./src/JitCommon.h
./src/MatrixMultiplyingInterleaved/Interleavers/Interleavers.h
./src/MatrixMultiplyingInterleaved/MicroKernels/Kernel_AVX_6x16.h
27 changes: 1 addition & 26 deletions NeoMathEngine/src/CPU/x86/avx/src/AvxMathEngine.cpp
@@ -19,6 +19,7 @@ limitations under the License.
#include <NeoMathEngine/SimdMathEngine.h>
#include <BlobConvolution.h>
#include <PrimitivesJit.h>
#include <AvxMathEngine.h>
#include <CPUInfo.h>

namespace NeoML {
@@ -48,32 +49,6 @@ CAvxConvolutionDesc::CAvxConvolutionDesc( IMathEngine* mathEngine, const CBlobDe
{
}

class CAvxMathEngine : public ISimdMathEngine {
public:
CAvxMathEngine( IMathEngine* _mathEngine, int _threadCount ) :
mathEngine( _mathEngine ), threadCount( _threadCount ), primitives( _mathEngine, _threadCount ) {}

CConvolutionDesc* InitBlobConvolution( const CBlobDesc& source, int paddingHeight, int paddingWidth,
int strideHeight, int strideWidth, int dilationHeight, int dilationWidth, const CBlobDesc& filter,
const CBlobDesc& result ) const override;

void BlobConvolution( const CConvolutionDesc& convDesc, const float* source,
const float* filter, const float* freeTerm, float* result ) const override;

SgemmFunc GetSgemmFunction() const override;

void Tanh( float* dst, const float* src, size_t dataSize, bool isMultithread ) override;
void Sigmoid( float* dst, const float* src, size_t dataSize, bool isMultithread ) override;
void Exp( float* dst, const float* src, size_t dataSize, bool isMultithread ) override;
void RunOnceRestOfLstm( CMathEngineLstmDesc* desc, const CConstFloatHandle& inputStateBackLink,
const CFloatHandle& outputStateBackLink, const CFloatHandle& outputMainBackLink, bool isMultithread ) override;

private:
IMathEngine* mathEngine;
int threadCount;
CPrimitivesJit primitives;
};

CConvolutionDesc* CAvxMathEngine::InitBlobConvolution( const CBlobDesc& source, int paddingHeight, int paddingWidth,
int strideHeight, int strideWidth, int dilationHeight, int dilationWidth, const CBlobDesc& filter,
const CBlobDesc& result ) const
59 changes: 59 additions & 0 deletions NeoMathEngine/src/CPU/x86/avx/src/AvxMathEngine.h
@@ -0,0 +1,59 @@
/* Copyright © 2017-2022 ABBYY Production LLC

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
--------------------------------------------------------------------------------------------------------------*/

#pragma once

#include <NeoMathEngine/SimdMathEngine.h>
#include <PrimitivesJit.h>

namespace NeoML {

class CAvxMathEngine : public ISimdMathEngine {
public:
CAvxMathEngine( IMathEngine* _mathEngine, int _threadCount ) :
mathEngine( _mathEngine ), threadCount( _threadCount ), primitives( _mathEngine, _threadCount ) {}

CConvolutionDesc* InitBlobConvolution( const CBlobDesc& source, int paddingHeight, int paddingWidth,
int strideHeight, int strideWidth, int dilationHeight, int dilationWidth, const CBlobDesc& filter,
const CBlobDesc& result ) const override;

void BlobConvolution( const CConvolutionDesc& convDesc, const float* source,
const float* filter, const float* freeTerm, float* result ) const override;

virtual CConvolutionDesc* InitBlockedConvolution( const CBlobDesc& source, int paddingHeight, int paddingWidth,
int strideHeight, int strideWidth, int dilationHeight, int dilationWidth, const CBlobDesc& filter,
const CBlobDesc& result ) const override;
void PackBlockedData( const CBlobDesc& desc, const float* source, float* result ) const override;
void UnpackBlockedData( const CBlobDesc& desc, const float* source, float* result ) const override;
void PackBlockedFilter( const CBlobDesc& desc, const float* source, float* result ) const override;
void BlockedConvolution( const CConvolutionDesc& convDesc, const float* packedSource,
const float* packedFilter, const float* freeTerm, float* packedResult ) const override;

SgemmFunc GetSgemmFunction() const override;

void Tanh( float* dst, const float* src, size_t dataSize, bool isMultithread ) override;
void Sigmoid( float* dst, const float* src, size_t dataSize, bool isMultithread ) override;
void Exp( float* dst, const float* src, size_t dataSize, bool isMultithread ) override;
void RunOnceRestOfLstm( CMathEngineLstmDesc* desc, const CConstFloatHandle& inputStateBackLink,
const CFloatHandle& outputStateBackLink, const CFloatHandle& outputMainBackLink, bool isMultithread ) override;

private:
IMathEngine* mathEngine;
int threadCount;
CPrimitivesJit primitives;
};

}