Commit 7b26d3d

[NeoML] MultiheadAttentionPerformerLayer

Signed-off-by: Kirill Golikov <[email protected]>

1 parent 236c0e7

8 files changed: +1332 additions, −0 deletions

NeoML/include/NeoML/Dnn/Layers/FavorAttentionPerformerLayer.h

Lines changed: 88 additions & 0 deletions (new file)

/* Copyright © 2023 ABBYY

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

	http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
--------------------------------------------------------------------------------------------------------------*/

#pragma once

#include <NeoML/Dnn/Dnn.h>

namespace NeoML {

struct CFavorAttentionDesc;

// Computes FAVOR normalized self-attention.
// https://arxiv.org/pdf/2009.14794.pdf
//
// Inputs: query, key, value
// Emulates the equation: Output ~~ softmax( query * ( key )^T / normalizer ) * value
//
//        output
//           ^
//           |
//   +---------------+
//   |  F A V O R    |  <-- projection_matrix
//   |   Attention   |
//   +---------------+
//      ^    ^    ^
//      |    |    |
//    query key value
//
class NEOML_API CFavorAttentionPerformerLayer : public CBaseLayer {
	NEOML_DNN_LAYER( CFavorAttentionPerformerLayer )
public:
	// Possible activation kernel transformations
	enum class TAKernel { SoftMax = 0, ReLU = 1 };
	// Layer inputs
	enum TInput { TI_Q = 0, TI_K = 1, TI_V = 2 };

	CFavorAttentionPerformerLayer( IMathEngine& mathEngine, const char* name = nullptr );

	// The number of random features used by the kernel approximation
	int GetRandomFeaturesCount() const { return randomFeaturesCount; }
	void SetRandomFeaturesCount( int randomFeaturesCount );

	// The activation kernel transformation
	int GetActivationKernel() const { return static_cast<int>( activation ); }
	void SetActivationKernel( int activation );

	// Whether the attention is causal (auto-regressive)
	bool GetCausal() const { return causal; }
	void SetCausal( bool causal );

	// Whether a random projection matrix is applied (should be true for the SoftMax kernel)
	bool GetProjectionMatrixType() const { return projectionMatrixType; }
	void SetProjectionMatrixType( bool projectionMatrixType );

	void Serialize( CArchive& archive ) override;

protected:
	~CFavorAttentionPerformerLayer();

	// Creates output blobs using the input blobs
	void Reshape() override;
	// One step of a forward pass
	void RunOnce() override;
	// One step of a backward pass
	void BackwardOnce() override;

private:
	int randomFeaturesCount = 1; // The number of random features to be used
	TAKernel activation = TAKernel::SoftMax; // The activation kernel
	bool causal = false; // Auto-regressive attention or not
	bool projectionMatrixType = true; // Whether a random projection matrix is applied (should be true for SoftMax)
	CFavorAttentionDesc* desc = nullptr; // The FAVOR attention description

	void destroyFavorAttentionDesc();
};

NEOML_API CLayerWrapper<CFavorAttentionPerformerLayer> FavorAttentionPerformer(
	int randomFeaturesCount, int activation, bool causal, bool projectionMatrixType );

} // namespace NeoML
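
For reference, the approximation this layer computes (FAVOR+, from the paper linked in the header): instead of materializing the full L x L softmax attention matrix, queries and keys are passed through a random-feature map phi built from randomFeaturesCount random projections, so that

	Output ~~ D^{-1} * phi(Q) * ( phi(K)^T * V ),    D = diag( phi(Q) * ( phi(K)^T * 1_L ) )

where 1_L is a vector of L ones. This lowers the cost from O(L^2 * d) to O(L * m * d) for sequence length L, head dimension d, and m = randomFeaturesCount.

A minimal usage sketch, not part of the commit: mathEngine, dnn, and the q/k/v layers producing the query, key, and value blobs are assumed to already exist; the layer name and parameter values are illustrative.

// Sketch: wiring CFavorAttentionPerformerLayer into an existing network
CPtr<CFavorAttentionPerformerLayer> favor =
	new CFavorAttentionPerformerLayer( mathEngine, "favorAttention" );
favor->SetRandomFeaturesCount( 64 ); // m random features for the kernel map
favor->SetActivationKernel( 0 );     // 0 == TAKernel::SoftMax
favor->SetCausal( false );           // bidirectional, not auto-regressive
dnn.AddLayer( *favor );
favor->Connect( CFavorAttentionPerformerLayer::TI_Q, *q );
favor->Connect( CFavorAttentionPerformerLayer::TI_K, *k );
favor->Connect( CFavorAttentionPerformerLayer::TI_V, *v );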

NeoML/include/NeoML/Dnn/Layers/MultiheadAttentionPerformerLayer.h

Lines changed: 107 additions & 0 deletions (new file)

/* Copyright © 2023 ABBYY

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

	http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
--------------------------------------------------------------------------------------------------------------*/

#pragma once

#include <NeoML/Dnn/Dnn.h>
#include <NeoML/Dnn/Layers/CompositeLayer.h>
#include <NeoML/Dnn/Layers/FavorAttentionPerformerLayer.h>

namespace NeoML {

// Multihead Self Attention Performer
// https://arxiv.org/pdf/2009.14794.pdf
// Implementation of the multihead FAVOR-attention and FAVOR-self-attention layers.
//
// +----------------------+--------+-------------------------------------------------------
// | Parameter            | Type   | Description
// +----------------------+--------+-------------------------------------------------------
// | HiddenSize           | int    | the output dimension of the hidden layer
// | HeadCount            | int    | the number of heads that repeat the same attention structure
// | OutputSize           | int    | the size of the output
// | DropoutRate (TODO)   | float  | the dropout rate inside the attention, for training
// +----------------------+--------+-------------------------------------------------------
class NEOML_API CMultiheadAttentionPerformerLayer : public CCompositeLayer {
	NEOML_DNN_LAYER( CMultiheadAttentionPerformerLayer )
public:
	explicit CMultiheadAttentionPerformerLayer( IMathEngine& mathEngine );

	// The activation kernel transformation: SoftMax(=0) or ReLU(=1)
	// SoftMax by default
	int GetActivationKernel() const { return favor->GetActivationKernel(); }
	void SetActivationKernel( int activationKernel )
		{ favor->SetActivationKernel( activationKernel ); }

	// The number of heads in the attention
	// GetHiddenSize() must be a multiple of this value
	// By default the attention consists of 1 head
	int GetHeadCount() const { return headCount; }
	void SetHeadCount( int headCount );

	// The size of the trainable matrices
	// Must be a multiple of GetHeadCount()
	int GetHiddenSize() const { return hiddenSize; }
	void SetHiddenSize( int hiddenSize );

	// The size of the output
	int GetOutputSize() const { return outputSize; }
	void SetOutputSize( int outputSize );

	void Serialize( CArchive& archive ) override;

	// Recreates the layer if forceRebuild is true or if it does not contain sublayers yet
	void Rebuild( bool forceRebuild );

protected:
	void Reshape() override;

private:
	// The number of heads
	int headCount;
	// The size of the trainable matrices
	int hiddenSize;
	// The size of the output
	int outputSize;

	CPtr<CFavorAttentionPerformerLayer> favor;

	// Layer inputs
	enum TInputs {
		I_Q = 0,
		I_K = 1,
		I_V = 2,
		I_Mask = 3
	};

	// Layer outputs
	enum TOutputs {
		O_Output = 0,
		O_Softmax = 1
	};

	bool isCreated() const { return HasLayer( "Q" ); }
	void create();

	CBaseLayer* multiplyInputByMatrixWeights( int size, const char* name, TInputs input );
	CBaseLayer* multiplyByMatrixWeights( CBaseLayer* input, int width, const char* prefix );
	CBaseLayer* prepareQ( CBaseLayer* input );
	CBaseLayer* prepareKV( CBaseLayer* input );
	CBaseLayer* prepareOutput( CBaseLayer* input );
};

NEOML_API CLayerWrapper<CMultiheadAttentionPerformerLayer> MultiheadAttentionPerformer(
	int headCount, int hiddenSize, int outputSize );

} // namespace NeoML
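
A minimal construction sketch under the same assumptions as above (mathEngine, dnn, and an input layer exist; the layer name and sizes are illustrative). Since this is a self-attention block, the same sequence can feed all three inputs:

// Sketch: an 8-head self-attention block, hiddenSize 512 = 8 heads * 64 per head
CPtr<CMultiheadAttentionPerformerLayer> attention =
	new CMultiheadAttentionPerformerLayer( mathEngine );
attention->SetName( "selfAttention" );
attention->SetHeadCount( 8 );
attention->SetHiddenSize( 512 ); // must be a multiple of the head count
attention->SetOutputSize( 512 );
dnn.AddLayer( *attention );
attention->Connect( 0, *input ); // query
attention->Connect( 1, *input ); // key
attention->Connect( 2, *input ); // value

The declared wrapper offers the same configuration in NeoML's functional graph-building style, e.g. MultiheadAttentionPerformer( 8, 512, 512 ) applied to the input layers.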

NeoML/include/NeoML/NeoML.h

Lines changed: 2 additions & 0 deletions
@@ -115,6 +115,7 @@ limitations under the License.
 #include <NeoML/Dnn/Layers/DepthToSpaceLayer.h>
 #include <NeoML/Dnn/Layers/DotProductLayer.h>
 #include <NeoML/Dnn/Layers/EnumBinarizationLayer.h>
+#include <NeoML/Dnn/Layers/FavorAttentionPerformerLayer.h>
 #include <NeoML/Dnn/Layers/FocalLossLayer.h>
 #include <NeoML/Dnn/Layers/FullyConnectedSourceLayer.h>
 #include <NeoML/Dnn/Layers/GlobalMaxPoolingLayer.h>
@@ -130,6 +131,7 @@ limitations under the License.
 #include <NeoML/Dnn/Layers/LrnLayer.h>
 #include <NeoML/Dnn/Layers/MaxOverTimePoolingLayer.h>
 #include <NeoML/Dnn/Layers/ModelWrapperLayer.h>
+#include <NeoML/Dnn/Layers/MultiheadAttentionPerformerLayer.h>
 #include <NeoML/Dnn/Layers/MultiHingeLossLayer.h>
 #include <NeoML/Dnn/Layers/PositionalEmbeddingLayer.h>
 #include <NeoML/Dnn/Layers/PrecisionRecallLayer.h>

NeoML/src/CMakeLists.txt

Lines changed: 4 additions & 0 deletions
@@ -117,6 +117,7 @@ set(NeoML_SOURCES
 	Dnn/Layers/DotProductLayer.cpp
 	Dnn/Layers/EnumBinarizationLayer.cpp
 	Dnn/Layers/FocalLossLayer.cpp
+	Dnn/Layers/FavorAttentionPerformerLayer.cpp
 	Dnn/Layers/FullyConnectedSourceLayer.cpp
 	Dnn/Layers/GlobalMaxPoolingLayer.cpp
 	Dnn/Layers/GlobalSumPoolingLayer.cpp
@@ -132,6 +133,7 @@ set(NeoML_SOURCES
 	Dnn/Layers/MaxOverTimePoolingLayer.cpp
 	Dnn/Layers/MobileNetV3BlockLayer.cpp
 	Dnn/Layers/ModelWrapperLayer.cpp
+	Dnn/Layers/MultiheadAttentionPerformerLayer.cpp
 	Dnn/Layers/ObjectNormalizationLayer.cpp
 	Dnn/Layers/Onnx/OnnxEltwiseLayer.cpp
 	Dnn/Layers/Onnx/OnnxCastLayer.cpp
@@ -377,6 +379,7 @@ set(NeoML_HEADERS
 	../include/NeoML/Dnn/Layers/DotProductLayer.h
 	../include/NeoML/Dnn/Layers/EnumBinarizationLayer.h
 	../include/NeoML/Dnn/Layers/FocalLossLayer.h
+	../include/NeoML/Dnn/Layers/FavorAttentionPerformerLayer.h
 	../include/NeoML/Dnn/Layers/FullyConnectedSourceLayer.h
 	../include/NeoML/Dnn/Layers/GlobalMaxPoolingLayer.h
 	../include/NeoML/Dnn/Layers/GlobalSumPoolingLayer.h
@@ -392,6 +395,7 @@ set(NeoML_HEADERS
 	../include/NeoML/Dnn/Layers/MaxOverTimePoolingLayer.h
 	../include/NeoML/Dnn/Layers/MobileNetV3BlockLayer.h
 	../include/NeoML/Dnn/Layers/ModelWrapperLayer.h
+	../include/NeoML/Dnn/Layers/MultiheadAttentionPerformerLayer.h
 	../include/NeoML/Dnn/Layers/MultiHingeLossLayer.h
 	../include/NeoML/Dnn/Layers/ObjectNormalizationLayer.h
 	../include/NeoML/Dnn/Layers/Onnx/OnnxEltwiseLayer.h

NeoML/src/Dnn/Dnn.cpp

Lines changed: 4 additions & 0 deletions
@@ -72,6 +72,7 @@ limitations under the License.
 #include <NeoML/Dnn/Layers/DepthToSpaceLayer.h>
 #include <NeoML/Dnn/Layers/DotProductLayer.h>
 #include <NeoML/Dnn/Layers/EnumBinarizationLayer.h>
+#include <NeoML/Dnn/Layers/FavorAttentionPerformerLayer.h>
 #include <NeoML/Dnn/Layers/FocalLossLayer.h>
 #include <NeoML/Dnn/Layers/FullyConnectedSourceLayer.h>
 #include <NeoML/Dnn/Layers/GlobalMaxPoolingLayer.h>
@@ -88,6 +89,7 @@ limitations under the License.
 #include <NeoML/Dnn/Layers/MaxOverTimePoolingLayer.h>
 #include <NeoML/Dnn/Layers/MobileNetV3BlockLayer.h>
 #include <NeoML/Dnn/Layers/ModelWrapperLayer.h>
+#include <NeoML/Dnn/Layers/MultiheadAttentionPerformerLayer.h>
 #include <NeoML/Dnn/Layers/MultiHingeLossLayer.h>
 #include <NeoML/Dnn/Layers/PositionalEmbeddingLayer.h>
 #include <NeoML/Dnn/Layers/PrecisionRecallLayer.h>
@@ -349,6 +351,7 @@ REGISTER_NEOML_LAYER( CCtcDecodingLayer, "FmlCnnCtcDecodingLayer" )
 REGISTER_NEOML_LAYER( CCtcLossLayer, "FmlCnnCtcLossLayer" )
 REGISTER_NEOML_LAYER( CDotProductLayer, "FmlCnnDotProductLayer" )
 REGISTER_NEOML_LAYER( CEnumBinarizationLayer, "FmlCnnEnumBinarizationLayer" )
+REGISTER_NEOML_LAYER( CFavorAttentionPerformerLayer, "NeoMLDnnFavorAttentionPerformerLayer" )
 REGISTER_NEOML_LAYER( CGlobalMaxPoolingLayer, "FmlCnnGlobalMaxPoolingLayer" )
 REGISTER_NEOML_LAYER( CGrnLayer, "NeoMLDnnGrnLayer" )
 REGISTER_NEOML_LAYER( CGruLayer, "FmlCnnGruLayer" )
@@ -360,6 +363,7 @@ REGISTER_NEOML_LAYER( CLoraFullyConnectedLayer, "NeoMLDnnLoraFullyConnectedLayer
 REGISTER_NEOML_LAYER( CMaxOverTimePoolingLayer, "FmlCnnMaxOverTimePoolingLayer" )
 REGISTER_NEOML_LAYER( CMobileNetV3PreSEBlockLayer, "NeoMLDnnMobileNetV3PreSEBlockLayer" )
 REGISTER_NEOML_LAYER( CMobileNetV3PostSEBlockLayer, "NeoMLDnnMobileNetV3PostSEBlockLayer" )
+REGISTER_NEOML_LAYER( CMultiheadAttentionPerformerLayer, "NeoMLDnnMultiheadAttentionPerformerLayer" )
 REGISTER_NEOML_LAYER( CMultiHingeLossLayer, "FmlCnnMultyHingeLossLayer" )
 REGISTER_NEOML_LAYER( CMultiSquaredHingeLossLayer, "FmlCnnMultySquaredHingeLossLayer" )
 REGISTER_NEOML_LAYER( CPixelToImageLayer, "FmlCnnPixelToImageLayerClass" )
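
The REGISTER_NEOML_LAYER entries bind each class to the external name written to archives ("NeoMLDnnFavorAttentionPerformerLayer", "NeoMLDnnMultiheadAttentionPerformerLayer"), which is what lets a saved network containing the new layers be restored by name. A round-trip sketch following the usual NeoML serialization pattern (the file name is illustrative; treat the exact archive flags as an assumption to check against the NeoML docs):

// Sketch: storing a network that contains the newly registered layers
CArchiveFile file( "model.cnnarch", CArchive::store );
CArchive archive( &file, CArchive::SD_Storing );
dnn.Serialize( archive ); // layers are written under their registered external names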
