
Commit 3b1282b

[NeoML] Add MultiheadAttentionPerformerLayer

Signed-off-by: Kirill Golikov <[email protected]>
1 parent 7a71670 commit 3b1282b

11 files changed: +1272 -15 lines
NeoML/include/NeoML/Dnn/Layers/FavorAttentionPerformerLayer.h

Lines changed: 97 additions & 0 deletions
@@ -0,0 +1,97 @@
/* Copyright © 2023-2024 ABBYY

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
--------------------------------------------------------------------------------------------------------------*/

#pragma once

#include <NeoML/Dnn/Dnn.h>

namespace NeoML {

struct CFavorAttentionDesc;

// Computes FAVOR normalized self-attention.
// https://arxiv.org/pdf/2009.14794.pdf
//
// Inputs: query, key, value
// Emulates the equation: Output ~~ softmax( query * ( key )^T / normalizer ) * value
//
//                 output
//                   ^
//                   |
//           +---------------+
//           |  F A V O R    |  <-- projection matrix
//           |   Attention   |      (random features)
//           +---------------+
//             ^     ^     ^
//             |     |     |
//           query   key   value
//
class NEOML_API CFavorAttentionPerformerLayer : public CBaseLayer {
    NEOML_DNN_LAYER( CFavorAttentionPerformerLayer )
public:
    // Possible activation kernel transformations
    enum class TAKernel { SoftMax = 0, ReLU = 1 };
    // Layer inputs numeration
    enum TInput { TI_Q = 0, TI_K = 1, TI_V = 2 };
    // The method used to construct the random projection matrix Q
    enum class TRandomMaxrixStructMode {
        QMatrix,         // QR-factorization of a random 2D-tensor
        GivensRotations  // Givens random rotations
    };
    static constexpr TRandomMaxrixStructMode StructMode = TRandomMaxrixStructMode::GivensRotations;
    // For normalization of the random matrix Q use the sum of the norms of its rows, or just sqrt(dim)
    static constexpr bool Scaling = false;

    // Constructor
    CFavorAttentionPerformerLayer( IMathEngine& mathEngine, const char* name = nullptr );

    // The number of columns in the projection matrix, or 0 if it is not used
    // Set to 0 if the projection matrix should not be used
    int GetRandomFeaturesCount() const { return randomFeaturesCount; }
    void SetRandomFeaturesCount( int randomFeaturesCount );
    // The activation kernel transformation used
    int GetActivationKernel() const { return static_cast<int>( activation ); }
    void SetActivationKernel( int activation );
    // Whether auto-regressive (causal) attention is used
    bool GetCausal() const { return causal; }
    void SetCausal( bool causal );

    void Serialize( CArchive& archive ) override;

protected:
    ~CFavorAttentionPerformerLayer();

    // Creates the output blobs using the input blobs
    void Reshape() override;
    // One step of a forward pass
    void RunOnce() override;
    // One step of a backward pass
    void BackwardOnce() override;

private:
    // Number of random features to be used
    // For SoftMax it must be > 0, so that the random projection matrix is applied
    int randomFeaturesCount = 0;
    TAKernel activation = TAKernel::SoftMax; // Activation kernel type
    bool causal = false; // Auto-regressive attention or not
    CFavorAttentionDesc* desc = nullptr; // FAVOR attention description

    void destroyFavorAttentionDesc();
};

NEOML_API CLayerWrapper<CFavorAttentionPerformerLayer> FavorAttentionPerformer(
    int randomFeaturesCount, int activation, bool causal );

} // namespace NeoML
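
FAVOR+ avoids materializing the full attention matrix: the softmax kernel is approximated with random feature maps of the queries and keys, so the product above can be computed in time linear in the sequence length. Below is a minimal wiring sketch for the FavorAttentionPerformer wrapper declared above; it is not part of this commit, and the Source/Sink helpers, GetDefaultCpuMathEngine(), layer names, and sizes are assumptions for illustration.

// Minimal sketch, assuming the usual NeoML fluent API; not part of the diff.
IMathEngine& mathEngine = GetDefaultCpuMathEngine();
CRandom random( 42 );
CDnn dnn( random, mathEngine );

CBaseLayer* q = Source( dnn, "q" ); // query blob
CBaseLayer* k = Source( dnn, "k" ); // key blob
CBaseLayer* v = Source( dnn, "v" ); // value blob

// 64 random features, SoftMax kernel ( TAKernel::SoftMax == 0 ), non-causal attention
CBaseLayer* favor = FavorAttentionPerformer( /*randomFeaturesCount*/ 64,
    /*activation*/ 0, /*causal*/ false )( "favor", q, k, v );
Sink( favor, "output" );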
NeoML/include/NeoML/Dnn/Layers/MultiheadAttentionPerformerLayer.h

Lines changed: 101 additions & 0 deletions
@@ -0,0 +1,101 @@
/* Copyright © 2023-2024 ABBYY

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
--------------------------------------------------------------------------------------------------------------*/

#pragma once

#include <NeoML/Dnn/Dnn.h>
#include <NeoML/Dnn/Layers/CompositeLayer.h>

namespace NeoML {

// Multihead Self-Attention Performer
// https://arxiv.org/pdf/2009.14794.pdf
// Implementation of the multiheaded FAVOR-attention & FAVOR-self-attention layers.
//
// +----------------------+--------+-------------------------------------------------------
// | Parameter            | Type   | Description
// +----------------------+--------+-------------------------------------------------------
// | HiddenSize           | int    | size of the trainable matrices, output dim of the hidden layer
// | HeadCount            | int    | number of heads to repeat the same attention structure
// | OutputSize           | int    | size of the output
// | ActivationKernel     | int    | activation (ReLU or SoftMax) kernel transformation
// | RandomFeaturesCount  | int    | number of columns in the projection matrix, or 0 if it isn't used
// | Casual               | bool   | whether auto-regressive (causal) attention is used
// +----------------------+--------+-------------------------------------------------------
class NEOML_API CMultiheadAttentionPerformerLayer : public CCompositeLayer {
    NEOML_DNN_LAYER( CMultiheadAttentionPerformerLayer )
public:
    explicit CMultiheadAttentionPerformerLayer( IMathEngine& mathEngine );

    // Activation kernel type: SoftMax(=0), ReLU(=1)
    // SoftMax by default
    int GetActivationKernel() const { return activationKernel; }
    void SetActivationKernel( int activationKernel, int randomFeaturesCount, bool casual );
    int GetRandomFeaturesCount() const { return randomFeaturesCount; }
    bool GetCasual() const { return casual; }

    // The number of heads in the attention
    // GetHiddenSize() must be a multiple of this value
    // By default the attention consists of 1 head
    int GetHeadCount() const { return headCount; }
    void SetHeadCount( int headCount );

    // The size of the trainable matrices
    // Must be a multiple of GetHeadCount()
    int GetHiddenSize() const { return hiddenSize; }
    void SetHiddenSize( int hiddenSize );

    // The size of the output
    int GetOutputSize() const { return outputSize; }
    void SetOutputSize( int outputSize );

    void Serialize( CArchive& archive ) override;

    // Recreates the layer if forceRebuild is true or if it doesn't contain sublayers
    void Rebuild( bool forceRebuild );

protected:
    void Reshape() override;

private:
    // FAVOR+ attention settings
    int activationKernel;    // Activation kernel transformation
    int randomFeaturesCount; // Projection matrix size, if > 0
    bool casual;             // Auto-regressive attention or not

    // The number of heads
    int headCount;
    // The size of the trainable matrices
    int hiddenSize;
    // Output size
    int outputSize;

    // Layer inputs numeration
    enum TInputs { I_Q = 0, I_K = 1, I_V = 2 };

    bool isCreated() const { return HasLayer( "Q" ); }
    void create();

    CBaseLayer* multiplyInputByMatrixWeights( int size, const char* name, TInputs input );
    CBaseLayer* multiplyByMatrixWeights( CBaseLayer* input, int width );
    CBaseLayer* prepareQ( CBaseLayer* input );
    CBaseLayer* prepareKV( CBaseLayer* input, bool isK );
    CBaseLayer* prepareOutput( CBaseLayer* input );
};

NEOML_API CLayerWrapper<CMultiheadAttentionPerformerLayer> MultiheadAttentionPerformer(
    int headCount, int hiddenSize, int outputSize, int activationKernel, int randomFeaturesCount, bool casual );

} // namespace NeoML
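
Analogously, a rough usage sketch for the MultiheadAttentionPerformer wrapper declared above (not part of the commit; the dnn setup is the same as in the previous sketch, and all names and sizes are illustrative assumptions):

// Rough sketch: 4 heads, hiddenSize 64 (a multiple of the head count), output size 64,
// SoftMax kernel (0), 32 random features, non-causal attention.
CBaseLayer* q = Source( dnn, "inQ" );
CBaseLayer* k = Source( dnn, "inK" );
CBaseLayer* v = Source( dnn, "inV" );

CBaseLayer* attention = MultiheadAttentionPerformer( /*headCount*/ 4, /*hiddenSize*/ 64,
    /*outputSize*/ 64, /*activationKernel*/ 0, /*randomFeaturesCount*/ 32, /*casual*/ false )(
    "performer", q, k, v );
Sink( attention, "out" );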

NeoML/include/NeoML/NeoML.h

Lines changed: 2 additions & 0 deletions
@@ -116,6 +116,7 @@ limitations under the License.
 #include <NeoML/Dnn/Layers/DepthToSpaceLayer.h>
 #include <NeoML/Dnn/Layers/DotProductLayer.h>
 #include <NeoML/Dnn/Layers/EnumBinarizationLayer.h>
+#include <NeoML/Dnn/Layers/FavorAttentionPerformerLayer.h>
 #include <NeoML/Dnn/Layers/FocalLossLayer.h>
 #include <NeoML/Dnn/Layers/FullyConnectedSourceLayer.h>
 #include <NeoML/Dnn/Layers/GlobalMaxPoolingLayer.h>
@@ -131,6 +132,7 @@ limitations under the License.
 #include <NeoML/Dnn/Layers/LrnLayer.h>
 #include <NeoML/Dnn/Layers/MaxOverTimePoolingLayer.h>
 #include <NeoML/Dnn/Layers/ModelWrapperLayer.h>
+#include <NeoML/Dnn/Layers/MultiheadAttentionPerformerLayer.h>
 #include <NeoML/Dnn/Layers/MultiHingeLossLayer.h>
 #include <NeoML/Dnn/Layers/PositionalEmbeddingLayer.h>
 #include <NeoML/Dnn/Layers/PrecisionRecallLayer.h>

NeoML/src/CMakeLists.txt

Lines changed: 4 additions & 0 deletions
@@ -118,6 +118,7 @@ set(NeoML_SOURCES
     Dnn/Layers/DotProductLayer.cpp
     Dnn/Layers/EnumBinarizationLayer.cpp
     Dnn/Layers/FocalLossLayer.cpp
+    Dnn/Layers/FavorAttentionPerformerLayer.cpp
     Dnn/Layers/FullyConnectedSourceLayer.cpp
     Dnn/Layers/GlobalMaxPoolingLayer.cpp
     Dnn/Layers/GlobalSumPoolingLayer.cpp
@@ -133,6 +134,7 @@ set(NeoML_SOURCES
     Dnn/Layers/MaxOverTimePoolingLayer.cpp
     Dnn/Layers/MobileNetV3BlockLayer.cpp
     Dnn/Layers/ModelWrapperLayer.cpp
+    Dnn/Layers/MultiheadAttentionPerformerLayer.cpp
     Dnn/Layers/ObjectNormalizationLayer.cpp
     Dnn/Layers/Onnx/OnnxEltwiseLayer.cpp
     Dnn/Layers/Onnx/OnnxCastLayer.cpp
@@ -379,6 +381,7 @@ set(NeoML_HEADERS
     ../include/NeoML/Dnn/Layers/DotProductLayer.h
     ../include/NeoML/Dnn/Layers/EnumBinarizationLayer.h
     ../include/NeoML/Dnn/Layers/FocalLossLayer.h
+    ../include/NeoML/Dnn/Layers/FavorAttentionPerformerLayer.h
     ../include/NeoML/Dnn/Layers/FullyConnectedSourceLayer.h
     ../include/NeoML/Dnn/Layers/GlobalMaxPoolingLayer.h
     ../include/NeoML/Dnn/Layers/GlobalSumPoolingLayer.h
@@ -394,6 +397,7 @@ set(NeoML_HEADERS
     ../include/NeoML/Dnn/Layers/MaxOverTimePoolingLayer.h
     ../include/NeoML/Dnn/Layers/MobileNetV3BlockLayer.h
     ../include/NeoML/Dnn/Layers/ModelWrapperLayer.h
+    ../include/NeoML/Dnn/Layers/MultiheadAttentionPerformerLayer.h
     ../include/NeoML/Dnn/Layers/MultiHingeLossLayer.h
     ../include/NeoML/Dnn/Layers/ObjectNormalizationLayer.h
     ../include/NeoML/Dnn/Layers/Onnx/OnnxEltwiseLayer.h

NeoML/src/Dnn/Dnn.cpp

Lines changed: 5 additions & 1 deletion
@@ -1,4 +1,4 @@
-/* Copyright © 2017-2023 ABBYY
+/* Copyright © 2017-2024 ABBYY
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -73,6 +73,7 @@ limitations under the License.
 #include <NeoML/Dnn/Layers/DepthToSpaceLayer.h>
 #include <NeoML/Dnn/Layers/DotProductLayer.h>
 #include <NeoML/Dnn/Layers/EnumBinarizationLayer.h>
+#include <NeoML/Dnn/Layers/FavorAttentionPerformerLayer.h>
 #include <NeoML/Dnn/Layers/FocalLossLayer.h>
 #include <NeoML/Dnn/Layers/FullyConnectedSourceLayer.h>
 #include <NeoML/Dnn/Layers/GlobalMaxPoolingLayer.h>
@@ -89,6 +90,7 @@ limitations under the License.
 #include <NeoML/Dnn/Layers/MaxOverTimePoolingLayer.h>
 #include <NeoML/Dnn/Layers/MobileNetV3BlockLayer.h>
 #include <NeoML/Dnn/Layers/ModelWrapperLayer.h>
+#include <NeoML/Dnn/Layers/MultiheadAttentionPerformerLayer.h>
 #include <NeoML/Dnn/Layers/MultiHingeLossLayer.h>
 #include <NeoML/Dnn/Layers/PositionalEmbeddingLayer.h>
 #include <NeoML/Dnn/Layers/PrecisionRecallLayer.h>
@@ -351,6 +353,7 @@ REGISTER_NEOML_LAYER( CCtcDecodingLayer, "FmlCnnCtcDecodingLayer" )
 REGISTER_NEOML_LAYER( CCtcLossLayer, "FmlCnnCtcLossLayer" )
 REGISTER_NEOML_LAYER( CDotProductLayer, "FmlCnnDotProductLayer" )
 REGISTER_NEOML_LAYER( CEnumBinarizationLayer, "FmlCnnEnumBinarizationLayer" )
+REGISTER_NEOML_LAYER( CFavorAttentionPerformerLayer, "NeoMLDnnFavorAttentionPerformerLayer" )
 REGISTER_NEOML_LAYER( CGlobalMaxPoolingLayer, "FmlCnnGlobalMaxPoolingLayer" )
 REGISTER_NEOML_LAYER( CGrnLayer, "NeoMLDnnGrnLayer" )
 REGISTER_NEOML_LAYER( CGruLayer, "FmlCnnGruLayer" )
@@ -362,6 +365,7 @@ REGISTER_NEOML_LAYER( CLoraFullyConnectedLayer, "NeoMLDnnLoraFullyConnectedLayer
 REGISTER_NEOML_LAYER( CMaxOverTimePoolingLayer, "FmlCnnMaxOverTimePoolingLayer" )
 REGISTER_NEOML_LAYER( CMobileNetV3PreSEBlockLayer, "NeoMLDnnMobileNetV3PreSEBlockLayer" )
 REGISTER_NEOML_LAYER( CMobileNetV3PostSEBlockLayer, "NeoMLDnnMobileNetV3PostSEBlockLayer" )
+REGISTER_NEOML_LAYER( CMultiheadAttentionPerformerLayer, "NeoMLDnnMultiheadAttentionPerformerLayer" )
 REGISTER_NEOML_LAYER( CMultiHingeLossLayer, "FmlCnnMultyHingeLossLayer" )
 REGISTER_NEOML_LAYER( CMultiSquaredHingeLossLayer, "FmlCnnMultySquaredHingeLossLayer" )
 REGISTER_NEOML_LAYER( CPixelToImageLayer, "FmlCnnPixelToImageLayerClass" )
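
For context: REGISTER_NEOML_LAYER binds each class to the external name under which it is stored in archives, so a network containing the new layers can be serialized and later restored by name. A rough round-trip sketch follows, assuming the standard CArchiveFile/CArchive serialization flow; the file name and the `dnn` network built in the earlier sketches are illustrative.

// Rough sketch, assuming the usual NeoML serialization flow; not part of the diff.
{
    CArchiveFile file( "performer_net.archive", CArchive::store );
    CArchive archive( &file, CArchive::store );
    dnn.Serialize( archive ); // layers are written under their registered names
}
{
    CArchiveFile file( "performer_net.archive", CArchive::load );
    CArchive archive( &file, CArchive::load );
    dnn.Serialize( archive ); // the registered creators rebuild the layers from those names
}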
