[NeoML] Remove excess CUDA syncs in layers #1070

Open

wants to merge 26 commits into base: master
Commits (26)
6759eb9
[NeoML] Layers mem-optimize
favorart Jun 7, 2024
1a3c3d8
[NeoMathEngine] Vector operations with float and int arguments
favorart Jun 26, 2024
b673324
[VulkanMathEngine] const CMemoryHandle arrays
favorart Aug 30, 2024
65043aa
[VulkanMathEngine] Unite CFloatHandleStackVar
favorart Sep 3, 2024
7558ae0
[VulkanMathEngine] Get handles for stack vars
favorart Sep 3, 2024
7eade33
[NeoML] remove excess CUDA syncs: RowwiseCh, MobileNetV2, MobileNetV3
favorart Jun 26, 2024
6ed2903
[NeoML] remove excess CUDA syncs: CPrecisionRecallLayer
favorart Jun 26, 2024
29de7f9
[NeoML] remove excess CUDA syncs: FocalLossLayer
favorart Jun 26, 2024
cf969a5
[NeoML] remove excess CUDA syncs: BinaryFocalLossLayer
favorart Jun 26, 2024
3e3b26f
[NeoML] remove excess CUDA syncs: CrossEntropyLossLayer
favorart Jun 26, 2024
4383822
[NeoML] remove excess CUDA syncs: BinaryCrossEntropyLayer
favorart Jun 26, 2024
40f5d9b
[NeoML] remove excess CUDA syncs: CenterLossLayer
favorart Jun 26, 2024
bbe319e
[NeoML] remove excess CUDA syncs: CCtcLossLayer
favorart Jun 26, 2024
bb8c1cd
[NeoML] remove excess CUDA syncs: CLossLayer
favorart Jun 26, 2024
62aba8c
[NeoML] remove excess CUDA syncs: AutoDiffFunctions
favorart Jun 26, 2024
fbf13c4
[NeoML] remove excess CUDA syncs: LoraFullyConnectedLayer
favorart Jun 26, 2024
a87f094
[NeoML] remove excess CUDA syncs: MultichannelLookupLayer
favorart Jun 26, 2024
3d02c0f
[NeoML] remove excess CUDA syncs: ActivationLayers
favorart Jun 26, 2024
dbce51e
[NeoML] remove excess CUDA syncs: BatchNormalizationLayer
favorart Jun 26, 2024
5811de4
[NeoML] Express old vector operations with operations of new arguments
favorart Jun 6, 2024
8b5596b
[NeoML] remove excess CUDA syncs: other layers
favorart Jun 26, 2024
1c9828a
[NeoML] remove excess CUDA syncs: DnnSolver
favorart May 22, 2024
bb78241
[NeoML] CUDA sync in DnnSolver::clipGradients
favorart Sep 4, 2024
847fbc7
[NeoMathEngine] CPU arm64 fix compilation
favorart Jul 1, 2024
06e4d2e
[CudaMathEngine] CUBLAS_POINTER_MODE_DEVICE allows device pointers only
favorart Jul 5, 2024
c2fd9cc
[MetalMathEngine] Add CScalarParameter
favorart Sep 12, 2024
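
Most of the "remove excess CUDA syncs" commits above follow one pattern, inferred here from the commit messages (notably "[NeoMathEngine] Vector operations with float and int arguments"): scalar parameters that used to be staged in device memory, which forces the CUDA backend to synchronize on a host-to-device copy, are instead passed to vector operations as plain float/int arguments. The sketch below is illustrative only; CScalarHandle, CMathEngineSketch and the VectorMultiply overloads are stand-ins, not the actual NeoMathEngine API.

```cpp
// Minimal self-contained sketch of the overload pattern (hypothetical types,
// not the NeoML API). "Before": the scalar is first written into a device-side
// handle (in a real CUDA backend this is a host-to-device copy plus a sync).
// "After": the overload takes the scalar by value, so no staging variable and
// no extra synchronization is needed per call.
#include <cstdio>

struct CScalarHandle {                 // stand-in for a device-resident scalar
	float deviceValue = 0.f;
	void SetValue( float v ) { deviceValue = v; } // real backend: copy + sync
};

struct CMathEngineSketch {
	// old-style overload: scalar passed by device handle
	void VectorMultiply( float* data, int size, const CScalarHandle& mult ) const
	{
		for( int i = 0; i < size; ++i ) { data[i] *= mult.deviceValue; }
	}
	// new-style overload: scalar passed by value
	void VectorMultiply( float* data, int size, float mult ) const
	{
		for( int i = 0; i < size; ++i ) { data[i] *= mult; }
	}
};

int main()
{
	CMathEngineSketch me;
	float data[4] = { 1.f, 2.f, 3.f, 4.f };

	CScalarHandle factor;              // before: fill a device scalar per call
	factor.SetValue( 0.5f );
	me.VectorMultiply( data, 4, factor );

	me.VectorMultiply( data, 4, 0.5f ); // after: one call, no device staging

	std::printf( "%g %g %g %g\n", data[0], data[1], data[2], data[3] );
	return 0;
}
```

The same idea is visible in the DnnSolver.h diff below, where the per-solver TTempVariable enums and tempVariables blobs are dropped in favor of plain float members such as learningRate and maxGradientNorm.
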
2 changes: 1 addition & 1 deletion Build/build.sh
@@ -9,7 +9,7 @@ CMAKE_WORKING_DIR=$ROOT/_cmake_working_dir/NeoML.${FINE_CMAKE_BUILD_TARGET}.${FI
pushd ${CMAKE_WORKING_DIR}

if [[ $FINE_CMAKE_BUILD_TARGET == "IOS" ]]; then
cmake -G Xcode -DUSE_FINE_OBJECTS=ON -DCMAKE_TOOLCHAIN_FILE=${ROOT}/NeoML/cmake/ios.toolchain.cmake -DIOS_ARCH=${FINE_CMAKE_BUILD_ARCH} ${ROOT}/NeoML/NeoML
cmake -G Xcode -DUSE_FINE_OBJECTS=ON -DCMAKE_TOOLCHAIN_FILE=${ROOT}/NeoML/cmake/ios.toolchain.cmake -DIOS_ARCH=${FINE_CMAKE_BUILD_ARCH} ${ROOT}/NeoML/NeoML -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_CONFIG}
elif [[ $FINE_CMAKE_BUILD_TARGET == "Linux" && $FINE_CMAKE_BUILD_ARCH == "x86" ]]; then
cmake -DUSE_FINE_OBJECTS=ON -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_CONFIG} -DCMAKE_CXX_FLAGS=-m32 -DCMAKE_C_FLAGS=-m32 ${ROOT}/NeoML/NeoML
elif [[ $FINE_CMAKE_BUILD_TARGET == "Linux" ]]; then
123 changes: 32 additions & 91 deletions NeoML/include/NeoML/Dnn/DnnSolver.h
@@ -28,14 +28,13 @@ class CDnn;
class NEOML_API CDnnSolver : virtual public IObject {
public:
// Stores the calculated values of layer parameters gradients for further use in Train method
// forSharedWeightsLayer=true should only be used within layers that share weights with other layers.
void AddDiff( const CBaseLayer* layer, const CObjectArray<CDnnBlob>& paramDiffBlobs,
// sharedWeights=true should only be used within layers that share weights with other layers.
void AddDiff( const CBaseLayer* layer, const CObjectArray<CDnnBlob>& paramDiffBlobs,
bool sharedWeights = false );

// Modifies the trainable parameters of the network layers,
// using the accumulated gradients and previous steps' history (moment, etc.)
void Train( float distributedCoeff = 1.f );

// Resets to the initial state
void Reset();

@@ -62,11 +61,17 @@ class NEOML_API CDnnSolver : virtual public IObject {

// Gets the reference to the math engine
IMathEngine& MathEngine() const { return mathEngine; }
// Get the intermediate result storing blob
const CDnnBlob& TempBlob() const { return *temporaryBlob; }
// Intermediate result storing blob
// hide it to private, its allocated size may > actual
CFloatHandle TempData();
// Reinitialize the intermediate result storing blob
bool ReInitTempBlob( int dataSize );

// Called once on Reset method call
// Resets the stats in the inheriting instances to the initial state
virtual void OnReset() {}

// On each training step the method is called once, before the call to TrainLayer for all layers
virtual void OnTrain() {}

@@ -78,13 +83,20 @@ class NEOML_API CDnnSolver : virtual public IObject {

private:
IMathEngine& mathEngine;
CPtr<CDnnBlob> gradParams;

// MathEngine memory stored variables for calculations
float learningRate;
float regularizationL2;
float regularizationL1;
float maxGradientNorm;
float clipGradientMin;
float clipGradientMax;

// Intermediate result storing
// hide it to private, its allocated size may > actual
CPtr<CDnnBlob> temporaryBlob;

// The blobs sum
struct CDiffBlobSum final {
const CBaseLayer* LayerOwner{}; // for the given layer
@@ -141,7 +153,7 @@ void NEOML_API SerializeSolver( CArchive& archive, CDnn& dnn, CPtr<CDnnSolver>&
//---------------------------------------------------------------------------------------------------------------------

template<class T>
class CSolverClassRegistrar {
class CSolverClassRegistrar final {
public:
explicit CSolverClassRegistrar( const char* solverName );
~CSolverClassRegistrar();
@@ -168,40 +180,27 @@ inline CSolverClassRegistrar<T>::~CSolverClassRegistrar()
class NEOML_API CDnnSimpleGradientSolver : public CDnnSolver {
NEOML_DNN_SOLVER( CDnnSimpleGradientSolver )
public:
CDnnSimpleGradientSolver( IMathEngine& mathEngine );
explicit CDnnSimpleGradientSolver( IMathEngine& mathEngine );

// Moment decay rate (moment is a weighted sum of previous gradients)
float GetMomentDecayRate() const { return momentDecayRate; }
void SetMomentDecayRate(float decayRate) { momentDecayRate = decayRate; }

// Backward compatibility mode
bool IsInCompatibilityMode() const { return isInCompatibilityMode; }
void SetCompatibilityMode( bool compatibilityMode ) { isInCompatibilityMode = compatibilityMode; }

void Serialize( CArchive& archive, const CDnn& dnn ) override;

protected:
// Updates the trainable weights of the layer
void TrainLayer( const CBaseLayer* layer, const CObjectArray<CDnnBlob>& paramBlobs,
const CObjectArray<CDnnBlob>& paramDiffBlobs, CObjectArray<CDnnBlob>& gradientHistory ) override;

private:
// Moment decay rate (moment is a weighted sum of previous gradients)
float momentDecayRate;

// Backward compatibility mode
bool isInCompatibilityMode;

// Temporary variables of Handle type, used for calculations
enum TTempVariable {
TV_MomentDecayRateVar = 0,
TV_OpMomentDecayRateVar,
TV_OpRegL2MomentDecayRateVar,
TV_RateVar,
TV_L1Threshold,
TV_L1Mult,
TV_Count
};

CPtr<CDnnBlob> tempVariables;
};

//---------------------------------------------------------------------------------------------------------------------
Expand All @@ -210,7 +209,7 @@ class NEOML_API CDnnSimpleGradientSolver : public CDnnSolver {
class NEOML_API CDnnAdaptiveGradientSolver : public CDnnSolver {
NEOML_DNN_SOLVER( CDnnAdaptiveGradientSolver )
public:
CDnnAdaptiveGradientSolver( IMathEngine& mathEngine );
explicit CDnnAdaptiveGradientSolver( IMathEngine& mathEngine );

// Retrieves and sets the moment decay rate (moment is a weighted sum of previous gradients)
float GetMomentDecayRate() const { return momentDecayRate; }
@@ -222,7 +221,7 @@ class NEOML_API CDnnAdaptiveGradientSolver : public CDnnSolver {
// Retrieves and sets the epsilon used to avoid division by zero when calculating second moment
float GetEpsilon() const { return epsilon; }
void SetEpsilon( float newEpsilon ) { epsilon = newEpsilon; }

// Backward compatibility mode
bool IsInCompatibilityMode() const { return isInCompatibilityMode; }
void SetCompatibilityMode( bool compatibilityMode ) { isInCompatibilityMode = compatibilityMode; }

@@ -249,7 +248,7 @@ class NEOML_API CDnnAdaptiveGradientSolver : public CDnnSolver {
// Prepares for the next training step
void OnTrain() override;
// Updates the trainable weights of the layer
virtual void TrainLayer( const CBaseLayer* layer, const CObjectArray<CDnnBlob>& paramBlobs,
void TrainLayer( const CBaseLayer* layer, const CObjectArray<CDnnBlob>& paramBlobs,
const CObjectArray<CDnnBlob>& paramDiffBlobs, CObjectArray<CDnnBlob>& gradientHistory ) override;

private:
@@ -284,27 +283,8 @@ class NEOML_API CDnnAdaptiveGradientSolver : public CDnnSolver {
bool isAmsGradEnabled;
// Perform weight decay after calculating the moving averages
bool isDecoupledWeightDecay;

// Backward compatibility mode
bool isInCompatibilityMode;

enum TTempVariable {
TV_MomentDecayRateVar = 0,
TV_SecondMomentDecayRateVar,
TV_RegL2Var,
TV_OpMomentDecayRateVar,
TV_OpSecondMomentDecayRateVar,
TV_RateVar,
TV_L1Threshold,
TV_L1Mult,
TV_EpsilonVar,
TV_Count
};

// Temporary Handle variables for calculations
CPtr<CDnnBlob> tempVariables;

CPtr<CDnnBlob> temporaryBlob;
};

//---------------------------------------------------------------------------------------------------------------------
Expand Down Expand Up @@ -389,26 +369,6 @@ class NEOML_API CDnnNesterovGradientSolver : public CDnnSolver {
float muTPlusOne; // the mu coefficient for the next step
float productMuT; // the product of mu coefficient over all steps including the current one

enum TTempVariable {
TV_MomentDecayRateVar = 0,
TV_SecondMomentDecayRateVar,
TV_RegL2Var,
TV_OpMomentDecayRateVar,
TV_OpSecondMomentDecayRateVar,
TV_RateVar,
TV_L1Threshold,
TV_L1Mult,
TV_EpsilonVar,
TV_InvOpSecondMomentDecayRateNVar, // 1 / (1 - secondMomentDecay ^ N)
TV_MBarGradMultVar, // the gradient coefficient in the total sum
TV_MBarMomentMultVar, // the moment coefficient in the total sum
TV_Count
};

// Temporary blobs for calculations
CPtr<CDnnBlob> tempVariables;

CPtr<CDnnBlob> temporaryBlob;
// m with a stroke (from the paper referred to)
// It is a weighted sum of the gradient and the first moment
CPtr<CDnnBlob> mBarBlob;
@@ -492,11 +452,12 @@ class NEOML_API CDnnLambGradientSolver : public CDnnSolver {
void Serialize( CArchive& archive, const CDnn& dnn ) override;

protected:
// Prepares for the next training step
void OnTrain() override;
// Updates the trainable weights of the layer
void TrainLayer( const CBaseLayer* layer, const CObjectArray<CDnnBlob>& paramBlobs,
const CObjectArray<CDnnBlob>& paramDiffBlobs, CObjectArray<CDnnBlob>& gradientHistory ) override;

void OnTrain() override;

private:
// The gradientHistory array stores the previous values of gradients of different types
enum TGradientHistoryType {
@@ -519,48 +480,28 @@ class NEOML_API CDnnLambGradientSolver : public CDnnSolver {
// Is NVLamb modification used
bool useNvLamb;

enum TTempVariable {
TV_MomentDecayRateVar,
TV_SecondMomentDecayRateVar,
TV_OpMomentDecayRateVar,
TV_OpSecondMomentDecayRateVar,
TV_RateVar,
TV_EpsilonVar,
TV_WeightDecayVar,
TV_ClipMultiplierVar,
TV_LayerNormVar,
TV_TrustRatioVar,
TV_L2NormVar,

TV_Count
};

CPtr<CDnnBlob> tempVariables;

CPtr<CDnnBlob> tempBlob;

CPtr<CDnnBlob> normL2Var;
CArray<float> layersGradientNormSquare;
float totalGradientNorm;

// Layer excluded from optimization
struct CExcludedLayer {
struct CExcludedLayer final {
// Layer name (or substring)
CString LayerName;
// Match type (exact or substring)
TExcludeLayerNameMatchType MatchType;
TExcludeLayerNameMatchType MatchType{ ELNMT_Exact };
// Parameter number
// -1 if all parameters
int ParamIndex;

CExcludedLayer() : MatchType( ELNMT_Exact ), ParamIndex( NotFound ) {}
int ParamIndex{ NotFound };
};
// Layers excluded from weight decay
CArray<CExcludedLayer> excludedLayers;
mutable CPtr<CDnnBlob> tempNormBlob;

float calcL2NormAverage( const CConstFloatHandle& data, int dataSize ) const;
void getWeightDecayIndices( const CBaseLayer& layer, int paramsCount, CHashTable<int>& indexes ) const;

void calcNormalizeMultiplier( const CDnnBlob& weights, const CDnnBlob& update, const CFloatHandle& multiplier ) const;
float calcNormalizeMultiplier( const CDnnBlob& weights, const CDnnBlob& update ) const;
};

template<typename TLayer>
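
The reworked header also keeps a single temporaryBlob in the base CDnnSolver, exposed through TempData() and ReInitTempBlob(). Judging only from the comments in the diff ("its allocated size may > actual"), the intent appears to be a scratch buffer that is reused across training steps and only grows when a larger size is requested. Below is a minimal self-contained sketch of that idea; the class and member names are hypothetical, and the meaning of the bool return value (whether a reallocation happened) is an assumption, not taken from the implementation.

```cpp
// Hypothetical sketch of a grow-only scratch buffer, loosely mirroring
// TempData()/ReInitTempBlob(): repeated smaller requests reuse the existing
// allocation, so no fresh allocation (and, on CUDA, no associated sync) is
// triggered on every training step.
#include <cstddef>
#include <vector>

class CScratchBuffer {
public:
	// Ensure at least dataSize elements are available.
	// Returns true only if a new allocation was made (assumed semantics).
	bool ReInit( std::size_t dataSize )
	{
		if( dataSize <= buffer.size() ) {
			return false;               // reuse: allocated size may exceed the actual need
		}
		buffer.assign( dataSize, 0.f ); // grow once; later smaller requests reuse it
		return true;
	}
	float* Data() { return buffer.data(); } // analogous to TempData()

private:
	std::vector<float> buffer;
};

int main()
{
	CScratchBuffer scratch;
	const bool grewFirst = scratch.ReInit( 1024 ); // allocates
	const bool grewSecond = scratch.ReInit( 256 ); // reuses the larger buffer
	return ( grewFirst && !grewSecond ) ? 0 : 1;
}
```
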