[CudaMathEngine] CUBLAS_POINTER_MODE_DEVICE allows device pointers only

favorart · favorart · commit 8eb327234e67 · 2024-08-29T09:53:31.000+02:00
Signed-off-by: Kirill Golikov <kirill.golikov@abbyy.com>
diff --git a/NeoMathEngine/src/GPU/CUDA/CudaMathEngineCublas.cu b/NeoMathEngine/src/GPU/CUDA/CudaMathEngineCublas.cu
@@ -41,25 +41,6 @@ void CCudaMathEngine::VectorDotProduct(const CConstFloatHandle& firstHandle, con
 		GetRaw( secondHandle ), 1, GetRaw( resultHandle ) ) );
 }
 
-void CCudaMathEngine::VectorMultiplyAndAdd( const CConstFloatHandle& firstHandle, const CConstFloatHandle& secondHandle,
-	const CFloatHandle& resultHandle, int vectorSize, CFloatParam multParam )
-{
-	ASSERT_EXPR( firstHandle.GetMathEngine() == this );
-	ASSERT_EXPR( secondHandle.GetMathEngine() == this );
-	ASSERT_EXPR( resultHandle.GetMathEngine() == this );
-	SetCudaDevice( device->DeviceNumber );
-
-	const float* const first = GetRaw( firstHandle );
-	const float* const second = GetRaw( secondHandle );
-	float* const result = GetRaw( resultHandle );
-	// cublasSaxpy allows (host or device) pointer
-	const float* mult = multParam.Handle.IsNull() ? &multParam.Value : GetRaw( multParam.Handle );
-
-	if( result != first ) {
-		ASSERT_CUDA( cudaMemcpy( result, first, vectorSize * sizeof( float ), cudaMemcpyDeviceToDevice ) );
-	}
-	ASSERT_CUBLAS( cublas->Saxpy( cublasHandle, vectorSize, mult, second, 1, result, 1 ) );
-}
 
 void CCudaMathEngine::MultiplyMatrixByTransposedMatrix( const CConstFloatHandle& firstHandle, int firstHeight,
 	int firstWidth, int firstRowSize, const CConstFloatHandle& secondHandle, int secondHeight, int secondRowSize,
diff --git a/NeoMathEngine/src/GPU/CUDA/CudaMathEngineVectorMath.cu b/NeoMathEngine/src/GPU/CUDA/CudaMathEngineVectorMath.cu
@@ -951,6 +951,22 @@ void CCudaMathEngine::VectorSub(float first, const CConstFloatHandle& secondHand
 		( first, GetRaw( secondHandle ), GetRaw(resultHandle), vectorSize);
 }
 
+void CCudaMathEngine::VectorMultiplyAndAdd(const CConstFloatHandle& firstHandle, const CConstFloatHandle& secondHandle,
+	const CFloatHandle& resultHandle, int vectorSize, CFloatParam mult)
+{ // Host wrapper: launches VectorMultiplyAndAddKernel, which writes result[i] = first[i] + mult * second[i].
+	ASSERT_EXPR(firstHandle.GetMathEngine() == this); // all three handles must belong to this math engine
+	ASSERT_EXPR(secondHandle.GetMathEngine() == this);
+	ASSERT_EXPR(resultHandle.GetMathEngine() == this);
+	SetCudaDevice(device->DeviceNumber); // presumably makes this engine's device current before the launch - confirm in SetCudaDevice
+
+	int blockCount = 0;
+	int threadCount = 0;
+	getCudaTaskGrid(blockCount, threadCount, vectorSize); // expected to fill blockCount/threadCount for vectorSize elements (helper defined elsewhere)
+
+	VectorMultiplyAndAddKernel<<<blockCount, threadCount>>> // mult is handed over as CFloatParam; the kernel parameter is CCudaScalarParameter<float> (conversion declared elsewhere)
+		( GetRaw(firstHandle), GetRaw(secondHandle), GetRaw(resultHandle), vectorSize, mult ); // NOTE(review): launch status is not checked here (no cudaGetLastError), same as the VectorSub launch above
+}
+
 void CCudaMathEngine::VectorMultiplyAndSub(const CConstFloatHandle& firstHandle, const CConstFloatHandle& secondHandle,
 	const CFloatHandle& resultHandle, int vectorSize, CFloatParam mult)
 {
diff --git a/NeoMathEngine/src/GPU/CUDA/Kernels/CudaVectorMathKernels.h b/NeoMathEngine/src/GPU/CUDA/Kernels/CudaVectorMathKernels.h
@@ -979,6 +979,16 @@ __global__ void VectorSubKernel( float first, const float* __restrict__ second,
 	}
 }
 
+// MultiplyAndAdd: result[index] = first[index] + mult * second[index], one element per thread for which GetCudaTaskIndex returns true
+__global__ void VectorMultiplyAndAddKernel( const float* __restrict__ first,
+	const float* __restrict__ second, float* result, int count, CCudaScalarParameter<float> mult ) // result is not __restrict__, unlike first/second - presumably so it may alias an input (in-place use); confirm
+{
+	int index = 0; // passed to GetCudaTaskIndex, then used as the element offset
+	if( GetCudaTaskIndex( count, index ) ) { // helper defined elsewhere; presumably false for threads past count - confirm
+		result[index] = first[index] + mult * second[index]; // mult is used as a float operand via CCudaScalarParameter<float> (type declared elsewhere)
+	}
+}
+
 // MultiplyAndSub
 __global__ void VectorMultiplyAndSubKernel(const float* __restrict__ first,
 	const float* __restrict__ second, float* result, int count, CCudaScalarParameter<float> mult)
diff --git a/NeoMathEngine/test/src/inference/VectorMultiplyAndAddTest.cpp b/NeoMathEngine/test/src/inference/VectorMultiplyAndAddTest.cpp
@@ -1,4 +1,4 @@
-/* Copyright © 2017-2020 ABBYY Production LLC
+/* Copyright © 2017-2024 ABBYY
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -38,6 +38,19 @@ static void vectorMultiplyAndAddImpl( const CTestParams& params, int seed )
 		float expected = a[i] + mult * b[i];
 		ASSERT_NEAR( expected, result[i], 1e-3 );
 	}
+
+	{
+		auto resultWrapper = CARRAY_FLOAT_WRAPPER( result );
+		{
+			float multTemp = mult;
+			MathEngine().VectorMultiplyAndAdd( CARRAY_FLOAT_WRAPPER( a ), CARRAY_FLOAT_WRAPPER( b ), resultWrapper, vectorSize, multTemp );
+		}
+	}
+
+	for( int i = 0; i < vectorSize; i++ ) {
+		float expected = a[i] + mult * b[i];
+		ASSERT_NEAR( expected, result[i], 1e-3 );
+	}
 }
 
 //------------------------------------------------------------------------------------------------------------