Commit 999b6fd

Update on "[ET-VK] Simplify conv2d weight prepacking (>2x pipeline-creation speedup)"
ssjia has previously written two implementations of convolution weight prepacking for CPU (before and after [PyTorch PR #84973](pytorch/pytorch#84973)). Originally, I translated the second implementation to GPU since it is more readable. Now, I have translated the first implementation to GPU and switched to it, since it requires fewer steps. The second implementation was so complex that during model load it took >1500ms to create pipelines. In the test plan's Before, the example sums to 1905ms:

```
[334ms] P::encode-conv2d_prepack_weights_float, (16, 4, 1)
[110ms] P::encode-conv2d_dw_prepack_weights_float, (16, 4, 1)
[270ms] P::encode-conv2d_prepack_weights_float, (8, 8, 1)
[94ms] P::encode-conv2d_dw_prepack_weights_float, (8, 8, 1)
[609ms] P::encode-conv_transpose2d_prepack_weights_float, (8, 8, 1)
[488ms] P::encode-conv_transpose2d_prepack_weights_float, (16, 4, 1)
```

The first implementation now takes <700ms to create pipelines. In the test plan's After, the example sums to 598ms:

```
[135ms] P::encode-conv2d_prepack_weights_float, (16, 4, 1)
[83ms] P::encode-conv2d_dw_prepack_weights_float, (16, 4, 1)
[102ms] P::encode-conv2d_prepack_weights_float, (8, 8, 1)
[69ms] P::encode-conv2d_dw_prepack_weights_float, (8, 8, 1)
[115ms] P::encode-conv_transpose2d_prepack_weights_float, (8, 8, 1)
[94ms] P::encode-conv_transpose2d_prepack_weights_float, (16, 4, 1)
```

Differential Revision: [D56617129](https://our.internmc.facebook.com/intern/diff/D56617129/)

[ghstack-poisoned]
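A quick arithmetic check (plain Python, just summing the bracketed per-shader timings quoted above) confirms the stated Before/After totals:

```python
# Per-shader pipeline-creation timings (ms) quoted in the commit message.
before_ms = [334, 110, 270, 94, 609, 488]  # Before: conv2d, dw, and transpose variants
after_ms = [135, 83, 102, 69, 115, 94]     # After

print(sum(before_ms), sum(after_ms))  # 1905 598
```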
2 parents 5d00a17 + 613871f commit 999b6fd

File tree

5 files changed: +10 / -11 lines


backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_prepack_weights.glsl (2 additions, 2 deletions)

```diff
@@ -70,7 +70,7 @@ void main() {
   const ivec4 w = p0 % W;
 
   // Map modified tensor_idx to modifed buffer_i
-  // Zero modified tensor idx that are out of bounds
+  // Zero out if modified tensor idx is out of bounds
   const ivec4 buf_i = n * C*H*W + h * W + w;
   const bvec4 mask = bvec4(lessThan(n, ivec4(N)));
 
@@ -84,7 +84,7 @@ void main() {
   if (mask.z) {
     texel.z = SCALAR_T(buffer_in[buf_i.z]);
   }
-  if (mask.w ) {
+  if (mask.w) {
     texel.w = SCALAR_T(buffer_in[buf_i.w]);
   }
```

backends/vulkan/runtime/graph/ops/glsl/conv2d_prepack_weights.glsl (2 additions, 2 deletions)

```diff
@@ -74,7 +74,7 @@ void main() {
   const ivec4 w = p1 % W;
 
   // Map modified tensor_idx to modified buffer_i
-  // Zero modified tensor idx that are out of bounds
+  // Zero out if modified tensor idx is out of bounds
   const ivec4 buf_i = n * C*H*W + c * H*W + h * W + w;
   const bvec4 mask = bvec4(ivec4(lessThan(n, ivec4(N))) & ivec4(lessThan(c, ivec4(C))));
 
@@ -88,7 +88,7 @@ void main() {
   if (mask.z) {
     texel.z = SCALAR_T(buffer_in[buf_i.z]);
   }
-  if (mask.w ) {
+  if (mask.w) {
     texel.w = SCALAR_T(buffer_in[buf_i.w]);
   }
```
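The bounds-masking pattern shared by these prepacking shaders can be sketched outside GLSL. The following is a minimal Python illustration (not the shader itself; `pack_texel` is a hypothetical name): each texel packs four lanes, and a lane is loaded from the flattened NCHW buffer only when its `n`/`c` indices are in bounds, mirroring the `bvec4 mask` in the diffs above.

```python
# Illustrative sketch of the shader's per-texel masking, assuming flattened
# NCHW input. Lanes whose n/c indices fall outside (N, C) stay zero, mirroring
# `mask = lessThan(n, N) & lessThan(c, C)` in the GLSL above.
def pack_texel(buffer_in, n, c, h, w, N, C, H, W):
    texel = [0.0, 0.0, 0.0, 0.0]
    for lane in range(4):
        if n[lane] < N and c[lane] < C:  # the bvec4 mask
            # Same index math as `buf_i = n * C*H*W + c * H*W + h * W + w`
            buf_i = n[lane] * C * H * W + c[lane] * H * W + h[lane] * W + w[lane]
            texel[lane] = buffer_in[buf_i]
    return texel
```

For example, packing channels 0..3 of a tensor with only C=2 channels fills the first two lanes from the buffer and leaves the last two zeroed.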

backends/vulkan/runtime/graph/ops/glsl/conv_transpose2d_prepack_weights.glsl (2 additions, 2 deletions)

```diff
@@ -73,7 +73,7 @@ void main() {
   const ivec4 w = W-1 - p1 % W;
 
   // Map modified tensor_idx to modifed buffer_i
-  // Zero modified tensor idx that are out of bounds
+  // Zero out if modified tensor idx is out of bounds
   const ivec4 buf_i = n * C*H*W + c * H*W + h * W + w;
   const bvec4 mask = bvec4(ivec4(lessThan(n, ivec4(N))) & ivec4(lessThan(c, ivec4(C))));
 
@@ -87,7 +87,7 @@ void main() {
   if (mask.z) {
     texel.z = SCALAR_T(buffer_in[buf_i.z]);
   }
-  if (mask.w ) {
+  if (mask.w) {
     texel.w = SCALAR_T(buffer_in[buf_i.w]);
   }
```

examples/cadence/ops/functions.yaml (1 addition, 1 deletion)

```diff
@@ -60,7 +60,7 @@
     - arg_meta: null
       kernel_name: impl::HiFi::quantized_layer_norm_out
 
-- func: cadence::quantized_linear.out(Tensor src, Tensor weight, Tensor bias, float src_scale, int src_zero_point, float weight_scale, int weight_zero_point, Tensor out_multiplier, Tensor out_shift, int out_zero_point, *, Tensor(a!) out) -> Tensor(a!)
+- func: cadence::quantized_linear.out(Tensor src, Tensor weight, Tensor bias, int src_zero_point, Tensor weight_zero_point, Tensor out_multiplier, Tensor out_shift, int out_zero_point, Tensor? offset, *, Tensor(a!) out) -> Tensor(a!)
   kernels:
     - arg_meta: null
      kernel_name: impl::HiFi::quantized_linear_out
```

examples/cadence/ops/quantized_linear_out.cpp (3 additions, 4 deletions)

```diff
@@ -24,13 +24,12 @@ void quantized_linear_out(
     const Tensor& src,
     const Tensor& weight,
     const Tensor& bias,
-    double src_scale,
     int64_t src_zero_point,
-    double weight_scale,
-    int64_t weight_zero_point,
+    const Tensor& weight_zero_point,
     const Tensor& out_multiplier,
     const Tensor& out_shift,
     int64_t out_zero_point,
+    const exec_aten::optional<Tensor>& offset,
     Tensor& out) {
   // input comes in shape [leading_dims, in_dim]
   // weight comes in shape [out_dim, in_dim]
@@ -58,7 +57,7 @@ void quantized_linear_out(
       in_dim, // vec_offset of p_mat2.
       out_dim, // out_offset, i.e., offset of next output element written
       1, // out_stride, i.e., stride to go to next output row
-      -weight_zero_point, // mat1_zero_bias
+      -weight_zero_point.const_data_ptr<int32_t>()[0], // mat1_zero_bias
      -src_zero_point, // mat2_zero_bias
      out_multiplier.const_data_ptr<int32_t>(), // out_multiplier
      out_shift.const_data_ptr<int32_t>(), // out_shift
```
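The net effect of this signature change is that `weight_zero_point` arrives as a tensor rather than a bare scalar, and the kernel reads its first element when forming the zero bias. A minimal Python sketch of that behavior (the function name `mat1_zero_bias` is hypothetical; a plain list stands in for the tensor):

```python
# Sketch of the new zero-bias computation: read element 0 of the
# weight_zero_point tensor and negate it, mirroring
# `-weight_zero_point.const_data_ptr<int32_t>()[0]` in the C++ diff above.
def mat1_zero_bias(weight_zero_point):
    return -weight_zero_point[0]
```

Passing the zero point as a tensor matches the updated `functions.yaml` schema, where `weight_zero_point` changed from `int` to `Tensor`.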
