[ET-VK][Ops] enabling double support for quantization and dequantization ops

morelos · morelos · commit 9fb73e4baf4a · 2025-06-11T09:59:31.000-07:00
With double support added to the layout template, this diff enables double as an input dtype for quantization and as an output dtype for dequantization. Since there are limitations on how 64-bit types can be supported, the expectation is that IO is downgraded to 32-bit. Differential Revision: [D76289197](https://our.internmc.facebook.com/intern/diff/D76289197/) ghstack-source-id: 289707203 Pull Request resolved: #11553
diff --git a/backends/vulkan/runtime/graph/ops/glsl/dequantize.glsl b/backends/vulkan/runtime/graph/ops/glsl/dequantize.glsl
@@ -155,7 +155,10 @@ $if MODE == "per_tensor":
   [[unroll]] for (int i = 0; i < 4; ++i) {
     IN_T qvalue = IN_T(intex[i]);
     OUT_T value = dequantize_val(qvalue, scale, zero_point);
-    outtex[i] = value;
+    $if OUT_DTYPE == "double":
+      outtex[i] = float(value);
+    $else:
+      outtex[i] = value;
   }
   write_texel(t_out, pos, outtex);
 
@@ -198,7 +201,10 @@ $if MODE == "per_token":
   [[unroll]] for (int i = 0; i < 4; ++i) {
     IN_T qvalue = IN_T(intex[i]);
     OUT_T value = dequantize_val(qvalue, scale_val, zero_point_val);
-    outtex[i] = value;
+    $if OUT_DTYPE == "double":
+      outtex[i] = float(value);
+    $else:
+      outtex[i] = value;
   }
 
   write_texel(t_out, pos, outtex);
diff --git a/backends/vulkan/runtime/graph/ops/glsl/dequantize.yaml b/backends/vulkan/runtime/graph/ops/glsl/dequantize.yaml
@@ -15,6 +15,7 @@ dequantize:
     OUT_DTYPE:
       - VALUE: half
       - VALUE: float
+      - VALUE: double
   shader_variants:
     - NAME: dequantize_per_tensor
       MODE: per_tensor
diff --git a/backends/vulkan/runtime/graph/ops/glsl/quantize.yaml b/backends/vulkan/runtime/graph/ops/glsl/quantize.yaml
@@ -11,6 +11,7 @@ quantize:
     IN_DTYPE:
       - VALUE: half
       - VALUE: float
+      - VALUE: double
     OUT_DTYPE:
       - VALUE: uint8
       - VALUE: int8
diff --git a/backends/vulkan/runtime/graph/ops/impl/Quantize.cpp b/backends/vulkan/runtime/graph/ops/impl/Quantize.cpp
@@ -191,6 +191,7 @@ void quantize_per_tensor_impl(
 
   // Verify input is a floating point type
   VK_CHECK_COND(
+      graph.dtype_of(input) == vkapi::kDouble ||
       graph.dtype_of(input) == vkapi::kFloat ||
       graph.dtype_of(input) == vkapi::kHalf);
 
@@ -214,6 +215,7 @@ void quantize_per_token_impl(
 
   // Verify input is a floating point type
   VK_CHECK_COND(
+      graph.dtype_of(input) == vkapi::kDouble ||
       graph.dtype_of(input) == vkapi::kFloat ||
       graph.dtype_of(input) == vkapi::kHalf);
 
diff --git a/backends/vulkan/test/op_tests/dequantize_test.cpp b/backends/vulkan/test/op_tests/dequantize_test.cpp
@@ -462,6 +462,12 @@ void test_vulkan_dequantize_per_tensor(
       vkcompute::utils::kBuffer,
       vkcompute::utils::kBuffer);
 
+  // Expect float rather than double from here on, since the shader can
+  // only return 32-bit values
+  if (out_dtype == at::kDouble) {
+    out_dtype = at::kFloat;
+  }
+
   // Test with texture storage
   test_vulkan_dequantize_per_tensor_impl(
       input_sizes,
@@ -496,6 +502,12 @@ void test_vulkan_dequantize_per_token(
       vkcompute::utils::kBuffer,
       vkcompute::utils::kBuffer);
 
+  // Expect float rather than double from here on, since the shader can
+  // only return 32-bit values
+  if (out_dtype == at::kDouble) {
+    out_dtype = at::kFloat;
+  }
+
   // Test with texture storage
   test_vulkan_dequantize_per_token_impl(
       input_sizes,
@@ -790,6 +802,19 @@ TEST(
       at::kFloat); // output dtype
 }
 
+TEST(
+    VulkanDequantizePerTensorTest,
+    test_vulkan_dequantize_per_tensor_int32_to_double) {
+  test_vulkan_dequantize_per_tensor(
+      {2, 4, 3}, // input sizes
+      0.0001, // scale
+      100, // zero_point
+      -2147483648, // quant_min (int32 minimum)
+      2147483647, // quant_max (int32 maximum)
+      at::kInt, // input dtype
+      at::kDouble); // output dtype
+}
+
 void test_reference_dequantize_per_token(
     const std::vector<int>& input_sizes,
     const std::vector<float>& scales,
@@ -1165,3 +1190,19 @@ TEST(
       at::kInt, // input dtype
       at::kFloat); // output dtype
 }
+
+TEST(
+    VulkanDequantizePerTokenTest,
+    test_vulkan_dequantize_per_token_int32_to_double) {
+  std::vector<float> scales = {0.0001, 0.0002, 0.0003, 0.0};
+  std::vector<int> zero_points = {100, -100, 50, -50};
+
+  test_vulkan_dequantize_per_token(
+      {2, 2, 8}, // input sizes (2*2=4 tokens)
+      scales,
+      zero_points,
+      -2147483648, // quant_min (int32 minimum)
+      2147483647, // quant_max (int32 maximum)
+      at::kInt, // input dtype
+      at::kDouble); // output dtype
+}
diff --git a/backends/vulkan/test/op_tests/quantize_test.cpp b/backends/vulkan/test/op_tests/quantize_test.cpp
@@ -432,6 +432,12 @@ void test_vulkan_quantize_per_tensor(
       vkcompute::utils::kBuffer,
       vkcompute::utils::kBuffer);
 
+  // If in_dtype is double, switch to float for the texture implementation,
+  // since it doesn't support 64-bit inputs
+  if (in_dtype == at::kDouble) {
+    in_dtype = at::kFloat;
+  }
+
   // Test with texture storage
   test_vulkan_quantize_per_tensor_impl(
       input_sizes,
@@ -466,6 +472,12 @@ void test_vulkan_quantize_per_token(
       vkcompute::utils::kBuffer,
       vkcompute::utils::kBuffer);
 
+  // If in_dtype is double, switch to float for the texture implementation,
+  // since it doesn't support 64-bit inputs
+  if (in_dtype == at::kDouble) {
+    in_dtype = at::kFloat;
+  }
+
   // Test with texture storage
   test_vulkan_quantize_per_token_impl(
       input_sizes,
@@ -718,6 +730,19 @@ TEST(
       at::kChar); // output dtype
 }
 
+TEST(
+    VulkanQuantizePerTensorTest,
+    test_vulkan_quantize_per_tensor_double_to_int8) {
+  test_vulkan_quantize_per_tensor(
+      {2, 3}, // input sizes
+      0.01, // scale
+      1, // zero_point
+      -128, // quant_min (int8 minimum)
+      127, // quant_max (int8 maximum)
+      at::kDouble, // input dtype
+      at::kChar); // output dtype
+}
+
 void test_reference_quantize_per_token(
     const std::vector<int>& input_sizes,
     const std::vector<float>& pre_scales,
@@ -1064,3 +1089,19 @@ TEST(VulkanQuantizePerTensorTest, test_vulkan_quantize_per_token_half_to_int8) {
       at::kHalf, // input dtype
       at::kChar); // output dtype
 }
+
+TEST(
+    VulkanQuantizePerTensorTest,
+    test_vulkan_quantize_per_token_double_to_int8) {
+  std::vector<float> scales = {0.1, 0.2};
+  std::vector<int> zero_points = {0, 5};
+
+  test_vulkan_quantize_per_token(
+      {2, 2}, // input sizes (2 tokens, matching the 2 scales/zero_points)
+      scales,
+      zero_points,
+      -128, // quant_min (int8 minimum)
+      127, // quant_max (int8 maximum)
+      at::kDouble, // input dtype
+      at::kChar); // output dtype
+}