Skip to content

Commit c0b24e0

Browse files
manuelcandales authored and facebook-github-bot committed
enable channels last transposed convolution
Differential Revision: D59622072
1 parent 074a81e commit c0b24e0

File tree

2 files changed

+84
-16
lines changed

2 files changed

+84
-16
lines changed

kernels/portable/cpu/op_convolution.cpp

Lines changed: 15 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -136,17 +136,6 @@ void conv2d_impl(
136136
}
137137
}
138138
} else { // transposed convolution
139-
if (bias_ptr != nullptr) {
140-
out_coord[2] = 0;
141-
out_coord[3] = 0;
142-
size_t out_c_start_idx =
143-
calculate_linear_index(out_coord, out_strides.data(), 4);
144-
size_t out_c_end_idx = out_c_start_idx + out_H * out_W;
145-
for (size_t out_ix = out_c_start_idx; out_ix < out_c_end_idx; out_ix++) {
146-
out_ptr[out_ix] = convert<CTYPE, CTYPE_BIAS>(bias_ptr[out_c]);
147-
}
148-
}
149-
150139
w_coord[1] = out_c - out_c_start;
151140

152141
for (size_t in_y = 0; in_y < in_H; ++in_y) {
@@ -295,12 +284,22 @@ void convolution_wrapper(
295284
bias.has_value() ? bias.value().const_data_ptr<CTYPE_BIAS>() : nullptr;
296285

297286
size_t out_N = out.size(0);
298-
size_t out_C_per_group = out.size(1) / groups;
287+
size_t out_C = out.size(1);
288+
size_t out_C_per_group = out_C / groups;
299289

300-
if (transposed && bias_ptr == nullptr) {
301-
// If bias is not present, we need to initialize the output to 0
302-
// before we can accumulate into it.
303-
memset(out_ptr, 0, out.nbytes());
290+
if (transposed) {
291+
// For transposed convolution, we need to initialize the output before we
292+
// can accumulate into it.
293+
if (bias_ptr == nullptr) {
294+
// If bias is not present, we need to initialize the output to 0
295+
memset(out_ptr, 0, out.nbytes());
296+
} else {
297+
// If bias is present, we initialize the output to the bias value
298+
for (size_t out_ix = 0; out_ix < out.numel(); ++out_ix) {
299+
out_ptr[out_ix] = convert<CTYPE, CTYPE_BIAS>(
300+
bias_ptr[(out_ix / out_strides[1]) % out_C]);
301+
}
302+
}
304303
}
305304

306305
for (size_t batch = 0; batch < out_N; ++batch) {

kernels/test/op_convolution_test.cpp

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -587,6 +587,75 @@ TEST_F(OpConvCorrectnessTest, TransposedNonDefaultParams) {
587587
EXPECT_TENSOR_CLOSE(out, expected);
588588
}
589589

590+
// Verifies transposed convolution with non-default stride/padding/dilation/
// output_padding/groups when all tensors use the channels-last (NHWC) memory
// format. The expected values are authored in contiguous (NCHW) order and
// permuted into channels-last layout before comparison.
TEST_F(OpConvCorrectnessTest, TransposedNonDefaultParamsChannelsLast) {
  TensorFactory<ScalarType::Float> tf;

  Tensor input = tf.full_channels_last({2, 6, 4, 5}, 2.0);
  Tensor weight = tf.full_channels_last({6, 1, 2, 2}, 0.5);
  Tensor bias = tf.make({3}, {1, 2, 3});
  Tensor out = tf.full_channels_last({2, 3, 3, 6}, 0.7);

  // Expected output, written in contiguous NCHW element order.
  Tensor expected = tf.make(
      {2, 3, 3, 6},
      {1, 1, 1, 1, 1, 1, 1, 3, 3, 1, 3, 3, 1, 3, 3, 1, 3, 3, 2, 2, 2, 2,
       2, 2, 2, 4, 4, 2, 4, 4, 2, 4, 4, 2, 4, 4, 3, 3, 3, 3, 3, 3, 3, 5,
       5, 3, 5, 5, 3, 5, 5, 3, 5, 5, 1, 1, 1, 1, 1, 1, 1, 3, 3, 1, 3, 3,
       1, 3, 3, 1, 3, 3, 2, 2, 2, 2, 2, 2, 2, 4, 4, 2, 4, 4, 2, 4, 4, 2,
       4, 4, 3, 3, 3, 3, 3, 3, 3, 5, 5, 3, 5, 5, 3, 5, 5, 3, 5, 5});

  const std::vector<int32_t> sizes(
      expected.sizes().begin(), expected.sizes().end());

  const int32_t N = sizes[0];
  const int32_t C = sizes[1];
  const int32_t H = sizes[2];
  const int32_t W = sizes[3];
  const int32_t numel = N * C * H * W;

  std::vector<float> nchw_data(
      expected.const_data_ptr<float>(),
      expected.const_data_ptr<float>() + expected.numel());

  // Permute NCHW -> NHWC: decompose each flat contiguous index into its
  // (n, c, h, w) coordinates and recompose it in channels-last order.
  std::vector<float> nhwc_data(numel);
  for (int32_t src = 0; src < numel; ++src) {
    const int32_t w = src % W;
    const int32_t h = (src / W) % H;
    const int32_t c = (src / (W * H)) % C;
    const int32_t n = src / (W * H * C);
    const int32_t dst = ((n * H + h) * W + w) * C + c;
    nhwc_data[dst] = nchw_data[src];
  }

  Tensor expected_channels_last = tf.make_channels_last(sizes, nhwc_data);

  // Deliberately non-default parameters to exercise the general code path.
  int64_t stride[1] = {3};
  int64_t padding[1] = {7};
  int64_t dilation[1] = {5};
  bool transposed = true;
  int64_t output_padding[1] = {2};
  int64_t groups = 3;

  op_convolution_out(
      input,
      weight,
      exec_aten::optional<Tensor>(bias),
      exec_aten::ArrayRef<int64_t>{stride, 1},
      exec_aten::ArrayRef<int64_t>{padding, 1},
      exec_aten::ArrayRef<int64_t>{dilation, 1},
      transposed,
      exec_aten::ArrayRef<int64_t>{output_padding, 1},
      groups,
      out);

  EXPECT_TENSOR_CLOSE(out, expected_channels_last);
}
658+
590659
TEST_F(OpConvCorrectnessTest, InvalidOutputPadding) {
591660
TensorFactory<ScalarType::Float> tf;
592661

0 commit comments

Comments
 (0)