Skip to content

Commit 3d769a6

Browse files
nikhilaravi authored and facebook-github-bot committed
Non Square image rasterization for pointclouds
Summary: Similar to non square image rasterization for meshes, apply the same updates to the pointcloud rasterizer. Main API Change: - PointRasterizationSettings now accepts a tuple/list of (H, W) for the image size. Reviewed By: jcjohnson Differential Revision: D25465206 fbshipit-source-id: 7370d83c431af1b972158cecae19d82364623380
1 parent 569e522 commit 3d769a6

22 files changed

+713
-264
lines changed

pytorch3d/csrc/compositing/alpha_composite.cu

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -30,15 +30,15 @@ __global__ void alphaCompositeCudaForwardKernel(
3030
// Get the batch and index
3131
const int batch = blockIdx.x;
3232

33-
const int num_pixels = C * W * H;
33+
const int num_pixels = C * H * W;
3434
const int num_threads = gridDim.y * blockDim.x;
3535
const int tid = blockIdx.y * blockDim.x + threadIdx.x;
3636

3737
// Iterate over each feature in each pixel
3838
for (int pid = tid; pid < num_pixels; pid += num_threads) {
39-
int ch = pid / (W * H);
40-
int j = (pid % (W * H)) / H;
41-
int i = (pid % (W * H)) % H;
39+
int ch = pid / (H * W);
40+
int j = (pid % (H * W)) / W;
41+
int i = (pid % (H * W)) % W;
4242

4343
// alphacomposite the different values
4444
float cum_alpha = 1.;
@@ -81,16 +81,16 @@ __global__ void alphaCompositeCudaBackwardKernel(
8181
// Get the batch and index
8282
const int batch = blockIdx.x;
8383

84-
const int num_pixels = C * W * H;
84+
const int num_pixels = C * H * W;
8585
const int num_threads = gridDim.y * blockDim.x;
8686
const int tid = blockIdx.y * blockDim.x + threadIdx.x;
8787

8888
// Parallelize over each feature in each pixel in images of size H * W,
8989
// for each image in the batch of size batch_size
9090
for (int pid = tid; pid < num_pixels; pid += num_threads) {
91-
int ch = pid / (W * H);
92-
int j = (pid % (W * H)) / H;
93-
int i = (pid % (W * H)) % H;
91+
int ch = pid / (H * W);
92+
int j = (pid % (H * W)) / W;
93+
int i = (pid % (H * W)) % W;
9494

9595
// alphacomposite the different values
9696
float cum_alpha = 1.;

pytorch3d/csrc/compositing/alpha_composite.h

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,13 +11,13 @@
1111
// features: FloatTensor of shape (C, P) which gives the features
1212
// of each point where C is the size of the feature and
1313
// P the number of points.
14-
// alphas: FloatTensor of shape (N, points_per_pixel, W, W) where
14+
// alphas: FloatTensor of shape (N, points_per_pixel, H, W) where
1515
// points_per_pixel is the number of points in the z-buffer
16-
// sorted in z-order, and W is the image size.
17-
// points_idx: IntTensor of shape (N, points_per_pixel, W, W) giving the
16+
// sorted in z-order, and (H, W) is the image size.
17+
// points_idx: IntTensor of shape (N, points_per_pixel, H, W) giving the
1818
// indices of the nearest points at each pixel, sorted in z-order.
1919
// Returns:
20-
// weighted_fs: FloatTensor of shape (N, C, W, W) giving the accumulated
20+
// weighted_fs: FloatTensor of shape (N, C, H, W) giving the accumulated
2121
// feature for each point. Concretely, it gives:
2222
// weighted_fs[b,c,i,j] = sum_k cum_alpha_k *
2323
// features[c,points_idx[b,k,i,j]]

pytorch3d/csrc/compositing/norm_weighted_sum.cu

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -30,16 +30,16 @@ __global__ void weightedSumNormCudaForwardKernel(
3030
// Get the batch and index
3131
const int batch = blockIdx.x;
3232

33-
const int num_pixels = C * W * H;
33+
const int num_pixels = C * H * W;
3434
const int num_threads = gridDim.y * blockDim.x;
3535
const int tid = blockIdx.y * blockDim.x + threadIdx.x;
3636

3737
// Parallelize over each feature in each pixel in images of size H * W,
3838
// for each image in the batch of size batch_size
3939
for (int pid = tid; pid < num_pixels; pid += num_threads) {
40-
int ch = pid / (W * H);
41-
int j = (pid % (W * H)) / H;
42-
int i = (pid % (W * H)) % H;
40+
int ch = pid / (H * W);
41+
int j = (pid % (H * W)) / W;
42+
int i = (pid % (H * W)) % W;
4343

4444
// Store the accumulated alpha value
4545
float cum_alpha = 0.;
@@ -101,9 +101,9 @@ __global__ void weightedSumNormCudaBackwardKernel(
101101
// Parallelize over each feature in each pixel in images of size H * W,
102102
// for each image in the batch of size batch_size
103103
for (int pid = tid; pid < num_pixels; pid += num_threads) {
104-
int ch = pid / (W * H);
105-
int j = (pid % (W * H)) / H;
106-
int i = (pid % (W * H)) % H;
104+
int ch = pid / (H * W);
105+
int j = (pid % (H * W)) / W;
106+
int i = (pid % (H * W)) % W;
107107

108108
float sum_alpha = 0.;
109109
float sum_alphafs = 0.;

pytorch3d/csrc/compositing/norm_weighted_sum.h

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,13 +11,13 @@
1111
// features: FloatTensor of shape (C, P) which gives the features
1212
// of each point where C is the size of the feature and
1313
// P the number of points.
14-
// alphas: FloatTensor of shape (N, points_per_pixel, W, W) where
14+
// alphas: FloatTensor of shape (N, points_per_pixel, H, W) where
1515
// points_per_pixel is the number of points in the z-buffer
16-
// sorted in z-order, and W is the image size.
17-
// points_idx: IntTensor of shape (N, points_per_pixel, W, W) giving the
16+
// sorted in z-order, and (H, W) is the image size.
17+
// points_idx: IntTensor of shape (N, points_per_pixel, H, W) giving the
1818
// indices of the nearest points at each pixel, sorted in z-order.
1919
// Returns:
20-
// weighted_fs: FloatTensor of shape (N, C, W, W) giving the accumulated
20+
// weighted_fs: FloatTensor of shape (N, C, H, W) giving the accumulated
2121
// feature in each point. Concretely, it gives:
2222
// weighted_fs[b,c,i,j] = sum_k alphas[b,k,i,j] *
2323
// features[c,points_idx[b,k,i,j]] / sum_k alphas[b,k,i,j]

pytorch3d/csrc/compositing/weighted_sum.cu

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -28,16 +28,16 @@ __global__ void weightedSumCudaForwardKernel(
2828
// Get the batch and index
2929
const int batch = blockIdx.x;
3030

31-
const int num_pixels = C * W * H;
31+
const int num_pixels = C * H * W;
3232
const int num_threads = gridDim.y * blockDim.x;
3333
const int tid = blockIdx.y * blockDim.x + threadIdx.x;
3434

3535
// Parallelize over each feature in each pixel in images of size H * W,
3636
// for each image in the batch of size batch_size
3737
for (int pid = tid; pid < num_pixels; pid += num_threads) {
38-
int ch = pid / (W * H);
39-
int j = (pid % (W * H)) / H;
40-
int i = (pid % (W * H)) % H;
38+
int ch = pid / (H * W);
39+
int j = (pid % (H * W)) / W;
40+
int i = (pid % (H * W)) % W;
4141

4242
// Iterate through the closest K points for this pixel
4343
for (int k = 0; k < points_idx.size(1); ++k) {
@@ -76,16 +76,16 @@ __global__ void weightedSumCudaBackwardKernel(
7676
// Get the batch and index
7777
const int batch = blockIdx.x;
7878

79-
const int num_pixels = C * W * H;
79+
const int num_pixels = C * H * W;
8080
const int num_threads = gridDim.y * blockDim.x;
8181
const int tid = blockIdx.y * blockDim.x + threadIdx.x;
8282

8383
// Iterate over each pixel to compute the contribution to the
8484
// gradient for the features and weights
8585
for (int pid = tid; pid < num_pixels; pid += num_threads) {
86-
int ch = pid / (W * H);
87-
int j = (pid % (W * H)) / H;
88-
int i = (pid % (W * H)) % H;
86+
int ch = pid / (H * W);
87+
int j = (pid % (H * W)) / W;
88+
int i = (pid % (H * W)) % W;
8989

9090
// Iterate through the closest K points for this pixel
9191
for (int k = 0; k < points_idx.size(1); ++k) {

pytorch3d/csrc/compositing/weighted_sum.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,13 +11,13 @@
1111
// features: FloatTensor of shape (C, P) which gives the features
1212
// of each point where C is the size of the feature and
1313
// P the number of points.
14-
// alphas: FloatTensor of shape (N, points_per_pixel, W, W) where
14+
// alphas: FloatTensor of shape (N, points_per_pixel, H, W) where
1515
// points_per_pixel is the number of points in the z-buffer
16-
// sorted in z-order, and W is the image size.
16+
// sorted in z-order, and (H, W) is the image size.
1717
// points_idx: IntTensor of shape (N, points_per_pixel, W, W) giving the
1818
// indices of the nearest points at each pixel, sorted in z-order.
1919
// Returns:
20-
// weighted_fs: FloatTensor of shape (N, C, W, W) giving the accumulated
20+
// weighted_fs: FloatTensor of shape (N, C, H, W) giving the accumulated
2121
// feature in each point. Concretely, it gives:
2222
// weighted_fs[b,c,i,j] = sum_k alphas[b,k,i,j] *
2323
// features[c,points_idx[b,k,i,j]]

pytorch3d/csrc/rasterize_meshes/rasterize_meshes.cu

Lines changed: 8 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -452,7 +452,6 @@ __global__ void RasterizeMeshesBackwardCudaKernel(
452452
const bool inside = b_pp.x > 0.0f && b_pp.y > 0.0f && b_pp.z > 0.0f;
453453
const float sign = inside ? -1.0f : 1.0f;
454454

455-
// TODO(T52813608) Add support for non-square images.
456455
auto grad_dist_f = PointTriangleDistanceBackward(
457456
pxy, v0xy, v1xy, v2xy, sign * grad_dist_upstream);
458457
const float2 ddist_d_v0 = thrust::get<1>(grad_dist_f);
@@ -606,7 +605,7 @@ __global__ void RasterizeMeshesCoarseCudaKernel(
606605
const float half_pix_x = NDC_x_half_range / W;
607606
const float half_pix_y = NDC_y_half_range / H;
608607

609-
// This is a boolean array of shape (num_bins, num_bins, chunk_size)
608+
// This is a boolean array of shape (num_bins_y, num_bins_x, chunk_size)
610609
// stored in shared memory that will track whether each point in the chunk
611610
// falls into each bin of the image.
612611
BitMask binmask((unsigned int*)sbuf, num_bins_y, num_bins_x, chunk_size);
@@ -755,7 +754,7 @@ at::Tensor RasterizeMeshesCoarseCuda(
755754
const int num_bins_y = 1 + (H - 1) / bin_size;
756755
const int num_bins_x = 1 + (W - 1) / bin_size;
757756

758-
if (num_bins_y >= kMaxFacesPerBin || num_bins_x >= kMaxFacesPerBin) {
757+
if (num_bins_y >= kMaxItemsPerBin || num_bins_x >= kMaxItemsPerBin) {
759758
std::stringstream ss;
760759
ss << "In Coarse Rasterizer got num_bins_y: " << num_bins_y
761760
<< ", num_bins_x: " << num_bins_x << ", "
@@ -800,7 +799,7 @@ at::Tensor RasterizeMeshesCoarseCuda(
800799
// ****************************************************************************
801800
__global__ void RasterizeMeshesFineCudaKernel(
802801
const float* face_verts, // (F, 3, 3)
803-
const int32_t* bin_faces, // (N, B, B, T)
802+
const int32_t* bin_faces, // (N, BH, BW, T)
804803
const float blur_radius,
805804
const int bin_size,
806805
const bool perspective_correct,
@@ -813,12 +812,12 @@ __global__ void RasterizeMeshesFineCudaKernel(
813812
const int H,
814813
const int W,
815814
const int K,
816-
int64_t* face_idxs, // (N, S, S, K)
817-
float* zbuf, // (N, S, S, K)
818-
float* pix_dists, // (N, S, S, K)
819-
float* bary // (N, S, S, K, 3)
815+
int64_t* face_idxs, // (N, H, W, K)
816+
float* zbuf, // (N, H, W, K)
817+
float* pix_dists, // (N, H, W, K)
818+
float* bary // (N, H, W, K, 3)
820819
) {
821-
// This can be more than S^2 if S % bin_size != 0
820+
// This can be more than H * W if H or W are not divisible by bin_size.
822821
int num_pixels = N * BH * BW * bin_size * bin_size;
823822
int num_threads = gridDim.x * blockDim.x;
824823
int tid = blockIdx.x * blockDim.x + threadIdx.x;

pytorch3d/csrc/rasterize_meshes/rasterize_meshes_cpu.cpp

Lines changed: 1 addition & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -5,41 +5,11 @@
55
#include <list>
66
#include <queue>
77
#include <tuple>
8+
#include "rasterize_points/rasterization_utils.h"
89
#include "utils/geometry_utils.h"
910
#include "utils/vec2.h"
1011
#include "utils/vec3.h"
1112

12-
// The default value of the NDC range is [-1, 1], however in the case that
13-
// H != W, the NDC range is set such that the shorter side has range [-1, 1] and
14-
// the longer side is scaled by the ratio of H:W. S1 is the dimension for which
15-
// the NDC range is calculated and S2 is the other image dimension.
16-
// e.g. to get the NDC x range S1 = W and S2 = H
17-
float NonSquareNdcRange(int S1, int S2) {
18-
float range = 2.0f;
19-
if (S1 > S2) {
20-
range = ((S1 / S2) * range);
21-
}
22-
return range;
23-
}
24-
25-
// Given a pixel coordinate 0 <= i < S1, convert it to a normalized device
26-
// coordinates. We divide the NDC range into S1 evenly-sized
27-
// pixels, and assume that each pixel falls in the *center* of its range.
28-
// The default value of the NDC range is [-1, 1], however in the case that
29-
// H != W, the NDC range is set such that the shorter side has range [-1, 1] and
30-
// the longer side is scaled by the ratio of H:W. The dimension of i should be
31-
// S1 and the other image dimension is S2 For example, to get the x and y NDC
32-
// coordinates or a given pixel i:
33-
// x = PixToNonSquareNdc(i, W, H)
34-
// y = PixToNonSquareNdc(i, H, W)
35-
float PixToNonSquareNdc(int i, int S1, int S2) {
36-
float range = NonSquareNdcRange(S1, S2);
37-
// NDC: offset + (i * pixel_width + half_pixel_width)
38-
// The NDC range is [-range/2, range/2].
39-
const float offset = (range / 2.0f);
40-
return -offset + (range * i + offset) / S1;
41-
}
42-
4313
// Get (x, y, z) values for vertex from (3, 3) tensor face.
4414
template <typename Face>
4515
auto ExtractVerts(const Face& face, const int vertex_index) {

pytorch3d/csrc/rasterize_points/rasterization_utils.cuh

Lines changed: 1 addition & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -2,16 +2,6 @@
22

33
#pragma once
44

5-
// Given a pixel coordinate 0 <= i < S, convert it to a normalized device
6-
// coordinates in the range [-1, 1]. We divide the NDC range into S evenly-sized
7-
// pixels, and assume that each pixel falls in the *center* of its range.
8-
// TODO: delete this function after updating the pointcloud rasterizer to
9-
// support non square images.
10-
__device__ inline float PixToNdc(int i, int S) {
11-
// NDC: x-offset + (i * pixel_width + half_pixel_width)
12-
return -1.0 + (2 * i + 1.0) / S;
13-
}
14-
155
// The default value of the NDC range is [-1, 1], however in the case that
166
// H != W, the NDC range is set such that the shorter side has range [-1, 1] and
177
// the longer side is scaled by the ratio of H:W. S1 is the dimension for which
@@ -50,7 +40,7 @@ __device__ inline float PixToNonSquareNdc(int i, int S1, int S2) {
5040
// TODO: is 8 enough? Would increasing have performance considerations?
5141
const int32_t kMaxPointsPerPixel = 150;
5242

53-
const int32_t kMaxFacesPerBin = 22;
43+
const int32_t kMaxItemsPerBin = 22;
5444

5545
template <typename T>
5646
__device__ inline void BubbleSort(T* arr, int n) {
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
2+
3+
#pragma once
4+
5+
// The default value of the NDC range is [-1, 1], however in the case that
6+
// H != W, the NDC range is set such that the shorter side has range [-1, 1] and
7+
// the longer side is scaled by the ratio of H:W. S1 is the dimension for which
8+
// the NDC range is calculated and S2 is the other image dimension.
9+
// e.g. to get the NDC x range S1 = W and S2 = H
10+
inline float NonSquareNdcRange(int S1, int S2) {
11+
float range = 2.0f;
12+
if (S1 > S2) {
13+
range = ((S1 / S2) * range);
14+
}
15+
return range;
16+
}
17+
18+
// Given a pixel coordinate 0 <= i < S1, convert it to a normalized device
19+
// coordinates. We divide the NDC range into S1 evenly-sized
20+
// pixels, and assume that each pixel falls in the *center* of its range.
21+
// The default value of the NDC range is [-1, 1], however in the case that
22+
// H != W, the NDC range is set such that the shorter side has range [-1, 1] and
23+
// the longer side is scaled by the ratio of H:W. The dimension of i should be
24+
// S1 and the other image dimension is S2 For example, to get the x and y NDC
25+
// coordinates or a given pixel i:
26+
// x = PixToNonSquareNdc(i, W, H)
27+
// y = PixToNonSquareNdc(i, H, W)
28+
inline float PixToNonSquareNdc(int i, int S1, int S2) {
29+
float range = NonSquareNdcRange(S1, S2);
30+
// NDC: offset + (i * pixel_width + half_pixel_width)
31+
// The NDC range is [-range/2, range/2].
32+
const float offset = (range / 2.0f);
33+
return -offset + (range * i + offset) / S1;
34+
}

0 commit comments

Comments
 (0)