VSX: Fix overflow in complex division (pytorch#116972)

Flamefire · pytorchmergebot · commit 3ee092f75b34 · 2024-01-21T19:21:13.000Z
For large complex values the division produces inf or NaN values, which in turn causes other functions that rely on it to produce inf/NaN as well, e.g. `torch._refs.sgn`, which is used in a test.

Example:
```
$ python -c 'import torch; print(torch._refs.sgn(torch.complex(torch.tensor([-501]*16, dtype=torch.float32), torch.tensor([-1e20]*16, dtype=torch.float32))))'
tensor([-0.+nanj, -0.+nanj, -0.+nanj, -0.+nanj, -0.+nanj, -0.+nanj, -0.+nanj, -0.+nanj, -0.+nanj, -0.+nanj, -0.+nanj, -0.+nanj, -0.+nanj, -0.+nanj, -0.+nanj, -0.+nanj])
$ python -c 'import torch; t = torch.complex(torch.tensor([-501]*16, dtype=torch.float32), torch.tensor([-1e20]*16, dtype=torch.float32)); print(t / t.abs())'
tensor([-0.+nanj, -0.+nanj, -0.+nanj, -0.+nanj, -0.+nanj, -0.+nanj, -0.+nanj, -0.+nanj, -0.+nanj, -0.+nanj, -0.+nanj, -0.+nanj, -0.+nanj, -0.+nanj, -0.+nanj, -0.+nanj])
```

Implement the same algorithm as used in numpy and x86 (pytorch#93277).

The reason is that for a tensor with a component of `1e20` the abs-squared value used in the division contains a term `1e20 * 1e20`, which overflows the dynamic range of float32 (3e38) and yields an "inf", so the division yields "nan".

Output after the change:
```
$ python -c 'import torch; t = torch.complex(torch.tensor([-501]*16, dtype=torch.float32), torch.tensor([-1e20]*16, dtype=torch.float32)); print(torch._refs.sgn(t), t.sgn(), t / t.abs())'
tensor([-5.0100e-18-1.j, -5.0100e-18-1.j, -5.0100e-18-1.j, -5.0100e-18-1.j, -5.0100e-18-1.j, -5.0100e-18-1.j, -5.0100e-18-1.j, -5.0100e-18-1.j, -5.0100e-18-1.j, -5.0100e-18-1.j, -5.0100e-18-1.j, -5.0100e-18-1.j, -5.0100e-18-1.j, -5.0100e-18-1.j, -5.0100e-18-1.j, -5.0100e-18-1.j])
tensor([-5.0100e-18-1.j, -5.0100e-18-1.j, -5.0100e-18-1.j, -5.0100e-18-1.j, -5.0100e-18-1.j, -5.0100e-18-1.j, -5.0100e-18-1.j, -5.0100e-18-1.j, -5.0100e-18-1.j, -5.0100e-18-1.j, -5.0100e-18-1.j, -5.0100e-18-1.j, -5.0100e-18-1.j, -5.0100e-18-1.j, -5.0100e-18-1.j, -5.0100e-18-1.j])
tensor([-5.0100e-18-1.j, -5.0100e-18-1.j, -5.0100e-18-1.j, -5.0100e-18-1.j, -5.0100e-18-1.j, -5.0100e-18-1.j, -5.0100e-18-1.j, -5.0100e-18-1.j, -5.0100e-18-1.j, -5.0100e-18-1.j, -5.0100e-18-1.j, -5.0100e-18-1.j, -5.0100e-18-1.j, -5.0100e-18-1.j, -5.0100e-18-1.j, -5.0100e-18-1.j])
```

CC @quickwritereader who wrote the initial code, @VitalyFedyunin who was involved in the initial review, and @lezcano who reviewed pytorch#93277.

Pull Request resolved: pytorch#116972
Approved by: https://github.com/lezcano
diff --git a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_complex_double_vsx.h b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_complex_double_vsx.h
@@ -212,12 +212,19 @@ class Vectorized<ComplexDbl> {
   static Vectorized<ComplexDbl> el_mergee(
       Vectorized<ComplexDbl>& first,
       Vectorized<ComplexDbl>& second) {
-    // as mergee phased in , we can use vec_perm with mask
     return {
         vec_mergeh(first._vec0, second._vec0),
         vec_mergeh(first._vec1, second._vec1)};
   }
 
+  static Vectorized<ComplexDbl> el_mergeo(
+      Vectorized<ComplexDbl>& first,
+      Vectorized<ComplexDbl>& second) {
+    return {
+        vec_mergel(first._vec0, second._vec0),
+        vec_mergel(first._vec1, second._vec1)};
+  }
+
   Vectorized<ComplexDbl> abs_2_() const {
     auto a = (*this).elwise_mult(*this);
     auto permuted = a.el_swapped();
@@ -385,13 +392,11 @@ class Vectorized<ComplexDbl> {
   static Vectorized<ComplexDbl> horizontal_add(
       Vectorized<ComplexDbl>& first,
       Vectorized<ComplexDbl>& second) {
-    auto first_perm = first.el_swapped(); // 2perm
-    auto second_perm = second.el_swapped(); // 2perm
-    // summ
-    auto first_ret = first + first_perm; // 2add
-    auto second_ret = second + second_perm; // 2 add
-    // now lets choose evens
-    return el_mergee(first_ret, second_ret); // 2 mergee's
+    // Operates on individual doubles, see _mm_hadd_pd
+    // {f0+f1, s0+s1, f2+f3, s2+s3, ...}
+    // i.e. it sums the re and im of each value and interleaves first and second:
+    // {f_re0 + f_im0, s_re0 + s_im0, f_re1 + f_im1, s_re1 + s_im1, ...}
+    return el_mergee(first, second) + el_mergeo(first, second);
   }
 
   static Vectorized<ComplexDbl> horizontal_sub(
@@ -432,25 +437,20 @@ class Vectorized<ComplexDbl> {
     // re + im*i = (a + bi)  / (c + di)
     // re = (ac + bd)/abs_2()
     // im = (bc - ad)/abs_2()
-#if 1
-    auto vi = b.el_mergeo();
-    auto vr = b.el_mergee();
-    auto abs_b = b.abs_2_();
-    vi = vi ^ vd_isign_mask;
-    auto ret = elwise_mult(vr);
-    auto vx_swapped = el_swapped();
-    ret = vx_swapped.el_madd(vi, ret);
-    ret = ret.elwise_div(abs_b);
-#else
-    // Vectorized x86 simulation
-    auto ac_bd = elwise_mult(b);
-    auto d_c = b.el_swapped();
-    d_c = d_c ^ vd_rsign_mask;
-    auto ad_bc = elwise_mult(d_c);
-    auto abs_b = b.abs_2_();
-    auto re_im = horizontal_add(ac_bd, ad_bc);
-    auto ret = re_im.elwise_div(abs_b);
-#endif
+    auto fabs_cd =  Vectorized{
+      vec_andc(b._vec0, vd_sign_mask),
+      vec_andc(b._vec1, vd_sign_mask)};       // |c|            |d|
+    auto fabs_dc =  fabs_cd.el_swapped();     // |d|            |c|
+    auto scale = fabs_cd.elwise_max(fabs_dc); // sc = max(|c|, |d|)
+    auto a2 = elwise_div(scale);              // a/sc           b/sc
+    auto b2 = b.elwise_div(scale);            // c/sc           d/sc
+    auto acbd2 = a2.elwise_mult(b2);          // ac/sc^2        bd/sc^2
+    auto dc2 = b2.el_swapped();               // d/sc           c/sc
+    dc2 = dc2 ^ vd_rsign_mask;                // -d/sc          c/sc
+    auto adbc2 = a2.elwise_mult(dc2);         // -ad/sc^2       bc/sc^2
+    auto ret = horizontal_add(acbd2, adbc2);  // (ac+bd)/sc^2   (bc-ad)/sc^2
+    auto denom2 = b2.abs_2_();                // (c^2+d^2)/sc^2 (c^2+d^2)/sc^2
+    ret = ret.elwise_div(denom2);
     return ret;
   }
 
@@ -511,13 +511,14 @@ class Vectorized<ComplexDbl> {
   DEFINE_MEMBER_OP(operator&, ComplexDbl, vec_and)
   DEFINE_MEMBER_OP(operator|, ComplexDbl, vec_or)
   DEFINE_MEMBER_OP(operator^, ComplexDbl, vec_xor)
-  // elelemtwise helpers
+  // elementwise helpers
   DEFINE_MEMBER_OP(elwise_mult, ComplexDbl, vec_mul)
   DEFINE_MEMBER_OP(elwise_div, ComplexDbl, vec_div)
   DEFINE_MEMBER_OP(elwise_gt, ComplexDbl, vec_cmpgt)
   DEFINE_MEMBER_OP(elwise_ge, ComplexDbl, vec_cmpge)
   DEFINE_MEMBER_OP(elwise_lt, ComplexDbl, vec_cmplt)
   DEFINE_MEMBER_OP(elwise_le, ComplexDbl, vec_cmple)
+  DEFINE_MEMBER_OP(elwise_max, ComplexDbl, vec_max)
 };
 
 template <>
diff --git a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_complex_float_vsx.h b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_complex_float_vsx.h
@@ -238,18 +238,14 @@ class Vectorized<ComplexFlt> {
     return loadu(tmp);
   }
 
-  static Vectorized<ComplexFlt> horizontal_add_permD8(
+  static Vectorized<ComplexFlt> horizontal_add(
       Vectorized<ComplexFlt>& first,
       Vectorized<ComplexFlt>& second) {
-    // we will simulate it differently with 6 instructions total
-    // lets permute second so that we can add it getting horizontal sums
-    auto first_perm = first.el_swapped(); // 2perm
-    auto second_perm = second.el_swapped(); // 2perm
-    // sum
-    auto first_ret = first + first_perm; // 2add
-    auto second_ret = second + second_perm; // 2 add
-    // now lets choose evens
-    return el_mergee(first_ret, second_ret); // 2 mergee's
+    // Operates on individual floats, see _mm_hadd_ps
+    // {f0+f1, s0+s1, f2+f3, s2+s3, ...}
+    // i.e. it sums the re and im of each value and interleaves first and second:
+    // {f_re0 + f_im0, s_re0 + s_im0, f_re1 + f_im1, s_re1 + s_im1, ...}
+    return el_mergee(first, second) + el_mergeo(first, second);
   }
 
   static Vectorized<ComplexFlt> horizontal_sub_permD8(
@@ -353,12 +349,19 @@ class Vectorized<ComplexFlt> {
   static Vectorized<ComplexFlt> el_mergee(
       Vectorized<ComplexFlt>& first,
       Vectorized<ComplexFlt>& second) {
-    // as mergee phased in , we can use vec_perm with mask
     return {
         vec_mergee(first._vecb0, second._vecb0),
         vec_mergee(first._vecb1, second._vecb1)};
   }
 
+  static Vectorized<ComplexFlt> el_mergeo(
+      Vectorized<ComplexFlt>& first,
+      Vectorized<ComplexFlt>& second) {
+    return {
+        vec_mergeo(first._vecb0, second._vecb0),
+        vec_mergeo(first._vecb1, second._vecb1)};
+  }
+
   Vectorized<ComplexFlt> angle_() const {
     // angle = atan2(b/a)
     // auto b_a = _mm256_permute_ps(values, 0xB1); // b        a
@@ -488,25 +491,20 @@ class Vectorized<ComplexFlt> {
     // re + im*i = (a + bi)  / (c + di)
     // re = (ac + bd)/abs_2()
     // im = (bc - ad)/abs_2()
-#if 1
-    auto vi = b.el_mergeo();
-    auto vr = b.el_mergee();
-    auto abs_b = b.abs_2_();
-    vi = vi ^ isign_mask;
-    auto ret = elwise_mult(vr);
-    auto vx_swapped = el_swapped();
-    ret = vx_swapped.el_madd(vi, ret);
-    ret = ret.elwise_div(abs_b);
-#else
-    // Vectorized x86 simulation
-    auto ac_bd = elwise_mult(b);
-    auto d_c = b.el_swapped();
-    d_c = d_c ^ rsign_mask;
-    auto ad_bc = elwise_mult(d_c);
-    auto abs_b = b.abs_2_();
-    auto re_im = horizontal_add_permD8(ac_bd, ad_bc);
-    auto ret = re_im.elwise_div(abs_b);
-#endif
+    auto fabs_cd =  Vectorized{
+      vec_andc(b._vec0, sign_mask),
+      vec_andc(b._vec1, sign_mask)};          // |c|            |d|
+    auto fabs_dc =  fabs_cd.el_swapped();     // |d|            |c|
+    auto scale = fabs_cd.elwise_max(fabs_dc); // sc = max(|c|, |d|)
+    auto a2 = elwise_div(scale);              // a/sc           b/sc
+    auto b2 = b.elwise_div(scale);            // c/sc           d/sc
+    auto acbd2 = a2.elwise_mult(b2);          // ac/sc^2        bd/sc^2
+    auto dc2 = b2.el_swapped();               // d/sc           c/sc
+    dc2 = dc2 ^ rsign_mask;                   // -d/sc          c/sc
+    auto adbc2 = a2.elwise_mult(dc2);         // -ad/sc^2       bc/sc^2
+    auto ret = horizontal_add(acbd2, adbc2);  // (ac+bd)/sc^2   (bc-ad)/sc^2
+    auto denom2 = b2.abs_2_();                // (c^2+d^2)/sc^2 (c^2+d^2)/sc^2
+    ret = ret.elwise_div(denom2);
     return ret;
   }
 
@@ -589,6 +587,7 @@ class Vectorized<ComplexFlt> {
   DEFINE_MEMBER_OP(elwise_ge, ComplexFlt, vec_cmpge)
   DEFINE_MEMBER_OP(elwise_lt, ComplexFlt, vec_cmplt)
   DEFINE_MEMBER_OP(elwise_le, ComplexFlt, vec_cmple)
+  DEFINE_MEMBER_OP(elwise_max, ComplexFlt, vec_max)
 };
 
 template <>
diff --git a/aten/src/ATen/cpu/vec/vec256/vsx/vsx_helpers.h b/aten/src/ATen/cpu/vec/vec256/vsx/vsx_helpers.h
@@ -391,6 +391,7 @@ const vbool32 imag_mask = vbool32{0x0, 0xFFFFFFFF, 0x0, 0xFFFFFFFF};
 const vbool32 isign_mask = vbool32{0x0, 0x80000000, 0x0, 0x80000000};
 const vbool32 rsign_mask = vbool32{0x80000000, 0x0, 0x80000000, 0x0};
 
+const vbool64 vd_sign_mask  = vbool64{0x8000000000000000, 0x8000000000000000};
 const vbool64 vd_imag_mask  = vbool64{0x0, 0xFFFFFFFFFFFFFFFF};
 const vbool64 vd_real_mask  = vbool64{0xFFFFFFFFFFFFFFFF, 0x0};
 const vbool64 vd_isign_mask = vbool64{0x0, 0x8000000000000000};