Commit e7594a4

add new feature flag and new files from the specialized-div-rem crate
1 parent cde22bc commit e7594a4

9 files changed (+1111, -2 lines)

Cargo.toml

Lines changed: 4 additions & 1 deletion
@@ -38,7 +38,7 @@ cc = { optional = true, version = "1.0" }
 panic-handler = { path = 'crates/panic-handler' }
 
 [features]
-default = ["compiler-builtins"]
+default = ["compiler-builtins", "asymmetric-div-asm"]
 
 # Enable compilation of C code in compiler-rt, filling in some more optimized
 # implementations and also filling in unimplemented intrinsics
@@ -60,6 +60,9 @@ no-lang-items = []
 # Only used in the compiler's build system
 rustc-dep-of-std = ['compiler-builtins', 'core']
 
+# Enables faster u128 division on x86_64 by using the `divq` assembly instruction
+asymmetric-div-asm = []
+
 [[example]]
 name = "intrinsics"
 required-features = ["compiler-builtins"]
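
The new `asymmetric-div-asm` feature gates the use of the x86_64 `divq` instruction, which divides a 128-bit value by a 64-bit value in one step. As a rough sketch of what such a helper could look like (illustrative only, not code from this commit; it uses today's `core::arch::asm!` syntax, and the function name and signature are assumptions):

```rust
/// Hypothetical `divq`-backed helper: divides a u128 by a u64 and returns
/// (quotient, remainder). The CPU faults if `div` is zero or if the quotient
/// does not fit in 64 bits, hence the `unsafe` contract.
#[cfg(all(target_arch = "x86_64", feature = "asymmetric-div-asm"))]
unsafe fn u128_by_u64_div_rem(duo: u128, div: u64) -> (u64, u64) {
    let duo_lo = duo as u64;
    let duo_hi = (duo >> 64) as u64;
    let quo: u64;
    let rem: u64;
    // `div r64` divides the 128-bit value in RDX:RAX by the operand,
    // leaving the quotient in RAX and the remainder in RDX.
    core::arch::asm!(
        "div {divisor}",
        divisor = in(reg) div,
        inout("rax") duo_lo => quo,
        inout("rdx") duo_hi => rem,
        options(nostack)
    );
    (quo, rem)
}
```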

src/int/mod.rs

Lines changed: 3 additions & 1 deletion
@@ -14,8 +14,10 @@ macro_rules! os_ty {
 
 pub mod addsub;
 pub mod mul;
-pub mod sdiv;
 pub mod shift;
+
+pub mod sdiv;
+mod specialized_div_rem;
 pub mod udiv;
 
 /// Trait for some basic operations on integers

src/int/sdiv.rs

Lines changed: 1 addition & 0 deletions
@@ -1,4 +1,5 @@
 use int::Int;
+use super::specialized_div_rem::*;
 
 trait Div: Int {
     /// Returns `a / b`

Lines changed: 176 additions & 0 deletions
@@ -0,0 +1,176 @@
+macro_rules! impl_asymmetric {
+    (
+        $unsigned_name:ident, // name of the unsigned function
+        $signed_name:ident, // name of the signed function
+        $half_division:ident, // function for division of a $uX by a $uX
+        $asymmetric_division:ident, // function for division of a $uD by a $uX
+        $n_h:expr, // the number of bits in $iH or $uH
+        $uH:ident, // unsigned integer with half the bit width of $uX
+        $uX:ident, // unsigned integer with half the bit width of $uD
+        $uD:ident, // unsigned integer with double the bit width of $uX
+        $iD:ident, // signed version of $uD
+        $($unsigned_attr:meta),*; // attributes for the unsigned function
+        $($signed_attr:meta),* // attributes for the signed function
+    ) => {
+        /// Computes the quotient and remainder of `duo` divided by `div` and returns them as a
+        /// tuple.
+        ///
+        /// This is optimized for dividing integers with the same bitwidth as the largest operand in
+        /// an asymmetrically sized division. For example, the x86-64 `divq` assembly instruction
+        /// can divide a 128 bit integer by a 64 bit integer if the quotient fits in 64 bits.
+        ///
+        /// # Panics
+        ///
+        /// When attempting to divide by zero, this function will panic.
+        $(
+            #[$unsigned_attr]
+        )*
+        pub fn $unsigned_name(duo: $uD, div: $uD) -> ($uD,$uD) {
+            #[inline]
+            fn carrying_mul(lhs: $uX, rhs: $uX) -> ($uX, $uX) {
+                let tmp = (lhs as $uD).wrapping_mul(rhs as $uD);
+                (tmp as $uX, (tmp >> ($n_h * 2)) as $uX)
+            }
+            #[inline]
+            fn carrying_mul_add(lhs: $uX, mul: $uX, add: $uX) -> ($uX, $uX) {
+                let tmp = (lhs as $uD).wrapping_mul(mul as $uD).wrapping_add(add as $uD);
+                (tmp as $uX, (tmp >> ($n_h * 2)) as $uX)
+            }
+
+            let n: u32 = $n_h * 2;
+
+            // Many of these subalgorithms are taken from trifecta.rs, see that for better
+            // documentation
+
+            let duo_lo = duo as $uX;
+            let duo_hi = (duo >> n) as $uX;
+            let div_lo = div as $uX;
+            let div_hi = (div >> n) as $uX;
+            if div_hi == 0 {
+                if div_lo == 0 {
+                    // should be unreachable if upstream code has done its job in checking
+                    // for zero
+                    ::abort();
+                }
+                if duo_hi < div_lo {
+                    // plain $uD by $uX division that will fit into $uX
+                    let tmp = unsafe { $asymmetric_division(duo, div_lo) };
+                    return (tmp.0 as $uD, tmp.1 as $uD)
+                } else if (div_lo >> $n_h) == 0 {
+                    // Short division of $uD by a $uH.
+                    let div_0 = div_lo as $uH as $uX;
+                    let (quo_hi, rem_3) = $half_division(duo_hi, div_0);
+
+                    let duo_mid =
+                        ((duo >> $n_h) as $uH as $uX)
+                        | (rem_3 << $n_h);
+                    let (quo_1, rem_2) = $half_division(duo_mid, div_0);
+
+                    let duo_lo =
+                        (duo as $uH as $uX)
+                        | (rem_2 << $n_h);
+                    let (quo_0, rem_1) = $half_division(duo_lo, div_0);
+
+                    return (
+                        (quo_0 as $uD)
+                        | ((quo_1 as $uD) << $n_h)
+                        | ((quo_hi as $uD) << n),
+                        rem_1 as $uD
+                    )
+                } else {
+                    // Short division using the $uD by $uX division
+                    let (quo_hi, rem_hi) = $half_division(duo_hi, div_lo);
+                    let tmp = unsafe {
+                        $asymmetric_division((duo_lo as $uD) | ((rem_hi as $uD) << n), div_lo)
+                    };
+                    return ((tmp.0 as $uD) | ((quo_hi as $uD) << n), tmp.1 as $uD)
+                }
+            }
+
+            let duo_lz = duo_hi.leading_zeros();
+            let div_lz = div_hi.leading_zeros();
+            let rel_leading_sb = div_lz.wrapping_sub(duo_lz);
+            if rel_leading_sb < $n_h {
+                // Some x86_64 CPUs have bad `divq` implementations that make putting
+                // a `mul` or `mul - 1` algorithm here beneficial
+                let shift = n.wrapping_sub(duo_lz);
+                let duo_sig_n = (duo >> shift) as $uX;
+                let div_sig_n = (div >> shift) as $uX;
+                let mul = $half_division(duo_sig_n, div_sig_n).0;
+                let div_lo = div as $uX;
+                let div_hi = (div >> n) as $uX;
+                let (tmp_lo, carry) = carrying_mul(mul,div_lo);
+                let (tmp_hi, overflow) = carrying_mul_add(mul,div_hi,carry);
+                let tmp = (tmp_lo as $uD) | ((tmp_hi as $uD) << n);
+                if ((overflow & 1) != 0) || (duo < tmp) {
+                    return (
+                        mul.wrapping_sub(1) as $uD,
+                        duo.wrapping_add(div.wrapping_sub(tmp))
+                    )
+                } else {
+                    return (
+                        mul as $uD,
+                        duo.wrapping_sub(tmp)
+                    )
+                }
+            } else {
+                // This has been adapted from
+                // https://www.codeproject.com/tips/785014/uint-division-modulus which was in turn
+                // adapted from www.hackersdelight.org
+
+                // This is similar to the `mul` or `mul - 1` algorithm in that it uses only more
+                // significant parts of `duo` and `div` to divide a large integer with a smaller
+                // division instruction.
+                let tmp = unsafe {
+                    $asymmetric_division(duo >> 1, ((div << div_lz) >> n) as $uX)
+                };
+                let mut quo = tmp.0 >> ((n - 1) - div_lz);
+                if quo != 0 {
+                    quo -= 1;
+                }
+                // Note that this is a large $uD multiplication being used here
+                let mut rem = duo - ((quo as $uD) * div);
+
+                if rem >= div {
+                    quo += 1;
+                    rem -= div;
+                }
+                return (quo as $uD, rem)
+            }
+        }
+
+        /// Computes the quotient and remainder of `duo` divided by `div` and returns them as a
+        /// tuple.
+        ///
+        /// This is optimized for dividing integers with the same bitwidth as the largest operand in
+        /// an asymmetrically sized division. For example, the x86-64 `divq` assembly instruction
+        /// can divide a 128 bit integer by a 64 bit integer if the quotient fits in 64 bits.
+        ///
+        /// # Panics
+        ///
+        /// When attempting to divide by zero, this function will panic.
+        $(
+            #[$signed_attr]
+        )*
+        pub fn $signed_name(duo: $iD, div: $iD) -> ($iD,$iD) {
+            match (duo < 0, div < 0) {
+                (false,false) => {
+                    let t = $unsigned_name(duo as $uD,div as $uD);
+                    (t.0 as $iD,t.1 as $iD)
+                },
+                (true,false) => {
+                    let t = $unsigned_name(duo.wrapping_neg() as $uD,div as $uD);
+                    ((t.0 as $iD).wrapping_neg(),(t.1 as $iD).wrapping_neg())
+                },
+                (false,true) => {
+                    let t = $unsigned_name(duo as $uD,div.wrapping_neg() as $uD);
+                    ((t.0 as $iD).wrapping_neg(),t.1 as $iD)
+                },
+                (true,true) => {
+                    let t = $unsigned_name(duo.wrapping_neg() as $uD,div.wrapping_neg() as $uD);
+                    (t.0 as $iD,(t.1 as $iD).wrapping_neg())
+                },
+            }
+        }
+    }
+}
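
For orientation, a hypothetical instantiation of `impl_asymmetric!` for 128-bit division on a 64-bit target might look like the following. The produced function names and the two helper divisions (`u64_by_u64_div_rem` and the `divq`-backed `u128_by_u64_div_rem` sketched above) are placeholders for this example, not names taken from the commit:

```rust
// Assumed mapping: $uH = u32, $uX = u64, $uD = u128, $iD = i128, so $n_h = 32.
impl_asymmetric!(
    u128_div_rem_asymmetric, // $unsigned_name
    i128_div_rem_asymmetric, // $signed_name
    u64_by_u64_div_rem,      // $half_division: (u64, u64) -> (u64, u64)
    u128_by_u64_div_rem,     // $asymmetric_division: unsafe (u128, u64) -> (u64, u64)
    32,                      // $n_h
    u32,                     // $uH
    u64,                     // $uX
    u128,                    // $uD
    i128,                    // $iD
    inline;                  // attributes for the unsigned function
    inline                   // attributes for the signed function
);

// The expansion would then provide, for example:
// let (quo, rem) = u128_div_rem_asymmetric(0x1_0000_0000_0000_0005, 2);
// // quo == 0x8000_0000_0000_0002, rem == 1
```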
Lines changed: 114 additions & 0 deletions
@@ -0,0 +1,114 @@
+macro_rules! impl_binary_long {
+    (
+        $unsigned_name:ident, // name of the unsigned function
+        $signed_name:ident, // name of the signed function
+        $n:expr, // the number of bits in a $iX or $uX
+        $uX:ident, // unsigned integer that will be shifted
+        $iX:ident, // signed version of $uX
+        $($unsigned_attr:meta),*; // attributes for the unsigned function
+        $($signed_attr:meta),* // attributes for the signed function
+    ) => {
+
+        /// Computes the quotient and remainder of `duo` divided by `div` and returns them as a
+        /// tuple.
+        ///
+        /// This uses binary shift long division only, and is designed for CPUs without fast
+        /// multiplication or division hardware.
+        ///
+        /// # Panics
+        ///
+        /// When attempting to divide by zero, this function will panic.
+        $(
+            #[$unsigned_attr]
+        )*
+        pub fn $unsigned_name(duo: $uX, div: $uX) -> ($uX,$uX) {
+            if div == 0 {
+                // should be unreachable if upstream code has done its job in checking for
+                // zero
+                ::abort();
+            }
+
+            // Full $uX binary long division. Use `leading_zeros` on the first round,
+            // because we assume that the average usage of division has arguments that
+            // are random but have a significant number of leading zero bits. Doing
+            // `leading_zeros` for every round would be very expensive, especially for
+            // CPUs without a native count leading zeros instruction, but doing it just
+            // for the first round is advantageous for both performance of the common
+            // case and for code simplicity. Note that many benchmarks use the full
+            // `n_d` bits for `duo`, and the benchmarks with several bits less have a
+            // good performance increase.
+
+            let div_lz = div.leading_zeros();
+            let duo_lz = duo.leading_zeros();
+
+            if div_lz < duo_lz {
+                return (0, duo)
+            }
+
+            // Figures out how far `div` should be shifted to align most significant
+            // bits
+            let mut shift = div_lz - duo_lz;
+            let mut duo = duo;
+            let mut div = div << shift;
+            let mut quo = 0;
+            loop {
+                // There is a way to do this without branching, but requires too many extra
+                // operations to be faster:
+                // let sub = duo.wrapping_sub(div);
+                // let sign_mask = !(((sub as $iD) >> (n_d - 1)) as $uD);
+                // duo -= div & sign_mask;
+                // quo |= sign_mask & 1;
+                let sub = duo.wrapping_sub(div);
+                if (sub as $iX) >= 0 {
+                    duo = sub;
+                    quo |= 1;
+                    if duo == 0 {
+                        // must have this branch for inputs that are not as
+                        // random as what is used in the benchmarks
+                        return (quo << shift, duo)
+                    }
+                }
+
+                if shift == 0 {
+                    return (quo, duo)
+                }
+                shift -= 1;
+                div >>= 1;
+                quo <<= 1;
+            }
+        }
+
+        /// Computes the quotient and remainder of `duo` divided by `div` and returns them as a
+        /// tuple.
+        ///
+        /// This uses binary shift long division only, and is designed for CPUs without fast
+        /// multiplication or division hardware.
+        ///
+        /// # Panics
+        ///
+        /// When attempting to divide by zero, this function will panic.
+        $(
+            #[$signed_attr]
+        )*
+        pub fn $signed_name(duo: $iX, div: $iX) -> ($iX,$iX) {
+            match (duo < 0, div < 0) {
+                (false,false) => {
+                    let t = $unsigned_name(duo as $uX,div as $uX);
+                    (t.0 as $iX,t.1 as $iX)
+                },
+                (true,false) => {
+                    let t = $unsigned_name(duo.wrapping_neg() as $uX,div as $uX);
+                    ((t.0 as $iX).wrapping_neg(),(t.1 as $iX).wrapping_neg())
+                },
+                (false,true) => {
+                    let t = $unsigned_name(duo as $uX,div.wrapping_neg() as $uX);
+                    ((t.0 as $iX).wrapping_neg(),t.1 as $iX)
+                },
+                (true,true) => {
+                    let t = $unsigned_name(duo.wrapping_neg() as $uX,div.wrapping_neg() as $uX);
+                    (t.0 as $iX,(t.1 as $iX).wrapping_neg())
+                },
+            }
+        }
+    }
+}
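
Similarly, a hypothetical instantiation of `impl_binary_long!` for 32-bit operands (e.g. for a target without a hardware divider) could look like this; the produced function names are again placeholders chosen for the example:

```rust
// Assumed mapping: $n = 32, $uX = u32, $iX = i32.
impl_binary_long!(
    u32_div_rem_binary_long, // $unsigned_name
    i32_div_rem_binary_long, // $signed_name
    32,                      // $n
    u32,                     // $uX
    i32,                     // $iX
    inline;                  // attributes for the unsigned function
    inline                   // attributes for the signed function
);

// The expansion would then provide, for example:
// let (quo, rem) = u32_div_rem_binary_long(100, 7);   // (14, 2)
// let (quo, rem) = i32_div_rem_binary_long(-100, 7);  // (-14, -2)
```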
