Commit e7594a4

add new feature flag and new files from the specialized-div-rem crate
1 parent cde22bc commit e7594a4

9 files changed (+1111, -2 lines)

Cargo.toml

Lines changed: 4 additions & 1 deletion
@@ -38,7 +38,7 @@ cc = { optional = true, version = "1.0" }
 panic-handler = { path = 'crates/panic-handler' }
 
 [features]
-default = ["compiler-builtins"]
+default = ["compiler-builtins", "asymmetric-div-asm"]
 
 # Enable compilation of C code in compiler-rt, filling in some more optimized
 # implementations and also filling in unimplemented intrinsics
@@ -60,6 +60,9 @@ no-lang-items = []
 # Only used in the compiler's build system
 rustc-dep-of-std = ['compiler-builtins', 'core']
 
+# Enables faster u128 division on x86_64 by using the `divq` assembly instruction
+asymmetric-div-asm = []
+
 [[example]]
 name = "intrinsics"
 required-features = ["compiler-builtins"]
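
The new `asymmetric-div-asm` feature gates the use of the x86_64 `divq` instruction, which divides a 128-bit value by a 64-bit value in one step. As a rough sketch of what such a helper could look like (illustrative only, not code from this commit; it uses today's `core::arch::asm!` syntax, and the function name and signature are assumptions):

```rust
/// Hypothetical `divq`-backed helper: divides a u128 by a u64 and returns
/// (quotient, remainder). The CPU faults if `div` is zero or if the quotient
/// does not fit in 64 bits, hence the `unsafe` contract.
#[cfg(all(target_arch = "x86_64", feature = "asymmetric-div-asm"))]
unsafe fn u128_by_u64_div_rem(duo: u128, div: u64) -> (u64, u64) {
    let duo_lo = duo as u64;
    let duo_hi = (duo >> 64) as u64;
    let quo: u64;
    let rem: u64;
    // `div r64` divides the 128-bit value in RDX:RAX by the operand,
    // leaving the quotient in RAX and the remainder in RDX.
    core::arch::asm!(
        "div {divisor}",
        divisor = in(reg) div,
        inout("rax") duo_lo => quo,
        inout("rdx") duo_hi => rem,
        options(nostack)
    );
    (quo, rem)
}
```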

src/int/mod.rs

Lines changed: 3 additions & 1 deletion
@@ -14,8 +14,10 @@ macro_rules! os_ty {
 
 pub mod addsub;
 pub mod mul;
-pub mod sdiv;
 pub mod shift;
+
+pub mod sdiv;
+mod specialized_div_rem;
 pub mod udiv;
 
 /// Trait for some basic operations on integers

src/int/sdiv.rs

Lines changed: 1 addition & 0 deletions
@@ -1,4 +1,5 @@
 use int::Int;
+use super::specialized_div_rem::*;
 
 trait Div: Int {
     /// Returns `a / b`

Lines changed: 176 additions & 0 deletions
@@ -0,0 +1,176 @@
+macro_rules! impl_asymmetric {
+    (
+        $unsigned_name:ident, // name of the unsigned function
+        $signed_name:ident, // name of the signed function
+        $half_division:ident, // function for division of a $uX by a $uX
+        $asymmetric_division:ident, // function for division of a $uD by a $uX
+        $n_h:expr, // the number of bits in $iH or $uH
+        $uH:ident, // unsigned integer with half the bit width of $uX
+        $uX:ident, // unsigned integer with half the bit width of $uD
+        $uD:ident, // unsigned integer with double the bit width of $uX
+        $iD:ident, // signed version of $uD
+        $($unsigned_attr:meta),*; // attributes for the unsigned function
+        $($signed_attr:meta),* // attributes for the signed function
+    ) => {
+        /// Computes the quotient and remainder of `duo` divided by `div` and returns them as a
+        /// tuple.
+        ///
+        /// This is optimized for dividing integers with the same bitwidth as the largest operand in
+        /// an asymmetrically sized division. For example, the x86-64 `divq` assembly instruction
+        /// can divide a 128 bit integer by a 64 bit integer if the quotient fits in 64 bits.
+        ///
+        /// # Panics
+        ///
+        /// When attempting to divide by zero, this function will panic.
+        $(
+            #[$unsigned_attr]
+        )*
+        pub fn $unsigned_name(duo: $uD, div: $uD) -> ($uD,$uD) {
+            #[inline]
+            fn carrying_mul(lhs: $uX, rhs: $uX) -> ($uX, $uX) {
+                let tmp = (lhs as $uD).wrapping_mul(rhs as $uD);
+                (tmp as $uX, (tmp >> ($n_h * 2)) as $uX)
+            }
+            #[inline]
+            fn carrying_mul_add(lhs: $uX, mul: $uX, add: $uX) -> ($uX, $uX) {
+                let tmp = (lhs as $uD).wrapping_mul(mul as $uD).wrapping_add(add as $uD);
+                (tmp as $uX, (tmp >> ($n_h * 2)) as $uX)
+            }
+
+            let n: u32 = $n_h * 2;
+
+            // Many of these subalgorithms are taken from trifecta.rs, see that for better
+            // documentation
+
+            let duo_lo = duo as $uX;
+            let duo_hi = (duo >> n) as $uX;
+            let div_lo = div as $uX;
+            let div_hi = (div >> n) as $uX;
+            if div_hi == 0 {
+                if div_lo == 0 {
+                    // should be unreachable if upstream code has done its job in checking
+                    // for zero
+                    ::abort();
+                }
+                if duo_hi < div_lo {
+                    // plain $uD by $uX division that will fit into $uX
+                    let tmp = unsafe { $asymmetric_division(duo, div_lo) };
+                    return (tmp.0 as $uD, tmp.1 as $uD)
+                } else if (div_lo >> $n_h) == 0 {
+                    // Short division of $uD by a $uH.
+                    let div_0 = div_lo as $uH as $uX;
+                    let (quo_hi, rem_3) = $half_division(duo_hi, div_0);
+
+                    let duo_mid =
+                        ((duo >> $n_h) as $uH as $uX)
+                        | (rem_3 << $n_h);
+                    let (quo_1, rem_2) = $half_division(duo_mid, div_0);
+
+                    let duo_lo =
+                        (duo as $uH as $uX)
+                        | (rem_2 << $n_h);
+                    let (quo_0, rem_1) = $half_division(duo_lo, div_0);
+
+                    return (
+                        (quo_0 as $uD)
+                        | ((quo_1 as $uD) << $n_h)
+                        | ((quo_hi as $uD) << n),
+                        rem_1 as $uD
+                    )
+                } else {
+                    // Short division using the $uD by $uX division
+                    let (quo_hi, rem_hi) = $half_division(duo_hi, div_lo);
+                    let tmp = unsafe {
+                        $asymmetric_division((duo_lo as $uD) | ((rem_hi as $uD) << n), div_lo)
+                    };
+                    return ((tmp.0 as $uD) | ((quo_hi as $uD) << n), tmp.1 as $uD)
+                }
+            }
+
+            let duo_lz = duo_hi.leading_zeros();
+            let div_lz = div_hi.leading_zeros();
+            let rel_leading_sb = div_lz.wrapping_sub(duo_lz);
+            if rel_leading_sb < $n_h {
+                // Some x86_64 CPUs have bad `divq` implementations that make putting
+                // a `mul` or `mul - 1` algorithm here beneficial
+                let shift = n.wrapping_sub(duo_lz);
+                let duo_sig_n = (duo >> shift) as $uX;
+                let div_sig_n = (div >> shift) as $uX;
+                let mul = $half_division(duo_sig_n, div_sig_n).0;
+                let div_lo = div as $uX;
+                let div_hi = (div >> n) as $uX;
+                let (tmp_lo, carry) = carrying_mul(mul,div_lo);
+                let (tmp_hi, overflow) = carrying_mul_add(mul,div_hi,carry);
+                let tmp = (tmp_lo as $uD) | ((tmp_hi as $uD) << n);
+                if ((overflow & 1) != 0) || (duo < tmp) {
+                    return (
+                        mul.wrapping_sub(1) as $uD,
+                        duo.wrapping_add(div.wrapping_sub(tmp))
+                    )
+                } else {
+                    return (
+                        mul as $uD,
+                        duo.wrapping_sub(tmp)
+                    )
+                }
+            } else {
+                // This has been adapted from
+                // https://www.codeproject.com/tips/785014/uint-division-modulus which was in turn
+                // adapted from www.hackersdelight.org
+
+                // This is similar to the `mul` or `mul - 1` algorithm in that it uses only more
+                // significant parts of `duo` and `div` to divide a large integer with a smaller
+                // division instruction.
+                let tmp = unsafe {
+                    $asymmetric_division(duo >> 1, ((div << div_lz) >> n) as $uX)
+                };
+                let mut quo = tmp.0 >> ((n - 1) - div_lz);
+                if quo != 0 {
+                    quo -= 1;
+                }
+                // Note that this is a large $uD multiplication being used here
+                let mut rem = duo - ((quo as $uD) * div);
+
+                if rem >= div {
+                    quo += 1;
+                    rem -= div;
+                }
+                return (quo as $uD, rem)
+            }
+        }
+
+        /// Computes the quotient and remainder of `duo` divided by `div` and returns them as a
+        /// tuple.
+        ///
+        /// This is optimized for dividing integers with the same bitwidth as the largest operand in
+        /// an asymmetrically sized division. For example, the x86-64 `divq` assembly instruction
+        /// can divide a 128 bit integer by a 64 bit integer if the quotient fits in 64 bits.
+        ///
+        /// # Panics
+        ///
+        /// When attempting to divide by zero, this function will panic.
+        $(
+            #[$signed_attr]
+        )*
+        pub fn $signed_name(duo: $iD, div: $iD) -> ($iD,$iD) {
+            match (duo < 0, div < 0) {
+                (false,false) => {
+                    let t = $unsigned_name(duo as $uD,div as $uD);
+                    (t.0 as $iD,t.1 as $iD)
+                },
+                (true,false) => {
+                    let t = $unsigned_name(duo.wrapping_neg() as $uD,div as $uD);
+                    ((t.0 as $iD).wrapping_neg(),(t.1 as $iD).wrapping_neg())
+                },
+                (false,true) => {
+                    let t = $unsigned_name(duo as $uD,div.wrapping_neg() as $uD);
+                    ((t.0 as $iD).wrapping_neg(),t.1 as $iD)
+                },
+                (true,true) => {
+                    let t = $unsigned_name(duo.wrapping_neg() as $uD,div.wrapping_neg() as $uD);
+                    (t.0 as $iD,(t.1 as $iD).wrapping_neg())
+                },
+            }
+        }
+    }
+}
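
For orientation, a hypothetical instantiation of `impl_asymmetric!` for 128-bit division on a 64-bit target might look like the following. The produced function names and the two helper divisions (`u64_by_u64_div_rem` and the `divq`-backed `u128_by_u64_div_rem` sketched above) are placeholders for this example, not names taken from the commit:

```rust
// Assumed mapping: $uH = u32, $uX = u64, $uD = u128, $iD = i128, so $n_h = 32.
impl_asymmetric!(
    u128_div_rem_asymmetric, // $unsigned_name
    i128_div_rem_asymmetric, // $signed_name
    u64_by_u64_div_rem,      // $half_division: (u64, u64) -> (u64, u64)
    u128_by_u64_div_rem,     // $asymmetric_division: unsafe (u128, u64) -> (u64, u64)
    32,                      // $n_h
    u32,                     // $uH
    u64,                     // $uX
    u128,                    // $uD
    i128,                    // $iD
    inline;                  // attributes for the unsigned function
    inline                   // attributes for the signed function
);

// The expansion would then provide, for example:
// let (quo, rem) = u128_div_rem_asymmetric(0x1_0000_0000_0000_0005, 2);
// // quo == 0x8000_0000_0000_0002, rem == 1
```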
Lines changed: 114 additions & 0 deletions
@@ -0,0 +1,114 @@
+macro_rules! impl_binary_long {
+    (
+        $unsigned_name:ident, // name of the unsigned function
+        $signed_name:ident, // name of the signed function
+        $n:expr, // the number of bits in a $iX or $uX
+        $uX:ident, // unsigned integer that will be shifted
+        $iX:ident, // signed version of $uX
+        $($unsigned_attr:meta),*; // attributes for the unsigned function
+        $($signed_attr:meta),* // attributes for the signed function
+    ) => {
+
+        /// Computes the quotient and remainder of `duo` divided by `div` and returns them as a
+        /// tuple.
+        ///
+        /// This uses binary shift long division only, and is designed for CPUs without fast
+        /// multiplication or division hardware.
+        ///
+        /// # Panics
+        ///
+        /// When attempting to divide by zero, this function will panic.
+        $(
+            #[$unsigned_attr]
+        )*
+        pub fn $unsigned_name(duo: $uX, div: $uX) -> ($uX,$uX) {
+            if div == 0 {
+                // should be unreachable if upstream code has done its job in checking for
+                // zero
+                ::abort();
+            }
+
+            // Full $uX binary long division. Use `leading_zeros` on the first round,
+            // because we assume that the average usage of division has arguments that
+            // are random but have a significant number of leading zero bits. Doing
+            // `leading_zeros` for every round would be very expensive, especially for
+            // CPUs without a native count leading zeros instruction, but doing it just
+            // for the first round is advantageous for both performance of the common
+            // case and for code simplicity. Note that many benchmarks use the full
+            // `n_d` bits for `duo`, and the benchmarks with several bits less have a
+            // good performance increase.
+
+            let div_lz = div.leading_zeros();
+            let duo_lz = duo.leading_zeros();
+
+            if div_lz < duo_lz {
+                return (0, duo)
+            }
+
+            // Figures out how far `div` should be shifted to align most significant
+            // bits
+            let mut shift = div_lz - duo_lz;
+            let mut duo = duo;
+            let mut div = div << shift;
+            let mut quo = 0;
+            loop {
+                // There is a way to do this without branching, but requires too many extra
+                // operations to be faster:
+                // let sub = duo.wrapping_sub(div);
+                // let sign_mask = !(((sub as $iD) >> (n_d - 1)) as $uD);
+                // duo -= div & sign_mask;
+                // quo |= sign_mask & 1;
+                let sub = duo.wrapping_sub(div);
+                if (sub as $iX) >= 0 {
+                    duo = sub;
+                    quo |= 1;
+                    if duo == 0 {
+                        // must have this branch for inputs that are not as
+                        // random as what is used in the benchmarks
+                        return (quo << shift, duo)
+                    }
+                }
+
+                if shift == 0 {
+                    return (quo, duo)
+                }
+                shift -= 1;
+                div >>= 1;
+                quo <<= 1;
+            }
+        }
+
+        /// Computes the quotient and remainder of `duo` divided by `div` and returns them as a
+        /// tuple.
+        ///
+        /// This uses binary shift long division only, and is designed for CPUs without fast
+        /// multiplication or division hardware.
+        ///
+        /// # Panics
+        ///
+        /// When attempting to divide by zero, this function will panic.
+        $(
+            #[$signed_attr]
+        )*
+        pub fn $signed_name(duo: $iX, div: $iX) -> ($iX,$iX) {
+            match (duo < 0, div < 0) {
+                (false,false) => {
+                    let t = $unsigned_name(duo as $uX,div as $uX);
+                    (t.0 as $iX,t.1 as $iX)
+                },
+                (true,false) => {
+                    let t = $unsigned_name(duo.wrapping_neg() as $uX,div as $uX);
+                    ((t.0 as $iX).wrapping_neg(),(t.1 as $iX).wrapping_neg())
+                },
+                (false,true) => {
+                    let t = $unsigned_name(duo as $uX,div.wrapping_neg() as $uX);
+                    ((t.0 as $iX).wrapping_neg(),t.1 as $iX)
+                },
+                (true,true) => {
+                    let t = $unsigned_name(duo.wrapping_neg() as $uX,div.wrapping_neg() as $uX);
+                    (t.0 as $iX,(t.1 as $iX).wrapping_neg())
+                },
+            }
+        }
+    }
+}
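
Similarly, a hypothetical instantiation of `impl_binary_long!` for 32-bit operands (e.g. for a target without a hardware divider) could look like this; the produced function names are again placeholders chosen for the example:

```rust
// Assumed mapping: $n = 32, $uX = u32, $iX = i32.
impl_binary_long!(
    u32_div_rem_binary_long, // $unsigned_name
    i32_div_rem_binary_long, // $signed_name
    32,                      // $n
    u32,                     // $uX
    i32,                     // $iX
    inline;                  // attributes for the unsigned function
    inline                   // attributes for the signed function
);

// The expansion would then provide, for example:
// let (quo, rem) = u32_div_rem_binary_long(100, 7);   // (14, 2)
// let (quo, rem) = i32_div_rem_binary_long(-100, 7);  // (-14, -2)
```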
