Skip to content

Commit 1841ab7

Browse files
committed
optimize popcount implementation
Much like the trailing and leading zero count operations, the population count (popcount) operation has a built-in function within gcc. Unlike these other two operations, which are implemented in terms of these built-ins, the gcc backend implements popcount in terms of a custom implementation of the operation entirely separate from the built-ins gcc provides. This has led to poor codegen in some circumstances. For instance, the gcc backend of rustc currently emits the following for a function that implements popcount for a u32 (x86_64 targeting AVX2, using standard unix calling convention):

    popcount:
        mov eax, edi
        and edi, 1431655765
        shr eax
        and eax, 1431655765
        add edi, eax
        mov edx, edi
        and edi, 858993459
        shr edx, 2
        and edx, 858993459
        add edx, edi
        mov eax, edx
        and edx, 252645135
        shr eax, 4
        and eax, 252645135
        add eax, edx
        mov edx, eax
        and eax, 16711935
        shr edx, 8
        and edx, 16711935
        add edx, eax
        movzx eax, dx
        shr edx, 16
        add eax, edx
        ret

Rather than using this implementation, gcc could be told to use these built-in functions. This would give the same function the following implementation:

    popcount:
        mov eax, edi
        popcnt rax, rax
        ret

This patch implements the popcount operation in terms of gcc's built-ins in all cases, not just the 128-bit case.

Signed-off-by: Andy Sadler <[email protected]>
1 parent 71febd9 commit 1841ab7

File tree

1 file changed

+29
-68
lines changed

1 file changed

+29
-68
lines changed

src/intrinsic/mod.rs

Lines changed: 29 additions & 68 deletions
Original file line numberDiff line numberDiff line change
@@ -811,6 +811,7 @@ impl<'a, 'gcc, 'tcx> Builder<'a, 'gcc, 'tcx> {
811811
let result_type = value.get_type();
812812
let value_type = result_type.to_unsigned(self.cx);
813813

814+
let arg_type = value.get_type();
814815
let value =
815816
if result_type.is_signed(self.cx) {
816817
self.gcc_int_cast(value, value_type)
@@ -819,75 +820,35 @@ impl<'a, 'gcc, 'tcx> Builder<'a, 'gcc, 'tcx> {
819820
value
820821
};
821822

822-
if value_type.is_u128(&self.cx) {
823-
// TODO(antoyo): implement in the normal algorithm below to have a more efficient
824-
// implementation (that does not require a call to __popcountdi2).
825-
let popcount = self.context.get_builtin_function("__builtin_popcountll");
826-
let sixty_four = self.gcc_int(value_type, 64);
827-
let right_shift = self.gcc_lshr(value, sixty_four);
828-
let high = self.gcc_int_cast(right_shift, self.cx.ulonglong_type);
829-
let high = self.context.new_call(None, popcount, &[high]);
830-
let low = self.gcc_int_cast(value, self.cx.ulonglong_type);
831-
let low = self.context.new_call(None, popcount, &[low]);
832-
let res = high + low;
833-
return self.gcc_int_cast(res, result_type);
834-
}
835-
836-
// First step.
837-
let mask = self.context.new_rvalue_from_long(value_type, 0x5555555555555555);
838-
let left = value & mask;
839-
let shifted = value >> self.context.new_rvalue_from_int(value_type, 1);
840-
let right = shifted & mask;
841-
let value = left + right;
842-
843-
// Second step.
844-
let mask = self.context.new_rvalue_from_long(value_type, 0x3333333333333333);
845-
let left = value & mask;
846-
let shifted = value >> self.context.new_rvalue_from_int(value_type, 2);
847-
let right = shifted & mask;
848-
let value = left + right;
849-
850-
// Third step.
851-
let mask = self.context.new_rvalue_from_long(value_type, 0x0F0F0F0F0F0F0F0F);
852-
let left = value & mask;
853-
let shifted = value >> self.context.new_rvalue_from_int(value_type, 4);
854-
let right = shifted & mask;
855-
let value = left + right;
856-
857-
if value_type.is_u8(&self.cx) {
858-
return self.context.new_cast(None, value, result_type);
859-
}
860-
861-
// Fourth step.
862-
let mask = self.context.new_rvalue_from_long(value_type, 0x00FF00FF00FF00FF);
863-
let left = value & mask;
864-
let shifted = value >> self.context.new_rvalue_from_int(value_type, 8);
865-
let right = shifted & mask;
866-
let value = left + right;
867-
868-
if value_type.is_u16(&self.cx) {
869-
return self.context.new_cast(None, value, result_type);
870-
}
871-
872-
// Fifth step.
873-
let mask = self.context.new_rvalue_from_long(value_type, 0x0000FFFF0000FFFF);
874-
let left = value & mask;
875-
let shifted = value >> self.context.new_rvalue_from_int(value_type, 16);
876-
let right = shifted & mask;
877-
let value = left + right;
878-
879-
if value_type.is_u32(&self.cx) {
880-
return self.context.new_cast(None, value, result_type);
881-
}
882-
883-
// Sixth step.
884-
let mask = self.context.new_rvalue_from_long(value_type, 0x00000000FFFFFFFF);
885-
let left = value & mask;
886-
let shifted = value >> self.context.new_rvalue_from_int(value_type, 32);
887-
let right = shifted & mask;
888-
let value = left + right;
823+
let (popcount, expected_type) =
824+
// TODO(antoyo): write a new function Type::is_compatible_with(&Type) and use it here
825+
// instead of using is_uint().
826+
if arg_type.is_uchar(&self.cx) || arg_type.is_ushort(&self.cx) || arg_type.is_uint(&self.cx) {
827+
("__builtin_popcount", self.cx.uint_type)
828+
}
829+
else if arg_type.is_ulong(&self.cx) {
830+
("__builtin_popcountl", self.cx.ulong_type)
831+
}
832+
else if arg_type.is_ulonglong(&self.cx) {
833+
("__builtin_popcountll", self.cx.ulonglong_type)
834+
} else {
835+
// TODO(antoyo): implement in the normal algorithm below to have a more efficient
836+
// implementation (that does not require a call to __popcountdi2).
837+
let popcount = self.context.get_builtin_function("__builtin_popcountll");
838+
let sixty_four = self.gcc_int(value_type, 64);
839+
let right_shift = self.gcc_lshr(value, sixty_four);
840+
let high = self.gcc_int_cast(right_shift, self.cx.ulonglong_type);
841+
let high = self.context.new_call(None, popcount, &[high]);
842+
let low = self.gcc_int_cast(value, self.cx.ulonglong_type);
843+
let low = self.context.new_call(None, popcount, &[low]);
844+
let res = high + low;
845+
return self.gcc_int_cast(res, result_type);
846+
};
889847

890-
self.context.new_cast(None, value, result_type)
848+
let popcount = self.context.get_builtin_function(popcount);
849+
let value = self.context.new_cast(None, value, expected_type);
850+
let res = self.context.new_call(None, popcount, &[value]);
851+
self.context.new_cast(None, res, result_type)
891852
}
892853

893854
// Algorithm from: https://blog.regehr.org/archives/1063

0 commit comments

Comments
 (0)