Skip to content
This repository was archived by the owner on Apr 23, 2020. It is now read-only.

Commit 274527c

Browse files
committed
Resubmit r325107 (case folding DJB hash)
The issue was that the hash function was generating different results depending on the signedness of char on the host platform. This commit fixes the issue by explicitly using an unsigned char type to prevent sign extension and adds some extra tests. The original commit message was: This patch implements a variant of the DJB hash function which folds the input according to the algorithm in the DWARF 5 specification (Section 6.1.1.4.5), which in turn references the Unicode Standard (Section 5.18, "Case Mappings"). To achieve this, I have added a llvm::sys::unicode::foldCharSimple function, which performs this mapping. The implementation of this function was generated from the CaseFolding.txt file from the Unicode spec using a Python script (which is also included in this patch). The script tries to optimize the function by coalescing adjacent mappings with the same shift and stride (terms I made up). Theoretically, it could be made a bit smarter and merge adjacent blocks that were interrupted by only one or two characters with exceptional mapping, but this would save only a couple of branches, while it would greatly complicate the implementation, so I deemed it was not worth it. Since we assume that the vast majority of the input characters will be US-ASCII, the folding hash function has a fast-path for handling these, and only whips out the full decode+fold+encode logic if we encounter a character outside of this range. It might be possible to implement the folding directly on UTF-8 sequences, but this would also bring a lot of complexity for the few cases where we will actually need to process non-ASCII characters. Reviewers: JDevlieghere, aprantl, probinson, dblaikie Subscribers: mgorny, hintonda, echristo, clayborg, vleschuk, llvm-commits Differential Revision: https://reviews.llvm.org/D42740 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@325732 91177308-0d34-0410-b5e6-96231b3b80d8
1 parent fda791f commit 274527c

File tree

8 files changed

+1063
-2
lines changed

8 files changed

+1063
-2
lines changed

include/llvm/Support/DJB.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,10 @@ namespace llvm {
2020

2121
/// The Bernstein hash function used by the DWARF accelerator tables.
2222
uint32_t djbHash(StringRef Buffer, uint32_t H = 5381);
23+
24+
/// Computes the Bernstein hash after folding the input according to the Dwarf 5
25+
/// standard case folding rules.
26+
uint32_t caseFoldingDjbHash(StringRef Buffer, uint32_t H = 5381);
2327
} // namespace llvm
2428

2529
#endif // LLVM_SUPPORT_DJB_H

include/llvm/Support/Unicode.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,10 @@ bool isPrintable(int UCS);
6060
/// * 1 for each of the remaining characters.
6161
int columnWidthUTF8(StringRef Text);
6262

63+
/// Fold input unicode character according to the Simple unicode case folding
64+
/// rules.
65+
int foldCharSimple(int C);
66+
6367
} // namespace unicode
6468
} // namespace sys
6569
} // namespace llvm

lib/Support/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,7 @@ add_llvm_library(LLVMSupport
113113
Triple.cpp
114114
Twine.cpp
115115
Unicode.cpp
116+
UnicodeCaseFold.cpp
116117
YAMLParser.cpp
117118
YAMLTraits.cpp
118119
raw_os_ostream.cpp

lib/Support/DJB.cpp

Lines changed: 78 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,9 +12,85 @@
1212
//===----------------------------------------------------------------------===//
1313

1414
#include "llvm/Support/DJB.h"
15+
#include "llvm/ADT/ArrayRef.h"
16+
#include "llvm/Support/Compiler.h"
17+
#include "llvm/Support/ConvertUTF.h"
18+
#include "llvm/Support/Unicode.h"
19+
20+
using namespace llvm;
21+
22+
/// Mixes one byte into a running DJB hash and returns the new hash value.
/// The expression (H << 5) + H + C is H * 33 + C in uint32_t arithmetic.
/// C is declared unsigned char so the byte value is never sign-extended
/// before it is added to H.
static inline uint32_t djbHashChar(unsigned char C, uint32_t H) {
23+
return (H << 5) + H + C;
24+
}
1525

1626
/// Hashes every byte of Buffer, starting from the seed H, and returns the
/// result. In this diff the removed lines (marked "-") iterated with a plain
/// char; the added lines (marked "+") iterate with unsigned char and delegate
/// the per-byte step to djbHashChar.
uint32_t llvm::djbHash(StringRef Buffer, uint32_t H) {
17-
for (char C : Buffer.bytes())
18-
H = ((H << 5) + H) + C;
27+
for (unsigned char C : Buffer.bytes())
28+
H = djbHashChar(C, H);
29+
return H;
30+
}
31+
32+
/// Decodes a single UTF-32 value from the front of Buffer and removes the
/// consumed bytes from Buffer (Buffer is taken by reference and reassigned).
/// The conversion target is exactly one UTF32 slot (&C .. &C + 1) and the
/// conversion runs with lenientConversion. Buffer must be non-empty; this is
/// asserted. Returns the decoded value.
static UTF32 chopOneUTF32(StringRef &Buffer) {
33+
UTF32 C;
34+
const UTF8 *const Begin8Const =
35+
reinterpret_cast<const UTF8 *>(Buffer.begin());
36+
const UTF8 *Begin8 = Begin8Const;
37+
UTF32 *Begin32 = &C;
38+
39+
// In lenient mode we will always end up with a "reasonable" value in C for
40+
// non-empty input.
41+
assert(!Buffer.empty());
42+
ConvertUTF8toUTF32(&Begin8, reinterpret_cast<const UTF8 *>(Buffer.end()),
43+
&Begin32, &C + 1, lenientConversion);
44+
Buffer = Buffer.drop_front(Begin8 - Begin8Const);
45+
return C;
46+
}
47+
48+
/// Encodes the single UTF-32 value C as UTF-8 into Storage and returns a
/// StringRef covering the bytes that were written. The returned StringRef
/// points into Storage, so it is only usable while Storage is alive.
/// The conversion runs with strictConversion and its result is asserted to be
/// conversionOK; the (void)CR keeps CR referenced when the assert is compiled
/// out.
static StringRef toUTF8(UTF32 C, MutableArrayRef<UTF8> Storage) {
49+
const UTF32 *Begin32 = &C;
50+
UTF8 *Begin8 = Storage.begin();
51+
52+
// The case-folded output should always be a valid unicode character, so use
53+
// strict mode here.
54+
ConversionResult CR = ConvertUTF32toUTF8(&Begin32, &C + 1, &Begin8,
55+
Storage.end(), strictConversion);
56+
assert(CR == conversionOK && "Case folding produced invalid char?");
57+
(void)CR;
58+
return StringRef(reinterpret_cast<char *>(Storage.begin()),
59+
Begin8 - Storage.begin());
60+
}
61+
62+
/// Returns the case-folded form of C used by the case-folding hash: the two
/// code points 0x130 and 0x131 are mapped to 'i', and every other value is
/// passed to sys::unicode::foldCharSimple.
static UTF32 foldCharDwarf(UTF32 C) {
63+
// DWARF v5 addition to the unicode folding rules.
64+
// Fold "Latin Small Letter Dotless I" and "Latin Capital Letter I With Dot
65+
// Above" into "i".
66+
if (C == 0x130 || C == 0x131)
67+
return 'i';
68+
return sys::unicode::foldCharSimple(C);
69+
}
70+
71+
/// Slow path of the case-folding hash. Removes one decoded character from the
/// front of Buffer (Buffer is advanced through the reference), folds it with
/// foldCharDwarf, re-encodes the folded value as UTF-8 into a local array,
/// and feeds those bytes into djbHash with H as the seed. Returns the updated
/// hash.
static uint32_t caseFoldingDjbHashCharSlow(StringRef &Buffer, uint32_t H) {
72+
UTF32 C = chopOneUTF32(Buffer);
73+
74+
C = foldCharDwarf(C);
75+
76+
std::array<UTF8, UNI_MAX_UTF8_BYTES_PER_CODE_POINT> Storage;
77+
StringRef Folded = toUTF8(C, Storage);
78+
return djbHash(Folded, H);
79+
}
80+
81+
/// Computes the DJB hash of Buffer after case folding, starting from seed H.
/// The loop consumes Buffer from the front until it is empty. A leading byte
/// <= 0x7f is handled inline: 'A'..'Z' is mapped to 'a'..'z', the byte is
/// hashed with djbHashChar, and one byte is dropped. Any other leading byte is
/// handed to caseFoldingDjbHashCharSlow, which advances Buffer itself.
uint32_t llvm::caseFoldingDjbHash(StringRef Buffer, uint32_t H) {
82+
while (!Buffer.empty()) {
83+
unsigned char C = Buffer.front();
84+
if (LLVM_LIKELY(C <= 0x7f)) {
85+
// US-ASCII, encoded as one character in utf-8.
86+
// This is by far the most common case, so handle this specially.
87+
if (C >= 'A' && C <= 'Z')
88+
C = 'a' + (C - 'A'); // fold uppercase into lowercase
89+
H = djbHashChar(C, H);
90+
Buffer = Buffer.drop_front();
91+
continue;
92+
}
93+
H = caseFoldingDjbHashCharSlow(Buffer, H);
94+
}
1995
return H;
2096
}

0 commit comments

Comments
 (0)