Skip to content

Commit 6942efc

Browse files
trast authored and gitster committed
xdiff: load full words in the inner loop of xdl_hash_record
Redo the hashing loop in xdl_hash_record in a way that loads an entire
'long' at a time, using masking tricks to see when and where we found
the terminating '\n'.

I stole inspiration and code from the posts by Linus Torvalds around

    https://lkml.org/lkml/2012/3/2/452
    https://lkml.org/lkml/2012/3/5/6

His method reads the buffers in sizeof(long) increments, and may thus
overrun it by at most sizeof(long)-1 bytes before it sees the final
newline (or hits the buffer length check). I considered padding out
all buffers by a suitable amount to "catch" the overrun, but

* this does not work for mmap()'d buffers: if you map 4096+8 bytes
  from a 4096 byte file, accessing the last 8 bytes results in a
  SIGBUS on my machine; and

* it would also be extremely ugly because it intrudes deep into the
  unpacking machinery.

So I adapted it to not read beyond the buffer at all. Instead, it
reads the final partial word byte-by-byte and strings it together.
Then it can use the same logic as before to finish the hashing.

So far we enable this only on x86_64, where it provides nice speedup
for diff-related work:

    Test                                  origin/next      tr/xdiff-fast-hash
    -----------------------------------------------------------------------------
    4000.1: log -3000 (baseline)          0.07(0.05+0.02)  0.08(0.06+0.02) +14.3%
    4000.2: log --raw -3000 (tree-only)   0.37(0.33+0.04)  0.37(0.32+0.04) +0.0%
    4000.3: log -p -3000 (Myers)          1.75(1.65+0.09)  1.60(1.49+0.10) -8.6%
    4000.4: log -p -3000 --histogram      1.73(1.62+0.09)  1.58(1.49+0.08) -8.7%
    4000.5: log -p -3000 --patience       2.11(2.00+0.10)  1.94(1.80+0.11) -8.1%

Perhaps other platforms could also benefit. However it does NOT work
on big-endian systems!

[jc: minimum style and compilation fixes]

Signed-off-by: Thomas Rast <[email protected]>
Signed-off-by: Junio C Hamano <[email protected]>
1 parent e8dde3e commit 6942efc

File tree

2 files changed

+124
-0
lines changed

2 files changed

+124
-0
lines changed

Makefile

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -288,6 +288,11 @@ all::
288288
# dependency rules.
289289
#
290290
# Define NATIVE_CRLF if your platform uses CRLF for line endings.
291+
#
292+
# Define XDL_FAST_HASH to use an alternative line-hashing method in
293+
# the diff algorithm. It gives a nice speedup if your processor has
294+
# fast unaligned word loads. Does NOT work on big-endian systems!
295+
# Enabled by default on x86_64.
291296

292297
GIT-VERSION-FILE: FORCE
293298
@$(SHELL_PATH) ./GIT-VERSION-GEN
@@ -864,6 +869,9 @@ EXTLIBS =
864869
# because maintaining the nesting to match is a pain. If
865870
# we had "elif" things would have been much nicer...
866871

872+
ifeq ($(uname_M),x86_64)
873+
XDL_FAST_HASH = YesPlease
874+
endif
867875
ifeq ($(uname_S),OSF1)
868876
# Need this for u_short definitions et al
869877
BASIC_CFLAGS += -D_OSF_SOURCE
@@ -1737,6 +1745,10 @@ ifndef NO_MSGFMT_EXTENDED_OPTIONS
17371745
MSGFMT += --check --statistics
17381746
endif
17391747

1748+
ifneq (,$(XDL_FAST_HASH))
1749+
BASIC_CFLAGS += -DXDL_FAST_HASH
1750+
endif
1751+
17401752
ifeq ($(TCLTK_PATH),)
17411753
NO_TCLTK=NoThanks
17421754
endif

xdiff/xutils.c

Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,8 @@
2020
*
2121
*/
2222

23+
#include <limits.h>
#include <assert.h>
#include <string.h>
#include "xinclude.h"
2426

2527

@@ -276,6 +278,115 @@ static unsigned long xdl_hash_record_with_whitespace(char const **data,
276278
return ha;
277279
}
278280

281+
#ifdef XDL_FAST_HASH
282+
283+
/*
 * Byte-pattern constants, one copy of the given byte in every byte of
 * an unsigned long.  They are derived from the width of unsigned long
 * (~0ul / 0xff is 0x0101...01) instead of being spelled out as 64-bit
 * literals, so they also fit where unsigned long is 32 bits wide.
 */
#define REPEAT_BYTE(x)	((~0ul / 0xff) * (x))
#define ONEBYTES	REPEAT_BYTE(0x01)
#define NEWLINEBYTES	REPEAT_BYTE(0x0a)
#define HIGHBITS	REPEAT_BYTE(0x80)

/*
 * Detect zero bytes in the word 'a'.
 *
 * Returns 0 if no byte of 'a' is zero.  Otherwise the lowest set bit
 * of the result is the high bit (0x80) of the least significant zero
 * byte.  Bits above that one may also be set even though their bytes
 * are non-zero (the subtraction borrows across byte boundaries), so
 * callers must only rely on the lowest set bit.
 */
static inline unsigned long has_zero(unsigned long a)
{
	return ((a - ONEBYTES) & ~a) & HIGHBITS;
}
292+
293+
/*
 * Count the bytes covered by a low-order byte mask.
 *
 * 'mask' must have the form 0, 0xff, 0xffff, ... (some number of low
 * bytes fully set, everything above clear); the return value is the
 * number of 0xff bytes in it.
 *
 * The variant is chosen on sizeof(long) rather than on __WORDSIZE:
 * __WORDSIZE is not provided by every C library, and when it is
 * missing "#if __WORDSIZE == 64" silently selects the 32-bit formula,
 * which gives wrong counts for masks wider than three bytes.  The
 * test is a compile-time constant, so only one branch survives.
 */
static inline long count_masked_bytes(unsigned long mask)
{
	if (sizeof(long) == 8) {
		/*
		 * Jan Achrenius on G+: microoptimized version of
		 * the simpler "(mask & ONEBYTES) * ONEBYTES >> 56"
		 * that works for the bytemasks without having to
		 * mask them first.
		 *
		 * The arithmetic is done in unsigned long long so the
		 * 64-bit constant and the shift by 56 stay valid when
		 * this (then dead) branch is compiled with a 32-bit long.
		 */
		return (long)((unsigned long long)mask *
			      0x0001020304050608ULL >> 56);
	} else {
		/*
		 * Carl Chatfield / Jan Achrenius G+ version for 32-bit:
		 * (000000 0000ff 00ffff ffffff) -> (1 1 2 3),
		 * computed on unsigned values so no negative number is
		 * ever right-shifted.
		 */
		unsigned long a = (0x0ff0001 + mask) >> 23;
		/* Fix the 1 for the all-zero mask */
		return (long)(a & mask);
	}
}
325+
326+
unsigned long xdl_hash_record(char const **data, char const *top, long flags)
327+
{
328+
unsigned long hash = 5381;
329+
unsigned long a = 0, mask = 0;
330+
char const *ptr = *data;
331+
char const *end = top - sizeof(unsigned long) + 1;
332+
333+
if (flags & XDF_WHITESPACE_FLAGS)
334+
return xdl_hash_record_with_whitespace(data, top, flags);
335+
336+
ptr -= sizeof(unsigned long);
337+
do {
338+
hash += hash << 5;
339+
hash ^= a;
340+
ptr += sizeof(unsigned long);
341+
if (ptr >= end)
342+
break;
343+
a = *(unsigned long *)ptr;
344+
/* Do we have any '\n' bytes in this word? */
345+
mask = has_zero(a ^ NEWLINEBYTES);
346+
} while (!mask);
347+
348+
if (ptr >= end) {
349+
/*
350+
* There is only a partial word left at the end of the
351+
* buffer. Because we may work with a memory mapping,
352+
* we have to grab the rest byte by byte instead of
353+
* blindly reading it.
354+
*
355+
* To avoid problems with masking in a signed value,
356+
* we use an unsigned char here.
357+
*/
358+
const char *p;
359+
for (p = top - 1; p >= ptr; p--)
360+
a = (a << 8) + *((const unsigned char *)p);
361+
mask = has_zero(a ^ NEWLINEBYTES);
362+
if (!mask)
363+
/*
364+
* No '\n' found in the partial word. Make a
365+
* mask that matches what we read.
366+
*/
367+
mask = 1UL << (8 * (top - ptr) + 7);
368+
}
369+
370+
/* The mask *below* the first high bit set */
371+
mask = (mask - 1) & ~mask;
372+
mask >>= 7;
373+
hash += hash << 5;
374+
hash ^= a & mask;
375+
376+
/* Advance past the last (possibly partial) word */
377+
ptr += count_masked_bytes(mask);
378+
379+
if (ptr < top) {
380+
assert(*ptr == '\n');
381+
ptr++;
382+
}
383+
384+
*data = ptr;
385+
386+
return hash;
387+
}
388+
389+
#else /* XDL_FAST_HASH */
279390

280391
unsigned long xdl_hash_record(char const **data, char const *top, long flags) {
281392
unsigned long ha = 5381;
@@ -293,6 +404,7 @@ unsigned long xdl_hash_record(char const **data, char const *top, long flags) {
293404
return ha;
294405
}
295406

407+
#endif /* XDL_FAST_HASH */
296408

297409
unsigned int xdl_hashbits(unsigned int size) {
298410
unsigned int val = 1, bits = 0;

0 commit comments

Comments
 (0)