Skip to content

Commit ba23bbc

Browse files
author
Junio C Hamano
committed
diffcore-delta: make change counter to byte oriented again.
The textual line oriented change counter was fun but was not very effective. It tended to overcount the changes. This one changes it to a simple N-letter substring based implementation. Signed-off-by: Junio C Hamano <[email protected]>
1 parent 4d0f39c commit ba23bbc

File tree

1 file changed

+68
-28
lines changed

1 file changed

+68
-28
lines changed

diffcore-delta.c

Lines changed: 68 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -1,32 +1,53 @@
11
#include "cache.h"
22
#include "diff.h"
33
#include "diffcore.h"
4-
#include "delta.h"
5-
#include "count-delta.h"
6-
7-
static int diffcore_count_changes_1(void *src, unsigned long src_size,
8-
void *dst, unsigned long dst_size,
9-
unsigned long delta_limit,
10-
unsigned long *src_copied,
11-
unsigned long *literal_added)
4+
5+
/*
6+
* The idea here is very simple.
7+
*
8+
* We have total of (sz-N+1) N-byte overlapping sequences in buf whose
9+
* size is sz. If the same N-byte sequence appears in both source and
10+
* destination, we say the byte that starts that sequence is shared
11+
* between them (i.e. copied from source to destination).
12+
*
13+
* For each possible N-byte sequence, if the source buffer has more
14+
* instances of it than the destination buffer, that means the
15+
* difference is the number of bytes not copied from source to
16+
* destination. If the counts are the same, everything was copied
17+
* from source to destination. If the destination has more,
18+
* everything was copied, and destination added more.
19+
*
20+
* We are doing an approximation so we do not really have to waste
21+
* memory by actually storing the sequence. We just hash them into
22+
* somewhere around 2^16 hashbuckets and count the occurrences.
23+
*
24+
* The length of the sequence is arbitrarily set to 8 for now.
25+
*/
26+
27+
#define HASHBASE 65537 /* next_prime(2^16) */

/*
 * Tally every overlapping 8-byte sequence of buf into hash buckets.
 *
 * buf:   bytes to scan.
 * sz:    number of bytes in buf.
 * count: array of at least HASHBASE counters.  For each of the
 *        (sz - 7) windows, the counter of the bucket that window
 *        hashes to is incremented; counters are never reset here.
 *
 * A buffer shorter than 8 bytes has no complete window, so count is
 * left untouched.
 */
static void hash_chars(unsigned char *buf, unsigned long sz, int *count)
{
	unsigned int accum1, accum2, i;

	/*
	 * The priming loop below consumes 7 bytes unconditionally; on a
	 * shorter buffer sz would wrap around and the scan would run far
	 * past the end of buf.
	 */
	if (sz < 8)
		return;

	/*
	 * accum1:accum2 form an 8-byte shift register.  New bytes enter
	 * at the LSB of accum2 and its top byte is carried into accum1.
	 * The ">> 24" picks that top byte only when unsigned int is 32
	 * bits wide.
	 */
	for (i = accum1 = accum2 = 0; i < 7; i++, sz--) {
		accum1 = (accum1 << 8) | (accum2 >> 24);
		accum2 = (accum2 << 8) | *buf++;
	}
	while (sz) {
		accum1 = (accum1 << 8) | (accum2 >> 24);
		accum2 = (accum2 << 8) | *buf++;
		/*
		 * We want something that hashes permuted byte
		 * sequences nicely; a simpler hash like (accum1 ^
		 * accum2) does not perform as well.
		 */
		i = (accum1 + accum2 * 0x61) % HASHBASE;
		count[i]++;
		sz--;
	}
}
3152

3253
/*
 * Estimate how much of src is retained in dst by comparing, bucket by
 * bucket, the hash_chars() tallies of the two buffers.
 *
 * Returns -1 when either buffer is shorter than 8 bytes; otherwise
 * stores the estimates through src_copied and literal_added and
 * returns 0.
 *
 * NOTE(review): the diff hunk boundary right below elides the
 * parameter lines between src_size and src_copied; the body uses dst
 * and dst_size, and delta_limit appears only in the removed ("-")
 * lines -- confirm against the full file.
 */
int diffcore_count_changes(void *src, unsigned long src_size,
@@ -35,9 +56,28 @@ int diffcore_count_changes(void *src, unsigned long src_size,
3556
unsigned long *src_copied,
3657
unsigned long *literal_added)
3758
{
38-
return diffcore_count_changes_1(src, src_size,
39-
dst, dst_size,
40-
delta_limit,
41-
src_copied,
42-
literal_added);
59+
int *src_count, *dst_count, i;
60+
unsigned long sc, la;
61+
62+
/* a buffer under 8 bytes has no complete 8-byte sequence to hash */
if (src_size < 8 || dst_size < 8)
63+
return -1;
64+
65+
/*
 * One allocation holds both tables: source counters in the first
 * HASHBASE slots, destination counters in the second HASHBASE slots.
 * hash_chars() only increments, so this presumably relies on xcalloc
 * returning zeroed memory (calloc semantics) -- confirm.
 */
src_count = xcalloc(HASHBASE * 2, sizeof(int));
66+
dst_count = src_count + HASHBASE;
67+
hash_chars(src, src_size, src_count);
68+
hash_chars(dst, dst_size, dst_count);
69+
70+
sc = la = 0;
71+
/*
 * Per bucket, the smaller of the two counts is credited as copied;
 * any surplus on the destination side is credited as added.
 */
for (i = 0; i < HASHBASE; i++) {
72+
if (src_count[i] < dst_count[i]) {
73+
la += dst_count[i] - src_count[i];
74+
sc += src_count[i];
75+
}
76+
else /* i.e. if (dst_count[i] <= src_count[i]) */
77+
sc += dst_count[i];
78+
}
79+
*src_copied = sc;
80+
*literal_added = la;
81+
/* dst_count points into the same allocation, so this frees both tables */
free(src_count);
82+
return 0;
4383
}

0 commit comments

Comments
 (0)