1
1
#include "cache.h"
2
2
#include "diff.h"
3
3
#include "diffcore.h"
4
- #include "delta.h"
5
- #include "count-delta.h"
6
-
7
- static int diffcore_count_changes_1 (void * src , unsigned long src_size ,
8
- void * dst , unsigned long dst_size ,
9
- unsigned long delta_limit ,
10
- unsigned long * src_copied ,
11
- unsigned long * literal_added )
4
+
5
+ /*
6
+ * Idea here is very simple.
7
+ *
8
+ * We have total of (sz-N+1) N-byte overlapping sequences in buf whose
9
+ * size is sz. If the same N-byte sequence appears in both source and
10
+ * destination, we say the byte that starts that sequence is shared
11
+ * between them (i.e. copied from source to destination).
12
+ *
13
+ * For each possible N-byte sequence, if the source buffer has more
14
+ * instances of it than the destination buffer, that means the
15
+ * difference are the number of bytes not copied from source to
16
+ * destination. If the counts are the same, everything was copied
17
+ * from source to destination. If the destination has more,
18
+ * everything was copied, and destination added more.
19
+ *
20
+ * We are doing an approximation so we do not really have to waste
21
+ * memory by actually storing the sequence. We just hash them into
22
+ * somewhere around 2^16 hashbuckets and count the occurrences.
23
+ *
24
+ * The length of the sequence is arbitrarily set to 8 for now.
25
+ */
26
+
27
+ #define HASHBASE 65537 /* next_prime(2^16) */
28
+
29
+ static void hash_chars (unsigned char * buf , unsigned long sz , int * count )
12
30
{
13
- void * delta ;
14
- unsigned long delta_size ;
15
-
16
- delta = diff_delta (src , src_size ,
17
- dst , dst_size ,
18
- & delta_size , delta_limit );
19
- if (!delta )
20
- /* If delta_limit is exceeded, we have too much differences */
21
- return -1 ;
31
+ unsigned int accum1 , accum2 , i ;
22
32
23
- /* Estimate the edit size by interpreting delta. */
24
- if (count_delta (delta , delta_size , src_copied , literal_added )) {
25
- free (delta );
26
- return -1 ;
33
+ /* an 8-byte shift register made of accum1 and accum2. New
34
+ * bytes come at LSB of accum2, and shifted up to accum1
35
+ */
36
+ for (i = accum1 = accum2 = 0 ; i < 7 ; i ++ , sz -- ) {
37
+ accum1 = (accum1 << 8 ) | (accum2 >> 24 );
38
+ accum2 = (accum2 << 8 ) | * buf ++ ;
39
+ }
40
+ while (sz ) {
41
+ accum1 = (accum1 << 8 ) | (accum2 >> 24 );
42
+ accum2 = (accum2 << 8 ) | * buf ++ ;
43
+ /* We want something that hashes permuted byte
44
+ * sequences nicely; simpler hash like (accum1 ^
45
+ * accum2) does not perform as well.
46
+ */
47
+ i = (accum1 + accum2 * 0x61 ) % HASHBASE ;
48
+ count [i ]++ ;
49
+ sz -- ;
27
50
}
28
- free (delta );
29
- return 0 ;
30
51
}
31
52
32
53
int diffcore_count_changes (void * src , unsigned long src_size ,
@@ -35,9 +56,28 @@ int diffcore_count_changes(void *src, unsigned long src_size,
35
56
unsigned long * src_copied ,
36
57
unsigned long * literal_added )
37
58
{
38
- return diffcore_count_changes_1 (src , src_size ,
39
- dst , dst_size ,
40
- delta_limit ,
41
- src_copied ,
42
- literal_added );
59
+ int * src_count , * dst_count , i ;
60
+ unsigned long sc , la ;
61
+
62
+ if (src_size < 8 || dst_size < 8 )
63
+ return -1 ;
64
+
65
+ src_count = xcalloc (HASHBASE * 2 , sizeof (int ));
66
+ dst_count = src_count + HASHBASE ;
67
+ hash_chars (src , src_size , src_count );
68
+ hash_chars (dst , dst_size , dst_count );
69
+
70
+ sc = la = 0 ;
71
+ for (i = 0 ; i < HASHBASE ; i ++ ) {
72
+ if (src_count [i ] < dst_count [i ]) {
73
+ la += dst_count [i ] - src_count [i ];
74
+ sc += src_count [i ];
75
+ }
76
+ else /* i.e. if (dst_count[i] <= src_count[i]) */
77
+ sc += dst_count [i ];
78
+ }
79
+ * src_copied = sc ;
80
+ * literal_added = la ;
81
+ free (src_count );
82
+ return 0 ;
43
83
}
0 commit comments