Skip to content

Commit d99958c

Browse files
committed
Merge branch 'jt/path-filter-fix' into next
The Bloom filter used for path limited history traversal was broken on systems whose "char" is unsigned; update the implementation and bump the format version to 2. * jt/path-filter-fix: commit-graph: new filter ver. that fixes murmur3 repo-settings: introduce commitgraph.changedPathsVersion t4216: test changed path filters with high bit paths t/helper/test-read-graph: implement `bloom-filters` mode bloom.h: make `load_bloom_filter_from_graph()` public t/helper/test-read-graph.c: extract `dump_graph_info()` gitformat-commit-graph: describe version 2 of BDAT
2 parents 996db74 + 9e4df4d commit d99958c

File tree

12 files changed

+350
-39
lines changed

12 files changed

+350
-39
lines changed

Documentation/config/commitgraph.txt

Lines changed: 23 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,26 @@ commitGraph.maxNewFilters::
99
commit-graph write` (c.f., linkgit:git-commit-graph[1]).
1010

1111
commitGraph.readChangedPaths::
12-
If true, then git will use the changed-path Bloom filters in the
13-
commit-graph file (if it exists, and they are present). Defaults to
14-
true. See linkgit:git-commit-graph[1] for more information.
12+
Deprecated. Equivalent to commitGraph.changedPathsVersion=-1 if true, and
13+
commitGraph.changedPathsVersion=0 if false. (If commitGraph.changedPathVersion
14+
is also set, commitGraph.changedPathsVersion takes precedence.)
15+
16+
commitGraph.changedPathsVersion::
17+
Specifies the version of the changed-path Bloom filters that Git will read and
18+
write. May be -1, 0, 1, or 2.
19+
+
20+
Defaults to -1.
21+
+
22+
If -1, Git will use the version of the changed-path Bloom filters in the
23+
repository, defaulting to 1 if there are none.
24+
+
25+
If 0, Git will not read any Bloom filters, and will write version 1 Bloom
26+
filters when instructed to write.
27+
+
28+
If 1, Git will only read version 1 Bloom filters, and will write version 1
29+
Bloom filters.
30+
+
31+
If 2, Git will only read version 2 Bloom filters, and will write version 2
32+
Bloom filters.
33+
+
34+
See linkgit:git-commit-graph[1] for more information.

Documentation/gitformat-commit-graph.txt

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -142,13 +142,16 @@ All multi-byte numbers are in network byte order.
142142

143143
==== Bloom Filter Data (ID: {'B', 'D', 'A', 'T'}) [Optional]
144144
* It starts with header consisting of three unsigned 32-bit integers:
145-
- Version of the hash algorithm being used. We currently only support
146-
value 1 which corresponds to the 32-bit version of the murmur3 hash
145+
- Version of the hash algorithm being used. We currently support
146+
value 2 which corresponds to the 32-bit version of the murmur3 hash
147147
implemented exactly as described in
148148
https://en.wikipedia.org/wiki/MurmurHash#Algorithm and the double
149149
hashing technique using seed values 0x293ae76f and 0x7e646e2 as
150150
described in https://doi.org/10.1007/978-3-540-30494-4_26 "Bloom Filters
151-
in Probabilistic Verification"
151+
in Probabilistic Verification". Version 1 Bloom filters have a bug that appears
152+
when char is signed and the repository has path names that have characters >=
153+
0x80; Git supports reading and writing them, but this ability will be removed
154+
in a future version of Git.
152155
- The number of times a path is hashed and hence the number of bit positions
153156
that cumulatively determine whether a file is present in the commit.
154157
- The minimum number of bits 'b' per entry in the Bloom filter. If the filter

bloom.c

Lines changed: 69 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -29,9 +29,9 @@ static inline unsigned char get_bitmask(uint32_t pos)
2929
return ((unsigned char)1) << (pos & (BITS_PER_WORD - 1));
3030
}
3131

32-
static int load_bloom_filter_from_graph(struct commit_graph *g,
33-
struct bloom_filter *filter,
34-
uint32_t graph_pos)
32+
int load_bloom_filter_from_graph(struct commit_graph *g,
33+
struct bloom_filter *filter,
34+
uint32_t graph_pos)
3535
{
3636
uint32_t lex_pos, start_index, end_index;
3737

@@ -66,7 +66,64 @@ static int load_bloom_filter_from_graph(struct commit_graph *g,
6666
* Not considered to be cryptographically secure.
6767
* Implemented as described in https://en.wikipedia.org/wiki/MurmurHash#Algorithm
6868
*/
69-
uint32_t murmur3_seeded(uint32_t seed, const char *data, size_t len)
69+
uint32_t murmur3_seeded_v2(uint32_t seed, const char *data, size_t len)
70+
{
71+
const uint32_t c1 = 0xcc9e2d51;
72+
const uint32_t c2 = 0x1b873593;
73+
const uint32_t r1 = 15;
74+
const uint32_t r2 = 13;
75+
const uint32_t m = 5;
76+
const uint32_t n = 0xe6546b64;
77+
int i;
78+
uint32_t k1 = 0;
79+
const char *tail;
80+
81+
int len4 = len / sizeof(uint32_t);
82+
83+
uint32_t k;
84+
for (i = 0; i < len4; i++) {
85+
uint32_t byte1 = (uint32_t)(unsigned char)data[4*i];
86+
uint32_t byte2 = ((uint32_t)(unsigned char)data[4*i + 1]) << 8;
87+
uint32_t byte3 = ((uint32_t)(unsigned char)data[4*i + 2]) << 16;
88+
uint32_t byte4 = ((uint32_t)(unsigned char)data[4*i + 3]) << 24;
89+
k = byte1 | byte2 | byte3 | byte4;
90+
k *= c1;
91+
k = rotate_left(k, r1);
92+
k *= c2;
93+
94+
seed ^= k;
95+
seed = rotate_left(seed, r2) * m + n;
96+
}
97+
98+
tail = (data + len4 * sizeof(uint32_t));
99+
100+
switch (len & (sizeof(uint32_t) - 1)) {
101+
case 3:
102+
k1 ^= ((uint32_t)(unsigned char)tail[2]) << 16;
103+
/*-fallthrough*/
104+
case 2:
105+
k1 ^= ((uint32_t)(unsigned char)tail[1]) << 8;
106+
/*-fallthrough*/
107+
case 1:
108+
k1 ^= ((uint32_t)(unsigned char)tail[0]) << 0;
109+
k1 *= c1;
110+
k1 = rotate_left(k1, r1);
111+
k1 *= c2;
112+
seed ^= k1;
113+
break;
114+
}
115+
116+
seed ^= (uint32_t)len;
117+
seed ^= (seed >> 16);
118+
seed *= 0x85ebca6b;
119+
seed ^= (seed >> 13);
120+
seed *= 0xc2b2ae35;
121+
seed ^= (seed >> 16);
122+
123+
return seed;
124+
}
125+
126+
static uint32_t murmur3_seeded_v1(uint32_t seed, const char *data, size_t len)
70127
{
71128
const uint32_t c1 = 0xcc9e2d51;
72129
const uint32_t c2 = 0x1b873593;
@@ -131,8 +188,14 @@ void fill_bloom_key(const char *data,
131188
int i;
132189
const uint32_t seed0 = 0x293ae76f;
133190
const uint32_t seed1 = 0x7e646e2c;
134-
const uint32_t hash0 = murmur3_seeded(seed0, data, len);
135-
const uint32_t hash1 = murmur3_seeded(seed1, data, len);
191+
uint32_t hash0, hash1;
192+
if (settings->hash_version == 2) {
193+
hash0 = murmur3_seeded_v2(seed0, data, len);
194+
hash1 = murmur3_seeded_v2(seed1, data, len);
195+
} else {
196+
hash0 = murmur3_seeded_v1(seed0, data, len);
197+
hash1 = murmur3_seeded_v1(seed1, data, len);
198+
}
136199

137200
key->hashes = (uint32_t *)xcalloc(settings->num_hashes, sizeof(uint32_t));
138201
for (i = 0; i < settings->num_hashes; i++)

bloom.h

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,13 +3,16 @@
33

44
struct commit;
55
struct repository;
6+
struct commit_graph;
67

78
struct bloom_filter_settings {
89
/*
910
* The version of the hashing technique being used.
10-
* We currently only support version = 1 which is
11+
* The newest version is 2, which is
1112
* the seeded murmur3 hashing technique implemented
12-
* in bloom.c.
13+
* in bloom.c. Bloom filters of version 1 were created
14+
* with prior versions of Git, which had a bug in the
15+
* implementation of the hash function.
1316
*/
1417
uint32_t hash_version;
1518

@@ -68,14 +71,18 @@ struct bloom_key {
6871
uint32_t *hashes;
6972
};
7073

74+
int load_bloom_filter_from_graph(struct commit_graph *g,
75+
struct bloom_filter *filter,
76+
uint32_t graph_pos);
77+
7178
/*
7279
* Calculate the murmur3 32-bit hash value for the given data
7380
* using the given seed.
7481
* Produces a uniformly distributed hash value.
7582
* Not considered to be cryptographically secure.
7683
* Implemented as described in https://en.wikipedia.org/wiki/MurmurHash#Algorithm
7784
*/
78-
uint32_t murmur3_seeded(uint32_t seed, const char *data, size_t len);
85+
uint32_t murmur3_seeded_v2(uint32_t seed, const char *data, size_t len);
7986

8087
void fill_bloom_key(const char *data,
8188
size_t len,

commit-graph.c

Lines changed: 27 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -304,17 +304,26 @@ static int graph_read_oid_lookup(const unsigned char *chunk_start,
304304
return 0;
305305
}
306306

307+
struct graph_read_bloom_data_context {
308+
struct commit_graph *g;
309+
int *commit_graph_changed_paths_version;
310+
};
311+
307312
static int graph_read_bloom_data(const unsigned char *chunk_start,
308313
size_t chunk_size, void *data)
309314
{
310-
struct commit_graph *g = data;
315+
struct graph_read_bloom_data_context *c = data;
316+
struct commit_graph *g = c->g;
311317
uint32_t hash_version;
312-
g->chunk_bloom_data = chunk_start;
313318
hash_version = get_be32(chunk_start);
314319

315-
if (hash_version != 1)
320+
if (*c->commit_graph_changed_paths_version == -1) {
321+
*c->commit_graph_changed_paths_version = hash_version;
322+
} else if (hash_version != *c->commit_graph_changed_paths_version) {
316323
return 0;
324+
}
317325

326+
g->chunk_bloom_data = chunk_start;
318327
g->bloom_filter_settings = xmalloc(sizeof(struct bloom_filter_settings));
319328
g->bloom_filter_settings->hash_version = hash_version;
320329
g->bloom_filter_settings->num_hashes = get_be32(chunk_start + 4);
@@ -401,11 +410,15 @@ struct commit_graph *parse_commit_graph(struct repo_settings *s,
401410
graph->read_generation_data = 1;
402411
}
403412

404-
if (s->commit_graph_read_changed_paths) {
413+
if (s->commit_graph_changed_paths_version) {
414+
struct graph_read_bloom_data_context context = {
415+
.g = graph,
416+
.commit_graph_changed_paths_version = &s->commit_graph_changed_paths_version
417+
};
405418
pair_chunk(cf, GRAPH_CHUNKID_BLOOMINDEXES,
406419
&graph->chunk_bloom_indexes);
407420
read_chunk(cf, GRAPH_CHUNKID_BLOOMDATA,
408-
graph_read_bloom_data, graph);
421+
graph_read_bloom_data, &context);
409422
}
410423

411424
if (graph->chunk_bloom_indexes && graph->chunk_bloom_data) {
@@ -2388,6 +2401,14 @@ int write_commit_graph(struct object_directory *odb,
23882401
ctx->write_generation_data = (get_configured_generation_version(r) == 2);
23892402
ctx->num_generation_data_overflows = 0;
23902403

2404+
if (r->settings.commit_graph_changed_paths_version < -1
2405+
|| r->settings.commit_graph_changed_paths_version > 2) {
2406+
warning(_("attempting to write a commit-graph, but 'commitgraph.changedPathsVersion' (%d) is not supported"),
2407+
r->settings.commit_graph_changed_paths_version);
2408+
return 0;
2409+
}
2410+
bloom_settings.hash_version = r->settings.commit_graph_changed_paths_version == 2
2411+
? 2 : 1;
23912412
bloom_settings.bits_per_entry = git_env_ulong("GIT_TEST_BLOOM_SETTINGS_BITS_PER_ENTRY",
23922413
bloom_settings.bits_per_entry);
23932414
bloom_settings.num_hashes = git_env_ulong("GIT_TEST_BLOOM_SETTINGS_NUM_HASHES",
@@ -2417,7 +2438,7 @@ int write_commit_graph(struct object_directory *odb,
24172438
g = ctx->r->objects->commit_graph;
24182439

24192440
/* We have changed-paths already. Keep them in the next graph */
2420-
if (g && g->chunk_bloom_data) {
2441+
if (g && g->bloom_filter_settings) {
24212442
ctx->changed_paths = 1;
24222443
ctx->bloom_settings = g->bloom_filter_settings;
24232444
}

oss-fuzz/fuzz-commit-graph.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size)
1919
* possible.
2020
*/
2121
the_repository->settings.commit_graph_generation_version = 2;
22-
the_repository->settings.commit_graph_read_changed_paths = 1;
22+
the_repository->settings.commit_graph_changed_paths_version = 1;
2323
g = parse_commit_graph(&the_repository->settings, (void *)data, size);
2424
repo_clear(the_repository);
2525
free_commit_graph(g);

repo-settings.c

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ void prepare_repo_settings(struct repository *r)
2424
int value;
2525
const char *strval;
2626
int manyfiles;
27+
int read_changed_paths;
2728

2829
if (!r->gitdir)
2930
BUG("Cannot add settings for uninitialized repository");
@@ -54,7 +55,10 @@ void prepare_repo_settings(struct repository *r)
5455
/* Commit graph config or default, does not cascade (simple) */
5556
repo_cfg_bool(r, "core.commitgraph", &r->settings.core_commit_graph, 1);
5657
repo_cfg_int(r, "commitgraph.generationversion", &r->settings.commit_graph_generation_version, 2);
57-
repo_cfg_bool(r, "commitgraph.readchangedpaths", &r->settings.commit_graph_read_changed_paths, 1);
58+
repo_cfg_bool(r, "commitgraph.readchangedpaths", &read_changed_paths, 1);
59+
repo_cfg_int(r, "commitgraph.changedpathsversion",
60+
&r->settings.commit_graph_changed_paths_version,
61+
read_changed_paths ? -1 : 0);
5862
repo_cfg_bool(r, "gc.writecommitgraph", &r->settings.gc_write_commit_graph, 1);
5963
repo_cfg_bool(r, "fetch.writecommitgraph", &r->settings.fetch_write_commit_graph, 0);
6064

repository.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ struct repo_settings {
2929

3030
int core_commit_graph;
3131
int commit_graph_generation_version;
32-
int commit_graph_read_changed_paths;
32+
int commit_graph_changed_paths_version;
3333
int gc_write_commit_graph;
3434
int fetch_write_commit_graph;
3535
int command_requires_full_index;

t/helper/test-bloom.c

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ static void get_bloom_filter_for_commit(const struct object_id *commit_oid)
5050

5151
static const char *bloom_usage = "\n"
5252
" test-tool bloom get_murmur3 <string>\n"
53+
" test-tool bloom get_murmur3_seven_highbit\n"
5354
" test-tool bloom generate_filter <string> [<string>...]\n"
5455
" test-tool bloom get_filter_for_commit <commit-hex>\n";
5556

@@ -64,7 +65,13 @@ int cmd__bloom(int argc, const char **argv)
6465
uint32_t hashed;
6566
if (argc < 3)
6667
usage(bloom_usage);
67-
hashed = murmur3_seeded(0, argv[2], strlen(argv[2]));
68+
hashed = murmur3_seeded_v2(0, argv[2], strlen(argv[2]));
69+
printf("Murmur3 Hash with seed=0:0x%08x\n", hashed);
70+
}
71+
72+
if (!strcmp(argv[1], "get_murmur3_seven_highbit")) {
73+
uint32_t hashed;
74+
hashed = murmur3_seeded_v2(0, "\x99\xaa\xbb\xcc\xdd\xee\xff", 7);
6875
printf("Murmur3 Hash with seed=0:0x%08x\n", hashed);
6976
}
7077

0 commit comments

Comments
 (0)