From 282f906d1b4767d95e2a66072c280c2294a93a9f Mon Sep 17 00:00:00 2001 From: Elijah Newren Date: Mon, 20 Apr 2026 07:56:42 -0700 Subject: [PATCH 1/5] merge-ort: propagate callback errors from traverse_trees_wrapper() traverse_trees_wrapper() saves entries from a first pass through traverse_trees() and then replays them through the real callback (collect_merge_info_callback). However, the replay loop silently discards the callback return value. This means any error reported by the callback during replay -- including a future check for malformed trees -- would be ignored, allowing the merge to proceed with corrupt state. Capture the return value, stop the loop on negative (error) returns, and propagate the error to the caller. Note that the callback returns a positive mask value on success, so we normalize non-negative returns to 0 for the caller. Signed-off-by: Elijah Newren --- merge-ort.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/merge-ort.c b/merge-ort.c index 00923ce3cd749b..4b8e32209d9b3a 100644 --- a/merge-ort.c +++ b/merge-ort.c @@ -1008,18 +1008,20 @@ static int traverse_trees_wrapper(struct index_state *istate, info->traverse_path = renames->callback_data_traverse_path; info->fn = old_fn; for (i = old_offset; i < renames->callback_data_nr; ++i) { - info->fn(n, - renames->callback_data[i].mask, - renames->callback_data[i].dirmask, - renames->callback_data[i].names, - info); + ret = info->fn(n, + renames->callback_data[i].mask, + renames->callback_data[i].dirmask, + renames->callback_data[i].names, + info); + if (ret < 0) + break; } renames->callback_data_nr = old_offset; free(renames->callback_data_traverse_path); renames->callback_data_traverse_path = old_callback_data_traverse_path; info->traverse_path = NULL; - return 0; + return ret < 0 ? ret : 0; } static void setup_path_info(struct merge_options *opt, From 949b5d8e3f3aefd9497a7b85d860259b9d5db418 Mon Sep 17 00:00:00 2001 From: Elijah Newren Date: Mon, 20 Apr 2026 07:57:12 -0700 Subject: [PATCH 2/5] merge-ort: drop unnecessary show_all_errors from collect_merge_info() collect_merge_info() has set info.show_all_errors = 1 since d2bc1994f363 (merge-ort: implement a very basic collect_merge_info(), 2020-12-13). This setting was copied from unpack-trees.c where it controls batching of error messages for porcelain display, but merge-ort has no such error-batching logic and never needed it. With show_all_errors set, traverse_trees() captures a negative callback return but continues processing remaining entries rather than stopping immediately. Removing the setting restores the default behavior where a negative return from collect_merge_info_callback() breaks out of the traversal loop right away, allowing a future commit to exit early when a corrupt tree is detected. Signed-off-by: Elijah Newren --- merge-ort.c | 1 - 1 file changed, 1 deletion(-) diff --git a/merge-ort.c b/merge-ort.c index 4b8e32209d9b3a..74e9636020fe40 100644 --- a/merge-ort.c +++ b/merge-ort.c @@ -1740,7 +1740,6 @@ static int collect_merge_info(struct merge_options *opt, setup_traverse_info(&info, opt->priv->toplevel_dir); info.fn = collect_merge_info_callback; info.data = opt; - info.show_all_errors = 1; if (repo_parse_tree(opt->repo, merge_base) < 0 || repo_parse_tree(opt->repo, side1) < 0 || From d422f73e129535ffc7b24e64f4cfae1335f25452 Mon Sep 17 00:00:00 2001 From: Elijah Newren Date: Mon, 20 Apr 2026 14:50:13 -0700 Subject: [PATCH 3/5] merge-ort: free diff pairs queue in clear_or_reinit_internal_opts() clear_or_reinit_internal_opts() is responsible for cleaning up the various data structures in merge_options_internal. It already handles many renames-related structures (dirs_removed, dir_renames, relevant_sources, cached_pairs, deferred, etc.) but does not free renames->pairs[].queue. In the normal code path, resolve_and_process_renames() frees pairs[s].queue and reinitializes it with diff_queue_init() before clear_or_reinit_internal_opts() runs, so the omission is harmless. However, if collect_merge_info() encounters an error and returns early (before resolve_and_process_renames() is ever called), any diff pairs already queued by collect_rename_info()/add_pair() will have their backing array leaked. Fix this by freeing renames->pairs[].queue in the cleanup function. In the normal path the pointer is already NULL (from the earlier diff_queue_init() in resolve_and_process_renames()), so free(NULL) is a safe no-op. Signed-off-by: Elijah Newren --- merge-ort.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/merge-ort.c b/merge-ort.c index 74e9636020fe40..8f911cb63979eb 100644 --- a/merge-ort.c +++ b/merge-ort.c @@ -728,6 +728,8 @@ static void clear_or_reinit_internal_opts(struct merge_options_internal *opti, strintmap_clear_func(&renames->deferred[i].possible_trivial_merges); strset_clear_func(&renames->deferred[i].target_dirs); renames->deferred[i].trivial_merges_okay = 1; /* 1 == maybe */ + free(renames->pairs[i].queue); + diff_queue_init(&renames->pairs[i]); } renames->cached_pairs_valid_side = 0; renames->dir_rename_mask = 0; From 0d81c027aafcb386398836ffc73b058b7ea4c702 Mon Sep 17 00:00:00 2001 From: Elijah Newren Date: Mon, 20 Apr 2026 07:57:37 -0700 Subject: [PATCH 4/5] merge-ort: abort merge when trees have duplicate entries Trees with duplicate entries are malformed; fsck reports "contains duplicate file entries" for them. merge-ort has from the beginning assumed that we would never hit such trees. It was written with the assumption that traverse_trees() calls collect_merge_info_callback() at most once per path. The "sanity checks" in that callback (added in d2bc1994f363 (merge-ort: implement a very basic collect_merge_info(), 2020-12-13)) verify properties of each individual call but not that invariant. The strmap_put() in setup_path_info() silently overwrites the entry from any prior call for the same path, because it assumed there would be no other path. Unfortunately, supplemental data structures for various optimizations could still be tweaked before the extra paths were overwritten, and those data structures not matching expected state could trip various assertions. Change the return type of setup_path_info() from void to int to allow us to detect this case, and abort the merge with a clear error message when it occurs. Signed-off-by: Elijah Newren --- merge-ort.c | 61 ++++++++++++++++------------ t/t6422-merge-rename-corner-cases.sh | 54 ++++++++++++++++++++++++ 2 files changed, 88 insertions(+), 27 deletions(-) diff --git a/merge-ort.c b/merge-ort.c index 8f911cb63979eb..be0829bbb781ef 100644 --- a/merge-ort.c +++ b/merge-ort.c @@ -1026,18 +1026,18 @@ static int traverse_trees_wrapper(struct index_state *istate, return ret < 0 ? ret : 0; } -static void setup_path_info(struct merge_options *opt, - struct string_list_item *result, - const char *current_dir_name, - int current_dir_name_len, - char *fullpath, /* we'll take over ownership */ - struct name_entry *names, - struct name_entry *merged_version, - unsigned is_null, /* boolean */ - unsigned df_conflict, /* boolean */ - unsigned filemask, - unsigned dirmask, - int resolved /* boolean */) +static int setup_path_info(struct merge_options *opt, + struct string_list_item *result, + const char *current_dir_name, + int current_dir_name_len, + char *fullpath, /* we'll take over ownership */ + struct name_entry *names, + struct name_entry *merged_version, + unsigned is_null, /* boolean */ + unsigned df_conflict, /* boolean */ + unsigned filemask, + unsigned dirmask, + int resolved /* boolean */) { /* result->util is void*, so mi is a convenience typed variable */ struct merged_info *mi; @@ -1081,9 +1081,11 @@ static void setup_path_info(struct merge_options *opt, */ mi->is_null = 1; } - strmap_put(&opt->priv->paths, fullpath, mi); + if (strmap_put(&opt->priv->paths, fullpath, mi)) + return error(_("tree has duplicate entries for '%s'"), fullpath); result->string = fullpath; result->util = mi; + return 0; } static void add_pair(struct merge_options *opt, @@ -1350,9 +1352,10 @@ static int collect_merge_info_callback(int n, */ if (side1_matches_mbase && side2_matches_mbase) { /* mbase, side1, & side2 all match; use mbase as resolution */ - setup_path_info(opt, &pi, dirname, info->pathlen, fullpath, - names, names+0, mbase_null, 0 /* df_conflict */, - filemask, dirmask, 1 /* resolved */); + if (setup_path_info(opt, &pi, dirname, info->pathlen, fullpath, + names, names+0, mbase_null, 0 /* df_conflict */, + filemask, dirmask, 1 /* resolved */)) + return -1; /* Quit traversing */ return mask; } @@ -1364,9 +1367,10 @@ static int collect_merge_info_callback(int n, */ if (sides_match && filemask == 0x07) { /* use side1 (== side2) version as resolution */ - setup_path_info(opt, &pi, dirname, info->pathlen, fullpath, - names, names+1, side1_null, 0, - filemask, dirmask, 1); + if (setup_path_info(opt, &pi, dirname, info->pathlen, fullpath, + names, names+1, side1_null, 0, + filemask, dirmask, 1)) + return -1; /* Quit traversing */ return mask; } @@ -1378,18 +1382,20 @@ static int collect_merge_info_callback(int n, */ if (side1_matches_mbase && filemask == 0x07) { /* use side2 version as resolution */ - setup_path_info(opt, &pi, dirname, info->pathlen, fullpath, - names, names+2, side2_null, 0, - filemask, dirmask, 1); + if (setup_path_info(opt, &pi, dirname, info->pathlen, fullpath, + names, names+2, side2_null, 0, + filemask, dirmask, 1)) + return -1; /* Quit traversing */ return mask; } /* Similar to above but swapping sides 1 and 2 */ if (side2_matches_mbase && filemask == 0x07) { /* use side1 version as resolution */ - setup_path_info(opt, &pi, dirname, info->pathlen, fullpath, - names, names+1, side1_null, 0, - filemask, dirmask, 1); + if (setup_path_info(opt, &pi, dirname, info->pathlen, fullpath, + names, names+1, side1_null, 0, + filemask, dirmask, 1)) + return -1; /* Quit traversing */ return mask; } @@ -1413,8 +1419,9 @@ static int collect_merge_info_callback(int n, * unconflict some more cases, but that comes later so all we can * do now is record the different non-null file hashes.) */ - setup_path_info(opt, &pi, dirname, info->pathlen, fullpath, - names, NULL, 0, df_conflict, filemask, dirmask, 0); + if (setup_path_info(opt, &pi, dirname, info->pathlen, fullpath, + names, NULL, 0, df_conflict, filemask, dirmask, 0)) + return -1; /* Quit traversing */ ci = pi.util; VERIFY_CI(ci); diff --git a/t/t6422-merge-rename-corner-cases.sh b/t/t6422-merge-rename-corner-cases.sh index e18d5a227d54f7..81b645bb3bdc5b 100755 --- a/t/t6422-merge-rename-corner-cases.sh +++ b/t/t6422-merge-rename-corner-cases.sh @@ -1525,4 +1525,58 @@ test_expect_success 'submodule/directory preliminary conflict' ' ) ' +# Testcase: submodule/directory conflict with duplicate tree entries +# One side has a path as a gitlink (submodule). The other side replaces +# the gitlink with a directory. A third-party tool creates a tree on the +# submodule side that has *both* a gitlink and a tree entry for the same +# path (adding a file inside the submodule path ignoring that there's a +# gitlink there). collect_merge_info_callback() should detect the +# duplicate and abort rather than silently corrupting its bookkeeping. + +test_expect_success 'duplicate tree entries trigger an error' ' + test_when_finished "rm -rf duplicate-entry" && + git init duplicate-entry && + ( + cd duplicate-entry && + + # Base commit: "docs" is a gitlink (submodule) + empty_tree=$(git mktree file.txt && + git add file.txt && + git commit -m base && + + # side1: remove the gitlink, replace with a directory + git checkout -b side1 && + git rm --cached docs && + mkdir -p docs && + echo hello >docs/requirements.txt && + git add docs/requirements.txt && + git commit -m "side1: submodule to directory" && + + # side2: keep the gitlink but craft a tree that also + # contains a tree entry for "docs" (simulating a tool + # that adds files inside a submodule path without + # removing the gitlink first). + git checkout main && + git checkout -b side2 && + blob_oid=$(echo world | git hash-object -w --stdin) && + docs_tree=$(printf "100644 blob %s\trequirements.txt\n" \ + "$blob_oid" | git mktree) && + cur_tree=$(git rev-parse HEAD^{tree}) && + git cat-file -p $cur_tree >tree-listing && + printf "040000 tree %s\tdocs\n" "$docs_tree" >>tree-listing && + new_tree=$(git mktree err && + test_grep "duplicate entries" err + ) +' + test_done From a87bbaa84fd5dcb2a585f82c4a5dfa1572b54588 Mon Sep 17 00:00:00 2001 From: Elijah Newren Date: Sat, 18 Apr 2026 11:43:14 -0700 Subject: [PATCH 5/5] cache-tree: fix verify_cache() to catch non-adjacent D/F conflicts verify_cache() checks that the index does not contain both "path" and "path/file" before writing a tree. It does this by comparing only adjacent entries, relying on the assumption that "path/file" would immediately follow "path" in sorted order. Unfortunately, this assumption does not always hold. For example: docs <-- submodule entry docs-internal/README.md <-- intervening entry docs/requirements.txt <-- D/F conflict, NOT adjacent to "docs" When this happens, verify_cache() silently misses the D/F conflict and write-tree produces a corrupt tree object containing duplicate entries (one for the submodule "docs" and one for the tree "docs"). I could not find any caller in current git that both allows the index to get into this state and then tries to write it out without doing other checks beyond the verify_cache() call in cache_tree_update(), but verify_cache() is documented as a safety net for preventing corrupt trees and should actually provide that guarantee. A downstream consumer that relied solely on cache_tree_update()'s internal checking via verify_cache() to prevent duplicate tree entries was bitten by the gap. Add a test that constructs a corrupt index directly (bypassing the D/F checks in add_index_entry) and verifies that write-tree now rejects it. Signed-off-by: Elijah Newren --- cache-tree.c | 46 ++++++++++++++++++++++++-- t/meson.build | 1 + t/t0093-direct-index-write.pl | 38 ++++++++++++++++++++++ t/t0093-verify-cache-df-gap.sh | 59 ++++++++++++++++++++++++++++++++++ 4 files changed, 141 insertions(+), 3 deletions(-) create mode 100644 t/t0093-direct-index-write.pl create mode 100755 t/t0093-verify-cache-df-gap.sh diff --git a/cache-tree.c b/cache-tree.c index 7881b42aa24c80..f11844fe72020e 100644 --- a/cache-tree.c +++ b/cache-tree.c @@ -192,22 +192,62 @@ static int verify_cache(struct index_state *istate, int flags) for (i = 0; i + 1 < istate->cache_nr; i++) { /* path/file always comes after path because of the way * the cache is sorted. Also path can appear only once, - * which means conflicting one would immediately follow. + * so path/file is likely the immediately following path + * but might be separated if there is e.g. a + * path-internal/... file. */ const struct cache_entry *this_ce = istate->cache[i]; const struct cache_entry *next_ce = istate->cache[i + 1]; const char *this_name = this_ce->name; const char *next_name = next_ce->name; int this_len = ce_namelen(this_ce); + const char *conflict_name = NULL; + if (this_len < ce_namelen(next_ce) && - next_name[this_len] == '/' && + next_name[this_len] <= '/' && strncmp(this_name, next_name, this_len) == 0) { + if (next_name[this_len] == '/') { + conflict_name = next_name; + } else if (next_name[this_len] < '/') { + /* + * The immediately next entry shares our + * prefix but sorts before "path/" (e.g., + * "path-internal" between "path" and + * "path/file", since '-' (0x2D) < '/' + * (0x2F)). Binary search to find where + * "path/" would be and check for a D/F + * conflict there. + */ + struct cache_entry *other; + struct strbuf probe = STRBUF_INIT; + int pos; + + strbuf_add(&probe, this_name, this_len); + strbuf_addch(&probe, '/'); + pos = index_name_pos_sparse(istate, + probe.buf, + probe.len); + strbuf_release(&probe); + + if (pos < 0) + pos = -pos - 1; + if (pos >= (int)istate->cache_nr) + continue; + other = istate->cache[pos]; + if (ce_namelen(other) > this_len && + other->name[this_len] == '/' && + !strncmp(this_name, other->name, this_len)) + conflict_name = other->name; + } + } + + if (conflict_name) { if (10 < ++funny) { fprintf(stderr, "...\n"); break; } fprintf(stderr, "You have both %s and %s\n", - this_name, next_name); + this_name, conflict_name); } } if (funny) diff --git a/t/meson.build b/t/meson.build index 7528e5cda5fef0..362177999bd342 100644 --- a/t/meson.build +++ b/t/meson.build @@ -124,6 +124,7 @@ integration_tests = [ 't0090-cache-tree.sh', 't0091-bugreport.sh', 't0092-diagnose.sh', + 't0093-verify-cache-df-gap.sh', 't0095-bloom.sh', 't0100-previous.sh', 't0101-at-syntax.sh', diff --git a/t/t0093-direct-index-write.pl b/t/t0093-direct-index-write.pl new file mode 100644 index 00000000000000..2881a3ebb21dcd --- /dev/null +++ b/t/t0093-direct-index-write.pl @@ -0,0 +1,38 @@ +#!/usr/bin/perl +# +# Build a v2 index file from entries listed on stdin. +# Each line: "octalmode hex-oid name" +# Output: binary index written to stdout. +# +# This bypasses all D/F safety checks in add_index_entry(), simulating +# what happens when code uses ADD_CACHE_JUST_APPEND to bulk-load entries. +use strict; +use warnings; +use Digest::SHA qw(sha1 sha256); + +my $hash_algo = $ENV{'GIT_DEFAULT_HASH'} || 'sha1'; +my $hash_func = $hash_algo eq 'sha256' ? \&sha256 : \&sha1; + +my @entries; +while (my $line = ) { + chomp $line; + my ($mode, $oid_hex, $name) = split(/ /, $line, 3); + push @entries, [$mode, $oid_hex, $name]; +} + +my $body = "DIRC" . pack("NN", 2, scalar @entries); + +for my $ent (@entries) { + my ($mode, $oid_hex, $name) = @{$ent}; + # 10 x 32-bit stat fields (zeroed), with mode in position 7 + my $stat = pack("N10", 0, 0, 0, 0, 0, 0, oct($mode), 0, 0, 0); + my $oid = pack("H*", $oid_hex); + my $flags = pack("n", length($name) & 0xFFF); + my $entry = $stat . $oid . $flags . $name . "\0"; + # Pad to 8-byte boundary + while (length($entry) % 8) { $entry .= "\0"; } + $body .= $entry; +} + +binmode STDOUT; +print $body . $hash_func->($body); diff --git a/t/t0093-verify-cache-df-gap.sh b/t/t0093-verify-cache-df-gap.sh new file mode 100755 index 00000000000000..0b6829d805269d --- /dev/null +++ b/t/t0093-verify-cache-df-gap.sh @@ -0,0 +1,59 @@ +#!/bin/sh + +test_description='verify_cache() must catch non-adjacent D/F conflicts + +Ensure that verify_cache() can complain about bad entries like: + + docs <-- submodule + docs-internal/... <-- sorts here because "-" < "/" + docs/... <-- D/F conflict with "docs" above, not adjacent + +In order to test verify_cache, we directly construct a corrupt index +(bypassing the D/F safety checks in add_index_entry) and verify that +write-tree rejects it. +' + +. ./test-lib.sh + +if ! test_have_prereq PERL +then + skip_all='skipping verify_cache D/F tests; Perl not available' + test_done +fi + +# Build a v2 index from entries on stdin, bypassing D/F checks. +# Each line: "octalmode hex-oid name" (entries must be pre-sorted). +build_corrupt_index () { + perl "$TEST_DIRECTORY/t0093-direct-index-write.pl" >"$1" +} + +test_expect_success 'setup objects' ' + test_commit base && + BLOB=$(git rev-parse HEAD:base.t) && + SUB_COMMIT=$(git rev-parse HEAD) +' + +test_expect_success 'adjacent D/F conflict is caught by verify_cache' ' + cat >index-entries <<-EOF && + 0160000 $SUB_COMMIT docs + 0100644 $BLOB docs/requirements.txt + EOF + build_corrupt_index .git/index err && + test_grep "You have both docs and docs/requirements.txt" err +' + +test_expect_success 'non-adjacent D/F conflict is caught by verify_cache' ' + cat >index-entries <<-EOF && + 0160000 $SUB_COMMIT docs + 0100644 $BLOB docs-internal/README.md + 0100644 $BLOB docs/requirements.txt + EOF + build_corrupt_index .git/index err && + test_grep "You have both docs and docs/requirements.txt" err +' + +test_done