From f6508bc525edf4c60fdd763078beb78fa209eacf Mon Sep 17 00:00:00 2001 From: Alf Henrik Sauge Date: Sat, 6 Jun 2026 11:00:26 +0200 Subject: [PATCH] Add option to filter revwalk based on pathspec This is required in order to later on accelerate the walk by using bloom filters --- examples/log.c | 60 ++----------- include/git2/revwalk.h | 12 +++ src/libgit2/revwalk.c | 194 +++++++++++++++++++++++++++++++++++++++++ src/libgit2/revwalk.h | 3 + 4 files changed, 215 insertions(+), 54 deletions(-) diff --git a/examples/log.c b/examples/log.c index 62a6eb585..64c51cc27 100644 --- a/examples/log.c +++ b/examples/log.c @@ -66,7 +66,6 @@ static int parse_options( struct log_state *s, struct log_options *opt, int argc, char **argv); static void print_time(const git_time *intime, const char *prefix); static void print_commit(git_commit *commit, struct log_options *opts); -static int match_with_parent(git_commit *commit, int i, git_diff_options *); /** utility functions for filtering */ static int signature_matches(const git_signature *sig, const char *filter); @@ -74,7 +73,7 @@ static int log_message_matches(const git_commit *commit, const char *filter); int lg2_log(git_repository *repo, int argc, char *argv[]) { - int i, count = 0, printed = 0, parents, last_arg; + int count = 0, printed = 0, parents, last_arg; struct log_state s; struct log_options opt; git_diff_options diffopts = GIT_DIFF_OPTIONS_INIT; @@ -90,14 +89,16 @@ int lg2_log(git_repository *repo, int argc, char *argv[]) diffopts.pathspec.strings = &argv[last_arg]; diffopts.pathspec.count = argc - last_arg; - if (diffopts.pathspec.count > 0) - check_lg2(git_pathspec_new(&ps, &diffopts.pathspec), - "Building pathspec", NULL); if (!s.revisions) add_revision(&s, NULL); /** Use the revwalker to traverse the history. */ + if (diffopts.pathspec.count > 0) { + check_lg2(git_pathspec_new(&ps, &diffopts.pathspec), + "Building pathspec", NULL); + check_lg2(git_revwalk_pathspec(s.walker, ps), "Applying pathspec", NULL); + } printed = count = 0; @@ -111,29 +112,6 @@ int lg2_log(git_repository *repo, int argc, char *argv[]) if (opt.max_parents > 0 && parents > opt.max_parents) continue; - if (diffopts.pathspec.count > 0) { - int unmatched = parents; - - if (parents == 0) { - git_tree *tree; - check_lg2(git_commit_tree(&tree, commit), "Get tree", NULL); - if (git_pathspec_match_tree( - NULL, tree, GIT_PATHSPEC_NO_MATCH_ERROR, ps) != 0) - unmatched = 1; - git_tree_free(tree); - } else if (parents == 1) { - unmatched = match_with_parent(commit, 0, &diffopts) ? 0 : 1; - } else { - for (i = 0; i < parents; ++i) { - if (match_with_parent(commit, i, &diffopts)) - unmatched--; - } - } - - if (unmatched > 0) - continue; - } - if (!signature_matches(git_commit_author(commit), opt.author)) continue; @@ -379,32 +357,6 @@ static void print_commit(git_commit *commit, struct log_options *opts) printf("\n"); } -/** Helper to find how many files in a commit changed from its nth parent. */ -static int match_with_parent(git_commit *commit, int i, git_diff_options *opts) -{ - git_commit *parent; - git_tree *a, *b; - git_diff *diff; - int ndeltas; - - check_lg2( - git_commit_parent(&parent, commit, (size_t)i), "Get parent", NULL); - check_lg2(git_commit_tree(&a, parent), "Tree for parent", NULL); - check_lg2(git_commit_tree(&b, commit), "Tree for commit", NULL); - check_lg2( - git_diff_tree_to_tree(&diff, git_commit_owner(commit), a, b, opts), - "Checking diff between parent and commit", NULL); - - ndeltas = (int)git_diff_num_deltas(diff); - - git_diff_free(diff); - git_tree_free(a); - git_tree_free(b); - git_commit_free(parent); - - return ndeltas > 0; -} - /** Print a usage message for the program. */ static void usage(const char *message, const char *arg) { diff --git a/include/git2/revwalk.h b/include/git2/revwalk.h index 7c4ac5465..f0dbedd7e 100644 --- a/include/git2/revwalk.h +++ b/include/git2/revwalk.h @@ -10,6 +10,7 @@ #include "common.h" #include "types.h" #include "oid.h" +#include "pathspec.h" /** * @file git2/revwalk.h @@ -229,6 +230,17 @@ GIT_EXTERN(int) git_revwalk_next(git_oid *out, git_revwalk *walk); */ GIT_EXTERN(int) git_revwalk_sorting(git_revwalk *walk, unsigned int sort_mode); +/** + * Set a git_pathspec object to filter commits on + * + * Changing the pathspec rests the walker + * + * @param walk the walker being used for the traversal. + * @param pathspec Paths to filter commits on + * @return 0 or an error code + */ +GIT_EXTERN(int) git_revwalk_pathspec(git_revwalk *walk, git_pathspec *pathspec); + /** * Push and hide the respective endpoints of the given range. * diff --git a/src/libgit2/revwalk.c b/src/libgit2/revwalk.c index 5f798f8ce..64e741098 100644 --- a/src/libgit2/revwalk.c +++ b/src/libgit2/revwalk.c @@ -9,6 +9,7 @@ #include "commit.h" #include "odb.h" +#include "pathspec.h" #include "pool.h" #include "git2/revparse.h" @@ -468,6 +469,169 @@ static int still_interesting(git_commit_list *list, int64_t time, int slop) return slop - 1; } +static bool include_path_delta(git_revwalk *walk, + git_tree *commit_tree, + git_tree *parent_tree, + git_diff_options *diffopts) +{ + bool include = false; + git_diff *diff; + if (git_diff_tree_to_tree(&diff, walk->repo, parent_tree, commit_tree, diffopts) == 0) { + size_t num_deltas = git_diff_num_deltas(diff); + size_t i; + for (i = 0; i < num_deltas && !include; i++) { + const git_diff_delta *delta = git_diff_get_delta(diff, i); + if (delta->new_file.path + && git_pathspec__match(&walk->pathspec->pathspec, + delta->new_file.path, false, false, NULL, NULL)) { + include = true; + } + else if (delta->old_file.path + && git_pathspec__match(&walk->pathspec->pathspec, + delta->old_file.path, false, false, NULL, NULL)) { + include = true; + } + } + git_diff_free(diff); + } + return include; +} + +static bool include_path_wildcard(git_revwalk *walk, git_commit *commit, git_tree *commit_tree) +{ + unsigned int parents = git_commit_parentcount(commit); + git_diff_options diffopts = GIT_DIFF_OPTIONS_INIT; + bool include = false; + + /* We could narrow this down further by copying over the entire pathspec, + * but that doesn't seem to make any difference in performance. + * So for now, the just the prefix */ + if (walk->pathspec->prefix) { + diffopts.pathspec.strings = &walk->pathspec->prefix; + diffopts.pathspec.count = 1; + } + + if (parents == 0) { + include = include_path_delta(walk, commit_tree, NULL, &diffopts); + } + else { + unsigned int i; + include = true; + /* Loop through all parents, and ensure that it matches with all + * parents before including the commit + */ + for (i = 0; i < parents && include; i++) { + git_commit *parent = NULL; + git_tree *parent_tree = NULL; + /*Assume it's to be excluded unless the delta matches*/ + include = false; + if (git_commit_parent(&parent, commit, i) == 0 + && git_commit_tree(&parent_tree, parent) == 0) { + if (include_path_delta(walk, commit_tree, parent_tree, &diffopts)) + include = true; + } + git_tree_free(parent_tree); + git_commit_free(parent); + } + } + return include; +} + +static bool include_path_exact_root(git_revwalk *walk, git_tree *commit_tree) +{ + size_t i; + git_attr_fnmatch *match; + /* If it's a root commit, we just need to find the first path that matches */ + git_vector_foreach(&walk->pathspec->pathspec, i, match) { + git_tree_entry *entry=NULL; + if (git_tree_entry_bypath(&entry, commit_tree, match->pattern)) { + git_tree_entry_free(entry); + return true; + } + } + return false; +} + +static bool include_path_exact_parent(git_revwalk *walk, git_tree *commit_tree, git_tree *parent_tree) +{ + size_t i; + git_attr_fnmatch *match; + bool include = false; + git_vector_foreach(&walk->pathspec->pathspec, i, match) { + git_tree_entry *commit_entry=NULL; + git_tree_entry *parent_entry=NULL; + + /* Given we are working with full paths here, we only need to look at OIDs + * to know if the path is touched by the commit */ + git_tree_entry_bypath(&commit_entry, commit_tree, match->pattern); + git_tree_entry_bypath(&parent_entry, parent_tree, match->pattern); + + /* If the existance of an entry is different, it means we deal with + * an add or remove case. We don't need to think about renaming here + * since that would still count as a change */ + if ((commit_entry == NULL) != (parent_entry == NULL)) { + include = true; + } + /* Both trees have an entry. Include if the OID is different between the trees */ + else if (commit_entry && parent_entry + && git_oid_equal( + git_tree_entry_id(commit_entry), + git_tree_entry_id(parent_entry)) == 0) { + include = true; + } + git_tree_entry_free(commit_entry); + git_tree_entry_free(parent_entry); + if (include) + return include; + } + return false; +} + +static bool include_path_exact(git_revwalk *walk, git_commit *commit, git_tree *commit_tree) { + unsigned int parents = git_commit_parentcount(commit); + if (parents == 0) { + return include_path_exact_root(walk, commit_tree); + } + else { + unsigned int p; + bool include_commit = true; + /* Loop through all parents, and ensure that it matches with all + * parents before including the commit + */ + for (p = 0; p < parents && include_commit; p++) { + git_commit *parent = NULL; + git_tree *parent_tree = NULL; + if (git_commit_parent(&parent, commit, p) == 0 + && git_commit_tree(&parent_tree, parent) == 0) { + if (!include_path_exact_parent(walk, commit_tree, parent_tree)) + include_commit = false; + } + git_tree_free(parent_tree); + git_commit_free(parent); + } + return include_commit; + } +} + +static bool include_path(git_revwalk *walk, git_commit_list_node *commit_node) +{ + git_commit *commit = NULL; + git_tree *commit_tree = NULL; + bool include = false; + + if (git_commit_lookup(&commit, walk->repo, &commit_node->oid) == 0 + && git_commit_tree(&commit_tree, commit) == 0) { + if (walk->pathspec_wildcard) + include = include_path_wildcard(walk, commit, commit_tree); + else + include = include_path_exact(walk, commit, commit_tree); + } + + git_tree_free(commit_tree); + git_commit_free(commit); + return include; +} + static int limit_list(git_commit_list **out, git_revwalk *walk, git_commit_list *commits) { int error, slop = SLOP; @@ -492,6 +656,9 @@ static int limit_list(git_commit_list **out, git_revwalk *walk, git_commit_list break; } + if (walk->pathspec && !include_path(walk, commit)) + continue; + if (walk->hide_cb && walk->hide_cb(&commit->oid, walk->hide_cb_payload)) continue; @@ -798,6 +965,33 @@ int git_revwalk_next(git_oid *oid, git_revwalk *walk) return error; } +static bool pathspec_has_wildcard(git_pathspec *pathspec) +{ + size_t i; + git_attr_fnmatch *match; + git_vector_foreach(&pathspec->pathspec, i, match) { + if (match->flags & GIT_ATTR_FNMATCH_HASWILD) { + return true; + } + } + return false; +} + +int git_revwalk_pathspec(git_revwalk *walk, git_pathspec *pathspec) { + GIT_ASSERT_ARG(walk); + + if (walk->walking) + git_revwalk_reset(walk); + + if (pathspec) { + walk->pathspec = pathspec; + walk->pathspec_wildcard = pathspec_has_wildcard(pathspec); + walk->limited = 1; + } + + return 0; +} + int git_revwalk_reset(git_revwalk *walk) { git_commit_list_node *commit; diff --git a/src/libgit2/revwalk.h b/src/libgit2/revwalk.h index 632b2807c..44c716706 100644 --- a/src/libgit2/revwalk.h +++ b/src/libgit2/revwalk.h @@ -46,6 +46,9 @@ struct git_revwalk { /* hide callback */ git_revwalk_hide_cb hide_cb; void *hide_cb_payload; + + git_pathspec *pathspec; + bool pathspec_wildcard; }; git_commit_list_node *git_revwalk__commit_lookup(git_revwalk *walk, const git_oid *oid);