All of lore.kernel.org
 help / color / mirror / Atom feed
From: "Elijah Newren via GitGitGadget" <gitgitgadget@gmail.com>
To: git@vger.kernel.org
Cc: Derrick Stolee <dstolee@microsoft.com>,
	Jonathan Tan <jonathantanmy@google.com>,
	Taylor Blau <me@ttaylorr.com>, Elijah Newren <newren@gmail.com>,
	Elijah Newren <newren@gmail.com>
Subject: [PATCH 8/8] diffcore-rename: avoid doing basename comparisons for irrelevant sources
Date: Sun, 28 Feb 2021 03:58:26 +0000	[thread overview]
Message-ID: <8fed92b62f375f97551a4e07bae5e43f9030d0d2.1614484707.git.gitgitgadget@gmail.com> (raw)
In-Reply-To: <pull.845.git.1614484707.gitgitgadget@gmail.com>

From: Elijah Newren <newren@gmail.com>

The basename comparison optimization implemented in
find_basename_matches() is very beneficial since it allows a source to
sometimes only be compared with one other file instead of N other files.
When a match is found, both a source and destination can be removed from
the matrix of inexact rename comparisons.  In contrast, the irrelevant
source optimization only allows us to remove a source from the matrix of
inexact rename comparisons...but it has the advantage of allowing a
source file to not even be loaded into memory at all and be compared to
0 other files.  Generally, not even comparing is a bigger performance
win, so when both optimizations could apply, prefer to use the
irrelevant-source optimization.

For the testcases mentioned in commit 557ac0350d ("merge-ort: begin
performance work; instrument with trace2_region_* calls", 2020-10-28),
this change improves the performance as follows:

                            Before                  After
    no-renames:        5.708 s ±  0.111 s     5.680 s ±  0.096 s
    mega-renames:    102.171 s ±  0.440 s    13.812 s ±  0.162 s
    just-one-mega:     3.471 s ±  0.015 s   506.0  ms ±  3.9  ms

Signed-off-by: Elijah Newren <newren@gmail.com>
---
 diffcore-rename.c | 37 +++++++++++++++++++++++++++++++++----
 1 file changed, 33 insertions(+), 4 deletions(-)

diff --git a/diffcore-rename.c b/diffcore-rename.c
index 7f6115fd9018..e8508541be14 100644
--- a/diffcore-rename.c
+++ b/diffcore-rename.c
@@ -527,6 +527,7 @@ static void update_dir_rename_counts(struct dir_rename_info *info,
 }
 
 static void initialize_dir_rename_info(struct dir_rename_info *info,
+				       struct strset *relevant_sources,
 				       struct strset *dirs_removed,
 				       struct strmap *dir_rename_count)
 {
@@ -534,7 +535,7 @@ static void initialize_dir_rename_info(struct dir_rename_info *info,
 	struct strmap_entry *entry;
 	int i;
 
-	if (!dirs_removed) {
+	if (!dirs_removed && !relevant_sources) {
 		info->setup = 0;
 		return;
 	}
@@ -549,7 +550,20 @@ static void initialize_dir_rename_info(struct dir_rename_info *info,
 	strmap_init_with_options(&info->dir_rename_guess, NULL, 0);
 
 	/* Setup info->relevant_source_dirs */
-	info->relevant_source_dirs = dirs_removed;
+	info->relevant_source_dirs = NULL;
+	if (dirs_removed || !relevant_sources) {
+		info->relevant_source_dirs = dirs_removed; /* might be NULL */
+	} else {
+		info->relevant_source_dirs = xmalloc(sizeof(struct strintmap));
+		strset_init(info->relevant_source_dirs);
+		strset_for_each_entry(relevant_sources, &iter, entry) {
+			char *dirname = get_dirname(entry->key);
+			if (!dirs_removed ||
+			    strset_contains(dirs_removed, dirname))
+				strset_add(info->relevant_source_dirs, dirname);
+			free(dirname);
+		}
+	}
 
 	/*
 	 * Loop setting up both info->idx_map, and doing setup of
@@ -627,6 +641,13 @@ static void cleanup_dir_rename_info(struct dir_rename_info *info,
 	/* dir_rename_guess */
 	strmap_clear(&info->dir_rename_guess, 1);
 
+	/* relevant_source_dirs */
+	if (info->relevant_source_dirs &&
+	    info->relevant_source_dirs != dirs_removed) {
+		strset_clear(info->relevant_source_dirs);
+		FREE_AND_NULL(info->relevant_source_dirs);
+	}
+
 	/* dir_rename_count */
 	if (!keep_dir_rename_count) {
 		partial_clear_dir_rename_count(info->dir_rename_count);
@@ -749,6 +770,7 @@ static int idx_possible_rename(char *filename, struct dir_rename_info *info)
 static int find_basename_matches(struct diff_options *options,
 				 int minimum_score,
 				 struct dir_rename_info *info,
+				 struct strset *relevant_sources,
 				 struct strset *dirs_removed)
 {
 	/*
@@ -839,6 +861,11 @@ static int find_basename_matches(struct diff_options *options,
 		intptr_t src_index;
 		intptr_t dst_index;
 
+		/* Skip irrelevant sources */
+		if (relevant_sources &&
+		    !strset_contains(relevant_sources, filename))
+			continue;
+
 		/*
 		 * If the basename is unique among remaining sources, then
 		 * src_index will equal 'i' and we can attempt to match it
@@ -1164,7 +1191,7 @@ void diffcore_rename_extended(struct diff_options *options,
 
 		/* Preparation for basename-driven matching. */
 		trace2_region_enter("diff", "dir rename setup", options->repo);
-		initialize_dir_rename_info(&info,
+		initialize_dir_rename_info(&info, relevant_sources,
 					   dirs_removed, dir_rename_count);
 		trace2_region_leave("diff", "dir rename setup", options->repo);
 
@@ -1172,7 +1199,9 @@ void diffcore_rename_extended(struct diff_options *options,
 		trace2_region_enter("diff", "basename matches", options->repo);
 		rename_count += find_basename_matches(options,
 						      min_basename_score,
-						      &info, dirs_removed);
+						      &info,
+						      relevant_sources,
+						      dirs_removed);
 		trace2_region_leave("diff", "basename matches", options->repo);
 
 		/*
-- 
gitgitgadget

  parent reply	other threads:[~2021-02-28  4:00 UTC|newest]

Thread overview: 39+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2021-02-28  3:58 [PATCH 0/8] Optimization batch 9: avoid detecting irrelevant renames Elijah Newren via GitGitGadget
2021-02-28  3:58 ` [PATCH 1/8] diffcore-rename: enable filtering possible rename sources Elijah Newren via GitGitGadget
2021-02-28  3:58 ` [PATCH 2/8] merge-ort: precompute subset of sources for which we need rename detection Elijah Newren via GitGitGadget
2021-02-28  3:58 ` [PATCH 3/8] merge-ort: add data structures for an alternate tree traversal Elijah Newren via GitGitGadget
2021-02-28  3:58 ` [PATCH 4/8] merge-ort: introduce wrappers for " Elijah Newren via GitGitGadget
2021-02-28  3:58 ` [PATCH 5/8] merge-ort: precompute whether directory rename detection is needed Elijah Newren via GitGitGadget
2021-02-28  3:58 ` [PATCH 6/8] merge-ort: use relevant_sources to filter possible rename sources Elijah Newren via GitGitGadget
2021-02-28  3:58 ` [PATCH 7/8] merge-ort: skip rename detection entirely if possible Elijah Newren via GitGitGadget
2021-02-28  3:58 ` Elijah Newren via GitGitGadget [this message]
2021-03-01 16:39 ` [PATCH 0/8] Optimization batch 9: avoid detecting irrelevant renames Elijah Newren
2021-03-04  7:54 ` Elijah Newren
2021-03-09  0:09 ` [PATCH v2 " Elijah Newren via GitGitGadget
2021-03-09  0:09   ` [PATCH v2 1/8] diffcore-rename: enable filtering possible rename sources Elijah Newren via GitGitGadget
2021-03-09 22:21     ` Derrick Stolee
2021-03-09 22:40       ` Elijah Newren
2021-03-09 22:45         ` Derrick Stolee
2021-03-09  0:09   ` [PATCH v2 2/8] merge-ort: precompute subset of sources for which we need rename detection Elijah Newren via GitGitGadget
2021-03-09  0:09   ` [PATCH v2 3/8] merge-ort: add data structures for an alternate tree traversal Elijah Newren via GitGitGadget
2021-03-09  0:09   ` [PATCH v2 4/8] merge-ort: introduce wrappers for " Elijah Newren via GitGitGadget
2021-03-09 23:06     ` Derrick Stolee
2021-03-09  0:09   ` [PATCH v2 5/8] merge-ort: precompute whether directory rename detection is needed Elijah Newren via GitGitGadget
2021-03-09  0:09   ` [PATCH v2 6/8] merge-ort: use relevant_sources to filter possible rename sources Elijah Newren via GitGitGadget
2021-03-09  0:09   ` [PATCH v2 7/8] merge-ort: skip rename detection entirely if possible Elijah Newren via GitGitGadget
2021-03-09 22:51     ` Derrick Stolee
2021-03-09 22:57       ` Elijah Newren
2021-03-09  0:09   ` [PATCH v2 8/8] diffcore-rename: avoid doing basename comparisons for irrelevant sources Elijah Newren via GitGitGadget
2021-03-09 22:08   ` [PATCH v2 0/8] Optimization batch 9: avoid detecting irrelevant renames Derrick Stolee
2021-03-10 15:08   ` Derrick Stolee
2021-03-11  0:38   ` [PATCH v3 " Elijah Newren via GitGitGadget
2021-03-11  0:38     ` [PATCH v3 1/8] diffcore-rename: enable filtering possible rename sources Elijah Newren via GitGitGadget
2021-03-11  0:38     ` [PATCH v3 2/8] merge-ort: precompute subset of sources for which we need rename detection Elijah Newren via GitGitGadget
2021-03-11  0:38     ` [PATCH v3 3/8] merge-ort: add data structures for an alternate tree traversal Elijah Newren via GitGitGadget
2021-03-11  0:38     ` [PATCH v3 4/8] merge-ort: introduce wrappers for " Elijah Newren via GitGitGadget
2021-03-11  0:38     ` [PATCH v3 5/8] merge-ort: precompute whether directory rename detection is needed Elijah Newren via GitGitGadget
2021-03-11  0:38     ` [PATCH v3 6/8] merge-ort: use relevant_sources to filter possible rename sources Elijah Newren via GitGitGadget
2021-03-11  0:38     ` [PATCH v3 7/8] merge-ort: skip rename detection entirely if possible Elijah Newren via GitGitGadget
2021-03-11  0:38     ` [PATCH v3 8/8] diffcore-rename: avoid doing basename comparisons for irrelevant sources Elijah Newren via GitGitGadget
2021-03-15 13:57     ` [PATCH v3 0/8] Optimization batch 9: avoid detecting irrelevant renames Derrick Stolee
2021-03-15 17:10   ` [PATCH v2 " Elijah Newren

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=8fed92b62f375f97551a4e07bae5e43f9030d0d2.1614484707.git.gitgitgadget@gmail.com \
    --to=gitgitgadget@gmail.com \
    --cc=dstolee@microsoft.com \
    --cc=git@vger.kernel.org \
    --cc=jonathantanmy@google.com \
    --cc=me@ttaylorr.com \
    --cc=newren@gmail.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.