All of lore.kernel.org
 help / color / mirror / Atom feed
From: Karsten Blees <karsten.blees@gmail.com>
To: Git List <git@vger.kernel.org>, Junio C Hamano <gitster@pobox.com>
Cc: Thomas Rast <tr@thomasrast.ch>,
	Jens Lehmann <Jens.Lehmann@web.de>,
	Karsten Blees <karsten.blees@gmail.com>
Subject: [PATCH v4 05/14] diffcore-rename.c: simplify finding exact renames
Date: Thu, 07 Nov 2013 15:36:45 +0100	[thread overview]
Message-ID: <527BA57D.9070400@gmail.com> (raw)
In-Reply-To: <527BA483.6040803@gmail.com>

The find_exact_renames function currently only uses the hash table for
grouping, i.e.:

1. add sources
2. add destinations
3. iterate all buckets, per bucket:
4. split sources from destinations
5. iterate destinations, per destination:
6. iterate sources to find best match

This can be simplified by utilizing the lookup functionality of the hash
table, i.e.:

1. add sources
2. iterate destinations, per destination:
3. lookup sources matching the current destination
4. iterate sources to find best match

This saves several iterations and file_similarity allocations for the
destinations.

Signed-off-by: Karsten Blees <blees@dcon.de>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
 diffcore-rename.c | 75 +++++++++++++++----------------------------------------
 1 file changed, 20 insertions(+), 55 deletions(-)

diff --git a/diffcore-rename.c b/diffcore-rename.c
index 008a60c..cfeb408 100644
--- a/diffcore-rename.c
+++ b/diffcore-rename.c
@@ -243,7 +243,7 @@ static int score_compare(const void *a_, const void *b_)
 }
 
 struct file_similarity {
-	int src_dst, index;
+	int index;
 	struct diff_filespec *filespec;
 	struct file_similarity *next;
 };
@@ -260,25 +260,21 @@ static unsigned int hash_filespec(struct diff_filespec *filespec)
 	return hash;
 }
 
-static int find_identical_files(struct file_similarity *src,
-				struct file_similarity *dst,
+static int find_identical_files(struct hash_table *srcs,
+				int dst_index,
 				struct diff_options *options)
 {
 	int renames = 0;
 
-	/*
-	 * Walk over all the destinations ...
-	 */
-	do {
-	struct diff_filespec *target = dst->filespec;
+	struct diff_filespec *target = rename_dst[dst_index].two;
 	struct file_similarity *p, *best;
 	int i = 100, best_score = -1;
 
 	/*
-	 * .. to find the best source match
+	 * Find the best source match for specified destination.
 	 */
 	best = NULL;
-	for (p = src; p; p = p->next) {
+	for (p = lookup_hash(hash_filespec(target), srcs); p; p = p->next) {
 		int score;
 		struct diff_filespec *source = p->filespec;
 
@@ -307,61 +303,28 @@ static int find_identical_files(struct file_similarity *src,
 			break;
 	}
 	if (best) {
-		record_rename_pair(dst->index, best->index, MAX_SCORE);
+		record_rename_pair(dst_index, best->index, MAX_SCORE);
 		renames++;
 	}
-	} while ((dst = dst->next) != NULL);
 	return renames;
 }
 
-static void free_similarity_list(struct file_similarity *p)
+static int free_similarity_list(void *p, void *unused)
 {
 	while (p) {
 		struct file_similarity *entry = p;
-		p = p->next;
+		p = entry->next;
 		free(entry);
 	}
+	return 0;
 }
 
-static int find_same_files(void *ptr, void *data)
-{
-	int ret;
-	struct file_similarity *p = ptr;
-	struct file_similarity *src = NULL, *dst = NULL;
-	struct diff_options *options = data;
-
-	/* Split the hash list up into sources and destinations */
-	do {
-		struct file_similarity *entry = p;
-		p = p->next;
-		if (entry->src_dst < 0) {
-			entry->next = src;
-			src = entry;
-		} else {
-			entry->next = dst;
-			dst = entry;
-		}
-	} while (p);
-
-	/*
-	 * If we have both sources *and* destinations, see if
-	 * we can match them up
-	 */
-	ret = (src && dst) ? find_identical_files(src, dst, options) : 0;
-
-	/* Free the hashes and return the number of renames found */
-	free_similarity_list(src);
-	free_similarity_list(dst);
-	return ret;
-}
-
-static void insert_file_table(struct hash_table *table, int src_dst, int index, struct diff_filespec *filespec)
+static void insert_file_table(struct hash_table *table, int index, struct diff_filespec *filespec)
 {
 	void **pos;
 	unsigned int hash;
 	struct file_similarity *entry = xmalloc(sizeof(*entry));
 
-	entry->src_dst = src_dst;
 	entry->index = index;
 	entry->filespec = filespec;
 	entry->next = NULL;
@@ -385,24 +348,26 @@ static void insert_file_table(struct hash_table *table, int src_dst, int index,
  */
 static int find_exact_renames(struct diff_options *options)
 {
-	int i;
+	int i, renames = 0;
 	struct hash_table file_table;
 
+	/* Add all sources to the hash table */
 	init_hash(&file_table);
-	preallocate_hash(&file_table, rename_src_nr + rename_dst_nr);
+	preallocate_hash(&file_table, rename_src_nr);
 	for (i = 0; i < rename_src_nr; i++)
-		insert_file_table(&file_table, -1, i, rename_src[i].p->one);
+		insert_file_table(&file_table, i, rename_src[i].p->one);
 
+	/* Walk the destinations and find best source match */
 	for (i = 0; i < rename_dst_nr; i++)
-		insert_file_table(&file_table, 1, i, rename_dst[i].two);
+		renames += find_identical_files(&file_table, i, options);
 
-	/* Find the renames */
-	i = for_each_hash(&file_table, find_same_files, options);
+	/* Free source file_similarity chains */
+	for_each_hash(&file_table, free_similarity_list, options);
 
 	/* .. and free the hash data structure */
 	free_hash(&file_table);
 
-	return i;
+	return renames;
 }
 
 #define NUM_CANDIDATE_PER_DST 4
-- 
1.8.4.msysgit.0.12.g88f5ed0

  parent reply	other threads:[~2013-11-07 14:36 UTC|newest]

Thread overview: 26+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2013-11-07 14:32 [PATCH v4 00/14] New hash table implementation Karsten Blees
2013-11-07 14:33 ` [PATCH v4 01/14] submodule: don't access the .gitmodules cache entry after removing it Karsten Blees
2013-11-07 22:27   ` Heiko Voigt
2013-11-07 14:34 ` [PATCH v4 02/14] add a hashtable implementation that supports O(1) removal Karsten Blees
2013-11-07 21:40   ` Junio C Hamano
2013-11-08 10:27     ` Karsten Blees
2013-11-08 16:45       ` Philip Oakley
2013-11-08 17:08       ` Junio C Hamano
2013-11-13 16:37         ` Karsten Blees
2013-11-07 14:35 ` [PATCH v4 03/14] buitin/describe.c: use new hash map implementation Karsten Blees
2013-11-07 14:36 ` [PATCH v4 04/14] diffcore-rename.c: move code around to prepare for the next patch Karsten Blees
2013-11-07 14:36 ` Karsten Blees [this message]
2013-11-07 14:37 ` [PATCH v4 06/14] diffcore-rename.c: use new hash map implementation Karsten Blees
2013-11-07 14:38 ` [PATCH v4 07/14] name-hash.c: use new hash map implementation for directories Karsten Blees
2013-11-07 14:38 ` [PATCH v4 08/14] name-hash.c: remove unreferenced directory entries Karsten Blees
2013-11-07 14:39 ` [PATCH v4 09/14] name-hash.c: use new hash map implementation for cache entries Karsten Blees
2013-11-07 14:39 ` [PATCH v4 10/14] name-hash.c: remove cache entries instead of marking them CE_UNHASHED Karsten Blees
2013-11-07 14:40 ` [PATCH v4 11/14] remove old hash.[ch] implementation Karsten Blees
2013-11-07 14:43 ` [PATCH v4 12/14] fix 'git update-index --verbose --again' output Karsten Blees
2013-11-07 22:12   ` Junio C Hamano
2013-11-08 10:27     ` Karsten Blees
2013-11-07 14:44 ` [PATCH v4 13/14] builtin/update-index.c: cleanup update_one Karsten Blees
2013-11-07 21:40   ` Junio C Hamano
2013-11-08 10:27     ` Karsten Blees
2013-11-07 14:45 ` [PATCH v4 14/14] read-cache.c: fix memory leaks caused by removed cache entries Karsten Blees
2013-11-07 21:40   ` Junio C Hamano

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=527BA57D.9070400@gmail.com \
    --to=karsten.blees@gmail.com \
    --cc=Jens.Lehmann@web.de \
    --cc=git@vger.kernel.org \
    --cc=gitster@pobox.com \
    --cc=tr@thomasrast.ch \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.