git.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Duy Nguyen <pclouds@gmail.com>
To: Ben Peart <peartben@gmail.com>
Cc: Ben Peart <Ben.Peart@microsoft.com>,
	Git Mailing List <git@vger.kernel.org>,
	Junio C Hamano <gitster@pobox.com>,
	Eric Sunshine <sunshine@sunshineco.com>
Subject: Re: [PATCH v2] checkout: optimize "git checkout -b <new_branch>"
Date: Sun, 5 Aug 2018 10:57:18 +0200	[thread overview]
Message-ID: <20180805085718.GA27625@duynguyen.home> (raw)
In-Reply-To: <3900401c-4d7b-233c-2098-9771a06ec0dd@gmail.com>

On Thu, Aug 02, 2018 at 02:02:00PM -0400, Ben Peart wrote:
> > And if all go according to plan, there will be no changes made in the
> > index (by either 2-way merge or sparse checkout stuff) we should be
> > able to just skip writing down the index, if we haven't done that
> > already.
> > 
> 
> That would be great as writing the index is 8.4% of the time spent.

And that is of course not good. Avoiding writing index at all is nice
and all but we should not take that much time writing it (what about
other commands?)

I have another idea to reduce index size, which means cheaper trailer
hash calculation, but this may also be a direction you could look
into.

I notice that we have a lot of duplicate data in the index. For
example, most files would have the same ctime and even mtime (at least
the seconds part). group id, device id and file mode should also be
the same in normal case. I guess on Windows, inode is always the same
as well. So why store (and hash) them repeatedly?

This patch writes "index version 5" that only writes those fields out
if they are different from the previous entry (we already do this for
path name in v4).

On webkit repo (275k files), writing time can be reduced from 0.314s
(v2) to 0.206s (v5). File size of all versions:

 9.8M Aug  5 10:27 .git/index.gz
  38M Aug  5 09:54 .git/index-v2
  22M Aug  5 10:26 .git/index-v4
  15M Aug  5 10:34 .git/index-v5

As you can see there's still room to compress, but I don't think we
want to go full deflate mode because compression time goes up.

Another option I think you could consider is not storing full hash. As
long as the shortened hash is still unique, we can expand to full hash
at read time. Of course you can't go too short (I'm thinking half the
current hash size is reasonable trade off) and need to take into
account how much time will be lost for looking up and expanding these
short hashes at read time.

-- 8< --
diff --git a/cache.h b/cache.h
index e6f7ee4b64..1d6bc7d122 100644
--- a/cache.h
+++ b/cache.h
@@ -129,7 +129,7 @@ struct cache_header {
 };
 
 #define INDEX_FORMAT_LB 2
-#define INDEX_FORMAT_UB 4
+#define INDEX_FORMAT_UB 5
 
 /*
  * The "cache_time" is just the low 32 bits of the
diff --git a/read-cache.c b/read-cache.c
index b0b5df5de7..9bb095d6b2 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -2002,7 +2002,7 @@ static int ce_write_flush(git_hash_ctx *context, int fd)
 	return 0;
 }
 
-static int ce_write(git_hash_ctx *context, int fd, void *data, unsigned int len)
+static int ce_write(git_hash_ctx *context, int fd, const void *data, unsigned int len)
 {
 	while (len) {
 		unsigned int buffered = write_buffer_len;
@@ -2024,6 +2024,18 @@ static int ce_write(git_hash_ctx *context, int fd, void *data, unsigned int len)
 	return 0;
 }
 
+static int ce_write_u32(git_hash_ctx *context, int fd, uint32_t data)
+{
+	data = htonl(data);
+	return ce_write(context, fd, &data, sizeof(data));
+}
+
+static int ce_write_u16(git_hash_ctx *context, int fd, uint16_t data)
+{
+	data = htons(data);
+	return ce_write(context, fd, &data, sizeof(data));
+}
+
 static int write_index_ext_header(git_hash_ctx *context, int fd,
 				  unsigned int ext, unsigned int sz)
 {
@@ -2133,8 +2145,46 @@ static void copy_cache_entry_to_ondisk(struct ondisk_cache_entry *ondisk,
 	}
 }
 
+#define ONDISK_MASK (CE_STAGEMASK | CE_INTENT_TO_ADD | CE_SKIP_WORKTREE)
+
+static int ce_write_entry_v5(git_hash_ctx *c, int fd,
+			     const struct cache_entry *ce,
+			     const struct cache_entry *pce)
+{
+	uint8_t mask = 0;
+	const struct stat_data *st1 = &ce->ce_stat_data;
+	const struct stat_data *st2 = &pce->ce_stat_data;
+	//int ret = 0;
+
+	mask |= (st1->sd_ctime.sec != st2->sd_ctime.sec) << 0;
+	mask |= (st1->sd_mtime.sec != st2->sd_mtime.sec) << 1;
+	mask |= (st1->sd_dev != st2->sd_dev) << 2;
+	mask |= (st1->sd_ino != st2->sd_ino) << 3;
+	mask |= (ce->ce_mode != pce->ce_mode) << 4;
+	mask |= (st1->sd_uid != st2->sd_uid) << 5;
+	mask |= (st1->sd_gid != st2->sd_gid) << 6;
+	mask |= ((ce->ce_flags & ONDISK_MASK) != (pce->ce_flags & ONDISK_MASK)) << 7;
+
+	ce_write(c, fd, &mask, 1);
+	ce_write(c, fd, ce->oid.hash, GIT_SHA1_RAWSZ);
+	ce_write_u32(c, fd, st1->sd_ctime.nsec);
+	ce_write_u32(c, fd, st1->sd_mtime.nsec);
+	ce_write_u32(c, fd, st1->sd_size);
+	if (mask & (1 << 0)) ce_write_u32(c, fd, st1->sd_ctime.sec);
+	if (mask & (1 << 1)) ce_write_u32(c, fd, st1->sd_mtime.sec);
+	if (mask & (1 << 2)) ce_write_u32(c, fd, st1->sd_dev);
+	if (mask & (1 << 3)) ce_write_u32(c, fd, st1->sd_ino);
+	if (mask & (1 << 4)) ce_write_u16(c, fd, ce->ce_mode);
+	if (mask & (1 << 5)) ce_write_u32(c, fd, st1->sd_uid);
+	if (mask & (1 << 6)) ce_write_u32(c, fd, st1->sd_gid);
+	if (mask & (1 << 7)) ce_write_u32(c, fd, ce->ce_flags & ONDISK_MASK);
+	return 0;
+}
+
 static int ce_write_entry(git_hash_ctx *c, int fd, struct cache_entry *ce,
-			  struct strbuf *previous_name, struct ondisk_cache_entry *ondisk)
+			  const struct cache_entry *pce,
+			  struct strbuf *previous_name,
+			  struct ondisk_cache_entry *ondisk)
 {
 	int size;
 	int result;
@@ -2173,8 +2223,12 @@ static int ce_write_entry(git_hash_ctx *c, int fd, struct cache_entry *ce,
 		to_remove = previous_name->len - common;
 		prefix_size = encode_varint(to_remove, to_remove_vi);
 
-		copy_cache_entry_to_ondisk(ondisk, ce);
-		result = ce_write(c, fd, ondisk, size);
+		if (!pce) {
+			copy_cache_entry_to_ondisk(ondisk, ce);
+			result = ce_write(c, fd, ondisk, size);
+		} else {
+			result = ce_write_entry_v5(c, fd, ce, pce);
+		}
 		if (!result)
 			result = ce_write(c, fd, to_remove_vi, prefix_size);
 		if (!result)
@@ -2313,7 +2367,7 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
 	if (ce_write(&c, newfd, &hdr, sizeof(hdr)) < 0)
 		return -1;
 
-	previous_name = (hdr_version == 4) ? &previous_name_buf : NULL;
+	previous_name = (hdr_version >= 4) ? &previous_name_buf : NULL;
 
 	for (i = 0; i < entries; i++) {
 		struct cache_entry *ce = cache[i];
@@ -2334,7 +2388,10 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
 
 			drop_cache_tree = 1;
 		}
-		if (ce_write_entry(&c, newfd, ce, previous_name, (struct ondisk_cache_entry *)&ondisk) < 0)
+		if (ce_write_entry(&c, newfd, ce,
+				   hdr_version >= 5 && i > 0 ? cache[i - 1] : NULL,
+				   previous_name,
+				   (struct ondisk_cache_entry *)&ondisk) < 0)
 			err = -1;
 
 		if (err)
-- 8< --
--
Duy

  parent reply	other threads:[~2018-08-05  8:57 UTC|newest]

Thread overview: 32+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2018-07-24 18:01 [PATCH v1] checkout: optionally speed up "git checkout -b foo" Ben Peart
2018-07-24 18:42 ` Eric Sunshine
2018-07-24 19:45   ` Ben Peart
2018-07-26 15:04     ` Junio C Hamano
2018-07-26 18:59       ` Eric Sunshine
2018-07-26 19:08         ` Eric Sunshine
2018-07-24 19:21 ` Junio C Hamano
2018-07-24 20:47   ` Ben Peart
2018-07-31 16:39 ` [PATCH v2] checkout: optimize "git checkout -b <new_branch>" Ben Peart
2018-07-31 20:01   ` Junio C Hamano
2018-08-01 15:10   ` Duy Nguyen
2018-08-02 18:02     ` Ben Peart
2018-08-03 15:58       ` Duy Nguyen
2018-08-06 14:25         ` Ben Peart
2018-08-15 21:05           ` Ben Peart
2018-08-05  8:57       ` Duy Nguyen [this message]
2018-08-16 18:27 ` [PATCH v3] " Ben Peart
2018-08-16 18:37   ` Duy Nguyen
2018-08-17 12:37     ` Ben Peart
2018-08-19  1:44       ` Elijah Newren
2018-08-20 13:40         ` Ben Peart
2018-08-20 18:16           ` Elijah Newren
2018-08-21 14:51             ` Duy Nguyen
2018-08-30 17:22               ` Elijah Newren
2018-09-04 16:46                 ` Duy Nguyen
2018-08-20 18:31         ` Junio C Hamano
2018-09-18  5:34   ` [PATCH] config doc: add missing list separator for checkout.optimizeNewBranch Ævar Arnfjörð Bjarmason
2018-09-18 16:57     ` Taylor Blau
2018-09-18 17:16       ` Jeff King
2018-09-18 17:20         ` Taylor Blau
2018-09-18 17:13     ` Jeff King
2018-09-19  4:41       ` Ævar Arnfjörð Bjarmason

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20180805085718.GA27625@duynguyen.home \
    --to=pclouds@gmail.com \
    --cc=Ben.Peart@microsoft.com \
    --cc=git@vger.kernel.org \
    --cc=gitster@pobox.com \
    --cc=peartben@gmail.com \
    --cc=sunshine@sunshineco.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).