All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] refs.c: get_ref_cache: use a bucket hash
@ 2015-03-16 14:20 Andreas Krey
  2015-03-16 17:19 ` Thomas Gummerer
  2015-03-16 17:23 ` Junio C Hamano
  0 siblings, 2 replies; 12+ messages in thread
From: Andreas Krey @ 2015-03-16 14:20 UTC (permalink / raw)
  To: Git Mailing List; +Cc: Junio C Hamano

get_ref_cache used a linear list, which obviously is O(n^2).
Use a fixed bucket hash which just takes a factor of 100000
(~ 317^2) out of the n^2 - which is enough.

Signed-off-by: Andreas Krey <a.krey@gmx.de>
---

This brings 'git clean -ndx' times down from 17 minutes
to 11 seconds on one of our workspaces (which accumulated
a lot of ignored directories). Actuallly using adaptive
hashing or other structures seems overkill.

 refs.c | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/refs.c b/refs.c
index e23542b..8198d9e 100644
--- a/refs.c
+++ b/refs.c
@@ -982,6 +982,8 @@ struct packed_ref_cache {
 	struct stat_validity validity;
 };
 
+#define REF_CACHE_HASH 317
+
 /*
  * Future: need to be in "struct repository"
  * when doing a full libification.
@@ -996,7 +998,7 @@ static struct ref_cache {
 	 * is initialized correctly.
 	 */
 	char name[1];
-} ref_cache, *submodule_ref_caches;
+} ref_cache, *submodule_ref_caches[REF_CACHE_HASH];
 
 /* Lock used for the main packed-refs file: */
 static struct lock_file packlock;
@@ -1065,18 +1067,19 @@ static struct ref_cache *create_ref_cache(const char *submodule)
  */
 static struct ref_cache *get_ref_cache(const char *submodule)
 {
-	struct ref_cache *refs;
+	struct ref_cache *refs, **bucketp;
+	bucketp = submodule_ref_caches + strhash(submodule) % REF_CACHE_HASH;
 
 	if (!submodule || !*submodule)
 		return &ref_cache;
 
-	for (refs = submodule_ref_caches; refs; refs = refs->next)
+	for (refs = *bucketp; refs; refs = refs->next)
 		if (!strcmp(submodule, refs->name))
 			return refs;
 
 	refs = create_ref_cache(submodule);
-	refs->next = submodule_ref_caches;
-	submodule_ref_caches = refs;
+	refs->next = *bucketp;
+	*bucketp = refs;
 	return refs;
 }
 
-- 
2.3.2.223.g7a9409c

^ permalink raw reply related	[flat|nested] 12+ messages in thread

* Re: [PATCH] refs.c: get_ref_cache: use a bucket hash
  2015-03-16 14:20 [PATCH] refs.c: get_ref_cache: use a bucket hash Andreas Krey
@ 2015-03-16 17:19 ` Thomas Gummerer
  2015-03-16 17:23 ` Junio C Hamano
  1 sibling, 0 replies; 12+ messages in thread
From: Thomas Gummerer @ 2015-03-16 17:19 UTC (permalink / raw)
  To: Andreas Krey; +Cc: Git Mailing List, Junio C Hamano

Hi,

On 03/16, Andreas Krey wrote:
> get_ref_cache used a linear list, which obviously is O(n^2).
> Use a fixed bucket hash which just takes a factor of 100000
> (~ 317^2) out of the n^2 - which is enough.
>
> Signed-off-by: Andreas Krey <a.krey@gmx.de>
> ---
>
> This brings 'git clean -ndx' times down from 17 minutes
> to 11 seconds on one of our workspaces (which accumulated
> a lot of ignored directories). Actuallly using adaptive
> hashing or other structures seems overkill.
>
>  refs.c | 13 ++++++++-----
>  1 file changed, 8 insertions(+), 5 deletions(-)
>
> diff --git a/refs.c b/refs.c
> index e23542b..8198d9e 100644
> --- a/refs.c
> +++ b/refs.c
> @@ -982,6 +982,8 @@ struct packed_ref_cache {
>  	struct stat_validity validity;
>  };
>
> +#define REF_CACHE_HASH 317
> +
>  /*
>   * Future: need to be in "struct repository"
>   * when doing a full libification.
> @@ -996,7 +998,7 @@ static struct ref_cache {
>  	 * is initialized correctly.
>  	 */
>  	char name[1];
> -} ref_cache, *submodule_ref_caches;
> +} ref_cache, *submodule_ref_caches[REF_CACHE_HASH];
>
>  /* Lock used for the main packed-refs file: */
>  static struct lock_file packlock;
> @@ -1065,18 +1067,19 @@ static struct ref_cache *create_ref_cache(const char *submodule)
>   */
>  static struct ref_cache *get_ref_cache(const char *submodule)
>  {
> -	struct ref_cache *refs;
> +	struct ref_cache *refs, **bucketp;
> +	bucketp = submodule_ref_caches + strhash(submodule) % REF_CACHE_HASH;
>

This breaks the test-suite for me, in the cases where submodule is
NULL.  How about something like this on top?

diff --git a/refs.c b/refs.c
index 8198d9e..311faf2 100644
--- a/refs.c
+++ b/refs.c
@@ -1068,7 +1068,9 @@ static struct ref_cache *create_ref_cache(const char *submodule)
 static struct ref_cache *get_ref_cache(const char *submodule)
 {
        struct ref_cache *refs, **bucketp;
-       bucketp = submodule_ref_caches + strhash(submodule) % REF_CACHE_HASH;
+       bucketp = submodule_ref_caches;
+       if (submodule)
+               bucketp += strhash(submodule) % REF_CACHE_HASH;

        if (!submodule || !*submodule)
                return &ref_cache;

>  	if (!submodule || !*submodule)
>  		return &ref_cache;
>
> -	for (refs = submodule_ref_caches; refs; refs = refs->next)
> +	for (refs = *bucketp; refs; refs = refs->next)
>  		if (!strcmp(submodule, refs->name))
>  			return refs;
>
>  	refs = create_ref_cache(submodule);
> -	refs->next = submodule_ref_caches;
> -	submodule_ref_caches = refs;
> +	refs->next = *bucketp;
> +	*bucketp = refs;
>  	return refs;
>  }
>
> --
> 2.3.2.223.g7a9409c
> --
> To unsubscribe from this list: send the line "unsubscribe git" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related	[flat|nested] 12+ messages in thread

* Re: [PATCH] refs.c: get_ref_cache: use a bucket hash
  2015-03-16 14:20 [PATCH] refs.c: get_ref_cache: use a bucket hash Andreas Krey
  2015-03-16 17:19 ` Thomas Gummerer
@ 2015-03-16 17:23 ` Junio C Hamano
  2015-03-16 18:40   ` Andreas Krey
  1 sibling, 1 reply; 12+ messages in thread
From: Junio C Hamano @ 2015-03-16 17:23 UTC (permalink / raw)
  To: Andreas Krey; +Cc: Git Mailing List

Andreas Krey <a.krey@gmx.de> writes:

> get_ref_cache used a linear list, which obviously is O(n^2).
> Use a fixed bucket hash which just takes a factor of 100000
> (~ 317^2) out of the n^2 - which is enough.
>
> Signed-off-by: Andreas Krey <a.krey@gmx.de>
> ---
>
> This brings 'git clean -ndx' times down from 17 minutes
> to 11 seconds on one of our workspaces (which accumulated
> a lot of ignored directories).

Nice.

These impressive numbers should go to the commit log message,
together with a bit more numbers to characterise the shape of the
repository that exhibits the problem with the original code.  You
say "a lot of ignored directories", but do you mean directories in
the working tree (which I suppose do not have much to do with the
submodule_ref_caches[])?  I am guessing that the repository has tons
of submodules?  How many is "tons" to make the pain noticeable?

> Actuallly using adaptive
> hashing or other structures seems overkill.

Perhaps _implementing_ these structures only for this codepath may
be overkill, but would it be an overkill to _use_ existing hashmap.c
implementation?  After all, those who wrote the original would have
thought that anything more complex than a linear list would be
overkill, and nobody disagreed until you found that your repository
disagreed with that assumption ;-)

>  refs.c | 13 ++++++++-----
>  1 file changed, 8 insertions(+), 5 deletions(-)
>
> diff --git a/refs.c b/refs.c
> index e23542b..8198d9e 100644
> --- a/refs.c
> +++ b/refs.c
> @@ -982,6 +982,8 @@ struct packed_ref_cache {
>  	struct stat_validity validity;
>  };
>  
> +#define REF_CACHE_HASH 317
> +
>  /*
>   * Future: need to be in "struct repository"
>   * when doing a full libification.
> @@ -996,7 +998,7 @@ static struct ref_cache {
>  	 * is initialized correctly.
>  	 */
>  	char name[1];
> -} ref_cache, *submodule_ref_caches;
> +} ref_cache, *submodule_ref_caches[REF_CACHE_HASH];
>  
>  /* Lock used for the main packed-refs file: */
>  static struct lock_file packlock;
> @@ -1065,18 +1067,19 @@ static struct ref_cache *create_ref_cache(const char *submodule)
>   */
>  static struct ref_cache *get_ref_cache(const char *submodule)
>  {
> -	struct ref_cache *refs;
> +	struct ref_cache *refs, **bucketp;
> +	bucketp = submodule_ref_caches + strhash(submodule) % REF_CACHE_HASH;
>  
>  	if (!submodule || !*submodule)
>  		return &ref_cache;
>  
> -	for (refs = submodule_ref_caches; refs; refs = refs->next)
> +	for (refs = *bucketp; refs; refs = refs->next)
>  		if (!strcmp(submodule, refs->name))
>  			return refs;
>  
>  	refs = create_ref_cache(submodule);
> -	refs->next = submodule_ref_caches;
> -	submodule_ref_caches = refs;
> +	refs->next = *bucketp;
> +	*bucketp = refs;
>  	return refs;
>  }

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] refs.c: get_ref_cache: use a bucket hash
  2015-03-16 17:23 ` Junio C Hamano
@ 2015-03-16 18:40   ` Andreas Krey
  2015-03-17  2:40     ` Jeff King
  0 siblings, 1 reply; 12+ messages in thread
From: Andreas Krey @ 2015-03-16 18:40 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: Git Mailing List

On Mon, 16 Mar 2015 10:23:05 +0000, Junio C Hamano wrote:
> Andreas Krey <a.krey@gmx.de> writes:
> 
...
> say "a lot of ignored directories", but do you mean directories in
> the working tree (which I suppose do not have much to do with the
> submodule_ref_caches[])?

Apparently, they do.

>I am guessing that the repository has tons
> of submodules?

Not a single one. Thats's thie interesting thing that
makes me think I'm not actually solving the right problem.

This repo has about 100k subdirectories that are ignored
(I don't know whether directly or within ignored dirs),
and strace said that git looks for '.git/HEAD' and one
other file in each of these. Apparently it trieds to
find out if any of these dirs happen to be a git repo
which git clean treats specially, but it seems it also
calls get_ref_cache for each of these dires even though
the turn out not to be a sub-repo.

In other words: I suspect that get_ref_cache shouldn't
be called that often, or that the cache entries should
be removed once a directory is found not to be a sub repo.
Then the linear list wouldn't really hurt.

I'll look into that tomorrow, and also into the hashmap API.

Andreas

-- 
"Totally trivial. Famous last words."
From: Linus Torvalds <torvalds@*.org>
Date: Fri, 22 Jan 2010 07:29:21 -0800

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] refs.c: get_ref_cache: use a bucket hash
  2015-03-16 18:40   ` Andreas Krey
@ 2015-03-17  2:40     ` Jeff King
  2015-03-17  5:35       ` Junio C Hamano
  0 siblings, 1 reply; 12+ messages in thread
From: Jeff King @ 2015-03-17  2:40 UTC (permalink / raw)
  To: Andreas Krey; +Cc: Michael Haggerty, Junio C Hamano, Git Mailing List

[+cc Michael for get_ref_cache wisdom]

On Mon, Mar 16, 2015 at 07:40:40PM +0100, Andreas Krey wrote:

> >I am guessing that the repository has tons
> > of submodules?
> 
> Not a single one. Thats's thie interesting thing that
> makes me think I'm not actually solving the right problem.
> 
> This repo has about 100k subdirectories that are ignored
> (I don't know whether directly or within ignored dirs),
> and strace said that git looks for '.git/HEAD' and one
> other file in each of these. Apparently it trieds to
> find out if any of these dirs happen to be a git repo
> which git clean treats specially, but it seems it also
> calls get_ref_cache for each of these dires even though
> the turn out not to be a sub-repo.
> 
> In other words: I suspect that get_ref_cache shouldn't
> be called that often, or that the cache entries should
> be removed once a directory is found not to be a sub repo.
> Then the linear list wouldn't really hurt.

Yeah, I'd agree.

The get_ref_cache code was designed to scale to the actual number of
submodules. I do not mind seeing it become a hash if people really do
have a large number of submodules, but that is not what is happening
here.

Bisecting, it looks like things got slow for your case starting in
f538a91 (git-clean: Display more accurate delete messages, 2013-01-11).
I reproduced with basically:

  git init
  for i in $(seq 30000); do mkdir $i; done
  time git clean -nd >/dev/null

It jumps in that commit from ~50ms to ~3000ms.

A backtrace from get_ref_cache shows:

  #0  get_ref_cache (submodule=0xa6a4f0 "1") at refs.c:1070
  #1  0x0000000000516469 in resolve_gitlink_ref (path=0xa6a4d0 "1/", refname=0x584822 "HEAD", 
      sha1=0x7fffffffde90 "\002") at refs.c:1429
  #2  0x0000000000423584 in remove_dirs (path=0x7fffffffe2f0, prefix=0x0, force_flag=2, dry_run=1, quiet=0, 
      dir_gone=0x7fffffffe314) at builtin/clean.c:164
  #3  0x00000000004255a9 in cmd_clean (argc=0, argv=0x7fffffffe5e0, prefix=0x0) at builtin/clean.c:981
  #4  0x0000000000405554 in run_builtin (p=0x7f7b18 <commands+408>, argc=2, argv=0x7fffffffe5e0) at git.c:348
  #5  0x0000000000405761 in handle_builtin (argc=2, argv=0x7fffffffe5e0) at git.c:530
  #6  0x000000000040587d in run_argv (argcp=0x7fffffffe4cc, argv=0x7fffffffe4d8) at git.c:576
  #7  0x0000000000405a6e in main (argc=2, av=0x7fffffffe5d8) at git.c:685

So git-clean speculatively asks "what is HEAD in this maybe-submodule?". The
right solution is probably one of:

  1. In remove_dirs, find out if we have an actual submodule before calling
     resolve_gitlink_ref.

  2. Teach get_ref_cache a "read-only" mode that will not auto-vivify the cache
     if it does not already exist.

Of the two, I think (1) is probably cleaner (I think the way the ref
code is structured, we have to create the submodule ref_cache in order
to start looking things up in it).

It looks like we don't even really care about the value of HEAD. We just
want to know "is it a git directory?". I think in other places (like
"git add"), we just do an existence check for "$dir/.git". That would
not catch a bare repository, but I do not think the current check does
either (it is looking for submodules, which always have a .git).

Maybe something like (largely untested):

diff --git a/builtin/clean.c b/builtin/clean.c
index 98c103f..e2cc47b 100644
--- a/builtin/clean.c
+++ b/builtin/clean.c
@@ -148,6 +148,32 @@ static int exclude_cb(const struct option *opt, const char *arg, int unset)
 	return 0;
 }
 
+static int dir_is_repo(struct strbuf *path)
+{
+	size_t orig = path->len;
+	int ret;
+
+	strbuf_addstr(path, "/.git");
+	if (!access(path->buf, F_OK))
+		ret = 1; /* definitely */
+	else if (errno == ENOENT)
+		ret = 0; /* definitely not */
+	else {
+		/*
+		 * We couldn't tell. It would probably be safer to err
+		 * on the side of saying "yes" here, because we are
+		 * deciding what to delete, and are more likely to keep
+		 * a sub-repo. But it would probably also create annoying
+		 * false positives, where a directory we do not have
+		 * permission to read would say something misleading
+		 * like "not deleting sub-repo foo..."
+		 */
+		ret = 0;
+	}
+	strbuf_setlen(path, orig);
+	return ret;
+}
+
 static int remove_dirs(struct strbuf *path, const char *prefix, int force_flag,
 		int dry_run, int quiet, int *dir_gone)
 {
@@ -155,13 +181,11 @@ static int remove_dirs(struct strbuf *path, const char *prefix, int force_flag,
 	struct strbuf quoted = STRBUF_INIT;
 	struct dirent *e;
 	int res = 0, ret = 0, gone = 1, original_len = path->len, len;
-	unsigned char submodule_head[20];
 	struct string_list dels = STRING_LIST_INIT_DUP;
 
 	*dir_gone = 1;
 
-	if ((force_flag & REMOVE_DIR_KEEP_NESTED_GIT) &&
-			!resolve_gitlink_ref(path->buf, "HEAD", submodule_head)) {
+	if ((force_flag & REMOVE_DIR_KEEP_NESTED_GIT) && dir_is_repo(path)) {
 		if (!quiet) {
 			quote_path_relative(path->buf, prefix, &quoted);
 			printf(dry_run ?  _(msg_would_skip_git_dir) : _(msg_skip_git_dir),

^ permalink raw reply related	[flat|nested] 12+ messages in thread

* Re: [PATCH] refs.c: get_ref_cache: use a bucket hash
  2015-03-17  2:40     ` Jeff King
@ 2015-03-17  5:35       ` Junio C Hamano
  2015-03-17  5:48         ` Jeff King
  0 siblings, 1 reply; 12+ messages in thread
From: Junio C Hamano @ 2015-03-17  5:35 UTC (permalink / raw)
  To: Jeff King; +Cc: Andreas Krey, Michael Haggerty, Git Mailing List

Jeff King <peff@peff.net> writes:

> The get_ref_cache code was designed to scale to the actual number of
> submodules. I do not mind seeing it become a hash if people really do
> have a large number of submodules, but that is not what is happening
> here.
> ...
> So git-clean speculatively asks "what is HEAD in this maybe-submodule?". The
> right solution is probably one of:
>
>   1. In remove_dirs, find out if we have an actual submodule before calling
>      resolve_gitlink_ref.
>
>   2. Teach get_ref_cache a "read-only" mode that will not auto-vivify the cache
>      if it does not already exist.
>
> Of the two, I think (1) is probably cleaner (I think the way the ref
> code is structured, we have to create the submodule ref_cache in order
> to start looking things up in it).

Thanks for a great analysis.  I too wondered if we should be growing
the per-submodule ref-cache when we are only probing.

> It looks like we don't even really care about the value of HEAD. We just
> want to know "is it a git directory?". I think in other places (like
> "git add"), we just do an existence check for "$dir/.git". That would
> not catch a bare repository, but I do not think the current check does
> either (it is looking for submodules, which always have a .git).

If we wanted to be consistent, perhaps we should be reusing the "is
this a git repository?" check used by the auto-discovery codepath
(setup.c:is_git_directory(), perhaps?), but the idea looks simple
enough and sounds sensible.

> Maybe something like (largely untested):
>
> diff --git a/builtin/clean.c b/builtin/clean.c
> index 98c103f..e2cc47b 100644
> --- a/builtin/clean.c
> +++ b/builtin/clean.c
> @@ -148,6 +148,32 @@ static int exclude_cb(const struct option *opt, const char *arg, int unset)
>  	return 0;
>  }
>  
> +static int dir_is_repo(struct strbuf *path)
> +{
> +	size_t orig = path->len;
> +	int ret;
> +
> +	strbuf_addstr(path, "/.git");
> +	if (!access(path->buf, F_OK))
> +		ret = 1; /* definitely */
> +	else if (errno == ENOENT)
> +		ret = 0; /* definitely not */
> +	else {
> +		/*
> +		 * We couldn't tell. It would probably be safer to err
> +		 * on the side of saying "yes" here, because we are
> +		 * deciding what to delete, and are more likely to keep
> +		 * a sub-repo. But it would probably also create annoying
> +		 * false positives, where a directory we do not have
> +		 * permission to read would say something misleading
> +		 * like "not deleting sub-repo foo..."
> +		 */
> +		ret = 0;
> +	}
> +	strbuf_setlen(path, orig);
> +	return ret;
> +}
> +
>  static int remove_dirs(struct strbuf *path, const char *prefix, int force_flag,
>  		int dry_run, int quiet, int *dir_gone)
>  {
> @@ -155,13 +181,11 @@ static int remove_dirs(struct strbuf *path, const char *prefix, int force_flag,
>  	struct strbuf quoted = STRBUF_INIT;
>  	struct dirent *e;
>  	int res = 0, ret = 0, gone = 1, original_len = path->len, len;
> -	unsigned char submodule_head[20];
>  	struct string_list dels = STRING_LIST_INIT_DUP;
>  
>  	*dir_gone = 1;
>  
> -	if ((force_flag & REMOVE_DIR_KEEP_NESTED_GIT) &&
> -			!resolve_gitlink_ref(path->buf, "HEAD", submodule_head)) {
> +	if ((force_flag & REMOVE_DIR_KEEP_NESTED_GIT) && dir_is_repo(path)) {
>  		if (!quiet) {
>  			quote_path_relative(path->buf, prefix, &quoted);
>  			printf(dry_run ?  _(msg_would_skip_git_dir) : _(msg_skip_git_dir),

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] refs.c: get_ref_cache: use a bucket hash
  2015-03-17  5:35       ` Junio C Hamano
@ 2015-03-17  5:48         ` Jeff King
  2015-11-13 15:29           ` Andreas Krey
  0 siblings, 1 reply; 12+ messages in thread
From: Jeff King @ 2015-03-17  5:48 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: Andreas Krey, Michael Haggerty, Git Mailing List

On Mon, Mar 16, 2015 at 10:35:18PM -0700, Junio C Hamano wrote:

> > It looks like we don't even really care about the value of HEAD. We just
> > want to know "is it a git directory?". I think in other places (like
> > "git add"), we just do an existence check for "$dir/.git". That would
> > not catch a bare repository, but I do not think the current check does
> > either (it is looking for submodules, which always have a .git).
> 
> If we wanted to be consistent, perhaps we should be reusing the "is
> this a git repository?" check used by the auto-discovery codepath
> (setup.c:is_git_directory(), perhaps?), but the idea looks simple
> enough and sounds sensible.

Yeah, I almost suggested that, but I'm concerned that would make us
inconsistent with how we report untracked files. I thought that dir.c
used ".git" as a magic token there.

But it seems I'm wrong. We do ignore ".git" directly in treat_path(),
but treat_directory actually checks resolve_gitlink_ref. I think this
will suffer the same problem as Andreas's original issue (e.g., if you
run "git ls-files -o").

Likewise, I think dir.c:remove_dir_recurse is in a similar boat.
Grepping for resolve_gitlink_ref, it looks like there may be others,
too.

All of these should be using the same test, I think. Doing that with
is_git_directory() is probably OK. It is a little more expensive than we
might want for mass-use (it actually opens and parses the HEAD file in
each directory), but it quits early when we _don't_ see a git directory,
which would be the common case here.

-Peff

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] refs.c: get_ref_cache: use a bucket hash
  2015-03-17  5:48         ` Jeff King
@ 2015-11-13 15:29           ` Andreas Krey
  2015-11-14  0:01             ` Jeff King
  0 siblings, 1 reply; 12+ messages in thread
From: Andreas Krey @ 2015-11-13 15:29 UTC (permalink / raw)
  To: Jeff King; +Cc: Junio C Hamano, Michael Haggerty, Git Mailing List

On Tue, 17 Mar 2015 01:48:00 +0000, Jeff King wrote:
> On Mon, Mar 16, 2015 at 10:35:18PM -0700, Junio C Hamano wrote:
> 
> > > It looks like we don't even really care about the value of HEAD. We just
> > > want to know "is it a git directory?". I think in other places (like
> > > "git add"), we just do an existence check for "$dir/.git". That would
> > > not catch a bare repository, but I do not think the current check does
> > > either (it is looking for submodules, which always have a .git).
> > 
> > If we wanted to be consistent, perhaps we should be reusing the "is
> > this a git repository?" check used by the auto-discovery codepath
> > (setup.c:is_git_directory(), perhaps?), but the idea looks simple
> > enough and sounds sensible.
> 
> Yeah, I almost suggested that, but I'm concerned that would make us
> inconsistent with how we report untracked files. I thought that dir.c
> used ".git" as a magic token there.
> 
> But it seems I'm wrong. We do ignore ".git" directly in treat_path(),
> but treat_directory actually checks resolve_gitlink_ref. I think this
> will suffer the same problem as Andreas's original issue (e.g., if you
> run "git ls-files -o").

Guess what landed on my desk this week. Same repo, same
application test suite, same problem, now with 'git ls-files -o'.

> Likewise, I think dir.c:remove_dir_recurse is in a similar boat.
> Grepping for resolve_gitlink_ref, it looks like there may be others,
> too.

Can't we handle this in resolve_gitlink_ref itself? As I understand it,
it should resolve a ref (here "HEAD") when path points to a submodule.
When there isn't one it should return -1, so:

diff --git a/refs.c b/refs.c
index 132eff5..f8648c5 100644
--- a/refs.c
+++ b/refs.c
@@ -1553,6 +1553,10 @@ int resolve_gitlink_ref(const char *path, const char *refname, unsigned char *sh
 	if (!len)
 		return -1;
 	submodule = xstrndup(path, len);
+	if (!is_git_directory(submodule)) {
+		free(submodule);
+		return -1;
+	}
 	refs = get_ref_cache(submodule);
 	free(submodule);

I'm way too little into the code to see what may this may get wrong.

But this, as well as the old hash-ref-cache patch speeds me
up considerably, in this case a git ls-files -o from half a
minute of mostly user CPU to a second.

> All of these should be using the same test, I think. Doing that with
> is_git_directory() is probably OK. It is a little more expensive than we
> might want for mass-use (it actually opens and parses the HEAD file in
> each directory),

This happens as well when we let resolve_gitlink_ref run its old course.
(It (ls-files) even seems to try to open .git and then .git/HEAD, even
if the former fails with ENOENT.)

Andreas

-- 
"Totally trivial. Famous last words."
From: Linus Torvalds <torvalds@*.org>
Date: Fri, 22 Jan 2010 07:29:21 -0800

^ permalink raw reply related	[flat|nested] 12+ messages in thread

* Re: [PATCH] refs.c: get_ref_cache: use a bucket hash
  2015-11-13 15:29           ` Andreas Krey
@ 2015-11-14  0:01             ` Jeff King
  2015-11-14 13:22               ` Andreas Krey
  2015-11-14 13:35               ` Andreas Krey
  0 siblings, 2 replies; 12+ messages in thread
From: Jeff King @ 2015-11-14  0:01 UTC (permalink / raw)
  To: Andreas Krey; +Cc: Junio C Hamano, Michael Haggerty, Git Mailing List

On Fri, Nov 13, 2015 at 04:29:15PM +0100, Andreas Krey wrote:

> > Likewise, I think dir.c:remove_dir_recurse is in a similar boat.
> > Grepping for resolve_gitlink_ref, it looks like there may be others,
> > too.
> 
> Can't we handle this in resolve_gitlink_ref itself? As I understand it,
> it should resolve a ref (here "HEAD") when path points to a submodule.
> When there isn't one it should return -1, so:

I'm not sure. I think part of the change to git-clean was that
is_git_directory() is a _better_ check than "can we resolve HEAD?"
because it covers empty repos, too.

> diff --git a/refs.c b/refs.c
> index 132eff5..f8648c5 100644
> --- a/refs.c
> +++ b/refs.c
> @@ -1553,6 +1553,10 @@ int resolve_gitlink_ref(const char *path, const char *refname, unsigned char *sh
>  	if (!len)
>  		return -1;
>  	submodule = xstrndup(path, len);
> +	if (!is_git_directory(submodule)) {
> +		free(submodule);
> +		return -1;
> +	}
>  	refs = get_ref_cache(submodule);
>  	free(submodule);
> 
> I'm way too little into the code to see what may this may get wrong.

I don't think it produces wrong outcomes, but I think it's sub-optimal.
In cases where we already have a ref cache, we'll hit the filesystem for
each lookup to re-confirm what we already know. That doesn't affect your
case, but it does when we actually _do_ have a submodule.

So if we were to follow this route, I think it would go better in
get_ref_cache itself (right after we determine there is no existing
cache, but before we call create_ref_cache()).

> But this, as well as the old hash-ref-cache patch speeds me
> up considerably, in this case a git ls-files -o from half a
> minute of mostly user CPU to a second.

Right, that makes sense to me.

> > All of these should be using the same test, I think. Doing that with
> > is_git_directory() is probably OK. It is a little more expensive than we
> > might want for mass-use (it actually opens and parses the HEAD file in
> > each directory),
> 
> This happens as well when we let resolve_gitlink_ref run its old course.
> (It (ls-files) even seems to try to open .git and then .git/HEAD, even
> if the former fails with ENOENT.)

Yes, I think my earlier comment that you are quoting was just misguided.
We only do the extra work if the directory actually does look like a
gitdir, and the many-directories case we are optimizing here is the
opposite of that.

So summing up, I think:

  1. We could get by with teaching get_ref_cache not to auto-create ref
     caches for non-git-directories.

  2. But for a little more work, pushing the is_git_directory() check
     out to the call-sites gives us probably saner semantics overall.

-Peff

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] refs.c: get_ref_cache: use a bucket hash
  2015-11-14  0:01             ` Jeff King
@ 2015-11-14 13:22               ` Andreas Krey
  2015-11-14 13:35               ` Andreas Krey
  1 sibling, 0 replies; 12+ messages in thread
From: Andreas Krey @ 2015-11-14 13:22 UTC (permalink / raw)
  To: Jeff King; +Cc: Junio C Hamano, Michael Haggerty, Git Mailing List

On Fri, 13 Nov 2015 19:01:18 +0000, Jeff King wrote:
....
> > Can't we handle this in resolve_gitlink_ref itself? As I understand it,
> > it should resolve a ref (here "HEAD") when path points to a submodule.
> > When there isn't one it should return -1, so:
> 
> I'm not sure. I think part of the change to git-clean was that
> is_git_directory() is a _better_ check than "can we resolve HEAD?"
> because it covers empty repos, too.

I could do

  refs = find_ref_cache(submodule);
  if (!refs && !is_git_directory(....

in resolve_gitlink_ref().

...
> > @@ -1553,6 +1553,10 @@ int resolve_gitlink_ref(const char *path, const char *refname, unsigned char *sh
> >  	if (!len)
> >  		return -1;
> >  	submodule = xstrndup(path, len);
> > +	if (!is_git_directory(submodule)) {
> > +		free(submodule);
> > +		return -1;
> > +	}
> >  	refs = get_ref_cache(submodule);
> >  	free(submodule);
> 
> I don't think it produces wrong outcomes, but I think it's sub-optimal.
> In cases where we already have a ref cache, we'll hit the filesystem for
> each lookup to re-confirm what we already know. That doesn't affect your
> case, but it does when we actually _do_ have a submodule.

I could do

  refs = find_ref_cache(submodule);
  if (!refs && !is_git_directory(....

Also, in my case the current code tries .git/HEAD and .git/packed-refs
for each directory.

> So if we were to follow this route, I think it would go better in
> get_ref_cache itself (right after we determine there is no existing
> cache, but before we call create_ref_cache()).

The stupid part is that get_ref_cache itself only creates the
cache entry and leaved the actual check for later - then it
is too late to not create the cache entry.

Also, when we put it into get_ref_cache we need to return null
for our case and see what for_each_ref_submodule & co make out of that.
That's why I'd like it in resolve_gitlink_ref. :-) Or we put an
no_create parameter into get_ref_cache(). Probably violating
some style guide:

diff --git a/refs.c b/refs.c
index 132eff5..005d0eb 100644
--- a/refs.c
+++ b/refs.c
@@ -1160,7 +1160,7 @@ static struct ref_cache *create_ref_cache(const char *submodule)
  * will be allocated and initialized but not necessarily populated; it
  * should not be freed.
  */
-static struct ref_cache *get_ref_cache(const char *submodule)
+static struct ref_cache *get_ref_cache(const char *submodule, int do_create)
 {
 	struct ref_cache *refs;
 
@@ -1171,6 +1171,9 @@ static struct ref_cache *get_ref_cache(const char *submodule)
 		if (!strcmp(submodule, refs->name))
 			return refs;
 
+	if (!do_create && !is_git_directory(submodule))
+		return 0;
+
 	refs = create_ref_cache(submodule);
 	refs->next = submodule_ref_caches;
 	submodule_ref_caches = refs;
@@ -1553,9 +1556,12 @@ int resolve_gitlink_ref(const char *path, const char *refname, unsigned char *sh
 	if (!len)
 		return -1;
 	submodule = xstrndup(path, len);
-	refs = get_ref_cache(submodule);
+	refs = get_ref_cache(submodule, 0);
 	free(submodule);
 
+	if (!refs)
+		return -1;
+
 	retval = resolve_gitlink_ref_recursive(refs, refname, sha1, 0);
 	return retval;
 }
@@ -2126,7 +2132,7 @@ int for_each_ref(each_ref_fn fn, void *cb_data)
 
 int for_each_ref_submodule(const char *submodule, each_ref_fn fn, void *cb_data)
 {
-	return do_for_each_ref(get_ref_cache(submodule), "", fn, 0, 0, cb_data);
+	return do_for_each_ref(get_ref_cache(submodule, 1), "", fn, 0, 0, cb_data);
 }
 
 int for_each_ref_in(const char *prefix, each_ref_fn fn, void *cb_data)
@@ -2146,7 +2152,7 @@ int for_each_fullref_in(const char *prefix, each_ref_fn fn, void *cb_data, unsig
 int for_each_ref_in_submodule(const char *submodule, const char *prefix,
 		each_ref_fn fn, void *cb_data)
 {
-	return do_for_each_ref(get_ref_cache(submodule), prefix, fn, strlen(prefix), 0, cb_data);
+	return do_for_each_ref(get_ref_cache(submodule, 1), prefix, fn, strlen(prefix), 0, cb_data);
 }
 
 int for_each_tag_ref(each_ref_fn fn, void *cb_data)

(might be nicer to split into find_ref_cache() and get_ref_cache()
instead of adding the parameter).

...
>   2. But for a little more work, pushing the is_git_directory() check
>      out to the call-sites gives us probably saner semantics overall.

Actually, I don't quite think that. The code we push out would be
the same in each place ('is_git_directory() && ...'), wouldn't it?

Andreas

-- 
"Totally trivial. Famous last words."
From: Linus Torvalds <torvalds@*.org>
Date: Fri, 22 Jan 2010 07:29:21 -0800

^ permalink raw reply related	[flat|nested] 12+ messages in thread

* Re: [PATCH] refs.c: get_ref_cache: use a bucket hash
  2015-11-14  0:01             ` Jeff King
  2015-11-14 13:22               ` Andreas Krey
@ 2015-11-14 13:35               ` Andreas Krey
  2015-11-16 16:31                 ` Jeff King
  1 sibling, 1 reply; 12+ messages in thread
From: Andreas Krey @ 2015-11-14 13:35 UTC (permalink / raw)
  To: Jeff King; +Cc: Junio C Hamano, Michael Haggerty, Git Mailing List

On Fri, 13 Nov 2015 19:01:18 +0000, Jeff King wrote:
...
>   2. But for a little more work, pushing the is_git_directory() check
>      out to the call-sites gives us probably saner semantics overall.

Oops, now I get it[1]: You mean replacing resolve_gitlink_ref usages
with is_git_directory, like:

diff --git a/dir.c b/dir.c
index d2a8f06..7765dc6 100644
--- a/dir.c
+++ b/dir.c
@@ -1375,8 +1375,7 @@ static enum path_treatment treat_directory(struct dir_struct *dir,
 		if (dir->flags & DIR_SHOW_OTHER_DIRECTORIES)
 			break;
 		if (!(dir->flags & DIR_NO_GITLINKS)) {
-			unsigned char sha1[20];
-			if (resolve_gitlink_ref(dirname, "HEAD", sha1) == 0)
+			if (is_git_directory(dirname))
 				return path_untracked;
 		}
 		return path_recurse;

That, I like. If it is correct.

Andreas

[1] After reading the introduction of is_git_directory, 0179ca7a62.

-- 
"Totally trivial. Famous last words."
From: Linus Torvalds <torvalds@*.org>
Date: Fri, 22 Jan 2010 07:29:21 -0800

^ permalink raw reply related	[flat|nested] 12+ messages in thread

* Re: [PATCH] refs.c: get_ref_cache: use a bucket hash
  2015-11-14 13:35               ` Andreas Krey
@ 2015-11-16 16:31                 ` Jeff King
  0 siblings, 0 replies; 12+ messages in thread
From: Jeff King @ 2015-11-16 16:31 UTC (permalink / raw)
  To: Andreas Krey; +Cc: Junio C Hamano, Michael Haggerty, Git Mailing List

On Sat, Nov 14, 2015 at 02:35:01PM +0100, Andreas Krey wrote:

> On Fri, 13 Nov 2015 19:01:18 +0000, Jeff King wrote:
> ...
> >   2. But for a little more work, pushing the is_git_directory() check
> >      out to the call-sites gives us probably saner semantics overall.
> 
> Oops, now I get it[1]: You mean replacing resolve_gitlink_ref usages
> with is_git_directory, like:

Yes. I mistakenly said is_git_directory, when I really meant
is_git_repository, the new function added in 0179ca7a62. You seem to
have figured out what I meant, but the critical thing is that we check
"$dir/.git", not just "$dir" (and check it both as a git dir and as a
gitfile, as is_git_repository() does).

I'm not sure if we can simply make that function public or not. It's
mostly straightforward, but it does err on the side of "yes, this is a
git repo" if we see a ".git" file we can't read. I think that's probably
reasonable in most sites, but I didn't look closely.

> diff --git a/dir.c b/dir.c
> index d2a8f06..7765dc6 100644
> --- a/dir.c
> +++ b/dir.c
> @@ -1375,8 +1375,7 @@ static enum path_treatment treat_directory(struct dir_struct *dir,
>  		if (dir->flags & DIR_SHOW_OTHER_DIRECTORIES)
>  			break;
>  		if (!(dir->flags & DIR_NO_GITLINKS)) {
> -			unsigned char sha1[20];
> -			if (resolve_gitlink_ref(dirname, "HEAD", sha1) == 0)
> +			if (is_git_directory(dirname))
>  				return path_untracked;
>  		}
>  		return path_recurse;
> 
> That, I like. If it is correct.

Yes, that's what I had in mind, modulo the directory/repository thing
above (the is_git_repository function also takes a strbuf, so we'd need
to handle that extra allocation somewhere).

-Peff

^ permalink raw reply	[flat|nested] 12+ messages in thread

end of thread, other threads:[~2015-11-16 16:31 UTC | newest]

Thread overview: 12+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-03-16 14:20 [PATCH] refs.c: get_ref_cache: use a bucket hash Andreas Krey
2015-03-16 17:19 ` Thomas Gummerer
2015-03-16 17:23 ` Junio C Hamano
2015-03-16 18:40   ` Andreas Krey
2015-03-17  2:40     ` Jeff King
2015-03-17  5:35       ` Junio C Hamano
2015-03-17  5:48         ` Jeff King
2015-11-13 15:29           ` Andreas Krey
2015-11-14  0:01             ` Jeff King
2015-11-14 13:22               ` Andreas Krey
2015-11-14 13:35               ` Andreas Krey
2015-11-16 16:31                 ` Jeff King

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.