All of lore.kernel.org
 help / color / mirror / Atom feed
From: Elijah Newren <newren@gmail.com>
To: Goss Geppert <gg.oss.dev@gmail.com>
Cc: Git Mailing List <git@vger.kernel.org>,
	Junio C Hamano <gitster@pobox.com>,
	christian w <usebees@gmail.com>,
	Derrick Stolee <derrickstolee@github.com>
Subject: Re: [PATCH v3 2/3] dir: cache git_dir's realpath
Date: Tue, 24 May 2022 07:32:39 -0700	[thread overview]
Message-ID: <CABPp-BGkGu8NZ6wOiMFEVQ+s=28CUMiD-+upWweAtZBjg9ciAA@mail.gmail.com> (raw)
In-Reply-To: <20220520192840.8942-3-ggossdev@gmail.com>

On Fri, May 20, 2022 at 12:29 PM Goss Geppert <gg.oss.dev@gmail.com> wrote:
>
> When traversing the directory tree, the realpath of the `git_dir` is
> required each time a nested repository is encountered to determine how
> the nested repository should be treated.  To prevent excessive resource
> usage in pathological cases (e.g. many nested repositories, a long path,
> symlinks, etc.), cache the realpath after the first usage until the
> traversal is complete.
> ---
>  dir.c | 63 ++++++++++++++++++++++++++++++++++++++++-------------------
>  1 file changed, 43 insertions(+), 20 deletions(-)
>
> diff --git a/dir.c b/dir.c
> index e45e117875..45b89560fc 100644
> --- a/dir.c
> +++ b/dir.c
> @@ -47,10 +47,22 @@ struct cached_dir {
>         struct untracked_cache_dir *ucd;
>  };
>
> +/*
> + * Support data structure for the recursing into the directory tree.
> + */
> +struct traversal_cache {
> +       /*
> +        * The realpath for the `git_dir` may be required in the recursion and is
> +        * cached for repeated use.
> +        */
> +       char *real_gitdir;
> +};
> +
>  static enum path_treatment read_directory_recursive(struct dir_struct *dir,
>         struct index_state *istate, const char *path, int len,
>         struct untracked_cache_dir *untracked,
> -       int check_only, int stop_at_first_file, const struct pathspec *pathspec);
> +       int check_only, int stop_at_first_file, const struct pathspec *pathspec,
> +       struct traversal_cache *cache);
>  static int resolve_dtype(int dtype, struct index_state *istate,
>                          const char *path, int len);
>  struct dirent *readdir_skip_dot_and_dotdot(DIR *dirp)
> @@ -1852,7 +1864,8 @@ static enum path_treatment treat_directory(struct dir_struct *dir,
>         struct index_state *istate,
>         struct untracked_cache_dir *untracked,
>         const char *dirname, int len, int baselen, int excluded,
> -       const struct pathspec *pathspec)
> +       const struct pathspec *pathspec,
> +       struct traversal_cache *cache)
>  {
>         /*
>          * WARNING: From this function, you can return path_recurse or you
> @@ -1906,13 +1919,13 @@ static enum path_treatment treat_directory(struct dir_struct *dir,
>                 nested_repo = is_nonbare_repository_dir(&sb);
>
>                 if (nested_repo) {
> -                       char *real_dirname, *real_gitdir;
> +                       char *real_dirname;
>                         strbuf_addstr(&sb, ".git");
>                         real_dirname = real_pathdup(sb.buf, 1);
> -                       real_gitdir = real_pathdup(the_repository->gitdir, 1);
> +                       if (!cache->real_gitdir)
> +                               cache->real_gitdir = real_pathdup(the_repository->gitdir, 1);
>
> -                       nested_repo = !!strcmp(real_dirname, real_gitdir);
> -                       free(real_gitdir);
> +                       nested_repo = !!strcmp(real_dirname, cache->real_gitdir);
>                         free(real_dirname);
>                 }
>                 strbuf_release(&sb);
> @@ -1944,7 +1957,7 @@ static enum path_treatment treat_directory(struct dir_struct *dir,
>                                 return path_excluded;
>
>                         if (read_directory_recursive(dir, istate, dirname, len,
> -                                                    untracked, 1, 1, pathspec) == path_excluded)
> +                                                    untracked, 1, 1, pathspec, cache) == path_excluded)
>                                 return path_excluded;
>
>                         return path_none;
> @@ -2045,7 +2058,7 @@ static enum path_treatment treat_directory(struct dir_struct *dir,
>         untracked = lookup_untracked(dir->untracked, untracked,
>                                      dirname + baselen, len - baselen);
>         state = read_directory_recursive(dir, istate, dirname, len, untracked,
> -                                        check_only, stop_early, pathspec);
> +                                        check_only, stop_early, pathspec, cache);
>
>         /* There are a variety of reasons we may need to fixup the state... */
>         if (state == path_excluded) {
> @@ -2242,7 +2255,8 @@ static enum path_treatment treat_path_fast(struct dir_struct *dir,
>                                            struct index_state *istate,
>                                            struct strbuf *path,
>                                            int baselen,
> -                                          const struct pathspec *pathspec)
> +                                          const struct pathspec *pathspec,
> +                                          struct traversal_cache *cache)
>  {
>         /*
>          * WARNING: From this function, you can return path_recurse or you
> @@ -2264,7 +2278,7 @@ static enum path_treatment treat_path_fast(struct dir_struct *dir,
>                  * with check_only set.
>                  */
>                 return read_directory_recursive(dir, istate, path->buf, path->len,
> -                                               cdir->ucd, 1, 0, pathspec);
> +                                               cdir->ucd, 1, 0, pathspec, cache);
>         /*
>          * We get path_recurse in the first run when
>          * directory_exists_in_index() returns index_nonexistent. We
> @@ -2280,13 +2294,14 @@ static enum path_treatment treat_path(struct dir_struct *dir,
>                                       struct index_state *istate,
>                                       struct strbuf *path,
>                                       int baselen,
> -                                     const struct pathspec *pathspec)
> +                                     const struct pathspec *pathspec,
> +                                     struct traversal_cache *cache)
>  {
>         int has_path_in_index, dtype, excluded;
>
>         if (!cdir->d_name)
>                 return treat_path_fast(dir, cdir, istate, path,
> -                                      baselen, pathspec);
> +                                      baselen, pathspec, cache);
>         if (is_dot_or_dotdot(cdir->d_name) || !fspathcmp(cdir->d_name, ".git"))
>                 return path_none;
>         strbuf_setlen(path, baselen);
> @@ -2348,7 +2363,7 @@ static enum path_treatment treat_path(struct dir_struct *dir,
>                 strbuf_addch(path, '/');
>                 return treat_directory(dir, istate, untracked,
>                                        path->buf, path->len,
> -                                      baselen, excluded, pathspec);
> +                                      baselen, excluded, pathspec, cache);
>         case DT_REG:
>         case DT_LNK:
>                 if (pathspec &&
> @@ -2549,7 +2564,8 @@ static void add_path_to_appropriate_result_list(struct dir_struct *dir,
>  static enum path_treatment read_directory_recursive(struct dir_struct *dir,
>         struct index_state *istate, const char *base, int baselen,
>         struct untracked_cache_dir *untracked, int check_only,
> -       int stop_at_first_file, const struct pathspec *pathspec)
> +       int stop_at_first_file, const struct pathspec *pathspec,
> +       struct traversal_cache *cache)
>  {
>         /*
>          * WARNING: Do NOT recurse unless path_recurse is returned from
> @@ -2572,7 +2588,7 @@ static enum path_treatment read_directory_recursive(struct dir_struct *dir,
>         while (!read_cached_dir(&cdir)) {
>                 /* check how the file or directory should be treated */
>                 state = treat_path(dir, untracked, &cdir, istate, &path,
> -                                  baselen, pathspec);
> +                                  baselen, pathspec, cache);
>                 dir->visited_paths++;
>
>                 if (state > dir_state)
> @@ -2587,7 +2603,7 @@ static enum path_treatment read_directory_recursive(struct dir_struct *dir,
>                         subdir_state =
>                                 read_directory_recursive(dir, istate, path.buf,
>                                                          path.len, ud,
> -                                                        check_only, stop_at_first_file, pathspec);
> +                                                        check_only, stop_at_first_file, pathspec, cache);
>                         if (subdir_state > dir_state)
>                                 dir_state = subdir_state;
>
> @@ -2661,7 +2677,8 @@ int check_dir_entry_contains(const struct dir_entry *out, const struct dir_entry
>  static int treat_leading_path(struct dir_struct *dir,
>                               struct index_state *istate,
>                               const char *path, int len,
> -                             const struct pathspec *pathspec)
> +                             const struct pathspec *pathspec,
> +                             struct traversal_cache *cache)
>  {
>         struct strbuf sb = STRBUF_INIT;
>         struct strbuf subdir = STRBUF_INIT;
> @@ -2713,7 +2730,8 @@ static int treat_leading_path(struct dir_struct *dir,
>                 strbuf_reset(&subdir);
>                 strbuf_add(&subdir, path+prevlen, baselen-prevlen);
>                 cdir.d_name = subdir.buf;
> -               state = treat_path(dir, NULL, &cdir, istate, &sb, prevlen, pathspec);
> +
> +               state = treat_path(dir, NULL, &cdir, istate, &sb, prevlen, pathspec, cache);
>
>                 if (state != path_recurse)
>                         break; /* do not recurse into it */
> @@ -2986,6 +3004,9 @@ int read_directory(struct dir_struct *dir, struct index_state *istate,
>                    const char *path, int len, const struct pathspec *pathspec)
>  {
>         struct untracked_cache_dir *untracked;
> +       struct traversal_cache cache = {
> +               .real_gitdir = NULL,
> +       };
>
>         trace2_region_enter("dir", "read_directory", istate->repo);
>         dir->visited_paths = 0;
> @@ -3003,8 +3024,10 @@ int read_directory(struct dir_struct *dir, struct index_state *istate,
>                  * e.g. prep_exclude()
>                  */
>                 dir->untracked = NULL;
> -       if (!len || treat_leading_path(dir, istate, path, len, pathspec))
> -               read_directory_recursive(dir, istate, path, len, untracked, 0, 0, pathspec);
> +       if (!len || treat_leading_path(dir, istate, path, len, pathspec, &cache)) {
> +               read_directory_recursive(dir, istate, path, len, untracked, 0, 0, pathspec, &cache);
> +       }
> +       FREE_AND_NULL(cache.real_gitdir);
>         QSORT(dir->entries, dir->nr, cmp_dir_entry);
>         QSORT(dir->ignored, dir->ignored_nr, cmp_dir_entry);
>
> --
> 2.36.0

This makes sense as a way to avoid the call to real_pathdup() if
that's what we want, but I feel it's a micro-optimization that isn't
worth this code churn.  I'd rather drop this patch from the series and
just use the real_pathdup() you had before (see
https://lore.kernel.org/git/CABPp-BGXRzYCvyM38dEUvQ125+VtRu++7L9UiRz98u+1=Lov7A@mail.gmail.com/).
Let's see what Junio says, but I'm hoping he agrees this patch isn't
needed.

  reply	other threads:[~2022-05-24 14:32 UTC|newest]

Thread overview: 31+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-05-05 20:32 [RFC PATCH 0/1] dir: consider worktree config in path recursion Goss Geppert
2022-05-05 20:32 ` [RFC PATCH 1/1] " Goss Geppert
2022-05-07  3:26   ` Elijah Newren
2022-05-07 17:59     ` oss dev
2022-05-06 17:02 ` [RFC PATCH 0/1] " Junio C Hamano
2022-05-06 20:00   ` oss dev
2022-05-10 17:15 ` [PATCH v2 0/2] " Goss Geppert
2022-05-10 17:15   ` [PATCH v2 1/2] " Goss Geppert
2022-05-11 16:37     ` Junio C Hamano
2022-05-20 19:45       ` oss dev
2022-05-24 14:29       ` Elijah Newren
2022-05-24 19:45         ` Junio C Hamano
2022-05-25  3:46           ` Elijah Newren
2022-05-11 23:07     ` Junio C Hamano
2022-05-20 20:01       ` oss dev
2022-05-23 19:23     ` Derrick Stolee
2022-05-30 18:48       ` oss dev
2022-05-10 17:15   ` [PATCH v2 2/2] dir: minor refactoring / clean-up Goss Geppert
2022-05-11 16:51     ` Junio C Hamano
2022-05-20 20:03       ` oss dev
2022-05-20 19:28 ` [PATCH v3 0/3] dir: traverse into repository Goss Geppert
2022-05-20 19:28   ` [PATCH v3 1/3] " Goss Geppert
2022-05-20 19:28   ` [PATCH v3 2/3] dir: cache git_dir's realpath Goss Geppert
2022-05-24 14:32     ` Elijah Newren [this message]
2022-05-20 19:28   ` [PATCH v3 3/3] dir: minor refactoring / clean-up Goss Geppert
2022-06-16 23:19 ` [PATCH v4 0/2] dir: traverse into repository Goss Geppert
2022-06-22  4:57   ` Elijah Newren
     [not found] ` <20220616231956.154-1-gg.oss@outlook.com>
2022-06-16 23:19   ` [PATCH v4 1/2] " Goss Geppert
2022-06-16 23:44 ` [PATCH v4 0/2] dir: traverse into repository (resending) Goss Geppert
     [not found] ` <20220616234433.225-1-gg.oss@outlook.com>
2022-06-16 23:44   ` [PATCH v4 1/2] dir: traverse into repository Goss Geppert
2022-06-16 23:44   ` [PATCH v4 2/2] dir: minor refactoring / clean-up Goss Geppert

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to='CABPp-BGkGu8NZ6wOiMFEVQ+s=28CUMiD-+upWweAtZBjg9ciAA@mail.gmail.com' \
    --to=newren@gmail.com \
    --cc=derrickstolee@github.com \
    --cc=gg.oss.dev@gmail.com \
    --cc=git@vger.kernel.org \
    --cc=gitster@pobox.com \
    --cc=usebees@gmail.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.