All of lore.kernel.org
 help / color / mirror / Atom feed
From: Emily Shaffer <emilyshaffer@google.com>
To: Calvin Wan <calvinwan@google.com>
Cc: git@vger.kernel.org
Subject: Re: [PATCH 4/4] diff-lib: parallelize run_diff_files for submodules
Date: Tue, 27 Sep 2022 11:40:15 -0700	[thread overview]
Message-ID: <YzNDj54cylRzu2Fj@google.com> (raw)
In-Reply-To: <20220922232947.631309-5-calvinwan@google.com>

On Thu, Sep 22, 2022 at 11:29:47PM +0000, Calvin Wan wrote:
> diff --git a/builtin/commit.c b/builtin/commit.c
> index fcf9c85947..c5147a7952 100644
> --- a/builtin/commit.c
> +++ b/builtin/commit.c
> @@ -1468,6 +1468,12 @@ static int git_status_config(const char *k, const char *v, void *cb)
>  		s->detect_rename = git_config_rename(k, v);
>  		return 0;
>  	}
> +	if (!strcmp(k, "status.parallelsubmodules")) {
> +		s->parallel_jobs_submodules = git_config_int(k, v);
> +		if (s->parallel_jobs_submodules < 0)
> +			die(_("status.parallelsubmodules cannot be negative"));

Seems odd to have this check when all through the rest of the code
you're happily setting parallel_jobs=-1.

Although I don't think I disagree with the check ;) rather, I disagree
with the semantics used elsewhere for this magic -1 value.

> +		return 0;
> +	}
>  	return git_diff_ui_config(k, v, NULL);
>  }
>  
> diff --git a/diff-lib.c b/diff-lib.c
> index 2e148b79e6..ec745755fc 100644
> --- a/diff-lib.c
> +++ b/diff-lib.c
> @@ -65,14 +65,19 @@ static int check_removed(const struct index_state *istate, const struct cache_en
>   * Return 1 when changes are detected, 0 otherwise. If the DIRTY_SUBMODULES
>   * option is set, the caller does not only want to know if a submodule is
>   * modified at all but wants to know all the conditions that are met (new
> - * commits, untracked content and/or modified content).
> + * commits, untracked content and/or modified content). If
> + * defer_submodule_status bit is set, dirty_submodule will be left to the
> + * caller to set. defer_submodule_status can also be set to 0 in this
> + * function if there is no need to check if the submodule is modified.
>   */

Should the defer optimization be moved to its own pack? I think it is
somewhat orthogonal to deciding whether to parallelize or not, yes?

>  static int match_stat_with_submodule(struct diff_options *diffopt,
>  				     const struct cache_entry *ce,
>  				     struct stat *st, unsigned ce_option,
> -				     unsigned *dirty_submodule)
> +				     unsigned *dirty_submodule, int *defer_submodule_status,
> +					 int *ignore_untracked_in_submodules)
>  {
>  	int changed = ie_match_stat(diffopt->repo->index, ce, st, ce_option);
> +	int defer = 0;
>  	struct diff_flags orig_flags = diffopt->flags;
>  	if (!S_ISGITLINK(ce->ce_mode))
>  		goto cleanup;
> @@ -83,11 +88,20 @@ static int match_stat_with_submodule(struct diff_options *diffopt,
>  		goto cleanup;
>  	}
>  	if (!diffopt->flags.ignore_dirty_submodules &&
> -		(!changed || diffopt->flags.dirty_submodules))
> +		(!changed || diffopt->flags.dirty_submodules)) {
> +		if (defer_submodule_status && *defer_submodule_status) {
> +			defer = 1;
> +			*ignore_untracked_in_submodules =
> +							diffopt->flags.ignore_untracked_in_submodules;
> +		} else {
>  			*dirty_submodule = is_submodule_modified(ce->name,
>  							diffopt->flags.ignore_untracked_in_submodules);
> +		}
> +	}
>  cleanup:
>  	diffopt->flags = orig_flags;
> +	if (defer_submodule_status)
> +		*defer_submodule_status = defer;
>  	return changed;
>  }
>  
> @@ -117,7 +131,7 @@ static void finish_run_diff_files(struct rev_info *revs,
>  			    ce->name, 0, dirty_submodule);
>  }
>  
> -int run_diff_files(struct rev_info *revs, unsigned int option)
> +int run_diff_files(struct rev_info *revs, unsigned int option, int parallel_jobs)

The meaning of 'parallel_jobs' isn't documented anywhere in this patch
that I can see, but all over we're setting it to some magic -1 value,
right? What's that about?

>  {
>  	int entries, i;
>  	int diff_unmerged_stage = revs->max_count;
> @@ -125,6 +139,7 @@ int run_diff_files(struct rev_info *revs, unsigned int option)
>  			      ? CE_MATCH_RACY_IS_DIRTY : 0);
>  	uint64_t start = getnanotime();
>  	struct index_state *istate = revs->diffopt.repo->index;
> +	struct string_list submodules = STRING_LIST_INIT_NODUP;
>  
>  	diff_set_mnemonic_prefix(&revs->diffopt, "i/", "w/");
>  
> @@ -138,6 +153,9 @@ int run_diff_files(struct rev_info *revs, unsigned int option)
>  		struct cache_entry *ce = istate->cache[i];
>  		int changed;
>  		unsigned dirty_submodule = 0;
> +		int defer_submodule_status = revs && revs->repo &&
> +							!strcmp(revs->repo->gitdir, ".git");
> +		int ignore_untracked_in_submodules;

Does this actually compile with -Wall? I thought it is no good to do an
inline function call in the middle of the allocation block?

>  
>  		if (diff_can_quit_early(&revs->diffopt))
>  			break;
> @@ -269,11 +287,36 @@ int run_diff_files(struct rev_info *revs, unsigned int option)
>  			}
>  
>  			changed = match_stat_with_submodule(&revs->diffopt, ce, &st,
> -							    ce_option, &dirty_submodule);
> +							ce_option, &dirty_submodule,
> +							&defer_submodule_status,
> +							&ignore_untracked_in_submodules);
>  			newmode = ce_mode_from_stat(ce, st.st_mode);
> +			if (defer_submodule_status) {
> +				struct string_list_item *item =
> +								string_list_append(&submodules, ce->name);
> +				struct submodule_status_util *util = xmalloc(sizeof(*util));
> +				util->changed = changed;
> +				util->dirty_submodule = 0;
> +				util->ignore_untracked = ignore_untracked_in_submodules;
> +				util->newmode = newmode;
> +				util->ce = ce;
> +				item->util = util;
> +				continue;
> +			}
>  		}
>  		finish_run_diff_files(revs, ce, istate, changed, dirty_submodule, newmode);
>  	}
> +	if (submodules.nr > 0) {
> +		if (get_submodules_status(revs->repo, &submodules,
> +						parallel_jobs > 0 ? parallel_jobs : 1))

We're not doing a lookup in case of parallel_jobs<0, right? So why even
bother having this special meaning for -1? It's undocumented in the
header for run_diff_files anyway. In fact, I'd expect parallel_jobs=-1
to mean "as many jobs as possible" - but here you're using it just to
mean 1.

Why not let parallel_jobs be unsigned, do special casing for 0 if you
want to use that to mean "recommended # of jobs", and otherwise take it
at face value? Am I missing something in my skim over the patch?

> +			BUG("Submodule status failed");
> +		for (int i = 0; i < submodules.nr; i++) {
> +			struct submodule_status_util *util = submodules.items[i].util;
> +
> +			finish_run_diff_files(revs, util->ce, NULL, util->changed,
> +							util->dirty_submodule, util->newmode);
> +		}
> +	}
>  	diffcore_std(&revs->diffopt);
>  	diff_flush(&revs->diffopt);
>  	trace_performance_since(start, "diff-files");
> @@ -321,7 +364,7 @@ static int get_stat_data(const struct index_state *istate,
>  			return -1;
>  		}
>  		changed = match_stat_with_submodule(diffopt, ce, &st,
> -						    0, dirty_submodule);
> +						    0, dirty_submodule, NULL, NULL);
>  		if (changed) {
>  			mode = ce_mode_from_stat(ce, st.st_mode);
>  			oid = null_oid();
> diff --git a/diff.h b/diff.h
> index 8ae18e5ab1..5a6a615381 100644
> --- a/diff.h
> +++ b/diff.h
> @@ -627,7 +627,7 @@ void diff_get_merge_base(const struct rev_info *revs, struct object_id *mb);
>  #define DIFF_SILENT_ON_REMOVED 01
>  /* report racily-clean paths as modified */
>  #define DIFF_RACY_IS_MODIFIED 02
> -int run_diff_files(struct rev_info *revs, unsigned int option);
> +int run_diff_files(struct rev_info *revs, unsigned int option, int parallel_jobs);

See above; probably documenting parallel_jobs should happen here.

>  
>  #define DIFF_INDEX_CACHED 01
>  #define DIFF_INDEX_MERGE_BASE 02
> diff --git a/submodule.c b/submodule.c
> index 91213ba83c..15729bb327 100644
> --- a/submodule.c
> +++ b/submodule.c
> @@ -1362,6 +1362,20 @@ int submodule_touches_in_range(struct repository *r,
>  	return ret;
>  }
>  

I actually might like to see the conversion to using
run_processes_parallel happen in an earlier patch, with jobs=1. Might
make the review load easier.

That said, in previous series which took that approach, because it
worked correctly with jobs=1 we missed a bug in the writing of the new
*_task_finished() callback.... Maybe that doesn't apply here as the
jobs=N patch would follow on in the same series, so you'd test it right
away?

> +struct submodule_parallel_status {
> +	int index_count;
> +	int result;
> +
> +	struct string_list *submodule_names;
> +	struct repository *r;
> +
> +	/* Pending statuses by OIDs */
> +	struct status_task **oid_status_tasks;
> +	int oid_status_tasks_nr, oid_status_tasks_alloc;
> +};
> +
> +#define SPS_INIT { 0 }
> +
>  struct submodule_parallel_fetch {
>  	/*
>  	 * The index of the last index entry processed by
> @@ -1444,6 +1458,12 @@ struct fetch_task {
>  	struct oid_array *commits; /* Ensure these commits are fetched */
>  };
>  
> +struct status_task {
> +	struct repository *repo;
> +	const char *path;
> +	int ignore_untracked;
> +};
> +
>  /**
>   * When a submodule is not defined in .gitmodules, we cannot access it
>   * via the regular submodule-config. Create a fake submodule, which we can
> @@ -1547,6 +1567,41 @@ static struct fetch_task *fetch_task_create(struct submodule_parallel_fetch *spf
>  	return NULL;
>  }
>  
> +static struct status_task *
> +get_status_task_from_index(struct submodule_parallel_status *sps,
> +			  struct strbuf *err)
The spacing of these long function signatures feels odd; I'd much rather
see something like:

static struct status_task* get_status_task_from_index(
  (args)

Plus, as this get_status_task_from_index() exists only to be called by
get_next_submodule_status(), I'd find it easier to read if this function
was closer to that one. But that's a pretty tiny nit ;)

> +{
> +	for (; sps->index_count < sps->submodule_names->nr; sps->index_count++) {
> +		struct submodule_status_util *util = sps->submodule_names->items[sps->index_count].util;
> +		const struct cache_entry *ce = util->ce;
> +		struct status_task *task;
> +		struct strbuf buf = STRBUF_INIT;
> +		const char *git_dir;
> +
> +		strbuf_addf(&buf, "%s/.git", ce->name);
> +		git_dir = read_gitfile(buf.buf);
> +		if (!git_dir)
> +			git_dir = buf.buf;
> +		if (!is_git_directory(git_dir)) {
> +			if (is_directory(git_dir))
> +				die(_("'%s' not recognized as a git repository"), git_dir);
> +			strbuf_release(&buf);
> +			/* The submodule is not checked out, so it is not modified */
> +			util->dirty_submodule = 0;
> +			continue;
> +		}
> +		strbuf_release(&buf);
> +
> +		task = xmalloc(sizeof(*task));
> +		memset(task, 0, sizeof(*task));
> +		task->path = ce->name;
> +		task->ignore_untracked = util->ignore_untracked;
> +		sps->index_count++;
> +		return task;
> +	}
> +	return NULL;
> +}
> +
>  static struct fetch_task *
>  get_fetch_task_from_index(struct submodule_parallel_fetch *spf,
>  			  struct strbuf *err)
> @@ -1744,6 +1799,16 @@ static int fetch_start_failure(struct strbuf *err,
>  	return 0;
>  }
>  
> +static int status_start_failure(struct strbuf *err,
> +			       void *cb, void *task_cb)
> +{
> +	struct submodule_parallel_status *sps = cb;
> +
> +	sps->result = 1;
> +	return 0;
> +}
> +
> +
>  static int commit_missing_in_sub(const struct object_id *oid, void *data)
>  {
>  	struct repository *subrepo = data;
> @@ -1790,6 +1855,30 @@ static int parse_status_porcelain(char *buf, unsigned *dirty_submodule, int igno
>  	return 0;
>  }
>  
> +static int status_finish(int retvalue, struct strbuf *err,
> +			void *cb, void *task_cb)
> +{
> +	struct submodule_parallel_status *sps = cb;
> +	struct status_task *task = task_cb;
> +	struct string_list list = STRING_LIST_INIT_DUP;
> +	struct string_list_item *it = string_list_lookup(sps->submodule_names,
> +											task->path);
> +	struct submodule_status_util *util = it->util;
> +
> +	int i;
> +
> +	string_list_split(&list, err->buf, '\n', -1);
> +
> +	for (i = 0; i < list.nr; i++) {
> +		if (parse_status_porcelain(list.items[i].string,
> +							&util->dirty_submodule, util->ignore_untracked))
> +			break;
> +	}
> +	strbuf_reset(err);
> +
> +	return 0;
> +}
> +
>  static int fetch_finish(int retvalue, struct strbuf *err,
>  			void *cb, void *task_cb)
>  {
> @@ -1943,6 +2032,59 @@ unsigned is_submodule_modified(const char *path, int ignore_untracked)
>  	return dirty_submodule;
>  }
>  
> +static int get_next_submodule_status(struct child_process *cp,
> +						struct strbuf *err, void *data, void **task_cb)
> +{
> +	struct submodule_parallel_status *sps = data;
> +	struct status_task *task = get_status_task_from_index(sps, err);
> +
> +	int ignore_untracked;
> +
> +	if (!task) {
> +		return 0;
> +	}
> +
> +	ignore_untracked = task->ignore_untracked;
> +
> +	child_process_init(cp);
> +	prepare_submodule_repo_env_in_gitdir(&cp->env);
> +
> +	strvec_init(&cp->args);
> +	strvec_pushl(&cp->args, "status", "--porcelain=2", NULL);
> +	if (ignore_untracked)
> +		strvec_push(&cp->args, "-uno");
> +
> +	prepare_submodule_repo_env(&cp->env);
> +	cp->git_cmd = 1;
> +	cp->no_stdin = 1;
> +	cp->dir = task->path;
> +	*task_cb = task;
> +	return 1;
> +}
> +
> +int get_submodules_status(struct repository *r,
> +			struct string_list *submodules,
> +		    int max_parallel_jobs)
> +{
> +	struct submodule_parallel_status sps = SPS_INIT;
> +
> +	sps.r = r;
> +
> +	if (repo_read_index(r) < 0)
> +		die(_("index file corrupt"));
> +
> +	sps.submodule_names = submodules;
> +	string_list_sort(sps.submodule_names);
> +	run_processes_parallel_pipe_output = 1;
> +	run_processes_parallel_tr2(max_parallel_jobs,
> +				get_next_submodule_status,
> +				status_start_failure,
> +				status_finish,
> +				&sps,
> +				"submodule", "parallel/status");
> +	return sps.result;
> +}
> +
>  int submodule_uses_gitfile(const char *path)
>  {
>  	struct child_process cp = CHILD_PROCESS_INIT;
> diff --git a/submodule.h b/submodule.h
> index bfaa9da186..18a42c64ce 100644
> --- a/submodule.h
> +++ b/submodule.h
> @@ -41,6 +41,12 @@ struct submodule_update_strategy {
>  	.type = SM_UPDATE_UNSPECIFIED, \
>  }
>  
> +struct submodule_status_util {
> +	int changed, ignore_untracked;
> +	unsigned dirty_submodule, newmode;
> +	struct cache_entry *ce;
> +};
> +
>  int is_gitmodules_unmerged(struct index_state *istate);
>  int is_writing_gitmodules_ok(void);
>  int is_staging_gitmodules_ok(struct index_state *istate);
> @@ -94,6 +100,9 @@ int fetch_submodules(struct repository *r,
>  		     int command_line_option,
>  		     int default_option,
>  		     int quiet, int max_parallel_jobs);
> +int get_submodules_status(struct repository *r,
> +			 struct string_list *submodules,
> +		     int max_parallel_jobs);
>  unsigned is_submodule_modified(const char *path, int ignore_untracked);
>  int submodule_uses_gitfile(const char *path);
>  
> diff --git a/wt-status.c b/wt-status.c
> index 867e3e417e..9864484f81 100644
> --- a/wt-status.c
> +++ b/wt-status.c
> @@ -615,7 +615,7 @@ static void wt_status_collect_changes_worktree(struct wt_status *s)
>  	rev.diffopt.rename_limit = s->rename_limit >= 0 ? s->rename_limit : rev.diffopt.rename_limit;
>  	rev.diffopt.rename_score = s->rename_score >= 0 ? s->rename_score : rev.diffopt.rename_score;
>  	copy_pathspec(&rev.prune_data, &s->pathspec);
> -	run_diff_files(&rev, 0);
> +	run_diff_files(&rev, 0, s->parallel_jobs_submodules);
>  	release_revisions(&rev);
>  }
>  
> @@ -1149,7 +1149,7 @@ static void wt_longstatus_print_verbose(struct wt_status *s)
>  		setup_work_tree();
>  		rev.diffopt.a_prefix = "i/";
>  		rev.diffopt.b_prefix = "w/";
> -		run_diff_files(&rev, 0);
> +		run_diff_files(&rev, 0, -1);
>  	}
>  	release_revisions(&rev);
>  }
> @@ -2544,7 +2544,7 @@ int has_unstaged_changes(struct repository *r, int ignore_submodules)
>  	}
>  	rev_info.diffopt.flags.quick = 1;
>  	diff_setup_done(&rev_info.diffopt);
> -	result = run_diff_files(&rev_info, 0);
> +	result = run_diff_files(&rev_info, 0, -1);
>  	result = diff_result_code(&rev_info.diffopt, result);
>  	release_revisions(&rev_info);
>  	return result;
> diff --git a/wt-status.h b/wt-status.h
> index ab9cc9d8f0..2ea2317715 100644
> --- a/wt-status.h
> +++ b/wt-status.h
> @@ -131,6 +131,7 @@ struct wt_status {
>  	enum wt_status_format status_format;
>  	struct wt_status_state state;
>  	struct object_id oid_commit; /* when not Initial */
> +	int parallel_jobs_submodules;
>  
>  	/* These are computed during processing of the individual sections */
>  	int committable;
> -- 
> 2.37.3.998.g577e59143f-goog
> 

I feel like we're missing tests exercising status.parallelSubmodules=N?

 - Emily

  parent reply	other threads:[~2022-09-27 18:40 UTC|newest]

Thread overview: 32+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-09-22 23:29 [PATCH 0/4] submodule: parallelize status Calvin Wan
2022-09-22 23:29 ` [PATCH 1/4] run-command: add pipe_output to run_processes_parallel Calvin Wan
2022-09-23  7:52   ` Ævar Arnfjörð Bjarmason
2022-09-26 16:59     ` Calvin Wan
2022-09-27 10:52       ` Ævar Arnfjörð Bjarmason
2022-09-23 18:58   ` Junio C Hamano
2022-09-26 17:31     ` Calvin Wan
2022-09-27  4:45       ` Junio C Hamano
2022-09-27 18:10         ` Calvin Wan
2022-09-27 21:40           ` Junio C Hamano
2022-09-27  9:05       ` Ævar Arnfjörð Bjarmason
2022-09-27 17:55         ` Calvin Wan
2022-09-27 19:34           ` Ævar Arnfjörð Bjarmason
2022-09-27 20:45             ` Calvin Wan
2022-09-28  5:40               ` Ævar Arnfjörð Bjarmason
2022-09-29 20:52                 ` Calvin Wan
2022-09-22 23:29 ` [PATCH 2/4] submodule: move status parsing into function Calvin Wan
2022-09-22 23:29 ` [PATCH 3/4] diff-lib: refactor functions Calvin Wan
2022-09-23 20:36   ` Junio C Hamano
2022-09-26 17:35     ` Calvin Wan
2022-09-22 23:29 ` [PATCH 4/4] diff-lib: parallelize run_diff_files for submodules Calvin Wan
2022-09-23  8:06   ` Ævar Arnfjörð Bjarmason
2022-09-24 20:17     ` Junio C Hamano
2022-09-26 17:50     ` Calvin Wan
2022-09-23 21:44   ` Junio C Hamano
2022-09-26 19:12     ` Calvin Wan
2022-09-25 13:59   ` Phillip Wood
2022-09-26 17:11     ` Junio C Hamano
2022-09-26 19:22     ` Calvin Wan
2022-09-27 18:40   ` Emily Shaffer [this message]
2022-09-23 22:56 ` [PATCH 0/4] submodule: parallelize status Junio C Hamano
2022-09-26 16:33   ` Calvin Wan

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=YzNDj54cylRzu2Fj@google.com \
    --to=emilyshaffer@google.com \
    --cc=calvinwan@google.com \
    --cc=git@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.