All of lore.kernel.org
 help / color / mirror / Atom feed
* [GSoC][PATCH 0/3] clone: convert explicit dir traversal to dir-iterator
@ 2019-02-23 19:03 Matheus Tavares
  2019-02-23 19:03 ` [GSoC][PATCH 1/3] dir-iterator: add pedantic option to dir_iterator_begin Matheus Tavares
                   ` (4 more replies)
  0 siblings, 5 replies; 35+ messages in thread
From: Matheus Tavares @ 2019-02-23 19:03 UTC (permalink / raw)
  To: git; +Cc: Thomas Gummerer

Add pedantic option to dir_iterator_begin at dir-iterator.c and convert
explicit recursive directory traversal at copy_or_link_directory
(builtin/clone.c) to the dir-iterator API.

This is my microproject for GSoC 2019. Idea taken from
https://git.github.io/SoC-2019-Microprojects/#use-dir-iterator-to-avoid-explicit-recursive-directory-traversal

Build: https://travis-ci.org/MatheusBernardino/git/builds/497512561

Matheus Tavares (3):
  dir-iterator: add pedantic option to dir_iterator_begin
  clone: extract function from copy_or_link_directory
  clone: use dir-iterator to avoid explicit dir traversal

 builtin/clone.c      | 72 ++++++++++++++++++++++++++++----------------
 dir-iterator.c       | 23 ++++++++++++--
 dir-iterator.h       | 16 ++++++++--
 refs/files-backend.c |  2 +-
 4 files changed, 81 insertions(+), 32 deletions(-)

-- 
2.20.1


^ permalink raw reply	[flat|nested] 35+ messages in thread

* [GSoC][PATCH 1/3] dir-iterator: add pedantic option to dir_iterator_begin
  2019-02-23 19:03 [GSoC][PATCH 0/3] clone: convert explicit dir traversal to dir-iterator Matheus Tavares
@ 2019-02-23 19:03 ` Matheus Tavares
  2019-02-23 21:35   ` Thomas Gummerer
  2019-02-23 19:03 ` [GSoC][PATCH 2/3] clone: extract function from copy_or_link_directory Matheus Tavares
                   ` (3 subsequent siblings)
  4 siblings, 1 reply; 35+ messages in thread
From: Matheus Tavares @ 2019-02-23 19:03 UTC (permalink / raw)
  To: git
  Cc: Thomas Gummerer, Junio C Hamano, Ramsay Jones,
	Nguyễn Thái Ngọc Duy, Michael Haggerty

Add the pedantic option to dir-iterator's initialization function,
dir_iterator_begin. When this option is set to true,
dir_iterator_advance will immediately return ITER_ERROR when failing to
fetch the next entry. When set to false, dir_iterator_advance will emit
a warning and keep looking for the next entry.

Also adjust refs/files-backend.c to the new dir_iterator_begin
signature.

Signed-off-by: Matheus Tavares <matheus.bernardino@usp.br>
---
Changes in v2:
 - Added in v2

 dir-iterator.c       | 23 +++++++++++++++++++++--
 dir-iterator.h       | 16 +++++++++++++---
 refs/files-backend.c |  2 +-
 3 files changed, 35 insertions(+), 6 deletions(-)

diff --git a/dir-iterator.c b/dir-iterator.c
index f2dcd82fde..070a656555 100644
--- a/dir-iterator.c
+++ b/dir-iterator.c
@@ -48,6 +48,13 @@ struct dir_iterator_int {
 	 * that will be included in this iteration.
 	 */
 	struct dir_iterator_level *levels;
+
+	/*
+	 * Boolean value to define dir-iterator's behaviour when failing to
+	 * fetch next entry. See comments on dir_iterator_begin at
+	 * dir-iterator.h
+	 */
+	int pedantic;
 };
 
 int dir_iterator_advance(struct dir_iterator *dir_iterator)
@@ -71,6 +78,8 @@ int dir_iterator_advance(struct dir_iterator *dir_iterator)
 
 			level->dir = opendir(iter->base.path.buf);
 			if (!level->dir && errno != ENOENT) {
+				if (iter->pedantic)
+					goto error_out;
 				warning("error opening directory %s: %s",
 					iter->base.path.buf, strerror(errno));
 				/* Popping the level is handled below */
@@ -122,6 +131,8 @@ int dir_iterator_advance(struct dir_iterator *dir_iterator)
 			if (!de) {
 				/* This level is exhausted; pop up a level. */
 				if (errno) {
+					if (iter->pedantic)
+						goto error_out;
 					warning("error reading directory %s: %s",
 						iter->base.path.buf, strerror(errno));
 				} else if (closedir(level->dir))
@@ -139,10 +150,13 @@ int dir_iterator_advance(struct dir_iterator *dir_iterator)
 
 			strbuf_addstr(&iter->base.path, de->d_name);
 			if (lstat(iter->base.path.buf, &iter->base.st) < 0) {
-				if (errno != ENOENT)
+				if (errno != ENOENT) {
+					if (iter->pedantic)
+						goto error_out;
 					warning("error reading path '%s': %s",
 						iter->base.path.buf,
 						strerror(errno));
+				}
 				continue;
 			}
 
@@ -159,6 +173,10 @@ int dir_iterator_advance(struct dir_iterator *dir_iterator)
 			return ITER_OK;
 		}
 	}
+
+error_out:
+	dir_iterator_abort(dir_iterator);
+	return ITER_ERROR;
 }
 
 int dir_iterator_abort(struct dir_iterator *dir_iterator)
@@ -182,7 +200,7 @@ int dir_iterator_abort(struct dir_iterator *dir_iterator)
 	return ITER_DONE;
 }
 
-struct dir_iterator *dir_iterator_begin(const char *path)
+struct dir_iterator *dir_iterator_begin(const char *path, int pedantic)
 {
 	struct dir_iterator_int *iter = xcalloc(1, sizeof(*iter));
 	struct dir_iterator *dir_iterator = &iter->base;
@@ -195,6 +213,7 @@ struct dir_iterator *dir_iterator_begin(const char *path)
 
 	ALLOC_GROW(iter->levels, 10, iter->levels_alloc);
 
+	iter->pedantic = pedantic;
 	iter->levels_nr = 1;
 	iter->levels[0].initialized = 0;
 
diff --git a/dir-iterator.h b/dir-iterator.h
index 970793d07a..50ca8e1a27 100644
--- a/dir-iterator.h
+++ b/dir-iterator.h
@@ -19,7 +19,7 @@
  * A typical iteration looks like this:
  *
  *     int ok;
- *     struct iterator *iter = dir_iterator_begin(path);
+ *     struct iterator *iter = dir_iterator_begin(path, 0);
  *
  *     while ((ok = dir_iterator_advance(iter)) == ITER_OK) {
  *             if (want_to_stop_iteration()) {
@@ -65,9 +65,15 @@ struct dir_iterator {
  * The iteration includes all paths under path, not including path
  * itself and not including "." or ".." entries.
  *
- * path is the starting directory. An internal copy will be made.
+ * Parameters are:
+ * - path is the starting directory. An internal copy will be made.
+ * - pedantic is a boolean value. If true, dir-iterator will free
+ *   resources and return ITER_ERROR immediately, in case of an error
+ *   while trying to fetch the next entry in dir_iterator_advance. If
+ *   false, it will just emit a warning and keep looking for the next
+ *   entry.
  */
-struct dir_iterator *dir_iterator_begin(const char *path);
+struct dir_iterator *dir_iterator_begin(const char *path, int pedantic);
 
 /*
  * Advance the iterator to the first or next item and return ITER_OK.
@@ -76,6 +82,10 @@ struct dir_iterator *dir_iterator_begin(const char *path);
  * dir_iterator and associated resources and return ITER_ERROR. It is
  * a bug to use iterator or call this function again after it has
  * returned ITER_DONE or ITER_ERROR.
+ *
+ * Note that whether dir_iterator_advance will return ITER_ERROR when
+ * failing to fetch the next entry or keep going is defined by the
+ * 'pedantic' option at dir-iterator's initialization.
  */
 int dir_iterator_advance(struct dir_iterator *iterator);
 
diff --git a/refs/files-backend.c b/refs/files-backend.c
index dd8abe9185..c3d3b6c454 100644
--- a/refs/files-backend.c
+++ b/refs/files-backend.c
@@ -2143,7 +2143,7 @@ static struct ref_iterator *reflog_iterator_begin(struct ref_store *ref_store,
 
 	base_ref_iterator_init(ref_iterator, &files_reflog_iterator_vtable, 0);
 	strbuf_addf(&sb, "%s/logs", gitdir);
-	iter->dir_iterator = dir_iterator_begin(sb.buf);
+	iter->dir_iterator = dir_iterator_begin(sb.buf, 0);
 	iter->ref_store = ref_store;
 	strbuf_release(&sb);
 
-- 
2.20.1


^ permalink raw reply	[flat|nested] 35+ messages in thread

* [GSoC][PATCH 2/3] clone: extract function from copy_or_link_directory
  2019-02-23 19:03 [GSoC][PATCH 0/3] clone: convert explicit dir traversal to dir-iterator Matheus Tavares
  2019-02-23 19:03 ` [GSoC][PATCH 1/3] dir-iterator: add pedantic option to dir_iterator_begin Matheus Tavares
@ 2019-02-23 19:03 ` Matheus Tavares
  2019-02-24  8:38   ` Christian Couder
  2019-02-23 19:03 ` [GSoC][PATCH 3/3] clone: use dir-iterator to avoid explicit dir traversal Matheus Tavares
                   ` (2 subsequent siblings)
  4 siblings, 1 reply; 35+ messages in thread
From: Matheus Tavares @ 2019-02-23 19:03 UTC (permalink / raw)
  To: git; +Cc: Thomas Gummerer, Junio C Hamano

Extract dir creation code snippet from copy_or_link_directory to its own
function named mkdir_if_missing. This change will help removing
copy_or_link_directory's explicit recursion, which will be done in a
following patch. Also makes code more readable.

Signed-off-by: Matheus Tavares <matheus.bernardino@usp.br>
---
Changes in v2:
 - Replaced explicit reference to a following patch at patch message to
   a more generic reference.
 - Changed struct stat variable name to st, which is the most common
   name for such variables in git's code.

 builtin/clone.c | 27 +++++++++++++++++++--------
 1 file changed, 19 insertions(+), 8 deletions(-)

diff --git a/builtin/clone.c b/builtin/clone.c
index 50bde99618..862d2ea69c 100644
--- a/builtin/clone.c
+++ b/builtin/clone.c
@@ -392,6 +392,24 @@ static void copy_alternates(struct strbuf *src, struct strbuf *dst,
 	fclose(in);
 }
 
+static void mkdir_if_missing(const char *pathname, mode_t mode)
+{
+	/*
+	 * Create a dir at pathname unless there's already one.
+	 */
+	struct stat st;
+
+	if (mkdir(pathname, mode)) {
+		if (errno != EEXIST)
+			die_errno(_("failed to create directory '%s'"),
+				  pathname);
+		else if (stat(pathname, &st))
+			die_errno(_("failed to stat '%s'"), pathname);
+		else if (!S_ISDIR(st.st_mode))
+			die(_("%s exists and is not a directory"), pathname);
+	}
+}
+
 static void copy_or_link_directory(struct strbuf *src, struct strbuf *dest,
 				   const char *src_repo, int src_baselen)
 {
@@ -404,14 +422,7 @@ static void copy_or_link_directory(struct strbuf *src, struct strbuf *dest,
 	if (!dir)
 		die_errno(_("failed to open '%s'"), src->buf);
 
-	if (mkdir(dest->buf, 0777)) {
-		if (errno != EEXIST)
-			die_errno(_("failed to create directory '%s'"), dest->buf);
-		else if (stat(dest->buf, &buf))
-			die_errno(_("failed to stat '%s'"), dest->buf);
-		else if (!S_ISDIR(buf.st_mode))
-			die(_("%s exists and is not a directory"), dest->buf);
-	}
+	mkdir_if_missing(dest->buf, 0777);
 
 	strbuf_addch(src, '/');
 	src_len = src->len;
-- 
2.20.1


^ permalink raw reply	[flat|nested] 35+ messages in thread

* [GSoC][PATCH 3/3] clone: use dir-iterator to avoid explicit dir traversal
  2019-02-23 19:03 [GSoC][PATCH 0/3] clone: convert explicit dir traversal to dir-iterator Matheus Tavares
  2019-02-23 19:03 ` [GSoC][PATCH 1/3] dir-iterator: add pedantic option to dir_iterator_begin Matheus Tavares
  2019-02-23 19:03 ` [GSoC][PATCH 2/3] clone: extract function from copy_or_link_directory Matheus Tavares
@ 2019-02-23 19:03 ` Matheus Tavares
  2019-02-23 21:48   ` Thomas Gummerer
  2019-02-23 22:40   ` Ævar Arnfjörð Bjarmason
  2019-02-23 19:07 ` [GSoC][PATCH 0/3] clone: convert explicit dir traversal to dir-iterator Matheus Tavares Bernardino
  2019-02-23 21:59 ` Thomas Gummerer
  4 siblings, 2 replies; 35+ messages in thread
From: Matheus Tavares @ 2019-02-23 19:03 UTC (permalink / raw)
  To: git
  Cc: Thomas Gummerer, Junio C Hamano, Nguyễn Thái Ngọc Duy

Replace usage of opendir/readdir/closedir API to traverse directories
recursively, at copy_or_link_directory function, by the dir-iterator
API. This simplifies the code and avoid recursive calls to
copy_or_link_directory.

This process also brings some safe behaviour changes to
copy_or_link_directory:
 - It will no longer follows symbolic links. This is not a problem,
   since the function is only used to copy .git/objects directory, and
   symbolic links are not expected there.
 - Hidden directories won't be skipped anymore. In fact, it is odd that
   the function currently skip hidden directories but not hidden files.
   The reason for that could be unintentional: probably the intention
   was to skip '.' and '..' only, but it ended up accidentally skipping
   all directories starting with '.'. Again, it must not be a problem
   not to skip hidden dirs since hidden dirs/files are not expected at
   .git/objects.
 - Now, copy_or_link_directory will call die() in case of an error on
   openddir, readdir or lstat, inside dir_iterator_advance. That means
   it will abort in case of an error trying to fetch any iteration
   entry.

Signed-off-by: Matheus Tavares <matheus.bernardino@usp.br>
---
Changes in v2:
 - Improved patch message
 - Removed a now unused variable
 - Put warning on stat error back
 - Added pedantic option to dir-iterator initialization
 - Modified copy_or_link_directory not to skip hidden paths

 builtin/clone.c | 47 ++++++++++++++++++++++++++++-------------------
 1 file changed, 28 insertions(+), 19 deletions(-)

diff --git a/builtin/clone.c b/builtin/clone.c
index 862d2ea69c..515dc91d63 100644
--- a/builtin/clone.c
+++ b/builtin/clone.c
@@ -23,6 +23,8 @@
 #include "transport.h"
 #include "strbuf.h"
 #include "dir.h"
+#include "dir-iterator.h"
+#include "iterator.h"
 #include "sigchain.h"
 #include "branch.h"
 #include "remote.h"
@@ -411,42 +413,45 @@ static void mkdir_if_missing(const char *pathname, mode_t mode)
 }
 
 static void copy_or_link_directory(struct strbuf *src, struct strbuf *dest,
-				   const char *src_repo, int src_baselen)
+				   const char *src_repo)
 {
-	struct dirent *de;
-	struct stat buf;
 	int src_len, dest_len;
-	DIR *dir;
-
-	dir = opendir(src->buf);
-	if (!dir)
-		die_errno(_("failed to open '%s'"), src->buf);
+	struct dir_iterator *iter;
+	int iter_status;
+	struct stat st;
 
 	mkdir_if_missing(dest->buf, 0777);
 
+	iter = dir_iterator_begin(src->buf, 1);
+
 	strbuf_addch(src, '/');
 	src_len = src->len;
 	strbuf_addch(dest, '/');
 	dest_len = dest->len;
 
-	while ((de = readdir(dir)) != NULL) {
+	while ((iter_status = dir_iterator_advance(iter)) == ITER_OK) {
 		strbuf_setlen(src, src_len);
-		strbuf_addstr(src, de->d_name);
+		strbuf_addstr(src, iter->relative_path);
 		strbuf_setlen(dest, dest_len);
-		strbuf_addstr(dest, de->d_name);
-		if (stat(src->buf, &buf)) {
+		strbuf_addstr(dest, iter->relative_path);
+
+		/*
+		 * dir_iterator_advance already calls lstat to populate iter->st
+		 * but, unlike stat, lstat does not checks for permissions on
+		 * the given path.
+		 */
+		if (stat(src->buf, &st)) {
 			warning (_("failed to stat %s\n"), src->buf);
 			continue;
 		}
-		if (S_ISDIR(buf.st_mode)) {
-			if (de->d_name[0] != '.')
-				copy_or_link_directory(src, dest,
-						       src_repo, src_baselen);
+
+		if (S_ISDIR(iter->st.st_mode)) {
+			mkdir_if_missing(dest->buf, 0777);
 			continue;
 		}
 
 		/* Files that cannot be copied bit-for-bit... */
-		if (!strcmp(src->buf + src_baselen, "/info/alternates")) {
+		if (!strcmp(iter->relative_path, "info/alternates")) {
 			copy_alternates(src, dest, src_repo);
 			continue;
 		}
@@ -463,7 +468,11 @@ static void copy_or_link_directory(struct strbuf *src, struct strbuf *dest,
 		if (copy_file_with_time(dest->buf, src->buf, 0666))
 			die_errno(_("failed to copy file to '%s'"), dest->buf);
 	}
-	closedir(dir);
+
+	if (iter_status != ITER_DONE) {
+		strbuf_setlen(src, src_len);
+		die(_("failed to iterate over '%s'"), src->buf);
+	}
 }
 
 static void clone_local(const char *src_repo, const char *dest_repo)
@@ -481,7 +490,7 @@ static void clone_local(const char *src_repo, const char *dest_repo)
 		get_common_dir(&dest, dest_repo);
 		strbuf_addstr(&src, "/objects");
 		strbuf_addstr(&dest, "/objects");
-		copy_or_link_directory(&src, &dest, src_repo, src.len);
+		copy_or_link_directory(&src, &dest, src_repo);
 		strbuf_release(&src);
 		strbuf_release(&dest);
 	}
-- 
2.20.1


^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [GSoC][PATCH 0/3] clone: convert explicit dir traversal to dir-iterator
  2019-02-23 19:03 [GSoC][PATCH 0/3] clone: convert explicit dir traversal to dir-iterator Matheus Tavares
                   ` (2 preceding siblings ...)
  2019-02-23 19:03 ` [GSoC][PATCH 3/3] clone: use dir-iterator to avoid explicit dir traversal Matheus Tavares
@ 2019-02-23 19:07 ` Matheus Tavares Bernardino
  2019-02-23 20:10   ` Ævar Arnfjörð Bjarmason
  2019-02-23 21:59 ` Thomas Gummerer
  4 siblings, 1 reply; 35+ messages in thread
From: Matheus Tavares Bernardino @ 2019-02-23 19:07 UTC (permalink / raw)
  To: git; +Cc: Thomas Gummerer

On Sat, Feb 23, 2019 at 4:03 PM Matheus Tavares
<matheus.bernardino@usp.br> wrote:
>
> Add pedantic option to dir_iterator_begin at dir-iterator.c and convert
> explicit recursive directory traversal at copy_or_link_directory
> (builtin/clone.c) to the dir-iterator API.
>
> This is my microproject for GSoC 2019. Idea taken from
> https://git.github.io/SoC-2019-Microprojects/#use-dir-iterator-to-avoid-explicit-recursive-directory-traversal
>
> Build: https://travis-ci.org/MatheusBernardino/git/builds/497512561
>
> Matheus Tavares (3):
>   dir-iterator: add pedantic option to dir_iterator_begin
>   clone: extract function from copy_or_link_directory
>   clone: use dir-iterator to avoid explicit dir traversal
>
>  builtin/clone.c      | 72 ++++++++++++++++++++++++++++----------------
>  dir-iterator.c       | 23 ++++++++++++--
>  dir-iterator.h       | 16 ++++++++--
>  refs/files-backend.c |  2 +-
>  4 files changed, 81 insertions(+), 32 deletions(-)
>

Sadly, I forgot the v2 tag, but this is a v2. Should I resent the
series with [PATCH v2 ...] or is it ok?

Matheus Tavares

> --
> 2.20.1
>

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [GSoC][PATCH 0/3] clone: convert explicit dir traversal to dir-iterator
  2019-02-23 19:07 ` [GSoC][PATCH 0/3] clone: convert explicit dir traversal to dir-iterator Matheus Tavares Bernardino
@ 2019-02-23 20:10   ` Ævar Arnfjörð Bjarmason
  0 siblings, 0 replies; 35+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2019-02-23 20:10 UTC (permalink / raw)
  To: Matheus Tavares Bernardino; +Cc: git, Thomas Gummerer


On Sat, Feb 23 2019, Matheus Tavares Bernardino wrote:

> On Sat, Feb 23, 2019 at 4:03 PM Matheus Tavares
> <matheus.bernardino@usp.br> wrote:
>>
>> Add pedantic option to dir_iterator_begin at dir-iterator.c and convert
>> explicit recursive directory traversal at copy_or_link_directory
>> (builtin/clone.c) to the dir-iterator API.
>>
>> This is my microproject for GSoC 2019. Idea taken from
>> https://git.github.io/SoC-2019-Microprojects/#use-dir-iterator-to-avoid-explicit-recursive-directory-traversal
>>
>> Build: https://travis-ci.org/MatheusBernardino/git/builds/497512561
>>
>> Matheus Tavares (3):
>>   dir-iterator: add pedantic option to dir_iterator_begin
>>   clone: extract function from copy_or_link_directory
>>   clone: use dir-iterator to avoid explicit dir traversal
>>
>>  builtin/clone.c      | 72 ++++++++++++++++++++++++++++----------------
>>  dir-iterator.c       | 23 ++++++++++++--
>>  dir-iterator.h       | 16 ++++++++--
>>  refs/files-backend.c |  2 +-
>>  4 files changed, 81 insertions(+), 32 deletions(-)
>>
>
> Sadly, I forgot the v2 tag, but this is a v2. Should I resent the
> series with [PATCH v2 ...] or is it ok?

It's fine. No need to resend.

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [GSoC][PATCH 1/3] dir-iterator: add pedantic option to dir_iterator_begin
  2019-02-23 19:03 ` [GSoC][PATCH 1/3] dir-iterator: add pedantic option to dir_iterator_begin Matheus Tavares
@ 2019-02-23 21:35   ` Thomas Gummerer
  2019-02-24  8:35     ` Christian Couder
  0 siblings, 1 reply; 35+ messages in thread
From: Thomas Gummerer @ 2019-02-23 21:35 UTC (permalink / raw)
  To: Matheus Tavares
  Cc: git, Junio C Hamano, Ramsay Jones,
	Nguyễn Thái Ngọc Duy, Michael Haggerty

On 02/23, Matheus Tavares wrote:
> Add the pedantic option to dir-iterator's initialization function,
> dir_iterator_begin. When this option is set to true,
> dir_iterator_advance will immediately return ITER_ERROR when failing to
> fetch the next entry. When set to false, dir_iterator_advance will emit
> a warning and keep looking for the next entry.
> 
> Also adjust refs/files-backend.c to the new dir_iterator_begin
> signature.

Thanks, this makes sense to me.  This commit message describes what we
are doing in this patch, but not why we are doing it.  From previously
reviewing this series, I know this is going to be used in a subsequent
patch, but it is nice to give reviewers and future readers of this
patch that context as well.  Just something like "This behaviour is
going to be used in a subsequent patch." should be enough here.

A few comments inline.

> Signed-off-by: Matheus Tavares <matheus.bernardino@usp.br>
> ---
> Changes in v2:
>  - Added in v2
> 
>  dir-iterator.c       | 23 +++++++++++++++++++++--
>  dir-iterator.h       | 16 +++++++++++++---
>  refs/files-backend.c |  2 +-
>  3 files changed, 35 insertions(+), 6 deletions(-)
> 
> diff --git a/dir-iterator.c b/dir-iterator.c
> index f2dcd82fde..070a656555 100644
> --- a/dir-iterator.c
> +++ b/dir-iterator.c
> @@ -48,6 +48,13 @@ struct dir_iterator_int {
>  	 * that will be included in this iteration.
>  	 */
>  	struct dir_iterator_level *levels;
> +
> +	/*
> +	 * Boolean value to define dir-iterator's behaviour when failing to
> +	 * fetch next entry. See comments on dir_iterator_begin at
> +	 * dir-iterator.h
> +	 */
> +	int pedantic;
>  };
>  
>  int dir_iterator_advance(struct dir_iterator *dir_iterator)
> @@ -71,6 +78,8 @@ int dir_iterator_advance(struct dir_iterator *dir_iterator)
>  
>  			level->dir = opendir(iter->base.path.buf);
>  			if (!level->dir && errno != ENOENT) {
> +				if (iter->pedantic)
> +					goto error_out;

I think we should also print an error here.  The caller doesn't have
any context on what went wrong, and will probably just 'die()' if an
error is encountered.  I think it would make sense to call
'error(...)' here before 'goto error_out;' to give a useful error
message here.

>  				warning("error opening directory %s: %s",
>  					iter->base.path.buf, strerror(errno));
>  				/* Popping the level is handled below */
> @@ -122,6 +131,8 @@ int dir_iterator_advance(struct dir_iterator *dir_iterator)
>  			if (!de) {
>  				/* This level is exhausted; pop up a level. */
>  				if (errno) {
> +					if (iter->pedantic)
> +						goto error_out;
>  					warning("error reading directory %s: %s",
>  						iter->base.path.buf, strerror(errno));
>  				} else if (closedir(level->dir))
> @@ -139,10 +150,13 @@ int dir_iterator_advance(struct dir_iterator *dir_iterator)
>  
>  			strbuf_addstr(&iter->base.path, de->d_name);
>  			if (lstat(iter->base.path.buf, &iter->base.st) < 0) {
> -				if (errno != ENOENT)
> +				if (errno != ENOENT) {
> +					if (iter->pedantic)
> +						goto error_out;
>  					warning("error reading path '%s': %s",
>  						iter->base.path.buf,
>  						strerror(errno));
> +				}
>  				continue;
>  			}
>  
> @@ -159,6 +173,10 @@ int dir_iterator_advance(struct dir_iterator *dir_iterator)
>  			return ITER_OK;
>  		}
>  	}
> +
> +error_out:
> +	dir_iterator_abort(dir_iterator);
> +	return ITER_ERROR;
>  }
>  
>  int dir_iterator_abort(struct dir_iterator *dir_iterator)
> @@ -182,7 +200,7 @@ int dir_iterator_abort(struct dir_iterator *dir_iterator)
>  	return ITER_DONE;
>  }
>  
> -struct dir_iterator *dir_iterator_begin(const char *path)
> +struct dir_iterator *dir_iterator_begin(const char *path, int pedantic)

Thinking about the future evolution of this interface, it might make
more sense to have that second parameter be a "struct
dir_iterator_opts".  For now it would just have one member "pedantic",
but in the future we could add additional options there instead of
adding additional parameters.

For now there are not many callers of the dir iterator API, so adding
a parameter is not a huge problem.  However we will most likely add
more callers in the future, so changing all of them becomes more and
more costly.

It also becomes a problem for integrating these patches, in the case
where a topic in flight adds a new use of this API.  In that case the
breakage wouldn't be noticed on merges, as there would be no merge
conflicts, however it would break the build, which makes the
maintainers job harder.

We could of course add new functions in the future, rather than
changing dir_iterator_begin, however just having one function, that
allows combining different options sounds like the better choice to me
here.

>  {
>  	struct dir_iterator_int *iter = xcalloc(1, sizeof(*iter));
>  	struct dir_iterator *dir_iterator = &iter->base;
> @@ -195,6 +213,7 @@ struct dir_iterator *dir_iterator_begin(const char *path)
>  
>  	ALLOC_GROW(iter->levels, 10, iter->levels_alloc);
>  
> +	iter->pedantic = pedantic;
>  	iter->levels_nr = 1;
>  	iter->levels[0].initialized = 0;
>  
> diff --git a/dir-iterator.h b/dir-iterator.h
> index 970793d07a..50ca8e1a27 100644
> --- a/dir-iterator.h
> +++ b/dir-iterator.h
> @@ -19,7 +19,7 @@
>   * A typical iteration looks like this:
>   *
>   *     int ok;
> - *     struct iterator *iter = dir_iterator_begin(path);
> + *     struct iterator *iter = dir_iterator_begin(path, 0);
>   *
>   *     while ((ok = dir_iterator_advance(iter)) == ITER_OK) {
>   *             if (want_to_stop_iteration()) {
> @@ -65,9 +65,15 @@ struct dir_iterator {
>   * The iteration includes all paths under path, not including path
>   * itself and not including "." or ".." entries.
>   *
> - * path is the starting directory. An internal copy will be made.
> + * Parameters are:
> + * - path is the starting directory. An internal copy will be made.
> + * - pedantic is a boolean value. If true, dir-iterator will free
> + *   resources and return ITER_ERROR immediately, in case of an error
> + *   while trying to fetch the next entry in dir_iterator_advance. If
> + *   false, it will just emit a warning and keep looking for the next
> + *   entry.
>   */
> -struct dir_iterator *dir_iterator_begin(const char *path);
> +struct dir_iterator *dir_iterator_begin(const char *path, int pedantic);
>  
>  /*
>   * Advance the iterator to the first or next item and return ITER_OK.
> @@ -76,6 +82,10 @@ struct dir_iterator *dir_iterator_begin(const char *path);
>   * dir_iterator and associated resources and return ITER_ERROR. It is
>   * a bug to use iterator or call this function again after it has
>   * returned ITER_DONE or ITER_ERROR.
> + *
> + * Note that whether dir_iterator_advance will return ITER_ERROR when
> + * failing to fetch the next entry or keep going is defined by the
> + * 'pedantic' option at dir-iterator's initialization.
>   */
>  int dir_iterator_advance(struct dir_iterator *iterator);
>  
> diff --git a/refs/files-backend.c b/refs/files-backend.c
> index dd8abe9185..c3d3b6c454 100644
> --- a/refs/files-backend.c
> +++ b/refs/files-backend.c
> @@ -2143,7 +2143,7 @@ static struct ref_iterator *reflog_iterator_begin(struct ref_store *ref_store,
>  
>  	base_ref_iterator_init(ref_iterator, &files_reflog_iterator_vtable, 0);
>  	strbuf_addf(&sb, "%s/logs", gitdir);
> -	iter->dir_iterator = dir_iterator_begin(sb.buf);
> +	iter->dir_iterator = dir_iterator_begin(sb.buf, 0);
>  	iter->ref_store = ref_store;
>  	strbuf_release(&sb);
>  
> -- 
> 2.20.1
> 

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [GSoC][PATCH 3/3] clone: use dir-iterator to avoid explicit dir traversal
  2019-02-23 19:03 ` [GSoC][PATCH 3/3] clone: use dir-iterator to avoid explicit dir traversal Matheus Tavares
@ 2019-02-23 21:48   ` Thomas Gummerer
  2019-02-24 18:19     ` Matheus Tavares Bernardino
  2019-02-23 22:40   ` Ævar Arnfjörð Bjarmason
  1 sibling, 1 reply; 35+ messages in thread
From: Thomas Gummerer @ 2019-02-23 21:48 UTC (permalink / raw)
  To: Matheus Tavares
  Cc: git, Junio C Hamano, Nguyễn Thái Ngọc Duy

On 02/23, Matheus Tavares wrote:
> Replace usage of opendir/readdir/closedir API to traverse directories
> recursively, at copy_or_link_directory function, by the dir-iterator
> API. This simplifies the code and avoid recursive calls to
> copy_or_link_directory.
> 
> This process also brings some safe behaviour changes to
> copy_or_link_directory:
>  - It will no longer follows symbolic links. This is not a problem,
>    since the function is only used to copy .git/objects directory, and
>    symbolic links are not expected there.
>  - Hidden directories won't be skipped anymore. In fact, it is odd that
>    the function currently skip hidden directories but not hidden files.
>    The reason for that could be unintentional: probably the intention
>    was to skip '.' and '..' only, but it ended up accidentally skipping
>    all directories starting with '.'. Again, it must not be a problem
>    not to skip hidden dirs since hidden dirs/files are not expected at
>    .git/objects.
>  - Now, copy_or_link_directory will call die() in case of an error on
>    openddir, readdir or lstat, inside dir_iterator_advance. That means
>    it will abort in case of an error trying to fetch any iteration
>    entry.
> 
> Signed-off-by: Matheus Tavares <matheus.bernardino@usp.br>
> ---
> Changes in v2:
>  - Improved patch message
>  - Removed a now unused variable

s/variable/parameter/ I believe?

>  - Put warning on stat error back
>  - Added pedantic option to dir-iterator initialization
>  - Modified copy_or_link_directory not to skip hidden paths

Thanks, these descriptions are very useful for reviewers that had a
look at previous rounds.

>  builtin/clone.c | 47 ++++++++++++++++++++++++++++-------------------
>  1 file changed, 28 insertions(+), 19 deletions(-)
> 
> diff --git a/builtin/clone.c b/builtin/clone.c
> index 862d2ea69c..515dc91d63 100644
> --- a/builtin/clone.c
> +++ b/builtin/clone.c
> @@ -23,6 +23,8 @@
>  #include "transport.h"
>  #include "strbuf.h"
>  #include "dir.h"
> +#include "dir-iterator.h"
> +#include "iterator.h"
>  #include "sigchain.h"
>  #include "branch.h"
>  #include "remote.h"
> @@ -411,42 +413,45 @@ static void mkdir_if_missing(const char *pathname, mode_t mode)
>  }
>  
>  static void copy_or_link_directory(struct strbuf *src, struct strbuf *dest,
> -				   const char *src_repo, int src_baselen)
> +				   const char *src_repo)
>  {
> -	struct dirent *de;
> -	struct stat buf;
>  	int src_len, dest_len;
> -	DIR *dir;
> -
> -	dir = opendir(src->buf);
> -	if (!dir)
> -		die_errno(_("failed to open '%s'"), src->buf);
> +	struct dir_iterator *iter;
> +	int iter_status;
> +	struct stat st;
>  
>  	mkdir_if_missing(dest->buf, 0777);
>  
> +	iter = dir_iterator_begin(src->buf, 1);
> +
>  	strbuf_addch(src, '/');
>  	src_len = src->len;
>  	strbuf_addch(dest, '/');
>  	dest_len = dest->len;
>  
> -	while ((de = readdir(dir)) != NULL) {
> +	while ((iter_status = dir_iterator_advance(iter)) == ITER_OK) {
>  		strbuf_setlen(src, src_len);
> -		strbuf_addstr(src, de->d_name);
> +		strbuf_addstr(src, iter->relative_path);
>  		strbuf_setlen(dest, dest_len);
> -		strbuf_addstr(dest, de->d_name);
> -		if (stat(src->buf, &buf)) {
> +		strbuf_addstr(dest, iter->relative_path);
> +
> +		/*
> +		 * dir_iterator_advance already calls lstat to populate iter->st
> +		 * but, unlike stat, lstat does not checks for permissions on
> +		 * the given path.
> +		 */

Hmm, lstat does check the permissions on the path, it just doesn't
follow symlinks.  I think I actually mislead you in my previous review
here, and was reading the code in dir-iterator.c all wrong.

I thought it said "if (errno == ENOENT) warning(...)", however the
condition is "errno != ENOENT", which is why I thought we were loosing
warnings when errno == EACCES for example.

As we decided that we would no longer follow symlinks now, I think we
can actually get rid of the stat call here.  Sorry about the confusion.

> +		if (stat(src->buf, &st)) {
>  			warning (_("failed to stat %s\n"), src->buf);
>  			continue;
>  		}
> -		if (S_ISDIR(buf.st_mode)) {
> -			if (de->d_name[0] != '.')
> -				copy_or_link_directory(src, dest,
> -						       src_repo, src_baselen);
> +
> +		if (S_ISDIR(iter->st.st_mode)) {
> +			mkdir_if_missing(dest->buf, 0777);
>  			continue;
>  		}
>  
>  		/* Files that cannot be copied bit-for-bit... */
> -		if (!strcmp(src->buf + src_baselen, "/info/alternates")) {
> +		if (!strcmp(iter->relative_path, "info/alternates")) {
>  			copy_alternates(src, dest, src_repo);
>  			continue;
>  		}
> @@ -463,7 +468,11 @@ static void copy_or_link_directory(struct strbuf *src, struct strbuf *dest,
>  		if (copy_file_with_time(dest->buf, src->buf, 0666))
>  			die_errno(_("failed to copy file to '%s'"), dest->buf);
>  	}
> -	closedir(dir);
> +
> +	if (iter_status != ITER_DONE) {
> +		strbuf_setlen(src, src_len);
> +		die(_("failed to iterate over '%s'"), src->buf);
> +	}
>  }
>  
>  static void clone_local(const char *src_repo, const char *dest_repo)
> @@ -481,7 +490,7 @@ static void clone_local(const char *src_repo, const char *dest_repo)
>  		get_common_dir(&dest, dest_repo);
>  		strbuf_addstr(&src, "/objects");
>  		strbuf_addstr(&dest, "/objects");
> -		copy_or_link_directory(&src, &dest, src_repo, src.len);
> +		copy_or_link_directory(&src, &dest, src_repo);
>  		strbuf_release(&src);
>  		strbuf_release(&dest);
>  	}
> -- 
> 2.20.1
> 

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [GSoC][PATCH 0/3] clone: convert explicit dir traversal to dir-iterator
  2019-02-23 19:03 [GSoC][PATCH 0/3] clone: convert explicit dir traversal to dir-iterator Matheus Tavares
                   ` (3 preceding siblings ...)
  2019-02-23 19:07 ` [GSoC][PATCH 0/3] clone: convert explicit dir traversal to dir-iterator Matheus Tavares Bernardino
@ 2019-02-23 21:59 ` Thomas Gummerer
  2019-02-24 16:34   ` Matheus Tavares Bernardino
  4 siblings, 1 reply; 35+ messages in thread
From: Thomas Gummerer @ 2019-02-23 21:59 UTC (permalink / raw)
  To: Matheus Tavares; +Cc: git, Christian Couder

On 02/23, Matheus Tavares wrote:
> Add pedantic option to dir_iterator_begin at dir-iterator.c and convert
> explicit recursive directory traversal at copy_or_link_directory
> (builtin/clone.c) to the dir-iterator API.

Thanks for another iteration of this.

To make life easier for reviewers, please include a link (or the
message-ID) to all previous iteration of the series.  We often use
links to the public-inbox mirror for this,
e.g. https://public-inbox.org/git/20190215154913.18800-1-matheus.bernardino@usp.br/.

This helps reviewers go back quickly to previous iterations of the
series, and refresh their memory on the comments that were left
there.

You can also use the --in-reply-to option in 'git send-email' to chain
the threads, which also makes life easier for reviewers.

An additional way to help reviewers is to include a 'range-diff'
between the previous iteration of the series, and the current
iteration.  See the 'git range-diff' command or the '--range-diff'
option to 'git format-patch' for that.  That helps reviewers to
quickly see what changed between iterations, so that they don't have
to re-review everything, if they can still remember the last round
well enough.

I also added Christian Couder back to the Cc list, as he commented on
the RFC.  It's good to keep people that commented on the series in Cc,
as they have shown some interest in the series, so keeping them in the
Cc list helps highlight those emails for them, and makes it more
likely that the patches get reviewed quickly.

> This is my microproject for GSoC 2019. Idea taken from
> https://git.github.io/SoC-2019-Microprojects/#use-dir-iterator-to-avoid-explicit-recursive-directory-traversal
> 
> Build: https://travis-ci.org/MatheusBernardino/git/builds/497512561
> 
> Matheus Tavares (3):
>   dir-iterator: add pedantic option to dir_iterator_begin
>   clone: extract function from copy_or_link_directory
>   clone: use dir-iterator to avoid explicit dir traversal
> 
>  builtin/clone.c      | 72 ++++++++++++++++++++++++++++----------------
>  dir-iterator.c       | 23 ++++++++++++--
>  dir-iterator.h       | 16 ++++++++--
>  refs/files-backend.c |  2 +-
>  4 files changed, 81 insertions(+), 32 deletions(-)
> 
> -- 
> 2.20.1
> 

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [GSoC][PATCH 3/3] clone: use dir-iterator to avoid explicit dir traversal
  2019-02-23 19:03 ` [GSoC][PATCH 3/3] clone: use dir-iterator to avoid explicit dir traversal Matheus Tavares
  2019-02-23 21:48   ` Thomas Gummerer
@ 2019-02-23 22:40   ` Ævar Arnfjörð Bjarmason
  2019-02-24  9:41     ` Christian Couder
  1 sibling, 1 reply; 35+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2019-02-23 22:40 UTC (permalink / raw)
  To: Matheus Tavares
  Cc: git, Thomas Gummerer, Junio C Hamano,
	Nguyễn Thái Ngọc Duy


On Sat, Feb 23 2019, Matheus Tavares wrote:

> Replace usage of opendir/readdir/closedir API to traverse directories
> recursively, at copy_or_link_directory function, by the dir-iterator
> API. This simplifies the code and avoid recursive calls to
> copy_or_link_directory.

Sounds good in principle.

> This process also brings some safe behaviour changes to
> copy_or_link_directory:

I ad-hoc tested some of these, and could spot behavior changes. We
should have tests for these.

>  - It will no longer follows symbolic links. This is not a problem,
>    since the function is only used to copy .git/objects directory, and
>    symbolic links are not expected there.

I don't think we should make that assumption, and I don't know of
anything else in git that does.

I've certainly symlinked individual objects or packs into a repo for
debugging / recovery, and it would be unexpected to clone that and miss
something.

So in the general case we should be strict in what we generate, but
permissive in what we accept. We don't want a "clone" of an existing
repo to fail, or "fsck" to fail after clone...

When trying to test this I made e.g. objects/c4 a symlink to /tmp/c4,
and a specific object in objects/4d/ a symlink to /tmp too.

Without this patch the individual object is still a symlink, but the
object under the directory gets resolved, and "un-symlinked", also with
--dissociate, which seems like an existing bug.

With your patch that symlink structure is copied as-is. That's more
faithful under --local, but a regression for --dissociate (which didn't
work fully to begin with...).

I was paranoid that "no longer follows symbolic links" could also mean
"will ignore those objects", but it seems to more faithfully copy things
as-is for *that* case.

But then I try with --no-hardlinks and stock git dereferences my symlink
structure, but with your patches fails completely:

    Cloning into bare repository 'repo2'...
    error: copy-fd: read returned: Is a directory
    fatal: failed to copy file to 'repo2/objects/c4': Is a directory
    fatal: the remote end hung up unexpectedly
    fatal: cannot change to 'repo2': No such file or directory

So there's at least one case in a few minutes of prodding this where we
can't clone a working repo now, however obscure the setup.

>  - Hidden directories won't be skipped anymore. In fact, it is odd that
>    the function currently skip hidden directories but not hidden files.
>    The reason for that could be unintentional: probably the intention
>    was to skip '.' and '..' only, but it ended up accidentally skipping
>    all directories starting with '.'. Again, it must not be a problem
>    not to skip hidden dirs since hidden dirs/files are not expected at
>    .git/objects.

I reproduce this with --local. A ".foo" isn't copied before, now it
is. Good, I guess. We'd have already copied a "foo".

>  - Now, copy_or_link_directory will call die() in case of an error on
>    openddir, readdir or lstat, inside dir_iterator_advance. That means
>    it will abort in case of an error trying to fetch any iteration
>    entry.

Good, but really IMNSHO this series is tweaking some critical core code
and desperately needs tests.

Unfortunately, in this as in so many edge case we have no existing
tests.

This would be much easier to review and would give reviewers more
confidence if the parts of this that changed behavior started with a
patch or patches that just manually objects/ dirs with various
combinations of symlinks, hardlinks etc., and asserted that the various
options did exactly what they're doing now, and made sure the
source/target repos were the same after/both passed "fsck".

Then followed by some version of this patch which changes the behavior,
and would be forced to tweak those tests. To make it clear e.g. that
some cases where we have a working "clone" are now a hard error.

Thanks for working on this!

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [GSoC][PATCH 1/3] dir-iterator: add pedantic option to dir_iterator_begin
  2019-02-23 21:35   ` Thomas Gummerer
@ 2019-02-24  8:35     ` Christian Couder
  2019-02-24 17:43       ` Matheus Tavares Bernardino
  0 siblings, 1 reply; 35+ messages in thread
From: Christian Couder @ 2019-02-24  8:35 UTC (permalink / raw)
  To: Thomas Gummerer
  Cc: Matheus Tavares, git, Junio C Hamano, Ramsay Jones,
	Nguyễn Thái Ngọc Duy, Michael Haggerty

On Sat, Feb 23, 2019 at 10:37 PM Thomas Gummerer <t.gummerer@gmail.com> wrote:
>
> On 02/23, Matheus Tavares wrote:
> > Add the pedantic option to dir-iterator's initialization function,
> > dir_iterator_begin. When this option is set to true,
> > dir_iterator_advance will immediately return ITER_ERROR when failing to
> > fetch the next entry. When set to false, dir_iterator_advance will emit
> > a warning and keep looking for the next entry.
> >
> > Also adjust refs/files-backend.c to the new dir_iterator_begin
> > signature.
>
> Thanks, this makes sense to me.  This commit message describes what we
> are doing in this patch, but not why we are doing it.  From previously
> reviewing this series, I know this is going to be used in a subsequent
> patch, but it is nice to give reviewers and future readers of this
> patch that context as well.  Just something like "This behaviour is
> going to be used in a subsequent patch." should be enough here.

I agree that it's a good idea to add just that.

> >  int dir_iterator_advance(struct dir_iterator *dir_iterator)
> > @@ -71,6 +78,8 @@ int dir_iterator_advance(struct dir_iterator *dir_iterator)
> >
> >                       level->dir = opendir(iter->base.path.buf);
> >                       if (!level->dir && errno != ENOENT) {
> > +                             if (iter->pedantic)
> > +                                     goto error_out;
>
> I think we should also print an error here.  The caller doesn't have
> any context on what went wrong, and will probably just 'die()' if an
> error is encountered.  I think it would make sense to call
> 'error(...)' here before 'goto error_out;' to give a useful error
> message here.

If we start to give error messages, then we might as well give error
messages all the times when we error out. This will avoid the callers
wondering if they need to give an error message or not.

I am not sure it's necessary here though. And I think if it's useful,
it can be added in another patch or another patch series.

> >                               warning("error opening directory %s: %s",
> >                                       iter->base.path.buf, strerror(errno));
> >                               /* Popping the level is handled below */

> > -struct dir_iterator *dir_iterator_begin(const char *path)
> > +struct dir_iterator *dir_iterator_begin(const char *path, int pedantic)
>
> Thinking about the future evolution of this interface, it might make
> more sense to have that second parameter be a "struct
> dir_iterator_opts".  For now it would just have one member "pedantic",
> but in the future we could add additional options there instead of
> adding additional parameters.

I think it's ok with `int pedantic` for now as improvements can be
done when they are really needed. And we will perhaps find out that
it's better to just change `int pedantic` to `unsigned flags` instead
of `struct dir_iterator_opts`.

Thanks,
Christian.

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [GSoC][PATCH 2/3] clone: extract function from copy_or_link_directory
  2019-02-23 19:03 ` [GSoC][PATCH 2/3] clone: extract function from copy_or_link_directory Matheus Tavares
@ 2019-02-24  8:38   ` Christian Couder
  0 siblings, 0 replies; 35+ messages in thread
From: Christian Couder @ 2019-02-24  8:38 UTC (permalink / raw)
  To: Matheus Tavares; +Cc: git, Thomas Gummerer, Junio C Hamano

On Sat, Feb 23, 2019 at 8:06 PM Matheus Tavares
<matheus.bernardino@usp.br> wrote:
>
> Extract dir creation code snippet from copy_or_link_directory to its own
> function named mkdir_if_missing. This change will help removing
> copy_or_link_directory's explicit recursion, which will be done in a
> following patch. Also makes code more readable.
>
> Signed-off-by: Matheus Tavares <matheus.bernardino@usp.br>
> ---
> Changes in v2:
>  - Replaced explicit reference to a following patch at patch message to
>    a more generic reference.
>  - Changed struct stat variable name to st, which is the most common
>    name for such variables in git's code.

Nice!

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [GSoC][PATCH 3/3] clone: use dir-iterator to avoid explicit dir traversal
  2019-02-23 22:40   ` Ævar Arnfjörð Bjarmason
@ 2019-02-24  9:41     ` Christian Couder
  2019-02-24 14:45       ` Ævar Arnfjörð Bjarmason
  2019-02-25  2:31       ` [GSoC][PATCH 3/3] clone: use dir-iterator to avoid explicit dir traversal Matheus Tavares Bernardino
  0 siblings, 2 replies; 35+ messages in thread
From: Christian Couder @ 2019-02-24  9:41 UTC (permalink / raw)
  To: Ævar Arnfjörð Bjarmason
  Cc: Matheus Tavares, git, Thomas Gummerer, Junio C Hamano,
	Nguyễn Thái Ngọc Duy

On Sat, Feb 23, 2019 at 11:48 PM Ævar Arnfjörð Bjarmason
<avarab@gmail.com> wrote:
>
>
> On Sat, Feb 23 2019, Matheus Tavares wrote:
>
> > Replace usage of opendir/readdir/closedir API to traverse directories
> > recursively, at copy_or_link_directory function, by the dir-iterator
> > API. This simplifies the code and avoid recursive calls to
> > copy_or_link_directory.
>
> Sounds good in principle.
>
> > This process also brings some safe behaviour changes to
> > copy_or_link_directory:
>
> I ad-hoc tested some of these, and could spot behavior changes. We
> should have tests for these.

I agree that ideally we should have a few tests for these, but this is
a grey area (see below) and there are areas that are not grey for
which we don't have any test...

And then adding tests would make this series become larger than a
typical GSoC micro-project...

> >  - It will no longer follows symbolic links. This is not a problem,
> >    since the function is only used to copy .git/objects directory, and
> >    symbolic links are not expected there.
>
> I don't think we should make that assumption, and I don't know of
> anything else in git that does.

I think Git itself doesn't create symlinks in ".git/objects/" and we
don't recommend people manually tweaking what's inside ".git/". That's
why I think it's a grey area.

> I've certainly symlinked individual objects or packs into a repo for
> debugging / recovery, and it would be unexpected to clone that and miss
> something.

If people tweak what's inside ".git" by hand, they are expected to
know what they doing and be able to debug it.

> So in the general case we should be strict in what we generate, but
> permissive in what we accept. We don't want a "clone" of an existing
> repo to fail, or "fsck" to fail after clone...

Yeah, but realistically I don't think we are going to foolproof Git
from everything that someone could do by tweaking random things
manually in ".git/".

I am not saying that it should be ok to make things much worse than
they are now in case some things have been tweaked in ".git/", but if
things in general don't look worse in this grey area, and a patch
otherwise improves things, then I think the patch should be ok.

> When trying to test this I made e.g. objects/c4 a symlink to /tmp/c4,
> and a specific object in objects/4d/ a symlink to /tmp too.
>
> Without this patch the individual object is still a symlink, but the
> object under the directory gets resolved, and "un-symlinked", also with
> --dissociate, which seems like an existing bug.
>
> With your patch that symlink structure is copied as-is. That's more
> faithful under --local, but a regression for --dissociate (which didn't
> work fully to begin with...).

I think that I use --local (which is the default if the repository is
specified as a local path) much more often than --dissociate, so for
me the patch would be very positive, especially since --dissociate is
already buggy anyway in this case.

> I was paranoid that "no longer follows symbolic links" could also mean
> "will ignore those objects", but it seems to more faithfully copy things
> as-is for *that* case.

Nice!

> But then I try with --no-hardlinks and stock git dereferences my symlink
> structure, but with your patches fails completely:
>
>     Cloning into bare repository 'repo2'...
>     error: copy-fd: read returned: Is a directory
>     fatal: failed to copy file to 'repo2/objects/c4': Is a directory
>     fatal: the remote end hung up unexpectedly
>     fatal: cannot change to 'repo2': No such file or directory

Maybe this could be fixed. Anyway I don't use --no-hardlinks very
often, so I still think the patch is a positive even with this
failure.

> So there's at least one case in a few minutes of prodding this where we
> can't clone a working repo now, however obscure the setup.
>
> >  - Hidden directories won't be skipped anymore. In fact, it is odd that
> >    the function currently skip hidden directories but not hidden files.
> >    The reason for that could be unintentional: probably the intention
> >    was to skip '.' and '..' only, but it ended up accidentally skipping
> >    all directories starting with '.'. Again, it must not be a problem
> >    not to skip hidden dirs since hidden dirs/files are not expected at
> >    .git/objects.
>
> I reproduce this with --local. A ".foo" isn't copied before, now it
> is. Good, I guess. We'd have already copied a "foo".
>
> >  - Now, copy_or_link_directory will call die() in case of an error on
> >    openddir, readdir or lstat, inside dir_iterator_advance. That means
> >    it will abort in case of an error trying to fetch any iteration
> >    entry.

It would be nice if the above paragraph in the commit message would
say what was the previous behavior and why it's better to die() .

> Good, but really IMNSHO this series is tweaking some critical core code
> and desperately needs tests.

It's critical that this code works well in the usual case, yes. (And
there are already a lot of tests that test that.) But when people have
manually tweaked things in their ".git/objects/", it's not critical
what happens. Many systems have "undefined behaviors" at some point
and that's ok.

And no, I am not saying that we should consider it completely
"undefined behavior" as soon as something is manually tweaked in
".git/", and yes, tests would be nice, and your comments and manual
tests on this are very much appreciated. It's just that I don't think
we should require too much when a patch, especially from a first time
contributor, is already improving things, though it also changes a few
things in a grey area.

> Unfortunately, in this as in so many edge case we have no existing
> tests.
>
> This would be much easier to review and would give reviewers more
> confidence if the parts of this that changed behavior started with a
> patch or patches that just manually objects/ dirs with various

I think "created" is missing between "manually" and  "objects/" in the
above sentence.

> combinations of symlinks, hardlinks etc., and asserted that the various
> options did exactly what they're doing now, and made sure the
> source/target repos were the same after/both passed "fsck".
>
> Then followed by some version of this patch which changes the behavior,
> and would be forced to tweak those tests. To make it clear e.g. that
> some cases where we have a working "clone" are now a hard error.

Unfortunately this would be a lot of work and not appropriate for a
GSoC micro-project.

Thanks,
Christian.

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [GSoC][PATCH 3/3] clone: use dir-iterator to avoid explicit dir traversal
  2019-02-24  9:41     ` Christian Couder
@ 2019-02-24 14:45       ` Ævar Arnfjörð Bjarmason
  2019-02-25  9:45         ` Duy Nguyen
  2019-02-25  2:31       ` [GSoC][PATCH 3/3] clone: use dir-iterator to avoid explicit dir traversal Matheus Tavares Bernardino
  1 sibling, 1 reply; 35+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2019-02-24 14:45 UTC (permalink / raw)
  To: Christian Couder
  Cc: Matheus Tavares, git, Thomas Gummerer, Junio C Hamano,
	Nguyễn Thái Ngọc Duy


On Sun, Feb 24 2019, Christian Couder wrote:

> On Sat, Feb 23, 2019 at 11:48 PM Ævar Arnfjörð Bjarmason
> <avarab@gmail.com> wrote:
>>
>>
>> On Sat, Feb 23 2019, Matheus Tavares wrote:
>>
>> > Replace usage of opendir/readdir/closedir API to traverse directories
>> > recursively, at copy_or_link_directory function, by the dir-iterator
>> > API. This simplifies the code and avoid recursive calls to
>> > copy_or_link_directory.
>>
>> Sounds good in principle.
>>
>> > This process also brings some safe behaviour changes to
>> > copy_or_link_directory:
>>
>> I ad-hoc tested some of these, and could spot behavior changes. We
>> should have tests for these.
>
> I agree that ideally we should have a few tests for these, but this is
> a grey area (see below) and there are areas that are not grey for
> which we don't have any test...
>
> And then adding tests would make this series become larger than a
> typical GSoC micro-project...
>
>> >  - It will no longer follows symbolic links. This is not a problem,
>> >    since the function is only used to copy .git/objects directory, and
>> >    symbolic links are not expected there.
>>
>> I don't think we should make that assumption, and I don't know of
>> anything else in git that does.
>
> I think Git itself doesn't create symlinks in ".git/objects/" and we
> don't recommend people manually tweaking what's inside ".git/". That's
> why I think it's a grey area.
>
>> I've certainly symlinked individual objects or packs into a repo for
>> debugging / recovery, and it would be unexpected to clone that and miss
>> something.
>
> If people tweak what's inside ".git" by hand, they are expected to
> know what they doing and be able to debug it.
>
>> So in the general case we should be strict in what we generate, but
>> permissive in what we accept. We don't want a "clone" of an existing
>> repo to fail, or "fsck" to fail after clone...
>
> Yeah, but realistically I don't think we are going to foolproof Git
> from everything that someone could do by tweaking random things
> manually in ".git/".
>
> I am not saying that it should be ok to make things much worse than
> they are now in case some things have been tweaked in ".git/", but if
> things in general don't look worse in this grey area, and a patch
> otherwise improves things, then I think the patch should be ok.
>
>> When trying to test this I made e.g. objects/c4 a symlink to /tmp/c4,
>> and a specific object in objects/4d/ a symlink to /tmp too.
>>
>> Without this patch the individual object is still a symlink, but the
>> object under the directory gets resolved, and "un-symlinked", also with
>> --dissociate, which seems like an existing bug.
>>
>> With your patch that symlink structure is copied as-is. That's more
>> faithful under --local, but a regression for --dissociate (which didn't
>> work fully to begin with...).
>
> I think that I use --local (which is the default if the repository is
> specified as a local path) much more often than --dissociate, so for
> me the patch would be very positive, especially since --dissociate is
> already buggy anyway in this case.
>
>> I was paranoid that "no longer follows symbolic links" could also mean
>> "will ignore those objects", but it seems to more faithfully copy things
>> as-is for *that* case.
>
> Nice!
>
>> But then I try with --no-hardlinks and stock git dereferences my symlink
>> structure, but with your patches fails completely:
>>
>>     Cloning into bare repository 'repo2'...
>>     error: copy-fd: read returned: Is a directory
>>     fatal: failed to copy file to 'repo2/objects/c4': Is a directory
>>     fatal: the remote end hung up unexpectedly
>>     fatal: cannot change to 'repo2': No such file or directory
>
> Maybe this could be fixed. Anyway I don't use --no-hardlinks very
> often, so I still think the patch is a positive even with this
> failure.
>
>> So there's at least one case in a few minutes of prodding this where we
>> can't clone a working repo now, however obscure the setup.
>>
>> >  - Hidden directories won't be skipped anymore. In fact, it is odd that
>> >    the function currently skip hidden directories but not hidden files.
>> >    The reason for that could be unintentional: probably the intention
>> >    was to skip '.' and '..' only, but it ended up accidentally skipping
>> >    all directories starting with '.'. Again, it must not be a problem
>> >    not to skip hidden dirs since hidden dirs/files are not expected at
>> >    .git/objects.
>>
>> I reproduce this with --local. A ".foo" isn't copied before, now it
>> is. Good, I guess. We'd have already copied a "foo".
>>
>> >  - Now, copy_or_link_directory will call die() in case of an error on
>> >    openddir, readdir or lstat, inside dir_iterator_advance. That means
>> >    it will abort in case of an error trying to fetch any iteration
>> >    entry.
>
> It would be nice if the above paragraph in the commit message would
> say what was the previous behavior and why it's better to die() .
>
>> Good, but really IMNSHO this series is tweaking some critical core code
>> and desperately needs tests.
>
> It's critical that this code works well in the usual case, yes. (And
> there are already a lot of tests that test that.) But when people have
> manually tweaked things in their ".git/objects/", it's not critical
> what happens. Many systems have "undefined behaviors" at some point
> and that's ok.
>
> And no, I am not saying that we should consider it completely
> "undefined behavior" as soon as something is manually tweaked in
> ".git/", and yes, tests would be nice, and your comments and manual
> tests on this are very much appreciated. It's just that I don't think
> we should require too much when a patch, especially from a first time
> contributor, is already improving things, though it also changes a few
> things in a grey area.
>
>> Unfortunately, in this as in so many edge case we have no existing
>> tests.
>>
>> This would be much easier to review and would give reviewers more
>> confidence if the parts of this that changed behavior started with a
>> patch or patches that just manually objects/ dirs with various
>
> I think "created" is missing between "manually" and  "objects/" in the
> above sentence.
>
>> combinations of symlinks, hardlinks etc., and asserted that the various
>> options did exactly what they're doing now, and made sure the
>> source/target repos were the same after/both passed "fsck".
>>
>> Then followed by some version of this patch which changes the behavior,
>> and would be forced to tweak those tests. To make it clear e.g. that
>> some cases where we have a working "clone" are now a hard error.
>
> Unfortunately this would be a lot of work and not appropriate for a
> GSoC micro-project.
>
> Thanks,
> Christian.

Here's a test that works before 3/3 and fails after, can be used with my SOB:

    diff --git a/t/t5604-clone-reference.sh b/t/t5604-clone-reference.sh
    index 4320082b1b..80c0a4a19b 100755
    --- a/t/t5604-clone-reference.sh
    +++ b/t/t5604-clone-reference.sh
    @@ -221,4 +221,37 @@ test_expect_success 'clone, dissociate from alternates' '
     	( cd C && git fsck )
     '

    +test_expect_success SHA1,SYMLINKS 'setup repo with manually symlinked objects/*' '
    +	git init S &&
    +	(
    +		cd S &&
    +		test_commit A &&
    +		git gc &&
    +		test_commit B &&
    +		(
    +			cd .git/objects &&
    +			mv 22/3b7836fb19fdf64ba2d3cd6173c6a283141f78 . &&
    +			ln -s ../3b7836fb19fdf64ba2d3cd6173c6a283141f78 22/ &&
    +			mv 40 forty &&
    +			ln -s forty 40 &&
    +			mv pack packs &&
    +			ln -s packs pack
    +		)
    +	)
    +'
    +
    +test_expect_success SHA1,SYMLINKS 'clone repo with manually symlinked objects/*' '
    +	for option in --local --no-hardlinks --shared --dissociate
    +	do
    +		git clone $option S S$option || return 1 &&
    +		git -C S$option fsck || return 1
    +	done &&
    +	find S-* -type l | sort >actual &&
    +	cat >expected <<-EOF &&
    +	S--dissociate/.git/objects/22/3b7836fb19fdf64ba2d3cd6173c6a283141f78
    +	S--local/.git/objects/22/3b7836fb19fdf64ba2d3cd6173c6a283141f78
    +	EOF
    +	test_cmp expected actual
    +'
    +
     test_done

Obviously not ideal, there's a way to do this without relying on the
SHA1 prereq, but in this case I don't see much point. Also similar to
the test_cmp there it would make senses to create
.git/objects/{foo,.foo} and see if they show up in the clone.

The GSOC project is basically "replace existing thing with identical
utility function", but software can always be more/less complex when it
gets to writing the code, so now it's become "we no longer uniformly
accept the same repository formats across fsck/clone as a side-effect".

To me that's not a grey area. Yes we don't produce that ourselves, but
it's the on-disk format we've always supported, and can expect e.g. that
someone's symlinking objects/?? to different partitions etc. in some
advanced setups.

Can't the utility function we're moving to just be made to be
bug-compatible with what we're doing now with symlinks?

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [GSoC][PATCH 0/3] clone: convert explicit dir traversal to dir-iterator
  2019-02-23 21:59 ` Thomas Gummerer
@ 2019-02-24 16:34   ` Matheus Tavares Bernardino
  2019-02-24 21:07     ` Thomas Gummerer
  0 siblings, 1 reply; 35+ messages in thread
From: Matheus Tavares Bernardino @ 2019-02-24 16:34 UTC (permalink / raw)
  To: Thomas Gummerer; +Cc: git, Christian Couder

On Sat, Feb 23, 2019 at 6:59 PM Thomas Gummerer <t.gummerer@gmail.com> wrote:
>
> On 02/23, Matheus Tavares wrote:
> > Add pedantic option to dir_iterator_begin at dir-iterator.c and convert
> > explicit recursive directory traversal at copy_or_link_directory
> > (builtin/clone.c) to the dir-iterator API.
>
> Thanks for another iteration of this.
>
> To make life easier for reviewers, please include a link (or the
> message-ID) to all previous iteration of the series.  We often use
> links to the public-inbox mirror for this,
> e.g. https://public-inbox.org/git/20190215154913.18800-1-matheus.bernardino@usp.br/.
>
> This helps reviewers go back quickly to previous iterations of the
> series, and refresh their memory on the comments that were left
> there.
>
> You can also use the --in-reply-to option in 'git send-email' to chain
> the threads, which also makes life easier for reviewers.
>
> An additional way to help reviewers is to include a 'range-diff'
> between the previous iteration of the series, and the current
> iteration.  See the 'git range-diff' command or the '--range-diff'
> option to 'git format-patch' for that.  That helps reviewers to
> quickly see what changed between iterations, so that they don't have
> to re-review everything, if they can still remember the last round
> well enough.
>
> I also added Christian Couder back to the Cc list, as he commented on
> the RFC.  It's good to keep people that commented on the series in Cc,
> as they have shown some interest in the series, so keeping them in the
> Cc list helps highlight those emails for them, and makes it more
> likely that the patches get reviewed quickly.
>

Ok, thanks for all the tips, Thomas! Now I started to notice the
public-inbox references for previous iterations in other people's
patches, too.

I am part of a FLOSS group here at USP called FLUSP
(https://flusp.ime.usp.br/), and I plan to write some posts on our
website about what I am learning in the git community so that other
people in the group can have as a base if they decide to start
contributing to git too. So all this tips and explanations are of
great value, not only for me but for others here! Thanks a lot.

> > This is my microproject for GSoC 2019. Idea taken from
> > https://git.github.io/SoC-2019-Microprojects/#use-dir-iterator-to-avoid-explicit-recursive-directory-traversal
> >
> > Build: https://travis-ci.org/MatheusBernardino/git/builds/497512561
> >
> > Matheus Tavares (3):
> >   dir-iterator: add pedantic option to dir_iterator_begin
> >   clone: extract function from copy_or_link_directory
> >   clone: use dir-iterator to avoid explicit dir traversal
> >
> >  builtin/clone.c      | 72 ++++++++++++++++++++++++++++----------------
> >  dir-iterator.c       | 23 ++++++++++++--
> >  dir-iterator.h       | 16 ++++++++--
> >  refs/files-backend.c |  2 +-
> >  4 files changed, 81 insertions(+), 32 deletions(-)
> >
> > --
> > 2.20.1
> >

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [GSoC][PATCH 1/3] dir-iterator: add pedantic option to dir_iterator_begin
  2019-02-24  8:35     ` Christian Couder
@ 2019-02-24 17:43       ` Matheus Tavares Bernardino
  2019-02-24 21:06         ` Thomas Gummerer
  0 siblings, 1 reply; 35+ messages in thread
From: Matheus Tavares Bernardino @ 2019-02-24 17:43 UTC (permalink / raw)
  To: Christian Couder
  Cc: Thomas Gummerer, git, Junio C Hamano, Ramsay Jones,
	Nguyễn Thái Ngọc Duy, Michael Haggerty

On Sun, Feb 24, 2019 at 5:35 AM Christian Couder
<christian.couder@gmail.com> wrote:
>
> On Sat, Feb 23, 2019 at 10:37 PM Thomas Gummerer <t.gummerer@gmail.com> wrote:
> >
> > On 02/23, Matheus Tavares wrote:
> > > Add the pedantic option to dir-iterator's initialization function,
> > > dir_iterator_begin. When this option is set to true,
> > > dir_iterator_advance will immediately return ITER_ERROR when failing to
> > > fetch the next entry. When set to false, dir_iterator_advance will emit
> > > a warning and keep looking for the next entry.
> > >
> > > Also adjust refs/files-backend.c to the new dir_iterator_begin
> > > signature.
> >
> > Thanks, this makes sense to me.  This commit message describes what we
> > are doing in this patch, but not why we are doing it.  From previously
> > reviewing this series, I know this is going to be used in a subsequent
> > patch, but it is nice to give reviewers and future readers of this
> > patch that context as well.  Just something like "This behaviour is
> > going to be used in a subsequent patch." should be enough here.
>
> I agree that it's a good idea to add just that.
>

Indeed! Thank you Thomas and Christian. I will be adding this in v3.

> > >  int dir_iterator_advance(struct dir_iterator *dir_iterator)
> > > @@ -71,6 +78,8 @@ int dir_iterator_advance(struct dir_iterator *dir_iterator)
> > >
> > >                       level->dir = opendir(iter->base.path.buf);
> > >                       if (!level->dir && errno != ENOENT) {
> > > +                             if (iter->pedantic)
> > > +                                     goto error_out;
> >
> > I think we should also print an error here.  The caller doesn't have
> > any context on what went wrong, and will probably just 'die()' if an
> > error is encountered.

To correctly handle the error, I assumed that the caller wouldn't need
to know which exact function returned an error, as long as it knew it
was a "failure to fetch the next entry" kind of error, which is the
"category" of errors caught with the 'pedantic' option. (currently, it
includes errors on lstat, opendir and readdir). Is this assumption
valid?

> > I think it would make sense to call
> > 'error(...)' here before 'goto error_out;' to give a useful error
> > message here.
>
> If we start to give error messages, then we might as well give error
> messages all the times when we error out. This will avoid the callers
> wondering if they need to give an error message or not.
>
> I am not sure it's necessary here though. And I think if it's useful,
> it can be added in another patch or another patch series.
>

I could just copy the warning messages bellow each 'goto error_out'
and use then at an 'error(...)' call before the goto. But as Christian
pointed out, I think this would confuse callers wether they should
print error messages or not. On the other side, it may just be
different 'layers' of errors... I don't have any strong opinion about
this.

> > >                               warning("error opening directory %s: %s",
> > >                                       iter->base.path.buf, strerror(errno));
> > >                               /* Popping the level is handled below */
>
> > > -struct dir_iterator *dir_iterator_begin(const char *path)
> > > +struct dir_iterator *dir_iterator_begin(const char *path, int pedantic)
> >
> > Thinking about the future evolution of this interface, it might make
> > more sense to have that second parameter be a "struct
> > dir_iterator_opts".  For now it would just have one member "pedantic",
> > but in the future we could add additional options there instead of
> > adding additional parameters.
>
> I think it's ok with `int pedantic` for now as improvements can be
> done when they are really needed. And we will perhaps find out that
> it's better to just change `int pedantic` to `unsigned flags` instead
> of `struct dir_iterator_opts`.
>

I did thought about using `unsigned flags` instead of `int pedantic`
for the same reasons Thomas pointed out, but as there would be just
one flag for now, it seemed to me that `int pedantic` would make more
sense (following the 'YAGNI' principle). But if it is already known
that more flags (or options) are coming in a very near future, I may
change this to `unsigned flags` or `struct dir_iterator_opts` in v3 if
you think it is needed. Just let me know, please.

> Thanks,
> Christian.

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [GSoC][PATCH 3/3] clone: use dir-iterator to avoid explicit dir traversal
  2019-02-23 21:48   ` Thomas Gummerer
@ 2019-02-24 18:19     ` Matheus Tavares Bernardino
  0 siblings, 0 replies; 35+ messages in thread
From: Matheus Tavares Bernardino @ 2019-02-24 18:19 UTC (permalink / raw)
  To: Thomas Gummerer
  Cc: git, Junio C Hamano, Nguyễn Thái Ngọc Duy

On Sat, Feb 23, 2019 at 6:48 PM Thomas Gummerer <t.gummerer@gmail.com> wrote:
>
> On 02/23, Matheus Tavares wrote:
> > ---
> > Changes in v2:
> >  - Improved patch message
> >  - Removed a now unused variable
>
> s/variable/parameter/ I believe?

Yes, you are right!

> > +     while ((iter_status = dir_iterator_advance(iter)) == ITER_OK) {
> >               strbuf_setlen(src, src_len);
> > -             strbuf_addstr(src, de->d_name);
> > +             strbuf_addstr(src, iter->relative_path);
> >               strbuf_setlen(dest, dest_len);
> > -             strbuf_addstr(dest, de->d_name);
> > -             if (stat(src->buf, &buf)) {
> > +             strbuf_addstr(dest, iter->relative_path);
> > +
> > +             /*
> > +              * dir_iterator_advance already calls lstat to populate iter->st
> > +              * but, unlike stat, lstat does not checks for permissions on
> > +              * the given path.
> > +              */
>
> Hmm, lstat does check the permissions on the path, it just doesn't
> follow symlinks.  I think I actually mislead you in my previous review
> here, and was reading the code in dir-iterator.c all wrong.
>
> I thought it said "if (errno == ENOENT) warning(...)", however the
> condition is "errno != ENOENT", which is why I thought we were loosing
> warnings when errno == EACCES for example.
>
> As we decided that we would no longer follow symlinks now, I think we
> can actually get rid of the stat call here.  Sorry about the confusion.
>

Ok, I also read the lstat man page wrongly and though it didn't check
for permissions. Thanks for noticing that. I will remove the lstat
call in v3.

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [GSoC][PATCH 1/3] dir-iterator: add pedantic option to dir_iterator_begin
  2019-02-24 17:43       ` Matheus Tavares Bernardino
@ 2019-02-24 21:06         ` Thomas Gummerer
  0 siblings, 0 replies; 35+ messages in thread
From: Thomas Gummerer @ 2019-02-24 21:06 UTC (permalink / raw)
  To: Matheus Tavares Bernardino
  Cc: Christian Couder, git, Junio C Hamano, Ramsay Jones,
	Nguyễn Thái Ngọc Duy, Michael Haggerty

On 02/24, Matheus Tavares Bernardino wrote:
> On Sun, Feb 24, 2019 at 5:35 AM Christian Couder
> <christian.couder@gmail.com> wrote:
> > On Sat, Feb 23, 2019 at 10:37 PM Thomas Gummerer <t.gummerer@gmail.com> wrote:
> > > >  int dir_iterator_advance(struct dir_iterator *dir_iterator)
> > > > @@ -71,6 +78,8 @@ int dir_iterator_advance(struct dir_iterator *dir_iterator)
> > > >
> > > >                       level->dir = opendir(iter->base.path.buf);
> > > >                       if (!level->dir && errno != ENOENT) {
> > > > +                             if (iter->pedantic)
> > > > +                                     goto error_out;
> > >
> > > I think we should also print an error here.  The caller doesn't have
> > > any context on what went wrong, and will probably just 'die()' if an
> > > error is encountered.
> 
> To correctly handle the error, I assumed that the caller wouldn't need
> to know which exact function returned an error, as long as it knew it
> was a "failure to fetch the next entry" kind of error, which is the
> "category" of errors caught with the 'pedantic' option. (currently, it
> includes errors on lstat, opendir and readdir). Is this assumption
> valid?
>
> > > I think it would make sense to call
> > > 'error(...)' here before 'goto error_out;' to give a useful error
> > > message here.
> >
> > If we start to give error messages, then we might as well give error
> > messages all the times when we error out. This will avoid the callers
> > wondering if they need to give an error message or not.
> >
> > I am not sure it's necessary here though. And I think if it's useful,
> > it can be added in another patch or another patch series.
> >
>
> I could just copy the warning messages bellow each 'goto error_out'
> and use then at an 'error(...)' call before the goto. But as Christian
> pointed out, I think this would confuse callers wether they should
> print error messages or not. On the other side, it may just be
> different 'layers' of errors... I don't have any strong opinion about
> this.

Right, I think it just comes down to which amount of detail we want to
communicate back to the user.  I thought a bit more detail could be
helpful, but just giving a more generic error should also be okay.

> > > >                               warning("error opening directory %s: %s",
> > > >                                       iter->base.path.buf, strerror(errno));
> > > >                               /* Popping the level is handled below */
> >
> > > > -struct dir_iterator *dir_iterator_begin(const char *path)
> > > > +struct dir_iterator *dir_iterator_begin(const char *path, int pedantic)
> > >
> > > Thinking about the future evolution of this interface, it might make
> > > more sense to have that second parameter be a "struct
> > > dir_iterator_opts".  For now it would just have one member "pedantic",
> > > but in the future we could add additional options there instead of
> > > adding additional parameters.
> >
> > I think it's ok with `int pedantic` for now as improvements can be
> > done when they are really needed. And we will perhaps find out that
> > it's better to just change `int pedantic` to `unsigned flags` instead
> > of `struct dir_iterator_opts`.
> >
> 
> I did thought about using `unsigned flags` instead of `int pedantic`
> for the same reasons Thomas pointed out, but as there would be just
> one flag for now, it seemed to me that `int pedantic` would make more
> sense (following the 'YAGNI' principle). But if it is already known
> that more flags (or options) are coming in a very near future, I may
> change this to `unsigned flags` or `struct dir_iterator_opts` in v3 if
> you think it is needed. Just let me know, please.

Looking at the potential improvements that were suggested in the
initial commit adding dir-iterator, 'unsigned flags' would not be
enough to be able to pass all those options.  That's where my
suggestion for 'struct dir_iterator_opts' comes in.

But I don't feel too strongly about it, and am okay with just an 'int
pedantic' option for now, until we see different usages, if others
feel like that's the better option for now.

> > Thanks,
> > Christian.

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [GSoC][PATCH 0/3] clone: convert explicit dir traversal to dir-iterator
  2019-02-24 16:34   ` Matheus Tavares Bernardino
@ 2019-02-24 21:07     ` Thomas Gummerer
  0 siblings, 0 replies; 35+ messages in thread
From: Thomas Gummerer @ 2019-02-24 21:07 UTC (permalink / raw)
  To: Matheus Tavares Bernardino; +Cc: git, Christian Couder

On 02/24, Matheus Tavares Bernardino wrote:
> I am part of a FLOSS group here at USP called FLUSP
> (https://flusp.ime.usp.br/), and I plan to write some posts on our
> website about what I am learning in the git community so that other
> people in the group can have as a base if they decide to start
> contributing to git too. So all this tips and explanations are of
> great value, not only for me but for others here! Thanks a lot.

Great to hear!  Please do post links to those posts on the mailing
list as well, I'm interested in reading them.

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [GSoC][PATCH 3/3] clone: use dir-iterator to avoid explicit dir traversal
  2019-02-24  9:41     ` Christian Couder
  2019-02-24 14:45       ` Ævar Arnfjörð Bjarmason
@ 2019-02-25  2:31       ` Matheus Tavares Bernardino
  2019-02-25 10:25         ` Ævar Arnfjörð Bjarmason
  2019-02-26 10:33         ` Christian Couder
  1 sibling, 2 replies; 35+ messages in thread
From: Matheus Tavares Bernardino @ 2019-02-25  2:31 UTC (permalink / raw)
  To: Christian Couder
  Cc: Ævar Arnfjörð Bjarmason, git, Thomas Gummerer,
	Junio C Hamano, Nguyễn Thái Ngọc Duy

Hi, Christian and Ævar

First of all, thanks for the fast and attentive reviews.

I am a little confused about what I should do next. How should I
proceed with this series?

By what was said, I understood that the series:
1) Is indeed an improvement under --local, because it won't deference
symlinks in this case.
2) Don't make --dissociate any better but as it is already buggy that
would be some work for another patchset.
3) Makes git-clone copy hidden paths which is a good behaviour.
4) Breaks --no-hardlinks when there are symlinks at the repo's objects
directory.

I understood that even though git itself does not create symlinks in
.git/objects, we should take care of the case where the user manually
creates them, right? But what would be the appropriate behaviour: to
follow (i.e. deference) symlinks (the way it is done now) or just copy
the link file itself (the way my series currently do)? And shouldn't
we document this decision somewhere?

About the failure with --no-hardlinks having symlinks at .dir/objects,
it's probably because copy_or_link_directory() is trying to copy a
file which is a symlink to a dir and the copy function used is trying
to copy the dir not the link itself. A possible fix is to change
copy.c to copy the link file, but I haven't studied yet how that could
be accomplished.

Another possible fix is to make copy_or_link_directory() deference
symlink structures when --no-hardlinks is given. But because the
function falls back to no-hardlinks when failing to hardlink, I don't
think it would be easy to accomplish this without making the function
*always* deference symlinks. And that would make the series lose the
item 1), which I understand you liked.

So, what would be the best here?

Best,
Matheus Tavares

On Sun, Feb 24, 2019 at 6:41 AM Christian Couder
<christian.couder@gmail.com> wrote:
>
> On Sat, Feb 23, 2019 at 11:48 PM Ævar Arnfjörð Bjarmason
> <avarab@gmail.com> wrote:
> >
> >
> > On Sat, Feb 23 2019, Matheus Tavares wrote:
> >
> > > Replace usage of opendir/readdir/closedir API to traverse directories
> > > recursively, at copy_or_link_directory function, by the dir-iterator
> > > API. This simplifies the code and avoid recursive calls to
> > > copy_or_link_directory.
> >
> > Sounds good in principle.
> >
> > > This process also brings some safe behaviour changes to
> > > copy_or_link_directory:
> >
> > I ad-hoc tested some of these, and could spot behavior changes. We
> > should have tests for these.
>
> I agree that ideally we should have a few tests for these, but this is
> a grey area (see below) and there are areas that are not grey for
> which we don't have any test...
>
> And then adding tests would make this series become larger than a
> typical GSoC micro-project...
>
> > >  - It will no longer follows symbolic links. This is not a problem,
> > >    since the function is only used to copy .git/objects directory, and
> > >    symbolic links are not expected there.
> >
> > I don't think we should make that assumption, and I don't know of
> > anything else in git that does.
>
> I think Git itself doesn't create symlinks in ".git/objects/" and we
> don't recommend people manually tweaking what's inside ".git/". That's
> why I think it's a grey area.
>
> > I've certainly symlinked individual objects or packs into a repo for
> > debugging / recovery, and it would be unexpected to clone that and miss
> > something.
>
> If people tweak what's inside ".git" by hand, they are expected to
> know what they doing and be able to debug it.
>
> > So in the general case we should be strict in what we generate, but
> > permissive in what we accept. We don't want a "clone" of an existing
> > repo to fail, or "fsck" to fail after clone...
>
> Yeah, but realistically I don't think we are going to foolproof Git
> from everything that someone could do by tweaking random things
> manually in ".git/".
>
> I am not saying that it should be ok to make things much worse than
> they are now in case some things have been tweaked in ".git/", but if
> things in general don't look worse in this grey area, and a patch
> otherwise improves things, then I think the patch should be ok.
>
> > When trying to test this I made e.g. objects/c4 a symlink to /tmp/c4,
> > and a specific object in objects/4d/ a symlink to /tmp too.
> >
> > Without this patch the individual object is still a symlink, but the
> > object under the directory gets resolved, and "un-symlinked", also with
> > --dissociate, which seems like an existing bug.
> >
> > With your patch that symlink structure is copied as-is. That's more
> > faithful under --local, but a regression for --dissociate (which didn't
> > work fully to begin with...).
>
> I think that I use --local (which is the default if the repository is
> specified as a local path) much more often than --dissociate, so for
> me the patch would be very positive, especially since --dissociate is
> already buggy anyway in this case.
>
> > I was paranoid that "no longer follows symbolic links" could also mean
> > "will ignore those objects", but it seems to more faithfully copy things
> > as-is for *that* case.
>
> Nice!
>
> > But then I try with --no-hardlinks and stock git dereferences my symlink
> > structure, but with your patches fails completely:
> >
> >     Cloning into bare repository 'repo2'...
> >     error: copy-fd: read returned: Is a directory
> >     fatal: failed to copy file to 'repo2/objects/c4': Is a directory
> >     fatal: the remote end hung up unexpectedly
> >     fatal: cannot change to 'repo2': No such file or directory
>
> Maybe this could be fixed. Anyway I don't use --no-hardlinks very
> often, so I still think the patch is a positive even with this
> failure.
>
> > So there's at least one case in a few minutes of prodding this where we
> > can't clone a working repo now, however obscure the setup.
> >
> > >  - Hidden directories won't be skipped anymore. In fact, it is odd that
> > >    the function currently skip hidden directories but not hidden files.
> > >    The reason for that could be unintentional: probably the intention
> > >    was to skip '.' and '..' only, but it ended up accidentally skipping
> > >    all directories starting with '.'. Again, it must not be a problem
> > >    not to skip hidden dirs since hidden dirs/files are not expected at
> > >    .git/objects.
> >
> > I reproduce this with --local. A ".foo" isn't copied before, now it
> > is. Good, I guess. We'd have already copied a "foo".
> >
> > >  - Now, copy_or_link_directory will call die() in case of an error on
> > >    openddir, readdir or lstat, inside dir_iterator_advance. That means
> > >    it will abort in case of an error trying to fetch any iteration
> > >    entry.
>
> It would be nice if the above paragraph in the commit message would
> say what was the previous behavior and why it's better to die() .
>
> > Good, but really IMNSHO this series is tweaking some critical core code
> > and desperately needs tests.
>
> It's critical that this code works well in the usual case, yes. (And
> there are already a lot of tests that test that.) But when people have
> manually tweaked things in their ".git/objects/", it's not critical
> what happens. Many systems have "undefined behaviors" at some point
> and that's ok.
>
> And no, I am not saying that we should consider it completely
> "undefined behavior" as soon as something is manually tweaked in
> ".git/", and yes, tests would be nice, and your comments and manual
> tests on this are very much appreciated. It's just that I don't think
> we should require too much when a patch, especially from a first time
> contributor, is already improving things, though it also changes a few
> things in a grey area.
>
> > Unfortunately, in this as in so many edge case we have no existing
> > tests.
> >
> > This would be much easier to review and would give reviewers more
> > confidence if the parts of this that changed behavior started with a
> > patch or patches that just manually objects/ dirs with various
>
> I think "created" is missing between "manually" and  "objects/" in the
> above sentence.
>
> > combinations of symlinks, hardlinks etc., and asserted that the various
> > options did exactly what they're doing now, and made sure the
> > source/target repos were the same after/both passed "fsck".
> >
> > Then followed by some version of this patch which changes the behavior,
> > and would be forced to tweak those tests. To make it clear e.g. that
> > some cases where we have a working "clone" are now a hard error.
>
> Unfortunately this would be a lot of work and not appropriate for a
> GSoC micro-project.
>
> Thanks,
> Christian.

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [GSoC][PATCH 3/3] clone: use dir-iterator to avoid explicit dir traversal
  2019-02-24 14:45       ` Ævar Arnfjörð Bjarmason
@ 2019-02-25  9:45         ` Duy Nguyen
  2019-02-26  0:26           ` [WIP RFC PATCH 0/7] clone: dir iterator refactoring with tests Ævar Arnfjörð Bjarmason
                             ` (7 more replies)
  0 siblings, 8 replies; 35+ messages in thread
From: Duy Nguyen @ 2019-02-25  9:45 UTC (permalink / raw)
  To: Ævar Arnfjörð Bjarmason
  Cc: Christian Couder, Matheus Tavares, git, Thomas Gummerer, Junio C Hamano

On Sun, Feb 24, 2019 at 9:45 PM Ævar Arnfjörð Bjarmason
<avarab@gmail.com> wrote:
> ..
> Can't the utility function we're moving to just be made to be
> bug-compatible with what we're doing now with symlinks?

I haven't really followed closely this thread. But I think the first
step should be bug-compatible (it really depends if you count the
current behavior "buggy"). Then add improvements on top if there's
still time or strong consensus. It's easier to bisect that way at
least.
-- 
Duy

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [GSoC][PATCH 3/3] clone: use dir-iterator to avoid explicit dir traversal
  2019-02-25  2:31       ` [GSoC][PATCH 3/3] clone: use dir-iterator to avoid explicit dir traversal Matheus Tavares Bernardino
@ 2019-02-25 10:25         ` Ævar Arnfjörð Bjarmason
  2019-02-25 20:40           ` Christian Couder
  2019-02-26 10:33         ` Christian Couder
  1 sibling, 1 reply; 35+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2019-02-25 10:25 UTC (permalink / raw)
  To: Matheus Tavares Bernardino
  Cc: Christian Couder, git, Thomas Gummerer, Junio C Hamano,
	Nguyễn Thái Ngọc Duy


On Mon, Feb 25 2019, Matheus Tavares Bernardino wrote:

> Hi, Christian and Ævar
>
> First of all, thanks for the fast and attentive reviews.
>
> I am a little confused about what I should do next. How should I
> proceed with this series?
>
> By what was said, I understood that the series:
> 1) Is indeed an improvement under --local, because it won't deference
> symlinks in this case.
> 2) Don't make --dissociate any better but as it is already buggy that
> would be some work for another patchset.
> 3) Makes git-clone copy hidden paths which is a good behaviour.
> 4) Breaks --no-hardlinks when there are symlinks at the repo's objects
> directory.
>
> I understood that even though git itself does not create symlinks in
> .git/objects, we should take care of the case where the user manually
> creates them, right? But what would be the appropriate behaviour: to
> follow (i.e. deference) symlinks (the way it is done now) or just copy
> the link file itself (the way my series currently do)? And shouldn't
> we document this decision somewhere?
>
> About the failure with --no-hardlinks having symlinks at .dir/objects,
> it's probably because copy_or_link_directory() is trying to copy a
> file which is a symlink to a dir and the copy function used is trying
> to copy the dir not the link itself. A possible fix is to change
> copy.c to copy the link file, but I haven't studied yet how that could
> be accomplished.
>
> Another possible fix is to make copy_or_link_directory() deference
> symlink structures when --no-hardlinks is given. But because the
> function falls back to no-hardlinks when failing to hardlink, I don't
> think it would be easy to accomplish this without making the function
> *always* deference symlinks. And that would make the series lose the
> item 1), which I understand you liked.


I don't really have formed opinions one way or the other about what
these specific flags should do in combination with such a repository,
e.g. should --dissociate copy data rather than point to the same
symlinks?

I'm inclined to think so, but I've only thought about it for a couple of
minutes. Maybe if someone starts digging they'll rightly come to a
different conclusion.

Rather, my comment is on the process. Clone behavior is too important to
leave to prose in a commit message. I already found a case where we hard
error not explicitly called out, are there other edge cases I didn't
think of?

So having this e.g. be a 4-part series where 3/4 is introducing tests in
the direction I posted upthread (but needs more work), with 4/4 going
through/justifying each one.


> On Sun, Feb 24, 2019 at 6:41 AM Christian Couder
> <christian.couder@gmail.com> wrote:
>>
>> On Sat, Feb 23, 2019 at 11:48 PM Ævar Arnfjörð Bjarmason
>> <avarab@gmail.com> wrote:
>> >
>> >
>> > On Sat, Feb 23 2019, Matheus Tavares wrote:
>> >
>> > > Replace usage of opendir/readdir/closedir API to traverse directories
>> > > recursively, at copy_or_link_directory function, by the dir-iterator
>> > > API. This simplifies the code and avoid recursive calls to
>> > > copy_or_link_directory.
>> >
>> > Sounds good in principle.
>> >
>> > > This process also brings some safe behaviour changes to
>> > > copy_or_link_directory:
>> >
>> > I ad-hoc tested some of these, and could spot behavior changes. We
>> > should have tests for these.
>>
>> I agree that ideally we should have a few tests for these, but this is
>> a grey area (see below) and there are areas that are not grey for
>> which we don't have any test...
>>
>> And then adding tests would make this series become larger than a
>> typical GSoC micro-project...
>>
>> > >  - It will no longer follows symbolic links. This is not a problem,
>> > >    since the function is only used to copy .git/objects directory, and
>> > >    symbolic links are not expected there.
>> >
>> > I don't think we should make that assumption, and I don't know of
>> > anything else in git that does.
>>
>> I think Git itself doesn't create symlinks in ".git/objects/" and we
>> don't recommend people manually tweaking what's inside ".git/". That's
>> why I think it's a grey area.
>>
>> > I've certainly symlinked individual objects or packs into a repo for
>> > debugging / recovery, and it would be unexpected to clone that and miss
>> > something.
>>
>> If people tweak what's inside ".git" by hand, they are expected to
>> know what they doing and be able to debug it.
>>
>> > So in the general case we should be strict in what we generate, but
>> > permissive in what we accept. We don't want a "clone" of an existing
>> > repo to fail, or "fsck" to fail after clone...
>>
>> Yeah, but realistically I don't think we are going to foolproof Git
>> from everything that someone could do by tweaking random things
>> manually in ".git/".
>>
>> I am not saying that it should be ok to make things much worse than
>> they are now in case some things have been tweaked in ".git/", but if
>> things in general don't look worse in this grey area, and a patch
>> otherwise improves things, then I think the patch should be ok.
>>
>> > When trying to test this I made e.g. objects/c4 a symlink to /tmp/c4,
>> > and a specific object in objects/4d/ a symlink to /tmp too.
>> >
>> > Without this patch the individual object is still a symlink, but the
>> > object under the directory gets resolved, and "un-symlinked", also with
>> > --dissociate, which seems like an existing bug.
>> >
>> > With your patch that symlink structure is copied as-is. That's more
>> > faithful under --local, but a regression for --dissociate (which didn't
>> > work fully to begin with...).
>>
>> I think that I use --local (which is the default if the repository is
>> specified as a local path) much more often than --dissociate, so for
>> me the patch would be very positive, especially since --dissociate is
>> already buggy anyway in this case.
>>
>> > I was paranoid that "no longer follows symbolic links" could also mean
>> > "will ignore those objects", but it seems to more faithfully copy things
>> > as-is for *that* case.
>>
>> Nice!
>>
>> > But then I try with --no-hardlinks and stock git dereferences my symlink
>> > structure, but with your patches fails completely:
>> >
>> >     Cloning into bare repository 'repo2'...
>> >     error: copy-fd: read returned: Is a directory
>> >     fatal: failed to copy file to 'repo2/objects/c4': Is a directory
>> >     fatal: the remote end hung up unexpectedly
>> >     fatal: cannot change to 'repo2': No such file or directory
>>
>> Maybe this could be fixed. Anyway I don't use --no-hardlinks very
>> often, so I still think the patch is a positive even with this
>> failure.
>>
>> > So there's at least one case in a few minutes of prodding this where we
>> > can't clone a working repo now, however obscure the setup.
>> >
>> > >  - Hidden directories won't be skipped anymore. In fact, it is odd that
>> > >    the function currently skip hidden directories but not hidden files.
>> > >    The reason for that could be unintentional: probably the intention
>> > >    was to skip '.' and '..' only, but it ended up accidentally skipping
>> > >    all directories starting with '.'. Again, it must not be a problem
>> > >    not to skip hidden dirs since hidden dirs/files are not expected at
>> > >    .git/objects.
>> >
>> > I reproduce this with --local. A ".foo" isn't copied before, now it
>> > is. Good, I guess. We'd have already copied a "foo".
>> >
>> > >  - Now, copy_or_link_directory will call die() in case of an error on
>> > >    openddir, readdir or lstat, inside dir_iterator_advance. That means
>> > >    it will abort in case of an error trying to fetch any iteration
>> > >    entry.
>>
>> It would be nice if the above paragraph in the commit message would
>> say what was the previous behavior and why it's better to die() .
>>
>> > Good, but really IMNSHO this series is tweaking some critical core code
>> > and desperately needs tests.
>>
>> It's critical that this code works well in the usual case, yes. (And
>> there are already a lot of tests that test that.) But when people have
>> manually tweaked things in their ".git/objects/", it's not critical
>> what happens. Many systems have "undefined behaviors" at some point
>> and that's ok.
>>
>> And no, I am not saying that we should consider it completely
>> "undefined behavior" as soon as something is manually tweaked in
>> ".git/", and yes, tests would be nice, and your comments and manual
>> tests on this are very much appreciated. It's just that I don't think
>> we should require too much when a patch, especially from a first time
>> contributor, is already improving things, though it also changes a few
>> things in a grey area.
>>
>> > Unfortunately, in this as in so many edge case we have no existing
>> > tests.
>> >
>> > This would be much easier to review and would give reviewers more
>> > confidence if the parts of this that changed behavior started with a
>> > patch or patches that just manually objects/ dirs with various
>>
>> I think "created" is missing between "manually" and  "objects/" in the
>> above sentence.
>>
>> > combinations of symlinks, hardlinks etc., and asserted that the various
>> > options did exactly what they're doing now, and made sure the
>> > source/target repos were the same after/both passed "fsck".
>> >
>> > Then followed by some version of this patch which changes the behavior,
>> > and would be forced to tweak those tests. To make it clear e.g. that
>> > some cases where we have a working "clone" are now a hard error.
>>
>> Unfortunately this would be a lot of work and not appropriate for a
>> GSoC micro-project.
>>
>> Thanks,
>> Christian.

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [GSoC][PATCH 3/3] clone: use dir-iterator to avoid explicit dir traversal
  2019-02-25 10:25         ` Ævar Arnfjörð Bjarmason
@ 2019-02-25 20:40           ` Christian Couder
  0 siblings, 0 replies; 35+ messages in thread
From: Christian Couder @ 2019-02-25 20:40 UTC (permalink / raw)
  To: Ævar Arnfjörð Bjarmason
  Cc: Matheus Tavares Bernardino, git, Thomas Gummerer, Junio C Hamano,
	Nguyễn Thái Ngọc Duy

On Mon, Feb 25, 2019 at 11:25 AM Ævar Arnfjörð Bjarmason
<avarab@gmail.com> wrote:
>
>
> On Mon, Feb 25 2019, Matheus Tavares Bernardino wrote:
>
> > Hi, Christian and Ævar
> >
> > First of all, thanks for the fast and attentive reviews.
> >
> > I am a little confused about what I should do next. How should I
> > proceed with this series?
> >
> > By what was said, I understood that the series:
> > 1) Is indeed an improvement under --local, because it won't deference
> > symlinks in this case.
> > 2) Don't make --dissociate any better but as it is already buggy that
> > would be some work for another patchset.
> > 3) Makes git-clone copy hidden paths which is a good behaviour.
> > 4) Breaks --no-hardlinks when there are symlinks at the repo's objects
> > directory.
> >
> > I understood that even though git itself does not create symlinks in
> > .git/objects, we should take care of the case where the user manually
> > creates them, right? But what would be the appropriate behaviour: to
> > follow (i.e. deference) symlinks (the way it is done now) or just copy
> > the link file itself (the way my series currently do)? And shouldn't
> > we document this decision somewhere?
> >
> > About the failure with --no-hardlinks having symlinks at .dir/objects,
> > it's probably because copy_or_link_directory() is trying to copy a
> > file which is a symlink to a dir and the copy function used is trying
> > to copy the dir not the link itself. A possible fix is to change
> > copy.c to copy the link file, but I haven't studied yet how that could
> > be accomplished.
> >
> > Another possible fix is to make copy_or_link_directory() deference
> > symlink structures when --no-hardlinks is given. But because the
> > function falls back to no-hardlinks when failing to hardlink, I don't
> > think it would be easy to accomplish this without making the function
> > *always* deference symlinks. And that would make the series lose the
> > item 1), which I understand you liked.
>
> I don't really have formed opinions one way or the other about what
> these specific flags should do in combination with such a repository,
> e.g. should --dissociate copy data rather than point to the same
> symlinks?
>
> I'm inclined to think so, but I've only thought about it for a couple of
> minutes. Maybe if someone starts digging they'll rightly come to a
> different conclusion.

And maybe one day we will find a very good way to take advantage of
symlinks in .git/objects/ when Git is used normally, but that will go
against what we have decided now, though we have no real need at all
to decide now.

That's why I think it can actually be a good thing not to decide
anything now, and to let us free to decide later.

It's kind of the same as with short options versus long options. It's
a good idea not to use up all your short options too soon (in the name
of current ease of use), and instead wait until people really want a
short option for one option they use really often before attributing
it to this option.

> Rather, my comment is on the process. Clone behavior is too important to
> leave to prose in a commit message.

The same argument could be made to say that what happens in
.git/objects/ when cloning and there are symlinks is too important a
still free design option we have left to give it up right now for no
good reason.

^ permalink raw reply	[flat|nested] 35+ messages in thread

* [WIP RFC PATCH 0/7] clone: dir iterator refactoring with tests
  2019-02-25  9:45         ` Duy Nguyen
@ 2019-02-26  0:26           ` Ævar Arnfjörð Bjarmason
  2019-02-26  0:26           ` [WIP RFC PATCH 1/7] dir-iterator: add pedantic option to dir_iterator_begin Ævar Arnfjörð Bjarmason
                             ` (6 subsequent siblings)
  7 siblings, 0 replies; 35+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2019-02-26  0:26 UTC (permalink / raw)
  To: git
  Cc: Junio C Hamano, Nguyễn Thái Ngọc Duy,
	Matheus Tavares, Thomas Gummerer, Christian Couder,
	Ævar Arnfjörð Bjarmason

On Mon, Feb 25 2019, Duy Nguyen wrote:

> On Sun, Feb 24, 2019 at 9:45 PM Ævar Arnfjörð Bjarmason
> <avarab@gmail.com> wrote:
>> ..
>> Can't the utility function we're moving to just be made to be
>> bug-compatible with what we're doing now with symlinks?
>
> I haven't really followed closely this thread. But I think the first
> step should be bug-compatible (it really depends if you count the
> current behavior "buggy"). Then add improvements on top if there's
> still time or strong consensus. It's easier to bisect that way at
> least.

Here's an attempt at doing that. This was hastily done and as seen
from the commit messages I don't really know what the intention with
some of this stuff is / didn't have time to dig, e.g. why the API is
now using lstat() instead of stat().

But as far as I can tell this is a series where up to patch 5 we
retain the current behavior 100%. And in 6/7 and 7/7 introduce the
behavior changes in the previous 3/3 which I've been commenting on.

Matheus Tavares (3):
  dir-iterator: add pedantic option to dir_iterator_begin
  clone: extract function from copy_or_link_directory
  clone: use dir-iterator to avoid explicit dir traversal

Ævar Arnfjörð Bjarmason (4):
  dir-iterator: use stat() instead of lstat()
  clone: test for our behavior on odd objects/* content
  clone: stop ignoring dotdirs in --local etc. clone
  clone: break cloning repos that have symlinks in them

 builtin/clone.c            | 72 ++++++++++++++++++++++++--------------
 dir-iterator.c             | 23 ++++++++++--
 dir-iterator.h             | 16 +++++++--
 refs/files-backend.c       |  2 +-
 t/t5604-clone-reference.sh | 40 +++++++++++++++++++++
 5 files changed, 121 insertions(+), 32 deletions(-)

-- 
2.21.0.rc2.1.g2d5e20a900.dirty


^ permalink raw reply	[flat|nested] 35+ messages in thread

* [WIP RFC PATCH 1/7] dir-iterator: add pedantic option to dir_iterator_begin
  2019-02-25  9:45         ` Duy Nguyen
  2019-02-26  0:26           ` [WIP RFC PATCH 0/7] clone: dir iterator refactoring with tests Ævar Arnfjörð Bjarmason
@ 2019-02-26  0:26           ` Ævar Arnfjörð Bjarmason
  2019-02-26  0:26           ` [WIP RFC PATCH 2/7] dir-iterator: use stat() instead of lstat() Ævar Arnfjörð Bjarmason
                             ` (5 subsequent siblings)
  7 siblings, 0 replies; 35+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2019-02-26  0:26 UTC (permalink / raw)
  To: git
  Cc: Junio C Hamano, Nguyễn Thái Ngọc Duy,
	Matheus Tavares, Thomas Gummerer, Christian Couder

From: Matheus Tavares <matheus.bernardino@usp.br>

Add the pedantic option to dir-iterator's initialization function,
dir_iterator_begin. When this option is set to true,
dir_iterator_advance will immediately return ITER_ERROR when failing to
fetch the next entry. When set to false, dir_iterator_advance will emit
a warning and keep looking for the next entry.

Also adjust refs/files-backend.c to the new dir_iterator_begin
signature.

Signed-off-by: Matheus Tavares <matheus.bernardino@usp.br>
---
 dir-iterator.c       | 23 +++++++++++++++++++++--
 dir-iterator.h       | 16 +++++++++++++---
 refs/files-backend.c |  2 +-
 3 files changed, 35 insertions(+), 6 deletions(-)

diff --git a/dir-iterator.c b/dir-iterator.c
index f2dcd82fde..070a656555 100644
--- a/dir-iterator.c
+++ b/dir-iterator.c
@@ -48,6 +48,13 @@ struct dir_iterator_int {
 	 * that will be included in this iteration.
 	 */
 	struct dir_iterator_level *levels;
+
+	/*
+	 * Boolean value to define dir-iterator's behaviour when failing to
+	 * fetch next entry. See comments on dir_iterator_begin at
+	 * dir-iterator.h
+	 */
+	int pedantic;
 };
 
 int dir_iterator_advance(struct dir_iterator *dir_iterator)
@@ -71,6 +78,8 @@ int dir_iterator_advance(struct dir_iterator *dir_iterator)
 
 			level->dir = opendir(iter->base.path.buf);
 			if (!level->dir && errno != ENOENT) {
+				if (iter->pedantic)
+					goto error_out;
 				warning("error opening directory %s: %s",
 					iter->base.path.buf, strerror(errno));
 				/* Popping the level is handled below */
@@ -122,6 +131,8 @@ int dir_iterator_advance(struct dir_iterator *dir_iterator)
 			if (!de) {
 				/* This level is exhausted; pop up a level. */
 				if (errno) {
+					if (iter->pedantic)
+						goto error_out;
 					warning("error reading directory %s: %s",
 						iter->base.path.buf, strerror(errno));
 				} else if (closedir(level->dir))
@@ -139,10 +150,13 @@ int dir_iterator_advance(struct dir_iterator *dir_iterator)
 
 			strbuf_addstr(&iter->base.path, de->d_name);
 			if (lstat(iter->base.path.buf, &iter->base.st) < 0) {
-				if (errno != ENOENT)
+				if (errno != ENOENT) {
+					if (iter->pedantic)
+						goto error_out;
 					warning("error reading path '%s': %s",
 						iter->base.path.buf,
 						strerror(errno));
+				}
 				continue;
 			}
 
@@ -159,6 +173,10 @@ int dir_iterator_advance(struct dir_iterator *dir_iterator)
 			return ITER_OK;
 		}
 	}
+
+error_out:
+	dir_iterator_abort(dir_iterator);
+	return ITER_ERROR;
 }
 
 int dir_iterator_abort(struct dir_iterator *dir_iterator)
@@ -182,7 +200,7 @@ int dir_iterator_abort(struct dir_iterator *dir_iterator)
 	return ITER_DONE;
 }
 
-struct dir_iterator *dir_iterator_begin(const char *path)
+struct dir_iterator *dir_iterator_begin(const char *path, int pedantic)
 {
 	struct dir_iterator_int *iter = xcalloc(1, sizeof(*iter));
 	struct dir_iterator *dir_iterator = &iter->base;
@@ -195,6 +213,7 @@ struct dir_iterator *dir_iterator_begin(const char *path)
 
 	ALLOC_GROW(iter->levels, 10, iter->levels_alloc);
 
+	iter->pedantic = pedantic;
 	iter->levels_nr = 1;
 	iter->levels[0].initialized = 0;
 
diff --git a/dir-iterator.h b/dir-iterator.h
index 970793d07a..50ca8e1a27 100644
--- a/dir-iterator.h
+++ b/dir-iterator.h
@@ -19,7 +19,7 @@
  * A typical iteration looks like this:
  *
  *     int ok;
- *     struct iterator *iter = dir_iterator_begin(path);
+ *     struct iterator *iter = dir_iterator_begin(path, 0);
  *
  *     while ((ok = dir_iterator_advance(iter)) == ITER_OK) {
  *             if (want_to_stop_iteration()) {
@@ -65,9 +65,15 @@ struct dir_iterator {
  * The iteration includes all paths under path, not including path
  * itself and not including "." or ".." entries.
  *
- * path is the starting directory. An internal copy will be made.
+ * Parameters are:
+ * - path is the starting directory. An internal copy will be made.
+ * - pedantic is a boolean value. If true, dir-iterator will free
+ *   resources and return ITER_ERROR immediately, in case of an error
+ *   while trying to fetch the next entry in dir_iterator_advance. If
+ *   false, it will just emit a warning and keep looking for the next
+ *   entry.
  */
-struct dir_iterator *dir_iterator_begin(const char *path);
+struct dir_iterator *dir_iterator_begin(const char *path, int pedantic);
 
 /*
  * Advance the iterator to the first or next item and return ITER_OK.
@@ -76,6 +82,10 @@ struct dir_iterator *dir_iterator_begin(const char *path);
  * dir_iterator and associated resources and return ITER_ERROR. It is
  * a bug to use iterator or call this function again after it has
  * returned ITER_DONE or ITER_ERROR.
+ *
+ * Note that whether dir_iterator_advance will return ITER_ERROR when
+ * failing to fetch the next entry or keep going is defined by the
+ * 'pedantic' option at dir-iterator's initialization.
  */
 int dir_iterator_advance(struct dir_iterator *iterator);
 
diff --git a/refs/files-backend.c b/refs/files-backend.c
index dd8abe9185..c3d3b6c454 100644
--- a/refs/files-backend.c
+++ b/refs/files-backend.c
@@ -2143,7 +2143,7 @@ static struct ref_iterator *reflog_iterator_begin(struct ref_store *ref_store,
 
 	base_ref_iterator_init(ref_iterator, &files_reflog_iterator_vtable, 0);
 	strbuf_addf(&sb, "%s/logs", gitdir);
-	iter->dir_iterator = dir_iterator_begin(sb.buf);
+	iter->dir_iterator = dir_iterator_begin(sb.buf, 0);
 	iter->ref_store = ref_store;
 	strbuf_release(&sb);
 
-- 
2.21.0.rc2.1.g2d5e20a900.dirty


^ permalink raw reply	[flat|nested] 35+ messages in thread

* [WIP RFC PATCH 2/7] dir-iterator: use stat() instead of lstat()
  2019-02-25  9:45         ` Duy Nguyen
  2019-02-26  0:26           ` [WIP RFC PATCH 0/7] clone: dir iterator refactoring with tests Ævar Arnfjörð Bjarmason
  2019-02-26  0:26           ` [WIP RFC PATCH 1/7] dir-iterator: add pedantic option to dir_iterator_begin Ævar Arnfjörð Bjarmason
@ 2019-02-26  0:26           ` Ævar Arnfjörð Bjarmason
  2019-02-26  1:53             ` Matheus Tavares Bernardino
  2019-02-26  0:26           ` [WIP RFC PATCH 3/7] clone: extract function from copy_or_link_directory Ævar Arnfjörð Bjarmason
                             ` (4 subsequent siblings)
  7 siblings, 1 reply; 35+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2019-02-26  0:26 UTC (permalink / raw)
  To: git
  Cc: Junio C Hamano, Nguyễn Thái Ngọc Duy,
	Matheus Tavares, Thomas Gummerer, Christian Couder,
	Ævar Arnfjörð Bjarmason

This is surely a horrible idea, but all tests pass with this (not that
I trust them much). Doing this for later WIP use in clone.c.

Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
---
 dir-iterator.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dir-iterator.c b/dir-iterator.c
index 070a656555..6a9c0c4d08 100644
--- a/dir-iterator.c
+++ b/dir-iterator.c
@@ -149,7 +149,7 @@ int dir_iterator_advance(struct dir_iterator *dir_iterator)
 				continue;
 
 			strbuf_addstr(&iter->base.path, de->d_name);
-			if (lstat(iter->base.path.buf, &iter->base.st) < 0) {
+			if (stat(iter->base.path.buf, &iter->base.st) < 0) {
 				if (errno != ENOENT) {
 					if (iter->pedantic)
 						goto error_out;
-- 
2.21.0.rc2.1.g2d5e20a900.dirty


^ permalink raw reply	[flat|nested] 35+ messages in thread

* [WIP RFC PATCH 3/7] clone: extract function from copy_or_link_directory
  2019-02-25  9:45         ` Duy Nguyen
                             ` (2 preceding siblings ...)
  2019-02-26  0:26           ` [WIP RFC PATCH 2/7] dir-iterator: use stat() instead of lstat() Ævar Arnfjörð Bjarmason
@ 2019-02-26  0:26           ` Ævar Arnfjörð Bjarmason
  2019-02-26  0:26           ` [WIP RFC PATCH 4/7] clone: test for our behavior on odd objects/* content Ævar Arnfjörð Bjarmason
                             ` (3 subsequent siblings)
  7 siblings, 0 replies; 35+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2019-02-26  0:26 UTC (permalink / raw)
  To: git
  Cc: Junio C Hamano, Nguyễn Thái Ngọc Duy,
	Matheus Tavares, Thomas Gummerer, Christian Couder

From: Matheus Tavares <matheus.bernardino@usp.br>

Extract dir creation code snippet from copy_or_link_directory to its own
function named mkdir_if_missing. This change will help removing
copy_or_link_directory's explicit recursion, which will be done in a
following patch. Also makes code more readable.

Signed-off-by: Matheus Tavares <matheus.bernardino@usp.br>
---
 builtin/clone.c | 27 +++++++++++++++++++--------
 1 file changed, 19 insertions(+), 8 deletions(-)

diff --git a/builtin/clone.c b/builtin/clone.c
index 50bde99618..862d2ea69c 100644
--- a/builtin/clone.c
+++ b/builtin/clone.c
@@ -392,6 +392,24 @@ static void copy_alternates(struct strbuf *src, struct strbuf *dst,
 	fclose(in);
 }
 
+static void mkdir_if_missing(const char *pathname, mode_t mode)
+{
+	/*
+	 * Create a dir at pathname unless there's already one.
+	 */
+	struct stat st;
+
+	if (mkdir(pathname, mode)) {
+		if (errno != EEXIST)
+			die_errno(_("failed to create directory '%s'"),
+				  pathname);
+		else if (stat(pathname, &st))
+			die_errno(_("failed to stat '%s'"), pathname);
+		else if (!S_ISDIR(st.st_mode))
+			die(_("%s exists and is not a directory"), pathname);
+	}
+}
+
 static void copy_or_link_directory(struct strbuf *src, struct strbuf *dest,
 				   const char *src_repo, int src_baselen)
 {
@@ -404,14 +422,7 @@ static void copy_or_link_directory(struct strbuf *src, struct strbuf *dest,
 	if (!dir)
 		die_errno(_("failed to open '%s'"), src->buf);
 
-	if (mkdir(dest->buf, 0777)) {
-		if (errno != EEXIST)
-			die_errno(_("failed to create directory '%s'"), dest->buf);
-		else if (stat(dest->buf, &buf))
-			die_errno(_("failed to stat '%s'"), dest->buf);
-		else if (!S_ISDIR(buf.st_mode))
-			die(_("%s exists and is not a directory"), dest->buf);
-	}
+	mkdir_if_missing(dest->buf, 0777);
 
 	strbuf_addch(src, '/');
 	src_len = src->len;
-- 
2.21.0.rc2.1.g2d5e20a900.dirty


^ permalink raw reply	[flat|nested] 35+ messages in thread

* [WIP RFC PATCH 4/7] clone: test for our behavior on odd objects/* content
  2019-02-25  9:45         ` Duy Nguyen
                             ` (3 preceding siblings ...)
  2019-02-26  0:26           ` [WIP RFC PATCH 3/7] clone: extract function from copy_or_link_directory Ævar Arnfjörð Bjarmason
@ 2019-02-26  0:26           ` Ævar Arnfjörð Bjarmason
  2019-02-26  0:26           ` [WIP RFC PATCH 5/7] clone: use dir-iterator to avoid explicit dir traversal Ævar Arnfjörð Bjarmason
                             ` (2 subsequent siblings)
  7 siblings, 0 replies; 35+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2019-02-26  0:26 UTC (permalink / raw)
  To: git
  Cc: Junio C Hamano, Nguyễn Thái Ngọc Duy,
	Matheus Tavares, Thomas Gummerer, Christian Couder,
	Ævar Arnfjörð Bjarmason

We've implicitly supported .git/objects/* content of symlinks since
approximately forever, and when we do a copy of the repo we transfer
those over, but aren't very consistent about other random stuff we
find depending on if it's a "hidden" file or not.

Let's add a test for that, which shouldn't read as an endorsement of
what we're doing now, just asserts current behavior.

Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
---
 t/t5604-clone-reference.sh | 60 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 60 insertions(+)

diff --git a/t/t5604-clone-reference.sh b/t/t5604-clone-reference.sh
index 4320082b1b..6f9c77049e 100755
--- a/t/t5604-clone-reference.sh
+++ b/t/t5604-clone-reference.sh
@@ -221,4 +221,64 @@ test_expect_success 'clone, dissociate from alternates' '
 	( cd C && git fsck )
 '
 
+test_expect_success SHA1,SYMLINKS 'setup repo with manually symlinked objects/*' '
+	git init S &&
+	(
+		cd S &&
+		test_commit A &&
+		git gc &&
+		test_commit B &&
+		(
+			cd .git/objects &&
+			mv 22/3b7836fb19fdf64ba2d3cd6173c6a283141f78 . &&
+			ln -s ../3b7836fb19fdf64ba2d3cd6173c6a283141f78 22/ &&
+			mv 40 forty &&
+			ln -s forty 40 &&
+			mv pack packs &&
+			ln -s packs pack &&
+			>.some-hidden-file &&
+			>some-file &&
+			mkdir .some-hidden-dir &&
+			>.some-hidden-dir/some-file &&
+			>.some-hidden-dir/.some-dot-file &&
+			mkdir some-dir &&
+			>some-dir/some-file &&
+			>some-dir/.some-dot-file
+		)
+	)
+'
+
+test_expect_success SHA1,SYMLINKS 'clone repo with manually symlinked objects/*' '
+	for option in --local --no-hardlinks --shared --dissociate
+	do
+		git clone $option S S$option || return 1 &&
+		git -C S$option fsck || return 1
+	done &&
+	find S-* -type l | sort >actual &&
+	cat >expected <<-EOF &&
+	S--dissociate/.git/objects/22/3b7836fb19fdf64ba2d3cd6173c6a283141f78
+	S--local/.git/objects/22/3b7836fb19fdf64ba2d3cd6173c6a283141f78
+	EOF
+	test_cmp expected actual &&
+	find S-* -name "*some*" | sort >actual &&
+	cat >expected <<-EOF &&
+	S--dissociate/.git/objects/.some-hidden-file
+	S--dissociate/.git/objects/some-dir
+	S--dissociate/.git/objects/some-dir/.some-dot-file
+	S--dissociate/.git/objects/some-dir/some-file
+	S--dissociate/.git/objects/some-file
+	S--local/.git/objects/.some-hidden-file
+	S--local/.git/objects/some-dir
+	S--local/.git/objects/some-dir/.some-dot-file
+	S--local/.git/objects/some-dir/some-file
+	S--local/.git/objects/some-file
+	S--no-hardlinks/.git/objects/.some-hidden-file
+	S--no-hardlinks/.git/objects/some-dir
+	S--no-hardlinks/.git/objects/some-dir/.some-dot-file
+	S--no-hardlinks/.git/objects/some-dir/some-file
+	S--no-hardlinks/.git/objects/some-file
+	EOF
+	test_cmp expected actual
+'
+
 test_done
-- 
2.21.0.rc2.1.g2d5e20a900.dirty


^ permalink raw reply	[flat|nested] 35+ messages in thread

* [WIP RFC PATCH 5/7] clone: use dir-iterator to avoid explicit dir traversal
  2019-02-25  9:45         ` Duy Nguyen
                             ` (4 preceding siblings ...)
  2019-02-26  0:26           ` [WIP RFC PATCH 4/7] clone: test for our behavior on odd objects/* content Ævar Arnfjörð Bjarmason
@ 2019-02-26  0:26           ` Ævar Arnfjörð Bjarmason
  2019-02-26  3:48             ` Matheus Tavares Bernardino
  2019-02-26  0:26           ` [WIP RFC PATCH 6/7] clone: stop ignoring dotdirs in --local etc. clone Ævar Arnfjörð Bjarmason
  2019-02-26  0:26           ` [WIP RFC PATCH 7/7] clone: break cloning repos that have symlinks in them Ævar Arnfjörð Bjarmason
  7 siblings, 1 reply; 35+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2019-02-26  0:26 UTC (permalink / raw)
  To: git
  Cc: Junio C Hamano, Nguyễn Thái Ngọc Duy,
	Matheus Tavares, Thomas Gummerer, Christian Couder,
	Ævar Arnfjörð Bjarmason

From: Matheus Tavares <matheus.bernardino@usp.br>

Replace usage of opendir/readdir/closedir API to traverse directories
recursively, at copy_or_link_directory function, by the dir-iterator
API. This simplifies the code and avoid recursive calls to
copy_or_link_directory.

[Ævar: This should be bug-compatible with the existing "clone"
behavior. The whole bit here with "iter->relative_path[0] == '.'" is a
dirty hack. We don't copy dot-dirs, and then later on just blindly
ignore ENOENT errors as we descend into them. That case really wants
to be a is_dotdir_or_file_within() test instead]

Now, copy_or_link_directory will call die() in case of an error on
openddir, readdir or lstat, inside dir_iterator_advance. That means it
will abort in case of an error trying to fetch any iteration entry.

Signed-off-by: Matheus Tavares <matheus.bernardino@usp.br>
Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
---
 builtin/clone.c | 55 +++++++++++++++++++++++++++++--------------------
 1 file changed, 33 insertions(+), 22 deletions(-)

diff --git a/builtin/clone.c b/builtin/clone.c
index 862d2ea69c..c32e9022b3 100644
--- a/builtin/clone.c
+++ b/builtin/clone.c
@@ -23,6 +23,8 @@
 #include "transport.h"
 #include "strbuf.h"
 #include "dir.h"
+#include "dir-iterator.h"
+#include "iterator.h"
 #include "sigchain.h"
 #include "branch.h"
 #include "remote.h"
@@ -411,42 +413,47 @@ static void mkdir_if_missing(const char *pathname, mode_t mode)
 }
 
 static void copy_or_link_directory(struct strbuf *src, struct strbuf *dest,
-				   const char *src_repo, int src_baselen)
+				   const char *src_repo)
 {
-	struct dirent *de;
-	struct stat buf;
 	int src_len, dest_len;
-	DIR *dir;
-
-	dir = opendir(src->buf);
-	if (!dir)
-		die_errno(_("failed to open '%s'"), src->buf);
+	struct dir_iterator *iter;
+	int iter_status;
+	struct stat st;
 
 	mkdir_if_missing(dest->buf, 0777);
 
+	iter = dir_iterator_begin(src->buf, 1);
+
 	strbuf_addch(src, '/');
 	src_len = src->len;
 	strbuf_addch(dest, '/');
 	dest_len = dest->len;
 
-	while ((de = readdir(dir)) != NULL) {
+	while ((iter_status = dir_iterator_advance(iter)) == ITER_OK) {
 		strbuf_setlen(src, src_len);
-		strbuf_addstr(src, de->d_name);
+		strbuf_addstr(src, iter->relative_path);
 		strbuf_setlen(dest, dest_len);
-		strbuf_addstr(dest, de->d_name);
-		if (stat(src->buf, &buf)) {
+		strbuf_addstr(dest, iter->relative_path);
+
+		/*
+		 * dir_iterator_advance already calls lstat to populate iter->st
+		 * but, unlike stat, lstat does not checks for permissions on
+		 * the given path.
+		 */
+		if (stat(src->buf, &st)) {
 			warning (_("failed to stat %s\n"), src->buf);
 			continue;
 		}
-		if (S_ISDIR(buf.st_mode)) {
-			if (de->d_name[0] != '.')
-				copy_or_link_directory(src, dest,
-						       src_repo, src_baselen);
+
+		if (S_ISDIR(iter->st.st_mode)) {
+			if (iter->relative_path[0] == '.')
+				continue;
+			mkdir_if_missing(dest->buf, 0777);
 			continue;
 		}
 
 		/* Files that cannot be copied bit-for-bit... */
-		if (!strcmp(src->buf + src_baselen, "/info/alternates")) {
+		if (!strcmp(iter->relative_path, "info/alternates")) {
 			copy_alternates(src, dest, src_repo);
 			continue;
 		}
@@ -456,14 +463,18 @@ static void copy_or_link_directory(struct strbuf *src, struct strbuf *dest,
 		if (!option_no_hardlinks) {
 			if (!link(src->buf, dest->buf))
 				continue;
-			if (option_local > 0)
-				die_errno(_("failed to create link '%s'"), dest->buf);
+			if (option_local > 0 && errno != ENOENT)
+				warning_errno(_("failed to create link '%s'"), dest->buf);
 			option_no_hardlinks = 1;
 		}
-		if (copy_file_with_time(dest->buf, src->buf, 0666))
+		if (copy_file_with_time(dest->buf, src->buf, 0666) && errno != ENOENT)
 			die_errno(_("failed to copy file to '%s'"), dest->buf);
 	}
-	closedir(dir);
+
+	if (iter_status != ITER_DONE) {
+		strbuf_setlen(src, src_len);
+		die(_("failed to iterate over '%s'"), src->buf);
+	}
 }
 
 static void clone_local(const char *src_repo, const char *dest_repo)
@@ -481,7 +492,7 @@ static void clone_local(const char *src_repo, const char *dest_repo)
 		get_common_dir(&dest, dest_repo);
 		strbuf_addstr(&src, "/objects");
 		strbuf_addstr(&dest, "/objects");
-		copy_or_link_directory(&src, &dest, src_repo, src.len);
+		copy_or_link_directory(&src, &dest, src_repo);
 		strbuf_release(&src);
 		strbuf_release(&dest);
 	}
-- 
2.21.0.rc2.1.g2d5e20a900.dirty


^ permalink raw reply	[flat|nested] 35+ messages in thread

* [WIP RFC PATCH 6/7] clone: stop ignoring dotdirs in --local etc. clone
  2019-02-25  9:45         ` Duy Nguyen
                             ` (5 preceding siblings ...)
  2019-02-26  0:26           ` [WIP RFC PATCH 5/7] clone: use dir-iterator to avoid explicit dir traversal Ævar Arnfjörð Bjarmason
@ 2019-02-26  0:26           ` Ævar Arnfjörð Bjarmason
  2019-02-26  0:26           ` [WIP RFC PATCH 7/7] clone: break cloning repos that have symlinks in them Ævar Arnfjörð Bjarmason
  7 siblings, 0 replies; 35+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2019-02-26  0:26 UTC (permalink / raw)
  To: git
  Cc: Junio C Hamano, Nguyễn Thái Ngọc Duy,
	Matheus Tavares, Thomas Gummerer, Christian Couder,
	Ævar Arnfjörð Bjarmason

This seems to never have been intentional, just a side-effect of how
the existing code was written.

Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
---
 builtin/clone.c            | 8 +++-----
 t/t5604-clone-reference.sh | 9 +++++++++
 2 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/builtin/clone.c b/builtin/clone.c
index c32e9022b3..515dc91d63 100644
--- a/builtin/clone.c
+++ b/builtin/clone.c
@@ -446,8 +446,6 @@ static void copy_or_link_directory(struct strbuf *src, struct strbuf *dest,
 		}
 
 		if (S_ISDIR(iter->st.st_mode)) {
-			if (iter->relative_path[0] == '.')
-				continue;
 			mkdir_if_missing(dest->buf, 0777);
 			continue;
 		}
@@ -463,11 +461,11 @@ static void copy_or_link_directory(struct strbuf *src, struct strbuf *dest,
 		if (!option_no_hardlinks) {
 			if (!link(src->buf, dest->buf))
 				continue;
-			if (option_local > 0 && errno != ENOENT)
-				warning_errno(_("failed to create link '%s'"), dest->buf);
+			if (option_local > 0)
+				die_errno(_("failed to create link '%s'"), dest->buf);
 			option_no_hardlinks = 1;
 		}
-		if (copy_file_with_time(dest->buf, src->buf, 0666) && errno != ENOENT)
+		if (copy_file_with_time(dest->buf, src->buf, 0666))
 			die_errno(_("failed to copy file to '%s'"), dest->buf);
 	}
 
diff --git a/t/t5604-clone-reference.sh b/t/t5604-clone-reference.sh
index 6f9c77049e..f1a8e74c44 100755
--- a/t/t5604-clone-reference.sh
+++ b/t/t5604-clone-reference.sh
@@ -262,16 +262,25 @@ test_expect_success SHA1,SYMLINKS 'clone repo with manually symlinked objects/*'
 	test_cmp expected actual &&
 	find S-* -name "*some*" | sort >actual &&
 	cat >expected <<-EOF &&
+	S--dissociate/.git/objects/.some-hidden-dir
+	S--dissociate/.git/objects/.some-hidden-dir/.some-dot-file
+	S--dissociate/.git/objects/.some-hidden-dir/some-file
 	S--dissociate/.git/objects/.some-hidden-file
 	S--dissociate/.git/objects/some-dir
 	S--dissociate/.git/objects/some-dir/.some-dot-file
 	S--dissociate/.git/objects/some-dir/some-file
 	S--dissociate/.git/objects/some-file
+	S--local/.git/objects/.some-hidden-dir
+	S--local/.git/objects/.some-hidden-dir/.some-dot-file
+	S--local/.git/objects/.some-hidden-dir/some-file
 	S--local/.git/objects/.some-hidden-file
 	S--local/.git/objects/some-dir
 	S--local/.git/objects/some-dir/.some-dot-file
 	S--local/.git/objects/some-dir/some-file
 	S--local/.git/objects/some-file
+	S--no-hardlinks/.git/objects/.some-hidden-dir
+	S--no-hardlinks/.git/objects/.some-hidden-dir/.some-dot-file
+	S--no-hardlinks/.git/objects/.some-hidden-dir/some-file
 	S--no-hardlinks/.git/objects/.some-hidden-file
 	S--no-hardlinks/.git/objects/some-dir
 	S--no-hardlinks/.git/objects/some-dir/.some-dot-file
-- 
2.21.0.rc2.1.g2d5e20a900.dirty


^ permalink raw reply	[flat|nested] 35+ messages in thread

* [WIP RFC PATCH 7/7] clone: break cloning repos that have symlinks in them
  2019-02-25  9:45         ` Duy Nguyen
                             ` (6 preceding siblings ...)
  2019-02-26  0:26           ` [WIP RFC PATCH 6/7] clone: stop ignoring dotdirs in --local etc. clone Ævar Arnfjörð Bjarmason
@ 2019-02-26  0:26           ` Ævar Arnfjörð Bjarmason
  7 siblings, 0 replies; 35+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2019-02-26  0:26 UTC (permalink / raw)
  To: git
  Cc: Junio C Hamano, Nguyễn Thái Ngọc Duy,
	Matheus Tavares, Thomas Gummerer, Christian Couder,
	Ævar Arnfjörð Bjarmason

Revert back to the lstat() behavior in the dir_iterator interface
we've had since it was added in 0fe5043dad ("dir_iterator: new API for
iterating over a directory tree", 2016-06-18).

No reflog test depends on it, so it's unclear if it's needed. Since
clone now uses this it changes the longstanding behavior of how we
just so happened to support objects dirs with symlinks in them.

Why? I don't know. Just writing this up as an RFC with tests. Insert
rationale here.

Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
---
 dir-iterator.c             |  2 +-
 t/t5604-clone-reference.sh | 45 +++++++-------------------------------
 2 files changed, 9 insertions(+), 38 deletions(-)

diff --git a/dir-iterator.c b/dir-iterator.c
index 6a9c0c4d08..070a656555 100644
--- a/dir-iterator.c
+++ b/dir-iterator.c
@@ -149,7 +149,7 @@ int dir_iterator_advance(struct dir_iterator *dir_iterator)
 				continue;
 
 			strbuf_addstr(&iter->base.path, de->d_name);
-			if (stat(iter->base.path.buf, &iter->base.st) < 0) {
+			if (lstat(iter->base.path.buf, &iter->base.st) < 0) {
 				if (errno != ENOENT) {
 					if (iter->pedantic)
 						goto error_out;
diff --git a/t/t5604-clone-reference.sh b/t/t5604-clone-reference.sh
index f1a8e74c44..a4cd12643e 100755
--- a/t/t5604-clone-reference.sh
+++ b/t/t5604-clone-reference.sh
@@ -249,45 +249,16 @@ test_expect_success SHA1,SYMLINKS 'setup repo with manually symlinked objects/*'
 '
 
 test_expect_success SHA1,SYMLINKS 'clone repo with manually symlinked objects/*' '
-	for option in --local --no-hardlinks --shared --dissociate
+	for option in --local --no-hardlinks --dissociate
 	do
-		git clone $option S S$option || return 1 &&
-		git -C S$option fsck || return 1
+		test_must_fail git clone $option S S$option 2>err || return 1 &&
+		test_i18ngrep "the remote end hung up" err || return 1
 	done &&
-	find S-* -type l | sort >actual &&
-	cat >expected <<-EOF &&
-	S--dissociate/.git/objects/22/3b7836fb19fdf64ba2d3cd6173c6a283141f78
-	S--local/.git/objects/22/3b7836fb19fdf64ba2d3cd6173c6a283141f78
-	EOF
-	test_cmp expected actual &&
-	find S-* -name "*some*" | sort >actual &&
-	cat >expected <<-EOF &&
-	S--dissociate/.git/objects/.some-hidden-dir
-	S--dissociate/.git/objects/.some-hidden-dir/.some-dot-file
-	S--dissociate/.git/objects/.some-hidden-dir/some-file
-	S--dissociate/.git/objects/.some-hidden-file
-	S--dissociate/.git/objects/some-dir
-	S--dissociate/.git/objects/some-dir/.some-dot-file
-	S--dissociate/.git/objects/some-dir/some-file
-	S--dissociate/.git/objects/some-file
-	S--local/.git/objects/.some-hidden-dir
-	S--local/.git/objects/.some-hidden-dir/.some-dot-file
-	S--local/.git/objects/.some-hidden-dir/some-file
-	S--local/.git/objects/.some-hidden-file
-	S--local/.git/objects/some-dir
-	S--local/.git/objects/some-dir/.some-dot-file
-	S--local/.git/objects/some-dir/some-file
-	S--local/.git/objects/some-file
-	S--no-hardlinks/.git/objects/.some-hidden-dir
-	S--no-hardlinks/.git/objects/.some-hidden-dir/.some-dot-file
-	S--no-hardlinks/.git/objects/.some-hidden-dir/some-file
-	S--no-hardlinks/.git/objects/.some-hidden-file
-	S--no-hardlinks/.git/objects/some-dir
-	S--no-hardlinks/.git/objects/some-dir/.some-dot-file
-	S--no-hardlinks/.git/objects/some-dir/some-file
-	S--no-hardlinks/.git/objects/some-file
-	EOF
-	test_cmp expected actual
+	git clone --shared S S--shared &&
+	find S--shared -type l | sort >actual &&
+	test_must_be_empty actual &&
+	find S--shared -name "*some*" | sort >actual &&
+	test_must_be_empty actual
 '
 
 test_done
-- 
2.21.0.rc2.1.g2d5e20a900.dirty


^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [WIP RFC PATCH 2/7] dir-iterator: use stat() instead of lstat()
  2019-02-26  0:26           ` [WIP RFC PATCH 2/7] dir-iterator: use stat() instead of lstat() Ævar Arnfjörð Bjarmason
@ 2019-02-26  1:53             ` Matheus Tavares Bernardino
  0 siblings, 0 replies; 35+ messages in thread
From: Matheus Tavares Bernardino @ 2019-02-26  1:53 UTC (permalink / raw)
  To: Ævar Arnfjörð Bjarmason
  Cc: git, Junio C Hamano, Nguyễn Thái Ngọc Duy,
	Thomas Gummerer, Christian Couder

On Mon, Feb 25, 2019 at 9:26 PM Ævar Arnfjörð Bjarmason
<avarab@gmail.com> wrote:
>
> This is surely a horrible idea, but all tests pass with this (not that
> I trust them much). Doing this for later WIP use in clone.c.
>
> Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
> ---
>  dir-iterator.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/dir-iterator.c b/dir-iterator.c
> index 070a656555..6a9c0c4d08 100644
> --- a/dir-iterator.c
> +++ b/dir-iterator.c
> @@ -149,7 +149,7 @@ int dir_iterator_advance(struct dir_iterator *dir_iterator)
>                                 continue;
>
>                         strbuf_addstr(&iter->base.path, de->d_name);
> -                       if (lstat(iter->base.path.buf, &iter->base.st) < 0) {
> +                       if (stat(iter->base.path.buf, &iter->base.st) < 0) {

I think this may have side-effects on other sections that uses the
dir-iterator API, because now it would follow symlinks, right?

>                                 if (errno != ENOENT) {
>                                         if (iter->pedantic)
>                                                 goto error_out;
> --
> 2.21.0.rc2.1.g2d5e20a900.dirty
>

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [WIP RFC PATCH 5/7] clone: use dir-iterator to avoid explicit dir traversal
  2019-02-26  0:26           ` [WIP RFC PATCH 5/7] clone: use dir-iterator to avoid explicit dir traversal Ævar Arnfjörð Bjarmason
@ 2019-02-26  3:48             ` Matheus Tavares Bernardino
  2019-02-26 11:33               ` Ævar Arnfjörð Bjarmason
  0 siblings, 1 reply; 35+ messages in thread
From: Matheus Tavares Bernardino @ 2019-02-26  3:48 UTC (permalink / raw)
  To: Ævar Arnfjörð Bjarmason
  Cc: git, Junio C Hamano, Nguyễn Thái Ngọc Duy,
	Thomas Gummerer, Christian Couder

On Mon, Feb 25, 2019 at 9:26 PM Ævar Arnfjörð Bjarmason
<avarab@gmail.com> wrote:
>
> From: Matheus Tavares <matheus.bernardino@usp.br>
>
> Replace usage of opendir/readdir/closedir API to traverse directories
> recursively, at copy_or_link_directory function, by the dir-iterator
> API. This simplifies the code and avoid recursive calls to
> copy_or_link_directory.
>
> [Ævar: This should be bug-compatible with the existing "clone"
> behavior. The whole bit here with "iter->relative_path[0] == '.'" is a
> dirty hack. We don't copy dot-dirs, and then later on just blindly
> ignore ENOENT errors as we descend into them. That case really wants
> to be a is_dotdir_or_file_within() test instead]
>
> Now, copy_or_link_directory will call die() in case of an error on
> openddir, readdir or lstat, inside dir_iterator_advance. That means it
> will abort in case of an error trying to fetch any iteration entry.
>
> Signed-off-by: Matheus Tavares <matheus.bernardino@usp.br>
> Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
> ---
>  builtin/clone.c | 55 +++++++++++++++++++++++++++++--------------------
>  1 file changed, 33 insertions(+), 22 deletions(-)
>
> diff --git a/builtin/clone.c b/builtin/clone.c
> index 862d2ea69c..c32e9022b3 100644
> --- a/builtin/clone.c
> +++ b/builtin/clone.c
> @@ -23,6 +23,8 @@
>  #include "transport.h"
>  #include "strbuf.h"
>  #include "dir.h"
> +#include "dir-iterator.h"
> +#include "iterator.h"
>  #include "sigchain.h"
>  #include "branch.h"
>  #include "remote.h"
> @@ -411,42 +413,47 @@ static void mkdir_if_missing(const char *pathname, mode_t mode)
>  }
>
>  static void copy_or_link_directory(struct strbuf *src, struct strbuf *dest,
> -                                  const char *src_repo, int src_baselen)
> +                                  const char *src_repo)
>  {
> -       struct dirent *de;
> -       struct stat buf;
>         int src_len, dest_len;
> -       DIR *dir;
> -
> -       dir = opendir(src->buf);
> -       if (!dir)
> -               die_errno(_("failed to open '%s'"), src->buf);
> +       struct dir_iterator *iter;
> +       int iter_status;
> +       struct stat st;
>
>         mkdir_if_missing(dest->buf, 0777);
>
> +       iter = dir_iterator_begin(src->buf, 1);
> +
>         strbuf_addch(src, '/');
>         src_len = src->len;
>         strbuf_addch(dest, '/');
>         dest_len = dest->len;
>
> -       while ((de = readdir(dir)) != NULL) {
> +       while ((iter_status = dir_iterator_advance(iter)) == ITER_OK) {
>                 strbuf_setlen(src, src_len);
> -               strbuf_addstr(src, de->d_name);
> +               strbuf_addstr(src, iter->relative_path);
>                 strbuf_setlen(dest, dest_len);
> -               strbuf_addstr(dest, de->d_name);
> -               if (stat(src->buf, &buf)) {
> +               strbuf_addstr(dest, iter->relative_path);
> +
> +               /*
> +                * dir_iterator_advance already calls lstat to populate iter->st
> +                * but, unlike stat, lstat does not checks for permissions on
> +                * the given path.
> +                */
> +               if (stat(src->buf, &st)) {
>                         warning (_("failed to stat %s\n"), src->buf);
>                         continue;
>                 }
> -               if (S_ISDIR(buf.st_mode)) {
> -                       if (de->d_name[0] != '.')
> -                               copy_or_link_directory(src, dest,
> -                                                      src_repo, src_baselen);
> +
> +               if (S_ISDIR(iter->st.st_mode)) {
> +                       if (iter->relative_path[0] == '.')

I think it should be iter->basename[0] here, instead, right?

I also have a more conceptual question here: This additions (or the
is_dotdir_of_file_within as suggested) are just to make patch
compatible with the current behaviour, but are going to be removed
soon after. As this would be kind of a noise, wouldn't it be better to
have a patch before this, already correcting copy_or_link_directory's
behaviour on hidden dirs and them this?

> +                               continue;
> +                       mkdir_if_missing(dest->buf, 0777);
>                         continue;
>                 }
>
>                 /* Files that cannot be copied bit-for-bit... */
> -               if (!strcmp(src->buf + src_baselen, "/info/alternates")) {
> +               if (!strcmp(iter->relative_path, "info/alternates")) {
>                         copy_alternates(src, dest, src_repo);
>                         continue;
>                 }
> @@ -456,14 +463,18 @@ static void copy_or_link_directory(struct strbuf *src, struct strbuf *dest,
>                 if (!option_no_hardlinks) {
>                         if (!link(src->buf, dest->buf))
>                                 continue;
> -                       if (option_local > 0)
> -                               die_errno(_("failed to create link '%s'"), dest->buf);
> +                       if (option_local > 0 && errno != ENOENT)
> +                               warning_errno(_("failed to create link '%s'"), dest->buf);
>                         option_no_hardlinks = 1;
>                 }
> -               if (copy_file_with_time(dest->buf, src->buf, 0666))
> +               if (copy_file_with_time(dest->buf, src->buf, 0666) && errno != ENOENT)
>                         die_errno(_("failed to copy file to '%s'"), dest->buf);
>         }
> -       closedir(dir);
> +
> +       if (iter_status != ITER_DONE) {
> +               strbuf_setlen(src, src_len);
> +               die(_("failed to iterate over '%s'"), src->buf);
> +       }
>  }
>
>  static void clone_local(const char *src_repo, const char *dest_repo)
> @@ -481,7 +492,7 @@ static void clone_local(const char *src_repo, const char *dest_repo)
>                 get_common_dir(&dest, dest_repo);
>                 strbuf_addstr(&src, "/objects");
>                 strbuf_addstr(&dest, "/objects");
> -               copy_or_link_directory(&src, &dest, src_repo, src.len);
> +               copy_or_link_directory(&src, &dest, src_repo);
>                 strbuf_release(&src);
>                 strbuf_release(&dest);
>         }
> --
> 2.21.0.rc2.1.g2d5e20a900.dirty
>

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [GSoC][PATCH 3/3] clone: use dir-iterator to avoid explicit dir traversal
  2019-02-25  2:31       ` [GSoC][PATCH 3/3] clone: use dir-iterator to avoid explicit dir traversal Matheus Tavares Bernardino
  2019-02-25 10:25         ` Ævar Arnfjörð Bjarmason
@ 2019-02-26 10:33         ` Christian Couder
  1 sibling, 0 replies; 35+ messages in thread
From: Christian Couder @ 2019-02-26 10:33 UTC (permalink / raw)
  To: Matheus Tavares Bernardino
  Cc: Ævar Arnfjörð Bjarmason, git, Thomas Gummerer,
	Junio C Hamano, Nguyễn Thái Ngọc Duy

Hi Matheus,

On Mon, Feb 25, 2019 at 3:31 AM Matheus Tavares Bernardino
<matheus.bernardino@usp.br> wrote:
>
> I am a little confused about what I should do next. How should I
> proceed with this series?

When there are different opinions about what you should do, I would
suggest doing only things everyone is ok with, and then resending
saying in the cover letter that such and such issues still need to be
decided. Then hopefully either Junio will decide or a consensus will
eventually appear about what you should do.

In any case please don't wait until something is decided before
resending with improvements (or just pinging us if you have already
made all the improvements that were suggested) as that may not happen.
If people on the list stop hearing about your patch series, they might
just forget about it.

Thanks,
Christian.

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [WIP RFC PATCH 5/7] clone: use dir-iterator to avoid explicit dir traversal
  2019-02-26  3:48             ` Matheus Tavares Bernardino
@ 2019-02-26 11:33               ` Ævar Arnfjörð Bjarmason
  0 siblings, 0 replies; 35+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2019-02-26 11:33 UTC (permalink / raw)
  To: Matheus Tavares Bernardino
  Cc: git, Junio C Hamano, Nguyễn Thái Ngọc Duy,
	Thomas Gummerer, Christian Couder


On Tue, Feb 26 2019, Matheus Tavares Bernardino wrote:

> On Mon, Feb 25, 2019 at 9:26 PM Ævar Arnfjörð Bjarmason
> <avarab@gmail.com> wrote:
>>
>> From: Matheus Tavares <matheus.bernardino@usp.br>
>>
>> Replace usage of opendir/readdir/closedir API to traverse directories
>> recursively, at copy_or_link_directory function, by the dir-iterator
>> API. This simplifies the code and avoid recursive calls to
>> copy_or_link_directory.
>>
>> [Ævar: This should be bug-compatible with the existing "clone"
>> behavior. The whole bit here with "iter->relative_path[0] == '.'" is a
>> dirty hack. We don't copy dot-dirs, and then later on just blindly
>> ignore ENOENT errors as we descend into them. That case really wants
>> to be a is_dotdir_or_file_within() test instead]
>>
>> Now, copy_or_link_directory will call die() in case of an error on
>> openddir, readdir or lstat, inside dir_iterator_advance. That means it
>> will abort in case of an error trying to fetch any iteration entry.
>>
>> Signed-off-by: Matheus Tavares <matheus.bernardino@usp.br>
>> Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
>> ---
>>  builtin/clone.c | 55 +++++++++++++++++++++++++++++--------------------
>>  1 file changed, 33 insertions(+), 22 deletions(-)
>>
>> diff --git a/builtin/clone.c b/builtin/clone.c
>> index 862d2ea69c..c32e9022b3 100644
>> --- a/builtin/clone.c
>> +++ b/builtin/clone.c
>> @@ -23,6 +23,8 @@
>>  #include "transport.h"
>>  #include "strbuf.h"
>>  #include "dir.h"
>> +#include "dir-iterator.h"
>> +#include "iterator.h"
>>  #include "sigchain.h"
>>  #include "branch.h"
>>  #include "remote.h"
>> @@ -411,42 +413,47 @@ static void mkdir_if_missing(const char *pathname, mode_t mode)
>>  }
>>
>>  static void copy_or_link_directory(struct strbuf *src, struct strbuf *dest,
>> -                                  const char *src_repo, int src_baselen)
>> +                                  const char *src_repo)
>>  {
>> -       struct dirent *de;
>> -       struct stat buf;
>>         int src_len, dest_len;
>> -       DIR *dir;
>> -
>> -       dir = opendir(src->buf);
>> -       if (!dir)
>> -               die_errno(_("failed to open '%s'"), src->buf);
>> +       struct dir_iterator *iter;
>> +       int iter_status;
>> +       struct stat st;
>>
>>         mkdir_if_missing(dest->buf, 0777);
>>
>> +       iter = dir_iterator_begin(src->buf, 1);
>> +
>>         strbuf_addch(src, '/');
>>         src_len = src->len;
>>         strbuf_addch(dest, '/');
>>         dest_len = dest->len;
>>
>> -       while ((de = readdir(dir)) != NULL) {
>> +       while ((iter_status = dir_iterator_advance(iter)) == ITER_OK) {
>>                 strbuf_setlen(src, src_len);
>> -               strbuf_addstr(src, de->d_name);
>> +               strbuf_addstr(src, iter->relative_path);
>>                 strbuf_setlen(dest, dest_len);
>> -               strbuf_addstr(dest, de->d_name);
>> -               if (stat(src->buf, &buf)) {
>> +               strbuf_addstr(dest, iter->relative_path);
>> +
>> +               /*
>> +                * dir_iterator_advance already calls lstat to populate iter->st
>> +                * but, unlike stat, lstat does not checks for permissions on
>> +                * the given path.
>> +                */
>> +               if (stat(src->buf, &st)) {
>>                         warning (_("failed to stat %s\n"), src->buf);
>>                         continue;
>>                 }
>> -               if (S_ISDIR(buf.st_mode)) {
>> -                       if (de->d_name[0] != '.')
>> -                               copy_or_link_directory(src, dest,
>> -                                                      src_repo, src_baselen);
>> +
>> +               if (S_ISDIR(iter->st.st_mode)) {
>> +                       if (iter->relative_path[0] == '.')
>
> I think it should be iter->basename[0] here, instead, right?

Yeah, sounds better.

> I also have a more conceptual question here: This additions (or the
> is_dotdir_of_file_within as suggested) are just to make patch
> compatible with the current behaviour, but are going to be removed
> soon after.

Yes. it's not an endorsement of the current behavior, but for ease of
review. I.e. to split changes into smaller logical ones ones that are
refactoring on the one hand, and behavior changes on the other.

> As this would be kind of a noise, wouldn't it be better to have a
> patch before this, already correcting copy_or_link_directory's
> behaviour on hidden dirs and them this?

Yeah, maybe structuring it like that is more readable.


>> +                               continue;
>> +                       mkdir_if_missing(dest->buf, 0777);
>>                         continue;
>>                 }
>>
>>                 /* Files that cannot be copied bit-for-bit... */
>> -               if (!strcmp(src->buf + src_baselen, "/info/alternates")) {
>> +               if (!strcmp(iter->relative_path, "info/alternates")) {
>>                         copy_alternates(src, dest, src_repo);
>>                         continue;
>>                 }
>> @@ -456,14 +463,18 @@ static void copy_or_link_directory(struct strbuf *src, struct strbuf *dest,
>>                 if (!option_no_hardlinks) {
>>                         if (!link(src->buf, dest->buf))
>>                                 continue;
>> -                       if (option_local > 0)
>> -                               die_errno(_("failed to create link '%s'"), dest->buf);
>> +                       if (option_local > 0 && errno != ENOENT)
>> +                               warning_errno(_("failed to create link '%s'"), dest->buf);
>>                         option_no_hardlinks = 1;
>>                 }
>> -               if (copy_file_with_time(dest->buf, src->buf, 0666))
>> +               if (copy_file_with_time(dest->buf, src->buf, 0666) && errno != ENOENT)
>>                         die_errno(_("failed to copy file to '%s'"), dest->buf);
>>         }
>> -       closedir(dir);
>> +
>> +       if (iter_status != ITER_DONE) {
>> +               strbuf_setlen(src, src_len);
>> +               die(_("failed to iterate over '%s'"), src->buf);
>> +       }
>>  }
>>
>>  static void clone_local(const char *src_repo, const char *dest_repo)
>> @@ -481,7 +492,7 @@ static void clone_local(const char *src_repo, const char *dest_repo)
>>                 get_common_dir(&dest, dest_repo);
>>                 strbuf_addstr(&src, "/objects");
>>                 strbuf_addstr(&dest, "/objects");
>> -               copy_or_link_directory(&src, &dest, src_repo, src.len);
>> +               copy_or_link_directory(&src, &dest, src_repo);
>>                 strbuf_release(&src);
>>                 strbuf_release(&dest);
>>         }
>> --
>> 2.21.0.rc2.1.g2d5e20a900.dirty
>>

^ permalink raw reply	[flat|nested] 35+ messages in thread

end of thread, other threads:[~2019-02-26 11:33 UTC | newest]

Thread overview: 35+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-02-23 19:03 [GSoC][PATCH 0/3] clone: convert explicit dir traversal to dir-iterator Matheus Tavares
2019-02-23 19:03 ` [GSoC][PATCH 1/3] dir-iterator: add pedantic option to dir_iterator_begin Matheus Tavares
2019-02-23 21:35   ` Thomas Gummerer
2019-02-24  8:35     ` Christian Couder
2019-02-24 17:43       ` Matheus Tavares Bernardino
2019-02-24 21:06         ` Thomas Gummerer
2019-02-23 19:03 ` [GSoC][PATCH 2/3] clone: extract function from copy_or_link_directory Matheus Tavares
2019-02-24  8:38   ` Christian Couder
2019-02-23 19:03 ` [GSoC][PATCH 3/3] clone: use dir-iterator to avoid explicit dir traversal Matheus Tavares
2019-02-23 21:48   ` Thomas Gummerer
2019-02-24 18:19     ` Matheus Tavares Bernardino
2019-02-23 22:40   ` Ævar Arnfjörð Bjarmason
2019-02-24  9:41     ` Christian Couder
2019-02-24 14:45       ` Ævar Arnfjörð Bjarmason
2019-02-25  9:45         ` Duy Nguyen
2019-02-26  0:26           ` [WIP RFC PATCH 0/7] clone: dir iterator refactoring with tests Ævar Arnfjörð Bjarmason
2019-02-26  0:26           ` [WIP RFC PATCH 1/7] dir-iterator: add pedantic option to dir_iterator_begin Ævar Arnfjörð Bjarmason
2019-02-26  0:26           ` [WIP RFC PATCH 2/7] dir-iterator: use stat() instead of lstat() Ævar Arnfjörð Bjarmason
2019-02-26  1:53             ` Matheus Tavares Bernardino
2019-02-26  0:26           ` [WIP RFC PATCH 3/7] clone: extract function from copy_or_link_directory Ævar Arnfjörð Bjarmason
2019-02-26  0:26           ` [WIP RFC PATCH 4/7] clone: test for our behavior on odd objects/* content Ævar Arnfjörð Bjarmason
2019-02-26  0:26           ` [WIP RFC PATCH 5/7] clone: use dir-iterator to avoid explicit dir traversal Ævar Arnfjörð Bjarmason
2019-02-26  3:48             ` Matheus Tavares Bernardino
2019-02-26 11:33               ` Ævar Arnfjörð Bjarmason
2019-02-26  0:26           ` [WIP RFC PATCH 6/7] clone: stop ignoring dotdirs in --local etc. clone Ævar Arnfjörð Bjarmason
2019-02-26  0:26           ` [WIP RFC PATCH 7/7] clone: break cloning repos that have symlinks in them Ævar Arnfjörð Bjarmason
2019-02-25  2:31       ` [GSoC][PATCH 3/3] clone: use dir-iterator to avoid explicit dir traversal Matheus Tavares Bernardino
2019-02-25 10:25         ` Ævar Arnfjörð Bjarmason
2019-02-25 20:40           ` Christian Couder
2019-02-26 10:33         ` Christian Couder
2019-02-23 19:07 ` [GSoC][PATCH 0/3] clone: convert explicit dir traversal to dir-iterator Matheus Tavares Bernardino
2019-02-23 20:10   ` Ævar Arnfjörð Bjarmason
2019-02-23 21:59 ` Thomas Gummerer
2019-02-24 16:34   ` Matheus Tavares Bernardino
2019-02-24 21:07     ` Thomas Gummerer

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.