All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] Perform cheaper connectivity check when pack is used as medium
@ 2012-02-27 15:39 Nguyễn Thái Ngọc Duy
  2012-02-27 18:14 ` Junio C Hamano
  0 siblings, 1 reply; 23+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2012-02-27 15:39 UTC (permalink / raw)
  To: git; +Cc: Junio C Hamano, Nguyễn Thái Ngọc Duy

When we fetch or push, usually "git rev-list --verify-objects --not
--all --stdin" is used to make sure that there are no gaps between
existing refs and new refs. --verify-objects calls parse_object(),
which internally calls check_sha1_signature() to verify object content
matches its SHA-1 signature.

check_sha1_signature() is an expensive operation, especially when new
refs are far away from existing ones because all objects in between
are re-hashed. However, if we receive new objects by pack, we can
skip the operation because packs themselves do not contain SHA-1
signatures. All signatures are recreated by index-pack's hashing the
pack, which we can trust.

Detect pack transfer cases and turn --verify-objects to --objects.
--objects is similar to --verify-objects except that it does not call
check_sha1_signature().

As an (extreme) example, a repository is created with only one commit:
e83c516 (Initial revision of "git", the information manager from hell
- 2005-04-07). The rest of git.git is fetched on top. Without the
patch:

$ time git fetch file:///home/pclouds/w/git/.git
remote: Counting objects: 125638, done.
remote: Compressing objects: 100% (33201/33201), done.
remote: Total 125638 (delta 92568), reused 123517 (delta 90743)
Receiving objects: 100% (125638/125638), 34.58 MiB | 8.07 MiB/s, done.
Resolving deltas: 100% (92568/92568), done.
From file:///home/pclouds/w/git/
 * branch            HEAD       -> FETCH_HEAD

real    1m30.972s
user    1m31.410s
sys     0m1.757s

With the patch:

$ time git fetch file:///home/pclouds/w/git/.git
remote: Counting objects: 125647, done.
remote: Compressing objects: 100% (33209/33209), done.
remote: Total 125647 (delta 92576), reused 123516 (delta 90744)
Receiving objects: 100% (125647/125647), 34.58 MiB | 7.99 MiB/s, done.
Resolving deltas: 100% (92576/92576), done.
From file:///home/pclouds/w/git/
 * branch            HEAD       -> FETCH_HEAD

real    0m51.456s
user    0m52.737s
sys     0m1.548s

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 builtin/fetch.c        |   12 +++++++-----
 builtin/receive-pack.c |    4 ++--
 connected.c            |    5 ++++-
 connected.h            |    3 ++-
 transport.c            |    5 +++++
 transport.h            |    1 +
 6 files changed, 21 insertions(+), 9 deletions(-)

diff --git a/builtin/fetch.c b/builtin/fetch.c
index 65f5f9b..2b62f42 100644
--- a/builtin/fetch.c
+++ b/builtin/fetch.c
@@ -368,7 +368,7 @@ static int iterate_ref_map(void *cb_data, unsigned char sha1[20])
 }
 
 static int store_updated_refs(const char *raw_url, const char *remote_name,
-		struct ref *ref_map)
+			      struct ref *ref_map, int pack_transport)
 {
 	FILE *fp;
 	struct commit *commit;
@@ -389,7 +389,8 @@ static int store_updated_refs(const char *raw_url, const char *remote_name,
 		url = xstrdup("foreign");
 
 	rm = ref_map;
-	if (check_everything_connected(iterate_ref_map, 0, &rm)) {
+	if (check_everything_connected(iterate_ref_map, 0,
+				       pack_transport ? 0 : 1, &rm)) {
 		rc = error(_("%s did not send all necessary objects\n"), url);
 		goto abort;
 	}
@@ -516,7 +517,7 @@ static int quickfetch(struct ref *ref_map)
 	 */
 	if (depth)
 		return -1;
-	return check_everything_connected(iterate_ref_map, 1, &rm);
+	return check_everything_connected(iterate_ref_map, 1, 0, &rm);
 }
 
 static int fetch_refs(struct transport *transport, struct ref *ref_map)
@@ -526,8 +527,9 @@ static int fetch_refs(struct transport *transport, struct ref *ref_map)
 		ret = transport_fetch_refs(transport, ref_map);
 	if (!ret)
 		ret |= store_updated_refs(transport->url,
-				transport->remote->name,
-				ref_map);
+					  transport->remote->name,
+					  ref_map,
+					  is_pack_transport(transport));
 	transport_unlock_pack(transport);
 	return ret;
 }
diff --git a/builtin/receive-pack.c b/builtin/receive-pack.c
index 0afb8b2..5935751 100644
--- a/builtin/receive-pack.c
+++ b/builtin/receive-pack.c
@@ -669,7 +669,7 @@ static void set_connectivity_errors(struct command *commands)
 	for (cmd = commands; cmd; cmd = cmd->next) {
 		struct command *singleton = cmd;
 		if (!check_everything_connected(command_singleton_iterator,
-						0, &singleton))
+						0, 0, &singleton))
 			continue;
 		cmd->error_string = "missing necessary objects";
 	}
@@ -705,7 +705,7 @@ static void execute_commands(struct command *commands, const char *unpacker_erro
 
 	cmd = commands;
 	if (check_everything_connected(iterate_receive_command_list,
-				       0, &cmd))
+				       0, 0, &cmd))
 		set_connectivity_errors(commands);
 
 	if (run_receive_hook(commands, pre_receive_hook, 0)) {
diff --git a/connected.c b/connected.c
index d762423..6ebd330 100644
--- a/connected.c
+++ b/connected.c
@@ -14,7 +14,8 @@
  *
  * Returns 0 if everything is connected, non-zero otherwise.
  */
-int check_everything_connected(sha1_iterate_fn fn, int quiet, void *cb_data)
+int check_everything_connected(sha1_iterate_fn fn, int quiet,
+			       int strict, void *cb_data)
 {
 	struct child_process rev_list;
 	const char *argv[] = {"rev-list", "--verify-objects",
@@ -26,6 +27,8 @@ int check_everything_connected(sha1_iterate_fn fn, int quiet, void *cb_data)
 	if (fn(cb_data, sha1))
 		return err;
 
+	if (!strict)
+		argv[1] = "--objects";
 	if (quiet)
 		argv[5] = "--quiet";
 
diff --git a/connected.h b/connected.h
index 7e4585a..1f191da 100644
--- a/connected.h
+++ b/connected.h
@@ -15,6 +15,7 @@ typedef int (*sha1_iterate_fn)(void *, unsigned char [20]);
  *
  * Return 0 if Ok, non zero otherwise (i.e. some missing objects)
  */
-extern int check_everything_connected(sha1_iterate_fn, int quiet, void *cb_data);
+extern int check_everything_connected(sha1_iterate_fn, int quiet,
+				      int strict, void *cb_data);
 
 #endif /* CONNECTED_H */
diff --git a/transport.c b/transport.c
index 181f8f2..cd5e0ca 100644
--- a/transport.c
+++ b/transport.c
@@ -1248,3 +1248,8 @@ void for_each_alternate_ref(alternate_ref_fn fn, void *data)
 	cb.data = data;
 	foreach_alt_odb(refs_from_alternate_cb, &cb);
 }
+
+int is_pack_transport(const struct transport *transport)
+{
+	return transport->fetch == fetch_refs_via_pack;
+}
diff --git a/transport.h b/transport.h
index ce99ef8..7cf72ff 100644
--- a/transport.h
+++ b/transport.h
@@ -150,6 +150,7 @@ int transport_disconnect(struct transport *transport);
 char *transport_anonymize_url(const char *url);
 void transport_take_over(struct transport *transport,
 			 struct child_process *child);
+int is_pack_transport(const struct transport *transport);
 
 int transport_connect(struct transport *transport, const char *name,
 		      const char *exec, int fd[2]);
-- 
1.7.8.36.g69ee2

^ permalink raw reply related	[flat|nested] 23+ messages in thread

* Re: [PATCH] Perform cheaper connectivity check when pack is used as medium
  2012-02-27 15:39 [PATCH] Perform cheaper connectivity check when pack is used as medium Nguyễn Thái Ngọc Duy
@ 2012-02-27 18:14 ` Junio C Hamano
  2012-02-28 13:18   ` [PATCH v2] " Nguyễn Thái Ngọc Duy
  0 siblings, 1 reply; 23+ messages in thread
From: Junio C Hamano @ 2012-02-27 18:14 UTC (permalink / raw)
  To: Nguyễn Thái Ngọc Duy; +Cc: git

Please turn the existing "int quiet" that carries only one bit into a flag
word, instead of adding yet another int that is only used for one bit.

Other than that, the idea/approach feels sound.

Thanks.
 

^ permalink raw reply	[flat|nested] 23+ messages in thread

* [PATCH v2] Perform cheaper connectivity check when pack is used as medium
  2012-02-27 18:14 ` Junio C Hamano
@ 2012-02-28 13:18   ` Nguyễn Thái Ngọc Duy
  2012-02-28 15:40     ` Johannes Sixt
  2012-03-02  6:10     ` Junio C Hamano
  0 siblings, 2 replies; 23+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2012-02-28 13:18 UTC (permalink / raw)
  To: git; +Cc: Junio C Hamano, Nguyễn Thái Ngọc Duy

When we fetch or push, usually "git rev-list --verify-objects --not
--all --stdin" is used to make sure that there are no gaps between
existing refs and new refs. --verify-objects calls parse_object(),
which internally calls check_sha1_signature() to verify object content
matches its SHA-1 signature.

check_sha1_signature() is an expensive operation, especially when new
refs are far away from existing ones because all objects in between
are re-hashed. However, if we receive new objects by pack, we can
skip the operation because packs themselves do not contain SHA-1
signatures. All signatures are recreated by unpack-objects/index-pack's
hashing objects in the pack, which we can trust.

Detect pack transfer cases and turn --verify-objects to --objects.
--objects is similar to --verify-objects except that it does not call
check_sha1_signature().

As an (extreme) example, a repository is created with only one commit:
e83c516 (Initial revision of "git", the information manager from hell
- 2005-04-07). The rest of git.git is fetched on top. Without the
patch:

$ time git fetch file:///home/pclouds/w/git/.git
remote: Counting objects: 125638, done.
remote: Compressing objects: 100% (33201/33201), done.
remote: Total 125638 (delta 92568), reused 123517 (delta 90743)
Receiving objects: 100% (125638/125638), 34.58 MiB | 8.07 MiB/s, done.
Resolving deltas: 100% (92568/92568), done.
From file:///home/pclouds/w/git/
 * branch            HEAD       -> FETCH_HEAD

real    1m30.972s
user    1m31.410s
sys     0m1.757s

With the patch:

$ time git fetch file:///home/pclouds/w/git/.git
remote: Counting objects: 125647, done.
remote: Compressing objects: 100% (33209/33209), done.
remote: Total 125647 (delta 92576), reused 123516 (delta 90744)
Receiving objects: 100% (125647/125647), 34.58 MiB | 7.99 MiB/s, done.
Resolving deltas: 100% (92576/92576), done.
From file:///home/pclouds/w/git/
 * branch            HEAD       -> FETCH_HEAD

real    0m51.456s
user    0m52.737s
sys     0m1.548s

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 combine "quiet" and "strict" to a flag word

 builtin/fetch.c |   14 +++++++++-----
 connected.c     |   11 +++++++----
 connected.h     |    6 +++++-
 transport.c     |    5 +++++
 transport.h     |    1 +
 5 files changed, 27 insertions(+), 10 deletions(-)

diff --git a/builtin/fetch.c b/builtin/fetch.c
index 65f5f9b..70c9ca3 100644
--- a/builtin/fetch.c
+++ b/builtin/fetch.c
@@ -368,7 +368,7 @@ static int iterate_ref_map(void *cb_data, unsigned char sha1[20])
 }
 
 static int store_updated_refs(const char *raw_url, const char *remote_name,
-		struct ref *ref_map)
+			      struct ref *ref_map, int use_pack)
 {
 	FILE *fp;
 	struct commit *commit;
@@ -389,7 +389,9 @@ static int store_updated_refs(const char *raw_url, const char *remote_name,
 		url = xstrdup("foreign");
 
 	rm = ref_map;
-	if (check_everything_connected(iterate_ref_map, 0, &rm)) {
+	if (check_everything_connected(iterate_ref_map,
+				       use_pack ? 0 : CHECK_CONNECT_STRICT,
+				       &rm)) {
 		rc = error(_("%s did not send all necessary objects\n"), url);
 		goto abort;
 	}
@@ -516,7 +518,8 @@ static int quickfetch(struct ref *ref_map)
 	 */
 	if (depth)
 		return -1;
-	return check_everything_connected(iterate_ref_map, 1, &rm);
+	return check_everything_connected(iterate_ref_map,
+					  CHECK_CONNECT_QUIET, &rm);
 }
 
 static int fetch_refs(struct transport *transport, struct ref *ref_map)
@@ -526,8 +529,9 @@ static int fetch_refs(struct transport *transport, struct ref *ref_map)
 		ret = transport_fetch_refs(transport, ref_map);
 	if (!ret)
 		ret |= store_updated_refs(transport->url,
-				transport->remote->name,
-				ref_map);
+					  transport->remote->name,
+					  ref_map,
+					  is_pack_transport(transport));
 	transport_unlock_pack(transport);
 	return ret;
 }
diff --git a/connected.c b/connected.c
index d762423..9df357d 100644
--- a/connected.c
+++ b/connected.c
@@ -14,10 +14,11 @@
  *
  * Returns 0 if everything is connected, non-zero otherwise.
  */
-int check_everything_connected(sha1_iterate_fn fn, int quiet, void *cb_data)
+int check_everything_connected(sha1_iterate_fn fn, unsigned int flags,
+			       void *cb_data)
 {
 	struct child_process rev_list;
-	const char *argv[] = {"rev-list", "--verify-objects",
+	const char *argv[] = {"rev-list", "--objects",
 			      "--stdin", "--not", "--all", NULL, NULL};
 	char commit[41];
 	unsigned char sha1[20];
@@ -26,7 +27,9 @@ int check_everything_connected(sha1_iterate_fn fn, int quiet, void *cb_data)
 	if (fn(cb_data, sha1))
 		return err;
 
-	if (quiet)
+	if (flags & CHECK_CONNECT_STRICT)
+		argv[1] = "--verify-objects";
+	if (flags & CHECK_CONNECT_QUIET)
 		argv[5] = "--quiet";
 
 	memset(&rev_list, 0, sizeof(rev_list));
@@ -34,7 +37,7 @@ int check_everything_connected(sha1_iterate_fn fn, int quiet, void *cb_data)
 	rev_list.git_cmd = 1;
 	rev_list.in = -1;
 	rev_list.no_stdout = 1;
-	rev_list.no_stderr = quiet;
+	rev_list.no_stderr = flags & CHECK_CONNECT_QUIET;
 	if (start_command(&rev_list))
 		return error(_("Could not run 'git rev-list'"));
 
diff --git a/connected.h b/connected.h
index 7e4585a..52636b9 100644
--- a/connected.h
+++ b/connected.h
@@ -1,6 +1,9 @@
 #ifndef CONNECTED_H
 #define CONNECTED_H
 
+#define CHECK_CONNECT_QUIET  1
+#define CHECK_CONNECT_STRICT 2
+
 /*
  * Take callback data, and return next object name in the buffer.
  * When called after returning the name for the last object, return -1
@@ -15,6 +18,7 @@ typedef int (*sha1_iterate_fn)(void *, unsigned char [20]);
  *
  * Return 0 if Ok, non zero otherwise (i.e. some missing objects)
  */
-extern int check_everything_connected(sha1_iterate_fn, int quiet, void *cb_data);
+extern int check_everything_connected(sha1_iterate_fn, unsigned int flags,
+				      void *cb_data);
 
 #endif /* CONNECTED_H */
diff --git a/transport.c b/transport.c
index 181f8f2..cd5e0ca 100644
--- a/transport.c
+++ b/transport.c
@@ -1248,3 +1248,8 @@ void for_each_alternate_ref(alternate_ref_fn fn, void *data)
 	cb.data = data;
 	foreach_alt_odb(refs_from_alternate_cb, &cb);
 }
+
+int is_pack_transport(const struct transport *transport)
+{
+	return transport->fetch == fetch_refs_via_pack;
+}
diff --git a/transport.h b/transport.h
index ce99ef8..7cf72ff 100644
--- a/transport.h
+++ b/transport.h
@@ -150,6 +150,7 @@ int transport_disconnect(struct transport *transport);
 char *transport_anonymize_url(const char *url);
 void transport_take_over(struct transport *transport,
 			 struct child_process *child);
+int is_pack_transport(const struct transport *transport);
 
 int transport_connect(struct transport *transport, const char *name,
 		      const char *exec, int fd[2]);
-- 
1.7.8.36.g69ee2

^ permalink raw reply related	[flat|nested] 23+ messages in thread

* Re: [PATCH v2] Perform cheaper connectivity check when pack is used as medium
  2012-02-28 13:18   ` [PATCH v2] " Nguyễn Thái Ngọc Duy
@ 2012-02-28 15:40     ` Johannes Sixt
  2012-02-28 15:47       ` Andreas Ericsson
  2012-02-29  1:41       ` Nguyen Thai Ngoc Duy
  2012-03-02  6:10     ` Junio C Hamano
  1 sibling, 2 replies; 23+ messages in thread
From: Johannes Sixt @ 2012-02-28 15:40 UTC (permalink / raw)
  To: Nguyễn Thái Ngọc Duy; +Cc: git, Junio C Hamano

Am 2/28/2012 14:18, schrieb Nguyễn Thái Ngọc Duy:
> Without the patch:
> $ time git fetch file:///home/pclouds/w/git/.git
> remote: Counting objects: 125638, done.
> remote: Compressing objects: 100% (33201/33201), done.
...
> With the patch:
> $ time git fetch file:///home/pclouds/w/git/.git
> remote: Counting objects: 125647, done.
> remote: Compressing objects: 100% (33209/33209), done.

It is a bit irritating that the number are different when they should be
identical...

-- Hannes

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH v2] Perform cheaper connectivity check when pack is used as medium
  2012-02-28 15:40     ` Johannes Sixt
@ 2012-02-28 15:47       ` Andreas Ericsson
  2012-02-29  1:41       ` Nguyen Thai Ngoc Duy
  1 sibling, 0 replies; 23+ messages in thread
From: Andreas Ericsson @ 2012-02-28 15:47 UTC (permalink / raw)
  To: Johannes Sixt; +Cc: Nguyễn Thái Ngọc Duy, git, Junio C Hamano

On 02/28/2012 04:40 PM, Johannes Sixt wrote:
> Am 2/28/2012 14:18, schrieb Nguyễn Thái Ngọc Duy:
>> Without the patch:
>> $ time git fetch file:///home/pclouds/w/git/.git
>> remote: Counting objects: 125638, done.
>> remote: Compressing objects: 100% (33201/33201), done.
> ...
>> With the patch:
>> $ time git fetch file:///home/pclouds/w/git/.git
>> remote: Counting objects: 125647, done.
>> remote: Compressing objects: 100% (33209/33209), done.
> 
> It is a bit irritating that the number are different when they should be
> identical...
> 

I found it odd as well, but since the latter shows a larger object
count and a shorter time, I disregarded it and considered it some
evidence that he pushed this patch to that repo.

Since commit created 6 blobs, 2 trees and 1 commit object, and the
latter has 9 objects more, I assume that's what happened anyways.
As such, I think we can live with the small discrepancy. Also note
that the latter post had slower transfer rate. That also skews the
comparison somewhat, but again it's in favour of the patch.

-- 
Andreas Ericsson                   andreas.ericsson@op5.se
OP5 AB                             www.op5.se
Tel: +46 8-230225                  Fax: +46 8-230231

Considering the successes of the wars on alcohol, poverty, drugs and
terror, I think we should give some serious thought to declaring war
on peace.

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH v2] Perform cheaper connectivity check when pack is used as medium
  2012-02-28 15:40     ` Johannes Sixt
  2012-02-28 15:47       ` Andreas Ericsson
@ 2012-02-29  1:41       ` Nguyen Thai Ngoc Duy
  1 sibling, 0 replies; 23+ messages in thread
From: Nguyen Thai Ngoc Duy @ 2012-02-29  1:41 UTC (permalink / raw)
  To: Johannes Sixt; +Cc: git, Junio C Hamano

2012/2/28 Johannes Sixt <j.sixt@viscovery.net>:
> Am 2/28/2012 14:18, schrieb Nguyễn Thái Ngọc Duy:
>> Without the patch:
>> $ time git fetch file:///home/pclouds/w/git/.git
>> remote: Counting objects: 125638, done.
>> remote: Compressing objects: 100% (33201/33201), done.
> ...
>> With the patch:
>> $ time git fetch file:///home/pclouds/w/git/.git
>> remote: Counting objects: 125647, done.
>> remote: Compressing objects: 100% (33209/33209), done.
>
> It is a bit irritating that the number are different when they should be
> identical...

~/w/git is my working directory so objects may change a little bit.
Should have fetched from a clone instead, sorry.
-- 
Duy

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH v2] Perform cheaper connectivity check when pack is used as medium
  2012-02-28 13:18   ` [PATCH v2] " Nguyễn Thái Ngọc Duy
  2012-02-28 15:40     ` Johannes Sixt
@ 2012-03-02  6:10     ` Junio C Hamano
  2012-03-02 14:05       ` Nguyen Thai Ngoc Duy
  1 sibling, 1 reply; 23+ messages in thread
From: Junio C Hamano @ 2012-03-02  6:10 UTC (permalink / raw)
  To: Nguyễn Thái Ngọc Duy; +Cc: git

Nguyễn Thái Ngọc Duy  <pclouds@gmail.com> writes:

> When we fetch or push, usually "git rev-list --verify-objects --not
> --all --stdin" is used to make sure that there are no gaps between
> existing refs and new refs. --verify-objects calls parse_object(),
> which internally calls check_sha1_signature() to verify object content
> matches its SHA-1 signature.
>
> check_sha1_signature() is an expensive operation, especially when new
> refs are far away from existing ones because all objects in between
> are re-hashed. However, if we receive new objects by pack, we can
> skip the operation because packs themselves do not contain SHA-1
> signatures. All signatures are recreated by unpack-objects/index-pack's
> hashing objects in the pack, which we can trust.
>
> Detect pack transfer cases and turn --verify-objects to --objects.

After thinking more about this patch, I do not think this "optimization"
is correct.

Consider a case where you have this history


    ---T       o---o---o

where 'T' is the tip of your ref (everything reachable from it is known to
be good), and 'o' are "isolated islands" commits that somehow exist in
your repository but are not complete for whatever reason.

The global history may look like this, where 'X' is the tip the other end
advertised, and '.' are commits that are new to your repository.

    ---T---.---o---o---o---.---X

Upon seeing 'X' advertised and noticing 'T' is the current tip, you would
ask for everything that is needed, i.e. "rev-list --objects T..X", to be
sent to you.

But the other end could send all the trees and commits for 'X' and '.' but
nothing for 'o'.  You will end up with a corrupt history but the loosened
"rev-list --objects T..X" you run after the object transfer will not catch
it.  You need --verify-objects if you want to catch this mode of attack.

Admittedly, in order to mount such an attack successfully, the attacker
needs to know what "isolated islands" you happen to have in the receiving
repository, which makes it much harder than a simpler attack (e.g. sending
only X but nothing else) that would have worked before we introduced the
connectivity check after object transfer.

But we need to realize that the reasoning expressed in your log message
"we received new objects by pack, so everything must validate fine" is not
valid, and we are loosening ourselves to new attacks when we evaluate this
patch.  This is because the completion of the history may be using objects
that did not come from the other side (i.e. commits 'o' and all the trees
necessary for them, but their blobs may be corrupt).

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH v2] Perform cheaper connectivity check when pack is used as medium
  2012-03-02  6:10     ` Junio C Hamano
@ 2012-03-02 14:05       ` Nguyen Thai Ngoc Duy
  2012-03-02 17:31         ` Junio C Hamano
  0 siblings, 1 reply; 23+ messages in thread
From: Nguyen Thai Ngoc Duy @ 2012-03-02 14:05 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: git

2012/3/2 Junio C Hamano <gitster@pobox.com>:
> Consider a case where you have this history
>
>
>    ---T       o---o---o
>
> where 'T' is the tip of your ref (everything reachable from it is known to
> be good), and 'o' are "isolated islands" commits that somehow exist in
> your repository but are not complete for whatever reason.
>
> The global history may look like this, where 'X' is the tip the other end
> advertised, and '.' are commits that are new to your repository.
>
>    ---T---.---o---o---o---.---X
>
> Upon seeing 'X' advertised and noticing 'T' is the current tip, you would
> ask for everything that is needed, i.e. "rev-list --objects T..X", to be
> sent to you.
>
> But the other end could send all the trees and commits for 'X' and '.' but
> nothing for 'o'.  You will end up with a corrupt history but the loosened
> "rev-list --objects T..X" you run after the object transfer will not catch
> it.  You need --verify-objects if you want to catch this mode of attack.

OK I think I get what you are trying to say. With "rev-list --objects
T..X" we could check commit connectivity from X to T fine. But some
trees in those 'o' commits are bad, but well-formed. As a result,
rev-list goes well and we blindly accept bad trees/blobs. By checking
for tree and blob integrity, we would catch these bad guys. Is that
correct?

The bad islands may be injected from somewhere and cannot be trusted
until they get connectivity with the main DAG.

> Admittedly, in order to mount such an attack successfully, the attacker
> needs to know what "isolated islands" you happen to have in the receiving
> repository, which makes it much harder than a simpler attack (e.g. sending
> only X but nothing else) that would have worked before we introduced the
> connectivity check after object transfer.
>
> But we need to realize that the reasoning expressed in your log message
> "we received new objects by pack, so everything must validate fine" is not
> valid, and we are loosening ourselves to new attacks when we evaluate this
> patch.  This is because the completion of the history may be using objects
> that did not come from the other side (i.e. commits 'o' and all the trees
> necessary for them, but their blobs may be corrupt).

Not everything is valid, then. Objects from new packs are, existing
ones may be guilty. If there is a way to mark new packs trusted, then
we only need to validate the other objects, which should be the
minority or even empty unless an attack is mounted.
-- 
Duy

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH v2] Perform cheaper connectivity check when pack is used as medium
  2012-03-02 14:05       ` Nguyen Thai Ngoc Duy
@ 2012-03-02 17:31         ` Junio C Hamano
  2012-03-03  3:05           ` Nguyen Thai Ngoc Duy
  0 siblings, 1 reply; 23+ messages in thread
From: Junio C Hamano @ 2012-03-02 17:31 UTC (permalink / raw)
  To: Nguyen Thai Ngoc Duy; +Cc: git

Nguyen Thai Ngoc Duy <pclouds@gmail.com> writes:

> OK I think I get what you are trying to say.
> ...

The attack can be even more simplified; the other side needs to know about
only one blob.

Suppose you have a corrupt blob B that is not referenced from anything in
your repository. "git fsck" will find the corruption of that single blob,
but that does not make your repository corrupt, as the corrupt object is
irrelevant to your history. The tip of your current healthy history is at
commit T.

Starting from that state, you fetch from the other side, that has commit X
at the tip. In this simplified scenario, X is a direct child of T.

You expect that the other side sends everything contained in X that you do
not have in T.  Now, the only difference X makes relative to T is that it
adds a new file whose contents is B at the toplevel of the tree.  And the
transfer gives you the commit object X itself, and its toplevel tree
object, but it omits the blob B by malice (or mistake).

Your "rev-list --object T..X" that is run after the transfer completes
will not notice that B is corrupt, because it only checks if it exists.

And now you corrupted your repository, by making B a part of the history
you (incorrectly) declare complete.

The whole point of the check after the transfer is to make sure that the
transfer will not make a repository that was healthy into a corrupt one,
so using --objects and not --verify-objects is a wrong "optimization" in
this case.

> Not everything is valid, then. Objects from new packs are, existing
> ones may be guilty. If there is a way to mark new packs trusted, then
> we only need to validate the other objects, which should be the
> minority or even empty unless an attack is mounted.

Yes, but how?  Running "fsck" on all of pre-existing objects every time
you fetch (or accept push) is not an answer.

If your fetch did not explode the incoming pack into pieces, a possibility
is to still use the --verify-object codepath, but pass the identity of the
pack (e.g. struct packed_git) down the callchain so that you can avoid
rehashing the objects that came from that single pack, but that would not
help the case where you ended up calling unpack-objects.

I also suspect that more than trivial amount of computation is needed to
determine if a given object exists only in a single pack, so the end
result may not be that much cheaper than the current --verify-object code.

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH v2] Perform cheaper connectivity check when pack is used as medium
  2012-03-02 17:31         ` Junio C Hamano
@ 2012-03-03  3:05           ` Nguyen Thai Ngoc Duy
  2012-03-03  6:59             ` Junio C Hamano
  0 siblings, 1 reply; 23+ messages in thread
From: Nguyen Thai Ngoc Duy @ 2012-03-03  3:05 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: git

On Sat, Mar 3, 2012 at 12:31 AM, Junio C Hamano <gitster@pobox.com> wrote:
>> Not everything is valid, then. Objects from new packs are, existing
>> ones may be guilty. If there is a way to mark new packs trusted, then
>> we only need to validate the other objects, which should be the
>> minority or even empty unless an attack is mounted.
>
> Yes, but how?  Running "fsck" on all of pre-existing objects every time
> you fetch (or accept push) is not an answer.
>
> If your fetch did not explode the incoming pack into pieces, a possibility
> is to still use the --verify-object codepath, but pass the identity of the
> pack (e.g. struct packed_git) down the callchain so that you can avoid
> rehashing the objects that came from that single pack, but that would not
> help the case where you ended up calling unpack-objects.

It won't help the unpack-objects case. But unpack-objects is only used
when the pack has less than a certain number of objects, doing heavy
check in that case should not take too long. Yes, I was thinking I
would pass pack identity down the verify-pack callchain for index-pack
case.

> I also suspect that more than trivial amount of computation is needed to
> determine if a given object exists only in a single pack, so the end
> result may not be that much cheaper than the current --verify-object code.

Objects can exist in multiple packs right now if they are base
objects. I'm not sure why you need to check for object existence in a
single pack. index-pack tries to make sure duplicate objects with the
same sha-1 must be identical, content-wise. Though it does not do that
hard enough (i.e. only check with one in-repo instance of the object,
instead of all of them).
-- 
Duy

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH v2] Perform cheaper connectivity check when pack is used as medium
  2012-03-03  3:05           ` Nguyen Thai Ngoc Duy
@ 2012-03-03  6:59             ` Junio C Hamano
  2012-03-03 10:09               ` Nguyen Thai Ngoc Duy
                                 ` (2 more replies)
  0 siblings, 3 replies; 23+ messages in thread
From: Junio C Hamano @ 2012-03-03  6:59 UTC (permalink / raw)
  To: Nguyen Thai Ngoc Duy; +Cc: git

Nguyen Thai Ngoc Duy <pclouds@gmail.com> writes:

> It won't help the unpack-objects case. But unpack-objects is only used
> when the pack has less than a certain number of objects, doing heavy
> check in that case should not take too long. Yes, I was thinking I
> would pass pack identity down the verify-pack callchain for index-pack
> case.

Yes, I think we are on the same track; see below.

>> I also suspect that more than trivial amount of computation is needed to
>> determine if a given object exists only in a single pack, so the end
>> result may not be that much cheaper than the current --verify-object code.
>
> Objects can exist in multiple packs right now if they are base
> objects. I'm not sure why you need to check for object existence in a
> single pack.

What I meant to say was not "it is in this pack and nowhere else", but
about a check like this:

        static void finish_object(struct object *obj, ...)
        {
                struct packed_git *fetched_pack = cb_data->fetched_pack;

                if (obj->type == OBJ_BLOB && !has_sha1_file(obj->sha1))
                        die("missing");
                if (!info->revs->verify_objects)
                        return;
		if (find_pack_entry_one(obj->sha1, fetched_pack))
                        return; /* we just fetched and ran index-pack on it */
                if (!obj->parsed && obj->type != OBJ_COMMIT)
                        parse_object(obj->sha1);                
        }

I think this is the kind of "passing identity down the callchain" both of
us have in mind.  I was trying to say that find_pack_entry() may not be
trivially cheap.  But probably I am being worried too much.

But now you brought it up, I think we may also need to worry about a
corrupt pre-existing loose blob object.  In general, we tend to always
favor reading objects from packs over loose objects, but I do not know
offhand what repacking would do when there are two places it can read the
same object from (it should be allowed to pick whichever is easier to
read).

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH v2] Perform cheaper connectivity check when pack is used as medium
  2012-03-03  6:59             ` Junio C Hamano
@ 2012-03-03 10:09               ` Nguyen Thai Ngoc Duy
  2012-03-06 11:20               ` Nguyen Thai Ngoc Duy
  2012-03-14 14:40               ` [PATCH] {fetch,receive}-pack: skip sha-1 integrity test on objects from new pack Nguyễn Thái Ngọc Duy
  2 siblings, 0 replies; 23+ messages in thread
From: Nguyen Thai Ngoc Duy @ 2012-03-03 10:09 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: git

On Sat, Mar 3, 2012 at 1:59 PM, Junio C Hamano <gitster@pobox.com> wrote:
>>> I also suspect that more than trivial amount of computation is needed to
>>> determine if a given object exists only in a single pack, so the end
>>> result may not be that much cheaper than the current --verify-object code.
>>
>> Objects can exist in multiple packs right now if they are base
>> objects. I'm not sure why you need to check for object existence in a
>> single pack.
>
> What I meant to say was not "it is in this pack and nowhere else", but
> about a check like this:
>
>        static void finish_object(struct object *obj, ...)
>        {
>                struct packed_git *fetched_pack = cb_data->fetched_pack;
>
>                if (obj->type == OBJ_BLOB && !has_sha1_file(obj->sha1))
>                        die("missing");
>                if (!info->revs->verify_objects)
>                        return;
>                if (find_pack_entry_one(obj->sha1, fetched_pack))
>                        return; /* we just fetched and ran index-pack on it */
>                if (!obj->parsed && obj->type != OBJ_COMMIT)
>                        parse_object(obj->sha1);
>        }
>
> I think this is the kind of "passing identity down the callchain" both of
> us have in mind.  I was trying to say that find_pack_entry() may not be
> trivially cheap.  But probably I am being worried too much.

This is after index-pack is run and .idx file created, I think
determining object's storage type should be relatively cheap compared
to rehashing. We'll know when I update the patch.

> But now you brought it up, I think we may also need to worry about a
> corrupt pre-existing loose blob object.  In general, we tend to always
> favor reading objects from packs over loose objects, but I do not know
> offhand what repacking would do when there are two places it can read the
> same object from (it should be allowed to pick whichever is easier to
> read).

.. which should be pack for pack-objects/repack because they can do a
straight copy from pack to pack. --no-reuse-objects delegates object
reading back to read_sha1_file(), and this one prefers packs too.
-- 
Duy

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH v2] Perform cheaper connectivity check when pack is used as medium
  2012-03-03  6:59             ` Junio C Hamano
  2012-03-03 10:09               ` Nguyen Thai Ngoc Duy
@ 2012-03-06 11:20               ` Nguyen Thai Ngoc Duy
  2012-03-06 15:56                 ` Junio C Hamano
  2012-03-14 14:40               ` [PATCH] {fetch,receive}-pack: skip sha-1 integrity test on objects from new pack Nguyễn Thái Ngọc Duy
  2 siblings, 1 reply; 23+ messages in thread
From: Nguyen Thai Ngoc Duy @ 2012-03-06 11:20 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: git

On Sat, Mar 3, 2012 at 1:59 PM, Junio C Hamano <gitster@pobox.com> wrote:
> But now you brought it up, I think we may also need to worry about a
> corrupt pre-existing loose blob object.  In general, we tend to always
> favor reading objects from packs over loose objects, but I do not know
> offhand what repacking would do when there are two places it can read the
> same object from (it should be allowed to pick whichever is easier to
> read).

Corrupt accidentally or on purpose? If some one can add a few corrupt
loose objects to your repository, (s)he can also put a pack with a
corrupt index (e.g. sha-1 does not point to the content that matches
that sha-1) there too. If that corrupt pack is chosen for reading at
"repack -ad" time, I think pack-objects would happily copy over
whatever the index points to. And the new index that pack-objects
generates (i.e. not through index-pack) may be also corrupt.
-- 
Duy

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH v2] Perform cheaper connectivity check when pack is used as medium
  2012-03-06 11:20               ` Nguyen Thai Ngoc Duy
@ 2012-03-06 15:56                 ` Junio C Hamano
  0 siblings, 0 replies; 23+ messages in thread
From: Junio C Hamano @ 2012-03-06 15:56 UTC (permalink / raw)
  To: Nguyen Thai Ngoc Duy; +Cc: git

Nguyen Thai Ngoc Duy <pclouds@gmail.com> writes:

> On Sat, Mar 3, 2012 at 1:59 PM, Junio C Hamano <gitster@pobox.com> wrote:
>> But now you brought it up, I think we may also need to worry about a
>> corrupt pre-existing loose blob object. In general, we tend to always
>> favor reading objects from packs over loose objects, but I do not know
>> offhand what repacking would do when there are two places it can read the
>> same object from (it should be allowed to pick whichever is easier to
>> read).
>
> Corrupt accidentally or on purpose?

Does not matter. The attack outlined does not require you to write a
corrupt one into victim's repository. You only need to _know_ one
that the victim happens to have.

^ permalink raw reply	[flat|nested] 23+ messages in thread

* [PATCH] {fetch,receive}-pack: skip sha-1 integrity test on objects from new pack
  2012-03-03  6:59             ` Junio C Hamano
  2012-03-03 10:09               ` Nguyen Thai Ngoc Duy
  2012-03-06 11:20               ` Nguyen Thai Ngoc Duy
@ 2012-03-14 14:40               ` Nguyễn Thái Ngọc Duy
  2012-03-14 19:05                 ` Junio C Hamano
  2012-03-15 21:09                 ` Junio C Hamano
  2 siblings, 2 replies; 23+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2012-03-14 14:40 UTC (permalink / raw)
  To: git; +Cc: Junio C Hamano, Nguyễn Thái Ngọc Duy

When we fetch or push, usually "git rev-list --verify-objects --not
--all --stdin" is used to make sure that all objects between existing
refs and new refs are good. This means no gaps in between, all objects
are well-formed, object content agrees with its sha-1 signature.

For the last one, --verify-objects calls check_sha1_signature() via
parse_object(). check_sha1_signature() is an expensive operation,
especially when new refs are far away from existing ones because all
objects in between are re-hashed. Because objects coming from the new
pack are already hashed by index-pack, we can trust their
integrity. The only objects left to check are existing ones in repo
but has no connection to any current refs.

Pass the new pack id down to--verify-objects and skip
check_sha1_signature() on objects from that pack.

As an (extreme) example, a repository is created with only one commit:
e83c516 (Initial revision of "git", the information manager from hell
- 2005-04-07). The rest of git.git is fetched on top. Without the
patch:

$ time git fetch file:///home/pclouds/w/git/.git
remote: Counting objects: 125656, done.
remote: Compressing objects: 100% (33280/33280), done.
remote: Total 125656 (delta 92585), reused 123464 (delta 90682)
Receiving objects: 100% (125656/125656), 34.60 MiB | 8.47 MiB/s, done.
Resolving deltas: 100% (92585/92585), done.
From file:///home/pclouds/t/test/
 * branch            HEAD       -> FETCH_HEAD

real    1m30.437s
user    1m31.338s
sys     0m1.687s

With the patch:

$ time git fetch file:///home/pclouds/w/git/.git
remote: Counting objects: 125656, done.
remote: Compressing objects: 100% (33280/33280), done.
remote: Total 125656 (delta 92585), reused 123464 (delta 90682)
Receiving objects: 100% (125656/125656), 34.60 MiB | 7.86 MiB/s, done.
Resolving deltas: 100% (92585/92585), done.
From file:///home/pclouds/t/test/
 * branch            HEAD       -> FETCH_HEAD

real    0m52.182s
user    0m53.151s
sys     0m1.465s

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 Another try. Exclude all objects reachable from refs and the new pack
 from --verify-objects tests.
 
 I keep CHECK_CONNECT_QUIET change because it seems a good change
 anyway. transport.[ch] changes are dropped because we can detect pack
 case based on pack_lockfile.

 builtin/fetch.c        |   12 +++++++-----
 builtin/receive-pack.c |    7 +++----
 builtin/rev-list.c     |   20 ++++++++++++++++++--
 connected.c            |   31 +++++++++++++++++++++++--------
 connected.h            |    5 ++++-
 revision.c             |    5 +++++
 revision.h             |    3 +++
 7 files changed, 63 insertions(+), 20 deletions(-)

diff --git a/builtin/fetch.c b/builtin/fetch.c
index 65f5f9b..cc84d04 100644
--- a/builtin/fetch.c
+++ b/builtin/fetch.c
@@ -368,7 +368,7 @@ static int iterate_ref_map(void *cb_data, unsigned char sha1[20])
 }
 
 static int store_updated_refs(const char *raw_url, const char *remote_name,
-		struct ref *ref_map)
+			      struct ref *ref_map, const char *pack_lockfile)
 {
 	FILE *fp;
 	struct commit *commit;
@@ -389,7 +389,7 @@ static int store_updated_refs(const char *raw_url, const char *remote_name,
 		url = xstrdup("foreign");
 
 	rm = ref_map;
-	if (check_everything_connected(iterate_ref_map, 0, &rm)) {
+	if (check_everything_connected(iterate_ref_map, 0, pack_lockfile, &rm)) {
 		rc = error(_("%s did not send all necessary objects\n"), url);
 		goto abort;
 	}
@@ -516,7 +516,8 @@ static int quickfetch(struct ref *ref_map)
 	 */
 	if (depth)
 		return -1;
-	return check_everything_connected(iterate_ref_map, 1, &rm);
+	return check_everything_connected(iterate_ref_map,
+					  CHECK_CONNECT_QUIET, NULL, &rm);
 }
 
 static int fetch_refs(struct transport *transport, struct ref *ref_map)
@@ -526,8 +527,9 @@ static int fetch_refs(struct transport *transport, struct ref *ref_map)
 		ret = transport_fetch_refs(transport, ref_map);
 	if (!ret)
 		ret |= store_updated_refs(transport->url,
-				transport->remote->name,
-				ref_map);
+					  transport->remote->name,
+					  ref_map,
+					  transport->pack_lockfile);
 	transport_unlock_pack(transport);
 	return ret;
 }
diff --git a/builtin/receive-pack.c b/builtin/receive-pack.c
index 0afb8b2..01d37f6 100644
--- a/builtin/receive-pack.c
+++ b/builtin/receive-pack.c
@@ -40,6 +40,7 @@ static int auto_gc = 1;
 static const char *head_name;
 static void *head_name_to_free;
 static int sent_capabilities;
+static const char *pack_lockfile;
 
 static enum deny_action parse_deny_action(const char *var, const char *value)
 {
@@ -669,7 +670,7 @@ static void set_connectivity_errors(struct command *commands)
 	for (cmd = commands; cmd; cmd = cmd->next) {
 		struct command *singleton = cmd;
 		if (!check_everything_connected(command_singleton_iterator,
-						0, &singleton))
+						0, pack_lockfile, &singleton))
 			continue;
 		cmd->error_string = "missing necessary objects";
 	}
@@ -705,7 +706,7 @@ static void execute_commands(struct command *commands, const char *unpacker_erro
 
 	cmd = commands;
 	if (check_everything_connected(iterate_receive_command_list,
-				       0, &cmd))
+				       0, pack_lockfile, &cmd))
 		set_connectivity_errors(commands);
 
 	if (run_receive_hook(commands, pre_receive_hook, 0)) {
@@ -797,8 +798,6 @@ static const char *parse_pack_header(struct pack_header *hdr)
 	}
 }
 
-static const char *pack_lockfile;
-
 static const char *unpack(void)
 {
 	struct pack_header hdr;
diff --git a/builtin/rev-list.c b/builtin/rev-list.c
index 4c4d404..21d714b 100644
--- a/builtin/rev-list.c
+++ b/builtin/rev-list.c
@@ -180,8 +180,24 @@ static void finish_object(struct object *obj,
 	struct rev_list_info *info = cb_data;
 	if (obj->type == OBJ_BLOB && !has_sha1_file(obj->sha1))
 		die("missing blob object '%s'", sha1_to_hex(obj->sha1));
-	if (info->revs->verify_objects && !obj->parsed && obj->type != OBJ_COMMIT)
-		parse_object(obj->sha1);
+	if (info->revs->verify_objects &&
+	    !obj->parsed && obj->type != OBJ_COMMIT) {
+		const char *safe_pack = info->revs->safe_pack;
+		struct object_info oi;
+		int safe = 0;
+		memset(&oi, 0, sizeof(oi));
+		if (*safe_pack &&
+		    sha1_object_info_extended(obj->sha1, &oi) >= 0 &&
+		    oi.whence == OI_PACKED) {
+			const char *pack = oi.u.packed.pack->pack_name;
+			int len = strlen(pack);
+			assert(strncmp(pack + len - 51, "/pack-", 6) == 0);
+			assert(strcmp(pack + len - 5, ".pack") == 0);
+			safe = !memcmp(safe_pack, pack + len - 45, 40);
+		}
+		if (!safe)
+			parse_object(obj->sha1);
+	}
 }
 
 static void show_object(struct object *obj,
diff --git a/connected.c b/connected.c
index d762423..af81049 100644
--- a/connected.c
+++ b/connected.c
@@ -14,28 +14,43 @@
  *
  * Returns 0 if everything is connected, non-zero otherwise.
  */
-int check_everything_connected(sha1_iterate_fn fn, int quiet, void *cb_data)
+int check_everything_connected(sha1_iterate_fn fn, unsigned int flags,
+			       const char *pack_lockfile, void *cb_data)
 {
 	struct child_process rev_list;
-	const char *argv[] = {"rev-list", "--verify-objects",
-			      "--stdin", "--not", "--all", NULL, NULL};
+	const char *argv[] = {"rev-list", "--verify-objects", "--stdin",
+			      "--not", "--all", NULL, NULL, NULL, NULL };
 	char commit[41];
 	unsigned char sha1[20];
-	int err = 0;
+	int err = 0, ac = 5;
+	struct strbuf packfile = STRBUF_INIT;
 
 	if (fn(cb_data, sha1))
 		return err;
 
-	if (quiet)
-		argv[5] = "--quiet";
+	if (flags & CHECK_CONNECT_QUIET)
+		argv[ac++] = "--quiet";
+	if (pack_lockfile) {
+		strbuf_addstr(&packfile, pack_lockfile);
+		/* xxx/pack-%40s.keep */
+		assert(strcmp(packfile.buf + packfile.len - 5, ".keep") == 0);
+		assert(strncmp(packfile.buf + packfile.len - 51, "/pack-", 6) == 0);
+		strbuf_setlen(&packfile, packfile.len - 5);
+		strbuf_remove(&packfile, 0, packfile.len - 40);
+		argv[ac++] = "--safe-pack";
+		argv[ac++] = packfile.buf;
+	}
+	assert(ac < ARRAY_SIZE(argv) && argv[ac] == NULL);
 
 	memset(&rev_list, 0, sizeof(rev_list));
 	rev_list.argv = argv;
 	rev_list.git_cmd = 1;
 	rev_list.in = -1;
 	rev_list.no_stdout = 1;
-	rev_list.no_stderr = quiet;
-	if (start_command(&rev_list))
+	rev_list.no_stderr = flags & CHECK_CONNECT_QUIET;
+	err = start_command(&rev_list);
+	strbuf_release(&packfile);
+	if (err)
 		return error(_("Could not run 'git rev-list'"));
 
 	sigchain_push(SIGPIPE, SIG_IGN);
diff --git a/connected.h b/connected.h
index 7e4585a..ed0b559 100644
--- a/connected.h
+++ b/connected.h
@@ -1,6 +1,8 @@
 #ifndef CONNECTED_H
 #define CONNECTED_H
 
+#define CHECK_CONNECT_QUIET  1
+
 /*
  * Take callback data, and return next object name in the buffer.
  * When called after returning the name for the last object, return -1
@@ -15,6 +17,7 @@ typedef int (*sha1_iterate_fn)(void *, unsigned char [20]);
  *
  * Return 0 if Ok, non zero otherwise (i.e. some missing objects)
  */
-extern int check_everything_connected(sha1_iterate_fn, int quiet, void *cb_data);
+extern int check_everything_connected(sha1_iterate_fn, unsigned int flags,
+				      const char *pack_lockfile, void *cb_data);
 
 #endif /* CONNECTED_H */
diff --git a/revision.c b/revision.c
index 819ff01..1c2d017 100644
--- a/revision.c
+++ b/revision.c
@@ -1451,6 +1451,11 @@ static int handle_revision_opt(struct rev_info *revs, int argc, const char **arg
 		revs->tree_objects = 1;
 		revs->blob_objects = 1;
 		revs->verify_objects = 1;
+	} else if (!strcmp(arg, "--safe-pack")) {
+		if (strlen(argv[1]) != 40)
+			die("--safe-pack requires an SHA-1 as pack id, not %s", argv[1]);
+		strcpy(revs->safe_pack, argv[1]);
+		return 2;
 	} else if (!strcmp(arg, "--unpacked")) {
 		revs->unpacked = 1;
 	} else if (!prefixcmp(arg, "--unpacked=")) {
diff --git a/revision.h b/revision.h
index b8e9223..2790910 100644
--- a/revision.h
+++ b/revision.h
@@ -168,6 +168,9 @@ struct rev_info {
 	int count_left;
 	int count_right;
 	int count_same;
+
+	/* --verify-objects */
+	char safe_pack[41];
 };
 
 #define REV_TREE_SAME		0
-- 
1.7.8.36.g69ee2

^ permalink raw reply related	[flat|nested] 23+ messages in thread

* Re: [PATCH] {fetch,receive}-pack: skip sha-1 integrity test on objects from new pack
  2012-03-14 14:40               ` [PATCH] {fetch,receive}-pack: skip sha-1 integrity test on objects from new pack Nguyễn Thái Ngọc Duy
@ 2012-03-14 19:05                 ` Junio C Hamano
  2012-03-15 21:09                 ` Junio C Hamano
  1 sibling, 0 replies; 23+ messages in thread
From: Junio C Hamano @ 2012-03-14 19:05 UTC (permalink / raw)
  To: Nguyễn Thái Ngọc Duy; +Cc: git

Nguyễn Thái Ngọc Duy <pclouds@gmail.com> writes:

>  Another try. Exclude all objects reachable from refs and the new pack
>  from --verify-objects tests.

Thanks.  Looks much nicer in general.

>  I keep CHECK_CONNECT_QUIET change because it seems a good change
>  anyway.

It appears that you lost the "one int for a single boolean to flag"
conversion that was the point of CHECK_CONNECT_QUIET symbol.

> diff --git a/builtin/rev-list.c b/builtin/rev-list.c
> index 4c4d404..21d714b 100644
> --- a/builtin/rev-list.c
> +++ b/builtin/rev-list.c
> @@ -180,8 +180,24 @@ static void finish_object(struct object *obj,
>  	struct rev_list_info *info = cb_data;
>  	if (obj->type == OBJ_BLOB && !has_sha1_file(obj->sha1))
>  		die("missing blob object '%s'", sha1_to_hex(obj->sha1));
> -	if (info->revs->verify_objects && !obj->parsed && obj->type != OBJ_COMMIT)
> -		parse_object(obj->sha1);
> +	if (info->revs->verify_objects &&
> +	    !obj->parsed && obj->type != OBJ_COMMIT) {
> +		const char *safe_pack = info->revs->safe_pack;
> +		struct object_info oi;
> +		int safe = 0;
> +		memset(&oi, 0, sizeof(oi));
> +		if (*safe_pack &&
> +		    sha1_object_info_extended(obj->sha1, &oi) >= 0 &&
> +		    oi.whence == OI_PACKED) {
> +			const char *pack = oi.u.packed.pack->pack_name;
> +			int len = strlen(pack);
> +			assert(strncmp(pack + len - 51, "/pack-", 6) == 0);
> +			assert(strcmp(pack + len - 5, ".pack") == 0);
> +			safe = !memcmp(safe_pack, pack + len - 45, 40);
> +		}
> +		if (!safe)
> +			parse_object(obj->sha1);
> +	}
>  }

This looks unnecessarily complex, and I think the complexity comes only
because the new info->revs->safe_pack is a hextual packfile ID.  Wouldn't
it make more sense to store a pointer to "struct packed_git" there, so
that oi.u.packed.pack can be compared with it?

The caller may need a new API in sha1_file.c to let it find a packed git
with a given packfile ID.

> diff --git a/connected.c b/connected.c
> index d762423..af81049 100644
> --- a/connected.c
> +++ b/connected.c
> @@ -14,28 +14,43 @@
>   *
>   * Returns 0 if everything is connected, non-zero otherwise.
>   */
> -int check_everything_connected(sha1_iterate_fn fn, int quiet, void *cb_data)
> +int check_everything_connected(sha1_iterate_fn fn, unsigned int flags,
> +			       const char *pack_lockfile, void *cb_data)
>  {
> ...
> +	if (pack_lockfile) {
> +		strbuf_addstr(&packfile, pack_lockfile);
> +		/* xxx/pack-%40s.keep */
> +		assert(strcmp(packfile.buf + packfile.len - 5, ".keep") == 0);
> +		assert(strncmp(packfile.buf + packfile.len - 51, "/pack-", 6) == 0);
> +		strbuf_setlen(&packfile, packfile.len - 5);
> +		strbuf_remove(&packfile, 0, packfile.len - 40);
> +		argv[ac++] = "--safe-pack";
> +		argv[ac++] = packfile.buf;
> +	}

That looks like a very round-about way.  Why not check the ".keep" and
"/pack-" substrings in pack_lockfile (as a side effect of doing so, you
will learn the offset of the 40-byte substring you want to put in the
packfile strbuf) and strbuf_add(&packfile, pack_lockfile + offset, 40)?

> +	assert(ac < ARRAY_SIZE(argv) && argv[ac] == NULL);

Perhaps you want to use argv_list API instead of adding an assert() here.

> diff --git a/revision.c b/revision.c
> index 819ff01..1c2d017 100644
> --- a/revision.c
> +++ b/revision.c
> @@ -1451,6 +1451,11 @@ static int handle_revision_opt(struct rev_info *revs, int argc, const char **arg
>  		revs->tree_objects = 1;
>  		revs->blob_objects = 1;
>  		revs->verify_objects = 1;
> +	} else if (!strcmp(arg, "--safe-pack")) {
> +		if (strlen(argv[1]) != 40)
> +			die("--safe-pack requires an SHA-1 as pack id, not %s", argv[1]);
> +		strcpy(revs->safe_pack, argv[1]);
> +		return 2;

The condition for the above "die()" does not check if that random 40-byte
string is an SHA-1, let alone it names an existing packfile.  It would be
solved automatically if you change the type of revs.safe_pack to a pointer
to "struct packed_git", though.

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH] {fetch,receive}-pack: skip sha-1 integrity test on objects from new pack
  2012-03-14 14:40               ` [PATCH] {fetch,receive}-pack: skip sha-1 integrity test on objects from new pack Nguyễn Thái Ngọc Duy
  2012-03-14 19:05                 ` Junio C Hamano
@ 2012-03-15 21:09                 ` Junio C Hamano
  2012-03-15 21:57                   ` Junio C Hamano
  2012-03-16  2:24                   ` Nguyen Thai Ngoc Duy
  1 sibling, 2 replies; 23+ messages in thread
From: Junio C Hamano @ 2012-03-15 21:09 UTC (permalink / raw)
  To: Nguyễn Thái Ngọc Duy; +Cc: git, Shawn O. Pearce

Nguyễn Thái Ngọc Duy  <pclouds@gmail.com> writes:

> When we fetch or push, usually "git rev-list --verify-objects --not
> --all --stdin" is used to make sure that all objects between existing
> refs and new refs are good. This means no gaps in between, all objects
> are well-formed, object content agrees with its sha-1 signature.
>
> For the last one, --verify-objects calls check_sha1_signature() via
> parse_object(). check_sha1_signature() is an expensive operation,

After thinking about this a bit more, I am beginning to think that the
validation of object contents is unnecessary in _all_ cases that involve
"git fetch".  Unpack-objects and index-pack already validate individual
objects, and the only thing we would want to catch are objects that we
already happened to have had in our repository but were unreferenced from
our refs.  But the codepaths that write out loose objects or packfiles
that must have left these objects during the earlier run in our repository
should already have done the validation.

So the only thing that we would be catching with this extra check is a
loose object or a packfile that was deposited outside the control of git
in your repository.

But that is not something we should be trying to catch every time we run
fetch---if your repository is open to be written by a random person from
sideways bypassing git, you have a much bigger problem.  And we have the
tool to catch that kind of breakage: "git fsck".

So the right solution is probably use --objects for connectivity checks as
before; we could add a fetch.paranoid configuration to allow people to
still use it (with this patch to remove the over-pessimism from the code)
but only if they want to.

Cc'ing Shawn, as I took inspiration from him in the first place for the
update to --verify-objects (no, the overly-pessimistic implementation and
excess overhead you solved partially with this patch is not his fault but
is mine).

I said "partially" above in the parentheses because we would still end up
revalidating all the objects in quickfetch() check when you are fetching
from the repository that is your alternate, even with this patch.

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH] {fetch,receive}-pack: skip sha-1 integrity test on objects from new pack
  2012-03-15 21:09                 ` Junio C Hamano
@ 2012-03-15 21:57                   ` Junio C Hamano
  2012-03-16  2:28                     ` Nguyen Thai Ngoc Duy
  2012-03-16  2:24                   ` Nguyen Thai Ngoc Duy
  1 sibling, 1 reply; 23+ messages in thread
From: Junio C Hamano @ 2012-03-15 21:57 UTC (permalink / raw)
  To: Nguyễn Thái Ngọc Duy; +Cc: git, Shawn O. Pearce

Junio C Hamano <gitster@pobox.com> writes:

> After thinking about this a bit more, I am beginning to think that the
> validation of object contents is unnecessary in _all_ cases that involve
> "git fetch". ...
> So the right solution is probably use --objects for connectivity checks as
> before; we could add a fetch.paranoid configuration to allow people to
> still use it (with this patch to remove the over-pessimism from the code)
> but only if they want to.

Following the above train of thought, the patch to revert what 6d4bb38
(fetch: verify we have everything we need before updating our ref,
2011-09-01) did which caused this performance regression should look like
this.  "tag --contains 6d4bb38" tells me that this affects everything
since v1.7.8-rc0 so we may want to issue maintenance updates once this
proves good in 1.7.10.

Nguyễn, sorry for being dense.  I think this ended up being very close to
your initial patch.

-- >8 --
Subject: fetch/receive: remove over-pessimistic connectivity check

Git 1.7.8 introduced an object and history re-validation step after
"fetch" or "push" causes new history to be added to a receiving
repository, in order to protect a malicious server or pushing client to
corrupt it by taking advantage of an existing corrupt object that is
unconnected to existing history of the receiving repository.

But this is way over-pessimistic, and often adds 5x to 8x runtime overhead
for a little gain.  During "fetch" or "receive-pack" (the server side of
"push"), unpack-objects and index-pack already validate individual objects
that are received, and the only thing we would want to catch are objects
that already happened to exist in our repository but were not referenced
from our refs.  But such objects must have been written by an earlier run
of our codepaths that write out loose objects or packfiles, and they must
have done the validation of individual objects when they did so.  The only
thing left to worry about is the connectivity integrity, which can be done
with much cheaper "rev-list --objects".

Remove this over-pessimistic check, while leaving the door open for later
changes, hinting the possibility in an in-code comment.

Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
 builtin/fetch.c |    2 +-
 connected.c     |   14 +++++++++-----
 connected.h     |    3 ++-
 3 files changed, 12 insertions(+), 7 deletions(-)

diff --git a/builtin/fetch.c b/builtin/fetch.c
index 8761a33..b2f7d5b 100644
--- a/builtin/fetch.c
+++ b/builtin/fetch.c
@@ -505,7 +505,7 @@ static int quickfetch(struct ref *ref_map)
 	 */
 	if (depth)
 		return -1;
-	return check_everything_connected(iterate_ref_map, 1, &rm);
+	return check_everything_connected(iterate_ref_map, CONN_CHECK_QUIET, &rm);
 }
 
 static int fetch_refs(struct transport *transport, struct ref *ref_map)
diff --git a/connected.c b/connected.c
index d762423..3564a9f 100644
--- a/connected.c
+++ b/connected.c
@@ -6,22 +6,26 @@
 /*
  * If we feed all the commits we want to verify to this command
  *
- *  $ git rev-list --verify-objects --stdin --not --all
+ *  $ git rev-list --objects --stdin --not --all
  *
  * and if it does not error out, that means everything reachable from
- * these commits locally exists and is connected to some of our
- * existing refs.
+ * these commits locally exists and is connected to our existing refs.
  *
  * Returns 0 if everything is connected, non-zero otherwise.
+ *
+ * We may want to introduce CONN_CHECK_PARANOIA option the caller can
+ * pass to use --verify-objects instead of --objects to optionally have
+ * individual objects re-validated.
  */
-int check_everything_connected(sha1_iterate_fn fn, int quiet, void *cb_data)
+int check_everything_connected(sha1_iterate_fn fn, unsigned flags, void *cb_data)
 {
 	struct child_process rev_list;
-	const char *argv[] = {"rev-list", "--verify-objects",
+	const char *argv[] = {"rev-list", "--objects",
 			      "--stdin", "--not", "--all", NULL, NULL};
 	char commit[41];
 	unsigned char sha1[20];
 	int err = 0;
+	int quiet = !!(flags & CONN_CHECK_QUIET);
 
 	if (fn(cb_data, sha1))
 		return err;
diff --git a/connected.h b/connected.h
index 7e4585a..6f17e07 100644
--- a/connected.h
+++ b/connected.h
@@ -15,6 +15,7 @@ typedef int (*sha1_iterate_fn)(void *, unsigned char [20]);
  *
  * Return 0 if Ok, non zero otherwise (i.e. some missing objects)
  */
-extern int check_everything_connected(sha1_iterate_fn, int quiet, void *cb_data);
+extern int check_everything_connected(sha1_iterate_fn, unsigned flags, void *cb_data);
+#define CONN_CHECK_QUIET 01
 
 #endif /* CONNECTED_H */

^ permalink raw reply related	[flat|nested] 23+ messages in thread

* Re: [PATCH] {fetch,receive}-pack: skip sha-1 integrity test on objects from new pack
  2012-03-15 21:09                 ` Junio C Hamano
  2012-03-15 21:57                   ` Junio C Hamano
@ 2012-03-16  2:24                   ` Nguyen Thai Ngoc Duy
  2012-03-16  3:46                     ` Junio C Hamano
  1 sibling, 1 reply; 23+ messages in thread
From: Nguyen Thai Ngoc Duy @ 2012-03-16  2:24 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: git, Shawn O. Pearce

2012/3/16 Junio C Hamano <gitster@pobox.com>:
> Nguyễn Thái Ngọc Duy  <pclouds@gmail.com> writes:
>
>> When we fetch or push, usually "git rev-list --verify-objects --not
>> --all --stdin" is used to make sure that all objects between existing
>> refs and new refs are good. This means no gaps in between, all objects
>> are well-formed, object content agrees with its sha-1 signature.
>>
>> For the last one, --verify-objects calls check_sha1_signature() via
>> parse_object(). check_sha1_signature() is an expensive operation,
>
> After thinking about this a bit more, I am beginning to think that the
> validation of object contents is unnecessary in _all_ cases that involve
> "git fetch".  Unpack-objects and index-pack already validate individual
> objects, and the only thing we would want to catch are objects that we
> already happened to have had in our repository but were unreferenced from
> our refs.

What about remote helpers? Should we declare it's remote helper
responsibility to validate all incoming objects?
-- 
Duy

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH] {fetch,receive}-pack: skip sha-1 integrity test on objects from new pack
  2012-03-15 21:57                   ` Junio C Hamano
@ 2012-03-16  2:28                     ` Nguyen Thai Ngoc Duy
  2012-03-16  3:42                       ` Junio C Hamano
  0 siblings, 1 reply; 23+ messages in thread
From: Nguyen Thai Ngoc Duy @ 2012-03-16  2:28 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: git, Shawn O. Pearce

2012/3/16 Junio C Hamano <gitster@pobox.com>:
> Following the above train of thought, the patch to revert what 6d4bb38
> (fetch: verify we have everything we need before updating our ref,
> 2011-09-01) did which caused this performance regression should look like
> this.  "tag --contains 6d4bb38" tells me that this affects everything
> since v1.7.8-rc0 so we may want to issue maintenance updates once this
> proves good in 1.7.10.
>
> Nguyễn, sorry for being dense.  I think this ended up being very close to
> your initial patch.

I have cheaper "git fetch" any way. That's all that counts from my
user point of view :)

What about push/receive-pack? I think the same thing happens there.
-- 
Duy

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH] {fetch,receive}-pack: skip sha-1 integrity test on objects from new pack
  2012-03-16  2:28                     ` Nguyen Thai Ngoc Duy
@ 2012-03-16  3:42                       ` Junio C Hamano
  2012-03-16  3:42                         ` Nguyen Thai Ngoc Duy
  0 siblings, 1 reply; 23+ messages in thread
From: Junio C Hamano @ 2012-03-16  3:42 UTC (permalink / raw)
  To: Nguyen Thai Ngoc Duy; +Cc: git, Shawn O. Pearce

Nguyen Thai Ngoc Duy <pclouds@gmail.com> writes:

> What about push/receive-pack? I think the same thing happens there.

As the patch goes to check_everything_connected(), it is automatically
covered, no?

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH] {fetch,receive}-pack: skip sha-1 integrity test on objects from new pack
  2012-03-16  3:42                       ` Junio C Hamano
@ 2012-03-16  3:42                         ` Nguyen Thai Ngoc Duy
  0 siblings, 0 replies; 23+ messages in thread
From: Nguyen Thai Ngoc Duy @ 2012-03-16  3:42 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: git, Shawn O. Pearce

On Fri, Mar 16, 2012 at 10:42 AM, Junio C Hamano <gitster@pobox.com> wrote:
> Nguyen Thai Ngoc Duy <pclouds@gmail.com> writes:
>
>> What about push/receive-pack? I think the same thing happens there.
>
> As the patch goes to check_everything_connected(), it is automatically
> covered, no?

Yeah right, sorry stupid question.
-- 
Duy

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH] {fetch,receive}-pack: skip sha-1 integrity test on objects from new pack
  2012-03-16  2:24                   ` Nguyen Thai Ngoc Duy
@ 2012-03-16  3:46                     ` Junio C Hamano
  0 siblings, 0 replies; 23+ messages in thread
From: Junio C Hamano @ 2012-03-16  3:46 UTC (permalink / raw)
  To: Nguyen Thai Ngoc Duy; +Cc: git, Shawn O. Pearce

Nguyen Thai Ngoc Duy <pclouds@gmail.com> writes:

> What about remote helpers?

What about them?  Don't these foreign-vcs helpers generally create git
objects on our side based on whatever data that comes from the remote
foreign vcs anyway?

> Should we declare it's remote helper
> responsibility to validate all incoming objects?

It is nothing new that they must not deposit broken objects in the
repository, so I do not think there is nothing that needs to be
specifically pointed out.

^ permalink raw reply	[flat|nested] 23+ messages in thread

end of thread, other threads:[~2012-03-16  3:47 UTC | newest]

Thread overview: 23+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2012-02-27 15:39 [PATCH] Perform cheaper connectivity check when pack is used as medium Nguyễn Thái Ngọc Duy
2012-02-27 18:14 ` Junio C Hamano
2012-02-28 13:18   ` [PATCH v2] " Nguyễn Thái Ngọc Duy
2012-02-28 15:40     ` Johannes Sixt
2012-02-28 15:47       ` Andreas Ericsson
2012-02-29  1:41       ` Nguyen Thai Ngoc Duy
2012-03-02  6:10     ` Junio C Hamano
2012-03-02 14:05       ` Nguyen Thai Ngoc Duy
2012-03-02 17:31         ` Junio C Hamano
2012-03-03  3:05           ` Nguyen Thai Ngoc Duy
2012-03-03  6:59             ` Junio C Hamano
2012-03-03 10:09               ` Nguyen Thai Ngoc Duy
2012-03-06 11:20               ` Nguyen Thai Ngoc Duy
2012-03-06 15:56                 ` Junio C Hamano
2012-03-14 14:40               ` [PATCH] {fetch,receive}-pack: skip sha-1 integrity test on objects from new pack Nguyễn Thái Ngọc Duy
2012-03-14 19:05                 ` Junio C Hamano
2012-03-15 21:09                 ` Junio C Hamano
2012-03-15 21:57                   ` Junio C Hamano
2012-03-16  2:28                     ` Nguyen Thai Ngoc Duy
2012-03-16  3:42                       ` Junio C Hamano
2012-03-16  3:42                         ` Nguyen Thai Ngoc Duy
2012-03-16  2:24                   ` Nguyen Thai Ngoc Duy
2012-03-16  3:46                     ` Junio C Hamano

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.