All of lore.kernel.org
 help / color / mirror / Atom feed
* [WIP/RFH/RFD/PATCH] grep: allow to use textconv filters
@ 2013-02-04 15:27 Michael J Gruber
  2013-02-04 17:12 ` Junio C Hamano
  2013-02-05 11:13 ` Jeff King
  0 siblings, 2 replies; 37+ messages in thread
From: Michael J Gruber @ 2013-02-04 15:27 UTC (permalink / raw)
  To: git; +Cc: Jeff King

Recently and not so recently, we made sure that log/grep type operations
use textconv filters when a userfacing diff would do the same:

ef90ab6 (pickaxe: use textconv for -S counting, 2012-10-28)
b1c2f57 (diff_grep: use textconv buffers for add/deleted files, 2012-10-28)
0508fe5 (combine-diff: respect textconv attributes, 2011-05-23)

"git grep" currently does not use textconv filters at all, that is
neither for displaying the match and context nor for the actual grepping.

Introduce a binary mode "--textconv" (in addition to "--text" and "-I")
which makes git grep use any configured textconv filters for grepping
and output purposes.

Signed-off-by: Michael J Gruber <git@drmicha.warpmail.net>
---

Notes:
    I'm somehow stuck in textconv/filespec/... hell, so I'm sending this out
    in request for help. I'm sure there are people for whom it's a breeze to
    get this right.
    
    The difficulty is in getting the different cases (blob/sha1 vs.
    worktree) right, and in making the changes minimally invasive. It seems
    that some more refactoring could help: "git show --textconv" does not
    use textconv filters when used on blobs either. (It does for diffs, of
    course.) Most existing helper functions are tailored for diffs.

    Nota bene: --textconv does not affect "diff --stat" either...

 builtin/grep.c |  5 ++++-
 grep.c         | 47 +++++++++++++++++++++++++++++------------------
 grep.h         |  3 ++-
 3 files changed, 35 insertions(+), 20 deletions(-)

diff --git a/builtin/grep.c b/builtin/grep.c
index 8025964..2181c22 100644
--- a/builtin/grep.c
+++ b/builtin/grep.c
@@ -96,7 +96,7 @@ static void add_work(struct grep_opt *opt, enum grep_source_type type,
 
 	grep_source_init(&todo[todo_end].source, type, name, path, id);
 	if (opt->binary != GREP_BINARY_TEXT)
-		grep_source_load_driver(&todo[todo_end].source);
+		grep_source_load_driver(&todo[todo_end].source, opt);
 	todo[todo_end].done = 0;
 	strbuf_reset(&todo[todo_end].out);
 	todo_end = (todo_end + 1) % ARRAY_SIZE(todo);
@@ -659,6 +659,9 @@ int cmd_grep(int argc, const char **argv, const char *prefix)
 		OPT_SET_INT('I', NULL, &opt.binary,
 			N_("don't match patterns in binary files"),
 			GREP_BINARY_NOMATCH),
+		OPT_SET_INT(0, "textconv", &opt.binary,
+			N_("process binary files with textconv filters"),
+			GREP_BINARY_TEXTCONV),
 		{ OPTION_INTEGER, 0, "max-depth", &opt.max_depth, N_("depth"),
 			N_("descend at most <depth> levels"), PARSE_OPT_NONEG,
 			NULL, 1 },
diff --git a/grep.c b/grep.c
index 4bd1b8b..410b7b8 100644
--- a/grep.c
+++ b/grep.c
@@ -1,10 +1,12 @@
 #include "cache.h"
+#include "diff.h"
+#include "diffcore.h"
 #include "grep.h"
 #include "userdiff.h"
 #include "xdiff-interface.h"
 
-static int grep_source_load(struct grep_source *gs);
-static int grep_source_is_binary(struct grep_source *gs);
+static int grep_source_load(struct grep_source *gs, struct grep_opt *opt);
+static int grep_source_is_binary(struct grep_source *gs, struct grep_opt *opt);
 
 static struct grep_opt grep_defaults;
 
@@ -1174,7 +1176,7 @@ static int match_funcname(struct grep_opt *opt, struct grep_source *gs, char *bo
 {
 	xdemitconf_t *xecfg = opt->priv;
 	if (xecfg && !xecfg->find_func) {
-		grep_source_load_driver(gs);
+		grep_source_load_driver(gs, opt);
 		if (gs->driver->funcname.pattern) {
 			const struct userdiff_funcname *pe = &gs->driver->funcname;
 			xdiff_set_find_func(xecfg, pe->pattern, pe->cflags);
@@ -1354,14 +1356,15 @@ static int grep_source_1(struct grep_opt *opt, struct grep_source *gs, int colle
 
 	switch (opt->binary) {
 	case GREP_BINARY_DEFAULT:
-		if (grep_source_is_binary(gs))
+		if (grep_source_is_binary(gs, opt))
 			binary_match_only = 1;
 		break;
 	case GREP_BINARY_NOMATCH:
-		if (grep_source_is_binary(gs))
+		if (grep_source_is_binary(gs, opt))
 			return 0; /* Assume unmatch */
 		break;
 	case GREP_BINARY_TEXT:
+	case GREP_BINARY_TEXTCONV:
 		break;
 	default:
 		die("bug: unknown binary handling mode");
@@ -1372,7 +1375,7 @@ static int grep_source_1(struct grep_opt *opt, struct grep_source *gs, int colle
 
 	try_lookahead = should_lookahead(opt);
 
-	if (grep_source_load(gs) < 0)
+	if (grep_source_load(gs, opt) < 0)
 		return 0;
 
 	bol = gs->buf;
@@ -1610,12 +1613,17 @@ void grep_source_clear_data(struct grep_source *gs)
 	}
 }
 
-static int grep_source_load_sha1(struct grep_source *gs)
+static int grep_source_load_sha1(struct grep_source *gs, struct grep_opt *opt)
 {
 	enum object_type type;
-
 	grep_read_lock();
-	gs->buf = read_sha1_file(gs->identifier, &type, &gs->size);
+	if (opt->binary == GREP_BINARY_TEXTCONV) {
+		struct diff_filespec *df = alloc_filespec(gs->name);
+		gs->size = fill_textconv(gs->driver, df, &gs->buf);
+		free_filespec(df);
+	} else {
+		gs->buf = read_sha1_file(gs->identifier, &type, &gs->size);
+	}
 	grep_read_unlock();
 
 	if (!gs->buf)
@@ -1625,7 +1633,7 @@ static int grep_source_load_sha1(struct grep_source *gs)
 	return 0;
 }
 
-static int grep_source_load_file(struct grep_source *gs)
+static int grep_source_load_file(struct grep_source *gs, struct grep_opt *opt)
 {
 	const char *filename = gs->identifier;
 	struct stat st;
@@ -1660,42 +1668,45 @@ static int grep_source_load_file(struct grep_source *gs)
 	return 0;
 }
 
-static int grep_source_load(struct grep_source *gs)
+static int grep_source_load(struct grep_source *gs, struct grep_opt *opt)
 {
 	if (gs->buf)
 		return 0;
 
 	switch (gs->type) {
 	case GREP_SOURCE_FILE:
-		return grep_source_load_file(gs);
+		return grep_source_load_file(gs, opt);
 	case GREP_SOURCE_SHA1:
-		return grep_source_load_sha1(gs);
+		return grep_source_load_sha1(gs, opt);
 	case GREP_SOURCE_BUF:
 		return gs->buf ? 0 : -1;
 	}
 	die("BUG: invalid grep_source type");
 }
 
-void grep_source_load_driver(struct grep_source *gs)
+void grep_source_load_driver(struct grep_source *gs, struct grep_opt *opt)
 {
 	if (gs->driver)
 		return;
 
-	grep_attr_lock();
+	grep_attr_lock(); //TODO
+	printf("Looking up userdiff driver for: %s", gs->path);
 	if (gs->path)
 		gs->driver = userdiff_find_by_path(gs->path);
 	if (!gs->driver)
 		gs->driver = userdiff_find_by_name("default");
+	if (opt->binary == GREP_BINARY_TEXTCONV)
+		gs->driver = userdiff_get_textconv(gs->driver);
 	grep_attr_unlock();
 }
 
-static int grep_source_is_binary(struct grep_source *gs)
+static int grep_source_is_binary(struct grep_source *gs, struct grep_opt *opt)
 {
-	grep_source_load_driver(gs);
+	grep_source_load_driver(gs, opt);
 	if (gs->driver->binary != -1)
 		return gs->driver->binary;
 
-	if (!grep_source_load(gs))
+	if (!grep_source_load(gs, opt))
 		return buffer_is_binary(gs->buf, gs->size);
 
 	return 0;
diff --git a/grep.h b/grep.h
index 8fc854f..d272d25 100644
--- a/grep.h
+++ b/grep.h
@@ -105,6 +105,7 @@ struct grep_opt {
 #define GREP_BINARY_DEFAULT	0
 #define GREP_BINARY_NOMATCH	1
 #define GREP_BINARY_TEXT	2
+#define GREP_BINARY_TEXTCONV	3
 	int binary;
 	int extended;
 	int use_reflog_filter;
@@ -173,7 +174,7 @@ void grep_source_init(struct grep_source *gs, enum grep_source_type type,
 		      const void *identifier);
 void grep_source_clear_data(struct grep_source *gs);
 void grep_source_clear(struct grep_source *gs);
-void grep_source_load_driver(struct grep_source *gs);
+void grep_source_load_driver(struct grep_source *gs, struct grep_opt *opt);
 
 
 int grep_source(struct grep_opt *opt, struct grep_source *gs);
-- 
1.8.1.2.718.g9d378fc

^ permalink raw reply related	[flat|nested] 37+ messages in thread

* Re: [WIP/RFH/RFD/PATCH] grep: allow to use textconv filters
  2013-02-04 15:27 [WIP/RFH/RFD/PATCH] grep: allow to use textconv filters Michael J Gruber
@ 2013-02-04 17:12 ` Junio C Hamano
  2013-02-05  8:48   ` Michael J Gruber
  2013-02-05 11:13 ` Jeff King
  1 sibling, 1 reply; 37+ messages in thread
From: Junio C Hamano @ 2013-02-04 17:12 UTC (permalink / raw)
  To: Michael J Gruber; +Cc: git, Jeff King

Michael J Gruber <git@drmicha.warpmail.net> writes:

> Recently and not so recently, we made sure that log/grep type operations
> use textconv filters when a userfacing diff would do the same:
>
> ef90ab6 (pickaxe: use textconv for -S counting, 2012-10-28)
> b1c2f57 (diff_grep: use textconv buffers for add/deleted files, 2012-10-28)
> 0508fe5 (combine-diff: respect textconv attributes, 2011-05-23)
>
> "git grep" currently does not use textconv filters at all, that is
> neither for displaying the match and context nor for the actual grepping.
>
> Introduce a binary mode "--textconv" (in addition to "--text" and "-I")
> which makes git grep use any configured textconv filters for grepping
> and output purposes.
>
> Signed-off-by: Michael J Gruber <git@drmicha.warpmail.net>
> ---
>
> Notes:
>     I'm somehow stuck in textconv/filespec/... hell, so I'm sending this out
>     in request for help. I'm sure there are people for whom it's a breeze to
>     get this right.

Looks like the patch touches the right places in the codepaths that
need to be updated from a quick read.

>     The difficulty is in getting the different cases (blob/sha1 vs.
>     worktree) right, and in making the changes minimally invasive.

I think grep_source abstraction was intended to be a way for that,
and if what it gives you is not sufficient for your needs, extending
it should not be seen as "invasive" at all.

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [WIP/RFH/RFD/PATCH] grep: allow to use textconv filters
  2013-02-04 17:12 ` Junio C Hamano
@ 2013-02-05  8:48   ` Michael J Gruber
  0 siblings, 0 replies; 37+ messages in thread
From: Michael J Gruber @ 2013-02-05  8:48 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: git, Jeff King

Junio C Hamano venit, vidit, dixit 04.02.2013 18:12:
> Michael J Gruber <git@drmicha.warpmail.net> writes:
> 
>> Recently and not so recently, we made sure that log/grep type operations
>> use textconv filters when a userfacing diff would do the same:
>>
>> ef90ab6 (pickaxe: use textconv for -S counting, 2012-10-28)
>> b1c2f57 (diff_grep: use textconv buffers for add/deleted files, 2012-10-28)
>> 0508fe5 (combine-diff: respect textconv attributes, 2011-05-23)
>>
>> "git grep" currently does not use textconv filters at all, that is
>> neither for displaying the match and context nor for the actual grepping.
>>
>> Introduce a binary mode "--textconv" (in addition to "--text" and "-I")
>> which makes git grep use any configured textconv filters for grepping
>> and output purposes.
>>
>> Signed-off-by: Michael J Gruber <git@drmicha.warpmail.net>
>> ---
>>
>> Notes:
>>     I'm somehow stuck in textconv/filespec/... hell, so I'm sending this out
>>     in request for help. I'm sure there are people for whom it's a breeze to
>>     get this right.
> 
> Looks like the patch touches the right places in the codepaths that
> need to be updated from a quick read.

I should have added that this coredumps when used on blobs and that
textconv isn't invoked in any case (because it crashes on blobs, and I
haven't implemented tetxconv on worktree files yet).

I'm somehow struggling to use just the right bits from the more diff
centric helpers like fill_textconv or fill_one (which is static so far).

>>     The difficulty is in getting the different cases (blob/sha1 vs.
>>     worktree) right, and in making the changes minimally invasive.
> 
> I think grep_source abstraction was intended to be a way for that,
> and if what it gives you is not sufficient for your needs, extending
> it should not be seen as "invasive" at all.

It seems to me that we textconvified the diff versions of these
abstractions, but not the grep_source abstractions. Doing it at the
source appears to be the right thing.

Michael

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [WIP/RFH/RFD/PATCH] grep: allow to use textconv filters
  2013-02-04 15:27 [WIP/RFH/RFD/PATCH] grep: allow to use textconv filters Michael J Gruber
  2013-02-04 17:12 ` Junio C Hamano
@ 2013-02-05 11:13 ` Jeff King
  2013-02-05 16:21   ` Michael J Gruber
  1 sibling, 1 reply; 37+ messages in thread
From: Jeff King @ 2013-02-05 11:13 UTC (permalink / raw)
  To: Michael J Gruber; +Cc: Junio C Hamano, git

On Mon, Feb 04, 2013 at 04:27:31PM +0100, Michael J Gruber wrote:

> Recently and not so recently, we made sure that log/grep type operations
> use textconv filters when a userfacing diff would do the same:
> 
> ef90ab6 (pickaxe: use textconv for -S counting, 2012-10-28)
> b1c2f57 (diff_grep: use textconv buffers for add/deleted files, 2012-10-28)
> 0508fe5 (combine-diff: respect textconv attributes, 2011-05-23)
> 
> "git grep" currently does not use textconv filters at all, that is
> neither for displaying the match and context nor for the actual grepping.
> 
> Introduce a binary mode "--textconv" (in addition to "--text" and "-I")
> which makes git grep use any configured textconv filters for grepping
> and output purposes.

Sounds like a reasonable goal.

>     The difficulty is in getting the different cases (blob/sha1 vs.
>     worktree) right, and in making the changes minimally invasive. It seems
>     that some more refactoring could help: "git show --textconv" does not
>     use textconv filters when used on blobs either. (It does for diffs, of
>     course.) Most existing helper functions are tailored for diffs.

I think "git show" with blobs originally did not because we have no
filename with which to look up the attributes. IIRC, the patches to
support "cat-file --textconv" taught get_sha1_with_context to report the
path at which we found a blob. I suspect it is mostly a matter of
plumbing that information from the revision parser through to
show_blob_object.

>     Nota bene: --textconv does not affect "diff --stat" either...

Yeah, though I wonder if it should be on by default. The diffstat for a
binary file, unlike the diff, is already useful. The diffstat of the
textconv'd data may _also_ be useful, of course.

> @@ -659,6 +659,9 @@ int cmd_grep(int argc, const char **argv, const char *prefix)
>  		OPT_SET_INT('I', NULL, &opt.binary,
>  			N_("don't match patterns in binary files"),
>  			GREP_BINARY_NOMATCH),
> +		OPT_SET_INT(0, "textconv", &opt.binary,
> +			N_("process binary files with textconv filters"),
> +			GREP_BINARY_TEXTCONV),

Is this really a new form of GREP_BINARY_*? What happens when a file
does not have a textconv filter?

I would expect this to be more like diff's "--textconv" flag, which is
really "allow textconv to be respected". Then you could do:

  git grep -I --textconv foo

to grep in the text version of files which support it, and ignore the
rest.

> -static int grep_source_load(struct grep_source *gs);
> -static int grep_source_is_binary(struct grep_source *gs);
> +static int grep_source_load(struct grep_source *gs, struct grep_opt *opt);
> +static int grep_source_is_binary(struct grep_source *gs, struct grep_opt *opt);

Hmm. grep_source_load is more or less the analogue of
diff_populate_filespec, which does not know about textconv at all. So I
feel like this might be going in at the wrong layer...

> @@ -1354,14 +1356,15 @@ static int grep_source_1(struct grep_opt *opt, struct grep_source *gs, int colle
>  
>  	switch (opt->binary) {
>  	case GREP_BINARY_DEFAULT:
> -		if (grep_source_is_binary(gs))
> +		if (grep_source_is_binary(gs, opt))
>  			binary_match_only = 1;
>  		break;
>  	case GREP_BINARY_NOMATCH:
> -		if (grep_source_is_binary(gs))
> +		if (grep_source_is_binary(gs, opt))
>  			return 0; /* Assume unmatch */
>  		break;

The is_binary function learned about "opt" so that it could pass it to
grep_source_load, which might do the textconv for us. But we _know_ that
we will not here, because we see that we have other GREP_BINARY flags.

And when we do have _TEXTCONV:

>  	case GREP_BINARY_TEXT:
> +	case GREP_BINARY_TEXTCONV:
>  		break;

We do not call is_binary. :)

> -static int grep_source_load_sha1(struct grep_source *gs)
> +static int grep_source_load_sha1(struct grep_source *gs, struct grep_opt *opt)
>  {
>  	enum object_type type;
> -
>  	grep_read_lock();
> -	gs->buf = read_sha1_file(gs->identifier, &type, &gs->size);
> +	if (opt->binary == GREP_BINARY_TEXTCONV) {
> +		struct diff_filespec *df = alloc_filespec(gs->name);
> +		gs->size = fill_textconv(gs->driver, df, &gs->buf);
> +		free_filespec(df);
> +	} else {
> +		gs->buf = read_sha1_file(gs->identifier, &type, &gs->size);
> +	}

So here we do the textconv for the sha1 case. But what about file
sources?

This is why I think the layer is wrong; you want the fill_textconv
function to call your abstract _load function (in the diff_filespec
world, it is diff_filespec_populate, but it is the moral equivalent).
And you want it to hold off as long as possible in case we can pull the
value from cache, or feed the working tree version of a file straight to
the filter.

> -void grep_source_load_driver(struct grep_source *gs)
> +void grep_source_load_driver(struct grep_source *gs, struct grep_opt *opt)
>  {
>  	if (gs->driver)
>  		return;
>  
> -	grep_attr_lock();
> +	grep_attr_lock(); //TODO
> +	printf("Looking up userdiff driver for: %s", gs->path);
>  	if (gs->path)
>  		gs->driver = userdiff_find_by_path(gs->path);
>  	if (!gs->driver)
>  		gs->driver = userdiff_find_by_name("default");
> +	if (opt->binary == GREP_BINARY_TEXTCONV)
> +		gs->driver = userdiff_get_textconv(gs->driver);
>  	grep_attr_unlock();
>  }

This is wrong. The point of userdiff_get_textconv is that it will return
NULL when we are not doing textconv for this path. So you can use it
like:

  struct userdiff_driver *textconv = userdiff_get_textconv(gs->driver);

  if (textconv) {
          /* ok, we are doing textconv. Call our fill_textconv
           * equivalent. */
  }
  else {
          /* nope, plain old file. */
  }

But by assigning it on top of gs->driver, you're going to end up with a
NULL driver sometimes. And the post-condition of the load_driver
function is that gs->driver always points to a valid driver (even if it
is the default one). I wouldn't be surprised if this causes segfaults.


So I would do it more like the patch below. Only lightly tested by me.
There are some refactoring opportunities if you want to bring
grep_source and diff_filespec closer together.

---
diff --git a/builtin/grep.c b/builtin/grep.c
index 8025964..915c8ef 100644
--- a/builtin/grep.c
+++ b/builtin/grep.c
@@ -659,6 +659,8 @@ int cmd_grep(int argc, const char **argv, const char *prefix)
 		OPT_SET_INT('I', NULL, &opt.binary,
 			N_("don't match patterns in binary files"),
 			GREP_BINARY_NOMATCH),
+		OPT_BOOL(0, "textconv", &opt.allow_textconv,
+			 N_("process binary files with textconv filters")),
 		{ OPTION_INTEGER, 0, "max-depth", &opt.max_depth, N_("depth"),
 			N_("descend at most <depth> levels"), PARSE_OPT_NONEG,
 			NULL, 1 },
diff --git a/grep.c b/grep.c
index 4bd1b8b..3880d64 100644
--- a/grep.c
+++ b/grep.c
@@ -2,6 +2,8 @@
 #include "grep.h"
 #include "userdiff.h"
 #include "xdiff-interface.h"
+#include "diff.h"
+#include "diffcore.h"
 
 static int grep_source_load(struct grep_source *gs);
 static int grep_source_is_binary(struct grep_source *gs);
@@ -1321,6 +1323,58 @@ static void std_output(struct grep_opt *opt, const void *buf, size_t size)
 	fwrite(buf, size, 1, stdout);
 }
 
+static int fill_textconv_grep(struct userdiff_driver *driver,
+			      struct grep_source *gs)
+{
+	struct diff_filespec *df;
+	char *buf;
+	size_t size;
+
+	if (!driver || !driver->textconv)
+		return grep_source_load(gs);
+
+	/*
+	 * The textconv interface is intimately tied to diff_filespecs, so we
+	 * have to pretend to be one. If we could unify the grep_source
+	 * and diff_filespec structs, this mess could just go away.
+	 */
+	df = alloc_filespec(gs->path);
+	switch (gs->type) {
+	case GREP_SOURCE_SHA1:
+		fill_filespec(df, gs->identifier, 1, 0100644);
+		break;
+	case GREP_SOURCE_FILE:
+		fill_filespec(df, null_sha1, 0, 0100644);
+		break;
+	default:
+		die("BUG: attempt to textconv something without a path?");
+	}
+
+	/*
+	 * fill_textconv is not remotely thread-safe; it may load objects
+	 * behind the scenes, and it modifies the global diff tempfile
+	 * structure.
+	 */
+	grep_read_lock();
+	size = fill_textconv(driver, df, &buf);
+	grep_read_unlock();
+	free_filespec(df);
+
+	/*
+	 * The normal fill_textconv usage by the diff machinery would just keep
+	 * the textconv'd buf separate from the diff_filespec. But much of the
+	 * grep code passes around a grep_source and assumes that its "buf"
+	 * pointer is the beginning of the thing we are searching. So let's
+	 * install our textconv'd version into the grep_source, taking care not
+	 * to leak any existing buffer.
+	 */
+	grep_source_clear_data(gs);
+	gs->buf = buf;
+	gs->size = size;
+
+	return 0;
+}
+
 static int grep_source_1(struct grep_opt *opt, struct grep_source *gs, int collect_hits)
 {
 	char *bol;
@@ -1331,6 +1385,7 @@ static int grep_source_1(struct grep_opt *opt, struct grep_source *gs, int colle
 	unsigned count = 0;
 	int try_lookahead = 0;
 	int show_function = 0;
+	struct userdiff_driver *textconv = NULL;
 	enum grep_context ctx = GREP_CONTEXT_HEAD;
 	xdemitconf_t xecfg;
 
@@ -1352,19 +1407,36 @@ static int grep_source_1(struct grep_opt *opt, struct grep_source *gs, int colle
 	}
 	opt->last_shown = 0;
 
-	switch (opt->binary) {
-	case GREP_BINARY_DEFAULT:
-		if (grep_source_is_binary(gs))
-			binary_match_only = 1;
-		break;
-	case GREP_BINARY_NOMATCH:
-		if (grep_source_is_binary(gs))
-			return 0; /* Assume unmatch */
-		break;
-	case GREP_BINARY_TEXT:
-		break;
-	default:
-		die("bug: unknown binary handling mode");
+	if (opt->allow_textconv) {
+		grep_source_load_driver(gs);
+		/*
+		 * We might set up the shared textconv cache data here, which
+		 * is not thread-safe.
+		 */
+		grep_attr_lock();
+		textconv = userdiff_get_textconv(gs->driver);
+		grep_attr_unlock();
+	}
+
+	/*
+	 * We know the result of a textconv is text, so we only have to care
+	 * about binary handling if we are not using it.
+	 */
+	if (!textconv) {
+		switch (opt->binary) {
+		case GREP_BINARY_DEFAULT:
+			if (grep_source_is_binary(gs))
+				binary_match_only = 1;
+			break;
+		case GREP_BINARY_NOMATCH:
+			if (grep_source_is_binary(gs))
+				return 0; /* Assume unmatch */
+			break;
+		case GREP_BINARY_TEXT:
+			break;
+		default:
+			die("bug: unknown binary handling mode");
+		}
 	}
 
 	memset(&xecfg, 0, sizeof(xecfg));
@@ -1372,7 +1444,7 @@ static int grep_source_1(struct grep_opt *opt, struct grep_source *gs, int colle
 
 	try_lookahead = should_lookahead(opt);
 
-	if (grep_source_load(gs) < 0)
+	if (fill_textconv_grep(textconv, gs) < 0)
 		return 0;
 
 	bol = gs->buf;
diff --git a/grep.h b/grep.h
index 8fc854f..94a7ac2 100644
--- a/grep.h
+++ b/grep.h
@@ -106,6 +106,7 @@ struct grep_opt {
 #define GREP_BINARY_NOMATCH	1
 #define GREP_BINARY_TEXT	2
 	int binary;
+	int allow_textconv;
 	int extended;
 	int use_reflog_filter;
 	int pcre;

^ permalink raw reply related	[flat|nested] 37+ messages in thread

* Re: [WIP/RFH/RFD/PATCH] grep: allow to use textconv filters
  2013-02-05 11:13 ` Jeff King
@ 2013-02-05 16:21   ` Michael J Gruber
  2013-02-05 20:11     ` Jeff King
  0 siblings, 1 reply; 37+ messages in thread
From: Michael J Gruber @ 2013-02-05 16:21 UTC (permalink / raw)
  To: Jeff King; +Cc: Junio C Hamano, git

Jeff King venit, vidit, dixit 05.02.2013 12:13:
> On Mon, Feb 04, 2013 at 04:27:31PM +0100, Michael J Gruber wrote:
> 
>> Recently and not so recently, we made sure that log/grep type operations
>> use textconv filters when a userfacing diff would do the same:
>>
>> ef90ab6 (pickaxe: use textconv for -S counting, 2012-10-28)
>> b1c2f57 (diff_grep: use textconv buffers for add/deleted files, 2012-10-28)
>> 0508fe5 (combine-diff: respect textconv attributes, 2011-05-23)
>>
>> "git grep" currently does not use textconv filters at all, that is
>> neither for displaying the match and context nor for the actual grepping.
>>
>> Introduce a binary mode "--textconv" (in addition to "--text" and "-I")
>> which makes git grep use any configured textconv filters for grepping
>> and output purposes.
> 
> Sounds like a reasonable goal.
> 
>>     The difficulty is in getting the different cases (blob/sha1 vs.
>>     worktree) right, and in making the changes minimally invasive. It seems
>>     that some more refactoring could help: "git show --textconv" does not
>>     use textconv filters when used on blobs either. (It does for diffs, of
>>     course.) Most existing helper functions are tailored for diffs.
> 
> I think "git show" with blobs originally did not because we have no
> filename with which to look up the attributes. IIRC, the patches to
> support "cat-file --textconv" taught get_sha1_with_context to report the
> path at which we found a blob. I suspect it is mostly a matter of
> plumbing that information from the revision parser through to
> show_blob_object.
> 
>>     Nota bene: --textconv does not affect "diff --stat" either...
> 
> Yeah, though I wonder if it should be on by default. The diffstat for a
> binary file, unlike the diff, is already useful. The diffstat of the
> textconv'd data may _also_ be useful, of course.
> 
>> @@ -659,6 +659,9 @@ int cmd_grep(int argc, const char **argv, const char *prefix)
>>  		OPT_SET_INT('I', NULL, &opt.binary,
>>  			N_("don't match patterns in binary files"),
>>  			GREP_BINARY_NOMATCH),
>> +		OPT_SET_INT(0, "textconv", &opt.binary,
>> +			N_("process binary files with textconv filters"),
>> +			GREP_BINARY_TEXTCONV),
> 
> Is this really a new form of GREP_BINARY_*? What happens when a file
> does not have a textconv filter?
> 
> I would expect this to be more like diff's "--textconv" flag, which is
> really "allow textconv to be respected". Then you could do:
> 
>   git grep -I --textconv foo
> 
> to grep in the text version of files which support it, and ignore the
> rest.
> 
>> -static int grep_source_load(struct grep_source *gs);
>> -static int grep_source_is_binary(struct grep_source *gs);
>> +static int grep_source_load(struct grep_source *gs, struct grep_opt *opt);
>> +static int grep_source_is_binary(struct grep_source *gs, struct grep_opt *opt);
> 
> Hmm. grep_source_load is more or less the analogue of
> diff_populate_filespec, which does not know about textconv at all. So I
> feel like this might be going in at the wrong layer...
> 
>> @@ -1354,14 +1356,15 @@ static int grep_source_1(struct grep_opt *opt, struct grep_source *gs, int colle
>>  
>>  	switch (opt->binary) {
>>  	case GREP_BINARY_DEFAULT:
>> -		if (grep_source_is_binary(gs))
>> +		if (grep_source_is_binary(gs, opt))
>>  			binary_match_only = 1;
>>  		break;
>>  	case GREP_BINARY_NOMATCH:
>> -		if (grep_source_is_binary(gs))
>> +		if (grep_source_is_binary(gs, opt))
>>  			return 0; /* Assume unmatch */
>>  		break;
> 
> The is_binary function learned about "opt" so that it could pass it to
> grep_source_load, which might do the textconv for us. But we _know_ that
> we will not here, because we see that we have other GREP_BINARY flags.
> 
> And when we do have _TEXTCONV:
> 
>>  	case GREP_BINARY_TEXT:
>> +	case GREP_BINARY_TEXTCONV:
>>  		break;
> 
> We do not call is_binary. :)
> 
>> -static int grep_source_load_sha1(struct grep_source *gs)
>> +static int grep_source_load_sha1(struct grep_source *gs, struct grep_opt *opt)
>>  {
>>  	enum object_type type;
>> -
>>  	grep_read_lock();
>> -	gs->buf = read_sha1_file(gs->identifier, &type, &gs->size);
>> +	if (opt->binary == GREP_BINARY_TEXTCONV) {
>> +		struct diff_filespec *df = alloc_filespec(gs->name);
>> +		gs->size = fill_textconv(gs->driver, df, &gs->buf);
>> +		free_filespec(df);
>> +	} else {
>> +		gs->buf = read_sha1_file(gs->identifier, &type, &gs->size);
>> +	}
> 
> So here we do the textconv for the sha1 case. But what about file
> sources?
> 
> This is why I think the layer is wrong; you want the fill_textconv
> function to call your abstract _load function (in the diff_filespec
> world, it is diff_filespec_populate, but it is the moral equivalent).
> And you want it to hold off as long as possible in case we can pull the
> value from cache, or feed the working tree version of a file straight to
> the filter.
> 
>> -void grep_source_load_driver(struct grep_source *gs)
>> +void grep_source_load_driver(struct grep_source *gs, struct grep_opt *opt)
>>  {
>>  	if (gs->driver)
>>  		return;
>>  
>> -	grep_attr_lock();
>> +	grep_attr_lock(); //TODO
>> +	printf("Looking up userdiff driver for: %s", gs->path);
>>  	if (gs->path)
>>  		gs->driver = userdiff_find_by_path(gs->path);
>>  	if (!gs->driver)
>>  		gs->driver = userdiff_find_by_name("default");
>> +	if (opt->binary == GREP_BINARY_TEXTCONV)
>> +		gs->driver = userdiff_get_textconv(gs->driver);
>>  	grep_attr_unlock();
>>  }
> 
> This is wrong. The point of userdiff_get_textconv is that it will return
> NULL when we are not doing textconv for this path. So you can use it
> like:
> 
>   struct userdiff_driver *textconv = userdiff_get_textconv(gs->driver);
> 
>   if (textconv) {
>           /* ok, we are doing textconv. Call our fill_textconv
>            * equivalent. */
>   }
>   else {
>           /* nope, plain old file. */
>   }
> 
> But by assigning it on top of gs->driver, you're going to end up with a
> NULL driver sometimes. And the post-condition of the load_driver
> function is that gs->driver always points to a valid driver (even if it
> is the default one). I wouldn't be surprised if this causes segfaults.
> 
> 
> So I would do it more like the patch below. Only lightly tested by me.
> There are some refactoring opportunities if you want to bring
> grep_source and diff_filespec closer together.
> 
> ---
> diff --git a/builtin/grep.c b/builtin/grep.c
> index 8025964..915c8ef 100644
> --- a/builtin/grep.c
> +++ b/builtin/grep.c
> @@ -659,6 +659,8 @@ int cmd_grep(int argc, const char **argv, const char *prefix)
>  		OPT_SET_INT('I', NULL, &opt.binary,
>  			N_("don't match patterns in binary files"),
>  			GREP_BINARY_NOMATCH),
> +		OPT_BOOL(0, "textconv", &opt.allow_textconv,
> +			 N_("process binary files with textconv filters")),
>  		{ OPTION_INTEGER, 0, "max-depth", &opt.max_depth, N_("depth"),
>  			N_("descend at most <depth> levels"), PARSE_OPT_NONEG,
>  			NULL, 1 },
> diff --git a/grep.c b/grep.c
> index 4bd1b8b..3880d64 100644
> --- a/grep.c
> +++ b/grep.c
> @@ -2,6 +2,8 @@
>  #include "grep.h"
>  #include "userdiff.h"
>  #include "xdiff-interface.h"
> +#include "diff.h"
> +#include "diffcore.h"
>  
>  static int grep_source_load(struct grep_source *gs);
>  static int grep_source_is_binary(struct grep_source *gs);
> @@ -1321,6 +1323,58 @@ static void std_output(struct grep_opt *opt, const void *buf, size_t size)
>  	fwrite(buf, size, 1, stdout);
>  }
>  
> +static int fill_textconv_grep(struct userdiff_driver *driver,
> +			      struct grep_source *gs)
> +{
> +	struct diff_filespec *df;
> +	char *buf;
> +	size_t size;
> +
> +	if (!driver || !driver->textconv)
> +		return grep_source_load(gs);
> +
> +	/*
> +	 * The textconv interface is intimately tied to diff_filespecs, so we
> +	 * have to pretend to be one. If we could unify the grep_source
> +	 * and diff_filespec structs, this mess could just go away.
> +	 */
> +	df = alloc_filespec(gs->path);
> +	switch (gs->type) {
> +	case GREP_SOURCE_SHA1:
> +		fill_filespec(df, gs->identifier, 1, 0100644);
> +		break;
> +	case GREP_SOURCE_FILE:
> +		fill_filespec(df, null_sha1, 0, 0100644);
> +		break;
> +	default:
> +		die("BUG: attempt to textconv something without a path?");
> +	}
> +
> +	/*
> +	 * fill_textconv is not remotely thread-safe; it may load objects
> +	 * behind the scenes, and it modifies the global diff tempfile
> +	 * structure.
> +	 */
> +	grep_read_lock();
> +	size = fill_textconv(driver, df, &buf);
> +	grep_read_unlock();
> +	free_filespec(df);
> +
> +	/*
> +	 * The normal fill_textconv usage by the diff machinery would just keep
> +	 * the textconv'd buf separate from the diff_filespec. But much of the
> +	 * grep code passes around a grep_source and assumes that its "buf"
> +	 * pointer is the beginning of the thing we are searching. So let's
> +	 * install our textconv'd version into the grep_source, taking care not
> +	 * to leak any existing buffer.
> +	 */
> +	grep_source_clear_data(gs);
> +	gs->buf = buf;
> +	gs->size = size;
> +
> +	return 0;
> +}
> +
>  static int grep_source_1(struct grep_opt *opt, struct grep_source *gs, int collect_hits)
>  {
>  	char *bol;
> @@ -1331,6 +1385,7 @@ static int grep_source_1(struct grep_opt *opt, struct grep_source *gs, int colle
>  	unsigned count = 0;
>  	int try_lookahead = 0;
>  	int show_function = 0;
> +	struct userdiff_driver *textconv = NULL;
>  	enum grep_context ctx = GREP_CONTEXT_HEAD;
>  	xdemitconf_t xecfg;
>  
> @@ -1352,19 +1407,36 @@ static int grep_source_1(struct grep_opt *opt, struct grep_source *gs, int colle
>  	}
>  	opt->last_shown = 0;
>  
> -	switch (opt->binary) {
> -	case GREP_BINARY_DEFAULT:
> -		if (grep_source_is_binary(gs))
> -			binary_match_only = 1;
> -		break;
> -	case GREP_BINARY_NOMATCH:
> -		if (grep_source_is_binary(gs))
> -			return 0; /* Assume unmatch */
> -		break;
> -	case GREP_BINARY_TEXT:
> -		break;
> -	default:
> -		die("bug: unknown binary handling mode");
> +	if (opt->allow_textconv) {
> +		grep_source_load_driver(gs);
> +		/*
> +		 * We might set up the shared textconv cache data here, which
> +		 * is not thread-safe.
> +		 */
> +		grep_attr_lock();
> +		textconv = userdiff_get_textconv(gs->driver);
> +		grep_attr_unlock();
> +	}
> +
> +	/*
> +	 * We know the result of a textconv is text, so we only have to care
> +	 * about binary handling if we are not using it.
> +	 */
> +	if (!textconv) {
> +		switch (opt->binary) {
> +		case GREP_BINARY_DEFAULT:
> +			if (grep_source_is_binary(gs))
> +				binary_match_only = 1;
> +			break;
> +		case GREP_BINARY_NOMATCH:
> +			if (grep_source_is_binary(gs))
> +				return 0; /* Assume unmatch */
> +			break;
> +		case GREP_BINARY_TEXT:
> +			break;
> +		default:
> +			die("bug: unknown binary handling mode");
> +		}
>  	}
>  
>  	memset(&xecfg, 0, sizeof(xecfg));
> @@ -1372,7 +1444,7 @@ static int grep_source_1(struct grep_opt *opt, struct grep_source *gs, int colle
>  
>  	try_lookahead = should_lookahead(opt);
>  
> -	if (grep_source_load(gs) < 0)
> +	if (fill_textconv_grep(textconv, gs) < 0)
>  		return 0;
>  
>  	bol = gs->buf;
> diff --git a/grep.h b/grep.h
> index 8fc854f..94a7ac2 100644
> --- a/grep.h
> +++ b/grep.h
> @@ -106,6 +106,7 @@ struct grep_opt {
>  #define GREP_BINARY_NOMATCH	1
>  #define GREP_BINARY_TEXT	2
>  	int binary;
> +	int allow_textconv;
>  	int extended;
>  	int use_reflog_filter;
>  	int pcre;
> 

Thanks Jeff, that helps a lot! It covers "grep expr" and "grep expr rev
-- path" just fine. I'll look into "grep expr rev:path" which does not
work yet because of an empty driver.

I also have "show --textconv" covered and a suggestion for "cat-file
--textconv" (to work without a textconv filter).

Expect a mini-series soon :)

Michael

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [WIP/RFH/RFD/PATCH] grep: allow to use textconv filters
  2013-02-05 16:21   ` Michael J Gruber
@ 2013-02-05 20:11     ` Jeff King
  2013-02-06 15:08       ` [RFC/PATCH 0/4] textconv for show and grep Michael J Gruber
  0 siblings, 1 reply; 37+ messages in thread
From: Jeff King @ 2013-02-05 20:11 UTC (permalink / raw)
  To: Michael J Gruber; +Cc: Junio C Hamano, git

On Tue, Feb 05, 2013 at 05:21:18PM +0100, Michael J Gruber wrote:

> Thanks Jeff, that helps a lot! It covers "grep expr" and "grep expr rev
> -- path" just fine. I'll look into "grep expr rev:path" which does not
> work yet because of an empty driver.
> 
> I also have "show --textconv" covered and a suggestion for "cat-file
> --textconv" (to work without a textconv filter).
> 
> Expect a mini-series soon :)

Cool, I'm glad it helped. It would be great if diff_filespec and
grep_source could grow together into a unified object. One of the gross
things about the patch I posted is that we will now sometimes read the
file/blob data via grep_source_load, and sometimes via
diff_populate_filespec. They _should_ be equivalent, but in an ideal
world, they would be the same code path.

That may be too much to tackle for your series, though (I wanted to do
it when I factored out grep_source, but backed off for the same reason).

The "grep expr rev:path" fix should look something like this:

diff --git a/builtin/grep.c b/builtin/grep.c
index 915c8ef..cdc7d32 100644
--- a/builtin/grep.c
+++ b/builtin/grep.c
@@ -820,13 +820,17 @@ int cmd_grep(int argc, const char **argv, const char *prefix)
 	for (i = 0; i < argc; i++) {
 		const char *arg = argv[i];
 		unsigned char sha1[20];
+		struct object_context oc;
 		/* Is it a rev? */
-		if (!get_sha1(arg, sha1)) {
+		if (!get_sha1_with_context(arg, sha1, &oc)) {
 			struct object *object = parse_object(sha1);
 			if (!object)
 				die(_("bad object %s"), arg);
 			if (!seen_dashdash)
 				verify_non_filename(prefix, arg);
+			/* oops, we need something that will remember oc.path
+			 * here, so that we can pass it along to
+			 * grep_source_init  */
 			add_object_array(object, arg, &list);
 			continue;
 		}

But you'll have to replace the object_array with something more
featureful, I think.

-Peff

^ permalink raw reply related	[flat|nested] 37+ messages in thread

* [RFC/PATCH 0/4] textconv for show and grep
  2013-02-05 20:11     ` Jeff King
@ 2013-02-06 15:08       ` Michael J Gruber
  2013-02-06 15:08         ` [RFC/PATCH 1/4] show: obey --textconv for blobs Michael J Gruber
                           ` (4 more replies)
  0 siblings, 5 replies; 37+ messages in thread
From: Michael J Gruber @ 2013-02-06 15:08 UTC (permalink / raw)
  To: git; +Cc: Junio C Hamano, Jeff King

This min series aims at completing the textconv support of user facing
commands. It is RFC for lack of documentation and tests, to check
whether we really want to go in that direction (I do).

1/4 covers the missing textconv support in the "blob case" of "git
show", which should be (and then is) analogous to "cat-file" and "diff".

2/4 remedies an oddity of "git cat-file --textconv", which errored out
when there is no textconv filter rather than giving an unfiltered blob
(like every other textconv aware command).

3/4 implements "--textconv" for "git grep" sans the blob case; the code
is all Jeff's.

4/4 adds blob support to 3/4 (the "rev:path" case).

3 and 4 can be squashed, of course.

Now I'm quite a happy differ/shower/grepper with my latin1 and OO files ;)

Jeff King (1):
  grep: allow to use textconv filters

Michael J Gruber (3):
  show: obey --textconv for blobs
  cat-file: do not die on --textconv without textconv filters
  grep: obey --textconv for the case rev:path

 builtin/cat-file.c           |   9 ++--
 builtin/grep.c               |  13 +++---
 builtin/log.c                |  24 +++++++++--
 grep.c                       | 100 +++++++++++++++++++++++++++++++++++++------
 grep.h                       |   1 +
 object.c                     |  26 ++++++++---
 object.h                     |   2 +
 t/t8007-cat-file-textconv.sh |  20 +++------
 8 files changed, 148 insertions(+), 47 deletions(-)

-- 
1.8.1.2.752.g32d147e

^ permalink raw reply	[flat|nested] 37+ messages in thread

* [RFC/PATCH 1/4] show: obey --textconv for blobs
  2013-02-06 15:08       ` [RFC/PATCH 0/4] textconv for show and grep Michael J Gruber
@ 2013-02-06 15:08         ` Michael J Gruber
  2013-02-06 16:53           ` Junio C Hamano
  2013-02-06 22:06           ` Jeff King
  2013-02-06 15:08         ` [RFC/PATCH 2/4] cat-file: do not die on --textconv without textconv filters Michael J Gruber
                           ` (3 subsequent siblings)
  4 siblings, 2 replies; 37+ messages in thread
From: Michael J Gruber @ 2013-02-06 15:08 UTC (permalink / raw)
  To: git; +Cc: Junio C Hamano, Jeff King

Currently, "diff" and "cat-file" for blobs obey "--textconv" options
(with the former defaulting to "--textconv" and the latter to
"--no-textconv") whereas "show" does not obey this option, even though
it takes diff options.

Make "show" on blobs behave like "diff", i.e. obey "--textconv" by
default and "--no-textconv" when given.

Signed-off-by: Michael J Gruber <git@drmicha.warpmail.net>
---
 builtin/log.c | 24 +++++++++++++++++++++---
 1 file changed, 21 insertions(+), 3 deletions(-)

diff --git a/builtin/log.c b/builtin/log.c
index 8f0b2e8..f83870d 100644
--- a/builtin/log.c
+++ b/builtin/log.c
@@ -402,10 +402,28 @@ static void show_tagger(char *buf, int len, struct rev_info *rev)
 	strbuf_release(&out);
 }
 
-static int show_blob_object(const unsigned char *sha1, struct rev_info *rev)
+static int show_blob_object(const unsigned char *sha1, struct rev_info *rev, const char *obj_name)
 {
+	unsigned char sha1c[20];
+	struct object_context obj_context;
+	char *buf;
+	unsigned long size;
+
 	fflush(stdout);
-	return stream_blob_to_fd(1, sha1, NULL, 0);
+	if (!DIFF_OPT_TST(&rev->diffopt, ALLOW_TEXTCONV))
+		return stream_blob_to_fd(1, sha1, NULL, 0);
+
+	if (get_sha1_with_context(obj_name, 0, sha1c, &obj_context))
+		die("Not a valid object name %s", obj_name);
+	if (!obj_context.path[0] ||
+	    !textconv_object(obj_context.path, obj_context.mode, sha1c, 1, &buf, &size))
+		return stream_blob_to_fd(1, sha1, NULL, 0);
+
+	if (!buf)
+		die("git show %s: bad file", obj_name);
+
+	write_or_die(1, buf, size);
+	return 0;
 }
 
 static int show_tag_object(const unsigned char *sha1, struct rev_info *rev)
@@ -491,7 +509,7 @@ int cmd_show(int argc, const char **argv, const char *prefix)
 		const char *name = objects[i].name;
 		switch (o->type) {
 		case OBJ_BLOB:
-			ret = show_blob_object(o->sha1, NULL);
+			ret = show_blob_object(o->sha1, &rev, name);
 			break;
 		case OBJ_TAG: {
 			struct tag *t = (struct tag *)o;
-- 
1.8.1.2.752.g32d147e

^ permalink raw reply related	[flat|nested] 37+ messages in thread

* [RFC/PATCH 2/4] cat-file: do not die on --textconv without textconv filters
  2013-02-06 15:08       ` [RFC/PATCH 0/4] textconv for show and grep Michael J Gruber
  2013-02-06 15:08         ` [RFC/PATCH 1/4] show: obey --textconv for blobs Michael J Gruber
@ 2013-02-06 15:08         ` Michael J Gruber
  2013-02-06 16:47           ` Junio C Hamano
  2013-02-06 22:19           ` Jeff King
  2013-02-06 15:08         ` [RFC/PATCH 3/4] grep: allow to use " Michael J Gruber
                           ` (2 subsequent siblings)
  4 siblings, 2 replies; 37+ messages in thread
From: Michael J Gruber @ 2013-02-06 15:08 UTC (permalink / raw)
  To: git; +Cc: Junio C Hamano, Jeff King

When a command is supposed to use textconv filters (by default or with
"--textconv") and none are configured then the blob is output without
conversion; the only exception to this rule is "cat-file --textconv".

Make it behave like the rest of textconv aware commands.

Signed-off-by: Michael J Gruber <git@drmicha.warpmail.net>
---
 builtin/cat-file.c           |  9 +++++----
 t/t8007-cat-file-textconv.sh | 20 +++++---------------
 2 files changed, 10 insertions(+), 19 deletions(-)

diff --git a/builtin/cat-file.c b/builtin/cat-file.c
index 00528dd..6912dc2 100644
--- a/builtin/cat-file.c
+++ b/builtin/cat-file.c
@@ -146,10 +146,11 @@ static int cat_one_file(int opt, const char *exp_type, const char *obj_name)
 			die("git cat-file --textconv %s: <object> must be <sha1:path>",
 			    obj_name);
 
-		if (!textconv_object(obj_context.path, obj_context.mode, sha1, 1, &buf, &size))
-			die("git cat-file --textconv: unable to run textconv on %s",
-			    obj_name);
-		break;
+		if (textconv_object(obj_context.path, obj_context.mode, sha1, 1, &buf, &size))
+			break;
+
+		/* otherwise expect a blob */
+		exp_type = "blob";
 
 	case 0:
 		if (type_from_string(exp_type) == OBJ_BLOB) {
diff --git a/t/t8007-cat-file-textconv.sh b/t/t8007-cat-file-textconv.sh
index 78a0085..83c6636 100755
--- a/t/t8007-cat-file-textconv.sh
+++ b/t/t8007-cat-file-textconv.sh
@@ -22,11 +22,11 @@ test_expect_success 'setup ' '
 '
 
 cat >expected <<EOF
-fatal: git cat-file --textconv: unable to run textconv on :one.bin
+bin: test version 2
 EOF
 
 test_expect_success 'no filter specified' '
-	git cat-file --textconv :one.bin 2>result
+	git cat-file --textconv :one.bin >result &&
 	test_cmp expected result
 '
 
@@ -36,10 +36,6 @@ test_expect_success 'setup textconv filters' '
 	git config diff.test.cachetextconv false
 '
 
-cat >expected <<EOF
-bin: test version 2
-EOF
-
 test_expect_success 'cat-file without --textconv' '
 	git cat-file blob :one.bin >result &&
 	test_cmp expected result
@@ -73,25 +69,19 @@ test_expect_success 'cat-file --textconv on previous commit' '
 '
 
 test_expect_success SYMLINKS 'cat-file without --textconv (symlink)' '
+	printf "%s" "one.bin" >expected &&
 	git cat-file blob :symlink.bin >result &&
-	printf "%s" "one.bin" >expected
 	test_cmp expected result
 '
 
 
 test_expect_success SYMLINKS 'cat-file --textconv on index (symlink)' '
-	! git cat-file --textconv :symlink.bin 2>result &&
-	cat >expected <<\EOF &&
-fatal: git cat-file --textconv: unable to run textconv on :symlink.bin
-EOF
+	git cat-file --textconv :symlink.bin >result &&
 	test_cmp expected result
 '
 
 test_expect_success SYMLINKS 'cat-file --textconv on HEAD (symlink)' '
-	! git cat-file --textconv HEAD:symlink.bin 2>result &&
-	cat >expected <<EOF &&
-fatal: git cat-file --textconv: unable to run textconv on HEAD:symlink.bin
-EOF
+	git cat-file --textconv HEAD:symlink.bin >result &&
 	test_cmp expected result
 '
 
-- 
1.8.1.2.752.g32d147e

^ permalink raw reply related	[flat|nested] 37+ messages in thread

* [RFC/PATCH 3/4] grep: allow to use textconv filters
  2013-02-06 15:08       ` [RFC/PATCH 0/4] textconv for show and grep Michael J Gruber
  2013-02-06 15:08         ` [RFC/PATCH 1/4] show: obey --textconv for blobs Michael J Gruber
  2013-02-06 15:08         ` [RFC/PATCH 2/4] cat-file: do not die on --textconv without textconv filters Michael J Gruber
@ 2013-02-06 15:08         ` Michael J Gruber
  2013-02-06 15:12           ` Matthieu Moy
  2013-02-06 22:23           ` Jeff King
  2013-02-06 15:08         ` [RFC/PATCH 4/4] grep: obey --textconv for the case rev:path Michael J Gruber
  2013-02-06 16:55         ` [RFC/PATCH 0/4] textconv for show and grep Junio C Hamano
  4 siblings, 2 replies; 37+ messages in thread
From: Michael J Gruber @ 2013-02-06 15:08 UTC (permalink / raw)
  To: git; +Cc: Junio C Hamano, Jeff King

From: Jeff King <peff@peff.net>

Recently and not so recently, we made sure that log/grep type operations
use textconv filters when a userfacing diff would do the same:

ef90ab6 (pickaxe: use textconv for -S counting, 2012-10-28)
b1c2f57 (diff_grep: use textconv buffers for add/deleted files, 2012-10-28)
0508fe5 (combine-diff: respect textconv attributes, 2011-05-23)

"git grep" currently does not use textconv filters at all, that is
neither for displaying the match and context nor for the actual grepping.

Introduce an option "--textconv" which makes git grep use any configured
textconv filters for grepping and output purposes. It is off by default.

Signed-off-by: Michael J Gruber <git@drmicha.warpmail.net>
---
 builtin/grep.c |   2 ++
 grep.c         | 100 +++++++++++++++++++++++++++++++++++++++++++++++++--------
 grep.h         |   1 +
 3 files changed, 89 insertions(+), 14 deletions(-)

diff --git a/builtin/grep.c b/builtin/grep.c
index 8025964..915c8ef 100644
--- a/builtin/grep.c
+++ b/builtin/grep.c
@@ -659,6 +659,8 @@ int cmd_grep(int argc, const char **argv, const char *prefix)
 		OPT_SET_INT('I', NULL, &opt.binary,
 			N_("don't match patterns in binary files"),
 			GREP_BINARY_NOMATCH),
+		OPT_BOOL(0, "textconv", &opt.allow_textconv,
+			 N_("process binary files with textconv filters")),
 		{ OPTION_INTEGER, 0, "max-depth", &opt.max_depth, N_("depth"),
 			N_("descend at most <depth> levels"), PARSE_OPT_NONEG,
 			NULL, 1 },
diff --git a/grep.c b/grep.c
index 4bd1b8b..3880d64 100644
--- a/grep.c
+++ b/grep.c
@@ -2,6 +2,8 @@
 #include "grep.h"
 #include "userdiff.h"
 #include "xdiff-interface.h"
+#include "diff.h"
+#include "diffcore.h"
 
 static int grep_source_load(struct grep_source *gs);
 static int grep_source_is_binary(struct grep_source *gs);
@@ -1321,6 +1323,58 @@ static void std_output(struct grep_opt *opt, const void *buf, size_t size)
 	fwrite(buf, size, 1, stdout);
 }
 
+static int fill_textconv_grep(struct userdiff_driver *driver,
+			      struct grep_source *gs)
+{
+	struct diff_filespec *df;
+	char *buf;
+	size_t size;
+
+	if (!driver || !driver->textconv)
+		return grep_source_load(gs);
+
+	/*
+	 * The textconv interface is intimately tied to diff_filespecs, so we
+	 * have to pretend to be one. If we could unify the grep_source
+	 * and diff_filespec structs, this mess could just go away.
+	 */
+	df = alloc_filespec(gs->path);
+	switch (gs->type) {
+	case GREP_SOURCE_SHA1:
+		fill_filespec(df, gs->identifier, 1, 0100644);
+		break;
+	case GREP_SOURCE_FILE:
+		fill_filespec(df, null_sha1, 0, 0100644);
+		break;
+	default:
+		die("BUG: attempt to textconv something without a path?");
+	}
+
+	/*
+	 * fill_textconv is not remotely thread-safe; it may load objects
+	 * behind the scenes, and it modifies the global diff tempfile
+	 * structure.
+	 */
+	grep_read_lock();
+	size = fill_textconv(driver, df, &buf);
+	grep_read_unlock();
+	free_filespec(df);
+
+	/*
+	 * The normal fill_textconv usage by the diff machinery would just keep
+	 * the textconv'd buf separate from the diff_filespec. But much of the
+	 * grep code passes around a grep_source and assumes that its "buf"
+	 * pointer is the beginning of the thing we are searching. So let's
+	 * install our textconv'd version into the grep_source, taking care not
+	 * to leak any existing buffer.
+	 */
+	grep_source_clear_data(gs);
+	gs->buf = buf;
+	gs->size = size;
+
+	return 0;
+}
+
 static int grep_source_1(struct grep_opt *opt, struct grep_source *gs, int collect_hits)
 {
 	char *bol;
@@ -1331,6 +1385,7 @@ static int grep_source_1(struct grep_opt *opt, struct grep_source *gs, int colle
 	unsigned count = 0;
 	int try_lookahead = 0;
 	int show_function = 0;
+	struct userdiff_driver *textconv = NULL;
 	enum grep_context ctx = GREP_CONTEXT_HEAD;
 	xdemitconf_t xecfg;
 
@@ -1352,19 +1407,36 @@ static int grep_source_1(struct grep_opt *opt, struct grep_source *gs, int colle
 	}
 	opt->last_shown = 0;
 
-	switch (opt->binary) {
-	case GREP_BINARY_DEFAULT:
-		if (grep_source_is_binary(gs))
-			binary_match_only = 1;
-		break;
-	case GREP_BINARY_NOMATCH:
-		if (grep_source_is_binary(gs))
-			return 0; /* Assume unmatch */
-		break;
-	case GREP_BINARY_TEXT:
-		break;
-	default:
-		die("bug: unknown binary handling mode");
+	if (opt->allow_textconv) {
+		grep_source_load_driver(gs);
+		/*
+		 * We might set up the shared textconv cache data here, which
+		 * is not thread-safe.
+		 */
+		grep_attr_lock();
+		textconv = userdiff_get_textconv(gs->driver);
+		grep_attr_unlock();
+	}
+
+	/*
+	 * We know the result of a textconv is text, so we only have to care
+	 * about binary handling if we are not using it.
+	 */
+	if (!textconv) {
+		switch (opt->binary) {
+		case GREP_BINARY_DEFAULT:
+			if (grep_source_is_binary(gs))
+				binary_match_only = 1;
+			break;
+		case GREP_BINARY_NOMATCH:
+			if (grep_source_is_binary(gs))
+				return 0; /* Assume unmatch */
+			break;
+		case GREP_BINARY_TEXT:
+			break;
+		default:
+			die("bug: unknown binary handling mode");
+		}
 	}
 
 	memset(&xecfg, 0, sizeof(xecfg));
@@ -1372,7 +1444,7 @@ static int grep_source_1(struct grep_opt *opt, struct grep_source *gs, int colle
 
 	try_lookahead = should_lookahead(opt);
 
-	if (grep_source_load(gs) < 0)
+	if (fill_textconv_grep(textconv, gs) < 0)
 		return 0;
 
 	bol = gs->buf;
diff --git a/grep.h b/grep.h
index 8fc854f..94a7ac2 100644
--- a/grep.h
+++ b/grep.h
@@ -106,6 +106,7 @@ struct grep_opt {
 #define GREP_BINARY_NOMATCH	1
 #define GREP_BINARY_TEXT	2
 	int binary;
+	int allow_textconv;
 	int extended;
 	int use_reflog_filter;
 	int pcre;
-- 
1.8.1.2.752.g32d147e

^ permalink raw reply related	[flat|nested] 37+ messages in thread

* [RFC/PATCH 4/4] grep: obey --textconv for the case rev:path
  2013-02-06 15:08       ` [RFC/PATCH 0/4] textconv for show and grep Michael J Gruber
                           ` (2 preceding siblings ...)
  2013-02-06 15:08         ` [RFC/PATCH 3/4] grep: allow to use " Michael J Gruber
@ 2013-02-06 15:08         ` Michael J Gruber
  2013-02-06 22:36           ` Jeff King
  2013-02-06 16:55         ` [RFC/PATCH 0/4] textconv for show and grep Junio C Hamano
  4 siblings, 1 reply; 37+ messages in thread
From: Michael J Gruber @ 2013-02-06 15:08 UTC (permalink / raw)
  To: git; +Cc: Junio C Hamano, Jeff King

Make "grep" obey the "--textconv" option also for the object case, i.e.
when used with an argument "rev:path".

Signed-off-by: Michael J Gruber <git@drmicha.warpmail.net>
---
 builtin/grep.c | 11 ++++++-----
 object.c       | 26 ++++++++++++++++++++------
 object.h       |  2 ++
 3 files changed, 28 insertions(+), 11 deletions(-)

diff --git a/builtin/grep.c b/builtin/grep.c
index 915c8ef..0f3c4db 100644
--- a/builtin/grep.c
+++ b/builtin/grep.c
@@ -458,10 +458,10 @@ static int grep_tree(struct grep_opt *opt, const struct pathspec *pathspec,
 }
 
 static int grep_object(struct grep_opt *opt, const struct pathspec *pathspec,
-		       struct object *obj, const char *name)
+		       struct object *obj, const char *name, struct object_context *oc)
 {
 	if (obj->type == OBJ_BLOB)
-		return grep_sha1(opt, obj->sha1, name, 0, NULL);
+		return grep_sha1(opt, obj->sha1, name, 0, oc ? oc->path : NULL);
 	if (obj->type == OBJ_COMMIT || obj->type == OBJ_TREE) {
 		struct tree_desc tree;
 		void *data;
@@ -503,7 +503,7 @@ static int grep_objects(struct grep_opt *opt, const struct pathspec *pathspec,
 	for (i = 0; i < nr; i++) {
 		struct object *real_obj;
 		real_obj = deref_tag(list->objects[i].item, NULL, 0);
-		if (grep_object(opt, pathspec, real_obj, list->objects[i].name)) {
+		if (grep_object(opt, pathspec, real_obj, list->objects[i].name, list->objects[i].context)) {
 			hit = 1;
 			if (opt->status_only)
 				break;
@@ -820,14 +820,15 @@ int cmd_grep(int argc, const char **argv, const char *prefix)
 	for (i = 0; i < argc; i++) {
 		const char *arg = argv[i];
 		unsigned char sha1[20];
+		struct object_context oc;
 		/* Is it a rev? */
-		if (!get_sha1(arg, sha1)) {
+		if (!get_sha1_with_context(arg, 0, sha1, &oc)) {
 			struct object *object = parse_object(sha1);
 			if (!object)
 				die(_("bad object %s"), arg);
 			if (!seen_dashdash)
 				verify_non_filename(prefix, arg);
-			add_object_array(object, arg, &list);
+			add_object_array_with_context(object, arg, &list, xmemdupz(&oc, sizeof(struct object_context)));
 			continue;
 		}
 		if (!strcmp(arg, "--")) {
diff --git a/object.c b/object.c
index 4af3451..1b796c7 100644
--- a/object.c
+++ b/object.c
@@ -245,12 +245,7 @@ int object_list_contains(struct object_list *list, struct object *obj)
 	return 0;
 }
 
-void add_object_array(struct object *obj, const char *name, struct object_array *array)
-{
-	add_object_array_with_mode(obj, name, array, S_IFINVALID);
-}
-
-void add_object_array_with_mode(struct object *obj, const char *name, struct object_array *array, unsigned mode)
+static void add_object_array_with_mode_context(struct object *obj, const char *name, struct object_array *array, unsigned mode, struct object_context *context)
 {
 	unsigned nr = array->nr;
 	unsigned alloc = array->alloc;
@@ -265,9 +260,28 @@ void add_object_array_with_mode(struct object *obj, const char *name, struct obj
 	objects[nr].item = obj;
 	objects[nr].name = name;
 	objects[nr].mode = mode;
+	objects[nr].context = context;
 	array->nr = ++nr;
 }
 
+void add_object_array(struct object *obj, const char *name, struct object_array *array)
+{
+	add_object_array_with_mode(obj, name, array, S_IFINVALID);
+}
+
+void add_object_array_with_mode(struct object *obj, const char *name, struct object_array *array, unsigned mode)
+{
+	add_object_array_with_mode_context(obj, name, array, mode, NULL);
+}
+
+void add_object_array_with_context(struct object *obj, const char *name, struct object_array *array, struct object_context *context)
+{
+	if (context)
+		add_object_array_with_mode_context(obj, name, array, context->mode, context);
+	else
+		add_object_array_with_mode_context(obj, name, array, S_IFINVALID, context);
+}
+
 void object_array_remove_duplicates(struct object_array *array)
 {
 	unsigned int ref, src, dst;
diff --git a/object.h b/object.h
index 6a97b6b..a11d719 100644
--- a/object.h
+++ b/object.h
@@ -13,6 +13,7 @@ struct object_array {
 		struct object *item;
 		const char *name;
 		unsigned mode;
+		struct object_context *context;
 	} *objects;
 };
 
@@ -74,6 +75,7 @@ int object_list_contains(struct object_list *list, struct object *obj);
 /* Object array handling .. */
 void add_object_array(struct object *obj, const char *name, struct object_array *array);
 void add_object_array_with_mode(struct object *obj, const char *name, struct object_array *array, unsigned mode);
+void add_object_array_with_context(struct object *obj, const char *name, struct object_array *array, struct object_context *context);
 void object_array_remove_duplicates(struct object_array *);
 
 void clear_object_flags(unsigned flags);
-- 
1.8.1.2.752.g32d147e

^ permalink raw reply related	[flat|nested] 37+ messages in thread

* Re: [RFC/PATCH 3/4] grep: allow to use textconv filters
  2013-02-06 15:08         ` [RFC/PATCH 3/4] grep: allow to use " Michael J Gruber
@ 2013-02-06 15:12           ` Matthieu Moy
  2013-02-06 22:23           ` Jeff King
  1 sibling, 0 replies; 37+ messages in thread
From: Matthieu Moy @ 2013-02-06 15:12 UTC (permalink / raw)
  To: Michael J Gruber; +Cc: git, Junio C Hamano, Jeff King

Michael J Gruber <git@drmicha.warpmail.net> writes:

> Introduce an option "--textconv" which makes git grep use any configured
> textconv filters for grepping and output purposes. It is off by default.
>
> Signed-off-by: Michael J Gruber <git@drmicha.warpmail.net>
> ---
>  builtin/grep.c |   2 ++
>  grep.c         | 100 +++++++++++++++++++++++++++++++++++++++++++++++++--------
>  grep.h         |   1 +

Don't forget to update Documentation/git-grep.txt too.

-- 
Matthieu Moy
http://www-verimag.imag.fr/~moy/

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [RFC/PATCH 2/4] cat-file: do not die on --textconv without textconv filters
  2013-02-06 15:08         ` [RFC/PATCH 2/4] cat-file: do not die on --textconv without textconv filters Michael J Gruber
@ 2013-02-06 16:47           ` Junio C Hamano
  2013-02-06 22:19           ` Jeff King
  1 sibling, 0 replies; 37+ messages in thread
From: Junio C Hamano @ 2013-02-06 16:47 UTC (permalink / raw)
  To: Michael J Gruber; +Cc: git, Jeff King

Michael J Gruber <git@drmicha.warpmail.net> writes:

> When a command is supposed to use textconv filters (by default or with
> "--textconv") and none are configured then the blob is output without
> conversion; the only exception to this rule is "cat-file --textconv".

I am of two minds.  Because cat-file is mostly a low-level plumbing,
I do not necessarily think it is a bad behaviour for it to error out
when it was asked to apply textconv where there is no filter or when
the filter fails to produce an output.  On the other hand, it
certainly makes it more convenient for callers that do not care too
deeply, taking textconv as a mere hint just like Porcelains do.

>
> Make it behave like the rest of textconv aware commands.
>
> Signed-off-by: Michael J Gruber <git@drmicha.warpmail.net>
> ---
>  builtin/cat-file.c           |  9 +++++----
>  t/t8007-cat-file-textconv.sh | 20 +++++---------------
>  2 files changed, 10 insertions(+), 19 deletions(-)
>
> diff --git a/builtin/cat-file.c b/builtin/cat-file.c
> index 00528dd..6912dc2 100644
> --- a/builtin/cat-file.c
> +++ b/builtin/cat-file.c
> @@ -146,10 +146,11 @@ static int cat_one_file(int opt, const char *exp_type, const char *obj_name)
>  			die("git cat-file --textconv %s: <object> must be <sha1:path>",
>  			    obj_name);
>  
> -		if (!textconv_object(obj_context.path, obj_context.mode, sha1, 1, &buf, &size))
> -			die("git cat-file --textconv: unable to run textconv on %s",
> -			    obj_name);
> -		break;
> +		if (textconv_object(obj_context.path, obj_context.mode, sha1, 1, &buf, &size))
> +			break;
> +
> +		/* otherwise expect a blob */
> +		exp_type = "blob";

Please use the constant string blob_type that is available for all
callers including this one.

But stepping back a bit.

What happens when I say "cat-file -c HEAD:Documentation", and what
should happen when I do so?

I think what we want to see in the ideal world might be:

 * If we have a textconv for tree objects at that path to format it
   specially, textconv_object() may be allowed to textualize it
   (even though it is not a blob, and textconv so far has always
   been about blobs; it needs to be considered carefully if it makes
   sense to allow such a usage) and show it;

 * If we don't, we act as if -c were -p; in other words, we treat
   the built-in "human output" implemented by "cat-file -p" as if
   that is a textconv.

In other words, you may want to fall-thru to case 'p', not case 0
with forced "blob" type.

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [RFC/PATCH 1/4] show: obey --textconv for blobs
  2013-02-06 15:08         ` [RFC/PATCH 1/4] show: obey --textconv for blobs Michael J Gruber
@ 2013-02-06 16:53           ` Junio C Hamano
  2013-02-06 22:12             ` Jeff King
  2013-02-07  8:48             ` Michael J Gruber
  2013-02-06 22:06           ` Jeff King
  1 sibling, 2 replies; 37+ messages in thread
From: Junio C Hamano @ 2013-02-06 16:53 UTC (permalink / raw)
  To: Michael J Gruber; +Cc: git, Jeff King

Michael J Gruber <git@drmicha.warpmail.net> writes:

> Currently, "diff" and "cat-file" for blobs obey "--textconv" options
> (with the former defaulting to "--textconv" and the latter to
> "--no-textconv") whereas "show" does not obey this option, even though
> it takes diff options.
>
> Make "show" on blobs behave like "diff", i.e. obey "--textconv" by
> default and "--no-textconv" when given.

What does "log -p" do currently, and what should it do?  Does/should
it also use --textconv?

The --textconv is a natural extension of what --ext-diff provides us,
so I think it should trigger the same way as how --ext-diff triggers.

We apply "--ext-diff" for "diff" by default but not for "log -p" and
"show"; I suspect this may have been for a good reason but I do not
recall the discussion that led to the current behaviour offhand.

> Signed-off-by: Michael J Gruber <git@drmicha.warpmail.net>
> ---
>  builtin/log.c | 24 +++++++++++++++++++++---
>  1 file changed, 21 insertions(+), 3 deletions(-)
>
> diff --git a/builtin/log.c b/builtin/log.c
> index 8f0b2e8..f83870d 100644
> --- a/builtin/log.c
> +++ b/builtin/log.c
> @@ -402,10 +402,28 @@ static void show_tagger(char *buf, int len, struct rev_info *rev)
>  	strbuf_release(&out);
>  }
>  
> -static int show_blob_object(const unsigned char *sha1, struct rev_info *rev)
> +static int show_blob_object(const unsigned char *sha1, struct rev_info *rev, const char *obj_name)
>  {
> +	unsigned char sha1c[20];
> +	struct object_context obj_context;
> +	char *buf;
> +	unsigned long size;
> +
>  	fflush(stdout);
> -	return stream_blob_to_fd(1, sha1, NULL, 0);
> +	if (!DIFF_OPT_TST(&rev->diffopt, ALLOW_TEXTCONV))
> +		return stream_blob_to_fd(1, sha1, NULL, 0);
> +
> +	if (get_sha1_with_context(obj_name, 0, sha1c, &obj_context))
> +		die("Not a valid object name %s", obj_name);
> +	if (!obj_context.path[0] ||
> +	    !textconv_object(obj_context.path, obj_context.mode, sha1c, 1, &buf, &size))
> +		return stream_blob_to_fd(1, sha1, NULL, 0);
> +
> +	if (!buf)
> +		die("git show %s: bad file", obj_name);
> +
> +	write_or_die(1, buf, size);
> +	return 0;
>  }
>  
>  static int show_tag_object(const unsigned char *sha1, struct rev_info *rev)
> @@ -491,7 +509,7 @@ int cmd_show(int argc, const char **argv, const char *prefix)
>  		const char *name = objects[i].name;
>  		switch (o->type) {
>  		case OBJ_BLOB:
> -			ret = show_blob_object(o->sha1, NULL);
> +			ret = show_blob_object(o->sha1, &rev, name);
>  			break;
>  		case OBJ_TAG: {
>  			struct tag *t = (struct tag *)o;

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [RFC/PATCH 0/4] textconv for show and grep
  2013-02-06 15:08       ` [RFC/PATCH 0/4] textconv for show and grep Michael J Gruber
                           ` (3 preceding siblings ...)
  2013-02-06 15:08         ` [RFC/PATCH 4/4] grep: obey --textconv for the case rev:path Michael J Gruber
@ 2013-02-06 16:55         ` Junio C Hamano
  4 siblings, 0 replies; 37+ messages in thread
From: Junio C Hamano @ 2013-02-06 16:55 UTC (permalink / raw)
  To: Michael J Gruber; +Cc: git, Jeff King

The parts for "grep" in the series makes tons of sense to me.  I am
not yet convinced about the other two, though.

Thanks.

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [RFC/PATCH 1/4] show: obey --textconv for blobs
  2013-02-06 15:08         ` [RFC/PATCH 1/4] show: obey --textconv for blobs Michael J Gruber
  2013-02-06 16:53           ` Junio C Hamano
@ 2013-02-06 22:06           ` Jeff King
  2013-02-07  9:05             ` Michael J Gruber
  1 sibling, 1 reply; 37+ messages in thread
From: Jeff King @ 2013-02-06 22:06 UTC (permalink / raw)
  To: Michael J Gruber; +Cc: git, Junio C Hamano

On Wed, Feb 06, 2013 at 04:08:50PM +0100, Michael J Gruber wrote:

> diff --git a/builtin/log.c b/builtin/log.c
> index 8f0b2e8..f83870d 100644
> --- a/builtin/log.c
> +++ b/builtin/log.c
> @@ -402,10 +402,28 @@ static void show_tagger(char *buf, int len, struct rev_info *rev)
>  	strbuf_release(&out);
>  }
>  
> -static int show_blob_object(const unsigned char *sha1, struct rev_info *rev)
> +static int show_blob_object(const unsigned char *sha1, struct rev_info *rev, const char *obj_name)

Should this maybe just take the whole object_array_entry as a cleanup?

>  {
> +	unsigned char sha1c[20];
> +	struct object_context obj_context;
> +	char *buf;
> +	unsigned long size;
> +
>  	fflush(stdout);
> -	return stream_blob_to_fd(1, sha1, NULL, 0);
> +	if (!DIFF_OPT_TST(&rev->diffopt, ALLOW_TEXTCONV))
> +		return stream_blob_to_fd(1, sha1, NULL, 0);
> +
> +	if (get_sha1_with_context(obj_name, 0, sha1c, &obj_context))
> +		die("Not a valid object name %s", obj_name);

It seems a little hacky that we have to look up the sha1 again. What
should happen in the off chance that "hashcmp(sha1, sha1c) != 0" due to
a race with a simultaneous update of a ref?

Would it be better if object_array_entry replaced its "mode" member with
an object_context? The only downside I see is that we might waste a
significant amount of memory (each context has a PATH_MAX buffer in it).

-Peff

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [RFC/PATCH 1/4] show: obey --textconv for blobs
  2013-02-06 16:53           ` Junio C Hamano
@ 2013-02-06 22:12             ` Jeff King
  2013-02-06 23:49               ` Junio C Hamano
  2013-02-07  8:48             ` Michael J Gruber
  1 sibling, 1 reply; 37+ messages in thread
From: Jeff King @ 2013-02-06 22:12 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: Michael J Gruber, git

On Wed, Feb 06, 2013 at 08:53:52AM -0800, Junio C Hamano wrote:

> Michael J Gruber <git@drmicha.warpmail.net> writes:
> 
> > Currently, "diff" and "cat-file" for blobs obey "--textconv" options
> > (with the former defaulting to "--textconv" and the latter to
> > "--no-textconv") whereas "show" does not obey this option, even though
> > it takes diff options.
> >
> > Make "show" on blobs behave like "diff", i.e. obey "--textconv" by
> > default and "--no-textconv" when given.
> 
> What does "log -p" do currently, and what should it do?  Does/should
> it also use --textconv?
> 
> The --textconv is a natural extension of what --ext-diff provides us,
> so I think it should trigger the same way as how --ext-diff triggers.
> 
> We apply "--ext-diff" for "diff" by default but not for "log -p" and
> "show"; I suspect this may have been for a good reason but I do not
> recall the discussion that led to the current behaviour offhand.

I think Michael's commit message explains the situation badly.
--textconv is already on for "git show" (and for "git log") by default.
Diffs already use it.

This is more about the fact that when showing a single blob, we do not
bother to remember the context of the sha1 lookup, including its
pathname. Therefore we were not previously able to apply any
.gitattributes to the output. So this patch really does two things:

  1. Pass the information along to show_blob_object so that it can
     look up .gitattributes.

  2. Apply the textconv attribute (if ALLOW_TEXTCONV is on, of course).

And stating it that way makes it clear that there may be other missing
steps (3 and up) to apply other gitattributes. For example, should "git
show $blob" respect crlf attributes? Smudge filters?

-Peff

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [RFC/PATCH 2/4] cat-file: do not die on --textconv without textconv filters
  2013-02-06 15:08         ` [RFC/PATCH 2/4] cat-file: do not die on --textconv without textconv filters Michael J Gruber
  2013-02-06 16:47           ` Junio C Hamano
@ 2013-02-06 22:19           ` Jeff King
  2013-02-06 22:23             ` Junio C Hamano
  1 sibling, 1 reply; 37+ messages in thread
From: Jeff King @ 2013-02-06 22:19 UTC (permalink / raw)
  To: Michael J Gruber; +Cc: git, Junio C Hamano

On Wed, Feb 06, 2013 at 04:08:51PM +0100, Michael J Gruber wrote:

> When a command is supposed to use textconv filters (by default or with
> "--textconv") and none are configured then the blob is output without
> conversion; the only exception to this rule is "cat-file --textconv".
> 
> Make it behave like the rest of textconv aware commands.

Makes sense.

> -		if (!textconv_object(obj_context.path, obj_context.mode, sha1, 1, &buf, &size))
> -			die("git cat-file --textconv: unable to run textconv on %s",
> -			    obj_name);
> -		break;
> +		if (textconv_object(obj_context.path, obj_context.mode, sha1, 1, &buf, &size))
> +			break;

The implication here is that textconv_object should be handling its own
errors and dying, and the return is always "yes, I converted" or "no, I
did not". Which I think is the case.

> +
> +		/* otherwise expect a blob */
> +		exp_type = "blob";
>  
>  	case 0:
>  		if (type_from_string(exp_type) == OBJ_BLOB) {

I wondered at first why we needed to set exp_type here; shouldn't we
already be expecting a blob if we are doing textconv? But then I see
this is really about the fall-through in the switch (which we might want
an explicit comment for).

Which made me wonder: what happens with:

  git cat-file --textconv HEAD

It looks like we die just before textconv-ing, because we have no
obj_context.path. But that is also unlike all of the other --textconv
switches, which mean "turn on textconv if you are showing a blob that
supports it" and not "the specific operation is --textconv, apply it to
this blob". I don't know if that is worth changing or not.

-Peff

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [RFC/PATCH 2/4] cat-file: do not die on --textconv without textconv filters
  2013-02-06 22:19           ` Jeff King
@ 2013-02-06 22:23             ` Junio C Hamano
  2013-02-06 22:43               ` Jeff King
  0 siblings, 1 reply; 37+ messages in thread
From: Junio C Hamano @ 2013-02-06 22:23 UTC (permalink / raw)
  To: Jeff King; +Cc: Michael J Gruber, git

Jeff King <peff@peff.net> writes:

> Which made me wonder: what happens with:
>
>   git cat-file --textconv HEAD
>
> It looks like we die just before textconv-ing, because we have no
> obj_context.path. But that is also unlike all of the other --textconv
> switches, which mean "turn on textconv if you are showing a blob that
> supports it" and not "the specific operation is --textconv, apply it to
> this blob". I don't know if that is worth changing or not.

OK, so in that sense, "cat-file --textconv HEAD" (or HEAD:) should
die with "Hey, that is not a blob", in other words, Michael's patch
does what we want without further tweaks, right?

By the way are we sure textconv_object() barfs and dies if fed a non
blob?  Otherwise the above does not hold, and the semantics become
"turn on textconv if the object you are showing supports it,
otherwise it has to be a blob.", I think.

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [RFC/PATCH 3/4] grep: allow to use textconv filters
  2013-02-06 15:08         ` [RFC/PATCH 3/4] grep: allow to use " Michael J Gruber
  2013-02-06 15:12           ` Matthieu Moy
@ 2013-02-06 22:23           ` Jeff King
  1 sibling, 0 replies; 37+ messages in thread
From: Jeff King @ 2013-02-06 22:23 UTC (permalink / raw)
  To: Michael J Gruber; +Cc: git, Junio C Hamano

On Wed, Feb 06, 2013 at 04:08:52PM +0100, Michael J Gruber wrote:

> From: Jeff King <peff@peff.net>
> 
> Recently and not so recently, we made sure that log/grep type operations
> use textconv filters when a userfacing diff would do the same:
> 
> ef90ab6 (pickaxe: use textconv for -S counting, 2012-10-28)
> b1c2f57 (diff_grep: use textconv buffers for add/deleted files, 2012-10-28)
> 0508fe5 (combine-diff: respect textconv attributes, 2011-05-23)
> 
> "git grep" currently does not use textconv filters at all, that is
> neither for displaying the match and context nor for the actual grepping.
> 
> Introduce an option "--textconv" which makes git grep use any configured
> textconv filters for grepping and output purposes. It is off by default.
> 
> Signed-off-by: Michael J Gruber <git@drmicha.warpmail.net>

Signed-off-by: Jeff King <peff@peff.net>

I'd really love to see the refactoring I talked about in my earlier
message. But as I'm not willing to devote the time to do it right now,
and I do not think this patch has any particular bugs, I think it is OK
as it gets the job done, and does not make the later refactoring any
harder.

The one ugliness that still remains is:

> +	if (opt->allow_textconv) {
> +		grep_source_load_driver(gs);
> +		/*
> +		 * We might set up the shared textconv cache data here, which
> +		 * is not thread-safe.
> +		 */
> +		grep_attr_lock();
> +		textconv = userdiff_get_textconv(gs->driver);
> +		grep_attr_unlock();
> +	}

We lock/unlock the grep_attr_lock twice here: once in
grep_source_load_driver, and then immediately again to call
userdiff_get_textconv. I don't know if it is worth doing the two under
the same lock or not (I guess it should not increase lock contention,
since we do the same amount of work, so it is really just the extra lock
instructions).

-Peff

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [RFC/PATCH 4/4] grep: obey --textconv for the case rev:path
  2013-02-06 15:08         ` [RFC/PATCH 4/4] grep: obey --textconv for the case rev:path Michael J Gruber
@ 2013-02-06 22:36           ` Jeff King
  2013-02-07  9:05             ` Michael J Gruber
  0 siblings, 1 reply; 37+ messages in thread
From: Jeff King @ 2013-02-06 22:36 UTC (permalink / raw)
  To: Michael J Gruber; +Cc: git, Junio C Hamano

On Wed, Feb 06, 2013 at 04:08:53PM +0100, Michael J Gruber wrote:

> -			add_object_array(object, arg, &list);
> +			add_object_array_with_context(object, arg, &list, xmemdupz(&oc, sizeof(struct object_context)));

If we go this route, this new _with_context variant should be used in
patch 1, too.

> @@ -265,9 +260,28 @@ void add_object_array_with_mode(struct object *obj, const char *name, struct obj
>  	objects[nr].item = obj;
>  	objects[nr].name = name;
>  	objects[nr].mode = mode;
> +	objects[nr].context = context;
>  	array->nr = ++nr;
>  }

This seems a little gross. Who is responsible for allocating the
context? Who frees it? It looks like we duplicate it in cmd_grep. Which
I think is OK, but it means all of this context infrastructure in
object.[ch] is just bolted-on junk waiting for somebody to use it wrong
or get confused.  It does not get set, for example, by the regular
setup_revisions code path.

It would be nice if we could just always have the context available,
then setup_revisions could set it up by default (and replace the "mode"
parameter entirely). But we'd need to do something to avoid the
PATH_MAX-sized buffer for each entry, as some code paths may have a
large number of pending objects.

-Peff

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [RFC/PATCH 2/4] cat-file: do not die on --textconv without textconv filters
  2013-02-06 22:23             ` Junio C Hamano
@ 2013-02-06 22:43               ` Jeff King
  0 siblings, 0 replies; 37+ messages in thread
From: Jeff King @ 2013-02-06 22:43 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: Michael J Gruber, git

On Wed, Feb 06, 2013 at 02:23:36PM -0800, Junio C Hamano wrote:

> Jeff King <peff@peff.net> writes:
> 
> > Which made me wonder: what happens with:
> >
> >   git cat-file --textconv HEAD
> >
> > It looks like we die just before textconv-ing, because we have no
> > obj_context.path. But that is also unlike all of the other --textconv
> > switches, which mean "turn on textconv if you are showing a blob that
> > supports it" and not "the specific operation is --textconv, apply it to
> > this blob". I don't know if that is worth changing or not.
> 
> OK, so in that sense, "cat-file --textconv HEAD" (or HEAD:) should
> die with "Hey, that is not a blob", in other words, Michael's patch
> does what we want without further tweaks, right?

Right, it will die because we do not find a path in the object_context.
For the same reason that "cat-file --textconv $sha1" would die.

A more interesting case is "cat-file --textconv HEAD:Documentation",
which does have a path, but not a blob. And I think that speaks to your
point that we want to fall-through to the pretty-print case, not the
blob case.

> By the way are we sure textconv_object() barfs and dies if fed a non
> blob?  Otherwise the above does not hold, and the semantics become
> "turn on textconv if the object you are showing supports it,
> otherwise it has to be a blob.", I think.

I'm not sure. The sha1 would get passed all the way down to
fill_textconv. I think because sha1_valid is set, it will not try to
reuse the working tree file, so we will end up in
diff_populate_filespec, and we could actually textconv the tree object
itself.

So yeah, I think we want a check that makes sure we are working with a
blob before we even call that function, and "--textconv" should just be
"you must feed me a blob of the form $treeish:$path". In practice nobody
wants to do anything else anyway, so let's keep the code paths simple;
we can always loosen it later if there is a good reason to do so.

-Peff

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [RFC/PATCH 1/4] show: obey --textconv for blobs
  2013-02-06 22:12             ` Jeff King
@ 2013-02-06 23:49               ` Junio C Hamano
  2013-02-07  0:10                 ` Jeff King
  0 siblings, 1 reply; 37+ messages in thread
From: Junio C Hamano @ 2013-02-06 23:49 UTC (permalink / raw)
  To: Jeff King; +Cc: Michael J Gruber, git

Jeff King <peff@peff.net> writes:

> And stating it that way makes it clear that there may be other missing
> steps (3 and up) to apply other gitattributes. For example, should "git
> show $blob" respect crlf attributes? Smudge filters?

Yeah, I think applying these when path is availble may make sense.

Are we going to teach cat-file to honor them with "--textconv" and
possibly other new options?

Or should the "--textconv" imply application of these other filters?
I am tempted to say yes, but I haven't thought things through...

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [RFC/PATCH 1/4] show: obey --textconv for blobs
  2013-02-06 23:49               ` Junio C Hamano
@ 2013-02-07  0:10                 ` Jeff King
  2013-02-07  0:26                   ` Junio C Hamano
  0 siblings, 1 reply; 37+ messages in thread
From: Jeff King @ 2013-02-07  0:10 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: Michael J Gruber, git

On Wed, Feb 06, 2013 at 03:49:44PM -0800, Junio C Hamano wrote:

> Jeff King <peff@peff.net> writes:
> 
> > And stating it that way makes it clear that there may be other missing
> > steps (3 and up) to apply other gitattributes. For example, should "git
> > show $blob" respect crlf attributes? Smudge filters?
> 
> Yeah, I think applying these when path is availble may make sense.
> 
> Are we going to teach cat-file to honor them with "--textconv" and
> possibly other new options?
> 
> Or should the "--textconv" imply application of these other filters?
> I am tempted to say yes, but I haven't thought things through...

For diff, we should already recognize them (because we feed the diff
machinery the path name, and it does the usual attributes lookup). Of
course it only uses things like funcname, not any of the
convert-to-filesystem options (hmm, actually, I guess it may use
convert-from-filesystem, but in such a case it would not be reading from
a blob anyway, but from a filesystem path, so it is not related to this
code).

For cat-file, I don't think --textconv should necessarily imply it. The
original motivation for "cat-file --textconv" was for external blame to
be able to access the converted contents. But it would not want to do
filesystem-level conversion; it wants the canonical version of the file.
I think a better option name for smudge/crlf would be "--to-filesystem"
or something like that. And probably it should not be the default.

git-show should probably have the same option. I doubt it should be on
by default, but I can see it being useful for:

  git show --to-filesystem HEAD:Makefile >Makefile

or whatever. I don't think those features necessarily need to come as
part of Michael's series. They can wait for people who care to implement
them. But I do think the refactoring of passing along the path from the
object_context should keep in mind that we will probably want to go that
way eventually.

Are there other attributes that we might care about when showing a blob?
The only ones I can think of are the ones for converting to the
filesystem representation.

-Peff

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [RFC/PATCH 1/4] show: obey --textconv for blobs
  2013-02-07  0:10                 ` Jeff King
@ 2013-02-07  0:26                   ` Junio C Hamano
  0 siblings, 0 replies; 37+ messages in thread
From: Junio C Hamano @ 2013-02-07  0:26 UTC (permalink / raw)
  To: Jeff King; +Cc: Michael J Gruber, git

Jeff King <peff@peff.net> writes:

> git-show should probably have the same option. I doubt it should be on
> by default, but I can see it being useful for:
>
>   git show --to-filesystem HEAD:Makefile >Makefile
>
> or whatever. I don't think those features necessarily need to come as
> part of Michael's series.

Yeah, all makes sense (including the part that it might be useful
but it does not belong to this series).

Thanks for sanity checking.

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [RFC/PATCH 1/4] show: obey --textconv for blobs
  2013-02-06 16:53           ` Junio C Hamano
  2013-02-06 22:12             ` Jeff King
@ 2013-02-07  8:48             ` Michael J Gruber
  1 sibling, 0 replies; 37+ messages in thread
From: Michael J Gruber @ 2013-02-07  8:48 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: git, Jeff King

Junio C Hamano venit, vidit, dixit 06.02.2013 17:53:
> Michael J Gruber <git@drmicha.warpmail.net> writes:
> 
>> Currently, "diff" and "cat-file" for blobs obey "--textconv" options
>> (with the former defaulting to "--textconv" and the latter to
>> "--no-textconv") whereas "show" does not obey this option, even though
>> it takes diff options.
>>
>> Make "show" on blobs behave like "diff", i.e. obey "--textconv" by
>> default and "--no-textconv" when given.
> 
> What does "log -p" do currently, and what should it do?  Does/should
> it also use --textconv?

It invokes "--textconv" by default, and allows to override with
"--no-textconv. That's what I meant to say but seem to have failed to.

I think at some point we decided to have "human output" on by default
for all things diff (porcelain only, of course) unless the diff is meant
for machine consumption, such as in the case of format-patch.

> The --textconv is a natural extension of what --ext-diff provides us,
> so I think it should trigger the same way as how --ext-diff triggers.

This series' aim is to make the textconv behavior the same for all
"human output" commands. I don't see textconv and ext-diff to be
strongly related, and if then the other way round:

textconv is about converting blobs to a (possibly lossy) human
consumable text.

ext-diff is about computing diffs with an external diff "tool" (not in
the sense of diff-tool).

One way of diffing is textconving blobs then using internal diff on the
resulting text blobs, but ext-diff is more general and, really,
orthogonal in a way. (It used to be used often for what textconv does
when that didn't exist.)

> We apply "--ext-diff" for "diff" by default but not for "log -p" and
> "show"; I suspect this may have been for a good reason but I do not
> recall the discussion that led to the current behaviour offhand.

I don't think I changed anything ext-diff related, did I? 1/4 is about
showing blobs, not diffs.

>> Signed-off-by: Michael J Gruber <git@drmicha.warpmail.net>
>> ---
>>  builtin/log.c | 24 +++++++++++++++++++++---
>>  1 file changed, 21 insertions(+), 3 deletions(-)
>>
>> diff --git a/builtin/log.c b/builtin/log.c
>> index 8f0b2e8..f83870d 100644
>> --- a/builtin/log.c
>> +++ b/builtin/log.c
>> @@ -402,10 +402,28 @@ static void show_tagger(char *buf, int len, struct rev_info *rev)
>>  	strbuf_release(&out);
>>  }
>>  
>> -static int show_blob_object(const unsigned char *sha1, struct rev_info *rev)
>> +static int show_blob_object(const unsigned char *sha1, struct rev_info *rev, const char *obj_name)
>>  {
>> +	unsigned char sha1c[20];
>> +	struct object_context obj_context;
>> +	char *buf;
>> +	unsigned long size;
>> +
>>  	fflush(stdout);
>> -	return stream_blob_to_fd(1, sha1, NULL, 0);
>> +	if (!DIFF_OPT_TST(&rev->diffopt, ALLOW_TEXTCONV))
>> +		return stream_blob_to_fd(1, sha1, NULL, 0);
>> +
>> +	if (get_sha1_with_context(obj_name, 0, sha1c, &obj_context))
>> +		die("Not a valid object name %s", obj_name);
>> +	if (!obj_context.path[0] ||
>> +	    !textconv_object(obj_context.path, obj_context.mode, sha1c, 1, &buf, &size))
>> +		return stream_blob_to_fd(1, sha1, NULL, 0);
>> +
>> +	if (!buf)
>> +		die("git show %s: bad file", obj_name);
>> +
>> +	write_or_die(1, buf, size);
>> +	return 0;
>>  }
>>  
>>  static int show_tag_object(const unsigned char *sha1, struct rev_info *rev)
>> @@ -491,7 +509,7 @@ int cmd_show(int argc, const char **argv, const char *prefix)
>>  		const char *name = objects[i].name;
>>  		switch (o->type) {
>>  		case OBJ_BLOB:
>> -			ret = show_blob_object(o->sha1, NULL);
>> +			ret = show_blob_object(o->sha1, &rev, name);
>>  			break;
>>  		case OBJ_TAG: {
>>  			struct tag *t = (struct tag *)o;

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [RFC/PATCH 1/4] show: obey --textconv for blobs
  2013-02-06 22:06           ` Jeff King
@ 2013-02-07  9:05             ` Michael J Gruber
  2013-02-07  9:11               ` Jeff King
  0 siblings, 1 reply; 37+ messages in thread
From: Michael J Gruber @ 2013-02-07  9:05 UTC (permalink / raw)
  To: Jeff King; +Cc: git, Junio C Hamano

Jeff King venit, vidit, dixit 06.02.2013 23:06:
> On Wed, Feb 06, 2013 at 04:08:50PM +0100, Michael J Gruber wrote:
> 
>> diff --git a/builtin/log.c b/builtin/log.c
>> index 8f0b2e8..f83870d 100644
>> --- a/builtin/log.c
>> +++ b/builtin/log.c
>> @@ -402,10 +402,28 @@ static void show_tagger(char *buf, int len, struct rev_info *rev)
>>  	strbuf_release(&out);
>>  }
>>  
>> -static int show_blob_object(const unsigned char *sha1, struct rev_info *rev)
>> +static int show_blob_object(const unsigned char *sha1, struct rev_info *rev, const char *obj_name)
> 
> Should this maybe just take the whole object_array_entry as a cleanup?

It's just a question of one or two/three pointers (I can't count), but
yes, that would be possible.

>>  {
>> +	unsigned char sha1c[20];
>> +	struct object_context obj_context;
>> +	char *buf;
>> +	unsigned long size;
>> +
>>  	fflush(stdout);
>> -	return stream_blob_to_fd(1, sha1, NULL, 0);
>> +	if (!DIFF_OPT_TST(&rev->diffopt, ALLOW_TEXTCONV))
>> +		return stream_blob_to_fd(1, sha1, NULL, 0);
>> +
>> +	if (get_sha1_with_context(obj_name, 0, sha1c, &obj_context))
>> +		die("Not a valid object name %s", obj_name);
> 
> It seems a little hacky that we have to look up the sha1 again. What
> should happen in the off chance that "hashcmp(sha1, sha1c) != 0" due to
> a race with a simultaneous update of a ref?

I thought about a check here but didn't bother to because I knew the
refactoring would come up again...

> Would it be better if object_array_entry replaced its "mode" member with
> an object_context?

Do all callers/users want to deal with object_context?

I'm wondering why o_c has a mode at all, since it is mostly used in
conjunction with an object, isn't it?

> The only downside I see is that we might waste a
> significant amount of memory (each context has a PATH_MAX buffer in it).

That's why I used a reference to the struct, see my other reply.

Michael

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [RFC/PATCH 4/4] grep: obey --textconv for the case rev:path
  2013-02-06 22:36           ` Jeff King
@ 2013-02-07  9:05             ` Michael J Gruber
  2013-02-07  9:26               ` Jeff King
  0 siblings, 1 reply; 37+ messages in thread
From: Michael J Gruber @ 2013-02-07  9:05 UTC (permalink / raw)
  To: Jeff King; +Cc: Junio C Hamano, Git Mailing List

Jeff King venit, vidit, dixit 06.02.2013 23:36:
> On Wed, Feb 06, 2013 at 04:08:53PM +0100, Michael J Gruber wrote:
> 
>> -			add_object_array(object, arg, &list);
>> +			add_object_array_with_context(object, arg, &list, xmemdupz(&oc, sizeof(struct object_context)));
> 
> If we go this route, this new _with_context variant should be used in
> patch 1, too.
> 
>> @@ -265,9 +260,28 @@ void add_object_array_with_mode(struct object *obj, const char *name, struct obj
>>  	objects[nr].item = obj;
>>  	objects[nr].name = name;
>>  	objects[nr].mode = mode;
>> +	objects[nr].context = context;
>>  	array->nr = ++nr;
>>  }
> 
> This seems a little gross. Who is responsible for allocating the
> context? Who frees it? It looks like we duplicate it in cmd_grep. Which

Well, who is responsible for allocating and freeing name and item? I
didn't want to introduce a new member which is a struct when all other
complex members are pointers. Wouldn't that be confusing?

> I think is OK, but it means all of this context infrastructure in
> object.[ch] is just bolted-on junk waiting for somebody to use it wrong
> or get confused.  It does not get set, for example, by the regular
> setup_revisions code path.

Sure, it's NULL when there is no context info, just like in many other
cases.

> It would be nice if we could just always have the context available,
> then setup_revisions could set it up by default (and replace the "mode"
> parameter entirely). But we'd need to do something to avoid the
> PATH_MAX-sized buffer for each entry, as some code paths may have a
> large number of pending objects.

If the information is always available even if we don't need it then it
always takes space. The only way out would be pointing into a pool of
path names rather having a copy in each entry. It's not like I hadn't
talked about providing virtual (blob) objects for path names keyed by
their sha1 before... It's just that I want my grep --textconv now ;)

Michael

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [RFC/PATCH 1/4] show: obey --textconv for blobs
  2013-02-07  9:05             ` Michael J Gruber
@ 2013-02-07  9:11               ` Jeff King
  2013-02-07  9:34                 ` Michael J Gruber
  0 siblings, 1 reply; 37+ messages in thread
From: Jeff King @ 2013-02-07  9:11 UTC (permalink / raw)
  To: Michael J Gruber; +Cc: git, Junio C Hamano

On Thu, Feb 07, 2013 at 10:05:26AM +0100, Michael J Gruber wrote:

> > Would it be better if object_array_entry replaced its "mode" member with
> > an object_context?
> 
> Do all callers/users want to deal with object_context?

Wouldn't it just mean replacing "entry->mode" with "entry->oc.mode" at
each user?

> I'm wondering why o_c has a mode at all, since it is mostly used in
> conjunction with an object, isn't it?

Just as we record the path from the surrounding tree, we record the
mode. It's that mode which gets put into the pending object list by the
revision parser (see the very end of handle_revision_arg). Storing an
object_context instead of the mode would be a strict superset of what we
store now (right now we just throw the rest away).

-Peff

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [RFC/PATCH 4/4] grep: obey --textconv for the case rev:path
  2013-02-07  9:05             ` Michael J Gruber
@ 2013-02-07  9:26               ` Jeff King
  2013-02-07  9:47                 ` Michael J Gruber
  0 siblings, 1 reply; 37+ messages in thread
From: Jeff King @ 2013-02-07  9:26 UTC (permalink / raw)
  To: Michael J Gruber; +Cc: Junio C Hamano, Git Mailing List

On Thu, Feb 07, 2013 at 10:05:57AM +0100, Michael J Gruber wrote:

> >> @@ -265,9 +260,28 @@ void add_object_array_with_mode(struct object *obj, const char *name, struct obj
> >>  	objects[nr].item = obj;
> >>  	objects[nr].name = name;
> >>  	objects[nr].mode = mode;
> >> +	objects[nr].context = context;
> >>  	array->nr = ++nr;
> >>  }
> > 
> > This seems a little gross. Who is responsible for allocating the
> > context? Who frees it? It looks like we duplicate it in cmd_grep. Which
> 
> Well, who is responsible for allocating and freeing name and item? I
> didn't want to introduce a new member which is a struct when all other
> complex members are pointers. Wouldn't that be confusing?

We cheat on those two. "item" is always a pointer to a "struct object",
which lasts forever and never gets freed. When "name" is set by
setup_revisions, it comes from the argv list, which is assumed to last
forever (and when we add pending blobs for a "--objects" traversal, it
is the empty string (literal).

I'd be OK if we had an exterior object_context that could be handled
in the same way. But how do we tell setup_revisions that we are
interested in seeing the object_context from each parsed item, where
does the allocation come from (is it malloc'd by setup_revisions?), and
who is responsible for freeing it when we pop pending objects in
get_revisions and similar?

I don't think it's as clear cut.

I wonder, though...what we really care about here is just the pathname.
But if it is a pending object that comes from a blob revision argument,
won't it always be of the form "treeish:path"? Could we not even resolve
the sha1 again, but instead just parse out the ":path" bit?

That is sort of like what the repeated call to get_sha1_with_context
does in your first patch. Except that we do not actually want to lookup
the sha1, and it is harmful to do so (e.g., if the ref had moved on to a
new tree that does not have that path, get_sha1 would fail, but we do
not even care what is in the tree; we only want the parsing side effects
of get_sha1).

Hmm.

-Peff

PS By the way, while looking at the object_array code (which I have not
   really used much before), I noticed that add_pending_commit_list sets
   the "name" field to the result of sha1_to_hex. Which means that it is
   likely to be completely bogus by the time you read it. I'm not even
   sure where it gets read or if this matters. And obviously it's
   completely unrelated to what we were discussing; just something I
   noticed.

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [RFC/PATCH 1/4] show: obey --textconv for blobs
  2013-02-07  9:11               ` Jeff King
@ 2013-02-07  9:34                 ` Michael J Gruber
  2013-02-07  9:43                   ` Jeff King
  0 siblings, 1 reply; 37+ messages in thread
From: Michael J Gruber @ 2013-02-07  9:34 UTC (permalink / raw)
  To: Jeff King; +Cc: git, Junio C Hamano

Jeff King venit, vidit, dixit 07.02.2013 10:11:
> On Thu, Feb 07, 2013 at 10:05:26AM +0100, Michael J Gruber wrote:
> 
>>> Would it be better if object_array_entry replaced its "mode" member with
>>> an object_context?
>>
>> Do all callers/users want to deal with object_context?
> 
> Wouldn't it just mean replacing "entry->mode" with "entry->oc.mode" at
> each user?

Yes, I meant at the time of creation, i.e. when someone has to create
and pass an o_a_e and maybe only knows a mode, and thus would have to
set the path to NULL or "".

>> I'm wondering why o_c has a mode at all, since it is mostly used in
>> conjunction with an object, isn't it?
> 
> Just as we record the path from the surrounding tree, we record the
> mode. It's that mode which gets put into the pending object list by the
> revision parser (see the very end of handle_revision_arg). Storing an
> object_context instead of the mode would be a strict superset of what we
> store now (right now we just throw the rest away).

Sure. But why does object_context have a mode member at all? Maybe it is
not alway used together with another struct which has the mode already,
then that's a reason.

Michael

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [RFC/PATCH 1/4] show: obey --textconv for blobs
  2013-02-07  9:34                 ` Michael J Gruber
@ 2013-02-07  9:43                   ` Jeff King
  0 siblings, 0 replies; 37+ messages in thread
From: Jeff King @ 2013-02-07  9:43 UTC (permalink / raw)
  To: Michael J Gruber; +Cc: git, Junio C Hamano

On Thu, Feb 07, 2013 at 10:34:26AM +0100, Michael J Gruber wrote:

> > Just as we record the path from the surrounding tree, we record the
> > mode. It's that mode which gets put into the pending object list by the
> > revision parser (see the very end of handle_revision_arg). Storing an
> > object_context instead of the mode would be a strict superset of what we
> > store now (right now we just throw the rest away).
> 
> Sure. But why does object_context have a mode member at all? Maybe it is
> not alway used together with another struct which has the mode already,
> then that's a reason.

Exactly. It's purely for pulling information out of
get_sha1_with_context, and does not know that you are going to put its
output into an object_array_entry (and many call sites do not).

-Peff

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [RFC/PATCH 4/4] grep: obey --textconv for the case rev:path
  2013-02-07  9:26               ` Jeff King
@ 2013-02-07  9:47                 ` Michael J Gruber
  2013-02-07  9:55                   ` Jeff King
  0 siblings, 1 reply; 37+ messages in thread
From: Michael J Gruber @ 2013-02-07  9:47 UTC (permalink / raw)
  To: Jeff King; +Cc: Junio C Hamano, Git Mailing List

Jeff King venit, vidit, dixit 07.02.2013 10:26:
> On Thu, Feb 07, 2013 at 10:05:57AM +0100, Michael J Gruber wrote:
> 
>>>> @@ -265,9 +260,28 @@ void add_object_array_with_mode(struct object *obj, const char *name, struct obj
>>>>  	objects[nr].item = obj;
>>>>  	objects[nr].name = name;
>>>>  	objects[nr].mode = mode;
>>>> +	objects[nr].context = context;
>>>>  	array->nr = ++nr;
>>>>  }
>>>
>>> This seems a little gross. Who is responsible for allocating the
>>> context? Who frees it? It looks like we duplicate it in cmd_grep. Which
>>
>> Well, who is responsible for allocating and freeing name and item? I
>> didn't want to introduce a new member which is a struct when all other
>> complex members are pointers. Wouldn't that be confusing?
> 
> We cheat on those two. "item" is always a pointer to a "struct object",
> which lasts forever and never gets freed. When "name" is set by
> setup_revisions, it comes from the argv list, which is assumed to last
> forever (and when we add pending blobs for a "--objects" traversal, it
> is the empty string (literal).

I see, so they are really different.

> I'd be OK if we had an exterior object_context that could be handled
> in the same way. But how do we tell setup_revisions that we are
> interested in seeing the object_context from each parsed item, where
> does the allocation come from (is it malloc'd by setup_revisions?), and
> who is responsible for freeing it when we pop pending objects in
> get_revisions and similar?

Do we really need all of tree, path and mode in object_context (I mean
not just here, but other users), or only the path? I'd try and resurrect
the virtual path name objects then, they would be just like "item"
storage-wise.

> I don't think it's as clear cut.
> 
> I wonder, though...what we really care about here is just the pathname.
> But if it is a pending object that comes from a blob revision argument,
> won't it always be of the form "treeish:path"? Could we not even resolve
> the sha1 again, but instead just parse out the ":path" bit?

Do we have that, and in what form (e.g. magic expanded etc.)?

> That is sort of like what the repeated call to get_sha1_with_context
> does in your first patch. Except that we do not actually want to lookup
> the sha1, and it is harmful to do so (e.g., if the ref had moved on to a
> new tree that does not have that path, get_sha1 would fail, but we do
> not even care what is in the tree; we only want the parsing side effects
> of get_sha1).
> 
> Hmm.
> 
> -Peff
> 
> PS By the way, while looking at the object_array code (which I have not
>    really used much before), I noticed that add_pending_commit_list sets
>    the "name" field to the result of sha1_to_hex. Which means that it is
>    likely to be completely bogus by the time you read it. I'm not even
>    sure where it gets read or if this matters. And obviously it's
>    completely unrelated to what we were discussing; just something I
>    noticed.

Another thing I noted is that our path mangling at least for grep has
some issues:

(cd t && git grep GET_SHA1_QUIETLY HEAD:../cache.h)
../HEAD:../cache.h:#define GET_SHA1_QUIETLY        01

Taking everything right of ":" could still work.

Michael

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [RFC/PATCH 4/4] grep: obey --textconv for the case rev:path
  2013-02-07  9:47                 ` Michael J Gruber
@ 2013-02-07  9:55                   ` Jeff King
  2013-02-07 10:31                     ` Michael J Gruber
  0 siblings, 1 reply; 37+ messages in thread
From: Jeff King @ 2013-02-07  9:55 UTC (permalink / raw)
  To: Michael J Gruber; +Cc: Junio C Hamano, Git Mailing List

On Thu, Feb 07, 2013 at 10:47:55AM +0100, Michael J Gruber wrote:

> > I'd be OK if we had an exterior object_context that could be handled
> > in the same way. But how do we tell setup_revisions that we are
> > interested in seeing the object_context from each parsed item, where
> > does the allocation come from (is it malloc'd by setup_revisions?), and
> > who is responsible for freeing it when we pop pending objects in
> > get_revisions and similar?
> 
> Do we really need all of tree, path and mode in object_context (I mean
> not just here, but other users), or only the path? I'd try and resurrect
> the virtual path name objects then, they would be just like "item"
> storage-wise.

We need at least mode, since that is how the mode parameter of
object_array_entry gets set. I do not know off-hand who uses "tree". I
suspect the intent was to do .gitattributes lookups inside that tree,
but I do not think we actually do in-tree lookups currently.

> > I don't think it's as clear cut.
> > 
> > I wonder, though...what we really care about here is just the pathname.
> > But if it is a pending object that comes from a blob revision argument,
> > won't it always be of the form "treeish:path"? Could we not even resolve
> > the sha1 again, but instead just parse out the ":path" bit?
> 
> Do we have that, and in what form (e.g. magic expanded etc.)?

Ah, I should have mentioned that. :) We should have the original rev
name in the object_array_entry's name field, shouldn't we? It's just a
matter of re-parsing it.

> Another thing I noted is that our path mangling at least for grep has
> some issues:
> 
> (cd t && git grep GET_SHA1_QUIETLY HEAD:../cache.h)
> ../HEAD:../cache.h:#define GET_SHA1_QUIETLY        01

Yuck.

-Peff

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [RFC/PATCH 4/4] grep: obey --textconv for the case rev:path
  2013-02-07  9:55                   ` Jeff King
@ 2013-02-07 10:31                     ` Michael J Gruber
  2013-02-07 18:03                       ` Junio C Hamano
  0 siblings, 1 reply; 37+ messages in thread
From: Michael J Gruber @ 2013-02-07 10:31 UTC (permalink / raw)
  To: Jeff King; +Cc: Junio C Hamano, Git Mailing List

Jeff King venit, vidit, dixit 07.02.2013 10:55:
> On Thu, Feb 07, 2013 at 10:47:55AM +0100, Michael J Gruber wrote:
> 
>>> I'd be OK if we had an exterior object_context that could be handled
>>> in the same way. But how do we tell setup_revisions that we are
>>> interested in seeing the object_context from each parsed item, where
>>> does the allocation come from (is it malloc'd by setup_revisions?), and
>>> who is responsible for freeing it when we pop pending objects in
>>> get_revisions and similar?
>>
>> Do we really need all of tree, path and mode in object_context (I mean
>> not just here, but other users), or only the path? I'd try and resurrect
>> the virtual path name objects then, they would be just like "item"
>> storage-wise.
> 
> We need at least mode, since that is how the mode parameter of
> object_array_entry gets set. I do not know off-hand who uses "tree". I
> suspect the intent was to do .gitattributes lookups inside that tree,
> but I do not think we actually do in-tree lookups currently.
> 
>>> I don't think it's as clear cut.
>>>
>>> I wonder, though...what we really care about here is just the pathname.
>>> But if it is a pending object that comes from a blob revision argument,
>>> won't it always be of the form "treeish:path"? Could we not even resolve
>>> the sha1 again, but instead just parse out the ":path" bit?
>>
>> Do we have that, and in what form (e.g. magic expanded etc.)?
> 
> Ah, I should have mentioned that. :) We should have the original rev
> name in the object_array_entry's name field, shouldn't we? It's just a
> matter of re-parsing it.
> 
>> Another thing I noted is that our path mangling at least for grep has
>> some issues:
>>
>> (cd t && git grep GET_SHA1_QUIETLY HEAD:../cache.h)
>> ../HEAD:../cache.h:#define GET_SHA1_QUIETLY        01
> 
> Yuck.

And even more yuck:

(cd t && git grep --full-name GET_SHA1_QUIETLY HEAD:../cache.h)
HEAD:../cache.h:#define GET_SHA1_QUIETLY        01

Someone does not expect a "rev:" to be in there, it seems ;)

Michael

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [RFC/PATCH 4/4] grep: obey --textconv for the case rev:path
  2013-02-07 10:31                     ` Michael J Gruber
@ 2013-02-07 18:03                       ` Junio C Hamano
  2013-02-08 11:27                         ` Michael J Gruber
  0 siblings, 1 reply; 37+ messages in thread
From: Junio C Hamano @ 2013-02-07 18:03 UTC (permalink / raw)
  To: Michael J Gruber; +Cc: Jeff King, Git Mailing List

Michael J Gruber <git@drmicha.warpmail.net> writes:

>>> (cd t && git grep GET_SHA1_QUIETLY HEAD:../cache.h)
>>> ../HEAD:../cache.h:#define GET_SHA1_QUIETLY        01
>> 
>> Yuck.
>
> And even more yuck:
>
> (cd t && git grep --full-name GET_SHA1_QUIETLY HEAD:../cache.h)
> HEAD:../cache.h:#define GET_SHA1_QUIETLY        01
>
> Someone does not expect a "rev:" to be in there, it seems ;)

I think stepping outside of $(cwd) is an afterthought the code does
not anticipate.

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [RFC/PATCH 4/4] grep: obey --textconv for the case rev:path
  2013-02-07 18:03                       ` Junio C Hamano
@ 2013-02-08 11:27                         ` Michael J Gruber
  0 siblings, 0 replies; 37+ messages in thread
From: Michael J Gruber @ 2013-02-08 11:27 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: Jeff King, Git Mailing List

Junio C Hamano venit, vidit, dixit 07.02.2013 19:03:
> Michael J Gruber <git@drmicha.warpmail.net> writes:
> 
>>>> (cd t && git grep GET_SHA1_QUIETLY HEAD:../cache.h)
>>>> ../HEAD:../cache.h:#define GET_SHA1_QUIETLY        01
>>>
>>> Yuck.
>>
>> And even more yuck:
>>
>> (cd t && git grep --full-name GET_SHA1_QUIETLY HEAD:../cache.h)
>> HEAD:../cache.h:#define GET_SHA1_QUIETLY        01
>>
>> Someone does not expect a "rev:" to be in there, it seems ;)
> 
> I think stepping outside of $(cwd) is an afterthought the code does
> not anticipate.
> 

Well, we do resolve relative paths correctly, and there are even some
"chdir" in the code path. It's just that the output label is incorrect.

Michael

^ permalink raw reply	[flat|nested] 37+ messages in thread

end of thread, other threads:[~2013-02-08 11:28 UTC | newest]

Thread overview: 37+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2013-02-04 15:27 [WIP/RFH/RFD/PATCH] grep: allow to use textconv filters Michael J Gruber
2013-02-04 17:12 ` Junio C Hamano
2013-02-05  8:48   ` Michael J Gruber
2013-02-05 11:13 ` Jeff King
2013-02-05 16:21   ` Michael J Gruber
2013-02-05 20:11     ` Jeff King
2013-02-06 15:08       ` [RFC/PATCH 0/4] textconv for show and grep Michael J Gruber
2013-02-06 15:08         ` [RFC/PATCH 1/4] show: obey --textconv for blobs Michael J Gruber
2013-02-06 16:53           ` Junio C Hamano
2013-02-06 22:12             ` Jeff King
2013-02-06 23:49               ` Junio C Hamano
2013-02-07  0:10                 ` Jeff King
2013-02-07  0:26                   ` Junio C Hamano
2013-02-07  8:48             ` Michael J Gruber
2013-02-06 22:06           ` Jeff King
2013-02-07  9:05             ` Michael J Gruber
2013-02-07  9:11               ` Jeff King
2013-02-07  9:34                 ` Michael J Gruber
2013-02-07  9:43                   ` Jeff King
2013-02-06 15:08         ` [RFC/PATCH 2/4] cat-file: do not die on --textconv without textconv filters Michael J Gruber
2013-02-06 16:47           ` Junio C Hamano
2013-02-06 22:19           ` Jeff King
2013-02-06 22:23             ` Junio C Hamano
2013-02-06 22:43               ` Jeff King
2013-02-06 15:08         ` [RFC/PATCH 3/4] grep: allow to use " Michael J Gruber
2013-02-06 15:12           ` Matthieu Moy
2013-02-06 22:23           ` Jeff King
2013-02-06 15:08         ` [RFC/PATCH 4/4] grep: obey --textconv for the case rev:path Michael J Gruber
2013-02-06 22:36           ` Jeff King
2013-02-07  9:05             ` Michael J Gruber
2013-02-07  9:26               ` Jeff King
2013-02-07  9:47                 ` Michael J Gruber
2013-02-07  9:55                   ` Jeff King
2013-02-07 10:31                     ` Michael J Gruber
2013-02-07 18:03                       ` Junio C Hamano
2013-02-08 11:27                         ` Michael J Gruber
2013-02-06 16:55         ` [RFC/PATCH 0/4] textconv for show and grep Junio C Hamano

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.