All of lore.kernel.org
 help / color / mirror / Atom feed
* [RFC PATCH] make diff --color-words customizable
@ 2009-01-09  0:05 Thomas Rast
  2009-01-09  0:25 ` Johannes Schindelin
  2009-01-09  9:53 ` [RFC PATCH] make diff --color-words customizable Jeff King
  0 siblings, 2 replies; 34+ messages in thread
From: Thomas Rast @ 2009-01-09  0:05 UTC (permalink / raw)
  To: git; +Cc: Johannes Schindelin

Allows for user-configurable word splits when using --color-words.
This can make the diff more readable if the regex is configured
according to the language of the file.

For now the (POSIX extended) regex must be set via the environment
GIT_DIFF_WORDS_REGEX.  Each (non-overlapping) match of the regex is
considered a word.  Anything characters not matched are considered
whitespace.  For example, for C try

  GIT_DIFF_WORDS_REGEX='[0-9]+|[a-zA-Z_][a-zA-Z0-9_]*|(\+|-|&|\|){1,2}|\S'

and for TeX try

  GIT_DIFF_WORDS_REGEX='\\[a-zA-Z@]+ *|\{|\}|\\.|[^\{} [:space:]]+'

Signed-off-by: Thomas Rast <trast@student.ethz.ch>

---

Word diff becomes much more useful especially with TeX, where it is
common to run together \sequences\of\commands\like\this that the
current --color-words treats as a single word.

Apart from possible bugs, the main issue is: where should I put the
configuration for this?


 diff.c |  142 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------
 1 files changed, 127 insertions(+), 15 deletions(-)

diff --git a/diff.c b/diff.c
index d235482..c1e24de 100644
--- a/diff.c
+++ b/diff.c
@@ -321,6 +321,7 @@ struct diff_words_buffer {
 	long alloc;
 	long current; /* output pointer */
 	int suppressed_newline;
+	enum diff_word_boundaries *boundaries;
 };
 
 static void diff_words_append(char *line, unsigned long len,
@@ -336,21 +337,35 @@ static void diff_words_append(char *line, unsigned long len,
 	buffer->text.size += len;
 }
 
+enum diff_word_boundaries {
+	DIFF_WORD_CONT,
+	DIFF_WORD_START,
+	DIFF_WORD_SPACE
+};
+
+
 struct diff_words_data {
 	struct diff_words_buffer minus, plus;
 	FILE *file;
+	enum diff_word_boundaries *minus_boundaries, *plus_boundaries;
 };
 
-static void print_word(FILE *file, struct diff_words_buffer *buffer, int len, int color,
+static int print_word(FILE *file, struct diff_words_buffer *buffer, int len, int color,
 		int suppress_newline)
 {
 	const char *ptr;
 	int eol = 0;
 
 	if (len == 0)
-		return;
+		return len;
 
 	ptr  = buffer->text.ptr + buffer->current;
+
+	if (buffer->boundaries[buffer->current+len-1] == DIFF_WORD_START) {
+		buffer->boundaries[buffer->current+len-1] = DIFF_WORD_CONT;
+		len--;
+	}
+
 	buffer->current += len;
 
 	if (ptr[len - 1] == '\n') {
@@ -368,6 +383,8 @@ static void print_word(FILE *file, struct diff_words_buffer *buffer, int len, in
 		else
 			putc('\n', file);
 	}
+
+	return len;
 }
 
 static void fn_out_diff_words_aux(void *priv, char *line, unsigned long len)
@@ -391,13 +408,79 @@ static void fn_out_diff_words_aux(void *priv, char *line, unsigned long len)
 				   &diff_words->plus, len, DIFF_FILE_NEW, 0);
 			break;
 		case ' ':
-			print_word(diff_words->file,
-				   &diff_words->plus, len, DIFF_PLAIN, 0);
+			len = print_word(diff_words->file,
+					 &diff_words->plus, len, DIFF_PLAIN, 0);
 			diff_words->minus.current += len;
 			break;
 	}
 }
 
+static char *worddiff_default = "\\S+";
+static regex_t worddiff_regex;
+static int worddiff_regex_compiled = 0;
+
+static int scan_word_boundaries(struct diff_words_buffer *buf)
+{
+	enum diff_word_boundaries *boundaries = buf->boundaries;
+	char *text = buf->text.ptr;
+	int len = buf->text.size;
+
+	int i = 0;
+	int count = 0;
+	int ret;
+	regmatch_t matches[1];
+	int offset, wordlen;
+	char *strz;
+
+	if (!text)
+		return 0;
+
+	if (!worddiff_regex_compiled) {
+		char *wd_pat = getenv("GIT_DIFF_WORDS_REGEX");
+		if (!wd_pat)
+			wd_pat = worddiff_default;
+		ret = regcomp(&worddiff_regex, wd_pat, REG_EXTENDED);
+		if (ret) {
+			char errbuf[1024];
+			regerror(ret, &worddiff_regex, errbuf, 1024);
+			die("word diff regex failed to compile: '%s': %s",
+			    wd_pat, errbuf);
+		}
+		worddiff_regex_compiled = 1;
+	}
+
+	strz = xmalloc(len+1);
+	memcpy(strz, text, len);
+	strz[len] = '\0';
+
+	while (i < len) {
+		ret = regexec(&worddiff_regex, strz+i, 1, matches, 0);
+		if (ret == REG_NOMATCH) {
+			/* the rest is whitespace */
+			while (i < len)
+				boundaries[i++] = DIFF_WORD_SPACE;
+			break;
+		}
+
+		offset = matches[0].rm_so;
+		while (offset-- > 0 && i < len)
+			boundaries[i++] = DIFF_WORD_SPACE;
+
+		wordlen = matches[0].rm_eo - matches[0].rm_so;
+		if (wordlen-- > 0 && i < len) {
+			boundaries[i++] = DIFF_WORD_START;
+			count++;
+		}
+		while (wordlen-- > 0 && i < len)
+			boundaries[i++] = DIFF_WORD_CONT;
+	}
+
+	free(strz);
+
+	return count;
+}
+
+
 /* this executes the word diff on the accumulated buffers */
 static void diff_words_show(struct diff_words_data *diff_words)
 {
@@ -406,23 +489,50 @@ static void diff_words_show(struct diff_words_data *diff_words)
 	xdemitcb_t ecb;
 	mmfile_t minus, plus;
 	int i;
+	char *p;
+	int bcount;
 
 	memset(&xpp, 0, sizeof(xpp));
 	memset(&xecfg, 0, sizeof(xecfg));
-	minus.size = diff_words->minus.text.size;
-	minus.ptr = xmalloc(minus.size);
-	memcpy(minus.ptr, diff_words->minus.text.ptr, minus.size);
-	for (i = 0; i < minus.size; i++)
-		if (isspace(minus.ptr[i]))
-			minus.ptr[i] = '\n';
+
+	diff_words->minus.boundaries = xmalloc(diff_words->minus.text.size * sizeof(enum diff_word_boundaries));
+	bcount = scan_word_boundaries(&diff_words->minus);
+	minus.size = diff_words->minus.text.size + bcount;
+	minus.ptr = xmalloc(minus.size + bcount);
+	p = minus.ptr;
+	for (i = 0; i < diff_words->minus.text.size; i++) {
+		switch (diff_words->minus.boundaries[i]) {
+		case DIFF_WORD_START:
+			*p++ = '\n';
+			/* fall through */
+		case DIFF_WORD_CONT:
+			*p++ = diff_words->minus.text.ptr[i];
+			break;
+		case DIFF_WORD_SPACE:
+			*p++ = '\n';
+			break;
+		}
+	}
 	diff_words->minus.current = 0;
 
-	plus.size = diff_words->plus.text.size;
+	diff_words->plus.boundaries = xmalloc(diff_words->plus.text.size * sizeof(enum diff_word_boundaries));
+	bcount = scan_word_boundaries(&diff_words->plus);
+	plus.size = diff_words->plus.text.size + bcount;
 	plus.ptr = xmalloc(plus.size);
-	memcpy(plus.ptr, diff_words->plus.text.ptr, plus.size);
-	for (i = 0; i < plus.size; i++)
-		if (isspace(plus.ptr[i]))
-			plus.ptr[i] = '\n';
+	p = plus.ptr;
+	for (i = 0; i < diff_words->plus.text.size; i++) {
+		switch (diff_words->plus.boundaries[i]) {
+		case DIFF_WORD_START:
+			*p++ = '\n';
+			/* fall through */
+		case DIFF_WORD_CONT:
+			*p++ = diff_words->plus.text.ptr[i];
+			break;
+		case DIFF_WORD_SPACE:
+			*p++ = '\n';
+			break;
+		}
+	}
 	diff_words->plus.current = 0;
 
 	xpp.flags = XDF_NEED_MINIMAL;
@@ -432,6 +542,8 @@ static void diff_words_show(struct diff_words_data *diff_words)
 	free(minus.ptr);
 	free(plus.ptr);
 	diff_words->minus.text.size = diff_words->plus.text.size = 0;
+	free(diff_words->minus.boundaries);
+	free(diff_words->plus.boundaries);
 
 	if (diff_words->minus.suppressed_newline) {
 		putc('\n', diff_words->file);
-- 
tg: (c123b7c..) t/word-diff-regex (depends on: origin/master)

^ permalink raw reply related	[flat|nested] 34+ messages in thread

* Re: [RFC PATCH] make diff --color-words customizable
  2009-01-09  0:05 [RFC PATCH] make diff --color-words customizable Thomas Rast
@ 2009-01-09  0:25 ` Johannes Schindelin
  2009-01-09  0:50   ` Thomas Rast
  2009-01-09  9:53 ` [RFC PATCH] make diff --color-words customizable Jeff King
  1 sibling, 1 reply; 34+ messages in thread
From: Johannes Schindelin @ 2009-01-09  0:25 UTC (permalink / raw)
  To: Thomas Rast; +Cc: git

Hi,

On Fri, 9 Jan 2009, Thomas Rast wrote:

> Allows for user-configurable word splits when using --color-words. This 
> can make the diff more readable if the regex is configured according to 
> the language of the file.
> 
> For now the (POSIX extended) regex must be set via the environment
> GIT_DIFF_WORDS_REGEX.  Each (non-overlapping) match of the regex is
> considered a word.  Anything characters not matched are considered
> whitespace.  For example, for C try
> 
>   GIT_DIFF_WORDS_REGEX='[0-9]+|[a-zA-Z_][a-zA-Z0-9_]*|(\+|-|&|\|){1,2}|\S'
> 
> and for TeX try
> 
>   GIT_DIFF_WORDS_REGEX='\\[a-zA-Z@]+ *|\{|\}|\\.|[^\{} [:space:]]+'

Interesting idea.  However, I think it would be better to do the opposite, 
have _word_ patterns.  And even better to have _one_ pattern.

Then we could have a --color-words-regex=<regex> option.

BTW I think you could do what you intended to do with a _way_ smaller 
and more intuitive patch.

Ciao,
Dscho

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [RFC PATCH] make diff --color-words customizable
  2009-01-09  0:25 ` Johannes Schindelin
@ 2009-01-09  0:50   ` Thomas Rast
  2009-01-09 11:15     ` Johannes Schindelin
  0 siblings, 1 reply; 34+ messages in thread
From: Thomas Rast @ 2009-01-09  0:50 UTC (permalink / raw)
  To: Johannes Schindelin; +Cc: git

[-- Attachment #1: Type: text/plain, Size: 1604 bytes --]

Johannes Schindelin wrote:
> On Fri, 9 Jan 2009, Thomas Rast wrote:
> 
> > Allows for user-configurable word splits when using --color-words. This 
> > can make the diff more readable if the regex is configured according to 
> > the language of the file.
> > 
> > For now the (POSIX extended) regex must be set via the environment
> > GIT_DIFF_WORDS_REGEX.  Each (non-overlapping) match of the regex is
> > considered a word.  Anything characters not matched are considered
> > whitespace.  For example, for C try
> > 
> >   GIT_DIFF_WORDS_REGEX='[0-9]+|[a-zA-Z_][a-zA-Z0-9_]*|(\+|-|&|\|){1,2}|\S'
[...]
> Interesting idea.  However, I think it would be better to do the opposite, 
> have _word_ patterns.  And even better to have _one_ pattern.

I'm not sure I understand.  It _is_ a single pattern.  The examples
just have several cases to distinguish various semantic groups that
can occur, as a sort of "half tokenizer".  (The C example isn't very
complete however.)

> BTW I think you could do what you intended to do with a _way_ smaller 
> and more intuitive patch.

How?

I don't think the existing mechanism, which just replaces all
whitespace with newlines and does a line diff to find out which words
changed, can "just" be adapted.  We will have to insert extra newlines
at points where the regex said to split a word, but where there was no
whitespace in the original content.  If there's a significantly easier
way to do that than I hacked up, please share.

Or maybe I got your original code all wrong?

-- 
Thomas Rast
trast@{inf,student}.ethz.ch


[-- Attachment #2: This is a digitally signed message part. --]
[-- Type: application/pgp-signature, Size: 197 bytes --]

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [RFC PATCH] make diff --color-words customizable
  2009-01-09  0:05 [RFC PATCH] make diff --color-words customizable Thomas Rast
  2009-01-09  0:25 ` Johannes Schindelin
@ 2009-01-09  9:53 ` Jeff King
  2009-01-09 11:18   ` Johannes Schindelin
  1 sibling, 1 reply; 34+ messages in thread
From: Jeff King @ 2009-01-09  9:53 UTC (permalink / raw)
  To: Thomas Rast; +Cc: git, Johannes Schindelin

On Fri, Jan 09, 2009 at 01:05:05AM +0100, Thomas Rast wrote:

> Word diff becomes much more useful especially with TeX, where it is
> common to run together \sequences\of\commands\like\this that the
> current --color-words treats as a single word.

I have run into this, as well, and it would be nice to have configurable
word boundaries.

> Apart from possible bugs, the main issue is: where should I put the
> configuration for this?

It's a per-file thing, so probably in the diff driver that is triggered
via attributes. See userdiff.[ch]; you'll need to add an entry to the
userdiff_driver struct. You can look at the funcname pattern stuff as a
template, as this is very similar.

-Peff

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [RFC PATCH] make diff --color-words customizable
  2009-01-09  0:50   ` Thomas Rast
@ 2009-01-09 11:15     ` Johannes Schindelin
  2009-01-09 11:59       ` [ILLUSTRATION PATCH] color-words: take an optional regular expression describing words Johannes Schindelin
  0 siblings, 1 reply; 34+ messages in thread
From: Johannes Schindelin @ 2009-01-09 11:15 UTC (permalink / raw)
  To: Thomas Rast; +Cc: git

Hi,

On Fri, 9 Jan 2009, Thomas Rast wrote:

> Johannes Schindelin wrote:
> > On Fri, 9 Jan 2009, Thomas Rast wrote:
> > 
> > > Allows for user-configurable word splits when using --color-words. This 
> > > can make the diff more readable if the regex is configured according to 
> > > the language of the file.
> > > 
> > > For now the (POSIX extended) regex must be set via the environment
> > > GIT_DIFF_WORDS_REGEX.  Each (non-overlapping) match of the regex is
> > > considered a word.  Anything characters not matched are considered
> > > whitespace.  For example, for C try
> > > 
> > >   GIT_DIFF_WORDS_REGEX='[0-9]+|[a-zA-Z_][a-zA-Z0-9_]*|(\+|-|&|\|){1,2}|\S'
> [...]
> > Interesting idea.  However, I think it would be better to do the opposite, 
> > have _word_ patterns.  And even better to have _one_ pattern.
> 
> I'm not sure I understand.  It _is_ a single pattern.  The examples
> just have several cases to distinguish various semantic groups that
> can occur, as a sort of "half tokenizer".  (The C example isn't very
> complete however.)

Oh, I was fooled by your use of an array of enums whose purpose I did not 
understand at all.

> > BTW I think you could do what you intended to do with a _way_ smaller 
> > and more intuitive patch.
> 
> How?

Intuitively, all you would have to do is to replace this part in 
diff_words_show()

        for (i = 0; i < minus.size; i++)
                if (isspace(minus.ptr[i]))
                        minus.ptr[i] = '\n';

by a loop finding the next word boundary.  I would suggest making that a 
function, say,

	int find_word_boundary(struct diff_words_data *data, char *minus);

This function would also be responsible to initialize the regexp.

However, as I said, I think it would be much more intuitive to 
characterize the _words_ instead of the _word boundaries_.

And I would like to keep the default as-is (together _with_ the 
performance.  IOW if the user did not specify a regexp, it should fall 
back to what it does now, which is slow enough).

Ciao,
Dscho

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [RFC PATCH] make diff --color-words customizable
  2009-01-09  9:53 ` [RFC PATCH] make diff --color-words customizable Jeff King
@ 2009-01-09 11:18   ` Johannes Schindelin
  2009-01-09 11:22     ` Jeff King
  0 siblings, 1 reply; 34+ messages in thread
From: Johannes Schindelin @ 2009-01-09 11:18 UTC (permalink / raw)
  To: Jeff King; +Cc: Thomas Rast, git

Hi,

On Fri, 9 Jan 2009, Jeff King wrote:

> On Fri, Jan 09, 2009 at 01:05:05AM +0100, Thomas Rast wrote:
> 
> > Apart from possible bugs, the main issue is: where should I put the 
> > configuration for this?
> 
> It's a per-file thing, so probably in the diff driver that is triggered 
> via attributes. See userdiff.[ch]; you'll need to add an entry to the 
> userdiff_driver struct. You can look at the funcname pattern stuff as a 
> template, as this is very similar.

I am not sure I would want that in the config or the attributes.  For me, 
it always has been a question of "oh, that LaTeX diff looks ugly, let's 
see what words actually changed".

Only rarely did I wish for a different word boundary detection algorithm.

So I'd rather have an alias than a config/attribute setting.

Ciao,
Dscho

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [RFC PATCH] make diff --color-words customizable
  2009-01-09 11:18   ` Johannes Schindelin
@ 2009-01-09 11:22     ` Jeff King
  0 siblings, 0 replies; 34+ messages in thread
From: Jeff King @ 2009-01-09 11:22 UTC (permalink / raw)
  To: Johannes Schindelin; +Cc: Thomas Rast, git

On Fri, Jan 09, 2009 at 12:18:37PM +0100, Johannes Schindelin wrote:

> > It's a per-file thing, so probably in the diff driver that is triggered 
> > via attributes. See userdiff.[ch]; you'll need to add an entry to the 
> > userdiff_driver struct. You can look at the funcname pattern stuff as a 
> > template, as this is very similar.
> 
> I am not sure I would want that in the config or the attributes.  For me, 
> it always has been a question of "oh, that LaTeX diff looks ugly, let's 
> see what words actually changed".
> 
> Only rarely did I wish for a different word boundary detection algorithm.
> 
> So I'd rather have an alias than a config/attribute setting.

I am not sure what you are saying.

If it is "I do not want color-words on by default for LaTeX", then I
agree. I meant merely that _if_ color-words is enabled, the word
boundaries would be taken from the diff driver config (just like we do
for matching the funcname header).

If it is "I want to specify the color-words boundary on a per-run basis
rather than a per-file basis", then I want the opposite. However, there
is no reason that both cannot be supported (with command line or
environment taking precedence over what's in the config).

-Peff

^ permalink raw reply	[flat|nested] 34+ messages in thread

* [ILLUSTRATION PATCH] color-words: take an optional regular expression describing words
  2009-01-09 11:15     ` Johannes Schindelin
@ 2009-01-09 11:59       ` Johannes Schindelin
  2009-01-09 12:24         ` Thomas Rast
  0 siblings, 1 reply; 34+ messages in thread
From: Johannes Schindelin @ 2009-01-09 11:59 UTC (permalink / raw)
  To: Thomas Rast; +Cc: git


In some applications, words are not delimited by white space.  To
allow for that, you can specify a regular expression describing
what makes a word with

	git diff --color-words='^[A-Za-z0-9]*'

Signed-off-by: Johannes Schindelin <johannes.schindelin@gmx.de>
---

	On Fri, 9 Jan 2009, Johannes Schindelin wrote:

	> Intuitively, all you would have to do is to replace this part in 
	> diff_words_show()
	> 
	>         for (i = 0; i < minus.size; i++)
	>                 if (isspace(minus.ptr[i]))
	>                         minus.ptr[i] = '\n';
	> 
	> by a loop finding the next word boundary.  I would suggest making that a 
	> function, say,
	> 
	> 	int find_word_boundary(struct diff_words_data *data, char *minus);
	> 
	> This function would also be responsible to initialize the regexp.
	> 
	> However, as I said, I think it would be much more intuitive to 
	> characterize the _words_ instead of the _word boundaries_.
	> 
	> And I would like to keep the default as-is (together _with_ the 
	> performance.  IOW if the user did not specify a regexp, it should fall 
	> back to what it does now, which is slow enough).

	And this patch does all that, and it _is_ substantially more 
	compact, as promised.

	It lacks testing, a test script and documentation, as well as 
	configurability via config and/or attributes, but that's your
	job, as I am not really _that_ interested in the feature myself.

 diff.c |   45 +++++++++++++++++++++++++++++++++++++++------
 diff.h |    1 +
 2 files changed, 40 insertions(+), 6 deletions(-)

diff --git a/diff.c b/diff.c
index 4643ffc..c7ddb60 100644
--- a/diff.c
+++ b/diff.c
@@ -339,6 +339,7 @@ static void diff_words_append(char *line, unsigned long len,
 struct diff_words_data {
 	struct diff_words_buffer minus, plus;
 	FILE *file;
+	regex_t *word_regex;
 };
 
 static void print_word(FILE *file, struct diff_words_buffer *buffer, int len, int color,
@@ -398,6 +399,25 @@ static void fn_out_diff_words_aux(void *priv, char *line, unsigned long len)
 	}
 }
 
+static int find_word_boundary(struct diff_words_data *diff_words,
+		mmfile_t *buffer, int i)
+{
+	if (i >= buffer->size)
+		return i;
+
+	if (diff_words->word_regex) {
+		regmatch_t match[1];
+		if (!regexec(diff_words->word_regex, buffer->ptr + i,
+				1, match, 0))
+			i += match[0].rm_eo;
+	}
+	else
+		while (i < buffer->size && !isspace(i))
+			i++;
+
+	return i;
+}
+
 /* this executes the word diff on the accumulated buffers */
 static void diff_words_show(struct diff_words_data *diff_words)
 {
@@ -412,17 +432,17 @@ static void diff_words_show(struct diff_words_data *diff_words)
 	minus.size = diff_words->minus.text.size;
 	minus.ptr = xmalloc(minus.size);
 	memcpy(minus.ptr, diff_words->minus.text.ptr, minus.size);
-	for (i = 0; i < minus.size; i++)
-		if (isspace(minus.ptr[i]))
-			minus.ptr[i] = '\n';
+	for (i = 0; (i = find_word_boundary(diff_words, &minus, i))
+			< minus.size; i++)
+		minus.ptr[i] = '\n';
 	diff_words->minus.current = 0;
 
 	plus.size = diff_words->plus.text.size;
 	plus.ptr = xmalloc(plus.size);
 	memcpy(plus.ptr, diff_words->plus.text.ptr, plus.size);
-	for (i = 0; i < plus.size; i++)
-		if (isspace(plus.ptr[i]))
-			plus.ptr[i] = '\n';
+	for (i = 0; (i = find_word_boundary(diff_words, &plus, i))
+			< plus.size; i++)
+		plus.ptr[i] = '\n';
 	diff_words->plus.current = 0;
 
 	xpp.flags = XDF_NEED_MINIMAL;
@@ -461,6 +481,7 @@ static void free_diff_words_data(struct emit_callback *ecbdata)
 
 		free (ecbdata->diff_words->minus.text.ptr);
 		free (ecbdata->diff_words->plus.text.ptr);
+		free(ecbdata->diff_words->word_regex);
 		free(ecbdata->diff_words);
 		ecbdata->diff_words = NULL;
 	}
@@ -1483,6 +1504,14 @@ static void builtin_diff(const char *name_a,
 			ecbdata.diff_words =
 				xcalloc(1, sizeof(struct diff_words_data));
 			ecbdata.diff_words->file = o->file;
+			if (o->word_regex) {
+				ecbdata.diff_words->word_regex = (regex_t *)
+					xmalloc(sizeof(regex_t));
+				if (regcomp(ecbdata.diff_words->word_regex,
+						o->word_regex, REG_EXTENDED))
+					die ("Invalid regular expression: %s",
+							o->word_regex);
+			}
 		}
 		xdi_diff_outf(&mf1, &mf2, fn_out_consume, &ecbdata,
 			      &xpp, &xecfg, &ecb);
@@ -2496,6 +2525,10 @@ int diff_opt_parse(struct diff_options *options, const char **av, int ac)
 		DIFF_OPT_CLR(options, COLOR_DIFF);
 	else if (!strcmp(arg, "--color-words"))
 		options->flags |= DIFF_OPT_COLOR_DIFF | DIFF_OPT_COLOR_DIFF_WORDS;
+	else if (!prefixcmp(arg, "--color-words=")) {
+		options->flags |= DIFF_OPT_COLOR_DIFF | DIFF_OPT_COLOR_DIFF_WORDS;
+		options->word_regex = arg + 14;
+	}
 	else if (!strcmp(arg, "--exit-code"))
 		DIFF_OPT_SET(options, EXIT_WITH_STATUS);
 	else if (!strcmp(arg, "--quiet"))
diff --git a/diff.h b/diff.h
index 4d5a327..23cd90c 100644
--- a/diff.h
+++ b/diff.h
@@ -98,6 +98,7 @@ struct diff_options {
 
 	int stat_width;
 	int stat_name_width;
+	const char *word_regex;
 
 	/* this is set by diffcore for DIFF_FORMAT_PATCH */
 	int found_changes;
-- 
1.6.1.203.gc8be3

^ permalink raw reply related	[flat|nested] 34+ messages in thread

* Re: [ILLUSTRATION PATCH] color-words: take an optional regular expression describing words
  2009-01-09 11:59       ` [ILLUSTRATION PATCH] color-words: take an optional regular expression describing words Johannes Schindelin
@ 2009-01-09 12:24         ` Thomas Rast
  2009-01-09 13:05           ` Teemu Likonen
  0 siblings, 1 reply; 34+ messages in thread
From: Thomas Rast @ 2009-01-09 12:24 UTC (permalink / raw)
  To: Johannes Schindelin; +Cc: git

[-- Attachment #1: Type: text/plain, Size: 2785 bytes --]

Johannes Schindelin wrote:
> 
> In some applications, words are not delimited by white space.  To
> allow for that, you can specify a regular expression describing
> what makes a word with
> 
> 	git diff --color-words='^[A-Za-z0-9]*'
[...]
> 	> Intuitively, all you would have to do is to replace this part in 
> 	> diff_words_show()
> 	> 
> 	>         for (i = 0; i < minus.size; i++)
> 	>                 if (isspace(minus.ptr[i]))
> 	>                         minus.ptr[i] = '\n';
> 	> 
> 	> by a loop finding the next word boundary.
[...]
> 	> However, as I said, I think it would be much more intuitive to 
> 	> characterize the _words_ instead of the _word boundaries_.

That doesn't work.  You cannot overwrite actual content in the strings
to be diffed with newlines.  The current --color-words exploits the
fact that we don't care about spaces anyway, so we might as well
replace them with newlines, but we _do_ care about the words and in
the regexed version, you have no guarantees about where they might start.

To wit:

  thomas@thomas:~/tmp/foo(master)$ cat >foo
  foo_bar_baz
  quux
  thomas@thomas:~/tmp/foo(master)$ git add foo
  thomas@thomas:~/tmp/foo(master)$ git ci -m initial
  [master (root-commit)]: created f110c6c: "initial"
   1 files changed, 2 insertions(+), 0 deletions(-)
   create mode 100644 foo
  thomas@thomas:~/tmp/foo(master)$ cat >foo
  foo_
  ar_
  az
  quux
  thomas@thomas:~/tmp/foo(master)$ git diff
  diff --git i/foo w/foo
  index 5b34f11..a2762c6 100644
  --- i/foo
  +++ w/foo
  @@ -1,2 +1,4 @@
  -foo_bar_baz
  +foo_
  +ar_
  +az
   quux
  thomas@thomas:~/tmp/foo(master)$ git diff --color-words
  diff --git i/foo w/foo
  index 5b34f11..a2762c6 100644
  --- i/foo
  +++ w/foo
  @@ -1,2 +1,4 @@
  foo_bar_bafoo_
  ar_
  az
  quux
  thomas@thomas:~/tmp/foo(master)$ git diff --color-words='[a-zA-Z]+_?'
  diff --git i/foo w/foo
  index 5b34f11..a2762c6 100644
  --- i/foo
  +++ w/foo
  @@ -1,2 +1,4 @@
  quux

Even without the colours, you can see that it has a blind spot for
changes around a newline.  Perhaps there is an easier way to remember
them, but we definitely cannot *forget* about the word boundaries.

That being said, even though my patch correctly sees the changes, the
above test case also exposes some sort of string overrun :-(

> 	> And I would like to keep the default as-is (together _with_ the 
> 	> performance.  IOW if the user did not specify a regexp, it should fall 
> 	> back to what it does now, which is slow enough).

That's definitely a valid request.

I'll come up with a fixed patch, and probably make it both
funcname-like (Jeff's idea) and command line configurable.

-- 
Thomas Rast
trast@{inf,student}.ethz.ch


[-- Attachment #2: This is a digitally signed message part. --]
[-- Type: application/pgp-signature, Size: 197 bytes --]

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [ILLUSTRATION PATCH] color-words: take an optional regular expression describing words
  2009-01-09 12:24         ` Thomas Rast
@ 2009-01-09 13:05           ` Teemu Likonen
  2009-01-10  0:57             ` [PATCH v2] make diff --color-words customizable Thomas Rast
  0 siblings, 1 reply; 34+ messages in thread
From: Teemu Likonen @ 2009-01-09 13:05 UTC (permalink / raw)
  To: Thomas Rast; +Cc: Johannes Schindelin, git

Thomas Rast (2009-01-09 13:24 +0100) wrote:

> Johannes Schindelin wrote:
>> > And I would like to keep the default as-is (together _with_ the
>> > performance. IOW if the user did not specify a regexp, it should
>> > fall back to what it does now, which is slow enough).
>
> That's definitely a valid request.

I agree with that too. A good thing about the current --color-words is
that it automatically works with UTF-8 encoded text. This is _very_
important as --color-words is usually the best diff tool for
human-language texts.

^ permalink raw reply	[flat|nested] 34+ messages in thread

* [PATCH v2] make diff --color-words customizable
  2009-01-09 13:05           ` Teemu Likonen
@ 2009-01-10  0:57             ` Thomas Rast
  2009-01-10  1:50               ` Jakub Narebski
  2009-01-10 10:49               ` Johannes Schindelin
  0 siblings, 2 replies; 34+ messages in thread
From: Thomas Rast @ 2009-01-10  0:57 UTC (permalink / raw)
  To: git; +Cc: Junio C Hamano, Johannes Schindelin, Teemu Likonen

Allows for user-configurable word splits via a regular expression when
using --color-words.  This can make the diff more readable if the
regex is configured according to the language of the file.

The regex can be specified either through an optional argument
--color-words=<regex> or through the attributes mechanism, similar to
the funcname pattern.

Each non-overlapping match of the regex is a word; everything in
between is whitespace.  We disallow matching the empty string (because
it results in an endless loop) or a newline (breaks color escapes and
interacts badly with the input coming from the usual line diff).  To
help the user, we set REG_NEWLINE so that [^...] and . do not match
newlines.

--color-words works (and always worked) by splitting words onto one
line each, and using the normal line-diff machinery to get a word
diff.  Since we cannot reuse the current approach of simply
overwriting uninteresting characters with '\n', we insert an
artificial '\n' at the end of each detected word.  Its presence must
be tracked so that we can distinguish artificial from source newlines.

Insertion of spaces is somewhat subtle.  We echo a "context" space
twice (once on each side of the diff) if it follows directly after a
word, by "skipping" it during the translation (instead of generating a
'\n').  While this loses a tiny bit of accuracy, it runs together long
sequences of changed words into one removed and one added block,
making the diff much more readable.  As a side-effect, the splitting
regex '\S+' currently results in the exact same output as the original
code.  The existing code still stays in place in case no regex is
provided, for performance.

We also build in patterns for some of the languages that already had
funcname regexes.  They are designed to group UTF-8 sequences into a
single word to make sure they remain readable.

Thanks to Johannes Schindelin for the option handling code.

Signed-off-by: Thomas Rast <trast@student.ethz.ch>

---

Thomas Rast wrote:
> I'll come up with a fixed patch, and probably make it both
> funcname-like (Jeff's idea) and command line configurable.

I think this should do.  Getting the spaces right was harder than I
thought; originally it only tracked _END and _BODY, but then a changed
sentence will look like a lot of separate word changes, making it
extremely confusing.

Teemu Likonen wrote:
> I agree with that too. A good thing about the current --color-words is
> that it automatically works with UTF-8 encoded text. This is _very_
> important as --color-words is usually the best diff tool for
> human-language texts.

Thanks for pointing this out.  I put a [\x80-\xff]+ clause in the
built-in patterns that do not already match high-bit characters, so
that they will keep them together no matter what.  Unfortunately it's
rather hard to get the same effect "by hand", as neither shell, nor
git-config, nor regex.c, seem to expand \xNN or \NNN.  You'll need $''
in bash (is this POSIX?)  or 'echo -e' or a very large keyboard, or a
pattern that can be written in terms of a negated class.

(I briefly considered forcing "|[\x80-\xff]+|\S" into the regular
expression, but the former is very encoding-specific.  Maybe at least
"|\S" would be a good addition.)



 Documentation/diff-options.txt  |   18 +++-
 Documentation/gitattributes.txt |   21 ++++
 diff.c                          |  199 +++++++++++++++++++++++++++++++++++----
 diff.h                          |    1 +
 t/t4033-diff-color-words.sh     |   90 ++++++++++++++++++
 userdiff.c                      |   27 ++++--
 userdiff.h                      |    1 +
 7 files changed, 330 insertions(+), 27 deletions(-)

diff --git a/Documentation/diff-options.txt b/Documentation/diff-options.txt
index 671f533..d22c06b 100644
--- a/Documentation/diff-options.txt
+++ b/Documentation/diff-options.txt
@@ -91,8 +91,22 @@ endif::git-format-patch[]
 	Turn off colored diff, even when the configuration file
 	gives the default to color output.
 
---color-words::
-	Show colored word diff, i.e. color words which have changed.
+--color-words[=<regex>]::
+	Show colored word diff, i.e., color words which have changed.
+	By default, a new word only starts at whitespace, so that a
+	'word' is defined as a maximal sequence of non-whitespace
+	characters.  The optional argument <regex> can be used to
+	configure this.  It can also be set via a diff driver, see
+	linkgit:gitattributes[1]; if a <regex> is given explicitly, it
+	overrides any diff driver setting.
++
+The <regex> must be an (extended) regular expression.  When set, every
+non-overlapping match of the <regex> is considered a word.  (Regular
+expression semantics ensure that quantifiers grab a maximal sequence
+of characters.)  Anything between these matches is considered
+whitespace and ignored for the purposes of finding differences.  You
+may want to append `|\S` to your regular expression to make sure that
+it matches all non-whitespace characters.
 
 --no-renames::
 	Turn off rename detection, even when the configuration
diff --git a/Documentation/gitattributes.txt b/Documentation/gitattributes.txt
index 8af22ec..67f5522 100644
--- a/Documentation/gitattributes.txt
+++ b/Documentation/gitattributes.txt
@@ -334,6 +334,27 @@ patterns are available:
 - `tex` suitable for source code for LaTeX documents.
 
 
+Customizing word diff
+^^^^^^^^^^^^^^^^^^^^^
+
+You can customize the rules that `git diff --color-words` uses to
+split words in a line, by specifying an appropriate regular expression
+in the "diff.*.wordregex" configuration variable.  For example, in TeX
+a backslash followed by a sequence of letters forms a command, but
+several such commands can be run together without intervening
+whitespace.  To separate them, use a regular expression such as
+
+------------------------
+[diff "tex"]
+	wordregex = "\\\\[a-zA-Z]+|[{}]|\\\\.|[^\\{} \t]+"
+------------------------
+
+Similar to 'xfuncname', a built in value is provided for the drivers
+`bibtex`, `html`, `java`, `php`, `python` and `tex`.  See the
+documentation of --color-words in linkgit:git-diff[1] for the precise
+semantics.
+
+
 Performing text diffs of binary files
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
diff --git a/diff.c b/diff.c
index d235482..620911e 100644
--- a/diff.c
+++ b/diff.c
@@ -321,6 +321,7 @@ struct diff_words_buffer {
 	long alloc;
 	long current; /* output pointer */
 	int suppressed_newline;
+	enum diff_word_boundaries *boundaries;
 };
 
 static void diff_words_append(char *line, unsigned long len,
@@ -336,23 +337,55 @@ static void diff_words_append(char *line, unsigned long len,
 	buffer->text.size += len;
 }
 
+/*
+ * We use these to save the word boundaries.  WORD_BODY and WORD_END
+ * signal a word, meaning that after the WORD_END character an
+ * artificial newline will be inserted.
+ */
+enum diff_word_boundaries {
+	DIFF_WORD_UNDEF,
+	DIFF_WORD_BODY,
+	DIFF_WORD_END,
+	DIFF_WORD_SPACE,
+	DIFF_WORD_SKIP
+};
+
 struct diff_words_data {
 	struct diff_words_buffer minus, plus;
 	FILE *file;
+	regex_t *word_regex;
+	enum diff_word_boundaries *minus_boundaries, *plus_boundaries;
 };
 
-static void print_word(FILE *file, struct diff_words_buffer *buffer, int len, int color,
+static int print_word(FILE *file, struct diff_words_buffer *buffer, int len, int color,
 		int suppress_newline)
 {
 	const char *ptr;
 	int eol = 0;
 
 	if (len == 0)
-		return;
+		return len;
 
 	ptr  = buffer->text.ptr + buffer->current;
+
+	if (buffer->boundaries
+	    && (buffer->boundaries[buffer->current] == DIFF_WORD_BODY
+		|| buffer->boundaries[buffer->current] == DIFF_WORD_END)) {
+		/* account for the artificial newline */
+		len--;
+		/* we still have len>0 because it is a word */
+	}
+
 	buffer->current += len;
 
+	if (buffer->boundaries
+	    && buffer->boundaries[buffer->current] == DIFF_WORD_SKIP) {
+		/* we had an artificial newline, but the next whitespace
+		 * character right after was skipped because of it */
+		buffer->current++;
+		len++;
+	}
+
 	if (ptr[len - 1] == '\n') {
 		eol = 1;
 		len--;
@@ -368,6 +401,10 @@ static void print_word(FILE *file, struct diff_words_buffer *buffer, int len, in
 		else
 			putc('\n', file);
 	}
+
+	/* we need to return how many chars to skip on the other side,
+	 * so account for the (held off) \n */
+	return len+eol;
 }
 
 static void fn_out_diff_words_aux(void *priv, char *line, unsigned long len)
@@ -391,13 +428,106 @@ static void fn_out_diff_words_aux(void *priv, char *line, unsigned long len)
 				   &diff_words->plus, len, DIFF_FILE_NEW, 0);
 			break;
 		case ' ':
-			print_word(diff_words->file,
-				   &diff_words->plus, len, DIFF_PLAIN, 0);
+			len = print_word(diff_words->file,
+					 &diff_words->plus, len, DIFF_PLAIN, 0);
 			diff_words->minus.current += len;
 			break;
 	}
 }
 
+static void scan_word_boundaries(regex_t *pattern, struct diff_words_buffer *buf,
+				 mmfile_t *mmfile)
+{
+	char *text = buf->text.ptr;
+	int len = buf->text.size;
+	int i = 0;
+	int count = 0;
+	int ret;
+	regmatch_t matches[1];
+	int offset, wordlen;
+	char *strz, *p;
+
+	/* overallocate by 1 so we can safely peek past the end for a SKIP */
+	buf->boundaries = xmalloc((len+1) * sizeof(enum diff_word_boundaries));
+	buf->boundaries[len] = DIFF_WORD_UNDEF;
+
+	if (!text) {
+		mmfile->ptr = NULL;
+		mmfile->size = 0;
+		return;
+	}
+
+	strz = xmalloc(len+1);
+	memcpy(strz, text, len);
+	strz[len] = '\0';
+
+	while (i < len) {
+		ret = regexec(pattern, strz+i, 1, matches, 0);
+		if (ret == REG_NOMATCH) {
+			/* the rest is whitespace */
+			if (i > 0 && i < len) {
+				buf->boundaries[i++] = DIFF_WORD_SKIP;
+				count--;
+			}
+			while (i < len)
+				buf->boundaries[i++] = DIFF_WORD_SPACE;
+			break;
+		}
+
+		offset = matches[0].rm_so;
+		if (offset > 0 && i > 0) {
+			buf->boundaries[i++] = DIFF_WORD_SKIP;
+			count--;
+			offset--;
+		}
+		while (offset-- > 0)
+			buf->boundaries[i++] = DIFF_WORD_SPACE;
+
+		wordlen = matches[0].rm_eo - matches[0].rm_so;
+		while (wordlen > 1) {
+			if (strz[i] == '\n')
+				die("word regex matched a newline near '%s'",
+				    strz+i);
+			buf->boundaries[i++] = DIFF_WORD_BODY;
+			wordlen--;
+		}
+		if (wordlen > 0) {
+			if (strz[i] == '\n')
+				die("word regex matched a newline near '%s'",
+				    strz+i);
+			buf->boundaries[i++] = DIFF_WORD_END;
+			count++;
+		} else {
+			die("word regex matched the empty string at '%s'",
+			    strz+i);
+		}
+	}
+
+	free(strz);
+
+	mmfile->size = len + count;
+	mmfile->ptr = xmalloc(mmfile->size);
+	p = mmfile->ptr;
+	for (i = 0; i < len; i++) {
+		switch (buf->boundaries[i]) {
+		case DIFF_WORD_BODY:
+			*p++ = text[i];
+			break;
+		case DIFF_WORD_END:
+			*p++ = text[i];
+			*p++ = '\n'; /* insert an artificial newline */
+			break;
+		case DIFF_WORD_SPACE:
+			*p++ = '\n';
+			break;
+		case DIFF_WORD_SKIP:
+			/* nothing */
+			break;
+		}
+	}
+}
+
+
 /* this executes the word diff on the accumulated buffers */
 static void diff_words_show(struct diff_words_data *diff_words)
 {
@@ -409,22 +539,31 @@ static void diff_words_show(struct diff_words_data *diff_words)
 
 	memset(&xpp, 0, sizeof(xpp));
 	memset(&xecfg, 0, sizeof(xecfg));
-	minus.size = diff_words->minus.text.size;
-	minus.ptr = xmalloc(minus.size);
-	memcpy(minus.ptr, diff_words->minus.text.ptr, minus.size);
-	for (i = 0; i < minus.size; i++)
-		if (isspace(minus.ptr[i]))
-			minus.ptr[i] = '\n';
-	diff_words->minus.current = 0;
 
-	plus.size = diff_words->plus.text.size;
-	plus.ptr = xmalloc(plus.size);
-	memcpy(plus.ptr, diff_words->plus.text.ptr, plus.size);
-	for (i = 0; i < plus.size; i++)
-		if (isspace(plus.ptr[i]))
-			plus.ptr[i] = '\n';
+	if (!diff_words->word_regex) {
+		minus.size = diff_words->minus.text.size;
+		minus.ptr = xmalloc(minus.size);
+		memcpy(minus.ptr, diff_words->minus.text.ptr, minus.size);
+		for (i = 0; i < minus.size; i++)
+			if (isspace(minus.ptr[i]))
+				minus.ptr[i] = '\n';
+
+		plus.size = diff_words->plus.text.size;
+		plus.ptr = xmalloc(plus.size);
+		memcpy(plus.ptr, diff_words->plus.text.ptr, plus.size);
+		for (i = 0; i < plus.size; i++)
+			if (isspace(plus.ptr[i]))
+				plus.ptr[i] = '\n';
+	} else {
+		scan_word_boundaries(diff_words->word_regex,
+				     &diff_words->minus, &minus);
+		scan_word_boundaries(diff_words->word_regex,
+				     &diff_words->plus, &plus);
+	}
+	diff_words->minus.current = 0;
 	diff_words->plus.current = 0;
 
+
 	xpp.flags = XDF_NEED_MINIMAL;
 	xecfg.ctxlen = diff_words->minus.alloc + diff_words->plus.alloc;
 	xdi_diff_outf(&minus, &plus, fn_out_diff_words_aux, diff_words,
@@ -432,6 +571,8 @@ static void diff_words_show(struct diff_words_data *diff_words)
 	free(minus.ptr);
 	free(plus.ptr);
 	diff_words->minus.text.size = diff_words->plus.text.size = 0;
+	free(diff_words->minus.boundaries);
+	free(diff_words->plus.boundaries);
 
 	if (diff_words->minus.suppressed_newline) {
 		putc('\n', diff_words->file);
@@ -461,6 +602,7 @@ static void free_diff_words_data(struct emit_callback *ecbdata)
 
 		free (ecbdata->diff_words->minus.text.ptr);
 		free (ecbdata->diff_words->plus.text.ptr);
+		free(ecbdata->diff_words->word_regex);
 		free(ecbdata->diff_words);
 		ecbdata->diff_words = NULL;
 	}
@@ -1323,6 +1465,12 @@ static const struct userdiff_funcname *diff_funcname_pattern(struct diff_filespe
 	return one->driver->funcname.pattern ? &one->driver->funcname : NULL;
 }
 
+static const char *userdiff_word_regex(struct diff_filespec *one)
+{
+	diff_filespec_load_driver(one);
+	return one->driver->word_regex;
+}
+
 void diff_set_mnemonic_prefix(struct diff_options *options, const char *a, const char *b)
 {
 	if (!options->a_prefix)
@@ -1483,6 +1631,19 @@ static void builtin_diff(const char *name_a,
 			ecbdata.diff_words =
 				xcalloc(1, sizeof(struct diff_words_data));
 			ecbdata.diff_words->file = o->file;
+			if (!o->word_regex)
+				o->word_regex = userdiff_word_regex(one);
+			if (!o->word_regex)
+				o->word_regex = userdiff_word_regex(two);
+			if (o->word_regex) {
+				ecbdata.diff_words->word_regex = (regex_t *)
+					xmalloc(sizeof(regex_t));
+				if (regcomp(ecbdata.diff_words->word_regex,
+					    o->word_regex,
+					    REG_EXTENDED|REG_NEWLINE))
+					die ("Invalid regular expression: %s",
+					     o->word_regex);
+			}
 		}
 		xdi_diff_outf(&mf1, &mf2, fn_out_consume, &ecbdata,
 			      &xpp, &xecfg, &ecb);
@@ -2494,6 +2655,10 @@ int diff_opt_parse(struct diff_options *options, const char **av, int ac)
 		DIFF_OPT_CLR(options, COLOR_DIFF);
 	else if (!strcmp(arg, "--color-words"))
 		options->flags |= DIFF_OPT_COLOR_DIFF | DIFF_OPT_COLOR_DIFF_WORDS;
+	else if (!prefixcmp(arg, "--color-words=")) {
+		options->flags |= DIFF_OPT_COLOR_DIFF | DIFF_OPT_COLOR_DIFF_WORDS;
+		options->word_regex = arg + 14;
+	}
 	else if (!strcmp(arg, "--exit-code"))
 		DIFF_OPT_SET(options, EXIT_WITH_STATUS);
 	else if (!strcmp(arg, "--quiet"))
diff --git a/diff.h b/diff.h
index 4d5a327..23cd90c 100644
--- a/diff.h
+++ b/diff.h
@@ -98,6 +98,7 @@ struct diff_options {
 
 	int stat_width;
 	int stat_name_width;
+	const char *word_regex;
 
 	/* this is set by diffcore for DIFF_FORMAT_PATCH */
 	int found_changes;
diff --git a/t/t4033-diff-color-words.sh b/t/t4033-diff-color-words.sh
new file mode 100755
index 0000000..536cdac
--- /dev/null
+++ b/t/t4033-diff-color-words.sh
@@ -0,0 +1,90 @@
+#!/bin/sh
+
+
+test_description='diff --color-words'
+. ./test-lib.sh
+
+cat <<EOF > test_a
+foo_bar_baz
+a qu_ux b c
+alpha beta gamma delta
+EOF
+
+cat <<EOF > test_b
+foo_baz_baz
+a qu_new_ux b c
+alpha 4 2 delta
+EOF
+
+# t4026-diff-color.sh tests the color escapes, so we assume they do
+# not change
+
+munge () {
+    tail -n +5 | tr '\033' '!'
+}
+
+cat <<EOF > expect-plain
+![36m@@ -1,3 +1,3 @@![m
+![31mfoo_bar_baz![m![32mfoo_baz_baz![m
+a ![m![31mqu_ux ![m![32mqu_new_ux ![mb ![mc![m
+alpha ![m![31mbeta ![m![31mgamma ![m![32m4 ![m![32m2 ![mdelta![m
+EOF
+
+test_expect_success 'default settings' '
+	git diff --no-index --color-words test_a test_b |
+		munge > actual-plain &&
+	test_cmp expect-plain actual-plain
+'
+
+test_expect_success 'trivial regex yields same as default' '
+	git diff --no-index --color-words="\\S+" test_a test_b |
+		munge > actual-trivial &&
+	test_cmp expect-plain actual-trivial
+'
+
+cat <<EOF > expect-chars
+![36m@@ -1,3 +1,3 @@![m
+f![mo![mo![m_![mb![ma![m![31mr![m![32mz![m_![mb![ma![mz![m
+a ![mq![mu![m_![m![32mn![m![32me![m![32mw![m![32m_![mu![mx ![mb ![mc![m
+a![ml![mp![mh![ma ![m![31mb![m![31me![m![31mt![m![31ma ![m![31mg![m![31ma![m![31mm![m![31mm![m![31ma ![m![32m4 ![m![32m2 ![md![me![ml![mt![ma![m
+EOF
+
+test_expect_success 'character by character regex' '
+	git diff --no-index --color-words="\\S" test_a test_b |
+		munge > actual-chars &&
+	test_cmp expect-chars actual-chars
+'
+
+cat <<EOF > expect-nontrivial
+![36m@@ -1,3 +1,3 @@![m
+foo![m_![m![31mbar![m![32mbaz![m_![mbaz![m
+a ![mqu![m_![m![32mnew![m![32m_![mux ![mb ![mc![m
+alpha ![m![31mbeta ![m![31mgamma ![m![32m4![m![32m ![m![32m2![m![32m ![mdelta![m
+EOF
+
+test_expect_success 'nontrivial regex' '
+	git diff --no-index --color-words="[a-z]+|_" test_a test_b |
+		munge > actual-nontrivial &&
+	test_cmp expect-nontrivial actual-nontrivial
+'
+
+test_expect_success 'set a diff driver' '
+	git config diff.testdriver.wordregex "\\S" &&
+	cat <<EOF > .gitattributes
+test_* diff=testdriver
+EOF
+'
+
+test_expect_success 'use default supplied by driver' '
+	git diff --no-index --color-words test_a test_b |
+		munge > actual-chars-2 &&
+	test_cmp expect-chars actual-chars-2
+'
+
+test_expect_success 'option overrides default' '
+	git diff --no-index --color-words="[a-z]+|_" test_a test_b |
+		munge > actual-nontrivial-2 &&
+	test_cmp expect-nontrivial actual-nontrivial-2
+'
+
+test_done
diff --git a/userdiff.c b/userdiff.c
index 3681062..7fd9a07 100644
--- a/userdiff.c
+++ b/userdiff.c
@@ -6,13 +6,17 @@ static struct userdiff_driver *drivers;
 static int ndrivers;
 static int drivers_alloc;
 
-#define FUNCNAME(name, pattern) \
+#define FUNCNAME(name, pattern)			\
 	{ name, NULL, -1, { pattern, REG_EXTENDED } }
+#define PATTERNS(name, pattern, wordregex)			\
+	{ name, NULL, -1, { pattern, REG_EXTENDED }, NULL, wordregex }
 static struct userdiff_driver builtin_drivers[] = {
-FUNCNAME("html", "^[ \t]*(<[Hh][1-6][ \t].*>.*)$"),
-FUNCNAME("java",
+PATTERNS("html", "^[ \t]*(<[Hh][1-6][ \t].*>.*)$",
+	 "[^<>= \t]+|\\S"),
+PATTERNS("java",
 	 "!^[ \t]*(catch|do|for|if|instanceof|new|return|switch|throw|while)\n"
-	 "^[ \t]*(([ \t]*[A-Za-z_][A-Za-z_0-9]*){2,}[ \t]*\\([^;]*)$"),
+	 "^[ \t]*(([ \t]*[A-Za-z_][A-Za-z_0-9]*){2,}[ \t]*\\([^;]*)$",
+	 "[a-zA-Z_][a-zA-Z0-9_]*|[-+0-9.e]+|[-+*/]=|\\+\\+|--|\\S|[\x80-\xff]+"),
 FUNCNAME("objc",
 	 /* Negate C statements that can look like functions */
 	 "!^[ \t]*(do|for|if|else|return|switch|while)\n"
@@ -27,14 +31,19 @@ FUNCNAME("pascal",
 		"implementation|initialization|finalization)[ \t]*.*)$"
 	 "\n"
 	 "^(.*=[ \t]*(class|record).*)$"),
-FUNCNAME("php", "^[\t ]*((function|class).*)"),
-FUNCNAME("python", "^[ \t]*((class|def)[ \t].*)$"),
+PATTERNS("php", "^[\t ]*((function|class).*)",
+	 "\\$?[a-zA-Z_][a-zA-Z0-9_]*|[-+0-9.e]+|[-+*/]=|\\+\\+|--|->|\\S|[\x80-\xff]+"),
+PATTERNS("python", "^[ \t]*((class|def)[ \t].*)$",
+	 "[a-zA-Z_][a-zA-Z0-9_]*|[-+0-9.e]+|[-+*/]=|//|\\S|[\x80-\xff]+"),
 FUNCNAME("ruby", "^[ \t]*((class|module|def)[ \t].*)$"),
-FUNCNAME("bibtex", "(@[a-zA-Z]{1,}[ \t]*\\{{0,1}[ \t]*[^ \t\"@',\\#}{~%]*).*$"),
-FUNCNAME("tex", "^(\\\\((sub)*section|chapter|part)\\*{0,1}\\{.*)$"),
+PATTERNS("bibtex", "(@[a-zA-Z]{1,}[ \t]*\\{{0,1}[ \t]*[^ \t\"@',\\#}{~%]*).*$",
+	 "[={}\"]|[^={}\" \t]+"),
+PATTERNS("tex", "^(\\\\((sub)*section|chapter|part)\\*{0,1}\\{.*)$",
+	 "\\\\[a-zA-Z@]+|[{}]|\\\\.|[^\\{} \t]+"),
 { "default", NULL, -1, { NULL, 0 } },
 };
 #undef FUNCNAME
+#undef PATTERNS
 
 static struct userdiff_driver driver_true = {
 	"diff=true",
@@ -134,6 +143,8 @@ int userdiff_config(const char *k, const char *v)
 		return parse_string(&drv->external, k, v);
 	if ((drv = parse_driver(k, v, "textconv")))
 		return parse_string(&drv->textconv, k, v);
+	if ((drv = parse_driver(k, v, "wordregex")))
+		return parse_string(&drv->word_regex, k, v);
 
 	return 0;
 }
diff --git a/userdiff.h b/userdiff.h
index ba29457..2aab13e 100644
--- a/userdiff.h
+++ b/userdiff.h
@@ -12,6 +12,7 @@ struct userdiff_driver {
 	int binary;
 	struct userdiff_funcname funcname;
 	const char *textconv;
+	const char *word_regex;
 };
 
 int userdiff_config(const char *k, const char *v);
-- 
tg: (c123b7c..) t/word-diff-regex (depends on: origin/master)

^ permalink raw reply related	[flat|nested] 34+ messages in thread

* Re: [PATCH v2] make diff --color-words customizable
  2009-01-10  0:57             ` [PATCH v2] make diff --color-words customizable Thomas Rast
@ 2009-01-10  1:50               ` Jakub Narebski
  2009-01-10 11:37                 ` Johannes Schindelin
  2009-01-10 10:49               ` Johannes Schindelin
  1 sibling, 1 reply; 34+ messages in thread
From: Jakub Narebski @ 2009-01-10  1:50 UTC (permalink / raw)
  To: git

Thomas Rast wrote:

> --color-words works (and always worked) by splitting words onto one
> line each, and using the normal line-diff machinery to get a word
> diff. 

Cannot we generalize diff machinery / use underlying LCS diff engine
instead of going through line diff?

-- 
Jakub Narebski
Warsaw, Poland
ShadeHawk on #git

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v2] make diff --color-words customizable
  2009-01-10  0:57             ` [PATCH v2] make diff --color-words customizable Thomas Rast
  2009-01-10  1:50               ` Jakub Narebski
@ 2009-01-10 10:49               ` Johannes Schindelin
  2009-01-10 11:25                 ` Thomas Rast
  1 sibling, 1 reply; 34+ messages in thread
From: Johannes Schindelin @ 2009-01-10 10:49 UTC (permalink / raw)
  To: Thomas Rast; +Cc: git, Junio C Hamano, Teemu Likonen

Hi,

On Sat, 10 Jan 2009, Thomas Rast wrote:

>  diff.c                          |  199 +++++++++++++++++++++++++++++++++++----

!!!

BTW I did not really think about the issue you raised about the newlines, 
as I seemed to remember that the idea was to substitute all non-word 
characters with newlines, so that the offsets in the substituted text are 
the same as in the original text.

So I still find your patch way too large, and it keeps growing, 
unfortunately.

Ciao,
Dscho

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v2] make diff --color-words customizable
  2009-01-10 10:49               ` Johannes Schindelin
@ 2009-01-10 11:25                 ` Thomas Rast
  2009-01-10 11:45                   ` Johannes Schindelin
  0 siblings, 1 reply; 34+ messages in thread
From: Thomas Rast @ 2009-01-10 11:25 UTC (permalink / raw)
  To: Johannes Schindelin; +Cc: git, Junio C Hamano, Teemu Likonen

[-- Attachment #1: Type: text/plain, Size: 1771 bytes --]

Johannes Schindelin wrote:
> BTW I did not really think about the issue you raised about the newlines, 
> as I seemed to remember that the idea was to substitute all non-word 
> characters with newlines, so that the offsets in the substituted text are 
> the same as in the original text.

Ok, so here's a very simple example: Suppose you have the word regex
'x+|y+' and compare these two lines:

A: xxyyxy
B: xyxyy

There are *no* non-word characters between consecutive words in this
case, so you *cannot* replace them with newlines.  You cannot replace
some word character either, as should be obvious from the case of
one-letter words, as you would lose actual content.  My counterexample
to your illustration patch exploited a similar border case: suppose
you decide to overwrite the first (instead of last) character of each
word, then you won't be able to tell "foo" from "\noo" in the input.

Unfortunately the space adjustement makes things even worse.  The
existing method has the side-effect that it only inserts a single
newline between words separated by exactly one space, which runs them
together in the resulting line diff, for example

A: foo bar
B: baz quux

would result in

  -foo
  -bar
  +baz
  +quux

instead of (as my original attempts did) the arguably more correct,
but less readable

  -foo
  +bar
    
  -baz
  +quux

where the middle line is a context line for the space.  So in addition
to the word-ending adjustments for the inserted newlines, I also have
to track the status of the space right after the word.

> So I still find your patch way too large

I can't think of a simpler way to do it, and yours unfortunately
doesn't work.

-- 
Thomas Rast
trast@{inf,student}.ethz.ch



[-- Attachment #2: This is a digitally signed message part. --]
[-- Type: application/pgp-signature, Size: 197 bytes --]

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v2] make diff --color-words customizable
  2009-01-10  1:50               ` Jakub Narebski
@ 2009-01-10 11:37                 ` Johannes Schindelin
  2009-01-10 13:36                   ` Jakub Narebski
  0 siblings, 1 reply; 34+ messages in thread
From: Johannes Schindelin @ 2009-01-10 11:37 UTC (permalink / raw)
  To: Jakub Narebski; +Cc: git

Hi,

On Sat, 10 Jan 2009, Jakub Narebski wrote:

> Thomas Rast wrote:
> 
> > --color-words works (and always worked) by splitting words onto one
> > line each, and using the normal line-diff machinery to get a word
> > diff. 
> 
> Cannot we generalize diff machinery / use underlying LCS diff engine
> instead of going through line diff?

What do you think we're doing?  libxdiff is pretty hardcoded to newlines.  
That's why we're substituting non-word characters with newlines.

Ciao,
Dscho

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v2] make diff --color-words customizable
  2009-01-10 11:25                 ` Thomas Rast
@ 2009-01-10 11:45                   ` Johannes Schindelin
  2009-01-11  1:34                     ` Junio C Hamano
  0 siblings, 1 reply; 34+ messages in thread
From: Johannes Schindelin @ 2009-01-10 11:45 UTC (permalink / raw)
  To: Thomas Rast; +Cc: git, Junio C Hamano, Teemu Likonen

Hi,

On Sat, 10 Jan 2009, Thomas Rast wrote:

> Johannes Schindelin wrote:
> > BTW I did not really think about the issue you raised about the newlines, 
> > as I seemed to remember that the idea was to substitute all non-word 
> > characters with newlines, so that the offsets in the substituted text are 
> > the same as in the original text.
> 
> Ok, so here's a very simple example: Suppose you have the word regex
> 'x+|y+' and compare these two lines:
> 
> A: xxyyxy
> B: xyxyy

Ah, I see.

> > So I still find your patch way too large
> 
> I can't think of a simpler way to do it, and yours unfortunately doesn't 
> work.

Well, the thing I tried to hint at: it is not good to have a monster 
patch, as nobody will review it.

In your case, I imagine it would be much easier to get reviewers if you 
had

	patch 1/4 refactor color-words to allow for 0-character word 
		boundaries
	patch 2/4 allow regular expressions to define what makes a word
	patch 3/4 add option to specify word boundary regexps via
		attributes
	patch 4/4 test word boundary regexps

And I admit that I documented the code lousily, but that does not mean 
that you should repeat that mistake.

Ciao,
Dscho

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v2] make diff --color-words customizable
  2009-01-10 11:37                 ` Johannes Schindelin
@ 2009-01-10 13:36                   ` Jakub Narebski
  2009-01-10 14:08                     ` Johannes Schindelin
  2009-01-10 17:53                     ` Davide Libenzi
  0 siblings, 2 replies; 34+ messages in thread
From: Jakub Narebski @ 2009-01-10 13:36 UTC (permalink / raw)
  To: Johannes Schindelin; +Cc: git, Davide Libenzi, Thomas Rast

On Sat, 10 Jan 2009, Johannes Schindelin wrote:
> On Sat, 10 Jan 2009, Jakub Narebski wrote:
> > Thomas Rast wrote:
> > 
> > > --color-words works (and always worked) by splitting words onto one
> > > line each, and using the normal line-diff machinery to get a word
> > > diff. 
> > 
> > Cannot we generalize diff machinery / use underlying LCS diff engine
> > instead of going through line diff?
> 
> What do you think we're doing?  libxdiff is pretty hardcoded to newlines.  
> That's why we're substituting non-word characters with newlines.

Isn't Meyers algorithm used by libxdiff based on LCS, largest common
subsequence, and doesn't it generate from the mathematical point of
view "diff" between two sequences (two arrays) which just happen to
be lines? It is a bit strange that libxdiff doesn't export its low
level algorithm...

-- 
Jakub Narebski
Poland

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v2] make diff --color-words customizable
  2009-01-10 13:36                   ` Jakub Narebski
@ 2009-01-10 14:08                     ` Johannes Schindelin
  2009-01-12 23:59                       ` Jakub Narebski
  2009-01-10 17:53                     ` Davide Libenzi
  1 sibling, 1 reply; 34+ messages in thread
From: Johannes Schindelin @ 2009-01-10 14:08 UTC (permalink / raw)
  To: Jakub Narebski; +Cc: git, Davide Libenzi, Thomas Rast

Hi,

On Sat, 10 Jan 2009, Jakub Narebski wrote:

> On Sat, 10 Jan 2009, Johannes Schindelin wrote:
> > On Sat, 10 Jan 2009, Jakub Narebski wrote:
> > > Thomas Rast wrote:
> > > 
> > > > --color-words works (and always worked) by splitting words onto one
> > > > line each, and using the normal line-diff machinery to get a word
> > > > diff. 
> > > 
> > > Cannot we generalize diff machinery / use underlying LCS diff engine
> > > instead of going through line diff?
> > 
> > What do you think we're doing?  libxdiff is pretty hardcoded to newlines.  
> > That's why we're substituting non-word characters with newlines.
> 
> Isn't Meyers algorithm used by libxdiff based on LCS, largest common
> subsequence, and doesn't it generate from the mathematical point of
> view "diff" between two sequences (two arrays) which just happen to
> be lines? It is a bit strange that libxdiff doesn't export its low
> level algorithm...

Umm.

It _is_ Myers' algorithm.  It just so happens that libxdiff hardcodes 
newline to be the separator.

Ciao,
Dscho

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v2] make diff --color-words customizable
  2009-01-10 13:36                   ` Jakub Narebski
  2009-01-10 14:08                     ` Johannes Schindelin
@ 2009-01-10 17:53                     ` Davide Libenzi
  2009-01-13  0:52                       ` Jakub Narebski
  1 sibling, 1 reply; 34+ messages in thread
From: Davide Libenzi @ 2009-01-10 17:53 UTC (permalink / raw)
  To: Jakub Narebski; +Cc: Johannes Schindelin, git, Thomas Rast

On Sat, 10 Jan 2009, Jakub Narebski wrote:

> On Sat, 10 Jan 2009, Johannes Schindelin wrote:
> > On Sat, 10 Jan 2009, Jakub Narebski wrote:
> > > Thomas Rast wrote:
> > > 
> > > > --color-words works (and always worked) by splitting words onto one
> > > > line each, and using the normal line-diff machinery to get a word
> > > > diff. 
> > > 
> > > Cannot we generalize diff machinery / use underlying LCS diff engine
> > > instead of going through line diff?
> > 
> > What do you think we're doing?  libxdiff is pretty hardcoded to newlines.  
> > That's why we're substituting non-word characters with newlines.
> 
> Isn't Meyers algorithm used by libxdiff based on LCS, largest common
> subsequence, and doesn't it generate from the mathematical point of
> view "diff" between two sequences (two arrays) which just happen to
> be lines? It is a bit strange that libxdiff doesn't export its low
> level algorithm...

The core doesn't know anything about lines. Only pre-processing (setting 
up the hash by tokenizing the input) and post-processing (adding '\n' to 
the end of each token), knows about newlines. Memory consumption would 
increase significantly though, since there is a per-token cost, and a 
word-based diff will create more of them WRT the same input.


- Davide

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v2] make diff --color-words customizable
  2009-01-10 11:45                   ` Johannes Schindelin
@ 2009-01-11  1:34                     ` Junio C Hamano
  2009-01-11 10:27                       ` [PATCH v3 0/4] customizable --color-words Thomas Rast
                                         ` (4 more replies)
  0 siblings, 5 replies; 34+ messages in thread
From: Junio C Hamano @ 2009-01-11  1:34 UTC (permalink / raw)
  To: Johannes Schindelin; +Cc: Thomas Rast, git, Teemu Likonen

Johannes Schindelin <Johannes.Schindelin@gmx.de> writes:

> Well, the thing I tried to hint at: it is not good to have a monster 
> patch, as nobody will review it.
>
> In your case, I imagine it would be much easier to get reviewers if you 
> had
>
> 	patch 1/4 refactor color-words to allow for 0-character word 
> 		boundaries
> 	patch 2/4 allow regular expressions to define what makes a word
> 	patch 3/4 add option to specify word boundary regexps via
> 		attributes
> 	patch 4/4 test word boundary regexps
>
> And I admit that I documented the code lousily, but that does not mean 
> that you should repeat that mistake.

Sounds like a reasonable request.  Also I am seeing:

    diff.c: In function 'scan_word_boundaries':
    diff.c:512: warning: enumeration value 'DIFF_WORD_UNDEF' not handled in switch

from this part of the code:

	for (i = 0; i < len; i++) {
		switch (buf->boundaries[i]) {
		case DIFF_WORD_BODY:
			*p++ = text[i];
			break;
		case DIFF_WORD_END:
			*p++ = text[i];
			*p++ = '\n'; /* insert an artificial newline */
			break;
		case DIFF_WORD_SPACE:
			*p++ = '\n';
			break;
		case DIFF_WORD_SKIP:
			/* nothing */
			break;
		}
	}

^ permalink raw reply	[flat|nested] 34+ messages in thread

* [PATCH v3 0/4] customizable --color-words
  2009-01-11  1:34                     ` Junio C Hamano
@ 2009-01-11 10:27                       ` Thomas Rast
  2009-01-11 10:27                       ` [PATCH v3 1/4] word diff: comments, preparations for regex customization Thomas Rast
                                         ` (3 subsequent siblings)
  4 siblings, 0 replies; 34+ messages in thread
From: Thomas Rast @ 2009-01-11 10:27 UTC (permalink / raw)
  To: git; +Cc: Junio C Hamano, Johannes Schindelin, Teemu Likonen

Johannes Schindelin wrote:
> On Sat, 10 Jan 2009, Thomas Rast wrote:
> > Johannes Schindelin wrote:
> > > So I still find your patch way too large

The bad news is... it just got bigger ;-)

> In your case, I imagine it would be much easier to get reviewers if you 
> had
> 
> 	patch 1/4 refactor color-words to allow for 0-character word 
> 		boundaries
> 	patch 2/4 allow regular expressions to define what makes a word

So here's a 4-patch series.  I put the first split in a different
place than you suggested, however.  I couldn't see a good way to
separate empty boundaries from regex splitting in such a way that the
first half can be exercised (is not just dead code).  1/4 basically
just rearranges code a bit and should be a real no-op patch.

Junio C Hamano wrote:
>     diff.c: In function 'scan_word_boundaries':
>     diff.c:512: warning: enumeration value 'DIFF_WORD_UNDEF' not handled in sw

Thanks, added a case to test for this.

There is one other minor semantic change in 2/4: the error reporting
in case your regex matched "foo\nbar" now says "before 'bar'" instead
of "near '\nbar'".  Other than that, there are only a bunch of added
comments when comparing the result of all four patches with v2.


Thomas Rast (4):
  word diff: comments, preparations for regex customization
  word diff: customizable word splits
  word diff: make regex configurable via attributes
  word diff: test customizable word splits

 Documentation/diff-options.txt  |   18 +++-
 Documentation/gitattributes.txt |   21 +++
 diff.c                          |  282 ++++++++++++++++++++++++++++++++++++---
 diff.h                          |    1 +
 t/t4033-diff-color-words.sh     |   90 +++++++++++++
 userdiff.c                      |   27 +++-
 userdiff.h                      |    1 +
 7 files changed, 413 insertions(+), 27 deletions(-)
 create mode 100755 t/t4033-diff-color-words.sh

^ permalink raw reply	[flat|nested] 34+ messages in thread

* [PATCH v3 1/4] word diff: comments, preparations for regex customization
  2009-01-11  1:34                     ` Junio C Hamano
  2009-01-11 10:27                       ` [PATCH v3 0/4] customizable --color-words Thomas Rast
@ 2009-01-11 10:27                       ` Thomas Rast
  2009-01-11 13:41                         ` Johannes Schindelin
                                           ` (2 more replies)
  2009-01-11 10:27                       ` [PATCH v3 2/4] word diff: customizable word splits Thomas Rast
                                         ` (2 subsequent siblings)
  4 siblings, 3 replies; 34+ messages in thread
From: Thomas Rast @ 2009-01-11 10:27 UTC (permalink / raw)
  To: git; +Cc: Junio C Hamano, Johannes Schindelin, Teemu Likonen

This reorganizes the code for diff --color-words in a way that will be
convenient for the next patch, without changing any of the semantics.
The new variables are not used yet except for their default state.

We also add some comments on the workings of diff_words_show() and
associated helper routines.

Signed-off-by: Thomas Rast <trast@student.ethz.ch>
---
 diff.c |   86 +++++++++++++++++++++++++++++++++++++++++++++++++++------------
 1 files changed, 69 insertions(+), 17 deletions(-)

diff --git a/diff.c b/diff.c
index d235482..f274bf5 100644
--- a/diff.c
+++ b/diff.c
@@ -316,11 +316,21 @@ static int fill_mmfile(mmfile_t *mf, struct diff_filespec *one)
 	return 0;
 }
 
+/* unused right now */
+enum diff_word_boundaries {
+	DIFF_WORD_UNDEF,
+	DIFF_WORD_BODY,
+	DIFF_WORD_END,
+	DIFF_WORD_SPACE,
+	DIFF_WORD_SKIP
+};
+
 struct diff_words_buffer {
 	mmfile_t text;
 	long alloc;
 	long current; /* output pointer */
 	int suppressed_newline;
+	enum diff_word_boundaries *boundaries;
 };
 
 static void diff_words_append(char *line, unsigned long len,
@@ -339,16 +349,23 @@ static void diff_words_append(char *line, unsigned long len,
 struct diff_words_data {
 	struct diff_words_buffer minus, plus;
 	FILE *file;
+	regex_t *word_regex; /* currently unused */
 };
 
-static void print_word(FILE *file, struct diff_words_buffer *buffer, int len, int color,
+/*
+ * Print 'len' characters from the "real" diff data in 'buffer'.  Also
+ * returns how many characters were printed (currently always 'len').
+ * With 'suppress_newline', we remember a final newline instead of
+ * printing it.
+ */
+static int print_word(FILE *file, struct diff_words_buffer *buffer, int len, int color,
 		int suppress_newline)
 {
 	const char *ptr;
 	int eol = 0;
 
 	if (len == 0)
-		return;
+		return len;
 
 	ptr  = buffer->text.ptr + buffer->current;
 	buffer->current += len;
@@ -368,18 +385,30 @@ static void print_word(FILE *file, struct diff_words_buffer *buffer, int len, in
 		else
 			putc('\n', file);
 	}
+
+	/* we need to return how many chars to skip on the other side,
+	 * so account for the (held off) \n */
+	return len+eol;
 }
 
+/*
+ * Callback for word diff output
+ */
 static void fn_out_diff_words_aux(void *priv, char *line, unsigned long len)
 {
 	struct diff_words_data *diff_words = priv;
 
 	if (diff_words->minus.suppressed_newline) {
+		/* We completely drop a suppressed newline on the
+		 * minus side, if it is immediately followed by a plus
+		 * side output. This formats a word change right
+		 * before the end of line correctly */
 		if (line[0] != '+')
 			putc('\n', diff_words->file);
 		diff_words->minus.suppressed_newline = 0;
 	}
 
+	/* account for the [+- ] inserted by the line diff */
 	len--;
 	switch (line[0]) {
 		case '-':
@@ -391,8 +420,10 @@ static void fn_out_diff_words_aux(void *priv, char *line, unsigned long len)
 				   &diff_words->plus, len, DIFF_FILE_NEW, 0);
 			break;
 		case ' ':
-			print_word(diff_words->file,
-				   &diff_words->plus, len, DIFF_PLAIN, 0);
+			len = print_word(diff_words->file,
+					 &diff_words->plus, len, DIFF_PLAIN, 0);
+			/* skip the characters that were printed on
+			 * the other side, too */
 			diff_words->minus.current += len;
 			break;
 	}
@@ -409,22 +440,37 @@ static void diff_words_show(struct diff_words_data *diff_words)
 
 	memset(&xpp, 0, sizeof(xpp));
 	memset(&xecfg, 0, sizeof(xecfg));
-	minus.size = diff_words->minus.text.size;
-	minus.ptr = xmalloc(minus.size);
-	memcpy(minus.ptr, diff_words->minus.text.ptr, minus.size);
-	for (i = 0; i < minus.size; i++)
-		if (isspace(minus.ptr[i]))
-			minus.ptr[i] = '\n';
-	diff_words->minus.current = 0;
 
-	plus.size = diff_words->plus.text.size;
-	plus.ptr = xmalloc(plus.size);
-	memcpy(plus.ptr, diff_words->plus.text.ptr, plus.size);
-	for (i = 0; i < plus.size; i++)
-		if (isspace(plus.ptr[i]))
-			plus.ptr[i] = '\n';
+	/* currently always true */
+	if (!diff_words->word_regex) {
+		/*
+		 * "Simple" word diff: replace all space characters
+		 * with a newline.
+		 *
+		 * This groups together "words" of nonspaces on a line
+		 * each, which we then diff using the normal line-diff
+		 * mechanism.  It also has the nice property that
+		 * character counts/offsets stay the same.
+		 */
+		minus.size = diff_words->minus.text.size;
+		minus.ptr = xmalloc(minus.size);
+		memcpy(minus.ptr, diff_words->minus.text.ptr, minus.size);
+		for (i = 0; i < minus.size; i++)
+			if (isspace(minus.ptr[i]))
+				minus.ptr[i] = '\n';
+
+		plus.size = diff_words->plus.text.size;
+		plus.ptr = xmalloc(plus.size);
+		memcpy(plus.ptr, diff_words->plus.text.ptr, plus.size);
+		for (i = 0; i < plus.size; i++)
+			if (isspace(plus.ptr[i]))
+				plus.ptr[i] = '\n';
+	}
+	diff_words->minus.current = 0;
 	diff_words->plus.current = 0;
 
+	/* we want a minimal diff with enough context to run
+	 * everything into a single hunk */
 	xpp.flags = XDF_NEED_MINIMAL;
 	xecfg.ctxlen = diff_words->minus.alloc + diff_words->plus.alloc;
 	xdi_diff_outf(&minus, &plus, fn_out_diff_words_aux, diff_words,
@@ -432,7 +478,12 @@ static void diff_words_show(struct diff_words_data *diff_words)
 	free(minus.ptr);
 	free(plus.ptr);
 	diff_words->minus.text.size = diff_words->plus.text.size = 0;
+	/* these two are currently free(NULL) */
+	free(diff_words->minus.boundaries);
+	free(diff_words->plus.boundaries);
 
+	/* do not forget about a possible final newline that was held
+	 * back */
 	if (diff_words->minus.suppressed_newline) {
 		putc('\n', diff_words->file);
 		diff_words->minus.suppressed_newline = 0;
@@ -461,6 +512,7 @@ static void free_diff_words_data(struct emit_callback *ecbdata)
 
 		free (ecbdata->diff_words->minus.text.ptr);
 		free (ecbdata->diff_words->plus.text.ptr);
+		free(ecbdata->diff_words->word_regex);
 		free(ecbdata->diff_words);
 		ecbdata->diff_words = NULL;
 	}
-- 
1.6.1.269.g0769

^ permalink raw reply related	[flat|nested] 34+ messages in thread

* [PATCH v3 2/4] word diff: customizable word splits
  2009-01-11  1:34                     ` Junio C Hamano
  2009-01-11 10:27                       ` [PATCH v3 0/4] customizable --color-words Thomas Rast
  2009-01-11 10:27                       ` [PATCH v3 1/4] word diff: comments, preparations for regex customization Thomas Rast
@ 2009-01-11 10:27                       ` Thomas Rast
  2009-01-11 22:20                         ` Junio C Hamano
  2009-01-11 10:27                       ` [PATCH v3 3/4] word diff: make regex configurable via attributes Thomas Rast
  2009-01-11 10:27                       ` [PATCH v3 4/4] word diff: test customizable word splits Thomas Rast
  4 siblings, 1 reply; 34+ messages in thread
From: Thomas Rast @ 2009-01-11 10:27 UTC (permalink / raw)
  To: git; +Cc: Junio C Hamano, Johannes Schindelin, Teemu Likonen

Allows for user-configurable word splits when using --color-words.
This can make the diff more readable if the regex is configured
according to the language of the file.

Each non-overlapping match of the regex is a word; everything in
between is whitespace.  We disallow matching the empty string (because
it results in an endless loop) or a newline (breaks color escapes and
interacts badly with the input coming from the usual line diff).  To
help the user, we set REG_NEWLINE so that [^...] and . do not match
newlines.

--color-words works (and always worked) by splitting words onto one
line each, and using the normal line-diff machinery to get a word
diff.  Since we cannot reuse the current approach of simply
overwriting uninteresting characters with '\n', we insert an
artificial '\n' at the end of each detected word.  Its presence must
be tracked so that we can distinguish artificial from source newlines.

Insertion of spaces is somewhat subtle.  We echo a "context" space
twice (once on each side of the diff) if it follows directly after a
word.  While this loses a tiny bit of accuracy, it runs together long
sequences of changed word into one removed and one added block, making
the diff much more readable.  This feature also means that the
splitting regex '\S+' results in the same output as the original code.
The existing code still stays in place in case no regex is provided,
for performance.

Signed-off-by: Thomas Rast <trast@student.ethz.ch>
---
 Documentation/diff-options.txt |   16 +++-
 diff.c                         |  196 +++++++++++++++++++++++++++++++++++++++-
 diff.h                         |    1 +
 3 files changed, 206 insertions(+), 7 deletions(-)

diff --git a/Documentation/diff-options.txt b/Documentation/diff-options.txt
index 671f533..6152d5b 100644
--- a/Documentation/diff-options.txt
+++ b/Documentation/diff-options.txt
@@ -91,8 +91,20 @@ endif::git-format-patch[]
 	Turn off colored diff, even when the configuration file
 	gives the default to color output.
 
---color-words::
-	Show colored word diff, i.e. color words which have changed.
+--color-words[=<regex>]::
+	Show colored word diff, i.e., color words which have changed.
+	By default, a new word only starts at whitespace, so that a
+	'word' is defined as a maximal sequence of non-whitespace
+	characters.  The optional argument <regex> can be used to
+	configure this.
++
+The <regex> must be an (extended) regular expression.  When set, every
+non-overlapping match of the <regex> is considered a word.  (Regular
+expression semantics ensure that quantifiers grab a maximal sequence
+of characters.)  Anything between these matches is considered
+whitespace and ignored for the purposes of finding differences.  You
+may want to append `|\S` to your regular expression to make sure that
+it matches all non-whitespace characters.
 
 --no-renames::
 	Turn off rename detection, even when the configuration
diff --git a/diff.c b/diff.c
index f274bf5..badaea6 100644
--- a/diff.c
+++ b/diff.c
@@ -316,7 +316,21 @@ static int fill_mmfile(mmfile_t *mf, struct diff_filespec *one)
 	return 0;
 }
 
-/* unused right now */
+/*
+ * We use these to save the word boundaries:
+ *
+ * - BODY and END track the extent of a word; after END, an artificial
+ *   newline will be inserted.
+ *
+ * - SPACE means the character is a space, and will be replaced by a
+ *   newline.  If a space follows immediately after END, it is flagged
+ *   SKIP instead, so that two words with exactly one space in between
+ *   end up with only one newline between them.
+ *
+ * - The boundary array is overallocated by one, and the spare element
+ *   is flagged UNDEF to allow peeking over the end of a word to see
+ *   if the next element is a SKIP.
+ */
 enum diff_word_boundaries {
 	DIFF_WORD_UNDEF,
 	DIFF_WORD_BODY,
@@ -349,12 +363,12 @@ static void diff_words_append(char *line, unsigned long len,
 struct diff_words_data {
 	struct diff_words_buffer minus, plus;
 	FILE *file;
-	regex_t *word_regex; /* currently unused */
+	regex_t *word_regex;
 };
 
 /*
  * Print 'len' characters from the "real" diff data in 'buffer'.  Also
- * returns how many characters were printed (currently always 'len').
+ * returns how many characters were printed.
  * With 'suppress_newline', we remember a final newline instead of
  * printing it.
  */
@@ -368,8 +382,27 @@ static int print_word(FILE *file, struct diff_words_buffer *buffer, int len, int
 		return len;
 
 	ptr  = buffer->text.ptr + buffer->current;
+
+	if (buffer->boundaries
+	    && (buffer->boundaries[buffer->current] == DIFF_WORD_BODY
+		|| buffer->boundaries[buffer->current] == DIFF_WORD_END)) {
+		/* drop the artificial newline */
+		len--;
+		/* we still have len>0 because it is a word, and
+		 * scan_word_boundaries() disallows words of length 0. */
+	}
+
 	buffer->current += len;
 
+	/* Peek past the end (this is safe because we overallocated)
+	 * to check if the next character was a skipped space. If so,
+	 * we put it together with the word. */
+	if (buffer->boundaries
+	    && buffer->boundaries[buffer->current] == DIFF_WORD_SKIP) {
+		buffer->current++;
+		len++;
+	}
+
 	if (ptr[len - 1] == '\n') {
 		eol = 1;
 		len--;
@@ -429,6 +462,141 @@ static void fn_out_diff_words_aux(void *priv, char *line, unsigned long len)
 	}
 }
 
+/*
+ * Fancy word splitting by regex
+ *
+ * We search for all non-overlapping matches of 'pattern' in the
+ * 'buf', and define every match as a (separate) word and all
+ * unmatched characters as whitespace.
+ *
+ * Then we build a new mmfile where each word is on a line of its own.
+ * Two of these can then be line-diffed to find the word differences
+ * between the original buffers.  Unlike the normal word diff (see
+ * diff_words_show() below), the transformation does not preserve
+ * character counts, so we need to keep tracking information in
+ * buf->boundaries for later use by print_word().
+ */
+static void scan_word_boundaries(regex_t *pattern, struct diff_words_buffer *buf,
+				 mmfile_t *mmfile)
+{
+	char *text = buf->text.ptr;
+	int len = buf->text.size;
+	int i = 0;
+	/* counts how many extra characters will be inserted into the
+	 * mmfile */
+	int count = 0;
+	int ret;
+	regmatch_t matches[1];
+	int offset, wordlen;
+	char *strz, *p;
+
+	/* overallocate by 1 so we can safely peek past the end for a
+	 * SKIP, see print_word() */
+	buf->boundaries = xmalloc((len+1) * sizeof(enum diff_word_boundaries));
+	buf->boundaries[len] = DIFF_WORD_UNDEF;
+
+	if (!text) {
+		mmfile->ptr = NULL;
+		mmfile->size = 0;
+		return;
+	}
+
+	/* we unfortunately need a null-terminated copy for regexec */
+	strz = xmalloc(len+1);
+	memcpy(strz, text, len);
+	strz[len] = '\0';
+
+	while (i < len) {
+		/* iteratively match the regex against the rest of the
+		 * input string to find the next word */
+		ret = regexec(pattern, strz+i, 1, matches, 0);
+		if (ret == REG_NOMATCH) {
+			/* The rest is whitespace.  The first space
+			 * character is flagged SKIP unless there was
+			 * no preceding text at all */
+			if (i > 0 && i < len) {
+				buf->boundaries[i++] = DIFF_WORD_SKIP;
+				count--; /* SKIP characters have no output */
+			}
+			while (i < len)
+				buf->boundaries[i++] = DIFF_WORD_SPACE;
+			break;
+		}
+
+		offset = matches[0].rm_so;
+
+		/* everything up to the next word is whitespace, using
+		 * the same SKIP rule as above */
+		if (offset > 0 && i > 0) {
+			buf->boundaries[i++] = DIFF_WORD_SKIP;
+			count--;
+			offset--;
+		}
+		while (offset-- > 0)
+			buf->boundaries[i++] = DIFF_WORD_SPACE;
+
+		/* rm_eo is the first character after the match, so
+		 * this is indeed the number of characters matched */
+		wordlen = matches[0].rm_eo - matches[0].rm_so;
+
+		/* all but the last character are BODY */
+		while (wordlen > 1) {
+			if (strz[i] == '\n')
+				die("word regex matched a newline before '%s'",
+				    strz+i+1);
+			buf->boundaries[i++] = DIFF_WORD_BODY;
+			wordlen--;
+		}
+		/* the last character is END, we will insert an extra
+		 * '\n' after it */
+		if (wordlen > 0) {
+			if (strz[i] == '\n')
+				die("word regex matched a newline before '%s'",
+				    strz+i+1);
+			buf->boundaries[i++] = DIFF_WORD_END;
+			count++;
+		} else {
+			/* this would cause an endless loop, so panic */
+			die("word regex matched the empty string at '%s'",
+			    strz+i);
+		}
+	}
+
+	free(strz);
+
+	/* now build the mmfile. there will be 'count' more characters
+	 * than in the original */
+	mmfile->size = len + count;
+	mmfile->ptr = xmalloc(mmfile->size);
+	p = mmfile->ptr;
+	for (i = 0; i < len; i++) {
+		switch (buf->boundaries[i]) {
+		case DIFF_WORD_BODY: /* copy over */
+			*p++ = text[i];
+			break;
+		case DIFF_WORD_END: /* copy and insert an artificial newline */
+			*p++ = text[i];
+			*p++ = '\n';
+			break;
+		case DIFF_WORD_SPACE: /* replace by '\n' */
+			*p++ = '\n';
+			break;
+		case DIFF_WORD_SKIP:
+			/* Ignore. Since the character right before
+			 * this one is always an END, another way to
+			 * look at it is that we avoid duplicate END
+			 * newlines that are already provided by
+			 * SPACE */
+			break;
+		case DIFF_WORD_UNDEF:
+			/* can't happen, but silences a warning */
+			die("can't happen, send test case to git@vger.kernel.org");
+			break;
+		}
+	}
+}
+
+
 /* this executes the word diff on the accumulated buffers */
 static void diff_words_show(struct diff_words_data *diff_words)
 {
@@ -441,7 +609,6 @@ static void diff_words_show(struct diff_words_data *diff_words)
 	memset(&xpp, 0, sizeof(xpp));
 	memset(&xecfg, 0, sizeof(xecfg));
 
-	/* currently always true */
 	if (!diff_words->word_regex) {
 		/*
 		 * "Simple" word diff: replace all space characters
@@ -465,6 +632,13 @@ static void diff_words_show(struct diff_words_data *diff_words)
 		for (i = 0; i < plus.size; i++)
 			if (isspace(plus.ptr[i]))
 				plus.ptr[i] = '\n';
+	} else {
+		/* Configurable word diff with a regex.  See
+		 * scan_word_boundaries() above. */
+		scan_word_boundaries(diff_words->word_regex,
+				     &diff_words->minus, &minus);
+		scan_word_boundaries(diff_words->word_regex,
+				     &diff_words->plus, &plus);
 	}
 	diff_words->minus.current = 0;
 	diff_words->plus.current = 0;
@@ -478,7 +652,6 @@ static void diff_words_show(struct diff_words_data *diff_words)
 	free(minus.ptr);
 	free(plus.ptr);
 	diff_words->minus.text.size = diff_words->plus.text.size = 0;
-	/* these two are currently free(NULL) */
 	free(diff_words->minus.boundaries);
 	free(diff_words->plus.boundaries);
 
@@ -1535,6 +1708,15 @@ static void builtin_diff(const char *name_a,
 			ecbdata.diff_words =
 				xcalloc(1, sizeof(struct diff_words_data));
 			ecbdata.diff_words->file = o->file;
+			if (o->word_regex) {
+				ecbdata.diff_words->word_regex = (regex_t *)
+					xmalloc(sizeof(regex_t));
+				if (regcomp(ecbdata.diff_words->word_regex,
+					    o->word_regex,
+					    REG_EXTENDED|REG_NEWLINE))
+					die ("Invalid regular expression: %s",
+					     o->word_regex);
+			}
 		}
 		xdi_diff_outf(&mf1, &mf2, fn_out_consume, &ecbdata,
 			      &xpp, &xecfg, &ecb);
@@ -2546,6 +2728,10 @@ int diff_opt_parse(struct diff_options *options, const char **av, int ac)
 		DIFF_OPT_CLR(options, COLOR_DIFF);
 	else if (!strcmp(arg, "--color-words"))
 		options->flags |= DIFF_OPT_COLOR_DIFF | DIFF_OPT_COLOR_DIFF_WORDS;
+	else if (!prefixcmp(arg, "--color-words=")) {
+		options->flags |= DIFF_OPT_COLOR_DIFF | DIFF_OPT_COLOR_DIFF_WORDS;
+		options->word_regex = arg + 14;
+	}
 	else if (!strcmp(arg, "--exit-code"))
 		DIFF_OPT_SET(options, EXIT_WITH_STATUS);
 	else if (!strcmp(arg, "--quiet"))
diff --git a/diff.h b/diff.h
index 4d5a327..23cd90c 100644
--- a/diff.h
+++ b/diff.h
@@ -98,6 +98,7 @@ struct diff_options {
 
 	int stat_width;
 	int stat_name_width;
+	const char *word_regex;
 
 	/* this is set by diffcore for DIFF_FORMAT_PATCH */
 	int found_changes;
-- 
1.6.1.269.g0769

^ permalink raw reply related	[flat|nested] 34+ messages in thread

* [PATCH v3 3/4] word diff: make regex configurable via attributes
  2009-01-11  1:34                     ` Junio C Hamano
                                         ` (2 preceding siblings ...)
  2009-01-11 10:27                       ` [PATCH v3 2/4] word diff: customizable word splits Thomas Rast
@ 2009-01-11 10:27                       ` Thomas Rast
  2009-01-11 23:20                         ` Junio C Hamano
  2009-01-11 10:27                       ` [PATCH v3 4/4] word diff: test customizable word splits Thomas Rast
  4 siblings, 1 reply; 34+ messages in thread
From: Thomas Rast @ 2009-01-11 10:27 UTC (permalink / raw)
  To: git; +Cc: Junio C Hamano, Johannes Schindelin, Teemu Likonen

Make the --color-words splitting regular expression configurable via
the diff driver's 'wordregex' attribute.  The user can then set the
driver on a file in .gitattributes.  If a regex is given on the
command line, it overrides the driver's setting.

We also provide built-in regexes for some of the languages that
already had funcname patterns.  (They are designed to run UTF-8
sequences into a single chunk to make sure they remain readable.)

Signed-off-by: Thomas Rast <trast@student.ethz.ch>
---
 Documentation/diff-options.txt  |    4 +++-
 Documentation/gitattributes.txt |   21 +++++++++++++++++++++
 diff.c                          |   10 ++++++++++
 userdiff.c                      |   27 +++++++++++++++++++--------
 userdiff.h                      |    1 +
 5 files changed, 54 insertions(+), 9 deletions(-)

diff --git a/Documentation/diff-options.txt b/Documentation/diff-options.txt
index 6152d5b..d22c06b 100644
--- a/Documentation/diff-options.txt
+++ b/Documentation/diff-options.txt
@@ -96,7 +96,9 @@ endif::git-format-patch[]
 	By default, a new word only starts at whitespace, so that a
 	'word' is defined as a maximal sequence of non-whitespace
 	characters.  The optional argument <regex> can be used to
-	configure this.
+	configure this.  It can also be set via a diff driver, see
+	linkgit:gitattributes[1]; if a <regex> is given explicitly, it
+	overrides any diff driver setting.
 +
 The <regex> must be an (extended) regular expression.  When set, every
 non-overlapping match of the <regex> is considered a word.  (Regular
diff --git a/Documentation/gitattributes.txt b/Documentation/gitattributes.txt
index 8af22ec..67f5522 100644
--- a/Documentation/gitattributes.txt
+++ b/Documentation/gitattributes.txt
@@ -334,6 +334,27 @@ patterns are available:
 - `tex` suitable for source code for LaTeX documents.
 
 
+Customizing word diff
+^^^^^^^^^^^^^^^^^^^^^
+
+You can customize the rules that `git diff --color-words` uses to
+split words in a line, by specifying an appropriate regular expression
+in the "diff.*.wordregex" configuration variable.  For example, in TeX
+a backslash followed by a sequence of letters forms a command, but
+several such commands can be run together without intervening
+whitespace.  To separate them, use a regular expression such as
+
+------------------------
+[diff "tex"]
+	wordregex = "\\\\[a-zA-Z]+|[{}]|\\\\.|[^\\{} \t]+"
+------------------------
+
+Similar to 'xfuncname', a built in value is provided for the drivers
+`bibtex`, `html`, `java`, `php`, `python` and `tex`.  See the
+documentation of --color-words in linkgit:git-diff[1] for the precise
+semantics.
+
+
 Performing text diffs of binary files
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
diff --git a/diff.c b/diff.c
index badaea6..c1cc426 100644
--- a/diff.c
+++ b/diff.c
@@ -1548,6 +1548,12 @@ static const struct userdiff_funcname *diff_funcname_pattern(struct diff_filespe
 	return one->driver->funcname.pattern ? &one->driver->funcname : NULL;
 }
 
+static const char *userdiff_word_regex(struct diff_filespec *one)
+{
+	diff_filespec_load_driver(one);
+	return one->driver->word_regex;
+}
+
 void diff_set_mnemonic_prefix(struct diff_options *options, const char *a, const char *b)
 {
 	if (!options->a_prefix)
@@ -1708,6 +1714,10 @@ static void builtin_diff(const char *name_a,
 			ecbdata.diff_words =
 				xcalloc(1, sizeof(struct diff_words_data));
 			ecbdata.diff_words->file = o->file;
+			if (!o->word_regex)
+				o->word_regex = userdiff_word_regex(one);
+			if (!o->word_regex)
+				o->word_regex = userdiff_word_regex(two);
 			if (o->word_regex) {
 				ecbdata.diff_words->word_regex = (regex_t *)
 					xmalloc(sizeof(regex_t));
diff --git a/userdiff.c b/userdiff.c
index 3681062..7fd9a07 100644
--- a/userdiff.c
+++ b/userdiff.c
@@ -6,13 +6,17 @@ static struct userdiff_driver *drivers;
 static int ndrivers;
 static int drivers_alloc;
 
-#define FUNCNAME(name, pattern) \
+#define FUNCNAME(name, pattern)			\
 	{ name, NULL, -1, { pattern, REG_EXTENDED } }
+#define PATTERNS(name, pattern, wordregex)			\
+	{ name, NULL, -1, { pattern, REG_EXTENDED }, NULL, wordregex }
 static struct userdiff_driver builtin_drivers[] = {
-FUNCNAME("html", "^[ \t]*(<[Hh][1-6][ \t].*>.*)$"),
-FUNCNAME("java",
+PATTERNS("html", "^[ \t]*(<[Hh][1-6][ \t].*>.*)$",
+	 "[^<>= \t]+|\\S"),
+PATTERNS("java",
 	 "!^[ \t]*(catch|do|for|if|instanceof|new|return|switch|throw|while)\n"
-	 "^[ \t]*(([ \t]*[A-Za-z_][A-Za-z_0-9]*){2,}[ \t]*\\([^;]*)$"),
+	 "^[ \t]*(([ \t]*[A-Za-z_][A-Za-z_0-9]*){2,}[ \t]*\\([^;]*)$",
+	 "[a-zA-Z_][a-zA-Z0-9_]*|[-+0-9.e]+|[-+*/]=|\\+\\+|--|\\S|[\x80-\xff]+"),
 FUNCNAME("objc",
 	 /* Negate C statements that can look like functions */
 	 "!^[ \t]*(do|for|if|else|return|switch|while)\n"
@@ -27,14 +31,19 @@ FUNCNAME("pascal",
 		"implementation|initialization|finalization)[ \t]*.*)$"
 	 "\n"
 	 "^(.*=[ \t]*(class|record).*)$"),
-FUNCNAME("php", "^[\t ]*((function|class).*)"),
-FUNCNAME("python", "^[ \t]*((class|def)[ \t].*)$"),
+PATTERNS("php", "^[\t ]*((function|class).*)",
+	 "\\$?[a-zA-Z_][a-zA-Z0-9_]*|[-+0-9.e]+|[-+*/]=|\\+\\+|--|->|\\S|[\x80-\xff]+"),
+PATTERNS("python", "^[ \t]*((class|def)[ \t].*)$",
+	 "[a-zA-Z_][a-zA-Z0-9_]*|[-+0-9.e]+|[-+*/]=|//|\\S|[\x80-\xff]+"),
 FUNCNAME("ruby", "^[ \t]*((class|module|def)[ \t].*)$"),
-FUNCNAME("bibtex", "(@[a-zA-Z]{1,}[ \t]*\\{{0,1}[ \t]*[^ \t\"@',\\#}{~%]*).*$"),
-FUNCNAME("tex", "^(\\\\((sub)*section|chapter|part)\\*{0,1}\\{.*)$"),
+PATTERNS("bibtex", "(@[a-zA-Z]{1,}[ \t]*\\{{0,1}[ \t]*[^ \t\"@',\\#}{~%]*).*$",
+	 "[={}\"]|[^={}\" \t]+"),
+PATTERNS("tex", "^(\\\\((sub)*section|chapter|part)\\*{0,1}\\{.*)$",
+	 "\\\\[a-zA-Z@]+|[{}]|\\\\.|[^\\{} \t]+"),
 { "default", NULL, -1, { NULL, 0 } },
 };
 #undef FUNCNAME
+#undef PATTERNS
 
 static struct userdiff_driver driver_true = {
 	"diff=true",
@@ -134,6 +143,8 @@ int userdiff_config(const char *k, const char *v)
 		return parse_string(&drv->external, k, v);
 	if ((drv = parse_driver(k, v, "textconv")))
 		return parse_string(&drv->textconv, k, v);
+	if ((drv = parse_driver(k, v, "wordregex")))
+		return parse_string(&drv->word_regex, k, v);
 
 	return 0;
 }
diff --git a/userdiff.h b/userdiff.h
index ba29457..2aab13e 100644
--- a/userdiff.h
+++ b/userdiff.h
@@ -12,6 +12,7 @@ struct userdiff_driver {
 	int binary;
 	struct userdiff_funcname funcname;
 	const char *textconv;
+	const char *word_regex;
 };
 
 int userdiff_config(const char *k, const char *v);
-- 
1.6.1.269.g0769

^ permalink raw reply related	[flat|nested] 34+ messages in thread

* [PATCH v3 4/4] word diff: test customizable word splits
  2009-01-11  1:34                     ` Junio C Hamano
                                         ` (3 preceding siblings ...)
  2009-01-11 10:27                       ` [PATCH v3 3/4] word diff: make regex configurable via attributes Thomas Rast
@ 2009-01-11 10:27                       ` Thomas Rast
  4 siblings, 0 replies; 34+ messages in thread
From: Thomas Rast @ 2009-01-11 10:27 UTC (permalink / raw)
  To: git; +Cc: Junio C Hamano, Johannes Schindelin, Teemu Likonen

Several tests for regex-configured word splits via command line and
gitattributes.  For good measure we also do a basic test of the
default --color-words since it was so far not covered at all.

Signed-off-by: Thomas Rast <trast@student.ethz.ch>
---
 t/t4033-diff-color-words.sh |   90 +++++++++++++++++++++++++++++++++++++++++++
 1 files changed, 90 insertions(+), 0 deletions(-)
 create mode 100755 t/t4033-diff-color-words.sh

diff --git a/t/t4033-diff-color-words.sh b/t/t4033-diff-color-words.sh
new file mode 100755
index 0000000..536cdac
--- /dev/null
+++ b/t/t4033-diff-color-words.sh
@@ -0,0 +1,90 @@
+#!/bin/sh
+
+
+test_description='diff --color-words'
+. ./test-lib.sh
+
+cat <<EOF > test_a
+foo_bar_baz
+a qu_ux b c
+alpha beta gamma delta
+EOF
+
+cat <<EOF > test_b
+foo_baz_baz
+a qu_new_ux b c
+alpha 4 2 delta
+EOF
+
+# t4026-diff-color.sh tests the color escapes, so we assume they do
+# not change
+
+munge () {
+    tail -n +5 | tr '\033' '!'
+}
+
+cat <<EOF > expect-plain
+![36m@@ -1,3 +1,3 @@![m
+![31mfoo_bar_baz![m![32mfoo_baz_baz![m
+a ![m![31mqu_ux ![m![32mqu_new_ux ![mb ![mc![m
+alpha ![m![31mbeta ![m![31mgamma ![m![32m4 ![m![32m2 ![mdelta![m
+EOF
+
+test_expect_success 'default settings' '
+	git diff --no-index --color-words test_a test_b |
+		munge > actual-plain &&
+	test_cmp expect-plain actual-plain
+'
+
+test_expect_success 'trivial regex yields same as default' '
+	git diff --no-index --color-words="\\S+" test_a test_b |
+		munge > actual-trivial &&
+	test_cmp expect-plain actual-trivial
+'
+
+cat <<EOF > expect-chars
+![36m@@ -1,3 +1,3 @@![m
+f![mo![mo![m_![mb![ma![m![31mr![m![32mz![m_![mb![ma![mz![m
+a ![mq![mu![m_![m![32mn![m![32me![m![32mw![m![32m_![mu![mx ![mb ![mc![m
+a![ml![mp![mh![ma ![m![31mb![m![31me![m![31mt![m![31ma ![m![31mg![m![31ma![m![31mm![m![31mm![m![31ma ![m![32m4 ![m![32m2 ![md![me![ml![mt![ma![m
+EOF
+
+test_expect_success 'character by character regex' '
+	git diff --no-index --color-words="\\S" test_a test_b |
+		munge > actual-chars &&
+	test_cmp expect-chars actual-chars
+'
+
+cat <<EOF > expect-nontrivial
+![36m@@ -1,3 +1,3 @@![m
+foo![m_![m![31mbar![m![32mbaz![m_![mbaz![m
+a ![mqu![m_![m![32mnew![m![32m_![mux ![mb ![mc![m
+alpha ![m![31mbeta ![m![31mgamma ![m![32m4![m![32m ![m![32m2![m![32m ![mdelta![m
+EOF
+
+test_expect_success 'nontrivial regex' '
+	git diff --no-index --color-words="[a-z]+|_" test_a test_b |
+		munge > actual-nontrivial &&
+	test_cmp expect-nontrivial actual-nontrivial
+'
+
+test_expect_success 'set a diff driver' '
+	git config diff.testdriver.wordregex "\\S" &&
+	cat <<EOF > .gitattributes
+test_* diff=testdriver
+EOF
+'
+
+test_expect_success 'use default supplied by driver' '
+	git diff --no-index --color-words test_a test_b |
+		munge > actual-chars-2 &&
+	test_cmp expect-chars actual-chars-2
+'
+
+test_expect_success 'option overrides default' '
+	git diff --no-index --color-words="[a-z]+|_" test_a test_b |
+		munge > actual-nontrivial-2 &&
+	test_cmp expect-nontrivial actual-nontrivial-2
+'
+
+test_done
-- 
1.6.1.269.g0769

^ permalink raw reply related	[flat|nested] 34+ messages in thread

* Re: [PATCH v3 1/4] word diff: comments, preparations for regex customization
  2009-01-11 10:27                       ` [PATCH v3 1/4] word diff: comments, preparations for regex customization Thomas Rast
@ 2009-01-11 13:41                         ` Johannes Schindelin
  2009-01-11 19:49                         ` Johannes Schindelin
  2009-01-11 22:19                         ` Junio C Hamano
  2 siblings, 0 replies; 34+ messages in thread
From: Johannes Schindelin @ 2009-01-11 13:41 UTC (permalink / raw)
  To: Thomas Rast; +Cc: git, Junio C Hamano, Teemu Likonen

Hi,

On Sun, 11 Jan 2009, Thomas Rast wrote:

> This reorganizes the code for diff --color-words in a way that will be
> convenient for the next patch, without changing any of the semantics.
> The new variables are not used yet except for their default state.
> 
> We also add some comments on the workings of diff_words_show() and
> associated helper routines.
> 
> Signed-off-by: Thomas Rast <trast@student.ethz.ch>
> ---

It not only got bigger, it still fails to explain the idea to me.

I'll work on it myself, then,
Dscho

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v3 1/4] word diff: comments, preparations for regex customization
  2009-01-11 10:27                       ` [PATCH v3 1/4] word diff: comments, preparations for regex customization Thomas Rast
  2009-01-11 13:41                         ` Johannes Schindelin
@ 2009-01-11 19:49                         ` Johannes Schindelin
  2009-01-11 22:19                         ` Junio C Hamano
  2 siblings, 0 replies; 34+ messages in thread
From: Johannes Schindelin @ 2009-01-11 19:49 UTC (permalink / raw)
  To: Thomas Rast; +Cc: git, Junio C Hamano, Teemu Likonen

Hi,

On Sun, 11 Jan 2009, Thomas Rast wrote:


> +/* unused right now */
> +enum diff_word_boundaries {
> +	DIFF_WORD_UNDEF,
> +	DIFF_WORD_BODY,
> +	DIFF_WORD_END,
> +	DIFF_WORD_SPACE,
> +	DIFF_WORD_SKIP
> +};
> +

Just to illustrate why I reacted so harshly to this patch series:  this 
change is utterly useless.

You do not use the word boundaries at all throughout the complete patch.  
This would have been the only way you could have possibly illustrated how 
the enum is to be used at all, given that you did not document what the 
enum should do eventually.

With such a patch that leaves me as clueless as to the way you want to fix 
the issues as before, I regret having spent the time to look at it at all.

What you _should_ have done:

- _not_ introduce the regex.  That is not what the first patch should be 
  about.

- _use_ the enum for the existing functionality.

- _explain_ what the different values of the enum are about.

- _say_ what the enum pointer points at (is it a single value?  an array?  
  if it is an array, is it per letter?  per word?)

If you are honest, you'll admit that this patch would leave you puzzled 
_yourself_, 6 months from now.

FWIW I think I have a better patch series, to be sent in a few minutes.

Ciao,
Dscho

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v3 1/4] word diff: comments, preparations for regex customization
  2009-01-11 10:27                       ` [PATCH v3 1/4] word diff: comments, preparations for regex customization Thomas Rast
  2009-01-11 13:41                         ` Johannes Schindelin
  2009-01-11 19:49                         ` Johannes Schindelin
@ 2009-01-11 22:19                         ` Junio C Hamano
  2 siblings, 0 replies; 34+ messages in thread
From: Junio C Hamano @ 2009-01-11 22:19 UTC (permalink / raw)
  To: Thomas Rast; +Cc: git, Johannes Schindelin, Teemu Likonen

Thomas Rast <trast@student.ethz.ch> writes:

> +/* unused right now */
> +enum diff_word_boundaries {
> +	DIFF_WORD_UNDEF,
> +	DIFF_WORD_BODY,
> +	DIFF_WORD_END,
> +	DIFF_WORD_SPACE,
> +	DIFF_WORD_SKIP
> +};

Don't do this.  Please introduce them when you start using them.

>  struct diff_words_buffer {
>  	mmfile_t text;
>  	long alloc;
>  	long current; /* output pointer */
>  	int suppressed_newline;
> +	enum diff_word_boundaries *boundaries;
>  };

Likewise.  Especially because this raises eyebrows "Huh, a pointer to an
enum, or perhaps he allocates an array of enums?" without allowing the
reader to figure it out much later when the field is actually used.

>  static void diff_words_append(char *line, unsigned long len,
> @@ -339,16 +349,23 @@ static void diff_words_append(char *line, unsigned long len,
>  struct diff_words_data {
>  	struct diff_words_buffer minus, plus;
>  	FILE *file;
> +	regex_t *word_regex; /* currently unused */
>  };

I see having this here and keeping it NULL in this patch makes the later
patch to diff_words_show() more readable, so this probably should stay
here.

> -static void print_word(FILE *file, struct diff_words_buffer *buffer, int len, int color,
> +/*
> + * Print 'len' characters from the "real" diff data in 'buffer'.  Also
> + * returns how many characters were printed (currently always 'len').
> + * With 'suppress_newline', we remember a final newline instead of
> + * printing it.
> + */

"... Even in such a case, 'len' that is returned counts the suppressed
newline", or something like that?  If you can concisely describe why the
caller wants the returned count not match the number of actually printed
chars (i.e. it includes the suppressed newline), it would help the reader
understand the logic.  I am _guessing_ it is because this is called to
print matching words from the preimage buffer, and the return value is
used to skip over the same part in the postimage buffer, and by definition
they have to be of the same length (otherwise they won't be matching).

> +static int print_word(FILE *file, struct diff_words_buffer *buffer, int len, int color,
>  		int suppress_newline)
>  {
>  	const char *ptr;
>  	int eol = 0;
>  
>  	if (len == 0)
> -		return;
> +		return len;
>  
>  	ptr  = buffer->text.ptr + buffer->current;
>  	buffer->current += len;
> @@ -368,18 +385,30 @@ static void print_word(FILE *file, struct diff_words_buffer *buffer, int len, in
>  		else
>  			putc('\n', file);
>  	}
> +
> +	/* we need to return how many chars to skip on the other side,
> +	 * so account for the (held off) \n */

Multi-line comment style?  I won't repeat this but you have many...

> +	return len+eol;
>  }
>  
> +/*
> + * Callback for word diff output
> + */

Without saying "to do what", the comment adds more noise than signal.
"Called to parse diff between pre- and post- image files converted into
one-word-per-line format and concatenate them to into lines by dropping
some of the end-of-lines but keeping some others", or something like that?

Thanks.

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v3 2/4] word diff: customizable word splits
  2009-01-11 10:27                       ` [PATCH v3 2/4] word diff: customizable word splits Thomas Rast
@ 2009-01-11 22:20                         ` Junio C Hamano
  0 siblings, 0 replies; 34+ messages in thread
From: Junio C Hamano @ 2009-01-11 22:20 UTC (permalink / raw)
  To: Thomas Rast; +Cc: git, Johannes Schindelin, Teemu Likonen

Thomas Rast <trast@student.ethz.ch> writes:

> Allows for user-configurable word splits when using --color-words.
> This can make the diff more readable if the regex is configured
> according to the language of the file.
>
> Each non-overlapping match of the regex is a word; everything in
> between is whitespace.

What happens if the input "language" does not have any inter-word spacing
but its words can still be expressed by regexp patterns?

ImagineALanguageThatAllowsYouToWriteSomethingLikeThis.  Does the mechanism
help users who want to do word-diff files written in such a language by
outputting:

	ImagineALanguage<red>That</red><green>Which</green>AllowsYou...

when '[A-Z][a-z]*' is given by the word pattern?

> We disallow matching the empty string (because
> it results in an endless loop) or a newline (breaks color escapes and
> interacts badly with the input coming from the usual line diff).  To
> help the user, we set REG_NEWLINE so that [^...] and . do not match
> newlines.

AndImagineALanguageWhoseWordStruc
tureDoesNotCareAboutLineBreak

Can you help users with such payload?

	Side note.  Yes, I am coming from Japanese background.

        Side note 2.  No, I am not saying your code must support both of
        the above to be acceptable.  I am just gauging the design
        assumptions and limitations.

> Insertion of spaces is somewhat subtle.  We echo a "context" space
> twice (once on each side of the diff) if it follows directly after a
> word.  While this loses a tiny bit of accuracy, it runs together long
> sequences of changed word into one removed and one added block, making
> the diff much more readable.

I guess this part can be later enhanced to be more precise, so that it
keeps the original context space more faithfully (i.e. does not lose two
consecutive spaces in the original occidental script, and does not insert
any extra space to the oriental script), if we were to support the second
example I gave above in the future as a follow-up patch.

> +--color-words[=<regex>]::
> +	Show colored word diff, i.e., color words which have changed.
> +	By default, a new word only starts at whitespace, so that a
> +	'word' is defined as a maximal sequence of non-whitespace
> +	characters.  The optional argument <regex> can be used to
> +	configure this.
> ++
> +The <regex> must be an (extended) regular expression.  When set, every
> +non-overlapping match of the <regex> is considered a word.  (Regular
> +expression semantics ensure that quantifiers grab a maximal sequence
> +of characters.)  Anything between these matches is considered
> +whitespace and ignored for the purposes of finding differences.  You
> +may want to append `|\S` to your regular expression to make sure that
> +it matches all non-whitespace characters.

Whose regexp library do we assume here?  Traditionally we limited
ourselves to POSIX BRE, and I do not think anybody minds using POSIX ERE
here, but we need to be clear.  In either case \S is a pcre outside
POSIX.

The rest I only skimmed but did not spot anything glaringly wrong; thanks.

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v3 3/4] word diff: make regex configurable via attributes
  2009-01-11 10:27                       ` [PATCH v3 3/4] word diff: make regex configurable via attributes Thomas Rast
@ 2009-01-11 23:20                         ` Junio C Hamano
  0 siblings, 0 replies; 34+ messages in thread
From: Junio C Hamano @ 2009-01-11 23:20 UTC (permalink / raw)
  To: Thomas Rast; +Cc: git, Johannes Schindelin, Teemu Likonen

Thomas Rast <trast@student.ethz.ch> writes:

> Make the --color-words splitting regular expression configurable via
> the diff driver's 'wordregex' attribute.  The user can then set the
> driver on a file in .gitattributes.  If a regex is given on the
> command line, it overrides the driver's setting.
> ...
> diff --git a/Documentation/diff-options.txt b/Documentation/diff-options.txt
> index 6152d5b..d22c06b 100644
> --- a/Documentation/diff-options.txt
> +++ b/Documentation/diff-options.txt
> @@ -96,7 +96,9 @@ endif::git-format-patch[]
>  	By default, a new word only starts at whitespace, so that a
>  	'word' is defined as a maximal sequence of non-whitespace
>  	characters.  The optional argument <regex> can be used to
> -	configure this.
> +	configure this.  It can also be set via a diff driver, see
> +	linkgit:gitattributes[1]; if a <regex> is given explicitly, it
> +	overrides any diff driver setting.
>  +
>  The <regex> must be an (extended) regular expression.  When set, every
>  non-overlapping match of the <regex> is considered a word.  (Regular

One bikeshedding I think is better to get over with is that this probably
should be called xwordregex for consistency with xfuncname where 'x'
variant means POSIX ERE and the one without means POSIX BRE, even if we
are not going to support diff.wordregex that uses BRE.

I am assuming [3/4] can be trivially adjusted if we were to adopt the
clean-up approach Dscho is taking?

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v2] make diff --color-words customizable
  2009-01-10 14:08                     ` Johannes Schindelin
@ 2009-01-12 23:59                       ` Jakub Narebski
  2009-01-13  0:40                         ` Johannes Schindelin
  0 siblings, 1 reply; 34+ messages in thread
From: Jakub Narebski @ 2009-01-12 23:59 UTC (permalink / raw)
  To: Johannes Schindelin; +Cc: git, Davide Libenzi, Thomas Rast

Hello!

On Sat, 10 Jan 2009, Johannes Schindelin wrote:
> On Sat, 10 Jan 2009, Jakub Narebski wrote:
>> On Sat, 10 Jan 2009, Johannes Schindelin wrote:
>>> On Sat, 10 Jan 2009, Jakub Narebski wrote:
>>>> Thomas Rast wrote:
>>>> 
>>>>> --color-words works (and always worked) by splitting words onto one
>>>>> line each, and using the normal line-diff machinery to get a word
>>>>> diff. 
>>>> 
>>>> Cannot we generalize diff machinery / use underlying LCS diff engine
>>>> instead of going through line diff?
>>> 
>>> What do you think we're doing?  libxdiff is pretty hardcoded to newlines.  
>>> That's why we're substituting non-word characters with newlines.
>> 
>> Isn't Meyers algorithm used by libxdiff based on LCS, largest common
>> subsequence, and doesn't it generate from the mathematical point of
>> view "diff" between two sequences (two arrays) which just happen to
>> be lines? It is a bit strange that libxdiff doesn't export its low
>> level algorithm...
> 
> Umm.
> 
> It _is_ Myers' algorithm.  It just so happens that libxdiff hardcodes 
> newline to be the separator.

So amd I to understand that _exported_ functions hardcode separator
to be newline (most probably for performance), and there is no function
in libxdiff which calculates LCS, or returns diff for arrays
(sequences)?

-- 
Jakub Narebski
Poland

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v2] make diff --color-words customizable
  2009-01-12 23:59                       ` Jakub Narebski
@ 2009-01-13  0:40                         ` Johannes Schindelin
  0 siblings, 0 replies; 34+ messages in thread
From: Johannes Schindelin @ 2009-01-13  0:40 UTC (permalink / raw)
  To: Jakub Narebski; +Cc: git, Davide Libenzi, Thomas Rast

Hi,

On Tue, 13 Jan 2009, Jakub Narebski wrote:

> On Sat, 10 Jan 2009, Johannes Schindelin wrote:
> > On Sat, 10 Jan 2009, Jakub Narebski wrote:
> >> On Sat, 10 Jan 2009, Johannes Schindelin wrote:
> >>> On Sat, 10 Jan 2009, Jakub Narebski wrote:
> >>>> Thomas Rast wrote:
> >>>> 
> >>>>> --color-words works (and always worked) by splitting words onto one
> >>>>> line each, and using the normal line-diff machinery to get a word
> >>>>> diff. 
> >>>> 
> >>>> Cannot we generalize diff machinery / use underlying LCS diff engine
> >>>> instead of going through line diff?
> >>> 
> >>> What do you think we're doing?  libxdiff is pretty hardcoded to newlines.  
> >>> That's why we're substituting non-word characters with newlines.
> >> 
> >> Isn't Meyers algorithm used by libxdiff based on LCS, largest common
> >> subsequence, and doesn't it generate from the mathematical point of
> >> view "diff" between two sequences (two arrays) which just happen to
> >> be lines? It is a bit strange that libxdiff doesn't export its low
> >> level algorithm...
> > 
> > Umm.
> > 
> > It _is_ Myers' algorithm.  It just so happens that libxdiff hardcodes 
> > newline to be the separator.
> 
> So amd I to understand that _exported_ functions hardcode separator
> to be newline (most probably for performance), and there is no function
> in libxdiff which calculates LCS, or returns diff for arrays
> (sequences)?

That is my understanding, yes.

Ciao,
Dscho

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v2] make diff --color-words customizable
  2009-01-10 17:53                     ` Davide Libenzi
@ 2009-01-13  0:52                       ` Jakub Narebski
  2009-01-13 18:50                         ` Davide Libenzi
  0 siblings, 1 reply; 34+ messages in thread
From: Jakub Narebski @ 2009-01-13  0:52 UTC (permalink / raw)
  To: Davide Libenzi; +Cc: Johannes Schindelin, git, Thomas Rast

On Sat, 10 Jan 2009, Davide Libenzi wrote:
> On Sat, 10 Jan 2009, Jakub Narebski wrote:
>> On Sat, 10 Jan 2009, Johannes Schindelin wrote:
>>> On Sat, 10 Jan 2009, Jakub Narebski wrote:
>>>> Thomas Rast wrote:
>>>> 
>>>>> --color-words works (and always worked) by splitting words onto one
>>>>> line each, and using the normal line-diff machinery to get a word
>>>>> diff. 
>>>> 
>>>> Cannot we generalize diff machinery / use underlying LCS diff engine
>>>> instead of going through line diff?
>>> 
>>> What do you think we're doing?  libxdiff is pretty hardcoded to newlines.  
>>> That's why we're substituting non-word characters with newlines.
>> 
>> Isn't Meyers algorithm used by libxdiff based on LCS, largest common
>> subsequence, and doesn't it generate from the mathematical point of
>> view "diff" between two sequences (two arrays) which just happen to
>> be lines? It is a bit strange that libxdiff doesn't export its low
>> level algorithm...
> 
> The core doesn't know anything about lines. Only pre-processing (setting 
> up the hash by tokenizing the input) and post-processing (adding '\n' to 
> the end of each token), knows about newlines. Memory consumption would 
> increase significantly though, since there is a per-token cost, and a 
> word-based diff will create more of them WRT the same input.

Is this core algorithm available as some exported function in libxdiff?
I mean would it be easy to replace default line tokenizer (per-line
pre-processing) and post-processing to better deal with word diff?

The other side would be to generate per-paragraph diffs (with empty
line being separator)...
-- 
Jakub Narebski
Poland

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v2] make diff --color-words customizable
  2009-01-13  0:52                       ` Jakub Narebski
@ 2009-01-13 18:50                         ` Davide Libenzi
  0 siblings, 0 replies; 34+ messages in thread
From: Davide Libenzi @ 2009-01-13 18:50 UTC (permalink / raw)
  To: Jakub Narebski; +Cc: Johannes Schindelin, git, Thomas Rast

On Tue, 13 Jan 2009, Jakub Narebski wrote:

> Is this core algorithm available as some exported function in libxdiff?
> I mean would it be easy to replace default line tokenizer (per-line
> pre-processing) and post-processing to better deal with word diff?

In libxdiff, no. I hadn't thought to export the raw Meyer algo, and nobody 
ever asked before.


> The other side would be to generate per-paragraph diffs (with empty
> line being separator)...

In Git I guess you can use it to generate other kind of diffs. I don't see 
any problems with that. Just requires more memory WRT a line based one.


- Davide

^ permalink raw reply	[flat|nested] 34+ messages in thread

end of thread, other threads:[~2009-01-13 18:52 UTC | newest]

Thread overview: 34+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2009-01-09  0:05 [RFC PATCH] make diff --color-words customizable Thomas Rast
2009-01-09  0:25 ` Johannes Schindelin
2009-01-09  0:50   ` Thomas Rast
2009-01-09 11:15     ` Johannes Schindelin
2009-01-09 11:59       ` [ILLUSTRATION PATCH] color-words: take an optional regular expression describing words Johannes Schindelin
2009-01-09 12:24         ` Thomas Rast
2009-01-09 13:05           ` Teemu Likonen
2009-01-10  0:57             ` [PATCH v2] make diff --color-words customizable Thomas Rast
2009-01-10  1:50               ` Jakub Narebski
2009-01-10 11:37                 ` Johannes Schindelin
2009-01-10 13:36                   ` Jakub Narebski
2009-01-10 14:08                     ` Johannes Schindelin
2009-01-12 23:59                       ` Jakub Narebski
2009-01-13  0:40                         ` Johannes Schindelin
2009-01-10 17:53                     ` Davide Libenzi
2009-01-13  0:52                       ` Jakub Narebski
2009-01-13 18:50                         ` Davide Libenzi
2009-01-10 10:49               ` Johannes Schindelin
2009-01-10 11:25                 ` Thomas Rast
2009-01-10 11:45                   ` Johannes Schindelin
2009-01-11  1:34                     ` Junio C Hamano
2009-01-11 10:27                       ` [PATCH v3 0/4] customizable --color-words Thomas Rast
2009-01-11 10:27                       ` [PATCH v3 1/4] word diff: comments, preparations for regex customization Thomas Rast
2009-01-11 13:41                         ` Johannes Schindelin
2009-01-11 19:49                         ` Johannes Schindelin
2009-01-11 22:19                         ` Junio C Hamano
2009-01-11 10:27                       ` [PATCH v3 2/4] word diff: customizable word splits Thomas Rast
2009-01-11 22:20                         ` Junio C Hamano
2009-01-11 10:27                       ` [PATCH v3 3/4] word diff: make regex configurable via attributes Thomas Rast
2009-01-11 23:20                         ` Junio C Hamano
2009-01-11 10:27                       ` [PATCH v3 4/4] word diff: test customizable word splits Thomas Rast
2009-01-09  9:53 ` [RFC PATCH] make diff --color-words customizable Jeff King
2009-01-09 11:18   ` Johannes Schindelin
2009-01-09 11:22     ` Jeff King

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.