All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] fetch: align new ref summary printout in UTF-8 locales
@ 2012-09-03 11:10 Nguyễn Thái Ngọc Duy
  2012-09-03 19:26 ` Junio C Hamano
  2012-09-04 10:39 ` [PATCH v2] " Nguyễn Thái Ngọc Duy
  0 siblings, 2 replies; 14+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2012-09-03 11:10 UTC (permalink / raw)
  To: git; +Cc: Junio C Hamano, Nguyễn Thái Ngọc Duy

fetch does printf("%-*s", width, "foo") where "foo" can be an utf-8
string, but width is bytes, not letters. This results in misaligned
ref summary table.

Introduce gettext_length() function that returns the string length in
letters. Make the code use TRANSPORT_SUMMARY(x) where the length is
compensated properly in utf-8 locales.
---
 gettext_length() can be made to support other charsets too. But I'm
 on utf-8, it's not my itch.

 Grepping through '%-*s' does not reveal any other places that obviously
 need adjustment like this (apply and remote might, but pathnames and
 remote names are usually in ascii)

 builtin/fetch.c | 22 ++++++++++------------
 gettext.c       | 14 ++++++++++++--
 gettext.h       |  5 +++++
 transport.c     |  2 +-
 transport.h     |  1 +
 5 files changed, 29 insertions(+), 15 deletions(-)

diff --git a/builtin/fetch.c b/builtin/fetch.c
index bb9a074..d7406d2 100644
--- a/builtin/fetch.c
+++ b/builtin/fetch.c
@@ -255,9 +255,8 @@ static int update_local_ref(struct ref *ref,
 	if (!hashcmp(ref->old_sha1, ref->new_sha1)) {
 		if (verbosity > 0)
 			strbuf_addf(display, "= %-*s %-*s -> %s",
-				    TRANSPORT_SUMMARY_WIDTH,
-				    _("[up to date]"), REFCOL_WIDTH,
-				    remote, pretty_ref);
+				    TRANSPORT_SUMMARY(_("[up to date]")),
+				    REFCOL_WIDTH, remote, pretty_ref);
 		return 0;
 	}
 
@@ -271,7 +270,7 @@ static int update_local_ref(struct ref *ref,
 		 */
 		strbuf_addf(display,
 			    _("! %-*s %-*s -> %s  (can't fetch in current branch)"),
-			    TRANSPORT_SUMMARY_WIDTH, _("[rejected]"),
+			    TRANSPORT_SUMMARY(_("[rejected]")),
 			    REFCOL_WIDTH, remote, pretty_ref);
 		return 1;
 	}
@@ -282,7 +281,7 @@ static int update_local_ref(struct ref *ref,
 		r = s_update_ref("updating tag", ref, 0);
 		strbuf_addf(display, "%c %-*s %-*s -> %s%s",
 			    r ? '!' : '-',
-			    TRANSPORT_SUMMARY_WIDTH, _("[tag update]"),
+			    TRANSPORT_SUMMARY(_("[tag update]")),
 			    REFCOL_WIDTH, remote, pretty_ref,
 			    r ? _("  (unable to update local ref)") : "");
 		return r;
@@ -317,7 +316,7 @@ static int update_local_ref(struct ref *ref,
 		r = s_update_ref(msg, ref, 0);
 		strbuf_addf(display, "%c %-*s %-*s -> %s%s",
 			    r ? '!' : '*',
-			    TRANSPORT_SUMMARY_WIDTH, what,
+			    TRANSPORT_SUMMARY(what),
 			    REFCOL_WIDTH, remote, pretty_ref,
 			    r ? _("  (unable to update local ref)") : "");
 		return r;
@@ -335,7 +334,7 @@ static int update_local_ref(struct ref *ref,
 		r = s_update_ref("fast-forward", ref, 1);
 		strbuf_addf(display, "%c %-*s %-*s -> %s%s",
 			    r ? '!' : ' ',
-			    TRANSPORT_SUMMARY_WIDTH, quickref,
+			    TRANSPORT_SUMMARY(quickref),
 			    REFCOL_WIDTH, remote, pretty_ref,
 			    r ? _("  (unable to update local ref)") : "");
 		return r;
@@ -351,13 +350,13 @@ static int update_local_ref(struct ref *ref,
 		r = s_update_ref("forced-update", ref, 1);
 		strbuf_addf(display, "%c %-*s %-*s -> %s  (%s)",
 			    r ? '!' : '+',
-			    TRANSPORT_SUMMARY_WIDTH, quickref,
+			    TRANSPORT_SUMMARY(quickref),
 			    REFCOL_WIDTH, remote, pretty_ref,
 			    r ? _("unable to update local ref") : _("forced update"));
 		return r;
 	} else {
 		strbuf_addf(display, "! %-*s %-*s -> %s  %s",
-			    TRANSPORT_SUMMARY_WIDTH, _("[rejected]"),
+			    TRANSPORT_SUMMARY(_("[rejected]")),
 			    REFCOL_WIDTH, remote, pretty_ref,
 			    _("(non-fast-forward)"));
 		return 1;
@@ -479,8 +478,7 @@ static int store_updated_refs(const char *raw_url, const char *remote_name,
 				free(ref);
 			} else
 				strbuf_addf(&note, "* %-*s %-*s -> FETCH_HEAD",
-					    TRANSPORT_SUMMARY_WIDTH,
-					    *kind ? kind : "branch",
+					    TRANSPORT_SUMMARY(*kind ? kind : "branch"),
 					    REFCOL_WIDTH,
 					    *what ? what : "HEAD");
 			if (note.len) {
@@ -554,7 +552,7 @@ static int prune_refs(struct refspec *refs, int ref_count, struct ref *ref_map)
 			result |= delete_ref(ref->name, NULL, 0);
 		if (verbosity >= 0) {
 			fprintf(stderr, " x %-*s %-*s -> %s\n",
-				TRANSPORT_SUMMARY_WIDTH, _("[deleted]"),
+				TRANSPORT_SUMMARY(_("[deleted]")),
 				REFCOL_WIDTH, _("(none)"), prettify_refname(ref->name));
 			warn_dangling_symref(stderr, dangling_msg, ref->name);
 		}
diff --git a/gettext.c b/gettext.c
index f75bca7..e9f0f0d 100644
--- a/gettext.c
+++ b/gettext.c
@@ -4,6 +4,8 @@
 
 #include "git-compat-util.h"
 #include "gettext.h"
+#include "strbuf.h"
+#include "utf8.h"
 
 #ifndef NO_GETTEXT
 #	include <locale.h>
@@ -27,10 +29,9 @@ int use_gettext_poison(void)
 #endif
 
 #ifndef NO_GETTEXT
+static const char *charset;
 static void init_gettext_charset(const char *domain)
 {
-	const char *charset;
-
 	/*
 	   This trick arranges for messages to be emitted in the user's
 	   requested encoding, but avoids setting LC_CTYPE from the
@@ -128,4 +129,13 @@ void git_setup_gettext(void)
 	init_gettext_charset("git");
 	textdomain("git");
 }
+
+int gettext_length(const char *s)
+{
+	static int is_utf8 = -1;
+	if (is_utf8 == -1)
+		is_utf8 = !strcmp(charset, "UTF-8");
+
+	return is_utf8 ? utf8_strwidth(s) : strlen(s);
+}
 #endif
diff --git a/gettext.h b/gettext.h
index 57ba8bb..7ea6db4 100644
--- a/gettext.h
+++ b/gettext.h
@@ -30,10 +30,15 @@
 
 #ifndef NO_GETTEXT
 extern void git_setup_gettext(void);
+extern int gettext_length(const char *s);
 #else
 static inline void git_setup_gettext(void)
 {
 }
+static inline int gettext_length(const char *s)
+{
+	return strlen(s);
+}
 #endif
 
 #ifdef GETTEXT_POISON
diff --git a/transport.c b/transport.c
index 1811b50..863aaa6 100644
--- a/transport.c
+++ b/transport.c
@@ -629,7 +629,7 @@ static void print_ref_status(char flag, const char *summary, struct ref *to, str
 		else
 			fprintf(stdout, "%s\n", summary);
 	} else {
-		fprintf(stderr, " %c %-*s ", flag, TRANSPORT_SUMMARY_WIDTH, summary);
+		fprintf(stderr, " %c %-*s ", flag, TRANSPORT_SUMMARY(summary));
 		if (from)
 			fprintf(stderr, "%s -> %s", prettify_refname(from->name), prettify_refname(to->name));
 		else
diff --git a/transport.h b/transport.h
index b866c12..1c95a8c 100644
--- a/transport.h
+++ b/transport.h
@@ -106,6 +106,7 @@ struct transport {
 #define TRANSPORT_RECURSE_SUBMODULES_ON_DEMAND 256
 
 #define TRANSPORT_SUMMARY_WIDTH (2 * DEFAULT_ABBREV + 3)
+#define TRANSPORT_SUMMARY(x) (TRANSPORT_SUMMARY_WIDTH + strlen(x) - gettext_length(x)), (x)
 
 /* Returns a transport suitable for the url */
 struct transport *transport_get(struct remote *, const char *);
-- 
1.7.12.rc2.18.g61b472e

^ permalink raw reply related	[flat|nested] 14+ messages in thread

* Re: [PATCH] fetch: align new ref summary printout in UTF-8 locales
  2012-09-03 11:10 [PATCH] fetch: align new ref summary printout in UTF-8 locales Nguyễn Thái Ngọc Duy
@ 2012-09-03 19:26 ` Junio C Hamano
  2012-09-03 20:01   ` Johannes Sixt
  2012-09-04  1:31   ` Nguyen Thai Ngoc Duy
  2012-09-04 10:39 ` [PATCH v2] " Nguyễn Thái Ngọc Duy
  1 sibling, 2 replies; 14+ messages in thread
From: Junio C Hamano @ 2012-09-03 19:26 UTC (permalink / raw)
  To: Nguyễn Thái Ngọc Duy; +Cc: git

Nguyễn Thái Ngọc Duy  <pclouds@gmail.com> writes:

> fetch does printf("%-*s", width, "foo") where "foo" can be an utf-8
> string, but width is bytes, not letters. This results in misaligned
> ref summary table.

"but width is bytes, not letters" is a misleading statement.

Be careful about three different quantities when talking about
aligning on terminal output with monospaced fonts:

 - How many bytes does the string occupy in memory?
 - How many unicode codepoints are in the string?
 - How many display columns does the string occupy on terminal?

Note that some "letters" (e.g. Han) occupy two display columns, and
you want to measure the "width" and compensate that for "bytes".
Letter count do not come into the picture for the purpose of aligning
the terminal output.

> Introduce gettext_length() function that returns the string length in
> letters. Make the code use TRANSPORT_SUMMARY(x) where the length is

Again, are you measuring "string length in letters"?  Or are you
trying to measure in terms of git_wcwidth()?  If the latter, please
name this after "width", not "length" to make it more clear.

> compensated properly in utf-8 locales.
> ---

Is it in vogue to omit Signed-off-by line these days or something?

>  gettext_length() can be made to support other charsets too. But I'm
>  on utf-8, it's not my itch.
>
>  Grepping through '%-*s' does not reveal any other places that obviously
>  need adjustment like this (apply and remote might, but pathnames and
>  remote names are usually in ascii)

Thanks.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] fetch: align new ref summary printout in UTF-8 locales
  2012-09-03 19:26 ` Junio C Hamano
@ 2012-09-03 20:01   ` Johannes Sixt
  2012-09-03 20:30     ` Junio C Hamano
  2012-09-04  1:31   ` Nguyen Thai Ngoc Duy
  1 sibling, 1 reply; 14+ messages in thread
From: Johannes Sixt @ 2012-09-03 20:01 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: Nguyễn Thái Ngọc Duy, git

Am 03.09.2012 21:26, schrieb Junio C Hamano:
> Nguyễn Thái Ngọc Duy  <pclouds@gmail.com> writes:
> 
>> fetch does printf("%-*s", width, "foo") where "foo" can be an utf-8
>> string, but width is bytes, not letters. This results in misaligned
>> ref summary table.
> 
> "but width is bytes, not letters" is a misleading statement.
> 
> Be careful about three different quantities when talking about
> aligning on terminal output with monospaced fonts:
> 
>  - How many bytes does the string occupy in memory?
>  - How many unicode codepoints are in the string?
>  - How many display columns does the string occupy on terminal?
> 
> Note that some "letters" (e.g. Han) occupy two display columns, and
> you want to measure the "width" and compensate that for "bytes".
> Letter count do not come into the picture for the purpose of aligning
> the terminal output.

If I'm reading POSIX correctly, printf measures the width in %*s in bytes.

-- Hannes

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] fetch: align new ref summary printout in UTF-8 locales
  2012-09-03 20:01   ` Johannes Sixt
@ 2012-09-03 20:30     ` Junio C Hamano
  0 siblings, 0 replies; 14+ messages in thread
From: Junio C Hamano @ 2012-09-03 20:30 UTC (permalink / raw)
  To: Johannes Sixt; +Cc: Nguyễn Thái Ngọc Duy, git

Johannes Sixt <j6t@kdbg.org> writes:

> Am 03.09.2012 21:26, schrieb Junio C Hamano:
>> Nguyễn Thái Ngọc Duy  <pclouds@gmail.com> writes:
>> 
>>> fetch does printf("%-*s", width, "foo") where "foo" can be an utf-8
>>> string, but width is bytes, not letters. This results in misaligned
>>> ref summary table.
>> 
>> "but width is bytes, not letters" is a misleading statement.
>> 
>> Be careful about three different quantities when talking about
>> aligning on terminal output with monospaced fonts:
>> 
>>  - How many bytes does the string occupy in memory?
>>  - How many unicode codepoints are in the string?
>>  - How many display columns does the string occupy on terminal?
>> 
>> Note that some "letters" (e.g. Han) occupy two display columns, and
>> you want to measure the "width" and compensate that for "bytes".
>> Letter count do not come into the picture for the purpose of aligning
>> the terminal output.
>
> If I'm reading POSIX correctly, printf measures the width in %*s in bytes.

Yes, that is the problem.

When we give allocated_width columns to display str, and call

        printf("%.*s", allocated_width, str)

we end up getting (allocated_width - strlen(str)) SPs as filler.  If
the str occupies less than strlen(str) columns (and it often is the
case for a string with non-ascii characters), insufficient number of
filler SPs will be given, and an item on the same line that comes
after this printf() will appear too close to str.  By measuring the
required number of columns to display str, you can compensate. i.e.

        printf("%.*s", allocated_width +
                (strlen(str) - display_width(str)), str)

to give it enough filler.

I think the patch uses utf8_strwidth() which measures the display
columns needed for the string, not number of unicode codepoints in
the string.  I was merely pointing out that the wording of the
proposed log message "bytes, not letters" and the function name
"foo_length" was misleading, when we mean "width".

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] fetch: align new ref summary printout in UTF-8 locales
  2012-09-03 19:26 ` Junio C Hamano
  2012-09-03 20:01   ` Johannes Sixt
@ 2012-09-04  1:31   ` Nguyen Thai Ngoc Duy
  1 sibling, 0 replies; 14+ messages in thread
From: Nguyen Thai Ngoc Duy @ 2012-09-04  1:31 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: git

On Tue, Sep 4, 2012 at 2:26 AM, Junio C Hamano <gitster@pobox.com> wrote:
> Nguyễn Thái Ngọc Duy  <pclouds@gmail.com> writes:
>
>> fetch does printf("%-*s", width, "foo") where "foo" can be an utf-8
>> string, but width is bytes, not letters. This results in misaligned
>> ref summary table.
>
> "but width is bytes, not letters" is a misleading statement.
>
> Be careful about three different quantities when talking about
> aligning on terminal output with monospaced fonts:
>
>  - How many bytes does the string occupy in memory?
>  - How many unicode codepoints are in the string?
>  - How many display columns does the string occupy on terminal?
>
> Note that some "letters" (e.g. Han) occupy two display columns, and
> you want to measure the "width" and compensate that for "bytes".
> Letter count do not come into the picture for the purpose of aligning
> the terminal output.

Hmm.. I did test with zh_CN.UTF-8 and it seemed to work. I'll check this again.
-- 
Duy

^ permalink raw reply	[flat|nested] 14+ messages in thread

* [PATCH v2] fetch: align new ref summary printout in UTF-8 locales
  2012-09-03 11:10 [PATCH] fetch: align new ref summary printout in UTF-8 locales Nguyễn Thái Ngọc Duy
  2012-09-03 19:26 ` Junio C Hamano
@ 2012-09-04 10:39 ` Nguyễn Thái Ngọc Duy
       [not found]   ` <504796A5.4070700@gmail.com>
  2012-09-12 14:02   ` Nguyen Thai Ngoc Duy
  1 sibling, 2 replies; 14+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2012-09-04 10:39 UTC (permalink / raw)
  To: git; +Cc: Junio C Hamano, Nguyễn Thái Ngọc Duy

fetch does printf("%-*s", width, "foo") where "foo" can be a utf-8
string, but width is in bytes, not columns. For ASCII it's fine as one
byte takes one column. For utf-8, this may result in misaligned ref
summary table.

Introduce gettext_width() function that returns the string length in
columns (currently only supports utf-8 locales). Make the code use
TRANSPORT_SUMMARY(x) where the length is compensated properly in
non-English locales.

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
  - rename gettext_length() to gettext_width()
  - use "width" instead of letters
  - leave other "%-*s" places unchanged if they always take ascii
    strings (i.e. no _() calls)
  - note to self, may need to i18n-ize print_ref_status() in
    transport.c, looks like it's used by git-push only

 builtin/fetch.c | 15 +++++++--------
 gettext.c       | 15 +++++++++++++--
 gettext.h       |  5 +++++
 transport.h     |  1 +
 4 files changed, 26 insertions(+), 10 deletions(-)

diff --git a/builtin/fetch.c b/builtin/fetch.c
index bb9a074..85e291f 100644
--- a/builtin/fetch.c
+++ b/builtin/fetch.c
@@ -255,9 +255,8 @@ static int update_local_ref(struct ref *ref,
 	if (!hashcmp(ref->old_sha1, ref->new_sha1)) {
 		if (verbosity > 0)
 			strbuf_addf(display, "= %-*s %-*s -> %s",
-				    TRANSPORT_SUMMARY_WIDTH,
-				    _("[up to date]"), REFCOL_WIDTH,
-				    remote, pretty_ref);
+				    TRANSPORT_SUMMARY(_("[up to date]")),
+				    REFCOL_WIDTH, remote, pretty_ref);
 		return 0;
 	}
 
@@ -271,7 +270,7 @@ static int update_local_ref(struct ref *ref,
 		 */
 		strbuf_addf(display,
 			    _("! %-*s %-*s -> %s  (can't fetch in current branch)"),
-			    TRANSPORT_SUMMARY_WIDTH, _("[rejected]"),
+			    TRANSPORT_SUMMARY(_("[rejected]")),
 			    REFCOL_WIDTH, remote, pretty_ref);
 		return 1;
 	}
@@ -282,7 +281,7 @@ static int update_local_ref(struct ref *ref,
 		r = s_update_ref("updating tag", ref, 0);
 		strbuf_addf(display, "%c %-*s %-*s -> %s%s",
 			    r ? '!' : '-',
-			    TRANSPORT_SUMMARY_WIDTH, _("[tag update]"),
+			    TRANSPORT_SUMMARY(_("[tag update]")),
 			    REFCOL_WIDTH, remote, pretty_ref,
 			    r ? _("  (unable to update local ref)") : "");
 		return r;
@@ -317,7 +316,7 @@ static int update_local_ref(struct ref *ref,
 		r = s_update_ref(msg, ref, 0);
 		strbuf_addf(display, "%c %-*s %-*s -> %s%s",
 			    r ? '!' : '*',
-			    TRANSPORT_SUMMARY_WIDTH, what,
+			    TRANSPORT_SUMMARY(what),
 			    REFCOL_WIDTH, remote, pretty_ref,
 			    r ? _("  (unable to update local ref)") : "");
 		return r;
@@ -357,7 +356,7 @@ static int update_local_ref(struct ref *ref,
 		return r;
 	} else {
 		strbuf_addf(display, "! %-*s %-*s -> %s  %s",
-			    TRANSPORT_SUMMARY_WIDTH, _("[rejected]"),
+			    TRANSPORT_SUMMARY(_("[rejected]")),
 			    REFCOL_WIDTH, remote, pretty_ref,
 			    _("(non-fast-forward)"));
 		return 1;
@@ -554,7 +553,7 @@ static int prune_refs(struct refspec *refs, int ref_count, struct ref *ref_map)
 			result |= delete_ref(ref->name, NULL, 0);
 		if (verbosity >= 0) {
 			fprintf(stderr, " x %-*s %-*s -> %s\n",
-				TRANSPORT_SUMMARY_WIDTH, _("[deleted]"),
+				TRANSPORT_SUMMARY(_("[deleted]")),
 				REFCOL_WIDTH, _("(none)"), prettify_refname(ref->name));
 			warn_dangling_symref(stderr, dangling_msg, ref->name);
 		}
diff --git a/gettext.c b/gettext.c
index f75bca7..71e9545 100644
--- a/gettext.c
+++ b/gettext.c
@@ -4,6 +4,8 @@
 
 #include "git-compat-util.h"
 #include "gettext.h"
+#include "strbuf.h"
+#include "utf8.h"
 
 #ifndef NO_GETTEXT
 #	include <locale.h>
@@ -27,10 +29,9 @@ int use_gettext_poison(void)
 #endif
 
 #ifndef NO_GETTEXT
+static const char *charset;
 static void init_gettext_charset(const char *domain)
 {
-	const char *charset;
-
 	/*
 	   This trick arranges for messages to be emitted in the user's
 	   requested encoding, but avoids setting LC_CTYPE from the
@@ -128,4 +129,14 @@ void git_setup_gettext(void)
 	init_gettext_charset("git");
 	textdomain("git");
 }
+
+/* return the number of columns of string 's' in current locale */
+int gettext_width(const char *s)
+{
+	static int is_utf8 = -1;
+	if (is_utf8 == -1)
+		is_utf8 = !strcmp(charset, "UTF-8");
+
+	return is_utf8 ? utf8_strwidth(s) : strlen(s);
+}
 #endif
diff --git a/gettext.h b/gettext.h
index 57ba8bb..e44d8bc 100644
--- a/gettext.h
+++ b/gettext.h
@@ -30,10 +30,15 @@
 
 #ifndef NO_GETTEXT
 extern void git_setup_gettext(void);
+extern int gettext_width(const char *s);
 #else
 static inline void git_setup_gettext(void)
 {
 }
+static inline int gettext_width(const char *s)
+{
+	return strlen(s);
+}
 #endif
 
 #ifdef GETTEXT_POISON
diff --git a/transport.h b/transport.h
index b866c12..a5d375e 100644
--- a/transport.h
+++ b/transport.h
@@ -106,6 +106,7 @@ struct transport {
 #define TRANSPORT_RECURSE_SUBMODULES_ON_DEMAND 256
 
 #define TRANSPORT_SUMMARY_WIDTH (2 * DEFAULT_ABBREV + 3)
+#define TRANSPORT_SUMMARY(x) (TRANSPORT_SUMMARY_WIDTH + strlen(x) - gettext_width(x)), (x)
 
 /* Returns a transport suitable for the url */
 struct transport *transport_get(struct remote *, const char *);
-- 
1.7.12.rc2.18.g61b472e

^ permalink raw reply related	[flat|nested] 14+ messages in thread

* Re: [PATCH v2] fetch: align new ref summary printout in UTF-8 locales
       [not found]   ` <504796A5.4070700@gmail.com>
@ 2012-09-05 19:20     ` Torsten Bögershausen
  2012-09-06 12:11       ` Nguyen Thai Ngoc Duy
  2012-09-06 15:36       ` Nguyen Thai Ngoc Duy
  0 siblings, 2 replies; 14+ messages in thread
From: Torsten Bögershausen @ 2012-09-05 19:20 UTC (permalink / raw)
  To: Nguyễn Thái Ngọc Duy
  Cc: git, Junio C Hamano, torsten Bögershausen

On 09/05/2012 08:15 PM, Torsten Bögershausen wrote:
> On 04.09.12 12:39, Nguyễn Thái Ngọc Duy wrote:
>> +/* return the number of columns of string 's' in current locale */
>> +int gettext_width(const char *s)
>> +{
>> +	static int is_utf8 = -1;
>> +	if (is_utf8 == -1)
>> +		is_utf8 = !strcmp(charset, "UTF-8");
>> +
>> +	return is_utf8 ? utf8_strwidth(s) : strlen(s);
>
> Will that work for non-ASCII encodings?
> For ISO-8859-x we can say strlen() == strwidth(),
> but for other encodings using multibytes that doesn't work, does it?
(Sorry the message went out before completely written)
Something like that:

int gettext_width(const char *s) {
   static int is_utf8 = -1;

   if (is_utf8 == -1)
     is_utf8 = !strcmp(charset, "UTF-8");

   if (is_utf8)
     return utf8_strwidth(s);
   else  {
     char *s_utf = reencode_string(s, "UTF-8", charset);
     if (s_utf) {
       witdh = utf8_strwidth(s_utf);
       free(s_utf);
     } else
       width = strlen(s);

     return width;
}

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v2] fetch: align new ref summary printout in UTF-8 locales
  2012-09-05 19:20     ` Torsten Bögershausen
@ 2012-09-06 12:11       ` Nguyen Thai Ngoc Duy
  2012-09-06 15:36       ` Nguyen Thai Ngoc Duy
  1 sibling, 0 replies; 14+ messages in thread
From: Nguyen Thai Ngoc Duy @ 2012-09-06 12:11 UTC (permalink / raw)
  To: Torsten Bögershausen; +Cc: git, Junio C Hamano

On Thu, Sep 6, 2012 at 2:20 AM, Torsten Bögershausen <tboegi@web.de> wrote:
> On 09/05/2012 08:15 PM, Torsten Bögershausen wrote:
>>
>> On 04.09.12 12:39, Nguyễn Thái Ngọc Duy wrote:
>>>
>>> +/* return the number of columns of string 's' in current locale */
>>> +int gettext_width(const char *s)
>>> +{
>>> +       static int is_utf8 = -1;
>>> +       if (is_utf8 == -1)
>>> +               is_utf8 = !strcmp(charset, "UTF-8");
>>> +
>>> +       return is_utf8 ? utf8_strwidth(s) : strlen(s);
>>
>>
>> Will that work for non-ASCII encodings?
>> For ISO-8859-x we can say strlen() == strwidth(),
>> but for other encodings using multibytes that doesn't work, does it?

No it does not. I think I mentioned that in the first version that I
was only interested in utf-8. Others can extend the function for their
favourite encodings.

> (Sorry the message went out before completely written)
> Something like that:
>
> int gettext_width(const char *s) {
>
>   static int is_utf8 = -1;
>
>   if (is_utf8 == -1)
>
>     is_utf8 = !strcmp(charset, "UTF-8");
>
>   if (is_utf8)
>     return utf8_strwidth(s);
>   else  {
>     char *s_utf = reencode_string(s, "UTF-8", charset);
>     if (s_utf) {
>       witdh = utf8_strwidth(s_utf);
>       free(s_utf);
>     } else
>       width = strlen(s);
>
>     return width;
> }

Yes, something like that, assuming that column information is intact
after the conversion. Maybe you can make that a new function, int
strwidth(const char *str, const char *charset), and make
gettext_strwidth() a thin wrapper:

int gettext_strwidth(const char *s)
{
   return strwidth(s, charset);
}
-- 
Duy

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v2] fetch: align new ref summary printout in UTF-8 locales
  2012-09-05 19:20     ` Torsten Bögershausen
  2012-09-06 12:11       ` Nguyen Thai Ngoc Duy
@ 2012-09-06 15:36       ` Nguyen Thai Ngoc Duy
  2012-09-06 16:21         ` Torsten Bögershausen
  1 sibling, 1 reply; 14+ messages in thread
From: Nguyen Thai Ngoc Duy @ 2012-09-06 15:36 UTC (permalink / raw)
  To: Torsten Bögershausen; +Cc: git, Junio C Hamano

On Thu, Sep 6, 2012 at 2:20 AM, Torsten Bögershausen <tboegi@web.de> wrote:
>> Will that work for non-ASCII encodings?
>> For ISO-8859-x we can say strlen() == strwidth(),
>> but for other encodings using multibytes that doesn't work, does it?

BTW if you are interested in supporting non-utf8 output, you may want
to look at 1452bd6 (branch -v: align even when branch names are in
UTF-8 - 2012-08-26), which assumes branches are in utf-8. So you have
to convert them to output charset before printing.
-- 
Duy

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v2] fetch: align new ref summary printout in UTF-8 locales
  2012-09-06 15:36       ` Nguyen Thai Ngoc Duy
@ 2012-09-06 16:21         ` Torsten Bögershausen
  2012-09-06 18:08           ` Junio C Hamano
  0 siblings, 1 reply; 14+ messages in thread
From: Torsten Bögershausen @ 2012-09-06 16:21 UTC (permalink / raw)
  To: Nguyen Thai Ngoc Duy; +Cc: git, Junio C Hamano


Am 06.09.2012 um 17:36 schrieb Nguyen Thai Ngoc Duy:

> On Thu, Sep 6, 2012 at 2:20 AM, Torsten Bögershausen <tboegi@web.de> wrote:
>>> Will that work for non-ASCII encodings?
>>> For ISO-8859-x we can say strlen() == strwidth(),
>>> but for other encodings using multibytes that doesn't work, does it?
> 
> BTW if you are interested in supporting non-utf8 output, you may want
> to look at 1452bd6 (branch -v: align even when branch names are in
> UTF-8 - 2012-08-26), which assumes branches are in utf-8. So you have
> to convert them to output charset before printing.
> -- 
> Duy

Thanks,
I try to re-phrase my question:

Do installations still exist which use e.g. BIG5 or any other
multi byte encoding which is not UTF-8?

Do we want to support other encodings than ASCII or UTF-8?
(Because then the screen width needs to be calculate different, I think)

/Torsten

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v2] fetch: align new ref summary printout in UTF-8 locales
  2012-09-06 16:21         ` Torsten Bögershausen
@ 2012-09-06 18:08           ` Junio C Hamano
  0 siblings, 0 replies; 14+ messages in thread
From: Junio C Hamano @ 2012-09-06 18:08 UTC (permalink / raw)
  To: Torsten Bögershausen; +Cc: Nguyen Thai Ngoc Duy, git

Torsten Bögershausen <tboegi@web.de> writes:

> I try to re-phrase my question:
>
> Do installations still exist which use e.g. BIG5 or any other
> multi byte encoding which is not UTF-8?

Yes.

> Do we want to support other encodings than ASCII or UTF-8?
> (Because then the screen width needs to be calculate different, I think)

That depends on who "we" are and what timeframe you have in mind.

Do our developers care about these encodings so much that we would
reject "ASCCI/UTF-8 only" patch and wait until we enhance it to do
the right thing for other encodings that we do not use ourselves?
No.  That does not make any sense, especially when we know we will
not have a good test coverage on the additional parts that we will
not be using ourselves.

"This change only helps people with ASCII or UTF-8 and does not help
others" alone is never a valid reason to reject a change, but we
still try to be nicer to "others" that may come after we leave this
topic behind by doing a few things:

 - If the change will make things worse than it currently is for
   "others", we would try to minimize the regression for them.

 - If the change will make the code harder to update later to
   enhance with additional change to support "others", we would try
   to anticipate what kind of changes are needed on top, and
   structure the code in such a way that future changes can be made
   cleanly.

For the first point, for multi-byte encodings (e.g. ISO-2022), the
display columns and byte length do not match and in general byte
length is longer than the display columns in the current code.  With
the current code that measures the required columns across elements
by taking the maximum of byte length, they will see wrong number of
filler, so they are already getting a wrong alignment.  With the
"UTF-8 only" change, the required columns and the filler will be
calculated by assuming that the string is in UTF-8, which may make
the computation off in a different way, and if we underestimate the
display columns for a string, they may see the strings truncated,
which is bad.

So as long as gettext_width() punts and returns strlen() for a
malformed UTF-8, it would be OK [*1*].

For the second point, I think the API "here is a string, give me the
number of display columns it will occupy, as I am interested in
aligning them" is a good abstraction that can be later enhanced to
other encodings fairly easily, so I do not see a problem in the
patch that goes in that direction.



[Footnote]

*1* If the patch used utf_strwidth() (which I didn't bother to go
back and check, though), it should be OK.  The underlying
utf8_width() will reject a malformed UTF-8 sequence and the code
falls back to strlen().

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v2] fetch: align new ref summary printout in UTF-8 locales
  2012-09-04 10:39 ` [PATCH v2] " Nguyễn Thái Ngọc Duy
       [not found]   ` <504796A5.4070700@gmail.com>
@ 2012-09-12 14:02   ` Nguyen Thai Ngoc Duy
  2012-09-12 18:02     ` Junio C Hamano
  1 sibling, 1 reply; 14+ messages in thread
From: Nguyen Thai Ngoc Duy @ 2012-09-12 14:02 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: git

Ping.. what happens to this patch? Do you want to support other
encodings as well via iconv()? I can't test that though.

On Tue, Sep 4, 2012 at 5:39 PM, Nguyễn Thái Ngọc Duy <pclouds@gmail.com> wrote:
> fetch does printf("%-*s", width, "foo") where "foo" can be a utf-8
> string, but width is in bytes, not columns. For ASCII it's fine as one
> byte takes one column. For utf-8, this may result in misaligned ref
> summary table.
>
> Introduce gettext_width() function that returns the string length in
> columns (currently only supports utf-8 locales). Make the code use
> TRANSPORT_SUMMARY(x) where the length is compensated properly in
> non-English locales.
>
> Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
> ---
>   - rename gettext_length() to gettext_width()
>   - use "width" instead of letters
>   - leave other "%-*s" places unchanged if they always take ascii
>     strings (i.e. no _() calls)
>   - note to self, may need to i18n-ize print_ref_status() in
>     transport.c, looks like it's used by git-push only
>
>  builtin/fetch.c | 15 +++++++--------
>  gettext.c       | 15 +++++++++++++--
>  gettext.h       |  5 +++++
>  transport.h     |  1 +
>  4 files changed, 26 insertions(+), 10 deletions(-)
>
> diff --git a/builtin/fetch.c b/builtin/fetch.c
> index bb9a074..85e291f 100644
> --- a/builtin/fetch.c
> +++ b/builtin/fetch.c
> @@ -255,9 +255,8 @@ static int update_local_ref(struct ref *ref,
>         if (!hashcmp(ref->old_sha1, ref->new_sha1)) {
>                 if (verbosity > 0)
>                         strbuf_addf(display, "= %-*s %-*s -> %s",
> -                                   TRANSPORT_SUMMARY_WIDTH,
> -                                   _("[up to date]"), REFCOL_WIDTH,
> -                                   remote, pretty_ref);
> +                                   TRANSPORT_SUMMARY(_("[up to date]")),
> +                                   REFCOL_WIDTH, remote, pretty_ref);
>                 return 0;
>         }
>
> @@ -271,7 +270,7 @@ static int update_local_ref(struct ref *ref,
>                  */
>                 strbuf_addf(display,
>                             _("! %-*s %-*s -> %s  (can't fetch in current branch)"),
> -                           TRANSPORT_SUMMARY_WIDTH, _("[rejected]"),
> +                           TRANSPORT_SUMMARY(_("[rejected]")),
>                             REFCOL_WIDTH, remote, pretty_ref);
>                 return 1;
>         }
> @@ -282,7 +281,7 @@ static int update_local_ref(struct ref *ref,
>                 r = s_update_ref("updating tag", ref, 0);
>                 strbuf_addf(display, "%c %-*s %-*s -> %s%s",
>                             r ? '!' : '-',
> -                           TRANSPORT_SUMMARY_WIDTH, _("[tag update]"),
> +                           TRANSPORT_SUMMARY(_("[tag update]")),
>                             REFCOL_WIDTH, remote, pretty_ref,
>                             r ? _("  (unable to update local ref)") : "");
>                 return r;
> @@ -317,7 +316,7 @@ static int update_local_ref(struct ref *ref,
>                 r = s_update_ref(msg, ref, 0);
>                 strbuf_addf(display, "%c %-*s %-*s -> %s%s",
>                             r ? '!' : '*',
> -                           TRANSPORT_SUMMARY_WIDTH, what,
> +                           TRANSPORT_SUMMARY(what),
>                             REFCOL_WIDTH, remote, pretty_ref,
>                             r ? _("  (unable to update local ref)") : "");
>                 return r;
> @@ -357,7 +356,7 @@ static int update_local_ref(struct ref *ref,
>                 return r;
>         } else {
>                 strbuf_addf(display, "! %-*s %-*s -> %s  %s",
> -                           TRANSPORT_SUMMARY_WIDTH, _("[rejected]"),
> +                           TRANSPORT_SUMMARY(_("[rejected]")),
>                             REFCOL_WIDTH, remote, pretty_ref,
>                             _("(non-fast-forward)"));
>                 return 1;
> @@ -554,7 +553,7 @@ static int prune_refs(struct refspec *refs, int ref_count, struct ref *ref_map)
>                         result |= delete_ref(ref->name, NULL, 0);
>                 if (verbosity >= 0) {
>                         fprintf(stderr, " x %-*s %-*s -> %s\n",
> -                               TRANSPORT_SUMMARY_WIDTH, _("[deleted]"),
> +                               TRANSPORT_SUMMARY(_("[deleted]")),
>                                 REFCOL_WIDTH, _("(none)"), prettify_refname(ref->name));
>                         warn_dangling_symref(stderr, dangling_msg, ref->name);
>                 }
> diff --git a/gettext.c b/gettext.c
> index f75bca7..71e9545 100644
> --- a/gettext.c
> +++ b/gettext.c
> @@ -4,6 +4,8 @@
>
>  #include "git-compat-util.h"
>  #include "gettext.h"
> +#include "strbuf.h"
> +#include "utf8.h"
>
>  #ifndef NO_GETTEXT
>  #      include <locale.h>
> @@ -27,10 +29,9 @@ int use_gettext_poison(void)
>  #endif
>
>  #ifndef NO_GETTEXT
> +static const char *charset;
>  static void init_gettext_charset(const char *domain)
>  {
> -       const char *charset;
> -
>         /*
>            This trick arranges for messages to be emitted in the user's
>            requested encoding, but avoids setting LC_CTYPE from the
> @@ -128,4 +129,14 @@ void git_setup_gettext(void)
>         init_gettext_charset("git");
>         textdomain("git");
>  }
> +
> +/* return the number of columns of string 's' in current locale */
> +int gettext_width(const char *s)
> +{
> +       static int is_utf8 = -1;
> +       if (is_utf8 == -1)
> +               is_utf8 = !strcmp(charset, "UTF-8");
> +
> +       return is_utf8 ? utf8_strwidth(s) : strlen(s);
> +}
>  #endif
> diff --git a/gettext.h b/gettext.h
> index 57ba8bb..e44d8bc 100644
> --- a/gettext.h
> +++ b/gettext.h
> @@ -30,10 +30,15 @@
>
>  #ifndef NO_GETTEXT
>  extern void git_setup_gettext(void);
> +extern int gettext_width(const char *s);
>  #else
>  static inline void git_setup_gettext(void)
>  {
>  }
> +static inline int gettext_width(const char *s)
> +{
> +       return strlen(s);
> +}
>  #endif
>
>  #ifdef GETTEXT_POISON
> diff --git a/transport.h b/transport.h
> index b866c12..a5d375e 100644
> --- a/transport.h
> +++ b/transport.h
> @@ -106,6 +106,7 @@ struct transport {
>  #define TRANSPORT_RECURSE_SUBMODULES_ON_DEMAND 256
>
>  #define TRANSPORT_SUMMARY_WIDTH (2 * DEFAULT_ABBREV + 3)
> +#define TRANSPORT_SUMMARY(x) (TRANSPORT_SUMMARY_WIDTH + strlen(x) - gettext_width(x)), (x)
>
>  /* Returns a transport suitable for the url */
>  struct transport *transport_get(struct remote *, const char *);
> --
> 1.7.12.rc2.18.g61b472e
>



-- 
Duy

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v2] fetch: align new ref summary printout in UTF-8 locales
  2012-09-12 14:02   ` Nguyen Thai Ngoc Duy
@ 2012-09-12 18:02     ` Junio C Hamano
  2012-09-12 20:12       ` Torsten Bögershausen
  0 siblings, 1 reply; 14+ messages in thread
From: Junio C Hamano @ 2012-09-12 18:02 UTC (permalink / raw)
  To: Nguyen Thai Ngoc Duy; +Cc: git

Nguyen Thai Ngoc Duy <pclouds@gmail.com> writes:

> Ping.. what happens to this patch? Do you want to support other
> encodings as well via iconv()? I can't test that though.

I thought I refuted a potential blocker, which was an implied
objection from Torsten, in $gmane/204912 already.  As long as we
make it clear that your patch helps only "ASCII/UTF-8 only" audience
but we still "try to be nicer to 'others'" by doing two things I
said in the message, I think we should proceed without iconv() to
keep things simple.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v2] fetch: align new ref summary printout in UTF-8 locales
  2012-09-12 18:02     ` Junio C Hamano
@ 2012-09-12 20:12       ` Torsten Bögershausen
  0 siblings, 0 replies; 14+ messages in thread
From: Torsten Bögershausen @ 2012-09-12 20:12 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: Nguyen Thai Ngoc Duy, git, Torsten Bögershausen

On 12.09.12 20:02, Junio C Hamano wrote:
> Nguyen Thai Ngoc Duy <pclouds@gmail.com> writes:
> 
>> Ping.. what happens to this patch? Do you want to support other
>> encodings as well via iconv()? I can't test that though.
> 
> I thought I refuted a potential blocker, which was an implied
> objection from Torsten, in $gmane/204912 already.  As long as we
> make it clear that your patch helps only "ASCII/UTF-8 only" audience
> but we still "try to be nicer to 'others'" by doing two things I
> said in the message, I think we should proceed without iconv() to
> keep things simple.

Please unblock and proceed (as I can't test other encodings either)

And even special thanks for the excellent lines from Junio,
which explained the update philosophy in git so well, 
that I take the freedom to re-post them here:

>> I try to re-phrase my question:
>>
>> Do installations still exist which use e.g. BIG5 or any other
>> multi byte encoding which is not UTF-8?
>
>Yes.
>
>> Do we want to support other encodings than ASCII or UTF-8?
>> (Because then the screen width needs to be calculate different, I think)
>
>That depends on who "we" are and what timeframe you have in mind.
>
>Do our developers care about these encodings so much that we would
>reject "ASCCI/UTF-8 only" patch and wait until we enhance it to do
>the right thing for other encodings that we do not use ourselves?
>No.  That does not make any sense, especially when we know we will
>not have a good test coverage on the additional parts that we will
>not be using ourselves.
>
>"This change only helps people with ASCII or UTF-8 and does not help
>others" alone is never a valid reason to reject a change, but we
>still try to be nicer to "others" that may come after we leave this
>topic behind by doing a few things:
>
> - If the change will make things worse than it currently is for
>   "others", we would try to minimize the regression for them.
>
> - If the change will make the code harder to update later to
>   enhance with additional change to support "others", we would try
>   to anticipate what kind of changes are needed on top, and
>   structure the code in such a way that future changes can be made
>   cleanly.
>
>For the first point, for multi-byte encodings (e.g. ISO-2022), the
>display columns and byte length do not match and in general byte
>length is longer than the display columns in the current code.  With
>the current code that measures the required columns across elements
>by taking the maximum of byte length, they will see wrong number of
>filler, so they are already getting a wrong alignment.  With the
>"UTF-8 only" change, the required columns and the filler will be
>calculated by assuming that the string is in UTF-8, which may make
>the computation off in a different way, and if we underestimate the
>display columns for a string, they may see the strings truncated,
>which is bad.
>
>So as long as gettext_width() punts and returns strlen() for a
>malformed UTF-8, it would be OK [*1*].
>
>For the second point, I think the API "here is a string, give me the
>number of display columns it will occupy, as I am interested in
>aligning them" is a good abstraction that can be later enhanced to
>other encodings fairly easily, so I do not see a problem in the
>patch that goes in that direction.
>
>
>
>[Footnote]
>
>*1* If the patch used utf_strwidth() (which I didn't bother to go
>back and check, though), it should be OK.  The underlying
>utf8_width() will reject a malformed UTF-8 sequence and the code
>falls back to strlen().

 

^ permalink raw reply	[flat|nested] 14+ messages in thread

end of thread, other threads:[~2012-09-12 20:12 UTC | newest]

Thread overview: 14+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2012-09-03 11:10 [PATCH] fetch: align new ref summary printout in UTF-8 locales Nguyễn Thái Ngọc Duy
2012-09-03 19:26 ` Junio C Hamano
2012-09-03 20:01   ` Johannes Sixt
2012-09-03 20:30     ` Junio C Hamano
2012-09-04  1:31   ` Nguyen Thai Ngoc Duy
2012-09-04 10:39 ` [PATCH v2] " Nguyễn Thái Ngọc Duy
     [not found]   ` <504796A5.4070700@gmail.com>
2012-09-05 19:20     ` Torsten Bögershausen
2012-09-06 12:11       ` Nguyen Thai Ngoc Duy
2012-09-06 15:36       ` Nguyen Thai Ngoc Duy
2012-09-06 16:21         ` Torsten Bögershausen
2012-09-06 18:08           ` Junio C Hamano
2012-09-12 14:02   ` Nguyen Thai Ngoc Duy
2012-09-12 18:02     ` Junio C Hamano
2012-09-12 20:12       ` Torsten Bögershausen

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.