All of lore.kernel.org
 help / color / mirror / Atom feed
From: Robin Rosenberg <robin.rosenberg@dewire.com>
To: git@vger.kernel.org
Cc: Robin Rosenberg <robin.rosenberg@dewire.com>
Subject: [RFC 1/8] UTF helpers
Date: Wed, 13 May 2009 00:50:24 +0200	[thread overview]
Message-ID: <1242168631-30753-2-git-send-email-robin.rosenberg@dewire.com> (raw)
In-Reply-To: <1242168631-30753-1-git-send-email-robin.rosenberg@dewire.com>

---
 Makefile          |    8 ++-
 git-compat-util.h |    1 +
 git.c             |    9 +++
 t/test-lib.sh     |    4 +-
 test-utf.c        |   61 ++++++++++++++++
 utf.c             |  207 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 utf.h             |   27 +++++++
 7 files changed, 313 insertions(+), 4 deletions(-)
 create mode 100644 test-utf.c
 create mode 100644 utf.c
 create mode 100644 utf.h

diff --git a/Makefile b/Makefile
index 2d62efb..2d71f01 100644
--- a/Makefile
+++ b/Makefile
@@ -259,7 +259,7 @@ LIB_OBJS = \
 	object.o pack-check.o patch-delta.o path.o pkt-line.o sideband.o \
 	quote.o read-cache.o refs.o run-command.o dir.o object-refs.o \
 	server-info.o setup.o sha1_file.o sha1_name.o strbuf.o \
-	tag.o tree.o usage.o config.o environment.o ctype.o copy.o \
+	tag.o tree.o utf.o usage.o config.o environment.o ctype.o copy.o \
 	fetch-clone.o revision.o pager.o tree-walk.o xdiff-interface.o \
 	write_or_die.o trace.o list-objects.o grep.o \
 	alloc.o merge-file.o path-list.o help.o unpack-trees.o $(DIFF_OBJS) \
@@ -564,6 +564,9 @@ ifdef NO_ACCURATE_DIFF
 endif
 
 # Shell quote (do not use $(call) to accommodate ancient setups);
+ALL_CFLAGS += -DUTF8INTERNAL=1
+ALL_CFLAGS += -DDEBUG=1
+#ALL_CFLAGS += -DTEST=1
 
 SHA1_HEADER_SQ = $(subst ','\'',$(SHA1_HEADER))
 
@@ -811,6 +814,9 @@ export NO_SVN_TESTS
 test: all
 	$(MAKE) -C t/ all
 
+test-utf$X: test-utf.c ctype.o utf.o usage.o
+	$(CC) $(ALL_CFLAGS) -o $@ $(ALL_LDFLAGS) test-utf.c utf.c ctype.o usage.o
+
 test-date$X: test-date.c date.o ctype.o
 	$(CC) $(ALL_CFLAGS) -o $@ $(ALL_LDFLAGS) test-date.c date.o ctype.o
 
diff --git a/git-compat-util.h b/git-compat-util.h
index 0272d04..f83352b 100644
--- a/git-compat-util.h
+++ b/git-compat-util.h
@@ -25,6 +25,7 @@
 #include <netinet/in.h>
 #include <sys/types.h>
 #include <dirent.h>
+#include "utf.h"
 
 /* On most systems <limits.h> would have given us this, but
  * not on some systems (e.g. GNU/Hurd).
diff --git a/git.c b/git.c
index 6475847..bd4e726 100644
--- a/git.c
+++ b/git.c
@@ -272,6 +272,15 @@ static void handle_internal_command(int argc, const char **argv, char **envp)
 	};
 	int i;
 
+#ifdef DEBUG
+	if (debug()) {
+		fprintf(stderr,"GIT-");
+		for (i = 1; i<argc; ++i)
+			fprintf(stderr,"%s",argv[i]);
+		fprintf(stderr,"\n");
+	}
+#endif
+
 	/* Turn "git cmd --help" into "git help cmd" */
 	if (argc > 1 && !strcmp(argv[1], "--help")) {
 		argv[1] = argv[0];
diff --git a/t/test-lib.sh b/t/test-lib.sh
index 07cb706..e8aefd8 100755
--- a/t/test-lib.sh
+++ b/t/test-lib.sh
@@ -4,11 +4,9 @@
 #
 
 # For repeatability, reset the environment to known value.
-LANG=C
-LC_ALL=C
 PAGER=cat
 TZ=UTC
-export LANG LC_ALL PAGER TZ
+export PAGER TZ
 EDITOR=:
 VISUAL=:
 unset AUTHOR_DATE
diff --git a/test-utf.c b/test-utf.c
new file mode 100644
index 0000000..133eea0
--- /dev/null
+++ b/test-utf.c
@@ -0,0 +1,61 @@
+#include <stdio.h>
+#include <time.h>
+#include <assert.h>
+
+#include "cache.h"
+#include "utf.h"
+
+int main(int argc, char **argv)
+{	/* Self-checking driver for the utf.c helpers; a failing assert() aborts the run. */
+	int i;
+
+#if 0
+	for (i = 1; i < argc; i++) {
+		char result1[100];
+		char result2[100];
+
+		utfcpy(result1, argv[i], strlen(argv[i])+1);
+		localcpy(result2, result1, strlen(result1)+1);
+
+		printf("%s -> %s -> %s\n", argv[i], result1, result2);
+	}
+	return 0;
+#endif
+/* Each test() is a "case __LINE__" label; the loop over i in begin_suite() visits every label below line 1000. */
+#define test(name) case __LINE__: current_name=name; n++; printf("Testing case #%d: %s\n", n, current_name);
+#define end_test break;
+#define begin_suite() char *current_name=0; int n=0; for (i=0; i<1000; ++i) { switch(i) {
+#define concats(a,b) #a #b
+/* n starts at 0 because test() increments it before printing: the first case is reported as #1. */
+#undef strcmp
+#define assertStringEquals(a,b) assert(#a #b && strcmp(a,b)==0)
+#define assertIntEquals(a,b) assert(#a #b && (a)==(b))
+
+#define end_suite() }}
+
+	begin_suite();
+
+	test("utfcpy") {
+	  char result[100];
+	  utfcpy(result,"Ändrad",7);	/* NOTE(review): 7 presumably counts the NUL of a single-byte encoded literal -- confirm file encoding */
+	  assertStringEquals(result,"\303\204ndrad");
+	} end_test;
+
+	test("utflen") {
+	  int result=utflen("Ändrad",7);
+	  assertIntEquals(result,8);
+	} end_test;
+
+	test("localcpy") {
+	  char result[100];
+	  localcpy(result,"\303\204ndrad",8);
+	  assertStringEquals(result,"Ändrad");
+	} end_test;
+
+	test("locallen") {
+	  int result=locallen("\303\204ndrad",8);
+	  assertIntEquals(result,7);
+	} end_test;
+
+	end_suite();
+}
diff --git a/utf.c b/utf.c
new file mode 100644
index 0000000..eb430b2
--- /dev/null
+++ b/utf.c
@@ -0,0 +1,207 @@
+#undef UTF8INTERNAL
+
+#include <langinfo.h>
+#include <iconv.h>
+#include "cache.h"
+#include <locale.h>
+#include <stdarg.h>
+
+static iconv_t local_to_utf8 = (iconv_t)-1;
+static iconv_t utf8_to_local = (iconv_t)-1;
+static iconv_t utf8_to_utf8 = (iconv_t)-1;
+static int same = 0;
+
+#if TEST
+#define die printf
+#endif
+
+/* Open the iconv descriptors once; sets 'same' instead when the locale codeset is already UTF-8. */
+static void initlocale(void)
+{
+#ifndef NO_ICONV
+	if (!same && local_to_utf8 == (iconv_t)-1) {
+		setlocale(LC_CTYPE, "");
+		char *local_encoding = nl_langinfo(CODESET);
+#ifdef DEBUG
+		if (debug()) fprintf(stderr,"encoding=%s\n", local_encoding);
+#endif
+		if (strcmp(local_encoding,"UTF-8") == 0) {
+			same = 1;	/* nothing to convert; no descriptors are opened */
+			return;
+		}
+		/* All descriptors are opened here, so an unsupported codeset is reported at setup time. */
+		local_to_utf8 = iconv_open("UTF-8",  local_encoding);
+		if (local_to_utf8 == (iconv_t)-1)
+			die("cannot setup locale conversion from %s: %s", local_encoding, strerror(errno));
+#ifdef DEBUG
+		if (debug()) fprintf(stderr,"utf8_to_local = iconv_open(%s,UTF-8)\n",local_encoding);
+#endif
+		utf8_to_local = iconv_open(local_encoding,  "UTF-8");
+		if (utf8_to_local == (iconv_t)-1)	/* message used to say "from", hiding which direction failed */
+			die("cannot setup locale conversion to %s: %s", local_encoding, strerror(errno));
+
+		/* Identity conversion; maybe_utf8() runs input through it to see whether it is valid UTF-8. */
+		utf8_to_utf8 = iconv_open("UTF-8","UTF-8");
+		if (utf8_to_utf8 == (iconv_t)-1)
+			die("cannot setup locale conversion from UTF-8 to UTF-8: %s",strerror(errno));
+	}
+#endif
+}
+
+/* Nonzero if the len bytes at local pass a UTF-8 -> UTF-8 iconv run; initlocale() must have opened utf8_to_utf8. */
+int maybe_utf8(const char *local, size_t len)
+{
+  char *self = xcalloc(1,len+1), *selfp = self, *in = (char *)local;
+  size_t inlen = len, outlen = len+1;	/* iconv advances these copies, so local/len stay intact for the log */
+  size_t ret = iconv(utf8_to_utf8, &in, &inlen, &selfp, &outlen);
+  free(self);
+  P(("maybelocal: %.*s %s\n", (int)len, local, ret!=(size_t)-1 ? "yes" : "no"));
+  return ret != (size_t)-1;
+}
+
+size_t utflen(const char *local, size_t locallen)
+{	/* Size in bytes of 'local' once converted to UTF-8; locallen is returned unchanged whenever no conversion is done. */
+#ifndef NO_ICONV
+	initlocale();
+	if (same) {
+		return locallen;
+	}
+	if (maybe_utf8(local, locallen))	/* input already validates as UTF-8 */
+		return locallen;
+
+	size_t outlen=locallen*6;	/* scratch room: six output bytes per input byte */
+	char *outbuf=xcalloc(outlen,1);
+	char *out=outbuf;
+	iconv(local_to_utf8, NULL, NULL, NULL, NULL);	/* reset the descriptor's conversion state */
+	const char *vlocal = local;
+	size_t vlocallen = locallen;
+	if (iconv(local_to_utf8,  (char**)&vlocal,  &vlocallen,  &out,  &outlen) == -1) {
+#if TEST
+		perror("failed");
+#endif
+		free(outbuf);
+		return locallen;	/* conversion failed: fall back to the input size */
+	}
+	*out = 0;	/* NOTE(review): lands one past outbuf if all of outlen was consumed -- confirm that cannot happen */
+	free(outbuf);
+	return locallen*6 - outlen;	/* bytes produced = initial room minus room left */
+#else
+	return locallen;
+#endif
+}
+
+/* Copy localsize bytes of from_local into to_utf, converting locale encoding -> UTF-8. */
+void utfcpy(char *to_utf, char *from_local, size_t localsize)
+{
+	/* Output room assumed at to_utf (the caller must provide it); the DEBUG trace reads it too. */
+	size_t outlen = localsize*6;
+#ifndef NO_ICONV
+	char *vfrom_local = from_local;
+	char *vto_utf = to_utf;
+	size_t vlocalsize = localsize;
+
+	initlocale();
+	/* Already UTF-8, by locale or by content: copy verbatim, no trace. */
+	if (same || maybe_utf8(from_local, localsize)) {
+		memcpy(to_utf, from_local, localsize);
+		return;
+	}
+
+	iconv(local_to_utf8, NULL, NULL, NULL, NULL);	/* reset conversion state */
+	if (iconv(local_to_utf8, &vfrom_local, &vlocalsize, &vto_utf, &outlen) == (size_t)-1) {
+		/* printf precision takes an int, not a size_t */
+		fprintf(stderr,"Failed to convert %.*s to UTF\n", (int)localsize, from_local);
+		memcpy(to_utf, from_local, localsize);
+		outlen = localsize*5;	/* raw fallback put localsize bytes in to_utf */
+	}
+#else
+	memcpy(to_utf, from_local, localsize);
+	outlen = localsize*5;	/* no conversion: localsize bytes were copied */
+#endif
+#ifdef DEBUG
+	if (debug())
+		fprintf(stderr,"%.*s ->UTF %.*s\n", (int)localsize, from_local, (int)(localsize*6 - outlen), to_utf);
+#endif
+}
+
+/* Bytes the utflen bytes at utf would occupy in the locale encoding; utflen itself if nothing is converted. */
+size_t locallen(const char *utf, size_t utflen)
+{
+#ifndef NO_ICONV
+	initlocale();
+	if (same)
+		return utflen;
+	char *outbuf=xcalloc(utflen*4,1); /* ??, can we be more specific? */
+	char *out=outbuf;
+	size_t outlen=utflen*4;
+	iconv(utf8_to_local, NULL, NULL, NULL, NULL);	/* reset conversion state */
+	const char *vutf = utf;	/* const kept, as utflen() does; it is cast away only at the iconv call */
+	size_t vutflen = utflen;
+	if (iconv(utf8_to_local,  (char**)&vutf,  &vutflen,  &out,  &outlen) == (size_t)-1) {
+#ifdef DEBUG
+		perror("failed");
+#endif
+		free(outbuf);
+		return utflen;
+	}
+	/* No NUL is stored: out may sit one past a completely filled outbuf, and the buffer is freed right here. */
+	free(outbuf);
+	return utflen*4 - outlen;
+#else
+	return utflen;
+#endif
+}
+
+/* Copy utflen bytes of fromutf into tolocal, converting UTF-8 -> locale encoding. */
+void localcpy(char *tolocal, char *fromutf, size_t utflen)
+{
+	size_t outlen = utflen*4;	/* output room assumed at tolocal; the DEBUG trace reads it too */
+	initlocale();
+	if (same) {
+		memcpy(tolocal, fromutf, utflen);
+		return;
+	}
+#ifndef NO_ICONV
+	char *vfromutf = fromutf;
+	char *vtolocal = tolocal;
+	size_t vutflen = utflen;
+	iconv(utf8_to_local, NULL, NULL, NULL, NULL);	/* reset conversion state */
+	if (iconv(utf8_to_local, &vfromutf, &vutflen, &vtolocal, &outlen) == (size_t)-1) {
+		fprintf(stderr,"Failed to convert %.*s to LOCAL\n", (int)utflen, fromutf);
+		memcpy(tolocal, fromutf, utflen);
+		outlen = utflen*3;	/* raw fallback put utflen bytes in tolocal */
+	}
+#else
+	memcpy(tolocal, fromutf, utflen);	/* was "tolocale", an undeclared name */
+	outlen = utflen*3;	/* no conversion: utflen bytes were copied */
+#endif
+#ifdef DEBUG
+	if (debug()) fprintf(stderr,"%.*s ->LOCAL %.*s\n", (int)utflen, fromutf, (int)(utflen*4-outlen), tolocal);
+#endif
+}
+
+int PP(const char *fmt,...)
+{	/* printf-style output to stderr (the P() macro in utf.h expands to a call of this); returns vfprintf's result */
+  va_list va;
+  va_start(va,fmt);
+  int ret=vfprintf(stderr,fmt,va);
+  va_end(va);
+  return ret;
+}
+
+int debugf=-1;	/* cached answer of debug(): -1 = not looked up yet, 0 = off, 1 = on */
+
+int debug()
+{	/* Returns 1 when the DEBUG environment variable is set to a non-empty string, else 0. */
+	if (debugf == -1) {	/* first call: consult the environment once */
+		char *f = getenv("DEBUG");
+		if (!f) {
+			debugf = 0;
+		} else if (f[0] != 0) {
+			debugf = 1;
+		} else	/* set but empty counts as off */
+			debugf = 0;
+	}
+	return debugf == 1;
+}
+
diff --git a/utf.h b/utf.h
new file mode 100644
index 0000000..c6c6224
--- /dev/null
+++ b/utf.h
@@ -0,0 +1,27 @@
+#ifndef UTF_H
+#define UTF_H 1
+
+/** utflen: the number of octets 'local' would occupy encoded as UTF-8.
+ *  The input is assumed to be in the locale encoding. locallen is the reverse direction.
+ */
+extern size_t utflen(const char *local,size_t locallen);
+extern size_t locallen(const char *utf,size_t utflen);
+
+/* Copy localen bytes from fromlocal to toutf, converting locale encoding to UTF-8 */
+extern void utfcpy(char *toutf,char *fromlocal,size_t localen);
+
+/* Copy utflen bytes from fromutf to tolocal, converting UTF-8 to the locale encoding */
+extern void localcpy(char *tolocal,char *fromutf,size_t utflen);
+
+#ifdef DEBUG	/* tracing helpers: D(msg) and P((fmt, ...)) print to stderr only when debug() is true */
+#define D(x) do { if (debug()) fprintf(stderr,"%s:%d:%s\n",__FILE__,__LINE__,x); } while(0)
+#define P(x) do { if (debug()) { fprintf(stderr,"%s:%d:",__FILE__,__LINE__); PP x; } } while(0)
+int PP(const char *fmt,...);
+int debug();
+
+#else	/* without DEBUG both macros expand to nothing */
+#define D(x)
+#define P(x)
+#endif
+
+#endif
-- 
1.6.3.dirty

  reply	other threads:[~2009-05-12 22:50 UTC|newest]

Thread overview: 20+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2009-05-12 22:50 [RFC 0/8] Antique UTF-8 filename support Robin Rosenberg
2009-05-12 22:50 ` Robin Rosenberg [this message]
2009-05-12 22:50   ` [RFC 2/8] Messages in locale Robin Rosenberg
2009-05-12 22:50     ` [RFC 3/8] Extend tests to cover locale wrt to commit messages Robin Rosenberg
2009-05-12 22:50       ` [RFC 4/8] UTF file names Robin Rosenberg
     [not found]         ` <1242168631-30753-6-git-send-email-robin.rosenberg@dewire.com>
2009-05-12 22:50           ` [RFC 6/8] test of utf_locallinks Robin Rosenberg
2009-05-12 22:50             ` [RFC 7/8] Convert symlink dest in diff Robin Rosenberg
2009-05-12 22:50               ` [RFC 8/8] UTF-8 in non-SHA1-objects Robin Rosenberg
2009-05-13  0:20   ` [RFC 1/8] UTF helpers Johannes Schindelin
2009-05-13  5:24     ` Robin Rosenberg
2009-05-13  9:24       ` Esko Luontola
2009-05-13 10:02         ` Andreas Ericsson
2009-05-13 10:21           ` Esko Luontola
2009-05-13 11:44             ` Alex Riesen
2009-05-13 18:48         ` Junio C Hamano
2009-05-13 19:31           ` Esko Luontola
2009-05-13 20:10             ` Junio C Hamano
2009-05-13 10:14       ` Johannes Schindelin
2009-05-14  4:38       ` Junio C Hamano
2009-05-14 13:57         ` Jay Soffian

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1242168631-30753-2-git-send-email-robin.rosenberg@dewire.com \
    --to=robin.rosenberg@dewire.com \
    --cc=git@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.