linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH][2.5] UTF-8 support in console
@ 2003-05-31 14:10 Chris Heath
  2003-05-31 14:21 ` Christoph Hellwig
  0 siblings, 1 reply; 11+ messages in thread
From: Chris Heath @ 2003-05-31 14:10 UTC (permalink / raw)
  To: linux-kernel

[-- Attachment #1: Type: text/plain, Size: 496 bytes --]

Although the keyboard (keyboard.c) and terminal (vt.c) both have "UTF-8
modes", there are still a couple of shortcomings at the kernel level.

  * compose tables use 8-bit characters,
  * selection doesn't copy/paste UTF-8.

(OK, so there are many more, but these two are the ones that make
it inconvenient to even use extended Latin characters.)

Here is a patch against 2.5.70 that fixes selection.  It uses the existing
Unicode font information (ushorts) to create an inverse (glyph-to-Unicode) mapping.

Chris

[-- Attachment #2: selection.patch --]
[-- Type: application/octet-stream, Size: 6480 bytes --]

--- a/include/linux/consolemap.h	2003-05-04 19:53:56.000000000 -0400
+++ b/include/linux/consolemap.h	2003-05-26 15:17:45.000000000 -0400
@@ -10,6 +10,6 @@
 
 struct vc_data;
 
-extern unsigned char inverse_translate(struct vc_data *conp, int glyph);
+extern u16 inverse_translate(struct vc_data *conp, int glyph, int use_unicode);
 extern unsigned short *set_translate(int m,int currcons);
 extern int conv_uni_to_pc(struct vc_data *conp, long ucs);
--- a/drivers/char/consolemap.c	2003-05-04 19:53:36.000000000 -0400
+++ b/drivers/char/consolemap.c	2003-05-26 15:01:58.000000000 -0400
@@ -178,6 +178,7 @@
 	unsigned long	refcount;
 	unsigned long	sum;
 	unsigned char	*inverse_translations[4];
+	u16		*inverse_trans_unicode;
 	int		readonly;
 };
 
@@ -208,6 +209,39 @@
 	}
 }
 
+/* Rebuild p's glyph -> Unicode table; a glyph keeps the first value >= 32 found. */
+static void set_inverse_trans_unicode(struct vc_data *conp, struct uni_pagedir *p)
+{
+	int hi, mid, lo;
+	u16 *inv;
+
+	if (!p)
+		return;
+	inv = p->inverse_trans_unicode;
+	if (!inv) {
+		inv = kmalloc(MAX_GLYPH * sizeof(u16), GFP_KERNEL);
+		p->inverse_trans_unicode = inv;
+		if (!inv)
+			return;
+	}
+	memset(inv, 0, MAX_GLYPH * sizeof(u16));
+
+	for (hi = 0; hi < 32; hi++) {
+		u16 **dir = p->uni_pgdir[hi];
+
+		for (mid = 0; dir && mid < 32; mid++) {
+			u16 *row = dir[mid];
+
+			for (lo = 0; row && lo < 64; lo++) {
+				u16 glyph = row[lo];
+				/* entries still below 32 may be overwritten by later ones */
+				if (glyph < MAX_GLYPH && inv[glyph] < 32)
+					inv[glyph] = (hi << 11) + (mid << 6) + lo;
+			}
+		}
+	}
+}
+
 unsigned short *set_translate(int m,int currcons)
 {
 	inv_translate[currcons] = m;
@@ -218,19 +252,27 @@
  * Inverse translation is impossible for several reasons:
  * 1. The font<->character maps are not 1-1.
  * 2. The text may have been written while a different translation map
- *    was active, or using Unicode.
+ *    was active.
  * Still, it is now possible to a certain extent to cut and paste non-ASCII.
  */
-unsigned char inverse_translate(struct vc_data *conp, int glyph)
+u16 inverse_translate(struct vc_data *conp, int glyph, int use_unicode)
 {
 	struct uni_pagedir *p;
 	if (glyph < 0 || glyph >= MAX_GLYPH)
 		return 0;
-	else if (!(p = (struct uni_pagedir *)*conp->vc_uni_pagedir_loc) ||
-		 !p->inverse_translations[inv_translate[conp->vc_num]])
+	else if (!(p = (struct uni_pagedir *)*conp->vc_uni_pagedir_loc))
 		return glyph;
-	else
-		return p->inverse_translations[inv_translate[conp->vc_num]][glyph];
+	else if (use_unicode) {	/* caller asked for the glyph's 16-bit Unicode value */
+		if (!p->inverse_trans_unicode)
+			return glyph;	/* no Unicode table allocated: hand back the glyph number */
+		else
+			return p->inverse_trans_unicode[glyph];
+	} else {	/* 8-bit lookup in the table selected by inv_translate[] for this console */
+		if (!p->inverse_translations[inv_translate[conp->vc_num]])
+			return glyph;
+		else
+			return p->inverse_translations[inv_translate[conp->vc_num]][glyph];
+	}
 }
 
 static void update_user_maps(void)
@@ -362,6 +404,10 @@
 			kfree(p->inverse_translations[i]);
 			p->inverse_translations[i] = NULL;
 		}
+	if (p->inverse_trans_unicode) {
+		kfree(p->inverse_trans_unicode);
+		p->inverse_trans_unicode = NULL;
+	}
 }
 
 void con_free_unimap(int con)
@@ -522,6 +568,7 @@
 
 	for (i = 0; i <= 3; i++)
 		set_inverse_transl(conp, p, i); /* Update all inverse translations */
+	set_inverse_trans_unicode(conp, p);
   
 	return err;
 }
@@ -574,6 +621,7 @@
 
 	for (i = 0; i <= 3; i++)
 		set_inverse_transl(conp, p, i);	/* Update all inverse translations */
+	set_inverse_trans_unicode(conp, p);
 	dflt = p;
 	return err;
 }
--- a/drivers/char/selection.c	2003-05-04 19:53:42.000000000 -0400
+++ b/drivers/char/selection.c	2003-05-26 15:26:07.000000000 -0400
@@ -20,6 +20,7 @@
 
 #include <asm/uaccess.h>
 
+#include <linux/kbd_kern.h>
 #include <linux/vt_kern.h>
 #include <linux/consolemap.h>
 #include <linux/selection.h>
@@ -36,6 +37,7 @@
 /* Variables for selection control. */
 /* Use a dynamic buffer, instead of static (Dec 1994) */
        int sel_cons;		/* must not be disallocated */
+static int use_unicode;
 static volatile int sel_start = -1; 	/* cleared by clear_selection */
 static int sel_end;
 static int sel_buffer_lth;
@@ -56,10 +58,10 @@
 	complement_pos(sel_cons, where);
 }
 
-static unsigned char
+static u16	/* 16 bits: matches the widened return type of inverse_translate() */
 sel_pos(int n)
 {
-	return inverse_translate(vc_cons[sel_cons].d, screen_glyph(sel_cons, n));
+	return inverse_translate(vc_cons[sel_cons].d, screen_glyph(sel_cons, n), use_unicode);
 }
 
 /* remove the current selection highlight, if any,
@@ -88,8 +90,8 @@
   0xFF7FFFFF  /* latin-1 accented letters, not division sign */
 };
 
-static inline int inword(const unsigned char c) {
-	return ( inwordLut[c>>5] >> (c & 0x1F) ) & 1;
+static inline int inword(const u16 c) {
+	return c > 0xff || (( inwordLut[c>>5] >> (c & 0x1F) ) & 1);	/* values above 0xff always count as in-word; others are looked up in the inwordLut bitmap */
 }
 
 /* set inwordLut contents. Invoked by ioctl(). */
@@ -110,13 +112,36 @@
 	return (v > u) ? u : v;
 }
 
+/* Encode the 16-bit value c as UTF-8 at p; returns the number of bytes written (1-3). */
+int store_utf8(u16 c, char *p)
+{
+	int len = 0;
+
+	if (c >= 0x800) {
+		/* three bytes: 1110**** 10****** 10****** */
+		p[len++] = 0xe0 | (c >> 12);
+		p[len++] = 0x80 | ((c >> 6) & 0x3f);
+	} else if (c >= 0x80) {
+		/* two bytes: 110***** 10****** */
+		p[len++] = 0xc0 | (c >> 6);
+	} else {
+		/* one byte: 0******* */
+		p[len++] = c;
+		return len;
+	}
+	p[len++] = 0x80 | (c & 0x3f);	/* last continuation byte, common to both multi-byte forms */
+	return len;
+}
+
 /* set the current selection. Invoked by ioctl() or by kernel code. */
 int set_selection(const unsigned long arg, struct tty_struct *tty, int user)
 {
 	int sel_mode, new_sel_start, new_sel_end, spc;
 	char *bp, *obp;
-	int i, ps, pe;
+	int i, ps, pe, multiplier;
+	u16 c;
 	unsigned int currcons = fg_console;
+	struct kbd_struct *kbd = kbd_table + fg_console;
 
 	unblank_screen();
 	poke_blanked_console();
@@ -170,6 +195,7 @@
 		clear_selection();
 		sel_cons = fg_console;
 	}
+	use_unicode = kbd && kbd->kbdmode == VC_UNICODE;
 
 	switch (sel_mode)
 	{
@@ -252,7 +278,8 @@
 	sel_end = new_sel_end;
 
 	/* Allocate a new buffer before freeing the old one ... */
-	bp = kmalloc((sel_end-sel_start)/2+1, GFP_KERNEL);
+	multiplier = use_unicode ? 3 : 1;  /* chars can take up to 3 bytes */
+	bp = kmalloc((sel_end-sel_start)/2*multiplier+1, GFP_KERNEL);
 	if (!bp) {
 		printk(KERN_WARNING "selection: kmalloc() failed\n");
 		clear_selection();
@@ -264,8 +291,12 @@
 
 	obp = bp;
 	for (i = sel_start; i <= sel_end; i += 2) {
-		*bp = sel_pos(i);
-		if (!isspace(*bp++))
+		c = sel_pos(i);
+		if (use_unicode)
+			bp += store_utf8(c, bp);
+		else
+			*bp++ = c;
+		if (!isspace(c))
 			obp = bp;
 		if (! ((i + 2) % video_size_row)) {
 			/* strip trailing blanks from line and add newline,

^ permalink raw reply	[flat|nested] 11+ messages in thread
* Re: coding style (was Re: [PATCH][2.5] UTF-8 support in console)
@ 2003-05-31 16:06 john
  2003-06-01  4:41 ` Matt Mackall
  0 siblings, 1 reply; 11+ messages in thread
From: john @ 2003-05-31 16:06 UTC (permalink / raw)
  To: chris, davej, hch, linux-kernel, lm

> > Saving a line over readability is utterly bogus.

> I agree 100%.  If you have anything more complex than
>
>         if (error) return (error);
>
> I want it to look like
>
>         if ((expr) || (expr2) || (expr3)) {
>                 return (error);
>         }

Ergh, personally I hate reading code like that.

Having said that, I hate code that is indented.  Am I the only person who mentally
counts "in, in, in, out", etc., when reading code?  Artificial indenting really
throws off my concentration: the 'prompting' it provides does nothing to help me,
the ragged left margin is irritating, and it makes moving code around awkward,
because you have to correct the indenting to match the current (actual) nesting
level of the code (indenting is a completely artificial concept).

I'm not trying to say my coding style is right and yours is wrong; I'm just curious
 :-).

John.

^ permalink raw reply	[flat|nested] 11+ messages in thread

end of thread, other threads:[~2003-06-01  5:05 UTC | newest]

Thread overview: 11+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2003-05-31 14:10 [PATCH][2.5] UTF-8 support in console Chris Heath
2003-05-31 14:21 ` Christoph Hellwig
2003-05-31 14:43   ` coding style (was Re: [PATCH][2.5] UTF-8 support in console) Larry McVoy
2003-05-31 15:01     ` Dave Jones
2003-05-31 15:39       ` Larry McVoy
2003-05-31 17:14         ` Steven Cole
2003-05-31 17:56           ` viro
2003-05-31 19:37   ` [PATCH][2.5] UTF-8 support in console Chris Heath
2003-05-31 16:06 coding style (was Re: [PATCH][2.5] UTF-8 support in console) john
2003-06-01  4:41 ` Matt Mackall
2003-06-01  5:18   ` Randy.Dunlap

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).