All of lore.kernel.org
 help / color / mirror / Atom feed
* new:mbrtoc16.3: convert from to c16
@ 2021-06-20 20:30 Radisson97
  0 siblings, 0 replies; only message in thread
From: Radisson97 @ 2021-06-20 20:30 UTC (permalink / raw)
  To: linux-man; +Cc: alx.manpages

From 20fb95dcc1b1f85f0bc1afff39824729fea8297b Mon Sep 17 00:00:00 2001
From: Peter Radisson <--show-origin>
Date: Sun, 20 Jun 2021 22:14:59 +0200
Subject: [PATCH] convert between multibyte sequence and 16-bit wide character

documentation including example
Signed-off-by: Peter Radisson <--show-origin>
---
 man3/mbrtoc16.3 | 156 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 156 insertions(+)
 create mode 100644 man3/mbrtoc16.3

diff --git a/man3/mbrtoc16.3 b/man3/mbrtoc16.3
new file mode 100644
index 000000000..18cb48adc
--- /dev/null
+++ b/man3/mbrtoc16.3
@@ -0,0 +1,156 @@
+.TH  MBRTOC16  3 "2021-06-02" Linux "Linux Programmer's Manual"
+.SH NAME
+mbrtoc16, c16rtomb \- convert between multibyte sequence and 16-bit wide character
+.SH SYNOPSIS
+.nf
+.B #include <uchar.h>
+.PP
+.BI "size_t mbrtoc16(char16_t *restrict " c16 ,
+.BI "                const char *restrict " s ", size_t " n ,
+.BI "                mbstate_t *restrict " p );
+.PP
+.BI "size_t c16rtomb (char * restrict " s ", char16_t  " c16 " ,"
+.BI "                     mbstate_t * restrict " p " );"
+.fi
+.SH DESCRIPTION
+The
+.BR mbrtoc16  ()
+function inspects at most
+.I n
+bytes of the UTF-8 multibyte string starting at
+.IR s .
+If a valid multibyte character is identified, the corresponding UTF-16
+16-bit wide character is stored in
+.IR c16 .
+If the multibyte character is the null wide character, it
+resets the shift state
+.I *p
+to the initial state and returns 0.
+If
+.I p
+is NULL, a static anonymous state known only to the
+function is used instead.
+.PP
+The
+.BR c16rtomb ()
+function converts the 16-bit wide character stored in
+.I c16
+into a multibyte sequence and stores it in the memory pointed to by
+.IR s .
+.SH "RETURN VALUE"
+The
+.BR mbrtoc16  ()
+function returns
+0 for the null character,
+\-1 for invalid input,
+\-2 for truncated input, or
+\-3 when a further 16-bit code unit of a surrogate pair (U+D800 to U+DFFF) is
+written to
+.IR *c16 .
+In the \-3 case no bytes are consumed from the input.
+.PP
+Otherwise the number of bytes in the multibyte sequence is returned.
+.PP
+The
+.BR c16rtomb ()
+function returns \-1 on error; otherwise it returns the number of bytes used
+for the multibyte sequence.
+.SH EXAMPLES
+The input sequence is written as a byte sequence to allow a proper
+display. Note that the example converts between UTF-8 and UTF-16; it may not
+be possible to convert every code point.
+.EX
+.\"
+.\" //  https://en.cppreference.com/w/c/string/multibyte/mbrtoc16
+.\"
+.nf
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <locale.h>
+#include <uchar.h>
+#include <wchar.h>
+
+void toc16( char *in,  int in_len, char16_t **outbuf, int *len)
+{
+    char *p_in , *end ;
+    char16_t *p_out,*out;
+    size_t rc;
+
+    out=malloc(in_len*sizeof(*out));
+    p_out = out;
+    p_in = in;
+    end = in + in_len;
+    while((rc = mbrtoc16(p_out, p_in, end - p_in, NULL)))
+    {
+        if(rc ==  -1)      // invalid input
+            break;
+        else if(rc == (size_t)-2) // truncated input
+	  break;
+        else if(rc == (size_t)-3) // UTF-16 high surrogate
+            p_out += 1;
+        else {
+            p_in += rc;
+            p_out += 1;
+        };
+    }
+  *len=p_out - out + 1;
+  *outbuf=out;
+}
+
+void fromc16(char16_t *in, int in_len, char **outbuf, int *len)
+{
+  char *out,*p;
+  int i;
+   size_t rc;
+  p=out=malloc(MB_CUR_MAX * in_len);
+  for(i=0;i<in_len;i++) {
+    rc=c16rtomb(p, in[i], NULL);
+    if(rc == (size_t)-1) break;
+    p += rc;
+    }
+  *outbuf=out;
+  *len=p-out+1;
+}
+
+void dump_u8(char *in, int len)
+{
+    int i;
+    printf("Processing %d UTF-8 code units: [ ", len);
+    for(i = 0; i <len ; ++i) printf("%#x ", (unsigned char)in[i]);
+    puts("]");
+}
+
+void dump_u16(char16_t *in, int len)
+{
+    int i;
+    printf("Processing %d UTF-16 code units: [ ", len);
+    for(i = 0; i < len; ++i) printf("0x%04x ", in[i]);
+    puts("]");
+
+}
+
+int main(void){
+  char in[] = "z\u00df\u6c34\U0001F34C";
+  char16_t *out;
+  int out_len,len;
+  char *p;
+  // make sure we have utf8
+  setlocale(LC_ALL, "de_DE.utf8");
+  dump_u8(in,sizeof in / sizeof *in);
+  toc16(in,sizeof in / sizeof *in,&out,&out_len);
+  dump_u16(out,out_len);
+  fromc16(out,out_len,&p,&len);
+  dump_u8(p,len);
+  return 0;
+}
+.fi
+.EE
+This is a simple example and not production ready.
+.SH NOTES
+UTF-16 is superseded by UTF-32.
+.SH "CONFORMING TO"
+C11
+.SH "SEE ALSO"
+.BR mbrtoc32 (3),
+.BR c32rtomb (3)
--
2.26.2


^ permalink raw reply related	[flat|nested] only message in thread

only message in thread, other threads:[~2021-06-20 20:30 UTC | newest]

Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-06-20 20:30 new:mbrtoc16.3: convert from to c16 Radisson97

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.