linux-man.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Radisson97@gmx.de
To: linux-man@vger.kernel.org
Subject: new:mbrtoc32.3: convert from to c32
Date: Sun, 20 Jun 2021 22:28:41 +0200	[thread overview]
Message-ID: <60cfa4f9.oQaEtQkckFQJoYb7%Radisson97@gmx.de> (raw)

From eb1ee6439f85b6a349c84488fa63dc7b795e43a0 Mon Sep 17 00:00:00 2001
From: Peter Radisson <--show-origin>
Date: Sun, 20 Jun 2021 22:21:55 +0200
Subject: [PATCH] convert between multibyte sequence and 32-bit wide character

documentation including example

Signed-off-by: Peter Radisson <--show-origin>
---
 man3/mbrtoc32.3 | 154 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 154 insertions(+)
 create mode 100644 man3/mbrtoc32.3

diff --git a/man3/mbrtoc32.3 b/man3/mbrtoc32.3
new file mode 100644
index 000000000..8d0c33de1
--- /dev/null
+++ b/man3/mbrtoc32.3
@@ -0,0 +1,154 @@
+.TH  MBRTOC32  3 "2021-06-02" Linux "Linux Programmer's Manual"
+.SH NAME
+mbrtoc32, c32rtomb \- convert between multibyte sequence and 32-bit wide character
+.SH SYNOPSIS
+.nf
+.B #include <uchar.h>
+.PP
+.BI "size_t mbrtoc32(char32_t *restrict " c32 ,
+.BI "                const char *restrict " s ", size_t " n ,
+.BI "                mbstate_t *restrict " p );
+.PP
+.BI "size_t c32rtomb(char *restrict " s ", char32_t " c32 ,
+.BI "                mbstate_t *restrict " p );
+.fi
+.SH DESCRIPTION
+The
+.BR mbrtoc32  ()
+function inspects at most
+.I n
+bytes of the UTF-8 multibyte string starting at
+.IR s .
+If a multibyte character is identified as valid, the corresponding UTF-32
+32-bit wide character is stored in
+.IR c32 .
+If the multibyte character is the null wide character, the function
+resets the shift state
+.I *p
+to the initial state and returns 0.
+If
+.I p
+is NULL, a static anonymous state known only to the
+function is used instead.
+.PP
+The
+.BR c32rtomb ()
+function converts the 32-bit wide character stored in
+.I c32
+into a multibyte sequence stored in the memory pointed to by
+.IR s .
+.SH "RETURN VALUES"
+The
+.BR mbrtoc32  ()
+function returns
+0 for the null character,
+\-1 for invalid input,
+\-2 for truncated input, and
+\-3 when a 32-bit wide character left over from an earlier call is
+written to
+.IR *c32 ;
+in that case no bytes are processed from the input.
+.PP
+Otherwise the number of bytes in the multibyte sequence is returned.
+.PP
+The
+.BR c32rtomb ()
+function returns \-1 on error; otherwise it returns the number of bytes
+used for the multibyte sequence.
+.SH EXAMPLE
+The input sequence is written as a byte sequence to allow a proper
+display. Note that the example uses both UTF-8 and UTF-32; it may not be
+possible to convert every code.
+.EX
+.nf
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <locale.h>
+#include <uchar.h>
+#include <wchar.h>
+
+void toc32( char *in,  int in_len, char32_t **outbuf, int *len)
+{
+    char *p_in , *end ;
+    char32_t *p_out,*out;
+    size_t rc;
+
+    out=malloc(in_len*sizeof(*out));
+    p_out = out;
+    p_in = in;
+    end = in + in_len;
+    while((rc = mbrtoc32(p_out, p_in, end - p_in, NULL)))
+    {
+        if(rc ==  -1)      // invalid input
+            break;
+        else if(rc == (size_t)-2) // truncated input
+	  break;
+        else if(rc == (size_t)-3) // UTF-32 high surrogate
+            p_out += 1;
+        else {
+            p_in += rc;
+            p_out += 1;
+        };
+    }
+    // out_sz = p_out - out + 1;
+  *len=p_out - out + 1;
+  *outbuf=out;
+}
+
+void fromc32(char32_t *in, int in_len, char **outbuf, int *len)
+{
+  char *out,*p;
+  int i;
+   size_t rc;
+  p=out=malloc(MB_CUR_MAX * in_len);
+  for(i=0;i<in_len;i++) {
+    rc=c32rtomb(p, in[i], NULL);
+    if(rc == (size_t)-1) break;
+    p += rc;
+    }
+  *outbuf=out;
+  *len=p-out+1;
+}
+
+void dump_u8(char *in, int len)
+{
+    int i;
+    printf("Processing %d UTF-8 code units: [ ", len);
+    for(i = 0; i <len ; ++i) printf("%#x ", (unsigned char)in[i]);
+    puts("]");
+}
+
+void dump_u32(char32_t *in, int len)
+{
+    int i;
+    printf("Processing %d UTF-32 code units: [ ", len);
+    for(i = 0; i < len; ++i) printf("0x%04x ", in[i]);
+    puts("]");
+
+}
+
+int main(void){
+  char in[] = "z\u00df\u6c34\U0001F34C";
+  char32_t *out;
+  int out_len,len;
+  char *p;
+  // make sure we have utf8
+  setlocale(LC_ALL, "de_DE.utf8");
+  dump_u8(in,sizeof in / sizeof *in);
+  toc32(in,sizeof in / sizeof *in,&out,&out_len);
+  dump_u32(out,out_len);
+  fromc32(out,out_len,&p,&len);
+  dump_u8(p,len);
+  return 0;
+}
+
+.fi
+.EE
+This is a simple example and not production ready.
+.SH "CONFORMING TO"
+C11
+.SH "SEE ALSO"
+.BR mbrtoc16 (3),
+.BR c16rtomb (3),
+.BR mbsrtowcs (3)
--
2.26.2


             reply	other threads:[~2021-06-20 20:28 UTC|newest]

Thread overview: 10+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2021-06-20 20:28 Radisson97 [this message]
2021-06-20 20:29 new:mbrtoc32.3: convert from to c32 Radisson97
2021-07-03 17:40 ` Alejandro Colomar (man-pages)
2021-07-03 18:01   ` Alejandro Colomar (man-pages)
2021-07-05 20:31     ` Radisson
2021-07-06 10:57       ` Alejandro Colomar (man-pages)
2021-07-04 10:26 Bruno Haible
2021-07-05 19:07 ` Alejandro Colomar (man-pages)
2021-07-05 21:09   ` Radisson
2021-07-06 11:06     ` Alejandro Colomar (man-pages)

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=60cfa4f9.oQaEtQkckFQJoYb7%Radisson97@gmx.de \
    --to=radisson97@gmx.de \
    --cc=linux-man@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).