ldmc/morphisms/unicode.morphism-base

94 lines
2 KiB
Text

```
#include <stdio.h>
```
morph_string_as_ascii_to_utf8 ()
<Seq ~ <ValueTerminated 0> Char~Ascii~native.UInt8>
--> <Seq Char~Unicode>
~ UTF-8
~ <Seq~<ValueTerminated 0> native.UInt8>
```
while( *src ) { *dst++ = *src++; }
*dst = 0;
return 0;
```
morph_string_as_utf8_to_ascii ()
<Seq Char~Unicode>
~ UTF-8
~ <Seq~<ValueTerminated 0> native.UInt8>
--> <Seq ~ <ValueTerminated 0> Char~Ascii~native.UInt8>
```
    // Copy elements below 128 from `src` to `dst`; everything else is dropped
    // with a diagnostic on stderr. `dst` is terminated with 0. Always returns 0.
    while( *src != 0 ) {
        if( *src >= 128 ) {
            fprintf(stderr, "(morph UTF-8 to Ascii) ignore multi-byte character\n");
            // Skip the current element and all directly following elements
            // >= 128. A run of adjacent multi-byte characters is therefore
            // skipped as a whole and reported with a single message.
            do {
                ++src;
            } while( *src >= 128 );
            continue;
        }
        *dst++ = *src++;
    }
    *dst = 0;
    return 0;
```
morph_string_as_ascii_to_utf32 ()
<Seq ~ <ValueTerminated 0> Char~Ascii~native.UInt8>
--> <Seq Char~Unicode>
~ UTF-32
~ <Seq~<ValueTerminated 0> native.UInt32>
```
    // Store each element of `src` (up to, not including, the 0 terminator)
    // as one element of `dst`, then terminate `dst` with 0. Always returns 0.
    // NOTE(review): per the signature above `dst` presumably points to 32-bit
    // elements, so the assignment widens each input byte -- confirm against
    // the generated wrapper.
    while( *src != 0 ) {
        *dst = *src;
        ++dst;
        ++src;
    }
    *dst = 0;
    return 0;
```
morph_string_as_utf8_to_utf32 ()
<Seq Char~Unicode>
~ UTF-8
~ <Seq~<ValueTerminated 0> native.UInt8>
--> <Seq Char~Unicode>
~ UTF-32
~ <Seq~<ValueTerminated 0> native.UInt32>
```
    // Decode the 0-terminated UTF-8 byte sequence at `src` into one code
    // point per element at `dst`, followed by a 0 terminator.
    // Always returns 0.
    //
    // A multi-byte character is accumulated in `val` and written out when
    // the next character starts or when the input ends. The input is not
    // validated: continuation bytes that appear before any lead byte produce
    // no output, and truncated or overlong sequences are not rejected.
    bool has_multibyte = false;
    uint32_t val = 0;
    while( *src ) {
        uint8_t tag = (*src >> 6)&0b11;
        switch( tag ) {
            // single byte (0xxxxxxx)
            case 0b00:
            case 0b01:
                if( has_multibyte ) {
                    // flush the multi-byte character that just ended
                    *dst++ = val;
                    has_multibyte = false;
                }
                *dst++ = *src++;
                break;

            // start multibyte (11xxxxxx)
            case 0b11: {
                if( has_multibyte ) {
                    // previous multi-byte character ends here
                    *dst++ = val;
                }
                has_multibyte = true;

                // Keep only the payload bits of the lead byte. The number of
                // leading 1-bits encodes the sequence length, and those
                // marker bits must not end up in the code point:
                //   110xxxxx -> 5 payload bits (2-byte sequence)
                //   1110xxxx -> 4 payload bits (3-byte sequence)
                //   11110xxx -> 3 payload bits (4-byte sequence)
                uint8_t lead = *src++;
                if( (lead & 0b00100000) == 0 ) {
                    val = lead & 0b00011111;
                } else if( (lead & 0b00010000) == 0 ) {
                    val = lead & 0b00001111;
                } else {
                    val = lead & 0b00000111;
                }
                break;
            }

            // continue multibyte (10xxxxxx): append 6 payload bits
            case 0b10:
                val <<= 6;
                val |= (*src++) & 0b111111;
                break;
        }
    }
    // flush a multi-byte character that ends directly at the terminator
    if( has_multibyte ) {
        *dst++ = val;
    }
    *dst++ = 0;
    return 0;
```