add morphisms
- unicode - value delimited seq - zigzag encoding (still lacking any way to get to signed integer yet...)
This commit is contained in:
parent
630948139b
commit
d0118b56b1
3 changed files with 190 additions and 0 deletions
morphisms
93
morphisms/unicode.morphism-base
Normal file
93
morphisms/unicode.morphism-base
Normal file
|
@ -0,0 +1,93 @@
|
|||
```
|
||||
```
|
||||
|
||||
morph_string_as_ascii_to_utf8 ()
|
||||
<Seq ~ <ValueTerminated '\0'> Char~Ascii~x86.UInt8>
|
||||
--> <Seq Char~Unicode>
|
||||
~ UTF-8
|
||||
~ <Seq~<ValueTerminated 0> x86.UInt8>
|
||||
```
|
||||
while( *src ) { *dst++ = *src++; }
|
||||
*dst = 0;
|
||||
return 0;
|
||||
```
|
||||
|
||||
morph_string_as_utf8_to_ascii ()
|
||||
<Seq Char~Unicode>
|
||||
~ UTF-8
|
||||
~ <Seq~<ValueTerminated 0> x86.UInt8>
|
||||
--> <Seq ~ <ValueTerminated '\0'> Char~Ascii~x86.UInt8>
|
||||
```
|
||||
while( *src ) {
|
||||
if( *src < 128 ) {
|
||||
*dst++ = *src++;
|
||||
} else {
|
||||
fprintf(stderr, "(morph UTF-8 to Ascii) ignore multi-byte character\n");
|
||||
while( *++src >= 128 );
|
||||
}
|
||||
}
|
||||
*dst = 0;
|
||||
return 0;
|
||||
```
|
||||
|
||||
morph_string_as_ascii_to_utf32 ()
|
||||
<Seq ~ <ValueTerminated '\0'> Char~Ascii~x86.UInt8>
|
||||
--> <Seq Char~Unicode>
|
||||
~ UTF-32LE
|
||||
~ <Seq~<ValueTerminated 0> x86.UInt32>
|
||||
```
|
||||
while( *src ) { *dst++ = *src++; }
|
||||
*dst = 0;
|
||||
return 0;
|
||||
```
|
||||
|
||||
morph_string_as_utf8_to_utf32 ()
|
||||
|
||||
<Seq Char~Unicode>
|
||||
~ UTF-8
|
||||
~ <Seq~<ValueTerminated 0> x86.UInt8>
|
||||
|
||||
--> <Seq Char~Unicode>
|
||||
~ UTF-32LE
|
||||
~ <Seq~<ValueTerminated 0> x86.UInt32>
|
||||
|
||||
```
|
||||
bool has_multibyte = false;
|
||||
uint32_t val = 0;
|
||||
while( *src ) {
|
||||
uint8_t tag = (*src >> 6)&0b11;
|
||||
switch( tag ) {
|
||||
// single byte
|
||||
case 0b00:
|
||||
case 0b01:
|
||||
if( has_multibyte ) {
|
||||
*dst++ = val;
|
||||
has_multibyte = false;
|
||||
}
|
||||
*dst++ = *src++;
|
||||
break;
|
||||
|
||||
// start multibyte
|
||||
case 0b11:
|
||||
if( has_multibyte ) {
|
||||
*dst++ = val;
|
||||
}
|
||||
has_multibyte = true;
|
||||
val = (*src++) & 0b111111;
|
||||
break;
|
||||
|
||||
// continue multibyte
|
||||
case 0b10:
|
||||
val <<= 6;
|
||||
val |= (*src++) & 0b111111;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if( has_multibyte )
|
||||
*dst++ = val;
|
||||
|
||||
*dst++ = 0;
|
||||
|
||||
return 0;
|
||||
```
|
Loading…
Add table
Add a link
Reference in a new issue