add morphisms

- unicode
- value delimited seq
- zigzag encoding (still lacking any way to get to a signed integer yet...)
This commit is contained in:
Michael Sippel 2025-03-15 18:49:06 +01:00
parent 630948139b
commit d0118b56b1
Signed by: senvas
GPG key ID: F96CF119C34B64A6
3 changed files with 190 additions and 0 deletions

View file

@ -0,0 +1,93 @@
```
```
morph_string_as_ascii_to_utf8 ()
<Seq ~ <ValueTerminated '\0'> Char~Ascii~x86.UInt8>
--> <Seq Char~Unicode>
~ UTF-8
~ <Seq~<ValueTerminated 0> x86.UInt8>
```
while( *src ) { *dst++ = *src++; }
*dst = 0;
return 0;
```
morph_string_as_utf8_to_ascii ()
<Seq Char~Unicode>
~ UTF-8
~ <Seq~<ValueTerminated 0> x86.UInt8>
--> <Seq ~ <ValueTerminated '\0'> Char~Ascii~x86.UInt8>
```
    // UTF-8 -> ASCII (lossy): bytes below 128 are copied, every run of bytes
    // >= 128 is skipped and reported once on stderr.
    while( *src != 0 ) {
        if( *src >= 128 ) {
            fprintf(stderr, "(morph UTF-8 to Ascii) ignore multi-byte character\n");

            // Advance past the whole run of non-ASCII bytes; the terminating 0
            // is < 128, so this stops at the end of the input at the latest.
            do {
                ++src;
            } while( *src >= 128 );
            continue;
        }

        *dst++ = *src++;
    }

    // Terminate the destination sequence.
    *dst = 0;
    return 0;
```
morph_string_as_ascii_to_utf32 ()
<Seq ~ <ValueTerminated '\0'> Char~Ascii~x86.UInt8>
--> <Seq Char~Unicode>
~ UTF-32LE
~ <Seq~<ValueTerminated 0> x86.UInt32>
```
while( *src ) { *dst++ = *src++; }
*dst = 0;
return 0;
```
morph_string_as_utf8_to_utf32 ()
<Seq Char~Unicode>
~ UTF-8
~ <Seq~<ValueTerminated 0> x86.UInt8>
--> <Seq Char~Unicode>
~ UTF-32LE
~ <Seq~<ValueTerminated 0> x86.UInt32>
```
    // UTF-8 -> UTF-32: decodes a 0-terminated UTF-8 byte sequence into one
    // 32-bit code point per character and terminates the output with 0.
    //
    // The top two bits of every byte classify it:
    //   0b00 / 0b01 : single-byte (ASCII) character
    //   0b11        : lead byte of a multi-byte character
    //   0b10        : continuation byte carrying 6 payload bits
    //
    // A multi-byte code point is accumulated in `val` and emitted when the
    // next character starts or the input ends.
    // No validation is performed: stray continuation bytes, truncated
    // sequences and overlong encodings are not detected.
    bool has_multibyte = false;
    uint32_t val = 0;

    while( *src ) {
        uint8_t tag = (*src >> 6) & 0b11;
        switch( tag ) {
            // single byte
            case 0b00:
            case 0b01:
                // flush a pending multi-byte code point first
                if( has_multibyte ) {
                    *dst++ = val;
                    has_multibyte = false;
                }
                *dst++ = *src++;
                break;

            // start multibyte
            case 0b11: {
                // flush a pending multi-byte code point first
                if( has_multibyte ) {
                    *dst++ = val;
                }
                has_multibyte = true;

                // The number of payload bits in the lead byte depends on the
                // sequence length. Masking with a fixed 6 bits would leak the
                // length-marker bits of 3- and 4-byte leads into the code point
                // (e.g. E2 82 AC would decode to 0x220AC instead of U+20AC).
                uint8_t lead = *src++;
                uint8_t payload_mask;
                if( (lead & 0b00100000) == 0 ) {
                    payload_mask = 0b00011111;      // 110xxxxx : 2-byte sequence
                } else if( (lead & 0b00010000) == 0 ) {
                    payload_mask = 0b00001111;      // 1110xxxx : 3-byte sequence
                } else {
                    payload_mask = 0b00000111;      // 11110xxx : 4-byte sequence
                }
                val = lead & payload_mask;
                break;
            }

            // continue multibyte
            case 0b10:
                val <<= 6;
                val |= (*src++) & 0b111111;
                break;
        }
    }

    // flush a multi-byte code point that ended exactly at the terminator
    if( has_multibyte ) {
        *dst++ = val;
    }

    // Terminate the destination sequence.
    *dst++ = 0;
    return 0;
```

View file

@ -0,0 +1,71 @@
```
#include <array/length-prefix.h>
#include <stdlib.h>
```
morph_seqseq_valsep_uint8 (T: Type, SrcDelim: T, DstDelim: T)
< Seq <Seq T> >
~ < ValueSep SrcDelim T >
~ < Seq~<LengthPrefix x86.UInt64> T >
--> < Seq <Seq T> >
~ < ValueSep DstDelim T >
~ < Seq~<LengthPrefix x86.UInt64> T >
```
    // Re-delimit a value-separated sequence: every SrcDelim in src becomes a
    // DstDelim in dst, all other items are appended unchanged.
    //
    // Items of src that happen to equal DstDelim would be ambiguous in the
    // output. They are written as the two characters '\' 'n' when DstDelim is
    // '\n'; for any other DstDelim they are DROPPED without notice.
    // NOTE(review): no escape exists for other delimiters - confirm that the
    // silent drop is intended.
    length_prefix_uint8_array_clear( dst );

    for( uint64_t i = 0; i < src->len; ++i ) {
        if( src->items[i] == SrcDelim ) {
            length_prefix_uint8_array_push( dst, DstDelim );
        } else if( src->items[i] == DstDelim ) {
            if( DstDelim == '\n' ) {
                length_prefix_uint8_array_push( dst, '\\' );
                length_prefix_uint8_array_push( dst, 'n' );
            }
        } else {
            length_prefix_uint8_array_push( dst, src->items[i] );
        }
    }

    return 0;
```
morph_seqseq_as_valsep_to_lenpfx (T: Type, Delim: T, EscKey: T)
< Seq <Seq T> >
~ < ValueSep T Delim >
~ < Seq~<LengthPrefix x86.UInt64> T >
--> < Seq~<LengthPrefix x86.UInt64>
<Seq~<LengthPrefix x86.UInt64> T >
~ <RefMut < Seq~<LengthPrefix x86.UInt64> T>>
~ x86.Address
~ x86.UInt64
>
```
    // Split a Delim-separated sequence into individually heap-allocated,
    // length-prefixed segments and push the address of each segment (as a
    // uint64) onto dst.
    //
    // - A segment ends at a Delim or at the end of the input.
    // - A trailing Delim does not produce an extra empty segment.
    // - Every segment is allocated with malloc() here and never freed in this
    //   block; the pointers stored in dst are the only references to them.
    // - EscKey is not used by this implementation.
    //
    // Returns 0 on success, -1 if a segment could not be allocated (segments
    // pushed before the failure stay in dst).
    // NOTE(review): the other morphisms only ever return 0 - confirm that
    // callers treat a non-zero return value as failure.
    length_prefix_uint64_array_clear( dst );

    uint8_t const * start = &src->items[0];
    uint8_t const * cur = start;
    uint8_t const * end = &src->items[src->len];

    while( cur < end ) {
        int const at_delim = ( *cur == Delim );
        int const at_last = ( cur + 1 == end );

        if( at_delim || at_last ) {
            // When the input ends without a Delim, *cur is the last element of
            // the segment and has to be counted as part of it.
            uint64_t len = (uint64_t)( cur - start ) + ( at_delim ? 0 : 1 );

            struct LengthPrefixUInt8Array * cur_item =
                malloc( sizeof(uint64_t) + sizeof(uint8_t) * len );
            if( cur_item == NULL ) {
                return -1;
            }

            cur_item->len = len;
            memcpy( cur_item->items, start, len );
            length_prefix_uint64_array_push( dst, (uint64_t)(uintptr_t)cur_item );

            // The next segment starts right after the current element.
            start = ++cur;
        } else {
            cur++;
        }
    }

    return 0;
```

View file

@ -0,0 +1,26 @@
```
```
morph_i64_as_twos_complement_to_zigzag ()
~ x86.Int64
--> ~ ZigZagInt ~ ~ x86.UInt64
```
    // Two's complement -> zigzag:
    //    0 -> 0,  -1 -> 1,  1 -> 2,  -2 -> 3,  2 -> 4, ...
    // Non-negative n maps to 2*n, negative n maps to 2*|n| - 1.
    if( *src >= 0 ) {
        *dst = 2 * (uint64_t)*src;
    } else {
        // 2*|n| - 1 rewritten as 2*(-(n+1)) + 1: -(n+1) cannot overflow, whereas
        // negating INT64_MIN directly is undefined behaviour.
        *dst = 2 * (uint64_t)( -( *src + 1 ) ) + 1;
    }
    return 0;
```
morph_i64_as_zigzag_to_twos_complement ()
~ ZigZagInt ~ ~ x86.UInt64
--> ~ x86.Int64
```
if( *src % 2 == 0 ) {
*dst = *src / 2;
} else {
*dst = - ((*src+1) / 2);
}
```