From d0118b56b1a647f066cbc58a43d038a72a0a0ccd Mon Sep 17 00:00:00 2001
From: Michael Sippel <micha@fragmental.art>
Date: Sat, 15 Mar 2025 18:49:06 +0100
Subject: [PATCH] add morphisms

- unicode
- value delimited seq
- zigzag encoding (still lacking any way to get to signed integer yet'..)
---
 morphisms/unicode.morphism-base     | 115 +++++++++++++++++++++++++++++
 morphisms/value_delim.morphism-base |  87 ++++++++++++++++++++++
 morphisms/zigzag.morphism-base      |  33 ++++++++
 3 files changed, 235 insertions(+)
 create mode 100644 morphisms/unicode.morphism-base
 create mode 100644 morphisms/value_delim.morphism-base
 create mode 100644 morphisms/zigzag.morphism-base

diff --git a/morphisms/unicode.morphism-base b/morphisms/unicode.morphism-base
new file mode 100644
index 0000000..8805ace
--- /dev/null
+++ b/morphisms/unicode.morphism-base
@@ -0,0 +1,115 @@
+```
+```
+
+morph_string_as_ascii_to_utf8 ()
+    <Seq ~ <ValueTerminated '\0'> Char~Ascii~x86.UInt8>
+--> <Seq Char~Unicode>
+    ~ UTF-8
+    ~ <Seq~<ValueTerminated 0> x86.UInt8>
+```
+    // ASCII is a subset of UTF-8: copy the bytes up to the terminator.
+    while( *src ) { *dst++ = *src++; }
+    *dst = 0;
+    return 0;
+```
+
+morph_string_as_utf8_to_ascii ()
+    <Seq Char~Unicode>
+    ~ UTF-8
+    ~ <Seq~<ValueTerminated 0> x86.UInt8>
+--> <Seq ~ <ValueTerminated '\0'> Char~Ascii~x86.UInt8>
+```
+    // Bytes below 128 are copied; every run of bytes >= 128 is skipped
+    // and reported once on stderr.
+    while( *src ) {
+        if( *src < 128 ) {
+            *dst++ = *src++;
+        } else {
+            fprintf(stderr, "(morph UTF-8 to Ascii) ignore multi-byte character\n");
+            while( *++src >= 128 );
+        }
+    }
+    *dst = 0;
+    return 0;
+```
+
+morph_string_as_ascii_to_utf32 ()
+    <Seq ~ <ValueTerminated '\0'> Char~Ascii~x86.UInt8>
+--> <Seq Char~Unicode>
+    ~ UTF-32LE
+    ~ <Seq~<ValueTerminated 0> x86.UInt32>
+```
+    // Every source element is stored into one dst element.
+    while( *src ) { *dst++ = *src++; }
+    *dst = 0;
+    return 0;
+```
+
+morph_string_as_utf8_to_utf32 ()
+
+    <Seq Char~Unicode>
+    ~ UTF-8
+    ~ <Seq~<ValueTerminated 0> x86.UInt8>
+
+--> <Seq Char~Unicode>
+    ~ UTF-32LE
+    ~ <Seq~<ValueTerminated 0> x86.UInt32>
+
+```
+    // Decodes by looking at the two top bits of every byte:
+    //   0b00 / 0b01  single-byte character
+    //   0b11         lead byte of a multi-byte sequence
+    //   0b10         continuation byte carrying six payload bits
+    // The code point being assembled is kept in `val` and written to
+    // dst once the next character starts or the input ends.
+    // NOTE: malformed input (stray continuation bytes, truncated or
+    // overlong sequences) is not rejected.
+    bool has_multibyte = false;
+    uint32_t val = 0;
+    while( *src ) {
+        uint8_t tag = (*src >> 6)&0b11;
+        switch( tag ) {
+            // single byte
+            case 0b00:
+            case 0b01:
+                if( has_multibyte ) {
+                    *dst++ = val;
+                    has_multibyte = false;
+                }
+                *dst++ = *src++;
+                break;
+
+            // start multibyte
+            case 0b11:
+                if( has_multibyte ) {
+                    *dst++ = val;
+                }
+                has_multibyte = true;
+                // The number of payload bits in the lead byte depends
+                // on the sequence length; the length marker bits must
+                // not leak into the code point.
+                if( (*src & 0b11100000) == 0b11000000 ) {
+                    val = (*src++) & 0b00011111;   // 110xxxxx, 2 bytes
+                } else if( (*src & 0b11110000) == 0b11100000 ) {
+                    val = (*src++) & 0b00001111;   // 1110xxxx, 3 bytes
+                } else {
+                    val = (*src++) & 0b00000111;   // 11110xxx, 4 bytes
+                }
+                break;
+
+            // continue multibyte
+            case 0b10:
+                val <<= 6;
+                val |= (*src++) & 0b111111;
+                break;
+        }
+    }
+
+    if( has_multibyte ) {
+        *dst++ = val;
+    }
+
+    *dst++ = 0;
+
+    return 0;
+```
diff --git a/morphisms/value_delim.morphism-base b/morphisms/value_delim.morphism-base
new file mode 100644
index 0000000..1870ff7
--- /dev/null
+++ b/morphisms/value_delim.morphism-base
@@ -0,0 +1,87 @@
+```
+#include <array/length-prefix.h>
+#include <stdlib.h>
+#include <string.h>
+```
+
+morph_seqseq_valsep_uint8 (T: Type, SrcDelim: T, DstDelim: T)
+    < Seq <Seq T> >
+    ~ < ValueSep SrcDelim T >
+    ~ < Seq~<LengthPrefix x86.UInt64> T >
+
+--> < Seq <Seq T> >
+    ~ < ValueSep DstDelim T >
+    ~ < Seq~<LengthPrefix x86.UInt64> T >
+```
+    // Copies src to dst and replaces every SrcDelim by DstDelim.
+    // A DstDelim byte occurring as payload in src would read as a
+    // separator in dst, so a newline payload byte is written as a
+    // backslash followed by 'n' instead.
+    // NOTE(review): for any other DstDelim such a payload byte is
+    // dropped without replacement.
+    length_prefix_uint8_array_clear( dst );
+
+    for( uint64_t i = 0; i < src->len; ++i ) {
+        if( src->items[i] == SrcDelim ) {
+            length_prefix_uint8_array_push( dst, DstDelim );
+        } else if( src->items[i] == DstDelim ) {
+            if( DstDelim == '\n' ) {
+                length_prefix_uint8_array_push( dst, '\\' );
+                length_prefix_uint8_array_push( dst, 'n' );
+            }
+        } else {
+            length_prefix_uint8_array_push( dst, src->items[i] );
+        }
+    }
+
+    return 0;
+```
+
+
+morph_seqseq_as_valsep_to_lenpfx (T: Type, Delim: T, EscKey: T)
+    < Seq <Seq T> >
+    ~ < ValueSep T Delim >
+    ~ < Seq~<LengthPrefix x86.UInt64> T >
+
+--> < Seq~<LengthPrefix x86.UInt64>
+        <Seq~<LengthPrefix x86.UInt64> T >
+        ~ <RefMut < Seq~<LengthPrefix x86.UInt64> T>>
+        ~ x86.Address
+        ~ x86.UInt64
+    >
+```
+    // Splits src at every Delim and pushes the address of one
+    // length-prefixed copy of each item onto dst.
+    // The copies are obtained from malloc() and are not freed here.
+    // Returns 0 on success and -1 when an allocation fails.
+    // NOTE(review): EscKey is not used by this body.
+    length_prefix_uint64_array_clear( dst );
+
+    uint8_t const * start = &src->items[0];
+    uint8_t const * cur = start;
+    uint8_t const * end = &src->items[src->len];
+
+    while( cur < end ) {
+        if( *cur == Delim || cur+1 == end ) {
+            // The final item has no trailing delimiter, so the byte
+            // at `cur` still belongs to it.
+            uint8_t const * item_end = ( *cur == Delim ) ? cur : cur+1;
+            uint64_t len = item_end - start;
+
+            struct LengthPrefixUInt8Array * cur_item =
+                malloc( sizeof(uint64_t) + sizeof(uint8_t) * len );
+            if( cur_item == NULL ) {
+                return -1;
+            }
+            cur_item->len = len;
+            memcpy( cur_item->items, start, len );
+
+            length_prefix_uint64_array_push( dst, (uint64_t)cur_item );
+            start = ++cur;
+        } else {
+            cur++;
+        }
+    }
+
+    return 0;
+```
diff --git a/morphisms/zigzag.morphism-base b/morphisms/zigzag.morphism-base
new file mode 100644
index 0000000..0d54033
--- /dev/null
+++ b/morphisms/zigzag.morphism-base
@@ -0,0 +1,33 @@
+```
+```
+
+morph_i64_as_twos_complement_to_zigzag ()
+    ℤ ~ x86.Int64
+--> ℤ ~ ZigZagInt ~ ℕ ~ x86.UInt64
+```
+    // ZigZag maps 0, -1, 1, -2, 2, ... to 0, 1, 2, 3, 4, ...
+    if( *src >= 0 ) {
+        *dst = 2 * (uint64_t)*src;
+    } else {
+        // 2*|x| - 1, written as 2*(-(x+1)) + 1 so that the negation
+        // cannot overflow for INT64_MIN.
+        *dst = 2 * (uint64_t)(-(*src + 1)) + 1;
+    }
+
+    return 0;
+```
+
+morph_i64_as_zigzag_to_twos_complement ()
+    ℤ ~ ZigZagInt ~ ℕ ~ x86.UInt64
+--> ℤ ~ x86.Int64
+```
+    if( *src % 2 == 0 ) {
+        *dst = *src / 2;
+    } else {
+        // -(src+1)/2, written without `*src+1` so that UINT64_MAX
+        // decodes to INT64_MIN instead of wrapping around to 0.
+        *dst = -(int64_t)(*src / 2) - 1;
+    }
+
+    return 0;
+```