pocketlang/src/utils.c
2021-02-07 13:17:58 +05:30

145 lines
3.5 KiB
C

/*
* Copyright (c) 2021 Thakee Nathees
* Licensed under: MIT License
*/
#include "utils.h"
// Rounds [n] up to the nearest power of two. An input that is already a power
// of two is returned unchanged.
//
// Works by copying the highest set bit of (n - 1) into every lower bit
// position, which produces a value of the form 2^k - 1, and then adding one.
// The shift distances (1, 2, 4, 8, 16) cover a 32 bit wide value.
int utilPowerOf2Ceil(int n) {
  int smeared = n - 1;
  for (int shift = 1; shift <= 16; shift *= 2) {
    smeared |= smeared >> shift;
  }
  return smeared + 1;
}
// Returns true if [c] is an underscore or an ASCII letter ('a'-'z', 'A'-'Z').
// Digits are not accepted here.
bool utilIsName(char c) {
  if (c == '_') return true;
  if (c >= 'A' && c <= 'Z') return true;
  return c >= 'a' && c <= 'z';
}
// Returns true if [c] is one of the ASCII decimal digits '0' through '9'.
bool utilIsDigit(char c) {
  if (c < '0') return false;
  return c <= '9';
}
/****************************************************************************
* UTF8 *
****************************************************************************/
// Helpers for writing a bit pattern as groups of binary digits. The arguments
// are token-pasted after a `0b` prefix, so B2(110, 00000) expands to
// 0b11000000. Splitting the digits into groups lets a mask mirror the layout
// of a UTF-8 byte (marker bits first, then payload bits).
// NOTE(review): `0b` binary literals are a compiler extension before C23
// (GCC and Clang accept them) -- confirm every target toolchain supports them.
#define B1(first) 0b##first
#define B2(first, last) 0b##first##last
#define B3(first, second, last) 0b##first##second##last
#define B4(first, second, third, last) 0b##first##second##third##last
// Returns the number of bytes needed to encode the code point [value] in
// UTF-8 (1 to 4), or 0 if [value] cannot be encoded because it is negative or
// greater than 0x10ffff.
int utf8_encodeBytesCount(int value) {
  // A negative value is not a code point. Without this guard it would pass
  // the `<= 0x7f` test below and be reported as a single byte character.
  if (value < 0) return 0;

  if (value <= 0x7f) return 1;
  if (value <= 0x7ff) return 2;
  if (value <= 0xffff) return 3;
  if (value <= 0x10ffff) return 4;

  // If we're here the value is beyond the last encodable code point.
  return 0;
}
// Returns the length in bytes of the UTF-8 sequence introduced by [byte],
// judged from its high bits alone:
//
//   0xxxxxxx -> 1
//   110xxxxx -> 2
//   1110xxxx -> 3
//   11110xxx -> 4
//
// A continuation byte (10xxxxxx) and any other byte that is not a valid
// leading byte are both reported as 1.
int utf8_decodeBytesCount(uint8_t byte) {
  if ((byte & 0x80) == 0x00) return 1;  // Single byte (ASCII).
  if ((byte & 0xe0) == 0xc0) return 2;
  if ((byte & 0xf0) == 0xe0) return 3;
  if ((byte & 0xf8) == 0xf0) return 4;

  // What remains is a continuation byte (10xxxxxx) or an invalid utf8 byte
  // (11111xxx); both count as a single byte.
  return 1;
}
// Encodes the code point [value] as UTF-8 into [bytes] and returns the number
// of bytes written (1 to 4). Returns 0 and writes nothing if [value] is
// negative or greater than 0x10ffff. [bytes] must have room for the encoded
// sequence (at most 4 bytes); no terminator is written. Values in the
// surrogate range are not rejected.
int utf8_encodeValue(int value, uint8_t* bytes) {
  // A negative value is not a code point. Without this guard it would pass
  // the `<= 0x7f` test below and emit a meaningless single byte.
  if (value < 0) return 0;

  // 1 byte character 0xxxxxxx -> the value itself.
  if (value <= 0x7f) {
    bytes[0] = (uint8_t)value;
    return 1;
  }

  // 2 byte character 110xxxxx 10xxxxxx -> last 6 bits go to the 2nd byte and
  // the remaining 5 bits go to the first byte.
  if (value <= 0x7ff) {
    bytes[0] = (uint8_t)(0xc0 | (value >> 6));
    bytes[1] = (uint8_t)(0x80 | (value & 0x3f));
    return 2;
  }

  // 3 byte character 1110xxxx 10xxxxxx 10xxxxxx -> from the end, 6 bits go
  // to the 3rd byte, the next 6 bits to the 2nd byte, and the remaining 4
  // bits to the first byte.
  if (value <= 0xffff) {
    bytes[0] = (uint8_t)(0xe0 | (value >> 12));
    bytes[1] = (uint8_t)(0x80 | ((value >> 6) & 0x3f));
    bytes[2] = (uint8_t)(0x80 | (value & 0x3f));
    return 3;
  }

  // 4 byte character 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx -> from the end, 6
  // bits each go to the 4th, 3rd and 2nd bytes, and the remaining 3 bits to
  // the first byte.
  if (value <= 0x10ffff) {
    bytes[0] = (uint8_t)(0xf0 | (value >> 18));
    bytes[1] = (uint8_t)(0x80 | ((value >> 12) & 0x3f));
    bytes[2] = (uint8_t)(0x80 | ((value >> 6) & 0x3f));
    bytes[3] = (uint8_t)(0x80 | (value & 0x3f));
    return 4;
  }

  // Beyond the last encodable code point.
  return 0;
}
// Decodes one UTF-8 sequence starting at [bytes], stores the decoded code
// point in [value] and returns the number of bytes consumed (1 to 4).
//
// Returns -1, leaving [value] untouched, if the first byte is not a valid
// leading byte or if one of the expected continuation bytes does not have the
// form 10xxxxxx. No further validation (overlong forms, code point range) is
// performed. The caller must make sure the bytes the leading byte announces
// are readable; decoding stops at the first byte that is not a continuation
// byte, which includes a NUL terminator.
int utf8_decodeBytes(uint8_t* bytes, int* value) {
  const uint8_t lead = bytes[0];
  int continue_bytes = 0;
  int byte_count = 1;
  int _value = 0;

  // 1 byte character 0xxxxxxx -> the byte is the code point. Without this
  // branch every ASCII byte would be rejected as an invalid leading byte.
  if ((lead & 0x80) == 0x00) {
    *value = lead;
    return byte_count;
  }

  // A continuation byte (10xxxxxx) found where a leading byte is expected is
  // handed back as is and counted as a single byte.
  if ((lead & 0xc0) == 0x80) {
    *value = lead;
    return byte_count;
  }

  if ((lead & 0xe0) == 0xc0) {         // 110xxxxx -> 5 payload bits.
    continue_bytes = 1;
    _value = lead & 0x1f;
  } else if ((lead & 0xf0) == 0xe0) {  // 1110xxxx -> 4 payload bits.
    continue_bytes = 2;
    _value = lead & 0x0f;
  } else if ((lead & 0xf8) == 0xf0) {  // 11110xxx -> 3 payload bits.
    continue_bytes = 3;
    _value = lead & 0x07;
  } else {
    // Invalid leading byte (11111xxx).
    return -1;
  }

  // Now add the 6 payload bits of each continuation byte to the _value.
  while (continue_bytes--) {
    bytes++, byte_count++;
    if ((*bytes & 0xc0) != 0x80) return -1;
    _value = (_value << 6) | (*bytes & 0x3f);
  }

  *value = _value;
  return byte_count;
}
// Drop the B1..B4 binary-literal helper macros defined for the UTF8 section
// so they are not visible past this point.
#undef B1
#undef B2
#undef B3
#undef B4