110ff414cSEd Maste /* 210ff414cSEd Maste * Copyright (c) 2014-2020 Pavel Kalvoda <me@pavelkalvoda.com> 310ff414cSEd Maste * 410ff414cSEd Maste * libcbor is free software; you can redistribute it and/or modify 510ff414cSEd Maste * it under the terms of the MIT license. See LICENSE for details. 610ff414cSEd Maste */ 710ff414cSEd Maste 810ff414cSEd Maste #include "unicode.h" 95d3e7166SEd Maste #include <stdint.h> 1010ff414cSEd Maste 1110ff414cSEd Maste #define UTF8_ACCEPT 0 1210ff414cSEd Maste #define UTF8_REJECT 1 1310ff414cSEd Maste 1410ff414cSEd Maste static const uint8_t utf8d[] = { 1510ff414cSEd Maste 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1610ff414cSEd Maste 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1710ff414cSEd Maste 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 00..1f */ 1810ff414cSEd Maste 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1910ff414cSEd Maste 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2010ff414cSEd Maste 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 20..3f */ 2110ff414cSEd Maste 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2210ff414cSEd Maste 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2310ff414cSEd Maste 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 40..5f */ 2410ff414cSEd Maste 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2510ff414cSEd Maste 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2610ff414cSEd Maste 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 60..7f */ 2710ff414cSEd Maste 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2810ff414cSEd Maste 1, 1, 1, 1, 1, 9, 9, 9, 9, 9, 9, 2910ff414cSEd Maste 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, /* 80..9f */ 3010ff414cSEd Maste 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 3110ff414cSEd Maste 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 3210ff414cSEd Maste 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, /* a0..bf */ 3310ff414cSEd Maste 8, 8, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3410ff414cSEd Maste 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3510ff414cSEd Maste 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* c0..df */ 3610ff414cSEd Maste 0xa, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 3710ff414cSEd Maste 0x3, 0x3, 0x4, 0x3, 0x3, /* e0..ef */ 3810ff414cSEd Maste 0xb, 0x6, 0x6, 0x6, 0x5, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 3910ff414cSEd Maste 0x8, 0x8, 0x8, 0x8, 0x8, /* f0..ff */ 4010ff414cSEd Maste 0x0, 0x1, 0x2, 0x3, 0x5, 0x8, 0x7, 0x1, 0x1, 0x1, 0x4, 4110ff414cSEd Maste 0x6, 0x1, 0x1, 0x1, 0x1, /* s0..s0 */ 4210ff414cSEd Maste 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4310ff414cSEd Maste 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 4410ff414cSEd Maste 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, /* s1..s2 */ 4510ff414cSEd Maste 1, 2, 1, 1, 1, 1, 1, 2, 1, 2, 1, 4610ff414cSEd Maste 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4710ff414cSEd Maste 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, /* s3..s4 */ 4810ff414cSEd Maste 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 4910ff414cSEd Maste 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5010ff414cSEd Maste 1, 3, 1, 3, 1, 1, 1, 1, 1, 1, /* s5..s6 */ 5110ff414cSEd Maste 1, 3, 1, 1, 1, 1, 1, 3, 1, 3, 1, 5210ff414cSEd Maste 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 5310ff414cSEd Maste 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* s7..s8 */ 5410ff414cSEd Maste }; 5510ff414cSEd Maste 5610ff414cSEd Maste /* Copyright of this function: (c) 2008-2009 Bjoern Hoehrmann 5710ff414cSEd Maste * <bjoern@hoehrmann.de> */ 5810ff414cSEd Maste /* See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details. */ 5910ff414cSEd Maste uint32_t _cbor_unicode_decode(uint32_t* state, uint32_t* codep, uint32_t byte) { 6010ff414cSEd Maste uint32_t type = utf8d[byte]; 6110ff414cSEd Maste 6210ff414cSEd Maste *codep = (*state != UTF8_ACCEPT) ? (byte & 0x3fu) | (*codep << 6) 6310ff414cSEd Maste : (0xff >> type) & (byte); 6410ff414cSEd Maste 6510ff414cSEd Maste *state = utf8d[256 + *state * 16 + type]; 6610ff414cSEd Maste return *state; 6710ff414cSEd Maste } 6810ff414cSEd Maste 69*abd87254SEd Maste size_t _cbor_unicode_codepoint_count(cbor_data source, size_t source_length, 7010ff414cSEd Maste struct _cbor_unicode_status* status) { 7110ff414cSEd Maste *status = 7210ff414cSEd Maste (struct _cbor_unicode_status){.location = 0, .status = _CBOR_UNICODE_OK}; 7310ff414cSEd Maste uint32_t codepoint, state = UTF8_ACCEPT, res; 74*abd87254SEd Maste size_t pos = 0, count = 0; 7510ff414cSEd Maste 7610ff414cSEd Maste for (; pos < source_length; pos++) { 7710ff414cSEd Maste res = _cbor_unicode_decode(&state, &codepoint, source[pos]); 7810ff414cSEd Maste 7910ff414cSEd Maste if (res == UTF8_ACCEPT) { 8010ff414cSEd Maste count++; 8110ff414cSEd Maste } else if (res == UTF8_REJECT) { 8210ff414cSEd Maste goto error; 8310ff414cSEd Maste } 8410ff414cSEd Maste } 8510ff414cSEd Maste 8610ff414cSEd Maste /* Unfinished multibyte codepoint */ 8710ff414cSEd Maste if (state != UTF8_ACCEPT) goto error; 8810ff414cSEd Maste 8910ff414cSEd Maste return count; 9010ff414cSEd Maste 9110ff414cSEd Maste error: 9210ff414cSEd Maste *status = (struct _cbor_unicode_status){.location = pos, 9310ff414cSEd Maste .status = _CBOR_UNICODE_BADCP}; 945d3e7166SEd Maste return 0; 9510ff414cSEd Maste } 96