xref: /freebsd/contrib/libcbor/src/cbor/internal/unicode.c (revision 6ba2210ee039f2f12878c217bcf058e9c8b26b29)
1 /*
2  * Copyright (c) 2014-2020 Pavel Kalvoda <me@pavelkalvoda.com>
3  *
4  * libcbor is free software; you can redistribute it and/or modify
5  * it under the terms of the MIT license. See LICENSE for details.
6  */
7 
8 #include "unicode.h"
9 
/* Automaton state at a code point boundary; decoding starts in this state. */
#define UTF8_ACCEPT 0
/* Automaton state after a byte that cannot continue a valid sequence. */
#define UTF8_REJECT 1
12 
/*
 * Lookup table driving the UTF-8 decoding automaton.
 *
 * Entries 0..255 map an input byte to its character class (0..11).
 * Entries 256..399 hold the transitions: the successor of state `s` on
 * class `c` lives at index 256 + s * 16 + c, one 16-entry row per state.
 */
static const uint8_t utf8d[] = {
    /* byte -> character class */
    0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 00..0f */
    0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 10..1f */
    0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 20..2f */
    0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 30..3f */
    0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 40..4f */
    0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 50..5f */
    0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 60..6f */
    0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 70..7f */
    1,  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 80..8f */
    9,  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, /* 90..9f */
    7,  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, /* a0..af */
    7,  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, /* b0..bf */
    8,  8, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* c0..cf */
    2,  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* d0..df */
    10, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3, /* e0..ef */
    11, 6, 6, 6, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, /* f0..ff */

    /* (state, class) -> next state */
    0,  1, 2, 3, 5, 8, 7, 1, 1, 1, 4, 6, 1, 1, 1, 1, /* s0 */
    1,  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* s1 */
    1,  0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, /* s2 */
    1,  2, 1, 1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, /* s3 */
    1,  1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, /* s4 */
    1,  2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, /* s5 */
    1,  1, 1, 1, 1, 1, 1, 3, 1, 3, 1, 1, 1, 1, 1, 1, /* s6 */
    1,  3, 1, 1, 1, 1, 1, 3, 1, 3, 1, 1, 1, 1, 1, 1, /* s7 */
    1,  3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* s8 */
};
54 
55 /* Copyright of this function: (c) 2008-2009 Bjoern Hoehrmann
56  * <bjoern@hoehrmann.de> */
57 /* See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details. */
58 uint32_t _cbor_unicode_decode(uint32_t* state, uint32_t* codep, uint32_t byte) {
59   uint32_t type = utf8d[byte];
60 
61   *codep = (*state != UTF8_ACCEPT) ? (byte & 0x3fu) | (*codep << 6)
62                                    : (0xff >> type) & (byte);
63 
64   *state = utf8d[256 + *state * 16 + type];
65   return *state;
66 }
67 
68 size_t _cbor_unicode_codepoint_count(cbor_data source, size_t source_length,
69                                      struct _cbor_unicode_status* status) {
70   *status =
71       (struct _cbor_unicode_status){.location = 0, .status = _CBOR_UNICODE_OK};
72   uint32_t codepoint, state = UTF8_ACCEPT, res;
73   size_t pos = 0, count = 0;
74 
75   for (; pos < source_length; pos++) {
76     res = _cbor_unicode_decode(&state, &codepoint, source[pos]);
77 
78     if (res == UTF8_ACCEPT) {
79       count++;
80     } else if (res == UTF8_REJECT) {
81       goto error;
82     }
83   }
84 
85   /* Unfinished multibyte codepoint */
86   if (state != UTF8_ACCEPT) goto error;
87 
88   return count;
89 
90 error:
91   *status = (struct _cbor_unicode_status){.location = pos,
92                                           .status = _CBOR_UNICODE_BADCP};
93   return -1;
94 }
95