xref: /freebsd/contrib/llvm-project/libc/src/__support/wchar/character_converter.cpp (revision bb722a7d0f1642bff6487f943ad0427799a6e5bf)
1 //===-- Implementation of a class for conversion --------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #include "hdr/errno_macros.h"
10 #include "hdr/types/char32_t.h"
11 #include "hdr/types/char8_t.h"
12 #include "hdr/types/size_t.h"
13 #include "src/__support/CPP/bit.h"
14 #include "src/__support/common.h"
15 #include "src/__support/error_or.h"
16 #include "src/__support/math_extras.h"
17 #include "src/__support/wchar/mbstate.h"
18 
19 #include "character_converter.h"
20 
21 namespace LIBC_NAMESPACE_DECL {
22 namespace internal {
23 
24 // This is for utf-8 bytes other than the first byte
25 constexpr size_t ENCODED_BITS_PER_UTF8 = 6;
26 // The number of bits per utf-8 byte that actually encode character
27 // Information not metadata (# of bits excluding the byte headers)
28 constexpr uint32_t MASK_ENCODED_BITS =
29     mask_trailing_ones<uint32_t, ENCODED_BITS_PER_UTF8>();
30 // Maximum value for utf-32 for a utf-8 sequence of a given length
31 constexpr char32_t MAX_VALUE_PER_UTF8_LEN[] = {0x7f, 0x7ff, 0xffff, 0x10ffff};
32 constexpr int MAX_UTF8_LENGTH = 4;
33 
CharacterConverter(mbstate * mbstate)34 CharacterConverter::CharacterConverter(mbstate *mbstate) { state = mbstate; }
35 
clear()36 void CharacterConverter::clear() {
37   state->partial = 0;
38   state->bytes_stored = 0;
39   state->total_bytes = 0;
40 }
41 
isFull()42 bool CharacterConverter::isFull() {
43   return state->bytes_stored == state->total_bytes && state->total_bytes != 0;
44 }
45 
isEmpty()46 bool CharacterConverter::isEmpty() { return state->bytes_stored == 0; }
47 
isValidState()48 bool CharacterConverter::isValidState() {
49   if (state->total_bytes > MAX_UTF8_LENGTH)
50     return false;
51 
52   const char32_t max_utf32_value =
53       state->total_bytes == 0 ? 0
54                               : MAX_VALUE_PER_UTF8_LEN[state->total_bytes - 1];
55   return state->bytes_stored <= state->total_bytes &&
56          state->partial <= max_utf32_value;
57 }
58 
push(char8_t utf8_byte)59 int CharacterConverter::push(char8_t utf8_byte) {
60   uint8_t num_ones = static_cast<uint8_t>(cpp::countl_one(utf8_byte));
61   // Checking the first byte if first push
62   if (isEmpty()) {
63     // UTF-8 char has 1 byte total
64     if (num_ones == 0) {
65       state->total_bytes = 1;
66     }
67     // UTF-8 char has 2 through 4 bytes total
68     else if (num_ones >= 2 && num_ones <= 4) {
69       /* Since the format is 110xxxxx, 1110xxxx, and 11110xxx for 2, 3, and 4,
70       we will make the base mask with 7 ones and right shift it as necessary. */
71       constexpr size_t SIGNIFICANT_BITS = 7;
72       char8_t base_mask =
73           static_cast<char8_t>(mask_trailing_ones<uint8_t, SIGNIFICANT_BITS>());
74       state->total_bytes = num_ones;
75       utf8_byte &= (base_mask >> num_ones);
76     }
77     // Invalid first byte
78     else {
79       // bytes_stored and total_bytes will always be 0 here
80       state->partial = static_cast<char32_t>(0);
81       return EILSEQ;
82     }
83     state->partial = static_cast<char32_t>(utf8_byte);
84     state->bytes_stored++;
85     return 0;
86   }
87   // Any subsequent push
88   // Adding 6 more bits so need to left shift
89   if (num_ones == 1 && !isFull()) {
90     char32_t byte = utf8_byte & MASK_ENCODED_BITS;
91     state->partial = state->partial << ENCODED_BITS_PER_UTF8;
92     state->partial |= byte;
93     state->bytes_stored++;
94     return 0;
95   }
96 
97   // Invalid byte -> reset the state
98   clear();
99   return EILSEQ;
100 }
101 
push(char32_t utf32)102 int CharacterConverter::push(char32_t utf32) {
103   // we can't be partially through a conversion when pushing a utf32 value
104   if (!isEmpty())
105     return -1;
106 
107   state->partial = utf32;
108 
109   // determine number of utf-8 bytes needed to represent this utf32 value
110   for (uint8_t i = 0; i < MAX_UTF8_LENGTH; i++) {
111     if (state->partial <= MAX_VALUE_PER_UTF8_LEN[i]) {
112       state->total_bytes = i + 1;
113       state->bytes_stored = i + 1;
114       return 0;
115     }
116   }
117 
118   // `utf32` contains a value that is too large to actually represent a valid
119   // unicode character
120   clear();
121   return EILSEQ;
122 }
123 
pop_utf32()124 ErrorOr<char32_t> CharacterConverter::pop_utf32() {
125   // If pop is called too early, do not reset the state, use error to determine
126   // whether enough bytes have been pushed
127   if (!isFull())
128     return Error(-1);
129   char32_t utf32 = state->partial;
130   // reset if successful pop
131   clear();
132   return utf32;
133 }
134 
sizeAsUTF32()135 size_t CharacterConverter::sizeAsUTF32() {
136   return 1; // a single utf-32 value can fit an entire character
137 }
138 
sizeAsUTF8()139 size_t CharacterConverter::sizeAsUTF8() { return state->total_bytes; }
140 
pop_utf8()141 ErrorOr<char8_t> CharacterConverter::pop_utf8() {
142   if (isEmpty())
143     return Error(-1);
144 
145   constexpr char8_t FIRST_BYTE_HEADERS[] = {0, 0xC0, 0xE0, 0xF0};
146   constexpr char8_t CONTINUING_BYTE_HEADER = 0x80;
147 
148   char32_t output;
149 
150   // Shift to get the next 6 bits from the utf32 encoding
151   const size_t shift_amount = (state->bytes_stored - 1) * ENCODED_BITS_PER_UTF8;
152   if (isFull()) {
153     /*
154       Choose the correct set of most significant bits to encode the length
155       of the utf8 sequence. The remaining bits contain the most significant
156       bits of the unicode value of the character.
157     */
158     output = FIRST_BYTE_HEADERS[state->total_bytes - 1] |
159              (state->partial >> shift_amount);
160   } else {
161     // Get the next 6 bits and format it like so: 10xxxxxx
162     output = CONTINUING_BYTE_HEADER |
163              ((state->partial >> shift_amount) & MASK_ENCODED_BITS);
164   }
165 
166   state->bytes_stored--;
167   if (state->bytes_stored == 0)
168     clear();
169 
170   return static_cast<char8_t>(output);
171 }
172 
173 } // namespace internal
174 } // namespace LIBC_NAMESPACE_DECL
175