1 // -*- C++ -*-
2 //===----------------------------------------------------------------------===//
3 //
4 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
5 // See https://llvm.org/LICENSE.txt for license information.
6 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //
8 //===----------------------------------------------------------------------===//
9
10 // WARNING, this entire header is generated by
11 // utils/generate_indic_conjunct_break_table.py
12 // DO NOT MODIFY!
13
14 // UNICODE, INC. LICENSE AGREEMENT - DATA FILES AND SOFTWARE
15 //
16 // See Terms of Use <https://www.unicode.org/copyright.html>
17 // for definitions of Unicode Inc.'s Data Files and Software.
18 //
19 // NOTICE TO USER: Carefully read the following legal agreement.
20 // BY DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING UNICODE INC.'S
21 // DATA FILES ("DATA FILES"), AND/OR SOFTWARE ("SOFTWARE"),
22 // YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE
23 // TERMS AND CONDITIONS OF THIS AGREEMENT.
24 // IF YOU DO NOT AGREE, DO NOT DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE
25 // THE DATA FILES OR SOFTWARE.
26 //
27 // COPYRIGHT AND PERMISSION NOTICE
28 //
29 // Copyright (c) 1991-2022 Unicode, Inc. All rights reserved.
30 // Distributed under the Terms of Use in https://www.unicode.org/copyright.html.
31 //
32 // Permission is hereby granted, free of charge, to any person obtaining
33 // a copy of the Unicode data files and any associated documentation
34 // (the "Data Files") or Unicode software and any associated documentation
35 // (the "Software") to deal in the Data Files or Software
36 // without restriction, including without limitation the rights to use,
37 // copy, modify, merge, publish, distribute, and/or sell copies of
38 // the Data Files or Software, and to permit persons to whom the Data Files
39 // or Software are furnished to do so, provided that either
40 // (a) this copyright and permission notice appear with all copies
41 // of the Data Files or Software, or
42 // (b) this copyright and permission notice appear in associated
43 // Documentation.
44 //
45 // THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF
46 // ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
47 // WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
48 // NONINFRINGEMENT OF THIRD PARTY RIGHTS.
49 // IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS
50 // NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL
51 // DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
52 // DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
53 // TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
54 // PERFORMANCE OF THE DATA FILES OR SOFTWARE.
55 //
56 // Except as contained in this notice, the name of a copyright holder
57 // shall not be used in advertising or otherwise to promote the sale,
58 // use or other dealings in these Data Files or Software without prior
59 // written authorization of the copyright holder.
60
61 #ifndef _LIBCPP___FORMAT_INDIC_CONJUNCT_BREAK_TABLE_H
62 #define _LIBCPP___FORMAT_INDIC_CONJUNCT_BREAK_TABLE_H
63
64 #include <__algorithm/ranges_upper_bound.h>
65 #include <__config>
66 #include <__iterator/access.h>
67 #include <cstddef>
68 #include <cstdint>
69
70 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
71 # pragma GCC system_header
72 #endif
73
74 _LIBCPP_BEGIN_NAMESPACE_STD
75
76 #if _LIBCPP_STD_VER >= 20
77
78 namespace __indic_conjunct_break {
79
/// The Indic_Conjunct_Break property values a code point can have.
///
/// The numeric value of an enumerator is what the entries table stores in the
/// two least significant bits of each element, so the order of the
/// enumerators must match the generated data.
enum class __property : uint8_t {
  // Values generated from the data files.
  __Consonant,
  __Extend,
  __Linker,

  // The code point has none of above properties.
  __none
};
89
/// The entries of the indic conjunct break property table.
///
/// The data is generated from
/// - https://www.unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt
///
/// Every element packs 3 values
/// - bits [0, 1] The property. One of the values generated from the datafiles
///   of \ref __property
/// - bits [2, 10] The size of the range.
/// - bits [11, 31] The lower bound code point of the range. The upper bound of
///   the range is lower bound + size; both bounds are inclusive.
///
/// The elements are stored in ascending order; the lookup performs a binary
/// search on the packed values and depends on that order.
///
/// The 9 bits for the size allow a maximum range of 512 elements. Some ranges
/// in the Unicode tables are larger. They are stored in multiple consecutive
/// ranges in the data table. An alternative would be to store the sizes in a
/// separate 16-bit value. The original MSVC STL code had such an approach, but
/// this approach uses less space for the data and is about 4% faster in the
/// following benchmark.
/// libcxx/benchmarks/std_format_spec_string_unicode.bench.cpp
// clang-format off
_LIBCPP_HIDE_FROM_ABI inline constexpr uint32_t __entries[201] = {
    0x00180139,
    0x001a807d,
    0x00241811,
    0x002c88b1,
    0x002df801,
    0x002e0805,
    0x002e2005,
    0x002e3801,
    0x00308029,
    0x00325851,
    0x00338001,
    0x0036b019,
    0x0036f815,
    0x00373805,
    0x0037500d,
    0x00388801,
    0x00398069,
    0x003f5821,
    0x003fe801,
    0x0040b00d,
    0x0040d821,
    0x00412809,
    0x00414811,
    0x0042c809,
    0x0044c01d,
    0x0046505d,
    0x00471871,
    0x0048a890,
    0x0049e001,
    0x004a6802,
    0x004a880d,
    0x004ac01c,
    0x004bc01c,
    0x004ca84c,
    0x004d5018,
    0x004d9000,
    0x004db00c,
    0x004de001,
    0x004e6802,
    0x004ee004,
    0x004ef800,
    0x004f8004,
    0x004ff001,
    0x0051e001,
    0x0054a84c,
    0x00555018,
    0x00559004,
    0x0055a810,
    0x0055e001,
    0x00566802,
    0x0057c800,
    0x0058a84c,
    0x00595018,
    0x00599004,
    0x0059a810,
    0x0059e001,
    0x005a6802,
    0x005ae004,
    0x005af800,
    0x005b8800,
    0x0060a84c,
    0x0061503c,
    0x0061e001,
    0x00626802,
    0x0062a805,
    0x0062c008,
    0x0065e001,
    0x0068a894,
    0x0069d805,
    0x006a6802,
    0x0071c009,
    0x0072400d,
    0x0075c009,
    0x0076400d,
    0x0078c005,
    0x0079a801,
    0x0079b801,
    0x0079c801,
    0x007b8805,
    0x007ba001,
    0x007bd00d,
    0x007c0001,
    0x007c1009,
    0x007c3005,
    0x007e3001,
    0x0081b801,
    0x0081c805,
    0x00846801,
    0x009ae809,
    0x00b8a001,
    0x00be9001,
    0x00bee801,
    0x00c54801,
    0x00c9c809,
    0x00d0b805,
    0x00d30001,
    0x00d3a81d,
    0x00d3f801,
    0x00d58035,
    0x00d5f83d,
    0x00d9a001,
    0x00db5821,
    0x00dd5801,
    0x00df3001,
    0x00e1b801,
    0x00e68009,
    0x00e6a031,
    0x00e71019,
    0x00e76801,
    0x00e7a001,
    0x00e7c005,
    0x00ee00fd,
    0x01006801,
    0x01068031,
    0x01070801,
    0x0107282d,
    0x01677809,
    0x016bf801,
    0x016f007d,
    0x01815015,
    0x0184c805,
    0x05337801,
    0x0533a025,
    0x0534f005,
    0x05378005,
    0x05416001,
    0x05470045,
    0x05495809,
    0x054d9801,
    0x05558001,
    0x05559009,
    0x0555b805,
    0x0555f005,
    0x05560801,
    0x0557b001,
    0x055f6801,
    0x07d8f001,
    0x07f1003d,
    0x080fe801,
    0x08170001,
    0x081bb011,
    0x08506801,
    0x08507801,
    0x0851c009,
    0x0851f801,
    0x08572805,
    0x0869200d,
    0x08755805,
    0x0877e809,
    0x087a3029,
    0x087c100d,
    0x08838001,
    0x0883f801,
    0x0885d001,
    0x08880009,
    0x08899805,
    0x088b9801,
    0x088e5001,
    0x0891b001,
    0x08974805,
    0x0899d805,
    0x089b3019,
    0x089b8011,
    0x08a23001,
    0x08a2f001,
    0x08a61801,
    0x08ae0001,
    0x08b5b801,
    0x08b95801,
    0x08c1d001,
    0x08c9f001,
    0x08ca1801,
    0x08d1a001,
    0x08d23801,
    0x08d4c801,
    0x08ea1001,
    0x08ea2005,
    0x08ecb801,
    0x08fa1001,
    0x0b578011,
    0x0b598019,
    0x0de4f001,
    0x0e8b2801,
    0x0e8b3809,
    0x0e8b7011,
    0x0e8bd81d,
    0x0e8c2819,
    0x0e8d500d,
    0x0e921009,
    0x0f000019,
    0x0f004041,
    0x0f00d819,
    0x0f011805,
    0x0f013011,
    0x0f047801,
    0x0f098019,
    0x0f157001,
    0x0f17600d,
    0x0f27600d,
    0x0f468019,
    0x0f4a2019};
// clang-format on
313
314 /// Returns the indic conjuct break property of a code point.
__get_property(const char32_t __code_point)315 [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr __property __get_property(const char32_t __code_point) noexcept {
316 // The algorithm searches for the upper bound of the range and, when found,
317 // steps back one entry. This algorithm is used since the code point can be
318 // anywhere in the range. After a lower bound is found the next step is to
319 // compare whether the code unit is indeed in the range.
320 //
321 // Since the entry contains a code unit, size, and property the code point
322 // being sought needs to be adjusted. Just shifting the code point to the
323 // proper position doesn't work; suppose an entry has property 0, size 1,
324 // and lower bound 3. This results in the entry 0x1810.
325 // When searching for code point 3 it will search for 0x1800, find 0x1810
326 // and moves to the previous entry. Thus the lower bound value will never
327 // be found.
328 // The simple solution is to set the bits belonging to the property and
329 // size. Then the upper bound for code point 3 will return the entry after
330 // 0x1810. After moving to the previous entry the algorithm arrives at the
331 // correct entry.
332 ptrdiff_t __i = std::ranges::upper_bound(__entries, (__code_point << 11) | 0x7ffu) - __entries;
333 if (__i == 0)
334 return __property::__none;
335
336 --__i;
337 uint32_t __upper_bound = (__entries[__i] >> 11) + ((__entries[__i] >> 2) & 0b1'1111'1111);
338 if (__code_point <= __upper_bound)
339 return static_cast<__property>(__entries[__i] & 0b11);
340
341 return __property::__none;
342 }
343
344 } // namespace __indic_conjunct_break
345
346 #endif //_LIBCPP_STD_VER >= 20
347
348 _LIBCPP_END_NAMESPACE_STD
349
350 #endif // _LIBCPP___FORMAT_INDIC_CONJUNCT_BREAK_TABLE_H
351