xref: /freebsd/contrib/llvm-project/libcxx/include/__format/indic_conjunct_break_table.h (revision 0fca6ea1d4eea4c934cfff25ac9ee8ad6fe95583)
1 // -*- C++ -*-
2 //===----------------------------------------------------------------------===//
3 //
4 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
5 // See https://llvm.org/LICENSE.txt for license information.
6 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //
8 //===----------------------------------------------------------------------===//
9 
10 // WARNING, this entire header is generated by
11 // utils/generate_indic_conjunct_break_table.py
12 // DO NOT MODIFY!
13 
14 // UNICODE, INC. LICENSE AGREEMENT - DATA FILES AND SOFTWARE
15 //
16 // See Terms of Use <https://www.unicode.org/copyright.html>
17 // for definitions of Unicode Inc.'s Data Files and Software.
18 //
19 // NOTICE TO USER: Carefully read the following legal agreement.
20 // BY DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING UNICODE INC.'S
21 // DATA FILES ("DATA FILES"), AND/OR SOFTWARE ("SOFTWARE"),
22 // YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE
23 // TERMS AND CONDITIONS OF THIS AGREEMENT.
24 // IF YOU DO NOT AGREE, DO NOT DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE
25 // THE DATA FILES OR SOFTWARE.
26 //
27 // COPYRIGHT AND PERMISSION NOTICE
28 //
29 // Copyright (c) 1991-2022 Unicode, Inc. All rights reserved.
30 // Distributed under the Terms of Use in https://www.unicode.org/copyright.html.
31 //
32 // Permission is hereby granted, free of charge, to any person obtaining
33 // a copy of the Unicode data files and any associated documentation
34 // (the "Data Files") or Unicode software and any associated documentation
35 // (the "Software") to deal in the Data Files or Software
36 // without restriction, including without limitation the rights to use,
37 // copy, modify, merge, publish, distribute, and/or sell copies of
38 // the Data Files or Software, and to permit persons to whom the Data Files
39 // or Software are furnished to do so, provided that either
40 // (a) this copyright and permission notice appear with all copies
41 // of the Data Files or Software, or
42 // (b) this copyright and permission notice appear in associated
43 // Documentation.
44 //
45 // THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF
46 // ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
47 // WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
48 // NONINFRINGEMENT OF THIRD PARTY RIGHTS.
49 // IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS
50 // NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL
51 // DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
52 // DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
53 // TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
54 // PERFORMANCE OF THE DATA FILES OR SOFTWARE.
55 //
56 // Except as contained in this notice, the name of a copyright holder
57 // shall not be used in advertising or otherwise to promote the sale,
58 // use or other dealings in these Data Files or Software without prior
59 // written authorization of the copyright holder.
60 
61 #ifndef _LIBCPP___FORMAT_INDIC_CONJUNCT_BREAK_TABLE_H
62 #define _LIBCPP___FORMAT_INDIC_CONJUNCT_BREAK_TABLE_H
63 
64 #include <__algorithm/ranges_upper_bound.h>
65 #include <__config>
66 #include <__iterator/access.h>
67 #include <cstddef>
68 #include <cstdint>
69 
70 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
71 #  pragma GCC system_header
72 #endif
73 
74 _LIBCPP_BEGIN_NAMESPACE_STD
75 
76 #if _LIBCPP_STD_VER >= 20
77 
78 namespace __indic_conjunct_break {
79 
/// The Indic conjunct break property values a code point can have.
///
/// The numeric value of every enumerator is significant: it is the value
/// stored in the property bits of the entries of the lookup table, so the
/// values must not be changed independently of the generated data.
enum class __property : uint8_t {
  // Values generated from the data files.
  __Consonant = 0,
  __Extend = 1,
  __Linker = 2,

  // The code point has none of the above properties.
  __none = 3
};
89 
90 /// The entries of the indic conjunct break property table.
91 ///
92 /// The data is generated from
93 /// -  https://www.unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt
94 ///
95 /// The data has 3 values
96 /// - bits [0, 1] The property. One of the values generated from the datafiles
97 ///   of \ref __property
98 /// - bits [2, 10] The size of the range.
99 /// - bits [11, 31] The lower bound code point of the range. The upper bound of
100 ///   the range is lower bound + size.
///
/// For example, the first entry 0x00180139 decodes to lower bound 0x300
/// (0x00180139 >> 11), size 0x4e ((0x00180139 >> 2) & 0x1ff) and property 1
/// (0x00180139 & 0x3, the second enumerator of \ref __property). It therefore
/// describes the inclusive code point range [0x300, 0x34e].
///
/// The entries are listed in ascending order. \ref __get_property searches
/// the array with std::ranges::upper_bound on the packed values, so this
/// ordering has to be preserved.
101 ///
102 /// The 9 bits for the size allow a maximum range of 512 elements. Some ranges
103 /// in the Unicode tables are larger. They are stored in multiple consecutive
104 /// ranges in the data table. An alternative would be to store the sizes in a
105 /// separate 16-bit value. The original MSVC STL code had such an approach, but
106 /// this approach uses less space for the data and is about 4% faster in the
107 /// following benchmark.
108 /// libcxx/benchmarks/std_format_spec_string_unicode.bench.cpp
109 // clang-format off
110 _LIBCPP_HIDE_FROM_ABI inline constexpr uint32_t __entries[201] = {
111     0x00180139,
112     0x001a807d,
113     0x00241811,
114     0x002c88b1,
115     0x002df801,
116     0x002e0805,
117     0x002e2005,
118     0x002e3801,
119     0x00308029,
120     0x00325851,
121     0x00338001,
122     0x0036b019,
123     0x0036f815,
124     0x00373805,
125     0x0037500d,
126     0x00388801,
127     0x00398069,
128     0x003f5821,
129     0x003fe801,
130     0x0040b00d,
131     0x0040d821,
132     0x00412809,
133     0x00414811,
134     0x0042c809,
135     0x0044c01d,
136     0x0046505d,
137     0x00471871,
138     0x0048a890,
139     0x0049e001,
140     0x004a6802,
141     0x004a880d,
142     0x004ac01c,
143     0x004bc01c,
144     0x004ca84c,
145     0x004d5018,
146     0x004d9000,
147     0x004db00c,
148     0x004de001,
149     0x004e6802,
150     0x004ee004,
151     0x004ef800,
152     0x004f8004,
153     0x004ff001,
154     0x0051e001,
155     0x0054a84c,
156     0x00555018,
157     0x00559004,
158     0x0055a810,
159     0x0055e001,
160     0x00566802,
161     0x0057c800,
162     0x0058a84c,
163     0x00595018,
164     0x00599004,
165     0x0059a810,
166     0x0059e001,
167     0x005a6802,
168     0x005ae004,
169     0x005af800,
170     0x005b8800,
171     0x0060a84c,
172     0x0061503c,
173     0x0061e001,
174     0x00626802,
175     0x0062a805,
176     0x0062c008,
177     0x0065e001,
178     0x0068a894,
179     0x0069d805,
180     0x006a6802,
181     0x0071c009,
182     0x0072400d,
183     0x0075c009,
184     0x0076400d,
185     0x0078c005,
186     0x0079a801,
187     0x0079b801,
188     0x0079c801,
189     0x007b8805,
190     0x007ba001,
191     0x007bd00d,
192     0x007c0001,
193     0x007c1009,
194     0x007c3005,
195     0x007e3001,
196     0x0081b801,
197     0x0081c805,
198     0x00846801,
199     0x009ae809,
200     0x00b8a001,
201     0x00be9001,
202     0x00bee801,
203     0x00c54801,
204     0x00c9c809,
205     0x00d0b805,
206     0x00d30001,
207     0x00d3a81d,
208     0x00d3f801,
209     0x00d58035,
210     0x00d5f83d,
211     0x00d9a001,
212     0x00db5821,
213     0x00dd5801,
214     0x00df3001,
215     0x00e1b801,
216     0x00e68009,
217     0x00e6a031,
218     0x00e71019,
219     0x00e76801,
220     0x00e7a001,
221     0x00e7c005,
222     0x00ee00fd,
223     0x01006801,
224     0x01068031,
225     0x01070801,
226     0x0107282d,
227     0x01677809,
228     0x016bf801,
229     0x016f007d,
230     0x01815015,
231     0x0184c805,
232     0x05337801,
233     0x0533a025,
234     0x0534f005,
235     0x05378005,
236     0x05416001,
237     0x05470045,
238     0x05495809,
239     0x054d9801,
240     0x05558001,
241     0x05559009,
242     0x0555b805,
243     0x0555f005,
244     0x05560801,
245     0x0557b001,
246     0x055f6801,
247     0x07d8f001,
248     0x07f1003d,
249     0x080fe801,
250     0x08170001,
251     0x081bb011,
252     0x08506801,
253     0x08507801,
254     0x0851c009,
255     0x0851f801,
256     0x08572805,
257     0x0869200d,
258     0x08755805,
259     0x0877e809,
260     0x087a3029,
261     0x087c100d,
262     0x08838001,
263     0x0883f801,
264     0x0885d001,
265     0x08880009,
266     0x08899805,
267     0x088b9801,
268     0x088e5001,
269     0x0891b001,
270     0x08974805,
271     0x0899d805,
272     0x089b3019,
273     0x089b8011,
274     0x08a23001,
275     0x08a2f001,
276     0x08a61801,
277     0x08ae0001,
278     0x08b5b801,
279     0x08b95801,
280     0x08c1d001,
281     0x08c9f001,
282     0x08ca1801,
283     0x08d1a001,
284     0x08d23801,
285     0x08d4c801,
286     0x08ea1001,
287     0x08ea2005,
288     0x08ecb801,
289     0x08fa1001,
290     0x0b578011,
291     0x0b598019,
292     0x0de4f001,
293     0x0e8b2801,
294     0x0e8b3809,
295     0x0e8b7011,
296     0x0e8bd81d,
297     0x0e8c2819,
298     0x0e8d500d,
299     0x0e921009,
300     0x0f000019,
301     0x0f004041,
302     0x0f00d819,
303     0x0f011805,
304     0x0f013011,
305     0x0f047801,
306     0x0f098019,
307     0x0f157001,
308     0x0f17600d,
309     0x0f27600d,
310     0x0f468019,
311     0x0f4a2019};
312 // clang-format on
313 
314 /// Returns the indic conjuct break property of a code point.
__get_property(const char32_t __code_point)315 [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr __property __get_property(const char32_t __code_point) noexcept {
316   // The algorithm searches for the upper bound of the range and, when found,
317   // steps back one entry. This algorithm is used since the code point can be
318   // anywhere in the range. After a lower bound is found the next step is to
319   // compare whether the code unit is indeed in the range.
320   //
321   // Since the entry contains a code unit, size, and property the code point
322   // being sought needs to be adjusted. Just shifting the code point to the
323   // proper position doesn't work; suppose an entry has property 0, size 1,
324   // and lower bound 3. This results in the entry 0x1810.
325   // When searching for code point 3 it will search for 0x1800, find 0x1810
326   // and moves to the previous entry. Thus the lower bound value will never
327   // be found.
328   // The simple solution is to set the bits belonging to the property and
329   // size. Then the upper bound for code point 3 will return the entry after
330   // 0x1810. After moving to the previous entry the algorithm arrives at the
331   // correct entry.
332   ptrdiff_t __i = std::ranges::upper_bound(__entries, (__code_point << 11) | 0x7ffu) - __entries;
333   if (__i == 0)
334     return __property::__none;
335 
336   --__i;
337   uint32_t __upper_bound = (__entries[__i] >> 11) + ((__entries[__i] >> 2) & 0b1'1111'1111);
338   if (__code_point <= __upper_bound)
339     return static_cast<__property>(__entries[__i] & 0b11);
340 
341   return __property::__none;
342 }
343 
344 } // namespace __indic_conjunct_break
345 
346 #endif //_LIBCPP_STD_VER >= 20
347 
348 _LIBCPP_END_NAMESPACE_STD
349 
350 #endif // _LIBCPP___FORMAT_INDIC_CONJUNCT_BREAK_TABLE_H
351