1 //===- llvm/Support/UnicodeNameToCodepoint.cpp - Unicode character properties 2 //-*- C++ -*-===// 3 // 4 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 5 // See https://llvm.org/LICENSE.txt for license information. 6 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 // This file implements functions to map the name or alias of a unicode 11 // character to its codepoint. 12 // 13 //===----------------------------------------------------------------------===// 14 15 #include "llvm/ADT/STLExtras.h" 16 #include "llvm/ADT/StringExtras.h" 17 #include "llvm/ADT/StringRef.h" 18 #include "llvm/Support/Unicode.h" 19 20 namespace llvm { 21 namespace sys { 22 namespace unicode { 23 24 extern const char *UnicodeNameToCodepointDict; 25 extern const uint8_t *UnicodeNameToCodepointIndex; 26 extern const std::size_t UnicodeNameToCodepointIndexSize; 27 extern const std::size_t UnicodeNameToCodepointLargestNameSize; 28 29 using BufferType = SmallString<64>; 30 31 struct Node { 32 bool IsRoot = false; 33 char32_t Value = 0xFFFFFFFF; 34 uint32_t ChildrenOffset = 0; 35 bool HasSibling = false; 36 uint32_t Size = 0; 37 StringRef Name; 38 const Node *Parent = nullptr; 39 40 constexpr bool isValid() const { 41 return !Name.empty() || Value == 0xFFFFFFFF; 42 } 43 constexpr bool hasChildren() const { return ChildrenOffset != 0 || IsRoot; } 44 45 std::string fullName() const { 46 std::string S; 47 // Reserve enough space for most unicode code points. 48 // The chosen value represent the 99th percentile of name size as of 49 // Unicode 15.0. 50 S.reserve(46); 51 const Node *N = this; 52 while (N) { 53 std::reverse_copy(N->Name.begin(), N->Name.end(), std::back_inserter(S)); 54 N = N->Parent; 55 } 56 std::reverse(S.begin(), S.end()); 57 return S; 58 } 59 }; 60 61 static Node createRoot() { 62 Node N; 63 N.IsRoot = true; 64 N.ChildrenOffset = 1; 65 N.Size = 1; 66 return N; 67 } 68 69 static Node readNode(uint32_t Offset, const Node *Parent = nullptr) { 70 if (Offset == 0) 71 return createRoot(); 72 73 uint32_t Origin = Offset; 74 Node N; 75 N.Parent = Parent; 76 uint8_t NameInfo = UnicodeNameToCodepointIndex[Offset++]; 77 if (Offset + 6 >= UnicodeNameToCodepointIndexSize) 78 return N; 79 80 bool LongName = NameInfo & 0x40; 81 bool HasValue = NameInfo & 0x80; 82 std::size_t Size = NameInfo & ~0xC0; 83 if (LongName) { 84 uint32_t NameOffset = (UnicodeNameToCodepointIndex[Offset++] << 8); 85 NameOffset |= UnicodeNameToCodepointIndex[Offset++]; 86 N.Name = StringRef(UnicodeNameToCodepointDict + NameOffset, Size); 87 } else { 88 N.Name = StringRef(UnicodeNameToCodepointDict + Size, 1); 89 } 90 if (HasValue) { 91 uint8_t H = UnicodeNameToCodepointIndex[Offset++]; 92 uint8_t M = UnicodeNameToCodepointIndex[Offset++]; 93 uint8_t L = UnicodeNameToCodepointIndex[Offset++]; 94 N.Value = ((H << 16) | (M << 8) | L) >> 3; 95 96 bool HasChildren = L & 0x02; 97 N.HasSibling = L & 0x01; 98 99 if (HasChildren) { 100 N.ChildrenOffset = UnicodeNameToCodepointIndex[Offset++] << 16; 101 N.ChildrenOffset |= UnicodeNameToCodepointIndex[Offset++] << 8; 102 N.ChildrenOffset |= UnicodeNameToCodepointIndex[Offset++]; 103 } 104 } else { 105 uint8_t H = UnicodeNameToCodepointIndex[Offset++]; 106 N.HasSibling = H & 0x80; 107 bool HasChildren = H & 0x40; 108 H &= uint8_t(~0xC0); 109 if (HasChildren) { 110 N.ChildrenOffset = (H << 16); 111 N.ChildrenOffset |= 112 (uint32_t(UnicodeNameToCodepointIndex[Offset++]) << 8); 113 N.ChildrenOffset |= UnicodeNameToCodepointIndex[Offset++]; 114 } 115 } 116 N.Size = Offset - Origin; 117 return N; 118 } 119 120 static bool startsWith(StringRef Name, StringRef Needle, bool Strict, 121 std::size_t &Consummed, char &PreviousCharInName, 122 bool IsPrefix = false) { 123 124 Consummed = 0; 125 if (Strict) { 126 if (!Name.starts_with(Needle)) 127 return false; 128 Consummed = Needle.size(); 129 return true; 130 } 131 if (Needle.empty()) 132 return true; 133 134 auto NamePos = Name.begin(); 135 auto NeedlePos = Needle.begin(); 136 137 char PreviousCharInNameOrigin = PreviousCharInName; 138 char PreviousCharInNeedle = *Needle.begin(); 139 auto IgnoreSpaces = [](auto It, auto End, char &PreviousChar, 140 bool IsPrefix = false) { 141 while (It != End) { 142 const auto Next = std::next(It); 143 // Ignore spaces, underscore, medial hyphens 144 // The generator ensures a needle never ends (or starts) by a medial 145 // hyphen https://unicode.org/reports/tr44/#UAX44-LM2. 146 bool Ignore = 147 *It == ' ' || *It == '_' || 148 (*It == '-' && isAlnum(PreviousChar) && 149 ((Next != End && isAlnum(*Next)) || (Next == End && IsPrefix))); 150 PreviousChar = *It; 151 if (!Ignore) 152 break; 153 ++It; 154 } 155 return It; 156 }; 157 158 while (true) { 159 NamePos = IgnoreSpaces(NamePos, Name.end(), PreviousCharInName); 160 NeedlePos = 161 IgnoreSpaces(NeedlePos, Needle.end(), PreviousCharInNeedle, IsPrefix); 162 if (NeedlePos == Needle.end()) 163 break; 164 if (NamePos == Name.end()) 165 break; 166 if (toUpper(*NeedlePos) != toUpper(*NamePos)) 167 break; 168 NeedlePos++; 169 NamePos++; 170 } 171 Consummed = std::distance(Name.begin(), NamePos); 172 if (NeedlePos != Needle.end()) { 173 PreviousCharInName = PreviousCharInNameOrigin; 174 } 175 return NeedlePos == Needle.end(); 176 } 177 178 static std::tuple<Node, bool, uint32_t> 179 compareNode(uint32_t Offset, StringRef Name, bool Strict, 180 char PreviousCharInName, BufferType &Buffer, 181 const Node *Parent = nullptr) { 182 Node N = readNode(Offset, Parent); 183 std::size_t Consummed = 0; 184 bool DoesStartWith = N.IsRoot || startsWith(Name, N.Name, Strict, Consummed, 185 PreviousCharInName); 186 if (!DoesStartWith) 187 return std::make_tuple(N, false, 0); 188 189 if (Name.size() - Consummed == 0 && N.Value != 0xFFFFFFFF) 190 return std::make_tuple(N, true, N.Value); 191 192 if (N.hasChildren()) { 193 uint32_t ChildOffset = N.ChildrenOffset; 194 for (;;) { 195 Node C; 196 bool Matches; 197 uint32_t Value; 198 std::tie(C, Matches, Value) = 199 compareNode(ChildOffset, Name.substr(Consummed), Strict, 200 PreviousCharInName, Buffer, &N); 201 if (Matches) { 202 std::reverse_copy(C.Name.begin(), C.Name.end(), 203 std::back_inserter(Buffer)); 204 return std::make_tuple(N, true, Value); 205 } 206 ChildOffset += C.Size; 207 if (!C.HasSibling) 208 break; 209 } 210 } 211 return std::make_tuple(N, false, 0); 212 } 213 214 static std::tuple<Node, bool, uint32_t> 215 compareNode(uint32_t Offset, StringRef Name, bool Strict, BufferType &Buffer) { 216 return compareNode(Offset, Name, Strict, 0, Buffer); 217 } 218 219 // clang-format off 220 constexpr const char *const HangulSyllables[][3] = { 221 { "G", "A", "" }, 222 { "GG", "AE", "G" }, 223 { "N", "YA", "GG" }, 224 { "D", "YAE", "GS" }, 225 { "DD", "EO", "N", }, 226 { "R", "E", "NJ" }, 227 { "M", "YEO", "NH" }, 228 { "B", "YE", "D" }, 229 { "BB", "O", "L" }, 230 { "S", "WA", "LG" }, 231 { "SS", "WAE", "LM" }, 232 { "", "OE", "LB" }, 233 { "J", "YO", "LS" }, 234 { "JJ", "U", "LT" }, 235 { "C", "WEO", "LP" }, 236 { "K", "WE", "LH" }, 237 { "T", "WI", "M" }, 238 { "P", "YU", "B" }, 239 { "H", "EU", "BS" }, 240 { 0, "YI", "S" }, 241 { 0, "I", "SS" }, 242 { 0, 0, "NG" }, 243 { 0, 0, "J" }, 244 { 0, 0, "C" }, 245 { 0, 0, "K" }, 246 { 0, 0, "T" }, 247 { 0, 0, "P" }, 248 { 0, 0, "H" } 249 }; 250 // clang-format on 251 252 // Unicode 15.0 253 // 3.12 Conjoining Jamo Behavior Common constants 254 constexpr const char32_t SBase = 0xAC00; 255 constexpr const uint32_t LCount = 19; 256 constexpr const uint32_t VCount = 21; 257 constexpr const uint32_t TCount = 28; 258 259 static std::size_t findSyllable(StringRef Name, bool Strict, 260 char &PreviousInName, int &Pos, int Column) { 261 assert(Column == 0 || Column == 1 || Column == 2); 262 static std::size_t CountPerColumn[] = {LCount, VCount, TCount}; 263 int Len = -1; 264 int Prev = PreviousInName; 265 for (std::size_t I = 0; I < CountPerColumn[Column]; I++) { 266 StringRef Syllable(HangulSyllables[I][Column]); 267 if (int(Syllable.size()) <= Len) 268 continue; 269 std::size_t Consummed = 0; 270 char PreviousInNameCopy = PreviousInName; 271 bool DoesStartWith = 272 startsWith(Name, Syllable, Strict, Consummed, PreviousInNameCopy); 273 if (!DoesStartWith) 274 continue; 275 Len = Consummed; 276 Pos = I; 277 Prev = PreviousInNameCopy; 278 } 279 if (Len == -1) 280 return 0; 281 PreviousInName = Prev; 282 return size_t(Len); 283 } 284 285 static std::optional<char32_t> 286 nameToHangulCodePoint(StringRef Name, bool Strict, BufferType &Buffer) { 287 Buffer.clear(); 288 // Hangul Syllable Decomposition 289 std::size_t Consummed = 0; 290 char NameStart = 0; 291 bool DoesStartWith = 292 startsWith(Name, "HANGUL SYLLABLE ", Strict, Consummed, NameStart); 293 if (!DoesStartWith) 294 return std::nullopt; 295 Name = Name.substr(Consummed); 296 int L = -1, V = -1, T = -1; 297 Name = Name.substr(findSyllable(Name, Strict, NameStart, L, 0)); 298 Name = Name.substr(findSyllable(Name, Strict, NameStart, V, 1)); 299 Name = Name.substr(findSyllable(Name, Strict, NameStart, T, 2)); 300 if (L != -1 && V != -1 && T != -1 && Name.empty()) { 301 if (!Strict) { 302 Buffer.append("HANGUL SYLLABLE "); 303 if (L != -1) 304 Buffer.append(HangulSyllables[L][0]); 305 if (V != -1) 306 Buffer.append(HangulSyllables[V][1]); 307 if (T != -1) 308 Buffer.append(HangulSyllables[T][2]); 309 } 310 return SBase + (std::uint32_t(L) * VCount + std::uint32_t(V)) * TCount + 311 std::uint32_t(T); 312 } 313 // Otherwise, it's an illegal syllable name. 314 return std::nullopt; 315 } 316 317 struct GeneratedNamesData { 318 StringRef Prefix; 319 uint32_t Start; 320 uint32_t End; 321 }; 322 323 // Unicode 15.1 Table 4-8. Name Derivation Rule Prefix Strings 324 static const GeneratedNamesData GeneratedNamesDataTable[] = { 325 {"CJK UNIFIED IDEOGRAPH-", 0x3400, 0x4DBF}, 326 {"CJK UNIFIED IDEOGRAPH-", 0x4E00, 0x9FFF}, 327 {"CJK UNIFIED IDEOGRAPH-", 0x20000, 0x2A6DF}, 328 {"CJK UNIFIED IDEOGRAPH-", 0x2A700, 0x2B739}, 329 {"CJK UNIFIED IDEOGRAPH-", 0x2B740, 0x2B81D}, 330 {"CJK UNIFIED IDEOGRAPH-", 0x2B820, 0x2CEA1}, 331 {"CJK UNIFIED IDEOGRAPH-", 0x2CEB0, 0x2EBE0}, 332 {"CJK UNIFIED IDEOGRAPH-", 0x2EBF0, 0x2EE5D}, 333 {"CJK UNIFIED IDEOGRAPH-", 0x30000, 0x3134A}, 334 {"CJK UNIFIED IDEOGRAPH-", 0x31350, 0x323AF}, 335 {"TANGUT IDEOGRAPH-", 0x17000, 0x187F7}, 336 {"TANGUT IDEOGRAPH-", 0x18D00, 0x18D08}, 337 {"KHITAN SMALL SCRIPT CHARACTER-", 0x18B00, 0x18CD5}, 338 {"NUSHU CHARACTER-", 0x1B170, 0x1B2FB}, 339 {"CJK COMPATIBILITY IDEOGRAPH-", 0xF900, 0xFA6D}, 340 {"CJK COMPATIBILITY IDEOGRAPH-", 0xFA70, 0xFAD9}, 341 {"CJK COMPATIBILITY IDEOGRAPH-", 0x2F800, 0x2FA1D}, 342 }; 343 344 static std::optional<char32_t> 345 nameToGeneratedCodePoint(StringRef Name, bool Strict, BufferType &Buffer) { 346 for (auto &&Item : GeneratedNamesDataTable) { 347 Buffer.clear(); 348 std::size_t Consummed = 0; 349 char NameStart = 0; 350 bool DoesStartWith = startsWith(Name, Item.Prefix, Strict, Consummed, 351 NameStart, /*IsPrefix=*/true); 352 if (!DoesStartWith) 353 continue; 354 auto Number = Name.substr(Consummed); 355 unsigned long long V = 0; 356 // Be consistent about mandating upper casing. 357 if (Strict && 358 llvm::any_of(Number, [](char C) { return C >= 'a' && C <= 'f'; })) 359 return {}; 360 if (getAsUnsignedInteger(Number, 16, V) || V < Item.Start || V > Item.End) 361 continue; 362 if (!Strict) { 363 Buffer.append(Item.Prefix); 364 Buffer.append(utohexstr(V, true)); 365 } 366 return V; 367 } 368 return std::nullopt; 369 } 370 371 static std::optional<char32_t> nameToCodepoint(StringRef Name, bool Strict, 372 BufferType &Buffer) { 373 if (Name.empty()) 374 return std::nullopt; 375 376 std::optional<char32_t> Res = nameToHangulCodePoint(Name, Strict, Buffer); 377 if (!Res) 378 Res = nameToGeneratedCodePoint(Name, Strict, Buffer); 379 if (Res) 380 return *Res; 381 382 Buffer.clear(); 383 Node Node; 384 bool Matches; 385 uint32_t Value; 386 std::tie(Node, Matches, Value) = compareNode(0, Name, Strict, Buffer); 387 if (Matches) { 388 std::reverse(Buffer.begin(), Buffer.end()); 389 // UAX44-LM2. Ignore case, whitespace, underscore ('_'), and all medial 390 // hyphens except the hyphen in U+1180 HANGUL JUNGSEONG O-E. 391 if (!Strict && Value == 0x116c && Name.contains_insensitive("O-E")) { 392 Buffer = "HANGUL JUNGSEONG O-E"; 393 Value = 0x1180; 394 } 395 return Value; 396 } 397 return std::nullopt; 398 } 399 400 std::optional<char32_t> nameToCodepointStrict(StringRef Name) { 401 402 BufferType Buffer; 403 auto Opt = nameToCodepoint(Name, true, Buffer); 404 return Opt; 405 } 406 407 std::optional<LooseMatchingResult> 408 nameToCodepointLooseMatching(StringRef Name) { 409 BufferType Buffer; 410 auto Opt = nameToCodepoint(Name, false, Buffer); 411 if (!Opt) 412 return std::nullopt; 413 return LooseMatchingResult{*Opt, Buffer}; 414 } 415 416 // Find the unicode character whose editing distance to Pattern 417 // is shortest, using the Wagner–Fischer algorithm. 418 llvm::SmallVector<MatchForCodepointName> 419 nearestMatchesForCodepointName(StringRef Pattern, std::size_t MaxMatchesCount) { 420 // We maintain a fixed size vector of matches, 421 // sorted by distance 422 // The worst match (with the biggest distance) are discarded when new elements 423 // are added. 424 std::size_t LargestEditDistance = 0; 425 llvm::SmallVector<MatchForCodepointName> Matches; 426 Matches.reserve(MaxMatchesCount + 1); 427 428 auto Insert = [&](const Node &Node, uint32_t Distance, 429 char32_t Value) -> bool { 430 if (Distance > LargestEditDistance) { 431 if (Matches.size() == MaxMatchesCount) 432 return false; 433 LargestEditDistance = Distance; 434 } 435 // To avoid allocations, the creation of the name is delayed 436 // as much as possible. 437 std::string Name; 438 auto GetName = [&] { 439 if (Name.empty()) 440 Name = Node.fullName(); 441 return Name; 442 }; 443 444 auto It = llvm::lower_bound( 445 Matches, Distance, 446 [&](const MatchForCodepointName &a, std::size_t Distance) { 447 if (Distance == a.Distance) 448 return a.Name < GetName(); 449 return a.Distance < Distance; 450 }); 451 if (It == Matches.end() && Matches.size() == MaxMatchesCount) 452 return false; 453 454 MatchForCodepointName M{GetName(), Distance, Value}; 455 Matches.insert(It, std::move(M)); 456 if (Matches.size() > MaxMatchesCount) 457 Matches.pop_back(); 458 return true; 459 }; 460 461 // We ignore case, space, hyphens, etc, 462 // in both the search pattern and the prospective names. 463 auto Normalize = [](StringRef Name) { 464 std::string Out; 465 Out.reserve(Name.size()); 466 for (char C : Name) { 467 if (isAlnum(C)) 468 Out.push_back(toUpper(C)); 469 } 470 return Out; 471 }; 472 std::string NormalizedName = Normalize(Pattern); 473 474 // Allocate a matrix big enough for longest names. 475 const std::size_t Columns = 476 std::min(NormalizedName.size(), UnicodeNameToCodepointLargestNameSize) + 477 1; 478 479 LLVM_ATTRIBUTE_UNUSED static std::size_t Rows = 480 UnicodeNameToCodepointLargestNameSize + 1; 481 482 std::vector<char> Distances( 483 Columns * (UnicodeNameToCodepointLargestNameSize + 1), 0); 484 485 auto Get = [&Distances, Columns](size_t Column, std::size_t Row) -> char & { 486 assert(Column < Columns); 487 assert(Row < Rows); 488 return Distances[Row * Columns + Column]; 489 }; 490 491 for (std::size_t I = 0; I < Columns; I++) 492 Get(I, 0) = I; 493 494 // Visit the childrens, 495 // Filling (and overriding) the matrix for the name fragment of each node 496 // iteratively. CompleteName is used to collect the actual name of potential 497 // match, respecting case and spacing. 498 auto VisitNode = [&](const Node &N, std::size_t Row, 499 auto &VisitNode) -> void { 500 std::size_t J = 0; 501 for (; J < N.Name.size(); J++) { 502 if (!isAlnum(N.Name[J])) 503 continue; 504 505 Get(0, Row) = Row; 506 507 for (std::size_t I = 1; I < Columns; I++) { 508 const int Delete = Get(I - 1, Row) + 1; 509 const int Insert = Get(I, Row - 1) + 1; 510 511 const int Replace = 512 Get(I - 1, Row - 1) + (NormalizedName[I - 1] != N.Name[J] ? 1 : 0); 513 514 Get(I, Row) = std::min(Insert, std::min(Delete, Replace)); 515 } 516 517 Row++; 518 } 519 520 unsigned Cost = Get(Columns - 1, Row - 1); 521 if (N.Value != 0xFFFFFFFF) { 522 Insert(N, Cost, N.Value); 523 } 524 525 if (N.hasChildren()) { 526 auto ChildOffset = N.ChildrenOffset; 527 for (;;) { 528 Node C = readNode(ChildOffset, &N); 529 ChildOffset += C.Size; 530 if (!C.isValid()) 531 break; 532 VisitNode(C, Row, VisitNode); 533 if (!C.HasSibling) 534 break; 535 } 536 } 537 }; 538 539 Node Root = createRoot(); 540 VisitNode(Root, 1, VisitNode); 541 return Matches; 542 } 543 544 } // namespace unicode 545 546 } // namespace sys 547 } // namespace llvm 548