1 //===- llvm/Support/UnicodeNameToCodepoint.cpp - Unicode character properties 2 //-*- C++ -*-===// 3 // 4 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 5 // See https://llvm.org/LICENSE.txt for license information. 6 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 // This file implements functions to map the name or alias of a unicode 11 // character to its codepoint. 12 // 13 //===----------------------------------------------------------------------===// 14 15 #include "llvm/ADT/STLExtras.h" 16 #include "llvm/ADT/StringExtras.h" 17 #include "llvm/ADT/StringRef.h" 18 #include "llvm/Support/Unicode.h" 19 20 namespace llvm { 21 namespace sys { 22 namespace unicode { 23 24 extern const char *UnicodeNameToCodepointDict; 25 extern const uint8_t *UnicodeNameToCodepointIndex; 26 extern const std::size_t UnicodeNameToCodepointIndexSize; 27 extern const std::size_t UnicodeNameToCodepointLargestNameSize; 28 29 using BufferType = SmallString<64>; 30 31 struct Node { 32 bool IsRoot = false; 33 char32_t Value = 0xFFFFFFFF; 34 uint32_t ChildrenOffset = 0; 35 bool HasSibling = false; 36 uint32_t Size = 0; 37 StringRef Name; 38 const Node *Parent = nullptr; 39 40 constexpr bool isValid() const { 41 return !Name.empty() || Value == 0xFFFFFFFF; 42 } 43 constexpr bool hasChildren() const { return ChildrenOffset != 0 || IsRoot; } 44 45 std::string fullName() const { 46 std::string S; 47 // Reserve enough space for most unicode code points. 48 // The chosen value represent the 99th percentile of name size as of 49 // Unicode 15.0. 50 S.reserve(46); 51 const Node *N = this; 52 while (N) { 53 std::reverse_copy(N->Name.begin(), N->Name.end(), std::back_inserter(S)); 54 N = N->Parent; 55 } 56 std::reverse(S.begin(), S.end()); 57 return S; 58 } 59 }; 60 61 static Node createRoot() { 62 Node N; 63 N.IsRoot = true; 64 N.ChildrenOffset = 1; 65 N.Size = 1; 66 return N; 67 } 68 69 static Node readNode(uint32_t Offset, const Node *Parent = nullptr) { 70 if (Offset == 0) 71 return createRoot(); 72 73 uint32_t Origin = Offset; 74 Node N; 75 N.Parent = Parent; 76 uint8_t NameInfo = UnicodeNameToCodepointIndex[Offset++]; 77 if (Offset + 6 >= UnicodeNameToCodepointIndexSize) 78 return N; 79 80 bool LongName = NameInfo & 0x40; 81 bool HasValue = NameInfo & 0x80; 82 std::size_t Size = NameInfo & ~0xC0; 83 if (LongName) { 84 uint32_t NameOffset = (UnicodeNameToCodepointIndex[Offset++] << 8); 85 NameOffset |= UnicodeNameToCodepointIndex[Offset++]; 86 N.Name = StringRef(UnicodeNameToCodepointDict + NameOffset, Size); 87 } else { 88 N.Name = StringRef(UnicodeNameToCodepointDict + Size, 1); 89 } 90 if (HasValue) { 91 uint8_t H = UnicodeNameToCodepointIndex[Offset++]; 92 uint8_t M = UnicodeNameToCodepointIndex[Offset++]; 93 uint8_t L = UnicodeNameToCodepointIndex[Offset++]; 94 N.Value = ((H << 16) | (M << 8) | L) >> 3; 95 96 bool HasChildren = L & 0x02; 97 N.HasSibling = L & 0x01; 98 99 if (HasChildren) { 100 N.ChildrenOffset = UnicodeNameToCodepointIndex[Offset++] << 16; 101 N.ChildrenOffset |= UnicodeNameToCodepointIndex[Offset++] << 8; 102 N.ChildrenOffset |= UnicodeNameToCodepointIndex[Offset++]; 103 } 104 } else { 105 uint8_t H = UnicodeNameToCodepointIndex[Offset++]; 106 N.HasSibling = H & 0x80; 107 bool HasChildren = H & 0x40; 108 H &= uint8_t(~0xC0); 109 if (HasChildren) { 110 N.ChildrenOffset = (H << 16); 111 N.ChildrenOffset |= 112 (uint32_t(UnicodeNameToCodepointIndex[Offset++]) << 8); 113 N.ChildrenOffset |= UnicodeNameToCodepointIndex[Offset++]; 114 } 115 } 116 N.Size = Offset - Origin; 117 return N; 118 } 119 120 static bool startsWith(StringRef Name, StringRef Needle, bool Strict, 121 std::size_t &Consummed, char &PreviousCharInName, 122 char &PreviousCharInNeedle, bool IsPrefix = false) { 123 124 Consummed = 0; 125 if (Strict) { 126 if (!Name.startswith(Needle)) 127 return false; 128 Consummed = Needle.size(); 129 return true; 130 } 131 if (Needle.empty()) 132 return true; 133 134 auto NamePos = Name.begin(); 135 auto NeedlePos = Needle.begin(); 136 137 char PreviousCharInNameOrigin = PreviousCharInName; 138 char PreviousCharInNeedleOrigin = PreviousCharInNeedle; 139 140 auto IgnoreSpaces = [](auto It, auto End, char &PreviousChar, 141 bool IgnoreEnd = false) { 142 while (It != End) { 143 const auto Next = std::next(It); 144 // Ignore spaces, underscore, medial hyphens 145 // https://unicode.org/reports/tr44/#UAX44-LM2. 146 bool Ignore = 147 *It == ' ' || *It == '_' || 148 (*It == '-' && isAlnum(PreviousChar) && 149 ((Next != End && isAlnum(*Next)) || (Next == End && IgnoreEnd))); 150 PreviousChar = *It; 151 if (!Ignore) 152 break; 153 ++It; 154 } 155 return It; 156 }; 157 158 while (true) { 159 NamePos = IgnoreSpaces(NamePos, Name.end(), PreviousCharInName); 160 NeedlePos = 161 IgnoreSpaces(NeedlePos, Needle.end(), PreviousCharInNeedle, IsPrefix); 162 if (NeedlePos == Needle.end()) 163 break; 164 if (NamePos == Name.end()) 165 break; 166 if (toUpper(*NeedlePos) != toUpper(*NamePos)) 167 break; 168 NeedlePos++; 169 NamePos++; 170 } 171 Consummed = std::distance(Name.begin(), NamePos); 172 if (NeedlePos != Needle.end()) { 173 PreviousCharInName = PreviousCharInNameOrigin; 174 PreviousCharInNeedle = PreviousCharInNeedleOrigin; 175 } 176 return NeedlePos == Needle.end(); 177 } 178 179 static std::tuple<Node, bool, uint32_t> 180 compareNode(uint32_t Offset, StringRef Name, bool Strict, 181 char PreviousCharInName, char PreviousCharInNeedle, 182 BufferType &Buffer, const Node *Parent = nullptr) { 183 Node N = readNode(Offset, Parent); 184 std::size_t Consummed = 0; 185 bool DoesStartWith = 186 N.IsRoot || startsWith(Name, N.Name, Strict, Consummed, 187 PreviousCharInName, PreviousCharInNeedle); 188 if (!DoesStartWith) 189 return std::make_tuple(N, false, 0); 190 191 if (Name.size() - Consummed == 0 && N.Value != 0xFFFFFFFF) 192 return std::make_tuple(N, true, N.Value); 193 194 if (N.hasChildren()) { 195 uint32_t ChildOffset = N.ChildrenOffset; 196 for (;;) { 197 Node C; 198 bool Matches; 199 uint32_t Value; 200 std::tie(C, Matches, Value) = 201 compareNode(ChildOffset, Name.substr(Consummed), Strict, 202 PreviousCharInName, PreviousCharInNeedle, Buffer, &N); 203 if (Matches) { 204 std::reverse_copy(C.Name.begin(), C.Name.end(), 205 std::back_inserter(Buffer)); 206 return std::make_tuple(N, true, Value); 207 } 208 ChildOffset += C.Size; 209 if (!C.HasSibling) 210 break; 211 } 212 } 213 return std::make_tuple(N, false, 0); 214 } 215 216 static std::tuple<Node, bool, uint32_t> 217 compareNode(uint32_t Offset, StringRef Name, bool Strict, BufferType &Buffer) { 218 return compareNode(Offset, Name, Strict, 0, 0, Buffer); 219 } 220 221 // clang-format off 222 constexpr const char *const HangulSyllables[][3] = { 223 { "G", "A", "" }, 224 { "GG", "AE", "G" }, 225 { "N", "YA", "GG" }, 226 { "D", "YAE", "GS" }, 227 { "DD", "EO", "N", }, 228 { "R", "E", "NJ" }, 229 { "M", "YEO", "NH" }, 230 { "B", "YE", "D" }, 231 { "BB", "O", "L" }, 232 { "S", "WA", "LG" }, 233 { "SS", "WAE", "LM" }, 234 { "", "OE", "LB" }, 235 { "J", "YO", "LS" }, 236 { "JJ", "U", "LT" }, 237 { "C", "WEO", "LP" }, 238 { "K", "WE", "LH" }, 239 { "T", "WI", "M" }, 240 { "P", "YU", "B" }, 241 { "H", "EU", "BS" }, 242 { 0, "YI", "S" }, 243 { 0, "I", "SS" }, 244 { 0, 0, "NG" }, 245 { 0, 0, "J" }, 246 { 0, 0, "C" }, 247 { 0, 0, "K" }, 248 { 0, 0, "T" }, 249 { 0, 0, "P" }, 250 { 0, 0, "H" } 251 }; 252 // clang-format on 253 254 // Unicode 15.0 255 // 3.12 Conjoining Jamo Behavior Common constants 256 constexpr const char32_t SBase = 0xAC00; 257 constexpr const uint32_t LCount = 19; 258 constexpr const uint32_t VCount = 21; 259 constexpr const uint32_t TCount = 28; 260 261 static std::size_t findSyllable(StringRef Name, bool Strict, 262 char &PreviousInName, int &Pos, int Column) { 263 assert(Column == 0 || Column == 1 || Column == 2); 264 static std::size_t CountPerColumn[] = {LCount, VCount, TCount}; 265 char NeedleStart = 0; 266 int Len = -1; 267 int Prev = PreviousInName; 268 for (std::size_t I = 0; I < CountPerColumn[Column]; I++) { 269 StringRef Syllable(HangulSyllables[I][Column]); 270 if (int(Syllable.size()) <= Len) 271 continue; 272 std::size_t Consummed = 0; 273 char PreviousInNameCopy = PreviousInName; 274 bool DoesStartWith = startsWith(Name, Syllable, Strict, Consummed, 275 PreviousInNameCopy, NeedleStart); 276 if (!DoesStartWith) 277 continue; 278 Len = Consummed; 279 Pos = I; 280 Prev = PreviousInNameCopy; 281 } 282 if (Len == -1) 283 return 0; 284 PreviousInName = Prev; 285 return size_t(Len); 286 } 287 288 static std::optional<char32_t> 289 nameToHangulCodePoint(StringRef Name, bool Strict, BufferType &Buffer) { 290 Buffer.clear(); 291 // Hangul Syllable Decomposition 292 std::size_t Consummed = 0; 293 char NameStart = 0, NeedleStart = 0; 294 bool DoesStartWith = startsWith(Name, "HANGUL SYLLABLE ", Strict, Consummed, 295 NameStart, NeedleStart); 296 if (!DoesStartWith) 297 return std::nullopt; 298 Name = Name.substr(Consummed); 299 int L = -1, V = -1, T = -1; 300 Name = Name.substr(findSyllable(Name, Strict, NameStart, L, 0)); 301 Name = Name.substr(findSyllable(Name, Strict, NameStart, V, 1)); 302 Name = Name.substr(findSyllable(Name, Strict, NameStart, T, 2)); 303 if (L != -1 && V != -1 && T != -1 && Name.empty()) { 304 if (!Strict) { 305 Buffer.append("HANGUL SYLLABLE "); 306 if (L != -1) 307 Buffer.append(HangulSyllables[L][0]); 308 if (V != -1) 309 Buffer.append(HangulSyllables[V][1]); 310 if (T != -1) 311 Buffer.append(HangulSyllables[T][2]); 312 } 313 return SBase + (std::uint32_t(L) * VCount + std::uint32_t(V)) * TCount + 314 std::uint32_t(T); 315 } 316 // Otherwise, it's an illegal syllable name. 317 return std::nullopt; 318 } 319 320 struct GeneratedNamesData { 321 StringRef Prefix; 322 uint32_t Start; 323 uint32_t End; 324 }; 325 326 // Unicode 15.0 Table 4-8. Name Derivation Rule Prefix Strings 327 static const GeneratedNamesData GeneratedNamesDataTable[] = { 328 {"CJK UNIFIED IDEOGRAPH-", 0x3400, 0x4DBF}, 329 {"CJK UNIFIED IDEOGRAPH-", 0x4E00, 0x9FFF}, 330 {"CJK UNIFIED IDEOGRAPH-", 0x20000, 0x2A6DF}, 331 {"CJK UNIFIED IDEOGRAPH-", 0x2A700, 0x2B739}, 332 {"CJK UNIFIED IDEOGRAPH-", 0x2B740, 0x2B81D}, 333 {"CJK UNIFIED IDEOGRAPH-", 0x2B820, 0x2CEA1}, 334 {"CJK UNIFIED IDEOGRAPH-", 0x2CEB0, 0x2EBE0}, 335 {"CJK UNIFIED IDEOGRAPH-", 0x30000, 0x3134A}, 336 {"CJK UNIFIED IDEOGRAPH-", 0x31350, 0x323AF}, 337 {"TANGUT IDEOGRAPH-", 0x17000, 0x187F7}, 338 {"TANGUT IDEOGRAPH-", 0x18D00, 0x18D08}, 339 {"KHITAN SMALL SCRIPT CHARACTER-", 0x18B00, 0x18CD5}, 340 {"NUSHU CHARACTER-", 0x1B170, 0x1B2FB}, 341 {"CJK COMPATIBILITY IDEOGRAPH-", 0xF900, 0xFA6D}, 342 {"CJK COMPATIBILITY IDEOGRAPH-", 0xFA70, 0xFAD9}, 343 {"CJK COMPATIBILITY IDEOGRAPH-", 0x2F800, 0x2FA1D}, 344 }; 345 346 static std::optional<char32_t> 347 nameToGeneratedCodePoint(StringRef Name, bool Strict, BufferType &Buffer) { 348 for (auto &&Item : GeneratedNamesDataTable) { 349 Buffer.clear(); 350 std::size_t Consummed = 0; 351 char NameStart = 0, NeedleStart = 0; 352 bool DoesStartWith = startsWith(Name, Item.Prefix, Strict, Consummed, 353 NameStart, NeedleStart, /*isPrefix*/ true); 354 if (!DoesStartWith) 355 continue; 356 auto Number = Name.substr(Consummed); 357 unsigned long long V = 0; 358 // Be consistent about mandating upper casing. 359 if (Strict && 360 llvm::any_of(Number, [](char C) { return C >= 'a' && C <= 'f'; })) 361 return {}; 362 if (getAsUnsignedInteger(Number, 16, V) || V < Item.Start || V > Item.End) 363 continue; 364 if (!Strict) { 365 Buffer.append(Item.Prefix); 366 Buffer.append(utohexstr(V, true)); 367 } 368 return V; 369 } 370 return std::nullopt; 371 } 372 373 static std::optional<char32_t> nameToCodepoint(StringRef Name, bool Strict, 374 BufferType &Buffer) { 375 if (Name.empty()) 376 return std::nullopt; 377 378 std::optional<char32_t> Res = nameToHangulCodePoint(Name, Strict, Buffer); 379 if (!Res) 380 Res = nameToGeneratedCodePoint(Name, Strict, Buffer); 381 if (Res) 382 return *Res; 383 384 Buffer.clear(); 385 Node Node; 386 bool Matches; 387 uint32_t Value; 388 std::tie(Node, Matches, Value) = compareNode(0, Name, Strict, Buffer); 389 if (Matches) { 390 std::reverse(Buffer.begin(), Buffer.end()); 391 // UAX44-LM2. Ignore case, whitespace, underscore ('_'), and all medial 392 // hyphens except the hyphen in U+1180 HANGUL JUNGSEONG O-E. 393 if (!Strict && Value == 0x116c && 394 Name.find_insensitive("O-E") != StringRef::npos) { 395 Buffer = "HANGUL JUNGSEONG O-E"; 396 Value = 0x1180; 397 } 398 return Value; 399 } 400 return std::nullopt; 401 } 402 403 std::optional<char32_t> nameToCodepointStrict(StringRef Name) { 404 405 BufferType Buffer; 406 auto Opt = nameToCodepoint(Name, true, Buffer); 407 return Opt; 408 } 409 410 std::optional<LooseMatchingResult> 411 nameToCodepointLooseMatching(StringRef Name) { 412 BufferType Buffer; 413 auto Opt = nameToCodepoint(Name, false, Buffer); 414 if (!Opt) 415 return std::nullopt; 416 return LooseMatchingResult{*Opt, Buffer}; 417 } 418 419 // Find the unicode character whose editing distance to Pattern 420 // is shortest, using the Wagner–Fischer algorithm. 421 llvm::SmallVector<MatchForCodepointName> 422 nearestMatchesForCodepointName(StringRef Pattern, std::size_t MaxMatchesCount) { 423 // We maintain a fixed size vector of matches, 424 // sorted by distance 425 // The worst match (with the biggest distance) are discarded when new elements 426 // are added. 427 std::size_t LargestEditDistance = 0; 428 llvm::SmallVector<MatchForCodepointName> Matches; 429 Matches.reserve(MaxMatchesCount + 1); 430 431 auto Insert = [&](const Node &Node, uint32_t Distance, 432 char32_t Value) -> bool { 433 if (Distance > LargestEditDistance) { 434 if (Matches.size() == MaxMatchesCount) 435 return false; 436 LargestEditDistance = Distance; 437 } 438 // To avoid allocations, the creation of the name is delayed 439 // as much as possible. 440 std::string Name; 441 auto GetName = [&] { 442 if (Name.empty()) 443 Name = Node.fullName(); 444 return Name; 445 }; 446 447 auto It = llvm::lower_bound( 448 Matches, Distance, 449 [&](const MatchForCodepointName &a, std::size_t Distance) { 450 if (Distance == a.Distance) 451 return a.Name < GetName(); 452 return a.Distance < Distance; 453 }); 454 if (It == Matches.end() && Matches.size() == MaxMatchesCount) 455 return false; 456 457 MatchForCodepointName M{GetName(), Distance, Value}; 458 Matches.insert(It, std::move(M)); 459 if (Matches.size() > MaxMatchesCount) 460 Matches.pop_back(); 461 return true; 462 }; 463 464 // We ignore case, space, hyphens, etc, 465 // in both the search pattern and the prospective names. 466 auto Normalize = [](StringRef Name) { 467 std::string Out; 468 Out.reserve(Name.size()); 469 for (char C : Name) { 470 if (isAlnum(C)) 471 Out.push_back(toUpper(C)); 472 } 473 return Out; 474 }; 475 std::string NormalizedName = Normalize(Pattern); 476 477 // Allocate a matrix big enough for longest names. 478 const std::size_t Columns = 479 std::min(NormalizedName.size(), UnicodeNameToCodepointLargestNameSize) + 480 1; 481 482 LLVM_ATTRIBUTE_UNUSED static std::size_t Rows = 483 UnicodeNameToCodepointLargestNameSize + 1; 484 485 std::vector<char> Distances( 486 Columns * (UnicodeNameToCodepointLargestNameSize + 1), 0); 487 488 auto Get = [&Distances, Columns](size_t Column, std::size_t Row) -> char & { 489 assert(Column < Columns); 490 assert(Row < Rows); 491 return Distances[Row * Columns + Column]; 492 }; 493 494 for (std::size_t I = 0; I < Columns; I++) 495 Get(I, 0) = I; 496 497 // Visit the childrens, 498 // Filling (and overriding) the matrix for the name fragment of each node 499 // iteratively. CompleteName is used to collect the actual name of potential 500 // match, respecting case and spacing. 501 auto VisitNode = [&](const Node &N, std::size_t Row, 502 auto &VisitNode) -> void { 503 std::size_t J = 0; 504 for (; J < N.Name.size(); J++) { 505 if (!isAlnum(N.Name[J])) 506 continue; 507 508 Get(0, Row) = Row; 509 510 for (std::size_t I = 1; I < Columns; I++) { 511 const int Delete = Get(I - 1, Row) + 1; 512 const int Insert = Get(I, Row - 1) + 1; 513 514 const int Replace = 515 Get(I - 1, Row - 1) + (NormalizedName[I - 1] != N.Name[J] ? 1 : 0); 516 517 Get(I, Row) = std::min(Insert, std::min(Delete, Replace)); 518 } 519 520 Row++; 521 } 522 523 unsigned Cost = Get(Columns - 1, Row - 1); 524 if (N.Value != 0xFFFFFFFF) { 525 Insert(N, Cost, N.Value); 526 } 527 528 if (N.hasChildren()) { 529 auto ChildOffset = N.ChildrenOffset; 530 for (;;) { 531 Node C = readNode(ChildOffset, &N); 532 ChildOffset += C.Size; 533 if (!C.isValid()) 534 break; 535 VisitNode(C, Row, VisitNode); 536 if (!C.HasSibling) 537 break; 538 } 539 } 540 }; 541 542 Node Root = createRoot(); 543 VisitNode(Root, 1, VisitNode); 544 return Matches; 545 } 546 547 } // namespace unicode 548 549 } // namespace sys 550 } // namespace llvm 551