1 //===- llvm/Support/UnicodeNameToCodepoint.cpp - Unicode character properties 2 //-*- C++ -*-===// 3 // 4 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 5 // See https://llvm.org/LICENSE.txt for license information. 6 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 // This file implements functions to map the name or alias of a unicode 11 // character to its codepoint. 12 // 13 //===----------------------------------------------------------------------===// 14 15 #include "llvm/ADT/STLExtras.h" 16 #include "llvm/ADT/StringExtras.h" 17 #include "llvm/ADT/StringRef.h" 18 #include "llvm/Support/Unicode.h" 19 20 namespace llvm { 21 namespace sys { 22 namespace unicode { 23 24 extern const char *UnicodeNameToCodepointDict; 25 extern const uint8_t *UnicodeNameToCodepointIndex; 26 extern const std::size_t UnicodeNameToCodepointIndexSize; 27 extern const std::size_t UnicodeNameToCodepointLargestNameSize; 28 29 using BufferType = SmallString<64>; 30 31 struct Node { 32 bool IsRoot = false; 33 char32_t Value = 0xFFFFFFFF; 34 uint32_t ChildrenOffset = 0; 35 bool HasSibling = false; 36 uint32_t Size = 0; 37 StringRef Name; 38 const Node *Parent = nullptr; 39 40 constexpr bool isValid() const { 41 return !Name.empty() || Value == 0xFFFFFFFF; 42 } 43 constexpr bool hasChildren() const { return ChildrenOffset != 0 || IsRoot; } 44 45 std::string fullName() const { 46 std::string S; 47 // Reserve enough space for most unicode code points. 48 // The chosen value represent the 99th percentile of name size as of 49 // Unicode 14. 50 S.reserve(46); 51 const Node *N = this; 52 while (N) { 53 std::reverse_copy(N->Name.begin(), N->Name.end(), std::back_inserter(S)); 54 N = N->Parent; 55 } 56 std::reverse(S.begin(), S.end()); 57 return S; 58 } 59 }; 60 61 static Node createRoot() { 62 Node N; 63 N.IsRoot = true; 64 N.ChildrenOffset = 1; 65 N.Size = 1; 66 return N; 67 } 68 69 static Node readNode(uint32_t Offset, const Node *Parent = nullptr) { 70 if (Offset == 0) 71 return createRoot(); 72 73 uint32_t Origin = Offset; 74 Node N; 75 N.Parent = Parent; 76 uint8_t NameInfo = UnicodeNameToCodepointIndex[Offset++]; 77 if (Offset + 6 >= UnicodeNameToCodepointIndexSize) 78 return N; 79 80 bool LongName = NameInfo & 0x40; 81 bool HasValue = NameInfo & 0x80; 82 std::size_t Size = NameInfo & ~0xC0; 83 if (LongName) { 84 uint32_t NameOffset = (UnicodeNameToCodepointIndex[Offset++] << 8); 85 NameOffset |= UnicodeNameToCodepointIndex[Offset++]; 86 N.Name = StringRef(UnicodeNameToCodepointDict + NameOffset, Size); 87 } else { 88 N.Name = StringRef(UnicodeNameToCodepointDict + Size, 1); 89 } 90 if (HasValue) { 91 uint8_t H = UnicodeNameToCodepointIndex[Offset++]; 92 uint8_t M = UnicodeNameToCodepointIndex[Offset++]; 93 uint8_t L = UnicodeNameToCodepointIndex[Offset++]; 94 N.Value = ((H << 16) | (M << 8) | L) >> 3; 95 96 bool HasChildren = L & 0x02; 97 N.HasSibling = L & 0x01; 98 99 if (HasChildren) { 100 N.ChildrenOffset = UnicodeNameToCodepointIndex[Offset++] << 16; 101 N.ChildrenOffset |= UnicodeNameToCodepointIndex[Offset++] << 8; 102 N.ChildrenOffset |= UnicodeNameToCodepointIndex[Offset++]; 103 } 104 } else { 105 uint8_t H = UnicodeNameToCodepointIndex[Offset++]; 106 N.HasSibling = H & 0x80; 107 bool HasChildren = H & 0x40; 108 H &= ~0xC0; 109 if (HasChildren) { 110 N.ChildrenOffset = (H << 16); 111 N.ChildrenOffset |= 112 (uint32_t(UnicodeNameToCodepointIndex[Offset++]) << 8); 113 N.ChildrenOffset |= UnicodeNameToCodepointIndex[Offset++]; 114 } 115 } 116 N.Size = Offset - Origin; 117 return N; 118 } 119 120 static bool startsWith(StringRef Name, StringRef Needle, bool Strict, 121 std::size_t &Consummed, char &PreviousCharInName, 122 char &PreviousCharInNeedle, bool IsPrefix = false) { 123 124 Consummed = 0; 125 if (Strict) { 126 if (!Name.startswith(Needle)) 127 return false; 128 Consummed = Needle.size(); 129 return true; 130 } 131 if (Needle.empty()) 132 return true; 133 134 auto NamePos = Name.begin(); 135 auto NeedlePos = Needle.begin(); 136 137 char PreviousCharInNameOrigin = PreviousCharInName; 138 char PreviousCharInNeedleOrigin = PreviousCharInNeedle; 139 140 auto IgnoreSpaces = [](auto It, auto End, char &PreviousChar, 141 bool IgnoreEnd = false) { 142 while (It != End) { 143 const auto Next = std::next(It); 144 // Ignore spaces, underscore, medial hyphens 145 // https://unicode.org/reports/tr44/#UAX44-LM2. 146 bool Ignore = 147 *It == ' ' || *It == '_' || 148 (*It == '-' && isAlnum(PreviousChar) && 149 ((Next != End && isAlnum(*Next)) || (Next == End && IgnoreEnd))); 150 PreviousChar = *It; 151 if (!Ignore) 152 break; 153 ++It; 154 } 155 return It; 156 }; 157 158 while (true) { 159 NamePos = IgnoreSpaces(NamePos, Name.end(), PreviousCharInName); 160 NeedlePos = 161 IgnoreSpaces(NeedlePos, Needle.end(), PreviousCharInNeedle, IsPrefix); 162 if (NeedlePos == Needle.end()) 163 break; 164 if (NamePos == Name.end()) 165 break; 166 if (toUpper(*NeedlePos) != toUpper(*NamePos)) 167 break; 168 NeedlePos++; 169 NamePos++; 170 } 171 Consummed = std::distance(Name.begin(), NamePos); 172 if (NeedlePos != Needle.end()) { 173 PreviousCharInName = PreviousCharInNameOrigin; 174 PreviousCharInNeedle = PreviousCharInNeedleOrigin; 175 } 176 return NeedlePos == Needle.end(); 177 } 178 179 static std::tuple<Node, bool, uint32_t> 180 compareNode(uint32_t Offset, StringRef Name, bool Strict, 181 char PreviousCharInName, char PreviousCharInNeedle, 182 BufferType &Buffer, const Node *Parent = nullptr) { 183 Node N = readNode(Offset, Parent); 184 std::size_t Consummed = 0; 185 bool DoesStartWith = 186 N.IsRoot || startsWith(Name, N.Name, Strict, Consummed, 187 PreviousCharInName, PreviousCharInNeedle); 188 if (!DoesStartWith) 189 return std::make_tuple(N, false, 0); 190 191 if (Name.size() - Consummed == 0 && N.Value != 0xFFFFFFFF) 192 return std::make_tuple(N, true, N.Value); 193 194 if (N.hasChildren()) { 195 uint32_t ChildOffset = N.ChildrenOffset; 196 for (;;) { 197 Node C; 198 bool Matches; 199 uint32_t Value; 200 std::tie(C, Matches, Value) = 201 compareNode(ChildOffset, Name.substr(Consummed), Strict, 202 PreviousCharInName, PreviousCharInNeedle, Buffer, &N); 203 if (Matches) { 204 std::reverse_copy(C.Name.begin(), C.Name.end(), 205 std::back_inserter(Buffer)); 206 return std::make_tuple(N, true, Value); 207 } 208 ChildOffset += C.Size; 209 if (!C.HasSibling) 210 break; 211 } 212 } 213 return std::make_tuple(N, false, 0); 214 } 215 216 static std::tuple<Node, bool, uint32_t> 217 compareNode(uint32_t Offset, StringRef Name, bool Strict, BufferType &Buffer) { 218 return compareNode(Offset, Name, Strict, 0, 0, Buffer); 219 } 220 221 // clang-format off 222 constexpr const char *const HangulSyllables[][3] = { 223 { "G", "A", "" }, 224 { "GG", "AE", "G" }, 225 { "N", "YA", "GG" }, 226 { "D", "YAE", "GS" }, 227 { "DD", "EO", "N", }, 228 { "R", "E", "NJ" }, 229 { "M", "YEO", "NH" }, 230 { "B", "YE", "D" }, 231 { "BB", "O", "L" }, 232 { "S", "WA", "LG" }, 233 { "SS", "WAE", "LM" }, 234 { "", "OE", "LB" }, 235 { "J", "YO", "LS" }, 236 { "JJ", "U", "LT" }, 237 { "C", "WEO", "LP" }, 238 { "K", "WE", "LH" }, 239 { "T", "WI", "M" }, 240 { "P", "YU", "B" }, 241 { "H", "EU", "BS" }, 242 { 0, "YI", "S" }, 243 { 0, "I", "SS" }, 244 { 0, 0, "NG" }, 245 { 0, 0, "J" }, 246 { 0, 0, "C" }, 247 { 0, 0, "K" }, 248 { 0, 0, "T" }, 249 { 0, 0, "P" }, 250 { 0, 0, "H" } 251 }; 252 // clang-format on 253 254 // Unicode 14.0 255 // 3.12 Conjoining Jamo Behavior Common constants 256 constexpr const char32_t SBase = 0xAC00; 257 constexpr const uint32_t LCount = 19; 258 constexpr const uint32_t VCount = 21; 259 constexpr const uint32_t TCount = 28; 260 261 static std::size_t findSyllable(StringRef Name, bool Strict, 262 char &PreviousInName, int &Pos, int Column) { 263 assert(Column == 0 || Column == 1 || Column == 2); 264 static std::size_t CountPerColumn[] = {LCount, VCount, TCount}; 265 char NeedleStart = 0; 266 int Len = -1; 267 int Prev = PreviousInName; 268 for (std::size_t I = 0; I < CountPerColumn[Column]; I++) { 269 StringRef Syllable(HangulSyllables[I][Column]); 270 if (int(Syllable.size()) <= Len) 271 continue; 272 std::size_t Consummed = 0; 273 char PreviousInNameCopy = PreviousInName; 274 bool DoesStartWith = startsWith(Name, Syllable, Strict, Consummed, 275 PreviousInNameCopy, NeedleStart); 276 if (!DoesStartWith) 277 continue; 278 Len = Consummed; 279 Pos = I; 280 Prev = PreviousInNameCopy; 281 } 282 if (Len == -1) 283 return 0; 284 PreviousInName = Prev; 285 return size_t(Len); 286 } 287 288 static llvm::Optional<char32_t> 289 nameToHangulCodePoint(StringRef Name, bool Strict, BufferType &Buffer) { 290 Buffer.clear(); 291 // Hangul Syllable Decomposition 292 std::size_t Consummed = 0; 293 char NameStart = 0, NeedleStart = 0; 294 bool DoesStartWith = startsWith(Name, "HANGUL SYLLABLE ", Strict, Consummed, 295 NameStart, NeedleStart); 296 if (!DoesStartWith) 297 return None; 298 Name = Name.substr(Consummed); 299 int L = -1, V = -1, T = -1; 300 Name = Name.substr(findSyllable(Name, Strict, NameStart, L, 0)); 301 Name = Name.substr(findSyllable(Name, Strict, NameStart, V, 1)); 302 Name = Name.substr(findSyllable(Name, Strict, NameStart, T, 2)); 303 if (L != -1 && V != -1 && T != -1 && Name.empty()) { 304 if (!Strict) { 305 Buffer.append("HANGUL SYLLABLE "); 306 if (L != -1) 307 Buffer.append(HangulSyllables[L][0]); 308 if (V != -1) 309 Buffer.append(HangulSyllables[V][1]); 310 if (T != -1) 311 Buffer.append(HangulSyllables[T][2]); 312 } 313 return SBase + (std::uint32_t(L) * VCount + std::uint32_t(V)) * TCount + 314 std::uint32_t(T); 315 } 316 // Otherwise, it's an illegal syllable name. 317 return None; 318 } 319 320 struct GeneratedNamesData { 321 StringRef Prefix; 322 uint32_t Start; 323 uint32_t End; 324 }; 325 326 // Unicode 14.0 Table 4-8. Name Derivation Rule Prefix Strings 327 // This needs to be kept in sync with 328 // llvm/utils/UnicodeData/UnicodeNameMappingGenerator.cpp 329 static const GeneratedNamesData GeneratedNamesDataTable[] = { 330 {"CJK UNIFIED IDEOGRAPH-", 0x3400, 0x4DBF}, 331 {"CJK UNIFIED IDEOGRAPH-", 0x4E00, 0x9FFC}, 332 {"CJK UNIFIED IDEOGRAPH-", 0x20000, 0x2A6DD}, 333 {"CJK UNIFIED IDEOGRAPH-", 0x2A700, 0x2B734}, 334 {"CJK UNIFIED IDEOGRAPH-", 0x2B740, 0x2B81D}, 335 {"CJK UNIFIED IDEOGRAPH-", 0x2B820, 0x2CEA1}, 336 {"CJK UNIFIED IDEOGRAPH-", 0x2CEB0, 0x2EBE0}, 337 {"CJK UNIFIED IDEOGRAPH-", 0x30000, 0x3134A}, 338 {"TANGUT IDEOGRAPH-", 0x17000, 0x187F7}, 339 {"TANGUT IDEOGRAPH-", 0x18D00, 0x18D08}, 340 {"KHITAN SMALL SCRIPT CHARACTER-", 0x18B00, 0x18CD5}, 341 {"NUSHU CHARACTER-", 0x1B170, 0x1B2FB}, 342 {"CJK COMPATIBILITY IDEOGRAPH-", 0xF900, 0xFA6D}, 343 {"CJK COMPATIBILITY IDEOGRAPH-", 0xFA70, 0xFAD9}, 344 {"CJK COMPATIBILITY IDEOGRAPH-", 0x2F800, 0x2FA1D}, 345 }; 346 347 static llvm::Optional<char32_t> 348 nameToGeneratedCodePoint(StringRef Name, bool Strict, BufferType &Buffer) { 349 for (auto &&Item : GeneratedNamesDataTable) { 350 Buffer.clear(); 351 std::size_t Consummed = 0; 352 char NameStart = 0, NeedleStart = 0; 353 bool DoesStartWith = startsWith(Name, Item.Prefix, Strict, Consummed, 354 NameStart, NeedleStart, /*isPrefix*/ true); 355 if (!DoesStartWith) 356 continue; 357 auto Number = Name.substr(Consummed); 358 unsigned long long V = 0; 359 // Be consistent about mandating upper casing. 360 if (Strict && 361 llvm::any_of(Number, [](char C) { return C >= 'a' && C <= 'f'; })) 362 return {}; 363 if (getAsUnsignedInteger(Number, 16, V) || V < Item.Start || V > Item.End) 364 continue; 365 if (!Strict) { 366 Buffer.append(Item.Prefix); 367 Buffer.append(utohexstr(V, true)); 368 } 369 return V; 370 } 371 return None; 372 } 373 374 static llvm::Optional<char32_t> nameToCodepoint(StringRef Name, bool Strict, 375 BufferType &Buffer) { 376 if (Name.empty()) 377 return None; 378 379 llvm::Optional<char32_t> Res = nameToHangulCodePoint(Name, Strict, Buffer); 380 if (!Res) 381 Res = nameToGeneratedCodePoint(Name, Strict, Buffer); 382 if (Res) 383 return *Res; 384 385 Buffer.clear(); 386 Node Node; 387 bool Matches; 388 uint32_t Value; 389 std::tie(Node, Matches, Value) = compareNode(0, Name, Strict, Buffer); 390 if (Matches) { 391 std::reverse(Buffer.begin(), Buffer.end()); 392 // UAX44-LM2. Ignore case, whitespace, underscore ('_'), and all medial 393 // hyphens except the hyphen in U+1180 HANGUL JUNGSEONG O-E. 394 if (!Strict && Value == 0x116c && 395 Name.find_insensitive("O-E") != StringRef::npos) { 396 Buffer = "HANGUL JUNGSEONG O-E"; 397 Value = 0x1180; 398 } 399 return Value; 400 } 401 return None; 402 } 403 404 llvm::Optional<char32_t> nameToCodepointStrict(StringRef Name) { 405 406 BufferType Buffer; 407 auto Opt = nameToCodepoint(Name, true, Buffer); 408 return Opt; 409 } 410 411 llvm::Optional<LooseMatchingResult> 412 nameToCodepointLooseMatching(StringRef Name) { 413 BufferType Buffer; 414 auto Opt = nameToCodepoint(Name, false, Buffer); 415 if (!Opt) 416 return None; 417 return LooseMatchingResult{*Opt, Buffer}; 418 } 419 420 // Find the unicode character whose editing distance to Pattern 421 // is shortest, using the Wagner–Fischer algorithm. 422 llvm::SmallVector<MatchForCodepointName> 423 nearestMatchesForCodepointName(StringRef Pattern, std::size_t MaxMatchesCount) { 424 // We maintain a fixed size vector of matches, 425 // sorted by distance 426 // The worst match (with the biggest distance) are discarded when new elements 427 // are added. 428 std::size_t LargestEditDistance = 0; 429 llvm::SmallVector<MatchForCodepointName> Matches; 430 Matches.reserve(MaxMatchesCount + 1); 431 432 auto Insert = [&](const Node &Node, uint32_t Distance, 433 char32_t Value) -> bool { 434 if (Distance > LargestEditDistance) { 435 if (Matches.size() == MaxMatchesCount) 436 return false; 437 LargestEditDistance = Distance; 438 } 439 // To avoid allocations, the creation of the name is delayed 440 // as much as possible. 441 std::string Name; 442 auto GetName = [&] { 443 if (Name.empty()) 444 Name = Node.fullName(); 445 return Name; 446 }; 447 448 auto It = std::lower_bound( 449 Matches.begin(), Matches.end(), Distance, 450 [&](const MatchForCodepointName &a, std::size_t Distance) { 451 if (Distance == a.Distance) 452 return a.Name < GetName(); 453 return a.Distance < Distance; 454 }); 455 if (It == Matches.end() && Matches.size() == MaxMatchesCount) 456 return false; 457 458 MatchForCodepointName M{GetName(), Distance, Value}; 459 Matches.insert(It, std::move(M)); 460 if (Matches.size() > MaxMatchesCount) 461 Matches.pop_back(); 462 return true; 463 }; 464 465 // We ignore case, space, hyphens, etc, 466 // in both the search pattern and the prospective names. 467 auto Normalize = [](StringRef Name) { 468 std::string Out; 469 Out.reserve(Name.size()); 470 for (char C : Name) { 471 if (isAlnum(C)) 472 Out.push_back(toUpper(C)); 473 } 474 return Out; 475 }; 476 std::string NormalizedName = Normalize(Pattern); 477 478 // Allocate a matrix big enough for longest names. 479 const std::size_t Columns = 480 std::min(NormalizedName.size(), UnicodeNameToCodepointLargestNameSize) + 481 1; 482 483 LLVM_ATTRIBUTE_UNUSED static std::size_t Rows = 484 UnicodeNameToCodepointLargestNameSize + 1; 485 486 std::vector<char> Distances( 487 Columns * (UnicodeNameToCodepointLargestNameSize + 1), 0); 488 489 auto Get = [&Distances, Columns](size_t Column, std::size_t Row) -> char & { 490 assert(Column < Columns); 491 assert(Row < Rows); 492 return Distances[Row * Columns + Column]; 493 }; 494 495 for (std::size_t I = 0; I < Columns; I++) 496 Get(I, 0) = I; 497 498 // Visit the childrens, 499 // Filling (and overriding) the matrix for the name fragment of each node 500 // iteratively. CompleteName is used to collect the actual name of potential 501 // match, respecting case and spacing. 502 auto VisitNode = [&](const Node &N, std::size_t Row, 503 auto &VisitNode) -> void { 504 std::size_t J = 0; 505 for (; J < N.Name.size(); J++) { 506 if (!isAlnum(N.Name[J])) 507 continue; 508 509 Get(0, Row) = Row; 510 511 for (std::size_t I = 1; I < Columns; I++) { 512 const int Delete = Get(I - 1, Row) + 1; 513 const int Insert = Get(I, Row - 1) + 1; 514 515 const int Replace = 516 Get(I - 1, Row - 1) + (NormalizedName[I - 1] != N.Name[J] ? 1 : 0); 517 518 Get(I, Row) = std::min(Insert, std::min(Delete, Replace)); 519 } 520 521 Row++; 522 } 523 524 unsigned Cost = Get(Columns - 1, Row - 1); 525 if (N.Value != 0xFFFFFFFF) { 526 Insert(N, Cost, N.Value); 527 } 528 529 if (N.hasChildren()) { 530 auto ChildOffset = N.ChildrenOffset; 531 for (;;) { 532 Node C = readNode(ChildOffset, &N); 533 ChildOffset += C.Size; 534 if (!C.isValid()) 535 break; 536 VisitNode(C, Row, VisitNode); 537 if (!C.HasSibling) 538 break; 539 } 540 } 541 }; 542 543 Node Root = createRoot(); 544 VisitNode(Root, 1, VisitNode); 545 return Matches; 546 } 547 548 } // namespace unicode 549 550 } // namespace sys 551 } // namespace llvm 552