1 //===- llvm/Support/Unicode.h - Unicode character properties -*- C++ -*-=====// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file defines functions that allow querying certain properties of Unicode 10 // characters. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #ifndef LLVM_SUPPORT_UNICODE_H 15 #define LLVM_SUPPORT_UNICODE_H 16 17 #include "llvm/ADT/SmallString.h" 18 #include <optional> 19 #include <string> 20 21 namespace llvm { 22 class StringRef; 23 24 namespace sys { 25 namespace unicode { 26 27 enum ColumnWidthErrors { 28 ErrorInvalidUTF8 = -2, 29 ErrorNonPrintableCharacter = -1 30 }; 31 32 /// Determines if a character is likely to be displayed correctly on the 33 /// terminal. Exact implementation would have to depend on the specific 34 /// terminal, so we define the semantic that should be suitable for generic case 35 /// of a terminal capable to output Unicode characters. 36 /// 37 /// Printable codepoints are those in the categories L, M, N, P, S and Zs 38 /// \return true if the character is considered printable. 39 bool isPrintable(int UCS); 40 41 // Formatting codepoints are codepoints in the Cf category. 42 bool isFormatting(int UCS); 43 44 /// Gets the number of positions the UTF8-encoded \p Text is likely to occupy 45 /// when output on a terminal ("character width"). This depends on the 46 /// implementation of the terminal, and there's no standard definition of 47 /// character width. 48 /// 49 /// The implementation defines it in a way that is expected to be compatible 50 /// with a generic Unicode-capable terminal. 51 /// 52 /// \return Character width: 53 /// * ErrorNonPrintableCharacter (-1) if \p Text contains non-printable 54 /// characters (as identified by isPrintable); 55 /// * 0 for each non-spacing and enclosing combining mark; 56 /// * 2 for each CJK character excluding halfwidth forms; 57 /// * 1 for each of the remaining characters. 58 int columnWidthUTF8(StringRef Text); 59 60 /// Fold input unicode character according the Simple unicode case folding 61 /// rules. 62 int foldCharSimple(int C); 63 64 /// Maps the name or the alias of a Unicode character to its associated 65 /// codepoints. 66 /// The names and aliases are derived from UnicodeData.txt and NameAliases.txt 67 /// For compatibility with the semantics of named character escape sequences in 68 /// C++, this mapping does an exact match sensitive to casing and spacing. 69 /// \return The codepoint of the corresponding character, if any. 70 std::optional<char32_t> nameToCodepointStrict(StringRef Name); 71 72 struct LooseMatchingResult { 73 char32_t CodePoint; 74 SmallString<64> Name; 75 }; 76 77 std::optional<LooseMatchingResult> nameToCodepointLooseMatching(StringRef Name); 78 79 struct MatchForCodepointName { 80 std::string Name; 81 uint32_t Distance = 0; 82 char32_t Value = 0; 83 }; 84 85 SmallVector<MatchForCodepointName> 86 nearestMatchesForCodepointName(StringRef Pattern, std::size_t MaxMatchesCount); 87 88 } // namespace unicode 89 } // namespace sys 90 } // namespace llvm 91 92 #endif 93