//===- llvm/Support/Unicode.cpp - Unicode character properties -*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements functions that allow querying certain properties of
// Unicode characters.
//
//===----------------------------------------------------------------------===//

#include "llvm/Support/Unicode.h"
#include "llvm/Support/ConvertUTF.h"
#include "llvm/Support/UnicodeCharRanges.h"

namespace llvm {
namespace sys {
namespace unicode {

/// Unicode code points of the categories L, M, N, P, S and Zs are considered
/// printable.
/// In addition, U+00AD SOFT HYPHEN is also considered printable, as
/// it's actually displayed on most terminals.
/// \return true if the character is considered printable.
27 bool isPrintable(int UCS) { 28 // https://unicode.org/Public/14.0.0/ucdxml/ 29 static const UnicodeCharRange PrintableRanges[] = { 30 {0x0020, 0x007E}, {0x00A0, 0x00AC}, {0x00AE, 0x0377}, 31 {0x037A, 0x037F}, {0x0384, 0x038A}, {0x038C, 0x038C}, 32 {0x038E, 0x03A1}, {0x03A3, 0x052F}, {0x0531, 0x0556}, 33 {0x0559, 0x058A}, {0x058D, 0x058F}, {0x0591, 0x05C7}, 34 {0x05D0, 0x05EA}, {0x05EF, 0x05F4}, {0x0606, 0x061B}, 35 {0x061D, 0x06DC}, {0x06DE, 0x070D}, {0x0710, 0x074A}, 36 {0x074D, 0x07B1}, {0x07C0, 0x07FA}, {0x07FD, 0x082D}, 37 {0x0830, 0x083E}, {0x0840, 0x085B}, {0x085E, 0x085E}, 38 {0x0860, 0x086A}, {0x0870, 0x088E}, {0x0898, 0x08E1}, 39 {0x08E3, 0x0983}, {0x0985, 0x098C}, {0x098F, 0x0990}, 40 {0x0993, 0x09A8}, {0x09AA, 0x09B0}, {0x09B2, 0x09B2}, 41 {0x09B6, 0x09B9}, {0x09BC, 0x09C4}, {0x09C7, 0x09C8}, 42 {0x09CB, 0x09CE}, {0x09D7, 0x09D7}, {0x09DC, 0x09DD}, 43 {0x09DF, 0x09E3}, {0x09E6, 0x09FE}, {0x0A01, 0x0A03}, 44 {0x0A05, 0x0A0A}, {0x0A0F, 0x0A10}, {0x0A13, 0x0A28}, 45 {0x0A2A, 0x0A30}, {0x0A32, 0x0A33}, {0x0A35, 0x0A36}, 46 {0x0A38, 0x0A39}, {0x0A3C, 0x0A3C}, {0x0A3E, 0x0A42}, 47 {0x0A47, 0x0A48}, {0x0A4B, 0x0A4D}, {0x0A51, 0x0A51}, 48 {0x0A59, 0x0A5C}, {0x0A5E, 0x0A5E}, {0x0A66, 0x0A76}, 49 {0x0A81, 0x0A83}, {0x0A85, 0x0A8D}, {0x0A8F, 0x0A91}, 50 {0x0A93, 0x0AA8}, {0x0AAA, 0x0AB0}, {0x0AB2, 0x0AB3}, 51 {0x0AB5, 0x0AB9}, {0x0ABC, 0x0AC5}, {0x0AC7, 0x0AC9}, 52 {0x0ACB, 0x0ACD}, {0x0AD0, 0x0AD0}, {0x0AE0, 0x0AE3}, 53 {0x0AE6, 0x0AF1}, {0x0AF9, 0x0AFF}, {0x0B01, 0x0B03}, 54 {0x0B05, 0x0B0C}, {0x0B0F, 0x0B10}, {0x0B13, 0x0B28}, 55 {0x0B2A, 0x0B30}, {0x0B32, 0x0B33}, {0x0B35, 0x0B39}, 56 {0x0B3C, 0x0B44}, {0x0B47, 0x0B48}, {0x0B4B, 0x0B4D}, 57 {0x0B55, 0x0B57}, {0x0B5C, 0x0B5D}, {0x0B5F, 0x0B63}, 58 {0x0B66, 0x0B77}, {0x0B82, 0x0B83}, {0x0B85, 0x0B8A}, 59 {0x0B8E, 0x0B90}, {0x0B92, 0x0B95}, {0x0B99, 0x0B9A}, 60 {0x0B9C, 0x0B9C}, {0x0B9E, 0x0B9F}, {0x0BA3, 0x0BA4}, 61 {0x0BA8, 0x0BAA}, {0x0BAE, 0x0BB9}, {0x0BBE, 0x0BC2}, 62 {0x0BC6, 0x0BC8}, {0x0BCA, 0x0BCD}, 
{0x0BD0, 0x0BD0}, 63 {0x0BD7, 0x0BD7}, {0x0BE6, 0x0BFA}, {0x0C00, 0x0C0C}, 64 {0x0C0E, 0x0C10}, {0x0C12, 0x0C28}, {0x0C2A, 0x0C39}, 65 {0x0C3C, 0x0C44}, {0x0C46, 0x0C48}, {0x0C4A, 0x0C4D}, 66 {0x0C55, 0x0C56}, {0x0C58, 0x0C5A}, {0x0C5D, 0x0C5D}, 67 {0x0C60, 0x0C63}, {0x0C66, 0x0C6F}, {0x0C77, 0x0C8C}, 68 {0x0C8E, 0x0C90}, {0x0C92, 0x0CA8}, {0x0CAA, 0x0CB3}, 69 {0x0CB5, 0x0CB9}, {0x0CBC, 0x0CC4}, {0x0CC6, 0x0CC8}, 70 {0x0CCA, 0x0CCD}, {0x0CD5, 0x0CD6}, {0x0CDD, 0x0CDE}, 71 {0x0CE0, 0x0CE3}, {0x0CE6, 0x0CEF}, {0x0CF1, 0x0CF2}, 72 {0x0D00, 0x0D0C}, {0x0D0E, 0x0D10}, {0x0D12, 0x0D44}, 73 {0x0D46, 0x0D48}, {0x0D4A, 0x0D4F}, {0x0D54, 0x0D63}, 74 {0x0D66, 0x0D7F}, {0x0D81, 0x0D83}, {0x0D85, 0x0D96}, 75 {0x0D9A, 0x0DB1}, {0x0DB3, 0x0DBB}, {0x0DBD, 0x0DBD}, 76 {0x0DC0, 0x0DC6}, {0x0DCA, 0x0DCA}, {0x0DCF, 0x0DD4}, 77 {0x0DD6, 0x0DD6}, {0x0DD8, 0x0DDF}, {0x0DE6, 0x0DEF}, 78 {0x0DF2, 0x0DF4}, {0x0E01, 0x0E3A}, {0x0E3F, 0x0E5B}, 79 {0x0E81, 0x0E82}, {0x0E84, 0x0E84}, {0x0E86, 0x0E8A}, 80 {0x0E8C, 0x0EA3}, {0x0EA5, 0x0EA5}, {0x0EA7, 0x0EBD}, 81 {0x0EC0, 0x0EC4}, {0x0EC6, 0x0EC6}, {0x0EC8, 0x0ECD}, 82 {0x0ED0, 0x0ED9}, {0x0EDC, 0x0EDF}, {0x0F00, 0x0F47}, 83 {0x0F49, 0x0F6C}, {0x0F71, 0x0F97}, {0x0F99, 0x0FBC}, 84 {0x0FBE, 0x0FCC}, {0x0FCE, 0x0FDA}, {0x1000, 0x10C5}, 85 {0x10C7, 0x10C7}, {0x10CD, 0x10CD}, {0x10D0, 0x1248}, 86 {0x124A, 0x124D}, {0x1250, 0x1256}, {0x1258, 0x1258}, 87 {0x125A, 0x125D}, {0x1260, 0x1288}, {0x128A, 0x128D}, 88 {0x1290, 0x12B0}, {0x12B2, 0x12B5}, {0x12B8, 0x12BE}, 89 {0x12C0, 0x12C0}, {0x12C2, 0x12C5}, {0x12C8, 0x12D6}, 90 {0x12D8, 0x1310}, {0x1312, 0x1315}, {0x1318, 0x135A}, 91 {0x135D, 0x137C}, {0x1380, 0x1399}, {0x13A0, 0x13F5}, 92 {0x13F8, 0x13FD}, {0x1400, 0x169C}, {0x16A0, 0x16F8}, 93 {0x1700, 0x1715}, {0x171F, 0x1736}, {0x1740, 0x1753}, 94 {0x1760, 0x176C}, {0x176E, 0x1770}, {0x1772, 0x1773}, 95 {0x1780, 0x17DD}, {0x17E0, 0x17E9}, {0x17F0, 0x17F9}, 96 {0x1800, 0x180D}, {0x180F, 0x1819}, {0x1820, 0x1878}, 97 {0x1880, 0x18AA}, {0x18B0, 0x18F5}, 
{0x1900, 0x191E}, 98 {0x1920, 0x192B}, {0x1930, 0x193B}, {0x1940, 0x1940}, 99 {0x1944, 0x196D}, {0x1970, 0x1974}, {0x1980, 0x19AB}, 100 {0x19B0, 0x19C9}, {0x19D0, 0x19DA}, {0x19DE, 0x1A1B}, 101 {0x1A1E, 0x1A5E}, {0x1A60, 0x1A7C}, {0x1A7F, 0x1A89}, 102 {0x1A90, 0x1A99}, {0x1AA0, 0x1AAD}, {0x1AB0, 0x1ACE}, 103 {0x1B00, 0x1B4C}, {0x1B50, 0x1B7E}, {0x1B80, 0x1BF3}, 104 {0x1BFC, 0x1C37}, {0x1C3B, 0x1C49}, {0x1C4D, 0x1C88}, 105 {0x1C90, 0x1CBA}, {0x1CBD, 0x1CC7}, {0x1CD0, 0x1CFA}, 106 {0x1D00, 0x1F15}, {0x1F18, 0x1F1D}, {0x1F20, 0x1F45}, 107 {0x1F48, 0x1F4D}, {0x1F50, 0x1F57}, {0x1F59, 0x1F59}, 108 {0x1F5B, 0x1F5B}, {0x1F5D, 0x1F5D}, {0x1F5F, 0x1F7D}, 109 {0x1F80, 0x1FB4}, {0x1FB6, 0x1FC4}, {0x1FC6, 0x1FD3}, 110 {0x1FD6, 0x1FDB}, {0x1FDD, 0x1FEF}, {0x1FF2, 0x1FF4}, 111 {0x1FF6, 0x1FFE}, {0x2000, 0x200A}, {0x2010, 0x2027}, 112 {0x202F, 0x205F}, {0x2070, 0x2071}, {0x2074, 0x208E}, 113 {0x2090, 0x209C}, {0x20A0, 0x20C0}, {0x20D0, 0x20F0}, 114 {0x2100, 0x218B}, {0x2190, 0x2426}, {0x2440, 0x244A}, 115 {0x2460, 0x2B73}, {0x2B76, 0x2B95}, {0x2B97, 0x2CF3}, 116 {0x2CF9, 0x2D25}, {0x2D27, 0x2D27}, {0x2D2D, 0x2D2D}, 117 {0x2D30, 0x2D67}, {0x2D6F, 0x2D70}, {0x2D7F, 0x2D96}, 118 {0x2DA0, 0x2DA6}, {0x2DA8, 0x2DAE}, {0x2DB0, 0x2DB6}, 119 {0x2DB8, 0x2DBE}, {0x2DC0, 0x2DC6}, {0x2DC8, 0x2DCE}, 120 {0x2DD0, 0x2DD6}, {0x2DD8, 0x2DDE}, {0x2DE0, 0x2E5D}, 121 {0x2E80, 0x2E99}, {0x2E9B, 0x2EF3}, {0x2F00, 0x2FD5}, 122 {0x2FF0, 0x2FFB}, {0x3000, 0x303F}, {0x3041, 0x3096}, 123 {0x3099, 0x30FF}, {0x3105, 0x312F}, {0x3131, 0x318E}, 124 {0x3190, 0x31E3}, {0x31F0, 0x321E}, {0x3220, 0xA48C}, 125 {0xA490, 0xA4C6}, {0xA4D0, 0xA62B}, {0xA640, 0xA6F7}, 126 {0xA700, 0xA7CA}, {0xA7D0, 0xA7D1}, {0xA7D3, 0xA7D3}, 127 {0xA7D5, 0xA7D9}, {0xA7F2, 0xA82C}, {0xA830, 0xA839}, 128 {0xA840, 0xA877}, {0xA880, 0xA8C5}, {0xA8CE, 0xA8D9}, 129 {0xA8E0, 0xA953}, {0xA95F, 0xA97C}, {0xA980, 0xA9CD}, 130 {0xA9CF, 0xA9D9}, {0xA9DE, 0xA9FE}, {0xAA00, 0xAA36}, 131 {0xAA40, 0xAA4D}, {0xAA50, 0xAA59}, {0xAA5C, 0xAAC2}, 132 
{0xAADB, 0xAAF6}, {0xAB01, 0xAB06}, {0xAB09, 0xAB0E}, 133 {0xAB11, 0xAB16}, {0xAB20, 0xAB26}, {0xAB28, 0xAB2E}, 134 {0xAB30, 0xAB6B}, {0xAB70, 0xABED}, {0xABF0, 0xABF9}, 135 {0xAC00, 0xD7A3}, {0xD7B0, 0xD7C6}, {0xD7CB, 0xD7FB}, 136 {0xF900, 0xFA6D}, {0xFA70, 0xFAD9}, {0xFB00, 0xFB06}, 137 {0xFB13, 0xFB17}, {0xFB1D, 0xFB36}, {0xFB38, 0xFB3C}, 138 {0xFB3E, 0xFB3E}, {0xFB40, 0xFB41}, {0xFB43, 0xFB44}, 139 {0xFB46, 0xFBC2}, {0xFBD3, 0xFD8F}, {0xFD92, 0xFDC7}, 140 {0xFDCF, 0xFDCF}, {0xFDF0, 0xFE19}, {0xFE20, 0xFE52}, 141 {0xFE54, 0xFE66}, {0xFE68, 0xFE6B}, {0xFE70, 0xFE74}, 142 {0xFE76, 0xFEFC}, {0xFF01, 0xFFBE}, {0xFFC2, 0xFFC7}, 143 {0xFFCA, 0xFFCF}, {0xFFD2, 0xFFD7}, {0xFFDA, 0xFFDC}, 144 {0xFFE0, 0xFFE6}, {0xFFE8, 0xFFEE}, {0xFFFC, 0xFFFD}, 145 {0x10000, 0x1000B}, {0x1000D, 0x10026}, {0x10028, 0x1003A}, 146 {0x1003C, 0x1003D}, {0x1003F, 0x1004D}, {0x10050, 0x1005D}, 147 {0x10080, 0x100FA}, {0x10100, 0x10102}, {0x10107, 0x10133}, 148 {0x10137, 0x1018E}, {0x10190, 0x1019C}, {0x101A0, 0x101A0}, 149 {0x101D0, 0x101FD}, {0x10280, 0x1029C}, {0x102A0, 0x102D0}, 150 {0x102E0, 0x102FB}, {0x10300, 0x10323}, {0x1032D, 0x1034A}, 151 {0x10350, 0x1037A}, {0x10380, 0x1039D}, {0x1039F, 0x103C3}, 152 {0x103C8, 0x103D5}, {0x10400, 0x1049D}, {0x104A0, 0x104A9}, 153 {0x104B0, 0x104D3}, {0x104D8, 0x104FB}, {0x10500, 0x10527}, 154 {0x10530, 0x10563}, {0x1056F, 0x1057A}, {0x1057C, 0x1058A}, 155 {0x1058C, 0x10592}, {0x10594, 0x10595}, {0x10597, 0x105A1}, 156 {0x105A3, 0x105B1}, {0x105B3, 0x105B9}, {0x105BB, 0x105BC}, 157 {0x10600, 0x10736}, {0x10740, 0x10755}, {0x10760, 0x10767}, 158 {0x10780, 0x10785}, {0x10787, 0x107B0}, {0x107B2, 0x107BA}, 159 {0x10800, 0x10805}, {0x10808, 0x10808}, {0x1080A, 0x10835}, 160 {0x10837, 0x10838}, {0x1083C, 0x1083C}, {0x1083F, 0x10855}, 161 {0x10857, 0x1089E}, {0x108A7, 0x108AF}, {0x108E0, 0x108F2}, 162 {0x108F4, 0x108F5}, {0x108FB, 0x1091B}, {0x1091F, 0x10939}, 163 {0x1093F, 0x1093F}, {0x10980, 0x109B7}, {0x109BC, 0x109CF}, 164 {0x109D2, 0x10A03}, {0x10A05, 
0x10A06}, {0x10A0C, 0x10A13}, 165 {0x10A15, 0x10A17}, {0x10A19, 0x10A35}, {0x10A38, 0x10A3A}, 166 {0x10A3F, 0x10A48}, {0x10A50, 0x10A58}, {0x10A60, 0x10A9F}, 167 {0x10AC0, 0x10AE6}, {0x10AEB, 0x10AF6}, {0x10B00, 0x10B35}, 168 {0x10B39, 0x10B55}, {0x10B58, 0x10B72}, {0x10B78, 0x10B91}, 169 {0x10B99, 0x10B9C}, {0x10BA9, 0x10BAF}, {0x10C00, 0x10C48}, 170 {0x10C80, 0x10CB2}, {0x10CC0, 0x10CF2}, {0x10CFA, 0x10D27}, 171 {0x10D30, 0x10D39}, {0x10E60, 0x10E7E}, {0x10E80, 0x10EA9}, 172 {0x10EAB, 0x10EAD}, {0x10EB0, 0x10EB1}, {0x10F00, 0x10F27}, 173 {0x10F30, 0x10F59}, {0x10F70, 0x10F89}, {0x10FB0, 0x10FCB}, 174 {0x10FE0, 0x10FF6}, {0x11000, 0x1104D}, {0x11052, 0x11075}, 175 {0x1107F, 0x110BC}, {0x110BE, 0x110C2}, {0x110D0, 0x110E8}, 176 {0x110F0, 0x110F9}, {0x11100, 0x11134}, {0x11136, 0x11147}, 177 {0x11150, 0x11176}, {0x11180, 0x111DF}, {0x111E1, 0x111F4}, 178 {0x11200, 0x11211}, {0x11213, 0x1123E}, {0x11280, 0x11286}, 179 {0x11288, 0x11288}, {0x1128A, 0x1128D}, {0x1128F, 0x1129D}, 180 {0x1129F, 0x112A9}, {0x112B0, 0x112EA}, {0x112F0, 0x112F9}, 181 {0x11300, 0x11303}, {0x11305, 0x1130C}, {0x1130F, 0x11310}, 182 {0x11313, 0x11328}, {0x1132A, 0x11330}, {0x11332, 0x11333}, 183 {0x11335, 0x11339}, {0x1133B, 0x11344}, {0x11347, 0x11348}, 184 {0x1134B, 0x1134D}, {0x11350, 0x11350}, {0x11357, 0x11357}, 185 {0x1135D, 0x11363}, {0x11366, 0x1136C}, {0x11370, 0x11374}, 186 {0x11400, 0x1145B}, {0x1145D, 0x11461}, {0x11480, 0x114C7}, 187 {0x114D0, 0x114D9}, {0x11580, 0x115B5}, {0x115B8, 0x115DD}, 188 {0x11600, 0x11644}, {0x11650, 0x11659}, {0x11660, 0x1166C}, 189 {0x11680, 0x116B9}, {0x116C0, 0x116C9}, {0x11700, 0x1171A}, 190 {0x1171D, 0x1172B}, {0x11730, 0x11746}, {0x11800, 0x1183B}, 191 {0x118A0, 0x118F2}, {0x118FF, 0x11906}, {0x11909, 0x11909}, 192 {0x1190C, 0x11913}, {0x11915, 0x11916}, {0x11918, 0x11935}, 193 {0x11937, 0x11938}, {0x1193B, 0x11946}, {0x11950, 0x11959}, 194 {0x119A0, 0x119A7}, {0x119AA, 0x119D7}, {0x119DA, 0x119E4}, 195 {0x11A00, 0x11A47}, {0x11A50, 0x11AA2}, 
{0x11AB0, 0x11AF8}, 196 {0x11C00, 0x11C08}, {0x11C0A, 0x11C36}, {0x11C38, 0x11C45}, 197 {0x11C50, 0x11C6C}, {0x11C70, 0x11C8F}, {0x11C92, 0x11CA7}, 198 {0x11CA9, 0x11CB6}, {0x11D00, 0x11D06}, {0x11D08, 0x11D09}, 199 {0x11D0B, 0x11D36}, {0x11D3A, 0x11D3A}, {0x11D3C, 0x11D3D}, 200 {0x11D3F, 0x11D47}, {0x11D50, 0x11D59}, {0x11D60, 0x11D65}, 201 {0x11D67, 0x11D68}, {0x11D6A, 0x11D8E}, {0x11D90, 0x11D91}, 202 {0x11D93, 0x11D98}, {0x11DA0, 0x11DA9}, {0x11EE0, 0x11EF8}, 203 {0x11FB0, 0x11FB0}, {0x11FC0, 0x11FF1}, {0x11FFF, 0x12399}, 204 {0x12400, 0x1246E}, {0x12470, 0x12474}, {0x12480, 0x12543}, 205 {0x12F90, 0x12FF2}, {0x13000, 0x1342E}, {0x14400, 0x14646}, 206 {0x16800, 0x16A38}, {0x16A40, 0x16A5E}, {0x16A60, 0x16A69}, 207 {0x16A6E, 0x16ABE}, {0x16AC0, 0x16AC9}, {0x16AD0, 0x16AED}, 208 {0x16AF0, 0x16AF5}, {0x16B00, 0x16B45}, {0x16B50, 0x16B59}, 209 {0x16B5B, 0x16B61}, {0x16B63, 0x16B77}, {0x16B7D, 0x16B8F}, 210 {0x16E40, 0x16E9A}, {0x16F00, 0x16F4A}, {0x16F4F, 0x16F87}, 211 {0x16F8F, 0x16F9F}, {0x16FE0, 0x16FE4}, {0x16FF0, 0x16FF1}, 212 {0x17000, 0x187F7}, {0x18800, 0x18CD5}, {0x18D00, 0x18D08}, 213 {0x1AFF0, 0x1AFF3}, {0x1AFF5, 0x1AFFB}, {0x1AFFD, 0x1AFFE}, 214 {0x1B000, 0x1B122}, {0x1B150, 0x1B152}, {0x1B164, 0x1B167}, 215 {0x1B170, 0x1B2FB}, {0x1BC00, 0x1BC6A}, {0x1BC70, 0x1BC7C}, 216 {0x1BC80, 0x1BC88}, {0x1BC90, 0x1BC99}, {0x1BC9C, 0x1BC9F}, 217 {0x1CF00, 0x1CF2D}, {0x1CF30, 0x1CF46}, {0x1CF50, 0x1CFC3}, 218 {0x1D000, 0x1D0F5}, {0x1D100, 0x1D126}, {0x1D129, 0x1D172}, 219 {0x1D17B, 0x1D1EA}, {0x1D200, 0x1D245}, {0x1D2E0, 0x1D2F3}, 220 {0x1D300, 0x1D356}, {0x1D360, 0x1D378}, {0x1D400, 0x1D454}, 221 {0x1D456, 0x1D49C}, {0x1D49E, 0x1D49F}, {0x1D4A2, 0x1D4A2}, 222 {0x1D4A5, 0x1D4A6}, {0x1D4A9, 0x1D4AC}, {0x1D4AE, 0x1D4B9}, 223 {0x1D4BB, 0x1D4BB}, {0x1D4BD, 0x1D4C3}, {0x1D4C5, 0x1D505}, 224 {0x1D507, 0x1D50A}, {0x1D50D, 0x1D514}, {0x1D516, 0x1D51C}, 225 {0x1D51E, 0x1D539}, {0x1D53B, 0x1D53E}, {0x1D540, 0x1D544}, 226 {0x1D546, 0x1D546}, {0x1D54A, 0x1D550}, {0x1D552, 
0x1D6A5}, 227 {0x1D6A8, 0x1D7CB}, {0x1D7CE, 0x1DA8B}, {0x1DA9B, 0x1DA9F}, 228 {0x1DAA1, 0x1DAAF}, {0x1DF00, 0x1DF1E}, {0x1E000, 0x1E006}, 229 {0x1E008, 0x1E018}, {0x1E01B, 0x1E021}, {0x1E023, 0x1E024}, 230 {0x1E026, 0x1E02A}, {0x1E100, 0x1E12C}, {0x1E130, 0x1E13D}, 231 {0x1E140, 0x1E149}, {0x1E14E, 0x1E14F}, {0x1E290, 0x1E2AE}, 232 {0x1E2C0, 0x1E2F9}, {0x1E2FF, 0x1E2FF}, {0x1E7E0, 0x1E7E6}, 233 {0x1E7E8, 0x1E7EB}, {0x1E7ED, 0x1E7EE}, {0x1E7F0, 0x1E7FE}, 234 {0x1E800, 0x1E8C4}, {0x1E8C7, 0x1E8D6}, {0x1E900, 0x1E94B}, 235 {0x1E950, 0x1E959}, {0x1E95E, 0x1E95F}, {0x1EC71, 0x1ECB4}, 236 {0x1ED01, 0x1ED3D}, {0x1EE00, 0x1EE03}, {0x1EE05, 0x1EE1F}, 237 {0x1EE21, 0x1EE22}, {0x1EE24, 0x1EE24}, {0x1EE27, 0x1EE27}, 238 {0x1EE29, 0x1EE32}, {0x1EE34, 0x1EE37}, {0x1EE39, 0x1EE39}, 239 {0x1EE3B, 0x1EE3B}, {0x1EE42, 0x1EE42}, {0x1EE47, 0x1EE47}, 240 {0x1EE49, 0x1EE49}, {0x1EE4B, 0x1EE4B}, {0x1EE4D, 0x1EE4F}, 241 {0x1EE51, 0x1EE52}, {0x1EE54, 0x1EE54}, {0x1EE57, 0x1EE57}, 242 {0x1EE59, 0x1EE59}, {0x1EE5B, 0x1EE5B}, {0x1EE5D, 0x1EE5D}, 243 {0x1EE5F, 0x1EE5F}, {0x1EE61, 0x1EE62}, {0x1EE64, 0x1EE64}, 244 {0x1EE67, 0x1EE6A}, {0x1EE6C, 0x1EE72}, {0x1EE74, 0x1EE77}, 245 {0x1EE79, 0x1EE7C}, {0x1EE7E, 0x1EE7E}, {0x1EE80, 0x1EE89}, 246 {0x1EE8B, 0x1EE9B}, {0x1EEA1, 0x1EEA3}, {0x1EEA5, 0x1EEA9}, 247 {0x1EEAB, 0x1EEBB}, {0x1EEF0, 0x1EEF1}, {0x1F000, 0x1F02B}, 248 {0x1F030, 0x1F093}, {0x1F0A0, 0x1F0AE}, {0x1F0B1, 0x1F0BF}, 249 {0x1F0C1, 0x1F0CF}, {0x1F0D1, 0x1F0F5}, {0x1F100, 0x1F1AD}, 250 {0x1F1E6, 0x1F202}, {0x1F210, 0x1F23B}, {0x1F240, 0x1F248}, 251 {0x1F250, 0x1F251}, {0x1F260, 0x1F265}, {0x1F300, 0x1F6D7}, 252 {0x1F6DD, 0x1F6EC}, {0x1F6F0, 0x1F6FC}, {0x1F700, 0x1F773}, 253 {0x1F780, 0x1F7D8}, {0x1F7E0, 0x1F7EB}, {0x1F7F0, 0x1F7F0}, 254 {0x1F800, 0x1F80B}, {0x1F810, 0x1F847}, {0x1F850, 0x1F859}, 255 {0x1F860, 0x1F887}, {0x1F890, 0x1F8AD}, {0x1F8B0, 0x1F8B1}, 256 {0x1F900, 0x1FA53}, {0x1FA60, 0x1FA6D}, {0x1FA70, 0x1FA74}, 257 {0x1FA78, 0x1FA7C}, {0x1FA80, 0x1FA86}, {0x1FA90, 0x1FAAC}, 258 
{0x1FAB0, 0x1FABA}, {0x1FAC0, 0x1FAC5}, {0x1FAD0, 0x1FAD9}, 259 {0x1FAE0, 0x1FAE7}, {0x1FAF0, 0x1FAF6}, {0x1FB00, 0x1FB92}, 260 {0x1FB94, 0x1FBCA}, {0x1FBF0, 0x1FBF9}, {0x20000, 0x2A6DF}, 261 {0x2A700, 0x2B738}, {0x2B740, 0x2B81D}, {0x2B820, 0x2CEA1}, 262 {0x2CEB0, 0x2EBE0}, {0x2F800, 0x2FA1D}, {0x30000, 0x3134A}, 263 {0xE0100, 0xE01EF}}; 264 265 static const UnicodeCharSet Printables(PrintableRanges); 266 // Clang special cases 0x00AD (SOFT HYPHEN) which is rendered as an actual 267 // hyphen in most terminals. 268 return UCS == 0x00AD || Printables.contains(UCS); 269 } 270 271 /// Unicode code points of the Cf category are considered 272 /// formatting characters. 273 bool isFormatting(int UCS) { 274 275 // https://unicode.org/Public/14.0.0/ucdxml/ 276 static const UnicodeCharRange Cf[] = { 277 {0x00AD, 0x00AD}, {0x0600, 0x0605}, {0x061C, 0x061C}, 278 {0x06DD, 0x06DD}, {0x070F, 0x070F}, {0x0890, 0x0891}, 279 {0x08E2, 0x08E2}, {0x180E, 0x180E}, {0x200B, 0x200F}, 280 {0x202A, 0x202E}, {0x2060, 0x2064}, {0x2066, 0x206F}, 281 {0xFEFF, 0xFEFF}, {0xFFF9, 0xFFFB}, {0x110BD, 0x110BD}, 282 {0x110CD, 0x110CD}, {0x13430, 0x13438}, {0x1BCA0, 0x1BCA3}, 283 {0x1D173, 0x1D17A}, {0xE0001, 0xE0001}, {0xE0020, 0xE007F}}; 284 285 static const UnicodeCharSet Format(Cf); 286 return Format.contains(UCS); 287 } 288 289 /// Gets the number of positions a character is likely to occupy when output 290 /// on a terminal ("character width"). This depends on the implementation of the 291 /// terminal, and there's no standard definition of character width. 292 /// The implementation defines it in a way that is expected to be compatible 293 /// with a generic Unicode-capable terminal. 294 /// \return Character width: 295 /// * ErrorNonPrintableCharacter (-1) for non-printable characters (as 296 /// identified by isPrintable); 297 /// * 0 for non-spacing and enclosing combining marks; 298 /// * 2 for CJK characters excluding halfwidth forms; 299 /// * 1 for all remaining characters. 
300 static inline int charWidth(int UCS) 301 { 302 if (!isPrintable(UCS)) 303 return ErrorNonPrintableCharacter; 304 305 // Sorted list of non-spacing and enclosing combining mark intervals as 306 // defined in "3.6 Combination" of 307 // http://www.unicode.org/versions/Unicode6.2.0/UnicodeStandard-6.2.pdf 308 static const UnicodeCharRange CombiningCharacterRanges[] = { 309 { 0x0300, 0x036F }, { 0x0483, 0x0489 }, { 0x0591, 0x05BD }, 310 { 0x05BF, 0x05BF }, { 0x05C1, 0x05C2 }, { 0x05C4, 0x05C5 }, 311 { 0x05C7, 0x05C7 }, { 0x0610, 0x061A }, { 0x064B, 0x065F }, 312 { 0x0670, 0x0670 }, { 0x06D6, 0x06DC }, { 0x06DF, 0x06E4 }, 313 { 0x06E7, 0x06E8 }, { 0x06EA, 0x06ED }, { 0x0711, 0x0711 }, 314 { 0x0730, 0x074A }, { 0x07A6, 0x07B0 }, { 0x07EB, 0x07F3 }, 315 { 0x0816, 0x0819 }, { 0x081B, 0x0823 }, { 0x0825, 0x0827 }, 316 { 0x0829, 0x082D }, { 0x0859, 0x085B }, { 0x08E4, 0x08FE }, 317 { 0x0900, 0x0902 }, { 0x093A, 0x093A }, { 0x093C, 0x093C }, 318 { 0x0941, 0x0948 }, { 0x094D, 0x094D }, { 0x0951, 0x0957 }, 319 { 0x0962, 0x0963 }, { 0x0981, 0x0981 }, { 0x09BC, 0x09BC }, 320 { 0x09C1, 0x09C4 }, { 0x09CD, 0x09CD }, { 0x09E2, 0x09E3 }, 321 { 0x0A01, 0x0A02 }, { 0x0A3C, 0x0A3C }, { 0x0A41, 0x0A42 }, 322 { 0x0A47, 0x0A48 }, { 0x0A4B, 0x0A4D }, { 0x0A51, 0x0A51 }, 323 { 0x0A70, 0x0A71 }, { 0x0A75, 0x0A75 }, { 0x0A81, 0x0A82 }, 324 { 0x0ABC, 0x0ABC }, { 0x0AC1, 0x0AC5 }, { 0x0AC7, 0x0AC8 }, 325 { 0x0ACD, 0x0ACD }, { 0x0AE2, 0x0AE3 }, { 0x0B01, 0x0B01 }, 326 { 0x0B3C, 0x0B3C }, { 0x0B3F, 0x0B3F }, { 0x0B41, 0x0B44 }, 327 { 0x0B4D, 0x0B4D }, { 0x0B56, 0x0B56 }, { 0x0B62, 0x0B63 }, 328 { 0x0B82, 0x0B82 }, { 0x0BC0, 0x0BC0 }, { 0x0BCD, 0x0BCD }, 329 { 0x0C3E, 0x0C40 }, { 0x0C46, 0x0C48 }, { 0x0C4A, 0x0C4D }, 330 { 0x0C55, 0x0C56 }, { 0x0C62, 0x0C63 }, { 0x0CBC, 0x0CBC }, 331 { 0x0CBF, 0x0CBF }, { 0x0CC6, 0x0CC6 }, { 0x0CCC, 0x0CCD }, 332 { 0x0CE2, 0x0CE3 }, { 0x0D41, 0x0D44 }, { 0x0D4D, 0x0D4D }, 333 { 0x0D62, 0x0D63 }, { 0x0DCA, 0x0DCA }, { 0x0DD2, 0x0DD4 }, 334 { 0x0DD6, 0x0DD6 }, { 
0x0E31, 0x0E31 }, { 0x0E34, 0x0E3A }, 335 { 0x0E47, 0x0E4E }, { 0x0EB1, 0x0EB1 }, { 0x0EB4, 0x0EB9 }, 336 { 0x0EBB, 0x0EBC }, { 0x0EC8, 0x0ECD }, { 0x0F18, 0x0F19 }, 337 { 0x0F35, 0x0F35 }, { 0x0F37, 0x0F37 }, { 0x0F39, 0x0F39 }, 338 { 0x0F71, 0x0F7E }, { 0x0F80, 0x0F84 }, { 0x0F86, 0x0F87 }, 339 { 0x0F8D, 0x0F97 }, { 0x0F99, 0x0FBC }, { 0x0FC6, 0x0FC6 }, 340 { 0x102D, 0x1030 }, { 0x1032, 0x1037 }, { 0x1039, 0x103A }, 341 { 0x103D, 0x103E }, { 0x1058, 0x1059 }, { 0x105E, 0x1060 }, 342 { 0x1071, 0x1074 }, { 0x1082, 0x1082 }, { 0x1085, 0x1086 }, 343 { 0x108D, 0x108D }, { 0x109D, 0x109D }, { 0x135D, 0x135F }, 344 { 0x1712, 0x1714 }, { 0x1732, 0x1734 }, { 0x1752, 0x1753 }, 345 { 0x1772, 0x1773 }, { 0x17B4, 0x17B5 }, { 0x17B7, 0x17BD }, 346 { 0x17C6, 0x17C6 }, { 0x17C9, 0x17D3 }, { 0x17DD, 0x17DD }, 347 { 0x180B, 0x180D }, { 0x18A9, 0x18A9 }, { 0x1920, 0x1922 }, 348 { 0x1927, 0x1928 }, { 0x1932, 0x1932 }, { 0x1939, 0x193B }, 349 { 0x1A17, 0x1A18 }, { 0x1A56, 0x1A56 }, { 0x1A58, 0x1A5E }, 350 { 0x1A60, 0x1A60 }, { 0x1A62, 0x1A62 }, { 0x1A65, 0x1A6C }, 351 { 0x1A73, 0x1A7C }, { 0x1A7F, 0x1A7F }, { 0x1B00, 0x1B03 }, 352 { 0x1B34, 0x1B34 }, { 0x1B36, 0x1B3A }, { 0x1B3C, 0x1B3C }, 353 { 0x1B42, 0x1B42 }, { 0x1B6B, 0x1B73 }, { 0x1B80, 0x1B81 }, 354 { 0x1BA2, 0x1BA5 }, { 0x1BA8, 0x1BA9 }, { 0x1BAB, 0x1BAB }, 355 { 0x1BE6, 0x1BE6 }, { 0x1BE8, 0x1BE9 }, { 0x1BED, 0x1BED }, 356 { 0x1BEF, 0x1BF1 }, { 0x1C2C, 0x1C33 }, { 0x1C36, 0x1C37 }, 357 { 0x1CD0, 0x1CD2 }, { 0x1CD4, 0x1CE0 }, { 0x1CE2, 0x1CE8 }, 358 { 0x1CED, 0x1CED }, { 0x1CF4, 0x1CF4 }, { 0x1DC0, 0x1DE6 }, 359 { 0x1DFC, 0x1DFF }, { 0x20D0, 0x20F0 }, { 0x2CEF, 0x2CF1 }, 360 { 0x2D7F, 0x2D7F }, { 0x2DE0, 0x2DFF }, { 0x302A, 0x302D }, 361 { 0x3099, 0x309A }, { 0xA66F, 0xA672 }, { 0xA674, 0xA67D }, 362 { 0xA69F, 0xA69F }, { 0xA6F0, 0xA6F1 }, { 0xA802, 0xA802 }, 363 { 0xA806, 0xA806 }, { 0xA80B, 0xA80B }, { 0xA825, 0xA826 }, 364 { 0xA8C4, 0xA8C4 }, { 0xA8E0, 0xA8F1 }, { 0xA926, 0xA92D }, 365 { 0xA947, 0xA951 }, { 0xA980, 0xA982 
}, { 0xA9B3, 0xA9B3 }, 366 { 0xA9B6, 0xA9B9 }, { 0xA9BC, 0xA9BC }, { 0xAA29, 0xAA2E }, 367 { 0xAA31, 0xAA32 }, { 0xAA35, 0xAA36 }, { 0xAA43, 0xAA43 }, 368 { 0xAA4C, 0xAA4C }, { 0xAAB0, 0xAAB0 }, { 0xAAB2, 0xAAB4 }, 369 { 0xAAB7, 0xAAB8 }, { 0xAABE, 0xAABF }, { 0xAAC1, 0xAAC1 }, 370 { 0xAAEC, 0xAAED }, { 0xAAF6, 0xAAF6 }, { 0xABE5, 0xABE5 }, 371 { 0xABE8, 0xABE8 }, { 0xABED, 0xABED }, { 0xFB1E, 0xFB1E }, 372 { 0xFE00, 0xFE0F }, { 0xFE20, 0xFE26 }, { 0x101FD, 0x101FD }, 373 { 0x10A01, 0x10A03 }, { 0x10A05, 0x10A06 }, { 0x10A0C, 0x10A0F }, 374 { 0x10A38, 0x10A3A }, { 0x10A3F, 0x10A3F }, { 0x11001, 0x11001 }, 375 { 0x11038, 0x11046 }, { 0x11080, 0x11081 }, { 0x110B3, 0x110B6 }, 376 { 0x110B9, 0x110BA }, { 0x11100, 0x11102 }, { 0x11127, 0x1112B }, 377 { 0x1112D, 0x11134 }, { 0x11180, 0x11181 }, { 0x111B6, 0x111BE }, 378 { 0x116AB, 0x116AB }, { 0x116AD, 0x116AD }, { 0x116B0, 0x116B5 }, 379 { 0x116B7, 0x116B7 }, { 0x16F8F, 0x16F92 }, { 0x1D167, 0x1D169 }, 380 { 0x1D17B, 0x1D182 }, { 0x1D185, 0x1D18B }, { 0x1D1AA, 0x1D1AD }, 381 { 0x1D242, 0x1D244 }, { 0xE0100, 0xE01EF }, 382 }; 383 static const UnicodeCharSet CombiningCharacters(CombiningCharacterRanges); 384 385 if (CombiningCharacters.contains(UCS)) 386 return 0; 387 388 static const UnicodeCharRange DoubleWidthCharacterRanges[] = { 389 // Hangul Jamo 390 { 0x1100, 0x11FF }, 391 // Deprecated fullwidth angle brackets 392 { 0x2329, 0x232A }, 393 // CJK Misc, CJK Unified Ideographs, Yijing Hexagrams, Yi 394 // excluding U+303F (IDEOGRAPHIC HALF FILL SPACE) 395 { 0x2E80, 0x303E }, { 0x3040, 0xA4CF }, 396 // Hangul 397 { 0xAC00, 0xD7A3 }, { 0xD7B0, 0xD7C6 }, { 0xD7CB, 0xD7FB }, 398 // CJK Unified Ideographs 399 { 0xF900, 0xFAFF }, 400 // Vertical forms 401 { 0xFE10, 0xFE19 }, 402 // CJK Compatibility Forms + Small Form Variants 403 { 0xFE30, 0xFE6F }, 404 // Fullwidth forms 405 { 0xFF01, 0xFF60 }, { 0xFFE0, 0xFFE6 }, 406 // CJK Unified Ideographs 407 { 0x20000, 0x2A6DF }, { 0x2A700, 0x2B81F }, { 0x2F800, 0x2FA1F } 408 }; 
409 static const UnicodeCharSet DoubleWidthCharacters(DoubleWidthCharacterRanges); 410 411 if (DoubleWidthCharacters.contains(UCS)) 412 return 2; 413 return 1; 414 } 415 416 static bool isprintableascii(char c) { return c > 31 && c < 127; } 417 418 int columnWidthUTF8(StringRef Text) { 419 unsigned ColumnWidth = 0; 420 unsigned Length; 421 for (size_t i = 0, e = Text.size(); i < e; i += Length) { 422 Length = getNumBytesForUTF8(Text[i]); 423 424 // fast path for ASCII characters 425 if (Length == 1) { 426 if (!isprintableascii(Text[i])) 427 return ErrorNonPrintableCharacter; 428 ColumnWidth += 1; 429 continue; 430 } 431 432 if (Length <= 0 || i + Length > Text.size()) 433 return ErrorInvalidUTF8; 434 UTF32 buf[1]; 435 const UTF8 *Start = reinterpret_cast<const UTF8 *>(Text.data() + i); 436 UTF32 *Target = &buf[0]; 437 if (conversionOK != ConvertUTF8toUTF32(&Start, Start + Length, &Target, 438 Target + 1, strictConversion)) 439 return ErrorInvalidUTF8; 440 int Width = charWidth(buf[0]); 441 if (Width < 0) 442 return ErrorNonPrintableCharacter; 443 ColumnWidth += Width; 444 } 445 return ColumnWidth; 446 } 447 448 } // namespace unicode 449 } // namespace sys 450 } // namespace llvm 451 452