1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * ucs.c - Universal Character Set processing 4 */ 5 6 #include <linux/array_size.h> 7 #include <linux/build_bug.h> 8 #include <linux/bsearch.h> 9 #include <linux/consolemap.h> 10 #include <linux/math.h> 11 12 struct ucs_width16 { 13 u16 first; 14 u16 last; 15 }; 16 17 struct ucs_width32 { 18 u32 first; 19 u32 last; 20 }; 21 22 /* 23 * Width table encoding (consumed by ucs_width_table.h): 24 * 25 * Zero- and double-width ranges are merged into one sorted-by-`first` table 26 * per region (BMP / non-BMP). The BMP table stores plain (first, last) 27 * pairs; per-entry width lives in a packed bitmap *hosted by the non-BMP 28 * table*. 29 * 30 * That hosting is the whole point of the encoding. Non-BMP code points use 31 * only 20 bits, so each u32 has 12 spare high bits sitting around doing 32 * nothing — we'd rather use them than spend a separate parallel array for 33 * width and BMP-bitmap bits. So we move the cp value up by UCS_CP_SHIFT 34 * and stash metadata in the now-free low bits of `last`: 35 * - bit UCS_NONBMP_W2_FLAG_BIT: this entry's own width (0=zero, 1=double), 36 * - bits 0..UCS_NONBMP_BMP_BITS-1: a chunk of the BMP double-width 37 * bitmap. Bit `j` of the chunk in non-BMP entry `c` is set iff BMP 38 * entry (c * UCS_NONBMP_BMP_BITS + j) is double-width. The first 39 * ceil(N_BMP / UCS_NONBMP_BMP_BITS) non-BMP entries carry the bitmap; 40 * the rest leave these bits zero. 41 * 42 * Because the metadata bits sit strictly below the lowest cp-scale bit, 43 * the bsearch comparator does plain u32 comparison on the shifted key and 44 * stored values without masking — ordering between distinct code points is 45 * undisturbed. 46 */ 47 #define UCS_CP_SHIFT 12 48 #define UCS_NONBMP_W2_FLAG_BIT 11 49 #define UCS_NONBMP_W2_FLAG (1u << UCS_NONBMP_W2_FLAG_BIT) 50 51 #define BMP_0WIDTH(first, last) first, last 52 #define BMP_2WIDTH(first, last) first, last 53 #define RANGE_0WIDTH(first, last) \ 54 (u32)(first) << UCS_CP_SHIFT, (u32)(last) << UCS_CP_SHIFT 55 #define RANGE_2WIDTH(first, last) \ 56 (u32)(first) << UCS_CP_SHIFT, ((u32)(last) << UCS_CP_SHIFT) | UCS_NONBMP_W2_FLAG 57 #define BMP_2W_BITS(b) (b) 58 59 #include "ucs_width_table.h" 60 61 static_assert(UCS_NONBMP_BMP_BITS <= UCS_NONBMP_W2_FLAG_BIT, 62 "BMP bitmap chunk would overlap the per-entry width flag"); 63 static_assert(UCS_NONBMP_W2_FLAG_BIT < UCS_CP_SHIFT, 64 "Metadata bits collide with the shifted cp value"); 65 static_assert(DIV_ROUND_UP(ARRAY_SIZE(ucs_bmp_ranges), UCS_NONBMP_BMP_BITS) 66 <= ARRAY_SIZE(ucs_nonbmp_ranges), 67 "Not enough non-BMP entries to host the BMP width bitmap"); 68 69 #define UCS_IS_BMP(cp) ((cp) <= 0xffff) 70 71 static int width16_cmp(const void *key, const void *element) 72 { 73 u16 cp = *(u16 *)key; 74 const struct ucs_width16 *entry = element; 75 76 if (cp < entry->first) 77 return -1; 78 if (cp > entry->last) 79 return 1; 80 return 0; 81 } 82 83 static int width32_cmp(const void *key, const void *element) 84 { 85 u32 k = *(u32 *)key; 86 const struct ucs_width32 *entry = element; 87 88 if (k < entry->first) 89 return -1; 90 if (k > entry->last) 91 return 1; 92 return 0; 93 } 94 95 /** 96 * ucs_get_width() - Get the display width of a Unicode code point. 97 * @cp: Unicode code point (UCS-4) 98 * 99 * Return: 2 for double-width (East Asian Wide/Fullwidth, emoji, ...), 100 * 0 for zero-width (combining marks, format characters, ...), 101 * 1 for everything else (the common case). 102 */ 103 unsigned int ucs_get_width(u32 cp) 104 { 105 const struct ucs_width16 *e16; 106 const struct ucs_width32 *e32; 107 unsigned int idx; 108 u32 k; 109 110 if (UCS_IS_BMP(cp)) { 111 u16 bmp = cp; 112 113 if (bmp < ucs_bmp_ranges[0].first || 114 bmp > ucs_bmp_ranges[ARRAY_SIZE(ucs_bmp_ranges) - 1].last) 115 return 1; 116 117 e16 = __inline_bsearch(&bmp, ucs_bmp_ranges, 118 ARRAY_SIZE(ucs_bmp_ranges), 119 sizeof(*ucs_bmp_ranges), width16_cmp); 120 if (!e16) 121 return 1; 122 123 idx = e16 - ucs_bmp_ranges; 124 return (ucs_nonbmp_ranges[idx / UCS_NONBMP_BMP_BITS].last 125 >> (idx % UCS_NONBMP_BMP_BITS)) & 1 ? 2 : 0; 126 } 127 128 k = cp << UCS_CP_SHIFT; 129 if (k < ucs_nonbmp_ranges[0].first || 130 k > ucs_nonbmp_ranges[ARRAY_SIZE(ucs_nonbmp_ranges) - 1].last) 131 return 1; 132 133 e32 = __inline_bsearch(&k, ucs_nonbmp_ranges, 134 ARRAY_SIZE(ucs_nonbmp_ranges), 135 sizeof(*ucs_nonbmp_ranges), width32_cmp); 136 if (!e32) 137 return 1; 138 return (e32->last & UCS_NONBMP_W2_FLAG) ? 2 : 0; 139 } 140 141 /* 142 * Structure for base with combining mark pairs and resulting recompositions. 143 * Using u16 to save space since all values are within BMP range. 144 */ 145 struct ucs_recomposition { 146 u16 base; /* base character */ 147 u16 mark; /* combining mark */ 148 u16 recomposed; /* corresponding recomposed character */ 149 }; 150 151 #include "ucs_recompose_table.h" 152 153 struct compare_key { 154 u16 base; 155 u16 mark; 156 }; 157 158 static int recomposition_cmp(const void *key, const void *element) 159 { 160 const struct compare_key *search_key = key; 161 const struct ucs_recomposition *entry = element; 162 163 /* Compare base character first */ 164 if (search_key->base < entry->base) 165 return -1; 166 if (search_key->base > entry->base) 167 return 1; 168 169 /* Base characters match, now compare combining character */ 170 if (search_key->mark < entry->mark) 171 return -1; 172 if (search_key->mark > entry->mark) 173 return 1; 174 175 /* Both match */ 176 return 0; 177 } 178 179 /** 180 * ucs_recompose() - Attempt to recompose two Unicode characters into a single character. 181 * @base: Base Unicode code point (UCS-4) 182 * @mark: Combining mark Unicode code point (UCS-4) 183 * 184 * Return: Recomposed Unicode code point, or 0 if no recomposition is possible 185 */ 186 u32 ucs_recompose(u32 base, u32 mark) 187 { 188 /* Check if characters are within the range of our table */ 189 if (base < UCS_RECOMPOSE_MIN_BASE || base > UCS_RECOMPOSE_MAX_BASE || 190 mark < UCS_RECOMPOSE_MIN_MARK || mark > UCS_RECOMPOSE_MAX_MARK) 191 return 0; 192 193 struct compare_key key = { base, mark }; 194 struct ucs_recomposition *result = 195 __inline_bsearch(&key, ucs_recomposition_table, 196 ARRAY_SIZE(ucs_recomposition_table), 197 sizeof(*ucs_recomposition_table), 198 recomposition_cmp); 199 200 return result ? result->recomposed : 0; 201 } 202 203 /* 204 * The fallback table structures implement a 2-level lookup. 205 */ 206 207 struct ucs_page_desc { 208 u8 page; /* Page index (high byte of code points) */ 209 u8 count; /* Number of entries in this page */ 210 u16 start; /* Start index in entries array */ 211 }; 212 213 struct ucs_page_entry { 214 u8 offset; /* Offset within page (0-255) */ 215 u8 fallback; /* Fallback character or range start marker */ 216 }; 217 218 #include "ucs_fallback_table.h" 219 220 static int ucs_page_desc_cmp(const void *key, const void *element) 221 { 222 u8 page = *(u8 *)key; 223 const struct ucs_page_desc *entry = element; 224 225 if (page < entry->page) 226 return -1; 227 if (page > entry->page) 228 return 1; 229 return 0; 230 } 231 232 static int ucs_page_entry_cmp(const void *key, const void *element) 233 { 234 u8 offset = *(u8 *)key; 235 const struct ucs_page_entry *entry = element; 236 237 if (offset < entry->offset) 238 return -1; 239 if (entry->fallback == UCS_PAGE_ENTRY_RANGE_MARKER) { 240 if (offset > entry[1].offset) 241 return 1; 242 } else { 243 if (offset > entry->offset) 244 return 1; 245 } 246 return 0; 247 } 248 249 /** 250 * ucs_get_fallback() - Get a substitution for the provided Unicode character 251 * @cp: Unicode code point (UCS-4) 252 * 253 * Get a simpler fallback character for the provided Unicode character. 254 * This is used for terminal display when corresponding glyph is unavailable. 255 * The substitution may not be as good as the actual glyph for the original 256 * character but still way more helpful than a squared question mark. 257 * 258 * Return: Fallback Unicode code point, or 0 if none is available 259 */ 260 u32 ucs_get_fallback(u32 cp) 261 { 262 const struct ucs_page_desc *page; 263 const struct ucs_page_entry *entry; 264 u8 page_idx = cp >> 8, offset = cp; 265 266 if (!UCS_IS_BMP(cp)) 267 return 0; 268 269 /* 270 * Full-width to ASCII mapping (covering all printable ASCII 33-126) 271 * 0xFF01 (!) to 0xFF5E (~) -> ASCII 33 (!) to 126 (~) 272 * We process them programmatically to reduce the table size. 273 */ 274 if (cp >= 0xFF01 && cp <= 0xFF5E) 275 return cp - 0xFF01 + 33; 276 277 page = __inline_bsearch(&page_idx, ucs_fallback_pages, 278 ARRAY_SIZE(ucs_fallback_pages), 279 sizeof(*ucs_fallback_pages), 280 ucs_page_desc_cmp); 281 if (!page) 282 return 0; 283 284 entry = __inline_bsearch(&offset, ucs_fallback_entries + page->start, 285 page->count, sizeof(*ucs_fallback_entries), 286 ucs_page_entry_cmp); 287 if (!entry) 288 return 0; 289 290 if (entry->fallback == UCS_PAGE_ENTRY_RANGE_MARKER) 291 entry++; 292 return entry->fallback; 293 } 294