xref: /linux/drivers/tty/vt/ucs.c (revision 255dc0ec0b79c354bff017f6d6202adaa092a1c9)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * ucs.c - Universal Character Set processing
4  */
5 
6 #include <linux/array_size.h>
7 #include <linux/build_bug.h>
8 #include <linux/bsearch.h>
9 #include <linux/consolemap.h>
10 #include <linux/math.h>
11 
12 struct ucs_width16 {
13 	u16 first;
14 	u16 last;
15 };
16 
17 struct ucs_width32 {
18 	u32 first;
19 	u32 last;
20 };
21 
22 /*
23  * Width table encoding (consumed by ucs_width_table.h):
24  *
25  * Zero- and double-width ranges are merged into one sorted-by-`first` table
26  * per region (BMP / non-BMP). The BMP table stores plain (first, last)
27  * pairs; per-entry width lives in a packed bitmap *hosted by the non-BMP
28  * table*.
29  *
30  * That hosting is the whole point of the encoding. Non-BMP code points use
31  * only 20 bits, so each u32 has 12 spare high bits sitting around doing
32  * nothing — we'd rather use them than spend a separate parallel array for
33  * width and BMP-bitmap bits. So we move the cp value up by UCS_CP_SHIFT
34  * and stash metadata in the now-free low bits of `last`:
35  *   - bit UCS_NONBMP_W2_FLAG_BIT: this entry's own width (0=zero, 1=double),
36  *   - bits 0..UCS_NONBMP_BMP_BITS-1: a chunk of the BMP double-width
37  *     bitmap. Bit `j` of the chunk in non-BMP entry `c` is set iff BMP
38  *     entry (c * UCS_NONBMP_BMP_BITS + j) is double-width. The first
39  *     ceil(N_BMP / UCS_NONBMP_BMP_BITS) non-BMP entries carry the bitmap;
40  *     the rest leave these bits zero.
41  *
42  * Because the metadata bits sit strictly below the lowest cp-scale bit,
43  * the bsearch comparator does plain u32 comparison on the shifted key and
44  * stored values without masking — ordering between distinct code points is
45  * undisturbed.
46  */
47 #define UCS_CP_SHIFT           12
48 #define UCS_NONBMP_W2_FLAG_BIT 11
49 #define UCS_NONBMP_W2_FLAG     (1u << UCS_NONBMP_W2_FLAG_BIT)
50 
51 #define BMP_0WIDTH(first, last)   first, last
52 #define BMP_2WIDTH(first, last)   first, last
53 #define RANGE_0WIDTH(first, last) \
54 	(u32)(first) << UCS_CP_SHIFT,  (u32)(last) << UCS_CP_SHIFT
55 #define RANGE_2WIDTH(first, last) \
56 	(u32)(first) << UCS_CP_SHIFT, ((u32)(last) << UCS_CP_SHIFT) | UCS_NONBMP_W2_FLAG
57 #define BMP_2W_BITS(b)            (b)
58 
59 #include "ucs_width_table.h"
60 
61 static_assert(UCS_NONBMP_BMP_BITS <= UCS_NONBMP_W2_FLAG_BIT,
62 	      "BMP bitmap chunk would overlap the per-entry width flag");
63 static_assert(UCS_NONBMP_W2_FLAG_BIT < UCS_CP_SHIFT,
64 	      "Metadata bits collide with the shifted cp value");
65 static_assert(DIV_ROUND_UP(ARRAY_SIZE(ucs_bmp_ranges), UCS_NONBMP_BMP_BITS)
66 	      <= ARRAY_SIZE(ucs_nonbmp_ranges),
67 	      "Not enough non-BMP entries to host the BMP width bitmap");
68 
69 #define UCS_IS_BMP(cp)	((cp) <= 0xffff)
70 
71 static int width16_cmp(const void *key, const void *element)
72 {
73 	u16 cp = *(u16 *)key;
74 	const struct ucs_width16 *entry = element;
75 
76 	if (cp < entry->first)
77 		return -1;
78 	if (cp > entry->last)
79 		return 1;
80 	return 0;
81 }
82 
83 static int width32_cmp(const void *key, const void *element)
84 {
85 	u32 k = *(u32 *)key;
86 	const struct ucs_width32 *entry = element;
87 
88 	if (k < entry->first)
89 		return -1;
90 	if (k > entry->last)
91 		return 1;
92 	return 0;
93 }
94 
95 /**
96  * ucs_get_width() - Get the display width of a Unicode code point.
97  * @cp: Unicode code point (UCS-4)
98  *
99  * Return: 2 for double-width (East Asian Wide/Fullwidth, emoji, ...),
100  *         0 for zero-width (combining marks, format characters, ...),
101  *         1 for everything else (the common case).
102  */
103 unsigned int ucs_get_width(u32 cp)
104 {
105 	const struct ucs_width16 *e16;
106 	const struct ucs_width32 *e32;
107 	unsigned int idx;
108 	u32 k;
109 
110 	if (UCS_IS_BMP(cp)) {
111 		u16 bmp = cp;
112 
113 		if (bmp < ucs_bmp_ranges[0].first ||
114 		    bmp > ucs_bmp_ranges[ARRAY_SIZE(ucs_bmp_ranges) - 1].last)
115 			return 1;
116 
117 		e16 = __inline_bsearch(&bmp, ucs_bmp_ranges,
118 				       ARRAY_SIZE(ucs_bmp_ranges),
119 				       sizeof(*ucs_bmp_ranges), width16_cmp);
120 		if (!e16)
121 			return 1;
122 
123 		idx = e16 - ucs_bmp_ranges;
124 		return (ucs_nonbmp_ranges[idx / UCS_NONBMP_BMP_BITS].last
125 			>> (idx % UCS_NONBMP_BMP_BITS)) & 1 ? 2 : 0;
126 	}
127 
128 	k = cp << UCS_CP_SHIFT;
129 	if (k < ucs_nonbmp_ranges[0].first ||
130 	    k > ucs_nonbmp_ranges[ARRAY_SIZE(ucs_nonbmp_ranges) - 1].last)
131 		return 1;
132 
133 	e32 = __inline_bsearch(&k, ucs_nonbmp_ranges,
134 			       ARRAY_SIZE(ucs_nonbmp_ranges),
135 			       sizeof(*ucs_nonbmp_ranges), width32_cmp);
136 	if (!e32)
137 		return 1;
138 	return (e32->last & UCS_NONBMP_W2_FLAG) ? 2 : 0;
139 }
140 
141 /*
142  * Structure for base with combining mark pairs and resulting recompositions.
143  * Using u16 to save space since all values are within BMP range.
144  */
145 struct ucs_recomposition {
146 	u16 base;	/* base character */
147 	u16 mark;	/* combining mark */
148 	u16 recomposed;	/* corresponding recomposed character */
149 };
150 
151 #include "ucs_recompose_table.h"
152 
153 struct compare_key {
154 	u16 base;
155 	u16 mark;
156 };
157 
158 static int recomposition_cmp(const void *key, const void *element)
159 {
160 	const struct compare_key *search_key = key;
161 	const struct ucs_recomposition *entry = element;
162 
163 	/* Compare base character first */
164 	if (search_key->base < entry->base)
165 		return -1;
166 	if (search_key->base > entry->base)
167 		return 1;
168 
169 	/* Base characters match, now compare combining character */
170 	if (search_key->mark < entry->mark)
171 		return -1;
172 	if (search_key->mark > entry->mark)
173 		return 1;
174 
175 	/* Both match */
176 	return 0;
177 }
178 
179 /**
180  * ucs_recompose() - Attempt to recompose two Unicode characters into a single character.
181  * @base: Base Unicode code point (UCS-4)
182  * @mark: Combining mark Unicode code point (UCS-4)
183  *
184  * Return: Recomposed Unicode code point, or 0 if no recomposition is possible
185  */
186 u32 ucs_recompose(u32 base, u32 mark)
187 {
188 	/* Check if characters are within the range of our table */
189 	if (base < UCS_RECOMPOSE_MIN_BASE || base > UCS_RECOMPOSE_MAX_BASE ||
190 	    mark < UCS_RECOMPOSE_MIN_MARK || mark > UCS_RECOMPOSE_MAX_MARK)
191 		return 0;
192 
193 	struct compare_key key = { base, mark };
194 	struct ucs_recomposition *result =
195 		__inline_bsearch(&key, ucs_recomposition_table,
196 				 ARRAY_SIZE(ucs_recomposition_table),
197 				 sizeof(*ucs_recomposition_table),
198 				 recomposition_cmp);
199 
200 	return result ? result->recomposed : 0;
201 }
202 
203 /*
204  * The fallback table structures implement a 2-level lookup.
205  */
206 
207 struct ucs_page_desc {
208 	u8 page;	/* Page index (high byte of code points) */
209 	u8 count;	/* Number of entries in this page */
210 	u16 start;	/* Start index in entries array */
211 };
212 
213 struct ucs_page_entry {
214 	u8 offset;	/* Offset within page (0-255) */
215 	u8 fallback;	/* Fallback character or range start marker */
216 };
217 
218 #include "ucs_fallback_table.h"
219 
220 static int ucs_page_desc_cmp(const void *key, const void *element)
221 {
222 	u8 page = *(u8 *)key;
223 	const struct ucs_page_desc *entry = element;
224 
225 	if (page < entry->page)
226 		return -1;
227 	if (page > entry->page)
228 		return 1;
229 	return 0;
230 }
231 
232 static int ucs_page_entry_cmp(const void *key, const void *element)
233 {
234 	u8 offset = *(u8 *)key;
235 	const struct ucs_page_entry *entry = element;
236 
237 	if (offset < entry->offset)
238 		return -1;
239 	if (entry->fallback == UCS_PAGE_ENTRY_RANGE_MARKER) {
240 		if (offset > entry[1].offset)
241 			return 1;
242 	} else {
243 		if (offset > entry->offset)
244 			return 1;
245 	}
246 	return 0;
247 }
248 
249 /**
250  * ucs_get_fallback() - Get a substitution for the provided Unicode character
251  * @cp: Unicode code point (UCS-4)
252  *
253  * Get a simpler fallback character for the provided Unicode character.
254  * This is used for terminal display when corresponding glyph is unavailable.
255  * The substitution may not be as good as the actual glyph for the original
256  * character but still way more helpful than a squared question mark.
257  *
258  * Return: Fallback Unicode code point, or 0 if none is available
259  */
260 u32 ucs_get_fallback(u32 cp)
261 {
262 	const struct ucs_page_desc *page;
263 	const struct ucs_page_entry *entry;
264 	u8 page_idx = cp >> 8, offset = cp;
265 
266 	if (!UCS_IS_BMP(cp))
267 		return 0;
268 
269 	/*
270 	 * Full-width to ASCII mapping (covering all printable ASCII 33-126)
271 	 * 0xFF01 (!) to 0xFF5E (~) -> ASCII 33 (!) to 126 (~)
272 	 * We process them programmatically to reduce the table size.
273 	 */
274 	if (cp >= 0xFF01 && cp <= 0xFF5E)
275 		return cp - 0xFF01 + 33;
276 
277 	page = __inline_bsearch(&page_idx, ucs_fallback_pages,
278 				ARRAY_SIZE(ucs_fallback_pages),
279 				sizeof(*ucs_fallback_pages),
280 				ucs_page_desc_cmp);
281 	if (!page)
282 		return 0;
283 
284 	entry = __inline_bsearch(&offset, ucs_fallback_entries + page->start,
285 				 page->count, sizeof(*ucs_fallback_entries),
286 				 ucs_page_entry_cmp);
287 	if (!entry)
288 		return 0;
289 
290 	if (entry->fallback == UCS_PAGE_ENTRY_RANGE_MARKER)
291 		entry++;
292 	return entry->fallback;
293 }
294