xref: /linux/drivers/tty/vt/ucs.c (revision e49a3eac9207e9575337f70feeb29430f6f16bb7)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * ucs.c - Universal Character Set processing
4  */
5 
6 #include <linux/array_size.h>
7 #include <linux/bsearch.h>
8 #include <linux/consolemap.h>
9 #include <linux/minmax.h>
10 
11 struct ucs_interval16 {
12 	u16 first;
13 	u16 last;
14 };
15 
16 struct ucs_interval32 {
17 	u32 first;
18 	u32 last;
19 };
20 
21 #include "ucs_width_table.h"
22 
23 static int interval16_cmp(const void *key, const void *element)
24 {
25 	u16 cp = *(u16 *)key;
26 	const struct ucs_interval16 *entry = element;
27 
28 	if (cp < entry->first)
29 		return -1;
30 	if (cp > entry->last)
31 		return 1;
32 	return 0;
33 }
34 
35 static int interval32_cmp(const void *key, const void *element)
36 {
37 	u32 cp = *(u32 *)key;
38 	const struct ucs_interval32 *entry = element;
39 
40 	if (cp < entry->first)
41 		return -1;
42 	if (cp > entry->last)
43 		return 1;
44 	return 0;
45 }
46 
47 static bool cp_in_range16(u16 cp, const struct ucs_interval16 *ranges, size_t size)
48 {
49 	if (cp < ranges[0].first || cp > ranges[size - 1].last)
50 		return false;
51 
52 	return __inline_bsearch(&cp, ranges, size, sizeof(*ranges),
53 				interval16_cmp) != NULL;
54 }
55 
56 static bool cp_in_range32(u32 cp, const struct ucs_interval32 *ranges, size_t size)
57 {
58 	if (cp < ranges[0].first || cp > ranges[size - 1].last)
59 		return false;
60 
61 	return __inline_bsearch(&cp, ranges, size, sizeof(*ranges),
62 				interval32_cmp) != NULL;
63 }
64 
/* True when @cp is at most 0xffff, i.e. within the Basic Multilingual Plane. */
#define UCS_IS_BMP(cp)	((cp) < 0x10000)
66 
67 /**
68  * ucs_is_zero_width() - Determine if a Unicode code point is zero-width.
69  * @cp: Unicode code point (UCS-4)
70  *
71  * Return: true if the character is zero-width, false otherwise
72  */
73 bool ucs_is_zero_width(u32 cp)
74 {
75 	if (UCS_IS_BMP(cp))
76 		return cp_in_range16(cp, ucs_zero_width_bmp_ranges,
77 				     ARRAY_SIZE(ucs_zero_width_bmp_ranges));
78 	else
79 		return cp_in_range32(cp, ucs_zero_width_non_bmp_ranges,
80 				     ARRAY_SIZE(ucs_zero_width_non_bmp_ranges));
81 }
82 
83 /**
84  * ucs_is_double_width() - Determine if a Unicode code point is double-width.
85  * @cp: Unicode code point (UCS-4)
86  *
87  * Return: true if the character is double-width, false otherwise
88  */
89 bool ucs_is_double_width(u32 cp)
90 {
91 	if (UCS_IS_BMP(cp))
92 		return cp_in_range16(cp, ucs_double_width_bmp_ranges,
93 				     ARRAY_SIZE(ucs_double_width_bmp_ranges));
94 	else
95 		return cp_in_range32(cp, ucs_double_width_non_bmp_ranges,
96 				     ARRAY_SIZE(ucs_double_width_non_bmp_ranges));
97 }
98 
99 /*
100  * Structure for base with combining mark pairs and resulting recompositions.
101  * Using u16 to save space since all values are within BMP range.
102  */
103 struct ucs_recomposition {
104 	u16 base;	/* base character */
105 	u16 mark;	/* combining mark */
106 	u16 recomposed;	/* corresponding recomposed character */
107 };
108 
109 #include "ucs_recompose_table.h"
110 
111 struct compare_key {
112 	u16 base;
113 	u16 mark;
114 };
115 
116 static int recomposition_cmp(const void *key, const void *element)
117 {
118 	const struct compare_key *search_key = key;
119 	const struct ucs_recomposition *entry = element;
120 
121 	/* Compare base character first */
122 	if (search_key->base < entry->base)
123 		return -1;
124 	if (search_key->base > entry->base)
125 		return 1;
126 
127 	/* Base characters match, now compare combining character */
128 	if (search_key->mark < entry->mark)
129 		return -1;
130 	if (search_key->mark > entry->mark)
131 		return 1;
132 
133 	/* Both match */
134 	return 0;
135 }
136 
137 /**
138  * ucs_recompose() - Attempt to recompose two Unicode characters into a single character.
139  * @base: Base Unicode code point (UCS-4)
140  * @mark: Combining mark Unicode code point (UCS-4)
141  *
142  * Return: Recomposed Unicode code point, or 0 if no recomposition is possible
143  */
144 u32 ucs_recompose(u32 base, u32 mark)
145 {
146 	/* Check if characters are within the range of our table */
147 	if (base < UCS_RECOMPOSE_MIN_BASE || base > UCS_RECOMPOSE_MAX_BASE ||
148 	    mark < UCS_RECOMPOSE_MIN_MARK || mark > UCS_RECOMPOSE_MAX_MARK)
149 		return 0;
150 
151 	struct compare_key key = { base, mark };
152 	struct ucs_recomposition *result =
153 		__inline_bsearch(&key, ucs_recomposition_table,
154 				 ARRAY_SIZE(ucs_recomposition_table),
155 				 sizeof(*ucs_recomposition_table),
156 				 recomposition_cmp);
157 
158 	return result ? result->recomposed : 0;
159 }
160 
161 /*
162  * The fallback table structures implement a 2-level lookup.
163  */
164 
165 struct ucs_page_desc {
166 	u8 page;	/* Page index (high byte of code points) */
167 	u8 count;	/* Number of entries in this page */
168 	u16 start;	/* Start index in entries array */
169 };
170 
171 struct ucs_page_entry {
172 	u8 offset;	/* Offset within page (0-255) */
173 	u8 fallback;	/* Fallback character or range start marker */
174 };
175 
176 #include "ucs_fallback_table.h"
177 
178 static int ucs_page_desc_cmp(const void *key, const void *element)
179 {
180 	u8 page = *(u8 *)key;
181 	const struct ucs_page_desc *entry = element;
182 
183 	if (page < entry->page)
184 		return -1;
185 	if (page > entry->page)
186 		return 1;
187 	return 0;
188 }
189 
190 static int ucs_page_entry_cmp(const void *key, const void *element)
191 {
192 	u8 offset = *(u8 *)key;
193 	const struct ucs_page_entry *entry = element;
194 
195 	if (offset < entry->offset)
196 		return -1;
197 	if (entry->fallback == UCS_PAGE_ENTRY_RANGE_MARKER) {
198 		if (offset > entry[1].offset)
199 			return 1;
200 	} else {
201 		if (offset > entry->offset)
202 			return 1;
203 	}
204 	return 0;
205 }
206 
207 /**
208  * ucs_get_fallback() - Get a substitution for the provided Unicode character
209  * @cp: Unicode code point (UCS-4)
210  *
211  * Get a simpler fallback character for the provided Unicode character.
212  * This is used for terminal display when corresponding glyph is unavailable.
213  * The substitution may not be as good as the actual glyph for the original
214  * character but still way more helpful than a squared question mark.
215  *
216  * Return: Fallback Unicode code point, or 0 if none is available
217  */
218 u32 ucs_get_fallback(u32 cp)
219 {
220 	const struct ucs_page_desc *page;
221 	const struct ucs_page_entry *entry;
222 	u8 page_idx = cp >> 8, offset = cp;
223 
224 	if (!UCS_IS_BMP(cp))
225 		return 0;
226 
227 	/*
228 	 * Full-width to ASCII mapping (covering all printable ASCII 33-126)
229 	 * 0xFF01 (!) to 0xFF5E (~) -> ASCII 33 (!) to 126 (~)
230 	 * We process them programmatically to reduce the table size.
231 	 */
232 	if (cp >= 0xFF01 && cp <= 0xFF5E)
233 		return cp - 0xFF01 + 33;
234 
235 	page = __inline_bsearch(&page_idx, ucs_fallback_pages,
236 				ARRAY_SIZE(ucs_fallback_pages),
237 				sizeof(*ucs_fallback_pages),
238 				ucs_page_desc_cmp);
239 	if (!page)
240 		return 0;
241 
242 	entry = __inline_bsearch(&offset, ucs_fallback_entries + page->start,
243 				 page->count, sizeof(*ucs_fallback_entries),
244 				 ucs_page_entry_cmp);
245 	if (!entry)
246 		return 0;
247 
248 	if (entry->fallback == UCS_PAGE_ENTRY_RANGE_MARKER)
249 		entry++;
250 	return entry->fallback;
251 }
252