xref: /linux/fs/hfsplus/unicode.c (revision b3e1c7855e8e1c4d77685ce4a8cd9cdd576058eb)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  *  linux/fs/hfsplus/unicode.c
4  *
5  * Copyright (C) 2001
6  * Brad Boyer (flar@allandria.com)
7  * (C) 2003 Ardis Technologies <roman@ardistech.com>
8  *
9  * Handler routines for unicode strings
10  */
11 
12 #include <linux/types.h>
13 #include <linux/nls.h>
14 #include "hfsplus_fs.h"
15 #include "hfsplus_raw.h"
16 
17 /* Fold the case of a unicode char, given the 16 bit value */
18 /* Returns folded char, or 0 if ignorable */
case_fold(u16 c)19 static inline u16 case_fold(u16 c)
20 {
21 	u16 tmp;
22 
23 	tmp = hfsplus_case_fold_table[c >> 8];
24 	if (tmp)
25 		tmp = hfsplus_case_fold_table[tmp + (c & 0xff)];
26 	else
27 		tmp = c;
28 	return tmp;
29 }
30 
31 /* Compare unicode strings, return values like normal strcmp */
hfsplus_strcasecmp(const struct hfsplus_unistr * s1,const struct hfsplus_unistr * s2)32 int hfsplus_strcasecmp(const struct hfsplus_unistr *s1,
33 		       const struct hfsplus_unistr *s2)
34 {
35 	u16 len1, len2, c1, c2;
36 	const hfsplus_unichr *p1, *p2;
37 
38 	len1 = be16_to_cpu(s1->length);
39 	len2 = be16_to_cpu(s2->length);
40 	p1 = s1->unicode;
41 	p2 = s2->unicode;
42 
43 	if (len1 > HFSPLUS_MAX_STRLEN) {
44 		len1 = HFSPLUS_MAX_STRLEN;
45 		pr_err("invalid length %u has been corrected to %d\n",
46 			be16_to_cpu(s1->length), len1);
47 	}
48 
49 	if (len2 > HFSPLUS_MAX_STRLEN) {
50 		len2 = HFSPLUS_MAX_STRLEN;
51 		pr_err("invalid length %u has been corrected to %d\n",
52 			be16_to_cpu(s2->length), len2);
53 	}
54 
55 	while (1) {
56 		c1 = c2 = 0;
57 
58 		while (len1 && !c1) {
59 			c1 = case_fold(be16_to_cpu(*p1));
60 			p1++;
61 			len1--;
62 		}
63 		while (len2 && !c2) {
64 			c2 = case_fold(be16_to_cpu(*p2));
65 			p2++;
66 			len2--;
67 		}
68 
69 		if (c1 != c2)
70 			return (c1 < c2) ? -1 : 1;
71 		if (!c1 && !c2)
72 			return 0;
73 	}
74 }
75 
76 /* Compare names as a sequence of 16-bit unsigned integers */
hfsplus_strcmp(const struct hfsplus_unistr * s1,const struct hfsplus_unistr * s2)77 int hfsplus_strcmp(const struct hfsplus_unistr *s1,
78 		   const struct hfsplus_unistr *s2)
79 {
80 	u16 len1, len2, c1, c2;
81 	const hfsplus_unichr *p1, *p2;
82 	int len;
83 
84 	len1 = be16_to_cpu(s1->length);
85 	len2 = be16_to_cpu(s2->length);
86 	p1 = s1->unicode;
87 	p2 = s2->unicode;
88 
89 	if (len1 > HFSPLUS_MAX_STRLEN) {
90 		len1 = HFSPLUS_MAX_STRLEN;
91 		pr_err("invalid length %u has been corrected to %d\n",
92 			be16_to_cpu(s1->length), len1);
93 	}
94 
95 	if (len2 > HFSPLUS_MAX_STRLEN) {
96 		len2 = HFSPLUS_MAX_STRLEN;
97 		pr_err("invalid length %u has been corrected to %d\n",
98 			be16_to_cpu(s2->length), len2);
99 	}
100 
101 	for (len = min(len1, len2); len > 0; len--) {
102 		c1 = be16_to_cpu(*p1);
103 		c2 = be16_to_cpu(*p2);
104 		if (c1 != c2)
105 			return c1 < c2 ? -1 : 1;
106 		p1++;
107 		p2++;
108 	}
109 
110 	return len1 < len2 ? -1 :
111 	       len1 > len2 ? 1 : 0;
112 }
113 
114 
/*
 * Constants for composing and decomposing Hangul syllables arithmetically.
 * The *Base values are the first code point of each range, the *Count
 * values the number of entries in it.
 */
#define Hangul_SBase	0xac00	/* precomposed syllables */
#define Hangul_LBase	0x1100	/* leading consonants */
#define Hangul_VBase	0x1161	/* vowels */
#define Hangul_TBase	0x11a7	/* trailing consonants */
#define Hangul_LCount	19
#define Hangul_VCount	21
#define Hangul_TCount	28
/* syllables sharing one leading consonant */
#define Hangul_NCount	(Hangul_VCount * Hangul_TCount)
/* all precomposed syllables: 19 * 21 * 28 = 11172 */
#define Hangul_SCount	(Hangul_LCount * Hangul_NCount)
124 
125 
/*
 * Look up code unit @cc in the composition node @p.
 *
 * As read by this function, p[1] holds the number of entries in the node and
 * entry i (1-based) is the pair p[2 * i] (key) / p[2 * i + 1] (offset of the
 * child node inside hfsplus_compose_table).  The keys are binary searched,
 * so they are expected to be in ascending order.
 *
 * Returns a pointer to the child node for @cc, or NULL if @cc is not a key
 * of this node.
 */
static u16 *hfsplus_compose_lookup(u16 *p, u16 cc)
{
	int lo = 1;
	int hi = p[1];

	/* empty node, or @cc outside the [first key, last key] range */
	if (hi == 0 || cc < p[lo * 2] || cc > p[hi * 2])
		return NULL;

	while (lo <= hi) {
		const int mid = (lo + hi) / 2;
		const u16 key = p[mid * 2];

		if (cc == key)
			return hfsplus_compose_table + p[mid * 2 + 1];
		if (cc > key)
			lo = mid + 1;
		else
			hi = mid - 1;
	}
	return NULL;
}
145 
hfsplus_uni2asc(struct super_block * sb,const struct hfsplus_unistr * ustr,int max_len,char * astr,int * len_p)146 static int hfsplus_uni2asc(struct super_block *sb, const struct hfsplus_unistr *ustr,
147 		    int max_len, char *astr, int *len_p)
148 {
149 	const hfsplus_unichr *ip;
150 	struct nls_table *nls = HFSPLUS_SB(sb)->nls;
151 	u8 *op;
152 	u16 cc, c0, c1;
153 	u16 *ce1, *ce2;
154 	int i, len, ustrlen, res, compose;
155 
156 	op = astr;
157 	ip = ustr->unicode;
158 
159 	ustrlen = be16_to_cpu(ustr->length);
160 	if (ustrlen > max_len) {
161 		ustrlen = max_len;
162 		pr_err("invalid length %u has been corrected to %d\n",
163 			be16_to_cpu(ustr->length), ustrlen);
164 	}
165 
166 	len = *len_p;
167 	ce1 = NULL;
168 	compose = !test_bit(HFSPLUS_SB_NODECOMPOSE, &HFSPLUS_SB(sb)->flags);
169 
170 	while (ustrlen > 0) {
171 		c0 = be16_to_cpu(*ip++);
172 		ustrlen--;
173 		/* search for single decomposed char */
174 		if (likely(compose))
175 			ce1 = hfsplus_compose_lookup(hfsplus_compose_table, c0);
176 		if (ce1)
177 			cc = ce1[0];
178 		else
179 			cc = 0;
180 		if (cc) {
181 			/* start of a possibly decomposed Hangul char */
182 			if (cc != 0xffff)
183 				goto done;
184 			if (!ustrlen)
185 				goto same;
186 			c1 = be16_to_cpu(*ip) - Hangul_VBase;
187 			if (c1 < Hangul_VCount) {
188 				/* compose the Hangul char */
189 				cc = (c0 - Hangul_LBase) * Hangul_VCount;
190 				cc = (cc + c1) * Hangul_TCount;
191 				cc += Hangul_SBase;
192 				ip++;
193 				ustrlen--;
194 				if (!ustrlen)
195 					goto done;
196 				c1 = be16_to_cpu(*ip) - Hangul_TBase;
197 				if (c1 > 0 && c1 < Hangul_TCount) {
198 					cc += c1;
199 					ip++;
200 					ustrlen--;
201 				}
202 				goto done;
203 			}
204 		}
205 		while (1) {
206 			/* main loop for common case of not composed chars */
207 			if (!ustrlen)
208 				goto same;
209 			c1 = be16_to_cpu(*ip);
210 			if (likely(compose))
211 				ce1 = hfsplus_compose_lookup(
212 					hfsplus_compose_table, c1);
213 			if (ce1)
214 				break;
215 			switch (c0) {
216 			case 0:
217 				c0 = 0x2400;
218 				break;
219 			case '/':
220 				c0 = ':';
221 				break;
222 			}
223 			res = nls->uni2char(c0, op, len);
224 			if (res < 0) {
225 				if (res == -ENAMETOOLONG)
226 					goto out;
227 				*op = '?';
228 				res = 1;
229 			}
230 			op += res;
231 			len -= res;
232 			c0 = c1;
233 			ip++;
234 			ustrlen--;
235 		}
236 		ce2 = hfsplus_compose_lookup(ce1, c0);
237 		if (ce2) {
238 			i = 1;
239 			while (i < ustrlen) {
240 				ce1 = hfsplus_compose_lookup(ce2,
241 					be16_to_cpu(ip[i]));
242 				if (!ce1)
243 					break;
244 				i++;
245 				ce2 = ce1;
246 			}
247 			cc = ce2[0];
248 			if (cc) {
249 				ip += i;
250 				ustrlen -= i;
251 				goto done;
252 			}
253 		}
254 same:
255 		switch (c0) {
256 		case 0:
257 			cc = 0x2400;
258 			break;
259 		case '/':
260 			cc = ':';
261 			break;
262 		default:
263 			cc = c0;
264 		}
265 done:
266 		res = nls->uni2char(cc, op, len);
267 		if (res < 0) {
268 			if (res == -ENAMETOOLONG)
269 				goto out;
270 			*op = '?';
271 			res = 1;
272 		}
273 		op += res;
274 		len -= res;
275 	}
276 	res = 0;
277 out:
278 	*len_p = (char *)op - astr;
279 	return res;
280 }
281 
hfsplus_uni2asc_str(struct super_block * sb,const struct hfsplus_unistr * ustr,char * astr,int * len_p)282 inline int hfsplus_uni2asc_str(struct super_block *sb,
283 			       const struct hfsplus_unistr *ustr, char *astr,
284 			       int *len_p)
285 {
286 	return hfsplus_uni2asc(sb, ustr, HFSPLUS_MAX_STRLEN, astr, len_p);
287 }
288 
hfsplus_uni2asc_xattr_str(struct super_block * sb,const struct hfsplus_attr_unistr * ustr,char * astr,int * len_p)289 inline int hfsplus_uni2asc_xattr_str(struct super_block *sb,
290 				     const struct hfsplus_attr_unistr *ustr,
291 				     char *astr, int *len_p)
292 {
293 	return hfsplus_uni2asc(sb, (const struct hfsplus_unistr *)ustr,
294 			       HFSPLUS_ATTR_MAX_STRLEN, astr, len_p);
295 }
296 
297 /*
298  * Convert one or more ASCII characters into a single unicode character.
299  * Returns the number of ASCII characters corresponding to the unicode char.
300  */
asc2unichar(struct super_block * sb,const char * astr,int len,wchar_t * uc)301 static inline int asc2unichar(struct super_block *sb, const char *astr, int len,
302 			      wchar_t *uc)
303 {
304 	int size = HFSPLUS_SB(sb)->nls->char2uni(astr, len, uc);
305 	if (size <= 0) {
306 		*uc = '?';
307 		size = 1;
308 	}
309 	switch (*uc) {
310 	case 0x2400:
311 		*uc = 0;
312 		break;
313 	case ':':
314 		*uc = '/';
315 		break;
316 	}
317 	return size;
318 }
319 
320 /* Decomposes a non-Hangul unicode character. */
hfsplus_decompose_nonhangul(wchar_t uc,int * size)321 static u16 *hfsplus_decompose_nonhangul(wchar_t uc, int *size)
322 {
323 	int off;
324 
325 	off = hfsplus_decompose_table[(uc >> 12) & 0xf];
326 	if (off == 0 || off == 0xffff)
327 		return NULL;
328 
329 	off = hfsplus_decompose_table[off + ((uc >> 8) & 0xf)];
330 	if (!off)
331 		return NULL;
332 
333 	off = hfsplus_decompose_table[off + ((uc >> 4) & 0xf)];
334 	if (!off)
335 		return NULL;
336 
337 	off = hfsplus_decompose_table[off + (uc & 0xf)];
338 	*size = off & 3;
339 	if (*size == 0)
340 		return NULL;
341 	return hfsplus_decompose_table + (off / 4);
342 }
343 
344 /*
345  * Try to decompose a unicode character as Hangul. Return 0 if @uc is not
346  * precomposed Hangul, otherwise return the length of the decomposition.
347  *
348  * This function was adapted from sample code from the Unicode Standard
349  * Annex #15: Unicode Normalization Forms, version 3.2.0.
350  *
351  * Copyright (C) 1991-2018 Unicode, Inc.  All rights reserved.  Distributed
352  * under the Terms of Use in http://www.unicode.org/copyright.html.
353  */
hfsplus_try_decompose_hangul(wchar_t uc,u16 * result)354 static int hfsplus_try_decompose_hangul(wchar_t uc, u16 *result)
355 {
356 	int index;
357 	int l, v, t;
358 
359 	index = uc - Hangul_SBase;
360 	if (index < 0 || index >= Hangul_SCount)
361 		return 0;
362 
363 	l = Hangul_LBase + index / Hangul_NCount;
364 	v = Hangul_VBase + (index % Hangul_NCount) / Hangul_TCount;
365 	t = Hangul_TBase + index % Hangul_TCount;
366 
367 	result[0] = l;
368 	result[1] = v;
369 	if (t != Hangul_TBase) {
370 		result[2] = t;
371 		return 3;
372 	}
373 	return 2;
374 }
375 
376 /* Decomposes a single unicode character. */
decompose_unichar(wchar_t uc,int * size,u16 * hangul_buffer)377 static u16 *decompose_unichar(wchar_t uc, int *size, u16 *hangul_buffer)
378 {
379 	u16 *result;
380 
381 	/* Hangul is handled separately */
382 	result = hangul_buffer;
383 	*size = hfsplus_try_decompose_hangul(uc, result);
384 	if (*size == 0)
385 		result = hfsplus_decompose_nonhangul(uc, size);
386 	return result;
387 }
388 
hfsplus_asc2uni(struct super_block * sb,struct hfsplus_unistr * ustr,int max_unistr_len,const char * astr,int len)389 int hfsplus_asc2uni(struct super_block *sb,
390 		    struct hfsplus_unistr *ustr, int max_unistr_len,
391 		    const char *astr, int len)
392 {
393 	int size, dsize, decompose;
394 	u16 *dstr, outlen = 0;
395 	wchar_t c;
396 	u16 dhangul[3];
397 
398 	decompose = !test_bit(HFSPLUS_SB_NODECOMPOSE, &HFSPLUS_SB(sb)->flags);
399 	while (outlen < max_unistr_len && len > 0) {
400 		size = asc2unichar(sb, astr, len, &c);
401 
402 		if (decompose)
403 			dstr = decompose_unichar(c, &dsize, dhangul);
404 		else
405 			dstr = NULL;
406 		if (dstr) {
407 			if (outlen + dsize > max_unistr_len)
408 				break;
409 			do {
410 				ustr->unicode[outlen++] = cpu_to_be16(*dstr++);
411 			} while (--dsize > 0);
412 		} else
413 			ustr->unicode[outlen++] = cpu_to_be16(c);
414 
415 		astr += size;
416 		len -= size;
417 	}
418 	ustr->length = cpu_to_be16(outlen);
419 	if (len > 0)
420 		return -ENAMETOOLONG;
421 	return 0;
422 }
423 
424 /*
425  * Hash a string to an integer as appropriate for the HFS+ filesystem.
426  * Composed unicode characters are decomposed and case-folding is performed
427  * if the appropriate bits are (un)set on the superblock.
428  */
hfsplus_hash_dentry(const struct dentry * dentry,struct qstr * str)429 int hfsplus_hash_dentry(const struct dentry *dentry, struct qstr *str)
430 {
431 	struct super_block *sb = dentry->d_sb;
432 	const char *astr;
433 	const u16 *dstr;
434 	int casefold, decompose, size, len;
435 	unsigned long hash;
436 	wchar_t c;
437 	u16 c2;
438 	u16 dhangul[3];
439 
440 	casefold = test_bit(HFSPLUS_SB_CASEFOLD, &HFSPLUS_SB(sb)->flags);
441 	decompose = !test_bit(HFSPLUS_SB_NODECOMPOSE, &HFSPLUS_SB(sb)->flags);
442 	hash = init_name_hash(dentry);
443 	astr = str->name;
444 	len = str->len;
445 	while (len > 0) {
446 		int dsize;
447 		size = asc2unichar(sb, astr, len, &c);
448 		astr += size;
449 		len -= size;
450 
451 		if (decompose)
452 			dstr = decompose_unichar(c, &dsize, dhangul);
453 		else
454 			dstr = NULL;
455 		if (dstr) {
456 			do {
457 				c2 = *dstr++;
458 				if (casefold)
459 					c2 = case_fold(c2);
460 				if (!casefold || c2)
461 					hash = partial_name_hash(c2, hash);
462 			} while (--dsize > 0);
463 		} else {
464 			c2 = c;
465 			if (casefold)
466 				c2 = case_fold(c2);
467 			if (!casefold || c2)
468 				hash = partial_name_hash(c2, hash);
469 		}
470 	}
471 	str->hash = end_name_hash(hash);
472 
473 	return 0;
474 }
475 
476 /*
477  * Compare strings with HFS+ filename ordering.
478  * Composed unicode characters are decomposed and case-folding is performed
479  * if the appropriate bits are (un)set on the superblock.
480  */
hfsplus_compare_dentry(const struct dentry * dentry,unsigned int len,const char * str,const struct qstr * name)481 int hfsplus_compare_dentry(const struct dentry *dentry,
482 		unsigned int len, const char *str, const struct qstr *name)
483 {
484 	struct super_block *sb = dentry->d_sb;
485 	int casefold, decompose, size;
486 	int dsize1, dsize2, len1, len2;
487 	const u16 *dstr1, *dstr2;
488 	const char *astr1, *astr2;
489 	u16 c1, c2;
490 	wchar_t c;
491 	u16 dhangul_1[3], dhangul_2[3];
492 
493 	casefold = test_bit(HFSPLUS_SB_CASEFOLD, &HFSPLUS_SB(sb)->flags);
494 	decompose = !test_bit(HFSPLUS_SB_NODECOMPOSE, &HFSPLUS_SB(sb)->flags);
495 	astr1 = str;
496 	len1 = len;
497 	astr2 = name->name;
498 	len2 = name->len;
499 	dsize1 = dsize2 = 0;
500 	dstr1 = dstr2 = NULL;
501 
502 	while (len1 > 0 && len2 > 0) {
503 		if (!dsize1) {
504 			size = asc2unichar(sb, astr1, len1, &c);
505 			astr1 += size;
506 			len1 -= size;
507 
508 			if (decompose)
509 				dstr1 = decompose_unichar(c, &dsize1,
510 							  dhangul_1);
511 			if (!decompose || !dstr1) {
512 				c1 = c;
513 				dstr1 = &c1;
514 				dsize1 = 1;
515 			}
516 		}
517 
518 		if (!dsize2) {
519 			size = asc2unichar(sb, astr2, len2, &c);
520 			astr2 += size;
521 			len2 -= size;
522 
523 			if (decompose)
524 				dstr2 = decompose_unichar(c, &dsize2,
525 							  dhangul_2);
526 			if (!decompose || !dstr2) {
527 				c2 = c;
528 				dstr2 = &c2;
529 				dsize2 = 1;
530 			}
531 		}
532 
533 		c1 = *dstr1;
534 		c2 = *dstr2;
535 		if (casefold) {
536 			c1 = case_fold(c1);
537 			if (!c1) {
538 				dstr1++;
539 				dsize1--;
540 				continue;
541 			}
542 			c2 = case_fold(c2);
543 			if (!c2) {
544 				dstr2++;
545 				dsize2--;
546 				continue;
547 			}
548 		}
549 		if (c1 < c2)
550 			return -1;
551 		else if (c1 > c2)
552 			return 1;
553 
554 		dstr1++;
555 		dsize1--;
556 		dstr2++;
557 		dsize2--;
558 	}
559 
560 	if (len1 < len2)
561 		return -1;
562 	if (len1 > len2)
563 		return 1;
564 	return 0;
565 }
566