utf8-norm.c - OpenGrok cross reference for /linux/fs/unicode/utf8-norm.c

Lines Matching +full:0 +full:x00000000 +full:- +full:0 +full:x03ffffff
1 // SPDX-License-Identifier: GPL-2.0-only
11 	int i = um->tables->utf8agetab_size - 1;  in utf8version_is_supported()
13 	while (i >= 0 && um->tables->utf8agetab[i] != 0) {  in utf8version_is_supported()
14 		if (version == um->tables->utf8agetab[i])  in utf8version_is_supported()
16 		i--;  in utf8version_is_supported()
18 	return 0;  in utf8version_is_supported()
22  * UTF-8 valid ranges.
24  * The UTF-8 encoding spreads the bits of a 32bit word over several
28  * 0x00000000 0x0000007F: 0xxxxxxx
29  * 0x00000000 0x000007FF: 110xxxxx 10xxxxxx
30  * 0x00000000 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
31  * 0x00000000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
32  * 0x00000000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
33  * 0x00000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
35  * There is an additional requirement on UTF-8, in that only the
40  * 0x00000000 0x0000007F: 0xxxxxxx
41  * 0x00000080 0x000007FF: 110xxxxx 10xxxxxx
42  * 0x00000800 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
43  * 0x00010000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
44  * 0x00200000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
45  * 0x04000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
47  * Actual unicode characters are limited to the range 0x0 - 0x10FFFF,
51  *          0 -     0x7F: 0                   - 0x7F
52  *       0x80 -    0x7FF: 0xC2 0x80           - 0xDF 0xBF
53  *      0x800 -   0xFFFF: 0xE0 0xA0 0x80      - 0xEF 0xBF 0xBF
54  *    0x10000 - 0x10FFFF: 0xF0 0x90 0x80 0x80 - 0xF4 0x8F 0xBF 0xBF
56  * Within those ranges the surrogates 0xD800 - 0xDFFF are not allowed.
59  * the same as a single UTF-32 character.  This makes the UTF-8
60  * representation of Unicode strictly smaller than UTF-32.
63  *    Corrigendum #1: UTF-8 Shortest Form
70  * Return the number of bytes used by the current UTF-8 sequence.
71  * Assumes the input points to the first byte of a valid UTF-8
78 	return 1 + (c >= 0xC0) + (c >= 0xE0) + (c >= 0xF0);  in utf8clen()
82  * Decode a 3-byte UTF-8 sequence.
89 	uc = *str++ & 0x0F;  in utf8decode3()
91 	uc |= *str++ & 0x3F;  in utf8decode3()
93 	uc |= *str++ & 0x3F;  in utf8decode3()
99  * Encode a 3-byte UTF-8 sequence.
104 	str[2] = (val & 0x3F) | 0x80;  in utf8encode3()
106 	str[1] = (val & 0x3F) | 0x80;  in utf8encode3()
108 	str[0] = val | 0xE0;  in utf8encode3()
116  * A compact binary tree, used to decode UTF-8 characters.
121  *  NEXTBYTE  - flag        - advance to next byte if set
122  *  BITNUM    - 3 bit field - the bit number to be tested
123  *  OFFLEN    - 2 bit field - number of bytes in the offset
124  * if offlen == 0 (non-branching node)
125  *  RIGHTPATH - 1 bit field - set if the following node is for the
126  *                            right-hand path (tested bit is set)
127  *  TRIENODE  - 1 bit field - set if the following node is an internal
129  * if offlen != 0 (branching node)
130  *  LEFTNODE  - 1 bit field - set if the left-hand node is internal
131  *  RIGHTNODE - 1 bit field - set if the right-hand node is internal
138 #define BITNUM		0x07
139 #define NEXTBYTE	0x08
140 #define OFFLEN		0x30
142 #define RIGHTPATH	0x40
143 #define TRIENODE	0x80
144 #define RIGHTNODE	0x40
145 #define LEFTNODE	0x80
153  * leaf[0]: The unicode version, stored as a generation number that is
154  *          an index into ->utf8agetab[].  With this we can filter code
156  *          defined.  The CCC of a non-defined code point is 0.
159  *          with a non-zero CCC that occur between two characters with
160  *          a CCC of 0, or at the beginning or end of a string.
162  *          between 0 and 254 inclusive, which leaves 255 available as
164  *          Code points with CCC 0 are known as stoppers.
166  *          start of a NUL-terminated string that is the decomposition
172  *          These do affect normalization, as they all have CCC 0.
180  * UTF-8 sequences that match the criteria from the "UTF-8 valid
182  * lookup in the trie can be used to validate the UTF-8 input.
186 #define LEAF_GEN(LEAF)	((LEAF)[0])
190 #define MINCCC		(0)
192 #define STOPPER		(0)
201  * Hangul decomposition (algorithm from Section 3.12 of Unicode 6.3.0)
203  * AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
204  * D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
206  * SBase = 0xAC00
207  * LBase = 0x1100
208  * VBase = 0x1161
209  * TBase = 0x11A7
217  *   SIndex = s - SBase
237  *   if (TIndex == 0) {
246 #define SB	(0xAC00)
247 #define LB	(0x1100)
248 #define VB	(0x1161)
249 #define TB	(0x11A7)
267 	si = utf8decode3(str) - SB;  in utf8hangul()
278 	/* Add LPart, a 3-byte UTF-8 sequence. */  in utf8hangul()
281 	/* Add VPart, a 3-byte UTF-8 sequence. */  in utf8hangul()
284 	/* Add TPart if required, also a 3-byte UTF-8 sequence. */  in utf8hangul()
289 	h[0] = '\0';  in utf8hangul()
298  * A non-NULL return guarantees that the UTF-8 sequence starting at s
299  * is well-formed and corresponds to a known unicode code point.  The
300  * shorthand for this will be "is valid UTF-8 unicode".
306 	utf8trie_t	*trie = um->tables->utf8data + um->ntab[n]->offset;  in utf8nlookup()
312 	if (len == 0)  in utf8nlookup()
319 			if (--len == 0)  in utf8nlookup()
330 				while (--offlen) {  in utf8nlookup()
361 	 * codepoints >= 0xAC00 and <= 0xD7A3. Their UTF-8 encoding is  in utf8nlookup()
363 	 * start of the sequence is at s-2.  in utf8nlookup()
365 	if (LEAF_CCC(trie) == DECOMPOSE && LEAF_STR(trie)[0] == HANGUL)  in utf8nlookup()
366 		trie = utf8hangul(s - 2, hangul);  in utf8nlookup()
379 	return utf8nlookup(um, n, hangul, s, (size_t)-1);  in utf8lookup()
384  * Return -1 if s is not valid UTF-8 unicode.
390 	size_t		ret = 0;  in utf8nlen()
396 			return -1;  in utf8nlen()
397 		if (um->tables->utf8agetab[LEAF_GEN(leaf)] >  in utf8nlen()
398 		    um->ntab[n]->maxage)  in utf8nlen()
404 		len -= utf8clen(s);  in utf8nlen()
418  * Returns -1 on error, 0 on success.
424 		return -1;  in utf8ncursor()
425 	u8c->um = um;  in utf8ncursor()
426 	u8c->n = n;  in utf8ncursor()
427 	u8c->s = s;  in utf8ncursor()
428 	u8c->p = NULL;  in utf8ncursor()
429 	u8c->ss = NULL;  in utf8ncursor()
430 	u8c->sp = NULL;  in utf8ncursor()
431 	u8c->len = len;  in utf8ncursor()
432 	u8c->slen = 0;  in utf8ncursor()
433 	u8c->ccc = STOPPER;  in utf8ncursor()
434 	u8c->nccc = STOPPER;  in utf8ncursor()
436 	if (u8c->len != len)  in utf8ncursor()
437 		return -1;  in utf8ncursor()
439 	if (len > 0 && (*s & 0xC0) == 0x80)  in utf8ncursor()
440 		return -1;  in utf8ncursor()
441 	return 0;  in utf8ncursor()
447  * Returns the byte cast to an unsigned char on success, and -1 on failure.
449  * The cursor keeps track of the location in the string in u8c->s.
451  * u8c->p, and u8c->s is set to the start of the decomposition. Note
452  * that bytes from a decomposition do not count against u8c->len.
454  * Characters are emitted if they match the current CCC in u8c->ccc.
455  * Hitting end-of-string while u8c->ccc == STOPPER means we're done,
456  * and the function returns 0 in that case.
459  * values of u8c->s and u8c->p are stored in u8c->ss and u8c->sp at
461  * emitted and stores it in u8c->nccc, the second pass emits the
467  *  u8c->p  != NULL -> a decomposition is being scanned.
468  *  u8c->ss != NULL -> this is a repeating scan.
469  *  u8c->ccc == -1   -> this is the first scan of a repeating scan.
478 		if (u8c->p && *u8c->s == '\0') {  in utf8byte()
479 			u8c->s = u8c->p;  in utf8byte()
480 			u8c->p = NULL;  in utf8byte()
483 		/* Check for end-of-string. */  in utf8byte()
484 		if (!u8c->p && (u8c->len == 0 || *u8c->s == '\0')) {  in utf8byte()
486 			if (u8c->ccc == STOPPER)  in utf8byte()
487 				return 0;  in utf8byte()
488 			/* End-of-string during a scan counts as a stopper. */  in utf8byte()
491 		} else if ((*u8c->s & 0xC0) == 0x80) {  in utf8byte()
493 			if (!u8c->p)  in utf8byte()
494 				u8c->len--;  in utf8byte()
495 			return (unsigned char)*u8c->s++;  in utf8byte()
499 		if (u8c->p) {  in utf8byte()
500 			leaf = utf8lookup(u8c->um, u8c->n, u8c->hangul, u8c->s);  in utf8byte()
502 			leaf = utf8nlookup(u8c->um, u8c->n, u8c->hangul,  in utf8byte()
503 					   u8c->s, u8c->len);  in utf8byte()
508 			return -1;  in utf8byte()
511 		/* Characters that are too new have CCC 0. */  in utf8byte()
512 		if (u8c->um->tables->utf8agetab[LEAF_GEN(leaf)] >  in utf8byte()
513 		    u8c->um->ntab[u8c->n]->maxage) {  in utf8byte()
516 			u8c->len -= utf8clen(u8c->s);  in utf8byte()
517 			u8c->p = u8c->s + utf8clen(u8c->s);  in utf8byte()
518 			u8c->s = LEAF_STR(leaf);  in utf8byte()
519 			/* Empty decomposition implies CCC 0. */  in utf8byte()
520 			if (*u8c->s == '\0') {  in utf8byte()
521 				if (u8c->ccc == STOPPER)  in utf8byte()
527 			leaf = utf8lookup(u8c->um, u8c->n, u8c->hangul, u8c->s);  in utf8byte()
529 				return -1;  in utf8byte()
537 		if (ccc != STOPPER && u8c->ccc < ccc && ccc < u8c->nccc)  in utf8byte()
538 			u8c->nccc = ccc;  in utf8byte()
544 		if (ccc == u8c->ccc) {  in utf8byte()
545 			if (!u8c->p)  in utf8byte()
546 				u8c->len--;  in utf8byte()
547 			return (unsigned char)*u8c->s++;  in utf8byte()
552 		if (u8c->nccc == STOPPER) {  in utf8byte()
558 			u8c->ccc = MINCCC - 1;  in utf8byte()
559 			u8c->nccc = ccc;  in utf8byte()
560 			u8c->sp = u8c->p;  in utf8byte()
561 			u8c->ss = u8c->s;  in utf8byte()
562 			u8c->slen = u8c->len;  in utf8byte()
563 			if (!u8c->p)  in utf8byte()
564 				u8c->len -= utf8clen(u8c->s);  in utf8byte()
565 			u8c->s += utf8clen(u8c->s);  in utf8byte()
568 			if (!u8c->p)  in utf8byte()
569 				u8c->len -= utf8clen(u8c->s);  in utf8byte()
570 			u8c->s += utf8clen(u8c->s);  in utf8byte()
571 		} else if (u8c->nccc != MAXCCC + 1) {  in utf8byte()
573 			u8c->ccc = u8c->nccc;  in utf8byte()
574 			u8c->nccc = MAXCCC + 1;  in utf8byte()
575 			u8c->s = u8c->ss;  in utf8byte()
576 			u8c->p = u8c->sp;  in utf8byte()
577 			u8c->len = u8c->slen;  in utf8byte()
580 			u8c->ccc = STOPPER;  in utf8byte()
581 			u8c->nccc = STOPPER;  in utf8byte()
582 			u8c->sp = NULL;  in utf8byte()
583 			u8c->ss = NULL;  in utf8byte()
584 			u8c->slen = 0;  in utf8byte()