utf8-norm.c - OpenGrok cross reference for /linux/fs/unicode/utf8-norm.c

Lines Matching +full:byte +full:- +full:len
1 // SPDX-License-Identifier: GPL-2.0-only
11 	int i = um->tables->utf8agetab_size - 1;  in utf8version_is_supported()
13 	while (i >= 0 && um->tables->utf8agetab[i] != 0) {  in utf8version_is_supported()
14 		if (version == um->tables->utf8agetab[i])  in utf8version_is_supported()
16 		i--;  in utf8version_is_supported()
22  * UTF-8 valid ranges.
24  * The UTF-8 encoding spreads the bits of a 32bit word over several
35  * There is an additional requirement on UTF-8, in that only the
47  * Actual unicode characters are limited to the range 0x0 - 0x10FFFF,
51  *          0 -     0x7F: 0                   - 0x7F
52  *       0x80 -    0x7FF: 0xC2 0x80           - 0xDF 0xBF
53  *      0x800 -   0xFFFF: 0xE0 0xA0 0x80      - 0xEF 0xBF 0xBF
54  *    0x10000 - 0x10FFFF: 0xF0 0x90 0x80 0x80 - 0xF4 0x8F 0xBF 0xBF
56  * Within those ranges the surrogates 0xD800 - 0xDFFF are not allowed.
59  * the same as a single UTF-32 character.  This makes the UTF-8
60  * representation of Unicode strictly smaller than UTF-32.
63  *    Corrigendum #1: UTF-8 Shortest Form
70  * Return the number of bytes used by the current UTF-8 sequence.
71  * Assumes the input points to the first byte of a valid UTF-8
82  * Decode a 3-byte UTF-8 sequence.
99  * Encode a 3-byte UTF-8 sequence.
116  * A compact binary tree, used to decode UTF-8 characters.
118  * Internal nodes are one byte for the node itself, and up to three
119  * bytes for an offset into the tree.  The first byte contains the
121  *  NEXTBYTE  - flag        - advance to next byte if set
122  *  BITNUM    - 3 bit field - the bit number to be tested
123  *  OFFLEN    - 2 bit field - number of bytes in the offset
124  * if offlen == 0 (non-branching node)
125  *  RIGHTPATH - 1 bit field - set if the following node is for the
126  *                            right-hand path (tested bit is set)
127  *  TRIENODE  - 1 bit field - set if the following node is an internal
130  *  LEFTNODE  - 1 bit field - set if the left-hand node is internal
131  *  RIGHTNODE - 1 bit field - set if the right-hand node is internal
154  *          an index into ->utf8agetab[].  With this we can filter code
156  *          defined.  The CCC of a non-defined code point is 0.
159  *          with a non-zero CCC that occur between two characters with
166  *          start of a NUL-terminated string that is the decomposition
180  * UTF-8 sequences that match the criteria from the "UTF-8 valid
182  * lookup in the trie can be used to validate the UTF-8 input.
217  *   SIndex = s - SBase
267 	si = utf8decode3(str) - SB;  in utf8hangul()
278 	/* Add LPart, a 3-byte UTF-8 sequence. */  in utf8hangul()
281 	/* Add VPart, a 3-byte UTF-8 sequence. */  in utf8hangul()
284 	/* Add TPart if required, also a 3-byte UTF-8 sequence. */  in utf8hangul()
295  * Use trie to scan s, touching at most len bytes.
298  * A non-NULL return guarantees that the UTF-8 sequence starting at s
299  * is well-formed and corresponds to a known unicode code point.  The
300  * shorthand for this will be "is valid UTF-8 unicode".
304 		size_t len)  in utf8nlookup()  argument
306 	utf8trie_t	*trie = um->tables->utf8data + um->ntab[n]->offset;  in utf8nlookup()
312 	if (len == 0)  in utf8nlookup()
319 			if (--len == 0)  in utf8nlookup()
330 				while (--offlen) {  in utf8nlookup()
361 	 * codepoints >= 0xAC00 and <= 0xD7A3. Their UTF-8 encoding is  in utf8nlookup()
363 	 * start of the sequence is at s-2.  in utf8nlookup()
366 		trie = utf8hangul(s - 2, hangul);  in utf8nlookup()
379 	return utf8nlookup(um, n, hangul, s, (size_t)-1);  in utf8lookup()
383  * Length of the normalization of s, touch at most len bytes.
384  * Return -1 if s is not valid UTF-8 unicode.
387 		const char *s, size_t len)  in utf8nlen()  argument
393 	while (len && *s) {  in utf8nlen()
394 		leaf = utf8nlookup(um, n, hangul, s, len);  in utf8nlen()
396 			return -1;  in utf8nlen()
397 		if (um->tables->utf8agetab[LEAF_GEN(leaf)] >  in utf8nlen()
398 		    um->ntab[n]->maxage)  in utf8nlen()
404 		len -= utf8clen(s);  in utf8nlen()
416  *   len    : length of s.
418  * Returns -1 on error, 0 on success.
421 		enum utf8_normalization n, const char *s, size_t len)  in utf8ncursor()  argument
424 		return -1;  in utf8ncursor()
425 	u8c->um = um;  in utf8ncursor()
426 	u8c->n = n;  in utf8ncursor()
427 	u8c->s = s;  in utf8ncursor()
428 	u8c->p = NULL;  in utf8ncursor()
429 	u8c->ss = NULL;  in utf8ncursor()
430 	u8c->sp = NULL;  in utf8ncursor()
431 	u8c->len = len;  in utf8ncursor()
432 	u8c->slen = 0;  in utf8ncursor()
433 	u8c->ccc = STOPPER;  in utf8ncursor()
434 	u8c->nccc = STOPPER;  in utf8ncursor()
436 	if (u8c->len != len)  in utf8ncursor()
437 		return -1;  in utf8ncursor()
438 	/* The first byte of s may not be an utf8 continuation. */  in utf8ncursor()
439 	if (len > 0 && (*s & 0xC0) == 0x80)  in utf8ncursor()
440 		return -1;  in utf8ncursor()
445  * Get one byte from the normalized form of the string described by u8c.
447  * Returns the byte cast to an unsigned char on success, and -1 on failure.
449  * The cursor keeps track of the location in the string in u8c->s.
451  * u8c->p, and u8c->s is set to the start of the decomposition. Note
452  * that bytes from a decomposition do not count against u8c->len.
454  * Characters are emitted if they match the current CCC in u8c->ccc.
455  * Hitting end-of-string while u8c->ccc == STOPPER means we're done,
459  * values of u8c->s and u8c->p are stored in u8c->ss and u8c->sp at
461  * emitted and stores it in u8c->nccc, the second pass emits the
467  *  u8c->p  != NULL -> a decomposition is being scanned.
468  *  u8c->ss != NULL -> this is a repeating scan.
469  *  u8c->ccc == -1   -> this is the first scan of a repeating scan.
478 		if (u8c->p && *u8c->s == '\0') {  in utf8byte()
479 			u8c->s = u8c->p;  in utf8byte()
480 			u8c->p = NULL;  in utf8byte()
483 		/* Check for end-of-string. */  in utf8byte()
484 		if (!u8c->p && (u8c->len == 0 || *u8c->s == '\0')) {  in utf8byte()
485 			/* There is no next byte. */  in utf8byte()
486 			if (u8c->ccc == STOPPER)  in utf8byte()
488 			/* End-of-string during a scan counts as a stopper. */  in utf8byte()
491 		} else if ((*u8c->s & 0xC0) == 0x80) {  in utf8byte()
493 			if (!u8c->p)  in utf8byte()
494 				u8c->len--;  in utf8byte()
495 			return (unsigned char)*u8c->s++;  in utf8byte()
499 		if (u8c->p) {  in utf8byte()
500 			leaf = utf8lookup(u8c->um, u8c->n, u8c->hangul, u8c->s);  in utf8byte()
502 			leaf = utf8nlookup(u8c->um, u8c->n, u8c->hangul,  in utf8byte()
503 					   u8c->s, u8c->len);  in utf8byte()
508 			return -1;  in utf8byte()
512 		if (u8c->um->tables->utf8agetab[LEAF_GEN(leaf)] >  in utf8byte()
513 		    u8c->um->ntab[u8c->n]->maxage) {  in utf8byte()
516 			u8c->len -= utf8clen(u8c->s);  in utf8byte()
517 			u8c->p = u8c->s + utf8clen(u8c->s);  in utf8byte()
518 			u8c->s = LEAF_STR(leaf);  in utf8byte()
520 			if (*u8c->s == '\0') {  in utf8byte()
521 				if (u8c->ccc == STOPPER)  in utf8byte()
527 			leaf = utf8lookup(u8c->um, u8c->n, u8c->hangul, u8c->s);  in utf8byte()
529 				return -1;  in utf8byte()
537 		if (ccc != STOPPER && u8c->ccc < ccc && ccc < u8c->nccc)  in utf8byte()
538 			u8c->nccc = ccc;  in utf8byte()
541 		 * Return the current byte if this is the current  in utf8byte()
544 		if (ccc == u8c->ccc) {  in utf8byte()
545 			if (!u8c->p)  in utf8byte()
546 				u8c->len--;  in utf8byte()
547 			return (unsigned char)*u8c->s++;  in utf8byte()
552 		if (u8c->nccc == STOPPER) {  in utf8byte()
558 			u8c->ccc = MINCCC - 1;  in utf8byte()
559 			u8c->nccc = ccc;  in utf8byte()
560 			u8c->sp = u8c->p;  in utf8byte()
561 			u8c->ss = u8c->s;  in utf8byte()
562 			u8c->slen = u8c->len;  in utf8byte()
563 			if (!u8c->p)  in utf8byte()
564 				u8c->len -= utf8clen(u8c->s);  in utf8byte()
565 			u8c->s += utf8clen(u8c->s);  in utf8byte()
568 			if (!u8c->p)  in utf8byte()
569 				u8c->len -= utf8clen(u8c->s);  in utf8byte()
570 			u8c->s += utf8clen(u8c->s);  in utf8byte()
571 		} else if (u8c->nccc != MAXCCC + 1) {  in utf8byte()
573 			u8c->ccc = u8c->nccc;  in utf8byte()
574 			u8c->nccc = MAXCCC + 1;  in utf8byte()
575 			u8c->s = u8c->ss;  in utf8byte()
576 			u8c->p = u8c->sp;  in utf8byte()
577 			u8c->len = u8c->slen;  in utf8byte()
580 			u8c->ccc = STOPPER;  in utf8byte()
581 			u8c->nccc = STOPPER;  in utf8byte()
582 			u8c->sp = NULL;  in utf8byte()
583 			u8c->ss = NULL;  in utf8byte()
584 			u8c->slen = 0;  in utf8byte()