common/unicode/u8_textprep.c

*4703203dSis/*
*4703203dSis * CDDL HEADER START
*4703203dSis *
*4703203dSis * The contents of this file are subject to the terms of the
*4703203dSis * Common Development and Distribution License (the "License").
*4703203dSis * You may not use this file except in compliance with the License.
*4703203dSis *
*4703203dSis * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
*4703203dSis * or http://www.opensolaris.org/os/licensing.
*4703203dSis * See the License for the specific language governing permissions
*4703203dSis * and limitations under the License.
*4703203dSis *
*4703203dSis * When distributing Covered Code, include this CDDL HEADER in each
*4703203dSis * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
*4703203dSis * If applicable, add the following below this CDDL HEADER, with the
*4703203dSis * fields enclosed by brackets "[]" replaced with your own identifying
*4703203dSis * information: Portions Copyright [yyyy] [name of copyright owner]
*4703203dSis *
*4703203dSis * CDDL HEADER END
*4703203dSis */
*4703203dSis/*
*4703203dSis * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
*4703203dSis * Use is subject to license terms.
*4703203dSis */
*4703203dSis
*4703203dSis#pragma ident	"%Z%%M%	%I%	%E% SMI"
*4703203dSis
*4703203dSis
*4703203dSis/*
*4703203dSis * UTF-8 text preparation functions (PSARC/2007/149, PSARC/2007/458).
*4703203dSis *
*4703203dSis * Man pages: u8_textprep_open(9F), u8_textprep_buf(9F), u8_textprep_close(9F),
*4703203dSis * u8_textprep_str(9F), u8_strcmp(9F), and u8_validate(9F). See also
*4703203dSis * the section 3C man pages.
*4703203dSis * Interface stability: Committed.
*4703203dSis */
*4703203dSis
*4703203dSis#include <sys/types.h>
*4703203dSis#ifdef	_KERNEL
*4703203dSis#include <sys/param.h>
*4703203dSis#include <sys/sysmacros.h>
*4703203dSis#include <sys/systm.h>
*4703203dSis#include <sys/debug.h>
*4703203dSis#include <sys/kmem.h>
*4703203dSis#include <sys/ddi.h>
*4703203dSis#include <sys/sunddi.h>
*4703203dSis#else
*4703203dSis#include <sys/u8_textprep.h>
*4703203dSis#include <strings.h>
*4703203dSis#endif	/* _KERNEL */
*4703203dSis#include <sys/byteorder.h>
*4703203dSis#include <sys/errno.h>
*4703203dSis#include <sys/u8_textprep_data.h>
*4703203dSis
*4703203dSis
/* Upper bound on the byte length of a single UTF-8 encoded character. */
#define	U8_MB_CUR_MAX			(4)

/*
 * Upper bound on the byte length of a UTF-8 character that lies within
 * U+0000 - U+FFFF, i.e., the coding space of the now deprecated UCS-2.
 */
#define	U8_MAX_BYTES_UCS2		(3)

/* Upper bound on the byte length of a Stream-Safe Text. */
#define	U8_STREAM_SAFE_TEXT_MAX		(128)

/*
 * The maximum number of characters in one combining/conjoining sequence
 * and the actual upper bound applied to such a sequence.
 */
#define	U8_MAX_CHARS_A_SEQ		(32)
#define	U8_UPPER_LIMIT_IN_A_SEQ		(31)

/* The combining class value of a Starter. */
#define	U8_COMBINING_CLASS_STARTER	(0)
*4703203dSis
/*
 * Hangul related macros.
 *
 * The first and the last code points of the Hangul syllables and of the
 * Hangul Jamo leading consonants (L), vowels (V), and optional trailing
 * consonants (T), all as Unicode scalar values.
 *
 * Note that U8_HANGUL_JAMO_T_FIRST is 0x11A7, one less than the first
 * actual trailing consonant, U+11A8. The trailing consonant is optional,
 * so a trailing index of zero means "no trailing consonant" and the base
 * value is pre-decremented by one. For the same reason U8_HANGUL_JAMO_T()
 * tests with '>' rather than '>=' on the lower bound.
 *
 * Each of the 19 modern leading consonants has 588 possible syllables:
 * Hangul has 21 modern vowels and 27 modern trailing consonants plus 1 for
 * the no-trailing-consonant case, i.e., 21 x 28 = 588.
 *
 * U8_HANGUL_JAMO_1ST_BYTE can be used as a quick test on the first byte of
 * a character, but a match does not guarantee that the character is
 * a Hangul Jamo; it only means that it is likely to be one.
 */
#define	U8_HANGUL_SYL_FIRST		(0xAC00U)
#define	U8_HANGUL_SYL_LAST		(0xD7A3U)

#define	U8_HANGUL_JAMO_L_FIRST		(0x1100U)
#define	U8_HANGUL_JAMO_L_LAST		(0x1112U)
#define	U8_HANGUL_JAMO_V_FIRST		(0x1161U)
#define	U8_HANGUL_JAMO_V_LAST		(0x1175U)
#define	U8_HANGUL_JAMO_T_FIRST		(0x11A7U)
#define	U8_HANGUL_JAMO_T_LAST		(0x11C2U)

#define	U8_HANGUL_V_COUNT		(21)
#define	U8_HANGUL_VT_COUNT		(588)
#define	U8_HANGUL_T_COUNT		(28)

#define	U8_HANGUL_JAMO_1ST_BYTE		(0xE1U)

/*
 * Store the scalar value b as a three byte UTF-8 character at s[i], s[j],
 * and s[k]. Only the low 16 bits of b are encoded.
 *
 * The body is wrapped in do { } while (0) so that the macro expands to
 * exactly one statement and is safe under an unbraced if/else. Use it as
 * a statement terminated with a semicolon.
 */
#define	U8_SAVE_HANGUL_AS_UTF8(s, i, j, k, b) \
	do { \
		(s)[(i)] = (uchar_t)(0xE0U | \
		    (((uint32_t)(b) & 0xF000U) >> 12)); \
		(s)[(j)] = (uchar_t)(0x80U | \
		    (((uint32_t)(b) & 0x0FC0U) >> 6)); \
		(s)[(k)] = (uchar_t)(0x80U | ((uint32_t)(b) & 0x003FU)); \
	} while (0)

/* True if u is a modern leading consonant, U+1100 - U+1112. */
#define	U8_HANGUL_JAMO_L(u) \
	((u) >= U8_HANGUL_JAMO_L_FIRST && (u) <= U8_HANGUL_JAMO_L_LAST)

/* True if u is a modern vowel, U+1161 - U+1175. */
#define	U8_HANGUL_JAMO_V(u) \
	((u) >= U8_HANGUL_JAMO_V_FIRST && (u) <= U8_HANGUL_JAMO_V_LAST)

/* True if u is a modern trailing consonant, U+11A8 - U+11C2. */
#define	U8_HANGUL_JAMO_T(u) \
	((u) > U8_HANGUL_JAMO_T_FIRST && (u) <= U8_HANGUL_JAMO_T_LAST)

/* True if u is anywhere in U+1100 - U+11C2, gaps between L/V/T included. */
#define	U8_HANGUL_JAMO(u) \
	((u) >= U8_HANGUL_JAMO_L_FIRST && (u) <= U8_HANGUL_JAMO_T_LAST)

/* True if u is a precomposed Hangul syllable, U+AC00 - U+D7A3. */
#define	U8_HANGUL_SYLLABLE(u) \
	((u) >= U8_HANGUL_SYL_FIRST && (u) <= U8_HANGUL_SYL_LAST)

/* True if the state s is "L seen" and u is a vowel. */
#define	U8_HANGUL_COMPOSABLE_L_V(s, u) \
	((s) == U8_STATE_HANGUL_L && U8_HANGUL_JAMO_V((u)))

/* True if the state s is "LV seen" and u is a trailing consonant. */
#define	U8_HANGUL_COMPOSABLE_LV_T(s, u) \
	((s) == U8_STATE_HANGUL_LV && U8_HANGUL_JAMO_T((u)))
/* The types of decomposition mappings. */
#define	U8_DECOMP_BOTH			(0xF5U)
#define	U8_DECOMP_CANONICAL		(0xF6U)

/* The indicator for 16-bit table. */
#define	U8_16BIT_TABLE_INDICATOR	(0x8000U)

/*
 * The following are some convenience macros.
 *
 * The statement-like macros expand to exactly one statement (an expression
 * or a do { } while (0) block) with no trailing semicolon of their own, so
 * that they behave like a function call under an unbraced if/else. Use
 * them as statements terminated with a semicolon.
 */

/*
 * Assemble the three bytes of a three byte UTF-8 character into a scalar
 * value and assign it to u. The bytes are not validated.
 */
#define	U8_PUT_3BYTES_INTO_UTF32(u, b1, b2, b3) \
	((u) = (((uint32_t)(b1) & 0x0F) << 12) | \
	    (((uint32_t)(b2) & 0x3F) << 6) | \
	    ((uint32_t)(b3) & 0x3F))

/* Exchange a and b using t as the temporary. */
#define	U8_SIMPLE_SWAP(a, b, t) \
	do { \
		(t) = (a); \
		(a) = (b); \
		(b) = (t); \
	} while (0)

/* ASCII-only case mappings; any other value is returned unchanged. */
#define	U8_ASCII_TOUPPER(c) \
	(((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 'A' : (c))

#define	U8_ASCII_TOLOWER(c) \
	(((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' + 'a' : (c))

#define	U8_ISASCII(c)			(((uchar_t)(c)) < 0x80U)

/*
 * Swap two combining marks, with indices a and b, inside the byte vector
 * u8s and update their bookkeeping entries in start[], disp[], and
 * comb_class[].
 *
 * The macro assumes that the two characters that are to be swapped are
 * adjacent to each other and that 'a' comes before 'b'. If the assumptions
 * are not met, the result is wrong.
 *
 * The macro is not self-contained: it uses k, tc, u8s, u8t, start, disp,
 * and comb_class from the scope of the caller.
 */
#define	U8_SWAP_COMB_MARKS(a, b) \
	do { \
		/* Save the bytes of 'a'. */ \
		for (k = 0; k < disp[(a)]; k++) \
			u8t[k] = u8s[start[(a)] + k]; \
		/* Move 'b' down to where 'a' started. */ \
		for (k = 0; k < disp[(b)]; k++) \
			u8s[start[(a)] + k] = u8s[start[(b)] + k]; \
		/* 'a' now follows 'b'; put the saved bytes back there. */ \
		start[(b)] = start[(a)] + disp[(b)]; \
		for (k = 0; k < disp[(a)]; k++) \
			u8s[start[(b)] + k] = u8t[k]; \
		U8_SIMPLE_SWAP(comb_class[(a)], comb_class[(b)], tc); \
		U8_SIMPLE_SWAP(disp[(a)], disp[(b)], tc); \
	} while (0)
/*
 * The possible states during normalization. The enumerators are numbered
 * consecutively from 0 (U8_STATE_START) to 6 (U8_STATE_COMBINING_MARK).
 */
typedef enum {
	U8_STATE_START,			/* 0 */
	U8_STATE_HANGUL_L,		/* 1: leading consonant seen */
	U8_STATE_HANGUL_LV,		/* 2: leading consonant + vowel */
	U8_STATE_HANGUL_LVT,		/* 3: L + V + trailing consonant */
	U8_STATE_HANGUL_V,		/* 4: vowel not preceded by L */
	U8_STATE_HANGUL_T,		/* 5: trailing not preceded by LV */
	U8_STATE_COMBINING_MARK		/* 6 */
} u8_normalization_states_t;
*4703203dSis
/*
 * The three vectors below are used to check that the bytes of a given
 * UTF-8 character are valid and do not contain any malformed byte values.
 *
 * UTF-8 used to have a rather relaxed binary representation, but that led
 * to security related issues, so the Unicode Consortium defined and
 * announced the UTF-8 Corrigendum at Unicode 3.1 and then refined it once
 * more at Unicode 3.2. The three tables are based on that.
 */

/*
 * True if c cannot be a third or fourth byte of a UTF-8 character, i.e.,
 * if it lies outside of 0x80 - 0xBF. The second byte has tighter,
 * first-byte dependent limits and is checked against the tables instead.
 */
#define	U8_ILLEGAL_NEXT_BYTE_COMMON(c)	(!((c) >= 0x80 && (c) <= 0xBF))
*4703203dSis
*4703203dSis#define	I_				U8_ILLEGAL_CHAR
*4703203dSis#define	O_				U8_OUT_OF_RANGE_CHAR
*4703203dSis
*4703203dSisconst int8_t u8_number_of_bytes[0x100] = {
*4703203dSis	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
*4703203dSis	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
*4703203dSis	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
*4703203dSis	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
*4703203dSis	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
*4703203dSis	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
*4703203dSis	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
*4703203dSis	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
*4703203dSis
*4703203dSis/*	80  81  82  83  84  85  86  87  88  89  8A  8B  8C  8D  8E  8F  */
*4703203dSis	I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_,
*4703203dSis
*4703203dSis/*  	90  91  92  93  94  95  96  97  98  99  9A  9B  9C  9D  9E  9F  */
*4703203dSis	I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_,
*4703203dSis
*4703203dSis/*  	A0  A1  A2  A3  A4  A5  A6  A7  A8  A9  AA  AB  AC  AD  AE  AF  */
*4703203dSis	I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_,
*4703203dSis
*4703203dSis/*	B0  B1  B2  B3  B4  B5  B6  B7  B8  B9  BA  BB  BC  BD  BE  BF  */
*4703203dSis	I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_,
*4703203dSis
*4703203dSis/*	C0  C1  C2  C3  C4  C5  C6  C7  C8  C9  CA  CB  CC  CD  CE  CF  */
*4703203dSis	I_, I_, 2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
*4703203dSis
*4703203dSis/*	D0  D1  D2  D3  D4  D5  D6  D7  D8  D9  DA  DB  DC  DD  DE  DF  */
*4703203dSis	2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
*4703203dSis
*4703203dSis/*	E0  E1  E2  E3  E4  E5  E6  E7  E8  E9  EA  EB  EC  ED  EE  EF  */
*4703203dSis	3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
*4703203dSis
*4703203dSis/*	F0  F1  F2  F3  F4  F5  F6  F7  F8  F9  FA  FB  FC  FD  FE  FF  */
*4703203dSis	4,  4,  4,  4,  4,  O_, O_, O_, O_, O_, O_, O_, O_, O_, O_, O_,
*4703203dSis};
*4703203dSis
*4703203dSis#undef	I_
*4703203dSis#undef	O_
*4703203dSis
/*
 * u8_valid_min_2nd_byte[] is indexed by the first byte of a multi-byte
 * UTF-8 character and yields the smallest value allowed for its second
 * byte. Entries for bytes that do not start a multi-byte character are 0.
 */
const uint8_t u8_valid_min_2nd_byte[0x100] = {
	/* 0x00 - 0xBF: not the first byte of a multi-byte character. */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

	0,    0,    0x80, 0x80, 0x80, 0x80, 0x80, 0x80,	/* C0 - C7 */
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,	/* C8 - CF */
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,	/* D0 - D7 */
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,	/* D8 - DF */
	0xa0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,	/* E0 - E7 */
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,	/* E8 - EF */
	0x90, 0x80, 0x80, 0x80, 0x80, 0,    0,    0,	/* F0 - F7 */
	0,    0,    0,    0,    0,    0,    0,    0,	/* F8 - FF */
};
*4703203dSis
/*
 * u8_valid_max_2nd_byte[] is indexed by the first byte of a multi-byte
 * UTF-8 character and yields the biggest value allowed for its second
 * byte. Entries for bytes that do not start a multi-byte character are 0.
 */
const uint8_t u8_valid_max_2nd_byte[0x100] = {
	/* 0x00 - 0xBF: not the first byte of a multi-byte character. */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

	0,    0,    0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,	/* C0 - C7 */
	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,	/* C8 - CF */
	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,	/* D0 - D7 */
	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,	/* D8 - DF */
	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,	/* E0 - E7 */
	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0x9f, 0xbf, 0xbf,	/* E8 - EF */
	0xbf, 0xbf, 0xbf, 0xbf, 0x8f, 0,    0,    0,	/* F0 - F7 */
	0,    0,    0,    0,    0,    0,    0,    0,	/* F8 - FF */
};
*4703203dSis
*4703203dSis
*4703203dSis/*
*4703203dSis * The u8_validate() validates on the given UTF-8 character string and
*4703203dSis * calculate the byte length. It is quite similar to mblen(3C) except that
*4703203dSis * this will validate against the list of characters if required and
*4703203dSis * specific to UTF-8 and Unicode.
*4703203dSis */
*4703203dSisint
*4703203dSisu8_validate(char *u8str, size_t n, char **list, int flag, int *errno)
*4703203dSis{
*4703203dSis	uchar_t *ib;
*4703203dSis	uchar_t *ibtail;
*4703203dSis	uchar_t **p;
*4703203dSis	uchar_t *s1;
*4703203dSis	uchar_t *s2;
*4703203dSis	uchar_t f;
*4703203dSis	int sz;
*4703203dSis	size_t i;
*4703203dSis	int ret_val;
*4703203dSis	boolean_t second;
*4703203dSis	boolean_t no_need_to_validate_entire;
*4703203dSis	boolean_t check_additional;
*4703203dSis	boolean_t validate_ucs2_range_only;
*4703203dSis
*4703203dSis	if (! u8str)
*4703203dSis		return (0);
*4703203dSis
*4703203dSis	ib = (uchar_t *)u8str;
*4703203dSis	ibtail = ib + n;
*4703203dSis
*4703203dSis	ret_val = 0;
*4703203dSis
*4703203dSis	no_need_to_validate_entire = ! (flag & U8_VALIDATE_ENTIRE);
*4703203dSis	check_additional = flag & U8_VALIDATE_CHECK_ADDITIONAL;
*4703203dSis	validate_ucs2_range_only = flag & U8_VALIDATE_UCS2_RANGE;
*4703203dSis
*4703203dSis	while (ib < ibtail) {
*4703203dSis		/*
*4703203dSis		 * The first byte of a UTF-8 character tells how many
*4703203dSis		 * bytes will follow for the character. If the first byte
*4703203dSis		 * is an illegal byte value or out of range value, we just
*4703203dSis		 * return -1 with an appropriate error number.
*4703203dSis		 */
*4703203dSis		sz = u8_number_of_bytes[*ib];
*4703203dSis		if (sz == U8_ILLEGAL_CHAR) {
*4703203dSis			*errno = EILSEQ;
*4703203dSis			return (-1);
*4703203dSis		}
*4703203dSis
*4703203dSis		if (sz == U8_OUT_OF_RANGE_CHAR ||
*4703203dSis		    (validate_ucs2_range_only && sz > U8_MAX_BYTES_UCS2)) {
*4703203dSis			*errno = ERANGE;
*4703203dSis			return (-1);
*4703203dSis		}
*4703203dSis
*4703203dSis		/*
*4703203dSis		 * If we don't have enough bytes to check on, that's also
*4703203dSis		 * an error. As you can see, we give illegal byte sequence
*4703203dSis		 * checking higher priority then EINVAL cases.
*4703203dSis		 */
*4703203dSis		if ((ibtail - ib) < sz) {
*4703203dSis			*errno = EINVAL;
*4703203dSis			return (-1);
*4703203dSis		}
*4703203dSis
*4703203dSis		if (sz == 1) {
*4703203dSis			ib++;
*4703203dSis			ret_val++;
*4703203dSis		} else {
*4703203dSis			/*
*4703203dSis			 * Check on the multi-byte UTF-8 character. For more
*4703203dSis			 * details on this, see comment added for the used
*4703203dSis			 * data structures at the beginning of the file.
*4703203dSis			 */
*4703203dSis			f = *ib++;
*4703203dSis			ret_val++;
*4703203dSis			second = B_TRUE;
*4703203dSis			for (i = 1; i < sz; i++) {
*4703203dSis				if (second) {
*4703203dSis					if (*ib < u8_valid_min_2nd_byte[f] ||
*4703203dSis					    *ib > u8_valid_max_2nd_byte[f]) {
*4703203dSis						*errno = EILSEQ;
*4703203dSis						return (-1);
*4703203dSis					}
*4703203dSis					second = B_FALSE;
*4703203dSis				} else if (U8_ILLEGAL_NEXT_BYTE_COMMON(*ib)) {
*4703203dSis					*errno = EILSEQ;
*4703203dSis					return (-1);
*4703203dSis				}
*4703203dSis				ib++;
*4703203dSis				ret_val++;
*4703203dSis			}
*4703203dSis		}
*4703203dSis
*4703203dSis		if (check_additional) {
*4703203dSis			for (p = (uchar_t **)list, i = 0; p[i]; i++) {
*4703203dSis				s1 = ib - sz;
*4703203dSis				s2 = p[i];
*4703203dSis				while (s1 < ib) {
*4703203dSis					if (*s1 != *s2 || *s2 == '\0')
*4703203dSis						break;
*4703203dSis					s1++;
*4703203dSis					s2++;
*4703203dSis				}
*4703203dSis
*4703203dSis				if (s1 >= ib && *s2 == '\0') {
*4703203dSis					*errno = EBADF;
*4703203dSis					return (-1);
*4703203dSis				}
*4703203dSis			}
*4703203dSis		}
*4703203dSis
*4703203dSis		if (no_need_to_validate_entire)
*4703203dSis			break;
*4703203dSis	}
*4703203dSis
*4703203dSis	return (ret_val);
*4703203dSis}
*4703203dSis
*4703203dSis/*
*4703203dSis * The do_case_conv() looks at the mapping tables and returns found
*4703203dSis * bytes if any. If not found, the input bytes are returned. The function
*4703203dSis * always terminate the return bytes with a null character assuming that
*4703203dSis * there are plenty of room to do so.
*4703203dSis *
*4703203dSis * The case conversions are simple case conversions mapping a character to
*4703203dSis * another character as specified in the Unicode data. The byte size of
*4703203dSis * the mapped character could be different from that of the input character.
*4703203dSis *
*4703203dSis * The return value is the byte length of the returned character excluding
*4703203dSis * the terminating null byte.
*4703203dSis */
*4703203dSisstatic size_t
*4703203dSisdo_case_conv(int uv, uchar_t *u8s, uchar_t *s, int sz, boolean_t is_it_toupper)
*4703203dSis{
*4703203dSis	size_t i;
*4703203dSis	uint16_t b1 = 0;
*4703203dSis	uint16_t b2 = 0;
*4703203dSis	uint16_t b3 = 0;
*4703203dSis	uint16_t b3_tbl;
*4703203dSis	uint16_t b3_base;
*4703203dSis	uint16_t b4 = 0;
*4703203dSis	size_t start_id;
*4703203dSis	size_t end_id;
*4703203dSis
*4703203dSis	/*
*4703203dSis	 * At this point, the only possible values for sz are 2, 3, and 4.
*4703203dSis	 * The u8s should point to a vector that is well beyond the size of
*4703203dSis	 * 5 bytes.
*4703203dSis	 */
*4703203dSis	if (sz == 2) {
*4703203dSis		b3 = u8s[0] = s[0];
*4703203dSis		b4 = u8s[1] = s[1];
*4703203dSis	} else if (sz == 3) {
*4703203dSis		b2 = u8s[0] = s[0];
*4703203dSis		b3 = u8s[1] = s[1];
*4703203dSis		b4 = u8s[2] = s[2];
*4703203dSis	} else if (sz == 4) {
*4703203dSis		b1 = u8s[0] = s[0];
*4703203dSis		b2 = u8s[1] = s[1];
*4703203dSis		b3 = u8s[2] = s[2];
*4703203dSis		b4 = u8s[3] = s[3];
*4703203dSis	} else {
*4703203dSis		/* This is not possible but just in case as a fallback. */
*4703203dSis		if (is_it_toupper)
*4703203dSis			*u8s = U8_ASCII_TOUPPER(*s);
*4703203dSis		else
*4703203dSis			*u8s = U8_ASCII_TOLOWER(*s);
*4703203dSis		u8s[1] = '\0';
*4703203dSis
*4703203dSis		return (1);
*4703203dSis	}
*4703203dSis	u8s[sz] = '\0';
*4703203dSis
*4703203dSis	/*
*4703203dSis	 * Let's find out if we have a corresponding character.
*4703203dSis	 */
*4703203dSis	b1 = u8_common_b1_tbl[uv][b1];
*4703203dSis	if (b1 == U8_TBL_ELEMENT_NOT_DEF)
*4703203dSis		return ((size_t)sz);
*4703203dSis
*4703203dSis	b2 = u8_case_common_b2_tbl[uv][b1][b2];
*4703203dSis	if (b2 == U8_TBL_ELEMENT_NOT_DEF)
*4703203dSis		return ((size_t)sz);
*4703203dSis
*4703203dSis	if (is_it_toupper) {
*4703203dSis		b3_tbl = u8_toupper_b3_tbl[uv][b2][b3].tbl_id;
*4703203dSis		if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF)
*4703203dSis			return ((size_t)sz);
*4703203dSis
*4703203dSis		start_id = u8_toupper_b4_tbl[uv][b3_tbl][b4];
*4703203dSis		end_id = u8_toupper_b4_tbl[uv][b3_tbl][b4 + 1];
*4703203dSis
*4703203dSis		/* Either there is no match or an error at the table. */
*4703203dSis		if (start_id >= end_id || (end_id - start_id) > U8_MB_CUR_MAX)
*4703203dSis			return ((size_t)sz);
*4703203dSis
*4703203dSis		b3_base = u8_toupper_b3_tbl[uv][b2][b3].base;
*4703203dSis
*4703203dSis		for (i = 0; start_id < end_id; start_id++)
*4703203dSis			u8s[i++] = u8_toupper_final_tbl[uv][b3_base + start_id];
*4703203dSis	} else {
*4703203dSis		b3_tbl = u8_tolower_b3_tbl[uv][b2][b3].tbl_id;
*4703203dSis		if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF)
*4703203dSis			return ((size_t)sz);
*4703203dSis
*4703203dSis		start_id = u8_tolower_b4_tbl[uv][b3_tbl][b4];
*4703203dSis		end_id = u8_tolower_b4_tbl[uv][b3_tbl][b4 + 1];
*4703203dSis
*4703203dSis		if (start_id >= end_id || (end_id - start_id) > U8_MB_CUR_MAX)
*4703203dSis			return ((size_t)sz);
*4703203dSis
*4703203dSis		b3_base = u8_tolower_b3_tbl[uv][b2][b3].base;
*4703203dSis
*4703203dSis		for (i = 0; start_id < end_id; start_id++)
*4703203dSis			u8s[i++] = u8_tolower_final_tbl[uv][b3_base + start_id];
*4703203dSis	}
*4703203dSis
*4703203dSis	/*
*4703203dSis	 * If i is still zero, that means there is no corresponding character.
*4703203dSis	 */
*4703203dSis	if (i == 0)
*4703203dSis		return ((size_t)sz);
*4703203dSis
*4703203dSis	u8s[i] = '\0';
*4703203dSis
*4703203dSis	return (i);
*4703203dSis}
*4703203dSis
*4703203dSis/*
*4703203dSis * The do_case_compare() function compares the two input strings, s1 and s2,
*4703203dSis * one character at a time doing case conversions if applicable and return
*4703203dSis * the comparison result as like strcmp().
*4703203dSis *
*4703203dSis * Since, in empirical sense, most of text data are 7-bit ASCII characters,
*4703203dSis * we treat the 7-bit ASCII characters as a special case trying to yield
*4703203dSis * faster processing time.
*4703203dSis */
*4703203dSisstatic int
*4703203dSisdo_case_compare(size_t uv, uchar_t *s1, uchar_t *s2, size_t n1,
*4703203dSis	size_t n2, boolean_t is_it_toupper, int *errno)
*4703203dSis{
*4703203dSis	int f;
*4703203dSis	int sz1;
*4703203dSis	int sz2;
*4703203dSis	size_t j;
*4703203dSis	size_t i1;
*4703203dSis	size_t i2;
*4703203dSis	uchar_t u8s1[U8_MB_CUR_MAX + 1];
*4703203dSis	uchar_t u8s2[U8_MB_CUR_MAX + 1];
*4703203dSis
*4703203dSis	i1 = i2 = 0;
*4703203dSis	while (i1 < n1 && i2 < n2) {
*4703203dSis		/*
*4703203dSis		 * Find out what would be the byte length for this UTF-8
*4703203dSis		 * character at string s1 and also find out if this is
*4703203dSis		 * an illegal start byte or not and if so, issue a proper
*4703203dSis		 * errno and yet treat this byte as a character.
*4703203dSis		 */
*4703203dSis		sz1 = u8_number_of_bytes[*s1];
*4703203dSis		if (sz1 < 0) {
*4703203dSis			*errno = EILSEQ;
*4703203dSis			sz1 = 1;
*4703203dSis		}
*4703203dSis
*4703203dSis		/*
*4703203dSis		 * For 7-bit ASCII characters mainly, we do a quick case
*4703203dSis		 * conversion right at here.
*4703203dSis		 *
*4703203dSis		 * If we don't have enough bytes for this character, issue
*4703203dSis		 * an EINVAL error and use what are available.
*4703203dSis		 *
*4703203dSis		 * If we have enough bytes, find out if there is
*4703203dSis		 * a corresponding uppercase character and if so, copy over
*4703203dSis		 * the bytes for a comparison later. If there is no
*4703203dSis		 * corresponding uppercase character, then, use what we have
*4703203dSis		 * for the comparison.
*4703203dSis		 */
*4703203dSis		if (sz1 == 1) {
*4703203dSis			if (is_it_toupper)
*4703203dSis				u8s1[0] = U8_ASCII_TOUPPER(*s1);
*4703203dSis			else
*4703203dSis				u8s1[0] = U8_ASCII_TOLOWER(*s1);
*4703203dSis			s1++;
*4703203dSis			u8s1[1] = '\0';
*4703203dSis		} else if ((i1 + sz1) > n1) {
*4703203dSis			*errno = EINVAL;
*4703203dSis			for (j = 0; (i1 + j) < n1; )
*4703203dSis				u8s1[j++] = *s1++;
*4703203dSis			u8s1[j] = '\0';
*4703203dSis		} else {
*4703203dSis			(void) do_case_conv(uv, u8s1, s1, sz1, is_it_toupper);
*4703203dSis			s1 += sz1;
*4703203dSis		}
*4703203dSis
*4703203dSis		/* Do the same for the string s2. */
*4703203dSis		sz2 = u8_number_of_bytes[*s2];
*4703203dSis		if (sz2 < 0) {
*4703203dSis			*errno = EILSEQ;
*4703203dSis			sz2 = 1;
*4703203dSis		}
*4703203dSis
*4703203dSis		if (sz2 == 1) {
*4703203dSis			if (is_it_toupper)
*4703203dSis				u8s2[0] = U8_ASCII_TOUPPER(*s2);
*4703203dSis			else
*4703203dSis				u8s2[0] = U8_ASCII_TOLOWER(*s2);
*4703203dSis			s2++;
*4703203dSis			u8s2[1] = '\0';
*4703203dSis		} else if ((i2 + sz2) > n2) {
*4703203dSis			*errno = EINVAL;
*4703203dSis			for (j = 0; (i2 + j) < n2; )
*4703203dSis				u8s2[j++] = *s2++;
*4703203dSis			u8s2[j] = '\0';
*4703203dSis		} else {
*4703203dSis			(void) do_case_conv(uv, u8s2, s2, sz2, is_it_toupper);
*4703203dSis			s2 += sz2;
*4703203dSis		}
*4703203dSis
*4703203dSis		/* Now compare the two characters. */
*4703203dSis		if (sz1 == 1 && sz2 == 1) {
*4703203dSis			if (*u8s1 > *u8s2)
*4703203dSis				return (1);
*4703203dSis			if (*u8s1 < *u8s2)
*4703203dSis				return (-1);
*4703203dSis		} else {
*4703203dSis			f = strcmp((const char *)u8s1, (const char *)u8s2);
*4703203dSis			if (f != 0)
*4703203dSis				return (f);
*4703203dSis		}
*4703203dSis
*4703203dSis		/*
*4703203dSis		 * They were the same. Let's move on to the next
*4703203dSis		 * characters then.
*4703203dSis		 */
*4703203dSis		i1 += sz1;
*4703203dSis		i2 += sz2;
*4703203dSis	}
*4703203dSis
*4703203dSis	/*
*4703203dSis	 * We compared until the end of either or both strings.
*4703203dSis	 *
*4703203dSis	 * If we reached to or went over the ends for the both, that means
*4703203dSis	 * they are the same.
*4703203dSis	 *
*4703203dSis	 * If we reached only one of the two ends, that means the other string
*4703203dSis	 * has something which then the fact can be used to determine
*4703203dSis	 * the return value.
*4703203dSis	 */
*4703203dSis	if (i1 >= n1) {
*4703203dSis		if (i2 >= n2)
*4703203dSis			return (0);
*4703203dSis		return (-1);
*4703203dSis	}
*4703203dSis	return (1);
*4703203dSis}
*4703203dSis
*4703203dSis/*
*4703203dSis * The combining_class() function checks on the given bytes and find out
*4703203dSis * the corresponding Unicode combining class value. The return value 0 means
*4703203dSis * it is a Starter. Any illegal UTF-8 character will also be treated as
*4703203dSis * a Starter.
*4703203dSis */
*4703203dSisstatic uchar_t
*4703203dSiscombining_class(size_t uv, uchar_t *s, size_t sz)
*4703203dSis{
*4703203dSis	uint16_t b1 = 0;
*4703203dSis	uint16_t b2 = 0;
*4703203dSis	uint16_t b3 = 0;
*4703203dSis	uint16_t b4 = 0;
*4703203dSis
*4703203dSis	if (sz == 1 || sz > 4)
*4703203dSis		return (0);
*4703203dSis
*4703203dSis	if (sz == 2) {
*4703203dSis		b3 = s[0];
*4703203dSis		b4 = s[1];
*4703203dSis	} else if (sz == 3) {
*4703203dSis		b2 = s[0];
*4703203dSis		b3 = s[1];
*4703203dSis		b4 = s[2];
*4703203dSis	} else if (sz == 4) {
*4703203dSis		b1 = s[0];
*4703203dSis		b2 = s[1];
*4703203dSis		b3 = s[2];
*4703203dSis		b4 = s[3];
*4703203dSis	}
*4703203dSis
*4703203dSis	b1 = u8_common_b1_tbl[uv][b1];
*4703203dSis	if (b1 == U8_TBL_ELEMENT_NOT_DEF)
*4703203dSis		return (0);
*4703203dSis
*4703203dSis	b2 = u8_combining_class_b2_tbl[uv][b1][b2];
*4703203dSis	if (b2 == U8_TBL_ELEMENT_NOT_DEF)
*4703203dSis		return (0);
*4703203dSis
*4703203dSis	b3 = u8_combining_class_b3_tbl[uv][b2][b3];
*4703203dSis	if (b3 == U8_TBL_ELEMENT_NOT_DEF)
*4703203dSis		return (0);
*4703203dSis
*4703203dSis	return (u8_combining_class_b4_tbl[uv][b3][b4]);
*4703203dSis}
*4703203dSis
*4703203dSis/*
*4703203dSis * The do_decomp() function finds out a matching decomposition if any
*4703203dSis * and return. If there is no match, the input bytes are copied and returned.
*4703203dSis * The function also checks if there is a Hangul, decomposes it if necessary
*4703203dSis * and returns.
*4703203dSis *
*4703203dSis * To save time, a single byte 7-bit ASCII character should be handled by
*4703203dSis * the caller.
*4703203dSis *
*4703203dSis * The function returns the number of bytes returned sans always terminating
*4703203dSis * the null byte. It will also return a state that will tell if there was
*4703203dSis * a Hangul character decomposed which then will be used by the caller.
*4703203dSis */
*4703203dSisstatic size_t
*4703203dSisdo_decomp(size_t uv, uchar_t *u8s, uchar_t *s, int sz,
*4703203dSis	boolean_t canonical_decomposition, u8_normalization_states_t *state)
*4703203dSis{
*4703203dSis	uint16_t b1 = 0;
*4703203dSis	uint16_t b2 = 0;
*4703203dSis	uint16_t b3 = 0;
*4703203dSis	uint16_t b3_tbl;
*4703203dSis	uint16_t b3_base;
*4703203dSis	uint16_t b4 = 0;
*4703203dSis	size_t start_id;
*4703203dSis	size_t end_id;
*4703203dSis	size_t i;
*4703203dSis	uint32_t u1;
*4703203dSis
*4703203dSis	if (sz == 2) {
*4703203dSis		b3 = u8s[0] = s[0];
*4703203dSis		b4 = u8s[1] = s[1];
*4703203dSis		u8s[2] = '\0';
*4703203dSis	} else if (sz == 3) {
*4703203dSis		/* Convert it to a Unicode scalar value. */
*4703203dSis		U8_PUT_3BYTES_INTO_UTF32(u1, s[0], s[1], s[2]);
*4703203dSis
*4703203dSis		/*
*4703203dSis		 * If this is a Hangul syllable, we decompose it into
*4703203dSis		 * a leading consonant, a vowel, and an optional trailing
*4703203dSis		 * consonant and then return.
*4703203dSis		 */
*4703203dSis		if (U8_HANGUL_SYLLABLE(u1)) {
*4703203dSis			u1 -= U8_HANGUL_SYL_FIRST;
*4703203dSis
*4703203dSis			b1 = U8_HANGUL_JAMO_L_FIRST + u1 / U8_HANGUL_VT_COUNT;
*4703203dSis			b2 = U8_HANGUL_JAMO_V_FIRST + (u1 % U8_HANGUL_VT_COUNT)
*4703203dSis			    / U8_HANGUL_T_COUNT;
*4703203dSis			b3 = u1 % U8_HANGUL_T_COUNT;
*4703203dSis
*4703203dSis			U8_SAVE_HANGUL_AS_UTF8(u8s, 0, 1, 2, b1);
*4703203dSis			U8_SAVE_HANGUL_AS_UTF8(u8s, 3, 4, 5, b2);
*4703203dSis			if (b3) {
*4703203dSis				b3 += U8_HANGUL_JAMO_T_FIRST;
*4703203dSis				U8_SAVE_HANGUL_AS_UTF8(u8s, 6, 7, 8, b3);
*4703203dSis
*4703203dSis				u8s[9] = '\0';
*4703203dSis				*state = U8_STATE_HANGUL_LVT;
*4703203dSis				return (9);
*4703203dSis			}
*4703203dSis
*4703203dSis			u8s[6] = '\0';
*4703203dSis			*state = U8_STATE_HANGUL_LV;
*4703203dSis			return (6);
*4703203dSis		}
*4703203dSis
*4703203dSis		b2 = u8s[0] = s[0];
*4703203dSis		b3 = u8s[1] = s[1];
*4703203dSis		b4 = u8s[2] = s[2];
*4703203dSis		u8s[3] = '\0';
*4703203dSis
*4703203dSis		/*
*4703203dSis		 * If this is a Hangul Jamo, we know there is nothing
*4703203dSis		 * further that we can decompose.
*4703203dSis		 */
*4703203dSis		if (U8_HANGUL_JAMO_L(u1)) {
*4703203dSis			*state = U8_STATE_HANGUL_L;
*4703203dSis			return (3);
*4703203dSis		}
*4703203dSis
*4703203dSis		if (U8_HANGUL_JAMO_V(u1)) {
*4703203dSis			if (*state == U8_STATE_HANGUL_L)
*4703203dSis				*state = U8_STATE_HANGUL_LV;
*4703203dSis			else
*4703203dSis				*state = U8_STATE_HANGUL_V;
*4703203dSis			return (3);
*4703203dSis		}
*4703203dSis
*4703203dSis		if (U8_HANGUL_JAMO_T(u1)) {
*4703203dSis			if (*state == U8_STATE_HANGUL_LV)
*4703203dSis				*state = U8_STATE_HANGUL_LVT;
*4703203dSis			else
*4703203dSis				*state = U8_STATE_HANGUL_T;
*4703203dSis			return (3);
*4703203dSis		}
*4703203dSis	} else if (sz == 4) {
*4703203dSis		b1 = u8s[0] = s[0];
*4703203dSis		b2 = u8s[1] = s[1];
*4703203dSis		b3 = u8s[2] = s[2];
*4703203dSis		b4 = u8s[3] = s[3];
*4703203dSis		u8s[4] = '\0';
*4703203dSis	} else {
*4703203dSis		/*
*4703203dSis		 * This is a fallback and should not happen if the function
*4703203dSis		 * was called properly.
*4703203dSis		 */
*4703203dSis		u8s[0] = s[0];
*4703203dSis		u8s[1] = '\0';
*4703203dSis		*state = U8_STATE_START;
*4703203dSis		return (1);
*4703203dSis	}
*4703203dSis
*4703203dSis	/*
*4703203dSis	 * At this point, this rountine does not know what it would get.
*4703203dSis	 * The caller should sort it out if the state isn't a Hangul one.
*4703203dSis	 */
*4703203dSis	*state = U8_STATE_START;
*4703203dSis
*4703203dSis	/* Try to find matching decomposition mapping byte sequence. */
*4703203dSis	b1 = u8_common_b1_tbl[uv][b1];
*4703203dSis	if (b1 == U8_TBL_ELEMENT_NOT_DEF)
*4703203dSis		return ((size_t)sz);
*4703203dSis
*4703203dSis	b2 = u8_decomp_b2_tbl[uv][b1][b2];
*4703203dSis	if (b2 == U8_TBL_ELEMENT_NOT_DEF)
*4703203dSis		return ((size_t)sz);
*4703203dSis
*4703203dSis	b3_tbl = u8_decomp_b3_tbl[uv][b2][b3].tbl_id;
*4703203dSis	if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF)
*4703203dSis		return ((size_t)sz);
*4703203dSis
*4703203dSis	/*
*4703203dSis	 * If b3_tbl is bigger than or equal to U8_16BIT_TABLE_INDICATOR
*4703203dSis	 * which is 0x8000, this means we couldn't fit the mappings into
*4703203dSis	 * the cardinality of a unsigned byte.
*4703203dSis	 */
*4703203dSis	if (b3_tbl >= U8_16BIT_TABLE_INDICATOR) {
*4703203dSis		b3_tbl -= U8_16BIT_TABLE_INDICATOR;
*4703203dSis		start_id = u8_decomp_b4_16bit_tbl[uv][b3_tbl][b4];
*4703203dSis		end_id = u8_decomp_b4_16bit_tbl[uv][b3_tbl][b4 + 1];
*4703203dSis	} else {
*4703203dSis		start_id = u8_decomp_b4_tbl[uv][b3_tbl][b4];
*4703203dSis		end_id = u8_decomp_b4_tbl[uv][b3_tbl][b4 + 1];
*4703203dSis	}
*4703203dSis
*4703203dSis	/* This also means there wasn't any matching decomposition. */
*4703203dSis	if (start_id >= end_id)
*4703203dSis		return ((size_t)sz);
*4703203dSis
*4703203dSis	/*
*4703203dSis	 * The final table for decomposition mappings has three types of
*4703203dSis	 * byte sequences depending on whether a mapping is for compatibility
*4703203dSis	 * decomposition, canonical decomposition, or both like the following:
*4703203dSis	 *
*4703203dSis	 * (1) Compatibility decomposition mappings:
*4703203dSis	 *
*4703203dSis	 *	+---+---+-...-+---+
*4703203dSis	 *	| B0| B1| ... | Bm|
*4703203dSis	 *	+---+---+-...-+---+
*4703203dSis	 *
*4703203dSis	 *	The first byte, B0, is always less then 0xF5 (U8_DECOMP_BOTH).
*4703203dSis	 *
*4703203dSis	 * (2) Canonical decomposition mappings:
*4703203dSis	 *
*4703203dSis	 *	+---+---+---+-...-+---+
*4703203dSis	 *	| T | b0| b1| ... | bn|
*4703203dSis	 *	+---+---+---+-...-+---+
*4703203dSis	 *
*4703203dSis	 *	where the first byte, T, is 0xF6 (U8_DECOMP_CANONICAL).
*4703203dSis	 *
*4703203dSis	 * (3) Both mappings:
*4703203dSis	 *
*4703203dSis	 *	+---+---+---+---+-...-+---+---+---+-...-+---+
*4703203dSis	 *	| T | D | b0| b1| ... | bn| B0| B1| ... | Bm|
*4703203dSis	 *	+---+---+---+---+-...-+---+---+---+-...-+---+
*4703203dSis	 *
*4703203dSis	 *	where T is 0xF5 (U8_DECOMP_BOTH) and D is a displacement
*4703203dSis	 *	byte, b0 to bn are canonical mapping bytes and B0 to Bm are
*4703203dSis	 *	compatibility mapping bytes.
*4703203dSis	 *
*4703203dSis	 * Note that compatibility decomposition means doing recursive
*4703203dSis	 * decompositions using both compatibility decomposition mappings and
*4703203dSis	 * canonical decomposition mappings. On the other hand, canonical
*4703203dSis	 * decomposition means doing recursive decompositions using only
*4703203dSis	 * canonical decomposition mappings. Since the table we have has gone
*4703203dSis	 * through the recursions already, we do not need to do so during
*4703203dSis	 * runtime, i.e., the table has been completely flattened out
*4703203dSis	 * already.
*4703203dSis	 */
*4703203dSis
*4703203dSis	b3_base = u8_decomp_b3_tbl[uv][b2][b3].base;
*4703203dSis
*4703203dSis	/* Get the type, T, of the byte sequence. */
*4703203dSis	b1 = u8_decomp_final_tbl[uv][b3_base + start_id];
*4703203dSis
*4703203dSis	/*
*4703203dSis	 * If necessary, adjust start_id, end_id, or both. Note that if
*4703203dSis	 * this is compatibility decomposition mapping, there is no
*4703203dSis	 * adjustment.
*4703203dSis	 */
*4703203dSis	if (canonical_decomposition) {
*4703203dSis		/* Is the mapping only for compatibility decomposition? */
*4703203dSis		if (b1 < U8_DECOMP_BOTH)
*4703203dSis			return ((size_t)sz);
*4703203dSis
*4703203dSis		start_id++;
*4703203dSis
*4703203dSis		if (b1 == U8_DECOMP_BOTH) {
*4703203dSis			end_id = start_id +
*4703203dSis			    u8_decomp_final_tbl[uv][b3_base + start_id];
*4703203dSis			start_id++;
*4703203dSis		}
*4703203dSis	} else {
*4703203dSis		/*
*4703203dSis		 * Unless this is a compatibility decomposition mapping,
*4703203dSis		 * we adjust the start_id.
*4703203dSis		 */
*4703203dSis		if (b1 == U8_DECOMP_BOTH) {
*4703203dSis			start_id++;
*4703203dSis			start_id += u8_decomp_final_tbl[uv][b3_base + start_id];
*4703203dSis		} else if (b1 == U8_DECOMP_CANONICAL) {
*4703203dSis			start_id++;
*4703203dSis		}
*4703203dSis	}
*4703203dSis
*4703203dSis	for (i = 0; start_id < end_id; start_id++)
*4703203dSis		u8s[i++] = u8_decomp_final_tbl[uv][b3_base + start_id];
*4703203dSis	u8s[i] = '\0';
*4703203dSis
*4703203dSis	return (i);
*4703203dSis}
*4703203dSis
*4703203dSis/*
*4703203dSis * The find_composition_start() function uses the character bytes given and
*4703203dSis * find out the matching composition mappings if any and return the address
*4703203dSis * to the composition mappings as explained in the do_composition().
*4703203dSis */
*4703203dSisstatic uchar_t *
*4703203dSisfind_composition_start(size_t uv, uchar_t *s, size_t sz)
*4703203dSis{
*4703203dSis	uint16_t b1 = 0;
*4703203dSis	uint16_t b2 = 0;
*4703203dSis	uint16_t b3 = 0;
*4703203dSis	uint16_t b3_tbl;
*4703203dSis	uint16_t b3_base;
*4703203dSis	uint16_t b4 = 0;
*4703203dSis	size_t start_id;
*4703203dSis	size_t end_id;
*4703203dSis
*4703203dSis	if (sz == 1) {
*4703203dSis		b4 = s[0];
*4703203dSis	} else if (sz == 2) {
*4703203dSis		b3 = s[0];
*4703203dSis		b4 = s[1];
*4703203dSis	} else if (sz == 3) {
*4703203dSis		b2 = s[0];
*4703203dSis		b3 = s[1];
*4703203dSis		b4 = s[2];
*4703203dSis	} else if (sz == 4) {
*4703203dSis		b1 = s[0];
*4703203dSis		b2 = s[1];
*4703203dSis		b3 = s[2];
*4703203dSis		b4 = s[3];
*4703203dSis	} else {
*4703203dSis		/*
*4703203dSis		 * This is a fallback and should not happen if the function
*4703203dSis		 * was called properly.
*4703203dSis		 */
*4703203dSis		return (NULL);
*4703203dSis	}
*4703203dSis
*4703203dSis	b1 = u8_composition_b1_tbl[uv][b1];
*4703203dSis	if (b1 == U8_TBL_ELEMENT_NOT_DEF)
*4703203dSis		return (NULL);
*4703203dSis
*4703203dSis	b2 = u8_composition_b2_tbl[uv][b1][b2];
*4703203dSis	if (b2 == U8_TBL_ELEMENT_NOT_DEF)
*4703203dSis		return (NULL);
*4703203dSis
*4703203dSis	b3_tbl = u8_composition_b3_tbl[uv][b2][b3].tbl_id;
*4703203dSis	if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF)
*4703203dSis		return (NULL);
*4703203dSis
*4703203dSis	if (b3_tbl >= U8_16BIT_TABLE_INDICATOR) {
*4703203dSis		b3_tbl -= U8_16BIT_TABLE_INDICATOR;
*4703203dSis		start_id = u8_composition_b4_16bit_tbl[uv][b3_tbl][b4];
*4703203dSis		end_id = u8_composition_b4_16bit_tbl[uv][b3_tbl][b4 + 1];
*4703203dSis	} else {
*4703203dSis		start_id = u8_composition_b4_tbl[uv][b3_tbl][b4];
*4703203dSis		end_id = u8_composition_b4_tbl[uv][b3_tbl][b4 + 1];
*4703203dSis	}
*4703203dSis
*4703203dSis	if (start_id >= end_id)
*4703203dSis		return (NULL);
*4703203dSis
*4703203dSis	b3_base = u8_composition_b3_tbl[uv][b2][b3].base;
*4703203dSis
*4703203dSis	return ((uchar_t *)&(u8_composition_final_tbl[uv][b3_base + start_id]));
*4703203dSis}
*4703203dSis
*4703203dSis/*
*4703203dSis * The blocked() function checks on the combining class values of previous
*4703203dSis * characters in this sequence and return whether it is blocked or not.
*4703203dSis */
*4703203dSisstatic boolean_t
*4703203dSisblocked(uchar_t *comb_class, size_t last)
*4703203dSis{
*4703203dSis	uchar_t my_comb_class;
*4703203dSis	size_t i;
*4703203dSis
*4703203dSis	my_comb_class = comb_class[last];
*4703203dSis	for (i = 1; i < last; i++)
*4703203dSis		if (comb_class[i] >= my_comb_class ||
*4703203dSis		    comb_class[i] == U8_COMBINING_CLASS_STARTER)
*4703203dSis			return (B_TRUE);
*4703203dSis
*4703203dSis	return (B_FALSE);
*4703203dSis}
*4703203dSis
*4703203dSis/*
*4703203dSis * The do_composition() reads the character string pointed by 's' and
*4703203dSis * do necessary canonical composition and then copy over the result back to
*4703203dSis * the 's'.
*4703203dSis *
*4703203dSis * The input argument 's' cannot contain more than 32 characters.
*4703203dSis */
*4703203dSisstatic size_t
*4703203dSisdo_composition(size_t uv, uchar_t *s, uchar_t *comb_class, uchar_t *start,
*4703203dSis	uchar_t *disp, size_t last, uchar_t **os, uchar_t *oslast)
*4703203dSis{
*4703203dSis	uchar_t t[U8_STREAM_SAFE_TEXT_MAX + 1];
*4703203dSis	uchar_t tc[U8_MB_CUR_MAX];
*4703203dSis	uint8_t saved_marks[U8_MAX_CHARS_A_SEQ];
*4703203dSis	size_t saved_marks_count;
*4703203dSis	uchar_t *p;
*4703203dSis	uchar_t *saved_p;
*4703203dSis	uchar_t *q;
*4703203dSis	size_t i;
*4703203dSis	size_t saved_i;
*4703203dSis	size_t j;
*4703203dSis	size_t k;
*4703203dSis	size_t l;
*4703203dSis	size_t C;
*4703203dSis	size_t saved_l;
*4703203dSis	size_t size;
*4703203dSis	uint32_t u1;
*4703203dSis	uint32_t u2;
*4703203dSis	boolean_t match_not_found = B_TRUE;
*4703203dSis
*4703203dSis	/*
*4703203dSis	 * This should never happen unless the callers are doing some strange
*4703203dSis	 * and unexpected things.
*4703203dSis	 *
*4703203dSis	 * The "last" is the index pointing to the last character not last + 1.
*4703203dSis	 */
*4703203dSis	if (last >= U8_MAX_CHARS_A_SEQ)
*4703203dSis		last = U8_UPPER_LIMIT_IN_A_SEQ;
*4703203dSis
*4703203dSis	for (i = l = 0; i <= last; i++) {
*4703203dSis		/*
*4703203dSis		 * The last or any non-Starters at the beginning, we don't
*4703203dSis		 * have any chance to do composition and so we just copy them
*4703203dSis		 * to the temporary buffer.
*4703203dSis		 */
*4703203dSis		if (i >= last || comb_class[i] != U8_COMBINING_CLASS_STARTER) {
*4703203dSisSAVE_THE_CHAR:
*4703203dSis			p = s + start[i];
*4703203dSis			size = disp[i];
*4703203dSis			for (k = 0; k < size; k++)
*4703203dSis				t[l++] = *p++;
*4703203dSis			continue;
*4703203dSis		}
*4703203dSis
*4703203dSis		/*
*4703203dSis		 * If this could be a start of Hangul Jamos, then, we try to
*4703203dSis		 * conjoin them.
*4703203dSis		 */
*4703203dSis		if (s[start[i]] == U8_HANGUL_JAMO_1ST_BYTE) {
*4703203dSis			U8_PUT_3BYTES_INTO_UTF32(u1, s[start[i]],
*4703203dSis			    s[start[i] + 1], s[start[i] + 2]);
*4703203dSis			U8_PUT_3BYTES_INTO_UTF32(u2, s[start[i] + 3],
*4703203dSis			    s[start[i] + 4], s[start[i] + 5]);
*4703203dSis
*4703203dSis			if (U8_HANGUL_JAMO_L(u1) && U8_HANGUL_JAMO_V(u2)) {
*4703203dSis				u1 -= U8_HANGUL_JAMO_L_FIRST;
*4703203dSis				u2 -= U8_HANGUL_JAMO_V_FIRST;
*4703203dSis				u1 = U8_HANGUL_SYL_FIRST +
*4703203dSis				    (u1 * U8_HANGUL_V_COUNT + u2) *
*4703203dSis				    U8_HANGUL_T_COUNT;
*4703203dSis
*4703203dSis				i += 2;
*4703203dSis				if (i <= last) {
*4703203dSis					U8_PUT_3BYTES_INTO_UTF32(u2,
*4703203dSis					    s[start[i]], s[start[i] + 1],
*4703203dSis					    s[start[i] + 2]);
*4703203dSis
*4703203dSis					if (U8_HANGUL_JAMO_T(u2)) {
*4703203dSis						u1 += u2 -
*4703203dSis						    U8_HANGUL_JAMO_T_FIRST;
*4703203dSis						i++;
*4703203dSis					}
*4703203dSis				}
*4703203dSis
*4703203dSis				U8_SAVE_HANGUL_AS_UTF8(t + l, 0, 1, 2, u1);
*4703203dSis				i--;
*4703203dSis				l += 3;
*4703203dSis				continue;
*4703203dSis			}
*4703203dSis		}
*4703203dSis
*4703203dSis		/*
*4703203dSis		 * Let's then find out if this Starter has composition
*4703203dSis		 * mapping.
*4703203dSis		 */
*4703203dSis		p = find_composition_start(uv, s + start[i], disp[i]);
*4703203dSis		if (p == NULL)
*4703203dSis			goto SAVE_THE_CHAR;
*4703203dSis
*4703203dSis		/*
*4703203dSis		 * We have a Starter with composition mapping and the next
*4703203dSis		 * character is a non-Starter. Let's try to find out if
*4703203dSis		 * we can do composition.
*4703203dSis		 */
*4703203dSis
*4703203dSis		saved_p = p;
*4703203dSis		saved_i = i;
*4703203dSis		saved_l = l;
*4703203dSis		saved_marks_count = 0;
*4703203dSis
*4703203dSisTRY_THE_NEXT_MARK:
*4703203dSis		q = s + start[++i];
*4703203dSis		size = disp[i];
*4703203dSis
*4703203dSis		/*
*4703203dSis		 * The next for() loop compares the non-Starter pointed by
*4703203dSis		 * 'q' with the possible (joinable) characters pointed by 'p'.
*4703203dSis		 *
*4703203dSis		 * The composition final table entry pointed by the 'p'
*4703203dSis		 * looks like the following:
*4703203dSis		 *
*4703203dSis		 * +---+---+---+-...-+---+---+---+---+-...-+---+---+
*4703203dSis		 * | C | b0| b2| ... | bn| F | B0| B1| ... | Bm| F |
*4703203dSis		 * +---+---+---+-...-+---+---+---+---+-...-+---+---+
*4703203dSis		 *
*4703203dSis		 * where C is the count byte indicating the number of
*4703203dSis		 * mapping pairs where each pair would be look like
*4703203dSis		 * (b0-bn F, B0-Bm F). The b0-bn are the bytes of the second
*4703203dSis		 * character of a canonical decomposition and the B0-Bm are
*4703203dSis		 * the bytes of a matching composite character. The F is
*4703203dSis		 * a filler byte after each character as the separator.
*4703203dSis		 */
*4703203dSis
*4703203dSis		match_not_found = B_TRUE;
*4703203dSis
*4703203dSis		for (C = *p++; C > 0; C--) {
*4703203dSis			for (k = 0; k < size; p++, k++)
*4703203dSis				if (*p != q[k])
*4703203dSis					break;
*4703203dSis
*4703203dSis			/* Have we found it? */
*4703203dSis			if (k >= size && *p == U8_TBL_ELEMENT_FILLER) {
*4703203dSis				match_not_found = B_FALSE;
*4703203dSis
*4703203dSis				l = saved_l;
*4703203dSis
*4703203dSis				while (*++p != U8_TBL_ELEMENT_FILLER)
*4703203dSis					t[l++] = *p;
*4703203dSis
*4703203dSis				break;
*4703203dSis			}
*4703203dSis
*4703203dSis			/* We didn't find; skip to the next pair. */
*4703203dSis			if (*p != U8_TBL_ELEMENT_FILLER)
*4703203dSis				while (*++p != U8_TBL_ELEMENT_FILLER)
*4703203dSis					;
*4703203dSis			while (*++p != U8_TBL_ELEMENT_FILLER)
*4703203dSis				;
*4703203dSis			p++;
*4703203dSis		}
*4703203dSis
*4703203dSis		/*
*4703203dSis		 * If there was no match, we will need to save the combining
*4703203dSis		 * mark for later appending. After that, if the next one
*4703203dSis		 * is a non-Starter and not blocked, then, we try once
*4703203dSis		 * again to do composition with the next non-Starter.
*4703203dSis		 *
*4703203dSis		 * If there was no match and this was a Starter, then,
*4703203dSis		 * this is a new start.
*4703203dSis		 *
*4703203dSis		 * If there was a match and a composition done and we have
*4703203dSis		 * more to check on, then, we retrieve a new composition final
*4703203dSis		 * table entry for the composite and then try to do the
*4703203dSis		 * composition again.
*4703203dSis		 */
*4703203dSis
*4703203dSis		if (match_not_found) {
*4703203dSis			if (comb_class[i] == U8_COMBINING_CLASS_STARTER) {
*4703203dSis				i--;
*4703203dSis				goto SAVE_THE_CHAR;
*4703203dSis			}
*4703203dSis
*4703203dSis			saved_marks[saved_marks_count++] = i;
*4703203dSis		}
*4703203dSis
*4703203dSis		if (saved_l == l) {
*4703203dSis			while (i < last) {
*4703203dSis				if (blocked(comb_class, i + 1))
*4703203dSis					saved_marks[saved_marks_count++] = ++i;
*4703203dSis				else
*4703203dSis					break;
*4703203dSis			}
*4703203dSis			if (i < last) {
*4703203dSis				p = saved_p;
*4703203dSis				goto TRY_THE_NEXT_MARK;
*4703203dSis			}
*4703203dSis		} else if (i < last) {
*4703203dSis			p = find_composition_start(uv, t + saved_l,
*4703203dSis			    l - saved_l);
*4703203dSis			if (p != NULL) {
*4703203dSis				saved_p = p;
*4703203dSis				goto TRY_THE_NEXT_MARK;
*4703203dSis			}
*4703203dSis		}
*4703203dSis
*4703203dSis		/*
*4703203dSis		 * There is no more composition possible.
*4703203dSis		 *
*4703203dSis		 * If there was no composition what so ever then we copy
*4703203dSis		 * over the original Starter and then append any non-Starters
*4703203dSis		 * remaining at the target string sequentially after that.
*4703203dSis		 */
*4703203dSis
*4703203dSis		if (saved_l == l) {
*4703203dSis			p = s + start[saved_i];
*4703203dSis			size = disp[saved_i];
*4703203dSis			for (j = 0; j < size; j++)
*4703203dSis				t[l++] = *p++;
*4703203dSis		}
*4703203dSis
*4703203dSis		for (k = 0; k < saved_marks_count; k++) {
*4703203dSis			p = s + start[saved_marks[k]];
*4703203dSis			size = disp[saved_marks[k]];
*4703203dSis			for (j = 0; j < size; j++)
*4703203dSis				t[l++] = *p++;
*4703203dSis		}
*4703203dSis	}
*4703203dSis
*4703203dSis	/*
*4703203dSis	 * If the last character is a Starter and if we have a character
*4703203dSis	 * (possibly another Starter) that can be turned into a composite,
*4703203dSis	 * we do so and we do so until there is no more of composition
*4703203dSis	 * possible.
*4703203dSis	 */
*4703203dSis	if (comb_class[last] == U8_COMBINING_CLASS_STARTER) {
*4703203dSis		p = *os;
*4703203dSis		saved_l = l - disp[last];
*4703203dSis
*4703203dSis		while (p < oslast) {
*4703203dSis			size = u8_number_of_bytes[*p];
*4703203dSis			if (size <= 1 || (p + size) > oslast)
*4703203dSis				break;
*4703203dSis
*4703203dSis			saved_p = p;
*4703203dSis
*4703203dSis			for (i = 0; i < size; i++)
*4703203dSis				tc[i] = *p++;
*4703203dSis
*4703203dSis			q = find_composition_start(uv, t + saved_l,
*4703203dSis			    l - saved_l);
*4703203dSis			if (q == NULL) {
*4703203dSis				p = saved_p;
*4703203dSis				break;
*4703203dSis			}
*4703203dSis
*4703203dSis			match_not_found = B_TRUE;
*4703203dSis
*4703203dSis			for (C = *q++; C > 0; C--) {
*4703203dSis				for (k = 0; k < size; q++, k++)
*4703203dSis					if (*q != tc[k])
*4703203dSis						break;
*4703203dSis
*4703203dSis				if (k >= size && *q == U8_TBL_ELEMENT_FILLER) {
*4703203dSis					match_not_found = B_FALSE;
*4703203dSis
*4703203dSis					l = saved_l;
*4703203dSis
*4703203dSis					while (*++q != U8_TBL_ELEMENT_FILLER) {
*4703203dSis						/*
*4703203dSis						 * This is practically
*4703203dSis						 * impossible but we don't
*4703203dSis						 * want to take any chances.
*4703203dSis						 */
*4703203dSis						if (l >=
*4703203dSis						    U8_STREAM_SAFE_TEXT_MAX) {
*4703203dSis							p = saved_p;
*4703203dSis							goto SAFE_RETURN;
*4703203dSis						}
*4703203dSis						t[l++] = *q;
*4703203dSis					}
*4703203dSis
*4703203dSis					break;
*4703203dSis				}
*4703203dSis
*4703203dSis				if (*q != U8_TBL_ELEMENT_FILLER)
*4703203dSis					while (*++q != U8_TBL_ELEMENT_FILLER)
*4703203dSis						;
*4703203dSis				while (*++q != U8_TBL_ELEMENT_FILLER)
*4703203dSis					;
*4703203dSis				q++;
*4703203dSis			}
*4703203dSis
*4703203dSis			if (match_not_found) {
*4703203dSis				p = saved_p;
*4703203dSis				break;
*4703203dSis			}
*4703203dSis		}
*4703203dSisSAFE_RETURN:
*4703203dSis		*os = p;
*4703203dSis	}
*4703203dSis
*4703203dSis	/*
*4703203dSis	 * Now we copy over the temporary string to the target string.
*4703203dSis	 * Since composition always reduces the number of characters or
*4703203dSis	 * the number of characters stay, we don't need to worry about
*4703203dSis	 * the buffer overflow here.
*4703203dSis	 */
*4703203dSis	for (i = 0; i < l; i++)
*4703203dSis		s[i] = t[i];
*4703203dSis	s[l] = '\0';
*4703203dSis
*4703203dSis	return (l);
*4703203dSis}
*4703203dSis
*4703203dSis/*
*4703203dSis * The collect_a_seq() function examines the given string s, collects
*4703203dSis * a sequence of characters at u8s, and returns the sequence. While it
*4703203dSis * collects a sequence, it also applies case conversion, canonical or
*4703203dSis * compatibility decomposition, canonical composition, or some or all of
*4703203dSis * them, in that order.
*4703203dSis *
*4703203dSis * The collected sequence cannot be longer than 32 characters: if it would
*4703203dSis * have more than 31 characters, the sequence is terminated with
*4703203dSis * a U+034F COMBINING GRAPHEME JOINER (CGJ) character and thereby turned
*4703203dSis * into a Stream-Safe Text. The collected sequence is always terminated
*4703203dSis * with a null byte. The return value is the byte length of the sequence,
*4703203dSis * which can be 0, and it does not include the terminating null byte.
*4703203dSis */
*4703203dSisstatic size_t
*4703203dSiscollect_a_seq(size_t uv, uchar_t *u8s, uchar_t **source, uchar_t *slast,
*4703203dSis	boolean_t is_it_toupper,
*4703203dSis	boolean_t is_it_tolower,
*4703203dSis	boolean_t canonical_decomposition,
*4703203dSis	boolean_t compatibility_decomposition,
*4703203dSis	boolean_t canonical_composition,
*4703203dSis	int *errno, u8_normalization_states_t *state)
*4703203dSis{
*4703203dSis	uchar_t *s;
*4703203dSis	int sz;
*4703203dSis	int saved_sz;
*4703203dSis	size_t i;
*4703203dSis	size_t j;
*4703203dSis	size_t k;
*4703203dSis	size_t l;
*4703203dSis	uchar_t comb_class[U8_MAX_CHARS_A_SEQ];
*4703203dSis	uchar_t disp[U8_MAX_CHARS_A_SEQ];
*4703203dSis	uchar_t start[U8_MAX_CHARS_A_SEQ];
*4703203dSis	uchar_t u8t[U8_MB_CUR_MAX];
*4703203dSis	uchar_t uts[U8_STREAM_SAFE_TEXT_MAX + 1];
*4703203dSis	uchar_t tc;
*4703203dSis	size_t last;
*4703203dSis	size_t saved_last;
*4703203dSis	uint32_t u1;
*4703203dSis
*4703203dSis	/*
*4703203dSis	 * Save the source string pointer which we will return a changed
*4703203dSis	 * pointer if we do processing.
*4703203dSis	 */
*4703203dSis	s = *source;
*4703203dSis
*4703203dSis	/*
*4703203dSis	 * The following is a fallback for just in case callers are not
*4703203dSis	 * checking the string boundaries before the calling.
*4703203dSis	 */
*4703203dSis	if (s >= slast) {
*4703203dSis		u8s[0] = '\0';
*4703203dSis
*4703203dSis		return (0);
*4703203dSis	}
*4703203dSis
*4703203dSis	/*
*4703203dSis	 * As the first thing, let's collect a character and do case
*4703203dSis	 * conversion if necessary.
*4703203dSis	 */
*4703203dSis
*4703203dSis	sz = u8_number_of_bytes[*s];
*4703203dSis
*4703203dSis	if (sz < 0) {
*4703203dSis		*errno = EILSEQ;
*4703203dSis
*4703203dSis		u8s[0] = *s++;
*4703203dSis		u8s[1] = '\0';
*4703203dSis
*4703203dSis		*source = s;
*4703203dSis
*4703203dSis		return (1);
*4703203dSis	}
*4703203dSis
*4703203dSis	if (sz == 1) {
*4703203dSis		if (is_it_toupper)
*4703203dSis			u8s[0] = U8_ASCII_TOUPPER(*s);
*4703203dSis		else if (is_it_tolower)
*4703203dSis			u8s[0] = U8_ASCII_TOLOWER(*s);
*4703203dSis		else
*4703203dSis			u8s[0] = *s;
*4703203dSis		s++;
*4703203dSis		u8s[1] = '\0';
*4703203dSis	} else if ((s + sz) > slast) {
*4703203dSis		*errno = EINVAL;
*4703203dSis
*4703203dSis		for (i = 0; s < slast; )
*4703203dSis			u8s[i++] = *s++;
*4703203dSis		u8s[i] = '\0';
*4703203dSis
*4703203dSis		*source = s;
*4703203dSis
*4703203dSis		return (i);
*4703203dSis	} else {
*4703203dSis		if (is_it_toupper || is_it_tolower) {
*4703203dSis			i = do_case_conv(uv, u8s, s, sz, is_it_toupper);
*4703203dSis			s += sz;
*4703203dSis			sz = i;
*4703203dSis		} else {
*4703203dSis			for (i = 0; i < sz; )
*4703203dSis				u8s[i++] = *s++;
*4703203dSis			u8s[i] = '\0';
*4703203dSis		}
*4703203dSis	}
*4703203dSis
*4703203dSis	/*
*4703203dSis	 * And then canonical/compatibility decomposition followed by
*4703203dSis	 * an optional canonical composition. Please be noted that
*4703203dSis	 * canonical composition is done only when a decomposition is
*4703203dSis	 * done.
*4703203dSis	 */
*4703203dSis	if (canonical_decomposition || compatibility_decomposition) {
*4703203dSis		if (sz == 1) {
*4703203dSis			*state = U8_STATE_START;
*4703203dSis
*4703203dSis			saved_sz = 1;
*4703203dSis
*4703203dSis			comb_class[0] = 0;
*4703203dSis			start[0] = 0;
*4703203dSis			disp[0] = 1;
*4703203dSis
*4703203dSis			last = 1;
*4703203dSis		} else {
*4703203dSis			saved_sz = do_decomp(uv, u8s, u8s, sz,
*4703203dSis			    canonical_decomposition, state);
*4703203dSis
*4703203dSis			last = 0;
*4703203dSis
*4703203dSis			for (i = 0; i < saved_sz; ) {
*4703203dSis				sz = u8_number_of_bytes[u8s[i]];
*4703203dSis
*4703203dSis				comb_class[last] = combining_class(uv,
*4703203dSis				    u8s + i, sz);
*4703203dSis				start[last] = i;
*4703203dSis				disp[last] = sz;
*4703203dSis
*4703203dSis				last++;
*4703203dSis				i += sz;
*4703203dSis			}
*4703203dSis
*4703203dSis			/*
*4703203dSis			 * Decomposition yields various Hangul related
*4703203dSis			 * states but not on combining marks. We need to
*4703203dSis			 * find out at here by checking on the last
*4703203dSis			 * character.
*4703203dSis			 */
*4703203dSis			if (*state == U8_STATE_START) {
*4703203dSis				if (comb_class[last - 1])
*4703203dSis					*state = U8_STATE_COMBINING_MARK;
*4703203dSis			}
*4703203dSis		}
*4703203dSis
*4703203dSis		saved_last = last;
*4703203dSis
*4703203dSis		while (s < slast) {
*4703203dSis			sz = u8_number_of_bytes[*s];
*4703203dSis
*4703203dSis			/*
*4703203dSis			 * If this is an illegal character, an incomplete
*4703203dSis			 * character, or an 7-bit ASCII Starter character,
*4703203dSis			 * then we have collected a sequence; break and let
*4703203dSis			 * the next call deal with the two cases.
*4703203dSis			 *
*4703203dSis			 * Note that this is okay only if you are using this
*4703203dSis			 * function with a fixed length string, not on
*4703203dSis			 * a buffer with multiple calls of one chunk at a time.
*4703203dSis			 */
*4703203dSis			if (sz <= 1) {
*4703203dSis				break;
*4703203dSis			} else if ((s + sz) > slast) {
*4703203dSis				break;
*4703203dSis			} else {
*4703203dSis				/*
*4703203dSis				 * If the previous character was a Hangul Jamo
*4703203dSis				 * and this character is a Hangul Jamo that
*4703203dSis				 * can be conjoined, we collect the Jamo.
*4703203dSis				 */
*4703203dSis				if (*s == U8_HANGUL_JAMO_1ST_BYTE) {
*4703203dSis					U8_PUT_3BYTES_INTO_UTF32(u1,
*4703203dSis					    *s, *(s + 1), *(s + 2));
*4703203dSis
*4703203dSis					if (U8_HANGUL_COMPOSABLE_L_V(*state,
*4703203dSis					    u1)) {
*4703203dSis						i = 0;
*4703203dSis						*state = U8_STATE_HANGUL_LV;
*4703203dSis						goto COLLECT_A_HANGUL;
*4703203dSis					}
*4703203dSis
*4703203dSis					if (U8_HANGUL_COMPOSABLE_LV_T(*state,
*4703203dSis					    u1)) {
*4703203dSis						i = 0;
*4703203dSis						*state = U8_STATE_HANGUL_LVT;
*4703203dSis						goto COLLECT_A_HANGUL;
*4703203dSis					}
*4703203dSis				}
*4703203dSis
*4703203dSis				/*
*4703203dSis				 * Regardless of whatever it was, if this is
*4703203dSis				 * a Starter, we don't collect the character
*4703203dSis				 * since that's a new start and we will deal
*4703203dSis				 * with it at the next time.
*4703203dSis				 */
*4703203dSis				i = combining_class(uv, s, sz);
*4703203dSis				if (i == U8_COMBINING_CLASS_STARTER)
*4703203dSis					break;
*4703203dSis
*4703203dSis				/*
*4703203dSis				 * We know the current character is a combining
*4703203dSis				 * mark. If the previous character wasn't
*4703203dSis				 * a Starter (not Hangul) or a combining mark,
*4703203dSis				 * then, we don't collect this combining mark.
*4703203dSis				 */
*4703203dSis				if (*state != U8_STATE_START &&
*4703203dSis				    *state != U8_STATE_COMBINING_MARK)
*4703203dSis					break;
*4703203dSis
*4703203dSis				*state = U8_STATE_COMBINING_MARK;
*4703203dSisCOLLECT_A_HANGUL:
*4703203dSis				/*
*4703203dSis				 * If we collected a Starter and combining
*4703203dSis				 * marks up to 30, i.e., total 31 characters,
*4703203dSis				 * then, we terminate this degenerately long
*4703203dSis				 * combining sequence with a U+034F COMBINING
*4703203dSis				 * GRAPHEME JOINER (CGJ) which is 0xCD 0x8F in
*4703203dSis				 * UTF-8 and turn this into a Stream-Safe
*4703203dSis				 * Text. This will be extremely rare but
*4703203dSis				 * possible.
*4703203dSis				 *
*4703203dSis				 * The following will also guarantee that
*4703203dSis				 * we are not writing more than 32 characters
*4703203dSis				 * plus a NULL at u8s[].
*4703203dSis				 */
*4703203dSis				if (last >= U8_UPPER_LIMIT_IN_A_SEQ) {
*4703203dSisTURN_STREAM_SAFE:
*4703203dSis					*state = U8_STATE_START;
*4703203dSis					comb_class[last] = 0;
*4703203dSis					start[last] = saved_sz;
*4703203dSis					disp[last] = 2;
*4703203dSis					last++;
*4703203dSis
*4703203dSis					u8s[saved_sz++] = 0xCD;
*4703203dSis					u8s[saved_sz++] = 0x8F;
*4703203dSis
*4703203dSis					break;
*4703203dSis				}
*4703203dSis
*4703203dSis				/*
*4703203dSis				 * Some combining marks also do decompose into
*4703203dSis				 * another combining mark or marks.
*4703203dSis				 */
*4703203dSis				if (*state == U8_STATE_COMBINING_MARK) {
*4703203dSis					k = last;
*4703203dSis					l = sz;
*4703203dSis					i = do_decomp(uv, uts, s, sz,
*4703203dSis					    canonical_decomposition, state);
*4703203dSis					for (j = 0; j < i; ) {
*4703203dSis						sz = u8_number_of_bytes[uts[j]];
*4703203dSis
*4703203dSis						comb_class[last] =
*4703203dSis						    combining_class(uv,
*4703203dSis						    uts + j, sz);
*4703203dSis						start[last] = saved_sz + j;
*4703203dSis						disp[last] = sz;
*4703203dSis
*4703203dSis						last++;
*4703203dSis						if (last >=
*4703203dSis						    U8_UPPER_LIMIT_IN_A_SEQ) {
*4703203dSis							last = k;
*4703203dSis							goto TURN_STREAM_SAFE;
*4703203dSis						}
*4703203dSis						j += sz;
*4703203dSis					}
*4703203dSis
*4703203dSis					*state = U8_STATE_COMBINING_MARK;
*4703203dSis					sz = i;
*4703203dSis					s += l;
*4703203dSis
*4703203dSis					for (i = 0; i < sz; i++)
*4703203dSis						u8s[saved_sz++] = uts[i];
*4703203dSis				} else {
*4703203dSis					comb_class[last] = i;
*4703203dSis					start[last] = saved_sz;
*4703203dSis					disp[last] = sz;
*4703203dSis					last++;
*4703203dSis
*4703203dSis					for (i = 0; i < sz; i++)
*4703203dSis						u8s[saved_sz++] = *s++;
*4703203dSis				}
*4703203dSis
*4703203dSis				/*
*4703203dSis				 * If this is U+0345 COMBINING GREEK
*4703203dSis				 * YPOGEGRAMMENI (0xCD 0x85 in UTF-8), a.k.a.,
*4703203dSis				 * iota subscript, and need to be converted to
*4703203dSis				 * uppercase letter, convert it to U+0399 GREEK
*4703203dSis				 * CAPITAL LETTER IOTA (0xCE 0x99 in UTF-8),
*4703203dSis				 * i.e., convert to capital adscript form as
*4703203dSis				 * specified in the Unicode standard.
*4703203dSis				 *
*4703203dSis				 * This is the only special case of (ambiguous)
*4703203dSis				 * case conversion at combining marks and
*4703203dSis				 * probably the standard will never have
*4703203dSis				 * anything similar like this in future.
*4703203dSis				 */
*4703203dSis				if (is_it_toupper && sz >= 2 &&
*4703203dSis				    u8s[saved_sz - 2] == 0xCD &&
*4703203dSis				    u8s[saved_sz - 1] == 0x85) {
*4703203dSis					u8s[saved_sz - 2] = 0xCE;
*4703203dSis					u8s[saved_sz - 1] = 0x99;
*4703203dSis				}
*4703203dSis			}
*4703203dSis		}
*4703203dSis
*4703203dSis		/*
*4703203dSis		 * Let's try to ensure a canonical ordering for the collected
*4703203dSis		 * combining marks. We do this only if we have collected
*4703203dSis		 * at least one more non-Starter. (The decomposition mapping
*4703203dSis		 * data tables have fully (and recursively) expanded and
*4703203dSis		 * canonically ordered decompositions.)
*4703203dSis		 *
*4703203dSis		 * The U8_SWAP_COMB_MARKS() convenience macro has some
*4703203dSis		 * assumptions and we are meeting the assumptions.
*4703203dSis		 */
*4703203dSis		last--;
*4703203dSis		if (last >= saved_last) {
*4703203dSis			for (i = 0; i < last; i++)
*4703203dSis				for (j = last; j > i; j--)
*4703203dSis					if (comb_class[j] &&
*4703203dSis					    comb_class[j - 1] > comb_class[j]) {
*4703203dSis						U8_SWAP_COMB_MARKS(j - 1, j);
*4703203dSis					}
*4703203dSis		}
*4703203dSis
*4703203dSis		*source = s;
*4703203dSis
*4703203dSis		if (! canonical_composition) {
*4703203dSis			u8s[saved_sz] = '\0';
*4703203dSis			return (saved_sz);
*4703203dSis		}
*4703203dSis
*4703203dSis		/*
*4703203dSis		 * Now do the canonical composition. Note that we do this
*4703203dSis		 * only after a canonical or compatibility decomposition to
*4703203dSis		 * finish up NFC or NFKC.
*4703203dSis		 */
*4703203dSis		sz = do_composition(uv, u8s, comb_class, start, disp, last,
*4703203dSis		    &s, slast);
*4703203dSis	}
*4703203dSis
*4703203dSis	*source = s;
*4703203dSis
*4703203dSis	return ((size_t)sz);
*4703203dSis}
*4703203dSis
*4703203dSis/*
*4703203dSis * The do_norm_compare() function does string comparion based on Unicode
*4703203dSis * simple case mappings and Unicode Normalization definitions.
*4703203dSis *
*4703203dSis * It does so by collecting a sequence of character at a time and comparing
*4703203dSis * the collected sequences from the strings.
*4703203dSis *
*4703203dSis * The meanings on the return values are the same as the usual strcmp().
*4703203dSis */
*4703203dSisstatic int
*4703203dSisdo_norm_compare(size_t uv, uchar_t *s1, uchar_t *s2, size_t n1, size_t n2,
*4703203dSis	int flag, int *errno)
*4703203dSis{
*4703203dSis	int result;
*4703203dSis	size_t sz1;
*4703203dSis	size_t sz2;
*4703203dSis	uchar_t u8s1[U8_STREAM_SAFE_TEXT_MAX + 1];
*4703203dSis	uchar_t u8s2[U8_STREAM_SAFE_TEXT_MAX + 1];
*4703203dSis	uchar_t *s1last;
*4703203dSis	uchar_t *s2last;
*4703203dSis	boolean_t is_it_toupper;
*4703203dSis	boolean_t is_it_tolower;
*4703203dSis	boolean_t canonical_decomposition;
*4703203dSis	boolean_t compatibility_decomposition;
*4703203dSis	boolean_t canonical_composition;
*4703203dSis	u8_normalization_states_t state;
*4703203dSis
*4703203dSis	s1last = s1 + n1;
*4703203dSis	s2last = s2 + n2;
*4703203dSis
*4703203dSis	is_it_toupper = flag & U8_TEXTPREP_TOUPPER;
*4703203dSis	is_it_tolower = flag & U8_TEXTPREP_TOLOWER;
*4703203dSis	canonical_decomposition = flag & U8_CANON_DECOMP;
*4703203dSis	compatibility_decomposition = flag & U8_COMPAT_DECOMP;
*4703203dSis	canonical_composition = flag & U8_CANON_COMP;
*4703203dSis
*4703203dSis	while (s1 < s1last && s2 < s2last) {
*4703203dSis		/*
*4703203dSis		 * If the current character is a 7-bit ASCII and the last
*4703203dSis		 * character, or, if the current character and the next
*4703203dSis		 * character are both some 7-bit ASCII characters then
*4703203dSis		 * we treat the current character as a sequence.
*4703203dSis		 *
*4703203dSis		 * In any other cases, we need to call collect_a_seq().
*4703203dSis		 */
*4703203dSis
*4703203dSis		if (U8_ISASCII(*s1) && ((s1 + 1) >= s1last ||
*4703203dSis		    ((s1 + 1) < s1last && U8_ISASCII(*(s1 + 1))))) {
*4703203dSis			if (is_it_toupper)
*4703203dSis				u8s1[0] = U8_ASCII_TOUPPER(*s1);
*4703203dSis			else if (is_it_tolower)
*4703203dSis				u8s1[0] = U8_ASCII_TOLOWER(*s1);
*4703203dSis			else
*4703203dSis				u8s1[0] = *s1;
*4703203dSis			u8s1[1] = '\0';
*4703203dSis			sz1 = 1;
*4703203dSis			s1++;
*4703203dSis		} else {
*4703203dSis			state = U8_STATE_START;
*4703203dSis			sz1 = collect_a_seq(uv, u8s1, &s1, s1last,
*4703203dSis			    is_it_toupper, is_it_tolower,
*4703203dSis			    canonical_decomposition,
*4703203dSis			    compatibility_decomposition,
*4703203dSis			    canonical_composition, errno, &state);
*4703203dSis		}
*4703203dSis
*4703203dSis		if (U8_ISASCII(*s2) && ((s2 + 1) >= s2last ||
*4703203dSis		    ((s2 + 1) < s2last && U8_ISASCII(*(s2 + 1))))) {
*4703203dSis			if (is_it_toupper)
*4703203dSis				u8s2[0] = U8_ASCII_TOUPPER(*s2);
*4703203dSis			else if (is_it_tolower)
*4703203dSis				u8s2[0] = U8_ASCII_TOLOWER(*s2);
*4703203dSis			else
*4703203dSis				u8s2[0] = *s2;
*4703203dSis			u8s2[1] = '\0';
*4703203dSis			sz2 = 1;
*4703203dSis			s2++;
*4703203dSis		} else {
*4703203dSis			state = U8_STATE_START;
*4703203dSis			sz2 = collect_a_seq(uv, u8s2, &s2, s2last,
*4703203dSis			    is_it_toupper, is_it_tolower,
*4703203dSis			    canonical_decomposition,
*4703203dSis			    compatibility_decomposition,
*4703203dSis			    canonical_composition, errno, &state);
*4703203dSis		}
*4703203dSis
*4703203dSis		/*
*4703203dSis		 * Now compare the two characters. If they are the same,
*4703203dSis		 * we move on to the next character sequences.
*4703203dSis		 */
*4703203dSis		if (sz1 == 1 && sz2 == 1) {
*4703203dSis			if (*u8s1 > *u8s2)
*4703203dSis				return (1);
*4703203dSis			if (*u8s1 < *u8s2)
*4703203dSis				return (-1);
*4703203dSis		} else {
*4703203dSis			result = strcmp((const char *)u8s1, (const char *)u8s2);
*4703203dSis			if (result != 0)
*4703203dSis				return (result);
*4703203dSis		}
*4703203dSis	}
*4703203dSis
*4703203dSis	/*
*4703203dSis	 * We compared until the end of either or both strings.
*4703203dSis	 *
*4703203dSis	 * If we reached to or went over the ends for the both, that means
*4703203dSis	 * they are the same.
*4703203dSis	 *
*4703203dSis	 * If we reached only one end, that means the other string has
*4703203dSis	 * something which then can be used to determine the return value.
*4703203dSis	 */
*4703203dSis	if (s1 >= s1last) {
*4703203dSis		if (s2 >= s2last)
*4703203dSis			return (0);
*4703203dSis		return (-1);
*4703203dSis	}
*4703203dSis	return (1);
*4703203dSis}
*4703203dSis
*4703203dSis/*
*4703203dSis * The u8_strcmp() function compares two UTF-8 strings quite similar to
*4703203dSis * the strcmp(). For the comparison, however, Unicode Normalization specific
*4703203dSis * equivalency and Unicode simple case conversion mappings based equivalency
*4703203dSis * can be requested and checked against.
*4703203dSis */
*4703203dSisint
*4703203dSisu8_strcmp(const char *s1, const char *s2, size_t n, int flag, size_t uv,
*4703203dSis		int *errno)
*4703203dSis{
*4703203dSis	int f;
*4703203dSis	size_t n1;
*4703203dSis	size_t n2;
*4703203dSis
*4703203dSis	*errno = 0;
*4703203dSis
*4703203dSis	/*
*4703203dSis	 * Check on the requested Unicode version, case conversion, and
*4703203dSis	 * normalization flag values.
*4703203dSis	 */
*4703203dSis
*4703203dSis	if (uv > U8_UNICODE_LATEST) {
*4703203dSis		*errno = ERANGE;
*4703203dSis		uv = U8_UNICODE_LATEST;
*4703203dSis	}
*4703203dSis
*4703203dSis	if (flag == 0) {
*4703203dSis		flag = U8_STRCMP_CS;
*4703203dSis	} else {
*4703203dSis		f = flag & (U8_STRCMP_CS | U8_STRCMP_CI_UPPER |
*4703203dSis		    U8_STRCMP_CI_LOWER);
*4703203dSis		if (f == 0) {
*4703203dSis			flag |= U8_STRCMP_CS;
*4703203dSis		} else if (f != U8_STRCMP_CS && f != U8_STRCMP_CI_UPPER &&
*4703203dSis		    f != U8_STRCMP_CI_LOWER) {
*4703203dSis			*errno = EBADF;
*4703203dSis			flag = U8_STRCMP_CS;
*4703203dSis		}
*4703203dSis
*4703203dSis		f = flag & (U8_CANON_DECOMP | U8_COMPAT_DECOMP | U8_CANON_COMP);
*4703203dSis		if (f && f != U8_STRCMP_NFD && f != U8_STRCMP_NFC &&
*4703203dSis		    f != U8_STRCMP_NFKD && f != U8_STRCMP_NFKC) {
*4703203dSis			*errno = EBADF;
*4703203dSis			flag = U8_STRCMP_CS;
*4703203dSis		}
*4703203dSis	}
*4703203dSis
*4703203dSis	if (flag == U8_STRCMP_CS) {
*4703203dSis		return (n == 0 ? strcmp(s1, s2) : strncmp(s1, s2, n));
*4703203dSis	}
*4703203dSis
*4703203dSis	n1 = strlen(s1);
*4703203dSis	n2 = strlen(s2);
*4703203dSis	if (n != 0) {
*4703203dSis		if (n < n1)
*4703203dSis			n1 = n;
*4703203dSis		if (n < n2)
*4703203dSis			n2 = n;
*4703203dSis	}
*4703203dSis
*4703203dSis	/*
*4703203dSis	 * Simple case conversion can be done much faster and so we do
*4703203dSis	 * them separately here.
*4703203dSis	 */
*4703203dSis	if (flag == U8_STRCMP_CI_UPPER) {
*4703203dSis		return (do_case_compare(uv, (uchar_t *)s1, (uchar_t *)s2,
*4703203dSis		    n1, n2, B_TRUE, errno));
*4703203dSis	} else if (flag == U8_STRCMP_CI_LOWER) {
*4703203dSis		return (do_case_compare(uv, (uchar_t *)s1, (uchar_t *)s2,
*4703203dSis		    n1, n2, B_FALSE, errno));
*4703203dSis	}
*4703203dSis
*4703203dSis	return (do_norm_compare(uv, (uchar_t *)s1, (uchar_t *)s2, n1, n2,
*4703203dSis	    flag, errno));
*4703203dSis}
*4703203dSis
*4703203dSissize_t
*4703203dSisu8_textprep_str(char *inarray, size_t *inlen, char *outarray, size_t *outlen,
*4703203dSis	int flag, size_t unicode_version, int *errno)
*4703203dSis{
*4703203dSis	int f;
*4703203dSis	int sz;
*4703203dSis	uchar_t *ib;
*4703203dSis	uchar_t *ibtail;
*4703203dSis	uchar_t *ob;
*4703203dSis	uchar_t *obtail;
*4703203dSis	boolean_t do_not_ignore_null;
*4703203dSis	boolean_t do_not_ignore_invalid;
*4703203dSis	boolean_t is_it_toupper;
*4703203dSis	boolean_t is_it_tolower;
*4703203dSis	boolean_t canonical_decomposition;
*4703203dSis	boolean_t compatibility_decomposition;
*4703203dSis	boolean_t canonical_composition;
*4703203dSis	size_t ret_val;
*4703203dSis	size_t i;
*4703203dSis	size_t j;
*4703203dSis	uchar_t u8s[U8_STREAM_SAFE_TEXT_MAX + 1];
*4703203dSis	u8_normalization_states_t state;
*4703203dSis
*4703203dSis	if (unicode_version > U8_UNICODE_LATEST) {
*4703203dSis		*errno = ERANGE;
*4703203dSis		return ((size_t)-1);
*4703203dSis	}
*4703203dSis
*4703203dSis	f = flag & (U8_TEXTPREP_TOUPPER | U8_TEXTPREP_TOLOWER);
*4703203dSis	if (f == (U8_TEXTPREP_TOUPPER | U8_TEXTPREP_TOLOWER)) {
*4703203dSis		*errno = EBADF;
*4703203dSis		return ((size_t)-1);
*4703203dSis	}
*4703203dSis
*4703203dSis	f = flag & (U8_CANON_DECOMP | U8_COMPAT_DECOMP | U8_CANON_COMP);
*4703203dSis	if (f && f != U8_TEXTPREP_NFD && f != U8_TEXTPREP_NFC &&
*4703203dSis	    f != U8_TEXTPREP_NFKD && f != U8_TEXTPREP_NFKC) {
*4703203dSis		*errno = EBADF;
*4703203dSis		return ((size_t)-1);
*4703203dSis	}
*4703203dSis
*4703203dSis	if (inarray == NULL || *inlen == 0)
*4703203dSis		return (0);
*4703203dSis
*4703203dSis	if (outarray == NULL) {
*4703203dSis		*errno = E2BIG;
*4703203dSis		return ((size_t)-1);
*4703203dSis	}
*4703203dSis
*4703203dSis	ib = (uchar_t *)inarray;
*4703203dSis	ob = (uchar_t *)outarray;
*4703203dSis	ibtail = ib + *inlen;
*4703203dSis	obtail = ob + *outlen;
*4703203dSis
*4703203dSis	do_not_ignore_null = !(flag & U8_TEXTPREP_IGNORE_NULL);
*4703203dSis	do_not_ignore_invalid = !(flag & U8_TEXTPREP_IGNORE_INVALID);
*4703203dSis	is_it_toupper = flag & U8_TEXTPREP_TOUPPER;
*4703203dSis	is_it_tolower = flag & U8_TEXTPREP_TOLOWER;
*4703203dSis
*4703203dSis	ret_val = 0;
*4703203dSis
*4703203dSis	/*
*4703203dSis	 * If we don't have a normalization flag set, we do the simple case
*4703203dSis	 * conversion based text preparation separately below. Text
*4703203dSis	 * preparation involving Normalization will be done in the false task
*4703203dSis	 * block, again, separately since it will take much more time and
*4703203dSis	 * resource than doing simple case conversions.
*4703203dSis	 */
*4703203dSis	if (f == 0) {
*4703203dSis		while (ib < ibtail) {
*4703203dSis			if (*ib == '\0' && do_not_ignore_null)
*4703203dSis				break;
*4703203dSis
*4703203dSis			sz = u8_number_of_bytes[*ib];
*4703203dSis
*4703203dSis			if (sz < 0) {
*4703203dSis				if (do_not_ignore_invalid) {
*4703203dSis					*errno = EILSEQ;
*4703203dSis					ret_val = (size_t)-1;
*4703203dSis					break;
*4703203dSis				}
*4703203dSis
*4703203dSis				sz = 1;
*4703203dSis				ret_val++;
*4703203dSis			}
*4703203dSis
*4703203dSis			if (sz == 1) {
*4703203dSis				if (ob >= obtail) {
*4703203dSis					*errno = E2BIG;
*4703203dSis					ret_val = (size_t)-1;
*4703203dSis					break;
*4703203dSis				}
*4703203dSis
*4703203dSis				if (is_it_toupper)
*4703203dSis					*ob = U8_ASCII_TOUPPER(*ib);
*4703203dSis				else if (is_it_tolower)
*4703203dSis					*ob = U8_ASCII_TOLOWER(*ib);
*4703203dSis				else
*4703203dSis					*ob = *ib;
*4703203dSis				ib++;
*4703203dSis				ob++;
*4703203dSis			} else if ((ib + sz) > ibtail) {
*4703203dSis				if (do_not_ignore_invalid) {
*4703203dSis					*errno = EINVAL;
*4703203dSis					ret_val = (size_t)-1;
*4703203dSis					break;
*4703203dSis				}
*4703203dSis
*4703203dSis				if ((obtail - ob) < (ibtail - ib)) {
*4703203dSis					*errno = E2BIG;
*4703203dSis					ret_val = (size_t)-1;
*4703203dSis					break;
*4703203dSis				}
*4703203dSis
*4703203dSis				/*
*4703203dSis				 * We treat the remaining incomplete character
*4703203dSis				 * bytes as a character.
*4703203dSis				 */
*4703203dSis				ret_val++;
*4703203dSis
*4703203dSis				while (ib < ibtail)
*4703203dSis					*ob++ = *ib++;
*4703203dSis			} else {
*4703203dSis				if (is_it_toupper || is_it_tolower) {
*4703203dSis					i = do_case_conv(unicode_version, u8s,
*4703203dSis					    ib, sz, is_it_toupper);
*4703203dSis
*4703203dSis					if ((obtail - ob) < i) {
*4703203dSis						*errno = E2BIG;
*4703203dSis						ret_val = (size_t)-1;
*4703203dSis						break;
*4703203dSis					}
*4703203dSis
*4703203dSis					ib += sz;
*4703203dSis
*4703203dSis					for (sz = 0; sz < i; sz++)
*4703203dSis						*ob++ = u8s[sz];
*4703203dSis				} else {
*4703203dSis					if ((obtail - ob) < sz) {
*4703203dSis						*errno = E2BIG;
*4703203dSis						ret_val = (size_t)-1;
*4703203dSis						break;
*4703203dSis					}
*4703203dSis
*4703203dSis					for (i = 0; i < sz; i++)
*4703203dSis						*ob++ = *ib++;
*4703203dSis				}
*4703203dSis			}
*4703203dSis		}
*4703203dSis	} else {
*4703203dSis		canonical_decomposition = flag & U8_CANON_DECOMP;
*4703203dSis		compatibility_decomposition = flag & U8_COMPAT_DECOMP;
*4703203dSis		canonical_composition = flag & U8_CANON_COMP;
*4703203dSis
*4703203dSis		while (ib < ibtail) {
*4703203dSis			if (*ib == '\0' && do_not_ignore_null)
*4703203dSis				break;
*4703203dSis
*4703203dSis			/*
*4703203dSis			 * If the current character is a 7-bit ASCII
*4703203dSis			 * character and it is the last character, or,
*4703203dSis			 * if the current character is a 7-bit ASCII
*4703203dSis			 * character and the next character is also a 7-bit
*4703203dSis			 * ASCII character, then, we copy over this
*4703203dSis			 * character without going through collect_a_seq().
*4703203dSis			 *
*4703203dSis			 * In any other cases, we need to look further with
*4703203dSis			 * the collect_a_seq() function.
*4703203dSis			 */
*4703203dSis			if (U8_ISASCII(*ib) && ((ib + 1) >= ibtail ||
*4703203dSis			    ((ib + 1) < ibtail && U8_ISASCII(*(ib + 1))))) {
*4703203dSis				if (ob >= obtail) {
*4703203dSis					*errno = E2BIG;
*4703203dSis					ret_val = (size_t)-1;
*4703203dSis					break;
*4703203dSis				}
*4703203dSis
*4703203dSis				if (is_it_toupper)
*4703203dSis					*ob = U8_ASCII_TOUPPER(*ib);
*4703203dSis				else if (is_it_tolower)
*4703203dSis					*ob = U8_ASCII_TOLOWER(*ib);
*4703203dSis				else
*4703203dSis					*ob = *ib;
*4703203dSis				ib++;
*4703203dSis				ob++;
*4703203dSis			} else {
*4703203dSis				*errno = 0;
*4703203dSis				state = U8_STATE_START;
*4703203dSis
*4703203dSis				j = collect_a_seq(unicode_version, u8s,
*4703203dSis				    &ib, ibtail,
*4703203dSis				    is_it_toupper,
*4703203dSis				    is_it_tolower,
*4703203dSis				    canonical_decomposition,
*4703203dSis				    compatibility_decomposition,
*4703203dSis				    canonical_composition,
*4703203dSis				    errno, &state);
*4703203dSis
*4703203dSis				if (*errno && do_not_ignore_invalid) {
*4703203dSis					ret_val = (size_t)-1;
*4703203dSis					break;
*4703203dSis				}
*4703203dSis
*4703203dSis				if ((obtail - ob) < j) {
*4703203dSis					*errno = E2BIG;
*4703203dSis					ret_val = (size_t)-1;
*4703203dSis					break;
*4703203dSis				}
*4703203dSis
*4703203dSis				for (i = 0; i < j; i++)
*4703203dSis					*ob++ = u8s[i];
*4703203dSis			}
*4703203dSis		}
*4703203dSis	}
*4703203dSis
*4703203dSis	*inlen = ibtail - ib;
*4703203dSis	*outlen = obtail - ob;
*4703203dSis
*4703203dSis	return (ret_val);
*4703203dSis}