xref: /illumos-gate/usr/src/lib/gss_mechs/mech_krb5/support/utf8.c (revision 55fea89dcaa64928bed4327112404dcb3e07b79f)
1*ba7b222eSGlenn Barry /*
2*ba7b222eSGlenn Barry  * util/support/utf8.c
3*ba7b222eSGlenn Barry  *
4*ba7b222eSGlenn Barry  * Copyright 2008 by the Massachusetts Institute of Technology.
5*ba7b222eSGlenn Barry  * All Rights Reserved.
6*ba7b222eSGlenn Barry  *
7*ba7b222eSGlenn Barry  * Export of this software from the United States of America may
8*ba7b222eSGlenn Barry  *   require a specific license from the United States Government.
9*ba7b222eSGlenn Barry  *   It is the responsibility of any person or organization contemplating
10*ba7b222eSGlenn Barry  *   export to obtain such a license before exporting.
11*ba7b222eSGlenn Barry  *
12*ba7b222eSGlenn Barry  * WITHIN THAT CONSTRAINT, permission to use, copy, modify, and
13*ba7b222eSGlenn Barry  * distribute this software and its documentation for any purpose and
14*ba7b222eSGlenn Barry  * without fee is hereby granted, provided that the above copyright
15*ba7b222eSGlenn Barry  * notice appear in all copies and that both that copyright notice and
16*ba7b222eSGlenn Barry  * this permission notice appear in supporting documentation, and that
17*ba7b222eSGlenn Barry  * the name of M.I.T. not be used in advertising or publicity pertaining
18*ba7b222eSGlenn Barry  * to distribution of the software without specific, written prior
19*ba7b222eSGlenn Barry  * permission.  Furthermore if you modify this software you must label
20*ba7b222eSGlenn Barry  * your software as modified software and not distribute it in such a
21*ba7b222eSGlenn Barry  * fashion that it might be confused with the original M.I.T. software.
22*ba7b222eSGlenn Barry  * M.I.T. makes no representations about the suitability of
23*ba7b222eSGlenn Barry  * this software for any purpose.  It is provided "as is" without express
24*ba7b222eSGlenn Barry  * or implied warranty.
25*ba7b222eSGlenn Barry  */
26*ba7b222eSGlenn Barry /* This work is part of OpenLDAP Software <http://www.openldap.org/>.
27*ba7b222eSGlenn Barry  *
28*ba7b222eSGlenn Barry  * Copyright 1998-2008 The OpenLDAP Foundation.
29*ba7b222eSGlenn Barry  * All rights reserved.
30*ba7b222eSGlenn Barry  *
31*ba7b222eSGlenn Barry  * Redistribution and use in source and binary forms, with or without
32*ba7b222eSGlenn Barry  * modification, are permitted only as authorized by the OpenLDAP
33*ba7b222eSGlenn Barry  * Public License.
34*ba7b222eSGlenn Barry  *
35*ba7b222eSGlenn Barry  * A copy of this license is available in the file LICENSE in the
36*ba7b222eSGlenn Barry  * top-level directory of the distribution or, alternatively, at
37*ba7b222eSGlenn Barry  * <http://www.OpenLDAP.org/license.html>.
38*ba7b222eSGlenn Barry  */
39*ba7b222eSGlenn Barry /* Basic UTF-8 routines
40*ba7b222eSGlenn Barry  *
41*ba7b222eSGlenn Barry  * These routines are "dumb".  Though they understand UTF-8,
42*ba7b222eSGlenn Barry  * they don't grok Unicode.  That is, they can push bits,
43*ba7b222eSGlenn Barry  * but don't have a clue what the bits represent.  That's
44*ba7b222eSGlenn Barry  * good enough for use with the KRB5 Client SDK.
45*ba7b222eSGlenn Barry  *
46*ba7b222eSGlenn Barry  * These routines are not optimized.
47*ba7b222eSGlenn Barry  */
48*ba7b222eSGlenn Barry 
49*ba7b222eSGlenn Barry #include "k5-platform.h"
50*ba7b222eSGlenn Barry #include "k5-utf8.h"
51*ba7b222eSGlenn Barry #include "supp-int.h"
52*ba7b222eSGlenn Barry 
/*
 * Count the bytes of the NUL-terminated string p, not including the
 * terminating NUL.  The bytes are not interpreted as UTF-8: every byte
 * before the terminator counts once.
 */
size_t krb5int_utf8_bytes(const char *p)
{
    const char *cursor = p;

    while (*cursor != '\0')
        cursor++;

    return (size_t)(cursor - p);
}
67*ba7b222eSGlenn Barry 
/*
 * Count the characters in the NUL-terminated string p.
 *
 * The string is walked one step at a time with KRB5_UTF8_INCR (not
 * defined in this file; presumably supplied by k5-utf8.h) and each step
 * counts as one character.  Nothing here validates the sequences.
 */
size_t krb5int_utf8_chars(const char *p)
{
    size_t count = 0;

    while (*p != '\0') {
        count++;
        KRB5_UTF8_INCR(p);
    }

    return count;
}
78*ba7b222eSGlenn Barry 
/*
 * Count the characters in the first length bytes of p.  The buffer is
 * delimited by length only; it does not have to be NUL-terminated.
 *
 * Stepping follows the same rule as krb5int_utf8_next(): a byte without
 * the high bit is one character; any other byte starts a character that
 * also swallows up to five following continuation bytes (10xxxxxx).
 * Unlike a plain call to that routine, the scan never looks at a byte
 * at or beyond p + length, so a sequence cut off by the end of the
 * buffer is counted as one character without reading out of bounds.
 *
 * No validation of the sequences is performed.
 */
size_t krb5int_utf8c_chars(const char *p, size_t length)
{
    const unsigned char *cur = (const unsigned char *)p;
    const unsigned char *end = cur + length;
    size_t chars = 0;

    while (cur < end) {
        size_t step = 1;

        if (*cur & 0x80) {
            /* Skip continuation bytes, but only those inside the buffer. */
            while (step < 6 && (size_t)(end - cur) > step &&
                   (cur[step] & 0xc0) == 0x80)
                step++;
        }

        cur += step;
        chars++;
    }

    return chars;
}
90*ba7b222eSGlenn Barry 
/*
 * Return the byte distance from p to the position produced by
 * KRB5_UTF8_NEXT(p), i.e. the offset of the next character.
 * KRB5_UTF8_NEXT is not defined in this file (presumably k5-utf8.h).
 */
int krb5int_utf8_offset(const char *p)
{
    const char *next = KRB5_UTF8_NEXT(p);

    return (int)(next - p);
}
96*ba7b222eSGlenn Barry 
/*
 * Sequence length indicated by a lead byte.
 *
 * The table covers bytes 0x80-0xFF and is indexed by (byte ^ 0x80), as
 * done by krb5int_utf8_charlen().  An entry of 0 means the byte cannot
 * start a sequence.  The two-octet shortest-form check is folded in:
 * the entries for 0xC0 and 0xC1 are 0.
 */
const char krb5int_utf8_lentab[] = {
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 0x80 - 0x8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 0x90 - 0x9F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 0xA0 - 0xAF */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 0xB0 - 0xBF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,	/* 0xC0 - 0xCF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,	/* 0xD0 - 0xDF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,	/* 0xE0 - 0xEF */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0	/* 0xF0 - 0xFF */
};
109*ba7b222eSGlenn Barry 
krb5int_utf8_charlen(const char * p)110*ba7b222eSGlenn Barry int krb5int_utf8_charlen(const char *p)
111*ba7b222eSGlenn Barry {
112*ba7b222eSGlenn Barry     if (!(*p & 0x80))
113*ba7b222eSGlenn Barry 	return 1;
114*ba7b222eSGlenn Barry 
115*ba7b222eSGlenn Barry     return krb5int_utf8_lentab[*(const unsigned char *)p ^ 0x80];
116*ba7b222eSGlenn Barry }
117*ba7b222eSGlenn Barry 
118*ba7b222eSGlenn Barry /*
119*ba7b222eSGlenn Barry  * Make sure the UTF-8 char used the shortest possible encoding
120*ba7b222eSGlenn Barry  * returns charlen if valid, 0 if not.
121*ba7b222eSGlenn Barry  *
122*ba7b222eSGlenn Barry  * Here are the valid UTF-8 encodings, taken from RFC 2279 page 4.
123*ba7b222eSGlenn Barry  * The table is slightly modified from that of the RFC.
124*ba7b222eSGlenn Barry  *
125*ba7b222eSGlenn Barry  * UCS-4 range (hex)      UTF-8 sequence (binary)
126*ba7b222eSGlenn Barry  * 0000 0000-0000 007F   0.......
127*ba7b222eSGlenn Barry  * 0000 0080-0000 07FF   110++++. 10......
128*ba7b222eSGlenn Barry  * 0000 0800-0000 FFFF   1110++++ 10+..... 10......
129*ba7b222eSGlenn Barry  * 0001 0000-001F FFFF   11110+++ 10++.... 10...... 10......
130*ba7b222eSGlenn Barry  * 0020 0000-03FF FFFF   111110++ 10+++... 10...... 10...... 10......
131*ba7b222eSGlenn Barry  * 0400 0000-7FFF FFFF   1111110+ 10++++.. 10...... 10...... 10...... 10......
132*ba7b222eSGlenn Barry  *
133*ba7b222eSGlenn Barry  * The '.' bits are "don't cares". When validating a UTF-8 sequence,
134*ba7b222eSGlenn Barry  * at least one of the '+' bits must be set, otherwise the character
135*ba7b222eSGlenn Barry  * should have been encoded in fewer octets. Note that in the two-octet
136*ba7b222eSGlenn Barry  * case, only the first octet needs to be validated, and this is done
137*ba7b222eSGlenn Barry  * in the krb5int_utf8_lentab[] above.
138*ba7b222eSGlenn Barry  */
139*ba7b222eSGlenn Barry 
/*
 * Mask of required bits in the second octet, indexed by the low five
 * bits of the lead byte (see krb5int_utf8_charlen2()).  At least one of
 * the masked bits must be set in the second octet.
 */
const char krb5int_utf8_mintab[] = {
    (const char)0x20, (const char)0x80, (const char)0x80, (const char)0x80,
    (const char)0x80, (const char)0x80, (const char)0x80, (const char)0x80,

    (const char)0x80, (const char)0x80, (const char)0x80, (const char)0x80,
    (const char)0x80, (const char)0x80, (const char)0x80, (const char)0x80,

    (const char)0x30, (const char)0x80, (const char)0x80, (const char)0x80,
    (const char)0x80, (const char)0x80, (const char)0x80, (const char)0x80,

    (const char)0x38, (const char)0x80, (const char)0x80, (const char)0x80,
    (const char)0x3c, (const char)0x80, (const char)0x00, (const char)0x00
};
/* Keep `c` free of any macro definition: code below uses it as an identifier. */
#undef c
149*ba7b222eSGlenn Barry 
krb5int_utf8_charlen2(const char * p)150*ba7b222eSGlenn Barry int krb5int_utf8_charlen2(const char *p)
151*ba7b222eSGlenn Barry {
152*ba7b222eSGlenn Barry     int i = KRB5_UTF8_CHARLEN(p);
153*ba7b222eSGlenn Barry 
154*ba7b222eSGlenn Barry     if (i > 2) {
155*ba7b222eSGlenn Barry 	if (!(krb5int_utf8_mintab[*p & 0x1f] & p[1]))
156*ba7b222eSGlenn Barry 	    i = 0;
157*ba7b222eSGlenn Barry     }
158*ba7b222eSGlenn Barry 
159*ba7b222eSGlenn Barry     return i;
160*ba7b222eSGlenn Barry }
161*ba7b222eSGlenn Barry 
162*ba7b222eSGlenn Barry /*
163*ba7b222eSGlenn Barry  * Convert a UTF8 character to a UCS4 character.  Return 0 on success,
164*ba7b222eSGlenn Barry  * -1 on failure.
165*ba7b222eSGlenn Barry  */
krb5int_utf8_to_ucs4(const char * p,krb5_ucs4 * out)166*ba7b222eSGlenn Barry int krb5int_utf8_to_ucs4(const char *p, krb5_ucs4 *out)
167*ba7b222eSGlenn Barry {
168*ba7b222eSGlenn Barry     const unsigned char *c = (const unsigned char *) p;
169*ba7b222eSGlenn Barry     krb5_ucs4 ch;
170*ba7b222eSGlenn Barry     int len, i;
171*ba7b222eSGlenn Barry     static unsigned char mask[] = {
172*ba7b222eSGlenn Barry 	0, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
173*ba7b222eSGlenn Barry 
174*ba7b222eSGlenn Barry     *out = 0;
175*ba7b222eSGlenn Barry     len = KRB5_UTF8_CHARLEN2(p, len);
176*ba7b222eSGlenn Barry 
177*ba7b222eSGlenn Barry     if (len == 0)
178*ba7b222eSGlenn Barry 	return -1;
179*ba7b222eSGlenn Barry 
180*ba7b222eSGlenn Barry     ch = c[0] & mask[len];
181*ba7b222eSGlenn Barry 
182*ba7b222eSGlenn Barry     for (i = 1; i < len; i++) {
183*ba7b222eSGlenn Barry 	if ((c[i] & 0xc0) != 0x80)
184*ba7b222eSGlenn Barry 	    return -1;
185*ba7b222eSGlenn Barry 
186*ba7b222eSGlenn Barry 	ch <<= 6;
187*ba7b222eSGlenn Barry 	ch |= c[i] & 0x3f;
188*ba7b222eSGlenn Barry     }
189*ba7b222eSGlenn Barry 
190*ba7b222eSGlenn Barry     *out = ch;
191*ba7b222eSGlenn Barry     return 0;
192*ba7b222eSGlenn Barry }
193*ba7b222eSGlenn Barry 
/*
 * Decode the UTF-8 character at p into the UCS-2 value *out.
 *
 * Returns 0 on success.  Returns -1, leaving *out set to 0, when
 * krb5int_utf8_to_ucs4() fails or the decoded value is above 0xFFFF.
 */
int krb5int_utf8_to_ucs2(const char *p, krb5_ucs2 *out)
{
    krb5_ucs4 wide;

    *out = 0;

    if (krb5int_utf8_to_ucs4(p, &wide) == -1)
        return -1;
    if (wide > 0xFFFF)
        return -1;

    *out = (krb5_ucs2)wide;
    return 0;
}
204*ba7b222eSGlenn Barry 
205*ba7b222eSGlenn Barry /* conv UCS-2 to UTF-8, not used */
krb5int_ucs4_to_utf8(krb5_ucs4 c,char * buf)206*ba7b222eSGlenn Barry size_t krb5int_ucs4_to_utf8(krb5_ucs4 c, char *buf)
207*ba7b222eSGlenn Barry {
208*ba7b222eSGlenn Barry     size_t len = 0;
209*ba7b222eSGlenn Barry     unsigned char *p = (unsigned char *) buf;
210*ba7b222eSGlenn Barry 
211*ba7b222eSGlenn Barry     /* not a valid Unicode character */
212*ba7b222eSGlenn Barry     if (c < 0)
213*ba7b222eSGlenn Barry 	return 0;
214*ba7b222eSGlenn Barry 
215*ba7b222eSGlenn Barry     /* Just return length, don't convert */
216*ba7b222eSGlenn Barry     if (buf == NULL) {
217*ba7b222eSGlenn Barry 	if (c < 0x80) return 1;
218*ba7b222eSGlenn Barry 	else if (c < 0x800) return 2;
219*ba7b222eSGlenn Barry 	else if (c < 0x10000) return 3;
220*ba7b222eSGlenn Barry 	else if (c < 0x200000) return 4;
221*ba7b222eSGlenn Barry 	else if (c < 0x4000000) return 5;
222*ba7b222eSGlenn Barry 	else return 6;
223*ba7b222eSGlenn Barry     }
224*ba7b222eSGlenn Barry 
225*ba7b222eSGlenn Barry     if (c < 0x80) {
226*ba7b222eSGlenn Barry 	p[len++] = c;
227*ba7b222eSGlenn Barry     } else if (c < 0x800) {
228*ba7b222eSGlenn Barry 	p[len++] = 0xc0 | ( c >> 6 );
229*ba7b222eSGlenn Barry 	p[len++] = 0x80 | ( c & 0x3f );
230*ba7b222eSGlenn Barry     } else if (c < 0x10000) {
231*ba7b222eSGlenn Barry 	p[len++] = 0xe0 | ( c >> 12 );
232*ba7b222eSGlenn Barry 	p[len++] = 0x80 | ( (c >> 6) & 0x3f );
233*ba7b222eSGlenn Barry 	p[len++] = 0x80 | ( c & 0x3f );
234*ba7b222eSGlenn Barry     } else if (c < 0x200000) {
235*ba7b222eSGlenn Barry 	p[len++] = 0xf0 | ( c >> 18 );
236*ba7b222eSGlenn Barry 	p[len++] = 0x80 | ( (c >> 12) & 0x3f );
237*ba7b222eSGlenn Barry 	p[len++] = 0x80 | ( (c >> 6) & 0x3f );
238*ba7b222eSGlenn Barry 	p[len++] = 0x80 | ( c & 0x3f );
239*ba7b222eSGlenn Barry     } else if (c < 0x4000000) {
240*ba7b222eSGlenn Barry 	p[len++] = 0xf8 | ( c >> 24 );
241*ba7b222eSGlenn Barry 	p[len++] = 0x80 | ( (c >> 18) & 0x3f );
242*ba7b222eSGlenn Barry 	p[len++] = 0x80 | ( (c >> 12) & 0x3f );
243*ba7b222eSGlenn Barry 	p[len++] = 0x80 | ( (c >> 6) & 0x3f );
244*ba7b222eSGlenn Barry 	p[len++] = 0x80 | ( c & 0x3f );
245*ba7b222eSGlenn Barry     } else /* if( c < 0x80000000 ) */ {
246*ba7b222eSGlenn Barry 	p[len++] = 0xfc | ( c >> 30 );
247*ba7b222eSGlenn Barry 	p[len++] = 0x80 | ( (c >> 24) & 0x3f );
248*ba7b222eSGlenn Barry 	p[len++] = 0x80 | ( (c >> 18) & 0x3f );
249*ba7b222eSGlenn Barry 	p[len++] = 0x80 | ( (c >> 12) & 0x3f );
250*ba7b222eSGlenn Barry 	p[len++] = 0x80 | ( (c >> 6) & 0x3f );
251*ba7b222eSGlenn Barry 	p[len++] = 0x80 | ( c & 0x3f );
252*ba7b222eSGlenn Barry     }
253*ba7b222eSGlenn Barry 
254*ba7b222eSGlenn Barry     return len;
255*ba7b222eSGlenn Barry }
256*ba7b222eSGlenn Barry 
/*
 * Encode the UCS-2 value c as UTF-8 by widening it to krb5_ucs4 and
 * handing it to krb5int_ucs4_to_utf8(); buf and the return value have
 * the meaning they have there.
 */
size_t krb5int_ucs2_to_utf8(krb5_ucs2 c, char *buf)
{
    krb5_ucs4 wide = (krb5_ucs4)c;

    return krb5int_ucs4_to_utf8(wide, buf);
}
261*ba7b222eSGlenn Barry 
/*
 * Number of bytes needed to encode the UCS value c in UTF-8:
 * 0 for a negative value, otherwise 1 to 6.
 *
 * The argument and the whole expansion are parenthesized so the macro
 * can be used inside larger expressions and with compound arguments.
 * The argument is evaluated several times; it must have no side effects.
 */
#define KRB5_UCS_UTF8LEN(c)	\
    ((c) < 0 ? 0 : ((c) < 0x80 ? 1 : ((c) < 0x800 ? 2 : \
    ((c) < 0x10000 ? 3 : ((c) < 0x200000 ? 4 : \
    ((c) < 0x4000000 ? 5 : 6))))))
265*ba7b222eSGlenn Barry 
/*
 * Advance to the next UTF-8 character.
 *
 * The length announced by the lead byte is ignored; instead the
 * continuation markers are used to find the start of the next
 * character.  This allows resyncing after invalid input, provided the
 * start of the next character appears within the 6 bytes examined.
 * If it does not, the position 6 bytes after p is returned.
 *
 * KRB5_UTF8_ISASCII is not defined in this file (presumably k5-utf8.h).
 */
char *krb5int_utf8_next(const char *p)
{
    const unsigned char *bytes = (const unsigned char *)p;
    int off = 1;

    if (!KRB5_UTF8_ISASCII(bytes)) {
        /* Step over continuation bytes (10xxxxxx), at most five. */
        while (off < 6 && (bytes[off] & 0xc0) == 0x80)
            off++;
    }

    return (char *)&p[off];
}
292*ba7b222eSGlenn Barry 
/*
 * Step back to the previous UTF-8 character.
 *
 * The bytes before p are scanned backwards for the first one that is
 * not a continuation byte (10xxxxxx); that byte is taken as the start
 * of the previous character.  This allows resyncing after invalid
 * input, provided such a byte appears within the 5 bytes before p.
 * If it does not, the position 6 bytes before p is returned.
 *
 * The caller must make sure the bytes before p are readable; the
 * start of the buffer is not known here.
 */
char *krb5int_utf8_prev(const char *p)
{
    const unsigned char *bytes = (const unsigned char *)p;
    int back = 1;

    while (back < 6 && (bytes[-back] & 0xc0) == 0x80)
        back++;

    return (char *)(p - back);
}
315*ba7b222eSGlenn Barry 
/*
 * Copy one UTF-8 character from src to dst and return the number of
 * bytes copied (1 to 6).  No terminating NUL is written to dst.
 *
 * The length announced by the lead byte is ignored; the first byte is
 * always copied, followed by up to five continuation bytes (10xxxxxx).
 * This allows resyncing after invalid input, provided the start of the
 * next character appears within the 6 bytes examined.
 *
 * KRB5_UTF8_ISASCII is not defined in this file (presumably k5-utf8.h).
 */
int krb5int_utf8_copy(char *dst, const char *src)
{
    const unsigned char *bytes = (const unsigned char *)src;
    int n = 1;

    dst[0] = src[0];

    if (!KRB5_UTF8_ISASCII(bytes)) {
        while (n < 6 && (bytes[n] & 0xc0) == 0x80) {
            dst[n] = src[n];
            n++;
        }
    }

    return n;
}
346*ba7b222eSGlenn Barry 
347*ba7b222eSGlenn Barry #ifndef UTF8_ALPHA_CTYPE
348*ba7b222eSGlenn Barry /*
349*ba7b222eSGlenn Barry  * UTF-8 ctype routines
350*ba7b222eSGlenn Barry  * Only deals with characters < 0x80 (ie: US-ASCII)
351*ba7b222eSGlenn Barry  */
352*ba7b222eSGlenn Barry 
/*
 * Return the result of KRB5_ASCII() applied to the first byte at p.
 * Only that one byte is read.  KRB5_ASCII is not defined in this file
 * (presumably k5-utf8.h).
 */
int krb5int_utf8_isascii(const char *p)
{
    unsigned byte = *(const unsigned char *)p;

    return KRB5_ASCII(byte);
}
359*ba7b222eSGlenn Barry 
/*
 * Return 0 if the first byte at p fails KRB5_ASCII(), otherwise the
 * result of KRB5_DIGIT() for that byte.  Both macros are defined
 * outside this file (presumably k5-utf8.h).
 */
int krb5int_utf8_isdigit(const char *p)
{
    unsigned byte = *(const unsigned char *)p;

    return !KRB5_ASCII(byte) ? 0 : KRB5_DIGIT(byte);
}
369*ba7b222eSGlenn Barry 
/*
 * Return 0 if the first byte at p fails KRB5_ASCII(), otherwise the
 * result of KRB5_HEX() for that byte.  Both macros are defined
 * outside this file (presumably k5-utf8.h).
 */
int krb5int_utf8_isxdigit(const char *p)
{
    unsigned byte = *(const unsigned char *)p;

    return !KRB5_ASCII(byte) ? 0 : KRB5_HEX(byte);
}
379*ba7b222eSGlenn Barry 
/*
 * Return 1 if the first byte at p passes KRB5_ASCII() and is one of
 * space, tab, newline, carriage return, vertical tab or form feed;
 * return 0 otherwise.  KRB5_ASCII is defined outside this file
 * (presumably k5-utf8.h).
 */
int krb5int_utf8_isspace(const char *p)
{
    unsigned byte = *(const unsigned char *)p;

    if (!KRB5_ASCII(byte))
        return 0;

    return byte == ' ' || byte == '\t' || byte == '\n' ||
        byte == '\r' || byte == '\v' || byte == '\f';
}
399*ba7b222eSGlenn Barry 
400*ba7b222eSGlenn Barry /*
401*ba7b222eSGlenn Barry  * These are not needed by the C SDK and are
402*ba7b222eSGlenn Barry  * not "good enough" for general use.
403*ba7b222eSGlenn Barry  */
/*
 * Return 0 if the first byte at p fails KRB5_ASCII(), otherwise the
 * result of KRB5_ALPHA() for that byte.  Both macros are defined
 * outside this file (presumably k5-utf8.h).
 */
int krb5int_utf8_isalpha(const char *p)
{
    unsigned byte = *(const unsigned char *)p;

    return !KRB5_ASCII(byte) ? 0 : KRB5_ALPHA(byte);
}
413*ba7b222eSGlenn Barry 
/*
 * Return 0 if the first byte at p fails KRB5_ASCII(), otherwise the
 * result of KRB5_ALNUM() for that byte.  Both macros are defined
 * outside this file (presumably k5-utf8.h).
 */
int krb5int_utf8_isalnum(const char *p)
{
    unsigned byte = *(const unsigned char *)p;

    return !KRB5_ASCII(byte) ? 0 : KRB5_ALNUM(byte);
}
423*ba7b222eSGlenn Barry 
424*ba7b222eSGlenn Barry #if 0
/*
 * Return 0 if the first byte at p fails KRB5_ASCII(), otherwise the
 * result of KRB5_LOWER() for that byte.  Currently compiled out by the
 * enclosing "#if 0".
 */
int krb5int_utf8_islower(const char *p)
{
    unsigned byte = *(const unsigned char *)p;

    return !KRB5_ASCII(byte) ? 0 : KRB5_LOWER(byte);
}
434*ba7b222eSGlenn Barry 
/*
 * Return 0 if the first byte at p fails KRB5_ASCII(), otherwise the
 * result of KRB5_UPPER() for that byte.  Currently compiled out by the
 * enclosing "#if 0".
 */
int krb5int_utf8_isupper(const char *p)
{
    unsigned byte = *(const unsigned char *)p;

    return !KRB5_ASCII(byte) ? 0 : KRB5_UPPER(byte);
}
444*ba7b222eSGlenn Barry #endif
445*ba7b222eSGlenn Barry #endif
446*ba7b222eSGlenn Barry 
447*ba7b222eSGlenn Barry 
448*ba7b222eSGlenn Barry /*
449*ba7b222eSGlenn Barry  * UTF-8 string routines
450*ba7b222eSGlenn Barry  */
451*ba7b222eSGlenn Barry 
452*ba7b222eSGlenn Barry /* like strchr() */
krb5int_utf8_strchr(const char * str,const char * chr)453*ba7b222eSGlenn Barry char *krb5int_utf8_strchr(const char *str, const char *chr)
454*ba7b222eSGlenn Barry {
455*ba7b222eSGlenn Barry     krb5_ucs4 chs, ch;
456*ba7b222eSGlenn Barry 
457*ba7b222eSGlenn Barry     if (krb5int_utf8_to_ucs4(chr, &ch) == -1)
458*ba7b222eSGlenn Barry 	return NULL;
459*ba7b222eSGlenn Barry     for ( ; *str != '\0'; KRB5_UTF8_INCR(str)) {
460*ba7b222eSGlenn Barry 	if (krb5int_utf8_to_ucs4(str, &chs) == 0 && chs == ch)
461*ba7b222eSGlenn Barry 	    return (char *)str;
462*ba7b222eSGlenn Barry     }
463*ba7b222eSGlenn Barry 
464*ba7b222eSGlenn Barry     return NULL;
465*ba7b222eSGlenn Barry }
466*ba7b222eSGlenn Barry 
467*ba7b222eSGlenn Barry /* like strcspn() but returns number of bytes, not characters */
krb5int_utf8_strcspn(const char * str,const char * set)468*ba7b222eSGlenn Barry size_t krb5int_utf8_strcspn(const char *str, const char *set)
469*ba7b222eSGlenn Barry {
470*ba7b222eSGlenn Barry     const char *cstr, *cset;
471*ba7b222eSGlenn Barry     krb5_ucs4 chstr, chset;
472*ba7b222eSGlenn Barry 
473*ba7b222eSGlenn Barry     for (cstr = str; *cstr != '\0'; KRB5_UTF8_INCR(cstr)) {
474*ba7b222eSGlenn Barry 	for (cset = set; *cset != '\0'; KRB5_UTF8_INCR(cset)) {
475*ba7b222eSGlenn Barry 	    if (krb5int_utf8_to_ucs4(cstr, &chstr) == 0
476*ba7b222eSGlenn Barry 		&& krb5int_utf8_to_ucs4(cset, &chset) == 0 && chstr == chset)
477*ba7b222eSGlenn Barry 		return cstr - str;
478*ba7b222eSGlenn Barry 	}
479*ba7b222eSGlenn Barry     }
480*ba7b222eSGlenn Barry 
481*ba7b222eSGlenn Barry     return cstr - str;
482*ba7b222eSGlenn Barry }
483*ba7b222eSGlenn Barry 
484*ba7b222eSGlenn Barry /* like strspn() but returns number of bytes, not characters */
krb5int_utf8_strspn(const char * str,const char * set)485*ba7b222eSGlenn Barry size_t krb5int_utf8_strspn(const char *str, const char *set)
486*ba7b222eSGlenn Barry {
487*ba7b222eSGlenn Barry     const char *cstr, *cset;
488*ba7b222eSGlenn Barry     krb5_ucs4 chstr, chset;
489*ba7b222eSGlenn Barry 
490*ba7b222eSGlenn Barry     for (cstr = str; *cstr != '\0'; KRB5_UTF8_INCR(cstr)) {
491*ba7b222eSGlenn Barry 	for (cset = set; ; KRB5_UTF8_INCR(cset)) {
492*ba7b222eSGlenn Barry 	    if (*cset == '\0')
493*ba7b222eSGlenn Barry 		return cstr - str;
494*ba7b222eSGlenn Barry 	    if (krb5int_utf8_to_ucs4(cstr, &chstr) == 0
495*ba7b222eSGlenn Barry 		&& krb5int_utf8_to_ucs4(cset, &chset) == 0 && chstr == chset)
496*ba7b222eSGlenn Barry 		break;
497*ba7b222eSGlenn Barry 	}
498*ba7b222eSGlenn Barry     }
499*ba7b222eSGlenn Barry 
500*ba7b222eSGlenn Barry     return cstr - str;
501*ba7b222eSGlenn Barry }
502*ba7b222eSGlenn Barry 
503*ba7b222eSGlenn Barry /* like strpbrk(), replaces strchr() as well */
krb5int_utf8_strpbrk(const char * str,const char * set)504*ba7b222eSGlenn Barry char *krb5int_utf8_strpbrk(const char *str, const char *set)
505*ba7b222eSGlenn Barry {
506*ba7b222eSGlenn Barry     const char *cset;
507*ba7b222eSGlenn Barry     krb5_ucs4 chstr, chset;
508*ba7b222eSGlenn Barry 
509*ba7b222eSGlenn Barry     for ( ; *str != '\0'; KRB5_UTF8_INCR(str)) {
510*ba7b222eSGlenn Barry 	for (cset = set; *cset != '\0'; KRB5_UTF8_INCR(cset)) {
511*ba7b222eSGlenn Barry 	    if (krb5int_utf8_to_ucs4(str, &chstr) == 0
512*ba7b222eSGlenn Barry 		&& krb5int_utf8_to_ucs4(cset, &chset) == 0 && chstr == chset)
513*ba7b222eSGlenn Barry 		return (char *)str;
514*ba7b222eSGlenn Barry 	}
515*ba7b222eSGlenn Barry     }
516*ba7b222eSGlenn Barry 
517*ba7b222eSGlenn Barry     return NULL;
518*ba7b222eSGlenn Barry }
519*ba7b222eSGlenn Barry 
/*
 * Like strtok_r(), not strtok(): split a string into tokens separated
 * by any of the UTF-8 characters in sep.
 *
 * str   string to tokenize on the first call, NULL on later calls to
 *       continue from the position saved in *last.  The string is
 *       modified: the separator that ends a token is overwritten with
 *       a NUL byte.
 * sep   set of separator characters.
 * last  caller-provided state; must not be NULL.
 *
 * Returns the next token, or NULL when last is NULL or no token is
 * left.  Once the string is exhausted *last is set to NULL, and
 * further calls with str == NULL keep returning NULL.
 */
char *krb5int_utf8_strtok(char *str, const char *sep, char **last)
{
    char *token;
    char *end;

    if (last == NULL)
        return NULL;

    token = (str != NULL) ? str : *last;

    /*
     * *last is NULL after an earlier call ran out of tokens; there is
     * nothing to resume from, so do not hand NULL to the scanners.
     */
    if (token == NULL)
        return NULL;

    /* Skip leading separators. */
    token += krb5int_utf8_strspn(token, sep);

    if (*token == '\0') {
        *last = NULL;
        return NULL;
    }

    end = token + krb5int_utf8_strcspn(token, sep);

    if (*end != '\0') {
        /* Find the resume point before the separator is overwritten. */
        char *next = KRB5_UTF8_NEXT(end);

        *end = '\0';
        end = next;
    }

    *last = end;

    return token;
}
550