1*ba7b222eSGlenn Barry /*
2*ba7b222eSGlenn Barry * util/support/utf8.c
3*ba7b222eSGlenn Barry *
4*ba7b222eSGlenn Barry * Copyright 2008 by the Massachusetts Institute of Technology.
5*ba7b222eSGlenn Barry * All Rights Reserved.
6*ba7b222eSGlenn Barry *
7*ba7b222eSGlenn Barry * Export of this software from the United States of America may
8*ba7b222eSGlenn Barry * require a specific license from the United States Government.
9*ba7b222eSGlenn Barry * It is the responsibility of any person or organization contemplating
10*ba7b222eSGlenn Barry * export to obtain such a license before exporting.
11*ba7b222eSGlenn Barry *
12*ba7b222eSGlenn Barry * WITHIN THAT CONSTRAINT, permission to use, copy, modify, and
13*ba7b222eSGlenn Barry * distribute this software and its documentation for any purpose and
14*ba7b222eSGlenn Barry * without fee is hereby granted, provided that the above copyright
15*ba7b222eSGlenn Barry * notice appear in all copies and that both that copyright notice and
16*ba7b222eSGlenn Barry * this permission notice appear in supporting documentation, and that
17*ba7b222eSGlenn Barry * the name of M.I.T. not be used in advertising or publicity pertaining
18*ba7b222eSGlenn Barry * to distribution of the software without specific, written prior
19*ba7b222eSGlenn Barry * permission. Furthermore if you modify this software you must label
20*ba7b222eSGlenn Barry * your software as modified software and not distribute it in such a
21*ba7b222eSGlenn Barry * fashion that it might be confused with the original M.I.T. software.
22*ba7b222eSGlenn Barry * M.I.T. makes no representations about the suitability of
23*ba7b222eSGlenn Barry * this software for any purpose. It is provided "as is" without express
24*ba7b222eSGlenn Barry * or implied warranty.
25*ba7b222eSGlenn Barry */
26*ba7b222eSGlenn Barry /* This work is part of OpenLDAP Software <http://www.openldap.org/>.
27*ba7b222eSGlenn Barry *
28*ba7b222eSGlenn Barry * Copyright 1998-2008 The OpenLDAP Foundation.
29*ba7b222eSGlenn Barry * All rights reserved.
30*ba7b222eSGlenn Barry *
31*ba7b222eSGlenn Barry * Redistribution and use in source and binary forms, with or without
32*ba7b222eSGlenn Barry * modification, are permitted only as authorized by the OpenLDAP
33*ba7b222eSGlenn Barry * Public License.
34*ba7b222eSGlenn Barry *
35*ba7b222eSGlenn Barry * A copy of this license is available in the file LICENSE in the
36*ba7b222eSGlenn Barry * top-level directory of the distribution or, alternatively, at
37*ba7b222eSGlenn Barry * <http://www.OpenLDAP.org/license.html>.
38*ba7b222eSGlenn Barry */
39*ba7b222eSGlenn Barry /* Basic UTF-8 routines
40*ba7b222eSGlenn Barry *
41*ba7b222eSGlenn Barry * These routines are "dumb". Though they understand UTF-8,
42*ba7b222eSGlenn Barry * they don't grok Unicode. That is, they can push bits,
43*ba7b222eSGlenn Barry * but don't have a clue what the bits represent. That's
44*ba7b222eSGlenn Barry * good enough for use with the KRB5 Client SDK.
45*ba7b222eSGlenn Barry *
46*ba7b222eSGlenn Barry * These routines are not optimized.
47*ba7b222eSGlenn Barry */
48*ba7b222eSGlenn Barry
49*ba7b222eSGlenn Barry #include "k5-platform.h"
50*ba7b222eSGlenn Barry #include "k5-utf8.h"
51*ba7b222eSGlenn Barry #include "supp-int.h"
52*ba7b222eSGlenn Barry
/*
 * Return the number of bytes occupied by the NUL-terminated UTF-8
 * string p, not counting the terminating NUL byte.
 */
size_t krb5int_utf8_bytes(const char *p)
{
    const char *cursor = p;

    /* Walk to the terminator; the distance covered is the byte length. */
    while (*cursor != '\0')
        cursor++;

    return (size_t) (cursor - p);
}
67*ba7b222eSGlenn Barry
/*
 * Count the characters in the NUL-terminated UTF-8 string p by stepping
 * through it with KRB5_UTF8_INCR until the terminator is reached.
 *
 * No validation of the byte sequences is performed here, and the loop
 * is not optimized.
 */
size_t krb5int_utf8_chars(const char *p)
{
    size_t count = 0;

    while (*p) {
        count++;
        KRB5_UTF8_INCR(p);
    }

    return count;
}
78*ba7b222eSGlenn Barry
/*
 * Count the characters found in the first `length` bytes starting at p,
 * stepping with KRB5_UTF8_INCR.  The buffer is bounded by `length`, not
 * by a NUL terminator.
 *
 * No validation of the byte sequences is performed here, and the loop
 * is not optimized.
 */
size_t krb5int_utf8c_chars(const char *p, size_t length)
{
    const char *limit = p + length;
    size_t count = 0;

    while (p < limit) {
        count++;
        KRB5_UTF8_INCR(p);
    }

    return count;
}
90*ba7b222eSGlenn Barry
/* Return the byte distance from p to the position KRB5_UTF8_NEXT(p) yields. */
int krb5int_utf8_offset(const char *p)
{
    const char *following = KRB5_UTF8_NEXT(p);

    return following - p;
}
96*ba7b222eSGlenn Barry
/*
 * Sequence length indicated by a first byte that has its high bit set.
 *
 * krb5int_utf8_charlen() indexes this table with (byte ^ 0x80), so entry
 * N describes byte 0x80 + N; each row below is labelled with the byte
 * values it covers.  A zero entry marks a byte that does not begin an
 * accepted sequence.
 */
const char krb5int_utf8_lentab[] = {
    /* 0x80-0x8f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x90-0x9f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xa0-0xaf */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xb0-0xbf */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xc0-0xcf */ 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xd0-0xdf */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xe0-0xef */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xf0-0xff */ 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
109*ba7b222eSGlenn Barry
krb5int_utf8_charlen(const char * p)110*ba7b222eSGlenn Barry int krb5int_utf8_charlen(const char *p)
111*ba7b222eSGlenn Barry {
112*ba7b222eSGlenn Barry if (!(*p & 0x80))
113*ba7b222eSGlenn Barry return 1;
114*ba7b222eSGlenn Barry
115*ba7b222eSGlenn Barry return krb5int_utf8_lentab[*(const unsigned char *)p ^ 0x80];
116*ba7b222eSGlenn Barry }
117*ba7b222eSGlenn Barry
118*ba7b222eSGlenn Barry /*
119*ba7b222eSGlenn Barry * Make sure the UTF-8 char used the shortest possible encoding
120*ba7b222eSGlenn Barry * returns charlen if valid, 0 if not.
121*ba7b222eSGlenn Barry *
122*ba7b222eSGlenn Barry * Here are the valid UTF-8 encodings, taken from RFC 2279 page 4.
123*ba7b222eSGlenn Barry * The table is slightly modified from that of the RFC.
124*ba7b222eSGlenn Barry *
125*ba7b222eSGlenn Barry * UCS-4 range (hex) UTF-8 sequence (binary)
126*ba7b222eSGlenn Barry * 0000 0000-0000 007F 0.......
127*ba7b222eSGlenn Barry * 0000 0080-0000 07FF 110++++. 10......
128*ba7b222eSGlenn Barry * 0000 0800-0000 FFFF 1110++++ 10+..... 10......
129*ba7b222eSGlenn Barry * 0001 0000-001F FFFF 11110+++ 10++.... 10...... 10......
130*ba7b222eSGlenn Barry * 0020 0000-03FF FFFF 111110++ 10+++... 10...... 10...... 10......
131*ba7b222eSGlenn Barry * 0400 0000-7FFF FFFF 1111110+ 10++++.. 10...... 10...... 10...... 10......
132*ba7b222eSGlenn Barry *
133*ba7b222eSGlenn Barry * The '.' bits are "don't cares". When validating a UTF-8 sequence,
134*ba7b222eSGlenn Barry * at least one of the '+' bits must be set, otherwise the character
135*ba7b222eSGlenn Barry * should have been encoded in fewer octets. Note that in the two-octet
136*ba7b222eSGlenn Barry * case, only the first octet needs to be validated, and this is done
137*ba7b222eSGlenn Barry * in the krb5int_utf8_lentab[] above.
138*ba7b222eSGlenn Barry */
139*ba7b222eSGlenn Barry
/*
 * Mask of required bits in the second octet: at least one of the masked
 * bits must be set, otherwise the character should have been encoded in
 * fewer octets.  krb5int_utf8_charlen2() indexes this table with the low
 * five bits of the first octet; each row is labelled with the index of
 * its first entry.
 */
#undef c
#define c const char
c krb5int_utf8_mintab[] = {
    /* 0x00 */ (c)0x20, (c)0x80, (c)0x80, (c)0x80,
    /* 0x04 */ (c)0x80, (c)0x80, (c)0x80, (c)0x80,
    /* 0x08 */ (c)0x80, (c)0x80, (c)0x80, (c)0x80,
    /* 0x0c */ (c)0x80, (c)0x80, (c)0x80, (c)0x80,
    /* 0x10 */ (c)0x30, (c)0x80, (c)0x80, (c)0x80,
    /* 0x14 */ (c)0x80, (c)0x80, (c)0x80, (c)0x80,
    /* 0x18 */ (c)0x38, (c)0x80, (c)0x80, (c)0x80,
    /* 0x1c */ (c)0x3c, (c)0x80, (c)0x00, (c)0x00
};
#undef c
149*ba7b222eSGlenn Barry
krb5int_utf8_charlen2(const char * p)150*ba7b222eSGlenn Barry int krb5int_utf8_charlen2(const char *p)
151*ba7b222eSGlenn Barry {
152*ba7b222eSGlenn Barry int i = KRB5_UTF8_CHARLEN(p);
153*ba7b222eSGlenn Barry
154*ba7b222eSGlenn Barry if (i > 2) {
155*ba7b222eSGlenn Barry if (!(krb5int_utf8_mintab[*p & 0x1f] & p[1]))
156*ba7b222eSGlenn Barry i = 0;
157*ba7b222eSGlenn Barry }
158*ba7b222eSGlenn Barry
159*ba7b222eSGlenn Barry return i;
160*ba7b222eSGlenn Barry }
161*ba7b222eSGlenn Barry
162*ba7b222eSGlenn Barry /*
163*ba7b222eSGlenn Barry * Convert a UTF8 character to a UCS4 character. Return 0 on success,
164*ba7b222eSGlenn Barry * -1 on failure.
165*ba7b222eSGlenn Barry */
krb5int_utf8_to_ucs4(const char * p,krb5_ucs4 * out)166*ba7b222eSGlenn Barry int krb5int_utf8_to_ucs4(const char *p, krb5_ucs4 *out)
167*ba7b222eSGlenn Barry {
168*ba7b222eSGlenn Barry const unsigned char *c = (const unsigned char *) p;
169*ba7b222eSGlenn Barry krb5_ucs4 ch;
170*ba7b222eSGlenn Barry int len, i;
171*ba7b222eSGlenn Barry static unsigned char mask[] = {
172*ba7b222eSGlenn Barry 0, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
173*ba7b222eSGlenn Barry
174*ba7b222eSGlenn Barry *out = 0;
175*ba7b222eSGlenn Barry len = KRB5_UTF8_CHARLEN2(p, len);
176*ba7b222eSGlenn Barry
177*ba7b222eSGlenn Barry if (len == 0)
178*ba7b222eSGlenn Barry return -1;
179*ba7b222eSGlenn Barry
180*ba7b222eSGlenn Barry ch = c[0] & mask[len];
181*ba7b222eSGlenn Barry
182*ba7b222eSGlenn Barry for (i = 1; i < len; i++) {
183*ba7b222eSGlenn Barry if ((c[i] & 0xc0) != 0x80)
184*ba7b222eSGlenn Barry return -1;
185*ba7b222eSGlenn Barry
186*ba7b222eSGlenn Barry ch <<= 6;
187*ba7b222eSGlenn Barry ch |= c[i] & 0x3f;
188*ba7b222eSGlenn Barry }
189*ba7b222eSGlenn Barry
190*ba7b222eSGlenn Barry *out = ch;
191*ba7b222eSGlenn Barry return 0;
192*ba7b222eSGlenn Barry }
193*ba7b222eSGlenn Barry
/*
 * Decode the UTF-8 character at p into a UCS-2 value stored in *out.
 *
 * Returns 0 on success.  Returns -1, leaving *out set to 0, when
 * krb5int_utf8_to_ucs4() fails or the decoded value exceeds 0xFFFF.
 */
int krb5int_utf8_to_ucs2(const char *p, krb5_ucs2 *out)
{
    krb5_ucs4 wide;

    *out = 0;

    if (krb5int_utf8_to_ucs4(p, &wide) == -1)
        return -1;
    if (wide > 0xFFFF)
        return -1;

    *out = (krb5_ucs2) wide;
    return 0;
}
204*ba7b222eSGlenn Barry
205*ba7b222eSGlenn Barry /* conv UCS-2 to UTF-8, not used */
krb5int_ucs4_to_utf8(krb5_ucs4 c,char * buf)206*ba7b222eSGlenn Barry size_t krb5int_ucs4_to_utf8(krb5_ucs4 c, char *buf)
207*ba7b222eSGlenn Barry {
208*ba7b222eSGlenn Barry size_t len = 0;
209*ba7b222eSGlenn Barry unsigned char *p = (unsigned char *) buf;
210*ba7b222eSGlenn Barry
211*ba7b222eSGlenn Barry /* not a valid Unicode character */
212*ba7b222eSGlenn Barry if (c < 0)
213*ba7b222eSGlenn Barry return 0;
214*ba7b222eSGlenn Barry
215*ba7b222eSGlenn Barry /* Just return length, don't convert */
216*ba7b222eSGlenn Barry if (buf == NULL) {
217*ba7b222eSGlenn Barry if (c < 0x80) return 1;
218*ba7b222eSGlenn Barry else if (c < 0x800) return 2;
219*ba7b222eSGlenn Barry else if (c < 0x10000) return 3;
220*ba7b222eSGlenn Barry else if (c < 0x200000) return 4;
221*ba7b222eSGlenn Barry else if (c < 0x4000000) return 5;
222*ba7b222eSGlenn Barry else return 6;
223*ba7b222eSGlenn Barry }
224*ba7b222eSGlenn Barry
225*ba7b222eSGlenn Barry if (c < 0x80) {
226*ba7b222eSGlenn Barry p[len++] = c;
227*ba7b222eSGlenn Barry } else if (c < 0x800) {
228*ba7b222eSGlenn Barry p[len++] = 0xc0 | ( c >> 6 );
229*ba7b222eSGlenn Barry p[len++] = 0x80 | ( c & 0x3f );
230*ba7b222eSGlenn Barry } else if (c < 0x10000) {
231*ba7b222eSGlenn Barry p[len++] = 0xe0 | ( c >> 12 );
232*ba7b222eSGlenn Barry p[len++] = 0x80 | ( (c >> 6) & 0x3f );
233*ba7b222eSGlenn Barry p[len++] = 0x80 | ( c & 0x3f );
234*ba7b222eSGlenn Barry } else if (c < 0x200000) {
235*ba7b222eSGlenn Barry p[len++] = 0xf0 | ( c >> 18 );
236*ba7b222eSGlenn Barry p[len++] = 0x80 | ( (c >> 12) & 0x3f );
237*ba7b222eSGlenn Barry p[len++] = 0x80 | ( (c >> 6) & 0x3f );
238*ba7b222eSGlenn Barry p[len++] = 0x80 | ( c & 0x3f );
239*ba7b222eSGlenn Barry } else if (c < 0x4000000) {
240*ba7b222eSGlenn Barry p[len++] = 0xf8 | ( c >> 24 );
241*ba7b222eSGlenn Barry p[len++] = 0x80 | ( (c >> 18) & 0x3f );
242*ba7b222eSGlenn Barry p[len++] = 0x80 | ( (c >> 12) & 0x3f );
243*ba7b222eSGlenn Barry p[len++] = 0x80 | ( (c >> 6) & 0x3f );
244*ba7b222eSGlenn Barry p[len++] = 0x80 | ( c & 0x3f );
245*ba7b222eSGlenn Barry } else /* if( c < 0x80000000 ) */ {
246*ba7b222eSGlenn Barry p[len++] = 0xfc | ( c >> 30 );
247*ba7b222eSGlenn Barry p[len++] = 0x80 | ( (c >> 24) & 0x3f );
248*ba7b222eSGlenn Barry p[len++] = 0x80 | ( (c >> 18) & 0x3f );
249*ba7b222eSGlenn Barry p[len++] = 0x80 | ( (c >> 12) & 0x3f );
250*ba7b222eSGlenn Barry p[len++] = 0x80 | ( (c >> 6) & 0x3f );
251*ba7b222eSGlenn Barry p[len++] = 0x80 | ( c & 0x3f );
252*ba7b222eSGlenn Barry }
253*ba7b222eSGlenn Barry
254*ba7b222eSGlenn Barry return len;
255*ba7b222eSGlenn Barry }
256*ba7b222eSGlenn Barry
/*
 * Convert the UCS-2 value c to UTF-8 by widening it to krb5_ucs4 and
 * delegating to krb5int_ucs4_to_utf8(); buf and the return value have
 * the same meaning as for that function.
 */
size_t krb5int_ucs2_to_utf8(krb5_ucs2 c, char *buf)
{
    krb5_ucs4 wide = (krb5_ucs4) c;

    return krb5int_ucs4_to_utf8(wide, buf);
}
261*ba7b222eSGlenn Barry
/*
 * Number of octets needed to encode the value c in UTF-8: 0 for a
 * negative value, otherwise 1 to 6.
 *
 * The argument and the whole expansion are parenthesized so the macro
 * can be used inside larger expressions and with non-trivial argument
 * expressions.  The argument is still evaluated more than once, so it
 * must not have side effects.
 */
#define KRB5_UCS_UTF8LEN(c)                                             \
    ((c) < 0 ? 0 : ((c) < 0x80 ? 1 : ((c) < 0x800 ? 2 :                 \
     ((c) < 0x10000 ? 3 : ((c) < 0x200000 ? 4 :                         \
      ((c) < 0x4000000 ? 5 : 6))))))
265*ba7b222eSGlenn Barry
/*
 * Return a pointer to the start of the UTF-8 character following the
 * one at p.
 *
 * The length implied by the first octet is ignored; continuation
 * markers (octets of the form 10xxxxxx) are used to find where the next
 * character begins.  This allows resyncing after an invalid sequence,
 * provided the start of the next character appears within the six
 * octets examined.
 */
char *krb5int_utf8_next(const char *p)
{
    const unsigned char *octets = (const unsigned char *) p;
    int step = 1;

    if (!KRB5_UTF8_ISASCII(octets)) {
        /* Skip continuation octets, looking no further than offset 5. */
        while (step < 6 && (octets[step] & 0xc0) == 0x80)
            step++;
    }

    return (char *) &p[step];
}
292*ba7b222eSGlenn Barry
/*
 * Return a pointer to the start of the UTF-8 character preceding p.
 *
 * The length of the multibyte character is ignored; the octets before p
 * are scanned backwards past continuation markers (octets of the form
 * 10xxxxxx) until an octet that is not a continuation marker is found.
 * At most five octets are examined; if all of them are continuation
 * markers, p - 6 is returned.
 *
 * The caller must ensure the octets examined before p are readable.
 */
char *krb5int_utf8_prev(const char *p)
{
    const unsigned char *octets = (const unsigned char *) p;
    int back = 1;

    while (back < 6 && (octets[-back] & 0xc0) == 0x80)
        back++;

    return (char *) (p - back);
}
315*ba7b222eSGlenn Barry
/*
 * Copy one UTF-8 character from src to dst and return the number of
 * octets copied (between one and six).  No NUL terminator is appended.
 *
 * The length implied by the first octet is ignored; copying continues
 * while the following octets are continuation markers (of the form
 * 10xxxxxx).  This allows resyncing after an invalid sequence, provided
 * the start of the next character appears within the six octets
 * examined.
 */
int krb5int_utf8_copy(char* dst, const char *src)
{
    const unsigned char *octets = (const unsigned char *) src;
    int copied = 1;

    /* The first octet is always copied. */
    dst[0] = src[0];

    if (KRB5_UTF8_ISASCII(octets))
        return 1;

    while (copied < 6 && (octets[copied] & 0xc0) == 0x80) {
        dst[copied] = src[copied];
        copied++;
    }

    return copied;
}
346*ba7b222eSGlenn Barry
347*ba7b222eSGlenn Barry #ifndef UTF8_ALPHA_CTYPE
348*ba7b222eSGlenn Barry /*
349*ba7b222eSGlenn Barry * UTF-8 ctype routines
350*ba7b222eSGlenn Barry * Only deals with characters < 0x80 (ie: US-ASCII)
351*ba7b222eSGlenn Barry */
352*ba7b222eSGlenn Barry
/* Return the result of KRB5_ASCII applied to the first octet at p. */
int krb5int_utf8_isascii(const char * p)
{
    const unsigned char *octets = (const unsigned char *) p;
    unsigned first = octets[0];

    return KRB5_ASCII(first);
}
359*ba7b222eSGlenn Barry
/*
 * Return the result of KRB5_DIGIT for the first octet at p, or 0 when
 * that octet does not satisfy KRB5_ASCII.
 */
int krb5int_utf8_isdigit(const char * p)
{
    unsigned first = * (const unsigned char *) p;

    if (KRB5_ASCII(first))
        return KRB5_DIGIT( first );

    return 0;
}
369*ba7b222eSGlenn Barry
/*
 * Return the result of KRB5_HEX for the first octet at p, or 0 when
 * that octet does not satisfy KRB5_ASCII.
 */
int krb5int_utf8_isxdigit(const char * p)
{
    unsigned first = * (const unsigned char *) p;

    if (KRB5_ASCII(first))
        return KRB5_HEX(first);

    return 0;
}
379*ba7b222eSGlenn Barry
/*
 * Return 1 if the first octet at p satisfies KRB5_ASCII and is one of
 * space, tab, newline, carriage return, vertical tab or form feed;
 * otherwise return 0.
 */
int krb5int_utf8_isspace(const char * p)
{
    unsigned first = * (const unsigned char *) p;

    if (!KRB5_ASCII(first))
        return 0;

    return first == ' ' || first == '\t' || first == '\n' ||
        first == '\r' || first == '\v' || first == '\f';
}
399*ba7b222eSGlenn Barry
/*
 * These are not needed by the C SDK and are
 * not "good enough" for general use.
 */

/*
 * Return the result of KRB5_ALPHA for the first octet at p, or 0 when
 * that octet does not satisfy KRB5_ASCII.
 */
int krb5int_utf8_isalpha(const char * p)
{
    unsigned first = * (const unsigned char *) p;

    if (KRB5_ASCII(first))
        return KRB5_ALPHA(first);

    return 0;
}
413*ba7b222eSGlenn Barry
/*
 * Return the result of KRB5_ALNUM for the first octet at p, or 0 when
 * that octet does not satisfy KRB5_ASCII.
 */
int krb5int_utf8_isalnum(const char * p)
{
    unsigned first = * (const unsigned char *) p;

    if (KRB5_ASCII(first))
        return KRB5_ALNUM(first);

    return 0;
}
423*ba7b222eSGlenn Barry
424*ba7b222eSGlenn Barry #if 0
/*
 * Return the result of KRB5_LOWER for the first octet at p, or 0 when
 * that octet does not satisfy KRB5_ASCII.
 */
int krb5int_utf8_islower(const char * p)
{
    unsigned first = * (const unsigned char *) p;

    if (KRB5_ASCII(first))
        return KRB5_LOWER(first);

    return 0;
}
434*ba7b222eSGlenn Barry
/*
 * Return the result of KRB5_UPPER for the first octet at p, or 0 when
 * that octet does not satisfy KRB5_ASCII.
 */
int krb5int_utf8_isupper(const char * p)
{
    unsigned first = * (const unsigned char *) p;

    if (KRB5_ASCII(first))
        return KRB5_UPPER(first);

    return 0;
}
444*ba7b222eSGlenn Barry #endif
445*ba7b222eSGlenn Barry #endif
446*ba7b222eSGlenn Barry
447*ba7b222eSGlenn Barry
448*ba7b222eSGlenn Barry /*
449*ba7b222eSGlenn Barry * UTF-8 string routines
450*ba7b222eSGlenn Barry */
451*ba7b222eSGlenn Barry
452*ba7b222eSGlenn Barry /* like strchr() */
krb5int_utf8_strchr(const char * str,const char * chr)453*ba7b222eSGlenn Barry char *krb5int_utf8_strchr(const char *str, const char *chr)
454*ba7b222eSGlenn Barry {
455*ba7b222eSGlenn Barry krb5_ucs4 chs, ch;
456*ba7b222eSGlenn Barry
457*ba7b222eSGlenn Barry if (krb5int_utf8_to_ucs4(chr, &ch) == -1)
458*ba7b222eSGlenn Barry return NULL;
459*ba7b222eSGlenn Barry for ( ; *str != '\0'; KRB5_UTF8_INCR(str)) {
460*ba7b222eSGlenn Barry if (krb5int_utf8_to_ucs4(str, &chs) == 0 && chs == ch)
461*ba7b222eSGlenn Barry return (char *)str;
462*ba7b222eSGlenn Barry }
463*ba7b222eSGlenn Barry
464*ba7b222eSGlenn Barry return NULL;
465*ba7b222eSGlenn Barry }
466*ba7b222eSGlenn Barry
467*ba7b222eSGlenn Barry /* like strcspn() but returns number of bytes, not characters */
krb5int_utf8_strcspn(const char * str,const char * set)468*ba7b222eSGlenn Barry size_t krb5int_utf8_strcspn(const char *str, const char *set)
469*ba7b222eSGlenn Barry {
470*ba7b222eSGlenn Barry const char *cstr, *cset;
471*ba7b222eSGlenn Barry krb5_ucs4 chstr, chset;
472*ba7b222eSGlenn Barry
473*ba7b222eSGlenn Barry for (cstr = str; *cstr != '\0'; KRB5_UTF8_INCR(cstr)) {
474*ba7b222eSGlenn Barry for (cset = set; *cset != '\0'; KRB5_UTF8_INCR(cset)) {
475*ba7b222eSGlenn Barry if (krb5int_utf8_to_ucs4(cstr, &chstr) == 0
476*ba7b222eSGlenn Barry && krb5int_utf8_to_ucs4(cset, &chset) == 0 && chstr == chset)
477*ba7b222eSGlenn Barry return cstr - str;
478*ba7b222eSGlenn Barry }
479*ba7b222eSGlenn Barry }
480*ba7b222eSGlenn Barry
481*ba7b222eSGlenn Barry return cstr - str;
482*ba7b222eSGlenn Barry }
483*ba7b222eSGlenn Barry
484*ba7b222eSGlenn Barry /* like strspn() but returns number of bytes, not characters */
krb5int_utf8_strspn(const char * str,const char * set)485*ba7b222eSGlenn Barry size_t krb5int_utf8_strspn(const char *str, const char *set)
486*ba7b222eSGlenn Barry {
487*ba7b222eSGlenn Barry const char *cstr, *cset;
488*ba7b222eSGlenn Barry krb5_ucs4 chstr, chset;
489*ba7b222eSGlenn Barry
490*ba7b222eSGlenn Barry for (cstr = str; *cstr != '\0'; KRB5_UTF8_INCR(cstr)) {
491*ba7b222eSGlenn Barry for (cset = set; ; KRB5_UTF8_INCR(cset)) {
492*ba7b222eSGlenn Barry if (*cset == '\0')
493*ba7b222eSGlenn Barry return cstr - str;
494*ba7b222eSGlenn Barry if (krb5int_utf8_to_ucs4(cstr, &chstr) == 0
495*ba7b222eSGlenn Barry && krb5int_utf8_to_ucs4(cset, &chset) == 0 && chstr == chset)
496*ba7b222eSGlenn Barry break;
497*ba7b222eSGlenn Barry }
498*ba7b222eSGlenn Barry }
499*ba7b222eSGlenn Barry
500*ba7b222eSGlenn Barry return cstr - str;
501*ba7b222eSGlenn Barry }
502*ba7b222eSGlenn Barry
503*ba7b222eSGlenn Barry /* like strpbrk(), replaces strchr() as well */
krb5int_utf8_strpbrk(const char * str,const char * set)504*ba7b222eSGlenn Barry char *krb5int_utf8_strpbrk(const char *str, const char *set)
505*ba7b222eSGlenn Barry {
506*ba7b222eSGlenn Barry const char *cset;
507*ba7b222eSGlenn Barry krb5_ucs4 chstr, chset;
508*ba7b222eSGlenn Barry
509*ba7b222eSGlenn Barry for ( ; *str != '\0'; KRB5_UTF8_INCR(str)) {
510*ba7b222eSGlenn Barry for (cset = set; *cset != '\0'; KRB5_UTF8_INCR(cset)) {
511*ba7b222eSGlenn Barry if (krb5int_utf8_to_ucs4(str, &chstr) == 0
512*ba7b222eSGlenn Barry && krb5int_utf8_to_ucs4(cset, &chset) == 0 && chstr == chset)
513*ba7b222eSGlenn Barry return (char *)str;
514*ba7b222eSGlenn Barry }
515*ba7b222eSGlenn Barry }
516*ba7b222eSGlenn Barry
517*ba7b222eSGlenn Barry return NULL;
518*ba7b222eSGlenn Barry }
519*ba7b222eSGlenn Barry
/*
 * Like strtok_r(), not strtok(): split a UTF-8 string into tokens
 * separated by any of the characters in sep.
 *
 * str is the string to tokenize on the first call and NULL on later
 * calls, in which case scanning resumes from *last.  The string is
 * modified in place: the first octet of the separator that ends a token
 * is overwritten with NUL.
 *
 * Returns a pointer to the next token, or NULL when last is NULL or no
 * tokens remain.  When no tokens remain *last is set to NULL, and
 * further calls with str == NULL keep returning NULL.
 */
char *krb5int_utf8_strtok(char *str, const char *sep, char **last)
{
    char *begin;
    char *end;

    if (last == NULL)
        return NULL;

    begin = str ? str : *last;

    /*
     * A previous call that ran out of tokens left *last == NULL; do not
     * hand that NULL to krb5int_utf8_strspn(), which would dereference it.
     */
    if (begin == NULL)
        return NULL;

    /* Skip any leading separators. */
    begin += krb5int_utf8_strspn(begin, sep);

    if (*begin == '\0') {
        *last = NULL;
        return NULL;
    }

    end = &begin[krb5int_utf8_strcspn(begin, sep)];

    if (*end != '\0') {
        /* Find where scanning resumes before the separator is clobbered. */
        char *next = KRB5_UTF8_NEXT(end);
        *end = '\0';
        end = next;
    }

    *last = end;

    return begin;
}
550