xref: /titanic_50/usr/src/lib/libldap5/sources/ldap/common/ldaputf8.c (revision 7c478bd95313f5f23a4c958a745db2134aa03244)
1*7c478bd9Sstevel@tonic-gate #pragma ident	"%Z%%M%	%I%	%E% SMI"
2*7c478bd9Sstevel@tonic-gate 
3*7c478bd9Sstevel@tonic-gate /*
4*7c478bd9Sstevel@tonic-gate  * The contents of this file are subject to the Netscape Public
5*7c478bd9Sstevel@tonic-gate  * License Version 1.1 (the "License"); you may not use this file
6*7c478bd9Sstevel@tonic-gate  * except in compliance with the License. You may obtain a copy of
7*7c478bd9Sstevel@tonic-gate  * the License at http://www.mozilla.org/NPL/
8*7c478bd9Sstevel@tonic-gate  *
9*7c478bd9Sstevel@tonic-gate  * Software distributed under the License is distributed on an "AS
10*7c478bd9Sstevel@tonic-gate  * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or
11*7c478bd9Sstevel@tonic-gate  * implied. See the License for the specific language governing
12*7c478bd9Sstevel@tonic-gate  * rights and limitations under the License.
13*7c478bd9Sstevel@tonic-gate  *
14*7c478bd9Sstevel@tonic-gate  * The Original Code is Mozilla Communicator client code, released
15*7c478bd9Sstevel@tonic-gate  * March 31, 1998.
16*7c478bd9Sstevel@tonic-gate  *
17*7c478bd9Sstevel@tonic-gate  * The Initial Developer of the Original Code is Netscape
18*7c478bd9Sstevel@tonic-gate  * Communications Corporation. Portions created by Netscape are
19*7c478bd9Sstevel@tonic-gate  * Copyright (C) 1998-1999 Netscape Communications Corporation. All
20*7c478bd9Sstevel@tonic-gate  * Rights Reserved.
21*7c478bd9Sstevel@tonic-gate  *
22*7c478bd9Sstevel@tonic-gate  * Contributor(s):
23*7c478bd9Sstevel@tonic-gate  */
24*7c478bd9Sstevel@tonic-gate 
/* utf8.c - misc. utf8 "string" functions. */
26*7c478bd9Sstevel@tonic-gate #include "ldap-int.h"
27*7c478bd9Sstevel@tonic-gate 
/*
 * Sequence-length lookup table, indexed by the top six bits of a byte
 * (i.e. (byte >> 2) & 0x3F).
 *
 *   index  0..31  (bytes 0x00-0x7F)  -> 1  single-byte (ASCII) character
 *   index 32..47  (bytes 0x80-0xBF)  -> 0  continuation byte, not a lead byte
 *   index 48..55  (bytes 0xC0-0xDF)  -> 2
 *   index 56..59  (bytes 0xE0-0xEF)  -> 3
 *   index 60..61  (bytes 0xF0-0xF7)  -> 4
 *   index 62      (bytes 0xF8-0xFB)  -> 5
 *   index 63      (bytes 0xFC-0xFF)  -> 6
 *
 * The table is only ever read, so it is const-qualified.
 */
static const char UTF8len[64] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 5, 6
};
33*7c478bd9Sstevel@tonic-gate 
34*7c478bd9Sstevel@tonic-gate int
35*7c478bd9Sstevel@tonic-gate LDAP_CALL
ldap_utf8len(const char * s)36*7c478bd9Sstevel@tonic-gate ldap_utf8len (const char* s)
37*7c478bd9Sstevel@tonic-gate      /* Return the number of char's in the character at *s. */
38*7c478bd9Sstevel@tonic-gate {
39*7c478bd9Sstevel@tonic-gate     return ldap_utf8next((char*)s) - s;
40*7c478bd9Sstevel@tonic-gate }
41*7c478bd9Sstevel@tonic-gate 
42*7c478bd9Sstevel@tonic-gate char*
43*7c478bd9Sstevel@tonic-gate LDAP_CALL
ldap_utf8next(char * s)44*7c478bd9Sstevel@tonic-gate ldap_utf8next (char* s)
45*7c478bd9Sstevel@tonic-gate      /* Return a pointer to the character immediately following *s.
46*7c478bd9Sstevel@tonic-gate 	Handle any valid UTF-8 character, including '\0' and ASCII.
47*7c478bd9Sstevel@tonic-gate 	Try to handle a misaligned pointer or a malformed character.
48*7c478bd9Sstevel@tonic-gate      */
49*7c478bd9Sstevel@tonic-gate {
50*7c478bd9Sstevel@tonic-gate     register unsigned char* next = (unsigned char*)s;
51*7c478bd9Sstevel@tonic-gate     switch (UTF8len [(*next >> 2) & 0x3F]) {
52*7c478bd9Sstevel@tonic-gate       case 0: /* erroneous: s points to the middle of a character. */
53*7c478bd9Sstevel@tonic-gate       case 6: if ((*++next & 0xC0) != 0x80) break;
54*7c478bd9Sstevel@tonic-gate       case 5: if ((*++next & 0xC0) != 0x80) break;
55*7c478bd9Sstevel@tonic-gate       case 4: if ((*++next & 0xC0) != 0x80) break;
56*7c478bd9Sstevel@tonic-gate       case 3: if ((*++next & 0xC0) != 0x80) break;
57*7c478bd9Sstevel@tonic-gate       case 2: if ((*++next & 0xC0) != 0x80) break;
58*7c478bd9Sstevel@tonic-gate       case 1: ++next;
59*7c478bd9Sstevel@tonic-gate     }
60*7c478bd9Sstevel@tonic-gate     return (char*) next;
61*7c478bd9Sstevel@tonic-gate }
62*7c478bd9Sstevel@tonic-gate 
63*7c478bd9Sstevel@tonic-gate char*
64*7c478bd9Sstevel@tonic-gate LDAP_CALL
ldap_utf8prev(char * s)65*7c478bd9Sstevel@tonic-gate ldap_utf8prev (char* s)
66*7c478bd9Sstevel@tonic-gate      /* Return a pointer to the character immediately preceding *s.
67*7c478bd9Sstevel@tonic-gate 	Handle any valid UTF-8 character, including '\0' and ASCII.
68*7c478bd9Sstevel@tonic-gate 	Try to handle a misaligned pointer or a malformed character.
69*7c478bd9Sstevel@tonic-gate      */
70*7c478bd9Sstevel@tonic-gate {
71*7c478bd9Sstevel@tonic-gate     register unsigned char* prev = (unsigned char*)s;
72*7c478bd9Sstevel@tonic-gate     unsigned char* limit = prev - 6;
73*7c478bd9Sstevel@tonic-gate     while (((*--prev & 0xC0) == 0x80) && (prev != limit)) {
74*7c478bd9Sstevel@tonic-gate     	;
75*7c478bd9Sstevel@tonic-gate     }
76*7c478bd9Sstevel@tonic-gate     return (char*) prev;
77*7c478bd9Sstevel@tonic-gate }
78*7c478bd9Sstevel@tonic-gate 
79*7c478bd9Sstevel@tonic-gate int
80*7c478bd9Sstevel@tonic-gate LDAP_CALL
ldap_utf8copy(char * dst,const char * src)81*7c478bd9Sstevel@tonic-gate ldap_utf8copy (char* dst, const char* src)
82*7c478bd9Sstevel@tonic-gate      /* Copy a character from src to dst; return the number of char's copied.
83*7c478bd9Sstevel@tonic-gate 	Handle any valid UTF-8 character, including '\0' and ASCII.
84*7c478bd9Sstevel@tonic-gate 	Try to handle a misaligned pointer or a malformed character.
85*7c478bd9Sstevel@tonic-gate      */
86*7c478bd9Sstevel@tonic-gate {
87*7c478bd9Sstevel@tonic-gate     register const unsigned char* s = (const unsigned char*)src;
88*7c478bd9Sstevel@tonic-gate     switch (UTF8len [(*s >> 2) & 0x3F]) {
89*7c478bd9Sstevel@tonic-gate       case 0: /* erroneous: s points to the middle of a character. */
90*7c478bd9Sstevel@tonic-gate       case 6: *dst++ = *s++; if ((*s & 0xC0) != 0x80) break;
91*7c478bd9Sstevel@tonic-gate       case 5: *dst++ = *s++; if ((*s & 0xC0) != 0x80) break;
92*7c478bd9Sstevel@tonic-gate       case 4: *dst++ = *s++; if ((*s & 0xC0) != 0x80) break;
93*7c478bd9Sstevel@tonic-gate       case 3: *dst++ = *s++; if ((*s & 0xC0) != 0x80) break;
94*7c478bd9Sstevel@tonic-gate       case 2: *dst++ = *s++; if ((*s & 0xC0) != 0x80) break;
95*7c478bd9Sstevel@tonic-gate       case 1: *dst   = *s++;
96*7c478bd9Sstevel@tonic-gate     }
97*7c478bd9Sstevel@tonic-gate     return s - (const unsigned char*)src;
98*7c478bd9Sstevel@tonic-gate }
99*7c478bd9Sstevel@tonic-gate 
100*7c478bd9Sstevel@tonic-gate size_t
101*7c478bd9Sstevel@tonic-gate LDAP_CALL
ldap_utf8characters(const char * src)102*7c478bd9Sstevel@tonic-gate ldap_utf8characters (const char* src)
103*7c478bd9Sstevel@tonic-gate      /* Return the number of UTF-8 characters in the 0-terminated array s. */
104*7c478bd9Sstevel@tonic-gate {
105*7c478bd9Sstevel@tonic-gate     register char* s = (char*)src;
106*7c478bd9Sstevel@tonic-gate     size_t n;
107*7c478bd9Sstevel@tonic-gate     for (n = 0; *s; LDAP_UTF8INC(s)) ++n;
108*7c478bd9Sstevel@tonic-gate     return n;
109*7c478bd9Sstevel@tonic-gate }
110*7c478bd9Sstevel@tonic-gate 
111*7c478bd9Sstevel@tonic-gate unsigned long LDAP_CALL
ldap_utf8getcc(const char ** src)112*7c478bd9Sstevel@tonic-gate ldap_utf8getcc( const char** src )
113*7c478bd9Sstevel@tonic-gate {
114*7c478bd9Sstevel@tonic-gate     register unsigned long c;
115*7c478bd9Sstevel@tonic-gate     register const unsigned char* s = (const unsigned char*)*src;
116*7c478bd9Sstevel@tonic-gate     switch (UTF8len [(*s >> 2) & 0x3F]) {
117*7c478bd9Sstevel@tonic-gate       case 0: /* erroneous: s points to the middle of a character. */
118*7c478bd9Sstevel@tonic-gate 	      c = (*s++) & 0x3F; goto more5;
119*7c478bd9Sstevel@tonic-gate       case 1: c = (*s++); break;
120*7c478bd9Sstevel@tonic-gate       case 2: c = (*s++) & 0x1F; goto more1;
121*7c478bd9Sstevel@tonic-gate       case 3: c = (*s++) & 0x0F; goto more2;
122*7c478bd9Sstevel@tonic-gate       case 4: c = (*s++) & 0x07; goto more3;
123*7c478bd9Sstevel@tonic-gate       case 5: c = (*s++) & 0x03; goto more4;
124*7c478bd9Sstevel@tonic-gate       case 6: c = (*s++) & 0x01; goto more5;
125*7c478bd9Sstevel@tonic-gate       more5: if ((*s & 0xC0) != 0x80) break; c = (c << 6) | ((*s++) & 0x3F);
126*7c478bd9Sstevel@tonic-gate       more4: if ((*s & 0xC0) != 0x80) break; c = (c << 6) | ((*s++) & 0x3F);
127*7c478bd9Sstevel@tonic-gate       more3: if ((*s & 0xC0) != 0x80) break; c = (c << 6) | ((*s++) & 0x3F);
128*7c478bd9Sstevel@tonic-gate       more2: if ((*s & 0xC0) != 0x80) break; c = (c << 6) | ((*s++) & 0x3F);
129*7c478bd9Sstevel@tonic-gate       more1: if ((*s & 0xC0) != 0x80) break; c = (c << 6) | ((*s++) & 0x3F);
130*7c478bd9Sstevel@tonic-gate 	break;
131*7c478bd9Sstevel@tonic-gate     }
132*7c478bd9Sstevel@tonic-gate     *src = (const char*)s;
133*7c478bd9Sstevel@tonic-gate     return c;
134*7c478bd9Sstevel@tonic-gate }
135*7c478bd9Sstevel@tonic-gate 
136*7c478bd9Sstevel@tonic-gate char*
137*7c478bd9Sstevel@tonic-gate LDAP_CALL
ldap_utf8strtok_r(char * sp,const char * brk,char ** next)138*7c478bd9Sstevel@tonic-gate ldap_utf8strtok_r( char* sp, const char* brk, char** next)
139*7c478bd9Sstevel@tonic-gate {
140*7c478bd9Sstevel@tonic-gate     const char *bp;
141*7c478bd9Sstevel@tonic-gate     unsigned long sc, bc;
142*7c478bd9Sstevel@tonic-gate     char *tok;
143*7c478bd9Sstevel@tonic-gate 
144*7c478bd9Sstevel@tonic-gate     if (sp == NULL && (sp = *next) == NULL)
145*7c478bd9Sstevel@tonic-gate       return NULL;
146*7c478bd9Sstevel@tonic-gate 
147*7c478bd9Sstevel@tonic-gate     /* Skip leading delimiters; roughly, sp += strspn(sp, brk) */
148*7c478bd9Sstevel@tonic-gate   cont:
149*7c478bd9Sstevel@tonic-gate     sc = LDAP_UTF8GETC(sp);
150*7c478bd9Sstevel@tonic-gate     for (bp = brk; (bc = LDAP_UTF8GETCC(bp)) != 0;) {
151*7c478bd9Sstevel@tonic-gate 	if (sc == bc)
152*7c478bd9Sstevel@tonic-gate 	  goto cont;
153*7c478bd9Sstevel@tonic-gate     }
154*7c478bd9Sstevel@tonic-gate 
155*7c478bd9Sstevel@tonic-gate     if (sc == 0) { /* no non-delimiter characters */
156*7c478bd9Sstevel@tonic-gate 	*next = NULL;
157*7c478bd9Sstevel@tonic-gate 	return NULL;
158*7c478bd9Sstevel@tonic-gate     }
159*7c478bd9Sstevel@tonic-gate     tok = LDAP_UTF8PREV(sp);
160*7c478bd9Sstevel@tonic-gate 
161*7c478bd9Sstevel@tonic-gate     /* Scan token; roughly, sp += strcspn(sp, brk)
162*7c478bd9Sstevel@tonic-gate      * Note that brk must be 0-terminated; we stop if we see that, too.
163*7c478bd9Sstevel@tonic-gate      */
164*7c478bd9Sstevel@tonic-gate     while (1) {
165*7c478bd9Sstevel@tonic-gate 	sc = LDAP_UTF8GETC(sp);
166*7c478bd9Sstevel@tonic-gate 	bp = brk;
167*7c478bd9Sstevel@tonic-gate 	do {
168*7c478bd9Sstevel@tonic-gate 	    if ((bc = LDAP_UTF8GETCC(bp)) == sc) {
169*7c478bd9Sstevel@tonic-gate 		if (sc == 0) {
170*7c478bd9Sstevel@tonic-gate 		    *next = NULL;
171*7c478bd9Sstevel@tonic-gate 		} else {
172*7c478bd9Sstevel@tonic-gate 		    *next = sp;
173*7c478bd9Sstevel@tonic-gate 		    *(LDAP_UTF8PREV(sp)) = 0;
174*7c478bd9Sstevel@tonic-gate 		}
175*7c478bd9Sstevel@tonic-gate 		return tok;
176*7c478bd9Sstevel@tonic-gate 	    }
177*7c478bd9Sstevel@tonic-gate 	} while (bc != 0);
178*7c478bd9Sstevel@tonic-gate     }
179*7c478bd9Sstevel@tonic-gate     /* NOTREACHED */
180*7c478bd9Sstevel@tonic-gate }
181*7c478bd9Sstevel@tonic-gate 
182*7c478bd9Sstevel@tonic-gate int
183*7c478bd9Sstevel@tonic-gate LDAP_CALL
ldap_utf8isalnum(char * s)184*7c478bd9Sstevel@tonic-gate ldap_utf8isalnum( char* s )
185*7c478bd9Sstevel@tonic-gate {
186*7c478bd9Sstevel@tonic-gate     register unsigned char c = *(unsigned char*)s;
187*7c478bd9Sstevel@tonic-gate     if (0x80 & c) return 0;
188*7c478bd9Sstevel@tonic-gate     if (c >= 'A' && c <= 'Z') return 1;
189*7c478bd9Sstevel@tonic-gate     if (c >= 'a' && c <= 'z') return 1;
190*7c478bd9Sstevel@tonic-gate     if (c >= '0' && c <= '9') return 1;
191*7c478bd9Sstevel@tonic-gate     return 0;
192*7c478bd9Sstevel@tonic-gate }
193*7c478bd9Sstevel@tonic-gate 
194*7c478bd9Sstevel@tonic-gate int
195*7c478bd9Sstevel@tonic-gate LDAP_CALL
ldap_utf8isalpha(char * s)196*7c478bd9Sstevel@tonic-gate ldap_utf8isalpha( char* s )
197*7c478bd9Sstevel@tonic-gate {
198*7c478bd9Sstevel@tonic-gate     register unsigned char c = *(unsigned char*)s;
199*7c478bd9Sstevel@tonic-gate     if (0x80 & c) return 0;
200*7c478bd9Sstevel@tonic-gate     if (c >= 'A' && c <= 'Z') return 1;
201*7c478bd9Sstevel@tonic-gate     if (c >= 'a' && c <= 'z') return 1;
202*7c478bd9Sstevel@tonic-gate     return 0;
203*7c478bd9Sstevel@tonic-gate }
204*7c478bd9Sstevel@tonic-gate 
205*7c478bd9Sstevel@tonic-gate int
206*7c478bd9Sstevel@tonic-gate LDAP_CALL
ldap_utf8isdigit(char * s)207*7c478bd9Sstevel@tonic-gate ldap_utf8isdigit( char* s )
208*7c478bd9Sstevel@tonic-gate {
209*7c478bd9Sstevel@tonic-gate     register unsigned char c = *(unsigned char*)s;
210*7c478bd9Sstevel@tonic-gate     if (0x80 & c) return 0;
211*7c478bd9Sstevel@tonic-gate     if (c >= '0' && c <= '9') return 1;
212*7c478bd9Sstevel@tonic-gate     return 0;
213*7c478bd9Sstevel@tonic-gate }
214*7c478bd9Sstevel@tonic-gate 
215*7c478bd9Sstevel@tonic-gate int
216*7c478bd9Sstevel@tonic-gate LDAP_CALL
ldap_utf8isxdigit(char * s)217*7c478bd9Sstevel@tonic-gate ldap_utf8isxdigit( char* s )
218*7c478bd9Sstevel@tonic-gate {
219*7c478bd9Sstevel@tonic-gate     register unsigned char c = *(unsigned char*)s;
220*7c478bd9Sstevel@tonic-gate     if (0x80 & c) return 0;
221*7c478bd9Sstevel@tonic-gate     if (c >= '0' && c <= '9') return 1;
222*7c478bd9Sstevel@tonic-gate     if (c >= 'A' && c <= 'F') return 1;
223*7c478bd9Sstevel@tonic-gate     if (c >= 'a' && c <= 'f') return 1;
224*7c478bd9Sstevel@tonic-gate     return 0;
225*7c478bd9Sstevel@tonic-gate }
226*7c478bd9Sstevel@tonic-gate 
227*7c478bd9Sstevel@tonic-gate int
228*7c478bd9Sstevel@tonic-gate LDAP_CALL
ldap_utf8isspace(char * s)229*7c478bd9Sstevel@tonic-gate ldap_utf8isspace( char* s )
230*7c478bd9Sstevel@tonic-gate {
231*7c478bd9Sstevel@tonic-gate     register unsigned char *c = (unsigned char*)s;
232*7c478bd9Sstevel@tonic-gate     int len = ldap_utf8len(s);
233*7c478bd9Sstevel@tonic-gate 
234*7c478bd9Sstevel@tonic-gate     if (len == 0) {
235*7c478bd9Sstevel@tonic-gate 	return 0;
236*7c478bd9Sstevel@tonic-gate     } else if (len == 1) {
237*7c478bd9Sstevel@tonic-gate 	switch (*c) {
238*7c478bd9Sstevel@tonic-gate 	    case 0x09:
239*7c478bd9Sstevel@tonic-gate 	    case 0x0A:
240*7c478bd9Sstevel@tonic-gate 	    case 0x0B:
241*7c478bd9Sstevel@tonic-gate 	    case 0x0C:
242*7c478bd9Sstevel@tonic-gate 	    case 0x0D:
243*7c478bd9Sstevel@tonic-gate 	    case 0x20:
244*7c478bd9Sstevel@tonic-gate 		return 1;
245*7c478bd9Sstevel@tonic-gate 	    default:
246*7c478bd9Sstevel@tonic-gate 		return 0;
247*7c478bd9Sstevel@tonic-gate 	}
248*7c478bd9Sstevel@tonic-gate     } else if (len == 2) {
249*7c478bd9Sstevel@tonic-gate 	if (*c == 0xc2) {
250*7c478bd9Sstevel@tonic-gate 		return *(c+1) == 0x80;
251*7c478bd9Sstevel@tonic-gate 	}
252*7c478bd9Sstevel@tonic-gate     } else if (len == 3) {
253*7c478bd9Sstevel@tonic-gate 	if (*c == 0xE2) {
254*7c478bd9Sstevel@tonic-gate 	    c++;
255*7c478bd9Sstevel@tonic-gate 	    if (*c == 0x80) {
256*7c478bd9Sstevel@tonic-gate 		c++;
257*7c478bd9Sstevel@tonic-gate 		return (*c>=0x80 && *c<=0x8a);
258*7c478bd9Sstevel@tonic-gate 	    }
259*7c478bd9Sstevel@tonic-gate 	} else if (*c == 0xE3) {
260*7c478bd9Sstevel@tonic-gate 	    return (*(c+1)==0x80) && (*(c+2)==0x80);
261*7c478bd9Sstevel@tonic-gate 	} else if (*c==0xEF) {
262*7c478bd9Sstevel@tonic-gate 	    return (*(c+1)==0xBB) && (*(c+2)==0xBF);
263*7c478bd9Sstevel@tonic-gate 	}
264*7c478bd9Sstevel@tonic-gate 	return 0;
265*7c478bd9Sstevel@tonic-gate     }
266*7c478bd9Sstevel@tonic-gate 
267*7c478bd9Sstevel@tonic-gate     /* should never reach here */
268*7c478bd9Sstevel@tonic-gate     return 0;
269*7c478bd9Sstevel@tonic-gate }
270