1*7c478bd9Sstevel@tonic-gate #pragma ident "%Z%%M% %I% %E% SMI"
2*7c478bd9Sstevel@tonic-gate
3*7c478bd9Sstevel@tonic-gate /*
4*7c478bd9Sstevel@tonic-gate * The contents of this file are subject to the Netscape Public
5*7c478bd9Sstevel@tonic-gate * License Version 1.1 (the "License"); you may not use this file
6*7c478bd9Sstevel@tonic-gate * except in compliance with the License. You may obtain a copy of
7*7c478bd9Sstevel@tonic-gate * the License at http://www.mozilla.org/NPL/
8*7c478bd9Sstevel@tonic-gate *
9*7c478bd9Sstevel@tonic-gate * Software distributed under the License is distributed on an "AS
10*7c478bd9Sstevel@tonic-gate * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or
11*7c478bd9Sstevel@tonic-gate * implied. See the License for the specific language governing
12*7c478bd9Sstevel@tonic-gate * rights and limitations under the License.
13*7c478bd9Sstevel@tonic-gate *
14*7c478bd9Sstevel@tonic-gate * The Original Code is Mozilla Communicator client code, released
15*7c478bd9Sstevel@tonic-gate * March 31, 1998.
16*7c478bd9Sstevel@tonic-gate *
17*7c478bd9Sstevel@tonic-gate * The Initial Developer of the Original Code is Netscape
18*7c478bd9Sstevel@tonic-gate * Communications Corporation. Portions created by Netscape are
19*7c478bd9Sstevel@tonic-gate * Copyright (C) 1998-1999 Netscape Communications Corporation. All
20*7c478bd9Sstevel@tonic-gate * Rights Reserved.
21*7c478bd9Sstevel@tonic-gate *
22*7c478bd9Sstevel@tonic-gate * Contributor(s):
23*7c478bd9Sstevel@tonic-gate */
24*7c478bd9Sstevel@tonic-gate
/* utf8.c - misc. utf8 "string" functions. */
26*7c478bd9Sstevel@tonic-gate #include "ldap-int.h"
27*7c478bd9Sstevel@tonic-gate
/*
 * Sequence-length lookup, indexed by the top six bits of a byte
 * (i.e. UTF8len[(b >> 2) & 0x3F]):
 *
 *   0x00-0x7F  -> 1   single-byte (ASCII) character
 *   0x80-0xBF  -> 0   continuation byte; cannot start a character
 *   0xC0-0xDF  -> 2
 *   0xE0-0xEF  -> 3
 *   0xF0-0xF7  -> 4
 *   0xF8-0xFB  -> 5   (long form from the original UTF-8 definition)
 *   0xFC-0xFF  -> 6   (long form from the original UTF-8 definition)
 *
 * The table is read-only, so it is declared const.
 */
static const char UTF8len[64]
= {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 5, 6};
33*7c478bd9Sstevel@tonic-gate
34*7c478bd9Sstevel@tonic-gate int
35*7c478bd9Sstevel@tonic-gate LDAP_CALL
ldap_utf8len(const char * s)36*7c478bd9Sstevel@tonic-gate ldap_utf8len (const char* s)
37*7c478bd9Sstevel@tonic-gate /* Return the number of char's in the character at *s. */
38*7c478bd9Sstevel@tonic-gate {
39*7c478bd9Sstevel@tonic-gate return ldap_utf8next((char*)s) - s;
40*7c478bd9Sstevel@tonic-gate }
41*7c478bd9Sstevel@tonic-gate
42*7c478bd9Sstevel@tonic-gate char*
43*7c478bd9Sstevel@tonic-gate LDAP_CALL
ldap_utf8next(char * s)44*7c478bd9Sstevel@tonic-gate ldap_utf8next (char* s)
45*7c478bd9Sstevel@tonic-gate /* Return a pointer to the character immediately following *s.
46*7c478bd9Sstevel@tonic-gate Handle any valid UTF-8 character, including '\0' and ASCII.
47*7c478bd9Sstevel@tonic-gate Try to handle a misaligned pointer or a malformed character.
48*7c478bd9Sstevel@tonic-gate */
49*7c478bd9Sstevel@tonic-gate {
50*7c478bd9Sstevel@tonic-gate register unsigned char* next = (unsigned char*)s;
51*7c478bd9Sstevel@tonic-gate switch (UTF8len [(*next >> 2) & 0x3F]) {
52*7c478bd9Sstevel@tonic-gate case 0: /* erroneous: s points to the middle of a character. */
53*7c478bd9Sstevel@tonic-gate case 6: if ((*++next & 0xC0) != 0x80) break;
54*7c478bd9Sstevel@tonic-gate case 5: if ((*++next & 0xC0) != 0x80) break;
55*7c478bd9Sstevel@tonic-gate case 4: if ((*++next & 0xC0) != 0x80) break;
56*7c478bd9Sstevel@tonic-gate case 3: if ((*++next & 0xC0) != 0x80) break;
57*7c478bd9Sstevel@tonic-gate case 2: if ((*++next & 0xC0) != 0x80) break;
58*7c478bd9Sstevel@tonic-gate case 1: ++next;
59*7c478bd9Sstevel@tonic-gate }
60*7c478bd9Sstevel@tonic-gate return (char*) next;
61*7c478bd9Sstevel@tonic-gate }
62*7c478bd9Sstevel@tonic-gate
/*
 * Return a pointer to the character immediately preceding *s.
 * Handles any valid UTF-8 character, including '\0' and ASCII, and makes
 * a best effort on a misaligned pointer or a malformed character: it
 * steps back over continuation bytes (10xxxxxx) but never more than six
 * bytes in total.
 *
 * The caller must guarantee that at least one byte precedes s; the
 * function always steps back before it looks at anything.
 *
 * The step budget is kept as a counter rather than as a "limit" pointer
 * (s - 6): forming a pointer before the start of the buffer is undefined
 * behaviour even if it is never dereferenced, and the counter also avoids
 * reading the sixth byte back, whose value cannot change the result.
 */
char *
LDAP_CALL
ldap_utf8prev(char *s)
{
    unsigned char *p = (unsigned char *)s;
    int steps_left = 6;     /* longest sequence recognized: 6 bytes */

    do {
        --p;
        --steps_left;
    } while (steps_left > 0 && (*p & 0xC0) == 0x80);

    return (char *)p;
}
78*7c478bd9Sstevel@tonic-gate
79*7c478bd9Sstevel@tonic-gate int
80*7c478bd9Sstevel@tonic-gate LDAP_CALL
ldap_utf8copy(char * dst,const char * src)81*7c478bd9Sstevel@tonic-gate ldap_utf8copy (char* dst, const char* src)
82*7c478bd9Sstevel@tonic-gate /* Copy a character from src to dst; return the number of char's copied.
83*7c478bd9Sstevel@tonic-gate Handle any valid UTF-8 character, including '\0' and ASCII.
84*7c478bd9Sstevel@tonic-gate Try to handle a misaligned pointer or a malformed character.
85*7c478bd9Sstevel@tonic-gate */
86*7c478bd9Sstevel@tonic-gate {
87*7c478bd9Sstevel@tonic-gate register const unsigned char* s = (const unsigned char*)src;
88*7c478bd9Sstevel@tonic-gate switch (UTF8len [(*s >> 2) & 0x3F]) {
89*7c478bd9Sstevel@tonic-gate case 0: /* erroneous: s points to the middle of a character. */
90*7c478bd9Sstevel@tonic-gate case 6: *dst++ = *s++; if ((*s & 0xC0) != 0x80) break;
91*7c478bd9Sstevel@tonic-gate case 5: *dst++ = *s++; if ((*s & 0xC0) != 0x80) break;
92*7c478bd9Sstevel@tonic-gate case 4: *dst++ = *s++; if ((*s & 0xC0) != 0x80) break;
93*7c478bd9Sstevel@tonic-gate case 3: *dst++ = *s++; if ((*s & 0xC0) != 0x80) break;
94*7c478bd9Sstevel@tonic-gate case 2: *dst++ = *s++; if ((*s & 0xC0) != 0x80) break;
95*7c478bd9Sstevel@tonic-gate case 1: *dst = *s++;
96*7c478bd9Sstevel@tonic-gate }
97*7c478bd9Sstevel@tonic-gate return s - (const unsigned char*)src;
98*7c478bd9Sstevel@tonic-gate }
99*7c478bd9Sstevel@tonic-gate
100*7c478bd9Sstevel@tonic-gate size_t
101*7c478bd9Sstevel@tonic-gate LDAP_CALL
ldap_utf8characters(const char * src)102*7c478bd9Sstevel@tonic-gate ldap_utf8characters (const char* src)
103*7c478bd9Sstevel@tonic-gate /* Return the number of UTF-8 characters in the 0-terminated array s. */
104*7c478bd9Sstevel@tonic-gate {
105*7c478bd9Sstevel@tonic-gate register char* s = (char*)src;
106*7c478bd9Sstevel@tonic-gate size_t n;
107*7c478bd9Sstevel@tonic-gate for (n = 0; *s; LDAP_UTF8INC(s)) ++n;
108*7c478bd9Sstevel@tonic-gate return n;
109*7c478bd9Sstevel@tonic-gate }
110*7c478bd9Sstevel@tonic-gate
111*7c478bd9Sstevel@tonic-gate unsigned long LDAP_CALL
ldap_utf8getcc(const char ** src)112*7c478bd9Sstevel@tonic-gate ldap_utf8getcc( const char** src )
113*7c478bd9Sstevel@tonic-gate {
114*7c478bd9Sstevel@tonic-gate register unsigned long c;
115*7c478bd9Sstevel@tonic-gate register const unsigned char* s = (const unsigned char*)*src;
116*7c478bd9Sstevel@tonic-gate switch (UTF8len [(*s >> 2) & 0x3F]) {
117*7c478bd9Sstevel@tonic-gate case 0: /* erroneous: s points to the middle of a character. */
118*7c478bd9Sstevel@tonic-gate c = (*s++) & 0x3F; goto more5;
119*7c478bd9Sstevel@tonic-gate case 1: c = (*s++); break;
120*7c478bd9Sstevel@tonic-gate case 2: c = (*s++) & 0x1F; goto more1;
121*7c478bd9Sstevel@tonic-gate case 3: c = (*s++) & 0x0F; goto more2;
122*7c478bd9Sstevel@tonic-gate case 4: c = (*s++) & 0x07; goto more3;
123*7c478bd9Sstevel@tonic-gate case 5: c = (*s++) & 0x03; goto more4;
124*7c478bd9Sstevel@tonic-gate case 6: c = (*s++) & 0x01; goto more5;
125*7c478bd9Sstevel@tonic-gate more5: if ((*s & 0xC0) != 0x80) break; c = (c << 6) | ((*s++) & 0x3F);
126*7c478bd9Sstevel@tonic-gate more4: if ((*s & 0xC0) != 0x80) break; c = (c << 6) | ((*s++) & 0x3F);
127*7c478bd9Sstevel@tonic-gate more3: if ((*s & 0xC0) != 0x80) break; c = (c << 6) | ((*s++) & 0x3F);
128*7c478bd9Sstevel@tonic-gate more2: if ((*s & 0xC0) != 0x80) break; c = (c << 6) | ((*s++) & 0x3F);
129*7c478bd9Sstevel@tonic-gate more1: if ((*s & 0xC0) != 0x80) break; c = (c << 6) | ((*s++) & 0x3F);
130*7c478bd9Sstevel@tonic-gate break;
131*7c478bd9Sstevel@tonic-gate }
132*7c478bd9Sstevel@tonic-gate *src = (const char*)s;
133*7c478bd9Sstevel@tonic-gate return c;
134*7c478bd9Sstevel@tonic-gate }
135*7c478bd9Sstevel@tonic-gate
136*7c478bd9Sstevel@tonic-gate char*
137*7c478bd9Sstevel@tonic-gate LDAP_CALL
ldap_utf8strtok_r(char * sp,const char * brk,char ** next)138*7c478bd9Sstevel@tonic-gate ldap_utf8strtok_r( char* sp, const char* brk, char** next)
139*7c478bd9Sstevel@tonic-gate {
140*7c478bd9Sstevel@tonic-gate const char *bp;
141*7c478bd9Sstevel@tonic-gate unsigned long sc, bc;
142*7c478bd9Sstevel@tonic-gate char *tok;
143*7c478bd9Sstevel@tonic-gate
144*7c478bd9Sstevel@tonic-gate if (sp == NULL && (sp = *next) == NULL)
145*7c478bd9Sstevel@tonic-gate return NULL;
146*7c478bd9Sstevel@tonic-gate
147*7c478bd9Sstevel@tonic-gate /* Skip leading delimiters; roughly, sp += strspn(sp, brk) */
148*7c478bd9Sstevel@tonic-gate cont:
149*7c478bd9Sstevel@tonic-gate sc = LDAP_UTF8GETC(sp);
150*7c478bd9Sstevel@tonic-gate for (bp = brk; (bc = LDAP_UTF8GETCC(bp)) != 0;) {
151*7c478bd9Sstevel@tonic-gate if (sc == bc)
152*7c478bd9Sstevel@tonic-gate goto cont;
153*7c478bd9Sstevel@tonic-gate }
154*7c478bd9Sstevel@tonic-gate
155*7c478bd9Sstevel@tonic-gate if (sc == 0) { /* no non-delimiter characters */
156*7c478bd9Sstevel@tonic-gate *next = NULL;
157*7c478bd9Sstevel@tonic-gate return NULL;
158*7c478bd9Sstevel@tonic-gate }
159*7c478bd9Sstevel@tonic-gate tok = LDAP_UTF8PREV(sp);
160*7c478bd9Sstevel@tonic-gate
161*7c478bd9Sstevel@tonic-gate /* Scan token; roughly, sp += strcspn(sp, brk)
162*7c478bd9Sstevel@tonic-gate * Note that brk must be 0-terminated; we stop if we see that, too.
163*7c478bd9Sstevel@tonic-gate */
164*7c478bd9Sstevel@tonic-gate while (1) {
165*7c478bd9Sstevel@tonic-gate sc = LDAP_UTF8GETC(sp);
166*7c478bd9Sstevel@tonic-gate bp = brk;
167*7c478bd9Sstevel@tonic-gate do {
168*7c478bd9Sstevel@tonic-gate if ((bc = LDAP_UTF8GETCC(bp)) == sc) {
169*7c478bd9Sstevel@tonic-gate if (sc == 0) {
170*7c478bd9Sstevel@tonic-gate *next = NULL;
171*7c478bd9Sstevel@tonic-gate } else {
172*7c478bd9Sstevel@tonic-gate *next = sp;
173*7c478bd9Sstevel@tonic-gate *(LDAP_UTF8PREV(sp)) = 0;
174*7c478bd9Sstevel@tonic-gate }
175*7c478bd9Sstevel@tonic-gate return tok;
176*7c478bd9Sstevel@tonic-gate }
177*7c478bd9Sstevel@tonic-gate } while (bc != 0);
178*7c478bd9Sstevel@tonic-gate }
179*7c478bd9Sstevel@tonic-gate /* NOTREACHED */
180*7c478bd9Sstevel@tonic-gate }
181*7c478bd9Sstevel@tonic-gate
/*
 * Return 1 if the character at *s is an ASCII letter or decimal digit,
 * 0 otherwise.  Only the first byte is examined; any byte with the high
 * bit set (i.e. part of a multi-byte character) is rejected.
 */
int
LDAP_CALL
ldap_utf8isalnum(char *s)
{
    const unsigned char ch = *(unsigned char *)s;

    if (ch & 0x80) {
        return 0;
    }
    return (ch >= 'A' && ch <= 'Z')
        || (ch >= 'a' && ch <= 'z')
        || (ch >= '0' && ch <= '9');
}
193*7c478bd9Sstevel@tonic-gate
/*
 * Return 1 if the character at *s is an ASCII letter (A-Z or a-z),
 * 0 otherwise.  Only the first byte is examined; any byte with the high
 * bit set is rejected.
 */
int
LDAP_CALL
ldap_utf8isalpha(char *s)
{
    const unsigned char ch = *(unsigned char *)s;

    if (ch & 0x80) {
        return 0;
    }
    return (ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z');
}
204*7c478bd9Sstevel@tonic-gate
/*
 * Return 1 if the character at *s is an ASCII decimal digit (0-9),
 * 0 otherwise.  Only the first byte is examined; any byte with the high
 * bit set is rejected.
 */
int
LDAP_CALL
ldap_utf8isdigit(char *s)
{
    const unsigned char ch = *(unsigned char *)s;

    return !(ch & 0x80) && ch >= '0' && ch <= '9';
}
214*7c478bd9Sstevel@tonic-gate
/*
 * Return 1 if the character at *s is an ASCII hexadecimal digit
 * (0-9, A-F or a-f), 0 otherwise.  Only the first byte is examined; any
 * byte with the high bit set is rejected.
 */
int
LDAP_CALL
ldap_utf8isxdigit(char *s)
{
    const unsigned char ch = *(unsigned char *)s;

    if (ch & 0x80) {
        return 0;
    }
    return (ch >= '0' && ch <= '9')
        || (ch >= 'A' && ch <= 'F')
        || (ch >= 'a' && ch <= 'f');
}
226*7c478bd9Sstevel@tonic-gate
227*7c478bd9Sstevel@tonic-gate int
228*7c478bd9Sstevel@tonic-gate LDAP_CALL
ldap_utf8isspace(char * s)229*7c478bd9Sstevel@tonic-gate ldap_utf8isspace( char* s )
230*7c478bd9Sstevel@tonic-gate {
231*7c478bd9Sstevel@tonic-gate register unsigned char *c = (unsigned char*)s;
232*7c478bd9Sstevel@tonic-gate int len = ldap_utf8len(s);
233*7c478bd9Sstevel@tonic-gate
234*7c478bd9Sstevel@tonic-gate if (len == 0) {
235*7c478bd9Sstevel@tonic-gate return 0;
236*7c478bd9Sstevel@tonic-gate } else if (len == 1) {
237*7c478bd9Sstevel@tonic-gate switch (*c) {
238*7c478bd9Sstevel@tonic-gate case 0x09:
239*7c478bd9Sstevel@tonic-gate case 0x0A:
240*7c478bd9Sstevel@tonic-gate case 0x0B:
241*7c478bd9Sstevel@tonic-gate case 0x0C:
242*7c478bd9Sstevel@tonic-gate case 0x0D:
243*7c478bd9Sstevel@tonic-gate case 0x20:
244*7c478bd9Sstevel@tonic-gate return 1;
245*7c478bd9Sstevel@tonic-gate default:
246*7c478bd9Sstevel@tonic-gate return 0;
247*7c478bd9Sstevel@tonic-gate }
248*7c478bd9Sstevel@tonic-gate } else if (len == 2) {
249*7c478bd9Sstevel@tonic-gate if (*c == 0xc2) {
250*7c478bd9Sstevel@tonic-gate return *(c+1) == 0x80;
251*7c478bd9Sstevel@tonic-gate }
252*7c478bd9Sstevel@tonic-gate } else if (len == 3) {
253*7c478bd9Sstevel@tonic-gate if (*c == 0xE2) {
254*7c478bd9Sstevel@tonic-gate c++;
255*7c478bd9Sstevel@tonic-gate if (*c == 0x80) {
256*7c478bd9Sstevel@tonic-gate c++;
257*7c478bd9Sstevel@tonic-gate return (*c>=0x80 && *c<=0x8a);
258*7c478bd9Sstevel@tonic-gate }
259*7c478bd9Sstevel@tonic-gate } else if (*c == 0xE3) {
260*7c478bd9Sstevel@tonic-gate return (*(c+1)==0x80) && (*(c+2)==0x80);
261*7c478bd9Sstevel@tonic-gate } else if (*c==0xEF) {
262*7c478bd9Sstevel@tonic-gate return (*(c+1)==0xBB) && (*(c+2)==0xBF);
263*7c478bd9Sstevel@tonic-gate }
264*7c478bd9Sstevel@tonic-gate return 0;
265*7c478bd9Sstevel@tonic-gate }
266*7c478bd9Sstevel@tonic-gate
267*7c478bd9Sstevel@tonic-gate /* should never reach here */
268*7c478bd9Sstevel@tonic-gate return 0;
269*7c478bd9Sstevel@tonic-gate }
270