xref: /illumos-gate/usr/src/lib/libldap5/sources/ldap/common/ldaputf8.c (revision d7fdecd2374114124f192b3bfc84d2d294bb45ab)
1 /*
2  * The contents of this file are subject to the Netscape Public
3  * License Version 1.1 (the "License"); you may not use this file
4  * except in compliance with the License. You may obtain a copy of
5  * the License at http://www.mozilla.org/NPL/
6  *
7  * Software distributed under the License is distributed on an "AS
8  * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or
9  * implied. See the License for the specific language governing
10  * rights and limitations under the License.
11  *
12  * The Original Code is Mozilla Communicator client code, released
13  * March 31, 1998.
14  *
15  * The Initial Developer of the Original Code is Netscape
16  * Communications Corporation. Portions created by Netscape are
17  * Copyright (C) 1998-1999 Netscape Communications Corporation. All
18  * Rights Reserved.
19  *
20  * Contributor(s):
21  */
22 
/* utf8.c - misc. utf8 "string" functions. */
24 #include "ldap-int.h"
25 
/*
 * Sequence-length class of a UTF-8 byte, indexed by the byte's top six
 * bits, i.e. (byte >> 2) & 0x3F:
 *   indices  0..31 (bytes 0x00-0x7F): 1  - single-byte (ASCII) character
 *   indices 32..47 (bytes 0x80-0xBF): 0  - continuation byte, not a lead
 *   indices 48..55 (bytes 0xC0-0xDF): 2
 *   indices 56..59 (bytes 0xE0-0xEF): 3
 *   indices 60..61 (bytes 0xF0-0xF7): 4
 *   index   62     (bytes 0xF8-0xFB): 5
 *   index   63     (bytes 0xFC-0xFF): 6
 * The table is only ever read, so it is declared const.
 */
static const char UTF8len[64]
= {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 5, 6};
31 
32 int
33 LDAP_CALL
ldap_utf8len(const char * s)34 ldap_utf8len (const char* s)
35      /* Return the number of char's in the character at *s. */
36 {
37     return ldap_utf8next((char*)s) - s;
38 }
39 
40 char*
41 LDAP_CALL
ldap_utf8next(char * s)42 ldap_utf8next (char* s)
43      /* Return a pointer to the character immediately following *s.
44 	Handle any valid UTF-8 character, including '\0' and ASCII.
45 	Try to handle a misaligned pointer or a malformed character.
46      */
47 {
48     register unsigned char* next = (unsigned char*)s;
49     switch (UTF8len [(*next >> 2) & 0x3F]) {
50       case 0: /* erroneous: s points to the middle of a character. */
51       case 6: if ((*++next & 0xC0) != 0x80) break;
52 	/* FALLTHROUGH */
53       case 5: if ((*++next & 0xC0) != 0x80) break;
54 	/* FALLTHROUGH */
55       case 4: if ((*++next & 0xC0) != 0x80) break;
56 	/* FALLTHROUGH */
57       case 3: if ((*++next & 0xC0) != 0x80) break;
58 	/* FALLTHROUGH */
59       case 2: if ((*++next & 0xC0) != 0x80) break;
60 	/* FALLTHROUGH */
61       case 1: ++next;
62     }
63     return (char*) next;
64 }
65 
66 char*
67 LDAP_CALL
ldap_utf8prev(char * s)68 ldap_utf8prev (char* s)
69      /* Return a pointer to the character immediately preceding *s.
70 	Handle any valid UTF-8 character, including '\0' and ASCII.
71 	Try to handle a misaligned pointer or a malformed character.
72      */
73 {
74     register unsigned char* prev = (unsigned char*)s;
75     unsigned char* limit = prev - 6;
76     while (((*--prev & 0xC0) == 0x80) && (prev != limit)) {
77     	;
78     }
79     return (char*) prev;
80 }
81 
82 int
83 LDAP_CALL
ldap_utf8copy(char * dst,const char * src)84 ldap_utf8copy (char* dst, const char* src)
85      /* Copy a character from src to dst; return the number of char's copied.
86 	Handle any valid UTF-8 character, including '\0' and ASCII.
87 	Try to handle a misaligned pointer or a malformed character.
88      */
89 {
90     register const unsigned char* s = (const unsigned char*)src;
91     switch (UTF8len [(*s >> 2) & 0x3F]) {
92       case 0: /* erroneous: s points to the middle of a character. */
93       case 6: *dst++ = *s++; if ((*s & 0xC0) != 0x80) break;
94 	/* FALLTHROUGH */
95       case 5: *dst++ = *s++; if ((*s & 0xC0) != 0x80) break;
96 	/* FALLTHROUGH */
97       case 4: *dst++ = *s++; if ((*s & 0xC0) != 0x80) break;
98 	/* FALLTHROUGH */
99       case 3: *dst++ = *s++; if ((*s & 0xC0) != 0x80) break;
100 	/* FALLTHROUGH */
101       case 2: *dst++ = *s++; if ((*s & 0xC0) != 0x80) break;
102 	/* FALLTHROUGH */
103       case 1: *dst   = *s++;
104     }
105     return s - (const unsigned char*)src;
106 }
107 
108 size_t
109 LDAP_CALL
ldap_utf8characters(const char * src)110 ldap_utf8characters (const char* src)
111      /* Return the number of UTF-8 characters in the 0-terminated array s. */
112 {
113     register char* s = (char*)src;
114     size_t n;
115     for (n = 0; *s; LDAP_UTF8INC(s)) ++n;
116     return n;
117 }
118 
119 unsigned long LDAP_CALL
ldap_utf8getcc(const char ** src)120 ldap_utf8getcc( const char** src )
121 {
122     register unsigned long c;
123     register const unsigned char* s = (const unsigned char*)*src;
124     switch (UTF8len [(*s >> 2) & 0x3F]) {
125       case 0: /* erroneous: s points to the middle of a character. */
126 	      c = (*s++) & 0x3F; goto more5;
127       case 1: c = (*s++); break;
128       case 2: c = (*s++) & 0x1F; goto more1;
129       case 3: c = (*s++) & 0x0F; goto more2;
130       case 4: c = (*s++) & 0x07; goto more3;
131       case 5: c = (*s++) & 0x03; goto more4;
132       case 6: c = (*s++) & 0x01; goto more5;
133       more5: if ((*s & 0xC0) != 0x80) break; c = (c << 6) | ((*s++) & 0x3F);
134       more4: if ((*s & 0xC0) != 0x80) break; c = (c << 6) | ((*s++) & 0x3F);
135       more3: if ((*s & 0xC0) != 0x80) break; c = (c << 6) | ((*s++) & 0x3F);
136       more2: if ((*s & 0xC0) != 0x80) break; c = (c << 6) | ((*s++) & 0x3F);
137       more1: if ((*s & 0xC0) != 0x80) break; c = (c << 6) | ((*s++) & 0x3F);
138 	break;
139     }
140     *src = (const char*)s;
141     return c;
142 }
143 
144 char*
145 LDAP_CALL
ldap_utf8strtok_r(char * sp,const char * brk,char ** next)146 ldap_utf8strtok_r( char* sp, const char* brk, char** next)
147 {
148     const char *bp;
149     unsigned long sc, bc;
150     char *tok;
151 
152     if (sp == NULL && (sp = *next) == NULL)
153       return NULL;
154 
155     /* Skip leading delimiters; roughly, sp += strspn(sp, brk) */
156   cont:
157     sc = LDAP_UTF8GETC(sp);
158     for (bp = brk; (bc = LDAP_UTF8GETCC(bp)) != 0;) {
159 	if (sc == bc)
160 	  goto cont;
161     }
162 
163     if (sc == 0) { /* no non-delimiter characters */
164 	*next = NULL;
165 	return NULL;
166     }
167     tok = LDAP_UTF8PREV(sp);
168 
169     /* Scan token; roughly, sp += strcspn(sp, brk)
170      * Note that brk must be 0-terminated; we stop if we see that, too.
171      */
172     while (1) {
173 	sc = LDAP_UTF8GETC(sp);
174 	bp = brk;
175 	do {
176 	    if ((bc = LDAP_UTF8GETCC(bp)) == sc) {
177 		if (sc == 0) {
178 		    *next = NULL;
179 		} else {
180 		    *next = sp;
181 		    *(LDAP_UTF8PREV(sp)) = 0;
182 		}
183 		return tok;
184 	    }
185 	} while (bc != 0);
186     }
187     /* NOTREACHED */
188 }
189 
190 int
191 LDAP_CALL
ldap_utf8isalnum(char * s)192 ldap_utf8isalnum( char* s )
193 {
194     register unsigned char c = *(unsigned char*)s;
195     if (0x80 & c) return 0;
196     if (c >= 'A' && c <= 'Z') return 1;
197     if (c >= 'a' && c <= 'z') return 1;
198     if (c >= '0' && c <= '9') return 1;
199     return 0;
200 }
201 
202 int
203 LDAP_CALL
ldap_utf8isalpha(char * s)204 ldap_utf8isalpha( char* s )
205 {
206     register unsigned char c = *(unsigned char*)s;
207     if (0x80 & c) return 0;
208     if (c >= 'A' && c <= 'Z') return 1;
209     if (c >= 'a' && c <= 'z') return 1;
210     return 0;
211 }
212 
213 int
214 LDAP_CALL
ldap_utf8isdigit(char * s)215 ldap_utf8isdigit( char* s )
216 {
217     register unsigned char c = *(unsigned char*)s;
218     if (0x80 & c) return 0;
219     if (c >= '0' && c <= '9') return 1;
220     return 0;
221 }
222 
223 int
224 LDAP_CALL
ldap_utf8isxdigit(char * s)225 ldap_utf8isxdigit( char* s )
226 {
227     register unsigned char c = *(unsigned char*)s;
228     if (0x80 & c) return 0;
229     if (c >= '0' && c <= '9') return 1;
230     if (c >= 'A' && c <= 'F') return 1;
231     if (c >= 'a' && c <= 'f') return 1;
232     return 0;
233 }
234 
235 int
236 LDAP_CALL
ldap_utf8isspace(char * s)237 ldap_utf8isspace( char* s )
238 {
239     register unsigned char *c = (unsigned char*)s;
240     int len = ldap_utf8len(s);
241 
242     if (len == 0) {
243 	return 0;
244     } else if (len == 1) {
245 	switch (*c) {
246 	    case 0x09:
247 	    case 0x0A:
248 	    case 0x0B:
249 	    case 0x0C:
250 	    case 0x0D:
251 	    case 0x20:
252 		return 1;
253 	    default:
254 		return 0;
255 	}
256     } else if (len == 2) {
257 	if (*c == 0xc2) {
258 		return *(c+1) == 0x80;
259 	}
260     } else if (len == 3) {
261 	if (*c == 0xE2) {
262 	    c++;
263 	    if (*c == 0x80) {
264 		c++;
265 		return (*c>=0x80 && *c<=0x8a);
266 	    }
267 	} else if (*c == 0xE3) {
268 	    return (*(c+1)==0x80) && (*(c+2)==0x80);
269 	} else if (*c==0xEF) {
270 	    return (*(c+1)==0xBB) && (*(c+2)==0xBF);
271 	}
272 	return 0;
273     }
274 
275     /* should never reach here */
276     return 0;
277 }
278