1 #pragma ident "%Z%%M% %I% %E% SMI"
2
3 /*
4 * The contents of this file are subject to the Netscape Public
5 * License Version 1.1 (the "License"); you may not use this file
6 * except in compliance with the License. You may obtain a copy of
7 * the License at http://www.mozilla.org/NPL/
8 *
9 * Software distributed under the License is distributed on an "AS
10 * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or
11 * implied. See the License for the specific language governing
12 * rights and limitations under the License.
13 *
14 * The Original Code is Mozilla Communicator client code, released
15 * March 31, 1998.
16 *
17 * The Initial Developer of the Original Code is Netscape
18 * Communications Corporation. Portions created by Netscape are
19 * Copyright (C) 1998-1999 Netscape Communications Corporation. All
20 * Rights Reserved.
21 *
22 * Contributor(s):
23 */
24
25 /* utf8.c - misc. utf8 "string" functions. */
26 #include "ldap-int.h"
27
/*
 * Sequence-length lookup for UTF-8, indexed by the top six bits of the
 * first byte of a sequence, i.e. UTF8len[(byte >> 2) & 0x3F].
 *
 *   1..6  the byte starts a sequence of that many bytes
 *   0     the byte is a continuation byte (10xxxxxx), so it cannot start
 *         a character
 *
 * The table is read-only; it is declared const so it cannot be modified
 * by accident.
 */
static const char UTF8len[64] = {
    /* 0x00-0x3F: ASCII                */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x40-0x7F: ASCII                */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80-0xBF: continuation bytes   */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0-0xFF: multi-byte lead bytes */
    2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 5, 6
};
33
34 int
35 LDAP_CALL
ldap_utf8len(const char * s)36 ldap_utf8len (const char* s)
37 /* Return the number of char's in the character at *s. */
38 {
39 return ldap_utf8next((char*)s) - s;
40 }
41
42 char*
43 LDAP_CALL
ldap_utf8next(char * s)44 ldap_utf8next (char* s)
45 /* Return a pointer to the character immediately following *s.
46 Handle any valid UTF-8 character, including '\0' and ASCII.
47 Try to handle a misaligned pointer or a malformed character.
48 */
49 {
50 register unsigned char* next = (unsigned char*)s;
51 switch (UTF8len [(*next >> 2) & 0x3F]) {
52 case 0: /* erroneous: s points to the middle of a character. */
53 case 6: if ((*++next & 0xC0) != 0x80) break;
54 case 5: if ((*++next & 0xC0) != 0x80) break;
55 case 4: if ((*++next & 0xC0) != 0x80) break;
56 case 3: if ((*++next & 0xC0) != 0x80) break;
57 case 2: if ((*++next & 0xC0) != 0x80) break;
58 case 1: ++next;
59 }
60 return (char*) next;
61 }
62
63 char*
64 LDAP_CALL
ldap_utf8prev(char * s)65 ldap_utf8prev (char* s)
66 /* Return a pointer to the character immediately preceding *s.
67 Handle any valid UTF-8 character, including '\0' and ASCII.
68 Try to handle a misaligned pointer or a malformed character.
69 */
70 {
71 register unsigned char* prev = (unsigned char*)s;
72 unsigned char* limit = prev - 6;
73 while (((*--prev & 0xC0) == 0x80) && (prev != limit)) {
74 ;
75 }
76 return (char*) prev;
77 }
78
79 int
80 LDAP_CALL
ldap_utf8copy(char * dst,const char * src)81 ldap_utf8copy (char* dst, const char* src)
82 /* Copy a character from src to dst; return the number of char's copied.
83 Handle any valid UTF-8 character, including '\0' and ASCII.
84 Try to handle a misaligned pointer or a malformed character.
85 */
86 {
87 register const unsigned char* s = (const unsigned char*)src;
88 switch (UTF8len [(*s >> 2) & 0x3F]) {
89 case 0: /* erroneous: s points to the middle of a character. */
90 case 6: *dst++ = *s++; if ((*s & 0xC0) != 0x80) break;
91 case 5: *dst++ = *s++; if ((*s & 0xC0) != 0x80) break;
92 case 4: *dst++ = *s++; if ((*s & 0xC0) != 0x80) break;
93 case 3: *dst++ = *s++; if ((*s & 0xC0) != 0x80) break;
94 case 2: *dst++ = *s++; if ((*s & 0xC0) != 0x80) break;
95 case 1: *dst = *s++;
96 }
97 return s - (const unsigned char*)src;
98 }
99
100 size_t
101 LDAP_CALL
ldap_utf8characters(const char * src)102 ldap_utf8characters (const char* src)
103 /* Return the number of UTF-8 characters in the 0-terminated array s. */
104 {
105 register char* s = (char*)src;
106 size_t n;
107 for (n = 0; *s; LDAP_UTF8INC(s)) ++n;
108 return n;
109 }
110
111 unsigned long LDAP_CALL
ldap_utf8getcc(const char ** src)112 ldap_utf8getcc( const char** src )
113 {
114 register unsigned long c;
115 register const unsigned char* s = (const unsigned char*)*src;
116 switch (UTF8len [(*s >> 2) & 0x3F]) {
117 case 0: /* erroneous: s points to the middle of a character. */
118 c = (*s++) & 0x3F; goto more5;
119 case 1: c = (*s++); break;
120 case 2: c = (*s++) & 0x1F; goto more1;
121 case 3: c = (*s++) & 0x0F; goto more2;
122 case 4: c = (*s++) & 0x07; goto more3;
123 case 5: c = (*s++) & 0x03; goto more4;
124 case 6: c = (*s++) & 0x01; goto more5;
125 more5: if ((*s & 0xC0) != 0x80) break; c = (c << 6) | ((*s++) & 0x3F);
126 more4: if ((*s & 0xC0) != 0x80) break; c = (c << 6) | ((*s++) & 0x3F);
127 more3: if ((*s & 0xC0) != 0x80) break; c = (c << 6) | ((*s++) & 0x3F);
128 more2: if ((*s & 0xC0) != 0x80) break; c = (c << 6) | ((*s++) & 0x3F);
129 more1: if ((*s & 0xC0) != 0x80) break; c = (c << 6) | ((*s++) & 0x3F);
130 break;
131 }
132 *src = (const char*)s;
133 return c;
134 }
135
136 char*
137 LDAP_CALL
ldap_utf8strtok_r(char * sp,const char * brk,char ** next)138 ldap_utf8strtok_r( char* sp, const char* brk, char** next)
139 {
140 const char *bp;
141 unsigned long sc, bc;
142 char *tok;
143
144 if (sp == NULL && (sp = *next) == NULL)
145 return NULL;
146
147 /* Skip leading delimiters; roughly, sp += strspn(sp, brk) */
148 cont:
149 sc = LDAP_UTF8GETC(sp);
150 for (bp = brk; (bc = LDAP_UTF8GETCC(bp)) != 0;) {
151 if (sc == bc)
152 goto cont;
153 }
154
155 if (sc == 0) { /* no non-delimiter characters */
156 *next = NULL;
157 return NULL;
158 }
159 tok = LDAP_UTF8PREV(sp);
160
161 /* Scan token; roughly, sp += strcspn(sp, brk)
162 * Note that brk must be 0-terminated; we stop if we see that, too.
163 */
164 while (1) {
165 sc = LDAP_UTF8GETC(sp);
166 bp = brk;
167 do {
168 if ((bc = LDAP_UTF8GETCC(bp)) == sc) {
169 if (sc == 0) {
170 *next = NULL;
171 } else {
172 *next = sp;
173 *(LDAP_UTF8PREV(sp)) = 0;
174 }
175 return tok;
176 }
177 } while (bc != 0);
178 }
179 /* NOTREACHED */
180 }
181
182 int
183 LDAP_CALL
ldap_utf8isalnum(char * s)184 ldap_utf8isalnum( char* s )
185 {
186 register unsigned char c = *(unsigned char*)s;
187 if (0x80 & c) return 0;
188 if (c >= 'A' && c <= 'Z') return 1;
189 if (c >= 'a' && c <= 'z') return 1;
190 if (c >= '0' && c <= '9') return 1;
191 return 0;
192 }
193
194 int
195 LDAP_CALL
ldap_utf8isalpha(char * s)196 ldap_utf8isalpha( char* s )
197 {
198 register unsigned char c = *(unsigned char*)s;
199 if (0x80 & c) return 0;
200 if (c >= 'A' && c <= 'Z') return 1;
201 if (c >= 'a' && c <= 'z') return 1;
202 return 0;
203 }
204
205 int
206 LDAP_CALL
ldap_utf8isdigit(char * s)207 ldap_utf8isdigit( char* s )
208 {
209 register unsigned char c = *(unsigned char*)s;
210 if (0x80 & c) return 0;
211 if (c >= '0' && c <= '9') return 1;
212 return 0;
213 }
214
215 int
216 LDAP_CALL
ldap_utf8isxdigit(char * s)217 ldap_utf8isxdigit( char* s )
218 {
219 register unsigned char c = *(unsigned char*)s;
220 if (0x80 & c) return 0;
221 if (c >= '0' && c <= '9') return 1;
222 if (c >= 'A' && c <= 'F') return 1;
223 if (c >= 'a' && c <= 'f') return 1;
224 return 0;
225 }
226
227 int
228 LDAP_CALL
ldap_utf8isspace(char * s)229 ldap_utf8isspace( char* s )
230 {
231 register unsigned char *c = (unsigned char*)s;
232 int len = ldap_utf8len(s);
233
234 if (len == 0) {
235 return 0;
236 } else if (len == 1) {
237 switch (*c) {
238 case 0x09:
239 case 0x0A:
240 case 0x0B:
241 case 0x0C:
242 case 0x0D:
243 case 0x20:
244 return 1;
245 default:
246 return 0;
247 }
248 } else if (len == 2) {
249 if (*c == 0xc2) {
250 return *(c+1) == 0x80;
251 }
252 } else if (len == 3) {
253 if (*c == 0xE2) {
254 c++;
255 if (*c == 0x80) {
256 c++;
257 return (*c>=0x80 && *c<=0x8a);
258 }
259 } else if (*c == 0xE3) {
260 return (*(c+1)==0x80) && (*(c+2)==0x80);
261 } else if (*c==0xEF) {
262 return (*(c+1)==0xBB) && (*(c+2)==0xBF);
263 }
264 return 0;
265 }
266
267 /* should never reach here */
268 return 0;
269 }
270