1 /* 2 * The contents of this file are subject to the Netscape Public 3 * License Version 1.1 (the "License"); you may not use this file 4 * except in compliance with the License. You may obtain a copy of 5 * the License at http://www.mozilla.org/NPL/ 6 * 7 * Software distributed under the License is distributed on an "AS 8 * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or 9 * implied. See the License for the specific language governing 10 * rights and limitations under the License. 11 * 12 * The Original Code is Mozilla Communicator client code, released 13 * March 31, 1998. 14 * 15 * The Initial Developer of the Original Code is Netscape 16 * Communications Corporation. Portions created by Netscape are 17 * Copyright (C) 1998-1999 Netscape Communications Corporation. All 18 * Rights Reserved. 19 * 20 * Contributor(s): 21 */ 22 23 /* uft8.c - misc. utf8 "string" functions. */ 24 #include "ldap-int.h" 25 26 static char UTF8len[64] 27 = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 28 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 29 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 30 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 5, 6}; 31 32 int 33 LDAP_CALL 34 ldap_utf8len (const char* s) 35 /* Return the number of char's in the character at *s. */ 36 { 37 return ldap_utf8next((char*)s) - s; 38 } 39 40 char* 41 LDAP_CALL 42 ldap_utf8next (char* s) 43 /* Return a pointer to the character immediately following *s. 44 Handle any valid UTF-8 character, including '\0' and ASCII. 45 Try to handle a misaligned pointer or a malformed character. 46 */ 47 { 48 register unsigned char* next = (unsigned char*)s; 49 switch (UTF8len [(*next >> 2) & 0x3F]) { 50 case 0: /* erroneous: s points to the middle of a character. */ 51 case 6: if ((*++next & 0xC0) != 0x80) break; 52 /* FALLTHROUGH */ 53 case 5: if ((*++next & 0xC0) != 0x80) break; 54 /* FALLTHROUGH */ 55 case 4: if ((*++next & 0xC0) != 0x80) break; 56 /* FALLTHROUGH */ 57 case 3: if ((*++next & 0xC0) != 0x80) break; 58 /* FALLTHROUGH */ 59 case 2: if ((*++next & 0xC0) != 0x80) break; 60 /* FALLTHROUGH */ 61 case 1: ++next; 62 } 63 return (char*) next; 64 } 65 66 char* 67 LDAP_CALL 68 ldap_utf8prev (char* s) 69 /* Return a pointer to the character immediately preceding *s. 70 Handle any valid UTF-8 character, including '\0' and ASCII. 71 Try to handle a misaligned pointer or a malformed character. 72 */ 73 { 74 register unsigned char* prev = (unsigned char*)s; 75 unsigned char* limit = prev - 6; 76 while (((*--prev & 0xC0) == 0x80) && (prev != limit)) { 77 ; 78 } 79 return (char*) prev; 80 } 81 82 int 83 LDAP_CALL 84 ldap_utf8copy (char* dst, const char* src) 85 /* Copy a character from src to dst; return the number of char's copied. 86 Handle any valid UTF-8 character, including '\0' and ASCII. 87 Try to handle a misaligned pointer or a malformed character. 88 */ 89 { 90 register const unsigned char* s = (const unsigned char*)src; 91 switch (UTF8len [(*s >> 2) & 0x3F]) { 92 case 0: /* erroneous: s points to the middle of a character. */ 93 case 6: *dst++ = *s++; if ((*s & 0xC0) != 0x80) break; 94 /* FALLTHROUGH */ 95 case 5: *dst++ = *s++; if ((*s & 0xC0) != 0x80) break; 96 /* FALLTHROUGH */ 97 case 4: *dst++ = *s++; if ((*s & 0xC0) != 0x80) break; 98 /* FALLTHROUGH */ 99 case 3: *dst++ = *s++; if ((*s & 0xC0) != 0x80) break; 100 /* FALLTHROUGH */ 101 case 2: *dst++ = *s++; if ((*s & 0xC0) != 0x80) break; 102 /* FALLTHROUGH */ 103 case 1: *dst = *s++; 104 } 105 return s - (const unsigned char*)src; 106 } 107 108 size_t 109 LDAP_CALL 110 ldap_utf8characters (const char* src) 111 /* Return the number of UTF-8 characters in the 0-terminated array s. */ 112 { 113 register char* s = (char*)src; 114 size_t n; 115 for (n = 0; *s; LDAP_UTF8INC(s)) ++n; 116 return n; 117 } 118 119 unsigned long LDAP_CALL 120 ldap_utf8getcc( const char** src ) 121 { 122 register unsigned long c; 123 register const unsigned char* s = (const unsigned char*)*src; 124 switch (UTF8len [(*s >> 2) & 0x3F]) { 125 case 0: /* erroneous: s points to the middle of a character. */ 126 c = (*s++) & 0x3F; goto more5; 127 case 1: c = (*s++); break; 128 case 2: c = (*s++) & 0x1F; goto more1; 129 case 3: c = (*s++) & 0x0F; goto more2; 130 case 4: c = (*s++) & 0x07; goto more3; 131 case 5: c = (*s++) & 0x03; goto more4; 132 case 6: c = (*s++) & 0x01; goto more5; 133 more5: if ((*s & 0xC0) != 0x80) break; c = (c << 6) | ((*s++) & 0x3F); 134 more4: if ((*s & 0xC0) != 0x80) break; c = (c << 6) | ((*s++) & 0x3F); 135 more3: if ((*s & 0xC0) != 0x80) break; c = (c << 6) | ((*s++) & 0x3F); 136 more2: if ((*s & 0xC0) != 0x80) break; c = (c << 6) | ((*s++) & 0x3F); 137 more1: if ((*s & 0xC0) != 0x80) break; c = (c << 6) | ((*s++) & 0x3F); 138 break; 139 } 140 *src = (const char*)s; 141 return c; 142 } 143 144 char* 145 LDAP_CALL 146 ldap_utf8strtok_r( char* sp, const char* brk, char** next) 147 { 148 const char *bp; 149 unsigned long sc, bc; 150 char *tok; 151 152 if (sp == NULL && (sp = *next) == NULL) 153 return NULL; 154 155 /* Skip leading delimiters; roughly, sp += strspn(sp, brk) */ 156 cont: 157 sc = LDAP_UTF8GETC(sp); 158 for (bp = brk; (bc = LDAP_UTF8GETCC(bp)) != 0;) { 159 if (sc == bc) 160 goto cont; 161 } 162 163 if (sc == 0) { /* no non-delimiter characters */ 164 *next = NULL; 165 return NULL; 166 } 167 tok = LDAP_UTF8PREV(sp); 168 169 /* Scan token; roughly, sp += strcspn(sp, brk) 170 * Note that brk must be 0-terminated; we stop if we see that, too. 171 */ 172 while (1) { 173 sc = LDAP_UTF8GETC(sp); 174 bp = brk; 175 do { 176 if ((bc = LDAP_UTF8GETCC(bp)) == sc) { 177 if (sc == 0) { 178 *next = NULL; 179 } else { 180 *next = sp; 181 *(LDAP_UTF8PREV(sp)) = 0; 182 } 183 return tok; 184 } 185 } while (bc != 0); 186 } 187 /* NOTREACHED */ 188 } 189 190 int 191 LDAP_CALL 192 ldap_utf8isalnum( char* s ) 193 { 194 register unsigned char c = *(unsigned char*)s; 195 if (0x80 & c) return 0; 196 if (c >= 'A' && c <= 'Z') return 1; 197 if (c >= 'a' && c <= 'z') return 1; 198 if (c >= '0' && c <= '9') return 1; 199 return 0; 200 } 201 202 int 203 LDAP_CALL 204 ldap_utf8isalpha( char* s ) 205 { 206 register unsigned char c = *(unsigned char*)s; 207 if (0x80 & c) return 0; 208 if (c >= 'A' && c <= 'Z') return 1; 209 if (c >= 'a' && c <= 'z') return 1; 210 return 0; 211 } 212 213 int 214 LDAP_CALL 215 ldap_utf8isdigit( char* s ) 216 { 217 register unsigned char c = *(unsigned char*)s; 218 if (0x80 & c) return 0; 219 if (c >= '0' && c <= '9') return 1; 220 return 0; 221 } 222 223 int 224 LDAP_CALL 225 ldap_utf8isxdigit( char* s ) 226 { 227 register unsigned char c = *(unsigned char*)s; 228 if (0x80 & c) return 0; 229 if (c >= '0' && c <= '9') return 1; 230 if (c >= 'A' && c <= 'F') return 1; 231 if (c >= 'a' && c <= 'f') return 1; 232 return 0; 233 } 234 235 int 236 LDAP_CALL 237 ldap_utf8isspace( char* s ) 238 { 239 register unsigned char *c = (unsigned char*)s; 240 int len = ldap_utf8len(s); 241 242 if (len == 0) { 243 return 0; 244 } else if (len == 1) { 245 switch (*c) { 246 case 0x09: 247 case 0x0A: 248 case 0x0B: 249 case 0x0C: 250 case 0x0D: 251 case 0x20: 252 return 1; 253 default: 254 return 0; 255 } 256 } else if (len == 2) { 257 if (*c == 0xc2) { 258 return *(c+1) == 0x80; 259 } 260 } else if (len == 3) { 261 if (*c == 0xE2) { 262 c++; 263 if (*c == 0x80) { 264 c++; 265 return (*c>=0x80 && *c<=0x8a); 266 } 267 } else if (*c == 0xE3) { 268 return (*(c+1)==0x80) && (*(c+2)==0x80); 269 } else if (*c==0xEF) { 270 return (*(c+1)==0xBB) && (*(c+2)==0xBF); 271 } 272 return 0; 273 } 274 275 /* should never reach here */ 276 return 0; 277 } 278