1 #pragma ident "%Z%%M% %I% %E% SMI" 2 3 /* 4 * The contents of this file are subject to the Netscape Public 5 * License Version 1.1 (the "License"); you may not use this file 6 * except in compliance with the License. You may obtain a copy of 7 * the License at http://www.mozilla.org/NPL/ 8 * 9 * Software distributed under the License is distributed on an "AS 10 * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or 11 * implied. See the License for the specific language governing 12 * rights and limitations under the License. 13 * 14 * The Original Code is Mozilla Communicator client code, released 15 * March 31, 1998. 16 * 17 * The Initial Developer of the Original Code is Netscape 18 * Communications Corporation. Portions created by Netscape are 19 * Copyright (C) 1998-1999 Netscape Communications Corporation. All 20 * Rights Reserved. 21 * 22 * Contributor(s): 23 */ 24 25 /* uft8.c - misc. utf8 "string" functions. */ 26 #include "ldap-int.h" 27 28 static char UTF8len[64] 29 = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 30 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 31 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 32 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 5, 6}; 33 34 int 35 LDAP_CALL 36 ldap_utf8len (const char* s) 37 /* Return the number of char's in the character at *s. */ 38 { 39 return ldap_utf8next((char*)s) - s; 40 } 41 42 char* 43 LDAP_CALL 44 ldap_utf8next (char* s) 45 /* Return a pointer to the character immediately following *s. 46 Handle any valid UTF-8 character, including '\0' and ASCII. 47 Try to handle a misaligned pointer or a malformed character. 48 */ 49 { 50 register unsigned char* next = (unsigned char*)s; 51 switch (UTF8len [(*next >> 2) & 0x3F]) { 52 case 0: /* erroneous: s points to the middle of a character. */ 53 case 6: if ((*++next & 0xC0) != 0x80) break; 54 case 5: if ((*++next & 0xC0) != 0x80) break; 55 case 4: if ((*++next & 0xC0) != 0x80) break; 56 case 3: if ((*++next & 0xC0) != 0x80) break; 57 case 2: if ((*++next & 0xC0) != 0x80) break; 58 case 1: ++next; 59 } 60 return (char*) next; 61 } 62 63 char* 64 LDAP_CALL 65 ldap_utf8prev (char* s) 66 /* Return a pointer to the character immediately preceding *s. 67 Handle any valid UTF-8 character, including '\0' and ASCII. 68 Try to handle a misaligned pointer or a malformed character. 69 */ 70 { 71 register unsigned char* prev = (unsigned char*)s; 72 unsigned char* limit = prev - 6; 73 while (((*--prev & 0xC0) == 0x80) && (prev != limit)) { 74 ; 75 } 76 return (char*) prev; 77 } 78 79 int 80 LDAP_CALL 81 ldap_utf8copy (char* dst, const char* src) 82 /* Copy a character from src to dst; return the number of char's copied. 83 Handle any valid UTF-8 character, including '\0' and ASCII. 84 Try to handle a misaligned pointer or a malformed character. 85 */ 86 { 87 register const unsigned char* s = (const unsigned char*)src; 88 switch (UTF8len [(*s >> 2) & 0x3F]) { 89 case 0: /* erroneous: s points to the middle of a character. */ 90 case 6: *dst++ = *s++; if ((*s & 0xC0) != 0x80) break; 91 case 5: *dst++ = *s++; if ((*s & 0xC0) != 0x80) break; 92 case 4: *dst++ = *s++; if ((*s & 0xC0) != 0x80) break; 93 case 3: *dst++ = *s++; if ((*s & 0xC0) != 0x80) break; 94 case 2: *dst++ = *s++; if ((*s & 0xC0) != 0x80) break; 95 case 1: *dst = *s++; 96 } 97 return s - (const unsigned char*)src; 98 } 99 100 size_t 101 LDAP_CALL 102 ldap_utf8characters (const char* src) 103 /* Return the number of UTF-8 characters in the 0-terminated array s. */ 104 { 105 register char* s = (char*)src; 106 size_t n; 107 for (n = 0; *s; LDAP_UTF8INC(s)) ++n; 108 return n; 109 } 110 111 unsigned long LDAP_CALL 112 ldap_utf8getcc( const char** src ) 113 { 114 register unsigned long c; 115 register const unsigned char* s = (const unsigned char*)*src; 116 switch (UTF8len [(*s >> 2) & 0x3F]) { 117 case 0: /* erroneous: s points to the middle of a character. */ 118 c = (*s++) & 0x3F; goto more5; 119 case 1: c = (*s++); break; 120 case 2: c = (*s++) & 0x1F; goto more1; 121 case 3: c = (*s++) & 0x0F; goto more2; 122 case 4: c = (*s++) & 0x07; goto more3; 123 case 5: c = (*s++) & 0x03; goto more4; 124 case 6: c = (*s++) & 0x01; goto more5; 125 more5: if ((*s & 0xC0) != 0x80) break; c = (c << 6) | ((*s++) & 0x3F); 126 more4: if ((*s & 0xC0) != 0x80) break; c = (c << 6) | ((*s++) & 0x3F); 127 more3: if ((*s & 0xC0) != 0x80) break; c = (c << 6) | ((*s++) & 0x3F); 128 more2: if ((*s & 0xC0) != 0x80) break; c = (c << 6) | ((*s++) & 0x3F); 129 more1: if ((*s & 0xC0) != 0x80) break; c = (c << 6) | ((*s++) & 0x3F); 130 break; 131 } 132 *src = (const char*)s; 133 return c; 134 } 135 136 char* 137 LDAP_CALL 138 ldap_utf8strtok_r( char* sp, const char* brk, char** next) 139 { 140 const char *bp; 141 unsigned long sc, bc; 142 char *tok; 143 144 if (sp == NULL && (sp = *next) == NULL) 145 return NULL; 146 147 /* Skip leading delimiters; roughly, sp += strspn(sp, brk) */ 148 cont: 149 sc = LDAP_UTF8GETC(sp); 150 for (bp = brk; (bc = LDAP_UTF8GETCC(bp)) != 0;) { 151 if (sc == bc) 152 goto cont; 153 } 154 155 if (sc == 0) { /* no non-delimiter characters */ 156 *next = NULL; 157 return NULL; 158 } 159 tok = LDAP_UTF8PREV(sp); 160 161 /* Scan token; roughly, sp += strcspn(sp, brk) 162 * Note that brk must be 0-terminated; we stop if we see that, too. 163 */ 164 while (1) { 165 sc = LDAP_UTF8GETC(sp); 166 bp = brk; 167 do { 168 if ((bc = LDAP_UTF8GETCC(bp)) == sc) { 169 if (sc == 0) { 170 *next = NULL; 171 } else { 172 *next = sp; 173 *(LDAP_UTF8PREV(sp)) = 0; 174 } 175 return tok; 176 } 177 } while (bc != 0); 178 } 179 /* NOTREACHED */ 180 } 181 182 int 183 LDAP_CALL 184 ldap_utf8isalnum( char* s ) 185 { 186 register unsigned char c = *(unsigned char*)s; 187 if (0x80 & c) return 0; 188 if (c >= 'A' && c <= 'Z') return 1; 189 if (c >= 'a' && c <= 'z') return 1; 190 if (c >= '0' && c <= '9') return 1; 191 return 0; 192 } 193 194 int 195 LDAP_CALL 196 ldap_utf8isalpha( char* s ) 197 { 198 register unsigned char c = *(unsigned char*)s; 199 if (0x80 & c) return 0; 200 if (c >= 'A' && c <= 'Z') return 1; 201 if (c >= 'a' && c <= 'z') return 1; 202 return 0; 203 } 204 205 int 206 LDAP_CALL 207 ldap_utf8isdigit( char* s ) 208 { 209 register unsigned char c = *(unsigned char*)s; 210 if (0x80 & c) return 0; 211 if (c >= '0' && c <= '9') return 1; 212 return 0; 213 } 214 215 int 216 LDAP_CALL 217 ldap_utf8isxdigit( char* s ) 218 { 219 register unsigned char c = *(unsigned char*)s; 220 if (0x80 & c) return 0; 221 if (c >= '0' && c <= '9') return 1; 222 if (c >= 'A' && c <= 'F') return 1; 223 if (c >= 'a' && c <= 'f') return 1; 224 return 0; 225 } 226 227 int 228 LDAP_CALL 229 ldap_utf8isspace( char* s ) 230 { 231 register unsigned char *c = (unsigned char*)s; 232 int len = ldap_utf8len(s); 233 234 if (len == 0) { 235 return 0; 236 } else if (len == 1) { 237 switch (*c) { 238 case 0x09: 239 case 0x0A: 240 case 0x0B: 241 case 0x0C: 242 case 0x0D: 243 case 0x20: 244 return 1; 245 default: 246 return 0; 247 } 248 } else if (len == 2) { 249 if (*c == 0xc2) { 250 return *(c+1) == 0x80; 251 } 252 } else if (len == 3) { 253 if (*c == 0xE2) { 254 c++; 255 if (*c == 0x80) { 256 c++; 257 return (*c>=0x80 && *c<=0x8a); 258 } 259 } else if (*c == 0xE3) { 260 return (*(c+1)==0x80) && (*(c+2)==0x80); 261 } else if (*c==0xEF) { 262 return (*(c+1)==0xBB) && (*(c+2)==0xBF); 263 } 264 return 0; 265 } 266 267 /* should never reach here */ 268 return 0; 269 } 270