1*7c478bd9Sstevel@tonic-gate #pragma ident "%Z%%M% %I% %E% SMI" 2*7c478bd9Sstevel@tonic-gate 3*7c478bd9Sstevel@tonic-gate /* 4*7c478bd9Sstevel@tonic-gate * The contents of this file are subject to the Netscape Public 5*7c478bd9Sstevel@tonic-gate * License Version 1.1 (the "License"); you may not use this file 6*7c478bd9Sstevel@tonic-gate * except in compliance with the License. You may obtain a copy of 7*7c478bd9Sstevel@tonic-gate * the License at http://www.mozilla.org/NPL/ 8*7c478bd9Sstevel@tonic-gate * 9*7c478bd9Sstevel@tonic-gate * Software distributed under the License is distributed on an "AS 10*7c478bd9Sstevel@tonic-gate * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or 11*7c478bd9Sstevel@tonic-gate * implied. See the License for the specific language governing 12*7c478bd9Sstevel@tonic-gate * rights and limitations under the License. 13*7c478bd9Sstevel@tonic-gate * 14*7c478bd9Sstevel@tonic-gate * The Original Code is Mozilla Communicator client code, released 15*7c478bd9Sstevel@tonic-gate * March 31, 1998. 16*7c478bd9Sstevel@tonic-gate * 17*7c478bd9Sstevel@tonic-gate * The Initial Developer of the Original Code is Netscape 18*7c478bd9Sstevel@tonic-gate * Communications Corporation. Portions created by Netscape are 19*7c478bd9Sstevel@tonic-gate * Copyright (C) 1998-1999 Netscape Communications Corporation. All 20*7c478bd9Sstevel@tonic-gate * Rights Reserved. 21*7c478bd9Sstevel@tonic-gate * 22*7c478bd9Sstevel@tonic-gate * Contributor(s): 23*7c478bd9Sstevel@tonic-gate */ 24*7c478bd9Sstevel@tonic-gate 25*7c478bd9Sstevel@tonic-gate /* uft8.c - misc. utf8 "string" functions. */ 26*7c478bd9Sstevel@tonic-gate #include "ldap-int.h" 27*7c478bd9Sstevel@tonic-gate 28*7c478bd9Sstevel@tonic-gate static char UTF8len[64] 29*7c478bd9Sstevel@tonic-gate = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 30*7c478bd9Sstevel@tonic-gate 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 31*7c478bd9Sstevel@tonic-gate 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 32*7c478bd9Sstevel@tonic-gate 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 5, 6}; 33*7c478bd9Sstevel@tonic-gate 34*7c478bd9Sstevel@tonic-gate int 35*7c478bd9Sstevel@tonic-gate LDAP_CALL 36*7c478bd9Sstevel@tonic-gate ldap_utf8len (const char* s) 37*7c478bd9Sstevel@tonic-gate /* Return the number of char's in the character at *s. */ 38*7c478bd9Sstevel@tonic-gate { 39*7c478bd9Sstevel@tonic-gate return ldap_utf8next((char*)s) - s; 40*7c478bd9Sstevel@tonic-gate } 41*7c478bd9Sstevel@tonic-gate 42*7c478bd9Sstevel@tonic-gate char* 43*7c478bd9Sstevel@tonic-gate LDAP_CALL 44*7c478bd9Sstevel@tonic-gate ldap_utf8next (char* s) 45*7c478bd9Sstevel@tonic-gate /* Return a pointer to the character immediately following *s. 46*7c478bd9Sstevel@tonic-gate Handle any valid UTF-8 character, including '\0' and ASCII. 47*7c478bd9Sstevel@tonic-gate Try to handle a misaligned pointer or a malformed character. 48*7c478bd9Sstevel@tonic-gate */ 49*7c478bd9Sstevel@tonic-gate { 50*7c478bd9Sstevel@tonic-gate register unsigned char* next = (unsigned char*)s; 51*7c478bd9Sstevel@tonic-gate switch (UTF8len [(*next >> 2) & 0x3F]) { 52*7c478bd9Sstevel@tonic-gate case 0: /* erroneous: s points to the middle of a character. */ 53*7c478bd9Sstevel@tonic-gate case 6: if ((*++next & 0xC0) != 0x80) break; 54*7c478bd9Sstevel@tonic-gate case 5: if ((*++next & 0xC0) != 0x80) break; 55*7c478bd9Sstevel@tonic-gate case 4: if ((*++next & 0xC0) != 0x80) break; 56*7c478bd9Sstevel@tonic-gate case 3: if ((*++next & 0xC0) != 0x80) break; 57*7c478bd9Sstevel@tonic-gate case 2: if ((*++next & 0xC0) != 0x80) break; 58*7c478bd9Sstevel@tonic-gate case 1: ++next; 59*7c478bd9Sstevel@tonic-gate } 60*7c478bd9Sstevel@tonic-gate return (char*) next; 61*7c478bd9Sstevel@tonic-gate } 62*7c478bd9Sstevel@tonic-gate 63*7c478bd9Sstevel@tonic-gate char* 64*7c478bd9Sstevel@tonic-gate LDAP_CALL 65*7c478bd9Sstevel@tonic-gate ldap_utf8prev (char* s) 66*7c478bd9Sstevel@tonic-gate /* Return a pointer to the character immediately preceding *s. 67*7c478bd9Sstevel@tonic-gate Handle any valid UTF-8 character, including '\0' and ASCII. 68*7c478bd9Sstevel@tonic-gate Try to handle a misaligned pointer or a malformed character. 69*7c478bd9Sstevel@tonic-gate */ 70*7c478bd9Sstevel@tonic-gate { 71*7c478bd9Sstevel@tonic-gate register unsigned char* prev = (unsigned char*)s; 72*7c478bd9Sstevel@tonic-gate unsigned char* limit = prev - 6; 73*7c478bd9Sstevel@tonic-gate while (((*--prev & 0xC0) == 0x80) && (prev != limit)) { 74*7c478bd9Sstevel@tonic-gate ; 75*7c478bd9Sstevel@tonic-gate } 76*7c478bd9Sstevel@tonic-gate return (char*) prev; 77*7c478bd9Sstevel@tonic-gate } 78*7c478bd9Sstevel@tonic-gate 79*7c478bd9Sstevel@tonic-gate int 80*7c478bd9Sstevel@tonic-gate LDAP_CALL 81*7c478bd9Sstevel@tonic-gate ldap_utf8copy (char* dst, const char* src) 82*7c478bd9Sstevel@tonic-gate /* Copy a character from src to dst; return the number of char's copied. 83*7c478bd9Sstevel@tonic-gate Handle any valid UTF-8 character, including '\0' and ASCII. 84*7c478bd9Sstevel@tonic-gate Try to handle a misaligned pointer or a malformed character. 85*7c478bd9Sstevel@tonic-gate */ 86*7c478bd9Sstevel@tonic-gate { 87*7c478bd9Sstevel@tonic-gate register const unsigned char* s = (const unsigned char*)src; 88*7c478bd9Sstevel@tonic-gate switch (UTF8len [(*s >> 2) & 0x3F]) { 89*7c478bd9Sstevel@tonic-gate case 0: /* erroneous: s points to the middle of a character. */ 90*7c478bd9Sstevel@tonic-gate case 6: *dst++ = *s++; if ((*s & 0xC0) != 0x80) break; 91*7c478bd9Sstevel@tonic-gate case 5: *dst++ = *s++; if ((*s & 0xC0) != 0x80) break; 92*7c478bd9Sstevel@tonic-gate case 4: *dst++ = *s++; if ((*s & 0xC0) != 0x80) break; 93*7c478bd9Sstevel@tonic-gate case 3: *dst++ = *s++; if ((*s & 0xC0) != 0x80) break; 94*7c478bd9Sstevel@tonic-gate case 2: *dst++ = *s++; if ((*s & 0xC0) != 0x80) break; 95*7c478bd9Sstevel@tonic-gate case 1: *dst = *s++; 96*7c478bd9Sstevel@tonic-gate } 97*7c478bd9Sstevel@tonic-gate return s - (const unsigned char*)src; 98*7c478bd9Sstevel@tonic-gate } 99*7c478bd9Sstevel@tonic-gate 100*7c478bd9Sstevel@tonic-gate size_t 101*7c478bd9Sstevel@tonic-gate LDAP_CALL 102*7c478bd9Sstevel@tonic-gate ldap_utf8characters (const char* src) 103*7c478bd9Sstevel@tonic-gate /* Return the number of UTF-8 characters in the 0-terminated array s. */ 104*7c478bd9Sstevel@tonic-gate { 105*7c478bd9Sstevel@tonic-gate register char* s = (char*)src; 106*7c478bd9Sstevel@tonic-gate size_t n; 107*7c478bd9Sstevel@tonic-gate for (n = 0; *s; LDAP_UTF8INC(s)) ++n; 108*7c478bd9Sstevel@tonic-gate return n; 109*7c478bd9Sstevel@tonic-gate } 110*7c478bd9Sstevel@tonic-gate 111*7c478bd9Sstevel@tonic-gate unsigned long LDAP_CALL 112*7c478bd9Sstevel@tonic-gate ldap_utf8getcc( const char** src ) 113*7c478bd9Sstevel@tonic-gate { 114*7c478bd9Sstevel@tonic-gate register unsigned long c; 115*7c478bd9Sstevel@tonic-gate register const unsigned char* s = (const unsigned char*)*src; 116*7c478bd9Sstevel@tonic-gate switch (UTF8len [(*s >> 2) & 0x3F]) { 117*7c478bd9Sstevel@tonic-gate case 0: /* erroneous: s points to the middle of a character. */ 118*7c478bd9Sstevel@tonic-gate c = (*s++) & 0x3F; goto more5; 119*7c478bd9Sstevel@tonic-gate case 1: c = (*s++); break; 120*7c478bd9Sstevel@tonic-gate case 2: c = (*s++) & 0x1F; goto more1; 121*7c478bd9Sstevel@tonic-gate case 3: c = (*s++) & 0x0F; goto more2; 122*7c478bd9Sstevel@tonic-gate case 4: c = (*s++) & 0x07; goto more3; 123*7c478bd9Sstevel@tonic-gate case 5: c = (*s++) & 0x03; goto more4; 124*7c478bd9Sstevel@tonic-gate case 6: c = (*s++) & 0x01; goto more5; 125*7c478bd9Sstevel@tonic-gate more5: if ((*s & 0xC0) != 0x80) break; c = (c << 6) | ((*s++) & 0x3F); 126*7c478bd9Sstevel@tonic-gate more4: if ((*s & 0xC0) != 0x80) break; c = (c << 6) | ((*s++) & 0x3F); 127*7c478bd9Sstevel@tonic-gate more3: if ((*s & 0xC0) != 0x80) break; c = (c << 6) | ((*s++) & 0x3F); 128*7c478bd9Sstevel@tonic-gate more2: if ((*s & 0xC0) != 0x80) break; c = (c << 6) | ((*s++) & 0x3F); 129*7c478bd9Sstevel@tonic-gate more1: if ((*s & 0xC0) != 0x80) break; c = (c << 6) | ((*s++) & 0x3F); 130*7c478bd9Sstevel@tonic-gate break; 131*7c478bd9Sstevel@tonic-gate } 132*7c478bd9Sstevel@tonic-gate *src = (const char*)s; 133*7c478bd9Sstevel@tonic-gate return c; 134*7c478bd9Sstevel@tonic-gate } 135*7c478bd9Sstevel@tonic-gate 136*7c478bd9Sstevel@tonic-gate char* 137*7c478bd9Sstevel@tonic-gate LDAP_CALL 138*7c478bd9Sstevel@tonic-gate ldap_utf8strtok_r( char* sp, const char* brk, char** next) 139*7c478bd9Sstevel@tonic-gate { 140*7c478bd9Sstevel@tonic-gate const char *bp; 141*7c478bd9Sstevel@tonic-gate unsigned long sc, bc; 142*7c478bd9Sstevel@tonic-gate char *tok; 143*7c478bd9Sstevel@tonic-gate 144*7c478bd9Sstevel@tonic-gate if (sp == NULL && (sp = *next) == NULL) 145*7c478bd9Sstevel@tonic-gate return NULL; 146*7c478bd9Sstevel@tonic-gate 147*7c478bd9Sstevel@tonic-gate /* Skip leading delimiters; roughly, sp += strspn(sp, brk) */ 148*7c478bd9Sstevel@tonic-gate cont: 149*7c478bd9Sstevel@tonic-gate sc = LDAP_UTF8GETC(sp); 150*7c478bd9Sstevel@tonic-gate for (bp = brk; (bc = LDAP_UTF8GETCC(bp)) != 0;) { 151*7c478bd9Sstevel@tonic-gate if (sc == bc) 152*7c478bd9Sstevel@tonic-gate goto cont; 153*7c478bd9Sstevel@tonic-gate } 154*7c478bd9Sstevel@tonic-gate 155*7c478bd9Sstevel@tonic-gate if (sc == 0) { /* no non-delimiter characters */ 156*7c478bd9Sstevel@tonic-gate *next = NULL; 157*7c478bd9Sstevel@tonic-gate return NULL; 158*7c478bd9Sstevel@tonic-gate } 159*7c478bd9Sstevel@tonic-gate tok = LDAP_UTF8PREV(sp); 160*7c478bd9Sstevel@tonic-gate 161*7c478bd9Sstevel@tonic-gate /* Scan token; roughly, sp += strcspn(sp, brk) 162*7c478bd9Sstevel@tonic-gate * Note that brk must be 0-terminated; we stop if we see that, too. 163*7c478bd9Sstevel@tonic-gate */ 164*7c478bd9Sstevel@tonic-gate while (1) { 165*7c478bd9Sstevel@tonic-gate sc = LDAP_UTF8GETC(sp); 166*7c478bd9Sstevel@tonic-gate bp = brk; 167*7c478bd9Sstevel@tonic-gate do { 168*7c478bd9Sstevel@tonic-gate if ((bc = LDAP_UTF8GETCC(bp)) == sc) { 169*7c478bd9Sstevel@tonic-gate if (sc == 0) { 170*7c478bd9Sstevel@tonic-gate *next = NULL; 171*7c478bd9Sstevel@tonic-gate } else { 172*7c478bd9Sstevel@tonic-gate *next = sp; 173*7c478bd9Sstevel@tonic-gate *(LDAP_UTF8PREV(sp)) = 0; 174*7c478bd9Sstevel@tonic-gate } 175*7c478bd9Sstevel@tonic-gate return tok; 176*7c478bd9Sstevel@tonic-gate } 177*7c478bd9Sstevel@tonic-gate } while (bc != 0); 178*7c478bd9Sstevel@tonic-gate } 179*7c478bd9Sstevel@tonic-gate /* NOTREACHED */ 180*7c478bd9Sstevel@tonic-gate } 181*7c478bd9Sstevel@tonic-gate 182*7c478bd9Sstevel@tonic-gate int 183*7c478bd9Sstevel@tonic-gate LDAP_CALL 184*7c478bd9Sstevel@tonic-gate ldap_utf8isalnum( char* s ) 185*7c478bd9Sstevel@tonic-gate { 186*7c478bd9Sstevel@tonic-gate register unsigned char c = *(unsigned char*)s; 187*7c478bd9Sstevel@tonic-gate if (0x80 & c) return 0; 188*7c478bd9Sstevel@tonic-gate if (c >= 'A' && c <= 'Z') return 1; 189*7c478bd9Sstevel@tonic-gate if (c >= 'a' && c <= 'z') return 1; 190*7c478bd9Sstevel@tonic-gate if (c >= '0' && c <= '9') return 1; 191*7c478bd9Sstevel@tonic-gate return 0; 192*7c478bd9Sstevel@tonic-gate } 193*7c478bd9Sstevel@tonic-gate 194*7c478bd9Sstevel@tonic-gate int 195*7c478bd9Sstevel@tonic-gate LDAP_CALL 196*7c478bd9Sstevel@tonic-gate ldap_utf8isalpha( char* s ) 197*7c478bd9Sstevel@tonic-gate { 198*7c478bd9Sstevel@tonic-gate register unsigned char c = *(unsigned char*)s; 199*7c478bd9Sstevel@tonic-gate if (0x80 & c) return 0; 200*7c478bd9Sstevel@tonic-gate if (c >= 'A' && c <= 'Z') return 1; 201*7c478bd9Sstevel@tonic-gate if (c >= 'a' && c <= 'z') return 1; 202*7c478bd9Sstevel@tonic-gate return 0; 203*7c478bd9Sstevel@tonic-gate } 204*7c478bd9Sstevel@tonic-gate 205*7c478bd9Sstevel@tonic-gate int 206*7c478bd9Sstevel@tonic-gate LDAP_CALL 207*7c478bd9Sstevel@tonic-gate ldap_utf8isdigit( char* s ) 208*7c478bd9Sstevel@tonic-gate { 209*7c478bd9Sstevel@tonic-gate register unsigned char c = *(unsigned char*)s; 210*7c478bd9Sstevel@tonic-gate if (0x80 & c) return 0; 211*7c478bd9Sstevel@tonic-gate if (c >= '0' && c <= '9') return 1; 212*7c478bd9Sstevel@tonic-gate return 0; 213*7c478bd9Sstevel@tonic-gate } 214*7c478bd9Sstevel@tonic-gate 215*7c478bd9Sstevel@tonic-gate int 216*7c478bd9Sstevel@tonic-gate LDAP_CALL 217*7c478bd9Sstevel@tonic-gate ldap_utf8isxdigit( char* s ) 218*7c478bd9Sstevel@tonic-gate { 219*7c478bd9Sstevel@tonic-gate register unsigned char c = *(unsigned char*)s; 220*7c478bd9Sstevel@tonic-gate if (0x80 & c) return 0; 221*7c478bd9Sstevel@tonic-gate if (c >= '0' && c <= '9') return 1; 222*7c478bd9Sstevel@tonic-gate if (c >= 'A' && c <= 'F') return 1; 223*7c478bd9Sstevel@tonic-gate if (c >= 'a' && c <= 'f') return 1; 224*7c478bd9Sstevel@tonic-gate return 0; 225*7c478bd9Sstevel@tonic-gate } 226*7c478bd9Sstevel@tonic-gate 227*7c478bd9Sstevel@tonic-gate int 228*7c478bd9Sstevel@tonic-gate LDAP_CALL 229*7c478bd9Sstevel@tonic-gate ldap_utf8isspace( char* s ) 230*7c478bd9Sstevel@tonic-gate { 231*7c478bd9Sstevel@tonic-gate register unsigned char *c = (unsigned char*)s; 232*7c478bd9Sstevel@tonic-gate int len = ldap_utf8len(s); 233*7c478bd9Sstevel@tonic-gate 234*7c478bd9Sstevel@tonic-gate if (len == 0) { 235*7c478bd9Sstevel@tonic-gate return 0; 236*7c478bd9Sstevel@tonic-gate } else if (len == 1) { 237*7c478bd9Sstevel@tonic-gate switch (*c) { 238*7c478bd9Sstevel@tonic-gate case 0x09: 239*7c478bd9Sstevel@tonic-gate case 0x0A: 240*7c478bd9Sstevel@tonic-gate case 0x0B: 241*7c478bd9Sstevel@tonic-gate case 0x0C: 242*7c478bd9Sstevel@tonic-gate case 0x0D: 243*7c478bd9Sstevel@tonic-gate case 0x20: 244*7c478bd9Sstevel@tonic-gate return 1; 245*7c478bd9Sstevel@tonic-gate default: 246*7c478bd9Sstevel@tonic-gate return 0; 247*7c478bd9Sstevel@tonic-gate } 248*7c478bd9Sstevel@tonic-gate } else if (len == 2) { 249*7c478bd9Sstevel@tonic-gate if (*c == 0xc2) { 250*7c478bd9Sstevel@tonic-gate return *(c+1) == 0x80; 251*7c478bd9Sstevel@tonic-gate } 252*7c478bd9Sstevel@tonic-gate } else if (len == 3) { 253*7c478bd9Sstevel@tonic-gate if (*c == 0xE2) { 254*7c478bd9Sstevel@tonic-gate c++; 255*7c478bd9Sstevel@tonic-gate if (*c == 0x80) { 256*7c478bd9Sstevel@tonic-gate c++; 257*7c478bd9Sstevel@tonic-gate return (*c>=0x80 && *c<=0x8a); 258*7c478bd9Sstevel@tonic-gate } 259*7c478bd9Sstevel@tonic-gate } else if (*c == 0xE3) { 260*7c478bd9Sstevel@tonic-gate return (*(c+1)==0x80) && (*(c+2)==0x80); 261*7c478bd9Sstevel@tonic-gate } else if (*c==0xEF) { 262*7c478bd9Sstevel@tonic-gate return (*(c+1)==0xBB) && (*(c+2)==0xBF); 263*7c478bd9Sstevel@tonic-gate } 264*7c478bd9Sstevel@tonic-gate return 0; 265*7c478bd9Sstevel@tonic-gate } 266*7c478bd9Sstevel@tonic-gate 267*7c478bd9Sstevel@tonic-gate /* should never reach here */ 268*7c478bd9Sstevel@tonic-gate return 0; 269*7c478bd9Sstevel@tonic-gate } 270