1 /* 2 * util/support/utf8.c 3 * 4 * Copyright 2008 by the Massachusetts Institute of Technology. 5 * All Rights Reserved. 6 * 7 * Export of this software from the United States of America may 8 * require a specific license from the United States Government. 9 * It is the responsibility of any person or organization contemplating 10 * export to obtain such a license before exporting. 11 * 12 * WITHIN THAT CONSTRAINT, permission to use, copy, modify, and 13 * distribute this software and its documentation for any purpose and 14 * without fee is hereby granted, provided that the above copyright 15 * notice appear in all copies and that both that copyright notice and 16 * this permission notice appear in supporting documentation, and that 17 * the name of M.I.T. not be used in advertising or publicity pertaining 18 * to distribution of the software without specific, written prior 19 * permission. Furthermore if you modify this software you must label 20 * your software as modified software and not distribute it in such a 21 * fashion that it might be confused with the original M.I.T. software. 22 * M.I.T. makes no representations about the suitability of 23 * this software for any purpose. It is provided "as is" without express 24 * or implied warranty. 25 */ 26 /* This work is part of OpenLDAP Software <http://www.openldap.org/>. 27 * 28 * Copyright 1998-2008 The OpenLDAP Foundation. 29 * All rights reserved. 30 * 31 * Redistribution and use in source and binary forms, with or without 32 * modification, are permitted only as authorized by the OpenLDAP 33 * Public License. 34 * 35 * A copy of this license is available in the file LICENSE in the 36 * top-level directory of the distribution or, alternatively, at 37 * <http://www.OpenLDAP.org/license.html>. 38 */ 39 /* Basic UTF-8 routines 40 * 41 * These routines are "dumb". Though they understand UTF-8, 42 * they don't grok Unicode. That is, they can push bits, 43 * but don't have a clue what the bits represent. That's 44 * good enough for use with the KRB5 Client SDK. 45 * 46 * These routines are not optimized. 47 */ 48 49 #include "k5-platform.h" 50 #include "k5-utf8.h" 51 #include "supp-int.h" 52 53 /* 54 * return the number of bytes required to hold the 55 * NULL-terminated UTF-8 string NOT INCLUDING the 56 * termination. 57 */ 58 size_t krb5int_utf8_bytes(const char *p) 59 { 60 size_t bytes; 61 62 for (bytes = 0; p[bytes]; bytes++) 63 ; 64 65 return bytes; 66 } 67 68 size_t krb5int_utf8_chars(const char *p) 69 { 70 /* could be optimized and could check for invalid sequences */ 71 size_t chars = 0; 72 73 for ( ; *p ; KRB5_UTF8_INCR(p)) 74 chars++; 75 76 return chars; 77 } 78 79 size_t krb5int_utf8c_chars(const char *p, size_t length) 80 { 81 /* could be optimized and could check for invalid sequences */ 82 size_t chars = 0; 83 const char *end = p + length; 84 85 for ( ; p < end; KRB5_UTF8_INCR(p)) 86 chars++; 87 88 return chars; 89 } 90 91 /* return offset to next character */ 92 int krb5int_utf8_offset(const char *p) 93 { 94 return KRB5_UTF8_NEXT(p) - p; 95 } 96 97 /* 98 * Returns length indicated by first byte. 99 */ 100 const char krb5int_utf8_lentab[] = { 101 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 102 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 103 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 104 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 105 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 106 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 107 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 108 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0 }; 109 110 int krb5int_utf8_charlen(const char *p) 111 { 112 if (!(*p & 0x80)) 113 return 1; 114 115 return krb5int_utf8_lentab[*(const unsigned char *)p ^ 0x80]; 116 } 117 118 /* 119 * Make sure the UTF-8 char used the shortest possible encoding 120 * returns charlen if valid, 0 if not. 121 * 122 * Here are the valid UTF-8 encodings, taken from RFC 2279 page 4. 123 * The table is slightly modified from that of the RFC. 124 * 125 * UCS-4 range (hex) UTF-8 sequence (binary) 126 * 0000 0000-0000 007F 0....... 127 * 0000 0080-0000 07FF 110++++. 10...... 128 * 0000 0800-0000 FFFF 1110++++ 10+..... 10...... 129 * 0001 0000-001F FFFF 11110+++ 10++.... 10...... 10...... 130 * 0020 0000-03FF FFFF 111110++ 10+++... 10...... 10...... 10...... 131 * 0400 0000-7FFF FFFF 1111110+ 10++++.. 10...... 10...... 10...... 10...... 132 * 133 * The '.' bits are "don't cares". When validating a UTF-8 sequence, 134 * at least one of the '+' bits must be set, otherwise the character 135 * should have been encoded in fewer octets. Note that in the two-octet 136 * case, only the first octet needs to be validated, and this is done 137 * in the krb5int_utf8_lentab[] above. 138 */ 139 140 /* mask of required bits in second octet */ 141 #undef c 142 #define c const char 143 c krb5int_utf8_mintab[] = { 144 (c)0x20, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, 145 (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, 146 (c)0x30, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, 147 (c)0x38, (c)0x80, (c)0x80, (c)0x80, (c)0x3c, (c)0x80, (c)0x00, (c)0x00 }; 148 #undef c 149 150 int krb5int_utf8_charlen2(const char *p) 151 { 152 int i = KRB5_UTF8_CHARLEN(p); 153 154 if (i > 2) { 155 if (!(krb5int_utf8_mintab[*p & 0x1f] & p[1])) 156 i = 0; 157 } 158 159 return i; 160 } 161 162 /* 163 * Convert a UTF8 character to a UCS4 character. Return 0 on success, 164 * -1 on failure. 165 */ 166 int krb5int_utf8_to_ucs4(const char *p, krb5_ucs4 *out) 167 { 168 const unsigned char *c = (const unsigned char *) p; 169 krb5_ucs4 ch; 170 int len, i; 171 static unsigned char mask[] = { 172 0, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 }; 173 174 *out = 0; 175 len = KRB5_UTF8_CHARLEN2(p, len); 176 177 if (len == 0) 178 return -1; 179 180 ch = c[0] & mask[len]; 181 182 for (i = 1; i < len; i++) { 183 if ((c[i] & 0xc0) != 0x80) 184 return -1; 185 186 ch <<= 6; 187 ch |= c[i] & 0x3f; 188 } 189 190 *out = ch; 191 return 0; 192 } 193 194 int krb5int_utf8_to_ucs2(const char *p, krb5_ucs2 *out) 195 { 196 krb5_ucs4 ch; 197 198 *out = 0; 199 if (krb5int_utf8_to_ucs4(p, &ch) == -1 || ch > 0xFFFF) 200 return -1; 201 *out = (krb5_ucs2) ch; 202 return 0; 203 } 204 205 /* conv UCS-2 to UTF-8, not used */ 206 size_t krb5int_ucs4_to_utf8(krb5_ucs4 c, char *buf) 207 { 208 size_t len = 0; 209 unsigned char *p = (unsigned char *) buf; 210 211 /* not a valid Unicode character */ 212 if (c < 0) 213 return 0; 214 215 /* Just return length, don't convert */ 216 if (buf == NULL) { 217 if (c < 0x80) return 1; 218 else if (c < 0x800) return 2; 219 else if (c < 0x10000) return 3; 220 else if (c < 0x200000) return 4; 221 else if (c < 0x4000000) return 5; 222 else return 6; 223 } 224 225 if (c < 0x80) { 226 p[len++] = c; 227 } else if (c < 0x800) { 228 p[len++] = 0xc0 | ( c >> 6 ); 229 p[len++] = 0x80 | ( c & 0x3f ); 230 } else if (c < 0x10000) { 231 p[len++] = 0xe0 | ( c >> 12 ); 232 p[len++] = 0x80 | ( (c >> 6) & 0x3f ); 233 p[len++] = 0x80 | ( c & 0x3f ); 234 } else if (c < 0x200000) { 235 p[len++] = 0xf0 | ( c >> 18 ); 236 p[len++] = 0x80 | ( (c >> 12) & 0x3f ); 237 p[len++] = 0x80 | ( (c >> 6) & 0x3f ); 238 p[len++] = 0x80 | ( c & 0x3f ); 239 } else if (c < 0x4000000) { 240 p[len++] = 0xf8 | ( c >> 24 ); 241 p[len++] = 0x80 | ( (c >> 18) & 0x3f ); 242 p[len++] = 0x80 | ( (c >> 12) & 0x3f ); 243 p[len++] = 0x80 | ( (c >> 6) & 0x3f ); 244 p[len++] = 0x80 | ( c & 0x3f ); 245 } else /* if( c < 0x80000000 ) */ { 246 p[len++] = 0xfc | ( c >> 30 ); 247 p[len++] = 0x80 | ( (c >> 24) & 0x3f ); 248 p[len++] = 0x80 | ( (c >> 18) & 0x3f ); 249 p[len++] = 0x80 | ( (c >> 12) & 0x3f ); 250 p[len++] = 0x80 | ( (c >> 6) & 0x3f ); 251 p[len++] = 0x80 | ( c & 0x3f ); 252 } 253 254 return len; 255 } 256 257 size_t krb5int_ucs2_to_utf8(krb5_ucs2 c, char *buf) 258 { 259 return krb5int_ucs4_to_utf8((krb5_ucs4)c, buf); 260 } 261 262 #define KRB5_UCS_UTF8LEN(c) \ 263 c < 0 ? 0 : (c < 0x80 ? 1 : (c < 0x800 ? 2 : (c < 0x10000 ? 3 : \ 264 (c < 0x200000 ? 4 : (c < 0x4000000 ? 5 : 6))))) 265 266 /* 267 * Advance to the next UTF-8 character 268 * 269 * Ignores length of multibyte character, instead rely on 270 * continuation markers to find start of next character. 271 * This allows for "resyncing" of when invalid characters 272 * are provided provided the start of the next character 273 * is appears within the 6 bytes examined. 274 */ 275 char *krb5int_utf8_next(const char *p) 276 { 277 int i; 278 const unsigned char *u = (const unsigned char *) p; 279 280 if (KRB5_UTF8_ISASCII(u)) { 281 return (char *) &p[1]; 282 } 283 284 for (i = 1; i < 6; i++) { 285 if ((u[i] & 0xc0) != 0x80) { 286 return (char *) &p[i]; 287 } 288 } 289 290 return (char *) &p[i]; 291 } 292 293 /* 294 * Advance to the previous UTF-8 character 295 * 296 * Ignores length of multibyte character, instead rely on 297 * continuation markers to find start of next character. 298 * This allows for "resyncing" of when invalid characters 299 * are provided provided the start of the next character 300 * is appears within the 6 bytes examined. 301 */ 302 char *krb5int_utf8_prev(const char *p) 303 { 304 int i; 305 const unsigned char *u = (const unsigned char *) p; 306 307 for (i = -1; i>-6 ; i--) { 308 if ((u[i] & 0xc0 ) != 0x80) { 309 return (char *) &p[i]; 310 } 311 } 312 313 return (char *) &p[i]; 314 } 315 316 /* 317 * Copy one UTF-8 character from src to dst returning 318 * number of bytes copied. 319 * 320 * Ignores length of multibyte character, instead rely on 321 * continuation markers to find start of next character. 322 * This allows for "resyncing" of when invalid characters 323 * are provided provided the start of the next character 324 * is appears within the 6 bytes examined. 325 */ 326 int krb5int_utf8_copy(char* dst, const char *src) 327 { 328 int i; 329 const unsigned char *u = (const unsigned char *) src; 330 331 dst[0] = src[0]; 332 333 if (KRB5_UTF8_ISASCII(u)) { 334 return 1; 335 } 336 337 for (i=1; i<6; i++) { 338 if ((u[i] & 0xc0) != 0x80) { 339 return i; 340 } 341 dst[i] = src[i]; 342 } 343 344 return i; 345 } 346 347 #ifndef UTF8_ALPHA_CTYPE 348 /* 349 * UTF-8 ctype routines 350 * Only deals with characters < 0x80 (ie: US-ASCII) 351 */ 352 353 int krb5int_utf8_isascii(const char * p) 354 { 355 unsigned c = * (const unsigned char *) p; 356 357 return KRB5_ASCII(c); 358 } 359 360 int krb5int_utf8_isdigit(const char * p) 361 { 362 unsigned c = * (const unsigned char *) p; 363 364 if (!KRB5_ASCII(c)) 365 return 0; 366 367 return KRB5_DIGIT( c ); 368 } 369 370 int krb5int_utf8_isxdigit(const char * p) 371 { 372 unsigned c = * (const unsigned char *) p; 373 374 if (!KRB5_ASCII(c)) 375 return 0; 376 377 return KRB5_HEX(c); 378 } 379 380 int krb5int_utf8_isspace(const char * p) 381 { 382 unsigned c = * (const unsigned char *) p; 383 384 if (!KRB5_ASCII(c)) 385 return 0; 386 387 switch(c) { 388 case ' ': 389 case '\t': 390 case '\n': 391 case '\r': 392 case '\v': 393 case '\f': 394 return 1; 395 } 396 397 return 0; 398 } 399 400 /* 401 * These are not needed by the C SDK and are 402 * not "good enough" for general use. 403 */ 404 int krb5int_utf8_isalpha(const char * p) 405 { 406 unsigned c = * (const unsigned char *) p; 407 408 if (!KRB5_ASCII(c)) 409 return 0; 410 411 return KRB5_ALPHA(c); 412 } 413 414 int krb5int_utf8_isalnum(const char * p) 415 { 416 unsigned c = * (const unsigned char *) p; 417 418 if (!KRB5_ASCII(c)) 419 return 0; 420 421 return KRB5_ALNUM(c); 422 } 423 424 #if 0 425 int krb5int_utf8_islower(const char * p) 426 { 427 unsigned c = * (const unsigned char *) p; 428 429 if (!KRB5_ASCII(c)) 430 return 0; 431 432 return KRB5_LOWER(c); 433 } 434 435 int krb5int_utf8_isupper(const char * p) 436 { 437 unsigned c = * (const unsigned char *) p; 438 439 if (!KRB5_ASCII(c)) 440 return 0; 441 442 return KRB5_UPPER(c); 443 } 444 #endif 445 #endif 446 447 448 /* 449 * UTF-8 string routines 450 */ 451 452 /* like strchr() */ 453 char *krb5int_utf8_strchr(const char *str, const char *chr) 454 { 455 krb5_ucs4 chs, ch; 456 457 if (krb5int_utf8_to_ucs4(chr, &ch) == -1) 458 return NULL; 459 for ( ; *str != '\0'; KRB5_UTF8_INCR(str)) { 460 if (krb5int_utf8_to_ucs4(str, &chs) == 0 && chs == ch) 461 return (char *)str; 462 } 463 464 return NULL; 465 } 466 467 /* like strcspn() but returns number of bytes, not characters */ 468 size_t krb5int_utf8_strcspn(const char *str, const char *set) 469 { 470 const char *cstr, *cset; 471 krb5_ucs4 chstr, chset; 472 473 for (cstr = str; *cstr != '\0'; KRB5_UTF8_INCR(cstr)) { 474 for (cset = set; *cset != '\0'; KRB5_UTF8_INCR(cset)) { 475 if (krb5int_utf8_to_ucs4(cstr, &chstr) == 0 476 && krb5int_utf8_to_ucs4(cset, &chset) == 0 && chstr == chset) 477 return cstr - str; 478 } 479 } 480 481 return cstr - str; 482 } 483 484 /* like strspn() but returns number of bytes, not characters */ 485 size_t krb5int_utf8_strspn(const char *str, const char *set) 486 { 487 const char *cstr, *cset; 488 krb5_ucs4 chstr, chset; 489 490 for (cstr = str; *cstr != '\0'; KRB5_UTF8_INCR(cstr)) { 491 for (cset = set; ; KRB5_UTF8_INCR(cset)) { 492 if (*cset == '\0') 493 return cstr - str; 494 if (krb5int_utf8_to_ucs4(cstr, &chstr) == 0 495 && krb5int_utf8_to_ucs4(cset, &chset) == 0 && chstr == chset) 496 break; 497 } 498 } 499 500 return cstr - str; 501 } 502 503 /* like strpbrk(), replaces strchr() as well */ 504 char *krb5int_utf8_strpbrk(const char *str, const char *set) 505 { 506 const char *cset; 507 krb5_ucs4 chstr, chset; 508 509 for ( ; *str != '\0'; KRB5_UTF8_INCR(str)) { 510 for (cset = set; *cset != '\0'; KRB5_UTF8_INCR(cset)) { 511 if (krb5int_utf8_to_ucs4(str, &chstr) == 0 512 && krb5int_utf8_to_ucs4(cset, &chset) == 0 && chstr == chset) 513 return (char *)str; 514 } 515 } 516 517 return NULL; 518 } 519 520 /* like strtok_r(), not strtok() */ 521 char *krb5int_utf8_strtok(char *str, const char *sep, char **last) 522 { 523 char *begin; 524 char *end; 525 526 if (last == NULL) 527 return NULL; 528 529 begin = str ? str : *last; 530 531 begin += krb5int_utf8_strspn(begin, sep); 532 533 if (*begin == '\0') { 534 *last = NULL; 535 return NULL; 536 } 537 538 end = &begin[krb5int_utf8_strcspn(begin, sep)]; 539 540 if (*end != '\0') { 541 char *next = KRB5_UTF8_NEXT(end); 542 *end = '\0'; 543 end = next; 544 } 545 546 *last = end; 547 548 return begin; 549 } 550