1*ba7b222eSGlenn Barry /* 2*ba7b222eSGlenn Barry * util/support/utf8.c 3*ba7b222eSGlenn Barry * 4*ba7b222eSGlenn Barry * Copyright 2008 by the Massachusetts Institute of Technology. 5*ba7b222eSGlenn Barry * All Rights Reserved. 6*ba7b222eSGlenn Barry * 7*ba7b222eSGlenn Barry * Export of this software from the United States of America may 8*ba7b222eSGlenn Barry * require a specific license from the United States Government. 9*ba7b222eSGlenn Barry * It is the responsibility of any person or organization contemplating 10*ba7b222eSGlenn Barry * export to obtain such a license before exporting. 11*ba7b222eSGlenn Barry * 12*ba7b222eSGlenn Barry * WITHIN THAT CONSTRAINT, permission to use, copy, modify, and 13*ba7b222eSGlenn Barry * distribute this software and its documentation for any purpose and 14*ba7b222eSGlenn Barry * without fee is hereby granted, provided that the above copyright 15*ba7b222eSGlenn Barry * notice appear in all copies and that both that copyright notice and 16*ba7b222eSGlenn Barry * this permission notice appear in supporting documentation, and that 17*ba7b222eSGlenn Barry * the name of M.I.T. not be used in advertising or publicity pertaining 18*ba7b222eSGlenn Barry * to distribution of the software without specific, written prior 19*ba7b222eSGlenn Barry * permission. Furthermore if you modify this software you must label 20*ba7b222eSGlenn Barry * your software as modified software and not distribute it in such a 21*ba7b222eSGlenn Barry * fashion that it might be confused with the original M.I.T. software. 22*ba7b222eSGlenn Barry * M.I.T. makes no representations about the suitability of 23*ba7b222eSGlenn Barry * this software for any purpose. It is provided "as is" without express 24*ba7b222eSGlenn Barry * or implied warranty. 25*ba7b222eSGlenn Barry */ 26*ba7b222eSGlenn Barry /* This work is part of OpenLDAP Software <http://www.openldap.org/>. 27*ba7b222eSGlenn Barry * 28*ba7b222eSGlenn Barry * Copyright 1998-2008 The OpenLDAP Foundation. 29*ba7b222eSGlenn Barry * All rights reserved. 30*ba7b222eSGlenn Barry * 31*ba7b222eSGlenn Barry * Redistribution and use in source and binary forms, with or without 32*ba7b222eSGlenn Barry * modification, are permitted only as authorized by the OpenLDAP 33*ba7b222eSGlenn Barry * Public License. 34*ba7b222eSGlenn Barry * 35*ba7b222eSGlenn Barry * A copy of this license is available in the file LICENSE in the 36*ba7b222eSGlenn Barry * top-level directory of the distribution or, alternatively, at 37*ba7b222eSGlenn Barry * <http://www.OpenLDAP.org/license.html>. 38*ba7b222eSGlenn Barry */ 39*ba7b222eSGlenn Barry /* Basic UTF-8 routines 40*ba7b222eSGlenn Barry * 41*ba7b222eSGlenn Barry * These routines are "dumb". Though they understand UTF-8, 42*ba7b222eSGlenn Barry * they don't grok Unicode. That is, they can push bits, 43*ba7b222eSGlenn Barry * but don't have a clue what the bits represent. That's 44*ba7b222eSGlenn Barry * good enough for use with the KRB5 Client SDK. 45*ba7b222eSGlenn Barry * 46*ba7b222eSGlenn Barry * These routines are not optimized. 47*ba7b222eSGlenn Barry */ 48*ba7b222eSGlenn Barry 49*ba7b222eSGlenn Barry #include "k5-platform.h" 50*ba7b222eSGlenn Barry #include "k5-utf8.h" 51*ba7b222eSGlenn Barry #include "supp-int.h" 52*ba7b222eSGlenn Barry 53*ba7b222eSGlenn Barry /* 54*ba7b222eSGlenn Barry * return the number of bytes required to hold the 55*ba7b222eSGlenn Barry * NULL-terminated UTF-8 string NOT INCLUDING the 56*ba7b222eSGlenn Barry * termination. 57*ba7b222eSGlenn Barry */ 58*ba7b222eSGlenn Barry size_t krb5int_utf8_bytes(const char *p) 59*ba7b222eSGlenn Barry { 60*ba7b222eSGlenn Barry size_t bytes; 61*ba7b222eSGlenn Barry 62*ba7b222eSGlenn Barry for (bytes = 0; p[bytes]; bytes++) 63*ba7b222eSGlenn Barry ; 64*ba7b222eSGlenn Barry 65*ba7b222eSGlenn Barry return bytes; 66*ba7b222eSGlenn Barry } 67*ba7b222eSGlenn Barry 68*ba7b222eSGlenn Barry size_t krb5int_utf8_chars(const char *p) 69*ba7b222eSGlenn Barry { 70*ba7b222eSGlenn Barry /* could be optimized and could check for invalid sequences */ 71*ba7b222eSGlenn Barry size_t chars = 0; 72*ba7b222eSGlenn Barry 73*ba7b222eSGlenn Barry for ( ; *p ; KRB5_UTF8_INCR(p)) 74*ba7b222eSGlenn Barry chars++; 75*ba7b222eSGlenn Barry 76*ba7b222eSGlenn Barry return chars; 77*ba7b222eSGlenn Barry } 78*ba7b222eSGlenn Barry 79*ba7b222eSGlenn Barry size_t krb5int_utf8c_chars(const char *p, size_t length) 80*ba7b222eSGlenn Barry { 81*ba7b222eSGlenn Barry /* could be optimized and could check for invalid sequences */ 82*ba7b222eSGlenn Barry size_t chars = 0; 83*ba7b222eSGlenn Barry const char *end = p + length; 84*ba7b222eSGlenn Barry 85*ba7b222eSGlenn Barry for ( ; p < end; KRB5_UTF8_INCR(p)) 86*ba7b222eSGlenn Barry chars++; 87*ba7b222eSGlenn Barry 88*ba7b222eSGlenn Barry return chars; 89*ba7b222eSGlenn Barry } 90*ba7b222eSGlenn Barry 91*ba7b222eSGlenn Barry /* return offset to next character */ 92*ba7b222eSGlenn Barry int krb5int_utf8_offset(const char *p) 93*ba7b222eSGlenn Barry { 94*ba7b222eSGlenn Barry return KRB5_UTF8_NEXT(p) - p; 95*ba7b222eSGlenn Barry } 96*ba7b222eSGlenn Barry 97*ba7b222eSGlenn Barry /* 98*ba7b222eSGlenn Barry * Returns length indicated by first byte. 99*ba7b222eSGlenn Barry */ 100*ba7b222eSGlenn Barry const char krb5int_utf8_lentab[] = { 101*ba7b222eSGlenn Barry 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 102*ba7b222eSGlenn Barry 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 103*ba7b222eSGlenn Barry 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 104*ba7b222eSGlenn Barry 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 105*ba7b222eSGlenn Barry 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 106*ba7b222eSGlenn Barry 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 107*ba7b222eSGlenn Barry 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 108*ba7b222eSGlenn Barry 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0 }; 109*ba7b222eSGlenn Barry 110*ba7b222eSGlenn Barry int krb5int_utf8_charlen(const char *p) 111*ba7b222eSGlenn Barry { 112*ba7b222eSGlenn Barry if (!(*p & 0x80)) 113*ba7b222eSGlenn Barry return 1; 114*ba7b222eSGlenn Barry 115*ba7b222eSGlenn Barry return krb5int_utf8_lentab[*(const unsigned char *)p ^ 0x80]; 116*ba7b222eSGlenn Barry } 117*ba7b222eSGlenn Barry 118*ba7b222eSGlenn Barry /* 119*ba7b222eSGlenn Barry * Make sure the UTF-8 char used the shortest possible encoding 120*ba7b222eSGlenn Barry * returns charlen if valid, 0 if not. 121*ba7b222eSGlenn Barry * 122*ba7b222eSGlenn Barry * Here are the valid UTF-8 encodings, taken from RFC 2279 page 4. 123*ba7b222eSGlenn Barry * The table is slightly modified from that of the RFC. 124*ba7b222eSGlenn Barry * 125*ba7b222eSGlenn Barry * UCS-4 range (hex) UTF-8 sequence (binary) 126*ba7b222eSGlenn Barry * 0000 0000-0000 007F 0....... 127*ba7b222eSGlenn Barry * 0000 0080-0000 07FF 110++++. 10...... 128*ba7b222eSGlenn Barry * 0000 0800-0000 FFFF 1110++++ 10+..... 10...... 129*ba7b222eSGlenn Barry * 0001 0000-001F FFFF 11110+++ 10++.... 10...... 10...... 130*ba7b222eSGlenn Barry * 0020 0000-03FF FFFF 111110++ 10+++... 10...... 10...... 10...... 131*ba7b222eSGlenn Barry * 0400 0000-7FFF FFFF 1111110+ 10++++.. 10...... 10...... 10...... 10...... 132*ba7b222eSGlenn Barry * 133*ba7b222eSGlenn Barry * The '.' bits are "don't cares". When validating a UTF-8 sequence, 134*ba7b222eSGlenn Barry * at least one of the '+' bits must be set, otherwise the character 135*ba7b222eSGlenn Barry * should have been encoded in fewer octets. Note that in the two-octet 136*ba7b222eSGlenn Barry * case, only the first octet needs to be validated, and this is done 137*ba7b222eSGlenn Barry * in the krb5int_utf8_lentab[] above. 138*ba7b222eSGlenn Barry */ 139*ba7b222eSGlenn Barry 140*ba7b222eSGlenn Barry /* mask of required bits in second octet */ 141*ba7b222eSGlenn Barry #undef c 142*ba7b222eSGlenn Barry #define c const char 143*ba7b222eSGlenn Barry c krb5int_utf8_mintab[] = { 144*ba7b222eSGlenn Barry (c)0x20, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, 145*ba7b222eSGlenn Barry (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, 146*ba7b222eSGlenn Barry (c)0x30, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, 147*ba7b222eSGlenn Barry (c)0x38, (c)0x80, (c)0x80, (c)0x80, (c)0x3c, (c)0x80, (c)0x00, (c)0x00 }; 148*ba7b222eSGlenn Barry #undef c 149*ba7b222eSGlenn Barry 150*ba7b222eSGlenn Barry int krb5int_utf8_charlen2(const char *p) 151*ba7b222eSGlenn Barry { 152*ba7b222eSGlenn Barry int i = KRB5_UTF8_CHARLEN(p); 153*ba7b222eSGlenn Barry 154*ba7b222eSGlenn Barry if (i > 2) { 155*ba7b222eSGlenn Barry if (!(krb5int_utf8_mintab[*p & 0x1f] & p[1])) 156*ba7b222eSGlenn Barry i = 0; 157*ba7b222eSGlenn Barry } 158*ba7b222eSGlenn Barry 159*ba7b222eSGlenn Barry return i; 160*ba7b222eSGlenn Barry } 161*ba7b222eSGlenn Barry 162*ba7b222eSGlenn Barry /* 163*ba7b222eSGlenn Barry * Convert a UTF8 character to a UCS4 character. Return 0 on success, 164*ba7b222eSGlenn Barry * -1 on failure. 165*ba7b222eSGlenn Barry */ 166*ba7b222eSGlenn Barry int krb5int_utf8_to_ucs4(const char *p, krb5_ucs4 *out) 167*ba7b222eSGlenn Barry { 168*ba7b222eSGlenn Barry const unsigned char *c = (const unsigned char *) p; 169*ba7b222eSGlenn Barry krb5_ucs4 ch; 170*ba7b222eSGlenn Barry int len, i; 171*ba7b222eSGlenn Barry static unsigned char mask[] = { 172*ba7b222eSGlenn Barry 0, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 }; 173*ba7b222eSGlenn Barry 174*ba7b222eSGlenn Barry *out = 0; 175*ba7b222eSGlenn Barry len = KRB5_UTF8_CHARLEN2(p, len); 176*ba7b222eSGlenn Barry 177*ba7b222eSGlenn Barry if (len == 0) 178*ba7b222eSGlenn Barry return -1; 179*ba7b222eSGlenn Barry 180*ba7b222eSGlenn Barry ch = c[0] & mask[len]; 181*ba7b222eSGlenn Barry 182*ba7b222eSGlenn Barry for (i = 1; i < len; i++) { 183*ba7b222eSGlenn Barry if ((c[i] & 0xc0) != 0x80) 184*ba7b222eSGlenn Barry return -1; 185*ba7b222eSGlenn Barry 186*ba7b222eSGlenn Barry ch <<= 6; 187*ba7b222eSGlenn Barry ch |= c[i] & 0x3f; 188*ba7b222eSGlenn Barry } 189*ba7b222eSGlenn Barry 190*ba7b222eSGlenn Barry *out = ch; 191*ba7b222eSGlenn Barry return 0; 192*ba7b222eSGlenn Barry } 193*ba7b222eSGlenn Barry 194*ba7b222eSGlenn Barry int krb5int_utf8_to_ucs2(const char *p, krb5_ucs2 *out) 195*ba7b222eSGlenn Barry { 196*ba7b222eSGlenn Barry krb5_ucs4 ch; 197*ba7b222eSGlenn Barry 198*ba7b222eSGlenn Barry *out = 0; 199*ba7b222eSGlenn Barry if (krb5int_utf8_to_ucs4(p, &ch) == -1 || ch > 0xFFFF) 200*ba7b222eSGlenn Barry return -1; 201*ba7b222eSGlenn Barry *out = (krb5_ucs2) ch; 202*ba7b222eSGlenn Barry return 0; 203*ba7b222eSGlenn Barry } 204*ba7b222eSGlenn Barry 205*ba7b222eSGlenn Barry /* conv UCS-2 to UTF-8, not used */ 206*ba7b222eSGlenn Barry size_t krb5int_ucs4_to_utf8(krb5_ucs4 c, char *buf) 207*ba7b222eSGlenn Barry { 208*ba7b222eSGlenn Barry size_t len = 0; 209*ba7b222eSGlenn Barry unsigned char *p = (unsigned char *) buf; 210*ba7b222eSGlenn Barry 211*ba7b222eSGlenn Barry /* not a valid Unicode character */ 212*ba7b222eSGlenn Barry if (c < 0) 213*ba7b222eSGlenn Barry return 0; 214*ba7b222eSGlenn Barry 215*ba7b222eSGlenn Barry /* Just return length, don't convert */ 216*ba7b222eSGlenn Barry if (buf == NULL) { 217*ba7b222eSGlenn Barry if (c < 0x80) return 1; 218*ba7b222eSGlenn Barry else if (c < 0x800) return 2; 219*ba7b222eSGlenn Barry else if (c < 0x10000) return 3; 220*ba7b222eSGlenn Barry else if (c < 0x200000) return 4; 221*ba7b222eSGlenn Barry else if (c < 0x4000000) return 5; 222*ba7b222eSGlenn Barry else return 6; 223*ba7b222eSGlenn Barry } 224*ba7b222eSGlenn Barry 225*ba7b222eSGlenn Barry if (c < 0x80) { 226*ba7b222eSGlenn Barry p[len++] = c; 227*ba7b222eSGlenn Barry } else if (c < 0x800) { 228*ba7b222eSGlenn Barry p[len++] = 0xc0 | ( c >> 6 ); 229*ba7b222eSGlenn Barry p[len++] = 0x80 | ( c & 0x3f ); 230*ba7b222eSGlenn Barry } else if (c < 0x10000) { 231*ba7b222eSGlenn Barry p[len++] = 0xe0 | ( c >> 12 ); 232*ba7b222eSGlenn Barry p[len++] = 0x80 | ( (c >> 6) & 0x3f ); 233*ba7b222eSGlenn Barry p[len++] = 0x80 | ( c & 0x3f ); 234*ba7b222eSGlenn Barry } else if (c < 0x200000) { 235*ba7b222eSGlenn Barry p[len++] = 0xf0 | ( c >> 18 ); 236*ba7b222eSGlenn Barry p[len++] = 0x80 | ( (c >> 12) & 0x3f ); 237*ba7b222eSGlenn Barry p[len++] = 0x80 | ( (c >> 6) & 0x3f ); 238*ba7b222eSGlenn Barry p[len++] = 0x80 | ( c & 0x3f ); 239*ba7b222eSGlenn Barry } else if (c < 0x4000000) { 240*ba7b222eSGlenn Barry p[len++] = 0xf8 | ( c >> 24 ); 241*ba7b222eSGlenn Barry p[len++] = 0x80 | ( (c >> 18) & 0x3f ); 242*ba7b222eSGlenn Barry p[len++] = 0x80 | ( (c >> 12) & 0x3f ); 243*ba7b222eSGlenn Barry p[len++] = 0x80 | ( (c >> 6) & 0x3f ); 244*ba7b222eSGlenn Barry p[len++] = 0x80 | ( c & 0x3f ); 245*ba7b222eSGlenn Barry } else /* if( c < 0x80000000 ) */ { 246*ba7b222eSGlenn Barry p[len++] = 0xfc | ( c >> 30 ); 247*ba7b222eSGlenn Barry p[len++] = 0x80 | ( (c >> 24) & 0x3f ); 248*ba7b222eSGlenn Barry p[len++] = 0x80 | ( (c >> 18) & 0x3f ); 249*ba7b222eSGlenn Barry p[len++] = 0x80 | ( (c >> 12) & 0x3f ); 250*ba7b222eSGlenn Barry p[len++] = 0x80 | ( (c >> 6) & 0x3f ); 251*ba7b222eSGlenn Barry p[len++] = 0x80 | ( c & 0x3f ); 252*ba7b222eSGlenn Barry } 253*ba7b222eSGlenn Barry 254*ba7b222eSGlenn Barry return len; 255*ba7b222eSGlenn Barry } 256*ba7b222eSGlenn Barry 257*ba7b222eSGlenn Barry size_t krb5int_ucs2_to_utf8(krb5_ucs2 c, char *buf) 258*ba7b222eSGlenn Barry { 259*ba7b222eSGlenn Barry return krb5int_ucs4_to_utf8((krb5_ucs4)c, buf); 260*ba7b222eSGlenn Barry } 261*ba7b222eSGlenn Barry 262*ba7b222eSGlenn Barry #define KRB5_UCS_UTF8LEN(c) \ 263*ba7b222eSGlenn Barry c < 0 ? 0 : (c < 0x80 ? 1 : (c < 0x800 ? 2 : (c < 0x10000 ? 3 : \ 264*ba7b222eSGlenn Barry (c < 0x200000 ? 4 : (c < 0x4000000 ? 5 : 6))))) 265*ba7b222eSGlenn Barry 266*ba7b222eSGlenn Barry /* 267*ba7b222eSGlenn Barry * Advance to the next UTF-8 character 268*ba7b222eSGlenn Barry * 269*ba7b222eSGlenn Barry * Ignores length of multibyte character, instead rely on 270*ba7b222eSGlenn Barry * continuation markers to find start of next character. 271*ba7b222eSGlenn Barry * This allows for "resyncing" of when invalid characters 272*ba7b222eSGlenn Barry * are provided provided the start of the next character 273*ba7b222eSGlenn Barry * is appears within the 6 bytes examined. 274*ba7b222eSGlenn Barry */ 275*ba7b222eSGlenn Barry char *krb5int_utf8_next(const char *p) 276*ba7b222eSGlenn Barry { 277*ba7b222eSGlenn Barry int i; 278*ba7b222eSGlenn Barry const unsigned char *u = (const unsigned char *) p; 279*ba7b222eSGlenn Barry 280*ba7b222eSGlenn Barry if (KRB5_UTF8_ISASCII(u)) { 281*ba7b222eSGlenn Barry return (char *) &p[1]; 282*ba7b222eSGlenn Barry } 283*ba7b222eSGlenn Barry 284*ba7b222eSGlenn Barry for (i = 1; i < 6; i++) { 285*ba7b222eSGlenn Barry if ((u[i] & 0xc0) != 0x80) { 286*ba7b222eSGlenn Barry return (char *) &p[i]; 287*ba7b222eSGlenn Barry } 288*ba7b222eSGlenn Barry } 289*ba7b222eSGlenn Barry 290*ba7b222eSGlenn Barry return (char *) &p[i]; 291*ba7b222eSGlenn Barry } 292*ba7b222eSGlenn Barry 293*ba7b222eSGlenn Barry /* 294*ba7b222eSGlenn Barry * Advance to the previous UTF-8 character 295*ba7b222eSGlenn Barry * 296*ba7b222eSGlenn Barry * Ignores length of multibyte character, instead rely on 297*ba7b222eSGlenn Barry * continuation markers to find start of next character. 298*ba7b222eSGlenn Barry * This allows for "resyncing" of when invalid characters 299*ba7b222eSGlenn Barry * are provided provided the start of the next character 300*ba7b222eSGlenn Barry * is appears within the 6 bytes examined. 301*ba7b222eSGlenn Barry */ 302*ba7b222eSGlenn Barry char *krb5int_utf8_prev(const char *p) 303*ba7b222eSGlenn Barry { 304*ba7b222eSGlenn Barry int i; 305*ba7b222eSGlenn Barry const unsigned char *u = (const unsigned char *) p; 306*ba7b222eSGlenn Barry 307*ba7b222eSGlenn Barry for (i = -1; i>-6 ; i--) { 308*ba7b222eSGlenn Barry if ((u[i] & 0xc0 ) != 0x80) { 309*ba7b222eSGlenn Barry return (char *) &p[i]; 310*ba7b222eSGlenn Barry } 311*ba7b222eSGlenn Barry } 312*ba7b222eSGlenn Barry 313*ba7b222eSGlenn Barry return (char *) &p[i]; 314*ba7b222eSGlenn Barry } 315*ba7b222eSGlenn Barry 316*ba7b222eSGlenn Barry /* 317*ba7b222eSGlenn Barry * Copy one UTF-8 character from src to dst returning 318*ba7b222eSGlenn Barry * number of bytes copied. 319*ba7b222eSGlenn Barry * 320*ba7b222eSGlenn Barry * Ignores length of multibyte character, instead rely on 321*ba7b222eSGlenn Barry * continuation markers to find start of next character. 322*ba7b222eSGlenn Barry * This allows for "resyncing" of when invalid characters 323*ba7b222eSGlenn Barry * are provided provided the start of the next character 324*ba7b222eSGlenn Barry * is appears within the 6 bytes examined. 325*ba7b222eSGlenn Barry */ 326*ba7b222eSGlenn Barry int krb5int_utf8_copy(char* dst, const char *src) 327*ba7b222eSGlenn Barry { 328*ba7b222eSGlenn Barry int i; 329*ba7b222eSGlenn Barry const unsigned char *u = (const unsigned char *) src; 330*ba7b222eSGlenn Barry 331*ba7b222eSGlenn Barry dst[0] = src[0]; 332*ba7b222eSGlenn Barry 333*ba7b222eSGlenn Barry if (KRB5_UTF8_ISASCII(u)) { 334*ba7b222eSGlenn Barry return 1; 335*ba7b222eSGlenn Barry } 336*ba7b222eSGlenn Barry 337*ba7b222eSGlenn Barry for (i=1; i<6; i++) { 338*ba7b222eSGlenn Barry if ((u[i] & 0xc0) != 0x80) { 339*ba7b222eSGlenn Barry return i; 340*ba7b222eSGlenn Barry } 341*ba7b222eSGlenn Barry dst[i] = src[i]; 342*ba7b222eSGlenn Barry } 343*ba7b222eSGlenn Barry 344*ba7b222eSGlenn Barry return i; 345*ba7b222eSGlenn Barry } 346*ba7b222eSGlenn Barry 347*ba7b222eSGlenn Barry #ifndef UTF8_ALPHA_CTYPE 348*ba7b222eSGlenn Barry /* 349*ba7b222eSGlenn Barry * UTF-8 ctype routines 350*ba7b222eSGlenn Barry * Only deals with characters < 0x80 (ie: US-ASCII) 351*ba7b222eSGlenn Barry */ 352*ba7b222eSGlenn Barry 353*ba7b222eSGlenn Barry int krb5int_utf8_isascii(const char * p) 354*ba7b222eSGlenn Barry { 355*ba7b222eSGlenn Barry unsigned c = * (const unsigned char *) p; 356*ba7b222eSGlenn Barry 357*ba7b222eSGlenn Barry return KRB5_ASCII(c); 358*ba7b222eSGlenn Barry } 359*ba7b222eSGlenn Barry 360*ba7b222eSGlenn Barry int krb5int_utf8_isdigit(const char * p) 361*ba7b222eSGlenn Barry { 362*ba7b222eSGlenn Barry unsigned c = * (const unsigned char *) p; 363*ba7b222eSGlenn Barry 364*ba7b222eSGlenn Barry if (!KRB5_ASCII(c)) 365*ba7b222eSGlenn Barry return 0; 366*ba7b222eSGlenn Barry 367*ba7b222eSGlenn Barry return KRB5_DIGIT( c ); 368*ba7b222eSGlenn Barry } 369*ba7b222eSGlenn Barry 370*ba7b222eSGlenn Barry int krb5int_utf8_isxdigit(const char * p) 371*ba7b222eSGlenn Barry { 372*ba7b222eSGlenn Barry unsigned c = * (const unsigned char *) p; 373*ba7b222eSGlenn Barry 374*ba7b222eSGlenn Barry if (!KRB5_ASCII(c)) 375*ba7b222eSGlenn Barry return 0; 376*ba7b222eSGlenn Barry 377*ba7b222eSGlenn Barry return KRB5_HEX(c); 378*ba7b222eSGlenn Barry } 379*ba7b222eSGlenn Barry 380*ba7b222eSGlenn Barry int krb5int_utf8_isspace(const char * p) 381*ba7b222eSGlenn Barry { 382*ba7b222eSGlenn Barry unsigned c = * (const unsigned char *) p; 383*ba7b222eSGlenn Barry 384*ba7b222eSGlenn Barry if (!KRB5_ASCII(c)) 385*ba7b222eSGlenn Barry return 0; 386*ba7b222eSGlenn Barry 387*ba7b222eSGlenn Barry switch(c) { 388*ba7b222eSGlenn Barry case ' ': 389*ba7b222eSGlenn Barry case '\t': 390*ba7b222eSGlenn Barry case '\n': 391*ba7b222eSGlenn Barry case '\r': 392*ba7b222eSGlenn Barry case '\v': 393*ba7b222eSGlenn Barry case '\f': 394*ba7b222eSGlenn Barry return 1; 395*ba7b222eSGlenn Barry } 396*ba7b222eSGlenn Barry 397*ba7b222eSGlenn Barry return 0; 398*ba7b222eSGlenn Barry } 399*ba7b222eSGlenn Barry 400*ba7b222eSGlenn Barry /* 401*ba7b222eSGlenn Barry * These are not needed by the C SDK and are 402*ba7b222eSGlenn Barry * not "good enough" for general use. 403*ba7b222eSGlenn Barry */ 404*ba7b222eSGlenn Barry int krb5int_utf8_isalpha(const char * p) 405*ba7b222eSGlenn Barry { 406*ba7b222eSGlenn Barry unsigned c = * (const unsigned char *) p; 407*ba7b222eSGlenn Barry 408*ba7b222eSGlenn Barry if (!KRB5_ASCII(c)) 409*ba7b222eSGlenn Barry return 0; 410*ba7b222eSGlenn Barry 411*ba7b222eSGlenn Barry return KRB5_ALPHA(c); 412*ba7b222eSGlenn Barry } 413*ba7b222eSGlenn Barry 414*ba7b222eSGlenn Barry int krb5int_utf8_isalnum(const char * p) 415*ba7b222eSGlenn Barry { 416*ba7b222eSGlenn Barry unsigned c = * (const unsigned char *) p; 417*ba7b222eSGlenn Barry 418*ba7b222eSGlenn Barry if (!KRB5_ASCII(c)) 419*ba7b222eSGlenn Barry return 0; 420*ba7b222eSGlenn Barry 421*ba7b222eSGlenn Barry return KRB5_ALNUM(c); 422*ba7b222eSGlenn Barry } 423*ba7b222eSGlenn Barry 424*ba7b222eSGlenn Barry #if 0 425*ba7b222eSGlenn Barry int krb5int_utf8_islower(const char * p) 426*ba7b222eSGlenn Barry { 427*ba7b222eSGlenn Barry unsigned c = * (const unsigned char *) p; 428*ba7b222eSGlenn Barry 429*ba7b222eSGlenn Barry if (!KRB5_ASCII(c)) 430*ba7b222eSGlenn Barry return 0; 431*ba7b222eSGlenn Barry 432*ba7b222eSGlenn Barry return KRB5_LOWER(c); 433*ba7b222eSGlenn Barry } 434*ba7b222eSGlenn Barry 435*ba7b222eSGlenn Barry int krb5int_utf8_isupper(const char * p) 436*ba7b222eSGlenn Barry { 437*ba7b222eSGlenn Barry unsigned c = * (const unsigned char *) p; 438*ba7b222eSGlenn Barry 439*ba7b222eSGlenn Barry if (!KRB5_ASCII(c)) 440*ba7b222eSGlenn Barry return 0; 441*ba7b222eSGlenn Barry 442*ba7b222eSGlenn Barry return KRB5_UPPER(c); 443*ba7b222eSGlenn Barry } 444*ba7b222eSGlenn Barry #endif 445*ba7b222eSGlenn Barry #endif 446*ba7b222eSGlenn Barry 447*ba7b222eSGlenn Barry 448*ba7b222eSGlenn Barry /* 449*ba7b222eSGlenn Barry * UTF-8 string routines 450*ba7b222eSGlenn Barry */ 451*ba7b222eSGlenn Barry 452*ba7b222eSGlenn Barry /* like strchr() */ 453*ba7b222eSGlenn Barry char *krb5int_utf8_strchr(const char *str, const char *chr) 454*ba7b222eSGlenn Barry { 455*ba7b222eSGlenn Barry krb5_ucs4 chs, ch; 456*ba7b222eSGlenn Barry 457*ba7b222eSGlenn Barry if (krb5int_utf8_to_ucs4(chr, &ch) == -1) 458*ba7b222eSGlenn Barry return NULL; 459*ba7b222eSGlenn Barry for ( ; *str != '\0'; KRB5_UTF8_INCR(str)) { 460*ba7b222eSGlenn Barry if (krb5int_utf8_to_ucs4(str, &chs) == 0 && chs == ch) 461*ba7b222eSGlenn Barry return (char *)str; 462*ba7b222eSGlenn Barry } 463*ba7b222eSGlenn Barry 464*ba7b222eSGlenn Barry return NULL; 465*ba7b222eSGlenn Barry } 466*ba7b222eSGlenn Barry 467*ba7b222eSGlenn Barry /* like strcspn() but returns number of bytes, not characters */ 468*ba7b222eSGlenn Barry size_t krb5int_utf8_strcspn(const char *str, const char *set) 469*ba7b222eSGlenn Barry { 470*ba7b222eSGlenn Barry const char *cstr, *cset; 471*ba7b222eSGlenn Barry krb5_ucs4 chstr, chset; 472*ba7b222eSGlenn Barry 473*ba7b222eSGlenn Barry for (cstr = str; *cstr != '\0'; KRB5_UTF8_INCR(cstr)) { 474*ba7b222eSGlenn Barry for (cset = set; *cset != '\0'; KRB5_UTF8_INCR(cset)) { 475*ba7b222eSGlenn Barry if (krb5int_utf8_to_ucs4(cstr, &chstr) == 0 476*ba7b222eSGlenn Barry && krb5int_utf8_to_ucs4(cset, &chset) == 0 && chstr == chset) 477*ba7b222eSGlenn Barry return cstr - str; 478*ba7b222eSGlenn Barry } 479*ba7b222eSGlenn Barry } 480*ba7b222eSGlenn Barry 481*ba7b222eSGlenn Barry return cstr - str; 482*ba7b222eSGlenn Barry } 483*ba7b222eSGlenn Barry 484*ba7b222eSGlenn Barry /* like strspn() but returns number of bytes, not characters */ 485*ba7b222eSGlenn Barry size_t krb5int_utf8_strspn(const char *str, const char *set) 486*ba7b222eSGlenn Barry { 487*ba7b222eSGlenn Barry const char *cstr, *cset; 488*ba7b222eSGlenn Barry krb5_ucs4 chstr, chset; 489*ba7b222eSGlenn Barry 490*ba7b222eSGlenn Barry for (cstr = str; *cstr != '\0'; KRB5_UTF8_INCR(cstr)) { 491*ba7b222eSGlenn Barry for (cset = set; ; KRB5_UTF8_INCR(cset)) { 492*ba7b222eSGlenn Barry if (*cset == '\0') 493*ba7b222eSGlenn Barry return cstr - str; 494*ba7b222eSGlenn Barry if (krb5int_utf8_to_ucs4(cstr, &chstr) == 0 495*ba7b222eSGlenn Barry && krb5int_utf8_to_ucs4(cset, &chset) == 0 && chstr == chset) 496*ba7b222eSGlenn Barry break; 497*ba7b222eSGlenn Barry } 498*ba7b222eSGlenn Barry } 499*ba7b222eSGlenn Barry 500*ba7b222eSGlenn Barry return cstr - str; 501*ba7b222eSGlenn Barry } 502*ba7b222eSGlenn Barry 503*ba7b222eSGlenn Barry /* like strpbrk(), replaces strchr() as well */ 504*ba7b222eSGlenn Barry char *krb5int_utf8_strpbrk(const char *str, const char *set) 505*ba7b222eSGlenn Barry { 506*ba7b222eSGlenn Barry const char *cset; 507*ba7b222eSGlenn Barry krb5_ucs4 chstr, chset; 508*ba7b222eSGlenn Barry 509*ba7b222eSGlenn Barry for ( ; *str != '\0'; KRB5_UTF8_INCR(str)) { 510*ba7b222eSGlenn Barry for (cset = set; *cset != '\0'; KRB5_UTF8_INCR(cset)) { 511*ba7b222eSGlenn Barry if (krb5int_utf8_to_ucs4(str, &chstr) == 0 512*ba7b222eSGlenn Barry && krb5int_utf8_to_ucs4(cset, &chset) == 0 && chstr == chset) 513*ba7b222eSGlenn Barry return (char *)str; 514*ba7b222eSGlenn Barry } 515*ba7b222eSGlenn Barry } 516*ba7b222eSGlenn Barry 517*ba7b222eSGlenn Barry return NULL; 518*ba7b222eSGlenn Barry } 519*ba7b222eSGlenn Barry 520*ba7b222eSGlenn Barry /* like strtok_r(), not strtok() */ 521*ba7b222eSGlenn Barry char *krb5int_utf8_strtok(char *str, const char *sep, char **last) 522*ba7b222eSGlenn Barry { 523*ba7b222eSGlenn Barry char *begin; 524*ba7b222eSGlenn Barry char *end; 525*ba7b222eSGlenn Barry 526*ba7b222eSGlenn Barry if (last == NULL) 527*ba7b222eSGlenn Barry return NULL; 528*ba7b222eSGlenn Barry 529*ba7b222eSGlenn Barry begin = str ? str : *last; 530*ba7b222eSGlenn Barry 531*ba7b222eSGlenn Barry begin += krb5int_utf8_strspn(begin, sep); 532*ba7b222eSGlenn Barry 533*ba7b222eSGlenn Barry if (*begin == '\0') { 534*ba7b222eSGlenn Barry *last = NULL; 535*ba7b222eSGlenn Barry return NULL; 536*ba7b222eSGlenn Barry } 537*ba7b222eSGlenn Barry 538*ba7b222eSGlenn Barry end = &begin[krb5int_utf8_strcspn(begin, sep)]; 539*ba7b222eSGlenn Barry 540*ba7b222eSGlenn Barry if (*end != '\0') { 541*ba7b222eSGlenn Barry char *next = KRB5_UTF8_NEXT(end); 542*ba7b222eSGlenn Barry *end = '\0'; 543*ba7b222eSGlenn Barry end = next; 544*ba7b222eSGlenn Barry } 545*ba7b222eSGlenn Barry 546*ba7b222eSGlenn Barry *last = end; 547*ba7b222eSGlenn Barry 548*ba7b222eSGlenn Barry return begin; 549*ba7b222eSGlenn Barry } 550