1 /* 2 * Copyright (C) 2008 by the Massachusetts Institute of Technology, 3 * Cambridge, MA, USA. All Rights Reserved. 4 * 5 * This software is being provided to you, the LICENSEE, by the 6 * Massachusetts Institute of Technology (M.I.T.) under the following 7 * license. By obtaining, using and/or copying this software, you agree 8 * that you have read, understood, and will comply with these terms and 9 * conditions: 10 * 11 * Export of this software from the United States of America may 12 * require a specific license from the United States Government. 13 * It is the responsibility of any person or organization contemplating 14 * export to obtain such a license before exporting. 15 * 16 * WITHIN THAT CONSTRAINT, permission to use, copy, modify and distribute 17 * this software and its documentation for any purpose and without fee or 18 * royalty is hereby granted, provided that you agree to comply with the 19 * following copyright notice and statements, including the disclaimer, and 20 * that the same appear on ALL copies of the software and documentation, 21 * including modifications that you make for internal use or for 22 * distribution: 23 * 24 * THIS SOFTWARE IS PROVIDED "AS IS", AND M.I.T. MAKES NO REPRESENTATIONS 25 * OR WARRANTIES, EXPRESS OR IMPLIED. By way of example, but not 26 * limitation, M.I.T. MAKES NO REPRESENTATIONS OR WARRANTIES OF 27 * MERCHANTABILITY OR FITNESS FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF 28 * THE LICENSED SOFTWARE OR DOCUMENTATION WILL NOT INFRINGE ANY THIRD PARTY 29 * PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS. 30 * 31 * The name of the Massachusetts Institute of Technology or M.I.T. may NOT 32 * be used in advertising or publicity pertaining to distribution of the 33 * software. Title to copyright in this software and any associated 34 * documentation shall at all times remain with M.I.T., and USER agrees to 35 * preserve same. 36 * 37 * Furthermore if you modify this software you must label 38 * your software as modified software and not distribute it in such a 39 * fashion that it might be confused with the original M.I.T. software. 40 */ 41 /* This work is part of OpenLDAP Software <http://www.openldap.org/>. 42 * 43 * Copyright 1998-2008 The OpenLDAP Foundation. 44 * All rights reserved. 45 * 46 * Redistribution and use in source and binary forms, with or without 47 * modification, are permitted only as authorized by the OpenLDAP 48 * Public License. 49 * 50 * A copy of this license is available in file LICENSE in the 51 * top-level directory of the distribution or, alternatively, at 52 * <http://www.OpenLDAP.org/license.html>. 53 */ 54 /* This notice applies to changes, created by or for Novell, Inc., 55 * to preexisting works for which notices appear elsewhere in this file. 56 * 57 * Copyright (C) 2000 Novell, Inc. All Rights Reserved. 58 * 59 * THIS WORK IS SUBJECT TO U.S. AND INTERNATIONAL COPYRIGHT LAWS AND TREATIES. 60 * USE, MODIFICATION, AND REDISTRIBUTION OF THIS WORK IS SUBJECT TO VERSION 61 * 2.0.1 OF THE OPENLDAP PUBLIC LICENSE, A COPY OF WHICH IS AVAILABLE AT 62 * HTTP://WWW.OPENLDAP.ORG/LICENSE.HTML OR IN THE FILE "LICENSE" IN THE 63 * TOP-LEVEL DIRECTORY OF THE DISTRIBUTION. ANY USE OR EXPLOITATION OF THIS 64 * WORK OTHER THAN AS AUTHORIZED IN VERSION 2.0.1 OF THE OPENLDAP PUBLIC 65 * LICENSE, OR OTHER PRIOR WRITTEN CONSENT FROM NOVELL, COULD SUBJECT THE 66 * PERPETRATOR TO CRIMINAL AND CIVIL LIABILITY. 67 */ 68 69 #ifndef K5_UTF8_H 70 #define K5_UTF8_H 71 72 #include "autoconf.h" 73 74 #ifdef HAVE_SYS_TYPES_H 75 #include <sys/types.h> 76 #endif 77 78 #ifdef HAVE_UNISTD_H 79 #include <unistd.h> 80 #endif 81 82 #ifdef HAVE_STDLIB_H 83 #include <stdlib.h> 84 #endif 85 86 #if INT_MAX == 0x7fff 87 typedef unsigned int krb5_ucs2; 88 #elif SHRT_MAX == 0x7fff 89 typedef unsigned short krb5_ucs2; 90 #else 91 #error undefined 16 bit type 92 #endif 93 94 #if INT_MAX == 0x7fffffffL 95 typedef int krb5_ucs4; 96 #elif LONG_MAX == 0x7fffffffL 97 typedef long krb5_ucs4; 98 #elif SHRT_MAX == 0x7fffffffL 99 typedef short krb5_ucs4; 100 #else 101 #error: undefined 32 bit type 102 #endif 103 104 #define KRB5_MAX_UTF8_LEN (sizeof(krb5_ucs2) * 3/2) 105 106 int krb5int_utf8_to_ucs2(const char *p, krb5_ucs2 *out); 107 size_t krb5int_ucs2_to_utf8(krb5_ucs2 c, char *buf); 108 109 int krb5int_utf8_to_ucs4(const char *p, krb5_ucs4 *out); 110 size_t krb5int_ucs4_to_utf8(krb5_ucs4 c, char *buf); 111 112 int 113 krb5int_ucs2s_to_utf8s(const krb5_ucs2 *ucs2s, 114 char **utf8s, 115 size_t *utf8slen); 116 117 int 118 krb5int_ucs2cs_to_utf8s(const krb5_ucs2 *ucs2s, 119 size_t ucs2slen, 120 char **utf8s, 121 size_t *utf8slen); 122 123 int 124 krb5int_ucs2les_to_utf8s(const unsigned char *ucs2les, 125 char **utf8s, 126 size_t *utf8slen); 127 128 int 129 krb5int_ucs2lecs_to_utf8s(const unsigned char *ucs2les, 130 size_t ucs2leslen, 131 char **utf8s, 132 size_t *utf8slen); 133 134 int 135 krb5int_utf8s_to_ucs2s(const char *utf8s, 136 krb5_ucs2 **ucs2s, 137 size_t *ucs2chars); 138 139 int 140 krb5int_utf8cs_to_ucs2s(const char *utf8s, 141 size_t utf8slen, 142 krb5_ucs2 **ucs2s, 143 size_t *ucs2chars); 144 145 int 146 krb5int_utf8s_to_ucs2les(const char *utf8s, 147 unsigned char **ucs2les, 148 size_t *ucs2leslen); 149 150 int 151 krb5int_utf8cs_to_ucs2les(const char *utf8s, 152 size_t utf8slen, 153 unsigned char **ucs2les, 154 size_t *ucs2leslen); 155 156 /* returns the number of bytes in the UTF-8 string */ 157 size_t krb5int_utf8_bytes(const char *); 158 /* returns the number of UTF-8 characters in the string */ 159 size_t krb5int_utf8_chars(const char *); 160 /* returns the number of UTF-8 characters in the counted string */ 161 size_t krb5int_utf8c_chars(const char *, size_t); 162 /* returns the length (in bytes) of the UTF-8 character */ 163 int krb5int_utf8_offset(const char *); 164 /* returns the length (in bytes) indicated by the UTF-8 character */ 165 int krb5int_utf8_charlen(const char *); 166 167 /* returns the length (in bytes) indicated by the UTF-8 character 168 * also checks that shortest possible encoding was used 169 */ 170 int krb5int_utf8_charlen2(const char *); 171 172 /* copies a UTF-8 character and returning number of bytes copied */ 173 int krb5int_utf8_copy(char *, const char *); 174 175 /* returns pointer of next UTF-8 character in string */ 176 char *krb5int_utf8_next( const char *); 177 /* returns pointer of previous UTF-8 character in string */ 178 char *krb5int_utf8_prev( const char *); 179 180 /* primitive ctype routines -- not aware of non-ascii characters */ 181 int krb5int_utf8_isascii( const char *); 182 int krb5int_utf8_isalpha( const char *); 183 int krb5int_utf8_isalnum( const char *); 184 int krb5int_utf8_isdigit( const char *); 185 int krb5int_utf8_isxdigit( const char *); 186 int krb5int_utf8_isspace( const char *); 187 188 /* span characters not in set, return bytes spanned */ 189 size_t krb5int_utf8_strcspn( const char* str, const char *set); 190 /* span characters in set, return bytes spanned */ 191 size_t krb5int_utf8_strspn( const char* str, const char *set); 192 /* return first occurance of character in string */ 193 char *krb5int_utf8_strchr( const char* str, const char *chr); 194 /* return first character of set in string */ 195 char *krb5int_utf8_strpbrk( const char* str, const char *set); 196 /* reentrant tokenizer */ 197 char *krb5int_utf8_strtok( char* sp, const char* sep, char **last); 198 199 /* Optimizations */ 200 extern const char krb5int_utf8_lentab[128]; 201 extern const char krb5int_utf8_mintab[32]; 202 203 #define KRB5_UTF8_ISASCII(p) ( !(*(const unsigned char *)(p) & 0x80 ) ) 204 #define KRB5_UTF8_CHARLEN(p) ( KRB5_UTF8_ISASCII(p) \ 205 ? 1 : krb5int_utf8_lentab[*(const unsigned char *)(p) ^ 0x80] ) 206 207 /* This is like CHARLEN but additionally validates to make sure 208 * the char used the shortest possible encoding. 209 * 'l' is used to temporarily hold the result of CHARLEN. 210 */ 211 #define KRB5_UTF8_CHARLEN2(p, l) ( ( ( l = KRB5_UTF8_CHARLEN( p )) < 3 || \ 212 ( krb5int_utf8_mintab[*(const unsigned char *)(p) & 0x1f] & (p)[1] ) ) ? \ 213 l : 0 ) 214 215 #define KRB5_UTF8_OFFSET(p) ( KRB5_UTF8_ISASCII(p) \ 216 ? 1 : krb5int_utf8_offset((p)) ) 217 218 #define KRB5_UTF8_COPY(d,s) ( KRB5_UTF8_ISASCII(s) \ 219 ? (*(d) = *(s), 1) : krb5int_utf8_copy((d),(s)) ) 220 221 #define KRB5_UTF8_NEXT(p) ( KRB5_UTF8_ISASCII(p) \ 222 ? (char *)(p)+1 : krb5int_utf8_next((p)) ) 223 224 #define KRB5_UTF8_INCR(p) ((p) = KRB5_UTF8_NEXT(p)) 225 226 /* For symmetry */ 227 #define KRB5_UTF8_PREV(p) (krb5int_utf8_prev((p))) 228 #define KRB5_UTF8_DECR(p) ((p)=KRB5_UTF8_PREV((p))) 229 230 /* 231 * these macros assume 'x' is an ASCII x 232 * and assume the "C" locale 233 */ 234 #define KRB5_ASCII(c) (!((c) & 0x80)) 235 #define KRB5_SPACE(c) ((c) == ' ' || (c) == '\t' || (c) == '\n') 236 #define KRB5_DIGIT(c) ((c) >= '0' && (c) <= '9') 237 #define KRB5_LOWER(c) ((c) >= 'a' && (c) <= 'z') 238 #define KRB5_UPPER(c) ((c) >= 'A' && (c) <= 'Z') 239 #define KRB5_ALPHA(c) (KRB5_LOWER(c) || KRB5_UPPER(c)) 240 #define KRB5_ALNUM(c) (KRB5_ALPHA(c) || KRB5_DIGIT(c)) 241 242 #define KRB5_LDH(c) (KRB5_ALNUM(c) || (c) == '-') 243 244 #define KRB5_HEXLOWER(c) ((c) >= 'a' && (c) <= 'f') 245 #define KRB5_HEXUPPER(c) ((c) >= 'A' && (c) <= 'F') 246 #define KRB5_HEX(c) (KRB5_DIGIT(c) || \ 247 KRB5_HEXLOWER(c) || KRB5_HEXUPPER(c)) 248 249 #endif /* K5_UTF8_H */ 250