1 /* 2 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 3 * Use is subject to license terms. 4 */ 5 6 /* 7 * util/support/utf8_conv.c 8 * 9 * Copyright 2008 by the Massachusetts Institute of Technology. 10 * All Rights Reserved. 11 * 12 * Export of this software from the United States of America may 13 * require a specific license from the United States Government. 14 * It is the responsibility of any person or organization contemplating 15 * export to obtain such a license before exporting. 16 * 17 * WITHIN THAT CONSTRAINT, permission to use, copy, modify, and 18 * distribute this software and its documentation for any purpose and 19 * without fee is hereby granted, provided that the above copyright 20 * notice appear in all copies and that both that copyright notice and 21 * this permission notice appear in supporting documentation, and that 22 * the name of M.I.T. not be used in advertising or publicity pertaining 23 * to distribution of the software without specific, written prior 24 * permission. Furthermore if you modify this software you must label 25 * your software as modified software and not distribute it in such a 26 * fashion that it might be confused with the original M.I.T. software. 27 * M.I.T. makes no representations about the suitability of 28 * this software for any purpose. It is provided "as is" without express 29 * or implied warranty. 30 */ 31 /* This work is part of OpenLDAP Software <http://www.openldap.org/>. 32 * 33 * Copyright 1998-2008 The OpenLDAP Foundation. 34 * All rights reserved. 35 * 36 * Redistribution and use in source and binary forms, with or without 37 * modification, are permitted only as authorized by the OpenLDAP 38 * Public License. 39 * 40 * A copy of this license is available in the file LICENSE in the 41 * top-level directory of the distribution or, alternatively, at 42 * <http://www.OpenLDAP.org/license.html>. 43 */ 44 /* Portions Copyright (C) 1999, 2000 Novell, Inc. All Rights Reserved. 45 * 46 * THIS WORK IS SUBJECT TO U.S. AND INTERNATIONAL COPYRIGHT LAWS AND 47 * TREATIES. USE, MODIFICATION, AND REDISTRIBUTION OF THIS WORK IS SUBJECT 48 * TO VERSION 2.0.1 OF THE OPENLDAP PUBLIC LICENSE, A COPY OF WHICH IS 49 * AVAILABLE AT HTTP://WWW.OPENLDAP.ORG/LICENSE.HTML OR IN THE FILE "LICENSE" 50 * IN THE TOP-LEVEL DIRECTORY OF THE DISTRIBUTION. ANY USE OR EXPLOITATION 51 * OF THIS WORK OTHER THAN AS AUTHORIZED IN VERSION 2.0.1 OF THE OPENLDAP 52 * PUBLIC LICENSE, OR OTHER PRIOR WRITTEN CONSENT FROM NOVELL, COULD SUBJECT 53 * THE PERPETRATOR TO CRIMINAL AND CIVIL LIABILITY. 54 */ 55 56 /* 57 * UTF-8 Conversion Routines 58 * 59 * These routines convert between Wide Character and UTF-8, 60 * or between MultiByte and UTF-8 encodings. 61 * 62 * Both single character and string versions of the functions are provided. 63 * All functions return -1 if the character or string cannot be converted. 64 */ 65 66 #include "k5-platform.h" 67 #include "k5-utf8.h" 68 #include "supp-int.h" 69 #include "errno.h" /* SUNW17PACresync */ 70 71 static unsigned char mask[] = { 0, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 }; 72 73 static ssize_t 74 k5_utf8s_to_ucs2s(krb5_ucs2 *ucs2str, 75 const char *utf8str, 76 size_t count, 77 int little_endian) 78 { 79 size_t ucs2len = 0; 80 size_t utflen, i; 81 krb5_ucs2 ch; 82 83 /* If input ptr is NULL or empty... */ 84 if (utf8str == NULL || *utf8str == '\0') { 85 *ucs2str = 0; 86 87 return 0; 88 } 89 90 /* Examine next UTF-8 character. */ 91 while (*utf8str && ucs2len < count) { 92 /* Get UTF-8 sequence length from 1st byte */ 93 utflen = KRB5_UTF8_CHARLEN2(utf8str, utflen); 94 95 if (utflen == 0 || utflen > KRB5_MAX_UTF8_LEN) 96 return -1; 97 98 /* First byte minus length tag */ 99 ch = (krb5_ucs2)(utf8str[0] & mask[utflen]); 100 101 for (i = 1; i < utflen; i++) { 102 /* Subsequent bytes must start with 10 */ 103 if ((utf8str[i] & 0xc0) != 0x80) 104 return -1; 105 106 ch <<= 6; /* 6 bits of data in each subsequent byte */ 107 ch |= (krb5_ucs2)(utf8str[i] & 0x3f); 108 } 109 110 if (ucs2str != NULL) { 111 #ifdef K5_BE 112 #ifndef SWAP16 113 #define SWAP16(X) ((((X) << 8) | ((X) >> 8)) & 0xFFFF) 114 #endif 115 if (little_endian) 116 ucs2str[ucs2len] = SWAP16(ch); 117 else 118 #endif 119 ucs2str[ucs2len] = ch; 120 } 121 122 utf8str += utflen; /* Move to next UTF-8 character */ 123 ucs2len++; /* Count number of wide chars stored/required */ 124 } 125 126 assert(ucs2len < count); 127 128 if (ucs2str != NULL) { 129 /* Add null terminator if there's room in the buffer. */ 130 ucs2str[ucs2len] = 0; 131 } 132 133 return ucs2len; 134 } 135 136 int 137 krb5int_utf8s_to_ucs2s(const char *utf8s, 138 krb5_ucs2 **ucs2s, 139 size_t *ucs2chars) 140 { 141 ssize_t len; 142 size_t chars; 143 144 chars = krb5int_utf8_chars(utf8s); 145 *ucs2s = (krb5_ucs2 *)malloc((chars + 1) * sizeof(krb5_ucs2)); 146 if (*ucs2s == NULL) { 147 return ENOMEM; 148 } 149 150 len = k5_utf8s_to_ucs2s(*ucs2s, utf8s, chars + 1, 0); 151 if (len < 0) { 152 free(*ucs2s); 153 *ucs2s = NULL; 154 return EINVAL; 155 } 156 157 if (ucs2chars != NULL) { 158 *ucs2chars = chars; 159 } 160 161 return 0; 162 } 163 164 int 165 krb5int_utf8cs_to_ucs2s(const char *utf8s, 166 size_t utf8slen, 167 krb5_ucs2 **ucs2s, 168 size_t *ucs2chars) 169 { 170 ssize_t len; 171 size_t chars; 172 173 chars = krb5int_utf8c_chars(utf8s, utf8slen); 174 *ucs2s = (krb5_ucs2 *)malloc((chars + 1) * sizeof(krb5_ucs2)); 175 if (*ucs2s == NULL) { 176 return ENOMEM; 177 } 178 179 len = k5_utf8s_to_ucs2s(*ucs2s, utf8s, chars + 1, 0); 180 if (len < 0) { 181 free(*ucs2s); 182 *ucs2s = NULL; 183 return EINVAL; 184 } 185 186 if (ucs2chars != NULL) { 187 *ucs2chars = chars; 188 } 189 190 return 0; 191 } 192 193 int 194 krb5int_utf8s_to_ucs2les(const char *utf8s, 195 unsigned char **ucs2les, 196 size_t *ucs2leslen) 197 { 198 ssize_t len; 199 size_t chars; 200 201 chars = krb5int_utf8_chars(utf8s); 202 203 *ucs2les = (unsigned char *)malloc((chars + 1) * sizeof(krb5_ucs2)); 204 if (*ucs2les == NULL) { 205 return ENOMEM; 206 } 207 208 len = k5_utf8s_to_ucs2s((krb5_ucs2 *)*ucs2les, utf8s, chars + 1, 1); 209 if (len < 0) { 210 free(*ucs2les); 211 *ucs2les = NULL; 212 return EINVAL; 213 } 214 215 if (ucs2leslen != NULL) { 216 *ucs2leslen = chars * sizeof(krb5_ucs2); 217 } 218 219 return 0; 220 } 221 222 int 223 krb5int_utf8cs_to_ucs2les(const char *utf8s, 224 size_t utf8slen, 225 unsigned char **ucs2les, 226 size_t *ucs2leslen) 227 { 228 ssize_t len; 229 size_t chars; 230 231 chars = krb5int_utf8c_chars(utf8s, utf8slen); 232 233 *ucs2les = (unsigned char *)malloc((chars + 1) * sizeof(krb5_ucs2)); 234 if (*ucs2les == NULL) { 235 return ENOMEM; 236 } 237 238 len = k5_utf8s_to_ucs2s((krb5_ucs2 *)*ucs2les, utf8s, chars + 1, 1); 239 if (len < 0) { 240 free(*ucs2les); 241 *ucs2les = NULL; 242 return EINVAL; 243 } 244 245 if (ucs2leslen != NULL) { 246 *ucs2leslen = chars * sizeof(krb5_ucs2); 247 } 248 249 return 0; 250 } 251 252 /*----------------------------------------------------------------------------- 253 Convert a wide char string to a UTF-8 string. 254 No more than 'count' bytes will be written to the output buffer. 255 Return the # of bytes written to the output buffer, excl null terminator. 256 257 ucs2len is -1 if the UCS-2 string is NUL terminated, otherwise it is the 258 length of the UCS-2 string in characters 259 */ 260 static ssize_t 261 k5_ucs2s_to_utf8s(char *utf8str, const krb5_ucs2 *ucs2str, 262 size_t count, ssize_t ucs2len, int little_endian) 263 { 264 int len = 0; 265 int n; 266 char *p = utf8str; 267 krb5_ucs2 empty = 0, ch; 268 269 if (ucs2str == NULL) /* Treat input ptr NULL as an empty string */ 270 ucs2str = ∅ 271 272 if (utf8str == NULL) /* Just compute size of output, excl null */ 273 { 274 while (ucs2len == -1 ? *ucs2str : --ucs2len >= 0) { 275 /* Get UTF-8 size of next wide char */ 276 ch = *ucs2str++; 277 #ifdef K5_BE 278 if (little_endian) 279 ch = SWAP16(ch); 280 #endif 281 282 n = krb5int_ucs2_to_utf8(ch, NULL); 283 if (n < 1) 284 return -1; 285 if (len + n < len) 286 return -1; /* overflow */ 287 len += n; 288 } 289 290 return len; 291 } 292 293 /* Do the actual conversion. */ 294 295 n = 1; /* In case of empty ucs2str */ 296 while (ucs2len == -1 ? *ucs2str != 0 : --ucs2len >= 0) { 297 ch = *ucs2str++; 298 #ifdef K5_BE 299 if (little_endian) 300 ch = SWAP16(ch); 301 #endif 302 303 n = krb5int_ucs2_to_utf8(ch, p); 304 305 if (n < 1) 306 break; 307 308 p += n; 309 count -= n; /* Space left in output buffer */ 310 } 311 312 /* If not enough room for last character, pad remainder with null 313 so that return value = original count, indicating buffer full. */ 314 if (n == 0) { 315 while (count--) 316 *p++ = 0; 317 } 318 /* Add a null terminator if there's room. */ 319 else if (count) 320 *p = 0; 321 322 if (n == -1) /* Conversion encountered invalid wide char. */ 323 return -1; 324 325 /* Return the number of bytes written to output buffer, excl null. */ 326 return (p - utf8str); 327 } 328 329 int 330 krb5int_ucs2s_to_utf8s(const krb5_ucs2 *ucs2s, 331 char **utf8s, 332 size_t *utf8slen) 333 { 334 ssize_t len; 335 336 len = k5_ucs2s_to_utf8s(NULL, ucs2s, 0, -1, 0); 337 if (len < 0) { 338 return EINVAL; 339 } 340 341 *utf8s = (char *)malloc((size_t)len + 1); 342 if (*utf8s == NULL) { 343 return ENOMEM; 344 } 345 346 len = k5_ucs2s_to_utf8s(*utf8s, ucs2s, (size_t)len + 1, -1, 0); 347 if (len < 0) { 348 free(*utf8s); 349 *utf8s = NULL; 350 return EINVAL; 351 } 352 353 if (utf8slen != NULL) { 354 *utf8slen = len; 355 } 356 357 return 0; 358 } 359 360 int 361 krb5int_ucs2les_to_utf8s(const unsigned char *ucs2les, 362 char **utf8s, 363 size_t *utf8slen) 364 { 365 ssize_t len; 366 367 len = k5_ucs2s_to_utf8s(NULL, (krb5_ucs2 *)ucs2les, 0, -1, 1); 368 if (len < 0) 369 return EINVAL; 370 371 *utf8s = (char *)malloc((size_t)len + 1); 372 if (*utf8s == NULL) { 373 return ENOMEM; 374 } 375 376 len = k5_ucs2s_to_utf8s(*utf8s, (krb5_ucs2 *)ucs2les, (size_t)len + 1, -1, 1); 377 if (len < 0) { 378 free(*utf8s); 379 *utf8s = NULL; 380 return EINVAL; 381 } 382 383 if (utf8slen != NULL) { 384 *utf8slen = len; 385 } 386 387 return 0; 388 } 389 390 int 391 krb5int_ucs2cs_to_utf8s(const krb5_ucs2 *ucs2s, 392 size_t ucs2slen, 393 char **utf8s, 394 size_t *utf8slen) 395 { 396 ssize_t len; 397 398 if (ucs2slen > SSIZE_MAX) 399 return ERANGE; 400 401 len = k5_ucs2s_to_utf8s(NULL, (krb5_ucs2 *)ucs2s, 0, 402 (ssize_t)ucs2slen, 0); 403 if (len < 0) 404 return EINVAL; 405 406 *utf8s = (char *)malloc((size_t)len + 1); 407 if (*utf8s == NULL) { 408 return ENOMEM; 409 } 410 411 len = k5_ucs2s_to_utf8s(*utf8s, (krb5_ucs2 *)ucs2s, 412 (size_t)len + 1, (ssize_t)ucs2slen, 0); 413 if (len < 0) { 414 free(*utf8s); 415 *utf8s = NULL; 416 return EINVAL; 417 } 418 419 if (utf8slen != NULL) { 420 *utf8slen = len; 421 } 422 423 return 0; 424 } 425 426 int 427 krb5int_ucs2lecs_to_utf8s(const unsigned char *ucs2les, 428 size_t ucs2leslen, 429 char **utf8s, 430 size_t *utf8slen) 431 { 432 ssize_t len; 433 434 if (ucs2leslen > SSIZE_MAX) 435 return ERANGE; 436 437 len = k5_ucs2s_to_utf8s(NULL, (krb5_ucs2 *)ucs2les, 0, 438 (ssize_t)ucs2leslen, 1); 439 if (len < 0) 440 return EINVAL; 441 442 *utf8s = (char *)malloc((size_t)len + 1); 443 if (*utf8s == NULL) { 444 return ENOMEM; 445 } 446 447 len = k5_ucs2s_to_utf8s(*utf8s, (krb5_ucs2 *)ucs2les, 448 (size_t)len + 1, (ssize_t)ucs2leslen, 1); 449 if (len < 0) { 450 free(*utf8s); 451 *utf8s = NULL; 452 return EINVAL; 453 } 454 455 if (utf8slen != NULL) { 456 *utf8slen = len; 457 } 458 459 return 0; 460 } 461 462