19c9af259SGordon Ross /* 29c9af259SGordon Ross * CDDL HEADER START 39c9af259SGordon Ross * 49c9af259SGordon Ross * The contents of this file are subject to the terms of the 59c9af259SGordon Ross * Common Development and Distribution License (the "License"). 69c9af259SGordon Ross * You may not use this file except in compliance with the License. 79c9af259SGordon Ross * 89c9af259SGordon Ross * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 99c9af259SGordon Ross * or http://www.opensolaris.org/os/licensing. 109c9af259SGordon Ross * See the License for the specific language governing permissions 119c9af259SGordon Ross * and limitations under the License. 129c9af259SGordon Ross * 139c9af259SGordon Ross * When distributing Covered Code, include this CDDL HEADER in each 149c9af259SGordon Ross * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 159c9af259SGordon Ross * If applicable, add the following below this CDDL HEADER, with the 169c9af259SGordon Ross * fields enclosed by brackets "[]" replaced with your own identifying 179c9af259SGordon Ross * information: Portions Copyright [yyyy] [name of copyright owner] 189c9af259SGordon Ross * 199c9af259SGordon Ross * CDDL HEADER END 209c9af259SGordon Ross */ 219c9af259SGordon Ross 229c9af259SGordon Ross /* 23*613a2f6bSGordon Ross * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 249c9af259SGordon Ross * Use is subject to license terms. 259c9af259SGordon Ross */ 269c9af259SGordon Ross 279c9af259SGordon Ross /* 289c9af259SGordon Ross * Unicode conversions (yet more) 299c9af259SGordon Ross */ 309c9af259SGordon Ross 319c9af259SGordon Ross #include <stdio.h> 329c9af259SGordon Ross #include <stdlib.h> 339c9af259SGordon Ross #include <string.h> 349c9af259SGordon Ross #include <errno.h> 359c9af259SGordon Ross #include <iconv.h> 369c9af259SGordon Ross #include <libintl.h> 379c9af259SGordon Ross 389c9af259SGordon Ross #include <sys/u8_textprep.h> 399c9af259SGordon Ross 409c9af259SGordon Ross #include <netsmb/smb_lib.h> 419c9af259SGordon Ross #include "charsets.h" 429c9af259SGordon Ross 439c9af259SGordon Ross 449c9af259SGordon Ross /* 459c9af259SGordon Ross * Number of unicode symbols in the string, 469c9af259SGordon Ross * not including the 2-byte null terminator. 479c9af259SGordon Ross * (multiply by two for storage size) 489c9af259SGordon Ross */ 499c9af259SGordon Ross size_t 509c9af259SGordon Ross unicode_strlen(const uint16_t *us) 519c9af259SGordon Ross { 529c9af259SGordon Ross size_t len = 0; 539c9af259SGordon Ross while (*us++) 549c9af259SGordon Ross len++; 559c9af259SGordon Ross return (len); 569c9af259SGordon Ross } 579c9af259SGordon Ross 589c9af259SGordon Ross static char *convert_ucs2xx_to_utf8(iconv_t, const uint16_t *); 599c9af259SGordon Ross 609c9af259SGordon Ross /* 619c9af259SGordon Ross * Convert (native) Unicode string to UTF-8. 629c9af259SGordon Ross * Returns allocated memory. 639c9af259SGordon Ross */ 649c9af259SGordon Ross char * 659c9af259SGordon Ross convert_unicode_to_utf8(uint16_t *us) 669c9af259SGordon Ross { 679c9af259SGordon Ross static iconv_t cd1 = (iconv_t)-1; 689c9af259SGordon Ross 699c9af259SGordon Ross /* Get conversion descriptor (to, from) */ 709c9af259SGordon Ross if (cd1 == (iconv_t)-1) 719c9af259SGordon Ross cd1 = iconv_open("UTF-8", "UCS-2"); 729c9af259SGordon Ross 739c9af259SGordon Ross return (convert_ucs2xx_to_utf8(cd1, us)); 749c9af259SGordon Ross } 759c9af259SGordon Ross 769c9af259SGordon Ross /* 779c9af259SGordon Ross * Convert little-endian Unicode string to UTF-8. 789c9af259SGordon Ross * Returns allocated memory. 799c9af259SGordon Ross */ 809c9af259SGordon Ross char * 819c9af259SGordon Ross convert_leunicode_to_utf8(unsigned short *us) 829c9af259SGordon Ross { 839c9af259SGordon Ross static iconv_t cd2 = (iconv_t)-1; 849c9af259SGordon Ross 859c9af259SGordon Ross /* Get conversion descriptor (to, from) */ 869c9af259SGordon Ross if (cd2 == (iconv_t)-1) 879c9af259SGordon Ross cd2 = iconv_open("UTF-8", "UCS-2LE"); 889c9af259SGordon Ross 899c9af259SGordon Ross return (convert_ucs2xx_to_utf8(cd2, us)); 909c9af259SGordon Ross } 919c9af259SGordon Ross 929c9af259SGordon Ross static char * 939c9af259SGordon Ross convert_ucs2xx_to_utf8(iconv_t cd, const uint16_t *us) 949c9af259SGordon Ross { 959c9af259SGordon Ross char *obuf, *optr; 969c9af259SGordon Ross const char *iptr; 979c9af259SGordon Ross size_t ileft, obsize, oleft, ret; 989c9af259SGordon Ross 999c9af259SGordon Ross if (cd == (iconv_t)-1) { 1009c9af259SGordon Ross smb_error(dgettext(TEXT_DOMAIN, 1019c9af259SGordon Ross "iconv_open(UTF-8/UCS-2)"), -1); 1029c9af259SGordon Ross return (NULL); 1039c9af259SGordon Ross } 1049c9af259SGordon Ross 1059c9af259SGordon Ross iptr = (const char *)us; 1069c9af259SGordon Ross ileft = unicode_strlen(us); 1079c9af259SGordon Ross ileft *= 2; /* now bytes */ 1089c9af259SGordon Ross 1099c9af259SGordon Ross /* Worst-case output size is 2x input size. */ 1109c9af259SGordon Ross oleft = ileft * 2; 1119c9af259SGordon Ross obsize = oleft + 2; /* room for null */ 1129c9af259SGordon Ross obuf = malloc(obsize); 1139c9af259SGordon Ross if (!obuf) 1149c9af259SGordon Ross return (NULL); 1159c9af259SGordon Ross optr = obuf; 1169c9af259SGordon Ross 1179c9af259SGordon Ross ret = iconv(cd, &iptr, &ileft, &optr, &oleft); 1189c9af259SGordon Ross *optr = '\0'; 1199c9af259SGordon Ross if (ret == (size_t)-1) { 1209c9af259SGordon Ross smb_error(dgettext(TEXT_DOMAIN, 1219c9af259SGordon Ross "iconv(%s) failed"), errno, obuf); 1229c9af259SGordon Ross } 1239c9af259SGordon Ross if (ileft) { 1249c9af259SGordon Ross smb_error(dgettext(TEXT_DOMAIN, 1259c9af259SGordon Ross "iconv(%s) failed"), -1, obuf); 1269c9af259SGordon Ross /* 1279c9af259SGordon Ross * XXX: What's better? return NULL? 1289c9af259SGordon Ross * The truncated string? << for now 1299c9af259SGordon Ross */ 1309c9af259SGordon Ross } 1319c9af259SGordon Ross 1329c9af259SGordon Ross return (obuf); 1339c9af259SGordon Ross } 1349c9af259SGordon Ross 1359c9af259SGordon Ross static uint16_t *convert_utf8_to_ucs2xx(iconv_t, const char *); 1369c9af259SGordon Ross 1379c9af259SGordon Ross /* 1389c9af259SGordon Ross * Convert UTF-8 string to Unicode. 1399c9af259SGordon Ross * Returns allocated memory. 1409c9af259SGordon Ross */ 1419c9af259SGordon Ross uint16_t * 1429c9af259SGordon Ross convert_utf8_to_unicode(const char *utf8_string) 1439c9af259SGordon Ross { 1449c9af259SGordon Ross static iconv_t cd3 = (iconv_t)-1; 1459c9af259SGordon Ross 1469c9af259SGordon Ross /* Get conversion descriptor (to, from) */ 1479c9af259SGordon Ross if (cd3 == (iconv_t)-1) 1489c9af259SGordon Ross cd3 = iconv_open("UCS-2", "UTF-8"); 1499c9af259SGordon Ross return (convert_utf8_to_ucs2xx(cd3, utf8_string)); 1509c9af259SGordon Ross } 1519c9af259SGordon Ross 1529c9af259SGordon Ross /* 1539c9af259SGordon Ross * Convert UTF-8 string to little-endian Unicode. 1549c9af259SGordon Ross * Returns allocated memory. 1559c9af259SGordon Ross */ 1569c9af259SGordon Ross uint16_t * 1579c9af259SGordon Ross convert_utf8_to_leunicode(const char *utf8_string) 1589c9af259SGordon Ross { 1599c9af259SGordon Ross static iconv_t cd4 = (iconv_t)-1; 1609c9af259SGordon Ross 1619c9af259SGordon Ross /* Get conversion descriptor (to, from) */ 1629c9af259SGordon Ross if (cd4 == (iconv_t)-1) 1639c9af259SGordon Ross cd4 = iconv_open("UCS-2LE", "UTF-8"); 1649c9af259SGordon Ross return (convert_utf8_to_ucs2xx(cd4, utf8_string)); 1659c9af259SGordon Ross } 1669c9af259SGordon Ross 1679c9af259SGordon Ross static uint16_t * 1689c9af259SGordon Ross convert_utf8_to_ucs2xx(iconv_t cd, const char *utf8_string) 1699c9af259SGordon Ross { 1709c9af259SGordon Ross uint16_t *obuf, *optr; 1719c9af259SGordon Ross const char *iptr; 1729c9af259SGordon Ross size_t ileft, obsize, oleft, ret; 1739c9af259SGordon Ross 1749c9af259SGordon Ross if (cd == (iconv_t)-1) { 1759c9af259SGordon Ross smb_error(dgettext(TEXT_DOMAIN, 1769c9af259SGordon Ross "iconv_open(UCS-2/UTF-8)"), -1); 1779c9af259SGordon Ross return (NULL); 1789c9af259SGordon Ross } 1799c9af259SGordon Ross 1809c9af259SGordon Ross iptr = utf8_string; 1819c9af259SGordon Ross ileft = strlen(iptr); 1829c9af259SGordon Ross 1839c9af259SGordon Ross /* Worst-case output size is 2x input size. */ 1849c9af259SGordon Ross oleft = ileft * 2; 1859c9af259SGordon Ross obsize = oleft + 2; /* room for null */ 1869c9af259SGordon Ross obuf = malloc(obsize); 1879c9af259SGordon Ross if (!obuf) 1889c9af259SGordon Ross return (NULL); 1899c9af259SGordon Ross optr = obuf; 1909c9af259SGordon Ross 1919c9af259SGordon Ross ret = iconv(cd, &iptr, &ileft, (char **)&optr, &oleft); 1929c9af259SGordon Ross *optr = '\0'; 1939c9af259SGordon Ross if (ret == (size_t)-1) { 1949c9af259SGordon Ross smb_error(dgettext(TEXT_DOMAIN, 1959c9af259SGordon Ross "iconv(%s) failed"), errno, utf8_string); 1969c9af259SGordon Ross } 1979c9af259SGordon Ross if (ileft) { 1989c9af259SGordon Ross smb_error(dgettext(TEXT_DOMAIN, 1999c9af259SGordon Ross "iconv(%s) failed"), -1, utf8_string); 2009c9af259SGordon Ross /* 2019c9af259SGordon Ross * XXX: What's better? return NULL? 2029c9af259SGordon Ross * The truncated string? << for now 2039c9af259SGordon Ross */ 2049c9af259SGordon Ross } 2059c9af259SGordon Ross 2069c9af259SGordon Ross return (obuf); 2079c9af259SGordon Ross } 208*613a2f6bSGordon Ross 209*613a2f6bSGordon Ross 210*613a2f6bSGordon Ross /* 211*613a2f6bSGordon Ross * A simple wrapper around u8_textprep_str() that returns the Unicode 212*613a2f6bSGordon Ross * upper-case version of some string. Returns memory from malloc. 213*613a2f6bSGordon Ross * Borrowed from idmapd. 214*613a2f6bSGordon Ross */ 215*613a2f6bSGordon Ross static char * 216*613a2f6bSGordon Ross utf8_str_to_upper_or_lower(const char *s, int upper_lower) 217*613a2f6bSGordon Ross { 218*613a2f6bSGordon Ross char *res = NULL; 219*613a2f6bSGordon Ross char *outs; 220*613a2f6bSGordon Ross size_t inlen, outlen, inbleft, outbleft; 221*613a2f6bSGordon Ross int rc, err; 222*613a2f6bSGordon Ross 223*613a2f6bSGordon Ross /* 224*613a2f6bSGordon Ross * u8_textprep_str() does not allocate memory. The input and 225*613a2f6bSGordon Ross * output buffers may differ in size (though that would be more 226*613a2f6bSGordon Ross * likely when normalization is done). We have to loop over it... 227*613a2f6bSGordon Ross * 228*613a2f6bSGordon Ross * To improve the chances that we can avoid looping we add 10 229*613a2f6bSGordon Ross * bytes of output buffer room the first go around. 230*613a2f6bSGordon Ross */ 231*613a2f6bSGordon Ross inlen = inbleft = strlen(s); 232*613a2f6bSGordon Ross outlen = outbleft = inlen + 10; 233*613a2f6bSGordon Ross if ((res = malloc(outlen)) == NULL) 234*613a2f6bSGordon Ross return (NULL); 235*613a2f6bSGordon Ross outs = res; 236*613a2f6bSGordon Ross 237*613a2f6bSGordon Ross while ((rc = u8_textprep_str((char *)s, &inbleft, outs, 238*613a2f6bSGordon Ross &outbleft, upper_lower, U8_UNICODE_LATEST, &err)) < 0 && 239*613a2f6bSGordon Ross err == E2BIG) { 240*613a2f6bSGordon Ross if ((res = realloc(res, outlen + inbleft)) == NULL) 241*613a2f6bSGordon Ross return (NULL); 242*613a2f6bSGordon Ross /* adjust input/output buffer pointers */ 243*613a2f6bSGordon Ross s += (inlen - inbleft); 244*613a2f6bSGordon Ross outs = res + outlen - outbleft; 245*613a2f6bSGordon Ross /* adjust outbleft and outlen */ 246*613a2f6bSGordon Ross outlen += inbleft; 247*613a2f6bSGordon Ross outbleft += inbleft; 248*613a2f6bSGordon Ross } 249*613a2f6bSGordon Ross 250*613a2f6bSGordon Ross if (rc < 0) { 251*613a2f6bSGordon Ross free(res); 252*613a2f6bSGordon Ross res = NULL; 253*613a2f6bSGordon Ross return (NULL); 254*613a2f6bSGordon Ross } 255*613a2f6bSGordon Ross 256*613a2f6bSGordon Ross res[outlen - outbleft] = '\0'; 257*613a2f6bSGordon Ross 258*613a2f6bSGordon Ross return (res); 259*613a2f6bSGordon Ross } 260*613a2f6bSGordon Ross 261*613a2f6bSGordon Ross char * 262*613a2f6bSGordon Ross utf8_str_toupper(const char *s) 263*613a2f6bSGordon Ross { 264*613a2f6bSGordon Ross return (utf8_str_to_upper_or_lower(s, U8_TEXTPREP_TOUPPER)); 265*613a2f6bSGordon Ross } 266*613a2f6bSGordon Ross 267*613a2f6bSGordon Ross char * 268*613a2f6bSGordon Ross utf8_str_tolower(const char *s) 269*613a2f6bSGordon Ross { 270*613a2f6bSGordon Ross return (utf8_str_to_upper_or_lower(s, U8_TEXTPREP_TOLOWER)); 271*613a2f6bSGordon Ross } 272