1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 * 25 * Copyright 2015 Nexenta Systems, Inc. All rights reserved. 26 */ 27 28 /* 29 * Multibyte/wide-char conversion routines. Wide-char encoding provides 30 * a fixed size character encoding that maps to the Unicode 16-bit 31 * (UCS-2) character set standard. Multibyte or UCS transformation 32 * format (UTF) encoding is a variable length character encoding scheme 33 * that s compatible with existing ASCII characters and guarantees that 34 * the resultant strings do not contain embedded null characters. Both 35 * types of encoding provide a null terminator: single byte for UTF-8 36 * and a wide-char null for Unicode. See RFC 2044. 37 * 38 * The table below illustrates the UTF-8 encoding scheme. The letter x 39 * indicates bits available for encoding the character value. 40 * 41 * UCS-2 UTF-8 octet sequence (binary) 42 * 0x0000-0x007F 0xxxxxxx 43 * 0x0080-0x07FF 110xxxxx 10xxxxxx 44 * 0x0800-0xFFFF 1110xxxx 10xxxxxx 10xxxxxx 45 * 46 * RFC 2044 47 * UTF-8,a transformation format of UNICODE and ISO 10646 48 * F. Yergeau 49 * Alis Technologies 50 * October 1996 51 */ 52 53 #if defined(_KERNEL) || defined(_FAKE_KERNEL) 54 #include <sys/types.h> 55 #include <sys/sunddi.h> 56 #else /* _KERNEL || _FAKE_KERNEL */ 57 #include <stdio.h> 58 #include <stdlib.h> 59 #include <strings.h> 60 #include <iconv.h> 61 #include <assert.h> 62 #endif /* _KERNEL || _FAKE_KERNEL */ 63 #include <smbsrv/string.h> 64 65 66 /* 67 * mbstowcs 68 * 69 * The mbstowcs() function converts a multibyte character string 70 * mbstring into a wide character string wcstring. No more than 71 * nwchars wide characters are stored. A terminating null wide 72 * character is appended if there is room. 73 * 74 * Returns the number of wide characters converted, not counting 75 * any terminating null wide character. Returns -1 if an invalid 76 * multibyte character is encountered. 77 */ 78 size_t 79 smb_mbstowcs(smb_wchar_t *wcstring, const char *mbstring, size_t nwchars) 80 { 81 int len; 82 smb_wchar_t *start = wcstring; 83 84 while (nwchars--) { 85 len = smb_mbtowc(wcstring, mbstring, MTS_MB_CHAR_MAX); 86 if (len < 0) { 87 *wcstring = 0; 88 return ((size_t)-1); 89 } 90 91 if (*mbstring == 0) 92 break; 93 94 ++wcstring; 95 mbstring += len; 96 } 97 98 return (wcstring - start); 99 } 100 101 102 /* 103 * mbtowc 104 * 105 * The mbtowc() function converts a multibyte character mbchar into 106 * a wide character and stores the result in the object pointed to 107 * by wcharp. Up to nbytes bytes are examined. 108 * 109 * If mbchar is NULL, mbtowc() returns zero to indicate that shift 110 * states are not supported. Shift states are used to switch between 111 * representation modes using reserved bytes to signal shifting 112 * without them being interpreted as characters. If mbchar is null 113 * mbtowc should return non-zero if the current locale requires shift 114 * states. Otherwise it should be return 0. 115 * 116 * If mbchar is non-null, returns the number of bytes processed in 117 * mbchar. If mbchar is invalid, returns -1. 118 */ 119 int /*ARGSUSED*/ 120 smb_mbtowc(smb_wchar_t *wcharp, const char *mbchar, size_t nbytes) 121 { 122 unsigned char mbyte; 123 smb_wchar_t wide_char; 124 int count; 125 int bytes_left; 126 127 if (mbchar == NULL) 128 return (0); /* no shift states */ 129 130 /* 0xxxxxxx -> 1 byte ASCII encoding */ 131 if (((mbyte = *mbchar++) & 0x80) == 0) { 132 if (wcharp) 133 *wcharp = (smb_wchar_t)mbyte; 134 135 return (mbyte ? 1 : 0); 136 } 137 138 /* 10xxxxxx -> invalid first byte */ 139 if ((mbyte & 0x40) == 0) 140 return (-1); 141 142 wide_char = mbyte; 143 if ((mbyte & 0x20) == 0) { 144 wide_char &= 0x1f; 145 bytes_left = 1; 146 } else if ((mbyte & 0x10) == 0) { 147 wide_char &= 0x0f; 148 bytes_left = 2; 149 } else { 150 return (-1); 151 } 152 153 count = 1; 154 while (bytes_left--) { 155 if (((mbyte = *mbchar++) & 0xc0) != 0x80) 156 return (-1); 157 158 count++; 159 wide_char = (wide_char << 6) | (mbyte & 0x3f); 160 } 161 162 if (wcharp) 163 *wcharp = wide_char; 164 165 return (count); 166 } 167 168 169 /* 170 * wctomb 171 * 172 * The wctomb() function converts a wide character wchar into a multibyte 173 * character and stores the result in mbchar. The object pointed to by 174 * mbchar must be large enough to accommodate the multibyte character. 175 * 176 * Returns the numberof bytes written to mbchar. 177 */ 178 int 179 smb_wctomb(char *mbchar, smb_wchar_t wchar) 180 { 181 if ((wchar & ~0x7f) == 0) { 182 *mbchar = (char)wchar; 183 return (1); 184 } 185 186 if ((wchar & ~0x7ff) == 0) { 187 *mbchar++ = (wchar >> 6) | 0xc0; 188 *mbchar = (wchar & 0x3f) | 0x80; 189 return (2); 190 } 191 192 *mbchar++ = (wchar >> 12) | 0xe0; 193 *mbchar++ = ((wchar >> 6) & 0x3f) | 0x80; 194 *mbchar = (wchar & 0x3f) | 0x80; 195 return (3); 196 } 197 198 199 /* 200 * wcstombs 201 * 202 * The wcstombs() function converts a wide character string wcstring 203 * into a multibyte character string mbstring. Up to nbytes bytes are 204 * stored in mbstring. Partial multibyte characters at the end of the 205 * string are not stored. The multibyte character string is null 206 * terminated if there is room. 207 * 208 * Returns the number of bytes converted, not counting the terminating 209 * null byte. 210 */ 211 size_t 212 smb_wcstombs(char *mbstring, const smb_wchar_t *wcstring, size_t nbytes) 213 { 214 char *start = mbstring; 215 const smb_wchar_t *wcp = wcstring; 216 smb_wchar_t wide_char = 0; 217 char buf[4]; 218 size_t len; 219 220 if ((mbstring == NULL) || (wcstring == NULL)) 221 return (0); 222 223 while (nbytes > MTS_MB_CHAR_MAX) { 224 wide_char = *wcp++; 225 len = smb_wctomb(mbstring, wide_char); 226 227 if (wide_char == 0) 228 /*LINTED E_PTRDIFF_OVERFLOW*/ 229 return (mbstring - start); 230 231 mbstring += len; 232 nbytes -= len; 233 } 234 235 while (wide_char && nbytes) { 236 wide_char = *wcp++; 237 if ((len = smb_wctomb(buf, wide_char)) > nbytes) { 238 *mbstring = 0; 239 break; 240 } 241 242 bcopy(buf, mbstring, len); 243 mbstring += len; 244 nbytes -= len; 245 } 246 247 /*LINTED E_PTRDIFF_OVERFLOW*/ 248 return (mbstring - start); 249 } 250 251 252 /* 253 * Returns the number of bytes that would be written if the multi- 254 * byte string mbs was converted to a wide character string, not 255 * counting the terminating null wide character. 256 */ 257 size_t 258 smb_wcequiv_strlen(const char *mbs) 259 { 260 smb_wchar_t wide_char; 261 size_t bytes; 262 size_t len = 0; 263 264 while (*mbs) { 265 bytes = smb_mbtowc(&wide_char, mbs, MTS_MB_CHAR_MAX); 266 if (bytes == ((size_t)-1)) 267 return ((size_t)-1); 268 269 len += sizeof (smb_wchar_t); 270 mbs += bytes; 271 } 272 273 return (len); 274 } 275 276 277 /* 278 * Returns the number of bytes that would be written if the multi- 279 * byte string mbs was converted to an OEM character string, 280 * not counting the terminating null character. 281 */ 282 size_t 283 smb_sbequiv_strlen(const char *mbs) 284 { 285 smb_wchar_t wide_char; 286 size_t nbytes; 287 size_t len = 0; 288 289 while (*mbs) { 290 nbytes = smb_mbtowc(&wide_char, mbs, MTS_MB_CHAR_MAX); 291 if (nbytes == ((size_t)-1)) 292 return ((size_t)-1); 293 294 /* 295 * Assume OEM characters are 1-byte (for now). 296 * That's true for cp850, which is the only 297 * codeset this currently supports. See: 298 * smb_oem.c : smb_oem_codeset 299 */ 300 ++len; 301 302 mbs += nbytes; 303 } 304 305 return (len); 306 } 307 308 /* 309 * Convert OEM strings to/from internal (UTF-8) form. 310 * 311 * We rarely encounter these anymore because all modern 312 * SMB clients use Unicode (UTF-16). The few cases where 313 * this IS still called are normally using ASCII, i.e. 314 * tag names etc. so short-cut those cases. If we get 315 * something non-ASCII we have to call iconv. 316 * 317 * If we were to really support OEM code pages, we would 318 * need to have a way to set the OEM code page from some 319 * configuration value. For now it's always CP850. 320 * See also ./smb_oem.c 321 */ 322 static char smb_oem_codepage[32] = "CP850"; 323 324 /* 325 * smb_oemtombs 326 * 327 * Convert a null terminated OEM string 'string' to a UTF-8 string 328 * no longer than max_mblen (null terminated if space). 329 * 330 * If the input string contains invalid OEM characters, a value 331 * of -1 will be returned. Otherwise returns the length of 'mbs', 332 * excluding the terminating null character. 333 * 334 * If either mbstring or string is a null pointer, -1 is returned. 335 */ 336 int 337 smb_oemtombs(char *mbs, const uint8_t *oems, int max_mblen) 338 { 339 uchar_t *p; 340 int oemlen; 341 int rlen; 342 boolean_t need_iconv = B_FALSE; 343 344 if (mbs == NULL || oems == NULL) 345 return (-1); 346 347 /* 348 * Check if the oems is all ASCII (and get the length 349 * while we're at it) so we know if we need to iconv. 350 * We usually can avoid the iconv calls. 351 */ 352 oemlen = 0; 353 p = (uchar_t *)oems; 354 while (*p != '\0') { 355 oemlen++; 356 if (*p & 0x80) 357 need_iconv = B_TRUE; 358 p++; 359 } 360 361 if (need_iconv) { 362 int rc; 363 char *obuf = mbs; 364 size_t olen = max_mblen; 365 size_t ilen = oemlen; 366 #if defined(_KERNEL) || defined(_FAKE_KERNEL) 367 char *ibuf = (char *)oems; 368 kiconv_t ic; 369 int err; 370 371 ic = kiconv_open("UTF-8", smb_oem_codepage); 372 if (ic == (kiconv_t)-1) 373 goto just_copy; 374 rc = kiconv(ic, &ibuf, &ilen, &obuf, &olen, &err); 375 (void) kiconv_close(ic); 376 #else /* _KERNEL || _FAKE_KERNEL */ 377 const char *ibuf = (char *)oems; 378 iconv_t ic; 379 ic = iconv_open("UTF-8", smb_oem_codepage); 380 if (ic == (iconv_t)-1) 381 goto just_copy; 382 rc = iconv(ic, &ibuf, &ilen, &obuf, &olen); 383 (void) iconv_close(ic); 384 #endif /* _KERNEL || _FAKE_KERNEL */ 385 if (rc < 0) 386 return (-1); 387 /* Return val. is output bytes. */ 388 rlen = (max_mblen - olen); 389 } else { 390 just_copy: 391 rlen = oemlen; 392 if (rlen > max_mblen) 393 rlen = max_mblen; 394 bcopy(oems, mbs, rlen); 395 } 396 if (rlen < max_mblen) 397 mbs[rlen] = '\0'; 398 399 return (rlen); 400 } 401 402 /* 403 * smb_mbstooem 404 * 405 * Convert a null terminated multi-byte string 'mbs' to an OEM string 406 * no longer than max_oemlen (null terminated if space). 407 * 408 * If the input string contains invalid multi-byte characters, a value 409 * of -1 will be returned. Otherwise returns the length of 'oems', 410 * excluding the terminating null character. 411 * 412 * If either mbstring or string is a null pointer, -1 is returned. 413 */ 414 int 415 smb_mbstooem(uint8_t *oems, const char *mbs, int max_oemlen) 416 { 417 uchar_t *p; 418 int mbslen; 419 int rlen; 420 boolean_t need_iconv = B_FALSE; 421 422 if (oems == NULL || mbs == NULL) 423 return (-1); 424 425 /* 426 * Check if the mbs is all ASCII (and get the length 427 * while we're at it) so we know if we need to iconv. 428 * We usually can avoid the iconv calls. 429 */ 430 mbslen = 0; 431 p = (uchar_t *)mbs; 432 while (*p != '\0') { 433 mbslen++; 434 if (*p & 0x80) 435 need_iconv = B_TRUE; 436 p++; 437 } 438 439 if (need_iconv) { 440 int rc; 441 char *obuf = (char *)oems; 442 size_t olen = max_oemlen; 443 size_t ilen = mbslen; 444 #if defined(_KERNEL) || defined(_FAKE_KERNEL) 445 char *ibuf = (char *)mbs; 446 kiconv_t ic; 447 int err; 448 449 ic = kiconv_open(smb_oem_codepage, "UTF-8"); 450 if (ic == (kiconv_t)-1) 451 goto just_copy; 452 rc = kiconv(ic, &ibuf, &ilen, &obuf, &olen, &err); 453 (void) kiconv_close(ic); 454 #else /* _KERNEL || _FAKE_KERNEL */ 455 const char *ibuf = mbs; 456 iconv_t ic; 457 ic = iconv_open(smb_oem_codepage, "UTF-8"); 458 if (ic == (iconv_t)-1) 459 goto just_copy; 460 rc = iconv(ic, &ibuf, &ilen, &obuf, &olen); 461 (void) iconv_close(ic); 462 #endif /* _KERNEL || _FAKE_KERNEL */ 463 if (rc < 0) 464 return (-1); 465 /* Return val. is output bytes. */ 466 rlen = (max_oemlen - olen); 467 } else { 468 just_copy: 469 rlen = mbslen; 470 if (rlen > max_oemlen) 471 rlen = max_oemlen; 472 bcopy(mbs, oems, rlen); 473 } 474 if (rlen < max_oemlen) 475 oems[rlen] = '\0'; 476 477 return (rlen); 478 } 479