1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 * 25 * Copyright 2018 Nexenta Systems, Inc. All rights reserved. 26 */ 27 28 /* 29 * Multibyte/wide-char conversion routines. SMB uses UTF-16 on the wire 30 * (smb_wchar_t) and we use UTF-8 internally (our multi-byte, or mbs). 31 */ 32 33 #if defined(_KERNEL) || defined(_FAKE_KERNEL) 34 #include <sys/types.h> 35 #include <sys/sunddi.h> 36 #else /* _KERNEL || _FAKE_KERNEL */ 37 #include <stdio.h> 38 #include <stdlib.h> 39 #include <strings.h> 40 #include <iconv.h> 41 #include <assert.h> 42 #endif /* _KERNEL || _FAKE_KERNEL */ 43 #include <sys/u8_textprep.h> 44 #include <smbsrv/string.h> 45 46 47 /* 48 * mbstowcs 49 * 50 * The mbstowcs() function converts a multibyte character string 51 * mbstring into a wide character string wcstring. No more than 52 * nwchars wide characters are stored. A terminating null wide 53 * character is appended if there is room. 54 * 55 * Returns the number of wide characters converted, not counting 56 * any terminating null wide character. Returns -1 if an invalid 57 * multibyte character is encountered. 58 */ 59 size_t 60 smb_mbstowcs(smb_wchar_t *wcs, const char *mbs, size_t nwchars) 61 { 62 size_t mbslen, wcslen; 63 int err; 64 65 /* NULL or empty input is allowed. */ 66 if (mbs == NULL || *mbs == '\0') { 67 if (wcs != NULL && nwchars > 0) 68 *wcs = 0; 69 return (0); 70 } 71 72 /* 73 * Traditional mbstowcs(3C) allows wcs==NULL to get the length. 74 * SMB never calls it that way, but let's future-proof. 75 */ 76 if (wcs == NULL) { 77 return ((size_t)-1); 78 } 79 80 mbslen = strlen(mbs); 81 wcslen = nwchars; 82 err = uconv_u8tou16((const uchar_t *)mbs, &mbslen, 83 wcs, &wcslen, UCONV_OUT_LITTLE_ENDIAN); 84 if (err != 0) 85 return ((size_t)-1); 86 87 if (wcslen < nwchars) 88 wcs[wcslen] = 0; 89 90 return (wcslen); 91 } 92 93 94 /* 95 * mbtowc 96 * 97 * The mbtowc() function converts a multibyte character mbchar into 98 * a wide character and stores the result in the object pointed to 99 * by wcharp. Up to nbytes bytes are examined. 100 * 101 * If mbchar is NULL, mbtowc() returns zero to indicate that shift 102 * states are not supported. Shift states are used to switch between 103 * representation modes using reserved bytes to signal shifting 104 * without them being interpreted as characters. If mbchar is null 105 * mbtowc should return non-zero if the current locale requires shift 106 * states. Otherwise it should be return 0. 107 * 108 * If mbchar is non-null, returns the number of bytes processed in 109 * mbchar. If mbchar is null, convert the null (wcharp=0) but 110 * return length zero. If mbchar is invalid, returns -1. 111 */ 112 int /*ARGSUSED*/ 113 smb_mbtowc(uint32_t *wcharp, const char *mbchar, size_t nbytes) 114 { 115 uint32_t wide_char; 116 int count, err; 117 size_t mblen; 118 size_t wclen; 119 120 if (mbchar == NULL) 121 return (0); /* no shift states */ 122 123 /* 124 * How many bytes in this symbol? 125 */ 126 count = u8_validate((char *)mbchar, nbytes, NULL, 0, &err); 127 if (count < 0) 128 return (-1); 129 130 mblen = count; 131 wclen = 1; 132 err = uconv_u8tou32((const uchar_t *)mbchar, &mblen, 133 &wide_char, &wclen, UCONV_OUT_SYSTEM_ENDIAN); 134 if (err != 0) 135 return (-1); 136 if (wclen == 0) { 137 wide_char = 0; 138 count = 0; 139 } 140 141 if (wcharp) 142 *wcharp = wide_char; 143 144 return (count); 145 } 146 147 148 /* 149 * wctomb 150 * 151 * The wctomb() function converts a wide character wchar into a multibyte 152 * character and stores the result in mbchar. The object pointed to by 153 * mbchar must be large enough to accommodate the multibyte character. 154 * 155 * Returns the numberof bytes written to mbchar. 156 * Note: handles null like any 1-byte char. 157 */ 158 int 159 smb_wctomb(char *mbchar, uint32_t wchar) 160 { 161 char junk[MTS_MB_CUR_MAX+1]; 162 size_t mblen; 163 size_t wclen; 164 int err; 165 166 if (mbchar == NULL) 167 mbchar = junk; 168 169 mblen = MTS_MB_CUR_MAX; 170 wclen = 1; 171 err = uconv_u32tou8(&wchar, &wclen, (uchar_t *)mbchar, &mblen, 172 UCONV_IN_SYSTEM_ENDIAN | UCONV_IGNORE_NULL); 173 if (err != 0) 174 return (-1); 175 176 return ((int)mblen); 177 } 178 179 180 /* 181 * wcstombs 182 * 183 * The wcstombs() function converts a wide character string wcstring 184 * into a multibyte character string mbstring. Up to nbytes bytes are 185 * stored in mbstring. Partial multibyte characters at the end of the 186 * string are not stored. The multibyte character string is null 187 * terminated if there is room. 188 * 189 * Returns the number of bytes converted, not counting the terminating 190 * null byte. Returns -1 if an invalid WC sequence is encountered. 191 */ 192 size_t 193 smb_wcstombs(char *mbs, const smb_wchar_t *wcs, size_t nbytes) 194 { 195 size_t mbslen, wcslen; 196 int err; 197 198 /* NULL or empty input is allowed. */ 199 if (wcs == NULL || *wcs == 0) { 200 if (mbs != NULL && nbytes > 0) 201 *mbs = '\0'; 202 return (0); 203 } 204 205 /* 206 * Traditional wcstombs(3C) allows mbs==NULL to get the length. 207 * SMB never calls it that way, but let's future-proof. 208 */ 209 if (mbs == NULL) { 210 return ((size_t)-1); 211 } 212 213 /* 214 * Compute wcslen 215 */ 216 wcslen = 0; 217 while (wcs[wcslen] != 0) 218 wcslen++; 219 220 mbslen = nbytes; 221 err = uconv_u16tou8(wcs, &wcslen, 222 (uchar_t *)mbs, &mbslen, UCONV_IN_LITTLE_ENDIAN); 223 if (err != 0) 224 return ((size_t)-1); 225 226 if (mbslen < nbytes) 227 mbs[mbslen] = '\0'; 228 229 return (mbslen); 230 } 231 232 233 /* 234 * Returns the number of bytes that would be written if the multi- 235 * byte string mbs was converted to a wide character string, not 236 * counting the terminating null wide character. 237 */ 238 size_t 239 smb_wcequiv_strlen(const char *mbs) 240 { 241 uint32_t wide_char; 242 size_t bytes; 243 size_t len = 0; 244 245 while (*mbs) { 246 bytes = smb_mbtowc(&wide_char, mbs, MTS_MB_CHAR_MAX); 247 if (bytes == ((size_t)-1)) 248 return ((size_t)-1); 249 mbs += bytes; 250 251 len += sizeof (smb_wchar_t); 252 if (bytes > 3) { 253 /* 254 * Extended unicode, so TWO smb_wchar_t 255 */ 256 len += sizeof (smb_wchar_t); 257 } 258 } 259 260 return (len); 261 } 262 263 264 /* 265 * Returns the number of bytes that would be written if the multi- 266 * byte string mbs was converted to an OEM character string, 267 * (smb_mbstooem) not counting the terminating null character. 268 */ 269 size_t 270 smb_sbequiv_strlen(const char *mbs) 271 { 272 size_t nbytes; 273 size_t len = 0; 274 275 while (*mbs) { 276 nbytes = smb_mbtowc(NULL, mbs, MTS_MB_CHAR_MAX); 277 if (nbytes == ((size_t)-1)) 278 return ((size_t)-1); 279 if (nbytes == 0) 280 break; 281 282 if (nbytes == 1) { 283 /* ASCII */ 284 len++; 285 } else if (nbytes < 8) { 286 /* Compute OEM length */ 287 char mbsbuf[8]; 288 uint8_t oembuf[8]; 289 int oemlen; 290 (void) strlcpy(mbsbuf, mbs, nbytes+1); 291 oemlen = smb_mbstooem(oembuf, mbsbuf, 8); 292 if (oemlen < 0) 293 return ((size_t)-1); 294 len += oemlen; 295 } else { 296 return ((size_t)-1); 297 } 298 299 mbs += nbytes; 300 } 301 302 return (len); 303 } 304 305 /* 306 * Convert OEM strings to/from internal (UTF-8) form. 307 * 308 * We rarely encounter these anymore because all modern 309 * SMB clients use Unicode (UTF-16). The few cases where 310 * this IS still called are normally using ASCII, i.e. 311 * tag names etc. so short-cut those cases. If we get 312 * something non-ASCII we have to call iconv. 313 * 314 * If we were to really support OEM code pages, we would 315 * need to have a way to set the OEM code page from some 316 * configuration value. For now it's always CP850. 317 * See also ./smb_oem.c 318 */ 319 static char smb_oem_codepage[32] = "CP850"; 320 321 /* 322 * smb_oemtombs 323 * 324 * Convert a null terminated OEM string 'string' to a UTF-8 string 325 * no longer than max_mblen (null terminated if space). 326 * 327 * If the input string contains invalid OEM characters, a value 328 * of -1 will be returned. Otherwise returns the length of 'mbs', 329 * excluding the terminating null character. 330 * 331 * If either mbstring or string is a null pointer, -1 is returned. 332 */ 333 int 334 smb_oemtombs(char *mbs, const uint8_t *oems, int max_mblen) 335 { 336 uchar_t *p; 337 int oemlen; 338 int rlen; 339 boolean_t need_iconv = B_FALSE; 340 341 if (mbs == NULL || oems == NULL) 342 return (-1); 343 344 /* 345 * Check if the oems is all ASCII (and get the length 346 * while we're at it) so we know if we need to iconv. 347 * We usually can avoid the iconv calls. 348 */ 349 oemlen = 0; 350 p = (uchar_t *)oems; 351 while (*p != '\0') { 352 oemlen++; 353 if (*p & 0x80) 354 need_iconv = B_TRUE; 355 p++; 356 } 357 358 if (need_iconv) { 359 int rc; 360 char *obuf = mbs; 361 size_t olen = max_mblen; 362 size_t ilen = oemlen; 363 #if defined(_KERNEL) || defined(_FAKE_KERNEL) 364 char *ibuf = (char *)oems; 365 kiconv_t ic; 366 int err; 367 368 ic = kiconv_open("UTF-8", smb_oem_codepage); 369 if (ic == (kiconv_t)-1) 370 goto just_copy; 371 rc = kiconv(ic, &ibuf, &ilen, &obuf, &olen, &err); 372 (void) kiconv_close(ic); 373 #else /* _KERNEL || _FAKE_KERNEL */ 374 const char *ibuf = (char *)oems; 375 iconv_t ic; 376 ic = iconv_open("UTF-8", smb_oem_codepage); 377 if (ic == (iconv_t)-1) 378 goto just_copy; 379 rc = iconv(ic, &ibuf, &ilen, &obuf, &olen); 380 (void) iconv_close(ic); 381 #endif /* _KERNEL || _FAKE_KERNEL */ 382 if (rc < 0) 383 return (-1); 384 /* Return val. is output bytes. */ 385 rlen = (max_mblen - olen); 386 } else { 387 just_copy: 388 rlen = oemlen; 389 if (rlen > max_mblen) 390 rlen = max_mblen; 391 bcopy(oems, mbs, rlen); 392 } 393 if (rlen < max_mblen) 394 mbs[rlen] = '\0'; 395 396 return (rlen); 397 } 398 399 /* 400 * smb_mbstooem 401 * 402 * Convert a null terminated multi-byte string 'mbs' to an OEM string 403 * no longer than max_oemlen (null terminated if space). 404 * 405 * If the input string contains invalid multi-byte characters, a value 406 * of -1 will be returned. Otherwise returns the length of 'oems', 407 * excluding the terminating null character. 408 * 409 * If either mbstring or string is a null pointer, -1 is returned. 410 */ 411 int 412 smb_mbstooem(uint8_t *oems, const char *mbs, int max_oemlen) 413 { 414 uchar_t *p; 415 int mbslen; 416 int rlen; 417 boolean_t need_iconv = B_FALSE; 418 419 if (oems == NULL || mbs == NULL) 420 return (-1); 421 422 /* 423 * Check if the mbs is all ASCII (and get the length 424 * while we're at it) so we know if we need to iconv. 425 * We usually can avoid the iconv calls. 426 */ 427 mbslen = 0; 428 p = (uchar_t *)mbs; 429 while (*p != '\0') { 430 mbslen++; 431 if (*p & 0x80) 432 need_iconv = B_TRUE; 433 p++; 434 } 435 436 if (need_iconv) { 437 int rc; 438 char *obuf = (char *)oems; 439 size_t olen = max_oemlen; 440 size_t ilen = mbslen; 441 #if defined(_KERNEL) || defined(_FAKE_KERNEL) 442 char *ibuf = (char *)mbs; 443 kiconv_t ic; 444 int err; 445 446 ic = kiconv_open(smb_oem_codepage, "UTF-8"); 447 if (ic == (kiconv_t)-1) 448 goto just_copy; 449 rc = kiconv(ic, &ibuf, &ilen, &obuf, &olen, &err); 450 (void) kiconv_close(ic); 451 #else /* _KERNEL || _FAKE_KERNEL */ 452 const char *ibuf = mbs; 453 iconv_t ic; 454 ic = iconv_open(smb_oem_codepage, "UTF-8"); 455 if (ic == (iconv_t)-1) 456 goto just_copy; 457 rc = iconv(ic, &ibuf, &ilen, &obuf, &olen); 458 (void) iconv_close(ic); 459 #endif /* _KERNEL || _FAKE_KERNEL */ 460 if (rc < 0) 461 return (-1); 462 /* Return val. is output bytes. */ 463 rlen = (max_oemlen - olen); 464 } else { 465 just_copy: 466 rlen = mbslen; 467 if (rlen > max_oemlen) 468 rlen = max_oemlen; 469 bcopy(mbs, oems, rlen); 470 } 471 if (rlen < max_oemlen) 472 oems[rlen] = '\0'; 473 474 return (rlen); 475 } 476