1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 * 25 * Copyright 2014 Nexenta Systems, Inc. All rights reserved. 26 */ 27 28 /* 29 * Multibyte/wide-char conversion routines. Wide-char encoding provides 30 * a fixed size character encoding that maps to the Unicode 16-bit 31 * (UCS-2) character set standard. Multibyte or UCS transformation 32 * format (UTF) encoding is a variable length character encoding scheme 33 * that is compatible with existing ASCII characters and guarantees that 34 * the resultant strings do not contain embedded null characters. Both 35 * types of encoding provide a null terminator: single byte for UTF-8 36 * and a wide-char null for Unicode. See RFC 2044. 37 * 38 * The table below illustrates the UTF-8 encoding scheme. The letter x 39 * indicates bits available for encoding the character value. 40 * 41 * UCS-2 UTF-8 octet sequence (binary) 42 * 0x0000-0x007F 0xxxxxxx 43 * 0x0080-0x07FF 110xxxxx 10xxxxxx 44 * 0x0800-0xFFFF 1110xxxx 10xxxxxx 10xxxxxx 45 * 46 * RFC 2044 47 * UTF-8, a transformation format of UNICODE and ISO 10646 48 * F. 
Yergeau 49 * Alis Technologies 50 * October 1996 51 */ 52 53 #if defined(_KERNEL) || defined(_FAKE_KERNEL) 54 #include <sys/types.h> 55 #include <sys/sunddi.h> 56 #else 57 #include <stdio.h> 58 #include <stdlib.h> 59 #include <assert.h> 60 #include <strings.h> 61 #endif 62 #include <smbsrv/string.h> 63 64 65 /* 66 * mbstowcs 67 * 68 * The mbstowcs() function converts a multibyte character string 69 * mbstring into a wide character string wcstring. No more than 70 * nwchars wide characters are stored. A terminating null wide 71 * character is appended if there is room. 72 * 73 * Returns the number of wide characters converted, not counting 74 * any terminating null wide character. Returns -1 if an invalid 75 * multibyte character is encountered. 76 */ 77 size_t 78 smb_mbstowcs(smb_wchar_t *wcstring, const char *mbstring, size_t nwchars) 79 { 80 int len; 81 smb_wchar_t *start = wcstring; 82 83 while (nwchars--) { 84 len = smb_mbtowc(wcstring, mbstring, MTS_MB_CHAR_MAX); 85 if (len < 0) { 86 *wcstring = 0; 87 return ((size_t)-1); 88 } 89 90 if (*mbstring == 0) 91 break; 92 93 ++wcstring; 94 mbstring += len; 95 } 96 97 return (wcstring - start); 98 } 99 100 101 /* 102 * mbtowc 103 * 104 * The mbtowc() function converts a multibyte character mbchar into 105 * a wide character and stores the result in the object pointed to 106 * by wcharp. Up to nbytes bytes are examined. 107 * 108 * If mbchar is NULL, mbtowc() returns zero to indicate that shift 109 * states are not supported. Shift states are used to switch between 110 * representation modes using reserved bytes to signal shifting 111 * without them being interpreted as characters. If mbchar is null 112 * mbtowc should return non-zero if the current locale requires shift 113 * states. Otherwise it should be return 0. 114 * 115 * If mbchar is non-null, returns the number of bytes processed in 116 * mbchar. If mbchar is invalid, returns -1. 
117 */ 118 int /*ARGSUSED*/ 119 smb_mbtowc(smb_wchar_t *wcharp, const char *mbchar, size_t nbytes) 120 { 121 unsigned char mbyte; 122 smb_wchar_t wide_char; 123 int count; 124 int bytes_left; 125 126 if (mbchar == NULL) 127 return (0); /* no shift states */ 128 129 /* 0xxxxxxx -> 1 byte ASCII encoding */ 130 if (((mbyte = *mbchar++) & 0x80) == 0) { 131 if (wcharp) 132 *wcharp = (smb_wchar_t)mbyte; 133 134 return (mbyte ? 1 : 0); 135 } 136 137 /* 10xxxxxx -> invalid first byte */ 138 if ((mbyte & 0x40) == 0) 139 return (-1); 140 141 wide_char = mbyte; 142 if ((mbyte & 0x20) == 0) { 143 wide_char &= 0x1f; 144 bytes_left = 1; 145 } else if ((mbyte & 0x10) == 0) { 146 wide_char &= 0x0f; 147 bytes_left = 2; 148 } else { 149 return (-1); 150 } 151 152 count = 1; 153 while (bytes_left--) { 154 if (((mbyte = *mbchar++) & 0xc0) != 0x80) 155 return (-1); 156 157 count++; 158 wide_char = (wide_char << 6) | (mbyte & 0x3f); 159 } 160 161 if (wcharp) 162 *wcharp = wide_char; 163 164 return (count); 165 } 166 167 168 /* 169 * wctomb 170 * 171 * The wctomb() function converts a wide character wchar into a multibyte 172 * character and stores the result in mbchar. The object pointed to by 173 * mbchar must be large enough to accommodate the multibyte character. 174 * 175 * Returns the numberof bytes written to mbchar. 176 */ 177 int 178 smb_wctomb(char *mbchar, smb_wchar_t wchar) 179 { 180 if ((wchar & ~0x7f) == 0) { 181 *mbchar = (char)wchar; 182 return (1); 183 } 184 185 if ((wchar & ~0x7ff) == 0) { 186 *mbchar++ = (wchar >> 6) | 0xc0; 187 *mbchar = (wchar & 0x3f) | 0x80; 188 return (2); 189 } 190 191 *mbchar++ = (wchar >> 12) | 0xe0; 192 *mbchar++ = ((wchar >> 6) & 0x3f) | 0x80; 193 *mbchar = (wchar & 0x3f) | 0x80; 194 return (3); 195 } 196 197 198 /* 199 * wcstombs 200 * 201 * The wcstombs() function converts a wide character string wcstring 202 * into a multibyte character string mbstring. Up to nbytes bytes are 203 * stored in mbstring. 
Partial multibyte characters at the end of the 204 * string are not stored. The multibyte character string is null 205 * terminated if there is room. 206 * 207 * Returns the number of bytes converted, not counting the terminating 208 * null byte. 209 */ 210 size_t 211 smb_wcstombs(char *mbstring, const smb_wchar_t *wcstring, size_t nbytes) 212 { 213 char *start = mbstring; 214 const smb_wchar_t *wcp = wcstring; 215 smb_wchar_t wide_char = 0; 216 char buf[4]; 217 size_t len; 218 219 if ((mbstring == NULL) || (wcstring == NULL)) 220 return (0); 221 222 while (nbytes > MTS_MB_CHAR_MAX) { 223 wide_char = *wcp++; 224 len = smb_wctomb(mbstring, wide_char); 225 226 if (wide_char == 0) 227 /*LINTED E_PTRDIFF_OVERFLOW*/ 228 return (mbstring - start); 229 230 mbstring += len; 231 nbytes -= len; 232 } 233 234 while (wide_char && nbytes) { 235 wide_char = *wcp++; 236 if ((len = smb_wctomb(buf, wide_char)) > nbytes) { 237 *mbstring = 0; 238 break; 239 } 240 241 bcopy(buf, mbstring, len); 242 mbstring += len; 243 nbytes -= len; 244 } 245 246 /*LINTED E_PTRDIFF_OVERFLOW*/ 247 return (mbstring - start); 248 } 249 250 251 /* 252 * Returns the number of bytes that would be written if the multi- 253 * byte string mbs was converted to a wide character string, not 254 * counting the terminating null wide character. 255 */ 256 size_t 257 smb_wcequiv_strlen(const char *mbs) 258 { 259 smb_wchar_t wide_char; 260 size_t bytes; 261 size_t len = 0; 262 263 while (*mbs) { 264 bytes = smb_mbtowc(&wide_char, mbs, MTS_MB_CHAR_MAX); 265 if (bytes == ((size_t)-1)) 266 return ((size_t)-1); 267 268 len += sizeof (smb_wchar_t); 269 mbs += bytes; 270 } 271 272 return (len); 273 } 274 275 276 /* 277 * Returns the number of bytes that would be written if the multi- 278 * byte string mbs was converted to a single byte character string, 279 * not counting the terminating null character. 
280 */ 281 size_t 282 smb_sbequiv_strlen(const char *mbs) 283 { 284 smb_wchar_t wide_char; 285 size_t nbytes; 286 size_t len = 0; 287 288 while (*mbs) { 289 nbytes = smb_mbtowc(&wide_char, mbs, MTS_MB_CHAR_MAX); 290 if (nbytes == ((size_t)-1)) 291 return ((size_t)-1); 292 293 if (wide_char & 0xFF00) 294 len += sizeof (smb_wchar_t); 295 else 296 ++len; 297 298 mbs += nbytes; 299 } 300 301 return (len); 302 } 303 304 305 /* 306 * stombs 307 * 308 * Convert a regular null terminated string 'string' to a UTF-8 encoded 309 * null terminated multi-byte string 'mbstring'. Only full converted 310 * UTF-8 characters will be written 'mbstring'. If a character will not 311 * fit within the remaining buffer space or 'mbstring' will overflow 312 * max_mblen, the conversion process will be terminated and 'mbstring' 313 * will be null terminated. 314 * 315 * Returns the number of bytes written to 'mbstring', excluding the 316 * terminating null character. 317 * 318 * If either mbstring or string is a null pointer, -1 is returned. 319 */ 320 int 321 smb_stombs(char *mbstring, char *string, int max_mblen) 322 { 323 char *start = mbstring; 324 unsigned char *p = (unsigned char *)string; 325 int space_left = max_mblen; 326 int len; 327 smb_wchar_t wide_char; 328 char buf[4]; 329 330 if (!mbstring || !string) 331 return (-1); 332 333 while (*p && space_left > 2) { 334 wide_char = *p++; 335 len = smb_wctomb(mbstring, wide_char); 336 mbstring += len; 337 space_left -= len; 338 } 339 340 if (*p) { 341 wide_char = *p; 342 if ((len = smb_wctomb(buf, wide_char)) < 2) { 343 *mbstring = *buf; 344 mbstring += len; 345 space_left -= len; 346 } 347 } 348 349 *mbstring = '\0'; 350 351 /*LINTED E_PTRDIFF_OVERFLOW*/ 352 return (mbstring - start); 353 } 354 355 356 /* 357 * mbstos 358 * 359 * Convert a null terminated multi-byte string 'mbstring' to a regular 360 * null terminated string 'string'. A 1-byte character in 'mbstring' 361 * maps to a 1-byte character in 'string'. 
A 2-byte character in 362 * 'mbstring' will be mapped to 2-bytes, if the upper byte is non-null. 363 * Otherwise the upper byte null will be discarded to ensure that the 364 * output stream does not contain embedded null characters. 365 * 366 * If the input stream contains invalid multi-byte characters, a value 367 * of -1 will be returned. Otherwise the length of 'string', excluding 368 * the terminating null character, is returned. 369 * 370 * If either mbstring or string is a null pointer, -1 is returned. 371 */ 372 int 373 smb_mbstos(char *string, const char *mbstring) 374 { 375 smb_wchar_t wc; 376 unsigned char *start = (unsigned char *)string; 377 int len; 378 379 if (string == NULL || mbstring == NULL) 380 return (-1); 381 382 while (*mbstring) { 383 if ((len = smb_mbtowc(&wc, mbstring, MTS_MB_CHAR_MAX)) < 0) { 384 *string = 0; 385 return (-1); 386 } 387 388 if (wc & 0xFF00) { 389 /*LINTED E_BAD_PTR_CAST_ALIGN*/ 390 *((smb_wchar_t *)string) = wc; 391 string += sizeof (smb_wchar_t); 392 } 393 else 394 { 395 *string = (unsigned char)wc; 396 string++; 397 } 398 399 mbstring += len; 400 } 401 402 *string = 0; 403 404 /*LINTED E_PTRDIFF_OVERFLOW*/ 405 return ((unsigned char *)string - start); 406 } 407