1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* 27 * Multibyte/wide-char conversion routines. Wide-char encoding provides 28 * a fixed size character encoding that maps to the Unicode 16-bit 29 * (UCS-2) character set standard. Multibyte or UCS transformation 30 * format (UTF) encoding is a variable length character encoding scheme 31 * that is compatible with existing ASCII characters and guarantees that 32 * the resultant strings do not contain embedded null characters. Both 33 * types of encoding provide a null terminator: single byte for UTF-8 34 * and a wide-char null for Unicode. See RFC 2044. 35 * 36 * The table below illustrates the UTF-8 encoding scheme. The letter x 37 * indicates bits available for encoding the character value. 38 * 39 * UCS-2 UTF-8 octet sequence (binary) 40 * 0x0000-0x007F 0xxxxxxx 41 * 0x0080-0x07FF 110xxxxx 10xxxxxx 42 * 0x0800-0xFFFF 1110xxxx 10xxxxxx 10xxxxxx 43 * 44 * RFC 2044 45 * UTF-8, a transformation format of UNICODE and ISO 10646 46 * F. 
Yergeau 47 * Alis Technologies 48 * October 1996 49 */ 50 51 #ifdef _KERNEL 52 #include <sys/types.h> 53 #include <sys/sunddi.h> 54 #else 55 #include <stdio.h> 56 #include <stdlib.h> 57 #include <assert.h> 58 #include <strings.h> 59 #endif 60 #include <smbsrv/string.h> 61 62 63 /* 64 * mbstowcs 65 * 66 * The mbstowcs() function converts a multibyte character string 67 * mbstring into a wide character string wcstring. No more than 68 * nwchars wide characters are stored. A terminating null wide 69 * character is appended if there is room. 70 * 71 * Returns the number of wide characters converted, not counting 72 * any terminating null wide character. Returns -1 if an invalid 73 * multibyte character is encountered. 74 */ 75 size_t 76 smb_mbstowcs(smb_wchar_t *wcstring, const char *mbstring, size_t nwchars) 77 { 78 int len; 79 smb_wchar_t *start = wcstring; 80 81 while (nwchars--) { 82 len = smb_mbtowc(wcstring, mbstring, MTS_MB_CHAR_MAX); 83 if (len < 0) { 84 *wcstring = 0; 85 return ((size_t)-1); 86 } 87 88 if (*mbstring == 0) 89 break; 90 91 ++wcstring; 92 mbstring += len; 93 } 94 95 return (wcstring - start); 96 } 97 98 99 /* 100 * mbtowc 101 * 102 * The mbtowc() function converts a multibyte character mbchar into 103 * a wide character and stores the result in the object pointed to 104 * by wcharp. Up to nbytes bytes are examined. 105 * 106 * If mbchar is NULL, mbtowc() returns zero to indicate that shift 107 * states are not supported. Shift states are used to switch between 108 * representation modes using reserved bytes to signal shifting 109 * without them being interpreted as characters. If mbchar is null 110 * mbtowc should return non-zero if the current locale requires shift 111 * states. Otherwise it should be return 0. 112 * 113 * If mbchar is non-null, returns the number of bytes processed in 114 * mbchar. If mbchar is invalid, returns -1. 
115 */ 116 int /*ARGSUSED*/ 117 smb_mbtowc(smb_wchar_t *wcharp, const char *mbchar, size_t nbytes) 118 { 119 unsigned char mbyte; 120 smb_wchar_t wide_char; 121 int count; 122 int bytes_left; 123 124 if (mbchar == NULL) 125 return (0); /* no shift states */ 126 127 /* 0xxxxxxx -> 1 byte ASCII encoding */ 128 if (((mbyte = *mbchar++) & 0x80) == 0) { 129 if (wcharp) 130 *wcharp = (smb_wchar_t)mbyte; 131 132 return (mbyte ? 1 : 0); 133 } 134 135 /* 10xxxxxx -> invalid first byte */ 136 if ((mbyte & 0x40) == 0) 137 return (-1); 138 139 wide_char = mbyte; 140 if ((mbyte & 0x20) == 0) { 141 wide_char &= 0x1f; 142 bytes_left = 1; 143 } else if ((mbyte & 0x10) == 0) { 144 wide_char &= 0x0f; 145 bytes_left = 2; 146 } else { 147 return (-1); 148 } 149 150 count = 1; 151 while (bytes_left--) { 152 if (((mbyte = *mbchar++) & 0xc0) != 0x80) 153 return (-1); 154 155 count++; 156 wide_char = (wide_char << 6) | (mbyte & 0x3f); 157 } 158 159 if (wcharp) 160 *wcharp = wide_char; 161 162 return (count); 163 } 164 165 166 /* 167 * wctomb 168 * 169 * The wctomb() function converts a wide character wchar into a multibyte 170 * character and stores the result in mbchar. The object pointed to by 171 * mbchar must be large enough to accommodate the multibyte character. 172 * 173 * Returns the numberof bytes written to mbchar. 174 */ 175 int 176 smb_wctomb(char *mbchar, smb_wchar_t wchar) 177 { 178 if ((wchar & ~0x7f) == 0) { 179 *mbchar = (char)wchar; 180 return (1); 181 } 182 183 if ((wchar & ~0x7ff) == 0) { 184 *mbchar++ = (wchar >> 6) | 0xc0; 185 *mbchar = (wchar & 0x3f) | 0x80; 186 return (2); 187 } 188 189 *mbchar++ = (wchar >> 12) | 0xe0; 190 *mbchar++ = ((wchar >> 6) & 0x3f) | 0x80; 191 *mbchar = (wchar & 0x3f) | 0x80; 192 return (3); 193 } 194 195 196 /* 197 * wcstombs 198 * 199 * The wcstombs() function converts a wide character string wcstring 200 * into a multibyte character string mbstring. Up to nbytes bytes are 201 * stored in mbstring. 
Partial multibyte characters at the end of the 202 * string are not stored. The multibyte character string is null 203 * terminated if there is room. 204 * 205 * Returns the number of bytes converted, not counting the terminating 206 * null byte. 207 */ 208 size_t 209 smb_wcstombs(char *mbstring, const smb_wchar_t *wcstring, size_t nbytes) 210 { 211 char *start = mbstring; 212 const smb_wchar_t *wcp = wcstring; 213 smb_wchar_t wide_char; 214 char buf[4]; 215 size_t len; 216 217 if ((mbstring == NULL) || (wcstring == NULL)) 218 return (0); 219 220 while (nbytes > MTS_MB_CHAR_MAX) { 221 wide_char = *wcp++; 222 len = smb_wctomb(mbstring, wide_char); 223 224 if (wide_char == 0) 225 /*LINTED E_PTRDIFF_OVERFLOW*/ 226 return (mbstring - start); 227 228 mbstring += len; 229 nbytes -= len; 230 } 231 232 while (wide_char && nbytes) { 233 wide_char = *wcp++; 234 if ((len = smb_wctomb(buf, wide_char)) > nbytes) { 235 *mbstring = 0; 236 break; 237 } 238 239 bcopy(buf, mbstring, len); 240 mbstring += len; 241 nbytes -= len; 242 } 243 244 /*LINTED E_PTRDIFF_OVERFLOW*/ 245 return (mbstring - start); 246 } 247 248 249 /* 250 * Returns the number of bytes that would be written if the multi- 251 * byte string mbs was converted to a wide character string, not 252 * counting the terminating null wide character. 253 */ 254 size_t 255 smb_wcequiv_strlen(const char *mbs) 256 { 257 smb_wchar_t wide_char; 258 size_t bytes; 259 size_t len = 0; 260 261 while (*mbs) { 262 bytes = smb_mbtowc(&wide_char, mbs, MTS_MB_CHAR_MAX); 263 if (bytes == ((size_t)-1)) 264 return ((size_t)-1); 265 266 len += sizeof (smb_wchar_t); 267 mbs += bytes; 268 } 269 270 return (len); 271 } 272 273 274 /* 275 * Returns the number of bytes that would be written if the multi- 276 * byte string mbs was converted to a single byte character string, 277 * not counting the terminating null character. 
278 */ 279 size_t 280 smb_sbequiv_strlen(const char *mbs) 281 { 282 smb_wchar_t wide_char; 283 size_t nbytes; 284 size_t len = 0; 285 286 while (*mbs) { 287 nbytes = smb_mbtowc(&wide_char, mbs, MTS_MB_CHAR_MAX); 288 if (nbytes == ((size_t)-1)) 289 return ((size_t)-1); 290 291 if (wide_char & 0xFF00) 292 len += sizeof (smb_wchar_t); 293 else 294 ++len; 295 296 mbs += nbytes; 297 } 298 299 return (len); 300 } 301 302 303 /* 304 * stombs 305 * 306 * Convert a regular null terminated string 'string' to a UTF-8 encoded 307 * null terminated multi-byte string 'mbstring'. Only full converted 308 * UTF-8 characters will be written 'mbstring'. If a character will not 309 * fit within the remaining buffer space or 'mbstring' will overflow 310 * max_mblen, the conversion process will be terminated and 'mbstring' 311 * will be null terminated. 312 * 313 * Returns the number of bytes written to 'mbstring', excluding the 314 * terminating null character. 315 * 316 * If either mbstring or string is a null pointer, -1 is returned. 317 */ 318 int 319 smb_stombs(char *mbstring, char *string, int max_mblen) 320 { 321 char *start = mbstring; 322 unsigned char *p = (unsigned char *)string; 323 int space_left = max_mblen; 324 int len; 325 smb_wchar_t wide_char; 326 char buf[4]; 327 328 if (!mbstring || !string) 329 return (-1); 330 331 while (*p && space_left > 2) { 332 wide_char = *p++; 333 len = smb_wctomb(mbstring, wide_char); 334 mbstring += len; 335 space_left -= len; 336 } 337 338 if (*p) { 339 wide_char = *p; 340 if ((len = smb_wctomb(buf, wide_char)) < 2) { 341 *mbstring = *buf; 342 mbstring += len; 343 space_left -= len; 344 } 345 } 346 347 *mbstring = '\0'; 348 349 /*LINTED E_PTRDIFF_OVERFLOW*/ 350 return (mbstring - start); 351 } 352 353 354 /* 355 * mbstos 356 * 357 * Convert a null terminated multi-byte string 'mbstring' to a regular 358 * null terminated string 'string'. A 1-byte character in 'mbstring' 359 * maps to a 1-byte character in 'string'. 
A 2-byte character in 360 * 'mbstring' will be mapped to 2-bytes, if the upper byte is non-null. 361 * Otherwise the upper byte null will be discarded to ensure that the 362 * output stream does not contain embedded null characters. 363 * 364 * If the input stream contains invalid multi-byte characters, a value 365 * of -1 will be returned. Otherwise the length of 'string', excluding 366 * the terminating null character, is returned. 367 * 368 * If either mbstring or string is a null pointer, -1 is returned. 369 */ 370 int 371 smb_mbstos(char *string, const char *mbstring) 372 { 373 smb_wchar_t wc; 374 unsigned char *start = (unsigned char *)string; 375 int len; 376 377 if (string == NULL || mbstring == NULL) 378 return (-1); 379 380 while (*mbstring) { 381 if ((len = smb_mbtowc(&wc, mbstring, MTS_MB_CHAR_MAX)) < 0) { 382 *string = 0; 383 return (-1); 384 } 385 386 if (wc & 0xFF00) { 387 /*LINTED E_BAD_PTR_CAST_ALIGN*/ 388 *((smb_wchar_t *)string) = wc; 389 string += sizeof (smb_wchar_t); 390 } 391 else 392 { 393 *string = (unsigned char)wc; 394 string++; 395 } 396 397 mbstring += len; 398 } 399 400 *string = 0; 401 402 /*LINTED E_PTRDIFF_OVERFLOW*/ 403 return ((unsigned char *)string - start); 404 } 405