1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* 27 * Multibyte/wide-char conversion routines. Wide-char encoding provides 28 * a fixed size character encoding that maps to the Unicode 16-bit 29 * (UCS-2) character set standard. Multibyte or UCS transformation 30 * format (UTF) encoding is a variable length character encoding scheme 31 * that s compatible with existing ASCII characters and guarantees that 32 * the resultant strings do not contain embedded null characters. Both 33 * types of encoding provide a null terminator: single byte for UTF-8 34 * and a wide-char null for Unicode. See RFC 2044. 35 * 36 * The table below illustrates the UTF-8 encoding scheme. The letter x 37 * indicates bits available for encoding the character value. 38 * 39 * UCS-2 UTF-8 octet sequence (binary) 40 * 0x0000-0x007F 0xxxxxxx 41 * 0x0080-0x07FF 110xxxxx 10xxxxxx 42 * 0x0800-0xFFFF 1110xxxx 10xxxxxx 10xxxxxx 43 * 44 * RFC 2044 45 * UTF-8,a transformation format of UNICODE and ISO 10646 46 * F. Yergeau 47 * Alis Technologies 48 * October 1996 49 */ 50 51 #pragma ident "%Z%%M% %I% %E% SMI" 52 53 #ifdef _KERNEL 54 #include <sys/types.h> 55 #include <sys/sunddi.h> 56 #else 57 #include <stdio.h> 58 #include <stdlib.h> 59 #include <assert.h> 60 #include <strings.h> 61 #endif 62 #include <smbsrv/smb_i18n.h> 63 #include <smbsrv/string.h> 64 65 66 /* 67 * mbstowcs 68 * 69 * The mbstowcs() function converts a multibyte character string 70 * mbstring into a wide character string wcstring. No more than 71 * nwchars wide characters are stored. A terminating null wide 72 * character is appended if there is room. 73 * 74 * Returns the number of wide characters converted, not counting 75 * any terminating null wide character. Returns -1 if an invalid 76 * multibyte character is encountered. 77 */ 78 size_t 79 mts_mbstowcs(mts_wchar_t *wcstring, const char *mbstring, size_t nwchars) 80 { 81 int len; 82 mts_wchar_t *start = wcstring; 83 84 while (nwchars--) { 85 len = mts_mbtowc(wcstring, mbstring, MTS_MB_CHAR_MAX); 86 if (len < 0) { 87 *wcstring = 0; 88 return ((size_t)-1); 89 } 90 91 if (*mbstring == 0) 92 break; 93 94 ++wcstring; 95 mbstring += len; 96 } 97 98 return (wcstring - start); 99 } 100 101 102 /* 103 * mbtowc 104 * 105 * The mbtowc() function converts a multibyte character mbchar into 106 * a wide character and stores the result in the object pointed to 107 * by wcharp. Up to nbytes bytes are examined. 108 * 109 * If mbchar is NULL, mbtowc() returns zero to indicate that shift 110 * states are not supported. Shift states are used to switch between 111 * representation modes using reserved bytes to signal shifting 112 * without them being interpreted as characters. If mbchar is null 113 * mbtowc should return non-zero if the current locale requires shift 114 * states. Otherwise it should be return 0. 115 * 116 * If mbchar is non-null, returns the number of bytes processed in 117 * mbchar. If mbchar is invalid, returns -1. 118 */ 119 int /*ARGSUSED*/ 120 mts_mbtowc(mts_wchar_t *wcharp, const char *mbchar, size_t nbytes) 121 { 122 unsigned char mbyte; 123 mts_wchar_t wide_char; 124 int count; 125 int bytes_left; 126 127 if (mbchar == NULL) 128 return (0); /* no shift states */ 129 130 /* 0xxxxxxx -> 1 byte ASCII encoding */ 131 if (((mbyte = *mbchar++) & 0x80) == 0) { 132 if (wcharp) 133 *wcharp = (mts_wchar_t)mbyte; 134 135 return (mbyte ? 1 : 0); 136 } 137 138 /* 10xxxxxx -> invalid first byte */ 139 if ((mbyte & 0x40) == 0) 140 return (-1); 141 142 wide_char = mbyte; 143 if ((mbyte & 0x20) == 0) { 144 wide_char &= 0x1f; 145 bytes_left = 1; 146 } else if ((mbyte & 0x10) == 0) { 147 wide_char &= 0x0f; 148 bytes_left = 2; 149 } else { 150 return (-1); 151 } 152 153 count = 1; 154 while (bytes_left--) { 155 if (((mbyte = *mbchar++) & 0xc0) != 0x80) 156 return (-1); 157 158 count++; 159 wide_char = (wide_char << 6) | (mbyte & 0x3f); 160 } 161 162 if (wcharp) 163 *wcharp = wide_char; 164 165 return (count); 166 } 167 168 169 /* 170 * wctomb 171 * 172 * The wctomb() function converts a wide character wchar into a multibyte 173 * character and stores the result in mbchar. The object pointed to by 174 * mbchar must be large enough to accommodate the multibyte character. 175 * 176 * Returns the numberof bytes written to mbchar. 177 */ 178 int 179 mts_wctomb(char *mbchar, mts_wchar_t wchar) 180 { 181 if ((wchar & ~0x7f) == 0) { 182 *mbchar = (char)wchar; 183 return (1); 184 } 185 186 if ((wchar & ~0x7ff) == 0) { 187 *mbchar++ = (wchar >> 6) | 0xc0; 188 *mbchar = (wchar & 0x3f) | 0x80; 189 return (2); 190 } 191 192 *mbchar++ = (wchar >> 12) | 0xe0; 193 *mbchar++ = ((wchar >> 6) & 0x3f) | 0x80; 194 *mbchar = (wchar & 0x3f) | 0x80; 195 return (3); 196 } 197 198 199 /* 200 * wcstombs 201 * 202 * The wcstombs() function converts a wide character string wcstring 203 * into a multibyte character string mbstring. Up to nbytes bytes are 204 * stored in mbstring. Partial multibyte characters at the end of the 205 * string are not stored. The multibyte character string is null 206 * terminated if there is room. 207 * 208 * Returns the number of bytes converted, not counting the terminating 209 * null byte. 210 */ 211 size_t 212 mts_wcstombs(char *mbstring, const mts_wchar_t *wcstring, size_t nbytes) 213 { 214 char *start = mbstring; 215 const mts_wchar_t *wcp = wcstring; 216 mts_wchar_t wide_char; 217 char buf[4]; 218 size_t len; 219 220 if ((mbstring == NULL) || (wcstring == NULL)) 221 return (0); 222 223 while (nbytes > MTS_MB_CHAR_MAX) { 224 wide_char = *wcp++; 225 len = mts_wctomb(mbstring, wide_char); 226 227 if (wide_char == 0) 228 /*LINTED E_PTRDIFF_OVERFLOW*/ 229 return (mbstring - start); 230 231 mbstring += len; 232 nbytes -= len; 233 } 234 235 while (wide_char && nbytes) { 236 wide_char = *wcp++; 237 if ((len = mts_wctomb(buf, wide_char)) > nbytes) { 238 *mbstring = 0; 239 break; 240 } 241 242 bcopy(buf, mbstring, len); 243 mbstring += len; 244 nbytes -= len; 245 } 246 247 /*LINTED E_PTRDIFF_OVERFLOW*/ 248 return (mbstring - start); 249 } 250 251 252 /* 253 * Returns the number of bytes that would be written if the multi- 254 * byte string mbs was converted to a wide character string, not 255 * counting the terminating null wide character. 256 */ 257 size_t 258 mts_wcequiv_strlen(const char *mbs) 259 { 260 mts_wchar_t wide_char; 261 size_t bytes; 262 size_t len = 0; 263 264 while (*mbs) { 265 bytes = mts_mbtowc(&wide_char, mbs, MTS_MB_CHAR_MAX); 266 if (bytes == ((size_t)-1)) 267 return ((size_t)-1); 268 269 len += sizeof (mts_wchar_t); 270 mbs += bytes; 271 } 272 273 return (len); 274 } 275 276 277 /* 278 * Returns the number of bytes that would be written if the multi- 279 * byte string mbs was converted to a single byte character string, 280 * not counting the terminating null character. 281 */ 282 size_t 283 mts_sbequiv_strlen(const char *mbs) 284 { 285 mts_wchar_t wide_char; 286 size_t nbytes; 287 size_t len = 0; 288 289 while (*mbs) { 290 nbytes = mts_mbtowc(&wide_char, mbs, MTS_MB_CHAR_MAX); 291 if (nbytes == ((size_t)-1)) 292 return ((size_t)-1); 293 294 if (wide_char & 0xFF00) 295 len += sizeof (mts_wchar_t); 296 else 297 ++len; 298 299 mbs += nbytes; 300 } 301 302 return (len); 303 } 304 305 306 /* 307 * stombs 308 * 309 * Convert a regular null terminated string 'string' to a UTF-8 encoded 310 * null terminated multi-byte string 'mbstring'. Only full converted 311 * UTF-8 characters will be written 'mbstring'. If a character will not 312 * fit within the remaining buffer space or 'mbstring' will overflow 313 * max_mblen, the conversion process will be terminated and 'mbstring' 314 * will be null terminated. 315 * 316 * Returns the number of bytes written to 'mbstring', excluding the 317 * terminating null character. 318 * 319 * If either mbstring or string is a null pointer, -1 is returned. 320 */ 321 int 322 mts_stombs(char *mbstring, char *string, int max_mblen) 323 { 324 char *start = mbstring; 325 unsigned char *p = (unsigned char *)string; 326 int space_left = max_mblen; 327 int len; 328 mts_wchar_t wide_char; 329 char buf[4]; 330 331 if (!mbstring || !string) 332 return (-1); 333 334 while (*p && space_left > 2) { 335 wide_char = *p++; 336 len = mts_wctomb(mbstring, wide_char); 337 mbstring += len; 338 space_left -= len; 339 } 340 341 if (*p) { 342 wide_char = *p; 343 if ((len = mts_wctomb(buf, wide_char)) < 2) { 344 *mbstring = *buf; 345 mbstring += len; 346 space_left -= len; 347 } 348 } 349 350 *mbstring = '\0'; 351 352 /*LINTED E_PTRDIFF_OVERFLOW*/ 353 return (mbstring - start); 354 } 355 356 357 /* 358 * mbstos 359 * 360 * Convert a null terminated multi-byte string 'mbstring' to a regular 361 * null terminated string 'string'. A 1-byte character in 'mbstring' 362 * maps to a 1-byte character in 'string'. A 2-byte character in 363 * 'mbstring' will be mapped to 2-bytes, if the upper byte is non-null. 364 * Otherwise the upper byte null will be discarded to ensure that the 365 * output stream does not contain embedded null characters. 366 * 367 * If the input stream contains invalid multi-byte characters, a value 368 * of -1 will be returned. Otherwise the length of 'string', excluding 369 * the terminating null character, is returned. 370 * 371 * If either mbstring or string is a null pointer, -1 is returned. 372 */ 373 int 374 mts_mbstos(char *string, const char *mbstring) 375 { 376 mts_wchar_t wc; 377 unsigned char *start = (unsigned char *)string; 378 int len; 379 380 if (string == NULL || mbstring == NULL) 381 return (-1); 382 383 while (*mbstring) { 384 if ((len = mts_mbtowc(&wc, mbstring, MTS_MB_CHAR_MAX)) < 0) { 385 *string = 0; 386 return (-1); 387 } 388 389 if (wc & 0xFF00) { 390 /*LINTED E_BAD_PTR_CAST_ALIGN*/ 391 *((mts_wchar_t *)string) = wc; 392 string += sizeof (mts_wchar_t); 393 } 394 else 395 { 396 *string = (unsigned char)wc; 397 string++; 398 } 399 400 mbstring += len; 401 } 402 403 *string = 0; 404 405 /*LINTED E_PTRDIFF_OVERFLOW*/ 406 return ((unsigned char *)string - start); 407 } 408