1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* 27 * Multibyte/wide-char conversion routines. Wide-char encoding provides 28 * a fixed size character encoding that maps to the Unicode 16-bit 29 * (UCS-2) character set standard. Multibyte or UCS transformation 30 * format (UTF) encoding is a variable length character encoding scheme 31 * that is compatible with existing ASCII characters and guarantees that 32 * the resultant strings do not contain embedded null characters. Both 33 * types of encoding provide a null terminator: single byte for UTF-8 34 * and a wide-char null for Unicode. See RFC 2044. 35 * 36 * The table below illustrates the UTF-8 encoding scheme. The letter x 37 * indicates bits available for encoding the character value. 38 * 39 * UCS-2 UTF-8 octet sequence (binary) 40 * 0x0000-0x007F 0xxxxxxx 41 * 0x0080-0x07FF 110xxxxx 10xxxxxx 42 * 0x0800-0xFFFF 1110xxxx 10xxxxxx 10xxxxxx 43 * 44 * RFC 2044 45 * UTF-8, a transformation format of UNICODE and ISO 10646 46 * F. 
Yergeau 47 * Alis Technologies 48 * October 1996 49 */ 50 51 #pragma ident "%Z%%M% %I% %E% SMI" 52 53 #ifdef _KERNEL 54 #include <sys/types.h> 55 #include <sys/sunddi.h> 56 #else 57 #include <stdio.h> 58 #include <stdlib.h> 59 #include <assert.h> 60 #include <strings.h> 61 #endif 62 #include <smbsrv/smb_i18n.h> 63 #include <smbsrv/string.h> 64 65 int mbtowc_verbose = 0; 66 int mbtowc_announce = 0; 67 68 /* 69 * mbstowcs 70 * 71 * The mbstowcs() function converts a multibyte character string 72 * mbstring into a wide character string wcstring. No more than 73 * nwchars wide characters are stored. A terminating null wide 74 * character is appended if there is room. 75 * 76 * Returns the number of wide characters converted, not counting 77 * any terminating null wide character. Returns -1 if an invalid 78 * multibyte character is encountered. 79 */ 80 size_t 81 mts_mbstowcs(mts_wchar_t *wcstring, const char *mbstring, size_t nwchars) 82 { 83 int len; 84 mts_wchar_t *start = wcstring; 85 86 while (nwchars--) { 87 len = mts_mbtowc(wcstring, mbstring, MTS_MB_CHAR_MAX); 88 if (len < 0) { 89 *wcstring = 0; 90 return ((size_t)-1); 91 } 92 93 if (*mbstring == 0) 94 break; 95 96 ++wcstring; 97 mbstring += len; 98 } 99 100 return (wcstring - start); 101 } 102 103 104 /* 105 * mbtowc 106 * 107 * The mbtowc() function converts a multibyte character mbchar into 108 * a wide character and stores the result in the object pointed to 109 * by wcharp. Up to nbytes bytes are examined. 110 * 111 * If mbchar is NULL, mbtowc() returns zero to indicate that shift 112 * states are not supported. If mbchar is valid, returns the number 113 * of bytes processed in mbchar. If mbchar is invalid, returns -1. 
114 */ 115 int /*ARGSUSED*/ 116 mts_mbtowc(mts_wchar_t *wcharp, const char *mbchar, size_t nbytes) 117 { 118 unsigned char mbyte; 119 mts_wchar_t wide_char; 120 int count; 121 int bytes_left; 122 123 if (mbchar == 0) 124 return (0); /* shift states not supported */ 125 126 /* 0xxxxxxx -> 1 byte ASCII encoding */ 127 if (((mbyte = *mbchar++) & 0x80) == 0) { 128 if (wcharp) 129 *wcharp = (mts_wchar_t)mbyte; 130 131 return (mbyte ? 1 : 0); 132 } 133 134 /* 10xxxxxx -> invalid first byte */ 135 if ((mbyte & 0x40) == 0) { 136 if (mbtowc_verbose || mbtowc_announce == 0) { 137 mbtowc_announce = 1; 138 } 139 return (-1); 140 } 141 142 wide_char = mbyte; 143 if ((mbyte & 0x20) == 0) { 144 wide_char &= 0x1f; 145 bytes_left = 1; 146 } else if ((mbyte & 0x10) == 0) { 147 wide_char &= 0x0f; 148 bytes_left = 2; 149 } else { 150 if (mbtowc_verbose || mbtowc_announce == 0) { 151 mbtowc_announce = 1; 152 } 153 return (-1); 154 } 155 156 count = 1; 157 while (bytes_left--) { 158 if (((mbyte = *mbchar++) & 0xc0) != 0x80) { 159 if (mbtowc_verbose || mbtowc_announce == 0) { 160 mbtowc_announce = 1; 161 } 162 return (-1); 163 } 164 165 count++; 166 wide_char = (wide_char << 6) | (mbyte & 0x3f); 167 } 168 169 if (wcharp) 170 *wcharp = wide_char; 171 172 return (count); 173 } 174 175 176 /* 177 * wctomb 178 * 179 * The wctomb() function converts a wide character wchar into a multibyte 180 * character and stores the result in mbchar. The object pointed to by 181 * mbchar must be large enough to accommodate the multibyte character. 182 * 183 * Returns the numberof bytes written to mbchar. 
184 */ 185 int 186 mts_wctomb(char *mbchar, mts_wchar_t wchar) 187 { 188 #ifdef UTF8_DEBUG 189 char *start = mbchar; 190 #endif 191 192 if ((wchar & ~0x7f) == 0) { 193 *mbchar = (char)wchar; 194 return (1); 195 } 196 197 if ((wchar & ~0x7ff) == 0) { 198 *mbchar++ = (wchar >> 6) | 0xc0; 199 *mbchar = (wchar & 0x3f) | 0x80; 200 return (2); 201 } 202 203 *mbchar++ = (wchar >> 12) | 0xe0; 204 *mbchar++ = ((wchar >> 6) & 0x3f) | 0x80; 205 *mbchar = (wchar & 0x3f) | 0x80; 206 return (3); 207 } 208 209 210 /* 211 * wcstombs 212 * 213 * The wcstombs() function converts a wide character string wcstring 214 * into a multibyte character string mbstring. Up to nbytes bytes are 215 * stored in mbstring. Partial multibyte characters at the end of the 216 * string are not stored. The multibyte character string is null 217 * terminated if there is room. 218 * 219 * Returns the number of bytes converted, not counting the terminating 220 * null byte. 221 */ 222 size_t 223 mts_wcstombs(char *mbstring, const mts_wchar_t *wcstring, size_t nbytes) 224 { 225 char *start = mbstring; 226 const mts_wchar_t *wcp = wcstring; 227 mts_wchar_t wide_char; 228 char buf[4]; 229 size_t len; 230 231 if ((mbstring == 0) || (wcstring == 0)) 232 return (0); 233 234 while (nbytes > MTS_MB_CHAR_MAX) { 235 wide_char = *wcp++; 236 len = mts_wctomb(mbstring, wide_char); 237 238 if (wide_char == 0) 239 /*LINTED E_PTRDIFF_OVERFLOW*/ 240 return (mbstring - start); 241 242 mbstring += len; 243 nbytes -= len; 244 } 245 246 while (wide_char && nbytes) { 247 wide_char = *wcp++; 248 if ((len = mts_wctomb(buf, wide_char)) > nbytes) { 249 *mbstring = 0; 250 break; 251 } 252 253 bcopy(buf, mbstring, len); 254 mbstring += len; 255 nbytes -= len; 256 } 257 258 /*LINTED E_PTRDIFF_OVERFLOW*/ 259 return (mbstring - start); 260 } 261 262 263 /* 264 * Returns the number of bytes that would be written if the multi- 265 * byte string mbs was converted to a wide character string, not 266 * counting the terminating null wide 
character. 267 */ 268 size_t 269 mts_wcequiv_strlen(const char *mbs) 270 { 271 mts_wchar_t wide_char; 272 size_t bytes; 273 size_t len = 0; 274 275 while (*mbs) { 276 bytes = mts_mbtowc(&wide_char, mbs, MTS_MB_CHAR_MAX); 277 if (bytes == ((size_t)-1)) 278 return ((size_t)-1); 279 280 len += sizeof (mts_wchar_t); 281 mbs += bytes; 282 } 283 284 return (len); 285 } 286 287 288 /* 289 * Returns the number of bytes that would be written if the multi- 290 * byte string mbs was converted to a single byte character string, 291 * not counting the terminating null character. 292 */ 293 size_t 294 mts_sbequiv_strlen(const char *mbs) 295 { 296 mts_wchar_t wide_char; 297 size_t nbytes; 298 size_t len = 0; 299 300 while (*mbs) { 301 nbytes = mts_mbtowc(&wide_char, mbs, MTS_MB_CHAR_MAX); 302 if (nbytes == ((size_t)-1)) 303 return ((size_t)-1); 304 305 if (wide_char & 0xFF00) 306 len += sizeof (mts_wchar_t); 307 else 308 ++len; 309 310 mbs += nbytes; 311 } 312 313 return (len); 314 } 315 316 317 /* 318 * stombs 319 * 320 * Convert a regular null terminated string 'string' to a UTF-8 encoded 321 * null terminated multi-byte string 'mbstring'. Only full converted 322 * UTF-8 characters will be written 'mbstring'. If a character will not 323 * fit within the remaining buffer space or 'mbstring' will overflow 324 * max_mblen, the conversion process will be terminated and 'mbstring' 325 * will be null terminated. 326 * 327 * Returns the number of bytes written to 'mbstring', excluding the 328 * terminating null character. 329 * 330 * If either mbstring or string is a null pointer, -1 is returned. 
331 */ 332 int 333 mts_stombs(char *mbstring, char *string, int max_mblen) 334 { 335 char *start = mbstring; 336 unsigned char *p = (unsigned char *)string; 337 int space_left = max_mblen; 338 int len; 339 mts_wchar_t wide_char; 340 char buf[4]; 341 342 if (!mbstring || !string) 343 return (-1); 344 345 while (*p && space_left > 2) { 346 wide_char = *p++; 347 len = mts_wctomb(mbstring, wide_char); 348 mbstring += len; 349 space_left -= len; 350 } 351 352 if (*p) { 353 wide_char = *p; 354 if ((len = mts_wctomb(buf, wide_char)) < 2) { 355 *mbstring = *buf; 356 mbstring += len; 357 space_left -= len; 358 } 359 } 360 361 *mbstring = '\0'; 362 363 /*LINTED E_PTRDIFF_OVERFLOW*/ 364 return (mbstring - start); 365 } 366 367 368 /* 369 * mbstos 370 * 371 * Convert a null terminated multi-byte string 'mbstring' to a regular 372 * null terminated string 'string'. A 1-byte character in 'mbstring' 373 * maps to a 1-byte character in 'string'. A 2-byte character in 374 * 'mbstring' will be mapped to 2-bytes, if the upper byte is non-null. 375 * Otherwise the upper byte null will be discarded to ensure that the 376 * output stream does not contain embedded null characters. 377 * 378 * If the input stream contains invalid multi-byte characters, a value 379 * of -1 will be returned. Otherwise the length of 'string', excluding 380 * the terminating null character, is returned. 381 * 382 * If either mbstring or string is a null pointer, -1 is returned. 
383 */ 384 int 385 mts_mbstos(char *string, const char *mbstring) 386 { 387 mts_wchar_t wc; 388 unsigned char *start = (unsigned char *)string; 389 int len; 390 391 if (string == 0 || mbstring == 0) 392 return (-1); 393 394 while (*mbstring) { 395 if ((len = mts_mbtowc(&wc, mbstring, MTS_MB_CHAR_MAX)) < 0) { 396 *string = 0; 397 return (-1); 398 } 399 400 if (wc & 0xFF00) { 401 /*LINTED E_BAD_PTR_CAST_ALIGN*/ 402 *((mts_wchar_t *)string) = wc; 403 string += sizeof (mts_wchar_t); 404 } 405 else 406 { 407 *string = (unsigned char)wc; 408 string++; 409 } 410 411 mbstring += len; 412 } 413 414 *string = 0; 415 416 /*LINTED E_PTRDIFF_OVERFLOW*/ 417 return ((unsigned char *)string - start); 418 } 419