1*da6c28aaSamw /* 2*da6c28aaSamw * CDDL HEADER START 3*da6c28aaSamw * 4*da6c28aaSamw * The contents of this file are subject to the terms of the 5*da6c28aaSamw * Common Development and Distribution License (the "License"). 6*da6c28aaSamw * You may not use this file except in compliance with the License. 7*da6c28aaSamw * 8*da6c28aaSamw * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9*da6c28aaSamw * or http://www.opensolaris.org/os/licensing. 10*da6c28aaSamw * See the License for the specific language governing permissions 11*da6c28aaSamw * and limitations under the License. 12*da6c28aaSamw * 13*da6c28aaSamw * When distributing Covered Code, include this CDDL HEADER in each 14*da6c28aaSamw * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15*da6c28aaSamw * If applicable, add the following below this CDDL HEADER, with the 16*da6c28aaSamw * fields enclosed by brackets "[]" replaced with your own identifying 17*da6c28aaSamw * information: Portions Copyright [yyyy] [name of copyright owner] 18*da6c28aaSamw * 19*da6c28aaSamw * CDDL HEADER END 20*da6c28aaSamw */ 21*da6c28aaSamw /* 22*da6c28aaSamw * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 23*da6c28aaSamw * Use is subject to license terms. 24*da6c28aaSamw */ 25*da6c28aaSamw 26*da6c28aaSamw /* 27*da6c28aaSamw * Multibyte/wide-char conversion routines. Wide-char encoding provides 28*da6c28aaSamw * a fixed size character encoding that maps to the Unicode 16-bit 29*da6c28aaSamw * (UCS-2) character set standard. Multibyte or UCS transformation 30*da6c28aaSamw * format (UTF) encoding is a variable length character encoding scheme 31*da6c28aaSamw * that is compatible with existing ASCII characters and guarantees that 32*da6c28aaSamw * the resultant strings do not contain embedded null characters. Both 33*da6c28aaSamw * types of encoding provide a null terminator: single byte for UTF-8 34*da6c28aaSamw * and a wide-char null for Unicode. See RFC 2044. 
35*da6c28aaSamw * 36*da6c28aaSamw * The table below illustrates the UTF-8 encoding scheme. The letter x 37*da6c28aaSamw * indicates bits available for encoding the character value. 38*da6c28aaSamw * 39*da6c28aaSamw * UCS-2 UTF-8 octet sequence (binary) 40*da6c28aaSamw * 0x0000-0x007F 0xxxxxxx 41*da6c28aaSamw * 0x0080-0x07FF 110xxxxx 10xxxxxx 42*da6c28aaSamw * 0x0800-0xFFFF 1110xxxx 10xxxxxx 10xxxxxx 43*da6c28aaSamw * 44*da6c28aaSamw * RFC 2044 45*da6c28aaSamw * UTF-8,a transformation format of UNICODE and ISO 10646 46*da6c28aaSamw * F. Yergeau 47*da6c28aaSamw * Alis Technologies 48*da6c28aaSamw * October 1996 49*da6c28aaSamw */ 50*da6c28aaSamw 51*da6c28aaSamw #pragma ident "%Z%%M% %I% %E% SMI" 52*da6c28aaSamw 53*da6c28aaSamw #ifdef _KERNEL 54*da6c28aaSamw #include <sys/types.h> 55*da6c28aaSamw #include <sys/sunddi.h> 56*da6c28aaSamw #else 57*da6c28aaSamw #include <stdio.h> 58*da6c28aaSamw #include <stdlib.h> 59*da6c28aaSamw #include <assert.h> 60*da6c28aaSamw #include <strings.h> 61*da6c28aaSamw #endif 62*da6c28aaSamw #include <smbsrv/smb_i18n.h> 63*da6c28aaSamw #include <smbsrv/string.h> 64*da6c28aaSamw 65*da6c28aaSamw int mbtowc_verbose = 0; 66*da6c28aaSamw int mbtowc_announce = 0; 67*da6c28aaSamw 68*da6c28aaSamw /* 69*da6c28aaSamw * mbstowcs 70*da6c28aaSamw * 71*da6c28aaSamw * The mbstowcs() function converts a multibyte character string 72*da6c28aaSamw * mbstring into a wide character string wcstring. No more than 73*da6c28aaSamw * nwchars wide characters are stored. A terminating null wide 74*da6c28aaSamw * character is appended if there is room. 75*da6c28aaSamw * 76*da6c28aaSamw * Returns the number of wide characters converted, not counting 77*da6c28aaSamw * any terminating null wide character. Returns -1 if an invalid 78*da6c28aaSamw * multibyte character is encountered. 
79*da6c28aaSamw */ 80*da6c28aaSamw size_t 81*da6c28aaSamw mts_mbstowcs(mts_wchar_t *wcstring, const char *mbstring, size_t nwchars) 82*da6c28aaSamw { 83*da6c28aaSamw int len; 84*da6c28aaSamw mts_wchar_t *start = wcstring; 85*da6c28aaSamw 86*da6c28aaSamw while (nwchars--) { 87*da6c28aaSamw len = mts_mbtowc(wcstring, mbstring, MTS_MB_CHAR_MAX); 88*da6c28aaSamw if (len < 0) { 89*da6c28aaSamw *wcstring = 0; 90*da6c28aaSamw return ((size_t)-1); 91*da6c28aaSamw } 92*da6c28aaSamw 93*da6c28aaSamw if (*mbstring == 0) 94*da6c28aaSamw break; 95*da6c28aaSamw 96*da6c28aaSamw ++wcstring; 97*da6c28aaSamw mbstring += len; 98*da6c28aaSamw } 99*da6c28aaSamw 100*da6c28aaSamw return (wcstring - start); 101*da6c28aaSamw } 102*da6c28aaSamw 103*da6c28aaSamw 104*da6c28aaSamw /* 105*da6c28aaSamw * mbtowc 106*da6c28aaSamw * 107*da6c28aaSamw * The mbtowc() function converts a multibyte character mbchar into 108*da6c28aaSamw * a wide character and stores the result in the object pointed to 109*da6c28aaSamw * by wcharp. Up to nbytes bytes are examined. 110*da6c28aaSamw * 111*da6c28aaSamw * If mbchar is NULL, mbtowc() returns zero to indicate that shift 112*da6c28aaSamw * states are not supported. If mbchar is valid, returns the number 113*da6c28aaSamw * of bytes processed in mbchar. If mbchar is invalid, returns -1. 114*da6c28aaSamw */ 115*da6c28aaSamw int /*ARGSUSED*/ 116*da6c28aaSamw mts_mbtowc(mts_wchar_t *wcharp, const char *mbchar, size_t nbytes) 117*da6c28aaSamw { 118*da6c28aaSamw unsigned char mbyte; 119*da6c28aaSamw mts_wchar_t wide_char; 120*da6c28aaSamw int count; 121*da6c28aaSamw int bytes_left; 122*da6c28aaSamw 123*da6c28aaSamw if (mbchar == 0) 124*da6c28aaSamw return (0); /* shift states not supported */ 125*da6c28aaSamw 126*da6c28aaSamw /* 0xxxxxxx -> 1 byte ASCII encoding */ 127*da6c28aaSamw if (((mbyte = *mbchar++) & 0x80) == 0) { 128*da6c28aaSamw if (wcharp) 129*da6c28aaSamw *wcharp = (mts_wchar_t)mbyte; 130*da6c28aaSamw 131*da6c28aaSamw return (mbyte ? 
1 : 0); 132*da6c28aaSamw } 133*da6c28aaSamw 134*da6c28aaSamw /* 10xxxxxx -> invalid first byte */ 135*da6c28aaSamw if ((mbyte & 0x40) == 0) { 136*da6c28aaSamw if (mbtowc_verbose || mbtowc_announce == 0) { 137*da6c28aaSamw mbtowc_announce = 1; 138*da6c28aaSamw } 139*da6c28aaSamw return (-1); 140*da6c28aaSamw } 141*da6c28aaSamw 142*da6c28aaSamw wide_char = mbyte; 143*da6c28aaSamw if ((mbyte & 0x20) == 0) { 144*da6c28aaSamw wide_char &= 0x1f; 145*da6c28aaSamw bytes_left = 1; 146*da6c28aaSamw } else if ((mbyte & 0x10) == 0) { 147*da6c28aaSamw wide_char &= 0x0f; 148*da6c28aaSamw bytes_left = 2; 149*da6c28aaSamw } else { 150*da6c28aaSamw if (mbtowc_verbose || mbtowc_announce == 0) { 151*da6c28aaSamw mbtowc_announce = 1; 152*da6c28aaSamw } 153*da6c28aaSamw return (-1); 154*da6c28aaSamw } 155*da6c28aaSamw 156*da6c28aaSamw count = 1; 157*da6c28aaSamw while (bytes_left--) { 158*da6c28aaSamw if (((mbyte = *mbchar++) & 0xc0) != 0x80) { 159*da6c28aaSamw if (mbtowc_verbose || mbtowc_announce == 0) { 160*da6c28aaSamw mbtowc_announce = 1; 161*da6c28aaSamw } 162*da6c28aaSamw return (-1); 163*da6c28aaSamw } 164*da6c28aaSamw 165*da6c28aaSamw count++; 166*da6c28aaSamw wide_char = (wide_char << 6) | (mbyte & 0x3f); 167*da6c28aaSamw } 168*da6c28aaSamw 169*da6c28aaSamw if (wcharp) 170*da6c28aaSamw *wcharp = wide_char; 171*da6c28aaSamw 172*da6c28aaSamw return (count); 173*da6c28aaSamw } 174*da6c28aaSamw 175*da6c28aaSamw 176*da6c28aaSamw /* 177*da6c28aaSamw * wctomb 178*da6c28aaSamw * 179*da6c28aaSamw * The wctomb() function converts a wide character wchar into a multibyte 180*da6c28aaSamw * character and stores the result in mbchar. The object pointed to by 181*da6c28aaSamw * mbchar must be large enough to accommodate the multibyte character. 182*da6c28aaSamw * 183*da6c28aaSamw * Returns the numberof bytes written to mbchar. 
184*da6c28aaSamw */ 185*da6c28aaSamw int 186*da6c28aaSamw mts_wctomb(char *mbchar, mts_wchar_t wchar) 187*da6c28aaSamw { 188*da6c28aaSamw #ifdef UTF8_DEBUG 189*da6c28aaSamw char *start = mbchar; 190*da6c28aaSamw #endif 191*da6c28aaSamw 192*da6c28aaSamw if ((wchar & ~0x7f) == 0) { 193*da6c28aaSamw *mbchar = (char)wchar; 194*da6c28aaSamw return (1); 195*da6c28aaSamw } 196*da6c28aaSamw 197*da6c28aaSamw if ((wchar & ~0x7ff) == 0) { 198*da6c28aaSamw *mbchar++ = (wchar >> 6) | 0xc0; 199*da6c28aaSamw *mbchar = (wchar & 0x3f) | 0x80; 200*da6c28aaSamw return (2); 201*da6c28aaSamw } 202*da6c28aaSamw 203*da6c28aaSamw *mbchar++ = (wchar >> 12) | 0xe0; 204*da6c28aaSamw *mbchar++ = ((wchar >> 6) & 0x3f) | 0x80; 205*da6c28aaSamw *mbchar = (wchar & 0x3f) | 0x80; 206*da6c28aaSamw return (3); 207*da6c28aaSamw } 208*da6c28aaSamw 209*da6c28aaSamw 210*da6c28aaSamw /* 211*da6c28aaSamw * wcstombs 212*da6c28aaSamw * 213*da6c28aaSamw * The wcstombs() function converts a wide character string wcstring 214*da6c28aaSamw * into a multibyte character string mbstring. Up to nbytes bytes are 215*da6c28aaSamw * stored in mbstring. Partial multibyte characters at the end of the 216*da6c28aaSamw * string are not stored. The multibyte character string is null 217*da6c28aaSamw * terminated if there is room. 218*da6c28aaSamw * 219*da6c28aaSamw * Returns the number of bytes converted, not counting the terminating 220*da6c28aaSamw * null byte. 
221*da6c28aaSamw */ 222*da6c28aaSamw size_t 223*da6c28aaSamw mts_wcstombs(char *mbstring, const mts_wchar_t *wcstring, size_t nbytes) 224*da6c28aaSamw { 225*da6c28aaSamw char *start = mbstring; 226*da6c28aaSamw const mts_wchar_t *wcp = wcstring; 227*da6c28aaSamw mts_wchar_t wide_char; 228*da6c28aaSamw char buf[4]; 229*da6c28aaSamw size_t len; 230*da6c28aaSamw 231*da6c28aaSamw if ((mbstring == 0) || (wcstring == 0)) 232*da6c28aaSamw return (0); 233*da6c28aaSamw 234*da6c28aaSamw while (nbytes > MTS_MB_CHAR_MAX) { 235*da6c28aaSamw wide_char = *wcp++; 236*da6c28aaSamw len = mts_wctomb(mbstring, wide_char); 237*da6c28aaSamw 238*da6c28aaSamw if (wide_char == 0) 239*da6c28aaSamw /*LINTED E_PTRDIFF_OVERFLOW*/ 240*da6c28aaSamw return (mbstring - start); 241*da6c28aaSamw 242*da6c28aaSamw mbstring += len; 243*da6c28aaSamw nbytes -= len; 244*da6c28aaSamw } 245*da6c28aaSamw 246*da6c28aaSamw while (wide_char && nbytes) { 247*da6c28aaSamw wide_char = *wcp++; 248*da6c28aaSamw if ((len = mts_wctomb(buf, wide_char)) > nbytes) { 249*da6c28aaSamw *mbstring = 0; 250*da6c28aaSamw break; 251*da6c28aaSamw } 252*da6c28aaSamw 253*da6c28aaSamw bcopy(buf, mbstring, len); 254*da6c28aaSamw mbstring += len; 255*da6c28aaSamw nbytes -= len; 256*da6c28aaSamw } 257*da6c28aaSamw 258*da6c28aaSamw /*LINTED E_PTRDIFF_OVERFLOW*/ 259*da6c28aaSamw return (mbstring - start); 260*da6c28aaSamw } 261*da6c28aaSamw 262*da6c28aaSamw 263*da6c28aaSamw /* 264*da6c28aaSamw * Returns the number of bytes that would be written if the multi- 265*da6c28aaSamw * byte string mbs was converted to a wide character string, not 266*da6c28aaSamw * counting the terminating null wide character. 
267*da6c28aaSamw */ 268*da6c28aaSamw size_t 269*da6c28aaSamw mts_wcequiv_strlen(const char *mbs) 270*da6c28aaSamw { 271*da6c28aaSamw mts_wchar_t wide_char; 272*da6c28aaSamw size_t bytes; 273*da6c28aaSamw size_t len = 0; 274*da6c28aaSamw 275*da6c28aaSamw while (*mbs) { 276*da6c28aaSamw bytes = mts_mbtowc(&wide_char, mbs, MTS_MB_CHAR_MAX); 277*da6c28aaSamw if (bytes == ((size_t)-1)) 278*da6c28aaSamw return ((size_t)-1); 279*da6c28aaSamw 280*da6c28aaSamw len += sizeof (mts_wchar_t); 281*da6c28aaSamw mbs += bytes; 282*da6c28aaSamw } 283*da6c28aaSamw 284*da6c28aaSamw return (len); 285*da6c28aaSamw } 286*da6c28aaSamw 287*da6c28aaSamw 288*da6c28aaSamw /* 289*da6c28aaSamw * Returns the number of bytes that would be written if the multi- 290*da6c28aaSamw * byte string mbs was converted to a single byte character string, 291*da6c28aaSamw * not counting the terminating null character. 292*da6c28aaSamw */ 293*da6c28aaSamw size_t 294*da6c28aaSamw mts_sbequiv_strlen(const char *mbs) 295*da6c28aaSamw { 296*da6c28aaSamw mts_wchar_t wide_char; 297*da6c28aaSamw size_t nbytes; 298*da6c28aaSamw size_t len = 0; 299*da6c28aaSamw 300*da6c28aaSamw while (*mbs) { 301*da6c28aaSamw nbytes = mts_mbtowc(&wide_char, mbs, MTS_MB_CHAR_MAX); 302*da6c28aaSamw if (nbytes == ((size_t)-1)) 303*da6c28aaSamw return ((size_t)-1); 304*da6c28aaSamw 305*da6c28aaSamw if (wide_char & 0xFF00) 306*da6c28aaSamw len += sizeof (mts_wchar_t); 307*da6c28aaSamw else 308*da6c28aaSamw ++len; 309*da6c28aaSamw 310*da6c28aaSamw mbs += nbytes; 311*da6c28aaSamw } 312*da6c28aaSamw 313*da6c28aaSamw return (len); 314*da6c28aaSamw } 315*da6c28aaSamw 316*da6c28aaSamw 317*da6c28aaSamw /* 318*da6c28aaSamw * stombs 319*da6c28aaSamw * 320*da6c28aaSamw * Convert a regular null terminated string 'string' to a UTF-8 encoded 321*da6c28aaSamw * null terminated multi-byte string 'mbstring'. Only full converted 322*da6c28aaSamw * UTF-8 characters will be written 'mbstring'. 
If a character will not 323*da6c28aaSamw * fit within the remaining buffer space or 'mbstring' will overflow 324*da6c28aaSamw * max_mblen, the conversion process will be terminated and 'mbstring' 325*da6c28aaSamw * will be null terminated. 326*da6c28aaSamw * 327*da6c28aaSamw * Returns the number of bytes written to 'mbstring', excluding the 328*da6c28aaSamw * terminating null character. 329*da6c28aaSamw * 330*da6c28aaSamw * If either mbstring or string is a null pointer, -1 is returned. 331*da6c28aaSamw */ 332*da6c28aaSamw int 333*da6c28aaSamw mts_stombs(char *mbstring, char *string, int max_mblen) 334*da6c28aaSamw { 335*da6c28aaSamw char *start = mbstring; 336*da6c28aaSamw unsigned char *p = (unsigned char *)string; 337*da6c28aaSamw int space_left = max_mblen; 338*da6c28aaSamw int len; 339*da6c28aaSamw mts_wchar_t wide_char; 340*da6c28aaSamw char buf[4]; 341*da6c28aaSamw 342*da6c28aaSamw if (!mbstring || !string) 343*da6c28aaSamw return (-1); 344*da6c28aaSamw 345*da6c28aaSamw while (*p && space_left > 2) { 346*da6c28aaSamw wide_char = *p++; 347*da6c28aaSamw len = mts_wctomb(mbstring, wide_char); 348*da6c28aaSamw mbstring += len; 349*da6c28aaSamw space_left -= len; 350*da6c28aaSamw } 351*da6c28aaSamw 352*da6c28aaSamw if (*p) { 353*da6c28aaSamw wide_char = *p; 354*da6c28aaSamw if ((len = mts_wctomb(buf, wide_char)) < 2) { 355*da6c28aaSamw *mbstring = *buf; 356*da6c28aaSamw mbstring += len; 357*da6c28aaSamw space_left -= len; 358*da6c28aaSamw } 359*da6c28aaSamw } 360*da6c28aaSamw 361*da6c28aaSamw *mbstring = '\0'; 362*da6c28aaSamw 363*da6c28aaSamw /*LINTED E_PTRDIFF_OVERFLOW*/ 364*da6c28aaSamw return (mbstring - start); 365*da6c28aaSamw } 366*da6c28aaSamw 367*da6c28aaSamw 368*da6c28aaSamw /* 369*da6c28aaSamw * mbstos 370*da6c28aaSamw * 371*da6c28aaSamw * Convert a null terminated multi-byte string 'mbstring' to a regular 372*da6c28aaSamw * null terminated string 'string'. A 1-byte character in 'mbstring' 373*da6c28aaSamw * maps to a 1-byte character in 'string'. 
A 2-byte character in 374*da6c28aaSamw * 'mbstring' will be mapped to 2-bytes, if the upper byte is non-null. 375*da6c28aaSamw * Otherwise the upper byte null will be discarded to ensure that the 376*da6c28aaSamw * output stream does not contain embedded null characters. 377*da6c28aaSamw * 378*da6c28aaSamw * If the input stream contains invalid multi-byte characters, a value 379*da6c28aaSamw * of -1 will be returned. Otherwise the length of 'string', excluding 380*da6c28aaSamw * the terminating null character, is returned. 381*da6c28aaSamw * 382*da6c28aaSamw * If either mbstring or string is a null pointer, -1 is returned. 383*da6c28aaSamw */ 384*da6c28aaSamw int 385*da6c28aaSamw mts_mbstos(char *string, const char *mbstring) 386*da6c28aaSamw { 387*da6c28aaSamw mts_wchar_t wc; 388*da6c28aaSamw unsigned char *start = (unsigned char *)string; 389*da6c28aaSamw int len; 390*da6c28aaSamw 391*da6c28aaSamw if (string == 0 || mbstring == 0) 392*da6c28aaSamw return (-1); 393*da6c28aaSamw 394*da6c28aaSamw while (*mbstring) { 395*da6c28aaSamw if ((len = mts_mbtowc(&wc, mbstring, MTS_MB_CHAR_MAX)) < 0) { 396*da6c28aaSamw *string = 0; 397*da6c28aaSamw return (-1); 398*da6c28aaSamw } 399*da6c28aaSamw 400*da6c28aaSamw if (wc & 0xFF00) { 401*da6c28aaSamw /*LINTED E_BAD_PTR_CAST_ALIGN*/ 402*da6c28aaSamw *((mts_wchar_t *)string) = wc; 403*da6c28aaSamw string += sizeof (mts_wchar_t); 404*da6c28aaSamw } 405*da6c28aaSamw else 406*da6c28aaSamw { 407*da6c28aaSamw *string = (unsigned char)wc; 408*da6c28aaSamw string++; 409*da6c28aaSamw } 410*da6c28aaSamw 411*da6c28aaSamw mbstring += len; 412*da6c28aaSamw } 413*da6c28aaSamw 414*da6c28aaSamw *string = 0; 415*da6c28aaSamw 416*da6c28aaSamw /*LINTED E_PTRDIFF_OVERFLOW*/ 417*da6c28aaSamw return ((unsigned char *)string - start); 418*da6c28aaSamw } 419