/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 *
 * Copyright 2014 Nexenta Systems, Inc.  All rights reserved.
 */

/*
 * Multibyte/wide-char conversion routines. Wide-char encoding provides
 * a fixed size character encoding that maps to the Unicode 16-bit
 * (UCS-2) character set standard. Multibyte or UCS transformation
 * format (UTF) encoding is a variable length character encoding scheme
 * that is compatible with existing ASCII characters and guarantees that
 * the resultant strings do not contain embedded null characters.
 * Both types of encoding provide a null terminator: single byte for UTF-8
 * and a wide-char null for Unicode. See RFC 2044.
 *
 * The table below illustrates the UTF-8 encoding scheme. The letter x
 * indicates bits available for encoding the character value.
 *
 *	UCS-2			UTF-8 octet sequence (binary)
 *	0x0000-0x007F		0xxxxxxx
 *	0x0080-0x07FF		110xxxxx 10xxxxxx
 *	0x0800-0xFFFF		1110xxxx 10xxxxxx 10xxxxxx
 *
 * RFC 2044
 * UTF-8, a transformation format of UNICODE and ISO 10646
 * F. Yergeau
 * Alis Technologies
 * October 1996
 */

#if defined(_KERNEL) || defined(_FAKE_KERNEL)
#include <sys/types.h>
#include <sys/sunddi.h>
#else
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <strings.h>
#endif
#include <smbsrv/string.h>


/*
 * mbstowcs
 *
 * The mbstowcs() function converts a multibyte character string
 * mbstring into a wide character string wcstring. No more than
 * nwchars wide characters are stored. A terminating null wide
 * character is appended if there is room.
 *
 * Returns the number of wide characters converted, not counting
 * any terminating null wide character. Returns -1 if an invalid
 * multibyte character is encountered.
76da6c28aaSamw */ 77da6c28aaSamw size_t 78bbf6f00cSJordan Brown smb_mbstowcs(smb_wchar_t *wcstring, const char *mbstring, size_t nwchars) 79da6c28aaSamw { 80da6c28aaSamw int len; 81bbf6f00cSJordan Brown smb_wchar_t *start = wcstring; 82da6c28aaSamw 83da6c28aaSamw while (nwchars--) { 84bbf6f00cSJordan Brown len = smb_mbtowc(wcstring, mbstring, MTS_MB_CHAR_MAX); 85da6c28aaSamw if (len < 0) { 86da6c28aaSamw *wcstring = 0; 87da6c28aaSamw return ((size_t)-1); 88da6c28aaSamw } 89da6c28aaSamw 90da6c28aaSamw if (*mbstring == 0) 91da6c28aaSamw break; 92da6c28aaSamw 93da6c28aaSamw ++wcstring; 94da6c28aaSamw mbstring += len; 95da6c28aaSamw } 96da6c28aaSamw 97da6c28aaSamw return (wcstring - start); 98da6c28aaSamw } 99da6c28aaSamw 100da6c28aaSamw 101da6c28aaSamw /* 102da6c28aaSamw * mbtowc 103da6c28aaSamw * 104da6c28aaSamw * The mbtowc() function converts a multibyte character mbchar into 105da6c28aaSamw * a wide character and stores the result in the object pointed to 106da6c28aaSamw * by wcharp. Up to nbytes bytes are examined. 107da6c28aaSamw * 108da6c28aaSamw * If mbchar is NULL, mbtowc() returns zero to indicate that shift 10955bf511dSas200622 * states are not supported. Shift states are used to switch between 11055bf511dSas200622 * representation modes using reserved bytes to signal shifting 11155bf511dSas200622 * without them being interpreted as characters. If mbchar is null 11255bf511dSas200622 * mbtowc should return non-zero if the current locale requires shift 11355bf511dSas200622 * states. Otherwise it should be return 0. 11455bf511dSas200622 * 11555bf511dSas200622 * If mbchar is non-null, returns the number of bytes processed in 11655bf511dSas200622 * mbchar. If mbchar is invalid, returns -1. 
117da6c28aaSamw */ 118da6c28aaSamw int /*ARGSUSED*/ 119bbf6f00cSJordan Brown smb_mbtowc(smb_wchar_t *wcharp, const char *mbchar, size_t nbytes) 120da6c28aaSamw { 121da6c28aaSamw unsigned char mbyte; 122bbf6f00cSJordan Brown smb_wchar_t wide_char; 123da6c28aaSamw int count; 124da6c28aaSamw int bytes_left; 125da6c28aaSamw 12655bf511dSas200622 if (mbchar == NULL) 12755bf511dSas200622 return (0); /* no shift states */ 128da6c28aaSamw 129da6c28aaSamw /* 0xxxxxxx -> 1 byte ASCII encoding */ 130da6c28aaSamw if (((mbyte = *mbchar++) & 0x80) == 0) { 131da6c28aaSamw if (wcharp) 132bbf6f00cSJordan Brown *wcharp = (smb_wchar_t)mbyte; 133da6c28aaSamw 134da6c28aaSamw return (mbyte ? 1 : 0); 135da6c28aaSamw } 136da6c28aaSamw 137da6c28aaSamw /* 10xxxxxx -> invalid first byte */ 13855bf511dSas200622 if ((mbyte & 0x40) == 0) 139da6c28aaSamw return (-1); 140da6c28aaSamw 141da6c28aaSamw wide_char = mbyte; 142da6c28aaSamw if ((mbyte & 0x20) == 0) { 143da6c28aaSamw wide_char &= 0x1f; 144da6c28aaSamw bytes_left = 1; 145da6c28aaSamw } else if ((mbyte & 0x10) == 0) { 146da6c28aaSamw wide_char &= 0x0f; 147da6c28aaSamw bytes_left = 2; 148da6c28aaSamw } else { 149da6c28aaSamw return (-1); 150da6c28aaSamw } 151da6c28aaSamw 152da6c28aaSamw count = 1; 153da6c28aaSamw while (bytes_left--) { 15455bf511dSas200622 if (((mbyte = *mbchar++) & 0xc0) != 0x80) 155da6c28aaSamw return (-1); 156da6c28aaSamw 157da6c28aaSamw count++; 158da6c28aaSamw wide_char = (wide_char << 6) | (mbyte & 0x3f); 159da6c28aaSamw } 160da6c28aaSamw 161da6c28aaSamw if (wcharp) 162da6c28aaSamw *wcharp = wide_char; 163da6c28aaSamw 164da6c28aaSamw return (count); 165da6c28aaSamw } 166da6c28aaSamw 167da6c28aaSamw 168da6c28aaSamw /* 169da6c28aaSamw * wctomb 170da6c28aaSamw * 171da6c28aaSamw * The wctomb() function converts a wide character wchar into a multibyte 172da6c28aaSamw * character and stores the result in mbchar. The object pointed to by 173da6c28aaSamw * mbchar must be large enough to accommodate the multibyte character. 
174da6c28aaSamw * 175da6c28aaSamw * Returns the numberof bytes written to mbchar. 176da6c28aaSamw */ 177da6c28aaSamw int 178bbf6f00cSJordan Brown smb_wctomb(char *mbchar, smb_wchar_t wchar) 179da6c28aaSamw { 180da6c28aaSamw if ((wchar & ~0x7f) == 0) { 181da6c28aaSamw *mbchar = (char)wchar; 182da6c28aaSamw return (1); 183da6c28aaSamw } 184da6c28aaSamw 185da6c28aaSamw if ((wchar & ~0x7ff) == 0) { 186da6c28aaSamw *mbchar++ = (wchar >> 6) | 0xc0; 187da6c28aaSamw *mbchar = (wchar & 0x3f) | 0x80; 188da6c28aaSamw return (2); 189da6c28aaSamw } 190da6c28aaSamw 191da6c28aaSamw *mbchar++ = (wchar >> 12) | 0xe0; 192da6c28aaSamw *mbchar++ = ((wchar >> 6) & 0x3f) | 0x80; 193da6c28aaSamw *mbchar = (wchar & 0x3f) | 0x80; 194da6c28aaSamw return (3); 195da6c28aaSamw } 196da6c28aaSamw 197da6c28aaSamw 198da6c28aaSamw /* 199da6c28aaSamw * wcstombs 200da6c28aaSamw * 201da6c28aaSamw * The wcstombs() function converts a wide character string wcstring 202da6c28aaSamw * into a multibyte character string mbstring. Up to nbytes bytes are 203da6c28aaSamw * stored in mbstring. Partial multibyte characters at the end of the 204da6c28aaSamw * string are not stored. The multibyte character string is null 205da6c28aaSamw * terminated if there is room. 206da6c28aaSamw * 207da6c28aaSamw * Returns the number of bytes converted, not counting the terminating 208da6c28aaSamw * null byte. 
209da6c28aaSamw */ 210da6c28aaSamw size_t 211bbf6f00cSJordan Brown smb_wcstombs(char *mbstring, const smb_wchar_t *wcstring, size_t nbytes) 212da6c28aaSamw { 213da6c28aaSamw char *start = mbstring; 214bbf6f00cSJordan Brown const smb_wchar_t *wcp = wcstring; 215*b819cea2SGordon Ross smb_wchar_t wide_char = 0; 216da6c28aaSamw char buf[4]; 217da6c28aaSamw size_t len; 218da6c28aaSamw 21955bf511dSas200622 if ((mbstring == NULL) || (wcstring == NULL)) 220da6c28aaSamw return (0); 221da6c28aaSamw 222da6c28aaSamw while (nbytes > MTS_MB_CHAR_MAX) { 223da6c28aaSamw wide_char = *wcp++; 224bbf6f00cSJordan Brown len = smb_wctomb(mbstring, wide_char); 225da6c28aaSamw 226da6c28aaSamw if (wide_char == 0) 227da6c28aaSamw /*LINTED E_PTRDIFF_OVERFLOW*/ 228da6c28aaSamw return (mbstring - start); 229da6c28aaSamw 230da6c28aaSamw mbstring += len; 231da6c28aaSamw nbytes -= len; 232da6c28aaSamw } 233da6c28aaSamw 234da6c28aaSamw while (wide_char && nbytes) { 235da6c28aaSamw wide_char = *wcp++; 236bbf6f00cSJordan Brown if ((len = smb_wctomb(buf, wide_char)) > nbytes) { 237da6c28aaSamw *mbstring = 0; 238da6c28aaSamw break; 239da6c28aaSamw } 240da6c28aaSamw 241da6c28aaSamw bcopy(buf, mbstring, len); 242da6c28aaSamw mbstring += len; 243da6c28aaSamw nbytes -= len; 244da6c28aaSamw } 245da6c28aaSamw 246da6c28aaSamw /*LINTED E_PTRDIFF_OVERFLOW*/ 247da6c28aaSamw return (mbstring - start); 248da6c28aaSamw } 249da6c28aaSamw 250da6c28aaSamw 251da6c28aaSamw /* 252da6c28aaSamw * Returns the number of bytes that would be written if the multi- 253da6c28aaSamw * byte string mbs was converted to a wide character string, not 254da6c28aaSamw * counting the terminating null wide character. 
255da6c28aaSamw */ 256da6c28aaSamw size_t 257bbf6f00cSJordan Brown smb_wcequiv_strlen(const char *mbs) 258da6c28aaSamw { 259bbf6f00cSJordan Brown smb_wchar_t wide_char; 260da6c28aaSamw size_t bytes; 261da6c28aaSamw size_t len = 0; 262da6c28aaSamw 263da6c28aaSamw while (*mbs) { 264bbf6f00cSJordan Brown bytes = smb_mbtowc(&wide_char, mbs, MTS_MB_CHAR_MAX); 265da6c28aaSamw if (bytes == ((size_t)-1)) 266da6c28aaSamw return ((size_t)-1); 267da6c28aaSamw 268bbf6f00cSJordan Brown len += sizeof (smb_wchar_t); 269da6c28aaSamw mbs += bytes; 270da6c28aaSamw } 271da6c28aaSamw 272da6c28aaSamw return (len); 273da6c28aaSamw } 274da6c28aaSamw 275da6c28aaSamw 276da6c28aaSamw /* 277da6c28aaSamw * Returns the number of bytes that would be written if the multi- 278da6c28aaSamw * byte string mbs was converted to a single byte character string, 279da6c28aaSamw * not counting the terminating null character. 280da6c28aaSamw */ 281da6c28aaSamw size_t 282bbf6f00cSJordan Brown smb_sbequiv_strlen(const char *mbs) 283da6c28aaSamw { 284bbf6f00cSJordan Brown smb_wchar_t wide_char; 285da6c28aaSamw size_t nbytes; 286da6c28aaSamw size_t len = 0; 287da6c28aaSamw 288da6c28aaSamw while (*mbs) { 289bbf6f00cSJordan Brown nbytes = smb_mbtowc(&wide_char, mbs, MTS_MB_CHAR_MAX); 290da6c28aaSamw if (nbytes == ((size_t)-1)) 291da6c28aaSamw return ((size_t)-1); 292da6c28aaSamw 293da6c28aaSamw if (wide_char & 0xFF00) 294bbf6f00cSJordan Brown len += sizeof (smb_wchar_t); 295da6c28aaSamw else 296da6c28aaSamw ++len; 297da6c28aaSamw 298da6c28aaSamw mbs += nbytes; 299da6c28aaSamw } 300da6c28aaSamw 301da6c28aaSamw return (len); 302da6c28aaSamw } 303da6c28aaSamw 304da6c28aaSamw 305da6c28aaSamw /* 306da6c28aaSamw * stombs 307da6c28aaSamw * 308da6c28aaSamw * Convert a regular null terminated string 'string' to a UTF-8 encoded 309da6c28aaSamw * null terminated multi-byte string 'mbstring'. Only full converted 310da6c28aaSamw * UTF-8 characters will be written 'mbstring'. 
If a character will not 311da6c28aaSamw * fit within the remaining buffer space or 'mbstring' will overflow 312da6c28aaSamw * max_mblen, the conversion process will be terminated and 'mbstring' 313da6c28aaSamw * will be null terminated. 314da6c28aaSamw * 315da6c28aaSamw * Returns the number of bytes written to 'mbstring', excluding the 316da6c28aaSamw * terminating null character. 317da6c28aaSamw * 318da6c28aaSamw * If either mbstring or string is a null pointer, -1 is returned. 319da6c28aaSamw */ 320da6c28aaSamw int 321bbf6f00cSJordan Brown smb_stombs(char *mbstring, char *string, int max_mblen) 322da6c28aaSamw { 323da6c28aaSamw char *start = mbstring; 324da6c28aaSamw unsigned char *p = (unsigned char *)string; 325da6c28aaSamw int space_left = max_mblen; 326da6c28aaSamw int len; 327bbf6f00cSJordan Brown smb_wchar_t wide_char; 328da6c28aaSamw char buf[4]; 329da6c28aaSamw 330da6c28aaSamw if (!mbstring || !string) 331da6c28aaSamw return (-1); 332da6c28aaSamw 333da6c28aaSamw while (*p && space_left > 2) { 334da6c28aaSamw wide_char = *p++; 335bbf6f00cSJordan Brown len = smb_wctomb(mbstring, wide_char); 336da6c28aaSamw mbstring += len; 337da6c28aaSamw space_left -= len; 338da6c28aaSamw } 339da6c28aaSamw 340da6c28aaSamw if (*p) { 341da6c28aaSamw wide_char = *p; 342bbf6f00cSJordan Brown if ((len = smb_wctomb(buf, wide_char)) < 2) { 343da6c28aaSamw *mbstring = *buf; 344da6c28aaSamw mbstring += len; 345da6c28aaSamw space_left -= len; 346da6c28aaSamw } 347da6c28aaSamw } 348da6c28aaSamw 349da6c28aaSamw *mbstring = '\0'; 350da6c28aaSamw 351da6c28aaSamw /*LINTED E_PTRDIFF_OVERFLOW*/ 352da6c28aaSamw return (mbstring - start); 353da6c28aaSamw } 354da6c28aaSamw 355da6c28aaSamw 356da6c28aaSamw /* 357da6c28aaSamw * mbstos 358da6c28aaSamw * 359da6c28aaSamw * Convert a null terminated multi-byte string 'mbstring' to a regular 360da6c28aaSamw * null terminated string 'string'. A 1-byte character in 'mbstring' 361da6c28aaSamw * maps to a 1-byte character in 'string'. 
A 2-byte character in 362da6c28aaSamw * 'mbstring' will be mapped to 2-bytes, if the upper byte is non-null. 363da6c28aaSamw * Otherwise the upper byte null will be discarded to ensure that the 364da6c28aaSamw * output stream does not contain embedded null characters. 365da6c28aaSamw * 366da6c28aaSamw * If the input stream contains invalid multi-byte characters, a value 367da6c28aaSamw * of -1 will be returned. Otherwise the length of 'string', excluding 368da6c28aaSamw * the terminating null character, is returned. 369da6c28aaSamw * 370da6c28aaSamw * If either mbstring or string is a null pointer, -1 is returned. 371da6c28aaSamw */ 372da6c28aaSamw int 373bbf6f00cSJordan Brown smb_mbstos(char *string, const char *mbstring) 374da6c28aaSamw { 375bbf6f00cSJordan Brown smb_wchar_t wc; 376da6c28aaSamw unsigned char *start = (unsigned char *)string; 377da6c28aaSamw int len; 378da6c28aaSamw 37955bf511dSas200622 if (string == NULL || mbstring == NULL) 380da6c28aaSamw return (-1); 381da6c28aaSamw 382da6c28aaSamw while (*mbstring) { 383bbf6f00cSJordan Brown if ((len = smb_mbtowc(&wc, mbstring, MTS_MB_CHAR_MAX)) < 0) { 384da6c28aaSamw *string = 0; 385da6c28aaSamw return (-1); 386da6c28aaSamw } 387da6c28aaSamw 388da6c28aaSamw if (wc & 0xFF00) { 389da6c28aaSamw /*LINTED E_BAD_PTR_CAST_ALIGN*/ 390bbf6f00cSJordan Brown *((smb_wchar_t *)string) = wc; 391bbf6f00cSJordan Brown string += sizeof (smb_wchar_t); 392da6c28aaSamw } 393da6c28aaSamw else 394da6c28aaSamw { 395da6c28aaSamw *string = (unsigned char)wc; 396da6c28aaSamw string++; 397da6c28aaSamw } 398da6c28aaSamw 399da6c28aaSamw mbstring += len; 400da6c28aaSamw } 401da6c28aaSamw 402da6c28aaSamw *string = 0; 403da6c28aaSamw 404da6c28aaSamw /*LINTED E_PTRDIFF_OVERFLOW*/ 405da6c28aaSamw return ((unsigned char *)string - start); 406da6c28aaSamw } 407