common/smbsrv/smb_utf8.c

/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 *
 * Copyright 2015 Nexenta Systems, Inc.  All rights reserved.
 */

/*
 * Multibyte/wide-char conversion routines. Wide-char encoding provides
 * a fixed size character encoding that maps to the Unicode 16-bit
 * (UCS-2) character set standard. Multibyte or UCS transformation
 * format (UTF) encoding is a variable length character encoding scheme
 * that is compatible with existing ASCII characters and guarantees that
 * the resultant strings do not contain embedded null characters. Both
 * types of encoding provide a null terminator: single byte for UTF-8
 * and a wide-char null for Unicode. See RFC 2044.
 *
 * The table below illustrates the UTF-8 encoding scheme. The letter x
 * indicates bits available for encoding the character value.
 *
 *	UCS-2			UTF-8 octet sequence (binary)
 *	0x0000-0x007F	0xxxxxxx
 *	0x0080-0x07FF	110xxxxx 10xxxxxx
 *	0x0800-0xFFFF	1110xxxx 10xxxxxx 10xxxxxx
 *
 * RFC 2044
 * UTF-8,a transformation format of UNICODE and ISO 10646
 * F. Yergeau
 * Alis Technologies
 * October 1996
 */

#if defined(_KERNEL) || defined(_FAKE_KERNEL)
#include <sys/types.h>
#include <sys/sunddi.h>
#else	/* _KERNEL || _FAKE_KERNEL */
#include <stdio.h>
#include <stdlib.h>
#include <strings.h>
#include <iconv.h>
#include <assert.h>
#endif	/* _KERNEL || _FAKE_KERNEL */
#include <smbsrv/string.h>


/*
 * smb_mbstowcs
 *
 * Convert the multibyte (UTF-8) string mbstring into the wide
 * character string wcstring, storing at most nwchars wide
 * characters. A terminating null wide character is stored only
 * when the end of mbstring is reached before nwchars runs out.
 *
 * Returns the number of wide characters converted, not counting
 * any terminating null wide character. If an invalid multibyte
 * sequence is encountered, a null wide character is stored at the
 * current output position and (size_t)-1 is returned.
 */
size_t
smb_mbstowcs(smb_wchar_t *wcstring, const char *mbstring, size_t nwchars)
{
	smb_wchar_t	*out = wcstring;
	size_t		remaining;
	int		consumed;

	for (remaining = nwchars; remaining != 0; remaining--) {
		consumed = smb_mbtowc(out, mbstring, MTS_MB_CHAR_MAX);
		if (consumed < 0) {
			*out = 0;
			return ((size_t)-1);
		}

		/*
		 * At the end of the input smb_mbtowc() has already
		 * stored the null wide character in *out, so stop
		 * here without counting it.
		 */
		if (*mbstring == '\0')
			break;

		mbstring += consumed;
		out++;
	}

	return (out - wcstring);
}


/*
 * smb_mbtowc
 *
 * Convert the multibyte (UTF-8) character at mbchar into a wide
 * character and, if wcharp is not NULL, store it in *wcharp.
 *
 * The nbytes argument is accepted for compatibility with mbtowc()
 * but is not examined; the length of the sequence is taken from
 * its lead byte. Only 1, 2 and 3 byte sequences are accepted.
 * Overlong encodings are not rejected.
 *
 * If mbchar is NULL, returns 0 to indicate that shift states are
 * not supported. (Shift states switch between representation modes
 * using reserved bytes; an mbtowc() implementation returns non-zero
 * for a NULL mbchar only if the locale requires them.)
 *
 * If mbchar is not NULL, returns the number of bytes consumed from
 * mbchar, which is 0 when mbchar points at a null byte. Returns -1
 * if the sequence is invalid; *wcharp is not modified in that case.
 */
int /*ARGSUSED*/
smb_mbtowc(smb_wchar_t *wcharp, const char *mbchar, size_t nbytes)
{
	const unsigned char *src = (const unsigned char *)mbchar;
	unsigned char lead;
	unsigned char trail;
	smb_wchar_t wc;
	int ntrail;
	int i;

	if (mbchar == NULL)
		return (0); /* no shift states */

	lead = *src++;

	/* 0xxxxxxx -> single byte ASCII, including the terminator */
	if ((lead & 0x80) == 0) {
		if (wcharp != NULL)
			*wcharp = (smb_wchar_t)lead;

		return ((lead != 0) ? 1 : 0);
	}

	if ((lead & 0xe0) == 0xc0) {
		/* 110xxxxx -> one continuation byte follows */
		wc = lead & 0x1f;
		ntrail = 1;
	} else if ((lead & 0xf0) == 0xe0) {
		/* 1110xxxx -> two continuation bytes follow */
		wc = lead & 0x0f;
		ntrail = 2;
	} else {
		/* 10xxxxxx (stray continuation) or 1111xxxx (too long) */
		return (-1);
	}

	for (i = 0; i < ntrail; i++) {
		trail = *src++;

		/* every continuation byte must look like 10xxxxxx */
		if ((trail & 0xc0) != 0x80)
			return (-1);

		wc = (wc << 6) | (trail & 0x3f);
	}

	if (wcharp != NULL)
		*wcharp = wc;

	return (ntrail + 1);
}


/*
 * smb_wctomb
 *
 * Convert the wide character wchar into its multibyte (UTF-8)
 * form and store the result at mbchar. The buffer at mbchar must
 * have room for the longest sequence produced here (3 bytes).
 * No terminating null is appended; a null wide character is
 * stored as a single zero byte.
 *
 * Returns the number of bytes written to mbchar (1, 2 or 3).
 */
int
smb_wctomb(char *mbchar, smb_wchar_t wchar)
{
	int nbytes;

	if ((wchar & ~0x7f) == 0) {
		/* fits in 7 bits: 0xxxxxxx */
		mbchar[0] = (char)wchar;
		nbytes = 1;
	} else if ((wchar & ~0x7ff) == 0) {
		/* fits in 11 bits: 110xxxxx 10xxxxxx */
		mbchar[0] = (wchar >> 6) | 0xc0;
		mbchar[1] = (wchar & 0x3f) | 0x80;
		nbytes = 2;
	} else {
		/* everything else: 1110xxxx 10xxxxxx 10xxxxxx */
		mbchar[0] = (wchar >> 12) | 0xe0;
		mbchar[1] = ((wchar >> 6) & 0x3f) | 0x80;
		mbchar[2] = (wchar & 0x3f) | 0x80;
		nbytes = 3;
	}

	return (nbytes);
}


/*
 * smb_wcstombs
 *
 * Convert the wide character string wcstring into the multibyte
 * (UTF-8) string mbstring, storing at most nbytes bytes. A multibyte
 * character that does not fit completely in the remaining space is
 * not stored and ends the conversion. The result is null terminated
 * if there is room.
 *
 * Returns the number of bytes stored in mbstring, not counting the
 * terminating null byte. Returns 0 if either mbstring or wcstring
 * is NULL.
 */
size_t
smb_wcstombs(char *mbstring, const smb_wchar_t *wcstring, size_t nbytes)
{
	char *start = mbstring;
	const smb_wchar_t *wcp = wcstring;
	smb_wchar_t wide_char;
	char buf[4];
	size_t len;

	if ((mbstring == NULL) || (wcstring == NULL))
		return (0);

	/*
	 * Each character is staged in buf so that it is copied out only
	 * when it fits completely. A single loop handles every buffer
	 * size, including buffers smaller than one maximum-length
	 * character.
	 */
	while (nbytes > 0) {
		wide_char = *wcp++;
		if (wide_char == 0) {
			/* End of input: terminate but do not count it. */
			*mbstring = '\0';
			break;
		}

		len = (size_t)smb_wctomb(buf, wide_char);
		if (len > nbytes) {
			/* Would be a partial character: stop here. */
			*mbstring = '\0';
			break;
		}

		bcopy(buf, mbstring, len);
		mbstring += len;
		nbytes -= len;
	}

	/*LINTED E_PTRDIFF_OVERFLOW*/
	return (mbstring - start);
}


/*
 * Returns the size in bytes of the wide character string that the
 * multibyte string mbs would convert to, not counting the
 * terminating null wide character. Returns (size_t)-1 if mbs
 * contains an invalid multibyte sequence.
 */
size_t
smb_wcequiv_strlen(const char *mbs)
{
	smb_wchar_t	wc;
	const char	*cursor = mbs;
	size_t		nchars = 0;
	int		consumed;

	while (*cursor != '\0') {
		consumed = smb_mbtowc(&wc, cursor, MTS_MB_CHAR_MAX);
		if (consumed == -1)
			return ((size_t)-1);

		/* one wide character per multibyte sequence */
		nchars++;
		cursor += consumed;
	}

	return (nchars * sizeof (smb_wchar_t));
}


/*
 * Returns the size in bytes of the OEM character string that the
 * multibyte string mbs would convert to, not counting the
 * terminating null character. Returns (size_t)-1 if mbs contains
 * an invalid multibyte sequence.
 */
size_t
smb_sbequiv_strlen(const char *mbs)
{
	smb_wchar_t	wc;
	const char	*cursor;
	size_t		oemlen = 0;
	int		consumed;

	for (cursor = mbs; *cursor != '\0'; cursor += consumed) {
		consumed = smb_mbtowc(&wc, cursor, MTS_MB_CHAR_MAX);
		if (consumed == -1)
			return ((size_t)-1);

		/*
		 * Each multibyte character is counted as one OEM
		 * byte (for now). That holds for cp850, which is
		 * the only codeset this currently supports.  See:
		 * smb_oem.c : smb_oem_codeset
		 */
		oemlen++;
	}

	return (oemlen);
}

/*
 * Convert OEM strings to/from internal (UTF-8) form.
 *
 * We rarely encounter these anymore because all modern
 * SMB clients use Unicode (UTF-16). The few cases where
 * this IS still called are normally using ASCII, i.e.
 * tag names etc. so short-cut those cases.  If we get
 * something non-ASCII we have to call iconv.
 *
 * If we were to really support OEM code pages, we would
 * need to have a way to set the OEM code page from some
 * configuration value.  For now it's always CP850.
 * See also ./smb_oem.c
 *
 * smb_oem_codepage is the code page name that smb_oemtombs()
 * and smb_mbstooem() pass to iconv_open()/kiconv_open() as the
 * OEM side of the conversion.
 */
static char smb_oem_codepage[32] = "CP850";

/*
 * smb_oemtombs
 *
 * Convert the null terminated OEM string 'oems' to a UTF-8 string
 * in 'mbs', storing no more than max_mblen bytes (null terminated
 * if there is space).
 *
 * Input that is entirely ASCII is copied as is, truncated to
 * max_mblen bytes. iconv is used only when some input byte has the
 * high bit set; if the converter cannot be opened, the bytes are
 * copied unconverted instead.
 *
 * Returns the length of 'mbs', excluding the terminating null
 * character. Returns -1 if either mbs or oems is a null pointer,
 * if max_mblen is negative, or if iconv reports an error.
 */
int
smb_oemtombs(char *mbs, const uint8_t *oems, int max_mblen)
{
	uchar_t *p;
	int	oemlen;
	int	rlen;
	boolean_t need_iconv = B_FALSE;

	/*
	 * max_mblen is used as a size_t below, so a negative
	 * value would turn into a huge buffer length.
	 */
	if (mbs == NULL || oems == NULL || max_mblen < 0)
		return (-1);

	/*
	 * Check if the oems is all ASCII (and get the length
	 * while we're at it) so we know if we need to iconv.
	 * We usually can avoid the iconv calls.
	 */
	oemlen = 0;
	p = (uchar_t *)oems;
	while (*p != '\0') {
		oemlen++;
		if (*p & 0x80)
			need_iconv = B_TRUE;
		p++;
	}

	if (need_iconv) {
		size_t	rc;
		char	*obuf = mbs;
		size_t	olen = max_mblen;
		size_t	ilen = oemlen;
#if defined(_KERNEL) || defined(_FAKE_KERNEL)
		char *ibuf = (char *)oems;
		kiconv_t ic;
		int	err;

		ic = kiconv_open("UTF-8", smb_oem_codepage);
		if (ic == (kiconv_t)-1)
			goto just_copy;
		rc = kiconv(ic, &ibuf, &ilen, &obuf, &olen, &err);
		(void) kiconv_close(ic);
#else	/* _KERNEL || _FAKE_KERNEL */
		const char *ibuf = (char *)oems;
		iconv_t	ic;
		ic = iconv_open("UTF-8", smb_oem_codepage);
		if (ic == (iconv_t)-1)
			goto just_copy;
		rc = iconv(ic, &ibuf, &ilen, &obuf, &olen);
		(void) iconv_close(ic);
#endif	/* _KERNEL || _FAKE_KERNEL */
		/* Both converters report failure as (size_t)-1. */
		if (rc == (size_t)-1)
			return (-1);
		/* Return val. is output bytes. */
		rlen = (int)((size_t)max_mblen - olen);
	} else {
	just_copy:
		/* Plain byte copy, truncated to the output size. */
		rlen = oemlen;
		if (rlen > max_mblen)
			rlen = max_mblen;
		bcopy(oems, mbs, rlen);
	}
	if (rlen < max_mblen)
		mbs[rlen] = '\0';

	return (rlen);
}

/*
 * smb_mbstooem
 *
 * Convert the null terminated multi-byte (UTF-8) string 'mbs' to an
 * OEM string in 'oems', storing no more than max_oemlen bytes (null
 * terminated if there is space).
 *
 * Input that is entirely ASCII is copied as is, truncated to
 * max_oemlen bytes. iconv is used only when some input byte has the
 * high bit set; if the converter cannot be opened, the bytes are
 * copied unconverted instead.
 *
 * Returns the length of 'oems', excluding the terminating null
 * character. Returns -1 if either oems or mbs is a null pointer,
 * if max_oemlen is negative, or if iconv reports an error.
 */
int
smb_mbstooem(uint8_t *oems, const char *mbs, int max_oemlen)
{
	uchar_t *p;
	int	mbslen;
	int	rlen;
	boolean_t need_iconv = B_FALSE;

	/*
	 * max_oemlen is used as a size_t below, so a negative
	 * value would turn into a huge buffer length.
	 */
	if (oems == NULL || mbs == NULL || max_oemlen < 0)
		return (-1);

	/*
	 * Check if the mbs is all ASCII (and get the length
	 * while we're at it) so we know if we need to iconv.
	 * We usually can avoid the iconv calls.
	 */
	mbslen = 0;
	p = (uchar_t *)mbs;
	while (*p != '\0') {
		mbslen++;
		if (*p & 0x80)
			need_iconv = B_TRUE;
		p++;
	}

	if (need_iconv) {
		size_t	rc;
		char	*obuf = (char *)oems;
		size_t	olen = max_oemlen;
		size_t	ilen = mbslen;
#if defined(_KERNEL) || defined(_FAKE_KERNEL)
		char *ibuf = (char *)mbs;
		kiconv_t ic;
		int	err;

		ic = kiconv_open(smb_oem_codepage, "UTF-8");
		if (ic == (kiconv_t)-1)
			goto just_copy;
		rc = kiconv(ic, &ibuf, &ilen, &obuf, &olen, &err);
		(void) kiconv_close(ic);
#else	/* _KERNEL || _FAKE_KERNEL */
		const char *ibuf = mbs;
		iconv_t	ic;
		ic = iconv_open(smb_oem_codepage, "UTF-8");
		if (ic == (iconv_t)-1)
			goto just_copy;
		rc = iconv(ic, &ibuf, &ilen, &obuf, &olen);
		(void) iconv_close(ic);
#endif	/* _KERNEL || _FAKE_KERNEL */
		/* Both converters report failure as (size_t)-1. */
		if (rc == (size_t)-1)
			return (-1);
		/* Return val. is output bytes. */
		rlen = (int)((size_t)max_oemlen - olen);
	} else {
	just_copy:
		/* Plain byte copy, truncated to the output size. */
		rlen = mbslen;
		if (rlen > max_oemlen)
			rlen = max_oemlen;
		bcopy(mbs, oems, rlen);
	}
	if (rlen < max_oemlen)
		oems[rlen] = '\0';

	return (rlen);
}