/* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* * Multibyte/wide-char conversion routines. Wide-char encoding provides * a fixed size character encoding that maps to the Unicode 16-bit * (UCS-2) character set standard. Multibyte or UCS transformation * format (UTF) encoding is a variable length character encoding scheme * that is compatible with existing ASCII characters and guarantees that * the resultant strings do not contain embedded null characters. Both * types of encoding provide a null terminator: single byte for UTF-8 * and a wide-char null for Unicode. See RFC 2044. * * The table below illustrates the UTF-8 encoding scheme. The letter x * indicates bits available for encoding the character value. * * UCS-2 UTF-8 octet sequence (binary) * 0x0000-0x007F 0xxxxxxx * 0x0080-0x07FF 110xxxxx 10xxxxxx * 0x0800-0xFFFF 1110xxxx 10xxxxxx 10xxxxxx * * RFC 2044 * UTF-8, a transformation format of UNICODE and ISO 10646 * F.
Yergeau * Alis Technologies * October 1996 */ #pragma ident "%Z%%M% %I% %E% SMI" #ifdef _KERNEL #include #include #else #include #include #include #include #endif #include #include int mbtowc_verbose = 0; int mbtowc_announce = 0; /* * mbstowcs * * The mbstowcs() function converts a multibyte character string * mbstring into a wide character string wcstring. No more than * nwchars wide characters are stored. A terminating null wide * character is appended if there is room. * * Returns the number of wide characters converted, not counting * any terminating null wide character. Returns -1 if an invalid * multibyte character is encountered. */ size_t mts_mbstowcs(mts_wchar_t *wcstring, const char *mbstring, size_t nwchars) { int len; mts_wchar_t *start = wcstring; while (nwchars--) { len = mts_mbtowc(wcstring, mbstring, MTS_MB_CHAR_MAX); if (len < 0) { *wcstring = 0; return ((size_t)-1); } if (*mbstring == 0) break; ++wcstring; mbstring += len; } return (wcstring - start); } /* * mbtowc * * The mbtowc() function converts a multibyte character mbchar into * a wide character and stores the result in the object pointed to * by wcharp. Up to nbytes bytes are examined. * * If mbchar is NULL, mbtowc() returns zero to indicate that shift * states are not supported. If mbchar is valid, returns the number * of bytes processed in mbchar. If mbchar is invalid, returns -1. */ int /*ARGSUSED*/ mts_mbtowc(mts_wchar_t *wcharp, const char *mbchar, size_t nbytes) { unsigned char mbyte; mts_wchar_t wide_char; int count; int bytes_left; if (mbchar == 0) return (0); /* shift states not supported */ /* 0xxxxxxx -> 1 byte ASCII encoding */ if (((mbyte = *mbchar++) & 0x80) == 0) { if (wcharp) *wcharp = (mts_wchar_t)mbyte; return (mbyte ? 
1 : 0); } /* 10xxxxxx -> invalid first byte */ if ((mbyte & 0x40) == 0) { if (mbtowc_verbose || mbtowc_announce == 0) { mbtowc_announce = 1; } return (-1); } wide_char = mbyte; if ((mbyte & 0x20) == 0) { wide_char &= 0x1f; bytes_left = 1; } else if ((mbyte & 0x10) == 0) { wide_char &= 0x0f; bytes_left = 2; } else { if (mbtowc_verbose || mbtowc_announce == 0) { mbtowc_announce = 1; } return (-1); } count = 1; while (bytes_left--) { if (((mbyte = *mbchar++) & 0xc0) != 0x80) { if (mbtowc_verbose || mbtowc_announce == 0) { mbtowc_announce = 1; } return (-1); } count++; wide_char = (wide_char << 6) | (mbyte & 0x3f); } if (wcharp) *wcharp = wide_char; return (count); } /* * wctomb * * The wctomb() function converts a wide character wchar into a multibyte * character and stores the result in mbchar. The object pointed to by * mbchar must be large enough to accommodate the multibyte character. * * Returns the numberof bytes written to mbchar. */ int mts_wctomb(char *mbchar, mts_wchar_t wchar) { #ifdef UTF8_DEBUG char *start = mbchar; #endif if ((wchar & ~0x7f) == 0) { *mbchar = (char)wchar; return (1); } if ((wchar & ~0x7ff) == 0) { *mbchar++ = (wchar >> 6) | 0xc0; *mbchar = (wchar & 0x3f) | 0x80; return (2); } *mbchar++ = (wchar >> 12) | 0xe0; *mbchar++ = ((wchar >> 6) & 0x3f) | 0x80; *mbchar = (wchar & 0x3f) | 0x80; return (3); } /* * wcstombs * * The wcstombs() function converts a wide character string wcstring * into a multibyte character string mbstring. Up to nbytes bytes are * stored in mbstring. Partial multibyte characters at the end of the * string are not stored. The multibyte character string is null * terminated if there is room. * * Returns the number of bytes converted, not counting the terminating * null byte. 
*/ size_t mts_wcstombs(char *mbstring, const mts_wchar_t *wcstring, size_t nbytes) { char *start = mbstring; const mts_wchar_t *wcp = wcstring; mts_wchar_t wide_char; char buf[4]; size_t len; if ((mbstring == 0) || (wcstring == 0)) return (0); while (nbytes > MTS_MB_CHAR_MAX) { wide_char = *wcp++; len = mts_wctomb(mbstring, wide_char); if (wide_char == 0) /*LINTED E_PTRDIFF_OVERFLOW*/ return (mbstring - start); mbstring += len; nbytes -= len; } while (wide_char && nbytes) { wide_char = *wcp++; if ((len = mts_wctomb(buf, wide_char)) > nbytes) { *mbstring = 0; break; } bcopy(buf, mbstring, len); mbstring += len; nbytes -= len; } /*LINTED E_PTRDIFF_OVERFLOW*/ return (mbstring - start); } /* * Returns the number of bytes that would be written if the multi- * byte string mbs was converted to a wide character string, not * counting the terminating null wide character. */ size_t mts_wcequiv_strlen(const char *mbs) { mts_wchar_t wide_char; size_t bytes; size_t len = 0; while (*mbs) { bytes = mts_mbtowc(&wide_char, mbs, MTS_MB_CHAR_MAX); if (bytes == ((size_t)-1)) return ((size_t)-1); len += sizeof (mts_wchar_t); mbs += bytes; } return (len); } /* * Returns the number of bytes that would be written if the multi- * byte string mbs was converted to a single byte character string, * not counting the terminating null character. */ size_t mts_sbequiv_strlen(const char *mbs) { mts_wchar_t wide_char; size_t nbytes; size_t len = 0; while (*mbs) { nbytes = mts_mbtowc(&wide_char, mbs, MTS_MB_CHAR_MAX); if (nbytes == ((size_t)-1)) return ((size_t)-1); if (wide_char & 0xFF00) len += sizeof (mts_wchar_t); else ++len; mbs += nbytes; } return (len); } /* * stombs * * Convert a regular null terminated string 'string' to a UTF-8 encoded * null terminated multi-byte string 'mbstring'. Only full converted * UTF-8 characters will be written 'mbstring'. 
If a character will not * fit within the remaining buffer space or 'mbstring' will overflow * max_mblen, the conversion process will be terminated and 'mbstring' * will be null terminated. * * Returns the number of bytes written to 'mbstring', excluding the * terminating null character. * * If either mbstring or string is a null pointer, -1 is returned. */ int mts_stombs(char *mbstring, char *string, int max_mblen) { char *start = mbstring; unsigned char *p = (unsigned char *)string; int space_left = max_mblen; int len; mts_wchar_t wide_char; char buf[4]; if (!mbstring || !string) return (-1); while (*p && space_left > 2) { wide_char = *p++; len = mts_wctomb(mbstring, wide_char); mbstring += len; space_left -= len; } if (*p) { wide_char = *p; if ((len = mts_wctomb(buf, wide_char)) < 2) { *mbstring = *buf; mbstring += len; space_left -= len; } } *mbstring = '\0'; /*LINTED E_PTRDIFF_OVERFLOW*/ return (mbstring - start); } /* * mbstos * * Convert a null terminated multi-byte string 'mbstring' to a regular * null terminated string 'string'. A 1-byte character in 'mbstring' * maps to a 1-byte character in 'string'. A 2-byte character in * 'mbstring' will be mapped to 2-bytes, if the upper byte is non-null. * Otherwise the upper byte null will be discarded to ensure that the * output stream does not contain embedded null characters. * * If the input stream contains invalid multi-byte characters, a value * of -1 will be returned. Otherwise the length of 'string', excluding * the terminating null character, is returned. * * If either mbstring or string is a null pointer, -1 is returned. 
*/

/*
 * Implementation notes for mts_mbstos():
 *
 * A decoded character whose upper byte is non-zero is copied to the
 * output as the raw bytes of an mts_wchar_t, i.e. in host byte order.
 * The copy is done bytewise because 'string' is a char pointer that
 * advances one byte at a time, so it carries no alignment guarantee
 * for a wider store.
 *
 * There is no output length parameter: the caller must supply a buffer
 * large enough for the result (mts_sbequiv_strlen() computes the
 * required length, excluding the terminator).
 *
 * On an invalid multi-byte character the output is null terminated at
 * the point reached before -1 is returned.
 */
int
mts_mbstos(char *string, const char *mbstring)
{
	mts_wchar_t wc;
	unsigned char *start = (unsigned char *)string;
	int len;

	if (string == 0 || mbstring == 0)
		return (-1);

	while (*mbstring) {
		len = mts_mbtowc(&wc, mbstring, MTS_MB_CHAR_MAX);
		if (len < 0) {
			*string = 0;
			return (-1);
		}

		if (wc & 0xFF00) {
			/* alignment-safe store of both bytes */
			bcopy(&wc, string, sizeof (mts_wchar_t));
			string += sizeof (mts_wchar_t);
		} else {
			/* upper byte is null: emit the lower byte only */
			*string = (char)(unsigned char)wc;
			string++;
		}

		mbstring += len;
	}

	*string = 0;

	/*LINTED E_PTRDIFF_OVERFLOW*/
	return ((int)((unsigned char *)string - start));
}