xref: /titanic_50/usr/src/common/smbsrv/smb_utf8.c (revision bbf6f00c25b6a2bed23c35eac6d62998ecdb338c)
1da6c28aaSamw /*
2da6c28aaSamw  * CDDL HEADER START
3da6c28aaSamw  *
4da6c28aaSamw  * The contents of this file are subject to the terms of the
5da6c28aaSamw  * Common Development and Distribution License (the "License").
6da6c28aaSamw  * You may not use this file except in compliance with the License.
7da6c28aaSamw  *
8da6c28aaSamw  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9da6c28aaSamw  * or http://www.opensolaris.org/os/licensing.
10da6c28aaSamw  * See the License for the specific language governing permissions
11da6c28aaSamw  * and limitations under the License.
12da6c28aaSamw  *
13da6c28aaSamw  * When distributing Covered Code, include this CDDL HEADER in each
14da6c28aaSamw  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15da6c28aaSamw  * If applicable, add the following below this CDDL HEADER, with the
16da6c28aaSamw  * fields enclosed by brackets "[]" replaced with your own identifying
17da6c28aaSamw  * information: Portions Copyright [yyyy] [name of copyright owner]
18da6c28aaSamw  *
19da6c28aaSamw  * CDDL HEADER END
20da6c28aaSamw  */
21da6c28aaSamw /*
22*bbf6f00cSJordan Brown  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23da6c28aaSamw  * Use is subject to license terms.
24da6c28aaSamw  */
25da6c28aaSamw 
26da6c28aaSamw /*
27da6c28aaSamw  * Multibyte/wide-char conversion routines. Wide-char encoding provides
28da6c28aaSamw  * a fixed size character encoding that maps to the Unicode 16-bit
29da6c28aaSamw  * (UCS-2) character set standard. Multibyte or UCS transformation
30da6c28aaSamw  * format (UTF) encoding is a variable length character encoding scheme
31da6c28aaSamw  * that is compatible with existing ASCII characters and guarantees that
32da6c28aaSamw  * the resultant strings do not contain embedded null characters. Both
33da6c28aaSamw  * types of encoding provide a null terminator: single byte for UTF-8
34da6c28aaSamw  * and a wide-char null for Unicode. See RFC 2044.
35da6c28aaSamw  *
36da6c28aaSamw  * The table below illustrates the UTF-8 encoding scheme. The letter x
37da6c28aaSamw  * indicates bits available for encoding the character value.
38da6c28aaSamw  *
39da6c28aaSamw  *	UCS-2			UTF-8 octet sequence (binary)
40da6c28aaSamw  *	0x0000-0x007F	0xxxxxxx
41da6c28aaSamw  *	0x0080-0x07FF	110xxxxx 10xxxxxx
42da6c28aaSamw  *	0x0800-0xFFFF	1110xxxx 10xxxxxx 10xxxxxx
43da6c28aaSamw  *
44da6c28aaSamw  * RFC 2044
45da6c28aaSamw  * UTF-8,a transformation format of UNICODE and ISO 10646
46da6c28aaSamw  * F. Yergeau
47da6c28aaSamw  * Alis Technologies
48da6c28aaSamw  * October 1996
49da6c28aaSamw  */
50da6c28aaSamw 
51da6c28aaSamw #ifdef _KERNEL
52da6c28aaSamw #include <sys/types.h>
53da6c28aaSamw #include <sys/sunddi.h>
54da6c28aaSamw #else
55da6c28aaSamw #include <stdio.h>
56da6c28aaSamw #include <stdlib.h>
57da6c28aaSamw #include <assert.h>
58da6c28aaSamw #include <strings.h>
59da6c28aaSamw #endif
60da6c28aaSamw #include <smbsrv/string.h>
61da6c28aaSamw 
62da6c28aaSamw 
63da6c28aaSamw /*
64da6c28aaSamw  * mbstowcs
65da6c28aaSamw  *
66da6c28aaSamw  * The mbstowcs() function converts a multibyte character string
67da6c28aaSamw  * mbstring into a wide character string wcstring. No more than
68da6c28aaSamw  * nwchars wide characters are stored. A terminating null wide
69da6c28aaSamw  * character is appended if there is room.
70da6c28aaSamw  *
71da6c28aaSamw  * Returns the number of wide characters converted, not counting
72da6c28aaSamw  * any terminating null wide character. Returns -1 if an invalid
73da6c28aaSamw  * multibyte character is encountered.
74da6c28aaSamw  */
75da6c28aaSamw size_t
76*bbf6f00cSJordan Brown smb_mbstowcs(smb_wchar_t *wcstring, const char *mbstring, size_t nwchars)
77da6c28aaSamw {
78da6c28aaSamw 	int len;
79*bbf6f00cSJordan Brown 	smb_wchar_t	*start = wcstring;
80da6c28aaSamw 
81da6c28aaSamw 	while (nwchars--) {
82*bbf6f00cSJordan Brown 		len = smb_mbtowc(wcstring, mbstring, MTS_MB_CHAR_MAX);
83da6c28aaSamw 		if (len < 0) {
84da6c28aaSamw 			*wcstring = 0;
85da6c28aaSamw 			return ((size_t)-1);
86da6c28aaSamw 		}
87da6c28aaSamw 
88da6c28aaSamw 		if (*mbstring == 0)
89da6c28aaSamw 			break;
90da6c28aaSamw 
91da6c28aaSamw 		++wcstring;
92da6c28aaSamw 		mbstring += len;
93da6c28aaSamw 	}
94da6c28aaSamw 
95da6c28aaSamw 	return (wcstring - start);
96da6c28aaSamw }
97da6c28aaSamw 
98da6c28aaSamw 
99da6c28aaSamw /*
100da6c28aaSamw  * mbtowc
101da6c28aaSamw  *
102da6c28aaSamw  * The mbtowc() function converts a multibyte character mbchar into
103da6c28aaSamw  * a wide character and stores the result in the object pointed to
104da6c28aaSamw  * by wcharp. Up to nbytes bytes are examined.
105da6c28aaSamw  *
106da6c28aaSamw  * If mbchar is NULL, mbtowc() returns zero to indicate that shift
10755bf511dSas200622  * states are not supported.  Shift states are used to switch between
10855bf511dSas200622  * representation modes using reserved bytes to signal shifting
10955bf511dSas200622  * without them being interpreted as characters.  If mbchar is null
11055bf511dSas200622  * mbtowc should return non-zero if the current locale requires shift
11155bf511dSas200622  * states.  Otherwise it should be return 0.
11255bf511dSas200622  *
11355bf511dSas200622  * If mbchar is non-null, returns the number of bytes processed in
11455bf511dSas200622  * mbchar.  If mbchar is invalid, returns -1.
115da6c28aaSamw  */
116da6c28aaSamw int /*ARGSUSED*/
117*bbf6f00cSJordan Brown smb_mbtowc(smb_wchar_t *wcharp, const char *mbchar, size_t nbytes)
118da6c28aaSamw {
119da6c28aaSamw 	unsigned char mbyte;
120*bbf6f00cSJordan Brown 	smb_wchar_t wide_char;
121da6c28aaSamw 	int count;
122da6c28aaSamw 	int bytes_left;
123da6c28aaSamw 
12455bf511dSas200622 	if (mbchar == NULL)
12555bf511dSas200622 		return (0); /* no shift states */
126da6c28aaSamw 
127da6c28aaSamw 	/* 0xxxxxxx -> 1 byte ASCII encoding */
128da6c28aaSamw 	if (((mbyte = *mbchar++) & 0x80) == 0) {
129da6c28aaSamw 		if (wcharp)
130*bbf6f00cSJordan Brown 			*wcharp = (smb_wchar_t)mbyte;
131da6c28aaSamw 
132da6c28aaSamw 		return (mbyte ? 1 : 0);
133da6c28aaSamw 	}
134da6c28aaSamw 
135da6c28aaSamw 	/* 10xxxxxx -> invalid first byte */
13655bf511dSas200622 	if ((mbyte & 0x40) == 0)
137da6c28aaSamw 		return (-1);
138da6c28aaSamw 
139da6c28aaSamw 	wide_char = mbyte;
140da6c28aaSamw 	if ((mbyte & 0x20) == 0) {
141da6c28aaSamw 		wide_char &= 0x1f;
142da6c28aaSamw 		bytes_left = 1;
143da6c28aaSamw 	} else if ((mbyte & 0x10) == 0) {
144da6c28aaSamw 		wide_char &= 0x0f;
145da6c28aaSamw 		bytes_left = 2;
146da6c28aaSamw 	} else {
147da6c28aaSamw 		return (-1);
148da6c28aaSamw 	}
149da6c28aaSamw 
150da6c28aaSamw 	count = 1;
151da6c28aaSamw 	while (bytes_left--) {
15255bf511dSas200622 		if (((mbyte = *mbchar++) & 0xc0) != 0x80)
153da6c28aaSamw 			return (-1);
154da6c28aaSamw 
155da6c28aaSamw 		count++;
156da6c28aaSamw 		wide_char = (wide_char << 6) | (mbyte & 0x3f);
157da6c28aaSamw 	}
158da6c28aaSamw 
159da6c28aaSamw 	if (wcharp)
160da6c28aaSamw 		*wcharp = wide_char;
161da6c28aaSamw 
162da6c28aaSamw 	return (count);
163da6c28aaSamw }
164da6c28aaSamw 
165da6c28aaSamw 
166da6c28aaSamw /*
167da6c28aaSamw  * wctomb
168da6c28aaSamw  *
169da6c28aaSamw  * The wctomb() function converts a wide character wchar into a multibyte
170da6c28aaSamw  * character and stores the result in mbchar. The object pointed to by
171da6c28aaSamw  * mbchar must be large enough to accommodate the multibyte character.
172da6c28aaSamw  *
173da6c28aaSamw  * Returns the numberof bytes written to mbchar.
174da6c28aaSamw  */
175da6c28aaSamw int
176*bbf6f00cSJordan Brown smb_wctomb(char *mbchar, smb_wchar_t wchar)
177da6c28aaSamw {
178da6c28aaSamw 	if ((wchar & ~0x7f) == 0) {
179da6c28aaSamw 		*mbchar = (char)wchar;
180da6c28aaSamw 		return (1);
181da6c28aaSamw 	}
182da6c28aaSamw 
183da6c28aaSamw 	if ((wchar & ~0x7ff) == 0) {
184da6c28aaSamw 		*mbchar++ = (wchar >> 6) | 0xc0;
185da6c28aaSamw 		*mbchar = (wchar & 0x3f) | 0x80;
186da6c28aaSamw 		return (2);
187da6c28aaSamw 	}
188da6c28aaSamw 
189da6c28aaSamw 	*mbchar++ = (wchar >> 12) | 0xe0;
190da6c28aaSamw 	*mbchar++ = ((wchar >> 6) & 0x3f) | 0x80;
191da6c28aaSamw 	*mbchar = (wchar & 0x3f) | 0x80;
192da6c28aaSamw 	return (3);
193da6c28aaSamw }
194da6c28aaSamw 
195da6c28aaSamw 
196da6c28aaSamw /*
197da6c28aaSamw  * wcstombs
198da6c28aaSamw  *
199da6c28aaSamw  * The wcstombs() function converts a wide character string wcstring
200da6c28aaSamw  * into a multibyte character string mbstring. Up to nbytes bytes are
201da6c28aaSamw  * stored in mbstring. Partial multibyte characters at the end of the
202da6c28aaSamw  * string are not stored. The multibyte character string is null
203da6c28aaSamw  * terminated if there is room.
204da6c28aaSamw  *
205da6c28aaSamw  * Returns the number of bytes converted, not counting the terminating
206da6c28aaSamw  * null byte.
207da6c28aaSamw  */
208da6c28aaSamw size_t
209*bbf6f00cSJordan Brown smb_wcstombs(char *mbstring, const smb_wchar_t *wcstring, size_t nbytes)
210da6c28aaSamw {
211da6c28aaSamw 	char *start = mbstring;
212*bbf6f00cSJordan Brown 	const smb_wchar_t *wcp = wcstring;
213*bbf6f00cSJordan Brown 	smb_wchar_t wide_char;
214da6c28aaSamw 	char buf[4];
215da6c28aaSamw 	size_t len;
216da6c28aaSamw 
21755bf511dSas200622 	if ((mbstring == NULL) || (wcstring == NULL))
218da6c28aaSamw 		return (0);
219da6c28aaSamw 
220da6c28aaSamw 	while (nbytes > MTS_MB_CHAR_MAX) {
221da6c28aaSamw 		wide_char = *wcp++;
222*bbf6f00cSJordan Brown 		len = smb_wctomb(mbstring, wide_char);
223da6c28aaSamw 
224da6c28aaSamw 		if (wide_char == 0)
225da6c28aaSamw 			/*LINTED E_PTRDIFF_OVERFLOW*/
226da6c28aaSamw 			return (mbstring - start);
227da6c28aaSamw 
228da6c28aaSamw 		mbstring += len;
229da6c28aaSamw 		nbytes -= len;
230da6c28aaSamw 	}
231da6c28aaSamw 
232da6c28aaSamw 	while (wide_char && nbytes) {
233da6c28aaSamw 		wide_char = *wcp++;
234*bbf6f00cSJordan Brown 		if ((len = smb_wctomb(buf, wide_char)) > nbytes) {
235da6c28aaSamw 			*mbstring = 0;
236da6c28aaSamw 			break;
237da6c28aaSamw 		}
238da6c28aaSamw 
239da6c28aaSamw 		bcopy(buf, mbstring, len);
240da6c28aaSamw 		mbstring += len;
241da6c28aaSamw 		nbytes -= len;
242da6c28aaSamw 	}
243da6c28aaSamw 
244da6c28aaSamw 	/*LINTED E_PTRDIFF_OVERFLOW*/
245da6c28aaSamw 	return (mbstring - start);
246da6c28aaSamw }
247da6c28aaSamw 
248da6c28aaSamw 
249da6c28aaSamw /*
250da6c28aaSamw  * Returns the number of bytes that would be written if the multi-
251da6c28aaSamw  * byte string mbs was converted to a wide character string, not
252da6c28aaSamw  * counting the terminating null wide character.
253da6c28aaSamw  */
254da6c28aaSamw size_t
255*bbf6f00cSJordan Brown smb_wcequiv_strlen(const char *mbs)
256da6c28aaSamw {
257*bbf6f00cSJordan Brown 	smb_wchar_t	wide_char;
258da6c28aaSamw 	size_t bytes;
259da6c28aaSamw 	size_t len = 0;
260da6c28aaSamw 
261da6c28aaSamw 	while (*mbs) {
262*bbf6f00cSJordan Brown 		bytes = smb_mbtowc(&wide_char, mbs, MTS_MB_CHAR_MAX);
263da6c28aaSamw 		if (bytes == ((size_t)-1))
264da6c28aaSamw 			return ((size_t)-1);
265da6c28aaSamw 
266*bbf6f00cSJordan Brown 		len += sizeof (smb_wchar_t);
267da6c28aaSamw 		mbs += bytes;
268da6c28aaSamw 	}
269da6c28aaSamw 
270da6c28aaSamw 	return (len);
271da6c28aaSamw }
272da6c28aaSamw 
273da6c28aaSamw 
274da6c28aaSamw /*
275da6c28aaSamw  * Returns the number of bytes that would be written if the multi-
276da6c28aaSamw  * byte string mbs was converted to a single byte character string,
277da6c28aaSamw  * not counting the terminating null character.
278da6c28aaSamw  */
279da6c28aaSamw size_t
280*bbf6f00cSJordan Brown smb_sbequiv_strlen(const char *mbs)
281da6c28aaSamw {
282*bbf6f00cSJordan Brown 	smb_wchar_t	wide_char;
283da6c28aaSamw 	size_t nbytes;
284da6c28aaSamw 	size_t len = 0;
285da6c28aaSamw 
286da6c28aaSamw 	while (*mbs) {
287*bbf6f00cSJordan Brown 		nbytes = smb_mbtowc(&wide_char, mbs, MTS_MB_CHAR_MAX);
288da6c28aaSamw 		if (nbytes == ((size_t)-1))
289da6c28aaSamw 			return ((size_t)-1);
290da6c28aaSamw 
291da6c28aaSamw 		if (wide_char & 0xFF00)
292*bbf6f00cSJordan Brown 			len += sizeof (smb_wchar_t);
293da6c28aaSamw 		else
294da6c28aaSamw 			++len;
295da6c28aaSamw 
296da6c28aaSamw 		mbs += nbytes;
297da6c28aaSamw 	}
298da6c28aaSamw 
299da6c28aaSamw 	return (len);
300da6c28aaSamw }
301da6c28aaSamw 
302da6c28aaSamw 
303da6c28aaSamw /*
304da6c28aaSamw  * stombs
305da6c28aaSamw  *
306da6c28aaSamw  * Convert a regular null terminated string 'string' to a UTF-8 encoded
307da6c28aaSamw  * null terminated multi-byte string 'mbstring'. Only full converted
308da6c28aaSamw  * UTF-8 characters will be written 'mbstring'. If a character will not
309da6c28aaSamw  * fit within the remaining buffer space or 'mbstring' will overflow
310da6c28aaSamw  * max_mblen, the conversion process will be terminated and 'mbstring'
311da6c28aaSamw  * will be null terminated.
312da6c28aaSamw  *
313da6c28aaSamw  * Returns the number of bytes written to 'mbstring', excluding the
314da6c28aaSamw  * terminating null character.
315da6c28aaSamw  *
316da6c28aaSamw  * If either mbstring or string is a null pointer, -1 is returned.
317da6c28aaSamw  */
318da6c28aaSamw int
319*bbf6f00cSJordan Brown smb_stombs(char *mbstring, char *string, int max_mblen)
320da6c28aaSamw {
321da6c28aaSamw 	char *start = mbstring;
322da6c28aaSamw 	unsigned char *p = (unsigned char *)string;
323da6c28aaSamw 	int space_left = max_mblen;
324da6c28aaSamw 	int	len;
325*bbf6f00cSJordan Brown 	smb_wchar_t	wide_char;
326da6c28aaSamw 	char buf[4];
327da6c28aaSamw 
328da6c28aaSamw 	if (!mbstring || !string)
329da6c28aaSamw 		return (-1);
330da6c28aaSamw 
331da6c28aaSamw 	while (*p && space_left > 2) {
332da6c28aaSamw 		wide_char = *p++;
333*bbf6f00cSJordan Brown 		len = smb_wctomb(mbstring, wide_char);
334da6c28aaSamw 		mbstring += len;
335da6c28aaSamw 		space_left -= len;
336da6c28aaSamw 	}
337da6c28aaSamw 
338da6c28aaSamw 	if (*p) {
339da6c28aaSamw 		wide_char = *p;
340*bbf6f00cSJordan Brown 		if ((len = smb_wctomb(buf, wide_char)) < 2) {
341da6c28aaSamw 			*mbstring = *buf;
342da6c28aaSamw 			mbstring += len;
343da6c28aaSamw 			space_left -= len;
344da6c28aaSamw 		}
345da6c28aaSamw 	}
346da6c28aaSamw 
347da6c28aaSamw 	*mbstring = '\0';
348da6c28aaSamw 
349da6c28aaSamw 	/*LINTED E_PTRDIFF_OVERFLOW*/
350da6c28aaSamw 	return (mbstring - start);
351da6c28aaSamw }
352da6c28aaSamw 
353da6c28aaSamw 
354da6c28aaSamw /*
355da6c28aaSamw  * mbstos
356da6c28aaSamw  *
357da6c28aaSamw  * Convert a null terminated multi-byte string 'mbstring' to a regular
358da6c28aaSamw  * null terminated string 'string'.  A 1-byte character in 'mbstring'
359da6c28aaSamw  * maps to a 1-byte character in 'string'. A 2-byte character in
360da6c28aaSamw  * 'mbstring' will be mapped to 2-bytes, if the upper byte is non-null.
361da6c28aaSamw  * Otherwise the upper byte null will be discarded to ensure that the
362da6c28aaSamw  * output stream does not contain embedded null characters.
363da6c28aaSamw  *
364da6c28aaSamw  * If the input stream contains invalid multi-byte characters, a value
365da6c28aaSamw  * of -1 will be returned. Otherwise the length of 'string', excluding
366da6c28aaSamw  * the terminating null character, is returned.
367da6c28aaSamw  *
368da6c28aaSamw  * If either mbstring or string is a null pointer, -1 is returned.
369da6c28aaSamw  */
370da6c28aaSamw int
371*bbf6f00cSJordan Brown smb_mbstos(char *string, const char *mbstring)
372da6c28aaSamw {
373*bbf6f00cSJordan Brown 	smb_wchar_t wc;
374da6c28aaSamw 	unsigned char *start = (unsigned char *)string;
375da6c28aaSamw 	int len;
376da6c28aaSamw 
37755bf511dSas200622 	if (string == NULL || mbstring == NULL)
378da6c28aaSamw 		return (-1);
379da6c28aaSamw 
380da6c28aaSamw 	while (*mbstring) {
381*bbf6f00cSJordan Brown 		if ((len = smb_mbtowc(&wc, mbstring, MTS_MB_CHAR_MAX)) < 0) {
382da6c28aaSamw 			*string = 0;
383da6c28aaSamw 			return (-1);
384da6c28aaSamw 		}
385da6c28aaSamw 
386da6c28aaSamw 		if (wc & 0xFF00) {
387da6c28aaSamw 			/*LINTED E_BAD_PTR_CAST_ALIGN*/
388*bbf6f00cSJordan Brown 			*((smb_wchar_t *)string) = wc;
389*bbf6f00cSJordan Brown 			string += sizeof (smb_wchar_t);
390da6c28aaSamw 		}
391da6c28aaSamw 		else
392da6c28aaSamw 		{
393da6c28aaSamw 			*string = (unsigned char)wc;
394da6c28aaSamw 			string++;
395da6c28aaSamw 		}
396da6c28aaSamw 
397da6c28aaSamw 		mbstring += len;
398da6c28aaSamw 	}
399da6c28aaSamw 
400da6c28aaSamw 	*string = 0;
401da6c28aaSamw 
402da6c28aaSamw 	/*LINTED E_PTRDIFF_OVERFLOW*/
403da6c28aaSamw 	return ((unsigned char *)string - start);
404da6c28aaSamw }
405