xref: /illumos-gate/usr/src/common/smbsrv/smb_utf8.c (revision da6c28aaf62fa55f0fdb8004aa40f88f23bf53f0)
1*da6c28aaSamw /*
2*da6c28aaSamw  * CDDL HEADER START
3*da6c28aaSamw  *
4*da6c28aaSamw  * The contents of this file are subject to the terms of the
5*da6c28aaSamw  * Common Development and Distribution License (the "License").
6*da6c28aaSamw  * You may not use this file except in compliance with the License.
7*da6c28aaSamw  *
8*da6c28aaSamw  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9*da6c28aaSamw  * or http://www.opensolaris.org/os/licensing.
10*da6c28aaSamw  * See the License for the specific language governing permissions
11*da6c28aaSamw  * and limitations under the License.
12*da6c28aaSamw  *
13*da6c28aaSamw  * When distributing Covered Code, include this CDDL HEADER in each
14*da6c28aaSamw  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15*da6c28aaSamw  * If applicable, add the following below this CDDL HEADER, with the
16*da6c28aaSamw  * fields enclosed by brackets "[]" replaced with your own identifying
17*da6c28aaSamw  * information: Portions Copyright [yyyy] [name of copyright owner]
18*da6c28aaSamw  *
19*da6c28aaSamw  * CDDL HEADER END
20*da6c28aaSamw  */
21*da6c28aaSamw /*
22*da6c28aaSamw  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23*da6c28aaSamw  * Use is subject to license terms.
24*da6c28aaSamw  */
25*da6c28aaSamw 
26*da6c28aaSamw /*
27*da6c28aaSamw  * Multibyte/wide-char conversion routines. Wide-char encoding provides
28*da6c28aaSamw  * a fixed size character encoding that maps to the Unicode 16-bit
29*da6c28aaSamw  * (UCS-2) character set standard. Multibyte or UCS transformation
30*da6c28aaSamw  * format (UTF) encoding is a variable length character encoding scheme
31*da6c28aaSamw  * that is compatible with existing ASCII characters and guarantees that
32*da6c28aaSamw  * the resultant strings do not contain embedded null characters. Both
33*da6c28aaSamw  * types of encoding provide a null terminator: single byte for UTF-8
34*da6c28aaSamw  * and a wide-char null for Unicode. See RFC 2044.
35*da6c28aaSamw  *
36*da6c28aaSamw  * The table below illustrates the UTF-8 encoding scheme. The letter x
37*da6c28aaSamw  * indicates bits available for encoding the character value.
38*da6c28aaSamw  *
39*da6c28aaSamw  *	UCS-2			UTF-8 octet sequence (binary)
40*da6c28aaSamw  *	0x0000-0x007F	0xxxxxxx
41*da6c28aaSamw  *	0x0080-0x07FF	110xxxxx 10xxxxxx
42*da6c28aaSamw  *	0x0800-0xFFFF	1110xxxx 10xxxxxx 10xxxxxx
43*da6c28aaSamw  *
44*da6c28aaSamw  * RFC 2044
45*da6c28aaSamw  * UTF-8, a transformation format of Unicode and ISO 10646
46*da6c28aaSamw  * F. Yergeau
47*da6c28aaSamw  * Alis Technologies
48*da6c28aaSamw  * October 1996
49*da6c28aaSamw  */
50*da6c28aaSamw 
51*da6c28aaSamw #pragma ident	"%Z%%M%	%I%	%E% SMI"
52*da6c28aaSamw 
53*da6c28aaSamw #ifdef _KERNEL
54*da6c28aaSamw #include <sys/types.h>
55*da6c28aaSamw #include <sys/sunddi.h>
56*da6c28aaSamw #else
57*da6c28aaSamw #include <stdio.h>
58*da6c28aaSamw #include <stdlib.h>
59*da6c28aaSamw #include <assert.h>
60*da6c28aaSamw #include <strings.h>
61*da6c28aaSamw #endif
62*da6c28aaSamw #include <smbsrv/smb_i18n.h>
63*da6c28aaSamw #include <smbsrv/string.h>
64*da6c28aaSamw 
/*
 * Diagnostic flags used by mts_mbtowc(). When mts_mbtowc() rejects an
 * invalid multibyte sequence it sets mbtowc_announce to 1, provided
 * that mbtowc_verbose is non-zero or mbtowc_announce is still 0. The
 * code visible in this file does not otherwise read or report them.
 */
int mbtowc_verbose = 0;
int mbtowc_announce = 0;
67*da6c28aaSamw 
68*da6c28aaSamw /*
69*da6c28aaSamw  * mbstowcs
70*da6c28aaSamw  *
71*da6c28aaSamw  * The mbstowcs() function converts a multibyte character string
72*da6c28aaSamw  * mbstring into a wide character string wcstring. No more than
73*da6c28aaSamw  * nwchars wide characters are stored. A terminating null wide
74*da6c28aaSamw  * character is appended if there is room.
75*da6c28aaSamw  *
76*da6c28aaSamw  * Returns the number of wide characters converted, not counting
77*da6c28aaSamw  * any terminating null wide character. Returns -1 if an invalid
78*da6c28aaSamw  * multibyte character is encountered.
79*da6c28aaSamw  */
80*da6c28aaSamw size_t
81*da6c28aaSamw mts_mbstowcs(mts_wchar_t *wcstring, const char *mbstring, size_t nwchars)
82*da6c28aaSamw {
83*da6c28aaSamw 	int len;
84*da6c28aaSamw 	mts_wchar_t	*start = wcstring;
85*da6c28aaSamw 
86*da6c28aaSamw 	while (nwchars--) {
87*da6c28aaSamw 		len = mts_mbtowc(wcstring, mbstring, MTS_MB_CHAR_MAX);
88*da6c28aaSamw 		if (len < 0) {
89*da6c28aaSamw 			*wcstring = 0;
90*da6c28aaSamw 			return ((size_t)-1);
91*da6c28aaSamw 		}
92*da6c28aaSamw 
93*da6c28aaSamw 		if (*mbstring == 0)
94*da6c28aaSamw 			break;
95*da6c28aaSamw 
96*da6c28aaSamw 		++wcstring;
97*da6c28aaSamw 		mbstring += len;
98*da6c28aaSamw 	}
99*da6c28aaSamw 
100*da6c28aaSamw 	return (wcstring - start);
101*da6c28aaSamw }
102*da6c28aaSamw 
103*da6c28aaSamw 
104*da6c28aaSamw /*
105*da6c28aaSamw  * mbtowc
106*da6c28aaSamw  *
107*da6c28aaSamw  * The mbtowc() function converts a multibyte character mbchar into
108*da6c28aaSamw  * a wide character and stores the result in the object pointed to
109*da6c28aaSamw  * by wcharp. Up to nbytes bytes are examined.
110*da6c28aaSamw  *
111*da6c28aaSamw  * If mbchar is NULL, mbtowc() returns zero to indicate that shift
112*da6c28aaSamw  * states are not supported. If mbchar is valid, returns the number
113*da6c28aaSamw  * of bytes processed in mbchar. If mbchar is invalid, returns -1.
114*da6c28aaSamw  */
115*da6c28aaSamw int /*ARGSUSED*/
116*da6c28aaSamw mts_mbtowc(mts_wchar_t *wcharp, const char *mbchar, size_t nbytes)
117*da6c28aaSamw {
118*da6c28aaSamw 	unsigned char mbyte;
119*da6c28aaSamw 	mts_wchar_t wide_char;
120*da6c28aaSamw 	int count;
121*da6c28aaSamw 	int bytes_left;
122*da6c28aaSamw 
123*da6c28aaSamw 	if (mbchar == 0)
124*da6c28aaSamw 		return (0); /* shift states not supported */
125*da6c28aaSamw 
126*da6c28aaSamw 	/* 0xxxxxxx -> 1 byte ASCII encoding */
127*da6c28aaSamw 	if (((mbyte = *mbchar++) & 0x80) == 0) {
128*da6c28aaSamw 		if (wcharp)
129*da6c28aaSamw 			*wcharp = (mts_wchar_t)mbyte;
130*da6c28aaSamw 
131*da6c28aaSamw 		return (mbyte ? 1 : 0);
132*da6c28aaSamw 	}
133*da6c28aaSamw 
134*da6c28aaSamw 	/* 10xxxxxx -> invalid first byte */
135*da6c28aaSamw 	if ((mbyte & 0x40) == 0) {
136*da6c28aaSamw 		if (mbtowc_verbose || mbtowc_announce == 0) {
137*da6c28aaSamw 			mbtowc_announce = 1;
138*da6c28aaSamw 		}
139*da6c28aaSamw 		return (-1);
140*da6c28aaSamw 	}
141*da6c28aaSamw 
142*da6c28aaSamw 	wide_char = mbyte;
143*da6c28aaSamw 	if ((mbyte & 0x20) == 0) {
144*da6c28aaSamw 		wide_char &= 0x1f;
145*da6c28aaSamw 		bytes_left = 1;
146*da6c28aaSamw 	} else if ((mbyte & 0x10) == 0) {
147*da6c28aaSamw 		wide_char &= 0x0f;
148*da6c28aaSamw 		bytes_left = 2;
149*da6c28aaSamw 	} else {
150*da6c28aaSamw 		if (mbtowc_verbose || mbtowc_announce == 0) {
151*da6c28aaSamw 			mbtowc_announce = 1;
152*da6c28aaSamw 		}
153*da6c28aaSamw 		return (-1);
154*da6c28aaSamw 	}
155*da6c28aaSamw 
156*da6c28aaSamw 	count = 1;
157*da6c28aaSamw 	while (bytes_left--) {
158*da6c28aaSamw 		if (((mbyte = *mbchar++) & 0xc0) != 0x80) {
159*da6c28aaSamw 			if (mbtowc_verbose || mbtowc_announce == 0) {
160*da6c28aaSamw 				mbtowc_announce = 1;
161*da6c28aaSamw 			}
162*da6c28aaSamw 			return (-1);
163*da6c28aaSamw 		}
164*da6c28aaSamw 
165*da6c28aaSamw 		count++;
166*da6c28aaSamw 		wide_char = (wide_char << 6) | (mbyte & 0x3f);
167*da6c28aaSamw 	}
168*da6c28aaSamw 
169*da6c28aaSamw 	if (wcharp)
170*da6c28aaSamw 		*wcharp = wide_char;
171*da6c28aaSamw 
172*da6c28aaSamw 	return (count);
173*da6c28aaSamw }
174*da6c28aaSamw 
175*da6c28aaSamw 
176*da6c28aaSamw /*
177*da6c28aaSamw  * wctomb
178*da6c28aaSamw  *
179*da6c28aaSamw  * The wctomb() function converts a wide character wchar into a multibyte
180*da6c28aaSamw  * character and stores the result in mbchar. The object pointed to by
181*da6c28aaSamw  * mbchar must be large enough to accommodate the multibyte character.
182*da6c28aaSamw  *
183*da6c28aaSamw  * Returns the numberof bytes written to mbchar.
184*da6c28aaSamw  */
185*da6c28aaSamw int
186*da6c28aaSamw mts_wctomb(char *mbchar, mts_wchar_t wchar)
187*da6c28aaSamw {
188*da6c28aaSamw #ifdef UTF8_DEBUG
189*da6c28aaSamw 	char *start = mbchar;
190*da6c28aaSamw #endif
191*da6c28aaSamw 
192*da6c28aaSamw 	if ((wchar & ~0x7f) == 0) {
193*da6c28aaSamw 		*mbchar = (char)wchar;
194*da6c28aaSamw 		return (1);
195*da6c28aaSamw 	}
196*da6c28aaSamw 
197*da6c28aaSamw 	if ((wchar & ~0x7ff) == 0) {
198*da6c28aaSamw 		*mbchar++ = (wchar >> 6) | 0xc0;
199*da6c28aaSamw 		*mbchar = (wchar & 0x3f) | 0x80;
200*da6c28aaSamw 		return (2);
201*da6c28aaSamw 	}
202*da6c28aaSamw 
203*da6c28aaSamw 	*mbchar++ = (wchar >> 12) | 0xe0;
204*da6c28aaSamw 	*mbchar++ = ((wchar >> 6) & 0x3f) | 0x80;
205*da6c28aaSamw 	*mbchar = (wchar & 0x3f) | 0x80;
206*da6c28aaSamw 	return (3);
207*da6c28aaSamw }
208*da6c28aaSamw 
209*da6c28aaSamw 
210*da6c28aaSamw /*
211*da6c28aaSamw  * wcstombs
212*da6c28aaSamw  *
213*da6c28aaSamw  * The wcstombs() function converts a wide character string wcstring
214*da6c28aaSamw  * into a multibyte character string mbstring. Up to nbytes bytes are
215*da6c28aaSamw  * stored in mbstring. Partial multibyte characters at the end of the
216*da6c28aaSamw  * string are not stored. The multibyte character string is null
217*da6c28aaSamw  * terminated if there is room.
218*da6c28aaSamw  *
219*da6c28aaSamw  * Returns the number of bytes converted, not counting the terminating
220*da6c28aaSamw  * null byte.
221*da6c28aaSamw  */
222*da6c28aaSamw size_t
223*da6c28aaSamw mts_wcstombs(char *mbstring, const mts_wchar_t *wcstring, size_t nbytes)
224*da6c28aaSamw {
225*da6c28aaSamw 	char *start = mbstring;
226*da6c28aaSamw 	const mts_wchar_t *wcp = wcstring;
227*da6c28aaSamw 	mts_wchar_t wide_char;
228*da6c28aaSamw 	char buf[4];
229*da6c28aaSamw 	size_t len;
230*da6c28aaSamw 
231*da6c28aaSamw 	if ((mbstring == 0) || (wcstring == 0))
232*da6c28aaSamw 		return (0);
233*da6c28aaSamw 
234*da6c28aaSamw 	while (nbytes > MTS_MB_CHAR_MAX) {
235*da6c28aaSamw 		wide_char = *wcp++;
236*da6c28aaSamw 		len = mts_wctomb(mbstring, wide_char);
237*da6c28aaSamw 
238*da6c28aaSamw 		if (wide_char == 0)
239*da6c28aaSamw 			/*LINTED E_PTRDIFF_OVERFLOW*/
240*da6c28aaSamw 			return (mbstring - start);
241*da6c28aaSamw 
242*da6c28aaSamw 		mbstring += len;
243*da6c28aaSamw 		nbytes -= len;
244*da6c28aaSamw 	}
245*da6c28aaSamw 
246*da6c28aaSamw 	while (wide_char && nbytes) {
247*da6c28aaSamw 		wide_char = *wcp++;
248*da6c28aaSamw 		if ((len = mts_wctomb(buf, wide_char)) > nbytes) {
249*da6c28aaSamw 			*mbstring = 0;
250*da6c28aaSamw 			break;
251*da6c28aaSamw 		}
252*da6c28aaSamw 
253*da6c28aaSamw 		bcopy(buf, mbstring, len);
254*da6c28aaSamw 		mbstring += len;
255*da6c28aaSamw 		nbytes -= len;
256*da6c28aaSamw 	}
257*da6c28aaSamw 
258*da6c28aaSamw 	/*LINTED E_PTRDIFF_OVERFLOW*/
259*da6c28aaSamw 	return (mbstring - start);
260*da6c28aaSamw }
261*da6c28aaSamw 
262*da6c28aaSamw 
263*da6c28aaSamw /*
264*da6c28aaSamw  * Returns the number of bytes that would be written if the multi-
265*da6c28aaSamw  * byte string mbs was converted to a wide character string, not
266*da6c28aaSamw  * counting the terminating null wide character.
267*da6c28aaSamw  */
268*da6c28aaSamw size_t
269*da6c28aaSamw mts_wcequiv_strlen(const char *mbs)
270*da6c28aaSamw {
271*da6c28aaSamw 	mts_wchar_t	wide_char;
272*da6c28aaSamw 	size_t bytes;
273*da6c28aaSamw 	size_t len = 0;
274*da6c28aaSamw 
275*da6c28aaSamw 	while (*mbs) {
276*da6c28aaSamw 		bytes = mts_mbtowc(&wide_char, mbs, MTS_MB_CHAR_MAX);
277*da6c28aaSamw 		if (bytes == ((size_t)-1))
278*da6c28aaSamw 			return ((size_t)-1);
279*da6c28aaSamw 
280*da6c28aaSamw 		len += sizeof (mts_wchar_t);
281*da6c28aaSamw 		mbs += bytes;
282*da6c28aaSamw 	}
283*da6c28aaSamw 
284*da6c28aaSamw 	return (len);
285*da6c28aaSamw }
286*da6c28aaSamw 
287*da6c28aaSamw 
288*da6c28aaSamw /*
289*da6c28aaSamw  * Returns the number of bytes that would be written if the multi-
290*da6c28aaSamw  * byte string mbs was converted to a single byte character string,
291*da6c28aaSamw  * not counting the terminating null character.
292*da6c28aaSamw  */
293*da6c28aaSamw size_t
294*da6c28aaSamw mts_sbequiv_strlen(const char *mbs)
295*da6c28aaSamw {
296*da6c28aaSamw 	mts_wchar_t	wide_char;
297*da6c28aaSamw 	size_t nbytes;
298*da6c28aaSamw 	size_t len = 0;
299*da6c28aaSamw 
300*da6c28aaSamw 	while (*mbs) {
301*da6c28aaSamw 		nbytes = mts_mbtowc(&wide_char, mbs, MTS_MB_CHAR_MAX);
302*da6c28aaSamw 		if (nbytes == ((size_t)-1))
303*da6c28aaSamw 			return ((size_t)-1);
304*da6c28aaSamw 
305*da6c28aaSamw 		if (wide_char & 0xFF00)
306*da6c28aaSamw 			len += sizeof (mts_wchar_t);
307*da6c28aaSamw 		else
308*da6c28aaSamw 			++len;
309*da6c28aaSamw 
310*da6c28aaSamw 		mbs += nbytes;
311*da6c28aaSamw 	}
312*da6c28aaSamw 
313*da6c28aaSamw 	return (len);
314*da6c28aaSamw }
315*da6c28aaSamw 
316*da6c28aaSamw 
317*da6c28aaSamw /*
318*da6c28aaSamw  * stombs
319*da6c28aaSamw  *
320*da6c28aaSamw  * Convert a regular null terminated string 'string' to a UTF-8 encoded
321*da6c28aaSamw  * null terminated multi-byte string 'mbstring'. Only full converted
322*da6c28aaSamw  * UTF-8 characters will be written 'mbstring'. If a character will not
323*da6c28aaSamw  * fit within the remaining buffer space or 'mbstring' will overflow
324*da6c28aaSamw  * max_mblen, the conversion process will be terminated and 'mbstring'
325*da6c28aaSamw  * will be null terminated.
326*da6c28aaSamw  *
327*da6c28aaSamw  * Returns the number of bytes written to 'mbstring', excluding the
328*da6c28aaSamw  * terminating null character.
329*da6c28aaSamw  *
330*da6c28aaSamw  * If either mbstring or string is a null pointer, -1 is returned.
331*da6c28aaSamw  */
332*da6c28aaSamw int
333*da6c28aaSamw mts_stombs(char *mbstring, char *string, int max_mblen)
334*da6c28aaSamw {
335*da6c28aaSamw 	char *start = mbstring;
336*da6c28aaSamw 	unsigned char *p = (unsigned char *)string;
337*da6c28aaSamw 	int space_left = max_mblen;
338*da6c28aaSamw 	int	len;
339*da6c28aaSamw 	mts_wchar_t	wide_char;
340*da6c28aaSamw 	char buf[4];
341*da6c28aaSamw 
342*da6c28aaSamw 	if (!mbstring || !string)
343*da6c28aaSamw 		return (-1);
344*da6c28aaSamw 
345*da6c28aaSamw 	while (*p && space_left > 2) {
346*da6c28aaSamw 		wide_char = *p++;
347*da6c28aaSamw 		len = mts_wctomb(mbstring, wide_char);
348*da6c28aaSamw 		mbstring += len;
349*da6c28aaSamw 		space_left -= len;
350*da6c28aaSamw 	}
351*da6c28aaSamw 
352*da6c28aaSamw 	if (*p) {
353*da6c28aaSamw 		wide_char = *p;
354*da6c28aaSamw 		if ((len = mts_wctomb(buf, wide_char)) < 2) {
355*da6c28aaSamw 			*mbstring = *buf;
356*da6c28aaSamw 			mbstring += len;
357*da6c28aaSamw 			space_left -= len;
358*da6c28aaSamw 		}
359*da6c28aaSamw 	}
360*da6c28aaSamw 
361*da6c28aaSamw 	*mbstring = '\0';
362*da6c28aaSamw 
363*da6c28aaSamw 	/*LINTED E_PTRDIFF_OVERFLOW*/
364*da6c28aaSamw 	return (mbstring - start);
365*da6c28aaSamw }
366*da6c28aaSamw 
367*da6c28aaSamw 
368*da6c28aaSamw /*
369*da6c28aaSamw  * mbstos
370*da6c28aaSamw  *
371*da6c28aaSamw  * Convert a null terminated multi-byte string 'mbstring' to a regular
372*da6c28aaSamw  * null terminated string 'string'.  A 1-byte character in 'mbstring'
373*da6c28aaSamw  * maps to a 1-byte character in 'string'. A 2-byte character in
374*da6c28aaSamw  * 'mbstring' will be mapped to 2-bytes, if the upper byte is non-null.
375*da6c28aaSamw  * Otherwise the upper byte null will be discarded to ensure that the
376*da6c28aaSamw  * output stream does not contain embedded null characters.
377*da6c28aaSamw  *
378*da6c28aaSamw  * If the input stream contains invalid multi-byte characters, a value
379*da6c28aaSamw  * of -1 will be returned. Otherwise the length of 'string', excluding
380*da6c28aaSamw  * the terminating null character, is returned.
381*da6c28aaSamw  *
382*da6c28aaSamw  * If either mbstring or string is a null pointer, -1 is returned.
383*da6c28aaSamw  */
384*da6c28aaSamw int
385*da6c28aaSamw mts_mbstos(char *string, const char *mbstring)
386*da6c28aaSamw {
387*da6c28aaSamw 	mts_wchar_t wc;
388*da6c28aaSamw 	unsigned char *start = (unsigned char *)string;
389*da6c28aaSamw 	int len;
390*da6c28aaSamw 
391*da6c28aaSamw 	if (string == 0 || mbstring == 0)
392*da6c28aaSamw 		return (-1);
393*da6c28aaSamw 
394*da6c28aaSamw 	while (*mbstring) {
395*da6c28aaSamw 		if ((len = mts_mbtowc(&wc, mbstring, MTS_MB_CHAR_MAX)) < 0) {
396*da6c28aaSamw 			*string = 0;
397*da6c28aaSamw 			return (-1);
398*da6c28aaSamw 		}
399*da6c28aaSamw 
400*da6c28aaSamw 		if (wc & 0xFF00) {
401*da6c28aaSamw 			/*LINTED E_BAD_PTR_CAST_ALIGN*/
402*da6c28aaSamw 			*((mts_wchar_t *)string) = wc;
403*da6c28aaSamw 			string += sizeof (mts_wchar_t);
404*da6c28aaSamw 		}
405*da6c28aaSamw 		else
406*da6c28aaSamw 		{
407*da6c28aaSamw 			*string = (unsigned char)wc;
408*da6c28aaSamw 			string++;
409*da6c28aaSamw 		}
410*da6c28aaSamw 
411*da6c28aaSamw 		mbstring += len;
412*da6c28aaSamw 	}
413*da6c28aaSamw 
414*da6c28aaSamw 	*string = 0;
415*da6c28aaSamw 
416*da6c28aaSamw 	/*LINTED E_PTRDIFF_OVERFLOW*/
417*da6c28aaSamw 	return ((unsigned char *)string - start);
418*da6c28aaSamw }
419