1da6c28aaSamw /*
2da6c28aaSamw * CDDL HEADER START
3da6c28aaSamw *
4da6c28aaSamw * The contents of this file are subject to the terms of the
5da6c28aaSamw * Common Development and Distribution License (the "License").
6da6c28aaSamw * You may not use this file except in compliance with the License.
7da6c28aaSamw *
8da6c28aaSamw * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9da6c28aaSamw * or http://www.opensolaris.org/os/licensing.
10da6c28aaSamw * See the License for the specific language governing permissions
11da6c28aaSamw * and limitations under the License.
12da6c28aaSamw *
13da6c28aaSamw * When distributing Covered Code, include this CDDL HEADER in each
14da6c28aaSamw * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15da6c28aaSamw * If applicable, add the following below this CDDL HEADER, with the
16da6c28aaSamw * fields enclosed by brackets "[]" replaced with your own identifying
17da6c28aaSamw * information: Portions Copyright [yyyy] [name of copyright owner]
18da6c28aaSamw *
19da6c28aaSamw * CDDL HEADER END
20da6c28aaSamw */
21da6c28aaSamw /*
22bbf6f00cSJordan Brown * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
23da6c28aaSamw * Use is subject to license terms.
24*b819cea2SGordon Ross *
25*b819cea2SGordon Ross * Copyright 2014 Nexenta Systems, Inc. All rights reserved.
26da6c28aaSamw */
27da6c28aaSamw
/*
 * Multibyte/wide-char conversion routines. Wide-char encoding provides
 * a fixed size character encoding that maps to the Unicode 16-bit
 * (UCS-2) character set standard. Multibyte or UCS transformation
 * format (UTF) encoding is a variable length character encoding scheme
 * that is compatible with existing ASCII characters and guarantees that
 * the resultant strings do not contain embedded null characters. Both
 * types of encoding provide a null terminator: single byte for UTF-8
 * and a wide-char null for Unicode. See RFC 2044.
 *
 * The table below illustrates the UTF-8 encoding scheme. The letter x
 * indicates bits available for encoding the character value.
 *
 *	UCS-2			UTF-8 octet sequence (binary)
 *	0x0000-0x007F		0xxxxxxx
 *	0x0080-0x07FF		110xxxxx 10xxxxxx
 *	0x0800-0xFFFF		1110xxxx 10xxxxxx 10xxxxxx
 *
 * RFC 2044
 * UTF-8, a transformation format of UNICODE and ISO 10646
 * F. Yergeau
 * Alis Technologies
 * October 1996
 */
52da6c28aaSamw
53*b819cea2SGordon Ross #if defined(_KERNEL) || defined(_FAKE_KERNEL)
54da6c28aaSamw #include <sys/types.h>
55da6c28aaSamw #include <sys/sunddi.h>
56da6c28aaSamw #else
57da6c28aaSamw #include <stdio.h>
58da6c28aaSamw #include <stdlib.h>
59da6c28aaSamw #include <assert.h>
60da6c28aaSamw #include <strings.h>
61da6c28aaSamw #endif
62da6c28aaSamw #include <smbsrv/string.h>
63da6c28aaSamw
64da6c28aaSamw
65da6c28aaSamw /*
66da6c28aaSamw * mbstowcs
67da6c28aaSamw *
68da6c28aaSamw * The mbstowcs() function converts a multibyte character string
69da6c28aaSamw * mbstring into a wide character string wcstring. No more than
70da6c28aaSamw * nwchars wide characters are stored. A terminating null wide
71da6c28aaSamw * character is appended if there is room.
72da6c28aaSamw *
73da6c28aaSamw * Returns the number of wide characters converted, not counting
74da6c28aaSamw * any terminating null wide character. Returns -1 if an invalid
75da6c28aaSamw * multibyte character is encountered.
76da6c28aaSamw */
77da6c28aaSamw size_t
smb_mbstowcs(smb_wchar_t * wcstring,const char * mbstring,size_t nwchars)78bbf6f00cSJordan Brown smb_mbstowcs(smb_wchar_t *wcstring, const char *mbstring, size_t nwchars)
79da6c28aaSamw {
80da6c28aaSamw int len;
81bbf6f00cSJordan Brown smb_wchar_t *start = wcstring;
82da6c28aaSamw
83da6c28aaSamw while (nwchars--) {
84bbf6f00cSJordan Brown len = smb_mbtowc(wcstring, mbstring, MTS_MB_CHAR_MAX);
85da6c28aaSamw if (len < 0) {
86da6c28aaSamw *wcstring = 0;
87da6c28aaSamw return ((size_t)-1);
88da6c28aaSamw }
89da6c28aaSamw
90da6c28aaSamw if (*mbstring == 0)
91da6c28aaSamw break;
92da6c28aaSamw
93da6c28aaSamw ++wcstring;
94da6c28aaSamw mbstring += len;
95da6c28aaSamw }
96da6c28aaSamw
97da6c28aaSamw return (wcstring - start);
98da6c28aaSamw }
99da6c28aaSamw
100da6c28aaSamw
101da6c28aaSamw /*
102da6c28aaSamw * mbtowc
103da6c28aaSamw *
104da6c28aaSamw * The mbtowc() function converts a multibyte character mbchar into
105da6c28aaSamw * a wide character and stores the result in the object pointed to
106da6c28aaSamw * by wcharp. Up to nbytes bytes are examined.
107da6c28aaSamw *
108da6c28aaSamw * If mbchar is NULL, mbtowc() returns zero to indicate that shift
10955bf511dSas200622 * states are not supported. Shift states are used to switch between
11055bf511dSas200622 * representation modes using reserved bytes to signal shifting
11155bf511dSas200622 * without them being interpreted as characters. If mbchar is null
11255bf511dSas200622 * mbtowc should return non-zero if the current locale requires shift
11355bf511dSas200622 * states. Otherwise it should be return 0.
11455bf511dSas200622 *
11555bf511dSas200622 * If mbchar is non-null, returns the number of bytes processed in
11655bf511dSas200622 * mbchar. If mbchar is invalid, returns -1.
117da6c28aaSamw */
118da6c28aaSamw int /*ARGSUSED*/
smb_mbtowc(smb_wchar_t * wcharp,const char * mbchar,size_t nbytes)119bbf6f00cSJordan Brown smb_mbtowc(smb_wchar_t *wcharp, const char *mbchar, size_t nbytes)
120da6c28aaSamw {
121da6c28aaSamw unsigned char mbyte;
122bbf6f00cSJordan Brown smb_wchar_t wide_char;
123da6c28aaSamw int count;
124da6c28aaSamw int bytes_left;
125da6c28aaSamw
12655bf511dSas200622 if (mbchar == NULL)
12755bf511dSas200622 return (0); /* no shift states */
128da6c28aaSamw
129da6c28aaSamw /* 0xxxxxxx -> 1 byte ASCII encoding */
130da6c28aaSamw if (((mbyte = *mbchar++) & 0x80) == 0) {
131da6c28aaSamw if (wcharp)
132bbf6f00cSJordan Brown *wcharp = (smb_wchar_t)mbyte;
133da6c28aaSamw
134da6c28aaSamw return (mbyte ? 1 : 0);
135da6c28aaSamw }
136da6c28aaSamw
137da6c28aaSamw /* 10xxxxxx -> invalid first byte */
13855bf511dSas200622 if ((mbyte & 0x40) == 0)
139da6c28aaSamw return (-1);
140da6c28aaSamw
141da6c28aaSamw wide_char = mbyte;
142da6c28aaSamw if ((mbyte & 0x20) == 0) {
143da6c28aaSamw wide_char &= 0x1f;
144da6c28aaSamw bytes_left = 1;
145da6c28aaSamw } else if ((mbyte & 0x10) == 0) {
146da6c28aaSamw wide_char &= 0x0f;
147da6c28aaSamw bytes_left = 2;
148da6c28aaSamw } else {
149da6c28aaSamw return (-1);
150da6c28aaSamw }
151da6c28aaSamw
152da6c28aaSamw count = 1;
153da6c28aaSamw while (bytes_left--) {
15455bf511dSas200622 if (((mbyte = *mbchar++) & 0xc0) != 0x80)
155da6c28aaSamw return (-1);
156da6c28aaSamw
157da6c28aaSamw count++;
158da6c28aaSamw wide_char = (wide_char << 6) | (mbyte & 0x3f);
159da6c28aaSamw }
160da6c28aaSamw
161da6c28aaSamw if (wcharp)
162da6c28aaSamw *wcharp = wide_char;
163da6c28aaSamw
164da6c28aaSamw return (count);
165da6c28aaSamw }
166da6c28aaSamw
167da6c28aaSamw
168da6c28aaSamw /*
169da6c28aaSamw * wctomb
170da6c28aaSamw *
171da6c28aaSamw * The wctomb() function converts a wide character wchar into a multibyte
172da6c28aaSamw * character and stores the result in mbchar. The object pointed to by
173da6c28aaSamw * mbchar must be large enough to accommodate the multibyte character.
174da6c28aaSamw *
175da6c28aaSamw * Returns the numberof bytes written to mbchar.
176da6c28aaSamw */
177da6c28aaSamw int
smb_wctomb(char * mbchar,smb_wchar_t wchar)178bbf6f00cSJordan Brown smb_wctomb(char *mbchar, smb_wchar_t wchar)
179da6c28aaSamw {
180da6c28aaSamw if ((wchar & ~0x7f) == 0) {
181da6c28aaSamw *mbchar = (char)wchar;
182da6c28aaSamw return (1);
183da6c28aaSamw }
184da6c28aaSamw
185da6c28aaSamw if ((wchar & ~0x7ff) == 0) {
186da6c28aaSamw *mbchar++ = (wchar >> 6) | 0xc0;
187da6c28aaSamw *mbchar = (wchar & 0x3f) | 0x80;
188da6c28aaSamw return (2);
189da6c28aaSamw }
190da6c28aaSamw
191da6c28aaSamw *mbchar++ = (wchar >> 12) | 0xe0;
192da6c28aaSamw *mbchar++ = ((wchar >> 6) & 0x3f) | 0x80;
193da6c28aaSamw *mbchar = (wchar & 0x3f) | 0x80;
194da6c28aaSamw return (3);
195da6c28aaSamw }
196da6c28aaSamw
197da6c28aaSamw
198da6c28aaSamw /*
199da6c28aaSamw * wcstombs
200da6c28aaSamw *
201da6c28aaSamw * The wcstombs() function converts a wide character string wcstring
202da6c28aaSamw * into a multibyte character string mbstring. Up to nbytes bytes are
203da6c28aaSamw * stored in mbstring. Partial multibyte characters at the end of the
204da6c28aaSamw * string are not stored. The multibyte character string is null
205da6c28aaSamw * terminated if there is room.
206da6c28aaSamw *
207da6c28aaSamw * Returns the number of bytes converted, not counting the terminating
208da6c28aaSamw * null byte.
209da6c28aaSamw */
210da6c28aaSamw size_t
smb_wcstombs(char * mbstring,const smb_wchar_t * wcstring,size_t nbytes)211bbf6f00cSJordan Brown smb_wcstombs(char *mbstring, const smb_wchar_t *wcstring, size_t nbytes)
212da6c28aaSamw {
213da6c28aaSamw char *start = mbstring;
214bbf6f00cSJordan Brown const smb_wchar_t *wcp = wcstring;
215*b819cea2SGordon Ross smb_wchar_t wide_char = 0;
216da6c28aaSamw char buf[4];
217da6c28aaSamw size_t len;
218da6c28aaSamw
21955bf511dSas200622 if ((mbstring == NULL) || (wcstring == NULL))
220da6c28aaSamw return (0);
221da6c28aaSamw
222da6c28aaSamw while (nbytes > MTS_MB_CHAR_MAX) {
223da6c28aaSamw wide_char = *wcp++;
224bbf6f00cSJordan Brown len = smb_wctomb(mbstring, wide_char);
225da6c28aaSamw
226da6c28aaSamw if (wide_char == 0)
227da6c28aaSamw /*LINTED E_PTRDIFF_OVERFLOW*/
228da6c28aaSamw return (mbstring - start);
229da6c28aaSamw
230da6c28aaSamw mbstring += len;
231da6c28aaSamw nbytes -= len;
232da6c28aaSamw }
233da6c28aaSamw
234da6c28aaSamw while (wide_char && nbytes) {
235da6c28aaSamw wide_char = *wcp++;
236bbf6f00cSJordan Brown if ((len = smb_wctomb(buf, wide_char)) > nbytes) {
237da6c28aaSamw *mbstring = 0;
238da6c28aaSamw break;
239da6c28aaSamw }
240da6c28aaSamw
241da6c28aaSamw bcopy(buf, mbstring, len);
242da6c28aaSamw mbstring += len;
243da6c28aaSamw nbytes -= len;
244da6c28aaSamw }
245da6c28aaSamw
246da6c28aaSamw /*LINTED E_PTRDIFF_OVERFLOW*/
247da6c28aaSamw return (mbstring - start);
248da6c28aaSamw }
249da6c28aaSamw
250da6c28aaSamw
251da6c28aaSamw /*
252da6c28aaSamw * Returns the number of bytes that would be written if the multi-
253da6c28aaSamw * byte string mbs was converted to a wide character string, not
254da6c28aaSamw * counting the terminating null wide character.
255da6c28aaSamw */
256da6c28aaSamw size_t
smb_wcequiv_strlen(const char * mbs)257bbf6f00cSJordan Brown smb_wcequiv_strlen(const char *mbs)
258da6c28aaSamw {
259bbf6f00cSJordan Brown smb_wchar_t wide_char;
260da6c28aaSamw size_t bytes;
261da6c28aaSamw size_t len = 0;
262da6c28aaSamw
263da6c28aaSamw while (*mbs) {
264bbf6f00cSJordan Brown bytes = smb_mbtowc(&wide_char, mbs, MTS_MB_CHAR_MAX);
265da6c28aaSamw if (bytes == ((size_t)-1))
266da6c28aaSamw return ((size_t)-1);
267da6c28aaSamw
268bbf6f00cSJordan Brown len += sizeof (smb_wchar_t);
269da6c28aaSamw mbs += bytes;
270da6c28aaSamw }
271da6c28aaSamw
272da6c28aaSamw return (len);
273da6c28aaSamw }
274da6c28aaSamw
275da6c28aaSamw
276da6c28aaSamw /*
277da6c28aaSamw * Returns the number of bytes that would be written if the multi-
278da6c28aaSamw * byte string mbs was converted to a single byte character string,
279da6c28aaSamw * not counting the terminating null character.
280da6c28aaSamw */
281da6c28aaSamw size_t
smb_sbequiv_strlen(const char * mbs)282bbf6f00cSJordan Brown smb_sbequiv_strlen(const char *mbs)
283da6c28aaSamw {
284bbf6f00cSJordan Brown smb_wchar_t wide_char;
285da6c28aaSamw size_t nbytes;
286da6c28aaSamw size_t len = 0;
287da6c28aaSamw
288da6c28aaSamw while (*mbs) {
289bbf6f00cSJordan Brown nbytes = smb_mbtowc(&wide_char, mbs, MTS_MB_CHAR_MAX);
290da6c28aaSamw if (nbytes == ((size_t)-1))
291da6c28aaSamw return ((size_t)-1);
292da6c28aaSamw
293da6c28aaSamw if (wide_char & 0xFF00)
294bbf6f00cSJordan Brown len += sizeof (smb_wchar_t);
295da6c28aaSamw else
296da6c28aaSamw ++len;
297da6c28aaSamw
298da6c28aaSamw mbs += nbytes;
299da6c28aaSamw }
300da6c28aaSamw
301da6c28aaSamw return (len);
302da6c28aaSamw }
303da6c28aaSamw
304da6c28aaSamw
305da6c28aaSamw /*
306da6c28aaSamw * stombs
307da6c28aaSamw *
308da6c28aaSamw * Convert a regular null terminated string 'string' to a UTF-8 encoded
309da6c28aaSamw * null terminated multi-byte string 'mbstring'. Only full converted
310da6c28aaSamw * UTF-8 characters will be written 'mbstring'. If a character will not
311da6c28aaSamw * fit within the remaining buffer space or 'mbstring' will overflow
312da6c28aaSamw * max_mblen, the conversion process will be terminated and 'mbstring'
313da6c28aaSamw * will be null terminated.
314da6c28aaSamw *
315da6c28aaSamw * Returns the number of bytes written to 'mbstring', excluding the
316da6c28aaSamw * terminating null character.
317da6c28aaSamw *
318da6c28aaSamw * If either mbstring or string is a null pointer, -1 is returned.
319da6c28aaSamw */
320da6c28aaSamw int
smb_stombs(char * mbstring,char * string,int max_mblen)321bbf6f00cSJordan Brown smb_stombs(char *mbstring, char *string, int max_mblen)
322da6c28aaSamw {
323da6c28aaSamw char *start = mbstring;
324da6c28aaSamw unsigned char *p = (unsigned char *)string;
325da6c28aaSamw int space_left = max_mblen;
326da6c28aaSamw int len;
327bbf6f00cSJordan Brown smb_wchar_t wide_char;
328da6c28aaSamw char buf[4];
329da6c28aaSamw
330da6c28aaSamw if (!mbstring || !string)
331da6c28aaSamw return (-1);
332da6c28aaSamw
333da6c28aaSamw while (*p && space_left > 2) {
334da6c28aaSamw wide_char = *p++;
335bbf6f00cSJordan Brown len = smb_wctomb(mbstring, wide_char);
336da6c28aaSamw mbstring += len;
337da6c28aaSamw space_left -= len;
338da6c28aaSamw }
339da6c28aaSamw
340da6c28aaSamw if (*p) {
341da6c28aaSamw wide_char = *p;
342bbf6f00cSJordan Brown if ((len = smb_wctomb(buf, wide_char)) < 2) {
343da6c28aaSamw *mbstring = *buf;
344da6c28aaSamw mbstring += len;
345da6c28aaSamw space_left -= len;
346da6c28aaSamw }
347da6c28aaSamw }
348da6c28aaSamw
349da6c28aaSamw *mbstring = '\0';
350da6c28aaSamw
351da6c28aaSamw /*LINTED E_PTRDIFF_OVERFLOW*/
352da6c28aaSamw return (mbstring - start);
353da6c28aaSamw }
354da6c28aaSamw
355da6c28aaSamw
356da6c28aaSamw /*
357da6c28aaSamw * mbstos
358da6c28aaSamw *
359da6c28aaSamw * Convert a null terminated multi-byte string 'mbstring' to a regular
360da6c28aaSamw * null terminated string 'string'. A 1-byte character in 'mbstring'
361da6c28aaSamw * maps to a 1-byte character in 'string'. A 2-byte character in
362da6c28aaSamw * 'mbstring' will be mapped to 2-bytes, if the upper byte is non-null.
363da6c28aaSamw * Otherwise the upper byte null will be discarded to ensure that the
364da6c28aaSamw * output stream does not contain embedded null characters.
365da6c28aaSamw *
366da6c28aaSamw * If the input stream contains invalid multi-byte characters, a value
367da6c28aaSamw * of -1 will be returned. Otherwise the length of 'string', excluding
368da6c28aaSamw * the terminating null character, is returned.
369da6c28aaSamw *
370da6c28aaSamw * If either mbstring or string is a null pointer, -1 is returned.
371da6c28aaSamw */
372da6c28aaSamw int
smb_mbstos(char * string,const char * mbstring)373bbf6f00cSJordan Brown smb_mbstos(char *string, const char *mbstring)
374da6c28aaSamw {
375bbf6f00cSJordan Brown smb_wchar_t wc;
376da6c28aaSamw unsigned char *start = (unsigned char *)string;
377da6c28aaSamw int len;
378da6c28aaSamw
37955bf511dSas200622 if (string == NULL || mbstring == NULL)
380da6c28aaSamw return (-1);
381da6c28aaSamw
382da6c28aaSamw while (*mbstring) {
383bbf6f00cSJordan Brown if ((len = smb_mbtowc(&wc, mbstring, MTS_MB_CHAR_MAX)) < 0) {
384da6c28aaSamw *string = 0;
385da6c28aaSamw return (-1);
386da6c28aaSamw }
387da6c28aaSamw
388da6c28aaSamw if (wc & 0xFF00) {
389da6c28aaSamw /*LINTED E_BAD_PTR_CAST_ALIGN*/
390bbf6f00cSJordan Brown *((smb_wchar_t *)string) = wc;
391bbf6f00cSJordan Brown string += sizeof (smb_wchar_t);
392da6c28aaSamw }
393da6c28aaSamw else
394da6c28aaSamw {
395da6c28aaSamw *string = (unsigned char)wc;
396da6c28aaSamw string++;
397da6c28aaSamw }
398da6c28aaSamw
399da6c28aaSamw mbstring += len;
400da6c28aaSamw }
401da6c28aaSamw
402da6c28aaSamw *string = 0;
403da6c28aaSamw
404da6c28aaSamw /*LINTED E_PTRDIFF_OVERFLOW*/
405da6c28aaSamw return ((unsigned char *)string - start);
406da6c28aaSamw }
407