1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
24 *
25 * Copyright 2014 Nexenta Systems, Inc. All rights reserved.
26 */
27
/*
 * Multibyte/wide-char conversion routines. Wide-char encoding provides
 * a fixed size character encoding that maps to the Unicode 16-bit
 * (UCS-2) character set standard. Multibyte or UCS transformation
 * format (UTF) encoding is a variable length character encoding scheme
 * that is compatible with existing ASCII characters and guarantees that
 * the resultant strings do not contain embedded null characters. Both
 * types of encoding provide a null terminator: single byte for UTF-8
 * and a wide-char null for Unicode. See RFC 2044.
 *
 * The table below illustrates the UTF-8 encoding scheme. The letter x
 * indicates bits available for encoding the character value.
 *
 *	UCS-2			UTF-8 octet sequence (binary)
 *	0x0000-0x007F		0xxxxxxx
 *	0x0080-0x07FF		110xxxxx 10xxxxxx
 *	0x0800-0xFFFF		1110xxxx 10xxxxxx 10xxxxxx
 *
 * RFC 2044
 * UTF-8, a transformation format of UNICODE and ISO 10646
 * F. Yergeau
 * Alis Technologies
 * October 1996
 */
52
53 #if defined(_KERNEL) || defined(_FAKE_KERNEL)
54 #include <sys/types.h>
55 #include <sys/sunddi.h>
56 #else
57 #include <stdio.h>
58 #include <stdlib.h>
59 #include <assert.h>
60 #include <strings.h>
61 #endif
62 #include <smbsrv/string.h>
63
64
65 /*
66 * mbstowcs
67 *
68 * The mbstowcs() function converts a multibyte character string
69 * mbstring into a wide character string wcstring. No more than
70 * nwchars wide characters are stored. A terminating null wide
71 * character is appended if there is room.
72 *
73 * Returns the number of wide characters converted, not counting
74 * any terminating null wide character. Returns -1 if an invalid
75 * multibyte character is encountered.
76 */
77 size_t
smb_mbstowcs(smb_wchar_t * wcstring,const char * mbstring,size_t nwchars)78 smb_mbstowcs(smb_wchar_t *wcstring, const char *mbstring, size_t nwchars)
79 {
80 int len;
81 smb_wchar_t *start = wcstring;
82
83 while (nwchars--) {
84 len = smb_mbtowc(wcstring, mbstring, MTS_MB_CHAR_MAX);
85 if (len < 0) {
86 *wcstring = 0;
87 return ((size_t)-1);
88 }
89
90 if (*mbstring == 0)
91 break;
92
93 ++wcstring;
94 mbstring += len;
95 }
96
97 return (wcstring - start);
98 }
99
100
101 /*
102 * mbtowc
103 *
104 * The mbtowc() function converts a multibyte character mbchar into
105 * a wide character and stores the result in the object pointed to
106 * by wcharp. Up to nbytes bytes are examined.
107 *
108 * If mbchar is NULL, mbtowc() returns zero to indicate that shift
109 * states are not supported. Shift states are used to switch between
110 * representation modes using reserved bytes to signal shifting
111 * without them being interpreted as characters. If mbchar is null
112 * mbtowc should return non-zero if the current locale requires shift
113 * states. Otherwise it should be return 0.
114 *
115 * If mbchar is non-null, returns the number of bytes processed in
116 * mbchar. If mbchar is invalid, returns -1.
117 */
118 int /*ARGSUSED*/
smb_mbtowc(smb_wchar_t * wcharp,const char * mbchar,size_t nbytes)119 smb_mbtowc(smb_wchar_t *wcharp, const char *mbchar, size_t nbytes)
120 {
121 unsigned char mbyte;
122 smb_wchar_t wide_char;
123 int count;
124 int bytes_left;
125
126 if (mbchar == NULL)
127 return (0); /* no shift states */
128
129 /* 0xxxxxxx -> 1 byte ASCII encoding */
130 if (((mbyte = *mbchar++) & 0x80) == 0) {
131 if (wcharp)
132 *wcharp = (smb_wchar_t)mbyte;
133
134 return (mbyte ? 1 : 0);
135 }
136
137 /* 10xxxxxx -> invalid first byte */
138 if ((mbyte & 0x40) == 0)
139 return (-1);
140
141 wide_char = mbyte;
142 if ((mbyte & 0x20) == 0) {
143 wide_char &= 0x1f;
144 bytes_left = 1;
145 } else if ((mbyte & 0x10) == 0) {
146 wide_char &= 0x0f;
147 bytes_left = 2;
148 } else {
149 return (-1);
150 }
151
152 count = 1;
153 while (bytes_left--) {
154 if (((mbyte = *mbchar++) & 0xc0) != 0x80)
155 return (-1);
156
157 count++;
158 wide_char = (wide_char << 6) | (mbyte & 0x3f);
159 }
160
161 if (wcharp)
162 *wcharp = wide_char;
163
164 return (count);
165 }
166
167
168 /*
169 * wctomb
170 *
171 * The wctomb() function converts a wide character wchar into a multibyte
172 * character and stores the result in mbchar. The object pointed to by
173 * mbchar must be large enough to accommodate the multibyte character.
174 *
175 * Returns the numberof bytes written to mbchar.
176 */
177 int
smb_wctomb(char * mbchar,smb_wchar_t wchar)178 smb_wctomb(char *mbchar, smb_wchar_t wchar)
179 {
180 if ((wchar & ~0x7f) == 0) {
181 *mbchar = (char)wchar;
182 return (1);
183 }
184
185 if ((wchar & ~0x7ff) == 0) {
186 *mbchar++ = (wchar >> 6) | 0xc0;
187 *mbchar = (wchar & 0x3f) | 0x80;
188 return (2);
189 }
190
191 *mbchar++ = (wchar >> 12) | 0xe0;
192 *mbchar++ = ((wchar >> 6) & 0x3f) | 0x80;
193 *mbchar = (wchar & 0x3f) | 0x80;
194 return (3);
195 }
196
197
198 /*
199 * wcstombs
200 *
201 * The wcstombs() function converts a wide character string wcstring
202 * into a multibyte character string mbstring. Up to nbytes bytes are
203 * stored in mbstring. Partial multibyte characters at the end of the
204 * string are not stored. The multibyte character string is null
205 * terminated if there is room.
206 *
207 * Returns the number of bytes converted, not counting the terminating
208 * null byte.
209 */
210 size_t
smb_wcstombs(char * mbstring,const smb_wchar_t * wcstring,size_t nbytes)211 smb_wcstombs(char *mbstring, const smb_wchar_t *wcstring, size_t nbytes)
212 {
213 char *start = mbstring;
214 const smb_wchar_t *wcp = wcstring;
215 smb_wchar_t wide_char = 0;
216 char buf[4];
217 size_t len;
218
219 if ((mbstring == NULL) || (wcstring == NULL))
220 return (0);
221
222 while (nbytes > MTS_MB_CHAR_MAX) {
223 wide_char = *wcp++;
224 len = smb_wctomb(mbstring, wide_char);
225
226 if (wide_char == 0)
227 /*LINTED E_PTRDIFF_OVERFLOW*/
228 return (mbstring - start);
229
230 mbstring += len;
231 nbytes -= len;
232 }
233
234 while (wide_char && nbytes) {
235 wide_char = *wcp++;
236 if ((len = smb_wctomb(buf, wide_char)) > nbytes) {
237 *mbstring = 0;
238 break;
239 }
240
241 bcopy(buf, mbstring, len);
242 mbstring += len;
243 nbytes -= len;
244 }
245
246 /*LINTED E_PTRDIFF_OVERFLOW*/
247 return (mbstring - start);
248 }
249
250
251 /*
252 * Returns the number of bytes that would be written if the multi-
253 * byte string mbs was converted to a wide character string, not
254 * counting the terminating null wide character.
255 */
256 size_t
smb_wcequiv_strlen(const char * mbs)257 smb_wcequiv_strlen(const char *mbs)
258 {
259 smb_wchar_t wide_char;
260 size_t bytes;
261 size_t len = 0;
262
263 while (*mbs) {
264 bytes = smb_mbtowc(&wide_char, mbs, MTS_MB_CHAR_MAX);
265 if (bytes == ((size_t)-1))
266 return ((size_t)-1);
267
268 len += sizeof (smb_wchar_t);
269 mbs += bytes;
270 }
271
272 return (len);
273 }
274
275
276 /*
277 * Returns the number of bytes that would be written if the multi-
278 * byte string mbs was converted to a single byte character string,
279 * not counting the terminating null character.
280 */
281 size_t
smb_sbequiv_strlen(const char * mbs)282 smb_sbequiv_strlen(const char *mbs)
283 {
284 smb_wchar_t wide_char;
285 size_t nbytes;
286 size_t len = 0;
287
288 while (*mbs) {
289 nbytes = smb_mbtowc(&wide_char, mbs, MTS_MB_CHAR_MAX);
290 if (nbytes == ((size_t)-1))
291 return ((size_t)-1);
292
293 if (wide_char & 0xFF00)
294 len += sizeof (smb_wchar_t);
295 else
296 ++len;
297
298 mbs += nbytes;
299 }
300
301 return (len);
302 }
303
304
305 /*
306 * stombs
307 *
308 * Convert a regular null terminated string 'string' to a UTF-8 encoded
309 * null terminated multi-byte string 'mbstring'. Only full converted
310 * UTF-8 characters will be written 'mbstring'. If a character will not
311 * fit within the remaining buffer space or 'mbstring' will overflow
312 * max_mblen, the conversion process will be terminated and 'mbstring'
313 * will be null terminated.
314 *
315 * Returns the number of bytes written to 'mbstring', excluding the
316 * terminating null character.
317 *
318 * If either mbstring or string is a null pointer, -1 is returned.
319 */
320 int
smb_stombs(char * mbstring,char * string,int max_mblen)321 smb_stombs(char *mbstring, char *string, int max_mblen)
322 {
323 char *start = mbstring;
324 unsigned char *p = (unsigned char *)string;
325 int space_left = max_mblen;
326 int len;
327 smb_wchar_t wide_char;
328 char buf[4];
329
330 if (!mbstring || !string)
331 return (-1);
332
333 while (*p && space_left > 2) {
334 wide_char = *p++;
335 len = smb_wctomb(mbstring, wide_char);
336 mbstring += len;
337 space_left -= len;
338 }
339
340 if (*p) {
341 wide_char = *p;
342 if ((len = smb_wctomb(buf, wide_char)) < 2) {
343 *mbstring = *buf;
344 mbstring += len;
345 space_left -= len;
346 }
347 }
348
349 *mbstring = '\0';
350
351 /*LINTED E_PTRDIFF_OVERFLOW*/
352 return (mbstring - start);
353 }
354
355
356 /*
357 * mbstos
358 *
359 * Convert a null terminated multi-byte string 'mbstring' to a regular
360 * null terminated string 'string'. A 1-byte character in 'mbstring'
361 * maps to a 1-byte character in 'string'. A 2-byte character in
362 * 'mbstring' will be mapped to 2-bytes, if the upper byte is non-null.
363 * Otherwise the upper byte null will be discarded to ensure that the
364 * output stream does not contain embedded null characters.
365 *
366 * If the input stream contains invalid multi-byte characters, a value
367 * of -1 will be returned. Otherwise the length of 'string', excluding
368 * the terminating null character, is returned.
369 *
370 * If either mbstring or string is a null pointer, -1 is returned.
371 */
372 int
smb_mbstos(char * string,const char * mbstring)373 smb_mbstos(char *string, const char *mbstring)
374 {
375 smb_wchar_t wc;
376 unsigned char *start = (unsigned char *)string;
377 int len;
378
379 if (string == NULL || mbstring == NULL)
380 return (-1);
381
382 while (*mbstring) {
383 if ((len = smb_mbtowc(&wc, mbstring, MTS_MB_CHAR_MAX)) < 0) {
384 *string = 0;
385 return (-1);
386 }
387
388 if (wc & 0xFF00) {
389 /*LINTED E_BAD_PTR_CAST_ALIGN*/
390 *((smb_wchar_t *)string) = wc;
391 string += sizeof (smb_wchar_t);
392 }
393 else
394 {
395 *string = (unsigned char)wc;
396 string++;
397 }
398
399 mbstring += len;
400 }
401
402 *string = 0;
403
404 /*LINTED E_PTRDIFF_OVERFLOW*/
405 return ((unsigned char *)string - start);
406 }
407