1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
24 *
25 * Copyright 2018 Nexenta Systems, Inc. All rights reserved.
26 */
27
28 /*
29 * Multibyte/wide-char conversion routines. SMB uses UTF-16 on the wire
30 * (smb_wchar_t) and we use UTF-8 internally (our multi-byte, or mbs).
31 */
32
33 #if defined(_KERNEL) || defined(_FAKE_KERNEL)
34 #include <sys/types.h>
35 #include <sys/sunddi.h>
36 #else /* _KERNEL || _FAKE_KERNEL */
37 #include <stdio.h>
38 #include <stdlib.h>
39 #include <strings.h>
40 #include <iconv.h>
41 #include <assert.h>
42 #endif /* _KERNEL || _FAKE_KERNEL */
43 #include <sys/u8_textprep.h>
44 #include <smbsrv/string.h>
45
46
47 /*
48 * mbstowcs
49 *
50 * The mbstowcs() function converts a multibyte character string
51 * mbstring into a wide character string wcstring. No more than
52 * nwchars wide characters are stored. A terminating null wide
53 * character is appended if there is room.
54 *
55 * Returns the number of wide characters converted, not counting
56 * any terminating null wide character. Returns -1 if an invalid
57 * multibyte character is encountered.
58 */
59 size_t
smb_mbstowcs(smb_wchar_t * wcs,const char * mbs,size_t nwchars)60 smb_mbstowcs(smb_wchar_t *wcs, const char *mbs, size_t nwchars)
61 {
62 size_t mbslen, wcslen;
63 int err;
64
65 /* NULL or empty input is allowed. */
66 if (mbs == NULL || *mbs == '\0') {
67 if (wcs != NULL && nwchars > 0)
68 *wcs = 0;
69 return (0);
70 }
71
72 /*
73 * Traditional mbstowcs(3C) allows wcs==NULL to get the length.
74 * SMB never calls it that way, but let's future-proof.
75 */
76 if (wcs == NULL) {
77 return ((size_t)-1);
78 }
79
80 mbslen = strlen(mbs);
81 wcslen = nwchars;
82 err = uconv_u8tou16((const uchar_t *)mbs, &mbslen,
83 wcs, &wcslen, UCONV_OUT_LITTLE_ENDIAN);
84 if (err != 0)
85 return ((size_t)-1);
86
87 if (wcslen < nwchars)
88 wcs[wcslen] = 0;
89
90 return (wcslen);
91 }
92
93
94 /*
95 * mbtowc
96 *
97 * The mbtowc() function converts a multibyte character mbchar into
98 * a wide character and stores the result in the object pointed to
99 * by wcharp. Up to nbytes bytes are examined.
100 *
101 * If mbchar is NULL, mbtowc() returns zero to indicate that shift
102 * states are not supported. Shift states are used to switch between
103 * representation modes using reserved bytes to signal shifting
104 * without them being interpreted as characters. If mbchar is null
105 * mbtowc should return non-zero if the current locale requires shift
106 * states. Otherwise it should be return 0.
107 *
108 * If mbchar is non-null, returns the number of bytes processed in
109 * mbchar. If mbchar is null, convert the null (wcharp=0) but
110 * return length zero. If mbchar is invalid, returns -1.
111 */
112 int /*ARGSUSED*/
smb_mbtowc(uint32_t * wcharp,const char * mbchar,size_t nbytes)113 smb_mbtowc(uint32_t *wcharp, const char *mbchar, size_t nbytes)
114 {
115 uint32_t wide_char;
116 int count, err;
117 size_t mblen;
118 size_t wclen;
119
120 if (mbchar == NULL)
121 return (0); /* no shift states */
122
123 /*
124 * How many bytes in this symbol?
125 */
126 count = u8_validate((char *)mbchar, nbytes, NULL, 0, &err);
127 if (count < 0)
128 return (-1);
129
130 mblen = count;
131 wclen = 1;
132 err = uconv_u8tou32((const uchar_t *)mbchar, &mblen,
133 &wide_char, &wclen, UCONV_OUT_SYSTEM_ENDIAN);
134 if (err != 0)
135 return (-1);
136 if (wclen == 0) {
137 wide_char = 0;
138 count = 0;
139 }
140
141 if (wcharp)
142 *wcharp = wide_char;
143
144 return (count);
145 }
146
147
148 /*
149 * wctomb
150 *
151 * The wctomb() function converts a wide character wchar into a multibyte
152 * character and stores the result in mbchar. The object pointed to by
153 * mbchar must be large enough to accommodate the multibyte character.
154 *
155 * Returns the numberof bytes written to mbchar.
156 * Note: handles null like any 1-byte char.
157 */
158 int
smb_wctomb(char * mbchar,uint32_t wchar)159 smb_wctomb(char *mbchar, uint32_t wchar)
160 {
161 char junk[MTS_MB_CUR_MAX+1];
162 size_t mblen;
163 size_t wclen;
164 int err;
165
166 if (mbchar == NULL)
167 mbchar = junk;
168
169 mblen = MTS_MB_CUR_MAX;
170 wclen = 1;
171 err = uconv_u32tou8(&wchar, &wclen, (uchar_t *)mbchar, &mblen,
172 UCONV_IN_SYSTEM_ENDIAN | UCONV_IGNORE_NULL);
173 if (err != 0)
174 return (-1);
175
176 return ((int)mblen);
177 }
178
179
180 /*
181 * wcstombs
182 *
183 * The wcstombs() function converts a wide character string wcstring
184 * into a multibyte character string mbstring. Up to nbytes bytes are
185 * stored in mbstring. Partial multibyte characters at the end of the
186 * string are not stored. The multibyte character string is null
187 * terminated if there is room.
188 *
189 * Returns the number of bytes converted, not counting the terminating
190 * null byte. Returns -1 if an invalid WC sequence is encountered.
191 */
192 size_t
smb_wcstombs(char * mbs,const smb_wchar_t * wcs,size_t nbytes)193 smb_wcstombs(char *mbs, const smb_wchar_t *wcs, size_t nbytes)
194 {
195 size_t mbslen, wcslen;
196 int err;
197
198 /* NULL or empty input is allowed. */
199 if (wcs == NULL || *wcs == 0) {
200 if (mbs != NULL && nbytes > 0)
201 *mbs = '\0';
202 return (0);
203 }
204
205 /*
206 * Traditional wcstombs(3C) allows mbs==NULL to get the length.
207 * SMB never calls it that way, but let's future-proof.
208 */
209 if (mbs == NULL) {
210 return ((size_t)-1);
211 }
212
213 /*
214 * Compute wcslen
215 */
216 wcslen = 0;
217 while (wcs[wcslen] != 0)
218 wcslen++;
219
220 mbslen = nbytes;
221 err = uconv_u16tou8(wcs, &wcslen,
222 (uchar_t *)mbs, &mbslen, UCONV_IN_LITTLE_ENDIAN);
223 if (err != 0)
224 return ((size_t)-1);
225
226 if (mbslen < nbytes)
227 mbs[mbslen] = '\0';
228
229 return (mbslen);
230 }
231
232
233 /*
234 * Returns the number of bytes that would be written if the multi-
235 * byte string mbs was converted to a wide character string, not
236 * counting the terminating null wide character.
237 */
238 size_t
smb_wcequiv_strlen(const char * mbs)239 smb_wcequiv_strlen(const char *mbs)
240 {
241 uint32_t wide_char;
242 size_t bytes;
243 size_t len = 0;
244
245 while (*mbs) {
246 bytes = smb_mbtowc(&wide_char, mbs, MTS_MB_CHAR_MAX);
247 if (bytes == ((size_t)-1))
248 return ((size_t)-1);
249 mbs += bytes;
250
251 len += sizeof (smb_wchar_t);
252 if (bytes > 3) {
253 /*
254 * Extended unicode, so TWO smb_wchar_t
255 */
256 len += sizeof (smb_wchar_t);
257 }
258 }
259
260 return (len);
261 }
262
263
264 /*
265 * Returns the number of bytes that would be written if the multi-
266 * byte string mbs was converted to an OEM character string,
267 * (smb_mbstooem) not counting the terminating null character.
268 */
269 size_t
smb_sbequiv_strlen(const char * mbs)270 smb_sbequiv_strlen(const char *mbs)
271 {
272 size_t nbytes;
273 size_t len = 0;
274
275 while (*mbs) {
276 nbytes = smb_mbtowc(NULL, mbs, MTS_MB_CHAR_MAX);
277 if (nbytes == ((size_t)-1))
278 return ((size_t)-1);
279 if (nbytes == 0)
280 break;
281
282 if (nbytes == 1) {
283 /* ASCII */
284 len++;
285 } else if (nbytes < 8) {
286 /* Compute OEM length */
287 char mbsbuf[8];
288 uint8_t oembuf[8];
289 int oemlen;
290 (void) strlcpy(mbsbuf, mbs, nbytes+1);
291 oemlen = smb_mbstooem(oembuf, mbsbuf, 8);
292 if (oemlen < 0)
293 return ((size_t)-1);
294 len += oemlen;
295 } else {
296 return ((size_t)-1);
297 }
298
299 mbs += nbytes;
300 }
301
302 return (len);
303 }
304
/*
 * Convert OEM strings to/from internal (UTF-8) form.
 *
 * We rarely encounter these anymore because all modern
 * SMB clients use Unicode (UTF-16).  The few cases where
 * this IS still called are normally using ASCII, i.e.
 * tag names etc. so short-cut those cases.  If we get
 * something non-ASCII we have to call iconv.
 *
 * If we were to really support OEM code pages, we would
 * need to have a way to set the OEM code page from some
 * configuration value.  For now it's always CP850.
 * See also ./smb_oem.c
 *
 * smb_oem_codepage is the code page name that smb_oemtombs() and
 * smb_mbstooem() pass to iconv_open() (kiconv_open() in the kernel).
 */
static char smb_oem_codepage[32] = "CP850";
320
321 /*
322 * smb_oemtombs
323 *
324 * Convert a null terminated OEM string 'string' to a UTF-8 string
325 * no longer than max_mblen (null terminated if space).
326 *
327 * If the input string contains invalid OEM characters, a value
328 * of -1 will be returned. Otherwise returns the length of 'mbs',
329 * excluding the terminating null character.
330 *
331 * If either mbstring or string is a null pointer, -1 is returned.
332 */
333 int
smb_oemtombs(char * mbs,const uint8_t * oems,int max_mblen)334 smb_oemtombs(char *mbs, const uint8_t *oems, int max_mblen)
335 {
336 uchar_t *p;
337 int oemlen;
338 int rlen;
339 boolean_t need_iconv = B_FALSE;
340
341 if (mbs == NULL || oems == NULL)
342 return (-1);
343
344 /*
345 * Check if the oems is all ASCII (and get the length
346 * while we're at it) so we know if we need to iconv.
347 * We usually can avoid the iconv calls.
348 */
349 oemlen = 0;
350 p = (uchar_t *)oems;
351 while (*p != '\0') {
352 oemlen++;
353 if (*p & 0x80)
354 need_iconv = B_TRUE;
355 p++;
356 }
357
358 if (need_iconv) {
359 int rc;
360 char *obuf = mbs;
361 size_t olen = max_mblen;
362 size_t ilen = oemlen;
363 #if defined(_KERNEL) || defined(_FAKE_KERNEL)
364 char *ibuf = (char *)oems;
365 kiconv_t ic;
366 int err;
367
368 ic = kiconv_open("UTF-8", smb_oem_codepage);
369 if (ic == (kiconv_t)-1)
370 goto just_copy;
371 rc = kiconv(ic, &ibuf, &ilen, &obuf, &olen, &err);
372 (void) kiconv_close(ic);
373 #else /* _KERNEL || _FAKE_KERNEL */
374 const char *ibuf = (char *)oems;
375 iconv_t ic;
376 ic = iconv_open("UTF-8", smb_oem_codepage);
377 if (ic == (iconv_t)-1)
378 goto just_copy;
379 rc = iconv(ic, &ibuf, &ilen, &obuf, &olen);
380 (void) iconv_close(ic);
381 #endif /* _KERNEL || _FAKE_KERNEL */
382 if (rc < 0)
383 return (-1);
384 /* Return val. is output bytes. */
385 rlen = (max_mblen - olen);
386 } else {
387 just_copy:
388 rlen = oemlen;
389 if (rlen > max_mblen)
390 rlen = max_mblen;
391 bcopy(oems, mbs, rlen);
392 }
393 if (rlen < max_mblen)
394 mbs[rlen] = '\0';
395
396 return (rlen);
397 }
398
399 /*
400 * smb_mbstooem
401 *
402 * Convert a null terminated multi-byte string 'mbs' to an OEM string
403 * no longer than max_oemlen (null terminated if space).
404 *
405 * If the input string contains invalid multi-byte characters, a value
406 * of -1 will be returned. Otherwise returns the length of 'oems',
407 * excluding the terminating null character.
408 *
409 * If either mbstring or string is a null pointer, -1 is returned.
410 */
411 int
smb_mbstooem(uint8_t * oems,const char * mbs,int max_oemlen)412 smb_mbstooem(uint8_t *oems, const char *mbs, int max_oemlen)
413 {
414 uchar_t *p;
415 int mbslen;
416 int rlen;
417 boolean_t need_iconv = B_FALSE;
418
419 if (oems == NULL || mbs == NULL)
420 return (-1);
421
422 /*
423 * Check if the mbs is all ASCII (and get the length
424 * while we're at it) so we know if we need to iconv.
425 * We usually can avoid the iconv calls.
426 */
427 mbslen = 0;
428 p = (uchar_t *)mbs;
429 while (*p != '\0') {
430 mbslen++;
431 if (*p & 0x80)
432 need_iconv = B_TRUE;
433 p++;
434 }
435
436 if (need_iconv) {
437 int rc;
438 char *obuf = (char *)oems;
439 size_t olen = max_oemlen;
440 size_t ilen = mbslen;
441 #if defined(_KERNEL) || defined(_FAKE_KERNEL)
442 char *ibuf = (char *)mbs;
443 kiconv_t ic;
444 int err;
445
446 ic = kiconv_open(smb_oem_codepage, "UTF-8");
447 if (ic == (kiconv_t)-1)
448 goto just_copy;
449 rc = kiconv(ic, &ibuf, &ilen, &obuf, &olen, &err);
450 (void) kiconv_close(ic);
451 #else /* _KERNEL || _FAKE_KERNEL */
452 const char *ibuf = mbs;
453 iconv_t ic;
454 ic = iconv_open(smb_oem_codepage, "UTF-8");
455 if (ic == (iconv_t)-1)
456 goto just_copy;
457 rc = iconv(ic, &ibuf, &ilen, &obuf, &olen);
458 (void) iconv_close(ic);
459 #endif /* _KERNEL || _FAKE_KERNEL */
460 if (rc < 0)
461 return (-1);
462 /* Return val. is output bytes. */
463 rlen = (max_oemlen - olen);
464 } else {
465 just_copy:
466 rlen = mbslen;
467 if (rlen > max_oemlen)
468 rlen = max_oemlen;
469 bcopy(mbs, oems, rlen);
470 }
471 if (rlen < max_oemlen)
472 oems[rlen] = '\0';
473
474 return (rlen);
475 }
476