xref: /titanic_51/usr/src/lib/libsmbfs/smb/utf_str.c (revision 613a2f6ba31e891e3d947a356daf5e563d43c1ce)
1  /*
2   * CDDL HEADER START
3   *
4   * The contents of this file are subject to the terms of the
5   * Common Development and Distribution License (the "License").
6   * You may not use this file except in compliance with the License.
7   *
8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9   * or http://www.opensolaris.org/os/licensing.
10   * See the License for the specific language governing permissions
11   * and limitations under the License.
12   *
13   * When distributing Covered Code, include this CDDL HEADER in each
14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15   * If applicable, add the following below this CDDL HEADER, with the
16   * fields enclosed by brackets "[]" replaced with your own identifying
17   * information: Portions Copyright [yyyy] [name of copyright owner]
18   *
19   * CDDL HEADER END
20   */
21  
22  /*
23   * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24   * Use is subject to license terms.
25   */
26  
27  /*
28   * Unicode conversions (yet more)
29   */
30  
31  #include <stdio.h>
32  #include <stdlib.h>
33  #include <string.h>
34  #include <errno.h>
35  #include <iconv.h>
36  #include <libintl.h>
37  
38  #include <sys/u8_textprep.h>
39  
40  #include <netsmb/smb_lib.h>
41  #include "charsets.h"
42  
43  
/*
 * Count the 16-bit units in a zero-terminated Unicode string.
 * The terminating zero unit is not included in the count, so the
 * storage used by the string body is twice the returned value.
 */
size_t
unicode_strlen(const uint16_t *us)
{
	const uint16_t *end;

	/* Advance to the terminating zero unit; the distance is the length. */
	for (end = us; *end != 0; end++)
		;
	return ((size_t)(end - us));
}
57  
static char *convert_ucs2xx_to_utf8(iconv_t, const uint16_t *);

/*
 * Convert a (native) Unicode string to UTF-8 using the "UCS-2"
 * iconv source encoding.  The result comes straight from
 * convert_ucs2xx_to_utf8(): allocated memory, or NULL on failure.
 */
char *
convert_unicode_to_utf8(uint16_t *us)
{
	/* Descriptor is opened on first use and kept for later calls. */
	static iconv_t native_cd = (iconv_t)-1;

	/* iconv_open() arguments are (to, from). */
	if (native_cd == (iconv_t)-1) {
		native_cd = iconv_open("UTF-8", "UCS-2");
	}

	/* A descriptor that failed to open is reported by the helper. */
	return (convert_ucs2xx_to_utf8(native_cd, us));
}
75  
/*
 * Convert a little-endian Unicode string to UTF-8 using the
 * "UCS-2LE" iconv source encoding.  The result comes straight from
 * convert_ucs2xx_to_utf8(): allocated memory, or NULL on failure.
 */
char *
convert_leunicode_to_utf8(unsigned short *us)
{
	/* Descriptor is opened on first use and kept for later calls. */
	static iconv_t le_cd = (iconv_t)-1;

	/* iconv_open() arguments are (to, from). */
	if (le_cd == (iconv_t)-1) {
		le_cd = iconv_open("UTF-8", "UCS-2LE");
	}

	/* A descriptor that failed to open is reported by the helper. */
	return (convert_ucs2xx_to_utf8(le_cd, us));
}
91  
92  static char *
93  convert_ucs2xx_to_utf8(iconv_t cd, const uint16_t *us)
94  {
95  	char *obuf, *optr;
96  	const char *iptr;
97  	size_t  ileft, obsize, oleft, ret;
98  
99  	if (cd == (iconv_t)-1) {
100  		smb_error(dgettext(TEXT_DOMAIN,
101  		    "iconv_open(UTF-8/UCS-2)"), -1);
102  		return (NULL);
103  	}
104  
105  	iptr = (const char *)us;
106  	ileft = unicode_strlen(us);
107  	ileft *= 2; /* now bytes */
108  
109  	/* Worst-case output size is 2x input size. */
110  	oleft = ileft * 2;
111  	obsize = oleft + 2; /* room for null */
112  	obuf = malloc(obsize);
113  	if (!obuf)
114  		return (NULL);
115  	optr = obuf;
116  
117  	ret = iconv(cd, &iptr, &ileft, &optr, &oleft);
118  	*optr = '\0';
119  	if (ret == (size_t)-1) {
120  		smb_error(dgettext(TEXT_DOMAIN,
121  		    "iconv(%s) failed"), errno, obuf);
122  	}
123  	if (ileft) {
124  		smb_error(dgettext(TEXT_DOMAIN,
125  		    "iconv(%s) failed"), -1, obuf);
126  		/*
127  		 * XXX: What's better?  return NULL?
128  		 * The truncated string? << for now
129  		 */
130  	}
131  
132  	return (obuf);
133  }
134  
static uint16_t *convert_utf8_to_ucs2xx(iconv_t, const char *);

/*
 * Convert a UTF-8 string to Unicode using the "UCS-2" iconv target
 * encoding.  The result comes straight from convert_utf8_to_ucs2xx():
 * allocated memory, or NULL on failure.
 */
uint16_t *
convert_utf8_to_unicode(const char *utf8_string)
{
	/* Descriptor is opened on first use and kept for later calls. */
	static iconv_t native_cd = (iconv_t)-1;

	/* iconv_open() arguments are (to, from). */
	if (native_cd == (iconv_t)-1) {
		native_cd = iconv_open("UCS-2", "UTF-8");
	}

	/* A descriptor that failed to open is reported by the helper. */
	return (convert_utf8_to_ucs2xx(native_cd, utf8_string));
}
151  
/*
 * Convert a UTF-8 string to little-endian Unicode using the
 * "UCS-2LE" iconv target encoding.  The result comes straight from
 * convert_utf8_to_ucs2xx(): allocated memory, or NULL on failure.
 */
uint16_t *
convert_utf8_to_leunicode(const char *utf8_string)
{
	/* Descriptor is opened on first use and kept for later calls. */
	static iconv_t le_cd = (iconv_t)-1;

	/* iconv_open() arguments are (to, from). */
	if (le_cd == (iconv_t)-1) {
		le_cd = iconv_open("UCS-2LE", "UTF-8");
	}

	/* A descriptor that failed to open is reported by the helper. */
	return (convert_utf8_to_ucs2xx(le_cd, utf8_string));
}
166  
167  static uint16_t *
168  convert_utf8_to_ucs2xx(iconv_t cd, const char *utf8_string)
169  {
170  	uint16_t *obuf, *optr;
171  	const char *iptr;
172  	size_t  ileft, obsize, oleft, ret;
173  
174  	if (cd == (iconv_t)-1) {
175  		smb_error(dgettext(TEXT_DOMAIN,
176  		    "iconv_open(UCS-2/UTF-8)"), -1);
177  		return (NULL);
178  	}
179  
180  	iptr = utf8_string;
181  	ileft = strlen(iptr);
182  
183  	/* Worst-case output size is 2x input size. */
184  	oleft = ileft * 2;
185  	obsize = oleft + 2; /* room for null */
186  	obuf = malloc(obsize);
187  	if (!obuf)
188  		return (NULL);
189  	optr = obuf;
190  
191  	ret = iconv(cd, &iptr, &ileft, (char **)&optr, &oleft);
192  	*optr = '\0';
193  	if (ret == (size_t)-1) {
194  		smb_error(dgettext(TEXT_DOMAIN,
195  		    "iconv(%s) failed"), errno, utf8_string);
196  	}
197  	if (ileft) {
198  		smb_error(dgettext(TEXT_DOMAIN,
199  		    "iconv(%s) failed"), -1, utf8_string);
200  		/*
201  		 * XXX: What's better?  return NULL?
202  		 * The truncated string? << for now
203  		 */
204  	}
205  
206  	return (obuf);
207  }
208  
209  
210  /*
211   * A simple wrapper around u8_textprep_str() that returns the Unicode
212   * upper-case version of some string.  Returns memory from malloc.
213   * Borrowed from idmapd.
214   */
215  static char *
216  utf8_str_to_upper_or_lower(const char *s, int upper_lower)
217  {
218  	char *res = NULL;
219  	char *outs;
220  	size_t inlen, outlen, inbleft, outbleft;
221  	int rc, err;
222  
223  	/*
224  	 * u8_textprep_str() does not allocate memory.  The input and
225  	 * output buffers may differ in size (though that would be more
226  	 * likely when normalization is done).  We have to loop over it...
227  	 *
228  	 * To improve the chances that we can avoid looping we add 10
229  	 * bytes of output buffer room the first go around.
230  	 */
231  	inlen = inbleft = strlen(s);
232  	outlen = outbleft = inlen + 10;
233  	if ((res = malloc(outlen)) == NULL)
234  		return (NULL);
235  	outs = res;
236  
237  	while ((rc = u8_textprep_str((char *)s, &inbleft, outs,
238  	    &outbleft, upper_lower, U8_UNICODE_LATEST, &err)) < 0 &&
239  	    err == E2BIG) {
240  		if ((res = realloc(res, outlen + inbleft)) == NULL)
241  			return (NULL);
242  		/* adjust input/output buffer pointers */
243  		s += (inlen - inbleft);
244  		outs = res + outlen - outbleft;
245  		/* adjust outbleft and outlen */
246  		outlen += inbleft;
247  		outbleft += inbleft;
248  	}
249  
250  	if (rc < 0) {
251  		free(res);
252  		res = NULL;
253  		return (NULL);
254  	}
255  
256  	res[outlen - outbleft] = '\0';
257  
258  	return (res);
259  }
260  
261  char *
262  utf8_str_toupper(const char *s)
263  {
264  	return (utf8_str_to_upper_or_lower(s, U8_TEXTPREP_TOUPPER));
265  }
266  
267  char *
268  utf8_str_tolower(const char *s)
269  {
270  	return (utf8_str_to_upper_or_lower(s, U8_TEXTPREP_TOLOWER));
271  }
272