xref: /illumos-gate/usr/src/lib/iconv_modules/utf-8/common/utf8_to_sb.c (revision 16d8656330ae5622ec32e5007f62145ebafdc50f)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  *
25  * This particular file is to cover conversions from UTF-8 to various single
26  * byte codesets.
27  */
28 
29 
30 #include <stdlib.h>
31 #include <errno.h>
32 #include <sys/types.h>
33 #include "utf8_to_sb.h"
34 
35 
36 
37 void *
_icv_open()38 _icv_open()
39 {
40 	ucs_state_t *cd = (ucs_state_t *)calloc(1, sizeof(ucs_state_t));
41 	if (cd == (ucs_state_t *)NULL) {
42 		errno = ENOMEM;
43 		return((void *)-1);
44 	}
45 
46 	return((void *)cd);
47 }
48 
49 
50 void
_icv_close(ucs_state_t * cd)51 _icv_close(ucs_state_t *cd)
52 {
53 	if (! cd)
54 		errno = EBADF;
55 	else
56 		free((void *)cd);
57 }
58 
59 
60 size_t
_icv_iconv(ucs_state_t * cd,char ** inbuf,size_t * inbufleft,char ** outbuf,size_t * outbufleft)61 _icv_iconv(ucs_state_t *cd, char **inbuf, size_t *inbufleft, char **outbuf,
62                 size_t *outbufleft)
63 {
64 	size_t ret_val = 0;
65 	unsigned char *ib;
66 	unsigned char *ob;
67 	unsigned char *ibtail;
68 	unsigned char *obtail;
69 	register int i, l, h;
70 	signed char sz;
71 	unsigned long u8;
72 
73 	if (! cd) {
74 		errno = EBADF;
75 		return((size_t)-1);
76 	}
77 
78 	if (!inbuf || !(*inbuf)) {
79 		cd->bom_written = false;
80 		return((size_t)0);
81 	}
82 
83 	ib = (unsigned char *)*inbuf;
84 	ob = (unsigned char *)*outbuf;
85 	ibtail = ib + *inbufleft;
86 	obtail = ob + *outbufleft;
87 
88 	/* We skip any first signiture of UTF-8 */
89 	if (!cd->bom_written &&
90 		((ibtail - ib) >= ICV_FETCH_UTF8_BOM_SIZE)) {
91 		for (u8 = 0, i = 0; i < ICV_FETCH_UTF8_BOM_SIZE; i++)
92 			u8 = (u8 << 8) | ((uint_t)(*(ib + i)));
93 		if (u8 == ICV_BOM_IN_BIG_ENDIAN)
94 			ib += ICV_FETCH_UTF8_BOM_SIZE;
95 	}
96 	cd->bom_written = true;
97 
98 	while (ib < ibtail) {
99 		sz = number_of_bytes_in_utf8_char[*ib];
100 		if (sz == ICV_TYPE_ILLEGAL_CHAR) {
101 			errno = EILSEQ;
102 			ret_val = (size_t)-1;
103 			break;
104 		}
105 
106 		if (ob >= obtail) {
107 			errno = E2BIG;
108 			ret_val = (size_t)-1;
109 			break;
110 		}
111 
112 		if (sz == 1) {
113 			*ob++ = *ib++;
114 		} else {
115 			if ((ibtail - ib) < sz) {
116 				errno = EINVAL;
117 				ret_val = (size_t)-1;
118 				break;
119 			}
120 
121 			u8 = 0;
122 			for (i = 0; i < sz; i++) {
123 				if (((unsigned int)*ib) < 0x80) {
124 					errno = EILSEQ;
125 					ret_val = (size_t)-1;
126 					goto illegal_char_err;
127 				}
128 				u8 = (u8 << 8) | ((unsigned int)*ib);
129 				ib++;
130 			}
131 			if ((u8 & ICV_UTF8_REPRESENTATION_ffff_mask) ==
132 			    ICV_UTF8_REPRESENTATION_fffe ||
133 			    (u8 & ICV_UTF8_REPRESENTATION_ffff_mask) ==
134 			    ICV_UTF8_REPRESENTATION_ffff ||
135 			    u8 > ICV_UTF8_REPRESENTATION_10fffd ||
136 			    (u8 >= ICV_UTF8_REPRESENTATION_d800 &&
137 			    u8 <= ICV_UTF8_REPRESENTATION_dfff) ||
138 			    (u8 >= ICV_UTF8_REPRESENTATION_fdd0 &&
139 			    u8 <= ICV_UTF8_REPRESENTATION_fdef)) {
140 				ib -= sz;
141 				errno = EILSEQ;
142 				ret_val = (size_t)-1;
143 				goto illegal_char_err;
144 			}
145 
146 			i = l = 0;
147 			h = (sizeof(u8_sb_tbl) /
148 			     sizeof(to_sb_table_component_t)) - 1;
149 			while (l <= h) {
150 				i = (l + h) / 2;
151 				if (u8_sb_tbl[i].u8 == u8)
152 					break;
153 				else if (u8_sb_tbl[i].u8 < u8)
154 					l = i + 1;
155 				else
156 					h = i - 1;
157 			}
158 
159 			/*
160 			 * We just assume that either we found it or it is
161 			 * a non-identical character that we need to
162 			 * provide a replacement character.
163 			 */
164 			if (u8_sb_tbl[i].u8 == u8) {
165 				*ob++ = u8_sb_tbl[i].sb;
166 			} else {
167 				*ob++ = ICV_CHAR_ASCII_REPLACEMENT;
168 				ret_val++;
169 			}
170 		}
171 	}
172 
173 illegal_char_err:
174 	*inbuf = (char *)ib;
175 	*inbufleft = ibtail - ib;
176 	*outbuf = (char *)ob;
177 	*outbufleft = obtail - ob;
178 
179 	return(ret_val);
180 }
181