xref: /illumos-gate/usr/src/lib/iconv_modules/utf-8/common/utf8.c (revision c5749750a3e052f1194f65a303456224c51dea63)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  *
25  * This is for UTF-8 to UTF-8 code conversion; it simply passes through
26  * all things with UTF-8 byte sequence checking to screen out any illegal
27  * and thus potentially harmful bytes.
28  */
29 
30 
31 #include <stdlib.h>
32 #include <errno.h>
33 #include <sys/types.h>
34 #include <sys/isa_defs.h>
35 #include "common_defs.h"
36 
37 
38 void *
39 _icv_open()
40 {
41 	return((void *)MAGIC_NUMBER);
42 }
43 
44 
45 void
46 _icv_close(int *cd)
47 {
48 	if (! cd || cd != (int *)MAGIC_NUMBER)
49 		errno = EBADF;
50 }
51 
52 
53 size_t
54 _icv_iconv(int *cd, char **inbuf, size_t *inbufleft, char **outbuf,
55                 size_t *outbufleft)
56 {
57 	size_t ret_val = 0;
58 	uchar_t *ib;
59 	uchar_t *ob;
60 	uchar_t *ibtail;
61 	uchar_t *obtail;
62 	uchar_t *ib_copy;
63 	uint_t u4;
64 	uint_t first_byte;
65 	signed char sz;
66 	signed char obsz;
67 
68 	if (! cd || cd != (int *)MAGIC_NUMBER) {
69 		errno = EBADF;
70 		return((size_t)-1);
71 	}
72 
73 	if (!inbuf || !(*inbuf))
74 		return((size_t)0);
75 
76 	ib = (uchar_t *)*inbuf;
77 	ob = (uchar_t *)*outbuf;
78 	ibtail = ib + *inbufleft;
79 	obtail = ob + *outbufleft;
80 
81 	while (ib < ibtail) {
82 		sz = number_of_bytes_in_utf8_char[*ib];
83 		if (sz == ICV_TYPE_ILLEGAL_CHAR) {
84 			errno = EILSEQ;
85 			ret_val = (size_t)-1;
86 			break;
87 		}
88 		obsz = sz;
89 
90 		if ((ibtail - ib) < sz) {
91 			errno = EINVAL;
92 			ret_val = (size_t)-1;
93 			break;
94 		}
95 
96 		ib_copy = ib;
97 		first_byte = *ib_copy++;
98 		u4 = first_byte & (uint_t)masks_tbl[sz];
99 		for (; sz > 1; sz--) {
100 			if (first_byte) {
101 				if (((uchar_t)*ib_copy) <
102 					valid_min_2nd_byte[first_byte] ||
103 				    ((uchar_t)*ib_copy) >
104 					valid_max_2nd_byte[first_byte]) {
105 					errno = EILSEQ;
106 					ret_val = (size_t)-1;
107 					goto ILLEGAL_CHAR_ERR;
108 				}
109 				first_byte = 0;
110 			} else if (((uint_t)*ib_copy) < 0x80 ||
111 				   ((uint_t)*ib_copy) > 0xbf) {
112 				errno = EILSEQ;
113 				ret_val = (size_t)-1;
114 				goto ILLEGAL_CHAR_ERR;
115 			}
116 			u4 = (u4 << ICV_UTF8_BIT_SHIFT) |
117 				(((uint_t)*ib_copy) & ICV_UTF8_BIT_MASK);
118 			ib_copy++;
119 		}
120 
121 		/*
122 		 * Check some more illegal characters and noncharacters from
123 		 * the input buffer. Surrogate pairs (U+D800 - U+DFFF) are
124 		 * checked at the above for loop.
125 		 */
126 		if ((u4 & 0xffff) == 0x00fffe || (u4 & 0xffff) == 0x00ffff ||
127 		    (u4 >= 0x00fdd0 && u4 <= 0x00fdef) || u4 > 0x10fffd) {
128 			errno = EILSEQ;
129 			ret_val = (size_t)-1;
130 			goto ILLEGAL_CHAR_ERR;
131 		}
132 
133 		if ((obtail - ob) < obsz) {
134 			errno = E2BIG;
135 			ret_val = (size_t)-1;
136 			break;
137 		}
138 
139 		for (; obsz >= 1; obsz--)
140 			*ob++ = *ib++;
141 	}
142 
143 ILLEGAL_CHAR_ERR:
144 	*inbuf = (char *)ib;
145 	*inbufleft = ibtail - ib;
146 	*outbuf = (char *)ob;
147 	*outbufleft = obtail - ob;
148 
149 	return(ret_val);
150 }
151