xref: /titanic_52/usr/src/lib/iconv_modules/utf-8/common/utf7_to_ucs.h (revision 91e1e26ac6a73ce959289cf7d3d96c4baedbe0b8)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 1998-1999 by Sun Microsystems, Inc.
23  * All rights reserved.
24  */
25 
26 #ifndef	UTF7_TO_UCS_H
27 #define	UTF7_TO_UCS_H
28 
29 
30 #include "common_defs.h"
31 
32 
/*
 * Reverse lookup table for the Modified Base64 alphabet -- see RFC 2045.
 *
 * The table is indexed by an input byte value (0x00 - 0xff). An entry is
 * the 6-bit value (0 - 63) of that byte when it is a member of the
 * alphabet, i.e., 'A'-'Z', 'a'-'z', '0'-'9', '+', and '/', or -1 when it is
 * not a member of the alphabet.
 */
static const signed char rmb64[0x100] = {
	/* 0x00 - 0x1f: nothing in the alphabet. */
	-1, -1, -1, -1, -1, -1, -1, -1,		/* 0x00 */
	-1, -1, -1, -1, -1, -1, -1, -1,		/* 0x08 */
	-1, -1, -1, -1, -1, -1, -1, -1,		/* 0x10 */
	-1, -1, -1, -1, -1, -1, -1, -1,		/* 0x18 */

	/* 0x20 - 0x2f: '+' is 62 and '/' is 63. */
	-1, -1, -1, -1, -1, -1, -1, -1,		/* 0x20 */
	-1, -1, -1, 62, -1, -1, -1, 63,		/* 0x28 */

	/* 0x30 - 0x3f: '0' - '9' are 52 - 61. */
	52, 53, 54, 55, 56, 57, 58, 59,		/* 0x30 */
	60, 61, -1, -1, -1, -1, -1, -1,		/* 0x38 */

	/* 0x40 - 0x5f: 'A' - 'Z' are 0 - 25. */
	-1,  0,  1,  2,  3,  4,  5,  6,		/* 0x40 */
	 7,  8,  9, 10, 11, 12, 13, 14,		/* 0x48 */
	15, 16, 17, 18, 19, 20, 21, 22,		/* 0x50 */
	23, 24, 25, -1, -1, -1, -1, -1,		/* 0x58 */

	/* 0x60 - 0x7f: 'a' - 'z' are 26 - 51. */
	-1, 26, 27, 28, 29, 30, 31, 32,		/* 0x60 */
	33, 34, 35, 36, 37, 38, 39, 40,		/* 0x68 */
	41, 42, 43, 44, 45, 46, 47, 48,		/* 0x70 */
	49, 50, 51, -1, -1, -1, -1, -1,		/* 0x78 */

	/* 0x80 - 0xff: nothing in the alphabet. */
	-1, -1, -1, -1, -1, -1, -1, -1,		/* 0x80 */
	-1, -1, -1, -1, -1, -1, -1, -1,		/* 0x88 */
	-1, -1, -1, -1, -1, -1, -1, -1,		/* 0x90 */
	-1, -1, -1, -1, -1, -1, -1, -1,		/* 0x98 */
	-1, -1, -1, -1, -1, -1, -1, -1,		/* 0xa0 */
	-1, -1, -1, -1, -1, -1, -1, -1,		/* 0xa8 */
	-1, -1, -1, -1, -1, -1, -1, -1,		/* 0xb0 */
	-1, -1, -1, -1, -1, -1, -1, -1,		/* 0xb8 */
	-1, -1, -1, -1, -1, -1, -1, -1,		/* 0xc0 */
	-1, -1, -1, -1, -1, -1, -1, -1,		/* 0xc8 */
	-1, -1, -1, -1, -1, -1, -1, -1,		/* 0xd0 */
	-1, -1, -1, -1, -1, -1, -1, -1,		/* 0xd8 */
	-1, -1, -1, -1, -1, -1, -1, -1,		/* 0xe0 */
	-1, -1, -1, -1, -1, -1, -1, -1,		/* 0xe8 */
	-1, -1, -1, -1, -1, -1, -1, -1,		/* 0xf0 */
	-1, -1, -1, -1, -1, -1, -1, -1,		/* 0xf8 */
};
52 
/*
 * Any sequence of UCS-2 characters maps onto MBase64 characters as follows:
 *
 * +-16 bits (UCS-2)-+  +-16 bits (UCS-2)-+  +-16 bits (UCS-2)-+
 * |                 |  |                 |  |                 |
 * xxxx xxxx xxxx xxxx  xxxx xxxx xxxx xxxx  xxxx xxxx xxxx xxxx
 * |     ||     | |      ||     | |     ||      | |     ||     |
 * +--0--++--1--+ +---2--++--3--+ +--4--++---5--+ +--6--++--7--+ MBase64 chars
 *                ^                      ^
 * initially,     |                      |
 *                four remnant bits,     |
 *                                       two remnant bits,
 *
 * and then no remnant bits, for three sequential UCS-2 characters
 * respectively; this pattern repeats for every following group of three
 * UCS-2 characters. For the first UCS-2 character in such a group there will
 * be two MBase64 characters, and for the second and the third UCS-2
 * characters there will be three MBase64 characters each.
 *
 * The following action numbers, 0, 2, 5, and 7, are assigned to the
 * corresponding MBase64 characters in the picture above, i.e., the ones that
 * either indicate the starting/initial character or can yield a UCS-2
 * character.
 */
/* Position of the starting/initial MBase64 character of a group. */
#define	ICV_U7_ACTION_START		0

/*
 * Positions of the MBase64 characters that can yield the first, the second,
 * and the third UCS-2 character of a group, respectively.
 */
#define	ICV_U7_ACTION_HARVEST1		2
#define	ICV_U7_ACTION_HARVEST2		5
#define	ICV_U7_ACTION_HARVEST3		7

/*
 * A 32-bit value that is above U+10FFFF.
 * NOTE(review): going by the name this is presumably a marker for a UCS-4
 * value that is out of the UTF-16 range; confirm against the users of this
 * header.
 */
#define	ICV_U7_UCS4_OUTOFUTF16		0xfffefeff
82 
/*
 * Make sure that at least (sz) more units can be stored between ob and
 * obtail. If not, fail with E2BIG: errno and ret_val are set and the
 * enclosing loop (or switch) of the expansion site is left via break.
 *
 * This cannot be wrapped in a do { } while (0) since the break must act on
 * the construct that surrounds the expansion. The names ob, obtail, and
 * ret_val must be in scope where the macro is expanded.
 */
#define OUTBUF_SIZE_CHECK(sz) \
	if ((sz) > (obtail - ob)) { \
		ret_val = (size_t)-1; \
		errno = E2BIG; \
		break; \
	}
89 
/*
 * For better performance and readability, we prefer to write macros like
 * the ones below instead of putting the code in functions and calling them.
 * The macros also leave the caller's loop with a break on an error, which
 * a function could not do.
 */
/*
 * Store the low 16 bits of u4 at ob as two bytes in the byte order selected
 * by cd->little_endian. If no BOM has been written yet, the two byte BOM
 * (U+FEFF in the same byte order) is stored first and cd->bom_written is
 * set.
 *
 * The room needed is ICV_FETCH_UCS_SIZE, or ICV_FETCH_UCS_SIZE_TWO when the
 * BOM is still to be written; neither is defined in this header. If there
 * is not enough room between ob and obtail, nothing is written, errno is
 * set to E2BIG, ret_val to (size_t)-1, and the enclosing loop of the
 * expansion site is left via break.
 *
 * The names cd, ob, obtail, obsz, u4, and ret_val must be in scope where
 * the macro is expanded.
 */
#define CHECK_OUTBUF_SZ_AND_WRITE_U2 \
	obsz = (cd->bom_written) ? ICV_FETCH_UCS_SIZE : ICV_FETCH_UCS_SIZE_TWO;\
	if (obsz > (obtail - ob)) { \
		ret_val = (size_t)-1; \
		errno = E2BIG; \
		break; \
	} \
	if (! cd->bom_written) { \
		*ob++ = (uchar_t)(cd->little_endian ? 0xff : 0xfe); \
		*ob++ = (uchar_t)(cd->little_endian ? 0xfe : 0xff); \
		cd->bom_written = true; \
	} \
	if (cd->little_endian) { \
		*ob++ = (uchar_t)(u4 & 0xff); \
		*ob++ = (uchar_t)((u4 >> 8) & 0xff); \
	} else { \
		*ob++ = (uchar_t)((u4 >> 8) & 0xff); \
		*ob++ = (uchar_t)(u4 & 0xff); \
	}
118 
/*
 * Store u4 at ob as four bytes in the byte order selected by
 * cd->little_endian. If no BOM has been written yet, the four byte BOM
 * (U+FEFF in the same byte order, i.e., ff fe 00 00 or 00 00 fe ff) is
 * stored first and cd->bom_written is set.
 *
 * The room needed is ICV_FETCH_UCS_SIZE, or ICV_FETCH_UCS_SIZE_TWO when the
 * BOM is still to be written; neither is defined in this header. If there
 * is not enough room between ob and obtail, nothing is written, errno is
 * set to E2BIG, ret_val to (size_t)-1, and the enclosing loop of the
 * expansion site is left via break.
 *
 * The two zero bytes of the BOM are stored one byte at a time. The output
 * pointer has no alignment guarantee, so storing them through a
 * (ushort_t *) cast could be a misaligned access, which traps on strict
 * alignment machines.
 *
 * The names cd, ob, obtail, obsz, u4, and ret_val must be in scope where
 * the macro is expanded.
 */
#define CHECK_OUTBUF_SZ_AND_WRITE_U4 \
	obsz = (cd->bom_written) ? ICV_FETCH_UCS_SIZE : ICV_FETCH_UCS_SIZE_TWO;\
	if ((obtail - ob) < obsz) { \
		errno = E2BIG; \
		ret_val = (size_t)-1; \
		break; \
	} \
	if (cd->little_endian) { \
		if (! cd->bom_written) { \
			*ob++ = (uchar_t)0xff; \
			*ob++ = (uchar_t)0xfe; \
			*ob++ = (uchar_t)0; \
			*ob++ = (uchar_t)0; \
			cd->bom_written = true; \
		} \
		*ob++ = (uchar_t)(u4 & 0xff); \
		*ob++ = (uchar_t)((u4 >> 8) & 0xff); \
		*ob++ = (uchar_t)((u4 >> 16) & 0xff); \
		*ob++ = (uchar_t)((u4 >> 24) & 0xff); \
	} else { \
		if (! cd->bom_written) { \
			*ob++ = (uchar_t)0; \
			*ob++ = (uchar_t)0; \
			*ob++ = (uchar_t)0xfe; \
			*ob++ = (uchar_t)0xff; \
			cd->bom_written = true; \
		} \
		*ob++ = (uchar_t)((u4 >> 24) & 0xff); \
		*ob++ = (uchar_t)((u4 >> 16) & 0xff); \
		*ob++ = (uchar_t)((u4 >> 8) & 0xff); \
		*ob++ = (uchar_t)(u4 & 0xff); \
	}
151 
/*
 * Store u4 at ob as a UTF-8 byte sequence of one to four bytes.
 *
 * UTF-7's code range is basically that of UTF-16, i.e.,
 * U+0000 0000 ~ U+0010 FFFF, so anything bigger than U+0010 FFFF is
 * rejected: errno is set to EILSEQ, ret_val to (size_t)-1, and the
 * enclosing loop of the expansion site is left via break. No other check is
 * made on the value; anything up to 0x10ffff is encoded as given.
 *
 * The room in the output buffer is checked with OUTBUF_SIZE_CHECK() before
 * any byte of a sequence is stored; see that macro for the E2BIG case.
 *
 * The names ob, obtail, u4, and ret_val must be in scope where the macro is
 * expanded.
 */
#define	CHECK_OUTBUF_SZ_AND_WRITE_U8_OR_EILSEQ \
	if (u4 > 0x10ffff) { \
		ret_val = (size_t)-1; \
		errno = EILSEQ; \
		break; \
	} else if (u4 > 0x00ffff) { \
		OUTBUF_SIZE_CHECK(4); \
		*ob++ = (uchar_t)(0xf0 | ((u4 >> 18) & 0x07)); \
		*ob++ = (uchar_t)(0x80 | ((u4 >> 12) & 0x3f)); \
		*ob++ = (uchar_t)(0x80 | ((u4 >> 6) & 0x3f)); \
		*ob++ = (uchar_t)(0x80 | (u4 & 0x3f)); \
	} else if (u4 > 0x7ff) { \
		OUTBUF_SIZE_CHECK(3); \
		*ob++ = (uchar_t)(0xe0 | ((u4 >> 12) & 0x0f)); \
		*ob++ = (uchar_t)(0x80 | ((u4 >> 6) & 0x3f)); \
		*ob++ = (uchar_t)(0x80 | (u4 & 0x3f)); \
	} else if (u4 > 0x7f) { \
		OUTBUF_SIZE_CHECK(2); \
		*ob++ = (uchar_t)(0xc0 | ((u4 >> 6) & 0x1f)); \
		*ob++ = (uchar_t)(0x80 | (u4 & 0x3f)); \
	} else { \
		OUTBUF_SIZE_CHECK(1); \
		*ob++ = (uchar_t)u4; \
	}
180 
181 
182 #endif	/* UTF7_TO_UCS_H */
183