xref: /titanic_51/usr/src/lib/iconv_modules/ko/common/utf_to_johap92.c (revision 91e1e26ac6a73ce959289cf7d3d96c4baedbe0b8)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 1994 by Sun Microsystems, Inc.
23  */
24 
25 
26 #include <stdlib.h>
27 #include <errno.h>
28 #include "hangulcode.h"
29 #include "ktable.h"
30 #include "utf_johap92.h"
31 #include "common_defs.h"
32 
33 #define	MSB	0x80	/* mask for most-significant-bit */
/*
 * States of the UTF-8 input parser used by _icv_iconv().
 */
typedef enum _USTATE {
	U0 = 0,	/* at the beginning of a new UTF-8 character	*/
	U1,	/* expecting 2nd byte of a 2-byte sequence	*/
	U2,	/* expecting 2nd byte of a 3-byte sequence	*/
	U3,	/* expecting 3rd byte of a 3-byte sequence	*/
	U4,	/* expecting 2nd byte of a 4-byte sequence	*/
	U5,	/* expecting 3rd byte of a 4-byte sequence	*/
	U6,	/* expecting 4th byte of a 4-byte sequence	*/
	UX	/* sequence gathered in _buffer, convert it	*/
} USTATE;

/*
 * Conversion descriptor handed out by _icv_open() and carried
 * across _icv_iconv() calls.
 */
typedef struct _icv_state {
	unsigned char	_buffer[6];	/* bytes of the sequence in progress	*/
	USTATE		_ustate;	/* current parser state			*/
	unsigned short	_count;		/* number of bytes held in _buffer	*/
	int		_errno;		/* error recorded by the last call	*/
} _iconv_st;
42 
43 /****  _ I C V _ O P E N  ****/
44 
45 void* _icv_open()
46 {
47 	_iconv_st *st;
48 	if((st = (_iconv_st *) malloc(sizeof(_iconv_st))) == NULL){
49 		errno = ENOMEM;
50 		return ((void *) -1);
51 	}
52 	st->_ustate = U0;
53 	st->_errno = 0;
54 	st->_count = 0;
55 /*
56 	RESET_CONV_DESC();
57 */
58 	return ((void *) st);
59 }  /* end of int _icv_open(). */
60 
61 
62 /****  _ I C V _ C L O S E  ****/
63 
64 void _icv_close(_iconv_st* st)
65 {
66 	if(!st)
67 		errno = EBADF;
68 	else
69 		free(st);
70 }  /* end of void _icv_close(int*). */
71 
72 
73 /****  _ I C V _ I C O N V  ****/
74 
75 size_t _icv_iconv(_iconv_st* st, char** inbuf, size_t* inbufleft,
76 			char** outbuf, size_t* outbufleft)
77 {
78 	size_t		ret_val = 0;
79 	unsigned char*	ib;
80 	unsigned char*	ob;
81 	unsigned char*	ibtail;
82 	unsigned char*	obtail;
83 
84 	hcode_type utf8_code, johap92_code;
85 
86 	if(st == NULL){
87 		errno = EBADF;
88 		return ((size_t) -1);
89 	}
90 
91 	if (!inbuf || !(*inbuf)){
92 		st->_ustate = U0;
93 		st->_errno = 0;
94 		return((size_t)0);
95 	}
96 
97 	st->_errno = 0;
98 	errno = 0;
99 
100 	ib = (unsigned char*)*inbuf;
101 	ob = (unsigned char*)*outbuf;
102 	ibtail = ib + *inbufleft;
103 	obtail = ob + *outbufleft;
104 
105 
106 	while (ib < ibtail)
107 	{
108 		unsigned char first_byte;
109 		switch(st->_ustate){
110 		case U0:	/* begining of new utf-8 char sequence */
111 			if((*ib & MSB) == 0){	/* MSB is off, so ASCII */
112 				if(ob >= obtail){
113 					errno = E2BIG;
114 					ret_val = (size_t) -1;
115 					break;
116 				}
117 				*ob++ = *ib++;
118 
119 			} else { 	/* Now, begining of UTF-8 */
120 				if((*ib & 0xe0) == 0xc0){
121 				/* 2-byte utf-8				*/
122 				/* true if *ib is (0xc0 ~ 0xdf) 	*/
123 				/* but, need to filter out the range 	*/
124 				/* 0xc0 ~ 0xc1				*/
125 
126 					if(number_of_bytes_in_utf8_char[(unsigned char) *ib] ==
127 					    ICV_TYPE_ILLEGAL_CHAR)
128 						st->_errno = errno = EILSEQ;
129 					else {
130 						st->_ustate = U1;
131 						st->_buffer[0] = *ib;
132 					}
133 				} else if((*ib & 0xf0) == 0xe0){
134 				/* 3 byte utf-8				*/
135 				/* if *ib is (0xe0 ~ 0xef)		*/
136 					st->_ustate = U2;
137 					st->_buffer[0] = *ib;
138 				} else {
139 				/* 4 byte utf-8				*/
140 				/* true if *ib is (0xf0 ~ 0xff)		*/
141 				/* but, need to screen out the range	*/
142 				/* 0xf5 ~ 0xff				*/
143 					if(number_of_bytes_in_utf8_char[(unsigned char) *ib] ==
144 					    ICV_TYPE_ILLEGAL_CHAR)
145 						st->_errno = errno = EILSEQ;
146 					else {
147 						st->_ustate = U4;
148 						st->_buffer[0] = *ib;
149 
150 					}
151 				}
152 				st->_count++;
153 				ib++;
154 			}
155 			break;
156 		case U1:	/* we are getting 2nd byte of 2byte utf-8	*/
157 				/* convert it right here			*/
158 			if((*ib & 0xc0) == MSB){
159 				st->_ustate = UX;
160 				st->_buffer[1] = *ib;
161 				st->_count++;
162 				continue;/* Now, we gotta do the real conversion*/
163 					 /* becuase we just came to an the last	*/
164 					 /* byte of utf-8 character		*/
165 			} else {
166 				ib++;
167 				st->_errno = errno = EILSEQ;
168 				ret_val = (size_t) -1;
169 				break;
170 			}
171 			break;
172 		case U2:	/* 2nd byte of 3byte utf-8			*/
173 			first_byte = (unsigned char) st->_buffer[0];
174 				/* basic utf-8 validity check first...		*/
175 			if((*ib & 0xc0) == MSB){
176 				/* if okay, then what about the range of this byte?	*/
177 				/* if the first byte is 0xed, it is illegal sequence	*/
178 				/* if the second one is between 0xa0 and 0xbf		*/
179 				/* because surrogate section is ill-formed		*/
180 
181 				if((unsigned char)*ib < valid_min_2nd_byte[first_byte] ||
182 				    (unsigned char)*ib > valid_max_2nd_byte[first_byte]){
183 					st->_errno = errno = EILSEQ;
184 				} else {
185 					st->_ustate = U3;
186 					st->_buffer[1] = *ib;
187 					st->_count++;
188 				}
189 
190 			} else {
191 				st->_errno = errno = EILSEQ;
192 			}
193 			ib++;
194 			break;
195 		case U3:	/* 3rd byte of 3byte utf-8			*/
196 			if((*ib & 0xc0) == MSB){
197 				st->_ustate = UX;
198 				st->_buffer[2] = *ib;
199 				st->_count++;
200 				continue;/* Now, we gotta do the real conversion*/
201 					 /* becuase we just came to an the last */
202 					 /* byte of utf-8 character		*/
203 			} else {
204 				st->_errno = errno = EILSEQ;
205 				ret_val = (size_t) -1;
206 				ib++;
207 				break;
208 			}
209 			break;
210 		case U4:	/* 2nd byte of 4byte utf-8			*/
211 			first_byte = st->_buffer[0];
212 			if((*ib & 0xc0) == MSB){
213 				if((unsigned char)*ib < valid_min_2nd_byte[first_byte] ||
214 				  (unsigned char)*ib > valid_max_2nd_byte[first_byte]){
215 					st->_errno = errno = EILSEQ;
216 				} else {
217 					st->_ustate = U5;
218 					st->_buffer[1] = *ib;
219 					st->_count++;
220 				}
221 			} else {
222 				st->_errno = errno = EILSEQ;
223 			}
224 			ib++;
225 			break;
226 		case U5:	/* 3rd byte of 4byte utf-8			*/
227 			if((*ib & 0xc0) == MSB){
228 				st->_ustate = U6;
229 				st->_buffer[2] = *ib;
230 				st->_count++;
231 			} else {
232 				st->_errno = errno = EILSEQ;
233 			}
234 			ib++;
235 			break;
236 		case U6:	/* 4th byte of 4byte utf-8			*/
237 			if((*ib & 0xc0) == MSB){
238 				if((obtail - ob) < 2){
239 					st->_errno = errno = E2BIG;
240 				} else {
241 					*ob++ = NON_ID_CHAR;
242 					*ob++ = NON_ID_CHAR;
243 					st->_ustate = U0;
244 				}
245 			} else {
246 				st->_errno = errno = EILSEQ;
247 			}
248 			ib++;
249 			break;
250 		case UX:
251 			/*******************************************************
252 			 * convert valid utf-8 sequence gathered in the
253 			 * st->_buffer to euc
254 			 *******************************************************/
255 			utf8_code.code = 0;
256 			switch(st->_count){
257 			case 2: /* 2byte utf-8 code */
258 				utf8_code.byte.byte3 = st->_buffer[0];
259 				utf8_code.byte.byte4 = st->_buffer[1];
260 				break;
261 			case 3: /* 3byte utf-8 code */
262 				utf8_code.byte.byte2 = st->_buffer[0];
263 				utf8_code.byte.byte3 = st->_buffer[1];
264 				utf8_code.byte.byte4 = st->_buffer[2];
265 				break;
266 			}
267 			unsigned short _utf8_to_jahap92(utf_code.code)
268 
269 			if (euc_code.code != 0) {
270 			/* If find something -> EUC code */
271                                 *ob++ = euc_code.byte.byte3;
272                                 *ob++ = euc_code.byte.byte4;
273                         }
274                         else
275                         {
276                                 /* Let's assume the code is not identifiable */
277                                 if ((obtail - ob) < 2)
278                                 {
279                                         errno = E2BIG;
280                                         ret_val = (size_t)-1;
281                                 }
282                                 *ob++ = NON_IDENTICAL;
283                                 *ob++ = NON_IDENTICAL;
284                                 ret_val += 2;
285                         }
286 			st->_ustate = U0;
287 			st->_count = 0;
288 			ib++;
289 			break;
290 		default:	/* You are not supposed to get here...		*/
291 				/* But, just only for the integrity		*/
292 			st->_errno = errno = EILSEQ;
293 			st->_ustate = U0;
294 			st->_count = 0;
295 			break;
296 
297 		}
298 		if(st->_errno){
299 #ifdef DEBUG
300 			fprintf(stderr,  "st->_errno=%d\tst->_ustate=%d\n", st->_errno, st->_ustate);
301 #endif /* DEBUG */
302 			break;
303 		}
304 
305 	}
306 	if(errno) return ((size_t) -1);
307 
308 	*inbuf = (char*)ib;
309 	*inbufleft = ibtail - ib;
310 	*outbuf = (char*)ob;
311 	*outbufleft = obtail - ob;
312 
313 	return(ret_val);
314 }  /* end of size_t _icv_iconv(int*, char**, size_t*, char**, size_t*).*/
315 
316 
317 
318 
319 
320 
321 
322 
323 
324 unsigned short _utf8_to_jahap92(unsigned long utf_code)
325 {
326 	int low, mid, high;
327 	low = 0, high = MAX_U2J92_NUM;
328 	while(low < high){
329 		mid = (low + high)/2;
330 		if(utf8_to_johap92_tbl[mid].utf8 = utf_code){
331 			break;
332 		} else if(utf8_to_johap92_tbl[mid].utf8 > utf_code){
333 			high = mid - 1;
334 		} else if(utf8_to_johap92_tbl[mid].utf8 < utf_code){
335 			low = mid + 1;
336 		}
337 	}
338 }
339