ko/common/utf_to_johap92.c

/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 1994 by Sun Microsystems, Inc.
 */


#include <stdlib.h>
#include <errno.h>
#include "hangulcode.h"
#include "ktable.h"
#include "utf_johap92.h"
#include "common_defs.h"

#define	MSB	0x80	/* mask for most-significant-bit */
typedef enum _USTATE {U0 = 0, U1, U2, U3, U4, U5, U6,UX} USTATE;

typedef struct _icv_state {
	unsigned char _buffer[6];
	USTATE _ustate;
	unsigned short _count;
	int _errno;
} _iconv_st;

/****  _ I C V _ O P E N  ****/

void* _icv_open()
{
	_iconv_st *st;
	if((st = (_iconv_st *) malloc(sizeof(_iconv_st))) == NULL){
		errno = ENOMEM;
		return ((void *) -1);
	}
	st->_ustate = U0;
	st->_errno = 0;
	st->_count = 0;
/*
	RESET_CONV_DESC();
*/
	return ((void *) st);
}  /* end of int _icv_open(). */


/****  _ I C V _ C L O S E  ****/

void _icv_close(_iconv_st* st)
{
	if(!st)
		errno = EBADF;
	else
		free(st);
}  /* end of void _icv_close(int*). */


/****  _ I C V _ I C O N V  ****/

size_t _icv_iconv(_iconv_st* st, char** inbuf, size_t* inbufleft,
			char** outbuf, size_t* outbufleft)
{
	size_t		ret_val = 0;
	unsigned char*	ib;
	unsigned char*	ob;
	unsigned char*	ibtail;
	unsigned char*	obtail;

	hcode_type utf8_code, johap92_code;

	if(st == NULL){
		errno = EBADF;
		return ((size_t) -1);
	}

	if (!inbuf || !(*inbuf)){
		st->_ustate = U0;
		st->_errno = 0;
		return((size_t)0);
	}

	st->_errno = 0;
	errno = 0;

	ib = (unsigned char*)*inbuf;
	ob = (unsigned char*)*outbuf;
	ibtail = ib + *inbufleft;
	obtail = ob + *outbufleft;


	while (ib < ibtail)
	{
		unsigned char first_byte;
		switch(st->_ustate){
		case U0:	/* begining of new utf-8 char sequence */
			if((*ib & MSB) == 0){	/* MSB is off, so ASCII */
				if(ob >= obtail){
					errno = E2BIG;
					ret_val = (size_t) -1;
					break;
				}
				*ob++ = *ib++;

			} else { 	/* Now, begining of UTF-8 */
				if((*ib & 0xe0) == 0xc0){
				/* 2-byte utf-8				*/
				/* true if *ib is (0xc0 ~ 0xdf) 	*/
				/* but, need to filter out the range 	*/
				/* 0xc0 ~ 0xc1				*/

					if(number_of_bytes_in_utf8_char[(unsigned char) *ib] ==
					    ICV_TYPE_ILLEGAL_CHAR)
						st->_errno = errno = EILSEQ;
					else {
						st->_ustate = U1;
						st->_buffer[0] = *ib;
					}
				} else if((*ib & 0xf0) == 0xe0){
				/* 3 byte utf-8				*/
				/* if *ib is (0xe0 ~ 0xef)		*/
					st->_ustate = U2;
					st->_buffer[0] = *ib;
				} else {
				/* 4 byte utf-8				*/
				/* true if *ib is (0xf0 ~ 0xff)		*/
				/* but, need to screen out the range	*/
				/* 0xf5 ~ 0xff				*/
					if(number_of_bytes_in_utf8_char[(unsigned char) *ib] ==
					    ICV_TYPE_ILLEGAL_CHAR)
						st->_errno = errno = EILSEQ;
					else {
						st->_ustate = U4;
						st->_buffer[0] = *ib;

					}
				}
				st->_count++;
				ib++;
			}
			break;
		case U1:	/* we are getting 2nd byte of 2byte utf-8	*/
				/* convert it right here			*/
			if((*ib & 0xc0) == MSB){
				st->_ustate = UX;
				st->_buffer[1] = *ib;
				st->_count++;
				continue;/* Now, we gotta do the real conversion*/
					 /* becuase we just came to an the last	*/
					 /* byte of utf-8 character		*/
			} else {
				ib++;
				st->_errno = errno = EILSEQ;
				ret_val = (size_t) -1;
				break;
			}
			break;
		case U2:	/* 2nd byte of 3byte utf-8			*/
			first_byte = (unsigned char) st->_buffer[0];
				/* basic utf-8 validity check first...		*/
			if((*ib & 0xc0) == MSB){
				/* if okay, then what about the range of this byte?	*/
				/* if the first byte is 0xed, it is illegal sequence	*/
				/* if the second one is between 0xa0 and 0xbf		*/
				/* because surrogate section is ill-formed		*/

				if((unsigned char)*ib < valid_min_2nd_byte[first_byte] ||
				    (unsigned char)*ib > valid_max_2nd_byte[first_byte]){
					st->_errno = errno = EILSEQ;
				} else {
					st->_ustate = U3;
					st->_buffer[1] = *ib;
					st->_count++;
				}

			} else {
				st->_errno = errno = EILSEQ;
			}
			ib++;
			break;
		case U3:	/* 3rd byte of 3byte utf-8			*/
			if((*ib & 0xc0) == MSB){
				st->_ustate = UX;
				st->_buffer[2] = *ib;
				st->_count++;
				continue;/* Now, we gotta do the real conversion*/
					 /* becuase we just came to an the last */
					 /* byte of utf-8 character		*/
			} else {
				st->_errno = errno = EILSEQ;
				ret_val = (size_t) -1;
				ib++;
				break;
			}
			break;
		case U4:	/* 2nd byte of 4byte utf-8			*/
			first_byte = st->_buffer[0];
			if((*ib & 0xc0) == MSB){
				if((unsigned char)*ib < valid_min_2nd_byte[first_byte] ||
				  (unsigned char)*ib > valid_max_2nd_byte[first_byte]){
					st->_errno = errno = EILSEQ;
				} else {
					st->_ustate = U5;
					st->_buffer[1] = *ib;
					st->_count++;
				}
			} else {
				st->_errno = errno = EILSEQ;
			}
			ib++;
			break;
		case U5:	/* 3rd byte of 4byte utf-8			*/
			if((*ib & 0xc0) == MSB){
				st->_ustate = U6;
				st->_buffer[2] = *ib;
				st->_count++;
			} else {
				st->_errno = errno = EILSEQ;
			}
			ib++;
			break;
		case U6:	/* 4th byte of 4byte utf-8			*/
			if((*ib & 0xc0) == MSB){
				if((obtail - ob) < 2){
					st->_errno = errno = E2BIG;
				} else {
					*ob++ = NON_ID_CHAR;
					*ob++ = NON_ID_CHAR;
					st->_ustate = U0;
				}
			} else {
				st->_errno = errno = EILSEQ;
			}
			ib++;
			break;
		case UX:
			/*******************************************************
			 * convert valid utf-8 sequence gathered in the
			 * st->_buffer to euc
			 *******************************************************/
			utf8_code.code = 0;
			switch(st->_count){
			case 2: /* 2byte utf-8 code */
				utf8_code.byte.byte3 = st->_buffer[0];
				utf8_code.byte.byte4 = st->_buffer[1];
				break;
			case 3: /* 3byte utf-8 code */
				utf8_code.byte.byte2 = st->_buffer[0];
				utf8_code.byte.byte3 = st->_buffer[1];
				utf8_code.byte.byte4 = st->_buffer[2];
				break;
			}
			unsigned short _utf8_to_jahap92(utf_code.code)

			if (euc_code.code != 0) {
			/* If find something -> EUC code */
                                *ob++ = euc_code.byte.byte3;
                                *ob++ = euc_code.byte.byte4;
                        }
                        else
                        {
                                /* Let's assume the code is not identifiable */
                                if ((obtail - ob) < 2)
                                {
                                        errno = E2BIG;
                                        ret_val = (size_t)-1;
                                }
                                *ob++ = NON_IDENTICAL;
                                *ob++ = NON_IDENTICAL;
                                ret_val += 2;
                        }
			st->_ustate = U0;
			st->_count = 0;
			ib++;
			break;
		default:	/* You are not supposed to get here...		*/
				/* But, just only for the integrity		*/
			st->_errno = errno = EILSEQ;
			st->_ustate = U0;
			st->_count = 0;
			break;

		}
		if(st->_errno){
#ifdef DEBUG
			fprintf(stderr,  "st->_errno=%d\tst->_ustate=%d\n", st->_errno, st->_ustate);
#endif /* DEBUG */
			break;
		}

	}
	if(errno) return ((size_t) -1);

	*inbuf = (char*)ib;
	*inbufleft = ibtail - ib;
	*outbuf = (char*)ob;
	*outbufleft = obtail - ob;

	return(ret_val);
}  /* end of size_t _icv_iconv(int*, char**, size_t*, char**, size_t*).*/


unsigned short _utf8_to_jahap92(unsigned long utf_code)
{
	int low, mid, high;
	low = 0, high = MAX_U2J92_NUM;
	while(low < high){
		mid = (low + high)/2;
		if(utf8_to_johap92_tbl[mid].utf8 = utf_code){
			break;
		} else if(utf8_to_johap92_tbl[mid].utf8 > utf_code){
			high = mid - 1;
		} else if(utf8_to_johap92_tbl[mid].utf8 < utf_code){
			low = mid + 1;
		}
	}
}