1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2002 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #include <sys/types.h> 28 #include "../inc/common_defs.h" 29 30 /* 31 * convert utf8 string to unicode 32 * return value: 0 - fail 33 * 1 - success 34 */ 35 int 36 convert_utf8_to_ucs4(uchar_t *ib, int utf8_len, uint_t *unicode) 37 { 38 uchar_t first_byte = *ib; 39 uint_t u4; 40 41 if ( number_of_bytes_in_utf8_char[first_byte] != utf8_len) return 0; 42 43 u4 = (uint_t)(*ib++ & masks_tbl[utf8_len]); 44 for (; utf8_len > 1; utf8_len--) 45 { 46 u4 = (u4 << ICV_UTF8_BIT_SHIFT) | (((uint_t) *ib) & ICV_UTF8_BIT_MASK); 47 ++ib; 48 } 49 50 *unicode = u4; 51 52 return 1; 53 } 54 55 /* 56 * check whether the input 'str' is valid UTF-8 byte sequence or not, 57 * which lenght is specified by 'utf8_len' 58 * 59 * return: 0 - invalid byte sequence 60 * 1 - valid byte sequence 61 */ 62 int 63 is_valid_utf8_string(uchar_t *str, int utf8_len) 64 { 65 uint_t unicode = 0; 66 uchar_t *ib = str; 67 uchar_t first_byte; 68 int is_second_byte = 0, len=utf8_len; 69 70 if (number_of_bytes_in_utf8_char[*ib] == ICV_TYPE_ILLEGAL_CHAR || 71 number_of_bytes_in_utf8_char[*ib] != utf8_len ) return 0; 72 73 first_byte = *ib; 74 --utf8_len; 75 ++ib; 76 is_second_byte = 1; 77 78 while (utf8_len != 0) 79 { 80 if (is_second_byte) 81 { 82 if ( *ib < valid_min_2nd_byte[first_byte] || *ib > valid_max_2nd_byte[first_byte] ) 83 return 0; 84 is_second_byte = 0; 85 } 86 else if ((*ib & 0xc0) != 0x80) /* 0x80 -- 0xbf */ 87 return 0; 88 89 --utf8_len; 90 ++ib; 91 } 92 93 convert_utf8_to_ucs4(str, len, &unicode); 94 if (unicode == 0xFFFE || unicode == 0xFFFF) return 0; 95 96 return 1; 97 } 98