1*91e1e26aSAlexander Pyhalov /* 2*91e1e26aSAlexander Pyhalov * CDDL HEADER START 3*91e1e26aSAlexander Pyhalov * 4*91e1e26aSAlexander Pyhalov * The contents of this file are subject to the terms of the 5*91e1e26aSAlexander Pyhalov * Common Development and Distribution License (the "License"). 6*91e1e26aSAlexander Pyhalov * You may not use this file except in compliance with the License. 7*91e1e26aSAlexander Pyhalov * 8*91e1e26aSAlexander Pyhalov * You can obtain a copy of the license at src/OPENSOLARIS.LICENSE 9*91e1e26aSAlexander Pyhalov * or http://www.opensolaris.org/os/licensing. 10*91e1e26aSAlexander Pyhalov * See the License for the specific language governing permissions 11*91e1e26aSAlexander Pyhalov * and limitations under the License. 12*91e1e26aSAlexander Pyhalov * 13*91e1e26aSAlexander Pyhalov * When distributing Covered Code, include this CDDL HEADER in each 14*91e1e26aSAlexander Pyhalov * file and include the License file at src/OPENSOLARIS.LICENSE. 15*91e1e26aSAlexander Pyhalov * If applicable, add the following below this CDDL HEADER, with the 16*91e1e26aSAlexander Pyhalov * fields enclosed by brackets "[]" replaced with your own identifying 17*91e1e26aSAlexander Pyhalov * information: Portions Copyright [yyyy] [name of copyright owner] 18*91e1e26aSAlexander Pyhalov * 19*91e1e26aSAlexander Pyhalov * CDDL HEADER END 20*91e1e26aSAlexander Pyhalov */ 21*91e1e26aSAlexander Pyhalov 22*91e1e26aSAlexander Pyhalov /* 23*91e1e26aSAlexander Pyhalov * Copyright 2002 Sun Microsystems, Inc. All rights reserved. 24*91e1e26aSAlexander Pyhalov * Use is subject to license terms. 25*91e1e26aSAlexander Pyhalov */ 26*91e1e26aSAlexander Pyhalov 27*91e1e26aSAlexander Pyhalov #include <sys/types.h> 28*91e1e26aSAlexander Pyhalov #include "../inc/common_defs.h" 29*91e1e26aSAlexander Pyhalov 30*91e1e26aSAlexander Pyhalov /* 31*91e1e26aSAlexander Pyhalov * convert utf8 string to unicode 32*91e1e26aSAlexander Pyhalov * return value: 0 - fail 33*91e1e26aSAlexander Pyhalov * 1 - success 34*91e1e26aSAlexander Pyhalov */ 35*91e1e26aSAlexander Pyhalov int 36*91e1e26aSAlexander Pyhalov convert_utf8_to_ucs4(uchar_t *ib, int utf8_len, uint_t *unicode) 37*91e1e26aSAlexander Pyhalov { 38*91e1e26aSAlexander Pyhalov uchar_t first_byte = *ib; 39*91e1e26aSAlexander Pyhalov uint_t u4; 40*91e1e26aSAlexander Pyhalov 41*91e1e26aSAlexander Pyhalov if ( number_of_bytes_in_utf8_char[first_byte] != utf8_len) return 0; 42*91e1e26aSAlexander Pyhalov 43*91e1e26aSAlexander Pyhalov u4 = (uint_t)(*ib++ & masks_tbl[utf8_len]); 44*91e1e26aSAlexander Pyhalov for (; utf8_len > 1; utf8_len--) 45*91e1e26aSAlexander Pyhalov { 46*91e1e26aSAlexander Pyhalov u4 = (u4 << ICV_UTF8_BIT_SHIFT) | (((uint_t) *ib) & ICV_UTF8_BIT_MASK); 47*91e1e26aSAlexander Pyhalov ++ib; 48*91e1e26aSAlexander Pyhalov } 49*91e1e26aSAlexander Pyhalov 50*91e1e26aSAlexander Pyhalov *unicode = u4; 51*91e1e26aSAlexander Pyhalov 52*91e1e26aSAlexander Pyhalov return 1; 53*91e1e26aSAlexander Pyhalov } 54*91e1e26aSAlexander Pyhalov 55*91e1e26aSAlexander Pyhalov /* 56*91e1e26aSAlexander Pyhalov * check whether the input 'str' is valid UTF-8 byte sequence or not, 57*91e1e26aSAlexander Pyhalov * which lenght is specified by 'utf8_len' 58*91e1e26aSAlexander Pyhalov * 59*91e1e26aSAlexander Pyhalov * return: 0 - invalid byte sequence 60*91e1e26aSAlexander Pyhalov * 1 - valid byte sequence 61*91e1e26aSAlexander Pyhalov */ 62*91e1e26aSAlexander Pyhalov int 63*91e1e26aSAlexander Pyhalov is_valid_utf8_string(uchar_t *str, int utf8_len) 64*91e1e26aSAlexander Pyhalov { 65*91e1e26aSAlexander Pyhalov uint_t unicode = 0; 66*91e1e26aSAlexander Pyhalov uchar_t *ib = str; 67*91e1e26aSAlexander Pyhalov uchar_t first_byte; 68*91e1e26aSAlexander Pyhalov int is_second_byte = 0, len=utf8_len; 69*91e1e26aSAlexander Pyhalov 70*91e1e26aSAlexander Pyhalov if (number_of_bytes_in_utf8_char[*ib] == ICV_TYPE_ILLEGAL_CHAR || 71*91e1e26aSAlexander Pyhalov number_of_bytes_in_utf8_char[*ib] != utf8_len ) return 0; 72*91e1e26aSAlexander Pyhalov 73*91e1e26aSAlexander Pyhalov first_byte = *ib; 74*91e1e26aSAlexander Pyhalov --utf8_len; 75*91e1e26aSAlexander Pyhalov ++ib; 76*91e1e26aSAlexander Pyhalov is_second_byte = 1; 77*91e1e26aSAlexander Pyhalov 78*91e1e26aSAlexander Pyhalov while (utf8_len != 0) 79*91e1e26aSAlexander Pyhalov { 80*91e1e26aSAlexander Pyhalov if (is_second_byte) 81*91e1e26aSAlexander Pyhalov { 82*91e1e26aSAlexander Pyhalov if ( *ib < valid_min_2nd_byte[first_byte] || *ib > valid_max_2nd_byte[first_byte] ) 83*91e1e26aSAlexander Pyhalov return 0; 84*91e1e26aSAlexander Pyhalov is_second_byte = 0; 85*91e1e26aSAlexander Pyhalov } 86*91e1e26aSAlexander Pyhalov else if ((*ib & 0xc0) != 0x80) /* 0x80 -- 0xbf */ 87*91e1e26aSAlexander Pyhalov return 0; 88*91e1e26aSAlexander Pyhalov 89*91e1e26aSAlexander Pyhalov --utf8_len; 90*91e1e26aSAlexander Pyhalov ++ib; 91*91e1e26aSAlexander Pyhalov } 92*91e1e26aSAlexander Pyhalov 93*91e1e26aSAlexander Pyhalov convert_utf8_to_ucs4(str, len, &unicode); 94*91e1e26aSAlexander Pyhalov if (unicode == 0xFFFE || unicode == 0xFFFF) return 0; 95*91e1e26aSAlexander Pyhalov 96*91e1e26aSAlexander Pyhalov return 1; 97*91e1e26aSAlexander Pyhalov } 98