1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright 2002 Sun Microsystems, Inc. All rights reserved.
24 * Use is subject to license terms.
25 */
26
27 #include <sys/types.h>
28 #include "../inc/common_defs.h"
29
30 /*
31 * convert utf8 string to unicode
32 * return value: 0 - fail
33 * 1 - success
34 */
35 int
convert_utf8_to_ucs4(uchar_t * ib,int utf8_len,uint_t * unicode)36 convert_utf8_to_ucs4(uchar_t *ib, int utf8_len, uint_t *unicode)
37 {
38 uchar_t first_byte = *ib;
39 uint_t u4;
40
41 if ( number_of_bytes_in_utf8_char[first_byte] != utf8_len) return 0;
42
43 u4 = (uint_t)(*ib++ & masks_tbl[utf8_len]);
44 for (; utf8_len > 1; utf8_len--)
45 {
46 u4 = (u4 << ICV_UTF8_BIT_SHIFT) | (((uint_t) *ib) & ICV_UTF8_BIT_MASK);
47 ++ib;
48 }
49
50 *unicode = u4;
51
52 return 1;
53 }
54
55 /*
56 * check whether the input 'str' is valid UTF-8 byte sequence or not,
57 * which lenght is specified by 'utf8_len'
58 *
59 * return: 0 - invalid byte sequence
60 * 1 - valid byte sequence
61 */
62 int
is_valid_utf8_string(uchar_t * str,int utf8_len)63 is_valid_utf8_string(uchar_t *str, int utf8_len)
64 {
65 uint_t unicode = 0;
66 uchar_t *ib = str;
67 uchar_t first_byte;
68 int is_second_byte = 0, len=utf8_len;
69
70 if (number_of_bytes_in_utf8_char[*ib] == ICV_TYPE_ILLEGAL_CHAR ||
71 number_of_bytes_in_utf8_char[*ib] != utf8_len ) return 0;
72
73 first_byte = *ib;
74 --utf8_len;
75 ++ib;
76 is_second_byte = 1;
77
78 while (utf8_len != 0)
79 {
80 if (is_second_byte)
81 {
82 if ( *ib < valid_min_2nd_byte[first_byte] || *ib > valid_max_2nd_byte[first_byte] )
83 return 0;
84 is_second_byte = 0;
85 }
86 else if ((*ib & 0xc0) != 0x80) /* 0x80 -- 0xbf */
87 return 0;
88
89 --utf8_len;
90 ++ib;
91 }
92
93 convert_utf8_to_ucs4(str, len, &unicode);
94 if (unicode == 0xFFFE || unicode == 0xFFFF) return 0;
95
96 return 1;
97 }
98