1 /*- 2 * Copyright (c) 2002 Tim J. Robbins 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 24 * SUCH DAMAGE. 25 */ 26 27 #include <sys/cdefs.h> 28 __FBSDID("$FreeBSD$"); 29 30 #include <rune.h> 31 #include <stddef.h> 32 #include <stdio.h> 33 #include <stdlib.h> 34 35 rune_t _UTF8_sgetrune(const char *, size_t, char const **); 36 int _UTF8_sputrune(rune_t, char *, size_t, char **); 37 38 int 39 _UTF8_init(_RuneLocale *rl) 40 { 41 42 rl->sgetrune = _UTF8_sgetrune; 43 rl->sputrune = _UTF8_sputrune; 44 _CurrentRuneLocale = rl; 45 __mb_cur_max = 6; 46 47 return (0); 48 } 49 50 rune_t 51 _UTF8_sgetrune(const char *string, size_t n, const char **result) 52 { 53 int ch, len, mask; 54 rune_t lbound, wch; 55 56 if (n < 1) { 57 if (result != NULL) 58 *result = string; 59 return (_INVALID_RUNE); 60 } 61 62 /* 63 * Determine the number of octets that make up this character from 64 * the first octet, and a mask that extracts the interesting bits of 65 * the first octet. 66 * 67 * We also specify a lower bound for the character code to detect 68 * redundant, non-"shortest form" encodings. For example, the 69 * sequence C0 80 is _not_ a legal representation of the null 70 * character. This enforces a 1-to-1 mapping between character 71 * codes and their multibyte representations. 72 */ 73 ch = (unsigned char)*string; 74 if ((ch & 0x80) == 0) { 75 mask = 0x7f; 76 len = 1; 77 lbound = 0; 78 } else if ((ch & 0xe0) == 0xc0) { 79 mask = 0x1f; 80 len = 2; 81 lbound = 0x80; 82 } else if ((ch & 0xf0) == 0xe0) { 83 mask = 0x0f; 84 len = 3; 85 lbound = 0x800; 86 } else if ((ch & 0xf8) == 0xf0) { 87 mask = 0x07; 88 len = 4; 89 lbound = 0x10000; 90 } else if ((ch & 0xfc) == 0xf8) { 91 mask = 0x03; 92 len = 5; 93 lbound = 0x200000; 94 } else if ((ch & 0xfc) == 0xfc) { 95 mask = 0x01; 96 len = 6; 97 lbound = 0x4000000; 98 } else { 99 /* 100 * Malformed input; input is not UTF-8. 101 */ 102 if (result != NULL) 103 *result = string + 1; 104 return (_INVALID_RUNE); 105 } 106 107 if (n < len) { 108 /* 109 * Truncated or partial input. 110 */ 111 if (result != NULL) 112 *result = string; 113 return (_INVALID_RUNE); 114 } 115 116 /* 117 * Decode the octet sequence representing the character in chunks 118 * of 6 bits, most significant first. 119 */ 120 wch = (unsigned char)*string++ & mask; 121 while (--len != 0) { 122 if ((*string & 0xc0) != 0x80) { 123 /* 124 * Malformed input; bad characters in the middle 125 * of a character. 126 */ 127 wch = _INVALID_RUNE; 128 if (result != NULL) 129 *result = string + 1; 130 return (_INVALID_RUNE); 131 } 132 wch <<= 6; 133 wch |= *string++ & 0x3f; 134 } 135 if (wch != _INVALID_RUNE && wch < lbound) 136 /* 137 * Malformed input; redundant encoding. 138 */ 139 wch = _INVALID_RUNE; 140 if (result != NULL) 141 *result = string; 142 return (wch); 143 } 144 145 int 146 _UTF8_sputrune(rune_t c, char *string, size_t n, char **result) 147 { 148 unsigned char lead; 149 int i, len; 150 151 /* 152 * Determine the number of octets needed to represent this character. 153 * We always output the shortest sequence possible. Also specify the 154 * first few bits of the first octet, which contains the information 155 * about the sequence length. 156 */ 157 if ((c & ~0x7f) == 0) { 158 lead = 0; 159 len = 1; 160 } else if ((c & ~0x7ff) == 0) { 161 lead = 0xc0; 162 len = 2; 163 } else if ((c & ~0xffff) == 0) { 164 lead = 0xe0; 165 len = 3; 166 } else if ((c & ~0x1fffff) == 0) { 167 lead = 0xf0; 168 len = 4; 169 } else if ((c & ~0x3ffffff) == 0) { 170 lead = 0xf8; 171 len = 5; 172 } else if ((c & ~0x7fffffff) == 0) { 173 lead = 0xfc; 174 len = 6; 175 } else { 176 /* 177 * Wide character code is out of range. 178 */ 179 if (result != NULL) 180 *result = NULL; 181 return (0); 182 } 183 184 if (n < len) { 185 if (result != NULL) 186 *result = NULL; 187 } else { 188 /* 189 * Output the octets representing the character in chunks 190 * of 6 bits, least significant last. The first octet is 191 * a special case because it contains the sequence length 192 * information. 193 */ 194 for (i = len - 1; i > 0; i--) { 195 string[i] = (c & 0x3f) | 0x80; 196 c >>= 6; 197 } 198 *string = (c & 0xff) | lead; 199 if (result != NULL) 200 *result = string + len; 201 } 202 203 return (len); 204 } 205