1 /* $FreeBSD$ */ 2 /* $NetBSD: citrus_utf1632.c,v 1.9 2008/06/14 16:01:08 tnozaki Exp $ */ 3 4 /*- 5 * Copyright (c)2003 Citrus Project, 6 * All rights reserved. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27 * SUCH DAMAGE. 28 */ 29 30 #include <sys/cdefs.h> 31 #include <sys/endian.h> 32 #include <sys/types.h> 33 34 #include <assert.h> 35 #include <errno.h> 36 #include <limits.h> 37 #include <stddef.h> 38 #include <stdio.h> 39 #include <stdlib.h> 40 #include <string.h> 41 #include <wchar.h> 42 43 #include "citrus_namespace.h" 44 #include "citrus_types.h" 45 #include "citrus_module.h" 46 #include "citrus_stdenc.h" 47 #include "citrus_bcs.h" 48 49 #include "citrus_utf1632.h" 50 51 52 /* ---------------------------------------------------------------------- 53 * private stuffs used by templates 54 */ 55 56 typedef struct { 57 int chlen; 58 int current_endian; 59 uint8_t ch[4]; 60 } _UTF1632State; 61 62 #define _ENDIAN_UNKNOWN 0 63 #define _ENDIAN_BIG 1 64 #define _ENDIAN_LITTLE 2 65 #if BYTE_ORDER == BIG_ENDIAN 66 #define _ENDIAN_INTERNAL _ENDIAN_BIG 67 #define _ENDIAN_SWAPPED _ENDIAN_LITTLE 68 #else 69 #define _ENDIAN_INTERNAL _ENDIAN_LITTLE 70 #define _ENDIAN_SWAPPED _ENDIAN_BIG 71 #endif 72 #define _MODE_UTF32 0x00000001U 73 #define _MODE_FORCE_ENDIAN 0x00000002U 74 75 typedef struct { 76 int preffered_endian; 77 unsigned int cur_max; 78 uint32_t mode; 79 } _UTF1632EncodingInfo; 80 81 #define _FUNCNAME(m) _citrus_UTF1632_##m 82 #define _ENCODING_INFO _UTF1632EncodingInfo 83 #define _ENCODING_STATE _UTF1632State 84 #define _ENCODING_MB_CUR_MAX(_ei_) ((_ei_)->cur_max) 85 #define _ENCODING_IS_STATE_DEPENDENT 0 86 #define _STATE_NEEDS_EXPLICIT_INIT(_ps_) 0 87 88 89 static __inline void 90 /*ARGSUSED*/ 91 _citrus_UTF1632_init_state(_UTF1632EncodingInfo *ei __unused, 92 _UTF1632State *s) 93 { 94 95 memset(s, 0, sizeof(*s)); 96 } 97 98 static int 99 _citrus_UTF1632_mbrtowc_priv(_UTF1632EncodingInfo *ei, wchar_t *pwc, 100 const char **s, size_t n, _UTF1632State *psenc, size_t *nresult) 101 { 102 const char *s0; 103 size_t result; 104 wchar_t wc = L'\0'; 105 int chlenbak, endian, needlen; 106 107 s0 = *s; 108 109 if (s0 == NULL) { 110 _citrus_UTF1632_init_state(ei, psenc); 111 *nresult = 0; /* state independent */ 112 return (0); 113 } 114 115 result = 0; 116 chlenbak = psenc->chlen; 117 118 refetch: 119 needlen = ((ei->mode & _MODE_UTF32) != 0 || chlenbak >= 2) ? 4 : 2; 120 121 while (chlenbak < needlen) { 122 if (n == 0) 123 goto restart; 124 psenc->ch[chlenbak++] = *s0++; 125 n--; 126 result++; 127 } 128 129 /* judge endian marker */ 130 if ((ei->mode & _MODE_UTF32) == 0) { 131 /* UTF16 */ 132 if (psenc->ch[0] == 0xFE && psenc->ch[1] == 0xFF) { 133 psenc->current_endian = _ENDIAN_BIG; 134 chlenbak = 0; 135 goto refetch; 136 } else if (psenc->ch[0] == 0xFF && psenc->ch[1] == 0xFE) { 137 psenc->current_endian = _ENDIAN_LITTLE; 138 chlenbak = 0; 139 goto refetch; 140 } 141 } else { 142 /* UTF32 */ 143 if (psenc->ch[0] == 0x00 && psenc->ch[1] == 0x00 && 144 psenc->ch[2] == 0xFE && psenc->ch[3] == 0xFF) { 145 psenc->current_endian = _ENDIAN_BIG; 146 chlenbak = 0; 147 goto refetch; 148 } else if (psenc->ch[0] == 0xFF && psenc->ch[1] == 0xFE && 149 psenc->ch[2] == 0x00 && psenc->ch[3] == 0x00) { 150 psenc->current_endian = _ENDIAN_LITTLE; 151 chlenbak = 0; 152 goto refetch; 153 } 154 } 155 endian = ((ei->mode & _MODE_FORCE_ENDIAN) != 0 || 156 psenc->current_endian == _ENDIAN_UNKNOWN) ? ei->preffered_endian : 157 psenc->current_endian; 158 159 /* get wc */ 160 if ((ei->mode & _MODE_UTF32) == 0) { 161 /* UTF16 */ 162 if (needlen == 2) { 163 switch (endian) { 164 case _ENDIAN_LITTLE: 165 wc = (psenc->ch[0] | 166 ((wchar_t)psenc->ch[1] << 8)); 167 break; 168 case _ENDIAN_BIG: 169 wc = (psenc->ch[1] | 170 ((wchar_t)psenc->ch[0] << 8)); 171 break; 172 default: 173 goto ilseq; 174 } 175 if (wc >= 0xD800 && wc <= 0xDBFF) { 176 /* surrogate high */ 177 needlen = 4; 178 goto refetch; 179 } 180 } else { 181 /* surrogate low */ 182 wc -= 0xD800; /* wc : surrogate high (see above) */ 183 wc <<= 10; 184 switch (endian) { 185 case _ENDIAN_LITTLE: 186 if (psenc->ch[3] < 0xDC || psenc->ch[3] > 0xDF) 187 goto ilseq; 188 wc |= psenc->ch[2]; 189 wc |= (wchar_t)(psenc->ch[3] & 3) << 8; 190 break; 191 case _ENDIAN_BIG: 192 if (psenc->ch[2]<0xDC || psenc->ch[2]>0xDF) 193 goto ilseq; 194 wc |= psenc->ch[3]; 195 wc |= (wchar_t)(psenc->ch[2] & 3) << 8; 196 break; 197 default: 198 goto ilseq; 199 } 200 wc += 0x10000; 201 } 202 } else { 203 /* UTF32 */ 204 switch (endian) { 205 case _ENDIAN_LITTLE: 206 wc = (psenc->ch[0] | 207 ((wchar_t)psenc->ch[1] << 8) | 208 ((wchar_t)psenc->ch[2] << 16) | 209 ((wchar_t)psenc->ch[3] << 24)); 210 break; 211 case _ENDIAN_BIG: 212 wc = (psenc->ch[3] | 213 ((wchar_t)psenc->ch[2] << 8) | 214 ((wchar_t)psenc->ch[1] << 16) | 215 ((wchar_t)psenc->ch[0] << 24)); 216 break; 217 default: 218 goto ilseq; 219 } 220 if (wc >= 0xD800 && wc <= 0xDFFF) 221 goto ilseq; 222 } 223 224 225 *pwc = wc; 226 psenc->chlen = 0; 227 *nresult = result; 228 *s = s0; 229 230 return (0); 231 232 ilseq: 233 *nresult = (size_t)-1; 234 psenc->chlen = 0; 235 return (EILSEQ); 236 237 restart: 238 *nresult = (size_t)-2; 239 psenc->chlen = chlenbak; 240 *s = s0; 241 return (0); 242 } 243 244 static int 245 _citrus_UTF1632_wcrtomb_priv(_UTF1632EncodingInfo *ei, char *s, size_t n, 246 wchar_t wc, _UTF1632State *psenc, size_t *nresult) 247 { 248 wchar_t wc2; 249 static const char _bom[4] = { 250 0x00, 0x00, 0xFE, 0xFF, 251 }; 252 const char *bom = &_bom[0]; 253 size_t cnt; 254 255 cnt = (size_t)0; 256 if (psenc->current_endian == _ENDIAN_UNKNOWN) { 257 if ((ei->mode & _MODE_FORCE_ENDIAN) == 0) { 258 if (ei->mode & _MODE_UTF32) 259 cnt = 4; 260 else { 261 cnt = 2; 262 bom += 2; 263 } 264 if (n < cnt) 265 goto e2big; 266 memcpy(s, bom, cnt); 267 s += cnt, n -= cnt; 268 } 269 psenc->current_endian = ei->preffered_endian; 270 } 271 272 wc2 = 0; 273 if ((ei->mode & _MODE_UTF32)==0) { 274 /* UTF16 */ 275 if (wc > 0xFFFF) { 276 /* surrogate */ 277 if (wc > 0x10FFFF) 278 goto ilseq; 279 if (n < 4) 280 goto e2big; 281 cnt += 4; 282 wc -= 0x10000; 283 wc2 = (wc & 0x3FF) | 0xDC00; 284 wc = (wc>>10) | 0xD800; 285 } else { 286 if (n < 2) 287 goto e2big; 288 cnt += 2; 289 } 290 291 surrogate: 292 switch (psenc->current_endian) { 293 case _ENDIAN_BIG: 294 s[1] = wc; 295 s[0] = (wc >>= 8); 296 break; 297 case _ENDIAN_LITTLE: 298 s[0] = wc; 299 s[1] = (wc >>= 8); 300 break; 301 } 302 if (wc2 != 0) { 303 wc = wc2; 304 wc2 = 0; 305 s += 2; 306 goto surrogate; 307 } 308 } else { 309 /* UTF32 */ 310 if (wc >= 0xD800 && wc <= 0xDFFF) 311 goto ilseq; 312 if (n < 4) 313 goto e2big; 314 cnt += 4; 315 switch (psenc->current_endian) { 316 case _ENDIAN_BIG: 317 s[3] = wc; 318 s[2] = (wc >>= 8); 319 s[1] = (wc >>= 8); 320 s[0] = (wc >>= 8); 321 break; 322 case _ENDIAN_LITTLE: 323 s[0] = wc; 324 s[1] = (wc >>= 8); 325 s[2] = (wc >>= 8); 326 s[3] = (wc >>= 8); 327 break; 328 } 329 } 330 *nresult = cnt; 331 332 return (0); 333 334 ilseq: 335 *nresult = (size_t)-1; 336 return (EILSEQ); 337 e2big: 338 *nresult = (size_t)-1; 339 return (E2BIG); 340 } 341 342 static void 343 parse_variable(_UTF1632EncodingInfo * __restrict ei, 344 const void * __restrict var, size_t lenvar) 345 { 346 const char *p; 347 348 p = var; 349 while (lenvar > 0) { 350 switch (*p) { 351 case 'B': 352 case 'b': 353 MATCH(big, ei->preffered_endian = _ENDIAN_BIG); 354 break; 355 case 'L': 356 case 'l': 357 MATCH(little, ei->preffered_endian = _ENDIAN_LITTLE); 358 break; 359 case 'i': 360 case 'I': 361 MATCH(internal, ei->preffered_endian = _ENDIAN_INTERNAL); 362 break; 363 case 's': 364 case 'S': 365 MATCH(swapped, ei->preffered_endian = _ENDIAN_SWAPPED); 366 break; 367 case 'F': 368 case 'f': 369 MATCH(force, ei->mode |= _MODE_FORCE_ENDIAN); 370 break; 371 case 'U': 372 case 'u': 373 MATCH(utf32, ei->mode |= _MODE_UTF32); 374 break; 375 } 376 p++; 377 lenvar--; 378 } 379 } 380 381 static int 382 /*ARGSUSED*/ 383 _citrus_UTF1632_encoding_module_init(_UTF1632EncodingInfo * __restrict ei, 384 const void * __restrict var, size_t lenvar) 385 { 386 387 memset((void *)ei, 0, sizeof(*ei)); 388 389 parse_variable(ei, var, lenvar); 390 391 ei->cur_max = ((ei->mode&_MODE_UTF32) == 0) ? 6 : 8; 392 /* 6: endian + surrogate */ 393 /* 8: endian + normal */ 394 395 if (ei->preffered_endian == _ENDIAN_UNKNOWN) { 396 ei->preffered_endian = _ENDIAN_BIG; 397 } 398 399 return (0); 400 } 401 402 static void 403 /*ARGSUSED*/ 404 _citrus_UTF1632_encoding_module_uninit(_UTF1632EncodingInfo *ei __unused) 405 { 406 407 } 408 409 static __inline int 410 /*ARGSUSED*/ 411 _citrus_UTF1632_stdenc_wctocs(_UTF1632EncodingInfo * __restrict ei __unused, 412 _csid_t * __restrict csid, _index_t * __restrict idx, _wc_t wc) 413 { 414 415 *csid = 0; 416 *idx = (_index_t)wc; 417 418 return (0); 419 } 420 421 static __inline int 422 /*ARGSUSED*/ 423 _citrus_UTF1632_stdenc_cstowc(_UTF1632EncodingInfo * __restrict ei __unused, 424 _wc_t * __restrict wc, _csid_t csid, _index_t idx) 425 { 426 427 if (csid != 0) 428 return (EILSEQ); 429 430 *wc = (_wc_t)idx; 431 432 return (0); 433 } 434 435 static __inline int 436 /*ARGSUSED*/ 437 _citrus_UTF1632_stdenc_get_state_desc_generic(_UTF1632EncodingInfo * __restrict ei __unused, 438 _UTF1632State * __restrict psenc, int * __restrict rstate) 439 { 440 441 *rstate = (psenc->chlen == 0) ? _STDENC_SDGEN_INITIAL : 442 _STDENC_SDGEN_INCOMPLETE_CHAR; 443 return (0); 444 } 445 446 /* ---------------------------------------------------------------------- 447 * public interface for stdenc 448 */ 449 450 _CITRUS_STDENC_DECLS(UTF1632); 451 _CITRUS_STDENC_DEF_OPS(UTF1632); 452 453 #include "citrus_stdenc_template.h" 454