1 /* $FreeBSD$ */ 2 /* $NetBSD: citrus_utf1632.c,v 1.9 2008/06/14 16:01:08 tnozaki Exp $ */ 3 4 /*- 5 * SPDX-License-Identifier: BSD-2-Clause 6 * 7 * Copyright (c)2003 Citrus Project, 8 * All rights reserved. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 22 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 29 * SUCH DAMAGE. 30 */ 31 32 #include <sys/cdefs.h> 33 #include <sys/endian.h> 34 #include <sys/types.h> 35 36 #include <assert.h> 37 #include <errno.h> 38 #include <limits.h> 39 #include <stddef.h> 40 #include <stdio.h> 41 #include <stdlib.h> 42 #include <string.h> 43 #include <wchar.h> 44 45 #include "citrus_namespace.h" 46 #include "citrus_types.h" 47 #include "citrus_module.h" 48 #include "citrus_stdenc.h" 49 #include "citrus_bcs.h" 50 51 #include "citrus_utf1632.h" 52 53 54 /* ---------------------------------------------------------------------- 55 * private stuffs used by templates 56 */ 57 58 typedef struct { 59 int chlen; 60 int current_endian; 61 uint8_t ch[4]; 62 } _UTF1632State; 63 64 #define _ENDIAN_UNKNOWN 0 65 #define _ENDIAN_BIG 1 66 #define _ENDIAN_LITTLE 2 67 #if BYTE_ORDER == BIG_ENDIAN 68 #define _ENDIAN_INTERNAL _ENDIAN_BIG 69 #define _ENDIAN_SWAPPED _ENDIAN_LITTLE 70 #else 71 #define _ENDIAN_INTERNAL _ENDIAN_LITTLE 72 #define _ENDIAN_SWAPPED _ENDIAN_BIG 73 #endif 74 #define _MODE_UTF32 0x00000001U 75 #define _MODE_FORCE_ENDIAN 0x00000002U 76 77 typedef struct { 78 int preffered_endian; 79 unsigned int cur_max; 80 unsigned int cur_min; 81 uint32_t mode; 82 } _UTF1632EncodingInfo; 83 84 #define _FUNCNAME(m) _citrus_UTF1632_##m 85 #define _ENCODING_INFO _UTF1632EncodingInfo 86 #define _ENCODING_STATE _UTF1632State 87 #define _ENCODING_MB_CUR_MAX(_ei_) ((_ei_)->cur_max) 88 #define _ENCODING_MB_CUR_MIN(_ei_) ((_ei_)->cur_min) 89 #define _ENCODING_IS_STATE_DEPENDENT 0 90 #define _STATE_NEEDS_EXPLICIT_INIT(_ps_) 0 91 92 93 static __inline void 94 /*ARGSUSED*/ 95 _citrus_UTF1632_init_state(_UTF1632EncodingInfo *ei __unused, 96 _UTF1632State *s) 97 { 98 99 memset(s, 0, sizeof(*s)); 100 } 101 102 static int 103 _citrus_UTF1632_mbrtowc_priv(_UTF1632EncodingInfo *ei, wchar_t *pwc, 104 char **s, size_t n, _UTF1632State *psenc, size_t *nresult) 105 { 106 char *s0; 107 size_t result; 108 wchar_t wc = L'\0'; 109 int chlenbak, endian, needlen; 110 111 s0 = *s; 112 113 if (s0 == NULL) { 114 _citrus_UTF1632_init_state(ei, psenc); 115 *nresult = 0; /* state independent */ 116 return (0); 117 } 118 119 result = 0; 120 chlenbak = psenc->chlen; 121 122 refetch: 123 needlen = ((ei->mode & _MODE_UTF32) != 0 || chlenbak >= 2) ? 4 : 2; 124 125 while (chlenbak < needlen) { 126 if (n == 0) 127 goto restart; 128 psenc->ch[chlenbak++] = *s0++; 129 n--; 130 result++; 131 } 132 133 /* judge endian marker */ 134 if ((ei->mode & _MODE_UTF32) == 0) { 135 /* UTF16 */ 136 if (psenc->ch[0] == 0xFE && psenc->ch[1] == 0xFF) { 137 psenc->current_endian = _ENDIAN_BIG; 138 chlenbak = 0; 139 goto refetch; 140 } else if (psenc->ch[0] == 0xFF && psenc->ch[1] == 0xFE) { 141 psenc->current_endian = _ENDIAN_LITTLE; 142 chlenbak = 0; 143 goto refetch; 144 } 145 } else { 146 /* UTF32 */ 147 if (psenc->ch[0] == 0x00 && psenc->ch[1] == 0x00 && 148 psenc->ch[2] == 0xFE && psenc->ch[3] == 0xFF) { 149 psenc->current_endian = _ENDIAN_BIG; 150 chlenbak = 0; 151 goto refetch; 152 } else if (psenc->ch[0] == 0xFF && psenc->ch[1] == 0xFE && 153 psenc->ch[2] == 0x00 && psenc->ch[3] == 0x00) { 154 psenc->current_endian = _ENDIAN_LITTLE; 155 chlenbak = 0; 156 goto refetch; 157 } 158 } 159 endian = ((ei->mode & _MODE_FORCE_ENDIAN) != 0 || 160 psenc->current_endian == _ENDIAN_UNKNOWN) ? ei->preffered_endian : 161 psenc->current_endian; 162 163 /* get wc */ 164 if ((ei->mode & _MODE_UTF32) == 0) { 165 /* UTF16 */ 166 if (needlen == 2) { 167 switch (endian) { 168 case _ENDIAN_LITTLE: 169 wc = (psenc->ch[0] | 170 ((wchar_t)psenc->ch[1] << 8)); 171 break; 172 case _ENDIAN_BIG: 173 wc = (psenc->ch[1] | 174 ((wchar_t)psenc->ch[0] << 8)); 175 break; 176 default: 177 goto ilseq; 178 } 179 if (wc >= 0xD800 && wc <= 0xDBFF) { 180 /* surrogate high */ 181 needlen = 4; 182 goto refetch; 183 } 184 } else { 185 /* surrogate low */ 186 wc -= 0xD800; /* wc : surrogate high (see above) */ 187 wc <<= 10; 188 switch (endian) { 189 case _ENDIAN_LITTLE: 190 if (psenc->ch[3] < 0xDC || psenc->ch[3] > 0xDF) 191 goto ilseq; 192 wc |= psenc->ch[2]; 193 wc |= (wchar_t)(psenc->ch[3] & 3) << 8; 194 break; 195 case _ENDIAN_BIG: 196 if (psenc->ch[2]<0xDC || psenc->ch[2]>0xDF) 197 goto ilseq; 198 wc |= psenc->ch[3]; 199 wc |= (wchar_t)(psenc->ch[2] & 3) << 8; 200 break; 201 default: 202 goto ilseq; 203 } 204 wc += 0x10000; 205 } 206 } else { 207 /* UTF32 */ 208 switch (endian) { 209 case _ENDIAN_LITTLE: 210 wc = (psenc->ch[0] | 211 ((wchar_t)psenc->ch[1] << 8) | 212 ((wchar_t)psenc->ch[2] << 16) | 213 ((wchar_t)psenc->ch[3] << 24)); 214 break; 215 case _ENDIAN_BIG: 216 wc = (psenc->ch[3] | 217 ((wchar_t)psenc->ch[2] << 8) | 218 ((wchar_t)psenc->ch[1] << 16) | 219 ((wchar_t)psenc->ch[0] << 24)); 220 break; 221 default: 222 goto ilseq; 223 } 224 if (wc >= 0xD800 && wc <= 0xDFFF) 225 goto ilseq; 226 } 227 228 229 *pwc = wc; 230 psenc->chlen = 0; 231 *nresult = result; 232 *s = s0; 233 234 return (0); 235 236 ilseq: 237 *nresult = (size_t)-1; 238 psenc->chlen = 0; 239 return (EILSEQ); 240 241 restart: 242 *nresult = (size_t)-2; 243 psenc->chlen = chlenbak; 244 *s = s0; 245 return (0); 246 } 247 248 static int 249 _citrus_UTF1632_wcrtomb_priv(_UTF1632EncodingInfo *ei, char *s, size_t n, 250 wchar_t wc, _UTF1632State *psenc, size_t *nresult) 251 { 252 wchar_t wc2; 253 static const char _bom[4] = { 254 0x00, 0x00, 0xFE, 0xFF, 255 }; 256 const char *bom = &_bom[0]; 257 size_t cnt; 258 259 cnt = (size_t)0; 260 if (psenc->current_endian == _ENDIAN_UNKNOWN) { 261 if ((ei->mode & _MODE_FORCE_ENDIAN) == 0) { 262 if (ei->mode & _MODE_UTF32) 263 cnt = 4; 264 else { 265 cnt = 2; 266 bom += 2; 267 } 268 if (n < cnt) 269 goto e2big; 270 memcpy(s, bom, cnt); 271 s += cnt, n -= cnt; 272 } 273 psenc->current_endian = ei->preffered_endian; 274 } 275 276 wc2 = 0; 277 if ((ei->mode & _MODE_UTF32)==0) { 278 /* UTF16 */ 279 if (wc > 0xFFFF) { 280 /* surrogate */ 281 if (wc > 0x10FFFF) 282 goto ilseq; 283 if (n < 4) 284 goto e2big; 285 cnt += 4; 286 wc -= 0x10000; 287 wc2 = (wc & 0x3FF) | 0xDC00; 288 wc = (wc>>10) | 0xD800; 289 } else { 290 if (n < 2) 291 goto e2big; 292 cnt += 2; 293 } 294 295 surrogate: 296 switch (psenc->current_endian) { 297 case _ENDIAN_BIG: 298 s[1] = wc; 299 s[0] = (wc >>= 8); 300 break; 301 case _ENDIAN_LITTLE: 302 s[0] = wc; 303 s[1] = (wc >>= 8); 304 break; 305 } 306 if (wc2 != 0) { 307 wc = wc2; 308 wc2 = 0; 309 s += 2; 310 goto surrogate; 311 } 312 } else { 313 /* UTF32 */ 314 if (wc >= 0xD800 && wc <= 0xDFFF) 315 goto ilseq; 316 if (n < 4) 317 goto e2big; 318 cnt += 4; 319 switch (psenc->current_endian) { 320 case _ENDIAN_BIG: 321 s[3] = wc; 322 s[2] = (wc >>= 8); 323 s[1] = (wc >>= 8); 324 s[0] = (wc >>= 8); 325 break; 326 case _ENDIAN_LITTLE: 327 s[0] = wc; 328 s[1] = (wc >>= 8); 329 s[2] = (wc >>= 8); 330 s[3] = (wc >>= 8); 331 break; 332 } 333 } 334 *nresult = cnt; 335 336 return (0); 337 338 ilseq: 339 *nresult = (size_t)-1; 340 return (EILSEQ); 341 e2big: 342 *nresult = (size_t)-1; 343 return (E2BIG); 344 } 345 346 static void 347 parse_variable(_UTF1632EncodingInfo * __restrict ei, 348 const void * __restrict var, size_t lenvar) 349 { 350 const char *p; 351 352 p = var; 353 while (lenvar > 0) { 354 switch (*p) { 355 case 'B': 356 case 'b': 357 MATCH(big, ei->preffered_endian = _ENDIAN_BIG); 358 break; 359 case 'L': 360 case 'l': 361 MATCH(little, ei->preffered_endian = _ENDIAN_LITTLE); 362 break; 363 case 'i': 364 case 'I': 365 MATCH(internal, ei->preffered_endian = _ENDIAN_INTERNAL); 366 break; 367 case 's': 368 case 'S': 369 MATCH(swapped, ei->preffered_endian = _ENDIAN_SWAPPED); 370 break; 371 case 'F': 372 case 'f': 373 MATCH(force, ei->mode |= _MODE_FORCE_ENDIAN); 374 break; 375 case 'U': 376 case 'u': 377 MATCH(utf32, ei->mode |= _MODE_UTF32); 378 break; 379 } 380 p++; 381 lenvar--; 382 } 383 } 384 385 static int 386 /*ARGSUSED*/ 387 _citrus_UTF1632_encoding_module_init(_UTF1632EncodingInfo * __restrict ei, 388 const void * __restrict var, size_t lenvar) 389 { 390 391 memset((void *)ei, 0, sizeof(*ei)); 392 393 parse_variable(ei, var, lenvar); 394 395 ei->cur_min = ((ei->mode&_MODE_UTF32) == 0) ? 2 : 4; 396 ei->cur_max = ((ei->mode&_MODE_UTF32) == 0) ? 6 : 8; 397 /* 6: endian + surrogate */ 398 /* 8: endian + normal */ 399 400 if (ei->preffered_endian == _ENDIAN_UNKNOWN) { 401 ei->preffered_endian = _ENDIAN_BIG; 402 } 403 404 return (0); 405 } 406 407 static void 408 /*ARGSUSED*/ 409 _citrus_UTF1632_encoding_module_uninit(_UTF1632EncodingInfo *ei __unused) 410 { 411 412 } 413 414 static __inline int 415 /*ARGSUSED*/ 416 _citrus_UTF1632_stdenc_wctocs(_UTF1632EncodingInfo * __restrict ei __unused, 417 _csid_t * __restrict csid, _index_t * __restrict idx, _wc_t wc) 418 { 419 420 *csid = 0; 421 *idx = (_index_t)wc; 422 423 return (0); 424 } 425 426 static __inline int 427 /*ARGSUSED*/ 428 _citrus_UTF1632_stdenc_cstowc(_UTF1632EncodingInfo * __restrict ei __unused, 429 _wc_t * __restrict wc, _csid_t csid, _index_t idx) 430 { 431 432 if (csid != 0) 433 return (EILSEQ); 434 435 *wc = (_wc_t)idx; 436 437 return (0); 438 } 439 440 static __inline int 441 /*ARGSUSED*/ 442 _citrus_UTF1632_stdenc_get_state_desc_generic(_UTF1632EncodingInfo * __restrict ei __unused, 443 _UTF1632State * __restrict psenc, int * __restrict rstate) 444 { 445 446 *rstate = (psenc->chlen == 0) ? _STDENC_SDGEN_INITIAL : 447 _STDENC_SDGEN_INCOMPLETE_CHAR; 448 return (0); 449 } 450 451 /* ---------------------------------------------------------------------- 452 * public interface for stdenc 453 */ 454 455 _CITRUS_STDENC_DECLS(UTF1632); 456 _CITRUS_STDENC_DEF_OPS(UTF1632); 457 458 #include "citrus_stdenc_template.h" 459