1 /* $FreeBSD$ */ 2 /* $NetBSD: citrus_utf1632.c,v 1.9 2008/06/14 16:01:08 tnozaki Exp $ */ 3 4 /*- 5 * SPDX-License-Identifier: BSD-2-Clause 6 * 7 * Copyright (c)2003 Citrus Project, 8 * All rights reserved. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 22 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 29 * SUCH DAMAGE. 30 */ 31 32 #include <sys/cdefs.h> 33 #include <sys/endian.h> 34 #include <sys/types.h> 35 36 #include <assert.h> 37 #include <errno.h> 38 #include <limits.h> 39 #include <stddef.h> 40 #include <stdio.h> 41 #include <stdlib.h> 42 #include <string.h> 43 #include <wchar.h> 44 45 #include "citrus_namespace.h" 46 #include "citrus_types.h" 47 #include "citrus_module.h" 48 #include "citrus_stdenc.h" 49 #include "citrus_bcs.h" 50 51 #include "citrus_utf1632.h" 52 53 54 /* ---------------------------------------------------------------------- 55 * private stuffs used by templates 56 */ 57 58 typedef struct { 59 int chlen; 60 int current_endian; 61 uint8_t ch[4]; 62 } _UTF1632State; 63 64 #define _ENDIAN_UNKNOWN 0 65 #define _ENDIAN_BIG 1 66 #define _ENDIAN_LITTLE 2 67 #if BYTE_ORDER == BIG_ENDIAN 68 #define _ENDIAN_INTERNAL _ENDIAN_BIG 69 #define _ENDIAN_SWAPPED _ENDIAN_LITTLE 70 #else 71 #define _ENDIAN_INTERNAL _ENDIAN_LITTLE 72 #define _ENDIAN_SWAPPED _ENDIAN_BIG 73 #endif 74 #define _MODE_UTF32 0x00000001U 75 #define _MODE_FORCE_ENDIAN 0x00000002U 76 77 typedef struct { 78 int preffered_endian; 79 unsigned int cur_max; 80 uint32_t mode; 81 } _UTF1632EncodingInfo; 82 83 #define _FUNCNAME(m) _citrus_UTF1632_##m 84 #define _ENCODING_INFO _UTF1632EncodingInfo 85 #define _ENCODING_STATE _UTF1632State 86 #define _ENCODING_MB_CUR_MAX(_ei_) ((_ei_)->cur_max) 87 #define _ENCODING_IS_STATE_DEPENDENT 0 88 #define _STATE_NEEDS_EXPLICIT_INIT(_ps_) 0 89 90 91 static __inline void 92 /*ARGSUSED*/ 93 _citrus_UTF1632_init_state(_UTF1632EncodingInfo *ei __unused, 94 _UTF1632State *s) 95 { 96 97 memset(s, 0, sizeof(*s)); 98 } 99 100 static int 101 _citrus_UTF1632_mbrtowc_priv(_UTF1632EncodingInfo *ei, wchar_t *pwc, 102 char **s, size_t n, _UTF1632State *psenc, size_t *nresult) 103 { 104 char *s0; 105 size_t result; 106 wchar_t wc = L'\0'; 107 int chlenbak, endian, needlen; 108 109 s0 = *s; 110 111 if (s0 == NULL) { 112 _citrus_UTF1632_init_state(ei, psenc); 113 *nresult = 0; /* state independent */ 114 return (0); 115 } 116 117 result = 0; 118 chlenbak = psenc->chlen; 119 120 refetch: 121 needlen = ((ei->mode & _MODE_UTF32) != 0 || chlenbak >= 2) ? 4 : 2; 122 123 while (chlenbak < needlen) { 124 if (n == 0) 125 goto restart; 126 psenc->ch[chlenbak++] = *s0++; 127 n--; 128 result++; 129 } 130 131 /* judge endian marker */ 132 if ((ei->mode & _MODE_UTF32) == 0) { 133 /* UTF16 */ 134 if (psenc->ch[0] == 0xFE && psenc->ch[1] == 0xFF) { 135 psenc->current_endian = _ENDIAN_BIG; 136 chlenbak = 0; 137 goto refetch; 138 } else if (psenc->ch[0] == 0xFF && psenc->ch[1] == 0xFE) { 139 psenc->current_endian = _ENDIAN_LITTLE; 140 chlenbak = 0; 141 goto refetch; 142 } 143 } else { 144 /* UTF32 */ 145 if (psenc->ch[0] == 0x00 && psenc->ch[1] == 0x00 && 146 psenc->ch[2] == 0xFE && psenc->ch[3] == 0xFF) { 147 psenc->current_endian = _ENDIAN_BIG; 148 chlenbak = 0; 149 goto refetch; 150 } else if (psenc->ch[0] == 0xFF && psenc->ch[1] == 0xFE && 151 psenc->ch[2] == 0x00 && psenc->ch[3] == 0x00) { 152 psenc->current_endian = _ENDIAN_LITTLE; 153 chlenbak = 0; 154 goto refetch; 155 } 156 } 157 endian = ((ei->mode & _MODE_FORCE_ENDIAN) != 0 || 158 psenc->current_endian == _ENDIAN_UNKNOWN) ? ei->preffered_endian : 159 psenc->current_endian; 160 161 /* get wc */ 162 if ((ei->mode & _MODE_UTF32) == 0) { 163 /* UTF16 */ 164 if (needlen == 2) { 165 switch (endian) { 166 case _ENDIAN_LITTLE: 167 wc = (psenc->ch[0] | 168 ((wchar_t)psenc->ch[1] << 8)); 169 break; 170 case _ENDIAN_BIG: 171 wc = (psenc->ch[1] | 172 ((wchar_t)psenc->ch[0] << 8)); 173 break; 174 default: 175 goto ilseq; 176 } 177 if (wc >= 0xD800 && wc <= 0xDBFF) { 178 /* surrogate high */ 179 needlen = 4; 180 goto refetch; 181 } 182 } else { 183 /* surrogate low */ 184 wc -= 0xD800; /* wc : surrogate high (see above) */ 185 wc <<= 10; 186 switch (endian) { 187 case _ENDIAN_LITTLE: 188 if (psenc->ch[3] < 0xDC || psenc->ch[3] > 0xDF) 189 goto ilseq; 190 wc |= psenc->ch[2]; 191 wc |= (wchar_t)(psenc->ch[3] & 3) << 8; 192 break; 193 case _ENDIAN_BIG: 194 if (psenc->ch[2]<0xDC || psenc->ch[2]>0xDF) 195 goto ilseq; 196 wc |= psenc->ch[3]; 197 wc |= (wchar_t)(psenc->ch[2] & 3) << 8; 198 break; 199 default: 200 goto ilseq; 201 } 202 wc += 0x10000; 203 } 204 } else { 205 /* UTF32 */ 206 switch (endian) { 207 case _ENDIAN_LITTLE: 208 wc = (psenc->ch[0] | 209 ((wchar_t)psenc->ch[1] << 8) | 210 ((wchar_t)psenc->ch[2] << 16) | 211 ((wchar_t)psenc->ch[3] << 24)); 212 break; 213 case _ENDIAN_BIG: 214 wc = (psenc->ch[3] | 215 ((wchar_t)psenc->ch[2] << 8) | 216 ((wchar_t)psenc->ch[1] << 16) | 217 ((wchar_t)psenc->ch[0] << 24)); 218 break; 219 default: 220 goto ilseq; 221 } 222 if (wc >= 0xD800 && wc <= 0xDFFF) 223 goto ilseq; 224 } 225 226 227 *pwc = wc; 228 psenc->chlen = 0; 229 *nresult = result; 230 *s = s0; 231 232 return (0); 233 234 ilseq: 235 *nresult = (size_t)-1; 236 psenc->chlen = 0; 237 return (EILSEQ); 238 239 restart: 240 *nresult = (size_t)-2; 241 psenc->chlen = chlenbak; 242 *s = s0; 243 return (0); 244 } 245 246 static int 247 _citrus_UTF1632_wcrtomb_priv(_UTF1632EncodingInfo *ei, char *s, size_t n, 248 wchar_t wc, _UTF1632State *psenc, size_t *nresult) 249 { 250 wchar_t wc2; 251 static const char _bom[4] = { 252 0x00, 0x00, 0xFE, 0xFF, 253 }; 254 const char *bom = &_bom[0]; 255 size_t cnt; 256 257 cnt = (size_t)0; 258 if (psenc->current_endian == _ENDIAN_UNKNOWN) { 259 if ((ei->mode & _MODE_FORCE_ENDIAN) == 0) { 260 if (ei->mode & _MODE_UTF32) 261 cnt = 4; 262 else { 263 cnt = 2; 264 bom += 2; 265 } 266 if (n < cnt) 267 goto e2big; 268 memcpy(s, bom, cnt); 269 s += cnt, n -= cnt; 270 } 271 psenc->current_endian = ei->preffered_endian; 272 } 273 274 wc2 = 0; 275 if ((ei->mode & _MODE_UTF32)==0) { 276 /* UTF16 */ 277 if (wc > 0xFFFF) { 278 /* surrogate */ 279 if (wc > 0x10FFFF) 280 goto ilseq; 281 if (n < 4) 282 goto e2big; 283 cnt += 4; 284 wc -= 0x10000; 285 wc2 = (wc & 0x3FF) | 0xDC00; 286 wc = (wc>>10) | 0xD800; 287 } else { 288 if (n < 2) 289 goto e2big; 290 cnt += 2; 291 } 292 293 surrogate: 294 switch (psenc->current_endian) { 295 case _ENDIAN_BIG: 296 s[1] = wc; 297 s[0] = (wc >>= 8); 298 break; 299 case _ENDIAN_LITTLE: 300 s[0] = wc; 301 s[1] = (wc >>= 8); 302 break; 303 } 304 if (wc2 != 0) { 305 wc = wc2; 306 wc2 = 0; 307 s += 2; 308 goto surrogate; 309 } 310 } else { 311 /* UTF32 */ 312 if (wc >= 0xD800 && wc <= 0xDFFF) 313 goto ilseq; 314 if (n < 4) 315 goto e2big; 316 cnt += 4; 317 switch (psenc->current_endian) { 318 case _ENDIAN_BIG: 319 s[3] = wc; 320 s[2] = (wc >>= 8); 321 s[1] = (wc >>= 8); 322 s[0] = (wc >>= 8); 323 break; 324 case _ENDIAN_LITTLE: 325 s[0] = wc; 326 s[1] = (wc >>= 8); 327 s[2] = (wc >>= 8); 328 s[3] = (wc >>= 8); 329 break; 330 } 331 } 332 *nresult = cnt; 333 334 return (0); 335 336 ilseq: 337 *nresult = (size_t)-1; 338 return (EILSEQ); 339 e2big: 340 *nresult = (size_t)-1; 341 return (E2BIG); 342 } 343 344 static void 345 parse_variable(_UTF1632EncodingInfo * __restrict ei, 346 const void * __restrict var, size_t lenvar) 347 { 348 const char *p; 349 350 p = var; 351 while (lenvar > 0) { 352 switch (*p) { 353 case 'B': 354 case 'b': 355 MATCH(big, ei->preffered_endian = _ENDIAN_BIG); 356 break; 357 case 'L': 358 case 'l': 359 MATCH(little, ei->preffered_endian = _ENDIAN_LITTLE); 360 break; 361 case 'i': 362 case 'I': 363 MATCH(internal, ei->preffered_endian = _ENDIAN_INTERNAL); 364 break; 365 case 's': 366 case 'S': 367 MATCH(swapped, ei->preffered_endian = _ENDIAN_SWAPPED); 368 break; 369 case 'F': 370 case 'f': 371 MATCH(force, ei->mode |= _MODE_FORCE_ENDIAN); 372 break; 373 case 'U': 374 case 'u': 375 MATCH(utf32, ei->mode |= _MODE_UTF32); 376 break; 377 } 378 p++; 379 lenvar--; 380 } 381 } 382 383 static int 384 /*ARGSUSED*/ 385 _citrus_UTF1632_encoding_module_init(_UTF1632EncodingInfo * __restrict ei, 386 const void * __restrict var, size_t lenvar) 387 { 388 389 memset((void *)ei, 0, sizeof(*ei)); 390 391 parse_variable(ei, var, lenvar); 392 393 ei->cur_max = ((ei->mode&_MODE_UTF32) == 0) ? 6 : 8; 394 /* 6: endian + surrogate */ 395 /* 8: endian + normal */ 396 397 if (ei->preffered_endian == _ENDIAN_UNKNOWN) { 398 ei->preffered_endian = _ENDIAN_BIG; 399 } 400 401 return (0); 402 } 403 404 static void 405 /*ARGSUSED*/ 406 _citrus_UTF1632_encoding_module_uninit(_UTF1632EncodingInfo *ei __unused) 407 { 408 409 } 410 411 static __inline int 412 /*ARGSUSED*/ 413 _citrus_UTF1632_stdenc_wctocs(_UTF1632EncodingInfo * __restrict ei __unused, 414 _csid_t * __restrict csid, _index_t * __restrict idx, _wc_t wc) 415 { 416 417 *csid = 0; 418 *idx = (_index_t)wc; 419 420 return (0); 421 } 422 423 static __inline int 424 /*ARGSUSED*/ 425 _citrus_UTF1632_stdenc_cstowc(_UTF1632EncodingInfo * __restrict ei __unused, 426 _wc_t * __restrict wc, _csid_t csid, _index_t idx) 427 { 428 429 if (csid != 0) 430 return (EILSEQ); 431 432 *wc = (_wc_t)idx; 433 434 return (0); 435 } 436 437 static __inline int 438 /*ARGSUSED*/ 439 _citrus_UTF1632_stdenc_get_state_desc_generic(_UTF1632EncodingInfo * __restrict ei __unused, 440 _UTF1632State * __restrict psenc, int * __restrict rstate) 441 { 442 443 *rstate = (psenc->chlen == 0) ? _STDENC_SDGEN_INITIAL : 444 _STDENC_SDGEN_INCOMPLETE_CHAR; 445 return (0); 446 } 447 448 /* ---------------------------------------------------------------------- 449 * public interface for stdenc 450 */ 451 452 _CITRUS_STDENC_DECLS(UTF1632); 453 _CITRUS_STDENC_DEF_OPS(UTF1632); 454 455 #include "citrus_stdenc_template.h" 456