1 /* $NetBSD: citrus_utf1632.c,v 1.9 2008/06/14 16:01:08 tnozaki Exp $ */ 2 3 /*- 4 * SPDX-License-Identifier: BSD-2-Clause 5 * 6 * Copyright (c)2003 Citrus Project, 7 * All rights reserved. 8 * 9 * Redistribution and use in source and binary forms, with or without 10 * modification, are permitted provided that the following conditions 11 * are met: 12 * 1. Redistributions of source code must retain the above copyright 13 * notice, this list of conditions and the following disclaimer. 14 * 2. Redistributions in binary form must reproduce the above copyright 15 * notice, this list of conditions and the following disclaimer in the 16 * documentation and/or other materials provided with the distribution. 17 * 18 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 19 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 20 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 21 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 22 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 24 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 25 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 26 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 27 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 28 * SUCH DAMAGE. 29 */ 30 31 #include <sys/cdefs.h> 32 #include <sys/endian.h> 33 #include <sys/types.h> 34 35 #include <assert.h> 36 #include <errno.h> 37 #include <limits.h> 38 #include <stddef.h> 39 #include <stdio.h> 40 #include <stdlib.h> 41 #include <string.h> 42 #include <wchar.h> 43 44 #include "citrus_namespace.h" 45 #include "citrus_types.h" 46 #include "citrus_module.h" 47 #include "citrus_stdenc.h" 48 #include "citrus_bcs.h" 49 50 #include "citrus_utf1632.h" 51 52 53 /* ---------------------------------------------------------------------- 54 * private stuffs used by templates 55 */ 56 57 typedef struct { 58 int chlen; 59 int current_endian; 60 uint8_t ch[4]; 61 } _UTF1632State; 62 63 #define _ENDIAN_UNKNOWN 0 64 #define _ENDIAN_BIG 1 65 #define _ENDIAN_LITTLE 2 66 #if BYTE_ORDER == BIG_ENDIAN 67 #define _ENDIAN_INTERNAL _ENDIAN_BIG 68 #define _ENDIAN_SWAPPED _ENDIAN_LITTLE 69 #else 70 #define _ENDIAN_INTERNAL _ENDIAN_LITTLE 71 #define _ENDIAN_SWAPPED _ENDIAN_BIG 72 #endif 73 #define _MODE_UTF32 0x00000001U 74 #define _MODE_FORCE_ENDIAN 0x00000002U 75 76 typedef struct { 77 int preffered_endian; 78 unsigned int cur_max; 79 unsigned int cur_min; 80 uint32_t mode; 81 } _UTF1632EncodingInfo; 82 83 #define _FUNCNAME(m) _citrus_UTF1632_##m 84 #define _ENCODING_INFO _UTF1632EncodingInfo 85 #define _ENCODING_STATE _UTF1632State 86 #define _ENCODING_MB_CUR_MAX(_ei_) ((_ei_)->cur_max) 87 #define _ENCODING_MB_CUR_MIN(_ei_) ((_ei_)->cur_min) 88 #define _ENCODING_IS_STATE_DEPENDENT 0 89 #define _STATE_NEEDS_EXPLICIT_INIT(_ps_) 0 90 91 92 static __inline void 93 /*ARGSUSED*/ 94 _citrus_UTF1632_init_state(_UTF1632EncodingInfo *ei __unused, 95 _UTF1632State *s) 96 { 97 98 memset(s, 0, sizeof(*s)); 99 } 100 101 static int 102 _citrus_UTF1632_mbrtowc_priv(_UTF1632EncodingInfo *ei, wchar_t *pwc, 103 char **s, size_t n, _UTF1632State *psenc, size_t *nresult) 104 { 105 char *s0; 106 size_t result; 107 wchar_t wc = L'\0'; 108 int chlenbak, endian, needlen; 109 110 s0 = *s; 111 112 if (s0 == NULL) { 113 _citrus_UTF1632_init_state(ei, psenc); 114 *nresult = 0; /* state independent */ 115 return (0); 116 } 117 118 result = 0; 119 chlenbak = psenc->chlen; 120 121 refetch: 122 needlen = ((ei->mode & _MODE_UTF32) != 0 || chlenbak >= 2) ? 4 : 2; 123 124 while (chlenbak < needlen) { 125 if (n == 0) 126 goto restart; 127 psenc->ch[chlenbak++] = *s0++; 128 n--; 129 result++; 130 } 131 132 /* judge endian marker */ 133 if ((ei->mode & _MODE_UTF32) == 0) { 134 /* UTF16 */ 135 if (psenc->ch[0] == 0xFE && psenc->ch[1] == 0xFF) { 136 psenc->current_endian = _ENDIAN_BIG; 137 chlenbak = 0; 138 goto refetch; 139 } else if (psenc->ch[0] == 0xFF && psenc->ch[1] == 0xFE) { 140 psenc->current_endian = _ENDIAN_LITTLE; 141 chlenbak = 0; 142 goto refetch; 143 } 144 } else { 145 /* UTF32 */ 146 if (psenc->ch[0] == 0x00 && psenc->ch[1] == 0x00 && 147 psenc->ch[2] == 0xFE && psenc->ch[3] == 0xFF) { 148 psenc->current_endian = _ENDIAN_BIG; 149 chlenbak = 0; 150 goto refetch; 151 } else if (psenc->ch[0] == 0xFF && psenc->ch[1] == 0xFE && 152 psenc->ch[2] == 0x00 && psenc->ch[3] == 0x00) { 153 psenc->current_endian = _ENDIAN_LITTLE; 154 chlenbak = 0; 155 goto refetch; 156 } 157 } 158 endian = ((ei->mode & _MODE_FORCE_ENDIAN) != 0 || 159 psenc->current_endian == _ENDIAN_UNKNOWN) ? ei->preffered_endian : 160 psenc->current_endian; 161 162 /* get wc */ 163 if ((ei->mode & _MODE_UTF32) == 0) { 164 /* UTF16 */ 165 if (needlen == 2) { 166 switch (endian) { 167 case _ENDIAN_LITTLE: 168 wc = (psenc->ch[0] | 169 ((wchar_t)psenc->ch[1] << 8)); 170 break; 171 case _ENDIAN_BIG: 172 wc = (psenc->ch[1] | 173 ((wchar_t)psenc->ch[0] << 8)); 174 break; 175 default: 176 goto ilseq; 177 } 178 if (wc >= 0xD800 && wc <= 0xDBFF) { 179 /* surrogate high */ 180 needlen = 4; 181 goto refetch; 182 } 183 } else { 184 /* surrogate low */ 185 wc -= 0xD800; /* wc : surrogate high (see above) */ 186 wc <<= 10; 187 switch (endian) { 188 case _ENDIAN_LITTLE: 189 if (psenc->ch[3] < 0xDC || psenc->ch[3] > 0xDF) 190 goto ilseq; 191 wc |= psenc->ch[2]; 192 wc |= (wchar_t)(psenc->ch[3] & 3) << 8; 193 break; 194 case _ENDIAN_BIG: 195 if (psenc->ch[2]<0xDC || psenc->ch[2]>0xDF) 196 goto ilseq; 197 wc |= psenc->ch[3]; 198 wc |= (wchar_t)(psenc->ch[2] & 3) << 8; 199 break; 200 default: 201 goto ilseq; 202 } 203 wc += 0x10000; 204 } 205 } else { 206 /* UTF32 */ 207 switch (endian) { 208 case _ENDIAN_LITTLE: 209 wc = (psenc->ch[0] | 210 ((wchar_t)psenc->ch[1] << 8) | 211 ((wchar_t)psenc->ch[2] << 16) | 212 ((wchar_t)psenc->ch[3] << 24)); 213 break; 214 case _ENDIAN_BIG: 215 wc = (psenc->ch[3] | 216 ((wchar_t)psenc->ch[2] << 8) | 217 ((wchar_t)psenc->ch[1] << 16) | 218 ((wchar_t)psenc->ch[0] << 24)); 219 break; 220 default: 221 goto ilseq; 222 } 223 if (wc >= 0xD800 && wc <= 0xDFFF) 224 goto ilseq; 225 } 226 227 228 *pwc = wc; 229 psenc->chlen = 0; 230 *nresult = result; 231 *s = s0; 232 233 return (0); 234 235 ilseq: 236 *nresult = (size_t)-1; 237 psenc->chlen = 0; 238 return (EILSEQ); 239 240 restart: 241 *nresult = (size_t)-2; 242 psenc->chlen = chlenbak; 243 *s = s0; 244 return (0); 245 } 246 247 static int 248 _citrus_UTF1632_wcrtomb_priv(_UTF1632EncodingInfo *ei, char *s, size_t n, 249 wchar_t wc, _UTF1632State *psenc, size_t *nresult) 250 { 251 wchar_t wc2; 252 static const char _bom[4] = { 253 0x00, 0x00, 0xFE, 0xFF, 254 }; 255 const char *bom = &_bom[0]; 256 size_t cnt; 257 258 cnt = (size_t)0; 259 if (psenc->current_endian == _ENDIAN_UNKNOWN) { 260 if ((ei->mode & _MODE_FORCE_ENDIAN) == 0) { 261 if (ei->mode & _MODE_UTF32) 262 cnt = 4; 263 else { 264 cnt = 2; 265 bom += 2; 266 } 267 if (n < cnt) 268 goto e2big; 269 memcpy(s, bom, cnt); 270 s += cnt, n -= cnt; 271 } 272 psenc->current_endian = ei->preffered_endian; 273 } 274 275 wc2 = 0; 276 if ((ei->mode & _MODE_UTF32)==0) { 277 /* UTF16 */ 278 if (wc > 0xFFFF) { 279 /* surrogate */ 280 if (wc > 0x10FFFF) 281 goto ilseq; 282 if (n < 4) 283 goto e2big; 284 cnt += 4; 285 wc -= 0x10000; 286 wc2 = (wc & 0x3FF) | 0xDC00; 287 wc = (wc>>10) | 0xD800; 288 } else { 289 if (n < 2) 290 goto e2big; 291 cnt += 2; 292 } 293 294 surrogate: 295 switch (psenc->current_endian) { 296 case _ENDIAN_BIG: 297 s[1] = wc; 298 s[0] = (wc >>= 8); 299 break; 300 case _ENDIAN_LITTLE: 301 s[0] = wc; 302 s[1] = (wc >>= 8); 303 break; 304 } 305 if (wc2 != 0) { 306 wc = wc2; 307 wc2 = 0; 308 s += 2; 309 goto surrogate; 310 } 311 } else { 312 /* UTF32 */ 313 if (wc >= 0xD800 && wc <= 0xDFFF) 314 goto ilseq; 315 if (n < 4) 316 goto e2big; 317 cnt += 4; 318 switch (psenc->current_endian) { 319 case _ENDIAN_BIG: 320 s[3] = wc; 321 s[2] = (wc >>= 8); 322 s[1] = (wc >>= 8); 323 s[0] = (wc >>= 8); 324 break; 325 case _ENDIAN_LITTLE: 326 s[0] = wc; 327 s[1] = (wc >>= 8); 328 s[2] = (wc >>= 8); 329 s[3] = (wc >>= 8); 330 break; 331 } 332 } 333 *nresult = cnt; 334 335 return (0); 336 337 ilseq: 338 *nresult = (size_t)-1; 339 return (EILSEQ); 340 e2big: 341 *nresult = (size_t)-1; 342 return (E2BIG); 343 } 344 345 static void 346 parse_variable(_UTF1632EncodingInfo * __restrict ei, 347 const void * __restrict var, size_t lenvar) 348 { 349 const char *p; 350 351 p = var; 352 while (lenvar > 0) { 353 switch (*p) { 354 case 'B': 355 case 'b': 356 MATCH(big, ei->preffered_endian = _ENDIAN_BIG); 357 break; 358 case 'L': 359 case 'l': 360 MATCH(little, ei->preffered_endian = _ENDIAN_LITTLE); 361 break; 362 case 'i': 363 case 'I': 364 MATCH(internal, ei->preffered_endian = _ENDIAN_INTERNAL); 365 break; 366 case 's': 367 case 'S': 368 MATCH(swapped, ei->preffered_endian = _ENDIAN_SWAPPED); 369 break; 370 case 'F': 371 case 'f': 372 MATCH(force, ei->mode |= _MODE_FORCE_ENDIAN); 373 break; 374 case 'U': 375 case 'u': 376 MATCH(utf32, ei->mode |= _MODE_UTF32); 377 break; 378 } 379 p++; 380 lenvar--; 381 } 382 } 383 384 static int 385 /*ARGSUSED*/ 386 _citrus_UTF1632_encoding_module_init(_UTF1632EncodingInfo * __restrict ei, 387 const void * __restrict var, size_t lenvar) 388 { 389 390 memset((void *)ei, 0, sizeof(*ei)); 391 392 parse_variable(ei, var, lenvar); 393 394 ei->cur_min = ((ei->mode&_MODE_UTF32) == 0) ? 2 : 4; 395 ei->cur_max = ((ei->mode&_MODE_UTF32) == 0) ? 6 : 8; 396 /* 6: endian + surrogate */ 397 /* 8: endian + normal */ 398 399 if (ei->preffered_endian == _ENDIAN_UNKNOWN) { 400 ei->preffered_endian = _ENDIAN_BIG; 401 } 402 403 return (0); 404 } 405 406 static void 407 /*ARGSUSED*/ 408 _citrus_UTF1632_encoding_module_uninit(_UTF1632EncodingInfo *ei __unused) 409 { 410 411 } 412 413 static __inline int 414 /*ARGSUSED*/ 415 _citrus_UTF1632_stdenc_wctocs(_UTF1632EncodingInfo * __restrict ei __unused, 416 _csid_t * __restrict csid, _index_t * __restrict idx, _wc_t wc) 417 { 418 419 *csid = 0; 420 *idx = (_index_t)wc; 421 422 return (0); 423 } 424 425 static __inline int 426 /*ARGSUSED*/ 427 _citrus_UTF1632_stdenc_cstowc(_UTF1632EncodingInfo * __restrict ei __unused, 428 _wc_t * __restrict wc, _csid_t csid, _index_t idx) 429 { 430 431 if (csid != 0) 432 return (EILSEQ); 433 434 *wc = (_wc_t)idx; 435 436 return (0); 437 } 438 439 static __inline int 440 /*ARGSUSED*/ 441 _citrus_UTF1632_stdenc_get_state_desc_generic(_UTF1632EncodingInfo * __restrict ei __unused, 442 _UTF1632State * __restrict psenc, int * __restrict rstate) 443 { 444 445 *rstate = (psenc->chlen == 0) ? _STDENC_SDGEN_INITIAL : 446 _STDENC_SDGEN_INCOMPLETE_CHAR; 447 return (0); 448 } 449 450 /* ---------------------------------------------------------------------- 451 * public interface for stdenc 452 */ 453 454 _CITRUS_STDENC_DECLS(UTF1632); 455 _CITRUS_STDENC_DEF_OPS(UTF1632); 456 457 #include "citrus_stdenc_template.h" 458