1 /* $FreeBSD$ */ 2 /* $NetBSD: citrus_iso2022.c,v 1.20 2010/12/07 22:01:45 joerg Exp $ */ 3 4 /*- 5 * SPDX-License-Identifier: BSD-2-Clause 6 * 7 * Copyright (c)1999, 2002 Citrus Project, 8 * All rights reserved. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 22 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 29 * SUCH DAMAGE. 30 * 31 * $Citrus: xpg4dl/FreeBSD/lib/libc/locale/iso2022.c,v 1.23 2001/06/21 01:51:44 yamt Exp $ 32 */ 33 34 #include <sys/cdefs.h> 35 #include <sys/types.h> 36 37 #include <assert.h> 38 #include <errno.h> 39 #include <limits.h> 40 #include <stdbool.h> 41 #include <stddef.h> 42 #include <stdio.h> 43 #include <stdlib.h> 44 #include <string.h> 45 #include <wchar.h> 46 47 #include "citrus_namespace.h" 48 #include "citrus_types.h" 49 #include "citrus_module.h" 50 #include "citrus_stdenc.h" 51 #include "citrus_iso2022.h" 52 53 54 /* ---------------------------------------------------------------------- 55 * private stuffs used by templates 56 */ 57 58 59 /* 60 * wchar_t mappings: 61 * ASCII (ESC ( B) 00000000 00000000 00000000 0xxxxxxx 62 * iso-8859-1 (ESC , A) 00000000 00000000 00000000 1xxxxxxx 63 * 94 charset (ESC ( F) 0fffffff 00000000 00000000 0xxxxxxx 64 * 94 charset (ESC ( M F) 0fffffff 1mmmmmmm 00000000 0xxxxxxx 65 * 96 charset (ESC , F) 0fffffff 00000000 00000000 1xxxxxxx 66 * 96 charset (ESC , M F) 0fffffff 1mmmmmmm 00000000 1xxxxxxx 67 * 94x94 charset (ESC $ ( F) 0fffffff 00000000 0xxxxxxx 0xxxxxxx 68 * 96x96 charset (ESC $ , F) 0fffffff 00000000 0xxxxxxx 1xxxxxxx 69 * 94x94 charset (ESC & V ESC $ ( F) 70 * 0fffffff 1vvvvvvv 0xxxxxxx 0xxxxxxx 71 * 94x94x94 charset (ESC $ ( F) 0fffffff 0xxxxxxx 0xxxxxxx 0xxxxxxx 72 * 96x96x96 charset (ESC $ , F) 0fffffff 0xxxxxxx 0xxxxxxx 1xxxxxxx 73 * reserved for UCS4 co-existence (UCS4 is 31bit encoding thanks to mohta bit) 74 * 1xxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx 75 */ 76 77 #define CS94 (0U) 78 #define CS96 (1U) 79 #define CS94MULTI (2U) 80 #define CS96MULTI (3U) 81 82 typedef struct { 83 unsigned char type; 84 unsigned char final; 85 unsigned char interm; 86 unsigned char vers; 87 } _ISO2022Charset; 88 89 static const _ISO2022Charset ascii = { CS94, 'B', '\0', '\0' }; 90 static const _ISO2022Charset iso88591 = { CS96, 'A', '\0', '\0' }; 91 92 typedef struct { 93 _ISO2022Charset g[4]; 94 /* need 3 bits to hold -1, 0, ..., 3 */ 95 int gl:3, 96 gr:3, 97 singlegl:3, 98 singlegr:3; 99 char ch[7]; /* longest escape sequence (ESC & V ESC $ ( F) */ 100 size_t chlen; 101 int flags; 102 #define _ISO2022STATE_FLAG_INITIALIZED 1 103 } _ISO2022State; 104 105 typedef struct { 106 _ISO2022Charset *recommend[4]; 107 size_t recommendsize[4]; 108 _ISO2022Charset initg[4]; 109 int maxcharset; 110 int flags; 111 #define F_8BIT 0x0001 112 #define F_NOOLD 0x0002 113 #define F_SI 0x0010 /*0F*/ 114 #define F_SO 0x0020 /*0E*/ 115 #define F_LS0 0x0010 /*0F*/ 116 #define F_LS1 0x0020 /*0E*/ 117 #define F_LS2 0x0040 /*ESC n*/ 118 #define F_LS3 0x0080 /*ESC o*/ 119 #define F_LS1R 0x0100 /*ESC ~*/ 120 #define F_LS2R 0x0200 /*ESC }*/ 121 #define F_LS3R 0x0400 /*ESC |*/ 122 #define F_SS2 0x0800 /*ESC N*/ 123 #define F_SS3 0x1000 /*ESC O*/ 124 #define F_SS2R 0x2000 /*8E*/ 125 #define F_SS3R 0x4000 /*8F*/ 126 } _ISO2022EncodingInfo; 127 128 #define _CEI_TO_EI(_cei_) (&(_cei_)->ei) 129 #define _CEI_TO_STATE(_cei_, _func_) (_cei_)->states.s_##_func_ 130 131 #define _FUNCNAME(m) _citrus_ISO2022_##m 132 #define _ENCODING_INFO _ISO2022EncodingInfo 133 #define _ENCODING_STATE _ISO2022State 134 #define _ENCODING_MB_CUR_MAX(_ei_) MB_LEN_MAX 135 #define _ENCODING_IS_STATE_DEPENDENT 1 136 #define _STATE_NEEDS_EXPLICIT_INIT(_ps_) \ 137 (!((_ps_)->flags & _ISO2022STATE_FLAG_INITIALIZED)) 138 139 140 #define _ISO2022INVALID (wchar_t)-1 141 142 static __inline bool isc0(__uint8_t x) 143 { 144 145 return ((x & 0x1f) == x); 146 } 147 148 static __inline bool isc1(__uint8_t x) 149 { 150 151 return (0x80 <= x && x <= 0x9f); 152 } 153 154 static __inline bool iscntl(__uint8_t x) 155 { 156 157 return (isc0(x) || isc1(x) || x == 0x7f); 158 } 159 160 static __inline bool is94(__uint8_t x) 161 { 162 163 return (0x21 <= x && x <= 0x7e); 164 } 165 166 static __inline bool is96(__uint8_t x) 167 { 168 169 return (0x20 <= x && x <= 0x7f); 170 } 171 172 static __inline bool isecma(__uint8_t x) 173 { 174 175 return (0x30 <= x && x <= 0x7f); 176 } 177 178 static __inline bool isinterm(__uint8_t x) 179 { 180 181 return (0x20 <= x && x <= 0x2f); 182 } 183 184 static __inline bool isthree(__uint8_t x) 185 { 186 187 return (0x60 <= x && x <= 0x6f); 188 } 189 190 static __inline int 191 getcs(const char * __restrict p, _ISO2022Charset * __restrict cs) 192 { 193 194 if (!strncmp(p, "94$", 3) && p[3] && !p[4]) { 195 cs->final = (unsigned char)(p[3] & 0xff); 196 cs->interm = '\0'; 197 cs->vers = '\0'; 198 cs->type = CS94MULTI; 199 } else if (!strncmp(p, "96$", 3) && p[3] && !p[4]) { 200 cs->final = (unsigned char)(p[3] & 0xff); 201 cs->interm = '\0'; 202 cs->vers = '\0'; 203 cs->type = CS96MULTI; 204 } else if (!strncmp(p, "94", 2) && p[2] && !p[3]) { 205 cs->final = (unsigned char)(p[2] & 0xff); 206 cs->interm = '\0'; 207 cs->vers = '\0'; 208 cs->type = CS94; 209 } else if (!strncmp(p, "96", 2) && p[2] && !p[3]) { 210 cs->final = (unsigned char )(p[2] & 0xff); 211 cs->interm = '\0'; 212 cs->vers = '\0'; 213 cs->type = CS96; 214 } else 215 return (1); 216 217 return (0); 218 } 219 220 221 #define _NOTMATCH 0 222 #define _MATCH 1 223 #define _PARSEFAIL 2 224 225 static __inline int 226 get_recommend(_ISO2022EncodingInfo * __restrict ei, 227 const char * __restrict token) 228 { 229 _ISO2022Charset cs, *p; 230 int i; 231 232 if (!strchr("0123", token[0]) || token[1] != '=') 233 return (_NOTMATCH); 234 235 if (getcs(&token[2], &cs) == 0) 236 ; 237 else if (!strcmp(&token[2], "94")) { 238 cs.final = (unsigned char)(token[4]); 239 cs.interm = '\0'; 240 cs.vers = '\0'; 241 cs.type = CS94; 242 } else if (!strcmp(&token[2], "96")) { 243 cs.final = (unsigned char)(token[4]); 244 cs.interm = '\0'; 245 cs.vers = '\0'; 246 cs.type = CS96; 247 } else if (!strcmp(&token[2], "94$")) { 248 cs.final = (unsigned char)(token[5]); 249 cs.interm = '\0'; 250 cs.vers = '\0'; 251 cs.type = CS94MULTI; 252 } else if (!strcmp(&token[2], "96$")) { 253 cs.final = (unsigned char)(token[5]); 254 cs.interm = '\0'; 255 cs.vers = '\0'; 256 cs.type = CS96MULTI; 257 } else 258 return (_PARSEFAIL); 259 260 i = token[0] - '0'; 261 if (!ei->recommend[i]) 262 ei->recommend[i] = malloc(sizeof(_ISO2022Charset)); 263 else { 264 p = reallocarray(ei->recommend[i], ei->recommendsize[i] + 1, 265 sizeof(_ISO2022Charset)); 266 if (!p) 267 return (_PARSEFAIL); 268 ei->recommend[i] = p; 269 } 270 if (!ei->recommend[i]) 271 return (_PARSEFAIL); 272 ei->recommendsize[i]++; 273 274 (ei->recommend[i] + (ei->recommendsize[i] - 1))->final = cs.final; 275 (ei->recommend[i] + (ei->recommendsize[i] - 1))->interm = cs.interm; 276 (ei->recommend[i] + (ei->recommendsize[i] - 1))->vers = cs.vers; 277 (ei->recommend[i] + (ei->recommendsize[i] - 1))->type = cs.type; 278 279 return (_MATCH); 280 } 281 282 static __inline int 283 get_initg(_ISO2022EncodingInfo * __restrict ei, 284 const char * __restrict token) 285 { 286 _ISO2022Charset cs; 287 288 if (strncmp("INIT", &token[0], 4) || 289 !strchr("0123", token[4]) || 290 token[5] != '=') 291 return (_NOTMATCH); 292 293 if (getcs(&token[6], &cs) != 0) 294 return (_PARSEFAIL); 295 296 ei->initg[token[4] - '0'].type = cs.type; 297 ei->initg[token[4] - '0'].final = cs.final; 298 ei->initg[token[4] - '0'].interm = cs.interm; 299 ei->initg[token[4] - '0'].vers = cs.vers; 300 301 return (_MATCH); 302 } 303 304 static __inline int 305 get_max(_ISO2022EncodingInfo * __restrict ei, 306 const char * __restrict token) 307 { 308 if (!strcmp(token, "MAX1")) 309 ei->maxcharset = 1; 310 else if (!strcmp(token, "MAX2")) 311 ei->maxcharset = 2; 312 else if (!strcmp(token, "MAX3")) 313 ei->maxcharset = 3; 314 else 315 return (_NOTMATCH); 316 317 return (_MATCH); 318 } 319 320 321 static __inline int 322 get_flags(_ISO2022EncodingInfo * __restrict ei, 323 const char * __restrict token) 324 { 325 static struct { 326 const char *tag; 327 int flag; 328 } const tags[] = { 329 { "DUMMY", 0 }, 330 { "8BIT", F_8BIT }, 331 { "NOOLD", F_NOOLD }, 332 { "SI", F_SI }, 333 { "SO", F_SO }, 334 { "LS0", F_LS0 }, 335 { "LS1", F_LS1 }, 336 { "LS2", F_LS2 }, 337 { "LS3", F_LS3 }, 338 { "LS1R", F_LS1R }, 339 { "LS2R", F_LS2R }, 340 { "LS3R", F_LS3R }, 341 { "SS2", F_SS2 }, 342 { "SS3", F_SS3 }, 343 { "SS2R", F_SS2R }, 344 { "SS3R", F_SS3R }, 345 { NULL, 0 } 346 }; 347 int i; 348 349 for (i = 0; tags[i].tag; i++) 350 if (!strcmp(token, tags[i].tag)) { 351 ei->flags |= tags[i].flag; 352 return (_MATCH); 353 } 354 355 return (_NOTMATCH); 356 } 357 358 359 static __inline int 360 _citrus_ISO2022_parse_variable(_ISO2022EncodingInfo * __restrict ei, 361 const void * __restrict var, size_t lenvar __unused) 362 { 363 char const *e, *v; 364 char buf[20]; 365 size_t len; 366 int i, ret; 367 368 /* 369 * parse VARIABLE section. 370 */ 371 372 if (!var) 373 return (EFTYPE); 374 375 v = (const char *) var; 376 377 /* initialize structure */ 378 ei->maxcharset = 0; 379 for (i = 0; i < 4; i++) { 380 ei->recommend[i] = NULL; 381 ei->recommendsize[i] = 0; 382 } 383 ei->flags = 0; 384 385 while (*v) { 386 while (*v == ' ' || *v == '\t') 387 ++v; 388 389 /* find the token */ 390 e = v; 391 while (*e && *e != ' ' && *e != '\t') 392 ++e; 393 394 len = e - v; 395 if (len == 0) 396 break; 397 if (len >= sizeof(buf)) 398 goto parsefail; 399 snprintf(buf, sizeof(buf), "%.*s", (int)len, v); 400 401 if ((ret = get_recommend(ei, buf)) != _NOTMATCH) 402 ; 403 else if ((ret = get_initg(ei, buf)) != _NOTMATCH) 404 ; 405 else if ((ret = get_max(ei, buf)) != _NOTMATCH) 406 ; 407 else if ((ret = get_flags(ei, buf)) != _NOTMATCH) 408 ; 409 else 410 ret = _PARSEFAIL; 411 if (ret == _PARSEFAIL) 412 goto parsefail; 413 v = e; 414 415 } 416 417 return (0); 418 419 parsefail: 420 free(ei->recommend[0]); 421 free(ei->recommend[1]); 422 free(ei->recommend[2]); 423 free(ei->recommend[3]); 424 425 return (EFTYPE); 426 } 427 428 static __inline void 429 /*ARGSUSED*/ 430 _citrus_ISO2022_init_state(_ISO2022EncodingInfo * __restrict ei, 431 _ISO2022State * __restrict s) 432 { 433 int i; 434 435 memset(s, 0, sizeof(*s)); 436 s->gl = 0; 437 s->gr = (ei->flags & F_8BIT) ? 1 : -1; 438 439 for (i = 0; i < 4; i++) 440 if (ei->initg[i].final) { 441 s->g[i].type = ei->initg[i].type; 442 s->g[i].final = ei->initg[i].final; 443 s->g[i].interm = ei->initg[i].interm; 444 } 445 s->singlegl = s->singlegr = -1; 446 s->flags |= _ISO2022STATE_FLAG_INITIALIZED; 447 } 448 449 #if 0 450 static __inline void 451 /*ARGSUSED*/ 452 _citrus_ISO2022_pack_state(_ISO2022EncodingInfo * __restrict ei __unused, 453 void * __restrict pspriv, const _ISO2022State * __restrict s) 454 { 455 456 memcpy(pspriv, (const void *)s, sizeof(*s)); 457 } 458 459 static __inline void 460 /*ARGSUSED*/ 461 _citrus_ISO2022_unpack_state(_ISO2022EncodingInfo * __restrict ei __unused, 462 _ISO2022State * __restrict s, const void * __restrict pspriv) 463 { 464 465 memcpy((void *)s, pspriv, sizeof(*s)); 466 } 467 #endif 468 469 static int 470 /*ARGSUSED*/ 471 _citrus_ISO2022_encoding_module_init(_ISO2022EncodingInfo * __restrict ei, 472 const void * __restrict var, size_t lenvar) 473 { 474 475 return (_citrus_ISO2022_parse_variable(ei, var, lenvar)); 476 } 477 478 static void 479 /*ARGSUSED*/ 480 _citrus_ISO2022_encoding_module_uninit(_ISO2022EncodingInfo *ei __unused) 481 { 482 483 } 484 485 #define ESC '\033' 486 #define ECMA -1 487 #define INTERM -2 488 #define OECMA -3 489 static const struct seqtable { 490 int type; 491 int csoff; 492 int finaloff; 493 int intermoff; 494 int versoff; 495 int len; 496 int chars[10]; 497 } seqtable[] = { 498 /* G0 94MULTI special */ 499 { CS94MULTI, -1, 2, -1, -1, 3, { ESC, '$', OECMA }, }, 500 /* G0 94MULTI special with version identification */ 501 { CS94MULTI, -1, 5, -1, 2, 6, { ESC, '&', ECMA, ESC, '$', OECMA }, }, 502 /* G? 94 */ 503 { CS94, 1, 2, -1, -1, 3, { ESC, CS94, ECMA, }, }, 504 /* G? 94 with 2nd intermediate char */ 505 { CS94, 1, 3, 2, -1, 4, { ESC, CS94, INTERM, ECMA, }, }, 506 /* G? 96 */ 507 { CS96, 1, 2, -1, -1, 3, { ESC, CS96, ECMA, }, }, 508 /* G? 96 with 2nd intermediate char */ 509 { CS96, 1, 3, 2, -1, 4, { ESC, CS96, INTERM, ECMA, }, }, 510 /* G? 94MULTI */ 511 { CS94MULTI, 2, 3, -1, -1, 4, { ESC, '$', CS94, ECMA, }, }, 512 /* G? 96MULTI */ 513 { CS96MULTI, 2, 3, -1, -1, 4, { ESC, '$', CS96, ECMA, }, }, 514 /* G? 94MULTI with version specification */ 515 { CS94MULTI, 5, 6, -1, 2, 7, { ESC, '&', ECMA, ESC, '$', CS94, ECMA, }, }, 516 /* LS2/3 */ 517 { -1, -1, -1, -1, -1, 2, { ESC, 'n', }, }, 518 { -1, -1, -1, -1, -1, 2, { ESC, 'o', }, }, 519 /* LS1/2/3R */ 520 { -1, -1, -1, -1, -1, 2, { ESC, '~', }, }, 521 { -1, -1, -1, -1, -1, 2, { ESC, /*{*/ '}', }, }, 522 { -1, -1, -1, -1, -1, 2, { ESC, '|', }, }, 523 /* SS2/3 */ 524 { -1, -1, -1, -1, -1, 2, { ESC, 'N', }, }, 525 { -1, -1, -1, -1, -1, 2, { ESC, 'O', }, }, 526 /* end of records */ 527 // { 0, } 528 { 0, 0, 0, 0, 0, 0, { ESC, 0, }, } 529 }; 530 531 static int 532 seqmatch(const char * __restrict s, size_t n, 533 const struct seqtable * __restrict sp) 534 { 535 const int *p; 536 537 p = sp->chars; 538 while ((size_t)(p - sp->chars) < n && p - sp->chars < sp->len) { 539 switch (*p) { 540 case ECMA: 541 if (!isecma(*s)) 542 goto terminate; 543 break; 544 case OECMA: 545 if (*s && strchr("@AB", *s)) 546 break; 547 else 548 goto terminate; 549 case INTERM: 550 if (!isinterm(*s)) 551 goto terminate; 552 break; 553 case CS94: 554 if (*s && strchr("()*+", *s)) 555 break; 556 else 557 goto terminate; 558 case CS96: 559 if (*s && strchr(",-./", *s)) 560 break; 561 else 562 goto terminate; 563 default: 564 if (*s != *p) 565 goto terminate; 566 break; 567 } 568 569 p++; 570 s++; 571 } 572 573 terminate: 574 return (p - sp->chars); 575 } 576 577 static wchar_t 578 _ISO2022_sgetwchar(_ISO2022EncodingInfo * __restrict ei __unused, 579 char * __restrict string, size_t n, char ** __restrict result, 580 _ISO2022State * __restrict psenc) 581 { 582 const struct seqtable *sp; 583 wchar_t wchar = 0; 584 int i, cur, nmatch; 585 586 while (1) { 587 /* SI/SO */ 588 if (1 <= n && string[0] == '\017') { 589 psenc->gl = 0; 590 string++; 591 n--; 592 continue; 593 } 594 if (1 <= n && string[0] == '\016') { 595 psenc->gl = 1; 596 string++; 597 n--; 598 continue; 599 } 600 601 /* SS2/3R */ 602 if (1 <= n && string[0] && strchr("\217\216", string[0])) { 603 psenc->singlegl = psenc->singlegr = 604 (string[0] - '\216') + 2; 605 string++; 606 n--; 607 continue; 608 } 609 610 /* eat the letter if this is not ESC */ 611 if (1 <= n && string[0] != '\033') 612 break; 613 614 /* look for a perfect match from escape sequences */ 615 for (sp = &seqtable[0]; sp->len; sp++) { 616 nmatch = seqmatch(string, n, sp); 617 if (sp->len == nmatch && n >= (size_t)(sp->len)) 618 break; 619 } 620 621 if (!sp->len) 622 goto notseq; 623 624 if (sp->type != -1) { 625 if (sp->csoff == -1) 626 i = 0; 627 else { 628 switch (sp->type) { 629 case CS94: 630 case CS94MULTI: 631 i = string[sp->csoff] - '('; 632 break; 633 case CS96: 634 case CS96MULTI: 635 i = string[sp->csoff] - ','; 636 break; 637 default: 638 return (_ISO2022INVALID); 639 } 640 } 641 psenc->g[i].type = sp->type; 642 psenc->g[i].final = '\0'; 643 psenc->g[i].interm = '\0'; 644 psenc->g[i].vers = '\0'; 645 /* sp->finaloff must not be -1 */ 646 if (sp->finaloff != -1) 647 psenc->g[i].final = string[sp->finaloff]; 648 if (sp->intermoff != -1) 649 psenc->g[i].interm = string[sp->intermoff]; 650 if (sp->versoff != -1) 651 psenc->g[i].vers = string[sp->versoff]; 652 653 string += sp->len; 654 n -= sp->len; 655 continue; 656 } 657 658 /* LS2/3 */ 659 if (2 <= n && string[0] == '\033' && 660 string[1] && strchr("no", string[1])) { 661 psenc->gl = string[1] - 'n' + 2; 662 string += 2; 663 n -= 2; 664 continue; 665 } 666 667 /* LS1/2/3R */ 668 /* XXX: { for vi showmatch */ 669 if (2 <= n && string[0] == '\033' && 670 string[1] && strchr("~}|", string[1])) { 671 psenc->gr = 3 - (string[1] - '|'); 672 string += 2; 673 n -= 2; 674 continue; 675 } 676 677 /* SS2/3 */ 678 if (2 <= n && string[0] == '\033' && string[1] && 679 strchr("NO", string[1])) { 680 psenc->singlegl = (string[1] - 'N') + 2; 681 string += 2; 682 n -= 2; 683 continue; 684 } 685 686 notseq: 687 /* 688 * if we've got an unknown escape sequence, eat the ESC at the 689 * head. otherwise, wait till full escape sequence comes. 690 */ 691 for (sp = &seqtable[0]; sp->len; sp++) { 692 nmatch = seqmatch(string, n, sp); 693 if (!nmatch) 694 continue; 695 696 /* 697 * if we are in the middle of escape sequence, 698 * we still need to wait for more characters to come 699 */ 700 if (n < (size_t)(sp->len)) { 701 if ((size_t)(nmatch) == n) { 702 if (result) 703 *result = string; 704 return (_ISO2022INVALID); 705 } 706 } else { 707 if (nmatch == sp->len) { 708 /* this case should not happen */ 709 goto eat; 710 } 711 } 712 } 713 714 break; 715 } 716 717 eat: 718 /* no letter to eat */ 719 if (n < 1) { 720 if (result) 721 *result = string; 722 return (_ISO2022INVALID); 723 } 724 725 /* normal chars. always eat C0/C1 as is. */ 726 if (iscntl(*string & 0xff)) 727 cur = -1; 728 else if (*string & 0x80) 729 cur = (psenc->singlegr == -1) ? psenc->gr : psenc->singlegr; 730 else 731 cur = (psenc->singlegl == -1) ? psenc->gl : psenc->singlegl; 732 733 if (cur == -1) { 734 asis: 735 wchar = *string++ & 0xff; 736 if (result) 737 *result = string; 738 /* reset single shift state */ 739 psenc->singlegr = psenc->singlegl = -1; 740 return (wchar); 741 } 742 743 /* length error check */ 744 switch (psenc->g[cur].type) { 745 case CS94MULTI: 746 case CS96MULTI: 747 if (!isthree(psenc->g[cur].final)) { 748 if (2 <= n && 749 (string[0] & 0x80) == (string[1] & 0x80)) 750 break; 751 } else { 752 if (3 <= n && 753 (string[0] & 0x80) == (string[1] & 0x80) && 754 (string[0] & 0x80) == (string[2] & 0x80)) 755 break; 756 } 757 758 /* we still need to wait for more characters to come */ 759 if (result) 760 *result = string; 761 return (_ISO2022INVALID); 762 763 case CS94: 764 case CS96: 765 if (1 <= n) 766 break; 767 768 /* we still need to wait for more characters to come */ 769 if (result) 770 *result = string; 771 return (_ISO2022INVALID); 772 } 773 774 /* range check */ 775 switch (psenc->g[cur].type) { 776 case CS94: 777 if (!(is94(string[0] & 0x7f))) 778 goto asis; 779 break; 780 case CS96: 781 if (!(is96(string[0] & 0x7f))) 782 goto asis; 783 break; 784 case CS94MULTI: 785 if (!(is94(string[0] & 0x7f) && is94(string[1] & 0x7f))) 786 goto asis; 787 break; 788 case CS96MULTI: 789 if (!(is96(string[0] & 0x7f) && is96(string[1] & 0x7f))) 790 goto asis; 791 break; 792 } 793 794 /* extract the character. */ 795 switch (psenc->g[cur].type) { 796 case CS94: 797 /* special case for ASCII. */ 798 if (psenc->g[cur].final == 'B' && !psenc->g[cur].interm) { 799 wchar = *string++; 800 wchar &= 0x7f; 801 break; 802 } 803 wchar = psenc->g[cur].final; 804 wchar = (wchar << 8); 805 wchar |= (psenc->g[cur].interm ? (0x80 | psenc->g[cur].interm) : 0); 806 wchar = (wchar << 8); 807 wchar = (wchar << 8) | (*string++ & 0x7f); 808 break; 809 case CS96: 810 /* special case for ISO-8859-1. */ 811 if (psenc->g[cur].final == 'A' && !psenc->g[cur].interm) { 812 wchar = *string++; 813 wchar &= 0x7f; 814 wchar |= 0x80; 815 break; 816 } 817 wchar = psenc->g[cur].final; 818 wchar = (wchar << 8); 819 wchar |= (psenc->g[cur].interm ? (0x80 | psenc->g[cur].interm) : 0); 820 wchar = (wchar << 8); 821 wchar = (wchar << 8) | (*string++ & 0x7f); 822 wchar |= 0x80; 823 break; 824 case CS94MULTI: 825 case CS96MULTI: 826 wchar = psenc->g[cur].final; 827 wchar = (wchar << 8); 828 if (isthree(psenc->g[cur].final)) 829 wchar |= (*string++ & 0x7f); 830 wchar = (wchar << 8) | (*string++ & 0x7f); 831 wchar = (wchar << 8) | (*string++ & 0x7f); 832 if (psenc->g[cur].type == CS96MULTI) 833 wchar |= 0x80; 834 break; 835 } 836 837 if (result) 838 *result = string; 839 /* reset single shift state */ 840 psenc->singlegr = psenc->singlegl = -1; 841 return (wchar); 842 } 843 844 845 846 static int 847 _citrus_ISO2022_mbrtowc_priv(_ISO2022EncodingInfo * __restrict ei, 848 wchar_t * __restrict pwc, char ** __restrict s, 849 size_t n, _ISO2022State * __restrict psenc, size_t * __restrict nresult) 850 { 851 char *p, *result, *s0; 852 wchar_t wchar; 853 int c, chlenbak; 854 855 if (*s == NULL) { 856 _citrus_ISO2022_init_state(ei, psenc); 857 *nresult = _ENCODING_IS_STATE_DEPENDENT; 858 return (0); 859 } 860 s0 = *s; 861 c = 0; 862 chlenbak = psenc->chlen; 863 864 /* 865 * if we have something in buffer, use that. 866 * otherwise, skip here 867 */ 868 if (psenc->chlen > sizeof(psenc->ch)) { 869 /* illgeal state */ 870 _citrus_ISO2022_init_state(ei, psenc); 871 goto encoding_error; 872 } 873 if (psenc->chlen == 0) 874 goto emptybuf; 875 876 /* buffer is not empty */ 877 p = psenc->ch; 878 while (psenc->chlen < sizeof(psenc->ch)) { 879 if (n > 0) { 880 psenc->ch[psenc->chlen++] = *s0++; 881 n--; 882 } 883 884 wchar = _ISO2022_sgetwchar(ei, p, psenc->chlen - (p-psenc->ch), 885 &result, psenc); 886 c += result - p; 887 if (wchar != _ISO2022INVALID) { 888 if (psenc->chlen > (size_t)c) 889 memmove(psenc->ch, result, psenc->chlen - c); 890 if (psenc->chlen < (size_t)c) 891 psenc->chlen = 0; 892 else 893 psenc->chlen -= c; 894 goto output; 895 } 896 897 if (n == 0) { 898 if ((size_t)(result - p) == psenc->chlen) 899 /* complete shift sequence. */ 900 psenc->chlen = 0; 901 goto restart; 902 } 903 904 p = result; 905 } 906 907 /* escape sequence too long? */ 908 goto encoding_error; 909 910 emptybuf: 911 wchar = _ISO2022_sgetwchar(ei, s0, n, &result, psenc); 912 if (wchar != _ISO2022INVALID) { 913 c += result - s0; 914 psenc->chlen = 0; 915 s0 = result; 916 goto output; 917 } 918 if (result > s0) { 919 c += (result - s0); 920 n -= (result - s0); 921 s0 = result; 922 if (n > 0) 923 goto emptybuf; 924 /* complete shift sequence. */ 925 goto restart; 926 } 927 n += c; 928 if (n < sizeof(psenc->ch)) { 929 memcpy(psenc->ch, s0 - c, n); 930 psenc->chlen = n; 931 s0 = result; 932 goto restart; 933 } 934 935 /* escape sequence too long? */ 936 937 encoding_error: 938 psenc->chlen = 0; 939 *nresult = (size_t)-1; 940 return (EILSEQ); 941 942 output: 943 *s = s0; 944 if (pwc) 945 *pwc = wchar; 946 *nresult = wchar ? c - chlenbak : 0; 947 return (0); 948 949 restart: 950 *s = s0; 951 *nresult = (size_t)-2; 952 953 return (0); 954 } 955 956 static int 957 recommendation(_ISO2022EncodingInfo * __restrict ei, 958 _ISO2022Charset * __restrict cs) 959 { 960 _ISO2022Charset *recommend; 961 size_t j; 962 int i; 963 964 /* first, try a exact match. */ 965 for (i = 0; i < 4; i++) { 966 recommend = ei->recommend[i]; 967 for (j = 0; j < ei->recommendsize[i]; j++) { 968 if (cs->type != recommend[j].type) 969 continue; 970 if (cs->final != recommend[j].final) 971 continue; 972 if (cs->interm != recommend[j].interm) 973 continue; 974 975 return (i); 976 } 977 } 978 979 /* then, try a wildcard match over final char. */ 980 for (i = 0; i < 4; i++) { 981 recommend = ei->recommend[i]; 982 for (j = 0; j < ei->recommendsize[i]; j++) { 983 if (cs->type != recommend[j].type) 984 continue; 985 if (cs->final && (cs->final != recommend[j].final)) 986 continue; 987 if (cs->interm && (cs->interm != recommend[j].interm)) 988 continue; 989 990 return (i); 991 } 992 } 993 994 /* there's no recommendation. make a guess. */ 995 if (ei->maxcharset == 0) { 996 return (0); 997 } else { 998 switch (cs->type) { 999 case CS94: 1000 case CS94MULTI: 1001 return (0); 1002 case CS96: 1003 case CS96MULTI: 1004 return (1); 1005 } 1006 } 1007 return (0); 1008 } 1009 1010 static int 1011 _ISO2022_sputwchar(_ISO2022EncodingInfo * __restrict ei, wchar_t wc, 1012 char * __restrict string, size_t n, char ** __restrict result, 1013 _ISO2022State * __restrict psenc, size_t * __restrict nresult) 1014 { 1015 _ISO2022Charset cs; 1016 char *p; 1017 char tmp[MB_LEN_MAX]; 1018 size_t len; 1019 int bit8, i = 0, target; 1020 unsigned char mask; 1021 1022 if (isc0(wc & 0xff)) { 1023 /* go back to INIT0 or ASCII on control chars */ 1024 cs = ei->initg[0].final ? ei->initg[0] : ascii; 1025 } else if (isc1(wc & 0xff)) { 1026 /* go back to INIT1 or ISO-8859-1 on control chars */ 1027 cs = ei->initg[1].final ? ei->initg[1] : iso88591; 1028 } else if (!(wc & ~0xff)) { 1029 if (wc & 0x80) { 1030 /* special treatment for ISO-8859-1 */ 1031 cs = iso88591; 1032 } else { 1033 /* special treatment for ASCII */ 1034 cs = ascii; 1035 } 1036 } else { 1037 cs.final = (wc >> 24) & 0x7f; 1038 if ((wc >> 16) & 0x80) 1039 cs.interm = (wc >> 16) & 0x7f; 1040 else 1041 cs.interm = '\0'; 1042 if (wc & 0x80) 1043 cs.type = (wc & 0x00007f00) ? CS96MULTI : CS96; 1044 else 1045 cs.type = (wc & 0x00007f00) ? CS94MULTI : CS94; 1046 } 1047 target = recommendation(ei, &cs); 1048 p = tmp; 1049 bit8 = ei->flags & F_8BIT; 1050 1051 /* designate the charset onto the target plane(G0/1/2/3). */ 1052 if (psenc->g[target].type == cs.type && 1053 psenc->g[target].final == cs.final && 1054 psenc->g[target].interm == cs.interm) 1055 goto planeok; 1056 1057 *p++ = '\033'; 1058 if (cs.type == CS94MULTI || cs.type == CS96MULTI) 1059 *p++ = '$'; 1060 if (target == 0 && cs.type == CS94MULTI && strchr("@AB", cs.final) && 1061 !cs.interm && !(ei->flags & F_NOOLD)) 1062 ; 1063 else if (cs.type == CS94 || cs.type == CS94MULTI) 1064 *p++ = "()*+"[target]; 1065 else 1066 *p++ = ",-./"[target]; 1067 if (cs.interm) 1068 *p++ = cs.interm; 1069 *p++ = cs.final; 1070 1071 psenc->g[target].type = cs.type; 1072 psenc->g[target].final = cs.final; 1073 psenc->g[target].interm = cs.interm; 1074 1075 planeok: 1076 /* invoke the plane onto GL or GR. */ 1077 if (psenc->gl == target) 1078 goto sideok; 1079 if (bit8 && psenc->gr == target) 1080 goto sideok; 1081 1082 if (target == 0 && (ei->flags & F_LS0)) { 1083 *p++ = '\017'; 1084 psenc->gl = 0; 1085 } else if (target == 1 && (ei->flags & F_LS1)) { 1086 *p++ = '\016'; 1087 psenc->gl = 1; 1088 } else if (target == 2 && (ei->flags & F_LS2)) { 1089 *p++ = '\033'; 1090 *p++ = 'n'; 1091 psenc->gl = 2; 1092 } else if (target == 3 && (ei->flags & F_LS3)) { 1093 *p++ = '\033'; 1094 *p++ = 'o'; 1095 psenc->gl = 3; 1096 } else if (bit8 && target == 1 && (ei->flags & F_LS1R)) { 1097 *p++ = '\033'; 1098 *p++ = '~'; 1099 psenc->gr = 1; 1100 } else if (bit8 && target == 2 && (ei->flags & F_LS2R)) { 1101 *p++ = '\033'; 1102 /*{*/ 1103 *p++ = '}'; 1104 psenc->gr = 2; 1105 } else if (bit8 && target == 3 && (ei->flags & F_LS3R)) { 1106 *p++ = '\033'; 1107 *p++ = '|'; 1108 psenc->gr = 3; 1109 } else if (target == 2 && (ei->flags & F_SS2)) { 1110 *p++ = '\033'; 1111 *p++ = 'N'; 1112 psenc->singlegl = 2; 1113 } else if (target == 3 && (ei->flags & F_SS3)) { 1114 *p++ = '\033'; 1115 *p++ = 'O'; 1116 psenc->singlegl = 3; 1117 } else if (bit8 && target == 2 && (ei->flags & F_SS2R)) { 1118 *p++ = '\216'; 1119 *p++ = 'N'; 1120 psenc->singlegl = psenc->singlegr = 2; 1121 } else if (bit8 && target == 3 && (ei->flags & F_SS3R)) { 1122 *p++ = '\217'; 1123 *p++ = 'O'; 1124 psenc->singlegl = psenc->singlegr = 3; 1125 } else 1126 goto ilseq; 1127 1128 sideok: 1129 if (psenc->singlegl == target) 1130 mask = 0x00; 1131 else if (psenc->singlegr == target) 1132 mask = 0x80; 1133 else if (psenc->gl == target) 1134 mask = 0x00; 1135 else if ((ei->flags & F_8BIT) && psenc->gr == target) 1136 mask = 0x80; 1137 else 1138 goto ilseq; 1139 1140 switch (cs.type) { 1141 case CS94: 1142 case CS96: 1143 i = 1; 1144 break; 1145 case CS94MULTI: 1146 case CS96MULTI: 1147 i = !iscntl(wc & 0xff) ? 1148 (isthree(cs.final) ? 3 : 2) : 1; 1149 break; 1150 } 1151 while (i-- > 0) 1152 *p++ = ((wc >> (i << 3)) & 0x7f) | mask; 1153 1154 /* reset single shift state */ 1155 psenc->singlegl = psenc->singlegr = -1; 1156 1157 len = (size_t)(p - tmp); 1158 if (n < len) { 1159 if (result) 1160 *result = (char *)0; 1161 *nresult = (size_t)-1; 1162 return (E2BIG); 1163 } 1164 if (result) 1165 *result = string + len; 1166 memcpy(string, tmp, len); 1167 *nresult = len; 1168 1169 return (0); 1170 1171 ilseq: 1172 *nresult = (size_t)-1; 1173 return (EILSEQ); 1174 } 1175 1176 static int 1177 _citrus_ISO2022_put_state_reset(_ISO2022EncodingInfo * __restrict ei, 1178 char * __restrict s, size_t n, _ISO2022State * __restrict psenc, 1179 size_t * __restrict nresult) 1180 { 1181 char *result; 1182 char buf[MB_LEN_MAX]; 1183 size_t len; 1184 int ret; 1185 1186 /* XXX state will be modified after this operation... */ 1187 ret = _ISO2022_sputwchar(ei, L'\0', buf, sizeof(buf), &result, psenc, 1188 &len); 1189 if (ret) { 1190 *nresult = len; 1191 return (ret); 1192 } 1193 1194 if (sizeof(buf) < len || n < len-1) { 1195 /* XXX should recover state? */ 1196 *nresult = (size_t)-1; 1197 return (E2BIG); 1198 } 1199 1200 memcpy(s, buf, len - 1); 1201 *nresult = len - 1; 1202 return (0); 1203 } 1204 1205 static int 1206 _citrus_ISO2022_wcrtomb_priv(_ISO2022EncodingInfo * __restrict ei, 1207 char * __restrict s, size_t n, wchar_t wc, 1208 _ISO2022State * __restrict psenc, size_t * __restrict nresult) 1209 { 1210 char *result; 1211 char buf[MB_LEN_MAX]; 1212 size_t len; 1213 int ret; 1214 1215 /* XXX state will be modified after this operation... */ 1216 ret = _ISO2022_sputwchar(ei, wc, buf, sizeof(buf), &result, psenc, 1217 &len); 1218 if (ret) { 1219 *nresult = len; 1220 return (ret); 1221 } 1222 1223 if (sizeof(buf) < len || n < len) { 1224 /* XXX should recover state? */ 1225 *nresult = (size_t)-1; 1226 return (E2BIG); 1227 } 1228 1229 memcpy(s, buf, len); 1230 *nresult = len; 1231 return (0); 1232 } 1233 1234 static __inline int 1235 /*ARGSUSED*/ 1236 _citrus_ISO2022_stdenc_wctocs(_ISO2022EncodingInfo * __restrict ei __unused, 1237 _csid_t * __restrict csid, _index_t * __restrict idx, wchar_t wc) 1238 { 1239 wchar_t m, nm; 1240 1241 m = wc & 0x7FFF8080; 1242 nm = wc & 0x007F7F7F; 1243 if (m & 0x00800000) 1244 nm &= 0x00007F7F; 1245 else 1246 m &= 0x7F008080; 1247 if (nm & 0x007F0000) { 1248 /* ^3 mark */ 1249 m |= 0x007F0000; 1250 } else if (nm & 0x00007F00) { 1251 /* ^2 mark */ 1252 m |= 0x00007F00; 1253 } 1254 *csid = (_csid_t)m; 1255 *idx = (_index_t)nm; 1256 1257 return (0); 1258 } 1259 1260 static __inline int 1261 /*ARGSUSED*/ 1262 _citrus_ISO2022_stdenc_cstowc(_ISO2022EncodingInfo * __restrict ei __unused, 1263 wchar_t * __restrict wc, _csid_t csid, _index_t idx) 1264 { 1265 1266 *wc = (wchar_t)(csid & 0x7F808080) | (wchar_t)idx; 1267 1268 return (0); 1269 } 1270 1271 static __inline int 1272 /*ARGSUSED*/ 1273 _citrus_ISO2022_stdenc_get_state_desc_generic(_ISO2022EncodingInfo * __restrict ei __unused, 1274 _ISO2022State * __restrict psenc, int * __restrict rstate) 1275 { 1276 1277 if (psenc->chlen == 0) { 1278 /* XXX: it should distinguish initial and stable. */ 1279 *rstate = _STDENC_SDGEN_STABLE; 1280 } else 1281 *rstate = (psenc->ch[0] == '\033') ? 1282 _STDENC_SDGEN_INCOMPLETE_SHIFT : 1283 _STDENC_SDGEN_INCOMPLETE_CHAR; 1284 return (0); 1285 } 1286 1287 /* ---------------------------------------------------------------------- 1288 * public interface for stdenc 1289 */ 1290 1291 _CITRUS_STDENC_DECLS(ISO2022); 1292 _CITRUS_STDENC_DEF_OPS(ISO2022); 1293 1294 #include "citrus_stdenc_template.h" 1295