1 /* $NetBSD: citrus_iso2022.c,v 1.20 2010/12/07 22:01:45 joerg Exp $ */ 2 3 /*- 4 * SPDX-License-Identifier: BSD-2-Clause 5 * 6 * Copyright (c)1999, 2002 Citrus Project, 7 * All rights reserved. 8 * 9 * Redistribution and use in source and binary forms, with or without 10 * modification, are permitted provided that the following conditions 11 * are met: 12 * 1. Redistributions of source code must retain the above copyright 13 * notice, this list of conditions and the following disclaimer. 14 * 2. Redistributions in binary form must reproduce the above copyright 15 * notice, this list of conditions and the following disclaimer in the 16 * documentation and/or other materials provided with the distribution. 17 * 18 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 19 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 20 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 21 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 22 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 24 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 25 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 26 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 27 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 28 * SUCH DAMAGE. 29 * 30 * $Citrus: xpg4dl/FreeBSD/lib/libc/locale/iso2022.c,v 1.23 2001/06/21 01:51:44 yamt Exp $ 31 */ 32 33 #include <sys/cdefs.h> 34 #include <sys/types.h> 35 36 #include <assert.h> 37 #include <errno.h> 38 #include <limits.h> 39 #include <stdbool.h> 40 #include <stddef.h> 41 #include <stdio.h> 42 #include <stdlib.h> 43 #include <string.h> 44 #include <wchar.h> 45 46 #include "citrus_namespace.h" 47 #include "citrus_types.h" 48 #include "citrus_module.h" 49 #include "citrus_stdenc.h" 50 #include "citrus_iso2022.h" 51 52 53 /* ---------------------------------------------------------------------- 54 * private stuffs used by templates 55 */ 56 57 58 /* 59 * wchar_t mappings: 60 * ASCII (ESC ( B) 00000000 00000000 00000000 0xxxxxxx 61 * iso-8859-1 (ESC , A) 00000000 00000000 00000000 1xxxxxxx 62 * 94 charset (ESC ( F) 0fffffff 00000000 00000000 0xxxxxxx 63 * 94 charset (ESC ( M F) 0fffffff 1mmmmmmm 00000000 0xxxxxxx 64 * 96 charset (ESC , F) 0fffffff 00000000 00000000 1xxxxxxx 65 * 96 charset (ESC , M F) 0fffffff 1mmmmmmm 00000000 1xxxxxxx 66 * 94x94 charset (ESC $ ( F) 0fffffff 00000000 0xxxxxxx 0xxxxxxx 67 * 96x96 charset (ESC $ , F) 0fffffff 00000000 0xxxxxxx 1xxxxxxx 68 * 94x94 charset (ESC & V ESC $ ( F) 69 * 0fffffff 1vvvvvvv 0xxxxxxx 0xxxxxxx 70 * 94x94x94 charset (ESC $ ( F) 0fffffff 0xxxxxxx 0xxxxxxx 0xxxxxxx 71 * 96x96x96 charset (ESC $ , F) 0fffffff 0xxxxxxx 0xxxxxxx 1xxxxxxx 72 * reserved for UCS4 co-existence (UCS4 is 31bit encoding thanks to mohta bit) 73 * 1xxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx 74 */ 75 76 #define CS94 (0U) 77 #define CS96 (1U) 78 #define CS94MULTI (2U) 79 #define CS96MULTI (3U) 80 81 typedef struct { 82 unsigned char type; 83 unsigned char final; 84 unsigned char interm; 85 unsigned char vers; 86 } _ISO2022Charset; 87 88 static const _ISO2022Charset ascii = { CS94, 'B', '\0', '\0' }; 89 static const _ISO2022Charset iso88591 = { CS96, 'A', '\0', '\0' }; 90 91 typedef struct { 92 _ISO2022Charset g[4]; 93 /* need 3 bits to hold -1, 0, ..., 3 */ 94 int gl:3, 95 gr:3, 96 singlegl:3, 97 singlegr:3; 98 char ch[7]; /* longest escape sequence (ESC & V ESC $ ( F) */ 99 size_t chlen; 100 int flags; 101 #define _ISO2022STATE_FLAG_INITIALIZED 1 102 } _ISO2022State; 103 104 typedef struct { 105 _ISO2022Charset *recommend[4]; 106 size_t recommendsize[4]; 107 _ISO2022Charset initg[4]; 108 int maxcharset; 109 int flags; 110 #define F_8BIT 0x0001 111 #define F_NOOLD 0x0002 112 #define F_SI 0x0010 /*0F*/ 113 #define F_SO 0x0020 /*0E*/ 114 #define F_LS0 0x0010 /*0F*/ 115 #define F_LS1 0x0020 /*0E*/ 116 #define F_LS2 0x0040 /*ESC n*/ 117 #define F_LS3 0x0080 /*ESC o*/ 118 #define F_LS1R 0x0100 /*ESC ~*/ 119 #define F_LS2R 0x0200 /*ESC }*/ 120 #define F_LS3R 0x0400 /*ESC |*/ 121 #define F_SS2 0x0800 /*ESC N*/ 122 #define F_SS3 0x1000 /*ESC O*/ 123 #define F_SS2R 0x2000 /*8E*/ 124 #define F_SS3R 0x4000 /*8F*/ 125 } _ISO2022EncodingInfo; 126 127 #define _CEI_TO_EI(_cei_) (&(_cei_)->ei) 128 #define _CEI_TO_STATE(_cei_, _func_) (_cei_)->states.s_##_func_ 129 130 #define _FUNCNAME(m) _citrus_ISO2022_##m 131 #define _ENCODING_INFO _ISO2022EncodingInfo 132 #define _ENCODING_STATE _ISO2022State 133 #define _ENCODING_MB_CUR_MAX(_ei_) MB_LEN_MAX 134 #define _ENCODING_IS_STATE_DEPENDENT 1 135 #define _STATE_NEEDS_EXPLICIT_INIT(_ps_) \ 136 (!((_ps_)->flags & _ISO2022STATE_FLAG_INITIALIZED)) 137 138 139 #define _ISO2022INVALID (wchar_t)-1 140 141 static __inline bool isc0(__uint8_t x) 142 { 143 144 return ((x & 0x1f) == x); 145 } 146 147 static __inline bool isc1(__uint8_t x) 148 { 149 150 return (0x80 <= x && x <= 0x9f); 151 } 152 153 static __inline bool iscntl(__uint8_t x) 154 { 155 156 return (isc0(x) || isc1(x) || x == 0x7f); 157 } 158 159 static __inline bool is94(__uint8_t x) 160 { 161 162 return (0x21 <= x && x <= 0x7e); 163 } 164 165 static __inline bool is96(__uint8_t x) 166 { 167 168 return (0x20 <= x && x <= 0x7f); 169 } 170 171 static __inline bool isecma(__uint8_t x) 172 { 173 174 return (0x30 <= x && x <= 0x7f); 175 } 176 177 static __inline bool isinterm(__uint8_t x) 178 { 179 180 return (0x20 <= x && x <= 0x2f); 181 } 182 183 static __inline bool isthree(__uint8_t x) 184 { 185 186 return (0x60 <= x && x <= 0x6f); 187 } 188 189 static __inline int 190 getcs(const char * __restrict p, _ISO2022Charset * __restrict cs) 191 { 192 193 if (!strncmp(p, "94$", 3) && p[3] && !p[4]) { 194 cs->final = (unsigned char)(p[3] & 0xff); 195 cs->interm = '\0'; 196 cs->vers = '\0'; 197 cs->type = CS94MULTI; 198 } else if (!strncmp(p, "96$", 3) && p[3] && !p[4]) { 199 cs->final = (unsigned char)(p[3] & 0xff); 200 cs->interm = '\0'; 201 cs->vers = '\0'; 202 cs->type = CS96MULTI; 203 } else if (!strncmp(p, "94", 2) && p[2] && !p[3]) { 204 cs->final = (unsigned char)(p[2] & 0xff); 205 cs->interm = '\0'; 206 cs->vers = '\0'; 207 cs->type = CS94; 208 } else if (!strncmp(p, "96", 2) && p[2] && !p[3]) { 209 cs->final = (unsigned char )(p[2] & 0xff); 210 cs->interm = '\0'; 211 cs->vers = '\0'; 212 cs->type = CS96; 213 } else 214 return (1); 215 216 return (0); 217 } 218 219 220 #define _NOTMATCH 0 221 #define _MATCH 1 222 #define _PARSEFAIL 2 223 224 static __inline int 225 get_recommend(_ISO2022EncodingInfo * __restrict ei, 226 const char * __restrict token) 227 { 228 _ISO2022Charset cs, *p; 229 int i; 230 231 if (!strchr("0123", token[0]) || token[1] != '=') 232 return (_NOTMATCH); 233 234 if (getcs(&token[2], &cs) == 0) 235 ; 236 else if (!strcmp(&token[2], "94")) { 237 cs.final = (unsigned char)(token[4]); 238 cs.interm = '\0'; 239 cs.vers = '\0'; 240 cs.type = CS94; 241 } else if (!strcmp(&token[2], "96")) { 242 cs.final = (unsigned char)(token[4]); 243 cs.interm = '\0'; 244 cs.vers = '\0'; 245 cs.type = CS96; 246 } else if (!strcmp(&token[2], "94$")) { 247 cs.final = (unsigned char)(token[5]); 248 cs.interm = '\0'; 249 cs.vers = '\0'; 250 cs.type = CS94MULTI; 251 } else if (!strcmp(&token[2], "96$")) { 252 cs.final = (unsigned char)(token[5]); 253 cs.interm = '\0'; 254 cs.vers = '\0'; 255 cs.type = CS96MULTI; 256 } else 257 return (_PARSEFAIL); 258 259 i = token[0] - '0'; 260 if (!ei->recommend[i]) 261 ei->recommend[i] = malloc(sizeof(_ISO2022Charset)); 262 else { 263 p = reallocarray(ei->recommend[i], ei->recommendsize[i] + 1, 264 sizeof(_ISO2022Charset)); 265 if (!p) 266 return (_PARSEFAIL); 267 ei->recommend[i] = p; 268 } 269 if (!ei->recommend[i]) 270 return (_PARSEFAIL); 271 ei->recommendsize[i]++; 272 273 (ei->recommend[i] + (ei->recommendsize[i] - 1))->final = cs.final; 274 (ei->recommend[i] + (ei->recommendsize[i] - 1))->interm = cs.interm; 275 (ei->recommend[i] + (ei->recommendsize[i] - 1))->vers = cs.vers; 276 (ei->recommend[i] + (ei->recommendsize[i] - 1))->type = cs.type; 277 278 return (_MATCH); 279 } 280 281 static __inline int 282 get_initg(_ISO2022EncodingInfo * __restrict ei, 283 const char * __restrict token) 284 { 285 _ISO2022Charset cs; 286 287 if (strncmp("INIT", &token[0], 4) || 288 !strchr("0123", token[4]) || 289 token[5] != '=') 290 return (_NOTMATCH); 291 292 if (getcs(&token[6], &cs) != 0) 293 return (_PARSEFAIL); 294 295 ei->initg[token[4] - '0'].type = cs.type; 296 ei->initg[token[4] - '0'].final = cs.final; 297 ei->initg[token[4] - '0'].interm = cs.interm; 298 ei->initg[token[4] - '0'].vers = cs.vers; 299 300 return (_MATCH); 301 } 302 303 static __inline int 304 get_max(_ISO2022EncodingInfo * __restrict ei, 305 const char * __restrict token) 306 { 307 if (!strcmp(token, "MAX1")) 308 ei->maxcharset = 1; 309 else if (!strcmp(token, "MAX2")) 310 ei->maxcharset = 2; 311 else if (!strcmp(token, "MAX3")) 312 ei->maxcharset = 3; 313 else 314 return (_NOTMATCH); 315 316 return (_MATCH); 317 } 318 319 320 static __inline int 321 get_flags(_ISO2022EncodingInfo * __restrict ei, 322 const char * __restrict token) 323 { 324 static struct { 325 const char *tag; 326 int flag; 327 } const tags[] = { 328 { "DUMMY", 0 }, 329 { "8BIT", F_8BIT }, 330 { "NOOLD", F_NOOLD }, 331 { "SI", F_SI }, 332 { "SO", F_SO }, 333 { "LS0", F_LS0 }, 334 { "LS1", F_LS1 }, 335 { "LS2", F_LS2 }, 336 { "LS3", F_LS3 }, 337 { "LS1R", F_LS1R }, 338 { "LS2R", F_LS2R }, 339 { "LS3R", F_LS3R }, 340 { "SS2", F_SS2 }, 341 { "SS3", F_SS3 }, 342 { "SS2R", F_SS2R }, 343 { "SS3R", F_SS3R }, 344 { NULL, 0 } 345 }; 346 int i; 347 348 for (i = 0; tags[i].tag; i++) 349 if (!strcmp(token, tags[i].tag)) { 350 ei->flags |= tags[i].flag; 351 return (_MATCH); 352 } 353 354 return (_NOTMATCH); 355 } 356 357 358 static __inline int 359 _citrus_ISO2022_parse_variable(_ISO2022EncodingInfo * __restrict ei, 360 const void * __restrict var, size_t lenvar __unused) 361 { 362 char const *e, *v; 363 char buf[20]; 364 size_t len; 365 int i, ret; 366 367 /* 368 * parse VARIABLE section. 369 */ 370 371 if (!var) 372 return (EFTYPE); 373 374 v = (const char *) var; 375 376 /* initialize structure */ 377 ei->maxcharset = 0; 378 for (i = 0; i < 4; i++) { 379 ei->recommend[i] = NULL; 380 ei->recommendsize[i] = 0; 381 } 382 ei->flags = 0; 383 384 while (*v) { 385 while (*v == ' ' || *v == '\t') 386 ++v; 387 388 /* find the token */ 389 e = v; 390 while (*e && *e != ' ' && *e != '\t') 391 ++e; 392 393 len = e - v; 394 if (len == 0) 395 break; 396 if (len >= sizeof(buf)) 397 goto parsefail; 398 snprintf(buf, sizeof(buf), "%.*s", (int)len, v); 399 400 if ((ret = get_recommend(ei, buf)) != _NOTMATCH) 401 ; 402 else if ((ret = get_initg(ei, buf)) != _NOTMATCH) 403 ; 404 else if ((ret = get_max(ei, buf)) != _NOTMATCH) 405 ; 406 else if ((ret = get_flags(ei, buf)) != _NOTMATCH) 407 ; 408 else 409 ret = _PARSEFAIL; 410 if (ret == _PARSEFAIL) 411 goto parsefail; 412 v = e; 413 414 } 415 416 return (0); 417 418 parsefail: 419 free(ei->recommend[0]); 420 free(ei->recommend[1]); 421 free(ei->recommend[2]); 422 free(ei->recommend[3]); 423 424 return (EFTYPE); 425 } 426 427 static __inline void 428 /*ARGSUSED*/ 429 _citrus_ISO2022_init_state(_ISO2022EncodingInfo * __restrict ei, 430 _ISO2022State * __restrict s) 431 { 432 int i; 433 434 memset(s, 0, sizeof(*s)); 435 s->gl = 0; 436 s->gr = (ei->flags & F_8BIT) ? 1 : -1; 437 438 for (i = 0; i < 4; i++) 439 if (ei->initg[i].final) { 440 s->g[i].type = ei->initg[i].type; 441 s->g[i].final = ei->initg[i].final; 442 s->g[i].interm = ei->initg[i].interm; 443 } 444 s->singlegl = s->singlegr = -1; 445 s->flags |= _ISO2022STATE_FLAG_INITIALIZED; 446 } 447 448 #if 0 449 static __inline void 450 /*ARGSUSED*/ 451 _citrus_ISO2022_pack_state(_ISO2022EncodingInfo * __restrict ei __unused, 452 void * __restrict pspriv, const _ISO2022State * __restrict s) 453 { 454 455 memcpy(pspriv, (const void *)s, sizeof(*s)); 456 } 457 458 static __inline void 459 /*ARGSUSED*/ 460 _citrus_ISO2022_unpack_state(_ISO2022EncodingInfo * __restrict ei __unused, 461 _ISO2022State * __restrict s, const void * __restrict pspriv) 462 { 463 464 memcpy((void *)s, pspriv, sizeof(*s)); 465 } 466 #endif 467 468 static int 469 /*ARGSUSED*/ 470 _citrus_ISO2022_encoding_module_init(_ISO2022EncodingInfo * __restrict ei, 471 const void * __restrict var, size_t lenvar) 472 { 473 474 return (_citrus_ISO2022_parse_variable(ei, var, lenvar)); 475 } 476 477 static void 478 /*ARGSUSED*/ 479 _citrus_ISO2022_encoding_module_uninit(_ISO2022EncodingInfo *ei __unused) 480 { 481 482 } 483 484 #define ESC '\033' 485 #define ECMA -1 486 #define INTERM -2 487 #define OECMA -3 488 static const struct seqtable { 489 int type; 490 int csoff; 491 int finaloff; 492 int intermoff; 493 int versoff; 494 int len; 495 int chars[10]; 496 } seqtable[] = { 497 /* G0 94MULTI special */ 498 { CS94MULTI, -1, 2, -1, -1, 3, { ESC, '$', OECMA }, }, 499 /* G0 94MULTI special with version identification */ 500 { CS94MULTI, -1, 5, -1, 2, 6, { ESC, '&', ECMA, ESC, '$', OECMA }, }, 501 /* G? 94 */ 502 { CS94, 1, 2, -1, -1, 3, { ESC, CS94, ECMA, }, }, 503 /* G? 94 with 2nd intermediate char */ 504 { CS94, 1, 3, 2, -1, 4, { ESC, CS94, INTERM, ECMA, }, }, 505 /* G? 96 */ 506 { CS96, 1, 2, -1, -1, 3, { ESC, CS96, ECMA, }, }, 507 /* G? 96 with 2nd intermediate char */ 508 { CS96, 1, 3, 2, -1, 4, { ESC, CS96, INTERM, ECMA, }, }, 509 /* G? 94MULTI */ 510 { CS94MULTI, 2, 3, -1, -1, 4, { ESC, '$', CS94, ECMA, }, }, 511 /* G? 96MULTI */ 512 { CS96MULTI, 2, 3, -1, -1, 4, { ESC, '$', CS96, ECMA, }, }, 513 /* G? 94MULTI with version specification */ 514 { CS94MULTI, 5, 6, -1, 2, 7, { ESC, '&', ECMA, ESC, '$', CS94, ECMA, }, }, 515 /* LS2/3 */ 516 { -1, -1, -1, -1, -1, 2, { ESC, 'n', }, }, 517 { -1, -1, -1, -1, -1, 2, { ESC, 'o', }, }, 518 /* LS1/2/3R */ 519 { -1, -1, -1, -1, -1, 2, { ESC, '~', }, }, 520 { -1, -1, -1, -1, -1, 2, { ESC, /*{*/ '}', }, }, 521 { -1, -1, -1, -1, -1, 2, { ESC, '|', }, }, 522 /* SS2/3 */ 523 { -1, -1, -1, -1, -1, 2, { ESC, 'N', }, }, 524 { -1, -1, -1, -1, -1, 2, { ESC, 'O', }, }, 525 /* end of records */ 526 // { 0, } 527 { 0, 0, 0, 0, 0, 0, { ESC, 0, }, } 528 }; 529 530 static int 531 seqmatch(const char * __restrict s, size_t n, 532 const struct seqtable * __restrict sp) 533 { 534 const int *p; 535 536 p = sp->chars; 537 while ((size_t)(p - sp->chars) < n && p - sp->chars < sp->len) { 538 switch (*p) { 539 case ECMA: 540 if (!isecma(*s)) 541 goto terminate; 542 break; 543 case OECMA: 544 if (*s && strchr("@AB", *s)) 545 break; 546 else 547 goto terminate; 548 case INTERM: 549 if (!isinterm(*s)) 550 goto terminate; 551 break; 552 case CS94: 553 if (*s && strchr("()*+", *s)) 554 break; 555 else 556 goto terminate; 557 case CS96: 558 if (*s && strchr(",-./", *s)) 559 break; 560 else 561 goto terminate; 562 default: 563 if (*s != *p) 564 goto terminate; 565 break; 566 } 567 568 p++; 569 s++; 570 } 571 572 terminate: 573 return (p - sp->chars); 574 } 575 576 static wchar_t 577 _ISO2022_sgetwchar(_ISO2022EncodingInfo * __restrict ei __unused, 578 char * __restrict string, size_t n, char ** __restrict result, 579 _ISO2022State * __restrict psenc) 580 { 581 const struct seqtable *sp; 582 wchar_t wchar = 0; 583 int i, cur, nmatch; 584 585 while (1) { 586 /* SI/SO */ 587 if (1 <= n && string[0] == '\017') { 588 psenc->gl = 0; 589 string++; 590 n--; 591 continue; 592 } 593 if (1 <= n && string[0] == '\016') { 594 psenc->gl = 1; 595 string++; 596 n--; 597 continue; 598 } 599 600 /* SS2/3R */ 601 if (1 <= n && string[0] && strchr("\217\216", string[0])) { 602 psenc->singlegl = psenc->singlegr = 603 (string[0] - '\216') + 2; 604 string++; 605 n--; 606 continue; 607 } 608 609 /* eat the letter if this is not ESC */ 610 if (1 <= n && string[0] != '\033') 611 break; 612 613 /* look for a perfect match from escape sequences */ 614 for (sp = &seqtable[0]; sp->len; sp++) { 615 nmatch = seqmatch(string, n, sp); 616 if (sp->len == nmatch && n >= (size_t)(sp->len)) 617 break; 618 } 619 620 if (!sp->len) 621 goto notseq; 622 623 if (sp->type != -1) { 624 if (sp->csoff == -1) 625 i = 0; 626 else { 627 switch (sp->type) { 628 case CS94: 629 case CS94MULTI: 630 i = string[sp->csoff] - '('; 631 break; 632 case CS96: 633 case CS96MULTI: 634 i = string[sp->csoff] - ','; 635 break; 636 default: 637 return (_ISO2022INVALID); 638 } 639 } 640 psenc->g[i].type = sp->type; 641 psenc->g[i].final = '\0'; 642 psenc->g[i].interm = '\0'; 643 psenc->g[i].vers = '\0'; 644 /* sp->finaloff must not be -1 */ 645 if (sp->finaloff != -1) 646 psenc->g[i].final = string[sp->finaloff]; 647 if (sp->intermoff != -1) 648 psenc->g[i].interm = string[sp->intermoff]; 649 if (sp->versoff != -1) 650 psenc->g[i].vers = string[sp->versoff]; 651 652 string += sp->len; 653 n -= sp->len; 654 continue; 655 } 656 657 /* LS2/3 */ 658 if (2 <= n && string[0] == '\033' && 659 string[1] && strchr("no", string[1])) { 660 psenc->gl = string[1] - 'n' + 2; 661 string += 2; 662 n -= 2; 663 continue; 664 } 665 666 /* LS1/2/3R */ 667 /* XXX: { for vi showmatch */ 668 if (2 <= n && string[0] == '\033' && 669 string[1] && strchr("~}|", string[1])) { 670 psenc->gr = 3 - (string[1] - '|'); 671 string += 2; 672 n -= 2; 673 continue; 674 } 675 676 /* SS2/3 */ 677 if (2 <= n && string[0] == '\033' && string[1] && 678 strchr("NO", string[1])) { 679 psenc->singlegl = (string[1] - 'N') + 2; 680 string += 2; 681 n -= 2; 682 continue; 683 } 684 685 notseq: 686 /* 687 * if we've got an unknown escape sequence, eat the ESC at the 688 * head. otherwise, wait till full escape sequence comes. 689 */ 690 for (sp = &seqtable[0]; sp->len; sp++) { 691 nmatch = seqmatch(string, n, sp); 692 if (!nmatch) 693 continue; 694 695 /* 696 * if we are in the middle of escape sequence, 697 * we still need to wait for more characters to come 698 */ 699 if (n < (size_t)(sp->len)) { 700 if ((size_t)(nmatch) == n) { 701 if (result) 702 *result = string; 703 return (_ISO2022INVALID); 704 } 705 } else { 706 if (nmatch == sp->len) { 707 /* this case should not happen */ 708 goto eat; 709 } 710 } 711 } 712 713 break; 714 } 715 716 eat: 717 /* no letter to eat */ 718 if (n < 1) { 719 if (result) 720 *result = string; 721 return (_ISO2022INVALID); 722 } 723 724 /* normal chars. always eat C0/C1 as is. */ 725 if (iscntl(*string & 0xff)) 726 cur = -1; 727 else if (*string & 0x80) 728 cur = (psenc->singlegr == -1) ? psenc->gr : psenc->singlegr; 729 else 730 cur = (psenc->singlegl == -1) ? psenc->gl : psenc->singlegl; 731 732 if (cur == -1) { 733 asis: 734 wchar = *string++ & 0xff; 735 if (result) 736 *result = string; 737 /* reset single shift state */ 738 psenc->singlegr = psenc->singlegl = -1; 739 return (wchar); 740 } 741 742 /* length error check */ 743 switch (psenc->g[cur].type) { 744 case CS94MULTI: 745 case CS96MULTI: 746 if (!isthree(psenc->g[cur].final)) { 747 if (2 <= n && 748 (string[0] & 0x80) == (string[1] & 0x80)) 749 break; 750 } else { 751 if (3 <= n && 752 (string[0] & 0x80) == (string[1] & 0x80) && 753 (string[0] & 0x80) == (string[2] & 0x80)) 754 break; 755 } 756 757 /* we still need to wait for more characters to come */ 758 if (result) 759 *result = string; 760 return (_ISO2022INVALID); 761 762 case CS94: 763 case CS96: 764 if (1 <= n) 765 break; 766 767 /* we still need to wait for more characters to come */ 768 if (result) 769 *result = string; 770 return (_ISO2022INVALID); 771 } 772 773 /* range check */ 774 switch (psenc->g[cur].type) { 775 case CS94: 776 if (!(is94(string[0] & 0x7f))) 777 goto asis; 778 break; 779 case CS96: 780 if (!(is96(string[0] & 0x7f))) 781 goto asis; 782 break; 783 case CS94MULTI: 784 if (!(is94(string[0] & 0x7f) && is94(string[1] & 0x7f))) 785 goto asis; 786 break; 787 case CS96MULTI: 788 if (!(is96(string[0] & 0x7f) && is96(string[1] & 0x7f))) 789 goto asis; 790 break; 791 } 792 793 /* extract the character. */ 794 switch (psenc->g[cur].type) { 795 case CS94: 796 /* special case for ASCII. */ 797 if (psenc->g[cur].final == 'B' && !psenc->g[cur].interm) { 798 wchar = *string++; 799 wchar &= 0x7f; 800 break; 801 } 802 wchar = psenc->g[cur].final; 803 wchar = (wchar << 8); 804 wchar |= (psenc->g[cur].interm ? (0x80 | psenc->g[cur].interm) : 0); 805 wchar = (wchar << 8); 806 wchar = (wchar << 8) | (*string++ & 0x7f); 807 break; 808 case CS96: 809 /* special case for ISO-8859-1. */ 810 if (psenc->g[cur].final == 'A' && !psenc->g[cur].interm) { 811 wchar = *string++; 812 wchar &= 0x7f; 813 wchar |= 0x80; 814 break; 815 } 816 wchar = psenc->g[cur].final; 817 wchar = (wchar << 8); 818 wchar |= (psenc->g[cur].interm ? (0x80 | psenc->g[cur].interm) : 0); 819 wchar = (wchar << 8); 820 wchar = (wchar << 8) | (*string++ & 0x7f); 821 wchar |= 0x80; 822 break; 823 case CS94MULTI: 824 case CS96MULTI: 825 wchar = psenc->g[cur].final; 826 wchar = (wchar << 8); 827 if (isthree(psenc->g[cur].final)) 828 wchar |= (*string++ & 0x7f); 829 wchar = (wchar << 8) | (*string++ & 0x7f); 830 wchar = (wchar << 8) | (*string++ & 0x7f); 831 if (psenc->g[cur].type == CS96MULTI) 832 wchar |= 0x80; 833 break; 834 } 835 836 if (result) 837 *result = string; 838 /* reset single shift state */ 839 psenc->singlegr = psenc->singlegl = -1; 840 return (wchar); 841 } 842 843 844 845 static int 846 _citrus_ISO2022_mbrtowc_priv(_ISO2022EncodingInfo * __restrict ei, 847 wchar_t * __restrict pwc, char ** __restrict s, 848 size_t n, _ISO2022State * __restrict psenc, size_t * __restrict nresult) 849 { 850 char *p, *result, *s0; 851 wchar_t wchar; 852 int c, chlenbak; 853 854 if (*s == NULL) { 855 _citrus_ISO2022_init_state(ei, psenc); 856 *nresult = _ENCODING_IS_STATE_DEPENDENT; 857 return (0); 858 } 859 s0 = *s; 860 c = 0; 861 chlenbak = psenc->chlen; 862 863 /* 864 * if we have something in buffer, use that. 865 * otherwise, skip here 866 */ 867 if (psenc->chlen > sizeof(psenc->ch)) { 868 /* illgeal state */ 869 _citrus_ISO2022_init_state(ei, psenc); 870 goto encoding_error; 871 } 872 if (psenc->chlen == 0) 873 goto emptybuf; 874 875 /* buffer is not empty */ 876 p = psenc->ch; 877 while (psenc->chlen < sizeof(psenc->ch)) { 878 if (n > 0) { 879 psenc->ch[psenc->chlen++] = *s0++; 880 n--; 881 } 882 883 wchar = _ISO2022_sgetwchar(ei, p, psenc->chlen - (p-psenc->ch), 884 &result, psenc); 885 c += result - p; 886 if (wchar != _ISO2022INVALID) { 887 if (psenc->chlen > (size_t)c) 888 memmove(psenc->ch, result, psenc->chlen - c); 889 if (psenc->chlen < (size_t)c) 890 psenc->chlen = 0; 891 else 892 psenc->chlen -= c; 893 goto output; 894 } 895 896 if (n == 0) { 897 if ((size_t)(result - p) == psenc->chlen) 898 /* complete shift sequence. */ 899 psenc->chlen = 0; 900 goto restart; 901 } 902 903 p = result; 904 } 905 906 /* escape sequence too long? */ 907 goto encoding_error; 908 909 emptybuf: 910 wchar = _ISO2022_sgetwchar(ei, s0, n, &result, psenc); 911 if (wchar != _ISO2022INVALID) { 912 c += result - s0; 913 psenc->chlen = 0; 914 s0 = result; 915 goto output; 916 } 917 if (result > s0) { 918 c += (result - s0); 919 n -= (result - s0); 920 s0 = result; 921 if (n > 0) 922 goto emptybuf; 923 /* complete shift sequence. */ 924 goto restart; 925 } 926 n += c; 927 if (n < sizeof(psenc->ch)) { 928 memcpy(psenc->ch, s0 - c, n); 929 psenc->chlen = n; 930 s0 = result; 931 goto restart; 932 } 933 934 /* escape sequence too long? */ 935 936 encoding_error: 937 psenc->chlen = 0; 938 *nresult = (size_t)-1; 939 return (EILSEQ); 940 941 output: 942 *s = s0; 943 if (pwc) 944 *pwc = wchar; 945 *nresult = wchar ? c - chlenbak : 0; 946 return (0); 947 948 restart: 949 *s = s0; 950 *nresult = (size_t)-2; 951 952 return (0); 953 } 954 955 static int 956 recommendation(_ISO2022EncodingInfo * __restrict ei, 957 _ISO2022Charset * __restrict cs) 958 { 959 _ISO2022Charset *recommend; 960 size_t j; 961 int i; 962 963 /* first, try a exact match. */ 964 for (i = 0; i < 4; i++) { 965 recommend = ei->recommend[i]; 966 for (j = 0; j < ei->recommendsize[i]; j++) { 967 if (cs->type != recommend[j].type) 968 continue; 969 if (cs->final != recommend[j].final) 970 continue; 971 if (cs->interm != recommend[j].interm) 972 continue; 973 974 return (i); 975 } 976 } 977 978 /* then, try a wildcard match over final char. */ 979 for (i = 0; i < 4; i++) { 980 recommend = ei->recommend[i]; 981 for (j = 0; j < ei->recommendsize[i]; j++) { 982 if (cs->type != recommend[j].type) 983 continue; 984 if (cs->final && (cs->final != recommend[j].final)) 985 continue; 986 if (cs->interm && (cs->interm != recommend[j].interm)) 987 continue; 988 989 return (i); 990 } 991 } 992 993 /* there's no recommendation. make a guess. */ 994 if (ei->maxcharset == 0) { 995 return (0); 996 } else { 997 switch (cs->type) { 998 case CS94: 999 case CS94MULTI: 1000 return (0); 1001 case CS96: 1002 case CS96MULTI: 1003 return (1); 1004 } 1005 } 1006 return (0); 1007 } 1008 1009 static int 1010 _ISO2022_sputwchar(_ISO2022EncodingInfo * __restrict ei, wchar_t wc, 1011 char * __restrict string, size_t n, char ** __restrict result, 1012 _ISO2022State * __restrict psenc, size_t * __restrict nresult) 1013 { 1014 _ISO2022Charset cs; 1015 char *p; 1016 char tmp[MB_LEN_MAX]; 1017 size_t len; 1018 int bit8, i = 0, target; 1019 unsigned char mask; 1020 1021 if (isc0(wc & 0xff)) { 1022 /* go back to INIT0 or ASCII on control chars */ 1023 cs = ei->initg[0].final ? ei->initg[0] : ascii; 1024 } else if (isc1(wc & 0xff)) { 1025 /* go back to INIT1 or ISO-8859-1 on control chars */ 1026 cs = ei->initg[1].final ? ei->initg[1] : iso88591; 1027 } else if (!(wc & ~0xff)) { 1028 if (wc & 0x80) { 1029 /* special treatment for ISO-8859-1 */ 1030 cs = iso88591; 1031 } else { 1032 /* special treatment for ASCII */ 1033 cs = ascii; 1034 } 1035 } else { 1036 cs.final = (wc >> 24) & 0x7f; 1037 if ((wc >> 16) & 0x80) 1038 cs.interm = (wc >> 16) & 0x7f; 1039 else 1040 cs.interm = '\0'; 1041 if (wc & 0x80) 1042 cs.type = (wc & 0x00007f00) ? CS96MULTI : CS96; 1043 else 1044 cs.type = (wc & 0x00007f00) ? CS94MULTI : CS94; 1045 } 1046 target = recommendation(ei, &cs); 1047 p = tmp; 1048 bit8 = ei->flags & F_8BIT; 1049 1050 /* designate the charset onto the target plane(G0/1/2/3). */ 1051 if (psenc->g[target].type == cs.type && 1052 psenc->g[target].final == cs.final && 1053 psenc->g[target].interm == cs.interm) 1054 goto planeok; 1055 1056 *p++ = '\033'; 1057 if (cs.type == CS94MULTI || cs.type == CS96MULTI) 1058 *p++ = '$'; 1059 if (target == 0 && cs.type == CS94MULTI && strchr("@AB", cs.final) && 1060 !cs.interm && !(ei->flags & F_NOOLD)) 1061 ; 1062 else if (cs.type == CS94 || cs.type == CS94MULTI) 1063 *p++ = "()*+"[target]; 1064 else 1065 *p++ = ",-./"[target]; 1066 if (cs.interm) 1067 *p++ = cs.interm; 1068 *p++ = cs.final; 1069 1070 psenc->g[target].type = cs.type; 1071 psenc->g[target].final = cs.final; 1072 psenc->g[target].interm = cs.interm; 1073 1074 planeok: 1075 /* invoke the plane onto GL or GR. */ 1076 if (psenc->gl == target) 1077 goto sideok; 1078 if (bit8 && psenc->gr == target) 1079 goto sideok; 1080 1081 if (target == 0 && (ei->flags & F_LS0)) { 1082 *p++ = '\017'; 1083 psenc->gl = 0; 1084 } else if (target == 1 && (ei->flags & F_LS1)) { 1085 *p++ = '\016'; 1086 psenc->gl = 1; 1087 } else if (target == 2 && (ei->flags & F_LS2)) { 1088 *p++ = '\033'; 1089 *p++ = 'n'; 1090 psenc->gl = 2; 1091 } else if (target == 3 && (ei->flags & F_LS3)) { 1092 *p++ = '\033'; 1093 *p++ = 'o'; 1094 psenc->gl = 3; 1095 } else if (bit8 && target == 1 && (ei->flags & F_LS1R)) { 1096 *p++ = '\033'; 1097 *p++ = '~'; 1098 psenc->gr = 1; 1099 } else if (bit8 && target == 2 && (ei->flags & F_LS2R)) { 1100 *p++ = '\033'; 1101 /*{*/ 1102 *p++ = '}'; 1103 psenc->gr = 2; 1104 } else if (bit8 && target == 3 && (ei->flags & F_LS3R)) { 1105 *p++ = '\033'; 1106 *p++ = '|'; 1107 psenc->gr = 3; 1108 } else if (target == 2 && (ei->flags & F_SS2)) { 1109 *p++ = '\033'; 1110 *p++ = 'N'; 1111 psenc->singlegl = 2; 1112 } else if (target == 3 && (ei->flags & F_SS3)) { 1113 *p++ = '\033'; 1114 *p++ = 'O'; 1115 psenc->singlegl = 3; 1116 } else if (bit8 && target == 2 && (ei->flags & F_SS2R)) { 1117 *p++ = '\216'; 1118 *p++ = 'N'; 1119 psenc->singlegl = psenc->singlegr = 2; 1120 } else if (bit8 && target == 3 && (ei->flags & F_SS3R)) { 1121 *p++ = '\217'; 1122 *p++ = 'O'; 1123 psenc->singlegl = psenc->singlegr = 3; 1124 } else 1125 goto ilseq; 1126 1127 sideok: 1128 if (psenc->singlegl == target) 1129 mask = 0x00; 1130 else if (psenc->singlegr == target) 1131 mask = 0x80; 1132 else if (psenc->gl == target) 1133 mask = 0x00; 1134 else if ((ei->flags & F_8BIT) && psenc->gr == target) 1135 mask = 0x80; 1136 else 1137 goto ilseq; 1138 1139 switch (cs.type) { 1140 case CS94: 1141 case CS96: 1142 i = 1; 1143 break; 1144 case CS94MULTI: 1145 case CS96MULTI: 1146 i = !iscntl(wc & 0xff) ? 1147 (isthree(cs.final) ? 3 : 2) : 1; 1148 break; 1149 } 1150 while (i-- > 0) 1151 *p++ = ((wc >> (i << 3)) & 0x7f) | mask; 1152 1153 /* reset single shift state */ 1154 psenc->singlegl = psenc->singlegr = -1; 1155 1156 len = (size_t)(p - tmp); 1157 if (n < len) { 1158 if (result) 1159 *result = (char *)0; 1160 *nresult = (size_t)-1; 1161 return (E2BIG); 1162 } 1163 if (result) 1164 *result = string + len; 1165 memcpy(string, tmp, len); 1166 *nresult = len; 1167 1168 return (0); 1169 1170 ilseq: 1171 *nresult = (size_t)-1; 1172 return (EILSEQ); 1173 } 1174 1175 static int 1176 _citrus_ISO2022_put_state_reset(_ISO2022EncodingInfo * __restrict ei, 1177 char * __restrict s, size_t n, _ISO2022State * __restrict psenc, 1178 size_t * __restrict nresult) 1179 { 1180 char *result; 1181 char buf[MB_LEN_MAX]; 1182 size_t len; 1183 int ret; 1184 1185 /* XXX state will be modified after this operation... */ 1186 ret = _ISO2022_sputwchar(ei, L'\0', buf, sizeof(buf), &result, psenc, 1187 &len); 1188 if (ret) { 1189 *nresult = len; 1190 return (ret); 1191 } 1192 1193 if (sizeof(buf) < len || n < len-1) { 1194 /* XXX should recover state? */ 1195 *nresult = (size_t)-1; 1196 return (E2BIG); 1197 } 1198 1199 memcpy(s, buf, len - 1); 1200 *nresult = len - 1; 1201 return (0); 1202 } 1203 1204 static int 1205 _citrus_ISO2022_wcrtomb_priv(_ISO2022EncodingInfo * __restrict ei, 1206 char * __restrict s, size_t n, wchar_t wc, 1207 _ISO2022State * __restrict psenc, size_t * __restrict nresult) 1208 { 1209 char *result; 1210 char buf[MB_LEN_MAX]; 1211 size_t len; 1212 int ret; 1213 1214 /* XXX state will be modified after this operation... */ 1215 ret = _ISO2022_sputwchar(ei, wc, buf, sizeof(buf), &result, psenc, 1216 &len); 1217 if (ret) { 1218 *nresult = len; 1219 return (ret); 1220 } 1221 1222 if (sizeof(buf) < len || n < len) { 1223 /* XXX should recover state? */ 1224 *nresult = (size_t)-1; 1225 return (E2BIG); 1226 } 1227 1228 memcpy(s, buf, len); 1229 *nresult = len; 1230 return (0); 1231 } 1232 1233 static __inline int 1234 /*ARGSUSED*/ 1235 _citrus_ISO2022_stdenc_wctocs(_ISO2022EncodingInfo * __restrict ei __unused, 1236 _csid_t * __restrict csid, _index_t * __restrict idx, wchar_t wc) 1237 { 1238 wchar_t m, nm; 1239 1240 m = wc & 0x7FFF8080; 1241 nm = wc & 0x007F7F7F; 1242 if (m & 0x00800000) 1243 nm &= 0x00007F7F; 1244 else 1245 m &= 0x7F008080; 1246 if (nm & 0x007F0000) { 1247 /* ^3 mark */ 1248 m |= 0x007F0000; 1249 } else if (nm & 0x00007F00) { 1250 /* ^2 mark */ 1251 m |= 0x00007F00; 1252 } 1253 *csid = (_csid_t)m; 1254 *idx = (_index_t)nm; 1255 1256 return (0); 1257 } 1258 1259 static __inline int 1260 /*ARGSUSED*/ 1261 _citrus_ISO2022_stdenc_cstowc(_ISO2022EncodingInfo * __restrict ei __unused, 1262 wchar_t * __restrict wc, _csid_t csid, _index_t idx) 1263 { 1264 1265 *wc = (wchar_t)(csid & 0x7F808080) | (wchar_t)idx; 1266 1267 return (0); 1268 } 1269 1270 static __inline int 1271 /*ARGSUSED*/ 1272 _citrus_ISO2022_stdenc_get_state_desc_generic(_ISO2022EncodingInfo * __restrict ei __unused, 1273 _ISO2022State * __restrict psenc, int * __restrict rstate) 1274 { 1275 1276 if (psenc->chlen == 0) { 1277 /* XXX: it should distinguish initial and stable. */ 1278 *rstate = _STDENC_SDGEN_STABLE; 1279 } else 1280 *rstate = (psenc->ch[0] == '\033') ? 1281 _STDENC_SDGEN_INCOMPLETE_SHIFT : 1282 _STDENC_SDGEN_INCOMPLETE_CHAR; 1283 return (0); 1284 } 1285 1286 /* ---------------------------------------------------------------------- 1287 * public interface for stdenc 1288 */ 1289 1290 _CITRUS_STDENC_DECLS(ISO2022); 1291 _CITRUS_STDENC_DEF_OPS(ISO2022); 1292 1293 #include "citrus_stdenc_template.h" 1294