1 /* $FreeBSD$ */ 2 /* $NetBSD: citrus_iso2022.c,v 1.20 2010/12/07 22:01:45 joerg Exp $ */ 3 4 /*- 5 * SPDX-License-Identifier: BSD-2-Clause 6 * 7 * Copyright (c)1999, 2002 Citrus Project, 8 * All rights reserved. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 22 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 29 * SUCH DAMAGE. 30 * 31 * $Citrus: xpg4dl/FreeBSD/lib/libc/locale/iso2022.c,v 1.23 2001/06/21 01:51:44 yamt Exp $ 32 */ 33 34 #include <sys/cdefs.h> 35 #include <sys/types.h> 36 37 #include <assert.h> 38 #include <errno.h> 39 #include <limits.h> 40 #include <stdbool.h> 41 #include <stddef.h> 42 #include <stdio.h> 43 #include <stdlib.h> 44 #include <string.h> 45 #include <wchar.h> 46 47 #include "citrus_namespace.h" 48 #include "citrus_types.h" 49 #include "citrus_module.h" 50 #include "citrus_stdenc.h" 51 #include "citrus_iso2022.h" 52 53 54 /* ---------------------------------------------------------------------- 55 * private stuffs used by templates 56 */ 57 58 59 /* 60 * wchar_t mappings: 61 * ASCII (ESC ( B) 00000000 00000000 00000000 0xxxxxxx 62 * iso-8859-1 (ESC , A) 00000000 00000000 00000000 1xxxxxxx 63 * 94 charset (ESC ( F) 0fffffff 00000000 00000000 0xxxxxxx 64 * 94 charset (ESC ( M F) 0fffffff 1mmmmmmm 00000000 0xxxxxxx 65 * 96 charset (ESC , F) 0fffffff 00000000 00000000 1xxxxxxx 66 * 96 charset (ESC , M F) 0fffffff 1mmmmmmm 00000000 1xxxxxxx 67 * 94x94 charset (ESC $ ( F) 0fffffff 00000000 0xxxxxxx 0xxxxxxx 68 * 96x96 charset (ESC $ , F) 0fffffff 00000000 0xxxxxxx 1xxxxxxx 69 * 94x94 charset (ESC & V ESC $ ( F) 70 * 0fffffff 1vvvvvvv 0xxxxxxx 0xxxxxxx 71 * 94x94x94 charset (ESC $ ( F) 0fffffff 0xxxxxxx 0xxxxxxx 0xxxxxxx 72 * 96x96x96 charset (ESC $ , F) 0fffffff 0xxxxxxx 0xxxxxxx 1xxxxxxx 73 * reserved for UCS4 co-existence (UCS4 is 31bit encoding thanks to mohta bit) 74 * 1xxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx 75 */ 76 77 #define CS94 (0U) 78 #define CS96 (1U) 79 #define CS94MULTI (2U) 80 #define CS96MULTI (3U) 81 82 typedef struct { 83 unsigned char type; 84 unsigned char final; 85 unsigned char interm; 86 unsigned char vers; 87 } _ISO2022Charset; 88 89 static const _ISO2022Charset ascii = { CS94, 'B', '\0', '\0' }; 90 static const _ISO2022Charset iso88591 = { CS96, 'A', '\0', '\0' }; 91 92 typedef struct { 93 _ISO2022Charset g[4]; 94 /* need 3 bits to hold -1, 0, ..., 3 */ 95 int gl:3, 96 gr:3, 97 singlegl:3, 98 singlegr:3; 99 char ch[7]; /* longest escape sequence (ESC & V ESC $ ( F) */ 100 size_t chlen; 101 int flags; 102 #define _ISO2022STATE_FLAG_INITIALIZED 1 103 } _ISO2022State; 104 105 typedef struct { 106 _ISO2022Charset *recommend[4]; 107 size_t recommendsize[4]; 108 _ISO2022Charset initg[4]; 109 int maxcharset; 110 int flags; 111 #define F_8BIT 0x0001 112 #define F_NOOLD 0x0002 113 #define F_SI 0x0010 /*0F*/ 114 #define F_SO 0x0020 /*0E*/ 115 #define F_LS0 0x0010 /*0F*/ 116 #define F_LS1 0x0020 /*0E*/ 117 #define F_LS2 0x0040 /*ESC n*/ 118 #define F_LS3 0x0080 /*ESC o*/ 119 #define F_LS1R 0x0100 /*ESC ~*/ 120 #define F_LS2R 0x0200 /*ESC }*/ 121 #define F_LS3R 0x0400 /*ESC |*/ 122 #define F_SS2 0x0800 /*ESC N*/ 123 #define F_SS3 0x1000 /*ESC O*/ 124 #define F_SS2R 0x2000 /*8E*/ 125 #define F_SS3R 0x4000 /*8F*/ 126 } _ISO2022EncodingInfo; 127 128 #define _CEI_TO_EI(_cei_) (&(_cei_)->ei) 129 #define _CEI_TO_STATE(_cei_, _func_) (_cei_)->states.s_##_func_ 130 131 #define _FUNCNAME(m) _citrus_ISO2022_##m 132 #define _ENCODING_INFO _ISO2022EncodingInfo 133 #define _ENCODING_STATE _ISO2022State 134 #define _ENCODING_MB_CUR_MAX(_ei_) MB_LEN_MAX 135 #define _ENCODING_IS_STATE_DEPENDENT 1 136 #define _STATE_NEEDS_EXPLICIT_INIT(_ps_) \ 137 (!((_ps_)->flags & _ISO2022STATE_FLAG_INITIALIZED)) 138 139 140 #define _ISO2022INVALID (wchar_t)-1 141 142 static __inline bool isc0(__uint8_t x) 143 { 144 145 return ((x & 0x1f) == x); 146 } 147 148 static __inline bool isc1(__uint8_t x) 149 { 150 151 return (0x80 <= x && x <= 0x9f); 152 } 153 154 static __inline bool iscntl(__uint8_t x) 155 { 156 157 return (isc0(x) || isc1(x) || x == 0x7f); 158 } 159 160 static __inline bool is94(__uint8_t x) 161 { 162 163 return (0x21 <= x && x <= 0x7e); 164 } 165 166 static __inline bool is96(__uint8_t x) 167 { 168 169 return (0x20 <= x && x <= 0x7f); 170 } 171 172 static __inline bool isecma(__uint8_t x) 173 { 174 175 return (0x30 <= x && x <= 0x7f); 176 } 177 178 static __inline bool isinterm(__uint8_t x) 179 { 180 181 return (0x20 <= x && x <= 0x2f); 182 } 183 184 static __inline bool isthree(__uint8_t x) 185 { 186 187 return (0x60 <= x && x <= 0x6f); 188 } 189 190 static __inline int 191 getcs(const char * __restrict p, _ISO2022Charset * __restrict cs) 192 { 193 194 if (!strncmp(p, "94$", 3) && p[3] && !p[4]) { 195 cs->final = (unsigned char)(p[3] & 0xff); 196 cs->interm = '\0'; 197 cs->vers = '\0'; 198 cs->type = CS94MULTI; 199 } else if (!strncmp(p, "96$", 3) && p[3] && !p[4]) { 200 cs->final = (unsigned char)(p[3] & 0xff); 201 cs->interm = '\0'; 202 cs->vers = '\0'; 203 cs->type = CS96MULTI; 204 } else if (!strncmp(p, "94", 2) && p[2] && !p[3]) { 205 cs->final = (unsigned char)(p[2] & 0xff); 206 cs->interm = '\0'; 207 cs->vers = '\0'; 208 cs->type = CS94; 209 } else if (!strncmp(p, "96", 2) && p[2] && !p[3]) { 210 cs->final = (unsigned char )(p[2] & 0xff); 211 cs->interm = '\0'; 212 cs->vers = '\0'; 213 cs->type = CS96; 214 } else 215 return (1); 216 217 return (0); 218 } 219 220 221 #define _NOTMATCH 0 222 #define _MATCH 1 223 #define _PARSEFAIL 2 224 225 static __inline int 226 get_recommend(_ISO2022EncodingInfo * __restrict ei, 227 const char * __restrict token) 228 { 229 _ISO2022Charset cs, *p; 230 int i; 231 232 if (!strchr("0123", token[0]) || token[1] != '=') 233 return (_NOTMATCH); 234 235 if (getcs(&token[2], &cs) == 0) 236 ; 237 else if (!strcmp(&token[2], "94")) { 238 cs.final = (unsigned char)(token[4]); 239 cs.interm = '\0'; 240 cs.vers = '\0'; 241 cs.type = CS94; 242 } else if (!strcmp(&token[2], "96")) { 243 cs.final = (unsigned char)(token[4]); 244 cs.interm = '\0'; 245 cs.vers = '\0'; 246 cs.type = CS96; 247 } else if (!strcmp(&token[2], "94$")) { 248 cs.final = (unsigned char)(token[5]); 249 cs.interm = '\0'; 250 cs.vers = '\0'; 251 cs.type = CS94MULTI; 252 } else if (!strcmp(&token[2], "96$")) { 253 cs.final = (unsigned char)(token[5]); 254 cs.interm = '\0'; 255 cs.vers = '\0'; 256 cs.type = CS96MULTI; 257 } else 258 return (_PARSEFAIL); 259 260 i = token[0] - '0'; 261 if (!ei->recommend[i]) 262 ei->recommend[i] = malloc(sizeof(_ISO2022Charset)); 263 else { 264 p = reallocarray(ei->recommend[i], ei->recommendsize[i] + 1, 265 sizeof(_ISO2022Charset)); 266 if (!p) 267 return (_PARSEFAIL); 268 ei->recommend[i] = p; 269 } 270 if (!ei->recommend[i]) 271 return (_PARSEFAIL); 272 ei->recommendsize[i]++; 273 274 (ei->recommend[i] + (ei->recommendsize[i] - 1))->final = cs.final; 275 (ei->recommend[i] + (ei->recommendsize[i] - 1))->interm = cs.interm; 276 (ei->recommend[i] + (ei->recommendsize[i] - 1))->vers = cs.vers; 277 (ei->recommend[i] + (ei->recommendsize[i] - 1))->type = cs.type; 278 279 return (_MATCH); 280 } 281 282 static __inline int 283 get_initg(_ISO2022EncodingInfo * __restrict ei, 284 const char * __restrict token) 285 { 286 _ISO2022Charset cs; 287 288 if (strncmp("INIT", &token[0], 4) || 289 !strchr("0123", token[4]) || 290 token[5] != '=') 291 return (_NOTMATCH); 292 293 if (getcs(&token[6], &cs) != 0) 294 return (_PARSEFAIL); 295 296 ei->initg[token[4] - '0'].type = cs.type; 297 ei->initg[token[4] - '0'].final = cs.final; 298 ei->initg[token[4] - '0'].interm = cs.interm; 299 ei->initg[token[4] - '0'].vers = cs.vers; 300 301 return (_MATCH); 302 } 303 304 static __inline int 305 get_max(_ISO2022EncodingInfo * __restrict ei, 306 const char * __restrict token) 307 { 308 if (!strcmp(token, "MAX1")) 309 ei->maxcharset = 1; 310 else if (!strcmp(token, "MAX2")) 311 ei->maxcharset = 2; 312 else if (!strcmp(token, "MAX3")) 313 ei->maxcharset = 3; 314 else 315 return (_NOTMATCH); 316 317 return (_MATCH); 318 } 319 320 321 static __inline int 322 get_flags(_ISO2022EncodingInfo * __restrict ei, 323 const char * __restrict token) 324 { 325 static struct { 326 const char *tag; 327 int flag; 328 } const tags[] = { 329 { "DUMMY", 0 }, 330 { "8BIT", F_8BIT }, 331 { "NOOLD", F_NOOLD }, 332 { "SI", F_SI }, 333 { "SO", F_SO }, 334 { "LS0", F_LS0 }, 335 { "LS1", F_LS1 }, 336 { "LS2", F_LS2 }, 337 { "LS3", F_LS3 }, 338 { "LS1R", F_LS1R }, 339 { "LS2R", F_LS2R }, 340 { "LS3R", F_LS3R }, 341 { "SS2", F_SS2 }, 342 { "SS3", F_SS3 }, 343 { "SS2R", F_SS2R }, 344 { "SS3R", F_SS3R }, 345 { NULL, 0 } 346 }; 347 int i; 348 349 for (i = 0; tags[i].tag; i++) 350 if (!strcmp(token, tags[i].tag)) { 351 ei->flags |= tags[i].flag; 352 return (_MATCH); 353 } 354 355 return (_NOTMATCH); 356 } 357 358 359 static __inline int 360 _citrus_ISO2022_parse_variable(_ISO2022EncodingInfo * __restrict ei, 361 const void * __restrict var, size_t lenvar __unused) 362 { 363 char const *e, *v; 364 char buf[20]; 365 size_t len; 366 int i, ret; 367 368 /* 369 * parse VARIABLE section. 370 */ 371 372 if (!var) 373 return (EFTYPE); 374 375 v = (const char *) var; 376 377 /* initialize structure */ 378 ei->maxcharset = 0; 379 for (i = 0; i < 4; i++) { 380 ei->recommend[i] = NULL; 381 ei->recommendsize[i] = 0; 382 } 383 ei->flags = 0; 384 385 while (*v) { 386 while (*v == ' ' || *v == '\t') 387 ++v; 388 389 /* find the token */ 390 e = v; 391 while (*e && *e != ' ' && *e != '\t') 392 ++e; 393 394 len = e - v; 395 if (len == 0) 396 break; 397 if (len >= sizeof(buf)) 398 goto parsefail; 399 snprintf(buf, sizeof(buf), "%.*s", (int)len, v); 400 401 if ((ret = get_recommend(ei, buf)) != _NOTMATCH) 402 ; 403 else if ((ret = get_initg(ei, buf)) != _NOTMATCH) 404 ; 405 else if ((ret = get_max(ei, buf)) != _NOTMATCH) 406 ; 407 else if ((ret = get_flags(ei, buf)) != _NOTMATCH) 408 ; 409 else 410 ret = _PARSEFAIL; 411 if (ret == _PARSEFAIL) 412 goto parsefail; 413 v = e; 414 415 } 416 417 return (0); 418 419 parsefail: 420 free(ei->recommend[0]); 421 free(ei->recommend[1]); 422 free(ei->recommend[2]); 423 free(ei->recommend[3]); 424 425 return (EFTYPE); 426 } 427 428 static __inline void 429 /*ARGSUSED*/ 430 _citrus_ISO2022_init_state(_ISO2022EncodingInfo * __restrict ei, 431 _ISO2022State * __restrict s) 432 { 433 int i; 434 435 memset(s, 0, sizeof(*s)); 436 s->gl = 0; 437 s->gr = (ei->flags & F_8BIT) ? 1 : -1; 438 439 for (i = 0; i < 4; i++) 440 if (ei->initg[i].final) { 441 s->g[i].type = ei->initg[i].type; 442 s->g[i].final = ei->initg[i].final; 443 s->g[i].interm = ei->initg[i].interm; 444 } 445 s->singlegl = s->singlegr = -1; 446 s->flags |= _ISO2022STATE_FLAG_INITIALIZED; 447 } 448 449 #if 0 450 static __inline void 451 /*ARGSUSED*/ 452 _citrus_ISO2022_pack_state(_ISO2022EncodingInfo * __restrict ei __unused, 453 void * __restrict pspriv, const _ISO2022State * __restrict s) 454 { 455 456 memcpy(pspriv, (const void *)s, sizeof(*s)); 457 } 458 459 static __inline void 460 /*ARGSUSED*/ 461 _citrus_ISO2022_unpack_state(_ISO2022EncodingInfo * __restrict ei __unused, 462 _ISO2022State * __restrict s, const void * __restrict pspriv) 463 { 464 465 memcpy((void *)s, pspriv, sizeof(*s)); 466 } 467 #endif 468 469 static int 470 /*ARGSUSED*/ 471 _citrus_ISO2022_encoding_module_init(_ISO2022EncodingInfo * __restrict ei, 472 const void * __restrict var, size_t lenvar) 473 { 474 475 return (_citrus_ISO2022_parse_variable(ei, var, lenvar)); 476 } 477 478 static void 479 /*ARGSUSED*/ 480 _citrus_ISO2022_encoding_module_uninit(_ISO2022EncodingInfo *ei __unused) 481 { 482 483 } 484 485 #define ESC '\033' 486 #define ECMA -1 487 #define INTERM -2 488 #define OECMA -3 489 static const struct seqtable { 490 int type; 491 int csoff; 492 int finaloff; 493 int intermoff; 494 int versoff; 495 int len; 496 int chars[10]; 497 } seqtable[] = { 498 /* G0 94MULTI special */ 499 { CS94MULTI, -1, 2, -1, -1, 3, { ESC, '$', OECMA }, }, 500 /* G0 94MULTI special with version identification */ 501 { CS94MULTI, -1, 5, -1, 2, 6, { ESC, '&', ECMA, ESC, '$', OECMA }, }, 502 /* G? 94 */ 503 { CS94, 1, 2, -1, -1, 3, { ESC, CS94, ECMA, }, }, 504 /* G? 94 with 2nd intermediate char */ 505 { CS94, 1, 3, 2, -1, 4, { ESC, CS94, INTERM, ECMA, }, }, 506 /* G? 96 */ 507 { CS96, 1, 2, -1, -1, 3, { ESC, CS96, ECMA, }, }, 508 /* G? 96 with 2nd intermediate char */ 509 { CS96, 1, 3, 2, -1, 4, { ESC, CS96, INTERM, ECMA, }, }, 510 /* G? 94MULTI */ 511 { CS94MULTI, 2, 3, -1, -1, 4, { ESC, '$', CS94, ECMA, }, }, 512 /* G? 96MULTI */ 513 { CS96MULTI, 2, 3, -1, -1, 4, { ESC, '$', CS96, ECMA, }, }, 514 /* G? 94MULTI with version specification */ 515 { CS94MULTI, 5, 6, -1, 2, 7, { ESC, '&', ECMA, ESC, '$', CS94, ECMA, }, }, 516 /* LS2/3 */ 517 { -1, -1, -1, -1, -1, 2, { ESC, 'n', }, }, 518 { -1, -1, -1, -1, -1, 2, { ESC, 'o', }, }, 519 /* LS1/2/3R */ 520 { -1, -1, -1, -1, -1, 2, { ESC, '~', }, }, 521 { -1, -1, -1, -1, -1, 2, { ESC, /*{*/ '}', }, }, 522 { -1, -1, -1, -1, -1, 2, { ESC, '|', }, }, 523 /* SS2/3 */ 524 { -1, -1, -1, -1, -1, 2, { ESC, 'N', }, }, 525 { -1, -1, -1, -1, -1, 2, { ESC, 'O', }, }, 526 /* end of records */ 527 // { 0, } 528 { 0, 0, 0, 0, 0, 0, { ESC, 0, }, } 529 }; 530 531 static int 532 seqmatch(const char * __restrict s, size_t n, 533 const struct seqtable * __restrict sp) 534 { 535 const int *p; 536 537 p = sp->chars; 538 while ((size_t)(p - sp->chars) < n && p - sp->chars < sp->len) { 539 switch (*p) { 540 case ECMA: 541 if (!isecma(*s)) 542 goto terminate; 543 break; 544 case OECMA: 545 if (*s && strchr("@AB", *s)) 546 break; 547 else 548 goto terminate; 549 case INTERM: 550 if (!isinterm(*s)) 551 goto terminate; 552 break; 553 case CS94: 554 if (*s && strchr("()*+", *s)) 555 break; 556 else 557 goto terminate; 558 case CS96: 559 if (*s && strchr(",-./", *s)) 560 break; 561 else 562 goto terminate; 563 default: 564 if (*s != *p) 565 goto terminate; 566 break; 567 } 568 569 p++; 570 s++; 571 } 572 573 terminate: 574 return (p - sp->chars); 575 } 576 577 static wchar_t 578 _ISO2022_sgetwchar(_ISO2022EncodingInfo * __restrict ei __unused, 579 char * __restrict string, size_t n, char ** __restrict result, 580 _ISO2022State * __restrict psenc) 581 { 582 const struct seqtable *sp; 583 wchar_t wchar = 0; 584 int i, cur, nmatch; 585 586 while (1) { 587 /* SI/SO */ 588 if (1 <= n && string[0] == '\017') { 589 psenc->gl = 0; 590 string++; 591 n--; 592 continue; 593 } 594 if (1 <= n && string[0] == '\016') { 595 psenc->gl = 1; 596 string++; 597 n--; 598 continue; 599 } 600 601 /* SS2/3R */ 602 if (1 <= n && string[0] && strchr("\217\216", string[0])) { 603 psenc->singlegl = psenc->singlegr = 604 (string[0] - '\216') + 2; 605 string++; 606 n--; 607 continue; 608 } 609 610 /* eat the letter if this is not ESC */ 611 if (1 <= n && string[0] != '\033') 612 break; 613 614 /* look for a perfect match from escape sequences */ 615 for (sp = &seqtable[0]; sp->len; sp++) { 616 nmatch = seqmatch(string, n, sp); 617 if (sp->len == nmatch && n >= (size_t)(sp->len)) 618 break; 619 } 620 621 if (!sp->len) 622 goto notseq; 623 624 if (sp->type != -1) { 625 if (sp->csoff == -1) 626 i = 0; 627 else { 628 switch (sp->type) { 629 case CS94: 630 case CS94MULTI: 631 i = string[sp->csoff] - '('; 632 break; 633 case CS96: 634 case CS96MULTI: 635 i = string[sp->csoff] - ','; 636 break; 637 default: 638 return (_ISO2022INVALID); 639 } 640 } 641 psenc->g[i].type = sp->type; 642 psenc->g[i].final = '\0'; 643 psenc->g[i].interm = '\0'; 644 psenc->g[i].vers = '\0'; 645 /* sp->finaloff must not be -1 */ 646 if (sp->finaloff != -1) 647 psenc->g[i].final = string[sp->finaloff]; 648 if (sp->intermoff != -1) 649 psenc->g[i].interm = string[sp->intermoff]; 650 if (sp->versoff != -1) 651 psenc->g[i].vers = string[sp->versoff]; 652 653 string += sp->len; 654 n -= sp->len; 655 continue; 656 } 657 658 /* LS2/3 */ 659 if (2 <= n && string[0] == '\033' && 660 string[1] && strchr("no", string[1])) { 661 psenc->gl = string[1] - 'n' + 2; 662 string += 2; 663 n -= 2; 664 continue; 665 } 666 667 /* LS1/2/3R */ 668 /* XXX: { for vi showmatch */ 669 if (2 <= n && string[0] == '\033' && 670 string[1] && strchr("~}|", string[1])) { 671 psenc->gr = 3 - (string[1] - '|'); 672 string += 2; 673 n -= 2; 674 continue; 675 } 676 677 /* SS2/3 */ 678 if (2 <= n && string[0] == '\033' && string[1] && 679 strchr("NO", string[1])) { 680 psenc->singlegl = (string[1] - 'N') + 2; 681 string += 2; 682 n -= 2; 683 continue; 684 } 685 686 notseq: 687 /* 688 * if we've got an unknown escape sequence, eat the ESC at the 689 * head. otherwise, wait till full escape sequence comes. 690 */ 691 for (sp = &seqtable[0]; sp->len; sp++) { 692 nmatch = seqmatch(string, n, sp); 693 if (!nmatch) 694 continue; 695 696 /* 697 * if we are in the middle of escape sequence, 698 * we still need to wait for more characters to come 699 */ 700 if (n < (size_t)(sp->len)) { 701 if ((size_t)(nmatch) == n) { 702 if (result) 703 *result = string; 704 return (_ISO2022INVALID); 705 } 706 } else { 707 if (nmatch == sp->len) { 708 /* this case should not happen */ 709 goto eat; 710 } 711 } 712 } 713 714 break; 715 } 716 717 eat: 718 /* no letter to eat */ 719 if (n < 1) { 720 if (result) 721 *result = string; 722 return (_ISO2022INVALID); 723 } 724 725 /* normal chars. always eat C0/C1 as is. */ 726 if (iscntl(*string & 0xff)) 727 cur = -1; 728 else if (*string & 0x80) 729 cur = (psenc->singlegr == -1) ? psenc->gr : psenc->singlegr; 730 else 731 cur = (psenc->singlegl == -1) ? psenc->gl : psenc->singlegl; 732 733 if (cur == -1) { 734 asis: 735 wchar = *string++ & 0xff; 736 if (result) 737 *result = string; 738 /* reset single shift state */ 739 psenc->singlegr = psenc->singlegl = -1; 740 return (wchar); 741 } 742 743 /* length error check */ 744 switch (psenc->g[cur].type) { 745 case CS94MULTI: 746 case CS96MULTI: 747 if (!isthree(psenc->g[cur].final)) { 748 if (2 <= n && 749 (string[0] & 0x80) == (string[1] & 0x80)) 750 break; 751 } else { 752 if (3 <= n && 753 (string[0] & 0x80) == (string[1] & 0x80) && 754 (string[0] & 0x80) == (string[2] & 0x80)) 755 break; 756 } 757 758 /* we still need to wait for more characters to come */ 759 if (result) 760 *result = string; 761 return (_ISO2022INVALID); 762 763 case CS94: 764 case CS96: 765 if (1 <= n) 766 break; 767 768 /* we still need to wait for more characters to come */ 769 if (result) 770 *result = string; 771 return (_ISO2022INVALID); 772 } 773 774 /* range check */ 775 switch (psenc->g[cur].type) { 776 case CS94: 777 if (!(is94(string[0] & 0x7f))) 778 goto asis; 779 case CS96: 780 if (!(is96(string[0] & 0x7f))) 781 goto asis; 782 break; 783 case CS94MULTI: 784 if (!(is94(string[0] & 0x7f) && is94(string[1] & 0x7f))) 785 goto asis; 786 break; 787 case CS96MULTI: 788 if (!(is96(string[0] & 0x7f) && is96(string[1] & 0x7f))) 789 goto asis; 790 break; 791 } 792 793 /* extract the character. */ 794 switch (psenc->g[cur].type) { 795 case CS94: 796 /* special case for ASCII. */ 797 if (psenc->g[cur].final == 'B' && !psenc->g[cur].interm) { 798 wchar = *string++; 799 wchar &= 0x7f; 800 break; 801 } 802 wchar = psenc->g[cur].final; 803 wchar = (wchar << 8); 804 wchar |= (psenc->g[cur].interm ? (0x80 | psenc->g[cur].interm) : 0); 805 wchar = (wchar << 8); 806 wchar = (wchar << 8) | (*string++ & 0x7f); 807 break; 808 case CS96: 809 /* special case for ISO-8859-1. */ 810 if (psenc->g[cur].final == 'A' && !psenc->g[cur].interm) { 811 wchar = *string++; 812 wchar &= 0x7f; 813 wchar |= 0x80; 814 break; 815 } 816 wchar = psenc->g[cur].final; 817 wchar = (wchar << 8); 818 wchar |= (psenc->g[cur].interm ? (0x80 | psenc->g[cur].interm) : 0); 819 wchar = (wchar << 8); 820 wchar = (wchar << 8) | (*string++ & 0x7f); 821 wchar |= 0x80; 822 break; 823 case CS94MULTI: 824 case CS96MULTI: 825 wchar = psenc->g[cur].final; 826 wchar = (wchar << 8); 827 if (isthree(psenc->g[cur].final)) 828 wchar |= (*string++ & 0x7f); 829 wchar = (wchar << 8) | (*string++ & 0x7f); 830 wchar = (wchar << 8) | (*string++ & 0x7f); 831 if (psenc->g[cur].type == CS96MULTI) 832 wchar |= 0x80; 833 break; 834 } 835 836 if (result) 837 *result = string; 838 /* reset single shift state */ 839 psenc->singlegr = psenc->singlegl = -1; 840 return (wchar); 841 } 842 843 844 845 static int 846 _citrus_ISO2022_mbrtowc_priv(_ISO2022EncodingInfo * __restrict ei, 847 wchar_t * __restrict pwc, char ** __restrict s, 848 size_t n, _ISO2022State * __restrict psenc, size_t * __restrict nresult) 849 { 850 char *p, *result, *s0; 851 wchar_t wchar; 852 int c, chlenbak; 853 854 if (*s == NULL) { 855 _citrus_ISO2022_init_state(ei, psenc); 856 *nresult = _ENCODING_IS_STATE_DEPENDENT; 857 return (0); 858 } 859 s0 = *s; 860 c = 0; 861 chlenbak = psenc->chlen; 862 863 /* 864 * if we have something in buffer, use that. 865 * otherwise, skip here 866 */ 867 if (psenc->chlen > sizeof(psenc->ch)) { 868 /* illgeal state */ 869 _citrus_ISO2022_init_state(ei, psenc); 870 goto encoding_error; 871 } 872 if (psenc->chlen == 0) 873 goto emptybuf; 874 875 /* buffer is not empty */ 876 p = psenc->ch; 877 while (psenc->chlen < sizeof(psenc->ch)) { 878 if (n > 0) { 879 psenc->ch[psenc->chlen++] = *s0++; 880 n--; 881 } 882 883 wchar = _ISO2022_sgetwchar(ei, p, psenc->chlen - (p-psenc->ch), 884 &result, psenc); 885 c += result - p; 886 if (wchar != _ISO2022INVALID) { 887 if (psenc->chlen > (size_t)c) 888 memmove(psenc->ch, result, psenc->chlen - c); 889 if (psenc->chlen < (size_t)c) 890 psenc->chlen = 0; 891 else 892 psenc->chlen -= c; 893 goto output; 894 } 895 896 if (n == 0) { 897 if ((size_t)(result - p) == psenc->chlen) 898 /* complete shift sequence. */ 899 psenc->chlen = 0; 900 goto restart; 901 } 902 903 p = result; 904 } 905 906 /* escape sequence too long? */ 907 goto encoding_error; 908 909 emptybuf: 910 wchar = _ISO2022_sgetwchar(ei, s0, n, &result, psenc); 911 if (wchar != _ISO2022INVALID) { 912 c += result - s0; 913 psenc->chlen = 0; 914 s0 = result; 915 goto output; 916 } 917 if (result > s0) { 918 c += (result - s0); 919 n -= (result - s0); 920 s0 = result; 921 if (n > 0) 922 goto emptybuf; 923 /* complete shift sequence. */ 924 goto restart; 925 } 926 n += c; 927 if (n < sizeof(psenc->ch)) { 928 memcpy(psenc->ch, s0 - c, n); 929 psenc->chlen = n; 930 s0 = result; 931 goto restart; 932 } 933 934 /* escape sequence too long? */ 935 936 encoding_error: 937 psenc->chlen = 0; 938 *nresult = (size_t)-1; 939 return (EILSEQ); 940 941 output: 942 *s = s0; 943 if (pwc) 944 *pwc = wchar; 945 *nresult = wchar ? c - chlenbak : 0; 946 return (0); 947 948 restart: 949 *s = s0; 950 *nresult = (size_t)-2; 951 952 return (0); 953 } 954 955 static int 956 recommendation(_ISO2022EncodingInfo * __restrict ei, 957 _ISO2022Charset * __restrict cs) 958 { 959 _ISO2022Charset *recommend; 960 size_t j; 961 int i; 962 963 /* first, try a exact match. */ 964 for (i = 0; i < 4; i++) { 965 recommend = ei->recommend[i]; 966 for (j = 0; j < ei->recommendsize[i]; j++) { 967 if (cs->type != recommend[j].type) 968 continue; 969 if (cs->final != recommend[j].final) 970 continue; 971 if (cs->interm != recommend[j].interm) 972 continue; 973 974 return (i); 975 } 976 } 977 978 /* then, try a wildcard match over final char. */ 979 for (i = 0; i < 4; i++) { 980 recommend = ei->recommend[i]; 981 for (j = 0; j < ei->recommendsize[i]; j++) { 982 if (cs->type != recommend[j].type) 983 continue; 984 if (cs->final && (cs->final != recommend[j].final)) 985 continue; 986 if (cs->interm && (cs->interm != recommend[j].interm)) 987 continue; 988 989 return (i); 990 } 991 } 992 993 /* there's no recommendation. make a guess. */ 994 if (ei->maxcharset == 0) { 995 return (0); 996 } else { 997 switch (cs->type) { 998 case CS94: 999 case CS94MULTI: 1000 return (0); 1001 case CS96: 1002 case CS96MULTI: 1003 return (1); 1004 } 1005 } 1006 return (0); 1007 } 1008 1009 static int 1010 _ISO2022_sputwchar(_ISO2022EncodingInfo * __restrict ei, wchar_t wc, 1011 char * __restrict string, size_t n, char ** __restrict result, 1012 _ISO2022State * __restrict psenc, size_t * __restrict nresult) 1013 { 1014 _ISO2022Charset cs; 1015 char *p; 1016 char tmp[MB_LEN_MAX]; 1017 size_t len; 1018 int bit8, i = 0, target; 1019 unsigned char mask; 1020 1021 if (isc0(wc & 0xff)) { 1022 /* go back to INIT0 or ASCII on control chars */ 1023 cs = ei->initg[0].final ? ei->initg[0] : ascii; 1024 } else if (isc1(wc & 0xff)) { 1025 /* go back to INIT1 or ISO-8859-1 on control chars */ 1026 cs = ei->initg[1].final ? ei->initg[1] : iso88591; 1027 } else if (!(wc & ~0xff)) { 1028 if (wc & 0x80) { 1029 /* special treatment for ISO-8859-1 */ 1030 cs = iso88591; 1031 } else { 1032 /* special treatment for ASCII */ 1033 cs = ascii; 1034 } 1035 } else { 1036 cs.final = (wc >> 24) & 0x7f; 1037 if ((wc >> 16) & 0x80) 1038 cs.interm = (wc >> 16) & 0x7f; 1039 else 1040 cs.interm = '\0'; 1041 if (wc & 0x80) 1042 cs.type = (wc & 0x00007f00) ? CS96MULTI : CS96; 1043 else 1044 cs.type = (wc & 0x00007f00) ? CS94MULTI : CS94; 1045 } 1046 target = recommendation(ei, &cs); 1047 p = tmp; 1048 bit8 = ei->flags & F_8BIT; 1049 1050 /* designate the charset onto the target plane(G0/1/2/3). */ 1051 if (psenc->g[target].type == cs.type && 1052 psenc->g[target].final == cs.final && 1053 psenc->g[target].interm == cs.interm) 1054 goto planeok; 1055 1056 *p++ = '\033'; 1057 if (cs.type == CS94MULTI || cs.type == CS96MULTI) 1058 *p++ = '$'; 1059 if (target == 0 && cs.type == CS94MULTI && strchr("@AB", cs.final) && 1060 !cs.interm && !(ei->flags & F_NOOLD)) 1061 ; 1062 else if (cs.type == CS94 || cs.type == CS94MULTI) 1063 *p++ = "()*+"[target]; 1064 else 1065 *p++ = ",-./"[target]; 1066 if (cs.interm) 1067 *p++ = cs.interm; 1068 *p++ = cs.final; 1069 1070 psenc->g[target].type = cs.type; 1071 psenc->g[target].final = cs.final; 1072 psenc->g[target].interm = cs.interm; 1073 1074 planeok: 1075 /* invoke the plane onto GL or GR. */ 1076 if (psenc->gl == target) 1077 goto sideok; 1078 if (bit8 && psenc->gr == target) 1079 goto sideok; 1080 1081 if (target == 0 && (ei->flags & F_LS0)) { 1082 *p++ = '\017'; 1083 psenc->gl = 0; 1084 } else if (target == 1 && (ei->flags & F_LS1)) { 1085 *p++ = '\016'; 1086 psenc->gl = 1; 1087 } else if (target == 2 && (ei->flags & F_LS2)) { 1088 *p++ = '\033'; 1089 *p++ = 'n'; 1090 psenc->gl = 2; 1091 } else if (target == 3 && (ei->flags & F_LS3)) { 1092 *p++ = '\033'; 1093 *p++ = 'o'; 1094 psenc->gl = 3; 1095 } else if (bit8 && target == 1 && (ei->flags & F_LS1R)) { 1096 *p++ = '\033'; 1097 *p++ = '~'; 1098 psenc->gr = 1; 1099 } else if (bit8 && target == 2 && (ei->flags & F_LS2R)) { 1100 *p++ = '\033'; 1101 /*{*/ 1102 *p++ = '}'; 1103 psenc->gr = 2; 1104 } else if (bit8 && target == 3 && (ei->flags & F_LS3R)) { 1105 *p++ = '\033'; 1106 *p++ = '|'; 1107 psenc->gr = 3; 1108 } else if (target == 2 && (ei->flags & F_SS2)) { 1109 *p++ = '\033'; 1110 *p++ = 'N'; 1111 psenc->singlegl = 2; 1112 } else if (target == 3 && (ei->flags & F_SS3)) { 1113 *p++ = '\033'; 1114 *p++ = 'O'; 1115 psenc->singlegl = 3; 1116 } else if (bit8 && target == 2 && (ei->flags & F_SS2R)) { 1117 *p++ = '\216'; 1118 *p++ = 'N'; 1119 psenc->singlegl = psenc->singlegr = 2; 1120 } else if (bit8 && target == 3 && (ei->flags & F_SS3R)) { 1121 *p++ = '\217'; 1122 *p++ = 'O'; 1123 psenc->singlegl = psenc->singlegr = 3; 1124 } else 1125 goto ilseq; 1126 1127 sideok: 1128 if (psenc->singlegl == target) 1129 mask = 0x00; 1130 else if (psenc->singlegr == target) 1131 mask = 0x80; 1132 else if (psenc->gl == target) 1133 mask = 0x00; 1134 else if ((ei->flags & F_8BIT) && psenc->gr == target) 1135 mask = 0x80; 1136 else 1137 goto ilseq; 1138 1139 switch (cs.type) { 1140 case CS94: 1141 case CS96: 1142 i = 1; 1143 break; 1144 case CS94MULTI: 1145 case CS96MULTI: 1146 i = !iscntl(wc & 0xff) ? 1147 (isthree(cs.final) ? 3 : 2) : 1; 1148 break; 1149 } 1150 while (i-- > 0) 1151 *p++ = ((wc >> (i << 3)) & 0x7f) | mask; 1152 1153 /* reset single shift state */ 1154 psenc->singlegl = psenc->singlegr = -1; 1155 1156 len = (size_t)(p - tmp); 1157 if (n < len) { 1158 if (result) 1159 *result = (char *)0; 1160 *nresult = (size_t)-1; 1161 return (E2BIG); 1162 } 1163 if (result) 1164 *result = string + len; 1165 memcpy(string, tmp, len); 1166 *nresult = len; 1167 1168 return (0); 1169 1170 ilseq: 1171 *nresult = (size_t)-1; 1172 return (EILSEQ); 1173 } 1174 1175 static int 1176 _citrus_ISO2022_put_state_reset(_ISO2022EncodingInfo * __restrict ei, 1177 char * __restrict s, size_t n, _ISO2022State * __restrict psenc, 1178 size_t * __restrict nresult) 1179 { 1180 char *result; 1181 char buf[MB_LEN_MAX]; 1182 size_t len; 1183 int ret; 1184 1185 /* XXX state will be modified after this operation... */ 1186 ret = _ISO2022_sputwchar(ei, L'\0', buf, sizeof(buf), &result, psenc, 1187 &len); 1188 if (ret) { 1189 *nresult = len; 1190 return (ret); 1191 } 1192 1193 if (sizeof(buf) < len || n < len-1) { 1194 /* XXX should recover state? */ 1195 *nresult = (size_t)-1; 1196 return (E2BIG); 1197 } 1198 1199 memcpy(s, buf, len - 1); 1200 *nresult = len - 1; 1201 return (0); 1202 } 1203 1204 static int 1205 _citrus_ISO2022_wcrtomb_priv(_ISO2022EncodingInfo * __restrict ei, 1206 char * __restrict s, size_t n, wchar_t wc, 1207 _ISO2022State * __restrict psenc, size_t * __restrict nresult) 1208 { 1209 char *result; 1210 char buf[MB_LEN_MAX]; 1211 size_t len; 1212 int ret; 1213 1214 /* XXX state will be modified after this operation... */ 1215 ret = _ISO2022_sputwchar(ei, wc, buf, sizeof(buf), &result, psenc, 1216 &len); 1217 if (ret) { 1218 *nresult = len; 1219 return (ret); 1220 } 1221 1222 if (sizeof(buf) < len || n < len) { 1223 /* XXX should recover state? */ 1224 *nresult = (size_t)-1; 1225 return (E2BIG); 1226 } 1227 1228 memcpy(s, buf, len); 1229 *nresult = len; 1230 return (0); 1231 } 1232 1233 static __inline int 1234 /*ARGSUSED*/ 1235 _citrus_ISO2022_stdenc_wctocs(_ISO2022EncodingInfo * __restrict ei __unused, 1236 _csid_t * __restrict csid, _index_t * __restrict idx, wchar_t wc) 1237 { 1238 wchar_t m, nm; 1239 1240 m = wc & 0x7FFF8080; 1241 nm = wc & 0x007F7F7F; 1242 if (m & 0x00800000) 1243 nm &= 0x00007F7F; 1244 else 1245 m &= 0x7F008080; 1246 if (nm & 0x007F0000) { 1247 /* ^3 mark */ 1248 m |= 0x007F0000; 1249 } else if (nm & 0x00007F00) { 1250 /* ^2 mark */ 1251 m |= 0x00007F00; 1252 } 1253 *csid = (_csid_t)m; 1254 *idx = (_index_t)nm; 1255 1256 return (0); 1257 } 1258 1259 static __inline int 1260 /*ARGSUSED*/ 1261 _citrus_ISO2022_stdenc_cstowc(_ISO2022EncodingInfo * __restrict ei __unused, 1262 wchar_t * __restrict wc, _csid_t csid, _index_t idx) 1263 { 1264 1265 *wc = (wchar_t)(csid & 0x7F808080) | (wchar_t)idx; 1266 1267 return (0); 1268 } 1269 1270 static __inline int 1271 /*ARGSUSED*/ 1272 _citrus_ISO2022_stdenc_get_state_desc_generic(_ISO2022EncodingInfo * __restrict ei __unused, 1273 _ISO2022State * __restrict psenc, int * __restrict rstate) 1274 { 1275 1276 if (psenc->chlen == 0) { 1277 /* XXX: it should distinguish initial and stable. */ 1278 *rstate = _STDENC_SDGEN_STABLE; 1279 } else 1280 *rstate = (psenc->ch[0] == '\033') ? 1281 _STDENC_SDGEN_INCOMPLETE_SHIFT : 1282 _STDENC_SDGEN_INCOMPLETE_CHAR; 1283 return (0); 1284 } 1285 1286 /* ---------------------------------------------------------------------- 1287 * public interface for stdenc 1288 */ 1289 1290 _CITRUS_STDENC_DECLS(ISO2022); 1291 _CITRUS_STDENC_DEF_OPS(ISO2022); 1292 1293 #include "citrus_stdenc_template.h" 1294