1 /* 2 * Copyright (C) 1984-2015 Mark Nudelman 3 * 4 * You may distribute under the terms of either the GNU General Public 5 * License or the Less License, as specified in the README file. 6 * 7 * For more information, see the README file. 8 */ 9 10 11 /* 12 * Functions to define the character set 13 * and do things specific to the character set. 14 */ 15 16 #include "less.h" 17 #if HAVE_LOCALE 18 #include <locale.h> 19 #include <ctype.h> 20 #include <langinfo.h> 21 #endif 22 23 #include "charset.h" 24 25 public int utf_mode = 0; 26 27 /* 28 * Predefined character sets, 29 * selected by the LESSCHARSET environment variable. 30 */ 31 struct charset { 32 char *name; 33 int *p_flag; 34 char *desc; 35 } charsets[] = { 36 { "ascii", NULL, "8bcccbcc18b95.b" }, 37 { "utf-8", &utf_mode, "8bcccbcc18b95.b126.bb" }, 38 { "iso8859", NULL, "8bcccbcc18b95.33b." }, 39 { "latin3", NULL, "8bcccbcc18b95.33b5.b8.b15.b4.b12.b18.b12.b." }, 40 { "arabic", NULL, "8bcccbcc18b95.33b.3b.7b2.13b.3b.b26.5b19.b" }, 41 { "greek", NULL, "8bcccbcc18b95.33b4.2b4.b3.b35.b44.b" }, 42 { "greek2005", NULL, "8bcccbcc18b95.33b14.b35.b44.b" }, 43 { "hebrew", NULL, "8bcccbcc18b95.33b.b29.32b28.2b2.b" }, 44 { "koi8-r", NULL, "8bcccbcc18b95.b." }, 45 { "KOI8-T", NULL, "8bcccbcc18b95.b8.b6.b8.b.b.5b7.3b4.b4.b3.b.b.3b." }, 46 { "georgianps", NULL, "8bcccbcc18b95.3b11.4b12.2b." }, 47 { "tcvn", NULL, "b..b...bcccbccbbb7.8b95.b48.5b." }, 48 { "TIS-620", NULL, "8bcccbcc18b95.b.4b.11b7.8b." }, 49 { "next", NULL, "8bcccbcc18b95.bb125.bb" }, 50 { "dos", NULL, "8bcccbcc12bc5b95.b." }, 51 { "windows-1251", NULL, "8bcccbcc12bc5b95.b24.b." }, 52 { "windows-1252", NULL, "8bcccbcc12bc5b95.b.b11.b.2b12.b." }, 53 { "windows-1255", NULL, "8bcccbcc12bc5b95.b.b8.b.5b9.b.4b." }, 54 { "ebcdic", NULL, "5bc6bcc7bcc41b.9b7.9b5.b..8b6.10b6.b9.7b9.8b8.17b3.3b9.7b9.8b8.6b10.b.b.b." }, 55 { "IBM-1047", NULL, "4cbcbc3b9cbccbccbb4c6bcc5b3cbbc4bc4bccbc191.b" }, 56 { NULL, NULL, NULL } 57 }; 58 59 /* 60 * Support "locale charmap"/nl_langinfo(CODESET) values, as well as others. 61 */ 62 struct cs_alias { 63 char *name; 64 char *oname; 65 } cs_aliases[] = { 66 { "UTF-8", "utf-8" }, 67 { "ANSI_X3.4-1968", "ascii" }, 68 { "US-ASCII", "ascii" }, 69 { "latin1", "iso8859" }, 70 { "ISO-8859-1", "iso8859" }, 71 { "latin9", "iso8859" }, 72 { "ISO-8859-15", "iso8859" }, 73 { "latin2", "iso8859" }, 74 { "ISO-8859-2", "iso8859" }, 75 { "ISO-8859-3", "latin3" }, 76 { "latin4", "iso8859" }, 77 { "ISO-8859-4", "iso8859" }, 78 { "cyrillic", "iso8859" }, 79 { "ISO-8859-5", "iso8859" }, 80 { "ISO-8859-6", "arabic" }, 81 { "ISO-8859-7", "greek" }, 82 { "IBM9005", "greek2005" }, 83 { "ISO-8859-8", "hebrew" }, 84 { "latin5", "iso8859" }, 85 { "ISO-8859-9", "iso8859" }, 86 { "latin6", "iso8859" }, 87 { "ISO-8859-10", "iso8859" }, 88 { "latin7", "iso8859" }, 89 { "ISO-8859-13", "iso8859" }, 90 { "latin8", "iso8859" }, 91 { "ISO-8859-14", "iso8859" }, 92 { "latin10", "iso8859" }, 93 { "ISO-8859-16", "iso8859" }, 94 { "IBM437", "dos" }, 95 { "EBCDIC-US", "ebcdic" }, 96 { "IBM1047", "IBM-1047" }, 97 { "KOI8-R", "koi8-r" }, 98 { "KOI8-U", "koi8-r" }, 99 { "GEORGIAN-PS", "georgianps" }, 100 { "TCVN5712-1", "tcvn" }, 101 { "NEXTSTEP", "next" }, 102 { "windows", "windows-1252" }, /* backward compatibility */ 103 { "CP1251", "windows-1251" }, 104 { "CP1252", "windows-1252" }, 105 { "CP1255", "windows-1255" }, 106 { NULL, NULL } 107 }; 108 109 #define IS_BINARY_CHAR 01 110 #define IS_CONTROL_CHAR 02 111 112 static char chardef[256]; 113 static char *binfmt = NULL; 114 static char *utfbinfmt = NULL; 115 public int binattr = AT_STANDOUT; 116 117 118 /* 119 * Define a charset, given a description string. 120 * The string consists of 256 letters, 121 * one for each character in the charset. 122 * If the string is shorter than 256 letters, missing letters 123 * are taken to be identical to the last one. 124 * A decimal number followed by a letter is taken to be a 125 * repetition of the letter. 126 * 127 * Each letter is one of: 128 * . normal character 129 * b binary character 130 * c control character 131 */ 132 static void 133 ichardef(s) 134 char *s; 135 { 136 register char *cp; 137 register int n; 138 register char v; 139 140 n = 0; 141 v = 0; 142 cp = chardef; 143 while (*s != '\0') 144 { 145 switch (*s++) 146 { 147 case '.': 148 v = 0; 149 break; 150 case 'c': 151 v = IS_CONTROL_CHAR; 152 break; 153 case 'b': 154 v = IS_BINARY_CHAR|IS_CONTROL_CHAR; 155 break; 156 157 case '0': case '1': case '2': case '3': case '4': 158 case '5': case '6': case '7': case '8': case '9': 159 n = (10 * n) + (s[-1] - '0'); 160 continue; 161 162 default: 163 error("invalid chardef", NULL_PARG); 164 quit(QUIT_ERROR); 165 /*NOTREACHED*/ 166 } 167 168 do 169 { 170 if (cp >= chardef + sizeof(chardef)) 171 { 172 error("chardef longer than 256", NULL_PARG); 173 quit(QUIT_ERROR); 174 /*NOTREACHED*/ 175 } 176 *cp++ = v; 177 } while (--n > 0); 178 n = 0; 179 } 180 181 while (cp < chardef + sizeof(chardef)) 182 *cp++ = v; 183 } 184 185 /* 186 * Define a charset, given a charset name. 187 * The valid charset names are listed in the "charsets" array. 188 */ 189 static int 190 icharset(name, no_error) 191 register char *name; 192 int no_error; 193 { 194 register struct charset *p; 195 register struct cs_alias *a; 196 197 if (name == NULL || *name == '\0') 198 return (0); 199 200 /* First see if the name is an alias. */ 201 for (a = cs_aliases; a->name != NULL; a++) 202 { 203 if (strcmp(name, a->name) == 0) 204 { 205 name = a->oname; 206 break; 207 } 208 } 209 210 for (p = charsets; p->name != NULL; p++) 211 { 212 if (strcmp(name, p->name) == 0) 213 { 214 ichardef(p->desc); 215 if (p->p_flag != NULL) 216 *(p->p_flag) = 1; 217 return (1); 218 } 219 } 220 221 if (!no_error) { 222 error("invalid charset name", NULL_PARG); 223 quit(QUIT_ERROR); 224 } 225 return (0); 226 } 227 228 #if HAVE_LOCALE 229 /* 230 * Define a charset, given a locale name. 231 */ 232 static void 233 ilocale() 234 { 235 register int c; 236 237 for (c = 0; c < (int) sizeof(chardef); c++) 238 { 239 if (isprint(c)) 240 chardef[c] = 0; 241 else if (iscntrl(c)) 242 chardef[c] = IS_CONTROL_CHAR; 243 else 244 chardef[c] = IS_BINARY_CHAR|IS_CONTROL_CHAR; 245 } 246 } 247 #endif 248 249 /* 250 * Define the printing format for control (or binary utf) chars. 251 */ 252 static void 253 setbinfmt(s, fmtvarptr, default_fmt) 254 char *s; 255 char **fmtvarptr; 256 char *default_fmt; 257 { 258 if (s && utf_mode) 259 { 260 /* It would be too hard to account for width otherwise. */ 261 char *t = s; 262 while (*t) 263 { 264 if (*t < ' ' || *t > '~') 265 { 266 s = default_fmt; 267 goto attr; 268 } 269 t++; 270 } 271 } 272 273 /* %n is evil */ 274 if (s == NULL || *s == '\0' || 275 (*s == '*' && (s[1] == '\0' || s[2] == '\0' || strchr(s + 2, 'n'))) || 276 (*s != '*' && strchr(s, 'n'))) 277 s = default_fmt; 278 279 /* 280 * Select the attributes if it starts with "*". 281 */ 282 attr: 283 if (*s == '*') 284 { 285 switch (s[1]) 286 { 287 case 'd': binattr = AT_BOLD; break; 288 case 'k': binattr = AT_BLINK; break; 289 case 's': binattr = AT_STANDOUT; break; 290 case 'u': binattr = AT_UNDERLINE; break; 291 default: binattr = AT_NORMAL; break; 292 } 293 s += 2; 294 } 295 *fmtvarptr = s; 296 } 297 298 /* 299 * 300 */ 301 static void 302 set_charset() 303 { 304 char *s; 305 306 /* 307 * See if environment variable LESSCHARSET is defined. 308 */ 309 s = lgetenv("LESSCHARSET"); 310 if (icharset(s, 0)) 311 return; 312 313 /* 314 * LESSCHARSET is not defined: try LESSCHARDEF. 315 */ 316 s = lgetenv("LESSCHARDEF"); 317 if (s != NULL && *s != '\0') 318 { 319 ichardef(s); 320 return; 321 } 322 323 #if HAVE_LOCALE 324 #ifdef CODESET 325 /* 326 * Try using the codeset name as the charset name. 327 */ 328 s = nl_langinfo(CODESET); 329 if (icharset(s, 1)) 330 return; 331 #endif 332 #endif 333 334 #if HAVE_STRSTR 335 /* 336 * Check whether LC_ALL, LC_CTYPE or LANG look like UTF-8 is used. 337 */ 338 if ((s = lgetenv("LC_ALL")) != NULL || 339 (s = lgetenv("LC_CTYPE")) != NULL || 340 (s = lgetenv("LANG")) != NULL) 341 { 342 if ( strstr(s, "UTF-8") != NULL || strstr(s, "utf-8") != NULL 343 || strstr(s, "UTF8") != NULL || strstr(s, "utf8") != NULL) 344 if (icharset("utf-8", 1)) 345 return; 346 } 347 #endif 348 349 #if HAVE_LOCALE 350 /* 351 * Get character definitions from locale functions, 352 * rather than from predefined charset entry. 353 */ 354 ilocale(); 355 #if MSDOS_COMPILER 356 /* 357 * Default to "dos". 358 */ 359 (void) icharset("dos", 1); 360 #else 361 /* 362 * Default to "latin1". 363 */ 364 (void) icharset("latin1", 1); 365 #endif 366 #endif 367 } 368 369 /* 370 * Initialize charset data structures. 371 */ 372 public void 373 init_charset() 374 { 375 char *s; 376 377 #if HAVE_LOCALE 378 setlocale(LC_ALL, ""); 379 #endif 380 381 set_charset(); 382 383 s = lgetenv("LESSBINFMT"); 384 setbinfmt(s, &binfmt, "*s<%02X>"); 385 386 s = lgetenv("LESSUTFBINFMT"); 387 setbinfmt(s, &utfbinfmt, "<U+%04lX>"); 388 } 389 390 /* 391 * Is a given character a "binary" character? 392 */ 393 public int 394 binary_char(c) 395 LWCHAR c; 396 { 397 if (utf_mode) 398 return (is_ubin_char(c)); 399 c &= 0377; 400 return (chardef[c] & IS_BINARY_CHAR); 401 } 402 403 /* 404 * Is a given character a "control" character? 405 */ 406 public int 407 control_char(c) 408 LWCHAR c; 409 { 410 c &= 0377; 411 return (chardef[c] & IS_CONTROL_CHAR); 412 } 413 414 /* 415 * Return the printable form of a character. 416 * For example, in the "ascii" charset '\3' is printed as "^C". 417 */ 418 public char * 419 prchar(c) 420 LWCHAR c; 421 { 422 /* {{ This buffer can be overrun if LESSBINFMT is a long string. }} */ 423 static char buf[32]; 424 425 c &= 0377; 426 if ((c < 128 || !utf_mode) && !control_char(c)) 427 SNPRINTF1(buf, sizeof(buf), "%c", (int) c); 428 else if (c == ESC) 429 strcpy(buf, "ESC"); 430 #if IS_EBCDIC_HOST 431 else if (!binary_char(c) && c < 64) 432 SNPRINTF1(buf, sizeof(buf), "^%c", 433 /* 434 * This array roughly inverts CONTROL() #defined in less.h, 435 * and should be kept in sync with CONTROL() and IBM-1047. 436 */ 437 "@ABC.I.?...KLMNO" 438 "PQRS.JH.XY.." 439 "\\]^_" 440 "......W[.....EFG" 441 "..V....D....TU.Z"[c]); 442 #else 443 else if (c < 128 && !control_char(c ^ 0100)) 444 SNPRINTF1(buf, sizeof(buf), "^%c", (int) (c ^ 0100)); 445 #endif 446 else 447 SNPRINTF1(buf, sizeof(buf), binfmt, c); 448 return (buf); 449 } 450 451 /* 452 * Return the printable form of a UTF-8 character. 453 */ 454 public char * 455 prutfchar(ch) 456 LWCHAR ch; 457 { 458 static char buf[32]; 459 460 if (ch == ESC) 461 strcpy(buf, "ESC"); 462 else if (ch < 128 && control_char(ch)) 463 { 464 if (!control_char(ch ^ 0100)) 465 SNPRINTF1(buf, sizeof(buf), "^%c", ((char) ch) ^ 0100); 466 else 467 SNPRINTF1(buf, sizeof(buf), binfmt, (char) ch); 468 } else if (is_ubin_char(ch)) 469 { 470 SNPRINTF1(buf, sizeof(buf), utfbinfmt, ch); 471 } else 472 { 473 char *p = buf; 474 if (ch >= 0x80000000) 475 ch = 0xFFFD; /* REPLACEMENT CHARACTER */ 476 put_wchar(&p, ch); 477 *p = '\0'; 478 } 479 return (buf); 480 } 481 482 /* 483 * Get the length of a UTF-8 character in bytes. 484 */ 485 public int 486 utf_len(ch) 487 char ch; 488 { 489 if ((ch & 0x80) == 0) 490 return 1; 491 if ((ch & 0xE0) == 0xC0) 492 return 2; 493 if ((ch & 0xF0) == 0xE0) 494 return 3; 495 if ((ch & 0xF8) == 0xF0) 496 return 4; 497 if ((ch & 0xFC) == 0xF8) 498 return 5; 499 if ((ch & 0xFE) == 0xFC) 500 return 6; 501 /* Invalid UTF-8 encoding. */ 502 return 1; 503 } 504 505 /* 506 * Does the parameter point to the lead byte of a well-formed UTF-8 character? 507 */ 508 public int 509 is_utf8_well_formed(s, slen) 510 unsigned char *s; 511 int slen; 512 { 513 int i; 514 int len; 515 516 if (IS_UTF8_INVALID(s[0])) 517 return (0); 518 519 len = utf_len((char) s[0]); 520 if (len > slen) 521 return (0); 522 if (len == 1) 523 return (1); 524 if (len == 2) 525 { 526 if (s[0] < 0xC2) 527 return (0); 528 } else 529 { 530 unsigned char mask; 531 mask = (~((1 << (8-len)) - 1)) & 0xFF; 532 if (s[0] == mask && (s[1] & mask) == 0x80) 533 return (0); 534 } 535 536 for (i = 1; i < len; i++) 537 if (!IS_UTF8_TRAIL(s[i])) 538 return (0); 539 return (1); 540 } 541 542 /* 543 * Return number of invalid UTF-8 sequences found in a buffer. 544 */ 545 public int 546 utf_bin_count(data, len) 547 unsigned char *data; 548 int len; 549 { 550 int bin_count = 0; 551 while (len > 0) 552 { 553 if (is_utf8_well_formed(data, len)) 554 { 555 int clen = utf_len(*data); 556 data += clen; 557 len -= clen; 558 } else 559 { 560 /* Skip to next lead byte. */ 561 bin_count++; 562 do { 563 ++data; 564 --len; 565 } while (len > 0 && !IS_UTF8_LEAD(*data)); 566 } 567 } 568 return (bin_count); 569 } 570 571 /* 572 * Get the value of a UTF-8 character. 573 */ 574 public LWCHAR 575 get_wchar(p) 576 char *p; 577 { 578 switch (utf_len(p[0])) 579 { 580 case 1: 581 default: 582 /* 0xxxxxxx */ 583 return (LWCHAR) 584 (p[0] & 0xFF); 585 case 2: 586 /* 110xxxxx 10xxxxxx */ 587 return (LWCHAR) ( 588 ((p[0] & 0x1F) << 6) | 589 (p[1] & 0x3F)); 590 case 3: 591 /* 1110xxxx 10xxxxxx 10xxxxxx */ 592 return (LWCHAR) ( 593 ((p[0] & 0x0F) << 12) | 594 ((p[1] & 0x3F) << 6) | 595 (p[2] & 0x3F)); 596 case 4: 597 /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */ 598 return (LWCHAR) ( 599 ((p[0] & 0x07) << 18) | 600 ((p[1] & 0x3F) << 12) | 601 ((p[2] & 0x3F) << 6) | 602 (p[3] & 0x3F)); 603 case 5: 604 /* 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx */ 605 return (LWCHAR) ( 606 ((p[0] & 0x03) << 24) | 607 ((p[1] & 0x3F) << 18) | 608 ((p[2] & 0x3F) << 12) | 609 ((p[3] & 0x3F) << 6) | 610 (p[4] & 0x3F)); 611 case 6: 612 /* 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx */ 613 return (LWCHAR) ( 614 ((p[0] & 0x01) << 30) | 615 ((p[1] & 0x3F) << 24) | 616 ((p[2] & 0x3F) << 18) | 617 ((p[3] & 0x3F) << 12) | 618 ((p[4] & 0x3F) << 6) | 619 (p[5] & 0x3F)); 620 } 621 } 622 623 /* 624 * Store a character into a UTF-8 string. 625 */ 626 public void 627 put_wchar(pp, ch) 628 char **pp; 629 LWCHAR ch; 630 { 631 if (!utf_mode || ch < 0x80) 632 { 633 /* 0xxxxxxx */ 634 *(*pp)++ = (char) ch; 635 } else if (ch < 0x800) 636 { 637 /* 110xxxxx 10xxxxxx */ 638 *(*pp)++ = (char) (0xC0 | ((ch >> 6) & 0x1F)); 639 *(*pp)++ = (char) (0x80 | (ch & 0x3F)); 640 } else if (ch < 0x10000) 641 { 642 /* 1110xxxx 10xxxxxx 10xxxxxx */ 643 *(*pp)++ = (char) (0xE0 | ((ch >> 12) & 0x0F)); 644 *(*pp)++ = (char) (0x80 | ((ch >> 6) & 0x3F)); 645 *(*pp)++ = (char) (0x80 | (ch & 0x3F)); 646 } else if (ch < 0x200000) 647 { 648 /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */ 649 *(*pp)++ = (char) (0xF0 | ((ch >> 18) & 0x07)); 650 *(*pp)++ = (char) (0x80 | ((ch >> 12) & 0x3F)); 651 *(*pp)++ = (char) (0x80 | ((ch >> 6) & 0x3F)); 652 *(*pp)++ = (char) (0x80 | (ch & 0x3F)); 653 } else if (ch < 0x4000000) 654 { 655 /* 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx */ 656 *(*pp)++ = (char) (0xF0 | ((ch >> 24) & 0x03)); 657 *(*pp)++ = (char) (0x80 | ((ch >> 18) & 0x3F)); 658 *(*pp)++ = (char) (0x80 | ((ch >> 12) & 0x3F)); 659 *(*pp)++ = (char) (0x80 | ((ch >> 6) & 0x3F)); 660 *(*pp)++ = (char) (0x80 | (ch & 0x3F)); 661 } else 662 { 663 /* 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx */ 664 *(*pp)++ = (char) (0xF0 | ((ch >> 30) & 0x01)); 665 *(*pp)++ = (char) (0x80 | ((ch >> 24) & 0x3F)); 666 *(*pp)++ = (char) (0x80 | ((ch >> 18) & 0x3F)); 667 *(*pp)++ = (char) (0x80 | ((ch >> 12) & 0x3F)); 668 *(*pp)++ = (char) (0x80 | ((ch >> 6) & 0x3F)); 669 *(*pp)++ = (char) (0x80 | (ch & 0x3F)); 670 } 671 } 672 673 /* 674 * Step forward or backward one character in a string. 675 */ 676 public LWCHAR 677 step_char(pp, dir, limit) 678 char **pp; 679 signed int dir; 680 char *limit; 681 { 682 LWCHAR ch; 683 int len; 684 char *p = *pp; 685 686 if (!utf_mode) 687 { 688 /* It's easy if chars are one byte. */ 689 if (dir > 0) 690 ch = (LWCHAR) ((p < limit) ? *p++ : 0); 691 else 692 ch = (LWCHAR) ((p > limit) ? *--p : 0); 693 } else if (dir > 0) 694 { 695 len = utf_len(*p); 696 if (p + len > limit) 697 { 698 ch = 0; 699 p = limit; 700 } else 701 { 702 ch = get_wchar(p); 703 p += len; 704 } 705 } else 706 { 707 while (p > limit && IS_UTF8_TRAIL(p[-1])) 708 p--; 709 if (p > limit) 710 ch = get_wchar(--p); 711 else 712 ch = 0; 713 } 714 *pp = p; 715 return ch; 716 } 717 718 /* 719 * Unicode characters data 720 * Actual data is in the generated *.uni files. 721 */ 722 723 #define DECLARE_RANGE_TABLE_START(name) \ 724 static struct wchar_range name##_array[] = { 725 #define DECLARE_RANGE_TABLE_END(name) \ 726 }; struct wchar_range_table name##_table = { name##_array, sizeof(name##_array)/sizeof(*name##_array) }; 727 728 DECLARE_RANGE_TABLE_START(compose) 729 #include "compose.uni" 730 DECLARE_RANGE_TABLE_END(compose) 731 732 DECLARE_RANGE_TABLE_START(ubin) 733 #include "ubin.uni" 734 DECLARE_RANGE_TABLE_END(ubin) 735 736 DECLARE_RANGE_TABLE_START(wide) 737 #include "wide.uni" 738 DECLARE_RANGE_TABLE_END(wide) 739 740 /* comb_table is special pairs, not ranges. */ 741 static struct wchar_range comb_table[] = { 742 {0x0644,0x0622}, {0x0644,0x0623}, {0x0644,0x0625}, {0x0644,0x0627}, 743 }; 744 745 746 static int 747 is_in_table(ch, table) 748 LWCHAR ch; 749 struct wchar_range_table *table; 750 { 751 int hi; 752 int lo; 753 754 /* Binary search in the table. */ 755 if (ch < table->table[0].first) 756 return 0; 757 lo = 0; 758 hi = table->count - 1; 759 while (lo <= hi) 760 { 761 int mid = (lo + hi) / 2; 762 if (ch > table->table[mid].last) 763 lo = mid + 1; 764 else if (ch < table->table[mid].first) 765 hi = mid - 1; 766 else 767 return 1; 768 } 769 return 0; 770 } 771 772 /* 773 * Is a character a UTF-8 composing character? 774 * If a composing character follows any char, the two combine into one glyph. 775 */ 776 public int 777 is_composing_char(ch) 778 LWCHAR ch; 779 { 780 return is_in_table(ch, &compose_table); 781 } 782 783 /* 784 * Should this UTF-8 character be treated as binary? 785 */ 786 public int 787 is_ubin_char(ch) 788 LWCHAR ch; 789 { 790 return is_in_table(ch, &ubin_table); 791 } 792 793 /* 794 * Is this a double width UTF-8 character? 795 */ 796 public int 797 is_wide_char(ch) 798 LWCHAR ch; 799 { 800 return is_in_table(ch, &wide_table); 801 } 802 803 /* 804 * Is a character a UTF-8 combining character? 805 * A combining char acts like an ordinary char, but if it follows 806 * a specific char (not any char), the two combine into one glyph. 807 */ 808 public int 809 is_combining_char(ch1, ch2) 810 LWCHAR ch1; 811 LWCHAR ch2; 812 { 813 /* The table is small; use linear search. */ 814 int i; 815 for (i = 0; i < sizeof(comb_table)/sizeof(*comb_table); i++) 816 { 817 if (ch1 == comb_table[i].first && 818 ch2 == comb_table[i].last) 819 return 1; 820 } 821 return 0; 822 } 823 824