1 /* 2 * Copyright (C) 1984-2017 Mark Nudelman 3 * 4 * You may distribute under the terms of either the GNU General Public 5 * License or the Less License, as specified in the README file. 6 * 7 * For more information, see the README file. 8 */ 9 10 11 /* 12 * Functions to define the character set 13 * and do things specific to the character set. 14 */ 15 16 #include "less.h" 17 #if HAVE_LOCALE 18 #include <locale.h> 19 #include <ctype.h> 20 #include <langinfo.h> 21 #endif 22 23 #include "charset.h" 24 25 public int utf_mode = 0; 26 27 /* 28 * Predefined character sets, 29 * selected by the LESSCHARSET environment variable. 30 */ 31 struct charset { 32 char *name; 33 int *p_flag; 34 char *desc; 35 } charsets[] = { 36 { "ascii", NULL, "8bcccbcc18b95.b" }, 37 { "utf-8", &utf_mode, "8bcccbcc18b95.b126.bb" }, 38 { "iso8859", NULL, "8bcccbcc18b95.33b." }, 39 { "latin3", NULL, "8bcccbcc18b95.33b5.b8.b15.b4.b12.b18.b12.b." }, 40 { "arabic", NULL, "8bcccbcc18b95.33b.3b.7b2.13b.3b.b26.5b19.b" }, 41 { "greek", NULL, "8bcccbcc18b95.33b4.2b4.b3.b35.b44.b" }, 42 { "greek2005", NULL, "8bcccbcc18b95.33b14.b35.b44.b" }, 43 { "hebrew", NULL, "8bcccbcc18b95.33b.b29.32b28.2b2.b" }, 44 { "koi8-r", NULL, "8bcccbcc18b95.b." }, 45 { "KOI8-T", NULL, "8bcccbcc18b95.b8.b6.b8.b.b.5b7.3b4.b4.b3.b.b.3b." }, 46 { "georgianps", NULL, "8bcccbcc18b95.3b11.4b12.2b." }, 47 { "tcvn", NULL, "b..b...bcccbccbbb7.8b95.b48.5b." }, 48 { "TIS-620", NULL, "8bcccbcc18b95.b.4b.11b7.8b." }, 49 { "next", NULL, "8bcccbcc18b95.bb125.bb" }, 50 { "dos", NULL, "8bcccbcc12bc5b95.b." }, 51 { "windows-1251", NULL, "8bcccbcc12bc5b95.b24.b." }, 52 { "windows-1252", NULL, "8bcccbcc12bc5b95.b.b11.b.2b12.b." }, 53 { "windows-1255", NULL, "8bcccbcc12bc5b95.b.b8.b.5b9.b.4b." }, 54 { "ebcdic", NULL, "5bc6bcc7bcc41b.9b7.9b5.b..8b6.10b6.b9.7b9.8b8.17b3.3b9.7b9.8b8.6b10.b.b.b." }, 55 { "IBM-1047", NULL, "4cbcbc3b9cbccbccbb4c6bcc5b3cbbc4bc4bccbc191.b" }, 56 { NULL, NULL, NULL } 57 }; 58 59 /* 60 * Support "locale charmap"/nl_langinfo(CODESET) values, as well as others. 61 */ 62 struct cs_alias { 63 char *name; 64 char *oname; 65 } cs_aliases[] = { 66 { "UTF-8", "utf-8" }, 67 { "utf8", "utf-8" }, 68 { "UTF8", "utf-8" }, 69 { "ANSI_X3.4-1968", "ascii" }, 70 { "US-ASCII", "ascii" }, 71 { "latin1", "iso8859" }, 72 { "ISO-8859-1", "iso8859" }, 73 { "latin9", "iso8859" }, 74 { "ISO-8859-15", "iso8859" }, 75 { "latin2", "iso8859" }, 76 { "ISO-8859-2", "iso8859" }, 77 { "ISO-8859-3", "latin3" }, 78 { "latin4", "iso8859" }, 79 { "ISO-8859-4", "iso8859" }, 80 { "cyrillic", "iso8859" }, 81 { "ISO-8859-5", "iso8859" }, 82 { "ISO-8859-6", "arabic" }, 83 { "ISO-8859-7", "greek" }, 84 { "IBM9005", "greek2005" }, 85 { "ISO-8859-8", "hebrew" }, 86 { "latin5", "iso8859" }, 87 { "ISO-8859-9", "iso8859" }, 88 { "latin6", "iso8859" }, 89 { "ISO-8859-10", "iso8859" }, 90 { "latin7", "iso8859" }, 91 { "ISO-8859-13", "iso8859" }, 92 { "latin8", "iso8859" }, 93 { "ISO-8859-14", "iso8859" }, 94 { "latin10", "iso8859" }, 95 { "ISO-8859-16", "iso8859" }, 96 { "IBM437", "dos" }, 97 { "EBCDIC-US", "ebcdic" }, 98 { "IBM1047", "IBM-1047" }, 99 { "KOI8-R", "koi8-r" }, 100 { "KOI8-U", "koi8-r" }, 101 { "GEORGIAN-PS", "georgianps" }, 102 { "TCVN5712-1", "tcvn" }, 103 { "NEXTSTEP", "next" }, 104 { "windows", "windows-1252" }, /* backward compatibility */ 105 { "CP1251", "windows-1251" }, 106 { "CP1252", "windows-1252" }, 107 { "CP1255", "windows-1255" }, 108 { NULL, NULL } 109 }; 110 111 #define IS_BINARY_CHAR 01 112 #define IS_CONTROL_CHAR 02 113 114 static char chardef[256]; 115 static char *binfmt = NULL; 116 static char *utfbinfmt = NULL; 117 public int binattr = AT_STANDOUT; 118 119 120 /* 121 * Define a charset, given a description string. 122 * The string consists of 256 letters, 123 * one for each character in the charset. 124 * If the string is shorter than 256 letters, missing letters 125 * are taken to be identical to the last one. 126 * A decimal number followed by a letter is taken to be a 127 * repetition of the letter. 128 * 129 * Each letter is one of: 130 * . normal character 131 * b binary character 132 * c control character 133 */ 134 static void 135 ichardef(s) 136 char *s; 137 { 138 char *cp; 139 int n; 140 char v; 141 142 n = 0; 143 v = 0; 144 cp = chardef; 145 while (*s != '\0') 146 { 147 switch (*s++) 148 { 149 case '.': 150 v = 0; 151 break; 152 case 'c': 153 v = IS_CONTROL_CHAR; 154 break; 155 case 'b': 156 v = IS_BINARY_CHAR|IS_CONTROL_CHAR; 157 break; 158 159 case '0': case '1': case '2': case '3': case '4': 160 case '5': case '6': case '7': case '8': case '9': 161 n = (10 * n) + (s[-1] - '0'); 162 continue; 163 164 default: 165 error("invalid chardef", NULL_PARG); 166 quit(QUIT_ERROR); 167 /*NOTREACHED*/ 168 } 169 170 do 171 { 172 if (cp >= chardef + sizeof(chardef)) 173 { 174 error("chardef longer than 256", NULL_PARG); 175 quit(QUIT_ERROR); 176 /*NOTREACHED*/ 177 } 178 *cp++ = v; 179 } while (--n > 0); 180 n = 0; 181 } 182 183 while (cp < chardef + sizeof(chardef)) 184 *cp++ = v; 185 } 186 187 /* 188 * Define a charset, given a charset name. 189 * The valid charset names are listed in the "charsets" array. 190 */ 191 static int 192 icharset(name, no_error) 193 char *name; 194 int no_error; 195 { 196 struct charset *p; 197 struct cs_alias *a; 198 199 if (name == NULL || *name == '\0') 200 return (0); 201 202 /* First see if the name is an alias. */ 203 for (a = cs_aliases; a->name != NULL; a++) 204 { 205 if (strcmp(name, a->name) == 0) 206 { 207 name = a->oname; 208 break; 209 } 210 } 211 212 for (p = charsets; p->name != NULL; p++) 213 { 214 if (strcmp(name, p->name) == 0) 215 { 216 ichardef(p->desc); 217 if (p->p_flag != NULL) 218 *(p->p_flag) = 1; 219 return (1); 220 } 221 } 222 223 if (!no_error) { 224 error("invalid charset name", NULL_PARG); 225 quit(QUIT_ERROR); 226 } 227 return (0); 228 } 229 230 #if HAVE_LOCALE 231 /* 232 * Define a charset, given a locale name. 233 */ 234 static void 235 ilocale() 236 { 237 int c; 238 239 for (c = 0; c < (int) sizeof(chardef); c++) 240 { 241 if (isprint(c)) 242 chardef[c] = 0; 243 else if (iscntrl(c)) 244 chardef[c] = IS_CONTROL_CHAR; 245 else 246 chardef[c] = IS_BINARY_CHAR|IS_CONTROL_CHAR; 247 } 248 } 249 #endif 250 251 /* 252 * Define the printing format for control (or binary utf) chars. 253 */ 254 static void 255 setbinfmt(s, fmtvarptr, default_fmt) 256 char *s; 257 char **fmtvarptr; 258 char *default_fmt; 259 { 260 if (s && utf_mode) 261 { 262 /* It would be too hard to account for width otherwise. */ 263 char *t = s; 264 while (*t) 265 { 266 if (*t < ' ' || *t > '~') 267 { 268 s = default_fmt; 269 goto attr; 270 } 271 t++; 272 } 273 } 274 275 /* %n is evil */ 276 if (s == NULL || *s == '\0' || 277 (*s == '*' && (s[1] == '\0' || s[2] == '\0' || strchr(s + 2, 'n'))) || 278 (*s != '*' && strchr(s, 'n'))) 279 s = default_fmt; 280 281 /* 282 * Select the attributes if it starts with "*". 283 */ 284 attr: 285 if (*s == '*') 286 { 287 switch (s[1]) 288 { 289 case 'd': binattr = AT_BOLD; break; 290 case 'k': binattr = AT_BLINK; break; 291 case 's': binattr = AT_STANDOUT; break; 292 case 'u': binattr = AT_UNDERLINE; break; 293 default: binattr = AT_NORMAL; break; 294 } 295 s += 2; 296 } 297 *fmtvarptr = s; 298 } 299 300 /* 301 * 302 */ 303 static void 304 set_charset() 305 { 306 char *s; 307 308 /* 309 * See if environment variable LESSCHARSET is defined. 310 */ 311 s = lgetenv("LESSCHARSET"); 312 if (icharset(s, 0)) 313 return; 314 315 /* 316 * LESSCHARSET is not defined: try LESSCHARDEF. 317 */ 318 s = lgetenv("LESSCHARDEF"); 319 if (s != NULL && *s != '\0') 320 { 321 ichardef(s); 322 return; 323 } 324 325 #if HAVE_LOCALE 326 #ifdef CODESET 327 /* 328 * Try using the codeset name as the charset name. 329 */ 330 s = nl_langinfo(CODESET); 331 if (icharset(s, 1)) 332 return; 333 #endif 334 #endif 335 336 #if HAVE_STRSTR 337 /* 338 * Check whether LC_ALL, LC_CTYPE or LANG look like UTF-8 is used. 339 */ 340 if ((s = lgetenv("LC_ALL")) != NULL || 341 (s = lgetenv("LC_CTYPE")) != NULL || 342 (s = lgetenv("LANG")) != NULL) 343 { 344 if ( strstr(s, "UTF-8") != NULL || strstr(s, "utf-8") != NULL 345 || strstr(s, "UTF8") != NULL || strstr(s, "utf8") != NULL) 346 if (icharset("utf-8", 1)) 347 return; 348 } 349 #endif 350 351 #if HAVE_LOCALE 352 /* 353 * Get character definitions from locale functions, 354 * rather than from predefined charset entry. 355 */ 356 ilocale(); 357 #if MSDOS_COMPILER 358 /* 359 * Default to "dos". 360 */ 361 (void) icharset("dos", 1); 362 #else 363 /* 364 * Default to "latin1". 365 */ 366 (void) icharset("latin1", 1); 367 #endif 368 #endif 369 } 370 371 /* 372 * Initialize charset data structures. 373 */ 374 public void 375 init_charset() 376 { 377 char *s; 378 379 #if HAVE_LOCALE 380 setlocale(LC_ALL, ""); 381 #endif 382 383 set_charset(); 384 385 s = lgetenv("LESSBINFMT"); 386 setbinfmt(s, &binfmt, "*s<%02X>"); 387 388 s = lgetenv("LESSUTFBINFMT"); 389 setbinfmt(s, &utfbinfmt, "<U+%04lX>"); 390 } 391 392 /* 393 * Is a given character a "binary" character? 394 */ 395 public int 396 binary_char(c) 397 LWCHAR c; 398 { 399 if (utf_mode) 400 return (is_ubin_char(c)); 401 c &= 0377; 402 return (chardef[c] & IS_BINARY_CHAR); 403 } 404 405 /* 406 * Is a given character a "control" character? 407 */ 408 public int 409 control_char(c) 410 LWCHAR c; 411 { 412 c &= 0377; 413 return (chardef[c] & IS_CONTROL_CHAR); 414 } 415 416 /* 417 * Return the printable form of a character. 418 * For example, in the "ascii" charset '\3' is printed as "^C". 419 */ 420 public char * 421 prchar(c) 422 LWCHAR c; 423 { 424 /* {{ This buffer can be overrun if LESSBINFMT is a long string. }} */ 425 static char buf[32]; 426 427 c &= 0377; 428 if ((c < 128 || !utf_mode) && !control_char(c)) 429 SNPRINTF1(buf, sizeof(buf), "%c", (int) c); 430 else if (c == ESC) 431 strcpy(buf, "ESC"); 432 #if IS_EBCDIC_HOST 433 else if (!binary_char(c) && c < 64) 434 SNPRINTF1(buf, sizeof(buf), "^%c", 435 /* 436 * This array roughly inverts CONTROL() #defined in less.h, 437 * and should be kept in sync with CONTROL() and IBM-1047. 438 */ 439 "@ABC.I.?...KLMNO" 440 "PQRS.JH.XY.." 441 "\\]^_" 442 "......W[.....EFG" 443 "..V....D....TU.Z"[c]); 444 #else 445 else if (c < 128 && !control_char(c ^ 0100)) 446 SNPRINTF1(buf, sizeof(buf), "^%c", (int) (c ^ 0100)); 447 #endif 448 else 449 SNPRINTF1(buf, sizeof(buf), binfmt, c); 450 return (buf); 451 } 452 453 /* 454 * Return the printable form of a UTF-8 character. 455 */ 456 public char * 457 prutfchar(ch) 458 LWCHAR ch; 459 { 460 static char buf[32]; 461 462 if (ch == ESC) 463 strcpy(buf, "ESC"); 464 else if (ch < 128 && control_char(ch)) 465 { 466 if (!control_char(ch ^ 0100)) 467 SNPRINTF1(buf, sizeof(buf), "^%c", ((char) ch) ^ 0100); 468 else 469 SNPRINTF1(buf, sizeof(buf), binfmt, (char) ch); 470 } else if (is_ubin_char(ch)) 471 { 472 SNPRINTF1(buf, sizeof(buf), utfbinfmt, ch); 473 } else 474 { 475 char *p = buf; 476 if (ch >= 0x80000000) 477 ch = 0xFFFD; /* REPLACEMENT CHARACTER */ 478 put_wchar(&p, ch); 479 *p = '\0'; 480 } 481 return (buf); 482 } 483 484 /* 485 * Get the length of a UTF-8 character in bytes. 486 */ 487 public int 488 utf_len(ch) 489 unsigned char ch; 490 { 491 if ((ch & 0x80) == 0) 492 return 1; 493 if ((ch & 0xE0) == 0xC0) 494 return 2; 495 if ((ch & 0xF0) == 0xE0) 496 return 3; 497 if ((ch & 0xF8) == 0xF0) 498 return 4; 499 if ((ch & 0xFC) == 0xF8) 500 return 5; 501 if ((ch & 0xFE) == 0xFC) 502 return 6; 503 /* Invalid UTF-8 encoding. */ 504 return 1; 505 } 506 507 /* 508 * Does the parameter point to the lead byte of a well-formed UTF-8 character? 509 */ 510 public int 511 is_utf8_well_formed(ss, slen) 512 char *ss; 513 int slen; 514 { 515 int i; 516 int len; 517 unsigned char *s = (unsigned char *) ss; 518 519 if (IS_UTF8_INVALID(s[0])) 520 return (0); 521 522 len = utf_len(s[0]); 523 if (len > slen) 524 return (0); 525 if (len == 1) 526 return (1); 527 if (len == 2) 528 { 529 if (s[0] < 0xC2) 530 return (0); 531 } else 532 { 533 unsigned char mask; 534 mask = (~((1 << (8-len)) - 1)) & 0xFF; 535 if (s[0] == mask && (s[1] & mask) == 0x80) 536 return (0); 537 } 538 539 for (i = 1; i < len; i++) 540 if (!IS_UTF8_TRAIL(s[i])) 541 return (0); 542 return (1); 543 } 544 545 /* 546 * Return number of invalid UTF-8 sequences found in a buffer. 547 */ 548 public int 549 utf_bin_count(data, len) 550 char *data; 551 int len; 552 { 553 int bin_count = 0; 554 while (len > 0) 555 { 556 if (is_utf8_well_formed(data, len)) 557 { 558 int clen = utf_len(*data & 0377); 559 data += clen; 560 len -= clen; 561 } else 562 { 563 /* Skip to next lead byte. */ 564 bin_count++; 565 do { 566 ++data; 567 --len; 568 } while (len > 0 && !IS_UTF8_LEAD(*data & 0377)); 569 } 570 } 571 return (bin_count); 572 } 573 574 /* 575 * Get the value of a UTF-8 character. 576 */ 577 public LWCHAR 578 get_wchar(p) 579 constant char *p; 580 { 581 switch (utf_len(p[0])) 582 { 583 case 1: 584 default: 585 /* 0xxxxxxx */ 586 return (LWCHAR) 587 (p[0] & 0xFF); 588 case 2: 589 /* 110xxxxx 10xxxxxx */ 590 return (LWCHAR) ( 591 ((p[0] & 0x1F) << 6) | 592 (p[1] & 0x3F)); 593 case 3: 594 /* 1110xxxx 10xxxxxx 10xxxxxx */ 595 return (LWCHAR) ( 596 ((p[0] & 0x0F) << 12) | 597 ((p[1] & 0x3F) << 6) | 598 (p[2] & 0x3F)); 599 case 4: 600 /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */ 601 return (LWCHAR) ( 602 ((p[0] & 0x07) << 18) | 603 ((p[1] & 0x3F) << 12) | 604 ((p[2] & 0x3F) << 6) | 605 (p[3] & 0x3F)); 606 case 5: 607 /* 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx */ 608 return (LWCHAR) ( 609 ((p[0] & 0x03) << 24) | 610 ((p[1] & 0x3F) << 18) | 611 ((p[2] & 0x3F) << 12) | 612 ((p[3] & 0x3F) << 6) | 613 (p[4] & 0x3F)); 614 case 6: 615 /* 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx */ 616 return (LWCHAR) ( 617 ((p[0] & 0x01) << 30) | 618 ((p[1] & 0x3F) << 24) | 619 ((p[2] & 0x3F) << 18) | 620 ((p[3] & 0x3F) << 12) | 621 ((p[4] & 0x3F) << 6) | 622 (p[5] & 0x3F)); 623 } 624 } 625 626 /* 627 * Store a character into a UTF-8 string. 628 */ 629 public void 630 put_wchar(pp, ch) 631 char **pp; 632 LWCHAR ch; 633 { 634 if (!utf_mode || ch < 0x80) 635 { 636 /* 0xxxxxxx */ 637 *(*pp)++ = (char) ch; 638 } else if (ch < 0x800) 639 { 640 /* 110xxxxx 10xxxxxx */ 641 *(*pp)++ = (char) (0xC0 | ((ch >> 6) & 0x1F)); 642 *(*pp)++ = (char) (0x80 | (ch & 0x3F)); 643 } else if (ch < 0x10000) 644 { 645 /* 1110xxxx 10xxxxxx 10xxxxxx */ 646 *(*pp)++ = (char) (0xE0 | ((ch >> 12) & 0x0F)); 647 *(*pp)++ = (char) (0x80 | ((ch >> 6) & 0x3F)); 648 *(*pp)++ = (char) (0x80 | (ch & 0x3F)); 649 } else if (ch < 0x200000) 650 { 651 /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */ 652 *(*pp)++ = (char) (0xF0 | ((ch >> 18) & 0x07)); 653 *(*pp)++ = (char) (0x80 | ((ch >> 12) & 0x3F)); 654 *(*pp)++ = (char) (0x80 | ((ch >> 6) & 0x3F)); 655 *(*pp)++ = (char) (0x80 | (ch & 0x3F)); 656 } else if (ch < 0x4000000) 657 { 658 /* 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx */ 659 *(*pp)++ = (char) (0xF0 | ((ch >> 24) & 0x03)); 660 *(*pp)++ = (char) (0x80 | ((ch >> 18) & 0x3F)); 661 *(*pp)++ = (char) (0x80 | ((ch >> 12) & 0x3F)); 662 *(*pp)++ = (char) (0x80 | ((ch >> 6) & 0x3F)); 663 *(*pp)++ = (char) (0x80 | (ch & 0x3F)); 664 } else 665 { 666 /* 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx */ 667 *(*pp)++ = (char) (0xF0 | ((ch >> 30) & 0x01)); 668 *(*pp)++ = (char) (0x80 | ((ch >> 24) & 0x3F)); 669 *(*pp)++ = (char) (0x80 | ((ch >> 18) & 0x3F)); 670 *(*pp)++ = (char) (0x80 | ((ch >> 12) & 0x3F)); 671 *(*pp)++ = (char) (0x80 | ((ch >> 6) & 0x3F)); 672 *(*pp)++ = (char) (0x80 | (ch & 0x3F)); 673 } 674 } 675 676 /* 677 * Step forward or backward one character in a string. 678 */ 679 public LWCHAR 680 step_char(pp, dir, limit) 681 char **pp; 682 signed int dir; 683 constant char *limit; 684 { 685 LWCHAR ch; 686 int len; 687 char *p = *pp; 688 689 if (!utf_mode) 690 { 691 /* It's easy if chars are one byte. */ 692 if (dir > 0) 693 ch = (LWCHAR) ((p < limit) ? *p++ : 0); 694 else 695 ch = (LWCHAR) ((p > limit) ? *--p : 0); 696 } else if (dir > 0) 697 { 698 len = utf_len(*p); 699 if (p + len > limit) 700 { 701 ch = 0; 702 p = (char *) limit; 703 } else 704 { 705 ch = get_wchar(p); 706 p += len; 707 } 708 } else 709 { 710 while (p > limit && IS_UTF8_TRAIL(p[-1])) 711 p--; 712 if (p > limit) 713 ch = get_wchar(--p); 714 else 715 ch = 0; 716 } 717 *pp = p; 718 return ch; 719 } 720 721 /* 722 * Unicode characters data 723 * Actual data is in the generated *.uni files. 724 */ 725 726 #define DECLARE_RANGE_TABLE_START(name) \ 727 static struct wchar_range name##_array[] = { 728 #define DECLARE_RANGE_TABLE_END(name) \ 729 }; struct wchar_range_table name##_table = { name##_array, sizeof(name##_array)/sizeof(*name##_array) }; 730 731 DECLARE_RANGE_TABLE_START(compose) 732 #include "compose.uni" 733 DECLARE_RANGE_TABLE_END(compose) 734 735 DECLARE_RANGE_TABLE_START(ubin) 736 #include "ubin.uni" 737 DECLARE_RANGE_TABLE_END(ubin) 738 739 DECLARE_RANGE_TABLE_START(wide) 740 #include "wide.uni" 741 DECLARE_RANGE_TABLE_END(wide) 742 743 /* comb_table is special pairs, not ranges. */ 744 static struct wchar_range comb_table[] = { 745 {0x0644,0x0622}, {0x0644,0x0623}, {0x0644,0x0625}, {0x0644,0x0627}, 746 }; 747 748 749 static int 750 is_in_table(ch, table) 751 LWCHAR ch; 752 struct wchar_range_table *table; 753 { 754 int hi; 755 int lo; 756 757 /* Binary search in the table. */ 758 if (ch < table->table[0].first) 759 return 0; 760 lo = 0; 761 hi = table->count - 1; 762 while (lo <= hi) 763 { 764 int mid = (lo + hi) / 2; 765 if (ch > table->table[mid].last) 766 lo = mid + 1; 767 else if (ch < table->table[mid].first) 768 hi = mid - 1; 769 else 770 return 1; 771 } 772 return 0; 773 } 774 775 /* 776 * Is a character a UTF-8 composing character? 777 * If a composing character follows any char, the two combine into one glyph. 778 */ 779 public int 780 is_composing_char(ch) 781 LWCHAR ch; 782 { 783 return is_in_table(ch, &compose_table); 784 } 785 786 /* 787 * Should this UTF-8 character be treated as binary? 788 */ 789 public int 790 is_ubin_char(ch) 791 LWCHAR ch; 792 { 793 return is_in_table(ch, &ubin_table); 794 } 795 796 /* 797 * Is this a double width UTF-8 character? 798 */ 799 public int 800 is_wide_char(ch) 801 LWCHAR ch; 802 { 803 return is_in_table(ch, &wide_table); 804 } 805 806 /* 807 * Is a character a UTF-8 combining character? 808 * A combining char acts like an ordinary char, but if it follows 809 * a specific char (not any char), the two combine into one glyph. 810 */ 811 public int 812 is_combining_char(ch1, ch2) 813 LWCHAR ch1; 814 LWCHAR ch2; 815 { 816 /* The table is small; use linear search. */ 817 int i; 818 for (i = 0; i < sizeof(comb_table)/sizeof(*comb_table); i++) 819 { 820 if (ch1 == comb_table[i].first && 821 ch2 == comb_table[i].last) 822 return 1; 823 } 824 return 0; 825 } 826 827