charset.c (009e81b16465ea457c0e63fd49fe77f47cc27a5a) | charset.c (1ea316270f1f75922ac53976d5d8808a41442f46) |
---|---|
1/* 2 * Copyright (C) 1984-2015 Mark Nudelman 3 * 4 * You may distribute under the terms of either the GNU General Public 5 * License or the Less License, as specified in the README file. 6 * 7 * For more information, see the README file. 8 */ --- 116 unchanged lines hidden (view full) --- 125 * repetition of the letter. 126 * 127 * Each letter is one of: 128 * . normal character 129 * b binary character 130 * c control character 131 */ 132 static void | 1/* 2 * Copyright (C) 1984-2015 Mark Nudelman 3 * 4 * You may distribute under the terms of either the GNU General Public 5 * License or the Less License, as specified in the README file. 6 * 7 * For more information, see the README file. 8 */ --- 116 unchanged lines hidden (view full) --- 125 * repetition of the letter. 126 * 127 * Each letter is one of: 128 * . normal character 129 * b binary character 130 * c control character 131 */ 132 static void |
133ichardef(s) 134 char *s; | 133ichardef(char *s) |
135{ | 134{ |
136 register char *cp; 137 register int n; 138 register char v; | 135 char *cp; 136 int n; 137 char v; |
139 140 n = 0; 141 v = 0; 142 cp = chardef; 143 while (*s != '\0') 144 { 145 switch (*s++) 146 { --- 35 unchanged lines hidden (view full) --- 182 *cp++ = v; 183} 184 185/* 186 * Define a charset, given a charset name. 187 * The valid charset names are listed in the "charsets" array. 188 */ 189 static int | 138 139 n = 0; 140 v = 0; 141 cp = chardef; 142 while (*s != '\0') 143 { 144 switch (*s++) 145 { --- 35 unchanged lines hidden (view full) --- 181 *cp++ = v; 182} 183 184/* 185 * Define a charset, given a charset name. 186 * The valid charset names are listed in the "charsets" array. 187 */ 188 static int |
190icharset(name, no_error) 191 register char *name; 192 int no_error; | 189icharset(char *name, int no_error) |
193{ | 190{ |
194 register struct charset *p; 195 register struct cs_alias *a; | 191 struct charset *p; 192 struct cs_alias *a; |
196 197 if (name == NULL || *name == '\0') 198 return (0); 199 200 /* First see if the name is an alias. */ 201 for (a = cs_aliases; a->name != NULL; a++) 202 { 203 if (strcmp(name, a->name) == 0) --- 21 unchanged lines hidden (view full) --- 225 return (0); 226} 227 228#if HAVE_LOCALE 229/* 230 * Define a charset, given a locale name. 231 */ 232 static void | 193 194 if (name == NULL || *name == '\0') 195 return (0); 196 197 /* First see if the name is an alias. */ 198 for (a = cs_aliases; a->name != NULL; a++) 199 { 200 if (strcmp(name, a->name) == 0) --- 21 unchanged lines hidden (view full) --- 222 return (0); 223} 224 225#if HAVE_LOCALE 226/* 227 * Define a charset, given a locale name. 228 */ 229 static void |
233ilocale() | 230ilocale(void) |
234{ | 231{ |
235 register int c; | 232 int c; |
236 237 for (c = 0; c < (int) sizeof(chardef); c++) 238 { 239 if (isprint(c)) 240 chardef[c] = 0; 241 else if (iscntrl(c)) 242 chardef[c] = IS_CONTROL_CHAR; 243 else 244 chardef[c] = IS_BINARY_CHAR|IS_CONTROL_CHAR; 245 } 246} 247#endif 248 249/* 250 * Define the printing format for control (or binary utf) chars. 251 */ 252 static void | 233 234 for (c = 0; c < (int) sizeof(chardef); c++) 235 { 236 if (isprint(c)) 237 chardef[c] = 0; 238 else if (iscntrl(c)) 239 chardef[c] = IS_CONTROL_CHAR; 240 else 241 chardef[c] = IS_BINARY_CHAR|IS_CONTROL_CHAR; 242 } 243} 244#endif 245 246/* 247 * Define the printing format for control (or binary utf) chars. 248 */ 249 static void |
253setbinfmt(s, fmtvarptr, default_fmt) 254 char *s; 255 char **fmtvarptr; 256 char *default_fmt; | 250setbinfmt(char *s, char **fmtvarptr, char *default_fmt) |
257{ 258 if (s && utf_mode) 259 { 260 /* It would be too hard to account for width otherwise. */ 261 char *t = s; 262 while (*t) 263 { 264 if (*t < ' ' || *t > '~') --- 29 unchanged lines hidden (view full) --- 294 } 295 *fmtvarptr = s; 296} 297 298/* 299 * 300 */ 301 static void | 251{ 252 if (s && utf_mode) 253 { 254 /* It would be too hard to account for width otherwise. */ 255 char *t = s; 256 while (*t) 257 { 258 if (*t < ' ' || *t > '~') --- 29 unchanged lines hidden (view full) --- 288 } 289 *fmtvarptr = s; 290} 291 292/* 293 * 294 */ 295 static void |
302set_charset() | 296set_charset(void) |
303{ 304 char *s; 305 306 /* 307 * See if environment variable LESSCHARSET is defined. 308 */ 309 s = lgetenv("LESSCHARSET"); 310 if (icharset(s, 0)) --- 54 unchanged lines hidden (view full) --- 365#endif 366#endif 367} 368 369/* 370 * Initialize charset data structures. 371 */ 372 public void | 297{ 298 char *s; 299 300 /* 301 * See if environment variable LESSCHARSET is defined. 302 */ 303 s = lgetenv("LESSCHARSET"); 304 if (icharset(s, 0)) --- 54 unchanged lines hidden (view full) --- 359#endif 360#endif 361} 362 363/* 364 * Initialize charset data structures. 365 */ 366 public void |
373init_charset() | 367init_charset(void) |
374{ 375 char *s; 376 377#if HAVE_LOCALE 378 setlocale(LC_ALL, ""); 379#endif 380 381 set_charset(); --- 4 unchanged lines hidden (view full) --- 386 s = lgetenv("LESSUTFBINFMT"); 387 setbinfmt(s, &utfbinfmt, "<U+%04lX>"); 388} 389 390/* 391 * Is a given character a "binary" character? 392 */ 393 public int | 368{ 369 char *s; 370 371#if HAVE_LOCALE 372 setlocale(LC_ALL, ""); 373#endif 374 375 set_charset(); --- 4 unchanged lines hidden (view full) --- 380 s = lgetenv("LESSUTFBINFMT"); 381 setbinfmt(s, &utfbinfmt, "<U+%04lX>"); 382} 383 384/* 385 * Is a given character a "binary" character? 386 */ 387 public int |
394binary_char(c) 395 LWCHAR c; | 388binary_char(LWCHAR c) |
396{ 397 if (utf_mode) 398 return (is_ubin_char(c)); 399 c &= 0377; 400 return (chardef[c] & IS_BINARY_CHAR); 401} 402 403/* 404 * Is a given character a "control" character? 405 */ 406 public int | 389{ 390 if (utf_mode) 391 return (is_ubin_char(c)); 392 c &= 0377; 393 return (chardef[c] & IS_BINARY_CHAR); 394} 395 396/* 397 * Is a given character a "control" character? 398 */ 399 public int |
407control_char(c) 408 LWCHAR c; | 400control_char(LWCHAR c) |
409{ 410 c &= 0377; 411 return (chardef[c] & IS_CONTROL_CHAR); 412} 413 414/* 415 * Return the printable form of a character. 416 * For example, in the "ascii" charset '\3' is printed as "^C". 417 */ 418 public char * | 401{ 402 c &= 0377; 403 return (chardef[c] & IS_CONTROL_CHAR); 404} 405 406/* 407 * Return the printable form of a character. 408 * For example, in the "ascii" charset '\3' is printed as "^C". 409 */ 410 public char * |
419prchar(c) 420 LWCHAR c; | 411prchar(LWCHAR c) |
421{ 422 /* {{ This buffer can be overrun if LESSBINFMT is a long string. }} */ 423 static char buf[32]; 424 425 c &= 0377; 426 if ((c < 128 || !utf_mode) && !control_char(c)) 427 SNPRINTF1(buf, sizeof(buf), "%c", (int) c); 428 else if (c == ESC) --- 18 unchanged lines hidden (view full) --- 447 SNPRINTF1(buf, sizeof(buf), binfmt, c); 448 return (buf); 449} 450 451/* 452 * Return the printable form of a UTF-8 character. 453 */ 454 public char * | 412{ 413 /* {{ This buffer can be overrun if LESSBINFMT is a long string. }} */ 414 static char buf[32]; 415 416 c &= 0377; 417 if ((c < 128 || !utf_mode) && !control_char(c)) 418 SNPRINTF1(buf, sizeof(buf), "%c", (int) c); 419 else if (c == ESC) --- 18 unchanged lines hidden (view full) --- 438 SNPRINTF1(buf, sizeof(buf), binfmt, c); 439 return (buf); 440} 441 442/* 443 * Return the printable form of a UTF-8 character. 444 */ 445 public char * |
455prutfchar(ch) 456 LWCHAR ch; | 446prutfchar(LWCHAR ch) |
457{ 458 static char buf[32]; 459 460 if (ch == ESC) 461 strcpy(buf, "ESC"); 462 else if (ch < 128 && control_char(ch)) 463 { 464 if (!control_char(ch ^ 0100)) --- 13 unchanged lines hidden (view full) --- 478 } 479 return (buf); 480} 481 482/* 483 * Get the length of a UTF-8 character in bytes. 484 */ 485 public int | 447{ 448 static char buf[32]; 449 450 if (ch == ESC) 451 strcpy(buf, "ESC"); 452 else if (ch < 128 && control_char(ch)) 453 { 454 if (!control_char(ch ^ 0100)) --- 13 unchanged lines hidden (view full) --- 468 } 469 return (buf); 470} 471 472/* 473 * Get the length of a UTF-8 character in bytes. 474 */ 475 public int |
486utf_len(ch) 487 char ch; | 476utf_len(char ch) |
488{ 489 if ((ch & 0x80) == 0) 490 return 1; 491 if ((ch & 0xE0) == 0xC0) 492 return 2; 493 if ((ch & 0xF0) == 0xE0) 494 return 3; 495 if ((ch & 0xF8) == 0xF0) --- 5 unchanged lines hidden (view full) --- 501 /* Invalid UTF-8 encoding. */ 502 return 1; 503} 504 505/* 506 * Does the parameter point to the lead byte of a well-formed UTF-8 character? 507 */ 508 public int | 477{ 478 if ((ch & 0x80) == 0) 479 return 1; 480 if ((ch & 0xE0) == 0xC0) 481 return 2; 482 if ((ch & 0xF0) == 0xE0) 483 return 3; 484 if ((ch & 0xF8) == 0xF0) --- 5 unchanged lines hidden (view full) --- 490 /* Invalid UTF-8 encoding. */ 491 return 1; 492} 493 494/* 495 * Does the parameter point to the lead byte of a well-formed UTF-8 character? 496 */ 497 public int |
509is_utf8_well_formed(s, slen) 510 unsigned char *s; 511 int slen; | 498is_utf8_well_formed(unsigned char *s, int slen) |
512{ 513 int i; 514 int len; 515 516 if (IS_UTF8_INVALID(s[0])) 517 return (0); 518 519 len = utf_len((char) s[0]); --- 18 unchanged lines hidden (view full) --- 538 return (0); 539 return (1); 540} 541 542/* 543 * Return number of invalid UTF-8 sequences found in a buffer. 544 */ 545 public int | 499{ 500 int i; 501 int len; 502 503 if (IS_UTF8_INVALID(s[0])) 504 return (0); 505 506 len = utf_len((char) s[0]); --- 18 unchanged lines hidden (view full) --- 525 return (0); 526 return (1); 527} 528 529/* 530 * Return number of invalid UTF-8 sequences found in a buffer. 531 */ 532 public int |
546utf_bin_count(data, len) 547 unsigned char *data; 548 int len; | 533utf_bin_count(unsigned char *data, int len) |
549{ 550 int bin_count = 0; 551 while (len > 0) 552 { 553 if (is_utf8_well_formed(data, len)) 554 { 555 int clen = utf_len(*data); 556 data += clen; --- 10 unchanged lines hidden (view full) --- 567 } 568 return (bin_count); 569} 570 571/* 572 * Get the value of a UTF-8 character. 573 */ 574 public LWCHAR | 534{ 535 int bin_count = 0; 536 while (len > 0) 537 { 538 if (is_utf8_well_formed(data, len)) 539 { 540 int clen = utf_len(*data); 541 data += clen; --- 10 unchanged lines hidden (view full) --- 552 } 553 return (bin_count); 554} 555 556/* 557 * Get the value of a UTF-8 character. 558 */ 559 public LWCHAR |
575get_wchar(p) 576 char *p; | 560get_wchar(constant char *p) |
577{ 578 switch (utf_len(p[0])) 579 { 580 case 1: 581 default: 582 /* 0xxxxxxx */ 583 return (LWCHAR) 584 (p[0] & 0xFF); --- 34 unchanged lines hidden (view full) --- 619 (p[5] & 0x3F)); 620 } 621} 622 623/* 624 * Store a character into a UTF-8 string. 625 */ 626 public void | 561{ 562 switch (utf_len(p[0])) 563 { 564 case 1: 565 default: 566 /* 0xxxxxxx */ 567 return (LWCHAR) 568 (p[0] & 0xFF); --- 34 unchanged lines hidden (view full) --- 603 (p[5] & 0x3F)); 604 } 605} 606 607/* 608 * Store a character into a UTF-8 string. 609 */ 610 public void |
627put_wchar(pp, ch) 628 char **pp; 629 LWCHAR ch; | 611put_wchar(char **pp, LWCHAR ch) |
630{ 631 if (!utf_mode || ch < 0x80) 632 { 633 /* 0xxxxxxx */ 634 *(*pp)++ = (char) ch; 635 } else if (ch < 0x800) 636 { 637 /* 110xxxxx 10xxxxxx */ --- 31 unchanged lines hidden (view full) --- 669 *(*pp)++ = (char) (0x80 | (ch & 0x3F)); 670 } 671} 672 673/* 674 * Step forward or backward one character in a string. 675 */ 676 public LWCHAR | 612{ 613 if (!utf_mode || ch < 0x80) 614 { 615 /* 0xxxxxxx */ 616 *(*pp)++ = (char) ch; 617 } else if (ch < 0x800) 618 { 619 /* 110xxxxx 10xxxxxx */ --- 31 unchanged lines hidden (view full) --- 651 *(*pp)++ = (char) (0x80 | (ch & 0x3F)); 652 } 653} 654 655/* 656 * Step forward or backward one character in a string. 657 */ 658 public LWCHAR |
677step_char(pp, dir, limit) 678 char **pp; 679 signed int dir; 680 char *limit; | 659step_char(constant char **pp, signed int dir, constant char *limit) |
681{ 682 LWCHAR ch; 683 int len; | 660{ 661 LWCHAR ch; 662 int len; |
684 char *p = *pp; | 663 constant char *p = *pp; |
685 686 if (!utf_mode) 687 { 688 /* It's easy if chars are one byte. */ 689 if (dir > 0) 690 ch = (LWCHAR) ((p < limit) ? *p++ : 0); 691 else 692 ch = (LWCHAR) ((p > limit) ? *--p : 0); --- 46 unchanged lines hidden (view full) --- 739 740/* comb_table is special pairs, not ranges. */ 741static struct wchar_range comb_table[] = { 742 {0x0644,0x0622}, {0x0644,0x0623}, {0x0644,0x0625}, {0x0644,0x0627}, 743}; 744 745 746 static int | 664 665 if (!utf_mode) 666 { 667 /* It's easy if chars are one byte. */ 668 if (dir > 0) 669 ch = (LWCHAR) ((p < limit) ? *p++ : 0); 670 else 671 ch = (LWCHAR) ((p > limit) ? *--p : 0); --- 46 unchanged lines hidden (view full) --- 718 719/* comb_table is special pairs, not ranges. */ 720static struct wchar_range comb_table[] = { 721 {0x0644,0x0622}, {0x0644,0x0623}, {0x0644,0x0625}, {0x0644,0x0627}, 722}; 723 724 725 static int |
747is_in_table(ch, table) 748 LWCHAR ch; 749 struct wchar_range_table *table; | 726is_in_table(LWCHAR ch, struct wchar_range_table *table) |
750{ 751 int hi; 752 int lo; 753 754 /* Binary search in the table. */ 755 if (ch < table->table[0].first) 756 return 0; 757 lo = 0; --- 11 unchanged lines hidden (view full) --- 769 return 0; 770} 771 772/* 773 * Is a character a UTF-8 composing character? 774 * If a composing character follows any char, the two combine into one glyph. 775 */ 776 public int | 727{ 728 int hi; 729 int lo; 730 731 /* Binary search in the table. */ 732 if (ch < table->table[0].first) 733 return 0; 734 lo = 0; --- 11 unchanged lines hidden (view full) --- 746 return 0; 747} 748 749/* 750 * Is a character a UTF-8 composing character? 751 * If a composing character follows any char, the two combine into one glyph. 752 */ 753 public int |
777is_composing_char(ch) 778 LWCHAR ch; | 754is_composing_char(LWCHAR ch) |
779{ 780 return is_in_table(ch, &compose_table); 781} 782 783/* 784 * Should this UTF-8 character be treated as binary? 785 */ 786 public int | 755{ 756 return is_in_table(ch, &compose_table); 757} 758 759/* 760 * Should this UTF-8 character be treated as binary? 761 */ 762 public int |
787is_ubin_char(ch) 788 LWCHAR ch; | 763is_ubin_char(LWCHAR ch) |
789{ 790 return is_in_table(ch, &ubin_table); 791} 792 793/* 794 * Is this a double width UTF-8 character? 795 */ 796 public int | 764{ 765 return is_in_table(ch, &ubin_table); 766} 767 768/* 769 * Is this a double width UTF-8 character? 770 */ 771 public int |
797is_wide_char(ch) 798 LWCHAR ch; | 772is_wide_char(LWCHAR ch) |
799{ 800 return is_in_table(ch, &wide_table); 801} 802 803/* 804 * Is a character a UTF-8 combining character? 805 * A combining char acts like an ordinary char, but if it follows 806 * a specific char (not any char), the two combine into one glyph. 807 */ 808 public int | 773{ 774 return is_in_table(ch, &wide_table); 775} 776 777/* 778 * Is a character a UTF-8 combining character? 779 * A combining char acts like an ordinary char, but if it follows 780 * a specific char (not any char), the two combine into one glyph. 781 */ 782 public int |
809is_combining_char(ch1, ch2) 810 LWCHAR ch1; 811 LWCHAR ch2; | 783is_combining_char(LWCHAR ch1, LWCHAR ch2) |
812{ 813 /* The table is small; use linear search. */ 814 int i; 815 for (i = 0; i < sizeof(comb_table)/sizeof(*comb_table); i++) 816 { 817 if (ch1 == comb_table[i].first && 818 ch2 == comb_table[i].last) 819 return 1; 820 } 821 return 0; 822} 823 | 784{ 785 /* The table is small; use linear search. */ 786 int i; 787 for (i = 0; i < sizeof(comb_table)/sizeof(*comb_table); i++) 788 { 789 if (ch1 == comb_table[i].first && 790 ch2 == comb_table[i].last) 791 return 1; 792 } 793 return 0; 794} 795 |