charset.c (009e81b16465ea457c0e63fd49fe77f47cc27a5a) charset.c (1ea316270f1f75922ac53976d5d8808a41442f46)
1/*
2 * Copyright (C) 1984-2015 Mark Nudelman
3 *
4 * You may distribute under the terms of either the GNU General Public
5 * License or the Less License, as specified in the README file.
6 *
7 * For more information, see the README file.
8 */

--- 116 unchanged lines hidden (view full) ---

125 * repetition of the letter.
126 *
127 * Each letter is one of:
128 * . normal character
129 * b binary character
130 * c control character
131 */
132 static void
1/*
2 * Copyright (C) 1984-2015 Mark Nudelman
3 *
4 * You may distribute under the terms of either the GNU General Public
5 * License or the Less License, as specified in the README file.
6 *
7 * For more information, see the README file.
8 */

--- 116 unchanged lines hidden (view full) ---

125 * repetition of the letter.
126 *
127 * Each letter is one of:
128 * . normal character
129 * b binary character
130 * c control character
131 */
132 static void
133ichardef(s)
134 char *s;
133ichardef(char *s)
135{
134{
136 register char *cp;
137 register int n;
138 register char v;
135 char *cp;
136 int n;
137 char v;
139
140 n = 0;
141 v = 0;
142 cp = chardef;
143 while (*s != '\0')
144 {
145 switch (*s++)
146 {

--- 35 unchanged lines hidden (view full) ---

182 *cp++ = v;
183}
184
185/*
186 * Define a charset, given a charset name.
187 * The valid charset names are listed in the "charsets" array.
188 */
189 static int
138
139 n = 0;
140 v = 0;
141 cp = chardef;
142 while (*s != '\0')
143 {
144 switch (*s++)
145 {

--- 35 unchanged lines hidden (view full) ---

181 *cp++ = v;
182}
183
184/*
185 * Define a charset, given a charset name.
186 * The valid charset names are listed in the "charsets" array.
187 */
188 static int
190icharset(name, no_error)
191 register char *name;
192 int no_error;
189icharset(char *name, int no_error)
193{
190{
194 register struct charset *p;
195 register struct cs_alias *a;
191 struct charset *p;
192 struct cs_alias *a;
196
197 if (name == NULL || *name == '\0')
198 return (0);
199
200 /* First see if the name is an alias. */
201 for (a = cs_aliases; a->name != NULL; a++)
202 {
203 if (strcmp(name, a->name) == 0)

--- 21 unchanged lines hidden (view full) ---

225 return (0);
226}
227
228#if HAVE_LOCALE
229/*
230 * Define a charset, given a locale name.
231 */
232 static void
193
194 if (name == NULL || *name == '\0')
195 return (0);
196
197 /* First see if the name is an alias. */
198 for (a = cs_aliases; a->name != NULL; a++)
199 {
200 if (strcmp(name, a->name) == 0)

--- 21 unchanged lines hidden (view full) ---

222 return (0);
223}
224
225#if HAVE_LOCALE
226/*
227 * Define a charset, given a locale name.
228 */
229 static void
233ilocale()
230ilocale(void)
234{
231{
235 register int c;
232 int c;
236
237 for (c = 0; c < (int) sizeof(chardef); c++)
238 {
239 if (isprint(c))
240 chardef[c] = 0;
241 else if (iscntrl(c))
242 chardef[c] = IS_CONTROL_CHAR;
243 else
244 chardef[c] = IS_BINARY_CHAR|IS_CONTROL_CHAR;
245 }
246}
247#endif
248
249/*
250 * Define the printing format for control (or binary utf) chars.
251 */
252 static void
233
234 for (c = 0; c < (int) sizeof(chardef); c++)
235 {
236 if (isprint(c))
237 chardef[c] = 0;
238 else if (iscntrl(c))
239 chardef[c] = IS_CONTROL_CHAR;
240 else
241 chardef[c] = IS_BINARY_CHAR|IS_CONTROL_CHAR;
242 }
243}
244#endif
245
246/*
247 * Define the printing format for control (or binary utf) chars.
248 */
249 static void
253setbinfmt(s, fmtvarptr, default_fmt)
254 char *s;
255 char **fmtvarptr;
256 char *default_fmt;
250setbinfmt(char *s, char **fmtvarptr, char *default_fmt)
257{
258 if (s && utf_mode)
259 {
260 /* It would be too hard to account for width otherwise. */
261 char *t = s;
262 while (*t)
263 {
264 if (*t < ' ' || *t > '~')

--- 29 unchanged lines hidden (view full) ---

294 }
295 *fmtvarptr = s;
296}
297
298/*
299 *
300 */
301 static void
251{
252 if (s && utf_mode)
253 {
254 /* It would be too hard to account for width otherwise. */
255 char *t = s;
256 while (*t)
257 {
258 if (*t < ' ' || *t > '~')

--- 29 unchanged lines hidden (view full) ---

288 }
289 *fmtvarptr = s;
290}
291
292/*
293 *
294 */
295 static void
302set_charset()
296set_charset(void)
303{
304 char *s;
305
306 /*
307 * See if environment variable LESSCHARSET is defined.
308 */
309 s = lgetenv("LESSCHARSET");
310 if (icharset(s, 0))

--- 54 unchanged lines hidden (view full) ---

365#endif
366#endif
367}
368
369/*
370 * Initialize charset data structures.
371 */
372 public void
297{
298 char *s;
299
300 /*
301 * See if environment variable LESSCHARSET is defined.
302 */
303 s = lgetenv("LESSCHARSET");
304 if (icharset(s, 0))

--- 54 unchanged lines hidden (view full) ---

359#endif
360#endif
361}
362
363/*
364 * Initialize charset data structures.
365 */
366 public void
373init_charset()
367init_charset(void)
374{
375 char *s;
376
377#if HAVE_LOCALE
378 setlocale(LC_ALL, "");
379#endif
380
381 set_charset();

--- 4 unchanged lines hidden (view full) ---

386 s = lgetenv("LESSUTFBINFMT");
387 setbinfmt(s, &utfbinfmt, "<U+%04lX>");
388}
389
390/*
391 * Is a given character a "binary" character?
392 */
393 public int
368{
369 char *s;
370
371#if HAVE_LOCALE
372 setlocale(LC_ALL, "");
373#endif
374
375 set_charset();

--- 4 unchanged lines hidden (view full) ---

380 s = lgetenv("LESSUTFBINFMT");
381 setbinfmt(s, &utfbinfmt, "<U+%04lX>");
382}
383
384/*
385 * Is a given character a "binary" character?
386 */
387 public int
394binary_char(c)
395 LWCHAR c;
388binary_char(LWCHAR c)
396{
397 if (utf_mode)
398 return (is_ubin_char(c));
399 c &= 0377;
400 return (chardef[c] & IS_BINARY_CHAR);
401}
402
403/*
404 * Is a given character a "control" character?
405 */
406 public int
389{
390 if (utf_mode)
391 return (is_ubin_char(c));
392 c &= 0377;
393 return (chardef[c] & IS_BINARY_CHAR);
394}
395
396/*
397 * Is a given character a "control" character?
398 */
399 public int
407control_char(c)
408 LWCHAR c;
400control_char(LWCHAR c)
409{
410 c &= 0377;
411 return (chardef[c] & IS_CONTROL_CHAR);
412}
413
414/*
415 * Return the printable form of a character.
416 * For example, in the "ascii" charset '\3' is printed as "^C".
417 */
418 public char *
401{
402 c &= 0377;
403 return (chardef[c] & IS_CONTROL_CHAR);
404}
405
406/*
407 * Return the printable form of a character.
408 * For example, in the "ascii" charset '\3' is printed as "^C".
409 */
410 public char *
419prchar(c)
420 LWCHAR c;
411prchar(LWCHAR c)
421{
422 /* {{ This buffer can be overrun if LESSBINFMT is a long string. }} */
423 static char buf[32];
424
425 c &= 0377;
426 if ((c < 128 || !utf_mode) && !control_char(c))
427 SNPRINTF1(buf, sizeof(buf), "%c", (int) c);
428 else if (c == ESC)

--- 18 unchanged lines hidden (view full) ---

447 SNPRINTF1(buf, sizeof(buf), binfmt, c);
448 return (buf);
449}
450
451/*
452 * Return the printable form of a UTF-8 character.
453 */
454 public char *
412{
413 /* {{ This buffer can be overrun if LESSBINFMT is a long string. }} */
414 static char buf[32];
415
416 c &= 0377;
417 if ((c < 128 || !utf_mode) && !control_char(c))
418 SNPRINTF1(buf, sizeof(buf), "%c", (int) c);
419 else if (c == ESC)

--- 18 unchanged lines hidden (view full) ---

438 SNPRINTF1(buf, sizeof(buf), binfmt, c);
439 return (buf);
440}
441
442/*
443 * Return the printable form of a UTF-8 character.
444 */
445 public char *
455prutfchar(ch)
456 LWCHAR ch;
446prutfchar(LWCHAR ch)
457{
458 static char buf[32];
459
460 if (ch == ESC)
461 strcpy(buf, "ESC");
462 else if (ch < 128 && control_char(ch))
463 {
464 if (!control_char(ch ^ 0100))

--- 13 unchanged lines hidden (view full) ---

478 }
479 return (buf);
480}
481
482/*
483 * Get the length of a UTF-8 character in bytes.
484 */
485 public int
447{
448 static char buf[32];
449
450 if (ch == ESC)
451 strcpy(buf, "ESC");
452 else if (ch < 128 && control_char(ch))
453 {
454 if (!control_char(ch ^ 0100))

--- 13 unchanged lines hidden (view full) ---

468 }
469 return (buf);
470}
471
472/*
473 * Get the length of a UTF-8 character in bytes.
474 */
475 public int
486utf_len(ch)
487 char ch;
476utf_len(char ch)
488{
489 if ((ch & 0x80) == 0)
490 return 1;
491 if ((ch & 0xE0) == 0xC0)
492 return 2;
493 if ((ch & 0xF0) == 0xE0)
494 return 3;
495 if ((ch & 0xF8) == 0xF0)

--- 5 unchanged lines hidden (view full) ---

501 /* Invalid UTF-8 encoding. */
502 return 1;
503}
504
505/*
506 * Does the parameter point to the lead byte of a well-formed UTF-8 character?
507 */
508 public int
477{
478 if ((ch & 0x80) == 0)
479 return 1;
480 if ((ch & 0xE0) == 0xC0)
481 return 2;
482 if ((ch & 0xF0) == 0xE0)
483 return 3;
484 if ((ch & 0xF8) == 0xF0)

--- 5 unchanged lines hidden (view full) ---

490 /* Invalid UTF-8 encoding. */
491 return 1;
492}
493
494/*
495 * Does the parameter point to the lead byte of a well-formed UTF-8 character?
496 */
497 public int
509is_utf8_well_formed(s, slen)
510 unsigned char *s;
511 int slen;
498is_utf8_well_formed(unsigned char *s, int slen)
512{
513 int i;
514 int len;
515
516 if (IS_UTF8_INVALID(s[0]))
517 return (0);
518
519 len = utf_len((char) s[0]);

--- 18 unchanged lines hidden (view full) ---

538 return (0);
539 return (1);
540}
541
542/*
543 * Return number of invalid UTF-8 sequences found in a buffer.
544 */
545 public int
499{
500 int i;
501 int len;
502
503 if (IS_UTF8_INVALID(s[0]))
504 return (0);
505
506 len = utf_len((char) s[0]);

--- 18 unchanged lines hidden (view full) ---

525 return (0);
526 return (1);
527}
528
529/*
530 * Return number of invalid UTF-8 sequences found in a buffer.
531 */
532 public int
546utf_bin_count(data, len)
547 unsigned char *data;
548 int len;
533utf_bin_count(unsigned char *data, int len)
549{
550 int bin_count = 0;
551 while (len > 0)
552 {
553 if (is_utf8_well_formed(data, len))
554 {
555 int clen = utf_len(*data);
556 data += clen;

--- 10 unchanged lines hidden (view full) ---

567 }
568 return (bin_count);
569}
570
571/*
572 * Get the value of a UTF-8 character.
573 */
574 public LWCHAR
534{
535 int bin_count = 0;
536 while (len > 0)
537 {
538 if (is_utf8_well_formed(data, len))
539 {
540 int clen = utf_len(*data);
541 data += clen;

--- 10 unchanged lines hidden (view full) ---

552 }
553 return (bin_count);
554}
555
556/*
557 * Get the value of a UTF-8 character.
558 */
559 public LWCHAR
575get_wchar(p)
576 char *p;
560get_wchar(constant char *p)
577{
578 switch (utf_len(p[0]))
579 {
580 case 1:
581 default:
582 /* 0xxxxxxx */
583 return (LWCHAR)
584 (p[0] & 0xFF);

--- 34 unchanged lines hidden (view full) ---

619 (p[5] & 0x3F));
620 }
621}
622
623/*
624 * Store a character into a UTF-8 string.
625 */
626 public void
561{
562 switch (utf_len(p[0]))
563 {
564 case 1:
565 default:
566 /* 0xxxxxxx */
567 return (LWCHAR)
568 (p[0] & 0xFF);

--- 34 unchanged lines hidden (view full) ---

603 (p[5] & 0x3F));
604 }
605}
606
607/*
608 * Store a character into a UTF-8 string.
609 */
610 public void
627put_wchar(pp, ch)
628 char **pp;
629 LWCHAR ch;
611put_wchar(char **pp, LWCHAR ch)
630{
631 if (!utf_mode || ch < 0x80)
632 {
633 /* 0xxxxxxx */
634 *(*pp)++ = (char) ch;
635 } else if (ch < 0x800)
636 {
637 /* 110xxxxx 10xxxxxx */

--- 31 unchanged lines hidden (view full) ---

669 *(*pp)++ = (char) (0x80 | (ch & 0x3F));
670 }
671}
672
673/*
674 * Step forward or backward one character in a string.
675 */
676 public LWCHAR
612{
613 if (!utf_mode || ch < 0x80)
614 {
615 /* 0xxxxxxx */
616 *(*pp)++ = (char) ch;
617 } else if (ch < 0x800)
618 {
619 /* 110xxxxx 10xxxxxx */

--- 31 unchanged lines hidden (view full) ---

651 *(*pp)++ = (char) (0x80 | (ch & 0x3F));
652 }
653}
654
655/*
656 * Step forward or backward one character in a string.
657 */
658 public LWCHAR
677step_char(pp, dir, limit)
678 char **pp;
679 signed int dir;
680 char *limit;
659step_char(constant char **pp, signed int dir, constant char *limit)
681{
682 LWCHAR ch;
683 int len;
660{
661 LWCHAR ch;
662 int len;
684 char *p = *pp;
663 constant char *p = *pp;
685
686 if (!utf_mode)
687 {
688 /* It's easy if chars are one byte. */
689 if (dir > 0)
690 ch = (LWCHAR) ((p < limit) ? *p++ : 0);
691 else
692 ch = (LWCHAR) ((p > limit) ? *--p : 0);

--- 46 unchanged lines hidden (view full) ---

739
740/* comb_table is special pairs, not ranges. */
741static struct wchar_range comb_table[] = {
742 {0x0644,0x0622}, {0x0644,0x0623}, {0x0644,0x0625}, {0x0644,0x0627},
743};
744
745
746 static int
664
665 if (!utf_mode)
666 {
667 /* It's easy if chars are one byte. */
668 if (dir > 0)
669 ch = (LWCHAR) ((p < limit) ? *p++ : 0);
670 else
671 ch = (LWCHAR) ((p > limit) ? *--p : 0);

--- 46 unchanged lines hidden (view full) ---

718
719/* comb_table is special pairs, not ranges. */
720static struct wchar_range comb_table[] = {
721 {0x0644,0x0622}, {0x0644,0x0623}, {0x0644,0x0625}, {0x0644,0x0627},
722};
723
724
725 static int
747is_in_table(ch, table)
748 LWCHAR ch;
749 struct wchar_range_table *table;
726is_in_table(LWCHAR ch, struct wchar_range_table *table)
750{
751 int hi;
752 int lo;
753
754 /* Binary search in the table. */
755 if (ch < table->table[0].first)
756 return 0;
757 lo = 0;

--- 11 unchanged lines hidden (view full) ---

769 return 0;
770}
771
772/*
773 * Is a character a UTF-8 composing character?
774 * If a composing character follows any char, the two combine into one glyph.
775 */
776 public int
727{
728 int hi;
729 int lo;
730
731 /* Binary search in the table. */
732 if (ch < table->table[0].first)
733 return 0;
734 lo = 0;

--- 11 unchanged lines hidden (view full) ---

746 return 0;
747}
748
749/*
750 * Is a character a UTF-8 composing character?
751 * If a composing character follows any char, the two combine into one glyph.
752 */
753 public int
777is_composing_char(ch)
778 LWCHAR ch;
754is_composing_char(LWCHAR ch)
779{
780 return is_in_table(ch, &compose_table);
781}
782
783/*
784 * Should this UTF-8 character be treated as binary?
785 */
786 public int
755{
756 return is_in_table(ch, &compose_table);
757}
758
759/*
760 * Should this UTF-8 character be treated as binary?
761 */
762 public int
787is_ubin_char(ch)
788 LWCHAR ch;
763is_ubin_char(LWCHAR ch)
789{
790 return is_in_table(ch, &ubin_table);
791}
792
793/*
794 * Is this a double width UTF-8 character?
795 */
796 public int
764{
765 return is_in_table(ch, &ubin_table);
766}
767
768/*
769 * Is this a double width UTF-8 character?
770 */
771 public int
797is_wide_char(ch)
798 LWCHAR ch;
772is_wide_char(LWCHAR ch)
799{
800 return is_in_table(ch, &wide_table);
801}
802
803/*
804 * Is a character a UTF-8 combining character?
805 * A combining char acts like an ordinary char, but if it follows
806 * a specific char (not any char), the two combine into one glyph.
807 */
808 public int
773{
774 return is_in_table(ch, &wide_table);
775}
776
777/*
778 * Is a character a UTF-8 combining character?
779 * A combining char acts like an ordinary char, but if it follows
780 * a specific char (not any char), the two combine into one glyph.
781 */
782 public int
809is_combining_char(ch1, ch2)
810 LWCHAR ch1;
811 LWCHAR ch2;
783is_combining_char(LWCHAR ch1, LWCHAR ch2)
812{
813 /* The table is small; use linear search. */
814 int i;
815 for (i = 0; i < sizeof(comb_table)/sizeof(*comb_table); i++)
816 {
817 if (ch1 == comb_table[i].first &&
818 ch2 == comb_table[i].last)
819 return 1;
820 }
821 return 0;
822}
823
784{
785 /* The table is small; use linear search. */
786 int i;
787 for (i = 0; i < sizeof(comb_table)/sizeof(*comb_table); i++)
788 {
789 if (ch1 == comb_table[i].first &&
790 ch2 == comb_table[i].last)
791 return 1;
792 }
793 return 0;
794}
795