charset.c (7e1b7636c894be9d1130c284089ce1ea0786ecec) charset.c (b2ea244070ff84eab79e04befb7aa30c982fc84d)
1/*
2 * Copyright (C) 1984-2017 Mark Nudelman
3 *
4 * You may distribute under the terms of either the GNU General Public
5 * License or the Less License, as specified in the README file.
6 *
7 * For more information, see the README file.
8 */

--- 8 unchanged lines hidden (view full) ---

17#if HAVE_LOCALE
18#include <locale.h>
19#include <ctype.h>
20#include <langinfo.h>
21#endif
22
23#include "charset.h"
24
1/*
2 * Copyright (C) 1984-2017 Mark Nudelman
3 *
4 * You may distribute under the terms of either the GNU General Public
5 * License or the Less License, as specified in the README file.
6 *
7 * For more information, see the README file.
8 */

--- 8 unchanged lines hidden (view full) ---

17#if HAVE_LOCALE
18#include <locale.h>
19#include <ctype.h>
20#include <langinfo.h>
21#endif
22
23#include "charset.h"
24
25#if MSDOS_COMPILER==WIN32C
26#define WIN32_LEAN_AND_MEAN
27#include <windows.h>
28#endif
29
30extern int bs_mode;
31
25public int utf_mode = 0;
26
27/*
28 * Predefined character sets,
29 * selected by the LESSCHARSET environment variable.
30 */
31struct charset {
32 char *name;

--- 177 unchanged lines hidden (view full) ---

210 }
211
212 for (p = charsets; p->name != NULL; p++)
213 {
214 if (strcmp(name, p->name) == 0)
215 {
216 ichardef(p->desc);
217 if (p->p_flag != NULL)
32public int utf_mode = 0;
33
34/*
35 * Predefined character sets,
36 * selected by the LESSCHARSET environment variable.
37 */
38struct charset {
39 char *name;

--- 177 unchanged lines hidden (view full) ---

217 }
218
219 for (p = charsets; p->name != NULL; p++)
220 {
221 if (strcmp(name, p->name) == 0)
222 {
223 ichardef(p->desc);
224 if (p->p_flag != NULL)
225 {
226#if MSDOS_COMPILER==WIN32C
227 *(p->p_flag) = 1 + (GetConsoleOutputCP() != CP_UTF8);
228#else
218 *(p->p_flag) = 1;
229 *(p->p_flag) = 1;
230#endif
231 }
219 return (1);
220 }
221 }
222
223 if (!no_error) {
224 error("invalid charset name", NULL_PARG);
225 quit(QUIT_ERROR);
226 }

--- 19 unchanged lines hidden (view full) ---

246 chardef[c] = IS_BINARY_CHAR|IS_CONTROL_CHAR;
247 }
248}
249#endif
250
251/*
252 * Define the printing format for control (or binary utf) chars.
253 */
232 return (1);
233 }
234 }
235
236 if (!no_error) {
237 error("invalid charset name", NULL_PARG);
238 quit(QUIT_ERROR);
239 }

--- 19 unchanged lines hidden (view full) ---

259 chardef[c] = IS_BINARY_CHAR|IS_CONTROL_CHAR;
260 }
261}
262#endif
263
264/*
265 * Define the printing format for control (or binary utf) chars.
266 */
254 static void
255setbinfmt(s, fmtvarptr, default_fmt)
267 public void
268setfmt(s, fmtvarptr, attrptr, default_fmt)
256 char *s;
257 char **fmtvarptr;
269 char *s;
270 char **fmtvarptr;
271 int *attrptr;
258 char *default_fmt;
259{
260 if (s && utf_mode)
261 {
262 /* It would be too hard to account for width otherwise. */
272 char *default_fmt;
273{
274 if (s && utf_mode)
275 {
276 /* It would be too hard to account for width otherwise. */
263 char *t = s;
277 char constant *t = s;
264 while (*t)
265 {
266 if (*t < ' ' || *t > '~')
267 {
268 s = default_fmt;
269 goto attr;
270 }
271 t++;

--- 5 unchanged lines hidden (view full) ---

277 (*s == '*' && (s[1] == '\0' || s[2] == '\0' || strchr(s + 2, 'n'))) ||
278 (*s != '*' && strchr(s, 'n')))
279 s = default_fmt;
280
281 /*
282 * Select the attributes if it starts with "*".
283 */
284 attr:
278 while (*t)
279 {
280 if (*t < ' ' || *t > '~')
281 {
282 s = default_fmt;
283 goto attr;
284 }
285 t++;

--- 5 unchanged lines hidden (view full) ---

291 (*s == '*' && (s[1] == '\0' || s[2] == '\0' || strchr(s + 2, 'n'))) ||
292 (*s != '*' && strchr(s, 'n')))
293 s = default_fmt;
294
295 /*
296 * Select the attributes if it starts with "*".
297 */
298 attr:
285 if (*s == '*')
299 if (*s == '*' && s[1] != '\0')
286 {
287 switch (s[1])
288 {
300 {
301 switch (s[1])
302 {
289 case 'd': binattr = AT_BOLD; break;
290 case 'k': binattr = AT_BLINK; break;
291 case 's': binattr = AT_STANDOUT; break;
292 case 'u': binattr = AT_UNDERLINE; break;
293 default: binattr = AT_NORMAL; break;
303 case 'd': *attrptr = AT_BOLD; break;
304 case 'k': *attrptr = AT_BLINK; break;
305 case 's': *attrptr = AT_STANDOUT; break;
306 case 'u': *attrptr = AT_UNDERLINE; break;
307 default: *attrptr = AT_NORMAL; break;
294 }
295 s += 2;
296 }
297 *fmtvarptr = s;
298}
299
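300/*
301 * Choose the character set: LESSCHARSET is tried first, then LESSCHARDEF; later fallbacks include the locale and a built-in default.
302 */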
303 static void
304set_charset()
305{
306 char *s;
307
308 }
309 s += 2;
310 }
311 *fmtvarptr = s;
312}
313
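314/*
315 * Choose the character set: a UTF-8 Windows console wins outright; otherwise LESSCHARSET is tried, then LESSCHARDEF; later fallbacks include the locale and a built-in default.
316 */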
317 static void
318set_charset()
319{
320 char *s;
321
322#if MSDOS_COMPILER==WIN32C
308 /*
323 /*
324 * If the Windows console is using UTF-8, we'll use it too.
325 */
326 if (GetConsoleOutputCP() == CP_UTF8)
327 if (icharset("utf-8", 1))
328 return;
329#endif
330 /*
309 * See if environment variable LESSCHARSET is defined.
310 */
311 s = lgetenv("LESSCHARSET");
312 if (icharset(s, 0))
313 return;
314
315 /*
316 * LESSCHARSET is not defined: try LESSCHARDEF.

--- 32 unchanged lines hidden (view full) ---

349#endif
350
351#if HAVE_LOCALE
352 /*
353 * Get character definitions from locale functions,
354 * rather than from predefined charset entry.
355 */
356 ilocale();
331 * See if environment variable LESSCHARSET is defined.
332 */
333 s = lgetenv("LESSCHARSET");
334 if (icharset(s, 0))
335 return;
336
337 /*
338 * LESSCHARSET is not defined: try LESSCHARDEF.

--- 32 unchanged lines hidden (view full) ---

371#endif
372
373#if HAVE_LOCALE
374 /*
375 * Get character definitions from locale functions,
376 * rather than from predefined charset entry.
377 */
378 ilocale();
379#else
357#if MSDOS_COMPILER
358 /*
359 * Default to "dos".
360 */
361 (void) icharset("dos", 1);
362#else
363 /*
364 * Default to "latin1".

--- 13 unchanged lines hidden (view full) ---

378
379#if HAVE_LOCALE
380 setlocale(LC_ALL, "");
381#endif
382
383 set_charset();
384
385 s = lgetenv("LESSBINFMT");
380#if MSDOS_COMPILER
381 /*
382 * Default to "dos".
383 */
384 (void) icharset("dos", 1);
385#else
386 /*
387 * Default to "latin1".

--- 13 unchanged lines hidden (view full) ---

401
402#if HAVE_LOCALE
403 setlocale(LC_ALL, "");
404#endif
405
406 set_charset();
407
408 s = lgetenv("LESSBINFMT");
386 setbinfmt(s, &binfmt, "*s<%02X>");
409 setfmt(s, &binfmt, &binattr, "*s<%02X>");
387
388 s = lgetenv("LESSUTFBINFMT");
410
411 s = lgetenv("LESSUTFBINFMT");
389 setbinfmt(s, &utfbinfmt, "<U+%04lX>");
412 setfmt(s, &utfbinfmt, &binattr, "<U+%04lX>");
390}
391
392/*
393 * Is a given character a "binary" character?
394 */
395 public int
396binary_char(c)
397 LWCHAR c;

--- 140 unchanged lines hidden (view full) ---

538
539 for (i = 1; i < len; i++)
540 if (!IS_UTF8_TRAIL(s[i]))
541 return (0);
542 return (1);
543}
544
545/*
413}
414
415/*
416 * Is a given character a "binary" character?
417 */
418 public int
419binary_char(c)
420 LWCHAR c;

--- 140 unchanged lines hidden (view full) ---

561
562 for (i = 1; i < len; i++)
563 if (!IS_UTF8_TRAIL(s[i]))
564 return (0);
565 return (1);
566}
567
568/*
546 * Return number of invalid UTF-8 sequences found in a buffer.
569 * Skip bytes until a UTF-8 lead byte (11xxxxxx) or ASCII byte (0xxxxxxx) is found.
547 */
570 */
548 public int
549utf_bin_count(data, len)
550 char *data;
551 int len;
571 public void
572utf_skip_to_lead(pp, limit)
573 char **pp;
574 char *limit;
552{
575{
553 int bin_count = 0;
554 while (len > 0)
555 {
556 if (is_utf8_well_formed(data, len))
557 {
558 int clen = utf_len(*data & 0377);
559 data += clen;
560 len -= clen;
561 } else
562 {
563 /* Skip to next lead byte. */
564 bin_count++;
565 do {
566 ++data;
567 --len;
568 } while (len > 0 && !IS_UTF8_LEAD(*data & 0377));
569 }
570 }
571 return (bin_count);
576 do {
577 ++(*pp);
578 } while (*pp < limit && !IS_UTF8_LEAD((*pp)[0] & 0377) && !IS_ASCII_OCTET((*pp)[0]));
572}
573
579}
580
581
574/*
575 * Get the value of a UTF-8 character.
576 */
577 public LWCHAR
578get_wchar(p)
579 constant char *p;
580{
581 switch (utf_len(p[0]))

--- 103 unchanged lines hidden (view full) ---

685 LWCHAR ch;
686 int len;
687 char *p = *pp;
688
689 if (!utf_mode)
690 {
691 /* It's easy if chars are one byte. */
692 if (dir > 0)
582/*
583 * Get the value of a UTF-8 character.
584 */
585 public LWCHAR
586get_wchar(p)
587 constant char *p;
588{
589 switch (utf_len(p[0]))

--- 103 unchanged lines hidden (view full) ---

693 LWCHAR ch;
694 int len;
695 char *p = *pp;
696
697 if (!utf_mode)
698 {
699 /* It's easy if chars are one byte. */
700 if (dir > 0)
693 ch = (LWCHAR) ((p < limit) ? *p++ : 0);
701 ch = (LWCHAR) (unsigned char) ((p < limit) ? *p++ : 0);
694 else
702 else
695 ch = (LWCHAR) ((p > limit) ? *--p : 0);
703 ch = (LWCHAR) (unsigned char) ((p > limit) ? *--p : 0);
696 } else if (dir > 0)
697 {
698 len = utf_len(*p);
699 if (p + len > limit)
700 {
701 ch = 0;
702 p = (char *) limit;
703 } else

--- 31 unchanged lines hidden (view full) ---

735DECLARE_RANGE_TABLE_START(ubin)
736#include "ubin.uni"
737DECLARE_RANGE_TABLE_END(ubin)
738
739DECLARE_RANGE_TABLE_START(wide)
740#include "wide.uni"
741DECLARE_RANGE_TABLE_END(wide)
742
704 } else if (dir > 0)
705 {
706 len = utf_len(*p);
707 if (p + len > limit)
708 {
709 ch = 0;
710 p = (char *) limit;
711 } else

--- 31 unchanged lines hidden (view full) ---

743DECLARE_RANGE_TABLE_START(ubin)
744#include "ubin.uni"
745DECLARE_RANGE_TABLE_END(ubin)
746
747DECLARE_RANGE_TABLE_START(wide)
748#include "wide.uni"
749DECLARE_RANGE_TABLE_END(wide)
750
751DECLARE_RANGE_TABLE_START(fmt)
752#include "fmt.uni"
753DECLARE_RANGE_TABLE_END(fmt)
754
743/*
 * comb_table is special pairs, not ranges: each entry holds two code points
 * that go together, rather than the two ends of an interval.
 * All four entries pair U+0644 (ARABIC LETTER LAM) with an alef form:
 * U+0622 (madda above), U+0623 (hamza above), U+0625 (hamza below),
 * U+0627 (plain alef).
 * NOTE(review): the code that consults this table is hidden in this view --
 * confirm how the two members of each pair are matched.
 */
744static struct wchar_range comb_table[] = {
745 {0x0644,0x0622}, {0x0644,0x0623}, {0x0644,0x0625}, {0x0644,0x0627},
746};
747
748
749 static int
750is_in_table(ch, table)

--- 24 unchanged lines hidden (view full) ---

775/*
776 * Is a character a UTF-8 composing character?
777 * If a composing character follows any char, the two combine into one glyph.
778 */
779 public int
780is_composing_char(ch)
781 LWCHAR ch;
782{
755/*
 * comb_table is special pairs, not ranges: each entry holds two code points
 * that go together, rather than the two ends of an interval.
 * All four entries pair U+0644 (ARABIC LETTER LAM) with an alef form:
 * U+0622 (madda above), U+0623 (hamza above), U+0625 (hamza below),
 * U+0627 (plain alef).
 * NOTE(review): the code that consults this table is hidden in this view --
 * confirm how the two members of each pair are matched.
 */
756static struct wchar_range comb_table[] = {
757 {0x0644,0x0622}, {0x0644,0x0623}, {0x0644,0x0625}, {0x0644,0x0627},
758};
759
760
761 static int
762is_in_table(ch, table)

--- 24 unchanged lines hidden (view full) ---

787/*
788 * Is a character a UTF-8 composing character?
789 * If a composing character follows any char, the two combine into one glyph.
790 */
791 public int
792is_composing_char(ch)
793 LWCHAR ch;
794{
783 return is_in_table(ch, &compose_table);
795 return is_in_table(ch, &compose_table) ||
796 (bs_mode != BS_CONTROL && is_in_table(ch, &fmt_table));
784}
785
786/*
787 * Should this UTF-8 character be treated as binary?
788 */
789 public int
790is_ubin_char(ch)
791 LWCHAR ch;
792{
797}
798
799/*
800 * Should this UTF-8 character be treated as binary?
801 */
802 public int
803is_ubin_char(ch)
804 LWCHAR ch;
805{
793 return is_in_table(ch, &ubin_table);
806 int ubin = is_in_table(ch, &ubin_table) ||
807 (bs_mode == BS_CONTROL && is_in_table(ch, &fmt_table));
808#if MSDOS_COMPILER==WIN32C
809 if (!ubin && utf_mode == 2 && ch < 0x10000)
810 {
811 /*
812 * Consider it binary if it can't be converted.
813 */
814 BOOL used_default = TRUE;
815 WideCharToMultiByte(GetConsoleOutputCP(), WC_NO_BEST_FIT_CHARS, (LPCWSTR) &ch, 1, NULL, 0, NULL, &used_default);
816 if (used_default)
817 ubin = 1;
818 }
819#endif
820 return ubin;
794}
795
796/*
797 * Is this a double width UTF-8 character?
798 */
799 public int
800is_wide_char(ch)
801 LWCHAR ch;

--- 25 unchanged lines hidden ---
821}
822
823/*
824 * Is this a double width UTF-8 character?
825 */
826 public int
827is_wide_char(ch)
828 LWCHAR ch;

--- 25 unchanged lines hidden ---