charset.c (7e1b7636c894be9d1130c284089ce1ea0786ecec) | charset.c (b2ea244070ff84eab79e04befb7aa30c982fc84d) |
---|---|
1/* 2 * Copyright (C) 1984-2017 Mark Nudelman 3 * 4 * You may distribute under the terms of either the GNU General Public 5 * License or the Less License, as specified in the README file. 6 * 7 * For more information, see the README file. 8 */ --- 8 unchanged lines hidden (view full) --- 17#if HAVE_LOCALE 18#include <locale.h> 19#include <ctype.h> 20#include <langinfo.h> 21#endif 22 23#include "charset.h" 24 | 1/* 2 * Copyright (C) 1984-2017 Mark Nudelman 3 * 4 * You may distribute under the terms of either the GNU General Public 5 * License or the Less License, as specified in the README file. 6 * 7 * For more information, see the README file. 8 */ --- 8 unchanged lines hidden (view full) --- 17#if HAVE_LOCALE 18#include <locale.h> 19#include <ctype.h> 20#include <langinfo.h> 21#endif 22 23#include "charset.h" 24 |
25#if MSDOS_COMPILER==WIN32C 26#define WIN32_LEAN_AND_MEAN 27#include <windows.h> 28#endif 29 30extern int bs_mode; 31 |
|
25public int utf_mode = 0; 26 27/* 28 * Predefined character sets, 29 * selected by the LESSCHARSET environment variable. 30 */ 31struct charset { 32 char *name; --- 177 unchanged lines hidden (view full) --- 210 } 211 212 for (p = charsets; p->name != NULL; p++) 213 { 214 if (strcmp(name, p->name) == 0) 215 { 216 ichardef(p->desc); 217 if (p->p_flag != NULL) | 32public int utf_mode = 0; 33 34/* 35 * Predefined character sets, 36 * selected by the LESSCHARSET environment variable. 37 */ 38struct charset { 39 char *name; --- 177 unchanged lines hidden (view full) --- 217 } 218 219 for (p = charsets; p->name != NULL; p++) 220 { 221 if (strcmp(name, p->name) == 0) 222 { 223 ichardef(p->desc); 224 if (p->p_flag != NULL) |
225 { 226#if MSDOS_COMPILER==WIN32C 227 *(p->p_flag) = 1 + (GetConsoleOutputCP() != CP_UTF8); 228#else |
|
218 *(p->p_flag) = 1; | 229 *(p->p_flag) = 1; |
230#endif 231 } |
|
219 return (1); 220 } 221 } 222 223 if (!no_error) { 224 error("invalid charset name", NULL_PARG); 225 quit(QUIT_ERROR); 226 } --- 19 unchanged lines hidden (view full) --- 246 chardef[c] = IS_BINARY_CHAR|IS_CONTROL_CHAR; 247 } 248} 249#endif 250 251/* 252 * Define the printing format for control (or binary utf) chars. 253 */ | 232 return (1); 233 } 234 } 235 236 if (!no_error) { 237 error("invalid charset name", NULL_PARG); 238 quit(QUIT_ERROR); 239 } --- 19 unchanged lines hidden (view full) --- 259 chardef[c] = IS_BINARY_CHAR|IS_CONTROL_CHAR; 260 } 261} 262#endif 263 264/* 265 * Define the printing format for control (or binary utf) chars. 266 */ |
254 static void 255setbinfmt(s, fmtvarptr, default_fmt) | 267 public void 268setfmt(s, fmtvarptr, attrptr, default_fmt) |
256 char *s; 257 char **fmtvarptr; | 269 char *s; 270 char **fmtvarptr; |
271 int *attrptr; |
|
258 char *default_fmt; 259{ 260 if (s && utf_mode) 261 { 262 /* It would be too hard to account for width otherwise. */ | 272 char *default_fmt; 273{ 274 if (s && utf_mode) 275 { 276 /* It would be too hard to account for width otherwise. */ |
263 char *t = s; | 277 char constant *t = s; |
264 while (*t) 265 { 266 if (*t < ' ' || *t > '~') 267 { 268 s = default_fmt; 269 goto attr; 270 } 271 t++; --- 5 unchanged lines hidden (view full) --- 277 (*s == '*' && (s[1] == '\0' || s[2] == '\0' || strchr(s + 2, 'n'))) || 278 (*s != '*' && strchr(s, 'n'))) 279 s = default_fmt; 280 281 /* 282 * Select the attributes if it starts with "*". 283 */ 284 attr: | 278 while (*t) 279 { 280 if (*t < ' ' || *t > '~') 281 { 282 s = default_fmt; 283 goto attr; 284 } 285 t++; --- 5 unchanged lines hidden (view full) --- 291 (*s == '*' && (s[1] == '\0' || s[2] == '\0' || strchr(s + 2, 'n'))) || 292 (*s != '*' && strchr(s, 'n'))) 293 s = default_fmt; 294 295 /* 296 * Select the attributes if it starts with "*". 297 */ 298 attr: |
285 if (*s == '*') | 299 if (*s == '*' && s[1] != '\0') |
286 { 287 switch (s[1]) 288 { | 300 { 301 switch (s[1]) 302 { |
289 case 'd': binattr = AT_BOLD; break; 290 case 'k': binattr = AT_BLINK; break; 291 case 's': binattr = AT_STANDOUT; break; 292 case 'u': binattr = AT_UNDERLINE; break; 293 default: binattr = AT_NORMAL; break; | 303 case 'd': *attrptr = AT_BOLD; break; 304 case 'k': *attrptr = AT_BLINK; break; 305 case 's': *attrptr = AT_STANDOUT; break; 306 case 'u': *attrptr = AT_UNDERLINE; break; 307 default: *attrptr = AT_NORMAL; break; |
294 } 295 s += 2; 296 } 297 *fmtvarptr = s; 298} 299 300/* 301 * 302 */ 303 static void 304set_charset() 305{ 306 char *s; 307 | 308 } 309 s += 2; 310 } 311 *fmtvarptr = s; 312} 313 314/* 315 * 316 */ 317 static void 318set_charset() 319{ 320 char *s; 321 |
322#if MSDOS_COMPILER==WIN32C |
|
308 /* | 323 /* |
324 * If the Windows console is using UTF-8, we'll use it too. 325 */ 326 if (GetConsoleOutputCP() == CP_UTF8) 327 if (icharset("utf-8", 1)) 328 return; 329#endif 330 /* |
|
309 * See if environment variable LESSCHARSET is defined. 310 */ 311 s = lgetenv("LESSCHARSET"); 312 if (icharset(s, 0)) 313 return; 314 315 /* 316 * LESSCHARSET is not defined: try LESSCHARDEF. --- 32 unchanged lines hidden (view full) --- 349#endif 350 351#if HAVE_LOCALE 352 /* 353 * Get character definitions from locale functions, 354 * rather than from predefined charset entry. 355 */ 356 ilocale(); | 331 * See if environment variable LESSCHARSET is defined. 332 */ 333 s = lgetenv("LESSCHARSET"); 334 if (icharset(s, 0)) 335 return; 336 337 /* 338 * LESSCHARSET is not defined: try LESSCHARDEF. --- 32 unchanged lines hidden (view full) --- 371#endif 372 373#if HAVE_LOCALE 374 /* 375 * Get character definitions from locale functions, 376 * rather than from predefined charset entry. 377 */ 378 ilocale(); |
379#else |
|
357#if MSDOS_COMPILER 358 /* 359 * Default to "dos". 360 */ 361 (void) icharset("dos", 1); 362#else 363 /* 364 * Default to "latin1". --- 13 unchanged lines hidden (view full) --- 378 379#if HAVE_LOCALE 380 setlocale(LC_ALL, ""); 381#endif 382 383 set_charset(); 384 385 s = lgetenv("LESSBINFMT"); | 380#if MSDOS_COMPILER 381 /* 382 * Default to "dos". 383 */ 384 (void) icharset("dos", 1); 385#else 386 /* 387 * Default to "latin1". --- 13 unchanged lines hidden (view full) --- 401 402#if HAVE_LOCALE 403 setlocale(LC_ALL, ""); 404#endif 405 406 set_charset(); 407 408 s = lgetenv("LESSBINFMT"); |
386 setbinfmt(s, &binfmt, "*s<%02X>"); | 409 setfmt(s, &binfmt, &binattr, "*s<%02X>"); |
387 388 s = lgetenv("LESSUTFBINFMT"); | 410 411 s = lgetenv("LESSUTFBINFMT"); |
389 setbinfmt(s, &utfbinfmt, "<U+%04lX>"); | 412 setfmt(s, &utfbinfmt, &binattr, "<U+%04lX>"); |
390} 391 392/* 393 * Is a given character a "binary" character? 394 */ 395 public int 396binary_char(c) 397 LWCHAR c; --- 140 unchanged lines hidden (view full) --- 538 539 for (i = 1; i < len; i++) 540 if (!IS_UTF8_TRAIL(s[i])) 541 return (0); 542 return (1); 543} 544 545/* | 413} 414 415/* 416 * Is a given character a "binary" character? 417 */ 418 public int 419binary_char(c) 420 LWCHAR c; --- 140 unchanged lines hidden (view full) --- 561 562 for (i = 1; i < len; i++) 563 if (!IS_UTF8_TRAIL(s[i])) 564 return (0); 565 return (1); 566} 567 568/* |
546 * Return number of invalid UTF-8 sequences found in a buffer. | 569 * Skip bytes until a UTF-8 lead byte (11xxxxxx) or ASCII byte (0xxxxxxx) is found. |
547 */ | 570 */ |
548 public int 549utf_bin_count(data, len) 550 char *data; 551 int len; | 571 public void 572utf_skip_to_lead(pp, limit) 573 char **pp; 574 char *limit; |
552{ | 575{ |
553 int bin_count = 0; 554 while (len > 0) 555 { 556 if (is_utf8_well_formed(data, len)) 557 { 558 int clen = utf_len(*data & 0377); 559 data += clen; 560 len -= clen; 561 } else 562 { 563 /* Skip to next lead byte. */ 564 bin_count++; 565 do { 566 ++data; 567 --len; 568 } while (len > 0 && !IS_UTF8_LEAD(*data & 0377)); 569 } 570 } 571 return (bin_count); | 576 do { 577 ++(*pp); 578 } while (*pp < limit && !IS_UTF8_LEAD((*pp)[0] & 0377) && !IS_ASCII_OCTET((*pp)[0])); |
572} 573 | 579} 580 |
581 |
|
574/* 575 * Get the value of a UTF-8 character. 576 */ 577 public LWCHAR 578get_wchar(p) 579 constant char *p; 580{ 581 switch (utf_len(p[0])) --- 103 unchanged lines hidden (view full) --- 685 LWCHAR ch; 686 int len; 687 char *p = *pp; 688 689 if (!utf_mode) 690 { 691 /* It's easy if chars are one byte. */ 692 if (dir > 0) | 582/* 583 * Get the value of a UTF-8 character. 584 */ 585 public LWCHAR 586get_wchar(p) 587 constant char *p; 588{ 589 switch (utf_len(p[0])) --- 103 unchanged lines hidden (view full) --- 693 LWCHAR ch; 694 int len; 695 char *p = *pp; 696 697 if (!utf_mode) 698 { 699 /* It's easy if chars are one byte. */ 700 if (dir > 0) |
693 ch = (LWCHAR) ((p < limit) ? *p++ : 0); | 701 ch = (LWCHAR) (unsigned char) ((p < limit) ? *p++ : 0); |
694 else | 702 else |
695 ch = (LWCHAR) ((p > limit) ? *--p : 0); | 703 ch = (LWCHAR) (unsigned char) ((p > limit) ? *--p : 0); |
696 } else if (dir > 0) 697 { 698 len = utf_len(*p); 699 if (p + len > limit) 700 { 701 ch = 0; 702 p = (char *) limit; 703 } else --- 31 unchanged lines hidden (view full) --- 735DECLARE_RANGE_TABLE_START(ubin) 736#include "ubin.uni" 737DECLARE_RANGE_TABLE_END(ubin) 738 739DECLARE_RANGE_TABLE_START(wide) 740#include "wide.uni" 741DECLARE_RANGE_TABLE_END(wide) 742 | 704 } else if (dir > 0) 705 { 706 len = utf_len(*p); 707 if (p + len > limit) 708 { 709 ch = 0; 710 p = (char *) limit; 711 } else --- 31 unchanged lines hidden (view full) --- 743DECLARE_RANGE_TABLE_START(ubin) 744#include "ubin.uni" 745DECLARE_RANGE_TABLE_END(ubin) 746 747DECLARE_RANGE_TABLE_START(wide) 748#include "wide.uni" 749DECLARE_RANGE_TABLE_END(wide) 750 |
751DECLARE_RANGE_TABLE_START(fmt) 752#include "fmt.uni" 753DECLARE_RANGE_TABLE_END(fmt) 754 |
|
743/* comb_table is special pairs, not ranges. */ 744static struct wchar_range comb_table[] = { 745 {0x0644,0x0622}, {0x0644,0x0623}, {0x0644,0x0625}, {0x0644,0x0627}, 746}; 747 748 749 static int 750is_in_table(ch, table) --- 24 unchanged lines hidden (view full) --- 775/* 776 * Is a character a UTF-8 composing character? 777 * If a composing character follows any char, the two combine into one glyph. 778 */ 779 public int 780is_composing_char(ch) 781 LWCHAR ch; 782{ | 755/* comb_table is special pairs, not ranges. */ 756static struct wchar_range comb_table[] = { 757 {0x0644,0x0622}, {0x0644,0x0623}, {0x0644,0x0625}, {0x0644,0x0627}, 758}; 759 760 761 static int 762is_in_table(ch, table) --- 24 unchanged lines hidden (view full) --- 787/* 788 * Is a character a UTF-8 composing character? 789 * If a composing character follows any char, the two combine into one glyph. 790 */ 791 public int 792is_composing_char(ch) 793 LWCHAR ch; 794{ |
783 return is_in_table(ch, &compose_table); | 795 return is_in_table(ch, &compose_table) || 796 (bs_mode != BS_CONTROL && is_in_table(ch, &fmt_table)); |
784} 785 786/* 787 * Should this UTF-8 character be treated as binary? 788 */ 789 public int 790is_ubin_char(ch) 791 LWCHAR ch; 792{ | 797} 798 799/* 800 * Should this UTF-8 character be treated as binary? 801 */ 802 public int 803is_ubin_char(ch) 804 LWCHAR ch; 805{ |
793 return is_in_table(ch, &ubin_table); | 806 int ubin = is_in_table(ch, &ubin_table) || 807 (bs_mode == BS_CONTROL && is_in_table(ch, &fmt_table)); 808#if MSDOS_COMPILER==WIN32C 809 if (!ubin && utf_mode == 2 && ch < 0x10000) 810 { 811 /* 812 * Consider it binary if it can't be converted. 813 */ 814 BOOL used_default = TRUE; 815 WideCharToMultiByte(GetConsoleOutputCP(), WC_NO_BEST_FIT_CHARS, (LPCWSTR) &ch, 1, NULL, 0, NULL, &used_default); 816 if (used_default) 817 ubin = 1; 818 } 819#endif 820 return ubin; |
794} 795 796/* 797 * Is this a double width UTF-8 character? 798 */ 799 public int 800is_wide_char(ch) 801 LWCHAR ch; --- 25 unchanged lines hidden --- | 821} 822 823/* 824 * Is this a double width UTF-8 character? 825 */ 826 public int 827is_wide_char(ch) 828 LWCHAR ch; --- 25 unchanged lines hidden --- |