xref: /freebsd/contrib/less/charset.c (revision 99429157e8615dc3b7f11afbe3ed92de7476a5db)
1 /*
2  * Copyright (C) 1984-2017  Mark Nudelman
3  *
4  * You may distribute under the terms of either the GNU General Public
5  * License or the Less License, as specified in the README file.
6  *
7  * For more information, see the README file.
8  */
9 
10 
11 /*
12  * Functions to define the character set
13  * and do things specific to the character set.
14  */
15 
16 #include "less.h"
17 #if HAVE_LOCALE
18 #include <locale.h>
19 #include <ctype.h>
20 #include <langinfo.h>
21 #endif
22 
23 #include "charset.h"
24 
25 public int utf_mode = 0;
26 
27 /*
28  * Predefined character sets,
29  * selected by the LESSCHARSET environment variable.
30  */
31 struct charset {
32 	char *name;
33 	int *p_flag;
34 	char *desc;
35 } charsets[] = {
36 	{ "ascii",		NULL,       "8bcccbcc18b95.b" },
37 	{ "utf-8",		&utf_mode,  "8bcccbcc18b95.b126.bb" },
38 	{ "iso8859",		NULL,       "8bcccbcc18b95.33b." },
39 	{ "latin3",		NULL,       "8bcccbcc18b95.33b5.b8.b15.b4.b12.b18.b12.b." },
40 	{ "arabic",		NULL,       "8bcccbcc18b95.33b.3b.7b2.13b.3b.b26.5b19.b" },
41 	{ "greek",		NULL,       "8bcccbcc18b95.33b4.2b4.b3.b35.b44.b" },
42 	{ "greek2005",		NULL,       "8bcccbcc18b95.33b14.b35.b44.b" },
43 	{ "hebrew",		NULL,       "8bcccbcc18b95.33b.b29.32b28.2b2.b" },
44 	{ "koi8-r",		NULL,       "8bcccbcc18b95.b." },
45 	{ "KOI8-T",		NULL,       "8bcccbcc18b95.b8.b6.b8.b.b.5b7.3b4.b4.b3.b.b.3b." },
46 	{ "georgianps",		NULL,       "8bcccbcc18b95.3b11.4b12.2b." },
47 	{ "tcvn",		NULL,       "b..b...bcccbccbbb7.8b95.b48.5b." },
48 	{ "TIS-620",		NULL,       "8bcccbcc18b95.b.4b.11b7.8b." },
49 	{ "next",		NULL,       "8bcccbcc18b95.bb125.bb" },
50 	{ "dos",		NULL,       "8bcccbcc12bc5b95.b." },
51 	{ "windows-1251",	NULL,       "8bcccbcc12bc5b95.b24.b." },
52 	{ "windows-1252",	NULL,       "8bcccbcc12bc5b95.b.b11.b.2b12.b." },
53 	{ "windows-1255",	NULL,       "8bcccbcc12bc5b95.b.b8.b.5b9.b.4b." },
54 	{ "ebcdic",		NULL,       "5bc6bcc7bcc41b.9b7.9b5.b..8b6.10b6.b9.7b9.8b8.17b3.3b9.7b9.8b8.6b10.b.b.b." },
55 	{ "IBM-1047",		NULL,       "4cbcbc3b9cbccbccbb4c6bcc5b3cbbc4bc4bccbc191.b" },
56 	{ NULL, NULL, NULL }
57 };
58 
/*
 * Alternate spellings of charset names.
 * This covers "locale charmap"/nl_langinfo(CODESET) values, as well as others.
 *	name	the alias as the user or the locale spells it
 *	oname	the name of the matching entry in the charsets table
 * The table is terminated by an entry whose fields are both NULL.
 */
struct cs_alias {
	char *name;
	char *oname;
};

struct cs_alias cs_aliases[] = {
	{ "UTF-8", "utf-8" },
	{ "utf8", "utf-8" },
	{ "UTF8", "utf-8" },
	{ "ANSI_X3.4-1968", "ascii" },
	{ "US-ASCII", "ascii" },
	{ "latin1", "iso8859" },
	{ "ISO-8859-1", "iso8859" },
	{ "latin9", "iso8859" },
	{ "ISO-8859-15", "iso8859" },
	{ "latin2", "iso8859" },
	{ "ISO-8859-2", "iso8859" },
	{ "ISO-8859-3", "latin3" },
	{ "latin4", "iso8859" },
	{ "ISO-8859-4", "iso8859" },
	{ "cyrillic", "iso8859" },
	{ "ISO-8859-5", "iso8859" },
	{ "ISO-8859-6", "arabic" },
	{ "ISO-8859-7", "greek" },
	{ "IBM9005", "greek2005" },
	{ "ISO-8859-8", "hebrew" },
	{ "latin5", "iso8859" },
	{ "ISO-8859-9", "iso8859" },
	{ "latin6", "iso8859" },
	{ "ISO-8859-10", "iso8859" },
	{ "latin7", "iso8859" },
	{ "ISO-8859-13", "iso8859" },
	{ "latin8", "iso8859" },
	{ "ISO-8859-14", "iso8859" },
	{ "latin10", "iso8859" },
	{ "ISO-8859-16", "iso8859" },
	{ "IBM437", "dos" },
	{ "EBCDIC-US", "ebcdic" },
	{ "IBM1047", "IBM-1047" },
	{ "KOI8-R", "koi8-r" },
	{ "KOI8-U", "koi8-r" },
	{ "GEORGIAN-PS", "georgianps" },
	{ "TCVN5712-1", "tcvn" },
	{ "NEXTSTEP", "next" },
	{ "windows", "windows-1252" }, /* backward compatibility */
	{ "CP1251", "windows-1251" },
	{ "CP1252", "windows-1252" },
	{ "CP1255", "windows-1255" },
	{ NULL, NULL }
};
110 
111 #define	IS_BINARY_CHAR	01
112 #define	IS_CONTROL_CHAR	02
113 
114 static char chardef[256];
115 static char *binfmt = NULL;
116 static char *utfbinfmt = NULL;
117 public int binattr = AT_STANDOUT;
118 
119 
120 /*
121  * Define a charset, given a description string.
122  * The string consists of 256 letters,
123  * one for each character in the charset.
124  * If the string is shorter than 256 letters, missing letters
125  * are taken to be identical to the last one.
126  * A decimal number followed by a letter is taken to be a
127  * repetition of the letter.
128  *
129  * Each letter is one of:
130  *	. normal character
131  *	b binary character
132  *	c control character
133  */
134 	static void
135 ichardef(s)
136 	char *s;
137 {
138 	char *cp;
139 	int n;
140 	char v;
141 
142 	n = 0;
143 	v = 0;
144 	cp = chardef;
145 	while (*s != '\0')
146 	{
147 		switch (*s++)
148 		{
149 		case '.':
150 			v = 0;
151 			break;
152 		case 'c':
153 			v = IS_CONTROL_CHAR;
154 			break;
155 		case 'b':
156 			v = IS_BINARY_CHAR|IS_CONTROL_CHAR;
157 			break;
158 
159 		case '0': case '1': case '2': case '3': case '4':
160 		case '5': case '6': case '7': case '8': case '9':
161 			n = (10 * n) + (s[-1] - '0');
162 			continue;
163 
164 		default:
165 			error("invalid chardef", NULL_PARG);
166 			quit(QUIT_ERROR);
167 			/*NOTREACHED*/
168 		}
169 
170 		do
171 		{
172 			if (cp >= chardef + sizeof(chardef))
173 			{
174 				error("chardef longer than 256", NULL_PARG);
175 				quit(QUIT_ERROR);
176 				/*NOTREACHED*/
177 			}
178 			*cp++ = v;
179 		} while (--n > 0);
180 		n = 0;
181 	}
182 
183 	while (cp < chardef + sizeof(chardef))
184 		*cp++ = v;
185 }
186 
187 /*
188  * Define a charset, given a charset name.
189  * The valid charset names are listed in the "charsets" array.
190  */
191 	static int
192 icharset(name, no_error)
193 	char *name;
194 	int no_error;
195 {
196 	struct charset *p;
197 	struct cs_alias *a;
198 
199 	if (name == NULL || *name == '\0')
200 		return (0);
201 
202 	/* First see if the name is an alias. */
203 	for (a = cs_aliases;  a->name != NULL;  a++)
204 	{
205 		if (strcmp(name, a->name) == 0)
206 		{
207 			name = a->oname;
208 			break;
209 		}
210 	}
211 
212 	for (p = charsets;  p->name != NULL;  p++)
213 	{
214 		if (strcmp(name, p->name) == 0)
215 		{
216 			ichardef(p->desc);
217 			if (p->p_flag != NULL)
218 				*(p->p_flag) = 1;
219 			return (1);
220 		}
221 	}
222 
223 	if (!no_error) {
224 		error("invalid charset name", NULL_PARG);
225 		quit(QUIT_ERROR);
226 	}
227 	return (0);
228 }
229 
230 #if HAVE_LOCALE
231 /*
232  * Define a charset, given a locale name.
233  */
234 	static void
235 ilocale()
236 {
237 	int c;
238 
239 	for (c = 0;  c < (int) sizeof(chardef);  c++)
240 	{
241 		if (isprint(c))
242 			chardef[c] = 0;
243 		else if (iscntrl(c))
244 			chardef[c] = IS_CONTROL_CHAR;
245 		else
246 			chardef[c] = IS_BINARY_CHAR|IS_CONTROL_CHAR;
247 	}
248 }
249 #endif
250 
251 /*
252  * Define the printing format for control (or binary utf) chars.
253  */
254    	static void
255 setbinfmt(s, fmtvarptr, default_fmt)
256 	char *s;
257 	char **fmtvarptr;
258 	char *default_fmt;
259 {
260 	if (s && utf_mode)
261 	{
262 		/* It would be too hard to account for width otherwise.  */
263 		char *t = s;
264 		while (*t)
265 		{
266 			if (*t < ' ' || *t > '~')
267 			{
268 				s = default_fmt;
269 				goto attr;
270 			}
271 			t++;
272 		}
273 	}
274 
275 	/* %n is evil */
276 	if (s == NULL || *s == '\0' ||
277 	    (*s == '*' && (s[1] == '\0' || s[2] == '\0' || strchr(s + 2, 'n'))) ||
278 	    (*s != '*' && strchr(s, 'n')))
279 		s = default_fmt;
280 
281 	/*
282 	 * Select the attributes if it starts with "*".
283 	 */
284  attr:
285 	if (*s == '*')
286 	{
287 		switch (s[1])
288 		{
289 		case 'd':  binattr = AT_BOLD;      break;
290 		case 'k':  binattr = AT_BLINK;     break;
291 		case 's':  binattr = AT_STANDOUT;  break;
292 		case 'u':  binattr = AT_UNDERLINE; break;
293 		default:   binattr = AT_NORMAL;    break;
294 		}
295 		s += 2;
296 	}
297 	*fmtvarptr = s;
298 }
299 
/*
 * Choose the character set.  The sources are tried in this order,
 * and the first one that yields a charset wins:
 *	1. the LESSCHARSET environment variable (an invalid name is fatal);
 *	2. the LESSCHARDEF environment variable (a description string);
 *	3. nl_langinfo(CODESET), when locale support and CODESET exist;
 *	4. "utf-8", if LC_ALL, LC_CTYPE or LANG mentions UTF-8;
 *	5. the locale's ctype classification, when locale support exists;
 *	   otherwise the predefined "dos" or "latin1" charset.
 */
	static void
set_charset()
{
	char *s;

	/*
	 * See if environment variable LESSCHARSET is defined.
	 */
	s = lgetenv("LESSCHARSET");
	if (icharset(s, 0))
		return;

	/*
	 * LESSCHARSET is not defined: try LESSCHARDEF.
	 */
	s = lgetenv("LESSCHARDEF");
	if (s != NULL && *s != '\0')
	{
		ichardef(s);
		return;
	}

#if HAVE_LOCALE
#ifdef CODESET
	/*
	 * Try using the codeset name as the charset name.
	 */
	s = nl_langinfo(CODESET);
	if (icharset(s, 1))
		return;
#endif
#endif

#if HAVE_STRSTR
	/*
	 * Check whether LC_ALL, LC_CTYPE or LANG look like UTF-8 is used.
	 */
	if ((s = lgetenv("LC_ALL")) != NULL ||
	    (s = lgetenv("LC_CTYPE")) != NULL ||
	    (s = lgetenv("LANG")) != NULL)
	{
		if (   strstr(s, "UTF-8") != NULL || strstr(s, "utf-8") != NULL
		    || strstr(s, "UTF8")  != NULL || strstr(s, "utf8")  != NULL)
			if (icharset("utf-8", 1))
				return;
	}
#endif

#if HAVE_LOCALE
	/*
	 * Get character definitions from locale functions,
	 * rather than from predefined charset entry.
	 */
	ilocale();
#else
	/*
	 * No locale support: fall back to a predefined charset.
	 * (This must be the alternative to ilocale(), not run after it,
	 * or the locale-derived definitions would be overwritten.)
	 */
#if MSDOS_COMPILER
	/*
	 * Default to "dos".
	 */
	(void) icharset("dos", 1);
#else
	/*
	 * Default to "latin1".
	 */
	(void) icharset("latin1", 1);
#endif
#endif
}
370 
371 /*
372  * Initialize charset data structures.
373  */
374 	public void
375 init_charset()
376 {
377 	char *s;
378 
379 #if HAVE_LOCALE
380 	setlocale(LC_ALL, "");
381 #endif
382 
383 	set_charset();
384 
385 	s = lgetenv("LESSBINFMT");
386 	setbinfmt(s, &binfmt, "*s<%02X>");
387 
388 	s = lgetenv("LESSUTFBINFMT");
389 	setbinfmt(s, &utfbinfmt, "<U+%04lX>");
390 }
391 
392 /*
393  * Is a given character a "binary" character?
394  */
395 	public int
396 binary_char(c)
397 	LWCHAR c;
398 {
399 	if (utf_mode)
400 		return (is_ubin_char(c));
401 	c &= 0377;
402 	return (chardef[c] & IS_BINARY_CHAR);
403 }
404 
405 /*
406  * Is a given character a "control" character?
407  */
408 	public int
409 control_char(c)
410 	LWCHAR c;
411 {
412 	c &= 0377;
413 	return (chardef[c] & IS_CONTROL_CHAR);
414 }
415 
416 /*
417  * Return the printable form of a character.
418  * For example, in the "ascii" charset '\3' is printed as "^C".
419  */
420 	public char *
421 prchar(c)
422 	LWCHAR c;
423 {
424 	/* {{ This buffer can be overrun if LESSBINFMT is a long string. }} */
425 	static char buf[32];
426 
427 	c &= 0377;
428 	if ((c < 128 || !utf_mode) && !control_char(c))
429 		SNPRINTF1(buf, sizeof(buf), "%c", (int) c);
430 	else if (c == ESC)
431 		strcpy(buf, "ESC");
432 #if IS_EBCDIC_HOST
433 	else if (!binary_char(c) && c < 64)
434 		SNPRINTF1(buf, sizeof(buf), "^%c",
435 		/*
436 		 * This array roughly inverts CONTROL() #defined in less.h,
437 	 	 * and should be kept in sync with CONTROL() and IBM-1047.
438  	 	 */
439 		"@ABC.I.?...KLMNO"
440 		"PQRS.JH.XY.."
441 		"\\]^_"
442 		"......W[.....EFG"
443 		"..V....D....TU.Z"[c]);
444 #else
445   	else if (c < 128 && !control_char(c ^ 0100))
446   		SNPRINTF1(buf, sizeof(buf), "^%c", (int) (c ^ 0100));
447 #endif
448 	else
449 		SNPRINTF1(buf, sizeof(buf), binfmt, c);
450 	return (buf);
451 }
452 
453 /*
454  * Return the printable form of a UTF-8 character.
455  */
456 	public char *
457 prutfchar(ch)
458 	LWCHAR ch;
459 {
460 	static char buf[32];
461 
462 	if (ch == ESC)
463 		strcpy(buf, "ESC");
464   	else if (ch < 128 && control_char(ch))
465 	{
466 		if (!control_char(ch ^ 0100))
467 			SNPRINTF1(buf, sizeof(buf), "^%c", ((char) ch) ^ 0100);
468 		else
469 			SNPRINTF1(buf, sizeof(buf), binfmt, (char) ch);
470 	} else if (is_ubin_char(ch))
471 	{
472 		SNPRINTF1(buf, sizeof(buf), utfbinfmt, ch);
473 	} else
474 	{
475 		char *p = buf;
476 		if (ch >= 0x80000000)
477 			ch = 0xFFFD; /* REPLACEMENT CHARACTER */
478 		put_wchar(&p, ch);
479 		*p = '\0';
480 	}
481 	return (buf);
482 }
483 
484 /*
485  * Get the length of a UTF-8 character in bytes.
486  */
487 	public int
488 utf_len(ch)
489 	unsigned char ch;
490 {
491 	if ((ch & 0x80) == 0)
492 		return 1;
493 	if ((ch & 0xE0) == 0xC0)
494 		return 2;
495 	if ((ch & 0xF0) == 0xE0)
496 		return 3;
497 	if ((ch & 0xF8) == 0xF0)
498 		return 4;
499 	if ((ch & 0xFC) == 0xF8)
500 		return 5;
501 	if ((ch & 0xFE) == 0xFC)
502 		return 6;
503 	/* Invalid UTF-8 encoding. */
504 	return 1;
505 }
506 
507 /*
508  * Does the parameter point to the lead byte of a well-formed UTF-8 character?
509  */
510 	public int
511 is_utf8_well_formed(ss, slen)
512 	char *ss;
513 	int slen;
514 {
515 	int i;
516 	int len;
517 	unsigned char *s = (unsigned char *) ss;
518 
519 	if (IS_UTF8_INVALID(s[0]))
520 		return (0);
521 
522 	len = utf_len(s[0]);
523 	if (len > slen)
524 		return (0);
525 	if (len == 1)
526 		return (1);
527 	if (len == 2)
528 	{
529 		if (s[0] < 0xC2)
530 		    return (0);
531 	} else
532 	{
533 		unsigned char mask;
534 		mask = (~((1 << (8-len)) - 1)) & 0xFF;
535 		if (s[0] == mask && (s[1] & mask) == 0x80)
536 			return (0);
537 	}
538 
539 	for (i = 1;  i < len;  i++)
540 		if (!IS_UTF8_TRAIL(s[i]))
541 			return (0);
542 	return (1);
543 }
544 
545 /*
546  * Return number of invalid UTF-8 sequences found in a buffer.
547  */
548 	public int
549 utf_bin_count(data, len)
550 	char *data;
551 	int len;
552 {
553 	int bin_count = 0;
554 	while (len > 0)
555 	{
556 		if (is_utf8_well_formed(data, len))
557 		{
558 			int clen = utf_len(*data & 0377);
559 			data += clen;
560 			len -= clen;
561 		} else
562 		{
563 			/* Skip to next lead byte. */
564 			bin_count++;
565 			do {
566 				++data;
567 				--len;
568 			} while (len > 0 && !IS_UTF8_LEAD(*data & 0377));
569 		}
570 	}
571 	return (bin_count);
572 }
573 
574 /*
575  * Get the value of a UTF-8 character.
576  */
577 	public LWCHAR
578 get_wchar(p)
579 	constant char *p;
580 {
581 	switch (utf_len(p[0]))
582 	{
583 	case 1:
584 	default:
585 		/* 0xxxxxxx */
586 		return (LWCHAR)
587 			(p[0] & 0xFF);
588 	case 2:
589 		/* 110xxxxx 10xxxxxx */
590 		return (LWCHAR) (
591 			((p[0] & 0x1F) << 6) |
592 			(p[1] & 0x3F));
593 	case 3:
594 		/* 1110xxxx 10xxxxxx 10xxxxxx */
595 		return (LWCHAR) (
596 			((p[0] & 0x0F) << 12) |
597 			((p[1] & 0x3F) << 6) |
598 			(p[2] & 0x3F));
599 	case 4:
600 		/* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
601 		return (LWCHAR) (
602 			((p[0] & 0x07) << 18) |
603 			((p[1] & 0x3F) << 12) |
604 			((p[2] & 0x3F) << 6) |
605 			(p[3] & 0x3F));
606 	case 5:
607 		/* 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx */
608 		return (LWCHAR) (
609 			((p[0] & 0x03) << 24) |
610 			((p[1] & 0x3F) << 18) |
611 			((p[2] & 0x3F) << 12) |
612 			((p[3] & 0x3F) << 6) |
613 			(p[4] & 0x3F));
614 	case 6:
615 		/* 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx */
616 		return (LWCHAR) (
617 			((p[0] & 0x01) << 30) |
618 			((p[1] & 0x3F) << 24) |
619 			((p[2] & 0x3F) << 18) |
620 			((p[3] & 0x3F) << 12) |
621 			((p[4] & 0x3F) << 6) |
622 			(p[5] & 0x3F));
623 	}
624 }
625 
626 /*
627  * Store a character into a UTF-8 string.
628  */
629 	public void
630 put_wchar(pp, ch)
631 	char **pp;
632 	LWCHAR ch;
633 {
634 	if (!utf_mode || ch < 0x80)
635 	{
636 		/* 0xxxxxxx */
637 		*(*pp)++ = (char) ch;
638 	} else if (ch < 0x800)
639 	{
640 		/* 110xxxxx 10xxxxxx */
641 		*(*pp)++ = (char) (0xC0 | ((ch >> 6) & 0x1F));
642 		*(*pp)++ = (char) (0x80 | (ch & 0x3F));
643 	} else if (ch < 0x10000)
644 	{
645 		/* 1110xxxx 10xxxxxx 10xxxxxx */
646 		*(*pp)++ = (char) (0xE0 | ((ch >> 12) & 0x0F));
647 		*(*pp)++ = (char) (0x80 | ((ch >> 6) & 0x3F));
648 		*(*pp)++ = (char) (0x80 | (ch & 0x3F));
649 	} else if (ch < 0x200000)
650 	{
651 		/* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
652 		*(*pp)++ = (char) (0xF0 | ((ch >> 18) & 0x07));
653 		*(*pp)++ = (char) (0x80 | ((ch >> 12) & 0x3F));
654 		*(*pp)++ = (char) (0x80 | ((ch >> 6) & 0x3F));
655 		*(*pp)++ = (char) (0x80 | (ch & 0x3F));
656 	} else if (ch < 0x4000000)
657 	{
658 		/* 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx */
659 		*(*pp)++ = (char) (0xF0 | ((ch >> 24) & 0x03));
660 		*(*pp)++ = (char) (0x80 | ((ch >> 18) & 0x3F));
661 		*(*pp)++ = (char) (0x80 | ((ch >> 12) & 0x3F));
662 		*(*pp)++ = (char) (0x80 | ((ch >> 6) & 0x3F));
663 		*(*pp)++ = (char) (0x80 | (ch & 0x3F));
664 	} else
665 	{
666 		/* 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx */
667 		*(*pp)++ = (char) (0xF0 | ((ch >> 30) & 0x01));
668 		*(*pp)++ = (char) (0x80 | ((ch >> 24) & 0x3F));
669 		*(*pp)++ = (char) (0x80 | ((ch >> 18) & 0x3F));
670 		*(*pp)++ = (char) (0x80 | ((ch >> 12) & 0x3F));
671 		*(*pp)++ = (char) (0x80 | ((ch >> 6) & 0x3F));
672 		*(*pp)++ = (char) (0x80 | (ch & 0x3F));
673 	}
674 }
675 
676 /*
677  * Step forward or backward one character in a string.
678  */
679 	public LWCHAR
680 step_char(pp, dir, limit)
681 	char **pp;
682 	signed int dir;
683 	constant char *limit;
684 {
685 	LWCHAR ch;
686 	int len;
687 	char *p = *pp;
688 
689 	if (!utf_mode)
690 	{
691 		/* It's easy if chars are one byte. */
692 		if (dir > 0)
693 			ch = (LWCHAR) ((p < limit) ? *p++ : 0);
694 		else
695 			ch = (LWCHAR) ((p > limit) ? *--p : 0);
696 	} else if (dir > 0)
697 	{
698 		len = utf_len(*p);
699 		if (p + len > limit)
700 		{
701 			ch = 0;
702 			p = (char *) limit;
703 		} else
704 		{
705 			ch = get_wchar(p);
706 			p += len;
707 		}
708 	} else
709 	{
710 		while (p > limit && IS_UTF8_TRAIL(p[-1]))
711 			p--;
712 		if (p > limit)
713 			ch = get_wchar(--p);
714 		else
715 			ch = 0;
716 	}
717 	*pp = p;
718 	return ch;
719 }
720 
721 /*
722  * Unicode characters data
723  * Actual data is in the generated *.uni files.
724  */
725 
726 #define DECLARE_RANGE_TABLE_START(name) \
727     static struct wchar_range name##_array[] = {
728 #define DECLARE_RANGE_TABLE_END(name) \
729     }; struct wchar_range_table name##_table = { name##_array, sizeof(name##_array)/sizeof(*name##_array) };
730 
731 DECLARE_RANGE_TABLE_START(compose)
732 #include "compose.uni"
733 DECLARE_RANGE_TABLE_END(compose)
734 
735 DECLARE_RANGE_TABLE_START(ubin)
736 #include "ubin.uni"
737 DECLARE_RANGE_TABLE_END(ubin)
738 
739 DECLARE_RANGE_TABLE_START(wide)
740 #include "wide.uni"
741 DECLARE_RANGE_TABLE_END(wide)
742 
743 /* comb_table is special pairs, not ranges. */
744 static struct wchar_range comb_table[] = {
745 	{0x0644,0x0622}, {0x0644,0x0623}, {0x0644,0x0625}, {0x0644,0x0627},
746 };
747 
748 
749 	static int
750 is_in_table(ch, table)
751 	LWCHAR ch;
752 	struct wchar_range_table *table;
753 {
754 	int hi;
755 	int lo;
756 
757 	/* Binary search in the table. */
758 	if (ch < table->table[0].first)
759 		return 0;
760 	lo = 0;
761 	hi = table->count - 1;
762 	while (lo <= hi)
763 	{
764 		int mid = (lo + hi) / 2;
765 		if (ch > table->table[mid].last)
766 			lo = mid + 1;
767 		else if (ch < table->table[mid].first)
768 			hi = mid - 1;
769 		else
770 			return 1;
771 	}
772 	return 0;
773 }
774 
775 /*
776  * Is a character a UTF-8 composing character?
777  * If a composing character follows any char, the two combine into one glyph.
778  */
779 	public int
780 is_composing_char(ch)
781 	LWCHAR ch;
782 {
783 	return is_in_table(ch, &compose_table);
784 }
785 
786 /*
787  * Should this UTF-8 character be treated as binary?
788  */
789 	public int
790 is_ubin_char(ch)
791 	LWCHAR ch;
792 {
793 	return is_in_table(ch, &ubin_table);
794 }
795 
796 /*
797  * Is this a double width UTF-8 character?
798  */
799 	public int
800 is_wide_char(ch)
801 	LWCHAR ch;
802 {
803 	return is_in_table(ch, &wide_table);
804 }
805 
806 /*
807  * Is a character a UTF-8 combining character?
808  * A combining char acts like an ordinary char, but if it follows
809  * a specific char (not any char), the two combine into one glyph.
810  */
811 	public int
812 is_combining_char(ch1, ch2)
813 	LWCHAR ch1;
814 	LWCHAR ch2;
815 {
816 	/* The table is small; use linear search. */
817 	int i;
818 	for (i = 0;  i < sizeof(comb_table)/sizeof(*comb_table);  i++)
819 	{
820 		if (ch1 == comb_table[i].first &&
821 		    ch2 == comb_table[i].last)
822 			return 1;
823 	}
824 	return 0;
825 }
826 
827