xref: /freebsd/contrib/less/charset.c (revision 1ea316270f1f75922ac53976d5d8808a41442f46)
1 /*
2  * Copyright (C) 1984-2015  Mark Nudelman
3  *
4  * You may distribute under the terms of either the GNU General Public
5  * License or the Less License, as specified in the README file.
6  *
7  * For more information, see the README file.
8  */
9 
10 
11 /*
12  * Functions to define the character set
13  * and do things specific to the character set.
14  */
15 
16 #include "less.h"
17 #if HAVE_LOCALE
18 #include <locale.h>
19 #include <ctype.h>
20 #include <langinfo.h>
21 #endif
22 
23 #include "charset.h"
24 
/*
 * Nonzero once a UTF-8 character set has been selected.
 * It is set through the p_flag pointer of the "utf-8" entry below.
 */
public int utf_mode = 0;

/*
 * One predefined character set.
 *	name	charset name, as given in the LESSCHARSET environment variable
 *	p_flag	if not NULL, an int that icharset() sets to 1 when
 *		this charset is chosen
 *	desc	description string in the format parsed by ichardef()
 */
struct charset {
	char *name;
	int *p_flag;
	char *desc;
};

/*
 * The predefined character sets.
 * The list is terminated by an entry whose name is NULL.
 */
struct charset charsets[] = {
	{ "ascii",        NULL,      "8bcccbcc18b95.b" },
	{ "utf-8",        &utf_mode, "8bcccbcc18b95.b126.bb" },
	{ "iso8859",      NULL,      "8bcccbcc18b95.33b." },
	{ "latin3",       NULL,      "8bcccbcc18b95.33b5.b8.b15.b4.b12.b18.b12.b." },
	{ "arabic",       NULL,      "8bcccbcc18b95.33b.3b.7b2.13b.3b.b26.5b19.b" },
	{ "greek",        NULL,      "8bcccbcc18b95.33b4.2b4.b3.b35.b44.b" },
	{ "greek2005",    NULL,      "8bcccbcc18b95.33b14.b35.b44.b" },
	{ "hebrew",       NULL,      "8bcccbcc18b95.33b.b29.32b28.2b2.b" },
	{ "koi8-r",       NULL,      "8bcccbcc18b95.b." },
	{ "KOI8-T",       NULL,      "8bcccbcc18b95.b8.b6.b8.b.b.5b7.3b4.b4.b3.b.b.3b." },
	{ "georgianps",   NULL,      "8bcccbcc18b95.3b11.4b12.2b." },
	{ "tcvn",         NULL,      "b..b...bcccbccbbb7.8b95.b48.5b." },
	{ "TIS-620",      NULL,      "8bcccbcc18b95.b.4b.11b7.8b." },
	{ "next",         NULL,      "8bcccbcc18b95.bb125.bb" },
	{ "dos",          NULL,      "8bcccbcc12bc5b95.b." },
	{ "windows-1251", NULL,      "8bcccbcc12bc5b95.b24.b." },
	{ "windows-1252", NULL,      "8bcccbcc12bc5b95.b.b11.b.2b12.b." },
	{ "windows-1255", NULL,      "8bcccbcc12bc5b95.b.b8.b.5b9.b.4b." },
	{ "ebcdic",       NULL,      "5bc6bcc7bcc41b.9b7.9b5.b..8b6.10b6.b9.7b9.8b8.17b3.3b9.7b9.8b8.6b10.b.b.b." },
	{ "IBM-1047",     NULL,      "4cbcbc3b9cbccbccbb4c6bcc5b3cbbc4bc4bccbc191.b" },
	{ NULL, NULL, NULL }
};
58 
/*
 * Alternate names for the predefined character sets.
 * This lets "locale charmap"/nl_langinfo(CODESET) values, as well as
 * some other common spellings, be used as charset names.
 *	name	the alias
 *	oname	the name of the entry in charsets[] that it stands for
 */
struct cs_alias {
	char *name;
	char *oname;
};

/*
 * The alias list.  It is terminated by an entry whose name is NULL.
 */
struct cs_alias cs_aliases[] = {
	{ "UTF-8",          "utf-8" },
	{ "ANSI_X3.4-1968", "ascii" },
	{ "US-ASCII",       "ascii" },
	{ "latin1",         "iso8859" },
	{ "ISO-8859-1",     "iso8859" },
	{ "latin9",         "iso8859" },
	{ "ISO-8859-15",    "iso8859" },
	{ "latin2",         "iso8859" },
	{ "ISO-8859-2",     "iso8859" },
	{ "ISO-8859-3",     "latin3" },
	{ "latin4",         "iso8859" },
	{ "ISO-8859-4",     "iso8859" },
	{ "cyrillic",       "iso8859" },
	{ "ISO-8859-5",     "iso8859" },
	{ "ISO-8859-6",     "arabic" },
	{ "ISO-8859-7",     "greek" },
	{ "IBM9005",        "greek2005" },
	{ "ISO-8859-8",     "hebrew" },
	{ "latin5",         "iso8859" },
	{ "ISO-8859-9",     "iso8859" },
	{ "latin6",         "iso8859" },
	{ "ISO-8859-10",    "iso8859" },
	{ "latin7",         "iso8859" },
	{ "ISO-8859-13",    "iso8859" },
	{ "latin8",         "iso8859" },
	{ "ISO-8859-14",    "iso8859" },
	{ "latin10",        "iso8859" },
	{ "ISO-8859-16",    "iso8859" },
	{ "IBM437",         "dos" },
	{ "EBCDIC-US",      "ebcdic" },
	{ "IBM1047",        "IBM-1047" },
	{ "KOI8-R",         "koi8-r" },
	{ "KOI8-U",         "koi8-r" },
	{ "GEORGIAN-PS",    "georgianps" },
	{ "TCVN5712-1",     "tcvn" },
	{ "NEXTSTEP",       "next" },
	{ "windows",        "windows-1252" }, /* backward compatibility */
	{ "CP1251",         "windows-1251" },
	{ "CP1252",         "windows-1252" },
	{ "CP1255",         "windows-1255" },
	{ NULL, NULL }
};
108 
109 #define	IS_BINARY_CHAR	01
110 #define	IS_CONTROL_CHAR	02
111 
112 static char chardef[256];
113 static char *binfmt = NULL;
114 static char *utfbinfmt = NULL;
115 public int binattr = AT_STANDOUT;
116 
117 
118 /*
119  * Define a charset, given a description string.
120  * The string consists of 256 letters,
121  * one for each character in the charset.
122  * If the string is shorter than 256 letters, missing letters
123  * are taken to be identical to the last one.
124  * A decimal number followed by a letter is taken to be a
125  * repetition of the letter.
126  *
127  * Each letter is one of:
128  *	. normal character
129  *	b binary character
130  *	c control character
131  */
132 	static void
133 ichardef(char *s)
134 {
135 	char *cp;
136 	int n;
137 	char v;
138 
139 	n = 0;
140 	v = 0;
141 	cp = chardef;
142 	while (*s != '\0')
143 	{
144 		switch (*s++)
145 		{
146 		case '.':
147 			v = 0;
148 			break;
149 		case 'c':
150 			v = IS_CONTROL_CHAR;
151 			break;
152 		case 'b':
153 			v = IS_BINARY_CHAR|IS_CONTROL_CHAR;
154 			break;
155 
156 		case '0': case '1': case '2': case '3': case '4':
157 		case '5': case '6': case '7': case '8': case '9':
158 			n = (10 * n) + (s[-1] - '0');
159 			continue;
160 
161 		default:
162 			error("invalid chardef", NULL_PARG);
163 			quit(QUIT_ERROR);
164 			/*NOTREACHED*/
165 		}
166 
167 		do
168 		{
169 			if (cp >= chardef + sizeof(chardef))
170 			{
171 				error("chardef longer than 256", NULL_PARG);
172 				quit(QUIT_ERROR);
173 				/*NOTREACHED*/
174 			}
175 			*cp++ = v;
176 		} while (--n > 0);
177 		n = 0;
178 	}
179 
180 	while (cp < chardef + sizeof(chardef))
181 		*cp++ = v;
182 }
183 
184 /*
185  * Define a charset, given a charset name.
186  * The valid charset names are listed in the "charsets" array.
187  */
188 	static int
189 icharset(char *name, int no_error)
190 {
191 	struct charset *p;
192 	struct cs_alias *a;
193 
194 	if (name == NULL || *name == '\0')
195 		return (0);
196 
197 	/* First see if the name is an alias. */
198 	for (a = cs_aliases;  a->name != NULL;  a++)
199 	{
200 		if (strcmp(name, a->name) == 0)
201 		{
202 			name = a->oname;
203 			break;
204 		}
205 	}
206 
207 	for (p = charsets;  p->name != NULL;  p++)
208 	{
209 		if (strcmp(name, p->name) == 0)
210 		{
211 			ichardef(p->desc);
212 			if (p->p_flag != NULL)
213 				*(p->p_flag) = 1;
214 			return (1);
215 		}
216 	}
217 
218 	if (!no_error) {
219 		error("invalid charset name", NULL_PARG);
220 		quit(QUIT_ERROR);
221 	}
222 	return (0);
223 }
224 
#if HAVE_LOCALE
/*
 * Define the charset from the <ctype.h> classification functions:
 * a byte value is normal if isprint() accepts it, a control character
 * if iscntrl() accepts it, and binary otherwise.
 */
	static void
ilocale(void)
{
	int c = 0;

	while (c < (int) sizeof(chardef))
	{
		char flags;

		if (isprint(c))
			flags = 0;
		else if (iscntrl(c))
			flags = IS_CONTROL_CHAR;
		else
			flags = IS_BINARY_CHAR|IS_CONTROL_CHAR;
		chardef[c] = flags;
		c++;
	}
}
#endif
245 
246 /*
247  * Define the printing format for control (or binary utf) chars.
248  */
249    	static void
250 setbinfmt(char *s, char **fmtvarptr, char *default_fmt)
251 {
252 	if (s && utf_mode)
253 	{
254 		/* It would be too hard to account for width otherwise.  */
255 		char *t = s;
256 		while (*t)
257 		{
258 			if (*t < ' ' || *t > '~')
259 			{
260 				s = default_fmt;
261 				goto attr;
262 			}
263 			t++;
264 		}
265 	}
266 
267 	/* %n is evil */
268 	if (s == NULL || *s == '\0' ||
269 	    (*s == '*' && (s[1] == '\0' || s[2] == '\0' || strchr(s + 2, 'n'))) ||
270 	    (*s != '*' && strchr(s, 'n')))
271 		s = default_fmt;
272 
273 	/*
274 	 * Select the attributes if it starts with "*".
275 	 */
276  attr:
277 	if (*s == '*')
278 	{
279 		switch (s[1])
280 		{
281 		case 'd':  binattr = AT_BOLD;      break;
282 		case 'k':  binattr = AT_BLINK;     break;
283 		case 's':  binattr = AT_STANDOUT;  break;
284 		case 'u':  binattr = AT_UNDERLINE; break;
285 		default:   binattr = AT_NORMAL;    break;
286 		}
287 		s += 2;
288 	}
289 	*fmtvarptr = s;
290 }
291 
/*
 * Choose the character set.
 * The first of these that yields a charset wins:
 *	1. the charset named by the LESSCHARSET environment variable
 *	   (an unknown name there is a fatal error);
 *	2. the description in the LESSCHARDEF environment variable;
 *	3. the codeset name from nl_langinfo(CODESET), if available;
 *	4. "utf-8", if LC_ALL, LC_CTYPE or LANG mentions UTF-8;
 *	5. the locale's <ctype.h> classification, if locale support is
 *	   compiled in; otherwise the built-in "dos" or "latin1" charset.
 */
	static void
set_charset(void)
{
	char *s;

	/*
	 * See if environment variable LESSCHARSET is defined.
	 */
	s = lgetenv("LESSCHARSET");
	if (icharset(s, 0))
		return;

	/*
	 * LESSCHARSET is not defined: try LESSCHARDEF.
	 */
	s = lgetenv("LESSCHARDEF");
	if (s != NULL && *s != '\0')
	{
		ichardef(s);
		return;
	}

#if HAVE_LOCALE
#ifdef CODESET
	/*
	 * Try using the codeset name as the charset name.
	 */
	s = nl_langinfo(CODESET);
	if (icharset(s, 1))
		return;
#endif
#endif

#if HAVE_STRSTR
	/*
	 * Check whether LC_ALL, LC_CTYPE or LANG look like UTF-8 is used.
	 */
	if ((s = lgetenv("LC_ALL")) != NULL ||
	    (s = lgetenv("LC_CTYPE")) != NULL ||
	    (s = lgetenv("LANG")) != NULL)
	{
		if (   strstr(s, "UTF-8") != NULL || strstr(s, "utf-8") != NULL
		    || strstr(s, "UTF8")  != NULL || strstr(s, "utf8")  != NULL)
			if (icharset("utf-8", 1))
				return;
	}
#endif

#if HAVE_LOCALE
	/*
	 * Get character definitions from locale functions,
	 * rather than from predefined charset entry.
	 */
	ilocale();
#else
	/*
	 * No locale support: fall back to a predefined charset.
	 * This must be an alternative to ilocale(), not run after it,
	 * because icharset() rewrites the whole table that ilocale() fills.
	 */
#if MSDOS_COMPILER
	/*
	 * Default to "dos".
	 */
	(void) icharset("dos", 1);
#else
	/*
	 * Default to "latin1".
	 */
	(void) icharset("latin1", 1);
#endif
#endif
}
362 
363 /*
364  * Initialize charset data structures.
365  */
366 	public void
367 init_charset(void)
368 {
369 	char *s;
370 
371 #if HAVE_LOCALE
372 	setlocale(LC_ALL, "");
373 #endif
374 
375 	set_charset();
376 
377 	s = lgetenv("LESSBINFMT");
378 	setbinfmt(s, &binfmt, "*s<%02X>");
379 
380 	s = lgetenv("LESSUTFBINFMT");
381 	setbinfmt(s, &utfbinfmt, "<U+%04lX>");
382 }
383 
384 /*
385  * Is a given character a "binary" character?
386  */
387 	public int
388 binary_char(LWCHAR c)
389 {
390 	if (utf_mode)
391 		return (is_ubin_char(c));
392 	c &= 0377;
393 	return (chardef[c] & IS_BINARY_CHAR);
394 }
395 
396 /*
397  * Is a given character a "control" character?
398  */
399 	public int
400 control_char(LWCHAR c)
401 {
402 	c &= 0377;
403 	return (chardef[c] & IS_CONTROL_CHAR);
404 }
405 
406 /*
407  * Return the printable form of a character.
408  * For example, in the "ascii" charset '\3' is printed as "^C".
409  */
410 	public char *
411 prchar(LWCHAR c)
412 {
413 	/* {{ This buffer can be overrun if LESSBINFMT is a long string. }} */
414 	static char buf[32];
415 
416 	c &= 0377;
417 	if ((c < 128 || !utf_mode) && !control_char(c))
418 		SNPRINTF1(buf, sizeof(buf), "%c", (int) c);
419 	else if (c == ESC)
420 		strcpy(buf, "ESC");
421 #if IS_EBCDIC_HOST
422 	else if (!binary_char(c) && c < 64)
423 		SNPRINTF1(buf, sizeof(buf), "^%c",
424 		/*
425 		 * This array roughly inverts CONTROL() #defined in less.h,
426 	 	 * and should be kept in sync with CONTROL() and IBM-1047.
427  	 	 */
428 		"@ABC.I.?...KLMNO"
429 		"PQRS.JH.XY.."
430 		"\\]^_"
431 		"......W[.....EFG"
432 		"..V....D....TU.Z"[c]);
433 #else
434   	else if (c < 128 && !control_char(c ^ 0100))
435   		SNPRINTF1(buf, sizeof(buf), "^%c", (int) (c ^ 0100));
436 #endif
437 	else
438 		SNPRINTF1(buf, sizeof(buf), binfmt, c);
439 	return (buf);
440 }
441 
442 /*
443  * Return the printable form of a UTF-8 character.
444  */
445 	public char *
446 prutfchar(LWCHAR ch)
447 {
448 	static char buf[32];
449 
450 	if (ch == ESC)
451 		strcpy(buf, "ESC");
452   	else if (ch < 128 && control_char(ch))
453 	{
454 		if (!control_char(ch ^ 0100))
455 			SNPRINTF1(buf, sizeof(buf), "^%c", ((char) ch) ^ 0100);
456 		else
457 			SNPRINTF1(buf, sizeof(buf), binfmt, (char) ch);
458 	} else if (is_ubin_char(ch))
459 	{
460 		SNPRINTF1(buf, sizeof(buf), utfbinfmt, ch);
461 	} else
462 	{
463 		char *p = buf;
464 		if (ch >= 0x80000000)
465 			ch = 0xFFFD; /* REPLACEMENT CHARACTER */
466 		put_wchar(&p, ch);
467 		*p = '\0';
468 	}
469 	return (buf);
470 }
471 
/*
 * Return the length in bytes of the UTF-8 character whose
 * lead byte is ch.  The result is between 1 and 6.
 * A byte that cannot start a character (a trail byte, 0xFE or 0xFF)
 * is reported as length 1.
 */
	public int
utf_len(char ch)
{
	unsigned char lead = (unsigned char) ch;

	if (lead < 0x80)
		return 1;	/* 0xxxxxxx */
	if (lead < 0xC0)
		return 1;	/* 10xxxxxx: trail byte, invalid as a lead */
	if (lead < 0xE0)
		return 2;	/* 110xxxxx */
	if (lead < 0xF0)
		return 3;	/* 1110xxxx */
	if (lead < 0xF8)
		return 4;	/* 11110xxx */
	if (lead < 0xFC)
		return 5;	/* 111110xx */
	if (lead < 0xFE)
		return 6;	/* 1111110x */
	/* Invalid UTF-8 encoding. */
	return 1;
}
493 
494 /*
495  * Does the parameter point to the lead byte of a well-formed UTF-8 character?
496  */
497 	public int
498 is_utf8_well_formed(unsigned char *s, int slen)
499 {
500 	int i;
501 	int len;
502 
503 	if (IS_UTF8_INVALID(s[0]))
504 		return (0);
505 
506 	len = utf_len((char) s[0]);
507 	if (len > slen)
508 		return (0);
509 	if (len == 1)
510 		return (1);
511 	if (len == 2)
512 	{
513 		if (s[0] < 0xC2)
514 		    return (0);
515 	} else
516 	{
517 		unsigned char mask;
518 		mask = (~((1 << (8-len)) - 1)) & 0xFF;
519 		if (s[0] == mask && (s[1] & mask) == 0x80)
520 			return (0);
521 	}
522 
523 	for (i = 1;  i < len;  i++)
524 		if (!IS_UTF8_TRAIL(s[i]))
525 			return (0);
526 	return (1);
527 }
528 
529 /*
530  * Return number of invalid UTF-8 sequences found in a buffer.
531  */
532 	public int
533 utf_bin_count(unsigned char *data, int len)
534 {
535 	int bin_count = 0;
536 	while (len > 0)
537 	{
538 		if (is_utf8_well_formed(data, len))
539 		{
540 			int clen = utf_len(*data);
541 			data += clen;
542 			len -= clen;
543 		} else
544 		{
545 			/* Skip to next lead byte. */
546 			bin_count++;
547 			do {
548 				++data;
549 				--len;
550 			} while (len > 0 && !IS_UTF8_LEAD(*data));
551 		}
552 	}
553 	return (bin_count);
554 }
555 
556 /*
557  * Get the value of a UTF-8 character.
558  */
559 	public LWCHAR
560 get_wchar(constant char *p)
561 {
562 	switch (utf_len(p[0]))
563 	{
564 	case 1:
565 	default:
566 		/* 0xxxxxxx */
567 		return (LWCHAR)
568 			(p[0] & 0xFF);
569 	case 2:
570 		/* 110xxxxx 10xxxxxx */
571 		return (LWCHAR) (
572 			((p[0] & 0x1F) << 6) |
573 			(p[1] & 0x3F));
574 	case 3:
575 		/* 1110xxxx 10xxxxxx 10xxxxxx */
576 		return (LWCHAR) (
577 			((p[0] & 0x0F) << 12) |
578 			((p[1] & 0x3F) << 6) |
579 			(p[2] & 0x3F));
580 	case 4:
581 		/* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
582 		return (LWCHAR) (
583 			((p[0] & 0x07) << 18) |
584 			((p[1] & 0x3F) << 12) |
585 			((p[2] & 0x3F) << 6) |
586 			(p[3] & 0x3F));
587 	case 5:
588 		/* 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx */
589 		return (LWCHAR) (
590 			((p[0] & 0x03) << 24) |
591 			((p[1] & 0x3F) << 18) |
592 			((p[2] & 0x3F) << 12) |
593 			((p[3] & 0x3F) << 6) |
594 			(p[4] & 0x3F));
595 	case 6:
596 		/* 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx */
597 		return (LWCHAR) (
598 			((p[0] & 0x01) << 30) |
599 			((p[1] & 0x3F) << 24) |
600 			((p[2] & 0x3F) << 18) |
601 			((p[3] & 0x3F) << 12) |
602 			((p[4] & 0x3F) << 6) |
603 			(p[5] & 0x3F));
604 	}
605 }
606 
607 /*
608  * Store a character into a UTF-8 string.
609  */
610 	public void
611 put_wchar(char **pp, LWCHAR ch)
612 {
613 	if (!utf_mode || ch < 0x80)
614 	{
615 		/* 0xxxxxxx */
616 		*(*pp)++ = (char) ch;
617 	} else if (ch < 0x800)
618 	{
619 		/* 110xxxxx 10xxxxxx */
620 		*(*pp)++ = (char) (0xC0 | ((ch >> 6) & 0x1F));
621 		*(*pp)++ = (char) (0x80 | (ch & 0x3F));
622 	} else if (ch < 0x10000)
623 	{
624 		/* 1110xxxx 10xxxxxx 10xxxxxx */
625 		*(*pp)++ = (char) (0xE0 | ((ch >> 12) & 0x0F));
626 		*(*pp)++ = (char) (0x80 | ((ch >> 6) & 0x3F));
627 		*(*pp)++ = (char) (0x80 | (ch & 0x3F));
628 	} else if (ch < 0x200000)
629 	{
630 		/* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
631 		*(*pp)++ = (char) (0xF0 | ((ch >> 18) & 0x07));
632 		*(*pp)++ = (char) (0x80 | ((ch >> 12) & 0x3F));
633 		*(*pp)++ = (char) (0x80 | ((ch >> 6) & 0x3F));
634 		*(*pp)++ = (char) (0x80 | (ch & 0x3F));
635 	} else if (ch < 0x4000000)
636 	{
637 		/* 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx */
638 		*(*pp)++ = (char) (0xF0 | ((ch >> 24) & 0x03));
639 		*(*pp)++ = (char) (0x80 | ((ch >> 18) & 0x3F));
640 		*(*pp)++ = (char) (0x80 | ((ch >> 12) & 0x3F));
641 		*(*pp)++ = (char) (0x80 | ((ch >> 6) & 0x3F));
642 		*(*pp)++ = (char) (0x80 | (ch & 0x3F));
643 	} else
644 	{
645 		/* 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx */
646 		*(*pp)++ = (char) (0xF0 | ((ch >> 30) & 0x01));
647 		*(*pp)++ = (char) (0x80 | ((ch >> 24) & 0x3F));
648 		*(*pp)++ = (char) (0x80 | ((ch >> 18) & 0x3F));
649 		*(*pp)++ = (char) (0x80 | ((ch >> 12) & 0x3F));
650 		*(*pp)++ = (char) (0x80 | ((ch >> 6) & 0x3F));
651 		*(*pp)++ = (char) (0x80 | (ch & 0x3F));
652 	}
653 }
654 
655 /*
656  * Step forward or backward one character in a string.
657  */
658 	public LWCHAR
659 step_char(constant char **pp, signed int dir, constant char *limit)
660 {
661 	LWCHAR ch;
662 	int len;
663 	constant char *p = *pp;
664 
665 	if (!utf_mode)
666 	{
667 		/* It's easy if chars are one byte. */
668 		if (dir > 0)
669 			ch = (LWCHAR) ((p < limit) ? *p++ : 0);
670 		else
671 			ch = (LWCHAR) ((p > limit) ? *--p : 0);
672 	} else if (dir > 0)
673 	{
674 		len = utf_len(*p);
675 		if (p + len > limit)
676 		{
677 			ch = 0;
678 			p = limit;
679 		} else
680 		{
681 			ch = get_wchar(p);
682 			p += len;
683 		}
684 	} else
685 	{
686 		while (p > limit && IS_UTF8_TRAIL(p[-1]))
687 			p--;
688 		if (p > limit)
689 			ch = get_wchar(--p);
690 		else
691 			ch = 0;
692 	}
693 	*pp = p;
694 	return ch;
695 }
696 
697 /*
698  * Unicode characters data
699  * Actual data is in the generated *.uni files.
700  */
701 
702 #define DECLARE_RANGE_TABLE_START(name) \
703     static struct wchar_range name##_array[] = {
704 #define DECLARE_RANGE_TABLE_END(name) \
705     }; struct wchar_range_table name##_table = { name##_array, sizeof(name##_array)/sizeof(*name##_array) };
706 
707 DECLARE_RANGE_TABLE_START(compose)
708 #include "compose.uni"
709 DECLARE_RANGE_TABLE_END(compose)
710 
711 DECLARE_RANGE_TABLE_START(ubin)
712 #include "ubin.uni"
713 DECLARE_RANGE_TABLE_END(ubin)
714 
715 DECLARE_RANGE_TABLE_START(wide)
716 #include "wide.uni"
717 DECLARE_RANGE_TABLE_END(wide)
718 
719 /* comb_table is special pairs, not ranges. */
720 static struct wchar_range comb_table[] = {
721 	{0x0644,0x0622}, {0x0644,0x0623}, {0x0644,0x0625}, {0x0644,0x0627},
722 };
723 
724 
725 	static int
726 is_in_table(LWCHAR ch, struct wchar_range_table *table)
727 {
728 	int hi;
729 	int lo;
730 
731 	/* Binary search in the table. */
732 	if (ch < table->table[0].first)
733 		return 0;
734 	lo = 0;
735 	hi = table->count - 1;
736 	while (lo <= hi)
737 	{
738 		int mid = (lo + hi) / 2;
739 		if (ch > table->table[mid].last)
740 			lo = mid + 1;
741 		else if (ch < table->table[mid].first)
742 			hi = mid - 1;
743 		else
744 			return 1;
745 	}
746 	return 0;
747 }
748 
749 /*
750  * Is a character a UTF-8 composing character?
751  * If a composing character follows any char, the two combine into one glyph.
752  */
753 	public int
754 is_composing_char(LWCHAR ch)
755 {
756 	return is_in_table(ch, &compose_table);
757 }
758 
759 /*
760  * Should this UTF-8 character be treated as binary?
761  */
762 	public int
763 is_ubin_char(LWCHAR ch)
764 {
765 	return is_in_table(ch, &ubin_table);
766 }
767 
768 /*
769  * Is this a double width UTF-8 character?
770  */
771 	public int
772 is_wide_char(LWCHAR ch)
773 {
774 	return is_in_table(ch, &wide_table);
775 }
776 
777 /*
778  * Is a character a UTF-8 combining character?
779  * A combining char acts like an ordinary char, but if it follows
780  * a specific char (not any char), the two combine into one glyph.
781  */
782 	public int
783 is_combining_char(LWCHAR ch1, LWCHAR ch2)
784 {
785 	/* The table is small; use linear search. */
786 	int i;
787 	for (i = 0;  i < sizeof(comb_table)/sizeof(*comb_table);  i++)
788 	{
789 		if (ch1 == comb_table[i].first &&
790 		    ch2 == comb_table[i].last)
791 			return 1;
792 	}
793 	return 0;
794 }
795 
796