xref: /freebsd/lib/libiconv_modules/HZ/citrus_hz.c (revision 52f72944b8f5abb2386eae924357dee8aea17d5b)
1 /* $FreeBSD$ */
2 /* $NetBSD: citrus_hz.c,v 1.2 2008/06/14 16:01:07 tnozaki Exp $ */
3 
4 /*-
5  * SPDX-License-Identifier: BSD-2-Clause
6  *
7  * Copyright (c)2004, 2006 Citrus Project,
8  * All rights reserved.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
20  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
23  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29  * SUCH DAMAGE.
30  *
31  */
32 
33 #include <sys/cdefs.h>
34 #include <sys/queue.h>
35 #include <sys/types.h>
36 
37 #include <assert.h>
38 #include <errno.h>
39 #include <limits.h>
40 #include <stddef.h>
41 #include <stdint.h>
42 #include <stdlib.h>
43 #include <string.h>
44 #include <wchar.h>
45 
46 #include "citrus_namespace.h"
47 #include "citrus_types.h"
48 #include "citrus_bcs.h"
49 #include "citrus_module.h"
50 #include "citrus_stdenc.h"
51 
52 #include "citrus_hz.h"
53 #include "citrus_prop.h"
54 
55 /*
56  * wchar_t mapping:
57  *
58  * CTRL/ASCII	00000000 00000000 00000000 gxxxxxxx
59  * GB2312	00000000 00000000 0xxxxxxx gxxxxxxx
60  * 94/96*n (~M)	0mmmmmmm 0xxxxxxx 0xxxxxxx gxxxxxxx
61  */
62 
/*
 * Lead byte of every two-byte escape sequence recognised by the decoder
 * (charset switch, "~~" and "~\n") and emitted by the encoder.
 */
#define ESCAPE_CHAR	'~'
64 
/* Kinds of character set; the numeric value is the index into ranges[]. */
typedef enum {
	CTRL	= 0,
	ASCII	= 1,
	GB2312	= 2,
	CS94	= 3,
	CS96	= 4
} charset_t;

/* Inclusive range of byte values (after the 0x80 bit has been removed). */
typedef struct {
	int	 start;		/* lowest accepted value */
	int	 end;		/* highest accepted value */
	int	 width;		/* number of values in [start, end] */
} range_t;

/* Accepted byte range for each charset_t, indexed by the enum value. */
static const range_t ranges[] = {
#define RANGE(lo, hi)	{ (lo), (hi), ((hi) - (lo)) + 1 }
	[CTRL]		= RANGE(0x00, 0x1F),
	[ASCII]		= RANGE(0x20, 0x7F),
	[GB2312]	= RANGE(0x21, 0x7E),
	[CS94]		= RANGE(0x21, 0x7E),
	[CS96]		= RANGE(0x20, 0x7F),
#undef RANGE
};
84 
typedef struct escape_t escape_t;
/*
 * One graphic character set attached to the GL or GR side of an escape.
 */
typedef struct {
	charset_t	 charset;	/* kind of set; indexes ranges[] */
	escape_t	*escape;	/* escape this graphic is attached to */
	ssize_t		 length;	/* bytes per character (1..ROWCOL_MAX) */
/* Largest per-character byte count the configuration parser accepts. */
#define ROWCOL_MAX	3
} graphic_t;

typedef TAILQ_HEAD(escape_list, escape_t) escape_list;
/*
 * One shift state: the byte that follows ESCAPE_CHAR to select it, and the
 * character sets it maps onto bytes without (left) and with (right) the
 * 0x80 bit.
 */
struct escape_t {
	TAILQ_ENTRY(escape_t)	 entry;	/* linkage within *set */
	escape_list		*set;	/* list this escape belongs to */
	graphic_t		*left;	/* GL graphic, or NULL */
	graphic_t		*right;	/* GR graphic, or NULL */
	int			 ch;	/* byte following ESCAPE_CHAR */
};
101 
/* Field accessors for an escape_t pointer. */
#define GL(escape)	((escape)->left)	/* graphic for bytes < 0x80 */
#define GR(escape)	((escape)->right)	/* graphic for bytes >= 0x80 */
#define SET(escape)	((escape)->set)		/* owning escape list */
#define ESC(escape)	((escape)->ch)		/* byte after ESCAPE_CHAR */
/* First escape of the list that `escape' belongs to. */
#define INIT(escape)	(TAILQ_FIRST(SET(escape)))
107 
108 static __inline escape_t *
109 find_escape(escape_list *set, int ch)
110 {
111 	escape_t *escape;
112 
113 	TAILQ_FOREACH(escape, set, entry) {
114 		if (ESC(escape) == ch)
115 			break;
116 	}
117 
118 	return (escape);
119 }
120 
/*
 * Parsed module configuration.
 */
typedef struct {
	escape_list	 e0;		/* escapes declared under key "0" */
	escape_list	 e1;		/* escapes declared under key "1" */
	graphic_t	*ascii;		/* graphic configured as ASCII, or NULL */
	graphic_t	*gb2312;	/* graphic configured as GB2312, or NULL */
} _HZEncodingInfo;

/* The two escape lists of an encoding info, and their first entries. */
#define E0SET(ei)	(&(ei)->e0)
#define E1SET(ei)	(&(ei)->e1)
#define INIT0(ei)	(TAILQ_FIRST(E0SET(ei)))	/* initial shift state */
#define INIT1(ei)	(TAILQ_FIRST(E1SET(ei)))
132 
133 typedef struct {
134 	escape_t	*inuse;
135 	int		 chlen;
136 	char		 ch[ROWCOL_MAX];
137 } _HZState;
138 
/*
 * Configuration macros; presumably consumed by citrus_stdenc_template.h,
 * which is included at the end of this file -- confirm against the template.
 */
#define _CEI_TO_EI(_cei_)		(&(_cei_)->ei)
#define _CEI_TO_STATE(_cei_, _func_)	(_cei_)->states.s_##_func_

#define _FUNCNAME(m)			_citrus_HZ_##m
#define _ENCODING_INFO			_HZEncodingInfo
#define _ENCODING_STATE			_HZState
#define _ENCODING_MB_CUR_MAX(_ei_)	MB_LEN_MAX
#define _ENCODING_IS_STATE_DEPENDENT		1
/* A state whose `inuse' is still NULL has never been initialised. */
#define _STATE_NEEDS_EXPLICIT_INIT(_ps_)	((_ps_)->inuse == NULL)
148 
149 static __inline void
150 _citrus_HZ_init_state(_HZEncodingInfo * __restrict ei,
151     _HZState * __restrict psenc)
152 {
153 
154 	psenc->chlen = 0;
155 	psenc->inuse = INIT0(ei);
156 }
157 
/*
 * Compiled out: helpers that copy a whole _HZState to and from an opaque
 * buffer with memcpy().
 */
#if 0
static __inline void
/*ARGSUSED*/
_citrus_HZ_pack_state(_HZEncodingInfo * __restrict ei __unused,
    void *__restrict pspriv, const _HZState * __restrict psenc)
{

	memcpy(pspriv, (const void *)psenc, sizeof(*psenc));
}

static __inline void
/*ARGSUSED*/
_citrus_HZ_unpack_state(_HZEncodingInfo * __restrict ei __unused,
    _HZState * __restrict psenc, const void * __restrict pspriv)
{

	memcpy((void *)psenc, pspriv, sizeof(*psenc));
}
#endif
177 
178 static int
179 _citrus_HZ_mbrtowc_priv(_HZEncodingInfo * __restrict ei,
180     wchar_t * __restrict pwc, char ** __restrict s, size_t n,
181     _HZState * __restrict psenc, size_t * __restrict nresult)
182 {
183 	escape_t *candidate, *init;
184 	graphic_t *graphic;
185 	const range_t *range;
186 	char *s0;
187 	wchar_t wc;
188 	int bit, ch, head, len, tail;
189 
190 	if (*s == NULL) {
191 		_citrus_HZ_init_state(ei, psenc);
192 		*nresult = 1;
193 		return (0);
194 	}
195 	s0 = *s;
196 	if (psenc->chlen < 0 || psenc->inuse == NULL)
197 		return (EINVAL);
198 
199 	wc = (wchar_t)0;
200 	bit = head = tail = 0;
201 	graphic = NULL;
202 	for (len = 0; len <= MB_LEN_MAX;) {
203 		if (psenc->chlen == tail) {
204 			if (n-- < 1) {
205 				*s = s0;
206 				*nresult = (size_t)-2;
207 				return (0);
208 			}
209 			psenc->ch[psenc->chlen++] = *s0++;
210 			++len;
211 		}
212 		ch = (unsigned char)psenc->ch[tail++];
213 		if (tail == 1) {
214 			if ((ch & ~0x80) <= 0x1F) {
215 				if (psenc->inuse != INIT0(ei))
216 					break;
217 				wc = (wchar_t)ch;
218 				goto done;
219 			}
220 			if (ch & 0x80) {
221 				graphic = GR(psenc->inuse);
222 				bit = 0x80;
223 				ch &= ~0x80;
224 			} else {
225 				graphic = GL(psenc->inuse);
226 				if (ch == ESCAPE_CHAR)
227 					continue;
228 				bit = 0x0;
229 			}
230 			if (graphic == NULL)
231 				break;
232 		} else if (tail == 2 && psenc->ch[0] == ESCAPE_CHAR) {
233 			if (tail < psenc->chlen)
234 				return (EINVAL);
235 			if (ch == ESCAPE_CHAR) {
236 				++head;
237 			} else if (ch == '\n') {
238 				if (psenc->inuse != INIT0(ei))
239 					break;
240 				tail = psenc->chlen = 0;
241 				continue;
242 			} else {
243 				candidate = NULL;
244 				init = INIT0(ei);
245 				if (psenc->inuse == init) {
246 					init = INIT1(ei);
247 				} else if (INIT(psenc->inuse) == init) {
248 					if (ESC(init) != ch)
249 						break;
250 					candidate = init;
251 				}
252 				if (candidate == NULL) {
253 					candidate = find_escape(
254 					    SET(psenc->inuse), ch);
255 					if (candidate == NULL) {
256 						if (init == NULL ||
257 						    ESC(init) != ch)
258 							break;
259 						candidate = init;
260 					}
261 				}
262 				psenc->inuse = candidate;
263 				tail = psenc->chlen = 0;
264 				continue;
265 			}
266 		} else if (ch & 0x80) {
267 			if (graphic != GR(psenc->inuse))
268 				break;
269 			ch &= ~0x80;
270 		} else {
271 			if (graphic != GL(psenc->inuse))
272 				break;
273 		}
274 		range = &ranges[(size_t)graphic->charset];
275 		if (range->start > ch || range->end < ch)
276 			break;
277 		wc <<= 8;
278 		wc |= ch;
279 		if (graphic->length == (tail - head)) {
280 			if (graphic->charset > GB2312)
281 				bit |= ESC(psenc->inuse) << 24;
282 			wc |= bit;
283 			goto done;
284 		}
285 	}
286 	*nresult = (size_t)-1;
287 	return (EILSEQ);
288 done:
289 	if (tail < psenc->chlen)
290 		return (EINVAL);
291 	*s = s0;
292 	if (pwc != NULL)
293 		*pwc = wc;
294 	psenc->chlen = 0;
295 	*nresult = (wc == 0) ? 0 : len;
296 
297 	return (0);
298 }
299 
300 static int
301 _citrus_HZ_wcrtomb_priv(_HZEncodingInfo * __restrict ei,
302     char * __restrict s, size_t n, wchar_t wc,
303     _HZState * __restrict psenc, size_t * __restrict nresult)
304 {
305 	escape_t *candidate, *init;
306 	graphic_t *graphic;
307 	const range_t *range;
308 	size_t len;
309 	int bit, ch;
310 
311 	if (psenc->chlen != 0 || psenc->inuse == NULL)
312 		return (EINVAL);
313 	if (wc & 0x80) {
314 		bit = 0x80;
315 		wc &= ~0x80;
316 	} else {
317 		bit = 0x0;
318 	}
319 	if ((uint32_t)wc <= 0x1F) {
320 		candidate = INIT0(ei);
321 		graphic = (bit == 0) ? candidate->left : candidate->right;
322 		if (graphic == NULL)
323 			goto ilseq;
324 		range = &ranges[(size_t)CTRL];
325 		len = 1;
326 	} else if ((uint32_t)wc <= 0x7F) {
327 		graphic = ei->ascii;
328 		if (graphic == NULL)
329 			goto ilseq;
330 		candidate = graphic->escape;
331 		range = &ranges[(size_t)graphic->charset];
332 		len = graphic->length;
333 	} else if ((uint32_t)wc <= 0x7F7F) {
334 		graphic = ei->gb2312;
335 		if (graphic == NULL)
336 			goto ilseq;
337 		candidate = graphic->escape;
338 		range = &ranges[(size_t)graphic->charset];
339 		len = graphic->length;
340 	} else {
341 		ch = (wc >> 24) & 0xFF;
342 		candidate = find_escape(E0SET(ei), ch);
343 		if (candidate == NULL) {
344 			candidate = find_escape(E1SET(ei), ch);
345 			if (candidate == NULL)
346 				goto ilseq;
347 		}
348 		wc &= ~0xFF000000;
349 		graphic = (bit == 0) ? candidate->left : candidate->right;
350 		if (graphic == NULL)
351 			goto ilseq;
352 		range = &ranges[(size_t)graphic->charset];
353 		len = graphic->length;
354 	}
355 	if (psenc->inuse != candidate) {
356 		init = INIT0(ei);
357 		if (SET(psenc->inuse) == SET(candidate)) {
358 			if (INIT(psenc->inuse) != init ||
359 			    psenc->inuse == init || candidate == init)
360 				init = NULL;
361 		} else if (candidate == (init = INIT(candidate))) {
362 			init = NULL;
363 		}
364 		if (init != NULL) {
365 			if (n < 2)
366 				return (E2BIG);
367 			n -= 2;
368 			psenc->ch[psenc->chlen++] = ESCAPE_CHAR;
369 			psenc->ch[psenc->chlen++] = ESC(init);
370 		}
371 		if (n < 2)
372 			return (E2BIG);
373 		n -= 2;
374 		psenc->ch[psenc->chlen++] = ESCAPE_CHAR;
375 		psenc->ch[psenc->chlen++] = ESC(candidate);
376 		psenc->inuse = candidate;
377 	}
378 	if (n < len)
379 		return (E2BIG);
380 	while (len-- > 0) {
381 		ch = (wc >> (len * 8)) & 0xFF;
382 		if (range->start > ch || range->end < ch)
383 			goto ilseq;
384 		psenc->ch[psenc->chlen++] = ch | bit;
385 	}
386 	memcpy(s, psenc->ch, psenc->chlen);
387 	*nresult = psenc->chlen;
388 	psenc->chlen = 0;
389 
390 	return (0);
391 
392 ilseq:
393 	*nresult = (size_t)-1;
394 	return (EILSEQ);
395 }
396 
397 static __inline int
398 _citrus_HZ_put_state_reset(_HZEncodingInfo * __restrict ei,
399     char * __restrict s, size_t n, _HZState * __restrict psenc,
400     size_t * __restrict nresult)
401 {
402 	escape_t *candidate;
403 
404 	if (psenc->chlen != 0 || psenc->inuse == NULL)
405 		return (EINVAL);
406 	candidate = INIT0(ei);
407 	if (psenc->inuse != candidate) {
408 		if (n < 2)
409 			return (E2BIG);
410 		n -= 2;
411 		psenc->ch[psenc->chlen++] = ESCAPE_CHAR;
412 		psenc->ch[psenc->chlen++] = ESC(candidate);
413 	}
414 	if (n < 1)
415 		return (E2BIG);
416 	if (psenc->chlen > 0)
417 		memcpy(s, psenc->ch, psenc->chlen);
418 	*nresult = psenc->chlen;
419 	_citrus_HZ_init_state(ei, psenc);
420 
421 	return (0);
422 }
423 
424 static __inline int
425 _citrus_HZ_stdenc_get_state_desc_generic(_HZEncodingInfo * __restrict ei,
426     _HZState * __restrict psenc, int * __restrict rstate)
427 {
428 
429 	if (psenc->chlen < 0 || psenc->inuse == NULL)
430 		return (EINVAL);
431 	*rstate = (psenc->chlen == 0)
432 	    ? ((psenc->inuse == INIT0(ei))
433 	        ? _STDENC_SDGEN_INITIAL
434 	        : _STDENC_SDGEN_STABLE)
435 	    : ((psenc->ch[0] == ESCAPE_CHAR)
436 	        ? _STDENC_SDGEN_INCOMPLETE_SHIFT
437 	        : _STDENC_SDGEN_INCOMPLETE_CHAR);
438 
439 	return (0);
440 }
441 
442 static __inline int
443 /*ARGSUSED*/
444 _citrus_HZ_stdenc_wctocs(_HZEncodingInfo * __restrict ei __unused,
445     _csid_t * __restrict csid, _index_t * __restrict idx, wchar_t wc)
446 {
447 	int bit;
448 
449 	if (wc & 0x80) {
450 		bit = 0x80;
451 		wc &= ~0x80;
452 	} else
453 		bit = 0x0;
454 	if ((uint32_t)wc <= 0x7F) {
455 		*csid = (_csid_t)bit;
456 		*idx = (_index_t)wc;
457 	} else if ((uint32_t)wc <= 0x7F7F) {
458 		*csid = (_csid_t)(bit | 0x8000);
459 		*idx = (_index_t)wc;
460 	} else {
461 		*csid = (_index_t)(wc & ~0x00FFFF7F);
462 		*idx = (_csid_t)(wc & 0x00FFFF7F);
463 	}
464 
465 	return (0);
466 }
467 
468 static __inline int
469 /*ARGSUSED*/
470 _citrus_HZ_stdenc_cstowc(_HZEncodingInfo * __restrict ei __unused,
471     wchar_t * __restrict wc, _csid_t csid, _index_t idx)
472 {
473 
474 	*wc = (wchar_t)idx;
475 	switch (csid) {
476 	case 0x80:
477 	case 0x8080:
478 		*wc |= (wchar_t)0x80;
479 		/*FALLTHROUGH*/
480 	case 0x0:
481 	case 0x8000:
482 		break;
483 	default:
484 		*wc |= (wchar_t)csid;
485 	}
486 
487 	return (0);
488 }
489 
490 static void
491 _citrus_HZ_encoding_module_uninit(_HZEncodingInfo *ei)
492 {
493 	escape_t *escape;
494 
495 	while ((escape = TAILQ_FIRST(E0SET(ei))) != NULL) {
496 		TAILQ_REMOVE(E0SET(ei), escape, entry);
497 		free(GL(escape));
498 		free(GR(escape));
499 		free(escape);
500 	}
501 	while ((escape = TAILQ_FIRST(E1SET(ei))) != NULL) {
502 		TAILQ_REMOVE(E1SET(ei), escape, entry);
503 		free(GL(escape));
504 		free(GR(escape));
505 		free(escape);
506 	}
507 }
508 
509 static int
510 _citrus_HZ_parse_char(void *context, const char *name __unused, const char *s)
511 {
512 	escape_t *escape;
513 	void **p;
514 
515 	p = (void **)context;
516 	escape = (escape_t *)p[0];
517 	if (escape->ch != '\0')
518 		return (EINVAL);
519 	escape->ch = *s++;
520 	if (escape->ch == ESCAPE_CHAR || *s != '\0')
521 		return (EINVAL);
522 
523 	return (0);
524 }
525 
526 static int
527 _citrus_HZ_parse_graphic(void *context, const char *name, const char *s)
528 {
529 	_HZEncodingInfo *ei;
530 	escape_t *escape;
531 	graphic_t *graphic;
532 	void **p;
533 
534 	p = (void **)context;
535 	escape = (escape_t *)p[0];
536 	ei = (_HZEncodingInfo *)p[1];
537 	graphic = calloc(1, sizeof(*graphic));
538 	if (graphic == NULL)
539 		return (ENOMEM);
540 	if (strcmp("GL", name) == 0) {
541 		if (GL(escape) != NULL)
542 			goto release;
543 		GL(escape) = graphic;
544 	} else if (strcmp("GR", name) == 0) {
545 		if (GR(escape) != NULL)
546 			goto release;
547 		GR(escape) = graphic;
548 	} else {
549 release:
550 		free(graphic);
551 		return (EINVAL);
552 	}
553 	graphic->escape = escape;
554 	if (_bcs_strncasecmp("ASCII", s, 5) == 0) {
555 		if (s[5] != '\0')
556 			return (EINVAL);
557 		graphic->charset = ASCII;
558 		graphic->length = 1;
559 		ei->ascii = graphic;
560 		return (0);
561 	} else if (_bcs_strncasecmp("GB2312", s, 6) == 0) {
562 		if (s[6] != '\0')
563 			return (EINVAL);
564 		graphic->charset = GB2312;
565 		graphic->length = 2;
566 		ei->gb2312 = graphic;
567 		return (0);
568 	} else if (strncmp("94*", s, 3) == 0)
569 		graphic->charset = CS94;
570 	else if (strncmp("96*", s, 3) == 0)
571 		graphic->charset = CS96;
572 	else
573 		return (EINVAL);
574 	s += 3;
575 	switch(*s) {
576 	case '1': case '2': case '3':
577 		graphic->length = (size_t)(*s - '0');
578 		if (*++s == '\0')
579 			break;
580 	/*FALLTHROUGH*/
581 	default:
582 		return (EINVAL);
583 	}
584 	return (0);
585 }
586 
/*
 * Keys accepted inside one escape definition and the callbacks that
 * handle their string values.
 */
static const _citrus_prop_hint_t escape_hints[] = {
_CITRUS_PROP_HINT_STR("CH", &_citrus_HZ_parse_char),
_CITRUS_PROP_HINT_STR("GL", &_citrus_HZ_parse_graphic),
_CITRUS_PROP_HINT_STR("GR", &_citrus_HZ_parse_graphic),
_CITRUS_PROP_HINT_END
};
593 
594 static int
595 _citrus_HZ_parse_escape(void *context, const char *name, const char *s)
596 {
597 	_HZEncodingInfo *ei;
598 	escape_t *escape;
599 	void *p[2];
600 
601 	ei = (_HZEncodingInfo *)context;
602 	escape = calloc(1, sizeof(*escape));
603 	if (escape == NULL)
604 		return (EINVAL);
605 	if (strcmp("0", name) == 0) {
606 		escape->set = E0SET(ei);
607 		TAILQ_INSERT_TAIL(E0SET(ei), escape, entry);
608 	} else if (strcmp("1", name) == 0) {
609 		escape->set = E1SET(ei);
610 		TAILQ_INSERT_TAIL(E1SET(ei), escape, entry);
611 	} else {
612 		free(escape);
613 		return (EINVAL);
614 	}
615 	p[0] = (void *)escape;
616 	p[1] = (void *)ei;
617 	return (_citrus_prop_parse_variable(
618 	    escape_hints, (void *)&p[0], s, strlen(s)));
619 }
620 
/*
 * Top-level keys of the module variable: "0" and "1" each introduce one
 * escape definition for the corresponding escape list.
 */
static const _citrus_prop_hint_t root_hints[] = {
_CITRUS_PROP_HINT_STR("0", &_citrus_HZ_parse_escape),
_CITRUS_PROP_HINT_STR("1", &_citrus_HZ_parse_escape),
_CITRUS_PROP_HINT_END
};
626 
627 static int
628 _citrus_HZ_encoding_module_init(_HZEncodingInfo * __restrict ei,
629     const void * __restrict var, size_t lenvar)
630 {
631 	int errnum;
632 
633 	memset(ei, 0, sizeof(*ei));
634 	TAILQ_INIT(E0SET(ei));
635 	TAILQ_INIT(E1SET(ei));
636 	errnum = _citrus_prop_parse_variable(
637 	    root_hints, (void *)ei, var, lenvar);
638 	if (errnum != 0)
639 		_citrus_HZ_encoding_module_uninit(ei);
640 	return (errnum);
641 }
642 
643 /* ----------------------------------------------------------------------
644  * public interface for stdenc
645  */
646 
647 _CITRUS_STDENC_DECLS(HZ);
648 _CITRUS_STDENC_DEF_OPS(HZ);
649 
650 #include "citrus_stdenc_template.h"
651