xref: /freebsd/lib/libiconv_modules/HZ/citrus_hz.c (revision b9128a37faafede823eb456aa65a11ac69997284)
1 /* $NetBSD: citrus_hz.c,v 1.2 2008/06/14 16:01:07 tnozaki Exp $ */
2 
3 /*-
4  * SPDX-License-Identifier: BSD-2-Clause
5  *
6  * Copyright (c)2004, 2006 Citrus Project,
7  * All rights reserved.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in the
16  *    documentation and/or other materials provided with the distribution.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
19  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
22  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28  * SUCH DAMAGE.
29  *
30  */
31 
32 #include <sys/cdefs.h>
33 #include <sys/queue.h>
34 #include <sys/types.h>
35 
36 #include <assert.h>
37 #include <errno.h>
38 #include <limits.h>
39 #include <stddef.h>
40 #include <stdint.h>
41 #include <stdlib.h>
42 #include <string.h>
43 #include <wchar.h>
44 
45 #include "citrus_namespace.h"
46 #include "citrus_types.h"
47 #include "citrus_bcs.h"
48 #include "citrus_module.h"
49 #include "citrus_stdenc.h"
50 
51 #include "citrus_hz.h"
52 #include "citrus_prop.h"
53 
54 /*
55  * wchar_t mapping:
56  *
57  * CTRL/ASCII	00000000 00000000 00000000 gxxxxxxx
58  * GB2312	00000000 00000000 0xxxxxxx gxxxxxxx
59  * 94/96*n (~M)	0mmmmmmm 0xxxxxxx 0xxxxxxx gxxxxxxx
60  */
61 
/* Lead byte of every two-byte escape sequence. */
#define ESCAPE_CHAR	'~'

/*
 * Character-set classes.  The numeric value of each enumerator is used
 * as the index into ranges[] below.
 */
typedef enum {
	CTRL	= 0,
	ASCII	= 1,
	GB2312	= 2,
	CS94	= 3,
	CS96	= 4
} charset_t;

/* Inclusive span [start, end] of byte values, plus its size. */
typedef struct {
	int	 start;
	int	 end;
	int	 width;
} range_t;

/* Valid byte values (high bit stripped) for each character-set class. */
#define RANGE(lo, hi)	{ (lo), (hi), ((hi) - (lo)) + 1 }
static const range_t ranges[] = {
	[CTRL]   = RANGE(0x00, 0x1F),
	[ASCII]  = RANGE(0x20, 0x7F),
	[GB2312] = RANGE(0x21, 0x7E),
	[CS94]   = RANGE(0x21, 0x7E),
	[CS96]   = RANGE(0x20, 0x7F),
};
#undef RANGE
83 
/* Upper bound on the number of bytes buffered for one character. */
#define ROWCOL_MAX	3

typedef struct escape_t escape_t;

/*
 * One graphic character set that an escape sequence designates into the
 * left (high bit clear) or right (high bit set) half.
 */
typedef struct {
	charset_t	 charset;	/* class, indexes ranges[] */
	escape_t	*escape;	/* escape that selects this set */
	ssize_t		 length;	/* bytes per character */
} graphic_t;

typedef TAILQ_HEAD(escape_list, escape_t) escape_list;

/* One shift state, selected by ESCAPE_CHAR followed by `ch'. */
struct escape_t {
	TAILQ_ENTRY(escape_t)	 entry;	/* linkage within `set' */
	escape_list		*set;	/* list this escape belongs to */
	graphic_t		*left;	/* GL set, or NULL */
	graphic_t		*right;	/* GR set, or NULL */
	int			 ch;	/* byte following ESCAPE_CHAR */
};

/* Field accessors for an escape_t pointer. */
#define GL(escape)	((escape)->left)
#define GR(escape)	((escape)->right)
#define SET(escape)	((escape)->set)
#define ESC(escape)	((escape)->ch)
/* First escape of the list that `escape' belongs to. */
#define INIT(escape)	(TAILQ_FIRST(SET(escape)))
106 
107 static __inline escape_t *
108 find_escape(escape_list *set, int ch)
109 {
110 	escape_t *escape;
111 
112 	TAILQ_FOREACH(escape, set, entry) {
113 		if (ESC(escape) == ch)
114 			break;
115 	}
116 
117 	return (escape);
118 }
119 
120 typedef struct {
121 	escape_list	 e0;
122 	escape_list	 e1;
123 	graphic_t	*ascii;
124 	graphic_t	*gb2312;
125 } _HZEncodingInfo;
126 
127 #define E0SET(ei)	(&(ei)->e0)
128 #define E1SET(ei)	(&(ei)->e1)
129 #define INIT0(ei)	(TAILQ_FIRST(E0SET(ei)))
130 #define INIT1(ei)	(TAILQ_FIRST(E1SET(ei)))
131 
132 typedef struct {
133 	escape_t	*inuse;
134 	int		 chlen;
135 	char		 ch[ROWCOL_MAX];
136 } _HZState;
137 
138 #define _CEI_TO_EI(_cei_)		(&(_cei_)->ei)
139 #define _CEI_TO_STATE(_cei_, _func_)	(_cei_)->states.s_##_func_
140 
141 #define _FUNCNAME(m)			_citrus_HZ_##m
142 #define _ENCODING_INFO			_HZEncodingInfo
143 #define _ENCODING_STATE			_HZState
144 #define _ENCODING_MB_CUR_MAX(_ei_)	MB_LEN_MAX
145 #define _ENCODING_IS_STATE_DEPENDENT		1
146 #define _STATE_NEEDS_EXPLICIT_INIT(_ps_)	((_ps_)->inuse == NULL)
147 
148 static __inline void
149 _citrus_HZ_init_state(_HZEncodingInfo * __restrict ei,
150     _HZState * __restrict psenc)
151 {
152 
153 	psenc->chlen = 0;
154 	psenc->inuse = INIT0(ei);
155 }
156 
#if 0
/*
 * Disabled helpers: byte-wise copy of the whole state object to and
 * from an opaque buffer.
 */
static __inline void
/*ARGSUSED*/
_citrus_HZ_pack_state(_HZEncodingInfo * __restrict ei __unused,
    void *__restrict pspriv, const _HZState * __restrict psenc)
{

	memcpy(pspriv, (const void *)psenc, sizeof(_HZState));
}

static __inline void
/*ARGSUSED*/
_citrus_HZ_unpack_state(_HZEncodingInfo * __restrict ei __unused,
    _HZState * __restrict psenc, const void * __restrict pspriv)
{

	memcpy((void *)psenc, pspriv, sizeof(_HZState));
}
#endif
176 
177 static int
178 _citrus_HZ_mbrtowc_priv(_HZEncodingInfo * __restrict ei,
179     wchar_t * __restrict pwc, char ** __restrict s, size_t n,
180     _HZState * __restrict psenc, size_t * __restrict nresult)
181 {
182 	escape_t *candidate, *init;
183 	graphic_t *graphic;
184 	const range_t *range;
185 	char *s0;
186 	wchar_t wc;
187 	int bit, ch, head, len, tail;
188 
189 	if (*s == NULL) {
190 		_citrus_HZ_init_state(ei, psenc);
191 		*nresult = 1;
192 		return (0);
193 	}
194 	s0 = *s;
195 	if (psenc->chlen < 0 || psenc->inuse == NULL)
196 		return (EINVAL);
197 
198 	wc = (wchar_t)0;
199 	bit = head = tail = 0;
200 	graphic = NULL;
201 	for (len = 0; len <= MB_LEN_MAX;) {
202 		if (psenc->chlen == tail) {
203 			if (n-- < 1) {
204 				*s = s0;
205 				*nresult = (size_t)-2;
206 				return (0);
207 			}
208 			psenc->ch[psenc->chlen++] = *s0++;
209 			++len;
210 		}
211 		ch = (unsigned char)psenc->ch[tail++];
212 		if (tail == 1) {
213 			if ((ch & ~0x80) <= 0x1F) {
214 				if (psenc->inuse != INIT0(ei))
215 					break;
216 				wc = (wchar_t)ch;
217 				goto done;
218 			}
219 			if (ch & 0x80) {
220 				graphic = GR(psenc->inuse);
221 				bit = 0x80;
222 				ch &= ~0x80;
223 			} else {
224 				graphic = GL(psenc->inuse);
225 				if (ch == ESCAPE_CHAR)
226 					continue;
227 				bit = 0x0;
228 			}
229 			if (graphic == NULL)
230 				break;
231 		} else if (tail == 2 && psenc->ch[0] == ESCAPE_CHAR) {
232 			if (tail < psenc->chlen)
233 				return (EINVAL);
234 			if (ch == ESCAPE_CHAR) {
235 				++head;
236 			} else if (ch == '\n') {
237 				if (psenc->inuse != INIT0(ei))
238 					break;
239 				tail = psenc->chlen = 0;
240 				continue;
241 			} else {
242 				candidate = NULL;
243 				init = INIT0(ei);
244 				if (psenc->inuse == init) {
245 					init = INIT1(ei);
246 				} else if (INIT(psenc->inuse) == init) {
247 					if (ESC(init) != ch)
248 						break;
249 					candidate = init;
250 				}
251 				if (candidate == NULL) {
252 					candidate = find_escape(
253 					    SET(psenc->inuse), ch);
254 					if (candidate == NULL) {
255 						if (init == NULL ||
256 						    ESC(init) != ch)
257 							break;
258 						candidate = init;
259 					}
260 				}
261 				psenc->inuse = candidate;
262 				tail = psenc->chlen = 0;
263 				continue;
264 			}
265 		} else if (ch & 0x80) {
266 			if (graphic != GR(psenc->inuse))
267 				break;
268 			ch &= ~0x80;
269 		} else {
270 			if (graphic != GL(psenc->inuse))
271 				break;
272 		}
273 		range = &ranges[(size_t)graphic->charset];
274 		if (range->start > ch || range->end < ch)
275 			break;
276 		wc <<= 8;
277 		wc |= ch;
278 		if (graphic->length == (tail - head)) {
279 			if (graphic->charset > GB2312)
280 				bit |= ESC(psenc->inuse) << 24;
281 			wc |= bit;
282 			goto done;
283 		}
284 	}
285 	*nresult = (size_t)-1;
286 	return (EILSEQ);
287 done:
288 	if (tail < psenc->chlen)
289 		return (EINVAL);
290 	*s = s0;
291 	if (pwc != NULL)
292 		*pwc = wc;
293 	psenc->chlen = 0;
294 	*nresult = (wc == 0) ? 0 : len;
295 
296 	return (0);
297 }
298 
299 static int
300 _citrus_HZ_wcrtomb_priv(_HZEncodingInfo * __restrict ei,
301     char * __restrict s, size_t n, wchar_t wc,
302     _HZState * __restrict psenc, size_t * __restrict nresult)
303 {
304 	escape_t *candidate, *init;
305 	graphic_t *graphic;
306 	const range_t *range;
307 	size_t len;
308 	int bit, ch;
309 
310 	if (psenc->chlen != 0 || psenc->inuse == NULL)
311 		return (EINVAL);
312 	if (wc & 0x80) {
313 		bit = 0x80;
314 		wc &= ~0x80;
315 	} else {
316 		bit = 0x0;
317 	}
318 	if ((uint32_t)wc <= 0x1F) {
319 		candidate = INIT0(ei);
320 		graphic = (bit == 0) ? candidate->left : candidate->right;
321 		if (graphic == NULL)
322 			goto ilseq;
323 		range = &ranges[(size_t)CTRL];
324 		len = 1;
325 	} else if ((uint32_t)wc <= 0x7F) {
326 		graphic = ei->ascii;
327 		if (graphic == NULL)
328 			goto ilseq;
329 		candidate = graphic->escape;
330 		range = &ranges[(size_t)graphic->charset];
331 		len = graphic->length;
332 	} else if ((uint32_t)wc <= 0x7F7F) {
333 		graphic = ei->gb2312;
334 		if (graphic == NULL)
335 			goto ilseq;
336 		candidate = graphic->escape;
337 		range = &ranges[(size_t)graphic->charset];
338 		len = graphic->length;
339 	} else {
340 		ch = (wc >> 24) & 0xFF;
341 		candidate = find_escape(E0SET(ei), ch);
342 		if (candidate == NULL) {
343 			candidate = find_escape(E1SET(ei), ch);
344 			if (candidate == NULL)
345 				goto ilseq;
346 		}
347 		wc &= ~0xFF000000;
348 		graphic = (bit == 0) ? candidate->left : candidate->right;
349 		if (graphic == NULL)
350 			goto ilseq;
351 		range = &ranges[(size_t)graphic->charset];
352 		len = graphic->length;
353 	}
354 	if (psenc->inuse != candidate) {
355 		init = INIT0(ei);
356 		if (SET(psenc->inuse) == SET(candidate)) {
357 			if (INIT(psenc->inuse) != init ||
358 			    psenc->inuse == init || candidate == init)
359 				init = NULL;
360 		} else if (candidate == (init = INIT(candidate))) {
361 			init = NULL;
362 		}
363 		if (init != NULL) {
364 			if (n < 2)
365 				return (E2BIG);
366 			n -= 2;
367 			psenc->ch[psenc->chlen++] = ESCAPE_CHAR;
368 			psenc->ch[psenc->chlen++] = ESC(init);
369 		}
370 		if (n < 2)
371 			return (E2BIG);
372 		n -= 2;
373 		psenc->ch[psenc->chlen++] = ESCAPE_CHAR;
374 		psenc->ch[psenc->chlen++] = ESC(candidate);
375 		psenc->inuse = candidate;
376 	}
377 	if (n < len)
378 		return (E2BIG);
379 	while (len-- > 0) {
380 		ch = (wc >> (len * 8)) & 0xFF;
381 		if (range->start > ch || range->end < ch)
382 			goto ilseq;
383 		psenc->ch[psenc->chlen++] = ch | bit;
384 	}
385 	memcpy(s, psenc->ch, psenc->chlen);
386 	*nresult = psenc->chlen;
387 	psenc->chlen = 0;
388 
389 	return (0);
390 
391 ilseq:
392 	*nresult = (size_t)-1;
393 	return (EILSEQ);
394 }
395 
396 static __inline int
397 _citrus_HZ_put_state_reset(_HZEncodingInfo * __restrict ei,
398     char * __restrict s, size_t n, _HZState * __restrict psenc,
399     size_t * __restrict nresult)
400 {
401 	escape_t *candidate;
402 
403 	if (psenc->chlen != 0 || psenc->inuse == NULL)
404 		return (EINVAL);
405 	candidate = INIT0(ei);
406 	if (psenc->inuse != candidate) {
407 		if (n < 2)
408 			return (E2BIG);
409 		n -= 2;
410 		psenc->ch[psenc->chlen++] = ESCAPE_CHAR;
411 		psenc->ch[psenc->chlen++] = ESC(candidate);
412 	}
413 	if (n < 1)
414 		return (E2BIG);
415 	if (psenc->chlen > 0)
416 		memcpy(s, psenc->ch, psenc->chlen);
417 	*nresult = psenc->chlen;
418 	_citrus_HZ_init_state(ei, psenc);
419 
420 	return (0);
421 }
422 
423 static __inline int
424 _citrus_HZ_stdenc_get_state_desc_generic(_HZEncodingInfo * __restrict ei,
425     _HZState * __restrict psenc, int * __restrict rstate)
426 {
427 
428 	if (psenc->chlen < 0 || psenc->inuse == NULL)
429 		return (EINVAL);
430 	*rstate = (psenc->chlen == 0)
431 	    ? ((psenc->inuse == INIT0(ei))
432 	        ? _STDENC_SDGEN_INITIAL
433 	        : _STDENC_SDGEN_STABLE)
434 	    : ((psenc->ch[0] == ESCAPE_CHAR)
435 	        ? _STDENC_SDGEN_INCOMPLETE_SHIFT
436 	        : _STDENC_SDGEN_INCOMPLETE_CHAR);
437 
438 	return (0);
439 }
440 
441 static __inline int
442 /*ARGSUSED*/
443 _citrus_HZ_stdenc_wctocs(_HZEncodingInfo * __restrict ei __unused,
444     _csid_t * __restrict csid, _index_t * __restrict idx, wchar_t wc)
445 {
446 	int bit;
447 
448 	if (wc & 0x80) {
449 		bit = 0x80;
450 		wc &= ~0x80;
451 	} else
452 		bit = 0x0;
453 	if ((uint32_t)wc <= 0x7F) {
454 		*csid = (_csid_t)bit;
455 		*idx = (_index_t)wc;
456 	} else if ((uint32_t)wc <= 0x7F7F) {
457 		*csid = (_csid_t)(bit | 0x8000);
458 		*idx = (_index_t)wc;
459 	} else {
460 		*csid = (_index_t)(wc & ~0x00FFFF7F);
461 		*idx = (_csid_t)(wc & 0x00FFFF7F);
462 	}
463 
464 	return (0);
465 }
466 
467 static __inline int
468 /*ARGSUSED*/
469 _citrus_HZ_stdenc_cstowc(_HZEncodingInfo * __restrict ei __unused,
470     wchar_t * __restrict wc, _csid_t csid, _index_t idx)
471 {
472 
473 	*wc = (wchar_t)idx;
474 	switch (csid) {
475 	case 0x80:
476 	case 0x8080:
477 		*wc |= (wchar_t)0x80;
478 		/*FALLTHROUGH*/
479 	case 0x0:
480 	case 0x8000:
481 		break;
482 	default:
483 		*wc |= (wchar_t)csid;
484 	}
485 
486 	return (0);
487 }
488 
489 static void
490 _citrus_HZ_encoding_module_uninit(_HZEncodingInfo *ei)
491 {
492 	escape_t *escape;
493 
494 	while ((escape = TAILQ_FIRST(E0SET(ei))) != NULL) {
495 		TAILQ_REMOVE(E0SET(ei), escape, entry);
496 		free(GL(escape));
497 		free(GR(escape));
498 		free(escape);
499 	}
500 	while ((escape = TAILQ_FIRST(E1SET(ei))) != NULL) {
501 		TAILQ_REMOVE(E1SET(ei), escape, entry);
502 		free(GL(escape));
503 		free(GR(escape));
504 		free(escape);
505 	}
506 }
507 
508 static int
509 _citrus_HZ_parse_char(void *context, const char *name __unused, const char *s)
510 {
511 	escape_t *escape;
512 	void **p;
513 
514 	p = (void **)context;
515 	escape = (escape_t *)p[0];
516 	if (escape->ch != '\0')
517 		return (EINVAL);
518 	escape->ch = *s++;
519 	if (escape->ch == ESCAPE_CHAR || *s != '\0')
520 		return (EINVAL);
521 
522 	return (0);
523 }
524 
525 static int
526 _citrus_HZ_parse_graphic(void *context, const char *name, const char *s)
527 {
528 	_HZEncodingInfo *ei;
529 	escape_t *escape;
530 	graphic_t *graphic;
531 	void **p;
532 
533 	p = (void **)context;
534 	escape = (escape_t *)p[0];
535 	ei = (_HZEncodingInfo *)p[1];
536 	graphic = calloc(1, sizeof(*graphic));
537 	if (graphic == NULL)
538 		return (ENOMEM);
539 	if (strcmp("GL", name) == 0) {
540 		if (GL(escape) != NULL)
541 			goto release;
542 		GL(escape) = graphic;
543 	} else if (strcmp("GR", name) == 0) {
544 		if (GR(escape) != NULL)
545 			goto release;
546 		GR(escape) = graphic;
547 	} else {
548 release:
549 		free(graphic);
550 		return (EINVAL);
551 	}
552 	graphic->escape = escape;
553 	if (_bcs_strncasecmp("ASCII", s, 5) == 0) {
554 		if (s[5] != '\0')
555 			return (EINVAL);
556 		graphic->charset = ASCII;
557 		graphic->length = 1;
558 		ei->ascii = graphic;
559 		return (0);
560 	} else if (_bcs_strncasecmp("GB2312", s, 6) == 0) {
561 		if (s[6] != '\0')
562 			return (EINVAL);
563 		graphic->charset = GB2312;
564 		graphic->length = 2;
565 		ei->gb2312 = graphic;
566 		return (0);
567 	} else if (strncmp("94*", s, 3) == 0)
568 		graphic->charset = CS94;
569 	else if (strncmp("96*", s, 3) == 0)
570 		graphic->charset = CS96;
571 	else
572 		return (EINVAL);
573 	s += 3;
574 	switch(*s) {
575 	case '1': case '2': case '3':
576 		graphic->length = (size_t)(*s - '0');
577 		if (*++s == '\0')
578 			break;
579 	/*FALLTHROUGH*/
580 	default:
581 		return (EINVAL);
582 	}
583 	return (0);
584 }
585 
586 static const _citrus_prop_hint_t escape_hints[] = {
587 _CITRUS_PROP_HINT_STR("CH", &_citrus_HZ_parse_char),
588 _CITRUS_PROP_HINT_STR("GL", &_citrus_HZ_parse_graphic),
589 _CITRUS_PROP_HINT_STR("GR", &_citrus_HZ_parse_graphic),
590 _CITRUS_PROP_HINT_END
591 };
592 
593 static int
594 _citrus_HZ_parse_escape(void *context, const char *name, const char *s)
595 {
596 	_HZEncodingInfo *ei;
597 	escape_t *escape;
598 	void *p[2];
599 
600 	ei = (_HZEncodingInfo *)context;
601 	escape = calloc(1, sizeof(*escape));
602 	if (escape == NULL)
603 		return (EINVAL);
604 	if (strcmp("0", name) == 0) {
605 		escape->set = E0SET(ei);
606 		TAILQ_INSERT_TAIL(E0SET(ei), escape, entry);
607 	} else if (strcmp("1", name) == 0) {
608 		escape->set = E1SET(ei);
609 		TAILQ_INSERT_TAIL(E1SET(ei), escape, entry);
610 	} else {
611 		free(escape);
612 		return (EINVAL);
613 	}
614 	p[0] = (void *)escape;
615 	p[1] = (void *)ei;
616 	return (_citrus_prop_parse_variable(
617 	    escape_hints, (void *)&p[0], s, strlen(s)));
618 }
619 
620 static const _citrus_prop_hint_t root_hints[] = {
621 _CITRUS_PROP_HINT_STR("0", &_citrus_HZ_parse_escape),
622 _CITRUS_PROP_HINT_STR("1", &_citrus_HZ_parse_escape),
623 _CITRUS_PROP_HINT_END
624 };
625 
626 static int
627 _citrus_HZ_encoding_module_init(_HZEncodingInfo * __restrict ei,
628     const void * __restrict var, size_t lenvar)
629 {
630 	int errnum;
631 
632 	memset(ei, 0, sizeof(*ei));
633 	TAILQ_INIT(E0SET(ei));
634 	TAILQ_INIT(E1SET(ei));
635 	errnum = _citrus_prop_parse_variable(
636 	    root_hints, (void *)ei, var, lenvar);
637 	if (errnum != 0)
638 		_citrus_HZ_encoding_module_uninit(ei);
639 	return (errnum);
640 }
641 
642 /* ----------------------------------------------------------------------
643  * public interface for stdenc
644  */
645 
646 _CITRUS_STDENC_DECLS(HZ);
647 _CITRUS_STDENC_DEF_OPS(HZ);
648 
649 #include "citrus_stdenc_template.h"
650