xref: /freebsd/lib/libiconv_modules/HZ/citrus_hz.c (revision 884a2a699669ec61e2366e3e358342dbc94be24a)
1 /* $FreeBSD$ */
2 /* $NetBSD: citrus_hz.c,v 1.2 2008/06/14 16:01:07 tnozaki Exp $ */
3 
4 /*-
5  * Copyright (c)2004, 2006 Citrus Project,
6  * All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  *
29  */
30 
31 #include <sys/cdefs.h>
32 #include <sys/queue.h>
33 #include <sys/types.h>
34 
35 #include <assert.h>
36 #include <errno.h>
37 #include <limits.h>
38 #include <stddef.h>
39 #include <stdint.h>
40 #include <stdlib.h>
41 #include <string.h>
42 #include <wchar.h>
43 
44 #include "citrus_namespace.h"
45 #include "citrus_types.h"
46 #include "citrus_bcs.h"
47 #include "citrus_module.h"
48 #include "citrus_stdenc.h"
49 
50 #include "citrus_hz.h"
51 #include "citrus_prop.h"
52 
53 /*
54  * wchar_t mapping:
55  *
56  * CTRL/ASCII	00000000 00000000 00000000 gxxxxxxx
57  * GB2312	00000000 00000000 0xxxxxxx gxxxxxxx
58  * 94/96*n (~M)	0mmmmmmm 0xxxxxxx 0xxxxxxx gxxxxxxx
59  */
60 
61 #define ESCAPE_CHAR	'~'
62 
/*
 * Character-set classes handled by this module.  The numeric value is used
 * directly as an index into the ranges[] table, so the order of the
 * enumerators must stay in step with that table.
 */
63 typedef enum {
64 	CTRL = 0, ASCII = 1, GB2312 = 2, CS94 = 3, CS96 = 4
65 } charset_t;
66 
/*
 * Inclusive range of byte values (after stripping the 0x80 bit) accepted
 * for one byte of a character in a given charset class; width is the
 * number of values in the range.
 */
typedef struct {
	int	 start;
	int	 end;
	int	 width;
} range_t;

/*
 * Valid byte ranges, indexed by charset_t.
 *
 * Designated initializers are used so that each bound lands in the member
 * it is named after, independent of the member order of range_t.  (The
 * macro parameters are deliberately not called start/end, otherwise the
 * preprocessor would substitute them inside the designators too.)
 */
static const range_t ranges[] = {
#define RANGE(lo, hi) \
	{ .start = (lo), .end = (hi), .width = ((hi) - (lo)) + 1 }
/* CTRL   */ RANGE(0x00, 0x1F),
/* ASCII  */ RANGE(0x20, 0x7F),
/* GB2312 */ RANGE(0x21, 0x7E),
/* CS94   */ RANGE(0x21, 0x7E),
/* CS96   */ RANGE(0x20, 0x7F),
#undef RANGE
};
82 
/*
 * One graphic set attached to an escape (as its GL or GR half): the
 * charset class it holds, a back pointer to the owning escape, and the
 * number of bytes that make up one character in it.
 */
83 typedef struct escape_t escape_t;
84 typedef struct {
85 	charset_t	 charset;
86 	escape_t	*escape;
87 	ssize_t		 length;
/* Capacity of the byte buffer kept in the conversion state (_HZState.ch). */
88 #define ROWCOL_MAX	3
89 } graphic_t;
90 
/*
 * A shift state reachable through the two-byte sequence ESCAPE_CHAR + ch.
 * Escapes are kept on a tail queue; "set" points back at the queue the
 * escape is linked on, and left/right are the graphic sets used for bytes
 * without and with the 0x80 bit respectively (either may be NULL).
 */
91 typedef TAILQ_HEAD(escape_list, escape_t) escape_list;
92 struct escape_t {
93 	TAILQ_ENTRY(escape_t)	 entry;
94 	escape_list		*set;
95 	graphic_t		*left;
96 	graphic_t		*right;
97 	int			 ch;
98 };
99 
/*
 * Field accessors for an escape.  INIT() yields the first escape of the
 * list the given escape belongs to.
 */
100 #define GL(escape)	((escape)->left)
101 #define GR(escape)	((escape)->right)
102 #define SET(escape)	((escape)->set)
103 #define ESC(escape)	((escape)->ch)
104 #define INIT(escape)	(TAILQ_FIRST(SET(escape)))
105 
106 static __inline escape_t *
107 find_escape(escape_list *set, int ch)
108 {
109 	escape_t *escape;
110 
111 	TAILQ_FOREACH(escape, set, entry) {
112 		if (ESC(escape) == ch)
113 			break;
114 	}
115 
116 	return (escape);
117 }
118 
/*
 * Per-encoding configuration built by the module init routine: the two
 * escape lists ("0" and "1" in the variable string) and shortcuts to the
 * graphic sets declared as ASCII and GB2312 (NULL when not configured).
 */
119 typedef struct {
120 	escape_list	 e0;
121 	escape_list	 e1;
122 	graphic_t	*ascii;
123 	graphic_t	*gb2312;
124 } _HZEncodingInfo;
125 
/*
 * E0SET/E1SET give the address of each escape list; INIT0/INIT1 give the
 * first escape on the respective list (NULL if the list is empty).
 * INIT0 is the escape a freshly initialised state starts in.
 */
126 #define E0SET(ei)	(&(ei)->e0)
127 #define E1SET(ei)	(&(ei)->e1)
128 #define INIT0(ei)	(TAILQ_FIRST(E0SET(ei)))
129 #define INIT1(ei)	(TAILQ_FIRST(E1SET(ei)))
130 
/*
 * Conversion (shift) state: the escape currently in effect, plus the bytes
 * of a partially read sequence (chlen of them, stored in ch).
 */
131 typedef struct {
132 	escape_t	*inuse;
133 	int		 chlen;
134 	char		 ch[ROWCOL_MAX];
135 } _HZState;
136 
/*
 * Glue macros; presumably consumed by citrus_stdenc_template.h, which is
 * included at the end of this file -- confirm against that header.
 */
137 #define _CEI_TO_EI(_cei_)		(&(_cei_)->ei)
138 #define _CEI_TO_STATE(_cei_, _func_)	(_cei_)->states.s_##_func_
139 
140 #define _FUNCNAME(m)			_citrus_HZ_##m
141 #define _ENCODING_INFO			_HZEncodingInfo
142 #define _ENCODING_STATE			_HZState
143 #define _ENCODING_MB_CUR_MAX(_ei_)	MB_LEN_MAX
144 #define _ENCODING_IS_STATE_DEPENDENT		1
/* A state whose inuse pointer is still NULL has never been initialised. */
145 #define _STATE_NEEDS_EXPLICIT_INIT(_ps_)	((_ps_)->inuse == NULL)
146 
147 static __inline void
148 _citrus_HZ_init_state(_HZEncodingInfo * __restrict ei,
149     _HZState * __restrict psenc)
150 {
151 
152 	psenc->chlen = 0;
153 	psenc->inuse = INIT0(ei);
154 }
155 
156 static __inline void
157 /*ARGSUSED*/
158 _citrus_HZ_pack_state(_HZEncodingInfo * __restrict ei __unused,
159     void *__restrict pspriv, const _HZState * __restrict psenc)
160 {
161 
162 	memcpy(pspriv, (const void *)psenc, sizeof(*psenc));
163 }
164 
165 static __inline void
166 /*ARGSUSED*/
167 _citrus_HZ_unpack_state(_HZEncodingInfo * __restrict ei __unused,
168     _HZState * __restrict psenc, const void * __restrict pspriv)
169 {
170 
171 	memcpy((void *)psenc, pspriv, sizeof(*psenc));
172 }
173 
/*
 * Decode at most one wide character from the bytes at *s (n bytes
 * available), continuing from whatever is already buffered in psenc.
 * Escape sequences met on the way update psenc->inuse and are consumed
 * without producing a character.
 *
 * - *s == NULL: psenc is reset to the initial state, *nresult is set to 1
 *   and 0 is returned.
 * - Input exhausted before a character is complete: the consumed bytes
 *   stay buffered in psenc, *s is advanced past them, *nresult is set to
 *   (size_t)-2 and 0 is returned.
 * - Invalid sequence: *nresult is set to (size_t)-1, EILSEQ is returned.
 * - Inconsistent state (negative chlen, NULL inuse, or buffered bytes left
 *   over where none are expected): EINVAL is returned.
 * - Success: the character is stored in *pwc (when pwc is non-NULL), *s is
 *   advanced, and *nresult is the number of bytes taken from *s during
 *   this call, or 0 when the decoded character is 0.
 */
174 static int
175 _citrus_HZ_mbrtowc_priv(_HZEncodingInfo * __restrict ei,
176     wchar_t * __restrict pwc, char ** __restrict s, size_t n,
177     _HZState * __restrict psenc, size_t * __restrict nresult)
178 {
179 	escape_t *candidate, *init;
180 	graphic_t *graphic;
181 	const range_t *range;
182 	char *s0;
183 	wchar_t wc;
184 	int bit, ch, head, len, tail;
185 
186 	if (*s == NULL) {
187 		_citrus_HZ_init_state(ei, psenc);
188 		*nresult = 1;
189 		return (0);
190 	}
191 	s0 = *s;
192 	if (psenc->chlen < 0 || psenc->inuse == NULL)
193 		return (EINVAL);
194 
195 	wc = (wchar_t)0;
196 	bit = head = tail = 0;
197 	graphic = NULL;
	/*
	 * tail: index of the next buffered byte to examine.
	 * head: number of leading bytes that do not count towards the
	 *       character length (set for the "~~" sequence).
	 * len:  bytes taken from the input during this call.
	 */
198 	for (len = 0; len <= MB_LEN_MAX;) {
		/* Everything buffered has been examined: fetch one more byte. */
199 		if (psenc->chlen == tail) {
200 			if (n-- < 1) {
201 				*s = s0;
202 				*nresult = (size_t)-2;
203 				return (0);
204 			}
205 			psenc->ch[psenc->chlen++] = *s0++;
206 			++len;
207 		}
208 		ch = (unsigned char)psenc->ch[tail++];
209 		if (tail == 1) {
			/*
			 * First byte.  Values 0x00-0x1F and 0x80-0x9F are
			 * passed through unchanged, but only while the
			 * initial escape is in effect.
			 */
210 			if ((ch & ~0x80) <= 0x1F) {
211 				if (psenc->inuse != INIT0(ei))
212 					break;
213 				wc = (wchar_t)ch;
214 				goto done;
215 			}
			/* The 0x80 bit selects the GR or GL half of the escape. */
216 			if (ch & 0x80) {
217 				graphic = GR(psenc->inuse);
218 				bit = 0x80;
219 				ch &= ~0x80;
220 			} else {
221 				graphic = GL(psenc->inuse);
				/*
				 * Start of an escape sequence; the second
				 * byte is handled on the next iteration.
				 * Note that this skips the NULL check on
				 * graphic below.
				 */
222 				if (ch == ESCAPE_CHAR)
223 					continue;
224 				bit = 0x0;
225 			}
226 			if (graphic == NULL)
227 				break;
228 		} else if (tail == 2 && psenc->ch[0] == ESCAPE_CHAR) {
			/* Second byte of an escape sequence. */
229 			if (tail < psenc->chlen)
230 				return (EINVAL);
231 			if (ch == ESCAPE_CHAR) {
				/*
				 * "~~": exclude the first '~' from the length
				 * count and decode the second one as an
				 * ordinary GL byte below.
				 * NOTE(review): graphic may still be NULL here
				 * when the current escape has no GL set --
				 * confirm whether such a configuration can
				 * reach this point.
				 */
232 				++head;
233 			} else if (ch == '\n') {
				/*
				 * "~\n" is swallowed without output, and only
				 * accepted in the initial escape.
				 */
234 				if (psenc->inuse != INIT0(ei))
235 					break;
236 				tail = psenc->chlen = 0;
237 				continue;
238 			} else {
				/*
				 * Any other byte names the escape to switch
				 * to.  From the initial escape the fallback is
				 * the first "1" escape; from another escape on
				 * the list headed by the initial escape only a
				 * return to the initial escape is accepted;
				 * otherwise the current list is searched first
				 * and the initial escape is the fallback.
				 */
239 				candidate = NULL;
240 				init = INIT0(ei);
241 				if (psenc->inuse == init) {
242 					init = INIT1(ei);
243 				} else if (INIT(psenc->inuse) == init) {
244 					if (ESC(init) != ch)
245 						break;
246 					candidate = init;
247 				}
248 				if (candidate == NULL) {
249 					candidate = find_escape(
250 					    SET(psenc->inuse), ch);
251 					if (candidate == NULL) {
252 						if (init == NULL ||
253 						    ESC(init) != ch)
254 							break;
255 						candidate = init;
256 					}
257 				}
				/* Switch state and drop the consumed escape. */
258 				psenc->inuse = candidate;
259 				tail = psenc->chlen = 0;
260 				continue;
261 			}
		/* Later bytes must come from the same half as the first. */
262 		} else if (ch & 0x80) {
263 			if (graphic != GR(psenc->inuse))
264 				break;
265 			ch &= ~0x80;
266 		} else {
267 			if (graphic != GL(psenc->inuse))
268 				break;
269 		}
		/* Validate the byte against its charset and accumulate it. */
270 		range = &ranges[(size_t)graphic->charset];
271 		if (range->start > ch || range->end < ch)
272 			break;
273 		wc <<= 8;
274 		wc |= ch;
		/* Character complete once "length" counted bytes were read. */
275 		if (graphic->length == (tail - head)) {
			/*
			 * For the 94/96 sets the escape designator is kept
			 * in bits 24-31 of the wide character.
			 */
276 			if (graphic->charset > GB2312)
277 				bit |= ESC(psenc->inuse) << 24;
278 			wc |= bit;
279 			goto done;
280 		}
281 	}
282 	*nresult = (size_t)-1;
283 	return (EILSEQ);
284 done:
	/* Buffered bytes beyond the decoded character are not expected. */
285 	if (tail < psenc->chlen)
286 		return (EINVAL);
287 	*s = s0;
288 	if (pwc != NULL)
289 		*pwc = wc;
290 	psenc->chlen = 0;
291 	*nresult = (wc == 0) ? 0 : len;
292 
293 	return (0);
294 }
295 
296 static int
297 _citrus_HZ_wcrtomb_priv(_HZEncodingInfo * __restrict ei,
298     char * __restrict s, size_t n, wchar_t wc,
299     _HZState * __restrict psenc, size_t * __restrict nresult)
300 {
301 	escape_t *candidate, *init;
302 	graphic_t *graphic;
303 	const range_t *range;
304 	size_t len;
305 	int bit, ch;
306 
307 	if (psenc->chlen != 0 || psenc->inuse == NULL)
308 		return (EINVAL);
309 	if (wc & 0x80) {
310 		bit = 0x80;
311 		wc &= ~0x80;
312 	} else {
313 		bit = 0x0;
314 	}
315 	if ((uint32_t)wc <= 0x1F) {
316 		candidate = INIT0(ei);
317 		graphic = (bit == 0) ? candidate->left : candidate->right;
318 		if (graphic == NULL)
319 			goto ilseq;
320 		range = &ranges[(size_t)CTRL];
321 		len = 1;
322 	} else if ((uint32_t)wc <= 0x7F) {
323 		graphic = ei->ascii;
324 		if (graphic == NULL)
325 			goto ilseq;
326 		candidate = graphic->escape;
327 		range = &ranges[(size_t)graphic->charset];
328 		len = graphic->length;
329 	} else if ((uint32_t)wc <= 0x7F7F) {
330 		graphic = ei->gb2312;
331 		if (graphic == NULL)
332 			goto ilseq;
333 		candidate = graphic->escape;
334 		range = &ranges[(size_t)graphic->charset];
335 		len = graphic->length;
336 	} else {
337 		ch = (wc >> 24) & 0xFF;
338 		candidate = find_escape(E0SET(ei), ch);
339 		if (candidate == NULL) {
340 			candidate = find_escape(E1SET(ei), ch);
341 			if (candidate == NULL)
342 				goto ilseq;
343 		}
344 		wc &= ~0xFF000000;
345 		graphic = (bit == 0) ? candidate->left : candidate->right;
346 		if (graphic == NULL)
347 			goto ilseq;
348 		range = &ranges[(size_t)graphic->charset];
349 		len = graphic->length;
350 	}
351 	if (psenc->inuse != candidate) {
352 		init = INIT0(ei);
353 		if (SET(psenc->inuse) == SET(candidate)) {
354 			if (INIT(psenc->inuse) != init ||
355 			    psenc->inuse == init || candidate == init)
356 				init = NULL;
357 		} else if (candidate == (init = INIT(candidate))) {
358 			init = NULL;
359 		}
360 		if (init != NULL) {
361 			if (n < 2)
362 				return (E2BIG);
363 			n -= 2;
364 			psenc->ch[psenc->chlen++] = ESCAPE_CHAR;
365 			psenc->ch[psenc->chlen++] = ESC(init);
366 		}
367 		if (n < 2)
368 			return (E2BIG);
369 		n -= 2;
370 		psenc->ch[psenc->chlen++] = ESCAPE_CHAR;
371 		psenc->ch[psenc->chlen++] = ESC(candidate);
372 		psenc->inuse = candidate;
373 	}
374 	if (n < len)
375 		return (E2BIG);
376 	while (len-- > 0) {
377 		ch = (wc >> (len * 8)) & 0xFF;
378 		if (range->start > ch || range->end < ch)
379 			goto ilseq;
380 		psenc->ch[psenc->chlen++] = ch | bit;
381 	}
382 	memcpy(s, psenc->ch, psenc->chlen);
383 	*nresult = psenc->chlen;
384 	psenc->chlen = 0;
385 
386 	return (0);
387 
388 ilseq:
389 	*nresult = (size_t)-1;
390 	return (EILSEQ);
391 }
392 
393 static __inline int
394 _citrus_HZ_put_state_reset(_HZEncodingInfo * __restrict ei,
395     char * __restrict s, size_t n, _HZState * __restrict psenc,
396     size_t * __restrict nresult)
397 {
398 	escape_t *candidate;
399 
400 	if (psenc->chlen != 0 || psenc->inuse == NULL)
401 		return (EINVAL);
402 	candidate = INIT0(ei);
403 	if (psenc->inuse != candidate) {
404 		if (n < 2)
405 			return (E2BIG);
406 		n -= 2;
407 		psenc->ch[psenc->chlen++] = ESCAPE_CHAR;
408 		psenc->ch[psenc->chlen++] = ESC(candidate);
409 	}
410 	if (n < 1)
411 		return (E2BIG);
412 	if (psenc->chlen > 0)
413 		memcpy(s, psenc->ch, psenc->chlen);
414 	*nresult = psenc->chlen;
415 	_citrus_HZ_init_state(ei, psenc);
416 
417 	return (0);
418 }
419 
420 static __inline int
421 _citrus_HZ_stdenc_get_state_desc_generic(_HZEncodingInfo * __restrict ei,
422     _HZState * __restrict psenc, int * __restrict rstate)
423 {
424 
425 	if (psenc->chlen < 0 || psenc->inuse == NULL)
426 		return (EINVAL);
427 	*rstate = (psenc->chlen == 0)
428 	    ? ((psenc->inuse == INIT0(ei))
429 	        ? _STDENC_SDGEN_INITIAL
430 	        : _STDENC_SDGEN_STABLE)
431 	    : ((psenc->ch[0] == ESCAPE_CHAR)
432 	        ? _STDENC_SDGEN_INCOMPLETE_SHIFT
433 	        : _STDENC_SDGEN_INCOMPLETE_CHAR);
434 
435 	return (0);
436 }
437 
438 static __inline int
439 /*ARGSUSED*/
440 _citrus_HZ_stdenc_wctocs(_HZEncodingInfo * __restrict ei __unused,
441     _csid_t * __restrict csid, _index_t * __restrict idx, wchar_t wc)
442 {
443 	int bit;
444 
445 	if (wc & 0x80) {
446 		bit = 0x80;
447 		wc &= ~0x80;
448 	} else
449 		bit = 0x0;
450 	if ((uint32_t)wc <= 0x7F) {
451 		*csid = (_csid_t)bit;
452 		*idx = (_index_t)wc;
453 	} else if ((uint32_t)wc <= 0x7F7F) {
454 		*csid = (_csid_t)(bit | 0x8000);
455 		*idx = (_index_t)wc;
456 	} else {
457 		*csid = (_index_t)(wc & ~0x00FFFF7F);
458 		*idx = (_csid_t)(wc & 0x00FFFF7F);
459 	}
460 
461 	return (0);
462 }
463 
464 static __inline int
465 /*ARGSUSED*/
466 _citrus_HZ_stdenc_cstowc(_HZEncodingInfo * __restrict ei __unused,
467     wchar_t * __restrict wc, _csid_t csid, _index_t idx)
468 {
469 
470 	*wc = (wchar_t)idx;
471 	switch (csid) {
472 	case 0x80:
473 	case 0x8080:
474 		*wc |= (wchar_t)0x80;
475 		/*FALLTHROUGH*/
476 	case 0x0:
477 	case 0x8000:
478 		break;
479 	default:
480 		*wc |= (wchar_t)csid;
481 	}
482 
483 	return (0);
484 }
485 
486 static void
487 _citrus_HZ_encoding_module_uninit(_HZEncodingInfo *ei)
488 {
489 	escape_t *escape;
490 
491 	while ((escape = TAILQ_FIRST(E0SET(ei))) != NULL) {
492 		TAILQ_REMOVE(E0SET(ei), escape, entry);
493 		free(GL(escape));
494 		free(GR(escape));
495 		free(escape);
496 	}
497 	while ((escape = TAILQ_FIRST(E1SET(ei))) != NULL) {
498 		TAILQ_REMOVE(E1SET(ei), escape, entry);
499 		free(GL(escape));
500 		free(GR(escape));
501 		free(escape);
502 	}
503 }
504 
505 static int
506 _citrus_HZ_parse_char(void **context, const char *name __unused, const char *s)
507 {
508 	escape_t *escape;
509 	void **p;
510 
511 	p = (void **)*context;
512 	escape = (escape_t *)p[0];
513 	if (escape->ch != '\0')
514 		return (EINVAL);
515 	escape->ch = *s++;
516 	if (escape->ch == ESCAPE_CHAR || *s != '\0')
517 		return (EINVAL);
518 
519 	return (0);
520 }
521 
522 static int
523 _citrus_HZ_parse_graphic(void **context, const char *name, const char *s)
524 {
525 	_HZEncodingInfo *ei;
526 	escape_t *escape;
527 	graphic_t *graphic;
528 	void **p;
529 
530 	p = (void **)*context;
531 	escape = (escape_t *)p[0];
532 	ei = (_HZEncodingInfo *)p[1];
533 	graphic = malloc(sizeof(*graphic));
534 	if (graphic == NULL)
535 		return (ENOMEM);
536 	memset(graphic, 0, sizeof(*graphic));
537 	if (strcmp("GL", name) == 0) {
538 		if (GL(escape) != NULL)
539 			goto release;
540 		GL(escape) = graphic;
541 	} else if (strcmp("GR", name) == 0) {
542 		if (GR(escape) != NULL)
543 			goto release;
544 		GR(escape) = graphic;
545 	} else {
546 release:
547 		free(graphic);
548 		return (EINVAL);
549 	}
550 	graphic->escape = escape;
551 	if (_bcs_strncasecmp("ASCII", s, 5) == 0) {
552 		if (s[5] != '\0')
553 			return (EINVAL);
554 		graphic->charset = ASCII;
555 		graphic->length = 1;
556 		ei->ascii = graphic;
557 		return (0);
558 	} else if (_bcs_strncasecmp("GB2312", s, 6) == 0) {
559 		if (s[6] != '\0')
560 			return (EINVAL);
561 		graphic->charset = GB2312;
562 		graphic->length = 2;
563 		ei->gb2312 = graphic;
564 		return (0);
565 	} else if (strncmp("94*", s, 3) == 0)
566 		graphic->charset = CS94;
567 	else if (strncmp("96*", s, 3) == 0)
568 		graphic->charset = CS96;
569 	else
570 		return (EINVAL);
571 	s += 3;
572 	switch(*s) {
573 	case '1': case '2': case '3':
574 		graphic->length = (size_t)(*s - '0');
575 		if (*++s == '\0')
576 			break;
577 	/*FALLTHROUGH*/
578 	default:
579 		return (EINVAL);
580 	}
581 	return (0);
582 }
583 
/*
 * Properties accepted inside one escape definition: "CH" sets the
 * designator byte, "GL" and "GR" attach the graphic sets.
 */
584 static const _citrus_prop_hint_t escape_hints[] = {
585 _CITRUS_PROP_HINT_STR("CH", &_citrus_HZ_parse_char),
586 _CITRUS_PROP_HINT_STR("GL", &_citrus_HZ_parse_graphic),
587 _CITRUS_PROP_HINT_STR("GR", &_citrus_HZ_parse_graphic),
588 _CITRUS_PROP_HINT_END
589 };
590 
591 static int
592 _citrus_HZ_parse_escape(void **context, const char *name, const char *s)
593 {
594 	_HZEncodingInfo *ei;
595 	escape_t *escape;
596 	void *p[2];
597 
598 	ei = (_HZEncodingInfo *)*context;
599 	escape = malloc(sizeof(*escape));
600 	if (escape == NULL)
601 		return (EINVAL);
602 	memset(escape, 0, sizeof(*escape));
603 	if (strcmp("0", name) == 0) {
604 		escape->set = E0SET(ei);
605 		TAILQ_INSERT_TAIL(E0SET(ei), escape, entry);
606 	} else if (strcmp("1", name) == 0) {
607 		escape->set = E1SET(ei);
608 		TAILQ_INSERT_TAIL(E1SET(ei), escape, entry);
609 	} else {
610 		free(escape);
611 		return (EINVAL);
612 	}
613 	p[0] = (void *)escape;
614 	p[1] = (void *)ei;
615 	return (_citrus_prop_parse_variable(
616 	    escape_hints, (void *)&p[0], s, strlen(s)));
617 }
618 
/*
 * Top-level properties of the module variable string: "0" and "1" each
 * introduce an escape definition for the corresponding escape list.
 */
619 static const _citrus_prop_hint_t root_hints[] = {
620 _CITRUS_PROP_HINT_STR("0", &_citrus_HZ_parse_escape),
621 _CITRUS_PROP_HINT_STR("1", &_citrus_HZ_parse_escape),
622 _CITRUS_PROP_HINT_END
623 };
624 
625 static int
626 _citrus_HZ_encoding_module_init(_HZEncodingInfo * __restrict ei,
627     const void * __restrict var, size_t lenvar)
628 {
629 	int errnum;
630 
631 	memset(ei, 0, sizeof(*ei));
632 	TAILQ_INIT(E0SET(ei));
633 	TAILQ_INIT(E1SET(ei));
634 	errnum = _citrus_prop_parse_variable(
635 	    root_hints, (void *)ei, var, lenvar);
636 	if (errnum != 0)
637 		_citrus_HZ_encoding_module_uninit(ei);
638 	return (errnum);
639 }
640 
641 /* ----------------------------------------------------------------------
642  * public interface for stdenc
643  */
644 
/*
 * NOTE(review): these macros and the template header presumably declare
 * and generate the public stdenc entry points on top of the static
 * _citrus_HZ_* functions and _ENCODING_* macros defined above -- confirm
 * against citrus_stdenc.h and citrus_stdenc_template.h.
 */
645 _CITRUS_STDENC_DECLS(HZ);
646 _CITRUS_STDENC_DEF_OPS(HZ);
647 
648 #include "citrus_stdenc_template.h"
649