xref: /freebsd/lib/libiconv_modules/HZ/citrus_hz.c (revision aa24f48b361effe51163877d84f1b70d32b77e04)
1 /* $FreeBSD$ */
2 /* $NetBSD: citrus_hz.c,v 1.2 2008/06/14 16:01:07 tnozaki Exp $ */
3 
4 /*-
5  * Copyright (c)2004, 2006 Citrus Project,
6  * All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  *
29  */
30 
31 #include <sys/cdefs.h>
32 #include <sys/queue.h>
33 #include <sys/types.h>
34 
35 #include <assert.h>
36 #include <errno.h>
37 #include <limits.h>
38 #include <stddef.h>
39 #include <stdint.h>
40 #include <stdlib.h>
41 #include <string.h>
42 #include <wchar.h>
43 
44 #include "citrus_namespace.h"
45 #include "citrus_types.h"
46 #include "citrus_bcs.h"
47 #include "citrus_module.h"
48 #include "citrus_stdenc.h"
49 
50 #include "citrus_hz.h"
51 #include "citrus_prop.h"
52 
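/*
 * wchar_t mapping:
 *
 * CTRL/ASCII	00000000 00000000 00000000 gxxxxxxx
 * GB2312	00000000 00000000 0xxxxxxx gxxxxxxx
 * 94/96*n (~M)	0mmmmmmm 0xxxxxxx 0xxxxxxx gxxxxxxx
 *
 * Legend:
 *   x  7-bit payload of each input byte (the 0x80 bit is stripped)
 *   g  set when the character was read from the high-bit (GR) half
 *   m  the escape character M of the "~M" sequence selecting the charset
 */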
60 
/* Lead byte of the two-byte "~X" escape sequences handled in this file. */
#define ESCAPE_CHAR	'~'

/*
 * Character set classes.  The numeric values are used as indices into
 * ranges[], so the enumerators and the table must stay in step.
 */
typedef enum {
	CTRL	= 0,
	ASCII	= 1,
	GB2312	= 2,
	CS94	= 3,
	CS96	= 4
} charset_t;

/* Inclusive span of 7-bit byte values accepted for one charset class. */
typedef struct {
	int	 start;		/* lowest accepted byte value */
	int	 end;		/* highest accepted byte value */
	int	 width;		/* count of values in [start, end] */
} range_t;

#define RANGE(lo, hi)	{ (lo), (hi), ((hi) - (lo)) + 1 }
static const range_t ranges[] = {
	[CTRL]   = RANGE(0x00, 0x1F),
	[ASCII]  = RANGE(0x20, 0x7F),
	[GB2312] = RANGE(0x21, 0x7E),
	[CS94]   = RANGE(0x21, 0x7E),
	[CS96]   = RANGE(0x20, 0x7F),
};
#undef RANGE
82 
83 typedef struct escape_t escape_t;
84 typedef struct {
85 	charset_t	 charset;
86 	escape_t	*escape;
87 	ssize_t		 length;
88 #define ROWCOL_MAX	3
89 } graphic_t;
90 
91 typedef TAILQ_HEAD(escape_list, escape_t) escape_list;
92 struct escape_t {
93 	TAILQ_ENTRY(escape_t)	 entry;
94 	escape_list		*set;
95 	graphic_t		*left;
96 	graphic_t		*right;
97 	int			 ch;
98 };
99 
100 #define GL(escape)	((escape)->left)
101 #define GR(escape)	((escape)->right)
102 #define SET(escape)	((escape)->set)
103 #define ESC(escape)	((escape)->ch)
104 #define INIT(escape)	(TAILQ_FIRST(SET(escape)))
105 
106 static __inline escape_t *
107 find_escape(escape_list *set, int ch)
108 {
109 	escape_t *escape;
110 
111 	TAILQ_FOREACH(escape, set, entry) {
112 		if (ESC(escape) == ch)
113 			break;
114 	}
115 
116 	return (escape);
117 }
118 
119 typedef struct {
120 	escape_list	 e0;
121 	escape_list	 e1;
122 	graphic_t	*ascii;
123 	graphic_t	*gb2312;
124 } _HZEncodingInfo;
125 
126 #define E0SET(ei)	(&(ei)->e0)
127 #define E1SET(ei)	(&(ei)->e1)
128 #define INIT0(ei)	(TAILQ_FIRST(E0SET(ei)))
129 #define INIT1(ei)	(TAILQ_FIRST(E1SET(ei)))
130 
131 typedef struct {
132 	escape_t	*inuse;
133 	int		 chlen;
134 	char		 ch[ROWCOL_MAX];
135 } _HZState;
136 
137 #define _CEI_TO_EI(_cei_)		(&(_cei_)->ei)
138 #define _CEI_TO_STATE(_cei_, _func_)	(_cei_)->states.s_##_func_
139 
140 #define _FUNCNAME(m)			_citrus_HZ_##m
141 #define _ENCODING_INFO			_HZEncodingInfo
142 #define _ENCODING_STATE			_HZState
143 #define _ENCODING_MB_CUR_MAX(_ei_)	MB_LEN_MAX
144 #define _ENCODING_IS_STATE_DEPENDENT		1
145 #define _STATE_NEEDS_EXPLICIT_INIT(_ps_)	((_ps_)->inuse == NULL)
146 
147 static __inline void
148 _citrus_HZ_init_state(_HZEncodingInfo * __restrict ei,
149     _HZState * __restrict psenc)
150 {
151 
152 	psenc->chlen = 0;
153 	psenc->inuse = INIT0(ei);
154 }
155 
#if 0
/*
 * State (un)packing helpers, currently compiled out.  Both simply copy
 * the raw _HZState bytes to or from the caller's private area.
 */
static __inline void
/*ARGSUSED*/
_citrus_HZ_pack_state(_HZEncodingInfo * __restrict ei __unused,
    void *__restrict pspriv, const _HZState * __restrict psenc)
{

	memcpy(pspriv, psenc, sizeof(*psenc));
}

static __inline void
/*ARGSUSED*/
_citrus_HZ_unpack_state(_HZEncodingInfo * __restrict ei __unused,
    _HZState * __restrict psenc, const void * __restrict pspriv)
{

	memcpy(psenc, pspriv, sizeof(*psenc));
}
#endif
175 
176 static int
177 _citrus_HZ_mbrtowc_priv(_HZEncodingInfo * __restrict ei,
178     wchar_t * __restrict pwc, char ** __restrict s, size_t n,
179     _HZState * __restrict psenc, size_t * __restrict nresult)
180 {
181 	escape_t *candidate, *init;
182 	graphic_t *graphic;
183 	const range_t *range;
184 	char *s0;
185 	wchar_t wc;
186 	int bit, ch, head, len, tail;
187 
188 	if (*s == NULL) {
189 		_citrus_HZ_init_state(ei, psenc);
190 		*nresult = 1;
191 		return (0);
192 	}
193 	s0 = *s;
194 	if (psenc->chlen < 0 || psenc->inuse == NULL)
195 		return (EINVAL);
196 
197 	wc = (wchar_t)0;
198 	bit = head = tail = 0;
199 	graphic = NULL;
200 	for (len = 0; len <= MB_LEN_MAX;) {
201 		if (psenc->chlen == tail) {
202 			if (n-- < 1) {
203 				*s = s0;
204 				*nresult = (size_t)-2;
205 				return (0);
206 			}
207 			psenc->ch[psenc->chlen++] = *s0++;
208 			++len;
209 		}
210 		ch = (unsigned char)psenc->ch[tail++];
211 		if (tail == 1) {
212 			if ((ch & ~0x80) <= 0x1F) {
213 				if (psenc->inuse != INIT0(ei))
214 					break;
215 				wc = (wchar_t)ch;
216 				goto done;
217 			}
218 			if (ch & 0x80) {
219 				graphic = GR(psenc->inuse);
220 				bit = 0x80;
221 				ch &= ~0x80;
222 			} else {
223 				graphic = GL(psenc->inuse);
224 				if (ch == ESCAPE_CHAR)
225 					continue;
226 				bit = 0x0;
227 			}
228 			if (graphic == NULL)
229 				break;
230 		} else if (tail == 2 && psenc->ch[0] == ESCAPE_CHAR) {
231 			if (tail < psenc->chlen)
232 				return (EINVAL);
233 			if (ch == ESCAPE_CHAR) {
234 				++head;
235 			} else if (ch == '\n') {
236 				if (psenc->inuse != INIT0(ei))
237 					break;
238 				tail = psenc->chlen = 0;
239 				continue;
240 			} else {
241 				candidate = NULL;
242 				init = INIT0(ei);
243 				if (psenc->inuse == init) {
244 					init = INIT1(ei);
245 				} else if (INIT(psenc->inuse) == init) {
246 					if (ESC(init) != ch)
247 						break;
248 					candidate = init;
249 				}
250 				if (candidate == NULL) {
251 					candidate = find_escape(
252 					    SET(psenc->inuse), ch);
253 					if (candidate == NULL) {
254 						if (init == NULL ||
255 						    ESC(init) != ch)
256 							break;
257 						candidate = init;
258 					}
259 				}
260 				psenc->inuse = candidate;
261 				tail = psenc->chlen = 0;
262 				continue;
263 			}
264 		} else if (ch & 0x80) {
265 			if (graphic != GR(psenc->inuse))
266 				break;
267 			ch &= ~0x80;
268 		} else {
269 			if (graphic != GL(psenc->inuse))
270 				break;
271 		}
272 		range = &ranges[(size_t)graphic->charset];
273 		if (range->start > ch || range->end < ch)
274 			break;
275 		wc <<= 8;
276 		wc |= ch;
277 		if (graphic->length == (tail - head)) {
278 			if (graphic->charset > GB2312)
279 				bit |= ESC(psenc->inuse) << 24;
280 			wc |= bit;
281 			goto done;
282 		}
283 	}
284 	*nresult = (size_t)-1;
285 	return (EILSEQ);
286 done:
287 	if (tail < psenc->chlen)
288 		return (EINVAL);
289 	*s = s0;
290 	if (pwc != NULL)
291 		*pwc = wc;
292 	psenc->chlen = 0;
293 	*nresult = (wc == 0) ? 0 : len;
294 
295 	return (0);
296 }
297 
298 static int
299 _citrus_HZ_wcrtomb_priv(_HZEncodingInfo * __restrict ei,
300     char * __restrict s, size_t n, wchar_t wc,
301     _HZState * __restrict psenc, size_t * __restrict nresult)
302 {
303 	escape_t *candidate, *init;
304 	graphic_t *graphic;
305 	const range_t *range;
306 	size_t len;
307 	int bit, ch;
308 
309 	if (psenc->chlen != 0 || psenc->inuse == NULL)
310 		return (EINVAL);
311 	if (wc & 0x80) {
312 		bit = 0x80;
313 		wc &= ~0x80;
314 	} else {
315 		bit = 0x0;
316 	}
317 	if ((uint32_t)wc <= 0x1F) {
318 		candidate = INIT0(ei);
319 		graphic = (bit == 0) ? candidate->left : candidate->right;
320 		if (graphic == NULL)
321 			goto ilseq;
322 		range = &ranges[(size_t)CTRL];
323 		len = 1;
324 	} else if ((uint32_t)wc <= 0x7F) {
325 		graphic = ei->ascii;
326 		if (graphic == NULL)
327 			goto ilseq;
328 		candidate = graphic->escape;
329 		range = &ranges[(size_t)graphic->charset];
330 		len = graphic->length;
331 	} else if ((uint32_t)wc <= 0x7F7F) {
332 		graphic = ei->gb2312;
333 		if (graphic == NULL)
334 			goto ilseq;
335 		candidate = graphic->escape;
336 		range = &ranges[(size_t)graphic->charset];
337 		len = graphic->length;
338 	} else {
339 		ch = (wc >> 24) & 0xFF;
340 		candidate = find_escape(E0SET(ei), ch);
341 		if (candidate == NULL) {
342 			candidate = find_escape(E1SET(ei), ch);
343 			if (candidate == NULL)
344 				goto ilseq;
345 		}
346 		wc &= ~0xFF000000;
347 		graphic = (bit == 0) ? candidate->left : candidate->right;
348 		if (graphic == NULL)
349 			goto ilseq;
350 		range = &ranges[(size_t)graphic->charset];
351 		len = graphic->length;
352 	}
353 	if (psenc->inuse != candidate) {
354 		init = INIT0(ei);
355 		if (SET(psenc->inuse) == SET(candidate)) {
356 			if (INIT(psenc->inuse) != init ||
357 			    psenc->inuse == init || candidate == init)
358 				init = NULL;
359 		} else if (candidate == (init = INIT(candidate))) {
360 			init = NULL;
361 		}
362 		if (init != NULL) {
363 			if (n < 2)
364 				return (E2BIG);
365 			n -= 2;
366 			psenc->ch[psenc->chlen++] = ESCAPE_CHAR;
367 			psenc->ch[psenc->chlen++] = ESC(init);
368 		}
369 		if (n < 2)
370 			return (E2BIG);
371 		n -= 2;
372 		psenc->ch[psenc->chlen++] = ESCAPE_CHAR;
373 		psenc->ch[psenc->chlen++] = ESC(candidate);
374 		psenc->inuse = candidate;
375 	}
376 	if (n < len)
377 		return (E2BIG);
378 	while (len-- > 0) {
379 		ch = (wc >> (len * 8)) & 0xFF;
380 		if (range->start > ch || range->end < ch)
381 			goto ilseq;
382 		psenc->ch[psenc->chlen++] = ch | bit;
383 	}
384 	memcpy(s, psenc->ch, psenc->chlen);
385 	*nresult = psenc->chlen;
386 	psenc->chlen = 0;
387 
388 	return (0);
389 
390 ilseq:
391 	*nresult = (size_t)-1;
392 	return (EILSEQ);
393 }
394 
395 static __inline int
396 _citrus_HZ_put_state_reset(_HZEncodingInfo * __restrict ei,
397     char * __restrict s, size_t n, _HZState * __restrict psenc,
398     size_t * __restrict nresult)
399 {
400 	escape_t *candidate;
401 
402 	if (psenc->chlen != 0 || psenc->inuse == NULL)
403 		return (EINVAL);
404 	candidate = INIT0(ei);
405 	if (psenc->inuse != candidate) {
406 		if (n < 2)
407 			return (E2BIG);
408 		n -= 2;
409 		psenc->ch[psenc->chlen++] = ESCAPE_CHAR;
410 		psenc->ch[psenc->chlen++] = ESC(candidate);
411 	}
412 	if (n < 1)
413 		return (E2BIG);
414 	if (psenc->chlen > 0)
415 		memcpy(s, psenc->ch, psenc->chlen);
416 	*nresult = psenc->chlen;
417 	_citrus_HZ_init_state(ei, psenc);
418 
419 	return (0);
420 }
421 
422 static __inline int
423 _citrus_HZ_stdenc_get_state_desc_generic(_HZEncodingInfo * __restrict ei,
424     _HZState * __restrict psenc, int * __restrict rstate)
425 {
426 
427 	if (psenc->chlen < 0 || psenc->inuse == NULL)
428 		return (EINVAL);
429 	*rstate = (psenc->chlen == 0)
430 	    ? ((psenc->inuse == INIT0(ei))
431 	        ? _STDENC_SDGEN_INITIAL
432 	        : _STDENC_SDGEN_STABLE)
433 	    : ((psenc->ch[0] == ESCAPE_CHAR)
434 	        ? _STDENC_SDGEN_INCOMPLETE_SHIFT
435 	        : _STDENC_SDGEN_INCOMPLETE_CHAR);
436 
437 	return (0);
438 }
439 
440 static __inline int
441 /*ARGSUSED*/
442 _citrus_HZ_stdenc_wctocs(_HZEncodingInfo * __restrict ei __unused,
443     _csid_t * __restrict csid, _index_t * __restrict idx, wchar_t wc)
444 {
445 	int bit;
446 
447 	if (wc & 0x80) {
448 		bit = 0x80;
449 		wc &= ~0x80;
450 	} else
451 		bit = 0x0;
452 	if ((uint32_t)wc <= 0x7F) {
453 		*csid = (_csid_t)bit;
454 		*idx = (_index_t)wc;
455 	} else if ((uint32_t)wc <= 0x7F7F) {
456 		*csid = (_csid_t)(bit | 0x8000);
457 		*idx = (_index_t)wc;
458 	} else {
459 		*csid = (_index_t)(wc & ~0x00FFFF7F);
460 		*idx = (_csid_t)(wc & 0x00FFFF7F);
461 	}
462 
463 	return (0);
464 }
465 
466 static __inline int
467 /*ARGSUSED*/
468 _citrus_HZ_stdenc_cstowc(_HZEncodingInfo * __restrict ei __unused,
469     wchar_t * __restrict wc, _csid_t csid, _index_t idx)
470 {
471 
472 	*wc = (wchar_t)idx;
473 	switch (csid) {
474 	case 0x80:
475 	case 0x8080:
476 		*wc |= (wchar_t)0x80;
477 		/*FALLTHROUGH*/
478 	case 0x0:
479 	case 0x8000:
480 		break;
481 	default:
482 		*wc |= (wchar_t)csid;
483 	}
484 
485 	return (0);
486 }
487 
488 static void
489 _citrus_HZ_encoding_module_uninit(_HZEncodingInfo *ei)
490 {
491 	escape_t *escape;
492 
493 	while ((escape = TAILQ_FIRST(E0SET(ei))) != NULL) {
494 		TAILQ_REMOVE(E0SET(ei), escape, entry);
495 		free(GL(escape));
496 		free(GR(escape));
497 		free(escape);
498 	}
499 	while ((escape = TAILQ_FIRST(E1SET(ei))) != NULL) {
500 		TAILQ_REMOVE(E1SET(ei), escape, entry);
501 		free(GL(escape));
502 		free(GR(escape));
503 		free(escape);
504 	}
505 }
506 
507 static int
508 _citrus_HZ_parse_char(void *context, const char *name __unused, const char *s)
509 {
510 	escape_t *escape;
511 	void **p;
512 
513 	p = (void **)context;
514 	escape = (escape_t *)p[0];
515 	if (escape->ch != '\0')
516 		return (EINVAL);
517 	escape->ch = *s++;
518 	if (escape->ch == ESCAPE_CHAR || *s != '\0')
519 		return (EINVAL);
520 
521 	return (0);
522 }
523 
524 static int
525 _citrus_HZ_parse_graphic(void *context, const char *name, const char *s)
526 {
527 	_HZEncodingInfo *ei;
528 	escape_t *escape;
529 	graphic_t *graphic;
530 	void **p;
531 
532 	p = (void **)context;
533 	escape = (escape_t *)p[0];
534 	ei = (_HZEncodingInfo *)p[1];
535 	graphic = calloc(1, sizeof(*graphic));
536 	if (graphic == NULL)
537 		return (ENOMEM);
538 	if (strcmp("GL", name) == 0) {
539 		if (GL(escape) != NULL)
540 			goto release;
541 		GL(escape) = graphic;
542 	} else if (strcmp("GR", name) == 0) {
543 		if (GR(escape) != NULL)
544 			goto release;
545 		GR(escape) = graphic;
546 	} else {
547 release:
548 		free(graphic);
549 		return (EINVAL);
550 	}
551 	graphic->escape = escape;
552 	if (_bcs_strncasecmp("ASCII", s, 5) == 0) {
553 		if (s[5] != '\0')
554 			return (EINVAL);
555 		graphic->charset = ASCII;
556 		graphic->length = 1;
557 		ei->ascii = graphic;
558 		return (0);
559 	} else if (_bcs_strncasecmp("GB2312", s, 6) == 0) {
560 		if (s[6] != '\0')
561 			return (EINVAL);
562 		graphic->charset = GB2312;
563 		graphic->length = 2;
564 		ei->gb2312 = graphic;
565 		return (0);
566 	} else if (strncmp("94*", s, 3) == 0)
567 		graphic->charset = CS94;
568 	else if (strncmp("96*", s, 3) == 0)
569 		graphic->charset = CS96;
570 	else
571 		return (EINVAL);
572 	s += 3;
573 	switch(*s) {
574 	case '1': case '2': case '3':
575 		graphic->length = (size_t)(*s - '0');
576 		if (*++s == '\0')
577 			break;
578 	/*FALLTHROUGH*/
579 	default:
580 		return (EINVAL);
581 	}
582 	return (0);
583 }
584 
585 static const _citrus_prop_hint_t escape_hints[] = {
586 _CITRUS_PROP_HINT_STR("CH", &_citrus_HZ_parse_char),
587 _CITRUS_PROP_HINT_STR("GL", &_citrus_HZ_parse_graphic),
588 _CITRUS_PROP_HINT_STR("GR", &_citrus_HZ_parse_graphic),
589 _CITRUS_PROP_HINT_END
590 };
591 
592 static int
593 _citrus_HZ_parse_escape(void *context, const char *name, const char *s)
594 {
595 	_HZEncodingInfo *ei;
596 	escape_t *escape;
597 	void *p[2];
598 
599 	ei = (_HZEncodingInfo *)context;
600 	escape = calloc(1, sizeof(*escape));
601 	if (escape == NULL)
602 		return (EINVAL);
603 	if (strcmp("0", name) == 0) {
604 		escape->set = E0SET(ei);
605 		TAILQ_INSERT_TAIL(E0SET(ei), escape, entry);
606 	} else if (strcmp("1", name) == 0) {
607 		escape->set = E1SET(ei);
608 		TAILQ_INSERT_TAIL(E1SET(ei), escape, entry);
609 	} else {
610 		free(escape);
611 		return (EINVAL);
612 	}
613 	p[0] = (void *)escape;
614 	p[1] = (void *)ei;
615 	return (_citrus_prop_parse_variable(
616 	    escape_hints, (void *)&p[0], s, strlen(s)));
617 }
618 
619 static const _citrus_prop_hint_t root_hints[] = {
620 _CITRUS_PROP_HINT_STR("0", &_citrus_HZ_parse_escape),
621 _CITRUS_PROP_HINT_STR("1", &_citrus_HZ_parse_escape),
622 _CITRUS_PROP_HINT_END
623 };
624 
625 static int
626 _citrus_HZ_encoding_module_init(_HZEncodingInfo * __restrict ei,
627     const void * __restrict var, size_t lenvar)
628 {
629 	int errnum;
630 
631 	memset(ei, 0, sizeof(*ei));
632 	TAILQ_INIT(E0SET(ei));
633 	TAILQ_INIT(E1SET(ei));
634 	errnum = _citrus_prop_parse_variable(
635 	    root_hints, (void *)ei, var, lenvar);
636 	if (errnum != 0)
637 		_citrus_HZ_encoding_module_uninit(ei);
638 	return (errnum);
639 }
640 
641 /* ----------------------------------------------------------------------
642  * public interface for stdenc
643  */
644 
645 _CITRUS_STDENC_DECLS(HZ);
646 _CITRUS_STDENC_DEF_OPS(HZ);
647 
648 #include "citrus_stdenc_template.h"
649