xref: /illumos-gate/usr/src/cmd/iconv/scanner.c (revision e86372a01d2d16a5dd4a64e144ed978ba17fe7dd)
1 /*
2  * This file and its contents are supplied under the terms of the
3  * Common Development and Distribution License ("CDDL"), version 1.0.
4  * You may only use this file in accordance with the terms of version
5  * 1.0 of the CDDL.
6  *
7  * A full copy of the text of the CDDL should have accompanied this
8  * source.  A copy of the CDDL is also available via the Internet at
9  * http://www.illumos.org/license/CDDL.
10  */
11 
12 /*
13  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
14  */
15 
16 /*
17  * This file contains the "scanner", which tokenizes charmap files
18  * for iconv for processing by the higher level grammar processor.
19  */
20 
21 #include <stdio.h>
22 #include <stdlib.h>
23 #include <ctype.h>
24 #include <limits.h>
25 #include <string.h>
26 #include <widec.h>
27 #include <sys/types.h>
28 #include <assert.h>
29 #include "charmap.h"
30 #include "parser.tab.h"
31 
32 int			com_char = '#';
33 int			esc_char = '\\';
34 int			mb_cur_min = 1;
35 int			mb_cur_max = MB_LEN_MAX;
36 int			lineno = 1;
37 int			warnings = 0;
38 static int		nextline;
39 static FILE		*input = stdin;
40 static const char	*filename = "<stdin>";
41 static int		instring = 0;
42 static int		escaped = 0;
43 
/*
 * Buffer holding the token being assembled.  add_tok() enlarges it
 * whenever the next character would not fit.
 */
static char	*token = NULL;	/* NUL-terminated text, NULL when unallocated */
static int	toksz = 0;	/* bytes currently allocated for token */
static int	tokidx;		/* characters stored in token so far */
static int	hadtok = 0;	/* nonzero once the current line had a token */
51 
52 /*
53  * The last keyword seen.  This is useful to trigger the special lexer rules
54  * for "copy" and also collating symbols and elements.
55  */
56 int	last_kw = 0;
57 static int	category = T_END;
58 
59 static struct token {
60 	int id;
61 	const char *name;
62 } keywords[] = {
63 	{ T_COM_CHAR,		"comment_char" },
64 	{ T_ESC_CHAR,		"escape_char" },
65 	{ T_END,		"END" },
66 
67 	/*
68 	 * These are keywords used in the charmap file.  Note that
69 	 * Solaris orginally used angle brackets to wrap some of them,
70 	 * but we removed that to simplify our parser.  The first of these
71 	 * items are "global items."
72 	 */
73 	{ T_CHARMAP,		"CHARMAP" },
74 	{ T_WIDTH,		"WIDTH" },
75 	{ T_WIDTH_DEFAULT,	"WIDTH_DEFAULT" },
76 
77 	{ -1, NULL },
78 };
79 
80 /*
81  * These special words are only used in a charmap file, enclosed in <>.
82  */
83 static struct token symwords[] = {
84 	{ T_COM_CHAR,		"comment_char" },
85 	{ T_ESC_CHAR,		"escape_char" },
86 	{ T_CODE_SET,		"code_set_name" },
87 	{ T_MB_CUR_MAX,		"mb_cur_max" },
88 	{ T_MB_CUR_MIN,		"mb_cur_min" },
89 	{ -1, NULL },
90 };
91 
92 static int categories[] = {
93 	T_CHARMAP,
94 	0
95 };
96 
97 void
98 reset_scanner(const char *fname)
99 {
100 	if (fname == NULL) {
101 		filename = "<stdin>";
102 		input = stdin;
103 	} else {
104 		if (input != stdin)
105 			(void) fclose(input);
106 		if ((input = fopen(fname, "r")) == NULL) {
107 			perror(fname);
108 			exit(1);
109 		}
110 		filename = fname;
111 	}
112 	com_char = '#';
113 	esc_char = '\\';
114 	instring = 0;
115 	escaped = 0;
116 	lineno = 1;
117 	nextline = 1;
118 	tokidx = 0;
119 	last_kw = 0;
120 	category = T_END;
121 }
122 
/*
 * hex(x) yields the numeric value of the hexadecimal digit character x; the
 * result is only meaningful when isxdigit(x) is true.  isodigit(x) is
 * nonzero when x is an octal digit character.
 *
 * Every use of the argument is parenthesized so that an expression argument
 * (for example c | 0x20) is not torn apart by operator precedence.  The
 * argument is still evaluated more than once, so it must have no side
 * effects.
 */
#define	hex(x)	\
	(isdigit(x) ? ((x) - '0') : \
	((islower(x) ? ((x) - 'a') : ((x) - 'A')) + 10))
#define	isodigit(x)	(((x) >= '0') && ((x) <= '7'))
126 
127 static int
128 scanc(void)
129 {
130 	int	c;
131 
132 	c = getc(input);
133 	lineno = nextline;
134 	if (c == '\n') {
135 		nextline++;
136 	}
137 	return (c);
138 }
139 
140 static void
141 unscanc(int c)
142 {
143 	if (c == '\n') {
144 		nextline--;
145 	}
146 	if (ungetc(c, input) < 0) {
147 		yyerror(_("ungetc failed"));
148 	}
149 }
150 
/*
 * Read exactly two hexadecimal digits and return the byte they encode.
 * Anything else is reported through yyerror(); 0 is returned in that case.
 */
static int
scan_hex_byte(void)
{
	int	hi, lo;

	hi = scanc();
	if (isxdigit(hi)) {
		lo = scanc();
		if (isxdigit(lo))
			return ((hex(hi) << 4) | hex(lo));
	}

	yyerror(_("malformed hex digit"));
	return (0);
}
170 
/*
 * Read two or three decimal digits and return the byte value they encode.
 *
 * The first two digits are mandatory.  A third character is consumed only
 * if it is a digit; otherwise it is pushed back.  Because three digits can
 * spell values up to 999, the result is checked against UCHAR_MAX rather
 * than being silently truncated when it is later stored as a byte.
 *
 * Errors are reported through yyerror(); 0 is returned in that case.
 */
static int
scan_dec_byte(void)
{
	int	c;
	int	b;

	c = scanc();
	if (!isdigit(c)) {
		yyerror(_("malformed decimal digit"));
		return (0);
	}
	b = c - '0';

	c = scanc();
	if (!isdigit(c)) {
		yyerror(_("malformed decimal digit"));
		return (0);
	}
	b = (b * 10) + (c - '0');

	/* the third digit is optional */
	c = scanc();
	if (isdigit(c)) {
		b = (b * 10) + (c - '0');
	} else {
		unscanc(c);
	}

	if (b > UCHAR_MAX) {
		yyerror(_("decimal byte value out of range"));
		return (0);
	}
	return (b);
}
199 
/*
 * Read two or three octal digits and return the byte value they encode.
 *
 * The first two digits are mandatory.  A third character is consumed only
 * if it is an octal digit; otherwise it is pushed back.  Three octal digits
 * can spell values up to 0777, so the result is checked against UCHAR_MAX
 * rather than being silently truncated when it is later stored as a byte.
 *
 * Errors are reported through yyerror(); 0 is returned in that case.
 */
static int
scan_oct_byte(void)
{
	int	c;
	int	b;

	c = scanc();
	if (!isodigit(c)) {
		yyerror(_("malformed octal digit"));
		return (0);
	}
	b = c - '0';

	c = scanc();
	if (!isodigit(c)) {
		yyerror(_("malformed octal digit"));
		return (0);
	}
	b = (b * 8) + (c - '0');

	/* the third digit is optional */
	c = scanc();
	if (isodigit(c)) {
		b = (b * 8) + (c - '0');
	} else {
		unscanc(c);
	}

	if (b > UCHAR_MAX) {
		yyerror(_("octal byte value out of range"));
		return (0);
	}
	return (b);
}
230 
231 void
232 add_tok(int c)
233 {
234 	if ((tokidx + 1) >= toksz) {
235 		toksz += 64;
236 		if ((token = realloc(token, toksz)) == NULL) {
237 			yyerror(_("out of memory"));
238 			tokidx = 0;
239 			toksz = 0;
240 			return;
241 		}
242 	}
243 
244 	token[tokidx++] = (char)c;
245 	token[tokidx] = 0;
246 }
247 
248 static int
249 get_byte(void)
250 {
251 	int	c;
252 
253 	if ((c = scanc()) != esc_char) {
254 		unscanc(c);
255 		return (EOF);
256 	}
257 	c = scanc();
258 
259 	switch (c) {
260 	case 'd':
261 	case 'D':
262 		return (scan_dec_byte());
263 	case 'x':
264 	case 'X':
265 		return (scan_hex_byte());
266 	case '0':
267 	case '1':
268 	case '2':
269 	case '3':
270 	case '4':
271 	case '5':
272 	case '6':
273 	case '7':
274 		/* put the character back so we can get it */
275 		unscanc(c);
276 		return (scan_oct_byte());
277 	default:
278 		unscanc(c);
279 		unscanc(esc_char);
280 		return (EOF);
281 	}
282 }
283 
/*
 * Translate the character that followed the escape character into the
 * character it stands for.  The letters n, r, t, f, v, b and a map to the
 * matching control characters; any other value is returned unchanged.
 */
int
get_escaped(int c)
{
	static const struct {
		char	letter;
		char	value;
	} map[] = {
		{ 'n', '\n' },
		{ 'r', '\r' },
		{ 't', '\t' },
		{ 'f', '\f' },
		{ 'v', '\v' },
		{ 'b', '\b' },
		{ 'a', '\a' },
	};
	size_t	i;

	for (i = 0; i < sizeof (map) / sizeof (map[0]); i++) {
		if (c == map[i].letter)
			return (map[i].value);
	}
	return (c);
}
306 
307 int
308 get_wide(void)
309 {
310 	/* NB: yylval.mbs[0] is the length */
311 	char *mbs = &yylval.mbs[1];
312 	int mbi = 0;
313 	int c;
314 
315 	mbs[mbi] = 0;
316 	if (mb_cur_max > MB_LEN_MAX) {
317 		yyerror(_("max multibyte character size too big"));
318 		return (T_NULL);
319 	}
320 	for (;;) {
321 		if ((c = get_byte()) == EOF)
322 			break;
323 		if (mbi == mb_cur_max) {
324 			unscanc(c);
325 			yyerror(_("length > mb_cur_max"));
326 			return (T_NULL);
327 		}
328 		mbs[mbi++] = c;
329 		mbs[mbi] = 0;
330 	}
331 
332 	/* result in yylval.mbs */
333 	mbs[-1] = mbi;
334 	return (T_CHAR);
335 }
336 
337 int
338 get_symbol(void)
339 {
340 	int	c;
341 
342 	while ((c = scanc()) != EOF) {
343 		if (escaped) {
344 			escaped = 0;
345 			if (c == '\n')
346 				continue;
347 			add_tok(get_escaped(c));
348 			continue;
349 		}
350 		if (c == esc_char) {
351 			escaped = 1;
352 			continue;
353 		}
354 		if (c == '\n') {	/* well that's strange! */
355 			yyerror(_("unterminated symbolic name"));
356 			continue;
357 		}
358 		if (c == '>') {		/* end of symbol */
359 
360 			/*
361 			 * This restarts the token from the beginning
362 			 * the next time we scan a character.  (This
363 			 * token is complete.)
364 			 */
365 
366 			if (token == NULL) {
367 				yyerror(_("missing symbolic name"));
368 				return (T_NULL);
369 			}
370 			tokidx = 0;
371 
372 			/*
373 			 * A few symbols are handled as keywords outside
374 			 * of the normal categories.
375 			 */
376 			if (category == T_END) {
377 				int i;
378 				for (i = 0; symwords[i].name != 0; i++) {
379 					if (strcmp(token, symwords[i].name) ==
380 					    0) {
381 						last_kw = symwords[i].id;
382 						return (last_kw);
383 					}
384 				}
385 			}
386 			/* its an undefined symbol */
387 			yylval.token = strdup(token);
388 			if (yylval.token == NULL) {
389 				perror("malloc");
390 				exit(1);
391 			}
392 			token = NULL;
393 			toksz = 0;
394 			tokidx = 0;
395 			return (T_SYMBOL);
396 		}
397 		add_tok(c);
398 	}
399 
400 	yyerror(_("unterminated symbolic name"));
401 	return (EOF);
402 }
403 
404 
405 static int
406 consume_token(void)
407 {
408 	int	len = tokidx;
409 	int	i;
410 
411 	tokidx = 0;
412 	if (token == NULL)
413 		return (T_NULL);
414 
415 	/*
416 	 * this one is special, because we don't want it to alter the
417 	 * last_kw field.
418 	 */
419 	if (strcmp(token, "...") == 0) {
420 		return (T_ELLIPSIS);
421 	}
422 
423 	/* search for reserved words first */
424 	for (i = 0; keywords[i].name; i++) {
425 		int j;
426 		if (strcmp(keywords[i].name, token) != 0) {
427 			continue;
428 		}
429 
430 		last_kw = keywords[i].id;
431 
432 		/* clear the top level category if we're done with it */
433 		if (last_kw == T_END) {
434 			category = T_END;
435 		}
436 
437 		/* set the top level category if we're changing */
438 		for (j = 0; categories[j]; j++) {
439 			if (categories[j] != last_kw)
440 				continue;
441 			category = last_kw;
442 		}
443 
444 		return (keywords[i].id);
445 	}
446 
447 	/* maybe its a numeric constant? */
448 	if (isdigit(*token) || (*token == '-' && isdigit(token[1]))) {
449 		char *eptr;
450 		yylval.num = strtol(token, &eptr, 10);
451 		if (*eptr != 0)
452 			yyerror(_("malformed number"));
453 		return (T_NUMBER);
454 	}
455 
456 	/*
457 	 * A single lone character is treated as a character literal.
458 	 * To avoid duplication of effort, we stick in the charmap.
459 	 */
460 	if (len == 1) {
461 		yylval.mbs[0] = 1; /* length */
462 		yylval.mbs[1] = token[0];
463 		yylval.mbs[2] = '\0';
464 		return (T_CHAR);
465 	}
466 
467 	/* anything else is treated as a symbolic name */
468 	yylval.token = strdup(token);
469 	token = NULL;
470 	toksz = 0;
471 	tokidx = 0;
472 	return (T_NAME);
473 }
474 
/*
 * Discard input up to and including the next newline.  Reaching the end
 * of the input first is reported through errf().
 */
void
scan_to_eol(void)
{
	int	ch;

	for (;;) {
		ch = scanc();
		if (ch == '\n')
			break;
		if (ch == EOF) {
			/* end of file without newline! */
			errf(_("missing newline"));
			return;
		}
	}
	assert(ch == '\n');
}
488 
489 int
490 yylex(void)
491 {
492 	int		c;
493 
494 	while ((c = scanc()) != EOF) {
495 
496 		/* special handling for quoted string */
497 		if (instring) {
498 			if (escaped) {
499 				escaped = 0;
500 
501 				/* if newline, just eat and forget it */
502 				if (c == '\n')
503 					continue;
504 
505 				if (strchr("xXd01234567", c)) {
506 					unscanc(c);
507 					unscanc(esc_char);
508 					return (get_wide());
509 				}
510 				yylval.mbs[0] = 1; /* length */
511 				yylval.mbs[1] = get_escaped(c);
512 				yylval.mbs[2] = '\0';
513 				return (T_CHAR);
514 			}
515 			if (c == esc_char) {
516 				escaped = 1;
517 				continue;
518 			}
519 			switch (c) {
520 			case '<':
521 				return (get_symbol());
522 			case '>':
523 				/* oops! should generate syntax error  */
524 				return (T_GT);
525 			case '"':
526 				instring = 0;
527 				return (T_QUOTE);
528 			default:
529 				yylval.mbs[0] = 1; /* length */
530 				yylval.mbs[1] = c;
531 				yylval.mbs[2] = '\0';
532 				return (T_CHAR);
533 			}
534 		}
535 
536 		/* escaped characters first */
537 		if (escaped) {
538 			escaped = 0;
539 			if (c == '\n') {
540 				/* eat the newline */
541 				continue;
542 			}
543 			hadtok = 1;
544 			if (tokidx) {
545 				/* an escape mid-token is nonsense */
546 				return (T_NULL);
547 			}
548 
549 			/* numeric escapes are treated as wide characters */
550 			if (strchr("xXd01234567", c)) {
551 				unscanc(c);
552 				unscanc(esc_char);
553 				return (get_wide());
554 			}
555 
556 			add_tok(get_escaped(c));
557 			continue;
558 		}
559 
560 		/* if it is the escape charter itself note it */
561 		if (c == esc_char) {
562 			escaped = 1;
563 			continue;
564 		}
565 
566 		/* remove from the comment char to end of line */
567 		if (c == com_char) {
568 			while (c != '\n') {
569 				if ((c = scanc()) == EOF) {
570 					/* end of file without newline! */
571 					return (EOF);
572 				}
573 			}
574 			assert(c == '\n');
575 			if (!hadtok) {
576 				/*
577 				 * If there were no tokens on this line,
578 				 * then just pretend it didn't exist at all.
579 				 */
580 				continue;
581 			}
582 			hadtok = 0;
583 			return (T_NL);
584 		}
585 
586 		if (strchr(" \t\n;()<>,\"", c) && (tokidx != 0)) {
587 			/*
588 			 * These are all token delimiters.  If there
589 			 * is a token already in progress, we need to
590 			 * process it.
591 			 */
592 			unscanc(c);
593 			return (consume_token());
594 		}
595 
596 		switch (c) {
597 		case '\n':
598 			if (!hadtok) {
599 				/*
600 				 * If the line was completely devoid of tokens,
601 				 * then just ignore it.
602 				 */
603 				continue;
604 			}
605 			/* we're starting a new line, reset the token state */
606 			hadtok = 0;
607 			return (T_NL);
608 		case ',':
609 			hadtok = 1;
610 			return (T_COMMA);
611 		case ';':
612 			hadtok = 1;
613 			return (T_SEMI);
614 		case '(':
615 			hadtok = 1;
616 			return (T_LPAREN);
617 		case ')':
618 			hadtok = 1;
619 			return (T_RPAREN);
620 		case '>':
621 			hadtok = 1;
622 			return (T_GT);
623 		case '<':
624 			/* symbol start! */
625 			hadtok = 1;
626 			return (get_symbol());
627 		case ' ':
628 		case '\t':
629 			/* whitespace, just ignore it */
630 			continue;
631 		case '"':
632 			hadtok = 1;
633 			instring = 1;
634 			return (T_QUOTE);
635 		default:
636 			hadtok = 1;
637 			add_tok(c);
638 			continue;
639 		}
640 	}
641 	return (EOF);
642 }
643 
644 void
645 yyerror(const char *msg)
646 {
647 	(void) fprintf(stderr, _("%s: %d: error: %s\n"),
648 	    filename, lineno, msg);
649 	exit(1);
650 }
651 
652 void
653 errf(const char *fmt, ...)
654 {
655 	char	*msg;
656 
657 	va_list	va;
658 	va_start(va, fmt);
659 	(void) vasprintf(&msg, fmt, va);
660 	va_end(va);
661 
662 	(void) fprintf(stderr, _("%s: %d: error: %s\n"),
663 	    filename, lineno, msg);
664 	free(msg);
665 	exit(1);
666 }
667 
668 void
669 warn(const char *fmt, ...)
670 {
671 	char	*msg;
672 
673 	va_list	va;
674 	va_start(va, fmt);
675 	(void) vasprintf(&msg, fmt, va);
676 	va_end(va);
677 
678 	(void) fprintf(stderr, _("%s: %d: warning: %s\n"),
679 	    filename, lineno, msg);
680 	free(msg);
681 	warnings++;
682 }
683