1 /*
2 * This file and its contents are supplied under the terms of the
3 * Common Development and Distribution License ("CDDL"), version 1.0.
4 * You may only use this file in accordance with the terms of version
5 * 1.0 of the CDDL.
6 *
7 * A full copy of the text of the CDDL should have accompanied this
8 * source. A copy of the CDDL is also available via the Internet at
9 * http://www.illumos.org/license/CDDL.
10 */
11
12 /*
13 * Copyright 2010 Nexenta Systems, Inc. All rights reserved.
14 * Copyright 2013 DEY Storage Systems, Inc.
15 */
16
17 /*
18 * This file contains the "scanner", which tokenizes the input files
19 * for localedef for processing by the higher level grammar processor.
20 */
21
22 #include <stdio.h>
23 #include <stdlib.h>
24 #include <ctype.h>
25 #include <limits.h>
26 #include <string.h>
27 #include <widec.h>
28 #include <sys/types.h>
29 #include <assert.h>
30 #include "localedef.h"
31 #include "parser.tab.h"
32
33 int com_char = '#';
34 int esc_char = '\\';
35 int mb_cur_min = 1;
36 int mb_cur_max = 1;
37 int lineno = 1;
38 int warnings = 0;
39 static int nextline;
40 static FILE *input = stdin;
41 static const char *filename = "<stdin>";
42 static int instring = 0;
43 static int escaped = 0;
44
/*
 * Narrow token accumulator.  add_tok() appends to it and grows the
 * buffer on demand; tokidx is the number of bytes currently held and
 * toksz the allocated size.  hadtok records whether any token has been
 * seen on the current line.
 */
static char	*token = NULL;
static int	tokidx = 0;
static int	toksz = 0;
static int	hadtok = 0;

/*
 * Wide string accumulator.  add_wcs() appends to it and grows the
 * buffer on demand; get_wcs() hands the buffer to the caller.
 */
static wchar_t	*widestr = NULL;
static int	wideidx = 0;
static int	widesz = 0;
59
60 /*
61 * The last keyword seen. This is useful to trigger the special lexer rules
62 * for "copy" and also collating symbols and elements.
63 */
64 int last_kw = 0;
65 static int category = T_END;
66
67 static struct token {
68 int id;
69 const char *name;
70 } keywords[] = {
71 { T_COM_CHAR, "comment_char" },
72 { T_ESC_CHAR, "escape_char" },
73 { T_END, "END" },
74 { T_COPY, "copy" },
75 { T_MESSAGES, "LC_MESSAGES" },
76 { T_YESSTR, "yesstr" },
77 { T_YESEXPR, "yesexpr" },
78 { T_NOSTR, "nostr" },
79 { T_NOEXPR, "noexpr" },
80 { T_MONETARY, "LC_MONETARY" },
81 { T_INT_CURR_SYMBOL, "int_curr_symbol" },
82 { T_CURRENCY_SYMBOL, "currency_symbol" },
83 { T_MON_DECIMAL_POINT, "mon_decimal_point" },
84 { T_MON_THOUSANDS_SEP, "mon_thousands_sep" },
85 { T_POSITIVE_SIGN, "positive_sign" },
86 { T_NEGATIVE_SIGN, "negative_sign" },
87 { T_MON_GROUPING, "mon_grouping" },
88 { T_INT_FRAC_DIGITS, "int_frac_digits" },
89 { T_FRAC_DIGITS, "frac_digits" },
90 { T_P_CS_PRECEDES, "p_cs_precedes" },
91 { T_P_SEP_BY_SPACE, "p_sep_by_space" },
92 { T_N_CS_PRECEDES, "n_cs_precedes" },
93 { T_N_SEP_BY_SPACE, "n_sep_by_space" },
94 { T_P_SIGN_POSN, "p_sign_posn" },
95 { T_N_SIGN_POSN, "n_sign_posn" },
96 { T_INT_P_CS_PRECEDES, "int_p_cs_precedes" },
97 { T_INT_N_CS_PRECEDES, "int_n_cs_precedes" },
98 { T_INT_P_SEP_BY_SPACE, "int_p_sep_by_space" },
99 { T_INT_N_SEP_BY_SPACE, "int_n_sep_by_space" },
100 { T_INT_P_SIGN_POSN, "int_p_sign_posn" },
101 { T_INT_N_SIGN_POSN, "int_n_sign_posn" },
102 { T_COLLATE, "LC_COLLATE" },
103 { T_COLLATING_SYMBOL, "collating-symbol" },
104 { T_COLLATING_ELEMENT, "collating-element" },
105 { T_FROM, "from" },
106 { T_ORDER_START, "order_start" },
107 { T_ORDER_END, "order_end" },
108 { T_FORWARD, "forward" },
109 { T_BACKWARD, "backward" },
110 { T_POSITION, "position" },
111 { T_IGNORE, "IGNORE" },
112 { T_UNDEFINED, "UNDEFINED" },
113 { T_NUMERIC, "LC_NUMERIC" },
114 { T_DECIMAL_POINT, "decimal_point" },
115 { T_THOUSANDS_SEP, "thousands_sep" },
116 { T_GROUPING, "grouping" },
117 { T_TIME, "LC_TIME" },
118 { T_ABDAY, "abday" },
119 { T_DAY, "day" },
120 { T_ABMON, "abmon" },
121 { T_MON, "mon" },
122 { T_D_T_FMT, "d_t_fmt" },
123 { T_D_FMT, "d_fmt" },
124 { T_T_FMT, "t_fmt" },
125 { T_AM_PM, "am_pm" },
126 { T_T_FMT_AMPM, "t_fmt_ampm" },
127 { T_ERA, "era" },
128 { T_ERA_D_FMT, "era_d_fmt" },
129 { T_ERA_T_FMT, "era_t_fmt" },
130 { T_ERA_D_T_FMT, "era_d_t_fmt" },
131 { T_ALT_DIGITS, "alt_digits" },
132 { T_CTYPE, "LC_CTYPE" },
133 { T_ISUPPER, "upper" },
134 { T_ISLOWER, "lower" },
135 { T_ISALPHA, "alpha" },
136 { T_ISDIGIT, "digit" },
137 { T_ISPUNCT, "punct" },
138 { T_ISXDIGIT, "xdigit" },
139 { T_ISSPACE, "space" },
140 { T_ISPRINT, "print" },
141 { T_ISGRAPH, "graph" },
142 { T_ISBLANK, "blank" },
143 { T_ISCNTRL, "cntrl" },
144 /*
145 * These entries are local additions, and not specified by
146 * TOG. Note that they are not guaranteed to be accurate for
147 * all locales, and so applications should not depend on them.
148 */
149 { T_ISSPECIAL, "special" },
150 { T_ISENGLISH, "english" },
151 { T_ISPHONOGRAM, "phonogram" },
152 { T_ISIDEOGRAM, "ideogram" },
153 { T_ISNUMBER, "number" },
154 /*
155 * We have to support this in the grammar, but it would be a
156 * syntax error to define a character as one of these without
157 * also defining it as an alpha or digit. We ignore it in our
158 * parsing.
159 */
160 { T_ISALNUM, "alnum" },
161 { T_TOUPPER, "toupper" },
162 { T_TOLOWER, "tolower" },
163
164 /*
165 * These are keywords used in the charmap file. Note that
166 * Solaris orginally used angle brackets to wrap some of them,
167 * but we removed that to simplify our parser. The first of these
168 * items are "global items."
169 */
170 { T_CHARMAP, "CHARMAP" },
171 { T_WIDTH, "WIDTH" },
172
173 { -1, NULL },
174 };
175
176 /*
177 * These special words are only used in a charmap file, enclosed in <>.
178 */
179 static struct token symwords[] = {
180 { T_COM_CHAR, "comment_char" },
181 { T_ESC_CHAR, "escape_char" },
182 { T_CODE_SET, "code_set_name" },
183 { T_MB_CUR_MAX, "mb_cur_max" },
184 { T_MB_CUR_MIN, "mb_cur_min" },
185 { -1, NULL },
186 };
187
188 static int categories[] = {
189 T_CHARMAP,
190 T_CTYPE,
191 T_COLLATE,
192 T_MESSAGES,
193 T_MONETARY,
194 T_NUMERIC,
195 T_TIME,
196 T_WIDTH,
197 0
198 };
199
200 void
reset_scanner(const char * fname)201 reset_scanner(const char *fname)
202 {
203 if (fname == NULL) {
204 filename = "<stdin>";
205 input = stdin;
206 } else {
207 if (input != stdin)
208 (void) fclose(input);
209 if ((input = fopen(fname, "r")) == NULL) {
210 perror("fopen");
211 exit(4);
212 }
213 filename = fname;
214 }
215 com_char = '#';
216 esc_char = '\\';
217 instring = 0;
218 escaped = 0;
219 lineno = 1;
220 nextline = 1;
221 tokidx = 0;
222 wideidx = 0;
223 }
224
/*
 * hex(x) yields the numeric value of the hexadecimal digit character x;
 * the caller must already know that x is a hex digit.  isodigit(x) is
 * true when x is an octal digit character.
 *
 * The argument is fully parenthesized so that an expression argument
 * expands correctly, but it is still evaluated more than once: do not
 * pass an expression with side effects.
 */
#define	hex(x)	\
	(isdigit(x) ? ((x) - '0') : \
	((islower(x) ? ((x) - 'a') : ((x) - 'A')) + 10))
#define	isodigit(x)	(((x) >= '0') && ((x) <= '7'))
228
229 static int
scanc(void)230 scanc(void)
231 {
232 int c;
233
234 c = getc(input);
235 lineno = nextline;
236 if (c == '\n') {
237 nextline++;
238 }
239 return (c);
240 }
241
242 static void
unscanc(int c)243 unscanc(int c)
244 {
245 if (c == '\n') {
246 nextline--;
247 }
248 if (ungetc(c, input) < 0) {
249 yyerror(_("ungetc failed"));
250 }
251 }
252
/*
 * Read exactly two hexadecimal digits from the input and return the
 * byte value they encode.  A non-hex character is reported through
 * yyerror().
 */
static int
scan_hex_byte(void)
{
	int digit[2];
	int i;

	for (i = 0; i < 2; i++) {
		digit[i] = scanc();
		if (!isxdigit(digit[i])) {
			yyerror(_("malformed hex digit"));
			return (0);
		}
	}

	/* first digit is the high nibble */
	return ((hex(digit[0]) << 4) | hex(digit[1]));
}
272
/*
 * Read two or three decimal digits from the input and return the byte
 * value they encode.  The first two digits are mandatory; if the third
 * character is not a digit it is pushed back.
 *
 * Errors are reported through yyerror(): a missing digit, or a value
 * that does not fit in one byte (three digits can encode up to 999,
 * which would otherwise be silently truncated by the caller).
 */
static int
scan_dec_byte(void)
{
	int c1, c2, c3;
	int b;

	c1 = scanc();
	if (!isdigit(c1)) {
		yyerror(_("malformed decimal digit"));
		return (0);
	}
	b = c1 - '0';

	c2 = scanc();
	if (!isdigit(c2)) {
		yyerror(_("malformed decimal digit"));
		return (0);
	}
	b = (b * 10) + (c2 - '0');

	/* the third digit is optional */
	c3 = scanc();
	if (!isdigit(c3)) {
		unscanc(c3);
	} else {
		b = (b * 10) + (c3 - '0');
	}

	if (b > UCHAR_MAX) {
		yyerror(_("decimal byte value out of range"));
		return (0);
	}
	return (b);
}
301
/*
 * Read two or three octal digits from the input and return the byte
 * value they encode.  The first two digits are mandatory; if the third
 * character is not an octal digit it is pushed back.
 *
 * Errors are reported through yyerror(): a missing digit, or a value
 * that does not fit in one byte (three digits can encode up to 0777,
 * which would otherwise be silently truncated by the caller).
 */
static int
scan_oct_byte(void)
{
	int c1, c2, c3;
	int b;

	c1 = scanc();
	if (!isodigit(c1)) {
		yyerror(_("malformed octal digit"));
		return (0);
	}
	b = c1 - '0';

	c2 = scanc();
	if (!isodigit(c2)) {
		yyerror(_("malformed octal digit"));
		return (0);
	}
	b = (b * 8) + (c2 - '0');

	/* the third digit is optional */
	c3 = scanc();
	if (!isodigit(c3)) {
		unscanc(c3);
	} else {
		b = (b * 8) + (c3 - '0');
	}

	if (b > UCHAR_MAX) {
		yyerror(_("octal byte value out of range"));
		return (0);
	}
	return (b);
}
332
333 void
add_tok(int c)334 add_tok(int c)
335 {
336 if ((tokidx + 1) >= toksz) {
337 toksz += 64;
338 if ((token = realloc(token, toksz)) == NULL) {
339 yyerror(_("out of memory"));
340 tokidx = 0;
341 toksz = 0;
342 return;
343 }
344 }
345
346 token[tokidx++] = (char)c;
347 token[tokidx] = 0;
348 }
349 void
add_wcs(wchar_t c)350 add_wcs(wchar_t c)
351 {
352 if ((wideidx + 1) >= widesz) {
353 widesz += 64;
354 widestr = realloc(widestr, (widesz * sizeof (wchar_t)));
355 if (widestr == NULL) {
356 yyerror(_("out of memory"));
357 wideidx = 0;
358 widesz = 0;
359 return;
360 }
361 }
362
363 widestr[wideidx++] = c;
364 widestr[wideidx] = 0;
365 }
366
367 wchar_t *
get_wcs(void)368 get_wcs(void)
369 {
370 wchar_t *ws = widestr;
371 wideidx = 0;
372 widestr = NULL;
373 widesz = 0;
374 if (ws == NULL) {
375 if ((ws = wsdup(L"")) == NULL) {
376 yyerror(_("out of memory"));
377 }
378 }
379 return (ws);
380 }
381
382 static int
get_byte(void)383 get_byte(void)
384 {
385 int c;
386
387 if ((c = scanc()) != esc_char) {
388 unscanc(c);
389 return (EOF);
390 }
391 c = scanc();
392
393 switch (c) {
394 case 'd':
395 case 'D':
396 return (scan_dec_byte());
397 case 'x':
398 case 'X':
399 return (scan_hex_byte());
400 case '0':
401 case '1':
402 case '2':
403 case '3':
404 case '4':
405 case '5':
406 case '6':
407 case '7':
408 /* put the character back so we can get it */
409 unscanc(c);
410 return (scan_oct_byte());
411 default:
412 unscanc(c);
413 unscanc(esc_char);
414 return (EOF);
415 }
416 }
417
/*
 * Translate the character that followed the escape character.  The
 * letters n, r, t, f, v, b and a map to the corresponding control
 * characters; any other value is returned unchanged.
 */
int
get_escaped(int c)
{
	static const char letters[] = "nrtfvba";
	static const char controls[] = "\n\r\t\f\v\b\a";
	int i;

	for (i = 0; letters[i] != '\0'; i++) {
		if (c == letters[i])
			return (controls[i]);
	}
	return (c);
}
440
441 int
get_wide(void)442 get_wide(void)
443 {
444 static char mbs[MB_LEN_MAX + 1] = "";
445 static int mbi = 0;
446 int c;
447 wchar_t wc;
448
449 if (mb_cur_max >= sizeof (mbs)) {
450 yyerror(_("max multibyte character size too big"));
451 mbi = 0;
452 return (T_NULL);
453 }
454 for (;;) {
455 if ((mbi == mb_cur_max) || ((c = get_byte()) == EOF)) {
456 /*
457 * end of the byte sequence reached, but no
458 * valid wide decoding. fatal error.
459 */
460 mbi = 0;
461 yyerror(_("not a valid character encoding"));
462 return (T_NULL);
463 }
464 mbs[mbi++] = c;
465 mbs[mbi] = 0;
466
467 /* does it decode? */
468 if (to_wide(&wc, mbs) >= 0) {
469 break;
470 }
471 }
472
473 mbi = 0;
474 if ((category != T_CHARMAP) && (category != T_WIDTH)) {
475 if (check_charmap(wc) < 0) {
476 yyerror(_("no symbolic name for character"));
477 return (T_NULL);
478 }
479 }
480
481 yylval.wc = wc;
482 return (T_CHAR);
483 }
484
485 int
get_symbol(void)486 get_symbol(void)
487 {
488 int c;
489
490 while ((c = scanc()) != EOF) {
491 if (escaped) {
492 escaped = 0;
493 if (c == '\n')
494 continue;
495 add_tok(get_escaped(c));
496 continue;
497 }
498 if (c == esc_char) {
499 escaped = 1;
500 continue;
501 }
502 if (c == '\n') { /* well that's strange! */
503 yyerror(_("unterminated symbolic name"));
504 continue;
505 }
506 if (c == '>') { /* end of symbol */
507
508 /*
509 * This restarts the token from the beginning
510 * the next time we scan a character. (This
511 * token is complete.)
512 */
513
514 if (token == NULL) {
515 yyerror(_("missing symbolic name"));
516 return (T_NULL);
517 }
518 tokidx = 0;
519
520 /*
521 * A few symbols are handled as keywords outside
522 * of the normal categories.
523 */
524 if (category == T_END) {
525 int i;
526 for (i = 0; symwords[i].name != 0; i++) {
527 if (strcmp(token, symwords[i].name) ==
528 0) {
529 last_kw = symwords[i].id;
530 return (last_kw);
531 }
532 }
533 }
534 /*
535 * Contextual rule: Only literal characters are
536 * permitted in CHARMAP. Anywhere else the symbolic
537 * forms are fine.
538 */
539 if ((category != T_CHARMAP) &&
540 (lookup_charmap(token, &yylval.wc)) != -1) {
541 return (T_CHAR);
542 }
543 if ((yylval.collsym = lookup_collsym(token)) != NULL) {
544 return (T_COLLSYM);
545 }
546 if ((yylval.collelem = lookup_collelem(token)) !=
547 NULL) {
548 return (T_COLLELEM);
549 }
550 /* its an undefined symbol */
551 yylval.token = strdup(token);
552 token = NULL;
553 toksz = 0;
554 tokidx = 0;
555 return (T_SYMBOL);
556 }
557 add_tok(c);
558 }
559
560 yyerror(_("unterminated symbolic name"));
561 return (EOF);
562 }
563
564 int
get_category(void)565 get_category(void)
566 {
567 return (category);
568 }
569
570 static int
consume_token(void)571 consume_token(void)
572 {
573 int len = tokidx;
574 int i;
575
576 tokidx = 0;
577 if (token == NULL)
578 return (T_NULL);
579
580 /*
581 * this one is special, because we don't want it to alter the
582 * last_kw field.
583 */
584 if (strcmp(token, "...") == 0) {
585 return (T_ELLIPSIS);
586 }
587
588 /* search for reserved words first */
589 for (i = 0; keywords[i].name; i++) {
590 int j;
591 if (strcmp(keywords[i].name, token) != 0) {
592 continue;
593 }
594
595 last_kw = keywords[i].id;
596
597 /* clear the top level category if we're done with it */
598 if (last_kw == T_END) {
599 category = T_END;
600 }
601
602 /* set the top level category if we're changing */
603 for (j = 0; categories[j]; j++) {
604 if (categories[j] != last_kw)
605 continue;
606 category = last_kw;
607 }
608
609 return (keywords[i].id);
610 }
611
612 /* maybe its a numeric constant? */
613 if (isdigit(*token) || (*token == '-' && isdigit(token[1]))) {
614 char *eptr;
615 yylval.num = strtol(token, &eptr, 10);
616 if (*eptr != 0)
617 yyerror(_("malformed number"));
618 return (T_NUMBER);
619 }
620
621 /*
622 * A single lone character is treated as a character literal.
623 * To avoid duplication of effort, we stick in the charmap.
624 */
625 if (len == 1) {
626 yylval.wc = token[0];
627 return (T_CHAR);
628 }
629
630 /* anything else is treated as a symbolic name */
631 yylval.token = strdup(token);
632 token = NULL;
633 toksz = 0;
634 tokidx = 0;
635 return (T_NAME);
636 }
637
/*
 * Discard input up to and including the next newline.  Reaching the
 * end of the input first is reported through errf().
 */
void
scan_to_eol(void)
{
	int ch;

	for (;;) {
		ch = scanc();
		if (ch == '\n')
			break;
		if (ch == EOF) {
			/* end of file without newline! */
			errf(_("missing newline"));
			return;
		}
	}
	assert(ch == '\n');
}
651
652 int
yylex(void)653 yylex(void)
654 {
655 int c;
656
657 while ((c = scanc()) != EOF) {
658
659 /* special handling for quoted string */
660 if (instring) {
661 if (escaped) {
662 escaped = 0;
663
664 /* if newline, just eat and forget it */
665 if (c == '\n')
666 continue;
667
668 if (strchr("xXd01234567", c)) {
669 unscanc(c);
670 unscanc(esc_char);
671 return (get_wide());
672 }
673 yylval.wc = get_escaped(c);
674 return (T_CHAR);
675 }
676 if (c == esc_char) {
677 escaped = 1;
678 continue;
679 }
680 switch (c) {
681 case '<':
682 return (get_symbol());
683 case '>':
684 /* oops! should generate syntax error */
685 return (T_GT);
686 case '"':
687 instring = 0;
688 return (T_QUOTE);
689 default:
690 yylval.wc = c;
691 return (T_CHAR);
692 }
693 }
694
695 /* escaped characters first */
696 if (escaped) {
697 escaped = 0;
698 if (c == '\n') {
699 /* eat the newline */
700 continue;
701 }
702 hadtok = 1;
703 if (tokidx) {
704 /* an escape mid-token is nonsense */
705 return (T_NULL);
706 }
707
708 /* numeric escapes are treated as wide characters */
709 if (strchr("xXd01234567", c)) {
710 unscanc(c);
711 unscanc(esc_char);
712 return (get_wide());
713 }
714
715 add_tok(get_escaped(c));
716 continue;
717 }
718
719 /* if it is the escape charter itself note it */
720 if (c == esc_char) {
721 escaped = 1;
722 continue;
723 }
724
725 /* remove from the comment char to end of line */
726 if (c == com_char) {
727 while (c != '\n') {
728 if ((c = scanc()) == EOF) {
729 /* end of file without newline! */
730 return (EOF);
731 }
732 }
733 assert(c == '\n');
734 if (!hadtok) {
735 /*
736 * If there were no tokens on this line,
737 * then just pretend it didn't exist at all.
738 */
739 continue;
740 }
741 hadtok = 0;
742 return (T_NL);
743 }
744
745 if (strchr(" \t\n;()<>,\"", c) && (tokidx != 0)) {
746 /*
747 * These are all token delimiters. If there
748 * is a token already in progress, we need to
749 * process it.
750 */
751 unscanc(c);
752 return (consume_token());
753 }
754
755 switch (c) {
756 case '\n':
757 if (!hadtok) {
758 /*
759 * If the line was completely devoid of tokens,
760 * then just ignore it.
761 */
762 continue;
763 }
764 /* we're starting a new line, reset the token state */
765 hadtok = 0;
766 return (T_NL);
767 case ',':
768 hadtok = 1;
769 return (T_COMMA);
770 case ';':
771 hadtok = 1;
772 return (T_SEMI);
773 case '(':
774 hadtok = 1;
775 return (T_LPAREN);
776 case ')':
777 hadtok = 1;
778 return (T_RPAREN);
779 case '>':
780 hadtok = 1;
781 return (T_GT);
782 case '<':
783 /* symbol start! */
784 hadtok = 1;
785 return (get_symbol());
786 case ' ':
787 case '\t':
788 /* whitespace, just ignore it */
789 continue;
790 case '"':
791 hadtok = 1;
792 instring = 1;
793 return (T_QUOTE);
794 default:
795 hadtok = 1;
796 add_tok(c);
797 continue;
798 }
799 }
800 return (EOF);
801 }
802
803 void
yyerror(const char * msg)804 yyerror(const char *msg)
805 {
806 (void) fprintf(stderr, _("%s: %d: error: %s\n"),
807 filename, lineno, msg);
808 exit(4);
809 }
810
811 void
errf(const char * fmt,...)812 errf(const char *fmt, ...)
813 {
814 char *msg;
815
816 va_list va;
817 va_start(va, fmt);
818 (void) vasprintf(&msg, fmt, va);
819 va_end(va);
820
821 (void) fprintf(stderr, _("%s: %d: error: %s\n"),
822 filename, lineno, msg);
823 free(msg);
824 exit(4);
825 }
826
827 void
warn(const char * fmt,...)828 warn(const char *fmt, ...)
829 {
830 char *msg;
831
832 va_list va;
833 va_start(va, fmt);
834 (void) vasprintf(&msg, fmt, va);
835 va_end(va);
836
837 (void) fprintf(stderr, _("%s: %d: warning: %s\n"),
838 filename, lineno, msg);
839 free(msg);
840 warnings++;
841 if (!warnok)
842 exit(4);
843 }
844