xref: /illumos-gate/usr/src/cmd/msgfmt/gnu_lex.c (revision fc910014e8a32a65612105835a10995f2c13d942)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2001, 2002 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #include "gnu_msgfmt.h"
28 #include "gnu_lex.h"
29 #include "y.tab.h"
30 
/*
 * Current line number in the PO file.  po_getc() increments it for
 * every newline it consumes and it is passed to error() for diagnostics.
 */
int	cur_line = 1;

/*
 * Single-character pushback store shared by po_ungetc() and po_getc().
 * backlen is the number of valid bytes in backbuf; 0 means that nothing
 * is currently pushed back.
 */
static char	backbuf[MB_LEN_MAX];
static int	backlen = 0;
35 
36 /*
37  * get_mb() returns one multibyte character.
38  *
39  * This function uses the iconv() function to find out one
40  * multibyte character from a sequence of bytes in the file stream.
41  * The conversion from the codeset specified in the PO file to UTF-8
42  * is performed.  The funcition reads another byte and calls iconv(),
43  * until iconv() successfully returns as a valid UTF-8 character has
44  * been converted or returns EILSEQ.  If iconv() successfully returned,
45  * the function returns the read bytes as one character.  Otherwise,
46  * returns error.  The string converted to UTF-8 in outbuf won't be
47  * used at all.
48  */
49 static size_t
50 get_mb(unsigned char *tmpbuf, unsigned char fc)
51 {
52 	int	c;
53 	char	outbuf[8];			/* max size of a UTF-8 char */
54 	const char	*inptr;
55 	char	*outptr;
56 	size_t	insize = 0, inlen, outlen, ret;
57 
58 	tmpbuf[insize++] = fc;		/* size of tmpbuf is MB_LEN_MAX+1 */
59 
60 	if (cd == (iconv_t)-1) {
61 		/* no conversion */
62 		tmpbuf[insize] = '\0';
63 		return (insize);
64 	}
65 
66 	for (; ; ) {
67 		inptr = (const char *)tmpbuf;
68 		outptr = &outbuf[0];
69 		inlen = insize;
70 		outlen = sizeof (outbuf);
71 
72 		errno = 0;
73 		ret = iconv(cd, &inptr, &inlen, &outptr, &outlen);
74 		if (ret == (size_t)-1) {
75 			/* iconv failed */
76 			switch (errno) {
77 			case EILSEQ:
78 				/* invalid character found */
79 				error(gettext(ERR_INVALID_CHAR),
80 					cur_line, cur_po);
81 				/* NOTREACHED */
82 			case EINVAL:
83 				/* not enough input */
84 				if (insize == MB_LEN_MAX) {
85 					/* invalid character found */
86 					error(gettext(ERR_INVALID_CHAR),
87 						cur_line, cur_po);
88 					/* NOTREACHED */
89 				}
90 				c = getc(fp);
91 				if (c == EOF) {
92 					error(gettext(ERR_UNEXP_EOF),
93 						cur_line, cur_po);
94 					/* NOTREACHED */
95 				}
96 				tmpbuf[insize++] = (unsigned char)c;
97 
98 				/* initialize the conversion */
99 				outptr = &outbuf[0];
100 				outlen = sizeof (outbuf);
101 				(void) iconv(cd, NULL, NULL, &outptr, &outlen);
102 
103 				continue;
104 				/* NOTREACHED */
105 			default:
106 				/* should never happen */
107 				error(ERR_INTERNAL,
108 					cur_line, cur_po);
109 				/* NOTREACHED */
110 			}
111 			/* NOTREACHED */
112 		}
113 		tmpbuf[insize] = '\0';
114 		return (insize);
115 		/* NOTRECHED */
116 	}
117 }
118 
119 static void
120 po_uninput(int c)
121 {
122 	(void) ungetc(c, fp);
123 	if (c == '\n')
124 		cur_line--;
125 }
126 
127 static void
128 po_ungetc(struct ch *pch)
129 {
130 	if (backlen) {
131 		error(gettext(ERR_INTERNAL), cur_line, cur_po);
132 		/* NOTREACHED */
133 	}
134 	if (!pch->eof) {
135 		backlen = pch->len;
136 		(void) memcpy(backbuf, pch->buf, backlen);
137 	}
138 }
139 
140 static struct ch *
141 po_getc(void)
142 {
143 	static struct ch	och;
144 	int	c;
145 
146 	if (backlen) {
147 		och.len = backlen;
148 		(void) memcpy(och.buf, backbuf, backlen);
149 		backlen = 0;
150 		return (&och);
151 	}
152 
153 	for (; ; ) {
154 		c = getc(fp);
155 		if (c == EOF) {
156 			if (ferror(fp)) {
157 				/* error happend */
158 				error(gettext(ERR_READ_FAILED), cur_po);
159 				/* NOTREACHED */
160 			}
161 			och.len = 0;
162 			och.eof = 1;
163 			return (&och);
164 		}
165 		if (c == '\\') {
166 			c = getc(fp);
167 			if (c == '\n') {
168 				/* this newline should be escaped */
169 				cur_line++;
170 				continue;
171 			} else {
172 				po_uninput(c);
173 				och.len = 1;
174 				och.eof = 0;
175 				och.buf[0] = '\\';
176 				return (&och);
177 			}
178 			/* NOTREACHED */
179 		}
180 		if (c == '\n') {
181 			cur_line++;
182 			och.len = 1;
183 			och.eof = 0;
184 			och.buf[0] = '\n';
185 			return (&och);
186 		}
187 		if (isascii((unsigned char)c)) {
188 			/* single byte ascii */
189 			och.len = 1;
190 			och.eof = 0;
191 			och.buf[0] = (unsigned char)c;
192 			return (&och);
193 		}
194 
195 		och.len = get_mb(&och.buf[0], (unsigned char)c);
196 		och.eof = 0;
197 		return (&och);
198 	}
199 	/* NOTREACHED */
200 }
201 
/*
 * extend_buf() enlarges the buffer *buf by add bytes.
 * *size is the current size on entry and the new size on return;
 * *buf is replaced with the pointer returned by Xrealloc().
 */
static void
extend_buf(char **buf, size_t *size, size_t add)
{
	size_t	new_size = *size + add;

	*size = new_size;
	*buf = (char *)Xrealloc(*buf, new_size);
}
211 
212 static struct ch	*
213 expand_es(void)
214 {
215 	int	c, n, loop;
216 	static struct ch	och;
217 	struct ch	*pch;
218 
219 	pch = po_getc();
220 	if (pch->eof) {
221 		error(gettext(ERR_UNEXP_EOF),
222 			cur_line, cur_po);
223 		/* NOTREACHED */
224 	}
225 	if (pch->len > 1) {
226 		/* not a valid escape sequence */
227 		return (pch);
228 	}
229 
230 	och.len = 1;
231 	och.eof = 0;
232 	switch (pch->buf[0]) {
233 	case '"':
234 	case '\\':
235 		och.buf[0] = pch->buf[0];
236 		break;
237 	case 'b':
238 		och.buf[0] = '\b';
239 		break;
240 	case 'f':
241 		och.buf[0] = '\f';
242 		break;
243 	case 'n':
244 		och.buf[0] = '\n';
245 		break;
246 	case 'r':
247 		och.buf[0] = '\r';
248 		break;
249 	case 't':
250 		och.buf[0] = '\t';
251 		break;
252 	case 'v':
253 		och.buf[0] = '\v';
254 		break;
255 	case 'a':
256 		och.buf[0] = '\a';
257 		break;
258 	case '0':
259 	case '1':
260 	case '2':
261 	case '3':
262 	case '4':
263 	case '5':
264 	case '6':
265 	case '7':
266 		/* octal */
267 		c = pch->buf[0];
268 		for (n = 0, loop = 0; ; ) {
269 			n = n * 8 + c - '0';
270 			loop++;
271 			if (loop >= 3)
272 				break;
273 			pch = po_getc();
274 			if (pch->eof) {
275 				error(gettext(ERR_UNEXP_EOF),
276 					cur_line, cur_po);
277 				/* NOTREACHED */
278 			}
279 			if ((pch->len > 1) || (pch->buf[0] < '0') ||
280 				(pch->buf[0] > '7'))
281 				break;
282 			c = pch->buf[0];
283 		}
284 		po_ungetc(pch);
285 		och.buf[0] = (unsigned char)n;
286 		break;
287 	case 'x':
288 		/* hex */
289 		pch = po_getc();
290 		if (pch->eof) {
291 			error(gettext(ERR_UNEXP_EOF),
292 				cur_line, cur_po);
293 			/* NOTREACHED */
294 		}
295 		if (pch->len > 1) {
296 			po_ungetc(pch);
297 			och.buf[0] = 'x';
298 			break;
299 		}
300 		c = pch->buf[0];
301 		if (!isxdigit((unsigned char)c)) {
302 			po_ungetc(pch);
303 			och.buf[0] = 'x';
304 			break;
305 		}
306 		if (isdigit((unsigned char)c)) {
307 			n = c - '0';
308 		} else if (isupper((unsigned char)c)) {
309 			n = c - 'A' + 10;
310 		} else {
311 			n = c - 'a' + 10;
312 		}
313 
314 		pch = po_getc();
315 		if (pch->eof) {
316 			error(gettext(ERR_UNEXP_EOF),
317 				cur_line, cur_po);
318 			/* NOTREACHED */
319 		}
320 		if (pch->len > 1) {
321 			po_ungetc(pch);
322 			och.buf[0] = (unsigned char)n;
323 			break;
324 		}
325 		c = pch->buf[0];
326 		if (!isxdigit((unsigned char)c)) {
327 			po_ungetc(pch);
328 			och.buf[0] = (unsigned char)n;
329 			break;
330 		}
331 		n *= 16;
332 		if (isdigit((unsigned char)c)) {
333 			n += c - '0';
334 		} else if (isupper((unsigned char)c)) {
335 			n += c - 'A' + 10;
336 		} else {
337 			n += c - 'a' + 10;
338 		}
339 		och.buf[0] = (unsigned char)n;
340 		break;
341 
342 	default:
343 		och.buf[0] = pch->buf[0];
344 		break;
345 	}
346 	return (&och);
347 }
348 
349 int
350 yylex(void)
351 {
352 	unsigned int	uc;
353 	struct ch	*pch;
354 	char	*buf;
355 	size_t	buf_size, buf_pos;
356 
357 	for (; ; ) {
358 		pch = po_getc();
359 
360 		if (pch->eof) {
361 			/* EOF */
362 			return (0);
363 		}
364 
365 		if (pch->len > 1) {
366 			/* multi byte */
367 			yylval.c.len = pch->len;
368 			(void) memcpy(yylval.c.buf, pch->buf, pch->len);
369 			return (CHR);
370 		}
371 		/* single byte */
372 		switch (pch->buf[0]) {
373 		case ' ':
374 		case '\t':
375 		case '\n':
376 			break;
377 
378 		case '#':
379 			/* comment start */
380 			buf_size = CBUFSIZE;
381 			buf = (char *)Xmalloc(buf_size);
382 			buf_pos = 0;
383 			pch = po_getc();
384 			while (!pch->eof &&
385 				((pch->len != 1) || (pch->buf[0] != '\n'))) {
386 				if (buf_pos + pch->len + 1 > buf_size)
387 					extend_buf(&buf, &buf_size, CBUFSIZE);
388 				(void) memcpy(buf + buf_pos,
389 					pch->buf, pch->len);
390 				buf_pos += pch->len;
391 				pch = po_getc();
392 			}
393 			buf[buf_pos] = '\0';
394 			yylval.str = buf;
395 			return (COMMENT);
396 			/* NOTREACHED */
397 
398 		case '[':
399 		case ']':
400 			return (pch->buf[0]);
401 			/* NOTREACHED */
402 
403 		case '"':
404 			buf_size = MBUFSIZE;
405 			buf = (char *)Xmalloc(buf_size);
406 			buf_pos = 0;
407 			for (; ; ) {
408 				pch = po_getc();
409 
410 				if (pch->eof) {
411 					/* EOF */
412 					error(gettext(ERR_UNEXP_EOF),
413 						cur_line, cur_po);
414 					/* NOTREACHED */
415 				}
416 
417 				if (pch->len == 1) {
418 					uc = pch->buf[0];
419 
420 					if (uc == '\n') {
421 						error(gettext(ERR_UNEXP_EOL),
422 							cur_line, cur_po);
423 						/* NOTREACHED */
424 					}
425 					if (uc == '"')
426 						break;
427 					if (uc == '\\')
428 						pch = expand_es();
429 				}
430 				if (buf_pos + pch->len + 1 > buf_size)
431 					extend_buf(&buf, &buf_size,
432 						MBUFSIZE);
433 				(void) memcpy(buf + buf_pos,
434 					pch->buf, pch->len);
435 				buf_pos += pch->len;
436 			}
437 
438 			buf[buf_pos] = '\0';
439 			yylval.str = buf;
440 			return (STR);
441 			/* NOTREACHED */
442 
443 		default:
444 			uc = pch->buf[0];
445 
446 			if (isalpha(uc) || (uc == '_')) {
447 				buf_size = KBUFSIZE;
448 				buf = (char *)Xmalloc(buf_size);
449 				buf_pos = 0;
450 				buf[buf_pos++] = (char)uc;
451 				pch = po_getc();
452 				while (!pch->eof &&
453 					(pch->len == 1) &&
454 					(isalpha(uc = pch->buf[0]) ||
455 					isdigit(uc) || (uc == '_'))) {
456 					if (buf_pos + 1 + 1 > buf_size)
457 						extend_buf(&buf, &buf_size,
458 							KBUFSIZE);
459 					buf[buf_pos++] = (char)uc;
460 					pch = po_getc();
461 				}
462 				/* push back the last char */
463 				po_ungetc(pch);
464 				buf[buf_pos] = '\0';
465 				yylval.str = buf;
466 				if (buf_pos > MAX_KW_LEN) {
467 					/* kbuf is longer than any keywords */
468 					return (SYMBOL);
469 				}
470 				yylval.num = cur_line;
471 				if (strcmp(buf, KW_DOMAIN) == 0) {
472 					free(buf);
473 					return (DOMAIN);
474 				} else if (strcmp(buf, KW_MSGID) == 0) {
475 					free(buf);
476 					return (MSGID);
477 				} else if (strcmp(buf, KW_MSGID_PLURAL) == 0) {
478 					free(buf);
479 					return (MSGID_PLURAL);
480 				} else if (strcmp(buf, KW_MSGSTR) == 0) {
481 					free(buf);
482 					return (MSGSTR);
483 				} else {
484 					free(buf);
485 					return (SYMBOL);
486 				}
487 				/* NOTREACHED */
488 			}
489 			if (isdigit(uc)) {
490 				buf_size = NBUFSIZE;
491 				buf = (char *)Xmalloc(buf_size);
492 				buf_pos = 0;
493 				buf[buf_pos++] = (char)uc;
494 				pch = po_getc();
495 				while (!pch->eof &&
496 					(pch->len == 1) &&
497 					isdigit(uc = pch->buf[0])) {
498 					if (buf_pos + 1 + 1 > buf_size)
499 						extend_buf(&buf, &buf_size,
500 							NBUFSIZE);
501 					buf[buf_pos++] = (char)uc;
502 					pch = po_getc();
503 				}
504 				/* push back the last char */
505 				po_ungetc(pch);
506 				buf[buf_pos] = '\0';
507 				yylval.num = atoi(buf);
508 				free(buf);
509 				return (NUM);
510 			}
511 			/* just a char */
512 			yylval.c.len = 1;
513 			yylval.c.buf[0] = uc;
514 			return (CHR);
515 			/* NOTREACHED */
516 		}
517 	}
518 }
519