xref: /illumos-gate/usr/src/cmd/mandoc/mandoc.c (revision 2f8bbd9dee64b0f32e2f0e385b450b0d7dca7e32)
1 /*	$Id: mandoc.c,v 1.103 2017/07/03 13:40:19 schwarze Exp $ */
2 /*
3  * Copyright (c) 2008-2011, 2014 Kristaps Dzonsons <kristaps@bsd.lv>
4  * Copyright (c) 2011-2015, 2017 Ingo Schwarze <schwarze@openbsd.org>
5  *
6  * Permission to use, copy, modify, and distribute this software for any
7  * purpose with or without fee is hereby granted, provided that the above
8  * copyright notice and this permission notice appear in all copies.
9  *
10  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
11  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
13  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17  */
18 #include "config.h"
19 
20 #include <sys/types.h>
21 
22 #include <assert.h>
23 #include <ctype.h>
24 #include <errno.h>
25 #include <limits.h>
26 #include <stdlib.h>
27 #include <stdio.h>
28 #include <string.h>
29 #include <time.h>
30 
31 #include "mandoc_aux.h"
32 #include "mandoc.h"
33 #include "roff.h"
34 #include "libmandoc.h"
35 
/* File-local date helpers; both are defined further down in this file. */
36 static	int	 a2time(time_t *, const char *, const char *);
37 static	char	*time2a(time_t);
38 
39 
/*
 * Parse one roff escape sequence.
 *
 * On entry, *end must point at the character following the backslash.
 * The sequence is classified by that character.  On success, *end has
 * been advanced across the sequence and the argument substring is
 * reported through *start (first byte) and *sz (length in bytes).
 * Either of start and sz may be NULL if the caller does not need
 * the value; local storage is substituted in that case.
 *
 * Returns the escape class, or ESCAPE_ERROR if the sequence is
 * incomplete or malformed (a required delimiter is missing or the
 * input ends before the sequence does).  Escape sequences nested
 * inside a delimited argument are skipped by recursive calls.
 */
40 enum mandoc_esc
41 mandoc_escape(const char **end, const char **start, int *sz)
42 {
43 	const char	*local_start;
44 	int		 local_sz;
45 	char		 term;
46 	enum mandoc_esc	 gly;
47 
48 	/*
49 	 * When the caller doesn't provide return storage,
50 	 * use local storage.
51 	 */
52 
53 	if (NULL == start)
54 		start = &local_start;
55 	if (NULL == sz)
56 		sz = &local_sz;
57 
58 	/*
59 	 * Beyond the backslash, at least one input character
60 	 * is part of the escape sequence.  With one exception
61 	 * (see below), that character won't be returned.
62 	 */
63 
	/*
	 * ESCAPE_ERROR doubles as "class not decided yet" until the
	 * switch below picks one; term stays NUL for escapes whose
	 * argument has a fixed length (*sz) rather than a delimiter.
	 */
64 	gly = ESCAPE_ERROR;
65 	*start = ++*end;
66 	*sz = 0;
67 	term = '\0';
68 
	/* (*start)[-1] is the character directly after the backslash. */
69 	switch ((*start)[-1]) {
70 	/*
71 	 * First the glyphs.  There are several different forms of
72 	 * these, but each eventually returns a substring of the glyph
73 	 * name.
74 	 */
75 	case '(':
76 		gly = ESCAPE_SPECIAL;
77 		*sz = 2;
78 		break;
79 	case '[':
80 		gly = ESCAPE_SPECIAL;
81 		term = ']';
82 		break;
83 	case 'C':
84 		if ('\'' != **start)
85 			return ESCAPE_ERROR;
86 		*start = ++*end;
87 		gly = ESCAPE_SPECIAL;
88 		term = '\'';
89 		break;
90 
91 	/*
92 	 * Escapes taking no arguments at all.
93 	 */
94 	case 'd':
95 	case 'u':
96 	case ',':
97 	case '/':
98 		return ESCAPE_IGNORE;
99 	case 'p':
100 		return ESCAPE_BREAK;
101 
102 	/*
103 	 * The \z escape is supposed to output the following
104 	 * character without advancing the cursor position.
105 	 * Since we are mostly dealing with terminal mode,
106 	 * let us just skip the next character.
107 	 */
108 	case 'z':
109 		return ESCAPE_SKIPCHAR;
110 
111 	/*
112 	 * Handle all triggers matching \X(xy, \Xx, and \X[xxxx], where
113 	 * 'X' is the trigger.  These have opaque sub-strings.
114 	 */
115 	case 'F':
116 	case 'g':
117 	case 'k':
118 	case 'M':
119 	case 'm':
120 	case 'n':
121 	case 'V':
122 	case 'Y':
123 		gly = ESCAPE_IGNORE;
124 		/* FALLTHROUGH */
125 	case 'f':
		/* Still undecided means we entered at 'f' itself. */
126 		if (ESCAPE_ERROR == gly)
127 			gly = ESCAPE_FONT;
128 		switch (**start) {
129 		case '(':
130 			*start = ++*end;
131 			*sz = 2;
132 			break;
133 		case '[':
134 			*start = ++*end;
135 			term = ']';
136 			break;
137 		default:
138 			*sz = 1;
139 			break;
140 		}
141 		break;
142 
143 	/*
144 	 * These escapes are of the form \X'Y', where 'X' is the trigger
145 	 * and 'Y' is any string.  These have opaque sub-strings.
146 	 * The \B and \w escapes are handled in roff.c, roff_res().
147 	 */
148 	case 'A':
149 	case 'b':
150 	case 'D':
151 	case 'R':
152 	case 'X':
153 	case 'Z':
154 		gly = ESCAPE_IGNORE;
155 		/* FALLTHROUGH */
156 	case 'o':
157 		if (**start == '\0')
158 			return ESCAPE_ERROR;
159 		if (gly == ESCAPE_ERROR)
160 			gly = ESCAPE_OVERSTRIKE;
		/* Whatever byte opens the argument must also close it. */
161 		term = **start;
162 		*start = ++*end;
163 		break;
164 
165 	/*
166 	 * These escapes are of the form \X'N', where 'X' is the trigger
167 	 * and 'N' resolves to a numerical expression.
168 	 */
169 	case 'h':
170 	case 'H':
171 	case 'L':
172 	case 'l':
173 	case 'S':
174 	case 'v':
175 	case 'x':
		/*
		 * Reject delimiters listed in the string below.
		 * strchr() also matches the terminating NUL, so the
		 * end of input is rejected here too, but without
		 * stepping *end past it.
		 */
176 		if (strchr(" %&()*+-./0123456789:<=>", **start)) {
177 			if ('\0' != **start)
178 				++*end;
179 			return ESCAPE_ERROR;
180 		}
181 		switch ((*start)[-1]) {
182 		case 'h':
183 			gly = ESCAPE_HORIZ;
184 			break;
185 		case 'l':
186 			gly = ESCAPE_HLINE;
187 			break;
188 		default:
189 			gly = ESCAPE_IGNORE;
190 			break;
191 		}
192 		term = **start;
193 		*start = ++*end;
194 		break;
195 
196 	/*
197 	 * Special handling for the numbered character escape.
198 	 * XXX Do any other escapes need similar handling?
199 	 */
200 	case 'N':
201 		if ('\0' == **start)
202 			return ESCAPE_ERROR;
203 		(*end)++;
		/* A digit in place of the opening delimiter: ignore it. */
204 		if (isdigit((unsigned char)**start)) {
205 			*sz = 1;
206 			return ESCAPE_IGNORE;
207 		}
		/* Skip the delimiter, collect digits, skip one more byte. */
208 		(*start)++;
209 		while (isdigit((unsigned char)**end))
210 			(*end)++;
211 		*sz = *end - *start;
212 		if ('\0' != **end)
213 			(*end)++;
214 		return ESCAPE_NUMBERED;
215 
216 	/*
217 	 * Sizes get a special category of their own.
218 	 */
219 	case 's':
220 		gly = ESCAPE_IGNORE;
221 
222 		/* See +/- counts as a sign. */
223 		if ('+' == **end || '-' == **end || ASCII_HYPH == **end)
224 			*start = ++*end;
225 
226 		switch (**end) {
227 		case '(':
228 			*start = ++*end;
229 			*sz = 2;
230 			break;
231 		case '[':
232 			*start = ++*end;
233 			term = ']';
234 			break;
235 		case '\'':
236 			*start = ++*end;
237 			term = '\'';
238 			break;
239 		case '3':
240 		case '2':
241 		case '1':
			/*
			 * Two digits are taken only if no sign was
			 * consumed above (the preceding byte is still
			 * the 's') and a second digit follows.
			 */
242 			*sz = (*end)[-1] == 's' &&
243 			    isdigit((unsigned char)(*end)[1]) ? 2 : 1;
244 			break;
245 		default:
246 			*sz = 1;
247 			break;
248 		}
249 
250 		break;
251 
252 	/*
253 	 * Anything else is assumed to be a glyph.
254 	 * In this case, pass back the character after the backslash.
255 	 */
256 	default:
257 		gly = ESCAPE_SPECIAL;
258 		*start = --*end;
259 		*sz = 1;
260 		break;
261 	}
262 
263 	assert(ESCAPE_ERROR != gly);
264 
265 	/*
266 	 * Read up to the terminating character,
267 	 * paying attention to nested escapes.
268 	 */
269 
270 	if ('\0' != term) {
271 		while (**end != term) {
272 			switch (**end) {
273 			case '\0':
274 				return ESCAPE_ERROR;
275 			case '\\':
276 				(*end)++;
277 				if (ESCAPE_ERROR ==
278 				    mandoc_escape(end, NULL, NULL))
279 					return ESCAPE_ERROR;
280 				break;
281 			default:
282 				(*end)++;
283 				break;
284 			}
285 		}
		/* The length excludes the terminator; *end moves past it. */
286 		*sz = (*end)++ - *start;
287 	} else {
		/* Fixed-length argument: it must fit in the remaining input. */
288 		assert(*sz > 0);
289 		if ((size_t)*sz > strlen(*start))
290 			return ESCAPE_ERROR;
291 		*end += *sz;
292 	}
293 
294 	/* Run post-processors. */
295 
296 	switch (gly) {
297 	case ESCAPE_FONT:
298 		if (2 == *sz) {
299 			if ('C' == **start) {
300 				/*
301 				 * Treat constant-width font modes
302 				 * just like regular font modes.
303 				 */
304 				(*start)++;
305 				(*sz)--;
306 			} else {
307 				if ('B' == (*start)[0] && 'I' == (*start)[1])
308 					gly = ESCAPE_FONTBI;
309 				break;
310 			}
311 		} else if (1 != *sz)
312 			break;
313 
		/* Only one-character font names are mapped from here on. */
314 		switch (**start) {
315 		case '3':
316 		case 'B':
317 			gly = ESCAPE_FONTBOLD;
318 			break;
319 		case '2':
320 		case 'I':
321 			gly = ESCAPE_FONTITALIC;
322 			break;
323 		case 'P':
324 			gly = ESCAPE_FONTPREV;
325 			break;
326 		case '1':
327 		case 'R':
328 			gly = ESCAPE_FONTROMAN;
329 			break;
330 		}
331 		break;
332 	case ESCAPE_SPECIAL:
333 		if (1 == *sz && 'c' == **start)
334 			gly = ESCAPE_NOSPACE;
335 		/*
336 		 * Unicode escapes are defined in groff as \[u0000]
337 		 * to \[u10FFFF], where the contained value must be
338 		 * a valid Unicode codepoint.  Here, however, only
339 		 * check the length and range.
340 		 */
341 		if (**start != 'u' || *sz < 5 || *sz > 7)
342 			break;
343 		if (*sz == 7 && ((*start)[1] != '1' || (*start)[2] != '0'))
344 			break;
345 		if (*sz == 6 && (*start)[1] == '0')
346 			break;
347 		if (*sz == 5 && (*start)[1] == 'D' &&
348 		    strchr("89ABCDEF", (*start)[2]) != NULL)
349 			break;
		/* Everything after the 'u' must be a hexadecimal digit. */
350 		if ((int)strspn(*start + 1, "0123456789ABCDEFabcdef")
351 		    + 1 == *sz)
352 			gly = ESCAPE_UNICODE;
353 		break;
354 	default:
355 		break;
356 	}
357 
358 	return gly;
359 }
360 
361 /*
362  * Parse a quoted or unquoted roff-style request or macro argument.
363  * Return a pointer to the parsed argument, which is either the original
364  * pointer or advanced by one byte in case the argument is quoted.
365  * NUL-terminate the argument in place.
366  * Collapse pairs of quotes inside quoted arguments.
367  * Advance the argument pointer to the next argument,
368  * or to the NUL byte terminating the argument line.
369  */
370 char *
371 mandoc_getarg(struct mparse *parse, char **cpp, int ln, int *pos)
372 {
373 	char	 *start, *cp;
374 	int	  quoted, pairs, white;
375 
376 	/* Quoting can only start with a new word. */
377 	start = *cpp;
378 	quoted = 0;
379 	if ('"' == *start) {
380 		quoted = 1;
381 		start++;
382 	}
383 
384 	pairs = 0;
385 	white = 0;
386 	for (cp = start; '\0' != *cp; cp++) {
387 
388 		/*
389 		 * Move the following text left
390 		 * after quoted quotes and after "\\" and "\t".
391 		 */
392 		if (pairs)
393 			cp[-pairs] = cp[0];
394 
395 		if ('\\' == cp[0]) {
396 			/*
397 			 * In copy mode, translate double to single
398 			 * backslashes and backslash-t to literal tabs.
399 			 */
400 			switch (cp[1]) {
401 			case 't':
402 				cp[0] = '\t';
403 				/* FALLTHROUGH */
404 			case '\\':
405 				pairs++;
406 				cp++;
407 				break;
408 			case ' ':
409 				/* Skip escaped blanks. */
410 				if (0 == quoted)
411 					cp++;
412 				break;
413 			default:
414 				break;
415 			}
416 		} else if (0 == quoted) {
417 			if (' ' == cp[0]) {
418 				/* Unescaped blanks end unquoted args. */
419 				white = 1;
420 				break;
421 			}
422 		} else if ('"' == cp[0]) {
423 			if ('"' == cp[1]) {
424 				/* Quoted quotes collapse. */
425 				pairs++;
426 				cp++;
427 			} else {
428 				/* Unquoted quotes end quoted args. */
429 				quoted = 2;
430 				break;
431 			}
432 		}
433 	}
434 
435 	/* Quoted argument without a closing quote. */
436 	if (1 == quoted)
437 		mandoc_msg(MANDOCERR_ARG_QUOTE, parse, ln, *pos, NULL);
438 
439 	/* NUL-terminate this argument and move to the next one. */
440 	if (pairs)
441 		cp[-pairs] = '\0';
442 	if ('\0' != *cp) {
443 		*cp++ = '\0';
444 		while (' ' == *cp)
445 			cp++;
446 	}
447 	*pos += (int)(cp - start) + (quoted ? 1 : 0);
448 	*cpp = cp;
449 
450 	if ('\0' == *cp && (white || ' ' == cp[-1]))
451 		mandoc_msg(MANDOCERR_SPACE_EOL, parse, ln, *pos, NULL);
452 
453 	return start;
454 }
455 
/*
 * Try to parse the complete string p according to the strptime(3)
 * format fmt.  On success, store the mktime(3) conversion of the
 * parsed fields in *t and return 1.  Return 0 and leave *t alone
 * if the string does not match the format in its entirety, or if
 * the build has no strptime(3) (HAVE_STRPTIME is false).
 */
static int
a2time(time_t *t, const char *fmt, const char *p)
{
	struct tm	 fields;
	char		*rest = NULL;

	/* Start from all-zero fields; the format need not set them all. */
	memset(&fields, 0, sizeof(fields));

#if HAVE_STRPTIME
	rest = strptime(p, fmt, &fields);
#endif

	/* Reject a failed parse as well as trailing garbage. */
	if (rest == NULL || *rest != '\0')
		return 0;

	*t = mktime(&fields);
	return 1;
}
475 
/*
 * Render the time t, interpreted by localtime(3), in the form
 * "Month D, YYYY" with the day printed without padding.
 * Return a string allocated with mandoc_malloc(), or NULL if the
 * time cannot be broken down or any of the pieces does not fit.
 */
static char *
time2a(time_t t)
{
	const struct tm	*lt;
	char		*out, *cur;
	size_t		 mlen;
	int		 dlen;

	if ((lt = localtime(&t)) == NULL)
		return NULL;

	/*
	 * Room for:
	 * the longest month name (September) plus a blank: 10
	 * two day digits, a comma and a blank: 4
	 * four year digits: 4
	 * the terminating '\0': 1
	 */

	out = mandoc_malloc(10 + 4 + 4 + 1);
	cur = out;

	mlen = strftime(cur, 10 + 1, "%B ", lt);
	if (mlen != 0) {
		cur += (int)mlen;

		/*
		 * The day is printed with plain "%d", neither "%2d"
		 * nor "%02d", which is why a single strftime(3) call
		 * with "%B %e, %Y" or "%B %d, %Y" would not do.
		 * Bounding each piece separately also limits the
		 * damage should LC_TIME ever come into play.
		 */

		dlen = snprintf(cur, 4 + 1, "%d, ", lt->tm_mday);
		if (dlen != -1) {
			cur += dlen;
			if (strftime(cur, 4 + 1, "%Y", lt) != 0)
				return out;
		}
	}

	/* Some piece failed to format: release the partial result. */
	free(out);
	return NULL;
}
522 
523 char *
524 mandoc_normdate(struct roff_man *man, char *in, int ln, int pos)
525 {
526 	char		*cp;
527 	time_t		 t;
528 
529 	/* No date specified: use today's date. */
530 
531 	if (in == NULL || *in == '\0' || strcmp(in, "$" "Mdocdate$") == 0) {
532 		mandoc_msg(MANDOCERR_DATE_MISSING, man->parse, ln, pos, NULL);
533 		return time2a(time(NULL));
534 	}
535 
536 	/* Valid mdoc(7) date format. */
537 
538 	if (a2time(&t, "$" "Mdocdate: %b %d %Y $", in) ||
539 	    a2time(&t, "%b %d, %Y", in)) {
540 		cp = time2a(t);
541 		if (t > time(NULL) + 86400)
542 			mandoc_msg(MANDOCERR_DATE_FUTURE, man->parse,
543 			    ln, pos, cp);
544 		return cp;
545 	}
546 
547 	/* In man(7), do not warn about the legacy format. */
548 
549 	if (a2time(&t, "%Y-%m-%d", in) == 0)
550 		mandoc_msg(MANDOCERR_DATE_BAD, man->parse, ln, pos, in);
551 	else if (t > time(NULL) + 86400)
552 		mandoc_msg(MANDOCERR_DATE_FUTURE, man->parse, ln, pos, in);
553 	else if (man->macroset == MACROSET_MDOC)
554 		mandoc_vmsg(MANDOCERR_DATE_LEGACY, man->parse,
555 		    ln, pos, "Dd %s", in);
556 
557 	/* Use any non-mdoc(7) date verbatim. */
558 
559 	return mandoc_strdup(in);
560 }
561 
/*
 * Decide whether the sz bytes starting at p end a sentence.
 *
 * The text is examined from its last byte backwards.  Closing
 * delimiters (double quote, single quote, ']' and ')') may follow
 * the end-of-sentence punctuation ('.', '!' or '?'); if any of them
 * does, the punctuation only counts when the byte in front of it
 * is alphanumeric.
 *
 * Return 1 if the text ends a sentence, 0 otherwise, including
 * for empty input.
 */
int
mandoc_eos(const char *p, size_t sz)
{
	size_t		 i;
	int		 enclosed, found;

	/*
	 * End-of-sentence recognition must include situations where
	 * some symbols, such as `)', allow prior EOS punctuation to
	 * propagate outward.
	 */

	enclosed = found = 0;

	/*
	 * Walk backwards with an index rather than a pointer:
	 * decrementing a pointer below p is undefined behaviour,
	 * and the index is not narrowed to int on the way.
	 */
	for (i = sz; i > 0; i--) {
		switch (p[i - 1]) {
		case '\"':
		case '\'':
		case ']':
		case ')':
			/* Only delimiters after the punctuation matter. */
			if (0 == found)
				enclosed = 1;
			break;
		case '.':
		case '!':
		case '?':
			found = 1;
			break;
		default:
			return found && (!enclosed ||
			    isalnum((unsigned char)p[i - 1]));
		}
	}

	/* Nothing but punctuation and closing delimiters. */
	return found && !enclosed;
}
600 
/*
 * Convert the first sz bytes of p to an int using strtol(3) in the
 * given base.  Return -1 if sz exceeds 31, if the text is empty, or
 * if it is not a number in its entirety.  Values outside the range
 * of int are clamped to INT_MAX or INT_MIN.
 * Note that -1 is also the result of successfully converting "-1".
 */
int
mandoc_strntoi(const char *p, size_t sz, int base)
{
	char		 tmp[32];
	char		*tail;
	long		 val;

	/* Leave room for the terminating NUL byte. */
	if (sz >= sizeof(tmp))
		return -1;

	/* strtol(3) needs a NUL-terminated string; p may not be one. */
	memcpy(tmp, p, sz);
	tmp[sz] = '\0';

	errno = 0;
	val = strtol(tmp, &tail, base);

	/* Empty input or trailing garbage. */
	if (tmp[0] == '\0' || tail[0] != '\0')
		return -1;

	if (val > INT_MAX)
		return INT_MAX;
	if (val < INT_MIN)
		return INT_MIN;
	return (int)val;
}
631