/* xref: /titanic_52/usr/src/cmd/mandoc/mandoc.c (revision 260e9a87725c090ba5835b1f9f0b62fa2f96036f) */
/*	$Id: mandoc.c,v 1.92 2015/02/20 23:55:10 schwarze Exp $ */
/*
 * Copyright (c) 2008-2011, 2014 Kristaps Dzonsons <kristaps@bsd.lv>
 * Copyright (c) 2011-2015 Ingo Schwarze <schwarze@openbsd.org>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */
1895c635efSGarrett D'Amore #include "config.h"
1995c635efSGarrett D'Amore 
2095c635efSGarrett D'Amore #include <sys/types.h>
2195c635efSGarrett D'Amore 
2295c635efSGarrett D'Amore #include <assert.h>
2395c635efSGarrett D'Amore #include <ctype.h>
2495c635efSGarrett D'Amore #include <errno.h>
2595c635efSGarrett D'Amore #include <limits.h>
2695c635efSGarrett D'Amore #include <stdlib.h>
2795c635efSGarrett D'Amore #include <stdio.h>
2895c635efSGarrett D'Amore #include <string.h>
2995c635efSGarrett D'Amore #include <time.h>
3095c635efSGarrett D'Amore 
3195c635efSGarrett D'Amore #include "mandoc.h"
32*260e9a87SYuri Pankov #include "mandoc_aux.h"
3395c635efSGarrett D'Amore #include "libmandoc.h"
3495c635efSGarrett D'Amore 
3595c635efSGarrett D'Amore #define DATESIZE 32
3695c635efSGarrett D'Amore 
3795c635efSGarrett D'Amore static	int	 a2time(time_t *, const char *, const char *);
3895c635efSGarrett D'Amore static	char	*time2a(time_t);
3995c635efSGarrett D'Amore 
4095c635efSGarrett D'Amore 
4195c635efSGarrett D'Amore enum mandoc_esc
4295c635efSGarrett D'Amore mandoc_escape(const char **end, const char **start, int *sz)
4395c635efSGarrett D'Amore {
44698f87a4SGarrett D'Amore 	const char	*local_start;
45698f87a4SGarrett D'Amore 	int		 local_sz;
46698f87a4SGarrett D'Amore 	char		 term;
4795c635efSGarrett D'Amore 	enum mandoc_esc	 gly;
4895c635efSGarrett D'Amore 
49698f87a4SGarrett D'Amore 	/*
50698f87a4SGarrett D'Amore 	 * When the caller doesn't provide return storage,
51698f87a4SGarrett D'Amore 	 * use local storage.
52698f87a4SGarrett D'Amore 	 */
5395c635efSGarrett D'Amore 
54698f87a4SGarrett D'Amore 	if (NULL == start)
55698f87a4SGarrett D'Amore 		start = &local_start;
56698f87a4SGarrett D'Amore 	if (NULL == sz)
57698f87a4SGarrett D'Amore 		sz = &local_sz;
58698f87a4SGarrett D'Amore 
59698f87a4SGarrett D'Amore 	/*
60698f87a4SGarrett D'Amore 	 * Beyond the backslash, at least one input character
61698f87a4SGarrett D'Amore 	 * is part of the escape sequence.  With one exception
62698f87a4SGarrett D'Amore 	 * (see below), that character won't be returned.
63698f87a4SGarrett D'Amore 	 */
64698f87a4SGarrett D'Amore 
65698f87a4SGarrett D'Amore 	gly = ESCAPE_ERROR;
66698f87a4SGarrett D'Amore 	*start = ++*end;
67698f87a4SGarrett D'Amore 	*sz = 0;
68698f87a4SGarrett D'Amore 	term = '\0';
69698f87a4SGarrett D'Amore 
70698f87a4SGarrett D'Amore 	switch ((*start)[-1]) {
7195c635efSGarrett D'Amore 	/*
7295c635efSGarrett D'Amore 	 * First the glyphs.  There are several different forms of
7395c635efSGarrett D'Amore 	 * these, but each eventually returns a substring of the glyph
7495c635efSGarrett D'Amore 	 * name.
7595c635efSGarrett D'Amore 	 */
76*260e9a87SYuri Pankov 	case '(':
7795c635efSGarrett D'Amore 		gly = ESCAPE_SPECIAL;
78698f87a4SGarrett D'Amore 		*sz = 2;
7995c635efSGarrett D'Amore 		break;
80*260e9a87SYuri Pankov 	case '[':
8195c635efSGarrett D'Amore 		gly = ESCAPE_SPECIAL;
8295c635efSGarrett D'Amore 		term = ']';
8395c635efSGarrett D'Amore 		break;
84*260e9a87SYuri Pankov 	case 'C':
85698f87a4SGarrett D'Amore 		if ('\'' != **start)
8695c635efSGarrett D'Amore 			return(ESCAPE_ERROR);
87698f87a4SGarrett D'Amore 		*start = ++*end;
8895c635efSGarrett D'Amore 		gly = ESCAPE_SPECIAL;
8995c635efSGarrett D'Amore 		term = '\'';
9095c635efSGarrett D'Amore 		break;
9195c635efSGarrett D'Amore 
9295c635efSGarrett D'Amore 	/*
93698f87a4SGarrett D'Amore 	 * Escapes taking no arguments at all.
94698f87a4SGarrett D'Amore 	 */
95*260e9a87SYuri Pankov 	case 'd':
96698f87a4SGarrett D'Amore 		/* FALLTHROUGH */
97*260e9a87SYuri Pankov 	case 'u':
98698f87a4SGarrett D'Amore 		return(ESCAPE_IGNORE);
99698f87a4SGarrett D'Amore 
100698f87a4SGarrett D'Amore 	/*
101698f87a4SGarrett D'Amore 	 * The \z escape is supposed to output the following
102698f87a4SGarrett D'Amore 	 * character without advancing the cursor position.
103698f87a4SGarrett D'Amore 	 * Since we are mostly dealing with terminal mode,
104698f87a4SGarrett D'Amore 	 * let us just skip the next character.
105698f87a4SGarrett D'Amore 	 */
106*260e9a87SYuri Pankov 	case 'z':
107698f87a4SGarrett D'Amore 		return(ESCAPE_SKIPCHAR);
108698f87a4SGarrett D'Amore 
109698f87a4SGarrett D'Amore 	/*
11095c635efSGarrett D'Amore 	 * Handle all triggers matching \X(xy, \Xx, and \X[xxxx], where
11195c635efSGarrett D'Amore 	 * 'X' is the trigger.  These have opaque sub-strings.
11295c635efSGarrett D'Amore 	 */
113*260e9a87SYuri Pankov 	case 'F':
11495c635efSGarrett D'Amore 		/* FALLTHROUGH */
115*260e9a87SYuri Pankov 	case 'g':
11695c635efSGarrett D'Amore 		/* FALLTHROUGH */
117*260e9a87SYuri Pankov 	case 'k':
11895c635efSGarrett D'Amore 		/* FALLTHROUGH */
119*260e9a87SYuri Pankov 	case 'M':
12095c635efSGarrett D'Amore 		/* FALLTHROUGH */
121*260e9a87SYuri Pankov 	case 'm':
12295c635efSGarrett D'Amore 		/* FALLTHROUGH */
123*260e9a87SYuri Pankov 	case 'n':
12495c635efSGarrett D'Amore 		/* FALLTHROUGH */
125*260e9a87SYuri Pankov 	case 'V':
12695c635efSGarrett D'Amore 		/* FALLTHROUGH */
127*260e9a87SYuri Pankov 	case 'Y':
12895c635efSGarrett D'Amore 		gly = ESCAPE_IGNORE;
12995c635efSGarrett D'Amore 		/* FALLTHROUGH */
130*260e9a87SYuri Pankov 	case 'f':
13195c635efSGarrett D'Amore 		if (ESCAPE_ERROR == gly)
13295c635efSGarrett D'Amore 			gly = ESCAPE_FONT;
133698f87a4SGarrett D'Amore 		switch (**start) {
134*260e9a87SYuri Pankov 		case '(':
135698f87a4SGarrett D'Amore 			*start = ++*end;
136698f87a4SGarrett D'Amore 			*sz = 2;
13795c635efSGarrett D'Amore 			break;
138*260e9a87SYuri Pankov 		case '[':
139698f87a4SGarrett D'Amore 			*start = ++*end;
14095c635efSGarrett D'Amore 			term = ']';
14195c635efSGarrett D'Amore 			break;
14295c635efSGarrett D'Amore 		default:
143698f87a4SGarrett D'Amore 			*sz = 1;
14495c635efSGarrett D'Amore 			break;
14595c635efSGarrett D'Amore 		}
14695c635efSGarrett D'Amore 		break;
14795c635efSGarrett D'Amore 
14895c635efSGarrett D'Amore 	/*
14995c635efSGarrett D'Amore 	 * These escapes are of the form \X'Y', where 'X' is the trigger
15095c635efSGarrett D'Amore 	 * and 'Y' is any string.  These have opaque sub-strings.
151*260e9a87SYuri Pankov 	 * The \B and \w escapes are handled in roff.c, roff_res().
15295c635efSGarrett D'Amore 	 */
153*260e9a87SYuri Pankov 	case 'A':
15495c635efSGarrett D'Amore 		/* FALLTHROUGH */
155*260e9a87SYuri Pankov 	case 'b':
15695c635efSGarrett D'Amore 		/* FALLTHROUGH */
157*260e9a87SYuri Pankov 	case 'D':
158698f87a4SGarrett D'Amore 		/* FALLTHROUGH */
159*260e9a87SYuri Pankov 	case 'R':
16095c635efSGarrett D'Amore 		/* FALLTHROUGH */
161*260e9a87SYuri Pankov 	case 'X':
16295c635efSGarrett D'Amore 		/* FALLTHROUGH */
163*260e9a87SYuri Pankov 	case 'Z':
16495c635efSGarrett D'Amore 		gly = ESCAPE_IGNORE;
165*260e9a87SYuri Pankov 		/* FALLTHROUGH */
166*260e9a87SYuri Pankov 	case 'o':
167*260e9a87SYuri Pankov 		if (**start == '\0')
168*260e9a87SYuri Pankov 			return(ESCAPE_ERROR);
169*260e9a87SYuri Pankov 		if (gly == ESCAPE_ERROR)
170*260e9a87SYuri Pankov 			gly = ESCAPE_OVERSTRIKE;
171*260e9a87SYuri Pankov 		term = **start;
172698f87a4SGarrett D'Amore 		*start = ++*end;
17395c635efSGarrett D'Amore 		break;
17495c635efSGarrett D'Amore 
17595c635efSGarrett D'Amore 	/*
17695c635efSGarrett D'Amore 	 * These escapes are of the form \X'N', where 'X' is the trigger
17795c635efSGarrett D'Amore 	 * and 'N' resolves to a numerical expression.
17895c635efSGarrett D'Amore 	 */
179*260e9a87SYuri Pankov 	case 'h':
18095c635efSGarrett D'Amore 		/* FALLTHROUGH */
181*260e9a87SYuri Pankov 	case 'H':
18295c635efSGarrett D'Amore 		/* FALLTHROUGH */
183*260e9a87SYuri Pankov 	case 'L':
18495c635efSGarrett D'Amore 		/* FALLTHROUGH */
185*260e9a87SYuri Pankov 	case 'l':
18695c635efSGarrett D'Amore 		/* FALLTHROUGH */
187*260e9a87SYuri Pankov 	case 'S':
18895c635efSGarrett D'Amore 		/* FALLTHROUGH */
189*260e9a87SYuri Pankov 	case 'v':
19095c635efSGarrett D'Amore 		/* FALLTHROUGH */
191*260e9a87SYuri Pankov 	case 'x':
192*260e9a87SYuri Pankov 		if (strchr(" %&()*+-./0123456789:<=>", **start)) {
193*260e9a87SYuri Pankov 			if ('\0' != **start)
194*260e9a87SYuri Pankov 				++*end;
19595c635efSGarrett D'Amore 			return(ESCAPE_ERROR);
196*260e9a87SYuri Pankov 		}
197698f87a4SGarrett D'Amore 		gly = ESCAPE_IGNORE;
198*260e9a87SYuri Pankov 		term = **start;
199698f87a4SGarrett D'Amore 		*start = ++*end;
20095c635efSGarrett D'Amore 		break;
20195c635efSGarrett D'Amore 
20295c635efSGarrett D'Amore 	/*
20395c635efSGarrett D'Amore 	 * Special handling for the numbered character escape.
20495c635efSGarrett D'Amore 	 * XXX Do any other escapes need similar handling?
20595c635efSGarrett D'Amore 	 */
206*260e9a87SYuri Pankov 	case 'N':
207698f87a4SGarrett D'Amore 		if ('\0' == **start)
20895c635efSGarrett D'Amore 			return(ESCAPE_ERROR);
209698f87a4SGarrett D'Amore 		(*end)++;
210698f87a4SGarrett D'Amore 		if (isdigit((unsigned char)**start)) {
211698f87a4SGarrett D'Amore 			*sz = 1;
21295c635efSGarrett D'Amore 			return(ESCAPE_IGNORE);
213698f87a4SGarrett D'Amore 		}
214698f87a4SGarrett D'Amore 		(*start)++;
21595c635efSGarrett D'Amore 		while (isdigit((unsigned char)**end))
21695c635efSGarrett D'Amore 			(*end)++;
217698f87a4SGarrett D'Amore 		*sz = *end - *start;
21895c635efSGarrett D'Amore 		if ('\0' != **end)
21995c635efSGarrett D'Amore 			(*end)++;
22095c635efSGarrett D'Amore 		return(ESCAPE_NUMBERED);
22195c635efSGarrett D'Amore 
22295c635efSGarrett D'Amore 	/*
22395c635efSGarrett D'Amore 	 * Sizes get a special category of their own.
22495c635efSGarrett D'Amore 	 */
225*260e9a87SYuri Pankov 	case 's':
22695c635efSGarrett D'Amore 		gly = ESCAPE_IGNORE;
22795c635efSGarrett D'Amore 
22895c635efSGarrett D'Amore 		/* See +/- counts as a sign. */
229698f87a4SGarrett D'Amore 		if ('+' == **end || '-' == **end || ASCII_HYPH == **end)
230*260e9a87SYuri Pankov 			*start = ++*end;
23195c635efSGarrett D'Amore 
232698f87a4SGarrett D'Amore 		switch (**end) {
233*260e9a87SYuri Pankov 		case '(':
234698f87a4SGarrett D'Amore 			*start = ++*end;
235698f87a4SGarrett D'Amore 			*sz = 2;
23695c635efSGarrett D'Amore 			break;
237*260e9a87SYuri Pankov 		case '[':
238698f87a4SGarrett D'Amore 			*start = ++*end;
239698f87a4SGarrett D'Amore 			term = ']';
24095c635efSGarrett D'Amore 			break;
241*260e9a87SYuri Pankov 		case '\'':
242698f87a4SGarrett D'Amore 			*start = ++*end;
243698f87a4SGarrett D'Amore 			term = '\'';
24495c635efSGarrett D'Amore 			break;
245*260e9a87SYuri Pankov 		case '3':
246*260e9a87SYuri Pankov 			/* FALLTHROUGH */
247*260e9a87SYuri Pankov 		case '2':
248*260e9a87SYuri Pankov 			/* FALLTHROUGH */
249*260e9a87SYuri Pankov 		case '1':
250*260e9a87SYuri Pankov 			*sz = (*end)[-1] == 's' &&
251*260e9a87SYuri Pankov 			    isdigit((unsigned char)(*end)[1]) ? 2 : 1;
252*260e9a87SYuri Pankov 			break;
25395c635efSGarrett D'Amore 		default:
254698f87a4SGarrett D'Amore 			*sz = 1;
25595c635efSGarrett D'Amore 			break;
25695c635efSGarrett D'Amore 		}
25795c635efSGarrett D'Amore 
25895c635efSGarrett D'Amore 		break;
25995c635efSGarrett D'Amore 
26095c635efSGarrett D'Amore 	/*
26195c635efSGarrett D'Amore 	 * Anything else is assumed to be a glyph.
262698f87a4SGarrett D'Amore 	 * In this case, pass back the character after the backslash.
26395c635efSGarrett D'Amore 	 */
26495c635efSGarrett D'Amore 	default:
26595c635efSGarrett D'Amore 		gly = ESCAPE_SPECIAL;
266698f87a4SGarrett D'Amore 		*start = --*end;
267698f87a4SGarrett D'Amore 		*sz = 1;
26895c635efSGarrett D'Amore 		break;
26995c635efSGarrett D'Amore 	}
27095c635efSGarrett D'Amore 
27195c635efSGarrett D'Amore 	assert(ESCAPE_ERROR != gly);
27295c635efSGarrett D'Amore 
27395c635efSGarrett D'Amore 	/*
274698f87a4SGarrett D'Amore 	 * Read up to the terminating character,
275698f87a4SGarrett D'Amore 	 * paying attention to nested escapes.
27695c635efSGarrett D'Amore 	 */
27795c635efSGarrett D'Amore 
27895c635efSGarrett D'Amore 	if ('\0' != term) {
279698f87a4SGarrett D'Amore 		while (**end != term) {
280698f87a4SGarrett D'Amore 			switch (**end) {
281*260e9a87SYuri Pankov 			case '\0':
28295c635efSGarrett D'Amore 				return(ESCAPE_ERROR);
283*260e9a87SYuri Pankov 			case '\\':
28495c635efSGarrett D'Amore 				(*end)++;
285698f87a4SGarrett D'Amore 				if (ESCAPE_ERROR ==
286698f87a4SGarrett D'Amore 				    mandoc_escape(end, NULL, NULL))
28795c635efSGarrett D'Amore 					return(ESCAPE_ERROR);
288698f87a4SGarrett D'Amore 				break;
289698f87a4SGarrett D'Amore 			default:
290698f87a4SGarrett D'Amore 				(*end)++;
291698f87a4SGarrett D'Amore 				break;
292698f87a4SGarrett D'Amore 			}
293698f87a4SGarrett D'Amore 		}
294698f87a4SGarrett D'Amore 		*sz = (*end)++ - *start;
295698f87a4SGarrett D'Amore 	} else {
296698f87a4SGarrett D'Amore 		assert(*sz > 0);
297698f87a4SGarrett D'Amore 		if ((size_t)*sz > strlen(*start))
298698f87a4SGarrett D'Amore 			return(ESCAPE_ERROR);
299698f87a4SGarrett D'Amore 		*end += *sz;
300698f87a4SGarrett D'Amore 	}
30195c635efSGarrett D'Amore 
30295c635efSGarrett D'Amore 	/* Run post-processors. */
30395c635efSGarrett D'Amore 
30495c635efSGarrett D'Amore 	switch (gly) {
305*260e9a87SYuri Pankov 	case ESCAPE_FONT:
306698f87a4SGarrett D'Amore 		if (2 == *sz) {
307698f87a4SGarrett D'Amore 			if ('C' == **start) {
30895c635efSGarrett D'Amore 				/*
309698f87a4SGarrett D'Amore 				 * Treat constant-width font modes
310698f87a4SGarrett D'Amore 				 * just like regular font modes.
31195c635efSGarrett D'Amore 				 */
312698f87a4SGarrett D'Amore 				(*start)++;
313698f87a4SGarrett D'Amore 				(*sz)--;
314698f87a4SGarrett D'Amore 			} else {
315698f87a4SGarrett D'Amore 				if ('B' == (*start)[0] && 'I' == (*start)[1])
316698f87a4SGarrett D'Amore 					gly = ESCAPE_FONTBI;
317698f87a4SGarrett D'Amore 				break;
318698f87a4SGarrett D'Amore 			}
319698f87a4SGarrett D'Amore 		} else if (1 != *sz)
32095c635efSGarrett D'Amore 			break;
32195c635efSGarrett D'Amore 
322698f87a4SGarrett D'Amore 		switch (**start) {
323*260e9a87SYuri Pankov 		case '3':
32495c635efSGarrett D'Amore 			/* FALLTHROUGH */
325*260e9a87SYuri Pankov 		case 'B':
32695c635efSGarrett D'Amore 			gly = ESCAPE_FONTBOLD;
32795c635efSGarrett D'Amore 			break;
328*260e9a87SYuri Pankov 		case '2':
32995c635efSGarrett D'Amore 			/* FALLTHROUGH */
330*260e9a87SYuri Pankov 		case 'I':
33195c635efSGarrett D'Amore 			gly = ESCAPE_FONTITALIC;
33295c635efSGarrett D'Amore 			break;
333*260e9a87SYuri Pankov 		case 'P':
33495c635efSGarrett D'Amore 			gly = ESCAPE_FONTPREV;
33595c635efSGarrett D'Amore 			break;
336*260e9a87SYuri Pankov 		case '1':
33795c635efSGarrett D'Amore 			/* FALLTHROUGH */
338*260e9a87SYuri Pankov 		case 'R':
33995c635efSGarrett D'Amore 			gly = ESCAPE_FONTROMAN;
34095c635efSGarrett D'Amore 			break;
34195c635efSGarrett D'Amore 		}
34295c635efSGarrett D'Amore 		break;
343*260e9a87SYuri Pankov 	case ESCAPE_SPECIAL:
344698f87a4SGarrett D'Amore 		if (1 == *sz && 'c' == **start)
34595c635efSGarrett D'Amore 			gly = ESCAPE_NOSPACE;
346*260e9a87SYuri Pankov 		/*
347*260e9a87SYuri Pankov 		 * Unicode escapes are defined in groff as \[u0000]
348*260e9a87SYuri Pankov 		 * to \[u10FFFF], where the contained value must be
349*260e9a87SYuri Pankov 		 * a valid Unicode codepoint.  Here, however, only
350*260e9a87SYuri Pankov 		 * check the length and range.
351*260e9a87SYuri Pankov 		 */
352*260e9a87SYuri Pankov 		if (**start != 'u' || *sz < 5 || *sz > 7)
353*260e9a87SYuri Pankov 			break;
354*260e9a87SYuri Pankov 		if (*sz == 7 && ((*start)[1] != '1' || (*start)[2] != '0'))
355*260e9a87SYuri Pankov 			break;
356*260e9a87SYuri Pankov 		if (*sz == 6 && (*start)[1] == '0')
357*260e9a87SYuri Pankov 			break;
358*260e9a87SYuri Pankov 		if ((int)strspn(*start + 1, "0123456789ABCDEFabcdef")
359*260e9a87SYuri Pankov 		    + 1 == *sz)
360*260e9a87SYuri Pankov 			gly = ESCAPE_UNICODE;
36195c635efSGarrett D'Amore 		break;
36295c635efSGarrett D'Amore 	default:
36395c635efSGarrett D'Amore 		break;
36495c635efSGarrett D'Amore 	}
36595c635efSGarrett D'Amore 
36695c635efSGarrett D'Amore 	return(gly);
36795c635efSGarrett D'Amore }
36895c635efSGarrett D'Amore 
36995c635efSGarrett D'Amore /*
37095c635efSGarrett D'Amore  * Parse a quoted or unquoted roff-style request or macro argument.
37195c635efSGarrett D'Amore  * Return a pointer to the parsed argument, which is either the original
37295c635efSGarrett D'Amore  * pointer or advanced by one byte in case the argument is quoted.
373698f87a4SGarrett D'Amore  * NUL-terminate the argument in place.
37495c635efSGarrett D'Amore  * Collapse pairs of quotes inside quoted arguments.
37595c635efSGarrett D'Amore  * Advance the argument pointer to the next argument,
376698f87a4SGarrett D'Amore  * or to the NUL byte terminating the argument line.
37795c635efSGarrett D'Amore  */
37895c635efSGarrett D'Amore char *
37995c635efSGarrett D'Amore mandoc_getarg(struct mparse *parse, char **cpp, int ln, int *pos)
38095c635efSGarrett D'Amore {
38195c635efSGarrett D'Amore 	char	 *start, *cp;
38295c635efSGarrett D'Amore 	int	  quoted, pairs, white;
38395c635efSGarrett D'Amore 
38495c635efSGarrett D'Amore 	/* Quoting can only start with a new word. */
38595c635efSGarrett D'Amore 	start = *cpp;
38695c635efSGarrett D'Amore 	quoted = 0;
38795c635efSGarrett D'Amore 	if ('"' == *start) {
38895c635efSGarrett D'Amore 		quoted = 1;
38995c635efSGarrett D'Amore 		start++;
39095c635efSGarrett D'Amore 	}
39195c635efSGarrett D'Amore 
39295c635efSGarrett D'Amore 	pairs = 0;
39395c635efSGarrett D'Amore 	white = 0;
39495c635efSGarrett D'Amore 	for (cp = start; '\0' != *cp; cp++) {
395698f87a4SGarrett D'Amore 
396698f87a4SGarrett D'Amore 		/*
397698f87a4SGarrett D'Amore 		 * Move the following text left
398698f87a4SGarrett D'Amore 		 * after quoted quotes and after "\\" and "\t".
399698f87a4SGarrett D'Amore 		 */
40095c635efSGarrett D'Amore 		if (pairs)
40195c635efSGarrett D'Amore 			cp[-pairs] = cp[0];
402698f87a4SGarrett D'Amore 
40395c635efSGarrett D'Amore 		if ('\\' == cp[0]) {
404698f87a4SGarrett D'Amore 			/*
405698f87a4SGarrett D'Amore 			 * In copy mode, translate double to single
406698f87a4SGarrett D'Amore 			 * backslashes and backslash-t to literal tabs.
407698f87a4SGarrett D'Amore 			 */
408698f87a4SGarrett D'Amore 			switch (cp[1]) {
409*260e9a87SYuri Pankov 			case 't':
410698f87a4SGarrett D'Amore 				cp[0] = '\t';
411698f87a4SGarrett D'Amore 				/* FALLTHROUGH */
412*260e9a87SYuri Pankov 			case '\\':
41395c635efSGarrett D'Amore 				pairs++;
41495c635efSGarrett D'Amore 				cp++;
415698f87a4SGarrett D'Amore 				break;
416*260e9a87SYuri Pankov 			case ' ':
41795c635efSGarrett D'Amore 				/* Skip escaped blanks. */
418698f87a4SGarrett D'Amore 				if (0 == quoted)
41995c635efSGarrett D'Amore 					cp++;
420698f87a4SGarrett D'Amore 				break;
421698f87a4SGarrett D'Amore 			default:
422698f87a4SGarrett D'Amore 				break;
423698f87a4SGarrett D'Amore 			}
42495c635efSGarrett D'Amore 		} else if (0 == quoted) {
42595c635efSGarrett D'Amore 			if (' ' == cp[0]) {
42695c635efSGarrett D'Amore 				/* Unescaped blanks end unquoted args. */
42795c635efSGarrett D'Amore 				white = 1;
42895c635efSGarrett D'Amore 				break;
42995c635efSGarrett D'Amore 			}
43095c635efSGarrett D'Amore 		} else if ('"' == cp[0]) {
43195c635efSGarrett D'Amore 			if ('"' == cp[1]) {
43295c635efSGarrett D'Amore 				/* Quoted quotes collapse. */
43395c635efSGarrett D'Amore 				pairs++;
43495c635efSGarrett D'Amore 				cp++;
43595c635efSGarrett D'Amore 			} else {
43695c635efSGarrett D'Amore 				/* Unquoted quotes end quoted args. */
43795c635efSGarrett D'Amore 				quoted = 2;
43895c635efSGarrett D'Amore 				break;
43995c635efSGarrett D'Amore 			}
44095c635efSGarrett D'Amore 		}
44195c635efSGarrett D'Amore 	}
44295c635efSGarrett D'Amore 
44395c635efSGarrett D'Amore 	/* Quoted argument without a closing quote. */
44495c635efSGarrett D'Amore 	if (1 == quoted)
445*260e9a87SYuri Pankov 		mandoc_msg(MANDOCERR_ARG_QUOTE, parse, ln, *pos, NULL);
44695c635efSGarrett D'Amore 
447698f87a4SGarrett D'Amore 	/* NUL-terminate this argument and move to the next one. */
44895c635efSGarrett D'Amore 	if (pairs)
44995c635efSGarrett D'Amore 		cp[-pairs] = '\0';
45095c635efSGarrett D'Amore 	if ('\0' != *cp) {
45195c635efSGarrett D'Amore 		*cp++ = '\0';
45295c635efSGarrett D'Amore 		while (' ' == *cp)
45395c635efSGarrett D'Amore 			cp++;
45495c635efSGarrett D'Amore 	}
45595c635efSGarrett D'Amore 	*pos += (int)(cp - start) + (quoted ? 1 : 0);
45695c635efSGarrett D'Amore 	*cpp = cp;
45795c635efSGarrett D'Amore 
45895c635efSGarrett D'Amore 	if ('\0' == *cp && (white || ' ' == cp[-1]))
459*260e9a87SYuri Pankov 		mandoc_msg(MANDOCERR_SPACE_EOL, parse, ln, *pos, NULL);
46095c635efSGarrett D'Amore 
46195c635efSGarrett D'Amore 	return(start);
46295c635efSGarrett D'Amore }
46395c635efSGarrett D'Amore 
/*
 * Parse the string p according to the strptime(3) format fmt.
 * On success, store the mktime(3) result in *t and return 1.
 * Return 0, leaving *t untouched, if the string does not match the
 * format, if trailing characters remain after the match, or if the
 * build configuration does not set HAVE_STRPTIME.
 */
static int
a2time(time_t *t, const char *fmt, const char *p)
{
	struct tm	 tm;
	char		*pp;

	/* Fields not covered by the format must not be left undefined. */
	memset(&tm, 0, sizeof(tm));

	pp = NULL;
#if HAVE_STRPTIME
	pp = strptime(p, fmt, &tm);
#endif
	/* Only accept a match that consumed the complete input. */
	if (NULL != pp && '\0' == *pp) {
		*t = mktime(&tm);
		return(1);
	}

	return(0);
}

/*
 * Format the time t, broken down with localtime(3), as
 * "Month day, year", for example "February 20, 2015".
 * Return a buffer obtained from mandoc_malloc(), or NULL if the time
 * cannot be broken down or if one of the components does not fit
 * into the space reserved for it.
 */
static char *
time2a(time_t t)
{
	struct tm	*tm;
	char		*buf, *p;
	size_t		 ssz;
	int		 isz;

	tm = localtime(&t);
	if (tm == NULL)
		return(NULL);

	/*
	 * Reserve space:
	 * up to 9 characters for the month (September) + blank
	 * up to 2 characters for the day + comma + blank
	 * 4 characters for the year and a terminating '\0'
	 */
	p = buf = mandoc_malloc(10 + 4 + 4 + 1);

	if (0 == (ssz = strftime(p, 10 + 1, "%B ", tm)))
		goto fail;
	p += ssz;

	/*
	 * snprintf(3) reports errors as a negative value and
	 * truncation as a value not smaller than the size passed in;
	 * neither must be used to advance the write position.
	 */
	isz = snprintf(p, 4 + 1, "%d, ", tm->tm_mday);
	if (isz < 0 || isz > 4)
		goto fail;
	p += isz;

	if (0 == strftime(p, 4 + 1, "%Y", tm))
		goto fail;
	return(buf);

fail:
	free(buf);
	return(NULL);
}

52195c635efSGarrett D'Amore char *
52295c635efSGarrett D'Amore mandoc_normdate(struct mparse *parse, char *in, int ln, int pos)
52395c635efSGarrett D'Amore {
52495c635efSGarrett D'Amore 	char		*out;
52595c635efSGarrett D'Amore 	time_t		 t;
52695c635efSGarrett D'Amore 
52795c635efSGarrett D'Amore 	if (NULL == in || '\0' == *in ||
52895c635efSGarrett D'Amore 	    0 == strcmp(in, "$" "Mdocdate$")) {
529*260e9a87SYuri Pankov 		mandoc_msg(MANDOCERR_DATE_MISSING, parse, ln, pos, NULL);
53095c635efSGarrett D'Amore 		time(&t);
53195c635efSGarrett D'Amore 	}
53295c635efSGarrett D'Amore 	else if (a2time(&t, "%Y-%m-%d", in))
53395c635efSGarrett D'Amore 		t = 0;
53495c635efSGarrett D'Amore 	else if (!a2time(&t, "$" "Mdocdate: %b %d %Y $", in) &&
53595c635efSGarrett D'Amore 	    !a2time(&t, "%b %d, %Y", in)) {
536*260e9a87SYuri Pankov 		mandoc_msg(MANDOCERR_DATE_BAD, parse, ln, pos, in);
53795c635efSGarrett D'Amore 		t = 0;
53895c635efSGarrett D'Amore 	}
53995c635efSGarrett D'Amore 	out = t ? time2a(t) : NULL;
54095c635efSGarrett D'Amore 	return(out ? out : mandoc_strdup(in));
54195c635efSGarrett D'Amore }
54295c635efSGarrett D'Amore 
/*
 * Decide whether the sz bytes starting at p end a sentence.
 * Return 1 if they do and 0 otherwise, in particular if sz is 0.
 *
 * The text is scanned backwards.  Closing delimiters (double quote,
 * single quote, ']' and ')') found after the end-of-sentence
 * punctuation ('.', '!' or '?') are skipped but remembered.  The
 * first other character, or the beginning of the text, decides.
 */
int
mandoc_eos(const char *p, size_t sz)
{
	size_t		 i;
	int		 enclosed, found;

	/*
	 * End-of-sentence recognition must include situations where
	 * some symbols, such as `)', allow prior EOS punctuation to
	 * propagate outward.
	 */

	enclosed = found = 0;

	/*
	 * Count the index down rather than a pointer, such that no
	 * pointer to before the beginning of the text is ever formed.
	 */
	for (i = sz; i > 0; i--) {
		switch (p[i - 1]) {
		case '\"':
			/* FALLTHROUGH */
		case '\'':
			/* FALLTHROUGH */
		case ']':
			/* FALLTHROUGH */
		case ')':
			/* Only delimiters trailing the punctuation count. */
			if (0 == found)
				enclosed = 1;
			break;
		case '.':
			/* FALLTHROUGH */
		case '!':
			/* FALLTHROUGH */
		case '?':
			found = 1;
			break;
		default:
			/*
			 * Punctuation followed by closing delimiters
			 * only ends a sentence if it directly follows
			 * a letter or a digit.
			 */
			return(found && (!enclosed ||
			    isalnum((unsigned char)p[i - 1])));
		}
	}

	/* The text consists of punctuation and delimiters only. */
	return(found && !enclosed);
}

/*
 * Convert the first sz bytes of p to an int with strtol(3), using
 * the given base.  The bytes need not be NUL-terminated.
 * Return -1 if sz exceeds 31, if the input is empty, or if it is not
 * completely consumed by the conversion.
 * Otherwise, return the value, clamped to the range INT_MIN to
 * INT_MAX.  Note that negative input is converted, not rejected, so
 * a result of -1 does not by itself prove that the input was invalid.
 */
int
mandoc_strntoi(const char *p, size_t sz, int base)
{
	char		 buf[32];
	char		*ep;
	long		 v;

	if (sz > 31)
		return(-1);

	/* strtol(3) needs a NUL-terminated string: work on a copy. */
	memcpy(buf, p, sz);
	buf[sz] = '\0';

	v = strtol(buf, &ep, base);

	/* Reject empty input and trailing garbage. */
	if (buf[0] == '\0' || *ep != '\0')
		return(-1);

	/* This also covers strtol(3) saturating at LONG_MAX/LONG_MIN. */
	if (v > INT_MAX)
		v = INT_MAX;
	if (v < INT_MIN)
		v = INT_MIN;

	return((int)v);
}