xref: /illumos-gate/usr/src/cmd/mandoc/html.c (revision 985cc36c07a787e0cb720fcf2fab565aa2a77590)
1 /*	$Id: html.c,v 1.192 2016/01/04 12:45:29 schwarze Exp $ */
2 /*
3  * Copyright (c) 2008-2011, 2014 Kristaps Dzonsons <kristaps@bsd.lv>
4  * Copyright (c) 2011-2015 Ingo Schwarze <schwarze@openbsd.org>
5  *
6  * Permission to use, copy, modify, and distribute this software for any
7  * purpose with or without fee is hereby granted, provided that the above
8  * copyright notice and this permission notice appear in all copies.
9  *
10  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
11  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
13  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17  */
18 #include "config.h"
19 
20 #include <sys/types.h>
21 
22 #include <assert.h>
23 #include <ctype.h>
24 #include <stdarg.h>
25 #include <stdio.h>
26 #include <stdint.h>
27 #include <stdlib.h>
28 #include <string.h>
29 #include <unistd.h>
30 
31 #include "mandoc.h"
32 #include "mandoc_aux.h"
33 #include "out.h"
34 #include "html.h"
35 #include "manconf.h"
36 #include "main.h"
37 
/*
 * Output flags that may be combined in htmldata.flags.
 */
#define	HTML_CLRLINE	 (1 << 0) /* Tag is followed by a newline. */
#define	HTML_NOSTACK	 (1 << 1) /* Tag is never pushed on the tag stack. */
#define	HTML_AUTOCLOSE	 (1 << 2) /* Tag has auto-closure. */

/*
 * Static description of one HTML element: the name written between
 * the angle brackets and a bit mask of the HTML_* flags above.
 */
struct	htmldata {
	const char	 *name;		/* element name as printed */
	int		  flags;	/* bitwise OR of HTML_* flags */
};
45 
46 static	const struct htmldata htmltags[TAG_MAX] = {
47 	{"html",	HTML_CLRLINE}, /* TAG_HTML */
48 	{"head",	HTML_CLRLINE}, /* TAG_HEAD */
49 	{"body",	HTML_CLRLINE}, /* TAG_BODY */
50 	{"meta",	HTML_CLRLINE | HTML_NOSTACK | HTML_AUTOCLOSE}, /* TAG_META */
51 	{"title",	HTML_CLRLINE}, /* TAG_TITLE */
52 	{"div",		HTML_CLRLINE}, /* TAG_DIV */
53 	{"h1",		0}, /* TAG_H1 */
54 	{"h2",		0}, /* TAG_H2 */
55 	{"span",	0}, /* TAG_SPAN */
56 	{"link",	HTML_CLRLINE | HTML_NOSTACK | HTML_AUTOCLOSE}, /* TAG_LINK */
57 	{"br",		HTML_CLRLINE | HTML_NOSTACK | HTML_AUTOCLOSE}, /* TAG_BR */
58 	{"a",		0}, /* TAG_A */
59 	{"table",	HTML_CLRLINE}, /* TAG_TABLE */
60 	{"tbody",	HTML_CLRLINE}, /* TAG_TBODY */
61 	{"col",		HTML_CLRLINE | HTML_NOSTACK | HTML_AUTOCLOSE}, /* TAG_COL */
62 	{"tr",		HTML_CLRLINE}, /* TAG_TR */
63 	{"td",		HTML_CLRLINE}, /* TAG_TD */
64 	{"li",		HTML_CLRLINE}, /* TAG_LI */
65 	{"ul",		HTML_CLRLINE}, /* TAG_UL */
66 	{"ol",		HTML_CLRLINE}, /* TAG_OL */
67 	{"dl",		HTML_CLRLINE}, /* TAG_DL */
68 	{"dt",		HTML_CLRLINE}, /* TAG_DT */
69 	{"dd",		HTML_CLRLINE}, /* TAG_DD */
70 	{"blockquote",	HTML_CLRLINE}, /* TAG_BLOCKQUOTE */
71 	{"pre",		HTML_CLRLINE }, /* TAG_PRE */
72 	{"b",		0 }, /* TAG_B */
73 	{"i",		0 }, /* TAG_I */
74 	{"code",	0 }, /* TAG_CODE */
75 	{"small",	0 }, /* TAG_SMALL */
76 	{"style",	HTML_CLRLINE}, /* TAG_STYLE */
77 	{"math",	HTML_CLRLINE}, /* TAG_MATH */
78 	{"mrow",	0}, /* TAG_MROW */
79 	{"mi",		0}, /* TAG_MI */
80 	{"mo",		0}, /* TAG_MO */
81 	{"msup",	0}, /* TAG_MSUP */
82 	{"msub",	0}, /* TAG_MSUB */
83 	{"msubsup",	0}, /* TAG_MSUBSUP */
84 	{"mfrac",	0}, /* TAG_MFRAC */
85 	{"msqrt",	0}, /* TAG_MSQRT */
86 	{"mfenced",	0}, /* TAG_MFENCED */
87 	{"mtable",	0}, /* TAG_MTABLE */
88 	{"mtr",		0}, /* TAG_MTR */
89 	{"mtd",		0}, /* TAG_MTD */
90 	{"munderover",	0}, /* TAG_MUNDEROVER */
91 	{"munder",	0}, /* TAG_MUNDER*/
92 	{"mover",	0}, /* TAG_MOVER*/
93 };
94 
95 static	const char	*const htmlattrs[ATTR_MAX] = {
96 	"name", /* ATTR_NAME */
97 	"rel", /* ATTR_REL */
98 	"href", /* ATTR_HREF */
99 	"type", /* ATTR_TYPE */
100 	"media", /* ATTR_MEDIA */
101 	"class", /* ATTR_CLASS */
102 	"style", /* ATTR_STYLE */
103 	"id", /* ATTR_ID */
104 	"colspan", /* ATTR_COLSPAN */
105 	"charset", /* ATTR_CHARSET */
106 	"open", /* ATTR_OPEN */
107 	"close", /* ATTR_CLOSE */
108 	"mathvariant", /* ATTR_MATHVARIANT */
109 };
110 
111 static	const char	*const roffscales[SCALE_MAX] = {
112 	"cm", /* SCALE_CM */
113 	"in", /* SCALE_IN */
114 	"pc", /* SCALE_PC */
115 	"pt", /* SCALE_PT */
116 	"em", /* SCALE_EM */
117 	"em", /* SCALE_MM */
118 	"ex", /* SCALE_EN */
119 	"ex", /* SCALE_BU */
120 	"em", /* SCALE_VS */
121 	"ex", /* SCALE_FS */
122 };
123 
/* Buffer helper private to this file. */
static	void	 bufncat(struct html *h, const char *p, size_t sz);

/* Low-level output helpers private to this file. */
static	void	 print_attr(struct html *h, const char *key,
			const char *val);
static	void	 print_ctag(struct html *h, struct tag *tag);
static	int	 print_encode(struct html *h, const char *p,
			int norecurse);
static	int	 print_escape(char c);
static	void	 print_metaf(struct html *h, enum mandoc_esc deco);
130 
131 
132 void *
133 html_alloc(const struct manoutput *outopts)
134 {
135 	struct html	*h;
136 
137 	h = mandoc_calloc(1, sizeof(struct html));
138 
139 	h->tags.head = NULL;
140 	h->style = outopts->style;
141 	h->base_man = outopts->man;
142 	h->base_includes = outopts->includes;
143 	if (outopts->fragment)
144 		h->oflags |= HTML_FRAGMENT;
145 
146 	return h;
147 }
148 
149 void
150 html_free(void *p)
151 {
152 	struct tag	*tag;
153 	struct html	*h;
154 
155 	h = (struct html *)p;
156 
157 	while ((tag = h->tags.head) != NULL) {
158 		h->tags.head = tag->next;
159 		free(tag);
160 	}
161 
162 	free(h);
163 }
164 
165 void
166 print_gen_head(struct html *h)
167 {
168 	struct htmlpair	 tag[4];
169 	struct tag	*t;
170 
171 	tag[0].key = ATTR_CHARSET;
172 	tag[0].val = "utf-8";
173 	print_otag(h, TAG_META, 1, tag);
174 
175 	/*
176 	 * Print a default style-sheet.
177 	 */
178 	t = print_otag(h, TAG_STYLE, 0, NULL);
179 	print_text(h, "table.head, table.foot { width: 100%; }\n"
180 	      "td.head-rtitle, td.foot-os { text-align: right; }\n"
181 	      "td.head-vol { text-align: center; }\n"
182 	      "table.foot td { width: 50%; }\n"
183 	      "table.head td { width: 33%; }\n"
184 	      "div.spacer { margin: 1em 0; }\n");
185 	print_tagq(h, t);
186 
187 	if (h->style) {
188 		tag[0].key = ATTR_REL;
189 		tag[0].val = "stylesheet";
190 		tag[1].key = ATTR_HREF;
191 		tag[1].val = h->style;
192 		tag[2].key = ATTR_TYPE;
193 		tag[2].val = "text/css";
194 		tag[3].key = ATTR_MEDIA;
195 		tag[3].val = "all";
196 		print_otag(h, TAG_LINK, 4, tag);
197 	}
198 }
199 
200 static void
201 print_metaf(struct html *h, enum mandoc_esc deco)
202 {
203 	enum htmlfont	 font;
204 
205 	switch (deco) {
206 	case ESCAPE_FONTPREV:
207 		font = h->metal;
208 		break;
209 	case ESCAPE_FONTITALIC:
210 		font = HTMLFONT_ITALIC;
211 		break;
212 	case ESCAPE_FONTBOLD:
213 		font = HTMLFONT_BOLD;
214 		break;
215 	case ESCAPE_FONTBI:
216 		font = HTMLFONT_BI;
217 		break;
218 	case ESCAPE_FONT:
219 	case ESCAPE_FONTROMAN:
220 		font = HTMLFONT_NONE;
221 		break;
222 	default:
223 		abort();
224 	}
225 
226 	if (h->metaf) {
227 		print_tagq(h, h->metaf);
228 		h->metaf = NULL;
229 	}
230 
231 	h->metal = h->metac;
232 	h->metac = font;
233 
234 	switch (font) {
235 	case HTMLFONT_ITALIC:
236 		h->metaf = print_otag(h, TAG_I, 0, NULL);
237 		break;
238 	case HTMLFONT_BOLD:
239 		h->metaf = print_otag(h, TAG_B, 0, NULL);
240 		break;
241 	case HTMLFONT_BI:
242 		h->metaf = print_otag(h, TAG_B, 0, NULL);
243 		print_otag(h, TAG_I, 0, NULL);
244 		break;
245 	default:
246 		break;
247 	}
248 }
249 
250 int
251 html_strlen(const char *cp)
252 {
253 	size_t		 rsz;
254 	int		 skip, sz;
255 
256 	/*
257 	 * Account for escaped sequences within string length
258 	 * calculations.  This follows the logic in term_strlen() as we
259 	 * must calculate the width of produced strings.
260 	 * Assume that characters are always width of "1".  This is
261 	 * hacky, but it gets the job done for approximation of widths.
262 	 */
263 
264 	sz = 0;
265 	skip = 0;
266 	while (1) {
267 		rsz = strcspn(cp, "\\");
268 		if (rsz) {
269 			cp += rsz;
270 			if (skip) {
271 				skip = 0;
272 				rsz--;
273 			}
274 			sz += rsz;
275 		}
276 		if ('\0' == *cp)
277 			break;
278 		cp++;
279 		switch (mandoc_escape(&cp, NULL, NULL)) {
280 		case ESCAPE_ERROR:
281 			return sz;
282 		case ESCAPE_UNICODE:
283 		case ESCAPE_NUMBERED:
284 		case ESCAPE_SPECIAL:
285 		case ESCAPE_OVERSTRIKE:
286 			if (skip)
287 				skip = 0;
288 			else
289 				sz++;
290 			break;
291 		case ESCAPE_SKIPCHAR:
292 			skip = 1;
293 			break;
294 		default:
295 			break;
296 		}
297 	}
298 	return sz;
299 }
300 
301 static int
302 print_escape(char c)
303 {
304 
305 	switch (c) {
306 	case '<':
307 		printf("&lt;");
308 		break;
309 	case '>':
310 		printf("&gt;");
311 		break;
312 	case '&':
313 		printf("&amp;");
314 		break;
315 	case '"':
316 		printf("&quot;");
317 		break;
318 	case ASCII_NBRSP:
319 		printf("&nbsp;");
320 		break;
321 	case ASCII_HYPH:
322 		putchar('-');
323 		break;
324 	case ASCII_BREAK:
325 		break;
326 	default:
327 		return 0;
328 	}
329 	return 1;
330 }
331 
332 static int
333 print_encode(struct html *h, const char *p, int norecurse)
334 {
335 	size_t		 sz;
336 	int		 c, len, nospace;
337 	const char	*seq;
338 	enum mandoc_esc	 esc;
339 	static const char rejs[9] = { '\\', '<', '>', '&', '"',
340 		ASCII_NBRSP, ASCII_HYPH, ASCII_BREAK, '\0' };
341 
342 	nospace = 0;
343 
344 	while ('\0' != *p) {
345 		if (HTML_SKIPCHAR & h->flags && '\\' != *p) {
346 			h->flags &= ~HTML_SKIPCHAR;
347 			p++;
348 			continue;
349 		}
350 
351 		sz = strcspn(p, rejs);
352 
353 		fwrite(p, 1, sz, stdout);
354 		p += (int)sz;
355 
356 		if ('\0' == *p)
357 			break;
358 
359 		if (print_escape(*p++))
360 			continue;
361 
362 		esc = mandoc_escape(&p, &seq, &len);
363 		if (ESCAPE_ERROR == esc)
364 			break;
365 
366 		switch (esc) {
367 		case ESCAPE_FONT:
368 		case ESCAPE_FONTPREV:
369 		case ESCAPE_FONTBOLD:
370 		case ESCAPE_FONTITALIC:
371 		case ESCAPE_FONTBI:
372 		case ESCAPE_FONTROMAN:
373 			if (0 == norecurse)
374 				print_metaf(h, esc);
375 			continue;
376 		case ESCAPE_SKIPCHAR:
377 			h->flags |= HTML_SKIPCHAR;
378 			continue;
379 		default:
380 			break;
381 		}
382 
383 		if (h->flags & HTML_SKIPCHAR) {
384 			h->flags &= ~HTML_SKIPCHAR;
385 			continue;
386 		}
387 
388 		switch (esc) {
389 		case ESCAPE_UNICODE:
390 			/* Skip past "u" header. */
391 			c = mchars_num2uc(seq + 1, len - 1);
392 			break;
393 		case ESCAPE_NUMBERED:
394 			c = mchars_num2char(seq, len);
395 			if (c < 0)
396 				continue;
397 			break;
398 		case ESCAPE_SPECIAL:
399 			c = mchars_spec2cp(seq, len);
400 			if (c <= 0)
401 				continue;
402 			break;
403 		case ESCAPE_NOSPACE:
404 			if ('\0' == *p)
405 				nospace = 1;
406 			continue;
407 		case ESCAPE_OVERSTRIKE:
408 			if (len == 0)
409 				continue;
410 			c = seq[len - 1];
411 			break;
412 		default:
413 			continue;
414 		}
415 		if ((c < 0x20 && c != 0x09) ||
416 		    (c > 0x7E && c < 0xA0))
417 			c = 0xFFFD;
418 		if (c > 0x7E)
419 			printf("&#%d;", c);
420 		else if ( ! print_escape(c))
421 			putchar(c);
422 	}
423 
424 	return nospace;
425 }
426 
/*
 * Print one attribute as ` key="val"`.  The key is written as is;
 * the value goes through print_encode() with font escapes disabled.
 */
static void
print_attr(struct html *h, const char *key, const char *val)
{
	putchar(' ');
	fputs(key, stdout);
	fputs("=\"", stdout);
	(void)print_encode(h, val, 1);
	putchar('"');
}
434 
435 struct tag *
436 print_otag(struct html *h, enum htmltag tag,
437 		int sz, const struct htmlpair *p)
438 {
439 	int		 i;
440 	struct tag	*t;
441 
442 	/* Push this tags onto the stack of open scopes. */
443 
444 	if ( ! (HTML_NOSTACK & htmltags[tag].flags)) {
445 		t = mandoc_malloc(sizeof(struct tag));
446 		t->tag = tag;
447 		t->next = h->tags.head;
448 		h->tags.head = t;
449 	} else
450 		t = NULL;
451 
452 	if ( ! (HTML_NOSPACE & h->flags))
453 		if ( ! (HTML_CLRLINE & htmltags[tag].flags)) {
454 			/* Manage keeps! */
455 			if ( ! (HTML_KEEP & h->flags)) {
456 				if (HTML_PREKEEP & h->flags)
457 					h->flags |= HTML_KEEP;
458 				putchar(' ');
459 			} else
460 				printf("&#160;");
461 		}
462 
463 	if ( ! (h->flags & HTML_NONOSPACE))
464 		h->flags &= ~HTML_NOSPACE;
465 	else
466 		h->flags |= HTML_NOSPACE;
467 
468 	/* Print out the tag name and attributes. */
469 
470 	printf("<%s", htmltags[tag].name);
471 	for (i = 0; i < sz; i++)
472 		print_attr(h, htmlattrs[p[i].key], p[i].val);
473 
474 	/* Accommodate for "well-formed" singleton escaping. */
475 
476 	if (HTML_AUTOCLOSE & htmltags[tag].flags)
477 		putchar('/');
478 
479 	putchar('>');
480 
481 	h->flags |= HTML_NOSPACE;
482 
483 	if ((HTML_AUTOCLOSE | HTML_CLRLINE) & htmltags[tag].flags)
484 		putchar('\n');
485 
486 	return t;
487 }
488 
489 static void
490 print_ctag(struct html *h, struct tag *tag)
491 {
492 
493 	/*
494 	 * Remember to close out and nullify the current
495 	 * meta-font and table, if applicable.
496 	 */
497 	if (tag == h->metaf)
498 		h->metaf = NULL;
499 	if (tag == h->tblt)
500 		h->tblt = NULL;
501 
502 	printf("</%s>", htmltags[tag->tag].name);
503 	if (HTML_CLRLINE & htmltags[tag->tag].flags) {
504 		h->flags |= HTML_NOSPACE;
505 		putchar('\n');
506 	}
507 
508 	h->tags.head = tag->next;
509 	free(tag);
510 }
511 
/*
 * Print the document type declaration on a line of its own.
 * The output state is not consulted.
 */
void
print_gen_decls(struct html *h)
{
	(void)h;

	fputs("<!DOCTYPE html>\n", stdout);
}
518 
519 void
520 print_text(struct html *h, const char *word)
521 {
522 
523 	if ( ! (HTML_NOSPACE & h->flags)) {
524 		/* Manage keeps! */
525 		if ( ! (HTML_KEEP & h->flags)) {
526 			if (HTML_PREKEEP & h->flags)
527 				h->flags |= HTML_KEEP;
528 			putchar(' ');
529 		} else
530 			printf("&#160;");
531 	}
532 
533 	assert(NULL == h->metaf);
534 	switch (h->metac) {
535 	case HTMLFONT_ITALIC:
536 		h->metaf = print_otag(h, TAG_I, 0, NULL);
537 		break;
538 	case HTMLFONT_BOLD:
539 		h->metaf = print_otag(h, TAG_B, 0, NULL);
540 		break;
541 	case HTMLFONT_BI:
542 		h->metaf = print_otag(h, TAG_B, 0, NULL);
543 		print_otag(h, TAG_I, 0, NULL);
544 		break;
545 	default:
546 		break;
547 	}
548 
549 	assert(word);
550 	if ( ! print_encode(h, word, 0)) {
551 		if ( ! (h->flags & HTML_NONOSPACE))
552 			h->flags &= ~HTML_NOSPACE;
553 		h->flags &= ~HTML_NONEWLINE;
554 	} else
555 		h->flags |= HTML_NOSPACE | HTML_NONEWLINE;
556 
557 	if (h->metaf) {
558 		print_tagq(h, h->metaf);
559 		h->metaf = NULL;
560 	}
561 
562 	h->flags &= ~HTML_IGNDELIM;
563 }
564 
565 void
566 print_tagq(struct html *h, const struct tag *until)
567 {
568 	struct tag	*tag;
569 
570 	while ((tag = h->tags.head) != NULL) {
571 		print_ctag(h, tag);
572 		if (until && tag == until)
573 			return;
574 	}
575 }
576 
577 void
578 print_stagq(struct html *h, const struct tag *suntil)
579 {
580 	struct tag	*tag;
581 
582 	while ((tag = h->tags.head) != NULL) {
583 		if (suntil && tag == suntil)
584 			return;
585 		print_ctag(h, tag);
586 	}
587 }
588 
589 void
590 print_paragraph(struct html *h)
591 {
592 	struct tag	*t;
593 	struct htmlpair	 tag;
594 
595 	PAIR_CLASS_INIT(&tag, "spacer");
596 	t = print_otag(h, TAG_DIV, 1, &tag);
597 	print_tagq(h, t);
598 }
599 
600 
601 void
602 bufinit(struct html *h)
603 {
604 
605 	h->buf[0] = '\0';
606 	h->buflen = 0;
607 }
608 
/*
 * Append one style declaration of the form "key:val;"
 * to the scratch buffer.
 */
void
bufcat_style(struct html *h, const char *key, const char *val)
{
	const char	*parts[4];
	int		 i;

	parts[0] = key;
	parts[1] = ":";
	parts[2] = val;
	parts[3] = ";";

	for (i = 0; i < 4; i++)
		bufcat(h, parts[i]);
}
618 
619 void
620 bufcat(struct html *h, const char *p)
621 {
622 
623 	/*
624 	 * XXX This is broken and not easy to fix.
625 	 * When using the -Oincludes option, buffmt_includes()
626 	 * may pass in strings overrunning BUFSIZ, causing a crash.
627 	 */
628 
629 	h->buflen = strlcat(h->buf, p, BUFSIZ);
630 	assert(h->buflen < BUFSIZ);
631 }
632 
633 void
634 bufcat_fmt(struct html *h, const char *fmt, ...)
635 {
636 	va_list		 ap;
637 
638 	va_start(ap, fmt);
639 	(void)vsnprintf(h->buf + (int)h->buflen,
640 	    BUFSIZ - h->buflen - 1, fmt, ap);
641 	va_end(ap);
642 	h->buflen = strlen(h->buf);
643 }
644 
645 static void
646 bufncat(struct html *h, const char *p, size_t sz)
647 {
648 
649 	assert(h->buflen + sz + 1 < BUFSIZ);
650 	strncat(h->buf, p, sz);
651 	h->buflen += sz;
652 }
653 
654 void
655 buffmt_includes(struct html *h, const char *name)
656 {
657 	const char	*p, *pp;
658 
659 	pp = h->base_includes;
660 
661 	bufinit(h);
662 	while (NULL != (p = strchr(pp, '%'))) {
663 		bufncat(h, pp, (size_t)(p - pp));
664 		switch (*(p + 1)) {
665 		case'I':
666 			bufcat(h, name);
667 			break;
668 		default:
669 			bufncat(h, p, 2);
670 			break;
671 		}
672 		pp = p + 2;
673 	}
674 	if (pp)
675 		bufcat(h, pp);
676 }
677 
678 void
679 buffmt_man(struct html *h, const char *name, const char *sec)
680 {
681 	const char	*p, *pp;
682 
683 	pp = h->base_man;
684 
685 	bufinit(h);
686 	while (NULL != (p = strchr(pp, '%'))) {
687 		bufncat(h, pp, (size_t)(p - pp));
688 		switch (*(p + 1)) {
689 		case 'S':
690 			bufcat(h, sec ? sec : "1");
691 			break;
692 		case 'N':
693 			bufcat_fmt(h, "%s", name);
694 			break;
695 		default:
696 			bufncat(h, p, 2);
697 			break;
698 		}
699 		pp = p + 2;
700 	}
701 	if (pp)
702 		bufcat(h, pp);
703 }
704 
705 void
706 bufcat_su(struct html *h, const char *p, const struct roffsu *su)
707 {
708 	double		 v;
709 
710 	v = su->scale;
711 	if (SCALE_MM == su->unit && 0.0 == (v /= 100.0))
712 		v = 1.0;
713 	else if (SCALE_BU == su->unit)
714 		v /= 24.0;
715 
716 	bufcat_fmt(h, "%s: %.2f%s;", p, v, roffscales[su->unit]);
717 }
718 
/*
 * Append src to the scratch buffer for use as an id attribute
 * value, writing an underscore in place of every space character.
 * Cf. <http://www.w3.org/TR/html5/dom.html#the-id-attribute>.
 */
void
bufcat_id(struct html *h, const char *src)
{
	const char	*cp;

	for (cp = src; *cp != '\0'; cp++) {
		if (*cp == ' ')
			bufncat(h, "_", 1);
		else
			bufncat(h, cp, 1);
	}
}
728