xref: /freebsd/contrib/mandoc/term_ascii.c (revision 06410c1b51637e5e1f392d553b5008948af58014)
1 /* $Id: term_ascii.c,v 1.71 2025/07/16 14:33:08 schwarze Exp $ */
2 /*
3  * Copyright (c) 2014,2015,2017-2020,2025 Ingo Schwarze <schwarze@openbsd.org>
4  * Copyright (c) 2010, 2011 Kristaps Dzonsons <kristaps@bsd.lv>
5  *
6  * Permission to use, copy, modify, and distribute this software for any
7  * purpose with or without fee is hereby granted, provided that the above
8  * copyright notice and this permission notice appear in all copies.
9  *
10  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
11  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
13  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17  */
18 #include "config.h"
19 
#include <sys/types.h>

#include <assert.h>
#if HAVE_WCHAR
#include <langinfo.h>
#include <locale.h>
#endif
#include <limits.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#if HAVE_WCHAR
#include <wchar.h>
#endif
35 
36 #include "mandoc.h"
37 #include "mandoc_aux.h"
38 #include "out.h"
39 #include "term.h"
40 #include "manconf.h"
41 #include "main.h"
42 
43 static	struct termp	 *ascii_init(enum termenc, const struct manoutput *);
44 static	int		  ascii_hspan(const struct termp *,
45 				const struct roffsu *);
46 static	size_t		  ascii_getwidth(const struct termp *, int);
47 static	void		  ascii_advance(struct termp *, size_t);
48 static	void		  ascii_begin(struct termp *);
49 static	void		  ascii_end(struct termp *);
50 static	void		  ascii_endline(struct termp *);
51 static	void		  ascii_letter(struct termp *, int);
52 static	void		  ascii_setwidth(struct termp *, int, int);
53 
54 #if HAVE_WCHAR
55 static	void		  locale_advance(struct termp *, size_t);
56 static	void		  locale_endline(struct termp *);
57 static	void		  locale_letter(struct termp *, int);
58 static	size_t		  locale_getwidth(const struct termp *, int);
59 #endif
60 
61 
62 static struct termp *
63 ascii_init(enum termenc enc, const struct manoutput *outopts)
64 {
65 #if HAVE_WCHAR
66 	char		*v;
67 #endif
68 	struct termp	*p;
69 
70 	p = mandoc_calloc(1, sizeof(*p));
71 	p->tcol = p->tcols = mandoc_calloc(1, sizeof(*p->tcol));
72 	p->maxtcol = 1;
73 
74 	p->line = 1;
75 	p->defindent = 5;
76 	p->fontq = mandoc_reallocarray(NULL,
77 	     (p->fontsz = 8), sizeof(*p->fontq));
78 	p->fontq[0] = p->fontl = TERMFONT_NONE;
79 
80 	p->begin = ascii_begin;
81 	p->end = ascii_end;
82 	p->hspan = ascii_hspan;
83 	p->type = TERMTYPE_CHAR;
84 	p->enc = TERMENC_ASCII;
85 	p->advance = ascii_advance;
86 	p->endline = ascii_endline;
87 	p->letter = ascii_letter;
88 	p->setwidth = ascii_setwidth;
89 	p->getwidth = ascii_getwidth;
90 
91 #if HAVE_WCHAR
92 	if (enc != TERMENC_ASCII) {
93 
94 		/*
95 		 * Do not change any of this to LC_ALL.  It might break
96 		 * the formatting by subtly changing the behaviour of
97 		 * various functions, for example strftime(3).  As a
98 		 * worst case, it might even cause buffer overflows.
99 		 */
100 
101 		v = enc == TERMENC_LOCALE ?
102 		    setlocale(LC_CTYPE, "") :
103 		    setlocale(LC_CTYPE, UTF8_LOCALE);
104 
105 		/*
106 		 * We only support UTF-8,
107 		 * so revert to ASCII for anything else.
108 		 */
109 
110 		if (v != NULL &&
111 		    strcmp(nl_langinfo(CODESET), "UTF-8") != 0)
112 			v = setlocale(LC_CTYPE, "C");
113 
114 		if (v != NULL && MB_CUR_MAX > 1) {
115 			p->enc = TERMENC_UTF8;
116 			p->advance = locale_advance;
117 			p->endline = locale_endline;
118 			p->letter = locale_letter;
119 			p->getwidth = locale_getwidth;
120 		}
121 	}
122 #endif
123 	p->defrmargin = term_len(p, outopts->width ? outopts->width : 78);
124 	p->lastrmargin = p->defrmargin;
125 
126 	if (outopts->indent)
127 		p->defindent = outopts->indent;
128 	if (outopts->synopsisonly)
129 		p->synopsisonly = 1;
130 
131 	assert(p->defindent < UINT16_MAX);
132 	assert(p->defrmargin < UINT16_MAX);
133 	return p;
134 }
135 
136 void *
137 ascii_alloc(const struct manoutput *outopts)
138 {
139 	return ascii_init(TERMENC_ASCII, outopts);
140 }
141 
142 void *
143 utf8_alloc(const struct manoutput *outopts)
144 {
145 	return ascii_init(TERMENC_UTF8, outopts);
146 }
147 
148 void *
149 locale_alloc(const struct manoutput *outopts)
150 {
151 	return ascii_init(TERMENC_LOCALE, outopts);
152 }
153 
154 static void
155 ascii_setwidth(struct termp *p, int iop, int width)
156 {
157 	p->tcol->rmargin = p->defrmargin;
158 	if (iop > 0)
159 		p->defrmargin += width;
160 	else if (iop == 0)
161 		p->defrmargin = width ? (size_t)width : p->lastrmargin;
162 	else if (p->defrmargin > (size_t)width)
163 		p->defrmargin -= width;
164 	else
165 		p->defrmargin = 0;
166 	if (p->defrmargin > term_len(p, 1000))
167 		p->defrmargin = term_len(p, 1000);
168 	p->lastrmargin = p->tcol->rmargin;
169 	p->tcol->rmargin = p->maxrmargin = p->defrmargin;
170 }
171 
172 void
173 terminal_sepline(void *arg)
174 {
175 	struct termp	*p;
176 	size_t		 i;	/* Printed width in basic units. */
177 	size_t		 sz;	/* Width of a dash in basic units. */
178 
179 	p = (struct termp *)arg;
180 	(*p->endline)(p);
181 	sz = (*p->getwidth)(p, '-');
182 	for (i = 0; i < p->defrmargin; i += sz)
183 		(*p->letter)(p, '-');
184 	(*p->endline)(p);
185 	(*p->endline)(p);
186 }
187 
188 static size_t
189 ascii_getwidth(const struct termp *p, int c)
190 {
191 	switch (c) {
192 	case ASCII_BREAK:
193 	case ASCII_NBRZW:
194 	case ASCII_TABREF:
195 		return 0;
196 	default:
197 		return 24;
198 	}
199 }
200 
/*
 * Release a formatter; arg is the formatter passed as an opaque
 * pointer and is handed on to term_free().
 */
void
ascii_free(void *arg)
{
	struct termp	*p;

	p = (struct termp *)arg;
	term_free(p);
}
206 
/*
 * Write the single byte c to standard output.
 * The formatter argument is not used.
 */
static void
ascii_letter(struct termp *p, int c)
{
	putc(c, stdout);
}
212 
213 static void
214 ascii_begin(struct termp *p)
215 {
216 	(*p->headf)(p, p->argf);
217 }
218 
219 static void
220 ascii_end(struct termp *p)
221 {
222 	(*p->footf)(p, p->argf);
223 }
224 
225 static void
226 ascii_endline(struct termp *p)
227 {
228 	p->line++;
229 	if ((int)p->tcol->offset > p->ti)
230 		p->tcol->offset -= p->ti;
231 	else
232 		p->tcol->offset = 0;
233 	p->ti = 0;
234 	p->minbl = 0;
235 	p->viscol = 0;
236 	putchar('\n');
237 }
238 
239 static void
240 ascii_advance(struct termp *p, size_t len)
241 {
242 	size_t		 dst;	/* Destination column in basic units. */
243 	size_t		 sz;	/* Width of a space in basic units. */
244 
245 	sz = (*p->getwidth)(p, ' ');
246 
247 	/*
248 	 * XXX We used to have "assert(len < UINT16_MAX)" here.
249 	 * that is not quite right because the input document
250 	 * can trigger that by merely providing large input.
251 	 * For now, simply truncate.
252 	 */
253 	if (len > 256 * sz)
254 		len = 256 * sz;
255 
256 	dst = p->viscol + len;
257 	while (p->viscol + sz / 2 < dst) {
258 		putchar(' ');
259 		p->viscol += sz;
260 	}
261 }
262 
263 static int
264 ascii_hspan(const struct termp *p, const struct roffsu *su)
265 {
266 	double		 r;
267 
268 	switch (su->unit) {
269 	case SCALE_BU:
270 		r = su->scale;
271 		break;
272 	case SCALE_CM:
273 		r = su->scale * 240.0 / 2.54;
274 		break;
275 	case SCALE_FS:
276 		r = su->scale * 65536.0;
277 		break;
278 	case SCALE_IN:
279 		r = su->scale * 240.0;
280 		break;
281 	case SCALE_MM:
282 		r = su->scale * 0.24;
283 		break;
284 	case SCALE_VS:
285 	case SCALE_PC:
286 		r = su->scale * 40.0;
287 		break;
288 	case SCALE_PT:
289 		r = su->scale * 10.0 / 3.0;
290 		break;
291 	case SCALE_EN:
292 	case SCALE_EM:
293 		r = su->scale * 24.0;
294 		break;
295 	default:
296 		abort();
297 	}
298 	return r > 0.0 ? r + 0.01 : r - 0.01;
299 }
300 
301 const char *
302 ascii_uc2str(int uc)
303 {
304 	static const char nbrsp[2] = { ASCII_NBRSP, '\0' };
305 	static const char *tab[] = {
306 	"<NUL>","<SOH>","<STX>","<ETX>","<EOT>","<ENQ>","<ACK>","<BEL>",
307 	"<BS>",	"\t",	"<LF>",	"<VT>",	"<FF>",	"<CR>",	"<SO>",	"<SI>",
308 	"<DLE>","<DC1>","<DC2>","<DC3>","<DC4>","<NAK>","<SYN>","<ETB>",
309 	"<CAN>","<EM>",	"<SUB>","<ESC>","<FS>",	"<GS>",	"<RS>",	"<US>",
310 	" ",	"!",	"\"",	"#",	"$",	"%",	"&",	"'",
311 	"(",	")",	"*",	"+",	",",	"-",	".",	"/",
312 	"0",	"1",	"2",	"3",	"4",	"5",	"6",	"7",
313 	"8",	"9",	":",	";",	"<",	"=",	">",	"?",
314 	"@",	"A",	"B",	"C",	"D",	"E",	"F",	"G",
315 	"H",	"I",	"J",	"K",	"L",	"M",	"N",	"O",
316 	"P",	"Q",	"R",	"S",	"T",	"U",	"V",	"W",
317 	"X",	"Y",	"Z",	"[",	"\\",	"]",	"^",	"_",
318 	"`",	"a",	"b",	"c",	"d",	"e",	"f",	"g",
319 	"h",	"i",	"j",	"k",	"l",	"m",	"n",	"o",
320 	"p",	"q",	"r",	"s",	"t",	"u",	"v",	"w",
321 	"x",	"y",	"z",	"{",	"|",	"}",	"~",	"<DEL>",
322 	"<80>",	"<81>",	"<82>",	"<83>",	"<84>",	"<85>",	"<86>",	"<87>",
323 	"<88>",	"<89>",	"<8A>",	"<8B>",	"<8C>",	"<8D>",	"<8E>",	"<8F>",
324 	"<90>",	"<91>",	"<92>",	"<93>",	"<94>",	"<95>",	"<96>",	"<97>",
325 	"<98>",	"<99>",	"<9A>",	"<9B>",	"<9C>",	"<9D>",	"<9E>",	"<9F>",
326 	nbrsp,	"!",	"/\bc",	"-\bL",	"o\bx",	"=\bY",	"|",	"<section>",
327 	"\"",	"(C)",	"_\ba",	"<<",	"~",	"",	"(R)",	"-",
328 	"<degree>","+-","^2",	"^3",	"'","<micro>","<paragraph>",".",
329 	",",	"^1",	"_\bo",	">>",	"1/4",	"1/2",	"3/4",	"?",
330 	"`\bA",	"'\bA",	"^\bA",	"~\bA",	"\"\bA","o\bA",	"AE",	",\bC",
331 	"`\bE",	"'\bE",	"^\bE",	"\"\bE","`\bI",	"'\bI",	"^\bI",	"\"\bI",
332 	"Dh",	"~\bN",	"`\bO",	"'\bO",	"^\bO",	"~\bO",	"\"\bO","x",
333 	"/\bO",	"`\bU",	"'\bU",	"^\bU",	"\"\bU","'\bY",	"Th",	"ss",
334 	"`\ba",	"'\ba",	"^\ba",	"~\ba",	"\"\ba","o\ba",	"ae",	",\bc",
335 	"`\be",	"'\be",	"^\be",	"\"\be","`\bi",	"'\bi",	"^\bi",	"\"\bi",
336 	"dh",	"~\bn",	"`\bo",	"'\bo",	"^\bo",	"~\bo",	"\"\bo","/",
337 	"/\bo",	"`\bu",	"'\bu",	"^\bu",	"\"\bu","'\by",	"th",	"\"\by",
338 	"A",	"a",	"A",	"a",	"A",	"a",	"'\bC",	"'\bc",
339 	"^\bC",	"^\bc",	"C",	"c",	"C",	"c",	"D",	"d",
340 	"/\bD",	"/\bd",	"E",	"e",	"E",	"e",	"E",	"e",
341 	"E",	"e",	"E",	"e",	"^\bG",	"^\bg",	"G",	"g",
342 	"G",	"g",	",\bG",	",\bg",	"^\bH",	"^\bh",	"/\bH",	"/\bh",
343 	"~\bI",	"~\bi",	"I",	"i",	"I",	"i",	"I",	"i",
344 	"I",	"i",	"IJ",	"ij",	"^\bJ",	"^\bj",	",\bK",	",\bk",
345 	"q",	"'\bL",	"'\bl",	",\bL",	",\bl",	"L",	"l",	"L",
346 	"l",	"/\bL",	"/\bl",	"'\bN",	"'\bn",	",\bN",	",\bn",	"N",
347 	"n",	"'n",	"Ng",	"ng",	"O",	"o",	"O",	"o",
348 	"O",	"o",	"OE",	"oe",	"'\bR",	"'\br",	",\bR",	",\br",
349 	"R",	"r",	"'\bS",	"'\bs",	"^\bS",	"^\bs",	",\bS",	",\bs",
350 	"S",	"s",	",\bT",	",\bt",	"T",	"t",	"/\bT",	"/\bt",
351 	"~\bU",	"~\bu",	"U",	"u",	"U",	"u",	"U",	"u",
352 	"U",	"u",	"U",	"u",	"^\bW",	"^\bw",	"^\bY",	"^\by",
353 	"\"\bY","'\bZ",	"'\bz",	"Z",	"z",	"Z",	"z",	"s",
354 	"b",	"B",	"B",	"b",	"6",	"6",	"O",	"C",
355 	"c",	"D",	"D",	"D",	"d",	"d",	"3",	"@",
356 	"E",	"F",	",\bf",	"G",	"G",	"hv",	"I",	"/\bI",
357 	"K",	"k",	"/\bl",	"l",	"W",	"N",	"n",	"~\bO",
358 	"O",	"o",	"OI",	"oi",	"P",	"p",	"YR",	"2",
359 	"2",	"SH",	"sh",	"t",	"T",	"t",	"T",	"U",
360 	"u",	"Y",	"V",	"Y",	"y",	"/\bZ",	"/\bz",	"ZH",
361 	"ZH",	"zh",	"zh",	"/\b2",	"5",	"5",	"ts",	"w",
362 	"|",	"||",	"|=",	"!",	"DZ",	"Dz",	"dz",	"LJ",
363 	"Lj",	"lj",	"NJ",	"Nj",	"nj",	"A",	"a",	"I",
364 	"i",	"O",	"o",	"U",	"u",	"U",	"u",	"U",
365 	"u",	"U",	"u",	"U",	"u",	"@",	"A",	"a",
366 	"A",	"a",	"AE",	"ae",	"/\bG",	"/\bg",	"G",	"g",
367 	"K",	"k",	"O",	"o",	"O",	"o",	"ZH",	"zh",
368 	"j",	"DZ",	"Dz",	"dz",	"'\bG",	"'\bg",	"HV",	"W",
369 	"`\bN",	"`\bn",	"A",	"a",	"'\bAE","'\bae","O",	"o"};
370 
371 	assert(uc >= 0);
372 	if ((size_t)uc < sizeof(tab)/sizeof(tab[0]))
373 		return tab[uc];
374 	return mchars_uc2str(uc);
375 }
376 
377 #if HAVE_WCHAR
378 static size_t
379 locale_getwidth(const struct termp *p, int c)
380 {
381 	int		rc;
382 
383 	if (c == ASCII_NBRSP)
384 		c = ' ';
385 	rc = wcwidth(c);
386 	if (rc < 0)
387 		rc = 0;
388 	return rc * 24;
389 }
390 
391 static void
392 locale_advance(struct termp *p, size_t len)
393 {
394 	size_t		 dst;	/* Destination column in basic units. */
395 	size_t		 sz;	/* Width of a space in basic units. */
396 
397 	sz = (*p->getwidth)(p, ' ');
398 
399 	/*
400 	 * XXX We used to have "assert(len < UINT16_MAX)" here.
401 	 * that is not quite right because the input document
402 	 * can trigger that by merely providing large input.
403 	 * For now, simply truncate.
404 	 */
405 	if (len > 256 * sz)
406 		len = 256 * sz;
407 
408 	dst = p->viscol + len;
409 	while (p->viscol + sz / 2 < dst) {
410 		putwchar(L' ');
411 		p->viscol += sz;
412 	}
413 }
414 
415 static void
416 locale_endline(struct termp *p)
417 {
418 	p->line++;
419 	if ((int)p->tcol->offset > p->ti)
420 		p->tcol->offset -= p->ti;
421 	else
422 		p->tcol->offset = 0;
423 	p->ti = 0;
424 	p->minbl = 0;
425 	p->viscol = 0;
426 	putwchar(L'\n');
427 }
428 
/*
 * Write the character c to standard output as a wide character.
 * The formatter argument is not used.
 */
static void
locale_letter(struct termp *p, int c)
{
	putwc(c, stdout);
}
434 #endif
435