1.\" $Id: mandoc.3,v 1.41 2017/07/04 23:40:01 schwarze Exp $ 2.\" 3.\" Copyright (c) 2009, 2010, 2011 Kristaps Dzonsons <kristaps@bsd.lv> 4.\" Copyright (c) 2010-2017 Ingo Schwarze <schwarze@openbsd.org> 5.\" 6.\" Permission to use, copy, modify, and distribute this software for any 7.\" purpose with or without fee is hereby granted, provided that the above 8.\" copyright notice and this permission notice appear in all copies. 9.\" 10.\" THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 11.\" WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 12.\" MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 13.\" ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 14.\" WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 15.\" ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 16.\" OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 17.\" 18.Dd $Mdocdate: July 4 2017 $ 19.Dt MANDOC 3 20.Os 21.Sh NAME 22.Nm mandoc , 23.Nm deroff , 24.Nm mandocmsg , 25.Nm man_mparse , 26.Nm man_validate , 27.Nm mdoc_validate , 28.Nm mparse_alloc , 29.Nm mparse_free , 30.Nm mparse_getkeep , 31.Nm mparse_keep , 32.Nm mparse_open , 33.Nm mparse_readfd , 34.Nm mparse_reset , 35.Nm mparse_result , 36.Nm mparse_strerror , 37.Nm mparse_strlevel , 38.Nm mparse_updaterc 39.Nd mandoc macro compiler library 40.Sh SYNOPSIS 41.In sys/types.h 42.In mandoc.h 43.Pp 44.Fd "#define ASCII_NBRSP" 45.Fd "#define ASCII_HYPH" 46.Fd "#define ASCII_BREAK" 47.Ft struct mparse * 48.Fo mparse_alloc 49.Fa "int options" 50.Fa "enum mandocerr mmin" 51.Fa "mandocmsg mmsg" 52.Fa "enum mandoc_os os_e" 53.Fa "char *os_s" 54.Fc 55.Ft void 56.Fo (*mandocmsg) 57.Fa "enum mandocerr errtype" 58.Fa "enum mandoclevel level" 59.Fa "const char *file" 60.Fa "int line" 61.Fa "int col" 62.Fa "const char *msg" 63.Fc 64.Ft void 65.Fo mparse_free 66.Fa "struct mparse *parse" 67.Fc 68.Ft const char * 69.Fo 
mparse_getkeep 70.Fa "const struct mparse *parse" 71.Fc 72.Ft void 73.Fo mparse_keep 74.Fa "struct mparse *parse" 75.Fc 76.Ft int 77.Fo mparse_open 78.Fa "struct mparse *parse" 79.Fa "const char *fname" 80.Fc 81.Ft "enum mandoclevel" 82.Fo mparse_readfd 83.Fa "struct mparse *parse" 84.Fa "int fd" 85.Fa "const char *fname" 86.Fc 87.Ft void 88.Fo mparse_reset 89.Fa "struct mparse *parse" 90.Fc 91.Ft void 92.Fo mparse_result 93.Fa "struct mparse *parse" 94.Fa "struct roff_man **man" 95.Fa "char **sodest" 96.Fc 97.Ft "const char *" 98.Fo mparse_strerror 99.Fa "enum mandocerr" 100.Fc 101.Ft "const char *" 102.Fo mparse_strlevel 103.Fa "enum mandoclevel" 104.Fc 105.Ft void 106.Fo mparse_updaterc 107.Fa "struct mparse *parse" 108.Fa "enum mandoclevel *rc" 109.Fc 110.In roff.h 111.Ft void 112.Fo deroff 113.Fa "char **dest" 114.Fa "const struct roff_node *node" 115.Fc 116.In sys/types.h 117.In mandoc.h 118.In mdoc.h 119.Vt extern const char * const * mdoc_argnames; 120.Vt extern const char * const * mdoc_macronames; 121.Ft void 122.Fo mdoc_validate 123.Fa "struct roff_man *mdoc" 124.Fc 125.In sys/types.h 126.In mandoc.h 127.In man.h 128.Vt extern const char * const * man_macronames; 129.Ft "const struct mparse *" 130.Fo man_mparse 131.Fa "const struct roff_man *man" 132.Fc 133.Ft void 134.Fo man_validate 135.Fa "struct roff_man *man" 136.Fc 137.Sh DESCRIPTION 138The 139.Nm mandoc 140library parses a 141.Ux 142manual into an abstract syntax tree (AST). 143.Ux 144manuals are composed of 145.Xr mdoc 7 146or 147.Xr man 7 , 148and may be mixed with 149.Xr roff 7 , 150.Xr tbl 7 , 151and 152.Xr eqn 7 153invocations. 
154.Pp 155The following describes a general parse sequence: 156.Bl -enum 157.It 158initiate a parsing sequence with 159.Xr mchars_alloc 3 160and 161.Fn mparse_alloc ; 162.It 163open a file with 164.Xr open 2 165or 166.Fn mparse_open ; 167.It 168parse it with 169.Fn mparse_readfd ; 170.It 171close it with 172.Xr close 2 ; 173.It 174retrieve the syntax tree with 175.Fn mparse_result ; 176.It 177depending on whether the 178.Fa macroset 179member of the returned 180.Vt struct roff_man 181is 182.Dv MACROSET_MDOC 183or 184.Dv MACROSET_MAN , 185validate it with 186.Fn mdoc_validate 187or 188.Fn man_validate , 189respectively; 190.It 191if information about the validity of the input is needed, fetch it with 192.Fn mparse_updaterc ; 193.It 194iterate over parse nodes starting from the 195.Fa first 196member of the returned 197.Vt struct roff_man ; 198.It 199free all allocated memory with 200.Fn mparse_free 201and 202.Xr mchars_free 3 , 203or invoke 204.Fn mparse_reset 205and go back to step 2 to parse new files. 206.El 207.Sh REFERENCE 208This section documents the functions, types, and variables available 209via 210.In mandoc.h , 211with the exception of those documented in 212.Xr mandoc_escape 3 213and 214.Xr mchars_alloc 3 . 215.Ss Types 216.Bl -ohang 217.It Vt "enum mandocerr" 218An error or warning message during parsing. 219.It Vt "enum mandoclevel" 220A classification of an 221.Vt "enum mandocerr" 222as regards system operation. 223See the DIAGNOSTICS section in 224.Xr mandoc 1 225regarding the meanings of the levels. 226.It Vt "struct mparse" 227An opaque pointer to a running parse sequence. 228Created with 229.Fn mparse_alloc 230and freed with 231.Fn mparse_free . 232This may be used across parsed input if 233.Fn mparse_reset 234is called between parses. 235.It Vt "mandocmsg" 236A prototype for a function to handle error and warning 237messages emitted by the parser. 
238.El 239.Ss Functions 240.Bl -ohang 241.It Fn deroff 242Obtain a text-only representation of a 243.Vt struct roff_node , 244including text contained in its child nodes. 245To be used on children of the 246.Fa first 247member of 248.Vt struct roff_man . 249When it is no longer needed, the pointer returned from 250.Fn deroff 251can be passed to 252.Xr free 3 . 253.It Fn man_mparse 254Get the parser used for the current output. 255Declared in 256.In man.h , 257implemented in 258.Pa man.c . 259.It Fn man_validate 260Validate the 261.Dv MACROSET_MAN 262parse tree obtained with 263.Fn mparse_result . 264Declared in 265.In man.h , 266implemented in 267.Pa man.c . 268.It Fn mdoc_validate 269Validate the 270.Dv MACROSET_MDOC 271parse tree obtained with 272.Fn mparse_result . 273Declared in 274.In mdoc.h , 275implemented in 276.Pa mdoc.c . 277.It Fn mparse_alloc 278Allocate a parser. 279The arguments have the following effect: 280.Bl -tag -offset 5n -width inttype 281.It Ar options 282When the 283.Dv MPARSE_MDOC 284or 285.Dv MPARSE_MAN 286bit is set, only that parser is used. 287Otherwise, the document type is automatically detected. 288.Pp 289When the 290.Dv MPARSE_SO 291bit is set, 292.Xr roff 7 293.Ic \&so 294file inclusion requests are always honoured. 295Otherwise, if the request is the only content in an input file, 296only the file name is remembered, to be returned in the 297.Fa sodest 298argument of 299.Fn mparse_result . 300.Pp 301When the 302.Dv MPARSE_QUICK 303bit is set, parsing is aborted after the NAME section. 304This is for example useful in 305.Xr makewhatis 8 306.Fl Q 307to quickly build minimal databases. 308.It Ar mmin 309Can be set to 310.Dv MANDOCERR_BASE , 311.Dv MANDOCERR_STYLE , 312.Dv MANDOCERR_WARNING , 313.Dv MANDOCERR_ERROR , 314.Dv MANDOCERR_UNSUPP , 315or 316.Dv MANDOCERR_MAX . 317Messages below the selected level will be suppressed. 318.It Ar mmsg 319A callback function to handle errors and warnings. 320See 321.Pa main.c 322for an example. 
323If printing of error messages is not desired, 324.Dv NULL 325may be passed. 326.It Ar os_e 327Operating system to check base system conventions for. 328If 329.Dv MANDOC_OS_OTHER , 330the system is automatically detected from 331.Ic \&Os , 332.Fl Ios , 333or 334.Xr uname 3 . 335.It Ar os_s 336A default string for the 337.Xr mdoc 7 338.Ic \&Os 339macro, overriding the 340.Dv OSNAME 341preprocessor definition and the results of 342.Xr uname 3 . 343Passing 344.Dv NULL 345sets no default. 346.El 347.Pp 348The same parser may be used for multiple files so long as 349.Fn mparse_reset 350is called between parses. 351.Fn mparse_free 352must be called to free the memory allocated by this function. 353Declared in 354.In mandoc.h , 355implemented in 356.Pa read.c . 357.It Fn mparse_free 358Free all memory allocated by 359.Fn mparse_alloc . 360Declared in 361.In mandoc.h , 362implemented in 363.Pa read.c . 364.It Fn mparse_getkeep 365Acquire the keep buffer. 366Must follow a call of 367.Fn mparse_keep . 368Declared in 369.In mandoc.h , 370implemented in 371.Pa read.c . 372.It Fn mparse_keep 373Instruct the parser to retain a copy of its parsed input. 374This can be acquired with subsequent 375.Fn mparse_getkeep 376calls. 377Declared in 378.In mandoc.h , 379implemented in 380.Pa read.c . 381.It Fn mparse_open 382Open the file for reading. 383If that fails and 384.Fa fname 385does not already end in 386.Ql .gz , 387try again after appending 388.Ql .gz . 389Save the information whether the file is zipped or not. 390Return a file descriptor open for reading or -1 on failure. 391It can be passed to 392.Fn mparse_readfd 393or used directly. 394Declared in 395.In mandoc.h , 396implemented in 397.Pa read.c . 398.It Fn mparse_readfd 399Parse a file descriptor opened with 400.Xr open 2 401or 402.Fn mparse_open . 403Pass the associated filename in 404.Va fname . 
405This function may be called multiple times with different parameters; however, 406.Xr close 2 407and 408.Fn mparse_reset 409should be invoked between parses. 410Declared in 411.In mandoc.h , 412implemented in 413.Pa read.c . 414.It Fn mparse_reset 415Reset a parser so that 416.Fn mparse_readfd 417may be used again. 418Declared in 419.In mandoc.h , 420implemented in 421.Pa read.c . 422.It Fn mparse_result 423Obtain the result of a parse. 424One of the two pointers will be filled in. 425Declared in 426.In mandoc.h , 427implemented in 428.Pa read.c . 429.It Fn mparse_strerror 430Return a statically-allocated string representation of an error code. 431Declared in 432.In mandoc.h , 433implemented in 434.Pa read.c . 435.It Fn mparse_strlevel 436Return a statically-allocated string representation of a level code. 437Declared in 438.In mandoc.h , 439implemented in 440.Pa read.c . 441.It Fn mparse_updaterc 442If the highest warning or error level that occurred during the current 443.Fa parse 444is higher than 445.Pf * Fa rc , 446update 447.Pf * Fa rc 448accordingly. 449This is useful after calling 450.Fn mdoc_validate 451or 452.Fn man_validate . 453Declared in 454.In mandoc.h , 455implemented in 456.Pa read.c . 457.El 458.Ss Variables 459.Bl -ohang 460.It Va man_macronames 461The string representation of a 462.Xr man 7 463macro as indexed by 464.Vt "enum mant" . 465.It Va mdoc_argnames 466The string representation of an 467.Xr mdoc 7 468macro argument as indexed by 469.Vt "enum mdocargt" . 470.It Va mdoc_macronames 471The string representation of an 472.Xr mdoc 7 473macro as indexed by 474.Vt "enum mdoct" . 475.El 476.Sh IMPLEMENTATION NOTES 477This section consists of structural documentation for 478.Xr mdoc 7 479and 480.Xr man 7 481syntax trees and strings. 482.Ss Man and Mdoc Strings 483Strings may be extracted from mdoc and man meta-data, or from text 484nodes (MDOC_TEXT and MAN_TEXT, respectively). 
485These strings have special non-printing formatting cues embedded in the 486text itself, as well as 487.Xr roff 7 488escapes preserved from input. 489Implementing systems will need to handle both situations to produce 490human-readable text. 491In general, strings may be assumed to consist of 7-bit ASCII characters. 492.Pp 493The following non-printing characters may be embedded in text strings: 494.Bl -tag -width Ds 495.It Dv ASCII_NBRSP 496A non-breaking space character. 497.It Dv ASCII_HYPH 498A soft hyphen. 499.It Dv ASCII_BREAK 500A breakable zero-width space. 501.El 502.Pp 503Escape characters are also passed verbatim into text strings. 504An escape character is a sequence of characters beginning with the 505backslash 506.Pq Sq \e . 507To construct human-readable text, these should be intercepted with 508.Xr mandoc_escape 3 509and converted with one of the functions described in 510.Xr mchars_alloc 3 . 511.Ss Man Abstract Syntax Tree 512This AST is governed by the ontological rules dictated in 513.Xr man 7 514and derives its terminology accordingly. 515.Pp 516The AST is composed of 517.Vt struct roff_node 518nodes with element, root and text types as declared by the 519.Va type 520field. 521Each node also provides its parse point (the 522.Va line , 523.Va pos , 524and 525.Va sec 526fields), its position in the tree (the 527.Va parent , 528.Va child , 529.Va next 530and 531.Va prev 532fields) and some type-specific data. 533.Pp 534The tree itself is arranged according to the following normal form, 535where capitalised non-terminals represent nodes. 536.Pp 537.Bl -tag -width "ELEMENTXX" -compact 538.It ROOT 539\(<- mnode+ 540.It mnode 541\(<- ELEMENT | TEXT | BLOCK 542.It BLOCK 543\(<- HEAD BODY 544.It HEAD 545\(<- mnode* 546.It BODY 547\(<- mnode* 548.It ELEMENT 549\(<- ELEMENT | TEXT* 550.It TEXT 551\(<- [[:ascii:]]* 552.El 553.Pp 554The only elements capable of nesting other elements are those with 555next-line scope as documented in 556.Xr man 7 . 
557.Ss Mdoc Abstract Syntax Tree 558This AST is governed by the ontological 559rules dictated in 560.Xr mdoc 7 561and derives its terminology accordingly. 562.Qq In-line 563elements described in 564.Xr mdoc 7 565are described simply as 566.Qq elements . 567.Pp 568The AST is composed of 569.Vt struct roff_node 570nodes with block, head, body, element, root and text types as declared 571by the 572.Va type 573field. 574Each node also provides its parse point (the 575.Va line , 576.Va pos , 577and 578.Va sec 579fields), its position in the tree (the 580.Va parent , 581.Va child , 582.Va last , 583.Va next 584and 585.Va prev 586fields) and some type-specific data, in particular, for nodes generated 587from macros, the generating macro in the 588.Va tok 589field. 590.Pp 591The tree itself is arranged according to the following normal form, 592where capitalised non-terminals represent nodes. 593.Pp 594.Bl -tag -width "ELEMENTXX" -compact 595.It ROOT 596\(<- mnode+ 597.It mnode 598\(<- BLOCK | ELEMENT | TEXT 599.It BLOCK 600\(<- HEAD [TEXT] (BODY [TEXT])+ [TAIL [TEXT]] 601.It ELEMENT 602\(<- TEXT* 603.It HEAD 604\(<- mnode* 605.It BODY 606\(<- mnode* [ENDBODY mnode*] 607.It TAIL 608\(<- mnode* 609.It TEXT 610\(<- [[:ascii:]]* 611.El 612.Pp 613Of note are the TEXT nodes following the HEAD, BODY and TAIL nodes of 614the BLOCK production: these refer to punctuation marks. 615Furthermore, although a TEXT node will generally have a non-zero-length 616string, in the specific case of 617.Sq \&.Bd \-literal , 618an empty line will produce a zero-length string. 619Multiple body parts are only found in invocations of 620.Sq \&Bl \-column , 621where a new body introduces a new phrase. 622.Pp 623The 624.Xr mdoc 7 625syntax tree accommodates broken block structures as well. 626The ENDBODY node is available to end the formatting associated 627with a given block before the physical end of that block. 
628It has a non-null 629.Va end 630field, is of the BODY 631.Va type , 632has the same 633.Va tok 634as the BLOCK it is ending, and has a 635.Va pending 636field pointing to that BLOCK's BODY node. 637It is an indirect child of that BODY node 638and has no children of its own. 639.Pp 640An ENDBODY node is generated when a block ends while one of its child 641blocks is still open, like in the following example: 642.Bd -literal -offset indent 643\&.Ao ao 644\&.Bo bo ac 645\&.Ac bc 646\&.Bc end 647.Ed 648.Pp 649This example results in the following block structure: 650.Bd -literal -offset indent 651BLOCK Ao 652 HEAD Ao 653 BODY Ao 654 TEXT ao 655 BLOCK Bo, pending -> Ao 656 HEAD Bo 657 BODY Bo 658 TEXT bo 659 TEXT ac 660 ENDBODY Ao, pending -> Ao 661 TEXT bc 662TEXT end 663.Ed 664.Pp 665Here, the formatting of the 666.Ic \&Ao 667block extends from TEXT ao to TEXT ac, 668while the formatting of the 669.Ic \&Bo 670block extends from TEXT bo to TEXT bc. 671It renders as follows in 672.Fl T Ns Cm ascii 673mode: 674.Pp 675.Dl <ao [bo ac> bc] end 676.Pp 677Support for badly-nested blocks is only provided for backward 678compatibility with some older 679.Xr mdoc 7 680implementations. 681Using badly-nested blocks is 682.Em strongly discouraged ; 683for example, the 684.Fl T Ns Cm html 685front-end to 686.Xr mandoc 1 687is unable to render them in any meaningful way. 688Furthermore, behaviour when encountering badly-nested blocks is not 689consistent across troff implementations, especially when using multiple 690levels of badly-nested blocks. 
691.Sh SEE ALSO 692.Xr mandoc 1 , 693.Xr man.cgi 3 , 694.Xr mandoc_escape 3 , 695.Xr mandoc_headers 3 , 696.Xr mandoc_malloc 3 , 697.Xr mansearch 3 , 698.Xr mchars_alloc 3 , 699.Xr tbl 3 , 700.Xr eqn 7 , 701.Xr man 7 , 702.Xr mandoc_char 7 , 703.Xr mdoc 7 , 704.Xr roff 7 , 705.Xr tbl 7 706.Sh AUTHORS 707.An -nosplit 708The 709.Nm 710library was written by 711.An Kristaps Dzonsons Aq Mt kristaps@bsd.lv 712and is maintained by 713.An Ingo Schwarze Aq Mt schwarze@openbsd.org . 714