xref: /freebsd/contrib/xz/src/common/tuklib_mbstr_wrap.h (revision 128836d304d93f2d00eb14069c27089ab46c38d4)
1*128836d3SXin LI // SPDX-License-Identifier: 0BSD
2*128836d3SXin LI 
3*128836d3SXin LI ///////////////////////////////////////////////////////////////////////////////
4*128836d3SXin LI //
5*128836d3SXin LI /// \file       tuklib_mbstr_wrap.h
6*128836d3SXin LI /// \brief      Word wrapping for multibyte strings
7*128836d3SXin LI ///
8*128836d3SXin LI /// The word wrapping functions are intended to be usable, for example,
9*128836d3SXin LI /// for printing --help text in command line tools. While manually-wrapped
10*128836d3SXin LI /// --help text allows precise formatting, such freedom requires translators
11*128836d3SXin LI /// to count spaces and determine where line breaks should occur. It's
12*128836d3SXin LI /// tedious and error prone, and experience has shown that only some
13*128836d3SXin LI /// translators do it well. Automatic word wrapping is less flexible but
14*128836d3SXin LI /// results in a polished-enough look with less effort from everyone.
15*128836d3SXin LI /// Right-to-left languages and languages that don't use spaces between
16*128836d3SXin LI /// words will still need extra effort though.
17*128836d3SXin LI //
18*128836d3SXin LI //  Author:     Lasse Collin
19*128836d3SXin LI //
20*128836d3SXin LI ///////////////////////////////////////////////////////////////////////////////
21*128836d3SXin LI 
22*128836d3SXin LI #ifndef TUKLIB_MBSTR_WRAP_H
23*128836d3SXin LI #define TUKLIB_MBSTR_WRAP_H
24*128836d3SXin LI 
25*128836d3SXin LI #include "tuklib_common.h"
26*128836d3SXin LI #include <stdio.h>
27*128836d3SXin LI 
28*128836d3SXin LI TUKLIB_DECLS_BEGIN
29*128836d3SXin LI 
// Return value bits of tuklib_wraps() and tuklib_wrapf(). Each code is
// a distinct power of two so that the results of multiple calls can be
// accumulated into a single variable with bitwise-or.

30*128836d3SXin LI /// One or more output lines exceeded right_margin.
31*128836d3SXin LI /// This is only a warning; everything was still printed successfully.
32*128836d3SXin LI #define TUKLIB_WRAP_WARN_OVERLONG   0x01
33*128836d3SXin LI 
34*128836d3SXin LI /// Error writing to the output FILE. The error flag in the FILE
35*128836d3SXin LI /// should have been set as well.
36*128836d3SXin LI #define TUKLIB_WRAP_ERR_IO          0x02
37*128836d3SXin LI 
38*128836d3SXin LI /// Invalid options in struct tuklib_wrap_opt.
39*128836d3SXin LI /// Nothing was printed.
40*128836d3SXin LI #define TUKLIB_WRAP_ERR_OPT         0x04
41*128836d3SXin LI 
42*128836d3SXin LI /// Invalid or unsupported multibyte character in the input string:
43*128836d3SXin LI /// either mbrtowc() failed or wcwidth() returned a negative value.
44*128836d3SXin LI #define TUKLIB_WRAP_ERR_STR         0x08
45*128836d3SXin LI 
46*128836d3SXin LI /// Only tuklib_wrapf(): Error in converting the format string.
47*128836d3SXin LI /// It's either a memory allocation failure or something bad with the
48*128836d3SXin LI /// format string or arguments.
49*128836d3SXin LI #define TUKLIB_WRAP_ERR_FORMAT      0x10
50*128836d3SXin LI 
51*128836d3SXin LI /// Options for tuklib_wraps() and tuklib_wrapf()
///
/// Invalid options are reported with TUKLIB_WRAP_ERR_OPT, in which case
/// nothing is printed.
52*128836d3SXin LI struct tuklib_wrap_opt {
53*128836d3SXin LI 	/// Indentation of the first output line after `\n` or `\r`.
54*128836d3SXin LI 	/// This can be anything less than right_margin.
55*128836d3SXin LI 	unsigned short left_margin;
56*128836d3SXin LI 
57*128836d3SXin LI 	/// Column where word-wrapped continuation lines start.
58*128836d3SXin LI 	/// This can be anything less than right_margin.
59*128836d3SXin LI 	unsigned short left_cont;
60*128836d3SXin LI 
61*128836d3SXin LI 	/// Column where the text after `\v` will start, either on the current
62*128836d3SXin LI 	/// line (when there is room to add at least one space) or on a new
63*128836d3SXin LI 	/// empty line.
	///
	/// The constraints on left2_cont (left2_margin <= left2_cont and
	/// left2_cont < right_margin) imply that this must also be less
	/// than right_margin.
64*128836d3SXin LI 	unsigned short left2_margin;
65*128836d3SXin LI 
66*128836d3SXin LI 	/// Like left_cont but for text after a `\v`. However, this must
67*128836d3SXin LI 	/// be greater than or equal to left2_margin in addition to being
68*128836d3SXin LI 	/// less than right_margin.
69*128836d3SXin LI 	unsigned short left2_cont;
70*128836d3SXin LI 
	/// Right margin. The left-side fields above must be less than this.
	/// If an output line exceeds this margin anyway, it is reported
	/// with the warning TUKLIB_WRAP_WARN_OVERLONG.
	///
71*128836d3SXin LI 	/// For 80-column terminals, it is recommended to use 79 here for
72*128836d3SXin LI 	/// maximum portability. 80 will work most of the time but it will
73*128836d3SXin LI 	/// result in unwanted empty lines in the rare case where a terminal
74*128836d3SXin LI 	/// moves the cursor to the beginning of the next line immediately
75*128836d3SXin LI 	/// when the last column has been used.
76*128836d3SXin LI 	unsigned short right_margin;
77*128836d3SXin LI };
78*128836d3SXin LI 
// The documentation of tuklib_wraps() follows its prototype (Doxygen
// `///<` style) instead of preceding it.
//
// NOTE(review): TUKLIB_SYMBOL() is not defined in this file; presumably
// it comes from tuklib_common.h and remaps the public symbol name.
// Confirm there before relying on the exported name.
79*128836d3SXin LI #define tuklib_wraps TUKLIB_SYMBOL(tuklib_wraps)
80*128836d3SXin LI extern int tuklib_wraps(FILE *stream, const struct tuklib_wrap_opt *opt,
81*128836d3SXin LI 		const char *str);
82*128836d3SXin LI ///<
83*128836d3SXin LI /// \brief      Word wrap a multibyte string and write it to a FILE
84*128836d3SXin LI ///
85*128836d3SXin LI /// Word wrapping is done only at spaces and at the special control characters
86*128836d3SXin LI /// described below. Multiple consecutive spaces are handled properly: strings
87*128836d3SXin LI /// that have two (or more) spaces after a full sentence will look good even
88*128836d3SXin LI /// when the spaces occur at a word wrapping boundary. Trailing spaces are
89*128836d3SXin LI /// ignored at the end of a line or at the end of a string.
90*128836d3SXin LI ///
91*128836d3SXin LI /// The following control characters have been repurposed:
92*128836d3SXin LI ///
93*128836d3SXin LI ///   - `\t` = Zero-width space allows a line break without producing any
94*128836d3SXin LI ///            output by itself. This can be useful after hard hyphens as
95*128836d3SXin LI ///            hyphens aren't otherwise used for line breaking. This can also
96*128836d3SXin LI ///            be useful in languages that don't use spaces between words.
97*128836d3SXin LI ///            (The Unicode character U+200B isn't supported.)
98*128836d3SXin LI ///   - `\b` = Text between a pair of `\b` characters is treated as an
99*128836d3SXin LI ///            unbreakable block (not wrapped even if there are spaces).
100*128836d3SXin LI ///            For example, a non-breaking space can be done like
101*128836d3SXin LI ///            in `"123\b \bMiB"`. Control characters (like `\n` or `\t`)
102*128836d3SXin LI ///            aren't allowed before the closing `\b`. If closing `\b` is
103*128836d3SXin LI ///            missing, the block extends to the end of the string. Empty
104*128836d3SXin LI ///            blocks are treated as zero-width characters. If line breaks
105*128836d3SXin LI ///            are possible around an empty block (like in `"foo \b\b bar"`
106*128836d3SXin LI ///            or `"foo \b"`), it can result in weird output.
107*128836d3SXin LI ///   - `\v` = Change to alternative indentation (left2_margin).
108*128836d3SXin LI ///   - `\r` = Reset back to the initial indentation and add a newline.
109*128836d3SXin LI ///            The next line will be indented by left_margin.
110*128836d3SXin LI ///   - `\n` = Add a newline without resetting the effect of `\v`. The
111*128836d3SXin LI ///            next line will be indented by left_margin or left2_margin
112*128836d3SXin LI ///            (not left_cont or left2_cont).
113*128836d3SXin LI ///
114*128836d3SXin LI /// Only `\n` should appear in translatable strings. `\t` works too but
115*128836d3SXin LI /// even that might confuse some translators even if there is a TRANSLATORS
116*128836d3SXin LI /// comment explaining its meaning.
117*128836d3SXin LI ///
118*128836d3SXin LI /// To use the other control characters in messages, one should use
119*128836d3SXin LI /// tuklib_wrapf() with appropriate printf format string to combine
120*128836d3SXin LI /// translatable strings with non-translatable portions. For example:
121*128836d3SXin LI ///
122*128836d3SXin LI /// \code{.c}
123*128836d3SXin LI /// static const struct tuklib_wrap_opt wrap2 = { 2,  2, 22, 22, 79 };
124*128836d3SXin LI /// int e = 0;
125*128836d3SXin LI /// ...
126*128836d3SXin LI /// e |= tuklib_wrapf(stdout, &wrap2,
127*128836d3SXin LI ///                   "-h, --help\v%s\r"
128*128836d3SXin LI ///                   "    --version\v%s",
129*128836d3SXin LI ///                   W_("display this help and exit"),
130*128836d3SXin LI ///                   W_("display version information and exit"));
131*128836d3SXin LI /// ...
132*128836d3SXin LI /// if (e != 0) {
133*128836d3SXin LI ///     // Handle warning or error.
134*128836d3SXin LI ///     ...
135*128836d3SXin LI /// }
136*128836d3SXin LI /// \endcode
137*128836d3SXin LI ///
138*128836d3SXin LI /// Control characters other than `\n` and `\t` are unusable in
139*128836d3SXin LI /// translatable strings:
140*128836d3SXin LI ///
141*128836d3SXin LI ///   - Gettext tools show annoying warnings if C escape sequences other
142*128836d3SXin LI ///     than `\n` or `\t` are seen. (Otherwise they still work perfectly
143*128836d3SXin LI ///     fine though.)
144*128836d3SXin LI ///
145*128836d3SXin LI ///   - While at least Poedit and Lokalize support all escapes, some
146*128836d3SXin LI ///     editors only support `\n` and `\t`.
147*128836d3SXin LI ///
148*128836d3SXin LI ///   - They could confuse some translators, resulting in broken
149*128836d3SXin LI ///     translations.
150*128836d3SXin LI ///
151*128836d3SXin LI /// Using non-control characters would solve some issues but it wouldn't
152*128836d3SXin LI /// help with the unfortunate real-world issue that some translators would
153*128836d3SXin LI /// likely have trouble understanding a new syntax. The Gettext manual
154*128836d3SXin LI /// specifically warns about this, see the subheading "No unusual markup"
155*128836d3SXin LI /// in `info (gettext)Preparing Strings`. (While using `\t` for zero-width
156*128836d3SXin LI /// space is such custom markup, most translators will never need it.)
157*128836d3SXin LI ///
158*128836d3SXin LI /// Translators can use the Unicode character U+00A0 (or U+202F) if they
159*128836d3SXin LI /// need a non-breaking space. For example, in French a non-breaking space
160*128836d3SXin LI /// may be needed before colons and question marks (U+00A0 is common in
161*128836d3SXin LI /// real-world French PO files).
162*128836d3SXin LI ///
163*128836d3SXin LI /// Using a non-ASCII char in a string in the C code (like `"123\u00A0MiB"`)
164*128836d3SXin LI /// can work if one tells xgettext that input encoding is UTF-8, one
165*128836d3SXin LI /// ensures that the C compiler uses UTF-8 as the input charset, and one
166*128836d3SXin LI /// is certain that the program is *always* run under a UTF-8 locale.
167*128836d3SXin LI /// Unfortunately a portable program cannot make this kind of assumption,
168*128836d3SXin LI /// which means that there is no pretty way to have a non-breaking space in
169*128836d3SXin LI /// a translatable string.
170*128836d3SXin LI ///
171*128836d3SXin LI /// Optional: To tell translators which strings are automatically word
172*128836d3SXin LI /// wrapped, see the macro `W_` in tuklib_gettext.h.
173*128836d3SXin LI ///
174*128836d3SXin LI /// \param      stream      Output FILE stream. For decent performance, it
175*128836d3SXin LI ///                         should be in buffered mode because this function
176*128836d3SXin LI ///                         writes the output one byte at a time with fputc().
177*128836d3SXin LI /// \param      opt         Word wrapping options.
178*128836d3SXin LI /// \param      str         Null-terminated multibyte string that is in
179*128836d3SXin LI ///                         the encoding used by the current locale.
180*128836d3SXin LI ///
181*128836d3SXin LI /// \return     Returns 0 on success. If an error or warning occurs, one of
182*128836d3SXin LI ///             TUKLIB_WRAP_* codes is returned. Those codes are powers
183*128836d3SXin LI ///             of two. When warning/error detection can be delayed, the
184*128836d3SXin LI ///             return values can be accumulated from multiple calls using
185*128836d3SXin LI ///             bitwise-or into a single variable which can be checked after
186*128836d3SXin LI ///             all strings have (hopefully) been printed.
187*128836d3SXin LI 
188*128836d3SXin LI #define tuklib_wrapf TUKLIB_SYMBOL(tuklib_wrapf)
189*128836d3SXin LI tuklib_attr_format_printf(3, 4)
190*128836d3SXin LI extern int tuklib_wrapf(FILE *stream, const struct tuklib_wrap_opt *opt,
191*128836d3SXin LI 		const char *fmt, ...);
192*128836d3SXin LI ///<
193*128836d3SXin LI /// \brief      Format and word-wrap a multibyte string and write it to a FILE
194*128836d3SXin LI ///
195*128836d3SXin LI /// This is like tuklib_wraps() except that this takes a printf
196*128836d3SXin LI /// format string.
197*128836d3SXin LI ///
/// \param      stream      Output FILE stream.
/// \param      opt         Word wrapping options.
/// \param      fmt         printf format string. The remaining arguments
///                         are the values that the format string refers to.
///
/// \return     Like tuklib_wraps(): 0 on success, otherwise a TUKLIB_WRAP_*
///             code. TUKLIB_WRAP_ERR_FORMAT is specific to this function.
///
198*128836d3SXin LI /// \note       On platforms that lack vasprintf(), the intermediate
199*128836d3SXin LI ///             result from vsnprintf() must fit into a 128 KiB buffer.
200*128836d3SXin LI ///             TUKLIB_WRAP_ERR_FORMAT is returned if it doesn't but
201*128836d3SXin LI ///             only on platforms that lack vasprintf().
202*128836d3SXin LI 
203*128836d3SXin LI TUKLIB_DECLS_END
204*128836d3SXin LI #endif
205