xref: /illumos-gate/usr/src/man/man3c/c16rtomb.3c (revision 7dbbfe7762f9eabac3999ee1a8b38311d428f7a8)
1.\"
2.\" This file and its contents are supplied under the terms of the
3.\" Common Development and Distribution License ("CDDL"), version 1.0.
4.\" You may only use this file in accordance with the terms of version
5.\" 1.0 of the CDDL.
6.\"
7.\" A full copy of the text of the CDDL should have accompanied this
8.\" source.  A copy of the CDDL is also available via the Internet at
9.\" http://www.illumos.org/license/CDDL.
10.\"
11.\"
12.\" Copyright 2020 Robert Mustacchi
13.\"
14.Dd February 17, 2023
15.Dt C16RTOMB 3C
16.Os
17.Sh NAME
18.Nm c16rtomb ,
19.Nm c32rtomb ,
20.Nm wcrtomb ,
21.Nm wcrtomb_l
22.Nd convert wide-characters to character sequences
23.Sh SYNOPSIS
24.In uchar.h
25.Ft size_t
26.Fo c16rtomb
27.Fa "char *restrict str"
28.Fa "char16_t c16"
29.Fa "mbstate_t *restrict ps"
30.Fc
31.Ft size_t
32.Fo c32rtomb
33.Fa "char *restrict str"
34.Fa "char32_t c32"
35.Fa "mbstate_t *restrict ps"
36.Fc
37.In wchar.h
38.Ft size_t
39.Fo wcrtomb
40.Fa "char *restrict str"
41.Fa "wchar_t wc"
42.Fa "mbstate_t *restrict ps"
43.Fc
44.In wchar.h
45.In xlocale.h
46.Ft size_t
47.Fo wcrtomb_l
48.Fa "char *restrict str"
49.Fa "wchar_t wc"
50.Fa "mbstate_t *restrict ps"
51.Fa "locale_t loc"
52.Fc
53.Sh DESCRIPTION
54The
55.Fn c16rtomb ,
56.Fn c32rtomb ,
57.Fn wcrtomb ,
58and
59.Fn wcrtomb_l
60functions convert wide-character sequences into a series of multi-byte
61characters.
62The functions work in the following formats:
63.Bl -tag -width wcrtomb_l
64.It Fn c16rtomb
65A UTF-16 code sequence, where every code point is represented by one or
66two
67.Vt char16_t .
68The UTF-16 encoding will encode certain Unicode code points as a pair of
69two 16-bit code sequences, commonly referred to as a surrogate pair.
70.It Fn c32rtomb
71A UTF-32 code sequence, where every code point is represented by a
72single
73.Vt char32_t .
74It is illegal to pass reserved Unicode code points.
75.It Fn wcrtomb , Fn wcrtomb_l
76Wide characters, being a 32-bit value where every code point is
77represented by a single
78.Vt wchar_t .
79While the
80.Vt wchar_t
81and
82.Vt char32_t
83are different types, in this implementation, they are similar encodings.
84.El
85.Pp
86The functions all work by looking at the passed in wide-character
87.Po
88.Fa c16 ,
89.Fa c32 ,
90.Fa wc
91.Pc
92and appending it to the current conversion state,
93.Fa ps .
94Once a valid code point, based on the current locale, is found, then it
95will be converted into a series of characters that are stored in
96.Fa str .
97Up to
98.Dv MB_CUR_MAX
99bytes will be stored in
100.Fa str .
101It is the caller's responsibility to ensure that there is sufficient
102space in
103.Fa str .
104.Pp
105The functions are all influenced by the
106.Dv LC_CTYPE
107category of the current locale for determining what is considered a
108valid character.
109For example, in the
110.Sy C
111locale,
112only ASCII characters are recognized, while in a
113.Sy UTF-8
114based locale like
115.Sy en_US.UTF-8 ,
116all valid Unicode code points are recognized and will be converted into
117the corresponding multi-byte sequence.
118The
119.Fn wcrtomb_l
120function uses the locale passed in
121.Fa loc
122rather than the locale of the current thread.
123.Pp
124The
125.Fa ps
126argument represents a multi-byte conversion state which can be used
127across multiple calls to a given function
128.Pq but not mixed between functions .
129These allow for characters to be consumed from subsequent buffers, e.g.
130different values of
131.Fa str .
132The functions may be called from multiple threads as long as they use
133unique values for
134.Fa ps .
135If
136.Fa ps
137is
138.Dv NULL ,
139then a function-specific buffer will be used for the conversion state;
140however, this is shared between all threads and its use is not
141recommended.
142.Pp
143The functions all have a special behavior when
144.Dv NULL
145is passed for
146.Fa str .
147They instead will treat it as though the NULL wide-character was
148passed in
149.Fa c16 ,
150.Fa c32 ,
151or
152.Fa wc
153and an internal buffer
154.Pq buf
155will be used to write out the results of the
156conversion.
157In other words, the functions would be called as:
158.Bd -literal -offset indent
159c16rtomb(buf, L'\\0', ps)
160c32rtomb(buf, L'\\0', ps)
161wcrtomb(buf, L'\\0', ps)
162wcrtomb_l(buf, L'\\0', ps, loc)
163.Ed
164.Ss Locale Details
165Not all locales in the system are Unicode based locales.
166For example, ISO 8859 family locales have code points with values that
167do not match their counterparts in Unicode.
168When using these functions with non-Unicode based locales, the code
169points returned will be those determined by the locale.
170They will not be converted from the corresponding Unicode code point.
171For example, if using the Euro sign in ISO 8859-15, these functions
172will not encode the Unicode value 0x20ac into the ISO 8859-15 value
1730xa4.
174.Pp
175Regardless of the locale, the characters returned will be encoded as
176though the code point were the corresponding value in Unicode.
177This means that when using UTF-16, if the corresponding code point were
178in the range for surrogate pairs, then the
179.Fn c16rtomb
180function will expect to receive that code point in that fashion.
181.Pp
182This behavior of the
183.Fn c16rtomb
184and
185.Fn c32rtomb
186functions should not be relied upon, is not portable, and is subject to
187change for non-Unicode locales.
188.Sh RETURN VALUES
189Upon successful completion, the
190.Fn c16rtomb ,
191.Fn c32rtomb ,
192.Fn wcrtomb ,
193and
194.Fn wcrtomb_l
195functions return the number of bytes stored in
196.Fa str .
197Otherwise,
198.Sy (size_t)-1
199is returned to indicate an encoding error and
200.Va errno
201is set.
202.Sh EXAMPLES
203.Sy Example 1
204Converting a UTF-32 character into a multi-byte character sequence.
205.Bd -literal
206#include <locale.h>
207#include <stdlib.h>
208#include <string.h>
209#include <err.h>
210#include <stdio.h>
211#include <uchar.h>
212
213int
214main(void)
215{
216        mbstate_t mbs;
217        size_t ret;
218        char buf[MB_CUR_MAX];
219        char32_t val = 0x5149;
220        const char *uchar_exp = "\exe5\ex85\ex89";
221
222        (void) memset(&mbs, 0, sizeof (mbs));
223        (void) setlocale(LC_CTYPE, "en_US.UTF-8");
224        ret = c32rtomb(buf, val, &mbs);
225        if (ret != strlen(uchar_exp)) {
226                errx(EXIT_FAILURE, "failed to convert string, got %zd",
227                    ret);
228        }
229
230        if (strncmp(buf, uchar_exp, ret) != 0) {
231                errx(EXIT_FAILURE, "converted char32_t does not match "
232                    "expected value");
233        }
234
235        return (0);
236}
237.Ed
238.Sh ERRORS
239The
240.Fn c16rtomb ,
241.Fn c32rtomb ,
242.Fn wcrtomb ,
243and
244.Fn wcrtomb_l
245functions will fail if:
246.Bl -tag -width Er
247.It Er EINVAL
248The conversion state in
249.Fa ps
250is invalid.
251.It Er EILSEQ
252An invalid character sequence has been detected.
253.El
254.Sh MT-LEVEL
255The
256.Fn c16rtomb ,
257.Fn c32rtomb ,
258.Fn wcrtomb ,
259and
260.Fn wcrtomb_l
261functions are
262.Sy MT-Safe
263as long as different
264.Vt mbstate_t
265structures are passed in
266.Fa ps .
267If
268.Fa ps
269is
270.Dv NULL
271or different threads use the same value for
272.Fa ps ,
273then the functions are
274.Sy Unsafe .
275.Sh INTERFACE STABILITY
276.Sy Committed
277.Sh SEE ALSO
278.Xr mbrtoc16 3C ,
279.Xr mbrtoc32 3C ,
280.Xr mbrtowc 3C ,
281.Xr newlocale 3C ,
282.Xr setlocale 3C ,
283.Xr uselocale 3C ,
284.Xr uchar.h 3HEAD ,
285.Xr environ 7
286