xref: /illumos-gate/usr/src/man/man3c/mbrtoc16.3c (revision ed093b41a93e8563e6e1e5dae0768dda2a7bcc27)
1.\"
2.\" This file and its contents are supplied under the terms of the
3.\" Common Development and Distribution License ("CDDL"), version 1.0.
4.\" You may only use this file in accordance with the terms of version
5.\" 1.0 of the CDDL.
6.\"
7.\" A full copy of the text of the CDDL should have accompanied this
8.\" source.  A copy of the CDDL is also available via the Internet at
9.\" http://www.illumos.org/license/CDDL.
10.\"
11.\"
12.\" Copyright 2020 Robert Mustacchi
13.\"
14.Dd September 20, 2021
15.Dt MBRTOC16 3C
16.Os
17.Sh NAME
18.Nm mbrtoc16 ,
19.Nm mbrtoc32 ,
20.Nm mbrtowc ,
21.Nm mbrtowc_l
22.Nd convert characters to wide characters
23.Sh SYNOPSIS
24.In wchar.h
25.Ft size_t
26.Fo mbrtowc
27.Fa "wchar_t *restrict pwc"
28.Fa "const char *restrict str"
29.Fa "size_t len"
30.Fa "mbstate_t *restrict ps"
31.Fc
32.In wchar.h
33.In xlocale.h
34.Ft size_t
35.Fo mbrtowc_l
36.Fa "wchar_t *restrict pwc"
37.Fa "const char *restrict str"
38.Fa "size_t len"
39.Fa "mbstate_t *restrict ps"
40.Fa "locale_t loc"
41.Fc
42.In uchar.h
43.Ft size_t
44.Fo mbrtoc16
45.Fa "char16_t *restrict p16c"
46.Fa "const char *restrict str"
47.Fa "size_t len"
48.Fa "mbstate_t *restrict ps"
49.Fc
50.Ft size_t
51.Fo mbrtoc32
52.Fa "char32_t *restrict p32c"
53.Fa "const char *restrict str"
54.Fa "size_t len"
55.Fa "mbstate_t *restrict ps"
56.Fc
57.Sh DESCRIPTION
58The
59.Fn mbrtoc16 ,
60.Fn mbrtoc32 ,
61.Fn mbrtowc ,
62and
63.Fn mbrtowc_l
64functions convert character sequences, which may contain multi-byte
65characters, into different character formats.
66The functions work in the following formats:
67.Bl -tag -width mbrtowc_l
68.It Fn mbrtoc16
69A UTF-16 code sequence, where every code point is represented by one or
70two
71.Vt char16_t .
72The UTF-16 encoding will encode certain Unicode code points as a pair of
73two 16-bit code sequences, commonly referred to as a surrogate pair.
74.It Fn mbrtoc32
75A UTF-32 code sequence, where every code point is represented by a
76single
77.Vt char32_t .
78.It Fn mbrtowc , Fn mbrtowc_l
79Wide characters, being a 32-bit value where every code point is
80represented by a single
81.Vt wchar_t .
82While the
83.Vt wchar_t
84and
85.Vt char32_t
86are different types, in this implementation, they are similar encodings.
87.El
88.Pp
89The functions consume up to
90.Fa len
91bytes from the string
92.Fa str
93and accumulate them in
94.Fa ps
95until a valid character is found, which is influenced by
96the
97.Dv LC_CTYPE
98category of the current locale.
99For example, in the
100.Sy C
101locale, only ASCII characters are recognized, while in a
102.Sy UTF-8
103based locale like
104.Sy en_US.UTF-8 ,
105UTF-8 multi-byte character sequences that represent Unicode code points
106are recognized.
107The
108.Fn mbrtowc_l
109function uses the locale passed in
110.Fa loc
111rather than the locale of the current thread.
112.Pp
113When a valid character sequence has been found, it is converted to
114either a 16-bit character sequence for
115.Fn mbrtoc16
116or a 32-bit character sequence for
117.Fn mbrtoc32
118and will be stored in
119.Fa p16c
120and
121.Fa p32c
122respectively.
123.Pp
124The
125.Fa ps
126argument represents a multi-byte conversion state which can be used
127across multiple calls to a given function
128.Pq but not mixed between functions .
129This allows characters to be consumed from subsequent buffers, e.g.
130different values of
131.Fa str .
132The functions may be called from multiple threads as long as they use
133unique values for
134.Fa ps .
135If
136.Fa ps
137is
138.Dv NULL ,
139then a function-specific buffer will be used for the conversion state;
140however, this is shared between all threads and its use is not
141recommended.
142.Pp
143When using these functions, more than one character may be output for a
144given set of consumed input characters.
145An example of this is when a given code point is represented as a
146surrogate pair in UTF-16, which requires two 16-bit characters to
147represent a code point.
148When this occurs, the functions return the special return value
149.Sy (size_t)-3 .
150.Pp
151The functions all have a special behavior when
152.Dv NULL
153is passed for
154.Fa str .
155They instead will treat it as though
156.Fa pwc ,
157.Fa p16c ,
158or
159.Fa p32c
160were
161.Dv NULL ,
162.Fa str
163had been passed as the empty string, "", and the length,
164.Fa len ,
165would appear as the value 1.
166In other words, the functions would be called as:
167.Bd -literal -offset indent
168mbrtowc(NULL, "", 1, ps)
169mbrtowc_l(NULL, "", 1, ps, loc)
170mbrtoc16(NULL, "", 1, ps)
171mbrtoc32(NULL, "", 1, ps)
172.Ed
173.Ss Locale Details
174Not all locales in the system are Unicode based locales.
175For example, ISO 8859 family locales have code points with values that
176do not match their counterparts in Unicode.
177When using these functions with non-Unicode based locales, the code
178points returned will be those determined by the locale.
179They will not be converted to the corresponding Unicode code point.
180For example, if using the Euro sign in ISO 8859-15, these functions
181might return the code point 0xa4 and not the Unicode value 0x20ac.
182.Pp
183Regardless of the locale, the characters returned will be encoded as
184though the code point were the corresponding value in Unicode.
185This means that if a locale returns a value that would be a surrogate
186pair in the UTF-16 encoding, it will still be encoded as a UTF-16
187character.
188.Pp
189This behavior of the
190.Fn mbrtoc16
191and
192.Fn mbrtoc32
193functions should not be relied upon, is not portable, and is subject to
194change for non-Unicode locales.
195.Sh RETURN VALUES
196The
197.Fn mbrtoc16 ,
198.Fn mbrtoc32 ,
199.Fn mbrtowc ,
200and
201.Fn mbrtowc_l
202functions return the following values:
203.Bl -tag -width (size_t)-3
204.It Sy 0
205.Fa len
206or fewer bytes of
207.Fa str
208were consumed and the null wide character was written into the wide
209character buffer
210.Po
211.Fa pwc ,
212.Fa p16c ,
213.Fa p32c
214.Pc .
215.It Sy between 1 and len
216The specified number of bytes were consumed and a single character was
217written into the wide character buffer
218.Po
219.Fa pwc ,
220.Fa p16c ,
221.Fa p32c
222.Pc .
223.It Sy (size_t)-1
224An encoding error has occurred.
225The next
226.Fa len
227bytes of
228.Fa str
229do not contribute to a valid character.
230.Va errno
231has been set to
232.Er EILSEQ .
233No data was written into the wide character buffer
234.Po
235.Fa pwc ,
236.Fa p16c ,
237.Fa p32c
238.Pc .
239.It Sy (size_t)-2
240.Fa len
241bytes of
242.Fa str
243were consumed, but a complete multi-byte character sequence has not been
244found and no data was written into the wide character buffer
245.Po
246.Fa pwc ,
247.Fa p16c ,
248.Fa p32c
249.Pc .
250.It Sy (size_t)-3
251A character has been written into the wide character buffer
252.Po
253.Fa pwc ,
254.Fa p16c ,
255.Fa p32c
256.Pc .
257This character was from a previous call (such as another part of a
258UTF-16 surrogate pair) and no input was consumed.
259This is limited to the
260.Fn mbrtoc16
261and
262.Fn mbrtoc32
263functions.
264.El
265.Sh EXAMPLES
266.Sy Example 1
267Using the
268.Fn mbrtoc32
269function to convert a multibyte string.
270.Bd -literal
271#include <locale.h>
272#include <stdlib.h>
273#include <string.h>
274#include <err.h>
275#include <stdio.h>
276#include <uchar.h>
277
278int
279main(void)
280{
281	mbstate_t mbs;
282	char32_t out;
283	size_t ret;
284	const char *uchar_str = "\exe5\ex85\ex89";
285
286	(void) memset(&mbs, 0, sizeof (mbs));
287	(void) setlocale(LC_CTYPE, "en_US.UTF-8");
288	ret = mbrtoc32(&out, uchar_str, strlen(uchar_str), &mbs);
289	if (ret != strlen(uchar_str)) {
290		errx(EXIT_FAILURE, "failed to convert string, got %zd",
291		    ret);
292	}
293
294	(void) printf("Converted %zu bytes into UTF-32 character "
295	    "0x%x\n", ret, out);
296	return (0);
297}
298.Ed
299.Pp
300When compiled and run, this produces:
301.Bd -literal -offset indent
302$ ./a.out
303Converted 3 bytes into UTF-32 character 0x5149
304.Ed
305.Pp
306.Sy Example 2
307Handling surrogate pairs from the
308.Fn mbrtoc16
309function.
310.Bd -literal
311#include <locale.h>
312#include <stdlib.h>
313#include <string.h>
314#include <err.h>
315#include <stdio.h>
316#include <uchar.h>
317
318int
319main(void)
320{
321        mbstate_t mbs;
322        char16_t first, second;
323        size_t ret;
324        const char *uchar_str = "\exf0\ex9f\ex92\exa9";
325
326        (void) memset(&mbs, 0, sizeof (mbs));
327        (void) setlocale(LC_CTYPE, "en_US.UTF-8");
328        ret = mbrtoc16(&first, uchar_str, strlen(uchar_str), &mbs);
329        if (ret != strlen(uchar_str)) {
330                errx(EXIT_FAILURE, "failed to convert string, got %zd",
331                    ret);
332        }
333
334        ret = mbrtoc16(&second, "", 0, &mbs);
335        if (ret != (size_t)-3) {
336                errx(EXIT_FAILURE, "didn't get second surrogate pair, "
337                    "got %zd", ret);
338        }
339
340        (void) printf("UTF-16 surrogates: 0x%x 0x%x\n", first, second);
341        return (0);
342}
343.Ed
344.Pp
345When compiled and run, this produces:
346.Bd -literal -offset indent
347$ ./a.out
348UTF-16 surrogates: 0xd83d 0xdca9
349.Ed
350.Sh ERRORS
351The
352.Fn mbrtoc16 ,
353.Fn mbrtoc32 ,
354.Fn mbrtowc ,
355and
356.Fn mbrtowc_l
357functions will fail if:
358.Bl -tag -width Er
359.It Er EINVAL
360The conversion state in
361.Fa ps
362is invalid.
363.It Er EILSEQ
364An invalid character sequence has been detected.
365.El
366.Sh MT-LEVEL
367The
368.Fn mbrtoc16 ,
369.Fn mbrtoc32 ,
370.Fn mbrtowc ,
371and
372.Fn mbrtowc_l
373functions are
374.Sy MT-Safe
375as long as different
376.Vt mbstate_t
377structures are passed in
378.Fa ps .
379If
380.Fa ps
381is
382.Dv NULL
383or different threads use the same value for
384.Fa ps ,
385then the functions are
386.Sy Unsafe .
387.Sh INTERFACE STABILITY
388.Sy Committed
389.Sh SEE ALSO
390.Xr c16rtomb 3C ,
391.Xr c32rtomb 3C ,
392.Xr newlocale 3C ,
393.Xr setlocale 3C ,
394.Xr uselocale 3C ,
395.Xr wcrtomb 3C ,
396.Xr uchar.h 3HEAD ,
397.Xr environ 7
398