xref: /illumos-gate/usr/src/man/man3c/mbrtoc16.3c (revision 1e56f352c1c208679012bca47d552e127f5b1072)
1.\"
2.\" This file and its contents are supplied under the terms of the
3.\" Common Development and Distribution License ("CDDL"), version 1.0.
4.\" You may only use this file in accordance with the terms of version
5.\" 1.0 of the CDDL.
6.\"
7.\" A full copy of the text of the CDDL should have accompanied this
8.\" source.  A copy of the CDDL is also available via the Internet at
9.\" http://www.illumos.org/license/CDDL.
10.\"
11.\"
12.\" Copyright 2020 Robert Mustacchi
13.\" Copyright 2023 Bill Sommerfeld
14.\"
15.Dd June 5, 2023
16.Dt MBRTOC16 3C
17.Os
18.Sh NAME
19.Nm mbrtoc16 ,
20.Nm mbrtoc32 ,
21.Nm mbrtowc ,
22.Nm mbrtowc_l
23.Nd convert characters to wide characters
24.Sh SYNOPSIS
25.In wchar.h
26.Ft size_t
27.Fo mbrtowc
28.Fa "wchar_t *restrict pwc"
29.Fa "const char *restrict str"
30.Fa "size_t len"
31.Fa "mbstate_t *restrict ps"
32.Fc
33.In wchar.h
34.In xlocale.h
35.Ft size_t
36.Fo mbrtowc_l
37.Fa "wchar_t *restrict pwc"
38.Fa "const char *restrict str"
39.Fa "size_t len"
40.Fa "mbstate_t *restrict ps"
41.Fa "locale_t loc"
42.Fc
43.In uchar.h
44.Ft size_t
45.Fo mbrtoc16
46.Fa "char16_t *restrict p16c"
47.Fa "const char *restrict str"
48.Fa "size_t len"
49.Fa "mbstate_t *restrict ps"
50.Fc
51.Ft size_t
52.Fo mbrtoc32
53.Fa "char32_t *restrict p32c"
54.Fa "const char *restrict str"
55.Fa "size_t len"
56.Fa "mbstate_t *restrict ps"
57.Fc
58.Sh DESCRIPTION
59The
60.Fn mbrtoc16 ,
61.Fn mbrtoc32 ,
62.Fn mbrtowc ,
63and
64.Fn mbrtowc_l
65functions convert character sequences, which may contain multi-byte
66characters, into different character formats.
67The functions work in the following formats:
68.Bl -tag -width mbrtowc_l
69.It Fn mbrtoc16
70A UTF-16 code sequence, where every code point is represented by one or
71two
72.Vt char16_t .
73The UTF-16 encoding will encode certain Unicode code points as a pair of
74two 16-bit code sequences, commonly referred to as a surrogate pair.
75.It Fn mbrtoc32
76A UTF-32 code sequence, where every code point is represented by a
77single
78.Vt char32_t .
79.It Fn mbrtowc , Fn mbrtowc_l
80Wide characters, being a 32-bit value where every code point is
81represented by a single
82.Vt wchar_t .
83While the
84.Vt wchar_t
85and
86.Vt char32_t
87are different types, in this implementation, they are similar encodings.
88.El
89.Pp
90The functions consume up to
91.Fa len
92bytes from the string
93.Fa str
94and accumulate them in
95.Fa ps
96until a valid character is found, which is influenced by
97the
98.Dv LC_CTYPE
99category of the current locale.
100For example, in the
101.Sy C
102locale, only ASCII characters are recognized, while in a
103.Sy UTF-8
104based locale like
105.Sy en_US.UTF-8 ,
106UTF-8 multi-byte character sequences that represent Unicode code points
107are recognized.
108The
109.Fn mbrtowc_l
110function uses the locale passed in
111.Fa loc
112rather than the locale of the current thread.
113.Pp
114When a valid character sequence has been found, it is converted to
115either a 16-bit character sequence for
116.Fn mbrtoc16
117or a 32-bit character sequence for
118.Fn mbrtoc32
119and will be stored in
120.Fa p16c
121and
122.Fa p32c
123respectively.
124.Pp
125The
126.Fa ps
127argument represents a multi-byte conversion state which can be used
128across multiple calls to a given function
129.Pq but not mixed between functions .
130This allows characters to be consumed from subsequent buffers, e.g.
131different values of
132.Fa str .
133The functions may be called from multiple threads as long as they use
134unique values for
135.Fa ps .
136If
137.Fa ps
138is
139.Dv NULL ,
140then a function-specific buffer will be used for the conversion state;
141however, this is shared between all threads and its use is not
142recommended.
143.Pp
144When using these functions, more than one character may be output for a
145given set of consumed input characters.
146An example of this is when a given code point is represented as a
147surrogate pair in UTF-16, which requires two 16-bit characters to
148represent a code point.
149When this occurs, the functions return the special return value
150.Sy -3 .
151.Pp
152The functions all have a special behavior when
153.Dv NULL
154is passed for
155.Fa str .
156They instead will treat it as though
157.Fa pwc ,
158.Fa p16c ,
159or
160.Fa p32c
161were
162.Dv NULL ,
163.Fa str
164had been passed as the empty string, "" and the length,
165.Fa len ,
166would appear as the value 1.
167In other words, the functions would be called as:
168.Bd -literal -offset indent
169mbrtowc(NULL, "", 1, ps)
170mbrtowc_l(NULL, "", 1, ps, loc)
171mbrtoc16(NULL, "", 1, ps)
172mbrtoc32(NULL, "", 1, ps)
173.Ed
174.Ss Locale Details
175Not all locales in the system are Unicode based locales.
176For example, ISO 8859 family locales have code points with values that
177do not match their counterparts in Unicode.
178When using these functions with non-Unicode based locales, the code
179points returned will be those determined by the locale.
180They will not be converted to the corresponding Unicode code point.
181For example, if using the Euro sign in ISO 8859-15, these functions
182might return the code point 0xa4 and not the Unicode value 0x20ac.
183.Pp
184Regardless of the locale, the characters returned will be encoded as
185though the code point were the corresponding value in Unicode.
186This means that if a locale returns a value that would be a surrogate
187pair in the UTF-16 encoding, it will still be encoded as a UTF-16
188character.
189.Pp
190This behavior of the
191.Fn mbrtoc16
192and
193.Fn mbrtoc32
194functions should not be relied upon, is not portable, and is subject to
195change for non-Unicode locales.
196.Sh RETURN VALUES
197The
198.Fn mbrtoc16 ,
199.Fn mbrtoc32 ,
200.Fn mbrtowc ,
201and
202.Fn mbrtowc_l
203functions return the following values:
204.Bl -tag -width (size_t)-3
205.It Sy 0
206.Fa len
207or fewer bytes of
208.Fa str
209were consumed and the null wide character was written into the wide
210character buffer
211.Po
212.Fa pwc ,
213.Fa p16c ,
214.Fa p32c
215.Pc .
216.It Sy between 1 and len
217The specified number of bytes were consumed and a single character was
218written into the wide character buffer
219.Po
220.Fa pwc ,
221.Fa p16c ,
222.Fa p32c
223.Pc .
224.It Sy (size_t)-1
225An encoding error has occurred.
226The next
227.Fa len
228bytes of
229.Fa str
230do not contribute to a valid character.
231.Va errno
232has been set to
233.Er EILSEQ .
234No data was written into the wide character buffer
235.Po
236.Fa pwc ,
237.Fa p16c ,
238.Fa p32c
239.Pc .
240.It Sy (size_t)-2
241.Fa len
242bytes of
243.Fa str
244were consumed, but a complete multi-byte character sequence has not been
245found and no data was written into the wide character buffer
246.Po
247.Fa pwc ,
248.Fa p16c ,
249.Fa p32c
250.Pc .
251.It Sy (size_t)-3
252A character has been written into the wide character buffer
253.Po
254.Fa pwc ,
255.Fa p16c ,
256.Fa p32c
257.Pc .
258This character was from a previous call (such as another part of a
259UTF-16 surrogate pair) and no input was consumed.
260This is limited to the
261.Fn mbrtoc16
262and
263.Fn mbrtoc32
264functions.
265.El
266.Sh EXAMPLES
267.Sy Example 1
268Using the
269.Fn mbrtoc32
270function to convert a multibyte string.
271.Bd -literal
272#include <locale.h>
273#include <stdlib.h>
274#include <string.h>
275#include <err.h>
276#include <stdio.h>
277#include <uchar.h>
278
279int
280main(void)
281{
282	mbstate_t mbs;
283	char32_t out;
284	size_t ret;
285	const char *uchar_str = "\exe5\ex85\ex89";
286
287	(void) memset(&mbs, 0, sizeof (mbs));
288	(void) setlocale(LC_CTYPE, "en_US.UTF-8");
289	ret = mbrtoc32(&out, uchar_str, strlen(uchar_str), &mbs);
290	if (ret != strlen(uchar_str)) {
291		errx(EXIT_FAILURE, "failed to convert string, got %zd",
292		    ret);
293	}
294
295	(void) printf("Converted %zu bytes into UTF-32 character "
296	    "0x%x\n", ret, out);
297	return (0);
298}
299.Ed
300.Pp
301When compiled and run, this produces:
302.Bd -literal -offset indent
303$ ./a.out
304Converted 3 bytes into UTF-32 character 0x5149
305.Ed
306.Pp
307.Sy Example 2
308Handling surrogate pairs from the
309.Fn mbrtoc16
310function.
311.Bd -literal
312#include <locale.h>
313#include <stdlib.h>
314#include <string.h>
315#include <err.h>
316#include <stdio.h>
317#include <uchar.h>
318
319int
320main(void)
321{
322        mbstate_t mbs;
323        char16_t first, second;
324        size_t ret;
325        const char *uchar_str = "\exf0\ex9f\ex92\exa9";
326
327        (void) memset(&mbs, 0, sizeof (mbs));
328        (void) setlocale(LC_CTYPE, "en_US.UTF-8");
329        ret = mbrtoc16(&first, uchar_str, strlen(uchar_str), &mbs);
330        if (ret != strlen(uchar_str)) {
331                errx(EXIT_FAILURE, "failed to convert string, got %zd",
332                    ret);
333        }
334
335        ret = mbrtoc16(&second, "", 0, &mbs);
336        if (ret != (size_t)-3) {
337                errx(EXIT_FAILURE, "didn't get second surrogate pair, "
338                    "got %zd", ret);
339        }
340
341        (void) printf("UTF-16 surrogates: 0x%x 0x%x\n", first, second);
342        return (0);
343}
344.Ed
345.Pp
346When compiled and run, this produces:
347.Bd -literal -offset indent
348$ ./a.out
349UTF-16 surrogates: 0xd83d 0xdca9
350.Ed
351.Sh ERRORS
352The
353.Fn mbrtoc16 ,
354.Fn mbrtoc32 ,
355.Fn mbrtowc ,
356and
357.Fn mbrtowc_l
358functions will fail if:
359.Bl -tag -width Er
360.It Er EINVAL
361The conversion state in
362.Fa ps
363is invalid.
364.It Er EILSEQ
365An invalid character sequence has been detected.
366.El
367.Sh MT-LEVEL
368The
369.Fn mbrtoc16 ,
370.Fn mbrtoc32 ,
371.Fn mbrtowc ,
372and
373.Fn mbrtowc_l
374functions are
375.Sy MT-Safe
376as long as different
377.Vt mbstate_t
378structures are passed in
379.Fa ps .
380If
381.Fa ps
382is
383.Dv NULL
384or different threads use the same value for
385.Fa ps ,
386then the functions are
387.Sy Unsafe .
388.Sh INTERFACE STABILITY
389.Sy Committed
390.Sh SEE ALSO
391.Xr c16rtomb 3C ,
392.Xr c32rtomb 3C ,
393.Xr newlocale 3C ,
394.Xr setlocale 3C ,
395.Xr uselocale 3C ,
396.Xr wcrtomb 3C ,
397.Xr uchar.h 3HEAD ,
398.Xr environ 7
399