1.\" 2.\" This file and its contents are supplied under the terms of the 3.\" Common Development and Distribution License ("CDDL"), version 1.0. 4.\" You may only use this file in accordance with the terms of version 5.\" 1.0 of the CDDL. 6.\" 7.\" A full copy of the text of the CDDL should have accompanied this 8.\" source. A copy of the CDDL is also available via the Internet at 9.\" http://www.illumos.org/license/CDDL. 10.\" 11.\" 12.\" Copyright 2020 Robert Mustacchi 13.\" 14.Dd April 23, 2020 15.Dt MBRTOC16 3C 16.Os 17.Sh NAME 18.Nm mbrtoc16 , 19.Nm mbrtoc32 , 20.Nm mbrtowc , 21.Nm mbrtowc_l 22.Nd convert characters to wide characters 23.Sh SYNOPSIS 24.In wchar.h 25.Ft size_t 26.Fo mbrtowc 27.Fa "wchar_t *restrict pwc" 28.Fa "const char *restrict str" 29.Fa "size_t len" 30.Fa "mbstate_t *restrict ps" 31.Fc 32.In wchar.h 33.In xlocale.h 34.Ft size_t 35.Fo mbrtowc_l 36.Fa "wchar_t *restrict pwc" 37.Fa "const char *restrict str" 38.Fa "size_t len" 39.Fa "mbstate_t *restrict ps" 40.Fa "locale_t loc" 41.Fc 42.In uchar.h 43.Ft size_t 44.Fo mbrtoc16 45.Fa "char16_t *restrict p16c" 46.Fa "const char *restrict str" 47.Fa "size_t len" 48.Fa "mbstate_t *restrict ps" 49.Fc 50.Ft size_t 51.Fo mbrtoc32 52.Fa "char32_t *restrict p32c" 53.Fa "const char *restrict str" 54.Fa "size_t len" 55.Fa "mbstate_t *restrict ps" 56.Fc 57.Sh DESCRIPTION 58The 59.Fn mbrtoc16 , 60.Fn mbrtoc32 , 61.Fn mbrtowc , 62and 63.Fn mbrtowc_l 64functions convert character sequences, which may contain multi-byte 65characters, into different character formats. 66The functions work in the following formats: 67.Bl -tag -width mbrtowc_l 68.It Fn mbrtoc16 69A UTF-16 code sequence, where every code point is represented by one or 70two 71.Vt char16_t . 72The UTF-16 encoding will encode certain Unicode code points as a pair of 73two 16-bit code sequences, commonly referred to as a surrogate pair. 74.It Fn mbrtoc32 75A UTF-32 code sequence, where every code point is represented by a 76single 77.Vt char32_t .
78.It Fn mbrtowc , Fn mbrtowc_l 79Wide characters, being a 32-bit value where every code point is 80represented by a single 81.Vt wchar_t . 82While the 83.Vt wchar_t 84and 85.Vt char32_t 86are different types, in this implementation, they are similar encodings. 87.El 88.Pp 89The functions consume up to 90.Fa len 91bytes from the string 92.Fa str 93and accumulate them in 94.Fa ps 95until a valid character is found, which is influenced by 96the 97.Dv LC_CTYPE 98category of the current locale. 99For example, in the 100.Sy C 101locale, only ASCII characters are recognized, while in a 102.Sy UTF-8 103based locale like 104.Sy en_US.UTF-8 , 105UTF-8 multi-byte character sequences that represent Unicode code points 106are recognized. 107The 108.Fn mbrtowc_l 109function uses the locale passed in 110.Fa loc 111rather than the locale of the current thread. 112.Pp 113When a valid character sequence has been found, it is converted to 114either a 16-bit character sequence for 115.Fn mbrtoc16 116or a 32-bit character sequence for 117.Fn mbrtoc32 118and will be stored in 119.Fa p16c 120and 121.Fa p32c 122respectively. 123.Pp 124The 125.Fa ps 126argument represents a multi-byte conversion state which can be used 127across multiple calls to a given function 128.Pq but not mixed between functions . 129This allows for characters to be consumed from subsequent buffers, e.g. 130different values of 131.Fa str . 132The functions may be called from multiple threads as long as they use 133unique values for 134.Fa ps . 135If 136.Fa ps 137is 138.Dv NULL , 139then a function-specific buffer will be used for the conversion state; 140however, this is shared between all threads and its use is not 141recommended. 142.Pp 143When using these functions, more than one character may be output for a 144given set of consumed input characters.
145An example of this is when a given code point is represented as a 146surrogate pair in UTF-16, which requires two 16-bit characters to 147represent a code point. 148When this occurs, the functions return the special return value 149.Sy -3 . 150.Pp 151The functions all have a special behavior when 152.Dv NULL 153is passed for 154.Fa str . 155They instead will treat it as though 156.Fa pwc , 157.Fa p16c , 158or 159.Fa p32c 160were 161.Dv NULL , 162.Fa str 163had been passed as the empty string, "" and the length, 164.Fa len , 165would appear as the value 1. 166In other words, the functions would be called as: 167.Bd -literal -offset indent 168mbrtowc(NULL, "", 1, ps) 169mbrtowc_l(NULL, "", 1, ps, loc) 170mbrtoc16(NULL, "", 1, ps) 171mbrtoc32(NULL, "", 1, ps) 172.Ed 173.Ss Locale Details 174Not all locales in the system are Unicode based locales. 175For example, ISO 8859 family locales have code points with values that 176do not match their counterparts in Unicode. 177When using these functions with non-Unicode based locales, the code 178points returned will be those determined by the locale. 179They will not be converted to the corresponding Unicode code point. 180For example, if using the Euro sign in ISO 8859-15, these functions 181might return the code point 0xa4 and not the Unicode value 0x20ac. 182.Pp 183Regardless of the locale, the characters returned will be encoded as 184though the code point were the corresponding value in Unicode. 185This means that if a locale returns a value that would be a surrogate 186pair in the UTF-16 encoding, it will still be encoded as a UTF-16 187character. 188.Pp 189This behavior of the 190.Fn mbrtoc16 191and 192.Fn mbrtoc32 193functions should not be relied upon, is not portable, and is subject to 194change for non-Unicode locales.
195.Sh RETURN VALUES 196The 197.Fn mbrtoc16 , 198.Fn mbrtoc32 , 199.Fn mbrtowc , 200and 201.Fn mbrtowc_l 202functions return the following values: 203.Bl -tag -width (size_t)-3 204.It Sy 0 205.Fa len 206or fewer bytes of 207.Fa str 208were consumed and the null wide character was written into the wide 209character buffer 210.Po 211.Fa pwc , 212.Fa p16c , 213.Fa p32c 214.Pc . 215.It Sy between 1 and len 216The specified number of bytes were consumed and a single character was 217written into the wide character buffer 218.Po 219.Fa pwc , 220.Fa p16c , 221.Fa p32c 222.Pc . 223.It Sy (size_t)-1 224An encoding error has occurred. 225The next 226.Fa len 227bytes of 228.Fa str 229do not contribute to a valid character. 230.Va errno 231has been set to 232.Er EILSEQ . 233No data was written into the wide character buffer 234.Po 235.Fa pwc , 236.Fa p16c , 237.Fa p32c 238.Pc . 239.It Sy (size_t)-2 240.Fa len 241bytes of 242.Fa str 243were consumed, but a complete multi-byte character sequence has not been 244found and no data was written into the wide character buffer 245.Po 246.Fa pwc , 247.Fa p16c , 248.Fa p32c 249.Pc . 250.It Sy (size_t)-3 251A character has been written into the wide character buffer 252.Po 253.Fa pwc , 254.Fa p16c , 255.Fa p32c 256.Pc . 257This character was from a previous call (such as another part of a 258UTF-16 surrogate pair) and no input was consumed. 259This is limited to the 260.Fn mbrtoc16 261and 262.Fn mbrtoc32 263functions. 264.El 265.Sh EXAMPLES 266.Sy Example 1 267Using the 268.Fn mbrtoc32 269function to convert a multibyte string. 
270.Bd -literal 271#include <locale.h> 272#include <stdlib.h> 273#include <string.h> 274#include <err.h> 275#include <stdio.h> 276#include <uchar.h> 277 278int 279main(void) 280{ 281 mbstate_t mbs; 282 char32_t out; 283 size_t ret; 284 const char *uchar_str = "\exe5\ex85\ex89"; 285 286 (void) memset(&mbs, 0, sizeof (mbs)); 287 (void) setlocale(LC_CTYPE, "en_US.UTF-8"); 288 ret = mbrtoc32(&out, uchar_str, strlen(uchar_str), &mbs); 289 if (ret != strlen(uchar_str)) { 290 errx(EXIT_FAILURE, "failed to convert string, got %zd", 291 ret); 292 } 293 294 (void) printf("Converted %zu bytes into UTF-32 character " 295 "0x%x\n", ret, out); 296 return (0); 297} 298.Ed 299.Pp 300When compiled and run, this produces: 301.Bd -literal -offset indent 302$ ./a.out 303Converted 3 bytes into UTF-32 character 0x5149 304.Ed 305.Pp 306.Sy Example 2 307Handling surrogate pairs from the 308.Fn mbrtoc16 309function. 310.Bd -literal 311#include <locale.h> 312#include <stdlib.h> 313#include <string.h> 314#include <err.h> 315#include <stdio.h> 316#include <uchar.h> 317 318int 319main(void) 320{ 321 mbstate_t mbs; 322 char16_t first, second; 323 size_t ret; 324 const char *uchar_str = "\exf0\ex9f\ex92\exa9"; 325 326 (void) memset(&mbs, '\0', sizeof (mbs)); 327 (void) setlocale(LC_CTYPE, "en_US.UTF-8"); 328 ret = mbrtoc16(&first, uchar_str, strlen(uchar_str), &mbs); 329 if (ret != strlen(uchar_str)) { 330 errx(EXIT_FAILURE, "failed to convert string, got %zd", 331 ret); 332 } 333 334 ret = mbrtoc16(&second, "", 0, &mbs); 335 if (ret != (size_t)-3) { 336 errx(EXIT_FAILURE, "didn't get second surrogate pair, " 337 "got %zd", ret); 338 } 339 340 (void) printf("UTF-16 surrogates: 0x%x 0x%x\n", first, second); 341 return (0); 342} 343.Ed 344.Pp 345When compiled and run, this produces: 346.Bd -literal -offset indent 347$ ./a.out 348UTF-16 surrogates: 0xd83d 0xdca9 349.Ed 350.Sh ERRORS 351The 352.Fn mbrtoc16 , 353.Fn mbrtoc32 , 354.Fn mbrtowc , 355and 356.Fn mbrtowc_l 357functions will fail if: 358.Bl 
-tag -width Er 359.It Er EINVAL 360The conversion state in 361.Fa ps 362is invalid. 363.It Er EILSEQ 364An invalid character sequence has been detected. 365.El 366.Sh MT-LEVEL 367The 368.Fn mbrtoc16 , 369.Fn mbrtoc32 , 370.Fn mbrtowc , 371and 372.Fn mbrtowc_l 373functions are 374.Sy MT-Safe 375as long as different 376.Vt mbstate_t 377structures are passed in 378.Fa ps . 379If 380.Fa ps 381is 382.Dv NULL 383or different threads use the same value for 384.Fa ps , 385then the functions are 386.Sy Unsafe . 387.Sh INTERFACE STABILITY 388.Sy Committed 389.Sh SEE ALSO 390.Xr c16rtomb 3C , 391.Xr c32rtomb 3C , 392.Xr newlocale 3C , 393.Xr setlocale 3C , 394.Xr uselocale 3C , 395.Xr wcrtomb 3C , 396.Xr uchar.h 3HEAD , 397.Xr environ 5 398