1.\" 2.\" This file and its contents are supplied under the terms of the 3.\" Common Development and Distribution License ("CDDL"), version 1.0. 4.\" You may only use this file in accordance with the terms of version 5.\" 1.0 of the CDDL. 6.\" 7.\" A full copy of the text of the CDDL should have accompanied this 8.\" source. A copy of the CDDL is also available via the Internet at 9.\" http://www.illumos.org/license/CDDL. 10.\" 11.\" 12.\" Copyright 2020 Robert Mustacchi 13.\" Copyright 2023 Bill Sommerfeld 14.\" 15.Dd June 5, 2023 16.Dt MBRTOC16 3C 17.Os 18.Sh NAME 19.Nm mbrtoc16 , 20.Nm mbrtoc32 , 21.Nm mbrtowc , 22.Nm mbrtowc_l 23.Nd convert characters to wide characters 24.Sh SYNOPSIS 25.In wchar.h 26.Ft size_t 27.Fo mbrtowc 28.Fa "wchar_t *restrict pwc" 29.Fa "const char *restrict str" 30.Fa "size_t len" 31.Fa "mbstate_t *restrict ps" 32.Fc 33.In wchar.h 34.In xlocale.h 35.Ft size_t 36.Fo mbrtowc_l 37.Fa "wchar_t *restrict pwc" 38.Fa "const char *restrict str" 39.Fa "size_t len" 40.Fa "mbstate_t *restrict ps" 41.Fa "locale_t loc" 42.Fc 43.In uchar.h 44.Ft size_t 45.Fo mbrtoc16 46.Fa "char16_t *restrict p16c" 47.Fa "const char *restrict str" 48.Fa "size_t len" 49.Fa "mbstate_t *restrict ps" 50.Fc 51.Ft size_t 52.Fo mbrtoc32 53.Fa "char32_t *restrict p32c" 54.Fa "const char *restrict str" 55.Fa "size_t len" 56.Fa "mbstate_t *restrict ps" 57.Fc 58.Sh DESCRIPTION 59The 60.Fn mbrtoc16 , 61.Fn mbrtoc32 , 62.Fn mbrtowc , 63and 64.Fn mbrtowc_l 65functions convert character sequences, which may contain multi-byte 66characters, into different character formats. 67The functions work in the following formats: 68.Bl -tag -width mbrtowc_l 69.It Fn mbrtoc16 70A UTF-16 code sequence, where every code point is represented by one or 71two 72.Vt char16_t . 73The UTF-16 encoding will encode certain Unicode code points as a pair of 74two 16-bit code sequences, commonly referred to as a surrogate pair. 
75.It Fn mbrtoc32 76A UTF-32 code sequence, where every code point is represented by a 77single 78.Vt char32_t . 79.It Fn mbrtowc , Fn mbrtowc_l 80Wide characters, being a 32-bit value where every code point is 81represented by a single 82.Vt wchar_t . 83While the 84.Vt wchar_t 85and 86.Vt char32_t 87are different types, in this implementation, they are similar encodings. 88.El 89.Pp 90The functions consume up to 91.Fa len 92characters from the string 93.Fa str 94and accumulate them in 95.Fa ps 96until a valid character is found, which is influenced by 97the 98.Dv LC_CTYPE 99category of the current locale. 100For example, in the 101.Sy C 102locale, only ASCII characters are recognized, while in a 103.Sy UTF-8 104based locale like 105.Sy en_US.UTF-8 , 106UTF-8 multi-byte character sequences that represent Unicode code points 107are recognized. 108The 109.Fn mbrtowc_l 110function uses the locale passed in 111.Fa loc 112rather than the locale of the current thread. 113.Pp 114When a valid character sequence has been found, it is converted to 115either a 16-bit character sequence for 116.Fn mbrtoc16 117or a 32-bit character sequence for 118.Fn mbrtoc32 119and will be stored in 120.Fa p16c 121and 122.Fa p32c 123respectively. 124.Pp 125The 126.Fa ps 127argument represents a multi-byte conversion state which can be used 128across multiple calls to a given function 129.Pq but not mixed between functions . 130This allows for characters to be consumed from subsequent buffers, e.g. 131different values of 132.Fa str . 133The functions may be called from multiple threads as long as they use 134unique values for 135.Fa ps . 136If 137.Fa ps 138is 139.Dv NULL , 140then a function-specific buffer will be used for the conversion state; 141however, this is shared between all threads and its use is not 142recommended. 143.Pp 144When using these functions, more than one character may be output for a 145given set of consumed input characters. 
146An example of this is when a given code point is represented as a 147surrogate pair in UTF-16, which requires two 16-bit characters to 148represent a code point. 149When this occurs, the functions return the special return value 150.Sy -3 . 151.Pp 152The functions all have a special behavior when 153.Dv NULL 154is passed for 155.Fa str . 156They instead will treat it as though 157.Fa pwc , 158.Fa p16c , 159or 160.Fa p32c 161were 162.Dv NULL , 163.Fa str 164had been passed as the empty string, "" and the length, 165.Fa len , 166would appear as the value 1. 167In other words, the functions would be called as: 168.Bd -literal -offset indent 169mbrtowc(NULL, "", 1, ps) 170mbrtowc_l(NULL, "", 1, ps, loc) 171mbrtoc16(NULL, "", 1, ps) 172mbrtoc32(NULL, "", 1, ps) 173.Ed 174.Ss Locale Details 175Not all locales in the system are Unicode based locales. 176For example, ISO 8859 family locales have code points with values that 177do not match their counterparts in Unicode. 178When using these functions with non-Unicode based locales, the code 179points returned will be those determined by the locale. 180They will not be converted to the corresponding Unicode code point. 181For example, if using the Euro sign in ISO 8859-15, these functions 182might return the code point 0xa4 and not the Unicode value 0x20ac. 183.Pp 184Regardless of the locale, the characters returned will be encoded as 185though the code point were the corresponding value in Unicode. 186This means that if a locale returns a value that would be a surrogate 187pair in the UTF-16 encoding, it will still be encoded as a UTF-16 188character. 189.Pp 190This behavior of the 191.Fn mbrtoc16 192and 193.Fn mbrtoc32 194functions should not be relied upon, is not portable, and is subject to 195change for non-Unicode locales. 
196.Sh RETURN VALUES 197The 198.Fn mbrtoc16 , 199.Fn mbrtoc32 , 200.Fn mbrtowc , 201and 202.Fn mbrtowc_l 203functions return the following values: 204.Bl -tag -width (size_t)-3 205.It Sy 0 206.Fa len 207or fewer bytes of 208.Fa str 209were consumed and the null wide character was written into the wide 210character buffer 211.Po 212.Fa pwc , 213.Fa p16c , 214.Fa p32c 215.Pc . 216.It Sy between 1 and len 217The specified number of bytes were consumed and a single character was 218written into the wide character buffer 219.Po 220.Fa pwc , 221.Fa p16c , 222.Fa p32c 223.Pc . 224.It Sy (size_t)-1 225An encoding error has occurred. 226The next 227.Fa len 228bytes of 229.Fa str 230do not contribute to a valid character. 231.Va errno 232has been set to 233.Er EILSEQ . 234No data was written into the wide character buffer 235.Po 236.Fa pwc , 237.Fa p16c , 238.Fa p32c 239.Pc . 240.It Sy (size_t)-2 241.Fa len 242bytes of 243.Fa str 244were consumed, but a complete multi-byte character sequence has not been 245found and no data was written into the wide character buffer 246.Po 247.Fa pwc , 248.Fa p16c , 249.Fa p32c 250.Pc . 251.It Sy (size_t)-3 252A character has been written into the wide character buffer 253.Po 254.Fa pwc , 255.Fa p16c , 256.Fa p32c 257.Pc . 258This character was from a previous call (such as another part of a 259UTF-16 surrogate pair) and no input was consumed. 260This is limited to the 261.Fn mbrtoc16 262and 263.Fn mbrtoc32 264functions. 265.El 266.Sh EXAMPLES 267.Sy Example 1 268Using the 269.Fn mbrtoc32 270function to convert a multibyte string. 
271.Bd -literal 272#include <locale.h> 273#include <stdlib.h> 274#include <string.h> 275#include <err.h> 276#include <stdio.h> 277#include <uchar.h> 278 279int 280main(void) 281{ 282 mbstate_t mbs; 283 char32_t out; 284 size_t ret; 285 const char *uchar_str = "\exe5\ex85\ex89"; 286 287 (void) memset(&mbs, 0, sizeof (mbs)); 288 (void) setlocale(LC_CTYPE, "en_US.UTF-8"); 289 ret = mbrtoc32(&out, uchar_str, strlen(uchar_str), &mbs); 290 if (ret != strlen(uchar_str)) { 291 errx(EXIT_FAILURE, "failed to convert string, got %zd", 292 ret); 293 } 294 295 (void) printf("Converted %zu bytes into UTF-32 character " 296 "0x%x\n", ret, out); 297 return (0); 298} 299.Ed 300.Pp 301When compiled and run, this produces: 302.Bd -literal -offset indent 303$ ./a.out 304Converted 3 bytes into UTF-32 character 0x5149 305.Ed 306.Pp 307.Sy Example 2 308Handling surrogate pairs from the 309.Fn mbrtoc16 310function. 311.Bd -literal 312#include <locale.h> 313#include <stdlib.h> 314#include <string.h> 315#include <err.h> 316#include <stdio.h> 317#include <uchar.h> 318 319int 320main(void) 321{ 322 mbstate_t mbs; 323 char16_t first, second; 324 size_t ret; 325 const char *uchar_str = "\exf0\ex9f\ex92\exa9"; 326 327 (void) memset(&mbs, 0, sizeof (mbs)); 328 (void) setlocale(LC_CTYPE, "en_US.UTF-8"); 329 ret = mbrtoc16(&first, uchar_str, strlen(uchar_str), &mbs); 330 if (ret != strlen(uchar_str)) { 331 errx(EXIT_FAILURE, "failed to convert string, got %zd", 332 ret); 333 } 334 335 ret = mbrtoc16(&second, "", 0, &mbs); 336 if (ret != (size_t)-3) { 337 errx(EXIT_FAILURE, "didn't get second surrogate pair, " 338 "got %zd", ret); 339 } 340 341 (void) printf("UTF-16 surrogates: 0x%x 0x%x\n", first, second); 342 return (0); 343} 344.Ed 345.Pp 346When compiled and run, this produces: 347.Bd -literal -offset indent 348$ ./a.out 349UTF-16 surrogates: 0xd83d 0xdca9 350.Ed 351.Sh ERRORS 352The 353.Fn mbrtoc16 , 354.Fn mbrtoc32 , 355.Fn mbrtowc , 356and 357.Fn mbrtowc_l 358functions will fail if: 359.Bl 
-tag -width Er 360.It Er EINVAL 361The conversion state in 362.Fa ps 363is invalid. 364.It Er EILSEQ 365An invalid character sequence has been detected. 366.El 367.Sh MT-LEVEL 368The 369.Fn mbrtoc16 , 370.Fn mbrtoc32 , 371.Fn mbrtowc , 372and 373.Fn mbrtowc_l 374functions are 375.Sy MT-Safe 376as long as different 377.Vt mbstate_t 378structures are passed in 379.Fa ps . 380If 381.Fa ps 382is 383.Dv NULL 384or different threads use the same value for 385.Fa ps , 386then the functions are 387.Sy Unsafe . 388.Sh INTERFACE STABILITY 389.Sy Committed 390.Sh SEE ALSO 391.Xr c16rtomb 3C , 392.Xr c32rtomb 3C , 393.Xr newlocale 3C , 394.Xr setlocale 3C , 395.Xr uselocale 3C , 396.Xr wcrtomb 3C , 397.Xr uchar.h 3HEAD , 398.Xr environ 7 399