xref: /freebsd/contrib/xz/src/common/tuklib_mbstr_width.c (revision 128836d304d93f2d00eb14069c27089ab46c38d4)
1 // SPDX-License-Identifier: 0BSD
2 
3 ///////////////////////////////////////////////////////////////////////////////
4 //
5 /// \file       tuklib_mbstr_width.c
6 /// \brief      Calculate width of a multibyte string
7 //
8 //  Author:     Lasse Collin
9 //
10 ///////////////////////////////////////////////////////////////////////////////
11 
12 #include "tuklib_mbstr.h"
13 #include <string.h>
14 
15 #ifdef HAVE_MBRTOWC
16 #	include <wchar.h>
17 #endif
18 
19 
/// \brief      Get the width of a NUL-terminated multibyte string
///
/// \param      str     NUL-terminated multibyte string
/// \param      bytes   If not NULL, *bytes is set to strlen(str).
///
/// \return     Whatever tuklib_mbstr_width_mem() returns when given str
///             and its strlen() length.
extern size_t
tuklib_mbstr_width(const char *str, size_t *bytes)
{
	const size_t byte_count = strlen(str);

	// Reporting the byte length is optional for the caller.
	if (bytes != NULL) {
		*bytes = byte_count;
	}

	return tuklib_mbstr_width_mem(str, byte_count);
}
29 
30 
/// \brief      Get the width of a multibyte string of a given byte length
///
/// \param      str     Multibyte string; only the first len bytes are read.
/// \param      len     Number of bytes to examine
///
/// \return     Accumulated width of the len bytes on success. (size_t)-1
///             is returned if mbrtowc() cannot decode a complete, non-null
///             character from the remaining bytes, if wcwidth() returns
///             a negative value, or if the conversion state is not the
///             initial state after the last byte. When HAVE_MBRTOWC isn't
///             defined, len is returned as is.
extern size_t
tuklib_mbstr_width_mem(const char *str, size_t len)
{
#ifdef HAVE_MBRTOWC
	// Start decoding from a zero-filled conversion state.
	mbstate_t mbs;
	memset(&mbs, 0, sizeof(mbs));

	size_t columns = 0;

	// Decode one multibyte character per iteration and add up
	// the widths of the resulting wide characters.
	for (size_t pos = 0; pos < len; ) {
		const size_t remaining = len - pos;
		wchar_t ch;
		const size_t used = mbrtowc(&ch, str + pos, remaining, &mbs);

		// Zero means a null character was decoded. The error
		// values (size_t)-1 and (size_t)-2 are always bigger than
		// the number of bytes that were available.
		if (used == 0 || used > remaining)
			return (size_t)-1;

		pos += used;

#	ifdef HAVE_WCWIDTH
		// A negative width is treated as a failure for
		// the whole string.
		const int cols = wcwidth(ch);
		if (cols < 0)
			return (size_t)-1;

		columns += (size_t)cols;
#	else
		// No wcwidth() available (e.g. a native Windows build):
		// count every multibyte character as one column. For UTF-8
		// this is a better guess than one column per byte and it
		// works for many languages, although CJK characters will
		// be counted very wrong.
		++columns;
#	endif
	}

	// Require that decoding ended in the initial state. In practice
	// this check is redundant: character sets with locking shift
	// states aren't expected to be used with this code, and a partial
	// trailing character ((size_t)-2 from mbrtowc()) was already
	// rejected inside the loop.
	return mbsinit(&mbs) ? columns : (size_t)-1;

#else
	// Single-byte mode: every byte is assumed to take one column,
	// so the width equals the length.
	(void)str;
	return len;
#endif
}
87