xref: /freebsd/contrib/xz/src/common/tuklib_mbstr_nonprint.c (revision 128836d304d93f2d00eb14069c27089ab46c38d4)
1 // SPDX-License-Identifier: 0BSD
2 
3 ///////////////////////////////////////////////////////////////////////////////
4 //
5 /// \file       tuklib_mbstr_nonprint.c
6 /// \brief      Find and replace non-printable characters with question marks
7 //
8 //  Author:     Lasse Collin
9 //
10 ///////////////////////////////////////////////////////////////////////////////
11 
12 #include "tuklib_mbstr_nonprint.h"
13 #include <stdlib.h>
14 #include <string.h>
15 #include <errno.h>
16 
17 #ifdef HAVE_MBRTOWC
18 #	include <wchar.h>
19 #	include <wctype.h>
20 #else
21 #	include <ctype.h>
22 #endif
23 
24 
/// \brief      Classify the next character of a string as printable or not
///
/// \param      str         Start of the character to examine
/// \param      len         Number of bytes available at str
/// \param[out] next_len    Set to the number of bytes the caller should
///                         skip to reach the following character
///
/// \return     true if the character is printable, false otherwise.
static bool
is_next_printable(const char *str, size_t len, size_t *next_len)
{
#ifdef HAVE_MBRTOWC
	// Start every call from a zeroed conversion state. This relies on
	// character sets with locking shift states not being in use, so
	// mbsinit() is never consulted.
	mbstate_t state;
	memset(&state, 0, sizeof(state));

	wchar_t decoded;
	const size_t consumed = mbrtowc(&decoded, str, len, &state);

	if (consumed == (size_t)-2) {
		// The input ends in the middle of a multibyte sequence.
		// Report the whole remainder as one non-printable character
		// that terminates the string.
		*next_len = len;
		return false;
	}

	// Deliberately broader than comparing against (size_t)-1 so that
	// any unexpected return value from mbrtowc() is rejected too.
	// (size_t)-1 is SIZE_MAX and is therefore caught by "consumed > len":
	// len comes from strlen(), which excludes the terminating '\0'.
	if (consumed == 0 || consumed > len) {
		// Invalid sequence: consume only the first byte as a
		// non-printable single-byte character. The next call
		// restarts decoding from the byte that follows it.
		*next_len = 1;
		return false;
	}

	*next_len = consumed;

#	if defined(_WIN32) && !defined(__CYGWIN__)
	// wchar_t holds UTF-16 code units on Windows, so a character outside
	// the Basic Multilingual Plane cannot be stored in one wchar_t. In
	// a UTF-8 locale UCRT's mbrtowc() still succeeds for such input but
	// yields the replacement character U+FFFD.
	//
	// Windows' iswprint() returns 0 for U+FFFD, so accept U+FFFD here
	// explicitly; this makes all non-BMP characters count as printable.
	if (decoded == 0xFFFD)
		return true;
#	endif

	return iswprint((wint_t)decoded) != 0;
#else
	// Without mbrtowc() every byte is treated as one character.
	(void)len;

	const unsigned char byte = (unsigned char)str[0];
	*next_len = 1;
	return isprint(byte) != 0;
#endif
}
78 
79 
/// \brief      Check if the first len bytes of str contain any character
///             that is_next_printable() reports as non-printable
///
/// \return     true as soon as a non-printable character is found,
///             false if the end is reached without finding one.
static bool
has_nonprint(const char *str, size_t len)
{
	size_t pos = 0;

	while (pos < len) {
		// is_next_printable() reports how many bytes the current
		// character occupies so that the scan can step over it.
		size_t char_len;
		const bool printable = is_next_printable(
				str + pos, len - pos, &char_len);

		if (!printable)
			return true;

		pos += char_len;
	}

	return false;
}
93 
94 
/// \brief      Check if a null-terminated string contains any
///             non-printable characters
///
/// The value of errno on entry is restored before returning.
///
/// \return     true if at least one non-printable character was found.
extern bool
tuklib_has_nonprint(const char *str)
{
	const int errno_backup = errno;

	const size_t length = strlen(str);
	const bool found = has_nonprint(str, length);

	errno = errno_backup;
	return found;
}
103 
104 
/// \brief      Return a version of str in which every non-printable
///             character has been replaced with a question mark
///
/// \param      str     Null-terminated input string
/// \param      mem     Pointer to the buffer pointer owned by this function.
///                     The old *mem is always freed. Afterwards *mem is
///                     either NULL or points to the newly allocated
///                     masked string.
///
/// \return     str itself if it has no non-printable characters, the
///             masked copy stored in *mem if it has, or the string literal
///             "???" if memory allocation failed.
///
/// The value of errno on entry is restored before returning.
extern const char *
tuklib_mask_nonprint_r(const char *str, char **mem)
{
	const int errno_backup = errno;

	// Release the result of the previous call, if any.
	free(*mem);
	*mem = NULL;

	// When everything is printable the input is returned unmodified and
	// no allocation is made.
	const char *result = str;
	const size_t len = strlen(str);

	if (has_nonprint(str, len)) {
		// Each masked character becomes a single '?' byte, so the
		// output never needs more than len bytes plus the terminator.
		// It may need slightly less when multibyte characters are
		// masked.
		char *out = malloc(len + 1);
		*mem = out;

		if (out == NULL) {
			// "???" is considered safer than handing back
			// the unmasked string.
			result = "???";
		} else {
			size_t pos = 0;

			while (pos < len) {
				size_t char_len;
				if (!is_next_printable(str + pos, len - pos,
						&char_len)) {
					*out++ = '?';
				} else {
					memcpy(out, str + pos, char_len);
					out += char_len;
				}

				pos += char_len;
			}

			*out = '\0';
			result = *mem;
		}
	}

	errno = errno_backup;
	return result;
}
155 
156 
/// \brief      Variant of tuklib_mask_nonprint_r() that keeps its buffer
///             in a function-local static variable
///
/// Because the buffer is shared between calls, a returned masked string
/// is freed by the next call to this function.
extern const char *
tuklib_mask_nonprint(const char *str)
{
	static char *buffer = NULL;

	const char *masked = tuklib_mask_nonprint_r(str, &buffer);
	return masked;
}
163