xref: /freebsd/contrib/xz/src/common/tuklib_mbstr_nonprint.c (revision 128836d304d93f2d00eb14069c27089ab46c38d4)
// SPDX-License-Identifier: 0BSD

///////////////////////////////////////////////////////////////////////////////
//
/// \file       tuklib_mbstr_nonprint.c
/// \brief      Find and replace non-printable characters with question marks
//
//  Author:     Lasse Collin
//
///////////////////////////////////////////////////////////////////////////////
11*128836d3SXin LI 
12*128836d3SXin LI #include "tuklib_mbstr_nonprint.h"
13*128836d3SXin LI #include <stdlib.h>
14*128836d3SXin LI #include <string.h>
15*128836d3SXin LI #include <errno.h>
16*128836d3SXin LI 
17*128836d3SXin LI #ifdef HAVE_MBRTOWC
18*128836d3SXin LI #	include <wchar.h>
19*128836d3SXin LI #	include <wctype.h>
20*128836d3SXin LI #else
21*128836d3SXin LI #	include <ctype.h>
22*128836d3SXin LI #endif
23*128836d3SXin LI 
24*128836d3SXin LI 
/// \brief      Check if the next character in the string is printable
///
/// \param      str       Pointer to the first byte of the character to
///                       examine
/// \param      len       Number of bytes available at str. It is expected
///                       to be at least one and to exclude the terminating
///                       '\0' (that is, it is derived from strlen()).
/// \param[out] next_len  Set to the number of bytes that the examined
///                       character occupies. With len >= 1 this is always
///                       in the range [1, len].
///
/// \return     true if the character is printable according to iswprint()
///             (or isprint() when mbrtowc() isn't available), false
///             otherwise. Invalid and incomplete multibyte sequences are
///             reported as non-printable.
static bool
is_next_printable(const char *str, size_t len, size_t *next_len)
{
#ifdef HAVE_MBRTOWC
	// This assumes that character sets with locking shift states aren't
	// used, and thus mbsinit() is never needed.
	mbstate_t ps;
	memset(&ps, 0, sizeof(ps));

	wchar_t wc;
	*next_len = mbrtowc(&wc, str, len, &ps);

	if (*next_len == (size_t)-2) {
		// Incomplete multibyte sequence: Treat the whole sequence
		// as a single non-printable multibyte character that ends
		// the string.
		*next_len = len;
		return false;
	}

	// Check more broadly than just ret == (size_t)-1 to be safe
	// in case mbrtowc() returns something weird. This check
	// covers (size_t)-1 (that is, SIZE_MAX) too because len is from
	// strlen() and the terminating '\0' isn't part of the length.
	if (*next_len < 1 || *next_len > len) {
		// Invalid multibyte sequence: Treat the first byte as
		// a non-printable single-byte character. Decoding will
		// be restarted from the next byte on the next call to
		// this function.
		*next_len = 1;
		return false;
	}

#	if defined(_WIN32) && !defined(__CYGWIN__)
	// On Windows, wchar_t stores UTF-16 code units, thus characters
	// outside the Basic Multilingual Plane (BMP) don't fit into
	// a single wchar_t. In an UTF-8 locale, UCRT's mbrtowc() returns
	// successfully when the input is a non-BMP character but the
	// output is the replacement character U+FFFD.
	//
	// iswprint() returns 0 for U+FFFD on Windows for some reason. Treat
	// U+FFFD as printable and thus also all non-BMP chars as printable.
	if (wc == 0xFFFD)
		return true;
#	endif

	return iswprint((wint_t)wc) != 0;
#else
	// Without mbrtowc() every byte is treated as one character.
	// The cast to unsigned char keeps the isprint() argument in
	// the range that <ctype.h> functions accept.
	(void)len;
	*next_len = 1;
	return isprint((unsigned char)str[0]) != 0;
#endif
}
78*128836d3SXin LI 
79*128836d3SXin LI 
/// \brief      Check if a string contains any non-printable characters
///
/// \param      str     String to scan; only the first len bytes are
///                     examined
/// \param      len     Number of bytes to scan
///
/// \return     true as soon as is_next_printable() reports a
///             non-printable character, false if all len bytes were
///             consumed without finding one (including when len is zero).
static bool
has_nonprint(const char *str, size_t len)
{
	// There is no increment in the for-statement because the step size
	// depends on how many bytes the current character occupies.
	for (size_t i = 0; i < len; ) {
		size_t next_len;
		if (!is_next_printable(str + i, len - i, &next_len))
			return true;

		i += next_len;
	}

	return false;
}
93*128836d3SXin LI 
94*128836d3SXin LI 
/// \brief      Check if a null-terminated string has non-printable characters
///
/// \param      str     Null-terminated string to check
///
/// \return     true if has_nonprint() finds at least one non-printable
///             character in str, false otherwise.
///
/// The value of errno is saved on entry and restored before returning,
/// so this function never changes the caller's errno.
extern bool
tuklib_has_nonprint(const char *str)
{
	const int saved_errno = errno;
	const bool ret = has_nonprint(str, strlen(str));
	errno = saved_errno;
	return ret;
}
103*128836d3SXin LI 
104*128836d3SXin LI 
/// \brief      Replace non-printable characters with question marks
///
/// \param      str     Null-terminated string to mask
/// \param      mem     Pointer to a pointer that is either NULL or was
///                     set by an earlier call to this function. The old
///                     *mem is always passed to free() first. On return
///                     *mem is NULL unless a new masked copy was
///                     allocated, in which case *mem points to it.
///
/// \return     - str itself if it contains only printable characters
///             - *mem (a newly allocated masked copy) if something had
///               to be masked
///             - the string literal "???" if memory allocation failed
///
/// Each non-printable character, including a multibyte one, is replaced
/// with a single '?'. The value of errno is saved on entry and restored
/// before returning on every path.
extern const char *
tuklib_mask_nonprint_r(const char *str, char **mem)
{
	const int saved_errno = errno;

	// Free the old string, if any.
	free(*mem);
	*mem = NULL;

	// If the whole input string contains only printable characters,
	// return the input string.
	const size_t len = strlen(str);
	if (!has_nonprint(str, len)) {
		errno = saved_errno;
		return str;
	}

	// Allocate memory for the masked string. Since we use the single-byte
	// character '?' to mask non-printable characters, it's possible that
	// a few bytes less memory would be needed in reality if multibyte
	// characters are masked.
	//
	// If allocation fails, return "???" because it should be safer than
	// returning the unmasked string.
	*mem = malloc(len + 1);
	if (*mem == NULL) {
		errno = saved_errno;
		return "???";
	}

	// Replace all non-printable characters with '?'.
	char *dest = *mem;

	for (size_t i = 0; i < len; ) {
		size_t next_len;
		if (is_next_printable(str + i, len - i, &next_len)) {
			// Printable: copy all bytes of the character as is.
			memcpy(dest, str + i, next_len);
			dest += next_len;
		} else {
			// Non-printable: one '?' replaces the whole
			// character no matter how many bytes it had.
			*dest++ = '?';
		}

		i += next_len;
	}

	*dest = '\0';

	errno = saved_errno;
	return *mem;
}
155*128836d3SXin LI 
156*128836d3SXin LI 
/// \brief      Replace non-printable characters with question marks
///
/// \param      str     Null-terminated string to mask
///
/// \return     Whatever tuklib_mask_nonprint_r() returns for str.
///
/// This is a convenience wrapper around tuklib_mask_nonprint_r() that
/// keeps the allocation in a single function-local static pointer.
/// Because that pointer is shared by all calls and the previous
/// allocation is freed on every call, a masked string returned earlier
/// must not be used after calling this function again.
extern const char *
tuklib_mask_nonprint(const char *str)
{
	static char *mem = NULL;
	return tuklib_mask_nonprint_r(str, &mem);
}
163