1 // SPDX-License-Identifier: 0BSD
2
3 ///////////////////////////////////////////////////////////////////////////////
4 //
5 /// \file tuklib_mbstr_nonprint.c
6 /// \brief Find and replace non-printable characters with question marks
7 //
8 // Author: Lasse Collin
9 //
10 ///////////////////////////////////////////////////////////////////////////////
11
12 #include "tuklib_mbstr_nonprint.h"
13 #include <stdlib.h>
14 #include <string.h>
15 #include <errno.h>
16
17 #ifdef HAVE_MBRTOWC
18 # include <wchar.h>
19 # include <wctype.h>
20 #else
21 # include <ctype.h>
22 #endif
23
24
/// \brief      Classify the character that starts at the beginning of a buffer
///
/// \param      str       Pointer to the first byte of the character
/// \param      len       Number of bytes available at str. The callers in
///                       this file always pass at least 1 and derive
///                       the value from strlen().
/// \param[out] next_len  Set to the number of bytes the caller should skip
///                       to reach the next character
///
/// \return     true if the character is printable, false otherwise.
static bool
is_next_printable(const char *str, size_t len, size_t *next_len)
{
#ifdef HAVE_MBRTOWC
	// Every call starts decoding from the initial conversion state.
	// Character sets with locking shift states are assumed not to be
	// in use, so nothing is carried over between calls and mbsinit()
	// is never consulted.
	mbstate_t state;
	memset(&state, 0, sizeof(state));

	wchar_t decoded;
	const size_t consumed = mbrtowc(&decoded, str, len, &state);

	if (consumed == (size_t)-2) {
		// The remaining bytes form only the beginning of a multibyte
		// character. Report all of them as one non-printable
		// character that reaches the end of the string.
		*next_len = len;
		return false;
	}

	// Reject anything outside 1..len instead of testing only for
	// (size_t)-1. Since len originates from strlen(), it is always
	// smaller than SIZE_MAX, so (size_t)-1 is rejected here along
	// with any other unexpected return value.
	if (consumed == 0 || consumed > len) {
		// Invalid multibyte sequence: consume just one byte as
		// a non-printable character. The next call restarts
		// decoding from the byte that follows.
		*next_len = 1;
		return false;
	}

	*next_len = consumed;

# if defined(_WIN32) && !defined(__CYGWIN__)
	// wchar_t holds UTF-16 code units on Windows, so a character
	// outside the Basic Multilingual Plane cannot be stored in
	// a single wchar_t. In a UTF-8 locale, UCRT's mbrtowc() still
	// reports success for such input but stores the replacement
	// character U+FFFD.
	//
	// iswprint() returns 0 for U+FFFD on Windows. Accept U+FFFD as
	// printable, which makes all non-BMP characters printable too.
	if (decoded == 0xFFFD)
		return true;
# endif

	return iswprint((wint_t)decoded) != 0;
#else
	// Without mbrtowc() every byte is treated as one character.
	(void)len;

	const unsigned char first = (unsigned char)str[0];
	*next_len = 1;
	return isprint(first) != 0;
#endif
}
78
79
/// \brief      Check if a buffer contains any non-printable characters
///
/// \param      str     Beginning of the buffer to scan
/// \param      len     Number of bytes to scan
///
/// \return     true as soon as a non-printable character is found,
///             false if the end of the buffer is reached without one.
static bool
has_nonprint(const char *str, size_t len)
{
	size_t pos = 0;

	while (pos < len) {
		// is_next_printable() tells how many bytes the current
		// character occupies so that the scan advances one
		// character at a time.
		size_t char_len;
		const bool printable = is_next_printable(
				str + pos, len - pos, &char_len);

		if (!printable)
			return true;

		pos += char_len;
	}

	return false;
}
93
94
/// \brief      Check if a null-terminated string has non-printable characters
///
/// \param      str     Null-terminated string to examine
///
/// \return     true if at least one non-printable character was found,
///             false otherwise. The value of errno seen by the caller
///             is the same after the call as it was before it.
extern bool
tuklib_has_nonprint(const char *str)
{
	// The scan may modify errno, so take a copy first and put it
	// back before returning.
	const int errno_backup = errno;

	const size_t len = strlen(str);
	const bool found = has_nonprint(str, len);

	errno = errno_backup;
	return found;
}
103
104
/// \brief      Replace non-printable characters with question marks
///
/// \param      str     Null-terminated string to mask
/// \param      mem     Pointer to the pointer that holds the allocation
///                     made by a previous call, or to a NULL pointer.
///                     The old allocation is always freed. On return
///                     *mem is either NULL or the newly allocated
///                     masked string.
///
/// \return     str itself if it contains only printable characters,
///             *mem if a masked copy was created, or the string literal
///             "???" if memory allocation failed. The value of errno
///             seen by the caller is preserved in every case.
extern const char *
tuklib_mask_nonprint_r(const char *str, char **mem)
{
	const int errno_backup = errno;

	// Release the result of the previous call, if there was one.
	free(*mem);
	*mem = NULL;

	const size_t len = strlen(str);

	// When everything is printable the input is returned as is and
	// nothing is allocated.
	const char *result = str;

	if (has_nonprint(str, len)) {
		// Each masked character becomes the single byte '?', so
		// the output never needs more room than the input. When
		// multibyte characters get masked, a few of these bytes
		// remain unused.
		char *buf = malloc(len + 1);
		*mem = buf;

		if (buf == NULL) {
			// Returning "???" should be safer than returning
			// the unmasked string.
			result = "???";
		} else {
			char *out = buf;
			size_t pos = 0;

			while (pos < len) {
				size_t char_len;
				if (is_next_printable(str + pos, len - pos,
						&char_len)) {
					// Printable: copy all of its bytes.
					memcpy(out, str + pos, char_len);
					out += char_len;
				} else {
					// Non-printable: a single '?' takes
					// the place of the whole character.
					*out++ = '?';
				}

				pos += char_len;
			}

			*out = '\0';
			result = buf;
		}
	}

	errno = errno_backup;
	return result;
}
155
156
/// \brief      Replace non-printable characters with question marks
///
/// This calls tuklib_mask_nonprint_r() with a buffer pointer that is
/// private to this function and persists between calls. Because
/// tuklib_mask_nonprint_r() frees the old buffer on every call,
/// a pointer to a masked copy returned by an earlier call must not be
/// used after calling this function again.
///
/// \param      str     Null-terminated string to mask
///
/// \return     Whatever tuklib_mask_nonprint_r() returns for str.
extern const char *
tuklib_mask_nonprint(const char *str)
{
	// One buffer pointer shared by all calls to this function.
	static char *shared_buf = NULL;

	const char *masked = tuklib_mask_nonprint_r(str, &shared_buf);
	return masked;
}
163