// SPDX-License-Identifier: 0BSD

///////////////////////////////////////////////////////////////////////////////
//
/// \file       tuklib_mbstr_nonprint.c
/// \brief      Find and replace non-printable characters with question marks
//
//  Author:     Lasse Collin
//
///////////////////////////////////////////////////////////////////////////////
11*128836d3SXin LI
#include "tuklib_mbstr_nonprint.h"
#include <stdlib.h>
#include <string.h>
#include <errno.h>

#ifdef HAVE_MBRTOWC
#	include <wchar.h>
#	include <wctype.h>
#else
#	include <ctype.h>
#endif
23*128836d3SXin LI
24*128836d3SXin LI
/// \brief      Check if the next character of a string is printable
///
/// With HAVE_MBRTOWC, one multibyte character is decoded with mbrtowc()
/// and classified with iswprint(). Without it, a single byte is classified
/// with isprint().
///
/// \param      str       Pointer to the first byte of the next character
/// \param      len       Number of bytes available at str. The range check
///                       below relies on this coming from strlen(), that is,
///                       the terminating '\0' isn't included.
/// \param[out] next_len  Number of bytes the caller should skip to reach
///                       the following character. It is always set, also
///                       when false is returned: an incomplete sequence
///                       consumes all of len, an invalid sequence consumes
///                       one byte.
///
/// \return     true if the character is printable; false if it is
///             non-printable or if the bytes don't form a valid and
///             complete multibyte character.
static bool
is_next_printable(const char *str, size_t len, size_t *next_len)
{
#ifdef HAVE_MBRTOWC
	// This assumes that character sets with locking shift states aren't
	// used, and thus mbsinit() is never needed.
	mbstate_t ps;
	memset(&ps, 0, sizeof(ps));

	wchar_t wc;
	*next_len = mbrtowc(&wc, str, len, &ps);

	if (*next_len == (size_t)-2) {
		// Incomplete multibyte sequence: Treat the whole sequence
		// as a single non-printable multibyte character that ends
		// the string.
		*next_len = len;
		return false;
	}

	// Check more broadly than just *next_len == (size_t)-1 to be safe
	// in case mbrtowc() returns something weird. This check
	// covers (size_t)-1 (that is, SIZE_MAX) too because len is from
	// strlen() and the terminating '\0' isn't part of the length.
	if (*next_len < 1 || *next_len > len) {
		// Invalid multibyte sequence: Treat the first byte as
		// a non-printable single-byte character. Decoding will
		// be restarted from the next byte on the next call to
		// this function.
		*next_len = 1;
		return false;
	}

#	if defined(_WIN32) && !defined(__CYGWIN__)
	// On Windows, wchar_t stores UTF-16 code units, thus characters
	// outside the Basic Multilingual Plane (BMP) don't fit into
	// a single wchar_t. In an UTF-8 locale, UCRT's mbrtowc() returns
	// successfully when the input is a non-BMP character but the
	// output is the replacement character U+FFFD.
	//
	// iswprint() returns 0 for U+FFFD on Windows for some reason. Treat
	// U+FFFD as printable and thus also all non-BMP chars as printable.
	if (wc == 0xFFFD)
		return true;
#	endif

	return iswprint((wint_t)wc) != 0;
#else
	// No multibyte support: every character is exactly one byte.
	// The cast to unsigned char keeps the isprint() argument in
	// the range that <ctype.h> functions accept.
	(void)len;
	*next_len = 1;
	return isprint((unsigned char)str[0]) != 0;
#endif
}
78*128836d3SXin LI
79*128836d3SXin LI
/// \brief      Check if a buffer contains any non-printable characters
///
/// \param      str     Beginning of the string
/// \param      len     Number of bytes to examine at str
///
/// \return     true as soon as is_next_printable() reports a non-printable
///             character; false if every character in the first len bytes
///             is printable (including when len is zero).
static bool
has_nonprint(const char *str, size_t len)
{
	// There is no increment in the for-statement because the step size
	// is only known after is_next_printable() has set next_len.
	for (size_t i = 0; i < len; ) {
		size_t next_len;
		if (!is_next_printable(str + i, len - i, &next_len))
			return true;

		i += next_len;
	}

	return false;
}
93*128836d3SXin LI
94*128836d3SXin LI
/// \brief      Check if a string contains any non-printable characters
///
/// \param      str     Null-terminated string; its length is taken
///                     with strlen()
///
/// \return     true if has_nonprint() finds a non-printable character
///             in the string, false otherwise.
///
/// The value of errno at entry is restored before returning so that
/// the check never changes errno from the caller's point of view.
extern bool
tuklib_has_nonprint(const char *str)
{
	const int saved_errno = errno;
	const bool ret = has_nonprint(str, strlen(str));
	errno = saved_errno;
	return ret;
}
103*128836d3SXin LI
104*128836d3SXin LI
/// \brief      Replace non-printable characters with question marks
///
/// \param      str     Null-terminated input string
/// \param      mem     Pointer to a caller-owned buffer pointer. *mem is
///                     passed to free() at the beginning of every call, so
///                     it must be NULL or a pointer that is valid to free.
///                     After the call *mem is either NULL or points to
///                     the newly allocated masked string.
///
/// \return     One of the following:
///               - str itself if it contains only printable characters
///                 (*mem is NULL in this case).
///               - *mem, holding a copy of str in which each
///                 non-printable character has been replaced with
///                 a single '?'.
///               - The string literal "???" if memory allocation failed
///                 (*mem is NULL in this case).
///
/// The value of errno at entry is restored before returning.
extern const char *
tuklib_mask_nonprint_r(const char *str, char **mem)
{
	const int saved_errno = errno;

	// Free the old string, if any.
	free(*mem);
	*mem = NULL;

	// If the whole input string contains only printable characters,
	// return the input string.
	const size_t len = strlen(str);
	if (!has_nonprint(str, len)) {
		errno = saved_errno;
		return str;
	}

	// Allocate memory for the masked string. Since we use the single-byte
	// character '?' to mask non-printable characters, it's possible that
	// a few bytes less memory would be needed in reality if multibyte
	// characters are masked.
	//
	// If allocation fails, return "???" because it should be safer than
	// returning the unmasked string.
	*mem = malloc(len + 1);
	if (*mem == NULL) {
		errno = saved_errno;
		return "???";
	}

	// Replace all non-printable characters with '?'.
	char *dest = *mem;

	for (size_t i = 0; i < len; ) {
		size_t next_len;
		if (is_next_printable(str + i, len - i, &next_len)) {
			// Printable: copy all bytes of the character as is.
			memcpy(dest, str + i, next_len);
			dest += next_len;
		} else {
			// Non-printable: one '?' replaces the whole
			// character no matter how many bytes it had.
			*dest++ = '?';
		}

		i += next_len;
	}

	*dest = '\0';

	errno = saved_errno;
	return *mem;
}
155*128836d3SXin LI
156*128836d3SXin LI
/// \brief      Replace non-printable characters with question marks
///
/// This is a convenience wrapper for tuklib_mask_nonprint_r() that keeps
/// the buffer pointer in a function-local static variable.
///
/// \param      str     Null-terminated input string
///
/// \return     Whatever tuklib_mask_nonprint_r() returns. Because the
///             buffer is freed at the beginning of the next call, a
///             returned pointer to the masked copy must not be used
///             after calling this function again. All calls share the
///             same static pointer.
extern const char *
tuklib_mask_nonprint(const char *str)
{
	static char *mem = NULL;
	return tuklib_mask_nonprint_r(str, &mem);
}
163