1 // SPDX-License-Identifier: 0BSD 2 3 /////////////////////////////////////////////////////////////////////////////// 4 // 5 /// \file tuklib_mbstr_nonprint.c 6 /// \brief Find and replace non-printable characters with question marks 7 // 8 // Author: Lasse Collin 9 // 10 /////////////////////////////////////////////////////////////////////////////// 11 12 #include "tuklib_mbstr_nonprint.h" 13 #include <stdlib.h> 14 #include <string.h> 15 #include <errno.h> 16 17 #ifdef HAVE_MBRTOWC 18 # include <wchar.h> 19 # include <wctype.h> 20 #else 21 # include <ctype.h> 22 #endif 23 24 25 static bool 26 is_next_printable(const char *str, size_t len, size_t *next_len) 27 { 28 #ifdef HAVE_MBRTOWC 29 // This assumes that character sets with locking shift states aren't 30 // used, and thus mbsinit() is never needed. 31 mbstate_t ps; 32 memset(&ps, 0, sizeof(ps)); 33 34 wchar_t wc; 35 *next_len = mbrtowc(&wc, str, len, &ps); 36 37 if (*next_len == (size_t)-2) { 38 // Incomplete multibyte sequence: Treat the whole sequence 39 // as a single non-printable multibyte character that ends 40 // the string. 41 *next_len = len; 42 return false; 43 } 44 45 // Check more broadly than just ret == (size_t)-1 to be safe 46 // in case mbrtowc() returns something weird. This check 47 // covers (size_t)-1 (that is, SIZE_MAX) too because len is from 48 // strlen() and the terminating '\0' isn't part of the length. 49 if (*next_len < 1 || *next_len > len) { 50 // Invalid multibyte sequence: Treat the first byte as 51 // a non-printable single-byte character. Decoding will 52 // be restarted from the next byte on the next call to 53 // this function. 54 *next_len = 1; 55 return false; 56 } 57 58 # if defined(_WIN32) && !defined(__CYGWIN__) 59 // On Windows, wchar_t stores UTF-16 code units, thus characters 60 // outside the Basic Multilingual Plane (BMP) don't fit into 61 // a single wchar_t. In an UTF-8 locale, UCRT's mbrtowc() returns 62 // successfully when the input is a non-BMP character but the 63 // output is the replacement character U+FFFD. 64 // 65 // iswprint() returns 0 for U+FFFD on Windows for some reason. Treat 66 // U+FFFD as printable and thus also all non-BMP chars as printable. 67 if (wc == 0xFFFD) 68 return true; 69 # endif 70 71 return iswprint((wint_t)wc) != 0; 72 #else 73 (void)len; 74 *next_len = 1; 75 return isprint((unsigned char)str[0]) != 0; 76 #endif 77 } 78 79 80 static bool 81 has_nonprint(const char *str, size_t len) 82 { 83 for (size_t i = 0; i < len; ) { 84 size_t next_len; 85 if (!is_next_printable(str + i, len - i, &next_len)) 86 return true; 87 88 i += next_len; 89 } 90 91 return false; 92 } 93 94 95 extern bool 96 tuklib_has_nonprint(const char *str) 97 { 98 const int saved_errno = errno; 99 const bool ret = has_nonprint(str, strlen(str)); 100 errno = saved_errno; 101 return ret; 102 } 103 104 105 extern const char * 106 tuklib_mask_nonprint_r(const char *str, char **mem) 107 { 108 const int saved_errno = errno; 109 110 // Free the old string, if any. 111 free(*mem); 112 *mem = NULL; 113 114 // If the whole input string contains only printable characters, 115 // return the input string. 116 const size_t len = strlen(str); 117 if (!has_nonprint(str, len)) { 118 errno = saved_errno; 119 return str; 120 } 121 122 // Allocate memory for the masked string. Since we use the single-byte 123 // character '?' to mask non-printable characters, it's possible that 124 // a few bytes less memory would be needed in reality if multibyte 125 // characters are masked. 126 // 127 // If allocation fails, return "???" because it should be safer than 128 // returning the unmasked string. 129 *mem = malloc(len + 1); 130 if (*mem == NULL) { 131 errno = saved_errno; 132 return "???"; 133 } 134 135 // Replace all non-printable characters with '?'. 136 char *dest = *mem; 137 138 for (size_t i = 0; i < len; ) { 139 size_t next_len; 140 if (is_next_printable(str + i, len - i, &next_len)) { 141 memcpy(dest, str + i, next_len); 142 dest += next_len; 143 } else { 144 *dest++ = '?'; 145 } 146 147 i += next_len; 148 } 149 150 *dest = '\0'; 151 152 errno = saved_errno; 153 return *mem; 154 } 155 156 157 extern const char * 158 tuklib_mask_nonprint(const char *str) 159 { 160 static char *mem = NULL; 161 return tuklib_mask_nonprint_r(str, &mem); 162 } 163