1 /* $OpenBSD: utf8.c,v 1.11 2020/05/01 06:28:52 djm Exp $ */ 2 /* 3 * Copyright (c) 2016 Ingo Schwarze <schwarze@openbsd.org> 4 * 5 * Permission to use, copy, modify, and distribute this software for any 6 * purpose with or without fee is hereby granted, provided that the above 7 * copyright notice and this permission notice appear in all copies. 8 * 9 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 10 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 11 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 12 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 13 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 14 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 15 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 16 */ 17 18 /* 19 * Utility functions for multibyte-character handling, 20 * in particular to sanitize untrusted strings for terminal output. 21 */ 22 23 #include "includes.h" 24 25 #include <sys/types.h> 26 #ifdef HAVE_LANGINFO_H 27 # include <langinfo.h> 28 #endif 29 #include <limits.h> 30 #include <locale.h> 31 #include <stdarg.h> 32 #include <stdio.h> 33 #include <stdlib.h> 34 #include <string.h> 35 #if defined(HAVE_STRNVIS) && defined(HAVE_VIS_H) && !defined(BROKEN_STRNVIS) 36 # include <vis.h> 37 #endif 38 #ifdef HAVE_WCHAR_H 39 # include <wchar.h> 40 #endif 41 42 #include "utf8.h" 43 44 static int dangerous_locale(void); 45 static int grow_dst(char **, size_t *, size_t, char **, size_t); 46 47 48 /* 49 * For US-ASCII and UTF-8 encodings, we can safely recover from 50 * encoding errors and from non-printable characters. For any 51 * other encodings, err to the side of caution and abort parsing: 52 * For state-dependent encodings, recovery is impossible. 53 * For arbitrary encodings, replacement of non-printable 54 * characters would be non-trivial and too fragile. 55 * The comments indicate what nl_langinfo(CODESET) 56 * returns for US-ASCII on various operating systems. 57 */ 58 59 static int 60 dangerous_locale(void) { 61 char *loc; 62 63 loc = nl_langinfo(CODESET); 64 return strcmp(loc, "UTF-8") != 0 && 65 strcmp(loc, "US-ASCII") != 0 && /* OpenBSD */ 66 strcmp(loc, "ANSI_X3.4-1968") != 0 && /* Linux */ 67 strcmp(loc, "ISO8859-1") != 0 && /* AIX */ 68 strcmp(loc, "646") != 0 && /* Solaris, NetBSD */ 69 strcmp(loc, "") != 0; /* Solaris 6 */ 70 } 71 72 static int 73 grow_dst(char **dst, size_t *sz, size_t maxsz, char **dp, size_t need) 74 { 75 char *tp; 76 size_t tsz; 77 78 if (*dp + need < *dst + *sz) 79 return 0; 80 tsz = *sz + 128; 81 if (tsz > maxsz) 82 tsz = maxsz; 83 if ((tp = recallocarray(*dst, *sz, tsz, 1)) == NULL) 84 return -1; 85 *dp = tp + (*dp - *dst); 86 *dst = tp; 87 *sz = tsz; 88 return 0; 89 } 90 91 /* 92 * The following two functions limit the number of bytes written, 93 * including the terminating '\0', to sz. Unless wp is NULL, 94 * they limit the number of display columns occupied to *wp. 95 * Whichever is reached first terminates the output string. 96 * To stay close to the standard interfaces, they return the number of 97 * non-NUL bytes that would have been written if both were unlimited. 98 * If wp is NULL, newline, carriage return, and tab are allowed; 99 * otherwise, the actual number of columns occupied by what was 100 * written is returned in *wp. 101 */ 102 103 int 104 vasnmprintf(char **str, size_t maxsz, int *wp, const char *fmt, va_list ap) 105 { 106 char *src; /* Source string returned from vasprintf. */ 107 char *sp; /* Pointer into src. */ 108 char *dst; /* Destination string to be returned. */ 109 char *dp; /* Pointer into dst. */ 110 char *tp; /* Temporary pointer for dst. */ 111 size_t sz; /* Number of bytes allocated for dst. */ 112 wchar_t wc; /* Wide character at sp. */ 113 int len; /* Number of bytes in the character at sp. */ 114 int ret; /* Number of bytes needed to format src. */ 115 int width; /* Display width of the character wc. */ 116 int total_width, max_width, print; 117 118 src = NULL; 119 if ((ret = vasprintf(&src, fmt, ap)) <= 0) 120 goto fail; 121 122 sz = strlen(src) + 1; 123 if ((dst = malloc(sz)) == NULL) { 124 free(src); 125 ret = -1; 126 goto fail; 127 } 128 129 if (maxsz > INT_MAX) 130 maxsz = INT_MAX; 131 132 sp = src; 133 dp = dst; 134 ret = 0; 135 print = 1; 136 total_width = 0; 137 max_width = wp == NULL ? INT_MAX : *wp; 138 while (*sp != '\0') { 139 if ((len = mbtowc(&wc, sp, MB_CUR_MAX)) == -1) { 140 (void)mbtowc(NULL, NULL, MB_CUR_MAX); 141 if (dangerous_locale()) { 142 ret = -1; 143 break; 144 } 145 len = 1; 146 width = -1; 147 } else if (wp == NULL && 148 (wc == L'\n' || wc == L'\r' || wc == L'\t')) { 149 /* 150 * Don't use width uninitialized; the actual 151 * value doesn't matter because total_width 152 * is only returned for wp != NULL. 153 */ 154 width = 0; 155 } else if ((width = wcwidth(wc)) == -1 && 156 dangerous_locale()) { 157 ret = -1; 158 break; 159 } 160 161 /* Valid, printable character. */ 162 163 if (width >= 0) { 164 if (print && (dp - dst >= (int)maxsz - len || 165 total_width > max_width - width)) 166 print = 0; 167 if (print) { 168 if (grow_dst(&dst, &sz, maxsz, 169 &dp, len) == -1) { 170 ret = -1; 171 break; 172 } 173 total_width += width; 174 memcpy(dp, sp, len); 175 dp += len; 176 } 177 sp += len; 178 if (ret >= 0) 179 ret += len; 180 continue; 181 } 182 183 /* Escaping required. */ 184 185 while (len > 0) { 186 if (print && (dp - dst >= (int)maxsz - 4 || 187 total_width > max_width - 4)) 188 print = 0; 189 if (print) { 190 if (grow_dst(&dst, &sz, maxsz, 191 &dp, 4) == -1) { 192 ret = -1; 193 break; 194 } 195 tp = vis(dp, *sp, VIS_OCTAL | VIS_ALL, 0); 196 width = tp - dp; 197 total_width += width; 198 dp = tp; 199 } else 200 width = 4; 201 len--; 202 sp++; 203 if (ret >= 0) 204 ret += width; 205 } 206 if (len > 0) 207 break; 208 } 209 free(src); 210 *dp = '\0'; 211 *str = dst; 212 if (wp != NULL) 213 *wp = total_width; 214 215 /* 216 * If the string was truncated by the width limit but 217 * would have fit into the size limit, the only sane way 218 * to report the problem is using the return value, such 219 * that the usual idiom "if (ret < 0 || ret >= sz) error" 220 * works as expected. 221 */ 222 223 if (ret < (int)maxsz && !print) 224 ret = -1; 225 return ret; 226 227 fail: 228 if (wp != NULL) 229 *wp = 0; 230 if (ret == 0) { 231 *str = src; 232 return 0; 233 } else { 234 *str = NULL; 235 return -1; 236 } 237 } 238 239 int 240 snmprintf(char *str, size_t sz, int *wp, const char *fmt, ...) 241 { 242 va_list ap; 243 char *cp = NULL; 244 int ret; 245 246 va_start(ap, fmt); 247 ret = vasnmprintf(&cp, sz, wp, fmt, ap); 248 va_end(ap); 249 if (cp != NULL) { 250 (void)strlcpy(str, cp, sz); 251 free(cp); 252 } else 253 *str = '\0'; 254 return ret; 255 } 256 257 int 258 asmprintf(char **outp, size_t sz, int *wp, const char *fmt, ...) 259 { 260 va_list ap; 261 int ret; 262 263 *outp = NULL; 264 va_start(ap, fmt); 265 ret = vasnmprintf(outp, sz, wp, fmt, ap); 266 va_end(ap); 267 268 return ret; 269 } 270 271 /* 272 * To stay close to the standard interfaces, the following functions 273 * return the number of non-NUL bytes written. 274 */ 275 276 int 277 vfmprintf(FILE *stream, const char *fmt, va_list ap) 278 { 279 char *str = NULL; 280 int ret; 281 282 if ((ret = vasnmprintf(&str, INT_MAX, NULL, fmt, ap)) < 0) { 283 free(str); 284 return -1; 285 } 286 if (fputs(str, stream) == EOF) 287 ret = -1; 288 free(str); 289 return ret; 290 } 291 292 int 293 fmprintf(FILE *stream, const char *fmt, ...) 294 { 295 va_list ap; 296 int ret; 297 298 va_start(ap, fmt); 299 ret = vfmprintf(stream, fmt, ap); 300 va_end(ap); 301 return ret; 302 } 303 304 int 305 mprintf(const char *fmt, ...) 306 { 307 va_list ap; 308 int ret; 309 310 va_start(ap, fmt); 311 ret = vfmprintf(stdout, fmt, ap); 312 va_end(ap); 313 return ret; 314 } 315 316 /* 317 * Set up libc for multibyte output in the user's chosen locale. 318 * 319 * XXX: we are known to have problems with Turkish (i/I confusion) so we 320 * deliberately fall back to the C locale for now. Longer term we should 321 * always prefer to select C.[encoding] if possible, but there's no 322 * standardisation in locales between systems, so we'll need to survey 323 * what's out there first. 324 */ 325 void 326 msetlocale(void) 327 { 328 const char *vars[] = { "LC_ALL", "LC_CTYPE", "LANG", NULL }; 329 char *cp; 330 int i; 331 332 /* 333 * We can't yet cope with dotless/dotted I in Turkish locales, 334 * so fall back to the C locale for these. 335 */ 336 for (i = 0; vars[i] != NULL; i++) { 337 if ((cp = getenv(vars[i])) == NULL) 338 continue; 339 if (strncasecmp(cp, "TR", 2) != 0) 340 break; 341 /* 342 * If we're in a UTF-8 locale then prefer to use 343 * the C.UTF-8 locale (or equivalent) if it exists. 344 */ 345 if ((strcasestr(cp, "UTF-8") != NULL || 346 strcasestr(cp, "UTF8") != NULL) && 347 (setlocale(LC_CTYPE, "C.UTF-8") != NULL || 348 setlocale(LC_CTYPE, "POSIX.UTF-8") != NULL)) 349 return; 350 setlocale(LC_CTYPE, "C"); 351 return; 352 } 353 /* We can handle this locale */ 354 setlocale(LC_CTYPE, ""); 355 } 356