1 /** \file tokenize.c 2 * 3 * Tokenize a string, accommodating quoted strings. 4 * 5 * @addtogroup autoopts 6 * @{ 7 */ 8 /* 9 * This file defines the string_tokenize interface 10 * This file is part of AutoOpts, a companion to AutoGen. 11 * AutoOpts is free software. 12 * AutoOpts is Copyright (C) 1992-2018 by Bruce Korb - all rights reserved 13 * 14 * AutoOpts is available under any one of two licenses. The license 15 * in use must be one of these two and the choice is under the control 16 * of the user of the license. 17 * 18 * The GNU Lesser General Public License, version 3 or later 19 * See the files "COPYING.lgplv3" and "COPYING.gplv3" 20 * 21 * The Modified Berkeley Software Distribution License 22 * See the file "COPYING.mbsd" 23 * 24 * These files have the following sha256 sums: 25 * 26 * 8584710e9b04216a394078dc156b781d0b47e1729104d666658aecef8ee32e95 COPYING.gplv3 27 * 4379e7444a0e2ce2b12dd6f5a52a27a4d02d39d247901d3285c88cf0d37f477b COPYING.lgplv3 28 * 13aa749a5b0a454917a944ed8fffc530b784f5ead522b1aacaf4ec8aa55a6239 COPYING.mbsd 29 */ 30 31 static void 32 copy_cooked(ch_t ** ppDest, char const ** ppSrc) 33 { 34 ch_t * pDest = (ch_t *)*ppDest; 35 const ch_t * pSrc = (const ch_t *)(*ppSrc + 1); 36 37 for (;;) { 38 ch_t ch = *(pSrc++); 39 switch (ch) { 40 case NUL: *ppSrc = NULL; return; 41 case '"': goto done; 42 case '\\': 43 pSrc += ao_string_cook_escape_char((char *)pSrc, (char *)&ch, 0x7F); 44 if (ch == 0x7F) 45 break; 46 /* FALLTHROUGH */ 47 48 default: 49 *(pDest++) = ch; 50 } 51 } 52 53 done: 54 *ppDest = (ch_t *)pDest; /* next spot for storing character */ 55 *ppSrc = (char const *)pSrc; /* char following closing quote */ 56 } 57 58 59 static void 60 copy_raw(ch_t ** ppDest, char const ** ppSrc) 61 { 62 ch_t * pDest = *ppDest; 63 cc_t * pSrc = (cc_t *) (*ppSrc + 1); 64 65 for (;;) { 66 ch_t ch = *(pSrc++); 67 switch (ch) { 68 case NUL: *ppSrc = NULL; return; 69 case '\'': goto done; 70 case '\\': 71 /* 72 * *Four* escapes are handled: newline removal, escape char 73 * quoting and apostrophe quoting 74 */ 75 switch (*pSrc) { 76 case NUL: *ppSrc = NULL; return; 77 case '\r': 78 if (*(++pSrc) == NL) 79 ++pSrc; 80 continue; 81 82 case NL: 83 ++pSrc; 84 continue; 85 86 case '\'': 87 ch = '\''; 88 /* FALLTHROUGH */ 89 90 case '\\': 91 ++pSrc; 92 break; 93 } 94 /* FALLTHROUGH */ 95 96 default: 97 *(pDest++) = ch; 98 } 99 } 100 101 done: 102 *ppDest = pDest; /* next spot for storing character */ 103 *ppSrc = (char const *) pSrc; /* char following closing quote */ 104 } 105 106 static token_list_t * 107 alloc_token_list(char const * str) 108 { 109 token_list_t * res; 110 111 int max_token_ct = 2; /* allow for trailing NULL pointer & NUL on string */ 112 113 if (str == NULL) goto enoent_res; 114 115 /* 116 * Trim leading white space. Use "ENOENT" and a NULL return to indicate 117 * an empty string was passed. 118 */ 119 str = SPN_WHITESPACE_CHARS(str); 120 if (*str == NUL) goto enoent_res; 121 122 /* 123 * Take an approximate count of tokens. If no quoted strings are used, 124 * it will be accurate. If quoted strings are used, it will be a little 125 * high and we'll squander the space for a few extra pointers. 126 */ 127 { 128 char const * pz = str; 129 130 do { 131 max_token_ct++; 132 pz = BRK_WHITESPACE_CHARS(pz+1); 133 pz = SPN_WHITESPACE_CHARS(pz); 134 } while (*pz != NUL); 135 136 res = malloc(sizeof(*res) + (size_t)(pz - str) 137 + ((size_t)max_token_ct * sizeof(ch_t *))); 138 } 139 140 if (res == NULL) 141 errno = ENOMEM; 142 else res->tkn_list[0] = (ch_t *)(res->tkn_list + (max_token_ct - 1)); 143 144 return res; 145 146 enoent_res: 147 148 errno = ENOENT; 149 return NULL; 150 } 151 152 /*=export_func ao_string_tokenize 153 * 154 * what: tokenize an input string 155 * 156 * arg: + char const * + string + string to be tokenized + 157 * 158 * ret_type: token_list_t * 159 * ret_desc: pointer to a structure that lists each token 160 * 161 * doc: 162 * 163 * This function will convert one input string into a list of strings. 164 * The list of strings is derived by separating the input based on 165 * white space separation. However, if the input contains either single 166 * or double quote characters, then the text after that character up to 167 * a matching quote will become the string in the list. 168 * 169 * The returned pointer should be deallocated with @code{free(3C)} when 170 * are done using the data. The data are placed in a single block of 171 * allocated memory. Do not deallocate individual token/strings. 172 * 173 * The structure pointed to will contain at least these two fields: 174 * @table @samp 175 * @item tkn_ct 176 * The number of tokens found in the input string. 177 * @item tok_list 178 * An array of @code{tkn_ct + 1} pointers to substring tokens, with 179 * the last pointer set to NULL. 180 * @end table 181 * 182 * There are two types of quoted strings: single quoted (@code{'}) and 183 * double quoted (@code{"}). Singly quoted strings are fairly raw in that 184 * escape characters (@code{\\}) are simply another character, except when 185 * preceding the following characters: 186 * @example 187 * @code{\\} double backslashes reduce to one 188 * @code{'} incorporates the single quote into the string 189 * @code{\n} suppresses both the backslash and newline character 190 * @end example 191 * 192 * Double quote strings are formed according to the rules of string 193 * constants in ANSI-C programs. 194 * 195 * example: 196 * @example 197 * #include <stdlib.h> 198 * int ix; 199 * token_list_t * ptl = ao_string_tokenize(some_string) 200 * for (ix = 0; ix < ptl->tkn_ct; ix++) 201 * do_something_with_tkn(ptl->tkn_list[ix]); 202 * free(ptl); 203 * @end example 204 * Note that everything is freed with the one call to @code{free(3C)}. 205 * 206 * err: 207 * NULL is returned and @code{errno} will be set to indicate the problem: 208 * @itemize @bullet 209 * @item 210 * @code{EINVAL} - There was an unterminated quoted string. 211 * @item 212 * @code{ENOENT} - The input string was empty. 213 * @item 214 * @code{ENOMEM} - There is not enough memory. 215 * @end itemize 216 =*/ 217 token_list_t * 218 ao_string_tokenize(char const * str) 219 { 220 token_list_t * res = alloc_token_list(str); 221 ch_t * pzDest; 222 223 /* 224 * Now copy each token into the output buffer. 225 */ 226 if (res == NULL) 227 return res; 228 229 pzDest = (ch_t *)(res->tkn_list[0]); 230 res->tkn_ct = 0; 231 232 do { 233 res->tkn_list[ res->tkn_ct++ ] = pzDest; 234 for (;;) { 235 int ch = (ch_t)*str; 236 if (IS_WHITESPACE_CHAR(ch)) { 237 found_white_space: 238 str = SPN_WHITESPACE_CHARS(str+1); 239 break; 240 } 241 242 switch (ch) { 243 case '"': 244 copy_cooked(&pzDest, &str); 245 if (str == NULL) { 246 free(res); 247 errno = EINVAL; 248 return NULL; 249 } 250 if (IS_WHITESPACE_CHAR(*str)) 251 goto found_white_space; 252 break; 253 254 case '\'': 255 copy_raw(&pzDest, &str); 256 if (str == NULL) { 257 free(res); 258 errno = EINVAL; 259 return NULL; 260 } 261 if (IS_WHITESPACE_CHAR(*str)) 262 goto found_white_space; 263 break; 264 265 case NUL: 266 goto copy_done; 267 268 default: 269 str++; 270 *(pzDest++) = (unsigned char)ch; 271 } 272 } copy_done:; 273 274 /* 275 * NUL terminate the last token and see if we have any more tokens. 276 */ 277 *(pzDest++) = NUL; 278 } while (*str != NUL); 279 280 res->tkn_list[ res->tkn_ct ] = NULL; 281 282 return res; 283 } 284 285 #ifdef TEST 286 #include <stdio.h> 287 #include <string.h> 288 289 int 290 main(int argc, char ** argv) 291 { 292 if (argc == 1) { 293 printf("USAGE: %s arg [ ... ]\n", *argv); 294 return 1; 295 } 296 while (--argc > 0) { 297 char * arg = *(++argv); 298 token_list_t * p = ao_string_tokenize(arg); 299 if (p == NULL) { 300 printf("Parsing string ``%s'' failed:\n\terrno %d (%s)\n", 301 arg, errno, strerror(errno)); 302 } else { 303 int ix = 0; 304 printf("Parsed string ``%s''\ninto %d tokens:\n", arg, p->tkn_ct); 305 do { 306 printf(" %3d: ``%s''\n", ix+1, p->tkn_list[ix]); 307 } while (++ix < p->tkn_ct); 308 free(p); 309 } 310 } 311 return 0; 312 } 313 #endif 314 315 /** @} 316 * 317 * Local Variables: 318 * mode: C 319 * c-file-style: "stroustrup" 320 * indent-tabs-mode: nil 321 * End: 322 * end of autoopts/tokenize.c */ 323