1 /** \file tokenize.c 2 * 3 * Tokenize a string, accommodating quoted strings. 4 * 5 * @addtogroup autoopts 6 * @{ 7 */ 8 /* 9 * This file defines the string_tokenize interface 10 * This file is part of AutoOpts, a companion to AutoGen. 11 * AutoOpts is free software. 12 * AutoOpts is Copyright (C) 1992-2015 by Bruce Korb - all rights reserved 13 * 14 * AutoOpts is available under any one of two licenses. The license 15 * in use must be one of these two and the choice is under the control 16 * of the user of the license. 17 * 18 * The GNU Lesser General Public License, version 3 or later 19 * See the files "COPYING.lgplv3" and "COPYING.gplv3" 20 * 21 * The Modified Berkeley Software Distribution License 22 * See the file "COPYING.mbsd" 23 * 24 * These files have the following sha256 sums: 25 * 26 * 8584710e9b04216a394078dc156b781d0b47e1729104d666658aecef8ee32e95 COPYING.gplv3 27 * 4379e7444a0e2ce2b12dd6f5a52a27a4d02d39d247901d3285c88cf0d37f477b COPYING.lgplv3 28 * 13aa749a5b0a454917a944ed8fffc530b784f5ead522b1aacaf4ec8aa55a6239 COPYING.mbsd 29 */ 30 31 #include <errno.h> 32 #include <stdlib.h> 33 34 #define cc_t const unsigned char 35 #define ch_t unsigned char 36 37 /* = = = START-STATIC-FORWARD = = = */ 38 static void 39 copy_cooked(ch_t ** ppDest, char const ** ppSrc); 40 41 static void 42 copy_raw(ch_t ** ppDest, char const ** ppSrc); 43 44 static token_list_t * 45 alloc_token_list(char const * str); 46 /* = = = END-STATIC-FORWARD = = = */ 47 48 static void 49 copy_cooked(ch_t ** ppDest, char const ** ppSrc) 50 { 51 ch_t * pDest = (ch_t *)*ppDest; 52 const ch_t * pSrc = (const ch_t *)(*ppSrc + 1); 53 54 for (;;) { 55 ch_t ch = *(pSrc++); 56 switch (ch) { 57 case NUL: *ppSrc = NULL; return; 58 case '"': goto done; 59 case '\\': 60 pSrc += ao_string_cook_escape_char((const char *)pSrc, (char *)&ch, 0x7F); 61 if (ch == 0x7F) 62 break; 63 /* FALLTHROUGH */ 64 65 default: 66 *(pDest++) = ch; 67 } 68 } 69 70 done: 71 *ppDest = (ch_t *)pDest; /* next spot for storing character */ 72 *ppSrc = (char const *)pSrc; /* char following closing quote */ 73 } 74 75 76 static void 77 copy_raw(ch_t ** ppDest, char const ** ppSrc) 78 { 79 ch_t * pDest = *ppDest; 80 cc_t * pSrc = (cc_t *) (*ppSrc + 1); 81 82 for (;;) { 83 ch_t ch = *(pSrc++); 84 switch (ch) { 85 case NUL: *ppSrc = NULL; return; 86 case '\'': goto done; 87 case '\\': 88 /* 89 * *Four* escapes are handled: newline removal, escape char 90 * quoting and apostrophe quoting 91 */ 92 switch (*pSrc) { 93 case NUL: *ppSrc = NULL; return; 94 case '\r': 95 if (*(++pSrc) == NL) 96 ++pSrc; 97 continue; 98 99 case NL: 100 ++pSrc; 101 continue; 102 103 case '\'': 104 ch = '\''; 105 /* FALLTHROUGH */ 106 107 case '\\': 108 ++pSrc; 109 break; 110 } 111 /* FALLTHROUGH */ 112 113 default: 114 *(pDest++) = ch; 115 } 116 } 117 118 done: 119 *ppDest = pDest; /* next spot for storing character */ 120 *ppSrc = (char const *) pSrc; /* char following closing quote */ 121 } 122 123 static token_list_t * 124 alloc_token_list(char const * str) 125 { 126 token_list_t * res; 127 128 int max_token_ct = 2; /* allow for trailing NULL pointer & NUL on string */ 129 130 if (str == NULL) goto enoent_res; 131 132 /* 133 * Trim leading white space. Use "ENOENT" and a NULL return to indicate 134 * an empty string was passed. 135 */ 136 str = SPN_WHITESPACE_CHARS(str); 137 if (*str == NUL) goto enoent_res; 138 139 /* 140 * Take an approximate count of tokens. If no quoted strings are used, 141 * it will be accurate. If quoted strings are used, it will be a little 142 * high and we'll squander the space for a few extra pointers. 143 */ 144 { 145 char const * pz = str; 146 147 do { 148 max_token_ct++; 149 pz = BRK_WHITESPACE_CHARS(pz+1); 150 pz = SPN_WHITESPACE_CHARS(pz); 151 } while (*pz != NUL); 152 153 res = malloc(sizeof(*res) + (size_t)(pz - str) 154 + ((size_t)max_token_ct * sizeof(ch_t *))); 155 } 156 157 if (res == NULL) 158 errno = ENOMEM; 159 else res->tkn_list[0] = (ch_t *)(res->tkn_list + (max_token_ct - 1)); 160 161 return res; 162 163 enoent_res: 164 165 errno = ENOENT; 166 return NULL; 167 } 168 169 /*=export_func ao_string_tokenize 170 * 171 * what: tokenize an input string 172 * 173 * arg: + char const * + string + string to be tokenized + 174 * 175 * ret_type: token_list_t * 176 * ret_desc: pointer to a structure that lists each token 177 * 178 * doc: 179 * 180 * This function will convert one input string into a list of strings. 181 * The list of strings is derived by separating the input based on 182 * white space separation. However, if the input contains either single 183 * or double quote characters, then the text after that character up to 184 * a matching quote will become the string in the list. 185 * 186 * The returned pointer should be deallocated with @code{free(3C)} when 187 * are done using the data. The data are placed in a single block of 188 * allocated memory. Do not deallocate individual token/strings. 189 * 190 * The structure pointed to will contain at least these two fields: 191 * @table @samp 192 * @item tkn_ct 193 * The number of tokens found in the input string. 194 * @item tok_list 195 * An array of @code{tkn_ct + 1} pointers to substring tokens, with 196 * the last pointer set to NULL. 197 * @end table 198 * 199 * There are two types of quoted strings: single quoted (@code{'}) and 200 * double quoted (@code{"}). Singly quoted strings are fairly raw in that 201 * escape characters (@code{\\}) are simply another character, except when 202 * preceding the following characters: 203 * @example 204 * @code{\\} double backslashes reduce to one 205 * @code{'} incorporates the single quote into the string 206 * @code{\n} suppresses both the backslash and newline character 207 * @end example 208 * 209 * Double quote strings are formed according to the rules of string 210 * constants in ANSI-C programs. 211 * 212 * example: 213 * @example 214 * #include <stdlib.h> 215 * int ix; 216 * token_list_t * ptl = ao_string_tokenize(some_string) 217 * for (ix = 0; ix < ptl->tkn_ct; ix++) 218 * do_something_with_tkn(ptl->tkn_list[ix]); 219 * free(ptl); 220 * @end example 221 * Note that everything is freed with the one call to @code{free(3C)}. 222 * 223 * err: 224 * NULL is returned and @code{errno} will be set to indicate the problem: 225 * @itemize @bullet 226 * @item 227 * @code{EINVAL} - There was an unterminated quoted string. 228 * @item 229 * @code{ENOENT} - The input string was empty. 230 * @item 231 * @code{ENOMEM} - There is not enough memory. 232 * @end itemize 233 =*/ 234 token_list_t * 235 ao_string_tokenize(char const * str) 236 { 237 token_list_t * res = alloc_token_list(str); 238 ch_t * pzDest; 239 240 /* 241 * Now copy each token into the output buffer. 242 */ 243 if (res == NULL) 244 return res; 245 246 pzDest = (ch_t *)(res->tkn_list[0]); 247 res->tkn_ct = 0; 248 249 do { 250 res->tkn_list[ res->tkn_ct++ ] = pzDest; 251 for (;;) { 252 int ch = (ch_t)*str; 253 if (IS_WHITESPACE_CHAR(ch)) { 254 found_white_space: 255 str = SPN_WHITESPACE_CHARS(str+1); 256 break; 257 } 258 259 switch (ch) { 260 case '"': 261 copy_cooked(&pzDest, &str); 262 if (str == NULL) { 263 free(res); 264 errno = EINVAL; 265 return NULL; 266 } 267 if (IS_WHITESPACE_CHAR(*str)) 268 goto found_white_space; 269 break; 270 271 case '\'': 272 copy_raw(&pzDest, &str); 273 if (str == NULL) { 274 free(res); 275 errno = EINVAL; 276 return NULL; 277 } 278 if (IS_WHITESPACE_CHAR(*str)) 279 goto found_white_space; 280 break; 281 282 case NUL: 283 goto copy_done; 284 285 default: 286 str++; 287 *(pzDest++) = (unsigned char)ch; 288 } 289 } copy_done:; 290 291 /* 292 * NUL terminate the last token and see if we have any more tokens. 293 */ 294 *(pzDest++) = NUL; 295 } while (*str != NUL); 296 297 res->tkn_list[ res->tkn_ct ] = NULL; 298 299 return res; 300 } 301 302 #ifdef TEST 303 #include <stdio.h> 304 #include <string.h> 305 306 int 307 main(int argc, char ** argv) 308 { 309 if (argc == 1) { 310 printf("USAGE: %s arg [ ... ]\n", *argv); 311 return 1; 312 } 313 while (--argc > 0) { 314 char * arg = *(++argv); 315 token_list_t * p = ao_string_tokenize(arg); 316 if (p == NULL) { 317 printf("Parsing string ``%s'' failed:\n\terrno %d (%s)\n", 318 arg, errno, strerror(errno)); 319 } else { 320 int ix = 0; 321 printf("Parsed string ``%s''\ninto %d tokens:\n", arg, p->tkn_ct); 322 do { 323 printf(" %3d: ``%s''\n", ix+1, p->tkn_list[ix]); 324 } while (++ix < p->tkn_ct); 325 free(p); 326 } 327 } 328 return 0; 329 } 330 #endif 331 332 /** @} 333 * 334 * Local Variables: 335 * mode: C 336 * c-file-style: "stroustrup" 337 * indent-tabs-mode: nil 338 * End: 339 * end of autoopts/tokenize.c */ 340