1 /* 2 * This file defines the string_tokenize interface 3 * Time-stamp: "2006-06-24 15:27:49 bkorb" 4 * 5 * string_tokenize copyright 2005 Bruce Korb 6 * 7 * string_tokenize is free software; you can redistribute it and/or 8 * modify it under the terms of the GNU Lesser General Public 9 * License as published by the Free Software Foundation; either 10 * version 2.1 of the License, or (at your option) any later version. 11 * 12 * string_tokenize is distributed in the hope that it will be useful, 13 * but WITHOUT ANY WARRANTY; without even the implied warranty of 14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15 * Lesser General Public License for more details. 16 * 17 * You should have received a copy of the GNU Lesser General Public 18 * License along with string_tokenize; if not, write to: 19 * The Free Software Foundation, Inc., 20 * 51 Franklin Street, Fifth Floor, 21 * Boston, MA 02110-1301, USA. 22 */ 23 #include <ctype.h> 24 #include <errno.h> 25 #include <stdlib.h> 26 27 #define cc_t const unsigned char 28 #define ch_t unsigned char 29 30 /* = = = START-STATIC-FORWARD = = = */ 31 /* static forward declarations maintained by :mkfwd */ 32 static void 33 copy_cooked( ch_t** ppDest, char const ** ppSrc ); 34 35 static void 36 copy_raw( ch_t** ppDest, char const ** ppSrc ); 37 /* = = = END-STATIC-FORWARD = = = */ 38 39 static void 40 copy_cooked( ch_t** ppDest, char const ** ppSrc ) 41 { 42 ch_t* pDest = (ch_t*)*ppDest; 43 const ch_t* pSrc = (const ch_t*)(*ppSrc + 1); 44 45 for (;;) { 46 ch_t ch = *(pSrc++); 47 switch (ch) { 48 case NUL: *ppSrc = NULL; return; 49 case '"': goto done; 50 case '\\': 51 pSrc += ao_string_cook_escape_char( (char*)pSrc, (char*)&ch, 0x7F ); 52 if (ch == 0x7F) 53 break; 54 /* FALLTHROUGH */ 55 56 default: 57 *(pDest++) = ch; 58 } 59 } 60 61 done: 62 *ppDest = (ch_t*)pDest; /* next spot for storing character */ 63 *ppSrc = (char const *)pSrc; /* char following closing quote */ 64 } 65 66 67 static void 68 copy_raw( ch_t** ppDest, char const ** ppSrc ) 69 { 70 ch_t* pDest = *ppDest; 71 cc_t* pSrc = (cc_t*) (*ppSrc + 1); 72 73 for (;;) { 74 ch_t ch = *(pSrc++); 75 switch (ch) { 76 case NUL: *ppSrc = NULL; return; 77 case '\'': goto done; 78 case '\\': 79 /* 80 * *Four* escapes are handled: newline removal, escape char 81 * quoting and apostrophe quoting 82 */ 83 switch (*pSrc) { 84 case NUL: *ppSrc = NULL; return; 85 case '\r': 86 if (*(++pSrc) == '\n') 87 ++pSrc; 88 continue; 89 90 case '\n': 91 ++pSrc; 92 continue; 93 94 case '\'': 95 ch = '\''; 96 /* FALLTHROUGH */ 97 98 case '\\': 99 ++pSrc; 100 break; 101 } 102 /* FALLTHROUGH */ 103 104 default: 105 *(pDest++) = ch; 106 } 107 } 108 109 done: 110 *ppDest = pDest; /* next spot for storing character */ 111 *ppSrc = (char const *) pSrc; /* char following closing quote */ 112 } 113 114 115 /*=export_func ao_string_tokenize 116 * 117 * what: tokenize an input string 118 * 119 * arg: + char const* + string + string to be tokenized + 120 * 121 * ret_type: token_list_t* 122 * ret_desc: pointer to a structure that lists each token 123 * 124 * doc: 125 * 126 * This function will convert one input string into a list of strings. 127 * The list of strings is derived by separating the input based on 128 * white space separation. However, if the input contains either single 129 * or double quote characters, then the text after that character up to 130 * a matching quote will become the string in the list. 131 * 132 * The returned pointer should be deallocated with @code{free(3C)} when 133 * are done using the data. The data are placed in a single block of 134 * allocated memory. Do not deallocate individual token/strings. 135 * 136 * The structure pointed to will contain at least these two fields: 137 * @table @samp 138 * @item tkn_ct 139 * The number of tokens found in the input string. 140 * @item tok_list 141 * An array of @code{tkn_ct + 1} pointers to substring tokens, with 142 * the last pointer set to NULL. 143 * @end table 144 * 145 * There are two types of quoted strings: single quoted (@code{'}) and 146 * double quoted (@code{"}). Singly quoted strings are fairly raw in that 147 * escape characters (@code{\\}) are simply another character, except when 148 * preceding the following characters: 149 * @example 150 * @code{\\} double backslashes reduce to one 151 * @code{'} incorporates the single quote into the string 152 * @code{\n} suppresses both the backslash and newline character 153 * @end example 154 * 155 * Double quote strings are formed according to the rules of string 156 * constants in ANSI-C programs. 157 * 158 * example: 159 * @example 160 * #include <stdlib.h> 161 * int ix; 162 * token_list_t* ptl = ao_string_tokenize( some_string ) 163 * for (ix = 0; ix < ptl->tkn_ct; ix++) 164 * do_something_with_tkn( ptl->tkn_list[ix] ); 165 * free( ptl ); 166 * @end example 167 * Note that everything is freed with the one call to @code{free(3C)}. 168 * 169 * err: 170 * NULL is returned and @code{errno} will be set to indicate the problem: 171 * @itemize @bullet 172 * @item 173 * @code{EINVAL} - There was an unterminated quoted string. 174 * @item 175 * @code{ENOENT} - The input string was empty. 176 * @item 177 * @code{ENOMEM} - There is not enough memory. 178 * @end itemize 179 =*/ 180 token_list_t* 181 ao_string_tokenize( char const* str ) 182 { 183 int max_token_ct = 1; /* allow for trailing NUL on string */ 184 token_list_t* res; 185 186 if (str == NULL) goto bogus_str; 187 188 /* 189 * Trim leading white space. Use "ENOENT" and a NULL return to indicate 190 * an empty string was passed. 191 */ 192 while (isspace( (ch_t)*str )) str++; 193 if (*str == NUL) { 194 bogus_str: 195 errno = ENOENT; 196 return NULL; 197 } 198 199 /* 200 * Take an approximate count of tokens. If no quoted strings are used, 201 * it will be accurate. If quoted strings are used, it will be a little 202 * high and we'll squander the space for a few extra pointers. 203 */ 204 { 205 cc_t* pz = (cc_t*)str; 206 207 do { 208 max_token_ct++; 209 while (! isspace( *++pz )) 210 if (*pz == NUL) goto found_nul; 211 while (isspace( *pz )) pz++; 212 } while (*pz != NUL); 213 214 found_nul: 215 ; 216 } 217 218 res = malloc( sizeof(*res) + strlen(str) + (max_token_ct * sizeof(ch_t*)) ); 219 if (res == NULL) { 220 errno = ENOMEM; 221 return res; 222 } 223 224 /* 225 * Now copy each token into the output buffer. 226 */ 227 { 228 ch_t* pzDest = (ch_t*)(res->tkn_list + (max_token_ct + 1)); 229 res->tkn_ct = 0; 230 231 do { 232 res->tkn_list[ res->tkn_ct++ ] = pzDest; 233 for (;;) { 234 int ch = (ch_t)*str; 235 if (isspace( ch )) { 236 found_white_space: 237 while (isspace( (ch_t)*++str )) ; 238 break; 239 } 240 241 switch (ch) { 242 case '"': 243 copy_cooked( &pzDest, &str ); 244 if (str == NULL) { 245 free(res); 246 errno = EINVAL; 247 return NULL; 248 } 249 if (isspace( (ch_t)*str )) 250 goto found_white_space; 251 break; 252 253 case '\'': 254 copy_raw( &pzDest, &str ); 255 if (str == NULL) { 256 free(res); 257 errno = EINVAL; 258 return NULL; 259 } 260 if (isspace( (ch_t)*str )) 261 goto found_white_space; 262 break; 263 264 case NUL: 265 goto copy_done; 266 267 default: 268 str++; 269 *(pzDest++) = ch; 270 } 271 } copy_done:; 272 273 /* 274 * NUL terminate the last token and see if we have any more tokens. 275 */ 276 *(pzDest++) = NUL; 277 } while (*str != NUL); 278 279 res->tkn_list[ res->tkn_ct ] = NULL; 280 } 281 282 return res; 283 } 284 285 #ifdef TEST 286 #include <stdio.h> 287 #include <string.h> 288 289 int 290 main( int argc, char** argv ) 291 { 292 if (argc == 1) { 293 printf("USAGE: %s arg [ ... ]\n", *argv); 294 return 1; 295 } 296 while (--argc > 0) { 297 char* arg = *(++argv); 298 token_list_t* p = ao_string_tokenize( arg ); 299 if (p == NULL) { 300 printf( "Parsing string ``%s'' failed:\n\terrno %d (%s)\n", 301 arg, errno, strerror( errno )); 302 } else { 303 int ix = 0; 304 printf( "Parsed string ``%s''\ninto %d tokens:\n", arg, p->tkn_ct ); 305 do { 306 printf( " %3d: ``%s''\n", ix+1, p->tkn_list[ix] ); 307 } while (++ix < p->tkn_ct); 308 free(p); 309 } 310 } 311 return 0; 312 } 313 #endif 314 315 /* 316 * Local Variables: 317 * mode: C 318 * c-file-style: "stroustrup" 319 * indent-tabs-mode: nil 320 * End: 321 * end of autoopts/tokenize.c */ 322