1 /** \file tokenize.c
2 *
3 * Tokenize a string, accommodating quoted strings.
4 *
5 * @addtogroup autoopts
6 * @{
7 */
8 /*
9 * This file defines the string_tokenize interface
10 * This file is part of AutoOpts, a companion to AutoGen.
11 * AutoOpts is free software.
12 * AutoOpts is Copyright (C) 1992-2018 by Bruce Korb - all rights reserved
13 *
14 * AutoOpts is available under any one of two licenses. The license
15 * in use must be one of these two and the choice is under the control
16 * of the user of the license.
17 *
18 * The GNU Lesser General Public License, version 3 or later
19 * See the files "COPYING.lgplv3" and "COPYING.gplv3"
20 *
21 * The Modified Berkeley Software Distribution License
22 * See the file "COPYING.mbsd"
23 *
24 * These files have the following sha256 sums:
25 *
26 * 8584710e9b04216a394078dc156b781d0b47e1729104d666658aecef8ee32e95 COPYING.gplv3
27 * 4379e7444a0e2ce2b12dd6f5a52a27a4d02d39d247901d3285c88cf0d37f477b COPYING.lgplv3
28 * 13aa749a5b0a454917a944ed8fffc530b784f5ead522b1aacaf4ec8aa55a6239 COPYING.mbsd
29 */
30
31 static void
copy_cooked(ch_t ** ppDest,char const ** ppSrc)32 copy_cooked(ch_t ** ppDest, char const ** ppSrc)
33 {
34 ch_t * pDest = (ch_t *)*ppDest;
35 const ch_t * pSrc = (const ch_t *)(*ppSrc + 1);
36
37 for (;;) {
38 ch_t ch = *(pSrc++);
39 switch (ch) {
40 case NUL: *ppSrc = NULL; return;
41 case '"': goto done;
42 case '\\':
43 pSrc += ao_string_cook_escape_char((char *)pSrc, (char *)&ch, 0x7F);
44 if (ch == 0x7F)
45 break;
46 /* FALLTHROUGH */
47
48 default:
49 *(pDest++) = ch;
50 }
51 }
52
53 done:
54 *ppDest = (ch_t *)pDest; /* next spot for storing character */
55 *ppSrc = (char const *)pSrc; /* char following closing quote */
56 }
57
58
59 static void
copy_raw(ch_t ** ppDest,char const ** ppSrc)60 copy_raw(ch_t ** ppDest, char const ** ppSrc)
61 {
62 ch_t * pDest = *ppDest;
63 cc_t * pSrc = (cc_t *) (*ppSrc + 1);
64
65 for (;;) {
66 ch_t ch = *(pSrc++);
67 switch (ch) {
68 case NUL: *ppSrc = NULL; return;
69 case '\'': goto done;
70 case '\\':
71 /*
72 * *Four* escapes are handled: newline removal, escape char
73 * quoting and apostrophe quoting
74 */
75 switch (*pSrc) {
76 case NUL: *ppSrc = NULL; return;
77 case '\r':
78 if (*(++pSrc) == NL)
79 ++pSrc;
80 continue;
81
82 case NL:
83 ++pSrc;
84 continue;
85
86 case '\'':
87 ch = '\'';
88 /* FALLTHROUGH */
89
90 case '\\':
91 ++pSrc;
92 break;
93 }
94 /* FALLTHROUGH */
95
96 default:
97 *(pDest++) = ch;
98 }
99 }
100
101 done:
102 *ppDest = pDest; /* next spot for storing character */
103 *ppSrc = (char const *) pSrc; /* char following closing quote */
104 }
105
106 static token_list_t *
alloc_token_list(char const * str)107 alloc_token_list(char const * str)
108 {
109 token_list_t * res;
110
111 int max_token_ct = 2; /* allow for trailing NULL pointer & NUL on string */
112
113 if (str == NULL) goto enoent_res;
114
115 /*
116 * Trim leading white space. Use "ENOENT" and a NULL return to indicate
117 * an empty string was passed.
118 */
119 str = SPN_WHITESPACE_CHARS(str);
120 if (*str == NUL) goto enoent_res;
121
122 /*
123 * Take an approximate count of tokens. If no quoted strings are used,
124 * it will be accurate. If quoted strings are used, it will be a little
125 * high and we'll squander the space for a few extra pointers.
126 */
127 {
128 char const * pz = str;
129
130 do {
131 max_token_ct++;
132 pz = BRK_WHITESPACE_CHARS(pz+1);
133 pz = SPN_WHITESPACE_CHARS(pz);
134 } while (*pz != NUL);
135
136 res = malloc(sizeof(*res) + (size_t)(pz - str)
137 + ((size_t)max_token_ct * sizeof(ch_t *)));
138 }
139
140 if (res == NULL)
141 errno = ENOMEM;
142 else res->tkn_list[0] = (ch_t *)(res->tkn_list + (max_token_ct - 1));
143
144 return res;
145
146 enoent_res:
147
148 errno = ENOENT;
149 return NULL;
150 }
151
152 /*=export_func ao_string_tokenize
153 *
154 * what: tokenize an input string
155 *
156 * arg: + char const * + string + string to be tokenized +
157 *
158 * ret_type: token_list_t *
159 * ret_desc: pointer to a structure that lists each token
160 *
161 * doc:
162 *
163 * This function will convert one input string into a list of strings.
164 * The list of strings is derived by separating the input based on
165 * white space separation. However, if the input contains either single
166 * or double quote characters, then the text after that character up to
167 * a matching quote will become the string in the list.
168 *
 * The returned pointer should be deallocated with @code{free(3C)} when
 * you are done using the data. The data are placed in a single block of
171 * allocated memory. Do not deallocate individual token/strings.
172 *
173 * The structure pointed to will contain at least these two fields:
174 * @table @samp
175 * @item tkn_ct
176 * The number of tokens found in the input string.
 * @item tkn_list
178 * An array of @code{tkn_ct + 1} pointers to substring tokens, with
179 * the last pointer set to NULL.
180 * @end table
181 *
182 * There are two types of quoted strings: single quoted (@code{'}) and
183 * double quoted (@code{"}). Singly quoted strings are fairly raw in that
184 * escape characters (@code{\\}) are simply another character, except when
185 * preceding the following characters:
186 * @example
187 * @code{\\} double backslashes reduce to one
188 * @code{'} incorporates the single quote into the string
189 * @code{\n} suppresses both the backslash and newline character
190 * @end example
191 *
192 * Double quote strings are formed according to the rules of string
193 * constants in ANSI-C programs.
194 *
195 * example:
196 * @example
197 * #include <stdlib.h>
198 * int ix;
 * token_list_t * ptl = ao_string_tokenize(some_string);
200 * for (ix = 0; ix < ptl->tkn_ct; ix++)
201 * do_something_with_tkn(ptl->tkn_list[ix]);
202 * free(ptl);
203 * @end example
204 * Note that everything is freed with the one call to @code{free(3C)}.
205 *
206 * err:
207 * NULL is returned and @code{errno} will be set to indicate the problem:
208 * @itemize @bullet
209 * @item
210 * @code{EINVAL} - There was an unterminated quoted string.
211 * @item
212 * @code{ENOENT} - The input string was empty.
213 * @item
214 * @code{ENOMEM} - There is not enough memory.
215 * @end itemize
216 =*/
217 token_list_t *
ao_string_tokenize(char const * str)218 ao_string_tokenize(char const * str)
219 {
220 token_list_t * res = alloc_token_list(str);
221 ch_t * pzDest;
222
223 /*
224 * Now copy each token into the output buffer.
225 */
226 if (res == NULL)
227 return res;
228
229 pzDest = (ch_t *)(res->tkn_list[0]);
230 res->tkn_ct = 0;
231
232 do {
233 res->tkn_list[ res->tkn_ct++ ] = pzDest;
234 for (;;) {
235 int ch = (ch_t)*str;
236 if (IS_WHITESPACE_CHAR(ch)) {
237 found_white_space:
238 str = SPN_WHITESPACE_CHARS(str+1);
239 break;
240 }
241
242 switch (ch) {
243 case '"':
244 copy_cooked(&pzDest, &str);
245 if (str == NULL) {
246 free(res);
247 errno = EINVAL;
248 return NULL;
249 }
250 if (IS_WHITESPACE_CHAR(*str))
251 goto found_white_space;
252 break;
253
254 case '\'':
255 copy_raw(&pzDest, &str);
256 if (str == NULL) {
257 free(res);
258 errno = EINVAL;
259 return NULL;
260 }
261 if (IS_WHITESPACE_CHAR(*str))
262 goto found_white_space;
263 break;
264
265 case NUL:
266 goto copy_done;
267
268 default:
269 str++;
270 *(pzDest++) = (unsigned char)ch;
271 }
272 } copy_done:;
273
274 /*
275 * NUL terminate the last token and see if we have any more tokens.
276 */
277 *(pzDest++) = NUL;
278 } while (*str != NUL);
279
280 res->tkn_list[ res->tkn_ct ] = NULL;
281
282 return res;
283 }
284
285 #ifdef TEST
286 #include <stdio.h>
287 #include <string.h>
288
289 int
main(int argc,char ** argv)290 main(int argc, char ** argv)
291 {
292 if (argc == 1) {
293 printf("USAGE: %s arg [ ... ]\n", *argv);
294 return 1;
295 }
296 while (--argc > 0) {
297 char * arg = *(++argv);
298 token_list_t * p = ao_string_tokenize(arg);
299 if (p == NULL) {
300 printf("Parsing string ``%s'' failed:\n\terrno %d (%s)\n",
301 arg, errno, strerror(errno));
302 } else {
303 int ix = 0;
304 printf("Parsed string ``%s''\ninto %d tokens:\n", arg, p->tkn_ct);
305 do {
306 printf(" %3d: ``%s''\n", ix+1, p->tkn_list[ix]);
307 } while (++ix < p->tkn_ct);
308 free(p);
309 }
310 }
311 return 0;
312 }
313 #endif
314
315 /** @}
316 *
317 * Local Variables:
318 * mode: C
319 * c-file-style: "stroustrup"
320 * indent-tabs-mode: nil
321 * End:
322 * end of autoopts/tokenize.c */
323