xref: /freebsd/contrib/ntp/sntp/libopts/tokenize.c (revision a90b9d0159070121c221b966469c3e36d912bf82)
1 /** \file tokenize.c
2  *
3  *  Tokenize a string, accommodating quoted strings.
4  *
5  * @addtogroup autoopts
6  * @{
7  */
8 /*
9  *  This file defines the string_tokenize interface
10  *  This file is part of AutoOpts, a companion to AutoGen.
11  *  AutoOpts is free software.
12  *  AutoOpts is Copyright (C) 1992-2018 by Bruce Korb - all rights reserved
13  *
14  *  AutoOpts is available under any one of two licenses.  The license
15  *  in use must be one of these two and the choice is under the control
16  *  of the user of the license.
17  *
18  *   The GNU Lesser General Public License, version 3 or later
19  *      See the files "COPYING.lgplv3" and "COPYING.gplv3"
20  *
21  *   The Modified Berkeley Software Distribution License
22  *      See the file "COPYING.mbsd"
23  *
24  *  These files have the following sha256 sums:
25  *
26  *  8584710e9b04216a394078dc156b781d0b47e1729104d666658aecef8ee32e95  COPYING.gplv3
27  *  4379e7444a0e2ce2b12dd6f5a52a27a4d02d39d247901d3285c88cf0d37f477b  COPYING.lgplv3
28  *  13aa749a5b0a454917a944ed8fffc530b784f5ead522b1aacaf4ec8aa55a6239  COPYING.mbsd
29  */
30 
31 static void
32 copy_cooked(ch_t ** ppDest, char const ** ppSrc)
33 {
34     ch_t * pDest = (ch_t *)*ppDest;
35     const ch_t * pSrc  = (const ch_t *)(*ppSrc + 1);
36 
37     for (;;) {
38         ch_t ch = *(pSrc++);
39         switch (ch) {
40         case NUL:   *ppSrc = NULL; return;
41         case '"':   goto done;
42         case '\\':
43             pSrc += ao_string_cook_escape_char((char *)pSrc, (char *)&ch, 0x7F);
44             if (ch == 0x7F)
45                 break;
46             /* FALLTHROUGH */
47 
48         default:
49             *(pDest++) = ch;
50         }
51     }
52 
53  done:
54     *ppDest = (ch_t *)pDest; /* next spot for storing character */
55     *ppSrc  = (char const *)pSrc;  /* char following closing quote    */
56 }
57 
58 
59 static void
60 copy_raw(ch_t ** ppDest, char const ** ppSrc)
61 {
62     ch_t * pDest = *ppDest;
63     cc_t * pSrc  = (cc_t *) (*ppSrc + 1);
64 
65     for (;;) {
66         ch_t ch = *(pSrc++);
67         switch (ch) {
68         case NUL:   *ppSrc = NULL; return;
69         case '\'':  goto done;
70         case '\\':
71             /*
72              *  *Four* escapes are handled:  newline removal, escape char
73              *  quoting and apostrophe quoting
74              */
75             switch (*pSrc) {
76             case NUL:   *ppSrc = NULL; return;
77             case '\r':
78                 if (*(++pSrc) == NL)
79                     ++pSrc;
80                 continue;
81 
82             case NL:
83                 ++pSrc;
84                 continue;
85 
86             case '\'':
87                 ch = '\'';
88                 /* FALLTHROUGH */
89 
90             case '\\':
91                 ++pSrc;
92                 break;
93             }
94             /* FALLTHROUGH */
95 
96         default:
97             *(pDest++) = ch;
98         }
99     }
100 
101  done:
102     *ppDest = pDest; /* next spot for storing character */
103     *ppSrc  = (char const *) pSrc;  /* char following closing quote    */
104 }
105 
106 static token_list_t *
107 alloc_token_list(char const * str)
108 {
109     token_list_t * res;
110 
111     int max_token_ct = 2; /* allow for trailing NULL pointer & NUL on string */
112 
113     if (str == NULL) goto enoent_res;
114 
115     /*
116      *  Trim leading white space.  Use "ENOENT" and a NULL return to indicate
117      *  an empty string was passed.
118      */
119     str = SPN_WHITESPACE_CHARS(str);
120     if (*str == NUL)  goto enoent_res;
121 
122     /*
123      *  Take an approximate count of tokens.  If no quoted strings are used,
124      *  it will be accurate.  If quoted strings are used, it will be a little
125      *  high and we'll squander the space for a few extra pointers.
126      */
127     {
128         char const * pz = str;
129 
130         do {
131             max_token_ct++;
132             pz = BRK_WHITESPACE_CHARS(pz+1);
133             pz = SPN_WHITESPACE_CHARS(pz);
134         } while (*pz != NUL);
135 
136         res = malloc(sizeof(*res) + (size_t)(pz - str)
137                      + ((size_t)max_token_ct * sizeof(ch_t *)));
138     }
139 
140     if (res == NULL)
141         errno = ENOMEM;
142     else res->tkn_list[0] = (ch_t *)(res->tkn_list + (max_token_ct - 1));
143 
144     return res;
145 
146     enoent_res:
147 
148     errno = ENOENT;
149     return NULL;
150 }
151 
152 /*=export_func ao_string_tokenize
153  *
154  * what: tokenize an input string
155  *
156  * arg:  + char const * + string + string to be tokenized +
157  *
158  * ret_type:  token_list_t *
159  * ret_desc:  pointer to a structure that lists each token
160  *
161  * doc:
162  *
163  * This function will convert one input string into a list of strings.
164  * The list of strings is derived by separating the input based on
165  * white space separation.  However, if the input contains either single
166  * or double quote characters, then the text after that character up to
167  * a matching quote will become the string in the list.
168  *
169  *  The returned pointer should be deallocated with @code{free(3C)} when
170  *  are done using the data.  The data are placed in a single block of
171  *  allocated memory.  Do not deallocate individual token/strings.
172  *
173  *  The structure pointed to will contain at least these two fields:
174  *  @table @samp
175  *  @item tkn_ct
176  *  The number of tokens found in the input string.
177  *  @item tok_list
178  *  An array of @code{tkn_ct + 1} pointers to substring tokens, with
179  *  the last pointer set to NULL.
180  *  @end table
181  *
182  * There are two types of quoted strings: single quoted (@code{'}) and
183  * double quoted (@code{"}).  Singly quoted strings are fairly raw in that
184  * escape characters (@code{\\}) are simply another character, except when
185  * preceding the following characters:
186  * @example
187  * @code{\\}  double backslashes reduce to one
188  * @code{'}   incorporates the single quote into the string
189  * @code{\n}  suppresses both the backslash and newline character
190  * @end example
191  *
192  * Double quote strings are formed according to the rules of string
193  * constants in ANSI-C programs.
194  *
195  * example:
196  * @example
197  *    #include <stdlib.h>
198  *    int ix;
199  *    token_list_t * ptl = ao_string_tokenize(some_string)
200  *    for (ix = 0; ix < ptl->tkn_ct; ix++)
201  *       do_something_with_tkn(ptl->tkn_list[ix]);
202  *    free(ptl);
203  * @end example
204  * Note that everything is freed with the one call to @code{free(3C)}.
205  *
206  * err:
207  *  NULL is returned and @code{errno} will be set to indicate the problem:
208  *  @itemize @bullet
209  *  @item
210  *  @code{EINVAL} - There was an unterminated quoted string.
211  *  @item
212  *  @code{ENOENT} - The input string was empty.
213  *  @item
214  *  @code{ENOMEM} - There is not enough memory.
215  *  @end itemize
216 =*/
217 token_list_t *
218 ao_string_tokenize(char const * str)
219 {
220     token_list_t * res = alloc_token_list(str);
221     ch_t * pzDest;
222 
223     /*
224      *  Now copy each token into the output buffer.
225      */
226     if (res == NULL)
227         return res;
228 
229     pzDest = (ch_t *)(res->tkn_list[0]);
230     res->tkn_ct  = 0;
231 
232     do  {
233         res->tkn_list[ res->tkn_ct++ ] = pzDest;
234         for (;;) {
235             int ch = (ch_t)*str;
236             if (IS_WHITESPACE_CHAR(ch)) {
237             found_white_space:
238                 str = SPN_WHITESPACE_CHARS(str+1);
239                 break;
240             }
241 
242             switch (ch) {
243             case '"':
244                 copy_cooked(&pzDest, &str);
245                 if (str == NULL) {
246                     free(res);
247                     errno = EINVAL;
248                     return NULL;
249                 }
250                 if (IS_WHITESPACE_CHAR(*str))
251                     goto found_white_space;
252                 break;
253 
254             case '\'':
255                 copy_raw(&pzDest, &str);
256                 if (str == NULL) {
257                     free(res);
258                     errno = EINVAL;
259                     return NULL;
260                 }
261                 if (IS_WHITESPACE_CHAR(*str))
262                     goto found_white_space;
263                 break;
264 
265             case NUL:
266                 goto copy_done;
267 
268             default:
269                 str++;
270                 *(pzDest++) = (unsigned char)ch;
271             }
272         } copy_done:;
273 
274         /*
275          * NUL terminate the last token and see if we have any more tokens.
276          */
277         *(pzDest++) = NUL;
278     } while (*str != NUL);
279 
280     res->tkn_list[ res->tkn_ct ] = NULL;
281 
282     return res;
283 }
284 
#ifdef TEST
#include <stdio.h>
#include <string.h>

/*
 *  Test driver, built only when TEST is defined.  Each command line
 *  argument is tokenized on its own and the resulting tokens (or the
 *  errno value on failure) are printed.  Exits with 1 and a usage
 *  message when no arguments are given, otherwise with 0.
 */
int
main(int argc, char ** argv)
{
    int arg_ix;

    if (argc == 1) {
        printf("USAGE:  %s arg [ ... ]\n", *argv);
        return 1;
    }

    for (arg_ix = 1; arg_ix < argc; arg_ix++) {
        char *         arg = argv[arg_ix];
        token_list_t * p   = ao_string_tokenize(arg);
        int            ix  = 0;

        if (p == NULL) {
            printf("Parsing string ``%s'' failed:\n\terrno %d (%s)\n",
                   arg, errno, strerror(errno));
            continue;
        }

        printf("Parsed string ``%s''\ninto %d tokens:\n", arg, p->tkn_ct);

        /* the first token is always printed before the count is tested */
        for (;;) {
            printf(" %3d:  ``%s''\n", ix+1, p->tkn_list[ix]);
            if (! (++ix < p->tkn_ct))
                break;
        }

        /* one free() releases the pointer array and all token text */
        free(p);
    }

    return 0;
}
#endif
314 
315 /** @}
316  *
317  * Local Variables:
318  * mode: C
319  * c-file-style: "stroustrup"
320  * indent-tabs-mode: nil
321  * End:
322  * end of autoopts/tokenize.c */
323