xref: /freebsd/contrib/ntp/sntp/libopts/tokenize.c (revision a2464ee12761660f50d0b6f59f233949ebcacc87)
1 /** \file tokenize.c
2  *
3  *  Tokenize a string, accommodating quoted strings.
4  *
5  * @addtogroup autoopts
6  * @{
7  */
8 /*
9  *  This file defines the string_tokenize interface
10  *  This file is part of AutoOpts, a companion to AutoGen.
11  *  AutoOpts is free software.
12  *  AutoOpts is Copyright (C) 1992-2015 by Bruce Korb - all rights reserved
13  *
14  *  AutoOpts is available under any one of two licenses.  The license
15  *  in use must be one of these two and the choice is under the control
16  *  of the user of the license.
17  *
18  *   The GNU Lesser General Public License, version 3 or later
19  *      See the files "COPYING.lgplv3" and "COPYING.gplv3"
20  *
21  *   The Modified Berkeley Software Distribution License
22  *      See the file "COPYING.mbsd"
23  *
24  *  These files have the following sha256 sums:
25  *
26  *  8584710e9b04216a394078dc156b781d0b47e1729104d666658aecef8ee32e95  COPYING.gplv3
27  *  4379e7444a0e2ce2b12dd6f5a52a27a4d02d39d247901d3285c88cf0d37f477b  COPYING.lgplv3
28  *  13aa749a5b0a454917a944ed8fffc530b784f5ead522b1aacaf4ec8aa55a6239  COPYING.mbsd
29  */
30 
31 #include <errno.h>
32 #include <stdlib.h>
33 
/*
 *  Shorthands for the unsigned byte types used while copying token text.
 *  They are macros, so each name is substituted textually where it is used.
 */
#define ch_t   unsigned char
#define cc_t   const unsigned char
36 
37 /* = = = START-STATIC-FORWARD = = = */
38 static void
39 copy_cooked(ch_t ** ppDest, char const ** ppSrc);
40 
41 static void
42 copy_raw(ch_t ** ppDest, char const ** ppSrc);
43 
44 static token_list_t *
45 alloc_token_list(char const * str);
46 /* = = = END-STATIC-FORWARD = = = */
47 
48 static void
49 copy_cooked(ch_t ** ppDest, char const ** ppSrc)
50 {
51     ch_t * pDest = (ch_t *)*ppDest;
52     const ch_t * pSrc  = (const ch_t *)(*ppSrc + 1);
53 
54     for (;;) {
55         ch_t ch = *(pSrc++);
56         switch (ch) {
57         case NUL:   *ppSrc = NULL; return;
58         case '"':   goto done;
59         case '\\':
60             pSrc += ao_string_cook_escape_char((const char *)pSrc, (char *)&ch, 0x7F);
61             if (ch == 0x7F)
62                 break;
63             /* FALLTHROUGH */
64 
65         default:
66             *(pDest++) = ch;
67         }
68     }
69 
70  done:
71     *ppDest = (ch_t *)pDest; /* next spot for storing character */
72     *ppSrc  = (char const *)pSrc;  /* char following closing quote    */
73 }
74 
75 
76 static void
77 copy_raw(ch_t ** ppDest, char const ** ppSrc)
78 {
79     ch_t * pDest = *ppDest;
80     cc_t * pSrc  = (cc_t *) (*ppSrc + 1);
81 
82     for (;;) {
83         ch_t ch = *(pSrc++);
84         switch (ch) {
85         case NUL:   *ppSrc = NULL; return;
86         case '\'':  goto done;
87         case '\\':
88             /*
89              *  *Four* escapes are handled:  newline removal, escape char
90              *  quoting and apostrophe quoting
91              */
92             switch (*pSrc) {
93             case NUL:   *ppSrc = NULL; return;
94             case '\r':
95                 if (*(++pSrc) == NL)
96                     ++pSrc;
97                 continue;
98 
99             case NL:
100                 ++pSrc;
101                 continue;
102 
103             case '\'':
104                 ch = '\'';
105                 /* FALLTHROUGH */
106 
107             case '\\':
108                 ++pSrc;
109                 break;
110             }
111             /* FALLTHROUGH */
112 
113         default:
114             *(pDest++) = ch;
115         }
116     }
117 
118  done:
119     *ppDest = pDest; /* next spot for storing character */
120     *ppSrc  = (char const *) pSrc;  /* char following closing quote    */
121 }
122 
123 static token_list_t *
124 alloc_token_list(char const * str)
125 {
126     token_list_t * res;
127 
128     int max_token_ct = 2; /* allow for trailing NULL pointer & NUL on string */
129 
130     if (str == NULL) goto enoent_res;
131 
132     /*
133      *  Trim leading white space.  Use "ENOENT" and a NULL return to indicate
134      *  an empty string was passed.
135      */
136     str = SPN_WHITESPACE_CHARS(str);
137     if (*str == NUL)  goto enoent_res;
138 
139     /*
140      *  Take an approximate count of tokens.  If no quoted strings are used,
141      *  it will be accurate.  If quoted strings are used, it will be a little
142      *  high and we'll squander the space for a few extra pointers.
143      */
144     {
145         char const * pz = str;
146 
147         do {
148             max_token_ct++;
149             pz = BRK_WHITESPACE_CHARS(pz+1);
150             pz = SPN_WHITESPACE_CHARS(pz);
151         } while (*pz != NUL);
152 
153         res = malloc(sizeof(*res) + (size_t)(pz - str)
154                      + ((size_t)max_token_ct * sizeof(ch_t *)));
155     }
156 
157     if (res == NULL)
158         errno = ENOMEM;
159     else res->tkn_list[0] = (ch_t *)(res->tkn_list + (max_token_ct - 1));
160 
161     return res;
162 
163     enoent_res:
164 
165     errno = ENOENT;
166     return NULL;
167 }
168 
169 /*=export_func ao_string_tokenize
170  *
171  * what: tokenize an input string
172  *
173  * arg:  + char const * + string + string to be tokenized +
174  *
175  * ret_type:  token_list_t *
176  * ret_desc:  pointer to a structure that lists each token
177  *
178  * doc:
179  *
180  * This function will convert one input string into a list of strings.
181  * The list of strings is derived by separating the input based on
182  * white space separation.  However, if the input contains either single
183  * or double quote characters, then the text after that character up to
184  * a matching quote will become the string in the list.
185  *
186  *  The returned pointer should be deallocated with @code{free(3C)} when
187  *  you are done using the data.  The data are placed in a single block of
188  *  allocated memory.  Do not deallocate individual token/strings.
189  *
190  *  The structure pointed to will contain at least these two fields:
191  *  @table @samp
192  *  @item tkn_ct
193  *  The number of tokens found in the input string.
194  *  @item tkn_list
195  *  An array of @code{tkn_ct + 1} pointers to substring tokens, with
196  *  the last pointer set to NULL.
197  *  @end table
198  *
199  * There are two types of quoted strings: single quoted (@code{'}) and
200  * double quoted (@code{"}).  Singly quoted strings are fairly raw in that
201  * escape characters (@code{\\}) are simply another character, except when
202  * preceding the following characters:
203  * @example
204  * @code{\\}  double backslashes reduce to one
205  * @code{'}   incorporates the single quote into the string
206  * @code{\n}  suppresses both the backslash and newline character
207  * @end example
208  *
209  * Double quote strings are formed according to the rules of string
210  * constants in ANSI-C programs.
211  *
212  * example:
213  * @example
214  *    #include <stdlib.h>
215  *    int ix;
216  *    token_list_t * ptl = ao_string_tokenize(some_string);
217  *    for (ix = 0; ix < ptl->tkn_ct; ix++)
218  *       do_something_with_tkn(ptl->tkn_list[ix]);
219  *    free(ptl);
220  * @end example
221  * Note that everything is freed with the one call to @code{free(3C)}.
222  *
223  * err:
224  *  NULL is returned and @code{errno} will be set to indicate the problem:
225  *  @itemize @bullet
226  *  @item
227  *  @code{EINVAL} - There was an unterminated quoted string.
228  *  @item
229  *  @code{ENOENT} - The input string was empty.
230  *  @item
231  *  @code{ENOMEM} - There is not enough memory.
232  *  @end itemize
233 =*/
234 token_list_t *
235 ao_string_tokenize(char const * str)
236 {
237     token_list_t * res = alloc_token_list(str);
238     ch_t * pzDest;
239 
240     /*
241      *  Now copy each token into the output buffer.
242      */
243     if (res == NULL)
244         return res;
245 
246     pzDest = (ch_t *)(res->tkn_list[0]);
247     res->tkn_ct  = 0;
248 
249     do  {
250         res->tkn_list[ res->tkn_ct++ ] = pzDest;
251         for (;;) {
252             int ch = (ch_t)*str;
253             if (IS_WHITESPACE_CHAR(ch)) {
254             found_white_space:
255                 str = SPN_WHITESPACE_CHARS(str+1);
256                 break;
257             }
258 
259             switch (ch) {
260             case '"':
261                 copy_cooked(&pzDest, &str);
262                 if (str == NULL) {
263                     free(res);
264                     errno = EINVAL;
265                     return NULL;
266                 }
267                 if (IS_WHITESPACE_CHAR(*str))
268                     goto found_white_space;
269                 break;
270 
271             case '\'':
272                 copy_raw(&pzDest, &str);
273                 if (str == NULL) {
274                     free(res);
275                     errno = EINVAL;
276                     return NULL;
277                 }
278                 if (IS_WHITESPACE_CHAR(*str))
279                     goto found_white_space;
280                 break;
281 
282             case NUL:
283                 goto copy_done;
284 
285             default:
286                 str++;
287                 *(pzDest++) = (unsigned char)ch;
288             }
289         } copy_done:;
290 
291         /*
292          * NUL terminate the last token and see if we have any more tokens.
293          */
294         *(pzDest++) = NUL;
295     } while (*str != NUL);
296 
297     res->tkn_list[ res->tkn_ct ] = NULL;
298 
299     return res;
300 }
301 
302 #ifdef TEST
303 #include <stdio.h>
304 #include <string.h>
305 
306 int
307 main(int argc, char ** argv)
308 {
309     if (argc == 1) {
310         printf("USAGE:  %s arg [ ... ]\n", *argv);
311         return 1;
312     }
313     while (--argc > 0) {
314         char * arg = *(++argv);
315         token_list_t * p = ao_string_tokenize(arg);
316         if (p == NULL) {
317             printf("Parsing string ``%s'' failed:\n\terrno %d (%s)\n",
318                    arg, errno, strerror(errno));
319         } else {
320             int ix = 0;
321             printf("Parsed string ``%s''\ninto %d tokens:\n", arg, p->tkn_ct);
322             do {
323                 printf(" %3d:  ``%s''\n", ix+1, p->tkn_list[ix]);
324             } while (++ix < p->tkn_ct);
325             free(p);
326         }
327     }
328     return 0;
329 }
330 #endif
331 
332 /** @}
333  *
334  * Local Variables:
335  * mode: C
336  * c-file-style: "stroustrup"
337  * indent-tabs-mode: nil
338  * End:
339  * end of autoopts/tokenize.c */
340