xref: /freebsd/contrib/ntp/sntp/libopts/tokenize.c (revision a466cc55373fc3cf86837f09da729535b57e69a1)
/** \file tokenize.c
 *
 *  Tokenize a string, accommodating quoted strings.
 *
 * @addtogroup autoopts
 * @{
 */
/*
 *  This file defines the string_tokenize interface
 *  This file is part of AutoOpts, a companion to AutoGen.
 *  AutoOpts is free software.
 *  AutoOpts is Copyright (C) 1992-2018 by Bruce Korb - all rights reserved
 *
 *  AutoOpts is available under any one of two licenses.  The license
 *  in use must be one of these two and the choice is under the control
 *  of the user of the license.
 *
 *   The GNU Lesser General Public License, version 3 or later
 *      See the files "COPYING.lgplv3" and "COPYING.gplv3"
 *
 *   The Modified Berkeley Software Distribution License
 *      See the file "COPYING.mbsd"
 *
 *  These files have the following sha256 sums:
 *
 *  8584710e9b04216a394078dc156b781d0b47e1729104d666658aecef8ee32e95  COPYING.gplv3
 *  4379e7444a0e2ce2b12dd6f5a52a27a4d02d39d247901d3285c88cf0d37f477b  COPYING.lgplv3
 *  13aa749a5b0a454917a944ed8fffc530b784f5ead522b1aacaf4ec8aa55a6239  COPYING.mbsd
 */
302b15cb3dSCy Schubert 
31ea906c41SOllivier Robert static void
copy_cooked(ch_t ** ppDest,char const ** ppSrc)32ea906c41SOllivier Robert copy_cooked(ch_t ** ppDest, char const ** ppSrc)
33ea906c41SOllivier Robert {
34ea906c41SOllivier Robert     ch_t * pDest = (ch_t *)*ppDest;
35ea906c41SOllivier Robert     const ch_t * pSrc  = (const ch_t *)(*ppSrc + 1);
36ea906c41SOllivier Robert 
37ea906c41SOllivier Robert     for (;;) {
38ea906c41SOllivier Robert         ch_t ch = *(pSrc++);
39ea906c41SOllivier Robert         switch (ch) {
40ea906c41SOllivier Robert         case NUL:   *ppSrc = NULL; return;
41ea906c41SOllivier Robert         case '"':   goto done;
42ea906c41SOllivier Robert         case '\\':
43*a466cc55SCy Schubert             pSrc += ao_string_cook_escape_char((char *)pSrc, (char *)&ch, 0x7F);
44ea906c41SOllivier Robert             if (ch == 0x7F)
45ea906c41SOllivier Robert                 break;
46ea906c41SOllivier Robert             /* FALLTHROUGH */
47ea906c41SOllivier Robert 
48ea906c41SOllivier Robert         default:
49ea906c41SOllivier Robert             *(pDest++) = ch;
50ea906c41SOllivier Robert         }
51ea906c41SOllivier Robert     }
52ea906c41SOllivier Robert 
53ea906c41SOllivier Robert  done:
54ea906c41SOllivier Robert     *ppDest = (ch_t *)pDest; /* next spot for storing character */
55ea906c41SOllivier Robert     *ppSrc  = (char const *)pSrc;  /* char following closing quote    */
56ea906c41SOllivier Robert }
57ea906c41SOllivier Robert 
58ea906c41SOllivier Robert 
59ea906c41SOllivier Robert static void
copy_raw(ch_t ** ppDest,char const ** ppSrc)60ea906c41SOllivier Robert copy_raw(ch_t ** ppDest, char const ** ppSrc)
61ea906c41SOllivier Robert {
62ea906c41SOllivier Robert     ch_t * pDest = *ppDest;
63ea906c41SOllivier Robert     cc_t * pSrc  = (cc_t *) (*ppSrc + 1);
64ea906c41SOllivier Robert 
65ea906c41SOllivier Robert     for (;;) {
66ea906c41SOllivier Robert         ch_t ch = *(pSrc++);
67ea906c41SOllivier Robert         switch (ch) {
68ea906c41SOllivier Robert         case NUL:   *ppSrc = NULL; return;
69ea906c41SOllivier Robert         case '\'':  goto done;
70ea906c41SOllivier Robert         case '\\':
71ea906c41SOllivier Robert             /*
72ea906c41SOllivier Robert              *  *Four* escapes are handled:  newline removal, escape char
73ea906c41SOllivier Robert              *  quoting and apostrophe quoting
74ea906c41SOllivier Robert              */
75ea906c41SOllivier Robert             switch (*pSrc) {
76ea906c41SOllivier Robert             case NUL:   *ppSrc = NULL; return;
77ea906c41SOllivier Robert             case '\r':
782b15cb3dSCy Schubert                 if (*(++pSrc) == NL)
79ea906c41SOllivier Robert                     ++pSrc;
80ea906c41SOllivier Robert                 continue;
81ea906c41SOllivier Robert 
822b15cb3dSCy Schubert             case NL:
83ea906c41SOllivier Robert                 ++pSrc;
84ea906c41SOllivier Robert                 continue;
85ea906c41SOllivier Robert 
86ea906c41SOllivier Robert             case '\'':
87ea906c41SOllivier Robert                 ch = '\'';
88ea906c41SOllivier Robert                 /* FALLTHROUGH */
89ea906c41SOllivier Robert 
90ea906c41SOllivier Robert             case '\\':
91ea906c41SOllivier Robert                 ++pSrc;
92ea906c41SOllivier Robert                 break;
93ea906c41SOllivier Robert             }
94ea906c41SOllivier Robert             /* FALLTHROUGH */
95ea906c41SOllivier Robert 
96ea906c41SOllivier Robert         default:
97ea906c41SOllivier Robert             *(pDest++) = ch;
98ea906c41SOllivier Robert         }
99ea906c41SOllivier Robert     }
100ea906c41SOllivier Robert 
101ea906c41SOllivier Robert  done:
102ea906c41SOllivier Robert     *ppDest = pDest; /* next spot for storing character */
103ea906c41SOllivier Robert     *ppSrc  = (char const *) pSrc;  /* char following closing quote    */
104ea906c41SOllivier Robert }
105ea906c41SOllivier Robert 
1062b15cb3dSCy Schubert static token_list_t *
alloc_token_list(char const * str)1072b15cb3dSCy Schubert alloc_token_list(char const * str)
1082b15cb3dSCy Schubert {
1092b15cb3dSCy Schubert     token_list_t * res;
1102b15cb3dSCy Schubert 
1112b15cb3dSCy Schubert     int max_token_ct = 2; /* allow for trailing NULL pointer & NUL on string */
1122b15cb3dSCy Schubert 
1132b15cb3dSCy Schubert     if (str == NULL) goto enoent_res;
1142b15cb3dSCy Schubert 
1152b15cb3dSCy Schubert     /*
1162b15cb3dSCy Schubert      *  Trim leading white space.  Use "ENOENT" and a NULL return to indicate
1172b15cb3dSCy Schubert      *  an empty string was passed.
1182b15cb3dSCy Schubert      */
1192b15cb3dSCy Schubert     str = SPN_WHITESPACE_CHARS(str);
1202b15cb3dSCy Schubert     if (*str == NUL)  goto enoent_res;
1212b15cb3dSCy Schubert 
1222b15cb3dSCy Schubert     /*
1232b15cb3dSCy Schubert      *  Take an approximate count of tokens.  If no quoted strings are used,
1242b15cb3dSCy Schubert      *  it will be accurate.  If quoted strings are used, it will be a little
1252b15cb3dSCy Schubert      *  high and we'll squander the space for a few extra pointers.
1262b15cb3dSCy Schubert      */
1272b15cb3dSCy Schubert     {
1282b15cb3dSCy Schubert         char const * pz = str;
1292b15cb3dSCy Schubert 
1302b15cb3dSCy Schubert         do {
1312b15cb3dSCy Schubert             max_token_ct++;
1322b15cb3dSCy Schubert             pz = BRK_WHITESPACE_CHARS(pz+1);
1332b15cb3dSCy Schubert             pz = SPN_WHITESPACE_CHARS(pz);
1342b15cb3dSCy Schubert         } while (*pz != NUL);
1352b15cb3dSCy Schubert 
1362b15cb3dSCy Schubert         res = malloc(sizeof(*res) + (size_t)(pz - str)
1372b15cb3dSCy Schubert                      + ((size_t)max_token_ct * sizeof(ch_t *)));
1382b15cb3dSCy Schubert     }
1392b15cb3dSCy Schubert 
1402b15cb3dSCy Schubert     if (res == NULL)
1412b15cb3dSCy Schubert         errno = ENOMEM;
1422b15cb3dSCy Schubert     else res->tkn_list[0] = (ch_t *)(res->tkn_list + (max_token_ct - 1));
1432b15cb3dSCy Schubert 
1442b15cb3dSCy Schubert     return res;
1452b15cb3dSCy Schubert 
1462b15cb3dSCy Schubert     enoent_res:
1472b15cb3dSCy Schubert 
1482b15cb3dSCy Schubert     errno = ENOENT;
1492b15cb3dSCy Schubert     return NULL;
1502b15cb3dSCy Schubert }
151ea906c41SOllivier Robert 
152ea906c41SOllivier Robert /*=export_func ao_string_tokenize
153ea906c41SOllivier Robert  *
154ea906c41SOllivier Robert  * what: tokenize an input string
155ea906c41SOllivier Robert  *
156ea906c41SOllivier Robert  * arg:  + char const * + string + string to be tokenized +
157ea906c41SOllivier Robert  *
158ea906c41SOllivier Robert  * ret_type:  token_list_t *
159ea906c41SOllivier Robert  * ret_desc:  pointer to a structure that lists each token
160ea906c41SOllivier Robert  *
161ea906c41SOllivier Robert  * doc:
162ea906c41SOllivier Robert  *
163ea906c41SOllivier Robert  * This function will convert one input string into a list of strings.
164ea906c41SOllivier Robert  * The list of strings is derived by separating the input based on
165ea906c41SOllivier Robert  * white space separation.  However, if the input contains either single
166ea906c41SOllivier Robert  * or double quote characters, then the text after that character up to
167ea906c41SOllivier Robert  * a matching quote will become the string in the list.
168ea906c41SOllivier Robert  *
169ea906c41SOllivier Robert  *  The returned pointer should be deallocated with @code{free(3C)} when
170ea906c41SOllivier Robert  *  are done using the data.  The data are placed in a single block of
171ea906c41SOllivier Robert  *  allocated memory.  Do not deallocate individual token/strings.
172ea906c41SOllivier Robert  *
173ea906c41SOllivier Robert  *  The structure pointed to will contain at least these two fields:
174ea906c41SOllivier Robert  *  @table @samp
175ea906c41SOllivier Robert  *  @item tkn_ct
176ea906c41SOllivier Robert  *  The number of tokens found in the input string.
177ea906c41SOllivier Robert  *  @item tok_list
178ea906c41SOllivier Robert  *  An array of @code{tkn_ct + 1} pointers to substring tokens, with
179ea906c41SOllivier Robert  *  the last pointer set to NULL.
180ea906c41SOllivier Robert  *  @end table
181ea906c41SOllivier Robert  *
182ea906c41SOllivier Robert  * There are two types of quoted strings: single quoted (@code{'}) and
183ea906c41SOllivier Robert  * double quoted (@code{"}).  Singly quoted strings are fairly raw in that
184ea906c41SOllivier Robert  * escape characters (@code{\\}) are simply another character, except when
185ea906c41SOllivier Robert  * preceding the following characters:
186ea906c41SOllivier Robert  * @example
187ea906c41SOllivier Robert  * @code{\\}  double backslashes reduce to one
188ea906c41SOllivier Robert  * @code{'}   incorporates the single quote into the string
189ea906c41SOllivier Robert  * @code{\n}  suppresses both the backslash and newline character
190ea906c41SOllivier Robert  * @end example
191ea906c41SOllivier Robert  *
192ea906c41SOllivier Robert  * Double quote strings are formed according to the rules of string
193ea906c41SOllivier Robert  * constants in ANSI-C programs.
194ea906c41SOllivier Robert  *
195ea906c41SOllivier Robert  * example:
196ea906c41SOllivier Robert  * @example
197ea906c41SOllivier Robert  *    #include <stdlib.h>
198ea906c41SOllivier Robert  *    int ix;
199ea906c41SOllivier Robert  *    token_list_t * ptl = ao_string_tokenize(some_string)
200ea906c41SOllivier Robert  *    for (ix = 0; ix < ptl->tkn_ct; ix++)
201ea906c41SOllivier Robert  *       do_something_with_tkn(ptl->tkn_list[ix]);
202ea906c41SOllivier Robert  *    free(ptl);
203ea906c41SOllivier Robert  * @end example
204ea906c41SOllivier Robert  * Note that everything is freed with the one call to @code{free(3C)}.
205ea906c41SOllivier Robert  *
206ea906c41SOllivier Robert  * err:
207ea906c41SOllivier Robert  *  NULL is returned and @code{errno} will be set to indicate the problem:
208ea906c41SOllivier Robert  *  @itemize @bullet
209ea906c41SOllivier Robert  *  @item
210ea906c41SOllivier Robert  *  @code{EINVAL} - There was an unterminated quoted string.
211ea906c41SOllivier Robert  *  @item
212ea906c41SOllivier Robert  *  @code{ENOENT} - The input string was empty.
213ea906c41SOllivier Robert  *  @item
214ea906c41SOllivier Robert  *  @code{ENOMEM} - There is not enough memory.
215ea906c41SOllivier Robert  *  @end itemize
216ea906c41SOllivier Robert =*/
217ea906c41SOllivier Robert token_list_t *
ao_string_tokenize(char const * str)218ea906c41SOllivier Robert ao_string_tokenize(char const * str)
219ea906c41SOllivier Robert {
2202b15cb3dSCy Schubert     token_list_t * res = alloc_token_list(str);
2212b15cb3dSCy Schubert     ch_t * pzDest;
222ea906c41SOllivier Robert 
223ea906c41SOllivier Robert     /*
224ea906c41SOllivier Robert      *  Now copy each token into the output buffer.
225ea906c41SOllivier Robert      */
2262b15cb3dSCy Schubert     if (res == NULL)
2272b15cb3dSCy Schubert         return res;
2282b15cb3dSCy Schubert 
2292b15cb3dSCy Schubert     pzDest = (ch_t *)(res->tkn_list[0]);
230ea906c41SOllivier Robert     res->tkn_ct  = 0;
231ea906c41SOllivier Robert 
232ea906c41SOllivier Robert     do  {
233ea906c41SOllivier Robert         res->tkn_list[ res->tkn_ct++ ] = pzDest;
234ea906c41SOllivier Robert         for (;;) {
235ea906c41SOllivier Robert             int ch = (ch_t)*str;
2362b15cb3dSCy Schubert             if (IS_WHITESPACE_CHAR(ch)) {
237ea906c41SOllivier Robert             found_white_space:
2382b15cb3dSCy Schubert                 str = SPN_WHITESPACE_CHARS(str+1);
239ea906c41SOllivier Robert                 break;
240ea906c41SOllivier Robert             }
241ea906c41SOllivier Robert 
242ea906c41SOllivier Robert             switch (ch) {
243ea906c41SOllivier Robert             case '"':
244ea906c41SOllivier Robert                 copy_cooked(&pzDest, &str);
245ea906c41SOllivier Robert                 if (str == NULL) {
246ea906c41SOllivier Robert                     free(res);
247ea906c41SOllivier Robert                     errno = EINVAL;
248ea906c41SOllivier Robert                     return NULL;
249ea906c41SOllivier Robert                 }
2502b15cb3dSCy Schubert                 if (IS_WHITESPACE_CHAR(*str))
251ea906c41SOllivier Robert                     goto found_white_space;
252ea906c41SOllivier Robert                 break;
253ea906c41SOllivier Robert 
254ea906c41SOllivier Robert             case '\'':
255ea906c41SOllivier Robert                 copy_raw(&pzDest, &str);
256ea906c41SOllivier Robert                 if (str == NULL) {
257ea906c41SOllivier Robert                     free(res);
258ea906c41SOllivier Robert                     errno = EINVAL;
259ea906c41SOllivier Robert                     return NULL;
260ea906c41SOllivier Robert                 }
2612b15cb3dSCy Schubert                 if (IS_WHITESPACE_CHAR(*str))
262ea906c41SOllivier Robert                     goto found_white_space;
263ea906c41SOllivier Robert                 break;
264ea906c41SOllivier Robert 
265ea906c41SOllivier Robert             case NUL:
266ea906c41SOllivier Robert                 goto copy_done;
267ea906c41SOllivier Robert 
268ea906c41SOllivier Robert             default:
269ea906c41SOllivier Robert                 str++;
2702b15cb3dSCy Schubert                 *(pzDest++) = (unsigned char)ch;
271ea906c41SOllivier Robert             }
272ea906c41SOllivier Robert         } copy_done:;
273ea906c41SOllivier Robert 
274ea906c41SOllivier Robert         /*
275ea906c41SOllivier Robert          * NUL terminate the last token and see if we have any more tokens.
276ea906c41SOllivier Robert          */
277ea906c41SOllivier Robert         *(pzDest++) = NUL;
278ea906c41SOllivier Robert     } while (*str != NUL);
279ea906c41SOllivier Robert 
280ea906c41SOllivier Robert     res->tkn_list[ res->tkn_ct ] = NULL;
281ea906c41SOllivier Robert 
282ea906c41SOllivier Robert     return res;
283ea906c41SOllivier Robert }
284ea906c41SOllivier Robert 
#ifdef TEST
#include <stdio.h>
#include <string.h>

/**
 * Test driver, compiled only when TEST is defined.
 *
 * Each command line argument is handed to ao_string_tokenize() and the
 * resulting tokens, or the errno value describing the failure, are printed
 * on standard output.
 *
 * @param[in] argc  argument count
 * @param[in] argv  argument vector; argv[1] onward are the test strings
 *
 * @returns 1 when no test string was supplied, otherwise 0.
 */
int
main(int argc, char ** argv)
{
    if (argc == 1) {
        printf("USAGE:  %s arg [ ... ]\n", *argv);
        return 1;
    }
    while (--argc > 0) {
        char * arg = *(++argv);
        token_list_t * p = ao_string_tokenize(arg);
        if (p == NULL) {
            printf("Parsing string ``%s'' failed:\n\terrno %d (%s)\n",
                   arg, errno, strerror(errno));
        } else {
            /*
             * The declared type of tkn_ct is not visible in this file, so
             * it is converted explicitly to match the "%lu" conversions
             * rather than being passed to "%d" as is.
             */
            unsigned long tkn_ct = (unsigned long)p->tkn_ct;
            unsigned long ix     = 0;

            printf("Parsed string ``%s''\ninto %lu tokens:\n", arg, tkn_ct);

            /* a successful parse always yields at least one token */
            do {
                printf(" %3lu:  ``%s''\n", ix+1, p->tkn_list[ix]);
            } while (++ix < tkn_ct);
            free(p);
        }
    }
    return 0;
}
#endif
314ea906c41SOllivier Robert 
/** @}
 *
 * Local Variables:
 * mode: C
 * c-file-style: "stroustrup"
 * indent-tabs-mode: nil
 * End:
 * end of autoopts/tokenize.c */
323