xref: /freebsd/contrib/ntp/sntp/libopts/tokenize.c (revision 884a2a699669ec61e2366e3e358342dbc94be24a)
1 /*
2  *  This file defines the string_tokenize interface
3  * Time-stamp:      "2006-06-24 15:27:49 bkorb"
4  *
5  *  string_tokenize copyright 2005 Bruce Korb
6  *
7  *  string_tokenize is free software; you can redistribute it and/or
8  *  modify it under the terms of the GNU Lesser General Public
9  *  License as published by the Free Software Foundation; either
10  *  version 2.1 of the License, or (at your option) any later version.
11  *
12  *  string_tokenize is distributed in the hope that it will be useful,
13  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
14  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15  *  Lesser General Public License for more details.
16  *
17  *  You should have received a copy of the GNU Lesser General Public
18  *  License along with string_tokenize; if not, write to:
19  *             The Free Software Foundation, Inc.,
20  *             51 Franklin Street, Fifth Floor,
21  *             Boston, MA  02110-1301, USA.
22  */
23 #include <ctype.h>
24 #include <errno.h>
25 #include <stdlib.h>
26 
27 #define cc_t   const unsigned char
28 #define ch_t   unsigned char
29 
30 /* = = = START-STATIC-FORWARD = = = */
31 /* static forward declarations maintained by :mkfwd */
32 static void
33 copy_cooked( ch_t** ppDest, char const ** ppSrc );
34 
35 static void
36 copy_raw( ch_t** ppDest, char const ** ppSrc );
37 /* = = = END-STATIC-FORWARD = = = */
38 
39 static void
40 copy_cooked( ch_t** ppDest, char const ** ppSrc )
41 {
42     ch_t* pDest = (ch_t*)*ppDest;
43     const ch_t* pSrc  = (const ch_t*)(*ppSrc + 1);
44 
45     for (;;) {
46         ch_t ch = *(pSrc++);
47         switch (ch) {
48         case NUL:   *ppSrc = NULL; return;
49         case '"':   goto done;
50         case '\\':
51             pSrc += ao_string_cook_escape_char( (char*)pSrc, (char*)&ch, 0x7F );
52             if (ch == 0x7F)
53                 break;
54             /* FALLTHROUGH */
55 
56         default:
57             *(pDest++) = ch;
58         }
59     }
60 
61  done:
62     *ppDest = (ch_t*)pDest; /* next spot for storing character */
63     *ppSrc  = (char const *)pSrc;  /* char following closing quote    */
64 }
65 
66 
67 static void
68 copy_raw( ch_t** ppDest, char const ** ppSrc )
69 {
70     ch_t* pDest = *ppDest;
71     cc_t* pSrc  = (cc_t*) (*ppSrc + 1);
72 
73     for (;;) {
74         ch_t ch = *(pSrc++);
75         switch (ch) {
76         case NUL:   *ppSrc = NULL; return;
77         case '\'':  goto done;
78         case '\\':
79             /*
80              *  *Four* escapes are handled:  newline removal, escape char
81              *  quoting and apostrophe quoting
82              */
83             switch (*pSrc) {
84             case NUL:   *ppSrc = NULL; return;
85             case '\r':
86                 if (*(++pSrc) == '\n')
87                     ++pSrc;
88                 continue;
89 
90             case '\n':
91                 ++pSrc;
92                 continue;
93 
94             case '\'':
95                 ch = '\'';
96                 /* FALLTHROUGH */
97 
98             case '\\':
99                 ++pSrc;
100                 break;
101             }
102             /* FALLTHROUGH */
103 
104         default:
105             *(pDest++) = ch;
106         }
107     }
108 
109  done:
110     *ppDest = pDest; /* next spot for storing character */
111     *ppSrc  = (char const *) pSrc;  /* char following closing quote    */
112 }
113 
114 
115 /*=export_func ao_string_tokenize
116  *
117  * what: tokenize an input string
118  *
119  * arg:  + char const* + string + string to be tokenized +
120  *
121  * ret_type:  token_list_t*
122  * ret_desc:  pointer to a structure that lists each token
123  *
124  * doc:
125  *
126  * This function will convert one input string into a list of strings.
127  * The list of strings is derived by separating the input based on
128  * white space separation.  However, if the input contains either single
129  * or double quote characters, then the text after that character up to
130  * a matching quote will become the string in the list.
131  *
132  *  The returned pointer should be deallocated with @code{free(3C)} when
133  *  are done using the data.  The data are placed in a single block of
134  *  allocated memory.  Do not deallocate individual token/strings.
135  *
136  *  The structure pointed to will contain at least these two fields:
137  *  @table @samp
138  *  @item tkn_ct
139  *  The number of tokens found in the input string.
140  *  @item tok_list
141  *  An array of @code{tkn_ct + 1} pointers to substring tokens, with
142  *  the last pointer set to NULL.
143  *  @end table
144  *
145  * There are two types of quoted strings: single quoted (@code{'}) and
146  * double quoted (@code{"}).  Singly quoted strings are fairly raw in that
147  * escape characters (@code{\\}) are simply another character, except when
148  * preceding the following characters:
149  * @example
150  * @code{\\}  double backslashes reduce to one
151  * @code{'}   incorporates the single quote into the string
152  * @code{\n}  suppresses both the backslash and newline character
153  * @end example
154  *
155  * Double quote strings are formed according to the rules of string
156  * constants in ANSI-C programs.
157  *
158  * example:
159  * @example
160  *    #include <stdlib.h>
161  *    int ix;
162  *    token_list_t* ptl = ao_string_tokenize( some_string )
163  *    for (ix = 0; ix < ptl->tkn_ct; ix++)
164  *       do_something_with_tkn( ptl->tkn_list[ix] );
165  *    free( ptl );
166  * @end example
167  * Note that everything is freed with the one call to @code{free(3C)}.
168  *
169  * err:
170  *  NULL is returned and @code{errno} will be set to indicate the problem:
171  *  @itemize @bullet
172  *  @item
173  *  @code{EINVAL} - There was an unterminated quoted string.
174  *  @item
175  *  @code{ENOENT} - The input string was empty.
176  *  @item
177  *  @code{ENOMEM} - There is not enough memory.
178  *  @end itemize
179 =*/
180 token_list_t*
181 ao_string_tokenize( char const* str )
182 {
183     int max_token_ct = 1; /* allow for trailing NUL on string */
184     token_list_t* res;
185 
186     if (str == NULL)  goto bogus_str;
187 
188     /*
189      *  Trim leading white space.  Use "ENOENT" and a NULL return to indicate
190      *  an empty string was passed.
191      */
192     while (isspace( (ch_t)*str ))  str++;
193     if (*str == NUL) {
194     bogus_str:
195         errno = ENOENT;
196         return NULL;
197     }
198 
199     /*
200      *  Take an approximate count of tokens.  If no quoted strings are used,
201      *  it will be accurate.  If quoted strings are used, it will be a little
202      *  high and we'll squander the space for a few extra pointers.
203      */
204     {
205         cc_t* pz = (cc_t*)str;
206 
207         do {
208             max_token_ct++;
209             while (! isspace( *++pz ))
210                 if (*pz == NUL) goto found_nul;
211             while (isspace( *pz ))  pz++;
212         } while (*pz != NUL);
213 
214     found_nul:
215         ;
216     }
217 
218     res = malloc( sizeof(*res) + strlen(str) + (max_token_ct * sizeof(ch_t*)) );
219     if (res == NULL) {
220         errno = ENOMEM;
221         return res;
222     }
223 
224     /*
225      *  Now copy each token into the output buffer.
226      */
227     {
228         ch_t* pzDest = (ch_t*)(res->tkn_list + (max_token_ct + 1));
229         res->tkn_ct  = 0;
230 
231         do  {
232             res->tkn_list[ res->tkn_ct++ ] = pzDest;
233             for (;;) {
234                 int ch = (ch_t)*str;
235                 if (isspace( ch )) {
236                 found_white_space:
237                     while (isspace( (ch_t)*++str ))  ;
238                     break;
239                 }
240 
241                 switch (ch) {
242                 case '"':
243                     copy_cooked( &pzDest, &str );
244                     if (str == NULL) {
245                         free(res);
246                         errno = EINVAL;
247                         return NULL;
248                     }
249                     if (isspace( (ch_t)*str ))
250                         goto found_white_space;
251                     break;
252 
253                 case '\'':
254                     copy_raw( &pzDest, &str );
255                     if (str == NULL) {
256                         free(res);
257                         errno = EINVAL;
258                         return NULL;
259                     }
260                     if (isspace( (ch_t)*str ))
261                         goto found_white_space;
262                     break;
263 
264                 case NUL:
265                     goto copy_done;
266 
267                 default:
268                     str++;
269                     *(pzDest++) = ch;
270                 }
271             } copy_done:;
272 
273             /*
274              * NUL terminate the last token and see if we have any more tokens.
275              */
276             *(pzDest++) = NUL;
277         } while (*str != NUL);
278 
279         res->tkn_list[ res->tkn_ct ] = NULL;
280     }
281 
282     return res;
283 }
284 
#ifdef TEST
#include <stdio.h>
#include <string.h>

/*
 *  Stand-alone test driver, built only when TEST is defined.
 *  Each command line argument is tokenized separately and the resulting
 *  tokens (or the failure reason) are printed to stdout.
 *  Returns 1 when no arguments were supplied, 0 otherwise.
 */
int
main( int argc, char** argv )
{
    if (argc == 1) {
        printf("USAGE:  %s arg [ ... ]\n", *argv);
        return 1;
    }
    while (--argc > 0) {
        char* arg = *(++argv);
        token_list_t* p = ao_string_tokenize( arg );
        if (p == NULL) {
            /* capture errno once so both uses report the same failure */
            int err = errno;
            printf( "Parsing string ``%s'' failed:\n\terrno %d (%s)\n",
                    arg, err, strerror( err ));
        } else {
            int ix = 0;
            /*
             *  The declared type of tkn_ct is not visible here; convert
             *  explicitly so the argument always matches "%d".
             */
            printf( "Parsed string ``%s''\ninto %d tokens:\n", arg,
                    (int)p->tkn_ct );
            /* a successful parse yields at least one token */
            do {
                printf( " %3d:  ``%s''\n", ix+1, p->tkn_list[ix] );
            } while (++ix < p->tkn_ct);
            free(p);
        }
    }
    return 0;
}
#endif
314 
315 /*
316  * Local Variables:
317  * mode: C
318  * c-file-style: "stroustrup"
319  * indent-tabs-mode: nil
320  * End:
321  * end of autoopts/tokenize.c */
322