1 /* $NetBSD: str.c,v 1.105 2024/07/07 07:50:57 rillig Exp $ */
2
3 /*
4 * Copyright (c) 1988, 1989, 1990, 1993
5 * The Regents of the University of California. All rights reserved.
6 *
7 * This code is derived from software contributed to Berkeley by
8 * Adam de Boor.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 3. Neither the name of the University nor the names of its contributors
19 * may be used to endorse or promote products derived from this software
20 * without specific prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 */
34
35 /*
36 * Copyright (c) 1989 by Berkeley Softworks
37 * All rights reserved.
38 *
39 * This code is derived from software contributed to Berkeley by
40 * Adam de Boor.
41 *
42 * Redistribution and use in source and binary forms, with or without
43 * modification, are permitted provided that the following conditions
44 * are met:
45 * 1. Redistributions of source code must retain the above copyright
46 * notice, this list of conditions and the following disclaimer.
47 * 2. Redistributions in binary form must reproduce the above copyright
48 * notice, this list of conditions and the following disclaimer in the
49 * documentation and/or other materials provided with the distribution.
50 * 3. All advertising materials mentioning features or use of this software
51 * must display the following acknowledgement:
52 * This product includes software developed by the University of
53 * California, Berkeley and its contributors.
54 * 4. Neither the name of the University nor the names of its contributors
55 * may be used to endorse or promote products derived from this software
56 * without specific prior written permission.
57 *
58 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
59 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
60 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
61 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
62 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
63 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
64 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
65 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
66 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
67 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
68 * SUCH DAMAGE.
69 */
70
71 #include "make.h"
72
73 /* "@(#)str.c 5.8 (Berkeley) 6/1/90" */
74 MAKE_RCSID("$NetBSD: str.c,v 1.105 2024/07/07 07:50:57 rillig Exp $");
75
76
77 static HashTable interned_strings;
78
79
80 /* Return the concatenation of s1 and s2, freshly allocated. */
81 char *
str_concat2(const char * s1,const char * s2)82 str_concat2(const char *s1, const char *s2)
83 {
84 size_t len1 = strlen(s1);
85 size_t len2 = strlen(s2);
86 char *result = bmake_malloc(len1 + len2 + 1);
87 memcpy(result, s1, len1);
88 memcpy(result + len1, s2, len2 + 1);
89 return result;
90 }
91
92 /* Return the concatenation of s1, s2 and s3, freshly allocated. */
93 char *
str_concat3(const char * s1,const char * s2,const char * s3)94 str_concat3(const char *s1, const char *s2, const char *s3)
95 {
96 size_t len1 = strlen(s1);
97 size_t len2 = strlen(s2);
98 size_t len3 = strlen(s3);
99 char *result = bmake_malloc(len1 + len2 + len3 + 1);
100 memcpy(result, s1, len1);
101 memcpy(result + len1, s2, len2);
102 memcpy(result + len1 + len2, s3, len3 + 1);
103 return result;
104 }
105
106 /*
107 * Fracture a string into an array of words (as delineated by tabs or spaces)
108 * taking quotation marks into account.
109 *
110 * A string that is empty or only contains whitespace nevertheless results in
111 * a single word. This is unexpected in many places, and the caller needs to
112 * correct for this edge case.
113 *
114 * If expand is true, quotes are removed and escape sequences such as \r, \t,
115 * etc... are expanded. In this case, return NULL on parse errors.
116 *
117 * Returns the fractured words, which must be freed later using Words_Free,
118 * unless the returned Words.words was NULL.
119 */
120 SubstringWords
Substring_Words(const char * str,bool expand)121 Substring_Words(const char *str, bool expand)
122 {
123 size_t str_len;
124 char *words_buf;
125 size_t words_cap;
126 Substring *words;
127 size_t words_len;
128 char inquote;
129 char *word_start;
130 char *word_end;
131 const char *str_p;
132
133 /* XXX: why only hspace, not whitespace? */
134 cpp_skip_hspace(&str); /* skip leading space chars. */
135
136 /* words_buf holds the words, separated by '\0'. */
137 str_len = strlen(str);
138 words_buf = bmake_malloc(str_len + 1);
139
140 words_cap = str_len / 5 > 50 ? str_len / 5 : 50;
141 words = bmake_malloc((words_cap + 1) * sizeof(words[0]));
142
143 /*
144 * copy the string; at the same time, parse backslashes,
145 * quotes and build the word list.
146 */
147 words_len = 0;
148 inquote = '\0';
149 word_start = words_buf;
150 word_end = words_buf;
151 for (str_p = str;; str_p++) {
152 char ch = *str_p;
153 switch (ch) {
154 case '"':
155 case '\'':
156 if (inquote != '\0') {
157 if (inquote == ch)
158 inquote = '\0';
159 else
160 break;
161 } else {
162 inquote = ch;
163 /* Don't miss "" or '' */
164 if (word_start == NULL && str_p[1] == inquote) {
165 if (!expand) {
166 word_start = word_end;
167 *word_end++ = ch;
168 } else
169 word_start = word_end + 1;
170 str_p++;
171 inquote = '\0';
172 break;
173 }
174 }
175 if (!expand) {
176 if (word_start == NULL)
177 word_start = word_end;
178 *word_end++ = ch;
179 }
180 continue;
181 case ' ':
182 case '\t':
183 case '\n':
184 if (inquote != '\0')
185 break;
186 if (word_start == NULL)
187 continue;
188 /* FALLTHROUGH */
189 case '\0':
190 /*
191 * end of a token -- make sure there's enough words
192 * space and save off a pointer.
193 */
194 if (word_start == NULL)
195 goto done;
196
197 *word_end++ = '\0';
198 if (words_len == words_cap) {
199 words_cap *= 2;
200 words = bmake_realloc(words,
201 (words_cap + 1) * sizeof(words[0]));
202 }
203 words[words_len++] =
204 Substring_Init(word_start, word_end - 1);
205 word_start = NULL;
206 if (ch == '\n' || ch == '\0') {
207 if (expand && inquote != '\0') {
208 SubstringWords res;
209
210 free(words);
211 free(words_buf);
212
213 res.words = NULL;
214 res.len = 0;
215 res.freeIt = NULL;
216 return res;
217 }
218 goto done;
219 }
220 continue;
221 case '\\':
222 if (!expand) {
223 if (word_start == NULL)
224 word_start = word_end;
225 *word_end++ = '\\';
226 /* catch lonely '\' at end of string */
227 if (str_p[1] == '\0')
228 continue;
229 ch = *++str_p;
230 break;
231 }
232
233 switch (ch = *++str_p) {
234 case '\0':
235 case '\n':
236 /* hmmm; fix it up as best we can */
237 ch = '\\';
238 str_p--;
239 break;
240 case 'b':
241 ch = '\b';
242 break;
243 case 'f':
244 ch = '\f';
245 break;
246 case 'n':
247 ch = '\n';
248 break;
249 case 'r':
250 ch = '\r';
251 break;
252 case 't':
253 ch = '\t';
254 break;
255 }
256 break;
257 }
258 if (word_start == NULL)
259 word_start = word_end;
260 *word_end++ = ch;
261 }
262 done:
263 words[words_len] = Substring_Init(NULL, NULL); /* useful for argv */
264
265 {
266 SubstringWords result;
267
268 result.words = words;
269 result.len = words_len;
270 result.freeIt = words_buf;
271 return result;
272 }
273 }
274
275 Words
Str_Words(const char * str,bool expand)276 Str_Words(const char *str, bool expand)
277 {
278 SubstringWords swords;
279 Words words;
280 size_t i;
281
282 swords = Substring_Words(str, expand);
283 if (swords.words == NULL) {
284 words.words = NULL;
285 words.len = 0;
286 words.freeIt = NULL;
287 return words;
288 }
289
290 words.words = bmake_malloc((swords.len + 1) * sizeof(words.words[0]));
291 words.len = swords.len;
292 words.freeIt = swords.freeIt;
293 for (i = 0; i < swords.len + 1; i++)
294 words.words[i] = UNCONST(swords.words[i].start);
295 free(swords.words);
296 return words;
297 }
298
299 /*
300 * Test if a string matches a pattern like "*.[ch]". The pattern matching
301 * characters are '*', '?' and '[]', as in fnmatch(3).
302 *
303 * See varmod-match.mk for examples and edge cases.
304 */
305 StrMatchResult
Str_Match(const char * str,const char * pat)306 Str_Match(const char *str, const char *pat)
307 {
308 StrMatchResult res = { NULL, false };
309 bool asterisk = false;
310 const char *fixed_str = str;
311 const char *fixed_pat = pat;
312
313 match_fixed_length:
314 str = fixed_str;
315 pat = fixed_pat;
316 for (; *pat != '\0' && *pat != '*'; str++, pat++) {
317 if (*str == '\0')
318 return res;
319
320 if (*pat == '?') /* match any single character */
321 continue;
322
323 if (*pat == '[') { /* match a character from a list */
324 bool neg = pat[1] == '^';
325 pat += neg ? 2 : 1;
326
327 next_char_in_list:
328 if (*pat == '\0')
329 res.error = "Unfinished character list";
330 if (*pat == ']' || *pat == '\0') {
331 if (neg)
332 goto end_of_char_list;
333 goto no_match;
334 }
335 if (*pat == *str)
336 goto end_of_char_list;
337 if (pat[1] == '-' && pat[2] == '\0') {
338 res.error = "Unfinished character range";
339 res.matched = neg;
340 return res;
341 }
342 if (pat[1] == '-') {
343 unsigned char e1 = (unsigned char)pat[0];
344 unsigned char c = (unsigned char)*str;
345 unsigned char e2 = (unsigned char)pat[2];
346 if ((e1 <= c && c <= e2)
347 || (e2 <= c && c <= e1))
348 goto end_of_char_list;
349 pat += 2;
350 }
351 pat++;
352 goto next_char_in_list;
353
354 end_of_char_list:
355 if (neg && *pat != ']' && *pat != '\0')
356 goto no_match;
357 while (*pat != ']' && *pat != '\0')
358 pat++;
359 if (*pat == '\0') {
360 res.error = "Unfinished character list";
361 pat--;
362 }
363 continue;
364 }
365
366 if (*pat == '\\') /* match the next character exactly */
367 pat++;
368 if (*pat != *str) {
369 if (asterisk && str == fixed_str) {
370 while (*str != '\0' && *str != *pat)
371 str++;
372 fixed_str = str;
373 goto match_fixed_length;
374 }
375 goto no_match;
376 }
377 }
378
379 if (*pat == '*') {
380 asterisk = true;
381 while (*pat == '*')
382 pat++;
383 if (*pat == '\0') {
384 res.matched = true;
385 return res;
386 }
387 fixed_str = str;
388 fixed_pat = pat;
389 goto match_fixed_length;
390 }
391 if (asterisk && *str != '\0') {
392 fixed_str += strlen(str);
393 goto match_fixed_length;
394 }
395 res.matched = *str == '\0';
396 return res;
397
398 no_match:
399 if (!asterisk)
400 return res;
401 fixed_str++;
402 goto match_fixed_length;
403 }
404
405 void
Str_Intern_Init(void)406 Str_Intern_Init(void)
407 {
408 HashTable_Init(&interned_strings);
409 }
410
411 #ifdef CLEANUP
412 void
Str_Intern_End(void)413 Str_Intern_End(void)
414 {
415 HashTable_Done(&interned_strings);
416 }
417 #endif
418
419 /* Return a canonical instance of str, with unlimited lifetime. */
420 const char *
Str_Intern(const char * str)421 Str_Intern(const char *str)
422 {
423 return HashTable_CreateEntry(&interned_strings, str, NULL)->key;
424 }
425