1 /*- 2 * Copyright (c) 1989, 1993 3 * The Regents of the University of California. All rights reserved. 4 * 5 * This code is derived from software contributed to Berkeley by 6 * Ken Arnold. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 3. Neither the name of the University nor the names of its contributors 17 * may be used to endorse or promote products derived from this software 18 * without specific prior written permission. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 23 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 30 * SUCH DAMAGE. 31 */ 32 33 #if 0 34 #ifndef lint 35 static const char copyright[] = 36 "@(#) Copyright (c) 1989, 1993\n\ 37 The Regents of the University of California. All rights reserved.\n"; 38 #endif /* not lint */ 39 40 #ifndef lint 41 static const char sccsid[] = "@(#)strfile.c 8.1 (Berkeley) 5/31/93"; 42 #endif /* not lint */ 43 #endif 44 #include <sys/cdefs.h> 45 #include <sys/param.h> 46 #include <sys/endian.h> 47 #include <ctype.h> 48 #include <locale.h> 49 #include <stdbool.h> 50 #include <stdio.h> 51 #include <stdlib.h> 52 #include <string.h> 53 #include <time.h> 54 #include <unistd.h> 55 56 #include "strfile.h" 57 58 /* 59 * This program takes a file composed of strings separated by 60 * lines starting with two consecutive delimiting character (default 61 * character is '%') and creates another file which consists of a table 62 * describing the file (structure from "strfile.h"), a table of seek 63 * pointers to the start of the strings, and the strings, each terminated 64 * by a null byte. Usage: 65 * 66 * % strfile [-iorsx] [ -cC ] sourcefile [ datafile ] 67 * 68 * C - Allow comments marked by a double delimiter at line's beginning 69 * c - Change delimiting character from '%' to 'C' 70 * s - Silent. Give no summary of data processed at the end of 71 * the run. 72 * o - order the strings in alphabetic order 73 * i - if ordering, ignore case 74 * r - randomize the order of the strings 75 * x - set rotated bit 76 * 77 * Ken Arnold Sept. 7, 1978 -- 78 * 79 * Added ordering options. 80 */ 81 82 #define STORING_PTRS (Oflag || Rflag) 83 #define CHUNKSIZE 512 84 85 #define ALLOC(ptr, sz) do { \ 86 if (ptr == NULL) \ 87 ptr = malloc(CHUNKSIZE * sizeof(*ptr)); \ 88 else if (((sz) + 1) % CHUNKSIZE == 0) \ 89 ptr = realloc(ptr, ((sz) + CHUNKSIZE) * sizeof(*ptr)); \ 90 if (ptr == NULL) { \ 91 fprintf(stderr, "out of space\n"); \ 92 exit(1); \ 93 } \ 94 } while (0) 95 96 typedef struct { 97 int first; 98 off_t pos; 99 } STR; 100 101 static char *Infile = NULL, /* input file name */ 102 Outfile[MAXPATHLEN] = "", /* output file name */ 103 Delimch = '%'; /* delimiting character */ 104 105 static int Cflag = false; /* embedded comments */ 106 static int Sflag = false; /* silent run flag */ 107 static int Oflag = false; /* ordering flag */ 108 static int Iflag = false; /* ignore case flag */ 109 static int Rflag = false; /* randomize order flag */ 110 static int Xflag = false; /* set rotated bit */ 111 static uint32_t Num_pts = 0; /* number of pointers/strings */ 112 113 static off_t *Seekpts; 114 115 static FILE *Sort_1, *Sort_2; /* pointers for sorting */ 116 117 static STRFILE Tbl; /* statistics table */ 118 119 static STR *Firstch; /* first chars of each string */ 120 121 static void add_offset(FILE *, off_t); 122 static int cmp_str(const void *, const void *); 123 static int stable_collate_range_cmp(int, int); 124 static void do_order(void); 125 static void getargs(int, char **); 126 static void randomize(void); 127 static void usage(void) __dead2; 128 129 /* 130 * main: 131 * Drive the sucker. There are two main modes -- either we store 132 * the seek pointers, if the table is to be sorted or randomized, 133 * or we write the pointer directly to the file, if we are to stay 134 * in file order. If the former, we allocate and re-allocate in 135 * CHUNKSIZE blocks; if the latter, we just write each pointer, 136 * and then seek back to the beginning to write in the table. 137 */ 138 int 139 main(int ac, char *av[]) 140 { 141 char *sp, *nsp, dc; 142 FILE *inf, *outf; 143 off_t last_off, pos, *p; 144 size_t length; 145 int first; 146 uint32_t cnt; 147 STR *fp; 148 static char string[257]; 149 150 setlocale(LC_ALL, ""); 151 152 getargs(ac, av); /* evalute arguments */ 153 dc = Delimch; 154 if ((inf = fopen(Infile, "r")) == NULL) { 155 perror(Infile); 156 exit(1); 157 } 158 159 if ((outf = fopen(Outfile, "w")) == NULL) { 160 perror(Outfile); 161 exit(1); 162 } 163 if (!STORING_PTRS) 164 fseek(outf, (long)sizeof(Tbl), SEEK_SET); 165 166 /* 167 * Write the strings onto the file 168 */ 169 170 Tbl.str_longlen = 0; 171 Tbl.str_shortlen = 0xffffffff; 172 Tbl.str_delim = dc; 173 Tbl.str_version = VERSION; 174 first = Oflag; 175 add_offset(outf, ftello(inf)); 176 last_off = 0; 177 do { 178 sp = fgets(string, 256, inf); 179 if (sp == NULL || (sp[0] == dc && sp[1] == '\n')) { 180 pos = ftello(inf); 181 length = (size_t)(pos - last_off) - 182 (sp != NULL ? strlen(sp) : 0); 183 last_off = pos; 184 if (length == 0) 185 continue; 186 add_offset(outf, pos); 187 if ((size_t)Tbl.str_longlen < length) 188 Tbl.str_longlen = length; 189 if ((size_t)Tbl.str_shortlen > length) 190 Tbl.str_shortlen = length; 191 first = Oflag; 192 } 193 else if (first) { 194 for (nsp = sp; !isalnum((unsigned char)*nsp); nsp++) 195 continue; 196 ALLOC(Firstch, Num_pts); 197 fp = &Firstch[Num_pts - 1]; 198 if (Iflag && isupper((unsigned char)*nsp)) 199 fp->first = tolower((unsigned char)*nsp); 200 else 201 fp->first = *nsp; 202 fp->pos = Seekpts[Num_pts - 1]; 203 first = false; 204 } 205 } while (sp != NULL); 206 207 /* 208 * write the tables in 209 */ 210 211 fclose(inf); 212 Tbl.str_numstr = Num_pts - 1; 213 214 if (Cflag) 215 Tbl.str_flags |= STR_COMMENTS; 216 217 if (Oflag) 218 do_order(); 219 else if (Rflag) 220 randomize(); 221 222 if (Xflag) 223 Tbl.str_flags |= STR_ROTATED; 224 225 if (!Sflag) { 226 printf("\"%s\" created\n", Outfile); 227 if (Num_pts == 2) 228 puts("There was 1 string"); 229 else 230 printf("There were %u strings\n", Num_pts - 1); 231 printf("Longest string: %u byte%s\n", Tbl.str_longlen, 232 Tbl.str_longlen == 1 ? "" : "s"); 233 printf("Shortest string: %u byte%s\n", Tbl.str_shortlen, 234 Tbl.str_shortlen == 1 ? "" : "s"); 235 } 236 237 rewind(outf); 238 Tbl.str_version = htobe32(Tbl.str_version); 239 Tbl.str_numstr = htobe32(Tbl.str_numstr); 240 Tbl.str_longlen = htobe32(Tbl.str_longlen); 241 Tbl.str_shortlen = htobe32(Tbl.str_shortlen); 242 Tbl.str_flags = htobe32(Tbl.str_flags); 243 fwrite((char *)&Tbl, sizeof(Tbl), 1, outf); 244 if (STORING_PTRS) { 245 for (p = Seekpts, cnt = Num_pts; cnt--; ++p) 246 *p = htobe64(*p); 247 fwrite(Seekpts, sizeof(*Seekpts), (size_t)Num_pts, outf); 248 } 249 fclose(outf); 250 exit(0); 251 } 252 253 /* 254 * This routine evaluates arguments from the command line 255 */ 256 void 257 getargs(int argc, char **argv) 258 { 259 int ch; 260 261 while ((ch = getopt(argc, argv, "Cc:iorsx")) != -1) 262 switch(ch) { 263 case 'C': /* embedded comments */ 264 Cflag++; 265 break; 266 case 'c': /* new delimiting char */ 267 Delimch = *optarg; 268 if (!isascii(Delimch)) { 269 printf("bad delimiting character: '\\%o\n'", 270 (unsigned char)Delimch); 271 } 272 break; 273 case 'i': /* ignore case in ordering */ 274 Iflag++; 275 break; 276 case 'o': /* order strings */ 277 Oflag++; 278 break; 279 case 'r': /* randomize pointers */ 280 Rflag++; 281 break; 282 case 's': /* silent */ 283 Sflag++; 284 break; 285 case 'x': /* set the rotated bit */ 286 Xflag++; 287 break; 288 case '?': 289 default: 290 usage(); 291 } 292 argv += optind; 293 294 if (*argv) { 295 Infile = *argv; 296 if (*++argv) { 297 if (strlcpy(Outfile, *argv, sizeof(Outfile)) >= 298 sizeof(Outfile)) { 299 fprintf(stderr, 300 "output_file path is too long\n"); 301 exit(1); 302 } 303 } 304 } 305 if (!Infile) { 306 puts("No input file name"); 307 usage(); 308 } 309 if (*Outfile == '\0') { 310 if ((size_t)snprintf(Outfile, sizeof(Outfile), "%s.dat", 311 Infile) >= sizeof(Outfile)) { 312 fprintf(stderr, 313 "generated output_file path is too long\n"); 314 exit(1); 315 } 316 } 317 } 318 319 void 320 usage(void) 321 { 322 fprintf(stderr, 323 "strfile [-Ciorsx] [-c char] source_file [output_file]\n"); 324 exit(1); 325 } 326 327 /* 328 * add_offset: 329 * Add an offset to the list, or write it out, as appropriate. 330 */ 331 void 332 add_offset(FILE *fp, off_t off) 333 { 334 off_t beoff; 335 336 if (!STORING_PTRS) { 337 beoff = htobe64(off); 338 fwrite(&beoff, 1, sizeof(beoff), fp); 339 } else { 340 ALLOC(Seekpts, Num_pts + 1); 341 Seekpts[Num_pts] = off; 342 } 343 Num_pts++; 344 } 345 346 /* 347 * do_order: 348 * Order the strings alphabetically (possibly ignoring case). 349 */ 350 void 351 do_order(void) 352 { 353 uint32_t i; 354 off_t *lp; 355 STR *fp; 356 357 Sort_1 = fopen(Infile, "r"); 358 Sort_2 = fopen(Infile, "r"); 359 qsort(Firstch, (size_t)Tbl.str_numstr, sizeof(*Firstch), cmp_str); 360 i = Tbl.str_numstr; 361 lp = Seekpts; 362 fp = Firstch; 363 while (i--) 364 *lp++ = fp++->pos; 365 fclose(Sort_1); 366 fclose(Sort_2); 367 Tbl.str_flags |= STR_ORDERED; 368 } 369 370 static int 371 stable_collate_range_cmp(int c1, int c2) 372 { 373 static char s1[2], s2[2]; 374 int ret; 375 376 s1[0] = c1; 377 s2[0] = c2; 378 if ((ret = strcoll(s1, s2)) != 0) 379 return (ret); 380 return (c1 - c2); 381 } 382 383 /* 384 * cmp_str: 385 * Compare two strings in the file 386 */ 387 int 388 cmp_str(const void *s1, const void *s2) 389 { 390 const STR *p1, *p2; 391 int c1, c2, n1, n2, r; 392 393 #define SET_N(nf,ch) (nf = (ch == '\n')) 394 #define IS_END(ch,nf) (ch == EOF || (ch == (unsigned char)Delimch && nf)) 395 396 p1 = (const STR *)s1; 397 p2 = (const STR *)s2; 398 399 c1 = (unsigned char)p1->first; 400 c2 = (unsigned char)p2->first; 401 if ((r = stable_collate_range_cmp(c1, c2)) != 0) 402 return (r); 403 404 fseeko(Sort_1, p1->pos, SEEK_SET); 405 fseeko(Sort_2, p2->pos, SEEK_SET); 406 407 n1 = false; 408 n2 = false; 409 while (!isalnum(c1 = getc(Sort_1)) && c1 != '\0' && c1 != EOF) 410 SET_N(n1, c1); 411 while (!isalnum(c2 = getc(Sort_2)) && c2 != '\0' && c2 != EOF) 412 SET_N(n2, c2); 413 414 while (!IS_END(c1, n1) && !IS_END(c2, n2)) { 415 if (Iflag) { 416 if (isupper(c1)) 417 c1 = tolower(c1); 418 if (isupper(c2)) 419 c2 = tolower(c2); 420 } 421 if ((r = stable_collate_range_cmp(c1, c2)) != 0) 422 return (r); 423 SET_N(n1, c1); 424 SET_N(n2, c2); 425 c1 = getc(Sort_1); 426 c2 = getc(Sort_2); 427 } 428 if (IS_END(c1, n1)) 429 c1 = 0; 430 if (IS_END(c2, n2)) 431 c2 = 0; 432 433 return (stable_collate_range_cmp(c1, c2)); 434 } 435 436 /* 437 * randomize: 438 * Randomize the order of the string table. We must be careful 439 * not to randomize across delimiter boundaries. All 440 * randomization is done within each block. 441 */ 442 void 443 randomize(void) 444 { 445 uint32_t cnt, i; 446 off_t tmp; 447 off_t *sp; 448 449 Tbl.str_flags |= STR_RANDOM; 450 cnt = Tbl.str_numstr; 451 452 /* 453 * move things around randomly 454 */ 455 456 for (sp = Seekpts; cnt > 0; cnt--, sp++) { 457 i = arc4random_uniform(cnt); 458 tmp = sp[0]; 459 sp[0] = sp[i]; 460 sp[i] = tmp; 461 } 462 } 463