1 /*-
2 * Copyright (c) 2014 Sebastian Freundt
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR(S) ``AS IS'' AND ANY EXPRESS OR
15 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
16 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
17 * IN NO EVENT SHALL THE AUTHOR(S) BE LIABLE FOR ANY DIRECT, INDIRECT,
18 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
19 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
20 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
21 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
23 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24 */
25
26 #include "archive_platform.h"
27
28 /**
29 * WARC is standardised by ISO TC46/SC4/WG12 and currently available as
30 * ISO 28500:2009.
31 * For the purposes of this file we used the final draft from:
32 * http://bibnum.bnf.fr/warc/WARC_ISO_28500_version1_latestdraft.pdf
33 *
34 * Todo:
35 * [ ] real-world warcs can contain resources at endpoints ending in /
36 * e.g. http://bibnum.bnf.fr/warc/
37 * if you're lucky their response contains a Content-Location: header
38 * pointing to a unix-compliant filename, in the example above it's
39 * Content-Location: http://bibnum.bnf.fr/warc/index.html
40 * however, that's not mandated and github for example doesn't follow
41 * this convention.
42 * We need a set of archive options to control what to do with
43 * entries like these, at the moment care is taken to skip them.
44 *
45 **/
46
47 #ifdef HAVE_SYS_STAT_H
48 #include <sys/stat.h>
49 #endif
50 #ifdef HAVE_ERRNO_H
51 #include <errno.h>
52 #endif
53 #ifdef HAVE_STDLIB_H
54 #include <stdlib.h>
55 #endif
56 #ifdef HAVE_STRING_H
57 #include <string.h>
58 #endif
59 #ifdef HAVE_LIMITS_H
60 #include <limits.h>
61 #endif
62 #ifdef HAVE_CTYPE_H
63 #include <ctype.h>
64 #endif
65 #ifdef HAVE_TIME_H
66 #include <time.h>
67 #endif
68
69 #include "archive.h"
70 #include "archive_entry.h"
71 #include "archive_private.h"
72 #include "archive_read_private.h"
73
/* Record types a WARC-Type header can name; the order is significant
 * only in that WT_NONE comes first and LAST_WT comes last. */
typedef enum {
	WT_NONE,	/* no type found */
	WT_INFO,	/* warcinfo */
	WT_META,	/* metadata */
	WT_RSRC,	/* resource */
	WT_REQ,		/* request, unsupported */
	WT_RSP,		/* response, unsupported */
	WT_RVIS,	/* revisit, unsupported */
	WT_CONV,	/* conversion, unsupported */
	WT_CONT,	/* continuation, unsupported at the moment */
	LAST_WT		/* invalid type, doubles as end marker */
} warc_type_t;
95
/* Read-only, length-counted string view (length first, then pointer). */
typedef struct {
	size_t len;		/* number of bytes at str */
	const char *str;	/* start of the bytes, not owned */
} warc_string_t;
100
/* Writable, length-counted buffer (length first, then pointer). */
typedef struct {
	size_t len;	/* size recorded for the buffer at str */
	char *str;	/* start of the buffer */
} warc_strbuf_t;
105
106 struct warc_s {
107 /* content length ahead */
108 size_t cntlen;
109 /* and how much we've processed so far */
110 size_t cntoff;
111 /* and how much we need to consume between calls */
112 size_t unconsumed;
113
114 /* string pool */
115 warc_strbuf_t pool;
116 /* previous version */
117 unsigned int pver;
118 /* stringified format name */
119 struct archive_string sver;
120 };
121
122 static int _warc_bid(struct archive_read *a, int);
123 static int _warc_cleanup(struct archive_read *a);
124 static int _warc_read(struct archive_read*, const void**, size_t*, int64_t*);
125 static int _warc_skip(struct archive_read *a);
126 static int _warc_rdhdr(struct archive_read *a, struct archive_entry *e);
127
128 /* private routines */
129 static unsigned int _warc_rdver(const char *buf, size_t bsz);
130 static unsigned int _warc_rdtyp(const char *buf, size_t bsz);
131 static warc_string_t _warc_rduri(const char *buf, size_t bsz);
132 static ssize_t _warc_rdlen(const char *buf, size_t bsz);
133 static time_t _warc_rdrtm(const char *buf, size_t bsz);
134 static time_t _warc_rdmtm(const char *buf, size_t bsz);
135 static const char *_warc_find_eoh(const char *buf, size_t bsz);
136 static const char *_warc_find_eol(const char *buf, size_t bsz);
137
138 int
archive_read_support_format_warc(struct archive * _a)139 archive_read_support_format_warc(struct archive *_a)
140 {
141 struct archive_read *a = (struct archive_read *)_a;
142 struct warc_s *w;
143 int r;
144
145 archive_check_magic(_a, ARCHIVE_READ_MAGIC,
146 ARCHIVE_STATE_NEW, "archive_read_support_format_warc");
147
148 if ((w = calloc(1, sizeof(*w))) == NULL) {
149 archive_set_error(&a->archive, ENOMEM,
150 "Can't allocate warc data");
151 return (ARCHIVE_FATAL);
152 }
153
154 r = __archive_read_register_format(
155 a, w, "warc",
156 _warc_bid, NULL, _warc_rdhdr, _warc_read,
157 _warc_skip, NULL, _warc_cleanup, NULL, NULL);
158
159 if (r != ARCHIVE_OK) {
160 free(w);
161 return (r);
162 }
163 return (ARCHIVE_OK);
164 }
165
166 static int
_warc_cleanup(struct archive_read * a)167 _warc_cleanup(struct archive_read *a)
168 {
169 struct warc_s *w = a->format->data;
170
171 if (w->pool.len > 0U) {
172 free(w->pool.str);
173 }
174 archive_string_free(&w->sver);
175 free(w);
176 a->format->data = NULL;
177 return (ARCHIVE_OK);
178 }
179
/*
 * Bid for the stream: peek at the first 12 bytes and claim the archive
 * with a bid of 64 when they carry a WARC version line in the range
 * 0.12 to 1.0.  Returns -1 otherwise.  BEST_BID is ignored.
 */
static int
_warc_bid(struct archive_read *a, int best_bid)
{
	const char *probe;
	ssize_t avail;
	unsigned int version;

	(void)best_bid; /* UNUSED */

	/* the first line of the file should be a record already, and
	 * our magic cookie is at least 12 bytes long */
	probe = __archive_read_ahead(a, 12U, &avail);
	if (probe == NULL || avail < 12)
		return -1;

	/* versions are encoded as major * 10000 + minor * 100 */
	version = _warc_rdver(probe, avail);
	if (version >= 1200U && version <= 10000U) {
		/* WARC 0.12 to 1.0, be confident */
		return (64);
	}
	return -1;
}
208
209 static int
_warc_rdhdr(struct archive_read * a,struct archive_entry * entry)210 _warc_rdhdr(struct archive_read *a, struct archive_entry *entry)
211 {
212 #define HDR_PROBE_LEN (12U)
213 struct warc_s *w = a->format->data;
214 unsigned int ver;
215 const char *buf;
216 ssize_t nrd;
217 const char *eoh;
218 char *tmp;
219 /* for the file name, saves some strndup()'ing */
220 warc_string_t fnam;
221 /* warc record type, not that we really use it a lot */
222 warc_type_t ftyp;
223 /* content-length+error monad */
224 ssize_t cntlen;
225 /* record time is the WARC-Date time we reinterpret it as ctime */
226 time_t rtime;
227 /* mtime is the Last-Modified time which will be the entry's mtime */
228 time_t mtime;
229
230 start_over:
231 /* just use read_ahead() they keep track of unconsumed
232 * bits and bobs for us; no need to put an extra shift in
233 * and reproduce that functionality here */
234 buf = __archive_read_ahead(a, HDR_PROBE_LEN, &nrd);
235
236 if (nrd < 0) {
237 /* no good */
238 archive_set_error(
239 &a->archive, ARCHIVE_ERRNO_MISC,
240 "Bad record header");
241 return (ARCHIVE_FATAL);
242 } else if (buf == NULL) {
243 /* there should be room for at least WARC/bla\r\n
244 * must be EOF therefore */
245 return (ARCHIVE_EOF);
246 }
247 /* looks good so far, try and find the end of the header now */
248 eoh = _warc_find_eoh(buf, nrd);
249 if (eoh == NULL) {
250 /* still no good, the header end might be beyond the
251 * probe we've requested, but then again who'd cram
252 * so much stuff into the header *and* be 28500-compliant */
253 archive_set_error(
254 &a->archive, ARCHIVE_ERRNO_MISC,
255 "Bad record header");
256 return (ARCHIVE_FATAL);
257 }
258 ver = _warc_rdver(buf, eoh - buf);
259 /* we currently support WARC 0.12 to 1.0 */
260 if (ver == 0U) {
261 archive_set_error(
262 &a->archive, ARCHIVE_ERRNO_MISC,
263 "Invalid record version");
264 return (ARCHIVE_FATAL);
265 } else if (ver < 1200U || ver > 10000U) {
266 archive_set_error(
267 &a->archive, ARCHIVE_ERRNO_MISC,
268 "Unsupported record version: %u.%u",
269 ver / 10000, (ver % 10000) / 100);
270 return (ARCHIVE_FATAL);
271 }
272 cntlen = _warc_rdlen(buf, eoh - buf);
273 if (cntlen < 0) {
274 /* nightmare! the specs say content-length is mandatory
275 * so I don't feel overly bad stopping the reader here */
276 archive_set_error(
277 &a->archive, EINVAL,
278 "Bad content length");
279 return (ARCHIVE_FATAL);
280 }
281 rtime = _warc_rdrtm(buf, eoh - buf);
282 if (rtime == (time_t)-1) {
283 /* record time is mandatory as per WARC/1.0,
284 * so just barf here, fast and loud */
285 archive_set_error(
286 &a->archive, EINVAL,
287 "Bad record time");
288 return (ARCHIVE_FATAL);
289 }
290
291 /* let the world know we're a WARC archive */
292 a->archive.archive_format = ARCHIVE_FORMAT_WARC;
293 if (ver != w->pver) {
294 /* stringify this entry's version */
295 archive_string_sprintf(&w->sver,
296 "WARC/%u.%u", ver / 10000, (ver % 10000) / 100);
297 /* remember the version */
298 w->pver = ver;
299 }
300 /* start off with the type */
301 ftyp = _warc_rdtyp(buf, eoh - buf);
302 /* and let future calls know about the content */
303 w->cntlen = cntlen;
304 w->cntoff = 0U;
305 mtime = 0;/* Avoid compiling error on some platform. */
306
307 switch (ftyp) {
308 case WT_RSRC:
309 case WT_RSP:
310 /* only try and read the filename in the cases that are
311 * guaranteed to have one */
312 fnam = _warc_rduri(buf, eoh - buf);
313 /* check the last character in the URI to avoid creating
314 * directory endpoints as files, see Todo above */
315 if (fnam.len == 0 || fnam.str[fnam.len - 1] == '/') {
316 /* break here for now */
317 fnam.len = 0U;
318 fnam.str = NULL;
319 break;
320 }
321 /* bang to our string pool, so we save a
322 * malloc()+free() roundtrip */
323 if (fnam.len + 1U > w->pool.len) {
324 w->pool.len = ((fnam.len + 64U) / 64U) * 64U;
325 tmp = realloc(w->pool.str, w->pool.len);
326 if (tmp == NULL) {
327 archive_set_error(
328 &a->archive, ENOMEM,
329 "Out of memory");
330 return (ARCHIVE_FATAL);
331 }
332 w->pool.str = tmp;
333 }
334 memcpy(w->pool.str, fnam.str, fnam.len);
335 w->pool.str[fnam.len] = '\0';
336 /* let no one else know about the pool, it's a secret, shhh */
337 fnam.str = w->pool.str;
338
339 /* snarf mtime or deduce from rtime
340 * this is a custom header added by our writer, it's quite
341 * hard to believe anyone else would go through with it
342 * (apart from being part of some http responses of course) */
343 if ((mtime = _warc_rdmtm(buf, eoh - buf)) == (time_t)-1) {
344 mtime = rtime;
345 }
346 break;
347 case WT_NONE:
348 case WT_INFO:
349 case WT_META:
350 case WT_REQ:
351 case WT_RVIS:
352 case WT_CONV:
353 case WT_CONT:
354 case LAST_WT:
355 default:
356 fnam.len = 0U;
357 fnam.str = NULL;
358 break;
359 }
360
361 /* now eat some of those delicious buffer bits */
362 __archive_read_consume(a, eoh - buf);
363
364 switch (ftyp) {
365 case WT_RSRC:
366 case WT_RSP:
367 if (fnam.len > 0U) {
368 /* populate entry object */
369 archive_entry_set_filetype(entry, AE_IFREG);
370 archive_entry_copy_pathname(entry, fnam.str);
371 archive_entry_set_size(entry, cntlen);
372 archive_entry_set_perm(entry, 0644);
373 /* rtime is the new ctime, mtime stays mtime */
374 archive_entry_set_ctime(entry, rtime, 0L);
375 archive_entry_set_mtime(entry, mtime, 0L);
376 break;
377 }
378 /* FALLTHROUGH */
379 case WT_NONE:
380 case WT_INFO:
381 case WT_META:
382 case WT_REQ:
383 case WT_RVIS:
384 case WT_CONV:
385 case WT_CONT:
386 case LAST_WT:
387 default:
388 /* consume the content and start over */
389 if (_warc_skip(a) < 0)
390 return (ARCHIVE_FATAL);
391 goto start_over;
392 }
393 return (ARCHIVE_OK);
394 }
395
396 static int
_warc_read(struct archive_read * a,const void ** buf,size_t * bsz,int64_t * off)397 _warc_read(struct archive_read *a, const void **buf, size_t *bsz, int64_t *off)
398 {
399 struct warc_s *w = a->format->data;
400 const char *rab;
401 ssize_t nrd;
402
403 if (w->cntoff >= w->cntlen) {
404 eof:
405 /* it's our lucky day, no work, we can leave early */
406 *buf = NULL;
407 *bsz = 0U;
408 *off = w->cntoff + 4U/*for \r\n\r\n separator*/;
409 w->unconsumed = 0U;
410 return (ARCHIVE_EOF);
411 }
412
413 if (w->unconsumed) {
414 __archive_read_consume(a, w->unconsumed);
415 w->unconsumed = 0U;
416 }
417
418 rab = __archive_read_ahead(a, 1U, &nrd);
419 if (nrd < 0) {
420 *bsz = 0U;
421 /* big catastrophe */
422 return (int)nrd;
423 } else if (nrd == 0) {
424 goto eof;
425 } else if ((size_t)nrd > w->cntlen - w->cntoff) {
426 /* clamp to content-length */
427 nrd = w->cntlen - w->cntoff;
428 }
429 *off = w->cntoff;
430 *bsz = nrd;
431 *buf = rab;
432
433 w->cntoff += nrd;
434 w->unconsumed = (size_t)nrd;
435 return (ARCHIVE_OK);
436 }
437
438 static int
_warc_skip(struct archive_read * a)439 _warc_skip(struct archive_read *a)
440 {
441 struct warc_s *w = a->format->data;
442
443 if (__archive_read_consume(a, w->cntlen) < 0 ||
444 __archive_read_consume(a, 4U/*\r\n\r\n separator*/) < 0)
445 return (ARCHIVE_FATAL);
446 w->cntlen = 0U;
447 w->cntoff = 0U;
448 return (ARCHIVE_OK);
449 }
450
451
452 /* private routines */
/* Strip the const qualifier off C by going through an integer, which
 * avoids cast-qual warnings.  The address itself is unchanged. */
static void*
deconst(const void *c)
{
	uintptr_t addr = (uintptr_t)c;

	return (void *)addr;
}
458
/*
 * Locate the first occurrence of the NEEDLESIZE bytes at NEEDLE within
 * the HAYSIZE bytes at HAY, like memmem().  Returns a pointer to the
 * match, HAY itself for an empty needle, or NULL if there is none.
 *
 * A rolling XOR over a needle-sized window is used as a cheap filter
 * before the bytes are actually compared.
 */
static char*
xmemmem(const char *hay, const size_t haysize,
	const char *needle, const size_t needlesize)
{
	const char *const hay_end = hay + haysize;
	const char *const needle_end = needle + needlesize;
	const char *h;
	const char *n;
	const char *win;
	unsigned int hay_sum;
	unsigned int needle_sum;
	unsigned int all_equal = 1U;

	/* a 0-sized needle is defined to be found anywhere in haystack */
	if (needlesize == 0UL)
		return (char *)(uintptr_t)hay;

	/* jump to the first byte that could start a match */
	hay = memchr(hay, *needle, haysize);
	if (hay == NULL)
		return NULL;

	/* The first bytes of both agree now.  Fold the needle and the
	 * first needle-sized window of the haystack into their XOR
	 * sums, noting on the way whether they are the same. */
	hay_sum = *hay;
	needle_sum = *hay;
	h = hay + 1U;
	n = needle + 1U;
	while (h < hay_end && n < needle_end) {
		hay_sum ^= *h;
		needle_sum ^= *n;
		all_equal &= (*h == *n);
		h++;
		n++;
	}

	if (n < needle_end) {
		/* haystack ran out before the needle did */
		return NULL;
	}
	if (all_equal) {
		/* the very first window is a match */
		return (char *)(uintptr_t)hay;
	}

	/* slide the window one byte at a time: drop the byte leaving
	 * on the left, add the one entering on the right (H) */
	win = hay;
	while (h < hay_end) {
		hay_sum ^= *win++;
		hay_sum ^= *h;

		/* with equal sums, matching the first NEEDLESIZE - 1
		 * bytes settles the last one as well; WIN is always
		 * behind H, so no range checks are needed */
		if (hay_sum == needle_sum &&
		    memcmp(win, needle, needlesize - 1U) == 0)
			return (char *)(uintptr_t)win;
		h++;
	}
	return NULL;
}
516
517 static int
strtoi_lim(const char * str,const char ** ep,int llim,int ulim)518 strtoi_lim(const char *str, const char **ep, int llim, int ulim)
519 {
520 int res = 0;
521 const char *sp;
522 /* we keep track of the number of digits via rulim */
523 int rulim;
524
525 for (sp = str, rulim = ulim > 10 ? ulim : 10;
526 res * 10 <= ulim && rulim && *sp >= '0' && *sp <= '9';
527 sp++, rulim /= 10) {
528 res *= 10;
529 res += *sp - '0';
530 }
531 if (sp == str) {
532 res = -1;
533 } else if (res < llim || res > ulim) {
534 res = -2;
535 }
536 *ep = (const char*)sp;
537 return res;
538 }
539
/*
 * Convert the broken-down UTC time T into seconds since the epoch.
 *
 * Uses _mkgmtime() or timegm() where the build found one.  Otherwise
 * mktime() is called for its side effect of filling in tm_yday, and
 * the stamp is computed from the fields directly.  Returns (time_t)-1
 * if mktime() fails in that fallback.
 */
static time_t
time_from_tm(struct tm *t)
{
#if HAVE__MKGMTIME
	return _mkgmtime(t);
#elif HAVE_TIMEGM
	/* Use platform timegm() if available. */
	return (timegm(t));
#else
	/* Else use direct calculation using POSIX assumptions. */
	/* First, fix up tm_yday based on the year/month/day. */
	if (mktime(t) == (time_t)-1)
		return ((time_t)-1);
	/* Then we can compute timegm() from first principles.
	 * Every term is widened to time_t before it is scaled, as
	 * the seconds-per-year product no longer fits a 32-bit int
	 * for years past 2038. */
	return ((time_t)t->tm_sec
	    + (time_t)t->tm_min * 60
	    + (time_t)t->tm_hour * 3600
	    + (time_t)t->tm_yday * 86400
	    + (time_t)(t->tm_year - 70) * 31536000
	    + (time_t)((t->tm_year - 69) / 4) * 86400
	    - (time_t)((t->tm_year - 1) / 100) * 86400
	    + (time_t)((t->tm_year + 299) / 400) * 86400);
#endif
}
564
/*
 * Like strptime() but strictly for ISO 8601 Zulu strings of the form
 * YYYY-MM-DDThh:mm:ssZ, with leading blanks and tabs skipped.
 *
 * Returns the time stamp, or (time_t)-1 if S does not parse.  If ENDPTR
 * is non-NULL it receives the position at which parsing stopped.
 */
static time_t
xstrpisotime(const char *s, char **endptr)
{
	struct tm tm;
	time_t stamp = (time_t)-1;

	/* make sure tm is clean */
	memset(&tm, 0, sizeof(tm));

	/* as a courtesy to our callers, and since this is a
	 * non-standard routine, we skip leading whitespace */
	for (; *s == ' ' || *s == '\t'; s++)
		;

	/* each field is followed by its separator, which is stepped
	 * over only if the field itself was read successfully */
	tm.tm_year = strtoi_lim(s, &s, 1583, 4095);
	if (tm.tm_year < 0 || *s++ != '-')
		goto done;
	tm.tm_mon = strtoi_lim(s, &s, 1, 12);
	if (tm.tm_mon < 0 || *s++ != '-')
		goto done;
	tm.tm_mday = strtoi_lim(s, &s, 1, 31);
	if (tm.tm_mday < 0 || *s++ != 'T')
		goto done;
	tm.tm_hour = strtoi_lim(s, &s, 0, 23);
	if (tm.tm_hour < 0 || *s++ != ':')
		goto done;
	tm.tm_min = strtoi_lim(s, &s, 0, 59);
	if (tm.tm_min < 0 || *s++ != ':')
		goto done;
	/* 60 is let through for leap seconds */
	tm.tm_sec = strtoi_lim(s, &s, 0, 60);
	if (tm.tm_sec < 0 || *s++ != 'Z')
		goto done;

	/* struct tm counts years from 1900 and months from 0 */
	tm.tm_year -= 1900;
	tm.tm_mon -= 1;

	/* now convert our custom tm struct to a unix stamp using UTC */
	stamp = time_from_tm(&tm);

done:
	if (endptr != NULL)
		*endptr = deconst(s);
	return stamp;
}
618
/*
 * Read the version off a "WARC/x.y" or "WARC/x.yy" line at BUF.
 *
 * The result is major * 10000 + minor * 100, where a two-digit minor
 * counts as such (0.12 gives 1200, 1.0 gives 10000).  0 is returned if
 * fewer than 12 bytes are available, the magic or the digits are
 * missing, or the version is not terminated the way its format
 * demands: CRLF from 0.12 on, a blank or tab before that.
 */
static unsigned int
_warc_rdver(const char *buf, size_t bsz)
{
	static const char magic[] = "WARC/";
	const size_t magic_len = sizeof(magic) - 1U;
	const char *v;
	const char *term;
	unsigned int ver;

	if (bsz < 12 || memcmp(buf, magic, magic_len) != 0) {
		/* buffer too small or invalid magic */
		return 0U;
	}
	v = buf + magic_len;

	/* digit, dot, digit is the least we need */
	if (!isdigit((unsigned char)v[0U]) || v[1U] != '.' ||
	    !isdigit((unsigned char)v[2U]))
		return 0U;

	/* set up major version */
	ver = (v[0U] - '0') * 10000U;
	/* set up minor version, at most 2 digits are supported */
	if (isdigit((unsigned char)v[3U])) {
		ver += (v[2U] - '0') * 1000U;
		ver += (v[3U] - '0') * 100U;
		term = v + 4U;
	} else {
		ver += (v[2U] - '0') * 100U;
		term = v + 3U;
	}

	/*
	 * WARC below version 0.12 has a space-separated header
	 * WARC 0.12 and above terminates the version with a CRLF
	 */
	if (ver >= 1200U) {
		if (memcmp(term, "\r\n", 2U) != 0)
			return 0U;
	} else if (*term != ' ' && *term != '\t') {
		return 0U;
	}
	return ver;
}
663
664 static unsigned int
_warc_rdtyp(const char * buf,size_t bsz)665 _warc_rdtyp(const char *buf, size_t bsz)
666 {
667 static const char _key[] = "\r\nWARC-Type:";
668 const char *val, *eol;
669
670 if ((val = xmemmem(buf, bsz, _key, sizeof(_key) - 1U)) == NULL) {
671 /* no bother */
672 return WT_NONE;
673 }
674 val += sizeof(_key) - 1U;
675 if ((eol = _warc_find_eol(val, buf + bsz - val)) == NULL) {
676 /* no end of line */
677 return WT_NONE;
678 }
679
680 /* overread whitespace */
681 while (val < eol && (*val == ' ' || *val == '\t'))
682 ++val;
683
684 if (val + 8U == eol) {
685 if (memcmp(val, "resource", 8U) == 0)
686 return WT_RSRC;
687 else if (memcmp(val, "response", 8U) == 0)
688 return WT_RSP;
689 }
690 return WT_NONE;
691 }
692
693 static warc_string_t
_warc_rduri(const char * buf,size_t bsz)694 _warc_rduri(const char *buf, size_t bsz)
695 {
696 static const char _key[] = "\r\nWARC-Target-URI:";
697 const char *val, *uri, *eol, *p;
698 warc_string_t res = {0U, NULL};
699
700 if ((val = xmemmem(buf, bsz, _key, sizeof(_key) - 1U)) == NULL) {
701 /* no bother */
702 return res;
703 }
704 /* overread whitespace */
705 val += sizeof(_key) - 1U;
706 if ((eol = _warc_find_eol(val, buf + bsz - val)) == NULL) {
707 /* no end of line */
708 return res;
709 }
710
711 while (val < eol && (*val == ' ' || *val == '\t'))
712 ++val;
713
714 /* overread URL designators */
715 if ((uri = xmemmem(val, eol - val, "://", 3U)) == NULL) {
716 /* not touching that! */
717 return res;
718 }
719
720 /* spaces inside uri are not allowed, CRLF should follow */
721 for (p = val; p < eol; p++) {
722 if (isspace((unsigned char)*p))
723 return res;
724 }
725
726 /* there must be at least space for ftp */
727 if (uri < (val + 3U))
728 return res;
729
730 /* move uri to point to after :// */
731 uri += 3U;
732
733 /* now then, inspect the URI */
734 if (memcmp(val, "file", 4U) == 0) {
735 /* perfect, nothing left to do here */
736
737 } else if (memcmp(val, "http", 4U) == 0 ||
738 memcmp(val, "ftp", 3U) == 0) {
739 /* overread domain, and the first / */
740 while (uri < eol && *uri++ != '/');
741 } else {
742 /* not sure what to do? best to bugger off */
743 return res;
744 }
745 res.str = uri;
746 res.len = eol - uri;
747 return res;
748 }
749
/*
 * Read the value of the Content-Length header within the BSZ bytes at
 * BUF.  Returns the length, or -1 if the header is missing, its line is
 * unterminated, the value does not start with a digit, is followed by
 * anything but the end of line, or strtol() reports an error.
 */
static ssize_t
_warc_rdlen(const char *buf, size_t bsz)
{
	static const char _key[] = "\r\nContent-Length:";
	const char *p;
	const char *line_end;
	char *stop = NULL;
	long int n;

	p = xmemmem(buf, bsz, _key, sizeof(_key) - 1U);
	if (p == NULL) {
		/* no such header */
		return -1;
	}
	p += sizeof(_key) - 1U;

	line_end = _warc_find_eol(p, buf + bsz - p);
	if (line_end == NULL) {
		/* no end of line */
		return -1;
	}

	/* skip leading whitespace */
	for (; p < line_end && (*p == ' ' || *p == '\t'); p++)
		;

	/* there must be at least one digit, this rules out signs too */
	if (!isdigit((unsigned char)*p))
		return -1;

	errno = 0;
	n = strtol(p, &stop, 10);
	if (errno != 0 || stop != line_end) {
		/* out of range, or the line does not end here */
		return -1;
	}
	return (size_t)n;
}
783
/*
 * Read the record time off the WARC-Date header within the BSZ bytes
 * at BUF.  Returns (time_t)-1 if the header is missing, its line is
 * unterminated, or the value is not an ISO 8601 Zulu stamp that ends
 * exactly at the end of the line.
 */
static time_t
_warc_rdrtm(const char *buf, size_t bsz)
{
	static const char _key[] = "\r\nWARC-Date:";
	const char *val;
	const char *eol;
	char *stop = NULL;
	time_t stamp;

	val = xmemmem(buf, bsz, _key, sizeof(_key) - 1U);
	if (val == NULL) {
		/* no such header */
		return (time_t)-1;
	}
	val += sizeof(_key) - 1U;

	eol = _warc_find_eol(val, buf + bsz - val);
	if (eol == NULL) {
		/* no end of line */
		return -1;
	}

	/* xstrpisotime() kindly overreads whitespace for us */
	stamp = xstrpisotime(val, &stop);
	/* the line must end where the stamp does */
	return (stop == eol) ? stamp : -1;
}
810
/*
 * Read the modification time off the Last-Modified header within the
 * BSZ bytes at BUF.  Returns (time_t)-1 if the header is missing, its
 * line is unterminated, or the value is not an ISO 8601 Zulu stamp
 * that ends exactly at the end of the line.
 */
static time_t
_warc_rdmtm(const char *buf, size_t bsz)
{
	static const char _key[] = "\r\nLast-Modified:";
	const char *val;
	const char *eol;
	char *stop = NULL;
	time_t stamp;

	val = xmemmem(buf, bsz, _key, sizeof(_key) - 1U);
	if (val == NULL) {
		/* no such header */
		return (time_t)-1;
	}
	val += sizeof(_key) - 1U;

	eol = _warc_find_eol(val, buf + bsz - val);
	if (eol == NULL) {
		/* no end of line */
		return -1;
	}

	/* xstrpisotime() kindly overreads whitespace for us */
	stamp = xstrpisotime(val, &stop);
	/* the line must end where the stamp does */
	return (stop == eol) ? stamp : -1;
}
837
/*
 * Find the end of the header within the BSZ bytes at BUF.  Returns a
 * pointer just past the first blank line (CRLF CRLF), or NULL if there
 * is none.
 */
static const char*
_warc_find_eoh(const char *buf, size_t bsz)
{
	static const char _marker[] = "\r\n\r\n";
	const size_t marker_len = sizeof(_marker) - 1U;
	const char *at = xmemmem(buf, bsz, _marker, marker_len);

	if (at == NULL)
		return NULL;
	/* the marker itself still belongs to the header */
	return at + marker_len;
}
849
/*
 * Find the end of the line within the BSZ bytes at BUF.  Returns a
 * pointer to the first CRLF (not past it), or NULL if there is none.
 */
static const char*
_warc_find_eol(const char *buf, size_t bsz)
{
	static const char _marker[] = "\r\n";

	return xmemmem(buf, bsz, _marker, sizeof(_marker) - 1U);
}
858 /* archive_read_support_format_warc.c ends here */
859