xref: /freebsd/contrib/libarchive/libarchive/archive_write_set_format_warc.c (revision b9128a37faafede823eb456aa65a11ac69997284)
1  /*-
2   * Copyright (c) 2014 Sebastian Freundt
3   * Author: Sebastian Freundt  <devel@fresse.org>
4   *
5   * All rights reserved.
6   *
7   * Redistribution and use in source and binary forms, with or without
8   * modification, are permitted provided that the following conditions
9   * are met:
10   * 1. Redistributions of source code must retain the above copyright
11   *    notice, this list of conditions and the following disclaimer.
12   * 2. Redistributions in binary form must reproduce the above copyright
13   *    notice, this list of conditions and the following disclaimer in the
14   *    documentation and/or other materials provided with the distribution.
15   *
16   * THIS SOFTWARE IS PROVIDED BY THE AUTHOR(S) ``AS IS'' AND ANY EXPRESS OR
17   * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
18   * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
19   * IN NO EVENT SHALL THE AUTHOR(S) BE LIABLE FOR ANY DIRECT, INDIRECT,
20   * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
21   * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
22   * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
23   * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24   * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
25   * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26   */
27  
28  #include "archive_platform.h"
29  
30  #ifdef HAVE_ERRNO_H
31  #include <errno.h>
32  #endif
33  #include <stdio.h>
34  #ifdef HAVE_STDLIB_H
35  #include <stdlib.h>
36  #endif
37  #ifdef HAVE_STRING_H
38  #include <string.h>
39  #endif
40  #ifdef HAVE_TIME_H
41  #include <time.h>
42  #endif
43  
44  #include "archive.h"
45  #include "archive_entry.h"
46  #include "archive_entry_locale.h"
47  #include "archive_private.h"
48  #include "archive_random_private.h"
49  #include "archive_write_private.h"
50  #include "archive_write_set_format_private.h"
51  
/* Per-archive state kept by the WARC writer. */
struct warc_s {
	/* non-zero once the file-wide warcinfo record was written or disabled */
	unsigned int omit_warcinfo:1;

	/* wall-clock time sampled when the format was selected */
	time_t now;
	/* file type of the entry being written, 0 between entries */
	mode_t typ;
	/* seeded from `now' when the format is set up */
	unsigned int rng;
	/* payload size announced in the current record's header */
	uint64_t populz;
};
61  
62  static const char warcinfo[] =
63      "software: libarchive/" ARCHIVE_VERSION_ONLY_STRING "\r\n"
64      "format: WARC file version 1.0\r\n";
65  
/* WARC record types; the numeric values index the type-name table. */
typedef enum {
	WT_NONE = 0,
	WT_INFO,	/* warcinfo */
	WT_META,	/* metadata */
	WT_RSRC,	/* resource */
	WT_REQ,		/* request, unsupported */
	WT_RSP,		/* response, unsupported */
	WT_RVIS,	/* revisit, unsupported */
	WT_CONV,	/* conversion, unsupported */
	WT_CONT,	/* continuation, unsupported at the moment */
	LAST_WT		/* invalid type, doubles as the number of types */
} warc_type_t;
87  
88  typedef struct {
89  	warc_type_t type;
90  	const char *tgturi;
91  	const char *recid;
92  	time_t rtime;
93  	time_t mtime;
94  	const char *cnttyp;
95  	uint64_t cntlen;
96  } warc_essential_hdr_t;
97  
/* Raw UUID storage, kept as four unsigned int words. */
typedef struct {
	unsigned int u[4U];
} warc_uuid_t;
101  
102  static int _warc_options(struct archive_write*, const char *key, const char *v);
103  static int _warc_header(struct archive_write *a, struct archive_entry *entry);
104  static ssize_t _warc_data(struct archive_write *a, const void *buf, size_t sz);
105  static int _warc_finish_entry(struct archive_write *a);
106  static int _warc_close(struct archive_write *a);
107  static int _warc_free(struct archive_write *a);
108  
109  /* private routines */
110  static ssize_t _popul_ehdr(struct archive_string *t, size_t z, warc_essential_hdr_t);
111  static int _gen_uuid(warc_uuid_t *tgt);
112  
113  
114  /*
115   * Set output format to ISO 28500 (aka WARC) format.
116   */
117  int
118  archive_write_set_format_warc(struct archive *_a)
119  {
120  	struct archive_write *a = (struct archive_write *)_a;
121  	struct warc_s *w;
122  
123  	archive_check_magic(_a, ARCHIVE_WRITE_MAGIC,
124  	    ARCHIVE_STATE_NEW, "archive_write_set_format_warc");
125  
126  	/* If another format was already registered, unregister it. */
127  	if (a->format_free != NULL) {
128  		(a->format_free)(a);
129  	}
130  
131  	w = malloc(sizeof(*w));
132  	if (w == NULL) {
133  		archive_set_error(&a->archive, ENOMEM,
134  		    "Can't allocate warc data");
135  		return (ARCHIVE_FATAL);
136  	}
137  	/* by default we're emitting a file wide header */
138  	w->omit_warcinfo = 0U;
139  	/* obtain current time for date fields */
140  	w->now = time(NULL);
141  	/* reset file type info */
142  	w->typ = 0;
143  	/* also initialise our rng */
144  	w->rng = (unsigned int)w->now;
145  
146  	a->format_data = w;
147  	a->format_name = "WARC/1.0";
148  	a->format_options = _warc_options;
149  	a->format_write_header = _warc_header;
150  	a->format_write_data = _warc_data;
151  	a->format_close = _warc_close;
152  	a->format_free = _warc_free;
153  	a->format_finish_entry = _warc_finish_entry;
154  	a->archive.archive_format = ARCHIVE_FORMAT_WARC;
155  	a->archive.archive_format_name = "WARC/1.0";
156  	return (ARCHIVE_OK);
157  }
158  
159  
160  /* archive methods */
161  static int
162  _warc_options(struct archive_write *a, const char *key, const char *val)
163  {
164  	struct warc_s *w = a->format_data;
165  
166  	if (strcmp(key, "omit-warcinfo") == 0) {
167  		if (val == NULL || strcmp(val, "true") == 0) {
168  			/* great */
169  			w->omit_warcinfo = 1U;
170  			return (ARCHIVE_OK);
171  		}
172  	}
173  
174  	/* Note: The "warn" return is just to inform the options
175  	 * supervisor that we didn't handle it.  It will generate
176  	 * a suitable error if no one used this option. */
177  	return (ARCHIVE_WARN);
178  }
179  
180  static int
181  _warc_header(struct archive_write *a, struct archive_entry *entry)
182  {
183  	struct warc_s *w = a->format_data;
184  	struct archive_string hdr;
185  #define MAX_HDR_SIZE 512
186  
187  	/* check whether warcinfo record needs outputting */
188  	if (!w->omit_warcinfo) {
189  		ssize_t r;
190  		warc_essential_hdr_t wi = {
191  			WT_INFO,
192  			/*uri*/NULL,
193  			/*urn*/NULL,
194  			/*rtm*/0,
195  			/*mtm*/0,
196  			/*cty*/"application/warc-fields",
197  			/*len*/sizeof(warcinfo) - 1U,
198  		};
199  		wi.rtime = w->now;
200  		wi.mtime = w->now;
201  
202  		archive_string_init(&hdr);
203  		r = _popul_ehdr(&hdr, MAX_HDR_SIZE, wi);
204  		if (r >= 0) {
205  			/* jackpot! */
206  			/* now also use HDR buffer for the actual warcinfo */
207  			archive_strncat(&hdr, warcinfo, sizeof(warcinfo) -1);
208  
209  			/* append end-of-record indicator */
210  			archive_strncat(&hdr, "\r\n\r\n", 4);
211  
212  			/* write to output stream */
213  			__archive_write_output(a, hdr.s, archive_strlen(&hdr));
214  		}
215  		/* indicate we're done with file header writing */
216  		w->omit_warcinfo = 1U;
217  		archive_string_free(&hdr);
218  	}
219  
220  	if (archive_entry_pathname(entry) == NULL) {
221  		archive_set_error(&a->archive, EINVAL,
222  		    "Invalid filename");
223  		return (ARCHIVE_WARN);
224  	}
225  
226  	w->typ = archive_entry_filetype(entry);
227  	w->populz = 0U;
228  	if (w->typ == AE_IFREG) {
229  		warc_essential_hdr_t rh = {
230  			WT_RSRC,
231  			/*uri*/NULL,
232  			/*urn*/NULL,
233  			/*rtm*/0,
234  			/*mtm*/0,
235  			/*cty*/NULL,
236  			/*len*/0,
237  		};
238  		ssize_t r;
239  		rh.tgturi = archive_entry_pathname(entry);
240  		rh.rtime = w->now;
241  		rh.mtime = archive_entry_mtime(entry);
242  		rh.cntlen = (size_t)archive_entry_size(entry);
243  
244  		archive_string_init(&hdr);
245  		r = _popul_ehdr(&hdr, MAX_HDR_SIZE, rh);
246  		if (r < 0) {
247  			/* don't bother */
248  			archive_set_error(
249  				&a->archive,
250  				ARCHIVE_ERRNO_FILE_FORMAT,
251  				"cannot archive file");
252  			return (ARCHIVE_WARN);
253  		}
254  		/* otherwise append to output stream */
255  		__archive_write_output(a, hdr.s, r);
256  		/* and let subsequent calls to _data() know about the size */
257  		w->populz = rh.cntlen;
258  		archive_string_free(&hdr);
259  		return (ARCHIVE_OK);
260  	}
261  	/* just resort to erroring as per Tim's advice */
262  	__archive_write_entry_filetype_unsupported(
263  	    &a->archive, entry, "WARC");
264  	return (ARCHIVE_FAILED);
265  }
266  
267  static ssize_t
268  _warc_data(struct archive_write *a, const void *buf, size_t len)
269  {
270  	struct warc_s *w = a->format_data;
271  
272  	if (w->typ == AE_IFREG) {
273  		int rc;
274  
275  		/* never write more bytes than announced */
276  		if (len > w->populz) {
277  			len = (size_t)w->populz;
278  		}
279  
280  		/* now then, out we put the whole shebang */
281  		rc = __archive_write_output(a, buf, len);
282  		if (rc != ARCHIVE_OK) {
283  			return rc;
284  		}
285  	}
286  	return len;
287  }
288  
289  static int
290  _warc_finish_entry(struct archive_write *a)
291  {
292  	static const char _eor[] = "\r\n\r\n";
293  	struct warc_s *w = a->format_data;
294  
295  	if (w->typ == AE_IFREG) {
296  		int rc = __archive_write_output(a, _eor, sizeof(_eor) - 1U);
297  
298  		if (rc != ARCHIVE_OK) {
299  			return rc;
300  		}
301  	}
302  	/* reset type info */
303  	w->typ = 0;
304  	return (ARCHIVE_OK);
305  }
306  
307  static int
308  _warc_close(struct archive_write *a)
309  {
310  	(void)a; /* UNUSED */
311  	return (ARCHIVE_OK);
312  }
313  
314  static int
315  _warc_free(struct archive_write *a)
316  {
317  	struct warc_s *w = a->format_data;
318  
319  	free(w);
320  	a->format_data = NULL;
321  	return (ARCHIVE_OK);
322  }
323  
324  
325  /* private routines */
/*
 * Like strftime(3), but takes a time_t, breaks it down as UTC and
 * appends the formatted text to AS.  Nothing is appended when the
 * time cannot be broken down.
 */
static void
xstrftime(struct archive_string *as, const char *fmt, time_t t)
{
	struct tm *utc;
#if defined(HAVE_GMTIME_R) || defined(HAVE_GMTIME_S)
	struct tm tmbuf;
#endif
	char text[100];
	size_t n;

#if defined(HAVE_GMTIME_S)
	utc = (gmtime_s(&tmbuf, &t) == 0) ? &tmbuf : NULL;
#elif defined(HAVE_GMTIME_R)
	utc = gmtime_r(&t, &tmbuf);
#else
	utc = gmtime(&t);
#endif
	if (utc != NULL) {
		/* the actual formatting is strftime()'s job */
		n = strftime(text, sizeof(text) - 1, fmt, utc);
		archive_strncat(as, text, n);
	}
}
350  
351  static ssize_t
352  _popul_ehdr(struct archive_string *tgt, size_t tsz, warc_essential_hdr_t hdr)
353  {
354  	static const char _ver[] = "WARC/1.0\r\n";
355  	static const char * const _typ[LAST_WT] = {
356  		NULL, "warcinfo", "metadata", "resource", NULL
357  	};
358  	char std_uuid[48U];
359  
360  	if (hdr.type == WT_NONE || hdr.type > WT_RSRC) {
361  		/* brilliant, how exactly did we get here? */
362  		return -1;
363  	}
364  
365  	archive_strcpy(tgt, _ver);
366  
367  	archive_string_sprintf(tgt, "WARC-Type: %s\r\n", _typ[hdr.type]);
368  
369  	if (hdr.tgturi != NULL) {
370  		/* check if there's a xyz:// */
371  		static const char _uri[] = "";
372  		static const char _fil[] = "file://";
373  		const char *u;
374  		char *chk = strchr(hdr.tgturi, ':');
375  
376  		if (chk != NULL && chk[1U] == '/' && chk[2U] == '/') {
377  			/* yep, it's definitely a URI */
378  			u = _uri;
379  		} else {
380  			/* hm, best to prepend file:// then */
381  			u = _fil;
382  		}
383  		archive_string_sprintf(tgt,
384  			"WARC-Target-URI: %s%s\r\n", u, hdr.tgturi);
385  	}
386  
387  	/* record time is usually when the http is sent off,
388  	 * just treat the archive writing as such for a moment */
389  	xstrftime(tgt, "WARC-Date: %Y-%m-%dT%H:%M:%SZ\r\n", hdr.rtime);
390  
391  	/* while we're at it, record the mtime */
392  	xstrftime(tgt, "Last-Modified: %Y-%m-%dT%H:%M:%SZ\r\n", hdr.mtime);
393  
394  	if (hdr.recid == NULL) {
395  		/* generate one, grrrr */
396  		warc_uuid_t u;
397  
398  		_gen_uuid(&u);
399  		/* Unfortunately, archive_string_sprintf does not
400  		 * handle the minimum number following '%'.
401  		 * So we have to use snprintf function here instead
402  		 * of archive_string_snprintf function. */
403  #if defined(_WIN32) && !defined(__CYGWIN__) && !( defined(_MSC_VER) && _MSC_VER >= 1900)
404  #define snprintf _snprintf
405  #endif
406  		snprintf(
407  			std_uuid, sizeof(std_uuid),
408  			"<urn:uuid:%08x-%04x-%04x-%04x-%04x%08x>",
409  			u.u[0U],
410  			u.u[1U] >> 16U, u.u[1U] & 0xffffU,
411  			u.u[2U] >> 16U, u.u[2U] & 0xffffU,
412  			u.u[3U]);
413  		hdr.recid = std_uuid;
414  	}
415  
416  	/* record-id is mandatory, fingers crossed we won't fail */
417  	archive_string_sprintf(tgt, "WARC-Record-ID: %s\r\n", hdr.recid);
418  
419  	if (hdr.cnttyp != NULL) {
420  		archive_string_sprintf(tgt, "Content-Type: %s\r\n", hdr.cnttyp);
421  	}
422  
423  	/* next one is mandatory */
424  	archive_string_sprintf(tgt, "Content-Length: %ju\r\n", (uintmax_t)hdr.cntlen);
425  	/**/
426  	archive_strncat(tgt, "\r\n", 2);
427  
428  	return (archive_strlen(tgt) >= tsz)? -1: (ssize_t)archive_strlen(tgt);
429  }
430  
431  static int
432  _gen_uuid(warc_uuid_t *tgt)
433  {
434  	archive_random(tgt->u, sizeof(tgt->u));
435  	/* obey uuid version 4 rules */
436  	tgt->u[1U] &= 0xffff0fffU;
437  	tgt->u[1U] |= 0x4000U;
438  	tgt->u[2U] &= 0x3fffffffU;
439  	tgt->u[2U] |= 0x80000000U;
440  	return 0;
441  }
442  
443  /* archive_write_set_format_warc.c ends here */
444