1 /*- 2 * Copyright (c) 2014 Sebastian Freundt 3 * Author: Sebastian Freundt <devel@fresse.org> 4 * 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR(S) ``AS IS'' AND ANY EXPRESS OR 17 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 18 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 19 * IN NO EVENT SHALL THE AUTHOR(S) BE LIABLE FOR ANY DIRECT, INDIRECT, 20 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 21 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 22 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 23 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 25 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 */ 27 28 #include "archive_platform.h" 29 30 #ifdef HAVE_ERRNO_H 31 #include <errno.h> 32 #endif 33 #include <stdio.h> 34 #ifdef HAVE_STDLIB_H 35 #include <stdlib.h> 36 #endif 37 #ifdef HAVE_STRING_H 38 #include <string.h> 39 #endif 40 #ifdef HAVE_TIME_H 41 #include <time.h> 42 #endif 43 44 #include "archive.h" 45 #include "archive_entry.h" 46 #include "archive_entry_locale.h" 47 #include "archive_private.h" 48 #include "archive_random_private.h" 49 #include "archive_write_private.h" 50 #include "archive_write_set_format_private.h" 51 52 struct warc_s { 53 unsigned int omit_warcinfo:1; 54 55 time_t now; 56 mode_t typ; 57 unsigned int rng; 58 /* populated size */ 59 uint64_t populz; 60 }; 61 62 static const char warcinfo[] = 63 "software: libarchive/" ARCHIVE_VERSION_ONLY_STRING "\r\n" 64 "format: WARC file version 1.0\r\n"; 65 66 typedef enum { 67 WT_NONE, 68 /* warcinfo */ 69 WT_INFO, 70 /* metadata */ 71 WT_META, 72 /* resource */ 73 WT_RSRC, 74 /* request, unsupported */ 75 WT_REQ, 76 /* response, unsupported */ 77 WT_RSP, 78 /* revisit, unsupported */ 79 WT_RVIS, 80 /* conversion, unsupported */ 81 WT_CONV, 82 /* continuation, unsupported at the moment */ 83 WT_CONT, 84 /* invalid type */ 85 LAST_WT 86 } warc_type_t; 87 88 typedef struct { 89 warc_type_t type; 90 const char *tgturi; 91 const char *recid; 92 time_t rtime; 93 time_t mtime; 94 const char *cnttyp; 95 uint64_t cntlen; 96 } warc_essential_hdr_t; 97 98 typedef struct { 99 unsigned int u[4U]; 100 } warc_uuid_t; 101 102 static int _warc_options(struct archive_write*, const char *key, const char *v); 103 static int _warc_header(struct archive_write *a, struct archive_entry *entry); 104 static ssize_t _warc_data(struct archive_write *a, const void *buf, size_t sz); 105 static int _warc_finish_entry(struct archive_write *a); 106 static int _warc_close(struct archive_write *a); 107 static int _warc_free(struct archive_write *a); 108 109 /* private routines */ 110 static ssize_t _popul_ehdr(struct archive_string *t, size_t z, warc_essential_hdr_t); 111 static int _gen_uuid(warc_uuid_t *tgt); 112 113 114 /* 115 * Set output format to ISO 28500 (aka WARC) format. 116 */ 117 int 118 archive_write_set_format_warc(struct archive *_a) 119 { 120 struct archive_write *a = (struct archive_write *)_a; 121 struct warc_s *w; 122 123 archive_check_magic(_a, ARCHIVE_WRITE_MAGIC, 124 ARCHIVE_STATE_NEW, "archive_write_set_format_warc"); 125 126 /* If another format was already registered, unregister it. */ 127 if (a->format_free != NULL) { 128 (a->format_free)(a); 129 } 130 131 w = malloc(sizeof(*w)); 132 if (w == NULL) { 133 archive_set_error(&a->archive, ENOMEM, 134 "Can't allocate warc data"); 135 return (ARCHIVE_FATAL); 136 } 137 /* by default we're emitting a file wide header */ 138 w->omit_warcinfo = 0U; 139 /* obtain current time for date fields */ 140 w->now = time(NULL); 141 /* reset file type info */ 142 w->typ = 0; 143 /* also initialise our rng */ 144 w->rng = (unsigned int)w->now; 145 146 a->format_data = w; 147 a->format_name = "WARC/1.0"; 148 a->format_options = _warc_options; 149 a->format_write_header = _warc_header; 150 a->format_write_data = _warc_data; 151 a->format_close = _warc_close; 152 a->format_free = _warc_free; 153 a->format_finish_entry = _warc_finish_entry; 154 a->archive.archive_format = ARCHIVE_FORMAT_WARC; 155 a->archive.archive_format_name = "WARC/1.0"; 156 return (ARCHIVE_OK); 157 } 158 159 160 /* archive methods */ 161 static int 162 _warc_options(struct archive_write *a, const char *key, const char *val) 163 { 164 struct warc_s *w = a->format_data; 165 166 if (strcmp(key, "omit-warcinfo") == 0) { 167 if (val == NULL || strcmp(val, "true") == 0) { 168 /* great */ 169 w->omit_warcinfo = 1U; 170 return (ARCHIVE_OK); 171 } 172 } 173 174 /* Note: The "warn" return is just to inform the options 175 * supervisor that we didn't handle it. It will generate 176 * a suitable error if no one used this option. */ 177 return (ARCHIVE_WARN); 178 } 179 180 static int 181 _warc_header(struct archive_write *a, struct archive_entry *entry) 182 { 183 struct warc_s *w = a->format_data; 184 struct archive_string hdr; 185 #define MAX_HDR_SIZE 512 186 187 /* check whether warcinfo record needs outputting */ 188 if (!w->omit_warcinfo) { 189 ssize_t r; 190 warc_essential_hdr_t wi = { 191 WT_INFO, 192 /*uri*/NULL, 193 /*urn*/NULL, 194 /*rtm*/0, 195 /*mtm*/0, 196 /*cty*/"application/warc-fields", 197 /*len*/sizeof(warcinfo) - 1U, 198 }; 199 wi.rtime = w->now; 200 wi.mtime = w->now; 201 202 archive_string_init(&hdr); 203 r = _popul_ehdr(&hdr, MAX_HDR_SIZE, wi); 204 if (r >= 0) { 205 /* jackpot! */ 206 /* now also use HDR buffer for the actual warcinfo */ 207 archive_strncat(&hdr, warcinfo, sizeof(warcinfo) -1); 208 209 /* append end-of-record indicator */ 210 archive_strncat(&hdr, "\r\n\r\n", 4); 211 212 /* write to output stream */ 213 __archive_write_output(a, hdr.s, archive_strlen(&hdr)); 214 } 215 /* indicate we're done with file header writing */ 216 w->omit_warcinfo = 1U; 217 archive_string_free(&hdr); 218 } 219 220 if (archive_entry_pathname(entry) == NULL) { 221 archive_set_error(&a->archive, EINVAL, 222 "Invalid filename"); 223 return (ARCHIVE_WARN); 224 } 225 226 w->typ = archive_entry_filetype(entry); 227 w->populz = 0U; 228 if (w->typ == AE_IFREG) { 229 warc_essential_hdr_t rh = { 230 WT_RSRC, 231 /*uri*/NULL, 232 /*urn*/NULL, 233 /*rtm*/0, 234 /*mtm*/0, 235 /*cty*/NULL, 236 /*len*/0, 237 }; 238 ssize_t r; 239 rh.tgturi = archive_entry_pathname(entry); 240 rh.rtime = w->now; 241 rh.mtime = archive_entry_mtime(entry); 242 rh.cntlen = (size_t)archive_entry_size(entry); 243 244 archive_string_init(&hdr); 245 r = _popul_ehdr(&hdr, MAX_HDR_SIZE, rh); 246 if (r < 0) { 247 /* don't bother */ 248 archive_set_error( 249 &a->archive, 250 ARCHIVE_ERRNO_FILE_FORMAT, 251 "cannot archive file"); 252 return (ARCHIVE_WARN); 253 } 254 /* otherwise append to output stream */ 255 __archive_write_output(a, hdr.s, r); 256 /* and let subsequent calls to _data() know about the size */ 257 w->populz = rh.cntlen; 258 archive_string_free(&hdr); 259 return (ARCHIVE_OK); 260 } 261 /* just resort to erroring as per Tim's advice */ 262 __archive_write_entry_filetype_unsupported( 263 &a->archive, entry, "WARC"); 264 return (ARCHIVE_FAILED); 265 } 266 267 static ssize_t 268 _warc_data(struct archive_write *a, const void *buf, size_t len) 269 { 270 struct warc_s *w = a->format_data; 271 272 if (w->typ == AE_IFREG) { 273 int rc; 274 275 /* never write more bytes than announced */ 276 if (len > w->populz) { 277 len = (size_t)w->populz; 278 } 279 280 /* now then, out we put the whole shebang */ 281 rc = __archive_write_output(a, buf, len); 282 if (rc != ARCHIVE_OK) { 283 return rc; 284 } 285 } 286 return len; 287 } 288 289 static int 290 _warc_finish_entry(struct archive_write *a) 291 { 292 static const char _eor[] = "\r\n\r\n"; 293 struct warc_s *w = a->format_data; 294 295 if (w->typ == AE_IFREG) { 296 int rc = __archive_write_output(a, _eor, sizeof(_eor) - 1U); 297 298 if (rc != ARCHIVE_OK) { 299 return rc; 300 } 301 } 302 /* reset type info */ 303 w->typ = 0; 304 return (ARCHIVE_OK); 305 } 306 307 static int 308 _warc_close(struct archive_write *a) 309 { 310 (void)a; /* UNUSED */ 311 return (ARCHIVE_OK); 312 } 313 314 static int 315 _warc_free(struct archive_write *a) 316 { 317 struct warc_s *w = a->format_data; 318 319 free(w); 320 a->format_data = NULL; 321 return (ARCHIVE_OK); 322 } 323 324 325 /* private routines */ 326 static void 327 xstrftime(struct archive_string *as, const char *fmt, time_t t) 328 { 329 /** like strftime(3) but for time_t objects */ 330 struct tm *rt; 331 #if defined(HAVE_GMTIME_R) || defined(HAVE_GMTIME_S) 332 struct tm timeHere; 333 #endif 334 char strtime[100]; 335 size_t len; 336 337 #if defined(HAVE_GMTIME_S) 338 rt = gmtime_s(&timeHere, &t) ? NULL : &timeHere; 339 #elif defined(HAVE_GMTIME_R) 340 rt = gmtime_r(&t, &timeHere); 341 #else 342 rt = gmtime(&t); 343 #endif 344 if (!rt) 345 return; 346 /* leave the hard yacker to our role model strftime() */ 347 len = strftime(strtime, sizeof(strtime)-1, fmt, rt); 348 archive_strncat(as, strtime, len); 349 } 350 351 static ssize_t 352 _popul_ehdr(struct archive_string *tgt, size_t tsz, warc_essential_hdr_t hdr) 353 { 354 static const char _ver[] = "WARC/1.0\r\n"; 355 static const char * const _typ[LAST_WT] = { 356 NULL, "warcinfo", "metadata", "resource", NULL 357 }; 358 char std_uuid[48U]; 359 360 if (hdr.type == WT_NONE || hdr.type > WT_RSRC) { 361 /* brilliant, how exactly did we get here? */ 362 return -1; 363 } 364 365 archive_strcpy(tgt, _ver); 366 367 archive_string_sprintf(tgt, "WARC-Type: %s\r\n", _typ[hdr.type]); 368 369 if (hdr.tgturi != NULL) { 370 /* check if there's a xyz:// */ 371 static const char _uri[] = ""; 372 static const char _fil[] = "file://"; 373 const char *u; 374 char *chk = strchr(hdr.tgturi, ':'); 375 376 if (chk != NULL && chk[1U] == '/' && chk[2U] == '/') { 377 /* yep, it's definitely a URI */ 378 u = _uri; 379 } else { 380 /* hm, best to prepend file:// then */ 381 u = _fil; 382 } 383 archive_string_sprintf(tgt, 384 "WARC-Target-URI: %s%s\r\n", u, hdr.tgturi); 385 } 386 387 /* record time is usually when the http is sent off, 388 * just treat the archive writing as such for a moment */ 389 xstrftime(tgt, "WARC-Date: %Y-%m-%dT%H:%M:%SZ\r\n", hdr.rtime); 390 391 /* while we're at it, record the mtime */ 392 xstrftime(tgt, "Last-Modified: %Y-%m-%dT%H:%M:%SZ\r\n", hdr.mtime); 393 394 if (hdr.recid == NULL) { 395 /* generate one, grrrr */ 396 warc_uuid_t u; 397 398 _gen_uuid(&u); 399 /* Unfortunately, archive_string_sprintf does not 400 * handle the minimum number following '%'. 401 * So we have to use snprintf function here instead 402 * of archive_string_snprintf function. */ 403 #if defined(_WIN32) && !defined(__CYGWIN__) && !( defined(_MSC_VER) && _MSC_VER >= 1900) 404 #define snprintf _snprintf 405 #endif 406 snprintf( 407 std_uuid, sizeof(std_uuid), 408 "<urn:uuid:%08x-%04x-%04x-%04x-%04x%08x>", 409 u.u[0U], 410 u.u[1U] >> 16U, u.u[1U] & 0xffffU, 411 u.u[2U] >> 16U, u.u[2U] & 0xffffU, 412 u.u[3U]); 413 hdr.recid = std_uuid; 414 } 415 416 /* record-id is mandatory, fingers crossed we won't fail */ 417 archive_string_sprintf(tgt, "WARC-Record-ID: %s\r\n", hdr.recid); 418 419 if (hdr.cnttyp != NULL) { 420 archive_string_sprintf(tgt, "Content-Type: %s\r\n", hdr.cnttyp); 421 } 422 423 /* next one is mandatory */ 424 archive_string_sprintf(tgt, "Content-Length: %ju\r\n", (uintmax_t)hdr.cntlen); 425 /**/ 426 archive_strncat(tgt, "\r\n", 2); 427 428 return (archive_strlen(tgt) >= tsz)? -1: (ssize_t)archive_strlen(tgt); 429 } 430 431 static int 432 _gen_uuid(warc_uuid_t *tgt) 433 { 434 archive_random(tgt->u, sizeof(tgt->u)); 435 /* obey uuid version 4 rules */ 436 tgt->u[1U] &= 0xffff0fffU; 437 tgt->u[1U] |= 0x4000U; 438 tgt->u[2U] &= 0x3fffffffU; 439 tgt->u[2U] |= 0x80000000U; 440 return 0; 441 } 442 443 /* archive_write_set_format_warc.c ends here */ 444