xref: /freebsd/usr.bin/mkuzip/mkuzip.c (revision 5e3190f700637fcfc1a52daeaa4a031fdd2557c7)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2004-2016 Maxim Sobolev <sobomax@FreeBSD.org>
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  */
28 
29 #include <sys/cdefs.h>
30 #include <sys/types.h>
31 #include <sys/endian.h>
32 #include <sys/param.h>
33 #include <sys/sysctl.h>
34 #include <sys/stat.h>
35 #include <sys/uio.h>
36 #include <netinet/in.h>
37 #include <assert.h>
38 #include <ctype.h>
39 #include <err.h>
40 #include <fcntl.h>
41 #include <pthread.h>
42 #include <signal.h>
43 #include <stdint.h>
44 #include <stdio.h>
45 #include <stdlib.h>
46 #include <string.h>
47 #include <unistd.h>
48 
49 #include "mkuzip.h"
50 #include "mkuz_cloop.h"
51 #include "mkuz_blockcache.h"
52 #include "mkuz_lzma.h"
53 #include "mkuz_zlib.h"
54 #include "mkuz_zstd.h"
55 #include "mkuz_blk.h"
56 #include "mkuz_cfg.h"
57 #include "mkuz_conveyor.h"
58 #include "mkuz_format.h"
59 #include "mkuz_fqueue.h"
60 #include "mkuz_time.h"
61 #include "mkuz_insize.h"
62 
63 #define DEFAULT_CLSTSIZE	16384
64 
/*
 * Compression algorithms that can be selected on the command line.  The
 * numeric values are used directly as indices into uzip_fmts[], and
 * UZ_INVALID serves as the end marker when that table is scanned.
 */
enum UZ_ALGORITHM {
	UZ_ZLIB    = 0,
	UZ_LZMA    = 1,
	UZ_ZSTD    = 2,
	UZ_INVALID = 3
};
71 
72 static const struct mkuz_format uzip_fmts[] = {
73 	[UZ_ZLIB] = {
74 		.option = "zlib",
75 		.magic = CLOOP_MAGIC_ZLIB,
76 		.default_sufx = DEFAULT_SUFX_ZLIB,
77 		.f_compress_bound = mkuz_zlib_cbound,
78 		.f_init = mkuz_zlib_init,
79 		.f_compress = mkuz_zlib_compress,
80 	},
81 	[UZ_LZMA] = {
82 		.option = "lzma",
83 		.magic = CLOOP_MAGIC_LZMA,
84 		.default_sufx = DEFAULT_SUFX_LZMA,
85 		.f_compress_bound = mkuz_lzma_cbound,
86 		.f_init = mkuz_lzma_init,
87 		.f_compress = mkuz_lzma_compress,
88 	},
89 	[UZ_ZSTD] = {
90 		.option = "zstd",
91 		.magic = CLOOP_MAGIC_ZSTD,
92 		.default_sufx = DEFAULT_SUFX_ZSTD,
93 		.f_compress_bound = mkuz_zstd_cbound,
94 		.f_init = mkuz_zstd_init,
95 		.f_compress = mkuz_zstd_compress,
96 	},
97 };
98 
99 static struct mkuz_blk *readblock(int, u_int32_t);
100 static void usage(void) __dead2;
101 static void cleanup(void);
102 
103 static char *cleanfile = NULL;
104 
105 static int
106 cmp_blkno(const struct mkuz_blk *bp, void *p)
107 {
108 	uint32_t *ap;
109 
110 	ap = (uint32_t *)p;
111 
112 	return (bp->info.blkno == *ap);
113 }
114 
115 int main(int argc, char **argv)
116 {
117 	struct mkuz_cfg cfs;
118 	char *oname;
119 	uint64_t *toc;
120 	int i, io, opt, tmp;
121 	struct {
122 		int en;
123 		FILE *f;
124 	} summary;
125 	struct iovec iov[2];
126 	uint64_t offset, last_offset;
127 	struct cloop_header hdr;
128 	struct mkuz_conveyor *cvp;
129 	struct mkuz_blk_info *chit;
130 	size_t ncpusz, ncpu, magiclen;
131 	double st, et;
132 	enum UZ_ALGORITHM comp_alg;
133 	int comp_level;
134 
135 	st = getdtime();
136 
137 	ncpusz = sizeof(size_t);
138 	if (sysctlbyname("hw.ncpu", &ncpu, &ncpusz, NULL, 0) < 0) {
139 		ncpu = 1;
140 	} else if (ncpu > MAX_WORKERS_AUTO) {
141 		ncpu = MAX_WORKERS_AUTO;
142 	}
143 
144 	memset(&hdr, 0, sizeof(hdr));
145 	cfs.blksz = DEFAULT_CLSTSIZE;
146 	oname = NULL;
147 	cfs.verbose = 0;
148 	cfs.no_zcomp = 0;
149 	cfs.en_dedup = 0;
150 	summary.en = 0;
151 	summary.f = stderr;
152 	comp_alg = UZ_ZLIB;
153 	comp_level = USE_DEFAULT_LEVEL;
154 	cfs.nworkers = ncpu;
155 	struct mkuz_blk *iblk, *oblk;
156 
157 	while((opt = getopt(argc, argv, "A:C:o:s:vZdLSj:")) != -1) {
158 		switch(opt) {
159 		case 'A':
160 			for (tmp = UZ_ZLIB; tmp < UZ_INVALID; tmp++) {
161 				if (strcmp(uzip_fmts[tmp].option, optarg) == 0)
162 					break;
163 			}
164 			if (tmp == UZ_INVALID)
165 				errx(1, "invalid algorithm specified: %s",
166 				    optarg);
167 				/* Not reached */
168 			comp_alg = tmp;
169 			break;
170 		case 'C':
171 			comp_level = atoi(optarg);
172 			break;
173 		case 'o':
174 			oname = optarg;
175 			break;
176 
177 		case 's':
178 			tmp = atoi(optarg);
179 			if (tmp <= 0) {
180 				errx(1, "invalid cluster size specified: %s",
181 				    optarg);
182 				/* Not reached */
183 			}
184 			cfs.blksz = tmp;
185 			break;
186 
187 		case 'v':
188 			cfs.verbose = 1;
189 			break;
190 
191 		case 'Z':
192 			cfs.no_zcomp = 1;
193 			break;
194 
195 		case 'd':
196 			cfs.en_dedup = 1;
197 			break;
198 
199 		case 'L':
200 			comp_alg = UZ_LZMA;
201 			break;
202 
203 		case 'S':
204 			summary.en = 1;
205 			summary.f = stdout;
206 			break;
207 
208 		case 'j':
209 			tmp = atoi(optarg);
210 			if (tmp <= 0) {
211 				errx(1, "invalid number of compression threads"
212                                     " specified: %s", optarg);
213 				/* Not reached */
214 			}
215 			cfs.nworkers = tmp;
216 			break;
217 
218 		default:
219 			usage();
220 			/* Not reached */
221 		}
222 	}
223 	argc -= optind;
224 	argv += optind;
225 
226 	if (argc != 1) {
227 		usage();
228 		/* Not reached */
229 	}
230 
231 	cfs.handler = &uzip_fmts[comp_alg];
232 
233 	magiclen = strlcpy(hdr.magic, cfs.handler->magic, sizeof(hdr.magic));
234 	assert(magiclen < sizeof(hdr.magic));
235 
236 	if (cfs.en_dedup != 0) {
237 		/*
238 		 * Dedupe requires a version 3 format.  Don't downgrade newer
239 		 * formats.
240 		 */
241 		if (hdr.magic[CLOOP_OFS_VERSN] == CLOOP_MAJVER_2)
242 			hdr.magic[CLOOP_OFS_VERSN] = CLOOP_MAJVER_3;
243 		hdr.magic[CLOOP_OFS_COMPR] =
244 		    tolower(hdr.magic[CLOOP_OFS_COMPR]);
245 	}
246 
247 	if (cfs.blksz % DEV_BSIZE != 0)
248 		errx(1, "cluster size should be multiple of %d", DEV_BSIZE);
249 
250 	cfs.cbound_blksz = cfs.handler->f_compress_bound(cfs.blksz);
251 	if (cfs.cbound_blksz > MAXPHYS)
252 		errx(1, "maximal compressed cluster size %zu greater than MAXPHYS %zu",
253 		    cfs.cbound_blksz, (size_t)MAXPHYS);
254 
255 	cfs.handler->f_init(&comp_level);
256 	cfs.comp_level = comp_level;
257 
258 	cfs.iname = argv[0];
259 	if (oname == NULL) {
260 		asprintf(&oname, "%s%s", cfs.iname, cfs.handler->default_sufx);
261 		if (oname == NULL) {
262 			err(1, "can't allocate memory");
263 			/* Not reached */
264 		}
265 	}
266 
267 	signal(SIGHUP, exit);
268 	signal(SIGINT, exit);
269 	signal(SIGTERM, exit);
270 	signal(SIGXCPU, exit);
271 	signal(SIGXFSZ, exit);
272 	atexit(cleanup);
273 
274 	cfs.fdr = open(cfs.iname, O_RDONLY);
275 	if (cfs.fdr < 0) {
276 		err(1, "open(%s)", cfs.iname);
277 		/* Not reached */
278 	}
279 	cfs.isize = mkuz_get_insize(&cfs);
280 	if (cfs.isize < 0) {
281 		errx(1, "can't determine input image size");
282 		/* Not reached */
283 	}
284 	hdr.nblocks = cfs.isize / cfs.blksz;
285 	if ((cfs.isize % cfs.blksz) != 0) {
286 		if (cfs.verbose != 0)
287 			fprintf(stderr, "file size is not multiple "
288 			"of %d, padding data\n", cfs.blksz);
289 		hdr.nblocks++;
290 	}
291 	toc = mkuz_safe_malloc((hdr.nblocks + 1) * sizeof(*toc));
292 
293 	/*
294 	 * Initialize last+1 entry with non-heap trash.  If final padding is
295 	 * added later, it may or may not be overwritten with an offset
296 	 * representing the length of the final compressed block.  If not,
297 	 * initialize to a defined value.
298 	 */
299 	toc[hdr.nblocks] = 0;
300 
301 	cfs.fdw = open(oname, (cfs.en_dedup ? O_RDWR : O_WRONLY) | O_TRUNC | O_CREAT,
302 		   S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH);
303 	if (cfs.fdw < 0) {
304 		err(1, "open(%s)", oname);
305 		/* Not reached */
306 	}
307 	cleanfile = oname;
308 
309 	/* Prepare header that we will write later when we have index ready. */
310 	iov[0].iov_base = (char *)&hdr;
311 	iov[0].iov_len = sizeof(hdr);
312 	iov[1].iov_base = (char *)toc;
313 	iov[1].iov_len = (hdr.nblocks + 1) * sizeof(*toc);
314 	offset = iov[0].iov_len + iov[1].iov_len;
315 
316 	/* Reserve space for header */
317 	lseek(cfs.fdw, offset, SEEK_SET);
318 
319 	if (cfs.verbose != 0) {
320 		fprintf(stderr, "data size %ju bytes, number of clusters "
321 		    "%u, index length %zu bytes\n", cfs.isize,
322 		    hdr.nblocks, iov[1].iov_len);
323 	}
324 
325 	cvp = mkuz_conveyor_ctor(&cfs);
326 
327 	last_offset = 0;
328         iblk = oblk = NULL;
329 	for(i = io = 0; iblk != MKUZ_BLK_EOF; i++) {
330 		iblk = readblock(cfs.fdr, cfs.blksz);
331 		mkuz_fqueue_enq(cvp->wrk_queue, iblk);
332 		if (iblk != MKUZ_BLK_EOF &&
333 		    (i < (cfs.nworkers * ITEMS_PER_WORKER))) {
334 			continue;
335 		}
336 drain:
337 		oblk = mkuz_fqueue_deq_when(cvp->results, cmp_blkno, &io);
338 		assert(oblk->info.blkno == (unsigned)io);
339 		oblk->info.offset = offset;
340 		chit = NULL;
341 		if (cfs.en_dedup != 0 && oblk->info.len > 0) {
342 			chit = mkuz_blkcache_regblock(cfs.fdw, oblk);
343 			/*
344 			 * There should be at least one non-empty block
345 			 * between us and the backref'ed offset, otherwise
346 			 * we won't be able to parse that sequence correctly
347 			 * as it would be indistinguishible from another
348 			 * empty block.
349 			 */
350 			if (chit != NULL && chit->offset == last_offset) {
351 				chit = NULL;
352 			}
353 		}
354 		if (chit != NULL) {
355 			toc[io] = htobe64(chit->offset);
356 			oblk->info.len = 0;
357 		} else {
358 			if (oblk->info.len > 0 && write(cfs.fdw, oblk->data,
359 			    oblk->info.len) < 0) {
360 				err(1, "write(%s)", oname);
361 				/* Not reached */
362 			}
363 			toc[io] = htobe64(offset);
364 			last_offset = offset;
365 			offset += oblk->info.len;
366 		}
367 		if (cfs.verbose != 0) {
368 			fprintf(stderr, "cluster #%d, in %u bytes, "
369 			    "out len=%lu offset=%lu", io, cfs.blksz,
370 			    (u_long)oblk->info.len, (u_long)be64toh(toc[io]));
371 			if (chit != NULL) {
372 				fprintf(stderr, " (backref'ed to #%d)",
373 				    chit->blkno);
374 			}
375 			fprintf(stderr, "\n");
376 		}
377 		free(oblk);
378 		io += 1;
379 		if (iblk == MKUZ_BLK_EOF) {
380 			if (io < i)
381 				goto drain;
382 			/* Last block, see if we need to add some padding */
383 			if ((offset % DEV_BSIZE) == 0)
384 				continue;
385 			oblk = mkuz_blk_ctor(DEV_BSIZE - (offset % DEV_BSIZE));
386 			oblk->info.blkno = io;
387 			oblk->info.len = oblk->alen;
388 			if (cfs.verbose != 0) {
389 				fprintf(stderr, "padding data with %lu bytes "
390 				    "so that file size is multiple of %d\n",
391 				    (u_long)oblk->alen, DEV_BSIZE);
392 			}
393 			mkuz_fqueue_enq(cvp->results, oblk);
394 			goto drain;
395 		}
396 	}
397 
398 	close(cfs.fdr);
399 
400 	if (cfs.verbose != 0 || summary.en != 0) {
401 		et = getdtime();
402 		fprintf(summary.f, "compressed data to %ju bytes, saved %lld "
403 		    "bytes, %.2f%% decrease, %.2f bytes/sec.\n", offset,
404 		    (long long)(cfs.isize - offset),
405 		    100.0 * (long long)(cfs.isize - offset) /
406 		    (float)cfs.isize, (float)cfs.isize / (et - st));
407 	}
408 
409 	/* Convert to big endian */
410 	hdr.blksz = htonl(cfs.blksz);
411 	hdr.nblocks = htonl(hdr.nblocks);
412 	/* Write headers into pre-allocated space */
413 	lseek(cfs.fdw, 0, SEEK_SET);
414 	if (writev(cfs.fdw, iov, 2) < 0) {
415 		err(1, "writev(%s)", oname);
416 		/* Not reached */
417 	}
418 	cleanfile = NULL;
419 	close(cfs.fdw);
420 
421 	exit(0);
422 }
423 
424 static struct mkuz_blk *
425 readblock(int fd, u_int32_t clstsize)
426 {
427 	int numread;
428 	struct mkuz_blk *rval;
429 	static int blockcnt;
430 	off_t cpos;
431 
432 	rval = mkuz_blk_ctor(clstsize);
433 
434 	rval->info.blkno = blockcnt;
435 	blockcnt += 1;
436 	cpos = lseek(fd, 0, SEEK_CUR);
437 	if (cpos < 0) {
438 		err(1, "readblock: lseek() failed");
439 		/* Not reached */
440 	}
441 	rval->info.offset = cpos;
442 
443 	numread = read(fd, rval->data, clstsize);
444 	if (numread < 0) {
445 		err(1, "readblock: read() failed");
446 		/* Not reached */
447 	}
448 	if (numread == 0) {
449 		free(rval);
450 		return MKUZ_BLK_EOF;
451 	}
452 	rval->info.len = numread;
453 	return rval;
454 }
455 
/*
 * Print the command synopsis to stderr and exit with status 1.
 *
 * The synopsis lists every option accepted by the getopt() string in
 * main(), including -A (algorithm) and -C (compression level).
 */
static void
usage(void)
{

	fprintf(stderr, "usage: mkuzip [-vZdLS] [-A algorithm] [-C level] "
	    "[-o outfile] [-s cluster_size] [-j ncompr] infile\n");
	exit(1);
}
464 
/*
 * malloc() wrapper that never returns NULL: when malloc() yields NULL the
 * program is terminated through err() with exit status 1.
 */
void *
mkuz_safe_malloc(size_t size)
{
	void *mem = malloc(size);

	if (mem == NULL)
		err(1, "can't allocate memory");
	return (mem);
}
477 
/*
 * Allocate size bytes through mkuz_safe_malloc() and clear them to zero
 * before handing the buffer back.
 */
void *
mkuz_safe_zmalloc(size_t size)
{
	void *mem = mkuz_safe_malloc(size);

	memset(mem, 0, size);
	return (mem);
}
487 
488 static void
489 cleanup(void)
490 {
491 
492 	if (cleanfile != NULL)
493 		unlink(cleanfile);
494 }
495 
/*
 * Test whether every one of the size bytes at memory equals val.
 *
 * Returns 1 when all bytes match and 0 otherwise.  An empty range
 * (size == 0) is treated as matching and is never dereferenced.
 */
int
mkuz_memvcmp(const void *memory, unsigned char val, size_t size)
{
    const unsigned char *mm;

    /* Guard: without this, *mm is read and size - 1 wraps around. */
    if (size == 0)
        return (1);

    mm = (const unsigned char *)memory;
    /*
     * The first byte is compared against val; comparing the buffer with
     * itself shifted by one byte then checks that each remaining byte
     * equals its predecessor.
     */
    return (*mm == val) && memcmp(mm, mm + 1, size - 1) == 0;
}
504