xref: /freebsd/usr.bin/mkuzip/mkuzip.c (revision c7d813a93eeb447470734c9bc0c140d90a54c271)
1 /*
2  * Copyright (c) 2004-2016 Maxim Sobolev <sobomax@FreeBSD.org>
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  */
26 
27 #include <sys/cdefs.h>
28 __FBSDID("$FreeBSD$");
29 
30 #include <sys/types.h>
31 #include <sys/disk.h>
32 #include <sys/endian.h>
33 #include <sys/param.h>
34 #include <sys/sysctl.h>
35 #include <sys/stat.h>
36 #include <sys/uio.h>
37 #include <netinet/in.h>
38 #include <assert.h>
39 #include <ctype.h>
40 #include <err.h>
41 #include <fcntl.h>
42 #include <pthread.h>
43 #include <signal.h>
44 #include <stdint.h>
45 #include <stdio.h>
46 #include <stdlib.h>
47 #include <string.h>
48 #include <unistd.h>
49 
50 #include "mkuzip.h"
51 #include "mkuz_cloop.h"
52 #include "mkuz_blockcache.h"
53 #include "mkuz_zlib.h"
54 #include "mkuz_lzma.h"
55 #include "mkuz_blk.h"
56 #include "mkuz_cfg.h"
57 #include "mkuz_conveyor.h"
58 #include "mkuz_format.h"
59 #include "mkuz_fqueue.h"
60 #include "mkuz_time.h"
61 
62 #define DEFAULT_CLSTSIZE	16384
63 
64 static struct mkuz_format uzip_fmt = {
65 	.magic = CLOOP_MAGIC_ZLIB,
66 	.default_sufx = DEFAULT_SUFX_ZLIB,
67 	.f_init = &mkuz_zlib_init,
68 	.f_compress = &mkuz_zlib_compress
69 };
70 
71 static struct mkuz_format ulzma_fmt = {
72         .magic = CLOOP_MAGIC_LZMA,
73         .default_sufx = DEFAULT_SUFX_LZMA,
74         .f_init = &mkuz_lzma_init,
75         .f_compress = &mkuz_lzma_compress
76 };
77 
/* Prototypes for the file-local helpers that are defined after main(). */
static void cleanup(void);
static void usage(void);
static struct mkuz_blk *readblock(int, u_int32_t);

/*
 * Name of the output file that cleanup() unlinks at exit.  It stays NULL
 * while there is nothing to remove; main() sets it once the output file has
 * been opened and clears it again after the header has been written.
 */
static char *cleanfile = NULL;
83 
84 static int
85 cmp_blkno(const struct mkuz_blk *bp, void *p)
86 {
87 	uint32_t *ap;
88 
89 	ap = (uint32_t *)p;
90 
91 	return (bp->info.blkno == *ap);
92 }
93 
94 int main(int argc, char **argv)
95 {
96 	struct mkuz_cfg cfs;
97 	char *iname, *oname;
98 	uint64_t *toc;
99 	int i, io, opt, tmp;
100 	struct {
101 		int en;
102 		FILE *f;
103 	} summary;
104 	struct iovec iov[2];
105 	struct stat sb;
106 	uint64_t offset, last_offset;
107 	struct cloop_header hdr;
108 	struct mkuz_conveyor *cvp;
109         void *c_ctx;
110 	struct mkuz_blk_info *chit;
111 	size_t ncpusz, ncpu, magiclen;
112 	double st, et;
113 
114 	st = getdtime();
115 
116 	ncpusz = sizeof(size_t);
117 	if (sysctlbyname("hw.ncpu", &ncpu, &ncpusz, NULL, 0) < 0) {
118 		ncpu = 1;
119 	} else if (ncpu > MAX_WORKERS_AUTO) {
120 		ncpu = MAX_WORKERS_AUTO;
121 	}
122 
123 	memset(&hdr, 0, sizeof(hdr));
124 	cfs.blksz = DEFAULT_CLSTSIZE;
125 	oname = NULL;
126 	cfs.verbose = 0;
127 	cfs.no_zcomp = 0;
128 	cfs.en_dedup = 0;
129 	summary.en = 0;
130 	summary.f = stderr;
131 	cfs.handler = &uzip_fmt;
132 	cfs.nworkers = ncpu;
133 	struct mkuz_blk *iblk, *oblk;
134 
135 	while((opt = getopt(argc, argv, "o:s:vZdLSj:")) != -1) {
136 		switch(opt) {
137 		case 'o':
138 			oname = optarg;
139 			break;
140 
141 		case 's':
142 			tmp = atoi(optarg);
143 			if (tmp <= 0) {
144 				errx(1, "invalid cluster size specified: %s",
145 				    optarg);
146 				/* Not reached */
147 			}
148 			cfs.blksz = tmp;
149 			break;
150 
151 		case 'v':
152 			cfs.verbose = 1;
153 			break;
154 
155 		case 'Z':
156 			cfs.no_zcomp = 1;
157 			break;
158 
159 		case 'd':
160 			cfs.en_dedup = 1;
161 			break;
162 
163 		case 'L':
164 			cfs.handler = &ulzma_fmt;
165 			break;
166 
167 		case 'S':
168 			summary.en = 1;
169 			summary.f = stdout;
170 			break;
171 
172 		case 'j':
173 			tmp = atoi(optarg);
174 			if (tmp <= 0) {
175 				errx(1, "invalid number of compression threads"
176                                     " specified: %s", optarg);
177 				/* Not reached */
178 			}
179 			cfs.nworkers = tmp;
180 			break;
181 
182 		default:
183 			usage();
184 			/* Not reached */
185 		}
186 	}
187 	argc -= optind;
188 	argv += optind;
189 
190 	if (argc != 1) {
191 		usage();
192 		/* Not reached */
193 	}
194 
195 	magiclen = strlcpy(hdr.magic, cfs.handler->magic, sizeof(hdr.magic));
196 	assert(magiclen < sizeof(hdr.magic));
197 
198 	if (cfs.en_dedup != 0) {
199 		hdr.magic[CLOOP_OFS_VERSN] = CLOOP_MAJVER_3;
200 		hdr.magic[CLOOP_OFS_COMPR] =
201 		    tolower(hdr.magic[CLOOP_OFS_COMPR]);
202 	}
203 
204 	c_ctx = cfs.handler->f_init(cfs.blksz);
205 
206 	iname = argv[0];
207 	if (oname == NULL) {
208 		asprintf(&oname, "%s%s", iname, cfs.handler->default_sufx);
209 		if (oname == NULL) {
210 			err(1, "can't allocate memory");
211 			/* Not reached */
212 		}
213 	}
214 
215 	signal(SIGHUP, exit);
216 	signal(SIGINT, exit);
217 	signal(SIGTERM, exit);
218 	signal(SIGXCPU, exit);
219 	signal(SIGXFSZ, exit);
220 	atexit(cleanup);
221 
222 	cfs.fdr = open(iname, O_RDONLY);
223 	if (cfs.fdr < 0) {
224 		err(1, "open(%s)", iname);
225 		/* Not reached */
226 	}
227 	if (fstat(cfs.fdr, &sb) != 0) {
228 		err(1, "fstat(%s)", iname);
229 		/* Not reached */
230 	}
231 	if (S_ISCHR(sb.st_mode)) {
232 		off_t ms;
233 
234 		if (ioctl(cfs.fdr, DIOCGMEDIASIZE, &ms) < 0) {
235 			err(1, "ioctl(DIOCGMEDIASIZE)");
236 			/* Not reached */
237 		}
238 		sb.st_size = ms;
239 	} else if (!S_ISREG(sb.st_mode)) {
240 		fprintf(stderr, "%s: not a character device or regular file\n",
241 			iname);
242 		exit(1);
243 	}
244 	hdr.nblocks = sb.st_size / cfs.blksz;
245 	if ((sb.st_size % cfs.blksz) != 0) {
246 		if (cfs.verbose != 0)
247 			fprintf(stderr, "file size is not multiple "
248 			"of %d, padding data\n", cfs.blksz);
249 		hdr.nblocks++;
250 	}
251 	toc = mkuz_safe_malloc((hdr.nblocks + 1) * sizeof(*toc));
252 
253 	cfs.fdw = open(oname, (cfs.en_dedup ? O_RDWR : O_WRONLY) | O_TRUNC | O_CREAT,
254 		   S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH);
255 	if (cfs.fdw < 0) {
256 		err(1, "open(%s)", oname);
257 		/* Not reached */
258 	}
259 	cleanfile = oname;
260 
261 	/* Prepare header that we will write later when we have index ready. */
262 	iov[0].iov_base = (char *)&hdr;
263 	iov[0].iov_len = sizeof(hdr);
264 	iov[1].iov_base = (char *)toc;
265 	iov[1].iov_len = (hdr.nblocks + 1) * sizeof(*toc);
266 	offset = iov[0].iov_len + iov[1].iov_len;
267 
268 	/* Reserve space for header */
269 	lseek(cfs.fdw, offset, SEEK_SET);
270 
271 	if (cfs.verbose != 0) {
272 		fprintf(stderr, "data size %ju bytes, number of clusters "
273 		    "%u, index length %zu bytes\n", sb.st_size,
274 		    hdr.nblocks, iov[1].iov_len);
275 	}
276 
277 	cvp = mkuz_conveyor_ctor(&cfs);
278 
279 	last_offset = 0;
280         iblk = oblk = NULL;
281 	for(i = io = 0; iblk != MKUZ_BLK_EOF; i++) {
282 		iblk = readblock(cfs.fdr, cfs.blksz);
283 		mkuz_fqueue_enq(cvp->wrk_queue, iblk);
284 		if (iblk != MKUZ_BLK_EOF &&
285 		    (i < (cfs.nworkers * ITEMS_PER_WORKER))) {
286 			continue;
287 		}
288 drain:
289 		oblk = mkuz_fqueue_deq_when(cvp->results, cmp_blkno, &io);
290 		assert(oblk->info.blkno == (unsigned)io);
291 		oblk->info.offset = offset;
292 		chit = NULL;
293 		if (cfs.en_dedup != 0 && oblk->info.len > 0) {
294 			chit = mkuz_blkcache_regblock(cfs.fdw, oblk);
295 			/*
296 			 * There should be at least one non-empty block
297 			 * between us and the backref'ed offset, otherwise
298 			 * we won't be able to parse that sequence correctly
299 			 * as it would be indistinguishible from another
300 			 * empty block.
301 			 */
302 			if (chit != NULL && chit->offset == last_offset) {
303 				chit = NULL;
304 			}
305 		}
306 		if (chit != NULL) {
307 			toc[io] = htobe64(chit->offset);
308 			oblk->info.len = 0;
309 		} else {
310 			if (oblk->info.len > 0 && write(cfs.fdw, oblk->data,
311 			    oblk->info.len) < 0) {
312 				err(1, "write(%s)", oname);
313 				/* Not reached */
314 			}
315 			toc[io] = htobe64(offset);
316 			last_offset = offset;
317 			offset += oblk->info.len;
318 		}
319 		if (cfs.verbose != 0) {
320 			fprintf(stderr, "cluster #%d, in %u bytes, "
321 			    "out len=%lu offset=%lu", io, cfs.blksz,
322 			    (u_long)oblk->info.len, (u_long)be64toh(toc[io]));
323 			if (chit != NULL) {
324 				fprintf(stderr, " (backref'ed to #%d)",
325 				    chit->blkno);
326 			}
327 			fprintf(stderr, "\n");
328 		}
329 		free(oblk);
330 		io += 1;
331 		if (iblk == MKUZ_BLK_EOF) {
332 			if (io < i)
333 				goto drain;
334 			/* Last block, see if we need to add some padding */
335 			if ((offset % DEV_BSIZE) == 0)
336 				continue;
337 			oblk = mkuz_blk_ctor(DEV_BSIZE - (offset % DEV_BSIZE));
338 			oblk->info.blkno = io;
339 			oblk->info.len = oblk->alen;
340 			if (cfs.verbose != 0) {
341 				fprintf(stderr, "padding data with %lu bytes "
342 				    "so that file size is multiple of %d\n",
343 				    (u_long)oblk->alen, DEV_BSIZE);
344 			}
345 			mkuz_fqueue_enq(cvp->results, oblk);
346 			goto drain;
347 		}
348 	}
349 
350 	close(cfs.fdr);
351 
352 	if (cfs.verbose != 0 || summary.en != 0) {
353 		et = getdtime();
354 		fprintf(summary.f, "compressed data to %ju bytes, saved %lld "
355 		    "bytes, %.2f%% decrease, %.2f bytes/sec.\n", offset,
356 		    (long long)(sb.st_size - offset),
357 		    100.0 * (long long)(sb.st_size - offset) /
358 		    (float)sb.st_size, (float)sb.st_size / (et - st));
359 	}
360 
361 	/* Convert to big endian */
362 	hdr.blksz = htonl(cfs.blksz);
363 	hdr.nblocks = htonl(hdr.nblocks);
364 	/* Write headers into pre-allocated space */
365 	lseek(cfs.fdw, 0, SEEK_SET);
366 	if (writev(cfs.fdw, iov, 2) < 0) {
367 		err(1, "writev(%s)", oname);
368 		/* Not reached */
369 	}
370 	cleanfile = NULL;
371 	close(cfs.fdw);
372 
373 	exit(0);
374 }
375 
376 static struct mkuz_blk *
377 readblock(int fd, u_int32_t clstsize)
378 {
379 	int numread;
380 	struct mkuz_blk *rval;
381 	static int blockcnt;
382 	off_t cpos;
383 
384 	rval = mkuz_blk_ctor(clstsize);
385 
386 	rval->info.blkno = blockcnt;
387 	blockcnt += 1;
388 	cpos = lseek(fd, 0, SEEK_CUR);
389 	if (cpos < 0) {
390 		err(1, "readblock: lseek() failed");
391 		/* Not reached */
392 	}
393 	rval->info.offset = cpos;
394 
395 	numread = read(fd, rval->data, clstsize);
396 	if (numread < 0) {
397 		err(1, "readblock: read() failed");
398 		/* Not reached */
399 	}
400 	if (numread == 0) {
401 		free(rval);
402 		return MKUZ_BLK_EOF;
403 	}
404 	rval->info.len = numread;
405 	return rval;
406 }
407 
/*
 * Print the command-line synopsis to stderr and terminate with exit
 * status 1.  Does not return.
 */
static void
usage(void)
{
	static const char synopsis[] =
	    "usage: mkuzip [-vZdLS] [-o outfile] [-s cluster_size] "
	    "[-j ncompr] infile\n";

	fputs(synopsis, stderr);
	exit(1);
}
416 
/*
 * malloc() wrapper that never returns NULL.
 *
 * size - number of bytes to allocate.
 *
 * Returns the pointer obtained from malloc().  If malloc() returns NULL the
 * program is terminated through err(1, ...).  The memory is released with
 * free().
 */
void *
mkuz_safe_malloc(size_t size)
{
	void *mem;

	if ((mem = malloc(size)) == NULL) {
		err(1, "can't allocate memory");
		/* Not reached */
	}
	return (mem);
}
429 
/*
 * Allocate size bytes with mkuz_safe_malloc() and clear them.
 *
 * size - number of bytes to allocate and zero.
 *
 * Returns a pointer to the zero-filled memory.
 */
void *
mkuz_safe_zmalloc(size_t size)
{
	void *mem = mkuz_safe_malloc(size);

	/* memset() hands back its first argument, i.e. the new buffer. */
	return (memset(mem, 0, size));
}
439 
440 static void
441 cleanup(void)
442 {
443 
444 	if (cleanfile != NULL)
445 		unlink(cleanfile);
446 }
447 
/*
 * Check whether a memory region consists solely of one byte value.
 *
 * memory - start of the region to examine.
 * val    - byte value every position is compared against.
 * size   - length of the region in bytes.
 *
 * Returns 1 if all size bytes equal val and 0 otherwise.  An empty region
 * (size == 0) trivially satisfies the condition and yields 1; nothing is
 * read from memory in that case.
 */
int
mkuz_memvcmp(const void *memory, unsigned char val, size_t size)
{
    const unsigned char *mm;

    /*
     * Guard the empty region: without it the first byte would be read and
     * "size - 1" would wrap around to SIZE_MAX in the memcmp() below.
     */
    if (size == 0)
        return (1);

    mm = (const unsigned char *)memory;
    /*
     * With the first byte equal to val, comparing the region against itself
     * shifted by one byte proves that every following byte equals its
     * predecessor, and therefore val.
     */
    return (*mm == val) && memcmp(mm, mm + 1, size - 1) == 0;
}
456