xref: /freebsd/sys/contrib/openzfs/cmd/zstream/zstream_redup.c (revision 61145dc2b94f12f6a47344fb9aac702321880e43)
1 // SPDX-License-Identifier: CDDL-1.0
2 /*
3  * CDDL HEADER START
4  *
5  * This file and its contents are supplied under the terms of the
6  * Common Development and Distribution License ("CDDL"), version 1.0.
7  * You may only use this file in accordance with the terms of version
8  * 1.0 of the CDDL.
9  *
10  * A full copy of the text of the CDDL should have accompanied this
11  * source.  A copy of the CDDL is also available via the Internet at
12  * http://www.illumos.org/license/CDDL.
13  *
14  * CDDL HEADER END
15  */
16 
17 /*
18  * Copyright (c) 2020 by Delphix. All rights reserved.
19  */
20 
21 #include <assert.h>
22 #include <cityhash.h>
23 #include <ctype.h>
24 #include <errno.h>
25 #include <fcntl.h>
26 #include <libzfs.h>
27 #include <libzutil.h>
28 #include <stddef.h>
29 #include <stdio.h>
30 #include <stdlib.h>
31 #include <string.h>
32 #include <umem.h>
33 #include <unistd.h>
34 #include <sys/debug.h>
35 #include <sys/stat.h>
36 #include <sys/zfs_ioctl.h>
37 #include <sys/zio_checksum.h>
38 #include "zfs_fletcher.h"
39 #include "zstream.h"
40 
41 
/*
 * Sizing limits for the redup table.  zfs_redup_stream() budgets
 * MAX_RDT_PHYSMEM_PERCENT percent of physical memory for it, but never
 * less than SMALLEST_POSSIBLE_MAX_RDT_MB (expressed in units of 2^20 bytes).
 */
#define	MAX_RDT_PHYSMEM_PERCENT		20
#define	SMALLEST_POSSIBLE_MAX_RDT_MB		128
44 
/*
 * One node of a redup table hash chain.  The (guid, object, offset)
 * triple is the lookup key; rde_stream_offset is the value stored for it.
 */
typedef struct redup_entry {
	struct redup_entry	*rde_next;		/* next node in bucket */
	uint64_t		rde_guid;		/* key: guid */
	uint64_t		rde_object;		/* key: object */
	uint64_t		rde_offset;		/* key: offset */
	uint64_t		rde_stream_offset;	/* stored value */
} redup_entry_t;
52 
53 typedef struct redup_table {
54 	redup_entry_t	**redup_hash_array;
55 	umem_cache_t	*ddecache;
56 	uint64_t	ddt_count;
57 	int		numhashbits;
58 } redup_table_t;
59 
/*
 * Allocate n bytes of zero-filled memory with calloc().
 *
 * Never returns NULL: if the allocation fails, an error message is
 * written to stderr and the process exits with status 1.  The caller
 * releases the memory with free().
 */
void *
safe_calloc(size_t n)
{
	void *rv = calloc(1, n);
	if (rv == NULL) {
		/* %zu reports the full size_t value without truncation. */
		fprintf(stderr,
		    "Error: could not allocate %zu bytes of memory\n", n);
		exit(1);
	}
	return (rv);
}
72 
/*
 * fread() wrapper that reads a single item of `size` bytes from fp
 * into buf.  A stream error is fatal: a message is printed to stderr and
 * the process exits with status 1.
 *
 * Returns the fread() item count: 1 when the item was read in full,
 * otherwise 0 (for example at end-of-file).
 */
int
sfread(void *buf, size_t size, FILE *fp)
{
	int nitems = fread(buf, size, 1, fp);

	/* Anything other than "nothing read and the stream is in error". */
	if (nitems != 0 || !ferror(fp))
		return (nitems);

	(void) fprintf(stderr, "Error while reading file: %s\n",
	    strerror(errno));
	exit(1);
}
87 
/*
 * Safe version of pread(), exits on error.
 *
 * Reads exactly `count` bytes from fd, starting at file position `offset`,
 * into buf.  pread() is allowed to return fewer bytes than requested, so
 * the read is repeated until the whole range has been transferred; a call
 * interrupted by a signal (EINTR) is retried.  Any other pread() failure,
 * or reaching end-of-file before `count` bytes were read, prints a
 * message to stderr and exits with status 1.
 */
static void
spread(int fd, void *buf, size_t count, off_t offset)
{
	char *dst = buf;
	size_t done = 0;

	while (done < count) {
		ssize_t n = pread(fd, dst + done, count - done,
		    offset + (off_t)done);

		if (n == -1) {
			if (errno == EINTR)
				continue;
			(void) fprintf(stderr,
			    "Error while reading file: %s\n",
			    strerror(errno));
			exit(1);
		}
		if (n == 0) {
			/* End-of-file before the requested range ended. */
			(void) fprintf(stderr,
			    "Error while reading file: short read\n");
			exit(1);
		}
		done += (size_t)n;
	}
}
106 
107 static int
dump_record(dmu_replay_record_t * drr,void * payload,int payload_len,zio_cksum_t * zc,int outfd)108 dump_record(dmu_replay_record_t *drr, void *payload, int payload_len,
109     zio_cksum_t *zc, int outfd)
110 {
111 	assert(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum)
112 	    == sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t));
113 	fletcher_4_incremental_native(drr,
114 	    offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), zc);
115 	if (drr->drr_type != DRR_BEGIN) {
116 		assert(ZIO_CHECKSUM_IS_ZERO(&drr->drr_u.
117 		    drr_checksum.drr_checksum));
118 		drr->drr_u.drr_checksum.drr_checksum = *zc;
119 	}
120 	fletcher_4_incremental_native(&drr->drr_u.drr_checksum.drr_checksum,
121 	    sizeof (zio_cksum_t), zc);
122 	if (write(outfd, drr, sizeof (*drr)) == -1)
123 		return (errno);
124 	if (payload_len != 0) {
125 		fletcher_4_incremental_native(payload, payload_len, zc);
126 		if (write(outfd, payload, payload_len) == -1)
127 			return (errno);
128 	}
129 	return (0);
130 }
131 
132 static void
rdt_insert(redup_table_t * rdt,uint64_t guid,uint64_t object,uint64_t offset,uint64_t stream_offset)133 rdt_insert(redup_table_t *rdt,
134     uint64_t guid, uint64_t object, uint64_t offset, uint64_t stream_offset)
135 {
136 	uint64_t ch = cityhash3(guid, object, offset);
137 	uint64_t hashcode = BF64_GET(ch, 0, rdt->numhashbits);
138 	redup_entry_t **rdepp;
139 
140 	rdepp = &(rdt->redup_hash_array[hashcode]);
141 	redup_entry_t *rde = umem_cache_alloc(rdt->ddecache, UMEM_NOFAIL);
142 	rde->rde_next = *rdepp;
143 	rde->rde_guid = guid;
144 	rde->rde_object = object;
145 	rde->rde_offset = offset;
146 	rde->rde_stream_offset = stream_offset;
147 	*rdepp = rde;
148 	rdt->ddt_count++;
149 }
150 
151 static void
rdt_lookup(redup_table_t * rdt,uint64_t guid,uint64_t object,uint64_t offset,uint64_t * stream_offsetp)152 rdt_lookup(redup_table_t *rdt,
153     uint64_t guid, uint64_t object, uint64_t offset,
154     uint64_t *stream_offsetp)
155 {
156 	uint64_t ch = cityhash3(guid, object, offset);
157 	uint64_t hashcode = BF64_GET(ch, 0, rdt->numhashbits);
158 
159 	for (redup_entry_t *rde = rdt->redup_hash_array[hashcode];
160 	    rde != NULL; rde = rde->rde_next) {
161 		if (rde->rde_guid == guid &&
162 		    rde->rde_object == object &&
163 		    rde->rde_offset == offset) {
164 			*stream_offsetp = rde->rde_stream_offset;
165 			return;
166 		}
167 	}
168 	assert(!"could not find expected redup table entry");
169 }
170 
171 /*
172  * Convert a dedup stream (generated by "zfs send -D") to a
173  * non-deduplicated stream.  The entire infd will be converted, including
174  * any substreams in a stream package (generated by "zfs send -RD"). The
175  * infd must be seekable.
176  */
177 static void
zfs_redup_stream(int infd,int outfd,boolean_t verbose)178 zfs_redup_stream(int infd, int outfd, boolean_t verbose)
179 {
180 	int bufsz = SPA_MAXBLOCKSIZE;
181 	dmu_replay_record_t thedrr;
182 	dmu_replay_record_t *drr = &thedrr;
183 	redup_table_t rdt;
184 	zio_cksum_t stream_cksum;
185 	uint64_t numbuckets;
186 	uint64_t num_records = 0;
187 	uint64_t num_write_byref_records = 0;
188 
189 	memset(&thedrr, 0, sizeof (dmu_replay_record_t));
190 
191 #ifdef _ILP32
192 	uint64_t max_rde_size = SMALLEST_POSSIBLE_MAX_RDT_MB << 20;
193 #else
194 	uint64_t physmem = sysconf(_SC_PHYS_PAGES) * sysconf(_SC_PAGESIZE);
195 	uint64_t max_rde_size =
196 	    MAX((physmem * MAX_RDT_PHYSMEM_PERCENT) / 100,
197 	    SMALLEST_POSSIBLE_MAX_RDT_MB << 20);
198 #endif
199 
200 	numbuckets = max_rde_size / (sizeof (redup_entry_t));
201 
202 	/*
203 	 * numbuckets must be a power of 2.  Increase number to
204 	 * a power of 2 if necessary.
205 	 */
206 	if (!ISP2(numbuckets))
207 		numbuckets = 1ULL << highbit64(numbuckets);
208 
209 	rdt.redup_hash_array =
210 	    safe_calloc(numbuckets * sizeof (redup_entry_t *));
211 	rdt.ddecache = umem_cache_create("rde", sizeof (redup_entry_t), 0,
212 	    NULL, NULL, NULL, NULL, NULL, 0);
213 	rdt.numhashbits = highbit64(numbuckets) - 1;
214 	rdt.ddt_count = 0;
215 
216 	char *buf = safe_calloc(bufsz);
217 	FILE *ofp = fdopen(infd, "r");
218 	long offset = ftell(ofp);
219 	int begin = 0;
220 	boolean_t seen = B_FALSE;
221 	while (sfread(drr, sizeof (*drr), ofp) != 0) {
222 		num_records++;
223 
224 		/*
225 		 * We need to regenerate the checksum.
226 		 */
227 		if (drr->drr_type != DRR_BEGIN) {
228 			memset(&drr->drr_u.drr_checksum.drr_checksum, 0,
229 			    sizeof (drr->drr_u.drr_checksum.drr_checksum));
230 		}
231 
232 		uint64_t payload_size = 0;
233 		switch (drr->drr_type) {
234 		case DRR_BEGIN:
235 		{
236 			struct drr_begin *drrb = &drr->drr_u.drr_begin;
237 			int fflags;
238 			ZIO_SET_CHECKSUM(&stream_cksum, 0, 0, 0, 0);
239 			VERIFY0(begin++);
240 			seen = B_TRUE;
241 
242 			assert(drrb->drr_magic == DMU_BACKUP_MAGIC);
243 
244 			/* clear the DEDUP feature flag for this stream */
245 			fflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo);
246 			fflags &= ~(DMU_BACKUP_FEATURE_DEDUP |
247 			    DMU_BACKUP_FEATURE_DEDUPPROPS);
248 			/* cppcheck-suppress syntaxError */
249 			DMU_SET_FEATUREFLAGS(drrb->drr_versioninfo, fflags);
250 
251 			uint32_t sz = drr->drr_payloadlen;
252 
253 			VERIFY3U(sz, <=, 1U << 28);
254 
255 			if (sz != 0) {
256 				if (sz > bufsz) {
257 					free(buf);
258 					buf = safe_calloc(sz);
259 					bufsz = sz;
260 				}
261 				(void) sfread(buf, sz, ofp);
262 			}
263 			payload_size = sz;
264 			break;
265 		}
266 
267 		case DRR_END:
268 		{
269 			struct drr_end *drre = &drr->drr_u.drr_end;
270 			/*
271 			 * We would prefer to just check --begin == 0, but
272 			 * replication streams have an end of stream END
273 			 * record, so we must avoid tripping it.
274 			 */
275 			VERIFY3B(seen, ==, B_TRUE);
276 			begin--;
277 			/*
278 			 * Use the recalculated checksum, unless this is
279 			 * the END record of a stream package, which has
280 			 * no checksum.
281 			 */
282 			if (!ZIO_CHECKSUM_IS_ZERO(&drre->drr_checksum))
283 				drre->drr_checksum = stream_cksum;
284 			break;
285 		}
286 
287 		case DRR_OBJECT:
288 		{
289 			struct drr_object *drro = &drr->drr_u.drr_object;
290 			VERIFY3S(begin, ==, 1);
291 
292 			if (drro->drr_bonuslen > 0) {
293 				payload_size = DRR_OBJECT_PAYLOAD_SIZE(drro);
294 				(void) sfread(buf, payload_size, ofp);
295 			}
296 			break;
297 		}
298 
299 		case DRR_SPILL:
300 		{
301 			struct drr_spill *drrs = &drr->drr_u.drr_spill;
302 			VERIFY3S(begin, ==, 1);
303 			payload_size = DRR_SPILL_PAYLOAD_SIZE(drrs);
304 			(void) sfread(buf, payload_size, ofp);
305 			break;
306 		}
307 
308 		case DRR_WRITE_BYREF:
309 		{
310 			struct drr_write_byref drrwb =
311 			    drr->drr_u.drr_write_byref;
312 			VERIFY3S(begin, ==, 1);
313 
314 			num_write_byref_records++;
315 
316 			/*
317 			 * Look up in hash table by drrwb->drr_refguid,
318 			 * drr_refobject, drr_refoffset.  Replace this
319 			 * record with the found WRITE record, but with
320 			 * drr_object,drr_offset,drr_toguid replaced with ours.
321 			 */
322 			uint64_t stream_offset = 0;
323 			rdt_lookup(&rdt, drrwb.drr_refguid,
324 			    drrwb.drr_refobject, drrwb.drr_refoffset,
325 			    &stream_offset);
326 
327 			spread(infd, drr, sizeof (*drr), stream_offset);
328 
329 			assert(drr->drr_type == DRR_WRITE);
330 			struct drr_write *drrw = &drr->drr_u.drr_write;
331 			assert(drrw->drr_toguid == drrwb.drr_refguid);
332 			assert(drrw->drr_object == drrwb.drr_refobject);
333 			assert(drrw->drr_offset == drrwb.drr_refoffset);
334 
335 			payload_size = DRR_WRITE_PAYLOAD_SIZE(drrw);
336 			spread(infd, buf, payload_size,
337 			    stream_offset + sizeof (*drr));
338 
339 			drrw->drr_toguid = drrwb.drr_toguid;
340 			drrw->drr_object = drrwb.drr_object;
341 			drrw->drr_offset = drrwb.drr_offset;
342 			break;
343 		}
344 
345 		case DRR_WRITE:
346 		{
347 			struct drr_write *drrw = &drr->drr_u.drr_write;
348 			VERIFY3S(begin, ==, 1);
349 			payload_size = DRR_WRITE_PAYLOAD_SIZE(drrw);
350 			(void) sfread(buf, payload_size, ofp);
351 
352 			rdt_insert(&rdt, drrw->drr_toguid,
353 			    drrw->drr_object, drrw->drr_offset, offset);
354 			break;
355 		}
356 
357 		case DRR_WRITE_EMBEDDED:
358 		{
359 			struct drr_write_embedded *drrwe =
360 			    &drr->drr_u.drr_write_embedded;
361 			VERIFY3S(begin, ==, 1);
362 			payload_size =
363 			    P2ROUNDUP((uint64_t)drrwe->drr_psize, 8);
364 			(void) sfread(buf, payload_size, ofp);
365 			break;
366 		}
367 
368 		case DRR_FREEOBJECTS:
369 		case DRR_FREE:
370 		case DRR_OBJECT_RANGE:
371 			VERIFY3S(begin, ==, 1);
372 			break;
373 
374 		default:
375 			(void) fprintf(stderr, "INVALID record type 0x%x\n",
376 			    drr->drr_type);
377 			/* should never happen, so assert */
378 			assert(B_FALSE);
379 		}
380 
381 		if (feof(ofp)) {
382 			fprintf(stderr, "Error: unexpected end-of-file\n");
383 			exit(1);
384 		}
385 		if (ferror(ofp)) {
386 			fprintf(stderr, "Error while reading file: %s\n",
387 			    strerror(errno));
388 			exit(1);
389 		}
390 
391 		/*
392 		 * We need to recalculate the checksum, and it needs to be
393 		 * initially zero to do that.  BEGIN records don't have
394 		 * a checksum.
395 		 */
396 		if (drr->drr_type != DRR_BEGIN) {
397 			memset(&drr->drr_u.drr_checksum.drr_checksum, 0,
398 			    sizeof (drr->drr_u.drr_checksum.drr_checksum));
399 		}
400 		if (dump_record(drr, buf, payload_size,
401 		    &stream_cksum, outfd) != 0)
402 			break;
403 		if (drr->drr_type == DRR_END) {
404 			/*
405 			 * Typically the END record is either the last
406 			 * thing in the stream, or it is followed
407 			 * by a BEGIN record (which also zeros the checksum).
408 			 * However, a stream package ends with two END
409 			 * records.  The last END record's checksum starts
410 			 * from zero.
411 			 */
412 			ZIO_SET_CHECKSUM(&stream_cksum, 0, 0, 0, 0);
413 		}
414 		offset = ftell(ofp);
415 	}
416 
417 	if (verbose) {
418 		char mem_str[16];
419 		zfs_nicenum(rdt.ddt_count * sizeof (redup_entry_t),
420 		    mem_str, sizeof (mem_str));
421 		fprintf(stderr, "converted stream with %llu total records, "
422 		    "including %llu dedup records, using %sB memory.\n",
423 		    (long long)num_records,
424 		    (long long)num_write_byref_records,
425 		    mem_str);
426 	}
427 
428 	umem_cache_destroy(rdt.ddecache);
429 	free(rdt.redup_hash_array);
430 	free(buf);
431 	(void) fclose(ofp);
432 }
433 
434 int
zstream_do_redup(int argc,char * argv[])435 zstream_do_redup(int argc, char *argv[])
436 {
437 	boolean_t verbose = B_FALSE;
438 	int c;
439 
440 	while ((c = getopt(argc, argv, "v")) != -1) {
441 		switch (c) {
442 		case 'v':
443 			verbose = B_TRUE;
444 			break;
445 		case '?':
446 			(void) fprintf(stderr, "invalid option '%c'\n",
447 			    optopt);
448 			zstream_usage();
449 			break;
450 		}
451 	}
452 
453 	argc -= optind;
454 	argv += optind;
455 
456 	if (argc != 1)
457 		zstream_usage();
458 
459 	const char *filename = argv[0];
460 
461 	if (isatty(STDOUT_FILENO)) {
462 		(void) fprintf(stderr,
463 		    "Error: Stream can not be written to a terminal.\n"
464 		    "You must redirect standard output.\n");
465 		return (1);
466 	}
467 
468 	int fd = open(filename, O_RDONLY);
469 	if (fd == -1) {
470 		(void) fprintf(stderr,
471 		    "Error while opening file '%s': %s\n",
472 		    filename, strerror(errno));
473 		exit(1);
474 	}
475 
476 	fletcher_4_init();
477 	zfs_redup_stream(fd, STDOUT_FILENO, verbose);
478 	fletcher_4_fini();
479 
480 	close(fd);
481 
482 	return (0);
483 }
484