1 /*
2 * CDDL HEADER START
3 *
4 * This file and its contents are supplied under the terms of the
5 * Common Development and Distribution License ("CDDL"), version 1.0.
6 * You may only use this file in accordance with the terms of version
7 * 1.0 of the CDDL.
8 *
9 * A full copy of the text of the CDDL should have accompanied this
10 * source. A copy of the CDDL is also available via the Internet at
11 * http://www.illumos.org/license/CDDL.
12 *
13 * CDDL HEADER END
14 */
15
16 /*
17 * Copyright (c) 2020 by Delphix. All rights reserved.
18 */
19
20 #include <assert.h>
21 #include <cityhash.h>
22 #include <ctype.h>
23 #include <errno.h>
24 #include <fcntl.h>
25 #include <libzfs.h>
26 #include <libzutil.h>
27 #include <stddef.h>
28 #include <stdio.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <umem.h>
32 #include <unistd.h>
33 #include <sys/debug.h>
34 #include <sys/stat.h>
35 #include <sys/zfs_ioctl.h>
36 #include <sys/zio_checksum.h>
37 #include "zfs_fletcher.h"
38 #include "zstream.h"
39
40
/*
 * Sizing limits for the redup hash table (see zfs_redup_stream()):
 * the table is sized from at most this percentage of physical memory ...
 */
#define	MAX_RDT_PHYSMEM_PERCENT		20
/* ... but never from less than this many megabytes. */
#define	SMALLEST_POSSIBLE_MAX_RDT_MB	128
43
/*
 * One hash-chain entry of the redup table.  It records where in the input
 * stream a record identified by (guid, object, offset) starts.
 */
typedef struct redup_entry redup_entry_t;

struct redup_entry {
	redup_entry_t *rde_next;	/* next entry in the same bucket */
	uint64_t rde_guid;		/* lookup key: guid */
	uint64_t rde_object;		/* lookup key: object number */
	uint64_t rde_offset;		/* lookup key: offset in object */
	uint64_t rde_stream_offset;	/* value: offset in input stream */
};
51
52 typedef struct redup_table {
53 redup_entry_t **redup_hash_array;
54 umem_cache_t *ddecache;
55 uint64_t ddt_count;
56 int numhashbits;
57 } redup_table_t;
58
/*
 * Allocate n bytes of zero-filled memory.  Never returns NULL: if the
 * allocation fails, an error is printed to stderr and the process exits
 * with status 1.  The caller releases the memory with free().
 */
void *
safe_calloc(size_t n)
{
	void *rv = calloc(1, n);
	if (rv == NULL) {
		/* n is a size_t, so it must be printed with %zu. */
		(void) fprintf(stderr,
		    "Error: could not allocate %zu bytes of memory\n", n);
		exit(1);
	}
	return (rv);
}
71
/*
 * Safe version of fread(): read one item of "size" bytes from fp into buf.
 *
 * Returns the number of items read, i.e. 1 on success and 0 when the item
 * could not be read completely without a stream error (end-of-file).  On a
 * stream error, a message is printed to stderr and the process exits.
 */
int
sfread(void *buf, size_t size, FILE *fp)
{
	size_t items = fread(buf, size, 1, fp);

	/* Anything other than "nothing read due to an I/O error" is fine. */
	if (items != 0 || !ferror(fp))
		return ((int)items);

	(void) fprintf(stderr, "Error while reading file: %s\n",
	    strerror(errno));
	exit(1);
}
86
/*
 * Safe version of pread(): read exactly "count" bytes from fd, starting at
 * file offset "offset", into buf.
 *
 * pread() is allowed to return fewer bytes than requested, so keep reading
 * until the whole range has been transferred, and retry calls interrupted
 * by a signal.  On an I/O error, or if end-of-file is reached before
 * "count" bytes were read, a message is printed to stderr and the process
 * exits with status 1.
 */
static void
spread(int fd, void *buf, size_t count, off_t offset)
{
	char *dst = buf;
	size_t done = 0;

	while (done < count) {
		ssize_t got = pread(fd, dst + done, count - done,
		    offset + (off_t)done);

		if (got == -1) {
			if (errno == EINTR)
				continue;
			(void) fprintf(stderr,
			    "Error while reading file: %s\n",
			    strerror(errno));
			exit(1);
		}
		if (got == 0) {
			/* End-of-file before the requested range was read. */
			(void) fprintf(stderr,
			    "Error while reading file: short read\n");
			exit(1);
		}
		done += (size_t)got;
	}
}
105
106 static int
dump_record(dmu_replay_record_t * drr,void * payload,int payload_len,zio_cksum_t * zc,int outfd)107 dump_record(dmu_replay_record_t *drr, void *payload, int payload_len,
108 zio_cksum_t *zc, int outfd)
109 {
110 assert(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum)
111 == sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t));
112 fletcher_4_incremental_native(drr,
113 offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), zc);
114 if (drr->drr_type != DRR_BEGIN) {
115 assert(ZIO_CHECKSUM_IS_ZERO(&drr->drr_u.
116 drr_checksum.drr_checksum));
117 drr->drr_u.drr_checksum.drr_checksum = *zc;
118 }
119 fletcher_4_incremental_native(&drr->drr_u.drr_checksum.drr_checksum,
120 sizeof (zio_cksum_t), zc);
121 if (write(outfd, drr, sizeof (*drr)) == -1)
122 return (errno);
123 if (payload_len != 0) {
124 fletcher_4_incremental_native(payload, payload_len, zc);
125 if (write(outfd, payload, payload_len) == -1)
126 return (errno);
127 }
128 return (0);
129 }
130
131 static void
rdt_insert(redup_table_t * rdt,uint64_t guid,uint64_t object,uint64_t offset,uint64_t stream_offset)132 rdt_insert(redup_table_t *rdt,
133 uint64_t guid, uint64_t object, uint64_t offset, uint64_t stream_offset)
134 {
135 uint64_t ch = cityhash3(guid, object, offset);
136 uint64_t hashcode = BF64_GET(ch, 0, rdt->numhashbits);
137 redup_entry_t **rdepp;
138
139 rdepp = &(rdt->redup_hash_array[hashcode]);
140 redup_entry_t *rde = umem_cache_alloc(rdt->ddecache, UMEM_NOFAIL);
141 rde->rde_next = *rdepp;
142 rde->rde_guid = guid;
143 rde->rde_object = object;
144 rde->rde_offset = offset;
145 rde->rde_stream_offset = stream_offset;
146 *rdepp = rde;
147 rdt->ddt_count++;
148 }
149
150 static void
rdt_lookup(redup_table_t * rdt,uint64_t guid,uint64_t object,uint64_t offset,uint64_t * stream_offsetp)151 rdt_lookup(redup_table_t *rdt,
152 uint64_t guid, uint64_t object, uint64_t offset,
153 uint64_t *stream_offsetp)
154 {
155 uint64_t ch = cityhash3(guid, object, offset);
156 uint64_t hashcode = BF64_GET(ch, 0, rdt->numhashbits);
157
158 for (redup_entry_t *rde = rdt->redup_hash_array[hashcode];
159 rde != NULL; rde = rde->rde_next) {
160 if (rde->rde_guid == guid &&
161 rde->rde_object == object &&
162 rde->rde_offset == offset) {
163 *stream_offsetp = rde->rde_stream_offset;
164 return;
165 }
166 }
167 assert(!"could not find expected redup table entry");
168 }
169
170 /*
171 * Convert a dedup stream (generated by "zfs send -D") to a
172 * non-deduplicated stream. The entire infd will be converted, including
173 * any substreams in a stream package (generated by "zfs send -RD"). The
174 * infd must be seekable.
175 */
176 static void
zfs_redup_stream(int infd,int outfd,boolean_t verbose)177 zfs_redup_stream(int infd, int outfd, boolean_t verbose)
178 {
179 int bufsz = SPA_MAXBLOCKSIZE;
180 dmu_replay_record_t thedrr;
181 dmu_replay_record_t *drr = &thedrr;
182 redup_table_t rdt;
183 zio_cksum_t stream_cksum;
184 uint64_t numbuckets;
185 uint64_t num_records = 0;
186 uint64_t num_write_byref_records = 0;
187
188 memset(&thedrr, 0, sizeof (dmu_replay_record_t));
189
190 #ifdef _ILP32
191 uint64_t max_rde_size = SMALLEST_POSSIBLE_MAX_RDT_MB << 20;
192 #else
193 uint64_t physmem = sysconf(_SC_PHYS_PAGES) * sysconf(_SC_PAGESIZE);
194 uint64_t max_rde_size =
195 MAX((physmem * MAX_RDT_PHYSMEM_PERCENT) / 100,
196 SMALLEST_POSSIBLE_MAX_RDT_MB << 20);
197 #endif
198
199 numbuckets = max_rde_size / (sizeof (redup_entry_t));
200
201 /*
202 * numbuckets must be a power of 2. Increase number to
203 * a power of 2 if necessary.
204 */
205 if (!ISP2(numbuckets))
206 numbuckets = 1ULL << highbit64(numbuckets);
207
208 rdt.redup_hash_array =
209 safe_calloc(numbuckets * sizeof (redup_entry_t *));
210 rdt.ddecache = umem_cache_create("rde", sizeof (redup_entry_t), 0,
211 NULL, NULL, NULL, NULL, NULL, 0);
212 rdt.numhashbits = highbit64(numbuckets) - 1;
213 rdt.ddt_count = 0;
214
215 char *buf = safe_calloc(bufsz);
216 FILE *ofp = fdopen(infd, "r");
217 long offset = ftell(ofp);
218 int begin = 0;
219 boolean_t seen = B_FALSE;
220 while (sfread(drr, sizeof (*drr), ofp) != 0) {
221 num_records++;
222
223 /*
224 * We need to regenerate the checksum.
225 */
226 if (drr->drr_type != DRR_BEGIN) {
227 memset(&drr->drr_u.drr_checksum.drr_checksum, 0,
228 sizeof (drr->drr_u.drr_checksum.drr_checksum));
229 }
230
231 uint64_t payload_size = 0;
232 switch (drr->drr_type) {
233 case DRR_BEGIN:
234 {
235 struct drr_begin *drrb = &drr->drr_u.drr_begin;
236 int fflags;
237 ZIO_SET_CHECKSUM(&stream_cksum, 0, 0, 0, 0);
238 VERIFY0(begin++);
239 seen = B_TRUE;
240
241 assert(drrb->drr_magic == DMU_BACKUP_MAGIC);
242
243 /* clear the DEDUP feature flag for this stream */
244 fflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo);
245 fflags &= ~(DMU_BACKUP_FEATURE_DEDUP |
246 DMU_BACKUP_FEATURE_DEDUPPROPS);
247 /* cppcheck-suppress syntaxError */
248 DMU_SET_FEATUREFLAGS(drrb->drr_versioninfo, fflags);
249
250 uint32_t sz = drr->drr_payloadlen;
251
252 VERIFY3U(sz, <=, 1U << 28);
253
254 if (sz != 0) {
255 if (sz > bufsz) {
256 free(buf);
257 buf = safe_calloc(sz);
258 bufsz = sz;
259 }
260 (void) sfread(buf, sz, ofp);
261 }
262 payload_size = sz;
263 break;
264 }
265
266 case DRR_END:
267 {
268 struct drr_end *drre = &drr->drr_u.drr_end;
269 /*
270 * We would prefer to just check --begin == 0, but
271 * replication streams have an end of stream END
272 * record, so we must avoid tripping it.
273 */
274 VERIFY3B(seen, ==, B_TRUE);
275 begin--;
276 /*
277 * Use the recalculated checksum, unless this is
278 * the END record of a stream package, which has
279 * no checksum.
280 */
281 if (!ZIO_CHECKSUM_IS_ZERO(&drre->drr_checksum))
282 drre->drr_checksum = stream_cksum;
283 break;
284 }
285
286 case DRR_OBJECT:
287 {
288 struct drr_object *drro = &drr->drr_u.drr_object;
289 VERIFY3S(begin, ==, 1);
290
291 if (drro->drr_bonuslen > 0) {
292 payload_size = DRR_OBJECT_PAYLOAD_SIZE(drro);
293 (void) sfread(buf, payload_size, ofp);
294 }
295 break;
296 }
297
298 case DRR_SPILL:
299 {
300 struct drr_spill *drrs = &drr->drr_u.drr_spill;
301 VERIFY3S(begin, ==, 1);
302 payload_size = DRR_SPILL_PAYLOAD_SIZE(drrs);
303 (void) sfread(buf, payload_size, ofp);
304 break;
305 }
306
307 case DRR_WRITE_BYREF:
308 {
309 struct drr_write_byref drrwb =
310 drr->drr_u.drr_write_byref;
311 VERIFY3S(begin, ==, 1);
312
313 num_write_byref_records++;
314
315 /*
316 * Look up in hash table by drrwb->drr_refguid,
317 * drr_refobject, drr_refoffset. Replace this
318 * record with the found WRITE record, but with
319 * drr_object,drr_offset,drr_toguid replaced with ours.
320 */
321 uint64_t stream_offset = 0;
322 rdt_lookup(&rdt, drrwb.drr_refguid,
323 drrwb.drr_refobject, drrwb.drr_refoffset,
324 &stream_offset);
325
326 spread(infd, drr, sizeof (*drr), stream_offset);
327
328 assert(drr->drr_type == DRR_WRITE);
329 struct drr_write *drrw = &drr->drr_u.drr_write;
330 assert(drrw->drr_toguid == drrwb.drr_refguid);
331 assert(drrw->drr_object == drrwb.drr_refobject);
332 assert(drrw->drr_offset == drrwb.drr_refoffset);
333
334 payload_size = DRR_WRITE_PAYLOAD_SIZE(drrw);
335 spread(infd, buf, payload_size,
336 stream_offset + sizeof (*drr));
337
338 drrw->drr_toguid = drrwb.drr_toguid;
339 drrw->drr_object = drrwb.drr_object;
340 drrw->drr_offset = drrwb.drr_offset;
341 break;
342 }
343
344 case DRR_WRITE:
345 {
346 struct drr_write *drrw = &drr->drr_u.drr_write;
347 VERIFY3S(begin, ==, 1);
348 payload_size = DRR_WRITE_PAYLOAD_SIZE(drrw);
349 (void) sfread(buf, payload_size, ofp);
350
351 rdt_insert(&rdt, drrw->drr_toguid,
352 drrw->drr_object, drrw->drr_offset, offset);
353 break;
354 }
355
356 case DRR_WRITE_EMBEDDED:
357 {
358 struct drr_write_embedded *drrwe =
359 &drr->drr_u.drr_write_embedded;
360 VERIFY3S(begin, ==, 1);
361 payload_size =
362 P2ROUNDUP((uint64_t)drrwe->drr_psize, 8);
363 (void) sfread(buf, payload_size, ofp);
364 break;
365 }
366
367 case DRR_FREEOBJECTS:
368 case DRR_FREE:
369 case DRR_OBJECT_RANGE:
370 VERIFY3S(begin, ==, 1);
371 break;
372
373 default:
374 (void) fprintf(stderr, "INVALID record type 0x%x\n",
375 drr->drr_type);
376 /* should never happen, so assert */
377 assert(B_FALSE);
378 }
379
380 if (feof(ofp)) {
381 fprintf(stderr, "Error: unexpected end-of-file\n");
382 exit(1);
383 }
384 if (ferror(ofp)) {
385 fprintf(stderr, "Error while reading file: %s\n",
386 strerror(errno));
387 exit(1);
388 }
389
390 /*
391 * We need to recalculate the checksum, and it needs to be
392 * initially zero to do that. BEGIN records don't have
393 * a checksum.
394 */
395 if (drr->drr_type != DRR_BEGIN) {
396 memset(&drr->drr_u.drr_checksum.drr_checksum, 0,
397 sizeof (drr->drr_u.drr_checksum.drr_checksum));
398 }
399 if (dump_record(drr, buf, payload_size,
400 &stream_cksum, outfd) != 0)
401 break;
402 if (drr->drr_type == DRR_END) {
403 /*
404 * Typically the END record is either the last
405 * thing in the stream, or it is followed
406 * by a BEGIN record (which also zeros the checksum).
407 * However, a stream package ends with two END
408 * records. The last END record's checksum starts
409 * from zero.
410 */
411 ZIO_SET_CHECKSUM(&stream_cksum, 0, 0, 0, 0);
412 }
413 offset = ftell(ofp);
414 }
415
416 if (verbose) {
417 char mem_str[16];
418 zfs_nicenum(rdt.ddt_count * sizeof (redup_entry_t),
419 mem_str, sizeof (mem_str));
420 fprintf(stderr, "converted stream with %llu total records, "
421 "including %llu dedup records, using %sB memory.\n",
422 (long long)num_records,
423 (long long)num_write_byref_records,
424 mem_str);
425 }
426
427 umem_cache_destroy(rdt.ddecache);
428 free(rdt.redup_hash_array);
429 free(buf);
430 (void) fclose(ofp);
431 }
432
433 int
zstream_do_redup(int argc,char * argv[])434 zstream_do_redup(int argc, char *argv[])
435 {
436 boolean_t verbose = B_FALSE;
437 int c;
438
439 while ((c = getopt(argc, argv, "v")) != -1) {
440 switch (c) {
441 case 'v':
442 verbose = B_TRUE;
443 break;
444 case '?':
445 (void) fprintf(stderr, "invalid option '%c'\n",
446 optopt);
447 zstream_usage();
448 break;
449 }
450 }
451
452 argc -= optind;
453 argv += optind;
454
455 if (argc != 1)
456 zstream_usage();
457
458 const char *filename = argv[0];
459
460 if (isatty(STDOUT_FILENO)) {
461 (void) fprintf(stderr,
462 "Error: Stream can not be written to a terminal.\n"
463 "You must redirect standard output.\n");
464 return (1);
465 }
466
467 int fd = open(filename, O_RDONLY);
468 if (fd == -1) {
469 (void) fprintf(stderr,
470 "Error while opening file '%s': %s\n",
471 filename, strerror(errno));
472 exit(1);
473 }
474
475 fletcher_4_init();
476 zfs_redup_stream(fd, STDOUT_FILENO, verbose);
477 fletcher_4_fini();
478
479 close(fd);
480
481 return (0);
482 }
483