1 // SPDX-License-Identifier: CDDL-1.0 2 /* 3 * CDDL HEADER START 4 * 5 * This file and its contents are supplied under the terms of the 6 * Common Development and Distribution License ("CDDL"), version 1.0. 7 * You may only use this file in accordance with the terms of version 8 * 1.0 of the CDDL. 9 * 10 * A full copy of the text of the CDDL should have accompanied this 11 * source. A copy of the CDDL is also available via the Internet at 12 * http://www.illumos.org/license/CDDL. 13 * 14 * CDDL HEADER END 15 */ 16 17 /* 18 * Copyright (c) 2020 by Delphix. All rights reserved. 19 */ 20 21 #include <assert.h> 22 #include <cityhash.h> 23 #include <ctype.h> 24 #include <errno.h> 25 #include <fcntl.h> 26 #include <libzfs.h> 27 #include <libzutil.h> 28 #include <stddef.h> 29 #include <stdio.h> 30 #include <stdlib.h> 31 #include <string.h> 32 #include <umem.h> 33 #include <unistd.h> 34 #include <sys/debug.h> 35 #include <sys/stat.h> 36 #include <sys/zfs_ioctl.h> 37 #include <sys/zio_checksum.h> 38 #include "zfs_fletcher.h" 39 #include "zstream.h" 40 41 42 #define MAX_RDT_PHYSMEM_PERCENT 20 43 #define SMALLEST_POSSIBLE_MAX_RDT_MB 128 44 45 typedef struct redup_entry { 46 struct redup_entry *rde_next; 47 uint64_t rde_guid; 48 uint64_t rde_object; 49 uint64_t rde_offset; 50 uint64_t rde_stream_offset; 51 } redup_entry_t; 52 53 typedef struct redup_table { 54 redup_entry_t **redup_hash_array; 55 umem_cache_t *ddecache; 56 uint64_t ddt_count; 57 int numhashbits; 58 } redup_table_t; 59 60 void * 61 safe_calloc(size_t n) 62 { 63 void *rv = calloc(1, n); 64 if (rv == NULL) { 65 fprintf(stderr, 66 "Error: could not allocate %u bytes of memory\n", 67 (int)n); 68 exit(1); 69 } 70 return (rv); 71 } 72 73 /* 74 * Safe version of fread(), exits on error. 75 */ 76 int 77 sfread(void *buf, size_t size, FILE *fp) 78 { 79 int rv = fread(buf, size, 1, fp); 80 if (rv == 0 && ferror(fp)) { 81 (void) fprintf(stderr, "Error while reading file: %s\n", 82 strerror(errno)); 83 exit(1); 84 } 85 return (rv); 86 } 87 88 /* 89 * Safe version of pread(), exits on error. 90 */ 91 static void 92 spread(int fd, void *buf, size_t count, off_t offset) 93 { 94 ssize_t err = pread(fd, buf, count, offset); 95 if (err == -1) { 96 (void) fprintf(stderr, 97 "Error while reading file: %s\n", 98 strerror(errno)); 99 exit(1); 100 } else if (err != count) { 101 (void) fprintf(stderr, 102 "Error while reading file: short read\n"); 103 exit(1); 104 } 105 } 106 107 static int 108 dump_record(dmu_replay_record_t *drr, void *payload, int payload_len, 109 zio_cksum_t *zc, int outfd) 110 { 111 assert(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum) 112 == sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t)); 113 fletcher_4_incremental_native(drr, 114 offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), zc); 115 if (drr->drr_type != DRR_BEGIN) { 116 assert(ZIO_CHECKSUM_IS_ZERO(&drr->drr_u. 117 drr_checksum.drr_checksum)); 118 drr->drr_u.drr_checksum.drr_checksum = *zc; 119 } 120 fletcher_4_incremental_native(&drr->drr_u.drr_checksum.drr_checksum, 121 sizeof (zio_cksum_t), zc); 122 if (write(outfd, drr, sizeof (*drr)) == -1) 123 return (errno); 124 if (payload_len != 0) { 125 fletcher_4_incremental_native(payload, payload_len, zc); 126 if (write(outfd, payload, payload_len) == -1) 127 return (errno); 128 } 129 return (0); 130 } 131 132 static void 133 rdt_insert(redup_table_t *rdt, 134 uint64_t guid, uint64_t object, uint64_t offset, uint64_t stream_offset) 135 { 136 uint64_t ch = cityhash3(guid, object, offset); 137 uint64_t hashcode = BF64_GET(ch, 0, rdt->numhashbits); 138 redup_entry_t **rdepp; 139 140 rdepp = &(rdt->redup_hash_array[hashcode]); 141 redup_entry_t *rde = umem_cache_alloc(rdt->ddecache, UMEM_NOFAIL); 142 rde->rde_next = *rdepp; 143 rde->rde_guid = guid; 144 rde->rde_object = object; 145 rde->rde_offset = offset; 146 rde->rde_stream_offset = stream_offset; 147 *rdepp = rde; 148 rdt->ddt_count++; 149 } 150 151 static void 152 rdt_lookup(redup_table_t *rdt, 153 uint64_t guid, uint64_t object, uint64_t offset, 154 uint64_t *stream_offsetp) 155 { 156 uint64_t ch = cityhash3(guid, object, offset); 157 uint64_t hashcode = BF64_GET(ch, 0, rdt->numhashbits); 158 159 for (redup_entry_t *rde = rdt->redup_hash_array[hashcode]; 160 rde != NULL; rde = rde->rde_next) { 161 if (rde->rde_guid == guid && 162 rde->rde_object == object && 163 rde->rde_offset == offset) { 164 *stream_offsetp = rde->rde_stream_offset; 165 return; 166 } 167 } 168 assert(!"could not find expected redup table entry"); 169 } 170 171 /* 172 * Convert a dedup stream (generated by "zfs send -D") to a 173 * non-deduplicated stream. The entire infd will be converted, including 174 * any substreams in a stream package (generated by "zfs send -RD"). The 175 * infd must be seekable. 176 */ 177 static void 178 zfs_redup_stream(int infd, int outfd, boolean_t verbose) 179 { 180 int bufsz = SPA_MAXBLOCKSIZE; 181 dmu_replay_record_t thedrr; 182 dmu_replay_record_t *drr = &thedrr; 183 redup_table_t rdt; 184 zio_cksum_t stream_cksum; 185 uint64_t numbuckets; 186 uint64_t num_records = 0; 187 uint64_t num_write_byref_records = 0; 188 189 memset(&thedrr, 0, sizeof (dmu_replay_record_t)); 190 191 #ifdef _ILP32 192 uint64_t max_rde_size = SMALLEST_POSSIBLE_MAX_RDT_MB << 20; 193 #else 194 uint64_t physmem = sysconf(_SC_PHYS_PAGES) * sysconf(_SC_PAGESIZE); 195 uint64_t max_rde_size = 196 MAX((physmem * MAX_RDT_PHYSMEM_PERCENT) / 100, 197 SMALLEST_POSSIBLE_MAX_RDT_MB << 20); 198 #endif 199 200 numbuckets = max_rde_size / (sizeof (redup_entry_t)); 201 202 /* 203 * numbuckets must be a power of 2. Increase number to 204 * a power of 2 if necessary. 205 */ 206 if (!ISP2(numbuckets)) 207 numbuckets = 1ULL << highbit64(numbuckets); 208 209 rdt.redup_hash_array = 210 safe_calloc(numbuckets * sizeof (redup_entry_t *)); 211 rdt.ddecache = umem_cache_create("rde", sizeof (redup_entry_t), 0, 212 NULL, NULL, NULL, NULL, NULL, 0); 213 rdt.numhashbits = highbit64(numbuckets) - 1; 214 rdt.ddt_count = 0; 215 216 char *buf = safe_calloc(bufsz); 217 FILE *ofp = fdopen(infd, "r"); 218 long offset = ftell(ofp); 219 int begin = 0; 220 boolean_t seen = B_FALSE; 221 while (sfread(drr, sizeof (*drr), ofp) != 0) { 222 num_records++; 223 224 /* 225 * We need to regenerate the checksum. 226 */ 227 if (drr->drr_type != DRR_BEGIN) { 228 memset(&drr->drr_u.drr_checksum.drr_checksum, 0, 229 sizeof (drr->drr_u.drr_checksum.drr_checksum)); 230 } 231 232 uint64_t payload_size = 0; 233 switch (drr->drr_type) { 234 case DRR_BEGIN: 235 { 236 struct drr_begin *drrb = &drr->drr_u.drr_begin; 237 int fflags; 238 ZIO_SET_CHECKSUM(&stream_cksum, 0, 0, 0, 0); 239 VERIFY0(begin++); 240 seen = B_TRUE; 241 242 assert(drrb->drr_magic == DMU_BACKUP_MAGIC); 243 244 /* clear the DEDUP feature flag for this stream */ 245 fflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo); 246 fflags &= ~(DMU_BACKUP_FEATURE_DEDUP | 247 DMU_BACKUP_FEATURE_DEDUPPROPS); 248 /* cppcheck-suppress syntaxError */ 249 DMU_SET_FEATUREFLAGS(drrb->drr_versioninfo, fflags); 250 251 uint32_t sz = drr->drr_payloadlen; 252 253 VERIFY3U(sz, <=, 1U << 28); 254 255 if (sz != 0) { 256 if (sz > bufsz) { 257 free(buf); 258 buf = safe_calloc(sz); 259 bufsz = sz; 260 } 261 (void) sfread(buf, sz, ofp); 262 } 263 payload_size = sz; 264 break; 265 } 266 267 case DRR_END: 268 { 269 struct drr_end *drre = &drr->drr_u.drr_end; 270 /* 271 * We would prefer to just check --begin == 0, but 272 * replication streams have an end of stream END 273 * record, so we must avoid tripping it. 274 */ 275 VERIFY3B(seen, ==, B_TRUE); 276 begin--; 277 /* 278 * Use the recalculated checksum, unless this is 279 * the END record of a stream package, which has 280 * no checksum. 281 */ 282 if (!ZIO_CHECKSUM_IS_ZERO(&drre->drr_checksum)) 283 drre->drr_checksum = stream_cksum; 284 break; 285 } 286 287 case DRR_OBJECT: 288 { 289 struct drr_object *drro = &drr->drr_u.drr_object; 290 VERIFY3S(begin, ==, 1); 291 292 if (drro->drr_bonuslen > 0) { 293 payload_size = DRR_OBJECT_PAYLOAD_SIZE(drro); 294 (void) sfread(buf, payload_size, ofp); 295 } 296 break; 297 } 298 299 case DRR_SPILL: 300 { 301 struct drr_spill *drrs = &drr->drr_u.drr_spill; 302 VERIFY3S(begin, ==, 1); 303 payload_size = DRR_SPILL_PAYLOAD_SIZE(drrs); 304 (void) sfread(buf, payload_size, ofp); 305 break; 306 } 307 308 case DRR_WRITE_BYREF: 309 { 310 struct drr_write_byref drrwb = 311 drr->drr_u.drr_write_byref; 312 VERIFY3S(begin, ==, 1); 313 314 num_write_byref_records++; 315 316 /* 317 * Look up in hash table by drrwb->drr_refguid, 318 * drr_refobject, drr_refoffset. Replace this 319 * record with the found WRITE record, but with 320 * drr_object,drr_offset,drr_toguid replaced with ours. 321 */ 322 uint64_t stream_offset = 0; 323 rdt_lookup(&rdt, drrwb.drr_refguid, 324 drrwb.drr_refobject, drrwb.drr_refoffset, 325 &stream_offset); 326 327 spread(infd, drr, sizeof (*drr), stream_offset); 328 329 assert(drr->drr_type == DRR_WRITE); 330 struct drr_write *drrw = &drr->drr_u.drr_write; 331 assert(drrw->drr_toguid == drrwb.drr_refguid); 332 assert(drrw->drr_object == drrwb.drr_refobject); 333 assert(drrw->drr_offset == drrwb.drr_refoffset); 334 335 payload_size = DRR_WRITE_PAYLOAD_SIZE(drrw); 336 spread(infd, buf, payload_size, 337 stream_offset + sizeof (*drr)); 338 339 drrw->drr_toguid = drrwb.drr_toguid; 340 drrw->drr_object = drrwb.drr_object; 341 drrw->drr_offset = drrwb.drr_offset; 342 break; 343 } 344 345 case DRR_WRITE: 346 { 347 struct drr_write *drrw = &drr->drr_u.drr_write; 348 VERIFY3S(begin, ==, 1); 349 payload_size = DRR_WRITE_PAYLOAD_SIZE(drrw); 350 (void) sfread(buf, payload_size, ofp); 351 352 rdt_insert(&rdt, drrw->drr_toguid, 353 drrw->drr_object, drrw->drr_offset, offset); 354 break; 355 } 356 357 case DRR_WRITE_EMBEDDED: 358 { 359 struct drr_write_embedded *drrwe = 360 &drr->drr_u.drr_write_embedded; 361 VERIFY3S(begin, ==, 1); 362 payload_size = 363 P2ROUNDUP((uint64_t)drrwe->drr_psize, 8); 364 (void) sfread(buf, payload_size, ofp); 365 break; 366 } 367 368 case DRR_FREEOBJECTS: 369 case DRR_FREE: 370 case DRR_OBJECT_RANGE: 371 VERIFY3S(begin, ==, 1); 372 break; 373 374 default: 375 (void) fprintf(stderr, "INVALID record type 0x%x\n", 376 drr->drr_type); 377 /* should never happen, so assert */ 378 assert(B_FALSE); 379 } 380 381 if (feof(ofp)) { 382 fprintf(stderr, "Error: unexpected end-of-file\n"); 383 exit(1); 384 } 385 if (ferror(ofp)) { 386 fprintf(stderr, "Error while reading file: %s\n", 387 strerror(errno)); 388 exit(1); 389 } 390 391 /* 392 * We need to recalculate the checksum, and it needs to be 393 * initially zero to do that. BEGIN records don't have 394 * a checksum. 395 */ 396 if (drr->drr_type != DRR_BEGIN) { 397 memset(&drr->drr_u.drr_checksum.drr_checksum, 0, 398 sizeof (drr->drr_u.drr_checksum.drr_checksum)); 399 } 400 if (dump_record(drr, buf, payload_size, 401 &stream_cksum, outfd) != 0) 402 break; 403 if (drr->drr_type == DRR_END) { 404 /* 405 * Typically the END record is either the last 406 * thing in the stream, or it is followed 407 * by a BEGIN record (which also zeros the checksum). 408 * However, a stream package ends with two END 409 * records. The last END record's checksum starts 410 * from zero. 411 */ 412 ZIO_SET_CHECKSUM(&stream_cksum, 0, 0, 0, 0); 413 } 414 offset = ftell(ofp); 415 } 416 417 if (verbose) { 418 char mem_str[16]; 419 zfs_nicenum(rdt.ddt_count * sizeof (redup_entry_t), 420 mem_str, sizeof (mem_str)); 421 fprintf(stderr, "converted stream with %llu total records, " 422 "including %llu dedup records, using %sB memory.\n", 423 (long long)num_records, 424 (long long)num_write_byref_records, 425 mem_str); 426 } 427 428 umem_cache_destroy(rdt.ddecache); 429 free(rdt.redup_hash_array); 430 free(buf); 431 (void) fclose(ofp); 432 } 433 434 int 435 zstream_do_redup(int argc, char *argv[]) 436 { 437 boolean_t verbose = B_FALSE; 438 int c; 439 440 while ((c = getopt(argc, argv, "v")) != -1) { 441 switch (c) { 442 case 'v': 443 verbose = B_TRUE; 444 break; 445 case '?': 446 (void) fprintf(stderr, "invalid option '%c'\n", 447 optopt); 448 zstream_usage(); 449 break; 450 } 451 } 452 453 argc -= optind; 454 argv += optind; 455 456 if (argc != 1) 457 zstream_usage(); 458 459 const char *filename = argv[0]; 460 461 if (isatty(STDOUT_FILENO)) { 462 (void) fprintf(stderr, 463 "Error: Stream can not be written to a terminal.\n" 464 "You must redirect standard output.\n"); 465 return (1); 466 } 467 468 int fd = open(filename, O_RDONLY); 469 if (fd == -1) { 470 (void) fprintf(stderr, 471 "Error while opening file '%s': %s\n", 472 filename, strerror(errno)); 473 exit(1); 474 } 475 476 fletcher_4_init(); 477 zfs_redup_stream(fd, STDOUT_FILENO, verbose); 478 fletcher_4_fini(); 479 480 close(fd); 481 482 return (0); 483 } 484