1 /* 2 * CDDL HEADER START 3 * 4 * This file and its contents are supplied under the terms of the 5 * Common Development and Distribution License ("CDDL"), version 1.0. 6 * You may only use this file in accordance with the terms of version 7 * 1.0 of the CDDL. 8 * 9 * A full copy of the text of the CDDL should have accompanied this 10 * source. A copy of the CDDL is also available via the Internet at 11 * http://www.illumos.org/license/CDDL. 12 * 13 * CDDL HEADER END 14 */ 15 16 /* 17 * Copyright (c) 2020 by Delphix. All rights reserved. 18 */ 19 20 #include <assert.h> 21 #include <cityhash.h> 22 #include <ctype.h> 23 #include <errno.h> 24 #include <fcntl.h> 25 #include <libzfs.h> 26 #include <libzutil.h> 27 #include <stddef.h> 28 #include <stdio.h> 29 #include <stdlib.h> 30 #include <string.h> 31 #include <umem.h> 32 #include <unistd.h> 33 #include <sys/debug.h> 34 #include <sys/stat.h> 35 #include <sys/zfs_ioctl.h> 36 #include <sys/zio_checksum.h> 37 #include "zfs_fletcher.h" 38 #include "zstream.h" 39 40 41 #define MAX_RDT_PHYSMEM_PERCENT 20 42 #define SMALLEST_POSSIBLE_MAX_RDT_MB 128 43 44 typedef struct redup_entry { 45 struct redup_entry *rde_next; 46 uint64_t rde_guid; 47 uint64_t rde_object; 48 uint64_t rde_offset; 49 uint64_t rde_stream_offset; 50 } redup_entry_t; 51 52 typedef struct redup_table { 53 redup_entry_t **redup_hash_array; 54 umem_cache_t *ddecache; 55 uint64_t ddt_count; 56 int numhashbits; 57 } redup_table_t; 58 59 int 60 highbit64(uint64_t i) 61 { 62 if (i == 0) 63 return (0); 64 65 return (NBBY * sizeof (uint64_t) - __builtin_clzll(i)); 66 } 67 68 void * 69 safe_calloc(size_t n) 70 { 71 void *rv = calloc(1, n); 72 if (rv == NULL) { 73 fprintf(stderr, 74 "Error: could not allocate %u bytes of memory\n", 75 (int)n); 76 exit(1); 77 } 78 return (rv); 79 } 80 81 /* 82 * Safe version of fread(), exits on error. 83 */ 84 int 85 sfread(void *buf, size_t size, FILE *fp) 86 { 87 int rv = fread(buf, size, 1, fp); 88 if (rv == 0 && ferror(fp)) { 89 (void) fprintf(stderr, "Error while reading file: %s\n", 90 strerror(errno)); 91 exit(1); 92 } 93 return (rv); 94 } 95 96 /* 97 * Safe version of pread(), exits on error. 98 */ 99 static void 100 spread(int fd, void *buf, size_t count, off_t offset) 101 { 102 ssize_t err = pread(fd, buf, count, offset); 103 if (err == -1) { 104 (void) fprintf(stderr, 105 "Error while reading file: %s\n", 106 strerror(errno)); 107 exit(1); 108 } else if (err != count) { 109 (void) fprintf(stderr, 110 "Error while reading file: short read\n"); 111 exit(1); 112 } 113 } 114 115 static int 116 dump_record(dmu_replay_record_t *drr, void *payload, int payload_len, 117 zio_cksum_t *zc, int outfd) 118 { 119 assert(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum) 120 == sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t)); 121 fletcher_4_incremental_native(drr, 122 offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), zc); 123 if (drr->drr_type != DRR_BEGIN) { 124 assert(ZIO_CHECKSUM_IS_ZERO(&drr->drr_u. 125 drr_checksum.drr_checksum)); 126 drr->drr_u.drr_checksum.drr_checksum = *zc; 127 } 128 fletcher_4_incremental_native(&drr->drr_u.drr_checksum.drr_checksum, 129 sizeof (zio_cksum_t), zc); 130 if (write(outfd, drr, sizeof (*drr)) == -1) 131 return (errno); 132 if (payload_len != 0) { 133 fletcher_4_incremental_native(payload, payload_len, zc); 134 if (write(outfd, payload, payload_len) == -1) 135 return (errno); 136 } 137 return (0); 138 } 139 140 static void 141 rdt_insert(redup_table_t *rdt, 142 uint64_t guid, uint64_t object, uint64_t offset, uint64_t stream_offset) 143 { 144 uint64_t ch = cityhash4(guid, object, offset, 0); 145 uint64_t hashcode = BF64_GET(ch, 0, rdt->numhashbits); 146 redup_entry_t **rdepp; 147 148 rdepp = &(rdt->redup_hash_array[hashcode]); 149 redup_entry_t *rde = umem_cache_alloc(rdt->ddecache, UMEM_NOFAIL); 150 rde->rde_next = *rdepp; 151 rde->rde_guid = guid; 152 rde->rde_object = object; 153 rde->rde_offset = offset; 154 rde->rde_stream_offset = stream_offset; 155 *rdepp = rde; 156 rdt->ddt_count++; 157 } 158 159 static void 160 rdt_lookup(redup_table_t *rdt, 161 uint64_t guid, uint64_t object, uint64_t offset, 162 uint64_t *stream_offsetp) 163 { 164 uint64_t ch = cityhash4(guid, object, offset, 0); 165 uint64_t hashcode = BF64_GET(ch, 0, rdt->numhashbits); 166 167 for (redup_entry_t *rde = rdt->redup_hash_array[hashcode]; 168 rde != NULL; rde = rde->rde_next) { 169 if (rde->rde_guid == guid && 170 rde->rde_object == object && 171 rde->rde_offset == offset) { 172 *stream_offsetp = rde->rde_stream_offset; 173 return; 174 } 175 } 176 assert(!"could not find expected redup table entry"); 177 } 178 179 /* 180 * Convert a dedup stream (generated by "zfs send -D") to a 181 * non-deduplicated stream. The entire infd will be converted, including 182 * any substreams in a stream package (generated by "zfs send -RD"). The 183 * infd must be seekable. 184 */ 185 static void 186 zfs_redup_stream(int infd, int outfd, boolean_t verbose) 187 { 188 int bufsz = SPA_MAXBLOCKSIZE; 189 dmu_replay_record_t thedrr; 190 dmu_replay_record_t *drr = &thedrr; 191 redup_table_t rdt; 192 zio_cksum_t stream_cksum; 193 uint64_t numbuckets; 194 uint64_t num_records = 0; 195 uint64_t num_write_byref_records = 0; 196 197 memset(&thedrr, 0, sizeof (dmu_replay_record_t)); 198 199 #ifdef _ILP32 200 uint64_t max_rde_size = SMALLEST_POSSIBLE_MAX_RDT_MB << 20; 201 #else 202 uint64_t physmem = sysconf(_SC_PHYS_PAGES) * sysconf(_SC_PAGESIZE); 203 uint64_t max_rde_size = 204 MAX((physmem * MAX_RDT_PHYSMEM_PERCENT) / 100, 205 SMALLEST_POSSIBLE_MAX_RDT_MB << 20); 206 #endif 207 208 numbuckets = max_rde_size / (sizeof (redup_entry_t)); 209 210 /* 211 * numbuckets must be a power of 2. Increase number to 212 * a power of 2 if necessary. 213 */ 214 if (!ISP2(numbuckets)) 215 numbuckets = 1ULL << highbit64(numbuckets); 216 217 rdt.redup_hash_array = 218 safe_calloc(numbuckets * sizeof (redup_entry_t *)); 219 rdt.ddecache = umem_cache_create("rde", sizeof (redup_entry_t), 0, 220 NULL, NULL, NULL, NULL, NULL, 0); 221 rdt.numhashbits = highbit64(numbuckets) - 1; 222 rdt.ddt_count = 0; 223 224 char *buf = safe_calloc(bufsz); 225 FILE *ofp = fdopen(infd, "r"); 226 long offset = ftell(ofp); 227 int begin = 0; 228 boolean_t seen = B_FALSE; 229 while (sfread(drr, sizeof (*drr), ofp) != 0) { 230 num_records++; 231 232 /* 233 * We need to regenerate the checksum. 234 */ 235 if (drr->drr_type != DRR_BEGIN) { 236 memset(&drr->drr_u.drr_checksum.drr_checksum, 0, 237 sizeof (drr->drr_u.drr_checksum.drr_checksum)); 238 } 239 240 uint64_t payload_size = 0; 241 switch (drr->drr_type) { 242 case DRR_BEGIN: 243 { 244 struct drr_begin *drrb = &drr->drr_u.drr_begin; 245 int fflags; 246 ZIO_SET_CHECKSUM(&stream_cksum, 0, 0, 0, 0); 247 VERIFY0(begin++); 248 seen = B_TRUE; 249 250 assert(drrb->drr_magic == DMU_BACKUP_MAGIC); 251 252 /* clear the DEDUP feature flag for this stream */ 253 fflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo); 254 fflags &= ~(DMU_BACKUP_FEATURE_DEDUP | 255 DMU_BACKUP_FEATURE_DEDUPPROPS); 256 /* cppcheck-suppress syntaxError */ 257 DMU_SET_FEATUREFLAGS(drrb->drr_versioninfo, fflags); 258 259 uint32_t sz = drr->drr_payloadlen; 260 261 VERIFY3U(sz, <=, 1U << 28); 262 263 if (sz != 0) { 264 if (sz > bufsz) { 265 free(buf); 266 buf = safe_calloc(sz); 267 bufsz = sz; 268 } 269 (void) sfread(buf, sz, ofp); 270 } 271 payload_size = sz; 272 break; 273 } 274 275 case DRR_END: 276 { 277 struct drr_end *drre = &drr->drr_u.drr_end; 278 /* 279 * We would prefer to just check --begin == 0, but 280 * replication streams have an end of stream END 281 * record, so we must avoid tripping it. 282 */ 283 VERIFY3B(seen, ==, B_TRUE); 284 begin--; 285 /* 286 * Use the recalculated checksum, unless this is 287 * the END record of a stream package, which has 288 * no checksum. 289 */ 290 if (!ZIO_CHECKSUM_IS_ZERO(&drre->drr_checksum)) 291 drre->drr_checksum = stream_cksum; 292 break; 293 } 294 295 case DRR_OBJECT: 296 { 297 struct drr_object *drro = &drr->drr_u.drr_object; 298 VERIFY3S(begin, ==, 1); 299 300 if (drro->drr_bonuslen > 0) { 301 payload_size = DRR_OBJECT_PAYLOAD_SIZE(drro); 302 (void) sfread(buf, payload_size, ofp); 303 } 304 break; 305 } 306 307 case DRR_SPILL: 308 { 309 struct drr_spill *drrs = &drr->drr_u.drr_spill; 310 VERIFY3S(begin, ==, 1); 311 payload_size = DRR_SPILL_PAYLOAD_SIZE(drrs); 312 (void) sfread(buf, payload_size, ofp); 313 break; 314 } 315 316 case DRR_WRITE_BYREF: 317 { 318 struct drr_write_byref drrwb = 319 drr->drr_u.drr_write_byref; 320 VERIFY3S(begin, ==, 1); 321 322 num_write_byref_records++; 323 324 /* 325 * Look up in hash table by drrwb->drr_refguid, 326 * drr_refobject, drr_refoffset. Replace this 327 * record with the found WRITE record, but with 328 * drr_object,drr_offset,drr_toguid replaced with ours. 329 */ 330 uint64_t stream_offset = 0; 331 rdt_lookup(&rdt, drrwb.drr_refguid, 332 drrwb.drr_refobject, drrwb.drr_refoffset, 333 &stream_offset); 334 335 spread(infd, drr, sizeof (*drr), stream_offset); 336 337 assert(drr->drr_type == DRR_WRITE); 338 struct drr_write *drrw = &drr->drr_u.drr_write; 339 assert(drrw->drr_toguid == drrwb.drr_refguid); 340 assert(drrw->drr_object == drrwb.drr_refobject); 341 assert(drrw->drr_offset == drrwb.drr_refoffset); 342 343 payload_size = DRR_WRITE_PAYLOAD_SIZE(drrw); 344 spread(infd, buf, payload_size, 345 stream_offset + sizeof (*drr)); 346 347 drrw->drr_toguid = drrwb.drr_toguid; 348 drrw->drr_object = drrwb.drr_object; 349 drrw->drr_offset = drrwb.drr_offset; 350 break; 351 } 352 353 case DRR_WRITE: 354 { 355 struct drr_write *drrw = &drr->drr_u.drr_write; 356 VERIFY3S(begin, ==, 1); 357 payload_size = DRR_WRITE_PAYLOAD_SIZE(drrw); 358 (void) sfread(buf, payload_size, ofp); 359 360 rdt_insert(&rdt, drrw->drr_toguid, 361 drrw->drr_object, drrw->drr_offset, offset); 362 break; 363 } 364 365 case DRR_WRITE_EMBEDDED: 366 { 367 struct drr_write_embedded *drrwe = 368 &drr->drr_u.drr_write_embedded; 369 VERIFY3S(begin, ==, 1); 370 payload_size = 371 P2ROUNDUP((uint64_t)drrwe->drr_psize, 8); 372 (void) sfread(buf, payload_size, ofp); 373 break; 374 } 375 376 case DRR_FREEOBJECTS: 377 case DRR_FREE: 378 case DRR_OBJECT_RANGE: 379 VERIFY3S(begin, ==, 1); 380 break; 381 382 default: 383 (void) fprintf(stderr, "INVALID record type 0x%x\n", 384 drr->drr_type); 385 /* should never happen, so assert */ 386 assert(B_FALSE); 387 } 388 389 if (feof(ofp)) { 390 fprintf(stderr, "Error: unexpected end-of-file\n"); 391 exit(1); 392 } 393 if (ferror(ofp)) { 394 fprintf(stderr, "Error while reading file: %s\n", 395 strerror(errno)); 396 exit(1); 397 } 398 399 /* 400 * We need to recalculate the checksum, and it needs to be 401 * initially zero to do that. BEGIN records don't have 402 * a checksum. 403 */ 404 if (drr->drr_type != DRR_BEGIN) { 405 memset(&drr->drr_u.drr_checksum.drr_checksum, 0, 406 sizeof (drr->drr_u.drr_checksum.drr_checksum)); 407 } 408 if (dump_record(drr, buf, payload_size, 409 &stream_cksum, outfd) != 0) 410 break; 411 if (drr->drr_type == DRR_END) { 412 /* 413 * Typically the END record is either the last 414 * thing in the stream, or it is followed 415 * by a BEGIN record (which also zeros the checksum). 416 * However, a stream package ends with two END 417 * records. The last END record's checksum starts 418 * from zero. 419 */ 420 ZIO_SET_CHECKSUM(&stream_cksum, 0, 0, 0, 0); 421 } 422 offset = ftell(ofp); 423 } 424 425 if (verbose) { 426 char mem_str[16]; 427 zfs_nicenum(rdt.ddt_count * sizeof (redup_entry_t), 428 mem_str, sizeof (mem_str)); 429 fprintf(stderr, "converted stream with %llu total records, " 430 "including %llu dedup records, using %sB memory.\n", 431 (long long)num_records, 432 (long long)num_write_byref_records, 433 mem_str); 434 } 435 436 umem_cache_destroy(rdt.ddecache); 437 free(rdt.redup_hash_array); 438 free(buf); 439 (void) fclose(ofp); 440 } 441 442 int 443 zstream_do_redup(int argc, char *argv[]) 444 { 445 boolean_t verbose = B_FALSE; 446 int c; 447 448 while ((c = getopt(argc, argv, "v")) != -1) { 449 switch (c) { 450 case 'v': 451 verbose = B_TRUE; 452 break; 453 case '?': 454 (void) fprintf(stderr, "invalid option '%c'\n", 455 optopt); 456 zstream_usage(); 457 break; 458 } 459 } 460 461 argc -= optind; 462 argv += optind; 463 464 if (argc != 1) 465 zstream_usage(); 466 467 const char *filename = argv[0]; 468 469 if (isatty(STDOUT_FILENO)) { 470 (void) fprintf(stderr, 471 "Error: Stream can not be written to a terminal.\n" 472 "You must redirect standard output.\n"); 473 return (1); 474 } 475 476 int fd = open(filename, O_RDONLY); 477 if (fd == -1) { 478 (void) fprintf(stderr, 479 "Error while opening file '%s': %s\n", 480 filename, strerror(errno)); 481 exit(1); 482 } 483 484 fletcher_4_init(); 485 zfs_redup_stream(fd, STDOUT_FILENO, verbose); 486 fletcher_4_fini(); 487 488 close(fd); 489 490 return (0); 491 } 492