/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * The objective of this program is to provide a DMU/ZAP/SPA stress test
 * that runs entirely in userland, is easy to use, and easy to extend.
 *
 * The overall design of the ztest program is as follows:
 *
 * (1) For each major functional area (e.g. adding vdevs to a pool,
 *     creating and destroying datasets, reading and writing objects, etc)
 *     we have a simple routine to test that functionality.  These
 *     individual routines do not have to do anything "stressful".
 *
 * (2) We turn these simple functionality tests into a stress test by
 *     running them all in parallel, with as many threads as desired,
 *     and spread across as many datasets, objects, and vdevs as desired.
 *
 * (3) While all this is happening, we inject faults into the pool to
 *     verify that self-healing data really works.
 *
 * (4) Every time we open a dataset, we change its checksum and compression
 *     functions.  Thus even individual objects vary from block to block
 *     in which checksum they use and whether they're compressed.
 *
 * (5) To verify that we never lose on-disk consistency after a crash,
 *     we run the entire test in a child of the main process.
 *     At random times, the child self-immolates with a SIGKILL.
 *     This is the software equivalent of pulling the power cord.
 *     The parent then runs the test again, using the existing
 *     storage pool, as many times as desired.
 *
 * (6) To verify that we don't have future leaks or temporal incursions,
 *     many of the functional tests record the transaction group number
 *     as part of their data.  When reading old data, they verify that
 *     the transaction group number is less than the current, open txg.
 *     If you add a new test, please do this if applicable.
 *
 * When run with no arguments, ztest runs for about five minutes and
 * produces no output if successful.  To get a little bit of information,
 * specify -V.  To get more information, specify -VV, and so on.
 *
 * To turn this into an overnight stress test, use -T to specify run time.
 *
 * You can ask for more vdevs [-v], datasets [-d], or threads [-t]
 * to increase the pool capacity, fanout, and overall stress level.
 *
 * The -N(okill) option will suppress kills, so each child runs to completion.
 * This can be useful when you're trying to distinguish temporal incursions
 * from plain old race conditions.
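 *
 * For example, a hypothetical overnight run on a larger pool with more
 * concurrency might look like:
 *
 *	ztest -V -T 28800 -v 8 -d 16 -t 64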
 */

#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/dmu.h>
#include <sys/txg.h>
#include <sys/dbuf.h>
#include <sys/zap.h>
#include <sys/dmu_objset.h>
#include <sys/poll.h>
#include <sys/stat.h>
#include <sys/time.h>
#include <sys/wait.h>
#include <sys/mman.h>
#include <sys/resource.h>
#include <sys/zio.h>
#include <sys/zil.h>
#include <sys/zil_impl.h>
#include <sys/vdev_impl.h>
#include <sys/vdev_file.h>
#include <sys/spa_impl.h>
#include <sys/metaslab_impl.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_dataset.h>
#include <sys/refcount.h>
#include <stdio.h>
#include <stdio_ext.h>
#include <stdlib.h>
#include <unistd.h>
#include <signal.h>
#include <umem.h>
#include <dlfcn.h>
#include <ctype.h>
#include <math.h>
#include <sys/fs/zfs.h>
#include <libnvpair.h>

static char cmdname[] = "ztest";
static char *zopt_pool = cmdname;

static uint64_t zopt_vdevs = 5;
static uint64_t zopt_vdevtime;
static int zopt_ashift = SPA_MINBLOCKSHIFT;
static int zopt_mirrors = 2;
static int zopt_raidz = 4;
static int zopt_raidz_parity = 1;
static size_t zopt_vdev_size = SPA_MINDEVSIZE;
static int zopt_datasets = 7;
static int zopt_threads = 23;
static uint64_t zopt_passtime = 60;	/* 60 seconds */
static uint64_t zopt_killrate = 70;	/* 70% kill rate */
static int zopt_verbose = 0;
static int zopt_init = 1;
static char *zopt_dir = "/tmp";
static uint64_t zopt_time = 300;	/* 5 minutes */
static int zopt_maxfaults;

#define	BT_MAGIC	0x123456789abcdefULL

enum ztest_io_type {
	ZTEST_IO_WRITE_TAG,
	ZTEST_IO_WRITE_PATTERN,
	ZTEST_IO_WRITE_ZEROES,
	ZTEST_IO_TRUNCATE,
	ZTEST_IO_SETATTR,
	ZTEST_IO_TYPES
};

typedef struct ztest_block_tag {
	uint64_t	bt_magic;
	uint64_t	bt_objset;
	uint64_t	bt_object;
	uint64_t	bt_offset;
	uint64_t	bt_gen;
	uint64_t	bt_txg;
	uint64_t	bt_crtxg;
} ztest_block_tag_t;

typedef struct bufwad {
	uint64_t	bw_index;
	uint64_t	bw_txg;
	uint64_t	bw_data;
} bufwad_t;

/*
 * XXX -- fix zfs range locks to be generic so we can use them here.
 */
typedef enum {
	RL_READER,
	RL_WRITER,
	RL_APPEND
} rl_type_t;

typedef struct rll {
	void		*rll_writer;
	int		rll_readers;
	mutex_t		rll_lock;
	cond_t		rll_cv;
} rll_t;

typedef struct rl {
	uint64_t	rl_object;
	uint64_t	rl_offset;
	uint64_t	rl_size;
	rll_t		*rl_lock;
} rl_t;

#define	ZTEST_RANGE_LOCKS	64
#define	ZTEST_OBJECT_LOCKS	64

/*
 * Object descriptor.  Used as a template for object lookup/create/remove.
 */
typedef struct ztest_od {
	uint64_t	od_dir;
	uint64_t	od_object;
	dmu_object_type_t od_type;
	dmu_object_type_t od_crtype;
	uint64_t	od_blocksize;
	uint64_t	od_crblocksize;
	uint64_t	od_gen;
	uint64_t	od_crgen;
	char		od_name[MAXNAMELEN];
} ztest_od_t;

/*
 * Per-dataset state.
 */
typedef struct ztest_ds {
	objset_t	*zd_os;
	zilog_t		*zd_zilog;
	uint64_t	zd_seq;
	ztest_od_t	*zd_od;		/* debugging aid */
	char		zd_name[MAXNAMELEN];
	mutex_t		zd_dirobj_lock;
	rll_t		zd_object_lock[ZTEST_OBJECT_LOCKS];
	rll_t		zd_range_lock[ZTEST_RANGE_LOCKS];
} ztest_ds_t;

/*
 * Per-iteration state.
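 * Each test function receives the per-dataset state above plus a
 * thread/iteration id; the ztest_info[] table below schedules how
 * often each one runs.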
 */
typedef void ztest_func_t(ztest_ds_t *zd, uint64_t id);

typedef struct ztest_info {
	ztest_func_t	*zi_func;	/* test function */
	uint64_t	zi_iters;	/* iterations per execution */
	uint64_t	*zi_interval;	/* execute every <interval> seconds */
	uint64_t	zi_call_count;	/* per-pass count */
	uint64_t	zi_call_time;	/* per-pass time */
	uint64_t	zi_call_next;	/* next time to call this function */
} ztest_info_t;

/*
 * Note: these aren't static because we want dladdr() to work.
 */
ztest_func_t ztest_dmu_read_write;
ztest_func_t ztest_dmu_write_parallel;
ztest_func_t ztest_dmu_object_alloc_free;
ztest_func_t ztest_dmu_commit_callbacks;
ztest_func_t ztest_zap;
ztest_func_t ztest_zap_parallel;
ztest_func_t ztest_zil_commit;
ztest_func_t ztest_dmu_read_write_zcopy;
ztest_func_t ztest_dmu_objset_create_destroy;
ztest_func_t ztest_dmu_prealloc;
ztest_func_t ztest_fzap;
ztest_func_t ztest_dmu_snapshot_create_destroy;
ztest_func_t ztest_dsl_prop_get_set;
ztest_func_t ztest_spa_prop_get_set;
ztest_func_t ztest_spa_create_destroy;
ztest_func_t ztest_fault_inject;
ztest_func_t ztest_ddt_repair;
ztest_func_t ztest_dmu_snapshot_hold;
ztest_func_t ztest_spa_rename;
ztest_func_t ztest_scrub;
ztest_func_t ztest_dsl_dataset_promote_busy;
ztest_func_t ztest_vdev_attach_detach;
ztest_func_t ztest_vdev_LUN_growth;
ztest_func_t ztest_vdev_add_remove;
ztest_func_t ztest_vdev_aux_add_remove;

uint64_t zopt_always = 0ULL * NANOSEC;		/* all the time */
uint64_t zopt_incessant = 1ULL * NANOSEC / 10;	/* every 1/10 second */
uint64_t zopt_often = 1ULL * NANOSEC;		/* every second */
uint64_t zopt_sometimes = 10ULL * NANOSEC;	/* every 10 seconds */
uint64_t zopt_rarely = 60ULL * NANOSEC;		/* every 60 seconds */

ztest_info_t ztest_info[] = {
	{ ztest_dmu_read_write,			1,	&zopt_always	},
	{ ztest_dmu_write_parallel,		10,	&zopt_always	},
	{ ztest_dmu_object_alloc_free,		1,	&zopt_always	},
	{ ztest_dmu_commit_callbacks,		1,	&zopt_always	},
	{ ztest_zap,				30,	&zopt_always	},
	{ ztest_zap_parallel,			100,	&zopt_always	},
	{ ztest_zil_commit,			1,	&zopt_incessant	},
	{ ztest_dmu_read_write_zcopy,		1,	&zopt_often	},
	{ ztest_dmu_objset_create_destroy,	1,	&zopt_often	},
	{ ztest_dsl_prop_get_set,		1,	&zopt_often	},
	{ ztest_spa_prop_get_set,		1,	&zopt_sometimes	},
#if 0
	{ ztest_dmu_prealloc,			1,	&zopt_sometimes	},
#endif
	{ ztest_fzap,				1,	&zopt_sometimes	},
	{ ztest_dmu_snapshot_create_destroy,	1,	&zopt_sometimes	},
	{ ztest_spa_create_destroy,		1,	&zopt_sometimes	},
	{ ztest_fault_inject,			1,	&zopt_sometimes	},
	{ ztest_ddt_repair,			1,	&zopt_sometimes	},
	{ ztest_dmu_snapshot_hold,		1,	&zopt_sometimes	},
	{ ztest_spa_rename,			1,	&zopt_rarely	},
	{ ztest_scrub,				1,	&zopt_rarely	},
	{ ztest_dsl_dataset_promote_busy,	1,	&zopt_rarely	},
	{ ztest_vdev_attach_detach,		1,	&zopt_rarely	},
	{ ztest_vdev_LUN_growth,		1,	&zopt_rarely	},
	{ ztest_vdev_add_remove,		1,	&zopt_vdevtime	},
	{ ztest_vdev_aux_add_remove,		1,	&zopt_vdevtime	},
};

#define	ZTEST_FUNCS	(sizeof (ztest_info) / sizeof (ztest_info_t))

/*
 * The following struct is used to hold a list of uncalled commit callbacks.
 * The callbacks are ordered by txg number.
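 * (Keeping them sorted makes it straightforward to check that
 * callbacks fire in txg order as each transaction group commits.)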
 */
typedef struct ztest_cb_list {
	mutex_t	zcl_callbacks_lock;
	list_t	zcl_callbacks;
} ztest_cb_list_t;

/*
 * Stuff we need to share writably between parent and child.
 */
typedef struct ztest_shared {
	char		*zs_pool;
	spa_t		*zs_spa;
	hrtime_t	zs_proc_start;
	hrtime_t	zs_proc_stop;
	hrtime_t	zs_thread_start;
	hrtime_t	zs_thread_stop;
	hrtime_t	zs_thread_kill;
	uint64_t	zs_enospc_count;
	uint64_t	zs_vdev_next_leaf;
	uint64_t	zs_vdev_aux;
	uint64_t	zs_alloc;
	uint64_t	zs_space;
	mutex_t		zs_vdev_lock;
	rwlock_t	zs_name_lock;
	ztest_info_t	zs_info[ZTEST_FUNCS];
	ztest_ds_t	zs_zd[];
} ztest_shared_t;

#define	ID_PARALLEL	-1ULL

static char ztest_dev_template[] = "%s/%s.%llua";
static char ztest_aux_template[] = "%s/%s.%s.%llu";
ztest_shared_t *ztest_shared;
uint64_t *ztest_seq;

static int ztest_random_fd;
static int ztest_dump_core = 1;

static boolean_t ztest_exiting;

/* Global commit callback list */
static ztest_cb_list_t zcl;

extern uint64_t metaslab_gang_bang;
extern uint64_t metaslab_df_alloc_threshold;
static uint64_t metaslab_sz;

enum ztest_object {
	ZTEST_META_DNODE = 0,
	ZTEST_DIROBJ,
	ZTEST_OBJECTS
};

static void usage(boolean_t) __NORETURN;

/*
 * These libumem hooks provide a reasonable set of defaults for the allocator's
 * debugging facilities.
 */
const char *
_umem_debug_init(void)
{
	return ("default,verbose");	/* $UMEM_DEBUG setting */
}

const char *
_umem_logging_init(void)
{
	return ("fail,contents");	/* $UMEM_LOGGING setting */
}

#define	FATAL_MSG_SZ	1024

char *fatal_msg;
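/*
 * Format and print a fatal error message, optionally appending
 * strerror(errno), then abort() so a core is dumped, or exit(3) when
 * core dumps are disabled via ztest_dump_core.
 */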
static void
fatal(int do_perror, char *message, ...)
{
	va_list args;
	int save_errno = errno;
	char buf[FATAL_MSG_SZ];

	(void) fflush(stdout);

	va_start(args, message);
	(void) sprintf(buf, "ztest: ");
	/* LINTED */
	(void) vsprintf(buf + strlen(buf), message, args);
	va_end(args);
	if (do_perror) {
		(void) snprintf(buf + strlen(buf), FATAL_MSG_SZ - strlen(buf),
		    ": %s", strerror(save_errno));
	}
	(void) fprintf(stderr, "%s\n", buf);
	fatal_msg = buf;			/* to ease debugging */
	if (ztest_dump_core)
		abort();
	exit(3);
}

static int
str2shift(const char *buf)
{
	const char *ends = "BKMGTPEZ";
	int i;

	if (buf[0] == '\0')
		return (0);
	for (i = 0; i < strlen(ends); i++) {
		if (toupper(buf[0]) == ends[i])
			break;
	}
	if (i == strlen(ends)) {
		(void) fprintf(stderr, "ztest: invalid bytes suffix: %s\n",
		    buf);
		usage(B_FALSE);
	}
	if (buf[1] == '\0' || (toupper(buf[1]) == 'B' && buf[2] == '\0')) {
		return (10*i);
	}
	(void) fprintf(stderr, "ztest: invalid bytes suffix: %s\n", buf);
	usage(B_FALSE);
	/* NOTREACHED */
}

static uint64_t
nicenumtoull(const char *buf)
{
	char *end;
	uint64_t val;

	val = strtoull(buf, &end, 0);
	if (end == buf) {
		(void) fprintf(stderr, "ztest: bad numeric value: %s\n", buf);
		usage(B_FALSE);
	} else if (end[0] == '.') {
		double fval = strtod(buf, &end);
		fval *= pow(2, str2shift(end));
		if (fval > UINT64_MAX) {
			(void) fprintf(stderr, "ztest: value too large: %s\n",
			    buf);
			usage(B_FALSE);
		}
		val = (uint64_t)fval;
	} else {
		int shift = str2shift(end);
		if (shift >= 64 || (val << shift) >> shift != val) {
			(void) fprintf(stderr, "ztest: value too large: %s\n",
			    buf);
			usage(B_FALSE);
		}
		val <<= shift;
	}
	return (val);
}

static void
usage(boolean_t requested)
{
	char nice_vdev_size[10];
	char nice_gang_bang[10];
	FILE *fp = requested ? stdout : stderr;

	nicenum(zopt_vdev_size, nice_vdev_size);
	nicenum(metaslab_gang_bang, nice_gang_bang);

	(void) fprintf(fp, "Usage: %s\n"
	    "\t[-v vdevs (default: %llu)]\n"
	    "\t[-s size_of_each_vdev (default: %s)]\n"
	    "\t[-a alignment_shift (default: %d) (use 0 for random)]\n"
	    "\t[-m mirror_copies (default: %d)]\n"
	    "\t[-r raidz_disks (default: %d)]\n"
	    "\t[-R raidz_parity (default: %d)]\n"
	    "\t[-d datasets (default: %d)]\n"
	    "\t[-t threads (default: %d)]\n"
	    "\t[-g gang_block_threshold (default: %s)]\n"
	    "\t[-i initialize pool i times (default: %d)]\n"
	    "\t[-k kill percentage (default: %llu%%)]\n"
	    "\t[-p pool_name (default: %s)]\n"
	    "\t[-f file directory for vdev files (default: %s)]\n"
	    "\t[-V(erbose)] (use multiple times for ever more blather)\n"
	    "\t[-E(xisting)] (use existing pool instead of creating new one)\n"
	    "\t[-T time] total run time (default: %llu sec)\n"
	    "\t[-P passtime] time per pass (default: %llu sec)\n"
	    "\t[-h] (print help)\n"
	    "",
	    cmdname,
	    (u_longlong_t)zopt_vdevs,		/* -v */
	    nice_vdev_size,			/* -s */
	    zopt_ashift,			/* -a */
	    zopt_mirrors,			/* -m */
	    zopt_raidz,				/* -r */
	    zopt_raidz_parity,			/* -R */
	    zopt_datasets,			/* -d */
	    zopt_threads,			/* -t */
	    nice_gang_bang,			/* -g */
	    zopt_init,				/* -i */
	    (u_longlong_t)zopt_killrate,	/* -k */
	    zopt_pool,				/* -p */
	    zopt_dir,				/* -f */
	    (u_longlong_t)zopt_time,		/* -T */
	    (u_longlong_t)zopt_passtime);	/* -P */
	exit(requested ? 0 : 1);
}
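/*
 * Numeric option arguments are parsed with nicenumtoull(), so suffixed
 * values such as "-s 256m" or "-s 1.5g" (hypothetical examples) are
 * accepted wherever a size or count is expected.
 */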
static void
process_options(int argc, char **argv)
{
	int opt;
	uint64_t value;

	/* By default, test gang blocks for blocks 32K and greater */
	metaslab_gang_bang = 32 << 10;

	while ((opt = getopt(argc, argv,
	    "v:s:a:m:r:R:d:t:g:i:k:p:f:VET:P:h")) != EOF) {
		value = 0;
		switch (opt) {
		case 'v':
		case 's':
		case 'a':
		case 'm':
		case 'r':
		case 'R':
		case 'd':
		case 't':
		case 'g':
		case 'i':
		case 'k':
		case 'T':
		case 'P':
			value = nicenumtoull(optarg);
		}
		switch (opt) {
		case 'v':
			zopt_vdevs = value;
			break;
		case 's':
			zopt_vdev_size = MAX(SPA_MINDEVSIZE, value);
			break;
		case 'a':
			zopt_ashift = value;
			break;
		case 'm':
			zopt_mirrors = value;
			break;
		case 'r':
			zopt_raidz = MAX(1, value);
			break;
		case 'R':
			zopt_raidz_parity = MIN(MAX(value, 1), 3);
			break;
		case 'd':
			zopt_datasets = MAX(1, value);
			break;
		case 't':
			zopt_threads = MAX(1, value);
			break;
		case 'g':
			metaslab_gang_bang = MAX(SPA_MINBLOCKSIZE << 1, value);
			break;
		case 'i':
			zopt_init = value;
			break;
		case 'k':
			zopt_killrate = value;
			break;
		case 'p':
			zopt_pool = strdup(optarg);
			break;
		case 'f':
			zopt_dir = strdup(optarg);
			break;
		case 'V':
			zopt_verbose++;
			break;
		case 'E':
			zopt_init = 0;
			break;
		case 'T':
			zopt_time = value;
			break;
		case 'P':
			zopt_passtime = MAX(1, value);
			break;
		case 'h':
			usage(B_TRUE);
			break;
		case '?':
		default:
			usage(B_FALSE);
			break;
		}
	}

	zopt_raidz_parity = MIN(zopt_raidz_parity, zopt_raidz - 1);

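	/*
	 * Spread vdev add/remove tests evenly across the total run time;
	 * with no vdevs the interval is effectively infinite.
	 */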
	zopt_vdevtime = (zopt_vdevs > 0 ? zopt_time * NANOSEC / zopt_vdevs :
	    UINT64_MAX >> 2);
	zopt_maxfaults = MAX(zopt_mirrors, 1) * (zopt_raidz_parity + 1) - 1;
}

static void
ztest_kill(ztest_shared_t *zs)
{
	zs->zs_alloc = metaslab_class_get_alloc(spa_normal_class(zs->zs_spa));
	zs->zs_space = metaslab_class_get_space(spa_normal_class(zs->zs_spa));
	(void) kill(getpid(), SIGKILL);
}

static uint64_t
ztest_random(uint64_t range)
{
	uint64_t r;

	if (range == 0)
		return (0);

	if (read(ztest_random_fd, &r, sizeof (r)) != sizeof (r))
		fatal(1, "short read from /dev/urandom");

	return (r % range);
}

/* ARGSUSED */
static void
ztest_record_enospc(const char *s)
{
	ztest_shared->zs_enospc_count++;
}

static uint64_t
ztest_get_ashift(void)
{
	if (zopt_ashift == 0)
		return (SPA_MINBLOCKSHIFT + ztest_random(3));
	return (zopt_ashift);
}

static nvlist_t *
make_vdev_file(char *path, char *aux, size_t size, uint64_t ashift)
{
	char pathbuf[MAXPATHLEN];
	uint64_t vdev;
	nvlist_t *file;

	if (ashift == 0)
		ashift = ztest_get_ashift();

	if (path == NULL) {
		path = pathbuf;

		if (aux != NULL) {
			vdev = ztest_shared->zs_vdev_aux;
			(void) sprintf(path, ztest_aux_template,
			    zopt_dir, zopt_pool, aux, vdev);
		} else {
			vdev = ztest_shared->zs_vdev_next_leaf++;
			(void) sprintf(path, ztest_dev_template,
			    zopt_dir, zopt_pool, vdev);
		}
	}

	if (size != 0) {
		int fd = open(path, O_RDWR | O_CREAT | O_TRUNC, 0666);
		if (fd == -1)
			fatal(1, "can't open %s", path);
		if (ftruncate(fd, size) != 0)
			fatal(1, "can't ftruncate %s", path);
		(void) close(fd);
	}

	VERIFY(nvlist_alloc(&file, NV_UNIQUE_NAME, 0) == 0);
	VERIFY(nvlist_add_string(file, ZPOOL_CONFIG_TYPE, VDEV_TYPE_FILE) == 0);
	VERIFY(nvlist_add_string(file, ZPOOL_CONFIG_PATH, path) == 0);
	VERIFY(nvlist_add_uint64(file, ZPOOL_CONFIG_ASHIFT, ashift) == 0);

	return (file);
}

static nvlist_t *
make_vdev_raidz(char *path, char *aux, size_t size, uint64_t ashift, int r)
{
	nvlist_t *raidz, **child;
	int c;

	if (r < 2)
		return (make_vdev_file(path, aux, size, ashift));
	child = umem_alloc(r * sizeof (nvlist_t *), UMEM_NOFAIL);

	for (c = 0; c < r; c++)
		child[c] = make_vdev_file(path, aux, size, ashift);

	VERIFY(nvlist_alloc(&raidz, NV_UNIQUE_NAME, 0) == 0);
	VERIFY(nvlist_add_string(raidz, ZPOOL_CONFIG_TYPE,
	    VDEV_TYPE_RAIDZ) == 0);
	VERIFY(nvlist_add_uint64(raidz, ZPOOL_CONFIG_NPARITY,
	    zopt_raidz_parity) == 0);
	VERIFY(nvlist_add_nvlist_array(raidz, ZPOOL_CONFIG_CHILDREN,
	    child, r) == 0);

	for (c = 0; c < r; c++)
		nvlist_free(child[c]);

	umem_free(child, r * sizeof (nvlist_t *));

	return (raidz);
}

static nvlist_t *
make_vdev_mirror(char *path, char *aux, size_t size, uint64_t ashift,
    int r, int m)
{
	nvlist_t *mirror, **child;
	int c;

	if (m < 1)
		return (make_vdev_raidz(path, aux, size, ashift, r));

	child = umem_alloc(m * sizeof (nvlist_t *), UMEM_NOFAIL);

	for (c = 0; c < m; c++)
		child[c] = make_vdev_raidz(path, aux, size, ashift, r);

	VERIFY(nvlist_alloc(&mirror, NV_UNIQUE_NAME, 0) == 0);
	VERIFY(nvlist_add_string(mirror, ZPOOL_CONFIG_TYPE,
	    VDEV_TYPE_MIRROR) == 0);
	VERIFY(nvlist_add_nvlist_array(mirror, ZPOOL_CONFIG_CHILDREN,
	    child, m) == 0);

	for (c = 0; c < m; c++)
		nvlist_free(child[c]);

	umem_free(child, m * sizeof (nvlist_t *));

	return (mirror);
}
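/*
 * Build a complete root vdev spec: t top-level vdevs, each an m-way
 * mirror of r-disk raidz groups of file vdevs (m < 1 and r < 2 degrade
 * to the simpler layouts above), optionally tagged as log devices.
 */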
static nvlist_t *
make_vdev_root(char *path, char *aux, size_t size, uint64_t ashift,
    int log, int r, int m, int t)
{
	nvlist_t *root, **child;
	int c;

	ASSERT(t > 0);

	child = umem_alloc(t * sizeof (nvlist_t *), UMEM_NOFAIL);

	for (c = 0; c < t; c++) {
		child[c] = make_vdev_mirror(path, aux, size, ashift, r, m);
		VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_IS_LOG,
		    log) == 0);
	}

	VERIFY(nvlist_alloc(&root, NV_UNIQUE_NAME, 0) == 0);
	VERIFY(nvlist_add_string(root, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT) == 0);
	VERIFY(nvlist_add_nvlist_array(root, aux ? aux : ZPOOL_CONFIG_CHILDREN,
	    child, t) == 0);

	for (c = 0; c < t; c++)
		nvlist_free(child[c]);

	umem_free(child, t * sizeof (nvlist_t *));

	return (root);
}

static int
ztest_random_blocksize(void)
{
	return (1 << (SPA_MINBLOCKSHIFT +
	    ztest_random(SPA_MAXBLOCKSHIFT - SPA_MINBLOCKSHIFT + 1)));
}

static int
ztest_random_ibshift(void)
{
	return (DN_MIN_INDBLKSHIFT +
	    ztest_random(DN_MAX_INDBLKSHIFT - DN_MIN_INDBLKSHIFT + 1));
}

static uint64_t
ztest_random_vdev_top(spa_t *spa, boolean_t log_ok)
{
	uint64_t top;
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *tvd;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);

	do {
		top = ztest_random(rvd->vdev_children);
		tvd = rvd->vdev_child[top];
	} while (tvd->vdev_ishole || (tvd->vdev_islog && !log_ok) ||
	    tvd->vdev_mg == NULL || tvd->vdev_mg->mg_class == NULL);

	return (top);
}

static uint64_t
ztest_random_dsl_prop(zfs_prop_t prop)
{
	uint64_t value;

	do {
		value = zfs_prop_random_value(prop, ztest_random(-1ULL));
	} while (prop == ZFS_PROP_CHECKSUM && value == ZIO_CHECKSUM_OFF);

	return (value);
}
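/*
 * Set a numeric dataset property (or re-inherit it) and read the value
 * back to confirm where it took effect.
 */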
static int
ztest_dsl_prop_set_uint64(char *osname, zfs_prop_t prop, uint64_t value,
    boolean_t inherit)
{
	const char *propname = zfs_prop_to_name(prop);
	const char *valname;
	char setpoint[MAXPATHLEN];
	uint64_t curval;
	int error;

	error = dsl_prop_set(osname, propname, sizeof (value),
	    inherit ? 0 : 1, &value);

	if (error == ENOSPC) {
		ztest_record_enospc(FTAG);
		return (error);
	}
	ASSERT3U(error, ==, 0);

	VERIFY3U(dsl_prop_get(osname, propname, sizeof (curval),
	    1, &curval, setpoint), ==, 0);

	if (zopt_verbose >= 6) {
		VERIFY(zfs_prop_index_to_string(prop, curval, &valname) == 0);
		(void) printf("%s %s = %s at '%s'\n",
		    osname, propname, valname, setpoint);
	}

	return (error);
}

#if 0
static int
ztest_spa_prop_set_uint64(ztest_shared_t *zs, zpool_prop_t prop, uint64_t value)
{
	spa_t *spa = zs->zs_spa;
	nvlist_t *props = NULL;
	int error;

	VERIFY(nvlist_alloc(&props, NV_UNIQUE_NAME, 0) == 0);
	VERIFY(nvlist_add_uint64(props, zpool_prop_to_name(prop), value) == 0);

	error = spa_prop_set(spa, props);

	nvlist_free(props);

	if (error == ENOSPC) {
		ztest_record_enospc(FTAG);
		return (error);
	}
	ASSERT3U(error, ==, 0);

	return (error);
}
#endif

static void
ztest_rll_init(rll_t *rll)
{
	rll->rll_writer = NULL;
	rll->rll_readers = 0;
	VERIFY(_mutex_init(&rll->rll_lock, USYNC_THREAD, NULL) == 0);
	VERIFY(cond_init(&rll->rll_cv, USYNC_THREAD, NULL) == 0);
}

static void
ztest_rll_destroy(rll_t *rll)
{
	ASSERT(rll->rll_writer == NULL);
	ASSERT(rll->rll_readers == 0);
	VERIFY(_mutex_destroy(&rll->rll_lock) == 0);
	VERIFY(cond_destroy(&rll->rll_cv) == 0);
}

static void
ztest_rll_lock(rll_t *rll, rl_type_t type)
{
	VERIFY(mutex_lock(&rll->rll_lock) == 0);

	if (type == RL_READER) {
		while (rll->rll_writer != NULL)
			(void) cond_wait(&rll->rll_cv, &rll->rll_lock);
		rll->rll_readers++;
	} else {
		while (rll->rll_writer != NULL || rll->rll_readers)
			(void) cond_wait(&rll->rll_cv, &rll->rll_lock);
		rll->rll_writer = curthread;
	}

	VERIFY(mutex_unlock(&rll->rll_lock) == 0);
}

static void
ztest_rll_unlock(rll_t *rll)
{
	VERIFY(mutex_lock(&rll->rll_lock) == 0);

	if (rll->rll_writer) {
		ASSERT(rll->rll_readers == 0);
		rll->rll_writer = NULL;
	} else {
		ASSERT(rll->rll_readers != 0);
		ASSERT(rll->rll_writer == NULL);
		rll->rll_readers--;
	}

	if (rll->rll_writer == NULL && rll->rll_readers == 0)
		VERIFY(cond_broadcast(&rll->rll_cv) == 0);

	VERIFY(mutex_unlock(&rll->rll_lock) == 0);
}

static void
ztest_object_lock(ztest_ds_t *zd, uint64_t object, rl_type_t type)
{
	rll_t *rll = &zd->zd_object_lock[object & (ZTEST_OBJECT_LOCKS - 1)];

	ztest_rll_lock(rll, type);
}

static void
ztest_object_unlock(ztest_ds_t *zd, uint64_t object)
{
	rll_t *rll = &zd->zd_object_lock[object & (ZTEST_OBJECT_LOCKS - 1)];

	ztest_rll_unlock(rll);
}

static rl_t *
ztest_range_lock(ztest_ds_t *zd, uint64_t object, uint64_t offset,
    uint64_t size, rl_type_t type)
{
	uint64_t hash = object ^ (offset % (ZTEST_RANGE_LOCKS + 1));
	rll_t *rll = &zd->zd_range_lock[hash & (ZTEST_RANGE_LOCKS - 1)];
	rl_t *rl;

	rl = umem_alloc(sizeof (*rl), UMEM_NOFAIL);
	rl->rl_object = object;
	rl->rl_offset = offset;
	rl->rl_size = size;
	rl->rl_lock = rll;

	ztest_rll_lock(rll, type);

	return (rl);
}

static void
ztest_range_unlock(rl_t *rl)
{
	rll_t *rll = rl->rl_lock;

	ztest_rll_unlock(rll);

	umem_free(rl, sizeof (*rl));
}
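/*
 * Typical usage of the lock helpers above (sketch):
 *
 *	ztest_object_lock(zd, object, RL_READER);
 *	rl = ztest_range_lock(zd, object, offset, size, RL_WRITER);
 *	... operate on [offset, offset + size) ...
 *	ztest_range_unlock(rl);
 *	ztest_object_unlock(zd, object);
 */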
static void
ztest_zd_init(ztest_ds_t *zd, objset_t *os)
{
	zd->zd_os = os;
	zd->zd_zilog = dmu_objset_zil(os);
	zd->zd_seq = 0;
	dmu_objset_name(os, zd->zd_name);

	VERIFY(_mutex_init(&zd->zd_dirobj_lock, USYNC_THREAD, NULL) == 0);

	for (int l = 0; l < ZTEST_OBJECT_LOCKS; l++)
		ztest_rll_init(&zd->zd_object_lock[l]);

	for (int l = 0; l < ZTEST_RANGE_LOCKS; l++)
		ztest_rll_init(&zd->zd_range_lock[l]);
}

static void
ztest_zd_fini(ztest_ds_t *zd)
{
	VERIFY(_mutex_destroy(&zd->zd_dirobj_lock) == 0);

	for (int l = 0; l < ZTEST_OBJECT_LOCKS; l++)
		ztest_rll_destroy(&zd->zd_object_lock[l]);

	for (int l = 0; l < ZTEST_RANGE_LOCKS; l++)
		ztest_rll_destroy(&zd->zd_range_lock[l]);
}

#define	TXG_MIGHTWAIT	(ztest_random(10) == 0 ? TXG_NOWAIT : TXG_WAIT)

static uint64_t
ztest_tx_assign(dmu_tx_t *tx, uint64_t txg_how, const char *tag)
{
	uint64_t txg;
	int error;

	/*
	 * Attempt to assign tx to some transaction group.
	 */
	error = dmu_tx_assign(tx, txg_how);
	if (error) {
		if (error == ERESTART) {
			ASSERT(txg_how == TXG_NOWAIT);
			dmu_tx_wait(tx);
		} else {
			ASSERT3U(error, ==, ENOSPC);
			ztest_record_enospc(tag);
		}
		dmu_tx_abort(tx);
		return (0);
	}
	txg = dmu_tx_get_txg(tx);
	ASSERT(txg != 0);
	return (txg);
}

static void
ztest_pattern_set(void *buf, uint64_t size, uint64_t value)
{
	uint64_t *ip = buf;
	uint64_t *ip_end = (uint64_t *)((uintptr_t)buf + (uintptr_t)size);

	while (ip < ip_end)
		*ip++ = value;
}

static boolean_t
ztest_pattern_match(void *buf, uint64_t size, uint64_t value)
{
	uint64_t *ip = buf;
	uint64_t *ip_end = (uint64_t *)((uintptr_t)buf + (uintptr_t)size);
	uint64_t diff = 0;

	while (ip < ip_end)
		diff |= (value - *ip++);

	return (diff == 0);
}

static void
ztest_bt_generate(ztest_block_tag_t *bt, objset_t *os, uint64_t object,
    uint64_t offset, uint64_t gen, uint64_t txg, uint64_t crtxg)
{
	bt->bt_magic = BT_MAGIC;
	bt->bt_objset = dmu_objset_id(os);
	bt->bt_object = object;
	bt->bt_offset = offset;
	bt->bt_gen = gen;
	bt->bt_txg = txg;
	bt->bt_crtxg = crtxg;
}

static void
ztest_bt_verify(ztest_block_tag_t *bt, objset_t *os, uint64_t object,
    uint64_t offset, uint64_t gen, uint64_t txg, uint64_t crtxg)
{
	ASSERT(bt->bt_magic == BT_MAGIC);
	ASSERT(bt->bt_objset == dmu_objset_id(os));
	ASSERT(bt->bt_object == object);
	ASSERT(bt->bt_offset == offset);
	ASSERT(bt->bt_gen <= gen);
	ASSERT(bt->bt_txg <= txg);
	ASSERT(bt->bt_crtxg == crtxg);
}

static ztest_block_tag_t *
ztest_bt_bonus(dmu_buf_t *db)
{
	dmu_object_info_t doi;
	ztest_block_tag_t *bt;

	dmu_object_info_from_db(db, &doi);
	ASSERT3U(doi.doi_bonus_size, <=, db->db_size);
	ASSERT3U(doi.doi_bonus_size, >=, sizeof (*bt));
	bt = (void *)((char *)db->db_data + doi.doi_bonus_size - sizeof (*bt));

	return (bt);
}

/*
 * ZIL logging ops
 */

#define	lrz_type	lr_mode
#define	lrz_blocksize	lr_uid
#define	lrz_ibshift	lr_gid
#define	lrz_bonustype	lr_rdev
#define	lrz_bonuslen	lr_crtime[1]
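/*
 * ztest doesn't use the mode/uid/gid/rdev/crtime[1] fields of
 * lr_create_t, so the lrz_* aliases above overlay them with the
 * object-creation parameters it actually needs to log.
 */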
static uint64_t
ztest_log_create(ztest_ds_t *zd, dmu_tx_t *tx, lr_create_t *lr)
{
	char *name = (void *)(lr + 1);		/* name follows lr */
	size_t namesize = strlen(name) + 1;
	itx_t *itx;

	if (zil_replaying(zd->zd_zilog, tx))
		return (0);

	itx = zil_itx_create(TX_CREATE, sizeof (*lr) + namesize);
	bcopy(&lr->lr_common + 1, &itx->itx_lr + 1,
	    sizeof (*lr) + namesize - sizeof (lr_t));

	return (zil_itx_assign(zd->zd_zilog, itx, tx));
}

static uint64_t
ztest_log_remove(ztest_ds_t *zd, dmu_tx_t *tx, lr_remove_t *lr)
{
	char *name = (void *)(lr + 1);		/* name follows lr */
	size_t namesize = strlen(name) + 1;
	itx_t *itx;

	if (zil_replaying(zd->zd_zilog, tx))
		return (0);

	itx = zil_itx_create(TX_REMOVE, sizeof (*lr) + namesize);
	bcopy(&lr->lr_common + 1, &itx->itx_lr + 1,
	    sizeof (*lr) + namesize - sizeof (lr_t));

	return (zil_itx_assign(zd->zd_zilog, itx, tx));
}

static uint64_t
ztest_log_write(ztest_ds_t *zd, dmu_tx_t *tx, lr_write_t *lr)
{
	itx_t *itx;
	itx_wr_state_t write_state = ztest_random(WR_NUM_STATES);

	if (zil_replaying(zd->zd_zilog, tx))
		return (0);

	if (lr->lr_length > ZIL_MAX_LOG_DATA)
		write_state = WR_INDIRECT;

	itx = zil_itx_create(TX_WRITE,
	    sizeof (*lr) + (write_state == WR_COPIED ? lr->lr_length : 0));

	if (write_state == WR_COPIED &&
	    dmu_read(zd->zd_os, lr->lr_foid, lr->lr_offset, lr->lr_length,
	    ((lr_write_t *)&itx->itx_lr) + 1, DMU_READ_NO_PREFETCH) != 0) {
		zil_itx_destroy(itx);
		itx = zil_itx_create(TX_WRITE, sizeof (*lr));
		write_state = WR_NEED_COPY;
	}
	itx->itx_private = zd;
	itx->itx_wr_state = write_state;
	itx->itx_sync = (ztest_random(8) == 0);
	itx->itx_sod += (write_state == WR_NEED_COPY ? lr->lr_length : 0);

	bcopy(&lr->lr_common + 1, &itx->itx_lr + 1,
	    sizeof (*lr) - sizeof (lr_t));

	return (zil_itx_assign(zd->zd_zilog, itx, tx));
}

static uint64_t
ztest_log_truncate(ztest_ds_t *zd, dmu_tx_t *tx, lr_truncate_t *lr)
{
	itx_t *itx;

	if (zil_replaying(zd->zd_zilog, tx))
		return (0);

	itx = zil_itx_create(TX_TRUNCATE, sizeof (*lr));
	bcopy(&lr->lr_common + 1, &itx->itx_lr + 1,
	    sizeof (*lr) - sizeof (lr_t));

	return (zil_itx_assign(zd->zd_zilog, itx, tx));
}

static uint64_t
ztest_log_setattr(ztest_ds_t *zd, dmu_tx_t *tx, lr_setattr_t *lr)
{
	itx_t *itx;

	if (zil_replaying(zd->zd_zilog, tx))
		return (0);

	itx = zil_itx_create(TX_SETATTR, sizeof (*lr));
	bcopy(&lr->lr_common + 1, &itx->itx_lr + 1,
	    sizeof (*lr) - sizeof (lr_t));

	return (zil_itx_assign(zd->zd_zilog, itx, tx));
}

/*
 * ZIL replay ops
 */
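/*
 * These functions do double duty: the ZIL invokes them during replay,
 * and the ztest_create()/ztest_write()/etc. wrappers below call them
 * directly in open context, so both paths exercise the same code.
 */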
static int
ztest_replay_create(ztest_ds_t *zd, lr_create_t *lr, boolean_t byteswap)
{
	char *name = (void *)(lr + 1);		/* name follows lr */
	objset_t *os = zd->zd_os;
	ztest_block_tag_t *bbt;
	dmu_buf_t *db;
	dmu_tx_t *tx;
	uint64_t txg;
	int error = 0;

	if (byteswap)
		byteswap_uint64_array(lr, sizeof (*lr));

	ASSERT(lr->lr_doid == ZTEST_DIROBJ);
	ASSERT(name[0] != '\0');

	tx = dmu_tx_create(os);

	dmu_tx_hold_zap(tx, lr->lr_doid, B_TRUE, name);

	if (lr->lrz_type == DMU_OT_ZAP_OTHER) {
		dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
	} else {
		dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
	}

	txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
	if (txg == 0)
		return (ENOSPC);

	ASSERT(dmu_objset_zil(os)->zl_replay == !!lr->lr_foid);

	if (lr->lrz_type == DMU_OT_ZAP_OTHER) {
		if (lr->lr_foid == 0) {
			lr->lr_foid = zap_create(os,
			    lr->lrz_type, lr->lrz_bonustype,
			    lr->lrz_bonuslen, tx);
		} else {
			error = zap_create_claim(os, lr->lr_foid,
			    lr->lrz_type, lr->lrz_bonustype,
			    lr->lrz_bonuslen, tx);
		}
	} else {
		if (lr->lr_foid == 0) {
			lr->lr_foid = dmu_object_alloc(os,
			    lr->lrz_type, 0, lr->lrz_bonustype,
			    lr->lrz_bonuslen, tx);
		} else {
			error = dmu_object_claim(os, lr->lr_foid,
			    lr->lrz_type, 0, lr->lrz_bonustype,
			    lr->lrz_bonuslen, tx);
		}
	}

	if (error) {
		ASSERT3U(error, ==, EEXIST);
		ASSERT(zd->zd_zilog->zl_replay);
		dmu_tx_commit(tx);
		return (error);
	}

	ASSERT(lr->lr_foid != 0);

	if (lr->lrz_type != DMU_OT_ZAP_OTHER)
		VERIFY3U(0, ==, dmu_object_set_blocksize(os, lr->lr_foid,
		    lr->lrz_blocksize, lr->lrz_ibshift, tx));

	VERIFY3U(0, ==, dmu_bonus_hold(os, lr->lr_foid, FTAG, &db));
	bbt = ztest_bt_bonus(db);
	dmu_buf_will_dirty(db, tx);
	ztest_bt_generate(bbt, os, lr->lr_foid, -1ULL, lr->lr_gen, txg, txg);
	dmu_buf_rele(db, FTAG);

	VERIFY3U(0, ==, zap_add(os, lr->lr_doid, name, sizeof (uint64_t), 1,
	    &lr->lr_foid, tx));

	(void) ztest_log_create(zd, tx, lr);

	dmu_tx_commit(tx);

	return (0);
}

static int
ztest_replay_remove(ztest_ds_t *zd, lr_remove_t *lr, boolean_t byteswap)
{
	char *name = (void *)(lr + 1);		/* name follows lr */
	objset_t *os = zd->zd_os;
	dmu_object_info_t doi;
	dmu_tx_t *tx;
	uint64_t object, txg;

	if (byteswap)
		byteswap_uint64_array(lr, sizeof (*lr));

	ASSERT(lr->lr_doid == ZTEST_DIROBJ);
	ASSERT(name[0] != '\0');

	VERIFY3U(0, ==,
	    zap_lookup(os, lr->lr_doid, name, sizeof (object), 1, &object));
	ASSERT(object != 0);

	ztest_object_lock(zd, object, RL_WRITER);

	VERIFY3U(0, ==, dmu_object_info(os, object, &doi));

	tx = dmu_tx_create(os);

	dmu_tx_hold_zap(tx, lr->lr_doid, B_FALSE, name);
	dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END);

	txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
	if (txg == 0) {
		ztest_object_unlock(zd, object);
		return (ENOSPC);
	}

	if (doi.doi_type == DMU_OT_ZAP_OTHER) {
		VERIFY3U(0, ==, zap_destroy(os, object, tx));
	} else {
		VERIFY3U(0, ==, dmu_object_free(os, object, tx));
	}

	VERIFY3U(0, ==, zap_remove(os, lr->lr_doid, name, tx));

	(void) ztest_log_remove(zd, tx, lr);

	dmu_tx_commit(tx);

	ztest_object_unlock(zd, object);

	return (0);
}
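/*
 * Replay (or perform) a write.  A record of exactly sizeof (lr_write_t)
 * describes a dmu_sync()ed block, so the write is widened to the
 * containing block; any embedded block tag is verified before being
 * refreshed.
 */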
static int
ztest_replay_write(ztest_ds_t *zd, lr_write_t *lr, boolean_t byteswap)
{
	objset_t *os = zd->zd_os;
	void *data = lr + 1;			/* data follows lr */
	uint64_t offset, length;
	ztest_block_tag_t *bt = data;
	ztest_block_tag_t *bbt;
	uint64_t gen, txg, lrtxg, crtxg;
	dmu_object_info_t doi;
	dmu_tx_t *tx;
	dmu_buf_t *db;
	arc_buf_t *abuf = NULL;
	rl_t *rl;

	if (byteswap)
		byteswap_uint64_array(lr, sizeof (*lr));

	offset = lr->lr_offset;
	length = lr->lr_length;

	/* If it's a dmu_sync() block, write the whole block */
	if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) {
		uint64_t blocksize = BP_GET_LSIZE(&lr->lr_blkptr);
		if (length < blocksize) {
			offset -= offset % blocksize;
			length = blocksize;
		}
	}

	if (bt->bt_magic == BSWAP_64(BT_MAGIC))
		byteswap_uint64_array(bt, sizeof (*bt));

	if (bt->bt_magic != BT_MAGIC)
		bt = NULL;

	ztest_object_lock(zd, lr->lr_foid, RL_READER);
	rl = ztest_range_lock(zd, lr->lr_foid, offset, length, RL_WRITER);

	VERIFY3U(0, ==, dmu_bonus_hold(os, lr->lr_foid, FTAG, &db));

	dmu_object_info_from_db(db, &doi);

	bbt = ztest_bt_bonus(db);
	ASSERT3U(bbt->bt_magic, ==, BT_MAGIC);
	gen = bbt->bt_gen;
	crtxg = bbt->bt_crtxg;
	lrtxg = lr->lr_common.lrc_txg;

	tx = dmu_tx_create(os);

	dmu_tx_hold_write(tx, lr->lr_foid, offset, length);

	if (ztest_random(8) == 0 && length == doi.doi_data_block_size &&
	    P2PHASE(offset, length) == 0)
		abuf = dmu_request_arcbuf(db, length);

	txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
	if (txg == 0) {
		if (abuf != NULL)
			dmu_return_arcbuf(abuf);
		dmu_buf_rele(db, FTAG);
		ztest_range_unlock(rl);
		ztest_object_unlock(zd, lr->lr_foid);
		return (ENOSPC);
	}

	if (bt != NULL) {
		/*
		 * Usually, verify the old data before writing new data --
		 * but not always, because we also want to verify correct
		 * behavior when the data was not recently read into cache.
		 */
		ASSERT(offset % doi.doi_data_block_size == 0);
		if (ztest_random(4) != 0) {
			int prefetch = ztest_random(2) ?
			    DMU_READ_PREFETCH : DMU_READ_NO_PREFETCH;
			ztest_block_tag_t rbt;

			VERIFY(dmu_read(os, lr->lr_foid, offset,
			    sizeof (rbt), &rbt, prefetch) == 0);
			if (rbt.bt_magic == BT_MAGIC) {
				ztest_bt_verify(&rbt, os, lr->lr_foid,
				    offset, gen, txg, crtxg);
			}
		}

		/*
		 * Writes can appear to be newer than the bonus buffer because
		 * the ztest_get_data() callback does a dmu_read() of the
		 * open-context data, which may be different than the data
		 * as it was when the write was generated.
		 */
		if (zd->zd_zilog->zl_replay) {
			ztest_bt_verify(bt, os, lr->lr_foid, offset,
			    MAX(gen, bt->bt_gen), MAX(txg, lrtxg),
			    bt->bt_crtxg);
		}

		/*
		 * Set the bt's gen/txg to the bonus buffer's gen/txg
		 * so that all of the usual ASSERTs will work.
		 */
		ztest_bt_generate(bt, os, lr->lr_foid, offset, gen, txg, crtxg);
	}

	if (abuf == NULL) {
		dmu_write(os, lr->lr_foid, offset, length, data, tx);
	} else {
		bcopy(data, abuf->b_data, length);
		dmu_assign_arcbuf(db, offset, abuf, tx);
	}

	(void) ztest_log_write(zd, tx, lr);

	dmu_buf_rele(db, FTAG);

	dmu_tx_commit(tx);

	ztest_range_unlock(rl);
	ztest_object_unlock(zd, lr->lr_foid);

	return (0);
}
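/*
 * Replay (or perform) a truncate: free the byte range under the range
 * lock and log the operation for any future replay.
 */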
static int
ztest_replay_truncate(ztest_ds_t *zd, lr_truncate_t *lr, boolean_t byteswap)
{
	objset_t *os = zd->zd_os;
	dmu_tx_t *tx;
	uint64_t txg;
	rl_t *rl;

	if (byteswap)
		byteswap_uint64_array(lr, sizeof (*lr));

	ztest_object_lock(zd, lr->lr_foid, RL_READER);
	rl = ztest_range_lock(zd, lr->lr_foid, lr->lr_offset, lr->lr_length,
	    RL_WRITER);

	tx = dmu_tx_create(os);

	dmu_tx_hold_free(tx, lr->lr_foid, lr->lr_offset, lr->lr_length);

	txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
	if (txg == 0) {
		ztest_range_unlock(rl);
		ztest_object_unlock(zd, lr->lr_foid);
		return (ENOSPC);
	}

	VERIFY(dmu_free_range(os, lr->lr_foid, lr->lr_offset,
	    lr->lr_length, tx) == 0);

	(void) ztest_log_truncate(zd, tx, lr);

	dmu_tx_commit(tx);

	ztest_range_unlock(rl);
	ztest_object_unlock(zd, lr->lr_foid);

	return (0);
}
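/*
 * Replay (or perform) a setattr.  In open context the new bonus size
 * and "mode" (which ztest reuses as the generation) are chosen randomly
 * here, so replay and normal operation share one code path.
 */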
static int
ztest_replay_setattr(ztest_ds_t *zd, lr_setattr_t *lr, boolean_t byteswap)
{
	objset_t *os = zd->zd_os;
	dmu_tx_t *tx;
	dmu_buf_t *db;
	ztest_block_tag_t *bbt;
	uint64_t txg, lrtxg, crtxg;

	if (byteswap)
		byteswap_uint64_array(lr, sizeof (*lr));

	ztest_object_lock(zd, lr->lr_foid, RL_WRITER);

	VERIFY3U(0, ==, dmu_bonus_hold(os, lr->lr_foid, FTAG, &db));

	tx = dmu_tx_create(os);
	dmu_tx_hold_bonus(tx, lr->lr_foid);

	txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
	if (txg == 0) {
		dmu_buf_rele(db, FTAG);
		ztest_object_unlock(zd, lr->lr_foid);
		return (ENOSPC);
	}

	bbt = ztest_bt_bonus(db);
	ASSERT3U(bbt->bt_magic, ==, BT_MAGIC);
	crtxg = bbt->bt_crtxg;
	lrtxg = lr->lr_common.lrc_txg;

	if (zd->zd_zilog->zl_replay) {
		ASSERT(lr->lr_size != 0);
		ASSERT(lr->lr_mode != 0);
		ASSERT(lrtxg != 0);
	} else {
		/*
		 * Randomly change the size and increment the generation.
		 */
		lr->lr_size = (ztest_random(db->db_size / sizeof (*bbt)) + 1) *
		    sizeof (*bbt);
		lr->lr_mode = bbt->bt_gen + 1;
		ASSERT(lrtxg == 0);
	}

	/*
	 * Verify that the current bonus buffer is not newer than our txg.
	 */
	ztest_bt_verify(bbt, os, lr->lr_foid, -1ULL, lr->lr_mode,
	    MAX(txg, lrtxg), crtxg);

	dmu_buf_will_dirty(db, tx);

	ASSERT3U(lr->lr_size, >=, sizeof (*bbt));
	ASSERT3U(lr->lr_size, <=, db->db_size);
	VERIFY3U(dmu_set_bonus(db, lr->lr_size, tx), ==, 0);
	bbt = ztest_bt_bonus(db);

	ztest_bt_generate(bbt, os, lr->lr_foid, -1ULL, lr->lr_mode, txg, crtxg);

	dmu_buf_rele(db, FTAG);

	(void) ztest_log_setattr(zd, tx, lr);

	dmu_tx_commit(tx);

	ztest_object_unlock(zd, lr->lr_foid);

	return (0);
}

zil_replay_func_t *ztest_replay_vector[TX_MAX_TYPE] = {
	NULL,			/* 0 no such transaction type */
	ztest_replay_create,	/* TX_CREATE */
	NULL,			/* TX_MKDIR */
	NULL,			/* TX_MKXATTR */
	NULL,			/* TX_SYMLINK */
	ztest_replay_remove,	/* TX_REMOVE */
	NULL,			/* TX_RMDIR */
	NULL,			/* TX_LINK */
	NULL,			/* TX_RENAME */
	ztest_replay_write,	/* TX_WRITE */
	ztest_replay_truncate,	/* TX_TRUNCATE */
	ztest_replay_setattr,	/* TX_SETATTR */
	NULL,			/* TX_ACL */
	NULL,			/* TX_CREATE_ACL */
	NULL,			/* TX_CREATE_ATTR */
	NULL,			/* TX_CREATE_ACL_ATTR */
	NULL,			/* TX_MKDIR_ACL */
	NULL,			/* TX_MKDIR_ATTR */
	NULL,			/* TX_MKDIR_ACL_ATTR */
	NULL,			/* TX_WRITE2 */
};

/*
 * ZIL get_data callbacks
 */

static void
ztest_get_done(zgd_t *zgd, int error)
{
	ztest_ds_t *zd = zgd->zgd_private;
	uint64_t object = zgd->zgd_rl->rl_object;

	if (zgd->zgd_db)
		dmu_buf_rele(zgd->zgd_db, zgd);

	ztest_range_unlock(zgd->zgd_rl);
	ztest_object_unlock(zd, object);

	if (error == 0 && zgd->zgd_bp)
		zil_add_block(zgd->zgd_zilog, zgd->zgd_bp);

	umem_free(zgd, sizeof (*zgd));
}
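/*
 * dmu_sync() callback: for an immediate write, copy the data into the
 * ZIL's buffer; otherwise hold the whole block so dmu_sync() can fill
 * in the block pointer.  Locks and holds are dropped in ztest_get_done().
 */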
static int
ztest_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
{
	ztest_ds_t *zd = arg;
	objset_t *os = zd->zd_os;
	uint64_t object = lr->lr_foid;
	uint64_t offset = lr->lr_offset;
	uint64_t size = lr->lr_length;
	blkptr_t *bp = &lr->lr_blkptr;
	uint64_t txg = lr->lr_common.lrc_txg;
	uint64_t crtxg;
	dmu_object_info_t doi;
	dmu_buf_t *db;
	zgd_t *zgd;
	int error;

	ztest_object_lock(zd, object, RL_READER);
	error = dmu_bonus_hold(os, object, FTAG, &db);
	if (error) {
		ztest_object_unlock(zd, object);
		return (error);
	}

	crtxg = ztest_bt_bonus(db)->bt_crtxg;

	if (crtxg == 0 || crtxg > txg) {
		dmu_buf_rele(db, FTAG);
		ztest_object_unlock(zd, object);
		return (ENOENT);
	}

	dmu_object_info_from_db(db, &doi);
	dmu_buf_rele(db, FTAG);
	db = NULL;

	zgd = umem_zalloc(sizeof (*zgd), UMEM_NOFAIL);
	zgd->zgd_zilog = zd->zd_zilog;
	zgd->zgd_private = zd;

	if (buf != NULL) {	/* immediate write */
		zgd->zgd_rl = ztest_range_lock(zd, object, offset, size,
		    RL_READER);

		error = dmu_read(os, object, offset, size, buf,
		    DMU_READ_NO_PREFETCH);
		ASSERT(error == 0);
	} else {
		size = doi.doi_data_block_size;
		if (ISP2(size)) {
			offset = P2ALIGN(offset, size);
		} else {
			ASSERT(offset < size);
			offset = 0;
		}

		zgd->zgd_rl = ztest_range_lock(zd, object, offset, size,
		    RL_READER);

		error = dmu_buf_hold(os, object, offset, zgd, &db);

		if (error == 0) {
			zgd->zgd_db = db;
			zgd->zgd_bp = bp;

			ASSERT(db->db_offset == offset);
			ASSERT(db->db_size == size);

			error = dmu_sync(zio, lr->lr_common.lrc_txg,
			    ztest_get_done, zgd);

			if (error == 0)
				return (0);
		}
	}

	ztest_get_done(zgd, error);

	return (error);
}

static void *
ztest_lr_alloc(size_t lrsize, char *name)
{
	char *lr;
	size_t namesize = name ? strlen(name) + 1 : 0;

	lr = umem_zalloc(lrsize + namesize, UMEM_NOFAIL);

	if (name)
		bcopy(name, lr + lrsize, namesize);

	return (lr);
}

void
ztest_lr_free(void *lr, size_t lrsize, char *name)
{
	size_t namesize = name ? strlen(name) + 1 : 0;

	umem_free(lr, lrsize + namesize);
}

/*
 * Lookup a bunch of objects.  Returns the number of objects not found.
 */
static int
ztest_lookup(ztest_ds_t *zd, ztest_od_t *od, int count)
{
	int missing = 0;
	int error;

	ASSERT(_mutex_held(&zd->zd_dirobj_lock));

	for (int i = 0; i < count; i++, od++) {
		od->od_object = 0;
		error = zap_lookup(zd->zd_os, od->od_dir, od->od_name,
		    sizeof (uint64_t), 1, &od->od_object);
		if (error) {
			ASSERT(error == ENOENT);
			ASSERT(od->od_object == 0);
			missing++;
		} else {
			dmu_buf_t *db;
			ztest_block_tag_t *bbt;
			dmu_object_info_t doi;

			ASSERT(od->od_object != 0);
			ASSERT(missing == 0);	/* there should be no gaps */

			ztest_object_lock(zd, od->od_object, RL_READER);
			VERIFY3U(0, ==, dmu_bonus_hold(zd->zd_os,
			    od->od_object, FTAG, &db));
			dmu_object_info_from_db(db, &doi);
			bbt = ztest_bt_bonus(db);
			ASSERT3U(bbt->bt_magic, ==, BT_MAGIC);
			od->od_type = doi.doi_type;
			od->od_blocksize = doi.doi_data_block_size;
			od->od_gen = bbt->bt_gen;
			dmu_buf_rele(db, FTAG);
			ztest_object_unlock(zd, od->od_object);
		}
	}

	return (missing);
}

static int
ztest_create(ztest_ds_t *zd, ztest_od_t *od, int count)
{
	int missing = 0;

	ASSERT(_mutex_held(&zd->zd_dirobj_lock));

	for (int i = 0; i < count; i++, od++) {
		if (missing) {
			od->od_object = 0;
			missing++;
			continue;
		}

		lr_create_t *lr = ztest_lr_alloc(sizeof (*lr), od->od_name);

		lr->lr_doid = od->od_dir;
		lr->lr_foid = 0;	/* 0 to allocate, > 0 to claim */
		lr->lrz_type = od->od_crtype;
		lr->lrz_blocksize = od->od_crblocksize;
		lr->lrz_ibshift = ztest_random_ibshift();
		lr->lrz_bonustype = DMU_OT_UINT64_OTHER;
		lr->lrz_bonuslen = dmu_bonus_max();
		lr->lr_gen = od->od_crgen;
		lr->lr_crtime[0] = time(NULL);

		if (ztest_replay_create(zd, lr, B_FALSE) != 0) {
			ASSERT(missing == 0);
			od->od_object = 0;
			missing++;
		} else {
			od->od_object = lr->lr_foid;
			od->od_type = od->od_crtype;
			od->od_blocksize = od->od_crblocksize;
			od->od_gen = od->od_crgen;
			ASSERT(od->od_object != 0);
		}

		ztest_lr_free(lr, sizeof (*lr), od->od_name);
	}

	return (missing);
}
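/*
 * Remove objects in reverse creation order; once a removal fails
 * (e.g. ENOSPC), the remainder are simply counted as missing rather
 * than attempted, so any gap stays at the tail of the template.
 */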
static int
ztest_remove(ztest_ds_t *zd, ztest_od_t *od, int count)
{
	int missing = 0;
	int error;

	ASSERT(_mutex_held(&zd->zd_dirobj_lock));

	od += count - 1;

	for (int i = count - 1; i >= 0; i--, od--) {
		if (missing) {
			missing++;
			continue;
		}

		if (od->od_object == 0)
			continue;

		lr_remove_t *lr = ztest_lr_alloc(sizeof (*lr), od->od_name);

		lr->lr_doid = od->od_dir;

		if ((error = ztest_replay_remove(zd, lr, B_FALSE)) != 0) {
			ASSERT3U(error, ==, ENOSPC);
			missing++;
		} else {
			od->od_object = 0;
		}
		ztest_lr_free(lr, sizeof (*lr), od->od_name);
	}

	return (missing);
}

static int
ztest_write(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size,
    void *data)
{
	lr_write_t *lr;
	int error;

	lr = ztest_lr_alloc(sizeof (*lr) + size, NULL);

	lr->lr_foid = object;
	lr->lr_offset = offset;
	lr->lr_length = size;
	lr->lr_blkoff = 0;
	BP_ZERO(&lr->lr_blkptr);

	bcopy(data, lr + 1, size);

	error = ztest_replay_write(zd, lr, B_FALSE);

	ztest_lr_free(lr, sizeof (*lr) + size, NULL);

	return (error);
}

static int
ztest_truncate(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size)
{
	lr_truncate_t *lr;
	int error;

	lr = ztest_lr_alloc(sizeof (*lr), NULL);

	lr->lr_foid = object;
	lr->lr_offset = offset;
	lr->lr_length = size;

	error = ztest_replay_truncate(zd, lr, B_FALSE);

	ztest_lr_free(lr, sizeof (*lr), NULL);

	return (error);
}

static int
ztest_setattr(ztest_ds_t *zd, uint64_t object)
{
	lr_setattr_t *lr;
	int error;

	lr = ztest_lr_alloc(sizeof (*lr), NULL);

	lr->lr_foid = object;
	lr->lr_size = 0;
	lr->lr_mode = 0;

	error = ztest_replay_setattr(zd, lr, B_FALSE);

	ztest_lr_free(lr, sizeof (*lr), NULL);

	return (error);
}

static void
ztest_prealloc(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size)
{
	objset_t *os = zd->zd_os;
	dmu_tx_t *tx;
	uint64_t txg;
	rl_t *rl;

	txg_wait_synced(dmu_objset_pool(os), 0);

	ztest_object_lock(zd, object, RL_READER);
	rl = ztest_range_lock(zd, object, offset, size, RL_WRITER);

	tx = dmu_tx_create(os);

	dmu_tx_hold_write(tx, object, offset, size);

	txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);

	if (txg != 0) {
		dmu_prealloc(os, object, offset, size, tx);
		dmu_tx_commit(tx);
		txg_wait_synced(dmu_objset_pool(os), txg);
	} else {
		(void) dmu_free_long_range(os, object, offset, size);
	}

	ztest_range_unlock(rl);
	ztest_object_unlock(zd, object);
}
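/*
 * Perform a random i/o to the given object at the given offset, biased
 * toward tagged writes so that later reads have embedded block tags to
 * verify.
 */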
static void
ztest_io(ztest_ds_t *zd, uint64_t object, uint64_t offset)
{
	ztest_block_tag_t wbt;
	dmu_object_info_t doi;
	enum ztest_io_type io_type;
	uint64_t blocksize;
	void *data;

	VERIFY(dmu_object_info(zd->zd_os, object, &doi) == 0);
	blocksize = doi.doi_data_block_size;
	data = umem_alloc(blocksize, UMEM_NOFAIL);

	/*
	 * Pick an i/o type at random, biased toward writing block tags.
	 */
	io_type = ztest_random(ZTEST_IO_TYPES);
	if (ztest_random(2) == 0)
		io_type = ZTEST_IO_WRITE_TAG;

	switch (io_type) {

	case ZTEST_IO_WRITE_TAG:
		ztest_bt_generate(&wbt, zd->zd_os, object, offset, 0, 0, 0);
		(void) ztest_write(zd, object, offset, sizeof (wbt), &wbt);
		break;

	case ZTEST_IO_WRITE_PATTERN:
		(void) memset(data, 'a' + (object + offset) % 5, blocksize);
		if (ztest_random(2) == 0) {
			/*
			 * Induce fletcher2 collisions to ensure that
			 * zio_ddt_collision() detects and resolves them
			 * when using fletcher2-verify for deduplication.
			 */
			((uint64_t *)data)[0] ^= 1ULL << 63;
			((uint64_t *)data)[4] ^= 1ULL << 63;
		}
		(void) ztest_write(zd, object, offset, blocksize, data);
		break;

	case ZTEST_IO_WRITE_ZEROES:
		bzero(data, blocksize);
		(void) ztest_write(zd, object, offset, blocksize, data);
		break;

	case ZTEST_IO_TRUNCATE:
		(void) ztest_truncate(zd, object, offset, blocksize);
		break;

	case ZTEST_IO_SETATTR:
		(void) ztest_setattr(zd, object);
		break;
	}

	umem_free(data, blocksize);
}

/*
 * Initialize an object description template.
 */
static void
ztest_od_init(ztest_od_t *od, uint64_t id, char *tag, uint64_t index,
    dmu_object_type_t type, uint64_t blocksize, uint64_t gen)
{
	od->od_dir = ZTEST_DIROBJ;
	od->od_object = 0;

	od->od_crtype = type;
	od->od_crblocksize = blocksize ? blocksize : ztest_random_blocksize();
	od->od_crgen = gen;

	od->od_type = DMU_OT_NONE;
	od->od_blocksize = 0;
	od->od_gen = 0;

	(void) snprintf(od->od_name, sizeof (od->od_name), "%s(%lld)[%llu]",
	    tag, (int64_t)id, index);
}

/*
 * Lookup or create the objects for a test using the od template.
 * If the objects do not all exist, or if 'remove' is specified,
 * remove any existing objects and create new ones.  Otherwise,
 * use the existing objects.
 */
static int
ztest_object_init(ztest_ds_t *zd, ztest_od_t *od, size_t size, boolean_t remove)
{
	int count = size / sizeof (*od);
	int rv = 0;

	VERIFY(mutex_lock(&zd->zd_dirobj_lock) == 0);
	if ((ztest_lookup(zd, od, count) != 0 || remove) &&
	    (ztest_remove(zd, od, count) != 0 ||
	    ztest_create(zd, od, count) != 0))
		rv = -1;
	zd->zd_od = od;
	VERIFY(mutex_unlock(&zd->zd_dirobj_lock) == 0);

	return (rv);
}

/* ARGSUSED */
void
ztest_zil_commit(ztest_ds_t *zd, uint64_t id)
{
	zilog_t *zilog = zd->zd_zilog;

	zil_commit(zilog, UINT64_MAX, ztest_random(ZTEST_OBJECTS));

	/*
	 * Remember the committed values in zd, which is in parent/child
	 * shared memory.  If we die, the next iteration of ztest_run()
	 * will verify that the log really does contain this record.
	 */
	mutex_enter(&zilog->zl_lock);
	ASSERT(zd->zd_seq <= zilog->zl_commit_lr_seq);
	zd->zd_seq = zilog->zl_commit_lr_seq;
	mutex_exit(&zilog->zl_lock);
}
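/*
 * A typical object-based test first materializes its template and then
 * operates on the resulting objects (sketch):
 *
 *	ztest_od_t od[1];
 *
 *	ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0);
 *	if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0)
 *		return;
 *	ztest_io(zd, od[0].od_object, 0);
 */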
/*
 * Verify that we can't destroy an active pool, create an existing pool,
 * or create a pool with a bad vdev spec.
 */
/* ARGSUSED */
void
ztest_spa_create_destroy(ztest_ds_t *zd, uint64_t id)
{
	ztest_shared_t *zs = ztest_shared;
	spa_t *spa;
	nvlist_t *nvroot;

	/*
	 * Attempt to create using a bad file.
	 */
	nvroot = make_vdev_root("/dev/bogus", NULL, 0, 0, 0, 0, 0, 1);
	VERIFY3U(ENOENT, ==,
	    spa_create("ztest_bad_file", nvroot, NULL, NULL, NULL));
	nvlist_free(nvroot);

	/*
	 * Attempt to create using a bad mirror.
	 */
	nvroot = make_vdev_root("/dev/bogus", NULL, 0, 0, 0, 0, 2, 1);
	VERIFY3U(ENOENT, ==,
	    spa_create("ztest_bad_mirror", nvroot, NULL, NULL, NULL));
	nvlist_free(nvroot);

	/*
	 * Attempt to create an existing pool.  It shouldn't matter
	 * what's in the nvroot; we should fail with EEXIST.
	 */
	(void) rw_rdlock(&zs->zs_name_lock);
	nvroot = make_vdev_root("/dev/bogus", NULL, 0, 0, 0, 0, 0, 1);
	VERIFY3U(EEXIST, ==, spa_create(zs->zs_pool, nvroot, NULL, NULL, NULL));
	nvlist_free(nvroot);
	VERIFY3U(0, ==, spa_open(zs->zs_pool, &spa, FTAG));
	VERIFY3U(EBUSY, ==, spa_destroy(zs->zs_pool));
	spa_close(spa, FTAG);

	(void) rw_unlock(&zs->zs_name_lock);
}

static vdev_t *
vdev_lookup_by_path(vdev_t *vd, const char *path)
{
	vdev_t *mvd;

	if (vd->vdev_path != NULL && strcmp(path, vd->vdev_path) == 0)
		return (vd);

	for (int c = 0; c < vd->vdev_children; c++)
		if ((mvd = vdev_lookup_by_path(vd->vdev_child[c], path)) !=
		    NULL)
			return (mvd);

	return (NULL);
}

/*
 * Find the first available hole which can be used as a top-level.
 */
int
find_vdev_hole(spa_t *spa)
{
	vdev_t *rvd = spa->spa_root_vdev;
	int c;

	ASSERT(spa_config_held(spa, SCL_VDEV, RW_READER) == SCL_VDEV);

	for (c = 0; c < rvd->vdev_children; c++) {
		vdev_t *cvd = rvd->vdev_child[c];

		if (cvd->vdev_ishole)
			break;
	}
	return (c);
}

/*
 * Verify that vdev_add() works as expected.
 */
/* ARGSUSED */
void
ztest_vdev_add_remove(ztest_ds_t *zd, uint64_t id)
{
	ztest_shared_t *zs = ztest_shared;
	spa_t *spa = zs->zs_spa;
	uint64_t leaves = MAX(zopt_mirrors, 1) * zopt_raidz;
	uint64_t guid;
	nvlist_t *nvroot;
	int error;

	VERIFY(mutex_lock(&zs->zs_vdev_lock) == 0);

	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);

	ztest_shared->zs_vdev_next_leaf = find_vdev_hole(spa) * leaves;

	/*
	 * If we have slogs then remove them 1/4 of the time.
	 */
	if (spa_has_slogs(spa) && ztest_random(4) == 0) {
		/*
		 * Grab the guid from the head of the log class rotor.
		 */
		guid = spa_log_class(spa)->mc_rotor->mg_vd->vdev_guid;

		spa_config_exit(spa, SCL_VDEV, FTAG);

		/*
		 * We have to grab the zs_name_lock as writer to
		 * prevent a race between removing a slog (dmu_objset_find)
		 * and destroying a dataset.  Removing the slog will
		 * grab a reference on the dataset which may cause
		 * dmu_objset_destroy() to fail with EBUSY thus
		 * leaving the dataset in an inconsistent state.
		 */
		VERIFY(rw_wrlock(&ztest_shared->zs_name_lock) == 0);
		error = spa_vdev_remove(spa, guid, B_FALSE);
		VERIFY(rw_unlock(&ztest_shared->zs_name_lock) == 0);

		if (error && error != EEXIST)
			fatal(0, "spa_vdev_remove() = %d", error);
	} else {
		spa_config_exit(spa, SCL_VDEV, FTAG);

		/*
		 * Make 1/4 of the devices be log devices.
		 */
		nvroot = make_vdev_root(NULL, NULL, zopt_vdev_size, 0,
		    ztest_random(4) == 0, zopt_raidz, zopt_mirrors, 1);

		error = spa_vdev_add(spa, nvroot);
		nvlist_free(nvroot);

		if (error == ENOSPC)
			ztest_record_enospc("spa_vdev_add");
		else if (error != 0)
			fatal(0, "spa_vdev_add() = %d", error);
	}

	VERIFY(mutex_unlock(&ztest_shared->zs_vdev_lock) == 0);
}
2197 */ 2198 /* ARGSUSED */ 2199 void 2200 ztest_vdev_aux_add_remove(ztest_ds_t *zd, uint64_t id) 2201 { 2202 ztest_shared_t *zs = ztest_shared; 2203 spa_t *spa = zs->zs_spa; 2204 vdev_t *rvd = spa->spa_root_vdev; 2205 spa_aux_vdev_t *sav; 2206 char *aux; 2207 uint64_t guid = 0; 2208 int error; 2209 2210 if (ztest_random(2) == 0) { 2211 sav = &spa->spa_spares; 2212 aux = ZPOOL_CONFIG_SPARES; 2213 } else { 2214 sav = &spa->spa_l2cache; 2215 aux = ZPOOL_CONFIG_L2CACHE; 2216 } 2217 2218 VERIFY(mutex_lock(&zs->zs_vdev_lock) == 0); 2219 2220 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 2221 2222 if (sav->sav_count != 0 && ztest_random(4) == 0) { 2223 /* 2224 * Pick a random device to remove. 2225 */ 2226 guid = sav->sav_vdevs[ztest_random(sav->sav_count)]->vdev_guid; 2227 } else { 2228 /* 2229 * Find an unused device we can add. 2230 */ 2231 zs->zs_vdev_aux = 0; 2232 for (;;) { 2233 char path[MAXPATHLEN]; 2234 int c; 2235 (void) sprintf(path, ztest_aux_template, zopt_dir, 2236 zopt_pool, aux, zs->zs_vdev_aux); 2237 for (c = 0; c < sav->sav_count; c++) 2238 if (strcmp(sav->sav_vdevs[c]->vdev_path, 2239 path) == 0) 2240 break; 2241 if (c == sav->sav_count && 2242 vdev_lookup_by_path(rvd, path) == NULL) 2243 break; 2244 zs->zs_vdev_aux++; 2245 } 2246 } 2247 2248 spa_config_exit(spa, SCL_VDEV, FTAG); 2249 2250 if (guid == 0) { 2251 /* 2252 * Add a new device. 2253 */ 2254 nvlist_t *nvroot = make_vdev_root(NULL, aux, 2255 (zopt_vdev_size * 5) / 4, 0, 0, 0, 0, 1); 2256 error = spa_vdev_add(spa, nvroot); 2257 if (error != 0) 2258 fatal(0, "spa_vdev_add(%p) = %d", nvroot, error); 2259 nvlist_free(nvroot); 2260 } else { 2261 /* 2262 * Remove an existing device. Sometimes, dirty its 2263 * vdev state first to make sure we handle removal 2264 * of devices that have pending state changes. 2265 */ 2266 if (ztest_random(2) == 0) 2267 (void) vdev_online(spa, guid, 0, NULL); 2268 2269 error = spa_vdev_remove(spa, guid, B_FALSE); 2270 if (error != 0 && error != EBUSY) 2271 fatal(0, "spa_vdev_remove(%llu) = %d", guid, error); 2272 } 2273 2274 VERIFY(mutex_unlock(&zs->zs_vdev_lock) == 0); 2275 } 2276 2277 /* 2278 * Verify that we can attach and detach devices. 2279 */ 2280 /* ARGSUSED */ 2281 void 2282 ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id) 2283 { 2284 ztest_shared_t *zs = ztest_shared; 2285 spa_t *spa = zs->zs_spa; 2286 spa_aux_vdev_t *sav = &spa->spa_spares; 2287 vdev_t *rvd = spa->spa_root_vdev; 2288 vdev_t *oldvd, *newvd, *pvd; 2289 nvlist_t *root; 2290 uint64_t leaves = MAX(zopt_mirrors, 1) * zopt_raidz; 2291 uint64_t leaf, top; 2292 uint64_t ashift = ztest_get_ashift(); 2293 uint64_t oldguid, pguid; 2294 size_t oldsize, newsize; 2295 char oldpath[MAXPATHLEN], newpath[MAXPATHLEN]; 2296 int replacing; 2297 int oldvd_has_siblings = B_FALSE; 2298 int newvd_is_spare = B_FALSE; 2299 int oldvd_is_log; 2300 int error, expected_error; 2301 2302 VERIFY(mutex_lock(&zs->zs_vdev_lock) == 0); 2303 2304 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 2305 2306 /* 2307 * Decide whether to do an attach or a replace. 2308 */ 2309 replacing = ztest_random(2); 2310 2311 /* 2312 * Pick a random top-level vdev. 2313 */ 2314 top = ztest_random_vdev_top(spa, B_TRUE); 2315 2316 /* 2317 * Pick a random leaf within it. 2318 */ 2319 leaf = ztest_random(leaves); 2320 2321 /* 2322 * Locate this vdev. 
2323 */ 2324 oldvd = rvd->vdev_child[top]; 2325 if (zopt_mirrors >= 1) { 2326 ASSERT(oldvd->vdev_ops == &vdev_mirror_ops); 2327 ASSERT(oldvd->vdev_children >= zopt_mirrors); 2328 oldvd = oldvd->vdev_child[leaf / zopt_raidz]; 2329 } 2330 if (zopt_raidz > 1) { 2331 ASSERT(oldvd->vdev_ops == &vdev_raidz_ops); 2332 ASSERT(oldvd->vdev_children == zopt_raidz); 2333 oldvd = oldvd->vdev_child[leaf % zopt_raidz]; 2334 } 2335 2336 /* 2337 * If we're already doing an attach or replace, oldvd may be a 2338 * mirror vdev -- in which case, pick a random child. 2339 */ 2340 while (oldvd->vdev_children != 0) { 2341 oldvd_has_siblings = B_TRUE; 2342 ASSERT(oldvd->vdev_children >= 2); 2343 oldvd = oldvd->vdev_child[ztest_random(oldvd->vdev_children)]; 2344 } 2345 2346 oldguid = oldvd->vdev_guid; 2347 oldsize = vdev_get_min_asize(oldvd); 2348 oldvd_is_log = oldvd->vdev_top->vdev_islog; 2349 (void) strcpy(oldpath, oldvd->vdev_path); 2350 pvd = oldvd->vdev_parent; 2351 pguid = pvd->vdev_guid; 2352 2353 /* 2354 * If oldvd has siblings, then half of the time, detach it. 2355 */ 2356 if (oldvd_has_siblings && ztest_random(2) == 0) { 2357 spa_config_exit(spa, SCL_VDEV, FTAG); 2358 error = spa_vdev_detach(spa, oldguid, pguid, B_FALSE); 2359 if (error != 0 && error != ENODEV && error != EBUSY && 2360 error != ENOTSUP) 2361 fatal(0, "detach (%s) returned %d", oldpath, error); 2362 VERIFY(mutex_unlock(&zs->zs_vdev_lock) == 0); 2363 return; 2364 } 2365 2366 /* 2367 * For the new vdev, choose with equal probability between the two 2368 * standard paths (ending in either 'a' or 'b') or a random hot spare. 2369 */ 2370 if (sav->sav_count != 0 && ztest_random(3) == 0) { 2371 newvd = sav->sav_vdevs[ztest_random(sav->sav_count)]; 2372 newvd_is_spare = B_TRUE; 2373 (void) strcpy(newpath, newvd->vdev_path); 2374 } else { 2375 (void) snprintf(newpath, sizeof (newpath), ztest_dev_template, 2376 zopt_dir, zopt_pool, top * leaves + leaf); 2377 if (ztest_random(2) == 0) 2378 newpath[strlen(newpath) - 1] = 'b'; 2379 newvd = vdev_lookup_by_path(rvd, newpath); 2380 } 2381 2382 if (newvd) { 2383 newsize = vdev_get_min_asize(newvd); 2384 } else { 2385 /* 2386 * Make newsize a little bigger or smaller than oldsize. 2387 * If it's smaller, the attach should fail. 2388 * If it's larger, and we're doing a replace, 2389 * we should get dynamic LUN growth when we're done. 2390 */ 2391 newsize = 10 * oldsize / (9 + ztest_random(3)); 2392 } 2393 2394 /* 2395 * If pvd is not a mirror or root, the attach should fail with ENOTSUP, 2396 * unless it's a replace; in that case any non-replacing parent is OK. 2397 * 2398 * If newvd is already part of the pool, it should fail with EBUSY. 2399 * 2400 * If newvd is too small, it should fail with EOVERFLOW. 2401 */ 2402 if (pvd->vdev_ops != &vdev_mirror_ops && 2403 pvd->vdev_ops != &vdev_root_ops && (!replacing || 2404 pvd->vdev_ops == &vdev_replacing_ops || 2405 pvd->vdev_ops == &vdev_spare_ops)) 2406 expected_error = ENOTSUP; 2407 else if (newvd_is_spare && (!replacing || oldvd_is_log)) 2408 expected_error = ENOTSUP; 2409 else if (newvd == oldvd) 2410 expected_error = replacing ? 0 : EBUSY; 2411 else if (vdev_lookup_by_path(rvd, newpath) != NULL) 2412 expected_error = EBUSY; 2413 else if (newsize < oldsize) 2414 expected_error = EOVERFLOW; 2415 else if (ashift > oldvd->vdev_top->vdev_ashift) 2416 expected_error = EDOM; 2417 else 2418 expected_error = 0; 2419 2420 spa_config_exit(spa, SCL_VDEV, FTAG); 2421 2422 /* 2423 * Build the nvlist describing newpath. 
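	 *
	 * For reference, the argument order of make_vdev_root(), as
	 * inferred from the call sites in this file (not its actual
	 * prototype), is (path, aux, size, ashift, log, raidz,
	 * mirrors, toplevels); a size of 0 reuses whatever backing
	 * file already exists at 'path'. So the call below builds a
	 * single plain vdev:
	 *
	 *	root = make_vdev_root(newpath, NULL,
	 *	    newvd == NULL ? newsize : 0, ashift, 0, 0, 0, 1);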
2424 */ 2425 root = make_vdev_root(newpath, NULL, newvd == NULL ? newsize : 0, 2426 ashift, 0, 0, 0, 1); 2427 2428 error = spa_vdev_attach(spa, oldguid, root, replacing); 2429 2430 nvlist_free(root); 2431 2432 /* 2433 * If our parent was the replacing vdev, but the replace completed, 2434 * then instead of failing with ENOTSUP we may either succeed, 2435 * fail with ENODEV, or fail with EOVERFLOW. 2436 */ 2437 if (expected_error == ENOTSUP && 2438 (error == 0 || error == ENODEV || error == EOVERFLOW)) 2439 expected_error = error; 2440 2441 /* 2442 * If someone grew the LUN, the replacement may be too small. 2443 */ 2444 if (error == EOVERFLOW || error == EBUSY) 2445 expected_error = error; 2446 2447 /* XXX workaround 6690467 */ 2448 if (error != expected_error && expected_error != EBUSY) { 2449 fatal(0, "attach (%s %llu, %s %llu, %d) " 2450 "returned %d, expected %d", 2451 oldpath, (longlong_t)oldsize, newpath, 2452 (longlong_t)newsize, replacing, error, expected_error); 2453 } 2454 2455 VERIFY(mutex_unlock(&zs->zs_vdev_lock) == 0); 2456 } 2457 2458 /* 2459 * Callback function which expands the physical size of the vdev. 2460 */ 2461 vdev_t * 2462 grow_vdev(vdev_t *vd, void *arg) 2463 { 2464 spa_t *spa = vd->vdev_spa; 2465 size_t *newsize = arg; 2466 size_t fsize; 2467 int fd; 2468 2469 ASSERT(spa_config_held(spa, SCL_STATE, RW_READER) == SCL_STATE); 2470 ASSERT(vd->vdev_ops->vdev_op_leaf); 2471 2472 if ((fd = open(vd->vdev_path, O_RDWR)) == -1) 2473 return (vd); 2474 2475 fsize = lseek(fd, 0, SEEK_END); 2476 (void) ftruncate(fd, *newsize); 2477 2478 if (zopt_verbose >= 6) { 2479 (void) printf("%s grew from %lu to %lu bytes\n", 2480 vd->vdev_path, (ulong_t)fsize, (ulong_t)*newsize); 2481 } 2482 (void) close(fd); 2483 return (NULL); 2484 } 2485 2486 /* 2487 * Callback function which expands a given vdev by calling vdev_online(). 2488 */ 2489 /* ARGSUSED */ 2490 vdev_t * 2491 online_vdev(vdev_t *vd, void *arg) 2492 { 2493 spa_t *spa = vd->vdev_spa; 2494 vdev_t *tvd = vd->vdev_top; 2495 uint64_t guid = vd->vdev_guid; 2496 uint64_t generation = spa->spa_config_generation + 1; 2497 vdev_state_t newstate = VDEV_STATE_UNKNOWN; 2498 int error; 2499 2500 ASSERT(spa_config_held(spa, SCL_STATE, RW_READER) == SCL_STATE); 2501 ASSERT(vd->vdev_ops->vdev_op_leaf); 2502 2503 /* Calling vdev_online will initialize the new metaslabs */ 2504 spa_config_exit(spa, SCL_STATE, spa); 2505 error = vdev_online(spa, guid, ZFS_ONLINE_EXPAND, &newstate); 2506 spa_config_enter(spa, SCL_STATE, spa, RW_READER); 2507 2508 /* 2509 * If vdev_online returned an error or the underlying vdev_open 2510 * failed then we abort the expand. The only way to know that 2511 * vdev_open fails is by checking the returned newstate. 2512 */ 2513 if (error || newstate != VDEV_STATE_HEALTHY) { 2514 if (zopt_verbose >= 5) { 2515 (void) printf("Unable to expand vdev, state %llu, " 2516 "error %d\n", (u_longlong_t)newstate, error); 2517 } 2518 return (vd); 2519 } 2520 ASSERT3U(newstate, ==, VDEV_STATE_HEALTHY); 2521 2522 /* 2523 * Since we dropped the lock we need to ensure that we're 2524 * still talking to the original vdev. It's possible this 2525 * vdev may have been detached/replaced while we were 2526 * trying to online it. 
2527 */ 2528 if (generation != spa->spa_config_generation) { 2529 if (zopt_verbose >= 5) { 2530 (void) printf("vdev configuration has changed, " 2531 "guid %llu, state %llu, expected gen %llu, " 2532 "got gen %llu\n", 2533 (u_longlong_t)guid, 2534 (u_longlong_t)tvd->vdev_state, 2535 (u_longlong_t)generation, 2536 (u_longlong_t)spa->spa_config_generation); 2537 } 2538 return (vd); 2539 } 2540 return (NULL); 2541 } 2542 2543 /* 2544 * Traverse the vdev tree calling the supplied function. 2545 * We continue to walk the tree until we either have walked all 2546 * children or we receive a non-NULL return from the callback. 2547 * If a NULL callback is passed, then we just return back the first 2548 * leaf vdev we encounter. 2549 */ 2550 vdev_t * 2551 vdev_walk_tree(vdev_t *vd, vdev_t *(*func)(vdev_t *, void *), void *arg) 2552 { 2553 if (vd->vdev_ops->vdev_op_leaf) { 2554 if (func == NULL) 2555 return (vd); 2556 else 2557 return (func(vd, arg)); 2558 } 2559 2560 for (uint_t c = 0; c < vd->vdev_children; c++) { 2561 vdev_t *cvd = vd->vdev_child[c]; 2562 if ((cvd = vdev_walk_tree(cvd, func, arg)) != NULL) 2563 return (cvd); 2564 } 2565 return (NULL); 2566 } 2567 2568 /* 2569 * Verify that dynamic LUN growth works as expected. 2570 */ 2571 /* ARGSUSED */ 2572 void 2573 ztest_vdev_LUN_growth(ztest_ds_t *zd, uint64_t id) 2574 { 2575 ztest_shared_t *zs = ztest_shared; 2576 spa_t *spa = zs->zs_spa; 2577 vdev_t *vd, *tvd; 2578 metaslab_class_t *mc; 2579 metaslab_group_t *mg; 2580 size_t psize, newsize; 2581 uint64_t top; 2582 uint64_t old_class_space, new_class_space, old_ms_count, new_ms_count; 2583 2584 VERIFY(mutex_lock(&zs->zs_vdev_lock) == 0); 2585 spa_config_enter(spa, SCL_STATE, spa, RW_READER); 2586 2587 top = ztest_random_vdev_top(spa, B_TRUE); 2588 2589 tvd = spa->spa_root_vdev->vdev_child[top]; 2590 mg = tvd->vdev_mg; 2591 mc = mg->mg_class; 2592 old_ms_count = tvd->vdev_ms_count; 2593 old_class_space = metaslab_class_get_space(mc); 2594 2595 /* 2596 * Determine the size of the first leaf vdev associated with 2597 * our top-level device. 2598 */ 2599 vd = vdev_walk_tree(tvd, NULL, NULL); 2600 ASSERT3P(vd, !=, NULL); 2601 ASSERT(vd->vdev_ops->vdev_op_leaf); 2602 2603 psize = vd->vdev_psize; 2604 2605 /* 2606 * We only try to expand the vdev if it's healthy, less than 4x its 2607 * original size, and it has a valid psize. 2608 */ 2609 if (tvd->vdev_state != VDEV_STATE_HEALTHY || 2610 psize == 0 || psize >= 4 * zopt_vdev_size) { 2611 spa_config_exit(spa, SCL_STATE, spa); 2612 VERIFY(mutex_unlock(&zs->zs_vdev_lock) == 0); 2613 return; 2614 } 2615 ASSERT(psize > 0); 2616 newsize = psize + psize / 8; 2617 ASSERT3U(newsize, >, psize); 2618 2619 if (zopt_verbose >= 6) { 2620 (void) printf("Expanding LUN %s from %lu to %lu\n", 2621 vd->vdev_path, (ulong_t)psize, (ulong_t)newsize); 2622 } 2623 2624 /* 2625 * Growing the vdev is a two step process: 2626 * 1). expand the physical size (i.e. relabel) 2627 * 2). 
online the vdev to create the new metaslabs 2628 */ 2629 if (vdev_walk_tree(tvd, grow_vdev, &newsize) != NULL || 2630 vdev_walk_tree(tvd, online_vdev, NULL) != NULL || 2631 tvd->vdev_state != VDEV_STATE_HEALTHY) { 2632 if (zopt_verbose >= 5) { 2633 (void) printf("Could not expand LUN because " 2634 "the vdev configuration changed.\n"); 2635 } 2636 spa_config_exit(spa, SCL_STATE, spa); 2637 VERIFY(mutex_unlock(&zs->zs_vdev_lock) == 0); 2638 return; 2639 } 2640 2641 spa_config_exit(spa, SCL_STATE, spa); 2642 2643 /* 2644 * Expanding the LUN will update the config asynchronously, 2645 * thus we must wait for the async thread to complete any 2646 * pending tasks before proceeding. 2647 */ 2648 for (;;) { 2649 boolean_t done; 2650 mutex_enter(&spa->spa_async_lock); 2651 done = (spa->spa_async_thread == NULL && !spa->spa_async_tasks); 2652 mutex_exit(&spa->spa_async_lock); 2653 if (done) 2654 break; 2655 txg_wait_synced(spa_get_dsl(spa), 0); 2656 (void) poll(NULL, 0, 100); 2657 } 2658 2659 spa_config_enter(spa, SCL_STATE, spa, RW_READER); 2660 2661 tvd = spa->spa_root_vdev->vdev_child[top]; 2662 new_ms_count = tvd->vdev_ms_count; 2663 new_class_space = metaslab_class_get_space(mc); 2664 2665 if (tvd->vdev_mg != mg || mg->mg_class != mc) { 2666 if (zopt_verbose >= 5) { 2667 (void) printf("Could not verify LUN expansion due to " 2668 "intervening vdev offline or remove.\n"); 2669 } 2670 spa_config_exit(spa, SCL_STATE, spa); 2671 VERIFY(mutex_unlock(&zs->zs_vdev_lock) == 0); 2672 return; 2673 } 2674 2675 /* 2676 * Make sure we were able to grow the vdev. 2677 */ 2678 if (new_ms_count <= old_ms_count) 2679 fatal(0, "LUN expansion failed: ms_count %llu <= %llu\n", 2680 old_ms_count, new_ms_count); 2681 2682 /* 2683 * Make sure we were able to grow the pool. 2684 */ 2685 if (new_class_space <= old_class_space) 2686 fatal(0, "LUN expansion failed: class_space %llu <= %llu\n", 2687 old_class_space, new_class_space); 2688 2689 if (zopt_verbose >= 5) { 2690 char oldnumbuf[6], newnumbuf[6]; 2691 2692 nicenum(old_class_space, oldnumbuf); 2693 nicenum(new_class_space, newnumbuf); 2694 (void) printf("%s grew from %s to %s\n", 2695 spa->spa_name, oldnumbuf, newnumbuf); 2696 } 2697 2698 spa_config_exit(spa, SCL_STATE, spa); 2699 VERIFY(mutex_unlock(&zs->zs_vdev_lock) == 0); 2700 } 2701 2702 /* 2703 * Verify that dmu_objset_{create,destroy,open,close} work as expected. 2704 */ 2705 /* ARGSUSED */ 2706 static void 2707 ztest_objset_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx) 2708 { 2709 /* 2710 * Create the objects common to all ztest datasets. 2711 */ 2712 VERIFY(zap_create_claim(os, ZTEST_DIROBJ, 2713 DMU_OT_ZAP_OTHER, DMU_OT_NONE, 0, tx) == 0); 2714 } 2715 2716 /* ARGSUSED */ 2717 static int 2718 ztest_objset_destroy_cb(char *name, void *arg) 2719 { 2720 objset_t *os; 2721 dmu_object_info_t doi; 2722 int error; 2723 2724 /* 2725 * Verify that the dataset contains a directory object. 2726 */ 2727 VERIFY3U(0, ==, dmu_objset_hold(name, FTAG, &os)); 2728 error = dmu_object_info(os, ZTEST_DIROBJ, &doi); 2729 if (error != ENOENT) { 2730 /* We could have crashed in the middle of destroying it */ 2731 ASSERT3U(error, ==, 0); 2732 ASSERT3U(doi.doi_type, ==, DMU_OT_ZAP_OTHER); 2733 ASSERT3S(doi.doi_physical_blocks_512, >=, 0); 2734 } 2735 dmu_objset_rele(os, FTAG); 2736 2737 /* 2738 * Destroy the dataset. 
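	 *
	 * B_FALSE here requests an immediate rather than deferred
	 * destroy, so the name should be gone from the namespace as
	 * soon as this returns (the caller verifies exactly that).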
2739 */ 2740 VERIFY3U(0, ==, dmu_objset_destroy(name, B_FALSE)); 2741 return (0); 2742 } 2743 2744 static boolean_t 2745 ztest_snapshot_create(char *osname, uint64_t id) 2746 { 2747 char snapname[MAXNAMELEN]; 2748 int error; 2749 2750 (void) snprintf(snapname, MAXNAMELEN, "%s@%llu", osname, 2751 (u_longlong_t)id); 2752 2753 error = dmu_objset_snapshot(osname, strchr(snapname, '@') + 1, 2754 NULL, B_FALSE); 2755 if (error == ENOSPC) { 2756 ztest_record_enospc(FTAG); 2757 return (B_FALSE); 2758 } 2759 if (error != 0 && error != EEXIST) 2760 fatal(0, "ztest_snapshot_create(%s) = %d", snapname, error); 2761 return (B_TRUE); 2762 } 2763 2764 static boolean_t 2765 ztest_snapshot_destroy(char *osname, uint64_t id) 2766 { 2767 char snapname[MAXNAMELEN]; 2768 int error; 2769 2770 (void) snprintf(snapname, MAXNAMELEN, "%s@%llu", osname, 2771 (u_longlong_t)id); 2772 2773 error = dmu_objset_destroy(snapname, B_FALSE); 2774 if (error != 0 && error != ENOENT) 2775 fatal(0, "ztest_snapshot_destroy(%s) = %d", snapname, error); 2776 return (B_TRUE); 2777 } 2778 2779 /* ARGSUSED */ 2780 void 2781 ztest_dmu_objset_create_destroy(ztest_ds_t *zd, uint64_t id) 2782 { 2783 ztest_shared_t *zs = ztest_shared; 2784 ztest_ds_t zdtmp; 2785 int iters; 2786 int error; 2787 objset_t *os, *os2; 2788 char name[MAXNAMELEN]; 2789 zilog_t *zilog; 2790 2791 (void) rw_rdlock(&zs->zs_name_lock); 2792 2793 (void) snprintf(name, MAXNAMELEN, "%s/temp_%llu", 2794 zs->zs_pool, (u_longlong_t)id); 2795 2796 /* 2797 * If this dataset exists from a previous run, process its replay log 2798 * half of the time. If we don't replay it, then dmu_objset_destroy() 2799 * (invoked from ztest_objset_destroy_cb()) should just throw it away. 2800 */ 2801 if (ztest_random(2) == 0 && 2802 dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, FTAG, &os) == 0) { 2803 ztest_zd_init(&zdtmp, os); 2804 zil_replay(os, &zdtmp, ztest_replay_vector); 2805 ztest_zd_fini(&zdtmp); 2806 dmu_objset_disown(os, FTAG); 2807 } 2808 2809 /* 2810 * There may be an old instance of the dataset we're about to 2811 * create lying around from a previous run. If so, destroy it 2812 * and all of its snapshots. 2813 */ 2814 (void) dmu_objset_find(name, ztest_objset_destroy_cb, NULL, 2815 DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS); 2816 2817 /* 2818 * Verify that the destroyed dataset is no longer in the namespace. 2819 */ 2820 VERIFY3U(ENOENT, ==, dmu_objset_hold(name, FTAG, &os)); 2821 2822 /* 2823 * Verify that we can create a new dataset. 2824 */ 2825 error = dmu_objset_create(name, DMU_OST_OTHER, 0, 2826 ztest_objset_create_cb, NULL); 2827 if (error) { 2828 if (error == ENOSPC) { 2829 ztest_record_enospc(FTAG); 2830 (void) rw_unlock(&zs->zs_name_lock); 2831 return; 2832 } 2833 fatal(0, "dmu_objset_create(%s) = %d", name, error); 2834 } 2835 2836 VERIFY3U(0, ==, 2837 dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, FTAG, &os)); 2838 2839 ztest_zd_init(&zdtmp, os); 2840 2841 /* 2842 * Open the intent log for it. 2843 */ 2844 zilog = zil_open(os, ztest_get_data); 2845 2846 /* 2847 * Put some objects in there, do a little I/O to them, 2848 * and randomly take a couple of snapshots along the way. 2849 */ 2850 iters = ztest_random(5); 2851 for (int i = 0; i < iters; i++) { 2852 ztest_dmu_object_alloc_free(&zdtmp, id); 2853 if (ztest_random(iters) == 0) 2854 (void) ztest_snapshot_create(name, i); 2855 } 2856 2857 /* 2858 * Verify that we cannot create an existing dataset. 
2859 */ 2860 VERIFY3U(EEXIST, ==, 2861 dmu_objset_create(name, DMU_OST_OTHER, 0, NULL, NULL)); 2862 2863 /* 2864 * Verify that we can hold an objset that is also owned. 2865 */ 2866 VERIFY3U(0, ==, dmu_objset_hold(name, FTAG, &os2)); 2867 dmu_objset_rele(os2, FTAG); 2868 2869 /* 2870 * Verify that we cannot own an objset that is already owned. 2871 */ 2872 VERIFY3U(EBUSY, ==, 2873 dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, FTAG, &os2)); 2874 2875 zil_close(zilog); 2876 dmu_objset_disown(os, FTAG); 2877 ztest_zd_fini(&zdtmp); 2878 2879 (void) rw_unlock(&zs->zs_name_lock); 2880 } 2881 2882 /* 2883 * Verify that dmu_snapshot_{create,destroy,open,close} work as expected. 2884 */ 2885 void 2886 ztest_dmu_snapshot_create_destroy(ztest_ds_t *zd, uint64_t id) 2887 { 2888 ztest_shared_t *zs = ztest_shared; 2889 2890 (void) rw_rdlock(&zs->zs_name_lock); 2891 (void) ztest_snapshot_destroy(zd->zd_name, id); 2892 (void) ztest_snapshot_create(zd->zd_name, id); 2893 (void) rw_unlock(&zs->zs_name_lock); 2894 } 2895 2896 /* 2897 * Cleanup non-standard snapshots and clones. 2898 */ 2899 void 2900 ztest_dsl_dataset_cleanup(char *osname, uint64_t id) 2901 { 2902 char snap1name[MAXNAMELEN]; 2903 char clone1name[MAXNAMELEN]; 2904 char snap2name[MAXNAMELEN]; 2905 char clone2name[MAXNAMELEN]; 2906 char snap3name[MAXNAMELEN]; 2907 int error; 2908 2909 (void) snprintf(snap1name, MAXNAMELEN, "%s@s1_%llu", osname, id); 2910 (void) snprintf(clone1name, MAXNAMELEN, "%s/c1_%llu", osname, id); 2911 (void) snprintf(snap2name, MAXNAMELEN, "%s@s2_%llu", clone1name, id); 2912 (void) snprintf(clone2name, MAXNAMELEN, "%s/c2_%llu", osname, id); 2913 (void) snprintf(snap3name, MAXNAMELEN, "%s@s3_%llu", clone1name, id); 2914 2915 error = dmu_objset_destroy(clone2name, B_FALSE); 2916 if (error && error != ENOENT) 2917 fatal(0, "dmu_objset_destroy(%s) = %d", clone2name, error); 2918 error = dmu_objset_destroy(snap3name, B_FALSE); 2919 if (error && error != ENOENT) 2920 fatal(0, "dmu_objset_destroy(%s) = %d", snap3name, error); 2921 error = dmu_objset_destroy(snap2name, B_FALSE); 2922 if (error && error != ENOENT) 2923 fatal(0, "dmu_objset_destroy(%s) = %d", snap2name, error); 2924 error = dmu_objset_destroy(clone1name, B_FALSE); 2925 if (error && error != ENOENT) 2926 fatal(0, "dmu_objset_destroy(%s) = %d", clone1name, error); 2927 error = dmu_objset_destroy(snap1name, B_FALSE); 2928 if (error && error != ENOENT) 2929 fatal(0, "dmu_objset_destroy(%s) = %d", snap1name, error); 2930 } 2931 2932 /* 2933 * Verify dsl_dataset_promote handles EBUSY 2934 */ 2935 void 2936 ztest_dsl_dataset_promote_busy(ztest_ds_t *zd, uint64_t id) 2937 { 2938 ztest_shared_t *zs = ztest_shared; 2939 objset_t *clone; 2940 dsl_dataset_t *ds; 2941 char snap1name[MAXNAMELEN]; 2942 char clone1name[MAXNAMELEN]; 2943 char snap2name[MAXNAMELEN]; 2944 char clone2name[MAXNAMELEN]; 2945 char snap3name[MAXNAMELEN]; 2946 char *osname = zd->zd_name; 2947 int error; 2948 2949 (void) rw_rdlock(&zs->zs_name_lock); 2950 2951 ztest_dsl_dataset_cleanup(osname, id); 2952 2953 (void) snprintf(snap1name, MAXNAMELEN, "%s@s1_%llu", osname, id); 2954 (void) snprintf(clone1name, MAXNAMELEN, "%s/c1_%llu", osname, id); 2955 (void) snprintf(snap2name, MAXNAMELEN, "%s@s2_%llu", clone1name, id); 2956 (void) snprintf(clone2name, MAXNAMELEN, "%s/c2_%llu", osname, id); 2957 (void) snprintf(snap3name, MAXNAMELEN, "%s@s3_%llu", clone1name, id); 2958 2959 error = dmu_objset_snapshot(osname, strchr(snap1name, '@')+1, 2960 NULL, B_FALSE); 2961 if (error && error != EEXIST) { 2962 if (error == 
ENOSPC) {
			ztest_record_enospc(FTAG);
			goto out;
		}
		fatal(0, "dmu_objset_snapshot(%s) = %d", snap1name, error);
	}

	error = dmu_objset_hold(snap1name, FTAG, &clone);
	if (error)
		fatal(0, "dmu_objset_hold(%s) = %d", snap1name, error);

	error = dmu_objset_clone(clone1name, dmu_objset_ds(clone), 0);
	dmu_objset_rele(clone, FTAG);
	if (error) {
		if (error == ENOSPC) {
			ztest_record_enospc(FTAG);
			goto out;
		}
		fatal(0, "dmu_objset_clone(%s) = %d", clone1name, error);
	}

	error = dmu_objset_snapshot(clone1name, strchr(snap2name, '@')+1,
	    NULL, B_FALSE);
	if (error && error != EEXIST) {
		if (error == ENOSPC) {
			ztest_record_enospc(FTAG);
			goto out;
		}
		fatal(0, "dmu_objset_snapshot(%s) = %d", snap2name, error);
	}

	error = dmu_objset_snapshot(clone1name, strchr(snap3name, '@')+1,
	    NULL, B_FALSE);
	if (error && error != EEXIST) {
		if (error == ENOSPC) {
			ztest_record_enospc(FTAG);
			goto out;
		}
		fatal(0, "dmu_objset_snapshot(%s) = %d", snap3name, error);
	}

	error = dmu_objset_hold(snap3name, FTAG, &clone);
	if (error)
		fatal(0, "dmu_objset_hold(%s) = %d", snap3name, error);

	error = dmu_objset_clone(clone2name, dmu_objset_ds(clone), 0);
	dmu_objset_rele(clone, FTAG);
	if (error) {
		if (error == ENOSPC) {
			ztest_record_enospc(FTAG);
			goto out;
		}
		fatal(0, "dmu_objset_clone(%s) = %d", clone2name, error);
	}

	error = dsl_dataset_own(snap1name, B_FALSE, FTAG, &ds);
	if (error)
		fatal(0, "dsl_dataset_own(%s) = %d", snap1name, error);
	error = dsl_dataset_promote(clone2name, NULL);
	if (error != EBUSY)
		fatal(0, "dsl_dataset_promote(%s), %d, not EBUSY", clone2name,
		    error);
	dsl_dataset_disown(ds, FTAG);

out:
	ztest_dsl_dataset_cleanup(osname, id);

	(void) rw_unlock(&zs->zs_name_lock);
}

/*
 * Verify that dmu_object_{alloc,free} work as expected.
 */
void
ztest_dmu_object_alloc_free(ztest_ds_t *zd, uint64_t id)
{
	ztest_od_t od[4];
	int batchsize = sizeof (od) / sizeof (od[0]);

	for (int b = 0; b < batchsize; b++)
		ztest_od_init(&od[b], id, FTAG, b, DMU_OT_UINT64_OTHER, 0, 0);

	/*
	 * Destroy the previous batch of objects, create a new batch,
	 * and do some I/O on the new objects.
	 */
	if (ztest_object_init(zd, od, sizeof (od), B_TRUE) != 0)
		return;

	while (ztest_random(4 * batchsize) != 0)
		ztest_io(zd, od[ztest_random(batchsize)].od_object,
		    ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT);
}

/*
 * Verify that dmu_{read,write} work as expected.
 */
void
ztest_dmu_read_write(ztest_ds_t *zd, uint64_t id)
{
	objset_t *os = zd->zd_os;
	ztest_od_t od[2];
	dmu_tx_t *tx;
	int i, freeit, error;
	uint64_t n, s, txg;
	bufwad_t *packbuf, *bigbuf, *pack, *bigH, *bigT;
	uint64_t packobj, packoff, packsize, bigobj, bigoff, bigsize;
	uint64_t chunksize = (1000 + ztest_random(1000)) * sizeof (uint64_t);
	uint64_t regions = 997;
	uint64_t stride = 123456789ULL;
	uint64_t width = 40;
	int free_percent = 5;

	/*
	 * This test uses two objects, packobj and bigobj, that are always
	 * updated together (i.e. in the same tx) so that their contents are
	 * in sync and can be compared.
Their contents relate to each other 3079 * in a simple way: packobj is a dense array of 'bufwad' structures, 3080 * while bigobj is a sparse array of the same bufwads. Specifically, 3081 * for any index n, there are three bufwads that should be identical: 3082 * 3083 * packobj, at offset n * sizeof (bufwad_t) 3084 * bigobj, at the head of the nth chunk 3085 * bigobj, at the tail of the nth chunk 3086 * 3087 * The chunk size is arbitrary. It doesn't have to be a power of two, 3088 * and it doesn't have any relation to the object blocksize. 3089 * The only requirement is that it can hold at least two bufwads. 3090 * 3091 * Normally, we write the bufwad to each of these locations. 3092 * However, free_percent of the time we instead write zeroes to 3093 * packobj and perform a dmu_free_range() on bigobj. By comparing 3094 * bigobj to packobj, we can verify that the DMU is correctly 3095 * tracking which parts of an object are allocated and free, 3096 * and that the contents of the allocated blocks are correct. 3097 */ 3098 3099 /* 3100 * Read the directory info. If it's the first time, set things up. 3101 */ 3102 ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, 0, chunksize); 3103 ztest_od_init(&od[1], id, FTAG, 1, DMU_OT_UINT64_OTHER, 0, chunksize); 3104 3105 if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0) 3106 return; 3107 3108 bigobj = od[0].od_object; 3109 packobj = od[1].od_object; 3110 chunksize = od[0].od_gen; 3111 ASSERT(chunksize == od[1].od_gen); 3112 3113 /* 3114 * Prefetch a random chunk of the big object. 3115 * Our aim here is to get some async reads in flight 3116 * for blocks that we may free below; the DMU should 3117 * handle this race correctly. 3118 */ 3119 n = ztest_random(regions) * stride + ztest_random(width); 3120 s = 1 + ztest_random(2 * width - 1); 3121 dmu_prefetch(os, bigobj, n * chunksize, s * chunksize); 3122 3123 /* 3124 * Pick a random index and compute the offsets into packobj and bigobj. 3125 */ 3126 n = ztest_random(regions) * stride + ztest_random(width); 3127 s = 1 + ztest_random(width - 1); 3128 3129 packoff = n * sizeof (bufwad_t); 3130 packsize = s * sizeof (bufwad_t); 3131 3132 bigoff = n * chunksize; 3133 bigsize = s * chunksize; 3134 3135 packbuf = umem_alloc(packsize, UMEM_NOFAIL); 3136 bigbuf = umem_alloc(bigsize, UMEM_NOFAIL); 3137 3138 /* 3139 * free_percent of the time, free a range of bigobj rather than 3140 * overwriting it. 3141 */ 3142 freeit = (ztest_random(100) < free_percent); 3143 3144 /* 3145 * Read the current contents of our objects. 3146 */ 3147 error = dmu_read(os, packobj, packoff, packsize, packbuf, 3148 DMU_READ_PREFETCH); 3149 ASSERT3U(error, ==, 0); 3150 error = dmu_read(os, bigobj, bigoff, bigsize, bigbuf, 3151 DMU_READ_PREFETCH); 3152 ASSERT3U(error, ==, 0); 3153 3154 /* 3155 * Get a tx for the mods to both packobj and bigobj. 
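	 *
	 * This is the standard ztest transaction pattern: declare
	 * every object/range the tx will touch, then assign. A txg
	 * of 0 back from ztest_tx_assign() means the assignment
	 * failed (judging by its callers, the helper also aborts the
	 * tx and records any ENOSPC), so we just free our buffers
	 * and bail. In sketch form:
	 *
	 *	tx = dmu_tx_create(os);
	 *	dmu_tx_hold_write(tx, packobj, packoff, packsize);
	 *	dmu_tx_hold_write(tx, bigobj, bigoff, bigsize);
	 *	if ((txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG)) == 0)
	 *		return;		/* tx already cleaned up */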
3156 */ 3157 tx = dmu_tx_create(os); 3158 3159 dmu_tx_hold_write(tx, packobj, packoff, packsize); 3160 3161 if (freeit) 3162 dmu_tx_hold_free(tx, bigobj, bigoff, bigsize); 3163 else 3164 dmu_tx_hold_write(tx, bigobj, bigoff, bigsize); 3165 3166 txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); 3167 if (txg == 0) { 3168 umem_free(packbuf, packsize); 3169 umem_free(bigbuf, bigsize); 3170 return; 3171 } 3172 3173 dmu_object_set_checksum(os, bigobj, 3174 (enum zio_checksum)ztest_random_dsl_prop(ZFS_PROP_CHECKSUM), tx); 3175 3176 dmu_object_set_compress(os, bigobj, 3177 (enum zio_compress)ztest_random_dsl_prop(ZFS_PROP_COMPRESSION), tx); 3178 3179 /* 3180 * For each index from n to n + s, verify that the existing bufwad 3181 * in packobj matches the bufwads at the head and tail of the 3182 * corresponding chunk in bigobj. Then update all three bufwads 3183 * with the new values we want to write out. 3184 */ 3185 for (i = 0; i < s; i++) { 3186 /* LINTED */ 3187 pack = (bufwad_t *)((char *)packbuf + i * sizeof (bufwad_t)); 3188 /* LINTED */ 3189 bigH = (bufwad_t *)((char *)bigbuf + i * chunksize); 3190 /* LINTED */ 3191 bigT = (bufwad_t *)((char *)bigH + chunksize) - 1; 3192 3193 ASSERT((uintptr_t)bigH - (uintptr_t)bigbuf < bigsize); 3194 ASSERT((uintptr_t)bigT - (uintptr_t)bigbuf < bigsize); 3195 3196 if (pack->bw_txg > txg) 3197 fatal(0, "future leak: got %llx, open txg is %llx", 3198 pack->bw_txg, txg); 3199 3200 if (pack->bw_data != 0 && pack->bw_index != n + i) 3201 fatal(0, "wrong index: got %llx, wanted %llx+%llx", 3202 pack->bw_index, n, i); 3203 3204 if (bcmp(pack, bigH, sizeof (bufwad_t)) != 0) 3205 fatal(0, "pack/bigH mismatch in %p/%p", pack, bigH); 3206 3207 if (bcmp(pack, bigT, sizeof (bufwad_t)) != 0) 3208 fatal(0, "pack/bigT mismatch in %p/%p", pack, bigT); 3209 3210 if (freeit) { 3211 bzero(pack, sizeof (bufwad_t)); 3212 } else { 3213 pack->bw_index = n + i; 3214 pack->bw_txg = txg; 3215 pack->bw_data = 1 + ztest_random(-2ULL); 3216 } 3217 *bigH = *pack; 3218 *bigT = *pack; 3219 } 3220 3221 /* 3222 * We've verified all the old bufwads, and made new ones. 3223 * Now write them out. 3224 */ 3225 dmu_write(os, packobj, packoff, packsize, packbuf, tx); 3226 3227 if (freeit) { 3228 if (zopt_verbose >= 7) { 3229 (void) printf("freeing offset %llx size %llx" 3230 " txg %llx\n", 3231 (u_longlong_t)bigoff, 3232 (u_longlong_t)bigsize, 3233 (u_longlong_t)txg); 3234 } 3235 VERIFY(0 == dmu_free_range(os, bigobj, bigoff, bigsize, tx)); 3236 } else { 3237 if (zopt_verbose >= 7) { 3238 (void) printf("writing offset %llx size %llx" 3239 " txg %llx\n", 3240 (u_longlong_t)bigoff, 3241 (u_longlong_t)bigsize, 3242 (u_longlong_t)txg); 3243 } 3244 dmu_write(os, bigobj, bigoff, bigsize, bigbuf, tx); 3245 } 3246 3247 dmu_tx_commit(tx); 3248 3249 /* 3250 * Sanity check the stuff we just wrote. 
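	 *
	 * Note that this read-back runs right after dmu_tx_commit(),
	 * with no txg_wait_synced(); dmu_read() is expected to see
	 * the just-written data through the ARC regardless of whether
	 * the txg has synced to disk yet.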
3251 */ 3252 { 3253 void *packcheck = umem_alloc(packsize, UMEM_NOFAIL); 3254 void *bigcheck = umem_alloc(bigsize, UMEM_NOFAIL); 3255 3256 VERIFY(0 == dmu_read(os, packobj, packoff, 3257 packsize, packcheck, DMU_READ_PREFETCH)); 3258 VERIFY(0 == dmu_read(os, bigobj, bigoff, 3259 bigsize, bigcheck, DMU_READ_PREFETCH)); 3260 3261 ASSERT(bcmp(packbuf, packcheck, packsize) == 0); 3262 ASSERT(bcmp(bigbuf, bigcheck, bigsize) == 0); 3263 3264 umem_free(packcheck, packsize); 3265 umem_free(bigcheck, bigsize); 3266 } 3267 3268 umem_free(packbuf, packsize); 3269 umem_free(bigbuf, bigsize); 3270 } 3271 3272 void 3273 compare_and_update_pbbufs(uint64_t s, bufwad_t *packbuf, bufwad_t *bigbuf, 3274 uint64_t bigsize, uint64_t n, uint64_t chunksize, uint64_t txg) 3275 { 3276 uint64_t i; 3277 bufwad_t *pack; 3278 bufwad_t *bigH; 3279 bufwad_t *bigT; 3280 3281 /* 3282 * For each index from n to n + s, verify that the existing bufwad 3283 * in packobj matches the bufwads at the head and tail of the 3284 * corresponding chunk in bigobj. Then update all three bufwads 3285 * with the new values we want to write out. 3286 */ 3287 for (i = 0; i < s; i++) { 3288 /* LINTED */ 3289 pack = (bufwad_t *)((char *)packbuf + i * sizeof (bufwad_t)); 3290 /* LINTED */ 3291 bigH = (bufwad_t *)((char *)bigbuf + i * chunksize); 3292 /* LINTED */ 3293 bigT = (bufwad_t *)((char *)bigH + chunksize) - 1; 3294 3295 ASSERT((uintptr_t)bigH - (uintptr_t)bigbuf < bigsize); 3296 ASSERT((uintptr_t)bigT - (uintptr_t)bigbuf < bigsize); 3297 3298 if (pack->bw_txg > txg) 3299 fatal(0, "future leak: got %llx, open txg is %llx", 3300 pack->bw_txg, txg); 3301 3302 if (pack->bw_data != 0 && pack->bw_index != n + i) 3303 fatal(0, "wrong index: got %llx, wanted %llx+%llx", 3304 pack->bw_index, n, i); 3305 3306 if (bcmp(pack, bigH, sizeof (bufwad_t)) != 0) 3307 fatal(0, "pack/bigH mismatch in %p/%p", pack, bigH); 3308 3309 if (bcmp(pack, bigT, sizeof (bufwad_t)) != 0) 3310 fatal(0, "pack/bigT mismatch in %p/%p", pack, bigT); 3311 3312 pack->bw_index = n + i; 3313 pack->bw_txg = txg; 3314 pack->bw_data = 1 + ztest_random(-2ULL); 3315 3316 *bigH = *pack; 3317 *bigT = *pack; 3318 } 3319 } 3320 3321 void 3322 ztest_dmu_read_write_zcopy(ztest_ds_t *zd, uint64_t id) 3323 { 3324 objset_t *os = zd->zd_os; 3325 ztest_od_t od[2]; 3326 dmu_tx_t *tx; 3327 uint64_t i; 3328 int error; 3329 uint64_t n, s, txg; 3330 bufwad_t *packbuf, *bigbuf; 3331 uint64_t packobj, packoff, packsize, bigobj, bigoff, bigsize; 3332 uint64_t blocksize = ztest_random_blocksize(); 3333 uint64_t chunksize = blocksize; 3334 uint64_t regions = 997; 3335 uint64_t stride = 123456789ULL; 3336 uint64_t width = 9; 3337 dmu_buf_t *bonus_db; 3338 arc_buf_t **bigbuf_arcbufs; 3339 dmu_object_info_t doi; 3340 3341 /* 3342 * This test uses two objects, packobj and bigobj, that are always 3343 * updated together (i.e. in the same tx) so that their contents are 3344 * in sync and can be compared. Their contents relate to each other 3345 * in a simple way: packobj is a dense array of 'bufwad' structures, 3346 * while bigobj is a sparse array of the same bufwads. Specifically, 3347 * for any index n, there are three bufwads that should be identical: 3348 * 3349 * packobj, at offset n * sizeof (bufwad_t) 3350 * bigobj, at the head of the nth chunk 3351 * bigobj, at the tail of the nth chunk 3352 * 3353 * The chunk size is set equal to bigobj block size so that 3354 * dmu_assign_arcbuf() can be tested for object updates. 3355 */ 3356 3357 /* 3358 * Read the directory info. 
If it's the first time, set things up. 3359 */ 3360 ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, 0); 3361 ztest_od_init(&od[1], id, FTAG, 1, DMU_OT_UINT64_OTHER, 0, chunksize); 3362 3363 if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0) 3364 return; 3365 3366 bigobj = od[0].od_object; 3367 packobj = od[1].od_object; 3368 blocksize = od[0].od_blocksize; 3369 chunksize = blocksize; 3370 ASSERT(chunksize == od[1].od_gen); 3371 3372 VERIFY(dmu_object_info(os, bigobj, &doi) == 0); 3373 VERIFY(ISP2(doi.doi_data_block_size)); 3374 VERIFY(chunksize == doi.doi_data_block_size); 3375 VERIFY(chunksize >= 2 * sizeof (bufwad_t)); 3376 3377 /* 3378 * Pick a random index and compute the offsets into packobj and bigobj. 3379 */ 3380 n = ztest_random(regions) * stride + ztest_random(width); 3381 s = 1 + ztest_random(width - 1); 3382 3383 packoff = n * sizeof (bufwad_t); 3384 packsize = s * sizeof (bufwad_t); 3385 3386 bigoff = n * chunksize; 3387 bigsize = s * chunksize; 3388 3389 packbuf = umem_zalloc(packsize, UMEM_NOFAIL); 3390 bigbuf = umem_zalloc(bigsize, UMEM_NOFAIL); 3391 3392 VERIFY3U(0, ==, dmu_bonus_hold(os, bigobj, FTAG, &bonus_db)); 3393 3394 bigbuf_arcbufs = umem_zalloc(2 * s * sizeof (arc_buf_t *), UMEM_NOFAIL); 3395 3396 /* 3397 * Iteration 0 test zcopy for DB_UNCACHED dbufs. 3398 * Iteration 1 test zcopy to already referenced dbufs. 3399 * Iteration 2 test zcopy to dirty dbuf in the same txg. 3400 * Iteration 3 test zcopy to dbuf dirty in previous txg. 3401 * Iteration 4 test zcopy when dbuf is no longer dirty. 3402 * Iteration 5 test zcopy when it can't be done. 3403 * Iteration 6 one more zcopy write. 3404 */ 3405 for (i = 0; i < 7; i++) { 3406 uint64_t j; 3407 uint64_t off; 3408 3409 /* 3410 * In iteration 5 (i == 5) use arcbufs 3411 * that don't match bigobj blksz to test 3412 * dmu_assign_arcbuf() when it can't directly 3413 * assign an arcbuf to a dbuf. 3414 */ 3415 for (j = 0; j < s; j++) { 3416 if (i != 5) { 3417 bigbuf_arcbufs[j] = 3418 dmu_request_arcbuf(bonus_db, chunksize); 3419 } else { 3420 bigbuf_arcbufs[2 * j] = 3421 dmu_request_arcbuf(bonus_db, chunksize / 2); 3422 bigbuf_arcbufs[2 * j + 1] = 3423 dmu_request_arcbuf(bonus_db, chunksize / 2); 3424 } 3425 } 3426 3427 /* 3428 * Get a tx for the mods to both packobj and bigobj. 3429 */ 3430 tx = dmu_tx_create(os); 3431 3432 dmu_tx_hold_write(tx, packobj, packoff, packsize); 3433 dmu_tx_hold_write(tx, bigobj, bigoff, bigsize); 3434 3435 txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); 3436 if (txg == 0) { 3437 umem_free(packbuf, packsize); 3438 umem_free(bigbuf, bigsize); 3439 for (j = 0; j < s; j++) { 3440 if (i != 5) { 3441 dmu_return_arcbuf(bigbuf_arcbufs[j]); 3442 } else { 3443 dmu_return_arcbuf( 3444 bigbuf_arcbufs[2 * j]); 3445 dmu_return_arcbuf( 3446 bigbuf_arcbufs[2 * j + 1]); 3447 } 3448 } 3449 umem_free(bigbuf_arcbufs, 2 * s * sizeof (arc_buf_t *)); 3450 dmu_buf_rele(bonus_db, FTAG); 3451 return; 3452 } 3453 3454 /* 3455 * 50% of the time don't read objects in the 1st iteration to 3456 * test dmu_assign_arcbuf() for the case when there're no 3457 * existing dbufs for the specified offsets. 
		 */
		if (i != 0 || ztest_random(2) != 0) {
			error = dmu_read(os, packobj, packoff,
			    packsize, packbuf, DMU_READ_PREFETCH);
			ASSERT3U(error, ==, 0);
			error = dmu_read(os, bigobj, bigoff, bigsize,
			    bigbuf, DMU_READ_PREFETCH);
			ASSERT3U(error, ==, 0);
		}
		compare_and_update_pbbufs(s, packbuf, bigbuf, bigsize,
		    n, chunksize, txg);

		/*
		 * We've verified all the old bufwads, and made new ones.
		 * Now write them out.
		 */
		dmu_write(os, packobj, packoff, packsize, packbuf, tx);
		if (zopt_verbose >= 7) {
			(void) printf("writing offset %llx size %llx"
			    " txg %llx\n",
			    (u_longlong_t)bigoff,
			    (u_longlong_t)bigsize,
			    (u_longlong_t)txg);
		}
		for (off = bigoff, j = 0; j < s; j++, off += chunksize) {
			dmu_buf_t *dbt;
			if (i != 5) {
				bcopy((caddr_t)bigbuf + (off - bigoff),
				    bigbuf_arcbufs[j]->b_data, chunksize);
			} else {
				bcopy((caddr_t)bigbuf + (off - bigoff),
				    bigbuf_arcbufs[2 * j]->b_data,
				    chunksize / 2);
				bcopy((caddr_t)bigbuf + (off - bigoff) +
				    chunksize / 2,
				    bigbuf_arcbufs[2 * j + 1]->b_data,
				    chunksize / 2);
			}

			if (i == 1) {
				VERIFY(dmu_buf_hold(os, bigobj, off,
				    FTAG, &dbt) == 0);
			}
			if (i != 5) {
				dmu_assign_arcbuf(bonus_db, off,
				    bigbuf_arcbufs[j], tx);
			} else {
				dmu_assign_arcbuf(bonus_db, off,
				    bigbuf_arcbufs[2 * j], tx);
				dmu_assign_arcbuf(bonus_db,
				    off + chunksize / 2,
				    bigbuf_arcbufs[2 * j + 1], tx);
			}
			if (i == 1) {
				dmu_buf_rele(dbt, FTAG);
			}
		}
		dmu_tx_commit(tx);

		/*
		 * Sanity check the stuff we just wrote.
		 */
		{
			void *packcheck = umem_alloc(packsize, UMEM_NOFAIL);
			void *bigcheck = umem_alloc(bigsize, UMEM_NOFAIL);

			VERIFY(0 == dmu_read(os, packobj, packoff,
			    packsize, packcheck, DMU_READ_PREFETCH));
			VERIFY(0 == dmu_read(os, bigobj, bigoff,
			    bigsize, bigcheck, DMU_READ_PREFETCH));

			ASSERT(bcmp(packbuf, packcheck, packsize) == 0);
			ASSERT(bcmp(bigbuf, bigcheck, bigsize) == 0);

			umem_free(packcheck, packsize);
			umem_free(bigcheck, bigsize);
		}
		if (i == 2) {
			txg_wait_open(dmu_objset_pool(os), 0);
		} else if (i == 3) {
			txg_wait_synced(dmu_objset_pool(os), 0);
		}
	}

	dmu_buf_rele(bonus_db, FTAG);
	umem_free(packbuf, packsize);
	umem_free(bigbuf, bigsize);
	umem_free(bigbuf_arcbufs, 2 * s * sizeof (arc_buf_t *));
}

/* ARGSUSED */
void
ztest_dmu_write_parallel(ztest_ds_t *zd, uint64_t id)
{
	ztest_od_t od[1];
	uint64_t offset = (1ULL << (ztest_random(20) + 43)) +
	    (ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT);

	/*
	 * Have multiple threads write to large offsets in an object
	 * to verify that parallel writes to an object -- even to the
	 * same blocks within the object -- don't cause any trouble.
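	 *
	 * The offset math keeps these writers far away from ordinary
	 * test data while still colliding with each other: the first
	 * term is a random power of two of at least 8TB (1ULL << 43),
	 * and the second term lands the write in one of the
	 * ZTEST_RANGE_LOCKS buckets, in units of 128K
	 * (SPA_MAXBLOCKSHIFT) blocks.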
	 */
	ztest_od_init(&od[0], ID_PARALLEL, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0);

	if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0)
		return;

	while (ztest_random(10) != 0)
		ztest_io(zd, od[0].od_object, offset);
}

void
ztest_dmu_prealloc(ztest_ds_t *zd, uint64_t id)
{
	ztest_od_t od[1];
	uint64_t offset = (1ULL << (ztest_random(4) + SPA_MAXBLOCKSHIFT)) +
	    (ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT);
	uint64_t count = ztest_random(20) + 1;
	uint64_t blocksize = ztest_random_blocksize();
	void *data;

	ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, 0);

	if (ztest_object_init(zd, od, sizeof (od), !ztest_random(2)) != 0)
		return;

	if (ztest_truncate(zd, od[0].od_object, offset, count * blocksize) != 0)
		return;

	ztest_prealloc(zd, od[0].od_object, offset, count * blocksize);

	data = umem_zalloc(blocksize, UMEM_NOFAIL);

	while (ztest_random(count) != 0) {
		uint64_t randoff = offset + (ztest_random(count) * blocksize);
		if (ztest_write(zd, od[0].od_object, randoff, blocksize,
		    data) != 0)
			break;
		while (ztest_random(4) != 0)
			ztest_io(zd, od[0].od_object, randoff);
	}

	umem_free(data, blocksize);
}

/*
 * Verify that zap_{create,destroy,add,remove,update} work as expected.
 */
#define	ZTEST_ZAP_MIN_INTS	1
#define	ZTEST_ZAP_MAX_INTS	4
#define	ZTEST_ZAP_MAX_PROPS	1000

void
ztest_zap(ztest_ds_t *zd, uint64_t id)
{
	objset_t *os = zd->zd_os;
	ztest_od_t od[1];
	uint64_t object;
	uint64_t txg, last_txg;
	uint64_t value[ZTEST_ZAP_MAX_INTS];
	uint64_t zl_ints, zl_intsize, prop;
	int i, ints;
	dmu_tx_t *tx;
	char propname[100], txgname[100];
	int error;
	char *hc[2] = { "s.acl.h", ".s.open.h.hyLZlg" };

	ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_ZAP_OTHER, 0, 0);

	if (ztest_object_init(zd, od, sizeof (od), !ztest_random(2)) != 0)
		return;

	object = od[0].od_object;

	/*
	 * Generate a known hash collision, and verify that
	 * we can look up and remove both entries.
	 */
	tx = dmu_tx_create(os);
	dmu_tx_hold_zap(tx, object, B_TRUE, NULL);
	txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG);
	if (txg == 0)
		return;
	for (i = 0; i < 2; i++) {
		value[i] = i;
		VERIFY3U(0, ==, zap_add(os, object, hc[i], sizeof (uint64_t),
		    1, &value[i], tx));
	}
	for (i = 0; i < 2; i++) {
		VERIFY3U(EEXIST, ==, zap_add(os, object, hc[i],
		    sizeof (uint64_t), 1, &value[i], tx));
		VERIFY3U(0, ==,
		    zap_length(os, object, hc[i], &zl_intsize, &zl_ints));
		ASSERT3U(zl_intsize, ==, sizeof (uint64_t));
		ASSERT3U(zl_ints, ==, 1);
	}
	for (i = 0; i < 2; i++) {
		VERIFY3U(0, ==, zap_remove(os, object, hc[i], tx));
	}
	dmu_tx_commit(tx);

	/*
	 * Generate a bunch of random entries.
	 */
	ints = MAX(ZTEST_ZAP_MIN_INTS, object % ZTEST_ZAP_MAX_INTS);

	prop = ztest_random(ZTEST_ZAP_MAX_PROPS);
	(void) sprintf(propname, "prop_%llu", (u_longlong_t)prop);
	(void) sprintf(txgname, "txg_%llu", (u_longlong_t)prop);
	bzero(value, sizeof (value));
	last_txg = 0;

	/*
	 * If these zap entries already exist, validate their contents.
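	 *
	 * The invariant is: txg_<prop> holds the txg of the last
	 * update, and the nth int of prop_<prop> must equal
	 * last_txg + object + n. For example, with last_txg 100,
	 * object 5 and ints 3, the stored values must be
	 * { 105, 106, 107 }.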
3673 */ 3674 error = zap_length(os, object, txgname, &zl_intsize, &zl_ints); 3675 if (error == 0) { 3676 ASSERT3U(zl_intsize, ==, sizeof (uint64_t)); 3677 ASSERT3U(zl_ints, ==, 1); 3678 3679 VERIFY(zap_lookup(os, object, txgname, zl_intsize, 3680 zl_ints, &last_txg) == 0); 3681 3682 VERIFY(zap_length(os, object, propname, &zl_intsize, 3683 &zl_ints) == 0); 3684 3685 ASSERT3U(zl_intsize, ==, sizeof (uint64_t)); 3686 ASSERT3U(zl_ints, ==, ints); 3687 3688 VERIFY(zap_lookup(os, object, propname, zl_intsize, 3689 zl_ints, value) == 0); 3690 3691 for (i = 0; i < ints; i++) { 3692 ASSERT3U(value[i], ==, last_txg + object + i); 3693 } 3694 } else { 3695 ASSERT3U(error, ==, ENOENT); 3696 } 3697 3698 /* 3699 * Atomically update two entries in our zap object. 3700 * The first is named txg_%llu, and contains the txg 3701 * in which the property was last updated. The second 3702 * is named prop_%llu, and the nth element of its value 3703 * should be txg + object + n. 3704 */ 3705 tx = dmu_tx_create(os); 3706 dmu_tx_hold_zap(tx, object, B_TRUE, NULL); 3707 txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); 3708 if (txg == 0) 3709 return; 3710 3711 if (last_txg > txg) 3712 fatal(0, "zap future leak: old %llu new %llu", last_txg, txg); 3713 3714 for (i = 0; i < ints; i++) 3715 value[i] = txg + object + i; 3716 3717 VERIFY3U(0, ==, zap_update(os, object, txgname, sizeof (uint64_t), 3718 1, &txg, tx)); 3719 VERIFY3U(0, ==, zap_update(os, object, propname, sizeof (uint64_t), 3720 ints, value, tx)); 3721 3722 dmu_tx_commit(tx); 3723 3724 /* 3725 * Remove a random pair of entries. 3726 */ 3727 prop = ztest_random(ZTEST_ZAP_MAX_PROPS); 3728 (void) sprintf(propname, "prop_%llu", (u_longlong_t)prop); 3729 (void) sprintf(txgname, "txg_%llu", (u_longlong_t)prop); 3730 3731 error = zap_length(os, object, txgname, &zl_intsize, &zl_ints); 3732 3733 if (error == ENOENT) 3734 return; 3735 3736 ASSERT3U(error, ==, 0); 3737 3738 tx = dmu_tx_create(os); 3739 dmu_tx_hold_zap(tx, object, B_TRUE, NULL); 3740 txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); 3741 if (txg == 0) 3742 return; 3743 VERIFY3U(0, ==, zap_remove(os, object, txgname, tx)); 3744 VERIFY3U(0, ==, zap_remove(os, object, propname, tx)); 3745 dmu_tx_commit(tx); 3746 } 3747 3748 /* 3749 * Testcase to test the upgrading of a microzap to fatzap. 3750 */ 3751 void 3752 ztest_fzap(ztest_ds_t *zd, uint64_t id) 3753 { 3754 objset_t *os = zd->zd_os; 3755 ztest_od_t od[1]; 3756 uint64_t object, txg; 3757 3758 ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_ZAP_OTHER, 0, 0); 3759 3760 if (ztest_object_init(zd, od, sizeof (od), !ztest_random(2)) != 0) 3761 return; 3762 3763 object = od[0].od_object; 3764 3765 /* 3766 * Add entries to this ZAP and make sure it spills over 3767 * and gets upgraded to a fatzap. Also, since we are adding 3768 * 2050 entries we should see ptrtbl growth and leaf-block split. 
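	 *
	 * (For scale: a microzap entry is a fixed 64 bytes, so even a
	 * maximal 128K microzap block holds roughly 2K entries; 2050
	 * distinct names therefore cannot fit and must force the
	 * upgrade.)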
3769 */ 3770 for (int i = 0; i < 2050; i++) { 3771 char name[MAXNAMELEN]; 3772 uint64_t value = i; 3773 dmu_tx_t *tx; 3774 int error; 3775 3776 (void) snprintf(name, sizeof (name), "fzap-%llu-%llu", 3777 id, value); 3778 3779 tx = dmu_tx_create(os); 3780 dmu_tx_hold_zap(tx, object, B_TRUE, name); 3781 txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); 3782 if (txg == 0) 3783 return; 3784 error = zap_add(os, object, name, sizeof (uint64_t), 1, 3785 &value, tx); 3786 ASSERT(error == 0 || error == EEXIST); 3787 dmu_tx_commit(tx); 3788 } 3789 } 3790 3791 /* ARGSUSED */ 3792 void 3793 ztest_zap_parallel(ztest_ds_t *zd, uint64_t id) 3794 { 3795 objset_t *os = zd->zd_os; 3796 ztest_od_t od[1]; 3797 uint64_t txg, object, count, wsize, wc, zl_wsize, zl_wc; 3798 dmu_tx_t *tx; 3799 int i, namelen, error; 3800 int micro = ztest_random(2); 3801 char name[20], string_value[20]; 3802 void *data; 3803 3804 ztest_od_init(&od[0], ID_PARALLEL, FTAG, micro, DMU_OT_ZAP_OTHER, 0, 0); 3805 3806 if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0) 3807 return; 3808 3809 object = od[0].od_object; 3810 3811 /* 3812 * Generate a random name of the form 'xxx.....' where each 3813 * x is a random printable character and the dots are dots. 3814 * There are 94 such characters, and the name length goes from 3815 * 6 to 20, so there are 94^3 * 15 = 12,458,760 possible names. 3816 */ 3817 namelen = ztest_random(sizeof (name) - 5) + 5 + 1; 3818 3819 for (i = 0; i < 3; i++) 3820 name[i] = '!' + ztest_random('~' - '!' + 1); 3821 for (; i < namelen - 1; i++) 3822 name[i] = '.'; 3823 name[i] = '\0'; 3824 3825 if ((namelen & 1) || micro) { 3826 wsize = sizeof (txg); 3827 wc = 1; 3828 data = &txg; 3829 } else { 3830 wsize = 1; 3831 wc = namelen; 3832 data = string_value; 3833 } 3834 3835 count = -1ULL; 3836 VERIFY(zap_count(os, object, &count) == 0); 3837 ASSERT(count != -1ULL); 3838 3839 /* 3840 * Select an operation: length, lookup, add, update, remove. 3841 */ 3842 i = ztest_random(5); 3843 3844 if (i >= 2) { 3845 tx = dmu_tx_create(os); 3846 dmu_tx_hold_zap(tx, object, B_TRUE, NULL); 3847 txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); 3848 if (txg == 0) 3849 return; 3850 bcopy(name, string_value, namelen); 3851 } else { 3852 tx = NULL; 3853 txg = 0; 3854 bzero(string_value, namelen); 3855 } 3856 3857 switch (i) { 3858 3859 case 0: 3860 error = zap_length(os, object, name, &zl_wsize, &zl_wc); 3861 if (error == 0) { 3862 ASSERT3U(wsize, ==, zl_wsize); 3863 ASSERT3U(wc, ==, zl_wc); 3864 } else { 3865 ASSERT3U(error, ==, ENOENT); 3866 } 3867 break; 3868 3869 case 1: 3870 error = zap_lookup(os, object, name, wsize, wc, data); 3871 if (error == 0) { 3872 if (data == string_value && 3873 bcmp(name, data, namelen) != 0) 3874 fatal(0, "name '%s' != val '%s' len %d", 3875 name, data, namelen); 3876 } else { 3877 ASSERT3U(error, ==, ENOENT); 3878 } 3879 break; 3880 3881 case 2: 3882 error = zap_add(os, object, name, wsize, wc, data, tx); 3883 ASSERT(error == 0 || error == EEXIST); 3884 break; 3885 3886 case 3: 3887 VERIFY(zap_update(os, object, name, wsize, wc, data, tx) == 0); 3888 break; 3889 3890 case 4: 3891 error = zap_remove(os, object, name, tx); 3892 ASSERT(error == 0 || error == ENOENT); 3893 break; 3894 } 3895 3896 if (tx != NULL) 3897 dmu_tx_commit(tx); 3898 } 3899 3900 /* 3901 * Commit callback data. 
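 *
 * One of these is allocated per registered callback: zcd_txg is the txg
 * the tx was assigned (0 if the tx was never assigned or was aborted),
 * zcd_expected_err is the error the callback must fire with (0 or
 * ECANCELED), and zcd_added/zcd_called track list membership and
 * invocation so the test can catch missing or premature callbacks.
 * A callback is attached before the tx is assigned, e.g.:
 *
 *	cb_data = ztest_create_cb_data(os, 0);
 *	dmu_tx_callback_register(tx, ztest_commit_callback, cb_data);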
 */
typedef struct ztest_cb_data {
	list_node_t	zcd_node;
	uint64_t	zcd_txg;
	int		zcd_expected_err;
	boolean_t	zcd_added;
	boolean_t	zcd_called;
	spa_t		*zcd_spa;
} ztest_cb_data_t;

/* This is the actual commit callback function */
static void
ztest_commit_callback(void *arg, int error)
{
	ztest_cb_data_t *data = arg;
	uint64_t synced_txg;

	VERIFY(data != NULL);
	VERIFY3S(data->zcd_expected_err, ==, error);
	VERIFY(!data->zcd_called);

	synced_txg = spa_last_synced_txg(data->zcd_spa);
	if (data->zcd_txg > synced_txg)
		fatal(0, "commit callback of txg %" PRIu64 " called prematurely"
		    ", last synced txg = %" PRIu64 "\n", data->zcd_txg,
		    synced_txg);

	data->zcd_called = B_TRUE;

	if (error == ECANCELED) {
		ASSERT3U(data->zcd_txg, ==, 0);
		ASSERT(!data->zcd_added);

		/*
		 * The private callback data should be destroyed here, but
		 * since we are going to check the zcd_called field after
		 * dmu_tx_abort(), we will destroy it there.
		 */
		return;
	}

	/* Was this callback added to the global callback list? */
	if (!data->zcd_added)
		goto out;

	ASSERT3U(data->zcd_txg, !=, 0);

	/* Remove our callback from the list */
	(void) mutex_lock(&zcl.zcl_callbacks_lock);
	list_remove(&zcl.zcl_callbacks, data);
	(void) mutex_unlock(&zcl.zcl_callbacks_lock);

out:
	umem_free(data, sizeof (ztest_cb_data_t));
}

/* Allocate and initialize callback data structure */
static ztest_cb_data_t *
ztest_create_cb_data(objset_t *os, uint64_t txg)
{
	ztest_cb_data_t *cb_data;

	cb_data = umem_zalloc(sizeof (ztest_cb_data_t), UMEM_NOFAIL);

	cb_data->zcd_txg = txg;
	cb_data->zcd_spa = dmu_objset_spa(os);

	return (cb_data);
}

/*
 * If a number of txgs equal to this threshold have been created after a commit
 * callback has been registered but not called, then we assume there is an
 * implementation bug.
 */
#define	ZTEST_COMMIT_CALLBACK_THRESH	(TXG_CONCURRENT_STATES + 2)

/*
 * Commit callback test.
 */
void
ztest_dmu_commit_callbacks(ztest_ds_t *zd, uint64_t id)
{
	objset_t *os = zd->zd_os;
	ztest_od_t od[1];
	dmu_tx_t *tx;
	ztest_cb_data_t *cb_data[3], *tmp_cb;
	uint64_t old_txg, txg;
	int i, error = 0;

	ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0);

	if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0)
		return;

	tx = dmu_tx_create(os);

	cb_data[0] = ztest_create_cb_data(os, 0);
	dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[0]);

	dmu_tx_hold_write(tx, od[0].od_object, 0, sizeof (uint64_t));

	/* Every once in a while, abort the transaction on purpose */
	if (ztest_random(100) == 0)
		error = -1;

	if (!error)
		error = dmu_tx_assign(tx, TXG_NOWAIT);

	txg = error ? 0 : dmu_tx_get_txg(tx);

	cb_data[0]->zcd_txg = txg;
	cb_data[1] = ztest_create_cb_data(os, txg);
	dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[1]);

	if (error) {
		/*
		 * It's not a strict requirement to call the registered
		 * callbacks from inside dmu_tx_abort(), but that's what
		 * is supposed to happen in the current implementation,
		 * so we will check for that.
		 */
		for (i = 0; i < 2; i++) {
			cb_data[i]->zcd_expected_err = ECANCELED;
			VERIFY(!cb_data[i]->zcd_called);
		}

		dmu_tx_abort(tx);

		for (i = 0; i < 2; i++) {
			VERIFY(cb_data[i]->zcd_called);
			umem_free(cb_data[i], sizeof (ztest_cb_data_t));
		}

		return;
	}

	cb_data[2] = ztest_create_cb_data(os, txg);
	dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[2]);

	/*
	 * Read existing data to make sure there isn't a future leak.
	 */
	VERIFY(0 == dmu_read(os, od[0].od_object, 0, sizeof (uint64_t),
	    &old_txg, DMU_READ_PREFETCH));

	if (old_txg > txg)
		fatal(0, "future leak: got %" PRIu64 ", open txg is %" PRIu64,
		    old_txg, txg);

	dmu_write(os, od[0].od_object, 0, sizeof (uint64_t), &txg, tx);

	(void) mutex_lock(&zcl.zcl_callbacks_lock);

	/*
	 * Since commit callbacks don't have any ordering requirement and since
	 * it is theoretically possible for a commit callback to be called
	 * after an arbitrary amount of time has elapsed since its txg has been
	 * synced, it is difficult to reliably determine whether a commit
	 * callback hasn't been called due to high load or due to a flawed
	 * implementation.
	 *
	 * In practice, we will assume that if after a certain number of txgs a
	 * commit callback hasn't been called, then most likely there's an
	 * implementation bug.
	 */
	tmp_cb = list_head(&zcl.zcl_callbacks);
	if (tmp_cb != NULL &&
	    tmp_cb->zcd_txg > txg - ZTEST_COMMIT_CALLBACK_THRESH) {
		fatal(0, "Commit callback threshold exceeded, oldest txg: %"
		    PRIu64 ", open txg: %" PRIu64 "\n", tmp_cb->zcd_txg, txg);
	}

	/*
	 * Let's find the place to insert our callbacks.
	 *
	 * Even though the list is ordered by txg, it is possible for the
	 * insertion point to not be the end because our txg may already be
	 * quiescing at this point and other callbacks in the open txg
	 * (from other objsets) may have sneaked in.
	ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0);

	if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0)
		return;

	tx = dmu_tx_create(os);

	cb_data[0] = ztest_create_cb_data(os, 0);
	dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[0]);

	dmu_tx_hold_write(tx, od[0].od_object, 0, sizeof (uint64_t));

	/* Every once in a while, abort the transaction on purpose */
	if (ztest_random(100) == 0)
		error = -1;

	if (!error)
		error = dmu_tx_assign(tx, TXG_NOWAIT);

	txg = error ? 0 : dmu_tx_get_txg(tx);

	cb_data[0]->zcd_txg = txg;
	cb_data[1] = ztest_create_cb_data(os, txg);
	dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[1]);

	if (error) {
		/*
		 * It's not a strict requirement to call the registered
		 * callbacks from inside dmu_tx_abort(), but that's what is
		 * supposed to happen in the current implementation, so we
		 * check for it.
		 */
		for (i = 0; i < 2; i++) {
			cb_data[i]->zcd_expected_err = ECANCELED;
			VERIFY(!cb_data[i]->zcd_called);
		}

		dmu_tx_abort(tx);

		for (i = 0; i < 2; i++) {
			VERIFY(cb_data[i]->zcd_called);
			umem_free(cb_data[i], sizeof (ztest_cb_data_t));
		}

		return;
	}

	cb_data[2] = ztest_create_cb_data(os, txg);
	dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[2]);

	/*
	 * Read existing data to make sure there isn't a future leak.
	 */
	VERIFY(0 == dmu_read(os, od[0].od_object, 0, sizeof (uint64_t),
	    &old_txg, DMU_READ_PREFETCH));

	if (old_txg > txg)
		fatal(0, "future leak: got %" PRIu64 ", open txg is %" PRIu64,
		    old_txg, txg);

	dmu_write(os, od[0].od_object, 0, sizeof (uint64_t), &txg, tx);

	(void) mutex_lock(&zcl.zcl_callbacks_lock);

	/*
	 * Since commit callbacks don't have any ordering requirement and since
	 * it is theoretically possible for a commit callback to be called
	 * after an arbitrary amount of time has elapsed since its txg has been
	 * synced, it is difficult to reliably determine whether a commit
	 * callback hasn't been called due to high load or due to a flawed
	 * implementation.
	 *
	 * In practice, we will assume that if after a certain number of txgs a
	 * commit callback hasn't been called, then most likely there's an
	 * implementation bug.
	 */
	tmp_cb = list_head(&zcl.zcl_callbacks);
	if (tmp_cb != NULL &&
	    tmp_cb->zcd_txg + ZTEST_COMMIT_CALLBACK_THRESH < txg) {
		fatal(0, "Commit callback threshold exceeded, oldest txg: %"
		    PRIu64 ", open txg: %" PRIu64 "\n", tmp_cb->zcd_txg, txg);
	}

	/*
	 * Let's find the place to insert our callbacks.
	 *
	 * Even though the list is ordered by txg, it is possible for the
	 * insertion point to not be the end because our txg may already be
	 * quiescing at this point and other callbacks in the open txg
	 * (from other objsets) may have sneaked in.
	 */
	tmp_cb = list_tail(&zcl.zcl_callbacks);
	while (tmp_cb != NULL && tmp_cb->zcd_txg > txg)
		tmp_cb = list_prev(&zcl.zcl_callbacks, tmp_cb);

	/* Add the 3 callbacks to the list */
	for (i = 0; i < 3; i++) {
		if (tmp_cb == NULL)
			list_insert_head(&zcl.zcl_callbacks, cb_data[i]);
		else
			list_insert_after(&zcl.zcl_callbacks, tmp_cb,
			    cb_data[i]);

		cb_data[i]->zcd_added = B_TRUE;
		VERIFY(!cb_data[i]->zcd_called);

		tmp_cb = cb_data[i];
	}

	(void) mutex_unlock(&zcl.zcl_callbacks_lock);

	dmu_tx_commit(tx);
}
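
/*
 * A note on locking in the property tests below: zs_name_lock is taken as
 * reader so that dataset and pool names can't change underneath us; the
 * pool-rename test (ztest_spa_rename) and the slog-offline path in
 * ztest_fault_inject(), among others, take it as writer.
 */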

/* ARGSUSED */
void
ztest_dsl_prop_get_set(ztest_ds_t *zd, uint64_t id)
{
	zfs_prop_t proplist[] = {
		ZFS_PROP_CHECKSUM,
		ZFS_PROP_COMPRESSION,
		ZFS_PROP_COPIES,
		ZFS_PROP_DEDUP
	};
	ztest_shared_t *zs = ztest_shared;

	(void) rw_rdlock(&zs->zs_name_lock);

	for (int p = 0; p < sizeof (proplist) / sizeof (proplist[0]); p++)
		(void) ztest_dsl_prop_set_uint64(zd->zd_name, proplist[p],
		    ztest_random_dsl_prop(proplist[p]), (int)ztest_random(2));

	(void) rw_unlock(&zs->zs_name_lock);
}

/* ARGSUSED */
void
ztest_spa_prop_get_set(ztest_ds_t *zd, uint64_t id)
{
	ztest_shared_t *zs = ztest_shared;
	nvlist_t *props = NULL;

	(void) rw_rdlock(&zs->zs_name_lock);

#if 0
	(void) ztest_spa_prop_set_uint64(zs, ZPOOL_PROP_DEDUPDITTO,
	    ZIO_DEDUPDITTO_MIN + ztest_random(ZIO_DEDUPDITTO_MIN));
#endif

	VERIFY3U(spa_prop_get(zs->zs_spa, &props), ==, 0);

	if (zopt_verbose >= 6)
		dump_nvlist(props, 4);

	nvlist_free(props);

	(void) rw_unlock(&zs->zs_name_lock);
}

/*
 * Test snapshot hold/release and deferred destroy.
 */
void
ztest_dmu_snapshot_hold(ztest_ds_t *zd, uint64_t id)
{
	int error;
	objset_t *os = zd->zd_os;
	objset_t *origin;
	char snapname[100];
	char fullname[100];
	char clonename[100];
	char tag[100];
	char osname[MAXNAMELEN];

	(void) rw_rdlock(&ztest_shared->zs_name_lock);

	dmu_objset_name(os, osname);

	(void) snprintf(snapname, 100, "sh1_%llu", id);
	(void) snprintf(fullname, 100, "%s@%s", osname, snapname);
	(void) snprintf(clonename, 100, "%s/ch1_%llu", osname, id);
	(void) snprintf(tag, 100, "tag_%llu", id);

	/*
	 * Clean up from any previous run.
	 */
	(void) dmu_objset_destroy(clonename, B_FALSE);
	(void) dsl_dataset_user_release(osname, snapname, tag, B_FALSE);
	(void) dmu_objset_destroy(fullname, B_FALSE);

	/*
	 * Create snapshot, clone it, mark snap for deferred destroy,
	 * destroy clone, verify snap was also destroyed.
	 */
	error = dmu_objset_snapshot(osname, snapname, NULL, FALSE);
	if (error) {
		if (error == ENOSPC) {
			ztest_record_enospc("dmu_objset_snapshot");
			goto out;
		}
		fatal(0, "dmu_objset_snapshot(%s) = %d", fullname, error);
	}

	error = dmu_objset_hold(fullname, FTAG, &origin);
	if (error)
		fatal(0, "dmu_objset_hold(%s) = %d", fullname, error);

	error = dmu_objset_clone(clonename, dmu_objset_ds(origin), 0);
	dmu_objset_rele(origin, FTAG);
	if (error) {
		if (error == ENOSPC) {
			ztest_record_enospc("dmu_objset_clone");
			goto out;
		}
		fatal(0, "dmu_objset_clone(%s) = %d", clonename, error);
	}

	error = dmu_objset_destroy(fullname, B_TRUE);
	if (error) {
		fatal(0, "dmu_objset_destroy(%s, B_TRUE) = %d",
		    fullname, error);
	}

	error = dmu_objset_destroy(clonename, B_FALSE);
	if (error)
		fatal(0, "dmu_objset_destroy(%s) = %d", clonename, error);

	error = dmu_objset_hold(fullname, FTAG, &origin);
	if (error != ENOENT)
		fatal(0, "dmu_objset_hold(%s) = %d", fullname, error);

	/*
	 * Create snapshot, add temporary hold, verify that we can't
	 * destroy a held snapshot, mark for deferred destroy,
	 * release hold, verify snapshot was destroyed.
	 */
	error = dmu_objset_snapshot(osname, snapname, NULL, FALSE);
	if (error) {
		if (error == ENOSPC) {
			ztest_record_enospc("dmu_objset_snapshot");
			goto out;
		}
		fatal(0, "dmu_objset_snapshot(%s) = %d", fullname, error);
	}

	error = dsl_dataset_user_hold(osname, snapname, tag, B_FALSE, B_TRUE);
	if (error)
		fatal(0, "dsl_dataset_user_hold(%s, %s) = %d",
		    fullname, tag, error);

	error = dmu_objset_destroy(fullname, B_FALSE);
	if (error != EBUSY) {
		fatal(0, "dmu_objset_destroy(%s, B_FALSE) = %d",
		    fullname, error);
	}

	error = dmu_objset_destroy(fullname, B_TRUE);
	if (error) {
		fatal(0, "dmu_objset_destroy(%s, B_TRUE) = %d",
		    fullname, error);
	}

	error = dsl_dataset_user_release(osname, snapname, tag, B_FALSE);
	if (error)
		fatal(0, "dsl_dataset_user_release(%s, %s) = %d",
		    fullname, tag, error);

	VERIFY(dmu_objset_hold(fullname, FTAG, &origin) == ENOENT);

out:
	(void) rw_unlock(&ztest_shared->zs_name_lock);
}
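
/*
 * A note on maxfaults, used throughout the function below: zopt_maxfaults
 * is presumably derived from the configured redundancy (mirrors and raidz
 * parity) during option processing, i.e. it approximates how many
 * concurrent device faults the pool can absorb.  The injector only
 * performs the more destructive faults when enough redundancy remains.
 */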

/*
 * Inject random faults into the on-disk data.
 */
/* ARGSUSED */
void
ztest_fault_inject(ztest_ds_t *zd, uint64_t id)
{
	ztest_shared_t *zs = ztest_shared;
	spa_t *spa = zs->zs_spa;
	int fd;
	uint64_t offset;
	uint64_t leaves = MAX(zopt_mirrors, 1) * zopt_raidz;
	uint64_t bad = 0x1990c0ffeedecade;
	uint64_t top, leaf;
	char path0[MAXPATHLEN];
	char pathrand[MAXPATHLEN];
	size_t fsize;
	int bshift = SPA_MAXBLOCKSHIFT + 2;	/* don't scrog all labels */
	int iters = 1000;
	int maxfaults = zopt_maxfaults;
	vdev_t *vd0 = NULL;
	uint64_t guid0 = 0;
	boolean_t islog = B_FALSE;

	ASSERT(leaves >= 1);

	/*
	 * We need SCL_STATE here because we're going to look at vd0->vdev_tsd.
	 */
	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);

	if (ztest_random(2) == 0) {
		/*
		 * Inject errors on a normal data device or slog device.
		 */
		top = ztest_random_vdev_top(spa, B_TRUE);
		leaf = ztest_random(leaves);

		/*
		 * Generate paths to the first leaf in this top-level vdev,
		 * and to the random leaf we selected.  We'll induce transient
		 * write failures and random online/offline activity on leaf 0,
		 * and we'll write random garbage to the randomly chosen leaf.
		 */
		(void) snprintf(path0, sizeof (path0), ztest_dev_template,
		    zopt_dir, zopt_pool, top * leaves + 0);
		(void) snprintf(pathrand, sizeof (pathrand), ztest_dev_template,
		    zopt_dir, zopt_pool, top * leaves + leaf);

		vd0 = vdev_lookup_by_path(spa->spa_root_vdev, path0);
		if (vd0 != NULL && vd0->vdev_top->vdev_islog)
			islog = B_TRUE;

		if (vd0 != NULL && maxfaults != 1) {
			/*
			 * Make vd0 explicitly claim to be unreadable,
			 * or unwriteable, or reach behind its back
			 * and close the underlying fd.  We can do this if
			 * maxfaults == 0 because we'll fail and reexecute,
			 * and we can do it if maxfaults >= 2 because we'll
			 * have enough redundancy.  If maxfaults == 1, the
			 * combination of this with injection of random data
			 * corruption below exceeds the pool's fault tolerance.
			 */
			vdev_file_t *vf = vd0->vdev_tsd;

			if (vf != NULL && ztest_random(3) == 0) {
				(void) close(vf->vf_vnode->v_fd);
				vf->vf_vnode->v_fd = -1;
			} else if (ztest_random(2) == 0) {
				vd0->vdev_cant_read = B_TRUE;
			} else {
				vd0->vdev_cant_write = B_TRUE;
			}
			guid0 = vd0->vdev_guid;
		}
	} else {
		/*
		 * Inject errors on an l2cache device.
		 */
		spa_aux_vdev_t *sav = &spa->spa_l2cache;

		if (sav->sav_count == 0) {
			spa_config_exit(spa, SCL_STATE, FTAG);
			return;
		}
		vd0 = sav->sav_vdevs[ztest_random(sav->sav_count)];
		guid0 = vd0->vdev_guid;
		(void) strcpy(path0, vd0->vdev_path);
		(void) strcpy(pathrand, vd0->vdev_path);

		leaf = 0;
		leaves = 1;
		maxfaults = INT_MAX;	/* no limit on cache devices */
	}

	spa_config_exit(spa, SCL_STATE, FTAG);

	/*
	 * If we can tolerate two or more faults, or we're dealing
	 * with a slog, randomly online/offline vd0.
	 */
	if ((maxfaults >= 2 || islog) && guid0 != 0) {
		if (ztest_random(10) < 6) {
			int flags = (ztest_random(2) == 0 ?
			    ZFS_OFFLINE_TEMPORARY : 0);

			/*
			 * We have to grab the zs_name_lock as writer to
			 * prevent a race between offlining a slog and
			 * destroying a dataset. Offlining the slog will
			 * grab a reference on the dataset which may cause
			 * dmu_objset_destroy() to fail with EBUSY thus
			 * leaving the dataset in an inconsistent state.
			 */
			if (islog)
				(void) rw_wrlock(&ztest_shared->zs_name_lock);

			VERIFY(vdev_offline(spa, guid0, flags) != EBUSY);

			if (islog)
				(void) rw_unlock(&ztest_shared->zs_name_lock);
		} else {
			(void) vdev_online(spa, guid0, 0, NULL);
		}
	}

	if (maxfaults == 0)
		return;

	/*
	 * We have at least single-fault tolerance, so inject data corruption.
	 */
	fd = open(pathrand, O_RDWR);

	if (fd == -1)	/* we hit a gap in the device namespace */
		return;

	fsize = lseek(fd, 0, SEEK_END);
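
	/*
	 * A sketch of the offset arithmetic below: treat the leaf file as a
	 * sequence of chunks of size (leaves << bshift), and give each leaf
	 * its own (1 << bshift)-sized slice within every chunk.  Redundant
	 * copies of a block live at roughly equal offsets on their leaves,
	 * so confining leaf L's damage to slice L means repeated runs never
	 * corrupt the same data region on two different leaves.  The last
	 * term picks an 8-byte-aligned offset within the first half of the
	 * slice (we inject one 64-bit bad word at a time), and the large
	 * bshift keeps the damage spread out enough that we can't take out
	 * every copy of the vdev labels (the "don't scrog all labels" note
	 * above).
	 */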
	while (--iters != 0) {
		offset = ztest_random(fsize / (leaves << bshift)) *
		    (leaves << bshift) + (leaf << bshift) +
		    (ztest_random(1ULL << (bshift - 1)) & -8ULL);

		if (offset >= fsize)
			continue;

		if (zopt_verbose >= 7)
			(void) printf("injecting bad word into %s,"
			    " offset 0x%llx\n", pathrand, (u_longlong_t)offset);

		if (pwrite(fd, &bad, sizeof (bad), offset) != sizeof (bad))
			fatal(1, "can't inject bad word at 0x%llx in %s",
			    offset, pathrand);
	}

	(void) close(fd);
}

/*
 * Verify that DDT repair works as expected.
 */
void
ztest_ddt_repair(ztest_ds_t *zd, uint64_t id)
{
	ztest_shared_t *zs = ztest_shared;
	spa_t *spa = zs->zs_spa;
	objset_t *os = zd->zd_os;
	ztest_od_t od[1];
	uint64_t object, blocksize, txg, pattern, psize;
	enum zio_checksum checksum = spa_dedup_checksum(spa);
	dmu_buf_t *db;
	dmu_tx_t *tx;
	void *buf;
	blkptr_t blk;
	int copies = 2 * ZIO_DEDUPDITTO_MIN;

	blocksize = ztest_random_blocksize();
	blocksize = MIN(blocksize, 2048);	/* because we write so many */

	ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, 0);

	if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0)
		return;

	/*
	 * Take the name lock as writer to prevent anyone else from changing
	 * the pool and dataset properties we need to maintain during this
	 * test.
	 */
	(void) rw_wrlock(&zs->zs_name_lock);

	if (ztest_dsl_prop_set_uint64(zd->zd_name, ZFS_PROP_DEDUP, checksum,
	    B_FALSE) != 0 ||
	    ztest_dsl_prop_set_uint64(zd->zd_name, ZFS_PROP_COPIES, 1,
	    B_FALSE) != 0) {
		(void) rw_unlock(&zs->zs_name_lock);
		return;
	}

	object = od[0].od_object;
	blocksize = od[0].od_blocksize;
	pattern = spa_guid(spa) ^ dmu_objset_fsid_guid(os);

	ASSERT(object != 0);

	tx = dmu_tx_create(os);
	dmu_tx_hold_write(tx, object, 0, copies * blocksize);
	txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
	if (txg == 0) {
		(void) rw_unlock(&zs->zs_name_lock);
		return;
	}

	/*
	 * Write all the copies of our block.
	 */
	for (int i = 0; i < copies; i++) {
		uint64_t offset = i * blocksize;
		VERIFY(dmu_buf_hold(os, object, offset, FTAG, &db) == 0);
		ASSERT(db->db_offset == offset);
		ASSERT(db->db_size == blocksize);
		ASSERT(ztest_pattern_match(db->db_data, db->db_size, pattern) ||
		    ztest_pattern_match(db->db_data, db->db_size, 0ULL));
		dmu_buf_will_fill(db, tx);
		ztest_pattern_set(db->db_data, db->db_size, pattern);
		dmu_buf_rele(db, FTAG);
	}

	dmu_tx_commit(tx);
	txg_wait_synced(spa_get_dsl(spa), txg);

	/*
	 * Find out what block we got.
	 */
	VERIFY(dmu_buf_hold(os, object, 0, FTAG, &db) == 0);
	blk = *((dmu_buf_impl_t *)db)->db_blkptr;
	dmu_buf_rele(db, FTAG);
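
	/*
	 * Why damaging one copy should be survivable (a sketch): all of the
	 * blocks written above hold the same pattern, so after dedup they
	 * share one DDT entry whose reference count is 2 * ZIO_DEDUPDITTO_MIN.
	 * ztest_run() sets spa_dedup_ditto to that same value, so the DDT
	 * should have written an extra "ditto" copy once the refcount
	 * crossed the threshold -- which is what makes the damaged copy
	 * repairable on read.
	 */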

	/*
	 * Damage the block.  Dedup-ditto will save us when we read it later.
	 */
	psize = BP_GET_PSIZE(&blk);
	buf = zio_buf_alloc(psize);
	ztest_pattern_set(buf, psize, ~pattern);

	(void) zio_wait(zio_rewrite(NULL, spa, 0, &blk,
	    buf, psize, NULL, NULL, ZIO_PRIORITY_SYNC_WRITE,
	    ZIO_FLAG_CANFAIL | ZIO_FLAG_INDUCE_DAMAGE, NULL));

	zio_buf_free(buf, psize);

	(void) rw_unlock(&zs->zs_name_lock);
}

/*
 * Scrub the pool.
 */
/* ARGSUSED */
void
ztest_scrub(ztest_ds_t *zd, uint64_t id)
{
	ztest_shared_t *zs = ztest_shared;
	spa_t *spa = zs->zs_spa;

	(void) spa_scrub(spa, POOL_SCRUB_EVERYTHING);
	(void) poll(NULL, 0, 100);	/* wait a moment, then force a restart */
	(void) spa_scrub(spa, POOL_SCRUB_EVERYTHING);
}

/*
 * Rename the pool to a different name and then rename it back.
 */
/* ARGSUSED */
void
ztest_spa_rename(ztest_ds_t *zd, uint64_t id)
{
	ztest_shared_t *zs = ztest_shared;
	char *oldname, *newname;
	spa_t *spa;

	(void) rw_wrlock(&zs->zs_name_lock);

	oldname = zs->zs_pool;
	newname = umem_alloc(strlen(oldname) + 5, UMEM_NOFAIL);
	(void) strcpy(newname, oldname);
	(void) strcat(newname, "_tmp");

	/*
	 * Do the rename.
	 */
	VERIFY3U(0, ==, spa_rename(oldname, newname));

	/*
	 * Try to open it under the old name, which shouldn't exist.
	 */
	VERIFY3U(ENOENT, ==, spa_open(oldname, &spa, FTAG));

	/*
	 * Open it under the new name and make sure it's still the same spa_t.
	 */
	VERIFY3U(0, ==, spa_open(newname, &spa, FTAG));

	ASSERT(spa == zs->zs_spa);
	spa_close(spa, FTAG);

	/*
	 * Rename it back to the original.
	 */
	VERIFY3U(0, ==, spa_rename(newname, oldname));

	/*
	 * Make sure it can still be opened.
	 */
	VERIFY3U(0, ==, spa_open(oldname, &spa, FTAG));

	ASSERT(spa == zs->zs_spa);
	spa_close(spa, FTAG);

	umem_free(newname, strlen(newname) + 1);

	(void) rw_unlock(&zs->zs_name_lock);
}

/*
 * Verify pool integrity by running zdb.
 */
static void
ztest_run_zdb(char *pool)
{
	int status;
	char zdb[MAXPATHLEN + MAXNAMELEN + 20];
	char zbuf[1024];
	char *bin;
	char *ztest;
	char *isa;
	int isalen;
	FILE *fp;

	(void) realpath(getexecname(), zdb);
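
	/*
	 * A sketch of the path surgery below: getexecname() yields a path
	 * of the form /usr/bin/<isa>/ztest, where <isa> is an optional
	 * instruction-set subdirectory.  We copy the <isa> component and
	 * splice it into a /usr/sbin/<isa>/zdb command line so that we run
	 * the zdb binary matching our own word size -- e.g. (purely
	 * illustrative) /usr/bin/amd64/ztest would run /usr/sbin/amd64/zdb.
	 */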
"v" : "", 4617 pool); 4618 free(isa); 4619 4620 if (zopt_verbose >= 5) 4621 (void) printf("Executing %s\n", strstr(zdb, "zdb ")); 4622 4623 fp = popen(zdb, "r"); 4624 4625 while (fgets(zbuf, sizeof (zbuf), fp) != NULL) 4626 if (zopt_verbose >= 3) 4627 (void) printf("%s", zbuf); 4628 4629 status = pclose(fp); 4630 4631 if (status == 0) 4632 return; 4633 4634 ztest_dump_core = 0; 4635 if (WIFEXITED(status)) 4636 fatal(0, "'%s' exit code %d", zdb, WEXITSTATUS(status)); 4637 else 4638 fatal(0, "'%s' died with signal %d", zdb, WTERMSIG(status)); 4639 } 4640 4641 static void 4642 ztest_walk_pool_directory(char *header) 4643 { 4644 spa_t *spa = NULL; 4645 4646 if (zopt_verbose >= 6) 4647 (void) printf("%s\n", header); 4648 4649 mutex_enter(&spa_namespace_lock); 4650 while ((spa = spa_next(spa)) != NULL) 4651 if (zopt_verbose >= 6) 4652 (void) printf("\t%s\n", spa_name(spa)); 4653 mutex_exit(&spa_namespace_lock); 4654 } 4655 4656 static void 4657 ztest_spa_import_export(char *oldname, char *newname) 4658 { 4659 nvlist_t *config, *newconfig; 4660 uint64_t pool_guid; 4661 spa_t *spa; 4662 4663 if (zopt_verbose >= 4) { 4664 (void) printf("import/export: old = %s, new = %s\n", 4665 oldname, newname); 4666 } 4667 4668 /* 4669 * Clean up from previous runs. 4670 */ 4671 (void) spa_destroy(newname); 4672 4673 /* 4674 * Get the pool's configuration and guid. 4675 */ 4676 VERIFY3U(0, ==, spa_open(oldname, &spa, FTAG)); 4677 4678 /* 4679 * Kick off a scrub to tickle scrub/export races. 4680 */ 4681 if (ztest_random(2) == 0) 4682 (void) spa_scrub(spa, POOL_SCRUB_EVERYTHING); 4683 4684 pool_guid = spa_guid(spa); 4685 spa_close(spa, FTAG); 4686 4687 ztest_walk_pool_directory("pools before export"); 4688 4689 /* 4690 * Export it. 4691 */ 4692 VERIFY3U(0, ==, spa_export(oldname, &config, B_FALSE, B_FALSE)); 4693 4694 ztest_walk_pool_directory("pools after export"); 4695 4696 /* 4697 * Try to import it. 4698 */ 4699 newconfig = spa_tryimport(config); 4700 ASSERT(newconfig != NULL); 4701 nvlist_free(newconfig); 4702 4703 /* 4704 * Import it under the new name. 4705 */ 4706 VERIFY3U(0, ==, spa_import(newname, config, NULL)); 4707 4708 ztest_walk_pool_directory("pools after import"); 4709 4710 /* 4711 * Try to import it again -- should fail with EEXIST. 4712 */ 4713 VERIFY3U(EEXIST, ==, spa_import(newname, config, NULL)); 4714 4715 /* 4716 * Try to import it under a different name -- should fail with EEXIST. 4717 */ 4718 VERIFY3U(EEXIST, ==, spa_import(oldname, config, NULL)); 4719 4720 /* 4721 * Verify that the pool is no longer visible under the old name. 4722 */ 4723 VERIFY3U(ENOENT, ==, spa_open(oldname, &spa, FTAG)); 4724 4725 /* 4726 * Verify that we can open and close the pool using the new name. 

static void
ztest_resume(spa_t *spa)
{
	if (spa_suspended(spa) && zopt_verbose >= 6)
		(void) printf("resuming from suspended state\n");
	spa_vdev_state_enter(spa, SCL_NONE);
	vdev_clear(spa, NULL);
	(void) spa_vdev_state_exit(spa, NULL, 0);
	(void) zio_resume(spa);
}

static void *
ztest_resume_thread(void *arg)
{
	spa_t *spa = arg;

	while (!ztest_exiting) {
		if (spa_suspended(spa))
			ztest_resume(spa);
		(void) poll(NULL, 0, 100);
	}
	return (NULL);
}

static void *
ztest_deadman_thread(void *arg)
{
	ztest_shared_t *zs = arg;
	int grace = 300;
	hrtime_t delta;

	delta = (zs->zs_thread_stop - zs->zs_thread_start) / NANOSEC + grace;

	(void) poll(NULL, 0, (int)(1000 * delta));

	fatal(0, "failed to complete within %d seconds of deadline", grace);

	return (NULL);
}

static void
ztest_execute(ztest_info_t *zi, uint64_t id)
{
	ztest_shared_t *zs = ztest_shared;
	ztest_ds_t *zd = &zs->zs_zd[id % zopt_datasets];
	hrtime_t functime = gethrtime();

	for (int i = 0; i < zi->zi_iters; i++)
		zi->zi_func(zd, id);

	functime = gethrtime() - functime;

	atomic_add_64(&zi->zi_call_count, 1);
	atomic_add_64(&zi->zi_call_time, functime);

	if (zopt_verbose >= 4) {
		Dl_info dli;
		(void) dladdr((void *)zi->zi_func, &dli);
		(void) printf("%6.2f sec in %s\n",
		    (double)functime / NANOSEC, dli.dli_sname);
	}
}

static void *
ztest_thread(void *arg)
{
	uint64_t id = (uintptr_t)arg;
	ztest_shared_t *zs = ztest_shared;
	uint64_t call_next;
	hrtime_t now;
	ztest_info_t *zi;

	while ((now = gethrtime()) < zs->zs_thread_stop) {
		/*
		 * See if it's time to force a crash.
		 */
		if (now > zs->zs_thread_kill)
			ztest_kill(zs);

		/*
		 * If we're getting ENOSPC with some regularity, stop.
		 */
		if (zs->zs_enospc_count > 10)
			break;

		/*
		 * Pick a random function to execute.
		 */
		zi = &zs->zs_info[ztest_random(ZTEST_FUNCS)];
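
		/*
		 * How the compare-and-swap below schedules work:
		 * zi_call_next is the next time this function is due.
		 * Every thread that notices the deadline has passed races
		 * to advance it with atomic_cas_64(); exactly one wins and
		 * runs the function.  The advance is a random amount in
		 * [0, 2 * zi_interval[0]], which averages out to the
		 * configured per-function interval.
		 */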
		call_next = zi->zi_call_next;

		if (now >= call_next &&
		    atomic_cas_64(&zi->zi_call_next, call_next, call_next +
		    ztest_random(2 * zi->zi_interval[0] + 1)) == call_next)
			ztest_execute(zi, id);
	}

	return (NULL);
}

static void
ztest_dataset_name(char *dsname, char *pool, int d)
{
	(void) snprintf(dsname, MAXNAMELEN, "%s/ds_%d", pool, d);
}

static void
ztest_dataset_destroy(ztest_shared_t *zs, int d)
{
	char name[MAXNAMELEN];

	ztest_dataset_name(name, zs->zs_pool, d);

	if (zopt_verbose >= 3)
		(void) printf("Destroying %s to free up space\n", name);

	/*
	 * Clean up any non-standard clones and snapshots.  In general,
	 * ztest thread t operates on dataset (t % zopt_datasets),
	 * so there may be more than one thing to clean up.
	 */
	for (int t = d; t < zopt_threads; t += zopt_datasets)
		ztest_dsl_dataset_cleanup(name, t);

	(void) dmu_objset_find(name, ztest_objset_destroy_cb, NULL,
	    DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN);
}

static void
ztest_dataset_dirobj_verify(ztest_ds_t *zd)
{
	uint64_t usedobjs, dirobjs, scratch;

	/*
	 * ZTEST_DIROBJ is the object directory for the entire dataset.
	 * Therefore, the number of objects in use should equal the
	 * number of ZTEST_DIROBJ entries, +1 for ZTEST_DIROBJ itself.
	 * If not, we have an object leak.
	 *
	 * Note that we can only check this in ztest_dataset_open(),
	 * when the open-context and syncing-context values agree.
	 * That's because zap_count() returns the open-context value,
	 * while dmu_objset_space() returns the rootbp fill count.
	 */
	VERIFY3U(0, ==, zap_count(zd->zd_os, ZTEST_DIROBJ, &dirobjs));
	dmu_objset_space(zd->zd_os, &scratch, &scratch, &usedobjs, &scratch);
	ASSERT3U(dirobjs + 1, ==, usedobjs);
}
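
/*
 * Note on the "missing log records" checks in ztest_dataset_open() below:
 * zd_seq (maintained by the ZIL commit tests elsewhere in this file)
 * records the highest ZIL sequence number this dataset is known to have
 * committed.  After a crash or reopen, the claimed or replayed sequence
 * number must be at least that value; anything lower means log records
 * that were acknowledged as committed have been lost.
 */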

static int
ztest_dataset_open(ztest_shared_t *zs, int d)
{
	ztest_ds_t *zd = &zs->zs_zd[d];
	uint64_t committed_seq = zd->zd_seq;
	objset_t *os;
	zilog_t *zilog;
	char name[MAXNAMELEN];
	int error;

	ztest_dataset_name(name, zs->zs_pool, d);

	(void) rw_rdlock(&zs->zs_name_lock);

	error = dmu_objset_create(name, DMU_OST_OTHER, 0,
	    ztest_objset_create_cb, NULL);
	if (error == ENOSPC) {
		(void) rw_unlock(&zs->zs_name_lock);
		ztest_record_enospc(FTAG);
		return (error);
	}
	ASSERT(error == 0 || error == EEXIST);

	VERIFY3U(dmu_objset_hold(name, zd, &os), ==, 0);
	(void) rw_unlock(&zs->zs_name_lock);

	ztest_zd_init(zd, os);

	zilog = zd->zd_zilog;

	if (zilog->zl_header->zh_claim_lr_seq != 0 &&
	    zilog->zl_header->zh_claim_lr_seq < committed_seq)
		fatal(0, "missing log records: claimed %llu < committed %llu",
		    zilog->zl_header->zh_claim_lr_seq, committed_seq);

	ztest_dataset_dirobj_verify(zd);

	zil_replay(os, zd, ztest_replay_vector);

	ztest_dataset_dirobj_verify(zd);

	if (zopt_verbose >= 6)
		(void) printf("%s replay %llu blocks, %llu records, seq %llu\n",
		    zd->zd_name,
		    (u_longlong_t)zilog->zl_parse_blk_count,
		    (u_longlong_t)zilog->zl_parse_lr_count,
		    (u_longlong_t)zilog->zl_replaying_seq);

	zilog = zil_open(os, ztest_get_data);

	if (zilog->zl_replaying_seq != 0 &&
	    zilog->zl_replaying_seq < committed_seq)
		fatal(0, "missing log records: replayed %llu < committed %llu",
		    zilog->zl_replaying_seq, committed_seq);

	return (0);
}

static void
ztest_dataset_close(ztest_shared_t *zs, int d)
{
	ztest_ds_t *zd = &zs->zs_zd[d];

	zil_close(zd->zd_zilog);
	dmu_objset_rele(zd->zd_os, zd);

	ztest_zd_fini(zd);
}

/*
 * Kick off threads to run tests on all datasets in parallel.
 */
static void
ztest_run(ztest_shared_t *zs)
{
	thread_t *tid;
	spa_t *spa;
	thread_t resume_tid;
	int error;

	ztest_exiting = B_FALSE;

	/*
	 * Initialize parent/child shared state.
	 */
	VERIFY(_mutex_init(&zs->zs_vdev_lock, USYNC_THREAD, NULL) == 0);
	VERIFY(rwlock_init(&zs->zs_name_lock, USYNC_THREAD, NULL) == 0);

	zs->zs_thread_start = gethrtime();
	zs->zs_thread_stop = zs->zs_thread_start + zopt_passtime * NANOSEC;
	zs->zs_thread_stop = MIN(zs->zs_thread_stop, zs->zs_proc_stop);
	zs->zs_thread_kill = zs->zs_thread_stop;
	if (ztest_random(100) < zopt_killrate)
		zs->zs_thread_kill -= ztest_random(zopt_passtime * NANOSEC);
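
	/*
	 * In other words: with probability zopt_killrate percent, this pass
	 * gets a SIGKILL scheduled at a uniformly random moment before the
	 * pass would otherwise stop; ztest_thread() watches zs_thread_kill
	 * and calls ztest_kill() once that moment arrives.  This is the
	 * "pull the power cord" half of the crash-consistency test.
	 */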

	(void) _mutex_init(&zcl.zcl_callbacks_lock, USYNC_THREAD, NULL);

	list_create(&zcl.zcl_callbacks, sizeof (ztest_cb_data_t),
	    offsetof(ztest_cb_data_t, zcd_node));

	/*
	 * Open our pool.
	 */
	kernel_init(FREAD | FWRITE);
	VERIFY(spa_open(zs->zs_pool, &spa, FTAG) == 0);
	zs->zs_spa = spa;

	spa->spa_dedup_ditto = 2 * ZIO_DEDUPDITTO_MIN;

	/*
	 * We don't expect the pool to suspend unless maxfaults == 0,
	 * in which case ztest_fault_inject() temporarily takes away
	 * the only valid replica.
	 */
	if (zopt_maxfaults == 0)
		spa->spa_failmode = ZIO_FAILURE_MODE_WAIT;
	else
		spa->spa_failmode = ZIO_FAILURE_MODE_PANIC;

	/*
	 * Create a thread to periodically resume suspended I/O.
	 */
	VERIFY(thr_create(0, 0, ztest_resume_thread, spa, THR_BOUND,
	    &resume_tid) == 0);

	/*
	 * Create a deadman thread to abort() if we hang.
	 */
	VERIFY(thr_create(0, 0, ztest_deadman_thread, zs, THR_BOUND,
	    NULL) == 0);

	/*
	 * Verify that we can safely inquire about any object,
	 * whether it's allocated or not.  To make it interesting,
	 * we probe a 5-wide window around each power of two.
	 * This hits all edge cases, including zero and the max.
	 */
	for (int t = 0; t < 64; t++) {
		for (int d = -5; d <= 5; d++) {
			error = dmu_object_info(spa->spa_meta_objset,
			    (1ULL << t) + d, NULL);
			ASSERT(error == 0 || error == ENOENT ||
			    error == EINVAL);
		}
	}

	/*
	 * If we got any ENOSPC errors on the previous run, destroy something.
	 */
	if (zs->zs_enospc_count != 0) {
		int d = ztest_random(zopt_datasets);
		ztest_dataset_destroy(zs, d);
	}
	zs->zs_enospc_count = 0;

	tid = umem_zalloc(zopt_threads * sizeof (thread_t), UMEM_NOFAIL);

	if (zopt_verbose >= 4)
		(void) printf("starting main threads...\n");

	/*
	 * Kick off all the tests that run in parallel.
	 */
	for (int t = 0; t < zopt_threads; t++) {
		if (t < zopt_datasets && ztest_dataset_open(zs, t) != 0)
			return;
		VERIFY(thr_create(0, 0, ztest_thread, (void *)(uintptr_t)t,
		    THR_BOUND, &tid[t]) == 0);
	}

	/*
	 * Wait for all of the tests to complete.  We go in reverse order
	 * so we don't close datasets while threads are still using them.
	 */
	for (int t = zopt_threads - 1; t >= 0; t--) {
		VERIFY(thr_join(tid[t], NULL, NULL) == 0);
		if (t < zopt_datasets)
			ztest_dataset_close(zs, t);
	}

	txg_wait_synced(spa_get_dsl(spa), 0);

	zs->zs_alloc = metaslab_class_get_alloc(spa_normal_class(spa));
	zs->zs_space = metaslab_class_get_space(spa_normal_class(spa));

	umem_free(tid, zopt_threads * sizeof (thread_t));

	/* Kill the resume thread */
	ztest_exiting = B_TRUE;
	VERIFY(thr_join(resume_tid, NULL, NULL) == 0);
	ztest_resume(spa);

	/*
	 * Right before closing the pool, kick off a bunch of async I/O;
	 * spa_close() should wait for it to complete.
	 */
	for (uint64_t object = 1; object < 50; object++)
		dmu_prefetch(spa->spa_meta_objset, object, 0, 1ULL << 20);

	spa_close(spa, FTAG);

	/*
	 * Verify that we can loop over all pools.
	 */
	mutex_enter(&spa_namespace_lock);
	for (spa = spa_next(NULL); spa != NULL; spa = spa_next(spa))
		if (zopt_verbose > 3)
			(void) printf("spa_next: found %s\n", spa_name(spa));
	mutex_exit(&spa_namespace_lock);

	/*
	 * Verify that we can export the pool and reimport it under a
	 * different name.
	 */
	if (ztest_random(2) == 0) {
		char name[MAXNAMELEN];
		(void) snprintf(name, MAXNAMELEN, "%s_import", zs->zs_pool);
		ztest_spa_import_export(zs->zs_pool, name);
		ztest_spa_import_export(name, zs->zs_pool);
	}

	kernel_fini();

	list_destroy(&zcl.zcl_callbacks);

	(void) _mutex_destroy(&zcl.zcl_callbacks_lock);

	(void) rwlock_destroy(&zs->zs_name_lock);
	(void) _mutex_destroy(&zs->zs_vdev_lock);
}
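
/*
 * Test spa_freeze(): freeze the pool, commit changes through the ZIL only,
 * then reopen the pool to force replay of everything written while frozen.
 */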
static void
ztest_freeze(ztest_shared_t *zs)
{
	ztest_ds_t *zd = &zs->zs_zd[0];
	spa_t *spa;

	if (zopt_verbose >= 3)
		(void) printf("testing spa_freeze()...\n");

	kernel_init(FREAD | FWRITE);
	VERIFY3U(0, ==, spa_open(zs->zs_pool, &spa, FTAG));
	VERIFY3U(0, ==, ztest_dataset_open(zs, 0));

	/*
	 * Force the first log block to be transactionally allocated.
	 * We have to do this before we freeze the pool -- otherwise
	 * the log chain won't be anchored.
	 */
	while (BP_IS_HOLE(&zd->zd_zilog->zl_header->zh_log)) {
		ztest_dmu_object_alloc_free(zd, 0);
		zil_commit(zd->zd_zilog, UINT64_MAX, 0);
	}

	txg_wait_synced(spa_get_dsl(spa), 0);

	/*
	 * Freeze the pool.  This stops spa_sync() from doing anything,
	 * so that the only way to record changes from now on is the ZIL.
	 */
	spa_freeze(spa);

	/*
	 * Run tests that generate log records but don't alter the pool config
	 * or depend on DSL sync tasks (snapshots, objset create/destroy, etc).
	 * We do a txg_wait_synced() after each iteration to force the txg
	 * to increase well beyond the last synced value in the uberblock.
	 * The ZIL should be OK with that.
	 */
	while (ztest_random(20) != 0) {
		ztest_dmu_write_parallel(zd, 0);
		ztest_dmu_object_alloc_free(zd, 0);
		txg_wait_synced(spa_get_dsl(spa), 0);
	}

	/*
	 * Commit all of the changes we just generated.
	 */
	zil_commit(zd->zd_zilog, UINT64_MAX, 0);
	txg_wait_synced(spa_get_dsl(spa), 0);

	/*
	 * Close our dataset and close the pool.
	 */
	ztest_dataset_close(zs, 0);
	spa_close(spa, FTAG);
	kernel_fini();

	/*
	 * Open and close the pool and dataset to induce log replay.
	 */
	kernel_init(FREAD | FWRITE);
	VERIFY3U(0, ==, spa_open(zs->zs_pool, &spa, FTAG));
	VERIFY3U(0, ==, ztest_dataset_open(zs, 0));
	ztest_dataset_close(zs, 0);
	spa_close(spa, FTAG);
	kernel_fini();
}

void
print_time(hrtime_t t, char *timebuf)
{
	hrtime_t s = t / NANOSEC;
	hrtime_t m = s / 60;
	hrtime_t h = m / 60;
	hrtime_t d = h / 24;

	s -= m * 60;
	m -= h * 60;
	h -= d * 24;

	timebuf[0] = '\0';

	if (d)
		(void) sprintf(timebuf,
		    "%llud%02lluh%02llum%02llus", d, h, m, s);
	else if (h)
		(void) sprintf(timebuf, "%lluh%02llum%02llus", h, m, s);
	else if (m)
		(void) sprintf(timebuf, "%llum%02llus", m, s);
	else
		(void) sprintf(timebuf, "%llus", s);
}

/*
 * Create a storage pool with the given name and initial vdev size,
 * then test spa_freeze() functionality.
 */
static void
ztest_init(ztest_shared_t *zs)
{
	spa_t *spa;
	nvlist_t *nvroot;

	VERIFY(_mutex_init(&zs->zs_vdev_lock, USYNC_THREAD, NULL) == 0);
	VERIFY(rwlock_init(&zs->zs_name_lock, USYNC_THREAD, NULL) == 0);

	kernel_init(FREAD | FWRITE);

	/*
	 * Create the storage pool.
	 */
	(void) spa_destroy(zs->zs_pool);
	ztest_shared->zs_vdev_next_leaf = 0;
	nvroot = make_vdev_root(NULL, NULL, zopt_vdev_size, 0,
	    0, zopt_raidz, zopt_mirrors, 1);
	VERIFY3U(0, ==, spa_create(zs->zs_pool, nvroot, NULL, NULL, NULL));
	nvlist_free(nvroot);

	VERIFY3U(0, ==, spa_open(zs->zs_pool, &spa, FTAG));
	metaslab_sz = 1ULL << spa->spa_root_vdev->vdev_child[0]->vdev_ms_shift;
	spa_close(spa, FTAG);

	kernel_fini();

	ztest_run_zdb(zs->zs_pool);

	ztest_freeze(zs);

	ztest_run_zdb(zs->zs_pool);
}

int
main(int argc, char **argv)
{
	int kills = 0;
	int iters = 0;
	ztest_shared_t *zs;
	size_t shared_size;
	ztest_info_t *zi;
	char timebuf[100];
	char numbuf[6];
	spa_t *spa;

	(void) setvbuf(stdout, NULL, _IOLBF, 0);

	/* Override location of zpool.cache */
	spa_config_path = "/tmp/zpool.cache";

	ztest_random_fd = open("/dev/urandom", O_RDONLY);

	process_options(argc, argv);

	/*
	 * Blow away any existing copy of zpool.cache.
	 */
	if (zopt_init != 0)
		(void) remove("/tmp/zpool.cache");
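
	/*
	 * The shared state below is mapped MAP_SHARED | MAP_ANON before we
	 * fork, so parent and child see the same memory: the child updates
	 * zs_enospc_count, zs_alloc/zs_space, and the per-function call
	 * counters in place, and the parent can still read them after the
	 * child has exited or been killed -- that's what makes the per-pass
	 * summaries in the main loop possible.
	 */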
	shared_size = sizeof (*zs) + zopt_datasets * sizeof (ztest_ds_t);

	zs = ztest_shared = (void *)mmap(0,
	    P2ROUNDUP(shared_size, getpagesize()),
	    PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANON, -1, 0);

	if (zopt_verbose >= 1) {
		(void) printf("%llu vdevs, %d datasets, %d threads,"
		    " %llu seconds...\n",
		    (u_longlong_t)zopt_vdevs, zopt_datasets, zopt_threads,
		    (u_longlong_t)zopt_time);
	}

	/*
	 * Create and initialize our storage pool.
	 */
	for (int i = 1; i <= zopt_init; i++) {
		bzero(zs, sizeof (ztest_shared_t));
		if (zopt_verbose >= 3 && zopt_init != 1)
			(void) printf("ztest_init(), pass %d\n", i);
		zs->zs_pool = zopt_pool;
		ztest_init(zs);
	}

	zs->zs_pool = zopt_pool;
	zs->zs_proc_start = gethrtime();
	zs->zs_proc_stop = zs->zs_proc_start + zopt_time * NANOSEC;

	for (int f = 0; f < ZTEST_FUNCS; f++) {
		zi = &zs->zs_info[f];
		*zi = ztest_info[f];
		if (zs->zs_proc_start + zi->zi_interval[0] > zs->zs_proc_stop)
			zi->zi_call_next = UINT64_MAX;
		else
			zi->zi_call_next = zs->zs_proc_start +
			    ztest_random(2 * zi->zi_interval[0] + 1);
	}

	/*
	 * Run the tests in a loop.  These tests include fault injection
	 * to verify that self-healing data works, and forced crashes
	 * to verify that we never lose on-disk consistency.
	 */
	while (gethrtime() < zs->zs_proc_stop) {
		int status;
		pid_t pid;

		/*
		 * Initialize the workload counters for each function.
		 */
		for (int f = 0; f < ZTEST_FUNCS; f++) {
			zi = &zs->zs_info[f];
			zi->zi_call_count = 0;
			zi->zi_call_time = 0;
		}

		/* Set the allocation switch size */
		metaslab_df_alloc_threshold = ztest_random(metaslab_sz / 4) + 1;

		pid = fork();

		if (pid == -1)
			fatal(1, "fork failed");

		if (pid == 0) {	/* child */
			struct rlimit rl = { 1024, 1024 };
			(void) setrlimit(RLIMIT_NOFILE, &rl);
			(void) enable_extended_FILE_stdio(-1, -1);
			ztest_run(zs);
			exit(0);
		}

		while (waitpid(pid, &status, 0) != pid)
			continue;

		if (WIFEXITED(status)) {
			if (WEXITSTATUS(status) != 0) {
				(void) fprintf(stderr,
				    "child exited with code %d\n",
				    WEXITSTATUS(status));
				exit(2);
			}
		} else if (WIFSIGNALED(status)) {
			if (WTERMSIG(status) != SIGKILL) {
				(void) fprintf(stderr,
				    "child died with signal %d\n",
				    WTERMSIG(status));
				exit(3);
			}
			kills++;
		} else {
			(void) fprintf(stderr, "something strange happened "
			    "to child\n");
			exit(4);
		}

		iters++;

		if (zopt_verbose >= 1) {
			hrtime_t now = gethrtime();

			now = MIN(now, zs->zs_proc_stop);
			print_time(zs->zs_proc_stop - now, timebuf);
			nicenum(zs->zs_space, numbuf);

			(void) printf("Pass %3d, %8s, %3llu ENOSPC, "
			    "%4.1f%% of %5s used, %3.0f%% done, %8s to go\n",
			    iters,
			    WIFEXITED(status) ? "Complete" : "SIGKILL",
			    (u_longlong_t)zs->zs_enospc_count,
			    100.0 * zs->zs_alloc / zs->zs_space,
			    numbuf,
			    100.0 * (now - zs->zs_proc_start) /
			    (zopt_time * NANOSEC), timebuf);
		}

		if (zopt_verbose >= 2) {
			(void) printf("\nWorkload summary:\n\n");
			(void) printf("%7s %9s %s\n",
			    "Calls", "Time", "Function");
			(void) printf("%7s %9s %s\n",
			    "-----", "----", "--------");
			for (int f = 0; f < ZTEST_FUNCS; f++) {
				Dl_info dli;

				zi = &zs->zs_info[f];
				print_time(zi->zi_call_time, timebuf);
				(void) dladdr((void *)zi->zi_func, &dli);
				(void) printf("%7llu %9s %s\n",
				    (u_longlong_t)zi->zi_call_count, timebuf,
				    dli.dli_sname);
			}
			(void) printf("\n");
		}

		/*
		 * It's possible that we killed a child during a rename test,
		 * in which case we'll have a 'ztest_tmp' pool lying around
		 * instead of 'ztest'.  Do a blind rename in case this
		 * happened.
		 */
		kernel_init(FREAD);
		if (spa_open(zopt_pool, &spa, FTAG) == 0) {
			spa_close(spa, FTAG);
		} else {
			char tmpname[MAXNAMELEN];
			kernel_fini();
			kernel_init(FREAD | FWRITE);
			(void) snprintf(tmpname, sizeof (tmpname), "%s_tmp",
			    zopt_pool);
			(void) spa_rename(tmpname, zopt_pool);
		}
		kernel_fini();

		ztest_run_zdb(zopt_pool);
	}

	if (zopt_verbose >= 1) {
		(void) printf("%d killed, %d completed, %.0f%% kill rate\n",
		    kills, iters - kills, (100.0 * kills) / MAX(1, iters));
	}

	return (0);
}