/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
 * Copyright (c) 2013 Steven Hartland. All rights reserved.
 * Copyright (c) 2014 Integros [integros.com]
 * Copyright 2017 Joyent, Inc.
 * Copyright (c) 2017, Intel Corporation.
 * Copyright 2017 RackTop Systems.
 */

/*
 * The objective of this program is to provide a DMU/ZAP/SPA stress test
 * that runs entirely in userland, is easy to use, and easy to extend.
 *
 * The overall design of the ztest program is as follows:
 *
 * (1) For each major functional area (e.g. adding vdevs to a pool,
 *     creating and destroying datasets, reading and writing objects, etc)
 *     we have a simple routine to test that functionality. These
 *     individual routines do not have to do anything "stressful".
 *
 * (2) We turn these simple functionality tests into a stress test by
 *     running them all in parallel, with as many threads as desired,
 *     and spread across as many datasets, objects, and vdevs as desired.
 *
 * (3) While all this is happening, we inject faults into the pool to
 *     verify that self-healing data really works.
 *
 * (4) Every time we open a dataset, we change its checksum and compression
 *     functions. Thus even individual objects vary from block to block
 *     in which checksum they use and whether they're compressed.
 *
 * (5) To verify that we never lose on-disk consistency after a crash,
 *     we run the entire test in a child of the main process.
 *     At random times, the child self-immolates with a SIGKILL.
 *     This is the software equivalent of pulling the power cord.
 *     The parent then runs the test again, using the existing
 *     storage pool, as many times as desired. If backwards compatibility
 *     testing is enabled ztest will sometimes run the "older" version
 *     of ztest after a SIGKILL.
 *
 * (6) To verify that we don't have future leaks or temporal incursions,
 *     many of the functional tests record the transaction group number
 *     as part of their data. When reading old data, they verify that
 *     the transaction group number is less than the current, open txg.
 *     If you add a new test, please do this if applicable.
 *
 * When run with no arguments, ztest runs for about five minutes and
 * produces no output if successful. To get a little bit of information,
 * specify -V. To get more information, specify -VV, and so on.
 *
 * To turn this into an overnight stress test, use -T to specify run time.
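 *
 * For example (an illustrative invocation only; the options are described
 * in usage() below and the values here are arbitrary):
 *
 *	ztest -VV -T 28800 -P 120 -f /var/tmp
 *
 * runs for eight hours with extra verbosity, 120-second passes, and vdev
 * files placed under /var/tmp.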
 *
 * You can ask for more vdevs [-v], datasets [-d], or threads [-t]
 * to increase the pool capacity, fanout, and overall stress level.
 *
 * Use the -k option to set the desired frequency of kills.
 *
 * When ztest invokes itself it passes all relevant information through a
 * temporary file which is mmap-ed in the child process. This allows shared
 * memory to survive the exec syscall. The ztest_shared_hdr_t struct is always
 * stored at offset 0 of this file and contains information on the size and
 * number of shared structures in the file. The information stored in this file
 * must remain backwards compatible with older versions of ztest so that
 * ztest can invoke them during backwards compatibility testing (-B).
 */

#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/dmu.h>
#include <sys/txg.h>
#include <sys/dbuf.h>
#include <sys/zap.h>
#include <sys/dmu_objset.h>
#include <sys/poll.h>
#include <sys/stat.h>
#include <sys/time.h>
#include <sys/wait.h>
#include <sys/mman.h>
#include <sys/resource.h>
#include <sys/zio.h>
#include <sys/zil.h>
#include <sys/zil_impl.h>
#include <sys/vdev_impl.h>
#include <sys/vdev_file.h>
#include <sys/vdev_initialize.h>
#include <sys/vdev_trim.h>
#include <sys/spa_impl.h>
#include <sys/metaslab_impl.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_destroy.h>
#include <sys/dsl_scan.h>
#include <sys/zio_checksum.h>
#include <sys/refcount.h>
#include <sys/zfeature.h>
#include <sys/dsl_userhold.h>
#include <sys/abd.h>
#include <stdio.h>
#include <stdio_ext.h>
#include <stdlib.h>
#include <unistd.h>
#include <signal.h>
#include <umem.h>
#include <dlfcn.h>
#include <ctype.h>
#include <math.h>
#include <sys/fs/zfs.h>
#include <zfs_fletcher.h>
#include <libnvpair.h>
#include <libzutil.h>
#include <libcmdutils.h>

static int ztest_fd_data = -1;
static int ztest_fd_rand = -1;

typedef struct ztest_shared_hdr {
	uint64_t zh_hdr_size;
	uint64_t zh_opts_size;
	uint64_t zh_size;
	uint64_t zh_stats_size;
	uint64_t zh_stats_count;
	uint64_t zh_ds_size;
	uint64_t zh_ds_count;
} ztest_shared_hdr_t;

static ztest_shared_hdr_t *ztest_shared_hdr;

enum ztest_class_state {
	ZTEST_VDEV_CLASS_OFF,
	ZTEST_VDEV_CLASS_ON,
	ZTEST_VDEV_CLASS_RND
};

typedef struct ztest_shared_opts {
	char zo_pool[ZFS_MAX_DATASET_NAME_LEN];
	char zo_dir[ZFS_MAX_DATASET_NAME_LEN];
	char zo_alt_ztest[MAXNAMELEN];
	char zo_alt_libpath[MAXNAMELEN];
	uint64_t zo_vdevs;
	uint64_t zo_vdevtime;
	size_t zo_vdev_size;
	int zo_ashift;
	int zo_mirrors;
	int zo_raidz;
	int zo_raidz_parity;
	int zo_datasets;
	int zo_threads;
	uint64_t zo_passtime;
	uint64_t zo_killrate;
	int zo_verbose;
	int zo_init;
	uint64_t zo_time;
	uint64_t zo_maxloops;
	uint64_t zo_metaslab_force_ganging;
	int zo_mmp_test;
	int zo_special_vdevs;
} ztest_shared_opts_t;

static const ztest_shared_opts_t ztest_opts_defaults = {
	.zo_pool = { 'z', 't', 'e', 's', 't', '\0' },
	.zo_dir = { '/', 't', 'm', 'p', '\0' },
	.zo_alt_ztest = { '\0' },
	.zo_alt_libpath = { '\0' },
	.zo_vdevs = 5,
	.zo_ashift = SPA_MINBLOCKSHIFT,
	.zo_mirrors = 2,
	.zo_raidz = 4,
	.zo_raidz_parity = 1,
	.zo_vdev_size = SPA_MINDEVSIZE * 4, /* 256m
default size */ 192 .zo_datasets = 7, 193 .zo_threads = 23, 194 .zo_passtime = 60, /* 60 seconds */ 195 .zo_killrate = 70, /* 70% kill rate */ 196 .zo_verbose = 0, 197 .zo_mmp_test = 0, 198 .zo_init = 1, 199 .zo_time = 300, /* 5 minutes */ 200 .zo_maxloops = 50, /* max loops during spa_freeze() */ 201 .zo_metaslab_force_ganging = 32 << 10, 202 .zo_special_vdevs = ZTEST_VDEV_CLASS_RND, 203 }; 204 205 extern uint64_t metaslab_force_ganging; 206 extern uint64_t metaslab_df_alloc_threshold; 207 extern uint64_t zfs_deadman_synctime_ms; 208 extern int metaslab_preload_limit; 209 extern boolean_t zfs_compressed_arc_enabled; 210 extern boolean_t zfs_abd_scatter_enabled; 211 extern int dmu_object_alloc_chunk_shift; 212 extern boolean_t zfs_force_some_double_word_sm_entries; 213 extern unsigned long zfs_reconstruct_indirect_damage_fraction; 214 215 static ztest_shared_opts_t *ztest_shared_opts; 216 static ztest_shared_opts_t ztest_opts; 217 static char *ztest_wkeydata = "abcdefghijklmnopqrstuvwxyz012345"; 218 219 typedef struct ztest_shared_ds { 220 uint64_t zd_seq; 221 } ztest_shared_ds_t; 222 223 static ztest_shared_ds_t *ztest_shared_ds; 224 #define ZTEST_GET_SHARED_DS(d) (&ztest_shared_ds[d]) 225 226 #define BT_MAGIC 0x123456789abcdefULL 227 #define MAXFAULTS() \ 228 (MAX(zs->zs_mirrors, 1) * (ztest_opts.zo_raidz_parity + 1) - 1) 229 230 enum ztest_io_type { 231 ZTEST_IO_WRITE_TAG, 232 ZTEST_IO_WRITE_PATTERN, 233 ZTEST_IO_WRITE_ZEROES, 234 ZTEST_IO_TRUNCATE, 235 ZTEST_IO_SETATTR, 236 ZTEST_IO_REWRITE, 237 ZTEST_IO_TYPES 238 }; 239 240 typedef struct ztest_block_tag { 241 uint64_t bt_magic; 242 uint64_t bt_objset; 243 uint64_t bt_object; 244 uint64_t bt_dnodesize; 245 uint64_t bt_offset; 246 uint64_t bt_gen; 247 uint64_t bt_txg; 248 uint64_t bt_crtxg; 249 } ztest_block_tag_t; 250 251 typedef struct bufwad { 252 uint64_t bw_index; 253 uint64_t bw_txg; 254 uint64_t bw_data; 255 } bufwad_t; 256 257 /* 258 * It would be better to use a rangelock_t per object. Unfortunately 259 * the rangelock_t is not a drop-in replacement for rl_t, because we 260 * still need to map from object ID to rangelock_t. 261 */ 262 typedef enum { 263 RL_READER, 264 RL_WRITER, 265 RL_APPEND 266 } rl_type_t; 267 268 typedef struct rll { 269 void *rll_writer; 270 int rll_readers; 271 kmutex_t rll_lock; 272 kcondvar_t rll_cv; 273 } rll_t; 274 275 typedef struct rl { 276 uint64_t rl_object; 277 uint64_t rl_offset; 278 uint64_t rl_size; 279 rll_t *rl_lock; 280 } rl_t; 281 282 #define ZTEST_RANGE_LOCKS 64 283 #define ZTEST_OBJECT_LOCKS 64 284 285 /* 286 * Object descriptor. Used as a template for object lookup/create/remove. 287 */ 288 typedef struct ztest_od { 289 uint64_t od_dir; 290 uint64_t od_object; 291 dmu_object_type_t od_type; 292 dmu_object_type_t od_crtype; 293 uint64_t od_blocksize; 294 uint64_t od_crblocksize; 295 uint64_t od_crdnodesize; 296 uint64_t od_gen; 297 uint64_t od_crgen; 298 char od_name[ZFS_MAX_DATASET_NAME_LEN]; 299 } ztest_od_t; 300 301 /* 302 * Per-dataset state. 303 */ 304 typedef struct ztest_ds { 305 ztest_shared_ds_t *zd_shared; 306 objset_t *zd_os; 307 krwlock_t zd_zilog_lock; 308 zilog_t *zd_zilog; 309 ztest_od_t *zd_od; /* debugging aid */ 310 char zd_name[ZFS_MAX_DATASET_NAME_LEN]; 311 kmutex_t zd_dirobj_lock; 312 rll_t zd_object_lock[ZTEST_OBJECT_LOCKS]; 313 rll_t zd_range_lock[ZTEST_RANGE_LOCKS]; 314 } ztest_ds_t; 315 316 /* 317 * Per-iteration state. 
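 * Each entry in the ztest_info[] table below names a test function, how many
 * iterations of it to run per call, and (via zi_interval) how often, in
 * seconds, it should be executed.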
318 */ 319 typedef void ztest_func_t(ztest_ds_t *zd, uint64_t id); 320 321 typedef struct ztest_info { 322 ztest_func_t *zi_func; /* test function */ 323 uint64_t zi_iters; /* iterations per execution */ 324 uint64_t *zi_interval; /* execute every <interval> seconds */ 325 } ztest_info_t; 326 327 typedef struct ztest_shared_callstate { 328 uint64_t zc_count; /* per-pass count */ 329 uint64_t zc_time; /* per-pass time */ 330 uint64_t zc_next; /* next time to call this function */ 331 } ztest_shared_callstate_t; 332 333 static ztest_shared_callstate_t *ztest_shared_callstate; 334 #define ZTEST_GET_SHARED_CALLSTATE(c) (&ztest_shared_callstate[c]) 335 336 /* 337 * Note: these aren't static because we want dladdr() to work. 338 */ 339 ztest_func_t ztest_dmu_read_write; 340 ztest_func_t ztest_dmu_write_parallel; 341 ztest_func_t ztest_dmu_object_alloc_free; 342 ztest_func_t ztest_dmu_object_next_chunk; 343 ztest_func_t ztest_dmu_commit_callbacks; 344 ztest_func_t ztest_zap; 345 ztest_func_t ztest_zap_parallel; 346 ztest_func_t ztest_zil_commit; 347 ztest_func_t ztest_zil_remount; 348 ztest_func_t ztest_dmu_read_write_zcopy; 349 ztest_func_t ztest_dmu_objset_create_destroy; 350 ztest_func_t ztest_dmu_prealloc; 351 ztest_func_t ztest_fzap; 352 ztest_func_t ztest_dmu_snapshot_create_destroy; 353 ztest_func_t ztest_dsl_prop_get_set; 354 ztest_func_t ztest_spa_prop_get_set; 355 ztest_func_t ztest_spa_create_destroy; 356 ztest_func_t ztest_fault_inject; 357 ztest_func_t ztest_ddt_repair; 358 ztest_func_t ztest_dmu_snapshot_hold; 359 ztest_func_t ztest_mmp_enable_disable; 360 ztest_func_t ztest_scrub; 361 ztest_func_t ztest_dsl_dataset_promote_busy; 362 ztest_func_t ztest_vdev_attach_detach; 363 ztest_func_t ztest_vdev_LUN_growth; 364 ztest_func_t ztest_vdev_add_remove; 365 ztest_func_t ztest_vdev_class_add; 366 ztest_func_t ztest_vdev_aux_add_remove; 367 ztest_func_t ztest_split_pool; 368 ztest_func_t ztest_reguid; 369 ztest_func_t ztest_spa_upgrade; 370 ztest_func_t ztest_fletcher; 371 ztest_func_t ztest_fletcher_incr; 372 ztest_func_t ztest_device_removal; 373 ztest_func_t ztest_remap_blocks; 374 ztest_func_t ztest_spa_checkpoint_create_discard; 375 ztest_func_t ztest_initialize; 376 ztest_func_t ztest_trim; 377 ztest_func_t ztest_verify_dnode_bt; 378 379 uint64_t zopt_always = 0ULL * NANOSEC; /* all the time */ 380 uint64_t zopt_incessant = 1ULL * NANOSEC / 10; /* every 1/10 second */ 381 uint64_t zopt_often = 1ULL * NANOSEC; /* every second */ 382 uint64_t zopt_sometimes = 10ULL * NANOSEC; /* every 10 seconds */ 383 uint64_t zopt_rarely = 60ULL * NANOSEC; /* every 60 seconds */ 384 385 ztest_info_t ztest_info[] = { 386 { ztest_dmu_read_write, 1, &zopt_always }, 387 { ztest_dmu_write_parallel, 10, &zopt_always }, 388 { ztest_dmu_object_alloc_free, 1, &zopt_always }, 389 { ztest_dmu_object_next_chunk, 1, &zopt_sometimes }, 390 { ztest_dmu_commit_callbacks, 1, &zopt_always }, 391 { ztest_zap, 30, &zopt_always }, 392 { ztest_zap_parallel, 100, &zopt_always }, 393 { ztest_split_pool, 1, &zopt_always }, 394 { ztest_zil_commit, 1, &zopt_incessant }, 395 { ztest_zil_remount, 1, &zopt_sometimes }, 396 { ztest_dmu_read_write_zcopy, 1, &zopt_often }, 397 { ztest_dmu_objset_create_destroy, 1, &zopt_often }, 398 { ztest_dsl_prop_get_set, 1, &zopt_often }, 399 { ztest_spa_prop_get_set, 1, &zopt_sometimes }, 400 #if 0 401 { ztest_dmu_prealloc, 1, &zopt_sometimes }, 402 #endif 403 { ztest_fzap, 1, &zopt_sometimes }, 404 { ztest_dmu_snapshot_create_destroy, 1, &zopt_sometimes }, 405 { ztest_spa_create_destroy, 1, 
&zopt_sometimes }, 406 { ztest_fault_inject, 1, &zopt_incessant }, 407 { ztest_ddt_repair, 1, &zopt_sometimes }, 408 { ztest_dmu_snapshot_hold, 1, &zopt_sometimes }, 409 { ztest_mmp_enable_disable, 1, &zopt_sometimes }, 410 { ztest_reguid, 1, &zopt_rarely }, 411 { ztest_scrub, 1, &zopt_often }, 412 { ztest_spa_upgrade, 1, &zopt_rarely }, 413 { ztest_fletcher, 1, &zopt_rarely }, 414 { ztest_fletcher_incr, 1, &zopt_rarely }, 415 { ztest_dsl_dataset_promote_busy, 1, &zopt_rarely }, 416 { ztest_vdev_attach_detach, 1, &zopt_incessant }, 417 { ztest_vdev_LUN_growth, 1, &zopt_rarely }, 418 { ztest_vdev_add_remove, 1, 419 &ztest_opts.zo_vdevtime }, 420 { ztest_vdev_class_add, 1, 421 &ztest_opts.zo_vdevtime }, 422 { ztest_vdev_aux_add_remove, 1, 423 &ztest_opts.zo_vdevtime }, 424 { ztest_device_removal, 1, &zopt_sometimes }, 425 { ztest_remap_blocks, 1, &zopt_sometimes }, 426 { ztest_spa_checkpoint_create_discard, 1, &zopt_rarely }, 427 { ztest_initialize, 1, &zopt_sometimes }, 428 { ztest_trim, 1, &zopt_sometimes }, 429 { ztest_verify_dnode_bt, 1, &zopt_sometimes } 430 }; 431 432 #define ZTEST_FUNCS (sizeof (ztest_info) / sizeof (ztest_info_t)) 433 434 /* 435 * The following struct is used to hold a list of uncalled commit callbacks. 436 * The callbacks are ordered by txg number. 437 */ 438 typedef struct ztest_cb_list { 439 kmutex_t zcl_callbacks_lock; 440 list_t zcl_callbacks; 441 } ztest_cb_list_t; 442 443 /* 444 * Stuff we need to share writably between parent and child. 445 */ 446 typedef struct ztest_shared { 447 boolean_t zs_do_init; 448 hrtime_t zs_proc_start; 449 hrtime_t zs_proc_stop; 450 hrtime_t zs_thread_start; 451 hrtime_t zs_thread_stop; 452 hrtime_t zs_thread_kill; 453 uint64_t zs_enospc_count; 454 uint64_t zs_vdev_next_leaf; 455 uint64_t zs_vdev_aux; 456 uint64_t zs_alloc; 457 uint64_t zs_space; 458 uint64_t zs_splits; 459 uint64_t zs_mirrors; 460 uint64_t zs_metaslab_sz; 461 uint64_t zs_metaslab_df_alloc_threshold; 462 uint64_t zs_guid; 463 } ztest_shared_t; 464 465 #define ID_PARALLEL -1ULL 466 467 static char ztest_dev_template[] = "%s/%s.%llua"; 468 static char ztest_aux_template[] = "%s/%s.%s.%llu"; 469 ztest_shared_t *ztest_shared; 470 471 static spa_t *ztest_spa = NULL; 472 static ztest_ds_t *ztest_ds; 473 474 static kmutex_t ztest_vdev_lock; 475 static boolean_t ztest_device_removal_active = B_FALSE; 476 static kmutex_t ztest_checkpoint_lock; 477 478 /* 479 * The ztest_name_lock protects the pool and dataset namespace used by 480 * the individual tests. To modify the namespace, consumers must grab 481 * this lock as writer. Grabbing the lock as reader will ensure that the 482 * namespace does not change while the lock is held. 483 */ 484 static krwlock_t ztest_name_lock; 485 486 static boolean_t ztest_dump_core = B_TRUE; 487 static boolean_t ztest_exiting; 488 489 /* Global commit callback list */ 490 static ztest_cb_list_t zcl; 491 492 enum ztest_object { 493 ZTEST_META_DNODE = 0, 494 ZTEST_DIROBJ, 495 ZTEST_OBJECTS 496 }; 497 498 static void usage(boolean_t) __NORETURN; 499 500 /* 501 * These libumem hooks provide a reasonable set of defaults for the allocator's 502 * debugging facilities. 503 */ 504 const char * 505 _umem_debug_init() 506 { 507 return ("default,verbose"); /* $UMEM_DEBUG setting */ 508 } 509 510 const char * 511 _umem_logging_init(void) 512 { 513 return ("fail,contents"); /* $UMEM_LOGGING setting */ 514 } 515 516 #define FATAL_MSG_SZ 1024 517 518 char *fatal_msg; 519 520 static void 521 fatal(int do_perror, char *message, ...) 
522 { 523 va_list args; 524 int save_errno = errno; 525 char buf[FATAL_MSG_SZ]; 526 527 (void) fflush(stdout); 528 529 va_start(args, message); 530 (void) sprintf(buf, "ztest: "); 531 /* LINTED */ 532 (void) vsprintf(buf + strlen(buf), message, args); 533 va_end(args); 534 if (do_perror) { 535 (void) snprintf(buf + strlen(buf), FATAL_MSG_SZ - strlen(buf), 536 ": %s", strerror(save_errno)); 537 } 538 (void) fprintf(stderr, "%s\n", buf); 539 fatal_msg = buf; /* to ease debugging */ 540 if (ztest_dump_core) 541 abort(); 542 exit(3); 543 } 544 545 static int 546 str2shift(const char *buf) 547 { 548 const char *ends = "BKMGTPEZ"; 549 int i; 550 551 if (buf[0] == '\0') 552 return (0); 553 for (i = 0; i < strlen(ends); i++) { 554 if (toupper(buf[0]) == ends[i]) 555 break; 556 } 557 if (i == strlen(ends)) { 558 (void) fprintf(stderr, "ztest: invalid bytes suffix: %s\n", 559 buf); 560 usage(B_FALSE); 561 } 562 if (buf[1] == '\0' || (toupper(buf[1]) == 'B' && buf[2] == '\0')) { 563 return (10*i); 564 } 565 (void) fprintf(stderr, "ztest: invalid bytes suffix: %s\n", buf); 566 usage(B_FALSE); 567 /* NOTREACHED */ 568 } 569 570 static uint64_t 571 nicenumtoull(const char *buf) 572 { 573 char *end; 574 uint64_t val; 575 576 val = strtoull(buf, &end, 0); 577 if (end == buf) { 578 (void) fprintf(stderr, "ztest: bad numeric value: %s\n", buf); 579 usage(B_FALSE); 580 } else if (end[0] == '.') { 581 double fval = strtod(buf, &end); 582 fval *= pow(2, str2shift(end)); 583 if (fval > UINT64_MAX) { 584 (void) fprintf(stderr, "ztest: value too large: %s\n", 585 buf); 586 usage(B_FALSE); 587 } 588 val = (uint64_t)fval; 589 } else { 590 int shift = str2shift(end); 591 if (shift >= 64 || (val << shift) >> shift != val) { 592 (void) fprintf(stderr, "ztest: value too large: %s\n", 593 buf); 594 usage(B_FALSE); 595 } 596 val <<= shift; 597 } 598 return (val); 599 } 600 601 static void 602 usage(boolean_t requested) 603 { 604 const ztest_shared_opts_t *zo = &ztest_opts_defaults; 605 606 char nice_vdev_size[NN_NUMBUF_SZ]; 607 char nice_force_ganging[NN_NUMBUF_SZ]; 608 FILE *fp = requested ? stdout : stderr; 609 610 nicenum(zo->zo_vdev_size, nice_vdev_size, sizeof (nice_vdev_size)); 611 nicenum(zo->zo_metaslab_force_ganging, nice_force_ganging, 612 sizeof (nice_force_ganging)); 613 614 (void) fprintf(fp, "Usage: %s\n" 615 "\t[-v vdevs (default: %llu)]\n" 616 "\t[-s size_of_each_vdev (default: %s)]\n" 617 "\t[-a alignment_shift (default: %d)] use 0 for random\n" 618 "\t[-m mirror_copies (default: %d)]\n" 619 "\t[-r raidz_disks (default: %d)]\n" 620 "\t[-R raidz_parity (default: %d)]\n" 621 "\t[-d datasets (default: %d)]\n" 622 "\t[-t threads (default: %d)]\n" 623 "\t[-g gang_block_threshold (default: %s)]\n" 624 "\t[-i init_count (default: %d)] initialize pool i times\n" 625 "\t[-k kill_percentage (default: %llu%%)]\n" 626 "\t[-p pool_name (default: %s)]\n" 627 "\t[-f dir (default: %s)] file directory for vdev files\n" 628 "\t[-M] Multi-host simulate pool imported on remote host\n" 629 "\t[-V] verbose (use multiple times for ever more blather)\n" 630 "\t[-E] use existing pool instead of creating new one\n" 631 "\t[-T time (default: %llu sec)] total run time\n" 632 "\t[-F freezeloops (default: %llu)] max loops in spa_freeze()\n" 633 "\t[-P passtime (default: %llu sec)] time per pass\n" 634 "\t[-B alt_ztest (default: <none>)] alternate ztest path\n" 635 "\t[-C vdev class state (default: random)] special=on|off|random\n" 636 "\t[-o variable=value] ... 
set global variable to an unsigned\n" 637 "\t 32-bit integer value\n" 638 "\t[-h] (print help)\n" 639 "", 640 zo->zo_pool, 641 (u_longlong_t)zo->zo_vdevs, /* -v */ 642 nice_vdev_size, /* -s */ 643 zo->zo_ashift, /* -a */ 644 zo->zo_mirrors, /* -m */ 645 zo->zo_raidz, /* -r */ 646 zo->zo_raidz_parity, /* -R */ 647 zo->zo_datasets, /* -d */ 648 zo->zo_threads, /* -t */ 649 nice_force_ganging, /* -g */ 650 zo->zo_init, /* -i */ 651 (u_longlong_t)zo->zo_killrate, /* -k */ 652 zo->zo_pool, /* -p */ 653 zo->zo_dir, /* -f */ 654 (u_longlong_t)zo->zo_time, /* -T */ 655 (u_longlong_t)zo->zo_maxloops, /* -F */ 656 (u_longlong_t)zo->zo_passtime); 657 exit(requested ? 0 : 1); 658 } 659 660 661 static void 662 ztest_parse_name_value(const char *input, ztest_shared_opts_t *zo) 663 { 664 char name[32]; 665 char *value; 666 int state = ZTEST_VDEV_CLASS_RND; 667 668 (void) strlcpy(name, input, sizeof (name)); 669 670 value = strchr(name, '='); 671 if (value == NULL) { 672 (void) fprintf(stderr, "missing value in property=value " 673 "'-C' argument (%s)\n", input); 674 usage(B_FALSE); 675 } 676 *(value) = '\0'; 677 value++; 678 679 if (strcmp(value, "on") == 0) { 680 state = ZTEST_VDEV_CLASS_ON; 681 } else if (strcmp(value, "off") == 0) { 682 state = ZTEST_VDEV_CLASS_OFF; 683 } else if (strcmp(value, "random") == 0) { 684 state = ZTEST_VDEV_CLASS_RND; 685 } else { 686 (void) fprintf(stderr, "invalid property value '%s'\n", value); 687 usage(B_FALSE); 688 } 689 690 if (strcmp(name, "special") == 0) { 691 zo->zo_special_vdevs = state; 692 } else { 693 (void) fprintf(stderr, "invalid property name '%s'\n", name); 694 usage(B_FALSE); 695 } 696 if (zo->zo_verbose >= 3) 697 (void) printf("%s vdev state is '%s'\n", name, value); 698 } 699 700 static void 701 process_options(int argc, char **argv) 702 { 703 char *path; 704 ztest_shared_opts_t *zo = &ztest_opts; 705 706 int opt; 707 uint64_t value; 708 char altdir[MAXNAMELEN] = { 0 }; 709 710 bcopy(&ztest_opts_defaults, zo, sizeof (*zo)); 711 712 while ((opt = getopt(argc, argv, 713 "v:s:a:m:r:R:d:t:g:i:k:p:f:MVET:P:hF:B:C:o:")) != EOF) { 714 value = 0; 715 switch (opt) { 716 case 'v': 717 case 's': 718 case 'a': 719 case 'm': 720 case 'r': 721 case 'R': 722 case 'd': 723 case 't': 724 case 'g': 725 case 'i': 726 case 'k': 727 case 'T': 728 case 'P': 729 case 'F': 730 value = nicenumtoull(optarg); 731 } 732 switch (opt) { 733 case 'v': 734 zo->zo_vdevs = value; 735 break; 736 case 's': 737 zo->zo_vdev_size = MAX(SPA_MINDEVSIZE, value); 738 break; 739 case 'a': 740 zo->zo_ashift = value; 741 break; 742 case 'm': 743 zo->zo_mirrors = value; 744 break; 745 case 'r': 746 zo->zo_raidz = MAX(1, value); 747 break; 748 case 'R': 749 zo->zo_raidz_parity = MIN(MAX(value, 1), 3); 750 break; 751 case 'd': 752 zo->zo_datasets = MAX(1, value); 753 break; 754 case 't': 755 zo->zo_threads = MAX(1, value); 756 break; 757 case 'g': 758 zo->zo_metaslab_force_ganging = 759 MAX(SPA_MINBLOCKSIZE << 1, value); 760 break; 761 case 'i': 762 zo->zo_init = value; 763 break; 764 case 'k': 765 zo->zo_killrate = value; 766 break; 767 case 'p': 768 (void) strlcpy(zo->zo_pool, optarg, 769 sizeof (zo->zo_pool)); 770 break; 771 case 'f': 772 path = realpath(optarg, NULL); 773 if (path == NULL) { 774 (void) fprintf(stderr, "error: %s: %s\n", 775 optarg, strerror(errno)); 776 usage(B_FALSE); 777 } else { 778 (void) strlcpy(zo->zo_dir, path, 779 sizeof (zo->zo_dir)); 780 } 781 break; 782 case 'M': 783 zo->zo_mmp_test = 1; 784 break; 785 case 'V': 786 zo->zo_verbose++; 787 break; 788 case 'E': 789 
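			/* -E: use the existing pool, so skip pool initialization */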
zo->zo_init = 0; 790 break; 791 case 'T': 792 zo->zo_time = value; 793 break; 794 case 'P': 795 zo->zo_passtime = MAX(1, value); 796 break; 797 case 'F': 798 zo->zo_maxloops = MAX(1, value); 799 break; 800 case 'B': 801 (void) strlcpy(altdir, optarg, sizeof (altdir)); 802 break; 803 case 'C': 804 ztest_parse_name_value(optarg, zo); 805 break; 806 case 'o': 807 if (set_global_var(optarg) != 0) 808 usage(B_FALSE); 809 break; 810 case 'h': 811 usage(B_TRUE); 812 break; 813 case '?': 814 default: 815 usage(B_FALSE); 816 break; 817 } 818 } 819 820 zo->zo_raidz_parity = MIN(zo->zo_raidz_parity, zo->zo_raidz - 1); 821 822 zo->zo_vdevtime = 823 (zo->zo_vdevs > 0 ? zo->zo_time * NANOSEC / zo->zo_vdevs : 824 UINT64_MAX >> 2); 825 826 if (strlen(altdir) > 0) { 827 char *cmd; 828 char *realaltdir; 829 char *bin; 830 char *ztest; 831 char *isa; 832 int isalen; 833 834 cmd = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); 835 realaltdir = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); 836 837 VERIFY(NULL != realpath(getexecname(), cmd)); 838 if (0 != access(altdir, F_OK)) { 839 ztest_dump_core = B_FALSE; 840 fatal(B_TRUE, "invalid alternate ztest path: %s", 841 altdir); 842 } 843 VERIFY(NULL != realpath(altdir, realaltdir)); 844 845 /* 846 * 'cmd' should be of the form "<anything>/usr/bin/<isa>/ztest". 847 * We want to extract <isa> to determine if we should use 848 * 32 or 64 bit binaries. 849 */ 850 bin = strstr(cmd, "/usr/bin/"); 851 ztest = strstr(bin, "/ztest"); 852 isa = bin + 9; 853 isalen = ztest - isa; 854 (void) snprintf(zo->zo_alt_ztest, sizeof (zo->zo_alt_ztest), 855 "%s/usr/bin/%.*s/ztest", realaltdir, isalen, isa); 856 (void) snprintf(zo->zo_alt_libpath, sizeof (zo->zo_alt_libpath), 857 "%s/usr/lib/%.*s", realaltdir, isalen, isa); 858 859 if (0 != access(zo->zo_alt_ztest, X_OK)) { 860 ztest_dump_core = B_FALSE; 861 fatal(B_TRUE, "invalid alternate ztest: %s", 862 zo->zo_alt_ztest); 863 } else if (0 != access(zo->zo_alt_libpath, X_OK)) { 864 ztest_dump_core = B_FALSE; 865 fatal(B_TRUE, "invalid alternate lib directory %s", 866 zo->zo_alt_libpath); 867 } 868 869 umem_free(cmd, MAXPATHLEN); 870 umem_free(realaltdir, MAXPATHLEN); 871 } 872 } 873 874 static void 875 ztest_kill(ztest_shared_t *zs) 876 { 877 zs->zs_alloc = metaslab_class_get_alloc(spa_normal_class(ztest_spa)); 878 zs->zs_space = metaslab_class_get_space(spa_normal_class(ztest_spa)); 879 880 /* 881 * Before we kill off ztest, make sure that the config is updated. 882 * See comment above spa_write_cachefile(). 
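 * Otherwise the cachefile may describe a stale vdev configuration and the
 * parent could fail to reopen the pool after this simulated crash.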
883 */ 884 mutex_enter(&spa_namespace_lock); 885 spa_write_cachefile(ztest_spa, B_FALSE, B_FALSE); 886 mutex_exit(&spa_namespace_lock); 887 888 zfs_dbgmsg_print(FTAG); 889 (void) kill(getpid(), SIGKILL); 890 } 891 892 static uint64_t 893 ztest_random(uint64_t range) 894 { 895 uint64_t r; 896 897 ASSERT3S(ztest_fd_rand, >=, 0); 898 899 if (range == 0) 900 return (0); 901 902 if (read(ztest_fd_rand, &r, sizeof (r)) != sizeof (r)) 903 fatal(1, "short read from /dev/urandom"); 904 905 return (r % range); 906 } 907 908 /* ARGSUSED */ 909 static void 910 ztest_record_enospc(const char *s) 911 { 912 ztest_shared->zs_enospc_count++; 913 } 914 915 static uint64_t 916 ztest_get_ashift(void) 917 { 918 if (ztest_opts.zo_ashift == 0) 919 return (SPA_MINBLOCKSHIFT + ztest_random(5)); 920 return (ztest_opts.zo_ashift); 921 } 922 923 static nvlist_t * 924 make_vdev_file(char *path, char *aux, char *pool, size_t size, uint64_t ashift) 925 { 926 char pathbuf[MAXPATHLEN]; 927 uint64_t vdev; 928 nvlist_t *file; 929 930 if (ashift == 0) 931 ashift = ztest_get_ashift(); 932 933 if (path == NULL) { 934 path = pathbuf; 935 936 if (aux != NULL) { 937 vdev = ztest_shared->zs_vdev_aux; 938 (void) snprintf(path, sizeof (pathbuf), 939 ztest_aux_template, ztest_opts.zo_dir, 940 pool == NULL ? ztest_opts.zo_pool : pool, 941 aux, vdev); 942 } else { 943 vdev = ztest_shared->zs_vdev_next_leaf++; 944 (void) snprintf(path, sizeof (pathbuf), 945 ztest_dev_template, ztest_opts.zo_dir, 946 pool == NULL ? ztest_opts.zo_pool : pool, vdev); 947 } 948 } 949 950 if (size != 0) { 951 int fd = open(path, O_RDWR | O_CREAT | O_TRUNC, 0666); 952 if (fd == -1) 953 fatal(1, "can't open %s", path); 954 if (ftruncate(fd, size) != 0) 955 fatal(1, "can't ftruncate %s", path); 956 (void) close(fd); 957 } 958 959 VERIFY(nvlist_alloc(&file, NV_UNIQUE_NAME, 0) == 0); 960 VERIFY(nvlist_add_string(file, ZPOOL_CONFIG_TYPE, VDEV_TYPE_FILE) == 0); 961 VERIFY(nvlist_add_string(file, ZPOOL_CONFIG_PATH, path) == 0); 962 VERIFY(nvlist_add_uint64(file, ZPOOL_CONFIG_ASHIFT, ashift) == 0); 963 964 return (file); 965 } 966 967 static nvlist_t * 968 make_vdev_raidz(char *path, char *aux, char *pool, size_t size, 969 uint64_t ashift, int r) 970 { 971 nvlist_t *raidz, **child; 972 int c; 973 974 if (r < 2) 975 return (make_vdev_file(path, aux, pool, size, ashift)); 976 child = umem_alloc(r * sizeof (nvlist_t *), UMEM_NOFAIL); 977 978 for (c = 0; c < r; c++) 979 child[c] = make_vdev_file(path, aux, pool, size, ashift); 980 981 VERIFY(nvlist_alloc(&raidz, NV_UNIQUE_NAME, 0) == 0); 982 VERIFY(nvlist_add_string(raidz, ZPOOL_CONFIG_TYPE, 983 VDEV_TYPE_RAIDZ) == 0); 984 VERIFY(nvlist_add_uint64(raidz, ZPOOL_CONFIG_NPARITY, 985 ztest_opts.zo_raidz_parity) == 0); 986 VERIFY(nvlist_add_nvlist_array(raidz, ZPOOL_CONFIG_CHILDREN, 987 child, r) == 0); 988 989 for (c = 0; c < r; c++) 990 nvlist_free(child[c]); 991 992 umem_free(child, r * sizeof (nvlist_t *)); 993 994 return (raidz); 995 } 996 997 static nvlist_t * 998 make_vdev_mirror(char *path, char *aux, char *pool, size_t size, 999 uint64_t ashift, int r, int m) 1000 { 1001 nvlist_t *mirror, **child; 1002 int c; 1003 1004 if (m < 1) 1005 return (make_vdev_raidz(path, aux, pool, size, ashift, r)); 1006 1007 child = umem_alloc(m * sizeof (nvlist_t *), UMEM_NOFAIL); 1008 1009 for (c = 0; c < m; c++) 1010 child[c] = make_vdev_raidz(path, aux, pool, size, ashift, r); 1011 1012 VERIFY(nvlist_alloc(&mirror, NV_UNIQUE_NAME, 0) == 0); 1013 VERIFY(nvlist_add_string(mirror, ZPOOL_CONFIG_TYPE, 1014 VDEV_TYPE_MIRROR) == 0); 1015 
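	/*
	 * nvlist_add_nvlist_array() copies the child nvlists into 'mirror',
	 * which is why the originals can be freed immediately below.
	 */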
VERIFY(nvlist_add_nvlist_array(mirror, ZPOOL_CONFIG_CHILDREN, 1016 child, m) == 0); 1017 1018 for (c = 0; c < m; c++) 1019 nvlist_free(child[c]); 1020 1021 umem_free(child, m * sizeof (nvlist_t *)); 1022 1023 return (mirror); 1024 } 1025 1026 static nvlist_t * 1027 make_vdev_root(char *path, char *aux, char *pool, size_t size, uint64_t ashift, 1028 const char *class, int r, int m, int t) 1029 { 1030 nvlist_t *root, **child; 1031 int c; 1032 boolean_t log; 1033 1034 ASSERT(t > 0); 1035 1036 log = (class != NULL && strcmp(class, "log") == 0); 1037 1038 child = umem_alloc(t * sizeof (nvlist_t *), UMEM_NOFAIL); 1039 1040 for (c = 0; c < t; c++) { 1041 child[c] = make_vdev_mirror(path, aux, pool, size, ashift, 1042 r, m); 1043 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_IS_LOG, 1044 log) == 0); 1045 1046 if (class != NULL && class[0] != '\0') { 1047 ASSERT(m > 1 || log); /* expecting a mirror */ 1048 VERIFY(nvlist_add_string(child[c], 1049 ZPOOL_CONFIG_ALLOCATION_BIAS, class) == 0); 1050 } 1051 } 1052 1053 VERIFY(nvlist_alloc(&root, NV_UNIQUE_NAME, 0) == 0); 1054 VERIFY(nvlist_add_string(root, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT) == 0); 1055 VERIFY(nvlist_add_nvlist_array(root, aux ? aux : ZPOOL_CONFIG_CHILDREN, 1056 child, t) == 0); 1057 1058 for (c = 0; c < t; c++) 1059 nvlist_free(child[c]); 1060 1061 umem_free(child, t * sizeof (nvlist_t *)); 1062 1063 return (root); 1064 } 1065 1066 /* 1067 * Find a random spa version. Returns back a random spa version in the 1068 * range [initial_version, SPA_VERSION_FEATURES]. 1069 */ 1070 static uint64_t 1071 ztest_random_spa_version(uint64_t initial_version) 1072 { 1073 uint64_t version = initial_version; 1074 1075 if (version <= SPA_VERSION_BEFORE_FEATURES) { 1076 version = version + 1077 ztest_random(SPA_VERSION_BEFORE_FEATURES - version + 1); 1078 } 1079 1080 if (version > SPA_VERSION_BEFORE_FEATURES) 1081 version = SPA_VERSION_FEATURES; 1082 1083 ASSERT(SPA_VERSION_IS_SUPPORTED(version)); 1084 return (version); 1085 } 1086 1087 static int 1088 ztest_random_blocksize(void) 1089 { 1090 uint64_t block_shift; 1091 1092 ASSERT(ztest_spa->spa_max_ashift != 0); 1093 1094 /* 1095 * Choose a block size >= the ashift. 1096 * If the SPA supports new MAXBLOCKSIZE, test up to 1MB blocks. 1097 */ 1098 int maxbs = SPA_OLD_MAXBLOCKSHIFT; 1099 if (spa_maxblocksize(ztest_spa) == SPA_MAXBLOCKSIZE) 1100 maxbs = 20; 1101 block_shift = ztest_random(maxbs - ztest_spa->spa_max_ashift + 1); 1102 return (1 << (SPA_MINBLOCKSHIFT + block_shift)); 1103 } 1104 1105 static int 1106 ztest_random_dnodesize(void) 1107 { 1108 int slots; 1109 int max_slots = spa_maxdnodesize(ztest_spa) >> DNODE_SHIFT; 1110 1111 if (max_slots == DNODE_MIN_SLOTS) 1112 return (DNODE_MIN_SIZE); 1113 1114 /* 1115 * Weight the random distribution more heavily toward smaller 1116 * dnode sizes since that is more likely to reflect real-world 1117 * usage. 1118 */ 1119 ASSERT3U(max_slots, >, 4); 1120 switch (ztest_random(10)) { 1121 case 0: 1122 slots = 5 + ztest_random(max_slots - 4); 1123 break; 1124 case 1 ... 
4: 1125 slots = 2 + ztest_random(3); 1126 break; 1127 default: 1128 slots = 1; 1129 break; 1130 } 1131 1132 return (slots << DNODE_SHIFT); 1133 } 1134 1135 static int 1136 ztest_random_ibshift(void) 1137 { 1138 return (DN_MIN_INDBLKSHIFT + 1139 ztest_random(DN_MAX_INDBLKSHIFT - DN_MIN_INDBLKSHIFT + 1)); 1140 } 1141 1142 static uint64_t 1143 ztest_random_vdev_top(spa_t *spa, boolean_t log_ok) 1144 { 1145 uint64_t top; 1146 vdev_t *rvd = spa->spa_root_vdev; 1147 vdev_t *tvd; 1148 1149 ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); 1150 1151 do { 1152 top = ztest_random(rvd->vdev_children); 1153 tvd = rvd->vdev_child[top]; 1154 } while (!vdev_is_concrete(tvd) || (tvd->vdev_islog && !log_ok) || 1155 tvd->vdev_mg == NULL || tvd->vdev_mg->mg_class == NULL); 1156 1157 return (top); 1158 } 1159 1160 static uint64_t 1161 ztest_random_dsl_prop(zfs_prop_t prop) 1162 { 1163 uint64_t value; 1164 1165 do { 1166 value = zfs_prop_random_value(prop, ztest_random(-1ULL)); 1167 } while (prop == ZFS_PROP_CHECKSUM && value == ZIO_CHECKSUM_OFF); 1168 1169 return (value); 1170 } 1171 1172 static int 1173 ztest_dsl_prop_set_uint64(char *osname, zfs_prop_t prop, uint64_t value, 1174 boolean_t inherit) 1175 { 1176 const char *propname = zfs_prop_to_name(prop); 1177 const char *valname; 1178 char setpoint[MAXPATHLEN]; 1179 uint64_t curval; 1180 int error; 1181 1182 error = dsl_prop_set_int(osname, propname, 1183 (inherit ? ZPROP_SRC_NONE : ZPROP_SRC_LOCAL), value); 1184 1185 if (error == ENOSPC) { 1186 ztest_record_enospc(FTAG); 1187 return (error); 1188 } 1189 ASSERT0(error); 1190 1191 VERIFY0(dsl_prop_get_integer(osname, propname, &curval, setpoint)); 1192 1193 if (ztest_opts.zo_verbose >= 6) { 1194 VERIFY(zfs_prop_index_to_string(prop, curval, &valname) == 0); 1195 (void) printf("%s %s = %s at '%s'\n", 1196 osname, propname, valname, setpoint); 1197 } 1198 1199 return (error); 1200 } 1201 1202 static int 1203 ztest_spa_prop_set_uint64(zpool_prop_t prop, uint64_t value) 1204 { 1205 spa_t *spa = ztest_spa; 1206 nvlist_t *props = NULL; 1207 int error; 1208 1209 VERIFY(nvlist_alloc(&props, NV_UNIQUE_NAME, 0) == 0); 1210 VERIFY(nvlist_add_uint64(props, zpool_prop_to_name(prop), value) == 0); 1211 1212 error = spa_prop_set(spa, props); 1213 1214 nvlist_free(props); 1215 1216 if (error == ENOSPC) { 1217 ztest_record_enospc(FTAG); 1218 return (error); 1219 } 1220 ASSERT0(error); 1221 1222 return (error); 1223 } 1224 1225 static int 1226 ztest_dmu_objset_own(const char *name, dmu_objset_type_t type, 1227 boolean_t readonly, boolean_t decrypt, void *tag, objset_t **osp) 1228 { 1229 int err; 1230 1231 err = dmu_objset_own(name, type, readonly, decrypt, tag, osp); 1232 if (decrypt && err == EACCES) { 1233 char ddname[ZFS_MAX_DATASET_NAME_LEN]; 1234 dsl_crypto_params_t *dcp; 1235 nvlist_t *crypto_args = fnvlist_alloc(); 1236 char *cp = NULL; 1237 1238 /* spa_keystore_load_wkey() expects a dsl dir name */ 1239 (void) strcpy(ddname, name); 1240 cp = strchr(ddname, '@'); 1241 if (cp != NULL) 1242 *cp = '\0'; 1243 1244 fnvlist_add_uint8_array(crypto_args, "wkeydata", 1245 (uint8_t *)ztest_wkeydata, WRAPPING_KEY_LEN); 1246 VERIFY0(dsl_crypto_params_create_nvlist(DCP_CMD_NONE, NULL, 1247 crypto_args, &dcp)); 1248 err = spa_keystore_load_wkey(ddname, dcp, B_FALSE); 1249 dsl_crypto_params_free(dcp, B_FALSE); 1250 fnvlist_free(crypto_args); 1251 1252 if (err != 0) 1253 return (err); 1254 1255 err = dmu_objset_own(name, type, readonly, decrypt, tag, osp); 1256 } 1257 1258 return (err); 1259 } 1260 1261 static void 1262 
ztest_rll_init(rll_t *rll) 1263 { 1264 rll->rll_writer = NULL; 1265 rll->rll_readers = 0; 1266 mutex_init(&rll->rll_lock, NULL, USYNC_THREAD, NULL); 1267 cv_init(&rll->rll_cv, NULL, USYNC_THREAD, NULL); 1268 } 1269 1270 static void 1271 ztest_rll_destroy(rll_t *rll) 1272 { 1273 ASSERT(rll->rll_writer == NULL); 1274 ASSERT(rll->rll_readers == 0); 1275 mutex_destroy(&rll->rll_lock); 1276 cv_destroy(&rll->rll_cv); 1277 } 1278 1279 static void 1280 ztest_rll_lock(rll_t *rll, rl_type_t type) 1281 { 1282 mutex_enter(&rll->rll_lock); 1283 1284 if (type == RL_READER) { 1285 while (rll->rll_writer != NULL) 1286 cv_wait(&rll->rll_cv, &rll->rll_lock); 1287 rll->rll_readers++; 1288 } else { 1289 while (rll->rll_writer != NULL || rll->rll_readers) 1290 cv_wait(&rll->rll_cv, &rll->rll_lock); 1291 rll->rll_writer = curthread; 1292 } 1293 1294 mutex_exit(&rll->rll_lock); 1295 } 1296 1297 static void 1298 ztest_rll_unlock(rll_t *rll) 1299 { 1300 mutex_enter(&rll->rll_lock); 1301 1302 if (rll->rll_writer) { 1303 ASSERT(rll->rll_readers == 0); 1304 rll->rll_writer = NULL; 1305 } else { 1306 ASSERT(rll->rll_readers != 0); 1307 ASSERT(rll->rll_writer == NULL); 1308 rll->rll_readers--; 1309 } 1310 1311 if (rll->rll_writer == NULL && rll->rll_readers == 0) 1312 cv_broadcast(&rll->rll_cv); 1313 1314 mutex_exit(&rll->rll_lock); 1315 } 1316 1317 static void 1318 ztest_object_lock(ztest_ds_t *zd, uint64_t object, rl_type_t type) 1319 { 1320 rll_t *rll = &zd->zd_object_lock[object & (ZTEST_OBJECT_LOCKS - 1)]; 1321 1322 ztest_rll_lock(rll, type); 1323 } 1324 1325 static void 1326 ztest_object_unlock(ztest_ds_t *zd, uint64_t object) 1327 { 1328 rll_t *rll = &zd->zd_object_lock[object & (ZTEST_OBJECT_LOCKS - 1)]; 1329 1330 ztest_rll_unlock(rll); 1331 } 1332 1333 static rl_t * 1334 ztest_range_lock(ztest_ds_t *zd, uint64_t object, uint64_t offset, 1335 uint64_t size, rl_type_t type) 1336 { 1337 uint64_t hash = object ^ (offset % (ZTEST_RANGE_LOCKS + 1)); 1338 rll_t *rll = &zd->zd_range_lock[hash & (ZTEST_RANGE_LOCKS - 1)]; 1339 rl_t *rl; 1340 1341 rl = umem_alloc(sizeof (*rl), UMEM_NOFAIL); 1342 rl->rl_object = object; 1343 rl->rl_offset = offset; 1344 rl->rl_size = size; 1345 rl->rl_lock = rll; 1346 1347 ztest_rll_lock(rll, type); 1348 1349 return (rl); 1350 } 1351 1352 static void 1353 ztest_range_unlock(rl_t *rl) 1354 { 1355 rll_t *rll = rl->rl_lock; 1356 1357 ztest_rll_unlock(rll); 1358 1359 umem_free(rl, sizeof (*rl)); 1360 } 1361 1362 static void 1363 ztest_zd_init(ztest_ds_t *zd, ztest_shared_ds_t *szd, objset_t *os) 1364 { 1365 zd->zd_os = os; 1366 zd->zd_zilog = dmu_objset_zil(os); 1367 zd->zd_shared = szd; 1368 dmu_objset_name(os, zd->zd_name); 1369 1370 if (zd->zd_shared != NULL) 1371 zd->zd_shared->zd_seq = 0; 1372 1373 rw_init(&zd->zd_zilog_lock, NULL, USYNC_THREAD, NULL); 1374 mutex_init(&zd->zd_dirobj_lock, NULL, USYNC_THREAD, NULL); 1375 1376 for (int l = 0; l < ZTEST_OBJECT_LOCKS; l++) 1377 ztest_rll_init(&zd->zd_object_lock[l]); 1378 1379 for (int l = 0; l < ZTEST_RANGE_LOCKS; l++) 1380 ztest_rll_init(&zd->zd_range_lock[l]); 1381 } 1382 1383 static void 1384 ztest_zd_fini(ztest_ds_t *zd) 1385 { 1386 mutex_destroy(&zd->zd_dirobj_lock); 1387 1388 for (int l = 0; l < ZTEST_OBJECT_LOCKS; l++) 1389 ztest_rll_destroy(&zd->zd_object_lock[l]); 1390 1391 for (int l = 0; l < ZTEST_RANGE_LOCKS; l++) 1392 ztest_rll_destroy(&zd->zd_range_lock[l]); 1393 } 1394 1395 #define TXG_MIGHTWAIT (ztest_random(10) == 0 ? 
TXG_NOWAIT : TXG_WAIT) 1396 1397 static uint64_t 1398 ztest_tx_assign(dmu_tx_t *tx, uint64_t txg_how, const char *tag) 1399 { 1400 uint64_t txg; 1401 int error; 1402 1403 /* 1404 * Attempt to assign tx to some transaction group. 1405 */ 1406 error = dmu_tx_assign(tx, txg_how); 1407 if (error) { 1408 if (error == ERESTART) { 1409 ASSERT(txg_how == TXG_NOWAIT); 1410 dmu_tx_wait(tx); 1411 } else { 1412 ASSERT3U(error, ==, ENOSPC); 1413 ztest_record_enospc(tag); 1414 } 1415 dmu_tx_abort(tx); 1416 return (0); 1417 } 1418 txg = dmu_tx_get_txg(tx); 1419 ASSERT(txg != 0); 1420 return (txg); 1421 } 1422 1423 static void 1424 ztest_pattern_set(void *buf, uint64_t size, uint64_t value) 1425 { 1426 uint64_t *ip = buf; 1427 uint64_t *ip_end = (uint64_t *)((uintptr_t)buf + (uintptr_t)size); 1428 1429 while (ip < ip_end) 1430 *ip++ = value; 1431 } 1432 1433 static boolean_t 1434 ztest_pattern_match(void *buf, uint64_t size, uint64_t value) 1435 { 1436 uint64_t *ip = buf; 1437 uint64_t *ip_end = (uint64_t *)((uintptr_t)buf + (uintptr_t)size); 1438 uint64_t diff = 0; 1439 1440 while (ip < ip_end) 1441 diff |= (value - *ip++); 1442 1443 return (diff == 0); 1444 } 1445 1446 static void 1447 ztest_bt_generate(ztest_block_tag_t *bt, objset_t *os, uint64_t object, 1448 uint64_t dnodesize, uint64_t offset, uint64_t gen, uint64_t txg, 1449 uint64_t crtxg) 1450 { 1451 bt->bt_magic = BT_MAGIC; 1452 bt->bt_objset = dmu_objset_id(os); 1453 bt->bt_object = object; 1454 bt->bt_dnodesize = dnodesize; 1455 bt->bt_offset = offset; 1456 bt->bt_gen = gen; 1457 bt->bt_txg = txg; 1458 bt->bt_crtxg = crtxg; 1459 } 1460 1461 static void 1462 ztest_bt_verify(ztest_block_tag_t *bt, objset_t *os, uint64_t object, 1463 uint64_t dnodesize, uint64_t offset, uint64_t gen, uint64_t txg, 1464 uint64_t crtxg) 1465 { 1466 ASSERT3U(bt->bt_magic, ==, BT_MAGIC); 1467 ASSERT3U(bt->bt_objset, ==, dmu_objset_id(os)); 1468 ASSERT3U(bt->bt_object, ==, object); 1469 ASSERT3U(bt->bt_dnodesize, ==, dnodesize); 1470 ASSERT3U(bt->bt_offset, ==, offset); 1471 ASSERT3U(bt->bt_gen, <=, gen); 1472 ASSERT3U(bt->bt_txg, <=, txg); 1473 ASSERT3U(bt->bt_crtxg, ==, crtxg); 1474 } 1475 1476 static ztest_block_tag_t * 1477 ztest_bt_bonus(dmu_buf_t *db) 1478 { 1479 dmu_object_info_t doi; 1480 ztest_block_tag_t *bt; 1481 1482 dmu_object_info_from_db(db, &doi); 1483 ASSERT3U(doi.doi_bonus_size, <=, db->db_size); 1484 ASSERT3U(doi.doi_bonus_size, >=, sizeof (*bt)); 1485 bt = (void *)((char *)db->db_data + doi.doi_bonus_size - sizeof (*bt)); 1486 1487 return (bt); 1488 } 1489 1490 /* 1491 * Generate a token to fill up unused bonus buffer space. Try to make 1492 * it unique to the object, generation, and offset to verify that data 1493 * is not getting overwritten by data from other dnodes. 1494 */ 1495 #define ZTEST_BONUS_FILL_TOKEN(obj, ds, gen, offset) \ 1496 (((ds) << 48) | ((gen) << 32) | ((obj) << 8) | (offset)) 1497 1498 /* 1499 * Fill up the unused bonus buffer region before the block tag with a 1500 * verifiable pattern. Filling the whole bonus area with non-zero data 1501 * helps ensure that all dnode traversal code properly skips the 1502 * interior regions of large dnodes. 
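 * Each unused 8-byte word is stamped with ZTEST_BONUS_FILL_TOKEN(), which
 * packs the dataset id, generation, object number, and word offset into a
 * single 64-bit value (see the macro definition above).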
1503 */ 1504 void 1505 ztest_fill_unused_bonus(dmu_buf_t *db, void *end, uint64_t obj, 1506 objset_t *os, uint64_t gen) 1507 { 1508 uint64_t *bonusp; 1509 1510 ASSERT(IS_P2ALIGNED((char *)end - (char *)db->db_data, 8)); 1511 1512 for (bonusp = db->db_data; bonusp < (uint64_t *)end; bonusp++) { 1513 uint64_t token = ZTEST_BONUS_FILL_TOKEN(obj, dmu_objset_id(os), 1514 gen, bonusp - (uint64_t *)db->db_data); 1515 *bonusp = token; 1516 } 1517 } 1518 1519 /* 1520 * Verify that the unused area of a bonus buffer is filled with the 1521 * expected tokens. 1522 */ 1523 void 1524 ztest_verify_unused_bonus(dmu_buf_t *db, void *end, uint64_t obj, 1525 objset_t *os, uint64_t gen) 1526 { 1527 uint64_t *bonusp; 1528 1529 for (bonusp = db->db_data; bonusp < (uint64_t *)end; bonusp++) { 1530 uint64_t token = ZTEST_BONUS_FILL_TOKEN(obj, dmu_objset_id(os), 1531 gen, bonusp - (uint64_t *)db->db_data); 1532 VERIFY3U(*bonusp, ==, token); 1533 } 1534 } 1535 1536 /* 1537 * ZIL logging ops 1538 */ 1539 1540 #define lrz_type lr_mode 1541 #define lrz_blocksize lr_uid 1542 #define lrz_ibshift lr_gid 1543 #define lrz_bonustype lr_rdev 1544 #define lrz_dnodesize lr_crtime[1] 1545 1546 static void 1547 ztest_log_create(ztest_ds_t *zd, dmu_tx_t *tx, lr_create_t *lr) 1548 { 1549 char *name = (void *)(lr + 1); /* name follows lr */ 1550 size_t namesize = strlen(name) + 1; 1551 itx_t *itx; 1552 1553 if (zil_replaying(zd->zd_zilog, tx)) 1554 return; 1555 1556 itx = zil_itx_create(TX_CREATE, sizeof (*lr) + namesize); 1557 bcopy(&lr->lr_common + 1, &itx->itx_lr + 1, 1558 sizeof (*lr) + namesize - sizeof (lr_t)); 1559 1560 zil_itx_assign(zd->zd_zilog, itx, tx); 1561 } 1562 1563 static void 1564 ztest_log_remove(ztest_ds_t *zd, dmu_tx_t *tx, lr_remove_t *lr, uint64_t object) 1565 { 1566 char *name = (void *)(lr + 1); /* name follows lr */ 1567 size_t namesize = strlen(name) + 1; 1568 itx_t *itx; 1569 1570 if (zil_replaying(zd->zd_zilog, tx)) 1571 return; 1572 1573 itx = zil_itx_create(TX_REMOVE, sizeof (*lr) + namesize); 1574 bcopy(&lr->lr_common + 1, &itx->itx_lr + 1, 1575 sizeof (*lr) + namesize - sizeof (lr_t)); 1576 1577 itx->itx_oid = object; 1578 zil_itx_assign(zd->zd_zilog, itx, tx); 1579 } 1580 1581 static void 1582 ztest_log_write(ztest_ds_t *zd, dmu_tx_t *tx, lr_write_t *lr) 1583 { 1584 itx_t *itx; 1585 itx_wr_state_t write_state = ztest_random(WR_NUM_STATES); 1586 1587 if (zil_replaying(zd->zd_zilog, tx)) 1588 return; 1589 1590 if (lr->lr_length > ZIL_MAX_LOG_DATA) 1591 write_state = WR_INDIRECT; 1592 1593 itx = zil_itx_create(TX_WRITE, 1594 sizeof (*lr) + (write_state == WR_COPIED ? 
lr->lr_length : 0)); 1595 1596 if (write_state == WR_COPIED && 1597 dmu_read(zd->zd_os, lr->lr_foid, lr->lr_offset, lr->lr_length, 1598 ((lr_write_t *)&itx->itx_lr) + 1, DMU_READ_NO_PREFETCH) != 0) { 1599 zil_itx_destroy(itx); 1600 itx = zil_itx_create(TX_WRITE, sizeof (*lr)); 1601 write_state = WR_NEED_COPY; 1602 } 1603 itx->itx_private = zd; 1604 itx->itx_wr_state = write_state; 1605 itx->itx_sync = (ztest_random(8) == 0); 1606 1607 bcopy(&lr->lr_common + 1, &itx->itx_lr + 1, 1608 sizeof (*lr) - sizeof (lr_t)); 1609 1610 zil_itx_assign(zd->zd_zilog, itx, tx); 1611 } 1612 1613 static void 1614 ztest_log_truncate(ztest_ds_t *zd, dmu_tx_t *tx, lr_truncate_t *lr) 1615 { 1616 itx_t *itx; 1617 1618 if (zil_replaying(zd->zd_zilog, tx)) 1619 return; 1620 1621 itx = zil_itx_create(TX_TRUNCATE, sizeof (*lr)); 1622 bcopy(&lr->lr_common + 1, &itx->itx_lr + 1, 1623 sizeof (*lr) - sizeof (lr_t)); 1624 1625 itx->itx_sync = B_FALSE; 1626 zil_itx_assign(zd->zd_zilog, itx, tx); 1627 } 1628 1629 static void 1630 ztest_log_setattr(ztest_ds_t *zd, dmu_tx_t *tx, lr_setattr_t *lr) 1631 { 1632 itx_t *itx; 1633 1634 if (zil_replaying(zd->zd_zilog, tx)) 1635 return; 1636 1637 itx = zil_itx_create(TX_SETATTR, sizeof (*lr)); 1638 bcopy(&lr->lr_common + 1, &itx->itx_lr + 1, 1639 sizeof (*lr) - sizeof (lr_t)); 1640 1641 itx->itx_sync = B_FALSE; 1642 zil_itx_assign(zd->zd_zilog, itx, tx); 1643 } 1644 1645 /* 1646 * ZIL replay ops 1647 */ 1648 static int 1649 ztest_replay_create(void *arg1, void *arg2, boolean_t byteswap) 1650 { 1651 ztest_ds_t *zd = arg1; 1652 lr_create_t *lr = arg2; 1653 char *name = (void *)(lr + 1); /* name follows lr */ 1654 objset_t *os = zd->zd_os; 1655 ztest_block_tag_t *bbt; 1656 dmu_buf_t *db; 1657 dmu_tx_t *tx; 1658 uint64_t txg; 1659 int error = 0; 1660 int bonuslen; 1661 1662 if (byteswap) 1663 byteswap_uint64_array(lr, sizeof (*lr)); 1664 1665 ASSERT(lr->lr_doid == ZTEST_DIROBJ); 1666 ASSERT(name[0] != '\0'); 1667 1668 tx = dmu_tx_create(os); 1669 1670 dmu_tx_hold_zap(tx, lr->lr_doid, B_TRUE, name); 1671 1672 if (lr->lrz_type == DMU_OT_ZAP_OTHER) { 1673 dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL); 1674 } else { 1675 dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); 1676 } 1677 1678 txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); 1679 if (txg == 0) 1680 return (ENOSPC); 1681 1682 ASSERT(dmu_objset_zil(os)->zl_replay == !!lr->lr_foid); 1683 bonuslen = DN_BONUS_SIZE(lr->lrz_dnodesize); 1684 1685 if (lr->lrz_type == DMU_OT_ZAP_OTHER) { 1686 if (lr->lr_foid == 0) { 1687 lr->lr_foid = zap_create_dnsize(os, 1688 lr->lrz_type, lr->lrz_bonustype, 1689 bonuslen, lr->lrz_dnodesize, tx); 1690 } else { 1691 error = zap_create_claim_dnsize(os, lr->lr_foid, 1692 lr->lrz_type, lr->lrz_bonustype, 1693 bonuslen, lr->lrz_dnodesize, tx); 1694 } 1695 } else { 1696 if (lr->lr_foid == 0) { 1697 lr->lr_foid = dmu_object_alloc_dnsize(os, 1698 lr->lrz_type, 0, lr->lrz_bonustype, 1699 bonuslen, lr->lrz_dnodesize, tx); 1700 } else { 1701 error = dmu_object_claim_dnsize(os, lr->lr_foid, 1702 lr->lrz_type, 0, lr->lrz_bonustype, 1703 bonuslen, lr->lrz_dnodesize, tx); 1704 } 1705 } 1706 1707 if (error) { 1708 ASSERT3U(error, ==, EEXIST); 1709 ASSERT(zd->zd_zilog->zl_replay); 1710 dmu_tx_commit(tx); 1711 return (error); 1712 } 1713 1714 ASSERT(lr->lr_foid != 0); 1715 1716 if (lr->lrz_type != DMU_OT_ZAP_OTHER) 1717 VERIFY3U(0, ==, dmu_object_set_blocksize(os, lr->lr_foid, 1718 lr->lrz_blocksize, lr->lrz_ibshift, tx)); 1719 1720 VERIFY3U(0, ==, dmu_bonus_hold(os, lr->lr_foid, FTAG, &db)); 1721 bbt = ztest_bt_bonus(db); 1722 
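	/*
	 * Stamp the new object's bonus buffer: mark it dirty, generate its
	 * block tag, and fill the remaining bonus space with the pattern
	 * checked later by ztest_verify_unused_bonus().
	 */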
dmu_buf_will_dirty(db, tx); 1723 ztest_bt_generate(bbt, os, lr->lr_foid, lr->lrz_dnodesize, -1ULL, 1724 lr->lr_gen, txg, txg); 1725 ztest_fill_unused_bonus(db, bbt, lr->lr_foid, os, lr->lr_gen); 1726 dmu_buf_rele(db, FTAG); 1727 1728 VERIFY3U(0, ==, zap_add(os, lr->lr_doid, name, sizeof (uint64_t), 1, 1729 &lr->lr_foid, tx)); 1730 1731 (void) ztest_log_create(zd, tx, lr); 1732 1733 dmu_tx_commit(tx); 1734 1735 return (0); 1736 } 1737 1738 static int 1739 ztest_replay_remove(void *arg1, void *arg2, boolean_t byteswap) 1740 { 1741 ztest_ds_t *zd = arg1; 1742 lr_remove_t *lr = arg2; 1743 char *name = (void *)(lr + 1); /* name follows lr */ 1744 objset_t *os = zd->zd_os; 1745 dmu_object_info_t doi; 1746 dmu_tx_t *tx; 1747 uint64_t object, txg; 1748 1749 if (byteswap) 1750 byteswap_uint64_array(lr, sizeof (*lr)); 1751 1752 ASSERT(lr->lr_doid == ZTEST_DIROBJ); 1753 ASSERT(name[0] != '\0'); 1754 1755 VERIFY3U(0, ==, 1756 zap_lookup(os, lr->lr_doid, name, sizeof (object), 1, &object)); 1757 ASSERT(object != 0); 1758 1759 ztest_object_lock(zd, object, RL_WRITER); 1760 1761 VERIFY3U(0, ==, dmu_object_info(os, object, &doi)); 1762 1763 tx = dmu_tx_create(os); 1764 1765 dmu_tx_hold_zap(tx, lr->lr_doid, B_FALSE, name); 1766 dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END); 1767 1768 txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); 1769 if (txg == 0) { 1770 ztest_object_unlock(zd, object); 1771 return (ENOSPC); 1772 } 1773 1774 if (doi.doi_type == DMU_OT_ZAP_OTHER) { 1775 VERIFY3U(0, ==, zap_destroy(os, object, tx)); 1776 } else { 1777 VERIFY3U(0, ==, dmu_object_free(os, object, tx)); 1778 } 1779 1780 VERIFY3U(0, ==, zap_remove(os, lr->lr_doid, name, tx)); 1781 1782 (void) ztest_log_remove(zd, tx, lr, object); 1783 1784 dmu_tx_commit(tx); 1785 1786 ztest_object_unlock(zd, object); 1787 1788 return (0); 1789 } 1790 1791 static int 1792 ztest_replay_write(void *arg1, void *arg2, boolean_t byteswap) 1793 { 1794 ztest_ds_t *zd = arg1; 1795 lr_write_t *lr = arg2; 1796 objset_t *os = zd->zd_os; 1797 void *data = lr + 1; /* data follows lr */ 1798 uint64_t offset, length; 1799 ztest_block_tag_t *bt = data; 1800 ztest_block_tag_t *bbt; 1801 uint64_t gen, txg, lrtxg, crtxg; 1802 dmu_object_info_t doi; 1803 dmu_tx_t *tx; 1804 dmu_buf_t *db; 1805 arc_buf_t *abuf = NULL; 1806 rl_t *rl; 1807 1808 if (byteswap) 1809 byteswap_uint64_array(lr, sizeof (*lr)); 1810 1811 offset = lr->lr_offset; 1812 length = lr->lr_length; 1813 1814 /* If it's a dmu_sync() block, write the whole block */ 1815 if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) { 1816 uint64_t blocksize = BP_GET_LSIZE(&lr->lr_blkptr); 1817 if (length < blocksize) { 1818 offset -= offset % blocksize; 1819 length = blocksize; 1820 } 1821 } 1822 1823 if (bt->bt_magic == BSWAP_64(BT_MAGIC)) 1824 byteswap_uint64_array(bt, sizeof (*bt)); 1825 1826 if (bt->bt_magic != BT_MAGIC) 1827 bt = NULL; 1828 1829 ztest_object_lock(zd, lr->lr_foid, RL_READER); 1830 rl = ztest_range_lock(zd, lr->lr_foid, offset, length, RL_WRITER); 1831 1832 VERIFY3U(0, ==, dmu_bonus_hold(os, lr->lr_foid, FTAG, &db)); 1833 1834 dmu_object_info_from_db(db, &doi); 1835 1836 bbt = ztest_bt_bonus(db); 1837 ASSERT3U(bbt->bt_magic, ==, BT_MAGIC); 1838 gen = bbt->bt_gen; 1839 crtxg = bbt->bt_crtxg; 1840 lrtxg = lr->lr_common.lrc_txg; 1841 1842 tx = dmu_tx_create(os); 1843 1844 dmu_tx_hold_write(tx, lr->lr_foid, offset, length); 1845 1846 if (ztest_random(8) == 0 && length == doi.doi_data_block_size && 1847 P2PHASE(offset, length) == 0) 1848 abuf = dmu_request_arcbuf(db, length); 1849 1850 txg = 
ztest_tx_assign(tx, TXG_WAIT, FTAG); 1851 if (txg == 0) { 1852 if (abuf != NULL) 1853 dmu_return_arcbuf(abuf); 1854 dmu_buf_rele(db, FTAG); 1855 ztest_range_unlock(rl); 1856 ztest_object_unlock(zd, lr->lr_foid); 1857 return (ENOSPC); 1858 } 1859 1860 if (bt != NULL) { 1861 /* 1862 * Usually, verify the old data before writing new data -- 1863 * but not always, because we also want to verify correct 1864 * behavior when the data was not recently read into cache. 1865 */ 1866 ASSERT(offset % doi.doi_data_block_size == 0); 1867 if (ztest_random(4) != 0) { 1868 int prefetch = ztest_random(2) ? 1869 DMU_READ_PREFETCH : DMU_READ_NO_PREFETCH; 1870 ztest_block_tag_t rbt; 1871 1872 VERIFY(dmu_read(os, lr->lr_foid, offset, 1873 sizeof (rbt), &rbt, prefetch) == 0); 1874 if (rbt.bt_magic == BT_MAGIC) { 1875 ztest_bt_verify(&rbt, os, lr->lr_foid, 0, 1876 offset, gen, txg, crtxg); 1877 } 1878 } 1879 1880 /* 1881 * Writes can appear to be newer than the bonus buffer because 1882 * the ztest_get_data() callback does a dmu_read() of the 1883 * open-context data, which may be different than the data 1884 * as it was when the write was generated. 1885 */ 1886 if (zd->zd_zilog->zl_replay) { 1887 ztest_bt_verify(bt, os, lr->lr_foid, 0, offset, 1888 MAX(gen, bt->bt_gen), MAX(txg, lrtxg), 1889 bt->bt_crtxg); 1890 } 1891 1892 /* 1893 * Set the bt's gen/txg to the bonus buffer's gen/txg 1894 * so that all of the usual ASSERTs will work. 1895 */ 1896 ztest_bt_generate(bt, os, lr->lr_foid, 0, offset, gen, txg, 1897 crtxg); 1898 } 1899 1900 if (abuf == NULL) { 1901 dmu_write(os, lr->lr_foid, offset, length, data, tx); 1902 } else { 1903 bcopy(data, abuf->b_data, length); 1904 dmu_assign_arcbuf_by_dbuf(db, offset, abuf, tx); 1905 } 1906 1907 (void) ztest_log_write(zd, tx, lr); 1908 1909 dmu_buf_rele(db, FTAG); 1910 1911 dmu_tx_commit(tx); 1912 1913 ztest_range_unlock(rl); 1914 ztest_object_unlock(zd, lr->lr_foid); 1915 1916 return (0); 1917 } 1918 1919 static int 1920 ztest_replay_truncate(void *arg1, void *arg2, boolean_t byteswap) 1921 { 1922 ztest_ds_t *zd = arg1; 1923 lr_truncate_t *lr = arg2; 1924 objset_t *os = zd->zd_os; 1925 dmu_tx_t *tx; 1926 uint64_t txg; 1927 rl_t *rl; 1928 1929 if (byteswap) 1930 byteswap_uint64_array(lr, sizeof (*lr)); 1931 1932 ztest_object_lock(zd, lr->lr_foid, RL_READER); 1933 rl = ztest_range_lock(zd, lr->lr_foid, lr->lr_offset, lr->lr_length, 1934 RL_WRITER); 1935 1936 tx = dmu_tx_create(os); 1937 1938 dmu_tx_hold_free(tx, lr->lr_foid, lr->lr_offset, lr->lr_length); 1939 1940 txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); 1941 if (txg == 0) { 1942 ztest_range_unlock(rl); 1943 ztest_object_unlock(zd, lr->lr_foid); 1944 return (ENOSPC); 1945 } 1946 1947 VERIFY(dmu_free_range(os, lr->lr_foid, lr->lr_offset, 1948 lr->lr_length, tx) == 0); 1949 1950 (void) ztest_log_truncate(zd, tx, lr); 1951 1952 dmu_tx_commit(tx); 1953 1954 ztest_range_unlock(rl); 1955 ztest_object_unlock(zd, lr->lr_foid); 1956 1957 return (0); 1958 } 1959 1960 static int 1961 ztest_replay_setattr(void *arg1, void *arg2, boolean_t byteswap) 1962 { 1963 ztest_ds_t *zd = arg1; 1964 lr_setattr_t *lr = arg2; 1965 objset_t *os = zd->zd_os; 1966 dmu_tx_t *tx; 1967 dmu_buf_t *db; 1968 ztest_block_tag_t *bbt; 1969 uint64_t txg, lrtxg, crtxg, dnodesize; 1970 1971 if (byteswap) 1972 byteswap_uint64_array(lr, sizeof (*lr)); 1973 1974 ztest_object_lock(zd, lr->lr_foid, RL_WRITER); 1975 1976 VERIFY3U(0, ==, dmu_bonus_hold(os, lr->lr_foid, FTAG, &db)); 1977 1978 tx = dmu_tx_create(os); 1979 dmu_tx_hold_bonus(tx, lr->lr_foid); 1980 1981 
txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); 1982 if (txg == 0) { 1983 dmu_buf_rele(db, FTAG); 1984 ztest_object_unlock(zd, lr->lr_foid); 1985 return (ENOSPC); 1986 } 1987 1988 bbt = ztest_bt_bonus(db); 1989 ASSERT3U(bbt->bt_magic, ==, BT_MAGIC); 1990 crtxg = bbt->bt_crtxg; 1991 lrtxg = lr->lr_common.lrc_txg; 1992 dnodesize = bbt->bt_dnodesize; 1993 1994 if (zd->zd_zilog->zl_replay) { 1995 ASSERT(lr->lr_size != 0); 1996 ASSERT(lr->lr_mode != 0); 1997 ASSERT(lrtxg != 0); 1998 } else { 1999 /* 2000 * Randomly change the size and increment the generation. 2001 */ 2002 lr->lr_size = (ztest_random(db->db_size / sizeof (*bbt)) + 1) * 2003 sizeof (*bbt); 2004 lr->lr_mode = bbt->bt_gen + 1; 2005 ASSERT(lrtxg == 0); 2006 } 2007 2008 /* 2009 * Verify that the current bonus buffer is not newer than our txg. 2010 */ 2011 ztest_bt_verify(bbt, os, lr->lr_foid, dnodesize, -1ULL, lr->lr_mode, 2012 MAX(txg, lrtxg), crtxg); 2013 2014 dmu_buf_will_dirty(db, tx); 2015 2016 ASSERT3U(lr->lr_size, >=, sizeof (*bbt)); 2017 ASSERT3U(lr->lr_size, <=, db->db_size); 2018 VERIFY0(dmu_set_bonus(db, lr->lr_size, tx)); 2019 bbt = ztest_bt_bonus(db); 2020 2021 ztest_bt_generate(bbt, os, lr->lr_foid, dnodesize, -1ULL, lr->lr_mode, 2022 txg, crtxg); 2023 ztest_fill_unused_bonus(db, bbt, lr->lr_foid, os, bbt->bt_gen); 2024 2025 dmu_buf_rele(db, FTAG); 2026 2027 (void) ztest_log_setattr(zd, tx, lr); 2028 2029 dmu_tx_commit(tx); 2030 2031 ztest_object_unlock(zd, lr->lr_foid); 2032 2033 return (0); 2034 } 2035 2036 zil_replay_func_t *ztest_replay_vector[TX_MAX_TYPE] = { 2037 NULL, /* 0 no such transaction type */ 2038 ztest_replay_create, /* TX_CREATE */ 2039 NULL, /* TX_MKDIR */ 2040 NULL, /* TX_MKXATTR */ 2041 NULL, /* TX_SYMLINK */ 2042 ztest_replay_remove, /* TX_REMOVE */ 2043 NULL, /* TX_RMDIR */ 2044 NULL, /* TX_LINK */ 2045 NULL, /* TX_RENAME */ 2046 ztest_replay_write, /* TX_WRITE */ 2047 ztest_replay_truncate, /* TX_TRUNCATE */ 2048 ztest_replay_setattr, /* TX_SETATTR */ 2049 NULL, /* TX_ACL */ 2050 NULL, /* TX_CREATE_ACL */ 2051 NULL, /* TX_CREATE_ATTR */ 2052 NULL, /* TX_CREATE_ACL_ATTR */ 2053 NULL, /* TX_MKDIR_ACL */ 2054 NULL, /* TX_MKDIR_ATTR */ 2055 NULL, /* TX_MKDIR_ACL_ATTR */ 2056 NULL, /* TX_WRITE2 */ 2057 }; 2058 2059 /* 2060 * ZIL get_data callbacks 2061 */ 2062 2063 /* ARGSUSED */ 2064 static void 2065 ztest_get_done(zgd_t *zgd, int error) 2066 { 2067 ztest_ds_t *zd = zgd->zgd_private; 2068 uint64_t object = ((rl_t *)zgd->zgd_lr)->rl_object; 2069 2070 if (zgd->zgd_db) 2071 dmu_buf_rele(zgd->zgd_db, zgd); 2072 2073 ztest_range_unlock((rl_t *)zgd->zgd_lr); 2074 ztest_object_unlock(zd, object); 2075 2076 umem_free(zgd, sizeof (*zgd)); 2077 } 2078 2079 static int 2080 ztest_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, 2081 zio_t *zio) 2082 { 2083 ztest_ds_t *zd = arg; 2084 objset_t *os = zd->zd_os; 2085 uint64_t object = lr->lr_foid; 2086 uint64_t offset = lr->lr_offset; 2087 uint64_t size = lr->lr_length; 2088 uint64_t txg = lr->lr_common.lrc_txg; 2089 uint64_t crtxg; 2090 dmu_object_info_t doi; 2091 dmu_buf_t *db; 2092 zgd_t *zgd; 2093 int error; 2094 2095 ASSERT3P(lwb, !=, NULL); 2096 ASSERT3P(zio, !=, NULL); 2097 ASSERT3U(size, !=, 0); 2098 2099 ztest_object_lock(zd, object, RL_READER); 2100 error = dmu_bonus_hold(os, object, FTAG, &db); 2101 if (error) { 2102 ztest_object_unlock(zd, object); 2103 return (error); 2104 } 2105 2106 crtxg = ztest_bt_bonus(db)->bt_crtxg; 2107 2108 if (crtxg == 0 || crtxg > txg) { 2109 dmu_buf_rele(db, FTAG); 2110 ztest_object_unlock(zd, object); 2111 return 
(ENOENT); 2112 } 2113 2114 dmu_object_info_from_db(db, &doi); 2115 dmu_buf_rele(db, FTAG); 2116 db = NULL; 2117 2118 zgd = umem_zalloc(sizeof (*zgd), UMEM_NOFAIL); 2119 zgd->zgd_lwb = lwb; 2120 zgd->zgd_private = zd; 2121 2122 if (buf != NULL) { /* immediate write */ 2123 zgd->zgd_lr = (struct locked_range *)ztest_range_lock(zd, 2124 object, offset, size, RL_READER); 2125 2126 error = dmu_read(os, object, offset, size, buf, 2127 DMU_READ_NO_PREFETCH); 2128 ASSERT(error == 0); 2129 } else { 2130 size = doi.doi_data_block_size; 2131 if (ISP2(size)) { 2132 offset = P2ALIGN(offset, size); 2133 } else { 2134 ASSERT(offset < size); 2135 offset = 0; 2136 } 2137 2138 zgd->zgd_lr = (struct locked_range *)ztest_range_lock(zd, 2139 object, offset, size, RL_READER); 2140 2141 error = dmu_buf_hold(os, object, offset, zgd, &db, 2142 DMU_READ_NO_PREFETCH); 2143 2144 if (error == 0) { 2145 blkptr_t *bp = &lr->lr_blkptr; 2146 2147 zgd->zgd_db = db; 2148 zgd->zgd_bp = bp; 2149 2150 ASSERT(db->db_offset == offset); 2151 ASSERT(db->db_size == size); 2152 2153 error = dmu_sync(zio, lr->lr_common.lrc_txg, 2154 ztest_get_done, zgd); 2155 2156 if (error == 0) 2157 return (0); 2158 } 2159 } 2160 2161 ztest_get_done(zgd, error); 2162 2163 return (error); 2164 } 2165 2166 static void * 2167 ztest_lr_alloc(size_t lrsize, char *name) 2168 { 2169 char *lr; 2170 size_t namesize = name ? strlen(name) + 1 : 0; 2171 2172 lr = umem_zalloc(lrsize + namesize, UMEM_NOFAIL); 2173 2174 if (name) 2175 bcopy(name, lr + lrsize, namesize); 2176 2177 return (lr); 2178 } 2179 2180 void 2181 ztest_lr_free(void *lr, size_t lrsize, char *name) 2182 { 2183 size_t namesize = name ? strlen(name) + 1 : 0; 2184 2185 umem_free(lr, lrsize + namesize); 2186 } 2187 2188 /* 2189 * Lookup a bunch of objects. Returns the number of objects not found. 
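 * Each object is looked up by name in its od_dir ZAP; for each one found,
 * the od template is refreshed with the object's current type, block size,
 * and generation taken from its bonus-buffer block tag.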
2190 */ 2191 static int 2192 ztest_lookup(ztest_ds_t *zd, ztest_od_t *od, int count) 2193 { 2194 int missing = 0; 2195 int error; 2196 2197 ASSERT(MUTEX_HELD(&zd->zd_dirobj_lock)); 2198 2199 for (int i = 0; i < count; i++, od++) { 2200 od->od_object = 0; 2201 error = zap_lookup(zd->zd_os, od->od_dir, od->od_name, 2202 sizeof (uint64_t), 1, &od->od_object); 2203 if (error) { 2204 ASSERT(error == ENOENT); 2205 ASSERT(od->od_object == 0); 2206 missing++; 2207 } else { 2208 dmu_buf_t *db; 2209 ztest_block_tag_t *bbt; 2210 dmu_object_info_t doi; 2211 2212 ASSERT(od->od_object != 0); 2213 ASSERT(missing == 0); /* there should be no gaps */ 2214 2215 ztest_object_lock(zd, od->od_object, RL_READER); 2216 VERIFY3U(0, ==, dmu_bonus_hold(zd->zd_os, 2217 od->od_object, FTAG, &db)); 2218 dmu_object_info_from_db(db, &doi); 2219 bbt = ztest_bt_bonus(db); 2220 ASSERT3U(bbt->bt_magic, ==, BT_MAGIC); 2221 od->od_type = doi.doi_type; 2222 od->od_blocksize = doi.doi_data_block_size; 2223 od->od_gen = bbt->bt_gen; 2224 dmu_buf_rele(db, FTAG); 2225 ztest_object_unlock(zd, od->od_object); 2226 } 2227 } 2228 2229 return (missing); 2230 } 2231 2232 static int 2233 ztest_create(ztest_ds_t *zd, ztest_od_t *od, int count) 2234 { 2235 int missing = 0; 2236 2237 ASSERT(MUTEX_HELD(&zd->zd_dirobj_lock)); 2238 2239 for (int i = 0; i < count; i++, od++) { 2240 if (missing) { 2241 od->od_object = 0; 2242 missing++; 2243 continue; 2244 } 2245 2246 lr_create_t *lr = ztest_lr_alloc(sizeof (*lr), od->od_name); 2247 2248 lr->lr_doid = od->od_dir; 2249 lr->lr_foid = 0; /* 0 to allocate, > 0 to claim */ 2250 lr->lrz_type = od->od_crtype; 2251 lr->lrz_blocksize = od->od_crblocksize; 2252 lr->lrz_ibshift = ztest_random_ibshift(); 2253 lr->lrz_bonustype = DMU_OT_UINT64_OTHER; 2254 lr->lrz_dnodesize = od->od_crdnodesize; 2255 lr->lr_gen = od->od_crgen; 2256 lr->lr_crtime[0] = time(NULL); 2257 2258 if (ztest_replay_create(zd, lr, B_FALSE) != 0) { 2259 ASSERT(missing == 0); 2260 od->od_object = 0; 2261 missing++; 2262 } else { 2263 od->od_object = lr->lr_foid; 2264 od->od_type = od->od_crtype; 2265 od->od_blocksize = od->od_crblocksize; 2266 od->od_gen = od->od_crgen; 2267 ASSERT(od->od_object != 0); 2268 } 2269 2270 ztest_lr_free(lr, sizeof (*lr), od->od_name); 2271 } 2272 2273 return (missing); 2274 } 2275 2276 static int 2277 ztest_remove(ztest_ds_t *zd, ztest_od_t *od, int count) 2278 { 2279 int missing = 0; 2280 int error; 2281 2282 ASSERT(MUTEX_HELD(&zd->zd_dirobj_lock)); 2283 2284 od += count - 1; 2285 2286 for (int i = count - 1; i >= 0; i--, od--) { 2287 if (missing) { 2288 missing++; 2289 continue; 2290 } 2291 2292 /* 2293 * No object was found. 
2294 */ 2295 if (od->od_object == 0) 2296 continue; 2297 2298 lr_remove_t *lr = ztest_lr_alloc(sizeof (*lr), od->od_name); 2299 2300 lr->lr_doid = od->od_dir; 2301 2302 if ((error = ztest_replay_remove(zd, lr, B_FALSE)) != 0) { 2303 ASSERT3U(error, ==, ENOSPC); 2304 missing++; 2305 } else { 2306 od->od_object = 0; 2307 } 2308 ztest_lr_free(lr, sizeof (*lr), od->od_name); 2309 } 2310 2311 return (missing); 2312 } 2313 2314 static int 2315 ztest_write(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size, 2316 void *data) 2317 { 2318 lr_write_t *lr; 2319 int error; 2320 2321 lr = ztest_lr_alloc(sizeof (*lr) + size, NULL); 2322 2323 lr->lr_foid = object; 2324 lr->lr_offset = offset; 2325 lr->lr_length = size; 2326 lr->lr_blkoff = 0; 2327 BP_ZERO(&lr->lr_blkptr); 2328 2329 bcopy(data, lr + 1, size); 2330 2331 error = ztest_replay_write(zd, lr, B_FALSE); 2332 2333 ztest_lr_free(lr, sizeof (*lr) + size, NULL); 2334 2335 return (error); 2336 } 2337 2338 static int 2339 ztest_truncate(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size) 2340 { 2341 lr_truncate_t *lr; 2342 int error; 2343 2344 lr = ztest_lr_alloc(sizeof (*lr), NULL); 2345 2346 lr->lr_foid = object; 2347 lr->lr_offset = offset; 2348 lr->lr_length = size; 2349 2350 error = ztest_replay_truncate(zd, lr, B_FALSE); 2351 2352 ztest_lr_free(lr, sizeof (*lr), NULL); 2353 2354 return (error); 2355 } 2356 2357 static int 2358 ztest_setattr(ztest_ds_t *zd, uint64_t object) 2359 { 2360 lr_setattr_t *lr; 2361 int error; 2362 2363 lr = ztest_lr_alloc(sizeof (*lr), NULL); 2364 2365 lr->lr_foid = object; 2366 lr->lr_size = 0; 2367 lr->lr_mode = 0; 2368 2369 error = ztest_replay_setattr(zd, lr, B_FALSE); 2370 2371 ztest_lr_free(lr, sizeof (*lr), NULL); 2372 2373 return (error); 2374 } 2375 2376 static void 2377 ztest_prealloc(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size) 2378 { 2379 objset_t *os = zd->zd_os; 2380 dmu_tx_t *tx; 2381 uint64_t txg; 2382 rl_t *rl; 2383 2384 txg_wait_synced(dmu_objset_pool(os), 0); 2385 2386 ztest_object_lock(zd, object, RL_READER); 2387 rl = ztest_range_lock(zd, object, offset, size, RL_WRITER); 2388 2389 tx = dmu_tx_create(os); 2390 2391 dmu_tx_hold_write(tx, object, offset, size); 2392 2393 txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); 2394 2395 if (txg != 0) { 2396 dmu_prealloc(os, object, offset, size, tx); 2397 dmu_tx_commit(tx); 2398 txg_wait_synced(dmu_objset_pool(os), txg); 2399 } else { 2400 (void) dmu_free_long_range(os, object, offset, size); 2401 } 2402 2403 ztest_range_unlock(rl); 2404 ztest_object_unlock(zd, object); 2405 } 2406 2407 static void 2408 ztest_io(ztest_ds_t *zd, uint64_t object, uint64_t offset) 2409 { 2410 int err; 2411 ztest_block_tag_t wbt; 2412 dmu_object_info_t doi; 2413 enum ztest_io_type io_type; 2414 uint64_t blocksize; 2415 void *data; 2416 2417 VERIFY(dmu_object_info(zd->zd_os, object, &doi) == 0); 2418 blocksize = doi.doi_data_block_size; 2419 data = umem_alloc(blocksize, UMEM_NOFAIL); 2420 2421 /* 2422 * Pick an i/o type at random, biased toward writing block tags. 
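 * Half of the time the randomly chosen type is overridden with
 * ZTEST_IO_WRITE_TAG so that block-tag writes dominate the mix.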
2423 */ 2424 io_type = ztest_random(ZTEST_IO_TYPES); 2425 if (ztest_random(2) == 0) 2426 io_type = ZTEST_IO_WRITE_TAG; 2427 2428 rw_enter(&zd->zd_zilog_lock, RW_READER); 2429 2430 switch (io_type) { 2431 2432 case ZTEST_IO_WRITE_TAG: 2433 ztest_bt_generate(&wbt, zd->zd_os, object, doi.doi_dnodesize, 2434 offset, 0, 0, 0); 2435 (void) ztest_write(zd, object, offset, sizeof (wbt), &wbt); 2436 break; 2437 2438 case ZTEST_IO_WRITE_PATTERN: 2439 (void) memset(data, 'a' + (object + offset) % 5, blocksize); 2440 if (ztest_random(2) == 0) { 2441 /* 2442 * Induce fletcher2 collisions to ensure that 2443 * zio_ddt_collision() detects and resolves them 2444 * when using fletcher2-verify for deduplication. 2445 */ 2446 ((uint64_t *)data)[0] ^= 1ULL << 63; 2447 ((uint64_t *)data)[4] ^= 1ULL << 63; 2448 } 2449 (void) ztest_write(zd, object, offset, blocksize, data); 2450 break; 2451 2452 case ZTEST_IO_WRITE_ZEROES: 2453 bzero(data, blocksize); 2454 (void) ztest_write(zd, object, offset, blocksize, data); 2455 break; 2456 2457 case ZTEST_IO_TRUNCATE: 2458 (void) ztest_truncate(zd, object, offset, blocksize); 2459 break; 2460 2461 case ZTEST_IO_SETATTR: 2462 (void) ztest_setattr(zd, object); 2463 break; 2464 2465 case ZTEST_IO_REWRITE: 2466 rw_enter(&ztest_name_lock, RW_READER); 2467 err = ztest_dsl_prop_set_uint64(zd->zd_name, 2468 ZFS_PROP_CHECKSUM, spa_dedup_checksum(ztest_spa), 2469 B_FALSE); 2470 VERIFY(err == 0 || err == ENOSPC); 2471 err = ztest_dsl_prop_set_uint64(zd->zd_name, 2472 ZFS_PROP_COMPRESSION, 2473 ztest_random_dsl_prop(ZFS_PROP_COMPRESSION), 2474 B_FALSE); 2475 VERIFY(err == 0 || err == ENOSPC); 2476 rw_exit(&ztest_name_lock); 2477 2478 VERIFY0(dmu_read(zd->zd_os, object, offset, blocksize, data, 2479 DMU_READ_NO_PREFETCH)); 2480 2481 (void) ztest_write(zd, object, offset, blocksize, data); 2482 break; 2483 } 2484 2485 rw_exit(&zd->zd_zilog_lock); 2486 2487 umem_free(data, blocksize); 2488 } 2489 2490 /* 2491 * Initialize an object description template. 2492 */ 2493 static void 2494 ztest_od_init(ztest_od_t *od, uint64_t id, char *tag, uint64_t index, 2495 dmu_object_type_t type, uint64_t blocksize, uint64_t dnodesize, 2496 uint64_t gen) 2497 { 2498 od->od_dir = ZTEST_DIROBJ; 2499 od->od_object = 0; 2500 2501 od->od_crtype = type; 2502 od->od_crblocksize = blocksize ? blocksize : ztest_random_blocksize(); 2503 od->od_crdnodesize = dnodesize ? dnodesize : ztest_random_dnodesize(); 2504 od->od_crgen = gen; 2505 2506 od->od_type = DMU_OT_NONE; 2507 od->od_blocksize = 0; 2508 od->od_gen = 0; 2509 2510 (void) snprintf(od->od_name, sizeof (od->od_name), "%s(%lld)[%llu]", 2511 tag, (int64_t)id, index); 2512 } 2513 2514 /* 2515 * Lookup or create the objects for a test using the od template. 2516 * If the objects do not all exist, or if 'remove' is specified, 2517 * remove any existing objects and create new ones. Otherwise, 2518 * use the existing objects. 
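 * Returns 0 on success and -1 if the objects could not be removed or
 * (re)created (typically due to ENOSPC); callers treat a non-zero return
 * as a signal to skip the current iteration.  An illustrative call
 * sequence, similar to the test functions below:
 *
 *	ztest_od_t od;
 *	ztest_od_init(&od, id, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0, 0);
 *	if (ztest_object_init(zd, &od, sizeof (od), B_FALSE) != 0)
 *		return;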
2519 */ 2520 static int 2521 ztest_object_init(ztest_ds_t *zd, ztest_od_t *od, size_t size, boolean_t remove) 2522 { 2523 int count = size / sizeof (*od); 2524 int rv = 0; 2525 2526 mutex_enter(&zd->zd_dirobj_lock); 2527 if ((ztest_lookup(zd, od, count) != 0 || remove) && 2528 (ztest_remove(zd, od, count) != 0 || 2529 ztest_create(zd, od, count) != 0)) 2530 rv = -1; 2531 zd->zd_od = od; 2532 mutex_exit(&zd->zd_dirobj_lock); 2533 2534 return (rv); 2535 } 2536 2537 /* ARGSUSED */ 2538 void 2539 ztest_zil_commit(ztest_ds_t *zd, uint64_t id) 2540 { 2541 zilog_t *zilog = zd->zd_zilog; 2542 2543 rw_enter(&zd->zd_zilog_lock, RW_READER); 2544 2545 zil_commit(zilog, ztest_random(ZTEST_OBJECTS)); 2546 2547 /* 2548 * Remember the committed values in zd, which is in parent/child 2549 * shared memory. If we die, the next iteration of ztest_run() 2550 * will verify that the log really does contain this record. 2551 */ 2552 mutex_enter(&zilog->zl_lock); 2553 ASSERT(zd->zd_shared != NULL); 2554 ASSERT3U(zd->zd_shared->zd_seq, <=, zilog->zl_commit_lr_seq); 2555 zd->zd_shared->zd_seq = zilog->zl_commit_lr_seq; 2556 mutex_exit(&zilog->zl_lock); 2557 2558 rw_exit(&zd->zd_zilog_lock); 2559 } 2560 2561 /* 2562 * This function is designed to simulate the operations that occur during a 2563 * mount/unmount operation. We hold the dataset across these operations in an 2564 * attempt to expose any implicit assumptions about ZIL management. 2565 */ 2566 /* ARGSUSED */ 2567 void 2568 ztest_zil_remount(ztest_ds_t *zd, uint64_t id) 2569 { 2570 objset_t *os = zd->zd_os; 2571 2572 /* 2573 * We grab the zd_dirobj_lock to ensure that no other thread is 2574 * updating the zil (i.e. adding in-memory log records) and the 2575 * zd_zilog_lock to block any I/O. 2576 */ 2577 mutex_enter(&zd->zd_dirobj_lock); 2578 rw_enter(&zd->zd_zilog_lock, RW_WRITER); 2579 2580 /* zfsvfs_teardown() */ 2581 zil_close(zd->zd_zilog); 2582 2583 /* zfsvfs_setup() */ 2584 VERIFY(zil_open(os, ztest_get_data) == zd->zd_zilog); 2585 zil_replay(os, zd, ztest_replay_vector); 2586 2587 rw_exit(&zd->zd_zilog_lock); 2588 mutex_exit(&zd->zd_dirobj_lock); 2589 } 2590 2591 /* 2592 * Verify that we can't destroy an active pool, create an existing pool, 2593 * or create a pool with a bad vdev spec. 2594 */ 2595 /* ARGSUSED */ 2596 void 2597 ztest_spa_create_destroy(ztest_ds_t *zd, uint64_t id) 2598 { 2599 ztest_shared_opts_t *zo = &ztest_opts; 2600 spa_t *spa; 2601 nvlist_t *nvroot; 2602 2603 if (zo->zo_mmp_test) 2604 return; 2605 2606 /* 2607 * Attempt to create using a bad file. 2608 */ 2609 nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, NULL, 0, 0, 1); 2610 VERIFY3U(ENOENT, ==, 2611 spa_create("ztest_bad_file", nvroot, NULL, NULL, NULL)); 2612 nvlist_free(nvroot); 2613 2614 /* 2615 * Attempt to create using a bad mirror. 2616 */ 2617 nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, NULL, 0, 2, 1); 2618 VERIFY3U(ENOENT, ==, 2619 spa_create("ztest_bad_mirror", nvroot, NULL, NULL, NULL)); 2620 nvlist_free(nvroot); 2621 2622 /* 2623 * Attempt to create an existing pool. It shouldn't matter 2624 * what's in the nvroot; we should fail with EEXIST. 
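 * While the pool is held open below we also verify that an active pool
 * cannot be destroyed: spa_destroy() must return EBUSY.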
2625 */ 2626 rw_enter(&ztest_name_lock, RW_READER); 2627 nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, NULL, 0, 0, 1); 2628 VERIFY3U(EEXIST, ==, 2629 spa_create(zo->zo_pool, nvroot, NULL, NULL, NULL)); 2630 nvlist_free(nvroot); 2631 VERIFY3U(0, ==, spa_open(zo->zo_pool, &spa, FTAG)); 2632 VERIFY3U(EBUSY, ==, spa_destroy(zo->zo_pool)); 2633 spa_close(spa, FTAG); 2634 2635 rw_exit(&ztest_name_lock); 2636 } 2637 2638 /* 2639 * Start and then stop the MMP threads to ensure the startup and shutdown code 2640 * works properly. Actual protection and property-related code is tested via ZTS. 2641 */ 2642 /* ARGSUSED */ 2643 void 2644 ztest_mmp_enable_disable(ztest_ds_t *zd, uint64_t id) 2645 { 2646 ztest_shared_opts_t *zo = &ztest_opts; 2647 spa_t *spa = ztest_spa; 2648 2649 if (zo->zo_mmp_test) 2650 return; 2651 2652 /* 2653 * Since enabling MMP involves setting a property, it cannot be done 2654 * while the pool is suspended. 2655 */ 2656 if (spa_suspended(spa)) 2657 return; 2658 2659 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 2660 mutex_enter(&spa->spa_props_lock); 2661 2662 zfs_multihost_fail_intervals = 0; 2663 2664 if (!spa_multihost(spa)) { 2665 spa->spa_multihost = B_TRUE; 2666 mmp_thread_start(spa); 2667 } 2668 2669 mutex_exit(&spa->spa_props_lock); 2670 spa_config_exit(spa, SCL_CONFIG, FTAG); 2671 2672 txg_wait_synced(spa_get_dsl(spa), 0); 2673 mmp_signal_all_threads(); 2674 txg_wait_synced(spa_get_dsl(spa), 0); 2675 2676 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 2677 mutex_enter(&spa->spa_props_lock); 2678 2679 if (spa_multihost(spa)) { 2680 mmp_thread_stop(spa); 2681 spa->spa_multihost = B_FALSE; 2682 } 2683 2684 mutex_exit(&spa->spa_props_lock); 2685 spa_config_exit(spa, SCL_CONFIG, FTAG); 2686 } 2687 2688 /* ARGSUSED */ 2689 void 2690 ztest_spa_upgrade(ztest_ds_t *zd, uint64_t id) 2691 { 2692 spa_t *spa; 2693 uint64_t initial_version = SPA_VERSION_INITIAL; 2694 uint64_t version, newversion; 2695 nvlist_t *nvroot, *props; 2696 char *name; 2697 2698 if (ztest_opts.zo_mmp_test) 2699 return; 2700 2701 mutex_enter(&ztest_vdev_lock); 2702 name = kmem_asprintf("%s_upgrade", ztest_opts.zo_pool); 2703 2704 /* 2705 * Clean up from previous runs. 2706 */ 2707 (void) spa_destroy(name); 2708 2709 nvroot = make_vdev_root(NULL, NULL, name, ztest_opts.zo_vdev_size, 0, 2710 NULL, ztest_opts.zo_raidz, ztest_opts.zo_mirrors, 1); 2711 2712 /* 2713 * If we're configuring a RAIDZ device then make sure that 2714 * the initial version is capable of supporting that feature. 2715 */ 2716 switch (ztest_opts.zo_raidz_parity) { 2717 case 0: 2718 case 1: 2719 initial_version = SPA_VERSION_INITIAL; 2720 break; 2721 case 2: 2722 initial_version = SPA_VERSION_RAIDZ2; 2723 break; 2724 case 3: 2725 initial_version = SPA_VERSION_RAIDZ3; 2726 break; 2727 } 2728 2729 /* 2730 * Create a pool with a spa version that can be upgraded. Pick 2731 * a value between initial_version and SPA_VERSION_BEFORE_FEATURES.
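 * The do/while below simply re-rolls until the chosen version is no newer
 * than SPA_VERSION_BEFORE_FEATURES, so the pool always starts on a legacy
 * (pre-feature-flags) version.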
2732 */ 2733 do { 2734 version = ztest_random_spa_version(initial_version); 2735 } while (version > SPA_VERSION_BEFORE_FEATURES); 2736 2737 props = fnvlist_alloc(); 2738 fnvlist_add_uint64(props, 2739 zpool_prop_to_name(ZPOOL_PROP_VERSION), version); 2740 VERIFY0(spa_create(name, nvroot, props, NULL, NULL)); 2741 fnvlist_free(nvroot); 2742 fnvlist_free(props); 2743 2744 VERIFY0(spa_open(name, &spa, FTAG)); 2745 VERIFY3U(spa_version(spa), ==, version); 2746 newversion = ztest_random_spa_version(version + 1); 2747 2748 if (ztest_opts.zo_verbose >= 4) { 2749 (void) printf("upgrading spa version from %llu to %llu\n", 2750 (u_longlong_t)version, (u_longlong_t)newversion); 2751 } 2752 2753 spa_upgrade(spa, newversion); 2754 VERIFY3U(spa_version(spa), >, version); 2755 VERIFY3U(spa_version(spa), ==, fnvlist_lookup_uint64(spa->spa_config, 2756 zpool_prop_to_name(ZPOOL_PROP_VERSION))); 2757 spa_close(spa, FTAG); 2758 2759 strfree(name); 2760 mutex_exit(&ztest_vdev_lock); 2761 } 2762 2763 static void 2764 ztest_spa_checkpoint(spa_t *spa) 2765 { 2766 ASSERT(MUTEX_HELD(&ztest_checkpoint_lock)); 2767 2768 int error = spa_checkpoint(spa->spa_name); 2769 2770 switch (error) { 2771 case 0: 2772 case ZFS_ERR_DEVRM_IN_PROGRESS: 2773 case ZFS_ERR_DISCARDING_CHECKPOINT: 2774 case ZFS_ERR_CHECKPOINT_EXISTS: 2775 break; 2776 case ENOSPC: 2777 ztest_record_enospc(FTAG); 2778 break; 2779 default: 2780 fatal(0, "spa_checkpoint(%s) = %d", spa->spa_name, error); 2781 } 2782 } 2783 2784 static void 2785 ztest_spa_discard_checkpoint(spa_t *spa) 2786 { 2787 ASSERT(MUTEX_HELD(&ztest_checkpoint_lock)); 2788 2789 int error = spa_checkpoint_discard(spa->spa_name); 2790 2791 switch (error) { 2792 case 0: 2793 case ZFS_ERR_DISCARDING_CHECKPOINT: 2794 case ZFS_ERR_NO_CHECKPOINT: 2795 break; 2796 default: 2797 fatal(0, "spa_discard_checkpoint(%s) = %d", 2798 spa->spa_name, error); 2799 } 2800 2801 } 2802 2803 /* ARGSUSED */ 2804 void 2805 ztest_spa_checkpoint_create_discard(ztest_ds_t *zd, uint64_t id) 2806 { 2807 spa_t *spa = ztest_spa; 2808 2809 mutex_enter(&ztest_checkpoint_lock); 2810 if (ztest_random(2) == 0) { 2811 ztest_spa_checkpoint(spa); 2812 } else { 2813 ztest_spa_discard_checkpoint(spa); 2814 } 2815 mutex_exit(&ztest_checkpoint_lock); 2816 } 2817 2818 2819 static vdev_t * 2820 vdev_lookup_by_path(vdev_t *vd, const char *path) 2821 { 2822 vdev_t *mvd; 2823 2824 if (vd->vdev_path != NULL && strcmp(path, vd->vdev_path) == 0) 2825 return (vd); 2826 2827 for (int c = 0; c < vd->vdev_children; c++) 2828 if ((mvd = vdev_lookup_by_path(vd->vdev_child[c], path)) != 2829 NULL) 2830 return (mvd); 2831 2832 return (NULL); 2833 } 2834 2835 static int 2836 spa_num_top_vdevs(spa_t *spa) 2837 { 2838 vdev_t *rvd = spa->spa_root_vdev; 2839 ASSERT3U(spa_config_held(spa, SCL_VDEV, RW_READER), ==, SCL_VDEV); 2840 return (rvd->vdev_children); 2841 } 2842 2843 /* 2844 * Verify that vdev_add() works as expected. 2845 */ 2846 /* ARGSUSED */ 2847 void 2848 ztest_vdev_add_remove(ztest_ds_t *zd, uint64_t id) 2849 { 2850 ztest_shared_t *zs = ztest_shared; 2851 spa_t *spa = ztest_spa; 2852 uint64_t leaves; 2853 uint64_t guid; 2854 nvlist_t *nvroot; 2855 int error; 2856 2857 if (ztest_opts.zo_mmp_test) 2858 return; 2859 2860 mutex_enter(&ztest_vdev_lock); 2861 leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * ztest_opts.zo_raidz; 2862 2863 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 2864 2865 ztest_shared->zs_vdev_next_leaf = spa_num_top_vdevs(spa) * leaves; 2866 2867 /* 2868 * If we have slogs then remove them 1/4 of the time. 
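 * Otherwise fall through to the else clause and add a new top-level vdev,
 * which is itself made a log device 1/4 of the time.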
2869 */ 2870 if (spa_has_slogs(spa) && ztest_random(4) == 0) { 2871 metaslab_group_t *mg; 2872 2873 /* 2874 * find the first real slog in log allocation class 2875 */ 2876 mg = spa_log_class(spa)->mc_rotor; 2877 while (!mg->mg_vd->vdev_islog) 2878 mg = mg->mg_next; 2879 2880 guid = mg->mg_vd->vdev_guid; 2881 2882 spa_config_exit(spa, SCL_VDEV, FTAG); 2883 2884 /* 2885 * We have to grab the zs_name_lock as writer to 2886 * prevent a race between removing a slog (dmu_objset_find) 2887 * and destroying a dataset. Removing the slog will 2888 * grab a reference on the dataset which may cause 2889 * dmu_objset_destroy() to fail with EBUSY thus 2890 * leaving the dataset in an inconsistent state. 2891 */ 2892 rw_enter(&ztest_name_lock, RW_WRITER); 2893 error = spa_vdev_remove(spa, guid, B_FALSE); 2894 rw_exit(&ztest_name_lock); 2895 2896 switch (error) { 2897 case 0: 2898 case EEXIST: 2899 case ZFS_ERR_CHECKPOINT_EXISTS: 2900 case ZFS_ERR_DISCARDING_CHECKPOINT: 2901 break; 2902 default: 2903 fatal(0, "spa_vdev_remove() = %d", error); 2904 } 2905 } else { 2906 spa_config_exit(spa, SCL_VDEV, FTAG); 2907 2908 /* 2909 * Make 1/4 of the devices be log devices 2910 */ 2911 nvroot = make_vdev_root(NULL, NULL, NULL, 2912 ztest_opts.zo_vdev_size, 0, (ztest_random(4) == 0) ? 2913 "log" : NULL, ztest_opts.zo_raidz, zs->zs_mirrors, 1); 2914 2915 error = spa_vdev_add(spa, nvroot); 2916 nvlist_free(nvroot); 2917 2918 switch (error) { 2919 case 0: 2920 break; 2921 case ENOSPC: 2922 ztest_record_enospc("spa_vdev_add"); 2923 break; 2924 default: 2925 fatal(0, "spa_vdev_add() = %d", error); 2926 } 2927 } 2928 2929 mutex_exit(&ztest_vdev_lock); 2930 } 2931 2932 /* ARGSUSED */ 2933 void 2934 ztest_vdev_class_add(ztest_ds_t *zd, uint64_t id) 2935 { 2936 ztest_shared_t *zs = ztest_shared; 2937 spa_t *spa = ztest_spa; 2938 uint64_t leaves; 2939 nvlist_t *nvroot; 2940 const char *class = (ztest_random(2) == 0) ? 
2941 VDEV_ALLOC_BIAS_SPECIAL : VDEV_ALLOC_BIAS_DEDUP; 2942 int error; 2943 2944 /* 2945 * By default add a special vdev 50% of the time 2946 */ 2947 if ((ztest_opts.zo_special_vdevs == ZTEST_VDEV_CLASS_OFF) || 2948 (ztest_opts.zo_special_vdevs == ZTEST_VDEV_CLASS_RND && 2949 ztest_random(2) == 0)) { 2950 return; 2951 } 2952 2953 mutex_enter(&ztest_vdev_lock); 2954 2955 /* Only test with mirrors */ 2956 if (zs->zs_mirrors < 2) { 2957 mutex_exit(&ztest_vdev_lock); 2958 return; 2959 } 2960 2961 /* requires feature@allocation_classes */ 2962 if (!spa_feature_is_enabled(spa, SPA_FEATURE_ALLOCATION_CLASSES)) { 2963 mutex_exit(&ztest_vdev_lock); 2964 return; 2965 } 2966 2967 leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * ztest_opts.zo_raidz; 2968 2969 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 2970 ztest_shared->zs_vdev_next_leaf = spa_num_top_vdevs(spa) * leaves; 2971 spa_config_exit(spa, SCL_VDEV, FTAG); 2972 2973 nvroot = make_vdev_root(NULL, NULL, NULL, ztest_opts.zo_vdev_size, 0, 2974 class, ztest_opts.zo_raidz, zs->zs_mirrors, 1); 2975 2976 error = spa_vdev_add(spa, nvroot); 2977 nvlist_free(nvroot); 2978 2979 if (error == ENOSPC) 2980 ztest_record_enospc("spa_vdev_add"); 2981 else if (error != 0) 2982 fatal(0, "spa_vdev_add() = %d", error); 2983 2984 /* 2985 * 50% of the time allow small blocks in the special class 2986 */ 2987 if (error == 0 && 2988 spa_special_class(spa)->mc_groups == 1 && ztest_random(2) == 0) { 2989 if (ztest_opts.zo_verbose >= 3) 2990 (void) printf("Enabling special VDEV small blocks\n"); 2991 (void) ztest_dsl_prop_set_uint64(zd->zd_name, 2992 ZFS_PROP_SPECIAL_SMALL_BLOCKS, 32768, B_FALSE); 2993 } 2994 2995 mutex_exit(&ztest_vdev_lock); 2996 2997 if (ztest_opts.zo_verbose >= 3) { 2998 metaslab_class_t *mc; 2999 3000 if (strcmp(class, VDEV_ALLOC_BIAS_SPECIAL) == 0) 3001 mc = spa_special_class(spa); 3002 else 3003 mc = spa_dedup_class(spa); 3004 (void) printf("Added a %s mirrored vdev (of %d)\n", 3005 class, (int)mc->mc_groups); 3006 } 3007 } 3008 3009 /* 3010 * Verify that adding/removing aux devices (l2arc, hot spare) works as expected. 3011 */ 3012 /* ARGSUSED */ 3013 void 3014 ztest_vdev_aux_add_remove(ztest_ds_t *zd, uint64_t id) 3015 { 3016 ztest_shared_t *zs = ztest_shared; 3017 spa_t *spa = ztest_spa; 3018 vdev_t *rvd = spa->spa_root_vdev; 3019 spa_aux_vdev_t *sav; 3020 char *aux; 3021 uint64_t guid = 0; 3022 int error; 3023 3024 if (ztest_opts.zo_mmp_test) 3025 return; 3026 3027 if (ztest_random(2) == 0) { 3028 sav = &spa->spa_spares; 3029 aux = ZPOOL_CONFIG_SPARES; 3030 } else { 3031 sav = &spa->spa_l2cache; 3032 aux = ZPOOL_CONFIG_L2CACHE; 3033 } 3034 3035 mutex_enter(&ztest_vdev_lock); 3036 3037 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 3038 3039 if (sav->sav_count != 0 && ztest_random(4) == 0) { 3040 /* 3041 * Pick a random device to remove. 3042 */ 3043 guid = sav->sav_vdevs[ztest_random(sav->sav_count)]->vdev_guid; 3044 } else { 3045 /* 3046 * Find an unused device we can add. 
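 * Generate candidate paths from ztest_aux_template until we hit one that
 * is neither an existing spare/l2arc entry nor an in-use vdev in the
 * main pool.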
3047 */ 3048 zs->zs_vdev_aux = 0; 3049 for (;;) { 3050 char path[MAXPATHLEN]; 3051 int c; 3052 (void) snprintf(path, sizeof (path), ztest_aux_template, 3053 ztest_opts.zo_dir, ztest_opts.zo_pool, aux, 3054 zs->zs_vdev_aux); 3055 for (c = 0; c < sav->sav_count; c++) 3056 if (strcmp(sav->sav_vdevs[c]->vdev_path, 3057 path) == 0) 3058 break; 3059 if (c == sav->sav_count && 3060 vdev_lookup_by_path(rvd, path) == NULL) 3061 break; 3062 zs->zs_vdev_aux++; 3063 } 3064 } 3065 3066 spa_config_exit(spa, SCL_VDEV, FTAG); 3067 3068 if (guid == 0) { 3069 /* 3070 * Add a new device. 3071 */ 3072 nvlist_t *nvroot = make_vdev_root(NULL, aux, NULL, 3073 (ztest_opts.zo_vdev_size * 5) / 4, 0, NULL, 0, 0, 1); 3074 error = spa_vdev_add(spa, nvroot); 3075 3076 switch (error) { 3077 case 0: 3078 break; 3079 default: 3080 fatal(0, "spa_vdev_add(%p) = %d", nvroot, error); 3081 } 3082 nvlist_free(nvroot); 3083 } else { 3084 /* 3085 * Remove an existing device. Sometimes, dirty its 3086 * vdev state first to make sure we handle removal 3087 * of devices that have pending state changes. 3088 */ 3089 if (ztest_random(2) == 0) 3090 (void) vdev_online(spa, guid, 0, NULL); 3091 3092 error = spa_vdev_remove(spa, guid, B_FALSE); 3093 3094 switch (error) { 3095 case 0: 3096 case EBUSY: 3097 case ZFS_ERR_CHECKPOINT_EXISTS: 3098 case ZFS_ERR_DISCARDING_CHECKPOINT: 3099 break; 3100 default: 3101 fatal(0, "spa_vdev_remove(%llu) = %d", guid, error); 3102 } 3103 } 3104 3105 mutex_exit(&ztest_vdev_lock); 3106 } 3107 3108 /* 3109 * Split a pool if it has mirror top-level vdevs. 3110 */ 3111 /* ARGSUSED */ 3112 void 3113 ztest_split_pool(ztest_ds_t *zd, uint64_t id) 3114 { 3115 ztest_shared_t *zs = ztest_shared; 3116 spa_t *spa = ztest_spa; 3117 vdev_t *rvd = spa->spa_root_vdev; 3118 nvlist_t *tree, **child, *config, *split, **schild; 3119 uint_t c, children, schildren = 0, lastlogid = 0; 3120 int error = 0; 3121 3122 if (ztest_opts.zo_mmp_test) 3123 return; 3124 3125 mutex_enter(&ztest_vdev_lock); 3126 3127 /* ensure we have a usable config; mirrors of raidz aren't supported */ 3128 if (zs->zs_mirrors < 3 || ztest_opts.zo_raidz > 1) { 3129 mutex_exit(&ztest_vdev_lock); 3130 return; 3131 } 3132 3133 /* clean up the old pool, if any */ 3134 (void) spa_destroy("splitp"); 3135 3136 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 3137 3138 /* generate a config from the existing config */ 3139 mutex_enter(&spa->spa_props_lock); 3140 VERIFY(nvlist_lookup_nvlist(spa->spa_config, ZPOOL_CONFIG_VDEV_TREE, 3141 &tree) == 0); 3142 mutex_exit(&spa->spa_props_lock); 3143 3144 VERIFY(nvlist_lookup_nvlist_array(tree, ZPOOL_CONFIG_CHILDREN, &child, 3145 &children) == 0); 3146 3147 schild = malloc(rvd->vdev_children * sizeof (nvlist_t *)); 3148 for (c = 0; c < children; c++) { 3149 vdev_t *tvd = rvd->vdev_child[c]; 3150 nvlist_t **mchild; 3151 uint_t mchildren; 3152 3153 if (tvd->vdev_islog || tvd->vdev_ops == &vdev_hole_ops) { 3154 VERIFY(nvlist_alloc(&schild[schildren], NV_UNIQUE_NAME, 3155 0) == 0); 3156 VERIFY(nvlist_add_string(schild[schildren], 3157 ZPOOL_CONFIG_TYPE, VDEV_TYPE_HOLE) == 0); 3158 VERIFY(nvlist_add_uint64(schild[schildren], 3159 ZPOOL_CONFIG_IS_HOLE, 1) == 0); 3160 if (lastlogid == 0) 3161 lastlogid = schildren; 3162 ++schildren; 3163 continue; 3164 } 3165 lastlogid = 0; 3166 VERIFY(nvlist_lookup_nvlist_array(child[c], 3167 ZPOOL_CONFIG_CHILDREN, &mchild, &mchildren) == 0); 3168 VERIFY(nvlist_dup(mchild[0], &schild[schildren++], 0) == 0); 3169 } 3170 3171 /* OK, create a config that can be used to split */ 3172
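/*
 * The split config is a VDEV_TYPE_ROOT nvlist whose children are the
 * single leaves collected above, with holes standing in for any log
 * vdevs; it is handed to spa_vdev_split_mirror() below.
 */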
VERIFY(nvlist_alloc(&split, NV_UNIQUE_NAME, 0) == 0); 3173 VERIFY(nvlist_add_string(split, ZPOOL_CONFIG_TYPE, 3174 VDEV_TYPE_ROOT) == 0); 3175 VERIFY(nvlist_add_nvlist_array(split, ZPOOL_CONFIG_CHILDREN, schild, 3176 lastlogid != 0 ? lastlogid : schildren) == 0); 3177 3178 VERIFY(nvlist_alloc(&config, NV_UNIQUE_NAME, 0) == 0); 3179 VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, split) == 0); 3180 3181 for (c = 0; c < schildren; c++) 3182 nvlist_free(schild[c]); 3183 free(schild); 3184 nvlist_free(split); 3185 3186 spa_config_exit(spa, SCL_VDEV, FTAG); 3187 3188 rw_enter(&ztest_name_lock, RW_WRITER); 3189 error = spa_vdev_split_mirror(spa, "splitp", config, NULL, B_FALSE); 3190 rw_exit(&ztest_name_lock); 3191 3192 nvlist_free(config); 3193 3194 if (error == 0) { 3195 (void) printf("successful split - results:\n"); 3196 mutex_enter(&spa_namespace_lock); 3197 show_pool_stats(spa); 3198 show_pool_stats(spa_lookup("splitp")); 3199 mutex_exit(&spa_namespace_lock); 3200 ++zs->zs_splits; 3201 --zs->zs_mirrors; 3202 } 3203 mutex_exit(&ztest_vdev_lock); 3204 } 3205 3206 /* 3207 * Verify that we can attach and detach devices. 3208 */ 3209 /* ARGSUSED */ 3210 void 3211 ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id) 3212 { 3213 ztest_shared_t *zs = ztest_shared; 3214 spa_t *spa = ztest_spa; 3215 spa_aux_vdev_t *sav = &spa->spa_spares; 3216 vdev_t *rvd = spa->spa_root_vdev; 3217 vdev_t *oldvd, *newvd, *pvd; 3218 nvlist_t *root; 3219 uint64_t leaves; 3220 uint64_t leaf, top; 3221 uint64_t ashift = ztest_get_ashift(); 3222 uint64_t oldguid, pguid; 3223 uint64_t oldsize, newsize; 3224 char oldpath[MAXPATHLEN], newpath[MAXPATHLEN]; 3225 int replacing; 3226 int oldvd_has_siblings = B_FALSE; 3227 int newvd_is_spare = B_FALSE; 3228 int oldvd_is_log; 3229 int error, expected_error; 3230 3231 if (ztest_opts.zo_mmp_test) 3232 return; 3233 3234 mutex_enter(&ztest_vdev_lock); 3235 leaves = MAX(zs->zs_mirrors, 1) * ztest_opts.zo_raidz; 3236 3237 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3238 3239 /* 3240 * If a vdev is in the process of being removed, its removal may 3241 * finish while we are in progress, leading to an unexpected error 3242 * value. Don't bother trying to attach while we are in the middle 3243 * of removal. 3244 */ 3245 if (ztest_device_removal_active) { 3246 spa_config_exit(spa, SCL_ALL, FTAG); 3247 mutex_exit(&ztest_vdev_lock); 3248 return; 3249 } 3250 3251 /* 3252 * Decide whether to do an attach or a replace. 3253 */ 3254 replacing = ztest_random(2); 3255 3256 /* 3257 * Pick a random top-level vdev. 3258 */ 3259 top = ztest_random_vdev_top(spa, B_TRUE); 3260 3261 /* 3262 * Pick a random leaf within it. 3263 */ 3264 leaf = ztest_random(leaves); 3265 3266 /* 3267 * Locate this vdev. 3268 */ 3269 oldvd = rvd->vdev_child[top]; 3270 3271 /* pick a child from the mirror */ 3272 if (zs->zs_mirrors >= 1) { 3273 ASSERT(oldvd->vdev_ops == &vdev_mirror_ops); 3274 ASSERT(oldvd->vdev_children >= zs->zs_mirrors); 3275 oldvd = oldvd->vdev_child[leaf / ztest_opts.zo_raidz]; 3276 } 3277 3278 /* pick a child out of the raidz group */ 3279 if (ztest_opts.zo_raidz > 1) { 3280 ASSERT(oldvd->vdev_ops == &vdev_raidz_ops); 3281 ASSERT(oldvd->vdev_children == ztest_opts.zo_raidz); 3282 oldvd = oldvd->vdev_child[leaf % ztest_opts.zo_raidz]; 3283 } 3284 3285 /* 3286 * If we're already doing an attach or replace, oldvd may be a 3287 * mirror vdev -- in which case, pick a random child. 
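 * Keep descending until we reach a leaf, remembering that oldvd had
 * siblings; that makes it a candidate for detach below.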
3288 */ 3289 while (oldvd->vdev_children != 0) { 3290 oldvd_has_siblings = B_TRUE; 3291 ASSERT(oldvd->vdev_children >= 2); 3292 oldvd = oldvd->vdev_child[ztest_random(oldvd->vdev_children)]; 3293 } 3294 3295 oldguid = oldvd->vdev_guid; 3296 oldsize = vdev_get_min_asize(oldvd); 3297 oldvd_is_log = oldvd->vdev_top->vdev_islog; 3298 (void) strcpy(oldpath, oldvd->vdev_path); 3299 pvd = oldvd->vdev_parent; 3300 pguid = pvd->vdev_guid; 3301 3302 /* 3303 * If oldvd has siblings, then half of the time, detach it. 3304 */ 3305 if (oldvd_has_siblings && ztest_random(2) == 0) { 3306 spa_config_exit(spa, SCL_ALL, FTAG); 3307 error = spa_vdev_detach(spa, oldguid, pguid, B_FALSE); 3308 if (error != 0 && error != ENODEV && error != EBUSY && 3309 error != ENOTSUP && error != ZFS_ERR_CHECKPOINT_EXISTS && 3310 error != ZFS_ERR_DISCARDING_CHECKPOINT) 3311 fatal(0, "detach (%s) returned %d", oldpath, error); 3312 mutex_exit(&ztest_vdev_lock); 3313 return; 3314 } 3315 3316 /* 3317 * For the new vdev, choose with equal probability between the two 3318 * standard paths (ending in either 'a' or 'b') or a random hot spare. 3319 */ 3320 if (sav->sav_count != 0 && ztest_random(3) == 0) { 3321 newvd = sav->sav_vdevs[ztest_random(sav->sav_count)]; 3322 newvd_is_spare = B_TRUE; 3323 (void) strcpy(newpath, newvd->vdev_path); 3324 } else { 3325 (void) snprintf(newpath, sizeof (newpath), ztest_dev_template, 3326 ztest_opts.zo_dir, ztest_opts.zo_pool, 3327 top * leaves + leaf); 3328 if (ztest_random(2) == 0) 3329 newpath[strlen(newpath) - 1] = 'b'; 3330 newvd = vdev_lookup_by_path(rvd, newpath); 3331 } 3332 3333 if (newvd) { 3334 /* 3335 * Reopen to ensure the vdev's asize field isn't stale. 3336 */ 3337 vdev_reopen(newvd); 3338 newsize = vdev_get_min_asize(newvd); 3339 } else { 3340 /* 3341 * Make newsize a little bigger or smaller than oldsize. 3342 * If it's smaller, the attach should fail. 3343 * If it's larger, and we're doing a replace, 3344 * we should get dynamic LUN growth when we're done. 3345 */ 3346 newsize = 10 * oldsize / (9 + ztest_random(3)); 3347 } 3348 3349 /* 3350 * If pvd is not a mirror or root, the attach should fail with ENOTSUP, 3351 * unless it's a replace; in that case any non-replacing parent is OK. 3352 * 3353 * If newvd is already part of the pool, it should fail with EBUSY. 3354 * 3355 * If newvd is too small, it should fail with EOVERFLOW. 3356 */ 3357 if (pvd->vdev_ops != &vdev_mirror_ops && 3358 pvd->vdev_ops != &vdev_root_ops && (!replacing || 3359 pvd->vdev_ops == &vdev_replacing_ops || 3360 pvd->vdev_ops == &vdev_spare_ops)) 3361 expected_error = ENOTSUP; 3362 else if (newvd_is_spare && (!replacing || oldvd_is_log)) 3363 expected_error = ENOTSUP; 3364 else if (newvd == oldvd) 3365 expected_error = replacing ? 0 : EBUSY; 3366 else if (vdev_lookup_by_path(rvd, newpath) != NULL) 3367 expected_error = EBUSY; 3368 else if (newsize < oldsize) 3369 expected_error = EOVERFLOW; 3370 else if (ashift > oldvd->vdev_top->vdev_ashift) 3371 expected_error = EDOM; 3372 else 3373 expected_error = 0; 3374 3375 spa_config_exit(spa, SCL_ALL, FTAG); 3376 3377 /* 3378 * Build the nvlist describing newpath. 3379 */ 3380 root = make_vdev_root(newpath, NULL, NULL, newvd == NULL ? newsize : 0, 3381 ashift, NULL, 0, 0, 1); 3382 3383 error = spa_vdev_attach(spa, oldguid, root, replacing); 3384 3385 nvlist_free(root); 3386 3387 /* 3388 * If our parent was the replacing vdev, but the replace completed, 3389 * then instead of failing with ENOTSUP we may either succeed, 3390 * fail with ENODEV, or fail with EOVERFLOW. 
3391 */ 3392 if (expected_error == ENOTSUP && 3393 (error == 0 || error == ENODEV || error == EOVERFLOW)) 3394 expected_error = error; 3395 3396 /* 3397 * If someone grew the LUN, the replacement may be too small. 3398 */ 3399 if (error == EOVERFLOW || error == EBUSY) 3400 expected_error = error; 3401 3402 if (error == ZFS_ERR_CHECKPOINT_EXISTS || 3403 error == ZFS_ERR_DISCARDING_CHECKPOINT) 3404 expected_error = error; 3405 3406 /* XXX workaround 6690467 */ 3407 if (error != expected_error && expected_error != EBUSY) { 3408 fatal(0, "attach (%s %llu, %s %llu, %d) " 3409 "returned %d, expected %d", 3410 oldpath, oldsize, newpath, 3411 newsize, replacing, error, expected_error); 3412 } 3413 3414 mutex_exit(&ztest_vdev_lock); 3415 } 3416 3417 /* ARGSUSED */ 3418 void 3419 ztest_device_removal(ztest_ds_t *zd, uint64_t id) 3420 { 3421 spa_t *spa = ztest_spa; 3422 vdev_t *vd; 3423 uint64_t guid; 3424 int error; 3425 3426 mutex_enter(&ztest_vdev_lock); 3427 3428 if (ztest_device_removal_active) { 3429 mutex_exit(&ztest_vdev_lock); 3430 return; 3431 } 3432 3433 /* 3434 * Remove a random top-level vdev and wait for removal to finish. 3435 */ 3436 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 3437 vd = vdev_lookup_top(spa, ztest_random_vdev_top(spa, B_FALSE)); 3438 guid = vd->vdev_guid; 3439 spa_config_exit(spa, SCL_VDEV, FTAG); 3440 3441 error = spa_vdev_remove(spa, guid, B_FALSE); 3442 if (error == 0) { 3443 ztest_device_removal_active = B_TRUE; 3444 mutex_exit(&ztest_vdev_lock); 3445 3446 while (spa->spa_vdev_removal != NULL) 3447 txg_wait_synced(spa_get_dsl(spa), 0); 3448 } else { 3449 mutex_exit(&ztest_vdev_lock); 3450 return; 3451 } 3452 3453 /* 3454 * The pool needs to be scrubbed after completing device removal. 3455 * Failure to do so may result in checksum errors due to the 3456 * strategy employed by ztest_fault_inject() when selecting which 3457 * offsets are redundant and can be damaged. 3458 */ 3459 error = spa_scan(spa, POOL_SCAN_SCRUB); 3460 if (error == 0) { 3461 while (dsl_scan_scrubbing(spa_get_dsl(spa))) 3462 txg_wait_synced(spa_get_dsl(spa), 0); 3463 } 3464 3465 mutex_enter(&ztest_vdev_lock); 3466 ztest_device_removal_active = B_FALSE; 3467 mutex_exit(&ztest_vdev_lock); 3468 } 3469 3470 /* 3471 * Callback function which expands the physical size of the vdev. 3472 */ 3473 vdev_t * 3474 grow_vdev(vdev_t *vd, void *arg) 3475 { 3476 spa_t *spa = vd->vdev_spa; 3477 size_t *newsize = arg; 3478 size_t fsize; 3479 int fd; 3480 3481 ASSERT(spa_config_held(spa, SCL_STATE, RW_READER) == SCL_STATE); 3482 ASSERT(vd->vdev_ops->vdev_op_leaf); 3483 3484 if ((fd = open(vd->vdev_path, O_RDWR)) == -1) 3485 return (vd); 3486 3487 fsize = lseek(fd, 0, SEEK_END); 3488 (void) ftruncate(fd, *newsize); 3489 3490 if (ztest_opts.zo_verbose >= 6) { 3491 (void) printf("%s grew from %lu to %lu bytes\n", 3492 vd->vdev_path, (ulong_t)fsize, (ulong_t)*newsize); 3493 } 3494 (void) close(fd); 3495 return (NULL); 3496 } 3497 3498 /* 3499 * Callback function which expands a given vdev by calling vdev_online().
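 * Like grow_vdev() above, this returns NULL on success; returning the
 * vdev instead aborts the vdev_walk_tree() traversal and tells the caller
 * that the expansion could not be completed.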
3500 */ 3501 /* ARGSUSED */ 3502 vdev_t * 3503 online_vdev(vdev_t *vd, void *arg) 3504 { 3505 spa_t *spa = vd->vdev_spa; 3506 vdev_t *tvd = vd->vdev_top; 3507 uint64_t guid = vd->vdev_guid; 3508 uint64_t generation = spa->spa_config_generation + 1; 3509 vdev_state_t newstate = VDEV_STATE_UNKNOWN; 3510 int error; 3511 3512 ASSERT(spa_config_held(spa, SCL_STATE, RW_READER) == SCL_STATE); 3513 ASSERT(vd->vdev_ops->vdev_op_leaf); 3514 3515 /* Calling vdev_online will initialize the new metaslabs */ 3516 spa_config_exit(spa, SCL_STATE, spa); 3517 error = vdev_online(spa, guid, ZFS_ONLINE_EXPAND, &newstate); 3518 spa_config_enter(spa, SCL_STATE, spa, RW_READER); 3519 3520 /* 3521 * If vdev_online returned an error or the underlying vdev_open 3522 * failed then we abort the expand. The only way to know that 3523 * vdev_open fails is by checking the returned newstate. 3524 */ 3525 if (error || newstate != VDEV_STATE_HEALTHY) { 3526 if (ztest_opts.zo_verbose >= 5) { 3527 (void) printf("Unable to expand vdev, state %llu, " 3528 "error %d\n", (u_longlong_t)newstate, error); 3529 } 3530 return (vd); 3531 } 3532 ASSERT3U(newstate, ==, VDEV_STATE_HEALTHY); 3533 3534 /* 3535 * Since we dropped the lock we need to ensure that we're 3536 * still talking to the original vdev. It's possible this 3537 * vdev may have been detached/replaced while we were 3538 * trying to online it. 3539 */ 3540 if (generation != spa->spa_config_generation) { 3541 if (ztest_opts.zo_verbose >= 5) { 3542 (void) printf("vdev configuration has changed, " 3543 "guid %llu, state %llu, expected gen %llu, " 3544 "got gen %llu\n", 3545 (u_longlong_t)guid, 3546 (u_longlong_t)tvd->vdev_state, 3547 (u_longlong_t)generation, 3548 (u_longlong_t)spa->spa_config_generation); 3549 } 3550 return (vd); 3551 } 3552 return (NULL); 3553 } 3554 3555 /* 3556 * Traverse the vdev tree calling the supplied function. 3557 * We continue to walk the tree until we either have walked all 3558 * children or we receive a non-NULL return from the callback. 3559 * If a NULL callback is passed, then we just return back the first 3560 * leaf vdev we encounter. 3561 */ 3562 vdev_t * 3563 vdev_walk_tree(vdev_t *vd, vdev_t *(*func)(vdev_t *, void *), void *arg) 3564 { 3565 if (vd->vdev_ops->vdev_op_leaf) { 3566 if (func == NULL) 3567 return (vd); 3568 else 3569 return (func(vd, arg)); 3570 } 3571 3572 for (uint_t c = 0; c < vd->vdev_children; c++) { 3573 vdev_t *cvd = vd->vdev_child[c]; 3574 if ((cvd = vdev_walk_tree(cvd, func, arg)) != NULL) 3575 return (cvd); 3576 } 3577 return (NULL); 3578 } 3579 3580 /* 3581 * Verify that dynamic LUN growth works as expected. 3582 */ 3583 /* ARGSUSED */ 3584 void 3585 ztest_vdev_LUN_growth(ztest_ds_t *zd, uint64_t id) 3586 { 3587 spa_t *spa = ztest_spa; 3588 vdev_t *vd, *tvd; 3589 metaslab_class_t *mc; 3590 metaslab_group_t *mg; 3591 size_t psize, newsize; 3592 uint64_t top; 3593 uint64_t old_class_space, new_class_space, old_ms_count, new_ms_count; 3594 3595 mutex_enter(&ztest_checkpoint_lock); 3596 mutex_enter(&ztest_vdev_lock); 3597 spa_config_enter(spa, SCL_STATE, spa, RW_READER); 3598 3599 /* 3600 * If there is a vdev removal in progress, it could complete while 3601 * we are running, in which case we would not be able to verify 3602 * that the metaslab_class space increased (because it decreases 3603 * when the device removal completes). 
3604 */ 3605 if (ztest_device_removal_active) { 3606 spa_config_exit(spa, SCL_STATE, spa); 3607 mutex_exit(&ztest_vdev_lock); 3608 mutex_exit(&ztest_checkpoint_lock); 3609 return; 3610 } 3611 3612 top = ztest_random_vdev_top(spa, B_TRUE); 3613 3614 tvd = spa->spa_root_vdev->vdev_child[top]; 3615 mg = tvd->vdev_mg; 3616 mc = mg->mg_class; 3617 old_ms_count = tvd->vdev_ms_count; 3618 old_class_space = metaslab_class_get_space(mc); 3619 3620 /* 3621 * Determine the size of the first leaf vdev associated with 3622 * our top-level device. 3623 */ 3624 vd = vdev_walk_tree(tvd, NULL, NULL); 3625 ASSERT3P(vd, !=, NULL); 3626 ASSERT(vd->vdev_ops->vdev_op_leaf); 3627 3628 psize = vd->vdev_psize; 3629 3630 /* 3631 * We only try to expand the vdev if it's healthy, less than 4x its 3632 * original size, and it has a valid psize. 3633 */ 3634 if (tvd->vdev_state != VDEV_STATE_HEALTHY || 3635 psize == 0 || psize >= 4 * ztest_opts.zo_vdev_size) { 3636 spa_config_exit(spa, SCL_STATE, spa); 3637 mutex_exit(&ztest_vdev_lock); 3638 mutex_exit(&ztest_checkpoint_lock); 3639 return; 3640 } 3641 ASSERT(psize > 0); 3642 newsize = psize + MAX(psize / 8, SPA_MAXBLOCKSIZE); 3643 ASSERT3U(newsize, >, psize); 3644 3645 if (ztest_opts.zo_verbose >= 6) { 3646 (void) printf("Expanding LUN %s from %lu to %lu\n", 3647 vd->vdev_path, (ulong_t)psize, (ulong_t)newsize); 3648 } 3649 3650 /* 3651 * Growing the vdev is a two step process: 3652 * 1). expand the physical size (i.e. relabel) 3653 * 2). online the vdev to create the new metaslabs 3654 */ 3655 if (vdev_walk_tree(tvd, grow_vdev, &newsize) != NULL || 3656 vdev_walk_tree(tvd, online_vdev, NULL) != NULL || 3657 tvd->vdev_state != VDEV_STATE_HEALTHY) { 3658 if (ztest_opts.zo_verbose >= 5) { 3659 (void) printf("Could not expand LUN because " 3660 "the vdev configuration changed.\n"); 3661 } 3662 spa_config_exit(spa, SCL_STATE, spa); 3663 mutex_exit(&ztest_vdev_lock); 3664 mutex_exit(&ztest_checkpoint_lock); 3665 return; 3666 } 3667 3668 spa_config_exit(spa, SCL_STATE, spa); 3669 3670 /* 3671 * Expanding the LUN will update the config asynchronously, 3672 * thus we must wait for the async thread to complete any 3673 * pending tasks before proceeding. 3674 */ 3675 for (;;) { 3676 boolean_t done; 3677 mutex_enter(&spa->spa_async_lock); 3678 done = (spa->spa_async_thread == NULL && !spa->spa_async_tasks); 3679 mutex_exit(&spa->spa_async_lock); 3680 if (done) 3681 break; 3682 txg_wait_synced(spa_get_dsl(spa), 0); 3683 (void) poll(NULL, 0, 100); 3684 } 3685 3686 spa_config_enter(spa, SCL_STATE, spa, RW_READER); 3687 3688 tvd = spa->spa_root_vdev->vdev_child[top]; 3689 new_ms_count = tvd->vdev_ms_count; 3690 new_class_space = metaslab_class_get_space(mc); 3691 3692 if (tvd->vdev_mg != mg || mg->mg_class != mc) { 3693 if (ztest_opts.zo_verbose >= 5) { 3694 (void) printf("Could not verify LUN expansion due to " 3695 "intervening vdev offline or remove.\n"); 3696 } 3697 spa_config_exit(spa, SCL_STATE, spa); 3698 mutex_exit(&ztest_vdev_lock); 3699 mutex_exit(&ztest_checkpoint_lock); 3700 return; 3701 } 3702 3703 /* 3704 * Make sure we were able to grow the vdev. 3705 */ 3706 if (new_ms_count <= old_ms_count) { 3707 fatal(0, "LUN expansion failed: ms_count %llu < %llu\n", 3708 old_ms_count, new_ms_count); 3709 } 3710 3711 /* 3712 * Make sure we were able to grow the pool. 
3713 */ 3714 if (new_class_space <= old_class_space) { 3715 fatal(0, "LUN expansion failed: class_space %llu < %llu\n", 3716 old_class_space, new_class_space); 3717 } 3718 3719 if (ztest_opts.zo_verbose >= 5) { 3720 char oldnumbuf[NN_NUMBUF_SZ], newnumbuf[NN_NUMBUF_SZ]; 3721 3722 nicenum(old_class_space, oldnumbuf, sizeof (oldnumbuf)); 3723 nicenum(new_class_space, newnumbuf, sizeof (newnumbuf)); 3724 (void) printf("%s grew from %s to %s\n", 3725 spa->spa_name, oldnumbuf, newnumbuf); 3726 } 3727 3728 spa_config_exit(spa, SCL_STATE, spa); 3729 mutex_exit(&ztest_vdev_lock); 3730 mutex_exit(&ztest_checkpoint_lock); 3731 } 3732 3733 /* 3734 * Verify that dmu_objset_{create,destroy,open,close} work as expected. 3735 */ 3736 /* ARGSUSED */ 3737 static void 3738 ztest_objset_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx) 3739 { 3740 /* 3741 * Create the objects common to all ztest datasets. 3742 */ 3743 VERIFY(zap_create_claim(os, ZTEST_DIROBJ, 3744 DMU_OT_ZAP_OTHER, DMU_OT_NONE, 0, tx) == 0); 3745 } 3746 3747 static int 3748 ztest_dataset_create(char *dsname) 3749 { 3750 int err; 3751 uint64_t rand; 3752 dsl_crypto_params_t *dcp = NULL; 3753 3754 /* 3755 * 50% of the time, we create encrypted datasets 3756 * using a random cipher suite and a hard-coded 3757 * wrapping key. 3758 */ 3759 #ifdef WITHCRYPTO 3760 /* 3761 * Until the crypto framework is compiled in userland, the ztest using 3762 * crypto will not work. 3763 */ 3764 rand = ztest_random(2); 3765 #else 3766 rand = 0; 3767 #endif 3768 if (rand != 0) { 3769 nvlist_t *crypto_args = fnvlist_alloc(); 3770 nvlist_t *props = fnvlist_alloc(); 3771 3772 /* slight bias towards the default cipher suite */ 3773 rand = ztest_random(ZIO_CRYPT_FUNCTIONS); 3774 if (rand < ZIO_CRYPT_AES_128_CCM) 3775 rand = ZIO_CRYPT_ON; 3776 3777 fnvlist_add_uint64(props, 3778 zfs_prop_to_name(ZFS_PROP_ENCRYPTION), rand); 3779 fnvlist_add_uint8_array(crypto_args, "wkeydata", 3780 (uint8_t *)ztest_wkeydata, WRAPPING_KEY_LEN); 3781 3782 /* 3783 * These parameters aren't really used by the kernel. They 3784 * are simply stored so that userspace knows how to load 3785 * the wrapping key. 3786 */ 3787 fnvlist_add_uint64(props, 3788 zfs_prop_to_name(ZFS_PROP_KEYFORMAT), ZFS_KEYFORMAT_RAW); 3789 fnvlist_add_string(props, 3790 zfs_prop_to_name(ZFS_PROP_KEYLOCATION), "prompt"); 3791 fnvlist_add_uint64(props, 3792 zfs_prop_to_name(ZFS_PROP_PBKDF2_SALT), 0ULL); 3793 fnvlist_add_uint64(props, 3794 zfs_prop_to_name(ZFS_PROP_PBKDF2_ITERS), 0ULL); 3795 3796 VERIFY0(dsl_crypto_params_create_nvlist(DCP_CMD_NONE, props, 3797 crypto_args, &dcp)); 3798 3799 fnvlist_free(crypto_args); 3800 fnvlist_free(props); 3801 } 3802 3803 err = dmu_objset_create(dsname, DMU_OST_OTHER, 0, dcp, 3804 ztest_objset_create_cb, NULL); 3805 dsl_crypto_params_free(dcp, !!err); 3806 3807 rand = ztest_random(100); 3808 if (err || rand < 80) 3809 return (err); 3810 3811 if (ztest_opts.zo_verbose >= 6) 3812 (void) printf("Setting dataset %s to sync always\n", dsname); 3813 return (ztest_dsl_prop_set_uint64(dsname, ZFS_PROP_SYNC, 3814 ZFS_SYNC_ALWAYS, B_FALSE)); 3815 } 3816 3817 /* ARGSUSED */ 3818 static int 3819 ztest_objset_destroy_cb(const char *name, void *arg) 3820 { 3821 objset_t *os; 3822 dmu_object_info_t doi; 3823 int error; 3824 3825 /* 3826 * Verify that the dataset contains a directory object. 
3827 */ 3828 VERIFY0(ztest_dmu_objset_own(name, DMU_OST_OTHER, B_TRUE, 3829 B_TRUE, FTAG, &os)); 3830 error = dmu_object_info(os, ZTEST_DIROBJ, &doi); 3831 if (error != ENOENT) { 3832 /* We could have crashed in the middle of destroying it */ 3833 ASSERT0(error); 3834 ASSERT3U(doi.doi_type, ==, DMU_OT_ZAP_OTHER); 3835 ASSERT3S(doi.doi_physical_blocks_512, >=, 0); 3836 } 3837 dmu_objset_disown(os, B_TRUE, FTAG); 3838 3839 /* 3840 * Destroy the dataset. 3841 */ 3842 if (strchr(name, '@') != NULL) { 3843 VERIFY0(dsl_destroy_snapshot(name, B_TRUE)); 3844 } else { 3845 error = dsl_destroy_head(name); 3846 /* There could be a hold on this dataset */ 3847 if (error != EBUSY) 3848 ASSERT0(error); 3849 } 3850 return (0); 3851 } 3852 3853 static boolean_t 3854 ztest_snapshot_create(char *osname, uint64_t id) 3855 { 3856 char snapname[ZFS_MAX_DATASET_NAME_LEN]; 3857 int error; 3858 3859 (void) snprintf(snapname, sizeof (snapname), "%llu", (u_longlong_t)id); 3860 3861 error = dmu_objset_snapshot_one(osname, snapname); 3862 if (error == ENOSPC) { 3863 ztest_record_enospc(FTAG); 3864 return (B_FALSE); 3865 } 3866 if (error != 0 && error != EEXIST) { 3867 fatal(0, "ztest_snapshot_create(%s@%s) = %d", osname, 3868 snapname, error); 3869 } 3870 return (B_TRUE); 3871 } 3872 3873 static boolean_t 3874 ztest_snapshot_destroy(char *osname, uint64_t id) 3875 { 3876 char snapname[ZFS_MAX_DATASET_NAME_LEN]; 3877 int error; 3878 3879 (void) snprintf(snapname, sizeof (snapname), "%s@%llu", osname, 3880 (u_longlong_t)id); 3881 3882 error = dsl_destroy_snapshot(snapname, B_FALSE); 3883 if (error != 0 && error != ENOENT) 3884 fatal(0, "ztest_snapshot_destroy(%s) = %d", snapname, error); 3885 return (B_TRUE); 3886 } 3887 3888 /* ARGSUSED */ 3889 void 3890 ztest_dmu_objset_create_destroy(ztest_ds_t *zd, uint64_t id) 3891 { 3892 ztest_ds_t zdtmp; 3893 int iters; 3894 int error; 3895 objset_t *os, *os2; 3896 char name[ZFS_MAX_DATASET_NAME_LEN]; 3897 zilog_t *zilog; 3898 3899 rw_enter(&ztest_name_lock, RW_READER); 3900 3901 (void) snprintf(name, sizeof (name), "%s/temp_%llu", 3902 ztest_opts.zo_pool, (u_longlong_t)id); 3903 3904 /* 3905 * If this dataset exists from a previous run, process its replay log 3906 * half of the time. If we don't replay it, then dmu_objset_destroy() 3907 * (invoked from ztest_objset_destroy_cb()) should just throw it away. 3908 */ 3909 if (ztest_random(2) == 0 && 3910 ztest_dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, 3911 B_TRUE, FTAG, &os) == 0) { 3912 ztest_zd_init(&zdtmp, NULL, os); 3913 zil_replay(os, &zdtmp, ztest_replay_vector); 3914 ztest_zd_fini(&zdtmp); 3915 dmu_objset_disown(os, B_TRUE, FTAG); 3916 } 3917 3918 /* 3919 * There may be an old instance of the dataset we're about to 3920 * create lying around from a previous run. If so, destroy it 3921 * and all of its snapshots. 3922 */ 3923 (void) dmu_objset_find(name, ztest_objset_destroy_cb, NULL, 3924 DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS); 3925 3926 /* 3927 * Verify that the destroyed dataset is no longer in the namespace. 3928 */ 3929 VERIFY3U(ENOENT, ==, ztest_dmu_objset_own(name, DMU_OST_OTHER, B_TRUE, 3930 B_TRUE, FTAG, &os)); 3931 3932 /* 3933 * Verify that we can create a new dataset. 
3934 */ 3935 error = ztest_dataset_create(name); 3936 if (error) { 3937 if (error == ENOSPC) { 3938 ztest_record_enospc(FTAG); 3939 rw_exit(&ztest_name_lock); 3940 return; 3941 } 3942 fatal(0, "dmu_objset_create(%s) = %d", name, error); 3943 } 3944 3945 VERIFY0(ztest_dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, B_TRUE, 3946 FTAG, &os)); 3947 3948 ztest_zd_init(&zdtmp, NULL, os); 3949 3950 /* 3951 * Open the intent log for it. 3952 */ 3953 zilog = zil_open(os, ztest_get_data); 3954 3955 /* 3956 * Put some objects in there, do a little I/O to them, 3957 * and randomly take a couple of snapshots along the way. 3958 */ 3959 iters = ztest_random(5); 3960 for (int i = 0; i < iters; i++) { 3961 ztest_dmu_object_alloc_free(&zdtmp, id); 3962 if (ztest_random(iters) == 0) 3963 (void) ztest_snapshot_create(name, i); 3964 } 3965 3966 /* 3967 * Verify that we cannot create an existing dataset. 3968 */ 3969 VERIFY3U(EEXIST, ==, 3970 dmu_objset_create(name, DMU_OST_OTHER, 0, NULL, NULL, NULL)); 3971 3972 /* 3973 * Verify that we can hold an objset that is also owned. 3974 */ 3975 VERIFY3U(0, ==, dmu_objset_hold(name, FTAG, &os2)); 3976 dmu_objset_rele(os2, FTAG); 3977 3978 /* 3979 * Verify that we cannot own an objset that is already owned. 3980 */ 3981 VERIFY3U(EBUSY, ==, ztest_dmu_objset_own(name, DMU_OST_OTHER, 3982 B_FALSE, B_TRUE, FTAG, &os2)); 3983 3984 zil_close(zilog); 3985 dmu_objset_disown(os, B_TRUE, FTAG); 3986 ztest_zd_fini(&zdtmp); 3987 3988 rw_exit(&ztest_name_lock); 3989 } 3990 3991 /* 3992 * Verify that dmu_snapshot_{create,destroy,open,close} work as expected. 3993 */ 3994 void 3995 ztest_dmu_snapshot_create_destroy(ztest_ds_t *zd, uint64_t id) 3996 { 3997 rw_enter(&ztest_name_lock, RW_READER); 3998 (void) ztest_snapshot_destroy(zd->zd_name, id); 3999 (void) ztest_snapshot_create(zd->zd_name, id); 4000 rw_exit(&ztest_name_lock); 4001 } 4002 4003 /* 4004 * Cleanup non-standard snapshots and clones. 
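 * The snapshot and clone names generated here mirror the ones used by
 * ztest_dsl_dataset_promote_busy() below, and are destroyed
 * dependents-first (each clone before its origin snapshot).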
4005 */ 4006 void 4007 ztest_dsl_dataset_cleanup(char *osname, uint64_t id) 4008 { 4009 char snap1name[ZFS_MAX_DATASET_NAME_LEN]; 4010 char clone1name[ZFS_MAX_DATASET_NAME_LEN]; 4011 char snap2name[ZFS_MAX_DATASET_NAME_LEN]; 4012 char clone2name[ZFS_MAX_DATASET_NAME_LEN]; 4013 char snap3name[ZFS_MAX_DATASET_NAME_LEN]; 4014 int error; 4015 4016 (void) snprintf(snap1name, sizeof (snap1name), 4017 "%s@s1_%llu", osname, id); 4018 (void) snprintf(clone1name, sizeof (clone1name), 4019 "%s/c1_%llu", osname, id); 4020 (void) snprintf(snap2name, sizeof (snap2name), 4021 "%s@s2_%llu", clone1name, id); 4022 (void) snprintf(clone2name, sizeof (clone2name), 4023 "%s/c2_%llu", osname, id); 4024 (void) snprintf(snap3name, sizeof (snap3name), 4025 "%s@s3_%llu", clone1name, id); 4026 4027 error = dsl_destroy_head(clone2name); 4028 if (error && error != ENOENT) 4029 fatal(0, "dsl_destroy_head(%s) = %d", clone2name, error); 4030 error = dsl_destroy_snapshot(snap3name, B_FALSE); 4031 if (error && error != ENOENT) 4032 fatal(0, "dsl_destroy_snapshot(%s) = %d", snap3name, error); 4033 error = dsl_destroy_snapshot(snap2name, B_FALSE); 4034 if (error && error != ENOENT) 4035 fatal(0, "dsl_destroy_snapshot(%s) = %d", snap2name, error); 4036 error = dsl_destroy_head(clone1name); 4037 if (error && error != ENOENT) 4038 fatal(0, "dsl_destroy_head(%s) = %d", clone1name, error); 4039 error = dsl_destroy_snapshot(snap1name, B_FALSE); 4040 if (error && error != ENOENT) 4041 fatal(0, "dsl_destroy_snapshot(%s) = %d", snap1name, error); 4042 } 4043 4044 /* 4045 * Verify dsl_dataset_promote handles EBUSY 4046 */ 4047 void 4048 ztest_dsl_dataset_promote_busy(ztest_ds_t *zd, uint64_t id) 4049 { 4050 objset_t *os; 4051 char snap1name[ZFS_MAX_DATASET_NAME_LEN]; 4052 char clone1name[ZFS_MAX_DATASET_NAME_LEN]; 4053 char snap2name[ZFS_MAX_DATASET_NAME_LEN]; 4054 char clone2name[ZFS_MAX_DATASET_NAME_LEN]; 4055 char snap3name[ZFS_MAX_DATASET_NAME_LEN]; 4056 char *osname = zd->zd_name; 4057 int error; 4058 4059 rw_enter(&ztest_name_lock, RW_READER); 4060 4061 ztest_dsl_dataset_cleanup(osname, id); 4062 4063 (void) snprintf(snap1name, sizeof (snap1name), 4064 "%s@s1_%llu", osname, id); 4065 (void) snprintf(clone1name, sizeof (clone1name), 4066 "%s/c1_%llu", osname, id); 4067 (void) snprintf(snap2name, sizeof (snap2name), 4068 "%s@s2_%llu", clone1name, id); 4069 (void) snprintf(clone2name, sizeof (clone2name), 4070 "%s/c2_%llu", osname, id); 4071 (void) snprintf(snap3name, sizeof (snap3name), 4072 "%s@s3_%llu", clone1name, id); 4073 4074 error = dmu_objset_snapshot_one(osname, strchr(snap1name, '@') + 1); 4075 if (error && error != EEXIST) { 4076 if (error == ENOSPC) { 4077 ztest_record_enospc(FTAG); 4078 goto out; 4079 } 4080 fatal(0, "dmu_take_snapshot(%s) = %d", snap1name, error); 4081 } 4082 4083 error = dmu_objset_clone(clone1name, snap1name); 4084 if (error) { 4085 if (error == ENOSPC) { 4086 ztest_record_enospc(FTAG); 4087 goto out; 4088 } 4089 fatal(0, "dmu_objset_create(%s) = %d", clone1name, error); 4090 } 4091 4092 error = dmu_objset_snapshot_one(clone1name, strchr(snap2name, '@') + 1); 4093 if (error && error != EEXIST) { 4094 if (error == ENOSPC) { 4095 ztest_record_enospc(FTAG); 4096 goto out; 4097 } 4098 fatal(0, "dmu_open_snapshot(%s) = %d", snap2name, error); 4099 } 4100 4101 error = dmu_objset_snapshot_one(clone1name, strchr(snap3name, '@') + 1); 4102 if (error && error != EEXIST) { 4103 if (error == ENOSPC) { 4104 ztest_record_enospc(FTAG); 4105 goto out; 4106 } 4107 fatal(0, "dmu_open_snapshot(%s) = %d", snap3name, 
error); 4108 } 4109 4110 error = dmu_objset_clone(clone2name, snap3name); 4111 if (error) { 4112 if (error == ENOSPC) { 4113 ztest_record_enospc(FTAG); 4114 goto out; 4115 } 4116 fatal(0, "dmu_objset_create(%s) = %d", clone2name, error); 4117 } 4118 4119 error = ztest_dmu_objset_own(snap2name, DMU_OST_ANY, B_TRUE, B_TRUE, 4120 FTAG, &os); 4121 if (error) 4122 fatal(0, "dmu_objset_own(%s) = %d", snap2name, error); 4123 error = dsl_dataset_promote(clone2name, NULL); 4124 if (error == ENOSPC) { 4125 dmu_objset_disown(os, B_TRUE, FTAG); 4126 ztest_record_enospc(FTAG); 4127 goto out; 4128 } 4129 if (error != EBUSY) 4130 fatal(0, "dsl_dataset_promote(%s), %d, not EBUSY", clone2name, 4131 error); 4132 dmu_objset_disown(os, B_TRUE, FTAG); 4133 4134 out: 4135 ztest_dsl_dataset_cleanup(osname, id); 4136 4137 rw_exit(&ztest_name_lock); 4138 } 4139 4140 /* 4141 * Verify that dmu_object_{alloc,free} work as expected. 4142 */ 4143 void 4144 ztest_dmu_object_alloc_free(ztest_ds_t *zd, uint64_t id) 4145 { 4146 ztest_od_t od[4]; 4147 int batchsize = sizeof (od) / sizeof (od[0]); 4148 4149 for (int b = 0; b < batchsize; b++) { 4150 ztest_od_init(&od[b], id, FTAG, b, DMU_OT_UINT64_OTHER, 4151 0, 0, 0); 4152 } 4153 4154 /* 4155 * Destroy the previous batch of objects, create a new batch, 4156 * and do some I/O on the new objects. 4157 */ 4158 if (ztest_object_init(zd, od, sizeof (od), B_TRUE) != 0) 4159 return; 4160 4161 while (ztest_random(4 * batchsize) != 0) 4162 ztest_io(zd, od[ztest_random(batchsize)].od_object, 4163 ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT); 4164 } 4165 4166 /* 4167 * Rewind the global allocator to verify object allocation backfilling. 4168 */ 4169 void 4170 ztest_dmu_object_next_chunk(ztest_ds_t *zd, uint64_t id) 4171 { 4172 objset_t *os = zd->zd_os; 4173 int dnodes_per_chunk = 1 << dmu_object_alloc_chunk_shift; 4174 uint64_t object; 4175 4176 /* 4177 * Rewind the global allocator randomly back to a lower object number 4178 * to force backfilling and reclamation of recently freed dnodes. 4179 */ 4180 mutex_enter(&os->os_obj_lock); 4181 object = ztest_random(os->os_obj_next_chunk); 4182 os->os_obj_next_chunk = P2ALIGN(object, dnodes_per_chunk); 4183 mutex_exit(&os->os_obj_lock); 4184 } 4185 4186 /* 4187 * Verify that dmu_{read,write} work as expected. 4188 */ 4189 void 4190 ztest_dmu_read_write(ztest_ds_t *zd, uint64_t id) 4191 { 4192 objset_t *os = zd->zd_os; 4193 ztest_od_t od[2]; 4194 dmu_tx_t *tx; 4195 int i, freeit, error; 4196 uint64_t n, s, txg; 4197 bufwad_t *packbuf, *bigbuf, *pack, *bigH, *bigT; 4198 uint64_t packobj, packoff, packsize, bigobj, bigoff, bigsize; 4199 uint64_t chunksize = (1000 + ztest_random(1000)) * sizeof (uint64_t); 4200 uint64_t regions = 997; 4201 uint64_t stride = 123456789ULL; 4202 uint64_t width = 40; 4203 int free_percent = 5; 4204 4205 /* 4206 * This test uses two objects, packobj and bigobj, that are always 4207 * updated together (i.e. in the same tx) so that their contents are 4208 * in sync and can be compared. Their contents relate to each other 4209 * in a simple way: packobj is a dense array of 'bufwad' structures, 4210 * while bigobj is a sparse array of the same bufwads. Specifically, 4211 * for any index n, there are three bufwads that should be identical: 4212 * 4213 * packobj, at offset n * sizeof (bufwad_t) 4214 * bigobj, at the head of the nth chunk 4215 * bigobj, at the tail of the nth chunk 4216 * 4217 * The chunk size is arbitrary. 
It doesn't have to be a power of two, 4218 * and it doesn't have any relation to the object blocksize. 4219 * The only requirement is that it can hold at least two bufwads. 4220 * 4221 * Normally, we write the bufwad to each of these locations. 4222 * However, free_percent of the time we instead write zeroes to 4223 * packobj and perform a dmu_free_range() on bigobj. By comparing 4224 * bigobj to packobj, we can verify that the DMU is correctly 4225 * tracking which parts of an object are allocated and free, 4226 * and that the contents of the allocated blocks are correct. 4227 */ 4228 4229 /* 4230 * Read the directory info. If it's the first time, set things up. 4231 */ 4232 ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0, 4233 chunksize); 4234 ztest_od_init(&od[1], id, FTAG, 1, DMU_OT_UINT64_OTHER, 0, 0, 4235 chunksize); 4236 4237 if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0) 4238 return; 4239 4240 bigobj = od[0].od_object; 4241 packobj = od[1].od_object; 4242 chunksize = od[0].od_gen; 4243 ASSERT(chunksize == od[1].od_gen); 4244 4245 /* 4246 * Prefetch a random chunk of the big object. 4247 * Our aim here is to get some async reads in flight 4248 * for blocks that we may free below; the DMU should 4249 * handle this race correctly. 4250 */ 4251 n = ztest_random(regions) * stride + ztest_random(width); 4252 s = 1 + ztest_random(2 * width - 1); 4253 dmu_prefetch(os, bigobj, 0, n * chunksize, s * chunksize, 4254 ZIO_PRIORITY_SYNC_READ); 4255 4256 /* 4257 * Pick a random index and compute the offsets into packobj and bigobj. 4258 */ 4259 n = ztest_random(regions) * stride + ztest_random(width); 4260 s = 1 + ztest_random(width - 1); 4261 4262 packoff = n * sizeof (bufwad_t); 4263 packsize = s * sizeof (bufwad_t); 4264 4265 bigoff = n * chunksize; 4266 bigsize = s * chunksize; 4267 4268 packbuf = umem_alloc(packsize, UMEM_NOFAIL); 4269 bigbuf = umem_alloc(bigsize, UMEM_NOFAIL); 4270 4271 /* 4272 * free_percent of the time, free a range of bigobj rather than 4273 * overwriting it. 4274 */ 4275 freeit = (ztest_random(100) < free_percent); 4276 4277 /* 4278 * Read the current contents of our objects. 4279 */ 4280 error = dmu_read(os, packobj, packoff, packsize, packbuf, 4281 DMU_READ_PREFETCH); 4282 ASSERT0(error); 4283 error = dmu_read(os, bigobj, bigoff, bigsize, bigbuf, 4284 DMU_READ_PREFETCH); 4285 ASSERT0(error); 4286 4287 /* 4288 * Get a tx for the mods to both packobj and bigobj. 4289 */ 4290 tx = dmu_tx_create(os); 4291 4292 dmu_tx_hold_write(tx, packobj, packoff, packsize); 4293 4294 if (freeit) 4295 dmu_tx_hold_free(tx, bigobj, bigoff, bigsize); 4296 else 4297 dmu_tx_hold_write(tx, bigobj, bigoff, bigsize); 4298 4299 /* This accounts for setting the checksum/compression. 
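* (dmu_object_set_checksum() and dmu_object_set_compress() below modify bigobj's dnode, so the tx needs the bonus/dnode hold on bigobj in addition to the data holds above.)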
*/ 4300 dmu_tx_hold_bonus(tx, bigobj); 4301 4302 txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); 4303 if (txg == 0) { 4304 umem_free(packbuf, packsize); 4305 umem_free(bigbuf, bigsize); 4306 return; 4307 } 4308 4309 enum zio_checksum cksum; 4310 do { 4311 cksum = (enum zio_checksum) 4312 ztest_random_dsl_prop(ZFS_PROP_CHECKSUM); 4313 } while (cksum >= ZIO_CHECKSUM_LEGACY_FUNCTIONS); 4314 dmu_object_set_checksum(os, bigobj, cksum, tx); 4315 4316 enum zio_compress comp; 4317 do { 4318 comp = (enum zio_compress) 4319 ztest_random_dsl_prop(ZFS_PROP_COMPRESSION); 4320 } while (comp >= ZIO_COMPRESS_LEGACY_FUNCTIONS); 4321 dmu_object_set_compress(os, bigobj, comp, tx); 4322 4323 /* 4324 * For each index from n to n + s, verify that the existing bufwad 4325 * in packobj matches the bufwads at the head and tail of the 4326 * corresponding chunk in bigobj. Then update all three bufwads 4327 * with the new values we want to write out. 4328 */ 4329 for (i = 0; i < s; i++) { 4330 /* LINTED */ 4331 pack = (bufwad_t *)((char *)packbuf + i * sizeof (bufwad_t)); 4332 /* LINTED */ 4333 bigH = (bufwad_t *)((char *)bigbuf + i * chunksize); 4334 /* LINTED */ 4335 bigT = (bufwad_t *)((char *)bigH + chunksize) - 1; 4336 4337 ASSERT((uintptr_t)bigH - (uintptr_t)bigbuf < bigsize); 4338 ASSERT((uintptr_t)bigT - (uintptr_t)bigbuf < bigsize); 4339 4340 if (pack->bw_txg > txg) 4341 fatal(0, "future leak: got %llx, open txg is %llx", 4342 pack->bw_txg, txg); 4343 4344 if (pack->bw_data != 0 && pack->bw_index != n + i) 4345 fatal(0, "wrong index: got %llx, wanted %llx+%llx", 4346 pack->bw_index, n, i); 4347 4348 if (bcmp(pack, bigH, sizeof (bufwad_t)) != 0) 4349 fatal(0, "pack/bigH mismatch in %p/%p", pack, bigH); 4350 4351 if (bcmp(pack, bigT, sizeof (bufwad_t)) != 0) 4352 fatal(0, "pack/bigT mismatch in %p/%p", pack, bigT); 4353 4354 if (freeit) { 4355 bzero(pack, sizeof (bufwad_t)); 4356 } else { 4357 pack->bw_index = n + i; 4358 pack->bw_txg = txg; 4359 pack->bw_data = 1 + ztest_random(-2ULL); 4360 } 4361 *bigH = *pack; 4362 *bigT = *pack; 4363 } 4364 4365 /* 4366 * We've verified all the old bufwads, and made new ones. 4367 * Now write them out. 4368 */ 4369 dmu_write(os, packobj, packoff, packsize, packbuf, tx); 4370 4371 if (freeit) { 4372 if (ztest_opts.zo_verbose >= 7) { 4373 (void) printf("freeing offset %llx size %llx" 4374 " txg %llx\n", 4375 (u_longlong_t)bigoff, 4376 (u_longlong_t)bigsize, 4377 (u_longlong_t)txg); 4378 } 4379 VERIFY(0 == dmu_free_range(os, bigobj, bigoff, bigsize, tx)); 4380 } else { 4381 if (ztest_opts.zo_verbose >= 7) { 4382 (void) printf("writing offset %llx size %llx" 4383 " txg %llx\n", 4384 (u_longlong_t)bigoff, 4385 (u_longlong_t)bigsize, 4386 (u_longlong_t)txg); 4387 } 4388 dmu_write(os, bigobj, bigoff, bigsize, bigbuf, tx); 4389 } 4390 4391 dmu_tx_commit(tx); 4392 4393 /* 4394 * Sanity check the stuff we just wrote. 
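* Read both objects back through the DMU and check that the on-disk contents match the in-memory buffers we just wrote out.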
4395 */ 4396 { 4397 void *packcheck = umem_alloc(packsize, UMEM_NOFAIL); 4398 void *bigcheck = umem_alloc(bigsize, UMEM_NOFAIL); 4399 4400 VERIFY(0 == dmu_read(os, packobj, packoff, 4401 packsize, packcheck, DMU_READ_PREFETCH)); 4402 VERIFY(0 == dmu_read(os, bigobj, bigoff, 4403 bigsize, bigcheck, DMU_READ_PREFETCH)); 4404 4405 ASSERT(bcmp(packbuf, packcheck, packsize) == 0); 4406 ASSERT(bcmp(bigbuf, bigcheck, bigsize) == 0); 4407 4408 umem_free(packcheck, packsize); 4409 umem_free(bigcheck, bigsize); 4410 } 4411 4412 umem_free(packbuf, packsize); 4413 umem_free(bigbuf, bigsize); 4414 } 4415 4416 void 4417 compare_and_update_pbbufs(uint64_t s, bufwad_t *packbuf, bufwad_t *bigbuf, 4418 uint64_t bigsize, uint64_t n, uint64_t chunksize, uint64_t txg) 4419 { 4420 uint64_t i; 4421 bufwad_t *pack; 4422 bufwad_t *bigH; 4423 bufwad_t *bigT; 4424 4425 /* 4426 * For each index from n to n + s, verify that the existing bufwad 4427 * in packobj matches the bufwads at the head and tail of the 4428 * corresponding chunk in bigobj. Then update all three bufwads 4429 * with the new values we want to write out. 4430 */ 4431 for (i = 0; i < s; i++) { 4432 /* LINTED */ 4433 pack = (bufwad_t *)((char *)packbuf + i * sizeof (bufwad_t)); 4434 /* LINTED */ 4435 bigH = (bufwad_t *)((char *)bigbuf + i * chunksize); 4436 /* LINTED */ 4437 bigT = (bufwad_t *)((char *)bigH + chunksize) - 1; 4438 4439 ASSERT((uintptr_t)bigH - (uintptr_t)bigbuf < bigsize); 4440 ASSERT((uintptr_t)bigT - (uintptr_t)bigbuf < bigsize); 4441 4442 if (pack->bw_txg > txg) 4443 fatal(0, "future leak: got %llx, open txg is %llx", 4444 pack->bw_txg, txg); 4445 4446 if (pack->bw_data != 0 && pack->bw_index != n + i) 4447 fatal(0, "wrong index: got %llx, wanted %llx+%llx", 4448 pack->bw_index, n, i); 4449 4450 if (bcmp(pack, bigH, sizeof (bufwad_t)) != 0) 4451 fatal(0, "pack/bigH mismatch in %p/%p", pack, bigH); 4452 4453 if (bcmp(pack, bigT, sizeof (bufwad_t)) != 0) 4454 fatal(0, "pack/bigT mismatch in %p/%p", pack, bigT); 4455 4456 pack->bw_index = n + i; 4457 pack->bw_txg = txg; 4458 pack->bw_data = 1 + ztest_random(-2ULL); 4459 4460 *bigH = *pack; 4461 *bigT = *pack; 4462 } 4463 } 4464 4465 void 4466 ztest_dmu_read_write_zcopy(ztest_ds_t *zd, uint64_t id) 4467 { 4468 objset_t *os = zd->zd_os; 4469 ztest_od_t od[2]; 4470 dmu_tx_t *tx; 4471 uint64_t i; 4472 int error; 4473 uint64_t n, s, txg; 4474 bufwad_t *packbuf, *bigbuf; 4475 uint64_t packobj, packoff, packsize, bigobj, bigoff, bigsize; 4476 uint64_t blocksize = ztest_random_blocksize(); 4477 uint64_t chunksize = blocksize; 4478 uint64_t regions = 997; 4479 uint64_t stride = 123456789ULL; 4480 uint64_t width = 9; 4481 dmu_buf_t *bonus_db; 4482 arc_buf_t **bigbuf_arcbufs; 4483 dmu_object_info_t doi; 4484 4485 /* 4486 * This test uses two objects, packobj and bigobj, that are always 4487 * updated together (i.e. in the same tx) so that their contents are 4488 * in sync and can be compared. Their contents relate to each other 4489 * in a simple way: packobj is a dense array of 'bufwad' structures, 4490 * while bigobj is a sparse array of the same bufwads. Specifically, 4491 * for any index n, there are three bufwads that should be identical: 4492 * 4493 * packobj, at offset n * sizeof (bufwad_t) 4494 * bigobj, at the head of the nth chunk 4495 * bigobj, at the tail of the nth chunk 4496 * 4497 * The chunk size is set equal to bigobj block size so that 4498 * dmu_assign_arcbuf_by_dbuf() can be tested for object updates. 4499 */ 4500 4501 /* 4502 * Read the directory info. 
If it's the first time, set things up. 4503 */ 4504 ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, 4505 0, 0); 4506 ztest_od_init(&od[1], id, FTAG, 1, DMU_OT_UINT64_OTHER, 0, 0, 4507 chunksize); 4508 4509 if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0) 4510 return; 4511 4512 bigobj = od[0].od_object; 4513 packobj = od[1].od_object; 4514 blocksize = od[0].od_blocksize; 4515 chunksize = blocksize; 4516 ASSERT(chunksize == od[1].od_gen); 4517 4518 VERIFY(dmu_object_info(os, bigobj, &doi) == 0); 4519 VERIFY(ISP2(doi.doi_data_block_size)); 4520 VERIFY(chunksize == doi.doi_data_block_size); 4521 VERIFY(chunksize >= 2 * sizeof (bufwad_t)); 4522 4523 /* 4524 * Pick a random index and compute the offsets into packobj and bigobj. 4525 */ 4526 n = ztest_random(regions) * stride + ztest_random(width); 4527 s = 1 + ztest_random(width - 1); 4528 4529 packoff = n * sizeof (bufwad_t); 4530 packsize = s * sizeof (bufwad_t); 4531 4532 bigoff = n * chunksize; 4533 bigsize = s * chunksize; 4534 4535 packbuf = umem_zalloc(packsize, UMEM_NOFAIL); 4536 bigbuf = umem_zalloc(bigsize, UMEM_NOFAIL); 4537 4538 VERIFY3U(0, ==, dmu_bonus_hold(os, bigobj, FTAG, &bonus_db)); 4539 4540 bigbuf_arcbufs = umem_zalloc(2 * s * sizeof (arc_buf_t *), UMEM_NOFAIL); 4541 4542 /* 4543 * Iteration 0 test zcopy for DB_UNCACHED dbufs. 4544 * Iteration 1 test zcopy to already referenced dbufs. 4545 * Iteration 2 test zcopy to dirty dbuf in the same txg. 4546 * Iteration 3 test zcopy to dbuf dirty in previous txg. 4547 * Iteration 4 test zcopy when dbuf is no longer dirty. 4548 * Iteration 5 test zcopy when it can't be done. 4549 * Iteration 6 one more zcopy write. 4550 */ 4551 for (i = 0; i < 7; i++) { 4552 uint64_t j; 4553 uint64_t off; 4554 4555 /* 4556 * In iteration 5 (i == 5) use arcbufs 4557 * that don't match bigobj blksz to test 4558 * dmu_assign_arcbuf_by_dbuf() when it can't directly 4559 * assign an arcbuf to a dbuf. 4560 */ 4561 for (j = 0; j < s; j++) { 4562 if (i != 5 || chunksize < (SPA_MINBLOCKSIZE * 2)) { 4563 bigbuf_arcbufs[j] = 4564 dmu_request_arcbuf(bonus_db, chunksize); 4565 } else { 4566 bigbuf_arcbufs[2 * j] = 4567 dmu_request_arcbuf(bonus_db, chunksize / 2); 4568 bigbuf_arcbufs[2 * j + 1] = 4569 dmu_request_arcbuf(bonus_db, chunksize / 2); 4570 } 4571 } 4572 4573 /* 4574 * Get a tx for the mods to both packobj and bigobj. 4575 */ 4576 tx = dmu_tx_create(os); 4577 4578 dmu_tx_hold_write(tx, packobj, packoff, packsize); 4579 dmu_tx_hold_write(tx, bigobj, bigoff, bigsize); 4580 4581 txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); 4582 if (txg == 0) { 4583 umem_free(packbuf, packsize); 4584 umem_free(bigbuf, bigsize); 4585 for (j = 0; j < s; j++) { 4586 if (i != 5 || 4587 chunksize < (SPA_MINBLOCKSIZE * 2)) { 4588 dmu_return_arcbuf(bigbuf_arcbufs[j]); 4589 } else { 4590 dmu_return_arcbuf( 4591 bigbuf_arcbufs[2 * j]); 4592 dmu_return_arcbuf( 4593 bigbuf_arcbufs[2 * j + 1]); 4594 } 4595 } 4596 umem_free(bigbuf_arcbufs, 2 * s * sizeof (arc_buf_t *)); 4597 dmu_buf_rele(bonus_db, FTAG); 4598 return; 4599 } 4600 4601 /* 4602 * 50% of the time don't read objects in the 1st iteration to 4603 * test dmu_assign_arcbuf_by_dbuf() for the case when there are 4604 * no existing dbufs for the specified offsets. 
4605 */ 4606 if (i != 0 || ztest_random(2) != 0) { 4607 error = dmu_read(os, packobj, packoff, 4608 packsize, packbuf, DMU_READ_PREFETCH); 4609 ASSERT0(error); 4610 error = dmu_read(os, bigobj, bigoff, bigsize, 4611 bigbuf, DMU_READ_PREFETCH); 4612 ASSERT0(error); 4613 } 4614 compare_and_update_pbbufs(s, packbuf, bigbuf, bigsize, 4615 n, chunksize, txg); 4616 4617 /* 4618 * We've verified all the old bufwads, and made new ones. 4619 * Now write them out. 4620 */ 4621 dmu_write(os, packobj, packoff, packsize, packbuf, tx); 4622 if (ztest_opts.zo_verbose >= 7) { 4623 (void) printf("writing offset %llx size %llx" 4624 " txg %llx\n", 4625 (u_longlong_t)bigoff, 4626 (u_longlong_t)bigsize, 4627 (u_longlong_t)txg); 4628 } 4629 for (off = bigoff, j = 0; j < s; j++, off += chunksize) { 4630 dmu_buf_t *dbt; 4631 if (i != 5 || chunksize < (SPA_MINBLOCKSIZE * 2)) { 4632 bcopy((caddr_t)bigbuf + (off - bigoff), 4633 bigbuf_arcbufs[j]->b_data, chunksize); 4634 } else { 4635 bcopy((caddr_t)bigbuf + (off - bigoff), 4636 bigbuf_arcbufs[2 * j]->b_data, 4637 chunksize / 2); 4638 bcopy((caddr_t)bigbuf + (off - bigoff) + 4639 chunksize / 2, 4640 bigbuf_arcbufs[2 * j + 1]->b_data, 4641 chunksize / 2); 4642 } 4643 4644 if (i == 1) { 4645 VERIFY(dmu_buf_hold(os, bigobj, off, 4646 FTAG, &dbt, DMU_READ_NO_PREFETCH) == 0); 4647 } 4648 if (i != 5 || chunksize < (SPA_MINBLOCKSIZE * 2)) { 4649 dmu_assign_arcbuf_by_dbuf(bonus_db, off, 4650 bigbuf_arcbufs[j], tx); 4651 } else { 4652 dmu_assign_arcbuf_by_dbuf(bonus_db, off, 4653 bigbuf_arcbufs[2 * j], tx); 4654 dmu_assign_arcbuf_by_dbuf(bonus_db, 4655 off + chunksize / 2, 4656 bigbuf_arcbufs[2 * j + 1], tx); 4657 } 4658 if (i == 1) { 4659 dmu_buf_rele(dbt, FTAG); 4660 } 4661 } 4662 dmu_tx_commit(tx); 4663 4664 /* 4665 * Sanity check the stuff we just wrote. 4666 */ 4667 { 4668 void *packcheck = umem_alloc(packsize, UMEM_NOFAIL); 4669 void *bigcheck = umem_alloc(bigsize, UMEM_NOFAIL); 4670 4671 VERIFY(0 == dmu_read(os, packobj, packoff, 4672 packsize, packcheck, DMU_READ_PREFETCH)); 4673 VERIFY(0 == dmu_read(os, bigobj, bigoff, 4674 bigsize, bigcheck, DMU_READ_PREFETCH)); 4675 4676 ASSERT(bcmp(packbuf, packcheck, packsize) == 0); 4677 ASSERT(bcmp(bigbuf, bigcheck, bigsize) == 0); 4678 4679 umem_free(packcheck, packsize); 4680 umem_free(bigcheck, bigsize); 4681 } 4682 if (i == 2) { 4683 txg_wait_open(dmu_objset_pool(os), 0, B_TRUE); 4684 } else if (i == 3) { 4685 txg_wait_synced(dmu_objset_pool(os), 0); 4686 } 4687 } 4688 4689 dmu_buf_rele(bonus_db, FTAG); 4690 umem_free(packbuf, packsize); 4691 umem_free(bigbuf, bigsize); 4692 umem_free(bigbuf_arcbufs, 2 * s * sizeof (arc_buf_t *)); 4693 } 4694 4695 /* ARGSUSED */ 4696 void 4697 ztest_dmu_write_parallel(ztest_ds_t *zd, uint64_t id) 4698 { 4699 ztest_od_t od[1]; 4700 uint64_t offset = (1ULL << (ztest_random(20) + 43)) + 4701 (ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT); 4702 4703 /* 4704 * Have multiple threads write to large offsets in an object 4705 * to verify that parallel writes to an object -- even to the 4706 * same blocks within the object -- doesn't cause any trouble. 
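* (The offset above is a randomly chosen power of two of at least 1ULL << 43, plus a random SPA_MAXBLOCKSIZE-aligned slot, so writes land far out in a sparse region of the shared ID_PARALLEL object, and threads that happen to pick the same offset exercise concurrent writes to the same blocks.)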
4707 */ 4708 ztest_od_init(&od[0], ID_PARALLEL, FTAG, 0, DMU_OT_UINT64_OTHER, 4709 0, 0, 0); 4710 4711 if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0) 4712 return; 4713 4714 while (ztest_random(10) != 0) 4715 ztest_io(zd, od[0].od_object, offset); 4716 } 4717 4718 void 4719 ztest_dmu_prealloc(ztest_ds_t *zd, uint64_t id) 4720 { 4721 ztest_od_t od[1]; 4722 uint64_t offset = (1ULL << (ztest_random(4) + SPA_MAXBLOCKSHIFT)) + 4723 (ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT); 4724 uint64_t count = ztest_random(20) + 1; 4725 uint64_t blocksize = ztest_random_blocksize(); 4726 void *data; 4727 4728 ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, 4729 0, 0); 4730 4731 if (ztest_object_init(zd, od, sizeof (od), !ztest_random(2)) != 0) 4732 return; 4733 4734 if (ztest_truncate(zd, od[0].od_object, offset, count * blocksize) != 0) 4735 return; 4736 4737 ztest_prealloc(zd, od[0].od_object, offset, count * blocksize); 4738 4739 data = umem_zalloc(blocksize, UMEM_NOFAIL); 4740 4741 while (ztest_random(count) != 0) { 4742 uint64_t randoff = offset + (ztest_random(count) * blocksize); 4743 if (ztest_write(zd, od[0].od_object, randoff, blocksize, 4744 data) != 0) 4745 break; 4746 while (ztest_random(4) != 0) 4747 ztest_io(zd, od[0].od_object, randoff); 4748 } 4749 4750 umem_free(data, blocksize); 4751 } 4752 4753 /* 4754 * Verify that zap_{create,destroy,add,remove,update} work as expected. 4755 */ 4756 #define ZTEST_ZAP_MIN_INTS 1 4757 #define ZTEST_ZAP_MAX_INTS 4 4758 #define ZTEST_ZAP_MAX_PROPS 1000 4759 4760 void 4761 ztest_zap(ztest_ds_t *zd, uint64_t id) 4762 { 4763 objset_t *os = zd->zd_os; 4764 ztest_od_t od[1]; 4765 uint64_t object; 4766 uint64_t txg, last_txg; 4767 uint64_t value[ZTEST_ZAP_MAX_INTS]; 4768 uint64_t zl_ints, zl_intsize, prop; 4769 int i, ints; 4770 dmu_tx_t *tx; 4771 char propname[100], txgname[100]; 4772 int error; 4773 char *hc[2] = { "s.acl.h", ".s.open.h.hyLZlg" }; 4774 4775 ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_ZAP_OTHER, 0, 0, 0); 4776 4777 if (ztest_object_init(zd, od, sizeof (od), !ztest_random(2)) != 0) 4778 return; 4779 4780 object = od[0].od_object; 4781 4782 /* 4783 * Generate a known hash collision, and verify that 4784 * we can lookup and remove both entries. 4785 */ 4786 tx = dmu_tx_create(os); 4787 dmu_tx_hold_zap(tx, object, B_TRUE, NULL); 4788 txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); 4789 if (txg == 0) 4790 return; 4791 for (i = 0; i < 2; i++) { 4792 value[i] = i; 4793 VERIFY3U(0, ==, zap_add(os, object, hc[i], sizeof (uint64_t), 4794 1, &value[i], tx)); 4795 } 4796 for (i = 0; i < 2; i++) { 4797 VERIFY3U(EEXIST, ==, zap_add(os, object, hc[i], 4798 sizeof (uint64_t), 1, &value[i], tx)); 4799 VERIFY3U(0, ==, 4800 zap_length(os, object, hc[i], &zl_intsize, &zl_ints)); 4801 ASSERT3U(zl_intsize, ==, sizeof (uint64_t)); 4802 ASSERT3U(zl_ints, ==, 1); 4803 } 4804 for (i = 0; i < 2; i++) { 4805 VERIFY3U(0, ==, zap_remove(os, object, hc[i], tx)); 4806 } 4807 dmu_tx_commit(tx); 4808 4809 /* 4810 * Generate a bunch of random entries. 4811 */ 4812 ints = MAX(ZTEST_ZAP_MIN_INTS, object % ZTEST_ZAP_MAX_INTS); 4813 4814 prop = ztest_random(ZTEST_ZAP_MAX_PROPS); 4815 (void) sprintf(propname, "prop_%llu", (u_longlong_t)prop); 4816 (void) sprintf(txgname, "txg_%llu", (u_longlong_t)prop); 4817 bzero(value, sizeof (value)); 4818 last_txg = 0; 4819 4820 /* 4821 * If these zap entries already exist, validate their contents.
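* (txg_<prop> records the txg of the entry's last update, and the nth element of prop_<prop> should equal that txg + object + n -- the same invariant the atomic update further down re-establishes.)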
4822 */ 4823 error = zap_length(os, object, txgname, &zl_intsize, &zl_ints); 4824 if (error == 0) { 4825 ASSERT3U(zl_intsize, ==, sizeof (uint64_t)); 4826 ASSERT3U(zl_ints, ==, 1); 4827 4828 VERIFY(zap_lookup(os, object, txgname, zl_intsize, 4829 zl_ints, &last_txg) == 0); 4830 4831 VERIFY(zap_length(os, object, propname, &zl_intsize, 4832 &zl_ints) == 0); 4833 4834 ASSERT3U(zl_intsize, ==, sizeof (uint64_t)); 4835 ASSERT3U(zl_ints, ==, ints); 4836 4837 VERIFY(zap_lookup(os, object, propname, zl_intsize, 4838 zl_ints, value) == 0); 4839 4840 for (i = 0; i < ints; i++) { 4841 ASSERT3U(value[i], ==, last_txg + object + i); 4842 } 4843 } else { 4844 ASSERT3U(error, ==, ENOENT); 4845 } 4846 4847 /* 4848 * Atomically update two entries in our zap object. 4849 * The first is named txg_%llu, and contains the txg 4850 * in which the property was last updated. The second 4851 * is named prop_%llu, and the nth element of its value 4852 * should be txg + object + n. 4853 */ 4854 tx = dmu_tx_create(os); 4855 dmu_tx_hold_zap(tx, object, B_TRUE, NULL); 4856 txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); 4857 if (txg == 0) 4858 return; 4859 4860 if (last_txg > txg) 4861 fatal(0, "zap future leak: old %llu new %llu", last_txg, txg); 4862 4863 for (i = 0; i < ints; i++) 4864 value[i] = txg + object + i; 4865 4866 VERIFY3U(0, ==, zap_update(os, object, txgname, sizeof (uint64_t), 4867 1, &txg, tx)); 4868 VERIFY3U(0, ==, zap_update(os, object, propname, sizeof (uint64_t), 4869 ints, value, tx)); 4870 4871 dmu_tx_commit(tx); 4872 4873 /* 4874 * Remove a random pair of entries. 4875 */ 4876 prop = ztest_random(ZTEST_ZAP_MAX_PROPS); 4877 (void) sprintf(propname, "prop_%llu", (u_longlong_t)prop); 4878 (void) sprintf(txgname, "txg_%llu", (u_longlong_t)prop); 4879 4880 error = zap_length(os, object, txgname, &zl_intsize, &zl_ints); 4881 4882 if (error == ENOENT) 4883 return; 4884 4885 ASSERT0(error); 4886 4887 tx = dmu_tx_create(os); 4888 dmu_tx_hold_zap(tx, object, B_TRUE, NULL); 4889 txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); 4890 if (txg == 0) 4891 return; 4892 VERIFY3U(0, ==, zap_remove(os, object, txgname, tx)); 4893 VERIFY3U(0, ==, zap_remove(os, object, propname, tx)); 4894 dmu_tx_commit(tx); 4895 } 4896 4897 /* 4898 * Testcase to test the upgrading of a microzap to fatzap. 4899 */ 4900 void 4901 ztest_fzap(ztest_ds_t *zd, uint64_t id) 4902 { 4903 objset_t *os = zd->zd_os; 4904 ztest_od_t od[1]; 4905 uint64_t object, txg; 4906 4907 ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_ZAP_OTHER, 0, 0, 0); 4908 4909 if (ztest_object_init(zd, od, sizeof (od), !ztest_random(2)) != 0) 4910 return; 4911 4912 object = od[0].od_object; 4913 4914 /* 4915 * Add entries to this ZAP and make sure it spills over 4916 * and gets upgraded to a fatzap. Also, since we are adding 4917 * 2050 entries we should see ptrtbl growth and leaf-block split. 
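* (A rough sketch of the arithmetic, assuming the current mzap layout: a microzap must fit in a single block, and with 64-byte entries and a 128K maximum block size that caps it at about 2048 entries, so 2050 insertions, 2050 * 64 = 131200 bytes > 131072, should reliably trigger the conversion.)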
4918 */ 4919 for (int i = 0; i < 2050; i++) { 4920 char name[ZFS_MAX_DATASET_NAME_LEN]; 4921 uint64_t value = i; 4922 dmu_tx_t *tx; 4923 int error; 4924 4925 (void) snprintf(name, sizeof (name), "fzap-%llu-%llu", 4926 id, value); 4927 4928 tx = dmu_tx_create(os); 4929 dmu_tx_hold_zap(tx, object, B_TRUE, name); 4930 txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); 4931 if (txg == 0) 4932 return; 4933 error = zap_add(os, object, name, sizeof (uint64_t), 1, 4934 &value, tx); 4935 ASSERT(error == 0 || error == EEXIST); 4936 dmu_tx_commit(tx); 4937 } 4938 } 4939 4940 /* ARGSUSED */ 4941 void 4942 ztest_zap_parallel(ztest_ds_t *zd, uint64_t id) 4943 { 4944 objset_t *os = zd->zd_os; 4945 ztest_od_t od[1]; 4946 uint64_t txg, object, count, wsize, wc, zl_wsize, zl_wc; 4947 dmu_tx_t *tx; 4948 int i, namelen, error; 4949 int micro = ztest_random(2); 4950 char name[20], string_value[20]; 4951 void *data; 4952 4953 ztest_od_init(&od[0], ID_PARALLEL, FTAG, micro, DMU_OT_ZAP_OTHER, 4954 0, 0, 0); 4955 4956 if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0) 4957 return; 4958 4959 object = od[0].od_object; 4960 4961 /* 4962 * Generate a random name of the form 'xxx.....' where each 4963 * x is a random printable character and the dots are dots. 4964 * There are 94 such characters, and the name length goes from 4965 * 6 to 20, so there are 94^3 * 15 = 12,458,760 possible names. 4966 */ 4967 namelen = ztest_random(sizeof (name) - 5) + 5 + 1; 4968 4969 for (i = 0; i < 3; i++) 4970 name[i] = '!' + ztest_random('~' - '!' + 1); 4971 for (; i < namelen - 1; i++) 4972 name[i] = '.'; 4973 name[i] = '\0'; 4974 4975 if ((namelen & 1) || micro) { 4976 wsize = sizeof (txg); 4977 wc = 1; 4978 data = &txg; 4979 } else { 4980 wsize = 1; 4981 wc = namelen; 4982 data = string_value; 4983 } 4984 4985 count = -1ULL; 4986 VERIFY0(zap_count(os, object, &count)); 4987 ASSERT(count != -1ULL); 4988 4989 /* 4990 * Select an operation: length, lookup, add, update, remove. 4991 */ 4992 i = ztest_random(5); 4993 4994 if (i >= 2) { 4995 tx = dmu_tx_create(os); 4996 dmu_tx_hold_zap(tx, object, B_TRUE, NULL); 4997 txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); 4998 if (txg == 0) 4999 return; 5000 bcopy(name, string_value, namelen); 5001 } else { 5002 tx = NULL; 5003 txg = 0; 5004 bzero(string_value, namelen); 5005 } 5006 5007 switch (i) { 5008 5009 case 0: 5010 error = zap_length(os, object, name, &zl_wsize, &zl_wc); 5011 if (error == 0) { 5012 ASSERT3U(wsize, ==, zl_wsize); 5013 ASSERT3U(wc, ==, zl_wc); 5014 } else { 5015 ASSERT3U(error, ==, ENOENT); 5016 } 5017 break; 5018 5019 case 1: 5020 error = zap_lookup(os, object, name, wsize, wc, data); 5021 if (error == 0) { 5022 if (data == string_value && 5023 bcmp(name, data, namelen) != 0) 5024 fatal(0, "name '%s' != val '%s' len %d", 5025 name, data, namelen); 5026 } else { 5027 ASSERT3U(error, ==, ENOENT); 5028 } 5029 break; 5030 5031 case 2: 5032 error = zap_add(os, object, name, wsize, wc, data, tx); 5033 ASSERT(error == 0 || error == EEXIST); 5034 break; 5035 5036 case 3: 5037 VERIFY(zap_update(os, object, name, wsize, wc, data, tx) == 0); 5038 break; 5039 5040 case 4: 5041 error = zap_remove(os, object, name, tx); 5042 ASSERT(error == 0 || error == ENOENT); 5043 break; 5044 } 5045 5046 if (tx != NULL) 5047 dmu_tx_commit(tx); 5048 } 5049 5050 /* 5051 * Commit callback data. 
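* Each ztest_cb_data_t below tracks one registered callback: the txg it was assigned to, whether it was added to the global zcl.zcl_callbacks list, whether it has fired, and the error it expects to be called with (0 on commit, ECANCELED when the tx is aborted).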
5052 */ 5053 typedef struct ztest_cb_data { 5054 list_node_t zcd_node; 5055 uint64_t zcd_txg; 5056 int zcd_expected_err; 5057 boolean_t zcd_added; 5058 boolean_t zcd_called; 5059 spa_t *zcd_spa; 5060 } ztest_cb_data_t; 5061 5062 /* This is the actual commit callback function */ 5063 static void 5064 ztest_commit_callback(void *arg, int error) 5065 { 5066 ztest_cb_data_t *data = arg; 5067 uint64_t synced_txg; 5068 5069 VERIFY(data != NULL); 5070 VERIFY3S(data->zcd_expected_err, ==, error); 5071 VERIFY(!data->zcd_called); 5072 5073 synced_txg = spa_last_synced_txg(data->zcd_spa); 5074 if (data->zcd_txg > synced_txg) 5075 fatal(0, "commit callback of txg %" PRIu64 " called prematurely" 5076 ", last synced txg = %" PRIu64 "\n", data->zcd_txg, 5077 synced_txg); 5078 5079 data->zcd_called = B_TRUE; 5080 5081 if (error == ECANCELED) { 5082 ASSERT0(data->zcd_txg); 5083 ASSERT(!data->zcd_added); 5084 5085 /* 5086 * The private callback data should be destroyed here, but 5087 * since we are going to check the zcd_called field after 5088 * dmu_tx_abort(), we will destroy it there. 5089 */ 5090 return; 5091 } 5092 5093 /* Was this callback added to the global callback list? */ 5094 if (!data->zcd_added) 5095 goto out; 5096 5097 ASSERT3U(data->zcd_txg, !=, 0); 5098 5099 /* Remove our callback from the list */ 5100 mutex_enter(&zcl.zcl_callbacks_lock); 5101 list_remove(&zcl.zcl_callbacks, data); 5102 mutex_exit(&zcl.zcl_callbacks_lock); 5103 5104 out: 5105 umem_free(data, sizeof (ztest_cb_data_t)); 5106 } 5107 5108 /* Allocate and initialize callback data structure */ 5109 static ztest_cb_data_t * 5110 ztest_create_cb_data(objset_t *os, uint64_t txg) 5111 { 5112 ztest_cb_data_t *cb_data; 5113 5114 cb_data = umem_zalloc(sizeof (ztest_cb_data_t), UMEM_NOFAIL); 5115 5116 cb_data->zcd_txg = txg; 5117 cb_data->zcd_spa = dmu_objset_spa(os); 5118 5119 return (cb_data); 5120 } 5121 5122 /* 5123 * If a number of txgs equal to this threshold have been created after a commit 5124 * callback has been registered but not called, then we assume there is an 5125 * implementation bug. 5126 */ 5127 #define ZTEST_COMMIT_CALLBACK_THRESH (TXG_CONCURRENT_STATES + 2) 5128 5129 /* 5130 * Commit callback test. 5131 */ 5132 void 5133 ztest_dmu_commit_callbacks(ztest_ds_t *zd, uint64_t id) 5134 { 5135 objset_t *os = zd->zd_os; 5136 ztest_od_t od[1]; 5137 dmu_tx_t *tx; 5138 ztest_cb_data_t *cb_data[3], *tmp_cb; 5139 uint64_t old_txg, txg; 5140 int i, error = 0; 5141 5142 ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0, 0); 5143 5144 if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0) 5145 return; 5146 5147 tx = dmu_tx_create(os); 5148 5149 cb_data[0] = ztest_create_cb_data(os, 0); 5150 dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[0]); 5151 5152 dmu_tx_hold_write(tx, od[0].od_object, 0, sizeof (uint64_t)); 5153 5154 /* Every once in a while, abort the transaction on purpose */ 5155 if (ztest_random(100) == 0) 5156 error = -1; 5157 5158 if (!error) 5159 error = dmu_tx_assign(tx, TXG_NOWAIT); 5160 5161 txg = error ? 0 : dmu_tx_get_txg(tx); 5162 5163 cb_data[0]->zcd_txg = txg; 5164 cb_data[1] = ztest_create_cb_data(os, txg); 5165 dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[1]); 5166 5167 if (error) { 5168 /* 5169 * It's not a strict requirement to call the registered 5170 * callbacks from inside dmu_tx_abort(), but that's what 5171 * is supposed to happen in the current implementation 5172 * so we will check for that.
5173 */ 5174 for (i = 0; i < 2; i++) { 5175 cb_data[i]->zcd_expected_err = ECANCELED; 5176 VERIFY(!cb_data[i]->zcd_called); 5177 } 5178 5179 dmu_tx_abort(tx); 5180 5181 for (i = 0; i < 2; i++) { 5182 VERIFY(cb_data[i]->zcd_called); 5183 umem_free(cb_data[i], sizeof (ztest_cb_data_t)); 5184 } 5185 5186 return; 5187 } 5188 5189 cb_data[2] = ztest_create_cb_data(os, txg); 5190 dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[2]); 5191 5192 /* 5193 * Read existing data to make sure there isn't a future leak. 5194 */ 5195 VERIFY(0 == dmu_read(os, od[0].od_object, 0, sizeof (uint64_t), 5196 &old_txg, DMU_READ_PREFETCH)); 5197 5198 if (old_txg > txg) 5199 fatal(0, "future leak: got %" PRIu64 ", open txg is %" PRIu64, 5200 old_txg, txg); 5201 5202 dmu_write(os, od[0].od_object, 0, sizeof (uint64_t), &txg, tx); 5203 5204 mutex_enter(&zcl.zcl_callbacks_lock); 5205 5206 /* 5207 * Since commit callbacks don't have any ordering requirement and since 5208 * it is theoretically possible for a commit callback to be called 5209 * after an arbitrary amount of time has elapsed since its txg has been 5210 * synced, it is difficult to reliably determine whether a commit 5211 * callback hasn't been called due to high load or due to a flawed 5212 * implementation. 5213 * 5214 * In practice, we will assume that if after a certain number of txgs a 5215 * commit callback hasn't been called, then most likely there's an 5216 * implementation bug. 5217 */ 5218 tmp_cb = list_head(&zcl.zcl_callbacks); 5219 if (tmp_cb != NULL && 5220 (txg - ZTEST_COMMIT_CALLBACK_THRESH) > tmp_cb->zcd_txg) { 5221 fatal(0, "Commit callback threshold exceeded, oldest txg: %" 5222 PRIu64 ", open txg: %" PRIu64 "\n", tmp_cb->zcd_txg, txg); 5223 } 5224 5225 /* 5226 * Let's find the place to insert our callbacks. 5227 * 5228 * Even though the list is ordered by txg, it is possible for the 5229 * insertion point to not be the end because our txg may already be 5230 * quiescing at this point and other callbacks in the open txg 5231 * (from other objsets) may have sneaked in. 5232 */ 5233 tmp_cb = list_tail(&zcl.zcl_callbacks); 5234 while (tmp_cb != NULL && tmp_cb->zcd_txg > txg) 5235 tmp_cb = list_prev(&zcl.zcl_callbacks, tmp_cb); 5236 5237 /* Add the 3 callbacks to the list */ 5238 for (i = 0; i < 3; i++) { 5239 if (tmp_cb == NULL) 5240 list_insert_head(&zcl.zcl_callbacks, cb_data[i]); 5241 else 5242 list_insert_after(&zcl.zcl_callbacks, tmp_cb, 5243 cb_data[i]); 5244 5245 cb_data[i]->zcd_added = B_TRUE; 5246 VERIFY(!cb_data[i]->zcd_called); 5247 5248 tmp_cb = cb_data[i]; 5249 } 5250 5251 mutex_exit(&zcl.zcl_callbacks_lock); 5252 5253 dmu_tx_commit(tx); 5254 } 5255 5256 /* 5257 * Visit each object in the dataset. Verify that its properties 5258 * are consistent with what was stored in the block tag when it was created, 5259 * and that its unused bonus buffer space has not been overwritten.
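* Only objects whose bonus buffer is large enough to hold a block tag, and whose tag carries BT_MAGIC, are verified; everything else is simply skipped.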
5260 */ 5261 void 5262 ztest_verify_dnode_bt(ztest_ds_t *zd, uint64_t id) 5263 { 5264 objset_t *os = zd->zd_os; 5265 uint64_t obj; 5266 int err = 0; 5267 5268 for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE, 0)) { 5269 ztest_block_tag_t *bt = NULL; 5270 dmu_object_info_t doi; 5271 dmu_buf_t *db; 5272 5273 if (dmu_bonus_hold(os, obj, FTAG, &db) != 0) 5274 continue; 5275 5276 dmu_object_info_from_db(db, &doi); 5277 if (doi.doi_bonus_size >= sizeof (*bt)) 5278 bt = ztest_bt_bonus(db); 5279 5280 if (bt && bt->bt_magic == BT_MAGIC) { 5281 ztest_bt_verify(bt, os, obj, doi.doi_dnodesize, 5282 bt->bt_offset, bt->bt_gen, bt->bt_txg, 5283 bt->bt_crtxg); 5284 ztest_verify_unused_bonus(db, bt, obj, os, bt->bt_gen); 5285 } 5286 5287 dmu_buf_rele(db, FTAG); 5288 } 5289 } 5290 5291 /* ARGSUSED */ 5292 void 5293 ztest_dsl_prop_get_set(ztest_ds_t *zd, uint64_t id) 5294 { 5295 zfs_prop_t proplist[] = { 5296 ZFS_PROP_CHECKSUM, 5297 ZFS_PROP_COMPRESSION, 5298 ZFS_PROP_COPIES, 5299 ZFS_PROP_DEDUP 5300 }; 5301 5302 rw_enter(&ztest_name_lock, RW_READER); 5303 5304 for (int p = 0; p < sizeof (proplist) / sizeof (proplist[0]); p++) 5305 (void) ztest_dsl_prop_set_uint64(zd->zd_name, proplist[p], 5306 ztest_random_dsl_prop(proplist[p]), (int)ztest_random(2)); 5307 5308 rw_exit(&ztest_name_lock); 5309 } 5310 5311 /* ARGSUSED */ 5312 void 5313 ztest_remap_blocks(ztest_ds_t *zd, uint64_t id) 5314 { 5315 rw_enter(&ztest_name_lock, RW_READER); 5316 5317 int error = dmu_objset_remap_indirects(zd->zd_name); 5318 if (error == ENOSPC) 5319 error = 0; 5320 ASSERT0(error); 5321 5322 rw_exit(&ztest_name_lock); 5323 } 5324 5325 /* ARGSUSED */ 5326 void 5327 ztest_spa_prop_get_set(ztest_ds_t *zd, uint64_t id) 5328 { 5329 nvlist_t *props = NULL; 5330 5331 rw_enter(&ztest_name_lock, RW_READER); 5332 5333 (void) ztest_spa_prop_set_uint64(ZPOOL_PROP_DEDUPDITTO, 5334 ZIO_DEDUPDITTO_MIN + ztest_random(ZIO_DEDUPDITTO_MIN)); 5335 5336 (void) ztest_spa_prop_set_uint64(ZPOOL_PROP_AUTOTRIM, ztest_random(2)); 5337 5338 VERIFY0(spa_prop_get(ztest_spa, &props)); 5339 5340 if (ztest_opts.zo_verbose >= 6) 5341 dump_nvlist(props, 4); 5342 5343 nvlist_free(props); 5344 5345 rw_exit(&ztest_name_lock); 5346 } 5347 5348 void 5349 ztest_fletcher(ztest_ds_t *zd, uint64_t id) 5350 { 5351 hrtime_t end = gethrtime() + NANOSEC; 5352 5353 while (gethrtime() <= end) { 5354 int run_count = 100; 5355 void *buf; 5356 struct abd *abd_data, *abd_meta; 5357 uint32_t size; 5358 uint_t *ptr; 5359 int i; 5360 zio_cksum_t zc_ref; 5361 zio_cksum_t zc_ref_byteswap; 5362 5363 size = ztest_random_blocksize(); 5364 5365 buf = umem_alloc(size, UMEM_NOFAIL); 5366 abd_data = abd_alloc(size, B_FALSE); 5367 abd_meta = abd_alloc(size, B_TRUE); 5368 5369 for (i = 0, ptr = buf; i < size / sizeof (*ptr); i++, ptr++) 5370 *ptr = ztest_random(UINT_MAX); 5371 5372 abd_copy_from_buf_off(abd_data, buf, 0, size); 5373 abd_copy_from_buf_off(abd_meta, buf, 0, size); 5374 5375 VERIFY0(fletcher_4_impl_set("scalar")); 5376 fletcher_4_native(buf, size, NULL, &zc_ref); 5377 fletcher_4_byteswap(buf, size, NULL, &zc_ref_byteswap); 5378 5379 VERIFY0(fletcher_4_impl_set("cycle")); 5380 while (run_count-- > 0) { 5381 zio_cksum_t zc; 5382 zio_cksum_t zc_byteswap; 5383 5384 fletcher_4_byteswap(buf, size, NULL, &zc_byteswap); 5385 fletcher_4_native(buf, size, NULL, &zc); 5386 5387 VERIFY0(bcmp(&zc, &zc_ref, sizeof (zc))); 5388 VERIFY0(bcmp(&zc_byteswap, &zc_ref_byteswap, 5389 sizeof (zc_byteswap))); 5390 5391 /* Test ABD - data */ 5392 abd_fletcher_4_byteswap(abd_data, size, NULL, 5393 
&zc_byteswap); 5394 abd_fletcher_4_native(abd_data, size, NULL, &zc); 5395 5396 VERIFY0(bcmp(&zc, &zc_ref, sizeof (zc))); 5397 VERIFY0(bcmp(&zc_byteswap, &zc_ref_byteswap, 5398 sizeof (zc_byteswap))); 5399 5400 /* Test ABD - metadata */ 5401 abd_fletcher_4_byteswap(abd_meta, size, NULL, 5402 &zc_byteswap); 5403 abd_fletcher_4_native(abd_meta, size, NULL, &zc); 5404 5405 VERIFY0(bcmp(&zc, &zc_ref, sizeof (zc))); 5406 VERIFY0(bcmp(&zc_byteswap, &zc_ref_byteswap, 5407 sizeof (zc_byteswap))); 5408 5409 } 5410 5411 umem_free(buf, size); 5412 abd_free(abd_data); 5413 abd_free(abd_meta); 5414 } 5415 } 5416 5417 void 5418 ztest_fletcher_incr(ztest_ds_t *zd, uint64_t id) 5419 { 5420 void *buf; 5421 size_t size; 5422 uint_t *ptr; 5423 int i; 5424 zio_cksum_t zc_ref; 5425 zio_cksum_t zc_ref_bswap; 5426 5427 hrtime_t end = gethrtime() + NANOSEC; 5428 5429 while (gethrtime() <= end) { 5430 int run_count = 100; 5431 5432 size = ztest_random_blocksize(); 5433 buf = umem_alloc(size, UMEM_NOFAIL); 5434 5435 for (i = 0, ptr = buf; i < size / sizeof (*ptr); i++, ptr++) 5436 *ptr = ztest_random(UINT_MAX); 5437 5438 VERIFY0(fletcher_4_impl_set("scalar")); 5439 fletcher_4_native(buf, size, NULL, &zc_ref); 5440 fletcher_4_byteswap(buf, size, NULL, &zc_ref_bswap); 5441 5442 VERIFY0(fletcher_4_impl_set("cycle")); 5443 5444 while (run_count-- > 0) { 5445 zio_cksum_t zc; 5446 zio_cksum_t zc_bswap; 5447 size_t pos = 0; 5448 5449 ZIO_SET_CHECKSUM(&zc, 0, 0, 0, 0); 5450 ZIO_SET_CHECKSUM(&zc_bswap, 0, 0, 0, 0); 5451 5452 while (pos < size) { 5453 size_t inc = 64 * ztest_random(size / 67); 5454 /* sometimes add few bytes to test non-simd */ 5455 if (ztest_random(100) < 10) 5456 inc += P2ALIGN(ztest_random(64), 5457 sizeof (uint32_t)); 5458 5459 if (inc > (size - pos)) 5460 inc = size - pos; 5461 5462 fletcher_4_incremental_native(buf + pos, inc, 5463 &zc); 5464 fletcher_4_incremental_byteswap(buf + pos, inc, 5465 &zc_bswap); 5466 5467 pos += inc; 5468 } 5469 5470 VERIFY3U(pos, ==, size); 5471 5472 VERIFY(ZIO_CHECKSUM_EQUAL(zc, zc_ref)); 5473 VERIFY(ZIO_CHECKSUM_EQUAL(zc_bswap, zc_ref_bswap)); 5474 5475 /* 5476 * verify if incremental on the whole buffer is 5477 * equivalent to non-incremental version 5478 */ 5479 ZIO_SET_CHECKSUM(&zc, 0, 0, 0, 0); 5480 ZIO_SET_CHECKSUM(&zc_bswap, 0, 0, 0, 0); 5481 5482 fletcher_4_incremental_native(buf, size, &zc); 5483 fletcher_4_incremental_byteswap(buf, size, &zc_bswap); 5484 5485 VERIFY(ZIO_CHECKSUM_EQUAL(zc, zc_ref)); 5486 VERIFY(ZIO_CHECKSUM_EQUAL(zc_bswap, zc_ref_bswap)); 5487 } 5488 5489 umem_free(buf, size); 5490 } 5491 } 5492 5493 static int 5494 user_release_one(const char *snapname, const char *holdname) 5495 { 5496 nvlist_t *snaps, *holds; 5497 int error; 5498 5499 snaps = fnvlist_alloc(); 5500 holds = fnvlist_alloc(); 5501 fnvlist_add_boolean(holds, holdname); 5502 fnvlist_add_nvlist(snaps, snapname, holds); 5503 fnvlist_free(holds); 5504 error = dsl_dataset_user_release(snaps, NULL); 5505 fnvlist_free(snaps); 5506 return (error); 5507 } 5508 5509 /* 5510 * Test snapshot hold/release and deferred destroy. 
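* Holds are passed around as an nvlist of snapshot -> {tag} pairs (see user_release_one() above). A snapshot marked for deferred destroy should linger until its last clone and last user hold are gone, which is what the two scenarios below exercise.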
5511 */ 5512 void 5513 ztest_dmu_snapshot_hold(ztest_ds_t *zd, uint64_t id) 5514 { 5515 int error; 5516 objset_t *os = zd->zd_os; 5517 objset_t *origin; 5518 char snapname[100]; 5519 char fullname[100]; 5520 char clonename[100]; 5521 char tag[100]; 5522 char osname[ZFS_MAX_DATASET_NAME_LEN]; 5523 nvlist_t *holds; 5524 5525 rw_enter(&ztest_name_lock, RW_READER); 5526 5527 dmu_objset_name(os, osname); 5528 5529 (void) snprintf(snapname, sizeof (snapname), "sh1_%llu", id); 5530 (void) snprintf(fullname, sizeof (fullname), "%s@%s", osname, snapname); 5531 (void) snprintf(clonename, sizeof (clonename), 5532 "%s/ch1_%llu", osname, id); 5533 (void) snprintf(tag, sizeof (tag), "tag_%llu", id); 5534 5535 /* 5536 * Clean up from any previous run. 5537 */ 5538 error = dsl_destroy_head(clonename); 5539 if (error != ENOENT) 5540 ASSERT0(error); 5541 error = user_release_one(fullname, tag); 5542 if (error != ESRCH && error != ENOENT) 5543 ASSERT0(error); 5544 error = dsl_destroy_snapshot(fullname, B_FALSE); 5545 if (error != ENOENT) 5546 ASSERT0(error); 5547 5548 /* 5549 * Create snapshot, clone it, mark snap for deferred destroy, 5550 * destroy clone, verify snap was also destroyed. 5551 */ 5552 error = dmu_objset_snapshot_one(osname, snapname); 5553 if (error) { 5554 if (error == ENOSPC) { 5555 ztest_record_enospc("dmu_objset_snapshot"); 5556 goto out; 5557 } 5558 fatal(0, "dmu_objset_snapshot(%s) = %d", fullname, error); 5559 } 5560 5561 error = dmu_objset_clone(clonename, fullname); 5562 if (error) { 5563 if (error == ENOSPC) { 5564 ztest_record_enospc("dmu_objset_clone"); 5565 goto out; 5566 } 5567 fatal(0, "dmu_objset_clone(%s) = %d", clonename, error); 5568 } 5569 5570 error = dsl_destroy_snapshot(fullname, B_TRUE); 5571 if (error) { 5572 fatal(0, "dsl_destroy_snapshot(%s, B_TRUE) = %d", 5573 fullname, error); 5574 } 5575 5576 error = dsl_destroy_head(clonename); 5577 if (error) 5578 fatal(0, "dsl_destroy_head(%s) = %d", clonename, error); 5579 5580 error = dmu_objset_hold(fullname, FTAG, &origin); 5581 if (error != ENOENT) 5582 fatal(0, "dmu_objset_hold(%s) = %d", fullname, error); 5583 5584 /* 5585 * Create snapshot, add temporary hold, verify that we can't 5586 * destroy a held snapshot, mark for deferred destroy, 5587 * release hold, verify snapshot was destroyed. 
5588 */ 5589 error = dmu_objset_snapshot_one(osname, snapname); 5590 if (error) { 5591 if (error == ENOSPC) { 5592 ztest_record_enospc("dmu_objset_snapshot"); 5593 goto out; 5594 } 5595 fatal(0, "dmu_objset_snapshot(%s) = %d", fullname, error); 5596 } 5597 5598 holds = fnvlist_alloc(); 5599 fnvlist_add_string(holds, fullname, tag); 5600 error = dsl_dataset_user_hold(holds, 0, NULL); 5601 fnvlist_free(holds); 5602 5603 if (error == ENOSPC) { 5604 ztest_record_enospc("dsl_dataset_user_hold"); 5605 goto out; 5606 } else if (error) { 5607 fatal(0, "dsl_dataset_user_hold(%s, %s) = %u", 5608 fullname, tag, error); 5609 } 5610 5611 error = dsl_destroy_snapshot(fullname, B_FALSE); 5612 if (error != EBUSY) { 5613 fatal(0, "dsl_destroy_snapshot(%s, B_FALSE) = %d", 5614 fullname, error); 5615 } 5616 5617 error = dsl_destroy_snapshot(fullname, B_TRUE); 5618 if (error) { 5619 fatal(0, "dsl_destroy_snapshot(%s, B_TRUE) = %d", 5620 fullname, error); 5621 } 5622 5623 error = user_release_one(fullname, tag); 5624 if (error) 5625 fatal(0, "user_release_one(%s, %s) = %d", fullname, tag, error); 5626 5627 VERIFY3U(dmu_objset_hold(fullname, FTAG, &origin), ==, ENOENT); 5628 5629 out: 5630 rw_exit(&ztest_name_lock); 5631 } 5632 5633 /* 5634 * Inject random faults into the on-disk data. 5635 */ 5636 /* ARGSUSED */ 5637 void 5638 ztest_fault_inject(ztest_ds_t *zd, uint64_t id) 5639 { 5640 ztest_shared_t *zs = ztest_shared; 5641 spa_t *spa = ztest_spa; 5642 int fd; 5643 uint64_t offset; 5644 uint64_t leaves; 5645 uint64_t bad = 0x1990c0ffeedecade; 5646 uint64_t top, leaf; 5647 char path0[MAXPATHLEN]; 5648 char pathrand[MAXPATHLEN]; 5649 size_t fsize; 5650 int bshift = SPA_MAXBLOCKSHIFT + 2; 5651 int iters = 1000; 5652 int maxfaults; 5653 int mirror_save; 5654 vdev_t *vd0 = NULL; 5655 uint64_t guid0 = 0; 5656 boolean_t islog = B_FALSE; 5657 5658 mutex_enter(&ztest_vdev_lock); 5659 5660 /* 5661 * Device removal is in progress, fault injection must be disabled 5662 * until it completes and the pool is scrubbed. The fault injection 5663 * strategy for damaging blocks does not take in to account evacuated 5664 * blocks which may have already been damaged. 5665 */ 5666 if (ztest_device_removal_active) { 5667 mutex_exit(&ztest_vdev_lock); 5668 return; 5669 } 5670 5671 maxfaults = MAXFAULTS(); 5672 leaves = MAX(zs->zs_mirrors, 1) * ztest_opts.zo_raidz; 5673 mirror_save = zs->zs_mirrors; 5674 mutex_exit(&ztest_vdev_lock); 5675 5676 ASSERT(leaves >= 1); 5677 5678 /* 5679 * Grab the name lock as reader. There are some operations 5680 * which don't like to have their vdevs changed while 5681 * they are in progress (i.e. spa_change_guid). Those 5682 * operations will have grabbed the name lock as writer. 5683 */ 5684 rw_enter(&ztest_name_lock, RW_READER); 5685 5686 /* 5687 * We need SCL_STATE here because we're going to look at vd0->vdev_tsd. 5688 */ 5689 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 5690 5691 if (ztest_random(2) == 0) { 5692 /* 5693 * Inject errors on a normal data device or slog device. 5694 */ 5695 top = ztest_random_vdev_top(spa, B_TRUE); 5696 leaf = ztest_random(leaves) + zs->zs_splits; 5697 5698 /* 5699 * Generate paths to the first leaf in this top-level vdev, 5700 * and to the random leaf we selected. We'll induce transient 5701 * write failures and random online/offline activity on leaf 0, 5702 * and we'll write random garbage to the randomly chosen leaf. 
5703 */ 5704 (void) snprintf(path0, sizeof (path0), ztest_dev_template, 5705 ztest_opts.zo_dir, ztest_opts.zo_pool, 5706 top * leaves + zs->zs_splits); 5707 (void) snprintf(pathrand, sizeof (pathrand), ztest_dev_template, 5708 ztest_opts.zo_dir, ztest_opts.zo_pool, 5709 top * leaves + leaf); 5710 5711 vd0 = vdev_lookup_by_path(spa->spa_root_vdev, path0); 5712 if (vd0 != NULL && vd0->vdev_top->vdev_islog) 5713 islog = B_TRUE; 5714 5715 /* 5716 * If the top-level vdev needs to be resilvered 5717 * then we only allow faults on the device that is 5718 * resilvering. 5719 */ 5720 if (vd0 != NULL && maxfaults != 1 && 5721 (!vdev_resilver_needed(vd0->vdev_top, NULL, NULL) || 5722 vd0->vdev_resilver_txg != 0)) { 5723 /* 5724 * Make vd0 explicitly claim to be unreadable, 5725 * or unwriteable, or reach behind its back 5726 * and close the underlying fd. We can do this if 5727 * maxfaults == 0 because we'll fail and reexecute, 5728 * and we can do it if maxfaults >= 2 because we'll 5729 * have enough redundancy. If maxfaults == 1, the 5730 * combination of this with injection of random data 5731 * corruption below exceeds the pool's fault tolerance. 5732 */ 5733 vdev_file_t *vf = vd0->vdev_tsd; 5734 5735 zfs_dbgmsg("injecting fault to vdev %llu; maxfaults=%d", 5736 (long long)vd0->vdev_id, (int)maxfaults); 5737 5738 if (vf != NULL && ztest_random(3) == 0) { 5739 (void) close(vf->vf_vnode->v_fd); 5740 vf->vf_vnode->v_fd = -1; 5741 } else if (ztest_random(2) == 0) { 5742 vd0->vdev_cant_read = B_TRUE; 5743 } else { 5744 vd0->vdev_cant_write = B_TRUE; 5745 } 5746 guid0 = vd0->vdev_guid; 5747 } 5748 } else { 5749 /* 5750 * Inject errors on an l2cache device. 5751 */ 5752 spa_aux_vdev_t *sav = &spa->spa_l2cache; 5753 5754 if (sav->sav_count == 0) { 5755 spa_config_exit(spa, SCL_STATE, FTAG); 5756 rw_exit(&ztest_name_lock); 5757 return; 5758 } 5759 vd0 = sav->sav_vdevs[ztest_random(sav->sav_count)]; 5760 guid0 = vd0->vdev_guid; 5761 (void) strcpy(path0, vd0->vdev_path); 5762 (void) strcpy(pathrand, vd0->vdev_path); 5763 5764 leaf = 0; 5765 leaves = 1; 5766 maxfaults = INT_MAX; /* no limit on cache devices */ 5767 } 5768 5769 spa_config_exit(spa, SCL_STATE, FTAG); 5770 rw_exit(&ztest_name_lock); 5771 5772 /* 5773 * If we can tolerate two or more faults, or we're dealing 5774 * with a slog, randomly online/offline vd0. 5775 */ 5776 if ((maxfaults >= 2 || islog) && guid0 != 0) { 5777 if (ztest_random(10) < 6) { 5778 int flags = (ztest_random(2) == 0 ? 5779 ZFS_OFFLINE_TEMPORARY : 0); 5780 5781 /* 5782 * We have to grab the zs_name_lock as writer to 5783 * prevent a race between offlining a slog and 5784 * destroying a dataset. Offlining the slog will 5785 * grab a reference on the dataset which may cause 5786 * dmu_objset_destroy() to fail with EBUSY thus 5787 * leaving the dataset in an inconsistent state. 5788 */ 5789 if (islog) 5790 rw_enter(&ztest_name_lock, RW_WRITER); 5791 5792 VERIFY(vdev_offline(spa, guid0, flags) != EBUSY); 5793 5794 if (islog) 5795 rw_exit(&ztest_name_lock); 5796 } else { 5797 /* 5798 * Ideally we would like to be able to randomly 5799 * call vdev_[on|off]line without holding locks 5800 * to force unpredictable failures but the side 5801 * effects of vdev_[on|off]line prevent us from 5802 * doing so. We grab the ztest_vdev_lock here to 5803 * prevent a race between injection testing and 5804 * aux_vdev removal. 
5805 */ 5806 mutex_enter(&ztest_vdev_lock); 5807 (void) vdev_online(spa, guid0, 0, NULL); 5808 mutex_exit(&ztest_vdev_lock); 5809 } 5810 } 5811 5812 if (maxfaults == 0) 5813 return; 5814 5815 /* 5816 * We have at least single-fault tolerance, so inject data corruption. 5817 */ 5818 fd = open(pathrand, O_RDWR); 5819 5820 if (fd == -1) /* we hit a gap in the device namespace */ 5821 return; 5822 5823 fsize = lseek(fd, 0, SEEK_END); 5824 5825 while (--iters != 0) { 5826 /* 5827 * The offset must be chosen carefully to ensure that 5828 * we do not inject a given logical block with errors 5829 * on two different leaf devices, because ZFS can not 5830 * tolerate that (if maxfaults==1). 5831 * 5832 * We divide each leaf into chunks of size 5833 * (# leaves * SPA_MAXBLOCKSIZE * 4). Within each chunk 5834 * there is a series of ranges to which we can inject errors. 5835 * Each range can accept errors on only a single leaf vdev. 5836 * The error injection ranges are separated by ranges 5837 * which we will not inject errors on any device (DMZs). 5838 * Each DMZ must be large enough such that a single block 5839 * can not straddle it, so that a single block can not be 5840 * a target in two different injection ranges (on different 5841 * leaf vdevs). 5842 * 5843 * For example, with 3 leaves, each chunk looks like: 5844 * 0 to 32M: injection range for leaf 0 5845 * 32M to 64M: DMZ - no injection allowed 5846 * 64M to 96M: injection range for leaf 1 5847 * 96M to 128M: DMZ - no injection allowed 5848 * 128M to 160M: injection range for leaf 2 5849 * 160M to 192M: DMZ - no injection allowed 5850 */ 5851 offset = ztest_random(fsize / (leaves << bshift)) * 5852 (leaves << bshift) + (leaf << bshift) + 5853 (ztest_random(1ULL << (bshift - 1)) & -8ULL); 5854 5855 /* 5856 * Only allow damage to the labels at one end of the vdev. 5857 * 5858 * If all labels are damaged, the device will be totally 5859 * inaccessible, which will result in loss of data, 5860 * because we also damage (parts of) the other side of 5861 * the mirror/raidz. 5862 * 5863 * Additionally, we will always have both an even and an 5864 * odd label, so that we can handle crashes in the 5865 * middle of vdev_config_sync(). 5866 */ 5867 if ((leaf & 1) == 0 && offset < VDEV_LABEL_START_SIZE) 5868 continue; 5869 5870 /* 5871 * The two end labels are stored at the "end" of the disk, but 5872 * the end of the disk (vdev_psize) is aligned to 5873 * sizeof (vdev_label_t). 5874 */ 5875 uint64_t psize = P2ALIGN(fsize, sizeof (vdev_label_t)); 5876 if ((leaf & 1) == 1 && 5877 offset + sizeof (bad) > psize - VDEV_LABEL_END_SIZE) 5878 continue; 5879 5880 mutex_enter(&ztest_vdev_lock); 5881 if (mirror_save != zs->zs_mirrors) { 5882 mutex_exit(&ztest_vdev_lock); 5883 (void) close(fd); 5884 return; 5885 } 5886 5887 if (pwrite(fd, &bad, sizeof (bad), offset) != sizeof (bad)) 5888 fatal(1, "can't inject bad word at 0x%llx in %s", 5889 offset, pathrand); 5890 5891 mutex_exit(&ztest_vdev_lock); 5892 5893 if (ztest_opts.zo_verbose >= 7) 5894 (void) printf("injected bad word into %s," 5895 " offset 0x%llx\n", pathrand, (u_longlong_t)offset); 5896 } 5897 5898 (void) close(fd); 5899 } 5900 5901 /* 5902 * Verify that DDT repair works as expected. 
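* The idea, roughly: force dedup on for the dataset, write ZIO_DEDUPDITTO_MIN * 2 identical copies of one block so the DDT keeps a ditto copy, sync, then clobber the block's on-disk data with an INDUCE_DAMAGE rewrite; a later read should self-heal from the duplicate.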
5903 */ 5904 void 5905 ztest_ddt_repair(ztest_ds_t *zd, uint64_t id) 5906 { 5907 ztest_shared_t *zs = ztest_shared; 5908 spa_t *spa = ztest_spa; 5909 objset_t *os = zd->zd_os; 5910 ztest_od_t od[1]; 5911 uint64_t object, blocksize, txg, pattern, psize; 5912 enum zio_checksum checksum = spa_dedup_checksum(spa); 5913 dmu_buf_t *db; 5914 dmu_tx_t *tx; 5915 abd_t *abd; 5916 blkptr_t blk; 5917 int copies = 2 * ZIO_DEDUPDITTO_MIN; 5918 5919 blocksize = ztest_random_blocksize(); 5920 blocksize = MIN(blocksize, 2048); /* because we write so many */ 5921 5922 ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, 5923 0, 0); 5924 5925 if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0) 5926 return; 5927 5928 /* 5929 * Take the name lock as writer to prevent anyone else from changing 5930 * the pool and dataset properties we need to maintain during this test. 5931 */ 5932 rw_enter(&ztest_name_lock, RW_WRITER); 5933 5934 if (ztest_dsl_prop_set_uint64(zd->zd_name, ZFS_PROP_DEDUP, checksum, 5935 B_FALSE) != 0 || 5936 ztest_dsl_prop_set_uint64(zd->zd_name, ZFS_PROP_COPIES, 1, 5937 B_FALSE) != 0) { 5938 rw_exit(&ztest_name_lock); 5939 return; 5940 } 5941 5942 dmu_objset_stats_t dds; 5943 dsl_pool_config_enter(dmu_objset_pool(os), FTAG); 5944 dmu_objset_fast_stat(os, &dds); 5945 dsl_pool_config_exit(dmu_objset_pool(os), FTAG); 5946 5947 object = od[0].od_object; 5948 blocksize = od[0].od_blocksize; 5949 pattern = zs->zs_guid ^ dds.dds_guid; 5950 5951 ASSERT(object != 0); 5952 5953 tx = dmu_tx_create(os); 5954 dmu_tx_hold_write(tx, object, 0, copies * blocksize); 5955 txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); 5956 if (txg == 0) { 5957 rw_exit(&ztest_name_lock); 5958 return; 5959 } 5960 5961 /* 5962 * Write all the copies of our block. 5963 */ 5964 for (int i = 0; i < copies; i++) { 5965 uint64_t offset = i * blocksize; 5966 int error = dmu_buf_hold(os, object, offset, FTAG, &db, 5967 DMU_READ_NO_PREFETCH); 5968 if (error != 0) { 5969 fatal(B_FALSE, "dmu_buf_hold(%p, %llu, %llu) = %u", 5970 os, (u_longlong_t)object, (u_longlong_t)offset, error); 5971 } 5972 ASSERT(db->db_offset == offset); 5973 ASSERT(db->db_size == blocksize); 5974 ASSERT(ztest_pattern_match(db->db_data, db->db_size, pattern) || 5975 ztest_pattern_match(db->db_data, db->db_size, 0ULL)); 5976 dmu_buf_will_fill(db, tx); 5977 ztest_pattern_set(db->db_data, db->db_size, pattern); 5978 dmu_buf_rele(db, FTAG); 5979 } 5980 5981 dmu_tx_commit(tx); 5982 txg_wait_synced(spa_get_dsl(spa), txg); 5983 5984 /* 5985 * Find out what block we got. 5986 */ 5987 VERIFY0(dmu_buf_hold(os, object, 0, FTAG, &db, 5988 DMU_READ_NO_PREFETCH)); 5989 blk = *((dmu_buf_impl_t *)db)->db_blkptr; 5990 dmu_buf_rele(db, FTAG); 5991 5992 /* 5993 * Damage the block. Dedup-ditto will save us when we read it later. 5994 */ 5995 psize = BP_GET_PSIZE(&blk); 5996 abd = abd_alloc_linear(psize, B_TRUE); 5997 ztest_pattern_set(abd_to_buf(abd), psize, ~pattern); 5998 5999 (void) zio_wait(zio_rewrite(NULL, spa, 0, &blk, 6000 abd, psize, NULL, NULL, ZIO_PRIORITY_SYNC_WRITE, 6001 ZIO_FLAG_CANFAIL | ZIO_FLAG_INDUCE_DAMAGE, NULL)); 6002 6003 abd_free(abd); 6004 6005 rw_exit(&ztest_name_lock); 6006 } 6007 6008 /* 6009 * Scrub the pool. 6010 */ 6011 /* ARGSUSED */ 6012 void 6013 ztest_scrub(ztest_ds_t *zd, uint64_t id) 6014 { 6015 spa_t *spa = ztest_spa; 6016 6017 /* 6018 * Don't start a scrub while a device removal is in progress.
6019 */ 6020 if (ztest_device_removal_active) 6021 return; 6022 6023 (void) spa_scan(spa, POOL_SCAN_SCRUB); 6024 (void) poll(NULL, 0, 100); /* wait a moment, then force a restart */ 6025 (void) spa_scan(spa, POOL_SCAN_SCRUB); 6026 } 6027 6028 /* 6029 * Change the guid for the pool. 6030 */ 6031 /* ARGSUSED */ 6032 void 6033 ztest_reguid(ztest_ds_t *zd, uint64_t id) 6034 { 6035 spa_t *spa = ztest_spa; 6036 uint64_t orig, load; 6037 int error; 6038 6039 if (ztest_opts.zo_mmp_test) 6040 return; 6041 6042 orig = spa_guid(spa); 6043 load = spa_load_guid(spa); 6044 6045 rw_enter(&ztest_name_lock, RW_WRITER); 6046 error = spa_change_guid(spa); 6047 rw_exit(&ztest_name_lock); 6048 6049 if (error != 0) 6050 return; 6051 6052 if (ztest_opts.zo_verbose >= 4) { 6053 (void) printf("Changed guid old %llu -> %llu\n", 6054 (u_longlong_t)orig, (u_longlong_t)spa_guid(spa)); 6055 } 6056 6057 VERIFY3U(orig, !=, spa_guid(spa)); 6058 VERIFY3U(load, ==, spa_load_guid(spa)); 6059 } 6060 6061 static vdev_t * 6062 ztest_random_concrete_vdev_leaf(vdev_t *vd) 6063 { 6064 if (vd == NULL) 6065 return (NULL); 6066 6067 if (vd->vdev_children == 0) 6068 return (vd); 6069 6070 vdev_t *eligible[vd->vdev_children]; 6071 int eligible_idx = 0, i; 6072 for (i = 0; i < vd->vdev_children; i++) { 6073 vdev_t *cvd = vd->vdev_child[i]; 6074 if (cvd->vdev_top->vdev_removing) 6075 continue; 6076 if (cvd->vdev_children > 0 || 6077 (vdev_is_concrete(cvd) && !cvd->vdev_detached)) { 6078 eligible[eligible_idx++] = cvd; 6079 } 6080 } 6081 VERIFY(eligible_idx > 0); 6082 6083 uint64_t child_no = ztest_random(eligible_idx); 6084 return (ztest_random_concrete_vdev_leaf(eligible[child_no])); 6085 } 6086 6087 /* ARGSUSED */ 6088 void 6089 ztest_initialize(ztest_ds_t *zd, uint64_t id) 6090 { 6091 spa_t *spa = ztest_spa; 6092 int error = 0; 6093 6094 mutex_enter(&ztest_vdev_lock); 6095 6096 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 6097 6098 /* Random leaf vdev */ 6099 vdev_t *rand_vd = ztest_random_concrete_vdev_leaf(spa->spa_root_vdev); 6100 if (rand_vd == NULL) { 6101 spa_config_exit(spa, SCL_VDEV, FTAG); 6102 mutex_exit(&ztest_vdev_lock); 6103 return; 6104 } 6105 6106 /* 6107 * The random vdev we've selected may change as soon as we 6108 * drop the spa_config_lock. We create local copies of things 6109 * we're interested in. 
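 * (the vdev guid, its path, and whether an initialize is already running).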
6110 */ 6111 uint64_t guid = rand_vd->vdev_guid; 6112 char *path = strdup(rand_vd->vdev_path); 6113 boolean_t active = rand_vd->vdev_initialize_thread != NULL; 6114 6115 zfs_dbgmsg("vd %p, guid %llu", rand_vd, guid); 6116 spa_config_exit(spa, SCL_VDEV, FTAG); 6117 6118 uint64_t cmd = ztest_random(POOL_INITIALIZE_FUNCS); 6119 6120 nvlist_t *vdev_guids = fnvlist_alloc(); 6121 nvlist_t *vdev_errlist = fnvlist_alloc(); 6122 fnvlist_add_uint64(vdev_guids, path, guid); 6123 error = spa_vdev_initialize(spa, vdev_guids, cmd, vdev_errlist); 6124 fnvlist_free(vdev_guids); 6125 fnvlist_free(vdev_errlist); 6126 6127 switch (cmd) { 6128 case POOL_INITIALIZE_CANCEL: 6129 if (ztest_opts.zo_verbose >= 4) { 6130 (void) printf("Cancel initialize %s", path); 6131 if (!active) 6132 (void) printf(" failed (no initialize active)"); 6133 (void) printf("\n"); 6134 } 6135 break; 6136 case POOL_INITIALIZE_START: 6137 if (ztest_opts.zo_verbose >= 4) { 6138 (void) printf("Start initialize %s", path); 6139 if (active && error == 0) 6140 (void) printf(" failed (already active)"); 6141 else if (error != 0) 6142 (void) printf(" failed (error %d)", error); 6143 (void) printf("\n"); 6144 } 6145 break; 6146 case POOL_INITIALIZE_SUSPEND: 6147 if (ztest_opts.zo_verbose >= 4) { 6148 (void) printf("Suspend initialize %s", path); 6149 if (!active) 6150 (void) printf(" failed (no initialize active)"); 6151 (void) printf("\n"); 6152 } 6153 break; 6154 } 6155 free(path); 6156 mutex_exit(&ztest_vdev_lock); 6157 } 6158 6159 /* ARGSUSED */ 6160 void 6161 ztest_trim(ztest_ds_t *zd, uint64_t id) 6162 { 6163 spa_t *spa = ztest_spa; 6164 int error = 0; 6165 6166 mutex_enter(&ztest_vdev_lock); 6167 6168 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 6169 6170 /* Random leaf vdev */ 6171 vdev_t *rand_vd = ztest_random_concrete_vdev_leaf(spa->spa_root_vdev); 6172 if (rand_vd == NULL) { 6173 spa_config_exit(spa, SCL_VDEV, FTAG); 6174 mutex_exit(&ztest_vdev_lock); 6175 return; 6176 } 6177 6178 /* 6179 * The random vdev we've selected may change as soon as we 6180 * drop the spa_config_lock. We create local copies of things 6181 * we're interested in. 
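 * (the vdev guid, its path, and whether a TRIM is already running).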
6182 */ 6183 uint64_t guid = rand_vd->vdev_guid; 6184 char *path = strdup(rand_vd->vdev_path); 6185 boolean_t active = rand_vd->vdev_trim_thread != NULL; 6186 6187 zfs_dbgmsg("vd %p, guid %llu", rand_vd, guid); 6188 spa_config_exit(spa, SCL_VDEV, FTAG); 6189 6190 uint64_t cmd = ztest_random(POOL_TRIM_FUNCS); 6191 uint64_t rate = 1 << ztest_random(30); 6192 boolean_t partial = (ztest_random(5) > 0); 6193 boolean_t secure = (ztest_random(5) > 0); 6194 6195 nvlist_t *vdev_guids = fnvlist_alloc(); 6196 nvlist_t *vdev_errlist = fnvlist_alloc(); 6197 fnvlist_add_uint64(vdev_guids, path, guid); 6198 error = spa_vdev_trim(spa, vdev_guids, cmd, rate, partial, 6199 secure, vdev_errlist); 6200 fnvlist_free(vdev_guids); 6201 fnvlist_free(vdev_errlist); 6202 6203 switch (cmd) { 6204 case POOL_TRIM_CANCEL: 6205 if (ztest_opts.zo_verbose >= 4) { 6206 (void) printf("Cancel TRIM %s", path); 6207 if (!active) 6208 (void) printf(" failed (no TRIM active)"); 6209 (void) printf("\n"); 6210 } 6211 break; 6212 case POOL_TRIM_START: 6213 if (ztest_opts.zo_verbose >= 4) { 6214 (void) printf("Start TRIM %s", path); 6215 if (active && error == 0) 6216 (void) printf(" failed (already active)"); 6217 else if (error != 0) 6218 (void) printf(" failed (error %d)", error); 6219 (void) printf("\n"); 6220 } 6221 break; 6222 case POOL_TRIM_SUSPEND: 6223 if (ztest_opts.zo_verbose >= 4) { 6224 (void) printf("Suspend TRIM %s", path); 6225 if (!active) 6226 (void) printf(" failed (no TRIM active)"); 6227 (void) printf("\n"); 6228 } 6229 break; 6230 } 6231 free(path); 6232 mutex_exit(&ztest_vdev_lock); 6233 } 6234 6235 /* 6236 * Verify pool integrity by running zdb. 6237 */ 6238 static void 6239 ztest_run_zdb(char *pool) 6240 { 6241 int status; 6242 char zdb[MAXPATHLEN + MAXNAMELEN + 20]; 6243 char zbuf[1024]; 6244 FILE *fp; 6245 6246 (void) snprintf(zdb, sizeof (zdb), 6247 "/usr/sbin/zdb -bcc%s%s -G -d -U %s " 6248 "-o zfs_reconstruct_indirect_combinations_max=65536 %s", 6249 ztest_opts.zo_verbose >= 3 ? "s" : "", 6250 ztest_opts.zo_verbose >= 4 ? "v" : "", 6251 spa_config_path, 6252 pool); 6253 6254 if (ztest_opts.zo_verbose >= 5) 6255 (void) printf("Executing %s\n", strstr(zdb, "zdb ")); 6256 6257 fp = popen(zdb, "r"); 6258 6259 while (fgets(zbuf, sizeof (zbuf), fp) != NULL) 6260 if (ztest_opts.zo_verbose >= 3) 6261 (void) printf("%s", zbuf); 6262 6263 status = pclose(fp); 6264 6265 if (status == 0) 6266 return; 6267 6268 ztest_dump_core = 0; 6269 if (WIFEXITED(status)) 6270 fatal(0, "'%s' exit code %d", zdb, WEXITSTATUS(status)); 6271 else 6272 fatal(0, "'%s' died with signal %d", zdb, WTERMSIG(status)); 6273 } 6274 6275 static void 6276 ztest_walk_pool_directory(char *header) 6277 { 6278 spa_t *spa = NULL; 6279 6280 if (ztest_opts.zo_verbose >= 6) 6281 (void) printf("%s\n", header); 6282 6283 mutex_enter(&spa_namespace_lock); 6284 while ((spa = spa_next(spa)) != NULL) 6285 if (ztest_opts.zo_verbose >= 6) 6286 (void) printf("\t%s\n", spa_name(spa)); 6287 mutex_exit(&spa_namespace_lock); 6288 } 6289 6290 static void 6291 ztest_spa_import_export(char *oldname, char *newname) 6292 { 6293 nvlist_t *config, *newconfig; 6294 uint64_t pool_guid; 6295 spa_t *spa; 6296 int error; 6297 6298 if (ztest_opts.zo_verbose >= 4) { 6299 (void) printf("import/export: old = %s, new = %s\n", 6300 oldname, newname); 6301 } 6302 6303 /* 6304 * Clean up from previous runs. 6305 */ 6306 (void) spa_destroy(newname); 6307 6308 /* 6309 * Get the pool's configuration and guid. 
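 * The guid is remembered so that, after the export/import dance below, * we can verify that the pool reopened under the new name is the same * pool we started with.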
6310 */ 6311 VERIFY3U(0, ==, spa_open(oldname, &spa, FTAG)); 6312 6313 /* 6314 * Kick off a scrub to tickle scrub/export races. 6315 */ 6316 if (ztest_random(2) == 0) 6317 (void) spa_scan(spa, POOL_SCAN_SCRUB); 6318 6319 pool_guid = spa_guid(spa); 6320 spa_close(spa, FTAG); 6321 6322 ztest_walk_pool_directory("pools before export"); 6323 6324 /* 6325 * Export it. 6326 */ 6327 VERIFY3U(0, ==, spa_export(oldname, &config, B_FALSE, B_FALSE)); 6328 6329 ztest_walk_pool_directory("pools after export"); 6330 6331 /* 6332 * Try to import it. 6333 */ 6334 newconfig = spa_tryimport(config); 6335 ASSERT(newconfig != NULL); 6336 nvlist_free(newconfig); 6337 6338 /* 6339 * Import it under the new name. 6340 */ 6341 error = spa_import(newname, config, NULL, 0); 6342 if (error != 0) { 6343 dump_nvlist(config, 0); 6344 fatal(B_FALSE, "couldn't import pool %s as %s: error %u", 6345 oldname, newname, error); 6346 } 6347 6348 ztest_walk_pool_directory("pools after import"); 6349 6350 /* 6351 * Try to import it again -- should fail with EEXIST. 6352 */ 6353 VERIFY3U(EEXIST, ==, spa_import(newname, config, NULL, 0)); 6354 6355 /* 6356 * Try to import it under a different name -- should fail with EEXIST. 6357 */ 6358 VERIFY3U(EEXIST, ==, spa_import(oldname, config, NULL, 0)); 6359 6360 /* 6361 * Verify that the pool is no longer visible under the old name. 6362 */ 6363 VERIFY3U(ENOENT, ==, spa_open(oldname, &spa, FTAG)); 6364 6365 /* 6366 * Verify that we can open and close the pool using the new name. 6367 */ 6368 VERIFY3U(0, ==, spa_open(newname, &spa, FTAG)); 6369 ASSERT(pool_guid == spa_guid(spa)); 6370 spa_close(spa, FTAG); 6371 6372 nvlist_free(config); 6373 } 6374 6375 static void 6376 ztest_resume(spa_t *spa) 6377 { 6378 if (spa_suspended(spa) && ztest_opts.zo_verbose >= 6) 6379 (void) printf("resuming from suspended state\n"); 6380 spa_vdev_state_enter(spa, SCL_NONE); 6381 vdev_clear(spa, NULL); 6382 (void) spa_vdev_state_exit(spa, NULL, 0); 6383 (void) zio_resume(spa); 6384 } 6385 6386 static void * 6387 ztest_resume_thread(void *arg) 6388 { 6389 spa_t *spa = arg; 6390 6391 while (!ztest_exiting) { 6392 if (spa_suspended(spa)) 6393 ztest_resume(spa); 6394 (void) poll(NULL, 0, 100); 6395 6396 /* 6397 * Periodically change the zfs_compressed_arc_enabled setting. 6398 */ 6399 if (ztest_random(10) == 0) 6400 zfs_compressed_arc_enabled = ztest_random(2); 6401 6402 /* 6403 * Periodically change the zfs_abd_scatter_enabled setting. 6404 */ 6405 if (ztest_random(10) == 0) 6406 zfs_abd_scatter_enabled = ztest_random(2); 6407 } 6408 return (NULL); 6409 } 6410 6411 static void * 6412 ztest_deadman_thread(void *arg) 6413 { 6414 ztest_shared_t *zs = arg; 6415 spa_t *spa = ztest_spa; 6416 hrtime_t delta, total = 0; 6417 6418 for (;;) { 6419 delta = zs->zs_thread_stop - zs->zs_thread_start + 6420 MSEC2NSEC(zfs_deadman_synctime_ms); 6421 6422 (void) poll(NULL, 0, (int)NSEC2MSEC(delta)); 6423 6424 /* 6425 * If the pool is suspended then fail immediately. Otherwise, 6426 * check to see if the pool is making any progress. If 6427 * vdev_deadman() discovers that there hasn't been any recent 6428 * I/O, then it will end up aborting the tests.
6429 */ 6430 if (spa_suspended(spa) || spa->spa_root_vdev == NULL) { 6431 fatal(0, "aborting test after %llu seconds because " 6432 "pool has transitioned to a suspended state.", 6433 zfs_deadman_synctime_ms / 1000); 6434 return (NULL); 6435 } 6436 vdev_deadman(spa->spa_root_vdev); 6437 6438 total += zfs_deadman_synctime_ms/1000; 6439 (void) printf("ztest has been running for %lld seconds\n", 6440 total); 6441 } 6442 } 6443 6444 static void 6445 ztest_execute(int test, ztest_info_t *zi, uint64_t id) 6446 { 6447 ztest_ds_t *zd = &ztest_ds[id % ztest_opts.zo_datasets]; 6448 ztest_shared_callstate_t *zc = ZTEST_GET_SHARED_CALLSTATE(test); 6449 hrtime_t functime = gethrtime(); 6450 6451 for (int i = 0; i < zi->zi_iters; i++) 6452 zi->zi_func(zd, id); 6453 6454 functime = gethrtime() - functime; 6455 6456 atomic_add_64(&zc->zc_count, 1); 6457 atomic_add_64(&zc->zc_time, functime); 6458 6459 if (ztest_opts.zo_verbose >= 4) { 6460 Dl_info dli; 6461 (void) dladdr((void *)zi->zi_func, &dli); 6462 (void) printf("%6.2f sec in %s\n", 6463 (double)functime / NANOSEC, dli.dli_sname); 6464 } 6465 } 6466 6467 static void * 6468 ztest_thread(void *arg) 6469 { 6470 int rand; 6471 uint64_t id = (uintptr_t)arg; 6472 ztest_shared_t *zs = ztest_shared; 6473 uint64_t call_next; 6474 hrtime_t now; 6475 ztest_info_t *zi; 6476 ztest_shared_callstate_t *zc; 6477 6478 while ((now = gethrtime()) < zs->zs_thread_stop) { 6479 /* 6480 * See if it's time to force a crash. 6481 */ 6482 if (now > zs->zs_thread_kill) 6483 ztest_kill(zs); 6484 6485 /* 6486 * If we're getting ENOSPC with some regularity, stop. 6487 */ 6488 if (zs->zs_enospc_count > 10) 6489 break; 6490 6491 /* 6492 * Pick a random function to execute. 6493 */ 6494 rand = ztest_random(ZTEST_FUNCS); 6495 zi = &ztest_info[rand]; 6496 zc = ZTEST_GET_SHARED_CALLSTATE(rand); 6497 call_next = zc->zc_next; 6498 6499 if (now >= call_next && 6500 atomic_cas_64(&zc->zc_next, call_next, call_next + 6501 ztest_random(2 * zi->zi_interval[0] + 1)) == call_next) { 6502 ztest_execute(rand, zi, id); 6503 } 6504 } 6505 6506 return (NULL); 6507 } 6508 6509 static void 6510 ztest_dataset_name(char *dsname, char *pool, int d) 6511 { 6512 (void) snprintf(dsname, ZFS_MAX_DATASET_NAME_LEN, "%s/ds_%d", pool, d); 6513 } 6514 6515 static void 6516 ztest_dataset_destroy(int d) 6517 { 6518 char name[ZFS_MAX_DATASET_NAME_LEN]; 6519 6520 ztest_dataset_name(name, ztest_opts.zo_pool, d); 6521 6522 if (ztest_opts.zo_verbose >= 3) 6523 (void) printf("Destroying %s to free up space\n", name); 6524 6525 /* 6526 * Cleanup any non-standard clones and snapshots. In general, 6527 * ztest thread t operates on dataset (t % zopt_datasets), 6528 * so there may be more than one thing to clean up. 6529 */ 6530 for (int t = d; t < ztest_opts.zo_threads; 6531 t += ztest_opts.zo_datasets) { 6532 ztest_dsl_dataset_cleanup(name, t); 6533 } 6534 6535 (void) dmu_objset_find(name, ztest_objset_destroy_cb, NULL, 6536 DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN); 6537 } 6538 6539 static void 6540 ztest_dataset_dirobj_verify(ztest_ds_t *zd) 6541 { 6542 uint64_t usedobjs, dirobjs, scratch; 6543 6544 /* 6545 * ZTEST_DIROBJ is the object directory for the entire dataset. 6546 * Therefore, the number of objects in use should equal the 6547 * number of ZTEST_DIROBJ entries, +1 for ZTEST_DIROBJ itself. 6548 * If not, we have an object leak. 6549 * 6550 * Note that we can only check this in ztest_dataset_open(), 6551 * when the open-context and syncing-context values agree. 
6552 * That's because zap_count() returns the open-context value, 6553 * while dmu_objset_space() returns the rootbp fill count. 6554 */ 6555 VERIFY3U(0, ==, zap_count(zd->zd_os, ZTEST_DIROBJ, &dirobjs)); 6556 dmu_objset_space(zd->zd_os, &scratch, &scratch, &usedobjs, &scratch); 6557 ASSERT3U(dirobjs + 1, ==, usedobjs); 6558 } 6559 6560 static int 6561 ztest_dataset_open(int d) 6562 { 6563 ztest_ds_t *zd = &ztest_ds[d]; 6564 uint64_t committed_seq = ZTEST_GET_SHARED_DS(d)->zd_seq; 6565 objset_t *os; 6566 zilog_t *zilog; 6567 char name[ZFS_MAX_DATASET_NAME_LEN]; 6568 int error; 6569 6570 ztest_dataset_name(name, ztest_opts.zo_pool, d); 6571 6572 rw_enter(&ztest_name_lock, RW_READER); 6573 6574 error = ztest_dataset_create(name); 6575 if (error == ENOSPC) { 6576 rw_exit(&ztest_name_lock); 6577 ztest_record_enospc(FTAG); 6578 return (error); 6579 } 6580 ASSERT(error == 0 || error == EEXIST); 6581 6582 VERIFY0(ztest_dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, 6583 B_TRUE, zd, &os)); 6584 rw_exit(&ztest_name_lock); 6585 6586 ztest_zd_init(zd, ZTEST_GET_SHARED_DS(d), os); 6587 6588 zilog = zd->zd_zilog; 6589 6590 if (zilog->zl_header->zh_claim_lr_seq != 0 && 6591 zilog->zl_header->zh_claim_lr_seq < committed_seq) 6592 fatal(0, "missing log records: claimed %llu < committed %llu", 6593 zilog->zl_header->zh_claim_lr_seq, committed_seq); 6594 6595 ztest_dataset_dirobj_verify(zd); 6596 6597 zil_replay(os, zd, ztest_replay_vector); 6598 6599 ztest_dataset_dirobj_verify(zd); 6600 6601 if (ztest_opts.zo_verbose >= 6) 6602 (void) printf("%s replay %llu blocks, %llu records, seq %llu\n", 6603 zd->zd_name, 6604 (u_longlong_t)zilog->zl_parse_blk_count, 6605 (u_longlong_t)zilog->zl_parse_lr_count, 6606 (u_longlong_t)zilog->zl_replaying_seq); 6607 6608 zilog = zil_open(os, ztest_get_data); 6609 6610 if (zilog->zl_replaying_seq != 0 && 6611 zilog->zl_replaying_seq < committed_seq) 6612 fatal(0, "missing log records: replayed %llu < committed %llu", 6613 zilog->zl_replaying_seq, committed_seq); 6614 6615 return (0); 6616 } 6617 6618 static void 6619 ztest_dataset_close(int d) 6620 { 6621 ztest_ds_t *zd = &ztest_ds[d]; 6622 6623 zil_close(zd->zd_zilog); 6624 dmu_objset_disown(zd->zd_os, B_TRUE, zd); 6625 6626 ztest_zd_fini(zd); 6627 } 6628 6629 /* 6630 * Kick off threads to run tests on all datasets in parallel. 6631 */ 6632 static void 6633 ztest_run(ztest_shared_t *zs) 6634 { 6635 thread_t *tid; 6636 spa_t *spa; 6637 objset_t *os; 6638 thread_t resume_tid; 6639 int error; 6640 6641 ztest_exiting = B_FALSE; 6642 6643 /* 6644 * Initialize parent/child shared state. 6645 */ 6646 mutex_init(&ztest_checkpoint_lock, NULL, USYNC_THREAD, NULL); 6647 mutex_init(&ztest_vdev_lock, NULL, USYNC_THREAD, NULL); 6648 rw_init(&ztest_name_lock, NULL, USYNC_THREAD, NULL); 6649 6650 zs->zs_thread_start = gethrtime(); 6651 zs->zs_thread_stop = 6652 zs->zs_thread_start + ztest_opts.zo_passtime * NANOSEC; 6653 zs->zs_thread_stop = MIN(zs->zs_thread_stop, zs->zs_proc_stop); 6654 zs->zs_thread_kill = zs->zs_thread_stop; 6655 if (ztest_random(100) < ztest_opts.zo_killrate) { 6656 zs->zs_thread_kill -= 6657 ztest_random(ztest_opts.zo_passtime * NANOSEC); 6658 } 6659 6660 mutex_init(&zcl.zcl_callbacks_lock, NULL, USYNC_THREAD, NULL); 6661 6662 list_create(&zcl.zcl_callbacks, sizeof (ztest_cb_data_t), 6663 offsetof(ztest_cb_data_t, zcd_node)); 6664 6665 /* 6666 * Open our pool. 
6667 */ 6668 kernel_init(FREAD | FWRITE); 6669 VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG)); 6670 metaslab_preload_limit = ztest_random(20) + 1; 6671 ztest_spa = spa; 6672 6673 dmu_objset_stats_t dds; 6674 VERIFY0(ztest_dmu_objset_own(ztest_opts.zo_pool, 6675 DMU_OST_ANY, B_TRUE, B_TRUE, FTAG, &os)); 6676 dsl_pool_config_enter(dmu_objset_pool(os), FTAG); 6677 dmu_objset_fast_stat(os, &dds); 6678 dsl_pool_config_exit(dmu_objset_pool(os), FTAG); 6679 zs->zs_guid = dds.dds_guid; 6680 dmu_objset_disown(os, B_TRUE, FTAG); 6681 6682 spa->spa_dedup_ditto = 2 * ZIO_DEDUPDITTO_MIN; 6683 6684 /* 6685 * We don't expect the pool to suspend unless maxfaults == 0, 6686 * in which case ztest_fault_inject() temporarily takes away 6687 * the only valid replica. 6688 */ 6689 if (MAXFAULTS() == 0) 6690 spa->spa_failmode = ZIO_FAILURE_MODE_WAIT; 6691 else 6692 spa->spa_failmode = ZIO_FAILURE_MODE_PANIC; 6693 6694 /* 6695 * Create a thread to periodically resume suspended I/O. 6696 */ 6697 VERIFY(thr_create(0, 0, ztest_resume_thread, spa, THR_BOUND, 6698 &resume_tid) == 0); 6699 6700 /* 6701 * Create a deadman thread to abort() if we hang. 6702 */ 6703 VERIFY(thr_create(0, 0, ztest_deadman_thread, zs, THR_BOUND, 6704 NULL) == 0); 6705 6706 /* 6707 * Verify that we can safely inquire about any object, 6708 * whether it's allocated or not. To make it interesting, 6709 * we probe a 5-wide window around each power of two. 6710 * This hits all edge cases, including zero and the max. 6711 */ 6712 for (int t = 0; t < 64; t++) { 6713 for (int d = -5; d <= 5; d++) { 6714 error = dmu_object_info(spa->spa_meta_objset, 6715 (1ULL << t) + d, NULL); 6716 ASSERT(error == 0 || error == ENOENT || 6717 error == EINVAL); 6718 } 6719 } 6720 6721 /* 6722 * If we got any ENOSPC errors on the previous run, destroy something. 6723 */ 6724 if (zs->zs_enospc_count != 0) { 6725 int d = ztest_random(ztest_opts.zo_datasets); 6726 ztest_dataset_destroy(d); 6727 } 6728 zs->zs_enospc_count = 0; 6729 6730 tid = umem_zalloc(ztest_opts.zo_threads * sizeof (thread_t), 6731 UMEM_NOFAIL); 6732 6733 if (ztest_opts.zo_verbose >= 4) 6734 (void) printf("starting main threads...\n"); 6735 6736 /* 6737 * Kick off all the tests that run in parallel. 6738 */ 6739 for (int t = 0; t < ztest_opts.zo_threads; t++) { 6740 if (t < ztest_opts.zo_datasets && 6741 ztest_dataset_open(t) != 0) 6742 return; 6743 VERIFY(thr_create(0, 0, ztest_thread, (void *)(uintptr_t)t, 6744 THR_BOUND, &tid[t]) == 0); 6745 } 6746 6747 /* 6748 * Wait for all of the tests to complete. We go in reverse order 6749 * so we don't close datasets while threads are still using them. 6750 */ 6751 for (int t = ztest_opts.zo_threads - 1; t >= 0; t--) { 6752 VERIFY(thr_join(tid[t], NULL, NULL) == 0); 6753 if (t < ztest_opts.zo_datasets) 6754 ztest_dataset_close(t); 6755 } 6756 6757 txg_wait_synced(spa_get_dsl(spa), 0); 6758 6759 zs->zs_alloc = metaslab_class_get_alloc(spa_normal_class(spa)); 6760 zs->zs_space = metaslab_class_get_space(spa_normal_class(spa)); 6761 zfs_dbgmsg_print(FTAG); 6762 6763 umem_free(tid, ztest_opts.zo_threads * sizeof (thread_t)); 6764 6765 /* Kill the resume thread */ 6766 ztest_exiting = B_TRUE; 6767 VERIFY(thr_join(resume_tid, NULL, NULL) == 0); 6768 ztest_resume(spa); 6769 6770 /* 6771 * Right before closing the pool, kick off a bunch of async I/O; 6772 * spa_close() should wait for it to complete. 
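 * Here the async I/O is a series of prefetches against the first few * objects of the MOS (the pool-wide meta objset).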
6773 */ 6774 for (uint64_t object = 1; object < 50; object++) { 6775 dmu_prefetch(spa->spa_meta_objset, object, 0, 0, 1ULL << 20, 6776 ZIO_PRIORITY_SYNC_READ); 6777 } 6778 6779 spa_close(spa, FTAG); 6780 6781 /* 6782 * Verify that we can loop over all pools. 6783 */ 6784 mutex_enter(&spa_namespace_lock); 6785 for (spa = spa_next(NULL); spa != NULL; spa = spa_next(spa)) 6786 if (ztest_opts.zo_verbose > 3) 6787 (void) printf("spa_next: found %s\n", spa_name(spa)); 6788 mutex_exit(&spa_namespace_lock); 6789 6790 /* 6791 * Verify that we can export the pool and reimport it under a 6792 * different name. 6793 */ 6794 if ((ztest_random(2) == 0) && !ztest_opts.zo_mmp_test) { 6795 char name[ZFS_MAX_DATASET_NAME_LEN]; 6796 (void) snprintf(name, sizeof (name), "%s_import", 6797 ztest_opts.zo_pool); 6798 ztest_spa_import_export(ztest_opts.zo_pool, name); 6799 ztest_spa_import_export(name, ztest_opts.zo_pool); 6800 } 6801 6802 kernel_fini(); 6803 6804 list_destroy(&zcl.zcl_callbacks); 6805 6806 mutex_destroy(&zcl.zcl_callbacks_lock); 6807 6808 rw_destroy(&ztest_name_lock); 6809 mutex_destroy(&ztest_vdev_lock); 6810 mutex_destroy(&ztest_checkpoint_lock); 6811 } 6812 6813 static void 6814 ztest_freeze(void) 6815 { 6816 ztest_ds_t *zd = &ztest_ds[0]; 6817 spa_t *spa; 6818 int numloops = 0; 6819 6820 if (ztest_opts.zo_verbose >= 3) 6821 (void) printf("testing spa_freeze()...\n"); 6822 6823 kernel_init(FREAD | FWRITE); 6824 VERIFY3U(0, ==, spa_open(ztest_opts.zo_pool, &spa, FTAG)); 6825 VERIFY3U(0, ==, ztest_dataset_open(0)); 6826 ztest_spa = spa; 6827 6828 /* 6829 * Force the first log block to be transactionally allocated. 6830 * We have to do this before we freeze the pool -- otherwise 6831 * the log chain won't be anchored. 6832 */ 6833 while (BP_IS_HOLE(&zd->zd_zilog->zl_header->zh_log)) { 6834 ztest_dmu_object_alloc_free(zd, 0); 6835 zil_commit(zd->zd_zilog, 0); 6836 } 6837 6838 txg_wait_synced(spa_get_dsl(spa), 0); 6839 6840 /* 6841 * Freeze the pool. This stops spa_sync() from doing anything, 6842 * so that the only way to record changes from now on is the ZIL. 6843 */ 6844 spa_freeze(spa); 6845 6846 /* 6847 * Because it is hard to predict how much space a write will actually 6848 * require beforehand, we leave ourselves some fudge space to write over 6849 * capacity. 6850 */ 6851 uint64_t capacity = metaslab_class_get_space(spa_normal_class(spa)) / 2; 6852 6853 /* 6854 * Run tests that generate log records but don't alter the pool config 6855 * or depend on DSL sync tasks (snapshots, objset create/destroy, etc). 6856 * We do a txg_wait_synced() after each iteration to force the txg 6857 * to increase well beyond the last synced value in the uberblock. 6858 * The ZIL should be OK with that. 6859 * 6860 * Run a random number of times less than zo_maxloops and ensure we do 6861 * not run out of space on the pool. 6862 */ 6863 while (ztest_random(10) != 0 && 6864 numloops++ < ztest_opts.zo_maxloops && 6865 metaslab_class_get_alloc(spa_normal_class(spa)) < capacity) { 6866 ztest_od_t od; 6867 ztest_od_init(&od, 0, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0, 0); 6868 VERIFY0(ztest_object_init(zd, &od, sizeof (od), B_FALSE)); 6869 ztest_io(zd, od.od_object, 6870 ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT); 6871 txg_wait_synced(spa_get_dsl(spa), 0); 6872 } 6873 6874 /* 6875 * Commit all of the changes we just generated. 6876 */ 6877 zil_commit(zd->zd_zilog, 0); 6878 txg_wait_synced(spa_get_dsl(spa), 0); 6879 6880 /* 6881 * Close our dataset and close the pool. 
6882 */ 6883 ztest_dataset_close(0); 6884 spa_close(spa, FTAG); 6885 kernel_fini(); 6886 6887 /* 6888 * Open and close the pool and dataset to induce log replay. 6889 */ 6890 kernel_init(FREAD | FWRITE); 6891 VERIFY3U(0, ==, spa_open(ztest_opts.zo_pool, &spa, FTAG)); 6892 ASSERT(spa_freeze_txg(spa) == UINT64_MAX); 6893 VERIFY3U(0, ==, ztest_dataset_open(0)); 6894 ztest_spa = spa; 6895 txg_wait_synced(spa_get_dsl(spa), 0); 6896 ztest_dataset_close(0); 6897 ztest_reguid(NULL, 0); 6898 6899 spa_close(spa, FTAG); 6900 kernel_fini(); 6901 } 6902 6903 void 6904 print_time(hrtime_t t, char *timebuf) 6905 { 6906 hrtime_t s = t / NANOSEC; 6907 hrtime_t m = s / 60; 6908 hrtime_t h = m / 60; 6909 hrtime_t d = h / 24; 6910 6911 s -= m * 60; 6912 m -= h * 60; 6913 h -= d * 24; 6914 6915 timebuf[0] = '\0'; 6916 6917 if (d) 6918 (void) sprintf(timebuf, 6919 "%llud%02lluh%02llum%02llus", d, h, m, s); 6920 else if (h) 6921 (void) sprintf(timebuf, "%lluh%02llum%02llus", h, m, s); 6922 else if (m) 6923 (void) sprintf(timebuf, "%llum%02llus", m, s); 6924 else 6925 (void) sprintf(timebuf, "%llus", s); 6926 } 6927 6928 static nvlist_t * 6929 make_random_props() 6930 { 6931 nvlist_t *props; 6932 6933 VERIFY(nvlist_alloc(&props, NV_UNIQUE_NAME, 0) == 0); 6934 6935 if (ztest_random(2) == 0) 6936 return (props); 6937 VERIFY(nvlist_add_uint64(props, "autoreplace", 1) == 0); 6938 6939 return (props); 6940 } 6941 6942 /* 6943 * Import a storage pool with the given name. 6944 */ 6945 static void 6946 ztest_import(ztest_shared_t *zs) 6947 { 6948 importargs_t args = { 0 }; 6949 spa_t *spa; 6950 nvlist_t *cfg = NULL; 6951 int nsearch = 1; 6952 char *searchdirs[nsearch]; 6953 char *name = ztest_opts.zo_pool; 6954 int flags = ZFS_IMPORT_MISSING_LOG; 6955 int error; 6956 6957 mutex_init(&ztest_vdev_lock, NULL, MUTEX_DEFAULT, NULL); 6958 rw_init(&ztest_name_lock, NULL, USYNC_THREAD, NULL); 6959 6960 kernel_init(FREAD | FWRITE); 6961 6962 searchdirs[0] = ztest_opts.zo_dir; 6963 args.paths = nsearch; 6964 args.path = searchdirs; 6965 args.can_be_active = B_FALSE; 6966 6967 error = zpool_find_config(NULL, name, &cfg, &args, 6968 &libzpool_config_ops); 6969 if (error) 6970 (void) fatal(0, "No pools found\n"); 6971 6972 VERIFY0(spa_import(name, cfg, NULL, flags)); 6973 VERIFY0(spa_open(name, &spa, FTAG)); 6974 zs->zs_metaslab_sz = 6975 1ULL << spa->spa_root_vdev->vdev_child[0]->vdev_ms_shift; 6976 spa_close(spa, FTAG); 6977 6978 kernel_fini(); 6979 6980 if (!ztest_opts.zo_mmp_test) { 6981 ztest_run_zdb(ztest_opts.zo_pool); 6982 ztest_freeze(); 6983 ztest_run_zdb(ztest_opts.zo_pool); 6984 } 6985 6986 rw_destroy(&ztest_name_lock); 6987 mutex_destroy(&ztest_vdev_lock); 6988 } 6989 6990 /* 6991 * Create a storage pool with the given name and initial vdev size. 6992 * Then test spa_freeze() functionality. 6993 */ 6994 static void 6995 ztest_init(ztest_shared_t *zs) 6996 { 6997 spa_t *spa; 6998 nvlist_t *nvroot, *props; 6999 7000 mutex_init(&ztest_vdev_lock, NULL, USYNC_THREAD, NULL); 7001 mutex_init(&ztest_checkpoint_lock, NULL, USYNC_THREAD, NULL); 7002 rw_init(&ztest_name_lock, NULL, USYNC_THREAD, NULL); 7003 7004 kernel_init(FREAD | FWRITE); 7005 7006 /* 7007 * Create the storage pool. 
7008 */ 7009 (void) spa_destroy(ztest_opts.zo_pool); 7010 ztest_shared->zs_vdev_next_leaf = 0; 7011 zs->zs_splits = 0; 7012 zs->zs_mirrors = ztest_opts.zo_mirrors; 7013 nvroot = make_vdev_root(NULL, NULL, NULL, ztest_opts.zo_vdev_size, 0, 7014 NULL, ztest_opts.zo_raidz, zs->zs_mirrors, 1); 7015 props = make_random_props(); 7016 for (int i = 0; i < SPA_FEATURES; i++) { 7017 char buf[1024]; 7018 7019 /* 7020 * 75% chance of using the log space map feature. We want ztest 7021 * to exercise both the code paths that use the log space map 7022 * feature and the ones that don't. 7023 */ 7024 if (i == SPA_FEATURE_LOG_SPACEMAP && ztest_random(4) == 0) 7025 continue; 7026 7027 (void) snprintf(buf, sizeof (buf), "feature@%s", 7028 spa_feature_table[i].fi_uname); 7029 VERIFY3U(0, ==, nvlist_add_uint64(props, buf, 0)); 7030 } 7031 VERIFY3U(0, ==, 7032 spa_create(ztest_opts.zo_pool, nvroot, props, NULL, NULL)); 7033 nvlist_free(nvroot); 7034 nvlist_free(props); 7035 7036 VERIFY3U(0, ==, spa_open(ztest_opts.zo_pool, &spa, FTAG)); 7037 zs->zs_metaslab_sz = 7038 1ULL << spa->spa_root_vdev->vdev_child[0]->vdev_ms_shift; 7039 7040 spa_close(spa, FTAG); 7041 7042 kernel_fini(); 7043 7044 if (!ztest_opts.zo_mmp_test) { 7045 ztest_run_zdb(ztest_opts.zo_pool); 7046 ztest_freeze(); 7047 ztest_run_zdb(ztest_opts.zo_pool); 7048 } 7049 7050 rw_destroy(&ztest_name_lock); 7051 mutex_destroy(&ztest_vdev_lock); 7052 mutex_destroy(&ztest_checkpoint_lock); 7053 } 7054 7055 static void 7056 setup_data_fd(void) 7057 { 7058 static char ztest_name_data[] = "/tmp/ztest.data.XXXXXX"; 7059 7060 ztest_fd_data = mkstemp(ztest_name_data); 7061 ASSERT3S(ztest_fd_data, >=, 0); 7062 (void) unlink(ztest_name_data); 7063 } 7064 7065 static int 7066 shared_data_size(ztest_shared_hdr_t *hdr) 7067 { 7068 int size; 7069 7070 size = hdr->zh_hdr_size; 7071 size += hdr->zh_opts_size; 7072 size += hdr->zh_size; 7073 size += hdr->zh_stats_size * hdr->zh_stats_count; 7074 size += hdr->zh_ds_size * hdr->zh_ds_count; 7075 7076 return (size); 7077 } 7078 7079 static void 7080 setup_hdr(void) 7081 { 7082 int size; 7083 ztest_shared_hdr_t *hdr; 7084 7085 hdr = (void *)mmap(0, P2ROUNDUP(sizeof (*hdr), getpagesize()), 7086 PROT_READ | PROT_WRITE, MAP_SHARED, ztest_fd_data, 0); 7087 ASSERT(hdr != MAP_FAILED); 7088 7089 VERIFY3U(0, ==, ftruncate(ztest_fd_data, sizeof (ztest_shared_hdr_t))); 7090 7091 hdr->zh_hdr_size = sizeof (ztest_shared_hdr_t); 7092 hdr->zh_opts_size = sizeof (ztest_shared_opts_t); 7093 hdr->zh_size = sizeof (ztest_shared_t); 7094 hdr->zh_stats_size = sizeof (ztest_shared_callstate_t); 7095 hdr->zh_stats_count = ZTEST_FUNCS; 7096 hdr->zh_ds_size = sizeof (ztest_shared_ds_t); 7097 hdr->zh_ds_count = ztest_opts.zo_datasets; 7098 7099 size = shared_data_size(hdr); 7100 VERIFY3U(0, ==, ftruncate(ztest_fd_data, size)); 7101 7102 (void) munmap((caddr_t)hdr, P2ROUNDUP(sizeof (*hdr), getpagesize())); 7103 } 7104 7105 static void 7106 setup_data(void) 7107 { 7108 int size, offset; 7109 ztest_shared_hdr_t *hdr; 7110 uint8_t *buf; 7111 7112 hdr = (void *)mmap(0, P2ROUNDUP(sizeof (*hdr), getpagesize()), 7113 PROT_READ, MAP_SHARED, ztest_fd_data, 0); 7114 ASSERT(hdr != MAP_FAILED); 7115 7116 size = shared_data_size(hdr); 7117 7118 (void) munmap((caddr_t)hdr, P2ROUNDUP(sizeof (*hdr), getpagesize())); 7119 hdr = ztest_shared_hdr = (void *)mmap(0, P2ROUNDUP(size, getpagesize()), 7120 PROT_READ | PROT_WRITE, MAP_SHARED, ztest_fd_data, 0); 7121 ASSERT(hdr != MAP_FAILED); 7122 buf = (uint8_t *)hdr; 7123 7124 offset = hdr->zh_hdr_size; 7125 
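 /* Carve each shared structure out of the mapping, in the order described by the header. */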
ztest_shared_opts = (void *)&buf[offset]; 7126 offset += hdr->zh_opts_size; 7127 ztest_shared = (void *)&buf[offset]; 7128 offset += hdr->zh_size; 7129 ztest_shared_callstate = (void *)&buf[offset]; 7130 offset += hdr->zh_stats_size * hdr->zh_stats_count; 7131 ztest_shared_ds = (void *)&buf[offset]; 7132 } 7133 7134 static boolean_t 7135 exec_child(char *cmd, char *libpath, boolean_t ignorekill, int *statusp) 7136 { 7137 pid_t pid; 7138 int status; 7139 char *cmdbuf = NULL; 7140 7141 pid = fork(); 7142 7143 if (cmd == NULL) { 7144 cmdbuf = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); 7145 (void) strlcpy(cmdbuf, getexecname(), MAXPATHLEN); 7146 cmd = cmdbuf; 7147 } 7148 7149 if (pid == -1) 7150 fatal(1, "fork failed"); 7151 7152 if (pid == 0) { /* child */ 7153 char *emptyargv[2] = { cmd, NULL }; 7154 char fd_data_str[12]; 7155 7156 struct rlimit rl = { 1024, 1024 }; 7157 (void) setrlimit(RLIMIT_NOFILE, &rl); 7158 7159 (void) close(ztest_fd_rand); 7160 VERIFY3U(11, >=, 7161 snprintf(fd_data_str, 12, "%d", ztest_fd_data)); 7162 VERIFY0(setenv("ZTEST_FD_DATA", fd_data_str, 1)); 7163 7164 (void) enable_extended_FILE_stdio(-1, -1); 7165 if (libpath != NULL) 7166 VERIFY(0 == setenv("LD_LIBRARY_PATH", libpath, 1)); 7167 (void) execv(cmd, emptyargv); 7168 ztest_dump_core = B_FALSE; 7169 fatal(B_TRUE, "exec failed: %s", cmd); 7170 } 7171 7172 if (cmdbuf != NULL) { 7173 umem_free(cmdbuf, MAXPATHLEN); 7174 cmd = NULL; 7175 } 7176 7177 while (waitpid(pid, &status, 0) != pid) 7178 continue; 7179 if (statusp != NULL) 7180 *statusp = status; 7181 7182 if (WIFEXITED(status)) { 7183 if (WEXITSTATUS(status) != 0) { 7184 (void) fprintf(stderr, "child exited with code %d\n", 7185 WEXITSTATUS(status)); 7186 exit(2); 7187 } 7188 return (B_FALSE); 7189 } else if (WIFSIGNALED(status)) { 7190 if (!ignorekill || WTERMSIG(status) != SIGKILL) { 7191 (void) fprintf(stderr, "child died with signal %d\n", 7192 WTERMSIG(status)); 7193 exit(3); 7194 } 7195 return (B_TRUE); 7196 } else { 7197 (void) fprintf(stderr, "something strange happened to child\n"); 7198 exit(4); 7199 /* NOTREACHED */ 7200 } 7201 } 7202 7203 static void 7204 ztest_run_init(void) 7205 { 7206 ztest_shared_t *zs = ztest_shared; 7207 7208 /* 7209 * Blow away any existing copy of zpool.cache 7210 */ 7211 (void) remove(spa_config_path); 7212 7213 if (ztest_opts.zo_init == 0) { 7214 if (ztest_opts.zo_verbose >= 1) 7215 (void) printf("Importing pool %s\n", 7216 ztest_opts.zo_pool); 7217 ztest_import(zs); 7218 return; 7219 } 7220 7221 /* 7222 * Create and initialize our storage pool. 7223 */ 7224 for (int i = 1; i <= ztest_opts.zo_init; i++) { 7225 bzero(zs, sizeof (ztest_shared_t)); 7226 if (ztest_opts.zo_verbose >= 3 && 7227 ztest_opts.zo_init != 1) { 7228 (void) printf("ztest_init(), pass %d\n", i); 7229 } 7230 ztest_init(zs); 7231 } 7232 } 7233 7234 int 7235 main(int argc, char **argv) 7236 { 7237 int kills = 0; 7238 int iters = 0; 7239 int older = 0; 7240 int newer = 0; 7241 ztest_shared_t *zs; 7242 ztest_info_t *zi; 7243 ztest_shared_callstate_t *zc; 7244 char timebuf[100]; 7245 char numbuf[NN_NUMBUF_SZ]; 7246 char *cmd; 7247 boolean_t hasalt; 7248 char *fd_data_str = getenv("ZTEST_FD_DATA"); 7249 7250 (void) setvbuf(stdout, NULL, _IOLBF, 0); 7251 7252 dprintf_setup(&argc, argv); 7253 zfs_deadman_synctime_ms = 300000; 7254 /* 7255 * As two-word space map entries may not come up often (especially 7256 * if pool and vdev sizes are small), we want to force at least some 7257 * of them so the feature gets tested.
7258 */ 7259 zfs_force_some_double_word_sm_entries = B_TRUE; 7260 7261 /* 7262 * Verify that even extensively damaged split blocks with many 7263 * segments can be reconstructed in a reasonable amount of time 7264 * when reconstruction is known to be possible. 7265 */ 7266 zfs_reconstruct_indirect_damage_fraction = 4; 7267 7268 ztest_fd_rand = open("/dev/urandom", O_RDONLY); 7269 ASSERT3S(ztest_fd_rand, >=, 0); 7270 7271 if (!fd_data_str) { 7272 process_options(argc, argv); 7273 7274 setup_data_fd(); 7275 setup_hdr(); 7276 setup_data(); 7277 bcopy(&ztest_opts, ztest_shared_opts, 7278 sizeof (*ztest_shared_opts)); 7279 } else { 7280 ztest_fd_data = atoi(fd_data_str); 7281 setup_data(); 7282 bcopy(ztest_shared_opts, &ztest_opts, sizeof (ztest_opts)); 7283 } 7284 ASSERT3U(ztest_opts.zo_datasets, ==, ztest_shared_hdr->zh_ds_count); 7285 7286 /* Override location of zpool.cache */ 7287 VERIFY3U(asprintf((char **)&spa_config_path, "%s/zpool.cache", 7288 ztest_opts.zo_dir), !=, -1); 7289 7290 ztest_ds = umem_alloc(ztest_opts.zo_datasets * sizeof (ztest_ds_t), 7291 UMEM_NOFAIL); 7292 zs = ztest_shared; 7293 7294 if (fd_data_str) { 7295 metaslab_force_ganging = ztest_opts.zo_metaslab_force_ganging; 7296 metaslab_df_alloc_threshold = 7297 zs->zs_metaslab_df_alloc_threshold; 7298 7299 if (zs->zs_do_init) 7300 ztest_run_init(); 7301 else 7302 ztest_run(zs); 7303 exit(0); 7304 } 7305 7306 hasalt = (strlen(ztest_opts.zo_alt_ztest) != 0); 7307 7308 if (ztest_opts.zo_verbose >= 1) { 7309 (void) printf("%llu vdevs, %d datasets, %d threads," 7310 " %llu seconds...\n", 7311 (u_longlong_t)ztest_opts.zo_vdevs, 7312 ztest_opts.zo_datasets, 7313 ztest_opts.zo_threads, 7314 (u_longlong_t)ztest_opts.zo_time); 7315 } 7316 7317 cmd = umem_alloc(MAXNAMELEN, UMEM_NOFAIL); 7318 (void) strlcpy(cmd, getexecname(), MAXNAMELEN); 7319 7320 zs->zs_do_init = B_TRUE; 7321 if (strlen(ztest_opts.zo_alt_ztest) != 0) { 7322 if (ztest_opts.zo_verbose >= 1) { 7323 (void) printf("Executing older ztest for " 7324 "initialization: %s\n", ztest_opts.zo_alt_ztest); 7325 } 7326 VERIFY(!exec_child(ztest_opts.zo_alt_ztest, 7327 ztest_opts.zo_alt_libpath, B_FALSE, NULL)); 7328 } else { 7329 VERIFY(!exec_child(NULL, NULL, B_FALSE, NULL)); 7330 } 7331 zs->zs_do_init = B_FALSE; 7332 7333 zs->zs_proc_start = gethrtime(); 7334 zs->zs_proc_stop = zs->zs_proc_start + ztest_opts.zo_time * NANOSEC; 7335 7336 for (int f = 0; f < ZTEST_FUNCS; f++) { 7337 zi = &ztest_info[f]; 7338 zc = ZTEST_GET_SHARED_CALLSTATE(f); 7339 if (zs->zs_proc_start + zi->zi_interval[0] > zs->zs_proc_stop) 7340 zc->zc_next = UINT64_MAX; 7341 else 7342 zc->zc_next = zs->zs_proc_start + 7343 ztest_random(2 * zi->zi_interval[0] + 1); 7344 } 7345 7346 /* 7347 * Run the tests in a loop. These tests include fault injection 7348 * to verify that self-healing data works, and forced crashes 7349 * to verify that we never lose on-disk consistency. 7350 */ 7351 while (gethrtime() < zs->zs_proc_stop) { 7352 int status; 7353 boolean_t killed; 7354 7355 /* 7356 * Initialize the workload counters for each function. 
7357 */ 7358 for (int f = 0; f < ZTEST_FUNCS; f++) { 7359 zc = ZTEST_GET_SHARED_CALLSTATE(f); 7360 zc->zc_count = 0; 7361 zc->zc_time = 0; 7362 } 7363 7364 /* Set the allocation switch size */ 7365 zs->zs_metaslab_df_alloc_threshold = 7366 ztest_random(zs->zs_metaslab_sz / 4) + 1; 7367 7368 if (!hasalt || ztest_random(2) == 0) { 7369 if (hasalt && ztest_opts.zo_verbose >= 1) { 7370 (void) printf("Executing newer ztest: %s\n", 7371 cmd); 7372 } 7373 newer++; 7374 killed = exec_child(cmd, NULL, B_TRUE, &status); 7375 } else { 7376 if (hasalt && ztest_opts.zo_verbose >= 1) { 7377 (void) printf("Executing older ztest: %s\n", 7378 ztest_opts.zo_alt_ztest); 7379 } 7380 older++; 7381 killed = exec_child(ztest_opts.zo_alt_ztest, 7382 ztest_opts.zo_alt_libpath, B_TRUE, &status); 7383 } 7384 7385 if (killed) 7386 kills++; 7387 iters++; 7388 7389 if (ztest_opts.zo_verbose >= 1) { 7390 hrtime_t now = gethrtime(); 7391 7392 now = MIN(now, zs->zs_proc_stop); 7393 print_time(zs->zs_proc_stop - now, timebuf); 7394 nicenum(zs->zs_space, numbuf, sizeof (numbuf)); 7395 7396 (void) printf("Pass %3d, %8s, %3llu ENOSPC, " 7397 "%4.1f%% of %5s used, %3.0f%% done, %8s to go\n", 7398 iters, 7399 WIFEXITED(status) ? "Complete" : "SIGKILL", 7400 (u_longlong_t)zs->zs_enospc_count, 7401 100.0 * zs->zs_alloc / zs->zs_space, 7402 numbuf, 7403 100.0 * (now - zs->zs_proc_start) / 7404 (ztest_opts.zo_time * NANOSEC), timebuf); 7405 } 7406 7407 if (ztest_opts.zo_verbose >= 2) { 7408 (void) printf("\nWorkload summary:\n\n"); 7409 (void) printf("%7s %9s %s\n", 7410 "Calls", "Time", "Function"); 7411 (void) printf("%7s %9s %s\n", 7412 "-----", "----", "--------"); 7413 for (int f = 0; f < ZTEST_FUNCS; f++) { 7414 Dl_info dli; 7415 7416 zi = &ztest_info[f]; 7417 zc = ZTEST_GET_SHARED_CALLSTATE(f); 7418 print_time(zc->zc_time, timebuf); 7419 (void) dladdr((void *)zi->zi_func, &dli); 7420 (void) printf("%7llu %9s %s\n", 7421 (u_longlong_t)zc->zc_count, timebuf, 7422 dli.dli_sname); 7423 } 7424 (void) printf("\n"); 7425 } 7426 7427 if (!ztest_opts.zo_mmp_test) 7428 ztest_run_zdb(ztest_opts.zo_pool); 7429 } 7430 7431 if (ztest_opts.zo_verbose >= 1) { 7432 if (hasalt) { 7433 (void) printf("%d runs of older ztest: %s\n", older, 7434 ztest_opts.zo_alt_ztest); 7435 (void) printf("%d runs of newer ztest: %s\n", newer, 7436 cmd); 7437 } 7438 (void) printf("%d killed, %d completed, %.0f%% kill rate\n", 7439 kills, iters - kills, (100.0 * kills) / MAX(1, iters)); 7440 } 7441 7442 umem_free(cmd, MAXNAMELEN); 7443 7444 return (0); 7445 } 7446