/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2011, 2024 by Delphix. All rights reserved.
 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
 * Copyright (c) 2013 Steven Hartland. All rights reserved.
 * Copyright (c) 2014 Integros [integros.com]
 * Copyright 2017 Joyent, Inc.
 * Copyright (c) 2017, Intel Corporation.
 * Copyright (c) 2023, Klara, Inc.
 */

/*
 * The objective of this program is to provide a DMU/ZAP/SPA stress test
 * that runs entirely in userland, is easy to use, and easy to extend.
 *
 * The overall design of the ztest program is as follows:
 *
 * (1) For each major functional area (e.g. adding vdevs to a pool,
 *     creating and destroying datasets, reading and writing objects, etc)
 *     we have a simple routine to test that functionality. These
 *     individual routines do not have to do anything "stressful".
 *
 * (2) We turn these simple functionality tests into a stress test by
 *     running them all in parallel, with as many threads as desired,
 *     and spread across as many datasets, objects, and vdevs as desired.
 *
 * (3) While all this is happening, we inject faults into the pool to
 *     verify that self-healing data really works.
 *
 * (4) Every time we open a dataset, we change its checksum and compression
 *     functions. Thus even individual objects vary from block to block
 *     in which checksum they use and whether they're compressed.
 *
 * (5) To verify that we never lose on-disk consistency after a crash,
 *     we run the entire test in a child of the main process.
 *     At random times, the child self-immolates with a SIGKILL.
 *     This is the software equivalent of pulling the power cord.
 *     The parent then runs the test again, using the existing
 *     storage pool, as many times as desired. If backwards compatibility
 *     testing is enabled ztest will sometimes run the "older" version
 *     of ztest after a SIGKILL.
 *
 * (6) To verify that we don't have future leaks or temporal incursions,
 *     many of the functional tests record the transaction group number
 *     as part of their data. When reading old data, they verify that
 *     the transaction group number is less than the current, open txg.
 *     If you add a new test, please do this if applicable.
 *
 * (7) Threads are created with a reduced stack size, for sanity checking.
 *     Therefore, it's important not to allocate huge buffers on the stack.
 *
 * When run with no arguments, ztest runs for about five minutes and
 * produces no output if successful. To get a little bit of information,
 * specify -V. To get more information, specify -VV, and so on.
 *
 * To turn this into an overnight stress test, use -T to specify run time.
 *
 * You can ask for more vdevs [-v], datasets [-d], or threads [-t]
 * to increase the pool capacity, fanout, and overall stress level.
 *
 * Use the -k option to set the desired frequency of kills.
 *
 * When ztest invokes itself it passes all relevant information through a
 * temporary file which is mmap-ed in the child process. This allows shared
 * memory to survive the exec syscall. The ztest_shared_hdr_t struct is always
 * stored at offset 0 of this file and contains information on the size and
 * number of shared structures in the file. The information stored in this file
 * must remain backwards compatible with older versions of ztest so that
 * ztest can invoke them during backwards compatibility testing (-B).
 */

#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/dmu.h>
#include <sys/txg.h>
#include <sys/dbuf.h>
#include <sys/zap.h>
#include <sys/dmu_objset.h>
#include <sys/poll.h>
#include <sys/stat.h>
#include <sys/time.h>
#include <sys/wait.h>
#include <sys/mman.h>
#include <sys/resource.h>
#include <sys/zio.h>
#include <sys/zil.h>
#include <sys/zil_impl.h>
#include <sys/vdev_draid.h>
#include <sys/vdev_impl.h>
#include <sys/vdev_file.h>
#include <sys/vdev_initialize.h>
#include <sys/vdev_raidz.h>
#include <sys/vdev_trim.h>
#include <sys/spa_impl.h>
#include <sys/metaslab_impl.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_destroy.h>
#include <sys/dsl_scan.h>
#include <sys/zio_checksum.h>
#include <sys/zfs_refcount.h>
#include <sys/zfeature.h>
#include <sys/dsl_userhold.h>
#include <sys/abd.h>
#include <sys/blake3.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <getopt.h>
#include <signal.h>
#include <umem.h>
#include <ctype.h>
#include <math.h>
#include <sys/fs/zfs.h>
#include <zfs_fletcher.h>
#include <libnvpair.h>
#include <libzutil.h>
#include <sys/crypto/icp.h>
#include <sys/zfs_impl.h>
#include <sys/backtrace.h>

static int ztest_fd_data = -1;
static int ztest_fd_rand = -1;

typedef struct ztest_shared_hdr {
	uint64_t zh_hdr_size;
	uint64_t zh_opts_size;
	uint64_t zh_size;
	uint64_t zh_stats_size;
	uint64_t zh_stats_count;
	uint64_t zh_ds_size;
	uint64_t zh_ds_count;
	uint64_t zh_scratch_state_size;
} ztest_shared_hdr_t;

static ztest_shared_hdr_t *ztest_shared_hdr;

enum ztest_class_state {
	ZTEST_VDEV_CLASS_OFF,
	ZTEST_VDEV_CLASS_ON,
	ZTEST_VDEV_CLASS_RND
};

/* Dedicated RAIDZ Expansion test states */
typedef enum {
	RAIDZ_EXPAND_NONE,		/* Default is none, must opt-in */
	RAIDZ_EXPAND_REQUESTED,		/* The '-X' option was used */
	RAIDZ_EXPAND_STARTED,		/* Testing has commenced */
	RAIDZ_EXPAND_KILLED,		/* Reached the process kill */
	RAIDZ_EXPAND_CHECKED,		/* Pool scrub verification done */
} raidz_expand_test_state_t;


#define	ZO_GVARS_MAX_ARGLEN	((size_t)64)
#define	ZO_GVARS_MAX_COUNT	((size_t)10)

typedef struct ztest_shared_opts {
	char zo_pool[ZFS_MAX_DATASET_NAME_LEN];
	char zo_dir[ZFS_MAX_DATASET_NAME_LEN];
	char zo_alt_ztest[MAXNAMELEN];
	char zo_alt_libpath[MAXNAMELEN];
	uint64_t zo_vdevs;
	uint64_t zo_vdevtime;
	size_t zo_vdev_size;
	int zo_ashift;
	int zo_mirrors;
	int zo_raid_do_expand;
	int zo_raid_children;
	int zo_raid_parity;
	char zo_raid_type[8];
	int zo_draid_data;
	int zo_draid_spares;
	int zo_datasets;
	int zo_threads;
	uint64_t zo_passtime;
	uint64_t zo_killrate;
	int zo_verbose;
	int zo_init;
	uint64_t zo_time;
	uint64_t zo_maxloops;
	uint64_t zo_metaslab_force_ganging;
	raidz_expand_test_state_t zo_raidz_expand_test;
	int zo_mmp_test;
	int zo_special_vdevs;
	int zo_dump_dbgmsg;
	int zo_gvars_count;
	char zo_gvars[ZO_GVARS_MAX_COUNT][ZO_GVARS_MAX_ARGLEN];
} ztest_shared_opts_t;

/* Default values for command line options. */
#define	DEFAULT_POOL "ztest"
#define	DEFAULT_VDEV_DIR "/tmp"
#define	DEFAULT_VDEV_COUNT 5
#define	DEFAULT_VDEV_SIZE (SPA_MINDEVSIZE * 4)	/* 256m default size */
#define	DEFAULT_VDEV_SIZE_STR "256M"
#define	DEFAULT_ASHIFT SPA_MINBLOCKSHIFT
#define	DEFAULT_MIRRORS 2
#define	DEFAULT_RAID_CHILDREN 4
#define	DEFAULT_RAID_PARITY 1
#define	DEFAULT_DRAID_DATA 4
#define	DEFAULT_DRAID_SPARES 1
#define	DEFAULT_DATASETS_COUNT 7
#define	DEFAULT_THREADS 23
#define	DEFAULT_RUN_TIME 300 /* 300 seconds */
#define	DEFAULT_RUN_TIME_STR "300 sec"
#define	DEFAULT_PASS_TIME 60 /* 60 seconds */
#define	DEFAULT_PASS_TIME_STR "60 sec"
#define	DEFAULT_KILL_RATE 70 /* 70% kill rate */
#define	DEFAULT_KILLRATE_STR "70%"
#define	DEFAULT_INITS 1
#define	DEFAULT_MAX_LOOPS 50 /* 5 minutes */
#define	DEFAULT_FORCE_GANGING (64 << 10)
#define	DEFAULT_FORCE_GANGING_STR "64K"

/* Simplifying assumption: -1 is not a valid default. */
#define	NO_DEFAULT -1

static const ztest_shared_opts_t ztest_opts_defaults = {
	.zo_pool = DEFAULT_POOL,
	.zo_dir = DEFAULT_VDEV_DIR,
	.zo_alt_ztest = { '\0' },
	.zo_alt_libpath = { '\0' },
	.zo_vdevs = DEFAULT_VDEV_COUNT,
	.zo_ashift = DEFAULT_ASHIFT,
	.zo_mirrors = DEFAULT_MIRRORS,
	.zo_raid_children = DEFAULT_RAID_CHILDREN,
	.zo_raid_parity = DEFAULT_RAID_PARITY,
	.zo_raid_type = VDEV_TYPE_RAIDZ,
	.zo_vdev_size = DEFAULT_VDEV_SIZE,
	.zo_draid_data = DEFAULT_DRAID_DATA,	/* data drives */
	.zo_draid_spares = DEFAULT_DRAID_SPARES,	/* distributed spares */
	.zo_datasets = DEFAULT_DATASETS_COUNT,
	.zo_threads = DEFAULT_THREADS,
	.zo_passtime = DEFAULT_PASS_TIME,
	.zo_killrate = DEFAULT_KILL_RATE,
	.zo_verbose = 0,
	.zo_mmp_test = 0,
	.zo_init = DEFAULT_INITS,
	.zo_time = DEFAULT_RUN_TIME,
	.zo_maxloops = DEFAULT_MAX_LOOPS, /* max loops during spa_freeze() */
	.zo_metaslab_force_ganging = DEFAULT_FORCE_GANGING,
	.zo_special_vdevs = ZTEST_VDEV_CLASS_RND,
	.zo_gvars_count = 0,
	.zo_raidz_expand_test = RAIDZ_EXPAND_NONE,
};

extern uint64_t metaslab_force_ganging;
extern uint64_t metaslab_df_alloc_threshold;
extern uint64_t zfs_deadman_synctime_ms;
extern uint_t metaslab_preload_limit;
extern int zfs_compressed_arc_enabled;
extern int zfs_abd_scatter_enabled;
extern uint_t dmu_object_alloc_chunk_shift;
extern boolean_t zfs_force_some_double_word_sm_entries;
extern unsigned long zio_decompress_fail_fraction;
extern unsigned long zfs_reconstruct_indirect_damage_fraction;
extern uint64_t raidz_expand_max_reflow_bytes;
extern uint_t raidz_expand_pause_point;
extern boolean_t ddt_prune_artificial_age;
extern boolean_t ddt_dump_prune_histogram;

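/*
 * ztest_shared_opts points at the options region of the mmap-ed temporary
 * file described above (shared across exec); ztest_opts is this process's
 * private working copy of the options.
 */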
static ztest_shared_opts_t *ztest_shared_opts;
static ztest_shared_opts_t ztest_opts;
static const char *const ztest_wkeydata = "abcdefghijklmnopqrstuvwxyz012345";

typedef struct ztest_shared_ds {
	uint64_t zd_seq;
} ztest_shared_ds_t;

static ztest_shared_ds_t *ztest_shared_ds;
#define	ZTEST_GET_SHARED_DS(d) (&ztest_shared_ds[d])

typedef struct ztest_scratch_state {
	uint64_t zs_raidz_scratch_verify_pause;
} ztest_shared_scratch_state_t;

static ztest_shared_scratch_state_t *ztest_scratch_state;

#define	BT_MAGIC	0x123456789abcdefULL
#define	MAXFAULTS(zs) \
	(MAX((zs)->zs_mirrors, 1) * (ztest_opts.zo_raid_parity + 1) - 1)

enum ztest_io_type {
	ZTEST_IO_WRITE_TAG,
	ZTEST_IO_WRITE_PATTERN,
	ZTEST_IO_WRITE_ZEROES,
	ZTEST_IO_TRUNCATE,
	ZTEST_IO_SETATTR,
	ZTEST_IO_REWRITE,
	ZTEST_IO_TYPES
};

typedef struct ztest_block_tag {
	uint64_t bt_magic;
	uint64_t bt_objset;
	uint64_t bt_object;
	uint64_t bt_dnodesize;
	uint64_t bt_offset;
	uint64_t bt_gen;
	uint64_t bt_txg;
	uint64_t bt_crtxg;
} ztest_block_tag_t;

typedef struct bufwad {
	uint64_t bw_index;
	uint64_t bw_txg;
	uint64_t bw_data;
} bufwad_t;

/*
 * It would be better to use a rangelock_t per object. Unfortunately
 * the rangelock_t is not a drop-in replacement for rl_t, because we
 * still need to map from object ID to rangelock_t.
 */
typedef enum {
	ZTRL_READER,
	ZTRL_WRITER,
	ZTRL_APPEND
} rl_type_t;

typedef struct rll {
	void *rll_writer;
	int rll_readers;
	kmutex_t rll_lock;
	kcondvar_t rll_cv;
} rll_t;

typedef struct rl {
	uint64_t rl_object;
	uint64_t rl_offset;
	uint64_t rl_size;
	rll_t *rl_lock;
} rl_t;

#define	ZTEST_RANGE_LOCKS	64
#define	ZTEST_OBJECT_LOCKS	64

/*
 * Object descriptor. Used as a template for object lookup/create/remove.
 */
typedef struct ztest_od {
	uint64_t od_dir;
	uint64_t od_object;
	dmu_object_type_t od_type;
	dmu_object_type_t od_crtype;
	uint64_t od_blocksize;
	uint64_t od_crblocksize;
	uint64_t od_crdnodesize;
	uint64_t od_gen;
	uint64_t od_crgen;
	char od_name[ZFS_MAX_DATASET_NAME_LEN];
} ztest_od_t;

/*
 * Per-dataset state.
 */
typedef struct ztest_ds {
	ztest_shared_ds_t *zd_shared;
	objset_t *zd_os;
	pthread_rwlock_t zd_zilog_lock;
	zilog_t *zd_zilog;
	ztest_od_t *zd_od;		/* debugging aid */
	char zd_name[ZFS_MAX_DATASET_NAME_LEN];
	kmutex_t zd_dirobj_lock;
	rll_t zd_object_lock[ZTEST_OBJECT_LOCKS];
	rll_t zd_range_lock[ZTEST_RANGE_LOCKS];
} ztest_ds_t;

/*
 * Per-iteration state.
 */
typedef void ztest_func_t(ztest_ds_t *zd, uint64_t id);

typedef struct ztest_info {
	ztest_func_t *zi_func;		/* test function */
	uint64_t zi_iters;		/* iterations per execution */
	uint64_t *zi_interval;		/* execute every <interval> seconds */
	const char *zi_funcname;	/* name of test function */
} ztest_info_t;

typedef struct ztest_shared_callstate {
	uint64_t zc_count;	/* per-pass count */
	uint64_t zc_time;	/* per-pass time */
	uint64_t zc_next;	/* next time to call this function */
} ztest_shared_callstate_t;

static ztest_shared_callstate_t *ztest_shared_callstate;
#define	ZTEST_GET_SHARED_CALLSTATE(c) (&ztest_shared_callstate[c])

ztest_func_t ztest_dmu_read_write;
ztest_func_t ztest_dmu_write_parallel;
ztest_func_t ztest_dmu_object_alloc_free;
ztest_func_t ztest_dmu_object_next_chunk;
ztest_func_t ztest_dmu_commit_callbacks;
ztest_func_t ztest_zap;
ztest_func_t ztest_zap_parallel;
ztest_func_t ztest_zil_commit;
ztest_func_t ztest_zil_remount;
ztest_func_t ztest_dmu_read_write_zcopy;
ztest_func_t ztest_dmu_objset_create_destroy;
ztest_func_t ztest_dmu_prealloc;
ztest_func_t ztest_fzap;
ztest_func_t ztest_dmu_snapshot_create_destroy;
ztest_func_t ztest_dsl_prop_get_set;
ztest_func_t ztest_spa_prop_get_set;
ztest_func_t ztest_spa_create_destroy;
ztest_func_t ztest_fault_inject;
ztest_func_t ztest_dmu_snapshot_hold;
ztest_func_t ztest_mmp_enable_disable;
ztest_func_t ztest_scrub;
ztest_func_t ztest_dsl_dataset_promote_busy;
ztest_func_t ztest_vdev_attach_detach;
ztest_func_t ztest_vdev_raidz_attach;
ztest_func_t ztest_vdev_LUN_growth;
ztest_func_t ztest_vdev_add_remove;
ztest_func_t ztest_vdev_class_add;
ztest_func_t ztest_vdev_aux_add_remove;
ztest_func_t ztest_split_pool;
ztest_func_t ztest_reguid;
ztest_func_t ztest_spa_upgrade;
ztest_func_t ztest_device_removal;
ztest_func_t ztest_spa_checkpoint_create_discard;
ztest_func_t ztest_initialize;
ztest_func_t ztest_trim;
ztest_func_t ztest_blake3;
ztest_func_t ztest_fletcher;
ztest_func_t ztest_fletcher_incr;
ztest_func_t ztest_verify_dnode_bt;
ztest_func_t ztest_pool_prefetch_ddt;
ztest_func_t ztest_ddt_prune;

static uint64_t zopt_always = 0ULL * NANOSEC;		/* all the time */
static uint64_t zopt_incessant = 1ULL * NANOSEC / 10;	/* every 1/10 second */
static uint64_t zopt_often = 1ULL * NANOSEC;		/* every second */
static uint64_t zopt_sometimes = 10ULL * NANOSEC;	/* every 10 seconds */
static uint64_t zopt_rarely = 60ULL * NANOSEC;		/* every 60 seconds */

#define	ZTI_INIT(func, iters, interval) \
	{ .zi_func = (func), \
	.zi_iters = (iters), \
	.zi_interval = (interval), \
	.zi_funcname = # func }

static ztest_info_t ztest_info[] = {
	ZTI_INIT(ztest_dmu_read_write, 1, &zopt_always),
	ZTI_INIT(ztest_dmu_write_parallel, 10, &zopt_always),
	ZTI_INIT(ztest_dmu_object_alloc_free, 1, &zopt_always),
	ZTI_INIT(ztest_dmu_object_next_chunk, 1, &zopt_sometimes),
	ZTI_INIT(ztest_dmu_commit_callbacks, 1, &zopt_always),
	ZTI_INIT(ztest_zap, 30, &zopt_always),
	ZTI_INIT(ztest_zap_parallel, 100, &zopt_always),
	ZTI_INIT(ztest_split_pool, 1, &zopt_sometimes),
	ZTI_INIT(ztest_zil_commit, 1, &zopt_incessant),
	ZTI_INIT(ztest_zil_remount, 1, &zopt_sometimes),
	ZTI_INIT(ztest_dmu_read_write_zcopy, 1, &zopt_often),
	ZTI_INIT(ztest_dmu_objset_create_destroy, 1, &zopt_often),
	ZTI_INIT(ztest_dsl_prop_get_set, 1, &zopt_often),
	ZTI_INIT(ztest_spa_prop_get_set, 1, &zopt_sometimes),
#if 0
	ZTI_INIT(ztest_dmu_prealloc, 1, &zopt_sometimes),
#endif
	ZTI_INIT(ztest_fzap, 1, &zopt_sometimes),
	ZTI_INIT(ztest_dmu_snapshot_create_destroy, 1, &zopt_sometimes),
	ZTI_INIT(ztest_spa_create_destroy, 1, &zopt_sometimes),
	ZTI_INIT(ztest_fault_inject, 1, &zopt_sometimes),
	ZTI_INIT(ztest_dmu_snapshot_hold, 1, &zopt_sometimes),
	ZTI_INIT(ztest_mmp_enable_disable, 1, &zopt_sometimes),
	ZTI_INIT(ztest_reguid, 1, &zopt_rarely),
	ZTI_INIT(ztest_scrub, 1, &zopt_rarely),
	ZTI_INIT(ztest_spa_upgrade, 1, &zopt_rarely),
	ZTI_INIT(ztest_dsl_dataset_promote_busy, 1, &zopt_rarely),
	ZTI_INIT(ztest_vdev_attach_detach, 1, &zopt_sometimes),
	ZTI_INIT(ztest_vdev_raidz_attach, 1, &zopt_sometimes),
	ZTI_INIT(ztest_vdev_LUN_growth, 1, &zopt_rarely),
	ZTI_INIT(ztest_vdev_add_remove, 1, &ztest_opts.zo_vdevtime),
	ZTI_INIT(ztest_vdev_class_add, 1, &ztest_opts.zo_vdevtime),
	ZTI_INIT(ztest_vdev_aux_add_remove, 1, &ztest_opts.zo_vdevtime),
	ZTI_INIT(ztest_device_removal, 1, &zopt_sometimes),
	ZTI_INIT(ztest_spa_checkpoint_create_discard, 1, &zopt_rarely),
	ZTI_INIT(ztest_initialize, 1, &zopt_sometimes),
	ZTI_INIT(ztest_trim, 1, &zopt_sometimes),
	ZTI_INIT(ztest_blake3, 1, &zopt_rarely),
	ZTI_INIT(ztest_fletcher, 1, &zopt_rarely),
	ZTI_INIT(ztest_fletcher_incr, 1, &zopt_rarely),
	ZTI_INIT(ztest_verify_dnode_bt, 1, &zopt_sometimes),
	ZTI_INIT(ztest_pool_prefetch_ddt, 1, &zopt_rarely),
	ZTI_INIT(ztest_ddt_prune, 1, &zopt_rarely),
};

#define	ZTEST_FUNCS (sizeof (ztest_info) / sizeof (ztest_info_t))

/*
 * The following struct is used to hold a list of uncalled commit callbacks.
 * The callbacks are ordered by txg number.
 */
typedef struct ztest_cb_list {
	kmutex_t zcl_callbacks_lock;
	list_t zcl_callbacks;
} ztest_cb_list_t;

/*
 * Stuff we need to share writably between parent and child.
 */
typedef struct ztest_shared {
	boolean_t zs_do_init;
	hrtime_t zs_proc_start;
	hrtime_t zs_proc_stop;
	hrtime_t zs_thread_start;
	hrtime_t zs_thread_stop;
	hrtime_t zs_thread_kill;
	uint64_t zs_enospc_count;
	uint64_t zs_vdev_next_leaf;
	uint64_t zs_vdev_aux;
	uint64_t zs_alloc;
	uint64_t zs_space;
	uint64_t zs_splits;
	uint64_t zs_mirrors;
	uint64_t zs_metaslab_sz;
	uint64_t zs_metaslab_df_alloc_threshold;
	uint64_t zs_guid;
} ztest_shared_t;

#define	ID_PARALLEL	-1ULL

static char ztest_dev_template[] = "%s/%s.%llua";
static char ztest_aux_template[] = "%s/%s.%s.%llu";
static ztest_shared_t *ztest_shared;

static spa_t *ztest_spa = NULL;
static ztest_ds_t *ztest_ds;

static kmutex_t ztest_vdev_lock;
static boolean_t ztest_device_removal_active = B_FALSE;
static boolean_t ztest_pool_scrubbed = B_FALSE;
static kmutex_t ztest_checkpoint_lock;

/*
 * The ztest_name_lock protects the pool and dataset namespace used by
 * the individual tests. To modify the namespace, consumers must grab
 * this lock as writer. Grabbing the lock as reader will ensure that the
 * namespace does not change while the lock is held.
 */
static pthread_rwlock_t ztest_name_lock;

static boolean_t ztest_dump_core = B_TRUE;
static boolean_t ztest_exiting;

/* Global commit callback list */
static ztest_cb_list_t zcl;
/* Commit cb delay */
static uint64_t zc_min_txg_delay = UINT64_MAX;
static int zc_cb_counter = 0;

/*
 * Minimum number of commit callbacks that need to be registered for us to check
 * whether the minimum txg delay is acceptable.
 */
#define	ZTEST_COMMIT_CB_MIN_REG	100

/*
 * If a number of txgs equal to this threshold have been created after a commit
 * callback has been registered but not called, then we assume there is an
 * implementation bug.
 */
#define	ZTEST_COMMIT_CB_THRESH	(TXG_CONCURRENT_STATES + 1000)

enum ztest_object {
	ZTEST_META_DNODE = 0,
	ZTEST_DIROBJ,
	ZTEST_OBJECTS
};

static __attribute__((noreturn)) void usage(boolean_t requested);
static int ztest_scrub_impl(spa_t *spa);

/*
 * These libumem hooks provide a reasonable set of defaults for the allocator's
 * debugging facilities.
 */
const char *
_umem_debug_init(void)
{
	return ("default,verbose"); /* $UMEM_DEBUG setting */
}

const char *
_umem_logging_init(void)
{
	return ("fail,contents"); /* $UMEM_LOGGING setting */
}

static void
dump_debug_buffer(void)
{
	ssize_t ret __attribute__((unused));

	if (!ztest_opts.zo_dump_dbgmsg)
		return;

	/*
	 * We use write() instead of printf() so that this function
	 * is safe to call from a signal handler.
	 */
	ret = write(STDERR_FILENO, "\n", 1);
	zfs_dbgmsg_print(STDERR_FILENO, "ztest");
}

static void sig_handler(int signo)
{
	struct sigaction action;

	libspl_backtrace(STDERR_FILENO);
	dump_debug_buffer();

	/*
	 * Restore default action and re-raise signal so SIGSEGV and
	 * SIGABRT can trigger a core dump.
	 */
	action.sa_handler = SIG_DFL;
	sigemptyset(&action.sa_mask);
	action.sa_flags = 0;
	(void) sigaction(signo, &action, NULL);
	raise(signo);
}

#define	FATAL_MSG_SZ	1024

static const char *fatal_msg;

static __attribute__((format(printf, 2, 3))) __attribute__((noreturn)) void
fatal(int do_perror, const char *message, ...)
{
	va_list args;
	int save_errno = errno;
	char *buf;

	(void) fflush(stdout);
	buf = umem_alloc(FATAL_MSG_SZ, UMEM_NOFAIL);
	if (buf == NULL)
		goto out;

	va_start(args, message);
	(void) sprintf(buf, "ztest: ");
	/* LINTED */
	(void) vsprintf(buf + strlen(buf), message, args);
	va_end(args);
	if (do_perror) {
		(void) snprintf(buf + strlen(buf), FATAL_MSG_SZ - strlen(buf),
		    ": %s", strerror(save_errno));
	}
	(void) fprintf(stderr, "%s\n", buf);
	fatal_msg = buf;			/* to ease debugging */

out:
	if (ztest_dump_core)
		abort();
	else
		dump_debug_buffer();

	exit(3);
}

static int
str2shift(const char *buf)
{
	const char *ends = "BKMGTPEZ";
	int i, len;

	if (buf[0] == '\0')
		return (0);

	len = strlen(ends);
	for (i = 0; i < len; i++) {
		if (toupper(buf[0]) == ends[i])
			break;
	}
	if (i == len) {
		(void) fprintf(stderr, "ztest: invalid bytes suffix: %s\n",
		    buf);
		usage(B_FALSE);
	}
	if (buf[1] == '\0' || (toupper(buf[1]) == 'B' && buf[2] == '\0')) {
		return (10*i);
	}
	(void) fprintf(stderr, "ztest: invalid bytes suffix: %s\n", buf);
	usage(B_FALSE);
}

static uint64_t
nicenumtoull(const char *buf)
{
	char *end;
	uint64_t val;

	val = strtoull(buf, &end, 0);
	if (end == buf) {
		(void) fprintf(stderr, "ztest: bad numeric value: %s\n", buf);
		usage(B_FALSE);
	} else if (end[0] == '.') {
		double fval = strtod(buf, &end);
		fval *= pow(2, str2shift(end));
		/*
		 * UINT64_MAX is not exactly representable as a double.
		 * The closest representation is UINT64_MAX + 1, so we
		 * use a >= comparison instead of > for the bounds check.
		 */
		if (fval >= (double)UINT64_MAX) {
			(void) fprintf(stderr, "ztest: value too large: %s\n",
			    buf);
			usage(B_FALSE);
		}
		val = (uint64_t)fval;
	} else {
		int shift = str2shift(end);
		if (shift >= 64 || (val << shift) >> shift != val) {
			(void) fprintf(stderr, "ztest: value too large: %s\n",
			    buf);
			usage(B_FALSE);
		}
		val <<= shift;
	}
	return (val);
}

typedef struct ztest_option {
	const char short_opt;
	const char *long_opt;
	const char *long_opt_param;
	const char *comment;
	unsigned int default_int;
	const char *default_str;
} ztest_option_t;

/*
 * The following option_table is used for generating the usage info as well as
 * the long and short option information for calling getopt_long().
 */
static ztest_option_t option_table[] = {
	{ 'v', "vdevs", "INTEGER", "Number of vdevs", DEFAULT_VDEV_COUNT,
	    NULL},
	{ 's', "vdev-size", "INTEGER", "Size of each vdev",
	    NO_DEFAULT, DEFAULT_VDEV_SIZE_STR},
	{ 'a', "alignment-shift", "INTEGER",
	    "Alignment shift; use 0 for random", DEFAULT_ASHIFT, NULL},
	{ 'm', "mirror-copies", "INTEGER", "Number of mirror copies",
	    DEFAULT_MIRRORS, NULL},
	{ 'r', "raid-disks", "INTEGER", "Number of raidz/draid disks",
	    DEFAULT_RAID_CHILDREN, NULL},
	{ 'R', "raid-parity", "INTEGER", "Raid parity",
	    DEFAULT_RAID_PARITY, NULL},
	{ 'K', "raid-kind", "raidz|eraidz|draid|random", "Raid kind",
	    NO_DEFAULT, "random"},
	{ 'D', "draid-data", "INTEGER", "Number of draid data drives",
	    DEFAULT_DRAID_DATA, NULL},
	{ 'S', "draid-spares", "INTEGER", "Number of draid spares",
	    DEFAULT_DRAID_SPARES, NULL},
	{ 'd', "datasets", "INTEGER", "Number of datasets",
	    DEFAULT_DATASETS_COUNT, NULL},
	{ 't', "threads", "INTEGER", "Number of ztest threads",
	    DEFAULT_THREADS, NULL},
	{ 'g', "gang-block-threshold", "INTEGER",
	    "Metaslab gang block threshold",
	    NO_DEFAULT, DEFAULT_FORCE_GANGING_STR},
	{ 'i', "init-count", "INTEGER", "Number of times to initialize pool",
	    DEFAULT_INITS, NULL},
	{ 'k', "kill-percentage", "INTEGER", "Kill percentage",
	    NO_DEFAULT, DEFAULT_KILLRATE_STR},
	{ 'p', "pool-name", "STRING", "Pool name",
	    NO_DEFAULT, DEFAULT_POOL},
	{ 'f', "vdev-file-directory", "PATH", "File directory for vdev files",
	    NO_DEFAULT, DEFAULT_VDEV_DIR},
	{ 'M', "multi-host", NULL,
	    "Multi-host; simulate pool imported on remote host",
	    NO_DEFAULT, NULL},
	{ 'E', "use-existing-pool", NULL,
	    "Use existing pool instead of creating new one", NO_DEFAULT, NULL},
	{ 'T', "run-time", "INTEGER", "Total run time",
	    NO_DEFAULT, DEFAULT_RUN_TIME_STR},
	{ 'P', "pass-time", "INTEGER", "Time per pass",
	    NO_DEFAULT, DEFAULT_PASS_TIME_STR},
	{ 'F', "freeze-loops", "INTEGER", "Max loops in spa_freeze()",
	    DEFAULT_MAX_LOOPS, NULL},
	{ 'B', "alt-ztest", "PATH", "Alternate ztest path",
	    NO_DEFAULT, NULL},
	{ 'C', "vdev-class-state", "on|off|random", "vdev class state",
	    NO_DEFAULT, "random"},
	{ 'X', "raidz-expansion", NULL,
	    "Perform a dedicated raidz expansion test",
	    NO_DEFAULT, NULL},
	{ 'o', "option", "\"OPTION=INTEGER\"",
	    "Set global variable to an unsigned 32-bit integer value",
	    NO_DEFAULT, NULL},
	{ 'G', "dump-debug-msg", NULL,
	    "Dump zfs_dbgmsg buffer before exiting due to an error",
	    NO_DEFAULT, NULL},
	{ 'V', "verbose", NULL,
	    "Verbose (use multiple times for ever more verbosity)",
	    NO_DEFAULT, NULL},
	{ 'h', "help", NULL, "Show this help",
	    NO_DEFAULT, NULL},
	{0, 0, 0, 0, 0, 0}
};

static struct option *long_opts = NULL;
static char *short_opts = NULL;

static void
init_options(void)
{
	ASSERT3P(long_opts, ==, NULL);
	ASSERT3P(short_opts, ==, NULL);

	int count = sizeof (option_table) / sizeof (option_table[0]);
	long_opts = umem_alloc(sizeof (struct option) * count, UMEM_NOFAIL);

	short_opts = umem_alloc(sizeof (char) * 2 * count, UMEM_NOFAIL);
	int short_opt_index = 0;

	for (int i = 0; i < count; i++) {
		long_opts[i].val = option_table[i].short_opt;
		long_opts[i].name = option_table[i].long_opt;
		long_opts[i].has_arg = option_table[i].long_opt_param != NULL
		    ? required_argument : no_argument;
		long_opts[i].flag = NULL;
		short_opts[short_opt_index++] = option_table[i].short_opt;
		if (option_table[i].long_opt_param != NULL) {
			short_opts[short_opt_index++] = ':';
		}
	}
}

static void
fini_options(void)
{
	int count = sizeof (option_table) / sizeof (option_table[0]);

	umem_free(long_opts, sizeof (struct option) * count);
	umem_free(short_opts, sizeof (char) * 2 * count);

	long_opts = NULL;
	short_opts = NULL;
}

static __attribute__((noreturn)) void
usage(boolean_t requested)
{
	char option[80];
	FILE *fp = requested ? stdout : stderr;

	(void) fprintf(fp, "Usage: %s [OPTIONS...]\n", DEFAULT_POOL);
	for (int i = 0; option_table[i].short_opt != 0; i++) {
		if (option_table[i].long_opt_param != NULL) {
			(void) sprintf(option, " -%c --%s=%s",
			    option_table[i].short_opt,
			    option_table[i].long_opt,
			    option_table[i].long_opt_param);
		} else {
			(void) sprintf(option, " -%c --%s",
			    option_table[i].short_opt,
			    option_table[i].long_opt);
		}
		(void) fprintf(fp, " %-43s%s", option,
		    option_table[i].comment);

		if (option_table[i].long_opt_param != NULL) {
			if (option_table[i].default_str != NULL) {
				(void) fprintf(fp, " (default: %s)",
				    option_table[i].default_str);
			} else if (option_table[i].default_int != NO_DEFAULT) {
				(void) fprintf(fp, " (default: %u)",
				    option_table[i].default_int);
			}
		}
		(void) fprintf(fp, "\n");
	}
	exit(requested ? 0 : 1);
}

static uint64_t
ztest_random(uint64_t range)
{
	uint64_t r;

	ASSERT3S(ztest_fd_rand, >=, 0);

	if (range == 0)
		return (0);

	if (read(ztest_fd_rand, &r, sizeof (r)) != sizeof (r))
		fatal(B_TRUE, "short read from /dev/urandom");

	return (r % range);
}

static void
ztest_parse_name_value(const char *input, ztest_shared_opts_t *zo)
{
	char name[32];
	char *value;
	int state = ZTEST_VDEV_CLASS_RND;

	(void) strlcpy(name, input, sizeof (name));

	value = strchr(name, '=');
	if (value == NULL) {
		(void) fprintf(stderr, "missing value in property=value "
		    "'-C' argument (%s)\n", input);
		usage(B_FALSE);
	}
	*(value) = '\0';
	value++;

	if (strcmp(value, "on") == 0) {
		state = ZTEST_VDEV_CLASS_ON;
	} else if (strcmp(value, "off") == 0) {
		state = ZTEST_VDEV_CLASS_OFF;
	} else if (strcmp(value, "random") == 0) {
		state = ZTEST_VDEV_CLASS_RND;
	} else {
		(void) fprintf(stderr, "invalid property value '%s'\n", value);
		usage(B_FALSE);
	}

	if (strcmp(name, "special") == 0) {
		zo->zo_special_vdevs = state;
	} else {
		(void) fprintf(stderr, "invalid property name '%s'\n", name);
		usage(B_FALSE);
	}
	if (zo->zo_verbose >= 3)
		(void) printf("%s vdev state is '%s'\n", name, value);
}

static void
process_options(int argc, char **argv)
{
	char *path;
	ztest_shared_opts_t *zo = &ztest_opts;

	int opt;
	uint64_t value;
	const char *raid_kind = "random";

	memcpy(zo, &ztest_opts_defaults, sizeof (*zo));

	init_options();

	while ((opt = getopt_long(argc, argv, short_opts, long_opts,
	    NULL)) != EOF) {
		value = 0;
		switch (opt) {
		case 'v':
		case 's':
		case 'a':
		case 'm':
		case 'r':
		case 'R':
		case 'D':
		case 'S':
		case 'd':
		case 't':
		case 'g':
		case 'i':
		case 'k':
		case 'T':
		case 'P':
		case 'F':
			value = nicenumtoull(optarg);
		}
		switch (opt) {
		case 'v':
			zo->zo_vdevs = value;
			break;
		case 's':
			zo->zo_vdev_size = MAX(SPA_MINDEVSIZE, value);
			break;
		case 'a':
			zo->zo_ashift = value;
			break;
		case 'm':
			zo->zo_mirrors = value;
			break;
		case 'r':
			zo->zo_raid_children = MAX(1, value);
			break;
		case 'R':
			zo->zo_raid_parity = MIN(MAX(value, 1), 3);
			break;
		case 'K':
			raid_kind = optarg;
			break;
		case 'D':
			zo->zo_draid_data = MAX(1, value);
			break;
		case 'S':
			zo->zo_draid_spares = MAX(1, value);
			break;
		case 'd':
			zo->zo_datasets = MAX(1, value);
			break;
		case 't':
			zo->zo_threads = MAX(1, value);
			break;
		case 'g':
			zo->zo_metaslab_force_ganging =
			    MAX(SPA_MINBLOCKSIZE << 1, value);
			break;
		case 'i':
			zo->zo_init = value;
			break;
		case 'k':
			zo->zo_killrate = value;
			break;
		case 'p':
			(void) strlcpy(zo->zo_pool, optarg,
			    sizeof (zo->zo_pool));
			break;
		case 'f':
			path = realpath(optarg, NULL);
			if (path == NULL) {
				(void) fprintf(stderr, "error: %s: %s\n",
				    optarg, strerror(errno));
				usage(B_FALSE);
			} else {
				(void) strlcpy(zo->zo_dir, path,
				    sizeof (zo->zo_dir));
				free(path);
			}
			break;
		case 'M':
			zo->zo_mmp_test = 1;
			break;
		case 'V':
			zo->zo_verbose++;
			break;
		case 'X':
			zo->zo_raidz_expand_test = RAIDZ_EXPAND_REQUESTED;
			break;
		case 'E':
			zo->zo_init = 0;
			break;
		case 'T':
			zo->zo_time = value;
			break;
		case 'P':
			zo->zo_passtime = MAX(1, value);
			break;
		case 'F':
			zo->zo_maxloops = MAX(1, value);
			break;
		case 'B':
			(void) strlcpy(zo->zo_alt_ztest, optarg,
			    sizeof (zo->zo_alt_ztest));
			break;
		case 'C':
			ztest_parse_name_value(optarg, zo);
			break;
		case 'o':
			if (zo->zo_gvars_count >= ZO_GVARS_MAX_COUNT) {
				(void) fprintf(stderr,
				    "max global var count (%zu) exceeded\n",
				    ZO_GVARS_MAX_COUNT);
				usage(B_FALSE);
			}
			char *v = zo->zo_gvars[zo->zo_gvars_count];
			if (strlcpy(v, optarg, ZO_GVARS_MAX_ARGLEN) >=
			    ZO_GVARS_MAX_ARGLEN) {
				(void) fprintf(stderr,
				    "global var option '%s' is too long\n",
				    optarg);
				usage(B_FALSE);
			}
			zo->zo_gvars_count++;
			break;
		case 'G':
			zo->zo_dump_dbgmsg = 1;
			break;
		case 'h':
			usage(B_TRUE);
			break;
		case '?':
		default:
			usage(B_FALSE);
			break;
		}
	}

	fini_options();

	/* Force compatible options for raidz expansion run */
	if (zo->zo_raidz_expand_test == RAIDZ_EXPAND_REQUESTED) {
		zo->zo_mmp_test = 0;
		zo->zo_mirrors = 0;
		zo->zo_vdevs = 1;
		zo->zo_vdev_size = DEFAULT_VDEV_SIZE * 2;
		zo->zo_raid_do_expand = B_FALSE;
		raid_kind = "raidz";
	}

	if (strcmp(raid_kind, "random") == 0) {
		switch (ztest_random(3)) {
		case 0:
			raid_kind = "raidz";
			break;
		case 1:
			raid_kind = "eraidz";
			break;
		case 2:
			raid_kind = "draid";
			break;
		}

		if (ztest_opts.zo_verbose >= 3)
			(void) printf("choosing RAID type '%s'\n", raid_kind);
	}

	if (strcmp(raid_kind, "draid") == 0) {
		uint64_t min_devsize;

		/* With fewer disks use 256M, otherwise 128M is OK */
		min_devsize = (ztest_opts.zo_raid_children < 16) ?
		    (256ULL << 20) : (128ULL << 20);

		/* No top-level mirrors with dRAID for now */
		zo->zo_mirrors = 0;

		/* Use more appropriate defaults for dRAID */
		if (zo->zo_vdevs == ztest_opts_defaults.zo_vdevs)
			zo->zo_vdevs = 1;
		if (zo->zo_raid_children ==
		    ztest_opts_defaults.zo_raid_children)
			zo->zo_raid_children = 16;
		if (zo->zo_ashift < 12)
			zo->zo_ashift = 12;
		if (zo->zo_vdev_size < min_devsize)
			zo->zo_vdev_size = min_devsize;

		if (zo->zo_draid_data + zo->zo_raid_parity >
		    zo->zo_raid_children - zo->zo_draid_spares) {
			(void) fprintf(stderr, "error: too few draid "
			    "children (%d) for stripe width (%d)\n",
			    zo->zo_raid_children,
			    zo->zo_draid_data + zo->zo_raid_parity);
			usage(B_FALSE);
		}

		(void) strlcpy(zo->zo_raid_type, VDEV_TYPE_DRAID,
		    sizeof (zo->zo_raid_type));

	} else if (strcmp(raid_kind, "eraidz") == 0) {
		/* using eraidz (expandable raidz) */
		zo->zo_raid_do_expand = B_TRUE;

		/* tests expect top-level to be raidz */
		zo->zo_mirrors = 0;
		zo->zo_vdevs = 1;

		/* Make sure parity is less than data columns */
		zo->zo_raid_parity = MIN(zo->zo_raid_parity,
		    zo->zo_raid_children - 1);

	} else /* using raidz */ {
		ASSERT0(strcmp(raid_kind, "raidz"));

		zo->zo_raid_parity = MIN(zo->zo_raid_parity,
		    zo->zo_raid_children - 1);
	}

	zo->zo_vdevtime =
	    (zo->zo_vdevs > 0 ? zo->zo_time * NANOSEC / zo->zo_vdevs :
	    UINT64_MAX >> 2);

	if (*zo->zo_alt_ztest) {
		const char *invalid_what = "ztest";
		char *val = zo->zo_alt_ztest;
		if (0 != access(val, X_OK) ||
		    (strrchr(val, '/') == NULL && (errno == EINVAL)))
			goto invalid;

		int dirlen = strrchr(val, '/') - val;
		strlcpy(zo->zo_alt_libpath, val,
		    MIN(sizeof (zo->zo_alt_libpath), dirlen + 1));
		invalid_what = "library path", val = zo->zo_alt_libpath;
		if (strrchr(val, '/') == NULL && (errno == EINVAL))
			goto invalid;
		*strrchr(val, '/') = '\0';
		strlcat(val, "/lib", sizeof (zo->zo_alt_libpath));

		if (0 != access(zo->zo_alt_libpath, X_OK))
			goto invalid;
		return;

invalid:
		ztest_dump_core = B_FALSE;
		fatal(B_TRUE, "invalid alternate %s %s", invalid_what, val);
	}
}

static void
ztest_kill(ztest_shared_t *zs)
{
	zs->zs_alloc = metaslab_class_get_alloc(spa_normal_class(ztest_spa));
	zs->zs_space = metaslab_class_get_space(spa_normal_class(ztest_spa));

	/*
	 * Before we kill ourselves, make sure that the config is updated.
	 * See comment above spa_write_cachefile().
	 */
	if (raidz_expand_pause_point != RAIDZ_EXPAND_PAUSE_NONE) {
		if (mutex_tryenter(&spa_namespace_lock)) {
			spa_write_cachefile(ztest_spa, B_FALSE, B_FALSE,
			    B_FALSE);
			mutex_exit(&spa_namespace_lock);

			ztest_scratch_state->zs_raidz_scratch_verify_pause =
			    raidz_expand_pause_point;
		} else {
			/*
			 * Do not verify the scratch object if the
			 * spa_namespace_lock cannot be acquired;
			 * that can cause a deadlock in spa_config_update().
			 */
			raidz_expand_pause_point = RAIDZ_EXPAND_PAUSE_NONE;

			return;
		}
	} else {
		mutex_enter(&spa_namespace_lock);
		spa_write_cachefile(ztest_spa, B_FALSE, B_FALSE, B_FALSE);
		mutex_exit(&spa_namespace_lock);
	}

	(void) raise(SIGKILL);
}

static void
ztest_record_enospc(const char *s)
{
	(void) s;
	ztest_shared->zs_enospc_count++;
}

static uint64_t
ztest_get_ashift(void)
{
	if (ztest_opts.zo_ashift == 0)
		return (SPA_MINBLOCKSHIFT + ztest_random(5));
	return (ztest_opts.zo_ashift);
}

static boolean_t
ztest_is_draid_spare(const char *name)
{
	uint64_t spare_id = 0, parity = 0, vdev_id = 0;

	if (sscanf(name, VDEV_TYPE_DRAID "%"PRIu64"-%"PRIu64"-%"PRIu64"",
	    &parity, &vdev_id, &spare_id) == 3) {
		return (B_TRUE);
	}

	return (B_FALSE);
}

static nvlist_t *
make_vdev_file(const char *path, const char *aux, const char *pool,
    size_t size, uint64_t ashift)
{
	char *pathbuf = NULL;
	uint64_t vdev;
	nvlist_t *file;
	boolean_t draid_spare = B_FALSE;


	if (ashift == 0)
		ashift = ztest_get_ashift();

	if (path == NULL) {
		pathbuf = umem_alloc(MAXPATHLEN, UMEM_NOFAIL);
		path = pathbuf;

		if (aux != NULL) {
			vdev = ztest_shared->zs_vdev_aux;
			(void) snprintf(pathbuf, MAXPATHLEN,
			    ztest_aux_template, ztest_opts.zo_dir,
			    pool == NULL ? ztest_opts.zo_pool : pool,
			    aux, vdev);
		} else {
			vdev = ztest_shared->zs_vdev_next_leaf++;
			(void) snprintf(pathbuf, MAXPATHLEN,
			    ztest_dev_template, ztest_opts.zo_dir,
			    pool == NULL ? ztest_opts.zo_pool : pool, vdev);
		}
	} else {
		draid_spare = ztest_is_draid_spare(path);
	}

	if (size != 0 && !draid_spare) {
		int fd = open(path, O_RDWR | O_CREAT | O_TRUNC, 0666);
		if (fd == -1)
			fatal(B_TRUE, "can't open %s", path);
		if (ftruncate(fd, size) != 0)
			fatal(B_TRUE, "can't ftruncate %s", path);
		(void) close(fd);
	}

	file = fnvlist_alloc();
	fnvlist_add_string(file, ZPOOL_CONFIG_TYPE,
	    draid_spare ? VDEV_TYPE_DRAID_SPARE : VDEV_TYPE_FILE);
	fnvlist_add_string(file, ZPOOL_CONFIG_PATH, path);
	fnvlist_add_uint64(file, ZPOOL_CONFIG_ASHIFT, ashift);
	umem_free(pathbuf, MAXPATHLEN);

	return (file);
}

static nvlist_t *
make_vdev_raid(const char *path, const char *aux, const char *pool, size_t size,
    uint64_t ashift, int r)
{
	nvlist_t *raid, **child;
	int c;

	if (r < 2)
		return (make_vdev_file(path, aux, pool, size, ashift));
	child = umem_alloc(r * sizeof (nvlist_t *), UMEM_NOFAIL);

	for (c = 0; c < r; c++)
		child[c] = make_vdev_file(path, aux, pool, size, ashift);

	raid = fnvlist_alloc();
	fnvlist_add_string(raid, ZPOOL_CONFIG_TYPE,
	    ztest_opts.zo_raid_type);
	fnvlist_add_uint64(raid, ZPOOL_CONFIG_NPARITY,
	    ztest_opts.zo_raid_parity);
	fnvlist_add_nvlist_array(raid, ZPOOL_CONFIG_CHILDREN,
	    (const nvlist_t **)child, r);

	if (strcmp(ztest_opts.zo_raid_type, VDEV_TYPE_DRAID) == 0) {
		uint64_t ndata = ztest_opts.zo_draid_data;
		uint64_t nparity = ztest_opts.zo_raid_parity;
		uint64_t nspares = ztest_opts.zo_draid_spares;
		uint64_t children = ztest_opts.zo_raid_children;
		uint64_t ngroups = 1;

		/*
		 * Calculate the minimum number of groups required to fill a
		 * slice. This is the LCM of the stripe width (data + parity)
		 * and the number of data drives (children - spares).
		 */
		while (ngroups * (ndata + nparity) % (children - nspares) != 0)
			ngroups++;

		/* Store the basic dRAID configuration. */
		fnvlist_add_uint64(raid, ZPOOL_CONFIG_DRAID_NDATA, ndata);
		fnvlist_add_uint64(raid, ZPOOL_CONFIG_DRAID_NSPARES, nspares);
		fnvlist_add_uint64(raid, ZPOOL_CONFIG_DRAID_NGROUPS, ngroups);
	}

	for (c = 0; c < r; c++)
		fnvlist_free(child[c]);

	umem_free(child, r * sizeof (nvlist_t *));

	return (raid);
}

static nvlist_t *
make_vdev_mirror(const char *path, const char *aux, const char *pool,
    size_t size, uint64_t ashift, int r, int m)
{
	nvlist_t *mirror, **child;
	int c;

	if (m < 1)
		return (make_vdev_raid(path, aux, pool, size, ashift, r));

	child = umem_alloc(m * sizeof (nvlist_t *), UMEM_NOFAIL);

	for (c = 0; c < m; c++)
		child[c] = make_vdev_raid(path, aux, pool, size, ashift, r);

	mirror = fnvlist_alloc();
	fnvlist_add_string(mirror, ZPOOL_CONFIG_TYPE, VDEV_TYPE_MIRROR);
	fnvlist_add_nvlist_array(mirror, ZPOOL_CONFIG_CHILDREN,
	    (const nvlist_t **)child, m);

	for (c = 0; c < m; c++)
		fnvlist_free(child[c]);

	umem_free(child, m * sizeof (nvlist_t *));

	return (mirror);
}

static nvlist_t *
make_vdev_root(const char *path, const char *aux, const char *pool, size_t size,
    uint64_t ashift, const char *class, int r, int m, int t)
{
	nvlist_t *root, **child;
	int c;
	boolean_t log;

	ASSERT3S(t, >, 0);

	log = (class != NULL && strcmp(class, "log") == 0);

	child = umem_alloc(t * sizeof (nvlist_t *), UMEM_NOFAIL);

	for (c = 0; c < t; c++) {
		child[c] = make_vdev_mirror(path, aux, pool, size, ashift,
		    r, m);
		fnvlist_add_uint64(child[c], ZPOOL_CONFIG_IS_LOG, log);

		if (class != NULL && class[0] != '\0') {
			ASSERT(m > 1 || log);	/* expecting a mirror */
			fnvlist_add_string(child[c],
			    ZPOOL_CONFIG_ALLOCATION_BIAS, class);
		}
	}

	root = fnvlist_alloc();
	fnvlist_add_string(root, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT);
	fnvlist_add_nvlist_array(root, aux ? aux : ZPOOL_CONFIG_CHILDREN,
	    (const nvlist_t **)child, t);

	for (c = 0; c < t; c++)
		fnvlist_free(child[c]);

	umem_free(child, t * sizeof (nvlist_t *));

	return (root);
}

/*
 * Find a random spa version. Returns a random spa version in the
 * range [initial_version, SPA_VERSION_FEATURES].
 */
static uint64_t
ztest_random_spa_version(uint64_t initial_version)
{
	uint64_t version = initial_version;

	if (version <= SPA_VERSION_BEFORE_FEATURES) {
		version = version +
		    ztest_random(SPA_VERSION_BEFORE_FEATURES - version + 1);
	}

	if (version > SPA_VERSION_BEFORE_FEATURES)
		version = SPA_VERSION_FEATURES;

	ASSERT(SPA_VERSION_IS_SUPPORTED(version));
	return (version);
}

static int
ztest_random_blocksize(void)
{
	ASSERT3U(ztest_spa->spa_max_ashift, !=, 0);

	/*
	 * Choose a block size >= the ashift.
	 * If the SPA supports new MAXBLOCKSIZE, test up to 1MB blocks.
	 */
	int maxbs = SPA_OLD_MAXBLOCKSHIFT;
	if (spa_maxblocksize(ztest_spa) == SPA_MAXBLOCKSIZE)
		maxbs = 20;
	uint64_t block_shift =
	    ztest_random(maxbs - ztest_spa->spa_max_ashift + 1);
	return (1 << (SPA_MINBLOCKSHIFT + block_shift));
}

static int
ztest_random_dnodesize(void)
{
	int slots;
	int max_slots = spa_maxdnodesize(ztest_spa) >> DNODE_SHIFT;

	if (max_slots == DNODE_MIN_SLOTS)
		return (DNODE_MIN_SIZE);

	/*
	 * Weight the random distribution more heavily toward smaller
	 * dnode sizes since that is more likely to reflect real-world
	 * usage.
	 */
	ASSERT3U(max_slots, >, 4);
	switch (ztest_random(10)) {
	case 0:
		slots = 5 + ztest_random(max_slots - 4);
		break;
	case 1 ... 4:
		slots = 2 + ztest_random(3);
		break;
	default:
		slots = 1;
		break;
	}

	return (slots << DNODE_SHIFT);
}

static int
ztest_random_ibshift(void)
{
	return (DN_MIN_INDBLKSHIFT +
	    ztest_random(DN_MAX_INDBLKSHIFT - DN_MIN_INDBLKSHIFT + 1));
}

static uint64_t
ztest_random_vdev_top(spa_t *spa, boolean_t log_ok)
{
	uint64_t top;
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *tvd;

	ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);

	do {
		top = ztest_random(rvd->vdev_children);
		tvd = rvd->vdev_child[top];
	} while (!vdev_is_concrete(tvd) || (tvd->vdev_islog && !log_ok) ||
	    tvd->vdev_mg == NULL || tvd->vdev_mg->mg_class == NULL);

	return (top);
}

static uint64_t
ztest_random_dsl_prop(zfs_prop_t prop)
{
	uint64_t value;

	do {
		value = zfs_prop_random_value(prop, ztest_random(-1ULL));
	} while (prop == ZFS_PROP_CHECKSUM && value == ZIO_CHECKSUM_OFF);

	return (value);
}

static int
ztest_dsl_prop_set_uint64(char *osname, zfs_prop_t prop, uint64_t value,
    boolean_t inherit)
{
	const char *propname = zfs_prop_to_name(prop);
	const char *valname;
	char *setpoint;
	uint64_t curval;
	int error;

	error = dsl_prop_set_int(osname, propname,
	    (inherit ? ZPROP_SRC_NONE : ZPROP_SRC_LOCAL), value);

	if (error == ENOSPC) {
		ztest_record_enospc(FTAG);
		return (error);
	}
	ASSERT0(error);

	setpoint = umem_alloc(MAXPATHLEN, UMEM_NOFAIL);
	VERIFY0(dsl_prop_get_integer(osname, propname, &curval, setpoint));

	if (ztest_opts.zo_verbose >= 6) {
		int err;

		err = zfs_prop_index_to_string(prop, curval, &valname);
		if (err)
			(void) printf("%s %s = %llu at '%s'\n", osname,
			    propname, (unsigned long long)curval, setpoint);
		else
			(void) printf("%s %s = %s at '%s'\n",
			    osname, propname, valname, setpoint);
	}
	umem_free(setpoint, MAXPATHLEN);

	return (error);
}

static int
ztest_spa_prop_set_uint64(zpool_prop_t prop, uint64_t value)
{
	spa_t *spa = ztest_spa;
	nvlist_t *props = NULL;
	int error;

	props = fnvlist_alloc();
	fnvlist_add_uint64(props, zpool_prop_to_name(prop), value);

	error = spa_prop_set(spa, props);

	fnvlist_free(props);

	if (error == ENOSPC) {
		ztest_record_enospc(FTAG);
		return (error);
	}
	ASSERT0(error);

	return (error);
}

static int
ztest_dmu_objset_own(const char *name, dmu_objset_type_t type,
    boolean_t readonly, boolean_t decrypt, const void *tag, objset_t **osp)
{
	int err;
	char *cp = NULL;
	char ddname[ZFS_MAX_DATASET_NAME_LEN];

	strlcpy(ddname, name, sizeof (ddname));
	cp = strchr(ddname, '@');
	if (cp != NULL)
		*cp = '\0';

	err = dmu_objset_own(name, type, readonly, decrypt, tag, osp);
	while (decrypt && err == EACCES) {
		dsl_crypto_params_t *dcp;
		nvlist_t *crypto_args = fnvlist_alloc();

		fnvlist_add_uint8_array(crypto_args, "wkeydata",
		    (uint8_t *)ztest_wkeydata, WRAPPING_KEY_LEN);
		VERIFY0(dsl_crypto_params_create_nvlist(DCP_CMD_NONE, NULL,
		    crypto_args, &dcp));
		err = spa_keystore_load_wkey(ddname, dcp, B_FALSE);
		/*
		 * Note: if there was an error loading, the wkey was not
		 * consumed, and needs to be freed.
		 */
		dsl_crypto_params_free(dcp, (err != 0));
		fnvlist_free(crypto_args);

		if (err == EINVAL) {
			/*
			 * We couldn't load a key for this dataset so try
			 * the parent. This loop will eventually hit the
			 * encryption root since ztest only makes clones
			 * as children of their origin datasets.
			 */
			cp = strrchr(ddname, '/');
			if (cp == NULL)
				return (err);

			*cp = '\0';
			err = EACCES;
			continue;
		} else if (err != 0) {
			break;
		}

		err = dmu_objset_own(name, type, readonly, decrypt, tag, osp);
		break;
	}

	return (err);
}

static void
ztest_rll_init(rll_t *rll)
{
	rll->rll_writer = NULL;
	rll->rll_readers = 0;
	mutex_init(&rll->rll_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&rll->rll_cv, NULL, CV_DEFAULT, NULL);
}

static void
ztest_rll_destroy(rll_t *rll)
{
	ASSERT3P(rll->rll_writer, ==, NULL);
	ASSERT0(rll->rll_readers);
	mutex_destroy(&rll->rll_lock);
	cv_destroy(&rll->rll_cv);
}

static void
ztest_rll_lock(rll_t *rll, rl_type_t type)
{
	mutex_enter(&rll->rll_lock);

	if (type == ZTRL_READER) {
		while (rll->rll_writer != NULL)
			(void) cv_wait(&rll->rll_cv, &rll->rll_lock);
		rll->rll_readers++;
	} else {
		while (rll->rll_writer != NULL || rll->rll_readers)
			(void) cv_wait(&rll->rll_cv, &rll->rll_lock);
		rll->rll_writer = curthread;
	}

	mutex_exit(&rll->rll_lock);
}

static void
ztest_rll_unlock(rll_t *rll)
{
	mutex_enter(&rll->rll_lock);

	if (rll->rll_writer) {
		ASSERT0(rll->rll_readers);
		rll->rll_writer = NULL;
	} else {
		ASSERT3S(rll->rll_readers, >, 0);
		ASSERT3P(rll->rll_writer, ==, NULL);
		rll->rll_readers--;
	}

	if (rll->rll_writer == NULL && rll->rll_readers == 0)
		cv_broadcast(&rll->rll_cv);

	mutex_exit(&rll->rll_lock);
}

static void
ztest_object_lock(ztest_ds_t *zd, uint64_t object, rl_type_t type)
{
	rll_t *rll = &zd->zd_object_lock[object & (ZTEST_OBJECT_LOCKS - 1)];

	ztest_rll_lock(rll, type);
}

static void
ztest_object_unlock(ztest_ds_t *zd, uint64_t object)
{
	rll_t *rll = &zd->zd_object_lock[object & (ZTEST_OBJECT_LOCKS - 1)];

	ztest_rll_unlock(rll);
}

static rl_t *
ztest_range_lock(ztest_ds_t *zd, uint64_t object, uint64_t offset,
    uint64_t size, rl_type_t type)
{
	uint64_t hash = object ^ (offset % (ZTEST_RANGE_LOCKS + 1));
	rll_t *rll = &zd->zd_range_lock[hash & (ZTEST_RANGE_LOCKS - 1)];
	rl_t *rl;

	rl = umem_alloc(sizeof (*rl), UMEM_NOFAIL);
	rl->rl_object = object;
	rl->rl_offset = offset;
	rl->rl_size = size;
	rl->rl_lock = rll;

	ztest_rll_lock(rll, type);

	return (rl);
}

static void
ztest_range_unlock(rl_t *rl)
{
	rll_t *rll = rl->rl_lock;

	ztest_rll_unlock(rll);

	umem_free(rl, sizeof (*rl));
}

static void
ztest_zd_init(ztest_ds_t *zd, ztest_shared_ds_t *szd, objset_t *os)
{
	zd->zd_os = os;
	zd->zd_zilog = dmu_objset_zil(os);
	zd->zd_shared = szd;
	dmu_objset_name(os, zd->zd_name);
	int l;

	if (zd->zd_shared != NULL)
		zd->zd_shared->zd_seq = 0;

	VERIFY0(pthread_rwlock_init(&zd->zd_zilog_lock, NULL));
	mutex_init(&zd->zd_dirobj_lock, NULL, MUTEX_DEFAULT, NULL);

	for (l = 0; l < ZTEST_OBJECT_LOCKS; l++)
		ztest_rll_init(&zd->zd_object_lock[l]);

	for (l = 0; l < ZTEST_RANGE_LOCKS; l++)
		ztest_rll_init(&zd->zd_range_lock[l]);
}

static void
ztest_zd_fini(ztest_ds_t *zd)
{
	int l;

	mutex_destroy(&zd->zd_dirobj_lock);
	(void) pthread_rwlock_destroy(&zd->zd_zilog_lock);

	for (l = 0; l < ZTEST_OBJECT_LOCKS; l++)
		ztest_rll_destroy(&zd->zd_object_lock[l]);

	for (l = 0; l < ZTEST_RANGE_LOCKS; l++)
		ztest_rll_destroy(&zd->zd_range_lock[l]);
}

#define	TXG_MIGHTWAIT	(ztest_random(10) == 0 ? TXG_NOWAIT : TXG_WAIT)

static uint64_t
ztest_tx_assign(dmu_tx_t *tx, uint64_t txg_how, const char *tag)
{
	uint64_t txg;
	int error;

	/*
	 * Attempt to assign tx to some transaction group.
	 */
	error = dmu_tx_assign(tx, txg_how);
	if (error) {
		if (error == ERESTART) {
			ASSERT3U(txg_how, ==, TXG_NOWAIT);
			dmu_tx_wait(tx);
		} else {
			ASSERT3U(error, ==, ENOSPC);
			ztest_record_enospc(tag);
		}
		dmu_tx_abort(tx);
		return (0);
	}
	txg = dmu_tx_get_txg(tx);
	ASSERT3U(txg, !=, 0);
	return (txg);
}

static void
ztest_bt_generate(ztest_block_tag_t *bt, objset_t *os, uint64_t object,
    uint64_t dnodesize, uint64_t offset, uint64_t gen, uint64_t txg,
    uint64_t crtxg)
{
	bt->bt_magic = BT_MAGIC;
	bt->bt_objset = dmu_objset_id(os);
	bt->bt_object = object;
	bt->bt_dnodesize = dnodesize;
	bt->bt_offset = offset;
	bt->bt_gen = gen;
	bt->bt_txg = txg;
	bt->bt_crtxg = crtxg;
}

static void
ztest_bt_verify(ztest_block_tag_t *bt, objset_t *os, uint64_t object,
    uint64_t dnodesize, uint64_t offset, uint64_t gen, uint64_t txg,
    uint64_t crtxg)
{
	ASSERT3U(bt->bt_magic, ==, BT_MAGIC);
	ASSERT3U(bt->bt_objset, ==, dmu_objset_id(os));
	ASSERT3U(bt->bt_object, ==, object);
	ASSERT3U(bt->bt_dnodesize, ==, dnodesize);
	ASSERT3U(bt->bt_offset, ==, offset);
	ASSERT3U(bt->bt_gen, <=, gen);
	ASSERT3U(bt->bt_txg, <=, txg);
	ASSERT3U(bt->bt_crtxg, ==, crtxg);
}

static ztest_block_tag_t *
ztest_bt_bonus(dmu_buf_t *db)
{
	dmu_object_info_t doi;
	ztest_block_tag_t *bt;

	dmu_object_info_from_db(db, &doi);
	ASSERT3U(doi.doi_bonus_size, <=, db->db_size);
	ASSERT3U(doi.doi_bonus_size, >=, sizeof (*bt));
	bt = (void *)((char *)db->db_data + doi.doi_bonus_size - sizeof (*bt));

	return (bt);
}

/*
 * Generate a token to fill up unused bonus buffer space. Try to make
 * it unique to the object, generation, and offset to verify that data
 * is not getting overwritten by data from other dnodes.
 */
#define	ZTEST_BONUS_FILL_TOKEN(obj, ds, gen, offset) \
	(((ds) << 48) | ((gen) << 32) | ((obj) << 8) | (offset))

/*
 * Fill up the unused bonus buffer region before the block tag with a
 * verifiable pattern. Filling the whole bonus area with non-zero data
 * helps ensure that all dnode traversal code properly skips the
 * interior regions of large dnodes.
 */
static void
ztest_fill_unused_bonus(dmu_buf_t *db, void *end, uint64_t obj,
    objset_t *os, uint64_t gen)
{
	uint64_t *bonusp;

	ASSERT(IS_P2ALIGNED((char *)end - (char *)db->db_data, 8));

	for (bonusp = db->db_data; bonusp < (uint64_t *)end; bonusp++) {
		uint64_t token = ZTEST_BONUS_FILL_TOKEN(obj, dmu_objset_id(os),
		    gen, bonusp - (uint64_t *)db->db_data);
		*bonusp = token;
	}
}

/*
 * Verify that the unused area of a bonus buffer is filled with the
 * expected tokens.
1918 */ 1919 static void 1920 ztest_verify_unused_bonus(dmu_buf_t *db, void *end, uint64_t obj, 1921 objset_t *os, uint64_t gen) 1922 { 1923 uint64_t *bonusp; 1924 1925 for (bonusp = db->db_data; bonusp < (uint64_t *)end; bonusp++) { 1926 uint64_t token = ZTEST_BONUS_FILL_TOKEN(obj, dmu_objset_id(os), 1927 gen, bonusp - (uint64_t *)db->db_data); 1928 VERIFY3U(*bonusp, ==, token); 1929 } 1930 } 1931 1932 /* 1933 * ZIL logging ops 1934 */ 1935 1936 #define lrz_type lr_mode 1937 #define lrz_blocksize lr_uid 1938 #define lrz_ibshift lr_gid 1939 #define lrz_bonustype lr_rdev 1940 #define lrz_dnodesize lr_crtime[1] 1941 1942 static void 1943 ztest_log_create(ztest_ds_t *zd, dmu_tx_t *tx, lr_create_t *lr) 1944 { 1945 char *name = (char *)&lr->lr_data[0]; /* name follows lr */ 1946 size_t namesize = strlen(name) + 1; 1947 itx_t *itx; 1948 1949 if (zil_replaying(zd->zd_zilog, tx)) 1950 return; 1951 1952 itx = zil_itx_create(TX_CREATE, sizeof (*lr) + namesize); 1953 memcpy(&itx->itx_lr + 1, &lr->lr_create.lr_common + 1, 1954 sizeof (*lr) + namesize - sizeof (lr_t)); 1955 1956 zil_itx_assign(zd->zd_zilog, itx, tx); 1957 } 1958 1959 static void 1960 ztest_log_remove(ztest_ds_t *zd, dmu_tx_t *tx, lr_remove_t *lr, uint64_t object) 1961 { 1962 char *name = (char *)&lr->lr_data[0]; /* name follows lr */ 1963 size_t namesize = strlen(name) + 1; 1964 itx_t *itx; 1965 1966 if (zil_replaying(zd->zd_zilog, tx)) 1967 return; 1968 1969 itx = zil_itx_create(TX_REMOVE, sizeof (*lr) + namesize); 1970 memcpy(&itx->itx_lr + 1, &lr->lr_common + 1, 1971 sizeof (*lr) + namesize - sizeof (lr_t)); 1972 1973 itx->itx_oid = object; 1974 zil_itx_assign(zd->zd_zilog, itx, tx); 1975 } 1976 1977 static void 1978 ztest_log_write(ztest_ds_t *zd, dmu_tx_t *tx, lr_write_t *lr) 1979 { 1980 itx_t *itx; 1981 itx_wr_state_t write_state = ztest_random(WR_NUM_STATES); 1982 1983 if (zil_replaying(zd->zd_zilog, tx)) 1984 return; 1985 1986 if (lr->lr_length > zil_max_log_data(zd->zd_zilog, sizeof (lr_write_t))) 1987 write_state = WR_INDIRECT; 1988 1989 itx = zil_itx_create(TX_WRITE, 1990 sizeof (*lr) + (write_state == WR_COPIED ? 
lr->lr_length : 0)); 1991 1992 if (write_state == WR_COPIED && 1993 dmu_read(zd->zd_os, lr->lr_foid, lr->lr_offset, lr->lr_length, 1994 ((lr_write_t *)&itx->itx_lr) + 1, DMU_READ_NO_PREFETCH) != 0) { 1995 zil_itx_destroy(itx); 1996 itx = zil_itx_create(TX_WRITE, sizeof (*lr)); 1997 write_state = WR_NEED_COPY; 1998 } 1999 itx->itx_private = zd; 2000 itx->itx_wr_state = write_state; 2001 itx->itx_sync = (ztest_random(8) == 0); 2002 2003 memcpy(&itx->itx_lr + 1, &lr->lr_common + 1, 2004 sizeof (*lr) - sizeof (lr_t)); 2005 2006 zil_itx_assign(zd->zd_zilog, itx, tx); 2007 } 2008 2009 static void 2010 ztest_log_truncate(ztest_ds_t *zd, dmu_tx_t *tx, lr_truncate_t *lr) 2011 { 2012 itx_t *itx; 2013 2014 if (zil_replaying(zd->zd_zilog, tx)) 2015 return; 2016 2017 itx = zil_itx_create(TX_TRUNCATE, sizeof (*lr)); 2018 memcpy(&itx->itx_lr + 1, &lr->lr_common + 1, 2019 sizeof (*lr) - sizeof (lr_t)); 2020 2021 itx->itx_sync = B_FALSE; 2022 zil_itx_assign(zd->zd_zilog, itx, tx); 2023 } 2024 2025 static void 2026 ztest_log_setattr(ztest_ds_t *zd, dmu_tx_t *tx, lr_setattr_t *lr) 2027 { 2028 itx_t *itx; 2029 2030 if (zil_replaying(zd->zd_zilog, tx)) 2031 return; 2032 2033 itx = zil_itx_create(TX_SETATTR, sizeof (*lr)); 2034 memcpy(&itx->itx_lr + 1, &lr->lr_common + 1, 2035 sizeof (*lr) - sizeof (lr_t)); 2036 2037 itx->itx_sync = B_FALSE; 2038 zil_itx_assign(zd->zd_zilog, itx, tx); 2039 } 2040 2041 /* 2042 * ZIL replay ops 2043 */ 2044 static int 2045 ztest_replay_create(void *arg1, void *arg2, boolean_t byteswap) 2046 { 2047 ztest_ds_t *zd = arg1; 2048 lr_create_t *lrc = arg2; 2049 _lr_create_t *lr = &lrc->lr_create; 2050 char *name = (char *)&lrc->lr_data[0]; /* name follows lr */ 2051 objset_t *os = zd->zd_os; 2052 ztest_block_tag_t *bbt; 2053 dmu_buf_t *db; 2054 dmu_tx_t *tx; 2055 uint64_t txg; 2056 int error = 0; 2057 int bonuslen; 2058 2059 if (byteswap) 2060 byteswap_uint64_array(lr, sizeof (*lr)); 2061 2062 ASSERT3U(lr->lr_doid, ==, ZTEST_DIROBJ); 2063 ASSERT3S(name[0], !=, '\0'); 2064 2065 tx = dmu_tx_create(os); 2066 2067 dmu_tx_hold_zap(tx, lr->lr_doid, B_TRUE, name); 2068 2069 if (lr->lrz_type == DMU_OT_ZAP_OTHER) { 2070 dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL); 2071 } else { 2072 dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); 2073 } 2074 2075 txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); 2076 if (txg == 0) 2077 return (ENOSPC); 2078 2079 ASSERT3U(dmu_objset_zil(os)->zl_replay, ==, !!lr->lr_foid); 2080 bonuslen = DN_BONUS_SIZE(lr->lrz_dnodesize); 2081 2082 if (lr->lrz_type == DMU_OT_ZAP_OTHER) { 2083 if (lr->lr_foid == 0) { 2084 lr->lr_foid = zap_create_dnsize(os, 2085 lr->lrz_type, lr->lrz_bonustype, 2086 bonuslen, lr->lrz_dnodesize, tx); 2087 } else { 2088 error = zap_create_claim_dnsize(os, lr->lr_foid, 2089 lr->lrz_type, lr->lrz_bonustype, 2090 bonuslen, lr->lrz_dnodesize, tx); 2091 } 2092 } else { 2093 if (lr->lr_foid == 0) { 2094 lr->lr_foid = dmu_object_alloc_dnsize(os, 2095 lr->lrz_type, 0, lr->lrz_bonustype, 2096 bonuslen, lr->lrz_dnodesize, tx); 2097 } else { 2098 error = dmu_object_claim_dnsize(os, lr->lr_foid, 2099 lr->lrz_type, 0, lr->lrz_bonustype, 2100 bonuslen, lr->lrz_dnodesize, tx); 2101 } 2102 } 2103 2104 if (error) { 2105 ASSERT3U(error, ==, EEXIST); 2106 ASSERT(zd->zd_zilog->zl_replay); 2107 dmu_tx_commit(tx); 2108 return (error); 2109 } 2110 2111 ASSERT3U(lr->lr_foid, !=, 0); 2112 2113 if (lr->lrz_type != DMU_OT_ZAP_OTHER) 2114 VERIFY0(dmu_object_set_blocksize(os, lr->lr_foid, 2115 lr->lrz_blocksize, lr->lrz_ibshift, tx)); 2116 2117 VERIFY0(dmu_bonus_hold(os, lr->lr_foid, 
FTAG, &db)); 2118 bbt = ztest_bt_bonus(db); 2119 dmu_buf_will_dirty(db, tx); 2120 ztest_bt_generate(bbt, os, lr->lr_foid, lr->lrz_dnodesize, -1ULL, 2121 lr->lr_gen, txg, txg); 2122 ztest_fill_unused_bonus(db, bbt, lr->lr_foid, os, lr->lr_gen); 2123 dmu_buf_rele(db, FTAG); 2124 2125 VERIFY0(zap_add(os, lr->lr_doid, name, sizeof (uint64_t), 1, 2126 &lr->lr_foid, tx)); 2127 2128 (void) ztest_log_create(zd, tx, lrc); 2129 2130 dmu_tx_commit(tx); 2131 2132 return (0); 2133 } 2134 2135 static int 2136 ztest_replay_remove(void *arg1, void *arg2, boolean_t byteswap) 2137 { 2138 ztest_ds_t *zd = arg1; 2139 lr_remove_t *lr = arg2; 2140 char *name = (char *)&lr->lr_data[0]; /* name follows lr */ 2141 objset_t *os = zd->zd_os; 2142 dmu_object_info_t doi; 2143 dmu_tx_t *tx; 2144 uint64_t object, txg; 2145 2146 if (byteswap) 2147 byteswap_uint64_array(lr, sizeof (*lr)); 2148 2149 ASSERT3U(lr->lr_doid, ==, ZTEST_DIROBJ); 2150 ASSERT3S(name[0], !=, '\0'); 2151 2152 VERIFY0( 2153 zap_lookup(os, lr->lr_doid, name, sizeof (object), 1, &object)); 2154 ASSERT3U(object, !=, 0); 2155 2156 ztest_object_lock(zd, object, ZTRL_WRITER); 2157 2158 VERIFY0(dmu_object_info(os, object, &doi)); 2159 2160 tx = dmu_tx_create(os); 2161 2162 dmu_tx_hold_zap(tx, lr->lr_doid, B_FALSE, name); 2163 dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END); 2164 2165 txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); 2166 if (txg == 0) { 2167 ztest_object_unlock(zd, object); 2168 return (ENOSPC); 2169 } 2170 2171 if (doi.doi_type == DMU_OT_ZAP_OTHER) { 2172 VERIFY0(zap_destroy(os, object, tx)); 2173 } else { 2174 VERIFY0(dmu_object_free(os, object, tx)); 2175 } 2176 2177 VERIFY0(zap_remove(os, lr->lr_doid, name, tx)); 2178 2179 (void) ztest_log_remove(zd, tx, lr, object); 2180 2181 dmu_tx_commit(tx); 2182 2183 ztest_object_unlock(zd, object); 2184 2185 return (0); 2186 } 2187 2188 static int 2189 ztest_replay_write(void *arg1, void *arg2, boolean_t byteswap) 2190 { 2191 ztest_ds_t *zd = arg1; 2192 lr_write_t *lr = arg2; 2193 objset_t *os = zd->zd_os; 2194 uint8_t *data = &lr->lr_data[0]; /* data follows lr */ 2195 uint64_t offset, length; 2196 ztest_block_tag_t *bt = (ztest_block_tag_t *)data; 2197 ztest_block_tag_t *bbt; 2198 uint64_t gen, txg, lrtxg, crtxg; 2199 dmu_object_info_t doi; 2200 dmu_tx_t *tx; 2201 dmu_buf_t *db; 2202 arc_buf_t *abuf = NULL; 2203 rl_t *rl; 2204 2205 if (byteswap) 2206 byteswap_uint64_array(lr, sizeof (*lr)); 2207 2208 offset = lr->lr_offset; 2209 length = lr->lr_length; 2210 2211 /* If it's a dmu_sync() block, write the whole block */ 2212 if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) { 2213 uint64_t blocksize = BP_GET_LSIZE(&lr->lr_blkptr); 2214 if (length < blocksize) { 2215 offset -= offset % blocksize; 2216 length = blocksize; 2217 } 2218 } 2219 2220 if (bt->bt_magic == BSWAP_64(BT_MAGIC)) 2221 byteswap_uint64_array(bt, sizeof (*bt)); 2222 2223 if (bt->bt_magic != BT_MAGIC) 2224 bt = NULL; 2225 2226 ztest_object_lock(zd, lr->lr_foid, ZTRL_READER); 2227 rl = ztest_range_lock(zd, lr->lr_foid, offset, length, ZTRL_WRITER); 2228 2229 VERIFY0(dmu_bonus_hold(os, lr->lr_foid, FTAG, &db)); 2230 2231 dmu_object_info_from_db(db, &doi); 2232 2233 bbt = ztest_bt_bonus(db); 2234 ASSERT3U(bbt->bt_magic, ==, BT_MAGIC); 2235 gen = bbt->bt_gen; 2236 crtxg = bbt->bt_crtxg; 2237 lrtxg = lr->lr_common.lrc_txg; 2238 2239 tx = dmu_tx_create(os); 2240 2241 dmu_tx_hold_write(tx, lr->lr_foid, offset, length); 2242 2243 if (ztest_random(8) == 0 && length == doi.doi_data_block_size && 2244 P2PHASE(offset, length) == 0) 2245 abuf = 
dmu_request_arcbuf(db, length); 2246 2247 txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); 2248 if (txg == 0) { 2249 if (abuf != NULL) 2250 dmu_return_arcbuf(abuf); 2251 dmu_buf_rele(db, FTAG); 2252 ztest_range_unlock(rl); 2253 ztest_object_unlock(zd, lr->lr_foid); 2254 return (ENOSPC); 2255 } 2256 2257 if (bt != NULL) { 2258 /* 2259 * Usually, verify the old data before writing new data -- 2260 * but not always, because we also want to verify correct 2261 * behavior when the data was not recently read into cache. 2262 */ 2263 ASSERT(doi.doi_data_block_size); 2264 ASSERT0(offset % doi.doi_data_block_size); 2265 if (ztest_random(4) != 0) { 2266 int prefetch = ztest_random(2) ? 2267 DMU_READ_PREFETCH : DMU_READ_NO_PREFETCH; 2268 2269 /* 2270 * We will randomly set when to do O_DIRECT on a read. 2271 */ 2272 if (ztest_random(4) == 0) 2273 prefetch |= DMU_DIRECTIO; 2274 2275 ztest_block_tag_t rbt; 2276 2277 VERIFY(dmu_read(os, lr->lr_foid, offset, 2278 sizeof (rbt), &rbt, prefetch) == 0); 2279 if (rbt.bt_magic == BT_MAGIC) { 2280 ztest_bt_verify(&rbt, os, lr->lr_foid, 0, 2281 offset, gen, txg, crtxg); 2282 } 2283 } 2284 2285 /* 2286 * Writes can appear to be newer than the bonus buffer because 2287 * the ztest_get_data() callback does a dmu_read() of the 2288 * open-context data, which may be different than the data 2289 * as it was when the write was generated. 2290 */ 2291 if (zd->zd_zilog->zl_replay) { 2292 ztest_bt_verify(bt, os, lr->lr_foid, 0, offset, 2293 MAX(gen, bt->bt_gen), MAX(txg, lrtxg), 2294 bt->bt_crtxg); 2295 } 2296 2297 /* 2298 * Set the bt's gen/txg to the bonus buffer's gen/txg 2299 * so that all of the usual ASSERTs will work. 2300 */ 2301 ztest_bt_generate(bt, os, lr->lr_foid, 0, offset, gen, txg, 2302 crtxg); 2303 } 2304 2305 if (abuf == NULL) { 2306 dmu_write(os, lr->lr_foid, offset, length, data, tx); 2307 } else { 2308 memcpy(abuf->b_data, data, length); 2309 VERIFY0(dmu_assign_arcbuf_by_dbuf(db, offset, abuf, tx)); 2310 } 2311 2312 (void) ztest_log_write(zd, tx, lr); 2313 2314 dmu_buf_rele(db, FTAG); 2315 2316 dmu_tx_commit(tx); 2317 2318 ztest_range_unlock(rl); 2319 ztest_object_unlock(zd, lr->lr_foid); 2320 2321 return (0); 2322 } 2323 2324 static int 2325 ztest_replay_truncate(void *arg1, void *arg2, boolean_t byteswap) 2326 { 2327 ztest_ds_t *zd = arg1; 2328 lr_truncate_t *lr = arg2; 2329 objset_t *os = zd->zd_os; 2330 dmu_tx_t *tx; 2331 uint64_t txg; 2332 rl_t *rl; 2333 2334 if (byteswap) 2335 byteswap_uint64_array(lr, sizeof (*lr)); 2336 2337 ztest_object_lock(zd, lr->lr_foid, ZTRL_READER); 2338 rl = ztest_range_lock(zd, lr->lr_foid, lr->lr_offset, lr->lr_length, 2339 ZTRL_WRITER); 2340 2341 tx = dmu_tx_create(os); 2342 2343 dmu_tx_hold_free(tx, lr->lr_foid, lr->lr_offset, lr->lr_length); 2344 2345 txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); 2346 if (txg == 0) { 2347 ztest_range_unlock(rl); 2348 ztest_object_unlock(zd, lr->lr_foid); 2349 return (ENOSPC); 2350 } 2351 2352 VERIFY0(dmu_free_range(os, lr->lr_foid, lr->lr_offset, 2353 lr->lr_length, tx)); 2354 2355 (void) ztest_log_truncate(zd, tx, lr); 2356 2357 dmu_tx_commit(tx); 2358 2359 ztest_range_unlock(rl); 2360 ztest_object_unlock(zd, lr->lr_foid); 2361 2362 return (0); 2363 } 2364 2365 static int 2366 ztest_replay_setattr(void *arg1, void *arg2, boolean_t byteswap) 2367 { 2368 ztest_ds_t *zd = arg1; 2369 lr_setattr_t *lr = arg2; 2370 objset_t *os = zd->zd_os; 2371 dmu_tx_t *tx; 2372 dmu_buf_t *db; 2373 ztest_block_tag_t *bbt; 2374 uint64_t txg, lrtxg, crtxg, dnodesize; 2375 2376 if (byteswap) 2377 
byteswap_uint64_array(lr, sizeof (*lr)); 2378 2379 ztest_object_lock(zd, lr->lr_foid, ZTRL_WRITER); 2380 2381 VERIFY0(dmu_bonus_hold(os, lr->lr_foid, FTAG, &db)); 2382 2383 tx = dmu_tx_create(os); 2384 dmu_tx_hold_bonus(tx, lr->lr_foid); 2385 2386 txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); 2387 if (txg == 0) { 2388 dmu_buf_rele(db, FTAG); 2389 ztest_object_unlock(zd, lr->lr_foid); 2390 return (ENOSPC); 2391 } 2392 2393 bbt = ztest_bt_bonus(db); 2394 ASSERT3U(bbt->bt_magic, ==, BT_MAGIC); 2395 crtxg = bbt->bt_crtxg; 2396 lrtxg = lr->lr_common.lrc_txg; 2397 dnodesize = bbt->bt_dnodesize; 2398 2399 if (zd->zd_zilog->zl_replay) { 2400 ASSERT3U(lr->lr_size, !=, 0); 2401 ASSERT3U(lr->lr_mode, !=, 0); 2402 ASSERT3U(lrtxg, !=, 0); 2403 } else { 2404 /* 2405 * Randomly change the size and increment the generation. 2406 */ 2407 lr->lr_size = (ztest_random(db->db_size / sizeof (*bbt)) + 1) * 2408 sizeof (*bbt); 2409 lr->lr_mode = bbt->bt_gen + 1; 2410 ASSERT0(lrtxg); 2411 } 2412 2413 /* 2414 * Verify that the current bonus buffer is not newer than our txg. 2415 */ 2416 ztest_bt_verify(bbt, os, lr->lr_foid, dnodesize, -1ULL, lr->lr_mode, 2417 MAX(txg, lrtxg), crtxg); 2418 2419 dmu_buf_will_dirty(db, tx); 2420 2421 ASSERT3U(lr->lr_size, >=, sizeof (*bbt)); 2422 ASSERT3U(lr->lr_size, <=, db->db_size); 2423 VERIFY0(dmu_set_bonus(db, lr->lr_size, tx)); 2424 bbt = ztest_bt_bonus(db); 2425 2426 ztest_bt_generate(bbt, os, lr->lr_foid, dnodesize, -1ULL, lr->lr_mode, 2427 txg, crtxg); 2428 ztest_fill_unused_bonus(db, bbt, lr->lr_foid, os, bbt->bt_gen); 2429 dmu_buf_rele(db, FTAG); 2430 2431 (void) ztest_log_setattr(zd, tx, lr); 2432 2433 dmu_tx_commit(tx); 2434 2435 ztest_object_unlock(zd, lr->lr_foid); 2436 2437 return (0); 2438 } 2439 2440 static zil_replay_func_t *ztest_replay_vector[TX_MAX_TYPE] = { 2441 NULL, /* 0 no such transaction type */ 2442 ztest_replay_create, /* TX_CREATE */ 2443 NULL, /* TX_MKDIR */ 2444 NULL, /* TX_MKXATTR */ 2445 NULL, /* TX_SYMLINK */ 2446 ztest_replay_remove, /* TX_REMOVE */ 2447 NULL, /* TX_RMDIR */ 2448 NULL, /* TX_LINK */ 2449 NULL, /* TX_RENAME */ 2450 ztest_replay_write, /* TX_WRITE */ 2451 ztest_replay_truncate, /* TX_TRUNCATE */ 2452 ztest_replay_setattr, /* TX_SETATTR */ 2453 NULL, /* TX_ACL */ 2454 NULL, /* TX_CREATE_ACL */ 2455 NULL, /* TX_CREATE_ATTR */ 2456 NULL, /* TX_CREATE_ACL_ATTR */ 2457 NULL, /* TX_MKDIR_ACL */ 2458 NULL, /* TX_MKDIR_ATTR */ 2459 NULL, /* TX_MKDIR_ACL_ATTR */ 2460 NULL, /* TX_WRITE2 */ 2461 NULL, /* TX_SETSAXATTR */ 2462 NULL, /* TX_RENAME_EXCHANGE */ 2463 NULL, /* TX_RENAME_WHITEOUT */ 2464 }; 2465 2466 /* 2467 * ZIL get_data callbacks 2468 */ 2469 2470 static void 2471 ztest_get_done(zgd_t *zgd, int error) 2472 { 2473 (void) error; 2474 ztest_ds_t *zd = zgd->zgd_private; 2475 uint64_t object = ((rl_t *)zgd->zgd_lr)->rl_object; 2476 2477 if (zgd->zgd_db) 2478 dmu_buf_rele(zgd->zgd_db, zgd); 2479 2480 ztest_range_unlock((rl_t *)zgd->zgd_lr); 2481 ztest_object_unlock(zd, object); 2482 2483 umem_free(zgd, sizeof (*zgd)); 2484 } 2485 2486 static int 2487 ztest_get_data(void *arg, uint64_t arg2, lr_write_t *lr, char *buf, 2488 struct lwb *lwb, zio_t *zio) 2489 { 2490 (void) arg2; 2491 ztest_ds_t *zd = arg; 2492 objset_t *os = zd->zd_os; 2493 uint64_t object = lr->lr_foid; 2494 uint64_t offset = lr->lr_offset; 2495 uint64_t size = lr->lr_length; 2496 uint64_t txg = lr->lr_common.lrc_txg; 2497 uint64_t crtxg; 2498 dmu_object_info_t doi; 2499 dmu_buf_t *db; 2500 zgd_t *zgd; 2501 int error; 2502 2503 ASSERT3P(lwb, !=, NULL); 2504 ASSERT3U(size, !=, 
0); 2505 2506 ztest_object_lock(zd, object, ZTRL_READER); 2507 error = dmu_bonus_hold(os, object, FTAG, &db); 2508 if (error) { 2509 ztest_object_unlock(zd, object); 2510 return (error); 2511 } 2512 2513 crtxg = ztest_bt_bonus(db)->bt_crtxg; 2514 2515 if (crtxg == 0 || crtxg > txg) { 2516 dmu_buf_rele(db, FTAG); 2517 ztest_object_unlock(zd, object); 2518 return (ENOENT); 2519 } 2520 2521 dmu_object_info_from_db(db, &doi); 2522 dmu_buf_rele(db, FTAG); 2523 db = NULL; 2524 2525 zgd = umem_zalloc(sizeof (*zgd), UMEM_NOFAIL); 2526 zgd->zgd_lwb = lwb; 2527 zgd->zgd_private = zd; 2528 2529 if (buf != NULL) { /* immediate write */ 2530 zgd->zgd_lr = (struct zfs_locked_range *)ztest_range_lock(zd, 2531 object, offset, size, ZTRL_READER); 2532 2533 error = dmu_read(os, object, offset, size, buf, 2534 DMU_READ_NO_PREFETCH); 2535 ASSERT0(error); 2536 } else { 2537 ASSERT3P(zio, !=, NULL); 2538 size = doi.doi_data_block_size; 2539 if (ISP2(size)) { 2540 offset = P2ALIGN_TYPED(offset, size, uint64_t); 2541 } else { 2542 ASSERT3U(offset, <, size); 2543 offset = 0; 2544 } 2545 2546 zgd->zgd_lr = (struct zfs_locked_range *)ztest_range_lock(zd, 2547 object, offset, size, ZTRL_READER); 2548 2549 error = dmu_buf_hold_noread(os, object, offset, zgd, &db); 2550 2551 if (error == 0) { 2552 blkptr_t *bp = &lr->lr_blkptr; 2553 2554 zgd->zgd_db = db; 2555 zgd->zgd_bp = bp; 2556 2557 ASSERT3U(db->db_offset, ==, offset); 2558 ASSERT3U(db->db_size, ==, size); 2559 2560 error = dmu_sync(zio, lr->lr_common.lrc_txg, 2561 ztest_get_done, zgd); 2562 2563 if (error == 0) 2564 return (0); 2565 } 2566 } 2567 2568 ztest_get_done(zgd, error); 2569 2570 return (error); 2571 } 2572 2573 static void * 2574 ztest_lr_alloc(size_t lrsize, char *name) 2575 { 2576 char *lr; 2577 size_t namesize = name ? strlen(name) + 1 : 0; 2578 2579 lr = umem_zalloc(lrsize + namesize, UMEM_NOFAIL); 2580 2581 if (name) 2582 memcpy(lr + lrsize, name, namesize); 2583 2584 return (lr); 2585 } 2586 2587 static void 2588 ztest_lr_free(void *lr, size_t lrsize, char *name) 2589 { 2590 size_t namesize = name ? strlen(name) + 1 : 0; 2591 2592 umem_free(lr, lrsize + namesize); 2593 } 2594 2595 /* 2596 * Lookup a bunch of objects. Returns the number of objects not found. 
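 * Note that lookups are only expected to fail at the tail of the od
 * array: once one object is missing, every later one must be missing
 * as well (the loop below asserts that there are no gaps).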
2597 */ 2598 static int 2599 ztest_lookup(ztest_ds_t *zd, ztest_od_t *od, int count) 2600 { 2601 int missing = 0; 2602 int error; 2603 int i; 2604 2605 ASSERT(MUTEX_HELD(&zd->zd_dirobj_lock)); 2606 2607 for (i = 0; i < count; i++, od++) { 2608 od->od_object = 0; 2609 error = zap_lookup(zd->zd_os, od->od_dir, od->od_name, 2610 sizeof (uint64_t), 1, &od->od_object); 2611 if (error) { 2612 ASSERT3S(error, ==, ENOENT); 2613 ASSERT0(od->od_object); 2614 missing++; 2615 } else { 2616 dmu_buf_t *db; 2617 ztest_block_tag_t *bbt; 2618 dmu_object_info_t doi; 2619 2620 ASSERT3U(od->od_object, !=, 0); 2621 ASSERT0(missing); /* there should be no gaps */ 2622 2623 ztest_object_lock(zd, od->od_object, ZTRL_READER); 2624 VERIFY0(dmu_bonus_hold(zd->zd_os, od->od_object, 2625 FTAG, &db)); 2626 dmu_object_info_from_db(db, &doi); 2627 bbt = ztest_bt_bonus(db); 2628 ASSERT3U(bbt->bt_magic, ==, BT_MAGIC); 2629 od->od_type = doi.doi_type; 2630 od->od_blocksize = doi.doi_data_block_size; 2631 od->od_gen = bbt->bt_gen; 2632 dmu_buf_rele(db, FTAG); 2633 ztest_object_unlock(zd, od->od_object); 2634 } 2635 } 2636 2637 return (missing); 2638 } 2639 2640 static int 2641 ztest_create(ztest_ds_t *zd, ztest_od_t *od, int count) 2642 { 2643 int missing = 0; 2644 int i; 2645 2646 ASSERT(MUTEX_HELD(&zd->zd_dirobj_lock)); 2647 2648 for (i = 0; i < count; i++, od++) { 2649 if (missing) { 2650 od->od_object = 0; 2651 missing++; 2652 continue; 2653 } 2654 2655 lr_create_t *lrc = ztest_lr_alloc(sizeof (*lrc), od->od_name); 2656 _lr_create_t *lr = &lrc->lr_create; 2657 2658 lr->lr_doid = od->od_dir; 2659 lr->lr_foid = 0; /* 0 to allocate, > 0 to claim */ 2660 lr->lrz_type = od->od_crtype; 2661 lr->lrz_blocksize = od->od_crblocksize; 2662 lr->lrz_ibshift = ztest_random_ibshift(); 2663 lr->lrz_bonustype = DMU_OT_UINT64_OTHER; 2664 lr->lrz_dnodesize = od->od_crdnodesize; 2665 lr->lr_gen = od->od_crgen; 2666 lr->lr_crtime[0] = time(NULL); 2667 2668 if (ztest_replay_create(zd, lr, B_FALSE) != 0) { 2669 ASSERT0(missing); 2670 od->od_object = 0; 2671 missing++; 2672 } else { 2673 od->od_object = lr->lr_foid; 2674 od->od_type = od->od_crtype; 2675 od->od_blocksize = od->od_crblocksize; 2676 od->od_gen = od->od_crgen; 2677 ASSERT3U(od->od_object, !=, 0); 2678 } 2679 2680 ztest_lr_free(lr, sizeof (*lr), od->od_name); 2681 } 2682 2683 return (missing); 2684 } 2685 2686 static int 2687 ztest_remove(ztest_ds_t *zd, ztest_od_t *od, int count) 2688 { 2689 int missing = 0; 2690 int error; 2691 int i; 2692 2693 ASSERT(MUTEX_HELD(&zd->zd_dirobj_lock)); 2694 2695 od += count - 1; 2696 2697 for (i = count - 1; i >= 0; i--, od--) { 2698 if (missing) { 2699 missing++; 2700 continue; 2701 } 2702 2703 /* 2704 * No object was found. 
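 * That is, the preceding lookup left od_object at zero, so there is
 * nothing to remove for this template.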
2705 */ 2706 if (od->od_object == 0) 2707 continue; 2708 2709 lr_remove_t *lr = ztest_lr_alloc(sizeof (*lr), od->od_name); 2710 2711 lr->lr_doid = od->od_dir; 2712 2713 if ((error = ztest_replay_remove(zd, lr, B_FALSE)) != 0) { 2714 ASSERT3U(error, ==, ENOSPC); 2715 missing++; 2716 } else { 2717 od->od_object = 0; 2718 } 2719 ztest_lr_free(lr, sizeof (*lr), od->od_name); 2720 } 2721 2722 return (missing); 2723 } 2724 2725 static int 2726 ztest_write(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size, 2727 const void *data) 2728 { 2729 lr_write_t *lr; 2730 int error; 2731 2732 lr = ztest_lr_alloc(sizeof (*lr) + size, NULL); 2733 2734 lr->lr_foid = object; 2735 lr->lr_offset = offset; 2736 lr->lr_length = size; 2737 lr->lr_blkoff = 0; 2738 BP_ZERO(&lr->lr_blkptr); 2739 2740 memcpy(&lr->lr_data[0], data, size); 2741 2742 error = ztest_replay_write(zd, lr, B_FALSE); 2743 2744 ztest_lr_free(lr, sizeof (*lr) + size, NULL); 2745 2746 return (error); 2747 } 2748 2749 static int 2750 ztest_truncate(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size) 2751 { 2752 lr_truncate_t *lr; 2753 int error; 2754 2755 lr = ztest_lr_alloc(sizeof (*lr), NULL); 2756 2757 lr->lr_foid = object; 2758 lr->lr_offset = offset; 2759 lr->lr_length = size; 2760 2761 error = ztest_replay_truncate(zd, lr, B_FALSE); 2762 2763 ztest_lr_free(lr, sizeof (*lr), NULL); 2764 2765 return (error); 2766 } 2767 2768 static int 2769 ztest_setattr(ztest_ds_t *zd, uint64_t object) 2770 { 2771 lr_setattr_t *lr; 2772 int error; 2773 2774 lr = ztest_lr_alloc(sizeof (*lr), NULL); 2775 2776 lr->lr_foid = object; 2777 lr->lr_size = 0; 2778 lr->lr_mode = 0; 2779 2780 error = ztest_replay_setattr(zd, lr, B_FALSE); 2781 2782 ztest_lr_free(lr, sizeof (*lr), NULL); 2783 2784 return (error); 2785 } 2786 2787 static void 2788 ztest_prealloc(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size) 2789 { 2790 objset_t *os = zd->zd_os; 2791 dmu_tx_t *tx; 2792 uint64_t txg; 2793 rl_t *rl; 2794 2795 txg_wait_synced(dmu_objset_pool(os), 0); 2796 2797 ztest_object_lock(zd, object, ZTRL_READER); 2798 rl = ztest_range_lock(zd, object, offset, size, ZTRL_WRITER); 2799 2800 tx = dmu_tx_create(os); 2801 2802 dmu_tx_hold_write(tx, object, offset, size); 2803 2804 txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); 2805 2806 if (txg != 0) { 2807 dmu_prealloc(os, object, offset, size, tx); 2808 dmu_tx_commit(tx); 2809 txg_wait_synced(dmu_objset_pool(os), txg); 2810 } else { 2811 (void) dmu_free_long_range(os, object, offset, size); 2812 } 2813 2814 ztest_range_unlock(rl); 2815 ztest_object_unlock(zd, object); 2816 } 2817 2818 static void 2819 ztest_io(ztest_ds_t *zd, uint64_t object, uint64_t offset) 2820 { 2821 int err; 2822 ztest_block_tag_t wbt; 2823 dmu_object_info_t doi; 2824 enum ztest_io_type io_type; 2825 uint64_t blocksize; 2826 void *data; 2827 uint32_t dmu_read_flags = DMU_READ_NO_PREFETCH; 2828 2829 /* 2830 * We will randomly set when to do O_DIRECT on a read. 2831 */ 2832 if (ztest_random(4) == 0) 2833 dmu_read_flags |= DMU_DIRECTIO; 2834 2835 VERIFY0(dmu_object_info(zd->zd_os, object, &doi)); 2836 blocksize = doi.doi_data_block_size; 2837 data = umem_alloc(blocksize, UMEM_NOFAIL); 2838 2839 /* 2840 * Pick an i/o type at random, biased toward writing block tags. 
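 * ztest_random(ZTEST_IO_TYPES) first picks any type with equal
 * probability, then half the time we override the choice with
 * ZTEST_IO_WRITE_TAG, so tag writes end up with a little over half of
 * the total i/o mix.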
2841 */ 2842 io_type = ztest_random(ZTEST_IO_TYPES); 2843 if (ztest_random(2) == 0) 2844 io_type = ZTEST_IO_WRITE_TAG; 2845 2846 (void) pthread_rwlock_rdlock(&zd->zd_zilog_lock); 2847 2848 switch (io_type) { 2849 2850 case ZTEST_IO_WRITE_TAG: 2851 ztest_bt_generate(&wbt, zd->zd_os, object, doi.doi_dnodesize, 2852 offset, 0, 0, 0); 2853 (void) ztest_write(zd, object, offset, sizeof (wbt), &wbt); 2854 break; 2855 2856 case ZTEST_IO_WRITE_PATTERN: 2857 (void) memset(data, 'a' + (object + offset) % 5, blocksize); 2858 if (ztest_random(2) == 0) { 2859 /* 2860 * Induce fletcher2 collisions to ensure that 2861 * zio_ddt_collision() detects and resolves them 2862 * when using fletcher2-verify for deduplication. 2863 */ 2864 ((uint64_t *)data)[0] ^= 1ULL << 63; 2865 ((uint64_t *)data)[4] ^= 1ULL << 63; 2866 } 2867 (void) ztest_write(zd, object, offset, blocksize, data); 2868 break; 2869 2870 case ZTEST_IO_WRITE_ZEROES: 2871 memset(data, 0, blocksize); 2872 (void) ztest_write(zd, object, offset, blocksize, data); 2873 break; 2874 2875 case ZTEST_IO_TRUNCATE: 2876 (void) ztest_truncate(zd, object, offset, blocksize); 2877 break; 2878 2879 case ZTEST_IO_SETATTR: 2880 (void) ztest_setattr(zd, object); 2881 break; 2882 default: 2883 break; 2884 2885 case ZTEST_IO_REWRITE: 2886 (void) pthread_rwlock_rdlock(&ztest_name_lock); 2887 err = ztest_dsl_prop_set_uint64(zd->zd_name, 2888 ZFS_PROP_CHECKSUM, spa_dedup_checksum(ztest_spa), 2889 B_FALSE); 2890 ASSERT(err == 0 || err == ENOSPC); 2891 err = ztest_dsl_prop_set_uint64(zd->zd_name, 2892 ZFS_PROP_COMPRESSION, 2893 ztest_random_dsl_prop(ZFS_PROP_COMPRESSION), 2894 B_FALSE); 2895 ASSERT(err == 0 || err == ENOSPC); 2896 (void) pthread_rwlock_unlock(&ztest_name_lock); 2897 2898 VERIFY0(dmu_read(zd->zd_os, object, offset, blocksize, data, 2899 dmu_read_flags)); 2900 2901 (void) ztest_write(zd, object, offset, blocksize, data); 2902 break; 2903 } 2904 2905 (void) pthread_rwlock_unlock(&zd->zd_zilog_lock); 2906 2907 umem_free(data, blocksize); 2908 } 2909 2910 /* 2911 * Initialize an object description template. 2912 */ 2913 static void 2914 ztest_od_init(ztest_od_t *od, uint64_t id, const char *tag, uint64_t index, 2915 dmu_object_type_t type, uint64_t blocksize, uint64_t dnodesize, 2916 uint64_t gen) 2917 { 2918 od->od_dir = ZTEST_DIROBJ; 2919 od->od_object = 0; 2920 2921 od->od_crtype = type; 2922 od->od_crblocksize = blocksize ? blocksize : ztest_random_blocksize(); 2923 od->od_crdnodesize = dnodesize ? dnodesize : ztest_random_dnodesize(); 2924 od->od_crgen = gen; 2925 2926 od->od_type = DMU_OT_NONE; 2927 od->od_blocksize = 0; 2928 od->od_gen = 0; 2929 2930 (void) snprintf(od->od_name, sizeof (od->od_name), 2931 "%s(%"PRId64")[%"PRIu64"]", 2932 tag, id, index); 2933 } 2934 2935 /* 2936 * Lookup or create the objects for a test using the od template. 2937 * If the objects do not all exist, or if 'remove' is specified, 2938 * remove any existing objects and create new ones. Otherwise, 2939 * use the existing objects. 
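 * A typical test sets up its objects roughly like this (hypothetical
 * values, purely for illustration; zero blocksize/dnodesize means a
 * random one is chosen):
 *
 *	ztest_od_t *od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL);
 *	ztest_od_init(od, id, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0, 0);
 *	if (ztest_object_init(zd, od, sizeof (ztest_od_t), B_FALSE) != 0)
 *		return;		(could not look up or (re)create it)
 *	... use od->od_object ...
 *	umem_free(od, sizeof (ztest_od_t));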
2940 */ 2941 static int 2942 ztest_object_init(ztest_ds_t *zd, ztest_od_t *od, size_t size, boolean_t remove) 2943 { 2944 int count = size / sizeof (*od); 2945 int rv = 0; 2946 2947 mutex_enter(&zd->zd_dirobj_lock); 2948 if ((ztest_lookup(zd, od, count) != 0 || remove) && 2949 (ztest_remove(zd, od, count) != 0 || 2950 ztest_create(zd, od, count) != 0)) 2951 rv = -1; 2952 zd->zd_od = od; 2953 mutex_exit(&zd->zd_dirobj_lock); 2954 2955 return (rv); 2956 } 2957 2958 void 2959 ztest_zil_commit(ztest_ds_t *zd, uint64_t id) 2960 { 2961 (void) id; 2962 zilog_t *zilog = zd->zd_zilog; 2963 2964 (void) pthread_rwlock_rdlock(&zd->zd_zilog_lock); 2965 2966 zil_commit(zilog, ztest_random(ZTEST_OBJECTS)); 2967 2968 /* 2969 * Remember the committed values in zd, which is in parent/child 2970 * shared memory. If we die, the next iteration of ztest_run() 2971 * will verify that the log really does contain this record. 2972 */ 2973 mutex_enter(&zilog->zl_lock); 2974 ASSERT3P(zd->zd_shared, !=, NULL); 2975 ASSERT3U(zd->zd_shared->zd_seq, <=, zilog->zl_commit_lr_seq); 2976 zd->zd_shared->zd_seq = zilog->zl_commit_lr_seq; 2977 mutex_exit(&zilog->zl_lock); 2978 2979 (void) pthread_rwlock_unlock(&zd->zd_zilog_lock); 2980 } 2981 2982 /* 2983 * This function is designed to simulate the operations that occur during a 2984 * mount/unmount operation. We hold the dataset across these operations in an 2985 * attempt to expose any implicit assumptions about ZIL management. 2986 */ 2987 void 2988 ztest_zil_remount(ztest_ds_t *zd, uint64_t id) 2989 { 2990 (void) id; 2991 objset_t *os = zd->zd_os; 2992 2993 /* 2994 * We hold the ztest_vdev_lock so we don't cause problems with 2995 * other threads that wish to remove a log device, such as 2996 * ztest_device_removal(). 2997 */ 2998 mutex_enter(&ztest_vdev_lock); 2999 3000 /* 3001 * We grab the zd_dirobj_lock to ensure that no other thread is 3002 * updating the zil (i.e. adding in-memory log records) and the 3003 * zd_zilog_lock to block any I/O. 3004 */ 3005 mutex_enter(&zd->zd_dirobj_lock); 3006 (void) pthread_rwlock_wrlock(&zd->zd_zilog_lock); 3007 3008 /* zfsvfs_teardown() */ 3009 zil_close(zd->zd_zilog); 3010 3011 /* zfsvfs_setup() */ 3012 VERIFY3P(zil_open(os, ztest_get_data, NULL), ==, zd->zd_zilog); 3013 zil_replay(os, zd, ztest_replay_vector); 3014 3015 (void) pthread_rwlock_unlock(&zd->zd_zilog_lock); 3016 mutex_exit(&zd->zd_dirobj_lock); 3017 mutex_exit(&ztest_vdev_lock); 3018 } 3019 3020 /* 3021 * Verify that we can't destroy an active pool, create an existing pool, 3022 * or create a pool with a bad vdev spec. 3023 */ 3024 void 3025 ztest_spa_create_destroy(ztest_ds_t *zd, uint64_t id) 3026 { 3027 (void) zd, (void) id; 3028 ztest_shared_opts_t *zo = &ztest_opts; 3029 spa_t *spa; 3030 nvlist_t *nvroot; 3031 3032 if (zo->zo_mmp_test) 3033 return; 3034 3035 /* 3036 * Attempt to create using a bad file. 3037 */ 3038 nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, NULL, 0, 0, 1); 3039 VERIFY3U(ENOENT, ==, 3040 spa_create("ztest_bad_file", nvroot, NULL, NULL, NULL)); 3041 fnvlist_free(nvroot); 3042 3043 /* 3044 * Attempt to create using a bad mirror. 3045 */ 3046 nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, NULL, 0, 2, 1); 3047 VERIFY3U(ENOENT, ==, 3048 spa_create("ztest_bad_mirror", nvroot, NULL, NULL, NULL)); 3049 fnvlist_free(nvroot); 3050 3051 /* 3052 * Attempt to create an existing pool. It shouldn't matter 3053 * what's in the nvroot; we should fail with EEXIST. 
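 * The nvroot passed below is the same single bogus-file layout used
 * above; only the existing pool name should matter for this check.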
3054 */ 3055 (void) pthread_rwlock_rdlock(&ztest_name_lock); 3056 nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, NULL, 0, 0, 1); 3057 VERIFY3U(EEXIST, ==, 3058 spa_create(zo->zo_pool, nvroot, NULL, NULL, NULL)); 3059 fnvlist_free(nvroot); 3060 3061 /* 3062 * We open a reference to the spa and then we try to export it 3063 * expecting one of the following errors: 3064 * 3065 * EBUSY 3066 * Because of the reference we just opened. 3067 * 3068 * ZFS_ERR_EXPORT_IN_PROGRESS 3069 * For the case that there is another ztest thread doing 3070 * an export concurrently. 3071 */ 3072 VERIFY0(spa_open(zo->zo_pool, &spa, FTAG)); 3073 int error = spa_destroy(zo->zo_pool); 3074 if (error != EBUSY && error != ZFS_ERR_EXPORT_IN_PROGRESS) { 3075 fatal(B_FALSE, "spa_destroy(%s) returned unexpected value %d", 3076 spa->spa_name, error); 3077 } 3078 spa_close(spa, FTAG); 3079 3080 (void) pthread_rwlock_unlock(&ztest_name_lock); 3081 } 3082 3083 /* 3084 * Start and then stop the MMP threads to ensure the startup and shutdown code 3085 * works properly. Actual protection and property-related code tested via ZTS. 3086 */ 3087 void 3088 ztest_mmp_enable_disable(ztest_ds_t *zd, uint64_t id) 3089 { 3090 (void) zd, (void) id; 3091 ztest_shared_opts_t *zo = &ztest_opts; 3092 spa_t *spa = ztest_spa; 3093 3094 if (zo->zo_mmp_test) 3095 return; 3096 3097 /* 3098 * Since enabling MMP involves setting a property, it could not be done 3099 * while the pool is suspended. 3100 */ 3101 if (spa_suspended(spa)) 3102 return; 3103 3104 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 3105 mutex_enter(&spa->spa_props_lock); 3106 3107 zfs_multihost_fail_intervals = 0; 3108 3109 if (!spa_multihost(spa)) { 3110 spa->spa_multihost = B_TRUE; 3111 mmp_thread_start(spa); 3112 } 3113 3114 mutex_exit(&spa->spa_props_lock); 3115 spa_config_exit(spa, SCL_CONFIG, FTAG); 3116 3117 txg_wait_synced(spa_get_dsl(spa), 0); 3118 mmp_signal_all_threads(); 3119 txg_wait_synced(spa_get_dsl(spa), 0); 3120 3121 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 3122 mutex_enter(&spa->spa_props_lock); 3123 3124 if (spa_multihost(spa)) { 3125 mmp_thread_stop(spa); 3126 spa->spa_multihost = B_FALSE; 3127 } 3128 3129 mutex_exit(&spa->spa_props_lock); 3130 spa_config_exit(spa, SCL_CONFIG, FTAG); 3131 } 3132 3133 static int 3134 ztest_get_raidz_children(spa_t *spa) 3135 { 3136 (void) spa; 3137 vdev_t *raidvd; 3138 3139 ASSERT(MUTEX_HELD(&ztest_vdev_lock)); 3140 3141 if (ztest_opts.zo_raid_do_expand) { 3142 raidvd = ztest_spa->spa_root_vdev->vdev_child[0]; 3143 3144 ASSERT(raidvd->vdev_ops == &vdev_raidz_ops); 3145 3146 return (raidvd->vdev_children); 3147 } 3148 3149 return (ztest_opts.zo_raid_children); 3150 } 3151 3152 void 3153 ztest_spa_upgrade(ztest_ds_t *zd, uint64_t id) 3154 { 3155 (void) zd, (void) id; 3156 spa_t *spa; 3157 uint64_t initial_version = SPA_VERSION_INITIAL; 3158 uint64_t raidz_children, version, newversion; 3159 nvlist_t *nvroot, *props; 3160 char *name; 3161 3162 if (ztest_opts.zo_mmp_test) 3163 return; 3164 3165 /* dRAID added after feature flags, skip upgrade test. */ 3166 if (strcmp(ztest_opts.zo_raid_type, VDEV_TYPE_DRAID) == 0) 3167 return; 3168 3169 mutex_enter(&ztest_vdev_lock); 3170 name = kmem_asprintf("%s_upgrade", ztest_opts.zo_pool); 3171 3172 /* 3173 * Clean up from previous runs. 
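 * An earlier, interrupted iteration may have left a pool named
 * "<pool>_upgrade" behind, so destroy it (ignoring errors) first.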
3174 */ 3175 (void) spa_destroy(name); 3176 3177 raidz_children = ztest_get_raidz_children(ztest_spa); 3178 3179 nvroot = make_vdev_root(NULL, NULL, name, ztest_opts.zo_vdev_size, 0, 3180 NULL, raidz_children, ztest_opts.zo_mirrors, 1); 3181 3182 /* 3183 * If we're configuring a RAIDZ device then make sure that the 3184 * initial version is capable of supporting that feature. 3185 */ 3186 switch (ztest_opts.zo_raid_parity) { 3187 case 0: 3188 case 1: 3189 initial_version = SPA_VERSION_INITIAL; 3190 break; 3191 case 2: 3192 initial_version = SPA_VERSION_RAIDZ2; 3193 break; 3194 case 3: 3195 initial_version = SPA_VERSION_RAIDZ3; 3196 break; 3197 } 3198 3199 /* 3200 * Create a pool with a spa version that can be upgraded. Pick 3201 * a value between initial_version and SPA_VERSION_BEFORE_FEATURES. 3202 */ 3203 do { 3204 version = ztest_random_spa_version(initial_version); 3205 } while (version > SPA_VERSION_BEFORE_FEATURES); 3206 3207 props = fnvlist_alloc(); 3208 fnvlist_add_uint64(props, 3209 zpool_prop_to_name(ZPOOL_PROP_VERSION), version); 3210 VERIFY0(spa_create(name, nvroot, props, NULL, NULL)); 3211 fnvlist_free(nvroot); 3212 fnvlist_free(props); 3213 3214 VERIFY0(spa_open(name, &spa, FTAG)); 3215 VERIFY3U(spa_version(spa), ==, version); 3216 newversion = ztest_random_spa_version(version + 1); 3217 3218 if (ztest_opts.zo_verbose >= 4) { 3219 (void) printf("upgrading spa version from " 3220 "%"PRIu64" to %"PRIu64"\n", 3221 version, newversion); 3222 } 3223 3224 spa_upgrade(spa, newversion); 3225 VERIFY3U(spa_version(spa), >, version); 3226 VERIFY3U(spa_version(spa), ==, fnvlist_lookup_uint64(spa->spa_config, 3227 zpool_prop_to_name(ZPOOL_PROP_VERSION))); 3228 spa_close(spa, FTAG); 3229 3230 kmem_strfree(name); 3231 mutex_exit(&ztest_vdev_lock); 3232 } 3233 3234 static void 3235 ztest_spa_checkpoint(spa_t *spa) 3236 { 3237 ASSERT(MUTEX_HELD(&ztest_checkpoint_lock)); 3238 3239 int error = spa_checkpoint(spa->spa_name); 3240 3241 switch (error) { 3242 case 0: 3243 case ZFS_ERR_DEVRM_IN_PROGRESS: 3244 case ZFS_ERR_DISCARDING_CHECKPOINT: 3245 case ZFS_ERR_CHECKPOINT_EXISTS: 3246 case ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS: 3247 break; 3248 case ENOSPC: 3249 ztest_record_enospc(FTAG); 3250 break; 3251 default: 3252 fatal(B_FALSE, "spa_checkpoint(%s) = %d", spa->spa_name, error); 3253 } 3254 } 3255 3256 static void 3257 ztest_spa_discard_checkpoint(spa_t *spa) 3258 { 3259 ASSERT(MUTEX_HELD(&ztest_checkpoint_lock)); 3260 3261 int error = spa_checkpoint_discard(spa->spa_name); 3262 3263 switch (error) { 3264 case 0: 3265 case ZFS_ERR_DISCARDING_CHECKPOINT: 3266 case ZFS_ERR_NO_CHECKPOINT: 3267 break; 3268 default: 3269 fatal(B_FALSE, "spa_discard_checkpoint(%s) = %d", 3270 spa->spa_name, error); 3271 } 3272 3273 } 3274 3275 void 3276 ztest_spa_checkpoint_create_discard(ztest_ds_t *zd, uint64_t id) 3277 { 3278 (void) zd, (void) id; 3279 spa_t *spa = ztest_spa; 3280 3281 mutex_enter(&ztest_checkpoint_lock); 3282 if (ztest_random(2) == 0) { 3283 ztest_spa_checkpoint(spa); 3284 } else { 3285 ztest_spa_discard_checkpoint(spa); 3286 } 3287 mutex_exit(&ztest_checkpoint_lock); 3288 } 3289 3290 3291 static vdev_t * 3292 vdev_lookup_by_path(vdev_t *vd, const char *path) 3293 { 3294 vdev_t *mvd; 3295 int c; 3296 3297 if (vd->vdev_path != NULL && strcmp(path, vd->vdev_path) == 0) 3298 return (vd); 3299 3300 for (c = 0; c < vd->vdev_children; c++) 3301 if ((mvd = vdev_lookup_by_path(vd->vdev_child[c], path)) != 3302 NULL) 3303 return (mvd); 3304 3305 return (NULL); 3306 } 3307 3308 static int 3309 
spa_num_top_vdevs(spa_t *spa) 3310 { 3311 vdev_t *rvd = spa->spa_root_vdev; 3312 ASSERT3U(spa_config_held(spa, SCL_VDEV, RW_READER), ==, SCL_VDEV); 3313 return (rvd->vdev_children); 3314 } 3315 3316 /* 3317 * Verify that vdev_add() works as expected. 3318 */ 3319 void 3320 ztest_vdev_add_remove(ztest_ds_t *zd, uint64_t id) 3321 { 3322 (void) zd, (void) id; 3323 ztest_shared_t *zs = ztest_shared; 3324 spa_t *spa = ztest_spa; 3325 uint64_t leaves; 3326 uint64_t guid; 3327 uint64_t raidz_children; 3328 3329 nvlist_t *nvroot; 3330 int error; 3331 3332 if (ztest_opts.zo_mmp_test) 3333 return; 3334 3335 mutex_enter(&ztest_vdev_lock); 3336 raidz_children = ztest_get_raidz_children(spa); 3337 leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * raidz_children; 3338 3339 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 3340 3341 ztest_shared->zs_vdev_next_leaf = spa_num_top_vdevs(spa) * leaves; 3342 3343 /* 3344 * If we have slogs then remove them 1/4 of the time. 3345 */ 3346 if (spa_has_slogs(spa) && ztest_random(4) == 0) { 3347 metaslab_group_t *mg; 3348 3349 /* 3350 * find the first real slog in log allocation class 3351 */ 3352 mg = spa_log_class(spa)->mc_allocator[0].mca_rotor; 3353 while (!mg->mg_vd->vdev_islog) 3354 mg = mg->mg_next; 3355 3356 guid = mg->mg_vd->vdev_guid; 3357 3358 spa_config_exit(spa, SCL_VDEV, FTAG); 3359 3360 /* 3361 * We have to grab the zs_name_lock as writer to 3362 * prevent a race between removing a slog (dmu_objset_find) 3363 * and destroying a dataset. Removing the slog will 3364 * grab a reference on the dataset which may cause 3365 * dsl_destroy_head() to fail with EBUSY thus 3366 * leaving the dataset in an inconsistent state. 3367 */ 3368 pthread_rwlock_wrlock(&ztest_name_lock); 3369 error = spa_vdev_remove(spa, guid, B_FALSE); 3370 pthread_rwlock_unlock(&ztest_name_lock); 3371 3372 switch (error) { 3373 case 0: 3374 case EEXIST: /* Generic zil_reset() error */ 3375 case EBUSY: /* Replay required */ 3376 case EACCES: /* Crypto key not loaded */ 3377 case ZFS_ERR_CHECKPOINT_EXISTS: 3378 case ZFS_ERR_DISCARDING_CHECKPOINT: 3379 break; 3380 default: 3381 fatal(B_FALSE, "spa_vdev_remove() = %d", error); 3382 } 3383 } else { 3384 spa_config_exit(spa, SCL_VDEV, FTAG); 3385 3386 /* 3387 * Make 1/4 of the devices be log devices 3388 */ 3389 nvroot = make_vdev_root(NULL, NULL, NULL, 3390 ztest_opts.zo_vdev_size, 0, (ztest_random(4) == 0) ? 3391 "log" : NULL, raidz_children, zs->zs_mirrors, 3392 1); 3393 3394 error = spa_vdev_add(spa, nvroot, B_FALSE); 3395 fnvlist_free(nvroot); 3396 3397 switch (error) { 3398 case 0: 3399 break; 3400 case ENOSPC: 3401 ztest_record_enospc("spa_vdev_add"); 3402 break; 3403 default: 3404 fatal(B_FALSE, "spa_vdev_add() = %d", error); 3405 } 3406 } 3407 3408 mutex_exit(&ztest_vdev_lock); 3409 } 3410 3411 void 3412 ztest_vdev_class_add(ztest_ds_t *zd, uint64_t id) 3413 { 3414 (void) zd, (void) id; 3415 ztest_shared_t *zs = ztest_shared; 3416 spa_t *spa = ztest_spa; 3417 uint64_t leaves; 3418 nvlist_t *nvroot; 3419 uint64_t raidz_children; 3420 const char *class = (ztest_random(2) == 0) ? 
3421 VDEV_ALLOC_BIAS_SPECIAL : VDEV_ALLOC_BIAS_DEDUP; 3422 int error; 3423 3424 /* 3425 * By default add a special vdev 50% of the time 3426 */ 3427 if ((ztest_opts.zo_special_vdevs == ZTEST_VDEV_CLASS_OFF) || 3428 (ztest_opts.zo_special_vdevs == ZTEST_VDEV_CLASS_RND && 3429 ztest_random(2) == 0)) { 3430 return; 3431 } 3432 3433 mutex_enter(&ztest_vdev_lock); 3434 3435 /* Only test with mirrors */ 3436 if (zs->zs_mirrors < 2) { 3437 mutex_exit(&ztest_vdev_lock); 3438 return; 3439 } 3440 3441 /* requires feature@allocation_classes */ 3442 if (!spa_feature_is_enabled(spa, SPA_FEATURE_ALLOCATION_CLASSES)) { 3443 mutex_exit(&ztest_vdev_lock); 3444 return; 3445 } 3446 3447 raidz_children = ztest_get_raidz_children(spa); 3448 leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * raidz_children; 3449 3450 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 3451 ztest_shared->zs_vdev_next_leaf = spa_num_top_vdevs(spa) * leaves; 3452 spa_config_exit(spa, SCL_VDEV, FTAG); 3453 3454 nvroot = make_vdev_root(NULL, NULL, NULL, ztest_opts.zo_vdev_size, 0, 3455 class, raidz_children, zs->zs_mirrors, 1); 3456 3457 error = spa_vdev_add(spa, nvroot, B_FALSE); 3458 fnvlist_free(nvroot); 3459 3460 if (error == ENOSPC) 3461 ztest_record_enospc("spa_vdev_add"); 3462 else if (error != 0) 3463 fatal(B_FALSE, "spa_vdev_add() = %d", error); 3464 3465 /* 3466 * 50% of the time allow small blocks in the special class 3467 */ 3468 if (error == 0 && 3469 spa_special_class(spa)->mc_groups == 1 && ztest_random(2) == 0) { 3470 if (ztest_opts.zo_verbose >= 3) 3471 (void) printf("Enabling special VDEV small blocks\n"); 3472 error = ztest_dsl_prop_set_uint64(zd->zd_name, 3473 ZFS_PROP_SPECIAL_SMALL_BLOCKS, 32768, B_FALSE); 3474 ASSERT(error == 0 || error == ENOSPC); 3475 } 3476 3477 mutex_exit(&ztest_vdev_lock); 3478 3479 if (ztest_opts.zo_verbose >= 3) { 3480 metaslab_class_t *mc; 3481 3482 if (strcmp(class, VDEV_ALLOC_BIAS_SPECIAL) == 0) 3483 mc = spa_special_class(spa); 3484 else 3485 mc = spa_dedup_class(spa); 3486 (void) printf("Added a %s mirrored vdev (of %d)\n", 3487 class, (int)mc->mc_groups); 3488 } 3489 } 3490 3491 /* 3492 * Verify that adding/removing aux devices (l2arc, hot spare) works as expected. 3493 */ 3494 void 3495 ztest_vdev_aux_add_remove(ztest_ds_t *zd, uint64_t id) 3496 { 3497 (void) zd, (void) id; 3498 ztest_shared_t *zs = ztest_shared; 3499 spa_t *spa = ztest_spa; 3500 vdev_t *rvd = spa->spa_root_vdev; 3501 spa_aux_vdev_t *sav; 3502 const char *aux; 3503 char *path; 3504 uint64_t guid = 0; 3505 int error, ignore_err = 0; 3506 3507 if (ztest_opts.zo_mmp_test) 3508 return; 3509 3510 path = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); 3511 3512 if (ztest_random(2) == 0) { 3513 sav = &spa->spa_spares; 3514 aux = ZPOOL_CONFIG_SPARES; 3515 } else { 3516 sav = &spa->spa_l2cache; 3517 aux = ZPOOL_CONFIG_L2CACHE; 3518 } 3519 3520 mutex_enter(&ztest_vdev_lock); 3521 3522 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 3523 3524 if (sav->sav_count != 0 && ztest_random(4) == 0) { 3525 /* 3526 * Pick a random device to remove. 3527 */ 3528 vdev_t *svd = sav->sav_vdevs[ztest_random(sav->sav_count)]; 3529 3530 /* dRAID spares cannot be removed; try anyways to see ENOTSUP */ 3531 if (strstr(svd->vdev_path, VDEV_TYPE_DRAID) != NULL) 3532 ignore_err = ENOTSUP; 3533 3534 guid = svd->vdev_guid; 3535 } else { 3536 /* 3537 * Find an unused device we can add. 
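 * Generate candidate aux paths from ztest_aux_template until we find
 * one that is neither already in this aux list (spares or l2cache)
 * nor present anywhere in the main vdev tree.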
3538 */ 3539 zs->zs_vdev_aux = 0; 3540 for (;;) { 3541 int c; 3542 (void) snprintf(path, MAXPATHLEN, ztest_aux_template, 3543 ztest_opts.zo_dir, ztest_opts.zo_pool, aux, 3544 zs->zs_vdev_aux); 3545 for (c = 0; c < sav->sav_count; c++) 3546 if (strcmp(sav->sav_vdevs[c]->vdev_path, 3547 path) == 0) 3548 break; 3549 if (c == sav->sav_count && 3550 vdev_lookup_by_path(rvd, path) == NULL) 3551 break; 3552 zs->zs_vdev_aux++; 3553 } 3554 } 3555 3556 spa_config_exit(spa, SCL_VDEV, FTAG); 3557 3558 if (guid == 0) { 3559 /* 3560 * Add a new device. 3561 */ 3562 nvlist_t *nvroot = make_vdev_root(NULL, aux, NULL, 3563 (ztest_opts.zo_vdev_size * 5) / 4, 0, NULL, 0, 0, 1); 3564 error = spa_vdev_add(spa, nvroot, B_FALSE); 3565 3566 switch (error) { 3567 case 0: 3568 break; 3569 default: 3570 fatal(B_FALSE, "spa_vdev_add(%p) = %d", nvroot, error); 3571 } 3572 fnvlist_free(nvroot); 3573 } else { 3574 /* 3575 * Remove an existing device. Sometimes, dirty its 3576 * vdev state first to make sure we handle removal 3577 * of devices that have pending state changes. 3578 */ 3579 if (ztest_random(2) == 0) 3580 (void) vdev_online(spa, guid, 0, NULL); 3581 3582 error = spa_vdev_remove(spa, guid, B_FALSE); 3583 3584 switch (error) { 3585 case 0: 3586 case EBUSY: 3587 case ZFS_ERR_CHECKPOINT_EXISTS: 3588 case ZFS_ERR_DISCARDING_CHECKPOINT: 3589 break; 3590 default: 3591 if (error != ignore_err) 3592 fatal(B_FALSE, 3593 "spa_vdev_remove(%"PRIu64") = %d", 3594 guid, error); 3595 } 3596 } 3597 3598 mutex_exit(&ztest_vdev_lock); 3599 3600 umem_free(path, MAXPATHLEN); 3601 } 3602 3603 /* 3604 * split a pool if it has mirror tlvdevs 3605 */ 3606 void 3607 ztest_split_pool(ztest_ds_t *zd, uint64_t id) 3608 { 3609 (void) zd, (void) id; 3610 ztest_shared_t *zs = ztest_shared; 3611 spa_t *spa = ztest_spa; 3612 vdev_t *rvd = spa->spa_root_vdev; 3613 nvlist_t *tree, **child, *config, *split, **schild; 3614 uint_t c, children, schildren = 0, lastlogid = 0; 3615 int error = 0; 3616 3617 if (ztest_opts.zo_mmp_test) 3618 return; 3619 3620 mutex_enter(&ztest_vdev_lock); 3621 3622 /* ensure we have a usable config; mirrors of raidz aren't supported */ 3623 if (zs->zs_mirrors < 3 || ztest_opts.zo_raid_children > 1) { 3624 mutex_exit(&ztest_vdev_lock); 3625 return; 3626 } 3627 3628 /* clean up the old pool, if any */ 3629 (void) spa_destroy("splitp"); 3630 3631 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 3632 3633 /* generate a config from the existing config */ 3634 mutex_enter(&spa->spa_props_lock); 3635 tree = fnvlist_lookup_nvlist(spa->spa_config, ZPOOL_CONFIG_VDEV_TREE); 3636 mutex_exit(&spa->spa_props_lock); 3637 3638 VERIFY0(nvlist_lookup_nvlist_array(tree, ZPOOL_CONFIG_CHILDREN, 3639 &child, &children)); 3640 3641 schild = umem_alloc(rvd->vdev_children * sizeof (nvlist_t *), 3642 UMEM_NOFAIL); 3643 for (c = 0; c < children; c++) { 3644 vdev_t *tvd = rvd->vdev_child[c]; 3645 nvlist_t **mchild; 3646 uint_t mchildren; 3647 3648 if (tvd->vdev_islog || tvd->vdev_ops == &vdev_hole_ops) { 3649 schild[schildren] = fnvlist_alloc(); 3650 fnvlist_add_string(schild[schildren], 3651 ZPOOL_CONFIG_TYPE, VDEV_TYPE_HOLE); 3652 fnvlist_add_uint64(schild[schildren], 3653 ZPOOL_CONFIG_IS_HOLE, 1); 3654 if (lastlogid == 0) 3655 lastlogid = schildren; 3656 ++schildren; 3657 continue; 3658 } 3659 lastlogid = 0; 3660 VERIFY0(nvlist_lookup_nvlist_array(child[c], 3661 ZPOOL_CONFIG_CHILDREN, &mchild, &mchildren)); 3662 schild[schildren++] = fnvlist_dup(mchild[0]); 3663 } 3664 3665 /* OK, create a config that can be used to split */ 3666 split = 
fnvlist_alloc(); 3667 fnvlist_add_string(split, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT); 3668 fnvlist_add_nvlist_array(split, ZPOOL_CONFIG_CHILDREN, 3669 (const nvlist_t **)schild, lastlogid != 0 ? lastlogid : schildren); 3670 3671 config = fnvlist_alloc(); 3672 fnvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, split); 3673 3674 for (c = 0; c < schildren; c++) 3675 fnvlist_free(schild[c]); 3676 umem_free(schild, rvd->vdev_children * sizeof (nvlist_t *)); 3677 fnvlist_free(split); 3678 3679 spa_config_exit(spa, SCL_VDEV, FTAG); 3680 3681 (void) pthread_rwlock_wrlock(&ztest_name_lock); 3682 error = spa_vdev_split_mirror(spa, "splitp", config, NULL, B_FALSE); 3683 (void) pthread_rwlock_unlock(&ztest_name_lock); 3684 3685 fnvlist_free(config); 3686 3687 if (error == 0) { 3688 (void) printf("successful split - results:\n"); 3689 mutex_enter(&spa_namespace_lock); 3690 show_pool_stats(spa); 3691 show_pool_stats(spa_lookup("splitp")); 3692 mutex_exit(&spa_namespace_lock); 3693 ++zs->zs_splits; 3694 --zs->zs_mirrors; 3695 } 3696 mutex_exit(&ztest_vdev_lock); 3697 } 3698 3699 /* 3700 * Verify that we can attach and detach devices. 3701 */ 3702 void 3703 ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id) 3704 { 3705 (void) zd, (void) id; 3706 ztest_shared_t *zs = ztest_shared; 3707 spa_t *spa = ztest_spa; 3708 spa_aux_vdev_t *sav = &spa->spa_spares; 3709 vdev_t *rvd = spa->spa_root_vdev; 3710 vdev_t *oldvd, *newvd, *pvd; 3711 nvlist_t *root; 3712 uint64_t leaves; 3713 uint64_t leaf, top; 3714 uint64_t ashift = ztest_get_ashift(); 3715 uint64_t oldguid, pguid; 3716 uint64_t oldsize, newsize; 3717 uint64_t raidz_children; 3718 char *oldpath, *newpath; 3719 int replacing; 3720 int oldvd_has_siblings = B_FALSE; 3721 int newvd_is_spare = B_FALSE; 3722 int newvd_is_dspare = B_FALSE; 3723 int oldvd_is_log; 3724 int oldvd_is_special; 3725 int error, expected_error; 3726 3727 if (ztest_opts.zo_mmp_test) 3728 return; 3729 3730 oldpath = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); 3731 newpath = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); 3732 3733 mutex_enter(&ztest_vdev_lock); 3734 raidz_children = ztest_get_raidz_children(spa); 3735 leaves = MAX(zs->zs_mirrors, 1) * raidz_children; 3736 3737 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3738 3739 /* 3740 * If a vdev is in the process of being removed, its removal may 3741 * finish while we are in progress, leading to an unexpected error 3742 * value. Don't bother trying to attach while we are in the middle 3743 * of removal. 3744 */ 3745 if (ztest_device_removal_active) { 3746 spa_config_exit(spa, SCL_ALL, FTAG); 3747 goto out; 3748 } 3749 3750 /* 3751 * RAIDZ leaf VDEV mirrors are not currently supported while a 3752 * RAIDZ expansion is in progress. 3753 */ 3754 if (ztest_opts.zo_raid_do_expand) { 3755 spa_config_exit(spa, SCL_ALL, FTAG); 3756 goto out; 3757 } 3758 3759 /* 3760 * Decide whether to do an attach or a replace. 3761 */ 3762 replacing = ztest_random(2); 3763 3764 /* 3765 * Pick a random top-level vdev. 3766 */ 3767 top = ztest_random_vdev_top(spa, B_TRUE); 3768 3769 /* 3770 * Pick a random leaf within it. 3771 */ 3772 leaf = ztest_random(leaves); 3773 3774 /* 3775 * Locate this vdev. 
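 * Starting from the chosen top-level vdev, descend through an optional
 * mirror layer (leaf / raidz_children selects the mirror child) and an
 * optional raidz/draid layer (leaf % raidz_children selects the leaf).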
3776 */ 3777 oldvd = rvd->vdev_child[top]; 3778 3779 /* pick a child from the mirror */ 3780 if (zs->zs_mirrors >= 1) { 3781 ASSERT3P(oldvd->vdev_ops, ==, &vdev_mirror_ops); 3782 ASSERT3U(oldvd->vdev_children, >=, zs->zs_mirrors); 3783 oldvd = oldvd->vdev_child[leaf / raidz_children]; 3784 } 3785 3786 /* pick a child out of the raidz group */ 3787 if (ztest_opts.zo_raid_children > 1) { 3788 if (strcmp(oldvd->vdev_ops->vdev_op_type, "raidz") == 0) 3789 ASSERT3P(oldvd->vdev_ops, ==, &vdev_raidz_ops); 3790 else 3791 ASSERT3P(oldvd->vdev_ops, ==, &vdev_draid_ops); 3792 oldvd = oldvd->vdev_child[leaf % raidz_children]; 3793 } 3794 3795 /* 3796 * If we're already doing an attach or replace, oldvd may be a 3797 * mirror vdev -- in which case, pick a random child. 3798 */ 3799 while (oldvd->vdev_children != 0) { 3800 oldvd_has_siblings = B_TRUE; 3801 ASSERT3U(oldvd->vdev_children, >=, 2); 3802 oldvd = oldvd->vdev_child[ztest_random(oldvd->vdev_children)]; 3803 } 3804 3805 oldguid = oldvd->vdev_guid; 3806 oldsize = vdev_get_min_asize(oldvd); 3807 oldvd_is_log = oldvd->vdev_top->vdev_islog; 3808 oldvd_is_special = 3809 oldvd->vdev_top->vdev_alloc_bias == VDEV_BIAS_SPECIAL || 3810 oldvd->vdev_top->vdev_alloc_bias == VDEV_BIAS_DEDUP; 3811 (void) strlcpy(oldpath, oldvd->vdev_path, MAXPATHLEN); 3812 pvd = oldvd->vdev_parent; 3813 pguid = pvd->vdev_guid; 3814 3815 /* 3816 * If oldvd has siblings, then half of the time, detach it. Prior 3817 * to the detach the pool is scrubbed in order to prevent creating 3818 * unrepairable blocks as a result of the data corruption injection. 3819 */ 3820 if (oldvd_has_siblings && ztest_random(2) == 0) { 3821 spa_config_exit(spa, SCL_ALL, FTAG); 3822 3823 error = ztest_scrub_impl(spa); 3824 if (error) 3825 goto out; 3826 3827 error = spa_vdev_detach(spa, oldguid, pguid, B_FALSE); 3828 if (error != 0 && error != ENODEV && error != EBUSY && 3829 error != ENOTSUP && error != ZFS_ERR_CHECKPOINT_EXISTS && 3830 error != ZFS_ERR_DISCARDING_CHECKPOINT) 3831 fatal(B_FALSE, "detach (%s) returned %d", 3832 oldpath, error); 3833 goto out; 3834 } 3835 3836 /* 3837 * For the new vdev, choose with equal probability between the two 3838 * standard paths (ending in either 'a' or 'b') or a random hot spare. 3839 */ 3840 if (sav->sav_count != 0 && ztest_random(3) == 0) { 3841 newvd = sav->sav_vdevs[ztest_random(sav->sav_count)]; 3842 newvd_is_spare = B_TRUE; 3843 3844 if (newvd->vdev_ops == &vdev_draid_spare_ops) 3845 newvd_is_dspare = B_TRUE; 3846 3847 (void) strlcpy(newpath, newvd->vdev_path, MAXPATHLEN); 3848 } else { 3849 (void) snprintf(newpath, MAXPATHLEN, ztest_dev_template, 3850 ztest_opts.zo_dir, ztest_opts.zo_pool, 3851 top * leaves + leaf); 3852 if (ztest_random(2) == 0) 3853 newpath[strlen(newpath) - 1] = 'b'; 3854 newvd = vdev_lookup_by_path(rvd, newpath); 3855 } 3856 3857 if (newvd) { 3858 /* 3859 * Reopen to ensure the vdev's asize field isn't stale. 3860 */ 3861 vdev_reopen(newvd); 3862 newsize = vdev_get_min_asize(newvd); 3863 } else { 3864 /* 3865 * Make newsize a little bigger or smaller than oldsize. 3866 * If it's smaller, the attach should fail. 3867 * If it's larger, and we're doing a replace, 3868 * we should get dynamic LUN growth when we're done. 3869 */ 3870 newsize = 10 * oldsize / (9 + ztest_random(3)); 3871 } 3872 3873 /* 3874 * If pvd is not a mirror or root, the attach should fail with ENOTSUP, 3875 * unless it's a replace; in that case any non-replacing parent is OK. 3876 * 3877 * If newvd is already part of the pool, it should fail with EBUSY. 
3878 * 3879 * If newvd is too small, it should fail with EOVERFLOW. 3880 * 3881 * If newvd is a distributed spare and it's being attached to a 3882 * dRAID which is not its parent, it should fail with EINVAL. 3883 */ 3884 if (pvd->vdev_ops != &vdev_mirror_ops && 3885 pvd->vdev_ops != &vdev_root_ops && (!replacing || 3886 pvd->vdev_ops == &vdev_replacing_ops || 3887 pvd->vdev_ops == &vdev_spare_ops)) 3888 expected_error = ENOTSUP; 3889 else if (newvd_is_spare && 3890 (!replacing || oldvd_is_log || oldvd_is_special)) 3891 expected_error = ENOTSUP; 3892 else if (newvd == oldvd) 3893 expected_error = replacing ? 0 : EBUSY; 3894 else if (vdev_lookup_by_path(rvd, newpath) != NULL) 3895 expected_error = EBUSY; 3896 else if (!newvd_is_dspare && newsize < oldsize) 3897 expected_error = EOVERFLOW; 3898 else if (ashift > oldvd->vdev_top->vdev_ashift) 3899 expected_error = EDOM; 3900 else if (newvd_is_dspare && pvd != vdev_draid_spare_get_parent(newvd)) 3901 expected_error = EINVAL; 3902 else 3903 expected_error = 0; 3904 3905 spa_config_exit(spa, SCL_ALL, FTAG); 3906 3907 /* 3908 * Build the nvlist describing newpath. 3909 */ 3910 root = make_vdev_root(newpath, NULL, NULL, newvd == NULL ? newsize : 0, 3911 ashift, NULL, 0, 0, 1); 3912 3913 /* 3914 * When supported, select either a healing or sequential resilver. 3915 */ 3916 boolean_t rebuilding = B_FALSE; 3917 if (pvd->vdev_ops == &vdev_mirror_ops || 3918 pvd->vdev_ops == &vdev_root_ops) { 3919 rebuilding = !!ztest_random(2); 3920 } 3921 3922 error = spa_vdev_attach(spa, oldguid, root, replacing, rebuilding); 3923 3924 fnvlist_free(root); 3925 3926 /* 3927 * If our parent was the replacing vdev, but the replace completed, 3928 * then instead of failing with ENOTSUP we may either succeed, 3929 * fail with ENODEV, or fail with EOVERFLOW. 3930 */ 3931 if (expected_error == ENOTSUP && 3932 (error == 0 || error == ENODEV || error == EOVERFLOW)) 3933 expected_error = error; 3934 3935 /* 3936 * If someone grew the LUN, the replacement may be too small.
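 * In that case EOVERFLOW (or EBUSY) is simply accepted as the outcome
 * rather than treated as a test failure.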
3937 */ 3938 if (error == EOVERFLOW || error == EBUSY) 3939 expected_error = error; 3940 3941 if (error == ZFS_ERR_CHECKPOINT_EXISTS || 3942 error == ZFS_ERR_DISCARDING_CHECKPOINT || 3943 error == ZFS_ERR_RESILVER_IN_PROGRESS || 3944 error == ZFS_ERR_REBUILD_IN_PROGRESS) 3945 expected_error = error; 3946 3947 if (error != expected_error && expected_error != EBUSY) { 3948 fatal(B_FALSE, "attach (%s %"PRIu64", %s %"PRIu64", %d) " 3949 "returned %d, expected %d", 3950 oldpath, oldsize, newpath, 3951 newsize, replacing, error, expected_error); 3952 } 3953 out: 3954 mutex_exit(&ztest_vdev_lock); 3955 3956 umem_free(oldpath, MAXPATHLEN); 3957 umem_free(newpath, MAXPATHLEN); 3958 } 3959 3960 static void 3961 raidz_scratch_verify(void) 3962 { 3963 spa_t *spa; 3964 uint64_t write_size, logical_size, offset; 3965 raidz_reflow_scratch_state_t state; 3966 vdev_raidz_expand_t *vre; 3967 vdev_t *raidvd; 3968 3969 ASSERT(raidz_expand_pause_point == RAIDZ_EXPAND_PAUSE_NONE); 3970 3971 if (ztest_scratch_state->zs_raidz_scratch_verify_pause == 0) 3972 return; 3973 3974 kernel_init(SPA_MODE_READ); 3975 3976 mutex_enter(&spa_namespace_lock); 3977 spa = spa_lookup(ztest_opts.zo_pool); 3978 ASSERT(spa); 3979 spa->spa_import_flags |= ZFS_IMPORT_SKIP_MMP; 3980 mutex_exit(&spa_namespace_lock); 3981 3982 VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG)); 3983 3984 ASSERT3U(RRSS_GET_OFFSET(&spa->spa_uberblock), !=, UINT64_MAX); 3985 3986 mutex_enter(&ztest_vdev_lock); 3987 3988 spa_config_enter(spa, SCL_ALL, FTAG, RW_READER); 3989 3990 vre = spa->spa_raidz_expand; 3991 if (vre == NULL) 3992 goto out; 3993 3994 raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); 3995 offset = RRSS_GET_OFFSET(&spa->spa_uberblock); 3996 state = RRSS_GET_STATE(&spa->spa_uberblock); 3997 write_size = P2ALIGN_TYPED(VDEV_BOOT_SIZE, 1 << raidvd->vdev_ashift, 3998 uint64_t); 3999 logical_size = write_size * raidvd->vdev_children; 4000 4001 switch (state) { 4002 /* 4003 * Initial state of reflow process. RAIDZ expansion was 4004 * requested by user, but scratch object was not created. 4005 */ 4006 case RRSS_SCRATCH_NOT_IN_USE: 4007 ASSERT3U(offset, ==, 0); 4008 break; 4009 4010 /* 4011 * Scratch object was synced and stored in boot area. 4012 */ 4013 case RRSS_SCRATCH_VALID: 4014 4015 /* 4016 * Scratch object was synced back to raidz start offset, 4017 * raidz is ready for sector by sector reflow process. 4018 */ 4019 case RRSS_SCRATCH_INVALID_SYNCED: 4020 4021 /* 4022 * Scratch object was synced back to raidz start offset 4023 * on zpool importing, raidz is ready for sector by sector 4024 * reflow process. 4025 */ 4026 case RRSS_SCRATCH_INVALID_SYNCED_ON_IMPORT: 4027 ASSERT3U(offset, ==, logical_size); 4028 break; 4029 4030 /* 4031 * Sector by sector reflow process started. 
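 * The recorded offset starts at logical_size once the scratch copy has
 * been synced back and only grows as the reflow proceeds, hence the
 * >= check below.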
4032 */ 4033 case RRSS_SCRATCH_INVALID_SYNCED_REFLOW: 4034 ASSERT3U(offset, >=, logical_size); 4035 break; 4036 } 4037 4038 out: 4039 spa_config_exit(spa, SCL_ALL, FTAG); 4040 4041 mutex_exit(&ztest_vdev_lock); 4042 4043 ztest_scratch_state->zs_raidz_scratch_verify_pause = 0; 4044 4045 spa_close(spa, FTAG); 4046 kernel_fini(); 4047 } 4048 4049 static void 4050 ztest_scratch_thread(void *arg) 4051 { 4052 (void) arg; 4053 4054 /* wait up to 10 seconds */ 4055 for (int t = 100; t > 0; t -= 1) { 4056 if (raidz_expand_pause_point == RAIDZ_EXPAND_PAUSE_NONE) 4057 thread_exit(); 4058 4059 (void) poll(NULL, 0, 100); 4060 } 4061 4062 /* killed when the scratch area progress reached a certain point */ 4063 ztest_kill(ztest_shared); 4064 } 4065 4066 /* 4067 * Verify that we can attach raidz device. 4068 */ 4069 void 4070 ztest_vdev_raidz_attach(ztest_ds_t *zd, uint64_t id) 4071 { 4072 (void) zd, (void) id; 4073 ztest_shared_t *zs = ztest_shared; 4074 spa_t *spa = ztest_spa; 4075 uint64_t leaves, raidz_children, newsize, ashift = ztest_get_ashift(); 4076 kthread_t *scratch_thread = NULL; 4077 vdev_t *newvd, *pvd; 4078 nvlist_t *root; 4079 char *newpath = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); 4080 int error, expected_error = 0; 4081 4082 mutex_enter(&ztest_vdev_lock); 4083 4084 spa_config_enter(spa, SCL_ALL, FTAG, RW_READER); 4085 4086 /* Only allow attach when raid-kind = 'eraidz' */ 4087 if (!ztest_opts.zo_raid_do_expand) { 4088 spa_config_exit(spa, SCL_ALL, FTAG); 4089 goto out; 4090 } 4091 4092 if (ztest_opts.zo_mmp_test) { 4093 spa_config_exit(spa, SCL_ALL, FTAG); 4094 goto out; 4095 } 4096 4097 if (ztest_device_removal_active) { 4098 spa_config_exit(spa, SCL_ALL, FTAG); 4099 goto out; 4100 } 4101 4102 pvd = vdev_lookup_top(spa, 0); 4103 4104 ASSERT(pvd->vdev_ops == &vdev_raidz_ops); 4105 4106 /* 4107 * Get size of a child of the raidz group, 4108 * make sure device is a bit bigger 4109 */ 4110 newvd = pvd->vdev_child[ztest_random(pvd->vdev_children)]; 4111 newsize = 10 * vdev_get_min_asize(newvd) / (9 + ztest_random(2)); 4112 4113 /* 4114 * Get next attached leaf id 4115 */ 4116 raidz_children = ztest_get_raidz_children(spa); 4117 leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * raidz_children; 4118 zs->zs_vdev_next_leaf = spa_num_top_vdevs(spa) * leaves; 4119 4120 if (spa->spa_raidz_expand) 4121 expected_error = ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS; 4122 4123 spa_config_exit(spa, SCL_ALL, FTAG); 4124 4125 /* 4126 * Path to vdev to be attached 4127 */ 4128 (void) snprintf(newpath, MAXPATHLEN, ztest_dev_template, 4129 ztest_opts.zo_dir, ztest_opts.zo_pool, zs->zs_vdev_next_leaf); 4130 4131 /* 4132 * Build the nvlist describing newpath. 4133 */ 4134 root = make_vdev_root(newpath, NULL, NULL, newsize, ashift, NULL, 4135 0, 0, 1); 4136 4137 /* 4138 * 50% of the time, set raidz_expand_pause_point to cause 4139 * raidz_reflow_scratch_sync() to pause at a certain point and 4140 * then kill the test after 10 seconds so raidz_scratch_verify() 4141 * can confirm consistency when the pool is imported. 
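 * The pause point is picked uniformly from the scratch-handling
 * states, and ztest_scratch_thread() is the helper that performs the
 * delayed kill once the pause point has been reached.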
4142 */ 4143 if (ztest_random(2) == 0 && expected_error == 0) { 4144 raidz_expand_pause_point = 4145 ztest_random(RAIDZ_EXPAND_PAUSE_SCRATCH_POST_REFLOW_2) + 1; 4146 scratch_thread = thread_create(NULL, 0, ztest_scratch_thread, 4147 ztest_shared, 0, NULL, TS_RUN | TS_JOINABLE, defclsyspri); 4148 } 4149 4150 error = spa_vdev_attach(spa, pvd->vdev_guid, root, B_FALSE, B_FALSE); 4151 4152 nvlist_free(root); 4153 4154 if (error == EOVERFLOW || error == ENXIO || 4155 error == ZFS_ERR_CHECKPOINT_EXISTS || 4156 error == ZFS_ERR_DISCARDING_CHECKPOINT) 4157 expected_error = error; 4158 4159 if (error != 0 && error != expected_error) { 4160 fatal(0, "raidz attach (%s %"PRIu64") returned %d, expected %d", 4161 newpath, newsize, error, expected_error); 4162 } 4163 4164 if (raidz_expand_pause_point) { 4165 if (error != 0) { 4166 /* 4167 * Do not verify scratch object in case of error 4168 * returned by vdev attaching. 4169 */ 4170 raidz_expand_pause_point = RAIDZ_EXPAND_PAUSE_NONE; 4171 } 4172 4173 VERIFY0(thread_join(scratch_thread)); 4174 } 4175 out: 4176 mutex_exit(&ztest_vdev_lock); 4177 4178 umem_free(newpath, MAXPATHLEN); 4179 } 4180 4181 void 4182 ztest_device_removal(ztest_ds_t *zd, uint64_t id) 4183 { 4184 (void) zd, (void) id; 4185 spa_t *spa = ztest_spa; 4186 vdev_t *vd; 4187 uint64_t guid; 4188 int error; 4189 4190 mutex_enter(&ztest_vdev_lock); 4191 4192 if (ztest_device_removal_active) { 4193 mutex_exit(&ztest_vdev_lock); 4194 return; 4195 } 4196 4197 /* 4198 * Remove a random top-level vdev and wait for removal to finish. 4199 */ 4200 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 4201 vd = vdev_lookup_top(spa, ztest_random_vdev_top(spa, B_FALSE)); 4202 guid = vd->vdev_guid; 4203 spa_config_exit(spa, SCL_VDEV, FTAG); 4204 4205 error = spa_vdev_remove(spa, guid, B_FALSE); 4206 if (error == 0) { 4207 ztest_device_removal_active = B_TRUE; 4208 mutex_exit(&ztest_vdev_lock); 4209 4210 /* 4211 * spa->spa_vdev_removal is created in a sync task that 4212 * is initiated via dsl_sync_task_nowait(). Since the 4213 * task may not run before spa_vdev_remove() returns, we 4214 * must wait at least 1 txg to ensure that the removal 4215 * struct has been created. 4216 */ 4217 txg_wait_synced(spa_get_dsl(spa), 0); 4218 4219 while (spa->spa_removing_phys.sr_state == DSS_SCANNING) 4220 txg_wait_synced(spa_get_dsl(spa), 0); 4221 } else { 4222 mutex_exit(&ztest_vdev_lock); 4223 return; 4224 } 4225 4226 /* 4227 * The pool needs to be scrubbed after completing device removal. 4228 * Failure to do so may result in checksum errors due to the 4229 * strategy employed by ztest_fault_inject() when selecting which 4230 * offset are redundant and can be damaged. 4231 */ 4232 error = spa_scan(spa, POOL_SCAN_SCRUB); 4233 if (error == 0) { 4234 while (dsl_scan_scrubbing(spa_get_dsl(spa))) 4235 txg_wait_synced(spa_get_dsl(spa), 0); 4236 } 4237 4238 mutex_enter(&ztest_vdev_lock); 4239 ztest_device_removal_active = B_FALSE; 4240 mutex_exit(&ztest_vdev_lock); 4241 } 4242 4243 /* 4244 * Callback function which expands the physical size of the vdev. 
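 * Returns NULL on success so that vdev_walk_tree() keeps walking, or
 * the vdev itself on failure so that the walk stops and the caller
 * can see that the expansion did not happen.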
4245 */ 4246 static vdev_t * 4247 grow_vdev(vdev_t *vd, void *arg) 4248 { 4249 spa_t *spa __maybe_unused = vd->vdev_spa; 4250 size_t *newsize = arg; 4251 size_t fsize; 4252 int fd; 4253 4254 ASSERT3S(spa_config_held(spa, SCL_STATE, RW_READER), ==, SCL_STATE); 4255 ASSERT(vd->vdev_ops->vdev_op_leaf); 4256 4257 if ((fd = open(vd->vdev_path, O_RDWR)) == -1) 4258 return (vd); 4259 4260 fsize = lseek(fd, 0, SEEK_END); 4261 VERIFY0(ftruncate(fd, *newsize)); 4262 4263 if (ztest_opts.zo_verbose >= 6) { 4264 (void) printf("%s grew from %lu to %lu bytes\n", 4265 vd->vdev_path, (ulong_t)fsize, (ulong_t)*newsize); 4266 } 4267 (void) close(fd); 4268 return (NULL); 4269 } 4270 4271 /* 4272 * Callback function which expands a given vdev by calling vdev_online(). 4273 */ 4274 static vdev_t * 4275 online_vdev(vdev_t *vd, void *arg) 4276 { 4277 (void) arg; 4278 spa_t *spa = vd->vdev_spa; 4279 vdev_t *tvd = vd->vdev_top; 4280 uint64_t guid = vd->vdev_guid; 4281 uint64_t generation = spa->spa_config_generation + 1; 4282 vdev_state_t newstate = VDEV_STATE_UNKNOWN; 4283 int error; 4284 4285 ASSERT3S(spa_config_held(spa, SCL_STATE, RW_READER), ==, SCL_STATE); 4286 ASSERT(vd->vdev_ops->vdev_op_leaf); 4287 4288 /* Calling vdev_online will initialize the new metaslabs */ 4289 spa_config_exit(spa, SCL_STATE, spa); 4290 error = vdev_online(spa, guid, ZFS_ONLINE_EXPAND, &newstate); 4291 spa_config_enter(spa, SCL_STATE, spa, RW_READER); 4292 4293 /* 4294 * If vdev_online returned an error or the underlying vdev_open 4295 * failed then we abort the expand. The only way to know that 4296 * vdev_open fails is by checking the returned newstate. 4297 */ 4298 if (error || newstate != VDEV_STATE_HEALTHY) { 4299 if (ztest_opts.zo_verbose >= 5) { 4300 (void) printf("Unable to expand vdev, state %u, " 4301 "error %d\n", newstate, error); 4302 } 4303 return (vd); 4304 } 4305 ASSERT3U(newstate, ==, VDEV_STATE_HEALTHY); 4306 4307 /* 4308 * Since we dropped the lock we need to ensure that we're 4309 * still talking to the original vdev. It's possible this 4310 * vdev may have been detached/replaced while we were 4311 * trying to online it. 4312 */ 4313 if (generation != spa->spa_config_generation) { 4314 if (ztest_opts.zo_verbose >= 5) { 4315 (void) printf("vdev configuration has changed, " 4316 "guid %"PRIu64", state %"PRIu64", " 4317 "expected gen %"PRIu64", got gen %"PRIu64"\n", 4318 guid, 4319 tvd->vdev_state, 4320 generation, 4321 spa->spa_config_generation); 4322 } 4323 return (vd); 4324 } 4325 return (NULL); 4326 } 4327 4328 /* 4329 * Traverse the vdev tree calling the supplied function. 4330 * We continue to walk the tree until we either have walked all 4331 * children or we receive a non-NULL return from the callback. 4332 * If a NULL callback is passed, then we just return back the first 4333 * leaf vdev we encounter. 4334 */ 4335 static vdev_t * 4336 vdev_walk_tree(vdev_t *vd, vdev_t *(*func)(vdev_t *, void *), void *arg) 4337 { 4338 uint_t c; 4339 4340 if (vd->vdev_ops->vdev_op_leaf) { 4341 if (func == NULL) 4342 return (vd); 4343 else 4344 return (func(vd, arg)); 4345 } 4346 4347 for (c = 0; c < vd->vdev_children; c++) { 4348 vdev_t *cvd = vd->vdev_child[c]; 4349 if ((cvd = vdev_walk_tree(cvd, func, arg)) != NULL) 4350 return (cvd); 4351 } 4352 return (NULL); 4353 } 4354 4355 /* 4356 * Verify that dynamic LUN growth works as expected. 
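 * The test grows the file backing one leaf vdev, onlines it with
 * ZFS_ONLINE_EXPAND, and then checks that both the top-level vdev's
 * metaslab count and the metaslab class space have increased.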
4357 */ 4358 void 4359 ztest_vdev_LUN_growth(ztest_ds_t *zd, uint64_t id) 4360 { 4361 (void) zd, (void) id; 4362 spa_t *spa = ztest_spa; 4363 vdev_t *vd, *tvd; 4364 metaslab_class_t *mc; 4365 metaslab_group_t *mg; 4366 size_t psize, newsize; 4367 uint64_t top; 4368 uint64_t old_class_space, new_class_space, old_ms_count, new_ms_count; 4369 4370 mutex_enter(&ztest_checkpoint_lock); 4371 mutex_enter(&ztest_vdev_lock); 4372 spa_config_enter(spa, SCL_STATE, spa, RW_READER); 4373 4374 /* 4375 * If there is a vdev removal in progress, it could complete while 4376 * we are running, in which case we would not be able to verify 4377 * that the metaslab_class space increased (because it decreases 4378 * when the device removal completes). 4379 */ 4380 if (ztest_device_removal_active) { 4381 spa_config_exit(spa, SCL_STATE, spa); 4382 mutex_exit(&ztest_vdev_lock); 4383 mutex_exit(&ztest_checkpoint_lock); 4384 return; 4385 } 4386 4387 /* 4388 * If we are under raidz expansion, the test can failed because the 4389 * metaslabs count will not increase immediately after the vdev is 4390 * expanded. It will happen only after raidz expansion completion. 4391 */ 4392 if (spa->spa_raidz_expand) { 4393 spa_config_exit(spa, SCL_STATE, spa); 4394 mutex_exit(&ztest_vdev_lock); 4395 mutex_exit(&ztest_checkpoint_lock); 4396 return; 4397 } 4398 4399 top = ztest_random_vdev_top(spa, B_TRUE); 4400 4401 tvd = spa->spa_root_vdev->vdev_child[top]; 4402 mg = tvd->vdev_mg; 4403 mc = mg->mg_class; 4404 old_ms_count = tvd->vdev_ms_count; 4405 old_class_space = metaslab_class_get_space(mc); 4406 4407 /* 4408 * Determine the size of the first leaf vdev associated with 4409 * our top-level device. 4410 */ 4411 vd = vdev_walk_tree(tvd, NULL, NULL); 4412 ASSERT3P(vd, !=, NULL); 4413 ASSERT(vd->vdev_ops->vdev_op_leaf); 4414 4415 psize = vd->vdev_psize; 4416 4417 /* 4418 * We only try to expand the vdev if it's healthy, less than 4x its 4419 * original size, and it has a valid psize. 4420 */ 4421 if (tvd->vdev_state != VDEV_STATE_HEALTHY || 4422 psize == 0 || psize >= 4 * ztest_opts.zo_vdev_size) { 4423 spa_config_exit(spa, SCL_STATE, spa); 4424 mutex_exit(&ztest_vdev_lock); 4425 mutex_exit(&ztest_checkpoint_lock); 4426 return; 4427 } 4428 ASSERT3U(psize, >, 0); 4429 newsize = psize + MAX(psize / 8, SPA_MAXBLOCKSIZE); 4430 ASSERT3U(newsize, >, psize); 4431 4432 if (ztest_opts.zo_verbose >= 6) { 4433 (void) printf("Expanding LUN %s from %lu to %lu\n", 4434 vd->vdev_path, (ulong_t)psize, (ulong_t)newsize); 4435 } 4436 4437 /* 4438 * Growing the vdev is a two step process: 4439 * 1). expand the physical size (i.e. relabel) 4440 * 2). online the vdev to create the new metaslabs 4441 */ 4442 if (vdev_walk_tree(tvd, grow_vdev, &newsize) != NULL || 4443 vdev_walk_tree(tvd, online_vdev, NULL) != NULL || 4444 tvd->vdev_state != VDEV_STATE_HEALTHY) { 4445 if (ztest_opts.zo_verbose >= 5) { 4446 (void) printf("Could not expand LUN because " 4447 "the vdev configuration changed.\n"); 4448 } 4449 spa_config_exit(spa, SCL_STATE, spa); 4450 mutex_exit(&ztest_vdev_lock); 4451 mutex_exit(&ztest_checkpoint_lock); 4452 return; 4453 } 4454 4455 spa_config_exit(spa, SCL_STATE, spa); 4456 4457 /* 4458 * Expanding the LUN will update the config asynchronously, 4459 * thus we must wait for the async thread to complete any 4460 * pending tasks before proceeding. 
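 * We poll spa_async_thread and spa_async_tasks under spa_async_lock,
 * waiting for a txg to sync between checks, until no async work
 * remains.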
4461 */ 4462 for (;;) { 4463 boolean_t done; 4464 mutex_enter(&spa->spa_async_lock); 4465 done = (spa->spa_async_thread == NULL && !spa->spa_async_tasks); 4466 mutex_exit(&spa->spa_async_lock); 4467 if (done) 4468 break; 4469 txg_wait_synced(spa_get_dsl(spa), 0); 4470 (void) poll(NULL, 0, 100); 4471 } 4472 4473 spa_config_enter(spa, SCL_STATE, spa, RW_READER); 4474 4475 tvd = spa->spa_root_vdev->vdev_child[top]; 4476 new_ms_count = tvd->vdev_ms_count; 4477 new_class_space = metaslab_class_get_space(mc); 4478 4479 if (tvd->vdev_mg != mg || mg->mg_class != mc) { 4480 if (ztest_opts.zo_verbose >= 5) { 4481 (void) printf("Could not verify LUN expansion due to " 4482 "intervening vdev offline or remove.\n"); 4483 } 4484 spa_config_exit(spa, SCL_STATE, spa); 4485 mutex_exit(&ztest_vdev_lock); 4486 mutex_exit(&ztest_checkpoint_lock); 4487 return; 4488 } 4489 4490 /* 4491 * Make sure we were able to grow the vdev. 4492 */ 4493 if (new_ms_count <= old_ms_count) { 4494 fatal(B_FALSE, 4495 "LUN expansion failed: ms_count %"PRIu64" < %"PRIu64"\n", 4496 old_ms_count, new_ms_count); 4497 } 4498 4499 /* 4500 * Make sure we were able to grow the pool. 4501 */ 4502 if (new_class_space <= old_class_space) { 4503 fatal(B_FALSE, 4504 "LUN expansion failed: class_space %"PRIu64" < %"PRIu64"\n", 4505 old_class_space, new_class_space); 4506 } 4507 4508 if (ztest_opts.zo_verbose >= 5) { 4509 char oldnumbuf[NN_NUMBUF_SZ], newnumbuf[NN_NUMBUF_SZ]; 4510 4511 nicenum(old_class_space, oldnumbuf, sizeof (oldnumbuf)); 4512 nicenum(new_class_space, newnumbuf, sizeof (newnumbuf)); 4513 (void) printf("%s grew from %s to %s\n", 4514 spa->spa_name, oldnumbuf, newnumbuf); 4515 } 4516 4517 spa_config_exit(spa, SCL_STATE, spa); 4518 mutex_exit(&ztest_vdev_lock); 4519 mutex_exit(&ztest_checkpoint_lock); 4520 } 4521 4522 /* 4523 * Verify that dmu_objset_{create,destroy,open,close} work as expected. 4524 */ 4525 static void 4526 ztest_objset_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx) 4527 { 4528 (void) arg, (void) cr; 4529 4530 /* 4531 * Create the objects common to all ztest datasets. 4532 */ 4533 VERIFY0(zap_create_claim(os, ZTEST_DIROBJ, 4534 DMU_OT_ZAP_OTHER, DMU_OT_NONE, 0, tx)); 4535 } 4536 4537 static int 4538 ztest_dataset_create(char *dsname) 4539 { 4540 int err; 4541 uint64_t rand; 4542 dsl_crypto_params_t *dcp = NULL; 4543 4544 /* 4545 * 50% of the time, we create encrypted datasets 4546 * using a random cipher suite and a hard-coded 4547 * wrapping key. 4548 */ 4549 rand = ztest_random(2); 4550 if (rand != 0) { 4551 nvlist_t *crypto_args = fnvlist_alloc(); 4552 nvlist_t *props = fnvlist_alloc(); 4553 4554 /* slight bias towards the default cipher suite */ 4555 rand = ztest_random(ZIO_CRYPT_FUNCTIONS); 4556 if (rand < ZIO_CRYPT_AES_128_CCM) 4557 rand = ZIO_CRYPT_ON; 4558 4559 fnvlist_add_uint64(props, 4560 zfs_prop_to_name(ZFS_PROP_ENCRYPTION), rand); 4561 fnvlist_add_uint8_array(crypto_args, "wkeydata", 4562 (uint8_t *)ztest_wkeydata, WRAPPING_KEY_LEN); 4563 4564 /* 4565 * These parameters aren't really used by the kernel. They 4566 * are simply stored so that userspace knows how to load 4567 * the wrapping key. 
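 * A raw key format with keylocation=prompt and zeroed PBKDF2
 * parameters is sufficient for that purpose here.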
4568 */ 4569 fnvlist_add_uint64(props, 4570 zfs_prop_to_name(ZFS_PROP_KEYFORMAT), ZFS_KEYFORMAT_RAW); 4571 fnvlist_add_string(props, 4572 zfs_prop_to_name(ZFS_PROP_KEYLOCATION), "prompt"); 4573 fnvlist_add_uint64(props, 4574 zfs_prop_to_name(ZFS_PROP_PBKDF2_SALT), 0ULL); 4575 fnvlist_add_uint64(props, 4576 zfs_prop_to_name(ZFS_PROP_PBKDF2_ITERS), 0ULL); 4577 4578 VERIFY0(dsl_crypto_params_create_nvlist(DCP_CMD_NONE, props, 4579 crypto_args, &dcp)); 4580 4581 /* 4582 * Cycle through all available encryption implementations 4583 * to verify interoperability. 4584 */ 4585 VERIFY0(gcm_impl_set("cycle")); 4586 VERIFY0(aes_impl_set("cycle")); 4587 4588 fnvlist_free(crypto_args); 4589 fnvlist_free(props); 4590 } 4591 4592 err = dmu_objset_create(dsname, DMU_OST_OTHER, 0, dcp, 4593 ztest_objset_create_cb, NULL); 4594 dsl_crypto_params_free(dcp, !!err); 4595 4596 rand = ztest_random(100); 4597 if (err || rand < 80) 4598 return (err); 4599 4600 if (ztest_opts.zo_verbose >= 5) 4601 (void) printf("Setting dataset %s to sync always\n", dsname); 4602 return (ztest_dsl_prop_set_uint64(dsname, ZFS_PROP_SYNC, 4603 ZFS_SYNC_ALWAYS, B_FALSE)); 4604 } 4605 4606 static int 4607 ztest_objset_destroy_cb(const char *name, void *arg) 4608 { 4609 (void) arg; 4610 objset_t *os; 4611 dmu_object_info_t doi; 4612 int error; 4613 4614 /* 4615 * Verify that the dataset contains a directory object. 4616 */ 4617 VERIFY0(ztest_dmu_objset_own(name, DMU_OST_OTHER, B_TRUE, 4618 B_TRUE, FTAG, &os)); 4619 error = dmu_object_info(os, ZTEST_DIROBJ, &doi); 4620 if (error != ENOENT) { 4621 /* We could have crashed in the middle of destroying it */ 4622 ASSERT0(error); 4623 ASSERT3U(doi.doi_type, ==, DMU_OT_ZAP_OTHER); 4624 ASSERT3S(doi.doi_physical_blocks_512, >=, 0); 4625 } 4626 dmu_objset_disown(os, B_TRUE, FTAG); 4627 4628 /* 4629 * Destroy the dataset. 4630 */ 4631 if (strchr(name, '@') != NULL) { 4632 error = dsl_destroy_snapshot(name, B_TRUE); 4633 if (error != ECHRNG) { 4634 /* 4635 * The program was executed, but encountered a runtime 4636 * error, such as insufficient slop, or a hold on the 4637 * dataset. 
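 * ECHRNG is how such a channel program runtime failure is reported,
 * which is why it is tolerated above; any other error is treated as
 * a bug.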
4638 */ 4639 ASSERT0(error); 4640 } 4641 } else { 4642 error = dsl_destroy_head(name); 4643 if (error == ENOSPC) { 4644 /* There could be checkpoint or insufficient slop */ 4645 ztest_record_enospc(FTAG); 4646 } else if (error != EBUSY) { 4647 /* There could be a hold on this dataset */ 4648 ASSERT0(error); 4649 } 4650 } 4651 return (0); 4652 } 4653 4654 static boolean_t 4655 ztest_snapshot_create(char *osname, uint64_t id) 4656 { 4657 char snapname[ZFS_MAX_DATASET_NAME_LEN]; 4658 int error; 4659 4660 (void) snprintf(snapname, sizeof (snapname), "%"PRIu64"", id); 4661 4662 error = dmu_objset_snapshot_one(osname, snapname); 4663 if (error == ENOSPC) { 4664 ztest_record_enospc(FTAG); 4665 return (B_FALSE); 4666 } 4667 if (error != 0 && error != EEXIST && error != ECHRNG) { 4668 fatal(B_FALSE, "ztest_snapshot_create(%s@%s) = %d", osname, 4669 snapname, error); 4670 } 4671 return (B_TRUE); 4672 } 4673 4674 static boolean_t 4675 ztest_snapshot_destroy(char *osname, uint64_t id) 4676 { 4677 char snapname[ZFS_MAX_DATASET_NAME_LEN]; 4678 int error; 4679 4680 (void) snprintf(snapname, sizeof (snapname), "%s@%"PRIu64"", 4681 osname, id); 4682 4683 error = dsl_destroy_snapshot(snapname, B_FALSE); 4684 if (error != 0 && error != ENOENT && error != ECHRNG) 4685 fatal(B_FALSE, "ztest_snapshot_destroy(%s) = %d", 4686 snapname, error); 4687 return (B_TRUE); 4688 } 4689 4690 void 4691 ztest_dmu_objset_create_destroy(ztest_ds_t *zd, uint64_t id) 4692 { 4693 (void) zd; 4694 ztest_ds_t *zdtmp; 4695 int iters; 4696 int error; 4697 objset_t *os, *os2; 4698 char name[ZFS_MAX_DATASET_NAME_LEN]; 4699 zilog_t *zilog; 4700 int i; 4701 4702 zdtmp = umem_alloc(sizeof (ztest_ds_t), UMEM_NOFAIL); 4703 4704 (void) pthread_rwlock_rdlock(&ztest_name_lock); 4705 4706 (void) snprintf(name, sizeof (name), "%s/temp_%"PRIu64"", 4707 ztest_opts.zo_pool, id); 4708 4709 /* 4710 * If this dataset exists from a previous run, process its replay log 4711 * half of the time. If we don't replay it, then dsl_destroy_head() 4712 * (invoked from ztest_objset_destroy_cb()) should just throw it away. 4713 */ 4714 if (ztest_random(2) == 0 && 4715 ztest_dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, 4716 B_TRUE, FTAG, &os) == 0) { 4717 ztest_zd_init(zdtmp, NULL, os); 4718 zil_replay(os, zdtmp, ztest_replay_vector); 4719 ztest_zd_fini(zdtmp); 4720 dmu_objset_disown(os, B_TRUE, FTAG); 4721 } 4722 4723 /* 4724 * There may be an old instance of the dataset we're about to 4725 * create lying around from a previous run. If so, destroy it 4726 * and all of its snapshots. 4727 */ 4728 (void) dmu_objset_find(name, ztest_objset_destroy_cb, NULL, 4729 DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS); 4730 4731 /* 4732 * Verify that the destroyed dataset is no longer in the namespace. 4733 * It may still be present if the destroy above fails with ENOSPC. 4734 */ 4735 error = ztest_dmu_objset_own(name, DMU_OST_OTHER, B_TRUE, B_TRUE, 4736 FTAG, &os); 4737 if (error == 0) { 4738 dmu_objset_disown(os, B_TRUE, FTAG); 4739 ztest_record_enospc(FTAG); 4740 goto out; 4741 } 4742 VERIFY3U(ENOENT, ==, error); 4743 4744 /* 4745 * Verify that we can create a new dataset. 4746 */ 4747 error = ztest_dataset_create(name); 4748 if (error) { 4749 if (error == ENOSPC) { 4750 ztest_record_enospc(FTAG); 4751 goto out; 4752 } 4753 fatal(B_FALSE, "dmu_objset_create(%s) = %d", name, error); 4754 } 4755 4756 VERIFY0(ztest_dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, B_TRUE, 4757 FTAG, &os)); 4758 4759 ztest_zd_init(zdtmp, NULL, os); 4760 4761 /* 4762 * Open the intent log for it. 
4763 */ 4764 zilog = zil_open(os, ztest_get_data, NULL); 4765 4766 /* 4767 * Put some objects in there, do a little I/O to them, 4768 * and randomly take a couple of snapshots along the way. 4769 */ 4770 iters = ztest_random(5); 4771 for (i = 0; i < iters; i++) { 4772 ztest_dmu_object_alloc_free(zdtmp, id); 4773 if (ztest_random(iters) == 0) 4774 (void) ztest_snapshot_create(name, i); 4775 } 4776 4777 /* 4778 * Verify that we cannot create an existing dataset. 4779 */ 4780 VERIFY3U(EEXIST, ==, 4781 dmu_objset_create(name, DMU_OST_OTHER, 0, NULL, NULL, NULL)); 4782 4783 /* 4784 * Verify that we can hold an objset that is also owned. 4785 */ 4786 VERIFY0(dmu_objset_hold(name, FTAG, &os2)); 4787 dmu_objset_rele(os2, FTAG); 4788 4789 /* 4790 * Verify that we cannot own an objset that is already owned. 4791 */ 4792 VERIFY3U(EBUSY, ==, ztest_dmu_objset_own(name, DMU_OST_OTHER, 4793 B_FALSE, B_TRUE, FTAG, &os2)); 4794 4795 zil_close(zilog); 4796 dmu_objset_disown(os, B_TRUE, FTAG); 4797 ztest_zd_fini(zdtmp); 4798 out: 4799 (void) pthread_rwlock_unlock(&ztest_name_lock); 4800 4801 umem_free(zdtmp, sizeof (ztest_ds_t)); 4802 } 4803 4804 /* 4805 * Verify that dmu_snapshot_{create,destroy,open,close} work as expected. 4806 */ 4807 void 4808 ztest_dmu_snapshot_create_destroy(ztest_ds_t *zd, uint64_t id) 4809 { 4810 (void) pthread_rwlock_rdlock(&ztest_name_lock); 4811 (void) ztest_snapshot_destroy(zd->zd_name, id); 4812 (void) ztest_snapshot_create(zd->zd_name, id); 4813 (void) pthread_rwlock_unlock(&ztest_name_lock); 4814 } 4815 4816 /* 4817 * Cleanup non-standard snapshots and clones. 4818 */ 4819 static void 4820 ztest_dsl_dataset_cleanup(char *osname, uint64_t id) 4821 { 4822 char *snap1name; 4823 char *clone1name; 4824 char *snap2name; 4825 char *clone2name; 4826 char *snap3name; 4827 int error; 4828 4829 snap1name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4830 clone1name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4831 snap2name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4832 clone2name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4833 snap3name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4834 4835 (void) snprintf(snap1name, ZFS_MAX_DATASET_NAME_LEN, "%s@s1_%"PRIu64"", 4836 osname, id); 4837 (void) snprintf(clone1name, ZFS_MAX_DATASET_NAME_LEN, "%s/c1_%"PRIu64"", 4838 osname, id); 4839 (void) snprintf(snap2name, ZFS_MAX_DATASET_NAME_LEN, "%s@s2_%"PRIu64"", 4840 clone1name, id); 4841 (void) snprintf(clone2name, ZFS_MAX_DATASET_NAME_LEN, "%s/c2_%"PRIu64"", 4842 osname, id); 4843 (void) snprintf(snap3name, ZFS_MAX_DATASET_NAME_LEN, "%s@s3_%"PRIu64"", 4844 clone1name, id); 4845 4846 error = dsl_destroy_head(clone2name); 4847 if (error && error != ENOENT) 4848 fatal(B_FALSE, "dsl_destroy_head(%s) = %d", clone2name, error); 4849 error = dsl_destroy_snapshot(snap3name, B_FALSE); 4850 if (error && error != ENOENT) 4851 fatal(B_FALSE, "dsl_destroy_snapshot(%s) = %d", 4852 snap3name, error); 4853 error = dsl_destroy_snapshot(snap2name, B_FALSE); 4854 if (error && error != ENOENT) 4855 fatal(B_FALSE, "dsl_destroy_snapshot(%s) = %d", 4856 snap2name, error); 4857 error = dsl_destroy_head(clone1name); 4858 if (error && error != ENOENT) 4859 fatal(B_FALSE, "dsl_destroy_head(%s) = %d", clone1name, error); 4860 error = dsl_destroy_snapshot(snap1name, B_FALSE); 4861 if (error && error != ENOENT) 4862 fatal(B_FALSE, "dsl_destroy_snapshot(%s) = %d", 4863 snap1name, error); 4864 4865 umem_free(snap1name, ZFS_MAX_DATASET_NAME_LEN); 4866 umem_free(clone1name, 
ZFS_MAX_DATASET_NAME_LEN); 4867 umem_free(snap2name, ZFS_MAX_DATASET_NAME_LEN); 4868 umem_free(clone2name, ZFS_MAX_DATASET_NAME_LEN); 4869 umem_free(snap3name, ZFS_MAX_DATASET_NAME_LEN); 4870 } 4871 4872 /* 4873 * Verify dsl_dataset_promote handles EBUSY 4874 */ 4875 void 4876 ztest_dsl_dataset_promote_busy(ztest_ds_t *zd, uint64_t id) 4877 { 4878 objset_t *os; 4879 char *snap1name; 4880 char *clone1name; 4881 char *snap2name; 4882 char *clone2name; 4883 char *snap3name; 4884 char *osname = zd->zd_name; 4885 int error; 4886 4887 snap1name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4888 clone1name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4889 snap2name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4890 clone2name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4891 snap3name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4892 4893 (void) pthread_rwlock_rdlock(&ztest_name_lock); 4894 4895 ztest_dsl_dataset_cleanup(osname, id); 4896 4897 (void) snprintf(snap1name, ZFS_MAX_DATASET_NAME_LEN, "%s@s1_%"PRIu64"", 4898 osname, id); 4899 (void) snprintf(clone1name, ZFS_MAX_DATASET_NAME_LEN, "%s/c1_%"PRIu64"", 4900 osname, id); 4901 (void) snprintf(snap2name, ZFS_MAX_DATASET_NAME_LEN, "%s@s2_%"PRIu64"", 4902 clone1name, id); 4903 (void) snprintf(clone2name, ZFS_MAX_DATASET_NAME_LEN, "%s/c2_%"PRIu64"", 4904 osname, id); 4905 (void) snprintf(snap3name, ZFS_MAX_DATASET_NAME_LEN, "%s@s3_%"PRIu64"", 4906 clone1name, id); 4907 4908 error = dmu_objset_snapshot_one(osname, strchr(snap1name, '@') + 1); 4909 if (error && error != EEXIST) { 4910 if (error == ENOSPC) { 4911 ztest_record_enospc(FTAG); 4912 goto out; 4913 } 4914 fatal(B_FALSE, "dmu_take_snapshot(%s) = %d", snap1name, error); 4915 } 4916 4917 error = dmu_objset_clone(clone1name, snap1name); 4918 if (error) { 4919 if (error == ENOSPC) { 4920 ztest_record_enospc(FTAG); 4921 goto out; 4922 } 4923 fatal(B_FALSE, "dmu_objset_create(%s) = %d", clone1name, error); 4924 } 4925 4926 error = dmu_objset_snapshot_one(clone1name, strchr(snap2name, '@') + 1); 4927 if (error && error != EEXIST) { 4928 if (error == ENOSPC) { 4929 ztest_record_enospc(FTAG); 4930 goto out; 4931 } 4932 fatal(B_FALSE, "dmu_open_snapshot(%s) = %d", snap2name, error); 4933 } 4934 4935 error = dmu_objset_snapshot_one(clone1name, strchr(snap3name, '@') + 1); 4936 if (error && error != EEXIST) { 4937 if (error == ENOSPC) { 4938 ztest_record_enospc(FTAG); 4939 goto out; 4940 } 4941 fatal(B_FALSE, "dmu_open_snapshot(%s) = %d", snap3name, error); 4942 } 4943 4944 error = dmu_objset_clone(clone2name, snap3name); 4945 if (error) { 4946 if (error == ENOSPC) { 4947 ztest_record_enospc(FTAG); 4948 goto out; 4949 } 4950 fatal(B_FALSE, "dmu_objset_create(%s) = %d", clone2name, error); 4951 } 4952 4953 error = ztest_dmu_objset_own(snap2name, DMU_OST_ANY, B_TRUE, B_TRUE, 4954 FTAG, &os); 4955 if (error) 4956 fatal(B_FALSE, "dmu_objset_own(%s) = %d", snap2name, error); 4957 error = dsl_dataset_promote(clone2name, NULL); 4958 if (error == ENOSPC) { 4959 dmu_objset_disown(os, B_TRUE, FTAG); 4960 ztest_record_enospc(FTAG); 4961 goto out; 4962 } 4963 if (error != EBUSY) 4964 fatal(B_FALSE, "dsl_dataset_promote(%s), %d, not EBUSY", 4965 clone2name, error); 4966 dmu_objset_disown(os, B_TRUE, FTAG); 4967 4968 out: 4969 ztest_dsl_dataset_cleanup(osname, id); 4970 4971 (void) pthread_rwlock_unlock(&ztest_name_lock); 4972 4973 umem_free(snap1name, ZFS_MAX_DATASET_NAME_LEN); 4974 umem_free(clone1name, ZFS_MAX_DATASET_NAME_LEN); 4975 umem_free(snap2name, 
ZFS_MAX_DATASET_NAME_LEN); 4976 umem_free(clone2name, ZFS_MAX_DATASET_NAME_LEN); 4977 umem_free(snap3name, ZFS_MAX_DATASET_NAME_LEN); 4978 } 4979 4980 #undef OD_ARRAY_SIZE 4981 #define OD_ARRAY_SIZE 4 4982 4983 /* 4984 * Verify that dmu_object_{alloc,free} work as expected. 4985 */ 4986 void 4987 ztest_dmu_object_alloc_free(ztest_ds_t *zd, uint64_t id) 4988 { 4989 ztest_od_t *od; 4990 int batchsize; 4991 int size; 4992 int b; 4993 4994 size = sizeof (ztest_od_t) * OD_ARRAY_SIZE; 4995 od = umem_alloc(size, UMEM_NOFAIL); 4996 batchsize = OD_ARRAY_SIZE; 4997 4998 for (b = 0; b < batchsize; b++) 4999 ztest_od_init(od + b, id, FTAG, b, DMU_OT_UINT64_OTHER, 5000 0, 0, 0); 5001 5002 /* 5003 * Destroy the previous batch of objects, create a new batch, 5004 * and do some I/O on the new objects. 5005 */ 5006 if (ztest_object_init(zd, od, size, B_TRUE) != 0) { 5007 zd->zd_od = NULL; 5008 umem_free(od, size); 5009 return; 5010 } 5011 5012 while (ztest_random(4 * batchsize) != 0) 5013 ztest_io(zd, od[ztest_random(batchsize)].od_object, 5014 ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT); 5015 5016 umem_free(od, size); 5017 } 5018 5019 /* 5020 * Rewind the global allocator to verify object allocation backfilling. 5021 */ 5022 void 5023 ztest_dmu_object_next_chunk(ztest_ds_t *zd, uint64_t id) 5024 { 5025 (void) id; 5026 objset_t *os = zd->zd_os; 5027 uint_t dnodes_per_chunk = 1 << dmu_object_alloc_chunk_shift; 5028 uint64_t object; 5029 5030 /* 5031 * Rewind the global allocator randomly back to a lower object number 5032 * to force backfilling and reclamation of recently freed dnodes. 5033 */ 5034 mutex_enter(&os->os_obj_lock); 5035 object = ztest_random(os->os_obj_next_chunk); 5036 os->os_obj_next_chunk = P2ALIGN_TYPED(object, dnodes_per_chunk, 5037 uint64_t); 5038 mutex_exit(&os->os_obj_lock); 5039 } 5040 5041 #undef OD_ARRAY_SIZE 5042 #define OD_ARRAY_SIZE 2 5043 5044 /* 5045 * Verify that dmu_{read,write} work as expected. 5046 */ 5047 void 5048 ztest_dmu_read_write(ztest_ds_t *zd, uint64_t id) 5049 { 5050 int size; 5051 ztest_od_t *od; 5052 5053 objset_t *os = zd->zd_os; 5054 size = sizeof (ztest_od_t) * OD_ARRAY_SIZE; 5055 od = umem_alloc(size, UMEM_NOFAIL); 5056 dmu_tx_t *tx; 5057 int freeit, error; 5058 uint64_t i, n, s, txg; 5059 bufwad_t *packbuf, *bigbuf, *pack, *bigH, *bigT; 5060 uint64_t packobj, packoff, packsize, bigobj, bigoff, bigsize; 5061 uint64_t chunksize = (1000 + ztest_random(1000)) * sizeof (uint64_t); 5062 uint64_t regions = 997; 5063 uint64_t stride = 123456789ULL; 5064 uint64_t width = 40; 5065 int free_percent = 5; 5066 uint32_t dmu_read_flags = DMU_READ_PREFETCH; 5067 5068 /* 5069 * We will randomly set when to do O_DIRECT on a read. 5070 */ 5071 if (ztest_random(4) == 0) 5072 dmu_read_flags |= DMU_DIRECTIO; 5073 5074 /* 5075 * This test uses two objects, packobj and bigobj, that are always 5076 * updated together (i.e. in the same tx) so that their contents are 5077 * in sync and can be compared. Their contents relate to each other 5078 * in a simple way: packobj is a dense array of 'bufwad' structures, 5079 * while bigobj is a sparse array of the same bufwads. Specifically, 5080 * for any index n, there are three bufwads that should be identical: 5081 * 5082 * packobj, at offset n * sizeof (bufwad_t) 5083 * bigobj, at the head of the nth chunk 5084 * bigobj, at the tail of the nth chunk 5085 * 5086 * The chunk size is arbitrary. It doesn't have to be a power of two, 5087 * and it doesn't have any relation to the object blocksize. 
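 *
 * For example, with chunk size C, the bufwads for index n live at:
 *
 *	packobj:  offset n * sizeof (bufwad_t)
 *	bigobj:   offset n * C                            (head of chunk n)
 *	bigobj:   offset (n + 1) * C - sizeof (bufwad_t)  (tail of chunk n)
 *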
5088 * The only requirement is that it can hold at least two bufwads. 5089 * 5090 * Normally, we write the bufwad to each of these locations. 5091 * However, free_percent of the time we instead write zeroes to 5092 * packobj and perform a dmu_free_range() on bigobj. By comparing 5093 * bigobj to packobj, we can verify that the DMU is correctly 5094 * tracking which parts of an object are allocated and free, 5095 * and that the contents of the allocated blocks are correct. 5096 */ 5097 5098 /* 5099 * Read the directory info. If it's the first time, set things up. 5100 */ 5101 ztest_od_init(od, id, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0, chunksize); 5102 ztest_od_init(od + 1, id, FTAG, 1, DMU_OT_UINT64_OTHER, 0, 0, 5103 chunksize); 5104 5105 if (ztest_object_init(zd, od, size, B_FALSE) != 0) { 5106 umem_free(od, size); 5107 return; 5108 } 5109 5110 bigobj = od[0].od_object; 5111 packobj = od[1].od_object; 5112 chunksize = od[0].od_gen; 5113 ASSERT3U(chunksize, ==, od[1].od_gen); 5114 5115 /* 5116 * Prefetch a random chunk of the big object. 5117 * Our aim here is to get some async reads in flight 5118 * for blocks that we may free below; the DMU should 5119 * handle this race correctly. 5120 */ 5121 n = ztest_random(regions) * stride + ztest_random(width); 5122 s = 1 + ztest_random(2 * width - 1); 5123 dmu_prefetch(os, bigobj, 0, n * chunksize, s * chunksize, 5124 ZIO_PRIORITY_SYNC_READ); 5125 5126 /* 5127 * Pick a random index and compute the offsets into packobj and bigobj. 5128 */ 5129 n = ztest_random(regions) * stride + ztest_random(width); 5130 s = 1 + ztest_random(width - 1); 5131 5132 packoff = n * sizeof (bufwad_t); 5133 packsize = s * sizeof (bufwad_t); 5134 5135 bigoff = n * chunksize; 5136 bigsize = s * chunksize; 5137 5138 packbuf = umem_alloc(packsize, UMEM_NOFAIL); 5139 bigbuf = umem_alloc(bigsize, UMEM_NOFAIL); 5140 5141 /* 5142 * free_percent of the time, free a range of bigobj rather than 5143 * overwriting it. 5144 */ 5145 freeit = (ztest_random(100) < free_percent); 5146 5147 /* 5148 * Read the current contents of our objects. 5149 */ 5150 error = dmu_read(os, packobj, packoff, packsize, packbuf, 5151 dmu_read_flags); 5152 ASSERT0(error); 5153 error = dmu_read(os, bigobj, bigoff, bigsize, bigbuf, 5154 dmu_read_flags); 5155 ASSERT0(error); 5156 5157 /* 5158 * Get a tx for the mods to both packobj and bigobj. 5159 */ 5160 tx = dmu_tx_create(os); 5161 5162 dmu_tx_hold_write(tx, packobj, packoff, packsize); 5163 5164 if (freeit) 5165 dmu_tx_hold_free(tx, bigobj, bigoff, bigsize); 5166 else 5167 dmu_tx_hold_write(tx, bigobj, bigoff, bigsize); 5168 5169 /* This accounts for setting the checksum/compression. */ 5170 dmu_tx_hold_bonus(tx, bigobj); 5171 5172 txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); 5173 if (txg == 0) { 5174 umem_free(packbuf, packsize); 5175 umem_free(bigbuf, bigsize); 5176 umem_free(od, size); 5177 return; 5178 } 5179 5180 enum zio_checksum cksum; 5181 do { 5182 cksum = (enum zio_checksum) 5183 ztest_random_dsl_prop(ZFS_PROP_CHECKSUM); 5184 } while (cksum >= ZIO_CHECKSUM_LEGACY_FUNCTIONS); 5185 dmu_object_set_checksum(os, bigobj, cksum, tx); 5186 5187 enum zio_compress comp; 5188 do { 5189 comp = (enum zio_compress) 5190 ztest_random_dsl_prop(ZFS_PROP_COMPRESSION); 5191 } while (comp >= ZIO_COMPRESS_LEGACY_FUNCTIONS); 5192 dmu_object_set_compress(os, bigobj, comp, tx); 5193 5194 /* 5195 * For each index from n to n + s, verify that the existing bufwad 5196 * in packobj matches the bufwads at the head and tail of the 5197 * corresponding chunk in bigobj. 
Then update all three bufwads 5198 * with the new values we want to write out. 5199 */ 5200 for (i = 0; i < s; i++) { 5201 /* LINTED */ 5202 pack = (bufwad_t *)((char *)packbuf + i * sizeof (bufwad_t)); 5203 /* LINTED */ 5204 bigH = (bufwad_t *)((char *)bigbuf + i * chunksize); 5205 /* LINTED */ 5206 bigT = (bufwad_t *)((char *)bigH + chunksize) - 1; 5207 5208 ASSERT3U((uintptr_t)bigH - (uintptr_t)bigbuf, <, bigsize); 5209 ASSERT3U((uintptr_t)bigT - (uintptr_t)bigbuf, <, bigsize); 5210 5211 if (pack->bw_txg > txg) 5212 fatal(B_FALSE, 5213 "future leak: got %"PRIx64", open txg is %"PRIx64"", 5214 pack->bw_txg, txg); 5215 5216 if (pack->bw_data != 0 && pack->bw_index != n + i) 5217 fatal(B_FALSE, "wrong index: " 5218 "got %"PRIx64", wanted %"PRIx64"+%"PRIx64"", 5219 pack->bw_index, n, i); 5220 5221 if (memcmp(pack, bigH, sizeof (bufwad_t)) != 0) 5222 fatal(B_FALSE, "pack/bigH mismatch in %p/%p", 5223 pack, bigH); 5224 5225 if (memcmp(pack, bigT, sizeof (bufwad_t)) != 0) 5226 fatal(B_FALSE, "pack/bigT mismatch in %p/%p", 5227 pack, bigT); 5228 5229 if (freeit) { 5230 memset(pack, 0, sizeof (bufwad_t)); 5231 } else { 5232 pack->bw_index = n + i; 5233 pack->bw_txg = txg; 5234 pack->bw_data = 1 + ztest_random(-2ULL); 5235 } 5236 *bigH = *pack; 5237 *bigT = *pack; 5238 } 5239 5240 /* 5241 * We've verified all the old bufwads, and made new ones. 5242 * Now write them out. 5243 */ 5244 dmu_write(os, packobj, packoff, packsize, packbuf, tx); 5245 5246 if (freeit) { 5247 if (ztest_opts.zo_verbose >= 7) { 5248 (void) printf("freeing offset %"PRIx64" size %"PRIx64"" 5249 " txg %"PRIx64"\n", 5250 bigoff, bigsize, txg); 5251 } 5252 VERIFY0(dmu_free_range(os, bigobj, bigoff, bigsize, tx)); 5253 } else { 5254 if (ztest_opts.zo_verbose >= 7) { 5255 (void) printf("writing offset %"PRIx64" size %"PRIx64"" 5256 " txg %"PRIx64"\n", 5257 bigoff, bigsize, txg); 5258 } 5259 dmu_write(os, bigobj, bigoff, bigsize, bigbuf, tx); 5260 } 5261 5262 dmu_tx_commit(tx); 5263 5264 /* 5265 * Sanity check the stuff we just wrote. 5266 */ 5267 { 5268 void *packcheck = umem_alloc(packsize, UMEM_NOFAIL); 5269 void *bigcheck = umem_alloc(bigsize, UMEM_NOFAIL); 5270 5271 VERIFY0(dmu_read(os, packobj, packoff, 5272 packsize, packcheck, dmu_read_flags)); 5273 VERIFY0(dmu_read(os, bigobj, bigoff, 5274 bigsize, bigcheck, dmu_read_flags)); 5275 5276 ASSERT0(memcmp(packbuf, packcheck, packsize)); 5277 ASSERT0(memcmp(bigbuf, bigcheck, bigsize)); 5278 5279 umem_free(packcheck, packsize); 5280 umem_free(bigcheck, bigsize); 5281 } 5282 5283 umem_free(packbuf, packsize); 5284 umem_free(bigbuf, bigsize); 5285 umem_free(od, size); 5286 } 5287 5288 static void 5289 compare_and_update_pbbufs(uint64_t s, bufwad_t *packbuf, bufwad_t *bigbuf, 5290 uint64_t bigsize, uint64_t n, uint64_t chunksize, uint64_t txg) 5291 { 5292 uint64_t i; 5293 bufwad_t *pack; 5294 bufwad_t *bigH; 5295 bufwad_t *bigT; 5296 5297 /* 5298 * For each index from n to n + s, verify that the existing bufwad 5299 * in packobj matches the bufwads at the head and tail of the 5300 * corresponding chunk in bigobj. Then update all three bufwads 5301 * with the new values we want to write out. 
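 * Unlike the inline verification loop in ztest_dmu_read_write(), this
 * helper has no freeit case: the zcopy test always rewrites the
 * bufwads.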
5302 */ 5303 for (i = 0; i < s; i++) { 5304 /* LINTED */ 5305 pack = (bufwad_t *)((char *)packbuf + i * sizeof (bufwad_t)); 5306 /* LINTED */ 5307 bigH = (bufwad_t *)((char *)bigbuf + i * chunksize); 5308 /* LINTED */ 5309 bigT = (bufwad_t *)((char *)bigH + chunksize) - 1; 5310 5311 ASSERT3U((uintptr_t)bigH - (uintptr_t)bigbuf, <, bigsize); 5312 ASSERT3U((uintptr_t)bigT - (uintptr_t)bigbuf, <, bigsize); 5313 5314 if (pack->bw_txg > txg) 5315 fatal(B_FALSE, 5316 "future leak: got %"PRIx64", open txg is %"PRIx64"", 5317 pack->bw_txg, txg); 5318 5319 if (pack->bw_data != 0 && pack->bw_index != n + i) 5320 fatal(B_FALSE, "wrong index: " 5321 "got %"PRIx64", wanted %"PRIx64"+%"PRIx64"", 5322 pack->bw_index, n, i); 5323 5324 if (memcmp(pack, bigH, sizeof (bufwad_t)) != 0) 5325 fatal(B_FALSE, "pack/bigH mismatch in %p/%p", 5326 pack, bigH); 5327 5328 if (memcmp(pack, bigT, sizeof (bufwad_t)) != 0) 5329 fatal(B_FALSE, "pack/bigT mismatch in %p/%p", 5330 pack, bigT); 5331 5332 pack->bw_index = n + i; 5333 pack->bw_txg = txg; 5334 pack->bw_data = 1 + ztest_random(-2ULL); 5335 5336 *bigH = *pack; 5337 *bigT = *pack; 5338 } 5339 } 5340 5341 #undef OD_ARRAY_SIZE 5342 #define OD_ARRAY_SIZE 2 5343 5344 void 5345 ztest_dmu_read_write_zcopy(ztest_ds_t *zd, uint64_t id) 5346 { 5347 objset_t *os = zd->zd_os; 5348 ztest_od_t *od; 5349 dmu_tx_t *tx; 5350 uint64_t i; 5351 int error; 5352 int size; 5353 uint64_t n, s, txg; 5354 bufwad_t *packbuf, *bigbuf; 5355 uint64_t packobj, packoff, packsize, bigobj, bigoff, bigsize; 5356 uint64_t blocksize = ztest_random_blocksize(); 5357 uint64_t chunksize = blocksize; 5358 uint64_t regions = 997; 5359 uint64_t stride = 123456789ULL; 5360 uint64_t width = 9; 5361 dmu_buf_t *bonus_db; 5362 arc_buf_t **bigbuf_arcbufs; 5363 dmu_object_info_t doi; 5364 uint32_t dmu_read_flags = DMU_READ_PREFETCH; 5365 5366 /* 5367 * We will randomly set when to do O_DIRECT on a read. 5368 */ 5369 if (ztest_random(4) == 0) 5370 dmu_read_flags |= DMU_DIRECTIO; 5371 5372 size = sizeof (ztest_od_t) * OD_ARRAY_SIZE; 5373 od = umem_alloc(size, UMEM_NOFAIL); 5374 5375 /* 5376 * This test uses two objects, packobj and bigobj, that are always 5377 * updated together (i.e. in the same tx) so that their contents are 5378 * in sync and can be compared. Their contents relate to each other 5379 * in a simple way: packobj is a dense array of 'bufwad' structures, 5380 * while bigobj is a sparse array of the same bufwads. Specifically, 5381 * for any index n, there are three bufwads that should be identical: 5382 * 5383 * packobj, at offset n * sizeof (bufwad_t) 5384 * bigobj, at the head of the nth chunk 5385 * bigobj, at the tail of the nth chunk 5386 * 5387 * The chunk size is set equal to bigobj block size so that 5388 * dmu_assign_arcbuf_by_dbuf() can be tested for object updates. 5389 */ 5390 5391 /* 5392 * Read the directory info. If it's the first time, set things up. 
5393 */ 5394 ztest_od_init(od, id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, 0, 0); 5395 ztest_od_init(od + 1, id, FTAG, 1, DMU_OT_UINT64_OTHER, 0, 0, 5396 chunksize); 5397 5398 5399 if (ztest_object_init(zd, od, size, B_FALSE) != 0) { 5400 umem_free(od, size); 5401 return; 5402 } 5403 5404 bigobj = od[0].od_object; 5405 packobj = od[1].od_object; 5406 blocksize = od[0].od_blocksize; 5407 chunksize = blocksize; 5408 ASSERT3U(chunksize, ==, od[1].od_gen); 5409 5410 VERIFY0(dmu_object_info(os, bigobj, &doi)); 5411 VERIFY(ISP2(doi.doi_data_block_size)); 5412 VERIFY3U(chunksize, ==, doi.doi_data_block_size); 5413 VERIFY3U(chunksize, >=, 2 * sizeof (bufwad_t)); 5414 5415 /* 5416 * Pick a random index and compute the offsets into packobj and bigobj. 5417 */ 5418 n = ztest_random(regions) * stride + ztest_random(width); 5419 s = 1 + ztest_random(width - 1); 5420 5421 packoff = n * sizeof (bufwad_t); 5422 packsize = s * sizeof (bufwad_t); 5423 5424 bigoff = n * chunksize; 5425 bigsize = s * chunksize; 5426 5427 packbuf = umem_zalloc(packsize, UMEM_NOFAIL); 5428 bigbuf = umem_zalloc(bigsize, UMEM_NOFAIL); 5429 5430 VERIFY0(dmu_bonus_hold(os, bigobj, FTAG, &bonus_db)); 5431 5432 bigbuf_arcbufs = umem_zalloc(2 * s * sizeof (arc_buf_t *), UMEM_NOFAIL); 5433 5434 /* 5435 * Iteration 0 test zcopy for DB_UNCACHED dbufs. 5436 * Iteration 1 test zcopy to already referenced dbufs. 5437 * Iteration 2 test zcopy to dirty dbuf in the same txg. 5438 * Iteration 3 test zcopy to dbuf dirty in previous txg. 5439 * Iteration 4 test zcopy when dbuf is no longer dirty. 5440 * Iteration 5 test zcopy when it can't be done. 5441 * Iteration 6 one more zcopy write. 5442 */ 5443 for (i = 0; i < 7; i++) { 5444 uint64_t j; 5445 uint64_t off; 5446 5447 /* 5448 * In iteration 5 (i == 5) use arcbufs 5449 * that don't match bigobj blksz to test 5450 * dmu_assign_arcbuf_by_dbuf() when it can't directly 5451 * assign an arcbuf to a dbuf. 5452 */ 5453 for (j = 0; j < s; j++) { 5454 if (i != 5 || chunksize < (SPA_MINBLOCKSIZE * 2)) { 5455 bigbuf_arcbufs[j] = 5456 dmu_request_arcbuf(bonus_db, chunksize); 5457 } else { 5458 bigbuf_arcbufs[2 * j] = 5459 dmu_request_arcbuf(bonus_db, chunksize / 2); 5460 bigbuf_arcbufs[2 * j + 1] = 5461 dmu_request_arcbuf(bonus_db, chunksize / 2); 5462 } 5463 } 5464 5465 /* 5466 * Get a tx for the mods to both packobj and bigobj. 5467 */ 5468 tx = dmu_tx_create(os); 5469 5470 dmu_tx_hold_write(tx, packobj, packoff, packsize); 5471 dmu_tx_hold_write(tx, bigobj, bigoff, bigsize); 5472 5473 txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); 5474 if (txg == 0) { 5475 umem_free(packbuf, packsize); 5476 umem_free(bigbuf, bigsize); 5477 for (j = 0; j < s; j++) { 5478 if (i != 5 || 5479 chunksize < (SPA_MINBLOCKSIZE * 2)) { 5480 dmu_return_arcbuf(bigbuf_arcbufs[j]); 5481 } else { 5482 dmu_return_arcbuf( 5483 bigbuf_arcbufs[2 * j]); 5484 dmu_return_arcbuf( 5485 bigbuf_arcbufs[2 * j + 1]); 5486 } 5487 } 5488 umem_free(bigbuf_arcbufs, 2 * s * sizeof (arc_buf_t *)); 5489 umem_free(od, size); 5490 dmu_buf_rele(bonus_db, FTAG); 5491 return; 5492 } 5493 5494 /* 5495 * 50% of the time don't read objects in the 1st iteration to 5496 * test dmu_assign_arcbuf_by_dbuf() for the case when there are 5497 * no existing dbufs for the specified offsets. 
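 * (Skipping the read leaves the dbufs uncached, which is exactly the
 * DB_UNCACHED case that iteration 0 is meant to exercise.)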
5498 */ 5499 if (i != 0 || ztest_random(2) != 0) { 5500 error = dmu_read(os, packobj, packoff, 5501 packsize, packbuf, dmu_read_flags); 5502 ASSERT0(error); 5503 error = dmu_read(os, bigobj, bigoff, bigsize, 5504 bigbuf, dmu_read_flags); 5505 ASSERT0(error); 5506 } 5507 compare_and_update_pbbufs(s, packbuf, bigbuf, bigsize, 5508 n, chunksize, txg); 5509 5510 /* 5511 * We've verified all the old bufwads, and made new ones. 5512 * Now write them out. 5513 */ 5514 dmu_write(os, packobj, packoff, packsize, packbuf, tx); 5515 if (ztest_opts.zo_verbose >= 7) { 5516 (void) printf("writing offset %"PRIx64" size %"PRIx64"" 5517 " txg %"PRIx64"\n", 5518 bigoff, bigsize, txg); 5519 } 5520 for (off = bigoff, j = 0; j < s; j++, off += chunksize) { 5521 dmu_buf_t *dbt; 5522 if (i != 5 || chunksize < (SPA_MINBLOCKSIZE * 2)) { 5523 memcpy(bigbuf_arcbufs[j]->b_data, 5524 (caddr_t)bigbuf + (off - bigoff), 5525 chunksize); 5526 } else { 5527 memcpy(bigbuf_arcbufs[2 * j]->b_data, 5528 (caddr_t)bigbuf + (off - bigoff), 5529 chunksize / 2); 5530 memcpy(bigbuf_arcbufs[2 * j + 1]->b_data, 5531 (caddr_t)bigbuf + (off - bigoff) + 5532 chunksize / 2, 5533 chunksize / 2); 5534 } 5535 5536 if (i == 1) { 5537 VERIFY(dmu_buf_hold(os, bigobj, off, 5538 FTAG, &dbt, DMU_READ_NO_PREFETCH) == 0); 5539 } 5540 if (i != 5 || chunksize < (SPA_MINBLOCKSIZE * 2)) { 5541 VERIFY0(dmu_assign_arcbuf_by_dbuf(bonus_db, 5542 off, bigbuf_arcbufs[j], tx)); 5543 } else { 5544 VERIFY0(dmu_assign_arcbuf_by_dbuf(bonus_db, 5545 off, bigbuf_arcbufs[2 * j], tx)); 5546 VERIFY0(dmu_assign_arcbuf_by_dbuf(bonus_db, 5547 off + chunksize / 2, 5548 bigbuf_arcbufs[2 * j + 1], tx)); 5549 } 5550 if (i == 1) { 5551 dmu_buf_rele(dbt, FTAG); 5552 } 5553 } 5554 dmu_tx_commit(tx); 5555 5556 /* 5557 * Sanity check the stuff we just wrote. 5558 */ 5559 { 5560 void *packcheck = umem_alloc(packsize, UMEM_NOFAIL); 5561 void *bigcheck = umem_alloc(bigsize, UMEM_NOFAIL); 5562 5563 VERIFY0(dmu_read(os, packobj, packoff, 5564 packsize, packcheck, dmu_read_flags)); 5565 VERIFY0(dmu_read(os, bigobj, bigoff, 5566 bigsize, bigcheck, dmu_read_flags)); 5567 5568 ASSERT0(memcmp(packbuf, packcheck, packsize)); 5569 ASSERT0(memcmp(bigbuf, bigcheck, bigsize)); 5570 5571 umem_free(packcheck, packsize); 5572 umem_free(bigcheck, bigsize); 5573 } 5574 if (i == 2) { 5575 txg_wait_open(dmu_objset_pool(os), 0, B_TRUE); 5576 } else if (i == 3) { 5577 txg_wait_synced(dmu_objset_pool(os), 0); 5578 } 5579 } 5580 5581 dmu_buf_rele(bonus_db, FTAG); 5582 umem_free(packbuf, packsize); 5583 umem_free(bigbuf, bigsize); 5584 umem_free(bigbuf_arcbufs, 2 * s * sizeof (arc_buf_t *)); 5585 umem_free(od, size); 5586 } 5587 5588 void 5589 ztest_dmu_write_parallel(ztest_ds_t *zd, uint64_t id) 5590 { 5591 (void) id; 5592 ztest_od_t *od; 5593 5594 od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL); 5595 uint64_t offset = (1ULL << (ztest_random(20) + 43)) + 5596 (ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT); 5597 5598 /* 5599 * Have multiple threads write to large offsets in an object 5600 * to verify that parallel writes to an object -- even to the 5601 * same blocks within the object -- doesn't cause any trouble. 
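 * Using ID_PARALLEL below makes every thread operating on this
 * dataset resolve to the same object, so the writes genuinely contend
 * for the same dnode and blocks.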
5602 */ 5603 ztest_od_init(od, ID_PARALLEL, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0, 0); 5604 5605 if (ztest_object_init(zd, od, sizeof (ztest_od_t), B_FALSE) != 0) 5606 return; 5607 5608 while (ztest_random(10) != 0) 5609 ztest_io(zd, od->od_object, offset); 5610 5611 umem_free(od, sizeof (ztest_od_t)); 5612 } 5613 5614 void 5615 ztest_dmu_prealloc(ztest_ds_t *zd, uint64_t id) 5616 { 5617 ztest_od_t *od; 5618 uint64_t offset = (1ULL << (ztest_random(4) + SPA_MAXBLOCKSHIFT)) + 5619 (ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT); 5620 uint64_t count = ztest_random(20) + 1; 5621 uint64_t blocksize = ztest_random_blocksize(); 5622 void *data; 5623 5624 od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL); 5625 5626 ztest_od_init(od, id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, 0, 0); 5627 5628 if (ztest_object_init(zd, od, sizeof (ztest_od_t), 5629 !ztest_random(2)) != 0) { 5630 umem_free(od, sizeof (ztest_od_t)); 5631 return; 5632 } 5633 5634 if (ztest_truncate(zd, od->od_object, offset, count * blocksize) != 0) { 5635 umem_free(od, sizeof (ztest_od_t)); 5636 return; 5637 } 5638 5639 ztest_prealloc(zd, od->od_object, offset, count * blocksize); 5640 5641 data = umem_zalloc(blocksize, UMEM_NOFAIL); 5642 5643 while (ztest_random(count) != 0) { 5644 uint64_t randoff = offset + (ztest_random(count) * blocksize); 5645 if (ztest_write(zd, od->od_object, randoff, blocksize, 5646 data) != 0) 5647 break; 5648 while (ztest_random(4) != 0) 5649 ztest_io(zd, od->od_object, randoff); 5650 } 5651 5652 umem_free(data, blocksize); 5653 umem_free(od, sizeof (ztest_od_t)); 5654 } 5655 5656 /* 5657 * Verify that zap_{create,destroy,add,remove,update} work as expected. 5658 */ 5659 #define ZTEST_ZAP_MIN_INTS 1 5660 #define ZTEST_ZAP_MAX_INTS 4 5661 #define ZTEST_ZAP_MAX_PROPS 1000 5662 5663 void 5664 ztest_zap(ztest_ds_t *zd, uint64_t id) 5665 { 5666 objset_t *os = zd->zd_os; 5667 ztest_od_t *od; 5668 uint64_t object; 5669 uint64_t txg, last_txg; 5670 uint64_t value[ZTEST_ZAP_MAX_INTS]; 5671 uint64_t zl_ints, zl_intsize, prop; 5672 int i, ints; 5673 dmu_tx_t *tx; 5674 char propname[100], txgname[100]; 5675 int error; 5676 const char *const hc[2] = { "s.acl.h", ".s.open.h.hyLZlg" }; 5677 5678 od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL); 5679 ztest_od_init(od, id, FTAG, 0, DMU_OT_ZAP_OTHER, 0, 0, 0); 5680 5681 if (ztest_object_init(zd, od, sizeof (ztest_od_t), 5682 !ztest_random(2)) != 0) 5683 goto out; 5684 5685 object = od->od_object; 5686 5687 /* 5688 * Generate a known hash collision, and verify that 5689 * we can lookup and remove both entries. 5690 */ 5691 tx = dmu_tx_create(os); 5692 dmu_tx_hold_zap(tx, object, B_TRUE, NULL); 5693 txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); 5694 if (txg == 0) 5695 goto out; 5696 for (i = 0; i < 2; i++) { 5697 value[i] = i; 5698 VERIFY0(zap_add(os, object, hc[i], sizeof (uint64_t), 5699 1, &value[i], tx)); 5700 } 5701 for (i = 0; i < 2; i++) { 5702 VERIFY3U(EEXIST, ==, zap_add(os, object, hc[i], 5703 sizeof (uint64_t), 1, &value[i], tx)); 5704 VERIFY0( 5705 zap_length(os, object, hc[i], &zl_intsize, &zl_ints)); 5706 ASSERT3U(zl_intsize, ==, sizeof (uint64_t)); 5707 ASSERT3U(zl_ints, ==, 1); 5708 } 5709 for (i = 0; i < 2; i++) { 5710 VERIFY0(zap_remove(os, object, hc[i], tx)); 5711 } 5712 dmu_tx_commit(tx); 5713 5714 /* 5715 * Generate a bunch of random entries. 
5716 */ 5717 ints = MAX(ZTEST_ZAP_MIN_INTS, object % ZTEST_ZAP_MAX_INTS); 5718 5719 prop = ztest_random(ZTEST_ZAP_MAX_PROPS); 5720 (void) sprintf(propname, "prop_%"PRIu64"", prop); 5721 (void) sprintf(txgname, "txg_%"PRIu64"", prop); 5722 memset(value, 0, sizeof (value)); 5723 last_txg = 0; 5724 5725 /* 5726 * If these zap entries already exist, validate their contents. 5727 */ 5728 error = zap_length(os, object, txgname, &zl_intsize, &zl_ints); 5729 if (error == 0) { 5730 ASSERT3U(zl_intsize, ==, sizeof (uint64_t)); 5731 ASSERT3U(zl_ints, ==, 1); 5732 5733 VERIFY0(zap_lookup(os, object, txgname, zl_intsize, 5734 zl_ints, &last_txg)); 5735 5736 VERIFY0(zap_length(os, object, propname, &zl_intsize, 5737 &zl_ints)); 5738 5739 ASSERT3U(zl_intsize, ==, sizeof (uint64_t)); 5740 ASSERT3U(zl_ints, ==, ints); 5741 5742 VERIFY0(zap_lookup(os, object, propname, zl_intsize, 5743 zl_ints, value)); 5744 5745 for (i = 0; i < ints; i++) { 5746 ASSERT3U(value[i], ==, last_txg + object + i); 5747 } 5748 } else { 5749 ASSERT3U(error, ==, ENOENT); 5750 } 5751 5752 /* 5753 * Atomically update two entries in our zap object. 5754 * The first is named txg_%llu, and contains the txg 5755 * in which the property was last updated. The second 5756 * is named prop_%llu, and the nth element of its value 5757 * should be txg + object + n. 5758 */ 5759 tx = dmu_tx_create(os); 5760 dmu_tx_hold_zap(tx, object, B_TRUE, NULL); 5761 txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); 5762 if (txg == 0) 5763 goto out; 5764 5765 if (last_txg > txg) 5766 fatal(B_FALSE, "zap future leak: old %"PRIu64" new %"PRIu64"", 5767 last_txg, txg); 5768 5769 for (i = 0; i < ints; i++) 5770 value[i] = txg + object + i; 5771 5772 VERIFY0(zap_update(os, object, txgname, sizeof (uint64_t), 5773 1, &txg, tx)); 5774 VERIFY0(zap_update(os, object, propname, sizeof (uint64_t), 5775 ints, value, tx)); 5776 5777 dmu_tx_commit(tx); 5778 5779 /* 5780 * Remove a random pair of entries. 5781 */ 5782 prop = ztest_random(ZTEST_ZAP_MAX_PROPS); 5783 (void) sprintf(propname, "prop_%"PRIu64"", prop); 5784 (void) sprintf(txgname, "txg_%"PRIu64"", prop); 5785 5786 error = zap_length(os, object, txgname, &zl_intsize, &zl_ints); 5787 5788 if (error == ENOENT) 5789 goto out; 5790 5791 ASSERT0(error); 5792 5793 tx = dmu_tx_create(os); 5794 dmu_tx_hold_zap(tx, object, B_TRUE, NULL); 5795 txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); 5796 if (txg == 0) 5797 goto out; 5798 VERIFY0(zap_remove(os, object, txgname, tx)); 5799 VERIFY0(zap_remove(os, object, propname, tx)); 5800 dmu_tx_commit(tx); 5801 out: 5802 umem_free(od, sizeof (ztest_od_t)); 5803 } 5804 5805 /* 5806 * Test case to test the upgrading of a microzap to fatzap. 5807 */ 5808 void 5809 ztest_fzap(ztest_ds_t *zd, uint64_t id) 5810 { 5811 objset_t *os = zd->zd_os; 5812 ztest_od_t *od; 5813 uint64_t object, txg, value; 5814 5815 od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL); 5816 ztest_od_init(od, id, FTAG, 0, DMU_OT_ZAP_OTHER, 0, 0, 0); 5817 5818 if (ztest_object_init(zd, od, sizeof (ztest_od_t), 5819 !ztest_random(2)) != 0) 5820 goto out; 5821 object = od->od_object; 5822 5823 /* 5824 * Add entries to this ZAP and make sure it spills over 5825 * and gets upgraded to a fatzap. Also, since we are adding 5826 * 2050 entries we should see ptrtbl growth and leaf-block split. 
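 * A microzap is confined to a single block and cannot hold this many
 * entries, so inserting 2050 of them forces the upgrade.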
5827 */ 5828 for (value = 0; value < 2050; value++) { 5829 char name[ZFS_MAX_DATASET_NAME_LEN]; 5830 dmu_tx_t *tx; 5831 int error; 5832 5833 (void) snprintf(name, sizeof (name), "fzap-%"PRIu64"-%"PRIu64"", 5834 id, value); 5835 5836 tx = dmu_tx_create(os); 5837 dmu_tx_hold_zap(tx, object, B_TRUE, name); 5838 txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); 5839 if (txg == 0) 5840 goto out; 5841 error = zap_add(os, object, name, sizeof (uint64_t), 1, 5842 &value, tx); 5843 ASSERT(error == 0 || error == EEXIST); 5844 dmu_tx_commit(tx); 5845 } 5846 out: 5847 umem_free(od, sizeof (ztest_od_t)); 5848 } 5849 5850 void 5851 ztest_zap_parallel(ztest_ds_t *zd, uint64_t id) 5852 { 5853 (void) id; 5854 objset_t *os = zd->zd_os; 5855 ztest_od_t *od; 5856 uint64_t txg, object, count, wsize, wc, zl_wsize, zl_wc; 5857 dmu_tx_t *tx; 5858 int i, namelen, error; 5859 int micro = ztest_random(2); 5860 char name[20], string_value[20]; 5861 void *data; 5862 5863 od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL); 5864 ztest_od_init(od, ID_PARALLEL, FTAG, micro, DMU_OT_ZAP_OTHER, 0, 0, 0); 5865 5866 if (ztest_object_init(zd, od, sizeof (ztest_od_t), B_FALSE) != 0) { 5867 umem_free(od, sizeof (ztest_od_t)); 5868 return; 5869 } 5870 5871 object = od->od_object; 5872 5873 /* 5874 * Generate a random name of the form 'xxx.....' where each 5875 * x is a random printable character and the dots are dots. 5876 * There are 94 such characters, and the name length goes from 5877 * 6 to 20, so there are 94^3 * 15 = 12,458,760 possible names. 5878 */ 5879 namelen = ztest_random(sizeof (name) - 5) + 5 + 1; 5880 5881 for (i = 0; i < 3; i++) 5882 name[i] = '!' + ztest_random('~' - '!' + 1); 5883 for (; i < namelen - 1; i++) 5884 name[i] = '.'; 5885 name[i] = '\0'; 5886 5887 if ((namelen & 1) || micro) { 5888 wsize = sizeof (txg); 5889 wc = 1; 5890 data = &txg; 5891 } else { 5892 wsize = 1; 5893 wc = namelen; 5894 data = string_value; 5895 } 5896 5897 count = -1ULL; 5898 VERIFY0(zap_count(os, object, &count)); 5899 ASSERT3S(count, !=, -1ULL); 5900 5901 /* 5902 * Select an operation: length, lookup, add, update, remove. 
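 * Operations 0 and 1 (length, lookup) are read-only; operations 2, 3
 * and 4 (add, update, remove) modify the ZAP and therefore need a
 * transaction, which is why a tx is only assigned when i >= 2 below.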
5903 */ 5904 i = ztest_random(5); 5905 5906 if (i >= 2) { 5907 tx = dmu_tx_create(os); 5908 dmu_tx_hold_zap(tx, object, B_TRUE, NULL); 5909 txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); 5910 if (txg == 0) { 5911 umem_free(od, sizeof (ztest_od_t)); 5912 return; 5913 } 5914 memcpy(string_value, name, namelen); 5915 } else { 5916 tx = NULL; 5917 txg = 0; 5918 memset(string_value, 0, namelen); 5919 } 5920 5921 switch (i) { 5922 5923 case 0: 5924 error = zap_length(os, object, name, &zl_wsize, &zl_wc); 5925 if (error == 0) { 5926 ASSERT3U(wsize, ==, zl_wsize); 5927 ASSERT3U(wc, ==, zl_wc); 5928 } else { 5929 ASSERT3U(error, ==, ENOENT); 5930 } 5931 break; 5932 5933 case 1: 5934 error = zap_lookup(os, object, name, wsize, wc, data); 5935 if (error == 0) { 5936 if (data == string_value && 5937 memcmp(name, data, namelen) != 0) 5938 fatal(B_FALSE, "name '%s' != val '%s' len %d", 5939 name, (char *)data, namelen); 5940 } else { 5941 ASSERT3U(error, ==, ENOENT); 5942 } 5943 break; 5944 5945 case 2: 5946 error = zap_add(os, object, name, wsize, wc, data, tx); 5947 ASSERT(error == 0 || error == EEXIST); 5948 break; 5949 5950 case 3: 5951 VERIFY0(zap_update(os, object, name, wsize, wc, data, tx)); 5952 break; 5953 5954 case 4: 5955 error = zap_remove(os, object, name, tx); 5956 ASSERT(error == 0 || error == ENOENT); 5957 break; 5958 } 5959 5960 if (tx != NULL) 5961 dmu_tx_commit(tx); 5962 5963 umem_free(od, sizeof (ztest_od_t)); 5964 } 5965 5966 /* 5967 * Commit callback data. 5968 */ 5969 typedef struct ztest_cb_data { 5970 list_node_t zcd_node; 5971 uint64_t zcd_txg; 5972 int zcd_expected_err; 5973 boolean_t zcd_added; 5974 boolean_t zcd_called; 5975 spa_t *zcd_spa; 5976 } ztest_cb_data_t; 5977 5978 /* This is the actual commit callback function */ 5979 static void 5980 ztest_commit_callback(void *arg, int error) 5981 { 5982 ztest_cb_data_t *data = arg; 5983 uint64_t synced_txg; 5984 5985 VERIFY3P(data, !=, NULL); 5986 VERIFY3S(data->zcd_expected_err, ==, error); 5987 VERIFY(!data->zcd_called); 5988 5989 synced_txg = spa_last_synced_txg(data->zcd_spa); 5990 if (data->zcd_txg > synced_txg) 5991 fatal(B_FALSE, 5992 "commit callback of txg %"PRIu64" called prematurely, " 5993 "last synced txg = %"PRIu64"\n", 5994 data->zcd_txg, synced_txg); 5995 5996 data->zcd_called = B_TRUE; 5997 5998 if (error == ECANCELED) { 5999 ASSERT0(data->zcd_txg); 6000 ASSERT(!data->zcd_added); 6001 6002 /* 6003 * The private callback data should be destroyed here, but 6004 * since we are going to check the zcd_called field after 6005 * dmu_tx_abort(), we will destroy it there. 
		 */
		return;
	}

	ASSERT(data->zcd_added);
	ASSERT3U(data->zcd_txg, !=, 0);

	(void) mutex_enter(&zcl.zcl_callbacks_lock);

	/* See if this cb was called more quickly */
	if ((synced_txg - data->zcd_txg) < zc_min_txg_delay)
		zc_min_txg_delay = synced_txg - data->zcd_txg;

	/* Remove our callback from the list */
	list_remove(&zcl.zcl_callbacks, data);

	(void) mutex_exit(&zcl.zcl_callbacks_lock);

	umem_free(data, sizeof (ztest_cb_data_t));
}

/* Allocate and initialize callback data structure */
static ztest_cb_data_t *
ztest_create_cb_data(objset_t *os, uint64_t txg)
{
	ztest_cb_data_t *cb_data;

	cb_data = umem_zalloc(sizeof (ztest_cb_data_t), UMEM_NOFAIL);

	cb_data->zcd_txg = txg;
	cb_data->zcd_spa = dmu_objset_spa(os);
	list_link_init(&cb_data->zcd_node);

	return (cb_data);
}

/*
 * Commit callback test.
 */
void
ztest_dmu_commit_callbacks(ztest_ds_t *zd, uint64_t id)
{
	objset_t *os = zd->zd_os;
	ztest_od_t *od;
	dmu_tx_t *tx;
	ztest_cb_data_t *cb_data[3], *tmp_cb;
	uint64_t old_txg, txg;
	int i, error = 0;

	od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL);
	ztest_od_init(od, id, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0, 0);

	if (ztest_object_init(zd, od, sizeof (ztest_od_t), B_FALSE) != 0) {
		umem_free(od, sizeof (ztest_od_t));
		return;
	}

	tx = dmu_tx_create(os);

	cb_data[0] = ztest_create_cb_data(os, 0);
	dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[0]);

	dmu_tx_hold_write(tx, od->od_object, 0, sizeof (uint64_t));

	/* Every once in a while, abort the transaction on purpose */
	if (ztest_random(100) == 0)
		error = -1;

	if (!error)
		error = dmu_tx_assign(tx, TXG_NOWAIT);

	txg = error ? 0 : dmu_tx_get_txg(tx);

	cb_data[0]->zcd_txg = txg;
	cb_data[1] = ztest_create_cb_data(os, txg);
	dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[1]);

	if (error) {
		/*
		 * It's not a strict requirement to call the registered
		 * callbacks from inside dmu_tx_abort(), but that's what
		 * is supposed to happen in the current implementation,
		 * so we will check for that.
		 */
		for (i = 0; i < 2; i++) {
			cb_data[i]->zcd_expected_err = ECANCELED;
			VERIFY(!cb_data[i]->zcd_called);
		}

		dmu_tx_abort(tx);

		for (i = 0; i < 2; i++) {
			VERIFY(cb_data[i]->zcd_called);
			umem_free(cb_data[i], sizeof (ztest_cb_data_t));
		}

		umem_free(od, sizeof (ztest_od_t));
		return;
	}

	cb_data[2] = ztest_create_cb_data(os, txg);
	dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[2]);

	/*
	 * Read existing data to make sure there isn't a future leak.
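	 *
	 * The value written below is always the txg of the assigning
	 * transaction, so whatever we read back here must be less than or
	 * equal to the currently open txg.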
	 */
	VERIFY0(dmu_read(os, od->od_object, 0, sizeof (uint64_t),
	    &old_txg, DMU_READ_PREFETCH));

	if (old_txg > txg)
		fatal(B_FALSE,
		    "future leak: got %"PRIu64", open txg is %"PRIu64"",
		    old_txg, txg);

	dmu_write(os, od->od_object, 0, sizeof (uint64_t), &txg, tx);

	(void) mutex_enter(&zcl.zcl_callbacks_lock);

	/*
	 * Since commit callbacks don't have any ordering requirement and since
	 * it is theoretically possible for a commit callback to be called
	 * after an arbitrary amount of time has elapsed since its txg has been
	 * synced, it is difficult to reliably determine whether a commit
	 * callback hasn't been called due to high load or due to a flawed
	 * implementation.
	 *
	 * In practice, we will assume that if after a certain number of txgs a
	 * commit callback hasn't been called, then most likely there's an
	 * implementation bug.
	 */
	tmp_cb = list_head(&zcl.zcl_callbacks);
	if (tmp_cb != NULL &&
	    tmp_cb->zcd_txg + ZTEST_COMMIT_CB_THRESH < txg) {
		fatal(B_FALSE,
		    "Commit callback threshold exceeded, "
		    "oldest txg: %"PRIu64", open txg: %"PRIu64"\n",
		    tmp_cb->zcd_txg, txg);
	}

	/*
	 * Let's find the place to insert our callbacks.
	 *
	 * Even though the list is ordered by txg, it is possible for the
	 * insertion point to not be the end because our txg may already be
	 * quiescing at this point and other callbacks in the open txg
	 * (from other objsets) may have sneaked in.
	 */
	tmp_cb = list_tail(&zcl.zcl_callbacks);
	while (tmp_cb != NULL && tmp_cb->zcd_txg > txg)
		tmp_cb = list_prev(&zcl.zcl_callbacks, tmp_cb);

	/* Add the 3 callbacks to the list */
	for (i = 0; i < 3; i++) {
		if (tmp_cb == NULL)
			list_insert_head(&zcl.zcl_callbacks, cb_data[i]);
		else
			list_insert_after(&zcl.zcl_callbacks, tmp_cb,
			    cb_data[i]);

		cb_data[i]->zcd_added = B_TRUE;
		VERIFY(!cb_data[i]->zcd_called);

		tmp_cb = cb_data[i];
	}

	zc_cb_counter += 3;

	(void) mutex_exit(&zcl.zcl_callbacks_lock);

	dmu_tx_commit(tx);

	umem_free(od, sizeof (ztest_od_t));
}

/*
 * Visit each object in the dataset. Verify that its properties
 * are consistent with what was stored in the block tag when it was created,
 * and that its unused bonus buffer space has not been overwritten.
6184 */ 6185 void 6186 ztest_verify_dnode_bt(ztest_ds_t *zd, uint64_t id) 6187 { 6188 (void) id; 6189 objset_t *os = zd->zd_os; 6190 uint64_t obj; 6191 int err = 0; 6192 6193 for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE, 0)) { 6194 ztest_block_tag_t *bt = NULL; 6195 dmu_object_info_t doi; 6196 dmu_buf_t *db; 6197 6198 ztest_object_lock(zd, obj, ZTRL_READER); 6199 if (dmu_bonus_hold(os, obj, FTAG, &db) != 0) { 6200 ztest_object_unlock(zd, obj); 6201 continue; 6202 } 6203 6204 dmu_object_info_from_db(db, &doi); 6205 if (doi.doi_bonus_size >= sizeof (*bt)) 6206 bt = ztest_bt_bonus(db); 6207 6208 if (bt && bt->bt_magic == BT_MAGIC) { 6209 ztest_bt_verify(bt, os, obj, doi.doi_dnodesize, 6210 bt->bt_offset, bt->bt_gen, bt->bt_txg, 6211 bt->bt_crtxg); 6212 ztest_verify_unused_bonus(db, bt, obj, os, bt->bt_gen); 6213 } 6214 6215 dmu_buf_rele(db, FTAG); 6216 ztest_object_unlock(zd, obj); 6217 } 6218 } 6219 6220 void 6221 ztest_dsl_prop_get_set(ztest_ds_t *zd, uint64_t id) 6222 { 6223 (void) id; 6224 zfs_prop_t proplist[] = { 6225 ZFS_PROP_CHECKSUM, 6226 ZFS_PROP_COMPRESSION, 6227 ZFS_PROP_COPIES, 6228 ZFS_PROP_DEDUP 6229 }; 6230 6231 (void) pthread_rwlock_rdlock(&ztest_name_lock); 6232 6233 for (int p = 0; p < sizeof (proplist) / sizeof (proplist[0]); p++) { 6234 int error = ztest_dsl_prop_set_uint64(zd->zd_name, proplist[p], 6235 ztest_random_dsl_prop(proplist[p]), (int)ztest_random(2)); 6236 ASSERT(error == 0 || error == ENOSPC); 6237 } 6238 6239 int error = ztest_dsl_prop_set_uint64(zd->zd_name, ZFS_PROP_RECORDSIZE, 6240 ztest_random_blocksize(), (int)ztest_random(2)); 6241 ASSERT(error == 0 || error == ENOSPC); 6242 6243 (void) pthread_rwlock_unlock(&ztest_name_lock); 6244 } 6245 6246 void 6247 ztest_spa_prop_get_set(ztest_ds_t *zd, uint64_t id) 6248 { 6249 (void) zd, (void) id; 6250 6251 (void) pthread_rwlock_rdlock(&ztest_name_lock); 6252 6253 (void) ztest_spa_prop_set_uint64(ZPOOL_PROP_AUTOTRIM, ztest_random(2)); 6254 6255 nvlist_t *props = fnvlist_alloc(); 6256 6257 VERIFY0(spa_prop_get(ztest_spa, props)); 6258 6259 if (ztest_opts.zo_verbose >= 6) 6260 dump_nvlist(props, 4); 6261 6262 fnvlist_free(props); 6263 6264 (void) pthread_rwlock_unlock(&ztest_name_lock); 6265 } 6266 6267 static int 6268 user_release_one(const char *snapname, const char *holdname) 6269 { 6270 nvlist_t *snaps, *holds; 6271 int error; 6272 6273 snaps = fnvlist_alloc(); 6274 holds = fnvlist_alloc(); 6275 fnvlist_add_boolean(holds, holdname); 6276 fnvlist_add_nvlist(snaps, snapname, holds); 6277 fnvlist_free(holds); 6278 error = dsl_dataset_user_release(snaps, NULL); 6279 fnvlist_free(snaps); 6280 return (error); 6281 } 6282 6283 /* 6284 * Test snapshot hold/release and deferred destroy. 6285 */ 6286 void 6287 ztest_dmu_snapshot_hold(ztest_ds_t *zd, uint64_t id) 6288 { 6289 int error; 6290 objset_t *os = zd->zd_os; 6291 objset_t *origin; 6292 char snapname[100]; 6293 char fullname[100]; 6294 char clonename[100]; 6295 char tag[100]; 6296 char osname[ZFS_MAX_DATASET_NAME_LEN]; 6297 nvlist_t *holds; 6298 6299 (void) pthread_rwlock_rdlock(&ztest_name_lock); 6300 6301 dmu_objset_name(os, osname); 6302 6303 (void) snprintf(snapname, sizeof (snapname), "sh1_%"PRIu64"", id); 6304 (void) snprintf(fullname, sizeof (fullname), "%s@%s", osname, snapname); 6305 (void) snprintf(clonename, sizeof (clonename), "%s/ch1_%"PRIu64"", 6306 osname, id); 6307 (void) snprintf(tag, sizeof (tag), "tag_%"PRIu64"", id); 6308 6309 /* 6310 * Clean up from any previous run. 
6311 */ 6312 error = dsl_destroy_head(clonename); 6313 if (error != ENOENT) 6314 ASSERT0(error); 6315 error = user_release_one(fullname, tag); 6316 if (error != ESRCH && error != ENOENT) 6317 ASSERT0(error); 6318 error = dsl_destroy_snapshot(fullname, B_FALSE); 6319 if (error != ENOENT) 6320 ASSERT0(error); 6321 6322 /* 6323 * Create snapshot, clone it, mark snap for deferred destroy, 6324 * destroy clone, verify snap was also destroyed. 6325 */ 6326 error = dmu_objset_snapshot_one(osname, snapname); 6327 if (error) { 6328 if (error == ENOSPC) { 6329 ztest_record_enospc("dmu_objset_snapshot"); 6330 goto out; 6331 } 6332 fatal(B_FALSE, "dmu_objset_snapshot(%s) = %d", fullname, error); 6333 } 6334 6335 error = dmu_objset_clone(clonename, fullname); 6336 if (error) { 6337 if (error == ENOSPC) { 6338 ztest_record_enospc("dmu_objset_clone"); 6339 goto out; 6340 } 6341 fatal(B_FALSE, "dmu_objset_clone(%s) = %d", clonename, error); 6342 } 6343 6344 error = dsl_destroy_snapshot(fullname, B_TRUE); 6345 if (error) { 6346 fatal(B_FALSE, "dsl_destroy_snapshot(%s, B_TRUE) = %d", 6347 fullname, error); 6348 } 6349 6350 error = dsl_destroy_head(clonename); 6351 if (error) 6352 fatal(B_FALSE, "dsl_destroy_head(%s) = %d", clonename, error); 6353 6354 error = dmu_objset_hold(fullname, FTAG, &origin); 6355 if (error != ENOENT) 6356 fatal(B_FALSE, "dmu_objset_hold(%s) = %d", fullname, error); 6357 6358 /* 6359 * Create snapshot, add temporary hold, verify that we can't 6360 * destroy a held snapshot, mark for deferred destroy, 6361 * release hold, verify snapshot was destroyed. 6362 */ 6363 error = dmu_objset_snapshot_one(osname, snapname); 6364 if (error) { 6365 if (error == ENOSPC) { 6366 ztest_record_enospc("dmu_objset_snapshot"); 6367 goto out; 6368 } 6369 fatal(B_FALSE, "dmu_objset_snapshot(%s) = %d", fullname, error); 6370 } 6371 6372 holds = fnvlist_alloc(); 6373 fnvlist_add_string(holds, fullname, tag); 6374 error = dsl_dataset_user_hold(holds, 0, NULL); 6375 fnvlist_free(holds); 6376 6377 if (error == ENOSPC) { 6378 ztest_record_enospc("dsl_dataset_user_hold"); 6379 goto out; 6380 } else if (error) { 6381 fatal(B_FALSE, "dsl_dataset_user_hold(%s, %s) = %u", 6382 fullname, tag, error); 6383 } 6384 6385 error = dsl_destroy_snapshot(fullname, B_FALSE); 6386 if (error != EBUSY) { 6387 fatal(B_FALSE, "dsl_destroy_snapshot(%s, B_FALSE) = %d", 6388 fullname, error); 6389 } 6390 6391 error = dsl_destroy_snapshot(fullname, B_TRUE); 6392 if (error) { 6393 fatal(B_FALSE, "dsl_destroy_snapshot(%s, B_TRUE) = %d", 6394 fullname, error); 6395 } 6396 6397 error = user_release_one(fullname, tag); 6398 if (error) 6399 fatal(B_FALSE, "user_release_one(%s, %s) = %d", 6400 fullname, tag, error); 6401 6402 VERIFY3U(dmu_objset_hold(fullname, FTAG, &origin), ==, ENOENT); 6403 6404 out: 6405 (void) pthread_rwlock_unlock(&ztest_name_lock); 6406 } 6407 6408 /* 6409 * Inject random faults into the on-disk data. 
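 *
 * Broadly, three kinds of damage are simulated below: closing a leaf
 * vdev's backing file descriptor, flagging a leaf as unreadable or
 * unwritable (vdev_cant_read/vdev_cant_write), and overwriting random
 * words of a chosen leaf (or an l2cache device) with a bad pattern.
 * In addition, the chosen device may be randomly taken offline and
 * brought back online.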
 */
void
ztest_fault_inject(ztest_ds_t *zd, uint64_t id)
{
	(void) zd, (void) id;
	ztest_shared_t *zs = ztest_shared;
	spa_t *spa = ztest_spa;
	int fd;
	uint64_t offset;
	uint64_t leaves;
	uint64_t bad = 0x1990c0ffeedecadeull;
	uint64_t top, leaf;
	uint64_t raidz_children;
	char *path0;
	char *pathrand;
	size_t fsize;
	int bshift = SPA_MAXBLOCKSHIFT + 2;
	int iters = 1000;
	int maxfaults;
	int mirror_save;
	vdev_t *vd0 = NULL;
	uint64_t guid0 = 0;
	boolean_t islog = B_FALSE;
	boolean_t injected = B_FALSE;

	path0 = umem_alloc(MAXPATHLEN, UMEM_NOFAIL);
	pathrand = umem_alloc(MAXPATHLEN, UMEM_NOFAIL);

	mutex_enter(&ztest_vdev_lock);

	/*
	 * While device removal is in progress, fault injection must be
	 * disabled until it completes and the pool is scrubbed.  The fault
	 * injection strategy for damaging blocks does not take into account
	 * evacuated blocks which may have already been damaged.
	 */
	if (ztest_device_removal_active)
		goto out;

	/*
	 * The fault injection strategy for damaging blocks cannot be used
	 * if raidz expansion is in progress. The leaves value
	 * (attached raidz children) is variable and the strategy for damaging
	 * blocks will corrupt the same data blocks on different child vdevs
	 * because of the reflow process.
	 */
	if (spa->spa_raidz_expand != NULL)
		goto out;

	maxfaults = MAXFAULTS(zs);
	raidz_children = ztest_get_raidz_children(spa);
	leaves = MAX(zs->zs_mirrors, 1) * raidz_children;
	mirror_save = zs->zs_mirrors;

	ASSERT3U(leaves, >=, 1);

	/*
	 * While ztest is running the number of leaves will not change. This
	 * is critical for the fault injection logic as it determines where
	 * errors can be safely injected such that they are always repairable.
	 *
	 * When restarting ztest a different number of leaves may be requested
	 * which will shift the regions to be damaged. This is fine as long
	 * as the pool has been scrubbed prior to using the new mapping.
	 * Failure to do so can result in non-repairable damage being injected.
	 */
	if (ztest_pool_scrubbed == B_FALSE)
		goto out;

	/*
	 * Grab the name lock as reader. There are some operations
	 * which don't like to have their vdevs changed while
	 * they are in progress (i.e. spa_change_guid). Those
	 * operations will have grabbed the name lock as writer.
	 */
	(void) pthread_rwlock_rdlock(&ztest_name_lock);

	/*
	 * We need SCL_STATE here because we're going to look at vd0->vdev_tsd.
	 */
	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);

	if (ztest_random(2) == 0) {
		/*
		 * Inject errors on a normal data device or slog device.
		 */
		top = ztest_random_vdev_top(spa, B_TRUE);
		leaf = ztest_random(leaves) + zs->zs_splits;

		/*
		 * Generate paths to the first leaf in this top-level vdev,
		 * and to the random leaf we selected.  We'll induce transient
		 * write failures and random online/offline activity on leaf 0,
		 * and we'll write random garbage to the randomly chosen leaf.
6504 */ 6505 (void) snprintf(path0, MAXPATHLEN, ztest_dev_template, 6506 ztest_opts.zo_dir, ztest_opts.zo_pool, 6507 top * leaves + zs->zs_splits); 6508 (void) snprintf(pathrand, MAXPATHLEN, ztest_dev_template, 6509 ztest_opts.zo_dir, ztest_opts.zo_pool, 6510 top * leaves + leaf); 6511 6512 vd0 = vdev_lookup_by_path(spa->spa_root_vdev, path0); 6513 if (vd0 != NULL && vd0->vdev_top->vdev_islog) 6514 islog = B_TRUE; 6515 6516 /* 6517 * If the top-level vdev needs to be resilvered 6518 * then we only allow faults on the device that is 6519 * resilvering. 6520 */ 6521 if (vd0 != NULL && maxfaults != 1 && 6522 (!vdev_resilver_needed(vd0->vdev_top, NULL, NULL) || 6523 vd0->vdev_resilver_txg != 0)) { 6524 /* 6525 * Make vd0 explicitly claim to be unreadable, 6526 * or unwritable, or reach behind its back 6527 * and close the underlying fd. We can do this if 6528 * maxfaults == 0 because we'll fail and reexecute, 6529 * and we can do it if maxfaults >= 2 because we'll 6530 * have enough redundancy. If maxfaults == 1, the 6531 * combination of this with injection of random data 6532 * corruption below exceeds the pool's fault tolerance. 6533 */ 6534 vdev_file_t *vf = vd0->vdev_tsd; 6535 6536 zfs_dbgmsg("injecting fault to vdev %llu; maxfaults=%d", 6537 (long long)vd0->vdev_id, (int)maxfaults); 6538 6539 if (vf != NULL && ztest_random(3) == 0) { 6540 (void) close(vf->vf_file->f_fd); 6541 vf->vf_file->f_fd = -1; 6542 } else if (ztest_random(2) == 0) { 6543 vd0->vdev_cant_read = B_TRUE; 6544 } else { 6545 vd0->vdev_cant_write = B_TRUE; 6546 } 6547 guid0 = vd0->vdev_guid; 6548 } 6549 } else { 6550 /* 6551 * Inject errors on an l2cache device. 6552 */ 6553 spa_aux_vdev_t *sav = &spa->spa_l2cache; 6554 6555 if (sav->sav_count == 0) { 6556 spa_config_exit(spa, SCL_STATE, FTAG); 6557 (void) pthread_rwlock_unlock(&ztest_name_lock); 6558 goto out; 6559 } 6560 vd0 = sav->sav_vdevs[ztest_random(sav->sav_count)]; 6561 guid0 = vd0->vdev_guid; 6562 (void) strlcpy(path0, vd0->vdev_path, MAXPATHLEN); 6563 (void) strlcpy(pathrand, vd0->vdev_path, MAXPATHLEN); 6564 6565 leaf = 0; 6566 leaves = 1; 6567 maxfaults = INT_MAX; /* no limit on cache devices */ 6568 } 6569 6570 spa_config_exit(spa, SCL_STATE, FTAG); 6571 (void) pthread_rwlock_unlock(&ztest_name_lock); 6572 6573 /* 6574 * If we can tolerate two or more faults, or we're dealing 6575 * with a slog, randomly online/offline vd0. 6576 */ 6577 if ((maxfaults >= 2 || islog) && guid0 != 0) { 6578 if (ztest_random(10) < 6) { 6579 int flags = (ztest_random(2) == 0 ? 6580 ZFS_OFFLINE_TEMPORARY : 0); 6581 6582 /* 6583 * We have to grab the zs_name_lock as writer to 6584 * prevent a race between offlining a slog and 6585 * destroying a dataset. Offlining the slog will 6586 * grab a reference on the dataset which may cause 6587 * dsl_destroy_head() to fail with EBUSY thus 6588 * leaving the dataset in an inconsistent state. 6589 */ 6590 if (islog) 6591 (void) pthread_rwlock_wrlock(&ztest_name_lock); 6592 6593 VERIFY3U(vdev_offline(spa, guid0, flags), !=, EBUSY); 6594 6595 if (islog) 6596 (void) pthread_rwlock_unlock(&ztest_name_lock); 6597 } else { 6598 /* 6599 * Ideally we would like to be able to randomly 6600 * call vdev_[on|off]line without holding locks 6601 * to force unpredictable failures but the side 6602 * effects of vdev_[on|off]line prevent us from 6603 * doing so. 
6604 */ 6605 (void) vdev_online(spa, guid0, 0, NULL); 6606 } 6607 } 6608 6609 if (maxfaults == 0) 6610 goto out; 6611 6612 /* 6613 * We have at least single-fault tolerance, so inject data corruption. 6614 */ 6615 fd = open(pathrand, O_RDWR); 6616 6617 if (fd == -1) /* we hit a gap in the device namespace */ 6618 goto out; 6619 6620 fsize = lseek(fd, 0, SEEK_END); 6621 6622 while (--iters != 0) { 6623 /* 6624 * The offset must be chosen carefully to ensure that 6625 * we do not inject a given logical block with errors 6626 * on two different leaf devices, because ZFS can not 6627 * tolerate that (if maxfaults==1). 6628 * 6629 * To achieve this we divide each leaf device into 6630 * chunks of size (# leaves * SPA_MAXBLOCKSIZE * 4). 6631 * Each chunk is further divided into error-injection 6632 * ranges (can accept errors) and clear ranges (we do 6633 * not inject errors in those). Each error-injection 6634 * range can accept errors only for a single leaf vdev. 6635 * Error-injection ranges are separated by clear ranges. 6636 * 6637 * For example, with 3 leaves, each chunk looks like: 6638 * 0 to 32M: injection range for leaf 0 6639 * 32M to 64M: clear range - no injection allowed 6640 * 64M to 96M: injection range for leaf 1 6641 * 96M to 128M: clear range - no injection allowed 6642 * 128M to 160M: injection range for leaf 2 6643 * 160M to 192M: clear range - no injection allowed 6644 * 6645 * Each clear range must be large enough such that a 6646 * single block cannot straddle it. This way a block 6647 * can't be a target in two different injection ranges 6648 * (on different leaf vdevs). 6649 */ 6650 offset = ztest_random(fsize / (leaves << bshift)) * 6651 (leaves << bshift) + (leaf << bshift) + 6652 (ztest_random(1ULL << (bshift - 1)) & -8ULL); 6653 6654 /* 6655 * Only allow damage to the labels at one end of the vdev. 6656 * 6657 * If all labels are damaged, the device will be totally 6658 * inaccessible, which will result in loss of data, 6659 * because we also damage (parts of) the other side of 6660 * the mirror/raidz. 6661 * 6662 * Additionally, we will always have both an even and an 6663 * odd label, so that we can handle crashes in the 6664 * middle of vdev_config_sync(). 6665 */ 6666 if ((leaf & 1) == 0 && offset < VDEV_LABEL_START_SIZE) 6667 continue; 6668 6669 /* 6670 * The two end labels are stored at the "end" of the disk, but 6671 * the end of the disk (vdev_psize) is aligned to 6672 * sizeof (vdev_label_t). 
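		 * Rounding fsize down to a multiple of sizeof (vdev_label_t)
		 * below approximates that aligned vdev_psize, so for odd
		 * leaves we skip any offset that would land in the last
		 * VDEV_LABEL_END_SIZE bytes of it, mirroring the
		 * VDEV_LABEL_START_SIZE check for even leaves above.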
		 */
		uint64_t psize = P2ALIGN_TYPED(fsize, sizeof (vdev_label_t),
		    uint64_t);
		if ((leaf & 1) == 1 &&
		    offset + sizeof (bad) > psize - VDEV_LABEL_END_SIZE)
			continue;

		if (mirror_save != zs->zs_mirrors) {
			(void) close(fd);
			goto out;
		}

		if (pwrite(fd, &bad, sizeof (bad), offset) != sizeof (bad))
			fatal(B_TRUE,
			    "can't inject bad word at 0x%"PRIx64" in %s",
			    offset, pathrand);

		if (ztest_opts.zo_verbose >= 7)
			(void) printf("injected bad word into %s,"
			    " offset 0x%"PRIx64"\n", pathrand, offset);

		injected = B_TRUE;
	}

	(void) close(fd);
out:
	mutex_exit(&ztest_vdev_lock);

	if (injected && ztest_opts.zo_raid_do_expand) {
		int error = spa_scan(spa, POOL_SCAN_SCRUB);
		if (error == 0) {
			while (dsl_scan_scrubbing(spa_get_dsl(spa)))
				txg_wait_synced(spa_get_dsl(spa), 0);
		}
	}

	umem_free(path0, MAXPATHLEN);
	umem_free(pathrand, MAXPATHLEN);
}

/*
 * By design ztest will never inject uncorrectable damage into the pool.
 * Issue a scrub, wait for it to complete, and verify there is never any
 * persistent damage.
 *
 * Only after a full scrub has been completed is it safe to start injecting
 * data corruption. See the comment in ztest_fault_inject().
 *
 * EBUSY may be returned for the following six cases. It's the caller's
 * responsibility to handle them accordingly.
 *
 * Current state			Requested
 *  1. Normal Scrub Running		Normal Scrub or Error Scrub
 *  2. Normal Scrub Paused		Error Scrub
 *  3. Normal Scrub Paused		Pause Normal Scrub
 *  4. Error Scrub Running		Normal Scrub or Error Scrub
 *  5. Error Scrub Paused		Pause Error Scrub
 *  6. Resilvering			Anything else
 */
static int
ztest_scrub_impl(spa_t *spa)
{
	int error = spa_scan(spa, POOL_SCAN_SCRUB);
	if (error)
		return (error);

	while (dsl_scan_scrubbing(spa_get_dsl(spa)))
		txg_wait_synced(spa_get_dsl(spa), 0);

	if (spa_approx_errlog_size(spa) > 0)
		return (ECKSUM);

	ztest_pool_scrubbed = B_TRUE;

	return (0);
}

/*
 * Scrub the pool.
 */
void
ztest_scrub(ztest_ds_t *zd, uint64_t id)
{
	(void) zd, (void) id;
	spa_t *spa = ztest_spa;
	int error;

	/*
	 * A scrub is already in progress due to device removal.
	 */
	if (ztest_device_removal_active)
		return;

	/*
	 * Start a scrub, wait a moment, then force a restart.
	 */
	(void) spa_scan(spa, POOL_SCAN_SCRUB);
	(void) poll(NULL, 0, 100);

	error = ztest_scrub_impl(spa);
	if (error == EBUSY)
		error = 0;
	ASSERT0(error);
}

/*
 * Change the guid for the pool.
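 *
 * On success the pool guid should change while the in-core load guid
 * stays the same; the VERIFYs at the end of the function check exactly
 * that.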
6780 */ 6781 void 6782 ztest_reguid(ztest_ds_t *zd, uint64_t id) 6783 { 6784 (void) zd, (void) id; 6785 spa_t *spa = ztest_spa; 6786 uint64_t orig, load; 6787 int error; 6788 ztest_shared_t *zs = ztest_shared; 6789 6790 if (ztest_opts.zo_mmp_test) 6791 return; 6792 6793 orig = spa_guid(spa); 6794 load = spa_load_guid(spa); 6795 6796 (void) pthread_rwlock_wrlock(&ztest_name_lock); 6797 error = spa_change_guid(spa, NULL); 6798 zs->zs_guid = spa_guid(spa); 6799 (void) pthread_rwlock_unlock(&ztest_name_lock); 6800 6801 if (error != 0) 6802 return; 6803 6804 if (ztest_opts.zo_verbose >= 4) { 6805 (void) printf("Changed guid old %"PRIu64" -> %"PRIu64"\n", 6806 orig, spa_guid(spa)); 6807 } 6808 6809 VERIFY3U(orig, !=, spa_guid(spa)); 6810 VERIFY3U(load, ==, spa_load_guid(spa)); 6811 } 6812 6813 void 6814 ztest_blake3(ztest_ds_t *zd, uint64_t id) 6815 { 6816 (void) zd, (void) id; 6817 hrtime_t end = gethrtime() + NANOSEC; 6818 zio_cksum_salt_t salt; 6819 void *salt_ptr = &salt.zcs_bytes; 6820 struct abd *abd_data, *abd_meta; 6821 void *buf, *templ; 6822 int i, *ptr; 6823 uint32_t size; 6824 BLAKE3_CTX ctx; 6825 const zfs_impl_t *blake3 = zfs_impl_get_ops("blake3"); 6826 6827 size = ztest_random_blocksize(); 6828 buf = umem_alloc(size, UMEM_NOFAIL); 6829 abd_data = abd_alloc(size, B_FALSE); 6830 abd_meta = abd_alloc(size, B_TRUE); 6831 6832 for (i = 0, ptr = buf; i < size / sizeof (*ptr); i++, ptr++) 6833 *ptr = ztest_random(UINT_MAX); 6834 memset(salt_ptr, 'A', 32); 6835 6836 abd_copy_from_buf_off(abd_data, buf, 0, size); 6837 abd_copy_from_buf_off(abd_meta, buf, 0, size); 6838 6839 while (gethrtime() <= end) { 6840 int run_count = 100; 6841 zio_cksum_t zc_ref1, zc_ref2; 6842 zio_cksum_t zc_res1, zc_res2; 6843 6844 void *ref1 = &zc_ref1; 6845 void *ref2 = &zc_ref2; 6846 void *res1 = &zc_res1; 6847 void *res2 = &zc_res2; 6848 6849 /* BLAKE3_KEY_LEN = 32 */ 6850 VERIFY0(blake3->setname("generic")); 6851 templ = abd_checksum_blake3_tmpl_init(&salt); 6852 Blake3_InitKeyed(&ctx, salt_ptr); 6853 Blake3_Update(&ctx, buf, size); 6854 Blake3_Final(&ctx, ref1); 6855 zc_ref2 = zc_ref1; 6856 ZIO_CHECKSUM_BSWAP(&zc_ref2); 6857 abd_checksum_blake3_tmpl_free(templ); 6858 6859 VERIFY0(blake3->setname("cycle")); 6860 while (run_count-- > 0) { 6861 6862 /* Test current implementation */ 6863 Blake3_InitKeyed(&ctx, salt_ptr); 6864 Blake3_Update(&ctx, buf, size); 6865 Blake3_Final(&ctx, res1); 6866 zc_res2 = zc_res1; 6867 ZIO_CHECKSUM_BSWAP(&zc_res2); 6868 6869 VERIFY0(memcmp(ref1, res1, 32)); 6870 VERIFY0(memcmp(ref2, res2, 32)); 6871 6872 /* Test ABD - data */ 6873 templ = abd_checksum_blake3_tmpl_init(&salt); 6874 abd_checksum_blake3_native(abd_data, size, 6875 templ, &zc_res1); 6876 abd_checksum_blake3_byteswap(abd_data, size, 6877 templ, &zc_res2); 6878 6879 VERIFY0(memcmp(ref1, res1, 32)); 6880 VERIFY0(memcmp(ref2, res2, 32)); 6881 6882 /* Test ABD - metadata */ 6883 abd_checksum_blake3_native(abd_meta, size, 6884 templ, &zc_res1); 6885 abd_checksum_blake3_byteswap(abd_meta, size, 6886 templ, &zc_res2); 6887 abd_checksum_blake3_tmpl_free(templ); 6888 6889 VERIFY0(memcmp(ref1, res1, 32)); 6890 VERIFY0(memcmp(ref2, res2, 32)); 6891 6892 } 6893 } 6894 6895 abd_free(abd_data); 6896 abd_free(abd_meta); 6897 umem_free(buf, size); 6898 } 6899 6900 void 6901 ztest_fletcher(ztest_ds_t *zd, uint64_t id) 6902 { 6903 (void) zd, (void) id; 6904 hrtime_t end = gethrtime() + NANOSEC; 6905 6906 while (gethrtime() <= end) { 6907 int run_count = 100; 6908 void *buf; 6909 struct abd *abd_data, *abd_meta; 6910 uint32_t size; 6911 
int *ptr; 6912 int i; 6913 zio_cksum_t zc_ref; 6914 zio_cksum_t zc_ref_byteswap; 6915 6916 size = ztest_random_blocksize(); 6917 6918 buf = umem_alloc(size, UMEM_NOFAIL); 6919 abd_data = abd_alloc(size, B_FALSE); 6920 abd_meta = abd_alloc(size, B_TRUE); 6921 6922 for (i = 0, ptr = buf; i < size / sizeof (*ptr); i++, ptr++) 6923 *ptr = ztest_random(UINT_MAX); 6924 6925 abd_copy_from_buf_off(abd_data, buf, 0, size); 6926 abd_copy_from_buf_off(abd_meta, buf, 0, size); 6927 6928 VERIFY0(fletcher_4_impl_set("scalar")); 6929 fletcher_4_native(buf, size, NULL, &zc_ref); 6930 fletcher_4_byteswap(buf, size, NULL, &zc_ref_byteswap); 6931 6932 VERIFY0(fletcher_4_impl_set("cycle")); 6933 while (run_count-- > 0) { 6934 zio_cksum_t zc; 6935 zio_cksum_t zc_byteswap; 6936 6937 fletcher_4_byteswap(buf, size, NULL, &zc_byteswap); 6938 fletcher_4_native(buf, size, NULL, &zc); 6939 6940 VERIFY0(memcmp(&zc, &zc_ref, sizeof (zc))); 6941 VERIFY0(memcmp(&zc_byteswap, &zc_ref_byteswap, 6942 sizeof (zc_byteswap))); 6943 6944 /* Test ABD - data */ 6945 abd_fletcher_4_byteswap(abd_data, size, NULL, 6946 &zc_byteswap); 6947 abd_fletcher_4_native(abd_data, size, NULL, &zc); 6948 6949 VERIFY0(memcmp(&zc, &zc_ref, sizeof (zc))); 6950 VERIFY0(memcmp(&zc_byteswap, &zc_ref_byteswap, 6951 sizeof (zc_byteswap))); 6952 6953 /* Test ABD - metadata */ 6954 abd_fletcher_4_byteswap(abd_meta, size, NULL, 6955 &zc_byteswap); 6956 abd_fletcher_4_native(abd_meta, size, NULL, &zc); 6957 6958 VERIFY0(memcmp(&zc, &zc_ref, sizeof (zc))); 6959 VERIFY0(memcmp(&zc_byteswap, &zc_ref_byteswap, 6960 sizeof (zc_byteswap))); 6961 6962 } 6963 6964 umem_free(buf, size); 6965 abd_free(abd_data); 6966 abd_free(abd_meta); 6967 } 6968 } 6969 6970 void 6971 ztest_fletcher_incr(ztest_ds_t *zd, uint64_t id) 6972 { 6973 (void) zd, (void) id; 6974 void *buf; 6975 size_t size; 6976 int *ptr; 6977 int i; 6978 zio_cksum_t zc_ref; 6979 zio_cksum_t zc_ref_bswap; 6980 6981 hrtime_t end = gethrtime() + NANOSEC; 6982 6983 while (gethrtime() <= end) { 6984 int run_count = 100; 6985 6986 size = ztest_random_blocksize(); 6987 buf = umem_alloc(size, UMEM_NOFAIL); 6988 6989 for (i = 0, ptr = buf; i < size / sizeof (*ptr); i++, ptr++) 6990 *ptr = ztest_random(UINT_MAX); 6991 6992 VERIFY0(fletcher_4_impl_set("scalar")); 6993 fletcher_4_native(buf, size, NULL, &zc_ref); 6994 fletcher_4_byteswap(buf, size, NULL, &zc_ref_bswap); 6995 6996 VERIFY0(fletcher_4_impl_set("cycle")); 6997 6998 while (run_count-- > 0) { 6999 zio_cksum_t zc; 7000 zio_cksum_t zc_bswap; 7001 size_t pos = 0; 7002 7003 ZIO_SET_CHECKSUM(&zc, 0, 0, 0, 0); 7004 ZIO_SET_CHECKSUM(&zc_bswap, 0, 0, 0, 0); 7005 7006 while (pos < size) { 7007 size_t inc = 64 * ztest_random(size / 67); 7008 /* sometimes add few bytes to test non-simd */ 7009 if (ztest_random(100) < 10) 7010 inc += P2ALIGN_TYPED(ztest_random(64), 7011 sizeof (uint32_t), uint64_t); 7012 7013 if (inc > (size - pos)) 7014 inc = size - pos; 7015 7016 fletcher_4_incremental_native(buf + pos, inc, 7017 &zc); 7018 fletcher_4_incremental_byteswap(buf + pos, inc, 7019 &zc_bswap); 7020 7021 pos += inc; 7022 } 7023 7024 VERIFY3U(pos, ==, size); 7025 7026 VERIFY(ZIO_CHECKSUM_EQUAL(zc, zc_ref)); 7027 VERIFY(ZIO_CHECKSUM_EQUAL(zc_bswap, zc_ref_bswap)); 7028 7029 /* 7030 * verify if incremental on the whole buffer is 7031 * equivalent to non-incremental version 7032 */ 7033 ZIO_SET_CHECKSUM(&zc, 0, 0, 0, 0); 7034 ZIO_SET_CHECKSUM(&zc_bswap, 0, 0, 0, 0); 7035 7036 fletcher_4_incremental_native(buf, size, &zc); 7037 fletcher_4_incremental_byteswap(buf, size, 
			    &zc_bswap);

			VERIFY(ZIO_CHECKSUM_EQUAL(zc, zc_ref));
			VERIFY(ZIO_CHECKSUM_EQUAL(zc_bswap, zc_ref_bswap));
		}

		umem_free(buf, size);
	}
}

void
ztest_pool_prefetch_ddt(ztest_ds_t *zd, uint64_t id)
{
	(void) zd, (void) id;
	spa_t *spa;

	(void) pthread_rwlock_rdlock(&ztest_name_lock);
	VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG));

	ddt_prefetch_all(spa);

	spa_close(spa, FTAG);
	(void) pthread_rwlock_unlock(&ztest_name_lock);
}

static int
ztest_set_global_vars(void)
{
	for (size_t i = 0; i < ztest_opts.zo_gvars_count; i++) {
		char *kv = ztest_opts.zo_gvars[i];
		VERIFY3U(strlen(kv), <=, ZO_GVARS_MAX_ARGLEN);
		VERIFY3U(strlen(kv), >, 0);
		int err = set_global_var(kv);
		if (ztest_opts.zo_verbose > 0) {
			(void) printf("setting global var %s ... %s\n", kv,
			    err ? "failed" : "ok");
		}
		if (err != 0) {
			(void) fprintf(stderr,
			    "failed to set global var '%s'\n", kv);
			return (err);
		}
	}
	return (0);
}

static char **
ztest_global_vars_to_zdb_args(void)
{
	char **args = calloc(2*ztest_opts.zo_gvars_count + 1, sizeof (char *));
	char **cur = args;
	if (args == NULL)
		return (NULL);
	for (size_t i = 0; i < ztest_opts.zo_gvars_count; i++) {
		*cur++ = (char *)"-o";
		*cur++ = ztest_opts.zo_gvars[i];
	}
	ASSERT3P(cur, ==, &args[2*ztest_opts.zo_gvars_count]);
	*cur = NULL;
	return (args);
}

/* The end of strings is indicated by a NULL element */
static char *
join_strings(char **strings, const char *sep)
{
	size_t totallen = 0;
	for (char **sp = strings; *sp != NULL; sp++) {
		totallen += strlen(*sp);
		totallen += strlen(sep);
	}
	if (totallen > 0) {
		ASSERT(totallen >= strlen(sep));
		totallen -= strlen(sep);
	}

	size_t buflen = totallen + 1;
	char *o = umem_alloc(buflen, UMEM_NOFAIL); /* trailing 0 byte */
	o[0] = '\0';
	for (char **sp = strings; *sp != NULL; sp++) {
		size_t would;
		would = strlcat(o, *sp, buflen);
		VERIFY3U(would, <, buflen);
		if (*(sp+1) == NULL) {
			break;
		}
		would = strlcat(o, sep, buflen);
		VERIFY3U(would, <, buflen);
	}
	ASSERT3S(strlen(o), ==, totallen);
	return (o);
}

static int
ztest_check_path(char *path)
{
	struct stat s;
	/* return true on success */
	return (!stat(path, &s));
}

static void
ztest_get_zdb_bin(char *bin, int len)
{
	char *zdb_path;
	/*
	 * Try to use $ZDB and the in-tree zdb path. If not successful, just
	 * let popen search through PATH.
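	 *
	 * The effective search order is therefore: an explicit $ZDB (which
	 * must exist), then a zdb binary sitting next to an in-tree
	 * .libs/ztest executable, and finally a bare "zdb" resolved via PATH.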
7145 */ 7146 if ((zdb_path = getenv("ZDB"))) { 7147 strlcpy(bin, zdb_path, len); /* In env */ 7148 if (!ztest_check_path(bin)) { 7149 ztest_dump_core = 0; 7150 fatal(B_TRUE, "invalid ZDB '%s'", bin); 7151 } 7152 return; 7153 } 7154 7155 VERIFY3P(realpath(getexecname(), bin), !=, NULL); 7156 if (strstr(bin, ".libs/ztest")) { 7157 strstr(bin, ".libs/ztest")[0] = '\0'; /* In-tree */ 7158 strcat(bin, "zdb"); 7159 if (ztest_check_path(bin)) 7160 return; 7161 } 7162 strcpy(bin, "zdb"); 7163 } 7164 7165 static vdev_t * 7166 ztest_random_concrete_vdev_leaf(vdev_t *vd) 7167 { 7168 if (vd == NULL) 7169 return (NULL); 7170 7171 if (vd->vdev_children == 0) 7172 return (vd); 7173 7174 vdev_t *eligible[vd->vdev_children]; 7175 int eligible_idx = 0, i; 7176 for (i = 0; i < vd->vdev_children; i++) { 7177 vdev_t *cvd = vd->vdev_child[i]; 7178 if (cvd->vdev_top->vdev_removing) 7179 continue; 7180 if (cvd->vdev_children > 0 || 7181 (vdev_is_concrete(cvd) && !cvd->vdev_detached)) { 7182 eligible[eligible_idx++] = cvd; 7183 } 7184 } 7185 VERIFY3S(eligible_idx, >, 0); 7186 7187 uint64_t child_no = ztest_random(eligible_idx); 7188 return (ztest_random_concrete_vdev_leaf(eligible[child_no])); 7189 } 7190 7191 void 7192 ztest_initialize(ztest_ds_t *zd, uint64_t id) 7193 { 7194 (void) zd, (void) id; 7195 spa_t *spa = ztest_spa; 7196 int error = 0; 7197 7198 mutex_enter(&ztest_vdev_lock); 7199 7200 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 7201 7202 /* Random leaf vdev */ 7203 vdev_t *rand_vd = ztest_random_concrete_vdev_leaf(spa->spa_root_vdev); 7204 if (rand_vd == NULL) { 7205 spa_config_exit(spa, SCL_VDEV, FTAG); 7206 mutex_exit(&ztest_vdev_lock); 7207 return; 7208 } 7209 7210 /* 7211 * The random vdev we've selected may change as soon as we 7212 * drop the spa_config_lock. We create local copies of things 7213 * we're interested in. 
7214 */ 7215 uint64_t guid = rand_vd->vdev_guid; 7216 char *path = strdup(rand_vd->vdev_path); 7217 boolean_t active = rand_vd->vdev_initialize_thread != NULL; 7218 7219 zfs_dbgmsg("vd %px, guid %llu", rand_vd, (u_longlong_t)guid); 7220 spa_config_exit(spa, SCL_VDEV, FTAG); 7221 7222 uint64_t cmd = ztest_random(POOL_INITIALIZE_FUNCS); 7223 7224 nvlist_t *vdev_guids = fnvlist_alloc(); 7225 nvlist_t *vdev_errlist = fnvlist_alloc(); 7226 fnvlist_add_uint64(vdev_guids, path, guid); 7227 error = spa_vdev_initialize(spa, vdev_guids, cmd, vdev_errlist); 7228 fnvlist_free(vdev_guids); 7229 fnvlist_free(vdev_errlist); 7230 7231 switch (cmd) { 7232 case POOL_INITIALIZE_CANCEL: 7233 if (ztest_opts.zo_verbose >= 4) { 7234 (void) printf("Cancel initialize %s", path); 7235 if (!active) 7236 (void) printf(" failed (no initialize active)"); 7237 (void) printf("\n"); 7238 } 7239 break; 7240 case POOL_INITIALIZE_START: 7241 if (ztest_opts.zo_verbose >= 4) { 7242 (void) printf("Start initialize %s", path); 7243 if (active && error == 0) 7244 (void) printf(" failed (already active)"); 7245 else if (error != 0) 7246 (void) printf(" failed (error %d)", error); 7247 (void) printf("\n"); 7248 } 7249 break; 7250 case POOL_INITIALIZE_SUSPEND: 7251 if (ztest_opts.zo_verbose >= 4) { 7252 (void) printf("Suspend initialize %s", path); 7253 if (!active) 7254 (void) printf(" failed (no initialize active)"); 7255 (void) printf("\n"); 7256 } 7257 break; 7258 } 7259 free(path); 7260 mutex_exit(&ztest_vdev_lock); 7261 } 7262 7263 void 7264 ztest_trim(ztest_ds_t *zd, uint64_t id) 7265 { 7266 (void) zd, (void) id; 7267 spa_t *spa = ztest_spa; 7268 int error = 0; 7269 7270 mutex_enter(&ztest_vdev_lock); 7271 7272 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 7273 7274 /* Random leaf vdev */ 7275 vdev_t *rand_vd = ztest_random_concrete_vdev_leaf(spa->spa_root_vdev); 7276 if (rand_vd == NULL) { 7277 spa_config_exit(spa, SCL_VDEV, FTAG); 7278 mutex_exit(&ztest_vdev_lock); 7279 return; 7280 } 7281 7282 /* 7283 * The random vdev we've selected may change as soon as we 7284 * drop the spa_config_lock. We create local copies of things 7285 * we're interested in. 
7286 */ 7287 uint64_t guid = rand_vd->vdev_guid; 7288 char *path = strdup(rand_vd->vdev_path); 7289 boolean_t active = rand_vd->vdev_trim_thread != NULL; 7290 7291 zfs_dbgmsg("vd %p, guid %llu", rand_vd, (u_longlong_t)guid); 7292 spa_config_exit(spa, SCL_VDEV, FTAG); 7293 7294 uint64_t cmd = ztest_random(POOL_TRIM_FUNCS); 7295 uint64_t rate = 1 << ztest_random(30); 7296 boolean_t partial = (ztest_random(5) > 0); 7297 boolean_t secure = (ztest_random(5) > 0); 7298 7299 nvlist_t *vdev_guids = fnvlist_alloc(); 7300 nvlist_t *vdev_errlist = fnvlist_alloc(); 7301 fnvlist_add_uint64(vdev_guids, path, guid); 7302 error = spa_vdev_trim(spa, vdev_guids, cmd, rate, partial, 7303 secure, vdev_errlist); 7304 fnvlist_free(vdev_guids); 7305 fnvlist_free(vdev_errlist); 7306 7307 switch (cmd) { 7308 case POOL_TRIM_CANCEL: 7309 if (ztest_opts.zo_verbose >= 4) { 7310 (void) printf("Cancel TRIM %s", path); 7311 if (!active) 7312 (void) printf(" failed (no TRIM active)"); 7313 (void) printf("\n"); 7314 } 7315 break; 7316 case POOL_TRIM_START: 7317 if (ztest_opts.zo_verbose >= 4) { 7318 (void) printf("Start TRIM %s", path); 7319 if (active && error == 0) 7320 (void) printf(" failed (already active)"); 7321 else if (error != 0) 7322 (void) printf(" failed (error %d)", error); 7323 (void) printf("\n"); 7324 } 7325 break; 7326 case POOL_TRIM_SUSPEND: 7327 if (ztest_opts.zo_verbose >= 4) { 7328 (void) printf("Suspend TRIM %s", path); 7329 if (!active) 7330 (void) printf(" failed (no TRIM active)"); 7331 (void) printf("\n"); 7332 } 7333 break; 7334 } 7335 free(path); 7336 mutex_exit(&ztest_vdev_lock); 7337 } 7338 7339 void 7340 ztest_ddt_prune(ztest_ds_t *zd, uint64_t id) 7341 { 7342 (void) zd, (void) id; 7343 7344 spa_t *spa = ztest_spa; 7345 uint64_t pct = ztest_random(15) + 1; 7346 7347 (void) ddt_prune_unique_entries(spa, ZPOOL_DDT_PRUNE_PERCENTAGE, pct); 7348 } 7349 7350 /* 7351 * Verify pool integrity by running zdb. 7352 */ 7353 static void 7354 ztest_run_zdb(uint64_t guid) 7355 { 7356 int status; 7357 char *bin; 7358 char *zdb; 7359 char *zbuf; 7360 const int len = MAXPATHLEN + MAXNAMELEN + 20; 7361 FILE *fp; 7362 7363 bin = umem_alloc(len, UMEM_NOFAIL); 7364 zdb = umem_alloc(len, UMEM_NOFAIL); 7365 zbuf = umem_alloc(1024, UMEM_NOFAIL); 7366 7367 ztest_get_zdb_bin(bin, len); 7368 7369 char **set_gvars_args = ztest_global_vars_to_zdb_args(); 7370 if (set_gvars_args == NULL) { 7371 fatal(B_FALSE, "Failed to allocate memory in " 7372 "ztest_global_vars_to_zdb_args(). Cannot run zdb.\n"); 7373 } 7374 char *set_gvars_args_joined = join_strings(set_gvars_args, " "); 7375 free(set_gvars_args); 7376 7377 size_t would = snprintf(zdb, len, 7378 "%s -bcc%s%s -G -d -Y -e -y %s -p %s %"PRIu64, 7379 bin, 7380 ztest_opts.zo_verbose >= 3 ? "s" : "", 7381 ztest_opts.zo_verbose >= 4 ? 
"v" : "", 7382 set_gvars_args_joined, 7383 ztest_opts.zo_dir, 7384 guid); 7385 ASSERT3U(would, <, len); 7386 7387 umem_free(set_gvars_args_joined, strlen(set_gvars_args_joined) + 1); 7388 7389 if (ztest_opts.zo_verbose >= 5) 7390 (void) printf("Executing %s\n", zdb); 7391 7392 fp = popen(zdb, "r"); 7393 7394 while (fgets(zbuf, 1024, fp) != NULL) 7395 if (ztest_opts.zo_verbose >= 3) 7396 (void) printf("%s", zbuf); 7397 7398 status = pclose(fp); 7399 7400 if (status == 0) 7401 goto out; 7402 7403 ztest_dump_core = 0; 7404 if (WIFEXITED(status)) 7405 fatal(B_FALSE, "'%s' exit code %d", zdb, WEXITSTATUS(status)); 7406 else 7407 fatal(B_FALSE, "'%s' died with signal %d", 7408 zdb, WTERMSIG(status)); 7409 out: 7410 umem_free(bin, len); 7411 umem_free(zdb, len); 7412 umem_free(zbuf, 1024); 7413 } 7414 7415 static void 7416 ztest_walk_pool_directory(const char *header) 7417 { 7418 spa_t *spa = NULL; 7419 7420 if (ztest_opts.zo_verbose >= 6) 7421 (void) puts(header); 7422 7423 mutex_enter(&spa_namespace_lock); 7424 while ((spa = spa_next(spa)) != NULL) 7425 if (ztest_opts.zo_verbose >= 6) 7426 (void) printf("\t%s\n", spa_name(spa)); 7427 mutex_exit(&spa_namespace_lock); 7428 } 7429 7430 static void 7431 ztest_spa_import_export(char *oldname, char *newname) 7432 { 7433 nvlist_t *config, *newconfig; 7434 uint64_t pool_guid; 7435 spa_t *spa; 7436 int error; 7437 7438 if (ztest_opts.zo_verbose >= 4) { 7439 (void) printf("import/export: old = %s, new = %s\n", 7440 oldname, newname); 7441 } 7442 7443 /* 7444 * Clean up from previous runs. 7445 */ 7446 (void) spa_destroy(newname); 7447 7448 /* 7449 * Get the pool's configuration and guid. 7450 */ 7451 VERIFY0(spa_open(oldname, &spa, FTAG)); 7452 7453 /* 7454 * Kick off a scrub to tickle scrub/export races. 7455 */ 7456 if (ztest_random(2) == 0) 7457 (void) spa_scan(spa, POOL_SCAN_SCRUB); 7458 7459 pool_guid = spa_guid(spa); 7460 spa_close(spa, FTAG); 7461 7462 ztest_walk_pool_directory("pools before export"); 7463 7464 /* 7465 * Export it. 7466 */ 7467 VERIFY0(spa_export(oldname, &config, B_FALSE, B_FALSE)); 7468 7469 ztest_walk_pool_directory("pools after export"); 7470 7471 /* 7472 * Try to import it. 7473 */ 7474 newconfig = spa_tryimport(config); 7475 ASSERT3P(newconfig, !=, NULL); 7476 fnvlist_free(newconfig); 7477 7478 /* 7479 * Import it under the new name. 7480 */ 7481 error = spa_import(newname, config, NULL, 0); 7482 if (error != 0) { 7483 dump_nvlist(config, 0); 7484 fatal(B_FALSE, "couldn't import pool %s as %s: error %u", 7485 oldname, newname, error); 7486 } 7487 7488 ztest_walk_pool_directory("pools after import"); 7489 7490 /* 7491 * Try to import it again -- should fail with EEXIST. 7492 */ 7493 VERIFY3U(EEXIST, ==, spa_import(newname, config, NULL, 0)); 7494 7495 /* 7496 * Try to import it under a different name -- should fail with EEXIST. 7497 */ 7498 VERIFY3U(EEXIST, ==, spa_import(oldname, config, NULL, 0)); 7499 7500 /* 7501 * Verify that the pool is no longer visible under the old name. 7502 */ 7503 VERIFY3U(ENOENT, ==, spa_open(oldname, &spa, FTAG)); 7504 7505 /* 7506 * Verify that we can open and close the pool using the new name. 
7507 */ 7508 VERIFY0(spa_open(newname, &spa, FTAG)); 7509 ASSERT3U(pool_guid, ==, spa_guid(spa)); 7510 spa_close(spa, FTAG); 7511 7512 fnvlist_free(config); 7513 } 7514 7515 static void 7516 ztest_resume(spa_t *spa) 7517 { 7518 if (spa_suspended(spa) && ztest_opts.zo_verbose >= 6) 7519 (void) printf("resuming from suspended state\n"); 7520 spa_vdev_state_enter(spa, SCL_NONE); 7521 vdev_clear(spa, NULL); 7522 (void) spa_vdev_state_exit(spa, NULL, 0); 7523 (void) zio_resume(spa); 7524 } 7525 7526 static __attribute__((noreturn)) void 7527 ztest_resume_thread(void *arg) 7528 { 7529 spa_t *spa = arg; 7530 7531 /* 7532 * Synthesize aged DDT entries for ddt prune testing 7533 */ 7534 ddt_prune_artificial_age = B_TRUE; 7535 if (ztest_opts.zo_verbose >= 3) 7536 ddt_dump_prune_histogram = B_TRUE; 7537 7538 while (!ztest_exiting) { 7539 if (spa_suspended(spa)) 7540 ztest_resume(spa); 7541 (void) poll(NULL, 0, 100); 7542 7543 /* 7544 * Periodically change the zfs_compressed_arc_enabled setting. 7545 */ 7546 if (ztest_random(10) == 0) 7547 zfs_compressed_arc_enabled = ztest_random(2); 7548 7549 /* 7550 * Periodically change the zfs_abd_scatter_enabled setting. 7551 */ 7552 if (ztest_random(10) == 0) 7553 zfs_abd_scatter_enabled = ztest_random(2); 7554 } 7555 7556 thread_exit(); 7557 } 7558 7559 static __attribute__((noreturn)) void 7560 ztest_deadman_thread(void *arg) 7561 { 7562 ztest_shared_t *zs = arg; 7563 spa_t *spa = ztest_spa; 7564 hrtime_t delay, overdue, last_run = gethrtime(); 7565 7566 delay = (zs->zs_thread_stop - zs->zs_thread_start) + 7567 MSEC2NSEC(zfs_deadman_synctime_ms); 7568 7569 while (!ztest_exiting) { 7570 /* 7571 * Wait for the delay timer while checking occasionally 7572 * if we should stop. 7573 */ 7574 if (gethrtime() < last_run + delay) { 7575 (void) poll(NULL, 0, 1000); 7576 continue; 7577 } 7578 7579 /* 7580 * If the pool is suspended then fail immediately. Otherwise, 7581 * check to see if the pool is making any progress. If 7582 * vdev_deadman() discovers that there hasn't been any recent 7583 * I/Os then it will end up aborting the tests. 7584 */ 7585 if (spa_suspended(spa) || spa->spa_root_vdev == NULL) { 7586 fatal(B_FALSE, 7587 "aborting test after %llu seconds because " 7588 "pool has transitioned to a suspended state.", 7589 (u_longlong_t)zfs_deadman_synctime_ms / 1000); 7590 } 7591 vdev_deadman(spa->spa_root_vdev, FTAG); 7592 7593 /* 7594 * If the process doesn't complete within a grace period of 7595 * zfs_deadman_synctime_ms over the expected finish time, 7596 * then it may be hung and is terminated. 
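		 * (That is, the hard deadline is zs_proc_stop plus
		 * zfs_deadman_synctime_ms; once gethrtime() passes it, the
		 * whole run is aborted.)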
7597 */ 7598 overdue = zs->zs_proc_stop + MSEC2NSEC(zfs_deadman_synctime_ms); 7599 if (gethrtime() > overdue) { 7600 fatal(B_FALSE, 7601 "aborting test after %llu seconds because " 7602 "the process is overdue for termination.", 7603 (gethrtime() - zs->zs_proc_start) / NANOSEC); 7604 } 7605 7606 (void) printf("ztest has been running for %lld seconds\n", 7607 (gethrtime() - zs->zs_proc_start) / NANOSEC); 7608 7609 last_run = gethrtime(); 7610 delay = MSEC2NSEC(zfs_deadman_checktime_ms); 7611 } 7612 7613 thread_exit(); 7614 } 7615 7616 static void 7617 ztest_execute(int test, ztest_info_t *zi, uint64_t id) 7618 { 7619 ztest_ds_t *zd = &ztest_ds[id % ztest_opts.zo_datasets]; 7620 ztest_shared_callstate_t *zc = ZTEST_GET_SHARED_CALLSTATE(test); 7621 hrtime_t functime = gethrtime(); 7622 int i; 7623 7624 for (i = 0; i < zi->zi_iters; i++) 7625 zi->zi_func(zd, id); 7626 7627 functime = gethrtime() - functime; 7628 7629 atomic_add_64(&zc->zc_count, 1); 7630 atomic_add_64(&zc->zc_time, functime); 7631 7632 if (ztest_opts.zo_verbose >= 4) 7633 (void) printf("%6.2f sec in %s\n", 7634 (double)functime / NANOSEC, zi->zi_funcname); 7635 } 7636 7637 typedef struct ztest_raidz_expand_io { 7638 uint64_t rzx_id; 7639 uint64_t rzx_amount; 7640 uint64_t rzx_bufsize; 7641 const void *rzx_buffer; 7642 uint64_t rzx_alloc_max; 7643 spa_t *rzx_spa; 7644 } ztest_expand_io_t; 7645 7646 #undef OD_ARRAY_SIZE 7647 #define OD_ARRAY_SIZE 10 7648 7649 /* 7650 * Write a request amount of data to some dataset objects. 7651 * There will be ztest_opts.zo_threads count of these running in parallel. 7652 */ 7653 static __attribute__((noreturn)) void 7654 ztest_rzx_thread(void *arg) 7655 { 7656 ztest_expand_io_t *info = (ztest_expand_io_t *)arg; 7657 ztest_od_t *od; 7658 int batchsize; 7659 int od_size; 7660 ztest_ds_t *zd = &ztest_ds[info->rzx_id % ztest_opts.zo_datasets]; 7661 spa_t *spa = info->rzx_spa; 7662 7663 od_size = sizeof (ztest_od_t) * OD_ARRAY_SIZE; 7664 od = umem_alloc(od_size, UMEM_NOFAIL); 7665 batchsize = OD_ARRAY_SIZE; 7666 7667 /* Create objects to write to */ 7668 for (int b = 0; b < batchsize; b++) { 7669 ztest_od_init(od + b, info->rzx_id, FTAG, b, 7670 DMU_OT_UINT64_OTHER, 0, 0, 0); 7671 } 7672 if (ztest_object_init(zd, od, od_size, B_FALSE) != 0) { 7673 umem_free(od, od_size); 7674 thread_exit(); 7675 } 7676 7677 for (uint64_t offset = 0, written = 0; written < info->rzx_amount; 7678 offset += info->rzx_bufsize) { 7679 /* write to 10 objects */ 7680 for (int i = 0; i < batchsize && written < info->rzx_amount; 7681 i++) { 7682 (void) pthread_rwlock_rdlock(&zd->zd_zilog_lock); 7683 ztest_write(zd, od[i].od_object, offset, 7684 info->rzx_bufsize, info->rzx_buffer); 7685 (void) pthread_rwlock_unlock(&zd->zd_zilog_lock); 7686 written += info->rzx_bufsize; 7687 } 7688 txg_wait_synced(spa_get_dsl(spa), 0); 7689 /* due to inflation, we'll typically bail here */ 7690 if (metaslab_class_get_alloc(spa_normal_class(spa)) > 7691 info->rzx_alloc_max) { 7692 break; 7693 } 7694 } 7695 7696 /* Remove a few objects to leave some holes in allocation space */ 7697 mutex_enter(&zd->zd_dirobj_lock); 7698 (void) ztest_remove(zd, od, 2); 7699 mutex_exit(&zd->zd_dirobj_lock); 7700 7701 umem_free(od, od_size); 7702 7703 thread_exit(); 7704 } 7705 7706 static __attribute__((noreturn)) void 7707 ztest_thread(void *arg) 7708 { 7709 int rand; 7710 uint64_t id = (uintptr_t)arg; 7711 ztest_shared_t *zs = ztest_shared; 7712 uint64_t call_next; 7713 hrtime_t now; 7714 ztest_info_t *zi; 7715 ztest_shared_callstate_t *zc; 7716 7717 while 
((now = gethrtime()) < zs->zs_thread_stop) { 7718 /* 7719 * See if it's time to force a crash. 7720 */ 7721 if (now > zs->zs_thread_kill && 7722 raidz_expand_pause_point == RAIDZ_EXPAND_PAUSE_NONE) { 7723 ztest_kill(zs); 7724 } 7725 7726 /* 7727 * If we're getting ENOSPC with some regularity, stop. 7728 */ 7729 if (zs->zs_enospc_count > 10) 7730 break; 7731 7732 /* 7733 * Pick a random function to execute. 7734 */ 7735 rand = ztest_random(ZTEST_FUNCS); 7736 zi = &ztest_info[rand]; 7737 zc = ZTEST_GET_SHARED_CALLSTATE(rand); 7738 call_next = zc->zc_next; 7739 7740 if (now >= call_next && 7741 atomic_cas_64(&zc->zc_next, call_next, call_next + 7742 ztest_random(2 * zi->zi_interval[0] + 1)) == call_next) { 7743 ztest_execute(rand, zi, id); 7744 } 7745 } 7746 7747 thread_exit(); 7748 } 7749 7750 static void 7751 ztest_dataset_name(char *dsname, const char *pool, int d) 7752 { 7753 (void) snprintf(dsname, ZFS_MAX_DATASET_NAME_LEN, "%s/ds_%d", pool, d); 7754 } 7755 7756 static void 7757 ztest_dataset_destroy(int d) 7758 { 7759 char name[ZFS_MAX_DATASET_NAME_LEN]; 7760 int t; 7761 7762 ztest_dataset_name(name, ztest_opts.zo_pool, d); 7763 7764 if (ztest_opts.zo_verbose >= 3) 7765 (void) printf("Destroying %s to free up space\n", name); 7766 7767 /* 7768 * Cleanup any non-standard clones and snapshots. In general, 7769 * ztest thread t operates on dataset (t % zopt_datasets), 7770 * so there may be more than one thing to clean up. 7771 */ 7772 for (t = d; t < ztest_opts.zo_threads; 7773 t += ztest_opts.zo_datasets) 7774 ztest_dsl_dataset_cleanup(name, t); 7775 7776 (void) dmu_objset_find(name, ztest_objset_destroy_cb, NULL, 7777 DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN); 7778 } 7779 7780 static void 7781 ztest_dataset_dirobj_verify(ztest_ds_t *zd) 7782 { 7783 uint64_t usedobjs, dirobjs, scratch; 7784 7785 /* 7786 * ZTEST_DIROBJ is the object directory for the entire dataset. 7787 * Therefore, the number of objects in use should equal the 7788 * number of ZTEST_DIROBJ entries, +1 for ZTEST_DIROBJ itself. 7789 * If not, we have an object leak. 7790 * 7791 * Note that we can only check this in ztest_dataset_open(), 7792 * when the open-context and syncing-context values agree. 7793 * That's because zap_count() returns the open-context value, 7794 * while dmu_objset_space() returns the rootbp fill count. 
7795 */ 7796 VERIFY0(zap_count(zd->zd_os, ZTEST_DIROBJ, &dirobjs)); 7797 dmu_objset_space(zd->zd_os, &scratch, &scratch, &usedobjs, &scratch); 7798 ASSERT3U(dirobjs + 1, ==, usedobjs); 7799 } 7800 7801 static int 7802 ztest_dataset_open(int d) 7803 { 7804 ztest_ds_t *zd = &ztest_ds[d]; 7805 uint64_t committed_seq = ZTEST_GET_SHARED_DS(d)->zd_seq; 7806 objset_t *os; 7807 zilog_t *zilog; 7808 char name[ZFS_MAX_DATASET_NAME_LEN]; 7809 int error; 7810 7811 ztest_dataset_name(name, ztest_opts.zo_pool, d); 7812 7813 (void) pthread_rwlock_rdlock(&ztest_name_lock); 7814 7815 error = ztest_dataset_create(name); 7816 if (error == ENOSPC) { 7817 (void) pthread_rwlock_unlock(&ztest_name_lock); 7818 ztest_record_enospc(FTAG); 7819 return (error); 7820 } 7821 ASSERT(error == 0 || error == EEXIST); 7822 7823 VERIFY0(ztest_dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, 7824 B_TRUE, zd, &os)); 7825 (void) pthread_rwlock_unlock(&ztest_name_lock); 7826 7827 ztest_zd_init(zd, ZTEST_GET_SHARED_DS(d), os); 7828 7829 zilog = zd->zd_zilog; 7830 7831 if (zilog->zl_header->zh_claim_lr_seq != 0 && 7832 zilog->zl_header->zh_claim_lr_seq < committed_seq) 7833 fatal(B_FALSE, "missing log records: " 7834 "claimed %"PRIu64" < committed %"PRIu64"", 7835 zilog->zl_header->zh_claim_lr_seq, committed_seq); 7836 7837 ztest_dataset_dirobj_verify(zd); 7838 7839 zil_replay(os, zd, ztest_replay_vector); 7840 7841 ztest_dataset_dirobj_verify(zd); 7842 7843 if (ztest_opts.zo_verbose >= 6) 7844 (void) printf("%s replay %"PRIu64" blocks, " 7845 "%"PRIu64" records, seq %"PRIu64"\n", 7846 zd->zd_name, 7847 zilog->zl_parse_blk_count, 7848 zilog->zl_parse_lr_count, 7849 zilog->zl_replaying_seq); 7850 7851 zilog = zil_open(os, ztest_get_data, NULL); 7852 7853 if (zilog->zl_replaying_seq != 0 && 7854 zilog->zl_replaying_seq < committed_seq) 7855 fatal(B_FALSE, "missing log records: " 7856 "replayed %"PRIu64" < committed %"PRIu64"", 7857 zilog->zl_replaying_seq, committed_seq); 7858 7859 return (0); 7860 } 7861 7862 static void 7863 ztest_dataset_close(int d) 7864 { 7865 ztest_ds_t *zd = &ztest_ds[d]; 7866 7867 zil_close(zd->zd_zilog); 7868 dmu_objset_disown(zd->zd_os, B_TRUE, zd); 7869 7870 ztest_zd_fini(zd); 7871 } 7872 7873 static int 7874 ztest_replay_zil_cb(const char *name, void *arg) 7875 { 7876 (void) arg; 7877 objset_t *os; 7878 ztest_ds_t *zdtmp; 7879 7880 VERIFY0(ztest_dmu_objset_own(name, DMU_OST_ANY, B_TRUE, 7881 B_TRUE, FTAG, &os)); 7882 7883 zdtmp = umem_alloc(sizeof (ztest_ds_t), UMEM_NOFAIL); 7884 7885 ztest_zd_init(zdtmp, NULL, os); 7886 zil_replay(os, zdtmp, ztest_replay_vector); 7887 ztest_zd_fini(zdtmp); 7888 7889 if (dmu_objset_zil(os)->zl_parse_lr_count != 0 && 7890 ztest_opts.zo_verbose >= 6) { 7891 zilog_t *zilog = dmu_objset_zil(os); 7892 7893 (void) printf("%s replay %"PRIu64" blocks, " 7894 "%"PRIu64" records, seq %"PRIu64"\n", 7895 name, 7896 zilog->zl_parse_blk_count, 7897 zilog->zl_parse_lr_count, 7898 zilog->zl_replaying_seq); 7899 } 7900 7901 umem_free(zdtmp, sizeof (ztest_ds_t)); 7902 7903 dmu_objset_disown(os, B_TRUE, FTAG); 7904 return (0); 7905 } 7906 7907 static void 7908 ztest_freeze(void) 7909 { 7910 ztest_ds_t *zd = &ztest_ds[0]; 7911 spa_t *spa; 7912 int numloops = 0; 7913 7914 /* freeze not supported during RAIDZ expansion */ 7915 if (ztest_opts.zo_raid_do_expand) 7916 return; 7917 7918 if (ztest_opts.zo_verbose >= 3) 7919 (void) printf("testing spa_freeze()...\n"); 7920 7921 raidz_scratch_verify(); 7922 kernel_init(SPA_MODE_READ | SPA_MODE_WRITE); 7923 VERIFY0(spa_open(ztest_opts.zo_pool, &spa, 
FTAG)); 7924 VERIFY0(ztest_dataset_open(0)); 7925 ztest_spa = spa; 7926 7927 /* 7928 * Force the first log block to be transactionally allocated. 7929 * We have to do this before we freeze the pool -- otherwise 7930 * the log chain won't be anchored. 7931 */ 7932 while (BP_IS_HOLE(&zd->zd_zilog->zl_header->zh_log)) { 7933 ztest_dmu_object_alloc_free(zd, 0); 7934 zil_commit(zd->zd_zilog, 0); 7935 } 7936 7937 txg_wait_synced(spa_get_dsl(spa), 0); 7938 7939 /* 7940 * Freeze the pool. This stops spa_sync() from doing anything, 7941 * so that the only way to record changes from now on is the ZIL. 7942 */ 7943 spa_freeze(spa); 7944 7945 /* 7946 * Because it is hard to predict how much space a write will actually 7947 * require beforehand, we leave ourselves some fudge space to write over 7948 * capacity. 7949 */ 7950 uint64_t capacity = metaslab_class_get_space(spa_normal_class(spa)) / 2; 7951 7952 /* 7953 * Run tests that generate log records but don't alter the pool config 7954 * or depend on DSL sync tasks (snapshots, objset create/destroy, etc). 7955 * We do a txg_wait_synced() after each iteration to force the txg 7956 * to increase well beyond the last synced value in the uberblock. 7957 * The ZIL should be OK with that. 7958 * 7959 * Run a random number of times less than zo_maxloops and ensure we do 7960 * not run out of space on the pool. 7961 */ 7962 while (ztest_random(10) != 0 && 7963 numloops++ < ztest_opts.zo_maxloops && 7964 metaslab_class_get_alloc(spa_normal_class(spa)) < capacity) { 7965 ztest_od_t od; 7966 ztest_od_init(&od, 0, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0, 0); 7967 VERIFY0(ztest_object_init(zd, &od, sizeof (od), B_FALSE)); 7968 ztest_io(zd, od.od_object, 7969 ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT); 7970 txg_wait_synced(spa_get_dsl(spa), 0); 7971 } 7972 7973 /* 7974 * Commit all of the changes we just generated. 7975 */ 7976 zil_commit(zd->zd_zilog, 0); 7977 txg_wait_synced(spa_get_dsl(spa), 0); 7978 7979 /* 7980 * Close our dataset and close the pool. 7981 */ 7982 ztest_dataset_close(0); 7983 spa_close(spa, FTAG); 7984 kernel_fini(); 7985 7986 /* 7987 * Open and close the pool and dataset to induce log replay. 7988 */ 7989 raidz_scratch_verify(); 7990 kernel_init(SPA_MODE_READ | SPA_MODE_WRITE); 7991 VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG)); 7992 ASSERT3U(spa_freeze_txg(spa), ==, UINT64_MAX); 7993 VERIFY0(ztest_dataset_open(0)); 7994 ztest_spa = spa; 7995 txg_wait_synced(spa_get_dsl(spa), 0); 7996 ztest_dataset_close(0); 7997 ztest_reguid(NULL, 0); 7998 7999 spa_close(spa, FTAG); 8000 kernel_fini(); 8001 } 8002 8003 static void 8004 ztest_import_impl(void) 8005 { 8006 importargs_t args = { 0 }; 8007 nvlist_t *cfg = NULL; 8008 int nsearch = 1; 8009 char *searchdirs[nsearch]; 8010 int flags = ZFS_IMPORT_MISSING_LOG; 8011 8012 searchdirs[0] = ztest_opts.zo_dir; 8013 args.paths = nsearch; 8014 args.path = searchdirs; 8015 args.can_be_active = B_FALSE; 8016 8017 libpc_handle_t lpch = { 8018 .lpc_lib_handle = NULL, 8019 .lpc_ops = &libzpool_config_ops, 8020 .lpc_printerr = B_TRUE 8021 }; 8022 VERIFY0(zpool_find_config(&lpch, ztest_opts.zo_pool, &cfg, &args)); 8023 VERIFY0(spa_import(ztest_opts.zo_pool, cfg, NULL, flags)); 8024 fnvlist_free(cfg); 8025 } 8026 8027 /* 8028 * Import a storage pool with the given name. 
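 *
 * The heavy lifting is done by ztest_import_impl() above: it uses
 * zpool_find_config() to locate the pool's vdev labels under zo_dir and
 * hands the resulting config nvlist to spa_import(), passing
 * ZFS_IMPORT_MISSING_LOG so the import is not refused merely because a
 * log device is missing.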
8029 */ 8030 static void 8031 ztest_import(ztest_shared_t *zs) 8032 { 8033 spa_t *spa; 8034 8035 mutex_init(&ztest_vdev_lock, NULL, MUTEX_DEFAULT, NULL); 8036 mutex_init(&ztest_checkpoint_lock, NULL, MUTEX_DEFAULT, NULL); 8037 VERIFY0(pthread_rwlock_init(&ztest_name_lock, NULL)); 8038 8039 raidz_scratch_verify(); 8040 kernel_init(SPA_MODE_READ | SPA_MODE_WRITE); 8041 8042 ztest_import_impl(); 8043 8044 VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG)); 8045 zs->zs_metaslab_sz = 8046 1ULL << spa->spa_root_vdev->vdev_child[0]->vdev_ms_shift; 8047 zs->zs_guid = spa_guid(spa); 8048 spa_close(spa, FTAG); 8049 8050 kernel_fini(); 8051 8052 if (!ztest_opts.zo_mmp_test) { 8053 ztest_run_zdb(zs->zs_guid); 8054 ztest_freeze(); 8055 ztest_run_zdb(zs->zs_guid); 8056 } 8057 8058 (void) pthread_rwlock_destroy(&ztest_name_lock); 8059 mutex_destroy(&ztest_vdev_lock); 8060 mutex_destroy(&ztest_checkpoint_lock); 8061 } 8062 8063 /* 8064 * After the expansion was killed, check that the pool is healthy 8065 */ 8066 static void 8067 ztest_raidz_expand_check(spa_t *spa) 8068 { 8069 ASSERT3U(ztest_opts.zo_raidz_expand_test, ==, RAIDZ_EXPAND_KILLED); 8070 /* 8071 * Set pool check done flag, main program will run a zdb check 8072 * of the pool when we exit. 8073 */ 8074 ztest_shared_opts->zo_raidz_expand_test = RAIDZ_EXPAND_CHECKED; 8075 8076 /* Wait for reflow to finish */ 8077 if (ztest_opts.zo_verbose >= 1) { 8078 (void) printf("\nwaiting for reflow to finish ...\n"); 8079 } 8080 pool_raidz_expand_stat_t rzx_stats; 8081 pool_raidz_expand_stat_t *pres = &rzx_stats; 8082 do { 8083 txg_wait_synced(spa_get_dsl(spa), 0); 8084 (void) poll(NULL, 0, 500); /* wait 1/2 second */ 8085 8086 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 8087 (void) spa_raidz_expand_get_stats(spa, pres); 8088 spa_config_exit(spa, SCL_CONFIG, FTAG); 8089 } while (pres->pres_state != DSS_FINISHED && 8090 pres->pres_reflowed < pres->pres_to_reflow); 8091 8092 if (ztest_opts.zo_verbose >= 1) { 8093 (void) printf("verifying an interrupted raidz " 8094 "expansion using a pool scrub ...\n"); 8095 } 8096 8097 /* Will fail here if there is non-recoverable corruption detected */ 8098 int error = ztest_scrub_impl(spa); 8099 if (error == EBUSY) 8100 error = 0; 8101 8102 VERIFY0(error); 8103 8104 if (ztest_opts.zo_verbose >= 1) { 8105 (void) printf("raidz expansion scrub check complete\n"); 8106 } 8107 } 8108 8109 /* 8110 * Start a raidz expansion test. We run some I/O on the pool for a while 8111 * to get some data in the pool. Then we grow the raidz and 8112 * kill the test at the requested offset into the reflow, verifying that 8113 * doing such does not lead to pool corruption. 
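 *
 * Roughly, the sequence below is:
 *
 * (1) start zo_threads writer threads and let them fill about 25% of
 *     the allocatable space (capped at 1 GiB),
 * (2) attach one more child to the raidz vdev with spa_vdev_attach()
 *     to kick off the reflow,
 * (3) wait for the reflow to copy a randomly chosen 25%, 50% or 75% of
 *     the allocated space, and
 * (4) SIGKILL ourselves; the next pass then runs
 *     ztest_raidz_expand_check() instead of the normal workload.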
8114 */ 8115 static void 8116 ztest_raidz_expand_run(ztest_shared_t *zs, spa_t *spa) 8117 { 8118 nvlist_t *root; 8119 pool_raidz_expand_stat_t rzx_stats; 8120 pool_raidz_expand_stat_t *pres = &rzx_stats; 8121 kthread_t **run_threads; 8122 vdev_t *cvd, *rzvd = spa->spa_root_vdev->vdev_child[0]; 8123 int total_disks = rzvd->vdev_children; 8124 int data_disks = total_disks - vdev_get_nparity(rzvd); 8125 uint64_t alloc_goal; 8126 uint64_t csize; 8127 int error, t; 8128 int threads = ztest_opts.zo_threads; 8129 ztest_expand_io_t *thread_args; 8130 8131 ASSERT3U(ztest_opts.zo_raidz_expand_test, !=, RAIDZ_EXPAND_NONE); 8132 ASSERT3P(rzvd->vdev_ops, ==, &vdev_raidz_ops); 8133 ztest_opts.zo_raidz_expand_test = RAIDZ_EXPAND_STARTED; 8134 8135 /* Setup a 1 MiB buffer of random data */ 8136 uint64_t bufsize = 1024 * 1024; 8137 void *buffer = umem_alloc(bufsize, UMEM_NOFAIL); 8138 8139 if (read(ztest_fd_rand, buffer, bufsize) != bufsize) { 8140 fatal(B_TRUE, "short read from /dev/urandom"); 8141 } 8142 /* 8143 * Put some data in the pool and then attach a vdev to initiate 8144 * reflow. 8145 */ 8146 run_threads = umem_zalloc(threads * sizeof (kthread_t *), UMEM_NOFAIL); 8147 thread_args = umem_zalloc(threads * sizeof (ztest_expand_io_t), 8148 UMEM_NOFAIL); 8149 /* Aim for roughly 25% of allocatable space up to 1GB */ 8150 alloc_goal = (vdev_get_min_asize(rzvd) * data_disks) / total_disks; 8151 alloc_goal = MIN(alloc_goal >> 2, 1024*1024*1024); 8152 if (ztest_opts.zo_verbose >= 1) { 8153 (void) printf("adding data to pool '%s', goal %llu bytes\n", 8154 ztest_opts.zo_pool, (u_longlong_t)alloc_goal); 8155 } 8156 8157 /* 8158 * Kick off all the I/O generators that run in parallel. 8159 */ 8160 for (t = 0; t < threads; t++) { 8161 if (t < ztest_opts.zo_datasets && ztest_dataset_open(t) != 0) { 8162 umem_free(run_threads, threads * sizeof (kthread_t *)); 8163 umem_free(buffer, bufsize); 8164 return; 8165 } 8166 thread_args[t].rzx_id = t; 8167 thread_args[t].rzx_amount = alloc_goal / threads; 8168 thread_args[t].rzx_bufsize = bufsize; 8169 thread_args[t].rzx_buffer = buffer; 8170 thread_args[t].rzx_alloc_max = alloc_goal; 8171 thread_args[t].rzx_spa = spa; 8172 run_threads[t] = thread_create(NULL, 0, ztest_rzx_thread, 8173 &thread_args[t], 0, NULL, TS_RUN | TS_JOINABLE, 8174 defclsyspri); 8175 } 8176 8177 /* 8178 * Wait for all of the writers to complete. 8179 */ 8180 for (t = 0; t < threads; t++) 8181 VERIFY0(thread_join(run_threads[t])); 8182 8183 /* 8184 * Close all datasets. This must be done after all the threads 8185 * are joined so we can be sure none of the datasets are in-use 8186 * by any of the threads. 
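 *
 * Only the first MIN(zo_datasets, zo_threads) writers opened a dataset
 * in the loop above, which is why the loop below uses the same
 * t < zo_datasets guard.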
8187 */ 8188 for (t = 0; t < ztest_opts.zo_threads; t++) { 8189 if (t < ztest_opts.zo_datasets) 8190 ztest_dataset_close(t); 8191 } 8192 8193 txg_wait_synced(spa_get_dsl(spa), 0); 8194 8195 zs->zs_alloc = metaslab_class_get_alloc(spa_normal_class(spa)); 8196 zs->zs_space = metaslab_class_get_space(spa_normal_class(spa)); 8197 8198 umem_free(buffer, bufsize); 8199 umem_free(run_threads, threads * sizeof (kthread_t *)); 8200 umem_free(thread_args, threads * sizeof (ztest_expand_io_t)); 8201 8202 /* Set our reflow target to 25%, 50% or 75% of allocated size */ 8203 uint_t multiple = ztest_random(3) + 1; 8204 uint64_t reflow_max = (rzvd->vdev_stat.vs_alloc * multiple) / 4; 8205 raidz_expand_max_reflow_bytes = reflow_max; 8206 8207 if (ztest_opts.zo_verbose >= 1) { 8208 (void) printf("running raidz expansion test, killing when " 8209 "reflow reaches %llu bytes (%u/4 of allocated space)\n", 8210 (u_longlong_t)reflow_max, multiple); 8211 } 8212 8213 /* XXX - do we want some I/O load during the reflow? */ 8214 8215 /* 8216 * Use a disk size that is larger than existing ones 8217 */ 8218 cvd = rzvd->vdev_child[0]; 8219 csize = vdev_get_min_asize(cvd); 8220 csize += csize / 10; 8221 /* 8222 * Path to vdev to be attached 8223 */ 8224 char *newpath = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); 8225 (void) snprintf(newpath, MAXPATHLEN, ztest_dev_template, 8226 ztest_opts.zo_dir, ztest_opts.zo_pool, rzvd->vdev_children); 8227 /* 8228 * Build the nvlist describing newpath. 8229 */ 8230 root = make_vdev_root(newpath, NULL, NULL, csize, ztest_get_ashift(), 8231 NULL, 0, 0, 1); 8232 /* 8233 * Expand the raidz vdev by attaching the new disk 8234 */ 8235 if (ztest_opts.zo_verbose >= 1) { 8236 (void) printf("expanding raidz: %d wide to %d wide with '%s'\n", 8237 (int)rzvd->vdev_children, (int)rzvd->vdev_children + 1, 8238 newpath); 8239 } 8240 error = spa_vdev_attach(spa, rzvd->vdev_guid, root, B_FALSE, B_FALSE); 8241 nvlist_free(root); 8242 if (error != 0) { 8243 fatal(0, "raidz expand: attach (%s %llu) returned %d", 8244 newpath, (long long)csize, error); 8245 } 8246 8247 /* 8248 * Wait for reflow to begin 8249 */ 8250 while (spa->spa_raidz_expand == NULL) { 8251 txg_wait_synced(spa_get_dsl(spa), 0); 8252 (void) poll(NULL, 0, 100); /* wait 1/10 second */ 8253 } 8254 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 8255 (void) spa_raidz_expand_get_stats(spa, pres); 8256 spa_config_exit(spa, SCL_CONFIG, FTAG); 8257 while (pres->pres_state != DSS_SCANNING) { 8258 txg_wait_synced(spa_get_dsl(spa), 0); 8259 (void) poll(NULL, 0, 100); /* wait 1/10 second */ 8260 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 8261 (void) spa_raidz_expand_get_stats(spa, pres); 8262 spa_config_exit(spa, SCL_CONFIG, FTAG); 8263 } 8264 8265 ASSERT3U(pres->pres_state, ==, DSS_SCANNING); 8266 ASSERT3U(pres->pres_to_reflow, !=, 0); 8267 /* 8268 * Set so when we are killed we go to raidz checking rather than 8269 * restarting test. 
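 *
 * The flag lives in the mmap()ed shared options (see setup_data()), so
 * it survives the SIGKILL below; the next child sees RAIDZ_EXPAND_KILLED
 * and runs ztest_raidz_expand_check() instead of restarting the
 * expansion from scratch.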
8270 */ 8271 ztest_shared_opts->zo_raidz_expand_test = RAIDZ_EXPAND_KILLED; 8272 if (ztest_opts.zo_verbose >= 1) { 8273 (void) printf("raidz expansion reflow started, waiting for " 8274 "%llu bytes to be copied\n", (u_longlong_t)reflow_max); 8275 } 8276 8277 /* 8278 * Wait for reflow maximum to be reached and then kill the test 8279 */ 8280 while (pres->pres_reflowed < reflow_max) { 8281 txg_wait_synced(spa_get_dsl(spa), 0); 8282 (void) poll(NULL, 0, 100); /* wait 1/10 second */ 8283 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 8284 (void) spa_raidz_expand_get_stats(spa, pres); 8285 spa_config_exit(spa, SCL_CONFIG, FTAG); 8286 } 8287 8288 /* Reset the reflow pause before killing */ 8289 raidz_expand_max_reflow_bytes = 0; 8290 8291 if (ztest_opts.zo_verbose >= 1) { 8292 (void) printf("killing raidz expansion test after reflow " 8293 "reached %llu bytes\n", (u_longlong_t)pres->pres_reflowed); 8294 } 8295 8296 /* 8297 * Kill ourself to simulate a panic during a reflow. Our parent will 8298 * restart the test and the changed flag value will drive the test 8299 * through the scrub/check code to verify the pool is not corrupted. 8300 */ 8301 ztest_kill(zs); 8302 } 8303 8304 static void 8305 ztest_generic_run(ztest_shared_t *zs, spa_t *spa) 8306 { 8307 kthread_t **run_threads; 8308 int t; 8309 8310 run_threads = umem_zalloc(ztest_opts.zo_threads * sizeof (kthread_t *), 8311 UMEM_NOFAIL); 8312 8313 /* 8314 * Kick off all the tests that run in parallel. 8315 */ 8316 for (t = 0; t < ztest_opts.zo_threads; t++) { 8317 if (t < ztest_opts.zo_datasets && ztest_dataset_open(t) != 0) { 8318 umem_free(run_threads, ztest_opts.zo_threads * 8319 sizeof (kthread_t *)); 8320 return; 8321 } 8322 8323 run_threads[t] = thread_create(NULL, 0, ztest_thread, 8324 (void *)(uintptr_t)t, 0, NULL, TS_RUN | TS_JOINABLE, 8325 defclsyspri); 8326 } 8327 8328 /* 8329 * Wait for all of the tests to complete. 8330 */ 8331 for (t = 0; t < ztest_opts.zo_threads; t++) 8332 VERIFY0(thread_join(run_threads[t])); 8333 8334 /* 8335 * Close all datasets. This must be done after all the threads 8336 * are joined so we can be sure none of the datasets are in-use 8337 * by any of the threads. 8338 */ 8339 for (t = 0; t < ztest_opts.zo_threads; t++) { 8340 if (t < ztest_opts.zo_datasets) 8341 ztest_dataset_close(t); 8342 } 8343 8344 txg_wait_synced(spa_get_dsl(spa), 0); 8345 8346 zs->zs_alloc = metaslab_class_get_alloc(spa_normal_class(spa)); 8347 zs->zs_space = metaslab_class_get_space(spa_normal_class(spa)); 8348 8349 umem_free(run_threads, ztest_opts.zo_threads * sizeof (kthread_t *)); 8350 } 8351 8352 /* 8353 * Setup our test context and kick off threads to run tests on all datasets 8354 * in parallel. 8355 */ 8356 static void 8357 ztest_run(ztest_shared_t *zs) 8358 { 8359 spa_t *spa; 8360 objset_t *os; 8361 kthread_t *resume_thread, *deadman_thread; 8362 uint64_t object; 8363 int error; 8364 int t, d; 8365 8366 ztest_exiting = B_FALSE; 8367 8368 /* 8369 * Initialize parent/child shared state. 
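 *
 * This block also picks the kill time for the pass: with probability
 * zo_killrate/100 the kill time is pulled back from zs_thread_stop by a
 * random fraction of zo_passtime, so a kill rate of 70 means roughly
 * 70% of passes end in a deliberate SIGKILL partway through the run.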
8370 */ 8371 mutex_init(&ztest_vdev_lock, NULL, MUTEX_DEFAULT, NULL); 8372 mutex_init(&ztest_checkpoint_lock, NULL, MUTEX_DEFAULT, NULL); 8373 VERIFY0(pthread_rwlock_init(&ztest_name_lock, NULL)); 8374 8375 zs->zs_thread_start = gethrtime(); 8376 zs->zs_thread_stop = 8377 zs->zs_thread_start + ztest_opts.zo_passtime * NANOSEC; 8378 zs->zs_thread_stop = MIN(zs->zs_thread_stop, zs->zs_proc_stop); 8379 zs->zs_thread_kill = zs->zs_thread_stop; 8380 if (ztest_random(100) < ztest_opts.zo_killrate) { 8381 zs->zs_thread_kill -= 8382 ztest_random(ztest_opts.zo_passtime * NANOSEC); 8383 } 8384 8385 mutex_init(&zcl.zcl_callbacks_lock, NULL, MUTEX_DEFAULT, NULL); 8386 8387 list_create(&zcl.zcl_callbacks, sizeof (ztest_cb_data_t), 8388 offsetof(ztest_cb_data_t, zcd_node)); 8389 8390 /* 8391 * Open our pool. It may need to be imported first depending on 8392 * what tests were running when the previous pass was terminated. 8393 */ 8394 raidz_scratch_verify(); 8395 kernel_init(SPA_MODE_READ | SPA_MODE_WRITE); 8396 error = spa_open(ztest_opts.zo_pool, &spa, FTAG); 8397 if (error) { 8398 VERIFY3S(error, ==, ENOENT); 8399 ztest_import_impl(); 8400 VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG)); 8401 zs->zs_metaslab_sz = 8402 1ULL << spa->spa_root_vdev->vdev_child[0]->vdev_ms_shift; 8403 } 8404 8405 metaslab_preload_limit = ztest_random(20) + 1; 8406 ztest_spa = spa; 8407 8408 /* 8409 * XXX - BUGBUG raidz expansion do not run this for generic for now 8410 */ 8411 if (ztest_opts.zo_raidz_expand_test != RAIDZ_EXPAND_NONE) 8412 VERIFY0(vdev_raidz_impl_set("cycle")); 8413 8414 dmu_objset_stats_t dds; 8415 VERIFY0(ztest_dmu_objset_own(ztest_opts.zo_pool, 8416 DMU_OST_ANY, B_TRUE, B_TRUE, FTAG, &os)); 8417 dsl_pool_config_enter(dmu_objset_pool(os), FTAG); 8418 dmu_objset_fast_stat(os, &dds); 8419 dsl_pool_config_exit(dmu_objset_pool(os), FTAG); 8420 dmu_objset_disown(os, B_TRUE, FTAG); 8421 8422 /* Give the dedicated raidz expansion test more grace time */ 8423 if (ztest_opts.zo_raidz_expand_test != RAIDZ_EXPAND_NONE) 8424 zfs_deadman_synctime_ms *= 2; 8425 8426 /* 8427 * Create a thread to periodically resume suspended I/O. 8428 */ 8429 resume_thread = thread_create(NULL, 0, ztest_resume_thread, 8430 spa, 0, NULL, TS_RUN | TS_JOINABLE, defclsyspri); 8431 8432 /* 8433 * Create a deadman thread and set to panic if we hang. 8434 */ 8435 deadman_thread = thread_create(NULL, 0, ztest_deadman_thread, 8436 zs, 0, NULL, TS_RUN | TS_JOINABLE, defclsyspri); 8437 8438 spa->spa_deadman_failmode = ZIO_FAILURE_MODE_PANIC; 8439 8440 /* 8441 * Verify that we can safely inquire about any object, 8442 * whether it's allocated or not. To make it interesting, 8443 * we probe a 5-wide window around each power of two. 8444 * This hits all edge cases, including zero and the max. 8445 */ 8446 for (t = 0; t < 64; t++) { 8447 for (d = -5; d <= 5; d++) { 8448 error = dmu_object_info(spa->spa_meta_objset, 8449 (1ULL << t) + d, NULL); 8450 ASSERT(error == 0 || error == ENOENT || 8451 error == EINVAL); 8452 } 8453 } 8454 8455 /* 8456 * If we got any ENOSPC errors on the previous run, destroy something. 
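 *
 * The victim dataset is chosen at random; ztest_dataset_destroy() first
 * cleans up the non-standard clones and snapshots left behind by every
 * thread mapped to that dataset, then destroys the dataset along with
 * its snapshots and children.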
8457 */ 8458 if (zs->zs_enospc_count != 0) { 8459 /* Not expecting ENOSPC errors during raidz expansion tests */ 8460 ASSERT3U(ztest_opts.zo_raidz_expand_test, ==, 8461 RAIDZ_EXPAND_NONE); 8462 8463 int d = ztest_random(ztest_opts.zo_datasets); 8464 ztest_dataset_destroy(d); 8465 } 8466 zs->zs_enospc_count = 0; 8467 8468 /* 8469 * If we were in the middle of ztest_device_removal() and were killed 8470 * we need to ensure the removal and scrub complete before running 8471 * any tests that check ztest_device_removal_active. The removal will 8472 * be restarted automatically when the spa is opened, but we need to 8473 * initiate the scrub manually if it is not already in progress. Note 8474 * that we always run the scrub whenever an indirect vdev exists 8475 * because we have no way of knowing for sure if ztest_device_removal() 8476 * fully completed its scrub before the pool was reimported. 8477 * 8478 * Does not apply for the RAIDZ expansion specific test runs 8479 */ 8480 if (ztest_opts.zo_raidz_expand_test == RAIDZ_EXPAND_NONE && 8481 (spa->spa_removing_phys.sr_state == DSS_SCANNING || 8482 spa->spa_removing_phys.sr_prev_indirect_vdev != -1)) { 8483 while (spa->spa_removing_phys.sr_state == DSS_SCANNING) 8484 txg_wait_synced(spa_get_dsl(spa), 0); 8485 8486 error = ztest_scrub_impl(spa); 8487 if (error == EBUSY) 8488 error = 0; 8489 ASSERT0(error); 8490 } 8491 8492 if (ztest_opts.zo_verbose >= 4) 8493 (void) printf("starting main threads...\n"); 8494 8495 /* 8496 * Replay all logs of all datasets in the pool. This is primarily for 8497 * temporary datasets which wouldn't otherwise get replayed, which 8498 * can trigger failures when attempting to offline a SLOG in 8499 * ztest_fault_inject(). 8500 */ 8501 (void) dmu_objset_find(ztest_opts.zo_pool, ztest_replay_zil_cb, 8502 NULL, DS_FIND_CHILDREN); 8503 8504 if (ztest_opts.zo_raidz_expand_test == RAIDZ_EXPAND_REQUESTED) 8505 ztest_raidz_expand_run(zs, spa); 8506 else if (ztest_opts.zo_raidz_expand_test == RAIDZ_EXPAND_KILLED) 8507 ztest_raidz_expand_check(spa); 8508 else 8509 ztest_generic_run(zs, spa); 8510 8511 /* Kill the resume and deadman threads */ 8512 ztest_exiting = B_TRUE; 8513 VERIFY0(thread_join(resume_thread)); 8514 VERIFY0(thread_join(deadman_thread)); 8515 ztest_resume(spa); 8516 8517 /* 8518 * Right before closing the pool, kick off a bunch of async I/O; 8519 * spa_close() should wait for it to complete. 8520 */ 8521 for (object = 1; object < 50; object++) { 8522 dmu_prefetch(spa->spa_meta_objset, object, 0, 0, 1ULL << 20, 8523 ZIO_PRIORITY_SYNC_READ); 8524 } 8525 8526 /* Verify that at least one commit cb was called in a timely fashion */ 8527 if (zc_cb_counter >= ZTEST_COMMIT_CB_MIN_REG) 8528 VERIFY0(zc_min_txg_delay); 8529 8530 spa_close(spa, FTAG); 8531 8532 /* 8533 * Verify that we can loop over all pools. 8534 */ 8535 mutex_enter(&spa_namespace_lock); 8536 for (spa = spa_next(NULL); spa != NULL; spa = spa_next(spa)) 8537 if (ztest_opts.zo_verbose > 3) 8538 (void) printf("spa_next: found %s\n", spa_name(spa)); 8539 mutex_exit(&spa_namespace_lock); 8540 8541 /* 8542 * Verify that we can export the pool and reimport it under a 8543 * different name. 
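 *
 * This happens on roughly half of the passes: the pool is exported and
 * imported as "<pool>_import", then immediately moved back so the next
 * pass still finds it under zo_pool. MMP test runs skip this entirely.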
8544 */ 8545 if ((ztest_random(2) == 0) && !ztest_opts.zo_mmp_test) { 8546 char name[ZFS_MAX_DATASET_NAME_LEN]; 8547 (void) snprintf(name, sizeof (name), "%s_import", 8548 ztest_opts.zo_pool); 8549 ztest_spa_import_export(ztest_opts.zo_pool, name); 8550 ztest_spa_import_export(name, ztest_opts.zo_pool); 8551 } 8552 8553 kernel_fini(); 8554 8555 list_destroy(&zcl.zcl_callbacks); 8556 mutex_destroy(&zcl.zcl_callbacks_lock); 8557 (void) pthread_rwlock_destroy(&ztest_name_lock); 8558 mutex_destroy(&ztest_vdev_lock); 8559 mutex_destroy(&ztest_checkpoint_lock); 8560 } 8561 8562 static void 8563 print_time(hrtime_t t, char *timebuf) 8564 { 8565 hrtime_t s = t / NANOSEC; 8566 hrtime_t m = s / 60; 8567 hrtime_t h = m / 60; 8568 hrtime_t d = h / 24; 8569 8570 s -= m * 60; 8571 m -= h * 60; 8572 h -= d * 24; 8573 8574 timebuf[0] = '\0'; 8575 8576 if (d) 8577 (void) sprintf(timebuf, 8578 "%llud%02lluh%02llum%02llus", d, h, m, s); 8579 else if (h) 8580 (void) sprintf(timebuf, "%lluh%02llum%02llus", h, m, s); 8581 else if (m) 8582 (void) sprintf(timebuf, "%llum%02llus", m, s); 8583 else 8584 (void) sprintf(timebuf, "%llus", s); 8585 } 8586 8587 static nvlist_t * 8588 make_random_pool_props(void) 8589 { 8590 nvlist_t *props; 8591 8592 props = fnvlist_alloc(); 8593 8594 /* Twenty percent of the time enable ZPOOL_PROP_DEDUP_TABLE_QUOTA */ 8595 if (ztest_random(5) == 0) { 8596 fnvlist_add_uint64(props, 8597 zpool_prop_to_name(ZPOOL_PROP_DEDUP_TABLE_QUOTA), 8598 2 * 1024 * 1024); 8599 } 8600 8601 /* Fifty percent of the time enable ZPOOL_PROP_AUTOREPLACE */ 8602 if (ztest_random(2) == 0) { 8603 fnvlist_add_uint64(props, 8604 zpool_prop_to_name(ZPOOL_PROP_AUTOREPLACE), 1); 8605 } 8606 8607 return (props); 8608 } 8609 8610 /* 8611 * Create a storage pool with the given name and initial vdev size. 8612 * Then test spa_freeze() functionality. 8613 */ 8614 static void 8615 ztest_init(ztest_shared_t *zs) 8616 { 8617 spa_t *spa; 8618 nvlist_t *nvroot, *props; 8619 int i; 8620 8621 mutex_init(&ztest_vdev_lock, NULL, MUTEX_DEFAULT, NULL); 8622 mutex_init(&ztest_checkpoint_lock, NULL, MUTEX_DEFAULT, NULL); 8623 VERIFY0(pthread_rwlock_init(&ztest_name_lock, NULL)); 8624 8625 raidz_scratch_verify(); 8626 kernel_init(SPA_MODE_READ | SPA_MODE_WRITE); 8627 8628 /* 8629 * Create the storage pool. 8630 */ 8631 (void) spa_destroy(ztest_opts.zo_pool); 8632 ztest_shared->zs_vdev_next_leaf = 0; 8633 zs->zs_splits = 0; 8634 zs->zs_mirrors = ztest_opts.zo_mirrors; 8635 nvroot = make_vdev_root(NULL, NULL, NULL, ztest_opts.zo_vdev_size, 0, 8636 NULL, ztest_opts.zo_raid_children, zs->zs_mirrors, 1); 8637 props = make_random_pool_props(); 8638 8639 /* 8640 * We don't expect the pool to suspend unless maxfaults == 0, 8641 * in which case ztest_fault_inject() temporarily takes away 8642 * the only valid replica. 8643 */ 8644 fnvlist_add_uint64(props, 8645 zpool_prop_to_name(ZPOOL_PROP_FAILUREMODE), 8646 MAXFAULTS(zs) ? ZIO_FAILURE_MODE_PANIC : ZIO_FAILURE_MODE_WAIT); 8647 8648 for (i = 0; i < SPA_FEATURES; i++) { 8649 char *buf; 8650 8651 if (!spa_feature_table[i].fi_zfs_mod_supported) 8652 continue; 8653 8654 /* 8655 * 75% chance of using the log space map feature. We want ztest 8656 * to exercise both the code paths that use the log space map 8657 * feature and the ones that don't. 
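 *
 * The arithmetic: ztest_random(4) == 0 skips this feature one pass in
 * four, so it ends up in the feature prop list (and therefore in use)
 * about 75% of the time. The fast dedup check below does the same with
 * ztest_random(2) for a 50/50 split.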
8658 */ 8659 if (i == SPA_FEATURE_LOG_SPACEMAP && ztest_random(4) == 0) 8660 continue; 8661 8662 /* 8663 * split 50/50 between legacy and fast dedup 8664 */ 8665 if (i == SPA_FEATURE_FAST_DEDUP && ztest_random(2) != 0) 8666 continue; 8667 8668 VERIFY3S(-1, !=, asprintf(&buf, "feature@%s", 8669 spa_feature_table[i].fi_uname)); 8670 fnvlist_add_uint64(props, buf, 0); 8671 free(buf); 8672 } 8673 8674 VERIFY0(spa_create(ztest_opts.zo_pool, nvroot, props, NULL, NULL)); 8675 fnvlist_free(nvroot); 8676 fnvlist_free(props); 8677 8678 VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG)); 8679 zs->zs_metaslab_sz = 8680 1ULL << spa->spa_root_vdev->vdev_child[0]->vdev_ms_shift; 8681 zs->zs_guid = spa_guid(spa); 8682 spa_close(spa, FTAG); 8683 8684 kernel_fini(); 8685 8686 if (!ztest_opts.zo_mmp_test) { 8687 ztest_run_zdb(zs->zs_guid); 8688 ztest_freeze(); 8689 ztest_run_zdb(zs->zs_guid); 8690 } 8691 8692 (void) pthread_rwlock_destroy(&ztest_name_lock); 8693 mutex_destroy(&ztest_vdev_lock); 8694 mutex_destroy(&ztest_checkpoint_lock); 8695 } 8696 8697 static void 8698 setup_data_fd(void) 8699 { 8700 static char ztest_name_data[] = "/tmp/ztest.data.XXXXXX"; 8701 8702 ztest_fd_data = mkstemp(ztest_name_data); 8703 ASSERT3S(ztest_fd_data, >=, 0); 8704 (void) unlink(ztest_name_data); 8705 } 8706 8707 static int 8708 shared_data_size(ztest_shared_hdr_t *hdr) 8709 { 8710 int size; 8711 8712 size = hdr->zh_hdr_size; 8713 size += hdr->zh_opts_size; 8714 size += hdr->zh_size; 8715 size += hdr->zh_stats_size * hdr->zh_stats_count; 8716 size += hdr->zh_ds_size * hdr->zh_ds_count; 8717 size += hdr->zh_scratch_state_size; 8718 8719 return (size); 8720 } 8721 8722 static void 8723 setup_hdr(void) 8724 { 8725 int size; 8726 ztest_shared_hdr_t *hdr; 8727 8728 hdr = (void *)mmap(0, P2ROUNDUP(sizeof (*hdr), getpagesize()), 8729 PROT_READ | PROT_WRITE, MAP_SHARED, ztest_fd_data, 0); 8730 ASSERT3P(hdr, !=, MAP_FAILED); 8731 8732 VERIFY0(ftruncate(ztest_fd_data, sizeof (ztest_shared_hdr_t))); 8733 8734 hdr->zh_hdr_size = sizeof (ztest_shared_hdr_t); 8735 hdr->zh_opts_size = sizeof (ztest_shared_opts_t); 8736 hdr->zh_size = sizeof (ztest_shared_t); 8737 hdr->zh_stats_size = sizeof (ztest_shared_callstate_t); 8738 hdr->zh_stats_count = ZTEST_FUNCS; 8739 hdr->zh_ds_size = sizeof (ztest_shared_ds_t); 8740 hdr->zh_ds_count = ztest_opts.zo_datasets; 8741 hdr->zh_scratch_state_size = sizeof (ztest_shared_scratch_state_t); 8742 8743 size = shared_data_size(hdr); 8744 VERIFY0(ftruncate(ztest_fd_data, size)); 8745 8746 (void) munmap((caddr_t)hdr, P2ROUNDUP(sizeof (*hdr), getpagesize())); 8747 } 8748 8749 static void 8750 setup_data(void) 8751 { 8752 int size, offset; 8753 ztest_shared_hdr_t *hdr; 8754 uint8_t *buf; 8755 8756 hdr = (void *)mmap(0, P2ROUNDUP(sizeof (*hdr), getpagesize()), 8757 PROT_READ, MAP_SHARED, ztest_fd_data, 0); 8758 ASSERT3P(hdr, !=, MAP_FAILED); 8759 8760 size = shared_data_size(hdr); 8761 8762 (void) munmap((caddr_t)hdr, P2ROUNDUP(sizeof (*hdr), getpagesize())); 8763 hdr = ztest_shared_hdr = (void *)mmap(0, P2ROUNDUP(size, getpagesize()), 8764 PROT_READ | PROT_WRITE, MAP_SHARED, ztest_fd_data, 0); 8765 ASSERT3P(hdr, !=, MAP_FAILED); 8766 buf = (uint8_t *)hdr; 8767 8768 offset = hdr->zh_hdr_size; 8769 ztest_shared_opts = (void *)&buf[offset]; 8770 offset += hdr->zh_opts_size; 8771 ztest_shared = (void *)&buf[offset]; 8772 offset += hdr->zh_size; 8773 ztest_shared_callstate = (void *)&buf[offset]; 8774 offset += hdr->zh_stats_size * hdr->zh_stats_count; 8775 ztest_shared_ds = (void *)&buf[offset]; 8776 offset += 
hdr->zh_ds_size * hdr->zh_ds_count; 8777 ztest_scratch_state = (void *)&buf[offset]; 8778 } 8779 8780 static boolean_t 8781 exec_child(char *cmd, char *libpath, boolean_t ignorekill, int *statusp) 8782 { 8783 pid_t pid; 8784 int status; 8785 char *cmdbuf = NULL; 8786 8787 pid = fork(); 8788 8789 if (cmd == NULL) { 8790 cmdbuf = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); 8791 (void) strlcpy(cmdbuf, getexecname(), MAXPATHLEN); 8792 cmd = cmdbuf; 8793 } 8794 8795 if (pid == -1) 8796 fatal(B_TRUE, "fork failed"); 8797 8798 if (pid == 0) { /* child */ 8799 char fd_data_str[12]; 8800 8801 VERIFY3S(11, >=, 8802 snprintf(fd_data_str, 12, "%d", ztest_fd_data)); 8803 VERIFY0(setenv("ZTEST_FD_DATA", fd_data_str, 1)); 8804 8805 if (libpath != NULL) { 8806 const char *curlp = getenv("LD_LIBRARY_PATH"); 8807 if (curlp == NULL) 8808 VERIFY0(setenv("LD_LIBRARY_PATH", libpath, 1)); 8809 else { 8810 char *newlp = NULL; 8811 VERIFY3S(-1, !=, 8812 asprintf(&newlp, "%s:%s", libpath, curlp)); 8813 VERIFY0(setenv("LD_LIBRARY_PATH", newlp, 1)); 8814 free(newlp); 8815 } 8816 } 8817 (void) execl(cmd, cmd, (char *)NULL); 8818 ztest_dump_core = B_FALSE; 8819 fatal(B_TRUE, "exec failed: %s", cmd); 8820 } 8821 8822 if (cmdbuf != NULL) { 8823 umem_free(cmdbuf, MAXPATHLEN); 8824 cmd = NULL; 8825 } 8826 8827 while (waitpid(pid, &status, 0) != pid) 8828 continue; 8829 if (statusp != NULL) 8830 *statusp = status; 8831 8832 if (WIFEXITED(status)) { 8833 if (WEXITSTATUS(status) != 0) { 8834 (void) fprintf(stderr, "child exited with code %d\n", 8835 WEXITSTATUS(status)); 8836 exit(2); 8837 } 8838 return (B_FALSE); 8839 } else if (WIFSIGNALED(status)) { 8840 if (!ignorekill || WTERMSIG(status) != SIGKILL) { 8841 (void) fprintf(stderr, "child died with signal %d\n", 8842 WTERMSIG(status)); 8843 exit(3); 8844 } 8845 return (B_TRUE); 8846 } else { 8847 (void) fprintf(stderr, "something strange happened to child\n"); 8848 exit(4); 8849 } 8850 } 8851 8852 static void 8853 ztest_run_init(void) 8854 { 8855 int i; 8856 8857 ztest_shared_t *zs = ztest_shared; 8858 8859 /* 8860 * Blow away any existing copy of zpool.cache 8861 */ 8862 (void) remove(spa_config_path); 8863 8864 if (ztest_opts.zo_init == 0) { 8865 if (ztest_opts.zo_verbose >= 1) 8866 (void) printf("Importing pool %s\n", 8867 ztest_opts.zo_pool); 8868 ztest_import(zs); 8869 return; 8870 } 8871 8872 /* 8873 * Create and initialize our storage pool. 8874 */ 8875 for (i = 1; i <= ztest_opts.zo_init; i++) { 8876 memset(zs, 0, sizeof (*zs)); 8877 if (ztest_opts.zo_verbose >= 3 && 8878 ztest_opts.zo_init != 1) { 8879 (void) printf("ztest_init(), pass %d\n", i); 8880 } 8881 ztest_init(zs); 8882 } 8883 } 8884 8885 int 8886 main(int argc, char **argv) 8887 { 8888 int kills = 0; 8889 int iters = 0; 8890 int older = 0; 8891 int newer = 0; 8892 ztest_shared_t *zs; 8893 ztest_info_t *zi; 8894 ztest_shared_callstate_t *zc; 8895 char timebuf[100]; 8896 char numbuf[NN_NUMBUF_SZ]; 8897 char *cmd; 8898 boolean_t hasalt; 8899 int f, err; 8900 char *fd_data_str = getenv("ZTEST_FD_DATA"); 8901 struct sigaction action; 8902 8903 (void) setvbuf(stdout, NULL, _IOLBF, 0); 8904 8905 dprintf_setup(&argc, argv); 8906 zfs_deadman_synctime_ms = 300000; 8907 zfs_deadman_checktime_ms = 30000; 8908 /* 8909 * As two-word space map entries may not come up often (especially 8910 * if pool and vdev sizes are small) we want to force at least some 8911 * of them so the feature get tested. 
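 * (Two-word entries are normally needed only when an offset or run
 * length is too large for the compact one-word format, something the
 * small file vdevs used here rarely produce on their own.)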
8912 */ 8913 zfs_force_some_double_word_sm_entries = B_TRUE; 8914 8915 /* 8916 * Verify that even extensively damaged split blocks with many 8917 * segments can be reconstructed in a reasonable amount of time 8918 * when reconstruction is known to be possible. 8919 * 8920 * Note: the lower this value is, the more damage we inflict, and 8921 * the more time ztest spends in recovering that damage. We chose 8922 * to induce damage 1/100th of the time so recovery is tested but 8923 * not so frequently that ztest doesn't get to test other code paths. 8924 */ 8925 zfs_reconstruct_indirect_damage_fraction = 100; 8926 8927 action.sa_handler = sig_handler; 8928 sigemptyset(&action.sa_mask); 8929 action.sa_flags = 0; 8930 8931 if (sigaction(SIGSEGV, &action, NULL) < 0) { 8932 (void) fprintf(stderr, "ztest: cannot catch SIGSEGV: %s.\n", 8933 strerror(errno)); 8934 exit(EXIT_FAILURE); 8935 } 8936 8937 if (sigaction(SIGABRT, &action, NULL) < 0) { 8938 (void) fprintf(stderr, "ztest: cannot catch SIGABRT: %s.\n", 8939 strerror(errno)); 8940 exit(EXIT_FAILURE); 8941 } 8942 8943 /* 8944 * Force random_get_bytes() to use /dev/urandom in order to prevent 8945 * ztest from needlessly depleting the system entropy pool. 8946 */ 8947 random_path = "/dev/urandom"; 8948 ztest_fd_rand = open(random_path, O_RDONLY | O_CLOEXEC); 8949 ASSERT3S(ztest_fd_rand, >=, 0); 8950 8951 if (!fd_data_str) { 8952 process_options(argc, argv); 8953 8954 setup_data_fd(); 8955 setup_hdr(); 8956 setup_data(); 8957 memcpy(ztest_shared_opts, &ztest_opts, 8958 sizeof (*ztest_shared_opts)); 8959 } else { 8960 ztest_fd_data = atoi(fd_data_str); 8961 setup_data(); 8962 memcpy(&ztest_opts, ztest_shared_opts, sizeof (ztest_opts)); 8963 } 8964 ASSERT3U(ztest_opts.zo_datasets, ==, ztest_shared_hdr->zh_ds_count); 8965 8966 err = ztest_set_global_vars(); 8967 if (err != 0 && !fd_data_str) { 8968 /* error message done by ztest_set_global_vars */ 8969 exit(EXIT_FAILURE); 8970 } else { 8971 /* children should not be spawned if setting gvars fails */ 8972 VERIFY3S(err, ==, 0); 8973 } 8974 8975 /* Override location of zpool.cache */ 8976 VERIFY3S(asprintf((char **)&spa_config_path, "%s/zpool.cache", 8977 ztest_opts.zo_dir), !=, -1); 8978 8979 ztest_ds = umem_alloc(ztest_opts.zo_datasets * sizeof (ztest_ds_t), 8980 UMEM_NOFAIL); 8981 zs = ztest_shared; 8982 8983 if (fd_data_str) { 8984 metaslab_force_ganging = ztest_opts.zo_metaslab_force_ganging; 8985 metaslab_df_alloc_threshold = 8986 zs->zs_metaslab_df_alloc_threshold; 8987 8988 if (zs->zs_do_init) 8989 ztest_run_init(); 8990 else 8991 ztest_run(zs); 8992 exit(0); 8993 } 8994 8995 hasalt = (strlen(ztest_opts.zo_alt_ztest) != 0); 8996 8997 if (ztest_opts.zo_verbose >= 1) { 8998 (void) printf("%"PRIu64" vdevs, %d datasets, %d threads, " 8999 "%d %s disks, parity %d, %"PRIu64" seconds...\n\n", 9000 ztest_opts.zo_vdevs, 9001 ztest_opts.zo_datasets, 9002 ztest_opts.zo_threads, 9003 ztest_opts.zo_raid_children, 9004 ztest_opts.zo_raid_type, 9005 ztest_opts.zo_raid_parity, 9006 ztest_opts.zo_time); 9007 } 9008 9009 cmd = umem_alloc(MAXNAMELEN, UMEM_NOFAIL); 9010 (void) strlcpy(cmd, getexecname(), MAXNAMELEN); 9011 9012 zs->zs_do_init = B_TRUE; 9013 if (strlen(ztest_opts.zo_alt_ztest) != 0) { 9014 if (ztest_opts.zo_verbose >= 1) { 9015 (void) printf("Executing older ztest for " 9016 "initialization: %s\n", ztest_opts.zo_alt_ztest); 9017 } 9018 VERIFY(!exec_child(ztest_opts.zo_alt_ztest, 9019 ztest_opts.zo_alt_libpath, B_FALSE, NULL)); 9020 } else { 9021 VERIFY(!exec_child(NULL, NULL, B_FALSE, NULL)); 9022 } 
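
	/*
	 * The child we just ran saw zs_do_init set and performed
	 * ztest_run_init() (pool creation or import); clear the flag so
	 * every child spawned by the loop below runs ztest_run() instead.
	 */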
9023 zs->zs_do_init = B_FALSE; 9024 9025 zs->zs_proc_start = gethrtime(); 9026 zs->zs_proc_stop = zs->zs_proc_start + ztest_opts.zo_time * NANOSEC; 9027 9028 for (f = 0; f < ZTEST_FUNCS; f++) { 9029 zi = &ztest_info[f]; 9030 zc = ZTEST_GET_SHARED_CALLSTATE(f); 9031 if (zs->zs_proc_start + zi->zi_interval[0] > zs->zs_proc_stop) 9032 zc->zc_next = UINT64_MAX; 9033 else 9034 zc->zc_next = zs->zs_proc_start + 9035 ztest_random(2 * zi->zi_interval[0] + 1); 9036 } 9037 9038 /* 9039 * Run the tests in a loop. These tests include fault injection 9040 * to verify that self-healing data works, and forced crashes 9041 * to verify that we never lose on-disk consistency. 9042 */ 9043 while (gethrtime() < zs->zs_proc_stop) { 9044 int status; 9045 boolean_t killed; 9046 9047 /* 9048 * Initialize the workload counters for each function. 9049 */ 9050 for (f = 0; f < ZTEST_FUNCS; f++) { 9051 zc = ZTEST_GET_SHARED_CALLSTATE(f); 9052 zc->zc_count = 0; 9053 zc->zc_time = 0; 9054 } 9055 9056 /* Set the allocation switch size */ 9057 zs->zs_metaslab_df_alloc_threshold = 9058 ztest_random(zs->zs_metaslab_sz / 4) + 1; 9059 9060 if (!hasalt || ztest_random(2) == 0) { 9061 if (hasalt && ztest_opts.zo_verbose >= 1) { 9062 (void) printf("Executing newer ztest: %s\n", 9063 cmd); 9064 } 9065 newer++; 9066 killed = exec_child(cmd, NULL, B_TRUE, &status); 9067 } else { 9068 if (hasalt && ztest_opts.zo_verbose >= 1) { 9069 (void) printf("Executing older ztest: %s\n", 9070 ztest_opts.zo_alt_ztest); 9071 } 9072 older++; 9073 killed = exec_child(ztest_opts.zo_alt_ztest, 9074 ztest_opts.zo_alt_libpath, B_TRUE, &status); 9075 } 9076 9077 if (killed) 9078 kills++; 9079 iters++; 9080 9081 if (ztest_opts.zo_verbose >= 1) { 9082 hrtime_t now = gethrtime(); 9083 9084 now = MIN(now, zs->zs_proc_stop); 9085 print_time(zs->zs_proc_stop - now, timebuf); 9086 nicenum(zs->zs_space, numbuf, sizeof (numbuf)); 9087 9088 (void) printf("Pass %3d, %8s, %3"PRIu64" ENOSPC, " 9089 "%4.1f%% of %5s used, %3.0f%% done, %8s to go\n", 9090 iters, 9091 WIFEXITED(status) ? "Complete" : "SIGKILL", 9092 zs->zs_enospc_count, 9093 100.0 * zs->zs_alloc / zs->zs_space, 9094 numbuf, 9095 100.0 * (now - zs->zs_proc_start) / 9096 (ztest_opts.zo_time * NANOSEC), timebuf); 9097 } 9098 9099 if (ztest_opts.zo_verbose >= 2) { 9100 (void) printf("\nWorkload summary:\n\n"); 9101 (void) printf("%7s %9s %s\n", 9102 "Calls", "Time", "Function"); 9103 (void) printf("%7s %9s %s\n", 9104 "-----", "----", "--------"); 9105 for (f = 0; f < ZTEST_FUNCS; f++) { 9106 zi = &ztest_info[f]; 9107 zc = ZTEST_GET_SHARED_CALLSTATE(f); 9108 print_time(zc->zc_time, timebuf); 9109 (void) printf("%7"PRIu64" %9s %s\n", 9110 zc->zc_count, timebuf, 9111 zi->zi_funcname); 9112 } 9113 (void) printf("\n"); 9114 } 9115 9116 if (!ztest_opts.zo_mmp_test) 9117 ztest_run_zdb(zs->zs_guid); 9118 if (ztest_shared_opts->zo_raidz_expand_test == 9119 RAIDZ_EXPAND_CHECKED) 9120 break; /* raidz expand test complete */ 9121 } 9122 9123 if (ztest_opts.zo_verbose >= 1) { 9124 if (hasalt) { 9125 (void) printf("%d runs of older ztest: %s\n", older, 9126 ztest_opts.zo_alt_ztest); 9127 (void) printf("%d runs of newer ztest: %s\n", newer, 9128 cmd); 9129 } 9130 (void) printf("%d killed, %d completed, %.0f%% kill rate\n", 9131 kills, iters - kills, (100.0 * kills) / MAX(1, iters)); 9132 } 9133 9134 umem_free(cmd, MAXNAMELEN); 9135 9136 return (0); 9137 } 9138