/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
 * Copyright (c) 2013 Steven Hartland. All rights reserved.
 * Copyright (c) 2014 Integros [integros.com]
 * Copyright 2017 Joyent, Inc.
 * Copyright (c) 2017, Intel Corporation.
 */

/*
 * The objective of this program is to provide a DMU/ZAP/SPA stress test
 * that runs entirely in userland, is easy to use, and easy to extend.
 *
 * The overall design of the ztest program is as follows:
 *
 * (1) For each major functional area (e.g. adding vdevs to a pool,
 *     creating and destroying datasets, reading and writing objects, etc)
 *     we have a simple routine to test that functionality.  These
 *     individual routines do not have to do anything "stressful".
 *
 * (2) We turn these simple functionality tests into a stress test by
 *     running them all in parallel, with as many threads as desired,
 *     and spread across as many datasets, objects, and vdevs as desired.
 *
 * (3) While all this is happening, we inject faults into the pool to
 *     verify that self-healing data really works.
 *
 * (4) Every time we open a dataset, we change its checksum and compression
 *     functions.  Thus even individual objects vary from block to block
 *     in which checksum they use and whether they're compressed.
 *
 * (5) To verify that we never lose on-disk consistency after a crash,
 *     we run the entire test in a child of the main process.
 *     At random times, the child self-immolates with a SIGKILL.
 *     This is the software equivalent of pulling the power cord.
 *     The parent then runs the test again, using the existing
 *     storage pool, as many times as desired. If backwards compatibility
 *     testing is enabled ztest will sometimes run the "older" version
 *     of ztest after a SIGKILL.
 *
 * (6) To verify that we don't have future leaks or temporal incursions,
 *     many of the functional tests record the transaction group number
 *     as part of their data.  When reading old data, they verify that
 *     the transaction group number is less than the current, open txg.
 *     If you add a new test, please do this if applicable.
 *
 * (7) Threads are created with a reduced stack size, for sanity checking.
 *     Therefore, it's important not to allocate huge buffers on the stack.
 *
 * When run with no arguments, ztest runs for about five minutes and
 * produces no output if successful.  To get a little bit of information,
 * specify -V.  To get more information, specify -VV, and so on.
 *
 * To turn this into an overnight stress test, use -T to specify run time.
 *
 * You can ask for more vdevs [-v], datasets [-d], or threads [-t]
 * to increase the pool capacity, fanout, and overall stress level.
 *
 * Use the -k option to set the desired frequency of kills.
 *
 * When ztest invokes itself it passes all relevant information through a
 * temporary file which is mmap-ed in the child process. This allows shared
 * memory to survive the exec syscall. The ztest_shared_hdr_t struct is always
 * stored at offset 0 of this file and contains information on the size and
 * number of shared structures in the file. The information stored in this file
 * must remain backwards compatible with older versions of ztest so that
 * ztest can invoke them during backwards compatibility testing (-B).
 */

#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/dmu.h>
#include <sys/txg.h>
#include <sys/dbuf.h>
#include <sys/zap.h>
#include <sys/dmu_objset.h>
#include <sys/poll.h>
#include <sys/stat.h>
#include <sys/time.h>
#include <sys/wait.h>
#include <sys/mman.h>
#include <sys/resource.h>
#include <sys/zio.h>
#include <sys/zil.h>
#include <sys/zil_impl.h>
#include <sys/vdev_draid.h>
#include <sys/vdev_impl.h>
#include <sys/vdev_file.h>
#include <sys/vdev_initialize.h>
#include <sys/vdev_raidz.h>
#include <sys/vdev_trim.h>
#include <sys/spa_impl.h>
#include <sys/metaslab_impl.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_destroy.h>
#include <sys/dsl_scan.h>
#include <sys/zio_checksum.h>
#include <sys/zfs_refcount.h>
#include <sys/zfeature.h>
#include <sys/dsl_userhold.h>
#include <sys/abd.h>
#include <sys/blake3.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <getopt.h>
#include <signal.h>
#include <umem.h>
#include <ctype.h>
#include <math.h>
#include <sys/fs/zfs.h>
#include <zfs_fletcher.h>
#include <libnvpair.h>
#include <libzutil.h>
#include <sys/crypto/icp.h>
#if (__GLIBC__ && !__UCLIBC__)
#include <execinfo.h>	/* for backtrace() */
#endif

static int ztest_fd_data = -1;
static int ztest_fd_rand = -1;

typedef struct ztest_shared_hdr {
	uint64_t	zh_hdr_size;
	uint64_t	zh_opts_size;
	uint64_t	zh_size;
	uint64_t	zh_stats_size;
	uint64_t	zh_stats_count;
	uint64_t	zh_ds_size;
	uint64_t	zh_ds_count;
} ztest_shared_hdr_t;

static ztest_shared_hdr_t *ztest_shared_hdr;

enum ztest_class_state {
	ZTEST_VDEV_CLASS_OFF,
	ZTEST_VDEV_CLASS_ON,
	ZTEST_VDEV_CLASS_RND
};

#define	ZO_GVARS_MAX_ARGLEN	((size_t)64)
#define	ZO_GVARS_MAX_COUNT	((size_t)10)

typedef struct ztest_shared_opts {
	char zo_pool[ZFS_MAX_DATASET_NAME_LEN];
	char zo_dir[ZFS_MAX_DATASET_NAME_LEN];
	char zo_alt_ztest[MAXNAMELEN];
	char zo_alt_libpath[MAXNAMELEN];
	uint64_t zo_vdevs;
	uint64_t zo_vdevtime;
	size_t zo_vdev_size;
	int zo_ashift;
	int zo_mirrors;
	int zo_raid_children;
	int zo_raid_parity;
	char zo_raid_type[8];
	int zo_draid_data;
	int zo_draid_spares;
	int zo_datasets;
	int zo_threads;
	uint64_t zo_passtime;
	uint64_t zo_killrate;
	int zo_verbose;
	int zo_init;
	uint64_t zo_time;
	uint64_t zo_maxloops;
	uint64_t zo_metaslab_force_ganging;
	int zo_mmp_test;
	int zo_special_vdevs;
	int zo_dump_dbgmsg;
	int zo_gvars_count;
	char zo_gvars[ZO_GVARS_MAX_COUNT][ZO_GVARS_MAX_ARGLEN];
} ztest_shared_opts_t;

/* Default values for command line options. */
#define	DEFAULT_POOL "ztest"
#define	DEFAULT_VDEV_DIR "/tmp"
#define	DEFAULT_VDEV_COUNT 5
#define	DEFAULT_VDEV_SIZE (SPA_MINDEVSIZE * 4)	/* 256m default size */
#define	DEFAULT_VDEV_SIZE_STR "256M"
#define	DEFAULT_ASHIFT SPA_MINBLOCKSHIFT
#define	DEFAULT_MIRRORS 2
#define	DEFAULT_RAID_CHILDREN 4
#define	DEFAULT_RAID_PARITY 1
#define	DEFAULT_DRAID_DATA 4
#define	DEFAULT_DRAID_SPARES 1
#define	DEFAULT_DATASETS_COUNT 7
#define	DEFAULT_THREADS 23
#define	DEFAULT_RUN_TIME 300 /* 300 seconds */
#define	DEFAULT_RUN_TIME_STR "300 sec"
#define	DEFAULT_PASS_TIME 60 /* 60 seconds */
#define	DEFAULT_PASS_TIME_STR "60 sec"
#define	DEFAULT_KILL_RATE 70 /* 70% kill rate */
#define	DEFAULT_KILLRATE_STR "70%"
#define	DEFAULT_INITS 1
#define	DEFAULT_MAX_LOOPS 50 /* 5 minutes */
#define	DEFAULT_FORCE_GANGING (64 << 10)
#define	DEFAULT_FORCE_GANGING_STR "64K"

/* Simplifying assumption: -1 is not a valid default. */
#define	NO_DEFAULT -1

static const ztest_shared_opts_t ztest_opts_defaults = {
	.zo_pool = DEFAULT_POOL,
	.zo_dir = DEFAULT_VDEV_DIR,
	.zo_alt_ztest = { '\0' },
	.zo_alt_libpath = { '\0' },
	.zo_vdevs = DEFAULT_VDEV_COUNT,
	.zo_ashift = DEFAULT_ASHIFT,
	.zo_mirrors = DEFAULT_MIRRORS,
	.zo_raid_children = DEFAULT_RAID_CHILDREN,
	.zo_raid_parity = DEFAULT_RAID_PARITY,
	.zo_raid_type = VDEV_TYPE_RAIDZ,
	.zo_vdev_size = DEFAULT_VDEV_SIZE,
	.zo_draid_data = DEFAULT_DRAID_DATA,	/* data drives */
	.zo_draid_spares = DEFAULT_DRAID_SPARES,	/* distributed spares */
	.zo_datasets = DEFAULT_DATASETS_COUNT,
	.zo_threads = DEFAULT_THREADS,
	.zo_passtime = DEFAULT_PASS_TIME,
	.zo_killrate = DEFAULT_KILL_RATE,
	.zo_verbose = 0,
	.zo_mmp_test = 0,
	.zo_init = DEFAULT_INITS,
	.zo_time = DEFAULT_RUN_TIME,
	.zo_maxloops = DEFAULT_MAX_LOOPS, /* max loops during spa_freeze() */
	.zo_metaslab_force_ganging = DEFAULT_FORCE_GANGING,
	.zo_special_vdevs = ZTEST_VDEV_CLASS_RND,
	.zo_gvars_count = 0,
};

extern uint64_t metaslab_force_ganging;
extern uint64_t metaslab_df_alloc_threshold;
extern uint64_t zfs_deadman_synctime_ms;
extern uint_t metaslab_preload_limit;
extern int zfs_compressed_arc_enabled;
extern int zfs_abd_scatter_enabled;
extern uint_t dmu_object_alloc_chunk_shift;
extern boolean_t zfs_force_some_double_word_sm_entries;
extern unsigned long zio_decompress_fail_fraction;
extern unsigned long zfs_reconstruct_indirect_damage_fraction;

static ztest_shared_opts_t *ztest_shared_opts;
static ztest_shared_opts_t ztest_opts;
static const char *const ztest_wkeydata = "abcdefghijklmnopqrstuvwxyz012345";

typedef struct ztest_shared_ds {
	uint64_t	zd_seq;
} ztest_shared_ds_t;

static ztest_shared_ds_t *ztest_shared_ds;
#define	ZTEST_GET_SHARED_DS(d) (&ztest_shared_ds[d])

#define	BT_MAGIC	0x123456789abcdefULL
#define	MAXFAULTS(zs) \
	(MAX((zs)->zs_mirrors, 1) * (ztest_opts.zo_raid_parity + 1) - 1)

enum ztest_io_type {
	ZTEST_IO_WRITE_TAG,
	ZTEST_IO_WRITE_PATTERN,
	ZTEST_IO_WRITE_ZEROES,
	ZTEST_IO_TRUNCATE,
	ZTEST_IO_SETATTR,
	ZTEST_IO_REWRITE,
	ZTEST_IO_TYPES
};

typedef struct ztest_block_tag {
	uint64_t	bt_magic;
	uint64_t	bt_objset;
	uint64_t	bt_object;
	uint64_t	bt_dnodesize;
	uint64_t	bt_offset;
	uint64_t	bt_gen;
	uint64_t	bt_txg;
	uint64_t	bt_crtxg;
} ztest_block_tag_t;

typedef struct bufwad {
	uint64_t	bw_index;
	uint64_t	bw_txg;
	uint64_t	bw_data;
} bufwad_t;

/*
 * It would be better to use a rangelock_t per object.  Unfortunately
 * the rangelock_t is not a drop-in replacement for rl_t, because we
 * still need to map from object ID to rangelock_t.
 */
typedef enum {
	RL_READER,
	RL_WRITER,
	RL_APPEND
} rl_type_t;

typedef struct rll {
	void		*rll_writer;
	int		rll_readers;
	kmutex_t	rll_lock;
	kcondvar_t	rll_cv;
} rll_t;

typedef struct rl {
	uint64_t	rl_object;
	uint64_t	rl_offset;
	uint64_t	rl_size;
	rll_t		*rl_lock;
} rl_t;

#define	ZTEST_RANGE_LOCKS	64
#define	ZTEST_OBJECT_LOCKS	64

/*
 * Object descriptor.  Used as a template for object lookup/create/remove.
 */
typedef struct ztest_od {
	uint64_t	od_dir;
	uint64_t	od_object;
	dmu_object_type_t od_type;
	dmu_object_type_t od_crtype;
	uint64_t	od_blocksize;
	uint64_t	od_crblocksize;
	uint64_t	od_crdnodesize;
	uint64_t	od_gen;
	uint64_t	od_crgen;
	char		od_name[ZFS_MAX_DATASET_NAME_LEN];
} ztest_od_t;

/*
 * Per-dataset state.
 */
typedef struct ztest_ds {
	ztest_shared_ds_t *zd_shared;
	objset_t	*zd_os;
	pthread_rwlock_t zd_zilog_lock;
	zilog_t		*zd_zilog;
	ztest_od_t	*zd_od;		/* debugging aid */
	char		zd_name[ZFS_MAX_DATASET_NAME_LEN];
	kmutex_t	zd_dirobj_lock;
	rll_t		zd_object_lock[ZTEST_OBJECT_LOCKS];
	rll_t		zd_range_lock[ZTEST_RANGE_LOCKS];
} ztest_ds_t;

/*
 * Per-iteration state.
 */
typedef void ztest_func_t(ztest_ds_t *zd, uint64_t id);

typedef struct ztest_info {
	ztest_func_t	*zi_func;	/* test function */
	uint64_t	zi_iters;	/* iterations per execution */
	uint64_t	*zi_interval;	/* execute every <interval> seconds */
	const char	*zi_funcname;	/* name of test function */
} ztest_info_t;

typedef struct ztest_shared_callstate {
	uint64_t	zc_count;	/* per-pass count */
	uint64_t	zc_time;	/* per-pass time */
	uint64_t	zc_next;	/* next time to call this function */
} ztest_shared_callstate_t;

static ztest_shared_callstate_t *ztest_shared_callstate;
#define	ZTEST_GET_SHARED_CALLSTATE(c) (&ztest_shared_callstate[c])

ztest_func_t ztest_dmu_read_write;
ztest_func_t ztest_dmu_write_parallel;
ztest_func_t ztest_dmu_object_alloc_free;
ztest_func_t ztest_dmu_object_next_chunk;
ztest_func_t ztest_dmu_commit_callbacks;
ztest_func_t ztest_zap;
ztest_func_t ztest_zap_parallel;
ztest_func_t ztest_zil_commit;
ztest_func_t ztest_zil_remount;
ztest_func_t ztest_dmu_read_write_zcopy;
ztest_func_t ztest_dmu_objset_create_destroy;
ztest_func_t ztest_dmu_prealloc;
ztest_func_t ztest_fzap;
ztest_func_t ztest_dmu_snapshot_create_destroy;
ztest_func_t ztest_dsl_prop_get_set;
ztest_func_t ztest_spa_prop_get_set;
ztest_func_t ztest_spa_create_destroy;
ztest_func_t ztest_fault_inject;
ztest_func_t ztest_dmu_snapshot_hold;
ztest_func_t ztest_mmp_enable_disable;
ztest_func_t ztest_scrub;
ztest_func_t ztest_dsl_dataset_promote_busy;
ztest_func_t ztest_vdev_attach_detach;
ztest_func_t ztest_vdev_LUN_growth;
ztest_func_t ztest_vdev_add_remove;
ztest_func_t ztest_vdev_class_add;
ztest_func_t ztest_vdev_aux_add_remove;
ztest_func_t ztest_split_pool;
ztest_func_t ztest_reguid;
ztest_func_t ztest_spa_upgrade;
ztest_func_t ztest_device_removal;
ztest_func_t ztest_spa_checkpoint_create_discard;
ztest_func_t ztest_initialize;
ztest_func_t ztest_trim;
ztest_func_t ztest_blake3;
ztest_func_t ztest_fletcher;
ztest_func_t ztest_fletcher_incr;
ztest_func_t ztest_verify_dnode_bt;

static uint64_t zopt_always = 0ULL * NANOSEC;		/* all the time */
static uint64_t zopt_incessant = 1ULL * NANOSEC / 10;	/* every 1/10 second */
static uint64_t zopt_often = 1ULL * NANOSEC;		/* every second */
static uint64_t zopt_sometimes = 10ULL * NANOSEC;	/* every 10 seconds */
static uint64_t zopt_rarely = 60ULL * NANOSEC;		/* every 60 seconds */

#define	ZTI_INIT(func, iters, interval) \
	{ .zi_func = (func), \
	  .zi_iters = (iters), \
	  .zi_interval = (interval), \
	  .zi_funcname = # func }

static ztest_info_t ztest_info[] = {
	ZTI_INIT(ztest_dmu_read_write, 1, &zopt_always),
	ZTI_INIT(ztest_dmu_write_parallel, 10, &zopt_always),
	ZTI_INIT(ztest_dmu_object_alloc_free, 1, &zopt_always),
	ZTI_INIT(ztest_dmu_object_next_chunk, 1, &zopt_sometimes),
	ZTI_INIT(ztest_dmu_commit_callbacks, 1, &zopt_always),
	ZTI_INIT(ztest_zap, 30, &zopt_always),
	ZTI_INIT(ztest_zap_parallel, 100, &zopt_always),
	ZTI_INIT(ztest_split_pool, 1, &zopt_always),
	ZTI_INIT(ztest_zil_commit, 1, &zopt_incessant),
	ZTI_INIT(ztest_zil_remount, 1, &zopt_sometimes),
	ZTI_INIT(ztest_dmu_read_write_zcopy, 1, &zopt_often),
	ZTI_INIT(ztest_dmu_objset_create_destroy, 1, &zopt_often),
	ZTI_INIT(ztest_dsl_prop_get_set, 1, &zopt_often),
	ZTI_INIT(ztest_spa_prop_get_set,
	    1, &zopt_sometimes),
#if 0
	ZTI_INIT(ztest_dmu_prealloc, 1, &zopt_sometimes),
#endif
	ZTI_INIT(ztest_fzap, 1, &zopt_sometimes),
	ZTI_INIT(ztest_dmu_snapshot_create_destroy, 1, &zopt_sometimes),
	ZTI_INIT(ztest_spa_create_destroy, 1, &zopt_sometimes),
	ZTI_INIT(ztest_fault_inject, 1, &zopt_sometimes),
	ZTI_INIT(ztest_dmu_snapshot_hold, 1, &zopt_sometimes),
	ZTI_INIT(ztest_mmp_enable_disable, 1, &zopt_sometimes),
	ZTI_INIT(ztest_reguid, 1, &zopt_rarely),
	ZTI_INIT(ztest_scrub, 1, &zopt_rarely),
	ZTI_INIT(ztest_spa_upgrade, 1, &zopt_rarely),
	ZTI_INIT(ztest_dsl_dataset_promote_busy, 1, &zopt_rarely),
	ZTI_INIT(ztest_vdev_attach_detach, 1, &zopt_sometimes),
	ZTI_INIT(ztest_vdev_LUN_growth, 1, &zopt_rarely),
	ZTI_INIT(ztest_vdev_add_remove, 1, &ztest_opts.zo_vdevtime),
	ZTI_INIT(ztest_vdev_class_add, 1, &ztest_opts.zo_vdevtime),
	ZTI_INIT(ztest_vdev_aux_add_remove, 1, &ztest_opts.zo_vdevtime),
	ZTI_INIT(ztest_device_removal, 1, &zopt_sometimes),
	ZTI_INIT(ztest_spa_checkpoint_create_discard, 1, &zopt_rarely),
	ZTI_INIT(ztest_initialize, 1, &zopt_sometimes),
	ZTI_INIT(ztest_trim, 1, &zopt_sometimes),
	ZTI_INIT(ztest_blake3, 1, &zopt_rarely),
	ZTI_INIT(ztest_fletcher, 1, &zopt_rarely),
	ZTI_INIT(ztest_fletcher_incr, 1, &zopt_rarely),
	ZTI_INIT(ztest_verify_dnode_bt, 1, &zopt_sometimes),
};

#define	ZTEST_FUNCS	(sizeof (ztest_info) / sizeof (ztest_info_t))

/*
 * The following struct is used to hold a list of uncalled commit callbacks.
 * The callbacks are ordered by txg number.
 */
typedef struct ztest_cb_list {
	kmutex_t	zcl_callbacks_lock;
	list_t		zcl_callbacks;
} ztest_cb_list_t;

/*
 * Stuff we need to share writably between parent and child.
 */
typedef struct ztest_shared {
	boolean_t	zs_do_init;
	hrtime_t	zs_proc_start;
	hrtime_t	zs_proc_stop;
	hrtime_t	zs_thread_start;
	hrtime_t	zs_thread_stop;
	hrtime_t	zs_thread_kill;
	uint64_t	zs_enospc_count;
	uint64_t	zs_vdev_next_leaf;
	uint64_t	zs_vdev_aux;
	uint64_t	zs_alloc;
	uint64_t	zs_space;
	uint64_t	zs_splits;
	uint64_t	zs_mirrors;
	uint64_t	zs_metaslab_sz;
	uint64_t	zs_metaslab_df_alloc_threshold;
	uint64_t	zs_guid;
} ztest_shared_t;

#define	ID_PARALLEL	-1ULL

static char ztest_dev_template[] = "%s/%s.%llua";
static char ztest_aux_template[] = "%s/%s.%s.%llu";
static ztest_shared_t *ztest_shared;

static spa_t *ztest_spa = NULL;
static ztest_ds_t *ztest_ds;

static kmutex_t ztest_vdev_lock;
static boolean_t ztest_device_removal_active = B_FALSE;
static boolean_t ztest_pool_scrubbed = B_FALSE;
static kmutex_t ztest_checkpoint_lock;

/*
 * The ztest_name_lock protects the pool and dataset namespace used by
 * the individual tests. To modify the namespace, consumers must grab
 * this lock as writer. Grabbing the lock as reader will ensure that the
 * namespace does not change while the lock is held.
 */
static pthread_rwlock_t ztest_name_lock;

static boolean_t ztest_dump_core = B_TRUE;
static boolean_t ztest_exiting;

/* Global commit callback list */
static ztest_cb_list_t zcl;
/* Commit cb delay */
static uint64_t zc_min_txg_delay = UINT64_MAX;
static int zc_cb_counter = 0;

/*
 * Minimum number of commit callbacks that need to be registered for us to
 * check whether the minimum txg delay is acceptable.
 */
#define	ZTEST_COMMIT_CB_MIN_REG	100

/*
 * If a number of txgs equal to this threshold have been created after a commit
 * callback has been registered but not called, then we assume there is an
 * implementation bug.
 */
#define	ZTEST_COMMIT_CB_THRESH	(TXG_CONCURRENT_STATES + 1000)

enum ztest_object {
	ZTEST_META_DNODE = 0,
	ZTEST_DIROBJ,
	ZTEST_OBJECTS
};

static __attribute__((noreturn)) void usage(boolean_t requested);
static int ztest_scrub_impl(spa_t *spa);

/*
 * These libumem hooks provide a reasonable set of defaults for the allocator's
 * debugging facilities.
 */
const char *
_umem_debug_init(void)
{
	return ("default,verbose"); /* $UMEM_DEBUG setting */
}

const char *
_umem_logging_init(void)
{
	return ("fail,contents"); /* $UMEM_LOGGING setting */
}

static void
dump_debug_buffer(void)
{
	ssize_t ret __attribute__((unused));

	if (!ztest_opts.zo_dump_dbgmsg)
		return;

	/*
	 * We use write() instead of printf() so that this function
	 * is safe to call from a signal handler.
	 */
	ret = write(STDOUT_FILENO, "\n", 1);
	zfs_dbgmsg_print("ztest");
}

#define	BACKTRACE_SZ	100

static void sig_handler(int signo)
{
	struct sigaction action;
#if (__GLIBC__ && !__UCLIBC__) /* backtrace() is a GNU extension */
	int nptrs;
	void *buffer[BACKTRACE_SZ];

	nptrs = backtrace(buffer, BACKTRACE_SZ);
	backtrace_symbols_fd(buffer, nptrs, STDERR_FILENO);
#endif
	dump_debug_buffer();

	/*
	 * Restore default action and re-raise signal so SIGSEGV and
	 * SIGABRT can trigger a core dump.
	 */
	action.sa_handler = SIG_DFL;
	sigemptyset(&action.sa_mask);
	action.sa_flags = 0;
	(void) sigaction(signo, &action, NULL);
	raise(signo);
}

#define	FATAL_MSG_SZ	1024

static const char *fatal_msg;

static __attribute__((format(printf, 2, 3))) __attribute__((noreturn)) void
fatal(int do_perror, const char *message, ...)
{
	va_list args;
	int save_errno = errno;
	char *buf;

	(void) fflush(stdout);
	buf = umem_alloc(FATAL_MSG_SZ, UMEM_NOFAIL);
	if (buf == NULL)
		goto out;

	va_start(args, message);
	(void) sprintf(buf, "ztest: ");
	/* LINTED */
	(void) vsprintf(buf + strlen(buf), message, args);
	va_end(args);
	if (do_perror) {
		(void) snprintf(buf + strlen(buf), FATAL_MSG_SZ - strlen(buf),
		    ": %s", strerror(save_errno));
	}
	(void) fprintf(stderr, "%s\n", buf);
	fatal_msg = buf;			/* to ease debugging */

out:
	if (ztest_dump_core)
		abort();
	else
		dump_debug_buffer();

	exit(3);
}

static int
str2shift(const char *buf)
{
	const char *ends = "BKMGTPEZ";
	int i;

	if (buf[0] == '\0')
		return (0);
	for (i = 0; i < strlen(ends); i++) {
		if (toupper(buf[0]) == ends[i])
			break;
	}
	if (i == strlen(ends)) {
		(void) fprintf(stderr, "ztest: invalid bytes suffix: %s\n",
		    buf);
		usage(B_FALSE);
	}
	if (buf[1] == '\0' || (toupper(buf[1]) == 'B' && buf[2] == '\0')) {
		return (10*i);
	}
	(void) fprintf(stderr, "ztest: invalid bytes suffix: %s\n", buf);
	usage(B_FALSE);
}

static uint64_t
nicenumtoull(const char *buf)
{
	char *end;
	uint64_t val;

	val = strtoull(buf, &end, 0);
	if (end == buf) {
		(void) fprintf(stderr, "ztest: bad numeric value: %s\n", buf);
		usage(B_FALSE);
	} else if (end[0] == '.') {
		double fval = strtod(buf, &end);
		fval *= pow(2, str2shift(end));
		/*
		 * UINT64_MAX is not exactly representable as a double.
		 * The closest representation is UINT64_MAX + 1, so we
		 * use a >= comparison instead of > for the bounds check.
		 */
		if (fval >= (double)UINT64_MAX) {
			(void) fprintf(stderr, "ztest: value too large: %s\n",
			    buf);
			usage(B_FALSE);
		}
		val = (uint64_t)fval;
	} else {
		int shift = str2shift(end);
		if (shift >= 64 || (val << shift) >> shift != val) {
			(void) fprintf(stderr, "ztest: value too large: %s\n",
			    buf);
			usage(B_FALSE);
		}
		val <<= shift;
	}
	return (val);
}

typedef struct ztest_option {
	const char	short_opt;
	const char	*long_opt;
	const char	*long_opt_param;
	const char	*comment;
	unsigned int	default_int;
	const char	*default_str;
} ztest_option_t;

/*
 * The following option_table is used for generating the usage info as well as
 * the long and short option information for calling getopt_long().
 */
static ztest_option_t option_table[] = {
	{ 'v',	"vdevs", "INTEGER", "Number of vdevs", DEFAULT_VDEV_COUNT,
	    NULL},
	{ 's',	"vdev-size", "INTEGER", "Size of each vdev",
	    NO_DEFAULT, DEFAULT_VDEV_SIZE_STR},
	{ 'a',	"alignment-shift", "INTEGER",
	    "Alignment shift; use 0 for random", DEFAULT_ASHIFT, NULL},
	{ 'm',	"mirror-copies", "INTEGER", "Number of mirror copies",
	    DEFAULT_MIRRORS, NULL},
	{ 'r',	"raid-disks", "INTEGER", "Number of raidz/draid disks",
	    DEFAULT_RAID_CHILDREN, NULL},
	{ 'R',	"raid-parity", "INTEGER", "Raid parity",
	    DEFAULT_RAID_PARITY, NULL},
	{ 'K',	"raid-kind", "raidz|draid|random", "Raid kind",
	    NO_DEFAULT, "random"},
	{ 'D',	"draid-data", "INTEGER", "Number of draid data drives",
	    DEFAULT_DRAID_DATA, NULL},
	{ 'S',	"draid-spares", "INTEGER", "Number of draid spares",
	    DEFAULT_DRAID_SPARES, NULL},
	{ 'd',	"datasets", "INTEGER", "Number of datasets",
	    DEFAULT_DATASETS_COUNT, NULL},
	{ 't',	"threads", "INTEGER", "Number of ztest threads",
	    DEFAULT_THREADS, NULL},
	{ 'g',	"gang-block-threshold", "INTEGER",
	    "Metaslab gang block threshold",
	    NO_DEFAULT, DEFAULT_FORCE_GANGING_STR},
	{ 'i',	"init-count", "INTEGER", "Number of times to initialize pool",
	    DEFAULT_INITS, NULL},
	{ 'k',	"kill-percentage", "INTEGER", "Kill percentage",
	    NO_DEFAULT, DEFAULT_KILLRATE_STR},
	{ 'p',	"pool-name", "STRING", "Pool name",
	    NO_DEFAULT, DEFAULT_POOL},
	{ 'f',	"vdev-file-directory", "PATH", "File directory for vdev files",
	    NO_DEFAULT, DEFAULT_VDEV_DIR},
	{ 'M',	"multi-host", NULL,
	    "Multi-host; simulate pool imported on remote host",
	    NO_DEFAULT, NULL},
	{ 'E',	"use-existing-pool", NULL,
	    "Use existing pool instead of creating new one", NO_DEFAULT, NULL},
	{ 'T',	"run-time", "INTEGER", "Total run time",
	    NO_DEFAULT, DEFAULT_RUN_TIME_STR},
	{ 'P',	"pass-time", "INTEGER", "Time per pass",
	    NO_DEFAULT, DEFAULT_PASS_TIME_STR},
	{ 'F',	"freeze-loops", "INTEGER", "Max loops in spa_freeze()",
	    DEFAULT_MAX_LOOPS, NULL},
	{ 'B',	"alt-ztest", "PATH", "Alternate ztest path",
	    NO_DEFAULT, NULL},
	{ 'C',	"vdev-class-state", "on|off|random", "vdev class state",
	    NO_DEFAULT, "random"},
	{ 'o',	"option", "\"OPTION=INTEGER\"",
	    "Set global variable to an unsigned 32-bit integer value",
	    NO_DEFAULT, NULL},
	{ 'G',	"dump-debug-msg", NULL,
	    "Dump zfs_dbgmsg buffer before exiting due to an error",
	    NO_DEFAULT, NULL},
	{ 'V',	"verbose", NULL,
	    "Verbose (use multiple times for ever more verbosity)",
	    NO_DEFAULT, NULL},
	{ 'h',	"help", NULL, "Show this help",
	    NO_DEFAULT, NULL},
	{0, 0, 0, 0, 0, 0}
};

static struct option *long_opts = NULL;
static char *short_opts = NULL;

static void
init_options(void)
{
	ASSERT3P(long_opts, ==, NULL);
	ASSERT3P(short_opts, ==, NULL);

	int count = sizeof (option_table) / sizeof (option_table[0]);
	long_opts = umem_alloc(sizeof (struct option) * count, UMEM_NOFAIL);

	short_opts = umem_alloc(sizeof (char) * 2 * count, UMEM_NOFAIL);
	int short_opt_index = 0;

	for (int i = 0; i < count; i++) {
		long_opts[i].val = option_table[i].short_opt;
		long_opts[i].name = option_table[i].long_opt;
		long_opts[i].has_arg = option_table[i].long_opt_param != NULL ?
		    required_argument : no_argument;
		long_opts[i].flag = NULL;
		short_opts[short_opt_index++] = option_table[i].short_opt;
		if (option_table[i].long_opt_param != NULL) {
			short_opts[short_opt_index++] = ':';
		}
	}
}

static void
fini_options(void)
{
	int count = sizeof (option_table) / sizeof (option_table[0]);

	umem_free(long_opts, sizeof (struct option) * count);
	umem_free(short_opts, sizeof (char) * 2 * count);

	long_opts = NULL;
	short_opts = NULL;
}

static __attribute__((noreturn)) void
usage(boolean_t requested)
{
	char option[80];
	FILE *fp = requested ? stdout : stderr;

	(void) fprintf(fp, "Usage: %s [OPTIONS...]\n", DEFAULT_POOL);
	for (int i = 0; option_table[i].short_opt != 0; i++) {
		if (option_table[i].long_opt_param != NULL) {
			(void) sprintf(option, " -%c --%s=%s",
			    option_table[i].short_opt,
			    option_table[i].long_opt,
			    option_table[i].long_opt_param);
		} else {
			(void) sprintf(option, " -%c --%s",
			    option_table[i].short_opt,
			    option_table[i].long_opt);
		}
		(void) fprintf(fp, " %-40s%s", option,
		    option_table[i].comment);

		if (option_table[i].long_opt_param != NULL) {
			if (option_table[i].default_str != NULL) {
				(void) fprintf(fp, " (default: %s)",
				    option_table[i].default_str);
			} else if (option_table[i].default_int != NO_DEFAULT) {
				(void) fprintf(fp, " (default: %u)",
				    option_table[i].default_int);
			}
		}
		(void) fprintf(fp, "\n");
	}
	exit(requested ? 0 : 1);
}

static uint64_t
ztest_random(uint64_t range)
{
	uint64_t r;

	ASSERT3S(ztest_fd_rand, >=, 0);

	if (range == 0)
		return (0);

	if (read(ztest_fd_rand, &r, sizeof (r)) != sizeof (r))
		fatal(B_TRUE, "short read from /dev/urandom");

	return (r % range);
}

static void
ztest_parse_name_value(const char *input, ztest_shared_opts_t *zo)
{
	char name[32];
	char *value;
	int state = ZTEST_VDEV_CLASS_RND;

	(void) strlcpy(name, input, sizeof (name));

	value = strchr(name, '=');
	if (value == NULL) {
		(void) fprintf(stderr, "missing value in property=value "
		    "'-C' argument (%s)\n", input);
		usage(B_FALSE);
	}
	*(value) = '\0';
	value++;

	if (strcmp(value, "on") == 0) {
		state = ZTEST_VDEV_CLASS_ON;
	} else if (strcmp(value, "off") == 0) {
		state = ZTEST_VDEV_CLASS_OFF;
	} else if (strcmp(value, "random") == 0) {
		state = ZTEST_VDEV_CLASS_RND;
	} else {
		(void) fprintf(stderr, "invalid property value '%s'\n", value);
		usage(B_FALSE);
	}

	if (strcmp(name, "special") == 0) {
		zo->zo_special_vdevs = state;
	} else {
		(void) fprintf(stderr, "invalid property name '%s'\n", name);
		usage(B_FALSE);
	}
	if (zo->zo_verbose >= 3)
		(void) printf("%s vdev state is '%s'\n", name, value);
}

static void
process_options(int argc, char **argv)
{
	char *path;
	ztest_shared_opts_t *zo = &ztest_opts;

	int opt;
	uint64_t value;
	const char *raid_kind = "random";

	memcpy(zo, &ztest_opts_defaults, sizeof (*zo));

	init_options();

	while ((opt = getopt_long(argc, argv, short_opts, long_opts,
	    NULL)) != EOF) {
		value = 0;
		switch (opt) {
		case 'v':
		case 's':
		case 'a':
		case 'm':
		case 'r':
		case 'R':
		case 'D':
		case 'S':
		case 'd':
		case 't':
		case 'g':
		case 'i':
		case 'k':
		case 'T':
		case 'P':
		case 'F':
			value = nicenumtoull(optarg);
		}
		switch (opt) {
		case 'v':
			zo->zo_vdevs = value;
			break;
		case 's':
			zo->zo_vdev_size = MAX(SPA_MINDEVSIZE, value);
			break;
		case 'a':
			zo->zo_ashift = value;
			break;
		case 'm':
			zo->zo_mirrors = value;
			break;
		case 'r':
			zo->zo_raid_children = MAX(1, value);
			break;
		case 'R':
			zo->zo_raid_parity = MIN(MAX(value, 1), 3);
			break;
		case 'K':
			raid_kind = optarg;
			break;
		case 'D':
			zo->zo_draid_data = MAX(1, value);
			break;
		case 'S':
			zo->zo_draid_spares = MAX(1, value);
			break;
		case 'd':
			zo->zo_datasets = MAX(1, value);
			break;
		case 't':
			zo->zo_threads = MAX(1, value);
			break;
		case 'g':
			zo->zo_metaslab_force_ganging =
			    MAX(SPA_MINBLOCKSIZE << 1, value);
			break;
		case 'i':
			zo->zo_init = value;
			break;
		case 'k':
			zo->zo_killrate = value;
			break;
		case 'p':
			(void) strlcpy(zo->zo_pool, optarg,
			    sizeof (zo->zo_pool));
			break;
		case 'f':
			path = realpath(optarg, NULL);
			if (path == NULL) {
				(void) fprintf(stderr, "error: %s: %s\n",
				    optarg, strerror(errno));
				usage(B_FALSE);
			} else {
				(void) strlcpy(zo->zo_dir, path,
				    sizeof (zo->zo_dir));
				free(path);
			}
			break;
		case 'M':
			zo->zo_mmp_test = 1;
			break;
		case 'V':
			zo->zo_verbose++;
			break;
		case 'E':
			zo->zo_init = 0;
			break;
		case 'T':
			zo->zo_time = value;
			break;
		case 'P':
			zo->zo_passtime = MAX(1, value);
			break;
		case 'F':
			zo->zo_maxloops = MAX(1, value);
			break;
		case 'B':
			(void) strlcpy(zo->zo_alt_ztest, optarg,
			    sizeof (zo->zo_alt_ztest));
			break;
		case 'C':
			ztest_parse_name_value(optarg, zo);
			break;
		case 'o':
			if (zo->zo_gvars_count >= ZO_GVARS_MAX_COUNT) {
				(void) fprintf(stderr,
				    "max global var count (%zu) exceeded\n",
				    ZO_GVARS_MAX_COUNT);
				usage(B_FALSE);
			}
			char *v = zo->zo_gvars[zo->zo_gvars_count];
			if (strlcpy(v, optarg, ZO_GVARS_MAX_ARGLEN) >=
			    ZO_GVARS_MAX_ARGLEN) {
				(void) fprintf(stderr,
				    "global var option '%s' is too long\n",
				    optarg);
				usage(B_FALSE);
			}
			zo->zo_gvars_count++;
			break;
		case 'G':
			zo->zo_dump_dbgmsg = 1;
			break;
		case 'h':
			usage(B_TRUE);
			break;
		case '?':
		default:
			usage(B_FALSE);
			break;
		}
	}

	fini_options();

	/* When raid choice is 'random' add a draid pool 50% of the time */
	if (strcmp(raid_kind, "random") == 0) {
		raid_kind = (ztest_random(2) == 0) ? "draid" : "raidz";

		if (ztest_opts.zo_verbose >= 3)
			(void) printf("choosing RAID type '%s'\n", raid_kind);
	}

	if (strcmp(raid_kind, "draid") == 0) {
		uint64_t min_devsize;

		/* With fewer disks use 256M, otherwise 128M is OK */
		min_devsize = (ztest_opts.zo_raid_children < 16) ?
		    (256ULL << 20) : (128ULL << 20);

		/* No top-level mirrors with dRAID for now */
		zo->zo_mirrors = 0;

		/* Use more appropriate defaults for dRAID */
		if (zo->zo_vdevs == ztest_opts_defaults.zo_vdevs)
			zo->zo_vdevs = 1;
		if (zo->zo_raid_children ==
		    ztest_opts_defaults.zo_raid_children)
			zo->zo_raid_children = 16;
		if (zo->zo_ashift < 12)
			zo->zo_ashift = 12;
		if (zo->zo_vdev_size < min_devsize)
			zo->zo_vdev_size = min_devsize;

		if (zo->zo_draid_data + zo->zo_raid_parity >
		    zo->zo_raid_children - zo->zo_draid_spares) {
			(void) fprintf(stderr, "error: too few draid "
			    "children (%d) for stripe width (%d)\n",
			    zo->zo_raid_children,
			    zo->zo_draid_data + zo->zo_raid_parity);
			usage(B_FALSE);
		}

		(void) strlcpy(zo->zo_raid_type, VDEV_TYPE_DRAID,
		    sizeof (zo->zo_raid_type));

	} else /* using raidz */ {
		ASSERT0(strcmp(raid_kind, "raidz"));

		zo->zo_raid_parity = MIN(zo->zo_raid_parity,
		    zo->zo_raid_children - 1);
	}

	zo->zo_vdevtime =
	    (zo->zo_vdevs > 0 ? zo->zo_time * NANOSEC / zo->zo_vdevs :
	    UINT64_MAX >> 2);

	if (*zo->zo_alt_ztest) {
		const char *invalid_what = "ztest";
		char *val = zo->zo_alt_ztest;
		if (0 != access(val, X_OK) ||
		    (strrchr(val, '/') == NULL && (errno = EINVAL)))
			goto invalid;

		int dirlen = strrchr(val, '/') - val;
		strlcpy(zo->zo_alt_libpath, val,
		    MIN(sizeof (zo->zo_alt_libpath), dirlen + 1));
		invalid_what = "library path", val = zo->zo_alt_libpath;
		if (strrchr(val, '/') == NULL && (errno = EINVAL))
			goto invalid;
		*strrchr(val, '/') = '\0';
		strlcat(val, "/lib", sizeof (zo->zo_alt_libpath));

		if (0 != access(zo->zo_alt_libpath, X_OK))
			goto invalid;
		return;

invalid:
		ztest_dump_core = B_FALSE;
		fatal(B_TRUE, "invalid alternate %s %s", invalid_what, val);
	}
}

static void
ztest_kill(ztest_shared_t *zs)
{
	zs->zs_alloc = metaslab_class_get_alloc(spa_normal_class(ztest_spa));
	zs->zs_space = metaslab_class_get_space(spa_normal_class(ztest_spa));

	/*
	 * Before we kill ourselves, make sure that the config is updated.
	 * See comment above spa_write_cachefile().
	 */
	mutex_enter(&spa_namespace_lock);
	spa_write_cachefile(ztest_spa, B_FALSE, B_FALSE, B_FALSE);
	mutex_exit(&spa_namespace_lock);

	(void) raise(SIGKILL);
}

static void
ztest_record_enospc(const char *s)
{
	(void) s;
	ztest_shared->zs_enospc_count++;
}

static uint64_t
ztest_get_ashift(void)
{
	if (ztest_opts.zo_ashift == 0)
		return (SPA_MINBLOCKSHIFT + ztest_random(5));
	return (ztest_opts.zo_ashift);
}

static boolean_t
ztest_is_draid_spare(const char *name)
{
	uint64_t spare_id = 0, parity = 0, vdev_id = 0;

	if (sscanf(name, VDEV_TYPE_DRAID "%"PRIu64"-%"PRIu64"-%"PRIu64"",
	    &parity, &vdev_id, &spare_id) == 3) {
		return (B_TRUE);
	}

	return (B_FALSE);
}

static nvlist_t *
make_vdev_file(const char *path, const char *aux, const char *pool,
    size_t size, uint64_t ashift)
{
	char *pathbuf = NULL;
	uint64_t vdev;
	nvlist_t *file;
	boolean_t draid_spare = B_FALSE;

	if (ashift == 0)
		ashift = ztest_get_ashift();

	if (path == NULL) {
		pathbuf = umem_alloc(MAXPATHLEN, UMEM_NOFAIL);
		path = pathbuf;

		if (aux != NULL) {
			vdev = ztest_shared->zs_vdev_aux;
			(void) snprintf(pathbuf, MAXPATHLEN,
			    ztest_aux_template, ztest_opts.zo_dir,
			    pool == NULL ? ztest_opts.zo_pool : pool,
			    aux, vdev);
		} else {
			vdev = ztest_shared->zs_vdev_next_leaf++;
			(void) snprintf(pathbuf, MAXPATHLEN,
			    ztest_dev_template, ztest_opts.zo_dir,
			    pool == NULL ? ztest_opts.zo_pool : pool, vdev);
		}
	} else {
		draid_spare = ztest_is_draid_spare(path);
	}

	if (size != 0 && !draid_spare) {
		int fd = open(path, O_RDWR | O_CREAT | O_TRUNC, 0666);
		if (fd == -1)
			fatal(B_TRUE, "can't open %s", path);
		if (ftruncate(fd, size) != 0)
			fatal(B_TRUE, "can't ftruncate %s", path);
		(void) close(fd);
	}

	file = fnvlist_alloc();
	fnvlist_add_string(file, ZPOOL_CONFIG_TYPE,
	    draid_spare ? VDEV_TYPE_DRAID_SPARE : VDEV_TYPE_FILE);
	fnvlist_add_string(file, ZPOOL_CONFIG_PATH, path);
	fnvlist_add_uint64(file, ZPOOL_CONFIG_ASHIFT, ashift);
	umem_free(pathbuf, MAXPATHLEN);

	return (file);
}

static nvlist_t *
make_vdev_raid(const char *path, const char *aux, const char *pool, size_t size,
    uint64_t ashift, int r)
{
	nvlist_t *raid, **child;
	int c;

	if (r < 2)
		return (make_vdev_file(path, aux, pool, size, ashift));
	child = umem_alloc(r * sizeof (nvlist_t *), UMEM_NOFAIL);

	for (c = 0; c < r; c++)
		child[c] = make_vdev_file(path, aux, pool, size, ashift);

	raid = fnvlist_alloc();
	fnvlist_add_string(raid, ZPOOL_CONFIG_TYPE,
	    ztest_opts.zo_raid_type);
	fnvlist_add_uint64(raid, ZPOOL_CONFIG_NPARITY,
	    ztest_opts.zo_raid_parity);
	fnvlist_add_nvlist_array(raid, ZPOOL_CONFIG_CHILDREN,
	    (const nvlist_t **)child, r);

	if (strcmp(ztest_opts.zo_raid_type, VDEV_TYPE_DRAID) == 0) {
		uint64_t ndata = ztest_opts.zo_draid_data;
		uint64_t nparity = ztest_opts.zo_raid_parity;
		uint64_t nspares = ztest_opts.zo_draid_spares;
		uint64_t children = ztest_opts.zo_raid_children;
		uint64_t ngroups = 1;

		/*
		 * Calculate the minimum number of groups required to fill a
		 * slice.
		 * This is the LCM of the stripe width (data + parity)
		 * and the number of data drives (children - spares).
		 */
		while (ngroups * (ndata + nparity) % (children - nspares) != 0)
			ngroups++;

		/* Store the basic dRAID configuration. */
		fnvlist_add_uint64(raid, ZPOOL_CONFIG_DRAID_NDATA, ndata);
		fnvlist_add_uint64(raid, ZPOOL_CONFIG_DRAID_NSPARES, nspares);
		fnvlist_add_uint64(raid, ZPOOL_CONFIG_DRAID_NGROUPS, ngroups);
	}

	for (c = 0; c < r; c++)
		fnvlist_free(child[c]);

	umem_free(child, r * sizeof (nvlist_t *));

	return (raid);
}

static nvlist_t *
make_vdev_mirror(const char *path, const char *aux, const char *pool,
    size_t size, uint64_t ashift, int r, int m)
{
	nvlist_t *mirror, **child;
	int c;

	if (m < 1)
		return (make_vdev_raid(path, aux, pool, size, ashift, r));

	child = umem_alloc(m * sizeof (nvlist_t *), UMEM_NOFAIL);

	for (c = 0; c < m; c++)
		child[c] = make_vdev_raid(path, aux, pool, size, ashift, r);

	mirror = fnvlist_alloc();
	fnvlist_add_string(mirror, ZPOOL_CONFIG_TYPE, VDEV_TYPE_MIRROR);
	fnvlist_add_nvlist_array(mirror, ZPOOL_CONFIG_CHILDREN,
	    (const nvlist_t **)child, m);

	for (c = 0; c < m; c++)
		fnvlist_free(child[c]);

	umem_free(child, m * sizeof (nvlist_t *));

	return (mirror);
}

static nvlist_t *
make_vdev_root(const char *path, const char *aux, const char *pool, size_t size,
    uint64_t ashift, const char *class, int r, int m, int t)
{
	nvlist_t *root, **child;
	int c;
	boolean_t log;

	ASSERT3S(t, >, 0);

	log = (class != NULL && strcmp(class, "log") == 0);

	child = umem_alloc(t * sizeof (nvlist_t *), UMEM_NOFAIL);

	for (c = 0; c < t; c++) {
		child[c] = make_vdev_mirror(path, aux, pool, size, ashift,
		    r, m);
		fnvlist_add_uint64(child[c], ZPOOL_CONFIG_IS_LOG, log);

		if (class != NULL && class[0] != '\0') {
			ASSERT(m > 1 || log);	/* expecting a mirror */
			fnvlist_add_string(child[c],
			    ZPOOL_CONFIG_ALLOCATION_BIAS, class);
		}
	}

	root = fnvlist_alloc();
	fnvlist_add_string(root, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT);
	fnvlist_add_nvlist_array(root, aux ? aux : ZPOOL_CONFIG_CHILDREN,
	    (const nvlist_t **)child, t);

	for (c = 0; c < t; c++)
		fnvlist_free(child[c]);

	umem_free(child, t * sizeof (nvlist_t *));

	return (root);
}

/*
 * Find a random spa version. Returns back a random spa version in the
 * range [initial_version, SPA_VERSION_FEATURES].
 */
static uint64_t
ztest_random_spa_version(uint64_t initial_version)
{
	uint64_t version = initial_version;

	if (version <= SPA_VERSION_BEFORE_FEATURES) {
		version = version +
		    ztest_random(SPA_VERSION_BEFORE_FEATURES - version + 1);
	}

	if (version > SPA_VERSION_BEFORE_FEATURES)
		version = SPA_VERSION_FEATURES;

	ASSERT(SPA_VERSION_IS_SUPPORTED(version));
	return (version);
}

static int
ztest_random_blocksize(void)
{
	ASSERT3U(ztest_spa->spa_max_ashift, !=, 0);

	/*
	 * Choose a block size >= the ashift.
	 * If the SPA supports new MAXBLOCKSIZE, test up to 1MB blocks.
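	 * (1MB is 2^20 bytes, hence the maxbs = 20 cap below.)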
	 */
	int maxbs = SPA_OLD_MAXBLOCKSHIFT;
	if (spa_maxblocksize(ztest_spa) == SPA_MAXBLOCKSIZE)
		maxbs = 20;
	uint64_t block_shift =
	    ztest_random(maxbs - ztest_spa->spa_max_ashift + 1);
	return (1 << (SPA_MINBLOCKSHIFT + block_shift));
}

static int
ztest_random_dnodesize(void)
{
	int slots;
	int max_slots = spa_maxdnodesize(ztest_spa) >> DNODE_SHIFT;

	if (max_slots == DNODE_MIN_SLOTS)
		return (DNODE_MIN_SIZE);

	/*
	 * Weight the random distribution more heavily toward smaller
	 * dnode sizes since that is more likely to reflect real-world
	 * usage.
	 */
	ASSERT3U(max_slots, >, 4);
	switch (ztest_random(10)) {
	case 0:
		slots = 5 + ztest_random(max_slots - 4);
		break;
	case 1 ... 4:
		slots = 2 + ztest_random(3);
		break;
	default:
		slots = 1;
		break;
	}

	return (slots << DNODE_SHIFT);
}

static int
ztest_random_ibshift(void)
{
	return (DN_MIN_INDBLKSHIFT +
	    ztest_random(DN_MAX_INDBLKSHIFT - DN_MIN_INDBLKSHIFT + 1));
}

static uint64_t
ztest_random_vdev_top(spa_t *spa, boolean_t log_ok)
{
	uint64_t top;
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *tvd;

	ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);

	do {
		top = ztest_random(rvd->vdev_children);
		tvd = rvd->vdev_child[top];
	} while (!vdev_is_concrete(tvd) || (tvd->vdev_islog && !log_ok) ||
	    tvd->vdev_mg == NULL || tvd->vdev_mg->mg_class == NULL);

	return (top);
}

static uint64_t
ztest_random_dsl_prop(zfs_prop_t prop)
{
	uint64_t value;

	do {
		value = zfs_prop_random_value(prop, ztest_random(-1ULL));
	} while (prop == ZFS_PROP_CHECKSUM && value == ZIO_CHECKSUM_OFF);

	return (value);
}

static int
ztest_dsl_prop_set_uint64(char *osname, zfs_prop_t prop, uint64_t value,
    boolean_t inherit)
{
	const char *propname = zfs_prop_to_name(prop);
	const char *valname;
	char *setpoint;
	uint64_t curval;
	int error;

	error = dsl_prop_set_int(osname, propname,
	    (inherit ?
	    ZPROP_SRC_NONE : ZPROP_SRC_LOCAL), value);

	if (error == ENOSPC) {
		ztest_record_enospc(FTAG);
		return (error);
	}
	ASSERT0(error);

	setpoint = umem_alloc(MAXPATHLEN, UMEM_NOFAIL);
	VERIFY0(dsl_prop_get_integer(osname, propname, &curval, setpoint));

	if (ztest_opts.zo_verbose >= 6) {
		int err;

		err = zfs_prop_index_to_string(prop, curval, &valname);
		if (err)
			(void) printf("%s %s = %llu at '%s'\n", osname,
			    propname, (unsigned long long)curval, setpoint);
		else
			(void) printf("%s %s = %s at '%s'\n",
			    osname, propname, valname, setpoint);
	}
	umem_free(setpoint, MAXPATHLEN);

	return (error);
}

static int
ztest_spa_prop_set_uint64(zpool_prop_t prop, uint64_t value)
{
	spa_t *spa = ztest_spa;
	nvlist_t *props = NULL;
	int error;

	props = fnvlist_alloc();
	fnvlist_add_uint64(props, zpool_prop_to_name(prop), value);

	error = spa_prop_set(spa, props);

	fnvlist_free(props);

	if (error == ENOSPC) {
		ztest_record_enospc(FTAG);
		return (error);
	}
	ASSERT0(error);

	return (error);
}

static int
ztest_dmu_objset_own(const char *name, dmu_objset_type_t type,
    boolean_t readonly, boolean_t decrypt, const void *tag, objset_t **osp)
{
	int err;
	char *cp = NULL;
	char ddname[ZFS_MAX_DATASET_NAME_LEN];

	strlcpy(ddname, name, sizeof (ddname));
	cp = strchr(ddname, '@');
	if (cp != NULL)
		*cp = '\0';

	err = dmu_objset_own(name, type, readonly, decrypt, tag, osp);
	while (decrypt && err == EACCES) {
		dsl_crypto_params_t *dcp;
		nvlist_t *crypto_args = fnvlist_alloc();

		fnvlist_add_uint8_array(crypto_args, "wkeydata",
		    (uint8_t *)ztest_wkeydata, WRAPPING_KEY_LEN);
		VERIFY0(dsl_crypto_params_create_nvlist(DCP_CMD_NONE, NULL,
		    crypto_args, &dcp));
		err = spa_keystore_load_wkey(ddname, dcp, B_FALSE);
		/*
		 * Note: if there was an error loading, the wkey was not
		 * consumed, and needs to be freed.
		 */
		dsl_crypto_params_free(dcp, (err != 0));
		fnvlist_free(crypto_args);

		if (err == EINVAL) {
			/*
			 * We couldn't load a key for this dataset so try
			 * the parent. This loop will eventually hit the
			 * encryption root since ztest only makes clones
			 * as children of their origin datasets.
			 */
			cp = strrchr(ddname, '/');
			if (cp == NULL)
				return (err);

			*cp = '\0';
			err = EACCES;
			continue;
		} else if (err != 0) {
			break;
		}

		err = dmu_objset_own(name, type, readonly, decrypt, tag, osp);
		break;
	}

	return (err);
}

static void
ztest_rll_init(rll_t *rll)
{
	rll->rll_writer = NULL;
	rll->rll_readers = 0;
	mutex_init(&rll->rll_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&rll->rll_cv, NULL, CV_DEFAULT, NULL);
}

static void
ztest_rll_destroy(rll_t *rll)
{
	ASSERT3P(rll->rll_writer, ==, NULL);
	ASSERT0(rll->rll_readers);
	mutex_destroy(&rll->rll_lock);
	cv_destroy(&rll->rll_cv);
}

static void
ztest_rll_lock(rll_t *rll, rl_type_t type)
{
	mutex_enter(&rll->rll_lock);

	if (type == RL_READER) {
		while (rll->rll_writer != NULL)
			(void) cv_wait(&rll->rll_cv, &rll->rll_lock);
		rll->rll_readers++;
	} else {
		while (rll->rll_writer != NULL || rll->rll_readers)
			(void) cv_wait(&rll->rll_cv, &rll->rll_lock);
		rll->rll_writer = curthread;
	}

	mutex_exit(&rll->rll_lock);
}

static void
ztest_rll_unlock(rll_t *rll)
{
	mutex_enter(&rll->rll_lock);

	if (rll->rll_writer) {
		ASSERT0(rll->rll_readers);
		rll->rll_writer = NULL;
	} else {
		ASSERT3S(rll->rll_readers, >, 0);
		ASSERT3P(rll->rll_writer, ==, NULL);
		rll->rll_readers--;
	}

	if (rll->rll_writer == NULL && rll->rll_readers == 0)
		cv_broadcast(&rll->rll_cv);

	mutex_exit(&rll->rll_lock);
}

static void
ztest_object_lock(ztest_ds_t *zd, uint64_t object, rl_type_t type)
{
	rll_t *rll = &zd->zd_object_lock[object & (ZTEST_OBJECT_LOCKS - 1)];

	ztest_rll_lock(rll, type);
}

static void
ztest_object_unlock(ztest_ds_t *zd, uint64_t object)
{
	rll_t *rll = &zd->zd_object_lock[object & (ZTEST_OBJECT_LOCKS - 1)];

	ztest_rll_unlock(rll);
}

static rl_t *
ztest_range_lock(ztest_ds_t *zd, uint64_t object, uint64_t offset,
    uint64_t size, rl_type_t type)
{
	uint64_t hash = object ^ (offset % (ZTEST_RANGE_LOCKS + 1));
	rll_t *rll = &zd->zd_range_lock[hash & (ZTEST_RANGE_LOCKS - 1)];
	rl_t *rl;

	rl = umem_alloc(sizeof (*rl), UMEM_NOFAIL);
	rl->rl_object = object;
	rl->rl_offset = offset;
	rl->rl_size = size;
	rl->rl_lock = rll;

	ztest_rll_lock(rll, type);

	return (rl);
}

static void
ztest_range_unlock(rl_t *rl)
{
	rll_t *rll = rl->rl_lock;

	ztest_rll_unlock(rll);

	umem_free(rl, sizeof (*rl));
}

static void
ztest_zd_init(ztest_ds_t *zd, ztest_shared_ds_t *szd, objset_t *os)
{
	zd->zd_os = os;
	zd->zd_zilog = dmu_objset_zil(os);
	zd->zd_shared = szd;
	dmu_objset_name(os, zd->zd_name);
	int l;

	if (zd->zd_shared != NULL)
		zd->zd_shared->zd_seq = 0;

	VERIFY0(pthread_rwlock_init(&zd->zd_zilog_lock, NULL));
	mutex_init(&zd->zd_dirobj_lock, NULL, MUTEX_DEFAULT, NULL);

	for (l = 0; l < ZTEST_OBJECT_LOCKS; l++)
		ztest_rll_init(&zd->zd_object_lock[l]);

	for (l = 0; l < ZTEST_RANGE_LOCKS; l++)
		ztest_rll_init(&zd->zd_range_lock[l]);
}

static void
ztest_zd_fini(ztest_ds_t *zd)
{
	int l;

	mutex_destroy(&zd->zd_dirobj_lock);
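	/* Tear down the zilog rwlock and the per-object / per-range locks. */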
	(void) pthread_rwlock_destroy(&zd->zd_zilog_lock);

	for (l = 0; l < ZTEST_OBJECT_LOCKS; l++)
		ztest_rll_destroy(&zd->zd_object_lock[l]);

	for (l = 0; l < ZTEST_RANGE_LOCKS; l++)
		ztest_rll_destroy(&zd->zd_range_lock[l]);
}

#define	TXG_MIGHTWAIT	(ztest_random(10) == 0 ? TXG_NOWAIT : TXG_WAIT)

static uint64_t
ztest_tx_assign(dmu_tx_t *tx, uint64_t txg_how, const char *tag)
{
	uint64_t txg;
	int error;

	/*
	 * Attempt to assign tx to some transaction group.
	 */
	error = dmu_tx_assign(tx, txg_how);
	if (error) {
		if (error == ERESTART) {
			ASSERT3U(txg_how, ==, TXG_NOWAIT);
			dmu_tx_wait(tx);
		} else {
			ASSERT3U(error, ==, ENOSPC);
			ztest_record_enospc(tag);
		}
		dmu_tx_abort(tx);
		return (0);
	}
	txg = dmu_tx_get_txg(tx);
	ASSERT3U(txg, !=, 0);
	return (txg);
}

static void
ztest_bt_generate(ztest_block_tag_t *bt, objset_t *os, uint64_t object,
    uint64_t dnodesize, uint64_t offset, uint64_t gen, uint64_t txg,
    uint64_t crtxg)
{
	bt->bt_magic = BT_MAGIC;
	bt->bt_objset = dmu_objset_id(os);
	bt->bt_object = object;
	bt->bt_dnodesize = dnodesize;
	bt->bt_offset = offset;
	bt->bt_gen = gen;
	bt->bt_txg = txg;
	bt->bt_crtxg = crtxg;
}

static void
ztest_bt_verify(ztest_block_tag_t *bt, objset_t *os, uint64_t object,
    uint64_t dnodesize, uint64_t offset, uint64_t gen, uint64_t txg,
    uint64_t crtxg)
{
	ASSERT3U(bt->bt_magic, ==, BT_MAGIC);
	ASSERT3U(bt->bt_objset, ==, dmu_objset_id(os));
	ASSERT3U(bt->bt_object, ==, object);
	ASSERT3U(bt->bt_dnodesize, ==, dnodesize);
	ASSERT3U(bt->bt_offset, ==, offset);
	ASSERT3U(bt->bt_gen, <=, gen);
	ASSERT3U(bt->bt_txg, <=, txg);
	ASSERT3U(bt->bt_crtxg, ==, crtxg);
}

static ztest_block_tag_t *
ztest_bt_bonus(dmu_buf_t *db)
{
	dmu_object_info_t doi;
	ztest_block_tag_t *bt;

	dmu_object_info_from_db(db, &doi);
	ASSERT3U(doi.doi_bonus_size, <=, db->db_size);
	ASSERT3U(doi.doi_bonus_size, >=, sizeof (*bt));
	bt = (void *)((char *)db->db_data + doi.doi_bonus_size - sizeof (*bt));

	return (bt);
}

/*
 * Generate a token to fill up unused bonus buffer space.  Try to make
 * it unique to the object, generation, and offset to verify that data
 * is not getting overwritten by data from other dnodes.
 */
#define	ZTEST_BONUS_FILL_TOKEN(obj, ds, gen, offset)	\
	(((ds) << 48) | ((gen) << 32) | ((obj) << 8) | (offset))

/*
 * Fill up the unused bonus buffer region before the block tag with a
 * verifiable pattern. Filling the whole bonus area with non-zero data
 * helps ensure that all dnode traversal code properly skips the
 * interior regions of large dnodes.
 */
static void
ztest_fill_unused_bonus(dmu_buf_t *db, void *end, uint64_t obj,
    objset_t *os, uint64_t gen)
{
	uint64_t *bonusp;

	ASSERT(IS_P2ALIGNED((char *)end - (char *)db->db_data, 8));

	for (bonusp = db->db_data; bonusp < (uint64_t *)end; bonusp++) {
		uint64_t token = ZTEST_BONUS_FILL_TOKEN(obj, dmu_objset_id(os),
		    gen, bonusp - (uint64_t *)db->db_data);
		*bonusp = token;
	}
}

/*
 * Verify that the unused area of a bonus buffer is filled with the
 * expected tokens.
 */
static void
ztest_verify_unused_bonus(dmu_buf_t *db, void *end, uint64_t obj,
    objset_t *os, uint64_t gen)
{
	uint64_t *bonusp;

	for (bonusp = db->db_data; bonusp < (uint64_t *)end; bonusp++) {
		uint64_t token = ZTEST_BONUS_FILL_TOKEN(obj, dmu_objset_id(os),
		    gen, bonusp - (uint64_t *)db->db_data);
		VERIFY3U(*bonusp, ==, token);
	}
}

/*
 * ZIL logging ops
 */

#define	lrz_type	lr_mode
#define	lrz_blocksize	lr_uid
#define	lrz_ibshift	lr_gid
#define	lrz_bonustype	lr_rdev
#define	lrz_dnodesize	lr_crtime[1]

static void
ztest_log_create(ztest_ds_t *zd, dmu_tx_t *tx, lr_create_t *lr)
{
	char *name = (void *)(lr + 1);		/* name follows lr */
	size_t namesize = strlen(name) + 1;
	itx_t *itx;

	if (zil_replaying(zd->zd_zilog, tx))
		return;

	itx = zil_itx_create(TX_CREATE, sizeof (*lr) + namesize);
	memcpy(&itx->itx_lr + 1, &lr->lr_common + 1,
	    sizeof (*lr) + namesize - sizeof (lr_t));

	zil_itx_assign(zd->zd_zilog, itx, tx);
}

static void
ztest_log_remove(ztest_ds_t *zd, dmu_tx_t *tx, lr_remove_t *lr, uint64_t object)
{
	char *name = (void *)(lr + 1);		/* name follows lr */
	size_t namesize = strlen(name) + 1;
	itx_t *itx;

	if (zil_replaying(zd->zd_zilog, tx))
		return;

	itx = zil_itx_create(TX_REMOVE, sizeof (*lr) + namesize);
	memcpy(&itx->itx_lr + 1, &lr->lr_common + 1,
	    sizeof (*lr) + namesize - sizeof (lr_t));

	itx->itx_oid = object;
	zil_itx_assign(zd->zd_zilog, itx, tx);
}

static void
ztest_log_write(ztest_ds_t *zd, dmu_tx_t *tx, lr_write_t *lr)
{
	itx_t *itx;
	itx_wr_state_t write_state = ztest_random(WR_NUM_STATES);

	if (zil_replaying(zd->zd_zilog, tx))
		return;

	if (lr->lr_length > zil_max_log_data(zd->zd_zilog))
		write_state = WR_INDIRECT;

	itx = zil_itx_create(TX_WRITE,
	    sizeof (*lr) + (write_state == WR_COPIED ?
lr->lr_length : 0)); 1909 1910 if (write_state == WR_COPIED && 1911 dmu_read(zd->zd_os, lr->lr_foid, lr->lr_offset, lr->lr_length, 1912 ((lr_write_t *)&itx->itx_lr) + 1, DMU_READ_NO_PREFETCH) != 0) { 1913 zil_itx_destroy(itx); 1914 itx = zil_itx_create(TX_WRITE, sizeof (*lr)); 1915 write_state = WR_NEED_COPY; 1916 } 1917 itx->itx_private = zd; 1918 itx->itx_wr_state = write_state; 1919 itx->itx_sync = (ztest_random(8) == 0); 1920 1921 memcpy(&itx->itx_lr + 1, &lr->lr_common + 1, 1922 sizeof (*lr) - sizeof (lr_t)); 1923 1924 zil_itx_assign(zd->zd_zilog, itx, tx); 1925 } 1926 1927 static void 1928 ztest_log_truncate(ztest_ds_t *zd, dmu_tx_t *tx, lr_truncate_t *lr) 1929 { 1930 itx_t *itx; 1931 1932 if (zil_replaying(zd->zd_zilog, tx)) 1933 return; 1934 1935 itx = zil_itx_create(TX_TRUNCATE, sizeof (*lr)); 1936 memcpy(&itx->itx_lr + 1, &lr->lr_common + 1, 1937 sizeof (*lr) - sizeof (lr_t)); 1938 1939 itx->itx_sync = B_FALSE; 1940 zil_itx_assign(zd->zd_zilog, itx, tx); 1941 } 1942 1943 static void 1944 ztest_log_setattr(ztest_ds_t *zd, dmu_tx_t *tx, lr_setattr_t *lr) 1945 { 1946 itx_t *itx; 1947 1948 if (zil_replaying(zd->zd_zilog, tx)) 1949 return; 1950 1951 itx = zil_itx_create(TX_SETATTR, sizeof (*lr)); 1952 memcpy(&itx->itx_lr + 1, &lr->lr_common + 1, 1953 sizeof (*lr) - sizeof (lr_t)); 1954 1955 itx->itx_sync = B_FALSE; 1956 zil_itx_assign(zd->zd_zilog, itx, tx); 1957 } 1958 1959 /* 1960 * ZIL replay ops 1961 */ 1962 static int 1963 ztest_replay_create(void *arg1, void *arg2, boolean_t byteswap) 1964 { 1965 ztest_ds_t *zd = arg1; 1966 lr_create_t *lr = arg2; 1967 char *name = (void *)(lr + 1); /* name follows lr */ 1968 objset_t *os = zd->zd_os; 1969 ztest_block_tag_t *bbt; 1970 dmu_buf_t *db; 1971 dmu_tx_t *tx; 1972 uint64_t txg; 1973 int error = 0; 1974 int bonuslen; 1975 1976 if (byteswap) 1977 byteswap_uint64_array(lr, sizeof (*lr)); 1978 1979 ASSERT3U(lr->lr_doid, ==, ZTEST_DIROBJ); 1980 ASSERT3S(name[0], !=, '\0'); 1981 1982 tx = dmu_tx_create(os); 1983 1984 dmu_tx_hold_zap(tx, lr->lr_doid, B_TRUE, name); 1985 1986 if (lr->lrz_type == DMU_OT_ZAP_OTHER) { 1987 dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL); 1988 } else { 1989 dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); 1990 } 1991 1992 txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); 1993 if (txg == 0) 1994 return (ENOSPC); 1995 1996 ASSERT3U(dmu_objset_zil(os)->zl_replay, ==, !!lr->lr_foid); 1997 bonuslen = DN_BONUS_SIZE(lr->lrz_dnodesize); 1998 1999 if (lr->lrz_type == DMU_OT_ZAP_OTHER) { 2000 if (lr->lr_foid == 0) { 2001 lr->lr_foid = zap_create_dnsize(os, 2002 lr->lrz_type, lr->lrz_bonustype, 2003 bonuslen, lr->lrz_dnodesize, tx); 2004 } else { 2005 error = zap_create_claim_dnsize(os, lr->lr_foid, 2006 lr->lrz_type, lr->lrz_bonustype, 2007 bonuslen, lr->lrz_dnodesize, tx); 2008 } 2009 } else { 2010 if (lr->lr_foid == 0) { 2011 lr->lr_foid = dmu_object_alloc_dnsize(os, 2012 lr->lrz_type, 0, lr->lrz_bonustype, 2013 bonuslen, lr->lrz_dnodesize, tx); 2014 } else { 2015 error = dmu_object_claim_dnsize(os, lr->lr_foid, 2016 lr->lrz_type, 0, lr->lrz_bonustype, 2017 bonuslen, lr->lrz_dnodesize, tx); 2018 } 2019 } 2020 2021 if (error) { 2022 ASSERT3U(error, ==, EEXIST); 2023 ASSERT(zd->zd_zilog->zl_replay); 2024 dmu_tx_commit(tx); 2025 return (error); 2026 } 2027 2028 ASSERT3U(lr->lr_foid, !=, 0); 2029 2030 if (lr->lrz_type != DMU_OT_ZAP_OTHER) 2031 VERIFY0(dmu_object_set_blocksize(os, lr->lr_foid, 2032 lr->lrz_blocksize, lr->lrz_ibshift, tx)); 2033 2034 VERIFY0(dmu_bonus_hold(os, lr->lr_foid, FTAG, &db)); 2035 bbt = ztest_bt_bonus(db); 2036 
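	/*
	 * Stamp the new object's bonus buffer with a block tag (bt_txg and
	 * bt_crtxg both set to the txg we just assigned) and fill the rest
	 * of the bonus space with verifiable tokens.
	 */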
dmu_buf_will_dirty(db, tx); 2037 ztest_bt_generate(bbt, os, lr->lr_foid, lr->lrz_dnodesize, -1ULL, 2038 lr->lr_gen, txg, txg); 2039 ztest_fill_unused_bonus(db, bbt, lr->lr_foid, os, lr->lr_gen); 2040 dmu_buf_rele(db, FTAG); 2041 2042 VERIFY0(zap_add(os, lr->lr_doid, name, sizeof (uint64_t), 1, 2043 &lr->lr_foid, tx)); 2044 2045 (void) ztest_log_create(zd, tx, lr); 2046 2047 dmu_tx_commit(tx); 2048 2049 return (0); 2050 } 2051 2052 static int 2053 ztest_replay_remove(void *arg1, void *arg2, boolean_t byteswap) 2054 { 2055 ztest_ds_t *zd = arg1; 2056 lr_remove_t *lr = arg2; 2057 char *name = (void *)(lr + 1); /* name follows lr */ 2058 objset_t *os = zd->zd_os; 2059 dmu_object_info_t doi; 2060 dmu_tx_t *tx; 2061 uint64_t object, txg; 2062 2063 if (byteswap) 2064 byteswap_uint64_array(lr, sizeof (*lr)); 2065 2066 ASSERT3U(lr->lr_doid, ==, ZTEST_DIROBJ); 2067 ASSERT3S(name[0], !=, '\0'); 2068 2069 VERIFY0( 2070 zap_lookup(os, lr->lr_doid, name, sizeof (object), 1, &object)); 2071 ASSERT3U(object, !=, 0); 2072 2073 ztest_object_lock(zd, object, RL_WRITER); 2074 2075 VERIFY0(dmu_object_info(os, object, &doi)); 2076 2077 tx = dmu_tx_create(os); 2078 2079 dmu_tx_hold_zap(tx, lr->lr_doid, B_FALSE, name); 2080 dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END); 2081 2082 txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); 2083 if (txg == 0) { 2084 ztest_object_unlock(zd, object); 2085 return (ENOSPC); 2086 } 2087 2088 if (doi.doi_type == DMU_OT_ZAP_OTHER) { 2089 VERIFY0(zap_destroy(os, object, tx)); 2090 } else { 2091 VERIFY0(dmu_object_free(os, object, tx)); 2092 } 2093 2094 VERIFY0(zap_remove(os, lr->lr_doid, name, tx)); 2095 2096 (void) ztest_log_remove(zd, tx, lr, object); 2097 2098 dmu_tx_commit(tx); 2099 2100 ztest_object_unlock(zd, object); 2101 2102 return (0); 2103 } 2104 2105 static int 2106 ztest_replay_write(void *arg1, void *arg2, boolean_t byteswap) 2107 { 2108 ztest_ds_t *zd = arg1; 2109 lr_write_t *lr = arg2; 2110 objset_t *os = zd->zd_os; 2111 void *data = lr + 1; /* data follows lr */ 2112 uint64_t offset, length; 2113 ztest_block_tag_t *bt = data; 2114 ztest_block_tag_t *bbt; 2115 uint64_t gen, txg, lrtxg, crtxg; 2116 dmu_object_info_t doi; 2117 dmu_tx_t *tx; 2118 dmu_buf_t *db; 2119 arc_buf_t *abuf = NULL; 2120 rl_t *rl; 2121 2122 if (byteswap) 2123 byteswap_uint64_array(lr, sizeof (*lr)); 2124 2125 offset = lr->lr_offset; 2126 length = lr->lr_length; 2127 2128 /* If it's a dmu_sync() block, write the whole block */ 2129 if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) { 2130 uint64_t blocksize = BP_GET_LSIZE(&lr->lr_blkptr); 2131 if (length < blocksize) { 2132 offset -= offset % blocksize; 2133 length = blocksize; 2134 } 2135 } 2136 2137 if (bt->bt_magic == BSWAP_64(BT_MAGIC)) 2138 byteswap_uint64_array(bt, sizeof (*bt)); 2139 2140 if (bt->bt_magic != BT_MAGIC) 2141 bt = NULL; 2142 2143 ztest_object_lock(zd, lr->lr_foid, RL_READER); 2144 rl = ztest_range_lock(zd, lr->lr_foid, offset, length, RL_WRITER); 2145 2146 VERIFY0(dmu_bonus_hold(os, lr->lr_foid, FTAG, &db)); 2147 2148 dmu_object_info_from_db(db, &doi); 2149 2150 bbt = ztest_bt_bonus(db); 2151 ASSERT3U(bbt->bt_magic, ==, BT_MAGIC); 2152 gen = bbt->bt_gen; 2153 crtxg = bbt->bt_crtxg; 2154 lrtxg = lr->lr_common.lrc_txg; 2155 2156 tx = dmu_tx_create(os); 2157 2158 dmu_tx_hold_write(tx, lr->lr_foid, offset, length); 2159 2160 if (ztest_random(8) == 0 && length == doi.doi_data_block_size && 2161 P2PHASE(offset, length) == 0) 2162 abuf = dmu_request_arcbuf(db, length); 2163 2164 txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); 2165 if (txg == 
0) { 2166 if (abuf != NULL) 2167 dmu_return_arcbuf(abuf); 2168 dmu_buf_rele(db, FTAG); 2169 ztest_range_unlock(rl); 2170 ztest_object_unlock(zd, lr->lr_foid); 2171 return (ENOSPC); 2172 } 2173 2174 if (bt != NULL) { 2175 /* 2176 * Usually, verify the old data before writing new data -- 2177 * but not always, because we also want to verify correct 2178 * behavior when the data was not recently read into cache. 2179 */ 2180 ASSERT(doi.doi_data_block_size); 2181 ASSERT0(offset % doi.doi_data_block_size); 2182 if (ztest_random(4) != 0) { 2183 int prefetch = ztest_random(2) ? 2184 DMU_READ_PREFETCH : DMU_READ_NO_PREFETCH; 2185 ztest_block_tag_t rbt; 2186 2187 VERIFY(dmu_read(os, lr->lr_foid, offset, 2188 sizeof (rbt), &rbt, prefetch) == 0); 2189 if (rbt.bt_magic == BT_MAGIC) { 2190 ztest_bt_verify(&rbt, os, lr->lr_foid, 0, 2191 offset, gen, txg, crtxg); 2192 } 2193 } 2194 2195 /* 2196 * Writes can appear to be newer than the bonus buffer because 2197 * the ztest_get_data() callback does a dmu_read() of the 2198 * open-context data, which may be different than the data 2199 * as it was when the write was generated. 2200 */ 2201 if (zd->zd_zilog->zl_replay) { 2202 ztest_bt_verify(bt, os, lr->lr_foid, 0, offset, 2203 MAX(gen, bt->bt_gen), MAX(txg, lrtxg), 2204 bt->bt_crtxg); 2205 } 2206 2207 /* 2208 * Set the bt's gen/txg to the bonus buffer's gen/txg 2209 * so that all of the usual ASSERTs will work. 2210 */ 2211 ztest_bt_generate(bt, os, lr->lr_foid, 0, offset, gen, txg, 2212 crtxg); 2213 } 2214 2215 if (abuf == NULL) { 2216 dmu_write(os, lr->lr_foid, offset, length, data, tx); 2217 } else { 2218 memcpy(abuf->b_data, data, length); 2219 VERIFY0(dmu_assign_arcbuf_by_dbuf(db, offset, abuf, tx)); 2220 } 2221 2222 (void) ztest_log_write(zd, tx, lr); 2223 2224 dmu_buf_rele(db, FTAG); 2225 2226 dmu_tx_commit(tx); 2227 2228 ztest_range_unlock(rl); 2229 ztest_object_unlock(zd, lr->lr_foid); 2230 2231 return (0); 2232 } 2233 2234 static int 2235 ztest_replay_truncate(void *arg1, void *arg2, boolean_t byteswap) 2236 { 2237 ztest_ds_t *zd = arg1; 2238 lr_truncate_t *lr = arg2; 2239 objset_t *os = zd->zd_os; 2240 dmu_tx_t *tx; 2241 uint64_t txg; 2242 rl_t *rl; 2243 2244 if (byteswap) 2245 byteswap_uint64_array(lr, sizeof (*lr)); 2246 2247 ztest_object_lock(zd, lr->lr_foid, RL_READER); 2248 rl = ztest_range_lock(zd, lr->lr_foid, lr->lr_offset, lr->lr_length, 2249 RL_WRITER); 2250 2251 tx = dmu_tx_create(os); 2252 2253 dmu_tx_hold_free(tx, lr->lr_foid, lr->lr_offset, lr->lr_length); 2254 2255 txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); 2256 if (txg == 0) { 2257 ztest_range_unlock(rl); 2258 ztest_object_unlock(zd, lr->lr_foid); 2259 return (ENOSPC); 2260 } 2261 2262 VERIFY0(dmu_free_range(os, lr->lr_foid, lr->lr_offset, 2263 lr->lr_length, tx)); 2264 2265 (void) ztest_log_truncate(zd, tx, lr); 2266 2267 dmu_tx_commit(tx); 2268 2269 ztest_range_unlock(rl); 2270 ztest_object_unlock(zd, lr->lr_foid); 2271 2272 return (0); 2273 } 2274 2275 static int 2276 ztest_replay_setattr(void *arg1, void *arg2, boolean_t byteswap) 2277 { 2278 ztest_ds_t *zd = arg1; 2279 lr_setattr_t *lr = arg2; 2280 objset_t *os = zd->zd_os; 2281 dmu_tx_t *tx; 2282 dmu_buf_t *db; 2283 ztest_block_tag_t *bbt; 2284 uint64_t txg, lrtxg, crtxg, dnodesize; 2285 2286 if (byteswap) 2287 byteswap_uint64_array(lr, sizeof (*lr)); 2288 2289 ztest_object_lock(zd, lr->lr_foid, RL_WRITER); 2290 2291 VERIFY0(dmu_bonus_hold(os, lr->lr_foid, FTAG, &db)); 2292 2293 tx = dmu_tx_create(os); 2294 dmu_tx_hold_bonus(tx, lr->lr_foid); 2295 2296 txg = 
ztest_tx_assign(tx, TXG_WAIT, FTAG); 2297 if (txg == 0) { 2298 dmu_buf_rele(db, FTAG); 2299 ztest_object_unlock(zd, lr->lr_foid); 2300 return (ENOSPC); 2301 } 2302 2303 bbt = ztest_bt_bonus(db); 2304 ASSERT3U(bbt->bt_magic, ==, BT_MAGIC); 2305 crtxg = bbt->bt_crtxg; 2306 lrtxg = lr->lr_common.lrc_txg; 2307 dnodesize = bbt->bt_dnodesize; 2308 2309 if (zd->zd_zilog->zl_replay) { 2310 ASSERT3U(lr->lr_size, !=, 0); 2311 ASSERT3U(lr->lr_mode, !=, 0); 2312 ASSERT3U(lrtxg, !=, 0); 2313 } else { 2314 /* 2315 * Randomly change the size and increment the generation. 2316 */ 2317 lr->lr_size = (ztest_random(db->db_size / sizeof (*bbt)) + 1) * 2318 sizeof (*bbt); 2319 lr->lr_mode = bbt->bt_gen + 1; 2320 ASSERT0(lrtxg); 2321 } 2322 2323 /* 2324 * Verify that the current bonus buffer is not newer than our txg. 2325 */ 2326 ztest_bt_verify(bbt, os, lr->lr_foid, dnodesize, -1ULL, lr->lr_mode, 2327 MAX(txg, lrtxg), crtxg); 2328 2329 dmu_buf_will_dirty(db, tx); 2330 2331 ASSERT3U(lr->lr_size, >=, sizeof (*bbt)); 2332 ASSERT3U(lr->lr_size, <=, db->db_size); 2333 VERIFY0(dmu_set_bonus(db, lr->lr_size, tx)); 2334 bbt = ztest_bt_bonus(db); 2335 2336 ztest_bt_generate(bbt, os, lr->lr_foid, dnodesize, -1ULL, lr->lr_mode, 2337 txg, crtxg); 2338 ztest_fill_unused_bonus(db, bbt, lr->lr_foid, os, bbt->bt_gen); 2339 dmu_buf_rele(db, FTAG); 2340 2341 (void) ztest_log_setattr(zd, tx, lr); 2342 2343 dmu_tx_commit(tx); 2344 2345 ztest_object_unlock(zd, lr->lr_foid); 2346 2347 return (0); 2348 } 2349 2350 static zil_replay_func_t *ztest_replay_vector[TX_MAX_TYPE] = { 2351 NULL, /* 0 no such transaction type */ 2352 ztest_replay_create, /* TX_CREATE */ 2353 NULL, /* TX_MKDIR */ 2354 NULL, /* TX_MKXATTR */ 2355 NULL, /* TX_SYMLINK */ 2356 ztest_replay_remove, /* TX_REMOVE */ 2357 NULL, /* TX_RMDIR */ 2358 NULL, /* TX_LINK */ 2359 NULL, /* TX_RENAME */ 2360 ztest_replay_write, /* TX_WRITE */ 2361 ztest_replay_truncate, /* TX_TRUNCATE */ 2362 ztest_replay_setattr, /* TX_SETATTR */ 2363 NULL, /* TX_ACL */ 2364 NULL, /* TX_CREATE_ACL */ 2365 NULL, /* TX_CREATE_ATTR */ 2366 NULL, /* TX_CREATE_ACL_ATTR */ 2367 NULL, /* TX_MKDIR_ACL */ 2368 NULL, /* TX_MKDIR_ATTR */ 2369 NULL, /* TX_MKDIR_ACL_ATTR */ 2370 NULL, /* TX_WRITE2 */ 2371 NULL, /* TX_SETSAXATTR */ 2372 NULL, /* TX_RENAME_EXCHANGE */ 2373 NULL, /* TX_RENAME_WHITEOUT */ 2374 }; 2375 2376 /* 2377 * ZIL get_data callbacks 2378 */ 2379 2380 static void 2381 ztest_get_done(zgd_t *zgd, int error) 2382 { 2383 (void) error; 2384 ztest_ds_t *zd = zgd->zgd_private; 2385 uint64_t object = ((rl_t *)zgd->zgd_lr)->rl_object; 2386 2387 if (zgd->zgd_db) 2388 dmu_buf_rele(zgd->zgd_db, zgd); 2389 2390 ztest_range_unlock((rl_t *)zgd->zgd_lr); 2391 ztest_object_unlock(zd, object); 2392 2393 umem_free(zgd, sizeof (*zgd)); 2394 } 2395 2396 static int 2397 ztest_get_data(void *arg, uint64_t arg2, lr_write_t *lr, char *buf, 2398 struct lwb *lwb, zio_t *zio) 2399 { 2400 (void) arg2; 2401 ztest_ds_t *zd = arg; 2402 objset_t *os = zd->zd_os; 2403 uint64_t object = lr->lr_foid; 2404 uint64_t offset = lr->lr_offset; 2405 uint64_t size = lr->lr_length; 2406 uint64_t txg = lr->lr_common.lrc_txg; 2407 uint64_t crtxg; 2408 dmu_object_info_t doi; 2409 dmu_buf_t *db; 2410 zgd_t *zgd; 2411 int error; 2412 2413 ASSERT3P(lwb, !=, NULL); 2414 ASSERT3P(zio, !=, NULL); 2415 ASSERT3U(size, !=, 0); 2416 2417 ztest_object_lock(zd, object, RL_READER); 2418 error = dmu_bonus_hold(os, object, FTAG, &db); 2419 if (error) { 2420 ztest_object_unlock(zd, object); 2421 return (error); 2422 } 2423 2424 crtxg = 
ztest_bt_bonus(db)->bt_crtxg; 2425 2426 if (crtxg == 0 || crtxg > txg) { 2427 dmu_buf_rele(db, FTAG); 2428 ztest_object_unlock(zd, object); 2429 return (ENOENT); 2430 } 2431 2432 dmu_object_info_from_db(db, &doi); 2433 dmu_buf_rele(db, FTAG); 2434 db = NULL; 2435 2436 zgd = umem_zalloc(sizeof (*zgd), UMEM_NOFAIL); 2437 zgd->zgd_lwb = lwb; 2438 zgd->zgd_private = zd; 2439 2440 if (buf != NULL) { /* immediate write */ 2441 zgd->zgd_lr = (struct zfs_locked_range *)ztest_range_lock(zd, 2442 object, offset, size, RL_READER); 2443 2444 error = dmu_read(os, object, offset, size, buf, 2445 DMU_READ_NO_PREFETCH); 2446 ASSERT0(error); 2447 } else { 2448 size = doi.doi_data_block_size; 2449 if (ISP2(size)) { 2450 offset = P2ALIGN(offset, size); 2451 } else { 2452 ASSERT3U(offset, <, size); 2453 offset = 0; 2454 } 2455 2456 zgd->zgd_lr = (struct zfs_locked_range *)ztest_range_lock(zd, 2457 object, offset, size, RL_READER); 2458 2459 error = dmu_buf_hold(os, object, offset, zgd, &db, 2460 DMU_READ_NO_PREFETCH); 2461 2462 if (error == 0) { 2463 blkptr_t *bp = &lr->lr_blkptr; 2464 2465 zgd->zgd_db = db; 2466 zgd->zgd_bp = bp; 2467 2468 ASSERT3U(db->db_offset, ==, offset); 2469 ASSERT3U(db->db_size, ==, size); 2470 2471 error = dmu_sync(zio, lr->lr_common.lrc_txg, 2472 ztest_get_done, zgd); 2473 2474 if (error == 0) 2475 return (0); 2476 } 2477 } 2478 2479 ztest_get_done(zgd, error); 2480 2481 return (error); 2482 } 2483 2484 static void * 2485 ztest_lr_alloc(size_t lrsize, char *name) 2486 { 2487 char *lr; 2488 size_t namesize = name ? strlen(name) + 1 : 0; 2489 2490 lr = umem_zalloc(lrsize + namesize, UMEM_NOFAIL); 2491 2492 if (name) 2493 memcpy(lr + lrsize, name, namesize); 2494 2495 return (lr); 2496 } 2497 2498 static void 2499 ztest_lr_free(void *lr, size_t lrsize, char *name) 2500 { 2501 size_t namesize = name ? strlen(name) + 1 : 0; 2502 2503 umem_free(lr, lrsize + namesize); 2504 } 2505 2506 /* 2507 * Lookup a bunch of objects. Returns the number of objects not found. 
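 * For each object that is found, od_type, od_blocksize, and od_gen are
 * refreshed from the on-disk dnode and its bonus block tag.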
2508 */ 2509 static int 2510 ztest_lookup(ztest_ds_t *zd, ztest_od_t *od, int count) 2511 { 2512 int missing = 0; 2513 int error; 2514 int i; 2515 2516 ASSERT(MUTEX_HELD(&zd->zd_dirobj_lock)); 2517 2518 for (i = 0; i < count; i++, od++) { 2519 od->od_object = 0; 2520 error = zap_lookup(zd->zd_os, od->od_dir, od->od_name, 2521 sizeof (uint64_t), 1, &od->od_object); 2522 if (error) { 2523 ASSERT3S(error, ==, ENOENT); 2524 ASSERT0(od->od_object); 2525 missing++; 2526 } else { 2527 dmu_buf_t *db; 2528 ztest_block_tag_t *bbt; 2529 dmu_object_info_t doi; 2530 2531 ASSERT3U(od->od_object, !=, 0); 2532 ASSERT0(missing); /* there should be no gaps */ 2533 2534 ztest_object_lock(zd, od->od_object, RL_READER); 2535 VERIFY0(dmu_bonus_hold(zd->zd_os, od->od_object, 2536 FTAG, &db)); 2537 dmu_object_info_from_db(db, &doi); 2538 bbt = ztest_bt_bonus(db); 2539 ASSERT3U(bbt->bt_magic, ==, BT_MAGIC); 2540 od->od_type = doi.doi_type; 2541 od->od_blocksize = doi.doi_data_block_size; 2542 od->od_gen = bbt->bt_gen; 2543 dmu_buf_rele(db, FTAG); 2544 ztest_object_unlock(zd, od->od_object); 2545 } 2546 } 2547 2548 return (missing); 2549 } 2550 2551 static int 2552 ztest_create(ztest_ds_t *zd, ztest_od_t *od, int count) 2553 { 2554 int missing = 0; 2555 int i; 2556 2557 ASSERT(MUTEX_HELD(&zd->zd_dirobj_lock)); 2558 2559 for (i = 0; i < count; i++, od++) { 2560 if (missing) { 2561 od->od_object = 0; 2562 missing++; 2563 continue; 2564 } 2565 2566 lr_create_t *lr = ztest_lr_alloc(sizeof (*lr), od->od_name); 2567 2568 lr->lr_doid = od->od_dir; 2569 lr->lr_foid = 0; /* 0 to allocate, > 0 to claim */ 2570 lr->lrz_type = od->od_crtype; 2571 lr->lrz_blocksize = od->od_crblocksize; 2572 lr->lrz_ibshift = ztest_random_ibshift(); 2573 lr->lrz_bonustype = DMU_OT_UINT64_OTHER; 2574 lr->lrz_dnodesize = od->od_crdnodesize; 2575 lr->lr_gen = od->od_crgen; 2576 lr->lr_crtime[0] = time(NULL); 2577 2578 if (ztest_replay_create(zd, lr, B_FALSE) != 0) { 2579 ASSERT0(missing); 2580 od->od_object = 0; 2581 missing++; 2582 } else { 2583 od->od_object = lr->lr_foid; 2584 od->od_type = od->od_crtype; 2585 od->od_blocksize = od->od_crblocksize; 2586 od->od_gen = od->od_crgen; 2587 ASSERT3U(od->od_object, !=, 0); 2588 } 2589 2590 ztest_lr_free(lr, sizeof (*lr), od->od_name); 2591 } 2592 2593 return (missing); 2594 } 2595 2596 static int 2597 ztest_remove(ztest_ds_t *zd, ztest_od_t *od, int count) 2598 { 2599 int missing = 0; 2600 int error; 2601 int i; 2602 2603 ASSERT(MUTEX_HELD(&zd->zd_dirobj_lock)); 2604 2605 od += count - 1; 2606 2607 for (i = count - 1; i >= 0; i--, od--) { 2608 if (missing) { 2609 missing++; 2610 continue; 2611 } 2612 2613 /* 2614 * No object was found. 
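		 * od_object is still zero, so there is nothing to remove
		 * for this slot.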
2615 */ 2616 if (od->od_object == 0) 2617 continue; 2618 2619 lr_remove_t *lr = ztest_lr_alloc(sizeof (*lr), od->od_name); 2620 2621 lr->lr_doid = od->od_dir; 2622 2623 if ((error = ztest_replay_remove(zd, lr, B_FALSE)) != 0) { 2624 ASSERT3U(error, ==, ENOSPC); 2625 missing++; 2626 } else { 2627 od->od_object = 0; 2628 } 2629 ztest_lr_free(lr, sizeof (*lr), od->od_name); 2630 } 2631 2632 return (missing); 2633 } 2634 2635 static int 2636 ztest_write(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size, 2637 void *data) 2638 { 2639 lr_write_t *lr; 2640 int error; 2641 2642 lr = ztest_lr_alloc(sizeof (*lr) + size, NULL); 2643 2644 lr->lr_foid = object; 2645 lr->lr_offset = offset; 2646 lr->lr_length = size; 2647 lr->lr_blkoff = 0; 2648 BP_ZERO(&lr->lr_blkptr); 2649 2650 memcpy(lr + 1, data, size); 2651 2652 error = ztest_replay_write(zd, lr, B_FALSE); 2653 2654 ztest_lr_free(lr, sizeof (*lr) + size, NULL); 2655 2656 return (error); 2657 } 2658 2659 static int 2660 ztest_truncate(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size) 2661 { 2662 lr_truncate_t *lr; 2663 int error; 2664 2665 lr = ztest_lr_alloc(sizeof (*lr), NULL); 2666 2667 lr->lr_foid = object; 2668 lr->lr_offset = offset; 2669 lr->lr_length = size; 2670 2671 error = ztest_replay_truncate(zd, lr, B_FALSE); 2672 2673 ztest_lr_free(lr, sizeof (*lr), NULL); 2674 2675 return (error); 2676 } 2677 2678 static int 2679 ztest_setattr(ztest_ds_t *zd, uint64_t object) 2680 { 2681 lr_setattr_t *lr; 2682 int error; 2683 2684 lr = ztest_lr_alloc(sizeof (*lr), NULL); 2685 2686 lr->lr_foid = object; 2687 lr->lr_size = 0; 2688 lr->lr_mode = 0; 2689 2690 error = ztest_replay_setattr(zd, lr, B_FALSE); 2691 2692 ztest_lr_free(lr, sizeof (*lr), NULL); 2693 2694 return (error); 2695 } 2696 2697 static void 2698 ztest_prealloc(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size) 2699 { 2700 objset_t *os = zd->zd_os; 2701 dmu_tx_t *tx; 2702 uint64_t txg; 2703 rl_t *rl; 2704 2705 txg_wait_synced(dmu_objset_pool(os), 0); 2706 2707 ztest_object_lock(zd, object, RL_READER); 2708 rl = ztest_range_lock(zd, object, offset, size, RL_WRITER); 2709 2710 tx = dmu_tx_create(os); 2711 2712 dmu_tx_hold_write(tx, object, offset, size); 2713 2714 txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); 2715 2716 if (txg != 0) { 2717 dmu_prealloc(os, object, offset, size, tx); 2718 dmu_tx_commit(tx); 2719 txg_wait_synced(dmu_objset_pool(os), txg); 2720 } else { 2721 (void) dmu_free_long_range(os, object, offset, size); 2722 } 2723 2724 ztest_range_unlock(rl); 2725 ztest_object_unlock(zd, object); 2726 } 2727 2728 static void 2729 ztest_io(ztest_ds_t *zd, uint64_t object, uint64_t offset) 2730 { 2731 int err; 2732 ztest_block_tag_t wbt; 2733 dmu_object_info_t doi; 2734 enum ztest_io_type io_type; 2735 uint64_t blocksize; 2736 void *data; 2737 2738 VERIFY0(dmu_object_info(zd->zd_os, object, &doi)); 2739 blocksize = doi.doi_data_block_size; 2740 data = umem_alloc(blocksize, UMEM_NOFAIL); 2741 2742 /* 2743 * Pick an i/o type at random, biased toward writing block tags. 
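	 * (Half of all calls are forced to ZTEST_IO_WRITE_TAG just below.)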
2744 */ 2745 io_type = ztest_random(ZTEST_IO_TYPES); 2746 if (ztest_random(2) == 0) 2747 io_type = ZTEST_IO_WRITE_TAG; 2748 2749 (void) pthread_rwlock_rdlock(&zd->zd_zilog_lock); 2750 2751 switch (io_type) { 2752 2753 case ZTEST_IO_WRITE_TAG: 2754 ztest_bt_generate(&wbt, zd->zd_os, object, doi.doi_dnodesize, 2755 offset, 0, 0, 0); 2756 (void) ztest_write(zd, object, offset, sizeof (wbt), &wbt); 2757 break; 2758 2759 case ZTEST_IO_WRITE_PATTERN: 2760 (void) memset(data, 'a' + (object + offset) % 5, blocksize); 2761 if (ztest_random(2) == 0) { 2762 /* 2763 * Induce fletcher2 collisions to ensure that 2764 * zio_ddt_collision() detects and resolves them 2765 * when using fletcher2-verify for deduplication. 2766 */ 2767 ((uint64_t *)data)[0] ^= 1ULL << 63; 2768 ((uint64_t *)data)[4] ^= 1ULL << 63; 2769 } 2770 (void) ztest_write(zd, object, offset, blocksize, data); 2771 break; 2772 2773 case ZTEST_IO_WRITE_ZEROES: 2774 memset(data, 0, blocksize); 2775 (void) ztest_write(zd, object, offset, blocksize, data); 2776 break; 2777 2778 case ZTEST_IO_TRUNCATE: 2779 (void) ztest_truncate(zd, object, offset, blocksize); 2780 break; 2781 2782 case ZTEST_IO_SETATTR: 2783 (void) ztest_setattr(zd, object); 2784 break; 2785 default: 2786 break; 2787 2788 case ZTEST_IO_REWRITE: 2789 (void) pthread_rwlock_rdlock(&ztest_name_lock); 2790 err = ztest_dsl_prop_set_uint64(zd->zd_name, 2791 ZFS_PROP_CHECKSUM, spa_dedup_checksum(ztest_spa), 2792 B_FALSE); 2793 VERIFY(err == 0 || err == ENOSPC); 2794 err = ztest_dsl_prop_set_uint64(zd->zd_name, 2795 ZFS_PROP_COMPRESSION, 2796 ztest_random_dsl_prop(ZFS_PROP_COMPRESSION), 2797 B_FALSE); 2798 VERIFY(err == 0 || err == ENOSPC); 2799 (void) pthread_rwlock_unlock(&ztest_name_lock); 2800 2801 VERIFY0(dmu_read(zd->zd_os, object, offset, blocksize, data, 2802 DMU_READ_NO_PREFETCH)); 2803 2804 (void) ztest_write(zd, object, offset, blocksize, data); 2805 break; 2806 } 2807 2808 (void) pthread_rwlock_unlock(&zd->zd_zilog_lock); 2809 2810 umem_free(data, blocksize); 2811 } 2812 2813 /* 2814 * Initialize an object description template. 2815 */ 2816 static void 2817 ztest_od_init(ztest_od_t *od, uint64_t id, const char *tag, uint64_t index, 2818 dmu_object_type_t type, uint64_t blocksize, uint64_t dnodesize, 2819 uint64_t gen) 2820 { 2821 od->od_dir = ZTEST_DIROBJ; 2822 od->od_object = 0; 2823 2824 od->od_crtype = type; 2825 od->od_crblocksize = blocksize ? blocksize : ztest_random_blocksize(); 2826 od->od_crdnodesize = dnodesize ? dnodesize : ztest_random_dnodesize(); 2827 od->od_crgen = gen; 2828 2829 od->od_type = DMU_OT_NONE; 2830 od->od_blocksize = 0; 2831 od->od_gen = 0; 2832 2833 (void) snprintf(od->od_name, sizeof (od->od_name), 2834 "%s(%"PRId64")[%"PRIu64"]", 2835 tag, id, index); 2836 } 2837 2838 /* 2839 * Lookup or create the objects for a test using the od template. 2840 * If the objects do not all exist, or if 'remove' is specified, 2841 * remove any existing objects and create new ones. Otherwise, 2842 * use the existing objects. 
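 *
 * A typical caller looks something like the sketch below (illustrative
 * only, not lifted from any one test; it simply combines the helpers
 * defined nearby):
 *
 *	ztest_od_t *od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL);
 *	ztest_od_init(od, id, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0, 0);
 *	if (ztest_object_init(zd, od, sizeof (ztest_od_t), B_FALSE) != 0)
 *		return;
 *	... operate on od->od_object ...
 *	umem_free(od, sizeof (ztest_od_t));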
2843 */ 2844 static int 2845 ztest_object_init(ztest_ds_t *zd, ztest_od_t *od, size_t size, boolean_t remove) 2846 { 2847 int count = size / sizeof (*od); 2848 int rv = 0; 2849 2850 mutex_enter(&zd->zd_dirobj_lock); 2851 if ((ztest_lookup(zd, od, count) != 0 || remove) && 2852 (ztest_remove(zd, od, count) != 0 || 2853 ztest_create(zd, od, count) != 0)) 2854 rv = -1; 2855 zd->zd_od = od; 2856 mutex_exit(&zd->zd_dirobj_lock); 2857 2858 return (rv); 2859 } 2860 2861 void 2862 ztest_zil_commit(ztest_ds_t *zd, uint64_t id) 2863 { 2864 (void) id; 2865 zilog_t *zilog = zd->zd_zilog; 2866 2867 (void) pthread_rwlock_rdlock(&zd->zd_zilog_lock); 2868 2869 zil_commit(zilog, ztest_random(ZTEST_OBJECTS)); 2870 2871 /* 2872 * Remember the committed values in zd, which is in parent/child 2873 * shared memory. If we die, the next iteration of ztest_run() 2874 * will verify that the log really does contain this record. 2875 */ 2876 mutex_enter(&zilog->zl_lock); 2877 ASSERT3P(zd->zd_shared, !=, NULL); 2878 ASSERT3U(zd->zd_shared->zd_seq, <=, zilog->zl_commit_lr_seq); 2879 zd->zd_shared->zd_seq = zilog->zl_commit_lr_seq; 2880 mutex_exit(&zilog->zl_lock); 2881 2882 (void) pthread_rwlock_unlock(&zd->zd_zilog_lock); 2883 } 2884 2885 /* 2886 * This function is designed to simulate the operations that occur during a 2887 * mount/unmount operation. We hold the dataset across these operations in an 2888 * attempt to expose any implicit assumptions about ZIL management. 2889 */ 2890 void 2891 ztest_zil_remount(ztest_ds_t *zd, uint64_t id) 2892 { 2893 (void) id; 2894 objset_t *os = zd->zd_os; 2895 2896 /* 2897 * We hold the ztest_vdev_lock so we don't cause problems with 2898 * other threads that wish to remove a log device, such as 2899 * ztest_device_removal(). 2900 */ 2901 mutex_enter(&ztest_vdev_lock); 2902 2903 /* 2904 * We grab the zd_dirobj_lock to ensure that no other thread is 2905 * updating the zil (i.e. adding in-memory log records) and the 2906 * zd_zilog_lock to block any I/O. 2907 */ 2908 mutex_enter(&zd->zd_dirobj_lock); 2909 (void) pthread_rwlock_wrlock(&zd->zd_zilog_lock); 2910 2911 /* zfsvfs_teardown() */ 2912 zil_close(zd->zd_zilog); 2913 2914 /* zfsvfs_setup() */ 2915 VERIFY3P(zil_open(os, ztest_get_data, NULL), ==, zd->zd_zilog); 2916 zil_replay(os, zd, ztest_replay_vector); 2917 2918 (void) pthread_rwlock_unlock(&zd->zd_zilog_lock); 2919 mutex_exit(&zd->zd_dirobj_lock); 2920 mutex_exit(&ztest_vdev_lock); 2921 } 2922 2923 /* 2924 * Verify that we can't destroy an active pool, create an existing pool, 2925 * or create a pool with a bad vdev spec. 2926 */ 2927 void 2928 ztest_spa_create_destroy(ztest_ds_t *zd, uint64_t id) 2929 { 2930 (void) zd, (void) id; 2931 ztest_shared_opts_t *zo = &ztest_opts; 2932 spa_t *spa; 2933 nvlist_t *nvroot; 2934 2935 if (zo->zo_mmp_test) 2936 return; 2937 2938 /* 2939 * Attempt to create using a bad file. 2940 */ 2941 nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, NULL, 0, 0, 1); 2942 VERIFY3U(ENOENT, ==, 2943 spa_create("ztest_bad_file", nvroot, NULL, NULL, NULL)); 2944 fnvlist_free(nvroot); 2945 2946 /* 2947 * Attempt to create using a bad mirror. 2948 */ 2949 nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, NULL, 0, 2, 1); 2950 VERIFY3U(ENOENT, ==, 2951 spa_create("ztest_bad_mirror", nvroot, NULL, NULL, NULL)); 2952 fnvlist_free(nvroot); 2953 2954 /* 2955 * Attempt to create an existing pool. It shouldn't matter 2956 * what's in the nvroot; we should fail with EEXIST. 
2957 */ 2958 (void) pthread_rwlock_rdlock(&ztest_name_lock); 2959 nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, NULL, 0, 0, 1); 2960 VERIFY3U(EEXIST, ==, 2961 spa_create(zo->zo_pool, nvroot, NULL, NULL, NULL)); 2962 fnvlist_free(nvroot); 2963 2964 /* 2965 * We open a reference to the spa and then we try to export it 2966 * expecting one of the following errors: 2967 * 2968 * EBUSY 2969 * Because of the reference we just opened. 2970 * 2971 * ZFS_ERR_EXPORT_IN_PROGRESS 2972 * For the case that there is another ztest thread doing 2973 * an export concurrently. 2974 */ 2975 VERIFY0(spa_open(zo->zo_pool, &spa, FTAG)); 2976 int error = spa_destroy(zo->zo_pool); 2977 if (error != EBUSY && error != ZFS_ERR_EXPORT_IN_PROGRESS) { 2978 fatal(B_FALSE, "spa_destroy(%s) returned unexpected value %d", 2979 spa->spa_name, error); 2980 } 2981 spa_close(spa, FTAG); 2982 2983 (void) pthread_rwlock_unlock(&ztest_name_lock); 2984 } 2985 2986 /* 2987 * Start and then stop the MMP threads to ensure the startup and shutdown code 2988 * works properly. Actual protection and property-related code tested via ZTS. 2989 */ 2990 void 2991 ztest_mmp_enable_disable(ztest_ds_t *zd, uint64_t id) 2992 { 2993 (void) zd, (void) id; 2994 ztest_shared_opts_t *zo = &ztest_opts; 2995 spa_t *spa = ztest_spa; 2996 2997 if (zo->zo_mmp_test) 2998 return; 2999 3000 /* 3001 * Since enabling MMP involves setting a property, it could not be done 3002 * while the pool is suspended. 3003 */ 3004 if (spa_suspended(spa)) 3005 return; 3006 3007 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 3008 mutex_enter(&spa->spa_props_lock); 3009 3010 zfs_multihost_fail_intervals = 0; 3011 3012 if (!spa_multihost(spa)) { 3013 spa->spa_multihost = B_TRUE; 3014 mmp_thread_start(spa); 3015 } 3016 3017 mutex_exit(&spa->spa_props_lock); 3018 spa_config_exit(spa, SCL_CONFIG, FTAG); 3019 3020 txg_wait_synced(spa_get_dsl(spa), 0); 3021 mmp_signal_all_threads(); 3022 txg_wait_synced(spa_get_dsl(spa), 0); 3023 3024 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 3025 mutex_enter(&spa->spa_props_lock); 3026 3027 if (spa_multihost(spa)) { 3028 mmp_thread_stop(spa); 3029 spa->spa_multihost = B_FALSE; 3030 } 3031 3032 mutex_exit(&spa->spa_props_lock); 3033 spa_config_exit(spa, SCL_CONFIG, FTAG); 3034 } 3035 3036 void 3037 ztest_spa_upgrade(ztest_ds_t *zd, uint64_t id) 3038 { 3039 (void) zd, (void) id; 3040 spa_t *spa; 3041 uint64_t initial_version = SPA_VERSION_INITIAL; 3042 uint64_t version, newversion; 3043 nvlist_t *nvroot, *props; 3044 char *name; 3045 3046 if (ztest_opts.zo_mmp_test) 3047 return; 3048 3049 /* dRAID added after feature flags, skip upgrade test. */ 3050 if (strcmp(ztest_opts.zo_raid_type, VDEV_TYPE_DRAID) == 0) 3051 return; 3052 3053 mutex_enter(&ztest_vdev_lock); 3054 name = kmem_asprintf("%s_upgrade", ztest_opts.zo_pool); 3055 3056 /* 3057 * Clean up from previous runs. 3058 */ 3059 (void) spa_destroy(name); 3060 3061 nvroot = make_vdev_root(NULL, NULL, name, ztest_opts.zo_vdev_size, 0, 3062 NULL, ztest_opts.zo_raid_children, ztest_opts.zo_mirrors, 1); 3063 3064 /* 3065 * If we're configuring a RAIDZ device then make sure that the 3066 * initial version is capable of supporting that feature. 
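	 * (Parity 2 needs at least SPA_VERSION_RAIDZ2 and parity 3 needs
	 * SPA_VERSION_RAIDZ3; the switch below picks accordingly.)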
3067 */ 3068 switch (ztest_opts.zo_raid_parity) { 3069 case 0: 3070 case 1: 3071 initial_version = SPA_VERSION_INITIAL; 3072 break; 3073 case 2: 3074 initial_version = SPA_VERSION_RAIDZ2; 3075 break; 3076 case 3: 3077 initial_version = SPA_VERSION_RAIDZ3; 3078 break; 3079 } 3080 3081 /* 3082 * Create a pool with a spa version that can be upgraded. Pick 3083 * a value between initial_version and SPA_VERSION_BEFORE_FEATURES. 3084 */ 3085 do { 3086 version = ztest_random_spa_version(initial_version); 3087 } while (version > SPA_VERSION_BEFORE_FEATURES); 3088 3089 props = fnvlist_alloc(); 3090 fnvlist_add_uint64(props, 3091 zpool_prop_to_name(ZPOOL_PROP_VERSION), version); 3092 VERIFY0(spa_create(name, nvroot, props, NULL, NULL)); 3093 fnvlist_free(nvroot); 3094 fnvlist_free(props); 3095 3096 VERIFY0(spa_open(name, &spa, FTAG)); 3097 VERIFY3U(spa_version(spa), ==, version); 3098 newversion = ztest_random_spa_version(version + 1); 3099 3100 if (ztest_opts.zo_verbose >= 4) { 3101 (void) printf("upgrading spa version from " 3102 "%"PRIu64" to %"PRIu64"\n", 3103 version, newversion); 3104 } 3105 3106 spa_upgrade(spa, newversion); 3107 VERIFY3U(spa_version(spa), >, version); 3108 VERIFY3U(spa_version(spa), ==, fnvlist_lookup_uint64(spa->spa_config, 3109 zpool_prop_to_name(ZPOOL_PROP_VERSION))); 3110 spa_close(spa, FTAG); 3111 3112 kmem_strfree(name); 3113 mutex_exit(&ztest_vdev_lock); 3114 } 3115 3116 static void 3117 ztest_spa_checkpoint(spa_t *spa) 3118 { 3119 ASSERT(MUTEX_HELD(&ztest_checkpoint_lock)); 3120 3121 int error = spa_checkpoint(spa->spa_name); 3122 3123 switch (error) { 3124 case 0: 3125 case ZFS_ERR_DEVRM_IN_PROGRESS: 3126 case ZFS_ERR_DISCARDING_CHECKPOINT: 3127 case ZFS_ERR_CHECKPOINT_EXISTS: 3128 break; 3129 case ENOSPC: 3130 ztest_record_enospc(FTAG); 3131 break; 3132 default: 3133 fatal(B_FALSE, "spa_checkpoint(%s) = %d", spa->spa_name, error); 3134 } 3135 } 3136 3137 static void 3138 ztest_spa_discard_checkpoint(spa_t *spa) 3139 { 3140 ASSERT(MUTEX_HELD(&ztest_checkpoint_lock)); 3141 3142 int error = spa_checkpoint_discard(spa->spa_name); 3143 3144 switch (error) { 3145 case 0: 3146 case ZFS_ERR_DISCARDING_CHECKPOINT: 3147 case ZFS_ERR_NO_CHECKPOINT: 3148 break; 3149 default: 3150 fatal(B_FALSE, "spa_discard_checkpoint(%s) = %d", 3151 spa->spa_name, error); 3152 } 3153 3154 } 3155 3156 void 3157 ztest_spa_checkpoint_create_discard(ztest_ds_t *zd, uint64_t id) 3158 { 3159 (void) zd, (void) id; 3160 spa_t *spa = ztest_spa; 3161 3162 mutex_enter(&ztest_checkpoint_lock); 3163 if (ztest_random(2) == 0) { 3164 ztest_spa_checkpoint(spa); 3165 } else { 3166 ztest_spa_discard_checkpoint(spa); 3167 } 3168 mutex_exit(&ztest_checkpoint_lock); 3169 } 3170 3171 3172 static vdev_t * 3173 vdev_lookup_by_path(vdev_t *vd, const char *path) 3174 { 3175 vdev_t *mvd; 3176 int c; 3177 3178 if (vd->vdev_path != NULL && strcmp(path, vd->vdev_path) == 0) 3179 return (vd); 3180 3181 for (c = 0; c < vd->vdev_children; c++) 3182 if ((mvd = vdev_lookup_by_path(vd->vdev_child[c], path)) != 3183 NULL) 3184 return (mvd); 3185 3186 return (NULL); 3187 } 3188 3189 static int 3190 spa_num_top_vdevs(spa_t *spa) 3191 { 3192 vdev_t *rvd = spa->spa_root_vdev; 3193 ASSERT3U(spa_config_held(spa, SCL_VDEV, RW_READER), ==, SCL_VDEV); 3194 return (rvd->vdev_children); 3195 } 3196 3197 /* 3198 * Verify that vdev_add() works as expected. 
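 * When the pool already has slogs, 1/4 of the time this removes a log
 * device instead, so spa_vdev_remove() gets exercised here as well.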
3199 */ 3200 void 3201 ztest_vdev_add_remove(ztest_ds_t *zd, uint64_t id) 3202 { 3203 (void) zd, (void) id; 3204 ztest_shared_t *zs = ztest_shared; 3205 spa_t *spa = ztest_spa; 3206 uint64_t leaves; 3207 uint64_t guid; 3208 nvlist_t *nvroot; 3209 int error; 3210 3211 if (ztest_opts.zo_mmp_test) 3212 return; 3213 3214 mutex_enter(&ztest_vdev_lock); 3215 leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * 3216 ztest_opts.zo_raid_children; 3217 3218 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 3219 3220 ztest_shared->zs_vdev_next_leaf = spa_num_top_vdevs(spa) * leaves; 3221 3222 /* 3223 * If we have slogs then remove them 1/4 of the time. 3224 */ 3225 if (spa_has_slogs(spa) && ztest_random(4) == 0) { 3226 metaslab_group_t *mg; 3227 3228 /* 3229 * find the first real slog in log allocation class 3230 */ 3231 mg = spa_log_class(spa)->mc_allocator[0].mca_rotor; 3232 while (!mg->mg_vd->vdev_islog) 3233 mg = mg->mg_next; 3234 3235 guid = mg->mg_vd->vdev_guid; 3236 3237 spa_config_exit(spa, SCL_VDEV, FTAG); 3238 3239 /* 3240 * We have to grab the zs_name_lock as writer to 3241 * prevent a race between removing a slog (dmu_objset_find) 3242 * and destroying a dataset. Removing the slog will 3243 * grab a reference on the dataset which may cause 3244 * dsl_destroy_head() to fail with EBUSY thus 3245 * leaving the dataset in an inconsistent state. 3246 */ 3247 pthread_rwlock_wrlock(&ztest_name_lock); 3248 error = spa_vdev_remove(spa, guid, B_FALSE); 3249 pthread_rwlock_unlock(&ztest_name_lock); 3250 3251 switch (error) { 3252 case 0: 3253 case EEXIST: /* Generic zil_reset() error */ 3254 case EBUSY: /* Replay required */ 3255 case EACCES: /* Crypto key not loaded */ 3256 case ZFS_ERR_CHECKPOINT_EXISTS: 3257 case ZFS_ERR_DISCARDING_CHECKPOINT: 3258 break; 3259 default: 3260 fatal(B_FALSE, "spa_vdev_remove() = %d", error); 3261 } 3262 } else { 3263 spa_config_exit(spa, SCL_VDEV, FTAG); 3264 3265 /* 3266 * Make 1/4 of the devices be log devices 3267 */ 3268 nvroot = make_vdev_root(NULL, NULL, NULL, 3269 ztest_opts.zo_vdev_size, 0, (ztest_random(4) == 0) ? 3270 "log" : NULL, ztest_opts.zo_raid_children, zs->zs_mirrors, 3271 1); 3272 3273 error = spa_vdev_add(spa, nvroot); 3274 fnvlist_free(nvroot); 3275 3276 switch (error) { 3277 case 0: 3278 break; 3279 case ENOSPC: 3280 ztest_record_enospc("spa_vdev_add"); 3281 break; 3282 default: 3283 fatal(B_FALSE, "spa_vdev_add() = %d", error); 3284 } 3285 } 3286 3287 mutex_exit(&ztest_vdev_lock); 3288 } 3289 3290 void 3291 ztest_vdev_class_add(ztest_ds_t *zd, uint64_t id) 3292 { 3293 (void) zd, (void) id; 3294 ztest_shared_t *zs = ztest_shared; 3295 spa_t *spa = ztest_spa; 3296 uint64_t leaves; 3297 nvlist_t *nvroot; 3298 const char *class = (ztest_random(2) == 0) ? 
3299 VDEV_ALLOC_BIAS_SPECIAL : VDEV_ALLOC_BIAS_DEDUP; 3300 int error; 3301 3302 /* 3303 * By default add a special vdev 50% of the time 3304 */ 3305 if ((ztest_opts.zo_special_vdevs == ZTEST_VDEV_CLASS_OFF) || 3306 (ztest_opts.zo_special_vdevs == ZTEST_VDEV_CLASS_RND && 3307 ztest_random(2) == 0)) { 3308 return; 3309 } 3310 3311 mutex_enter(&ztest_vdev_lock); 3312 3313 /* Only test with mirrors */ 3314 if (zs->zs_mirrors < 2) { 3315 mutex_exit(&ztest_vdev_lock); 3316 return; 3317 } 3318 3319 /* requires feature@allocation_classes */ 3320 if (!spa_feature_is_enabled(spa, SPA_FEATURE_ALLOCATION_CLASSES)) { 3321 mutex_exit(&ztest_vdev_lock); 3322 return; 3323 } 3324 3325 leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * 3326 ztest_opts.zo_raid_children; 3327 3328 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 3329 ztest_shared->zs_vdev_next_leaf = spa_num_top_vdevs(spa) * leaves; 3330 spa_config_exit(spa, SCL_VDEV, FTAG); 3331 3332 nvroot = make_vdev_root(NULL, NULL, NULL, ztest_opts.zo_vdev_size, 0, 3333 class, ztest_opts.zo_raid_children, zs->zs_mirrors, 1); 3334 3335 error = spa_vdev_add(spa, nvroot); 3336 fnvlist_free(nvroot); 3337 3338 if (error == ENOSPC) 3339 ztest_record_enospc("spa_vdev_add"); 3340 else if (error != 0) 3341 fatal(B_FALSE, "spa_vdev_add() = %d", error); 3342 3343 /* 3344 * 50% of the time allow small blocks in the special class 3345 */ 3346 if (error == 0 && 3347 spa_special_class(spa)->mc_groups == 1 && ztest_random(2) == 0) { 3348 if (ztest_opts.zo_verbose >= 3) 3349 (void) printf("Enabling special VDEV small blocks\n"); 3350 (void) ztest_dsl_prop_set_uint64(zd->zd_name, 3351 ZFS_PROP_SPECIAL_SMALL_BLOCKS, 32768, B_FALSE); 3352 } 3353 3354 mutex_exit(&ztest_vdev_lock); 3355 3356 if (ztest_opts.zo_verbose >= 3) { 3357 metaslab_class_t *mc; 3358 3359 if (strcmp(class, VDEV_ALLOC_BIAS_SPECIAL) == 0) 3360 mc = spa_special_class(spa); 3361 else 3362 mc = spa_dedup_class(spa); 3363 (void) printf("Added a %s mirrored vdev (of %d)\n", 3364 class, (int)mc->mc_groups); 3365 } 3366 } 3367 3368 /* 3369 * Verify that adding/removing aux devices (l2arc, hot spare) works as expected. 3370 */ 3371 void 3372 ztest_vdev_aux_add_remove(ztest_ds_t *zd, uint64_t id) 3373 { 3374 (void) zd, (void) id; 3375 ztest_shared_t *zs = ztest_shared; 3376 spa_t *spa = ztest_spa; 3377 vdev_t *rvd = spa->spa_root_vdev; 3378 spa_aux_vdev_t *sav; 3379 const char *aux; 3380 char *path; 3381 uint64_t guid = 0; 3382 int error, ignore_err = 0; 3383 3384 if (ztest_opts.zo_mmp_test) 3385 return; 3386 3387 path = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); 3388 3389 if (ztest_random(2) == 0) { 3390 sav = &spa->spa_spares; 3391 aux = ZPOOL_CONFIG_SPARES; 3392 } else { 3393 sav = &spa->spa_l2cache; 3394 aux = ZPOOL_CONFIG_L2CACHE; 3395 } 3396 3397 mutex_enter(&ztest_vdev_lock); 3398 3399 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 3400 3401 if (sav->sav_count != 0 && ztest_random(4) == 0) { 3402 /* 3403 * Pick a random device to remove. 3404 */ 3405 vdev_t *svd = sav->sav_vdevs[ztest_random(sav->sav_count)]; 3406 3407 /* dRAID spares cannot be removed; try anyways to see ENOTSUP */ 3408 if (strstr(svd->vdev_path, VDEV_TYPE_DRAID) != NULL) 3409 ignore_err = ENOTSUP; 3410 3411 guid = svd->vdev_guid; 3412 } else { 3413 /* 3414 * Find an unused device we can add. 
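		 * We probe candidate paths built from ztest_aux_template
		 * until we find one that is neither in the aux list nor
		 * present in the main vdev tree.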
3415 */ 3416 zs->zs_vdev_aux = 0; 3417 for (;;) { 3418 int c; 3419 (void) snprintf(path, MAXPATHLEN, ztest_aux_template, 3420 ztest_opts.zo_dir, ztest_opts.zo_pool, aux, 3421 zs->zs_vdev_aux); 3422 for (c = 0; c < sav->sav_count; c++) 3423 if (strcmp(sav->sav_vdevs[c]->vdev_path, 3424 path) == 0) 3425 break; 3426 if (c == sav->sav_count && 3427 vdev_lookup_by_path(rvd, path) == NULL) 3428 break; 3429 zs->zs_vdev_aux++; 3430 } 3431 } 3432 3433 spa_config_exit(spa, SCL_VDEV, FTAG); 3434 3435 if (guid == 0) { 3436 /* 3437 * Add a new device. 3438 */ 3439 nvlist_t *nvroot = make_vdev_root(NULL, aux, NULL, 3440 (ztest_opts.zo_vdev_size * 5) / 4, 0, NULL, 0, 0, 1); 3441 error = spa_vdev_add(spa, nvroot); 3442 3443 switch (error) { 3444 case 0: 3445 break; 3446 default: 3447 fatal(B_FALSE, "spa_vdev_add(%p) = %d", nvroot, error); 3448 } 3449 fnvlist_free(nvroot); 3450 } else { 3451 /* 3452 * Remove an existing device. Sometimes, dirty its 3453 * vdev state first to make sure we handle removal 3454 * of devices that have pending state changes. 3455 */ 3456 if (ztest_random(2) == 0) 3457 (void) vdev_online(spa, guid, 0, NULL); 3458 3459 error = spa_vdev_remove(spa, guid, B_FALSE); 3460 3461 switch (error) { 3462 case 0: 3463 case EBUSY: 3464 case ZFS_ERR_CHECKPOINT_EXISTS: 3465 case ZFS_ERR_DISCARDING_CHECKPOINT: 3466 break; 3467 default: 3468 if (error != ignore_err) 3469 fatal(B_FALSE, 3470 "spa_vdev_remove(%"PRIu64") = %d", 3471 guid, error); 3472 } 3473 } 3474 3475 mutex_exit(&ztest_vdev_lock); 3476 3477 umem_free(path, MAXPATHLEN); 3478 } 3479 3480 /* 3481 * split a pool if it has mirror tlvdevs 3482 */ 3483 void 3484 ztest_split_pool(ztest_ds_t *zd, uint64_t id) 3485 { 3486 (void) zd, (void) id; 3487 ztest_shared_t *zs = ztest_shared; 3488 spa_t *spa = ztest_spa; 3489 vdev_t *rvd = spa->spa_root_vdev; 3490 nvlist_t *tree, **child, *config, *split, **schild; 3491 uint_t c, children, schildren = 0, lastlogid = 0; 3492 int error = 0; 3493 3494 if (ztest_opts.zo_mmp_test) 3495 return; 3496 3497 mutex_enter(&ztest_vdev_lock); 3498 3499 /* ensure we have a usable config; mirrors of raidz aren't supported */ 3500 if (zs->zs_mirrors < 3 || ztest_opts.zo_raid_children > 1) { 3501 mutex_exit(&ztest_vdev_lock); 3502 return; 3503 } 3504 3505 /* clean up the old pool, if any */ 3506 (void) spa_destroy("splitp"); 3507 3508 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 3509 3510 /* generate a config from the existing config */ 3511 mutex_enter(&spa->spa_props_lock); 3512 tree = fnvlist_lookup_nvlist(spa->spa_config, ZPOOL_CONFIG_VDEV_TREE); 3513 mutex_exit(&spa->spa_props_lock); 3514 3515 VERIFY0(nvlist_lookup_nvlist_array(tree, ZPOOL_CONFIG_CHILDREN, 3516 &child, &children)); 3517 3518 schild = umem_alloc(rvd->vdev_children * sizeof (nvlist_t *), 3519 UMEM_NOFAIL); 3520 for (c = 0; c < children; c++) { 3521 vdev_t *tvd = rvd->vdev_child[c]; 3522 nvlist_t **mchild; 3523 uint_t mchildren; 3524 3525 if (tvd->vdev_islog || tvd->vdev_ops == &vdev_hole_ops) { 3526 schild[schildren] = fnvlist_alloc(); 3527 fnvlist_add_string(schild[schildren], 3528 ZPOOL_CONFIG_TYPE, VDEV_TYPE_HOLE); 3529 fnvlist_add_uint64(schild[schildren], 3530 ZPOOL_CONFIG_IS_HOLE, 1); 3531 if (lastlogid == 0) 3532 lastlogid = schildren; 3533 ++schildren; 3534 continue; 3535 } 3536 lastlogid = 0; 3537 VERIFY0(nvlist_lookup_nvlist_array(child[c], 3538 ZPOOL_CONFIG_CHILDREN, &mchild, &mchildren)); 3539 schild[schildren++] = fnvlist_dup(mchild[0]); 3540 } 3541 3542 /* OK, create a config that can be used to split */ 3543 split = 
fnvlist_alloc(); 3544 fnvlist_add_string(split, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT); 3545 fnvlist_add_nvlist_array(split, ZPOOL_CONFIG_CHILDREN, 3546 (const nvlist_t **)schild, lastlogid != 0 ? lastlogid : schildren); 3547 3548 config = fnvlist_alloc(); 3549 fnvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, split); 3550 3551 for (c = 0; c < schildren; c++) 3552 fnvlist_free(schild[c]); 3553 umem_free(schild, rvd->vdev_children * sizeof (nvlist_t *)); 3554 fnvlist_free(split); 3555 3556 spa_config_exit(spa, SCL_VDEV, FTAG); 3557 3558 (void) pthread_rwlock_wrlock(&ztest_name_lock); 3559 error = spa_vdev_split_mirror(spa, "splitp", config, NULL, B_FALSE); 3560 (void) pthread_rwlock_unlock(&ztest_name_lock); 3561 3562 fnvlist_free(config); 3563 3564 if (error == 0) { 3565 (void) printf("successful split - results:\n"); 3566 mutex_enter(&spa_namespace_lock); 3567 show_pool_stats(spa); 3568 show_pool_stats(spa_lookup("splitp")); 3569 mutex_exit(&spa_namespace_lock); 3570 ++zs->zs_splits; 3571 --zs->zs_mirrors; 3572 } 3573 mutex_exit(&ztest_vdev_lock); 3574 } 3575 3576 /* 3577 * Verify that we can attach and detach devices. 3578 */ 3579 void 3580 ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id) 3581 { 3582 (void) zd, (void) id; 3583 ztest_shared_t *zs = ztest_shared; 3584 spa_t *spa = ztest_spa; 3585 spa_aux_vdev_t *sav = &spa->spa_spares; 3586 vdev_t *rvd = spa->spa_root_vdev; 3587 vdev_t *oldvd, *newvd, *pvd; 3588 nvlist_t *root; 3589 uint64_t leaves; 3590 uint64_t leaf, top; 3591 uint64_t ashift = ztest_get_ashift(); 3592 uint64_t oldguid, pguid; 3593 uint64_t oldsize, newsize; 3594 char *oldpath, *newpath; 3595 int replacing; 3596 int oldvd_has_siblings = B_FALSE; 3597 int newvd_is_spare = B_FALSE; 3598 int newvd_is_dspare = B_FALSE; 3599 int oldvd_is_log; 3600 int error, expected_error; 3601 3602 if (ztest_opts.zo_mmp_test) 3603 return; 3604 3605 oldpath = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); 3606 newpath = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); 3607 3608 mutex_enter(&ztest_vdev_lock); 3609 leaves = MAX(zs->zs_mirrors, 1) * ztest_opts.zo_raid_children; 3610 3611 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3612 3613 /* 3614 * If a vdev is in the process of being removed, its removal may 3615 * finish while we are in progress, leading to an unexpected error 3616 * value. Don't bother trying to attach while we are in the middle 3617 * of removal. 3618 */ 3619 if (ztest_device_removal_active) { 3620 spa_config_exit(spa, SCL_ALL, FTAG); 3621 goto out; 3622 } 3623 3624 /* 3625 * Decide whether to do an attach or a replace. 3626 */ 3627 replacing = ztest_random(2); 3628 3629 /* 3630 * Pick a random top-level vdev. 3631 */ 3632 top = ztest_random_vdev_top(spa, B_TRUE); 3633 3634 /* 3635 * Pick a random leaf within it. 3636 */ 3637 leaf = ztest_random(leaves); 3638 3639 /* 3640 * Locate this vdev. 
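	 * Starting from the chosen top-level vdev, walk down through the
	 * mirror and raidz/draid layers (and any in-progress replacing or
	 * spare vdevs) until we reach the selected leaf.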
3641 */ 3642 oldvd = rvd->vdev_child[top]; 3643 3644 /* pick a child from the mirror */ 3645 if (zs->zs_mirrors >= 1) { 3646 ASSERT3P(oldvd->vdev_ops, ==, &vdev_mirror_ops); 3647 ASSERT3U(oldvd->vdev_children, >=, zs->zs_mirrors); 3648 oldvd = oldvd->vdev_child[leaf / ztest_opts.zo_raid_children]; 3649 } 3650 3651 /* pick a child out of the raidz group */ 3652 if (ztest_opts.zo_raid_children > 1) { 3653 if (strcmp(oldvd->vdev_ops->vdev_op_type, "raidz") == 0) 3654 ASSERT3P(oldvd->vdev_ops, ==, &vdev_raidz_ops); 3655 else 3656 ASSERT3P(oldvd->vdev_ops, ==, &vdev_draid_ops); 3657 ASSERT3U(oldvd->vdev_children, ==, ztest_opts.zo_raid_children); 3658 oldvd = oldvd->vdev_child[leaf % ztest_opts.zo_raid_children]; 3659 } 3660 3661 /* 3662 * If we're already doing an attach or replace, oldvd may be a 3663 * mirror vdev -- in which case, pick a random child. 3664 */ 3665 while (oldvd->vdev_children != 0) { 3666 oldvd_has_siblings = B_TRUE; 3667 ASSERT3U(oldvd->vdev_children, >=, 2); 3668 oldvd = oldvd->vdev_child[ztest_random(oldvd->vdev_children)]; 3669 } 3670 3671 oldguid = oldvd->vdev_guid; 3672 oldsize = vdev_get_min_asize(oldvd); 3673 oldvd_is_log = oldvd->vdev_top->vdev_islog; 3674 (void) strlcpy(oldpath, oldvd->vdev_path, MAXPATHLEN); 3675 pvd = oldvd->vdev_parent; 3676 pguid = pvd->vdev_guid; 3677 3678 /* 3679 * If oldvd has siblings, then half of the time, detach it. Prior 3680 * to the detach the pool is scrubbed in order to prevent creating 3681 * unrepairable blocks as a result of the data corruption injection. 3682 */ 3683 if (oldvd_has_siblings && ztest_random(2) == 0) { 3684 spa_config_exit(spa, SCL_ALL, FTAG); 3685 3686 error = ztest_scrub_impl(spa); 3687 if (error) 3688 goto out; 3689 3690 error = spa_vdev_detach(spa, oldguid, pguid, B_FALSE); 3691 if (error != 0 && error != ENODEV && error != EBUSY && 3692 error != ENOTSUP && error != ZFS_ERR_CHECKPOINT_EXISTS && 3693 error != ZFS_ERR_DISCARDING_CHECKPOINT) 3694 fatal(B_FALSE, "detach (%s) returned %d", 3695 oldpath, error); 3696 goto out; 3697 } 3698 3699 /* 3700 * For the new vdev, choose with equal probability between the two 3701 * standard paths (ending in either 'a' or 'b') or a random hot spare. 3702 */ 3703 if (sav->sav_count != 0 && ztest_random(3) == 0) { 3704 newvd = sav->sav_vdevs[ztest_random(sav->sav_count)]; 3705 newvd_is_spare = B_TRUE; 3706 3707 if (newvd->vdev_ops == &vdev_draid_spare_ops) 3708 newvd_is_dspare = B_TRUE; 3709 3710 (void) strlcpy(newpath, newvd->vdev_path, MAXPATHLEN); 3711 } else { 3712 (void) snprintf(newpath, MAXPATHLEN, ztest_dev_template, 3713 ztest_opts.zo_dir, ztest_opts.zo_pool, 3714 top * leaves + leaf); 3715 if (ztest_random(2) == 0) 3716 newpath[strlen(newpath) - 1] = 'b'; 3717 newvd = vdev_lookup_by_path(rvd, newpath); 3718 } 3719 3720 if (newvd) { 3721 /* 3722 * Reopen to ensure the vdev's asize field isn't stale. 3723 */ 3724 vdev_reopen(newvd); 3725 newsize = vdev_get_min_asize(newvd); 3726 } else { 3727 /* 3728 * Make newsize a little bigger or smaller than oldsize. 3729 * If it's smaller, the attach should fail. 3730 * If it's larger, and we're doing a replace, 3731 * we should get dynamic LUN growth when we're done. 3732 */ 3733 newsize = 10 * oldsize / (9 + ztest_random(3)); 3734 } 3735 3736 /* 3737 * If pvd is not a mirror or root, the attach should fail with ENOTSUP, 3738 * unless it's a replace; in that case any non-replacing parent is OK. 3739 * 3740 * If newvd is already part of the pool, it should fail with EBUSY. 
3741 * 3742 * If newvd is too small, it should fail with EOVERFLOW. 3743 * 3744 * If newvd is a distributed spare and it's being attached to a 3745 * dRAID which is not its parent it should fail with EINVAL. 3746 */ 3747 if (pvd->vdev_ops != &vdev_mirror_ops && 3748 pvd->vdev_ops != &vdev_root_ops && (!replacing || 3749 pvd->vdev_ops == &vdev_replacing_ops || 3750 pvd->vdev_ops == &vdev_spare_ops)) 3751 expected_error = ENOTSUP; 3752 else if (newvd_is_spare && (!replacing || oldvd_is_log)) 3753 expected_error = ENOTSUP; 3754 else if (newvd == oldvd) 3755 expected_error = replacing ? 0 : EBUSY; 3756 else if (vdev_lookup_by_path(rvd, newpath) != NULL) 3757 expected_error = EBUSY; 3758 else if (!newvd_is_dspare && newsize < oldsize) 3759 expected_error = EOVERFLOW; 3760 else if (ashift > oldvd->vdev_top->vdev_ashift) 3761 expected_error = EDOM; 3762 else if (newvd_is_dspare && pvd != vdev_draid_spare_get_parent(newvd)) 3763 expected_error = ENOTSUP; 3764 else 3765 expected_error = 0; 3766 3767 spa_config_exit(spa, SCL_ALL, FTAG); 3768 3769 /* 3770 * Build the nvlist describing newpath. 3771 */ 3772 root = make_vdev_root(newpath, NULL, NULL, newvd == NULL ? newsize : 0, 3773 ashift, NULL, 0, 0, 1); 3774 3775 /* 3776 * When supported select either a healing or sequential resilver. 3777 */ 3778 boolean_t rebuilding = B_FALSE; 3779 if (pvd->vdev_ops == &vdev_mirror_ops || 3780 pvd->vdev_ops == &vdev_root_ops) { 3781 rebuilding = !!ztest_random(2); 3782 } 3783 3784 error = spa_vdev_attach(spa, oldguid, root, replacing, rebuilding); 3785 3786 fnvlist_free(root); 3787 3788 /* 3789 * If our parent was the replacing vdev, but the replace completed, 3790 * then instead of failing with ENOTSUP we may either succeed, 3791 * fail with ENODEV, or fail with EOVERFLOW. 3792 */ 3793 if (expected_error == ENOTSUP && 3794 (error == 0 || error == ENODEV || error == EOVERFLOW)) 3795 expected_error = error; 3796 3797 /* 3798 * If someone grew the LUN, the replacement may be too small. 3799 */ 3800 if (error == EOVERFLOW || error == EBUSY) 3801 expected_error = error; 3802 3803 if (error == ZFS_ERR_CHECKPOINT_EXISTS || 3804 error == ZFS_ERR_DISCARDING_CHECKPOINT || 3805 error == ZFS_ERR_RESILVER_IN_PROGRESS || 3806 error == ZFS_ERR_REBUILD_IN_PROGRESS) 3807 expected_error = error; 3808 3809 if (error != expected_error && expected_error != EBUSY) { 3810 fatal(B_FALSE, "attach (%s %"PRIu64", %s %"PRIu64", %d) " 3811 "returned %d, expected %d", 3812 oldpath, oldsize, newpath, 3813 newsize, replacing, error, expected_error); 3814 } 3815 out: 3816 mutex_exit(&ztest_vdev_lock); 3817 3818 umem_free(oldpath, MAXPATHLEN); 3819 umem_free(newpath, MAXPATHLEN); 3820 } 3821 3822 void 3823 ztest_device_removal(ztest_ds_t *zd, uint64_t id) 3824 { 3825 (void) zd, (void) id; 3826 spa_t *spa = ztest_spa; 3827 vdev_t *vd; 3828 uint64_t guid; 3829 int error; 3830 3831 mutex_enter(&ztest_vdev_lock); 3832 3833 if (ztest_device_removal_active) { 3834 mutex_exit(&ztest_vdev_lock); 3835 return; 3836 } 3837 3838 /* 3839 * Remove a random top-level vdev and wait for removal to finish. 
3840 */ 3841 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 3842 vd = vdev_lookup_top(spa, ztest_random_vdev_top(spa, B_FALSE)); 3843 guid = vd->vdev_guid; 3844 spa_config_exit(spa, SCL_VDEV, FTAG); 3845 3846 error = spa_vdev_remove(spa, guid, B_FALSE); 3847 if (error == 0) { 3848 ztest_device_removal_active = B_TRUE; 3849 mutex_exit(&ztest_vdev_lock); 3850 3851 /* 3852 * spa->spa_vdev_removal is created in a sync task that 3853 * is initiated via dsl_sync_task_nowait(). Since the 3854 * task may not run before spa_vdev_remove() returns, we 3855 * must wait at least 1 txg to ensure that the removal 3856 * struct has been created. 3857 */ 3858 txg_wait_synced(spa_get_dsl(spa), 0); 3859 3860 while (spa->spa_removing_phys.sr_state == DSS_SCANNING) 3861 txg_wait_synced(spa_get_dsl(spa), 0); 3862 } else { 3863 mutex_exit(&ztest_vdev_lock); 3864 return; 3865 } 3866 3867 /* 3868 * The pool needs to be scrubbed after completing device removal. 3869 * Failure to do so may result in checksum errors due to the 3870 * strategy employed by ztest_fault_inject() when selecting which 3871 * offset are redundant and can be damaged. 3872 */ 3873 error = spa_scan(spa, POOL_SCAN_SCRUB); 3874 if (error == 0) { 3875 while (dsl_scan_scrubbing(spa_get_dsl(spa))) 3876 txg_wait_synced(spa_get_dsl(spa), 0); 3877 } 3878 3879 mutex_enter(&ztest_vdev_lock); 3880 ztest_device_removal_active = B_FALSE; 3881 mutex_exit(&ztest_vdev_lock); 3882 } 3883 3884 /* 3885 * Callback function which expands the physical size of the vdev. 3886 */ 3887 static vdev_t * 3888 grow_vdev(vdev_t *vd, void *arg) 3889 { 3890 spa_t *spa __maybe_unused = vd->vdev_spa; 3891 size_t *newsize = arg; 3892 size_t fsize; 3893 int fd; 3894 3895 ASSERT3S(spa_config_held(spa, SCL_STATE, RW_READER), ==, SCL_STATE); 3896 ASSERT(vd->vdev_ops->vdev_op_leaf); 3897 3898 if ((fd = open(vd->vdev_path, O_RDWR)) == -1) 3899 return (vd); 3900 3901 fsize = lseek(fd, 0, SEEK_END); 3902 VERIFY0(ftruncate(fd, *newsize)); 3903 3904 if (ztest_opts.zo_verbose >= 6) { 3905 (void) printf("%s grew from %lu to %lu bytes\n", 3906 vd->vdev_path, (ulong_t)fsize, (ulong_t)*newsize); 3907 } 3908 (void) close(fd); 3909 return (NULL); 3910 } 3911 3912 /* 3913 * Callback function which expands a given vdev by calling vdev_online(). 3914 */ 3915 static vdev_t * 3916 online_vdev(vdev_t *vd, void *arg) 3917 { 3918 (void) arg; 3919 spa_t *spa = vd->vdev_spa; 3920 vdev_t *tvd = vd->vdev_top; 3921 uint64_t guid = vd->vdev_guid; 3922 uint64_t generation = spa->spa_config_generation + 1; 3923 vdev_state_t newstate = VDEV_STATE_UNKNOWN; 3924 int error; 3925 3926 ASSERT3S(spa_config_held(spa, SCL_STATE, RW_READER), ==, SCL_STATE); 3927 ASSERT(vd->vdev_ops->vdev_op_leaf); 3928 3929 /* Calling vdev_online will initialize the new metaslabs */ 3930 spa_config_exit(spa, SCL_STATE, spa); 3931 error = vdev_online(spa, guid, ZFS_ONLINE_EXPAND, &newstate); 3932 spa_config_enter(spa, SCL_STATE, spa, RW_READER); 3933 3934 /* 3935 * If vdev_online returned an error or the underlying vdev_open 3936 * failed then we abort the expand. The only way to know that 3937 * vdev_open fails is by checking the returned newstate. 3938 */ 3939 if (error || newstate != VDEV_STATE_HEALTHY) { 3940 if (ztest_opts.zo_verbose >= 5) { 3941 (void) printf("Unable to expand vdev, state %u, " 3942 "error %d\n", newstate, error); 3943 } 3944 return (vd); 3945 } 3946 ASSERT3U(newstate, ==, VDEV_STATE_HEALTHY); 3947 3948 /* 3949 * Since we dropped the lock we need to ensure that we're 3950 * still talking to the original vdev. 
It's possible this 3951 * vdev may have been detached/replaced while we were 3952 * trying to online it. 3953 */ 3954 if (generation != spa->spa_config_generation) { 3955 if (ztest_opts.zo_verbose >= 5) { 3956 (void) printf("vdev configuration has changed, " 3957 "guid %"PRIu64", state %"PRIu64", " 3958 "expected gen %"PRIu64", got gen %"PRIu64"\n", 3959 guid, 3960 tvd->vdev_state, 3961 generation, 3962 spa->spa_config_generation); 3963 } 3964 return (vd); 3965 } 3966 return (NULL); 3967 } 3968 3969 /* 3970 * Traverse the vdev tree calling the supplied function. 3971 * We continue to walk the tree until we either have walked all 3972 * children or we receive a non-NULL return from the callback. 3973 * If a NULL callback is passed, then we just return back the first 3974 * leaf vdev we encounter. 3975 */ 3976 static vdev_t * 3977 vdev_walk_tree(vdev_t *vd, vdev_t *(*func)(vdev_t *, void *), void *arg) 3978 { 3979 uint_t c; 3980 3981 if (vd->vdev_ops->vdev_op_leaf) { 3982 if (func == NULL) 3983 return (vd); 3984 else 3985 return (func(vd, arg)); 3986 } 3987 3988 for (c = 0; c < vd->vdev_children; c++) { 3989 vdev_t *cvd = vd->vdev_child[c]; 3990 if ((cvd = vdev_walk_tree(cvd, func, arg)) != NULL) 3991 return (cvd); 3992 } 3993 return (NULL); 3994 } 3995 3996 /* 3997 * Verify that dynamic LUN growth works as expected. 3998 */ 3999 void 4000 ztest_vdev_LUN_growth(ztest_ds_t *zd, uint64_t id) 4001 { 4002 (void) zd, (void) id; 4003 spa_t *spa = ztest_spa; 4004 vdev_t *vd, *tvd; 4005 metaslab_class_t *mc; 4006 metaslab_group_t *mg; 4007 size_t psize, newsize; 4008 uint64_t top; 4009 uint64_t old_class_space, new_class_space, old_ms_count, new_ms_count; 4010 4011 mutex_enter(&ztest_checkpoint_lock); 4012 mutex_enter(&ztest_vdev_lock); 4013 spa_config_enter(spa, SCL_STATE, spa, RW_READER); 4014 4015 /* 4016 * If there is a vdev removal in progress, it could complete while 4017 * we are running, in which case we would not be able to verify 4018 * that the metaslab_class space increased (because it decreases 4019 * when the device removal completes). 4020 */ 4021 if (ztest_device_removal_active) { 4022 spa_config_exit(spa, SCL_STATE, spa); 4023 mutex_exit(&ztest_vdev_lock); 4024 mutex_exit(&ztest_checkpoint_lock); 4025 return; 4026 } 4027 4028 top = ztest_random_vdev_top(spa, B_TRUE); 4029 4030 tvd = spa->spa_root_vdev->vdev_child[top]; 4031 mg = tvd->vdev_mg; 4032 mc = mg->mg_class; 4033 old_ms_count = tvd->vdev_ms_count; 4034 old_class_space = metaslab_class_get_space(mc); 4035 4036 /* 4037 * Determine the size of the first leaf vdev associated with 4038 * our top-level device. 4039 */ 4040 vd = vdev_walk_tree(tvd, NULL, NULL); 4041 ASSERT3P(vd, !=, NULL); 4042 ASSERT(vd->vdev_ops->vdev_op_leaf); 4043 4044 psize = vd->vdev_psize; 4045 4046 /* 4047 * We only try to expand the vdev if it's healthy, less than 4x its 4048 * original size, and it has a valid psize. 4049 */ 4050 if (tvd->vdev_state != VDEV_STATE_HEALTHY || 4051 psize == 0 || psize >= 4 * ztest_opts.zo_vdev_size) { 4052 spa_config_exit(spa, SCL_STATE, spa); 4053 mutex_exit(&ztest_vdev_lock); 4054 mutex_exit(&ztest_checkpoint_lock); 4055 return; 4056 } 4057 ASSERT3U(psize, >, 0); 4058 newsize = psize + MAX(psize / 8, SPA_MAXBLOCKSIZE); 4059 ASSERT3U(newsize, >, psize); 4060 4061 if (ztest_opts.zo_verbose >= 6) { 4062 (void) printf("Expanding LUN %s from %lu to %lu\n", 4063 vd->vdev_path, (ulong_t)psize, (ulong_t)newsize); 4064 } 4065 4066 /* 4067 * Growing the vdev is a two step process: 4068 * 1). expand the physical size (i.e. 
relabel) 4069 * 2). online the vdev to create the new metaslabs 4070 */ 4071 if (vdev_walk_tree(tvd, grow_vdev, &newsize) != NULL || 4072 vdev_walk_tree(tvd, online_vdev, NULL) != NULL || 4073 tvd->vdev_state != VDEV_STATE_HEALTHY) { 4074 if (ztest_opts.zo_verbose >= 5) { 4075 (void) printf("Could not expand LUN because " 4076 "the vdev configuration changed.\n"); 4077 } 4078 spa_config_exit(spa, SCL_STATE, spa); 4079 mutex_exit(&ztest_vdev_lock); 4080 mutex_exit(&ztest_checkpoint_lock); 4081 return; 4082 } 4083 4084 spa_config_exit(spa, SCL_STATE, spa); 4085 4086 /* 4087 * Expanding the LUN will update the config asynchronously, 4088 * thus we must wait for the async thread to complete any 4089 * pending tasks before proceeding. 4090 */ 4091 for (;;) { 4092 boolean_t done; 4093 mutex_enter(&spa->spa_async_lock); 4094 done = (spa->spa_async_thread == NULL && !spa->spa_async_tasks); 4095 mutex_exit(&spa->spa_async_lock); 4096 if (done) 4097 break; 4098 txg_wait_synced(spa_get_dsl(spa), 0); 4099 (void) poll(NULL, 0, 100); 4100 } 4101 4102 spa_config_enter(spa, SCL_STATE, spa, RW_READER); 4103 4104 tvd = spa->spa_root_vdev->vdev_child[top]; 4105 new_ms_count = tvd->vdev_ms_count; 4106 new_class_space = metaslab_class_get_space(mc); 4107 4108 if (tvd->vdev_mg != mg || mg->mg_class != mc) { 4109 if (ztest_opts.zo_verbose >= 5) { 4110 (void) printf("Could not verify LUN expansion due to " 4111 "intervening vdev offline or remove.\n"); 4112 } 4113 spa_config_exit(spa, SCL_STATE, spa); 4114 mutex_exit(&ztest_vdev_lock); 4115 mutex_exit(&ztest_checkpoint_lock); 4116 return; 4117 } 4118 4119 /* 4120 * Make sure we were able to grow the vdev. 4121 */ 4122 if (new_ms_count <= old_ms_count) { 4123 fatal(B_FALSE, 4124 "LUN expansion failed: ms_count %"PRIu64" < %"PRIu64"\n", 4125 old_ms_count, new_ms_count); 4126 } 4127 4128 /* 4129 * Make sure we were able to grow the pool. 4130 */ 4131 if (new_class_space <= old_class_space) { 4132 fatal(B_FALSE, 4133 "LUN expansion failed: class_space %"PRIu64" < %"PRIu64"\n", 4134 old_class_space, new_class_space); 4135 } 4136 4137 if (ztest_opts.zo_verbose >= 5) { 4138 char oldnumbuf[NN_NUMBUF_SZ], newnumbuf[NN_NUMBUF_SZ]; 4139 4140 nicenum(old_class_space, oldnumbuf, sizeof (oldnumbuf)); 4141 nicenum(new_class_space, newnumbuf, sizeof (newnumbuf)); 4142 (void) printf("%s grew from %s to %s\n", 4143 spa->spa_name, oldnumbuf, newnumbuf); 4144 } 4145 4146 spa_config_exit(spa, SCL_STATE, spa); 4147 mutex_exit(&ztest_vdev_lock); 4148 mutex_exit(&ztest_checkpoint_lock); 4149 } 4150 4151 /* 4152 * Verify that dmu_objset_{create,destroy,open,close} work as expected. 4153 */ 4154 static void 4155 ztest_objset_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx) 4156 { 4157 (void) arg, (void) cr; 4158 4159 /* 4160 * Create the objects common to all ztest datasets. 4161 */ 4162 VERIFY0(zap_create_claim(os, ZTEST_DIROBJ, 4163 DMU_OT_ZAP_OTHER, DMU_OT_NONE, 0, tx)); 4164 } 4165 4166 static int 4167 ztest_dataset_create(char *dsname) 4168 { 4169 int err; 4170 uint64_t rand; 4171 dsl_crypto_params_t *dcp = NULL; 4172 4173 /* 4174 * 50% of the time, we create encrypted datasets 4175 * using a random cipher suite and a hard-coded 4176 * wrapping key. 
4177 */ 4178 rand = ztest_random(2); 4179 if (rand != 0) { 4180 nvlist_t *crypto_args = fnvlist_alloc(); 4181 nvlist_t *props = fnvlist_alloc(); 4182 4183 /* slight bias towards the default cipher suite */ 4184 rand = ztest_random(ZIO_CRYPT_FUNCTIONS); 4185 if (rand < ZIO_CRYPT_AES_128_CCM) 4186 rand = ZIO_CRYPT_ON; 4187 4188 fnvlist_add_uint64(props, 4189 zfs_prop_to_name(ZFS_PROP_ENCRYPTION), rand); 4190 fnvlist_add_uint8_array(crypto_args, "wkeydata", 4191 (uint8_t *)ztest_wkeydata, WRAPPING_KEY_LEN); 4192 4193 /* 4194 * These parameters aren't really used by the kernel. They 4195 * are simply stored so that userspace knows how to load 4196 * the wrapping key. 4197 */ 4198 fnvlist_add_uint64(props, 4199 zfs_prop_to_name(ZFS_PROP_KEYFORMAT), ZFS_KEYFORMAT_RAW); 4200 fnvlist_add_string(props, 4201 zfs_prop_to_name(ZFS_PROP_KEYLOCATION), "prompt"); 4202 fnvlist_add_uint64(props, 4203 zfs_prop_to_name(ZFS_PROP_PBKDF2_SALT), 0ULL); 4204 fnvlist_add_uint64(props, 4205 zfs_prop_to_name(ZFS_PROP_PBKDF2_ITERS), 0ULL); 4206 4207 VERIFY0(dsl_crypto_params_create_nvlist(DCP_CMD_NONE, props, 4208 crypto_args, &dcp)); 4209 4210 /* 4211 * Cycle through all available encryption implementations 4212 * to verify interoperability. 4213 */ 4214 VERIFY0(gcm_impl_set("cycle")); 4215 VERIFY0(aes_impl_set("cycle")); 4216 4217 fnvlist_free(crypto_args); 4218 fnvlist_free(props); 4219 } 4220 4221 err = dmu_objset_create(dsname, DMU_OST_OTHER, 0, dcp, 4222 ztest_objset_create_cb, NULL); 4223 dsl_crypto_params_free(dcp, !!err); 4224 4225 rand = ztest_random(100); 4226 if (err || rand < 80) 4227 return (err); 4228 4229 if (ztest_opts.zo_verbose >= 5) 4230 (void) printf("Setting dataset %s to sync always\n", dsname); 4231 return (ztest_dsl_prop_set_uint64(dsname, ZFS_PROP_SYNC, 4232 ZFS_SYNC_ALWAYS, B_FALSE)); 4233 } 4234 4235 static int 4236 ztest_objset_destroy_cb(const char *name, void *arg) 4237 { 4238 (void) arg; 4239 objset_t *os; 4240 dmu_object_info_t doi; 4241 int error; 4242 4243 /* 4244 * Verify that the dataset contains a directory object. 4245 */ 4246 VERIFY0(ztest_dmu_objset_own(name, DMU_OST_OTHER, B_TRUE, 4247 B_TRUE, FTAG, &os)); 4248 error = dmu_object_info(os, ZTEST_DIROBJ, &doi); 4249 if (error != ENOENT) { 4250 /* We could have crashed in the middle of destroying it */ 4251 ASSERT0(error); 4252 ASSERT3U(doi.doi_type, ==, DMU_OT_ZAP_OTHER); 4253 ASSERT3S(doi.doi_physical_blocks_512, >=, 0); 4254 } 4255 dmu_objset_disown(os, B_TRUE, FTAG); 4256 4257 /* 4258 * Destroy the dataset. 4259 */ 4260 if (strchr(name, '@') != NULL) { 4261 error = dsl_destroy_snapshot(name, B_TRUE); 4262 if (error != ECHRNG) { 4263 /* 4264 * The program was executed, but encountered a runtime 4265 * error, such as insufficient slop, or a hold on the 4266 * dataset. 
4267 */ 4268 ASSERT0(error); 4269 } 4270 } else { 4271 error = dsl_destroy_head(name); 4272 if (error == ENOSPC) { 4273 /* There could be checkpoint or insufficient slop */ 4274 ztest_record_enospc(FTAG); 4275 } else if (error != EBUSY) { 4276 /* There could be a hold on this dataset */ 4277 ASSERT0(error); 4278 } 4279 } 4280 return (0); 4281 } 4282 4283 static boolean_t 4284 ztest_snapshot_create(char *osname, uint64_t id) 4285 { 4286 char snapname[ZFS_MAX_DATASET_NAME_LEN]; 4287 int error; 4288 4289 (void) snprintf(snapname, sizeof (snapname), "%"PRIu64"", id); 4290 4291 error = dmu_objset_snapshot_one(osname, snapname); 4292 if (error == ENOSPC) { 4293 ztest_record_enospc(FTAG); 4294 return (B_FALSE); 4295 } 4296 if (error != 0 && error != EEXIST) { 4297 fatal(B_FALSE, "ztest_snapshot_create(%s@%s) = %d", osname, 4298 snapname, error); 4299 } 4300 return (B_TRUE); 4301 } 4302 4303 static boolean_t 4304 ztest_snapshot_destroy(char *osname, uint64_t id) 4305 { 4306 char snapname[ZFS_MAX_DATASET_NAME_LEN]; 4307 int error; 4308 4309 (void) snprintf(snapname, sizeof (snapname), "%s@%"PRIu64"", 4310 osname, id); 4311 4312 error = dsl_destroy_snapshot(snapname, B_FALSE); 4313 if (error != 0 && error != ENOENT) 4314 fatal(B_FALSE, "ztest_snapshot_destroy(%s) = %d", 4315 snapname, error); 4316 return (B_TRUE); 4317 } 4318 4319 void 4320 ztest_dmu_objset_create_destroy(ztest_ds_t *zd, uint64_t id) 4321 { 4322 (void) zd; 4323 ztest_ds_t *zdtmp; 4324 int iters; 4325 int error; 4326 objset_t *os, *os2; 4327 char name[ZFS_MAX_DATASET_NAME_LEN]; 4328 zilog_t *zilog; 4329 int i; 4330 4331 zdtmp = umem_alloc(sizeof (ztest_ds_t), UMEM_NOFAIL); 4332 4333 (void) pthread_rwlock_rdlock(&ztest_name_lock); 4334 4335 (void) snprintf(name, sizeof (name), "%s/temp_%"PRIu64"", 4336 ztest_opts.zo_pool, id); 4337 4338 /* 4339 * If this dataset exists from a previous run, process its replay log 4340 * half of the time. If we don't replay it, then dsl_destroy_head() 4341 * (invoked from ztest_objset_destroy_cb()) should just throw it away. 4342 */ 4343 if (ztest_random(2) == 0 && 4344 ztest_dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, 4345 B_TRUE, FTAG, &os) == 0) { 4346 ztest_zd_init(zdtmp, NULL, os); 4347 zil_replay(os, zdtmp, ztest_replay_vector); 4348 ztest_zd_fini(zdtmp); 4349 dmu_objset_disown(os, B_TRUE, FTAG); 4350 } 4351 4352 /* 4353 * There may be an old instance of the dataset we're about to 4354 * create lying around from a previous run. If so, destroy it 4355 * and all of its snapshots. 4356 */ 4357 (void) dmu_objset_find(name, ztest_objset_destroy_cb, NULL, 4358 DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS); 4359 4360 /* 4361 * Verify that the destroyed dataset is no longer in the namespace. 4362 */ 4363 VERIFY3U(ENOENT, ==, ztest_dmu_objset_own(name, DMU_OST_OTHER, B_TRUE, 4364 B_TRUE, FTAG, &os)); 4365 4366 /* 4367 * Verify that we can create a new dataset. 4368 */ 4369 error = ztest_dataset_create(name); 4370 if (error) { 4371 if (error == ENOSPC) { 4372 ztest_record_enospc(FTAG); 4373 goto out; 4374 } 4375 fatal(B_FALSE, "dmu_objset_create(%s) = %d", name, error); 4376 } 4377 4378 VERIFY0(ztest_dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, B_TRUE, 4379 FTAG, &os)); 4380 4381 ztest_zd_init(zdtmp, NULL, os); 4382 4383 /* 4384 * Open the intent log for it. 4385 */ 4386 zilog = zil_open(os, ztest_get_data, NULL); 4387 4388 /* 4389 * Put some objects in there, do a little I/O to them, 4390 * and randomly take a couple of snapshots along the way. 
4391 */ 4392 iters = ztest_random(5); 4393 for (i = 0; i < iters; i++) { 4394 ztest_dmu_object_alloc_free(zdtmp, id); 4395 if (ztest_random(iters) == 0) 4396 (void) ztest_snapshot_create(name, i); 4397 } 4398 4399 /* 4400 * Verify that we cannot create an existing dataset. 4401 */ 4402 VERIFY3U(EEXIST, ==, 4403 dmu_objset_create(name, DMU_OST_OTHER, 0, NULL, NULL, NULL)); 4404 4405 /* 4406 * Verify that we can hold an objset that is also owned. 4407 */ 4408 VERIFY0(dmu_objset_hold(name, FTAG, &os2)); 4409 dmu_objset_rele(os2, FTAG); 4410 4411 /* 4412 * Verify that we cannot own an objset that is already owned. 4413 */ 4414 VERIFY3U(EBUSY, ==, ztest_dmu_objset_own(name, DMU_OST_OTHER, 4415 B_FALSE, B_TRUE, FTAG, &os2)); 4416 4417 zil_close(zilog); 4418 dmu_objset_disown(os, B_TRUE, FTAG); 4419 ztest_zd_fini(zdtmp); 4420 out: 4421 (void) pthread_rwlock_unlock(&ztest_name_lock); 4422 4423 umem_free(zdtmp, sizeof (ztest_ds_t)); 4424 } 4425 4426 /* 4427 * Verify that dmu_snapshot_{create,destroy,open,close} work as expected. 4428 */ 4429 void 4430 ztest_dmu_snapshot_create_destroy(ztest_ds_t *zd, uint64_t id) 4431 { 4432 (void) pthread_rwlock_rdlock(&ztest_name_lock); 4433 (void) ztest_snapshot_destroy(zd->zd_name, id); 4434 (void) ztest_snapshot_create(zd->zd_name, id); 4435 (void) pthread_rwlock_unlock(&ztest_name_lock); 4436 } 4437 4438 /* 4439 * Cleanup non-standard snapshots and clones. 4440 */ 4441 static void 4442 ztest_dsl_dataset_cleanup(char *osname, uint64_t id) 4443 { 4444 char *snap1name; 4445 char *clone1name; 4446 char *snap2name; 4447 char *clone2name; 4448 char *snap3name; 4449 int error; 4450 4451 snap1name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4452 clone1name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4453 snap2name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4454 clone2name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4455 snap3name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4456 4457 (void) snprintf(snap1name, ZFS_MAX_DATASET_NAME_LEN, "%s@s1_%"PRIu64"", 4458 osname, id); 4459 (void) snprintf(clone1name, ZFS_MAX_DATASET_NAME_LEN, "%s/c1_%"PRIu64"", 4460 osname, id); 4461 (void) snprintf(snap2name, ZFS_MAX_DATASET_NAME_LEN, "%s@s2_%"PRIu64"", 4462 clone1name, id); 4463 (void) snprintf(clone2name, ZFS_MAX_DATASET_NAME_LEN, "%s/c2_%"PRIu64"", 4464 osname, id); 4465 (void) snprintf(snap3name, ZFS_MAX_DATASET_NAME_LEN, "%s@s3_%"PRIu64"", 4466 clone1name, id); 4467 4468 error = dsl_destroy_head(clone2name); 4469 if (error && error != ENOENT) 4470 fatal(B_FALSE, "dsl_destroy_head(%s) = %d", clone2name, error); 4471 error = dsl_destroy_snapshot(snap3name, B_FALSE); 4472 if (error && error != ENOENT) 4473 fatal(B_FALSE, "dsl_destroy_snapshot(%s) = %d", 4474 snap3name, error); 4475 error = dsl_destroy_snapshot(snap2name, B_FALSE); 4476 if (error && error != ENOENT) 4477 fatal(B_FALSE, "dsl_destroy_snapshot(%s) = %d", 4478 snap2name, error); 4479 error = dsl_destroy_head(clone1name); 4480 if (error && error != ENOENT) 4481 fatal(B_FALSE, "dsl_destroy_head(%s) = %d", clone1name, error); 4482 error = dsl_destroy_snapshot(snap1name, B_FALSE); 4483 if (error && error != ENOENT) 4484 fatal(B_FALSE, "dsl_destroy_snapshot(%s) = %d", 4485 snap1name, error); 4486 4487 umem_free(snap1name, ZFS_MAX_DATASET_NAME_LEN); 4488 umem_free(clone1name, ZFS_MAX_DATASET_NAME_LEN); 4489 umem_free(snap2name, ZFS_MAX_DATASET_NAME_LEN); 4490 umem_free(clone2name, ZFS_MAX_DATASET_NAME_LEN); 4491 umem_free(snap3name, ZFS_MAX_DATASET_NAME_LEN); 4492 } 4493 4494 
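/*
 * A sketch of the dataset topology ztest_dsl_dataset_promote_busy() below
 * is intended to build; the names follow the snprintf() calls in that
 * function:
 *
 *	<osname>@s1_<id>  --clone-->  <osname>/c1_<id>
 *	                                |-- @s2_<id>   (owned further below)
 *	                                `-- @s3_<id>  --clone-->  <osname>/c2_<id>
 *
 * Promoting c2 would have to reparent c1's snapshots up to and including
 * its origin (s2 and s3) onto c2. Because s2 is still owned via
 * ztest_dmu_objset_own() when the promote is attempted, dsl_dataset_promote()
 * is expected to fail with EBUSY, which is exactly what the test verifies.
 */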
/* 4495 * Verify dsl_dataset_promote handles EBUSY 4496 */ 4497 void 4498 ztest_dsl_dataset_promote_busy(ztest_ds_t *zd, uint64_t id) 4499 { 4500 objset_t *os; 4501 char *snap1name; 4502 char *clone1name; 4503 char *snap2name; 4504 char *clone2name; 4505 char *snap3name; 4506 char *osname = zd->zd_name; 4507 int error; 4508 4509 snap1name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4510 clone1name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4511 snap2name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4512 clone2name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4513 snap3name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4514 4515 (void) pthread_rwlock_rdlock(&ztest_name_lock); 4516 4517 ztest_dsl_dataset_cleanup(osname, id); 4518 4519 (void) snprintf(snap1name, ZFS_MAX_DATASET_NAME_LEN, "%s@s1_%"PRIu64"", 4520 osname, id); 4521 (void) snprintf(clone1name, ZFS_MAX_DATASET_NAME_LEN, "%s/c1_%"PRIu64"", 4522 osname, id); 4523 (void) snprintf(snap2name, ZFS_MAX_DATASET_NAME_LEN, "%s@s2_%"PRIu64"", 4524 clone1name, id); 4525 (void) snprintf(clone2name, ZFS_MAX_DATASET_NAME_LEN, "%s/c2_%"PRIu64"", 4526 osname, id); 4527 (void) snprintf(snap3name, ZFS_MAX_DATASET_NAME_LEN, "%s@s3_%"PRIu64"", 4528 clone1name, id); 4529 4530 error = dmu_objset_snapshot_one(osname, strchr(snap1name, '@') + 1); 4531 if (error && error != EEXIST) { 4532 if (error == ENOSPC) { 4533 ztest_record_enospc(FTAG); 4534 goto out; 4535 } 4536 fatal(B_FALSE, "dmu_take_snapshot(%s) = %d", snap1name, error); 4537 } 4538 4539 error = dmu_objset_clone(clone1name, snap1name); 4540 if (error) { 4541 if (error == ENOSPC) { 4542 ztest_record_enospc(FTAG); 4543 goto out; 4544 } 4545 fatal(B_FALSE, "dmu_objset_create(%s) = %d", clone1name, error); 4546 } 4547 4548 error = dmu_objset_snapshot_one(clone1name, strchr(snap2name, '@') + 1); 4549 if (error && error != EEXIST) { 4550 if (error == ENOSPC) { 4551 ztest_record_enospc(FTAG); 4552 goto out; 4553 } 4554 fatal(B_FALSE, "dmu_open_snapshot(%s) = %d", snap2name, error); 4555 } 4556 4557 error = dmu_objset_snapshot_one(clone1name, strchr(snap3name, '@') + 1); 4558 if (error && error != EEXIST) { 4559 if (error == ENOSPC) { 4560 ztest_record_enospc(FTAG); 4561 goto out; 4562 } 4563 fatal(B_FALSE, "dmu_open_snapshot(%s) = %d", snap3name, error); 4564 } 4565 4566 error = dmu_objset_clone(clone2name, snap3name); 4567 if (error) { 4568 if (error == ENOSPC) { 4569 ztest_record_enospc(FTAG); 4570 goto out; 4571 } 4572 fatal(B_FALSE, "dmu_objset_create(%s) = %d", clone2name, error); 4573 } 4574 4575 error = ztest_dmu_objset_own(snap2name, DMU_OST_ANY, B_TRUE, B_TRUE, 4576 FTAG, &os); 4577 if (error) 4578 fatal(B_FALSE, "dmu_objset_own(%s) = %d", snap2name, error); 4579 error = dsl_dataset_promote(clone2name, NULL); 4580 if (error == ENOSPC) { 4581 dmu_objset_disown(os, B_TRUE, FTAG); 4582 ztest_record_enospc(FTAG); 4583 goto out; 4584 } 4585 if (error != EBUSY) 4586 fatal(B_FALSE, "dsl_dataset_promote(%s), %d, not EBUSY", 4587 clone2name, error); 4588 dmu_objset_disown(os, B_TRUE, FTAG); 4589 4590 out: 4591 ztest_dsl_dataset_cleanup(osname, id); 4592 4593 (void) pthread_rwlock_unlock(&ztest_name_lock); 4594 4595 umem_free(snap1name, ZFS_MAX_DATASET_NAME_LEN); 4596 umem_free(clone1name, ZFS_MAX_DATASET_NAME_LEN); 4597 umem_free(snap2name, ZFS_MAX_DATASET_NAME_LEN); 4598 umem_free(clone2name, ZFS_MAX_DATASET_NAME_LEN); 4599 umem_free(snap3name, ZFS_MAX_DATASET_NAME_LEN); 4600 } 4601 4602 #undef OD_ARRAY_SIZE 4603 #define OD_ARRAY_SIZE 4 4604 4605 /* 4606 * 
Verify that dmu_object_{alloc,free} work as expected. 4607 */ 4608 void 4609 ztest_dmu_object_alloc_free(ztest_ds_t *zd, uint64_t id) 4610 { 4611 ztest_od_t *od; 4612 int batchsize; 4613 int size; 4614 int b; 4615 4616 size = sizeof (ztest_od_t) * OD_ARRAY_SIZE; 4617 od = umem_alloc(size, UMEM_NOFAIL); 4618 batchsize = OD_ARRAY_SIZE; 4619 4620 for (b = 0; b < batchsize; b++) 4621 ztest_od_init(od + b, id, FTAG, b, DMU_OT_UINT64_OTHER, 4622 0, 0, 0); 4623 4624 /* 4625 * Destroy the previous batch of objects, create a new batch, 4626 * and do some I/O on the new objects. 4627 */ 4628 if (ztest_object_init(zd, od, size, B_TRUE) != 0) 4629 return; 4630 4631 while (ztest_random(4 * batchsize) != 0) 4632 ztest_io(zd, od[ztest_random(batchsize)].od_object, 4633 ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT); 4634 4635 umem_free(od, size); 4636 } 4637 4638 /* 4639 * Rewind the global allocator to verify object allocation backfilling. 4640 */ 4641 void 4642 ztest_dmu_object_next_chunk(ztest_ds_t *zd, uint64_t id) 4643 { 4644 (void) id; 4645 objset_t *os = zd->zd_os; 4646 uint_t dnodes_per_chunk = 1 << dmu_object_alloc_chunk_shift; 4647 uint64_t object; 4648 4649 /* 4650 * Rewind the global allocator randomly back to a lower object number 4651 * to force backfilling and reclamation of recently freed dnodes. 4652 */ 4653 mutex_enter(&os->os_obj_lock); 4654 object = ztest_random(os->os_obj_next_chunk); 4655 os->os_obj_next_chunk = P2ALIGN(object, dnodes_per_chunk); 4656 mutex_exit(&os->os_obj_lock); 4657 } 4658 4659 #undef OD_ARRAY_SIZE 4660 #define OD_ARRAY_SIZE 2 4661 4662 /* 4663 * Verify that dmu_{read,write} work as expected. 4664 */ 4665 void 4666 ztest_dmu_read_write(ztest_ds_t *zd, uint64_t id) 4667 { 4668 int size; 4669 ztest_od_t *od; 4670 4671 objset_t *os = zd->zd_os; 4672 size = sizeof (ztest_od_t) * OD_ARRAY_SIZE; 4673 od = umem_alloc(size, UMEM_NOFAIL); 4674 dmu_tx_t *tx; 4675 int freeit, error; 4676 uint64_t i, n, s, txg; 4677 bufwad_t *packbuf, *bigbuf, *pack, *bigH, *bigT; 4678 uint64_t packobj, packoff, packsize, bigobj, bigoff, bigsize; 4679 uint64_t chunksize = (1000 + ztest_random(1000)) * sizeof (uint64_t); 4680 uint64_t regions = 997; 4681 uint64_t stride = 123456789ULL; 4682 uint64_t width = 40; 4683 int free_percent = 5; 4684 4685 /* 4686 * This test uses two objects, packobj and bigobj, that are always 4687 * updated together (i.e. in the same tx) so that their contents are 4688 * in sync and can be compared. Their contents relate to each other 4689 * in a simple way: packobj is a dense array of 'bufwad' structures, 4690 * while bigobj is a sparse array of the same bufwads. Specifically, 4691 * for any index n, there are three bufwads that should be identical: 4692 * 4693 * packobj, at offset n * sizeof (bufwad_t) 4694 * bigobj, at the head of the nth chunk 4695 * bigobj, at the tail of the nth chunk 4696 * 4697 * The chunk size is arbitrary. It doesn't have to be a power of two, 4698 * and it doesn't have any relation to the object blocksize. 4699 * The only requirement is that it can hold at least two bufwads. 4700 * 4701 * Normally, we write the bufwad to each of these locations. 4702 * However, free_percent of the time we instead write zeroes to 4703 * packobj and perform a dmu_free_range() on bigobj. By comparing 4704 * bigobj to packobj, we can verify that the DMU is correctly 4705 * tracking which parts of an object are allocated and free, 4706 * and that the contents of the allocated blocks are correct. 4707 */ 4708 4709 /* 4710 * Read the directory info. 
If it's the first time, set things up. 4711 */ 4712 ztest_od_init(od, id, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0, chunksize); 4713 ztest_od_init(od + 1, id, FTAG, 1, DMU_OT_UINT64_OTHER, 0, 0, 4714 chunksize); 4715 4716 if (ztest_object_init(zd, od, size, B_FALSE) != 0) { 4717 umem_free(od, size); 4718 return; 4719 } 4720 4721 bigobj = od[0].od_object; 4722 packobj = od[1].od_object; 4723 chunksize = od[0].od_gen; 4724 ASSERT3U(chunksize, ==, od[1].od_gen); 4725 4726 /* 4727 * Prefetch a random chunk of the big object. 4728 * Our aim here is to get some async reads in flight 4729 * for blocks that we may free below; the DMU should 4730 * handle this race correctly. 4731 */ 4732 n = ztest_random(regions) * stride + ztest_random(width); 4733 s = 1 + ztest_random(2 * width - 1); 4734 dmu_prefetch(os, bigobj, 0, n * chunksize, s * chunksize, 4735 ZIO_PRIORITY_SYNC_READ); 4736 4737 /* 4738 * Pick a random index and compute the offsets into packobj and bigobj. 4739 */ 4740 n = ztest_random(regions) * stride + ztest_random(width); 4741 s = 1 + ztest_random(width - 1); 4742 4743 packoff = n * sizeof (bufwad_t); 4744 packsize = s * sizeof (bufwad_t); 4745 4746 bigoff = n * chunksize; 4747 bigsize = s * chunksize; 4748 4749 packbuf = umem_alloc(packsize, UMEM_NOFAIL); 4750 bigbuf = umem_alloc(bigsize, UMEM_NOFAIL); 4751 4752 /* 4753 * free_percent of the time, free a range of bigobj rather than 4754 * overwriting it. 4755 */ 4756 freeit = (ztest_random(100) < free_percent); 4757 4758 /* 4759 * Read the current contents of our objects. 4760 */ 4761 error = dmu_read(os, packobj, packoff, packsize, packbuf, 4762 DMU_READ_PREFETCH); 4763 ASSERT0(error); 4764 error = dmu_read(os, bigobj, bigoff, bigsize, bigbuf, 4765 DMU_READ_PREFETCH); 4766 ASSERT0(error); 4767 4768 /* 4769 * Get a tx for the mods to both packobj and bigobj. 4770 */ 4771 tx = dmu_tx_create(os); 4772 4773 dmu_tx_hold_write(tx, packobj, packoff, packsize); 4774 4775 if (freeit) 4776 dmu_tx_hold_free(tx, bigobj, bigoff, bigsize); 4777 else 4778 dmu_tx_hold_write(tx, bigobj, bigoff, bigsize); 4779 4780 /* This accounts for setting the checksum/compression. */ 4781 dmu_tx_hold_bonus(tx, bigobj); 4782 4783 txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); 4784 if (txg == 0) { 4785 umem_free(packbuf, packsize); 4786 umem_free(bigbuf, bigsize); 4787 umem_free(od, size); 4788 return; 4789 } 4790 4791 enum zio_checksum cksum; 4792 do { 4793 cksum = (enum zio_checksum) 4794 ztest_random_dsl_prop(ZFS_PROP_CHECKSUM); 4795 } while (cksum >= ZIO_CHECKSUM_LEGACY_FUNCTIONS); 4796 dmu_object_set_checksum(os, bigobj, cksum, tx); 4797 4798 enum zio_compress comp; 4799 do { 4800 comp = (enum zio_compress) 4801 ztest_random_dsl_prop(ZFS_PROP_COMPRESSION); 4802 } while (comp >= ZIO_COMPRESS_LEGACY_FUNCTIONS); 4803 dmu_object_set_compress(os, bigobj, comp, tx); 4804 4805 /* 4806 * For each index from n to n + s, verify that the existing bufwad 4807 * in packobj matches the bufwads at the head and tail of the 4808 * corresponding chunk in bigobj. Then update all three bufwads 4809 * with the new values we want to write out. 
4810 */ 4811 for (i = 0; i < s; i++) { 4812 /* LINTED */ 4813 pack = (bufwad_t *)((char *)packbuf + i * sizeof (bufwad_t)); 4814 /* LINTED */ 4815 bigH = (bufwad_t *)((char *)bigbuf + i * chunksize); 4816 /* LINTED */ 4817 bigT = (bufwad_t *)((char *)bigH + chunksize) - 1; 4818 4819 ASSERT3U((uintptr_t)bigH - (uintptr_t)bigbuf, <, bigsize); 4820 ASSERT3U((uintptr_t)bigT - (uintptr_t)bigbuf, <, bigsize); 4821 4822 if (pack->bw_txg > txg) 4823 fatal(B_FALSE, 4824 "future leak: got %"PRIx64", open txg is %"PRIx64"", 4825 pack->bw_txg, txg); 4826 4827 if (pack->bw_data != 0 && pack->bw_index != n + i) 4828 fatal(B_FALSE, "wrong index: " 4829 "got %"PRIx64", wanted %"PRIx64"+%"PRIx64"", 4830 pack->bw_index, n, i); 4831 4832 if (memcmp(pack, bigH, sizeof (bufwad_t)) != 0) 4833 fatal(B_FALSE, "pack/bigH mismatch in %p/%p", 4834 pack, bigH); 4835 4836 if (memcmp(pack, bigT, sizeof (bufwad_t)) != 0) 4837 fatal(B_FALSE, "pack/bigT mismatch in %p/%p", 4838 pack, bigT); 4839 4840 if (freeit) { 4841 memset(pack, 0, sizeof (bufwad_t)); 4842 } else { 4843 pack->bw_index = n + i; 4844 pack->bw_txg = txg; 4845 pack->bw_data = 1 + ztest_random(-2ULL); 4846 } 4847 *bigH = *pack; 4848 *bigT = *pack; 4849 } 4850 4851 /* 4852 * We've verified all the old bufwads, and made new ones. 4853 * Now write them out. 4854 */ 4855 dmu_write(os, packobj, packoff, packsize, packbuf, tx); 4856 4857 if (freeit) { 4858 if (ztest_opts.zo_verbose >= 7) { 4859 (void) printf("freeing offset %"PRIx64" size %"PRIx64"" 4860 " txg %"PRIx64"\n", 4861 bigoff, bigsize, txg); 4862 } 4863 VERIFY0(dmu_free_range(os, bigobj, bigoff, bigsize, tx)); 4864 } else { 4865 if (ztest_opts.zo_verbose >= 7) { 4866 (void) printf("writing offset %"PRIx64" size %"PRIx64"" 4867 " txg %"PRIx64"\n", 4868 bigoff, bigsize, txg); 4869 } 4870 dmu_write(os, bigobj, bigoff, bigsize, bigbuf, tx); 4871 } 4872 4873 dmu_tx_commit(tx); 4874 4875 /* 4876 * Sanity check the stuff we just wrote. 4877 */ 4878 { 4879 void *packcheck = umem_alloc(packsize, UMEM_NOFAIL); 4880 void *bigcheck = umem_alloc(bigsize, UMEM_NOFAIL); 4881 4882 VERIFY0(dmu_read(os, packobj, packoff, 4883 packsize, packcheck, DMU_READ_PREFETCH)); 4884 VERIFY0(dmu_read(os, bigobj, bigoff, 4885 bigsize, bigcheck, DMU_READ_PREFETCH)); 4886 4887 ASSERT0(memcmp(packbuf, packcheck, packsize)); 4888 ASSERT0(memcmp(bigbuf, bigcheck, bigsize)); 4889 4890 umem_free(packcheck, packsize); 4891 umem_free(bigcheck, bigsize); 4892 } 4893 4894 umem_free(packbuf, packsize); 4895 umem_free(bigbuf, bigsize); 4896 umem_free(od, size); 4897 } 4898 4899 static void 4900 compare_and_update_pbbufs(uint64_t s, bufwad_t *packbuf, bufwad_t *bigbuf, 4901 uint64_t bigsize, uint64_t n, uint64_t chunksize, uint64_t txg) 4902 { 4903 uint64_t i; 4904 bufwad_t *pack; 4905 bufwad_t *bigH; 4906 bufwad_t *bigT; 4907 4908 /* 4909 * For each index from n to n + s, verify that the existing bufwad 4910 * in packobj matches the bufwads at the head and tail of the 4911 * corresponding chunk in bigobj. Then update all three bufwads 4912 * with the new values we want to write out. 
4913 */ 4914 for (i = 0; i < s; i++) { 4915 /* LINTED */ 4916 pack = (bufwad_t *)((char *)packbuf + i * sizeof (bufwad_t)); 4917 /* LINTED */ 4918 bigH = (bufwad_t *)((char *)bigbuf + i * chunksize); 4919 /* LINTED */ 4920 bigT = (bufwad_t *)((char *)bigH + chunksize) - 1; 4921 4922 ASSERT3U((uintptr_t)bigH - (uintptr_t)bigbuf, <, bigsize); 4923 ASSERT3U((uintptr_t)bigT - (uintptr_t)bigbuf, <, bigsize); 4924 4925 if (pack->bw_txg > txg) 4926 fatal(B_FALSE, 4927 "future leak: got %"PRIx64", open txg is %"PRIx64"", 4928 pack->bw_txg, txg); 4929 4930 if (pack->bw_data != 0 && pack->bw_index != n + i) 4931 fatal(B_FALSE, "wrong index: " 4932 "got %"PRIx64", wanted %"PRIx64"+%"PRIx64"", 4933 pack->bw_index, n, i); 4934 4935 if (memcmp(pack, bigH, sizeof (bufwad_t)) != 0) 4936 fatal(B_FALSE, "pack/bigH mismatch in %p/%p", 4937 pack, bigH); 4938 4939 if (memcmp(pack, bigT, sizeof (bufwad_t)) != 0) 4940 fatal(B_FALSE, "pack/bigT mismatch in %p/%p", 4941 pack, bigT); 4942 4943 pack->bw_index = n + i; 4944 pack->bw_txg = txg; 4945 pack->bw_data = 1 + ztest_random(-2ULL); 4946 4947 *bigH = *pack; 4948 *bigT = *pack; 4949 } 4950 } 4951 4952 #undef OD_ARRAY_SIZE 4953 #define OD_ARRAY_SIZE 2 4954 4955 void 4956 ztest_dmu_read_write_zcopy(ztest_ds_t *zd, uint64_t id) 4957 { 4958 objset_t *os = zd->zd_os; 4959 ztest_od_t *od; 4960 dmu_tx_t *tx; 4961 uint64_t i; 4962 int error; 4963 int size; 4964 uint64_t n, s, txg; 4965 bufwad_t *packbuf, *bigbuf; 4966 uint64_t packobj, packoff, packsize, bigobj, bigoff, bigsize; 4967 uint64_t blocksize = ztest_random_blocksize(); 4968 uint64_t chunksize = blocksize; 4969 uint64_t regions = 997; 4970 uint64_t stride = 123456789ULL; 4971 uint64_t width = 9; 4972 dmu_buf_t *bonus_db; 4973 arc_buf_t **bigbuf_arcbufs; 4974 dmu_object_info_t doi; 4975 4976 size = sizeof (ztest_od_t) * OD_ARRAY_SIZE; 4977 od = umem_alloc(size, UMEM_NOFAIL); 4978 4979 /* 4980 * This test uses two objects, packobj and bigobj, that are always 4981 * updated together (i.e. in the same tx) so that their contents are 4982 * in sync and can be compared. Their contents relate to each other 4983 * in a simple way: packobj is a dense array of 'bufwad' structures, 4984 * while bigobj is a sparse array of the same bufwads. Specifically, 4985 * for any index n, there are three bufwads that should be identical: 4986 * 4987 * packobj, at offset n * sizeof (bufwad_t) 4988 * bigobj, at the head of the nth chunk 4989 * bigobj, at the tail of the nth chunk 4990 * 4991 * The chunk size is set equal to bigobj block size so that 4992 * dmu_assign_arcbuf_by_dbuf() can be tested for object updates. 4993 */ 4994 4995 /* 4996 * Read the directory info. If it's the first time, set things up. 4997 */ 4998 ztest_od_init(od, id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, 0, 0); 4999 ztest_od_init(od + 1, id, FTAG, 1, DMU_OT_UINT64_OTHER, 0, 0, 5000 chunksize); 5001 5002 5003 if (ztest_object_init(zd, od, size, B_FALSE) != 0) { 5004 umem_free(od, size); 5005 return; 5006 } 5007 5008 bigobj = od[0].od_object; 5009 packobj = od[1].od_object; 5010 blocksize = od[0].od_blocksize; 5011 chunksize = blocksize; 5012 ASSERT3U(chunksize, ==, od[1].od_gen); 5013 5014 VERIFY0(dmu_object_info(os, bigobj, &doi)); 5015 VERIFY(ISP2(doi.doi_data_block_size)); 5016 VERIFY3U(chunksize, ==, doi.doi_data_block_size); 5017 VERIFY3U(chunksize, >=, 2 * sizeof (bufwad_t)); 5018 5019 /* 5020 * Pick a random index and compute the offsets into packobj and bigobj. 
5021 */ 5022 n = ztest_random(regions) * stride + ztest_random(width); 5023 s = 1 + ztest_random(width - 1); 5024 5025 packoff = n * sizeof (bufwad_t); 5026 packsize = s * sizeof (bufwad_t); 5027 5028 bigoff = n * chunksize; 5029 bigsize = s * chunksize; 5030 5031 packbuf = umem_zalloc(packsize, UMEM_NOFAIL); 5032 bigbuf = umem_zalloc(bigsize, UMEM_NOFAIL); 5033 5034 VERIFY0(dmu_bonus_hold(os, bigobj, FTAG, &bonus_db)); 5035 5036 bigbuf_arcbufs = umem_zalloc(2 * s * sizeof (arc_buf_t *), UMEM_NOFAIL); 5037 5038 /* 5039 * Iteration 0 test zcopy for DB_UNCACHED dbufs. 5040 * Iteration 1 test zcopy to already referenced dbufs. 5041 * Iteration 2 test zcopy to dirty dbuf in the same txg. 5042 * Iteration 3 test zcopy to dbuf dirty in previous txg. 5043 * Iteration 4 test zcopy when dbuf is no longer dirty. 5044 * Iteration 5 test zcopy when it can't be done. 5045 * Iteration 6 one more zcopy write. 5046 */ 5047 for (i = 0; i < 7; i++) { 5048 uint64_t j; 5049 uint64_t off; 5050 5051 /* 5052 * In iteration 5 (i == 5) use arcbufs 5053 * that don't match bigobj blksz to test 5054 * dmu_assign_arcbuf_by_dbuf() when it can't directly 5055 * assign an arcbuf to a dbuf. 5056 */ 5057 for (j = 0; j < s; j++) { 5058 if (i != 5 || chunksize < (SPA_MINBLOCKSIZE * 2)) { 5059 bigbuf_arcbufs[j] = 5060 dmu_request_arcbuf(bonus_db, chunksize); 5061 } else { 5062 bigbuf_arcbufs[2 * j] = 5063 dmu_request_arcbuf(bonus_db, chunksize / 2); 5064 bigbuf_arcbufs[2 * j + 1] = 5065 dmu_request_arcbuf(bonus_db, chunksize / 2); 5066 } 5067 } 5068 5069 /* 5070 * Get a tx for the mods to both packobj and bigobj. 5071 */ 5072 tx = dmu_tx_create(os); 5073 5074 dmu_tx_hold_write(tx, packobj, packoff, packsize); 5075 dmu_tx_hold_write(tx, bigobj, bigoff, bigsize); 5076 5077 txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); 5078 if (txg == 0) { 5079 umem_free(packbuf, packsize); 5080 umem_free(bigbuf, bigsize); 5081 for (j = 0; j < s; j++) { 5082 if (i != 5 || 5083 chunksize < (SPA_MINBLOCKSIZE * 2)) { 5084 dmu_return_arcbuf(bigbuf_arcbufs[j]); 5085 } else { 5086 dmu_return_arcbuf( 5087 bigbuf_arcbufs[2 * j]); 5088 dmu_return_arcbuf( 5089 bigbuf_arcbufs[2 * j + 1]); 5090 } 5091 } 5092 umem_free(bigbuf_arcbufs, 2 * s * sizeof (arc_buf_t *)); 5093 umem_free(od, size); 5094 dmu_buf_rele(bonus_db, FTAG); 5095 return; 5096 } 5097 5098 /* 5099 * 50% of the time don't read objects in the 1st iteration to 5100 * test dmu_assign_arcbuf_by_dbuf() for the case when there are 5101 * no existing dbufs for the specified offsets. 5102 */ 5103 if (i != 0 || ztest_random(2) != 0) { 5104 error = dmu_read(os, packobj, packoff, 5105 packsize, packbuf, DMU_READ_PREFETCH); 5106 ASSERT0(error); 5107 error = dmu_read(os, bigobj, bigoff, bigsize, 5108 bigbuf, DMU_READ_PREFETCH); 5109 ASSERT0(error); 5110 } 5111 compare_and_update_pbbufs(s, packbuf, bigbuf, bigsize, 5112 n, chunksize, txg); 5113 5114 /* 5115 * We've verified all the old bufwads, and made new ones. 5116 * Now write them out. 
5117 */
5118 dmu_write(os, packobj, packoff, packsize, packbuf, tx);
5119 if (ztest_opts.zo_verbose >= 7) {
5120 (void) printf("writing offset %"PRIx64" size %"PRIx64""
5121 " txg %"PRIx64"\n",
5122 bigoff, bigsize, txg);
5123 }
5124 for (off = bigoff, j = 0; j < s; j++, off += chunksize) {
5125 dmu_buf_t *dbt;
5126 if (i != 5 || chunksize < (SPA_MINBLOCKSIZE * 2)) {
5127 memcpy(bigbuf_arcbufs[j]->b_data,
5128 (caddr_t)bigbuf + (off - bigoff),
5129 chunksize);
5130 } else {
5131 memcpy(bigbuf_arcbufs[2 * j]->b_data,
5132 (caddr_t)bigbuf + (off - bigoff),
5133 chunksize / 2);
5134 memcpy(bigbuf_arcbufs[2 * j + 1]->b_data,
5135 (caddr_t)bigbuf + (off - bigoff) +
5136 chunksize / 2,
5137 chunksize / 2);
5138 }
5139
5140 if (i == 1) {
5141 VERIFY(dmu_buf_hold(os, bigobj, off,
5142 FTAG, &dbt, DMU_READ_NO_PREFETCH) == 0);
5143 }
5144 if (i != 5 || chunksize < (SPA_MINBLOCKSIZE * 2)) {
5145 VERIFY0(dmu_assign_arcbuf_by_dbuf(bonus_db,
5146 off, bigbuf_arcbufs[j], tx));
5147 } else {
5148 VERIFY0(dmu_assign_arcbuf_by_dbuf(bonus_db,
5149 off, bigbuf_arcbufs[2 * j], tx));
5150 VERIFY0(dmu_assign_arcbuf_by_dbuf(bonus_db,
5151 off + chunksize / 2,
5152 bigbuf_arcbufs[2 * j + 1], tx));
5153 }
5154 if (i == 1) {
5155 dmu_buf_rele(dbt, FTAG);
5156 }
5157 }
5158 dmu_tx_commit(tx);
5159
5160 /*
5161 * Sanity check the stuff we just wrote.
5162 */
5163 {
5164 void *packcheck = umem_alloc(packsize, UMEM_NOFAIL);
5165 void *bigcheck = umem_alloc(bigsize, UMEM_NOFAIL);
5166
5167 VERIFY0(dmu_read(os, packobj, packoff,
5168 packsize, packcheck, DMU_READ_PREFETCH));
5169 VERIFY0(dmu_read(os, bigobj, bigoff,
5170 bigsize, bigcheck, DMU_READ_PREFETCH));
5171
5172 ASSERT0(memcmp(packbuf, packcheck, packsize));
5173 ASSERT0(memcmp(bigbuf, bigcheck, bigsize));
5174
5175 umem_free(packcheck, packsize);
5176 umem_free(bigcheck, bigsize);
5177 }
5178 if (i == 2) {
5179 txg_wait_open(dmu_objset_pool(os), 0, B_TRUE);
5180 } else if (i == 3) {
5181 txg_wait_synced(dmu_objset_pool(os), 0);
5182 }
5183 }
5184
5185 dmu_buf_rele(bonus_db, FTAG);
5186 umem_free(packbuf, packsize);
5187 umem_free(bigbuf, bigsize);
5188 umem_free(bigbuf_arcbufs, 2 * s * sizeof (arc_buf_t *));
5189 umem_free(od, size);
5190 }
5191
5192 void
5193 ztest_dmu_write_parallel(ztest_ds_t *zd, uint64_t id)
5194 {
5195 (void) id;
5196 ztest_od_t *od;
5197
5198 od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL);
5199 uint64_t offset = (1ULL << (ztest_random(20) + 43)) +
5200 (ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT);
5201
5202 /*
5203 * Have multiple threads write to large offsets in an object
5204 * to verify that parallel writes to an object -- even to the
5205 * same blocks within the object -- don't cause any trouble.
5206 */ 5207 ztest_od_init(od, ID_PARALLEL, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0, 0); 5208 5209 if (ztest_object_init(zd, od, sizeof (ztest_od_t), B_FALSE) != 0) 5210 return; 5211 5212 while (ztest_random(10) != 0) 5213 ztest_io(zd, od->od_object, offset); 5214 5215 umem_free(od, sizeof (ztest_od_t)); 5216 } 5217 5218 void 5219 ztest_dmu_prealloc(ztest_ds_t *zd, uint64_t id) 5220 { 5221 ztest_od_t *od; 5222 uint64_t offset = (1ULL << (ztest_random(4) + SPA_MAXBLOCKSHIFT)) + 5223 (ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT); 5224 uint64_t count = ztest_random(20) + 1; 5225 uint64_t blocksize = ztest_random_blocksize(); 5226 void *data; 5227 5228 od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL); 5229 5230 ztest_od_init(od, id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, 0, 0); 5231 5232 if (ztest_object_init(zd, od, sizeof (ztest_od_t), 5233 !ztest_random(2)) != 0) { 5234 umem_free(od, sizeof (ztest_od_t)); 5235 return; 5236 } 5237 5238 if (ztest_truncate(zd, od->od_object, offset, count * blocksize) != 0) { 5239 umem_free(od, sizeof (ztest_od_t)); 5240 return; 5241 } 5242 5243 ztest_prealloc(zd, od->od_object, offset, count * blocksize); 5244 5245 data = umem_zalloc(blocksize, UMEM_NOFAIL); 5246 5247 while (ztest_random(count) != 0) { 5248 uint64_t randoff = offset + (ztest_random(count) * blocksize); 5249 if (ztest_write(zd, od->od_object, randoff, blocksize, 5250 data) != 0) 5251 break; 5252 while (ztest_random(4) != 0) 5253 ztest_io(zd, od->od_object, randoff); 5254 } 5255 5256 umem_free(data, blocksize); 5257 umem_free(od, sizeof (ztest_od_t)); 5258 } 5259 5260 /* 5261 * Verify that zap_{create,destroy,add,remove,update} work as expected. 5262 */ 5263 #define ZTEST_ZAP_MIN_INTS 1 5264 #define ZTEST_ZAP_MAX_INTS 4 5265 #define ZTEST_ZAP_MAX_PROPS 1000 5266 5267 void 5268 ztest_zap(ztest_ds_t *zd, uint64_t id) 5269 { 5270 objset_t *os = zd->zd_os; 5271 ztest_od_t *od; 5272 uint64_t object; 5273 uint64_t txg, last_txg; 5274 uint64_t value[ZTEST_ZAP_MAX_INTS]; 5275 uint64_t zl_ints, zl_intsize, prop; 5276 int i, ints; 5277 dmu_tx_t *tx; 5278 char propname[100], txgname[100]; 5279 int error; 5280 const char *const hc[2] = { "s.acl.h", ".s.open.h.hyLZlg" }; 5281 5282 od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL); 5283 ztest_od_init(od, id, FTAG, 0, DMU_OT_ZAP_OTHER, 0, 0, 0); 5284 5285 if (ztest_object_init(zd, od, sizeof (ztest_od_t), 5286 !ztest_random(2)) != 0) 5287 goto out; 5288 5289 object = od->od_object; 5290 5291 /* 5292 * Generate a known hash collision, and verify that 5293 * we can lookup and remove both entries. 5294 */ 5295 tx = dmu_tx_create(os); 5296 dmu_tx_hold_zap(tx, object, B_TRUE, NULL); 5297 txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); 5298 if (txg == 0) 5299 goto out; 5300 for (i = 0; i < 2; i++) { 5301 value[i] = i; 5302 VERIFY0(zap_add(os, object, hc[i], sizeof (uint64_t), 5303 1, &value[i], tx)); 5304 } 5305 for (i = 0; i < 2; i++) { 5306 VERIFY3U(EEXIST, ==, zap_add(os, object, hc[i], 5307 sizeof (uint64_t), 1, &value[i], tx)); 5308 VERIFY0( 5309 zap_length(os, object, hc[i], &zl_intsize, &zl_ints)); 5310 ASSERT3U(zl_intsize, ==, sizeof (uint64_t)); 5311 ASSERT3U(zl_ints, ==, 1); 5312 } 5313 for (i = 0; i < 2; i++) { 5314 VERIFY0(zap_remove(os, object, hc[i], tx)); 5315 } 5316 dmu_tx_commit(tx); 5317 5318 /* 5319 * Generate a bunch of random entries. 
5320 */ 5321 ints = MAX(ZTEST_ZAP_MIN_INTS, object % ZTEST_ZAP_MAX_INTS); 5322 5323 prop = ztest_random(ZTEST_ZAP_MAX_PROPS); 5324 (void) sprintf(propname, "prop_%"PRIu64"", prop); 5325 (void) sprintf(txgname, "txg_%"PRIu64"", prop); 5326 memset(value, 0, sizeof (value)); 5327 last_txg = 0; 5328 5329 /* 5330 * If these zap entries already exist, validate their contents. 5331 */ 5332 error = zap_length(os, object, txgname, &zl_intsize, &zl_ints); 5333 if (error == 0) { 5334 ASSERT3U(zl_intsize, ==, sizeof (uint64_t)); 5335 ASSERT3U(zl_ints, ==, 1); 5336 5337 VERIFY0(zap_lookup(os, object, txgname, zl_intsize, 5338 zl_ints, &last_txg)); 5339 5340 VERIFY0(zap_length(os, object, propname, &zl_intsize, 5341 &zl_ints)); 5342 5343 ASSERT3U(zl_intsize, ==, sizeof (uint64_t)); 5344 ASSERT3U(zl_ints, ==, ints); 5345 5346 VERIFY0(zap_lookup(os, object, propname, zl_intsize, 5347 zl_ints, value)); 5348 5349 for (i = 0; i < ints; i++) { 5350 ASSERT3U(value[i], ==, last_txg + object + i); 5351 } 5352 } else { 5353 ASSERT3U(error, ==, ENOENT); 5354 } 5355 5356 /* 5357 * Atomically update two entries in our zap object. 5358 * The first is named txg_%llu, and contains the txg 5359 * in which the property was last updated. The second 5360 * is named prop_%llu, and the nth element of its value 5361 * should be txg + object + n. 5362 */ 5363 tx = dmu_tx_create(os); 5364 dmu_tx_hold_zap(tx, object, B_TRUE, NULL); 5365 txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); 5366 if (txg == 0) 5367 goto out; 5368 5369 if (last_txg > txg) 5370 fatal(B_FALSE, "zap future leak: old %"PRIu64" new %"PRIu64"", 5371 last_txg, txg); 5372 5373 for (i = 0; i < ints; i++) 5374 value[i] = txg + object + i; 5375 5376 VERIFY0(zap_update(os, object, txgname, sizeof (uint64_t), 5377 1, &txg, tx)); 5378 VERIFY0(zap_update(os, object, propname, sizeof (uint64_t), 5379 ints, value, tx)); 5380 5381 dmu_tx_commit(tx); 5382 5383 /* 5384 * Remove a random pair of entries. 5385 */ 5386 prop = ztest_random(ZTEST_ZAP_MAX_PROPS); 5387 (void) sprintf(propname, "prop_%"PRIu64"", prop); 5388 (void) sprintf(txgname, "txg_%"PRIu64"", prop); 5389 5390 error = zap_length(os, object, txgname, &zl_intsize, &zl_ints); 5391 5392 if (error == ENOENT) 5393 goto out; 5394 5395 ASSERT0(error); 5396 5397 tx = dmu_tx_create(os); 5398 dmu_tx_hold_zap(tx, object, B_TRUE, NULL); 5399 txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); 5400 if (txg == 0) 5401 goto out; 5402 VERIFY0(zap_remove(os, object, txgname, tx)); 5403 VERIFY0(zap_remove(os, object, propname, tx)); 5404 dmu_tx_commit(tx); 5405 out: 5406 umem_free(od, sizeof (ztest_od_t)); 5407 } 5408 5409 /* 5410 * Test case to test the upgrading of a microzap to fatzap. 5411 */ 5412 void 5413 ztest_fzap(ztest_ds_t *zd, uint64_t id) 5414 { 5415 objset_t *os = zd->zd_os; 5416 ztest_od_t *od; 5417 uint64_t object, txg, value; 5418 5419 od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL); 5420 ztest_od_init(od, id, FTAG, 0, DMU_OT_ZAP_OTHER, 0, 0, 0); 5421 5422 if (ztest_object_init(zd, od, sizeof (ztest_od_t), 5423 !ztest_random(2)) != 0) 5424 goto out; 5425 object = od->od_object; 5426 5427 /* 5428 * Add entries to this ZAP and make sure it spills over 5429 * and gets upgraded to a fatzap. Also, since we are adding 5430 * 2050 entries we should see ptrtbl growth and leaf-block split. 
5431 */ 5432 for (value = 0; value < 2050; value++) { 5433 char name[ZFS_MAX_DATASET_NAME_LEN]; 5434 dmu_tx_t *tx; 5435 int error; 5436 5437 (void) snprintf(name, sizeof (name), "fzap-%"PRIu64"-%"PRIu64"", 5438 id, value); 5439 5440 tx = dmu_tx_create(os); 5441 dmu_tx_hold_zap(tx, object, B_TRUE, name); 5442 txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); 5443 if (txg == 0) 5444 goto out; 5445 error = zap_add(os, object, name, sizeof (uint64_t), 1, 5446 &value, tx); 5447 ASSERT(error == 0 || error == EEXIST); 5448 dmu_tx_commit(tx); 5449 } 5450 out: 5451 umem_free(od, sizeof (ztest_od_t)); 5452 } 5453 5454 void 5455 ztest_zap_parallel(ztest_ds_t *zd, uint64_t id) 5456 { 5457 (void) id; 5458 objset_t *os = zd->zd_os; 5459 ztest_od_t *od; 5460 uint64_t txg, object, count, wsize, wc, zl_wsize, zl_wc; 5461 dmu_tx_t *tx; 5462 int i, namelen, error; 5463 int micro = ztest_random(2); 5464 char name[20], string_value[20]; 5465 void *data; 5466 5467 od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL); 5468 ztest_od_init(od, ID_PARALLEL, FTAG, micro, DMU_OT_ZAP_OTHER, 0, 0, 0); 5469 5470 if (ztest_object_init(zd, od, sizeof (ztest_od_t), B_FALSE) != 0) { 5471 umem_free(od, sizeof (ztest_od_t)); 5472 return; 5473 } 5474 5475 object = od->od_object; 5476 5477 /* 5478 * Generate a random name of the form 'xxx.....' where each 5479 * x is a random printable character and the dots are dots. 5480 * There are 94 such characters, and the name length goes from 5481 * 6 to 20, so there are 94^3 * 15 = 12,458,760 possible names. 5482 */ 5483 namelen = ztest_random(sizeof (name) - 5) + 5 + 1; 5484 5485 for (i = 0; i < 3; i++) 5486 name[i] = '!' + ztest_random('~' - '!' + 1); 5487 for (; i < namelen - 1; i++) 5488 name[i] = '.'; 5489 name[i] = '\0'; 5490 5491 if ((namelen & 1) || micro) { 5492 wsize = sizeof (txg); 5493 wc = 1; 5494 data = &txg; 5495 } else { 5496 wsize = 1; 5497 wc = namelen; 5498 data = string_value; 5499 } 5500 5501 count = -1ULL; 5502 VERIFY0(zap_count(os, object, &count)); 5503 ASSERT3S(count, !=, -1ULL); 5504 5505 /* 5506 * Select an operation: length, lookup, add, update, remove. 
5507 */ 5508 i = ztest_random(5); 5509 5510 if (i >= 2) { 5511 tx = dmu_tx_create(os); 5512 dmu_tx_hold_zap(tx, object, B_TRUE, NULL); 5513 txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); 5514 if (txg == 0) { 5515 umem_free(od, sizeof (ztest_od_t)); 5516 return; 5517 } 5518 memcpy(string_value, name, namelen); 5519 } else { 5520 tx = NULL; 5521 txg = 0; 5522 memset(string_value, 0, namelen); 5523 } 5524 5525 switch (i) { 5526 5527 case 0: 5528 error = zap_length(os, object, name, &zl_wsize, &zl_wc); 5529 if (error == 0) { 5530 ASSERT3U(wsize, ==, zl_wsize); 5531 ASSERT3U(wc, ==, zl_wc); 5532 } else { 5533 ASSERT3U(error, ==, ENOENT); 5534 } 5535 break; 5536 5537 case 1: 5538 error = zap_lookup(os, object, name, wsize, wc, data); 5539 if (error == 0) { 5540 if (data == string_value && 5541 memcmp(name, data, namelen) != 0) 5542 fatal(B_FALSE, "name '%s' != val '%s' len %d", 5543 name, (char *)data, namelen); 5544 } else { 5545 ASSERT3U(error, ==, ENOENT); 5546 } 5547 break; 5548 5549 case 2: 5550 error = zap_add(os, object, name, wsize, wc, data, tx); 5551 ASSERT(error == 0 || error == EEXIST); 5552 break; 5553 5554 case 3: 5555 VERIFY0(zap_update(os, object, name, wsize, wc, data, tx)); 5556 break; 5557 5558 case 4: 5559 error = zap_remove(os, object, name, tx); 5560 ASSERT(error == 0 || error == ENOENT); 5561 break; 5562 } 5563 5564 if (tx != NULL) 5565 dmu_tx_commit(tx); 5566 5567 umem_free(od, sizeof (ztest_od_t)); 5568 } 5569 5570 /* 5571 * Commit callback data. 5572 */ 5573 typedef struct ztest_cb_data { 5574 list_node_t zcd_node; 5575 uint64_t zcd_txg; 5576 int zcd_expected_err; 5577 boolean_t zcd_added; 5578 boolean_t zcd_called; 5579 spa_t *zcd_spa; 5580 } ztest_cb_data_t; 5581 5582 /* This is the actual commit callback function */ 5583 static void 5584 ztest_commit_callback(void *arg, int error) 5585 { 5586 ztest_cb_data_t *data = arg; 5587 uint64_t synced_txg; 5588 5589 VERIFY3P(data, !=, NULL); 5590 VERIFY3S(data->zcd_expected_err, ==, error); 5591 VERIFY(!data->zcd_called); 5592 5593 synced_txg = spa_last_synced_txg(data->zcd_spa); 5594 if (data->zcd_txg > synced_txg) 5595 fatal(B_FALSE, 5596 "commit callback of txg %"PRIu64" called prematurely, " 5597 "last synced txg = %"PRIu64"\n", 5598 data->zcd_txg, synced_txg); 5599 5600 data->zcd_called = B_TRUE; 5601 5602 if (error == ECANCELED) { 5603 ASSERT0(data->zcd_txg); 5604 ASSERT(!data->zcd_added); 5605 5606 /* 5607 * The private callback data should be destroyed here, but 5608 * since we are going to check the zcd_called field after 5609 * dmu_tx_abort(), we will destroy it there. 
5610 */
5611 return;
5612 }
5613
5614 ASSERT(data->zcd_added);
5615 ASSERT3U(data->zcd_txg, !=, 0);
5616
5617 (void) mutex_enter(&zcl.zcl_callbacks_lock);
5618
5619 /* Track the minimum txg delay seen across all commit callbacks */
5620 if ((synced_txg - data->zcd_txg) < zc_min_txg_delay)
5621 zc_min_txg_delay = synced_txg - data->zcd_txg;
5622
5623 /* Remove our callback from the list */
5624 list_remove(&zcl.zcl_callbacks, data);
5625
5626 (void) mutex_exit(&zcl.zcl_callbacks_lock);
5627
5628 umem_free(data, sizeof (ztest_cb_data_t));
5629 }
5630
5631 /* Allocate and initialize callback data structure */
5632 static ztest_cb_data_t *
5633 ztest_create_cb_data(objset_t *os, uint64_t txg)
5634 {
5635 ztest_cb_data_t *cb_data;
5636
5637 cb_data = umem_zalloc(sizeof (ztest_cb_data_t), UMEM_NOFAIL);
5638
5639 cb_data->zcd_txg = txg;
5640 cb_data->zcd_spa = dmu_objset_spa(os);
5641 list_link_init(&cb_data->zcd_node);
5642
5643 return (cb_data);
5644 }
5645
5646 /*
5647 * Commit callback test.
5648 */
5649 void
5650 ztest_dmu_commit_callbacks(ztest_ds_t *zd, uint64_t id)
5651 {
5652 objset_t *os = zd->zd_os;
5653 ztest_od_t *od;
5654 dmu_tx_t *tx;
5655 ztest_cb_data_t *cb_data[3], *tmp_cb;
5656 uint64_t old_txg, txg;
5657 int i, error = 0;
5658
5659 od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL);
5660 ztest_od_init(od, id, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0, 0);
5661
5662 if (ztest_object_init(zd, od, sizeof (ztest_od_t), B_FALSE) != 0) {
5663 umem_free(od, sizeof (ztest_od_t));
5664 return;
5665 }
5666
5667 tx = dmu_tx_create(os);
5668
5669 cb_data[0] = ztest_create_cb_data(os, 0);
5670 dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[0]);
5671
5672 dmu_tx_hold_write(tx, od->od_object, 0, sizeof (uint64_t));
5673
5674 /* Every once in a while, abort the transaction on purpose */
5675 if (ztest_random(100) == 0)
5676 error = -1;
5677
5678 if (!error)
5679 error = dmu_tx_assign(tx, TXG_NOWAIT);
5680
5681 txg = error ? 0 : dmu_tx_get_txg(tx);
5682
5683 cb_data[0]->zcd_txg = txg;
5684 cb_data[1] = ztest_create_cb_data(os, txg);
5685 dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[1]);
5686
5687 if (error) {
5688 /*
5689 * It's not a strict requirement to call the registered
5690 * callbacks from inside dmu_tx_abort(), but that's what
5691 * is supposed to happen in the current implementation,
5692 * so we will check for that.
5693 */
5694 for (i = 0; i < 2; i++) {
5695 cb_data[i]->zcd_expected_err = ECANCELED;
5696 VERIFY(!cb_data[i]->zcd_called);
5697 }
5698
5699 dmu_tx_abort(tx);
5700
5701 for (i = 0; i < 2; i++) {
5702 VERIFY(cb_data[i]->zcd_called);
5703 umem_free(cb_data[i], sizeof (ztest_cb_data_t));
5704 }
5705
5706 umem_free(od, sizeof (ztest_od_t));
5707 return;
5708 }
5709
5710 cb_data[2] = ztest_create_cb_data(os, txg);
5711 dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[2]);
5712
5713 /*
5714 * Read existing data to make sure there isn't a future leak.
5715 */
5716 VERIFY0(dmu_read(os, od->od_object, 0, sizeof (uint64_t),
5717 &old_txg, DMU_READ_PREFETCH));
5718
5719 if (old_txg > txg)
5720 fatal(B_FALSE,
5721 "future leak: got %"PRIu64", open txg is %"PRIu64"",
5722 old_txg, txg);
5723
5724 dmu_write(os, od->od_object, 0, sizeof (uint64_t), &txg, tx);
5725
5726 (void) mutex_enter(&zcl.zcl_callbacks_lock);
5727
5728 /*
5729 * Since commit callbacks don't have any ordering requirement and since
5730 * it is theoretically possible for a commit callback to be called
5731 * after an arbitrary amount of time has elapsed since its txg has been
5732 * synced, it is difficult to reliably determine whether a commit
5733 * callback hasn't been called due to high load or due to a flawed
5734 * implementation.
5735 *
5736 * In practice, we will assume that if a commit callback hasn't been
5737 * called after a certain number of txgs, then most likely there's an
5738 * implementation bug.
5739 */
5740 tmp_cb = list_head(&zcl.zcl_callbacks);
5741 if (tmp_cb != NULL &&
5742 tmp_cb->zcd_txg + ZTEST_COMMIT_CB_THRESH < txg) {
5743 fatal(B_FALSE,
5744 "Commit callback threshold exceeded, "
5745 "oldest txg: %"PRIu64", open txg: %"PRIu64"\n",
5746 tmp_cb->zcd_txg, txg);
5747 }
5748
5749 /*
5750 * Let's find the place to insert our callbacks.
5751 *
5752 * Even though the list is ordered by txg, it is possible for the
5753 * insertion point to not be the end because our txg may already be
5754 * quiescing at this point and other callbacks in the open txg
5755 * (from other objsets) may have sneaked in.
5756 */
5757 tmp_cb = list_tail(&zcl.zcl_callbacks);
5758 while (tmp_cb != NULL && tmp_cb->zcd_txg > txg)
5759 tmp_cb = list_prev(&zcl.zcl_callbacks, tmp_cb);
5760
5761 /* Add the 3 callbacks to the list */
5762 for (i = 0; i < 3; i++) {
5763 if (tmp_cb == NULL)
5764 list_insert_head(&zcl.zcl_callbacks, cb_data[i]);
5765 else
5766 list_insert_after(&zcl.zcl_callbacks, tmp_cb,
5767 cb_data[i]);
5768
5769 cb_data[i]->zcd_added = B_TRUE;
5770 VERIFY(!cb_data[i]->zcd_called);
5771
5772 tmp_cb = cb_data[i];
5773 }
5774
5775 zc_cb_counter += 3;
5776
5777 (void) mutex_exit(&zcl.zcl_callbacks_lock);
5778
5779 dmu_tx_commit(tx);
5780
5781 umem_free(od, sizeof (ztest_od_t));
5782 }
5783
5784 /*
5785 * Visit each object in the dataset. Verify that its properties
5786 * are consistent with what was stored in the block tag when it was created,
5787 * and that its unused bonus buffer space has not been overwritten.
5788 */ 5789 void 5790 ztest_verify_dnode_bt(ztest_ds_t *zd, uint64_t id) 5791 { 5792 (void) id; 5793 objset_t *os = zd->zd_os; 5794 uint64_t obj; 5795 int err = 0; 5796 5797 for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE, 0)) { 5798 ztest_block_tag_t *bt = NULL; 5799 dmu_object_info_t doi; 5800 dmu_buf_t *db; 5801 5802 ztest_object_lock(zd, obj, RL_READER); 5803 if (dmu_bonus_hold(os, obj, FTAG, &db) != 0) { 5804 ztest_object_unlock(zd, obj); 5805 continue; 5806 } 5807 5808 dmu_object_info_from_db(db, &doi); 5809 if (doi.doi_bonus_size >= sizeof (*bt)) 5810 bt = ztest_bt_bonus(db); 5811 5812 if (bt && bt->bt_magic == BT_MAGIC) { 5813 ztest_bt_verify(bt, os, obj, doi.doi_dnodesize, 5814 bt->bt_offset, bt->bt_gen, bt->bt_txg, 5815 bt->bt_crtxg); 5816 ztest_verify_unused_bonus(db, bt, obj, os, bt->bt_gen); 5817 } 5818 5819 dmu_buf_rele(db, FTAG); 5820 ztest_object_unlock(zd, obj); 5821 } 5822 } 5823 5824 void 5825 ztest_dsl_prop_get_set(ztest_ds_t *zd, uint64_t id) 5826 { 5827 (void) id; 5828 zfs_prop_t proplist[] = { 5829 ZFS_PROP_CHECKSUM, 5830 ZFS_PROP_COMPRESSION, 5831 ZFS_PROP_COPIES, 5832 ZFS_PROP_DEDUP 5833 }; 5834 5835 (void) pthread_rwlock_rdlock(&ztest_name_lock); 5836 5837 for (int p = 0; p < sizeof (proplist) / sizeof (proplist[0]); p++) 5838 (void) ztest_dsl_prop_set_uint64(zd->zd_name, proplist[p], 5839 ztest_random_dsl_prop(proplist[p]), (int)ztest_random(2)); 5840 5841 VERIFY0(ztest_dsl_prop_set_uint64(zd->zd_name, ZFS_PROP_RECORDSIZE, 5842 ztest_random_blocksize(), (int)ztest_random(2))); 5843 5844 (void) pthread_rwlock_unlock(&ztest_name_lock); 5845 } 5846 5847 void 5848 ztest_spa_prop_get_set(ztest_ds_t *zd, uint64_t id) 5849 { 5850 (void) zd, (void) id; 5851 nvlist_t *props = NULL; 5852 5853 (void) pthread_rwlock_rdlock(&ztest_name_lock); 5854 5855 (void) ztest_spa_prop_set_uint64(ZPOOL_PROP_AUTOTRIM, ztest_random(2)); 5856 5857 VERIFY0(spa_prop_get(ztest_spa, &props)); 5858 5859 if (ztest_opts.zo_verbose >= 6) 5860 dump_nvlist(props, 4); 5861 5862 fnvlist_free(props); 5863 5864 (void) pthread_rwlock_unlock(&ztest_name_lock); 5865 } 5866 5867 static int 5868 user_release_one(const char *snapname, const char *holdname) 5869 { 5870 nvlist_t *snaps, *holds; 5871 int error; 5872 5873 snaps = fnvlist_alloc(); 5874 holds = fnvlist_alloc(); 5875 fnvlist_add_boolean(holds, holdname); 5876 fnvlist_add_nvlist(snaps, snapname, holds); 5877 fnvlist_free(holds); 5878 error = dsl_dataset_user_release(snaps, NULL); 5879 fnvlist_free(snaps); 5880 return (error); 5881 } 5882 5883 /* 5884 * Test snapshot hold/release and deferred destroy. 5885 */ 5886 void 5887 ztest_dmu_snapshot_hold(ztest_ds_t *zd, uint64_t id) 5888 { 5889 int error; 5890 objset_t *os = zd->zd_os; 5891 objset_t *origin; 5892 char snapname[100]; 5893 char fullname[100]; 5894 char clonename[100]; 5895 char tag[100]; 5896 char osname[ZFS_MAX_DATASET_NAME_LEN]; 5897 nvlist_t *holds; 5898 5899 (void) pthread_rwlock_rdlock(&ztest_name_lock); 5900 5901 dmu_objset_name(os, osname); 5902 5903 (void) snprintf(snapname, sizeof (snapname), "sh1_%"PRIu64"", id); 5904 (void) snprintf(fullname, sizeof (fullname), "%s@%s", osname, snapname); 5905 (void) snprintf(clonename, sizeof (clonename), "%s/ch1_%"PRIu64"", 5906 osname, id); 5907 (void) snprintf(tag, sizeof (tag), "tag_%"PRIu64"", id); 5908 5909 /* 5910 * Clean up from any previous run. 
5911 */ 5912 error = dsl_destroy_head(clonename); 5913 if (error != ENOENT) 5914 ASSERT0(error); 5915 error = user_release_one(fullname, tag); 5916 if (error != ESRCH && error != ENOENT) 5917 ASSERT0(error); 5918 error = dsl_destroy_snapshot(fullname, B_FALSE); 5919 if (error != ENOENT) 5920 ASSERT0(error); 5921 5922 /* 5923 * Create snapshot, clone it, mark snap for deferred destroy, 5924 * destroy clone, verify snap was also destroyed. 5925 */ 5926 error = dmu_objset_snapshot_one(osname, snapname); 5927 if (error) { 5928 if (error == ENOSPC) { 5929 ztest_record_enospc("dmu_objset_snapshot"); 5930 goto out; 5931 } 5932 fatal(B_FALSE, "dmu_objset_snapshot(%s) = %d", fullname, error); 5933 } 5934 5935 error = dmu_objset_clone(clonename, fullname); 5936 if (error) { 5937 if (error == ENOSPC) { 5938 ztest_record_enospc("dmu_objset_clone"); 5939 goto out; 5940 } 5941 fatal(B_FALSE, "dmu_objset_clone(%s) = %d", clonename, error); 5942 } 5943 5944 error = dsl_destroy_snapshot(fullname, B_TRUE); 5945 if (error) { 5946 fatal(B_FALSE, "dsl_destroy_snapshot(%s, B_TRUE) = %d", 5947 fullname, error); 5948 } 5949 5950 error = dsl_destroy_head(clonename); 5951 if (error) 5952 fatal(B_FALSE, "dsl_destroy_head(%s) = %d", clonename, error); 5953 5954 error = dmu_objset_hold(fullname, FTAG, &origin); 5955 if (error != ENOENT) 5956 fatal(B_FALSE, "dmu_objset_hold(%s) = %d", fullname, error); 5957 5958 /* 5959 * Create snapshot, add temporary hold, verify that we can't 5960 * destroy a held snapshot, mark for deferred destroy, 5961 * release hold, verify snapshot was destroyed. 5962 */ 5963 error = dmu_objset_snapshot_one(osname, snapname); 5964 if (error) { 5965 if (error == ENOSPC) { 5966 ztest_record_enospc("dmu_objset_snapshot"); 5967 goto out; 5968 } 5969 fatal(B_FALSE, "dmu_objset_snapshot(%s) = %d", fullname, error); 5970 } 5971 5972 holds = fnvlist_alloc(); 5973 fnvlist_add_string(holds, fullname, tag); 5974 error = dsl_dataset_user_hold(holds, 0, NULL); 5975 fnvlist_free(holds); 5976 5977 if (error == ENOSPC) { 5978 ztest_record_enospc("dsl_dataset_user_hold"); 5979 goto out; 5980 } else if (error) { 5981 fatal(B_FALSE, "dsl_dataset_user_hold(%s, %s) = %u", 5982 fullname, tag, error); 5983 } 5984 5985 error = dsl_destroy_snapshot(fullname, B_FALSE); 5986 if (error != EBUSY) { 5987 fatal(B_FALSE, "dsl_destroy_snapshot(%s, B_FALSE) = %d", 5988 fullname, error); 5989 } 5990 5991 error = dsl_destroy_snapshot(fullname, B_TRUE); 5992 if (error) { 5993 fatal(B_FALSE, "dsl_destroy_snapshot(%s, B_TRUE) = %d", 5994 fullname, error); 5995 } 5996 5997 error = user_release_one(fullname, tag); 5998 if (error) 5999 fatal(B_FALSE, "user_release_one(%s, %s) = %d", 6000 fullname, tag, error); 6001 6002 VERIFY3U(dmu_objset_hold(fullname, FTAG, &origin), ==, ENOENT); 6003 6004 out: 6005 (void) pthread_rwlock_unlock(&ztest_name_lock); 6006 } 6007 6008 /* 6009 * Inject random faults into the on-disk data. 
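 * Roughly half the time we pick leaves of a normal or slog top-level vdev: leaf 0 sees transient failures (fd closed, cant_read/cant_write set) while a randomly chosen leaf gets garbage written over it; the rest of the time we target an l2cache device, where any number of faults is tolerated.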
6010 */ 6011 void 6012 ztest_fault_inject(ztest_ds_t *zd, uint64_t id) 6013 { 6014 (void) zd, (void) id; 6015 ztest_shared_t *zs = ztest_shared; 6016 spa_t *spa = ztest_spa; 6017 int fd; 6018 uint64_t offset; 6019 uint64_t leaves; 6020 uint64_t bad = 0x1990c0ffeedecadeull; 6021 uint64_t top, leaf; 6022 char *path0; 6023 char *pathrand; 6024 size_t fsize; 6025 int bshift = SPA_MAXBLOCKSHIFT + 2; 6026 int iters = 1000; 6027 int maxfaults; 6028 int mirror_save; 6029 vdev_t *vd0 = NULL; 6030 uint64_t guid0 = 0; 6031 boolean_t islog = B_FALSE; 6032 6033 path0 = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); 6034 pathrand = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); 6035 6036 mutex_enter(&ztest_vdev_lock); 6037 6038 /* 6039 * Device removal is in progress; fault injection must be disabled 6040 * until it completes and the pool is scrubbed. The fault injection 6041 * strategy for damaging blocks does not take into account evacuated 6042 * blocks which may have already been damaged. 6043 */ 6044 if (ztest_device_removal_active) { 6045 mutex_exit(&ztest_vdev_lock); 6046 goto out; 6047 } 6048 6049 maxfaults = MAXFAULTS(zs); 6050 leaves = MAX(zs->zs_mirrors, 1) * ztest_opts.zo_raid_children; 6051 mirror_save = zs->zs_mirrors; 6052 mutex_exit(&ztest_vdev_lock); 6053 6054 ASSERT3U(leaves, >=, 1); 6055 6056 /* 6057 * While ztest is running, the number of leaves will not change. This 6058 * is critical for the fault injection logic as it determines where 6059 * errors can be safely injected such that they are always repairable. 6060 * 6061 * When restarting ztest a different number of leaves may be requested 6062 * which will shift the regions to be damaged. This is fine as long 6063 * as the pool has been scrubbed prior to using the new mapping. 6064 * Failure to do so can result in non-repairable damage being injected. 6065 */ 6066 if (ztest_pool_scrubbed == B_FALSE) 6067 goto out; 6068 6069 /* 6070 * Grab the name lock as reader. There are some operations 6071 * which don't like to have their vdevs changed while 6072 * they are in progress (e.g. spa_change_guid). Those 6073 * operations will have grabbed the name lock as writer. 6074 */ 6075 (void) pthread_rwlock_rdlock(&ztest_name_lock); 6076 6077 /* 6078 * We need SCL_STATE here because we're going to look at vd0->vdev_tsd. 6079 */ 6080 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 6081 6082 if (ztest_random(2) == 0) { 6083 /* 6084 * Inject errors on a normal data device or slog device. 6085 */ 6086 top = ztest_random_vdev_top(spa, B_TRUE); 6087 leaf = ztest_random(leaves) + zs->zs_splits; 6088 6089 /* 6090 * Generate paths to the first leaf in this top-level vdev, 6091 * and to the random leaf we selected. We'll induce transient 6092 * write failures and random online/offline activity on leaf 0, 6093 * and we'll write random garbage to the randomly chosen leaf. 6094 */ 6095 (void) snprintf(path0, MAXPATHLEN, ztest_dev_template, 6096 ztest_opts.zo_dir, ztest_opts.zo_pool, 6097 top * leaves + zs->zs_splits); 6098 (void) snprintf(pathrand, MAXPATHLEN, ztest_dev_template, 6099 ztest_opts.zo_dir, ztest_opts.zo_pool, 6100 top * leaves + leaf); 6101 6102 vd0 = vdev_lookup_by_path(spa->spa_root_vdev, path0); 6103 if (vd0 != NULL && vd0->vdev_top->vdev_islog) 6104 islog = B_TRUE; 6105 6106 /* 6107 * If the top-level vdev needs to be resilvered 6108 * then we only allow faults on the device that is 6109 * resilvering.
6110 */ 6111 if (vd0 != NULL && maxfaults != 1 && 6112 (!vdev_resilver_needed(vd0->vdev_top, NULL, NULL) || 6113 vd0->vdev_resilver_txg != 0)) { 6114 /* 6115 * Make vd0 explicitly claim to be unreadable, 6116 * or unwritable, or reach behind its back 6117 * and close the underlying fd. We can do this if 6118 * maxfaults == 0 because we'll fail and reexecute, 6119 * and we can do it if maxfaults >= 2 because we'll 6120 * have enough redundancy. If maxfaults == 1, the 6121 * combination of this with injection of random data 6122 * corruption below exceeds the pool's fault tolerance. 6123 */ 6124 vdev_file_t *vf = vd0->vdev_tsd; 6125 6126 zfs_dbgmsg("injecting fault to vdev %llu; maxfaults=%d", 6127 (long long)vd0->vdev_id, (int)maxfaults); 6128 6129 if (vf != NULL && ztest_random(3) == 0) { 6130 (void) close(vf->vf_file->f_fd); 6131 vf->vf_file->f_fd = -1; 6132 } else if (ztest_random(2) == 0) { 6133 vd0->vdev_cant_read = B_TRUE; 6134 } else { 6135 vd0->vdev_cant_write = B_TRUE; 6136 } 6137 guid0 = vd0->vdev_guid; 6138 } 6139 } else { 6140 /* 6141 * Inject errors on an l2cache device. 6142 */ 6143 spa_aux_vdev_t *sav = &spa->spa_l2cache; 6144 6145 if (sav->sav_count == 0) { 6146 spa_config_exit(spa, SCL_STATE, FTAG); 6147 (void) pthread_rwlock_unlock(&ztest_name_lock); 6148 goto out; 6149 } 6150 vd0 = sav->sav_vdevs[ztest_random(sav->sav_count)]; 6151 guid0 = vd0->vdev_guid; 6152 (void) strlcpy(path0, vd0->vdev_path, MAXPATHLEN); 6153 (void) strlcpy(pathrand, vd0->vdev_path, MAXPATHLEN); 6154 6155 leaf = 0; 6156 leaves = 1; 6157 maxfaults = INT_MAX; /* no limit on cache devices */ 6158 } 6159 6160 spa_config_exit(spa, SCL_STATE, FTAG); 6161 (void) pthread_rwlock_unlock(&ztest_name_lock); 6162 6163 /* 6164 * If we can tolerate two or more faults, or we're dealing 6165 * with a slog, randomly online/offline vd0. 6166 */ 6167 if ((maxfaults >= 2 || islog) && guid0 != 0) { 6168 if (ztest_random(10) < 6) { 6169 int flags = (ztest_random(2) == 0 ? 6170 ZFS_OFFLINE_TEMPORARY : 0); 6171 6172 /* 6173 * We have to grab the zs_name_lock as writer to 6174 * prevent a race between offlining a slog and 6175 * destroying a dataset. Offlining the slog will 6176 * grab a reference on the dataset which may cause 6177 * dsl_destroy_head() to fail with EBUSY thus 6178 * leaving the dataset in an inconsistent state. 6179 */ 6180 if (islog) 6181 (void) pthread_rwlock_wrlock(&ztest_name_lock); 6182 6183 VERIFY3U(vdev_offline(spa, guid0, flags), !=, EBUSY); 6184 6185 if (islog) 6186 (void) pthread_rwlock_unlock(&ztest_name_lock); 6187 } else { 6188 /* 6189 * Ideally we would like to be able to randomly 6190 * call vdev_[on|off]line without holding locks 6191 * to force unpredictable failures but the side 6192 * effects of vdev_[on|off]line prevent us from 6193 * doing so. We grab the ztest_vdev_lock here to 6194 * prevent a race between injection testing and 6195 * aux_vdev removal. 6196 */ 6197 mutex_enter(&ztest_vdev_lock); 6198 (void) vdev_online(spa, guid0, 0, NULL); 6199 mutex_exit(&ztest_vdev_lock); 6200 } 6201 } 6202 6203 if (maxfaults == 0) 6204 goto out; 6205 6206 /* 6207 * We have at least single-fault tolerance, so inject data corruption. 
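 * The corruption itself is a distinctive 8-byte pattern (the "bad" word declared above) pwrite()n directly into the leaf vdev's backing file, bypassing ZFS entirely.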
6208 */ 6209 fd = open(pathrand, O_RDWR); 6210 6211 if (fd == -1) /* we hit a gap in the device namespace */ 6212 goto out; 6213 6214 fsize = lseek(fd, 0, SEEK_END); 6215 6216 while (--iters != 0) { 6217 /* 6218 * The offset must be chosen carefully to ensure that 6219 * we do not inject a given logical block with errors 6220 * on two different leaf devices, because ZFS cannot 6221 * tolerate that (if maxfaults==1). 6222 * 6223 * To achieve this we divide each leaf device into 6224 * chunks of size (# leaves * SPA_MAXBLOCKSIZE * 4). 6225 * Each chunk is further divided into error-injection 6226 * ranges (can accept errors) and clear ranges (we do 6227 * not inject errors in those). Each error-injection 6228 * range can accept errors only for a single leaf vdev. 6229 * Error-injection ranges are separated by clear ranges. 6230 * 6231 * For example, with 3 leaves, each chunk looks like: 6232 * 0 to 32M: injection range for leaf 0 6233 * 32M to 64M: clear range - no injection allowed 6234 * 64M to 96M: injection range for leaf 1 6235 * 96M to 128M: clear range - no injection allowed 6236 * 128M to 160M: injection range for leaf 2 6237 * 160M to 192M: clear range - no injection allowed 6238 * 6239 * Each clear range must be large enough such that a 6240 * single block cannot straddle it. This way a block 6241 * can't be a target in two different injection ranges 6242 * (on different leaf vdevs). 6243 */ 6244 offset = ztest_random(fsize / (leaves << bshift)) * 6245 (leaves << bshift) + (leaf << bshift) + 6246 (ztest_random(1ULL << (bshift - 1)) & -8ULL); 6247 6248 /* 6249 * Only allow damage to the labels at one end of the vdev. 6250 * 6251 * If all labels are damaged, the device will be totally 6252 * inaccessible, which will result in loss of data, 6253 * because we also damage (parts of) the other side of 6254 * the mirror/raidz. 6255 * 6256 * Additionally, we will always have both an even and an 6257 * odd label, so that we can handle crashes in the 6258 * middle of vdev_config_sync(). 6259 */ 6260 if ((leaf & 1) == 0 && offset < VDEV_LABEL_START_SIZE) 6261 continue; 6262 6263 /* 6264 * The two end labels are stored at the "end" of the disk, but 6265 * the end of the disk (vdev_psize) is aligned to 6266 * sizeof (vdev_label_t). 6267 */ 6268 uint64_t psize = P2ALIGN(fsize, sizeof (vdev_label_t)); 6269 if ((leaf & 1) == 1 && 6270 offset + sizeof (bad) > psize - VDEV_LABEL_END_SIZE) 6271 continue; 6272 6273 mutex_enter(&ztest_vdev_lock); 6274 if (mirror_save != zs->zs_mirrors) { 6275 mutex_exit(&ztest_vdev_lock); 6276 (void) close(fd); 6277 goto out; 6278 } 6279 6280 if (pwrite(fd, &bad, sizeof (bad), offset) != sizeof (bad)) 6281 fatal(B_TRUE, 6282 "can't inject bad word at 0x%"PRIx64" in %s", 6283 offset, pathrand); 6284 6285 mutex_exit(&ztest_vdev_lock); 6286 6287 if (ztest_opts.zo_verbose >= 7) 6288 (void) printf("injected bad word into %s," 6289 " offset 0x%"PRIx64"\n", pathrand, offset); 6290 } 6291 6292 (void) close(fd); 6293 out: 6294 umem_free(path0, MAXPATHLEN); 6295 umem_free(pathrand, MAXPATHLEN); 6296 } 6297 6298 /* 6299 * By design ztest will never inject uncorrectable damage into the pool. 6300 * Issue a scrub, wait for it to complete, and verify there is never any 6301 * persistent damage. 6302 * 6303 * Only after a full scrub has been completed is it safe to start injecting 6304 * data corruption. See the comment in ztest_fault_inject().
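 * ztest_scrub_impl() below returns ECKSUM if the error log is still non-empty once the scrub finishes, turning any persistent damage into an immediate test failure.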
6305 */ 6306 static int 6307 ztest_scrub_impl(spa_t *spa) 6308 { 6309 int error = spa_scan(spa, POOL_SCAN_SCRUB); 6310 if (error) 6311 return (error); 6312 6313 while (dsl_scan_scrubbing(spa_get_dsl(spa))) 6314 txg_wait_synced(spa_get_dsl(spa), 0); 6315 6316 if (spa_get_errlog_size(spa) > 0) 6317 return (ECKSUM); 6318 6319 ztest_pool_scrubbed = B_TRUE; 6320 6321 return (0); 6322 } 6323 6324 /* 6325 * Scrub the pool. 6326 */ 6327 void 6328 ztest_scrub(ztest_ds_t *zd, uint64_t id) 6329 { 6330 (void) zd, (void) id; 6331 spa_t *spa = ztest_spa; 6332 int error; 6333 6334 /* 6335 * Scrub in progress by device removal. 6336 */ 6337 if (ztest_device_removal_active) 6338 return; 6339 6340 /* 6341 * Start a scrub, wait a moment, then force a restart. 6342 */ 6343 (void) spa_scan(spa, POOL_SCAN_SCRUB); 6344 (void) poll(NULL, 0, 100); 6345 6346 error = ztest_scrub_impl(spa); 6347 if (error == EBUSY) 6348 error = 0; 6349 ASSERT0(error); 6350 } 6351 6352 /* 6353 * Change the guid for the pool. 6354 */ 6355 void 6356 ztest_reguid(ztest_ds_t *zd, uint64_t id) 6357 { 6358 (void) zd, (void) id; 6359 spa_t *spa = ztest_spa; 6360 uint64_t orig, load; 6361 int error; 6362 6363 if (ztest_opts.zo_mmp_test) 6364 return; 6365 6366 orig = spa_guid(spa); 6367 load = spa_load_guid(spa); 6368 6369 (void) pthread_rwlock_wrlock(&ztest_name_lock); 6370 error = spa_change_guid(spa); 6371 (void) pthread_rwlock_unlock(&ztest_name_lock); 6372 6373 if (error != 0) 6374 return; 6375 6376 if (ztest_opts.zo_verbose >= 4) { 6377 (void) printf("Changed guid old %"PRIu64" -> %"PRIu64"\n", 6378 orig, spa_guid(spa)); 6379 } 6380 6381 VERIFY3U(orig, !=, spa_guid(spa)); 6382 VERIFY3U(load, ==, spa_load_guid(spa)); 6383 } 6384 6385 void 6386 ztest_blake3(ztest_ds_t *zd, uint64_t id) 6387 { 6388 (void) zd, (void) id; 6389 hrtime_t end = gethrtime() + NANOSEC; 6390 zio_cksum_salt_t salt; 6391 void *salt_ptr = &salt.zcs_bytes; 6392 struct abd *abd_data, *abd_meta; 6393 void *buf, *templ; 6394 int i, *ptr; 6395 uint32_t size; 6396 BLAKE3_CTX ctx; 6397 6398 size = ztest_random_blocksize(); 6399 buf = umem_alloc(size, UMEM_NOFAIL); 6400 abd_data = abd_alloc(size, B_FALSE); 6401 abd_meta = abd_alloc(size, B_TRUE); 6402 6403 for (i = 0, ptr = buf; i < size / sizeof (*ptr); i++, ptr++) 6404 *ptr = ztest_random(UINT_MAX); 6405 memset(salt_ptr, 'A', 32); 6406 6407 abd_copy_from_buf_off(abd_data, buf, 0, size); 6408 abd_copy_from_buf_off(abd_meta, buf, 0, size); 6409 6410 while (gethrtime() <= end) { 6411 int run_count = 100; 6412 zio_cksum_t zc_ref1, zc_ref2; 6413 zio_cksum_t zc_res1, zc_res2; 6414 6415 void *ref1 = &zc_ref1; 6416 void *ref2 = &zc_ref2; 6417 void *res1 = &zc_res1; 6418 void *res2 = &zc_res2; 6419 6420 /* BLAKE3_KEY_LEN = 32 */ 6421 VERIFY0(blake3_impl_setname("generic")); 6422 templ = abd_checksum_blake3_tmpl_init(&salt); 6423 Blake3_InitKeyed(&ctx, salt_ptr); 6424 Blake3_Update(&ctx, buf, size); 6425 Blake3_Final(&ctx, ref1); 6426 zc_ref2 = zc_ref1; 6427 ZIO_CHECKSUM_BSWAP(&zc_ref2); 6428 abd_checksum_blake3_tmpl_free(templ); 6429 6430 VERIFY0(blake3_impl_setname("cycle")); 6431 while (run_count-- > 0) { 6432 6433 /* Test current implementation */ 6434 Blake3_InitKeyed(&ctx, salt_ptr); 6435 Blake3_Update(&ctx, buf, size); 6436 Blake3_Final(&ctx, res1); 6437 zc_res2 = zc_res1; 6438 ZIO_CHECKSUM_BSWAP(&zc_res2); 6439 6440 VERIFY0(memcmp(ref1, res1, 32)); 6441 VERIFY0(memcmp(ref2, res2, 32)); 6442 6443 /* Test ABD - data */ 6444 templ = abd_checksum_blake3_tmpl_init(&salt); 6445 abd_checksum_blake3_native(abd_data, size, 6446 templ, 
&zc_res1); 6447 abd_checksum_blake3_byteswap(abd_data, size, 6448 templ, &zc_res2); 6449 6450 VERIFY0(memcmp(ref1, res1, 32)); 6451 VERIFY0(memcmp(ref2, res2, 32)); 6452 6453 /* Test ABD - metadata */ 6454 abd_checksum_blake3_native(abd_meta, size, 6455 templ, &zc_res1); 6456 abd_checksum_blake3_byteswap(abd_meta, size, 6457 templ, &zc_res2); 6458 abd_checksum_blake3_tmpl_free(templ); 6459 6460 VERIFY0(memcmp(ref1, res1, 32)); 6461 VERIFY0(memcmp(ref2, res2, 32)); 6462 6463 } 6464 } 6465 6466 abd_free(abd_data); 6467 abd_free(abd_meta); 6468 umem_free(buf, size); 6469 } 6470 6471 void 6472 ztest_fletcher(ztest_ds_t *zd, uint64_t id) 6473 { 6474 (void) zd, (void) id; 6475 hrtime_t end = gethrtime() + NANOSEC; 6476 6477 while (gethrtime() <= end) { 6478 int run_count = 100; 6479 void *buf; 6480 struct abd *abd_data, *abd_meta; 6481 uint32_t size; 6482 int *ptr; 6483 int i; 6484 zio_cksum_t zc_ref; 6485 zio_cksum_t zc_ref_byteswap; 6486 6487 size = ztest_random_blocksize(); 6488 6489 buf = umem_alloc(size, UMEM_NOFAIL); 6490 abd_data = abd_alloc(size, B_FALSE); 6491 abd_meta = abd_alloc(size, B_TRUE); 6492 6493 for (i = 0, ptr = buf; i < size / sizeof (*ptr); i++, ptr++) 6494 *ptr = ztest_random(UINT_MAX); 6495 6496 abd_copy_from_buf_off(abd_data, buf, 0, size); 6497 abd_copy_from_buf_off(abd_meta, buf, 0, size); 6498 6499 VERIFY0(fletcher_4_impl_set("scalar")); 6500 fletcher_4_native(buf, size, NULL, &zc_ref); 6501 fletcher_4_byteswap(buf, size, NULL, &zc_ref_byteswap); 6502 6503 VERIFY0(fletcher_4_impl_set("cycle")); 6504 while (run_count-- > 0) { 6505 zio_cksum_t zc; 6506 zio_cksum_t zc_byteswap; 6507 6508 fletcher_4_byteswap(buf, size, NULL, &zc_byteswap); 6509 fletcher_4_native(buf, size, NULL, &zc); 6510 6511 VERIFY0(memcmp(&zc, &zc_ref, sizeof (zc))); 6512 VERIFY0(memcmp(&zc_byteswap, &zc_ref_byteswap, 6513 sizeof (zc_byteswap))); 6514 6515 /* Test ABD - data */ 6516 abd_fletcher_4_byteswap(abd_data, size, NULL, 6517 &zc_byteswap); 6518 abd_fletcher_4_native(abd_data, size, NULL, &zc); 6519 6520 VERIFY0(memcmp(&zc, &zc_ref, sizeof (zc))); 6521 VERIFY0(memcmp(&zc_byteswap, &zc_ref_byteswap, 6522 sizeof (zc_byteswap))); 6523 6524 /* Test ABD - metadata */ 6525 abd_fletcher_4_byteswap(abd_meta, size, NULL, 6526 &zc_byteswap); 6527 abd_fletcher_4_native(abd_meta, size, NULL, &zc); 6528 6529 VERIFY0(memcmp(&zc, &zc_ref, sizeof (zc))); 6530 VERIFY0(memcmp(&zc_byteswap, &zc_ref_byteswap, 6531 sizeof (zc_byteswap))); 6532 6533 } 6534 6535 umem_free(buf, size); 6536 abd_free(abd_data); 6537 abd_free(abd_meta); 6538 } 6539 } 6540 6541 void 6542 ztest_fletcher_incr(ztest_ds_t *zd, uint64_t id) 6543 { 6544 (void) zd, (void) id; 6545 void *buf; 6546 size_t size; 6547 int *ptr; 6548 int i; 6549 zio_cksum_t zc_ref; 6550 zio_cksum_t zc_ref_bswap; 6551 6552 hrtime_t end = gethrtime() + NANOSEC; 6553 6554 while (gethrtime() <= end) { 6555 int run_count = 100; 6556 6557 size = ztest_random_blocksize(); 6558 buf = umem_alloc(size, UMEM_NOFAIL); 6559 6560 for (i = 0, ptr = buf; i < size / sizeof (*ptr); i++, ptr++) 6561 *ptr = ztest_random(UINT_MAX); 6562 6563 VERIFY0(fletcher_4_impl_set("scalar")); 6564 fletcher_4_native(buf, size, NULL, &zc_ref); 6565 fletcher_4_byteswap(buf, size, NULL, &zc_ref_bswap); 6566 6567 VERIFY0(fletcher_4_impl_set("cycle")); 6568 6569 while (run_count-- > 0) { 6570 zio_cksum_t zc; 6571 zio_cksum_t zc_bswap; 6572 size_t pos = 0; 6573 6574 ZIO_SET_CHECKSUM(&zc, 0, 0, 0, 0); 6575 ZIO_SET_CHECKSUM(&zc_bswap, 0, 0, 0, 0); 6576 6577 while (pos < size) { 6578 size_t inc = 64 * 
ztest_random(size / 67); 6579 /* sometimes add few bytes to test non-simd */ 6580 if (ztest_random(100) < 10) 6581 inc += P2ALIGN(ztest_random(64), 6582 sizeof (uint32_t)); 6583 6584 if (inc > (size - pos)) 6585 inc = size - pos; 6586 6587 fletcher_4_incremental_native(buf + pos, inc, 6588 &zc); 6589 fletcher_4_incremental_byteswap(buf + pos, inc, 6590 &zc_bswap); 6591 6592 pos += inc; 6593 } 6594 6595 VERIFY3U(pos, ==, size); 6596 6597 VERIFY(ZIO_CHECKSUM_EQUAL(zc, zc_ref)); 6598 VERIFY(ZIO_CHECKSUM_EQUAL(zc_bswap, zc_ref_bswap)); 6599 6600 /* 6601 * verify if incremental on the whole buffer is 6602 * equivalent to non-incremental version 6603 */ 6604 ZIO_SET_CHECKSUM(&zc, 0, 0, 0, 0); 6605 ZIO_SET_CHECKSUM(&zc_bswap, 0, 0, 0, 0); 6606 6607 fletcher_4_incremental_native(buf, size, &zc); 6608 fletcher_4_incremental_byteswap(buf, size, &zc_bswap); 6609 6610 VERIFY(ZIO_CHECKSUM_EQUAL(zc, zc_ref)); 6611 VERIFY(ZIO_CHECKSUM_EQUAL(zc_bswap, zc_ref_bswap)); 6612 } 6613 6614 umem_free(buf, size); 6615 } 6616 } 6617 6618 static int 6619 ztest_set_global_vars(void) 6620 { 6621 for (size_t i = 0; i < ztest_opts.zo_gvars_count; i++) { 6622 char *kv = ztest_opts.zo_gvars[i]; 6623 VERIFY3U(strlen(kv), <=, ZO_GVARS_MAX_ARGLEN); 6624 VERIFY3U(strlen(kv), >, 0); 6625 int err = set_global_var(kv); 6626 if (ztest_opts.zo_verbose > 0) { 6627 (void) printf("setting global var %s ... %s\n", kv, 6628 err ? "failed" : "ok"); 6629 } 6630 if (err != 0) { 6631 (void) fprintf(stderr, 6632 "failed to set global var '%s'\n", kv); 6633 return (err); 6634 } 6635 } 6636 return (0); 6637 } 6638 6639 static char ** 6640 ztest_global_vars_to_zdb_args(void) 6641 { 6642 char **args = calloc(2*ztest_opts.zo_gvars_count + 1, sizeof (char *)); 6643 char **cur = args; 6644 if (args == NULL) 6645 return (NULL); 6646 for (size_t i = 0; i < ztest_opts.zo_gvars_count; i++) { 6647 *cur++ = (char *)"-o"; 6648 *cur++ = ztest_opts.zo_gvars[i]; 6649 } 6650 ASSERT3P(cur, ==, &args[2*ztest_opts.zo_gvars_count]); 6651 *cur = NULL; 6652 return (args); 6653 } 6654 6655 /* The end of strings is indicated by a NULL element */ 6656 static char * 6657 join_strings(char **strings, const char *sep) 6658 { 6659 size_t totallen = 0; 6660 for (char **sp = strings; *sp != NULL; sp++) { 6661 totallen += strlen(*sp); 6662 totallen += strlen(sep); 6663 } 6664 if (totallen > 0) { 6665 ASSERT(totallen >= strlen(sep)); 6666 totallen -= strlen(sep); 6667 } 6668 6669 size_t buflen = totallen + 1; 6670 char *o = umem_alloc(buflen, UMEM_NOFAIL); /* trailing 0 byte */ 6671 o[0] = '\0'; 6672 for (char **sp = strings; *sp != NULL; sp++) { 6673 size_t would; 6674 would = strlcat(o, *sp, buflen); 6675 VERIFY3U(would, <, buflen); 6676 if (*(sp+1) == NULL) { 6677 break; 6678 } 6679 would = strlcat(o, sep, buflen); 6680 VERIFY3U(would, <, buflen); 6681 } 6682 ASSERT3S(strlen(o), ==, totallen); 6683 return (o); 6684 } 6685 6686 static int 6687 ztest_check_path(char *path) 6688 { 6689 struct stat s; 6690 /* return true on success */ 6691 return (!stat(path, &s)); 6692 } 6693 6694 static void 6695 ztest_get_zdb_bin(char *bin, int len) 6696 { 6697 char *zdb_path; 6698 /* 6699 * Try to use $ZDB and in-tree zdb path. If not successful, just 6700 * let popen to search through PATH. 
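 * Setting ZDB in the environment pins the binary explicitly (for example ZDB=/usr/sbin/zdb, path purely illustrative); an invalid path is treated as fatal rather than silently falling back.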
6701 */ 6702 if ((zdb_path = getenv("ZDB"))) { 6703 strlcpy(bin, zdb_path, len); /* In env */ 6704 if (!ztest_check_path(bin)) { 6705 ztest_dump_core = 0; 6706 fatal(B_TRUE, "invalid ZDB '%s'", bin); 6707 } 6708 return; 6709 } 6710 6711 VERIFY3P(realpath(getexecname(), bin), !=, NULL); 6712 if (strstr(bin, ".libs/ztest")) { 6713 strstr(bin, ".libs/ztest")[0] = '\0'; /* In-tree */ 6714 strcat(bin, "zdb"); 6715 if (ztest_check_path(bin)) 6716 return; 6717 } 6718 strcpy(bin, "zdb"); 6719 } 6720 6721 static vdev_t * 6722 ztest_random_concrete_vdev_leaf(vdev_t *vd) 6723 { 6724 if (vd == NULL) 6725 return (NULL); 6726 6727 if (vd->vdev_children == 0) 6728 return (vd); 6729 6730 vdev_t *eligible[vd->vdev_children]; 6731 int eligible_idx = 0, i; 6732 for (i = 0; i < vd->vdev_children; i++) { 6733 vdev_t *cvd = vd->vdev_child[i]; 6734 if (cvd->vdev_top->vdev_removing) 6735 continue; 6736 if (cvd->vdev_children > 0 || 6737 (vdev_is_concrete(cvd) && !cvd->vdev_detached)) { 6738 eligible[eligible_idx++] = cvd; 6739 } 6740 } 6741 VERIFY3S(eligible_idx, >, 0); 6742 6743 uint64_t child_no = ztest_random(eligible_idx); 6744 return (ztest_random_concrete_vdev_leaf(eligible[child_no])); 6745 } 6746 6747 void 6748 ztest_initialize(ztest_ds_t *zd, uint64_t id) 6749 { 6750 (void) zd, (void) id; 6751 spa_t *spa = ztest_spa; 6752 int error = 0; 6753 6754 mutex_enter(&ztest_vdev_lock); 6755 6756 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 6757 6758 /* Random leaf vdev */ 6759 vdev_t *rand_vd = ztest_random_concrete_vdev_leaf(spa->spa_root_vdev); 6760 if (rand_vd == NULL) { 6761 spa_config_exit(spa, SCL_VDEV, FTAG); 6762 mutex_exit(&ztest_vdev_lock); 6763 return; 6764 } 6765 6766 /* 6767 * The random vdev we've selected may change as soon as we 6768 * drop the spa_config_lock. We create local copies of things 6769 * we're interested in. 
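 * Only the guid, the path, and whether an initialize thread is already running need to outlive the lock.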
6770 */ 6771 uint64_t guid = rand_vd->vdev_guid; 6772 char *path = strdup(rand_vd->vdev_path); 6773 boolean_t active = rand_vd->vdev_initialize_thread != NULL; 6774 6775 zfs_dbgmsg("vd %px, guid %llu", rand_vd, (u_longlong_t)guid); 6776 spa_config_exit(spa, SCL_VDEV, FTAG); 6777 6778 uint64_t cmd = ztest_random(POOL_INITIALIZE_FUNCS); 6779 6780 nvlist_t *vdev_guids = fnvlist_alloc(); 6781 nvlist_t *vdev_errlist = fnvlist_alloc(); 6782 fnvlist_add_uint64(vdev_guids, path, guid); 6783 error = spa_vdev_initialize(spa, vdev_guids, cmd, vdev_errlist); 6784 fnvlist_free(vdev_guids); 6785 fnvlist_free(vdev_errlist); 6786 6787 switch (cmd) { 6788 case POOL_INITIALIZE_CANCEL: 6789 if (ztest_opts.zo_verbose >= 4) { 6790 (void) printf("Cancel initialize %s", path); 6791 if (!active) 6792 (void) printf(" failed (no initialize active)"); 6793 (void) printf("\n"); 6794 } 6795 break; 6796 case POOL_INITIALIZE_START: 6797 if (ztest_opts.zo_verbose >= 4) { 6798 (void) printf("Start initialize %s", path); 6799 if (active && error == 0) 6800 (void) printf(" failed (already active)"); 6801 else if (error != 0) 6802 (void) printf(" failed (error %d)", error); 6803 (void) printf("\n"); 6804 } 6805 break; 6806 case POOL_INITIALIZE_SUSPEND: 6807 if (ztest_opts.zo_verbose >= 4) { 6808 (void) printf("Suspend initialize %s", path); 6809 if (!active) 6810 (void) printf(" failed (no initialize active)"); 6811 (void) printf("\n"); 6812 } 6813 break; 6814 } 6815 free(path); 6816 mutex_exit(&ztest_vdev_lock); 6817 } 6818 6819 void 6820 ztest_trim(ztest_ds_t *zd, uint64_t id) 6821 { 6822 (void) zd, (void) id; 6823 spa_t *spa = ztest_spa; 6824 int error = 0; 6825 6826 mutex_enter(&ztest_vdev_lock); 6827 6828 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 6829 6830 /* Random leaf vdev */ 6831 vdev_t *rand_vd = ztest_random_concrete_vdev_leaf(spa->spa_root_vdev); 6832 if (rand_vd == NULL) { 6833 spa_config_exit(spa, SCL_VDEV, FTAG); 6834 mutex_exit(&ztest_vdev_lock); 6835 return; 6836 } 6837 6838 /* 6839 * The random vdev we've selected may change as soon as we 6840 * drop the spa_config_lock. We create local copies of things 6841 * we're interested in. 
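 * As in ztest_initialize() above, only the guid, the path, and the active-TRIM flag are needed once the lock is dropped.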
6842 */ 6843 uint64_t guid = rand_vd->vdev_guid; 6844 char *path = strdup(rand_vd->vdev_path); 6845 boolean_t active = rand_vd->vdev_trim_thread != NULL; 6846 6847 zfs_dbgmsg("vd %p, guid %llu", rand_vd, (u_longlong_t)guid); 6848 spa_config_exit(spa, SCL_VDEV, FTAG); 6849 6850 uint64_t cmd = ztest_random(POOL_TRIM_FUNCS); 6851 uint64_t rate = 1 << ztest_random(30); 6852 boolean_t partial = (ztest_random(5) > 0); 6853 boolean_t secure = (ztest_random(5) > 0); 6854 6855 nvlist_t *vdev_guids = fnvlist_alloc(); 6856 nvlist_t *vdev_errlist = fnvlist_alloc(); 6857 fnvlist_add_uint64(vdev_guids, path, guid); 6858 error = spa_vdev_trim(spa, vdev_guids, cmd, rate, partial, 6859 secure, vdev_errlist); 6860 fnvlist_free(vdev_guids); 6861 fnvlist_free(vdev_errlist); 6862 6863 switch (cmd) { 6864 case POOL_TRIM_CANCEL: 6865 if (ztest_opts.zo_verbose >= 4) { 6866 (void) printf("Cancel TRIM %s", path); 6867 if (!active) 6868 (void) printf(" failed (no TRIM active)"); 6869 (void) printf("\n"); 6870 } 6871 break; 6872 case POOL_TRIM_START: 6873 if (ztest_opts.zo_verbose >= 4) { 6874 (void) printf("Start TRIM %s", path); 6875 if (active && error == 0) 6876 (void) printf(" failed (already active)"); 6877 else if (error != 0) 6878 (void) printf(" failed (error %d)", error); 6879 (void) printf("\n"); 6880 } 6881 break; 6882 case POOL_TRIM_SUSPEND: 6883 if (ztest_opts.zo_verbose >= 4) { 6884 (void) printf("Suspend TRIM %s", path); 6885 if (!active) 6886 (void) printf(" failed (no TRIM active)"); 6887 (void) printf("\n"); 6888 } 6889 break; 6890 } 6891 free(path); 6892 mutex_exit(&ztest_vdev_lock); 6893 } 6894 6895 /* 6896 * Verify pool integrity by running zdb. 6897 */ 6898 static void 6899 ztest_run_zdb(const char *pool) 6900 { 6901 int status; 6902 char *bin; 6903 char *zdb; 6904 char *zbuf; 6905 const int len = MAXPATHLEN + MAXNAMELEN + 20; 6906 FILE *fp; 6907 6908 bin = umem_alloc(len, UMEM_NOFAIL); 6909 zdb = umem_alloc(len, UMEM_NOFAIL); 6910 zbuf = umem_alloc(1024, UMEM_NOFAIL); 6911 6912 ztest_get_zdb_bin(bin, len); 6913 6914 char **set_gvars_args = ztest_global_vars_to_zdb_args(); 6915 if (set_gvars_args == NULL) { 6916 fatal(B_FALSE, "Failed to allocate memory in " 6917 "ztest_global_vars_to_zdb_args(). Cannot run zdb.\n"); 6918 } 6919 char *set_gvars_args_joined = join_strings(set_gvars_args, " "); 6920 free(set_gvars_args); 6921 6922 size_t would = snprintf(zdb, len, 6923 "%s -bcc%s%s -G -d -Y -e -y %s -p %s %s", 6924 bin, 6925 ztest_opts.zo_verbose >= 3 ? "s" : "", 6926 ztest_opts.zo_verbose >= 4 ? 
"v" : "", 6927 set_gvars_args_joined, 6928 ztest_opts.zo_dir, 6929 pool); 6930 ASSERT3U(would, <, len); 6931 6932 umem_free(set_gvars_args_joined, strlen(set_gvars_args_joined) + 1); 6933 6934 if (ztest_opts.zo_verbose >= 5) 6935 (void) printf("Executing %s\n", zdb); 6936 6937 fp = popen(zdb, "r"); 6938 6939 while (fgets(zbuf, 1024, fp) != NULL) 6940 if (ztest_opts.zo_verbose >= 3) 6941 (void) printf("%s", zbuf); 6942 6943 status = pclose(fp); 6944 6945 if (status == 0) 6946 goto out; 6947 6948 ztest_dump_core = 0; 6949 if (WIFEXITED(status)) 6950 fatal(B_FALSE, "'%s' exit code %d", zdb, WEXITSTATUS(status)); 6951 else 6952 fatal(B_FALSE, "'%s' died with signal %d", 6953 zdb, WTERMSIG(status)); 6954 out: 6955 umem_free(bin, len); 6956 umem_free(zdb, len); 6957 umem_free(zbuf, 1024); 6958 } 6959 6960 static void 6961 ztest_walk_pool_directory(const char *header) 6962 { 6963 spa_t *spa = NULL; 6964 6965 if (ztest_opts.zo_verbose >= 6) 6966 (void) puts(header); 6967 6968 mutex_enter(&spa_namespace_lock); 6969 while ((spa = spa_next(spa)) != NULL) 6970 if (ztest_opts.zo_verbose >= 6) 6971 (void) printf("\t%s\n", spa_name(spa)); 6972 mutex_exit(&spa_namespace_lock); 6973 } 6974 6975 static void 6976 ztest_spa_import_export(char *oldname, char *newname) 6977 { 6978 nvlist_t *config, *newconfig; 6979 uint64_t pool_guid; 6980 spa_t *spa; 6981 int error; 6982 6983 if (ztest_opts.zo_verbose >= 4) { 6984 (void) printf("import/export: old = %s, new = %s\n", 6985 oldname, newname); 6986 } 6987 6988 /* 6989 * Clean up from previous runs. 6990 */ 6991 (void) spa_destroy(newname); 6992 6993 /* 6994 * Get the pool's configuration and guid. 6995 */ 6996 VERIFY0(spa_open(oldname, &spa, FTAG)); 6997 6998 /* 6999 * Kick off a scrub to tickle scrub/export races. 7000 */ 7001 if (ztest_random(2) == 0) 7002 (void) spa_scan(spa, POOL_SCAN_SCRUB); 7003 7004 pool_guid = spa_guid(spa); 7005 spa_close(spa, FTAG); 7006 7007 ztest_walk_pool_directory("pools before export"); 7008 7009 /* 7010 * Export it. 7011 */ 7012 VERIFY0(spa_export(oldname, &config, B_FALSE, B_FALSE)); 7013 7014 ztest_walk_pool_directory("pools after export"); 7015 7016 /* 7017 * Try to import it. 7018 */ 7019 newconfig = spa_tryimport(config); 7020 ASSERT3P(newconfig, !=, NULL); 7021 fnvlist_free(newconfig); 7022 7023 /* 7024 * Import it under the new name. 7025 */ 7026 error = spa_import(newname, config, NULL, 0); 7027 if (error != 0) { 7028 dump_nvlist(config, 0); 7029 fatal(B_FALSE, "couldn't import pool %s as %s: error %u", 7030 oldname, newname, error); 7031 } 7032 7033 ztest_walk_pool_directory("pools after import"); 7034 7035 /* 7036 * Try to import it again -- should fail with EEXIST. 7037 */ 7038 VERIFY3U(EEXIST, ==, spa_import(newname, config, NULL, 0)); 7039 7040 /* 7041 * Try to import it under a different name -- should fail with EEXIST. 7042 */ 7043 VERIFY3U(EEXIST, ==, spa_import(oldname, config, NULL, 0)); 7044 7045 /* 7046 * Verify that the pool is no longer visible under the old name. 7047 */ 7048 VERIFY3U(ENOENT, ==, spa_open(oldname, &spa, FTAG)); 7049 7050 /* 7051 * Verify that we can open and close the pool using the new name. 
7052 */ 7053 VERIFY0(spa_open(newname, &spa, FTAG)); 7054 ASSERT3U(pool_guid, ==, spa_guid(spa)); 7055 spa_close(spa, FTAG); 7056 7057 fnvlist_free(config); 7058 } 7059 7060 static void 7061 ztest_resume(spa_t *spa) 7062 { 7063 if (spa_suspended(spa) && ztest_opts.zo_verbose >= 6) 7064 (void) printf("resuming from suspended state\n"); 7065 spa_vdev_state_enter(spa, SCL_NONE); 7066 vdev_clear(spa, NULL); 7067 (void) spa_vdev_state_exit(spa, NULL, 0); 7068 (void) zio_resume(spa); 7069 } 7070 7071 static __attribute__((noreturn)) void 7072 ztest_resume_thread(void *arg) 7073 { 7074 spa_t *spa = arg; 7075 7076 while (!ztest_exiting) { 7077 if (spa_suspended(spa)) 7078 ztest_resume(spa); 7079 (void) poll(NULL, 0, 100); 7080 7081 /* 7082 * Periodically change the zfs_compressed_arc_enabled setting. 7083 */ 7084 if (ztest_random(10) == 0) 7085 zfs_compressed_arc_enabled = ztest_random(2); 7086 7087 /* 7088 * Periodically change the zfs_abd_scatter_enabled setting. 7089 */ 7090 if (ztest_random(10) == 0) 7091 zfs_abd_scatter_enabled = ztest_random(2); 7092 } 7093 7094 thread_exit(); 7095 } 7096 7097 static __attribute__((noreturn)) void 7098 ztest_deadman_thread(void *arg) 7099 { 7100 ztest_shared_t *zs = arg; 7101 spa_t *spa = ztest_spa; 7102 hrtime_t delay, overdue, last_run = gethrtime(); 7103 7104 delay = (zs->zs_thread_stop - zs->zs_thread_start) + 7105 MSEC2NSEC(zfs_deadman_synctime_ms); 7106 7107 while (!ztest_exiting) { 7108 /* 7109 * Wait for the delay timer while checking occasionally 7110 * if we should stop. 7111 */ 7112 if (gethrtime() < last_run + delay) { 7113 (void) poll(NULL, 0, 1000); 7114 continue; 7115 } 7116 7117 /* 7118 * If the pool is suspended then fail immediately. Otherwise, 7119 * check to see if the pool is making any progress. If 7120 * vdev_deadman() discovers that there hasn't been any recent 7121 * I/Os then it will end up aborting the tests. 7122 */ 7123 if (spa_suspended(spa) || spa->spa_root_vdev == NULL) { 7124 fatal(B_FALSE, 7125 "aborting test after %llu seconds because " 7126 "pool has transitioned to a suspended state.", 7127 (u_longlong_t)zfs_deadman_synctime_ms / 1000); 7128 } 7129 vdev_deadman(spa->spa_root_vdev, FTAG); 7130 7131 /* 7132 * If the process doesn't complete within a grace period of 7133 * zfs_deadman_synctime_ms over the expected finish time, 7134 * then it may be hung and is terminated. 
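 * zs_proc_stop marks when the whole run was expected to finish, so the grace period only starts counting once the process is already overdue.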
7135 */ 7136 overdue = zs->zs_proc_stop + MSEC2NSEC(zfs_deadman_synctime_ms); 7137 if (gethrtime() > overdue) { 7138 fatal(B_FALSE, 7139 "aborting test after %llu seconds because " 7140 "the process is overdue for termination.", 7141 (gethrtime() - zs->zs_proc_start) / NANOSEC); 7142 } 7143 7144 (void) printf("ztest has been running for %lld seconds\n", 7145 (gethrtime() - zs->zs_proc_start) / NANOSEC); 7146 7147 last_run = gethrtime(); 7148 delay = MSEC2NSEC(zfs_deadman_checktime_ms); 7149 } 7150 7151 thread_exit(); 7152 } 7153 7154 static void 7155 ztest_execute(int test, ztest_info_t *zi, uint64_t id) 7156 { 7157 ztest_ds_t *zd = &ztest_ds[id % ztest_opts.zo_datasets]; 7158 ztest_shared_callstate_t *zc = ZTEST_GET_SHARED_CALLSTATE(test); 7159 hrtime_t functime = gethrtime(); 7160 int i; 7161 7162 for (i = 0; i < zi->zi_iters; i++) 7163 zi->zi_func(zd, id); 7164 7165 functime = gethrtime() - functime; 7166 7167 atomic_add_64(&zc->zc_count, 1); 7168 atomic_add_64(&zc->zc_time, functime); 7169 7170 if (ztest_opts.zo_verbose >= 4) 7171 (void) printf("%6.2f sec in %s\n", 7172 (double)functime / NANOSEC, zi->zi_funcname); 7173 } 7174 7175 static __attribute__((noreturn)) void 7176 ztest_thread(void *arg) 7177 { 7178 int rand; 7179 uint64_t id = (uintptr_t)arg; 7180 ztest_shared_t *zs = ztest_shared; 7181 uint64_t call_next; 7182 hrtime_t now; 7183 ztest_info_t *zi; 7184 ztest_shared_callstate_t *zc; 7185 7186 while ((now = gethrtime()) < zs->zs_thread_stop) { 7187 /* 7188 * See if it's time to force a crash. 7189 */ 7190 if (now > zs->zs_thread_kill) 7191 ztest_kill(zs); 7192 7193 /* 7194 * If we're getting ENOSPC with some regularity, stop. 7195 */ 7196 if (zs->zs_enospc_count > 10) 7197 break; 7198 7199 /* 7200 * Pick a random function to execute. 7201 */ 7202 rand = ztest_random(ZTEST_FUNCS); 7203 zi = &ztest_info[rand]; 7204 zc = ZTEST_GET_SHARED_CALLSTATE(rand); 7205 call_next = zc->zc_next; 7206 7207 if (now >= call_next && 7208 atomic_cas_64(&zc->zc_next, call_next, call_next + 7209 ztest_random(2 * zi->zi_interval[0] + 1)) == call_next) { 7210 ztest_execute(rand, zi, id); 7211 } 7212 } 7213 7214 thread_exit(); 7215 } 7216 7217 static void 7218 ztest_dataset_name(char *dsname, const char *pool, int d) 7219 { 7220 (void) snprintf(dsname, ZFS_MAX_DATASET_NAME_LEN, "%s/ds_%d", pool, d); 7221 } 7222 7223 static void 7224 ztest_dataset_destroy(int d) 7225 { 7226 char name[ZFS_MAX_DATASET_NAME_LEN]; 7227 int t; 7228 7229 ztest_dataset_name(name, ztest_opts.zo_pool, d); 7230 7231 if (ztest_opts.zo_verbose >= 3) 7232 (void) printf("Destroying %s to free up space\n", name); 7233 7234 /* 7235 * Cleanup any non-standard clones and snapshots. In general, 7236 * ztest thread t operates on dataset (t % zopt_datasets), 7237 * so there may be more than one thing to clean up. 7238 */ 7239 for (t = d; t < ztest_opts.zo_threads; 7240 t += ztest_opts.zo_datasets) 7241 ztest_dsl_dataset_cleanup(name, t); 7242 7243 (void) dmu_objset_find(name, ztest_objset_destroy_cb, NULL, 7244 DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN); 7245 } 7246 7247 static void 7248 ztest_dataset_dirobj_verify(ztest_ds_t *zd) 7249 { 7250 uint64_t usedobjs, dirobjs, scratch; 7251 7252 /* 7253 * ZTEST_DIROBJ is the object directory for the entire dataset. 7254 * Therefore, the number of objects in use should equal the 7255 * number of ZTEST_DIROBJ entries, +1 for ZTEST_DIROBJ itself. 7256 * If not, we have an object leak. 
7257 * 7258 * Note that we can only check this in ztest_dataset_open(), 7259 * when the open-context and syncing-context values agree. 7260 * That's because zap_count() returns the open-context value, 7261 * while dmu_objset_space() returns the rootbp fill count. 7262 */ 7263 VERIFY0(zap_count(zd->zd_os, ZTEST_DIROBJ, &dirobjs)); 7264 dmu_objset_space(zd->zd_os, &scratch, &scratch, &usedobjs, &scratch); 7265 ASSERT3U(dirobjs + 1, ==, usedobjs); 7266 } 7267 7268 static int 7269 ztest_dataset_open(int d) 7270 { 7271 ztest_ds_t *zd = &ztest_ds[d]; 7272 uint64_t committed_seq = ZTEST_GET_SHARED_DS(d)->zd_seq; 7273 objset_t *os; 7274 zilog_t *zilog; 7275 char name[ZFS_MAX_DATASET_NAME_LEN]; 7276 int error; 7277 7278 ztest_dataset_name(name, ztest_opts.zo_pool, d); 7279 7280 (void) pthread_rwlock_rdlock(&ztest_name_lock); 7281 7282 error = ztest_dataset_create(name); 7283 if (error == ENOSPC) { 7284 (void) pthread_rwlock_unlock(&ztest_name_lock); 7285 ztest_record_enospc(FTAG); 7286 return (error); 7287 } 7288 ASSERT(error == 0 || error == EEXIST); 7289 7290 VERIFY0(ztest_dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, 7291 B_TRUE, zd, &os)); 7292 (void) pthread_rwlock_unlock(&ztest_name_lock); 7293 7294 ztest_zd_init(zd, ZTEST_GET_SHARED_DS(d), os); 7295 7296 zilog = zd->zd_zilog; 7297 7298 if (zilog->zl_header->zh_claim_lr_seq != 0 && 7299 zilog->zl_header->zh_claim_lr_seq < committed_seq) 7300 fatal(B_FALSE, "missing log records: " 7301 "claimed %"PRIu64" < committed %"PRIu64"", 7302 zilog->zl_header->zh_claim_lr_seq, committed_seq); 7303 7304 ztest_dataset_dirobj_verify(zd); 7305 7306 zil_replay(os, zd, ztest_replay_vector); 7307 7308 ztest_dataset_dirobj_verify(zd); 7309 7310 if (ztest_opts.zo_verbose >= 6) 7311 (void) printf("%s replay %"PRIu64" blocks, " 7312 "%"PRIu64" records, seq %"PRIu64"\n", 7313 zd->zd_name, 7314 zilog->zl_parse_blk_count, 7315 zilog->zl_parse_lr_count, 7316 zilog->zl_replaying_seq); 7317 7318 zilog = zil_open(os, ztest_get_data, NULL); 7319 7320 if (zilog->zl_replaying_seq != 0 && 7321 zilog->zl_replaying_seq < committed_seq) 7322 fatal(B_FALSE, "missing log records: " 7323 "replayed %"PRIu64" < committed %"PRIu64"", 7324 zilog->zl_replaying_seq, committed_seq); 7325 7326 return (0); 7327 } 7328 7329 static void 7330 ztest_dataset_close(int d) 7331 { 7332 ztest_ds_t *zd = &ztest_ds[d]; 7333 7334 zil_close(zd->zd_zilog); 7335 dmu_objset_disown(zd->zd_os, B_TRUE, zd); 7336 7337 ztest_zd_fini(zd); 7338 } 7339 7340 static int 7341 ztest_replay_zil_cb(const char *name, void *arg) 7342 { 7343 (void) arg; 7344 objset_t *os; 7345 ztest_ds_t *zdtmp; 7346 7347 VERIFY0(ztest_dmu_objset_own(name, DMU_OST_ANY, B_TRUE, 7348 B_TRUE, FTAG, &os)); 7349 7350 zdtmp = umem_alloc(sizeof (ztest_ds_t), UMEM_NOFAIL); 7351 7352 ztest_zd_init(zdtmp, NULL, os); 7353 zil_replay(os, zdtmp, ztest_replay_vector); 7354 ztest_zd_fini(zdtmp); 7355 7356 if (dmu_objset_zil(os)->zl_parse_lr_count != 0 && 7357 ztest_opts.zo_verbose >= 6) { 7358 zilog_t *zilog = dmu_objset_zil(os); 7359 7360 (void) printf("%s replay %"PRIu64" blocks, " 7361 "%"PRIu64" records, seq %"PRIu64"\n", 7362 name, 7363 zilog->zl_parse_blk_count, 7364 zilog->zl_parse_lr_count, 7365 zilog->zl_replaying_seq); 7366 } 7367 7368 umem_free(zdtmp, sizeof (ztest_ds_t)); 7369 7370 dmu_objset_disown(os, B_TRUE, FTAG); 7371 return (0); 7372 } 7373 7374 static void 7375 ztest_freeze(void) 7376 { 7377 ztest_ds_t *zd = &ztest_ds[0]; 7378 spa_t *spa; 7379 int numloops = 0; 7380 7381 if (ztest_opts.zo_verbose >= 3) 7382 (void) printf("testing 
spa_freeze()...\n"); 7383 7384 kernel_init(SPA_MODE_READ | SPA_MODE_WRITE); 7385 VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG)); 7386 VERIFY0(ztest_dataset_open(0)); 7387 ztest_spa = spa; 7388 7389 /* 7390 * Force the first log block to be transactionally allocated. 7391 * We have to do this before we freeze the pool -- otherwise 7392 * the log chain won't be anchored. 7393 */ 7394 while (BP_IS_HOLE(&zd->zd_zilog->zl_header->zh_log)) { 7395 ztest_dmu_object_alloc_free(zd, 0); 7396 zil_commit(zd->zd_zilog, 0); 7397 } 7398 7399 txg_wait_synced(spa_get_dsl(spa), 0); 7400 7401 /* 7402 * Freeze the pool. This stops spa_sync() from doing anything, 7403 * so that the only way to record changes from now on is the ZIL. 7404 */ 7405 spa_freeze(spa); 7406 7407 /* 7408 * Because it is hard to predict how much space a write will actually 7409 * require beforehand, we leave ourselves some fudge space to write over 7410 * capacity. 7411 */ 7412 uint64_t capacity = metaslab_class_get_space(spa_normal_class(spa)) / 2; 7413 7414 /* 7415 * Run tests that generate log records but don't alter the pool config 7416 * or depend on DSL sync tasks (snapshots, objset create/destroy, etc). 7417 * We do a txg_wait_synced() after each iteration to force the txg 7418 * to increase well beyond the last synced value in the uberblock. 7419 * The ZIL should be OK with that. 7420 * 7421 * Run a random number of times less than zo_maxloops and ensure we do 7422 * not run out of space on the pool. 7423 */ 7424 while (ztest_random(10) != 0 && 7425 numloops++ < ztest_opts.zo_maxloops && 7426 metaslab_class_get_alloc(spa_normal_class(spa)) < capacity) { 7427 ztest_od_t od; 7428 ztest_od_init(&od, 0, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0, 0); 7429 VERIFY0(ztest_object_init(zd, &od, sizeof (od), B_FALSE)); 7430 ztest_io(zd, od.od_object, 7431 ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT); 7432 txg_wait_synced(spa_get_dsl(spa), 0); 7433 } 7434 7435 /* 7436 * Commit all of the changes we just generated. 7437 */ 7438 zil_commit(zd->zd_zilog, 0); 7439 txg_wait_synced(spa_get_dsl(spa), 0); 7440 7441 /* 7442 * Close our dataset and close the pool. 7443 */ 7444 ztest_dataset_close(0); 7445 spa_close(spa, FTAG); 7446 kernel_fini(); 7447 7448 /* 7449 * Open and close the pool and dataset to induce log replay. 7450 */ 7451 kernel_init(SPA_MODE_READ | SPA_MODE_WRITE); 7452 VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG)); 7453 ASSERT3U(spa_freeze_txg(spa), ==, UINT64_MAX); 7454 VERIFY0(ztest_dataset_open(0)); 7455 ztest_spa = spa; 7456 txg_wait_synced(spa_get_dsl(spa), 0); 7457 ztest_dataset_close(0); 7458 ztest_reguid(NULL, 0); 7459 7460 spa_close(spa, FTAG); 7461 kernel_fini(); 7462 } 7463 7464 static void 7465 ztest_import_impl(void) 7466 { 7467 importargs_t args = { 0 }; 7468 nvlist_t *cfg = NULL; 7469 int nsearch = 1; 7470 char *searchdirs[nsearch]; 7471 int flags = ZFS_IMPORT_MISSING_LOG; 7472 7473 searchdirs[0] = ztest_opts.zo_dir; 7474 args.paths = nsearch; 7475 args.path = searchdirs; 7476 args.can_be_active = B_FALSE; 7477 7478 libpc_handle_t lpch = { 7479 .lpc_lib_handle = NULL, 7480 .lpc_ops = &libzpool_config_ops, 7481 .lpc_printerr = B_TRUE 7482 }; 7483 VERIFY0(zpool_find_config(&lpch, ztest_opts.zo_pool, &cfg, &args)); 7484 VERIFY0(spa_import(ztest_opts.zo_pool, cfg, NULL, flags)); 7485 fnvlist_free(cfg); 7486 } 7487 7488 /* 7489 * Import a storage pool with the given name. 
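 * The heavy lifting is done by ztest_import_impl() above: it scans zo_dir for the pool's on-disk configuration via zpool_find_config() and imports it with ZFS_IMPORT_MISSING_LOG set.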
7490 */ 7491 static void 7492 ztest_import(ztest_shared_t *zs) 7493 { 7494 spa_t *spa; 7495 7496 mutex_init(&ztest_vdev_lock, NULL, MUTEX_DEFAULT, NULL); 7497 mutex_init(&ztest_checkpoint_lock, NULL, MUTEX_DEFAULT, NULL); 7498 VERIFY0(pthread_rwlock_init(&ztest_name_lock, NULL)); 7499 7500 kernel_init(SPA_MODE_READ | SPA_MODE_WRITE); 7501 7502 ztest_import_impl(); 7503 7504 VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG)); 7505 zs->zs_metaslab_sz = 7506 1ULL << spa->spa_root_vdev->vdev_child[0]->vdev_ms_shift; 7507 spa_close(spa, FTAG); 7508 7509 kernel_fini(); 7510 7511 if (!ztest_opts.zo_mmp_test) { 7512 ztest_run_zdb(ztest_opts.zo_pool); 7513 ztest_freeze(); 7514 ztest_run_zdb(ztest_opts.zo_pool); 7515 } 7516 7517 (void) pthread_rwlock_destroy(&ztest_name_lock); 7518 mutex_destroy(&ztest_vdev_lock); 7519 mutex_destroy(&ztest_checkpoint_lock); 7520 } 7521 7522 /* 7523 * Kick off threads to run tests on all datasets in parallel. 7524 */ 7525 static void 7526 ztest_run(ztest_shared_t *zs) 7527 { 7528 spa_t *spa; 7529 objset_t *os; 7530 kthread_t *resume_thread, *deadman_thread; 7531 kthread_t **run_threads; 7532 uint64_t object; 7533 int error; 7534 int t, d; 7535 7536 ztest_exiting = B_FALSE; 7537 7538 /* 7539 * Initialize parent/child shared state. 7540 */ 7541 mutex_init(&ztest_vdev_lock, NULL, MUTEX_DEFAULT, NULL); 7542 mutex_init(&ztest_checkpoint_lock, NULL, MUTEX_DEFAULT, NULL); 7543 VERIFY0(pthread_rwlock_init(&ztest_name_lock, NULL)); 7544 7545 zs->zs_thread_start = gethrtime(); 7546 zs->zs_thread_stop = 7547 zs->zs_thread_start + ztest_opts.zo_passtime * NANOSEC; 7548 zs->zs_thread_stop = MIN(zs->zs_thread_stop, zs->zs_proc_stop); 7549 zs->zs_thread_kill = zs->zs_thread_stop; 7550 if (ztest_random(100) < ztest_opts.zo_killrate) { 7551 zs->zs_thread_kill -= 7552 ztest_random(ztest_opts.zo_passtime * NANOSEC); 7553 } 7554 7555 mutex_init(&zcl.zcl_callbacks_lock, NULL, MUTEX_DEFAULT, NULL); 7556 7557 list_create(&zcl.zcl_callbacks, sizeof (ztest_cb_data_t), 7558 offsetof(ztest_cb_data_t, zcd_node)); 7559 7560 /* 7561 * Open our pool. It may need to be imported first depending on 7562 * what tests were running when the previous pass was terminated. 7563 */ 7564 kernel_init(SPA_MODE_READ | SPA_MODE_WRITE); 7565 error = spa_open(ztest_opts.zo_pool, &spa, FTAG); 7566 if (error) { 7567 VERIFY3S(error, ==, ENOENT); 7568 ztest_import_impl(); 7569 VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG)); 7570 zs->zs_metaslab_sz = 7571 1ULL << spa->spa_root_vdev->vdev_child[0]->vdev_ms_shift; 7572 } 7573 7574 metaslab_preload_limit = ztest_random(20) + 1; 7575 ztest_spa = spa; 7576 7577 VERIFY0(vdev_raidz_impl_set("cycle")); 7578 7579 dmu_objset_stats_t dds; 7580 VERIFY0(ztest_dmu_objset_own(ztest_opts.zo_pool, 7581 DMU_OST_ANY, B_TRUE, B_TRUE, FTAG, &os)); 7582 dsl_pool_config_enter(dmu_objset_pool(os), FTAG); 7583 dmu_objset_fast_stat(os, &dds); 7584 dsl_pool_config_exit(dmu_objset_pool(os), FTAG); 7585 zs->zs_guid = dds.dds_guid; 7586 dmu_objset_disown(os, B_TRUE, FTAG); 7587 7588 /* 7589 * Create a thread to periodically resume suspended I/O. 7590 */ 7591 resume_thread = thread_create(NULL, 0, ztest_resume_thread, 7592 spa, 0, NULL, TS_RUN | TS_JOINABLE, defclsyspri); 7593 7594 /* 7595 * Create a deadman thread and set to panic if we hang. 
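 * spa_deadman_failmode is switched to ZIO_FAILURE_MODE_PANIC just below, so a hung I/O aborts the run instead of stalling it indefinitely.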
7596 */ 7597 deadman_thread = thread_create(NULL, 0, ztest_deadman_thread, 7598 zs, 0, NULL, TS_RUN | TS_JOINABLE, defclsyspri); 7599 7600 spa->spa_deadman_failmode = ZIO_FAILURE_MODE_PANIC; 7601 7602 /* 7603 * Verify that we can safely inquire about any object, 7604 * whether it's allocated or not. To make it interesting, 7605 * we probe a 5-wide window around each power of two. 7606 * This hits all edge cases, including zero and the max. 7607 */ 7608 for (t = 0; t < 64; t++) { 7609 for (d = -5; d <= 5; d++) { 7610 error = dmu_object_info(spa->spa_meta_objset, 7611 (1ULL << t) + d, NULL); 7612 ASSERT(error == 0 || error == ENOENT || 7613 error == EINVAL); 7614 } 7615 } 7616 7617 /* 7618 * If we got any ENOSPC errors on the previous run, destroy something. 7619 */ 7620 if (zs->zs_enospc_count != 0) { 7621 int d = ztest_random(ztest_opts.zo_datasets); 7622 ztest_dataset_destroy(d); 7623 } 7624 zs->zs_enospc_count = 0; 7625 7626 /* 7627 * If we were in the middle of ztest_device_removal() and were killed 7628 * we need to ensure the removal and scrub complete before running 7629 * any tests that check ztest_device_removal_active. The removal will 7630 * be restarted automatically when the spa is opened, but we need to 7631 * initiate the scrub manually if it is not already in progress. Note 7632 * that we always run the scrub whenever an indirect vdev exists 7633 * because we have no way of knowing for sure if ztest_device_removal() 7634 * fully completed its scrub before the pool was reimported. 7635 */ 7636 if (spa->spa_removing_phys.sr_state == DSS_SCANNING || 7637 spa->spa_removing_phys.sr_prev_indirect_vdev != -1) { 7638 while (spa->spa_removing_phys.sr_state == DSS_SCANNING) 7639 txg_wait_synced(spa_get_dsl(spa), 0); 7640 7641 error = ztest_scrub_impl(spa); 7642 if (error == EBUSY) 7643 error = 0; 7644 ASSERT0(error); 7645 } 7646 7647 run_threads = umem_zalloc(ztest_opts.zo_threads * sizeof (kthread_t *), 7648 UMEM_NOFAIL); 7649 7650 if (ztest_opts.zo_verbose >= 4) 7651 (void) printf("starting main threads...\n"); 7652 7653 /* 7654 * Replay all logs of all datasets in the pool. This is primarily for 7655 * temporary datasets which wouldn't otherwise get replayed, which 7656 * can trigger failures when attempting to offline a SLOG in 7657 * ztest_fault_inject(). 7658 */ 7659 (void) dmu_objset_find(ztest_opts.zo_pool, ztest_replay_zil_cb, 7660 NULL, DS_FIND_CHILDREN); 7661 7662 /* 7663 * Kick off all the tests that run in parallel. 7664 */ 7665 for (t = 0; t < ztest_opts.zo_threads; t++) { 7666 if (t < ztest_opts.zo_datasets && ztest_dataset_open(t) != 0) { 7667 umem_free(run_threads, ztest_opts.zo_threads * 7668 sizeof (kthread_t *)); 7669 return; 7670 } 7671 7672 run_threads[t] = thread_create(NULL, 0, ztest_thread, 7673 (void *)(uintptr_t)t, 0, NULL, TS_RUN | TS_JOINABLE, 7674 defclsyspri); 7675 } 7676 7677 /* 7678 * Wait for all of the tests to complete. 7679 */ 7680 for (t = 0; t < ztest_opts.zo_threads; t++) 7681 VERIFY0(thread_join(run_threads[t])); 7682 7683 /* 7684 * Close all datasets. This must be done after all the threads 7685 * are joined so we can be sure none of the datasets are in-use 7686 * by any of the threads. 
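 * Only the first zo_datasets threads opened a dataset of their own, so the loop below closes just those.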
7687 */ 7688 for (t = 0; t < ztest_opts.zo_threads; t++) { 7689 if (t < ztest_opts.zo_datasets) 7690 ztest_dataset_close(t); 7691 } 7692 7693 txg_wait_synced(spa_get_dsl(spa), 0); 7694 7695 zs->zs_alloc = metaslab_class_get_alloc(spa_normal_class(spa)); 7696 zs->zs_space = metaslab_class_get_space(spa_normal_class(spa)); 7697 7698 umem_free(run_threads, ztest_opts.zo_threads * sizeof (kthread_t *)); 7699 7700 /* Kill the resume and deadman threads */ 7701 ztest_exiting = B_TRUE; 7702 VERIFY0(thread_join(resume_thread)); 7703 VERIFY0(thread_join(deadman_thread)); 7704 ztest_resume(spa); 7705 7706 /* 7707 * Right before closing the pool, kick off a bunch of async I/O; 7708 * spa_close() should wait for it to complete. 7709 */ 7710 for (object = 1; object < 50; object++) { 7711 dmu_prefetch(spa->spa_meta_objset, object, 0, 0, 1ULL << 20, 7712 ZIO_PRIORITY_SYNC_READ); 7713 } 7714 7715 /* Verify that at least one commit cb was called in a timely fashion */ 7716 if (zc_cb_counter >= ZTEST_COMMIT_CB_MIN_REG) 7717 VERIFY0(zc_min_txg_delay); 7718 7719 spa_close(spa, FTAG); 7720 7721 /* 7722 * Verify that we can loop over all pools. 7723 */ 7724 mutex_enter(&spa_namespace_lock); 7725 for (spa = spa_next(NULL); spa != NULL; spa = spa_next(spa)) 7726 if (ztest_opts.zo_verbose > 3) 7727 (void) printf("spa_next: found %s\n", spa_name(spa)); 7728 mutex_exit(&spa_namespace_lock); 7729 7730 /* 7731 * Verify that we can export the pool and reimport it under a 7732 * different name. 7733 */ 7734 if ((ztest_random(2) == 0) && !ztest_opts.zo_mmp_test) { 7735 char name[ZFS_MAX_DATASET_NAME_LEN]; 7736 (void) snprintf(name, sizeof (name), "%s_import", 7737 ztest_opts.zo_pool); 7738 ztest_spa_import_export(ztest_opts.zo_pool, name); 7739 ztest_spa_import_export(name, ztest_opts.zo_pool); 7740 } 7741 7742 kernel_fini(); 7743 7744 list_destroy(&zcl.zcl_callbacks); 7745 mutex_destroy(&zcl.zcl_callbacks_lock); 7746 (void) pthread_rwlock_destroy(&ztest_name_lock); 7747 mutex_destroy(&ztest_vdev_lock); 7748 mutex_destroy(&ztest_checkpoint_lock); 7749 } 7750 7751 static void 7752 print_time(hrtime_t t, char *timebuf) 7753 { 7754 hrtime_t s = t / NANOSEC; 7755 hrtime_t m = s / 60; 7756 hrtime_t h = m / 60; 7757 hrtime_t d = h / 24; 7758 7759 s -= m * 60; 7760 m -= h * 60; 7761 h -= d * 24; 7762 7763 timebuf[0] = '\0'; 7764 7765 if (d) 7766 (void) sprintf(timebuf, 7767 "%llud%02lluh%02llum%02llus", d, h, m, s); 7768 else if (h) 7769 (void) sprintf(timebuf, "%lluh%02llum%02llus", h, m, s); 7770 else if (m) 7771 (void) sprintf(timebuf, "%llum%02llus", m, s); 7772 else 7773 (void) sprintf(timebuf, "%llus", s); 7774 } 7775 7776 static nvlist_t * 7777 make_random_props(void) 7778 { 7779 nvlist_t *props; 7780 7781 props = fnvlist_alloc(); 7782 7783 if (ztest_random(2) == 0) 7784 return (props); 7785 7786 fnvlist_add_uint64(props, 7787 zpool_prop_to_name(ZPOOL_PROP_AUTOREPLACE), 1); 7788 7789 return (props); 7790 } 7791 7792 /* 7793 * Create a storage pool with the given name and initial vdev size. 7794 * Then test spa_freeze() functionality. 7795 */ 7796 static void 7797 ztest_init(ztest_shared_t *zs) 7798 { 7799 spa_t *spa; 7800 nvlist_t *nvroot, *props; 7801 int i; 7802 7803 mutex_init(&ztest_vdev_lock, NULL, MUTEX_DEFAULT, NULL); 7804 mutex_init(&ztest_checkpoint_lock, NULL, MUTEX_DEFAULT, NULL); 7805 VERIFY0(pthread_rwlock_init(&ztest_name_lock, NULL)); 7806 7807 kernel_init(SPA_MODE_READ | SPA_MODE_WRITE); 7808 7809 /* 7810 * Create the storage pool. 
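 * Any pool left over from a previous run is destroyed first, then a fresh vdev tree is assembled from the requested mirror and raid geometry before spa_create() is called.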
7811 */ 7812 (void) spa_destroy(ztest_opts.zo_pool); 7813 ztest_shared->zs_vdev_next_leaf = 0; 7814 zs->zs_splits = 0; 7815 zs->zs_mirrors = ztest_opts.zo_mirrors; 7816 nvroot = make_vdev_root(NULL, NULL, NULL, ztest_opts.zo_vdev_size, 0, 7817 NULL, ztest_opts.zo_raid_children, zs->zs_mirrors, 1); 7818 props = make_random_props(); 7819 7820 /* 7821 * We don't expect the pool to suspend unless maxfaults == 0, 7822 * in which case ztest_fault_inject() temporarily takes away 7823 * the only valid replica. 7824 */ 7825 fnvlist_add_uint64(props, 7826 zpool_prop_to_name(ZPOOL_PROP_FAILUREMODE), 7827 MAXFAULTS(zs) ? ZIO_FAILURE_MODE_PANIC : ZIO_FAILURE_MODE_WAIT); 7828 7829 for (i = 0; i < SPA_FEATURES; i++) { 7830 char *buf; 7831 7832 if (!spa_feature_table[i].fi_zfs_mod_supported) 7833 continue; 7834 7835 /* 7836 * 75% chance of using the log space map feature. We want ztest 7837 * to exercise both the code paths that use the log space map 7838 * feature and the ones that don't. 7839 */ 7840 if (i == SPA_FEATURE_LOG_SPACEMAP && ztest_random(4) == 0) 7841 continue; 7842 7843 VERIFY3S(-1, !=, asprintf(&buf, "feature@%s", 7844 spa_feature_table[i].fi_uname)); 7845 fnvlist_add_uint64(props, buf, 0); 7846 free(buf); 7847 } 7848 7849 VERIFY0(spa_create(ztest_opts.zo_pool, nvroot, props, NULL, NULL)); 7850 fnvlist_free(nvroot); 7851 fnvlist_free(props); 7852 7853 VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG)); 7854 zs->zs_metaslab_sz = 7855 1ULL << spa->spa_root_vdev->vdev_child[0]->vdev_ms_shift; 7856 spa_close(spa, FTAG); 7857 7858 kernel_fini(); 7859 7860 if (!ztest_opts.zo_mmp_test) { 7861 ztest_run_zdb(ztest_opts.zo_pool); 7862 ztest_freeze(); 7863 ztest_run_zdb(ztest_opts.zo_pool); 7864 } 7865 7866 (void) pthread_rwlock_destroy(&ztest_name_lock); 7867 mutex_destroy(&ztest_vdev_lock); 7868 mutex_destroy(&ztest_checkpoint_lock); 7869 } 7870 7871 static void 7872 setup_data_fd(void) 7873 { 7874 static char ztest_name_data[] = "/tmp/ztest.data.XXXXXX"; 7875 7876 ztest_fd_data = mkstemp(ztest_name_data); 7877 ASSERT3S(ztest_fd_data, >=, 0); 7878 (void) unlink(ztest_name_data); 7879 } 7880 7881 static int 7882 shared_data_size(ztest_shared_hdr_t *hdr) 7883 { 7884 int size; 7885 7886 size = hdr->zh_hdr_size; 7887 size += hdr->zh_opts_size; 7888 size += hdr->zh_size; 7889 size += hdr->zh_stats_size * hdr->zh_stats_count; 7890 size += hdr->zh_ds_size * hdr->zh_ds_count; 7891 7892 return (size); 7893 } 7894 7895 static void 7896 setup_hdr(void) 7897 { 7898 int size; 7899 ztest_shared_hdr_t *hdr; 7900 7901 hdr = (void *)mmap(0, P2ROUNDUP(sizeof (*hdr), getpagesize()), 7902 PROT_READ | PROT_WRITE, MAP_SHARED, ztest_fd_data, 0); 7903 ASSERT3P(hdr, !=, MAP_FAILED); 7904 7905 VERIFY0(ftruncate(ztest_fd_data, sizeof (ztest_shared_hdr_t))); 7906 7907 hdr->zh_hdr_size = sizeof (ztest_shared_hdr_t); 7908 hdr->zh_opts_size = sizeof (ztest_shared_opts_t); 7909 hdr->zh_size = sizeof (ztest_shared_t); 7910 hdr->zh_stats_size = sizeof (ztest_shared_callstate_t); 7911 hdr->zh_stats_count = ZTEST_FUNCS; 7912 hdr->zh_ds_size = sizeof (ztest_shared_ds_t); 7913 hdr->zh_ds_count = ztest_opts.zo_datasets; 7914 7915 size = shared_data_size(hdr); 7916 VERIFY0(ftruncate(ztest_fd_data, size)); 7917 7918 (void) munmap((caddr_t)hdr, P2ROUNDUP(sizeof (*hdr), getpagesize())); 7919 } 7920 7921 static void 7922 setup_data(void) 7923 { 7924 int size, offset; 7925 ztest_shared_hdr_t *hdr; 7926 uint8_t *buf; 7927 7928 hdr = (void *)mmap(0, P2ROUNDUP(sizeof (*hdr), getpagesize()), 7929 PROT_READ, MAP_SHARED, ztest_fd_data, 0); 7930 

	size = shared_data_size(hdr);

	(void) munmap((caddr_t)hdr, P2ROUNDUP(sizeof (*hdr), getpagesize()));
	hdr = ztest_shared_hdr = (void *)mmap(0, P2ROUNDUP(size, getpagesize()),
	    PROT_READ | PROT_WRITE, MAP_SHARED, ztest_fd_data, 0);
	ASSERT3P(hdr, !=, MAP_FAILED);
	buf = (uint8_t *)hdr;

	offset = hdr->zh_hdr_size;
	ztest_shared_opts = (void *)&buf[offset];
	offset += hdr->zh_opts_size;
	ztest_shared = (void *)&buf[offset];
	offset += hdr->zh_size;
	ztest_shared_callstate = (void *)&buf[offset];
	offset += hdr->zh_stats_size * hdr->zh_stats_count;
	ztest_shared_ds = (void *)&buf[offset];
}

static boolean_t
exec_child(char *cmd, char *libpath, boolean_t ignorekill, int *statusp)
{
	pid_t pid;
	int status;
	char *cmdbuf = NULL;

	pid = fork();

	if (cmd == NULL) {
		cmdbuf = umem_alloc(MAXPATHLEN, UMEM_NOFAIL);
		(void) strlcpy(cmdbuf, getexecname(), MAXPATHLEN);
		cmd = cmdbuf;
	}

	if (pid == -1)
		fatal(B_TRUE, "fork failed");

	if (pid == 0) {	/* child */
		char fd_data_str[12];

		VERIFY3S(11, >=,
		    snprintf(fd_data_str, 12, "%d", ztest_fd_data));
		VERIFY0(setenv("ZTEST_FD_DATA", fd_data_str, 1));

		if (libpath != NULL) {
			const char *curlp = getenv("LD_LIBRARY_PATH");
			if (curlp == NULL)
				VERIFY0(setenv("LD_LIBRARY_PATH", libpath, 1));
			else {
				char *newlp = NULL;
				VERIFY3S(-1, !=,
				    asprintf(&newlp, "%s:%s", libpath, curlp));
				VERIFY0(setenv("LD_LIBRARY_PATH", newlp, 1));
				free(newlp);
			}
		}
		(void) execl(cmd, cmd, (char *)NULL);
		ztest_dump_core = B_FALSE;
		fatal(B_TRUE, "exec failed: %s", cmd);
	}

	if (cmdbuf != NULL) {
		umem_free(cmdbuf, MAXPATHLEN);
		cmd = NULL;
	}

	while (waitpid(pid, &status, 0) != pid)
		continue;
	if (statusp != NULL)
		*statusp = status;

	if (WIFEXITED(status)) {
		if (WEXITSTATUS(status) != 0) {
			(void) fprintf(stderr, "child exited with code %d\n",
			    WEXITSTATUS(status));
			exit(2);
		}
		return (B_FALSE);
	} else if (WIFSIGNALED(status)) {
		if (!ignorekill || WTERMSIG(status) != SIGKILL) {
			(void) fprintf(stderr, "child died with signal %d\n",
			    WTERMSIG(status));
			exit(3);
		}
		return (B_TRUE);
	} else {
		(void) fprintf(stderr, "something strange happened to child\n");
		exit(4);
	}
}

static void
ztest_run_init(void)
{
	int i;

	ztest_shared_t *zs = ztest_shared;

	/*
	 * Blow away any existing copy of zpool.cache
	 */
	(void) remove(spa_config_path);

	if (ztest_opts.zo_init == 0) {
		if (ztest_opts.zo_verbose >= 1)
			(void) printf("Importing pool %s\n",
			    ztest_opts.zo_pool);
		ztest_import(zs);
		return;
	}

	/*
	 * Create and initialize our storage pool.
	 */
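	/*
	 * Each pass below wipes the shared state and rebuilds the pool
	 * from scratch via ztest_init(), so an init count (zo_init) of N
	 * creates and freeze-tests the pool N times before the main
	 * workload ever runs.
	 */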
	for (i = 1; i <= ztest_opts.zo_init; i++) {
		memset(zs, 0, sizeof (*zs));
		if (ztest_opts.zo_verbose >= 3 &&
		    ztest_opts.zo_init != 1) {
			(void) printf("ztest_init(), pass %d\n", i);
		}
		ztest_init(zs);
	}
}

int
main(int argc, char **argv)
{
	int kills = 0;
	int iters = 0;
	int older = 0;
	int newer = 0;
	ztest_shared_t *zs;
	ztest_info_t *zi;
	ztest_shared_callstate_t *zc;
	char timebuf[100];
	char numbuf[NN_NUMBUF_SZ];
	char *cmd;
	boolean_t hasalt;
	int f, err;
	char *fd_data_str = getenv("ZTEST_FD_DATA");
	struct sigaction action;

	(void) setvbuf(stdout, NULL, _IOLBF, 0);

	dprintf_setup(&argc, argv);
	zfs_deadman_synctime_ms = 300000;
	zfs_deadman_checktime_ms = 30000;
	/*
	 * As two-word space map entries may not come up often (especially
	 * if pool and vdev sizes are small) we want to force at least some
	 * of them so the feature gets tested.
	 */
	zfs_force_some_double_word_sm_entries = B_TRUE;

	/*
	 * Verify that even extensively damaged split blocks with many
	 * segments can be reconstructed in a reasonable amount of time
	 * when reconstruction is known to be possible.
	 *
	 * Note: the lower this value is, the more damage we inflict, and
	 * the more time ztest spends in recovering that damage. We chose
	 * to induce damage 1/100th of the time so recovery is tested but
	 * not so frequently that ztest doesn't get to test other code paths.
	 */
	zfs_reconstruct_indirect_damage_fraction = 100;

	action.sa_handler = sig_handler;
	sigemptyset(&action.sa_mask);
	action.sa_flags = 0;

	if (sigaction(SIGSEGV, &action, NULL) < 0) {
		(void) fprintf(stderr, "ztest: cannot catch SIGSEGV: %s.\n",
		    strerror(errno));
		exit(EXIT_FAILURE);
	}

	if (sigaction(SIGABRT, &action, NULL) < 0) {
		(void) fprintf(stderr, "ztest: cannot catch SIGABRT: %s.\n",
		    strerror(errno));
		exit(EXIT_FAILURE);
	}

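	/*
	 * A ZTEST_FD_DATA value in the environment (set by exec_child()
	 * above) means this invocation is a re-exec'd child: it inherits
	 * the shared-memory file descriptor and reads its options back out
	 * of the mapped region below instead of parsing argv.
	 */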

	/*
	 * Force random_get_bytes() to use /dev/urandom in order to prevent
	 * ztest from needlessly depleting the system entropy pool.
	 */
	random_path = "/dev/urandom";
	ztest_fd_rand = open(random_path, O_RDONLY | O_CLOEXEC);
	ASSERT3S(ztest_fd_rand, >=, 0);

	if (!fd_data_str) {
		process_options(argc, argv);

		setup_data_fd();
		setup_hdr();
		setup_data();
		memcpy(ztest_shared_opts, &ztest_opts,
		    sizeof (*ztest_shared_opts));
	} else {
		ztest_fd_data = atoi(fd_data_str);
		setup_data();
		memcpy(&ztest_opts, ztest_shared_opts, sizeof (ztest_opts));
	}
	ASSERT3U(ztest_opts.zo_datasets, ==, ztest_shared_hdr->zh_ds_count);

	err = ztest_set_global_vars();
	if (err != 0 && !fd_data_str) {
		/* error message done by ztest_set_global_vars */
		exit(EXIT_FAILURE);
	} else {
		/* children should not be spawned if setting gvars fails */
		VERIFY3S(err, ==, 0);
	}

	/* Override location of zpool.cache */
	VERIFY3S(asprintf((char **)&spa_config_path, "%s/zpool.cache",
	    ztest_opts.zo_dir), !=, -1);

	ztest_ds = umem_alloc(ztest_opts.zo_datasets * sizeof (ztest_ds_t),
	    UMEM_NOFAIL);
	zs = ztest_shared;

	if (fd_data_str) {
		metaslab_force_ganging = ztest_opts.zo_metaslab_force_ganging;
		metaslab_df_alloc_threshold =
		    zs->zs_metaslab_df_alloc_threshold;

		if (zs->zs_do_init)
			ztest_run_init();
		else
			ztest_run(zs);
		exit(0);
	}

	hasalt = (strlen(ztest_opts.zo_alt_ztest) != 0);

	if (ztest_opts.zo_verbose >= 1) {
		(void) printf("%"PRIu64" vdevs, %d datasets, %d threads, "
		    "%d %s disks, %"PRIu64" seconds...\n\n",
		    ztest_opts.zo_vdevs,
		    ztest_opts.zo_datasets,
		    ztest_opts.zo_threads,
		    ztest_opts.zo_raid_children,
		    ztest_opts.zo_raid_type,
		    ztest_opts.zo_time);
	}

	cmd = umem_alloc(MAXNAMELEN, UMEM_NOFAIL);
	(void) strlcpy(cmd, getexecname(), MAXNAMELEN);

	zs->zs_do_init = B_TRUE;
	if (strlen(ztest_opts.zo_alt_ztest) != 0) {
		if (ztest_opts.zo_verbose >= 1) {
			(void) printf("Executing older ztest for "
			    "initialization: %s\n", ztest_opts.zo_alt_ztest);
		}
		VERIFY(!exec_child(ztest_opts.zo_alt_ztest,
		    ztest_opts.zo_alt_libpath, B_FALSE, NULL));
	} else {
		VERIFY(!exec_child(NULL, NULL, B_FALSE, NULL));
	}
	zs->zs_do_init = B_FALSE;

	zs->zs_proc_start = gethrtime();
	zs->zs_proc_stop = zs->zs_proc_start + ztest_opts.zo_time * NANOSEC;

	for (f = 0; f < ZTEST_FUNCS; f++) {
		zi = &ztest_info[f];
		zc = ZTEST_GET_SHARED_CALLSTATE(f);
		if (zs->zs_proc_start + zi->zi_interval[0] > zs->zs_proc_stop)
			zc->zc_next = UINT64_MAX;
		else
			zc->zc_next = zs->zs_proc_start +
			    ztest_random(2 * zi->zi_interval[0] + 1);
	}

	/*
	 * Run the tests in a loop. These tests include fault injection
	 * to verify that self-healing data works, and forced crashes
	 * to verify that we never lose on-disk consistency.
	 */
	while (gethrtime() < zs->zs_proc_stop) {
		int status;
		boolean_t killed;

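		/*
		 * Each pass forks and execs a fresh copy of ztest (or, when
		 * an alternate binary is configured, sometimes the older
		 * ztest).  The child runs the workload and may SIGKILL
		 * itself at a random time; once it exits, the parent reads
		 * the pass statistics back out of the shared mapping,
		 * reports them, and then verifies the pool with zdb
		 * (unless this is an MMP test).
		 */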
		/*
		 * Initialize the workload counters for each function.
		 */
		for (f = 0; f < ZTEST_FUNCS; f++) {
			zc = ZTEST_GET_SHARED_CALLSTATE(f);
			zc->zc_count = 0;
			zc->zc_time = 0;
		}

		/* Set the allocation switch size */
		zs->zs_metaslab_df_alloc_threshold =
		    ztest_random(zs->zs_metaslab_sz / 4) + 1;

		if (!hasalt || ztest_random(2) == 0) {
			if (hasalt && ztest_opts.zo_verbose >= 1) {
				(void) printf("Executing newer ztest: %s\n",
				    cmd);
			}
			newer++;
			killed = exec_child(cmd, NULL, B_TRUE, &status);
		} else {
			if (hasalt && ztest_opts.zo_verbose >= 1) {
				(void) printf("Executing older ztest: %s\n",
				    ztest_opts.zo_alt_ztest);
			}
			older++;
			killed = exec_child(ztest_opts.zo_alt_ztest,
			    ztest_opts.zo_alt_libpath, B_TRUE, &status);
		}

		if (killed)
			kills++;
		iters++;

		if (ztest_opts.zo_verbose >= 1) {
			hrtime_t now = gethrtime();

			now = MIN(now, zs->zs_proc_stop);
			print_time(zs->zs_proc_stop - now, timebuf);
			nicenum(zs->zs_space, numbuf, sizeof (numbuf));

			(void) printf("Pass %3d, %8s, %3"PRIu64" ENOSPC, "
			    "%4.1f%% of %5s used, %3.0f%% done, %8s to go\n",
			    iters,
			    WIFEXITED(status) ? "Complete" : "SIGKILL",
			    zs->zs_enospc_count,
			    100.0 * zs->zs_alloc / zs->zs_space,
			    numbuf,
			    100.0 * (now - zs->zs_proc_start) /
			    (ztest_opts.zo_time * NANOSEC), timebuf);
		}

		if (ztest_opts.zo_verbose >= 2) {
			(void) printf("\nWorkload summary:\n\n");
			(void) printf("%7s %9s %s\n",
			    "Calls", "Time", "Function");
			(void) printf("%7s %9s %s\n",
			    "-----", "----", "--------");
			for (f = 0; f < ZTEST_FUNCS; f++) {
				zi = &ztest_info[f];
				zc = ZTEST_GET_SHARED_CALLSTATE(f);
				print_time(zc->zc_time, timebuf);
				(void) printf("%7"PRIu64" %9s %s\n",
				    zc->zc_count, timebuf,
				    zi->zi_funcname);
			}
			(void) printf("\n");
		}

		if (!ztest_opts.zo_mmp_test)
			ztest_run_zdb(ztest_opts.zo_pool);
	}

	if (ztest_opts.zo_verbose >= 1) {
		if (hasalt) {
			(void) printf("%d runs of older ztest: %s\n", older,
			    ztest_opts.zo_alt_ztest);
			(void) printf("%d runs of newer ztest: %s\n", newer,
			    cmd);
		}
		(void) printf("%d killed, %d completed, %.0f%% kill rate\n",
		    kills, iters - kills, (100.0 * kills) / MAX(1, iters));
	}

	umem_free(cmd, MAXNAMELEN);

	return (0);
}