1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or https://opensource.org/licenses/CDDL-1.0. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright (c) 2011, 2024 by Delphix. All rights reserved. 24 * Copyright 2011 Nexenta Systems, Inc. All rights reserved. 25 * Copyright (c) 2013 Steven Hartland. All rights reserved. 26 * Copyright (c) 2014 Integros [integros.com] 27 * Copyright 2017 Joyent, Inc. 28 * Copyright (c) 2017, Intel Corporation. 29 * Copyright (c) 2023, Klara, Inc. 30 */ 31 32 /* 33 * The objective of this program is to provide a DMU/ZAP/SPA stress test 34 * that runs entirely in userland, is easy to use, and easy to extend. 35 * 36 * The overall design of the ztest program is as follows: 37 * 38 * (1) For each major functional area (e.g. adding vdevs to a pool, 39 * creating and destroying datasets, reading and writing objects, etc) 40 * we have a simple routine to test that functionality. These 41 * individual routines do not have to do anything "stressful". 42 * 43 * (2) We turn these simple functionality tests into a stress test by 44 * running them all in parallel, with as many threads as desired, 45 * and spread across as many datasets, objects, and vdevs as desired. 46 * 47 * (3) While all this is happening, we inject faults into the pool to 48 * verify that self-healing data really works. 49 * 50 * (4) Every time we open a dataset, we change its checksum and compression 51 * functions. Thus even individual objects vary from block to block 52 * in which checksum they use and whether they're compressed. 53 * 54 * (5) To verify that we never lose on-disk consistency after a crash, 55 * we run the entire test in a child of the main process. 56 * At random times, the child self-immolates with a SIGKILL. 57 * This is the software equivalent of pulling the power cord. 58 * The parent then runs the test again, using the existing 59 * storage pool, as many times as desired. If backwards compatibility 60 * testing is enabled ztest will sometimes run the "older" version 61 * of ztest after a SIGKILL. 62 * 63 * (6) To verify that we don't have future leaks or temporal incursions, 64 * many of the functional tests record the transaction group number 65 * as part of their data. When reading old data, they verify that 66 * the transaction group number is less than the current, open txg. 67 * If you add a new test, please do this if applicable. 68 * 69 * (7) Threads are created with a reduced stack size, for sanity checking. 70 * Therefore, it's important not to allocate huge buffers on the stack. 71 * 72 * When run with no arguments, ztest runs for about five minutes and 73 * produces no output if successful. To get a little bit of information, 74 * specify -V. 
To get more information, specify -VV, and so on. 75 * 76 * To turn this into an overnight stress test, use -T to specify run time. 77 * 78 * You can ask more vdevs [-v], datasets [-d], or threads [-t] 79 * to increase the pool capacity, fanout, and overall stress level. 80 * 81 * Use the -k option to set the desired frequency of kills. 82 * 83 * When ztest invokes itself it passes all relevant information through a 84 * temporary file which is mmap-ed in the child process. This allows shared 85 * memory to survive the exec syscall. The ztest_shared_hdr_t struct is always 86 * stored at offset 0 of this file and contains information on the size and 87 * number of shared structures in the file. The information stored in this file 88 * must remain backwards compatible with older versions of ztest so that 89 * ztest can invoke them during backwards compatibility testing (-B). 90 */ 91 92 #include <sys/zfs_context.h> 93 #include <sys/spa.h> 94 #include <sys/dmu.h> 95 #include <sys/txg.h> 96 #include <sys/dbuf.h> 97 #include <sys/zap.h> 98 #include <sys/dmu_objset.h> 99 #include <sys/poll.h> 100 #include <sys/stat.h> 101 #include <sys/time.h> 102 #include <sys/wait.h> 103 #include <sys/mman.h> 104 #include <sys/resource.h> 105 #include <sys/zio.h> 106 #include <sys/zil.h> 107 #include <sys/zil_impl.h> 108 #include <sys/vdev_draid.h> 109 #include <sys/vdev_impl.h> 110 #include <sys/vdev_file.h> 111 #include <sys/vdev_initialize.h> 112 #include <sys/vdev_raidz.h> 113 #include <sys/vdev_trim.h> 114 #include <sys/spa_impl.h> 115 #include <sys/metaslab_impl.h> 116 #include <sys/dsl_prop.h> 117 #include <sys/dsl_dataset.h> 118 #include <sys/dsl_destroy.h> 119 #include <sys/dsl_scan.h> 120 #include <sys/zio_checksum.h> 121 #include <sys/zfs_refcount.h> 122 #include <sys/zfeature.h> 123 #include <sys/dsl_userhold.h> 124 #include <sys/abd.h> 125 #include <sys/blake3.h> 126 #include <stdio.h> 127 #include <stdlib.h> 128 #include <unistd.h> 129 #include <getopt.h> 130 #include <signal.h> 131 #include <umem.h> 132 #include <ctype.h> 133 #include <math.h> 134 #include <sys/fs/zfs.h> 135 #include <zfs_fletcher.h> 136 #include <libnvpair.h> 137 #include <libzutil.h> 138 #include <sys/crypto/icp.h> 139 #include <sys/zfs_impl.h> 140 #include <sys/backtrace.h> 141 142 static int ztest_fd_data = -1; 143 static int ztest_fd_rand = -1; 144 145 typedef struct ztest_shared_hdr { 146 uint64_t zh_hdr_size; 147 uint64_t zh_opts_size; 148 uint64_t zh_size; 149 uint64_t zh_stats_size; 150 uint64_t zh_stats_count; 151 uint64_t zh_ds_size; 152 uint64_t zh_ds_count; 153 uint64_t zh_scratch_state_size; 154 } ztest_shared_hdr_t; 155 156 static ztest_shared_hdr_t *ztest_shared_hdr; 157 158 enum ztest_class_state { 159 ZTEST_VDEV_CLASS_OFF, 160 ZTEST_VDEV_CLASS_ON, 161 ZTEST_VDEV_CLASS_RND 162 }; 163 164 /* Dedicated RAIDZ Expansion test states */ 165 typedef enum { 166 RAIDZ_EXPAND_NONE, /* Default is none, must opt-in */ 167 RAIDZ_EXPAND_REQUESTED, /* The '-X' option was used */ 168 RAIDZ_EXPAND_STARTED, /* Testing has commenced */ 169 RAIDZ_EXPAND_KILLED, /* Reached the process kill */ 170 RAIDZ_EXPAND_CHECKED, /* Pool scrub verification done */ 171 } raidz_expand_test_state_t; 172 173 174 #define ZO_GVARS_MAX_ARGLEN ((size_t)64) 175 #define ZO_GVARS_MAX_COUNT ((size_t)10) 176 177 typedef struct ztest_shared_opts { 178 char zo_pool[ZFS_MAX_DATASET_NAME_LEN]; 179 char zo_dir[ZFS_MAX_DATASET_NAME_LEN]; 180 char zo_alt_ztest[MAXNAMELEN]; 181 char zo_alt_libpath[MAXNAMELEN]; 182 uint64_t zo_vdevs; 183 uint64_t zo_vdevtime; 184
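	/*
	 * zo_vdevtime (above) is the interval, in nanoseconds, at which the
	 * vdev add/remove style tests are scheduled; process_options()
	 * derives it from zo_time and zo_vdevs.
	 */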
size_t zo_vdev_size; 185 int zo_ashift; 186 int zo_mirrors; 187 int zo_raid_do_expand; 188 int zo_raid_children; 189 int zo_raid_parity; 190 char zo_raid_type[8]; 191 int zo_draid_data; 192 int zo_draid_spares; 193 int zo_datasets; 194 int zo_threads; 195 uint64_t zo_passtime; 196 uint64_t zo_killrate; 197 int zo_verbose; 198 int zo_init; 199 uint64_t zo_time; 200 uint64_t zo_maxloops; 201 uint64_t zo_metaslab_force_ganging; 202 raidz_expand_test_state_t zo_raidz_expand_test; 203 int zo_mmp_test; 204 int zo_special_vdevs; 205 int zo_dump_dbgmsg; 206 int zo_gvars_count; 207 char zo_gvars[ZO_GVARS_MAX_COUNT][ZO_GVARS_MAX_ARGLEN]; 208 } ztest_shared_opts_t; 209 210 /* Default values for command line options. */ 211 #define DEFAULT_POOL "ztest" 212 #define DEFAULT_VDEV_DIR "/tmp" 213 #define DEFAULT_VDEV_COUNT 5 214 #define DEFAULT_VDEV_SIZE (SPA_MINDEVSIZE * 4) /* 256m default size */ 215 #define DEFAULT_VDEV_SIZE_STR "256M" 216 #define DEFAULT_ASHIFT SPA_MINBLOCKSHIFT 217 #define DEFAULT_MIRRORS 2 218 #define DEFAULT_RAID_CHILDREN 4 219 #define DEFAULT_RAID_PARITY 1 220 #define DEFAULT_DRAID_DATA 4 221 #define DEFAULT_DRAID_SPARES 1 222 #define DEFAULT_DATASETS_COUNT 7 223 #define DEFAULT_THREADS 23 224 #define DEFAULT_RUN_TIME 300 /* 300 seconds */ 225 #define DEFAULT_RUN_TIME_STR "300 sec" 226 #define DEFAULT_PASS_TIME 60 /* 60 seconds */ 227 #define DEFAULT_PASS_TIME_STR "60 sec" 228 #define DEFAULT_KILL_RATE 70 /* 70% kill rate */ 229 #define DEFAULT_KILLRATE_STR "70%" 230 #define DEFAULT_INITS 1 231 #define DEFAULT_MAX_LOOPS 50 /* 5 minutes */ 232 #define DEFAULT_FORCE_GANGING (64 << 10) 233 #define DEFAULT_FORCE_GANGING_STR "64K" 234 235 /* Simplifying assumption: -1 is not a valid default. */ 236 #define NO_DEFAULT -1 237 238 static const ztest_shared_opts_t ztest_opts_defaults = { 239 .zo_pool = DEFAULT_POOL, 240 .zo_dir = DEFAULT_VDEV_DIR, 241 .zo_alt_ztest = { '\0' }, 242 .zo_alt_libpath = { '\0' }, 243 .zo_vdevs = DEFAULT_VDEV_COUNT, 244 .zo_ashift = DEFAULT_ASHIFT, 245 .zo_mirrors = DEFAULT_MIRRORS, 246 .zo_raid_children = DEFAULT_RAID_CHILDREN, 247 .zo_raid_parity = DEFAULT_RAID_PARITY, 248 .zo_raid_type = VDEV_TYPE_RAIDZ, 249 .zo_vdev_size = DEFAULT_VDEV_SIZE, 250 .zo_draid_data = DEFAULT_DRAID_DATA, /* data drives */ 251 .zo_draid_spares = DEFAULT_DRAID_SPARES, /* distributed spares */ 252 .zo_datasets = DEFAULT_DATASETS_COUNT, 253 .zo_threads = DEFAULT_THREADS, 254 .zo_passtime = DEFAULT_PASS_TIME, 255 .zo_killrate = DEFAULT_KILL_RATE, 256 .zo_verbose = 0, 257 .zo_mmp_test = 0, 258 .zo_init = DEFAULT_INITS, 259 .zo_time = DEFAULT_RUN_TIME, 260 .zo_maxloops = DEFAULT_MAX_LOOPS, /* max loops during spa_freeze() */ 261 .zo_metaslab_force_ganging = DEFAULT_FORCE_GANGING, 262 .zo_special_vdevs = ZTEST_VDEV_CLASS_RND, 263 .zo_gvars_count = 0, 264 .zo_raidz_expand_test = RAIDZ_EXPAND_NONE, 265 }; 266 267 extern uint64_t metaslab_force_ganging; 268 extern uint64_t metaslab_df_alloc_threshold; 269 extern uint64_t zfs_deadman_synctime_ms; 270 extern uint_t metaslab_preload_limit; 271 extern int zfs_compressed_arc_enabled; 272 extern int zfs_abd_scatter_enabled; 273 extern uint_t dmu_object_alloc_chunk_shift; 274 extern boolean_t zfs_force_some_double_word_sm_entries; 275 extern unsigned long zio_decompress_fail_fraction; 276 extern unsigned long zfs_reconstruct_indirect_damage_fraction; 277 extern uint64_t raidz_expand_max_reflow_bytes; 278 extern uint_t raidz_expand_pause_point; 279 extern boolean_t ddt_prune_artificial_age; 280 extern boolean_t ddt_dump_prune_histogram; 281 282 283 
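/*
 * The ztest_shared_* pointers declared below refer to regions of the
 * mmap-ed temporary file described in the file header comment; the
 * ztest_shared_hdr_t stored at offset 0 records the size (and count,
 * where applicable) of each shared region.  ztest_opts, by contrast,
 * is this process's local working copy of the options.
 */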
static ztest_shared_opts_t *ztest_shared_opts; 284 static ztest_shared_opts_t ztest_opts; 285 static const char *const ztest_wkeydata = "abcdefghijklmnopqrstuvwxyz012345"; 286 287 typedef struct ztest_shared_ds { 288 uint64_t zd_seq; 289 } ztest_shared_ds_t; 290 291 static ztest_shared_ds_t *ztest_shared_ds; 292 #define ZTEST_GET_SHARED_DS(d) (&ztest_shared_ds[d]) 293 294 typedef struct ztest_scratch_state { 295 uint64_t zs_raidz_scratch_verify_pause; 296 } ztest_shared_scratch_state_t; 297 298 static ztest_shared_scratch_state_t *ztest_scratch_state; 299 300 #define BT_MAGIC 0x123456789abcdefULL 301 #define MAXFAULTS(zs) \ 302 (MAX((zs)->zs_mirrors, 1) * (ztest_opts.zo_raid_parity + 1) - 1) 303 304 enum ztest_io_type { 305 ZTEST_IO_WRITE_TAG, 306 ZTEST_IO_WRITE_PATTERN, 307 ZTEST_IO_WRITE_ZEROES, 308 ZTEST_IO_TRUNCATE, 309 ZTEST_IO_SETATTR, 310 ZTEST_IO_REWRITE, 311 ZTEST_IO_TYPES 312 }; 313 314 typedef struct ztest_block_tag { 315 uint64_t bt_magic; 316 uint64_t bt_objset; 317 uint64_t bt_object; 318 uint64_t bt_dnodesize; 319 uint64_t bt_offset; 320 uint64_t bt_gen; 321 uint64_t bt_txg; 322 uint64_t bt_crtxg; 323 } ztest_block_tag_t; 324 325 typedef struct bufwad { 326 uint64_t bw_index; 327 uint64_t bw_txg; 328 uint64_t bw_data; 329 } bufwad_t; 330 331 /* 332 * It would be better to use a rangelock_t per object. Unfortunately 333 * the rangelock_t is not a drop-in replacement for rl_t, because we 334 * still need to map from object ID to rangelock_t. 335 */ 336 typedef enum { 337 ZTRL_READER, 338 ZTRL_WRITER, 339 ZTRL_APPEND 340 } rl_type_t; 341 342 typedef struct rll { 343 void *rll_writer; 344 int rll_readers; 345 kmutex_t rll_lock; 346 kcondvar_t rll_cv; 347 } rll_t; 348 349 typedef struct rl { 350 uint64_t rl_object; 351 uint64_t rl_offset; 352 uint64_t rl_size; 353 rll_t *rl_lock; 354 } rl_t; 355 356 #define ZTEST_RANGE_LOCKS 64 357 #define ZTEST_OBJECT_LOCKS 64 358 359 /* 360 * Object descriptor. Used as a template for object lookup/create/remove. 361 */ 362 typedef struct ztest_od { 363 uint64_t od_dir; 364 uint64_t od_object; 365 dmu_object_type_t od_type; 366 dmu_object_type_t od_crtype; 367 uint64_t od_blocksize; 368 uint64_t od_crblocksize; 369 uint64_t od_crdnodesize; 370 uint64_t od_gen; 371 uint64_t od_crgen; 372 char od_name[ZFS_MAX_DATASET_NAME_LEN]; 373 } ztest_od_t; 374 375 /* 376 * Per-dataset state. 377 */ 378 typedef struct ztest_ds { 379 ztest_shared_ds_t *zd_shared; 380 objset_t *zd_os; 381 pthread_rwlock_t zd_zilog_lock; 382 zilog_t *zd_zilog; 383 ztest_od_t *zd_od; /* debugging aid */ 384 char zd_name[ZFS_MAX_DATASET_NAME_LEN]; 385 kmutex_t zd_dirobj_lock; 386 rll_t zd_object_lock[ZTEST_OBJECT_LOCKS]; 387 rll_t zd_range_lock[ZTEST_RANGE_LOCKS]; 388 } ztest_ds_t; 389 390 /* 391 * Per-iteration state. 
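 * Each test is a ztest_func_t.  Each entry in the ztest_info[] table below
 * pairs a test function with the number of iterations to run per call and
 * the interval at which it should be scheduled.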
392 */ 393 typedef void ztest_func_t(ztest_ds_t *zd, uint64_t id); 394 395 typedef struct ztest_info { 396 ztest_func_t *zi_func; /* test function */ 397 uint64_t zi_iters; /* iterations per execution */ 398 uint64_t *zi_interval; /* execute every <interval> seconds */ 399 const char *zi_funcname; /* name of test function */ 400 } ztest_info_t; 401 402 typedef struct ztest_shared_callstate { 403 uint64_t zc_count; /* per-pass count */ 404 uint64_t zc_time; /* per-pass time */ 405 uint64_t zc_next; /* next time to call this function */ 406 } ztest_shared_callstate_t; 407 408 static ztest_shared_callstate_t *ztest_shared_callstate; 409 #define ZTEST_GET_SHARED_CALLSTATE(c) (&ztest_shared_callstate[c]) 410 411 ztest_func_t ztest_dmu_read_write; 412 ztest_func_t ztest_dmu_write_parallel; 413 ztest_func_t ztest_dmu_object_alloc_free; 414 ztest_func_t ztest_dmu_object_next_chunk; 415 ztest_func_t ztest_dmu_commit_callbacks; 416 ztest_func_t ztest_zap; 417 ztest_func_t ztest_zap_parallel; 418 ztest_func_t ztest_zil_commit; 419 ztest_func_t ztest_zil_remount; 420 ztest_func_t ztest_dmu_read_write_zcopy; 421 ztest_func_t ztest_dmu_objset_create_destroy; 422 ztest_func_t ztest_dmu_prealloc; 423 ztest_func_t ztest_fzap; 424 ztest_func_t ztest_dmu_snapshot_create_destroy; 425 ztest_func_t ztest_dsl_prop_get_set; 426 ztest_func_t ztest_spa_prop_get_set; 427 ztest_func_t ztest_spa_create_destroy; 428 ztest_func_t ztest_fault_inject; 429 ztest_func_t ztest_dmu_snapshot_hold; 430 ztest_func_t ztest_mmp_enable_disable; 431 ztest_func_t ztest_scrub; 432 ztest_func_t ztest_dsl_dataset_promote_busy; 433 ztest_func_t ztest_vdev_attach_detach; 434 ztest_func_t ztest_vdev_raidz_attach; 435 ztest_func_t ztest_vdev_LUN_growth; 436 ztest_func_t ztest_vdev_add_remove; 437 ztest_func_t ztest_vdev_class_add; 438 ztest_func_t ztest_vdev_aux_add_remove; 439 ztest_func_t ztest_split_pool; 440 ztest_func_t ztest_reguid; 441 ztest_func_t ztest_spa_upgrade; 442 ztest_func_t ztest_device_removal; 443 ztest_func_t ztest_spa_checkpoint_create_discard; 444 ztest_func_t ztest_initialize; 445 ztest_func_t ztest_trim; 446 ztest_func_t ztest_blake3; 447 ztest_func_t ztest_fletcher; 448 ztest_func_t ztest_fletcher_incr; 449 ztest_func_t ztest_verify_dnode_bt; 450 ztest_func_t ztest_pool_prefetch_ddt; 451 ztest_func_t ztest_ddt_prune; 452 453 static uint64_t zopt_always = 0ULL * NANOSEC; /* all the time */ 454 static uint64_t zopt_incessant = 1ULL * NANOSEC / 10; /* every 1/10 second */ 455 static uint64_t zopt_often = 1ULL * NANOSEC; /* every second */ 456 static uint64_t zopt_sometimes = 10ULL * NANOSEC; /* every 10 seconds */ 457 static uint64_t zopt_rarely = 60ULL * NANOSEC; /* every 60 seconds */ 458 459 #define ZTI_INIT(func, iters, interval) \ 460 { .zi_func = (func), \ 461 .zi_iters = (iters), \ 462 .zi_interval = (interval), \ 463 .zi_funcname = # func } 464 465 static ztest_info_t ztest_info[] = { 466 ZTI_INIT(ztest_dmu_read_write, 1, &zopt_always), 467 ZTI_INIT(ztest_dmu_write_parallel, 10, &zopt_always), 468 ZTI_INIT(ztest_dmu_object_alloc_free, 1, &zopt_always), 469 ZTI_INIT(ztest_dmu_object_next_chunk, 1, &zopt_sometimes), 470 ZTI_INIT(ztest_dmu_commit_callbacks, 1, &zopt_always), 471 ZTI_INIT(ztest_zap, 30, &zopt_always), 472 ZTI_INIT(ztest_zap_parallel, 100, &zopt_always), 473 ZTI_INIT(ztest_split_pool, 1, &zopt_sometimes), 474 ZTI_INIT(ztest_zil_commit, 1, &zopt_incessant), 475 ZTI_INIT(ztest_zil_remount, 1, &zopt_sometimes), 476 ZTI_INIT(ztest_dmu_read_write_zcopy, 1, &zopt_often), 477 
ZTI_INIT(ztest_dmu_objset_create_destroy, 1, &zopt_often), 478 ZTI_INIT(ztest_dsl_prop_get_set, 1, &zopt_often), 479 ZTI_INIT(ztest_spa_prop_get_set, 1, &zopt_sometimes), 480 #if 0 481 ZTI_INIT(ztest_dmu_prealloc, 1, &zopt_sometimes), 482 #endif 483 ZTI_INIT(ztest_fzap, 1, &zopt_sometimes), 484 ZTI_INIT(ztest_dmu_snapshot_create_destroy, 1, &zopt_sometimes), 485 ZTI_INIT(ztest_spa_create_destroy, 1, &zopt_sometimes), 486 ZTI_INIT(ztest_fault_inject, 1, &zopt_sometimes), 487 ZTI_INIT(ztest_dmu_snapshot_hold, 1, &zopt_sometimes), 488 ZTI_INIT(ztest_mmp_enable_disable, 1, &zopt_sometimes), 489 ZTI_INIT(ztest_reguid, 1, &zopt_rarely), 490 ZTI_INIT(ztest_scrub, 1, &zopt_rarely), 491 ZTI_INIT(ztest_spa_upgrade, 1, &zopt_rarely), 492 ZTI_INIT(ztest_dsl_dataset_promote_busy, 1, &zopt_rarely), 493 ZTI_INIT(ztest_vdev_attach_detach, 1, &zopt_sometimes), 494 ZTI_INIT(ztest_vdev_raidz_attach, 1, &zopt_sometimes), 495 ZTI_INIT(ztest_vdev_LUN_growth, 1, &zopt_rarely), 496 ZTI_INIT(ztest_vdev_add_remove, 1, &ztest_opts.zo_vdevtime), 497 ZTI_INIT(ztest_vdev_class_add, 1, &ztest_opts.zo_vdevtime), 498 ZTI_INIT(ztest_vdev_aux_add_remove, 1, &ztest_opts.zo_vdevtime), 499 ZTI_INIT(ztest_device_removal, 1, &zopt_sometimes), 500 ZTI_INIT(ztest_spa_checkpoint_create_discard, 1, &zopt_rarely), 501 ZTI_INIT(ztest_initialize, 1, &zopt_sometimes), 502 ZTI_INIT(ztest_trim, 1, &zopt_sometimes), 503 ZTI_INIT(ztest_blake3, 1, &zopt_rarely), 504 ZTI_INIT(ztest_fletcher, 1, &zopt_rarely), 505 ZTI_INIT(ztest_fletcher_incr, 1, &zopt_rarely), 506 ZTI_INIT(ztest_verify_dnode_bt, 1, &zopt_sometimes), 507 ZTI_INIT(ztest_pool_prefetch_ddt, 1, &zopt_rarely), 508 ZTI_INIT(ztest_ddt_prune, 1, &zopt_rarely), 509 }; 510 511 #define ZTEST_FUNCS (sizeof (ztest_info) / sizeof (ztest_info_t)) 512 513 /* 514 * The following struct is used to hold a list of uncalled commit callbacks. 515 * The callbacks are ordered by txg number. 516 */ 517 typedef struct ztest_cb_list { 518 kmutex_t zcl_callbacks_lock; 519 list_t zcl_callbacks; 520 } ztest_cb_list_t; 521 522 /* 523 * Stuff we need to share writably between parent and child. 524 */ 525 typedef struct ztest_shared { 526 boolean_t zs_do_init; 527 hrtime_t zs_proc_start; 528 hrtime_t zs_proc_stop; 529 hrtime_t zs_thread_start; 530 hrtime_t zs_thread_stop; 531 hrtime_t zs_thread_kill; 532 uint64_t zs_enospc_count; 533 uint64_t zs_vdev_next_leaf; 534 uint64_t zs_vdev_aux; 535 uint64_t zs_alloc; 536 uint64_t zs_space; 537 uint64_t zs_splits; 538 uint64_t zs_mirrors; 539 uint64_t zs_metaslab_sz; 540 uint64_t zs_metaslab_df_alloc_threshold; 541 uint64_t zs_guid; 542 } ztest_shared_t; 543 544 #define ID_PARALLEL -1ULL 545 546 static char ztest_dev_template[] = "%s/%s.%llua"; 547 static char ztest_aux_template[] = "%s/%s.%s.%llu"; 548 static ztest_shared_t *ztest_shared; 549 550 static spa_t *ztest_spa = NULL; 551 static ztest_ds_t *ztest_ds; 552 553 static kmutex_t ztest_vdev_lock; 554 static boolean_t ztest_device_removal_active = B_FALSE; 555 static boolean_t ztest_pool_scrubbed = B_FALSE; 556 static kmutex_t ztest_checkpoint_lock; 557 558 /* 559 * The ztest_name_lock protects the pool and dataset namespace used by 560 * the individual tests. To modify the namespace, consumers must grab 561 * this lock as writer. Grabbing the lock as reader will ensure that the 562 * namespace does not change while the lock is held. 
563 */ 564 static pthread_rwlock_t ztest_name_lock; 565 566 static boolean_t ztest_dump_core = B_TRUE; 567 static boolean_t ztest_exiting; 568 569 /* Global commit callback list */ 570 static ztest_cb_list_t zcl; 571 /* Commit cb delay */ 572 static uint64_t zc_min_txg_delay = UINT64_MAX; 573 static int zc_cb_counter = 0; 574 575 /* 576 * Minimum number of commit callbacks that need to be registered for us to check 577 * whether the minimum txg delay is acceptable. 578 */ 579 #define ZTEST_COMMIT_CB_MIN_REG 100 580 581 /* 582 * If a number of txgs equal to this threshold have been created after a commit 583 * callback has been registered but not called, then we assume there is an 584 * implementation bug. 585 */ 586 #define ZTEST_COMMIT_CB_THRESH (TXG_CONCURRENT_STATES + 1000) 587 588 enum ztest_object { 589 ZTEST_META_DNODE = 0, 590 ZTEST_DIROBJ, 591 ZTEST_OBJECTS 592 }; 593 594 static __attribute__((noreturn)) void usage(boolean_t requested); 595 static int ztest_scrub_impl(spa_t *spa); 596 597 /* 598 * These libumem hooks provide a reasonable set of defaults for the allocator's 599 * debugging facilities. 600 */ 601 const char * 602 _umem_debug_init(void) 603 { 604 return ("default,verbose"); /* $UMEM_DEBUG setting */ 605 } 606 607 const char * 608 _umem_logging_init(void) 609 { 610 return ("fail,contents"); /* $UMEM_LOGGING setting */ 611 } 612 613 static void 614 dump_debug_buffer(void) 615 { 616 ssize_t ret __attribute__((unused)); 617 618 if (!ztest_opts.zo_dump_dbgmsg) 619 return; 620 621 /* 622 * We use write() instead of printf() so that this function 623 * is safe to call from a signal handler. 624 */ 625 ret = write(STDERR_FILENO, "\n", 1); 626 zfs_dbgmsg_print(STDERR_FILENO, "ztest"); 627 } 628 629 static void sig_handler(int signo) 630 { 631 struct sigaction action; 632 633 libspl_backtrace(STDERR_FILENO); 634 dump_debug_buffer(); 635 636 /* 637 * Restore default action and re-raise signal so SIGSEGV and 638 * SIGABRT can trigger a core dump. 639 */ 640 action.sa_handler = SIG_DFL; 641 sigemptyset(&action.sa_mask); 642 action.sa_flags = 0; 643 (void) sigaction(signo, &action, NULL); 644 raise(signo); 645 } 646 647 #define FATAL_MSG_SZ 1024 648 649 static const char *fatal_msg; 650 651 static __attribute__((format(printf, 2, 3))) __attribute__((noreturn)) void 652 fatal(int do_perror, const char *message, ...) 
653 { 654 va_list args; 655 int save_errno = errno; 656 char *buf; 657 658 (void) fflush(stdout); 659 buf = umem_alloc(FATAL_MSG_SZ, UMEM_NOFAIL); 660 if (buf == NULL) 661 goto out; 662 663 va_start(args, message); 664 (void) sprintf(buf, "ztest: "); 665 /* LINTED */ 666 (void) vsprintf(buf + strlen(buf), message, args); 667 va_end(args); 668 if (do_perror) { 669 (void) snprintf(buf + strlen(buf), FATAL_MSG_SZ - strlen(buf), 670 ": %s", strerror(save_errno)); 671 } 672 (void) fprintf(stderr, "%s\n", buf); 673 fatal_msg = buf; /* to ease debugging */ 674 675 out: 676 if (ztest_dump_core) 677 abort(); 678 else 679 dump_debug_buffer(); 680 681 exit(3); 682 } 683 684 static int 685 str2shift(const char *buf) 686 { 687 const char *ends = "BKMGTPEZ"; 688 int i; 689 690 if (buf[0] == '\0') 691 return (0); 692 for (i = 0; i < strlen(ends); i++) { 693 if (toupper(buf[0]) == ends[i]) 694 break; 695 } 696 if (i == strlen(ends)) { 697 (void) fprintf(stderr, "ztest: invalid bytes suffix: %s\n", 698 buf); 699 usage(B_FALSE); 700 } 701 if (buf[1] == '\0' || (toupper(buf[1]) == 'B' && buf[2] == '\0')) { 702 return (10*i); 703 } 704 (void) fprintf(stderr, "ztest: invalid bytes suffix: %s\n", buf); 705 usage(B_FALSE); 706 } 707 708 static uint64_t 709 nicenumtoull(const char *buf) 710 { 711 char *end; 712 uint64_t val; 713 714 val = strtoull(buf, &end, 0); 715 if (end == buf) { 716 (void) fprintf(stderr, "ztest: bad numeric value: %s\n", buf); 717 usage(B_FALSE); 718 } else if (end[0] == '.') { 719 double fval = strtod(buf, &end); 720 fval *= pow(2, str2shift(end)); 721 /* 722 * UINT64_MAX is not exactly representable as a double. 723 * The closest representation is UINT64_MAX + 1, so we 724 * use a >= comparison instead of > for the bounds check. 725 */ 726 if (fval >= (double)UINT64_MAX) { 727 (void) fprintf(stderr, "ztest: value too large: %s\n", 728 buf); 729 usage(B_FALSE); 730 } 731 val = (uint64_t)fval; 732 } else { 733 int shift = str2shift(end); 734 if (shift >= 64 || (val << shift) >> shift != val) { 735 (void) fprintf(stderr, "ztest: value too large: %s\n", 736 buf); 737 usage(B_FALSE); 738 } 739 val <<= shift; 740 } 741 return (val); 742 } 743 744 typedef struct ztest_option { 745 const char short_opt; 746 const char *long_opt; 747 const char *long_opt_param; 748 const char *comment; 749 unsigned int default_int; 750 const char *default_str; 751 } ztest_option_t; 752 753 /* 754 * The following option_table is used for generating the usage info as well as 755 * the long and short option information for calling getopt_long(). 
756 */ 757 static ztest_option_t option_table[] = { 758 { 'v', "vdevs", "INTEGER", "Number of vdevs", DEFAULT_VDEV_COUNT, 759 NULL}, 760 { 's', "vdev-size", "INTEGER", "Size of each vdev", 761 NO_DEFAULT, DEFAULT_VDEV_SIZE_STR}, 762 { 'a', "alignment-shift", "INTEGER", 763 "Alignment shift; use 0 for random", DEFAULT_ASHIFT, NULL}, 764 { 'm', "mirror-copies", "INTEGER", "Number of mirror copies", 765 DEFAULT_MIRRORS, NULL}, 766 { 'r', "raid-disks", "INTEGER", "Number of raidz/draid disks", 767 DEFAULT_RAID_CHILDREN, NULL}, 768 { 'R', "raid-parity", "INTEGER", "Raid parity", 769 DEFAULT_RAID_PARITY, NULL}, 770 { 'K', "raid-kind", "raidz|eraidz|draid|random", "Raid kind", 771 NO_DEFAULT, "random"}, 772 { 'D', "draid-data", "INTEGER", "Number of draid data drives", 773 DEFAULT_DRAID_DATA, NULL}, 774 { 'S', "draid-spares", "INTEGER", "Number of draid spares", 775 DEFAULT_DRAID_SPARES, NULL}, 776 { 'd', "datasets", "INTEGER", "Number of datasets", 777 DEFAULT_DATASETS_COUNT, NULL}, 778 { 't', "threads", "INTEGER", "Number of ztest threads", 779 DEFAULT_THREADS, NULL}, 780 { 'g', "gang-block-threshold", "INTEGER", 781 "Metaslab gang block threshold", 782 NO_DEFAULT, DEFAULT_FORCE_GANGING_STR}, 783 { 'i', "init-count", "INTEGER", "Number of times to initialize pool", 784 DEFAULT_INITS, NULL}, 785 { 'k', "kill-percentage", "INTEGER", "Kill percentage", 786 NO_DEFAULT, DEFAULT_KILLRATE_STR}, 787 { 'p', "pool-name", "STRING", "Pool name", 788 NO_DEFAULT, DEFAULT_POOL}, 789 { 'f', "vdev-file-directory", "PATH", "File directory for vdev files", 790 NO_DEFAULT, DEFAULT_VDEV_DIR}, 791 { 'M', "multi-host", NULL, 792 "Multi-host; simulate pool imported on remote host", 793 NO_DEFAULT, NULL}, 794 { 'E', "use-existing-pool", NULL, 795 "Use existing pool instead of creating new one", NO_DEFAULT, NULL}, 796 { 'T', "run-time", "INTEGER", "Total run time", 797 NO_DEFAULT, DEFAULT_RUN_TIME_STR}, 798 { 'P', "pass-time", "INTEGER", "Time per pass", 799 NO_DEFAULT, DEFAULT_PASS_TIME_STR}, 800 { 'F', "freeze-loops", "INTEGER", "Max loops in spa_freeze()", 801 DEFAULT_MAX_LOOPS, NULL}, 802 { 'B', "alt-ztest", "PATH", "Alternate ztest path", 803 NO_DEFAULT, NULL}, 804 { 'C', "vdev-class-state", "on|off|random", "vdev class state", 805 NO_DEFAULT, "random"}, 806 { 'X', "raidz-expansion", NULL, 807 "Perform a dedicated raidz expansion test", 808 NO_DEFAULT, NULL}, 809 { 'o', "option", "\"OPTION=INTEGER\"", 810 "Set global variable to an unsigned 32-bit integer value", 811 NO_DEFAULT, NULL}, 812 { 'G', "dump-debug-msg", NULL, 813 "Dump zfs_dbgmsg buffer before exiting due to an error", 814 NO_DEFAULT, NULL}, 815 { 'V', "verbose", NULL, 816 "Verbose (use multiple times for ever more verbosity)", 817 NO_DEFAULT, NULL}, 818 { 'h', "help", NULL, "Show this help", 819 NO_DEFAULT, NULL}, 820 {0, 0, 0, 0, 0, 0} 821 }; 822 823 static struct option *long_opts = NULL; 824 static char *short_opts = NULL; 825 826 static void 827 init_options(void) 828 { 829 ASSERT3P(long_opts, ==, NULL); 830 ASSERT3P(short_opts, ==, NULL); 831 832 int count = sizeof (option_table) / sizeof (option_table[0]); 833 long_opts = umem_alloc(sizeof (struct option) * count, UMEM_NOFAIL); 834 835 short_opts = umem_alloc(sizeof (char) * 2 * count, UMEM_NOFAIL); 836 int short_opt_index = 0; 837 838 for (int i = 0; i < count; i++) { 839 long_opts[i].val = option_table[i].short_opt; 840 long_opts[i].name = option_table[i].long_opt; 841 long_opts[i].has_arg = option_table[i].long_opt_param != NULL 842 ? 
required_argument : no_argument; 843 long_opts[i].flag = NULL; 844 short_opts[short_opt_index++] = option_table[i].short_opt; 845 if (option_table[i].long_opt_param != NULL) { 846 short_opts[short_opt_index++] = ':'; 847 } 848 } 849 } 850 851 static void 852 fini_options(void) 853 { 854 int count = sizeof (option_table) / sizeof (option_table[0]); 855 856 umem_free(long_opts, sizeof (struct option) * count); 857 umem_free(short_opts, sizeof (char) * 2 * count); 858 859 long_opts = NULL; 860 short_opts = NULL; 861 } 862 863 static __attribute__((noreturn)) void 864 usage(boolean_t requested) 865 { 866 char option[80]; 867 FILE *fp = requested ? stdout : stderr; 868 869 (void) fprintf(fp, "Usage: %s [OPTIONS...]\n", DEFAULT_POOL); 870 for (int i = 0; option_table[i].short_opt != 0; i++) { 871 if (option_table[i].long_opt_param != NULL) { 872 (void) sprintf(option, " -%c --%s=%s", 873 option_table[i].short_opt, 874 option_table[i].long_opt, 875 option_table[i].long_opt_param); 876 } else { 877 (void) sprintf(option, " -%c --%s", 878 option_table[i].short_opt, 879 option_table[i].long_opt); 880 } 881 (void) fprintf(fp, " %-43s%s", option, 882 option_table[i].comment); 883 884 if (option_table[i].long_opt_param != NULL) { 885 if (option_table[i].default_str != NULL) { 886 (void) fprintf(fp, " (default: %s)", 887 option_table[i].default_str); 888 } else if (option_table[i].default_int != NO_DEFAULT) { 889 (void) fprintf(fp, " (default: %u)", 890 option_table[i].default_int); 891 } 892 } 893 (void) fprintf(fp, "\n"); 894 } 895 exit(requested ? 0 : 1); 896 } 897 898 static uint64_t 899 ztest_random(uint64_t range) 900 { 901 uint64_t r; 902 903 ASSERT3S(ztest_fd_rand, >=, 0); 904 905 if (range == 0) 906 return (0); 907 908 if (read(ztest_fd_rand, &r, sizeof (r)) != sizeof (r)) 909 fatal(B_TRUE, "short read from /dev/urandom"); 910 911 return (r % range); 912 } 913 914 static void 915 ztest_parse_name_value(const char *input, ztest_shared_opts_t *zo) 916 { 917 char name[32]; 918 char *value; 919 int state = ZTEST_VDEV_CLASS_RND; 920 921 (void) strlcpy(name, input, sizeof (name)); 922 923 value = strchr(name, '='); 924 if (value == NULL) { 925 (void) fprintf(stderr, "missing value in property=value " 926 "'-C' argument (%s)\n", input); 927 usage(B_FALSE); 928 } 929 *(value) = '\0'; 930 value++; 931 932 if (strcmp(value, "on") == 0) { 933 state = ZTEST_VDEV_CLASS_ON; 934 } else if (strcmp(value, "off") == 0) { 935 state = ZTEST_VDEV_CLASS_OFF; 936 } else if (strcmp(value, "random") == 0) { 937 state = ZTEST_VDEV_CLASS_RND; 938 } else { 939 (void) fprintf(stderr, "invalid property value '%s'\n", value); 940 usage(B_FALSE); 941 } 942 943 if (strcmp(name, "special") == 0) { 944 zo->zo_special_vdevs = state; 945 } else { 946 (void) fprintf(stderr, "invalid property name '%s'\n", name); 947 usage(B_FALSE); 948 } 949 if (zo->zo_verbose >= 3) 950 (void) printf("%s vdev state is '%s'\n", name, value); 951 } 952 953 static void 954 process_options(int argc, char **argv) 955 { 956 char *path; 957 ztest_shared_opts_t *zo = &ztest_opts; 958 959 int opt; 960 uint64_t value; 961 const char *raid_kind = "random"; 962 963 memcpy(zo, &ztest_opts_defaults, sizeof (*zo)); 964 965 init_options(); 966 967 while ((opt = getopt_long(argc, argv, short_opts, long_opts, 968 NULL)) != EOF) { 969 value = 0; 970 switch (opt) { 971 case 'v': 972 case 's': 973 case 'a': 974 case 'm': 975 case 'r': 976 case 'R': 977 case 'D': 978 case 'S': 979 case 'd': 980 case 't': 981 case 'g': 982 case 'i': 983 case 'k': 984 case 'T': 985 case 
'P': 986 case 'F': 987 value = nicenumtoull(optarg); 988 } 989 switch (opt) { 990 case 'v': 991 zo->zo_vdevs = value; 992 break; 993 case 's': 994 zo->zo_vdev_size = MAX(SPA_MINDEVSIZE, value); 995 break; 996 case 'a': 997 zo->zo_ashift = value; 998 break; 999 case 'm': 1000 zo->zo_mirrors = value; 1001 break; 1002 case 'r': 1003 zo->zo_raid_children = MAX(1, value); 1004 break; 1005 case 'R': 1006 zo->zo_raid_parity = MIN(MAX(value, 1), 3); 1007 break; 1008 case 'K': 1009 raid_kind = optarg; 1010 break; 1011 case 'D': 1012 zo->zo_draid_data = MAX(1, value); 1013 break; 1014 case 'S': 1015 zo->zo_draid_spares = MAX(1, value); 1016 break; 1017 case 'd': 1018 zo->zo_datasets = MAX(1, value); 1019 break; 1020 case 't': 1021 zo->zo_threads = MAX(1, value); 1022 break; 1023 case 'g': 1024 zo->zo_metaslab_force_ganging = 1025 MAX(SPA_MINBLOCKSIZE << 1, value); 1026 break; 1027 case 'i': 1028 zo->zo_init = value; 1029 break; 1030 case 'k': 1031 zo->zo_killrate = value; 1032 break; 1033 case 'p': 1034 (void) strlcpy(zo->zo_pool, optarg, 1035 sizeof (zo->zo_pool)); 1036 break; 1037 case 'f': 1038 path = realpath(optarg, NULL); 1039 if (path == NULL) { 1040 (void) fprintf(stderr, "error: %s: %s\n", 1041 optarg, strerror(errno)); 1042 usage(B_FALSE); 1043 } else { 1044 (void) strlcpy(zo->zo_dir, path, 1045 sizeof (zo->zo_dir)); 1046 free(path); 1047 } 1048 break; 1049 case 'M': 1050 zo->zo_mmp_test = 1; 1051 break; 1052 case 'V': 1053 zo->zo_verbose++; 1054 break; 1055 case 'X': 1056 zo->zo_raidz_expand_test = RAIDZ_EXPAND_REQUESTED; 1057 break; 1058 case 'E': 1059 zo->zo_init = 0; 1060 break; 1061 case 'T': 1062 zo->zo_time = value; 1063 break; 1064 case 'P': 1065 zo->zo_passtime = MAX(1, value); 1066 break; 1067 case 'F': 1068 zo->zo_maxloops = MAX(1, value); 1069 break; 1070 case 'B': 1071 (void) strlcpy(zo->zo_alt_ztest, optarg, 1072 sizeof (zo->zo_alt_ztest)); 1073 break; 1074 case 'C': 1075 ztest_parse_name_value(optarg, zo); 1076 break; 1077 case 'o': 1078 if (zo->zo_gvars_count >= ZO_GVARS_MAX_COUNT) { 1079 (void) fprintf(stderr, 1080 "max global var count (%zu) exceeded\n", 1081 ZO_GVARS_MAX_COUNT); 1082 usage(B_FALSE); 1083 } 1084 char *v = zo->zo_gvars[zo->zo_gvars_count]; 1085 if (strlcpy(v, optarg, ZO_GVARS_MAX_ARGLEN) >= 1086 ZO_GVARS_MAX_ARGLEN) { 1087 (void) fprintf(stderr, 1088 "global var option '%s' is too long\n", 1089 optarg); 1090 usage(B_FALSE); 1091 } 1092 zo->zo_gvars_count++; 1093 break; 1094 case 'G': 1095 zo->zo_dump_dbgmsg = 1; 1096 break; 1097 case 'h': 1098 usage(B_TRUE); 1099 break; 1100 case '?': 1101 default: 1102 usage(B_FALSE); 1103 break; 1104 } 1105 } 1106 1107 fini_options(); 1108 1109 /* Force compatible options for raidz expansion run */ 1110 if (zo->zo_raidz_expand_test == RAIDZ_EXPAND_REQUESTED) { 1111 zo->zo_mmp_test = 0; 1112 zo->zo_mirrors = 0; 1113 zo->zo_vdevs = 1; 1114 zo->zo_vdev_size = DEFAULT_VDEV_SIZE * 2; 1115 zo->zo_raid_do_expand = B_FALSE; 1116 raid_kind = "raidz"; 1117 } 1118 1119 if (strcmp(raid_kind, "random") == 0) { 1120 switch (ztest_random(3)) { 1121 case 0: 1122 raid_kind = "raidz"; 1123 break; 1124 case 1: 1125 raid_kind = "eraidz"; 1126 break; 1127 case 2: 1128 raid_kind = "draid"; 1129 break; 1130 } 1131 1132 if (ztest_opts.zo_verbose >= 3) 1133 (void) printf("choosing RAID type '%s'\n", raid_kind); 1134 } 1135 1136 if (strcmp(raid_kind, "draid") == 0) { 1137 uint64_t min_devsize; 1138 1139 /* With fewer disk use 256M, otherwise 128M is OK */ 1140 min_devsize = (ztest_opts.zo_raid_children < 16) ? 
1141 (256ULL << 20) : (128ULL << 20); 1142 1143 /* No top-level mirrors with dRAID for now */ 1144 zo->zo_mirrors = 0; 1145 1146 /* Use more appropriate defaults for dRAID */ 1147 if (zo->zo_vdevs == ztest_opts_defaults.zo_vdevs) 1148 zo->zo_vdevs = 1; 1149 if (zo->zo_raid_children == 1150 ztest_opts_defaults.zo_raid_children) 1151 zo->zo_raid_children = 16; 1152 if (zo->zo_ashift < 12) 1153 zo->zo_ashift = 12; 1154 if (zo->zo_vdev_size < min_devsize) 1155 zo->zo_vdev_size = min_devsize; 1156 1157 if (zo->zo_draid_data + zo->zo_raid_parity > 1158 zo->zo_raid_children - zo->zo_draid_spares) { 1159 (void) fprintf(stderr, "error: too few draid " 1160 "children (%d) for stripe width (%d)\n", 1161 zo->zo_raid_children, 1162 zo->zo_draid_data + zo->zo_raid_parity); 1163 usage(B_FALSE); 1164 } 1165 1166 (void) strlcpy(zo->zo_raid_type, VDEV_TYPE_DRAID, 1167 sizeof (zo->zo_raid_type)); 1168 1169 } else if (strcmp(raid_kind, "eraidz") == 0) { 1170 /* using eraidz (expandable raidz) */ 1171 zo->zo_raid_do_expand = B_TRUE; 1172 1173 /* tests expect top-level to be raidz */ 1174 zo->zo_mirrors = 0; 1175 zo->zo_vdevs = 1; 1176 1177 /* Make sure parity is less than data columns */ 1178 zo->zo_raid_parity = MIN(zo->zo_raid_parity, 1179 zo->zo_raid_children - 1); 1180 1181 } else /* using raidz */ { 1182 ASSERT0(strcmp(raid_kind, "raidz")); 1183 1184 zo->zo_raid_parity = MIN(zo->zo_raid_parity, 1185 zo->zo_raid_children - 1); 1186 } 1187 1188 zo->zo_vdevtime = 1189 (zo->zo_vdevs > 0 ? zo->zo_time * NANOSEC / zo->zo_vdevs : 1190 UINT64_MAX >> 2); 1191 1192 if (*zo->zo_alt_ztest) { 1193 const char *invalid_what = "ztest"; 1194 char *val = zo->zo_alt_ztest; 1195 if (0 != access(val, X_OK) || 1196 (strrchr(val, '/') == NULL && (errno == EINVAL))) 1197 goto invalid; 1198 1199 int dirlen = strrchr(val, '/') - val; 1200 strlcpy(zo->zo_alt_libpath, val, 1201 MIN(sizeof (zo->zo_alt_libpath), dirlen + 1)); 1202 invalid_what = "library path", val = zo->zo_alt_libpath; 1203 if (strrchr(val, '/') == NULL && (errno == EINVAL)) 1204 goto invalid; 1205 *strrchr(val, '/') = '\0'; 1206 strlcat(val, "/lib", sizeof (zo->zo_alt_libpath)); 1207 1208 if (0 != access(zo->zo_alt_libpath, X_OK)) 1209 goto invalid; 1210 return; 1211 1212 invalid: 1213 ztest_dump_core = B_FALSE; 1214 fatal(B_TRUE, "invalid alternate %s %s", invalid_what, val); 1215 } 1216 } 1217 1218 static void 1219 ztest_kill(ztest_shared_t *zs) 1220 { 1221 zs->zs_alloc = metaslab_class_get_alloc(spa_normal_class(ztest_spa)); 1222 zs->zs_space = metaslab_class_get_space(spa_normal_class(ztest_spa)); 1223 1224 /* 1225 * Before we kill ourselves, make sure that the config is updated. 1226 * See comment above spa_write_cachefile(). 1227 */ 1228 if (raidz_expand_pause_point != RAIDZ_EXPAND_PAUSE_NONE) { 1229 if (mutex_tryenter(&spa_namespace_lock)) { 1230 spa_write_cachefile(ztest_spa, B_FALSE, B_FALSE, 1231 B_FALSE); 1232 mutex_exit(&spa_namespace_lock); 1233 1234 ztest_scratch_state->zs_raidz_scratch_verify_pause = 1235 raidz_expand_pause_point; 1236 } else { 1237 /* 1238 * Do not verify scratch object in case if 1239 * spa_namespace_lock cannot be acquired, 1240 * it can cause deadlock in spa_config_update(). 
1241 */ 1242 raidz_expand_pause_point = RAIDZ_EXPAND_PAUSE_NONE; 1243 1244 return; 1245 } 1246 } else { 1247 mutex_enter(&spa_namespace_lock); 1248 spa_write_cachefile(ztest_spa, B_FALSE, B_FALSE, B_FALSE); 1249 mutex_exit(&spa_namespace_lock); 1250 } 1251 1252 (void) raise(SIGKILL); 1253 } 1254 1255 static void 1256 ztest_record_enospc(const char *s) 1257 { 1258 (void) s; 1259 ztest_shared->zs_enospc_count++; 1260 } 1261 1262 static uint64_t 1263 ztest_get_ashift(void) 1264 { 1265 if (ztest_opts.zo_ashift == 0) 1266 return (SPA_MINBLOCKSHIFT + ztest_random(5)); 1267 return (ztest_opts.zo_ashift); 1268 } 1269 1270 static boolean_t 1271 ztest_is_draid_spare(const char *name) 1272 { 1273 uint64_t spare_id = 0, parity = 0, vdev_id = 0; 1274 1275 if (sscanf(name, VDEV_TYPE_DRAID "%"PRIu64"-%"PRIu64"-%"PRIu64"", 1276 &parity, &vdev_id, &spare_id) == 3) { 1277 return (B_TRUE); 1278 } 1279 1280 return (B_FALSE); 1281 } 1282 1283 static nvlist_t * 1284 make_vdev_file(const char *path, const char *aux, const char *pool, 1285 size_t size, uint64_t ashift) 1286 { 1287 char *pathbuf = NULL; 1288 uint64_t vdev; 1289 nvlist_t *file; 1290 boolean_t draid_spare = B_FALSE; 1291 1292 1293 if (ashift == 0) 1294 ashift = ztest_get_ashift(); 1295 1296 if (path == NULL) { 1297 pathbuf = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); 1298 path = pathbuf; 1299 1300 if (aux != NULL) { 1301 vdev = ztest_shared->zs_vdev_aux; 1302 (void) snprintf(pathbuf, MAXPATHLEN, 1303 ztest_aux_template, ztest_opts.zo_dir, 1304 pool == NULL ? ztest_opts.zo_pool : pool, 1305 aux, vdev); 1306 } else { 1307 vdev = ztest_shared->zs_vdev_next_leaf++; 1308 (void) snprintf(pathbuf, MAXPATHLEN, 1309 ztest_dev_template, ztest_opts.zo_dir, 1310 pool == NULL ? ztest_opts.zo_pool : pool, vdev); 1311 } 1312 } else { 1313 draid_spare = ztest_is_draid_spare(path); 1314 } 1315 1316 if (size != 0 && !draid_spare) { 1317 int fd = open(path, O_RDWR | O_CREAT | O_TRUNC, 0666); 1318 if (fd == -1) 1319 fatal(B_TRUE, "can't open %s", path); 1320 if (ftruncate(fd, size) != 0) 1321 fatal(B_TRUE, "can't ftruncate %s", path); 1322 (void) close(fd); 1323 } 1324 1325 file = fnvlist_alloc(); 1326 fnvlist_add_string(file, ZPOOL_CONFIG_TYPE, 1327 draid_spare ? 
VDEV_TYPE_DRAID_SPARE : VDEV_TYPE_FILE); 1328 fnvlist_add_string(file, ZPOOL_CONFIG_PATH, path); 1329 fnvlist_add_uint64(file, ZPOOL_CONFIG_ASHIFT, ashift); 1330 umem_free(pathbuf, MAXPATHLEN); 1331 1332 return (file); 1333 } 1334 1335 static nvlist_t * 1336 make_vdev_raid(const char *path, const char *aux, const char *pool, size_t size, 1337 uint64_t ashift, int r) 1338 { 1339 nvlist_t *raid, **child; 1340 int c; 1341 1342 if (r < 2) 1343 return (make_vdev_file(path, aux, pool, size, ashift)); 1344 child = umem_alloc(r * sizeof (nvlist_t *), UMEM_NOFAIL); 1345 1346 for (c = 0; c < r; c++) 1347 child[c] = make_vdev_file(path, aux, pool, size, ashift); 1348 1349 raid = fnvlist_alloc(); 1350 fnvlist_add_string(raid, ZPOOL_CONFIG_TYPE, 1351 ztest_opts.zo_raid_type); 1352 fnvlist_add_uint64(raid, ZPOOL_CONFIG_NPARITY, 1353 ztest_opts.zo_raid_parity); 1354 fnvlist_add_nvlist_array(raid, ZPOOL_CONFIG_CHILDREN, 1355 (const nvlist_t **)child, r); 1356 1357 if (strcmp(ztest_opts.zo_raid_type, VDEV_TYPE_DRAID) == 0) { 1358 uint64_t ndata = ztest_opts.zo_draid_data; 1359 uint64_t nparity = ztest_opts.zo_raid_parity; 1360 uint64_t nspares = ztest_opts.zo_draid_spares; 1361 uint64_t children = ztest_opts.zo_raid_children; 1362 uint64_t ngroups = 1; 1363 1364 /* 1365 * Calculate the minimum number of groups required to fill a 1366 * slice. This is the LCM of the stripe width (data + parity) 1367 * and the number of data drives (children - spares). 1368 */ 1369 while (ngroups * (ndata + nparity) % (children - nspares) != 0) 1370 ngroups++; 1371 1372 /* Store the basic dRAID configuration. */ 1373 fnvlist_add_uint64(raid, ZPOOL_CONFIG_DRAID_NDATA, ndata); 1374 fnvlist_add_uint64(raid, ZPOOL_CONFIG_DRAID_NSPARES, nspares); 1375 fnvlist_add_uint64(raid, ZPOOL_CONFIG_DRAID_NGROUPS, ngroups); 1376 } 1377 1378 for (c = 0; c < r; c++) 1379 fnvlist_free(child[c]); 1380 1381 umem_free(child, r * sizeof (nvlist_t *)); 1382 1383 return (raid); 1384 } 1385 1386 static nvlist_t * 1387 make_vdev_mirror(const char *path, const char *aux, const char *pool, 1388 size_t size, uint64_t ashift, int r, int m) 1389 { 1390 nvlist_t *mirror, **child; 1391 int c; 1392 1393 if (m < 1) 1394 return (make_vdev_raid(path, aux, pool, size, ashift, r)); 1395 1396 child = umem_alloc(m * sizeof (nvlist_t *), UMEM_NOFAIL); 1397 1398 for (c = 0; c < m; c++) 1399 child[c] = make_vdev_raid(path, aux, pool, size, ashift, r); 1400 1401 mirror = fnvlist_alloc(); 1402 fnvlist_add_string(mirror, ZPOOL_CONFIG_TYPE, VDEV_TYPE_MIRROR); 1403 fnvlist_add_nvlist_array(mirror, ZPOOL_CONFIG_CHILDREN, 1404 (const nvlist_t **)child, m); 1405 1406 for (c = 0; c < m; c++) 1407 fnvlist_free(child[c]); 1408 1409 umem_free(child, m * sizeof (nvlist_t *)); 1410 1411 return (mirror); 1412 } 1413 1414 static nvlist_t * 1415 make_vdev_root(const char *path, const char *aux, const char *pool, size_t size, 1416 uint64_t ashift, const char *class, int r, int m, int t) 1417 { 1418 nvlist_t *root, **child; 1419 int c; 1420 boolean_t log; 1421 1422 ASSERT3S(t, >, 0); 1423 1424 log = (class != NULL && strcmp(class, "log") == 0); 1425 1426 child = umem_alloc(t * sizeof (nvlist_t *), UMEM_NOFAIL); 1427 1428 for (c = 0; c < t; c++) { 1429 child[c] = make_vdev_mirror(path, aux, pool, size, ashift, 1430 r, m); 1431 fnvlist_add_uint64(child[c], ZPOOL_CONFIG_IS_LOG, log); 1432 1433 if (class != NULL && class[0] != '\0') { 1434 ASSERT(m > 1 || log); /* expecting a mirror */ 1435 fnvlist_add_string(child[c], 1436 ZPOOL_CONFIG_ALLOCATION_BIAS, class); 1437 } 1438 } 1439 1440 root = 
fnvlist_alloc(); 1441 fnvlist_add_string(root, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT); 1442 fnvlist_add_nvlist_array(root, aux ? aux : ZPOOL_CONFIG_CHILDREN, 1443 (const nvlist_t **)child, t); 1444 1445 for (c = 0; c < t; c++) 1446 fnvlist_free(child[c]); 1447 1448 umem_free(child, t * sizeof (nvlist_t *)); 1449 1450 return (root); 1451 } 1452 1453 /* 1454 * Find a random spa version. Returns back a random spa version in the 1455 * range [initial_version, SPA_VERSION_FEATURES]. 1456 */ 1457 static uint64_t 1458 ztest_random_spa_version(uint64_t initial_version) 1459 { 1460 uint64_t version = initial_version; 1461 1462 if (version <= SPA_VERSION_BEFORE_FEATURES) { 1463 version = version + 1464 ztest_random(SPA_VERSION_BEFORE_FEATURES - version + 1); 1465 } 1466 1467 if (version > SPA_VERSION_BEFORE_FEATURES) 1468 version = SPA_VERSION_FEATURES; 1469 1470 ASSERT(SPA_VERSION_IS_SUPPORTED(version)); 1471 return (version); 1472 } 1473 1474 static int 1475 ztest_random_blocksize(void) 1476 { 1477 ASSERT3U(ztest_spa->spa_max_ashift, !=, 0); 1478 1479 /* 1480 * Choose a block size >= the ashift. 1481 * If the SPA supports new MAXBLOCKSIZE, test up to 1MB blocks. 1482 */ 1483 int maxbs = SPA_OLD_MAXBLOCKSHIFT; 1484 if (spa_maxblocksize(ztest_spa) == SPA_MAXBLOCKSIZE) 1485 maxbs = 20; 1486 uint64_t block_shift = 1487 ztest_random(maxbs - ztest_spa->spa_max_ashift + 1); 1488 return (1 << (SPA_MINBLOCKSHIFT + block_shift)); 1489 } 1490 1491 static int 1492 ztest_random_dnodesize(void) 1493 { 1494 int slots; 1495 int max_slots = spa_maxdnodesize(ztest_spa) >> DNODE_SHIFT; 1496 1497 if (max_slots == DNODE_MIN_SLOTS) 1498 return (DNODE_MIN_SIZE); 1499 1500 /* 1501 * Weight the random distribution more heavily toward smaller 1502 * dnode sizes since that is more likely to reflect real-world 1503 * usage. 1504 */ 1505 ASSERT3U(max_slots, >, 4); 1506 switch (ztest_random(10)) { 1507 case 0: 1508 slots = 5 + ztest_random(max_slots - 4); 1509 break; 1510 case 1 ... 4: 1511 slots = 2 + ztest_random(3); 1512 break; 1513 default: 1514 slots = 1; 1515 break; 1516 } 1517 1518 return (slots << DNODE_SHIFT); 1519 } 1520 1521 static int 1522 ztest_random_ibshift(void) 1523 { 1524 return (DN_MIN_INDBLKSHIFT + 1525 ztest_random(DN_MAX_INDBLKSHIFT - DN_MIN_INDBLKSHIFT + 1)); 1526 } 1527 1528 static uint64_t 1529 ztest_random_vdev_top(spa_t *spa, boolean_t log_ok) 1530 { 1531 uint64_t top; 1532 vdev_t *rvd = spa->spa_root_vdev; 1533 vdev_t *tvd; 1534 1535 ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); 1536 1537 do { 1538 top = ztest_random(rvd->vdev_children); 1539 tvd = rvd->vdev_child[top]; 1540 } while (!vdev_is_concrete(tvd) || (tvd->vdev_islog && !log_ok) || 1541 tvd->vdev_mg == NULL || tvd->vdev_mg->mg_class == NULL); 1542 1543 return (top); 1544 } 1545 1546 static uint64_t 1547 ztest_random_dsl_prop(zfs_prop_t prop) 1548 { 1549 uint64_t value; 1550 1551 do { 1552 value = zfs_prop_random_value(prop, ztest_random(-1ULL)); 1553 } while (prop == ZFS_PROP_CHECKSUM && value == ZIO_CHECKSUM_OFF); 1554 1555 return (value); 1556 } 1557 1558 static int 1559 ztest_dsl_prop_set_uint64(char *osname, zfs_prop_t prop, uint64_t value, 1560 boolean_t inherit) 1561 { 1562 const char *propname = zfs_prop_to_name(prop); 1563 const char *valname; 1564 char *setpoint; 1565 uint64_t curval; 1566 int error; 1567 1568 error = dsl_prop_set_int(osname, propname, 1569 (inherit ? 
ZPROP_SRC_NONE : ZPROP_SRC_LOCAL), value); 1570 1571 if (error == ENOSPC) { 1572 ztest_record_enospc(FTAG); 1573 return (error); 1574 } 1575 ASSERT0(error); 1576 1577 setpoint = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); 1578 VERIFY0(dsl_prop_get_integer(osname, propname, &curval, setpoint)); 1579 1580 if (ztest_opts.zo_verbose >= 6) { 1581 int err; 1582 1583 err = zfs_prop_index_to_string(prop, curval, &valname); 1584 if (err) 1585 (void) printf("%s %s = %llu at '%s'\n", osname, 1586 propname, (unsigned long long)curval, setpoint); 1587 else 1588 (void) printf("%s %s = %s at '%s'\n", 1589 osname, propname, valname, setpoint); 1590 } 1591 umem_free(setpoint, MAXPATHLEN); 1592 1593 return (error); 1594 } 1595 1596 static int 1597 ztest_spa_prop_set_uint64(zpool_prop_t prop, uint64_t value) 1598 { 1599 spa_t *spa = ztest_spa; 1600 nvlist_t *props = NULL; 1601 int error; 1602 1603 props = fnvlist_alloc(); 1604 fnvlist_add_uint64(props, zpool_prop_to_name(prop), value); 1605 1606 error = spa_prop_set(spa, props); 1607 1608 fnvlist_free(props); 1609 1610 if (error == ENOSPC) { 1611 ztest_record_enospc(FTAG); 1612 return (error); 1613 } 1614 ASSERT0(error); 1615 1616 return (error); 1617 } 1618 1619 static int 1620 ztest_dmu_objset_own(const char *name, dmu_objset_type_t type, 1621 boolean_t readonly, boolean_t decrypt, const void *tag, objset_t **osp) 1622 { 1623 int err; 1624 char *cp = NULL; 1625 char ddname[ZFS_MAX_DATASET_NAME_LEN]; 1626 1627 strlcpy(ddname, name, sizeof (ddname)); 1628 cp = strchr(ddname, '@'); 1629 if (cp != NULL) 1630 *cp = '\0'; 1631 1632 err = dmu_objset_own(name, type, readonly, decrypt, tag, osp); 1633 while (decrypt && err == EACCES) { 1634 dsl_crypto_params_t *dcp; 1635 nvlist_t *crypto_args = fnvlist_alloc(); 1636 1637 fnvlist_add_uint8_array(crypto_args, "wkeydata", 1638 (uint8_t *)ztest_wkeydata, WRAPPING_KEY_LEN); 1639 VERIFY0(dsl_crypto_params_create_nvlist(DCP_CMD_NONE, NULL, 1640 crypto_args, &dcp)); 1641 err = spa_keystore_load_wkey(ddname, dcp, B_FALSE); 1642 /* 1643 * Note: if there was an error loading, the wkey was not 1644 * consumed, and needs to be freed. 1645 */ 1646 dsl_crypto_params_free(dcp, (err != 0)); 1647 fnvlist_free(crypto_args); 1648 1649 if (err == EINVAL) { 1650 /* 1651 * We couldn't load a key for this dataset so try 1652 * the parent. This loop will eventually hit the 1653 * encryption root since ztest only makes clones 1654 * as children of their origin datasets. 
1655 */ 1656 cp = strrchr(ddname, '/'); 1657 if (cp == NULL) 1658 return (err); 1659 1660 *cp = '\0'; 1661 err = EACCES; 1662 continue; 1663 } else if (err != 0) { 1664 break; 1665 } 1666 1667 err = dmu_objset_own(name, type, readonly, decrypt, tag, osp); 1668 break; 1669 } 1670 1671 return (err); 1672 } 1673 1674 static void 1675 ztest_rll_init(rll_t *rll) 1676 { 1677 rll->rll_writer = NULL; 1678 rll->rll_readers = 0; 1679 mutex_init(&rll->rll_lock, NULL, MUTEX_DEFAULT, NULL); 1680 cv_init(&rll->rll_cv, NULL, CV_DEFAULT, NULL); 1681 } 1682 1683 static void 1684 ztest_rll_destroy(rll_t *rll) 1685 { 1686 ASSERT3P(rll->rll_writer, ==, NULL); 1687 ASSERT0(rll->rll_readers); 1688 mutex_destroy(&rll->rll_lock); 1689 cv_destroy(&rll->rll_cv); 1690 } 1691 1692 static void 1693 ztest_rll_lock(rll_t *rll, rl_type_t type) 1694 { 1695 mutex_enter(&rll->rll_lock); 1696 1697 if (type == ZTRL_READER) { 1698 while (rll->rll_writer != NULL) 1699 (void) cv_wait(&rll->rll_cv, &rll->rll_lock); 1700 rll->rll_readers++; 1701 } else { 1702 while (rll->rll_writer != NULL || rll->rll_readers) 1703 (void) cv_wait(&rll->rll_cv, &rll->rll_lock); 1704 rll->rll_writer = curthread; 1705 } 1706 1707 mutex_exit(&rll->rll_lock); 1708 } 1709 1710 static void 1711 ztest_rll_unlock(rll_t *rll) 1712 { 1713 mutex_enter(&rll->rll_lock); 1714 1715 if (rll->rll_writer) { 1716 ASSERT0(rll->rll_readers); 1717 rll->rll_writer = NULL; 1718 } else { 1719 ASSERT3S(rll->rll_readers, >, 0); 1720 ASSERT3P(rll->rll_writer, ==, NULL); 1721 rll->rll_readers--; 1722 } 1723 1724 if (rll->rll_writer == NULL && rll->rll_readers == 0) 1725 cv_broadcast(&rll->rll_cv); 1726 1727 mutex_exit(&rll->rll_lock); 1728 } 1729 1730 static void 1731 ztest_object_lock(ztest_ds_t *zd, uint64_t object, rl_type_t type) 1732 { 1733 rll_t *rll = &zd->zd_object_lock[object & (ZTEST_OBJECT_LOCKS - 1)]; 1734 1735 ztest_rll_lock(rll, type); 1736 } 1737 1738 static void 1739 ztest_object_unlock(ztest_ds_t *zd, uint64_t object) 1740 { 1741 rll_t *rll = &zd->zd_object_lock[object & (ZTEST_OBJECT_LOCKS - 1)]; 1742 1743 ztest_rll_unlock(rll); 1744 } 1745 1746 static rl_t * 1747 ztest_range_lock(ztest_ds_t *zd, uint64_t object, uint64_t offset, 1748 uint64_t size, rl_type_t type) 1749 { 1750 uint64_t hash = object ^ (offset % (ZTEST_RANGE_LOCKS + 1)); 1751 rll_t *rll = &zd->zd_range_lock[hash & (ZTEST_RANGE_LOCKS - 1)]; 1752 rl_t *rl; 1753 1754 rl = umem_alloc(sizeof (*rl), UMEM_NOFAIL); 1755 rl->rl_object = object; 1756 rl->rl_offset = offset; 1757 rl->rl_size = size; 1758 rl->rl_lock = rll; 1759 1760 ztest_rll_lock(rll, type); 1761 1762 return (rl); 1763 } 1764 1765 static void 1766 ztest_range_unlock(rl_t *rl) 1767 { 1768 rll_t *rll = rl->rl_lock; 1769 1770 ztest_rll_unlock(rll); 1771 1772 umem_free(rl, sizeof (*rl)); 1773 } 1774 1775 static void 1776 ztest_zd_init(ztest_ds_t *zd, ztest_shared_ds_t *szd, objset_t *os) 1777 { 1778 zd->zd_os = os; 1779 zd->zd_zilog = dmu_objset_zil(os); 1780 zd->zd_shared = szd; 1781 dmu_objset_name(os, zd->zd_name); 1782 int l; 1783 1784 if (zd->zd_shared != NULL) 1785 zd->zd_shared->zd_seq = 0; 1786 1787 VERIFY0(pthread_rwlock_init(&zd->zd_zilog_lock, NULL)); 1788 mutex_init(&zd->zd_dirobj_lock, NULL, MUTEX_DEFAULT, NULL); 1789 1790 for (l = 0; l < ZTEST_OBJECT_LOCKS; l++) 1791 ztest_rll_init(&zd->zd_object_lock[l]); 1792 1793 for (l = 0; l < ZTEST_RANGE_LOCKS; l++) 1794 ztest_rll_init(&zd->zd_range_lock[l]); 1795 } 1796 1797 static void 1798 ztest_zd_fini(ztest_ds_t *zd) 1799 { 1800 int l; 1801 1802 mutex_destroy(&zd->zd_dirobj_lock); 
1803 (void) pthread_rwlock_destroy(&zd->zd_zilog_lock); 1804 1805 for (l = 0; l < ZTEST_OBJECT_LOCKS; l++) 1806 ztest_rll_destroy(&zd->zd_object_lock[l]); 1807 1808 for (l = 0; l < ZTEST_RANGE_LOCKS; l++) 1809 ztest_rll_destroy(&zd->zd_range_lock[l]); 1810 } 1811 1812 #define TXG_MIGHTWAIT (ztest_random(10) == 0 ? TXG_NOWAIT : TXG_WAIT) 1813 1814 static uint64_t 1815 ztest_tx_assign(dmu_tx_t *tx, uint64_t txg_how, const char *tag) 1816 { 1817 uint64_t txg; 1818 int error; 1819 1820 /* 1821 * Attempt to assign tx to some transaction group. 1822 */ 1823 error = dmu_tx_assign(tx, txg_how); 1824 if (error) { 1825 if (error == ERESTART) { 1826 ASSERT3U(txg_how, ==, TXG_NOWAIT); 1827 dmu_tx_wait(tx); 1828 } else { 1829 ASSERT3U(error, ==, ENOSPC); 1830 ztest_record_enospc(tag); 1831 } 1832 dmu_tx_abort(tx); 1833 return (0); 1834 } 1835 txg = dmu_tx_get_txg(tx); 1836 ASSERT3U(txg, !=, 0); 1837 return (txg); 1838 } 1839 1840 static void 1841 ztest_bt_generate(ztest_block_tag_t *bt, objset_t *os, uint64_t object, 1842 uint64_t dnodesize, uint64_t offset, uint64_t gen, uint64_t txg, 1843 uint64_t crtxg) 1844 { 1845 bt->bt_magic = BT_MAGIC; 1846 bt->bt_objset = dmu_objset_id(os); 1847 bt->bt_object = object; 1848 bt->bt_dnodesize = dnodesize; 1849 bt->bt_offset = offset; 1850 bt->bt_gen = gen; 1851 bt->bt_txg = txg; 1852 bt->bt_crtxg = crtxg; 1853 } 1854 1855 static void 1856 ztest_bt_verify(ztest_block_tag_t *bt, objset_t *os, uint64_t object, 1857 uint64_t dnodesize, uint64_t offset, uint64_t gen, uint64_t txg, 1858 uint64_t crtxg) 1859 { 1860 ASSERT3U(bt->bt_magic, ==, BT_MAGIC); 1861 ASSERT3U(bt->bt_objset, ==, dmu_objset_id(os)); 1862 ASSERT3U(bt->bt_object, ==, object); 1863 ASSERT3U(bt->bt_dnodesize, ==, dnodesize); 1864 ASSERT3U(bt->bt_offset, ==, offset); 1865 ASSERT3U(bt->bt_gen, <=, gen); 1866 ASSERT3U(bt->bt_txg, <=, txg); 1867 ASSERT3U(bt->bt_crtxg, ==, crtxg); 1868 } 1869 1870 static ztest_block_tag_t * 1871 ztest_bt_bonus(dmu_buf_t *db) 1872 { 1873 dmu_object_info_t doi; 1874 ztest_block_tag_t *bt; 1875 1876 dmu_object_info_from_db(db, &doi); 1877 ASSERT3U(doi.doi_bonus_size, <=, db->db_size); 1878 ASSERT3U(doi.doi_bonus_size, >=, sizeof (*bt)); 1879 bt = (void *)((char *)db->db_data + doi.doi_bonus_size - sizeof (*bt)); 1880 1881 return (bt); 1882 } 1883 1884 /* 1885 * Generate a token to fill up unused bonus buffer space. Try to make 1886 * it unique to the object, generation, and offset to verify that data 1887 * is not getting overwritten by data from other dnodes. 1888 */ 1889 #define ZTEST_BONUS_FILL_TOKEN(obj, ds, gen, offset) \ 1890 (((ds) << 48) | ((gen) << 32) | ((obj) << 8) | (offset)) 1891 1892 /* 1893 * Fill up the unused bonus buffer region before the block tag with a 1894 * verifiable pattern. Filling the whole bonus area with non-zero data 1895 * helps ensure that all dnode traversal code properly skips the 1896 * interior regions of large dnodes. 1897 */ 1898 static void 1899 ztest_fill_unused_bonus(dmu_buf_t *db, void *end, uint64_t obj, 1900 objset_t *os, uint64_t gen) 1901 { 1902 uint64_t *bonusp; 1903 1904 ASSERT(IS_P2ALIGNED((char *)end - (char *)db->db_data, 8)); 1905 1906 for (bonusp = db->db_data; bonusp < (uint64_t *)end; bonusp++) { 1907 uint64_t token = ZTEST_BONUS_FILL_TOKEN(obj, dmu_objset_id(os), 1908 gen, bonusp - (uint64_t *)db->db_data); 1909 *bonusp = token; 1910 } 1911 } 1912 1913 /* 1914 * Verify that the unused area of a bonus buffer is filled with the 1915 * expected tokens. 
1916 */ 1917 static void 1918 ztest_verify_unused_bonus(dmu_buf_t *db, void *end, uint64_t obj, 1919 objset_t *os, uint64_t gen) 1920 { 1921 uint64_t *bonusp; 1922 1923 for (bonusp = db->db_data; bonusp < (uint64_t *)end; bonusp++) { 1924 uint64_t token = ZTEST_BONUS_FILL_TOKEN(obj, dmu_objset_id(os), 1925 gen, bonusp - (uint64_t *)db->db_data); 1926 VERIFY3U(*bonusp, ==, token); 1927 } 1928 } 1929 1930 /* 1931 * ZIL logging ops 1932 */ 1933 1934 #define lrz_type lr_mode 1935 #define lrz_blocksize lr_uid 1936 #define lrz_ibshift lr_gid 1937 #define lrz_bonustype lr_rdev 1938 #define lrz_dnodesize lr_crtime[1] 1939 1940 static void 1941 ztest_log_create(ztest_ds_t *zd, dmu_tx_t *tx, lr_create_t *lr) 1942 { 1943 char *name = (void *)(lr + 1); /* name follows lr */ 1944 size_t namesize = strlen(name) + 1; 1945 itx_t *itx; 1946 1947 if (zil_replaying(zd->zd_zilog, tx)) 1948 return; 1949 1950 itx = zil_itx_create(TX_CREATE, sizeof (*lr) + namesize); 1951 memcpy(&itx->itx_lr + 1, &lr->lr_common + 1, 1952 sizeof (*lr) + namesize - sizeof (lr_t)); 1953 1954 zil_itx_assign(zd->zd_zilog, itx, tx); 1955 } 1956 1957 static void 1958 ztest_log_remove(ztest_ds_t *zd, dmu_tx_t *tx, lr_remove_t *lr, uint64_t object) 1959 { 1960 char *name = (void *)(lr + 1); /* name follows lr */ 1961 size_t namesize = strlen(name) + 1; 1962 itx_t *itx; 1963 1964 if (zil_replaying(zd->zd_zilog, tx)) 1965 return; 1966 1967 itx = zil_itx_create(TX_REMOVE, sizeof (*lr) + namesize); 1968 memcpy(&itx->itx_lr + 1, &lr->lr_common + 1, 1969 sizeof (*lr) + namesize - sizeof (lr_t)); 1970 1971 itx->itx_oid = object; 1972 zil_itx_assign(zd->zd_zilog, itx, tx); 1973 } 1974 1975 static void 1976 ztest_log_write(ztest_ds_t *zd, dmu_tx_t *tx, lr_write_t *lr) 1977 { 1978 itx_t *itx; 1979 itx_wr_state_t write_state = ztest_random(WR_NUM_STATES); 1980 1981 if (zil_replaying(zd->zd_zilog, tx)) 1982 return; 1983 1984 if (lr->lr_length > zil_max_log_data(zd->zd_zilog, sizeof (lr_write_t))) 1985 write_state = WR_INDIRECT; 1986 1987 itx = zil_itx_create(TX_WRITE, 1988 sizeof (*lr) + (write_state == WR_COPIED ? 
lr->lr_length : 0)); 1989 1990 if (write_state == WR_COPIED && 1991 dmu_read(zd->zd_os, lr->lr_foid, lr->lr_offset, lr->lr_length, 1992 ((lr_write_t *)&itx->itx_lr) + 1, DMU_READ_NO_PREFETCH) != 0) { 1993 zil_itx_destroy(itx); 1994 itx = zil_itx_create(TX_WRITE, sizeof (*lr)); 1995 write_state = WR_NEED_COPY; 1996 } 1997 itx->itx_private = zd; 1998 itx->itx_wr_state = write_state; 1999 itx->itx_sync = (ztest_random(8) == 0); 2000 2001 memcpy(&itx->itx_lr + 1, &lr->lr_common + 1, 2002 sizeof (*lr) - sizeof (lr_t)); 2003 2004 zil_itx_assign(zd->zd_zilog, itx, tx); 2005 } 2006 2007 static void 2008 ztest_log_truncate(ztest_ds_t *zd, dmu_tx_t *tx, lr_truncate_t *lr) 2009 { 2010 itx_t *itx; 2011 2012 if (zil_replaying(zd->zd_zilog, tx)) 2013 return; 2014 2015 itx = zil_itx_create(TX_TRUNCATE, sizeof (*lr)); 2016 memcpy(&itx->itx_lr + 1, &lr->lr_common + 1, 2017 sizeof (*lr) - sizeof (lr_t)); 2018 2019 itx->itx_sync = B_FALSE; 2020 zil_itx_assign(zd->zd_zilog, itx, tx); 2021 } 2022 2023 static void 2024 ztest_log_setattr(ztest_ds_t *zd, dmu_tx_t *tx, lr_setattr_t *lr) 2025 { 2026 itx_t *itx; 2027 2028 if (zil_replaying(zd->zd_zilog, tx)) 2029 return; 2030 2031 itx = zil_itx_create(TX_SETATTR, sizeof (*lr)); 2032 memcpy(&itx->itx_lr + 1, &lr->lr_common + 1, 2033 sizeof (*lr) - sizeof (lr_t)); 2034 2035 itx->itx_sync = B_FALSE; 2036 zil_itx_assign(zd->zd_zilog, itx, tx); 2037 } 2038 2039 /* 2040 * ZIL replay ops 2041 */ 2042 static int 2043 ztest_replay_create(void *arg1, void *arg2, boolean_t byteswap) 2044 { 2045 ztest_ds_t *zd = arg1; 2046 lr_create_t *lr = arg2; 2047 char *name = (void *)(lr + 1); /* name follows lr */ 2048 objset_t *os = zd->zd_os; 2049 ztest_block_tag_t *bbt; 2050 dmu_buf_t *db; 2051 dmu_tx_t *tx; 2052 uint64_t txg; 2053 int error = 0; 2054 int bonuslen; 2055 2056 if (byteswap) 2057 byteswap_uint64_array(lr, sizeof (*lr)); 2058 2059 ASSERT3U(lr->lr_doid, ==, ZTEST_DIROBJ); 2060 ASSERT3S(name[0], !=, '\0'); 2061 2062 tx = dmu_tx_create(os); 2063 2064 dmu_tx_hold_zap(tx, lr->lr_doid, B_TRUE, name); 2065 2066 if (lr->lrz_type == DMU_OT_ZAP_OTHER) { 2067 dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL); 2068 } else { 2069 dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); 2070 } 2071 2072 txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); 2073 if (txg == 0) 2074 return (ENOSPC); 2075 2076 ASSERT3U(dmu_objset_zil(os)->zl_replay, ==, !!lr->lr_foid); 2077 bonuslen = DN_BONUS_SIZE(lr->lrz_dnodesize); 2078 2079 if (lr->lrz_type == DMU_OT_ZAP_OTHER) { 2080 if (lr->lr_foid == 0) { 2081 lr->lr_foid = zap_create_dnsize(os, 2082 lr->lrz_type, lr->lrz_bonustype, 2083 bonuslen, lr->lrz_dnodesize, tx); 2084 } else { 2085 error = zap_create_claim_dnsize(os, lr->lr_foid, 2086 lr->lrz_type, lr->lrz_bonustype, 2087 bonuslen, lr->lrz_dnodesize, tx); 2088 } 2089 } else { 2090 if (lr->lr_foid == 0) { 2091 lr->lr_foid = dmu_object_alloc_dnsize(os, 2092 lr->lrz_type, 0, lr->lrz_bonustype, 2093 bonuslen, lr->lrz_dnodesize, tx); 2094 } else { 2095 error = dmu_object_claim_dnsize(os, lr->lr_foid, 2096 lr->lrz_type, 0, lr->lrz_bonustype, 2097 bonuslen, lr->lrz_dnodesize, tx); 2098 } 2099 } 2100 2101 if (error) { 2102 ASSERT3U(error, ==, EEXIST); 2103 ASSERT(zd->zd_zilog->zl_replay); 2104 dmu_tx_commit(tx); 2105 return (error); 2106 } 2107 2108 ASSERT3U(lr->lr_foid, !=, 0); 2109 2110 if (lr->lrz_type != DMU_OT_ZAP_OTHER) 2111 VERIFY0(dmu_object_set_blocksize(os, lr->lr_foid, 2112 lr->lrz_blocksize, lr->lrz_ibshift, tx)); 2113 2114 VERIFY0(dmu_bonus_hold(os, lr->lr_foid, FTAG, &db)); 2115 bbt = ztest_bt_bonus(db); 2116 
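	/*
	 * Dirty the bonus buffer before stamping the block tag and fill
	 * pattern below, so that the updated contents are written out as
	 * part of this transaction.
	 */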
dmu_buf_will_dirty(db, tx); 2117 ztest_bt_generate(bbt, os, lr->lr_foid, lr->lrz_dnodesize, -1ULL, 2118 lr->lr_gen, txg, txg); 2119 ztest_fill_unused_bonus(db, bbt, lr->lr_foid, os, lr->lr_gen); 2120 dmu_buf_rele(db, FTAG); 2121 2122 VERIFY0(zap_add(os, lr->lr_doid, name, sizeof (uint64_t), 1, 2123 &lr->lr_foid, tx)); 2124 2125 (void) ztest_log_create(zd, tx, lr); 2126 2127 dmu_tx_commit(tx); 2128 2129 return (0); 2130 } 2131 2132 static int 2133 ztest_replay_remove(void *arg1, void *arg2, boolean_t byteswap) 2134 { 2135 ztest_ds_t *zd = arg1; 2136 lr_remove_t *lr = arg2; 2137 char *name = (void *)(lr + 1); /* name follows lr */ 2138 objset_t *os = zd->zd_os; 2139 dmu_object_info_t doi; 2140 dmu_tx_t *tx; 2141 uint64_t object, txg; 2142 2143 if (byteswap) 2144 byteswap_uint64_array(lr, sizeof (*lr)); 2145 2146 ASSERT3U(lr->lr_doid, ==, ZTEST_DIROBJ); 2147 ASSERT3S(name[0], !=, '\0'); 2148 2149 VERIFY0( 2150 zap_lookup(os, lr->lr_doid, name, sizeof (object), 1, &object)); 2151 ASSERT3U(object, !=, 0); 2152 2153 ztest_object_lock(zd, object, ZTRL_WRITER); 2154 2155 VERIFY0(dmu_object_info(os, object, &doi)); 2156 2157 tx = dmu_tx_create(os); 2158 2159 dmu_tx_hold_zap(tx, lr->lr_doid, B_FALSE, name); 2160 dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END); 2161 2162 txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); 2163 if (txg == 0) { 2164 ztest_object_unlock(zd, object); 2165 return (ENOSPC); 2166 } 2167 2168 if (doi.doi_type == DMU_OT_ZAP_OTHER) { 2169 VERIFY0(zap_destroy(os, object, tx)); 2170 } else { 2171 VERIFY0(dmu_object_free(os, object, tx)); 2172 } 2173 2174 VERIFY0(zap_remove(os, lr->lr_doid, name, tx)); 2175 2176 (void) ztest_log_remove(zd, tx, lr, object); 2177 2178 dmu_tx_commit(tx); 2179 2180 ztest_object_unlock(zd, object); 2181 2182 return (0); 2183 } 2184 2185 static int 2186 ztest_replay_write(void *arg1, void *arg2, boolean_t byteswap) 2187 { 2188 ztest_ds_t *zd = arg1; 2189 lr_write_t *lr = arg2; 2190 objset_t *os = zd->zd_os; 2191 void *data = lr + 1; /* data follows lr */ 2192 uint64_t offset, length; 2193 ztest_block_tag_t *bt = data; 2194 ztest_block_tag_t *bbt; 2195 uint64_t gen, txg, lrtxg, crtxg; 2196 dmu_object_info_t doi; 2197 dmu_tx_t *tx; 2198 dmu_buf_t *db; 2199 arc_buf_t *abuf = NULL; 2200 rl_t *rl; 2201 2202 if (byteswap) 2203 byteswap_uint64_array(lr, sizeof (*lr)); 2204 2205 offset = lr->lr_offset; 2206 length = lr->lr_length; 2207 2208 /* If it's a dmu_sync() block, write the whole block */ 2209 if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) { 2210 uint64_t blocksize = BP_GET_LSIZE(&lr->lr_blkptr); 2211 if (length < blocksize) { 2212 offset -= offset % blocksize; 2213 length = blocksize; 2214 } 2215 } 2216 2217 if (bt->bt_magic == BSWAP_64(BT_MAGIC)) 2218 byteswap_uint64_array(bt, sizeof (*bt)); 2219 2220 if (bt->bt_magic != BT_MAGIC) 2221 bt = NULL; 2222 2223 ztest_object_lock(zd, lr->lr_foid, ZTRL_READER); 2224 rl = ztest_range_lock(zd, lr->lr_foid, offset, length, ZTRL_WRITER); 2225 2226 VERIFY0(dmu_bonus_hold(os, lr->lr_foid, FTAG, &db)); 2227 2228 dmu_object_info_from_db(db, &doi); 2229 2230 bbt = ztest_bt_bonus(db); 2231 ASSERT3U(bbt->bt_magic, ==, BT_MAGIC); 2232 gen = bbt->bt_gen; 2233 crtxg = bbt->bt_crtxg; 2234 lrtxg = lr->lr_common.lrc_txg; 2235 2236 tx = dmu_tx_create(os); 2237 2238 dmu_tx_hold_write(tx, lr->lr_foid, offset, length); 2239 2240 if (ztest_random(8) == 0 && length == doi.doi_data_block_size && 2241 P2PHASE(offset, length) == 0) 2242 abuf = dmu_request_arcbuf(db, length); 2243 2244 txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); 2245 if 
(txg == 0) { 2246 if (abuf != NULL) 2247 dmu_return_arcbuf(abuf); 2248 dmu_buf_rele(db, FTAG); 2249 ztest_range_unlock(rl); 2250 ztest_object_unlock(zd, lr->lr_foid); 2251 return (ENOSPC); 2252 } 2253 2254 if (bt != NULL) { 2255 /* 2256 * Usually, verify the old data before writing new data -- 2257 * but not always, because we also want to verify correct 2258 * behavior when the data was not recently read into cache. 2259 */ 2260 ASSERT(doi.doi_data_block_size); 2261 ASSERT0(offset % doi.doi_data_block_size); 2262 if (ztest_random(4) != 0) { 2263 int prefetch = ztest_random(2) ? 2264 DMU_READ_PREFETCH : DMU_READ_NO_PREFETCH; 2265 ztest_block_tag_t rbt; 2266 2267 VERIFY(dmu_read(os, lr->lr_foid, offset, 2268 sizeof (rbt), &rbt, prefetch) == 0); 2269 if (rbt.bt_magic == BT_MAGIC) { 2270 ztest_bt_verify(&rbt, os, lr->lr_foid, 0, 2271 offset, gen, txg, crtxg); 2272 } 2273 } 2274 2275 /* 2276 * Writes can appear to be newer than the bonus buffer because 2277 * the ztest_get_data() callback does a dmu_read() of the 2278 * open-context data, which may be different than the data 2279 * as it was when the write was generated. 2280 */ 2281 if (zd->zd_zilog->zl_replay) { 2282 ztest_bt_verify(bt, os, lr->lr_foid, 0, offset, 2283 MAX(gen, bt->bt_gen), MAX(txg, lrtxg), 2284 bt->bt_crtxg); 2285 } 2286 2287 /* 2288 * Set the bt's gen/txg to the bonus buffer's gen/txg 2289 * so that all of the usual ASSERTs will work. 2290 */ 2291 ztest_bt_generate(bt, os, lr->lr_foid, 0, offset, gen, txg, 2292 crtxg); 2293 } 2294 2295 if (abuf == NULL) { 2296 dmu_write(os, lr->lr_foid, offset, length, data, tx); 2297 } else { 2298 memcpy(abuf->b_data, data, length); 2299 VERIFY0(dmu_assign_arcbuf_by_dbuf(db, offset, abuf, tx)); 2300 } 2301 2302 (void) ztest_log_write(zd, tx, lr); 2303 2304 dmu_buf_rele(db, FTAG); 2305 2306 dmu_tx_commit(tx); 2307 2308 ztest_range_unlock(rl); 2309 ztest_object_unlock(zd, lr->lr_foid); 2310 2311 return (0); 2312 } 2313 2314 static int 2315 ztest_replay_truncate(void *arg1, void *arg2, boolean_t byteswap) 2316 { 2317 ztest_ds_t *zd = arg1; 2318 lr_truncate_t *lr = arg2; 2319 objset_t *os = zd->zd_os; 2320 dmu_tx_t *tx; 2321 uint64_t txg; 2322 rl_t *rl; 2323 2324 if (byteswap) 2325 byteswap_uint64_array(lr, sizeof (*lr)); 2326 2327 ztest_object_lock(zd, lr->lr_foid, ZTRL_READER); 2328 rl = ztest_range_lock(zd, lr->lr_foid, lr->lr_offset, lr->lr_length, 2329 ZTRL_WRITER); 2330 2331 tx = dmu_tx_create(os); 2332 2333 dmu_tx_hold_free(tx, lr->lr_foid, lr->lr_offset, lr->lr_length); 2334 2335 txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); 2336 if (txg == 0) { 2337 ztest_range_unlock(rl); 2338 ztest_object_unlock(zd, lr->lr_foid); 2339 return (ENOSPC); 2340 } 2341 2342 VERIFY0(dmu_free_range(os, lr->lr_foid, lr->lr_offset, 2343 lr->lr_length, tx)); 2344 2345 (void) ztest_log_truncate(zd, tx, lr); 2346 2347 dmu_tx_commit(tx); 2348 2349 ztest_range_unlock(rl); 2350 ztest_object_unlock(zd, lr->lr_foid); 2351 2352 return (0); 2353 } 2354 2355 static int 2356 ztest_replay_setattr(void *arg1, void *arg2, boolean_t byteswap) 2357 { 2358 ztest_ds_t *zd = arg1; 2359 lr_setattr_t *lr = arg2; 2360 objset_t *os = zd->zd_os; 2361 dmu_tx_t *tx; 2362 dmu_buf_t *db; 2363 ztest_block_tag_t *bbt; 2364 uint64_t txg, lrtxg, crtxg, dnodesize; 2365 2366 if (byteswap) 2367 byteswap_uint64_array(lr, sizeof (*lr)); 2368 2369 ztest_object_lock(zd, lr->lr_foid, ZTRL_WRITER); 2370 2371 VERIFY0(dmu_bonus_hold(os, lr->lr_foid, FTAG, &db)); 2372 2373 tx = dmu_tx_create(os); 2374 dmu_tx_hold_bonus(tx, lr->lr_foid); 2375 2376 txg = 
ztest_tx_assign(tx, TXG_WAIT, FTAG); 2377 if (txg == 0) { 2378 dmu_buf_rele(db, FTAG); 2379 ztest_object_unlock(zd, lr->lr_foid); 2380 return (ENOSPC); 2381 } 2382 2383 bbt = ztest_bt_bonus(db); 2384 ASSERT3U(bbt->bt_magic, ==, BT_MAGIC); 2385 crtxg = bbt->bt_crtxg; 2386 lrtxg = lr->lr_common.lrc_txg; 2387 dnodesize = bbt->bt_dnodesize; 2388 2389 if (zd->zd_zilog->zl_replay) { 2390 ASSERT3U(lr->lr_size, !=, 0); 2391 ASSERT3U(lr->lr_mode, !=, 0); 2392 ASSERT3U(lrtxg, !=, 0); 2393 } else { 2394 /* 2395 * Randomly change the size and increment the generation. 2396 */ 2397 lr->lr_size = (ztest_random(db->db_size / sizeof (*bbt)) + 1) * 2398 sizeof (*bbt); 2399 lr->lr_mode = bbt->bt_gen + 1; 2400 ASSERT0(lrtxg); 2401 } 2402 2403 /* 2404 * Verify that the current bonus buffer is not newer than our txg. 2405 */ 2406 ztest_bt_verify(bbt, os, lr->lr_foid, dnodesize, -1ULL, lr->lr_mode, 2407 MAX(txg, lrtxg), crtxg); 2408 2409 dmu_buf_will_dirty(db, tx); 2410 2411 ASSERT3U(lr->lr_size, >=, sizeof (*bbt)); 2412 ASSERT3U(lr->lr_size, <=, db->db_size); 2413 VERIFY0(dmu_set_bonus(db, lr->lr_size, tx)); 2414 bbt = ztest_bt_bonus(db); 2415 2416 ztest_bt_generate(bbt, os, lr->lr_foid, dnodesize, -1ULL, lr->lr_mode, 2417 txg, crtxg); 2418 ztest_fill_unused_bonus(db, bbt, lr->lr_foid, os, bbt->bt_gen); 2419 dmu_buf_rele(db, FTAG); 2420 2421 (void) ztest_log_setattr(zd, tx, lr); 2422 2423 dmu_tx_commit(tx); 2424 2425 ztest_object_unlock(zd, lr->lr_foid); 2426 2427 return (0); 2428 } 2429 2430 static zil_replay_func_t *ztest_replay_vector[TX_MAX_TYPE] = { 2431 NULL, /* 0 no such transaction type */ 2432 ztest_replay_create, /* TX_CREATE */ 2433 NULL, /* TX_MKDIR */ 2434 NULL, /* TX_MKXATTR */ 2435 NULL, /* TX_SYMLINK */ 2436 ztest_replay_remove, /* TX_REMOVE */ 2437 NULL, /* TX_RMDIR */ 2438 NULL, /* TX_LINK */ 2439 NULL, /* TX_RENAME */ 2440 ztest_replay_write, /* TX_WRITE */ 2441 ztest_replay_truncate, /* TX_TRUNCATE */ 2442 ztest_replay_setattr, /* TX_SETATTR */ 2443 NULL, /* TX_ACL */ 2444 NULL, /* TX_CREATE_ACL */ 2445 NULL, /* TX_CREATE_ATTR */ 2446 NULL, /* TX_CREATE_ACL_ATTR */ 2447 NULL, /* TX_MKDIR_ACL */ 2448 NULL, /* TX_MKDIR_ATTR */ 2449 NULL, /* TX_MKDIR_ACL_ATTR */ 2450 NULL, /* TX_WRITE2 */ 2451 NULL, /* TX_SETSAXATTR */ 2452 NULL, /* TX_RENAME_EXCHANGE */ 2453 NULL, /* TX_RENAME_WHITEOUT */ 2454 }; 2455 2456 /* 2457 * ZIL get_data callbacks 2458 */ 2459 2460 static void 2461 ztest_get_done(zgd_t *zgd, int error) 2462 { 2463 (void) error; 2464 ztest_ds_t *zd = zgd->zgd_private; 2465 uint64_t object = ((rl_t *)zgd->zgd_lr)->rl_object; 2466 2467 if (zgd->zgd_db) 2468 dmu_buf_rele(zgd->zgd_db, zgd); 2469 2470 ztest_range_unlock((rl_t *)zgd->zgd_lr); 2471 ztest_object_unlock(zd, object); 2472 2473 umem_free(zgd, sizeof (*zgd)); 2474 } 2475 2476 static int 2477 ztest_get_data(void *arg, uint64_t arg2, lr_write_t *lr, char *buf, 2478 struct lwb *lwb, zio_t *zio) 2479 { 2480 (void) arg2; 2481 ztest_ds_t *zd = arg; 2482 objset_t *os = zd->zd_os; 2483 uint64_t object = lr->lr_foid; 2484 uint64_t offset = lr->lr_offset; 2485 uint64_t size = lr->lr_length; 2486 uint64_t txg = lr->lr_common.lrc_txg; 2487 uint64_t crtxg; 2488 dmu_object_info_t doi; 2489 dmu_buf_t *db; 2490 zgd_t *zgd; 2491 int error; 2492 2493 ASSERT3P(lwb, !=, NULL); 2494 ASSERT3U(size, !=, 0); 2495 2496 ztest_object_lock(zd, object, ZTRL_READER); 2497 error = dmu_bonus_hold(os, object, FTAG, &db); 2498 if (error) { 2499 ztest_object_unlock(zd, object); 2500 return (error); 2501 } 2502 2503 crtxg = ztest_bt_bonus(db)->bt_crtxg; 2504 2505 if 
(crtxg == 0 || crtxg > txg) { 2506 dmu_buf_rele(db, FTAG); 2507 ztest_object_unlock(zd, object); 2508 return (ENOENT); 2509 } 2510 2511 dmu_object_info_from_db(db, &doi); 2512 dmu_buf_rele(db, FTAG); 2513 db = NULL; 2514 2515 zgd = umem_zalloc(sizeof (*zgd), UMEM_NOFAIL); 2516 zgd->zgd_lwb = lwb; 2517 zgd->zgd_private = zd; 2518 2519 if (buf != NULL) { /* immediate write */ 2520 zgd->zgd_lr = (struct zfs_locked_range *)ztest_range_lock(zd, 2521 object, offset, size, ZTRL_READER); 2522 2523 error = dmu_read(os, object, offset, size, buf, 2524 DMU_READ_NO_PREFETCH); 2525 ASSERT0(error); 2526 } else { 2527 ASSERT3P(zio, !=, NULL); 2528 size = doi.doi_data_block_size; 2529 if (ISP2(size)) { 2530 offset = P2ALIGN_TYPED(offset, size, uint64_t); 2531 } else { 2532 ASSERT3U(offset, <, size); 2533 offset = 0; 2534 } 2535 2536 zgd->zgd_lr = (struct zfs_locked_range *)ztest_range_lock(zd, 2537 object, offset, size, ZTRL_READER); 2538 2539 error = dmu_buf_hold_noread(os, object, offset, zgd, &db); 2540 2541 if (error == 0) { 2542 blkptr_t *bp = &lr->lr_blkptr; 2543 2544 zgd->zgd_db = db; 2545 zgd->zgd_bp = bp; 2546 2547 ASSERT3U(db->db_offset, ==, offset); 2548 ASSERT3U(db->db_size, ==, size); 2549 2550 error = dmu_sync(zio, lr->lr_common.lrc_txg, 2551 ztest_get_done, zgd); 2552 2553 if (error == 0) 2554 return (0); 2555 } 2556 } 2557 2558 ztest_get_done(zgd, error); 2559 2560 return (error); 2561 } 2562 2563 static void * 2564 ztest_lr_alloc(size_t lrsize, char *name) 2565 { 2566 char *lr; 2567 size_t namesize = name ? strlen(name) + 1 : 0; 2568 2569 lr = umem_zalloc(lrsize + namesize, UMEM_NOFAIL); 2570 2571 if (name) 2572 memcpy(lr + lrsize, name, namesize); 2573 2574 return (lr); 2575 } 2576 2577 static void 2578 ztest_lr_free(void *lr, size_t lrsize, char *name) 2579 { 2580 size_t namesize = name ? strlen(name) + 1 : 0; 2581 2582 umem_free(lr, lrsize + namesize); 2583 } 2584 2585 /* 2586 * Lookup a bunch of objects. Returns the number of objects not found. 
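 * The caller must hold zd_dirobj_lock.  Existing objects are expected
 * to form a prefix of the od array: once one entry is missing, all
 * later entries must be missing as well (no gaps).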
2587 */ 2588 static int 2589 ztest_lookup(ztest_ds_t *zd, ztest_od_t *od, int count) 2590 { 2591 int missing = 0; 2592 int error; 2593 int i; 2594 2595 ASSERT(MUTEX_HELD(&zd->zd_dirobj_lock)); 2596 2597 for (i = 0; i < count; i++, od++) { 2598 od->od_object = 0; 2599 error = zap_lookup(zd->zd_os, od->od_dir, od->od_name, 2600 sizeof (uint64_t), 1, &od->od_object); 2601 if (error) { 2602 ASSERT3S(error, ==, ENOENT); 2603 ASSERT0(od->od_object); 2604 missing++; 2605 } else { 2606 dmu_buf_t *db; 2607 ztest_block_tag_t *bbt; 2608 dmu_object_info_t doi; 2609 2610 ASSERT3U(od->od_object, !=, 0); 2611 ASSERT0(missing); /* there should be no gaps */ 2612 2613 ztest_object_lock(zd, od->od_object, ZTRL_READER); 2614 VERIFY0(dmu_bonus_hold(zd->zd_os, od->od_object, 2615 FTAG, &db)); 2616 dmu_object_info_from_db(db, &doi); 2617 bbt = ztest_bt_bonus(db); 2618 ASSERT3U(bbt->bt_magic, ==, BT_MAGIC); 2619 od->od_type = doi.doi_type; 2620 od->od_blocksize = doi.doi_data_block_size; 2621 od->od_gen = bbt->bt_gen; 2622 dmu_buf_rele(db, FTAG); 2623 ztest_object_unlock(zd, od->od_object); 2624 } 2625 } 2626 2627 return (missing); 2628 } 2629 2630 static int 2631 ztest_create(ztest_ds_t *zd, ztest_od_t *od, int count) 2632 { 2633 int missing = 0; 2634 int i; 2635 2636 ASSERT(MUTEX_HELD(&zd->zd_dirobj_lock)); 2637 2638 for (i = 0; i < count; i++, od++) { 2639 if (missing) { 2640 od->od_object = 0; 2641 missing++; 2642 continue; 2643 } 2644 2645 lr_create_t *lr = ztest_lr_alloc(sizeof (*lr), od->od_name); 2646 2647 lr->lr_doid = od->od_dir; 2648 lr->lr_foid = 0; /* 0 to allocate, > 0 to claim */ 2649 lr->lrz_type = od->od_crtype; 2650 lr->lrz_blocksize = od->od_crblocksize; 2651 lr->lrz_ibshift = ztest_random_ibshift(); 2652 lr->lrz_bonustype = DMU_OT_UINT64_OTHER; 2653 lr->lrz_dnodesize = od->od_crdnodesize; 2654 lr->lr_gen = od->od_crgen; 2655 lr->lr_crtime[0] = time(NULL); 2656 2657 if (ztest_replay_create(zd, lr, B_FALSE) != 0) { 2658 ASSERT0(missing); 2659 od->od_object = 0; 2660 missing++; 2661 } else { 2662 od->od_object = lr->lr_foid; 2663 od->od_type = od->od_crtype; 2664 od->od_blocksize = od->od_crblocksize; 2665 od->od_gen = od->od_crgen; 2666 ASSERT3U(od->od_object, !=, 0); 2667 } 2668 2669 ztest_lr_free(lr, sizeof (*lr), od->od_name); 2670 } 2671 2672 return (missing); 2673 } 2674 2675 static int 2676 ztest_remove(ztest_ds_t *zd, ztest_od_t *od, int count) 2677 { 2678 int missing = 0; 2679 int error; 2680 int i; 2681 2682 ASSERT(MUTEX_HELD(&zd->zd_dirobj_lock)); 2683 2684 od += count - 1; 2685 2686 for (i = count - 1; i >= 0; i--, od--) { 2687 if (missing) { 2688 missing++; 2689 continue; 2690 } 2691 2692 /* 2693 * No object was found. 
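 * Nothing was ever created for this template, so there is
 * nothing to remove here.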
2694 */ 2695 if (od->od_object == 0) 2696 continue; 2697 2698 lr_remove_t *lr = ztest_lr_alloc(sizeof (*lr), od->od_name); 2699 2700 lr->lr_doid = od->od_dir; 2701 2702 if ((error = ztest_replay_remove(zd, lr, B_FALSE)) != 0) { 2703 ASSERT3U(error, ==, ENOSPC); 2704 missing++; 2705 } else { 2706 od->od_object = 0; 2707 } 2708 ztest_lr_free(lr, sizeof (*lr), od->od_name); 2709 } 2710 2711 return (missing); 2712 } 2713 2714 static int 2715 ztest_write(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size, 2716 const void *data) 2717 { 2718 lr_write_t *lr; 2719 int error; 2720 2721 lr = ztest_lr_alloc(sizeof (*lr) + size, NULL); 2722 2723 lr->lr_foid = object; 2724 lr->lr_offset = offset; 2725 lr->lr_length = size; 2726 lr->lr_blkoff = 0; 2727 BP_ZERO(&lr->lr_blkptr); 2728 2729 memcpy(lr + 1, data, size); 2730 2731 error = ztest_replay_write(zd, lr, B_FALSE); 2732 2733 ztest_lr_free(lr, sizeof (*lr) + size, NULL); 2734 2735 return (error); 2736 } 2737 2738 static int 2739 ztest_truncate(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size) 2740 { 2741 lr_truncate_t *lr; 2742 int error; 2743 2744 lr = ztest_lr_alloc(sizeof (*lr), NULL); 2745 2746 lr->lr_foid = object; 2747 lr->lr_offset = offset; 2748 lr->lr_length = size; 2749 2750 error = ztest_replay_truncate(zd, lr, B_FALSE); 2751 2752 ztest_lr_free(lr, sizeof (*lr), NULL); 2753 2754 return (error); 2755 } 2756 2757 static int 2758 ztest_setattr(ztest_ds_t *zd, uint64_t object) 2759 { 2760 lr_setattr_t *lr; 2761 int error; 2762 2763 lr = ztest_lr_alloc(sizeof (*lr), NULL); 2764 2765 lr->lr_foid = object; 2766 lr->lr_size = 0; 2767 lr->lr_mode = 0; 2768 2769 error = ztest_replay_setattr(zd, lr, B_FALSE); 2770 2771 ztest_lr_free(lr, sizeof (*lr), NULL); 2772 2773 return (error); 2774 } 2775 2776 static void 2777 ztest_prealloc(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size) 2778 { 2779 objset_t *os = zd->zd_os; 2780 dmu_tx_t *tx; 2781 uint64_t txg; 2782 rl_t *rl; 2783 2784 txg_wait_synced(dmu_objset_pool(os), 0); 2785 2786 ztest_object_lock(zd, object, ZTRL_READER); 2787 rl = ztest_range_lock(zd, object, offset, size, ZTRL_WRITER); 2788 2789 tx = dmu_tx_create(os); 2790 2791 dmu_tx_hold_write(tx, object, offset, size); 2792 2793 txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); 2794 2795 if (txg != 0) { 2796 dmu_prealloc(os, object, offset, size, tx); 2797 dmu_tx_commit(tx); 2798 txg_wait_synced(dmu_objset_pool(os), txg); 2799 } else { 2800 (void) dmu_free_long_range(os, object, offset, size); 2801 } 2802 2803 ztest_range_unlock(rl); 2804 ztest_object_unlock(zd, object); 2805 } 2806 2807 static void 2808 ztest_io(ztest_ds_t *zd, uint64_t object, uint64_t offset) 2809 { 2810 int err; 2811 ztest_block_tag_t wbt; 2812 dmu_object_info_t doi; 2813 enum ztest_io_type io_type; 2814 uint64_t blocksize; 2815 void *data; 2816 2817 VERIFY0(dmu_object_info(zd->zd_os, object, &doi)); 2818 blocksize = doi.doi_data_block_size; 2819 data = umem_alloc(blocksize, UMEM_NOFAIL); 2820 2821 /* 2822 * Pick an i/o type at random, biased toward writing block tags. 
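 * (The ztest_random(2) check below forces ZTEST_IO_WRITE_TAG about
 * half of the time, regardless of which type was drawn first.)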
2823 */ 2824 io_type = ztest_random(ZTEST_IO_TYPES); 2825 if (ztest_random(2) == 0) 2826 io_type = ZTEST_IO_WRITE_TAG; 2827 2828 (void) pthread_rwlock_rdlock(&zd->zd_zilog_lock); 2829 2830 switch (io_type) { 2831 2832 case ZTEST_IO_WRITE_TAG: 2833 ztest_bt_generate(&wbt, zd->zd_os, object, doi.doi_dnodesize, 2834 offset, 0, 0, 0); 2835 (void) ztest_write(zd, object, offset, sizeof (wbt), &wbt); 2836 break; 2837 2838 case ZTEST_IO_WRITE_PATTERN: 2839 (void) memset(data, 'a' + (object + offset) % 5, blocksize); 2840 if (ztest_random(2) == 0) { 2841 /* 2842 * Induce fletcher2 collisions to ensure that 2843 * zio_ddt_collision() detects and resolves them 2844 * when using fletcher2-verify for deduplication. 2845 */ 2846 ((uint64_t *)data)[0] ^= 1ULL << 63; 2847 ((uint64_t *)data)[4] ^= 1ULL << 63; 2848 } 2849 (void) ztest_write(zd, object, offset, blocksize, data); 2850 break; 2851 2852 case ZTEST_IO_WRITE_ZEROES: 2853 memset(data, 0, blocksize); 2854 (void) ztest_write(zd, object, offset, blocksize, data); 2855 break; 2856 2857 case ZTEST_IO_TRUNCATE: 2858 (void) ztest_truncate(zd, object, offset, blocksize); 2859 break; 2860 2861 case ZTEST_IO_SETATTR: 2862 (void) ztest_setattr(zd, object); 2863 break; 2864 default: 2865 break; 2866 2867 case ZTEST_IO_REWRITE: 2868 (void) pthread_rwlock_rdlock(&ztest_name_lock); 2869 err = ztest_dsl_prop_set_uint64(zd->zd_name, 2870 ZFS_PROP_CHECKSUM, spa_dedup_checksum(ztest_spa), 2871 B_FALSE); 2872 ASSERT(err == 0 || err == ENOSPC); 2873 err = ztest_dsl_prop_set_uint64(zd->zd_name, 2874 ZFS_PROP_COMPRESSION, 2875 ztest_random_dsl_prop(ZFS_PROP_COMPRESSION), 2876 B_FALSE); 2877 ASSERT(err == 0 || err == ENOSPC); 2878 (void) pthread_rwlock_unlock(&ztest_name_lock); 2879 2880 VERIFY0(dmu_read(zd->zd_os, object, offset, blocksize, data, 2881 DMU_READ_NO_PREFETCH)); 2882 2883 (void) ztest_write(zd, object, offset, blocksize, data); 2884 break; 2885 } 2886 2887 (void) pthread_rwlock_unlock(&zd->zd_zilog_lock); 2888 2889 umem_free(data, blocksize); 2890 } 2891 2892 /* 2893 * Initialize an object description template. 2894 */ 2895 static void 2896 ztest_od_init(ztest_od_t *od, uint64_t id, const char *tag, uint64_t index, 2897 dmu_object_type_t type, uint64_t blocksize, uint64_t dnodesize, 2898 uint64_t gen) 2899 { 2900 od->od_dir = ZTEST_DIROBJ; 2901 od->od_object = 0; 2902 2903 od->od_crtype = type; 2904 od->od_crblocksize = blocksize ? blocksize : ztest_random_blocksize(); 2905 od->od_crdnodesize = dnodesize ? dnodesize : ztest_random_dnodesize(); 2906 od->od_crgen = gen; 2907 2908 od->od_type = DMU_OT_NONE; 2909 od->od_blocksize = 0; 2910 od->od_gen = 0; 2911 2912 (void) snprintf(od->od_name, sizeof (od->od_name), 2913 "%s(%"PRId64")[%"PRIu64"]", 2914 tag, id, index); 2915 } 2916 2917 /* 2918 * Lookup or create the objects for a test using the od template. 2919 * If the objects do not all exist, or if 'remove' is specified, 2920 * remove any existing objects and create new ones. Otherwise, 2921 * use the existing objects. 
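 *
 * A minimal, illustrative sketch of a typical caller (the template
 * values here are arbitrary):
 *
 *	ztest_od_t od;
 *
 *	ztest_od_init(&od, id, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0, 0);
 *	if (ztest_object_init(zd, &od, sizeof (od), B_FALSE) != 0)
 *		return;
 *
 * On success, od.od_object names an object of the requested type that
 * the test can then read and write.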
2922 */ 2923 static int 2924 ztest_object_init(ztest_ds_t *zd, ztest_od_t *od, size_t size, boolean_t remove) 2925 { 2926 int count = size / sizeof (*od); 2927 int rv = 0; 2928 2929 mutex_enter(&zd->zd_dirobj_lock); 2930 if ((ztest_lookup(zd, od, count) != 0 || remove) && 2931 (ztest_remove(zd, od, count) != 0 || 2932 ztest_create(zd, od, count) != 0)) 2933 rv = -1; 2934 zd->zd_od = od; 2935 mutex_exit(&zd->zd_dirobj_lock); 2936 2937 return (rv); 2938 } 2939 2940 void 2941 ztest_zil_commit(ztest_ds_t *zd, uint64_t id) 2942 { 2943 (void) id; 2944 zilog_t *zilog = zd->zd_zilog; 2945 2946 (void) pthread_rwlock_rdlock(&zd->zd_zilog_lock); 2947 2948 zil_commit(zilog, ztest_random(ZTEST_OBJECTS)); 2949 2950 /* 2951 * Remember the committed values in zd, which is in parent/child 2952 * shared memory. If we die, the next iteration of ztest_run() 2953 * will verify that the log really does contain this record. 2954 */ 2955 mutex_enter(&zilog->zl_lock); 2956 ASSERT3P(zd->zd_shared, !=, NULL); 2957 ASSERT3U(zd->zd_shared->zd_seq, <=, zilog->zl_commit_lr_seq); 2958 zd->zd_shared->zd_seq = zilog->zl_commit_lr_seq; 2959 mutex_exit(&zilog->zl_lock); 2960 2961 (void) pthread_rwlock_unlock(&zd->zd_zilog_lock); 2962 } 2963 2964 /* 2965 * This function is designed to simulate the operations that occur during a 2966 * mount/unmount operation. We hold the dataset across these operations in an 2967 * attempt to expose any implicit assumptions about ZIL management. 2968 */ 2969 void 2970 ztest_zil_remount(ztest_ds_t *zd, uint64_t id) 2971 { 2972 (void) id; 2973 objset_t *os = zd->zd_os; 2974 2975 /* 2976 * We hold the ztest_vdev_lock so we don't cause problems with 2977 * other threads that wish to remove a log device, such as 2978 * ztest_device_removal(). 2979 */ 2980 mutex_enter(&ztest_vdev_lock); 2981 2982 /* 2983 * We grab the zd_dirobj_lock to ensure that no other thread is 2984 * updating the zil (i.e. adding in-memory log records) and the 2985 * zd_zilog_lock to block any I/O. 2986 */ 2987 mutex_enter(&zd->zd_dirobj_lock); 2988 (void) pthread_rwlock_wrlock(&zd->zd_zilog_lock); 2989 2990 /* zfsvfs_teardown() */ 2991 zil_close(zd->zd_zilog); 2992 2993 /* zfsvfs_setup() */ 2994 VERIFY3P(zil_open(os, ztest_get_data, NULL), ==, zd->zd_zilog); 2995 zil_replay(os, zd, ztest_replay_vector); 2996 2997 (void) pthread_rwlock_unlock(&zd->zd_zilog_lock); 2998 mutex_exit(&zd->zd_dirobj_lock); 2999 mutex_exit(&ztest_vdev_lock); 3000 } 3001 3002 /* 3003 * Verify that we can't destroy an active pool, create an existing pool, 3004 * or create a pool with a bad vdev spec. 3005 */ 3006 void 3007 ztest_spa_create_destroy(ztest_ds_t *zd, uint64_t id) 3008 { 3009 (void) zd, (void) id; 3010 ztest_shared_opts_t *zo = &ztest_opts; 3011 spa_t *spa; 3012 nvlist_t *nvroot; 3013 3014 if (zo->zo_mmp_test) 3015 return; 3016 3017 /* 3018 * Attempt to create using a bad file. 3019 */ 3020 nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, NULL, 0, 0, 1); 3021 VERIFY3U(ENOENT, ==, 3022 spa_create("ztest_bad_file", nvroot, NULL, NULL, NULL)); 3023 fnvlist_free(nvroot); 3024 3025 /* 3026 * Attempt to create using a bad mirror. 3027 */ 3028 nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, NULL, 0, 2, 1); 3029 VERIFY3U(ENOENT, ==, 3030 spa_create("ztest_bad_mirror", nvroot, NULL, NULL, NULL)); 3031 fnvlist_free(nvroot); 3032 3033 /* 3034 * Attempt to create an existing pool. It shouldn't matter 3035 * what's in the nvroot; we should fail with EEXIST. 
3036 */ 3037 (void) pthread_rwlock_rdlock(&ztest_name_lock); 3038 nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, NULL, 0, 0, 1); 3039 VERIFY3U(EEXIST, ==, 3040 spa_create(zo->zo_pool, nvroot, NULL, NULL, NULL)); 3041 fnvlist_free(nvroot); 3042 3043 /* 3044 * We open a reference to the spa and then we try to export it 3045 * expecting one of the following errors: 3046 * 3047 * EBUSY 3048 * Because of the reference we just opened. 3049 * 3050 * ZFS_ERR_EXPORT_IN_PROGRESS 3051 * For the case that there is another ztest thread doing 3052 * an export concurrently. 3053 */ 3054 VERIFY0(spa_open(zo->zo_pool, &spa, FTAG)); 3055 int error = spa_destroy(zo->zo_pool); 3056 if (error != EBUSY && error != ZFS_ERR_EXPORT_IN_PROGRESS) { 3057 fatal(B_FALSE, "spa_destroy(%s) returned unexpected value %d", 3058 spa->spa_name, error); 3059 } 3060 spa_close(spa, FTAG); 3061 3062 (void) pthread_rwlock_unlock(&ztest_name_lock); 3063 } 3064 3065 /* 3066 * Start and then stop the MMP threads to ensure the startup and shutdown code 3067 * works properly. Actual protection and property-related code tested via ZTS. 3068 */ 3069 void 3070 ztest_mmp_enable_disable(ztest_ds_t *zd, uint64_t id) 3071 { 3072 (void) zd, (void) id; 3073 ztest_shared_opts_t *zo = &ztest_opts; 3074 spa_t *spa = ztest_spa; 3075 3076 if (zo->zo_mmp_test) 3077 return; 3078 3079 /* 3080 * Since enabling MMP involves setting a property, it could not be done 3081 * while the pool is suspended. 3082 */ 3083 if (spa_suspended(spa)) 3084 return; 3085 3086 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 3087 mutex_enter(&spa->spa_props_lock); 3088 3089 zfs_multihost_fail_intervals = 0; 3090 3091 if (!spa_multihost(spa)) { 3092 spa->spa_multihost = B_TRUE; 3093 mmp_thread_start(spa); 3094 } 3095 3096 mutex_exit(&spa->spa_props_lock); 3097 spa_config_exit(spa, SCL_CONFIG, FTAG); 3098 3099 txg_wait_synced(spa_get_dsl(spa), 0); 3100 mmp_signal_all_threads(); 3101 txg_wait_synced(spa_get_dsl(spa), 0); 3102 3103 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 3104 mutex_enter(&spa->spa_props_lock); 3105 3106 if (spa_multihost(spa)) { 3107 mmp_thread_stop(spa); 3108 spa->spa_multihost = B_FALSE; 3109 } 3110 3111 mutex_exit(&spa->spa_props_lock); 3112 spa_config_exit(spa, SCL_CONFIG, FTAG); 3113 } 3114 3115 static int 3116 ztest_get_raidz_children(spa_t *spa) 3117 { 3118 (void) spa; 3119 vdev_t *raidvd; 3120 3121 ASSERT(MUTEX_HELD(&ztest_vdev_lock)); 3122 3123 if (ztest_opts.zo_raid_do_expand) { 3124 raidvd = ztest_spa->spa_root_vdev->vdev_child[0]; 3125 3126 ASSERT(raidvd->vdev_ops == &vdev_raidz_ops); 3127 3128 return (raidvd->vdev_children); 3129 } 3130 3131 return (ztest_opts.zo_raid_children); 3132 } 3133 3134 void 3135 ztest_spa_upgrade(ztest_ds_t *zd, uint64_t id) 3136 { 3137 (void) zd, (void) id; 3138 spa_t *spa; 3139 uint64_t initial_version = SPA_VERSION_INITIAL; 3140 uint64_t raidz_children, version, newversion; 3141 nvlist_t *nvroot, *props; 3142 char *name; 3143 3144 if (ztest_opts.zo_mmp_test) 3145 return; 3146 3147 /* dRAID added after feature flags, skip upgrade test. */ 3148 if (strcmp(ztest_opts.zo_raid_type, VDEV_TYPE_DRAID) == 0) 3149 return; 3150 3151 mutex_enter(&ztest_vdev_lock); 3152 name = kmem_asprintf("%s_upgrade", ztest_opts.zo_pool); 3153 3154 /* 3155 * Clean up from previous runs. 
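 * A leftover "<pool>_upgrade" pool can exist if an earlier ztest child
 * was killed before it finished this test.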
3156 */ 3157 (void) spa_destroy(name); 3158 3159 raidz_children = ztest_get_raidz_children(ztest_spa); 3160 3161 nvroot = make_vdev_root(NULL, NULL, name, ztest_opts.zo_vdev_size, 0, 3162 NULL, raidz_children, ztest_opts.zo_mirrors, 1); 3163 3164 /* 3165 * If we're configuring a RAIDZ device then make sure that the 3166 * initial version is capable of supporting that feature. 3167 */ 3168 switch (ztest_opts.zo_raid_parity) { 3169 case 0: 3170 case 1: 3171 initial_version = SPA_VERSION_INITIAL; 3172 break; 3173 case 2: 3174 initial_version = SPA_VERSION_RAIDZ2; 3175 break; 3176 case 3: 3177 initial_version = SPA_VERSION_RAIDZ3; 3178 break; 3179 } 3180 3181 /* 3182 * Create a pool with a spa version that can be upgraded. Pick 3183 * a value between initial_version and SPA_VERSION_BEFORE_FEATURES. 3184 */ 3185 do { 3186 version = ztest_random_spa_version(initial_version); 3187 } while (version > SPA_VERSION_BEFORE_FEATURES); 3188 3189 props = fnvlist_alloc(); 3190 fnvlist_add_uint64(props, 3191 zpool_prop_to_name(ZPOOL_PROP_VERSION), version); 3192 VERIFY0(spa_create(name, nvroot, props, NULL, NULL)); 3193 fnvlist_free(nvroot); 3194 fnvlist_free(props); 3195 3196 VERIFY0(spa_open(name, &spa, FTAG)); 3197 VERIFY3U(spa_version(spa), ==, version); 3198 newversion = ztest_random_spa_version(version + 1); 3199 3200 if (ztest_opts.zo_verbose >= 4) { 3201 (void) printf("upgrading spa version from " 3202 "%"PRIu64" to %"PRIu64"\n", 3203 version, newversion); 3204 } 3205 3206 spa_upgrade(spa, newversion); 3207 VERIFY3U(spa_version(spa), >, version); 3208 VERIFY3U(spa_version(spa), ==, fnvlist_lookup_uint64(spa->spa_config, 3209 zpool_prop_to_name(ZPOOL_PROP_VERSION))); 3210 spa_close(spa, FTAG); 3211 3212 kmem_strfree(name); 3213 mutex_exit(&ztest_vdev_lock); 3214 } 3215 3216 static void 3217 ztest_spa_checkpoint(spa_t *spa) 3218 { 3219 ASSERT(MUTEX_HELD(&ztest_checkpoint_lock)); 3220 3221 int error = spa_checkpoint(spa->spa_name); 3222 3223 switch (error) { 3224 case 0: 3225 case ZFS_ERR_DEVRM_IN_PROGRESS: 3226 case ZFS_ERR_DISCARDING_CHECKPOINT: 3227 case ZFS_ERR_CHECKPOINT_EXISTS: 3228 case ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS: 3229 break; 3230 case ENOSPC: 3231 ztest_record_enospc(FTAG); 3232 break; 3233 default: 3234 fatal(B_FALSE, "spa_checkpoint(%s) = %d", spa->spa_name, error); 3235 } 3236 } 3237 3238 static void 3239 ztest_spa_discard_checkpoint(spa_t *spa) 3240 { 3241 ASSERT(MUTEX_HELD(&ztest_checkpoint_lock)); 3242 3243 int error = spa_checkpoint_discard(spa->spa_name); 3244 3245 switch (error) { 3246 case 0: 3247 case ZFS_ERR_DISCARDING_CHECKPOINT: 3248 case ZFS_ERR_NO_CHECKPOINT: 3249 break; 3250 default: 3251 fatal(B_FALSE, "spa_discard_checkpoint(%s) = %d", 3252 spa->spa_name, error); 3253 } 3254 3255 } 3256 3257 void 3258 ztest_spa_checkpoint_create_discard(ztest_ds_t *zd, uint64_t id) 3259 { 3260 (void) zd, (void) id; 3261 spa_t *spa = ztest_spa; 3262 3263 mutex_enter(&ztest_checkpoint_lock); 3264 if (ztest_random(2) == 0) { 3265 ztest_spa_checkpoint(spa); 3266 } else { 3267 ztest_spa_discard_checkpoint(spa); 3268 } 3269 mutex_exit(&ztest_checkpoint_lock); 3270 } 3271 3272 3273 static vdev_t * 3274 vdev_lookup_by_path(vdev_t *vd, const char *path) 3275 { 3276 vdev_t *mvd; 3277 int c; 3278 3279 if (vd->vdev_path != NULL && strcmp(path, vd->vdev_path) == 0) 3280 return (vd); 3281 3282 for (c = 0; c < vd->vdev_children; c++) 3283 if ((mvd = vdev_lookup_by_path(vd->vdev_child[c], path)) != 3284 NULL) 3285 return (mvd); 3286 3287 return (NULL); 3288 } 3289 3290 static int 3291 
spa_num_top_vdevs(spa_t *spa) 3292 { 3293 vdev_t *rvd = spa->spa_root_vdev; 3294 ASSERT3U(spa_config_held(spa, SCL_VDEV, RW_READER), ==, SCL_VDEV); 3295 return (rvd->vdev_children); 3296 } 3297 3298 /* 3299 * Verify that vdev_add() works as expected. 3300 */ 3301 void 3302 ztest_vdev_add_remove(ztest_ds_t *zd, uint64_t id) 3303 { 3304 (void) zd, (void) id; 3305 ztest_shared_t *zs = ztest_shared; 3306 spa_t *spa = ztest_spa; 3307 uint64_t leaves; 3308 uint64_t guid; 3309 uint64_t raidz_children; 3310 3311 nvlist_t *nvroot; 3312 int error; 3313 3314 if (ztest_opts.zo_mmp_test) 3315 return; 3316 3317 mutex_enter(&ztest_vdev_lock); 3318 raidz_children = ztest_get_raidz_children(spa); 3319 leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * raidz_children; 3320 3321 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 3322 3323 ztest_shared->zs_vdev_next_leaf = spa_num_top_vdevs(spa) * leaves; 3324 3325 /* 3326 * If we have slogs then remove them 1/4 of the time. 3327 */ 3328 if (spa_has_slogs(spa) && ztest_random(4) == 0) { 3329 metaslab_group_t *mg; 3330 3331 /* 3332 * find the first real slog in log allocation class 3333 */ 3334 mg = spa_log_class(spa)->mc_allocator[0].mca_rotor; 3335 while (!mg->mg_vd->vdev_islog) 3336 mg = mg->mg_next; 3337 3338 guid = mg->mg_vd->vdev_guid; 3339 3340 spa_config_exit(spa, SCL_VDEV, FTAG); 3341 3342 /* 3343 * We have to grab the zs_name_lock as writer to 3344 * prevent a race between removing a slog (dmu_objset_find) 3345 * and destroying a dataset. Removing the slog will 3346 * grab a reference on the dataset which may cause 3347 * dsl_destroy_head() to fail with EBUSY thus 3348 * leaving the dataset in an inconsistent state. 3349 */ 3350 pthread_rwlock_wrlock(&ztest_name_lock); 3351 error = spa_vdev_remove(spa, guid, B_FALSE); 3352 pthread_rwlock_unlock(&ztest_name_lock); 3353 3354 switch (error) { 3355 case 0: 3356 case EEXIST: /* Generic zil_reset() error */ 3357 case EBUSY: /* Replay required */ 3358 case EACCES: /* Crypto key not loaded */ 3359 case ZFS_ERR_CHECKPOINT_EXISTS: 3360 case ZFS_ERR_DISCARDING_CHECKPOINT: 3361 break; 3362 default: 3363 fatal(B_FALSE, "spa_vdev_remove() = %d", error); 3364 } 3365 } else { 3366 spa_config_exit(spa, SCL_VDEV, FTAG); 3367 3368 /* 3369 * Make 1/4 of the devices be log devices 3370 */ 3371 nvroot = make_vdev_root(NULL, NULL, NULL, 3372 ztest_opts.zo_vdev_size, 0, (ztest_random(4) == 0) ? 3373 "log" : NULL, raidz_children, zs->zs_mirrors, 3374 1); 3375 3376 error = spa_vdev_add(spa, nvroot, B_FALSE); 3377 fnvlist_free(nvroot); 3378 3379 switch (error) { 3380 case 0: 3381 break; 3382 case ENOSPC: 3383 ztest_record_enospc("spa_vdev_add"); 3384 break; 3385 default: 3386 fatal(B_FALSE, "spa_vdev_add() = %d", error); 3387 } 3388 } 3389 3390 mutex_exit(&ztest_vdev_lock); 3391 } 3392 3393 void 3394 ztest_vdev_class_add(ztest_ds_t *zd, uint64_t id) 3395 { 3396 (void) zd, (void) id; 3397 ztest_shared_t *zs = ztest_shared; 3398 spa_t *spa = ztest_spa; 3399 uint64_t leaves; 3400 nvlist_t *nvroot; 3401 uint64_t raidz_children; 3402 const char *class = (ztest_random(2) == 0) ? 
3403 VDEV_ALLOC_BIAS_SPECIAL : VDEV_ALLOC_BIAS_DEDUP; 3404 int error; 3405 3406 /* 3407 * By default add a special vdev 50% of the time 3408 */ 3409 if ((ztest_opts.zo_special_vdevs == ZTEST_VDEV_CLASS_OFF) || 3410 (ztest_opts.zo_special_vdevs == ZTEST_VDEV_CLASS_RND && 3411 ztest_random(2) == 0)) { 3412 return; 3413 } 3414 3415 mutex_enter(&ztest_vdev_lock); 3416 3417 /* Only test with mirrors */ 3418 if (zs->zs_mirrors < 2) { 3419 mutex_exit(&ztest_vdev_lock); 3420 return; 3421 } 3422 3423 /* requires feature@allocation_classes */ 3424 if (!spa_feature_is_enabled(spa, SPA_FEATURE_ALLOCATION_CLASSES)) { 3425 mutex_exit(&ztest_vdev_lock); 3426 return; 3427 } 3428 3429 raidz_children = ztest_get_raidz_children(spa); 3430 leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * raidz_children; 3431 3432 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 3433 ztest_shared->zs_vdev_next_leaf = spa_num_top_vdevs(spa) * leaves; 3434 spa_config_exit(spa, SCL_VDEV, FTAG); 3435 3436 nvroot = make_vdev_root(NULL, NULL, NULL, ztest_opts.zo_vdev_size, 0, 3437 class, raidz_children, zs->zs_mirrors, 1); 3438 3439 error = spa_vdev_add(spa, nvroot, B_FALSE); 3440 fnvlist_free(nvroot); 3441 3442 if (error == ENOSPC) 3443 ztest_record_enospc("spa_vdev_add"); 3444 else if (error != 0) 3445 fatal(B_FALSE, "spa_vdev_add() = %d", error); 3446 3447 /* 3448 * 50% of the time allow small blocks in the special class 3449 */ 3450 if (error == 0 && 3451 spa_special_class(spa)->mc_groups == 1 && ztest_random(2) == 0) { 3452 if (ztest_opts.zo_verbose >= 3) 3453 (void) printf("Enabling special VDEV small blocks\n"); 3454 error = ztest_dsl_prop_set_uint64(zd->zd_name, 3455 ZFS_PROP_SPECIAL_SMALL_BLOCKS, 32768, B_FALSE); 3456 ASSERT(error == 0 || error == ENOSPC); 3457 } 3458 3459 mutex_exit(&ztest_vdev_lock); 3460 3461 if (ztest_opts.zo_verbose >= 3) { 3462 metaslab_class_t *mc; 3463 3464 if (strcmp(class, VDEV_ALLOC_BIAS_SPECIAL) == 0) 3465 mc = spa_special_class(spa); 3466 else 3467 mc = spa_dedup_class(spa); 3468 (void) printf("Added a %s mirrored vdev (of %d)\n", 3469 class, (int)mc->mc_groups); 3470 } 3471 } 3472 3473 /* 3474 * Verify that adding/removing aux devices (l2arc, hot spare) works as expected. 3475 */ 3476 void 3477 ztest_vdev_aux_add_remove(ztest_ds_t *zd, uint64_t id) 3478 { 3479 (void) zd, (void) id; 3480 ztest_shared_t *zs = ztest_shared; 3481 spa_t *spa = ztest_spa; 3482 vdev_t *rvd = spa->spa_root_vdev; 3483 spa_aux_vdev_t *sav; 3484 const char *aux; 3485 char *path; 3486 uint64_t guid = 0; 3487 int error, ignore_err = 0; 3488 3489 if (ztest_opts.zo_mmp_test) 3490 return; 3491 3492 path = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); 3493 3494 if (ztest_random(2) == 0) { 3495 sav = &spa->spa_spares; 3496 aux = ZPOOL_CONFIG_SPARES; 3497 } else { 3498 sav = &spa->spa_l2cache; 3499 aux = ZPOOL_CONFIG_L2CACHE; 3500 } 3501 3502 mutex_enter(&ztest_vdev_lock); 3503 3504 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 3505 3506 if (sav->sav_count != 0 && ztest_random(4) == 0) { 3507 /* 3508 * Pick a random device to remove. 3509 */ 3510 vdev_t *svd = sav->sav_vdevs[ztest_random(sav->sav_count)]; 3511 3512 /* dRAID spares cannot be removed; try anyways to see ENOTSUP */ 3513 if (strstr(svd->vdev_path, VDEV_TYPE_DRAID) != NULL) 3514 ignore_err = ENOTSUP; 3515 3516 guid = svd->vdev_guid; 3517 } else { 3518 /* 3519 * Find an unused device we can add. 
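 * Probe aux paths with increasing zs_vdev_aux values until we find one
 * that is neither already in this aux list nor present anywhere in the
 * main vdev tree.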
3520 */ 3521 zs->zs_vdev_aux = 0; 3522 for (;;) { 3523 int c; 3524 (void) snprintf(path, MAXPATHLEN, ztest_aux_template, 3525 ztest_opts.zo_dir, ztest_opts.zo_pool, aux, 3526 zs->zs_vdev_aux); 3527 for (c = 0; c < sav->sav_count; c++) 3528 if (strcmp(sav->sav_vdevs[c]->vdev_path, 3529 path) == 0) 3530 break; 3531 if (c == sav->sav_count && 3532 vdev_lookup_by_path(rvd, path) == NULL) 3533 break; 3534 zs->zs_vdev_aux++; 3535 } 3536 } 3537 3538 spa_config_exit(spa, SCL_VDEV, FTAG); 3539 3540 if (guid == 0) { 3541 /* 3542 * Add a new device. 3543 */ 3544 nvlist_t *nvroot = make_vdev_root(NULL, aux, NULL, 3545 (ztest_opts.zo_vdev_size * 5) / 4, 0, NULL, 0, 0, 1); 3546 error = spa_vdev_add(spa, nvroot, B_FALSE); 3547 3548 switch (error) { 3549 case 0: 3550 break; 3551 default: 3552 fatal(B_FALSE, "spa_vdev_add(%p) = %d", nvroot, error); 3553 } 3554 fnvlist_free(nvroot); 3555 } else { 3556 /* 3557 * Remove an existing device. Sometimes, dirty its 3558 * vdev state first to make sure we handle removal 3559 * of devices that have pending state changes. 3560 */ 3561 if (ztest_random(2) == 0) 3562 (void) vdev_online(spa, guid, 0, NULL); 3563 3564 error = spa_vdev_remove(spa, guid, B_FALSE); 3565 3566 switch (error) { 3567 case 0: 3568 case EBUSY: 3569 case ZFS_ERR_CHECKPOINT_EXISTS: 3570 case ZFS_ERR_DISCARDING_CHECKPOINT: 3571 break; 3572 default: 3573 if (error != ignore_err) 3574 fatal(B_FALSE, 3575 "spa_vdev_remove(%"PRIu64") = %d", 3576 guid, error); 3577 } 3578 } 3579 3580 mutex_exit(&ztest_vdev_lock); 3581 3582 umem_free(path, MAXPATHLEN); 3583 } 3584 3585 /* 3586 * split a pool if it has mirror tlvdevs 3587 */ 3588 void 3589 ztest_split_pool(ztest_ds_t *zd, uint64_t id) 3590 { 3591 (void) zd, (void) id; 3592 ztest_shared_t *zs = ztest_shared; 3593 spa_t *spa = ztest_spa; 3594 vdev_t *rvd = spa->spa_root_vdev; 3595 nvlist_t *tree, **child, *config, *split, **schild; 3596 uint_t c, children, schildren = 0, lastlogid = 0; 3597 int error = 0; 3598 3599 if (ztest_opts.zo_mmp_test) 3600 return; 3601 3602 mutex_enter(&ztest_vdev_lock); 3603 3604 /* ensure we have a usable config; mirrors of raidz aren't supported */ 3605 if (zs->zs_mirrors < 3 || ztest_opts.zo_raid_children > 1) { 3606 mutex_exit(&ztest_vdev_lock); 3607 return; 3608 } 3609 3610 /* clean up the old pool, if any */ 3611 (void) spa_destroy("splitp"); 3612 3613 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 3614 3615 /* generate a config from the existing config */ 3616 mutex_enter(&spa->spa_props_lock); 3617 tree = fnvlist_lookup_nvlist(spa->spa_config, ZPOOL_CONFIG_VDEV_TREE); 3618 mutex_exit(&spa->spa_props_lock); 3619 3620 VERIFY0(nvlist_lookup_nvlist_array(tree, ZPOOL_CONFIG_CHILDREN, 3621 &child, &children)); 3622 3623 schild = umem_alloc(rvd->vdev_children * sizeof (nvlist_t *), 3624 UMEM_NOFAIL); 3625 for (c = 0; c < children; c++) { 3626 vdev_t *tvd = rvd->vdev_child[c]; 3627 nvlist_t **mchild; 3628 uint_t mchildren; 3629 3630 if (tvd->vdev_islog || tvd->vdev_ops == &vdev_hole_ops) { 3631 schild[schildren] = fnvlist_alloc(); 3632 fnvlist_add_string(schild[schildren], 3633 ZPOOL_CONFIG_TYPE, VDEV_TYPE_HOLE); 3634 fnvlist_add_uint64(schild[schildren], 3635 ZPOOL_CONFIG_IS_HOLE, 1); 3636 if (lastlogid == 0) 3637 lastlogid = schildren; 3638 ++schildren; 3639 continue; 3640 } 3641 lastlogid = 0; 3642 VERIFY0(nvlist_lookup_nvlist_array(child[c], 3643 ZPOOL_CONFIG_CHILDREN, &mchild, &mchildren)); 3644 schild[schildren++] = fnvlist_dup(mchild[0]); 3645 } 3646 3647 /* OK, create a config that can be used to split */ 3648 split = 
fnvlist_alloc(); 3649 fnvlist_add_string(split, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT); 3650 fnvlist_add_nvlist_array(split, ZPOOL_CONFIG_CHILDREN, 3651 (const nvlist_t **)schild, lastlogid != 0 ? lastlogid : schildren); 3652 3653 config = fnvlist_alloc(); 3654 fnvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, split); 3655 3656 for (c = 0; c < schildren; c++) 3657 fnvlist_free(schild[c]); 3658 umem_free(schild, rvd->vdev_children * sizeof (nvlist_t *)); 3659 fnvlist_free(split); 3660 3661 spa_config_exit(spa, SCL_VDEV, FTAG); 3662 3663 (void) pthread_rwlock_wrlock(&ztest_name_lock); 3664 error = spa_vdev_split_mirror(spa, "splitp", config, NULL, B_FALSE); 3665 (void) pthread_rwlock_unlock(&ztest_name_lock); 3666 3667 fnvlist_free(config); 3668 3669 if (error == 0) { 3670 (void) printf("successful split - results:\n"); 3671 mutex_enter(&spa_namespace_lock); 3672 show_pool_stats(spa); 3673 show_pool_stats(spa_lookup("splitp")); 3674 mutex_exit(&spa_namespace_lock); 3675 ++zs->zs_splits; 3676 --zs->zs_mirrors; 3677 } 3678 mutex_exit(&ztest_vdev_lock); 3679 } 3680 3681 /* 3682 * Verify that we can attach and detach devices. 3683 */ 3684 void 3685 ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id) 3686 { 3687 (void) zd, (void) id; 3688 ztest_shared_t *zs = ztest_shared; 3689 spa_t *spa = ztest_spa; 3690 spa_aux_vdev_t *sav = &spa->spa_spares; 3691 vdev_t *rvd = spa->spa_root_vdev; 3692 vdev_t *oldvd, *newvd, *pvd; 3693 nvlist_t *root; 3694 uint64_t leaves; 3695 uint64_t leaf, top; 3696 uint64_t ashift = ztest_get_ashift(); 3697 uint64_t oldguid, pguid; 3698 uint64_t oldsize, newsize; 3699 uint64_t raidz_children; 3700 char *oldpath, *newpath; 3701 int replacing; 3702 int oldvd_has_siblings = B_FALSE; 3703 int newvd_is_spare = B_FALSE; 3704 int newvd_is_dspare = B_FALSE; 3705 int oldvd_is_log; 3706 int oldvd_is_special; 3707 int error, expected_error; 3708 3709 if (ztest_opts.zo_mmp_test) 3710 return; 3711 3712 oldpath = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); 3713 newpath = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); 3714 3715 mutex_enter(&ztest_vdev_lock); 3716 raidz_children = ztest_get_raidz_children(spa); 3717 leaves = MAX(zs->zs_mirrors, 1) * raidz_children; 3718 3719 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3720 3721 /* 3722 * If a vdev is in the process of being removed, its removal may 3723 * finish while we are in progress, leading to an unexpected error 3724 * value. Don't bother trying to attach while we are in the middle 3725 * of removal. 3726 */ 3727 if (ztest_device_removal_active) { 3728 spa_config_exit(spa, SCL_ALL, FTAG); 3729 goto out; 3730 } 3731 3732 /* 3733 * RAIDZ leaf VDEV mirrors are not currently supported while a 3734 * RAIDZ expansion is in progress. 3735 */ 3736 if (ztest_opts.zo_raid_do_expand) { 3737 spa_config_exit(spa, SCL_ALL, FTAG); 3738 goto out; 3739 } 3740 3741 /* 3742 * Decide whether to do an attach or a replace. 3743 */ 3744 replacing = ztest_random(2); 3745 3746 /* 3747 * Pick a random top-level vdev. 3748 */ 3749 top = ztest_random_vdev_top(spa, B_TRUE); 3750 3751 /* 3752 * Pick a random leaf within it. 3753 */ 3754 leaf = ztest_random(leaves); 3755 3756 /* 3757 * Locate this vdev. 
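 * Starting from the chosen top-level vdev, descend through the mirror
 * child (if mirroring), then the raidz/draid child (if any), and then
 * through any interior replacing/spare vdevs until we reach a leaf.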
3758 */ 3759 oldvd = rvd->vdev_child[top]; 3760 3761 /* pick a child from the mirror */ 3762 if (zs->zs_mirrors >= 1) { 3763 ASSERT3P(oldvd->vdev_ops, ==, &vdev_mirror_ops); 3764 ASSERT3U(oldvd->vdev_children, >=, zs->zs_mirrors); 3765 oldvd = oldvd->vdev_child[leaf / raidz_children]; 3766 } 3767 3768 /* pick a child out of the raidz group */ 3769 if (ztest_opts.zo_raid_children > 1) { 3770 if (strcmp(oldvd->vdev_ops->vdev_op_type, "raidz") == 0) 3771 ASSERT3P(oldvd->vdev_ops, ==, &vdev_raidz_ops); 3772 else 3773 ASSERT3P(oldvd->vdev_ops, ==, &vdev_draid_ops); 3774 oldvd = oldvd->vdev_child[leaf % raidz_children]; 3775 } 3776 3777 /* 3778 * If we're already doing an attach or replace, oldvd may be a 3779 * mirror vdev -- in which case, pick a random child. 3780 */ 3781 while (oldvd->vdev_children != 0) { 3782 oldvd_has_siblings = B_TRUE; 3783 ASSERT3U(oldvd->vdev_children, >=, 2); 3784 oldvd = oldvd->vdev_child[ztest_random(oldvd->vdev_children)]; 3785 } 3786 3787 oldguid = oldvd->vdev_guid; 3788 oldsize = vdev_get_min_asize(oldvd); 3789 oldvd_is_log = oldvd->vdev_top->vdev_islog; 3790 oldvd_is_special = 3791 oldvd->vdev_top->vdev_alloc_bias == VDEV_BIAS_SPECIAL || 3792 oldvd->vdev_top->vdev_alloc_bias == VDEV_BIAS_DEDUP; 3793 (void) strlcpy(oldpath, oldvd->vdev_path, MAXPATHLEN); 3794 pvd = oldvd->vdev_parent; 3795 pguid = pvd->vdev_guid; 3796 3797 /* 3798 * If oldvd has siblings, then half of the time, detach it. Prior 3799 * to the detach the pool is scrubbed in order to prevent creating 3800 * unrepairable blocks as a result of the data corruption injection. 3801 */ 3802 if (oldvd_has_siblings && ztest_random(2) == 0) { 3803 spa_config_exit(spa, SCL_ALL, FTAG); 3804 3805 error = ztest_scrub_impl(spa); 3806 if (error) 3807 goto out; 3808 3809 error = spa_vdev_detach(spa, oldguid, pguid, B_FALSE); 3810 if (error != 0 && error != ENODEV && error != EBUSY && 3811 error != ENOTSUP && error != ZFS_ERR_CHECKPOINT_EXISTS && 3812 error != ZFS_ERR_DISCARDING_CHECKPOINT) 3813 fatal(B_FALSE, "detach (%s) returned %d", 3814 oldpath, error); 3815 goto out; 3816 } 3817 3818 /* 3819 * For the new vdev, choose with equal probability between the two 3820 * standard paths (ending in either 'a' or 'b') or a random hot spare. 3821 */ 3822 if (sav->sav_count != 0 && ztest_random(3) == 0) { 3823 newvd = sav->sav_vdevs[ztest_random(sav->sav_count)]; 3824 newvd_is_spare = B_TRUE; 3825 3826 if (newvd->vdev_ops == &vdev_draid_spare_ops) 3827 newvd_is_dspare = B_TRUE; 3828 3829 (void) strlcpy(newpath, newvd->vdev_path, MAXPATHLEN); 3830 } else { 3831 (void) snprintf(newpath, MAXPATHLEN, ztest_dev_template, 3832 ztest_opts.zo_dir, ztest_opts.zo_pool, 3833 top * leaves + leaf); 3834 if (ztest_random(2) == 0) 3835 newpath[strlen(newpath) - 1] = 'b'; 3836 newvd = vdev_lookup_by_path(rvd, newpath); 3837 } 3838 3839 if (newvd) { 3840 /* 3841 * Reopen to ensure the vdev's asize field isn't stale. 3842 */ 3843 vdev_reopen(newvd); 3844 newsize = vdev_get_min_asize(newvd); 3845 } else { 3846 /* 3847 * Make newsize a little bigger or smaller than oldsize. 3848 * If it's smaller, the attach should fail. 3849 * If it's larger, and we're doing a replace, 3850 * we should get dynamic LUN growth when we're done. 3851 */ 3852 newsize = 10 * oldsize / (9 + ztest_random(3)); 3853 } 3854 3855 /* 3856 * If pvd is not a mirror or root, the attach should fail with ENOTSUP, 3857 * unless it's a replace; in that case any non-replacing parent is OK. 3858 * 3859 * If newvd is already part of the pool, it should fail with EBUSY. 
3860 * 3861 * If newvd is too small, it should fail with EOVERFLOW. 3862 * 3863 * If newvd is a distributed spare and it's being attached to a 3864 * dRAID which is not its parent it should fail with EINVAL. 3865 */ 3866 if (pvd->vdev_ops != &vdev_mirror_ops && 3867 pvd->vdev_ops != &vdev_root_ops && (!replacing || 3868 pvd->vdev_ops == &vdev_replacing_ops || 3869 pvd->vdev_ops == &vdev_spare_ops)) 3870 expected_error = ENOTSUP; 3871 else if (newvd_is_spare && 3872 (!replacing || oldvd_is_log || oldvd_is_special)) 3873 expected_error = ENOTSUP; 3874 else if (newvd == oldvd) 3875 expected_error = replacing ? 0 : EBUSY; 3876 else if (vdev_lookup_by_path(rvd, newpath) != NULL) 3877 expected_error = EBUSY; 3878 else if (!newvd_is_dspare && newsize < oldsize) 3879 expected_error = EOVERFLOW; 3880 else if (ashift > oldvd->vdev_top->vdev_ashift) 3881 expected_error = EDOM; 3882 else if (newvd_is_dspare && pvd != vdev_draid_spare_get_parent(newvd)) 3883 expected_error = EINVAL; 3884 else 3885 expected_error = 0; 3886 3887 spa_config_exit(spa, SCL_ALL, FTAG); 3888 3889 /* 3890 * Build the nvlist describing newpath. 3891 */ 3892 root = make_vdev_root(newpath, NULL, NULL, newvd == NULL ? newsize : 0, 3893 ashift, NULL, 0, 0, 1); 3894 3895 /* 3896 * When supported select either a healing or sequential resilver. 3897 */ 3898 boolean_t rebuilding = B_FALSE; 3899 if (pvd->vdev_ops == &vdev_mirror_ops || 3900 pvd->vdev_ops == &vdev_root_ops) { 3901 rebuilding = !!ztest_random(2); 3902 } 3903 3904 error = spa_vdev_attach(spa, oldguid, root, replacing, rebuilding); 3905 3906 fnvlist_free(root); 3907 3908 /* 3909 * If our parent was the replacing vdev, but the replace completed, 3910 * then instead of failing with ENOTSUP we may either succeed, 3911 * fail with ENODEV, or fail with EOVERFLOW. 3912 */ 3913 if (expected_error == ENOTSUP && 3914 (error == 0 || error == ENODEV || error == EOVERFLOW)) 3915 expected_error = error; 3916 3917 /* 3918 * If someone grew the LUN, the replacement may be too small. 
3919 */ 3920 if (error == EOVERFLOW || error == EBUSY) 3921 expected_error = error; 3922 3923 if (error == ZFS_ERR_CHECKPOINT_EXISTS || 3924 error == ZFS_ERR_DISCARDING_CHECKPOINT || 3925 error == ZFS_ERR_RESILVER_IN_PROGRESS || 3926 error == ZFS_ERR_REBUILD_IN_PROGRESS) 3927 expected_error = error; 3928 3929 if (error != expected_error && expected_error != EBUSY) { 3930 fatal(B_FALSE, "attach (%s %"PRIu64", %s %"PRIu64", %d) " 3931 "returned %d, expected %d", 3932 oldpath, oldsize, newpath, 3933 newsize, replacing, error, expected_error); 3934 } 3935 out: 3936 mutex_exit(&ztest_vdev_lock); 3937 3938 umem_free(oldpath, MAXPATHLEN); 3939 umem_free(newpath, MAXPATHLEN); 3940 } 3941 3942 static void 3943 raidz_scratch_verify(void) 3944 { 3945 spa_t *spa; 3946 uint64_t write_size, logical_size, offset; 3947 raidz_reflow_scratch_state_t state; 3948 vdev_raidz_expand_t *vre; 3949 vdev_t *raidvd; 3950 3951 ASSERT(raidz_expand_pause_point == RAIDZ_EXPAND_PAUSE_NONE); 3952 3953 if (ztest_scratch_state->zs_raidz_scratch_verify_pause == 0) 3954 return; 3955 3956 kernel_init(SPA_MODE_READ); 3957 3958 mutex_enter(&spa_namespace_lock); 3959 spa = spa_lookup(ztest_opts.zo_pool); 3960 ASSERT(spa); 3961 spa->spa_import_flags |= ZFS_IMPORT_SKIP_MMP; 3962 mutex_exit(&spa_namespace_lock); 3963 3964 VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG)); 3965 3966 ASSERT3U(RRSS_GET_OFFSET(&spa->spa_uberblock), !=, UINT64_MAX); 3967 3968 mutex_enter(&ztest_vdev_lock); 3969 3970 spa_config_enter(spa, SCL_ALL, FTAG, RW_READER); 3971 3972 vre = spa->spa_raidz_expand; 3973 if (vre == NULL) 3974 goto out; 3975 3976 raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); 3977 offset = RRSS_GET_OFFSET(&spa->spa_uberblock); 3978 state = RRSS_GET_STATE(&spa->spa_uberblock); 3979 write_size = P2ALIGN_TYPED(VDEV_BOOT_SIZE, 1 << raidvd->vdev_ashift, 3980 uint64_t); 3981 logical_size = write_size * raidvd->vdev_children; 3982 3983 switch (state) { 3984 /* 3985 * Initial state of reflow process. RAIDZ expansion was 3986 * requested by user, but scratch object was not created. 3987 */ 3988 case RRSS_SCRATCH_NOT_IN_USE: 3989 ASSERT3U(offset, ==, 0); 3990 break; 3991 3992 /* 3993 * Scratch object was synced and stored in boot area. 3994 */ 3995 case RRSS_SCRATCH_VALID: 3996 3997 /* 3998 * Scratch object was synced back to raidz start offset, 3999 * raidz is ready for sector by sector reflow process. 4000 */ 4001 case RRSS_SCRATCH_INVALID_SYNCED: 4002 4003 /* 4004 * Scratch object was synced back to raidz start offset 4005 * on zpool importing, raidz is ready for sector by sector 4006 * reflow process. 4007 */ 4008 case RRSS_SCRATCH_INVALID_SYNCED_ON_IMPORT: 4009 ASSERT3U(offset, ==, logical_size); 4010 break; 4011 4012 /* 4013 * Sector by sector reflow process started. 
4014 */ 4015 case RRSS_SCRATCH_INVALID_SYNCED_REFLOW: 4016 ASSERT3U(offset, >=, logical_size); 4017 break; 4018 } 4019 4020 out: 4021 spa_config_exit(spa, SCL_ALL, FTAG); 4022 4023 mutex_exit(&ztest_vdev_lock); 4024 4025 ztest_scratch_state->zs_raidz_scratch_verify_pause = 0; 4026 4027 spa_close(spa, FTAG); 4028 kernel_fini(); 4029 } 4030 4031 static void 4032 ztest_scratch_thread(void *arg) 4033 { 4034 (void) arg; 4035 4036 /* wait up to 10 seconds */ 4037 for (int t = 100; t > 0; t -= 1) { 4038 if (raidz_expand_pause_point == RAIDZ_EXPAND_PAUSE_NONE) 4039 thread_exit(); 4040 4041 (void) poll(NULL, 0, 100); 4042 } 4043 4044 /* killed when the scratch area progress reached a certain point */ 4045 ztest_kill(ztest_shared); 4046 } 4047 4048 /* 4049 * Verify that we can attach raidz device. 4050 */ 4051 void 4052 ztest_vdev_raidz_attach(ztest_ds_t *zd, uint64_t id) 4053 { 4054 (void) zd, (void) id; 4055 ztest_shared_t *zs = ztest_shared; 4056 spa_t *spa = ztest_spa; 4057 uint64_t leaves, raidz_children, newsize, ashift = ztest_get_ashift(); 4058 kthread_t *scratch_thread = NULL; 4059 vdev_t *newvd, *pvd; 4060 nvlist_t *root; 4061 char *newpath = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); 4062 int error, expected_error = 0; 4063 4064 mutex_enter(&ztest_vdev_lock); 4065 4066 spa_config_enter(spa, SCL_ALL, FTAG, RW_READER); 4067 4068 /* Only allow attach when raid-kind = 'eraidz' */ 4069 if (!ztest_opts.zo_raid_do_expand) { 4070 spa_config_exit(spa, SCL_ALL, FTAG); 4071 goto out; 4072 } 4073 4074 if (ztest_opts.zo_mmp_test) { 4075 spa_config_exit(spa, SCL_ALL, FTAG); 4076 goto out; 4077 } 4078 4079 if (ztest_device_removal_active) { 4080 spa_config_exit(spa, SCL_ALL, FTAG); 4081 goto out; 4082 } 4083 4084 pvd = vdev_lookup_top(spa, 0); 4085 4086 ASSERT(pvd->vdev_ops == &vdev_raidz_ops); 4087 4088 /* 4089 * Get size of a child of the raidz group, 4090 * make sure device is a bit bigger 4091 */ 4092 newvd = pvd->vdev_child[ztest_random(pvd->vdev_children)]; 4093 newsize = 10 * vdev_get_min_asize(newvd) / (9 + ztest_random(2)); 4094 4095 /* 4096 * Get next attached leaf id 4097 */ 4098 raidz_children = ztest_get_raidz_children(spa); 4099 leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * raidz_children; 4100 zs->zs_vdev_next_leaf = spa_num_top_vdevs(spa) * leaves; 4101 4102 if (spa->spa_raidz_expand) 4103 expected_error = ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS; 4104 4105 spa_config_exit(spa, SCL_ALL, FTAG); 4106 4107 /* 4108 * Path to vdev to be attached 4109 */ 4110 (void) snprintf(newpath, MAXPATHLEN, ztest_dev_template, 4111 ztest_opts.zo_dir, ztest_opts.zo_pool, zs->zs_vdev_next_leaf); 4112 4113 /* 4114 * Build the nvlist describing newpath. 4115 */ 4116 root = make_vdev_root(newpath, NULL, NULL, newsize, ashift, NULL, 4117 0, 0, 1); 4118 4119 /* 4120 * 50% of the time, set raidz_expand_pause_point to cause 4121 * raidz_reflow_scratch_sync() to pause at a certain point and 4122 * then kill the test after 10 seconds so raidz_scratch_verify() 4123 * can confirm consistency when the pool is imported. 
4124 */ 4125 if (ztest_random(2) == 0 && expected_error == 0) { 4126 raidz_expand_pause_point = 4127 ztest_random(RAIDZ_EXPAND_PAUSE_SCRATCH_POST_REFLOW_2) + 1; 4128 scratch_thread = thread_create(NULL, 0, ztest_scratch_thread, 4129 ztest_shared, 0, NULL, TS_RUN | TS_JOINABLE, defclsyspri); 4130 } 4131 4132 error = spa_vdev_attach(spa, pvd->vdev_guid, root, B_FALSE, B_FALSE); 4133 4134 nvlist_free(root); 4135 4136 if (error == EOVERFLOW || error == ENXIO || 4137 error == ZFS_ERR_CHECKPOINT_EXISTS || 4138 error == ZFS_ERR_DISCARDING_CHECKPOINT) 4139 expected_error = error; 4140 4141 if (error != 0 && error != expected_error) { 4142 fatal(0, "raidz attach (%s %"PRIu64") returned %d, expected %d", 4143 newpath, newsize, error, expected_error); 4144 } 4145 4146 if (raidz_expand_pause_point) { 4147 if (error != 0) { 4148 /* 4149 * Do not verify scratch object in case of error 4150 * returned by vdev attaching. 4151 */ 4152 raidz_expand_pause_point = RAIDZ_EXPAND_PAUSE_NONE; 4153 } 4154 4155 VERIFY0(thread_join(scratch_thread)); 4156 } 4157 out: 4158 mutex_exit(&ztest_vdev_lock); 4159 4160 umem_free(newpath, MAXPATHLEN); 4161 } 4162 4163 void 4164 ztest_device_removal(ztest_ds_t *zd, uint64_t id) 4165 { 4166 (void) zd, (void) id; 4167 spa_t *spa = ztest_spa; 4168 vdev_t *vd; 4169 uint64_t guid; 4170 int error; 4171 4172 mutex_enter(&ztest_vdev_lock); 4173 4174 if (ztest_device_removal_active) { 4175 mutex_exit(&ztest_vdev_lock); 4176 return; 4177 } 4178 4179 /* 4180 * Remove a random top-level vdev and wait for removal to finish. 4181 */ 4182 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 4183 vd = vdev_lookup_top(spa, ztest_random_vdev_top(spa, B_FALSE)); 4184 guid = vd->vdev_guid; 4185 spa_config_exit(spa, SCL_VDEV, FTAG); 4186 4187 error = spa_vdev_remove(spa, guid, B_FALSE); 4188 if (error == 0) { 4189 ztest_device_removal_active = B_TRUE; 4190 mutex_exit(&ztest_vdev_lock); 4191 4192 /* 4193 * spa->spa_vdev_removal is created in a sync task that 4194 * is initiated via dsl_sync_task_nowait(). Since the 4195 * task may not run before spa_vdev_remove() returns, we 4196 * must wait at least 1 txg to ensure that the removal 4197 * struct has been created. 4198 */ 4199 txg_wait_synced(spa_get_dsl(spa), 0); 4200 4201 while (spa->spa_removing_phys.sr_state == DSS_SCANNING) 4202 txg_wait_synced(spa_get_dsl(spa), 0); 4203 } else { 4204 mutex_exit(&ztest_vdev_lock); 4205 return; 4206 } 4207 4208 /* 4209 * The pool needs to be scrubbed after completing device removal. 4210 * Failure to do so may result in checksum errors due to the 4211 * strategy employed by ztest_fault_inject() when selecting which 4212 * offset are redundant and can be damaged. 4213 */ 4214 error = spa_scan(spa, POOL_SCAN_SCRUB); 4215 if (error == 0) { 4216 while (dsl_scan_scrubbing(spa_get_dsl(spa))) 4217 txg_wait_synced(spa_get_dsl(spa), 0); 4218 } 4219 4220 mutex_enter(&ztest_vdev_lock); 4221 ztest_device_removal_active = B_FALSE; 4222 mutex_exit(&ztest_vdev_lock); 4223 } 4224 4225 /* 4226 * Callback function which expands the physical size of the vdev. 
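 * The expansion itself is just an ftruncate() of the vdev's backing file to
 * the requested size; the new space becomes usable only after the vdev is
 * subsequently onlined (see online_vdev() below).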
4227 */ 4228 static vdev_t * 4229 grow_vdev(vdev_t *vd, void *arg) 4230 { 4231 spa_t *spa __maybe_unused = vd->vdev_spa; 4232 size_t *newsize = arg; 4233 size_t fsize; 4234 int fd; 4235 4236 ASSERT3S(spa_config_held(spa, SCL_STATE, RW_READER), ==, SCL_STATE); 4237 ASSERT(vd->vdev_ops->vdev_op_leaf); 4238 4239 if ((fd = open(vd->vdev_path, O_RDWR)) == -1) 4240 return (vd); 4241 4242 fsize = lseek(fd, 0, SEEK_END); 4243 VERIFY0(ftruncate(fd, *newsize)); 4244 4245 if (ztest_opts.zo_verbose >= 6) { 4246 (void) printf("%s grew from %lu to %lu bytes\n", 4247 vd->vdev_path, (ulong_t)fsize, (ulong_t)*newsize); 4248 } 4249 (void) close(fd); 4250 return (NULL); 4251 } 4252 4253 /* 4254 * Callback function which expands a given vdev by calling vdev_online(). 4255 */ 4256 static vdev_t * 4257 online_vdev(vdev_t *vd, void *arg) 4258 { 4259 (void) arg; 4260 spa_t *spa = vd->vdev_spa; 4261 vdev_t *tvd = vd->vdev_top; 4262 uint64_t guid = vd->vdev_guid; 4263 uint64_t generation = spa->spa_config_generation + 1; 4264 vdev_state_t newstate = VDEV_STATE_UNKNOWN; 4265 int error; 4266 4267 ASSERT3S(spa_config_held(spa, SCL_STATE, RW_READER), ==, SCL_STATE); 4268 ASSERT(vd->vdev_ops->vdev_op_leaf); 4269 4270 /* Calling vdev_online will initialize the new metaslabs */ 4271 spa_config_exit(spa, SCL_STATE, spa); 4272 error = vdev_online(spa, guid, ZFS_ONLINE_EXPAND, &newstate); 4273 spa_config_enter(spa, SCL_STATE, spa, RW_READER); 4274 4275 /* 4276 * If vdev_online returned an error or the underlying vdev_open 4277 * failed then we abort the expand. The only way to know that 4278 * vdev_open fails is by checking the returned newstate. 4279 */ 4280 if (error || newstate != VDEV_STATE_HEALTHY) { 4281 if (ztest_opts.zo_verbose >= 5) { 4282 (void) printf("Unable to expand vdev, state %u, " 4283 "error %d\n", newstate, error); 4284 } 4285 return (vd); 4286 } 4287 ASSERT3U(newstate, ==, VDEV_STATE_HEALTHY); 4288 4289 /* 4290 * Since we dropped the lock we need to ensure that we're 4291 * still talking to the original vdev. It's possible this 4292 * vdev may have been detached/replaced while we were 4293 * trying to online it. 4294 */ 4295 if (generation != spa->spa_config_generation) { 4296 if (ztest_opts.zo_verbose >= 5) { 4297 (void) printf("vdev configuration has changed, " 4298 "guid %"PRIu64", state %"PRIu64", " 4299 "expected gen %"PRIu64", got gen %"PRIu64"\n", 4300 guid, 4301 tvd->vdev_state, 4302 generation, 4303 spa->spa_config_generation); 4304 } 4305 return (vd); 4306 } 4307 return (NULL); 4308 } 4309 4310 /* 4311 * Traverse the vdev tree calling the supplied function. 4312 * We continue to walk the tree until we either have walked all 4313 * children or we receive a non-NULL return from the callback. 4314 * If a NULL callback is passed, then we just return back the first 4315 * leaf vdev we encounter. 4316 */ 4317 static vdev_t * 4318 vdev_walk_tree(vdev_t *vd, vdev_t *(*func)(vdev_t *, void *), void *arg) 4319 { 4320 uint_t c; 4321 4322 if (vd->vdev_ops->vdev_op_leaf) { 4323 if (func == NULL) 4324 return (vd); 4325 else 4326 return (func(vd, arg)); 4327 } 4328 4329 for (c = 0; c < vd->vdev_children; c++) { 4330 vdev_t *cvd = vd->vdev_child[c]; 4331 if ((cvd = vdev_walk_tree(cvd, func, arg)) != NULL) 4332 return (cvd); 4333 } 4334 return (NULL); 4335 } 4336 4337 /* 4338 * Verify that dynamic LUN growth works as expected. 
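 * The test grows the backing files of a randomly chosen top-level vdev's
 * leaves (grow_vdev()), onlines them with ZFS_ONLINE_EXPAND (online_vdev()),
 * and then checks that both the vdev's metaslab count and the metaslab
 * class space have increased.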
4339 */ 4340 void 4341 ztest_vdev_LUN_growth(ztest_ds_t *zd, uint64_t id) 4342 { 4343 (void) zd, (void) id; 4344 spa_t *spa = ztest_spa; 4345 vdev_t *vd, *tvd; 4346 metaslab_class_t *mc; 4347 metaslab_group_t *mg; 4348 size_t psize, newsize; 4349 uint64_t top; 4350 uint64_t old_class_space, new_class_space, old_ms_count, new_ms_count; 4351 4352 mutex_enter(&ztest_checkpoint_lock); 4353 mutex_enter(&ztest_vdev_lock); 4354 spa_config_enter(spa, SCL_STATE, spa, RW_READER); 4355 4356 /* 4357 * If there is a vdev removal in progress, it could complete while 4358 * we are running, in which case we would not be able to verify 4359 * that the metaslab_class space increased (because it decreases 4360 * when the device removal completes). 4361 */ 4362 if (ztest_device_removal_active) { 4363 spa_config_exit(spa, SCL_STATE, spa); 4364 mutex_exit(&ztest_vdev_lock); 4365 mutex_exit(&ztest_checkpoint_lock); 4366 return; 4367 } 4368 4369 /* 4370 * If we are under raidz expansion, the test can fail because the 4371 * metaslab count will not increase immediately after the vdev is 4372 * expanded; it increases only once the raidz expansion completes. 4373 */ 4374 if (spa->spa_raidz_expand) { 4375 spa_config_exit(spa, SCL_STATE, spa); 4376 mutex_exit(&ztest_vdev_lock); 4377 mutex_exit(&ztest_checkpoint_lock); 4378 return; 4379 } 4380 4381 top = ztest_random_vdev_top(spa, B_TRUE); 4382 4383 tvd = spa->spa_root_vdev->vdev_child[top]; 4384 mg = tvd->vdev_mg; 4385 mc = mg->mg_class; 4386 old_ms_count = tvd->vdev_ms_count; 4387 old_class_space = metaslab_class_get_space(mc); 4388 4389 /* 4390 * Determine the size of the first leaf vdev associated with 4391 * our top-level device. 4392 */ 4393 vd = vdev_walk_tree(tvd, NULL, NULL); 4394 ASSERT3P(vd, !=, NULL); 4395 ASSERT(vd->vdev_ops->vdev_op_leaf); 4396 4397 psize = vd->vdev_psize; 4398 4399 /* 4400 * We only try to expand the vdev if it's healthy, less than 4x its 4401 * original size, and it has a valid psize. 4402 */ 4403 if (tvd->vdev_state != VDEV_STATE_HEALTHY || 4404 psize == 0 || psize >= 4 * ztest_opts.zo_vdev_size) { 4405 spa_config_exit(spa, SCL_STATE, spa); 4406 mutex_exit(&ztest_vdev_lock); 4407 mutex_exit(&ztest_checkpoint_lock); 4408 return; 4409 } 4410 ASSERT3U(psize, >, 0); 4411 newsize = psize + MAX(psize / 8, SPA_MAXBLOCKSIZE); 4412 ASSERT3U(newsize, >, psize); 4413 4414 if (ztest_opts.zo_verbose >= 6) { 4415 (void) printf("Expanding LUN %s from %lu to %lu\n", 4416 vd->vdev_path, (ulong_t)psize, (ulong_t)newsize); 4417 } 4418 4419 /* 4420 * Growing the vdev is a two step process: 4421 * 1). expand the physical size (i.e. relabel) 4422 * 2). online the vdev to create the new metaslabs 4423 */ 4424 if (vdev_walk_tree(tvd, grow_vdev, &newsize) != NULL || 4425 vdev_walk_tree(tvd, online_vdev, NULL) != NULL || 4426 tvd->vdev_state != VDEV_STATE_HEALTHY) { 4427 if (ztest_opts.zo_verbose >= 5) { 4428 (void) printf("Could not expand LUN because " 4429 "the vdev configuration changed.\n"); 4430 } 4431 spa_config_exit(spa, SCL_STATE, spa); 4432 mutex_exit(&ztest_vdev_lock); 4433 mutex_exit(&ztest_checkpoint_lock); 4434 return; 4435 } 4436 4437 spa_config_exit(spa, SCL_STATE, spa); 4438 4439 /* 4440 * Expanding the LUN will update the config asynchronously, 4441 * thus we must wait for the async thread to complete any 4442 * pending tasks before proceeding.
4443 */ 4444 for (;;) { 4445 boolean_t done; 4446 mutex_enter(&spa->spa_async_lock); 4447 done = (spa->spa_async_thread == NULL && !spa->spa_async_tasks); 4448 mutex_exit(&spa->spa_async_lock); 4449 if (done) 4450 break; 4451 txg_wait_synced(spa_get_dsl(spa), 0); 4452 (void) poll(NULL, 0, 100); 4453 } 4454 4455 spa_config_enter(spa, SCL_STATE, spa, RW_READER); 4456 4457 tvd = spa->spa_root_vdev->vdev_child[top]; 4458 new_ms_count = tvd->vdev_ms_count; 4459 new_class_space = metaslab_class_get_space(mc); 4460 4461 if (tvd->vdev_mg != mg || mg->mg_class != mc) { 4462 if (ztest_opts.zo_verbose >= 5) { 4463 (void) printf("Could not verify LUN expansion due to " 4464 "intervening vdev offline or remove.\n"); 4465 } 4466 spa_config_exit(spa, SCL_STATE, spa); 4467 mutex_exit(&ztest_vdev_lock); 4468 mutex_exit(&ztest_checkpoint_lock); 4469 return; 4470 } 4471 4472 /* 4473 * Make sure we were able to grow the vdev. 4474 */ 4475 if (new_ms_count <= old_ms_count) { 4476 fatal(B_FALSE, 4477 "LUN expansion failed: ms_count %"PRIu64" < %"PRIu64"\n", 4478 old_ms_count, new_ms_count); 4479 } 4480 4481 /* 4482 * Make sure we were able to grow the pool. 4483 */ 4484 if (new_class_space <= old_class_space) { 4485 fatal(B_FALSE, 4486 "LUN expansion failed: class_space %"PRIu64" < %"PRIu64"\n", 4487 old_class_space, new_class_space); 4488 } 4489 4490 if (ztest_opts.zo_verbose >= 5) { 4491 char oldnumbuf[NN_NUMBUF_SZ], newnumbuf[NN_NUMBUF_SZ]; 4492 4493 nicenum(old_class_space, oldnumbuf, sizeof (oldnumbuf)); 4494 nicenum(new_class_space, newnumbuf, sizeof (newnumbuf)); 4495 (void) printf("%s grew from %s to %s\n", 4496 spa->spa_name, oldnumbuf, newnumbuf); 4497 } 4498 4499 spa_config_exit(spa, SCL_STATE, spa); 4500 mutex_exit(&ztest_vdev_lock); 4501 mutex_exit(&ztest_checkpoint_lock); 4502 } 4503 4504 /* 4505 * Verify that dmu_objset_{create,destroy,open,close} work as expected. 4506 */ 4507 static void 4508 ztest_objset_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx) 4509 { 4510 (void) arg, (void) cr; 4511 4512 /* 4513 * Create the objects common to all ztest datasets. 4514 */ 4515 VERIFY0(zap_create_claim(os, ZTEST_DIROBJ, 4516 DMU_OT_ZAP_OTHER, DMU_OT_NONE, 0, tx)); 4517 } 4518 4519 static int 4520 ztest_dataset_create(char *dsname) 4521 { 4522 int err; 4523 uint64_t rand; 4524 dsl_crypto_params_t *dcp = NULL; 4525 4526 /* 4527 * 50% of the time, we create encrypted datasets 4528 * using a random cipher suite and a hard-coded 4529 * wrapping key. 4530 */ 4531 rand = ztest_random(2); 4532 if (rand != 0) { 4533 nvlist_t *crypto_args = fnvlist_alloc(); 4534 nvlist_t *props = fnvlist_alloc(); 4535 4536 /* slight bias towards the default cipher suite */ 4537 rand = ztest_random(ZIO_CRYPT_FUNCTIONS); 4538 if (rand < ZIO_CRYPT_AES_128_CCM) 4539 rand = ZIO_CRYPT_ON; 4540 4541 fnvlist_add_uint64(props, 4542 zfs_prop_to_name(ZFS_PROP_ENCRYPTION), rand); 4543 fnvlist_add_uint8_array(crypto_args, "wkeydata", 4544 (uint8_t *)ztest_wkeydata, WRAPPING_KEY_LEN); 4545 4546 /* 4547 * These parameters aren't really used by the kernel. They 4548 * are simply stored so that userspace knows how to load 4549 * the wrapping key. 
4550 */ 4551 fnvlist_add_uint64(props, 4552 zfs_prop_to_name(ZFS_PROP_KEYFORMAT), ZFS_KEYFORMAT_RAW); 4553 fnvlist_add_string(props, 4554 zfs_prop_to_name(ZFS_PROP_KEYLOCATION), "prompt"); 4555 fnvlist_add_uint64(props, 4556 zfs_prop_to_name(ZFS_PROP_PBKDF2_SALT), 0ULL); 4557 fnvlist_add_uint64(props, 4558 zfs_prop_to_name(ZFS_PROP_PBKDF2_ITERS), 0ULL); 4559 4560 VERIFY0(dsl_crypto_params_create_nvlist(DCP_CMD_NONE, props, 4561 crypto_args, &dcp)); 4562 4563 /* 4564 * Cycle through all available encryption implementations 4565 * to verify interoperability. 4566 */ 4567 VERIFY0(gcm_impl_set("cycle")); 4568 VERIFY0(aes_impl_set("cycle")); 4569 4570 fnvlist_free(crypto_args); 4571 fnvlist_free(props); 4572 } 4573 4574 err = dmu_objset_create(dsname, DMU_OST_OTHER, 0, dcp, 4575 ztest_objset_create_cb, NULL); 4576 dsl_crypto_params_free(dcp, !!err); 4577 4578 rand = ztest_random(100); 4579 if (err || rand < 80) 4580 return (err); 4581 4582 if (ztest_opts.zo_verbose >= 5) 4583 (void) printf("Setting dataset %s to sync always\n", dsname); 4584 return (ztest_dsl_prop_set_uint64(dsname, ZFS_PROP_SYNC, 4585 ZFS_SYNC_ALWAYS, B_FALSE)); 4586 } 4587 4588 static int 4589 ztest_objset_destroy_cb(const char *name, void *arg) 4590 { 4591 (void) arg; 4592 objset_t *os; 4593 dmu_object_info_t doi; 4594 int error; 4595 4596 /* 4597 * Verify that the dataset contains a directory object. 4598 */ 4599 VERIFY0(ztest_dmu_objset_own(name, DMU_OST_OTHER, B_TRUE, 4600 B_TRUE, FTAG, &os)); 4601 error = dmu_object_info(os, ZTEST_DIROBJ, &doi); 4602 if (error != ENOENT) { 4603 /* We could have crashed in the middle of destroying it */ 4604 ASSERT0(error); 4605 ASSERT3U(doi.doi_type, ==, DMU_OT_ZAP_OTHER); 4606 ASSERT3S(doi.doi_physical_blocks_512, >=, 0); 4607 } 4608 dmu_objset_disown(os, B_TRUE, FTAG); 4609 4610 /* 4611 * Destroy the dataset. 4612 */ 4613 if (strchr(name, '@') != NULL) { 4614 error = dsl_destroy_snapshot(name, B_TRUE); 4615 if (error != ECHRNG) { 4616 /* 4617 * The program was executed, but encountered a runtime 4618 * error, such as insufficient slop, or a hold on the 4619 * dataset. 
4620 */ 4621 ASSERT0(error); 4622 } 4623 } else { 4624 error = dsl_destroy_head(name); 4625 if (error == ENOSPC) { 4626 /* There could be checkpoint or insufficient slop */ 4627 ztest_record_enospc(FTAG); 4628 } else if (error != EBUSY) { 4629 /* There could be a hold on this dataset */ 4630 ASSERT0(error); 4631 } 4632 } 4633 return (0); 4634 } 4635 4636 static boolean_t 4637 ztest_snapshot_create(char *osname, uint64_t id) 4638 { 4639 char snapname[ZFS_MAX_DATASET_NAME_LEN]; 4640 int error; 4641 4642 (void) snprintf(snapname, sizeof (snapname), "%"PRIu64"", id); 4643 4644 error = dmu_objset_snapshot_one(osname, snapname); 4645 if (error == ENOSPC) { 4646 ztest_record_enospc(FTAG); 4647 return (B_FALSE); 4648 } 4649 if (error != 0 && error != EEXIST && error != ECHRNG) { 4650 fatal(B_FALSE, "ztest_snapshot_create(%s@%s) = %d", osname, 4651 snapname, error); 4652 } 4653 return (B_TRUE); 4654 } 4655 4656 static boolean_t 4657 ztest_snapshot_destroy(char *osname, uint64_t id) 4658 { 4659 char snapname[ZFS_MAX_DATASET_NAME_LEN]; 4660 int error; 4661 4662 (void) snprintf(snapname, sizeof (snapname), "%s@%"PRIu64"", 4663 osname, id); 4664 4665 error = dsl_destroy_snapshot(snapname, B_FALSE); 4666 if (error != 0 && error != ENOENT && error != ECHRNG) 4667 fatal(B_FALSE, "ztest_snapshot_destroy(%s) = %d", 4668 snapname, error); 4669 return (B_TRUE); 4670 } 4671 4672 void 4673 ztest_dmu_objset_create_destroy(ztest_ds_t *zd, uint64_t id) 4674 { 4675 (void) zd; 4676 ztest_ds_t *zdtmp; 4677 int iters; 4678 int error; 4679 objset_t *os, *os2; 4680 char name[ZFS_MAX_DATASET_NAME_LEN]; 4681 zilog_t *zilog; 4682 int i; 4683 4684 zdtmp = umem_alloc(sizeof (ztest_ds_t), UMEM_NOFAIL); 4685 4686 (void) pthread_rwlock_rdlock(&ztest_name_lock); 4687 4688 (void) snprintf(name, sizeof (name), "%s/temp_%"PRIu64"", 4689 ztest_opts.zo_pool, id); 4690 4691 /* 4692 * If this dataset exists from a previous run, process its replay log 4693 * half of the time. If we don't replay it, then dsl_destroy_head() 4694 * (invoked from ztest_objset_destroy_cb()) should just throw it away. 4695 */ 4696 if (ztest_random(2) == 0 && 4697 ztest_dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, 4698 B_TRUE, FTAG, &os) == 0) { 4699 ztest_zd_init(zdtmp, NULL, os); 4700 zil_replay(os, zdtmp, ztest_replay_vector); 4701 ztest_zd_fini(zdtmp); 4702 dmu_objset_disown(os, B_TRUE, FTAG); 4703 } 4704 4705 /* 4706 * There may be an old instance of the dataset we're about to 4707 * create lying around from a previous run. If so, destroy it 4708 * and all of its snapshots. 4709 */ 4710 (void) dmu_objset_find(name, ztest_objset_destroy_cb, NULL, 4711 DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS); 4712 4713 /* 4714 * Verify that the destroyed dataset is no longer in the namespace. 4715 * It may still be present if the destroy above fails with ENOSPC. 4716 */ 4717 error = ztest_dmu_objset_own(name, DMU_OST_OTHER, B_TRUE, B_TRUE, 4718 FTAG, &os); 4719 if (error == 0) { 4720 dmu_objset_disown(os, B_TRUE, FTAG); 4721 ztest_record_enospc(FTAG); 4722 goto out; 4723 } 4724 VERIFY3U(ENOENT, ==, error); 4725 4726 /* 4727 * Verify that we can create a new dataset. 4728 */ 4729 error = ztest_dataset_create(name); 4730 if (error) { 4731 if (error == ENOSPC) { 4732 ztest_record_enospc(FTAG); 4733 goto out; 4734 } 4735 fatal(B_FALSE, "dmu_objset_create(%s) = %d", name, error); 4736 } 4737 4738 VERIFY0(ztest_dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, B_TRUE, 4739 FTAG, &os)); 4740 4741 ztest_zd_init(zdtmp, NULL, os); 4742 4743 /* 4744 * Open the intent log for it. 
4745 */ 4746 zilog = zil_open(os, ztest_get_data, NULL); 4747 4748 /* 4749 * Put some objects in there, do a little I/O to them, 4750 * and randomly take a couple of snapshots along the way. 4751 */ 4752 iters = ztest_random(5); 4753 for (i = 0; i < iters; i++) { 4754 ztest_dmu_object_alloc_free(zdtmp, id); 4755 if (ztest_random(iters) == 0) 4756 (void) ztest_snapshot_create(name, i); 4757 } 4758 4759 /* 4760 * Verify that we cannot create an existing dataset. 4761 */ 4762 VERIFY3U(EEXIST, ==, 4763 dmu_objset_create(name, DMU_OST_OTHER, 0, NULL, NULL, NULL)); 4764 4765 /* 4766 * Verify that we can hold an objset that is also owned. 4767 */ 4768 VERIFY0(dmu_objset_hold(name, FTAG, &os2)); 4769 dmu_objset_rele(os2, FTAG); 4770 4771 /* 4772 * Verify that we cannot own an objset that is already owned. 4773 */ 4774 VERIFY3U(EBUSY, ==, ztest_dmu_objset_own(name, DMU_OST_OTHER, 4775 B_FALSE, B_TRUE, FTAG, &os2)); 4776 4777 zil_close(zilog); 4778 dmu_objset_disown(os, B_TRUE, FTAG); 4779 ztest_zd_fini(zdtmp); 4780 out: 4781 (void) pthread_rwlock_unlock(&ztest_name_lock); 4782 4783 umem_free(zdtmp, sizeof (ztest_ds_t)); 4784 } 4785 4786 /* 4787 * Verify that dmu_snapshot_{create,destroy,open,close} work as expected. 4788 */ 4789 void 4790 ztest_dmu_snapshot_create_destroy(ztest_ds_t *zd, uint64_t id) 4791 { 4792 (void) pthread_rwlock_rdlock(&ztest_name_lock); 4793 (void) ztest_snapshot_destroy(zd->zd_name, id); 4794 (void) ztest_snapshot_create(zd->zd_name, id); 4795 (void) pthread_rwlock_unlock(&ztest_name_lock); 4796 } 4797 4798 /* 4799 * Cleanup non-standard snapshots and clones. 4800 */ 4801 static void 4802 ztest_dsl_dataset_cleanup(char *osname, uint64_t id) 4803 { 4804 char *snap1name; 4805 char *clone1name; 4806 char *snap2name; 4807 char *clone2name; 4808 char *snap3name; 4809 int error; 4810 4811 snap1name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4812 clone1name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4813 snap2name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4814 clone2name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4815 snap3name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4816 4817 (void) snprintf(snap1name, ZFS_MAX_DATASET_NAME_LEN, "%s@s1_%"PRIu64"", 4818 osname, id); 4819 (void) snprintf(clone1name, ZFS_MAX_DATASET_NAME_LEN, "%s/c1_%"PRIu64"", 4820 osname, id); 4821 (void) snprintf(snap2name, ZFS_MAX_DATASET_NAME_LEN, "%s@s2_%"PRIu64"", 4822 clone1name, id); 4823 (void) snprintf(clone2name, ZFS_MAX_DATASET_NAME_LEN, "%s/c2_%"PRIu64"", 4824 osname, id); 4825 (void) snprintf(snap3name, ZFS_MAX_DATASET_NAME_LEN, "%s@s3_%"PRIu64"", 4826 clone1name, id); 4827 4828 error = dsl_destroy_head(clone2name); 4829 if (error && error != ENOENT) 4830 fatal(B_FALSE, "dsl_destroy_head(%s) = %d", clone2name, error); 4831 error = dsl_destroy_snapshot(snap3name, B_FALSE); 4832 if (error && error != ENOENT) 4833 fatal(B_FALSE, "dsl_destroy_snapshot(%s) = %d", 4834 snap3name, error); 4835 error = dsl_destroy_snapshot(snap2name, B_FALSE); 4836 if (error && error != ENOENT) 4837 fatal(B_FALSE, "dsl_destroy_snapshot(%s) = %d", 4838 snap2name, error); 4839 error = dsl_destroy_head(clone1name); 4840 if (error && error != ENOENT) 4841 fatal(B_FALSE, "dsl_destroy_head(%s) = %d", clone1name, error); 4842 error = dsl_destroy_snapshot(snap1name, B_FALSE); 4843 if (error && error != ENOENT) 4844 fatal(B_FALSE, "dsl_destroy_snapshot(%s) = %d", 4845 snap1name, error); 4846 4847 umem_free(snap1name, ZFS_MAX_DATASET_NAME_LEN); 4848 umem_free(clone1name, 
ZFS_MAX_DATASET_NAME_LEN); 4849 umem_free(snap2name, ZFS_MAX_DATASET_NAME_LEN); 4850 umem_free(clone2name, ZFS_MAX_DATASET_NAME_LEN); 4851 umem_free(snap3name, ZFS_MAX_DATASET_NAME_LEN); 4852 } 4853 4854 /* 4855 * Verify dsl_dataset_promote handles EBUSY 4856 */ 4857 void 4858 ztest_dsl_dataset_promote_busy(ztest_ds_t *zd, uint64_t id) 4859 { 4860 objset_t *os; 4861 char *snap1name; 4862 char *clone1name; 4863 char *snap2name; 4864 char *clone2name; 4865 char *snap3name; 4866 char *osname = zd->zd_name; 4867 int error; 4868 4869 snap1name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4870 clone1name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4871 snap2name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4872 clone2name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4873 snap3name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4874 4875 (void) pthread_rwlock_rdlock(&ztest_name_lock); 4876 4877 ztest_dsl_dataset_cleanup(osname, id); 4878 4879 (void) snprintf(snap1name, ZFS_MAX_DATASET_NAME_LEN, "%s@s1_%"PRIu64"", 4880 osname, id); 4881 (void) snprintf(clone1name, ZFS_MAX_DATASET_NAME_LEN, "%s/c1_%"PRIu64"", 4882 osname, id); 4883 (void) snprintf(snap2name, ZFS_MAX_DATASET_NAME_LEN, "%s@s2_%"PRIu64"", 4884 clone1name, id); 4885 (void) snprintf(clone2name, ZFS_MAX_DATASET_NAME_LEN, "%s/c2_%"PRIu64"", 4886 osname, id); 4887 (void) snprintf(snap3name, ZFS_MAX_DATASET_NAME_LEN, "%s@s3_%"PRIu64"", 4888 clone1name, id); 4889 4890 error = dmu_objset_snapshot_one(osname, strchr(snap1name, '@') + 1); 4891 if (error && error != EEXIST) { 4892 if (error == ENOSPC) { 4893 ztest_record_enospc(FTAG); 4894 goto out; 4895 } 4896 fatal(B_FALSE, "dmu_take_snapshot(%s) = %d", snap1name, error); 4897 } 4898 4899 error = dmu_objset_clone(clone1name, snap1name); 4900 if (error) { 4901 if (error == ENOSPC) { 4902 ztest_record_enospc(FTAG); 4903 goto out; 4904 } 4905 fatal(B_FALSE, "dmu_objset_create(%s) = %d", clone1name, error); 4906 } 4907 4908 error = dmu_objset_snapshot_one(clone1name, strchr(snap2name, '@') + 1); 4909 if (error && error != EEXIST) { 4910 if (error == ENOSPC) { 4911 ztest_record_enospc(FTAG); 4912 goto out; 4913 } 4914 fatal(B_FALSE, "dmu_open_snapshot(%s) = %d", snap2name, error); 4915 } 4916 4917 error = dmu_objset_snapshot_one(clone1name, strchr(snap3name, '@') + 1); 4918 if (error && error != EEXIST) { 4919 if (error == ENOSPC) { 4920 ztest_record_enospc(FTAG); 4921 goto out; 4922 } 4923 fatal(B_FALSE, "dmu_open_snapshot(%s) = %d", snap3name, error); 4924 } 4925 4926 error = dmu_objset_clone(clone2name, snap3name); 4927 if (error) { 4928 if (error == ENOSPC) { 4929 ztest_record_enospc(FTAG); 4930 goto out; 4931 } 4932 fatal(B_FALSE, "dmu_objset_create(%s) = %d", clone2name, error); 4933 } 4934 4935 error = ztest_dmu_objset_own(snap2name, DMU_OST_ANY, B_TRUE, B_TRUE, 4936 FTAG, &os); 4937 if (error) 4938 fatal(B_FALSE, "dmu_objset_own(%s) = %d", snap2name, error); 4939 error = dsl_dataset_promote(clone2name, NULL); 4940 if (error == ENOSPC) { 4941 dmu_objset_disown(os, B_TRUE, FTAG); 4942 ztest_record_enospc(FTAG); 4943 goto out; 4944 } 4945 if (error != EBUSY) 4946 fatal(B_FALSE, "dsl_dataset_promote(%s), %d, not EBUSY", 4947 clone2name, error); 4948 dmu_objset_disown(os, B_TRUE, FTAG); 4949 4950 out: 4951 ztest_dsl_dataset_cleanup(osname, id); 4952 4953 (void) pthread_rwlock_unlock(&ztest_name_lock); 4954 4955 umem_free(snap1name, ZFS_MAX_DATASET_NAME_LEN); 4956 umem_free(clone1name, ZFS_MAX_DATASET_NAME_LEN); 4957 umem_free(snap2name, 
ZFS_MAX_DATASET_NAME_LEN); 4958 umem_free(clone2name, ZFS_MAX_DATASET_NAME_LEN); 4959 umem_free(snap3name, ZFS_MAX_DATASET_NAME_LEN); 4960 } 4961 4962 #undef OD_ARRAY_SIZE 4963 #define OD_ARRAY_SIZE 4 4964 4965 /* 4966 * Verify that dmu_object_{alloc,free} work as expected. 4967 */ 4968 void 4969 ztest_dmu_object_alloc_free(ztest_ds_t *zd, uint64_t id) 4970 { 4971 ztest_od_t *od; 4972 int batchsize; 4973 int size; 4974 int b; 4975 4976 size = sizeof (ztest_od_t) * OD_ARRAY_SIZE; 4977 od = umem_alloc(size, UMEM_NOFAIL); 4978 batchsize = OD_ARRAY_SIZE; 4979 4980 for (b = 0; b < batchsize; b++) 4981 ztest_od_init(od + b, id, FTAG, b, DMU_OT_UINT64_OTHER, 4982 0, 0, 0); 4983 4984 /* 4985 * Destroy the previous batch of objects, create a new batch, 4986 * and do some I/O on the new objects. 4987 */ 4988 if (ztest_object_init(zd, od, size, B_TRUE) != 0) { 4989 zd->zd_od = NULL; 4990 umem_free(od, size); 4991 return; 4992 } 4993 4994 while (ztest_random(4 * batchsize) != 0) 4995 ztest_io(zd, od[ztest_random(batchsize)].od_object, 4996 ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT); 4997 4998 umem_free(od, size); 4999 } 5000 5001 /* 5002 * Rewind the global allocator to verify object allocation backfilling. 5003 */ 5004 void 5005 ztest_dmu_object_next_chunk(ztest_ds_t *zd, uint64_t id) 5006 { 5007 (void) id; 5008 objset_t *os = zd->zd_os; 5009 uint_t dnodes_per_chunk = 1 << dmu_object_alloc_chunk_shift; 5010 uint64_t object; 5011 5012 /* 5013 * Rewind the global allocator randomly back to a lower object number 5014 * to force backfilling and reclamation of recently freed dnodes. 5015 */ 5016 mutex_enter(&os->os_obj_lock); 5017 object = ztest_random(os->os_obj_next_chunk); 5018 os->os_obj_next_chunk = P2ALIGN_TYPED(object, dnodes_per_chunk, 5019 uint64_t); 5020 mutex_exit(&os->os_obj_lock); 5021 } 5022 5023 #undef OD_ARRAY_SIZE 5024 #define OD_ARRAY_SIZE 2 5025 5026 /* 5027 * Verify that dmu_{read,write} work as expected. 5028 */ 5029 void 5030 ztest_dmu_read_write(ztest_ds_t *zd, uint64_t id) 5031 { 5032 int size; 5033 ztest_od_t *od; 5034 5035 objset_t *os = zd->zd_os; 5036 size = sizeof (ztest_od_t) * OD_ARRAY_SIZE; 5037 od = umem_alloc(size, UMEM_NOFAIL); 5038 dmu_tx_t *tx; 5039 int freeit, error; 5040 uint64_t i, n, s, txg; 5041 bufwad_t *packbuf, *bigbuf, *pack, *bigH, *bigT; 5042 uint64_t packobj, packoff, packsize, bigobj, bigoff, bigsize; 5043 uint64_t chunksize = (1000 + ztest_random(1000)) * sizeof (uint64_t); 5044 uint64_t regions = 997; 5045 uint64_t stride = 123456789ULL; 5046 uint64_t width = 40; 5047 int free_percent = 5; 5048 5049 /* 5050 * This test uses two objects, packobj and bigobj, that are always 5051 * updated together (i.e. in the same tx) so that their contents are 5052 * in sync and can be compared. Their contents relate to each other 5053 * in a simple way: packobj is a dense array of 'bufwad' structures, 5054 * while bigobj is a sparse array of the same bufwads. Specifically, 5055 * for any index n, there are three bufwads that should be identical: 5056 * 5057 * packobj, at offset n * sizeof (bufwad_t) 5058 * bigobj, at the head of the nth chunk 5059 * bigobj, at the tail of the nth chunk 5060 * 5061 * The chunk size is arbitrary. It doesn't have to be a power of two, 5062 * and it doesn't have any relation to the object blocksize. 5063 * The only requirement is that it can hold at least two bufwads. 5064 * 5065 * Normally, we write the bufwad to each of these locations. 
5066 * However, free_percent of the time we instead write zeroes to 5067 * packobj and perform a dmu_free_range() on bigobj. By comparing 5068 * bigobj to packobj, we can verify that the DMU is correctly 5069 * tracking which parts of an object are allocated and free, 5070 * and that the contents of the allocated blocks are correct. 5071 */ 5072 5073 /* 5074 * Read the directory info. If it's the first time, set things up. 5075 */ 5076 ztest_od_init(od, id, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0, chunksize); 5077 ztest_od_init(od + 1, id, FTAG, 1, DMU_OT_UINT64_OTHER, 0, 0, 5078 chunksize); 5079 5080 if (ztest_object_init(zd, od, size, B_FALSE) != 0) { 5081 umem_free(od, size); 5082 return; 5083 } 5084 5085 bigobj = od[0].od_object; 5086 packobj = od[1].od_object; 5087 chunksize = od[0].od_gen; 5088 ASSERT3U(chunksize, ==, od[1].od_gen); 5089 5090 /* 5091 * Prefetch a random chunk of the big object. 5092 * Our aim here is to get some async reads in flight 5093 * for blocks that we may free below; the DMU should 5094 * handle this race correctly. 5095 */ 5096 n = ztest_random(regions) * stride + ztest_random(width); 5097 s = 1 + ztest_random(2 * width - 1); 5098 dmu_prefetch(os, bigobj, 0, n * chunksize, s * chunksize, 5099 ZIO_PRIORITY_SYNC_READ); 5100 5101 /* 5102 * Pick a random index and compute the offsets into packobj and bigobj. 5103 */ 5104 n = ztest_random(regions) * stride + ztest_random(width); 5105 s = 1 + ztest_random(width - 1); 5106 5107 packoff = n * sizeof (bufwad_t); 5108 packsize = s * sizeof (bufwad_t); 5109 5110 bigoff = n * chunksize; 5111 bigsize = s * chunksize; 5112 5113 packbuf = umem_alloc(packsize, UMEM_NOFAIL); 5114 bigbuf = umem_alloc(bigsize, UMEM_NOFAIL); 5115 5116 /* 5117 * free_percent of the time, free a range of bigobj rather than 5118 * overwriting it. 5119 */ 5120 freeit = (ztest_random(100) < free_percent); 5121 5122 /* 5123 * Read the current contents of our objects. 5124 */ 5125 error = dmu_read(os, packobj, packoff, packsize, packbuf, 5126 DMU_READ_PREFETCH); 5127 ASSERT0(error); 5128 error = dmu_read(os, bigobj, bigoff, bigsize, bigbuf, 5129 DMU_READ_PREFETCH); 5130 ASSERT0(error); 5131 5132 /* 5133 * Get a tx for the mods to both packobj and bigobj. 5134 */ 5135 tx = dmu_tx_create(os); 5136 5137 dmu_tx_hold_write(tx, packobj, packoff, packsize); 5138 5139 if (freeit) 5140 dmu_tx_hold_free(tx, bigobj, bigoff, bigsize); 5141 else 5142 dmu_tx_hold_write(tx, bigobj, bigoff, bigsize); 5143 5144 /* This accounts for setting the checksum/compression. */ 5145 dmu_tx_hold_bonus(tx, bigobj); 5146 5147 txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); 5148 if (txg == 0) { 5149 umem_free(packbuf, packsize); 5150 umem_free(bigbuf, bigsize); 5151 umem_free(od, size); 5152 return; 5153 } 5154 5155 enum zio_checksum cksum; 5156 do { 5157 cksum = (enum zio_checksum) 5158 ztest_random_dsl_prop(ZFS_PROP_CHECKSUM); 5159 } while (cksum >= ZIO_CHECKSUM_LEGACY_FUNCTIONS); 5160 dmu_object_set_checksum(os, bigobj, cksum, tx); 5161 5162 enum zio_compress comp; 5163 do { 5164 comp = (enum zio_compress) 5165 ztest_random_dsl_prop(ZFS_PROP_COMPRESSION); 5166 } while (comp >= ZIO_COMPRESS_LEGACY_FUNCTIONS); 5167 dmu_object_set_compress(os, bigobj, comp, tx); 5168 5169 /* 5170 * For each index from n to n + s, verify that the existing bufwad 5171 * in packobj matches the bufwads at the head and tail of the 5172 * corresponding chunk in bigobj. Then update all three bufwads 5173 * with the new values we want to write out. 
5174 */ 5175 for (i = 0; i < s; i++) { 5176 /* LINTED */ 5177 pack = (bufwad_t *)((char *)packbuf + i * sizeof (bufwad_t)); 5178 /* LINTED */ 5179 bigH = (bufwad_t *)((char *)bigbuf + i * chunksize); 5180 /* LINTED */ 5181 bigT = (bufwad_t *)((char *)bigH + chunksize) - 1; 5182 5183 ASSERT3U((uintptr_t)bigH - (uintptr_t)bigbuf, <, bigsize); 5184 ASSERT3U((uintptr_t)bigT - (uintptr_t)bigbuf, <, bigsize); 5185 5186 if (pack->bw_txg > txg) 5187 fatal(B_FALSE, 5188 "future leak: got %"PRIx64", open txg is %"PRIx64"", 5189 pack->bw_txg, txg); 5190 5191 if (pack->bw_data != 0 && pack->bw_index != n + i) 5192 fatal(B_FALSE, "wrong index: " 5193 "got %"PRIx64", wanted %"PRIx64"+%"PRIx64"", 5194 pack->bw_index, n, i); 5195 5196 if (memcmp(pack, bigH, sizeof (bufwad_t)) != 0) 5197 fatal(B_FALSE, "pack/bigH mismatch in %p/%p", 5198 pack, bigH); 5199 5200 if (memcmp(pack, bigT, sizeof (bufwad_t)) != 0) 5201 fatal(B_FALSE, "pack/bigT mismatch in %p/%p", 5202 pack, bigT); 5203 5204 if (freeit) { 5205 memset(pack, 0, sizeof (bufwad_t)); 5206 } else { 5207 pack->bw_index = n + i; 5208 pack->bw_txg = txg; 5209 pack->bw_data = 1 + ztest_random(-2ULL); 5210 } 5211 *bigH = *pack; 5212 *bigT = *pack; 5213 } 5214 5215 /* 5216 * We've verified all the old bufwads, and made new ones. 5217 * Now write them out. 5218 */ 5219 dmu_write(os, packobj, packoff, packsize, packbuf, tx); 5220 5221 if (freeit) { 5222 if (ztest_opts.zo_verbose >= 7) { 5223 (void) printf("freeing offset %"PRIx64" size %"PRIx64"" 5224 " txg %"PRIx64"\n", 5225 bigoff, bigsize, txg); 5226 } 5227 VERIFY0(dmu_free_range(os, bigobj, bigoff, bigsize, tx)); 5228 } else { 5229 if (ztest_opts.zo_verbose >= 7) { 5230 (void) printf("writing offset %"PRIx64" size %"PRIx64"" 5231 " txg %"PRIx64"\n", 5232 bigoff, bigsize, txg); 5233 } 5234 dmu_write(os, bigobj, bigoff, bigsize, bigbuf, tx); 5235 } 5236 5237 dmu_tx_commit(tx); 5238 5239 /* 5240 * Sanity check the stuff we just wrote. 5241 */ 5242 { 5243 void *packcheck = umem_alloc(packsize, UMEM_NOFAIL); 5244 void *bigcheck = umem_alloc(bigsize, UMEM_NOFAIL); 5245 5246 VERIFY0(dmu_read(os, packobj, packoff, 5247 packsize, packcheck, DMU_READ_PREFETCH)); 5248 VERIFY0(dmu_read(os, bigobj, bigoff, 5249 bigsize, bigcheck, DMU_READ_PREFETCH)); 5250 5251 ASSERT0(memcmp(packbuf, packcheck, packsize)); 5252 ASSERT0(memcmp(bigbuf, bigcheck, bigsize)); 5253 5254 umem_free(packcheck, packsize); 5255 umem_free(bigcheck, bigsize); 5256 } 5257 5258 umem_free(packbuf, packsize); 5259 umem_free(bigbuf, bigsize); 5260 umem_free(od, size); 5261 } 5262 5263 static void 5264 compare_and_update_pbbufs(uint64_t s, bufwad_t *packbuf, bufwad_t *bigbuf, 5265 uint64_t bigsize, uint64_t n, uint64_t chunksize, uint64_t txg) 5266 { 5267 uint64_t i; 5268 bufwad_t *pack; 5269 bufwad_t *bigH; 5270 bufwad_t *bigT; 5271 5272 /* 5273 * For each index from n to n + s, verify that the existing bufwad 5274 * in packobj matches the bufwads at the head and tail of the 5275 * corresponding chunk in bigobj. Then update all three bufwads 5276 * with the new values we want to write out. 
5277 */ 5278 for (i = 0; i < s; i++) { 5279 /* LINTED */ 5280 pack = (bufwad_t *)((char *)packbuf + i * sizeof (bufwad_t)); 5281 /* LINTED */ 5282 bigH = (bufwad_t *)((char *)bigbuf + i * chunksize); 5283 /* LINTED */ 5284 bigT = (bufwad_t *)((char *)bigH + chunksize) - 1; 5285 5286 ASSERT3U((uintptr_t)bigH - (uintptr_t)bigbuf, <, bigsize); 5287 ASSERT3U((uintptr_t)bigT - (uintptr_t)bigbuf, <, bigsize); 5288 5289 if (pack->bw_txg > txg) 5290 fatal(B_FALSE, 5291 "future leak: got %"PRIx64", open txg is %"PRIx64"", 5292 pack->bw_txg, txg); 5293 5294 if (pack->bw_data != 0 && pack->bw_index != n + i) 5295 fatal(B_FALSE, "wrong index: " 5296 "got %"PRIx64", wanted %"PRIx64"+%"PRIx64"", 5297 pack->bw_index, n, i); 5298 5299 if (memcmp(pack, bigH, sizeof (bufwad_t)) != 0) 5300 fatal(B_FALSE, "pack/bigH mismatch in %p/%p", 5301 pack, bigH); 5302 5303 if (memcmp(pack, bigT, sizeof (bufwad_t)) != 0) 5304 fatal(B_FALSE, "pack/bigT mismatch in %p/%p", 5305 pack, bigT); 5306 5307 pack->bw_index = n + i; 5308 pack->bw_txg = txg; 5309 pack->bw_data = 1 + ztest_random(-2ULL); 5310 5311 *bigH = *pack; 5312 *bigT = *pack; 5313 } 5314 } 5315 5316 #undef OD_ARRAY_SIZE 5317 #define OD_ARRAY_SIZE 2 5318 5319 void 5320 ztest_dmu_read_write_zcopy(ztest_ds_t *zd, uint64_t id) 5321 { 5322 objset_t *os = zd->zd_os; 5323 ztest_od_t *od; 5324 dmu_tx_t *tx; 5325 uint64_t i; 5326 int error; 5327 int size; 5328 uint64_t n, s, txg; 5329 bufwad_t *packbuf, *bigbuf; 5330 uint64_t packobj, packoff, packsize, bigobj, bigoff, bigsize; 5331 uint64_t blocksize = ztest_random_blocksize(); 5332 uint64_t chunksize = blocksize; 5333 uint64_t regions = 997; 5334 uint64_t stride = 123456789ULL; 5335 uint64_t width = 9; 5336 dmu_buf_t *bonus_db; 5337 arc_buf_t **bigbuf_arcbufs; 5338 dmu_object_info_t doi; 5339 5340 size = sizeof (ztest_od_t) * OD_ARRAY_SIZE; 5341 od = umem_alloc(size, UMEM_NOFAIL); 5342 5343 /* 5344 * This test uses two objects, packobj and bigobj, that are always 5345 * updated together (i.e. in the same tx) so that their contents are 5346 * in sync and can be compared. Their contents relate to each other 5347 * in a simple way: packobj is a dense array of 'bufwad' structures, 5348 * while bigobj is a sparse array of the same bufwads. Specifically, 5349 * for any index n, there are three bufwads that should be identical: 5350 * 5351 * packobj, at offset n * sizeof (bufwad_t) 5352 * bigobj, at the head of the nth chunk 5353 * bigobj, at the tail of the nth chunk 5354 * 5355 * The chunk size is set equal to bigobj block size so that 5356 * dmu_assign_arcbuf_by_dbuf() can be tested for object updates. 5357 */ 5358 5359 /* 5360 * Read the directory info. If it's the first time, set things up. 5361 */ 5362 ztest_od_init(od, id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, 0, 0); 5363 ztest_od_init(od + 1, id, FTAG, 1, DMU_OT_UINT64_OTHER, 0, 0, 5364 chunksize); 5365 5366 5367 if (ztest_object_init(zd, od, size, B_FALSE) != 0) { 5368 umem_free(od, size); 5369 return; 5370 } 5371 5372 bigobj = od[0].od_object; 5373 packobj = od[1].od_object; 5374 blocksize = od[0].od_blocksize; 5375 chunksize = blocksize; 5376 ASSERT3U(chunksize, ==, od[1].od_gen); 5377 5378 VERIFY0(dmu_object_info(os, bigobj, &doi)); 5379 VERIFY(ISP2(doi.doi_data_block_size)); 5380 VERIFY3U(chunksize, ==, doi.doi_data_block_size); 5381 VERIFY3U(chunksize, >=, 2 * sizeof (bufwad_t)); 5382 5383 /* 5384 * Pick a random index and compute the offsets into packobj and bigobj. 
5385 */ 5386 n = ztest_random(regions) * stride + ztest_random(width); 5387 s = 1 + ztest_random(width - 1); 5388 5389 packoff = n * sizeof (bufwad_t); 5390 packsize = s * sizeof (bufwad_t); 5391 5392 bigoff = n * chunksize; 5393 bigsize = s * chunksize; 5394 5395 packbuf = umem_zalloc(packsize, UMEM_NOFAIL); 5396 bigbuf = umem_zalloc(bigsize, UMEM_NOFAIL); 5397 5398 VERIFY0(dmu_bonus_hold(os, bigobj, FTAG, &bonus_db)); 5399 5400 bigbuf_arcbufs = umem_zalloc(2 * s * sizeof (arc_buf_t *), UMEM_NOFAIL); 5401 5402 /* 5403 * Iteration 0 test zcopy for DB_UNCACHED dbufs. 5404 * Iteration 1 test zcopy to already referenced dbufs. 5405 * Iteration 2 test zcopy to dirty dbuf in the same txg. 5406 * Iteration 3 test zcopy to dbuf dirty in previous txg. 5407 * Iteration 4 test zcopy when dbuf is no longer dirty. 5408 * Iteration 5 test zcopy when it can't be done. 5409 * Iteration 6 one more zcopy write. 5410 */ 5411 for (i = 0; i < 7; i++) { 5412 uint64_t j; 5413 uint64_t off; 5414 5415 /* 5416 * In iteration 5 (i == 5) use arcbufs 5417 * that don't match bigobj blksz to test 5418 * dmu_assign_arcbuf_by_dbuf() when it can't directly 5419 * assign an arcbuf to a dbuf. 5420 */ 5421 for (j = 0; j < s; j++) { 5422 if (i != 5 || chunksize < (SPA_MINBLOCKSIZE * 2)) { 5423 bigbuf_arcbufs[j] = 5424 dmu_request_arcbuf(bonus_db, chunksize); 5425 } else { 5426 bigbuf_arcbufs[2 * j] = 5427 dmu_request_arcbuf(bonus_db, chunksize / 2); 5428 bigbuf_arcbufs[2 * j + 1] = 5429 dmu_request_arcbuf(bonus_db, chunksize / 2); 5430 } 5431 } 5432 5433 /* 5434 * Get a tx for the mods to both packobj and bigobj. 5435 */ 5436 tx = dmu_tx_create(os); 5437 5438 dmu_tx_hold_write(tx, packobj, packoff, packsize); 5439 dmu_tx_hold_write(tx, bigobj, bigoff, bigsize); 5440 5441 txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); 5442 if (txg == 0) { 5443 umem_free(packbuf, packsize); 5444 umem_free(bigbuf, bigsize); 5445 for (j = 0; j < s; j++) { 5446 if (i != 5 || 5447 chunksize < (SPA_MINBLOCKSIZE * 2)) { 5448 dmu_return_arcbuf(bigbuf_arcbufs[j]); 5449 } else { 5450 dmu_return_arcbuf( 5451 bigbuf_arcbufs[2 * j]); 5452 dmu_return_arcbuf( 5453 bigbuf_arcbufs[2 * j + 1]); 5454 } 5455 } 5456 umem_free(bigbuf_arcbufs, 2 * s * sizeof (arc_buf_t *)); 5457 umem_free(od, size); 5458 dmu_buf_rele(bonus_db, FTAG); 5459 return; 5460 } 5461 5462 /* 5463 * 50% of the time don't read objects in the 1st iteration to 5464 * test dmu_assign_arcbuf_by_dbuf() for the case when there are 5465 * no existing dbufs for the specified offsets. 5466 */ 5467 if (i != 0 || ztest_random(2) != 0) { 5468 error = dmu_read(os, packobj, packoff, 5469 packsize, packbuf, DMU_READ_PREFETCH); 5470 ASSERT0(error); 5471 error = dmu_read(os, bigobj, bigoff, bigsize, 5472 bigbuf, DMU_READ_PREFETCH); 5473 ASSERT0(error); 5474 } 5475 compare_and_update_pbbufs(s, packbuf, bigbuf, bigsize, 5476 n, chunksize, txg); 5477 5478 /* 5479 * We've verified all the old bufwads, and made new ones. 5480 * Now write them out. 
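 * packobj is written with a regular dmu_write(); the bigobj chunks are
 * written by handing the pre-filled arc bufs to dmu_assign_arcbuf_by_dbuf(),
 * which exercises the zero-copy path.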
5481 */ 5482 dmu_write(os, packobj, packoff, packsize, packbuf, tx); 5483 if (ztest_opts.zo_verbose >= 7) { 5484 (void) printf("writing offset %"PRIx64" size %"PRIx64"" 5485 " txg %"PRIx64"\n", 5486 bigoff, bigsize, txg); 5487 } 5488 for (off = bigoff, j = 0; j < s; j++, off += chunksize) { 5489 dmu_buf_t *dbt; 5490 if (i != 5 || chunksize < (SPA_MINBLOCKSIZE * 2)) { 5491 memcpy(bigbuf_arcbufs[j]->b_data, 5492 (caddr_t)bigbuf + (off - bigoff), 5493 chunksize); 5494 } else { 5495 memcpy(bigbuf_arcbufs[2 * j]->b_data, 5496 (caddr_t)bigbuf + (off - bigoff), 5497 chunksize / 2); 5498 memcpy(bigbuf_arcbufs[2 * j + 1]->b_data, 5499 (caddr_t)bigbuf + (off - bigoff) + 5500 chunksize / 2, 5501 chunksize / 2); 5502 } 5503 5504 if (i == 1) { 5505 VERIFY(dmu_buf_hold(os, bigobj, off, 5506 FTAG, &dbt, DMU_READ_NO_PREFETCH) == 0); 5507 } 5508 if (i != 5 || chunksize < (SPA_MINBLOCKSIZE * 2)) { 5509 VERIFY0(dmu_assign_arcbuf_by_dbuf(bonus_db, 5510 off, bigbuf_arcbufs[j], tx)); 5511 } else { 5512 VERIFY0(dmu_assign_arcbuf_by_dbuf(bonus_db, 5513 off, bigbuf_arcbufs[2 * j], tx)); 5514 VERIFY0(dmu_assign_arcbuf_by_dbuf(bonus_db, 5515 off + chunksize / 2, 5516 bigbuf_arcbufs[2 * j + 1], tx)); 5517 } 5518 if (i == 1) { 5519 dmu_buf_rele(dbt, FTAG); 5520 } 5521 } 5522 dmu_tx_commit(tx); 5523 5524 /* 5525 * Sanity check the stuff we just wrote. 5526 */ 5527 { 5528 void *packcheck = umem_alloc(packsize, UMEM_NOFAIL); 5529 void *bigcheck = umem_alloc(bigsize, UMEM_NOFAIL); 5530 5531 VERIFY0(dmu_read(os, packobj, packoff, 5532 packsize, packcheck, DMU_READ_PREFETCH)); 5533 VERIFY0(dmu_read(os, bigobj, bigoff, 5534 bigsize, bigcheck, DMU_READ_PREFETCH)); 5535 5536 ASSERT0(memcmp(packbuf, packcheck, packsize)); 5537 ASSERT0(memcmp(bigbuf, bigcheck, bigsize)); 5538 5539 umem_free(packcheck, packsize); 5540 umem_free(bigcheck, bigsize); 5541 } 5542 if (i == 2) { 5543 txg_wait_open(dmu_objset_pool(os), 0, B_TRUE); 5544 } else if (i == 3) { 5545 txg_wait_synced(dmu_objset_pool(os), 0); 5546 } 5547 } 5548 5549 dmu_buf_rele(bonus_db, FTAG); 5550 umem_free(packbuf, packsize); 5551 umem_free(bigbuf, bigsize); 5552 umem_free(bigbuf_arcbufs, 2 * s * sizeof (arc_buf_t *)); 5553 umem_free(od, size); 5554 } 5555 5556 void 5557 ztest_dmu_write_parallel(ztest_ds_t *zd, uint64_t id) 5558 { 5559 (void) id; 5560 ztest_od_t *od; 5561 5562 od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL); 5563 uint64_t offset = (1ULL << (ztest_random(20) + 43)) + 5564 (ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT); 5565 5566 /* 5567 * Have multiple threads write to large offsets in an object 5568 * to verify that parallel writes to an object -- even to the 5569 * same blocks within the object -- doesn't cause any trouble. 
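 * (All callers pass the shared ID_PARALLEL id, so concurrent threads on this
 * dataset operate on the same object rather than thread-private ones.)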
5570 */ 5571 ztest_od_init(od, ID_PARALLEL, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0, 0); 5572 5573 if (ztest_object_init(zd, od, sizeof (ztest_od_t), B_FALSE) != 0) 5574 return; 5575 5576 while (ztest_random(10) != 0) 5577 ztest_io(zd, od->od_object, offset); 5578 5579 umem_free(od, sizeof (ztest_od_t)); 5580 } 5581 5582 void 5583 ztest_dmu_prealloc(ztest_ds_t *zd, uint64_t id) 5584 { 5585 ztest_od_t *od; 5586 uint64_t offset = (1ULL << (ztest_random(4) + SPA_MAXBLOCKSHIFT)) + 5587 (ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT); 5588 uint64_t count = ztest_random(20) + 1; 5589 uint64_t blocksize = ztest_random_blocksize(); 5590 void *data; 5591 5592 od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL); 5593 5594 ztest_od_init(od, id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, 0, 0); 5595 5596 if (ztest_object_init(zd, od, sizeof (ztest_od_t), 5597 !ztest_random(2)) != 0) { 5598 umem_free(od, sizeof (ztest_od_t)); 5599 return; 5600 } 5601 5602 if (ztest_truncate(zd, od->od_object, offset, count * blocksize) != 0) { 5603 umem_free(od, sizeof (ztest_od_t)); 5604 return; 5605 } 5606 5607 ztest_prealloc(zd, od->od_object, offset, count * blocksize); 5608 5609 data = umem_zalloc(blocksize, UMEM_NOFAIL); 5610 5611 while (ztest_random(count) != 0) { 5612 uint64_t randoff = offset + (ztest_random(count) * blocksize); 5613 if (ztest_write(zd, od->od_object, randoff, blocksize, 5614 data) != 0) 5615 break; 5616 while (ztest_random(4) != 0) 5617 ztest_io(zd, od->od_object, randoff); 5618 } 5619 5620 umem_free(data, blocksize); 5621 umem_free(od, sizeof (ztest_od_t)); 5622 } 5623 5624 /* 5625 * Verify that zap_{create,destroy,add,remove,update} work as expected. 5626 */ 5627 #define ZTEST_ZAP_MIN_INTS 1 5628 #define ZTEST_ZAP_MAX_INTS 4 5629 #define ZTEST_ZAP_MAX_PROPS 1000 5630 5631 void 5632 ztest_zap(ztest_ds_t *zd, uint64_t id) 5633 { 5634 objset_t *os = zd->zd_os; 5635 ztest_od_t *od; 5636 uint64_t object; 5637 uint64_t txg, last_txg; 5638 uint64_t value[ZTEST_ZAP_MAX_INTS]; 5639 uint64_t zl_ints, zl_intsize, prop; 5640 int i, ints; 5641 dmu_tx_t *tx; 5642 char propname[100], txgname[100]; 5643 int error; 5644 const char *const hc[2] = { "s.acl.h", ".s.open.h.hyLZlg" }; 5645 5646 od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL); 5647 ztest_od_init(od, id, FTAG, 0, DMU_OT_ZAP_OTHER, 0, 0, 0); 5648 5649 if (ztest_object_init(zd, od, sizeof (ztest_od_t), 5650 !ztest_random(2)) != 0) 5651 goto out; 5652 5653 object = od->od_object; 5654 5655 /* 5656 * Generate a known hash collision, and verify that 5657 * we can lookup and remove both entries. 5658 */ 5659 tx = dmu_tx_create(os); 5660 dmu_tx_hold_zap(tx, object, B_TRUE, NULL); 5661 txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); 5662 if (txg == 0) 5663 goto out; 5664 for (i = 0; i < 2; i++) { 5665 value[i] = i; 5666 VERIFY0(zap_add(os, object, hc[i], sizeof (uint64_t), 5667 1, &value[i], tx)); 5668 } 5669 for (i = 0; i < 2; i++) { 5670 VERIFY3U(EEXIST, ==, zap_add(os, object, hc[i], 5671 sizeof (uint64_t), 1, &value[i], tx)); 5672 VERIFY0( 5673 zap_length(os, object, hc[i], &zl_intsize, &zl_ints)); 5674 ASSERT3U(zl_intsize, ==, sizeof (uint64_t)); 5675 ASSERT3U(zl_ints, ==, 1); 5676 } 5677 for (i = 0; i < 2; i++) { 5678 VERIFY0(zap_remove(os, object, hc[i], tx)); 5679 } 5680 dmu_tx_commit(tx); 5681 5682 /* 5683 * Generate a bunch of random entries. 
5684 */ 5685 ints = MAX(ZTEST_ZAP_MIN_INTS, object % ZTEST_ZAP_MAX_INTS); 5686 5687 prop = ztest_random(ZTEST_ZAP_MAX_PROPS); 5688 (void) sprintf(propname, "prop_%"PRIu64"", prop); 5689 (void) sprintf(txgname, "txg_%"PRIu64"", prop); 5690 memset(value, 0, sizeof (value)); 5691 last_txg = 0; 5692 5693 /* 5694 * If these zap entries already exist, validate their contents. 5695 */ 5696 error = zap_length(os, object, txgname, &zl_intsize, &zl_ints); 5697 if (error == 0) { 5698 ASSERT3U(zl_intsize, ==, sizeof (uint64_t)); 5699 ASSERT3U(zl_ints, ==, 1); 5700 5701 VERIFY0(zap_lookup(os, object, txgname, zl_intsize, 5702 zl_ints, &last_txg)); 5703 5704 VERIFY0(zap_length(os, object, propname, &zl_intsize, 5705 &zl_ints)); 5706 5707 ASSERT3U(zl_intsize, ==, sizeof (uint64_t)); 5708 ASSERT3U(zl_ints, ==, ints); 5709 5710 VERIFY0(zap_lookup(os, object, propname, zl_intsize, 5711 zl_ints, value)); 5712 5713 for (i = 0; i < ints; i++) { 5714 ASSERT3U(value[i], ==, last_txg + object + i); 5715 } 5716 } else { 5717 ASSERT3U(error, ==, ENOENT); 5718 } 5719 5720 /* 5721 * Atomically update two entries in our zap object. 5722 * The first is named txg_%llu, and contains the txg 5723 * in which the property was last updated. The second 5724 * is named prop_%llu, and the nth element of its value 5725 * should be txg + object + n. 5726 */ 5727 tx = dmu_tx_create(os); 5728 dmu_tx_hold_zap(tx, object, B_TRUE, NULL); 5729 txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); 5730 if (txg == 0) 5731 goto out; 5732 5733 if (last_txg > txg) 5734 fatal(B_FALSE, "zap future leak: old %"PRIu64" new %"PRIu64"", 5735 last_txg, txg); 5736 5737 for (i = 0; i < ints; i++) 5738 value[i] = txg + object + i; 5739 5740 VERIFY0(zap_update(os, object, txgname, sizeof (uint64_t), 5741 1, &txg, tx)); 5742 VERIFY0(zap_update(os, object, propname, sizeof (uint64_t), 5743 ints, value, tx)); 5744 5745 dmu_tx_commit(tx); 5746 5747 /* 5748 * Remove a random pair of entries. 5749 */ 5750 prop = ztest_random(ZTEST_ZAP_MAX_PROPS); 5751 (void) sprintf(propname, "prop_%"PRIu64"", prop); 5752 (void) sprintf(txgname, "txg_%"PRIu64"", prop); 5753 5754 error = zap_length(os, object, txgname, &zl_intsize, &zl_ints); 5755 5756 if (error == ENOENT) 5757 goto out; 5758 5759 ASSERT0(error); 5760 5761 tx = dmu_tx_create(os); 5762 dmu_tx_hold_zap(tx, object, B_TRUE, NULL); 5763 txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); 5764 if (txg == 0) 5765 goto out; 5766 VERIFY0(zap_remove(os, object, txgname, tx)); 5767 VERIFY0(zap_remove(os, object, propname, tx)); 5768 dmu_tx_commit(tx); 5769 out: 5770 umem_free(od, sizeof (ztest_od_t)); 5771 } 5772 5773 /* 5774 * Test case to test the upgrading of a microzap to fatzap. 5775 */ 5776 void 5777 ztest_fzap(ztest_ds_t *zd, uint64_t id) 5778 { 5779 objset_t *os = zd->zd_os; 5780 ztest_od_t *od; 5781 uint64_t object, txg, value; 5782 5783 od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL); 5784 ztest_od_init(od, id, FTAG, 0, DMU_OT_ZAP_OTHER, 0, 0, 0); 5785 5786 if (ztest_object_init(zd, od, sizeof (ztest_od_t), 5787 !ztest_random(2)) != 0) 5788 goto out; 5789 object = od->od_object; 5790 5791 /* 5792 * Add entries to this ZAP and make sure it spills over 5793 * and gets upgraded to a fatzap. Also, since we are adding 5794 * 2050 entries we should see ptrtbl growth and leaf-block split. 
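 * (A microzap must fit in a single block, so this many single-integer
 * entries cannot remain in microzap form.)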
5795 */ 5796 for (value = 0; value < 2050; value++) { 5797 char name[ZFS_MAX_DATASET_NAME_LEN]; 5798 dmu_tx_t *tx; 5799 int error; 5800 5801 (void) snprintf(name, sizeof (name), "fzap-%"PRIu64"-%"PRIu64"", 5802 id, value); 5803 5804 tx = dmu_tx_create(os); 5805 dmu_tx_hold_zap(tx, object, B_TRUE, name); 5806 txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); 5807 if (txg == 0) 5808 goto out; 5809 error = zap_add(os, object, name, sizeof (uint64_t), 1, 5810 &value, tx); 5811 ASSERT(error == 0 || error == EEXIST); 5812 dmu_tx_commit(tx); 5813 } 5814 out: 5815 umem_free(od, sizeof (ztest_od_t)); 5816 } 5817 5818 void 5819 ztest_zap_parallel(ztest_ds_t *zd, uint64_t id) 5820 { 5821 (void) id; 5822 objset_t *os = zd->zd_os; 5823 ztest_od_t *od; 5824 uint64_t txg, object, count, wsize, wc, zl_wsize, zl_wc; 5825 dmu_tx_t *tx; 5826 int i, namelen, error; 5827 int micro = ztest_random(2); 5828 char name[20], string_value[20]; 5829 void *data; 5830 5831 od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL); 5832 ztest_od_init(od, ID_PARALLEL, FTAG, micro, DMU_OT_ZAP_OTHER, 0, 0, 0); 5833 5834 if (ztest_object_init(zd, od, sizeof (ztest_od_t), B_FALSE) != 0) { 5835 umem_free(od, sizeof (ztest_od_t)); 5836 return; 5837 } 5838 5839 object = od->od_object; 5840 5841 /* 5842 * Generate a random name of the form 'xxx.....' where each 5843 * x is a random printable character and the dots are dots. 5844 * There are 94 such characters, and the name length goes from 5845 * 6 to 20, so there are 94^3 * 15 = 12,458,760 possible names. 5846 */ 5847 namelen = ztest_random(sizeof (name) - 5) + 5 + 1; 5848 5849 for (i = 0; i < 3; i++) 5850 name[i] = '!' + ztest_random('~' - '!' + 1); 5851 for (; i < namelen - 1; i++) 5852 name[i] = '.'; 5853 name[i] = '\0'; 5854 5855 if ((namelen & 1) || micro) { 5856 wsize = sizeof (txg); 5857 wc = 1; 5858 data = &txg; 5859 } else { 5860 wsize = 1; 5861 wc = namelen; 5862 data = string_value; 5863 } 5864 5865 count = -1ULL; 5866 VERIFY0(zap_count(os, object, &count)); 5867 ASSERT3S(count, !=, -1ULL); 5868 5869 /* 5870 * Select an operation: length, lookup, add, update, remove. 
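 * (0 = length, 1 = lookup, 2 = add, 3 = update, 4 = remove; the last three
 * modify the ZAP and therefore need a transaction.)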
5871 */ 5872 i = ztest_random(5); 5873 5874 if (i >= 2) { 5875 tx = dmu_tx_create(os); 5876 dmu_tx_hold_zap(tx, object, B_TRUE, NULL); 5877 txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); 5878 if (txg == 0) { 5879 umem_free(od, sizeof (ztest_od_t)); 5880 return; 5881 } 5882 memcpy(string_value, name, namelen); 5883 } else { 5884 tx = NULL; 5885 txg = 0; 5886 memset(string_value, 0, namelen); 5887 } 5888 5889 switch (i) { 5890 5891 case 0: 5892 error = zap_length(os, object, name, &zl_wsize, &zl_wc); 5893 if (error == 0) { 5894 ASSERT3U(wsize, ==, zl_wsize); 5895 ASSERT3U(wc, ==, zl_wc); 5896 } else { 5897 ASSERT3U(error, ==, ENOENT); 5898 } 5899 break; 5900 5901 case 1: 5902 error = zap_lookup(os, object, name, wsize, wc, data); 5903 if (error == 0) { 5904 if (data == string_value && 5905 memcmp(name, data, namelen) != 0) 5906 fatal(B_FALSE, "name '%s' != val '%s' len %d", 5907 name, (char *)data, namelen); 5908 } else { 5909 ASSERT3U(error, ==, ENOENT); 5910 } 5911 break; 5912 5913 case 2: 5914 error = zap_add(os, object, name, wsize, wc, data, tx); 5915 ASSERT(error == 0 || error == EEXIST); 5916 break; 5917 5918 case 3: 5919 VERIFY0(zap_update(os, object, name, wsize, wc, data, tx)); 5920 break; 5921 5922 case 4: 5923 error = zap_remove(os, object, name, tx); 5924 ASSERT(error == 0 || error == ENOENT); 5925 break; 5926 } 5927 5928 if (tx != NULL) 5929 dmu_tx_commit(tx); 5930 5931 umem_free(od, sizeof (ztest_od_t)); 5932 } 5933 5934 /* 5935 * Commit callback data. 5936 */ 5937 typedef struct ztest_cb_data { 5938 list_node_t zcd_node; 5939 uint64_t zcd_txg; 5940 int zcd_expected_err; 5941 boolean_t zcd_added; 5942 boolean_t zcd_called; 5943 spa_t *zcd_spa; 5944 } ztest_cb_data_t; 5945 5946 /* This is the actual commit callback function */ 5947 static void 5948 ztest_commit_callback(void *arg, int error) 5949 { 5950 ztest_cb_data_t *data = arg; 5951 uint64_t synced_txg; 5952 5953 VERIFY3P(data, !=, NULL); 5954 VERIFY3S(data->zcd_expected_err, ==, error); 5955 VERIFY(!data->zcd_called); 5956 5957 synced_txg = spa_last_synced_txg(data->zcd_spa); 5958 if (data->zcd_txg > synced_txg) 5959 fatal(B_FALSE, 5960 "commit callback of txg %"PRIu64" called prematurely, " 5961 "last synced txg = %"PRIu64"\n", 5962 data->zcd_txg, synced_txg); 5963 5964 data->zcd_called = B_TRUE; 5965 5966 if (error == ECANCELED) { 5967 ASSERT0(data->zcd_txg); 5968 ASSERT(!data->zcd_added); 5969 5970 /* 5971 * The private callback data should be destroyed here, but 5972 * since we are going to check the zcd_called field after 5973 * dmu_tx_abort(), we will destroy it there. 
5974 */ 5975 return; 5976 } 5977 5978 ASSERT(data->zcd_added); 5979 ASSERT3U(data->zcd_txg, !=, 0); 5980 5981 (void) mutex_enter(&zcl.zcl_callbacks_lock); 5982 5983 /* See if this cb was called with less txg delay than any before it */ 5984 if ((synced_txg - data->zcd_txg) < zc_min_txg_delay) 5985 zc_min_txg_delay = synced_txg - data->zcd_txg; 5986 5987 /* Remove our callback from the list */ 5988 list_remove(&zcl.zcl_callbacks, data); 5989 5990 (void) mutex_exit(&zcl.zcl_callbacks_lock); 5991 5992 umem_free(data, sizeof (ztest_cb_data_t)); 5993 } 5994 5995 /* Allocate and initialize callback data structure */ 5996 static ztest_cb_data_t * 5997 ztest_create_cb_data(objset_t *os, uint64_t txg) 5998 { 5999 ztest_cb_data_t *cb_data; 6000 6001 cb_data = umem_zalloc(sizeof (ztest_cb_data_t), UMEM_NOFAIL); 6002 6003 cb_data->zcd_txg = txg; 6004 cb_data->zcd_spa = dmu_objset_spa(os); 6005 list_link_init(&cb_data->zcd_node); 6006 6007 return (cb_data); 6008 } 6009 6010 /* 6011 * Commit callback test. 6012 */ 6013 void 6014 ztest_dmu_commit_callbacks(ztest_ds_t *zd, uint64_t id) 6015 { 6016 objset_t *os = zd->zd_os; 6017 ztest_od_t *od; 6018 dmu_tx_t *tx; 6019 ztest_cb_data_t *cb_data[3], *tmp_cb; 6020 uint64_t old_txg, txg; 6021 int i, error = 0; 6022 6023 od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL); 6024 ztest_od_init(od, id, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0, 0); 6025 6026 if (ztest_object_init(zd, od, sizeof (ztest_od_t), B_FALSE) != 0) { 6027 umem_free(od, sizeof (ztest_od_t)); 6028 return; 6029 } 6030 6031 tx = dmu_tx_create(os); 6032 6033 cb_data[0] = ztest_create_cb_data(os, 0); 6034 dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[0]); 6035 6036 dmu_tx_hold_write(tx, od->od_object, 0, sizeof (uint64_t)); 6037 6038 /* Every once in a while, abort the transaction on purpose */ 6039 if (ztest_random(100) == 0) 6040 error = -1; 6041 6042 if (!error) 6043 error = dmu_tx_assign(tx, TXG_NOWAIT); 6044 6045 txg = error ? 0 : dmu_tx_get_txg(tx); 6046 6047 cb_data[0]->zcd_txg = txg; 6048 cb_data[1] = ztest_create_cb_data(os, txg); 6049 dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[1]); 6050 6051 if (error) { 6052 /* 6053 * It's not a strict requirement to call the registered 6054 * callbacks from inside dmu_tx_abort(), but that is what 6055 * is supposed to happen in the current implementation, 6056 * so we will check for that. 6057 */ 6058 for (i = 0; i < 2; i++) { 6059 cb_data[i]->zcd_expected_err = ECANCELED; 6060 VERIFY(!cb_data[i]->zcd_called); 6061 } 6062 6063 dmu_tx_abort(tx); 6064 6065 for (i = 0; i < 2; i++) { 6066 VERIFY(cb_data[i]->zcd_called); 6067 umem_free(cb_data[i], sizeof (ztest_cb_data_t)); 6068 } 6069 6070 umem_free(od, sizeof (ztest_od_t)); 6071 return; 6072 } 6073 6074 cb_data[2] = ztest_create_cb_data(os, txg); 6075 dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[2]); 6076 6077 /* 6078 * Read existing data to make sure there isn't a future leak.
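* A "future leak" would mean the object already holds a txg value greater than the txg we are about to write, i.e. a write appears to have come from a not-yet-open txg.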
6079 */ 6080 VERIFY0(dmu_read(os, od->od_object, 0, sizeof (uint64_t), 6081 &old_txg, DMU_READ_PREFETCH)); 6082 6083 if (old_txg > txg) 6084 fatal(B_FALSE, 6085 "future leak: got %"PRIu64", open txg is %"PRIu64"", 6086 old_txg, txg); 6087 6088 dmu_write(os, od->od_object, 0, sizeof (uint64_t), &txg, tx); 6089 6090 (void) mutex_enter(&zcl.zcl_callbacks_lock); 6091 6092 /* 6093 * Since commit callbacks don't have any ordering requirement and since 6094 * it is theoretically possible for a commit callback to be called 6095 * after an arbitrary amount of time has elapsed since its txg has been 6096 * synced, it is difficult to reliably determine whether a commit 6097 * callback hasn't been called due to high load or due to a flawed 6098 * implementation. 6099 * 6100 * In practice, we will assume that if after a certain number of txgs a 6101 * commit callback hasn't been called, then most likely there's an 6102 * implementation bug. 6103 */ 6104 tmp_cb = list_head(&zcl.zcl_callbacks); 6105 if (tmp_cb != NULL && 6106 tmp_cb->zcd_txg + ZTEST_COMMIT_CB_THRESH < txg) { 6107 fatal(B_FALSE, 6108 "Commit callback threshold exceeded, " 6109 "oldest txg: %"PRIu64", open txg: %"PRIu64"\n", 6110 tmp_cb->zcd_txg, txg); 6111 } 6112 6113 /* 6114 * Let's find the place to insert our callbacks. 6115 * 6116 * Even though the list is ordered by txg, it is possible for the 6117 * insertion point to not be the end because our txg may already be 6118 * quiescing at this point and other callbacks in the open txg 6119 * (from other objsets) may have sneaked in. 6120 */ 6121 tmp_cb = list_tail(&zcl.zcl_callbacks); 6122 while (tmp_cb != NULL && tmp_cb->zcd_txg > txg) 6123 tmp_cb = list_prev(&zcl.zcl_callbacks, tmp_cb); 6124 6125 /* Add the 3 callbacks to the list */ 6126 for (i = 0; i < 3; i++) { 6127 if (tmp_cb == NULL) 6128 list_insert_head(&zcl.zcl_callbacks, cb_data[i]); 6129 else 6130 list_insert_after(&zcl.zcl_callbacks, tmp_cb, 6131 cb_data[i]); 6132 6133 cb_data[i]->zcd_added = B_TRUE; 6134 VERIFY(!cb_data[i]->zcd_called); 6135 6136 tmp_cb = cb_data[i]; 6137 } 6138 6139 zc_cb_counter += 3; 6140 6141 (void) mutex_exit(&zcl.zcl_callbacks_lock); 6142 6143 dmu_tx_commit(tx); 6144 6145 umem_free(od, sizeof (ztest_od_t)); 6146 } 6147 6148 /* 6149 * Visit each object in the dataset. Verify that its properties 6150 * are consistent with what was stored in the block tag when it was created, 6151 * and that its unused bonus buffer space has not been overwritten.
6152 */ 6153 void 6154 ztest_verify_dnode_bt(ztest_ds_t *zd, uint64_t id) 6155 { 6156 (void) id; 6157 objset_t *os = zd->zd_os; 6158 uint64_t obj; 6159 int err = 0; 6160 6161 for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE, 0)) { 6162 ztest_block_tag_t *bt = NULL; 6163 dmu_object_info_t doi; 6164 dmu_buf_t *db; 6165 6166 ztest_object_lock(zd, obj, ZTRL_READER); 6167 if (dmu_bonus_hold(os, obj, FTAG, &db) != 0) { 6168 ztest_object_unlock(zd, obj); 6169 continue; 6170 } 6171 6172 dmu_object_info_from_db(db, &doi); 6173 if (doi.doi_bonus_size >= sizeof (*bt)) 6174 bt = ztest_bt_bonus(db); 6175 6176 if (bt && bt->bt_magic == BT_MAGIC) { 6177 ztest_bt_verify(bt, os, obj, doi.doi_dnodesize, 6178 bt->bt_offset, bt->bt_gen, bt->bt_txg, 6179 bt->bt_crtxg); 6180 ztest_verify_unused_bonus(db, bt, obj, os, bt->bt_gen); 6181 } 6182 6183 dmu_buf_rele(db, FTAG); 6184 ztest_object_unlock(zd, obj); 6185 } 6186 } 6187 6188 void 6189 ztest_dsl_prop_get_set(ztest_ds_t *zd, uint64_t id) 6190 { 6191 (void) id; 6192 zfs_prop_t proplist[] = { 6193 ZFS_PROP_CHECKSUM, 6194 ZFS_PROP_COMPRESSION, 6195 ZFS_PROP_COPIES, 6196 ZFS_PROP_DEDUP 6197 }; 6198 6199 (void) pthread_rwlock_rdlock(&ztest_name_lock); 6200 6201 for (int p = 0; p < sizeof (proplist) / sizeof (proplist[0]); p++) { 6202 int error = ztest_dsl_prop_set_uint64(zd->zd_name, proplist[p], 6203 ztest_random_dsl_prop(proplist[p]), (int)ztest_random(2)); 6204 ASSERT(error == 0 || error == ENOSPC); 6205 } 6206 6207 int error = ztest_dsl_prop_set_uint64(zd->zd_name, ZFS_PROP_RECORDSIZE, 6208 ztest_random_blocksize(), (int)ztest_random(2)); 6209 ASSERT(error == 0 || error == ENOSPC); 6210 6211 (void) pthread_rwlock_unlock(&ztest_name_lock); 6212 } 6213 6214 void 6215 ztest_spa_prop_get_set(ztest_ds_t *zd, uint64_t id) 6216 { 6217 (void) zd, (void) id; 6218 6219 (void) pthread_rwlock_rdlock(&ztest_name_lock); 6220 6221 (void) ztest_spa_prop_set_uint64(ZPOOL_PROP_AUTOTRIM, ztest_random(2)); 6222 6223 nvlist_t *props = fnvlist_alloc(); 6224 6225 VERIFY0(spa_prop_get(ztest_spa, props)); 6226 6227 if (ztest_opts.zo_verbose >= 6) 6228 dump_nvlist(props, 4); 6229 6230 fnvlist_free(props); 6231 6232 (void) pthread_rwlock_unlock(&ztest_name_lock); 6233 } 6234 6235 static int 6236 user_release_one(const char *snapname, const char *holdname) 6237 { 6238 nvlist_t *snaps, *holds; 6239 int error; 6240 6241 snaps = fnvlist_alloc(); 6242 holds = fnvlist_alloc(); 6243 fnvlist_add_boolean(holds, holdname); 6244 fnvlist_add_nvlist(snaps, snapname, holds); 6245 fnvlist_free(holds); 6246 error = dsl_dataset_user_release(snaps, NULL); 6247 fnvlist_free(snaps); 6248 return (error); 6249 } 6250 6251 /* 6252 * Test snapshot hold/release and deferred destroy. 6253 */ 6254 void 6255 ztest_dmu_snapshot_hold(ztest_ds_t *zd, uint64_t id) 6256 { 6257 int error; 6258 objset_t *os = zd->zd_os; 6259 objset_t *origin; 6260 char snapname[100]; 6261 char fullname[100]; 6262 char clonename[100]; 6263 char tag[100]; 6264 char osname[ZFS_MAX_DATASET_NAME_LEN]; 6265 nvlist_t *holds; 6266 6267 (void) pthread_rwlock_rdlock(&ztest_name_lock); 6268 6269 dmu_objset_name(os, osname); 6270 6271 (void) snprintf(snapname, sizeof (snapname), "sh1_%"PRIu64"", id); 6272 (void) snprintf(fullname, sizeof (fullname), "%s@%s", osname, snapname); 6273 (void) snprintf(clonename, sizeof (clonename), "%s/ch1_%"PRIu64"", 6274 osname, id); 6275 (void) snprintf(tag, sizeof (tag), "tag_%"PRIu64"", id); 6276 6277 /* 6278 * Clean up from any previous run. 
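* A previous invocation may have left behind the clone, the user hold, or the snapshot, so each cleanup step below tolerates a "not found" error.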
6279 */ 6280 error = dsl_destroy_head(clonename); 6281 if (error != ENOENT) 6282 ASSERT0(error); 6283 error = user_release_one(fullname, tag); 6284 if (error != ESRCH && error != ENOENT) 6285 ASSERT0(error); 6286 error = dsl_destroy_snapshot(fullname, B_FALSE); 6287 if (error != ENOENT) 6288 ASSERT0(error); 6289 6290 /* 6291 * Create snapshot, clone it, mark snap for deferred destroy, 6292 * destroy clone, verify snap was also destroyed. 6293 */ 6294 error = dmu_objset_snapshot_one(osname, snapname); 6295 if (error) { 6296 if (error == ENOSPC) { 6297 ztest_record_enospc("dmu_objset_snapshot"); 6298 goto out; 6299 } 6300 fatal(B_FALSE, "dmu_objset_snapshot(%s) = %d", fullname, error); 6301 } 6302 6303 error = dmu_objset_clone(clonename, fullname); 6304 if (error) { 6305 if (error == ENOSPC) { 6306 ztest_record_enospc("dmu_objset_clone"); 6307 goto out; 6308 } 6309 fatal(B_FALSE, "dmu_objset_clone(%s) = %d", clonename, error); 6310 } 6311 6312 error = dsl_destroy_snapshot(fullname, B_TRUE); 6313 if (error) { 6314 fatal(B_FALSE, "dsl_destroy_snapshot(%s, B_TRUE) = %d", 6315 fullname, error); 6316 } 6317 6318 error = dsl_destroy_head(clonename); 6319 if (error) 6320 fatal(B_FALSE, "dsl_destroy_head(%s) = %d", clonename, error); 6321 6322 error = dmu_objset_hold(fullname, FTAG, &origin); 6323 if (error != ENOENT) 6324 fatal(B_FALSE, "dmu_objset_hold(%s) = %d", fullname, error); 6325 6326 /* 6327 * Create snapshot, add temporary hold, verify that we can't 6328 * destroy a held snapshot, mark for deferred destroy, 6329 * release hold, verify snapshot was destroyed. 6330 */ 6331 error = dmu_objset_snapshot_one(osname, snapname); 6332 if (error) { 6333 if (error == ENOSPC) { 6334 ztest_record_enospc("dmu_objset_snapshot"); 6335 goto out; 6336 } 6337 fatal(B_FALSE, "dmu_objset_snapshot(%s) = %d", fullname, error); 6338 } 6339 6340 holds = fnvlist_alloc(); 6341 fnvlist_add_string(holds, fullname, tag); 6342 error = dsl_dataset_user_hold(holds, 0, NULL); 6343 fnvlist_free(holds); 6344 6345 if (error == ENOSPC) { 6346 ztest_record_enospc("dsl_dataset_user_hold"); 6347 goto out; 6348 } else if (error) { 6349 fatal(B_FALSE, "dsl_dataset_user_hold(%s, %s) = %u", 6350 fullname, tag, error); 6351 } 6352 6353 error = dsl_destroy_snapshot(fullname, B_FALSE); 6354 if (error != EBUSY) { 6355 fatal(B_FALSE, "dsl_destroy_snapshot(%s, B_FALSE) = %d", 6356 fullname, error); 6357 } 6358 6359 error = dsl_destroy_snapshot(fullname, B_TRUE); 6360 if (error) { 6361 fatal(B_FALSE, "dsl_destroy_snapshot(%s, B_TRUE) = %d", 6362 fullname, error); 6363 } 6364 6365 error = user_release_one(fullname, tag); 6366 if (error) 6367 fatal(B_FALSE, "user_release_one(%s, %s) = %d", 6368 fullname, tag, error); 6369 6370 VERIFY3U(dmu_objset_hold(fullname, FTAG, &origin), ==, ENOENT); 6371 6372 out: 6373 (void) pthread_rwlock_unlock(&ztest_name_lock); 6374 } 6375 6376 /* 6377 * Inject random faults into the on-disk data. 
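* Two kinds of faults are injected: transient device-level faults (closing the backing file descriptor, or marking a vdev unreadable or unwritable) applied to leaf 0 of the chosen top-level vdev, and data corruption written as a known bad word to carefully chosen offsets of a randomly selected leaf or l2cache device.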
6378 */ 6379 void 6380 ztest_fault_inject(ztest_ds_t *zd, uint64_t id) 6381 { 6382 (void) zd, (void) id; 6383 ztest_shared_t *zs = ztest_shared; 6384 spa_t *spa = ztest_spa; 6385 int fd; 6386 uint64_t offset; 6387 uint64_t leaves; 6388 uint64_t bad = 0x1990c0ffeedecadeull; 6389 uint64_t top, leaf; 6390 uint64_t raidz_children; 6391 char *path0; 6392 char *pathrand; 6393 size_t fsize; 6394 int bshift = SPA_MAXBLOCKSHIFT + 2; 6395 int iters = 1000; 6396 int maxfaults; 6397 int mirror_save; 6398 vdev_t *vd0 = NULL; 6399 uint64_t guid0 = 0; 6400 boolean_t islog = B_FALSE; 6401 boolean_t injected = B_FALSE; 6402 6403 path0 = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); 6404 pathrand = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); 6405 6406 mutex_enter(&ztest_vdev_lock); 6407 6408 /* 6409 * If device removal is in progress, fault injection must be disabled 6410 * until it completes and the pool is scrubbed. The fault injection 6411 * strategy for damaging blocks does not take into account evacuated 6412 * blocks which may have already been damaged. 6413 */ 6414 if (ztest_device_removal_active) 6415 goto out; 6416 6417 /* 6418 * The fault injection strategy for damaging blocks cannot be used 6419 * if raidz expansion is in progress. The leaves value 6420 * (number of attached raidz children) is variable, and the strategy 6421 * for damaging blocks would corrupt the same data blocks on different 6422 * child vdevs because of the reflow process. 6423 */ 6424 if (spa->spa_raidz_expand != NULL) 6425 goto out; 6426 6427 maxfaults = MAXFAULTS(zs); 6428 raidz_children = ztest_get_raidz_children(spa); 6429 leaves = MAX(zs->zs_mirrors, 1) * raidz_children; 6430 mirror_save = zs->zs_mirrors; 6431 6432 ASSERT3U(leaves, >=, 1); 6433 6434 /* 6435 * While ztest is running the number of leaves will not change. This 6436 * is critical for the fault injection logic as it determines where 6437 * errors can be safely injected such that they are always repairable. 6438 * 6439 * When restarting ztest a different number of leaves may be requested 6440 * which will shift the regions to be damaged. This is fine as long 6441 * as the pool has been scrubbed prior to using the new mapping. 6442 * Failure to do so can result in non-repairable damage being injected. 6443 */ 6444 if (ztest_pool_scrubbed == B_FALSE) 6445 goto out; 6446 6447 /* 6448 * Grab the name lock as reader. There are some operations 6449 * which don't like to have their vdevs changed while 6450 * they are in progress (i.e. spa_change_guid). Those 6451 * operations will have grabbed the name lock as writer. 6452 */ 6453 (void) pthread_rwlock_rdlock(&ztest_name_lock); 6454 6455 /* 6456 * We need SCL_STATE here because we're going to look at vd0->vdev_tsd. 6457 */ 6458 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 6459 6460 if (ztest_random(2) == 0) { 6461 /* 6462 * Inject errors on a normal data device or slog device. 6463 */ 6464 top = ztest_random_vdev_top(spa, B_TRUE); 6465 leaf = ztest_random(leaves) + zs->zs_splits; 6466 6467 /* 6468 * Generate paths to the first leaf in this top-level vdev, 6469 * and to the random leaf we selected. We'll induce transient 6470 * write failures and random online/offline activity on leaf 0, 6471 * and we'll write random garbage to the randomly chosen leaf.
6472 */ 6473 (void) snprintf(path0, MAXPATHLEN, ztest_dev_template, 6474 ztest_opts.zo_dir, ztest_opts.zo_pool, 6475 top * leaves + zs->zs_splits); 6476 (void) snprintf(pathrand, MAXPATHLEN, ztest_dev_template, 6477 ztest_opts.zo_dir, ztest_opts.zo_pool, 6478 top * leaves + leaf); 6479 6480 vd0 = vdev_lookup_by_path(spa->spa_root_vdev, path0); 6481 if (vd0 != NULL && vd0->vdev_top->vdev_islog) 6482 islog = B_TRUE; 6483 6484 /* 6485 * If the top-level vdev needs to be resilvered 6486 * then we only allow faults on the device that is 6487 * resilvering. 6488 */ 6489 if (vd0 != NULL && maxfaults != 1 && 6490 (!vdev_resilver_needed(vd0->vdev_top, NULL, NULL) || 6491 vd0->vdev_resilver_txg != 0)) { 6492 /* 6493 * Make vd0 explicitly claim to be unreadable, 6494 * or unwritable, or reach behind its back 6495 * and close the underlying fd. We can do this if 6496 * maxfaults == 0 because we'll fail and reexecute, 6497 * and we can do it if maxfaults >= 2 because we'll 6498 * have enough redundancy. If maxfaults == 1, the 6499 * combination of this with injection of random data 6500 * corruption below exceeds the pool's fault tolerance. 6501 */ 6502 vdev_file_t *vf = vd0->vdev_tsd; 6503 6504 zfs_dbgmsg("injecting fault to vdev %llu; maxfaults=%d", 6505 (long long)vd0->vdev_id, (int)maxfaults); 6506 6507 if (vf != NULL && ztest_random(3) == 0) { 6508 (void) close(vf->vf_file->f_fd); 6509 vf->vf_file->f_fd = -1; 6510 } else if (ztest_random(2) == 0) { 6511 vd0->vdev_cant_read = B_TRUE; 6512 } else { 6513 vd0->vdev_cant_write = B_TRUE; 6514 } 6515 guid0 = vd0->vdev_guid; 6516 } 6517 } else { 6518 /* 6519 * Inject errors on an l2cache device. 6520 */ 6521 spa_aux_vdev_t *sav = &spa->spa_l2cache; 6522 6523 if (sav->sav_count == 0) { 6524 spa_config_exit(spa, SCL_STATE, FTAG); 6525 (void) pthread_rwlock_unlock(&ztest_name_lock); 6526 goto out; 6527 } 6528 vd0 = sav->sav_vdevs[ztest_random(sav->sav_count)]; 6529 guid0 = vd0->vdev_guid; 6530 (void) strlcpy(path0, vd0->vdev_path, MAXPATHLEN); 6531 (void) strlcpy(pathrand, vd0->vdev_path, MAXPATHLEN); 6532 6533 leaf = 0; 6534 leaves = 1; 6535 maxfaults = INT_MAX; /* no limit on cache devices */ 6536 } 6537 6538 spa_config_exit(spa, SCL_STATE, FTAG); 6539 (void) pthread_rwlock_unlock(&ztest_name_lock); 6540 6541 /* 6542 * If we can tolerate two or more faults, or we're dealing 6543 * with a slog, randomly online/offline vd0. 6544 */ 6545 if ((maxfaults >= 2 || islog) && guid0 != 0) { 6546 if (ztest_random(10) < 6) { 6547 int flags = (ztest_random(2) == 0 ? 6548 ZFS_OFFLINE_TEMPORARY : 0); 6549 6550 /* 6551 * We have to grab the zs_name_lock as writer to 6552 * prevent a race between offlining a slog and 6553 * destroying a dataset. Offlining the slog will 6554 * grab a reference on the dataset which may cause 6555 * dsl_destroy_head() to fail with EBUSY thus 6556 * leaving the dataset in an inconsistent state. 6557 */ 6558 if (islog) 6559 (void) pthread_rwlock_wrlock(&ztest_name_lock); 6560 6561 VERIFY3U(vdev_offline(spa, guid0, flags), !=, EBUSY); 6562 6563 if (islog) 6564 (void) pthread_rwlock_unlock(&ztest_name_lock); 6565 } else { 6566 /* 6567 * Ideally we would like to be able to randomly 6568 * call vdev_[on|off]line without holding locks 6569 * to force unpredictable failures but the side 6570 * effects of vdev_[on|off]line prevent us from 6571 * doing so. 
6572 */ 6573 (void) vdev_online(spa, guid0, 0, NULL); 6574 } 6575 } 6576 6577 if (maxfaults == 0) 6578 goto out; 6579 6580 /* 6581 * We have at least single-fault tolerance, so inject data corruption. 6582 */ 6583 fd = open(pathrand, O_RDWR); 6584 6585 if (fd == -1) /* we hit a gap in the device namespace */ 6586 goto out; 6587 6588 fsize = lseek(fd, 0, SEEK_END); 6589 6590 while (--iters != 0) { 6591 /* 6592 * The offset must be chosen carefully to ensure that 6593 * we do not inject a given logical block with errors 6594 * on two different leaf devices, because ZFS can not 6595 * tolerate that (if maxfaults==1). 6596 * 6597 * To achieve this we divide each leaf device into 6598 * chunks of size (# leaves * SPA_MAXBLOCKSIZE * 4). 6599 * Each chunk is further divided into error-injection 6600 * ranges (can accept errors) and clear ranges (we do 6601 * not inject errors in those). Each error-injection 6602 * range can accept errors only for a single leaf vdev. 6603 * Error-injection ranges are separated by clear ranges. 6604 * 6605 * For example, with 3 leaves, each chunk looks like: 6606 * 0 to 32M: injection range for leaf 0 6607 * 32M to 64M: clear range - no injection allowed 6608 * 64M to 96M: injection range for leaf 1 6609 * 96M to 128M: clear range - no injection allowed 6610 * 128M to 160M: injection range for leaf 2 6611 * 160M to 192M: clear range - no injection allowed 6612 * 6613 * Each clear range must be large enough such that a 6614 * single block cannot straddle it. This way a block 6615 * can't be a target in two different injection ranges 6616 * (on different leaf vdevs). 6617 */ 6618 offset = ztest_random(fsize / (leaves << bshift)) * 6619 (leaves << bshift) + (leaf << bshift) + 6620 (ztest_random(1ULL << (bshift - 1)) & -8ULL); 6621 6622 /* 6623 * Only allow damage to the labels at one end of the vdev. 6624 * 6625 * If all labels are damaged, the device will be totally 6626 * inaccessible, which will result in loss of data, 6627 * because we also damage (parts of) the other side of 6628 * the mirror/raidz. 6629 * 6630 * Additionally, we will always have both an even and an 6631 * odd label, so that we can handle crashes in the 6632 * middle of vdev_config_sync(). 6633 */ 6634 if ((leaf & 1) == 0 && offset < VDEV_LABEL_START_SIZE) 6635 continue; 6636 6637 /* 6638 * The two end labels are stored at the "end" of the disk, but 6639 * the end of the disk (vdev_psize) is aligned to 6640 * sizeof (vdev_label_t). 
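* For odd-numbered leaves this means skipping any write that would land in the last VDEV_LABEL_END_SIZE bytes below that aligned size, keeping their end labels intact.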
6641 */ 6642 uint64_t psize = P2ALIGN_TYPED(fsize, sizeof (vdev_label_t), 6643 uint64_t); 6644 if ((leaf & 1) == 1 && 6645 offset + sizeof (bad) > psize - VDEV_LABEL_END_SIZE) 6646 continue; 6647 6648 if (mirror_save != zs->zs_mirrors) { 6649 (void) close(fd); 6650 goto out; 6651 } 6652 6653 if (pwrite(fd, &bad, sizeof (bad), offset) != sizeof (bad)) 6654 fatal(B_TRUE, 6655 "can't inject bad word at 0x%"PRIx64" in %s", 6656 offset, pathrand); 6657 6658 if (ztest_opts.zo_verbose >= 7) 6659 (void) printf("injected bad word into %s," 6660 " offset 0x%"PRIx64"\n", pathrand, offset); 6661 6662 injected = B_TRUE; 6663 } 6664 6665 (void) close(fd); 6666 out: 6667 mutex_exit(&ztest_vdev_lock); 6668 6669 if (injected && ztest_opts.zo_raid_do_expand) { 6670 int error = spa_scan(spa, POOL_SCAN_SCRUB); 6671 if (error == 0) { 6672 while (dsl_scan_scrubbing(spa_get_dsl(spa))) 6673 txg_wait_synced(spa_get_dsl(spa), 0); 6674 } 6675 } 6676 6677 umem_free(path0, MAXPATHLEN); 6678 umem_free(pathrand, MAXPATHLEN); 6679 } 6680 6681 /* 6682 * By design ztest will never inject uncorrectable damage into the pool. 6683 * Issue a scrub, wait for it to complete, and verify there is never any 6684 * persistent damage. 6685 * 6686 * Only after a full scrub has been completed is it safe to start injecting 6687 * data corruption. See the comment in ztest_fault_inject(). 6688 */ 6689 static int 6690 ztest_scrub_impl(spa_t *spa) 6691 { 6692 int error = spa_scan(spa, POOL_SCAN_SCRUB); 6693 if (error) 6694 return (error); 6695 6696 while (dsl_scan_scrubbing(spa_get_dsl(spa))) 6697 txg_wait_synced(spa_get_dsl(spa), 0); 6698 6699 if (spa_approx_errlog_size(spa) > 0) 6700 return (ECKSUM); 6701 6702 ztest_pool_scrubbed = B_TRUE; 6703 6704 return (0); 6705 } 6706 6707 /* 6708 * Scrub the pool. 6709 */ 6710 void 6711 ztest_scrub(ztest_ds_t *zd, uint64_t id) 6712 { 6713 (void) zd, (void) id; 6714 spa_t *spa = ztest_spa; 6715 int error; 6716 6717 /* 6718 * A scrub is already in progress due to device removal. 6719 */ 6720 if (ztest_device_removal_active) 6721 return; 6722 6723 /* 6724 * Start a scrub, wait a moment, then force a restart. 6725 */ 6726 (void) spa_scan(spa, POOL_SCAN_SCRUB); 6727 (void) poll(NULL, 0, 100); 6728 6729 error = ztest_scrub_impl(spa); 6730 if (error == EBUSY) 6731 error = 0; 6732 ASSERT0(error); 6733 } 6734 6735 /* 6736 * Change the guid for the pool.
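* The pool guid must change while the load guid must remain the same.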
6737 */ 6738 void 6739 ztest_reguid(ztest_ds_t *zd, uint64_t id) 6740 { 6741 (void) zd, (void) id; 6742 spa_t *spa = ztest_spa; 6743 uint64_t orig, load; 6744 int error; 6745 ztest_shared_t *zs = ztest_shared; 6746 6747 if (ztest_opts.zo_mmp_test) 6748 return; 6749 6750 orig = spa_guid(spa); 6751 load = spa_load_guid(spa); 6752 6753 (void) pthread_rwlock_wrlock(&ztest_name_lock); 6754 error = spa_change_guid(spa, NULL); 6755 zs->zs_guid = spa_guid(spa); 6756 (void) pthread_rwlock_unlock(&ztest_name_lock); 6757 6758 if (error != 0) 6759 return; 6760 6761 if (ztest_opts.zo_verbose >= 4) { 6762 (void) printf("Changed guid old %"PRIu64" -> %"PRIu64"\n", 6763 orig, spa_guid(spa)); 6764 } 6765 6766 VERIFY3U(orig, !=, spa_guid(spa)); 6767 VERIFY3U(load, ==, spa_load_guid(spa)); 6768 } 6769 6770 void 6771 ztest_blake3(ztest_ds_t *zd, uint64_t id) 6772 { 6773 (void) zd, (void) id; 6774 hrtime_t end = gethrtime() + NANOSEC; 6775 zio_cksum_salt_t salt; 6776 void *salt_ptr = &salt.zcs_bytes; 6777 struct abd *abd_data, *abd_meta; 6778 void *buf, *templ; 6779 int i, *ptr; 6780 uint32_t size; 6781 BLAKE3_CTX ctx; 6782 const zfs_impl_t *blake3 = zfs_impl_get_ops("blake3"); 6783 6784 size = ztest_random_blocksize(); 6785 buf = umem_alloc(size, UMEM_NOFAIL); 6786 abd_data = abd_alloc(size, B_FALSE); 6787 abd_meta = abd_alloc(size, B_TRUE); 6788 6789 for (i = 0, ptr = buf; i < size / sizeof (*ptr); i++, ptr++) 6790 *ptr = ztest_random(UINT_MAX); 6791 memset(salt_ptr, 'A', 32); 6792 6793 abd_copy_from_buf_off(abd_data, buf, 0, size); 6794 abd_copy_from_buf_off(abd_meta, buf, 0, size); 6795 6796 while (gethrtime() <= end) { 6797 int run_count = 100; 6798 zio_cksum_t zc_ref1, zc_ref2; 6799 zio_cksum_t zc_res1, zc_res2; 6800 6801 void *ref1 = &zc_ref1; 6802 void *ref2 = &zc_ref2; 6803 void *res1 = &zc_res1; 6804 void *res2 = &zc_res2; 6805 6806 /* BLAKE3_KEY_LEN = 32 */ 6807 VERIFY0(blake3->setname("generic")); 6808 templ = abd_checksum_blake3_tmpl_init(&salt); 6809 Blake3_InitKeyed(&ctx, salt_ptr); 6810 Blake3_Update(&ctx, buf, size); 6811 Blake3_Final(&ctx, ref1); 6812 zc_ref2 = zc_ref1; 6813 ZIO_CHECKSUM_BSWAP(&zc_ref2); 6814 abd_checksum_blake3_tmpl_free(templ); 6815 6816 VERIFY0(blake3->setname("cycle")); 6817 while (run_count-- > 0) { 6818 6819 /* Test current implementation */ 6820 Blake3_InitKeyed(&ctx, salt_ptr); 6821 Blake3_Update(&ctx, buf, size); 6822 Blake3_Final(&ctx, res1); 6823 zc_res2 = zc_res1; 6824 ZIO_CHECKSUM_BSWAP(&zc_res2); 6825 6826 VERIFY0(memcmp(ref1, res1, 32)); 6827 VERIFY0(memcmp(ref2, res2, 32)); 6828 6829 /* Test ABD - data */ 6830 templ = abd_checksum_blake3_tmpl_init(&salt); 6831 abd_checksum_blake3_native(abd_data, size, 6832 templ, &zc_res1); 6833 abd_checksum_blake3_byteswap(abd_data, size, 6834 templ, &zc_res2); 6835 6836 VERIFY0(memcmp(ref1, res1, 32)); 6837 VERIFY0(memcmp(ref2, res2, 32)); 6838 6839 /* Test ABD - metadata */ 6840 abd_checksum_blake3_native(abd_meta, size, 6841 templ, &zc_res1); 6842 abd_checksum_blake3_byteswap(abd_meta, size, 6843 templ, &zc_res2); 6844 abd_checksum_blake3_tmpl_free(templ); 6845 6846 VERIFY0(memcmp(ref1, res1, 32)); 6847 VERIFY0(memcmp(ref2, res2, 32)); 6848 6849 } 6850 } 6851 6852 abd_free(abd_data); 6853 abd_free(abd_meta); 6854 umem_free(buf, size); 6855 } 6856 6857 void 6858 ztest_fletcher(ztest_ds_t *zd, uint64_t id) 6859 { 6860 (void) zd, (void) id; 6861 hrtime_t end = gethrtime() + NANOSEC; 6862 6863 while (gethrtime() <= end) { 6864 int run_count = 100; 6865 void *buf; 6866 struct abd *abd_data, *abd_meta; 6867 uint32_t size; 6868 
int *ptr; 6869 int i; 6870 zio_cksum_t zc_ref; 6871 zio_cksum_t zc_ref_byteswap; 6872 6873 size = ztest_random_blocksize(); 6874 6875 buf = umem_alloc(size, UMEM_NOFAIL); 6876 abd_data = abd_alloc(size, B_FALSE); 6877 abd_meta = abd_alloc(size, B_TRUE); 6878 6879 for (i = 0, ptr = buf; i < size / sizeof (*ptr); i++, ptr++) 6880 *ptr = ztest_random(UINT_MAX); 6881 6882 abd_copy_from_buf_off(abd_data, buf, 0, size); 6883 abd_copy_from_buf_off(abd_meta, buf, 0, size); 6884 6885 VERIFY0(fletcher_4_impl_set("scalar")); 6886 fletcher_4_native(buf, size, NULL, &zc_ref); 6887 fletcher_4_byteswap(buf, size, NULL, &zc_ref_byteswap); 6888 6889 VERIFY0(fletcher_4_impl_set("cycle")); 6890 while (run_count-- > 0) { 6891 zio_cksum_t zc; 6892 zio_cksum_t zc_byteswap; 6893 6894 fletcher_4_byteswap(buf, size, NULL, &zc_byteswap); 6895 fletcher_4_native(buf, size, NULL, &zc); 6896 6897 VERIFY0(memcmp(&zc, &zc_ref, sizeof (zc))); 6898 VERIFY0(memcmp(&zc_byteswap, &zc_ref_byteswap, 6899 sizeof (zc_byteswap))); 6900 6901 /* Test ABD - data */ 6902 abd_fletcher_4_byteswap(abd_data, size, NULL, 6903 &zc_byteswap); 6904 abd_fletcher_4_native(abd_data, size, NULL, &zc); 6905 6906 VERIFY0(memcmp(&zc, &zc_ref, sizeof (zc))); 6907 VERIFY0(memcmp(&zc_byteswap, &zc_ref_byteswap, 6908 sizeof (zc_byteswap))); 6909 6910 /* Test ABD - metadata */ 6911 abd_fletcher_4_byteswap(abd_meta, size, NULL, 6912 &zc_byteswap); 6913 abd_fletcher_4_native(abd_meta, size, NULL, &zc); 6914 6915 VERIFY0(memcmp(&zc, &zc_ref, sizeof (zc))); 6916 VERIFY0(memcmp(&zc_byteswap, &zc_ref_byteswap, 6917 sizeof (zc_byteswap))); 6918 6919 } 6920 6921 umem_free(buf, size); 6922 abd_free(abd_data); 6923 abd_free(abd_meta); 6924 } 6925 } 6926 6927 void 6928 ztest_fletcher_incr(ztest_ds_t *zd, uint64_t id) 6929 { 6930 (void) zd, (void) id; 6931 void *buf; 6932 size_t size; 6933 int *ptr; 6934 int i; 6935 zio_cksum_t zc_ref; 6936 zio_cksum_t zc_ref_bswap; 6937 6938 hrtime_t end = gethrtime() + NANOSEC; 6939 6940 while (gethrtime() <= end) { 6941 int run_count = 100; 6942 6943 size = ztest_random_blocksize(); 6944 buf = umem_alloc(size, UMEM_NOFAIL); 6945 6946 for (i = 0, ptr = buf; i < size / sizeof (*ptr); i++, ptr++) 6947 *ptr = ztest_random(UINT_MAX); 6948 6949 VERIFY0(fletcher_4_impl_set("scalar")); 6950 fletcher_4_native(buf, size, NULL, &zc_ref); 6951 fletcher_4_byteswap(buf, size, NULL, &zc_ref_bswap); 6952 6953 VERIFY0(fletcher_4_impl_set("cycle")); 6954 6955 while (run_count-- > 0) { 6956 zio_cksum_t zc; 6957 zio_cksum_t zc_bswap; 6958 size_t pos = 0; 6959 6960 ZIO_SET_CHECKSUM(&zc, 0, 0, 0, 0); 6961 ZIO_SET_CHECKSUM(&zc_bswap, 0, 0, 0, 0); 6962 6963 while (pos < size) { 6964 size_t inc = 64 * ztest_random(size / 67); 6965 /* sometimes add few bytes to test non-simd */ 6966 if (ztest_random(100) < 10) 6967 inc += P2ALIGN_TYPED(ztest_random(64), 6968 sizeof (uint32_t), uint64_t); 6969 6970 if (inc > (size - pos)) 6971 inc = size - pos; 6972 6973 fletcher_4_incremental_native(buf + pos, inc, 6974 &zc); 6975 fletcher_4_incremental_byteswap(buf + pos, inc, 6976 &zc_bswap); 6977 6978 pos += inc; 6979 } 6980 6981 VERIFY3U(pos, ==, size); 6982 6983 VERIFY(ZIO_CHECKSUM_EQUAL(zc, zc_ref)); 6984 VERIFY(ZIO_CHECKSUM_EQUAL(zc_bswap, zc_ref_bswap)); 6985 6986 /* 6987 * verify if incremental on the whole buffer is 6988 * equivalent to non-incremental version 6989 */ 6990 ZIO_SET_CHECKSUM(&zc, 0, 0, 0, 0); 6991 ZIO_SET_CHECKSUM(&zc_bswap, 0, 0, 0, 0); 6992 6993 fletcher_4_incremental_native(buf, size, &zc); 6994 fletcher_4_incremental_byteswap(buf, size, 
&zc_bswap); 6995 6996 VERIFY(ZIO_CHECKSUM_EQUAL(zc, zc_ref)); 6997 VERIFY(ZIO_CHECKSUM_EQUAL(zc_bswap, zc_ref_bswap)); 6998 } 6999 7000 umem_free(buf, size); 7001 } 7002 } 7003 7004 void 7005 ztest_pool_prefetch_ddt(ztest_ds_t *zd, uint64_t id) 7006 { 7007 (void) zd, (void) id; 7008 spa_t *spa; 7009 7010 (void) pthread_rwlock_rdlock(&ztest_name_lock); 7011 VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG)); 7012 7013 ddt_prefetch_all(spa); 7014 7015 spa_close(spa, FTAG); 7016 (void) pthread_rwlock_unlock(&ztest_name_lock); 7017 } 7018 7019 static int 7020 ztest_set_global_vars(void) 7021 { 7022 for (size_t i = 0; i < ztest_opts.zo_gvars_count; i++) { 7023 char *kv = ztest_opts.zo_gvars[i]; 7024 VERIFY3U(strlen(kv), <=, ZO_GVARS_MAX_ARGLEN); 7025 VERIFY3U(strlen(kv), >, 0); 7026 int err = set_global_var(kv); 7027 if (ztest_opts.zo_verbose > 0) { 7028 (void) printf("setting global var %s ... %s\n", kv, 7029 err ? "failed" : "ok"); 7030 } 7031 if (err != 0) { 7032 (void) fprintf(stderr, 7033 "failed to set global var '%s'\n", kv); 7034 return (err); 7035 } 7036 } 7037 return (0); 7038 } 7039 7040 static char ** 7041 ztest_global_vars_to_zdb_args(void) 7042 { 7043 char **args = calloc(2*ztest_opts.zo_gvars_count + 1, sizeof (char *)); 7044 char **cur = args; 7045 if (args == NULL) 7046 return (NULL); 7047 for (size_t i = 0; i < ztest_opts.zo_gvars_count; i++) { 7048 *cur++ = (char *)"-o"; 7049 *cur++ = ztest_opts.zo_gvars[i]; 7050 } 7051 ASSERT3P(cur, ==, &args[2*ztest_opts.zo_gvars_count]); 7052 *cur = NULL; 7053 return (args); 7054 } 7055 7056 /* The end of strings is indicated by a NULL element */ 7057 static char * 7058 join_strings(char **strings, const char *sep) 7059 { 7060 size_t totallen = 0; 7061 for (char **sp = strings; *sp != NULL; sp++) { 7062 totallen += strlen(*sp); 7063 totallen += strlen(sep); 7064 } 7065 if (totallen > 0) { 7066 ASSERT(totallen >= strlen(sep)); 7067 totallen -= strlen(sep); 7068 } 7069 7070 size_t buflen = totallen + 1; 7071 char *o = umem_alloc(buflen, UMEM_NOFAIL); /* trailing 0 byte */ 7072 o[0] = '\0'; 7073 for (char **sp = strings; *sp != NULL; sp++) { 7074 size_t would; 7075 would = strlcat(o, *sp, buflen); 7076 VERIFY3U(would, <, buflen); 7077 if (*(sp+1) == NULL) { 7078 break; 7079 } 7080 would = strlcat(o, sep, buflen); 7081 VERIFY3U(would, <, buflen); 7082 } 7083 ASSERT3S(strlen(o), ==, totallen); 7084 return (o); 7085 } 7086 7087 static int 7088 ztest_check_path(char *path) 7089 { 7090 struct stat s; 7091 /* return true on success */ 7092 return (!stat(path, &s)); 7093 } 7094 7095 static void 7096 ztest_get_zdb_bin(char *bin, int len) 7097 { 7098 char *zdb_path; 7099 /* 7100 * Try to use $ZDB and in-tree zdb path. If not successful, just 7101 * let popen to search through PATH. 
7102 */ 7103 if ((zdb_path = getenv("ZDB"))) { 7104 strlcpy(bin, zdb_path, len); /* In env */ 7105 if (!ztest_check_path(bin)) { 7106 ztest_dump_core = 0; 7107 fatal(B_TRUE, "invalid ZDB '%s'", bin); 7108 } 7109 return; 7110 } 7111 7112 VERIFY3P(realpath(getexecname(), bin), !=, NULL); 7113 if (strstr(bin, ".libs/ztest")) { 7114 strstr(bin, ".libs/ztest")[0] = '\0'; /* In-tree */ 7115 strcat(bin, "zdb"); 7116 if (ztest_check_path(bin)) 7117 return; 7118 } 7119 strcpy(bin, "zdb"); 7120 } 7121 7122 static vdev_t * 7123 ztest_random_concrete_vdev_leaf(vdev_t *vd) 7124 { 7125 if (vd == NULL) 7126 return (NULL); 7127 7128 if (vd->vdev_children == 0) 7129 return (vd); 7130 7131 vdev_t *eligible[vd->vdev_children]; 7132 int eligible_idx = 0, i; 7133 for (i = 0; i < vd->vdev_children; i++) { 7134 vdev_t *cvd = vd->vdev_child[i]; 7135 if (cvd->vdev_top->vdev_removing) 7136 continue; 7137 if (cvd->vdev_children > 0 || 7138 (vdev_is_concrete(cvd) && !cvd->vdev_detached)) { 7139 eligible[eligible_idx++] = cvd; 7140 } 7141 } 7142 VERIFY3S(eligible_idx, >, 0); 7143 7144 uint64_t child_no = ztest_random(eligible_idx); 7145 return (ztest_random_concrete_vdev_leaf(eligible[child_no])); 7146 } 7147 7148 void 7149 ztest_initialize(ztest_ds_t *zd, uint64_t id) 7150 { 7151 (void) zd, (void) id; 7152 spa_t *spa = ztest_spa; 7153 int error = 0; 7154 7155 mutex_enter(&ztest_vdev_lock); 7156 7157 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 7158 7159 /* Random leaf vdev */ 7160 vdev_t *rand_vd = ztest_random_concrete_vdev_leaf(spa->spa_root_vdev); 7161 if (rand_vd == NULL) { 7162 spa_config_exit(spa, SCL_VDEV, FTAG); 7163 mutex_exit(&ztest_vdev_lock); 7164 return; 7165 } 7166 7167 /* 7168 * The random vdev we've selected may change as soon as we 7169 * drop the spa_config_lock. We create local copies of things 7170 * we're interested in. 
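* Here that means the vdev guid, its path, and whether an initialize thread is currently active on it.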
7171 */ 7172 uint64_t guid = rand_vd->vdev_guid; 7173 char *path = strdup(rand_vd->vdev_path); 7174 boolean_t active = rand_vd->vdev_initialize_thread != NULL; 7175 7176 zfs_dbgmsg("vd %px, guid %llu", rand_vd, (u_longlong_t)guid); 7177 spa_config_exit(spa, SCL_VDEV, FTAG); 7178 7179 uint64_t cmd = ztest_random(POOL_INITIALIZE_FUNCS); 7180 7181 nvlist_t *vdev_guids = fnvlist_alloc(); 7182 nvlist_t *vdev_errlist = fnvlist_alloc(); 7183 fnvlist_add_uint64(vdev_guids, path, guid); 7184 error = spa_vdev_initialize(spa, vdev_guids, cmd, vdev_errlist); 7185 fnvlist_free(vdev_guids); 7186 fnvlist_free(vdev_errlist); 7187 7188 switch (cmd) { 7189 case POOL_INITIALIZE_CANCEL: 7190 if (ztest_opts.zo_verbose >= 4) { 7191 (void) printf("Cancel initialize %s", path); 7192 if (!active) 7193 (void) printf(" failed (no initialize active)"); 7194 (void) printf("\n"); 7195 } 7196 break; 7197 case POOL_INITIALIZE_START: 7198 if (ztest_opts.zo_verbose >= 4) { 7199 (void) printf("Start initialize %s", path); 7200 if (active && error == 0) 7201 (void) printf(" failed (already active)"); 7202 else if (error != 0) 7203 (void) printf(" failed (error %d)", error); 7204 (void) printf("\n"); 7205 } 7206 break; 7207 case POOL_INITIALIZE_SUSPEND: 7208 if (ztest_opts.zo_verbose >= 4) { 7209 (void) printf("Suspend initialize %s", path); 7210 if (!active) 7211 (void) printf(" failed (no initialize active)"); 7212 (void) printf("\n"); 7213 } 7214 break; 7215 } 7216 free(path); 7217 mutex_exit(&ztest_vdev_lock); 7218 } 7219 7220 void 7221 ztest_trim(ztest_ds_t *zd, uint64_t id) 7222 { 7223 (void) zd, (void) id; 7224 spa_t *spa = ztest_spa; 7225 int error = 0; 7226 7227 mutex_enter(&ztest_vdev_lock); 7228 7229 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 7230 7231 /* Random leaf vdev */ 7232 vdev_t *rand_vd = ztest_random_concrete_vdev_leaf(spa->spa_root_vdev); 7233 if (rand_vd == NULL) { 7234 spa_config_exit(spa, SCL_VDEV, FTAG); 7235 mutex_exit(&ztest_vdev_lock); 7236 return; 7237 } 7238 7239 /* 7240 * The random vdev we've selected may change as soon as we 7241 * drop the spa_config_lock. We create local copies of things 7242 * we're interested in. 
7243 */ 7244 uint64_t guid = rand_vd->vdev_guid; 7245 char *path = strdup(rand_vd->vdev_path); 7246 boolean_t active = rand_vd->vdev_trim_thread != NULL; 7247 7248 zfs_dbgmsg("vd %p, guid %llu", rand_vd, (u_longlong_t)guid); 7249 spa_config_exit(spa, SCL_VDEV, FTAG); 7250 7251 uint64_t cmd = ztest_random(POOL_TRIM_FUNCS); 7252 uint64_t rate = 1 << ztest_random(30); 7253 boolean_t partial = (ztest_random(5) > 0); 7254 boolean_t secure = (ztest_random(5) > 0); 7255 7256 nvlist_t *vdev_guids = fnvlist_alloc(); 7257 nvlist_t *vdev_errlist = fnvlist_alloc(); 7258 fnvlist_add_uint64(vdev_guids, path, guid); 7259 error = spa_vdev_trim(spa, vdev_guids, cmd, rate, partial, 7260 secure, vdev_errlist); 7261 fnvlist_free(vdev_guids); 7262 fnvlist_free(vdev_errlist); 7263 7264 switch (cmd) { 7265 case POOL_TRIM_CANCEL: 7266 if (ztest_opts.zo_verbose >= 4) { 7267 (void) printf("Cancel TRIM %s", path); 7268 if (!active) 7269 (void) printf(" failed (no TRIM active)"); 7270 (void) printf("\n"); 7271 } 7272 break; 7273 case POOL_TRIM_START: 7274 if (ztest_opts.zo_verbose >= 4) { 7275 (void) printf("Start TRIM %s", path); 7276 if (active && error == 0) 7277 (void) printf(" failed (already active)"); 7278 else if (error != 0) 7279 (void) printf(" failed (error %d)", error); 7280 (void) printf("\n"); 7281 } 7282 break; 7283 case POOL_TRIM_SUSPEND: 7284 if (ztest_opts.zo_verbose >= 4) { 7285 (void) printf("Suspend TRIM %s", path); 7286 if (!active) 7287 (void) printf(" failed (no TRIM active)"); 7288 (void) printf("\n"); 7289 } 7290 break; 7291 } 7292 free(path); 7293 mutex_exit(&ztest_vdev_lock); 7294 } 7295 7296 void 7297 ztest_ddt_prune(ztest_ds_t *zd, uint64_t id) 7298 { 7299 (void) zd, (void) id; 7300 7301 spa_t *spa = ztest_spa; 7302 uint64_t pct = ztest_random(15) + 1; 7303 7304 (void) ddt_prune_unique_entries(spa, ZPOOL_DDT_PRUNE_PERCENTAGE, pct); 7305 } 7306 7307 /* 7308 * Verify pool integrity by running zdb. 7309 */ 7310 static void 7311 ztest_run_zdb(uint64_t guid) 7312 { 7313 int status; 7314 char *bin; 7315 char *zdb; 7316 char *zbuf; 7317 const int len = MAXPATHLEN + MAXNAMELEN + 20; 7318 FILE *fp; 7319 7320 bin = umem_alloc(len, UMEM_NOFAIL); 7321 zdb = umem_alloc(len, UMEM_NOFAIL); 7322 zbuf = umem_alloc(1024, UMEM_NOFAIL); 7323 7324 ztest_get_zdb_bin(bin, len); 7325 7326 char **set_gvars_args = ztest_global_vars_to_zdb_args(); 7327 if (set_gvars_args == NULL) { 7328 fatal(B_FALSE, "Failed to allocate memory in " 7329 "ztest_global_vars_to_zdb_args(). Cannot run zdb.\n"); 7330 } 7331 char *set_gvars_args_joined = join_strings(set_gvars_args, " "); 7332 free(set_gvars_args); 7333 7334 size_t would = snprintf(zdb, len, 7335 "%s -bcc%s%s -G -d -Y -e -y %s -p %s %"PRIu64, 7336 bin, 7337 ztest_opts.zo_verbose >= 3 ? "s" : "", 7338 ztest_opts.zo_verbose >= 4 ? 
"v" : "", 7339 set_gvars_args_joined, 7340 ztest_opts.zo_dir, 7341 guid); 7342 ASSERT3U(would, <, len); 7343 7344 umem_free(set_gvars_args_joined, strlen(set_gvars_args_joined) + 1); 7345 7346 if (ztest_opts.zo_verbose >= 5) 7347 (void) printf("Executing %s\n", zdb); 7348 7349 fp = popen(zdb, "r"); 7350 7351 while (fgets(zbuf, 1024, fp) != NULL) 7352 if (ztest_opts.zo_verbose >= 3) 7353 (void) printf("%s", zbuf); 7354 7355 status = pclose(fp); 7356 7357 if (status == 0) 7358 goto out; 7359 7360 ztest_dump_core = 0; 7361 if (WIFEXITED(status)) 7362 fatal(B_FALSE, "'%s' exit code %d", zdb, WEXITSTATUS(status)); 7363 else 7364 fatal(B_FALSE, "'%s' died with signal %d", 7365 zdb, WTERMSIG(status)); 7366 out: 7367 umem_free(bin, len); 7368 umem_free(zdb, len); 7369 umem_free(zbuf, 1024); 7370 } 7371 7372 static void 7373 ztest_walk_pool_directory(const char *header) 7374 { 7375 spa_t *spa = NULL; 7376 7377 if (ztest_opts.zo_verbose >= 6) 7378 (void) puts(header); 7379 7380 mutex_enter(&spa_namespace_lock); 7381 while ((spa = spa_next(spa)) != NULL) 7382 if (ztest_opts.zo_verbose >= 6) 7383 (void) printf("\t%s\n", spa_name(spa)); 7384 mutex_exit(&spa_namespace_lock); 7385 } 7386 7387 static void 7388 ztest_spa_import_export(char *oldname, char *newname) 7389 { 7390 nvlist_t *config, *newconfig; 7391 uint64_t pool_guid; 7392 spa_t *spa; 7393 int error; 7394 7395 if (ztest_opts.zo_verbose >= 4) { 7396 (void) printf("import/export: old = %s, new = %s\n", 7397 oldname, newname); 7398 } 7399 7400 /* 7401 * Clean up from previous runs. 7402 */ 7403 (void) spa_destroy(newname); 7404 7405 /* 7406 * Get the pool's configuration and guid. 7407 */ 7408 VERIFY0(spa_open(oldname, &spa, FTAG)); 7409 7410 /* 7411 * Kick off a scrub to tickle scrub/export races. 7412 */ 7413 if (ztest_random(2) == 0) 7414 (void) spa_scan(spa, POOL_SCAN_SCRUB); 7415 7416 pool_guid = spa_guid(spa); 7417 spa_close(spa, FTAG); 7418 7419 ztest_walk_pool_directory("pools before export"); 7420 7421 /* 7422 * Export it. 7423 */ 7424 VERIFY0(spa_export(oldname, &config, B_FALSE, B_FALSE)); 7425 7426 ztest_walk_pool_directory("pools after export"); 7427 7428 /* 7429 * Try to import it. 7430 */ 7431 newconfig = spa_tryimport(config); 7432 ASSERT3P(newconfig, !=, NULL); 7433 fnvlist_free(newconfig); 7434 7435 /* 7436 * Import it under the new name. 7437 */ 7438 error = spa_import(newname, config, NULL, 0); 7439 if (error != 0) { 7440 dump_nvlist(config, 0); 7441 fatal(B_FALSE, "couldn't import pool %s as %s: error %u", 7442 oldname, newname, error); 7443 } 7444 7445 ztest_walk_pool_directory("pools after import"); 7446 7447 /* 7448 * Try to import it again -- should fail with EEXIST. 7449 */ 7450 VERIFY3U(EEXIST, ==, spa_import(newname, config, NULL, 0)); 7451 7452 /* 7453 * Try to import it under a different name -- should fail with EEXIST. 7454 */ 7455 VERIFY3U(EEXIST, ==, spa_import(oldname, config, NULL, 0)); 7456 7457 /* 7458 * Verify that the pool is no longer visible under the old name. 7459 */ 7460 VERIFY3U(ENOENT, ==, spa_open(oldname, &spa, FTAG)); 7461 7462 /* 7463 * Verify that we can open and close the pool using the new name. 
7464 */ 7465 VERIFY0(spa_open(newname, &spa, FTAG)); 7466 ASSERT3U(pool_guid, ==, spa_guid(spa)); 7467 spa_close(spa, FTAG); 7468 7469 fnvlist_free(config); 7470 } 7471 7472 static void 7473 ztest_resume(spa_t *spa) 7474 { 7475 if (spa_suspended(spa) && ztest_opts.zo_verbose >= 6) 7476 (void) printf("resuming from suspended state\n"); 7477 spa_vdev_state_enter(spa, SCL_NONE); 7478 vdev_clear(spa, NULL); 7479 (void) spa_vdev_state_exit(spa, NULL, 0); 7480 (void) zio_resume(spa); 7481 } 7482 7483 static __attribute__((noreturn)) void 7484 ztest_resume_thread(void *arg) 7485 { 7486 spa_t *spa = arg; 7487 7488 /* 7489 * Synthesize aged DDT entries for ddt prune testing 7490 */ 7491 ddt_prune_artificial_age = B_TRUE; 7492 if (ztest_opts.zo_verbose >= 3) 7493 ddt_dump_prune_histogram = B_TRUE; 7494 7495 while (!ztest_exiting) { 7496 if (spa_suspended(spa)) 7497 ztest_resume(spa); 7498 (void) poll(NULL, 0, 100); 7499 7500 /* 7501 * Periodically change the zfs_compressed_arc_enabled setting. 7502 */ 7503 if (ztest_random(10) == 0) 7504 zfs_compressed_arc_enabled = ztest_random(2); 7505 7506 /* 7507 * Periodically change the zfs_abd_scatter_enabled setting. 7508 */ 7509 if (ztest_random(10) == 0) 7510 zfs_abd_scatter_enabled = ztest_random(2); 7511 } 7512 7513 thread_exit(); 7514 } 7515 7516 static __attribute__((noreturn)) void 7517 ztest_deadman_thread(void *arg) 7518 { 7519 ztest_shared_t *zs = arg; 7520 spa_t *spa = ztest_spa; 7521 hrtime_t delay, overdue, last_run = gethrtime(); 7522 7523 delay = (zs->zs_thread_stop - zs->zs_thread_start) + 7524 MSEC2NSEC(zfs_deadman_synctime_ms); 7525 7526 while (!ztest_exiting) { 7527 /* 7528 * Wait for the delay timer while checking occasionally 7529 * if we should stop. 7530 */ 7531 if (gethrtime() < last_run + delay) { 7532 (void) poll(NULL, 0, 1000); 7533 continue; 7534 } 7535 7536 /* 7537 * If the pool is suspended then fail immediately. Otherwise, 7538 * check to see if the pool is making any progress. If 7539 * vdev_deadman() discovers that there hasn't been any recent 7540 * I/Os then it will end up aborting the tests. 7541 */ 7542 if (spa_suspended(spa) || spa->spa_root_vdev == NULL) { 7543 fatal(B_FALSE, 7544 "aborting test after %llu seconds because " 7545 "pool has transitioned to a suspended state.", 7546 (u_longlong_t)zfs_deadman_synctime_ms / 1000); 7547 } 7548 vdev_deadman(spa->spa_root_vdev, FTAG); 7549 7550 /* 7551 * If the process doesn't complete within a grace period of 7552 * zfs_deadman_synctime_ms over the expected finish time, 7553 * then it may be hung and is terminated. 
7554 */ 7555 overdue = zs->zs_proc_stop + MSEC2NSEC(zfs_deadman_synctime_ms); 7556 if (gethrtime() > overdue) { 7557 fatal(B_FALSE, 7558 "aborting test after %llu seconds because " 7559 "the process is overdue for termination.", 7560 (gethrtime() - zs->zs_proc_start) / NANOSEC); 7561 } 7562 7563 (void) printf("ztest has been running for %lld seconds\n", 7564 (gethrtime() - zs->zs_proc_start) / NANOSEC); 7565 7566 last_run = gethrtime(); 7567 delay = MSEC2NSEC(zfs_deadman_checktime_ms); 7568 } 7569 7570 thread_exit(); 7571 } 7572 7573 static void 7574 ztest_execute(int test, ztest_info_t *zi, uint64_t id) 7575 { 7576 ztest_ds_t *zd = &ztest_ds[id % ztest_opts.zo_datasets]; 7577 ztest_shared_callstate_t *zc = ZTEST_GET_SHARED_CALLSTATE(test); 7578 hrtime_t functime = gethrtime(); 7579 int i; 7580 7581 for (i = 0; i < zi->zi_iters; i++) 7582 zi->zi_func(zd, id); 7583 7584 functime = gethrtime() - functime; 7585 7586 atomic_add_64(&zc->zc_count, 1); 7587 atomic_add_64(&zc->zc_time, functime); 7588 7589 if (ztest_opts.zo_verbose >= 4) 7590 (void) printf("%6.2f sec in %s\n", 7591 (double)functime / NANOSEC, zi->zi_funcname); 7592 } 7593 7594 typedef struct ztest_raidz_expand_io { 7595 uint64_t rzx_id; 7596 uint64_t rzx_amount; 7597 uint64_t rzx_bufsize; 7598 const void *rzx_buffer; 7599 uint64_t rzx_alloc_max; 7600 spa_t *rzx_spa; 7601 } ztest_expand_io_t; 7602 7603 #undef OD_ARRAY_SIZE 7604 #define OD_ARRAY_SIZE 10 7605 7606 /* 7607 * Write a request amount of data to some dataset objects. 7608 * There will be ztest_opts.zo_threads count of these running in parallel. 7609 */ 7610 static __attribute__((noreturn)) void 7611 ztest_rzx_thread(void *arg) 7612 { 7613 ztest_expand_io_t *info = (ztest_expand_io_t *)arg; 7614 ztest_od_t *od; 7615 int batchsize; 7616 int od_size; 7617 ztest_ds_t *zd = &ztest_ds[info->rzx_id % ztest_opts.zo_datasets]; 7618 spa_t *spa = info->rzx_spa; 7619 7620 od_size = sizeof (ztest_od_t) * OD_ARRAY_SIZE; 7621 od = umem_alloc(od_size, UMEM_NOFAIL); 7622 batchsize = OD_ARRAY_SIZE; 7623 7624 /* Create objects to write to */ 7625 for (int b = 0; b < batchsize; b++) { 7626 ztest_od_init(od + b, info->rzx_id, FTAG, b, 7627 DMU_OT_UINT64_OTHER, 0, 0, 0); 7628 } 7629 if (ztest_object_init(zd, od, od_size, B_FALSE) != 0) { 7630 umem_free(od, od_size); 7631 thread_exit(); 7632 } 7633 7634 for (uint64_t offset = 0, written = 0; written < info->rzx_amount; 7635 offset += info->rzx_bufsize) { 7636 /* write to 10 objects */ 7637 for (int i = 0; i < batchsize && written < info->rzx_amount; 7638 i++) { 7639 (void) pthread_rwlock_rdlock(&zd->zd_zilog_lock); 7640 ztest_write(zd, od[i].od_object, offset, 7641 info->rzx_bufsize, info->rzx_buffer); 7642 (void) pthread_rwlock_unlock(&zd->zd_zilog_lock); 7643 written += info->rzx_bufsize; 7644 } 7645 txg_wait_synced(spa_get_dsl(spa), 0); 7646 /* due to inflation, we'll typically bail here */ 7647 if (metaslab_class_get_alloc(spa_normal_class(spa)) > 7648 info->rzx_alloc_max) { 7649 break; 7650 } 7651 } 7652 7653 /* Remove a few objects to leave some holes in allocation space */ 7654 mutex_enter(&zd->zd_dirobj_lock); 7655 (void) ztest_remove(zd, od, 2); 7656 mutex_exit(&zd->zd_dirobj_lock); 7657 7658 umem_free(od, od_size); 7659 7660 thread_exit(); 7661 } 7662 7663 static __attribute__((noreturn)) void 7664 ztest_thread(void *arg) 7665 { 7666 int rand; 7667 uint64_t id = (uintptr_t)arg; 7668 ztest_shared_t *zs = ztest_shared; 7669 uint64_t call_next; 7670 hrtime_t now; 7671 ztest_info_t *zi; 7672 ztest_shared_callstate_t *zc; 7673 7674 while 
((now = gethrtime()) < zs->zs_thread_stop) { 7675 /* 7676 * See if it's time to force a crash. 7677 */ 7678 if (now > zs->zs_thread_kill && 7679 raidz_expand_pause_point == RAIDZ_EXPAND_PAUSE_NONE) { 7680 ztest_kill(zs); 7681 } 7682 7683 /* 7684 * If we're getting ENOSPC with some regularity, stop. 7685 */ 7686 if (zs->zs_enospc_count > 10) 7687 break; 7688 7689 /* 7690 * Pick a random function to execute. 7691 */ 7692 rand = ztest_random(ZTEST_FUNCS); 7693 zi = &ztest_info[rand]; 7694 zc = ZTEST_GET_SHARED_CALLSTATE(rand); 7695 call_next = zc->zc_next; 7696 7697 if (now >= call_next && 7698 atomic_cas_64(&zc->zc_next, call_next, call_next + 7699 ztest_random(2 * zi->zi_interval[0] + 1)) == call_next) { 7700 ztest_execute(rand, zi, id); 7701 } 7702 } 7703 7704 thread_exit(); 7705 } 7706 7707 static void 7708 ztest_dataset_name(char *dsname, const char *pool, int d) 7709 { 7710 (void) snprintf(dsname, ZFS_MAX_DATASET_NAME_LEN, "%s/ds_%d", pool, d); 7711 } 7712 7713 static void 7714 ztest_dataset_destroy(int d) 7715 { 7716 char name[ZFS_MAX_DATASET_NAME_LEN]; 7717 int t; 7718 7719 ztest_dataset_name(name, ztest_opts.zo_pool, d); 7720 7721 if (ztest_opts.zo_verbose >= 3) 7722 (void) printf("Destroying %s to free up space\n", name); 7723 7724 /* 7725 * Cleanup any non-standard clones and snapshots. In general, 7726 * ztest thread t operates on dataset (t % zopt_datasets), 7727 * so there may be more than one thing to clean up. 7728 */ 7729 for (t = d; t < ztest_opts.zo_threads; 7730 t += ztest_opts.zo_datasets) 7731 ztest_dsl_dataset_cleanup(name, t); 7732 7733 (void) dmu_objset_find(name, ztest_objset_destroy_cb, NULL, 7734 DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN); 7735 } 7736 7737 static void 7738 ztest_dataset_dirobj_verify(ztest_ds_t *zd) 7739 { 7740 uint64_t usedobjs, dirobjs, scratch; 7741 7742 /* 7743 * ZTEST_DIROBJ is the object directory for the entire dataset. 7744 * Therefore, the number of objects in use should equal the 7745 * number of ZTEST_DIROBJ entries, +1 for ZTEST_DIROBJ itself. 7746 * If not, we have an object leak. 7747 * 7748 * Note that we can only check this in ztest_dataset_open(), 7749 * when the open-context and syncing-context values agree. 7750 * That's because zap_count() returns the open-context value, 7751 * while dmu_objset_space() returns the rootbp fill count. 
7752 */ 7753 VERIFY0(zap_count(zd->zd_os, ZTEST_DIROBJ, &dirobjs)); 7754 dmu_objset_space(zd->zd_os, &scratch, &scratch, &usedobjs, &scratch); 7755 ASSERT3U(dirobjs + 1, ==, usedobjs); 7756 } 7757 7758 static int 7759 ztest_dataset_open(int d) 7760 { 7761 ztest_ds_t *zd = &ztest_ds[d]; 7762 uint64_t committed_seq = ZTEST_GET_SHARED_DS(d)->zd_seq; 7763 objset_t *os; 7764 zilog_t *zilog; 7765 char name[ZFS_MAX_DATASET_NAME_LEN]; 7766 int error; 7767 7768 ztest_dataset_name(name, ztest_opts.zo_pool, d); 7769 7770 (void) pthread_rwlock_rdlock(&ztest_name_lock); 7771 7772 error = ztest_dataset_create(name); 7773 if (error == ENOSPC) { 7774 (void) pthread_rwlock_unlock(&ztest_name_lock); 7775 ztest_record_enospc(FTAG); 7776 return (error); 7777 } 7778 ASSERT(error == 0 || error == EEXIST); 7779 7780 VERIFY0(ztest_dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, 7781 B_TRUE, zd, &os)); 7782 (void) pthread_rwlock_unlock(&ztest_name_lock); 7783 7784 ztest_zd_init(zd, ZTEST_GET_SHARED_DS(d), os); 7785 7786 zilog = zd->zd_zilog; 7787 7788 if (zilog->zl_header->zh_claim_lr_seq != 0 && 7789 zilog->zl_header->zh_claim_lr_seq < committed_seq) 7790 fatal(B_FALSE, "missing log records: " 7791 "claimed %"PRIu64" < committed %"PRIu64"", 7792 zilog->zl_header->zh_claim_lr_seq, committed_seq); 7793 7794 ztest_dataset_dirobj_verify(zd); 7795 7796 zil_replay(os, zd, ztest_replay_vector); 7797 7798 ztest_dataset_dirobj_verify(zd); 7799 7800 if (ztest_opts.zo_verbose >= 6) 7801 (void) printf("%s replay %"PRIu64" blocks, " 7802 "%"PRIu64" records, seq %"PRIu64"\n", 7803 zd->zd_name, 7804 zilog->zl_parse_blk_count, 7805 zilog->zl_parse_lr_count, 7806 zilog->zl_replaying_seq); 7807 7808 zilog = zil_open(os, ztest_get_data, NULL); 7809 7810 if (zilog->zl_replaying_seq != 0 && 7811 zilog->zl_replaying_seq < committed_seq) 7812 fatal(B_FALSE, "missing log records: " 7813 "replayed %"PRIu64" < committed %"PRIu64"", 7814 zilog->zl_replaying_seq, committed_seq); 7815 7816 return (0); 7817 } 7818 7819 static void 7820 ztest_dataset_close(int d) 7821 { 7822 ztest_ds_t *zd = &ztest_ds[d]; 7823 7824 zil_close(zd->zd_zilog); 7825 dmu_objset_disown(zd->zd_os, B_TRUE, zd); 7826 7827 ztest_zd_fini(zd); 7828 } 7829 7830 static int 7831 ztest_replay_zil_cb(const char *name, void *arg) 7832 { 7833 (void) arg; 7834 objset_t *os; 7835 ztest_ds_t *zdtmp; 7836 7837 VERIFY0(ztest_dmu_objset_own(name, DMU_OST_ANY, B_TRUE, 7838 B_TRUE, FTAG, &os)); 7839 7840 zdtmp = umem_alloc(sizeof (ztest_ds_t), UMEM_NOFAIL); 7841 7842 ztest_zd_init(zdtmp, NULL, os); 7843 zil_replay(os, zdtmp, ztest_replay_vector); 7844 ztest_zd_fini(zdtmp); 7845 7846 if (dmu_objset_zil(os)->zl_parse_lr_count != 0 && 7847 ztest_opts.zo_verbose >= 6) { 7848 zilog_t *zilog = dmu_objset_zil(os); 7849 7850 (void) printf("%s replay %"PRIu64" blocks, " 7851 "%"PRIu64" records, seq %"PRIu64"\n", 7852 name, 7853 zilog->zl_parse_blk_count, 7854 zilog->zl_parse_lr_count, 7855 zilog->zl_replaying_seq); 7856 } 7857 7858 umem_free(zdtmp, sizeof (ztest_ds_t)); 7859 7860 dmu_objset_disown(os, B_TRUE, FTAG); 7861 return (0); 7862 } 7863 7864 static void 7865 ztest_freeze(void) 7866 { 7867 ztest_ds_t *zd = &ztest_ds[0]; 7868 spa_t *spa; 7869 int numloops = 0; 7870 7871 /* freeze not supported during RAIDZ expansion */ 7872 if (ztest_opts.zo_raid_do_expand) 7873 return; 7874 7875 if (ztest_opts.zo_verbose >= 3) 7876 (void) printf("testing spa_freeze()...\n"); 7877 7878 raidz_scratch_verify(); 7879 kernel_init(SPA_MODE_READ | SPA_MODE_WRITE); 7880 VERIFY0(spa_open(ztest_opts.zo_pool, &spa, 
FTAG)); 7881 VERIFY0(ztest_dataset_open(0)); 7882 ztest_spa = spa; 7883 7884 /* 7885 * Force the first log block to be transactionally allocated. 7886 * We have to do this before we freeze the pool -- otherwise 7887 * the log chain won't be anchored. 7888 */ 7889 while (BP_IS_HOLE(&zd->zd_zilog->zl_header->zh_log)) { 7890 ztest_dmu_object_alloc_free(zd, 0); 7891 zil_commit(zd->zd_zilog, 0); 7892 } 7893 7894 txg_wait_synced(spa_get_dsl(spa), 0); 7895 7896 /* 7897 * Freeze the pool. This stops spa_sync() from doing anything, 7898 * so that the only way to record changes from now on is the ZIL. 7899 */ 7900 spa_freeze(spa); 7901 7902 /* 7903 * Because it is hard to predict how much space a write will actually 7904 * require beforehand, we leave ourselves some fudge space to write over 7905 * capacity. 7906 */ 7907 uint64_t capacity = metaslab_class_get_space(spa_normal_class(spa)) / 2; 7908 7909 /* 7910 * Run tests that generate log records but don't alter the pool config 7911 * or depend on DSL sync tasks (snapshots, objset create/destroy, etc). 7912 * We do a txg_wait_synced() after each iteration to force the txg 7913 * to increase well beyond the last synced value in the uberblock. 7914 * The ZIL should be OK with that. 7915 * 7916 * Run a random number of times less than zo_maxloops and ensure we do 7917 * not run out of space on the pool. 7918 */ 7919 while (ztest_random(10) != 0 && 7920 numloops++ < ztest_opts.zo_maxloops && 7921 metaslab_class_get_alloc(spa_normal_class(spa)) < capacity) { 7922 ztest_od_t od; 7923 ztest_od_init(&od, 0, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0, 0); 7924 VERIFY0(ztest_object_init(zd, &od, sizeof (od), B_FALSE)); 7925 ztest_io(zd, od.od_object, 7926 ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT); 7927 txg_wait_synced(spa_get_dsl(spa), 0); 7928 } 7929 7930 /* 7931 * Commit all of the changes we just generated. 7932 */ 7933 zil_commit(zd->zd_zilog, 0); 7934 txg_wait_synced(spa_get_dsl(spa), 0); 7935 7936 /* 7937 * Close our dataset and close the pool. 7938 */ 7939 ztest_dataset_close(0); 7940 spa_close(spa, FTAG); 7941 kernel_fini(); 7942 7943 /* 7944 * Open and close the pool and dataset to induce log replay. 7945 */ 7946 raidz_scratch_verify(); 7947 kernel_init(SPA_MODE_READ | SPA_MODE_WRITE); 7948 VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG)); 7949 ASSERT3U(spa_freeze_txg(spa), ==, UINT64_MAX); 7950 VERIFY0(ztest_dataset_open(0)); 7951 ztest_spa = spa; 7952 txg_wait_synced(spa_get_dsl(spa), 0); 7953 ztest_dataset_close(0); 7954 ztest_reguid(NULL, 0); 7955 7956 spa_close(spa, FTAG); 7957 kernel_fini(); 7958 } 7959 7960 static void 7961 ztest_import_impl(void) 7962 { 7963 importargs_t args = { 0 }; 7964 nvlist_t *cfg = NULL; 7965 int nsearch = 1; 7966 char *searchdirs[nsearch]; 7967 int flags = ZFS_IMPORT_MISSING_LOG; 7968 7969 searchdirs[0] = ztest_opts.zo_dir; 7970 args.paths = nsearch; 7971 args.path = searchdirs; 7972 args.can_be_active = B_FALSE; 7973 7974 libpc_handle_t lpch = { 7975 .lpc_lib_handle = NULL, 7976 .lpc_ops = &libzpool_config_ops, 7977 .lpc_printerr = B_TRUE 7978 }; 7979 VERIFY0(zpool_find_config(&lpch, ztest_opts.zo_pool, &cfg, &args)); 7980 VERIFY0(spa_import(ztest_opts.zo_pool, cfg, NULL, flags)); 7981 fnvlist_free(cfg); 7982 } 7983 7984 /* 7985 * Import a storage pool with the given name. 
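* The pool is located by scanning the ztest directory (zo_dir), imported, and its metaslab size and guid are cached in the shared state for later use.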
7986 */ 7987 static void 7988 ztest_import(ztest_shared_t *zs) 7989 { 7990 spa_t *spa; 7991 7992 mutex_init(&ztest_vdev_lock, NULL, MUTEX_DEFAULT, NULL); 7993 mutex_init(&ztest_checkpoint_lock, NULL, MUTEX_DEFAULT, NULL); 7994 VERIFY0(pthread_rwlock_init(&ztest_name_lock, NULL)); 7995 7996 raidz_scratch_verify(); 7997 kernel_init(SPA_MODE_READ | SPA_MODE_WRITE); 7998 7999 ztest_import_impl(); 8000 8001 VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG)); 8002 zs->zs_metaslab_sz = 8003 1ULL << spa->spa_root_vdev->vdev_child[0]->vdev_ms_shift; 8004 zs->zs_guid = spa_guid(spa); 8005 spa_close(spa, FTAG); 8006 8007 kernel_fini(); 8008 8009 if (!ztest_opts.zo_mmp_test) { 8010 ztest_run_zdb(zs->zs_guid); 8011 ztest_freeze(); 8012 ztest_run_zdb(zs->zs_guid); 8013 } 8014 8015 (void) pthread_rwlock_destroy(&ztest_name_lock); 8016 mutex_destroy(&ztest_vdev_lock); 8017 mutex_destroy(&ztest_checkpoint_lock); 8018 } 8019 8020 /* 8021 * After the expansion was killed, check that the pool is healthy 8022 */ 8023 static void 8024 ztest_raidz_expand_check(spa_t *spa) 8025 { 8026 ASSERT3U(ztest_opts.zo_raidz_expand_test, ==, RAIDZ_EXPAND_KILLED); 8027 /* 8028 * Set pool check done flag, main program will run a zdb check 8029 * of the pool when we exit. 8030 */ 8031 ztest_shared_opts->zo_raidz_expand_test = RAIDZ_EXPAND_CHECKED; 8032 8033 /* Wait for reflow to finish */ 8034 if (ztest_opts.zo_verbose >= 1) { 8035 (void) printf("\nwaiting for reflow to finish ...\n"); 8036 } 8037 pool_raidz_expand_stat_t rzx_stats; 8038 pool_raidz_expand_stat_t *pres = &rzx_stats; 8039 do { 8040 txg_wait_synced(spa_get_dsl(spa), 0); 8041 (void) poll(NULL, 0, 500); /* wait 1/2 second */ 8042 8043 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 8044 (void) spa_raidz_expand_get_stats(spa, pres); 8045 spa_config_exit(spa, SCL_CONFIG, FTAG); 8046 } while (pres->pres_state != DSS_FINISHED && 8047 pres->pres_reflowed < pres->pres_to_reflow); 8048 8049 if (ztest_opts.zo_verbose >= 1) { 8050 (void) printf("verifying an interrupted raidz " 8051 "expansion using a pool scrub ...\n"); 8052 } 8053 /* Will fail here if there is non-recoverable corruption detected */ 8054 VERIFY0(ztest_scrub_impl(spa)); 8055 if (ztest_opts.zo_verbose >= 1) { 8056 (void) printf("raidz expansion scrub check complete\n"); 8057 } 8058 } 8059 8060 /* 8061 * Start a raidz expansion test. We run some I/O on the pool for a while 8062 * to get some data in the pool. Then we grow the raidz and 8063 * kill the test at the requested offset into the reflow, verifying that 8064 * doing such does not lead to pool corruption. 
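 *
 * In outline: a set of ztest_rzx_thread() writers fills the pool to roughly
 * 25% of its allocatable space (capped at 1 GiB), a new child disk is
 * attached to the raidz vdev with spa_vdev_attach() to start the reflow,
 * and raidz_expand_max_reflow_bytes pauses the reflow at 25/50/75% of the
 * allocated space.  The shared test state is set to RAIDZ_EXPAND_KILLED,
 * and once the reflow passes that point the process kills itself; the next
 * pass then runs ztest_raidz_expand_check() to scrub and verify the
 * half-expanded pool.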
8065 */ 8066 static void 8067 ztest_raidz_expand_run(ztest_shared_t *zs, spa_t *spa) 8068 { 8069 nvlist_t *root; 8070 pool_raidz_expand_stat_t rzx_stats; 8071 pool_raidz_expand_stat_t *pres = &rzx_stats; 8072 kthread_t **run_threads; 8073 vdev_t *cvd, *rzvd = spa->spa_root_vdev->vdev_child[0]; 8074 int total_disks = rzvd->vdev_children; 8075 int data_disks = total_disks - vdev_get_nparity(rzvd); 8076 uint64_t alloc_goal; 8077 uint64_t csize; 8078 int error, t; 8079 int threads = ztest_opts.zo_threads; 8080 ztest_expand_io_t *thread_args; 8081 8082 ASSERT3U(ztest_opts.zo_raidz_expand_test, !=, RAIDZ_EXPAND_NONE); 8083 ASSERT3P(rzvd->vdev_ops, ==, &vdev_raidz_ops); 8084 ztest_opts.zo_raidz_expand_test = RAIDZ_EXPAND_STARTED; 8085 8086 /* Setup a 1 MiB buffer of random data */ 8087 uint64_t bufsize = 1024 * 1024; 8088 void *buffer = umem_alloc(bufsize, UMEM_NOFAIL); 8089 8090 if (read(ztest_fd_rand, buffer, bufsize) != bufsize) { 8091 fatal(B_TRUE, "short read from /dev/urandom"); 8092 } 8093 /* 8094 * Put some data in the pool and then attach a vdev to initiate 8095 * reflow. 8096 */ 8097 run_threads = umem_zalloc(threads * sizeof (kthread_t *), UMEM_NOFAIL); 8098 thread_args = umem_zalloc(threads * sizeof (ztest_expand_io_t), 8099 UMEM_NOFAIL); 8100 /* Aim for roughly 25% of allocatable space up to 1GB */ 8101 alloc_goal = (vdev_get_min_asize(rzvd) * data_disks) / total_disks; 8102 alloc_goal = MIN(alloc_goal >> 2, 1024*1024*1024); 8103 if (ztest_opts.zo_verbose >= 1) { 8104 (void) printf("adding data to pool '%s', goal %llu bytes\n", 8105 ztest_opts.zo_pool, (u_longlong_t)alloc_goal); 8106 } 8107 8108 /* 8109 * Kick off all the I/O generators that run in parallel. 8110 */ 8111 for (t = 0; t < threads; t++) { 8112 if (t < ztest_opts.zo_datasets && ztest_dataset_open(t) != 0) { 8113 umem_free(run_threads, threads * sizeof (kthread_t *)); 8114 umem_free(buffer, bufsize); 8115 return; 8116 } 8117 thread_args[t].rzx_id = t; 8118 thread_args[t].rzx_amount = alloc_goal / threads; 8119 thread_args[t].rzx_bufsize = bufsize; 8120 thread_args[t].rzx_buffer = buffer; 8121 thread_args[t].rzx_alloc_max = alloc_goal; 8122 thread_args[t].rzx_spa = spa; 8123 run_threads[t] = thread_create(NULL, 0, ztest_rzx_thread, 8124 &thread_args[t], 0, NULL, TS_RUN | TS_JOINABLE, 8125 defclsyspri); 8126 } 8127 8128 /* 8129 * Wait for all of the writers to complete. 8130 */ 8131 for (t = 0; t < threads; t++) 8132 VERIFY0(thread_join(run_threads[t])); 8133 8134 /* 8135 * Close all datasets. This must be done after all the threads 8136 * are joined so we can be sure none of the datasets are in-use 8137 * by any of the threads. 
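 * (ztest_dataset_close() tears down the ZIL and disowns the objset, so it
 * must not race with a writer that still holds the dataset;
 * ztest_generic_run() below uses the same join-then-close ordering.)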
8138 */ 8139 for (t = 0; t < ztest_opts.zo_threads; t++) { 8140 if (t < ztest_opts.zo_datasets) 8141 ztest_dataset_close(t); 8142 } 8143 8144 txg_wait_synced(spa_get_dsl(spa), 0); 8145 8146 zs->zs_alloc = metaslab_class_get_alloc(spa_normal_class(spa)); 8147 zs->zs_space = metaslab_class_get_space(spa_normal_class(spa)); 8148 8149 umem_free(buffer, bufsize); 8150 umem_free(run_threads, threads * sizeof (kthread_t *)); 8151 umem_free(thread_args, threads * sizeof (ztest_expand_io_t)); 8152 8153 /* Set our reflow target to 25%, 50% or 75% of allocated size */ 8154 uint_t multiple = ztest_random(3) + 1; 8155 uint64_t reflow_max = (rzvd->vdev_stat.vs_alloc * multiple) / 4; 8156 raidz_expand_max_reflow_bytes = reflow_max; 8157 8158 if (ztest_opts.zo_verbose >= 1) { 8159 (void) printf("running raidz expansion test, killing when " 8160 "reflow reaches %llu bytes (%u/4 of allocated space)\n", 8161 (u_longlong_t)reflow_max, multiple); 8162 } 8163 8164 /* XXX - do we want some I/O load during the reflow? */ 8165 8166 /* 8167 * Use a disk size that is larger than existing ones 8168 */ 8169 cvd = rzvd->vdev_child[0]; 8170 csize = vdev_get_min_asize(cvd); 8171 csize += csize / 10; 8172 /* 8173 * Path to vdev to be attached 8174 */ 8175 char *newpath = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); 8176 (void) snprintf(newpath, MAXPATHLEN, ztest_dev_template, 8177 ztest_opts.zo_dir, ztest_opts.zo_pool, rzvd->vdev_children); 8178 /* 8179 * Build the nvlist describing newpath. 8180 */ 8181 root = make_vdev_root(newpath, NULL, NULL, csize, ztest_get_ashift(), 8182 NULL, 0, 0, 1); 8183 /* 8184 * Expand the raidz vdev by attaching the new disk 8185 */ 8186 if (ztest_opts.zo_verbose >= 1) { 8187 (void) printf("expanding raidz: %d wide to %d wide with '%s'\n", 8188 (int)rzvd->vdev_children, (int)rzvd->vdev_children + 1, 8189 newpath); 8190 } 8191 error = spa_vdev_attach(spa, rzvd->vdev_guid, root, B_FALSE, B_FALSE); 8192 nvlist_free(root); 8193 if (error != 0) { 8194 fatal(0, "raidz expand: attach (%s %llu) returned %d", 8195 newpath, (long long)csize, error); 8196 } 8197 8198 /* 8199 * Wait for reflow to begin 8200 */ 8201 while (spa->spa_raidz_expand == NULL) { 8202 txg_wait_synced(spa_get_dsl(spa), 0); 8203 (void) poll(NULL, 0, 100); /* wait 1/10 second */ 8204 } 8205 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 8206 (void) spa_raidz_expand_get_stats(spa, pres); 8207 spa_config_exit(spa, SCL_CONFIG, FTAG); 8208 while (pres->pres_state != DSS_SCANNING) { 8209 txg_wait_synced(spa_get_dsl(spa), 0); 8210 (void) poll(NULL, 0, 100); /* wait 1/10 second */ 8211 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 8212 (void) spa_raidz_expand_get_stats(spa, pres); 8213 spa_config_exit(spa, SCL_CONFIG, FTAG); 8214 } 8215 8216 ASSERT3U(pres->pres_state, ==, DSS_SCANNING); 8217 ASSERT3U(pres->pres_to_reflow, !=, 0); 8218 /* 8219 * Set so when we are killed we go to raidz checking rather than 8220 * restarting test. 
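 * The flag lives in ztest_shared_opts, i.e. in the mmap()ed file shared with
 * the parent, so it survives the SIGKILL below: the next pass will see
 * RAIDZ_EXPAND_KILLED and run ztest_raidz_expand_check() instead of starting
 * another expansion.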
8221 */ 8222 ztest_shared_opts->zo_raidz_expand_test = RAIDZ_EXPAND_KILLED; 8223 if (ztest_opts.zo_verbose >= 1) { 8224 (void) printf("raidz expansion reflow started, waiting for " 8225 "%llu bytes to be copied\n", (u_longlong_t)reflow_max); 8226 } 8227 8228 /* 8229 * Wait for reflow maximum to be reached and then kill the test 8230 */ 8231 while (pres->pres_reflowed < reflow_max) { 8232 txg_wait_synced(spa_get_dsl(spa), 0); 8233 (void) poll(NULL, 0, 100); /* wait 1/10 second */ 8234 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 8235 (void) spa_raidz_expand_get_stats(spa, pres); 8236 spa_config_exit(spa, SCL_CONFIG, FTAG); 8237 } 8238 8239 /* Reset the reflow pause before killing */ 8240 raidz_expand_max_reflow_bytes = 0; 8241 8242 if (ztest_opts.zo_verbose >= 1) { 8243 (void) printf("killing raidz expansion test after reflow " 8244 "reached %llu bytes\n", (u_longlong_t)pres->pres_reflowed); 8245 } 8246 8247 /* 8248 * Kill ourself to simulate a panic during a reflow. Our parent will 8249 * restart the test and the changed flag value will drive the test 8250 * through the scrub/check code to verify the pool is not corrupted. 8251 */ 8252 ztest_kill(zs); 8253 } 8254 8255 static void 8256 ztest_generic_run(ztest_shared_t *zs, spa_t *spa) 8257 { 8258 kthread_t **run_threads; 8259 int t; 8260 8261 run_threads = umem_zalloc(ztest_opts.zo_threads * sizeof (kthread_t *), 8262 UMEM_NOFAIL); 8263 8264 /* 8265 * Kick off all the tests that run in parallel. 8266 */ 8267 for (t = 0; t < ztest_opts.zo_threads; t++) { 8268 if (t < ztest_opts.zo_datasets && ztest_dataset_open(t) != 0) { 8269 umem_free(run_threads, ztest_opts.zo_threads * 8270 sizeof (kthread_t *)); 8271 return; 8272 } 8273 8274 run_threads[t] = thread_create(NULL, 0, ztest_thread, 8275 (void *)(uintptr_t)t, 0, NULL, TS_RUN | TS_JOINABLE, 8276 defclsyspri); 8277 } 8278 8279 /* 8280 * Wait for all of the tests to complete. 8281 */ 8282 for (t = 0; t < ztest_opts.zo_threads; t++) 8283 VERIFY0(thread_join(run_threads[t])); 8284 8285 /* 8286 * Close all datasets. This must be done after all the threads 8287 * are joined so we can be sure none of the datasets are in-use 8288 * by any of the threads. 8289 */ 8290 for (t = 0; t < ztest_opts.zo_threads; t++) { 8291 if (t < ztest_opts.zo_datasets) 8292 ztest_dataset_close(t); 8293 } 8294 8295 txg_wait_synced(spa_get_dsl(spa), 0); 8296 8297 zs->zs_alloc = metaslab_class_get_alloc(spa_normal_class(spa)); 8298 zs->zs_space = metaslab_class_get_space(spa_normal_class(spa)); 8299 8300 umem_free(run_threads, ztest_opts.zo_threads * sizeof (kthread_t *)); 8301 } 8302 8303 /* 8304 * Setup our test context and kick off threads to run tests on all datasets 8305 * in parallel. 8306 */ 8307 static void 8308 ztest_run(ztest_shared_t *zs) 8309 { 8310 spa_t *spa; 8311 objset_t *os; 8312 kthread_t *resume_thread, *deadman_thread; 8313 uint64_t object; 8314 int error; 8315 int t, d; 8316 8317 ztest_exiting = B_FALSE; 8318 8319 /* 8320 * Initialize parent/child shared state. 
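 * The locks below are process-local and must be re-created in every child.
 * zs_thread_kill starts out equal to zs_thread_stop and is pulled back by a
 * random amount when ztest_random(100) < zo_killrate, which is what selects
 * a pass to end in a self-inflicted SIGKILL rather than a clean stop.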
8321 */ 8322 mutex_init(&ztest_vdev_lock, NULL, MUTEX_DEFAULT, NULL); 8323 mutex_init(&ztest_checkpoint_lock, NULL, MUTEX_DEFAULT, NULL); 8324 VERIFY0(pthread_rwlock_init(&ztest_name_lock, NULL)); 8325 8326 zs->zs_thread_start = gethrtime(); 8327 zs->zs_thread_stop = 8328 zs->zs_thread_start + ztest_opts.zo_passtime * NANOSEC; 8329 zs->zs_thread_stop = MIN(zs->zs_thread_stop, zs->zs_proc_stop); 8330 zs->zs_thread_kill = zs->zs_thread_stop; 8331 if (ztest_random(100) < ztest_opts.zo_killrate) { 8332 zs->zs_thread_kill -= 8333 ztest_random(ztest_opts.zo_passtime * NANOSEC); 8334 } 8335 8336 mutex_init(&zcl.zcl_callbacks_lock, NULL, MUTEX_DEFAULT, NULL); 8337 8338 list_create(&zcl.zcl_callbacks, sizeof (ztest_cb_data_t), 8339 offsetof(ztest_cb_data_t, zcd_node)); 8340 8341 /* 8342 * Open our pool. It may need to be imported first depending on 8343 * what tests were running when the previous pass was terminated. 8344 */ 8345 raidz_scratch_verify(); 8346 kernel_init(SPA_MODE_READ | SPA_MODE_WRITE); 8347 error = spa_open(ztest_opts.zo_pool, &spa, FTAG); 8348 if (error) { 8349 VERIFY3S(error, ==, ENOENT); 8350 ztest_import_impl(); 8351 VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG)); 8352 zs->zs_metaslab_sz = 8353 1ULL << spa->spa_root_vdev->vdev_child[0]->vdev_ms_shift; 8354 } 8355 8356 metaslab_preload_limit = ztest_random(20) + 1; 8357 ztest_spa = spa; 8358 8359 /* 8360 * XXX - BUGBUG raidz expansion do not run this for generic for now 8361 */ 8362 if (ztest_opts.zo_raidz_expand_test != RAIDZ_EXPAND_NONE) 8363 VERIFY0(vdev_raidz_impl_set("cycle")); 8364 8365 dmu_objset_stats_t dds; 8366 VERIFY0(ztest_dmu_objset_own(ztest_opts.zo_pool, 8367 DMU_OST_ANY, B_TRUE, B_TRUE, FTAG, &os)); 8368 dsl_pool_config_enter(dmu_objset_pool(os), FTAG); 8369 dmu_objset_fast_stat(os, &dds); 8370 dsl_pool_config_exit(dmu_objset_pool(os), FTAG); 8371 dmu_objset_disown(os, B_TRUE, FTAG); 8372 8373 /* Give the dedicated raidz expansion test more grace time */ 8374 if (ztest_opts.zo_raidz_expand_test != RAIDZ_EXPAND_NONE) 8375 zfs_deadman_synctime_ms *= 2; 8376 8377 /* 8378 * Create a thread to periodically resume suspended I/O. 8379 */ 8380 resume_thread = thread_create(NULL, 0, ztest_resume_thread, 8381 spa, 0, NULL, TS_RUN | TS_JOINABLE, defclsyspri); 8382 8383 /* 8384 * Create a deadman thread and set to panic if we hang. 8385 */ 8386 deadman_thread = thread_create(NULL, 0, ztest_deadman_thread, 8387 zs, 0, NULL, TS_RUN | TS_JOINABLE, defclsyspri); 8388 8389 spa->spa_deadman_failmode = ZIO_FAILURE_MODE_PANIC; 8390 8391 /* 8392 * Verify that we can safely inquire about any object, 8393 * whether it's allocated or not. To make it interesting, 8394 * we probe a 5-wide window around each power of two. 8395 * This hits all edge cases, including zero and the max. 8396 */ 8397 for (t = 0; t < 64; t++) { 8398 for (d = -5; d <= 5; d++) { 8399 error = dmu_object_info(spa->spa_meta_objset, 8400 (1ULL << t) + d, NULL); 8401 ASSERT(error == 0 || error == ENOENT || 8402 error == EINVAL); 8403 } 8404 } 8405 8406 /* 8407 * If we got any ENOSPC errors on the previous run, destroy something. 
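 * zs_enospc_count is bumped via ztest_record_enospc() whenever a test runs
 * out of space; destroying a random dataset here frees space so that a long
 * run does not stay wedged against a full pool.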
8408 */ 8409 if (zs->zs_enospc_count != 0) { 8410 /* Not expecting ENOSPC errors during raidz expansion tests */ 8411 ASSERT3U(ztest_opts.zo_raidz_expand_test, ==, 8412 RAIDZ_EXPAND_NONE); 8413 8414 int d = ztest_random(ztest_opts.zo_datasets); 8415 ztest_dataset_destroy(d); 8416 } 8417 zs->zs_enospc_count = 0; 8418 8419 /* 8420 * If we were in the middle of ztest_device_removal() and were killed 8421 * we need to ensure the removal and scrub complete before running 8422 * any tests that check ztest_device_removal_active. The removal will 8423 * be restarted automatically when the spa is opened, but we need to 8424 * initiate the scrub manually if it is not already in progress. Note 8425 * that we always run the scrub whenever an indirect vdev exists 8426 * because we have no way of knowing for sure if ztest_device_removal() 8427 * fully completed its scrub before the pool was reimported. 8428 * 8429 * Does not apply for the RAIDZ expansion specific test runs 8430 */ 8431 if (ztest_opts.zo_raidz_expand_test == RAIDZ_EXPAND_NONE && 8432 (spa->spa_removing_phys.sr_state == DSS_SCANNING || 8433 spa->spa_removing_phys.sr_prev_indirect_vdev != -1)) { 8434 while (spa->spa_removing_phys.sr_state == DSS_SCANNING) 8435 txg_wait_synced(spa_get_dsl(spa), 0); 8436 8437 error = ztest_scrub_impl(spa); 8438 if (error == EBUSY) 8439 error = 0; 8440 ASSERT0(error); 8441 } 8442 8443 if (ztest_opts.zo_verbose >= 4) 8444 (void) printf("starting main threads...\n"); 8445 8446 /* 8447 * Replay all logs of all datasets in the pool. This is primarily for 8448 * temporary datasets which wouldn't otherwise get replayed, which 8449 * can trigger failures when attempting to offline a SLOG in 8450 * ztest_fault_inject(). 8451 */ 8452 (void) dmu_objset_find(ztest_opts.zo_pool, ztest_replay_zil_cb, 8453 NULL, DS_FIND_CHILDREN); 8454 8455 if (ztest_opts.zo_raidz_expand_test == RAIDZ_EXPAND_REQUESTED) 8456 ztest_raidz_expand_run(zs, spa); 8457 else if (ztest_opts.zo_raidz_expand_test == RAIDZ_EXPAND_KILLED) 8458 ztest_raidz_expand_check(spa); 8459 else 8460 ztest_generic_run(zs, spa); 8461 8462 /* Kill the resume and deadman threads */ 8463 ztest_exiting = B_TRUE; 8464 VERIFY0(thread_join(resume_thread)); 8465 VERIFY0(thread_join(deadman_thread)); 8466 ztest_resume(spa); 8467 8468 /* 8469 * Right before closing the pool, kick off a bunch of async I/O; 8470 * spa_close() should wait for it to complete. 8471 */ 8472 for (object = 1; object < 50; object++) { 8473 dmu_prefetch(spa->spa_meta_objset, object, 0, 0, 1ULL << 20, 8474 ZIO_PRIORITY_SYNC_READ); 8475 } 8476 8477 /* Verify that at least one commit cb was called in a timely fashion */ 8478 if (zc_cb_counter >= ZTEST_COMMIT_CB_MIN_REG) 8479 VERIFY0(zc_min_txg_delay); 8480 8481 spa_close(spa, FTAG); 8482 8483 /* 8484 * Verify that we can loop over all pools. 8485 */ 8486 mutex_enter(&spa_namespace_lock); 8487 for (spa = spa_next(NULL); spa != NULL; spa = spa_next(spa)) 8488 if (ztest_opts.zo_verbose > 3) 8489 (void) printf("spa_next: found %s\n", spa_name(spa)); 8490 mutex_exit(&spa_namespace_lock); 8491 8492 /* 8493 * Verify that we can export the pool and reimport it under a 8494 * different name. 
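 * This round trip (<pool> -> <pool>_import -> <pool>) is only attempted on
 * about half of the passes and is skipped for MMP test runs; it always
 * renames the pool back so later passes find it under the expected name.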
8495 */ 8496 if ((ztest_random(2) == 0) && !ztest_opts.zo_mmp_test) { 8497 char name[ZFS_MAX_DATASET_NAME_LEN]; 8498 (void) snprintf(name, sizeof (name), "%s_import", 8499 ztest_opts.zo_pool); 8500 ztest_spa_import_export(ztest_opts.zo_pool, name); 8501 ztest_spa_import_export(name, ztest_opts.zo_pool); 8502 } 8503 8504 kernel_fini(); 8505 8506 list_destroy(&zcl.zcl_callbacks); 8507 mutex_destroy(&zcl.zcl_callbacks_lock); 8508 (void) pthread_rwlock_destroy(&ztest_name_lock); 8509 mutex_destroy(&ztest_vdev_lock); 8510 mutex_destroy(&ztest_checkpoint_lock); 8511 } 8512 8513 static void 8514 print_time(hrtime_t t, char *timebuf) 8515 { 8516 hrtime_t s = t / NANOSEC; 8517 hrtime_t m = s / 60; 8518 hrtime_t h = m / 60; 8519 hrtime_t d = h / 24; 8520 8521 s -= m * 60; 8522 m -= h * 60; 8523 h -= d * 24; 8524 8525 timebuf[0] = '\0'; 8526 8527 if (d) 8528 (void) sprintf(timebuf, 8529 "%llud%02lluh%02llum%02llus", d, h, m, s); 8530 else if (h) 8531 (void) sprintf(timebuf, "%lluh%02llum%02llus", h, m, s); 8532 else if (m) 8533 (void) sprintf(timebuf, "%llum%02llus", m, s); 8534 else 8535 (void) sprintf(timebuf, "%llus", s); 8536 } 8537 8538 static nvlist_t * 8539 make_random_pool_props(void) 8540 { 8541 nvlist_t *props; 8542 8543 props = fnvlist_alloc(); 8544 8545 /* Twenty percent of the time enable ZPOOL_PROP_DEDUP_TABLE_QUOTA */ 8546 if (ztest_random(5) == 0) { 8547 fnvlist_add_uint64(props, 8548 zpool_prop_to_name(ZPOOL_PROP_DEDUP_TABLE_QUOTA), 8549 2 * 1024 * 1024); 8550 } 8551 8552 /* Fifty percent of the time enable ZPOOL_PROP_AUTOREPLACE */ 8553 if (ztest_random(2) == 0) { 8554 fnvlist_add_uint64(props, 8555 zpool_prop_to_name(ZPOOL_PROP_AUTOREPLACE), 1); 8556 } 8557 8558 return (props); 8559 } 8560 8561 /* 8562 * Create a storage pool with the given name and initial vdev size. 8563 * Then test spa_freeze() functionality. 8564 */ 8565 static void 8566 ztest_init(ztest_shared_t *zs) 8567 { 8568 spa_t *spa; 8569 nvlist_t *nvroot, *props; 8570 int i; 8571 8572 mutex_init(&ztest_vdev_lock, NULL, MUTEX_DEFAULT, NULL); 8573 mutex_init(&ztest_checkpoint_lock, NULL, MUTEX_DEFAULT, NULL); 8574 VERIFY0(pthread_rwlock_init(&ztest_name_lock, NULL)); 8575 8576 raidz_scratch_verify(); 8577 kernel_init(SPA_MODE_READ | SPA_MODE_WRITE); 8578 8579 /* 8580 * Create the storage pool. 8581 */ 8582 (void) spa_destroy(ztest_opts.zo_pool); 8583 ztest_shared->zs_vdev_next_leaf = 0; 8584 zs->zs_splits = 0; 8585 zs->zs_mirrors = ztest_opts.zo_mirrors; 8586 nvroot = make_vdev_root(NULL, NULL, NULL, ztest_opts.zo_vdev_size, 0, 8587 NULL, ztest_opts.zo_raid_children, zs->zs_mirrors, 1); 8588 props = make_random_pool_props(); 8589 8590 /* 8591 * We don't expect the pool to suspend unless maxfaults == 0, 8592 * in which case ztest_fault_inject() temporarily takes away 8593 * the only valid replica. 8594 */ 8595 fnvlist_add_uint64(props, 8596 zpool_prop_to_name(ZPOOL_PROP_FAILUREMODE), 8597 MAXFAULTS(zs) ? ZIO_FAILURE_MODE_PANIC : ZIO_FAILURE_MODE_WAIT); 8598 8599 for (i = 0; i < SPA_FEATURES; i++) { 8600 char *buf; 8601 8602 if (!spa_feature_table[i].fi_zfs_mod_supported) 8603 continue; 8604 8605 /* 8606 * 75% chance of using the log space map feature. We want ztest 8607 * to exercise both the code paths that use the log space map 8608 * feature and the ones that don't. 
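 * (ztest_random(4) returns a value in [0, 4), so the feature is left out of
 * the props -- and therefore stays disabled -- on roughly one pass in four.)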
8609 */ 8610 if (i == SPA_FEATURE_LOG_SPACEMAP && ztest_random(4) == 0) 8611 continue; 8612 8613 /* 8614 * split 50/50 between legacy and fast dedup 8615 */ 8616 if (i == SPA_FEATURE_FAST_DEDUP && ztest_random(2) != 0) 8617 continue; 8618 8619 VERIFY3S(-1, !=, asprintf(&buf, "feature@%s", 8620 spa_feature_table[i].fi_uname)); 8621 fnvlist_add_uint64(props, buf, 0); 8622 free(buf); 8623 } 8624 8625 VERIFY0(spa_create(ztest_opts.zo_pool, nvroot, props, NULL, NULL)); 8626 fnvlist_free(nvroot); 8627 fnvlist_free(props); 8628 8629 VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG)); 8630 zs->zs_metaslab_sz = 8631 1ULL << spa->spa_root_vdev->vdev_child[0]->vdev_ms_shift; 8632 zs->zs_guid = spa_guid(spa); 8633 spa_close(spa, FTAG); 8634 8635 kernel_fini(); 8636 8637 if (!ztest_opts.zo_mmp_test) { 8638 ztest_run_zdb(zs->zs_guid); 8639 ztest_freeze(); 8640 ztest_run_zdb(zs->zs_guid); 8641 } 8642 8643 (void) pthread_rwlock_destroy(&ztest_name_lock); 8644 mutex_destroy(&ztest_vdev_lock); 8645 mutex_destroy(&ztest_checkpoint_lock); 8646 } 8647 8648 static void 8649 setup_data_fd(void) 8650 { 8651 static char ztest_name_data[] = "/tmp/ztest.data.XXXXXX"; 8652 8653 ztest_fd_data = mkstemp(ztest_name_data); 8654 ASSERT3S(ztest_fd_data, >=, 0); 8655 (void) unlink(ztest_name_data); 8656 } 8657 8658 static int 8659 shared_data_size(ztest_shared_hdr_t *hdr) 8660 { 8661 int size; 8662 8663 size = hdr->zh_hdr_size; 8664 size += hdr->zh_opts_size; 8665 size += hdr->zh_size; 8666 size += hdr->zh_stats_size * hdr->zh_stats_count; 8667 size += hdr->zh_ds_size * hdr->zh_ds_count; 8668 size += hdr->zh_scratch_state_size; 8669 8670 return (size); 8671 } 8672 8673 static void 8674 setup_hdr(void) 8675 { 8676 int size; 8677 ztest_shared_hdr_t *hdr; 8678 8679 hdr = (void *)mmap(0, P2ROUNDUP(sizeof (*hdr), getpagesize()), 8680 PROT_READ | PROT_WRITE, MAP_SHARED, ztest_fd_data, 0); 8681 ASSERT3P(hdr, !=, MAP_FAILED); 8682 8683 VERIFY0(ftruncate(ztest_fd_data, sizeof (ztest_shared_hdr_t))); 8684 8685 hdr->zh_hdr_size = sizeof (ztest_shared_hdr_t); 8686 hdr->zh_opts_size = sizeof (ztest_shared_opts_t); 8687 hdr->zh_size = sizeof (ztest_shared_t); 8688 hdr->zh_stats_size = sizeof (ztest_shared_callstate_t); 8689 hdr->zh_stats_count = ZTEST_FUNCS; 8690 hdr->zh_ds_size = sizeof (ztest_shared_ds_t); 8691 hdr->zh_ds_count = ztest_opts.zo_datasets; 8692 hdr->zh_scratch_state_size = sizeof (ztest_shared_scratch_state_t); 8693 8694 size = shared_data_size(hdr); 8695 VERIFY0(ftruncate(ztest_fd_data, size)); 8696 8697 (void) munmap((caddr_t)hdr, P2ROUNDUP(sizeof (*hdr), getpagesize())); 8698 } 8699 8700 static void 8701 setup_data(void) 8702 { 8703 int size, offset; 8704 ztest_shared_hdr_t *hdr; 8705 uint8_t *buf; 8706 8707 hdr = (void *)mmap(0, P2ROUNDUP(sizeof (*hdr), getpagesize()), 8708 PROT_READ, MAP_SHARED, ztest_fd_data, 0); 8709 ASSERT3P(hdr, !=, MAP_FAILED); 8710 8711 size = shared_data_size(hdr); 8712 8713 (void) munmap((caddr_t)hdr, P2ROUNDUP(sizeof (*hdr), getpagesize())); 8714 hdr = ztest_shared_hdr = (void *)mmap(0, P2ROUNDUP(size, getpagesize()), 8715 PROT_READ | PROT_WRITE, MAP_SHARED, ztest_fd_data, 0); 8716 ASSERT3P(hdr, !=, MAP_FAILED); 8717 buf = (uint8_t *)hdr; 8718 8719 offset = hdr->zh_hdr_size; 8720 ztest_shared_opts = (void *)&buf[offset]; 8721 offset += hdr->zh_opts_size; 8722 ztest_shared = (void *)&buf[offset]; 8723 offset += hdr->zh_size; 8724 ztest_shared_callstate = (void *)&buf[offset]; 8725 offset += hdr->zh_stats_size * hdr->zh_stats_count; 8726 ztest_shared_ds = (void *)&buf[offset]; 8727 offset += 
hdr->zh_ds_size * hdr->zh_ds_count; 8728 ztest_scratch_state = (void *)&buf[offset]; 8729 } 8730 8731 static boolean_t 8732 exec_child(char *cmd, char *libpath, boolean_t ignorekill, int *statusp) 8733 { 8734 pid_t pid; 8735 int status; 8736 char *cmdbuf = NULL; 8737 8738 pid = fork(); 8739 8740 if (cmd == NULL) { 8741 cmdbuf = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); 8742 (void) strlcpy(cmdbuf, getexecname(), MAXPATHLEN); 8743 cmd = cmdbuf; 8744 } 8745 8746 if (pid == -1) 8747 fatal(B_TRUE, "fork failed"); 8748 8749 if (pid == 0) { /* child */ 8750 char fd_data_str[12]; 8751 8752 VERIFY3S(11, >=, 8753 snprintf(fd_data_str, 12, "%d", ztest_fd_data)); 8754 VERIFY0(setenv("ZTEST_FD_DATA", fd_data_str, 1)); 8755 8756 if (libpath != NULL) { 8757 const char *curlp = getenv("LD_LIBRARY_PATH"); 8758 if (curlp == NULL) 8759 VERIFY0(setenv("LD_LIBRARY_PATH", libpath, 1)); 8760 else { 8761 char *newlp = NULL; 8762 VERIFY3S(-1, !=, 8763 asprintf(&newlp, "%s:%s", libpath, curlp)); 8764 VERIFY0(setenv("LD_LIBRARY_PATH", newlp, 1)); 8765 free(newlp); 8766 } 8767 } 8768 (void) execl(cmd, cmd, (char *)NULL); 8769 ztest_dump_core = B_FALSE; 8770 fatal(B_TRUE, "exec failed: %s", cmd); 8771 } 8772 8773 if (cmdbuf != NULL) { 8774 umem_free(cmdbuf, MAXPATHLEN); 8775 cmd = NULL; 8776 } 8777 8778 while (waitpid(pid, &status, 0) != pid) 8779 continue; 8780 if (statusp != NULL) 8781 *statusp = status; 8782 8783 if (WIFEXITED(status)) { 8784 if (WEXITSTATUS(status) != 0) { 8785 (void) fprintf(stderr, "child exited with code %d\n", 8786 WEXITSTATUS(status)); 8787 exit(2); 8788 } 8789 return (B_FALSE); 8790 } else if (WIFSIGNALED(status)) { 8791 if (!ignorekill || WTERMSIG(status) != SIGKILL) { 8792 (void) fprintf(stderr, "child died with signal %d\n", 8793 WTERMSIG(status)); 8794 exit(3); 8795 } 8796 return (B_TRUE); 8797 } else { 8798 (void) fprintf(stderr, "something strange happened to child\n"); 8799 exit(4); 8800 } 8801 } 8802 8803 static void 8804 ztest_run_init(void) 8805 { 8806 int i; 8807 8808 ztest_shared_t *zs = ztest_shared; 8809 8810 /* 8811 * Blow away any existing copy of zpool.cache 8812 */ 8813 (void) remove(spa_config_path); 8814 8815 if (ztest_opts.zo_init == 0) { 8816 if (ztest_opts.zo_verbose >= 1) 8817 (void) printf("Importing pool %s\n", 8818 ztest_opts.zo_pool); 8819 ztest_import(zs); 8820 return; 8821 } 8822 8823 /* 8824 * Create and initialize our storage pool. 8825 */ 8826 for (i = 1; i <= ztest_opts.zo_init; i++) { 8827 memset(zs, 0, sizeof (*zs)); 8828 if (ztest_opts.zo_verbose >= 3 && 8829 ztest_opts.zo_init != 1) { 8830 (void) printf("ztest_init(), pass %d\n", i); 8831 } 8832 ztest_init(zs); 8833 } 8834 } 8835 8836 int 8837 main(int argc, char **argv) 8838 { 8839 int kills = 0; 8840 int iters = 0; 8841 int older = 0; 8842 int newer = 0; 8843 ztest_shared_t *zs; 8844 ztest_info_t *zi; 8845 ztest_shared_callstate_t *zc; 8846 char timebuf[100]; 8847 char numbuf[NN_NUMBUF_SZ]; 8848 char *cmd; 8849 boolean_t hasalt; 8850 int f, err; 8851 char *fd_data_str = getenv("ZTEST_FD_DATA"); 8852 struct sigaction action; 8853 8854 (void) setvbuf(stdout, NULL, _IOLBF, 0); 8855 8856 dprintf_setup(&argc, argv); 8857 zfs_deadman_synctime_ms = 300000; 8858 zfs_deadman_checktime_ms = 30000; 8859 /* 8860 * As two-word space map entries may not come up often (especially 8861 * if pool and vdev sizes are small) we want to force at least some 8862 * of them so the feature get tested. 
8863 */ 8864 zfs_force_some_double_word_sm_entries = B_TRUE; 8865 8866 /* 8867 * Verify that even extensively damaged split blocks with many 8868 * segments can be reconstructed in a reasonable amount of time 8869 * when reconstruction is known to be possible. 8870 * 8871 * Note: the lower this value is, the more damage we inflict, and 8872 * the more time ztest spends in recovering that damage. We chose 8873 * to induce damage 1/100th of the time so recovery is tested but 8874 * not so frequently that ztest doesn't get to test other code paths. 8875 */ 8876 zfs_reconstruct_indirect_damage_fraction = 100; 8877 8878 action.sa_handler = sig_handler; 8879 sigemptyset(&action.sa_mask); 8880 action.sa_flags = 0; 8881 8882 if (sigaction(SIGSEGV, &action, NULL) < 0) { 8883 (void) fprintf(stderr, "ztest: cannot catch SIGSEGV: %s.\n", 8884 strerror(errno)); 8885 exit(EXIT_FAILURE); 8886 } 8887 8888 if (sigaction(SIGABRT, &action, NULL) < 0) { 8889 (void) fprintf(stderr, "ztest: cannot catch SIGABRT: %s.\n", 8890 strerror(errno)); 8891 exit(EXIT_FAILURE); 8892 } 8893 8894 /* 8895 * Force random_get_bytes() to use /dev/urandom in order to prevent 8896 * ztest from needlessly depleting the system entropy pool. 8897 */ 8898 random_path = "/dev/urandom"; 8899 ztest_fd_rand = open(random_path, O_RDONLY | O_CLOEXEC); 8900 ASSERT3S(ztest_fd_rand, >=, 0); 8901 8902 if (!fd_data_str) { 8903 process_options(argc, argv); 8904 8905 setup_data_fd(); 8906 setup_hdr(); 8907 setup_data(); 8908 memcpy(ztest_shared_opts, &ztest_opts, 8909 sizeof (*ztest_shared_opts)); 8910 } else { 8911 ztest_fd_data = atoi(fd_data_str); 8912 setup_data(); 8913 memcpy(&ztest_opts, ztest_shared_opts, sizeof (ztest_opts)); 8914 } 8915 ASSERT3U(ztest_opts.zo_datasets, ==, ztest_shared_hdr->zh_ds_count); 8916 8917 err = ztest_set_global_vars(); 8918 if (err != 0 && !fd_data_str) { 8919 /* error message done by ztest_set_global_vars */ 8920 exit(EXIT_FAILURE); 8921 } else { 8922 /* children should not be spawned if setting gvars fails */ 8923 VERIFY3S(err, ==, 0); 8924 } 8925 8926 /* Override location of zpool.cache */ 8927 VERIFY3S(asprintf((char **)&spa_config_path, "%s/zpool.cache", 8928 ztest_opts.zo_dir), !=, -1); 8929 8930 ztest_ds = umem_alloc(ztest_opts.zo_datasets * sizeof (ztest_ds_t), 8931 UMEM_NOFAIL); 8932 zs = ztest_shared; 8933 8934 if (fd_data_str) { 8935 metaslab_force_ganging = ztest_opts.zo_metaslab_force_ganging; 8936 metaslab_df_alloc_threshold = 8937 zs->zs_metaslab_df_alloc_threshold; 8938 8939 if (zs->zs_do_init) 8940 ztest_run_init(); 8941 else 8942 ztest_run(zs); 8943 exit(0); 8944 } 8945 8946 hasalt = (strlen(ztest_opts.zo_alt_ztest) != 0); 8947 8948 if (ztest_opts.zo_verbose >= 1) { 8949 (void) printf("%"PRIu64" vdevs, %d datasets, %d threads, " 8950 "%d %s disks, parity %d, %"PRIu64" seconds...\n\n", 8951 ztest_opts.zo_vdevs, 8952 ztest_opts.zo_datasets, 8953 ztest_opts.zo_threads, 8954 ztest_opts.zo_raid_children, 8955 ztest_opts.zo_raid_type, 8956 ztest_opts.zo_raid_parity, 8957 ztest_opts.zo_time); 8958 } 8959 8960 cmd = umem_alloc(MAXNAMELEN, UMEM_NOFAIL); 8961 (void) strlcpy(cmd, getexecname(), MAXNAMELEN); 8962 8963 zs->zs_do_init = B_TRUE; 8964 if (strlen(ztest_opts.zo_alt_ztest) != 0) { 8965 if (ztest_opts.zo_verbose >= 1) { 8966 (void) printf("Executing older ztest for " 8967 "initialization: %s\n", ztest_opts.zo_alt_ztest); 8968 } 8969 VERIFY(!exec_child(ztest_opts.zo_alt_ztest, 8970 ztest_opts.zo_alt_libpath, B_FALSE, NULL)); 8971 } else { 8972 VERIFY(!exec_child(NULL, NULL, B_FALSE, NULL)); 8973 } 
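	/*
	 * The initialization pass above always runs in a child process
	 * (the alternate ztest binary if zo_alt_ztest is set, otherwise a
	 * re-exec of ourselves); zs_do_init in the shared mapping is what
	 * told that child to call ztest_run_init() instead of ztest_run().
	 * With the pool created or imported, switch to the timed passes.
	 */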
8974 zs->zs_do_init = B_FALSE; 8975 8976 zs->zs_proc_start = gethrtime(); 8977 zs->zs_proc_stop = zs->zs_proc_start + ztest_opts.zo_time * NANOSEC; 8978 8979 for (f = 0; f < ZTEST_FUNCS; f++) { 8980 zi = &ztest_info[f]; 8981 zc = ZTEST_GET_SHARED_CALLSTATE(f); 8982 if (zs->zs_proc_start + zi->zi_interval[0] > zs->zs_proc_stop) 8983 zc->zc_next = UINT64_MAX; 8984 else 8985 zc->zc_next = zs->zs_proc_start + 8986 ztest_random(2 * zi->zi_interval[0] + 1); 8987 } 8988 8989 /* 8990 * Run the tests in a loop. These tests include fault injection 8991 * to verify that self-healing data works, and forced crashes 8992 * to verify that we never lose on-disk consistency. 8993 */ 8994 while (gethrtime() < zs->zs_proc_stop) { 8995 int status; 8996 boolean_t killed; 8997 8998 /* 8999 * Initialize the workload counters for each function. 9000 */ 9001 for (f = 0; f < ZTEST_FUNCS; f++) { 9002 zc = ZTEST_GET_SHARED_CALLSTATE(f); 9003 zc->zc_count = 0; 9004 zc->zc_time = 0; 9005 } 9006 9007 /* Set the allocation switch size */ 9008 zs->zs_metaslab_df_alloc_threshold = 9009 ztest_random(zs->zs_metaslab_sz / 4) + 1; 9010 9011 if (!hasalt || ztest_random(2) == 0) { 9012 if (hasalt && ztest_opts.zo_verbose >= 1) { 9013 (void) printf("Executing newer ztest: %s\n", 9014 cmd); 9015 } 9016 newer++; 9017 killed = exec_child(cmd, NULL, B_TRUE, &status); 9018 } else { 9019 if (hasalt && ztest_opts.zo_verbose >= 1) { 9020 (void) printf("Executing older ztest: %s\n", 9021 ztest_opts.zo_alt_ztest); 9022 } 9023 older++; 9024 killed = exec_child(ztest_opts.zo_alt_ztest, 9025 ztest_opts.zo_alt_libpath, B_TRUE, &status); 9026 } 9027 9028 if (killed) 9029 kills++; 9030 iters++; 9031 9032 if (ztest_opts.zo_verbose >= 1) { 9033 hrtime_t now = gethrtime(); 9034 9035 now = MIN(now, zs->zs_proc_stop); 9036 print_time(zs->zs_proc_stop - now, timebuf); 9037 nicenum(zs->zs_space, numbuf, sizeof (numbuf)); 9038 9039 (void) printf("Pass %3d, %8s, %3"PRIu64" ENOSPC, " 9040 "%4.1f%% of %5s used, %3.0f%% done, %8s to go\n", 9041 iters, 9042 WIFEXITED(status) ? "Complete" : "SIGKILL", 9043 zs->zs_enospc_count, 9044 100.0 * zs->zs_alloc / zs->zs_space, 9045 numbuf, 9046 100.0 * (now - zs->zs_proc_start) / 9047 (ztest_opts.zo_time * NANOSEC), timebuf); 9048 } 9049 9050 if (ztest_opts.zo_verbose >= 2) { 9051 (void) printf("\nWorkload summary:\n\n"); 9052 (void) printf("%7s %9s %s\n", 9053 "Calls", "Time", "Function"); 9054 (void) printf("%7s %9s %s\n", 9055 "-----", "----", "--------"); 9056 for (f = 0; f < ZTEST_FUNCS; f++) { 9057 zi = &ztest_info[f]; 9058 zc = ZTEST_GET_SHARED_CALLSTATE(f); 9059 print_time(zc->zc_time, timebuf); 9060 (void) printf("%7"PRIu64" %9s %s\n", 9061 zc->zc_count, timebuf, 9062 zi->zi_funcname); 9063 } 9064 (void) printf("\n"); 9065 } 9066 9067 if (!ztest_opts.zo_mmp_test) 9068 ztest_run_zdb(zs->zs_guid); 9069 if (ztest_shared_opts->zo_raidz_expand_test == 9070 RAIDZ_EXPAND_CHECKED) 9071 break; /* raidz expand test complete */ 9072 } 9073 9074 if (ztest_opts.zo_verbose >= 1) { 9075 if (hasalt) { 9076 (void) printf("%d runs of older ztest: %s\n", older, 9077 ztest_opts.zo_alt_ztest); 9078 (void) printf("%d runs of newer ztest: %s\n", newer, 9079 cmd); 9080 } 9081 (void) printf("%d killed, %d completed, %.0f%% kill rate\n", 9082 kills, iters - kills, (100.0 * kills) / MAX(1, iters)); 9083 } 9084 9085 umem_free(cmd, MAXNAMELEN); 9086 9087 return (0); 9088 } 9089