/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
 * Copyright (c) 2013 Steven Hartland. All rights reserved.
 * Copyright (c) 2014 Integros [integros.com]
 * Copyright 2017 Joyent, Inc.
 * Copyright (c) 2017, Intel Corporation.
 */

/*
 * The objective of this program is to provide a DMU/ZAP/SPA stress test
 * that runs entirely in userland, is easy to use, and easy to extend.
 *
 * The overall design of the ztest program is as follows:
 *
 * (1) For each major functional area (e.g. adding vdevs to a pool,
 *     creating and destroying datasets, reading and writing objects, etc)
 *     we have a simple routine to test that functionality.  These
 *     individual routines do not have to do anything "stressful".
 *
 * (2) We turn these simple functionality tests into a stress test by
 *     running them all in parallel, with as many threads as desired,
 *     and spread across as many datasets, objects, and vdevs as desired.
 *
 * (3) While all this is happening, we inject faults into the pool to
 *     verify that self-healing data really works.
 *
 * (4) Every time we open a dataset, we change its checksum and compression
 *     functions.  Thus even individual objects vary from block to block
 *     in which checksum they use and whether they're compressed.
 *
 * (5) To verify that we never lose on-disk consistency after a crash,
 *     we run the entire test in a child of the main process.
 *     At random times, the child self-immolates with a SIGKILL.
 *     This is the software equivalent of pulling the power cord.
 *     The parent then runs the test again, using the existing
 *     storage pool, as many times as desired. If backwards compatibility
 *     testing is enabled ztest will sometimes run the "older" version
 *     of ztest after a SIGKILL.
 *
 * (6) To verify that we don't have future leaks or temporal incursions,
 *     many of the functional tests record the transaction group number
 *     as part of their data.  When reading old data, they verify that
 *     the transaction group number is less than the current, open txg.
 *     If you add a new test, please do this if applicable.
 *
 * (7) Threads are created with a reduced stack size, for sanity checking.
 *     Therefore, it's important not to allocate huge buffers on the stack.
 *
 * When run with no arguments, ztest runs for about five minutes and
 * produces no output if successful.  To get a little bit of information,
 * specify -V.  To get more information, specify -VV, and so on.
 *
 * To turn this into an overnight stress test, use -T to specify run time.
 *
 * You can ask for more vdevs [-v], datasets [-d], or threads [-t]
 * to increase the pool capacity, fanout, and overall stress level.
 *
 * Use the -k option to set the desired frequency of kills.
 *
 * When ztest invokes itself it passes all relevant information through a
 * temporary file which is mmap-ed in the child process. This allows shared
 * memory to survive the exec syscall. The ztest_shared_hdr_t struct is always
 * stored at offset 0 of this file and contains information on the size and
 * number of shared structures in the file. The information stored in this file
 * must remain backwards compatible with older versions of ztest so that
 * ztest can invoke them during backwards compatibility testing (-B).
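 *
 * As an illustration, a longer and chattier run than the default might be
 * invoked as "ztest -VV -T 7200 -t 32 -f /var/tmp", combining the run-time,
 * thread-count, verbosity, and vdev-directory options (see the option table
 * below for the full list).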
 */

#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/dmu.h>
#include <sys/txg.h>
#include <sys/dbuf.h>
#include <sys/zap.h>
#include <sys/dmu_objset.h>
#include <sys/poll.h>
#include <sys/stat.h>
#include <sys/time.h>
#include <sys/wait.h>
#include <sys/mman.h>
#include <sys/resource.h>
#include <sys/zio.h>
#include <sys/zil.h>
#include <sys/zil_impl.h>
#include <sys/vdev_draid.h>
#include <sys/vdev_impl.h>
#include <sys/vdev_file.h>
#include <sys/vdev_initialize.h>
#include <sys/vdev_raidz.h>
#include <sys/vdev_trim.h>
#include <sys/spa_impl.h>
#include <sys/metaslab_impl.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_destroy.h>
#include <sys/dsl_scan.h>
#include <sys/zio_checksum.h>
#include <sys/zfs_refcount.h>
#include <sys/zfeature.h>
#include <sys/dsl_userhold.h>
#include <sys/abd.h>
#include <sys/blake3.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <getopt.h>
#include <signal.h>
#include <umem.h>
#include <ctype.h>
#include <math.h>
#include <sys/fs/zfs.h>
#include <zfs_fletcher.h>
#include <libnvpair.h>
#include <libzutil.h>
#include <sys/crypto/icp.h>
#include <sys/zfs_impl.h>
#if (__GLIBC__ && !__UCLIBC__)
#include <execinfo.h> /* for backtrace() */
#endif

static int ztest_fd_data = -1;
static int ztest_fd_rand = -1;

typedef struct ztest_shared_hdr {
	uint64_t zh_hdr_size;
	uint64_t zh_opts_size;
	uint64_t zh_size;
	uint64_t zh_stats_size;
	uint64_t zh_stats_count;
	uint64_t zh_ds_size;
	uint64_t zh_ds_count;
	uint64_t zh_scratch_state_size;
} ztest_shared_hdr_t;

static ztest_shared_hdr_t *ztest_shared_hdr;

enum ztest_class_state {
	ZTEST_VDEV_CLASS_OFF,
	ZTEST_VDEV_CLASS_ON,
	ZTEST_VDEV_CLASS_RND
};

/* Dedicated RAIDZ Expansion test states */
typedef enum {
	RAIDZ_EXPAND_NONE,		/* Default is none, must opt-in */
	RAIDZ_EXPAND_REQUESTED,		/* The '-X' option was used */
	RAIDZ_EXPAND_STARTED,		/* Testing has commenced */
	RAIDZ_EXPAND_KILLED,		/* Reached the process kill */
	RAIDZ_EXPAND_CHECKED,		/* Pool scrub verification done */
} raidz_expand_test_state_t;


#define	ZO_GVARS_MAX_ARGLEN	((size_t)64)
#define	ZO_GVARS_MAX_COUNT	((size_t)10)

typedef struct ztest_shared_opts {
	char zo_pool[ZFS_MAX_DATASET_NAME_LEN];
	char zo_dir[ZFS_MAX_DATASET_NAME_LEN];
	char zo_alt_ztest[MAXNAMELEN];
	char zo_alt_libpath[MAXNAMELEN];
	uint64_t zo_vdevs;
	uint64_t zo_vdevtime;
	size_t zo_vdev_size;
	int zo_ashift;
	int zo_mirrors;
	int zo_raid_do_expand;
	int zo_raid_children;
	int zo_raid_parity;
	char zo_raid_type[8];
	int zo_draid_data;
	int zo_draid_spares;
	int zo_datasets;
	int zo_threads;
	uint64_t zo_passtime;
	uint64_t zo_killrate;
	int zo_verbose;
	int zo_init;
	uint64_t zo_time;
	uint64_t zo_maxloops;
	uint64_t zo_metaslab_force_ganging;
	raidz_expand_test_state_t zo_raidz_expand_test;
	int zo_mmp_test;
	int zo_special_vdevs;
	int zo_dump_dbgmsg;
	int zo_gvars_count;
	char zo_gvars[ZO_GVARS_MAX_COUNT][ZO_GVARS_MAX_ARGLEN];
} ztest_shared_opts_t;

/* Default values for command line options. */
#define	DEFAULT_POOL "ztest"
#define	DEFAULT_VDEV_DIR "/tmp"
#define	DEFAULT_VDEV_COUNT 5
#define	DEFAULT_VDEV_SIZE (SPA_MINDEVSIZE * 4)	/* 256m default size */
#define	DEFAULT_VDEV_SIZE_STR "256M"
#define	DEFAULT_ASHIFT SPA_MINBLOCKSHIFT
#define	DEFAULT_MIRRORS 2
#define	DEFAULT_RAID_CHILDREN 4
#define	DEFAULT_RAID_PARITY 1
#define	DEFAULT_DRAID_DATA 4
#define	DEFAULT_DRAID_SPARES 1
#define	DEFAULT_DATASETS_COUNT 7
#define	DEFAULT_THREADS 23
#define	DEFAULT_RUN_TIME 300 /* 300 seconds */
#define	DEFAULT_RUN_TIME_STR "300 sec"
#define	DEFAULT_PASS_TIME 60 /* 60 seconds */
#define	DEFAULT_PASS_TIME_STR "60 sec"
#define	DEFAULT_KILL_RATE 70 /* 70% kill rate */
#define	DEFAULT_KILLRATE_STR "70%"
#define	DEFAULT_INITS 1
#define	DEFAULT_MAX_LOOPS 50 /* 5 minutes */
#define	DEFAULT_FORCE_GANGING (64 << 10)
#define	DEFAULT_FORCE_GANGING_STR "64K"

/* Simplifying assumption: -1 is not a valid default. */
#define	NO_DEFAULT -1

static const ztest_shared_opts_t ztest_opts_defaults = {
	.zo_pool = DEFAULT_POOL,
	.zo_dir = DEFAULT_VDEV_DIR,
	.zo_alt_ztest = { '\0' },
	.zo_alt_libpath = { '\0' },
	.zo_vdevs = DEFAULT_VDEV_COUNT,
	.zo_ashift = DEFAULT_ASHIFT,
	.zo_mirrors = DEFAULT_MIRRORS,
	.zo_raid_children = DEFAULT_RAID_CHILDREN,
	.zo_raid_parity = DEFAULT_RAID_PARITY,
	.zo_raid_type = VDEV_TYPE_RAIDZ,
	.zo_vdev_size = DEFAULT_VDEV_SIZE,
	.zo_draid_data = DEFAULT_DRAID_DATA,	/* data drives */
	.zo_draid_spares = DEFAULT_DRAID_SPARES,	/* distributed spares */
	.zo_datasets = DEFAULT_DATASETS_COUNT,
	.zo_threads = DEFAULT_THREADS,
	.zo_passtime = DEFAULT_PASS_TIME,
	.zo_killrate = DEFAULT_KILL_RATE,
	.zo_verbose = 0,
	.zo_mmp_test = 0,
	.zo_init = DEFAULT_INITS,
	.zo_time = DEFAULT_RUN_TIME,
	.zo_maxloops = DEFAULT_MAX_LOOPS, /* max loops during spa_freeze() */
	.zo_metaslab_force_ganging = DEFAULT_FORCE_GANGING,
	.zo_special_vdevs = ZTEST_VDEV_CLASS_RND,
	.zo_gvars_count = 0,
	.zo_raidz_expand_test = RAIDZ_EXPAND_NONE,
};

extern uint64_t metaslab_force_ganging;
extern uint64_t metaslab_df_alloc_threshold;
extern uint64_t zfs_deadman_synctime_ms;
extern uint_t metaslab_preload_limit;
extern int zfs_compressed_arc_enabled;
extern int zfs_abd_scatter_enabled;
extern uint_t dmu_object_alloc_chunk_shift;
extern boolean_t zfs_force_some_double_word_sm_entries;
extern unsigned long zio_decompress_fail_fraction;
extern unsigned long zfs_reconstruct_indirect_damage_fraction;
extern uint64_t raidz_expand_max_reflow_bytes;
extern uint_t raidz_expand_pause_point;

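/*
 * ztest_shared_opts points at the copy of the options kept in the shared
 * mmap'd file described above; ztest_opts is this process's working copy.
 */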
static ztest_shared_opts_t *ztest_shared_opts;
static ztest_shared_opts_t ztest_opts;
static const char *const ztest_wkeydata = "abcdefghijklmnopqrstuvwxyz012345";

typedef struct ztest_shared_ds {
	uint64_t zd_seq;
} ztest_shared_ds_t;

static ztest_shared_ds_t *ztest_shared_ds;
#define	ZTEST_GET_SHARED_DS(d) (&ztest_shared_ds[d])

typedef struct ztest_scratch_state {
	uint64_t zs_raidz_scratch_verify_pause;
} ztest_shared_scratch_state_t;

static ztest_shared_scratch_state_t *ztest_scratch_state;

#define	BT_MAGIC	0x123456789abcdefULL
#define	MAXFAULTS(zs) \
	(MAX((zs)->zs_mirrors, 1) * (ztest_opts.zo_raid_parity + 1) - 1)

enum ztest_io_type {
	ZTEST_IO_WRITE_TAG,
	ZTEST_IO_WRITE_PATTERN,
	ZTEST_IO_WRITE_ZEROES,
	ZTEST_IO_TRUNCATE,
	ZTEST_IO_SETATTR,
	ZTEST_IO_REWRITE,
	ZTEST_IO_TYPES
};
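/*
 * Block tag embedded in object data and bonus buffers; its fields are
 * checked by ztest_bt_verify() to detect lost or misdirected writes.
 */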
typedef struct ztest_block_tag {
	uint64_t bt_magic;
	uint64_t bt_objset;
	uint64_t bt_object;
	uint64_t bt_dnodesize;
	uint64_t bt_offset;
	uint64_t bt_gen;
	uint64_t bt_txg;
	uint64_t bt_crtxg;
} ztest_block_tag_t;

typedef struct bufwad {
	uint64_t bw_index;
	uint64_t bw_txg;
	uint64_t bw_data;
} bufwad_t;

/*
 * It would be better to use a rangelock_t per object.  Unfortunately
 * the rangelock_t is not a drop-in replacement for rl_t, because we
 * still need to map from object ID to rangelock_t.
 */
typedef enum {
	ZTRL_READER,
	ZTRL_WRITER,
	ZTRL_APPEND
} rl_type_t;

typedef struct rll {
	void *rll_writer;
	int rll_readers;
	kmutex_t rll_lock;
	kcondvar_t rll_cv;
} rll_t;

typedef struct rl {
	uint64_t rl_object;
	uint64_t rl_offset;
	uint64_t rl_size;
	rll_t *rl_lock;
} rl_t;

#define	ZTEST_RANGE_LOCKS	64
#define	ZTEST_OBJECT_LOCKS	64

/*
 * Object descriptor.  Used as a template for object lookup/create/remove.
 */
typedef struct ztest_od {
	uint64_t od_dir;
	uint64_t od_object;
	dmu_object_type_t od_type;
	dmu_object_type_t od_crtype;
	uint64_t od_blocksize;
	uint64_t od_crblocksize;
	uint64_t od_crdnodesize;
	uint64_t od_gen;
	uint64_t od_crgen;
	char od_name[ZFS_MAX_DATASET_NAME_LEN];
} ztest_od_t;

/*
 * Per-dataset state.
 */
typedef struct ztest_ds {
	ztest_shared_ds_t *zd_shared;
	objset_t *zd_os;
	pthread_rwlock_t zd_zilog_lock;
	zilog_t *zd_zilog;
	ztest_od_t *zd_od;		/* debugging aid */
	char zd_name[ZFS_MAX_DATASET_NAME_LEN];
	kmutex_t zd_dirobj_lock;
	rll_t zd_object_lock[ZTEST_OBJECT_LOCKS];
	rll_t zd_range_lock[ZTEST_RANGE_LOCKS];
} ztest_ds_t;

/*
 * Per-iteration state.
 */
typedef void ztest_func_t(ztest_ds_t *zd, uint64_t id);

typedef struct ztest_info {
	ztest_func_t *zi_func;		/* test function */
	uint64_t zi_iters;		/* iterations per execution */
	uint64_t *zi_interval;		/* execute every <interval> seconds */
	const char *zi_funcname;	/* name of test function */
} ztest_info_t;

typedef struct ztest_shared_callstate {
	uint64_t zc_count;	/* per-pass count */
	uint64_t zc_time;	/* per-pass time */
	uint64_t zc_next;	/* next time to call this function */
} ztest_shared_callstate_t;

static ztest_shared_callstate_t *ztest_shared_callstate;
#define	ZTEST_GET_SHARED_CALLSTATE(c) (&ztest_shared_callstate[c])

ztest_func_t ztest_dmu_read_write;
ztest_func_t ztest_dmu_write_parallel;
ztest_func_t ztest_dmu_object_alloc_free;
ztest_func_t ztest_dmu_object_next_chunk;
ztest_func_t ztest_dmu_commit_callbacks;
ztest_func_t ztest_zap;
ztest_func_t ztest_zap_parallel;
ztest_func_t ztest_zil_commit;
ztest_func_t ztest_zil_remount;
ztest_func_t ztest_dmu_read_write_zcopy;
ztest_func_t ztest_dmu_objset_create_destroy;
ztest_func_t ztest_dmu_prealloc;
ztest_func_t ztest_fzap;
ztest_func_t ztest_dmu_snapshot_create_destroy;
ztest_func_t ztest_dsl_prop_get_set;
ztest_func_t ztest_spa_prop_get_set;
ztest_func_t ztest_spa_create_destroy;
ztest_func_t ztest_fault_inject;
ztest_func_t ztest_dmu_snapshot_hold;
ztest_func_t ztest_mmp_enable_disable;
ztest_func_t ztest_scrub;
ztest_func_t ztest_dsl_dataset_promote_busy;
ztest_func_t ztest_vdev_attach_detach;
ztest_func_t ztest_vdev_raidz_attach;
ztest_func_t ztest_vdev_LUN_growth;
ztest_func_t ztest_vdev_add_remove;
ztest_func_t ztest_vdev_class_add;
ztest_func_t ztest_vdev_aux_add_remove;
ztest_func_t ztest_split_pool;
ztest_func_t ztest_reguid;
ztest_func_t ztest_spa_upgrade;
ztest_func_t ztest_device_removal;
ztest_func_t ztest_spa_checkpoint_create_discard;
ztest_func_t ztest_initialize;
ztest_func_t ztest_trim;
ztest_func_t ztest_blake3;
ztest_func_t ztest_fletcher;
ztest_func_t ztest_fletcher_incr;
ztest_func_t ztest_verify_dnode_bt;

static uint64_t zopt_always = 0ULL * NANOSEC;		/* all the time */
static uint64_t zopt_incessant = 1ULL * NANOSEC / 10;	/* every 1/10 second */
static uint64_t zopt_often = 1ULL * NANOSEC;		/* every second */
static uint64_t zopt_sometimes = 10ULL * NANOSEC;	/* every 10 seconds */
static uint64_t zopt_rarely = 60ULL * NANOSEC;		/* every 60 seconds */

#define	ZTI_INIT(func, iters, interval) \
	{   .zi_func = (func), \
	    .zi_iters = (iters), \
	    .zi_interval = (interval), \
	    .zi_funcname = # func }

static ztest_info_t ztest_info[] = {
	ZTI_INIT(ztest_dmu_read_write, 1, &zopt_always),
	ZTI_INIT(ztest_dmu_write_parallel, 10, &zopt_always),
	ZTI_INIT(ztest_dmu_object_alloc_free, 1, &zopt_always),
	ZTI_INIT(ztest_dmu_object_next_chunk, 1, &zopt_sometimes),
	ZTI_INIT(ztest_dmu_commit_callbacks, 1, &zopt_always),
	ZTI_INIT(ztest_zap, 30, &zopt_always),
	ZTI_INIT(ztest_zap_parallel, 100, &zopt_always),
	ZTI_INIT(ztest_split_pool, 1, &zopt_sometimes),
	ZTI_INIT(ztest_zil_commit, 1, &zopt_incessant),
	ZTI_INIT(ztest_zil_remount, 1, &zopt_sometimes),
	ZTI_INIT(ztest_dmu_read_write_zcopy, 1, &zopt_often),
	ZTI_INIT(ztest_dmu_objset_create_destroy, 1, &zopt_often),
	ZTI_INIT(ztest_dsl_prop_get_set, 1, &zopt_often),
	ZTI_INIT(ztest_spa_prop_get_set, 1, &zopt_sometimes),
#if 0
	ZTI_INIT(ztest_dmu_prealloc, 1, &zopt_sometimes),
#endif
	ZTI_INIT(ztest_fzap, 1, &zopt_sometimes),
	ZTI_INIT(ztest_dmu_snapshot_create_destroy, 1, &zopt_sometimes),
	ZTI_INIT(ztest_spa_create_destroy, 1, &zopt_sometimes),
	ZTI_INIT(ztest_fault_inject, 1, &zopt_sometimes),
	ZTI_INIT(ztest_dmu_snapshot_hold, 1, &zopt_sometimes),
	ZTI_INIT(ztest_mmp_enable_disable, 1, &zopt_sometimes),
	ZTI_INIT(ztest_reguid, 1, &zopt_rarely),
	ZTI_INIT(ztest_scrub, 1, &zopt_rarely),
	ZTI_INIT(ztest_spa_upgrade, 1, &zopt_rarely),
	ZTI_INIT(ztest_dsl_dataset_promote_busy, 1, &zopt_rarely),
	ZTI_INIT(ztest_vdev_attach_detach, 1, &zopt_sometimes),
	ZTI_INIT(ztest_vdev_raidz_attach, 1, &zopt_sometimes),
	ZTI_INIT(ztest_vdev_LUN_growth, 1, &zopt_rarely),
	ZTI_INIT(ztest_vdev_add_remove, 1, &ztest_opts.zo_vdevtime),
	ZTI_INIT(ztest_vdev_class_add, 1, &ztest_opts.zo_vdevtime),
	ZTI_INIT(ztest_vdev_aux_add_remove, 1, &ztest_opts.zo_vdevtime),
	ZTI_INIT(ztest_device_removal, 1, &zopt_sometimes),
	ZTI_INIT(ztest_spa_checkpoint_create_discard, 1, &zopt_rarely),
	ZTI_INIT(ztest_initialize, 1, &zopt_sometimes),
	ZTI_INIT(ztest_trim, 1, &zopt_sometimes),
	ZTI_INIT(ztest_blake3, 1, &zopt_rarely),
	ZTI_INIT(ztest_fletcher, 1, &zopt_rarely),
	ZTI_INIT(ztest_fletcher_incr, 1, &zopt_rarely),
	ZTI_INIT(ztest_verify_dnode_bt, 1, &zopt_sometimes),
};

#define	ZTEST_FUNCS	(sizeof (ztest_info) / sizeof (ztest_info_t))

/*
 * The following struct is used to hold a list of uncalled commit callbacks.
 * The callbacks are ordered by txg number.
 */
typedef struct ztest_cb_list {
	kmutex_t zcl_callbacks_lock;
	list_t zcl_callbacks;
} ztest_cb_list_t;

/*
 * Stuff we need to share writably between parent and child.
 */
typedef struct ztest_shared {
	boolean_t zs_do_init;
	hrtime_t zs_proc_start;
	hrtime_t zs_proc_stop;
	hrtime_t zs_thread_start;
	hrtime_t zs_thread_stop;
	hrtime_t zs_thread_kill;
	uint64_t zs_enospc_count;
	uint64_t zs_vdev_next_leaf;
	uint64_t zs_vdev_aux;
	uint64_t zs_alloc;
	uint64_t zs_space;
	uint64_t zs_splits;
	uint64_t zs_mirrors;
	uint64_t zs_metaslab_sz;
	uint64_t zs_metaslab_df_alloc_threshold;
	uint64_t zs_guid;
} ztest_shared_t;

#define	ID_PARALLEL	-1ULL

static char ztest_dev_template[] = "%s/%s.%llua";
static char ztest_aux_template[] = "%s/%s.%s.%llu";
static ztest_shared_t *ztest_shared;

static spa_t *ztest_spa = NULL;
static ztest_ds_t *ztest_ds;

static kmutex_t ztest_vdev_lock;
static boolean_t ztest_device_removal_active = B_FALSE;
static boolean_t ztest_pool_scrubbed = B_FALSE;
static kmutex_t ztest_checkpoint_lock;

/*
 * The ztest_name_lock protects the pool and dataset namespace used by
 * the individual tests. To modify the namespace, consumers must grab
 * this lock as writer. Grabbing the lock as reader will ensure that the
 * namespace does not change while the lock is held.
 */
static pthread_rwlock_t ztest_name_lock;

static boolean_t ztest_dump_core = B_TRUE;
static boolean_t ztest_exiting;

/* Global commit callback list */
static ztest_cb_list_t zcl;
/* Commit cb delay */
static uint64_t zc_min_txg_delay = UINT64_MAX;
static int zc_cb_counter = 0;

/*
 * Minimum number of commit callbacks that need to be registered for us to
 * check whether the minimum txg delay is acceptable.
 */
#define	ZTEST_COMMIT_CB_MIN_REG	100

/*
 * If a number of txgs equal to this threshold have been created after a commit
 * callback has been registered but not called, then we assume there is an
 * implementation bug.
 */
#define	ZTEST_COMMIT_CB_THRESH	(TXG_CONCURRENT_STATES + 1000)

enum ztest_object {
	ZTEST_META_DNODE = 0,
	ZTEST_DIROBJ,
	ZTEST_OBJECTS
};

static __attribute__((noreturn)) void usage(boolean_t requested);
static int ztest_scrub_impl(spa_t *spa);

/*
 * These libumem hooks provide a reasonable set of defaults for the allocator's
 * debugging facilities.
 */
const char *
_umem_debug_init(void)
{
	return ("default,verbose"); /* $UMEM_DEBUG setting */
}

const char *
_umem_logging_init(void)
{
	return ("fail,contents"); /* $UMEM_LOGGING setting */
}

static void
dump_debug_buffer(void)
{
	ssize_t ret __attribute__((unused));

	if (!ztest_opts.zo_dump_dbgmsg)
		return;

	/*
	 * We use write() instead of printf() so that this function
	 * is safe to call from a signal handler.
	 */
	ret = write(STDOUT_FILENO, "\n", 1);
	zfs_dbgmsg_print("ztest");
}

#define	BACKTRACE_SZ	100

static void sig_handler(int signo)
{
	struct sigaction action;
#if (__GLIBC__ && !__UCLIBC__) /* backtrace() is a GNU extension */
	int nptrs;
	void *buffer[BACKTRACE_SZ];

	nptrs = backtrace(buffer, BACKTRACE_SZ);
	backtrace_symbols_fd(buffer, nptrs, STDERR_FILENO);
#endif
	dump_debug_buffer();

	/*
	 * Restore default action and re-raise signal so SIGSEGV and
	 * SIGABRT can trigger a core dump.
	 */
	action.sa_handler = SIG_DFL;
	sigemptyset(&action.sa_mask);
	action.sa_flags = 0;
	(void) sigaction(signo, &action, NULL);
	raise(signo);
}

#define	FATAL_MSG_SZ	1024

static const char *fatal_msg;
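/*
 * Format a fatal error message (optionally appending strerror(errno)),
 * print it to stderr, then either abort() to produce a core or dump the
 * debug buffer and exit.
 */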
static __attribute__((format(printf, 2, 3))) __attribute__((noreturn)) void
fatal(int do_perror, const char *message, ...)
{
	va_list args;
	int save_errno = errno;
	char *buf;

	(void) fflush(stdout);
	buf = umem_alloc(FATAL_MSG_SZ, UMEM_NOFAIL);
	if (buf == NULL)
		goto out;

	va_start(args, message);
	(void) sprintf(buf, "ztest: ");
	/* LINTED */
	(void) vsprintf(buf + strlen(buf), message, args);
	va_end(args);
	if (do_perror) {
		(void) snprintf(buf + strlen(buf), FATAL_MSG_SZ - strlen(buf),
		    ": %s", strerror(save_errno));
	}
	(void) fprintf(stderr, "%s\n", buf);
	fatal_msg = buf;			/* to ease debugging */

out:
	if (ztest_dump_core)
		abort();
	else
		dump_debug_buffer();

	exit(3);
}

static int
str2shift(const char *buf)
{
	const char *ends = "BKMGTPEZ";
	int i;

	if (buf[0] == '\0')
		return (0);
	for (i = 0; i < strlen(ends); i++) {
		if (toupper(buf[0]) == ends[i])
			break;
	}
	if (i == strlen(ends)) {
		(void) fprintf(stderr, "ztest: invalid bytes suffix: %s\n",
		    buf);
		usage(B_FALSE);
	}
	if (buf[1] == '\0' || (toupper(buf[1]) == 'B' && buf[2] == '\0')) {
		return (10*i);
	}
	(void) fprintf(stderr, "ztest: invalid bytes suffix: %s\n", buf);
	usage(B_FALSE);
}
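/*
 * Convert a human-readable size such as "128K" or "2.5G" into a uint64_t
 * byte count (e.g. "128K" -> 131072, "2.5G" -> 2684354560), using
 * str2shift() above to interpret the binary suffix.
 */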
static uint64_t
nicenumtoull(const char *buf)
{
	char *end;
	uint64_t val;

	val = strtoull(buf, &end, 0);
	if (end == buf) {
		(void) fprintf(stderr, "ztest: bad numeric value: %s\n", buf);
		usage(B_FALSE);
	} else if (end[0] == '.') {
		double fval = strtod(buf, &end);
		fval *= pow(2, str2shift(end));
		/*
		 * UINT64_MAX is not exactly representable as a double.
		 * The closest representation is UINT64_MAX + 1, so we
		 * use a >= comparison instead of > for the bounds check.
		 */
		if (fval >= (double)UINT64_MAX) {
			(void) fprintf(stderr, "ztest: value too large: %s\n",
			    buf);
			usage(B_FALSE);
		}
		val = (uint64_t)fval;
	} else {
		int shift = str2shift(end);
		if (shift >= 64 || (val << shift) >> shift != val) {
			(void) fprintf(stderr, "ztest: value too large: %s\n",
			    buf);
			usage(B_FALSE);
		}
		val <<= shift;
	}
	return (val);
}

typedef struct ztest_option {
	const char short_opt;
	const char *long_opt;
	const char *long_opt_param;
	const char *comment;
	unsigned int default_int;
	const char *default_str;
} ztest_option_t;

/*
 * The following option_table is used for generating the usage info as well as
 * the long and short option information for calling getopt_long().
 */
static ztest_option_t option_table[] = {
	{ 'v',	"vdevs", "INTEGER", "Number of vdevs", DEFAULT_VDEV_COUNT,
	    NULL},
	{ 's',	"vdev-size", "INTEGER", "Size of each vdev",
	    NO_DEFAULT, DEFAULT_VDEV_SIZE_STR},
	{ 'a',	"alignment-shift", "INTEGER",
	    "Alignment shift; use 0 for random", DEFAULT_ASHIFT, NULL},
	{ 'm',	"mirror-copies", "INTEGER", "Number of mirror copies",
	    DEFAULT_MIRRORS, NULL},
	{ 'r',	"raid-disks", "INTEGER", "Number of raidz/draid disks",
	    DEFAULT_RAID_CHILDREN, NULL},
	{ 'R',	"raid-parity", "INTEGER", "Raid parity",
	    DEFAULT_RAID_PARITY, NULL},
	{ 'K',	"raid-kind", "raidz|eraidz|draid|random", "Raid kind",
	    NO_DEFAULT, "random"},
	{ 'D',	"draid-data", "INTEGER", "Number of draid data drives",
	    DEFAULT_DRAID_DATA, NULL},
	{ 'S',	"draid-spares", "INTEGER", "Number of draid spares",
	    DEFAULT_DRAID_SPARES, NULL},
	{ 'd',	"datasets", "INTEGER", "Number of datasets",
	    DEFAULT_DATASETS_COUNT, NULL},
	{ 't',	"threads", "INTEGER", "Number of ztest threads",
	    DEFAULT_THREADS, NULL},
	{ 'g',	"gang-block-threshold", "INTEGER",
	    "Metaslab gang block threshold",
	    NO_DEFAULT, DEFAULT_FORCE_GANGING_STR},
	{ 'i',	"init-count", "INTEGER", "Number of times to initialize pool",
	    DEFAULT_INITS, NULL},
	{ 'k',	"kill-percentage", "INTEGER", "Kill percentage",
	    NO_DEFAULT, DEFAULT_KILLRATE_STR},
	{ 'p',	"pool-name", "STRING", "Pool name",
	    NO_DEFAULT, DEFAULT_POOL},
	{ 'f',	"vdev-file-directory", "PATH", "File directory for vdev files",
	    NO_DEFAULT, DEFAULT_VDEV_DIR},
	{ 'M',	"multi-host", NULL,
	    "Multi-host; simulate pool imported on remote host",
	    NO_DEFAULT, NULL},
	{ 'E',	"use-existing-pool", NULL,
	    "Use existing pool instead of creating new one", NO_DEFAULT, NULL},
	{ 'T',	"run-time", "INTEGER", "Total run time",
	    NO_DEFAULT, DEFAULT_RUN_TIME_STR},
	{ 'P',	"pass-time", "INTEGER", "Time per pass",
	    NO_DEFAULT, DEFAULT_PASS_TIME_STR},
	{ 'F',	"freeze-loops", "INTEGER", "Max loops in spa_freeze()",
	    DEFAULT_MAX_LOOPS, NULL},
	{ 'B',	"alt-ztest", "PATH", "Alternate ztest path",
	    NO_DEFAULT, NULL},
	{ 'C',	"vdev-class-state", "on|off|random", "vdev class state",
	    NO_DEFAULT, "random"},
	{ 'X',	"raidz-expansion", NULL,
	    "Perform a dedicated raidz expansion test",
	    NO_DEFAULT, NULL},
	{ 'o',	"option", "\"OPTION=INTEGER\"",
	    "Set global variable to an unsigned 32-bit integer value",
	    NO_DEFAULT, NULL},
	{ 'G',	"dump-debug-msg", NULL,
	    "Dump zfs_dbgmsg buffer before exiting due to an error",
	    NO_DEFAULT, NULL},
	{ 'V',	"verbose", NULL,
	    "Verbose (use multiple times for ever more verbosity)",
	    NO_DEFAULT, NULL},
	{ 'h',	"help",	NULL, "Show this help",
	    NO_DEFAULT, NULL},
	{0, 0, 0, 0, 0, 0}
};

static struct option *long_opts = NULL;
static char *short_opts = NULL;
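/*
 * Build the long_opts and short_opts tables consumed by getopt_long() from
 * option_table above: every entry contributes one long option and one short
 * option character, followed by ':' when the option takes an argument.
 */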
static void
init_options(void)
{
	ASSERT3P(long_opts, ==, NULL);
	ASSERT3P(short_opts, ==, NULL);

	int count = sizeof (option_table) / sizeof (option_table[0]);
	long_opts = umem_alloc(sizeof (struct option) * count, UMEM_NOFAIL);

	short_opts = umem_alloc(sizeof (char) * 2 * count, UMEM_NOFAIL);
	int short_opt_index = 0;

	for (int i = 0; i < count; i++) {
		long_opts[i].val = option_table[i].short_opt;
		long_opts[i].name = option_table[i].long_opt;
		long_opts[i].has_arg = option_table[i].long_opt_param != NULL
		    ? required_argument : no_argument;
		long_opts[i].flag = NULL;
		short_opts[short_opt_index++] = option_table[i].short_opt;
		if (option_table[i].long_opt_param != NULL) {
			short_opts[short_opt_index++] = ':';
		}
	}
}

static void
fini_options(void)
{
	int count = sizeof (option_table) / sizeof (option_table[0]);

	umem_free(long_opts, sizeof (struct option) * count);
	umem_free(short_opts, sizeof (char) * 2 * count);

	long_opts = NULL;
	short_opts = NULL;
}

static __attribute__((noreturn)) void
usage(boolean_t requested)
{
	char option[80];
	FILE *fp = requested ? stdout : stderr;

	(void) fprintf(fp, "Usage: %s [OPTIONS...]\n", DEFAULT_POOL);
	for (int i = 0; option_table[i].short_opt != 0; i++) {
		if (option_table[i].long_opt_param != NULL) {
			(void) sprintf(option, "  -%c --%s=%s",
			    option_table[i].short_opt,
			    option_table[i].long_opt,
			    option_table[i].long_opt_param);
		} else {
			(void) sprintf(option, "  -%c --%s",
			    option_table[i].short_opt,
			    option_table[i].long_opt);
		}
		(void) fprintf(fp, " %-43s%s", option,
		    option_table[i].comment);

		if (option_table[i].long_opt_param != NULL) {
			if (option_table[i].default_str != NULL) {
				(void) fprintf(fp, " (default: %s)",
				    option_table[i].default_str);
			} else if (option_table[i].default_int != NO_DEFAULT) {
				(void) fprintf(fp, " (default: %u)",
				    option_table[i].default_int);
			}
		}
		(void) fprintf(fp, "\n");
	}
	exit(requested ? 0 : 1);
}

static uint64_t
ztest_random(uint64_t range)
{
	uint64_t r;

	ASSERT3S(ztest_fd_rand, >=, 0);

	if (range == 0)
		return (0);

	if (read(ztest_fd_rand, &r, sizeof (r)) != sizeof (r))
		fatal(B_TRUE, "short read from /dev/urandom");

	return (r % range);
}
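/*
 * Parse a '-C name=value' argument.  Only the "special" vdev class is
 * currently recognized, with a state of on, off, or random.
 */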
static void
ztest_parse_name_value(const char *input, ztest_shared_opts_t *zo)
{
	char name[32];
	char *value;
	int state = ZTEST_VDEV_CLASS_RND;

	(void) strlcpy(name, input, sizeof (name));

	value = strchr(name, '=');
	if (value == NULL) {
		(void) fprintf(stderr, "missing value in property=value "
		    "'-C' argument (%s)\n", input);
		usage(B_FALSE);
	}
	*(value) = '\0';
	value++;

	if (strcmp(value, "on") == 0) {
		state = ZTEST_VDEV_CLASS_ON;
	} else if (strcmp(value, "off") == 0) {
		state = ZTEST_VDEV_CLASS_OFF;
	} else if (strcmp(value, "random") == 0) {
		state = ZTEST_VDEV_CLASS_RND;
	} else {
		(void) fprintf(stderr, "invalid property value '%s'\n", value);
		usage(B_FALSE);
	}

	if (strcmp(name, "special") == 0) {
		zo->zo_special_vdevs = state;
	} else {
		(void) fprintf(stderr, "invalid property name '%s'\n", name);
		usage(B_FALSE);
	}
	if (zo->zo_verbose >= 3)
		(void) printf("%s vdev state is '%s'\n", name, value);
}

static void
process_options(int argc, char **argv)
{
	char *path;
	ztest_shared_opts_t *zo = &ztest_opts;

	int opt;
	uint64_t value;
	const char *raid_kind = "random";

	memcpy(zo, &ztest_opts_defaults, sizeof (*zo));

	init_options();

	while ((opt = getopt_long(argc, argv, short_opts, long_opts,
	    NULL)) != EOF) {
		value = 0;
		switch (opt) {
		case 'v':
		case 's':
		case 'a':
		case 'm':
		case 'r':
		case 'R':
		case 'D':
		case 'S':
		case 'd':
		case 't':
		case 'g':
		case 'i':
		case 'k':
		case 'T':
		case 'P':
		case 'F':
			value = nicenumtoull(optarg);
		}
		switch (opt) {
		case 'v':
			zo->zo_vdevs = value;
			break;
		case 's':
			zo->zo_vdev_size = MAX(SPA_MINDEVSIZE, value);
			break;
		case 'a':
			zo->zo_ashift = value;
			break;
		case 'm':
			zo->zo_mirrors = value;
			break;
		case 'r':
			zo->zo_raid_children = MAX(1, value);
			break;
		case 'R':
			zo->zo_raid_parity = MIN(MAX(value, 1), 3);
			break;
		case 'K':
			raid_kind = optarg;
			break;
		case 'D':
			zo->zo_draid_data = MAX(1, value);
			break;
		case 'S':
			zo->zo_draid_spares = MAX(1, value);
			break;
		case 'd':
			zo->zo_datasets = MAX(1, value);
			break;
		case 't':
			zo->zo_threads = MAX(1, value);
			break;
		case 'g':
			zo->zo_metaslab_force_ganging =
			    MAX(SPA_MINBLOCKSIZE << 1, value);
			break;
		case 'i':
			zo->zo_init = value;
			break;
		case 'k':
			zo->zo_killrate = value;
			break;
		case 'p':
			(void) strlcpy(zo->zo_pool, optarg,
			    sizeof (zo->zo_pool));
			break;
		case 'f':
			path = realpath(optarg, NULL);
			if (path == NULL) {
				(void) fprintf(stderr, "error: %s: %s\n",
				    optarg, strerror(errno));
				usage(B_FALSE);
			} else {
				(void) strlcpy(zo->zo_dir, path,
				    sizeof (zo->zo_dir));
				free(path);
			}
			break;
		case 'M':
			zo->zo_mmp_test = 1;
			break;
		case 'V':
			zo->zo_verbose++;
			break;
		case 'X':
			zo->zo_raidz_expand_test = RAIDZ_EXPAND_REQUESTED;
			break;
		case 'E':
			zo->zo_init = 0;
			break;
		case 'T':
			zo->zo_time = value;
			break;
		case 'P':
			zo->zo_passtime = MAX(1, value);
			break;
		case 'F':
			zo->zo_maxloops = MAX(1, value);
			break;
		case 'B':
			(void) strlcpy(zo->zo_alt_ztest, optarg,
			    sizeof (zo->zo_alt_ztest));
			break;
		case 'C':
			ztest_parse_name_value(optarg, zo);
			break;
		case 'o':
			if (zo->zo_gvars_count >= ZO_GVARS_MAX_COUNT) {
				(void) fprintf(stderr,
				    "max global var count (%zu) exceeded\n",
				    ZO_GVARS_MAX_COUNT);
				usage(B_FALSE);
			}
			char *v = zo->zo_gvars[zo->zo_gvars_count];
			if (strlcpy(v, optarg, ZO_GVARS_MAX_ARGLEN) >=
			    ZO_GVARS_MAX_ARGLEN) {
				(void) fprintf(stderr,
				    "global var option '%s' is too long\n",
				    optarg);
				usage(B_FALSE);
			}
			zo->zo_gvars_count++;
			break;
		case 'G':
			zo->zo_dump_dbgmsg = 1;
			break;
		case 'h':
			usage(B_TRUE);
			break;
		case '?':
		default:
			usage(B_FALSE);
			break;
		}
	}

	fini_options();

	/* Force compatible options for raidz expansion run */
	if (zo->zo_raidz_expand_test == RAIDZ_EXPAND_REQUESTED) {
		zo->zo_mmp_test = 0;
		zo->zo_mirrors = 0;
		zo->zo_vdevs = 1;
		zo->zo_vdev_size = DEFAULT_VDEV_SIZE * 2;
		zo->zo_raid_do_expand = B_FALSE;
		raid_kind = "raidz";
	}

	if (strcmp(raid_kind, "random") == 0) {
		switch (ztest_random(3)) {
		case 0:
			raid_kind = "raidz";
			break;
		case 1:
			raid_kind = "eraidz";
			break;
		case 2:
			raid_kind = "draid";
			break;
		}

		if (ztest_opts.zo_verbose >= 3)
			(void) printf("choosing RAID type '%s'\n", raid_kind);
	}

	if (strcmp(raid_kind, "draid") == 0) {
		uint64_t min_devsize;

		/* With fewer disks use 256M, otherwise 128M is OK */
		min_devsize = (ztest_opts.zo_raid_children < 16) ?
		    (256ULL << 20) : (128ULL << 20);

		/* No top-level mirrors with dRAID for now */
		zo->zo_mirrors = 0;

		/* Use more appropriate defaults for dRAID */
		if (zo->zo_vdevs == ztest_opts_defaults.zo_vdevs)
			zo->zo_vdevs = 1;
		if (zo->zo_raid_children ==
		    ztest_opts_defaults.zo_raid_children)
			zo->zo_raid_children = 16;
		if (zo->zo_ashift < 12)
			zo->zo_ashift = 12;
		if (zo->zo_vdev_size < min_devsize)
			zo->zo_vdev_size = min_devsize;

		if (zo->zo_draid_data + zo->zo_raid_parity >
		    zo->zo_raid_children - zo->zo_draid_spares) {
			(void) fprintf(stderr, "error: too few draid "
			    "children (%d) for stripe width (%d)\n",
			    zo->zo_raid_children,
			    zo->zo_draid_data + zo->zo_raid_parity);
			usage(B_FALSE);
		}

		(void) strlcpy(zo->zo_raid_type, VDEV_TYPE_DRAID,
		    sizeof (zo->zo_raid_type));

	} else if (strcmp(raid_kind, "eraidz") == 0) {
		/* using eraidz (expandable raidz) */
		zo->zo_raid_do_expand = B_TRUE;

		/* tests expect top-level to be raidz */
		zo->zo_mirrors = 0;
		zo->zo_vdevs = 1;

		/* Make sure parity is less than data columns */
		zo->zo_raid_parity = MIN(zo->zo_raid_parity,
		    zo->zo_raid_children - 1);

	} else /* using raidz */ {
		ASSERT0(strcmp(raid_kind, "raidz"));

		zo->zo_raid_parity = MIN(zo->zo_raid_parity,
		    zo->zo_raid_children - 1);
	}

	zo->zo_vdevtime =
	    (zo->zo_vdevs > 0 ? zo->zo_time * NANOSEC / zo->zo_vdevs :
	    UINT64_MAX >> 2);

	if (*zo->zo_alt_ztest) {
		const char *invalid_what = "ztest";
		char *val = zo->zo_alt_ztest;
		if (0 != access(val, X_OK) ||
		    (strrchr(val, '/') == NULL && (errno == EINVAL)))
			goto invalid;

		int dirlen = strrchr(val, '/') - val;
		strlcpy(zo->zo_alt_libpath, val,
		    MIN(sizeof (zo->zo_alt_libpath), dirlen + 1));
		invalid_what = "library path", val = zo->zo_alt_libpath;
		if (strrchr(val, '/') == NULL && (errno == EINVAL))
			goto invalid;
		*strrchr(val, '/') = '\0';
		strlcat(val, "/lib", sizeof (zo->zo_alt_libpath));

		if (0 != access(zo->zo_alt_libpath, X_OK))
			goto invalid;
		return;

invalid:
		ztest_dump_core = B_FALSE;
		fatal(B_TRUE, "invalid alternate %s %s", invalid_what, val);
	}
}

static void
ztest_kill(ztest_shared_t *zs)
{
	zs->zs_alloc = metaslab_class_get_alloc(spa_normal_class(ztest_spa));
	zs->zs_space = metaslab_class_get_space(spa_normal_class(ztest_spa));

	/*
	 * Before we kill ourselves, make sure that the config is updated.
	 * See comment above spa_write_cachefile().
	 */
	if (raidz_expand_pause_point != RAIDZ_EXPAND_PAUSE_NONE) {
		if (mutex_tryenter(&spa_namespace_lock)) {
			spa_write_cachefile(ztest_spa, B_FALSE, B_FALSE,
			    B_FALSE);
			mutex_exit(&spa_namespace_lock);

			ztest_scratch_state->zs_raidz_scratch_verify_pause =
			    raidz_expand_pause_point;
		} else {
			/*
			 * Do not verify the scratch object if the
			 * spa_namespace_lock cannot be acquired, as that
			 * can cause a deadlock in spa_config_update().
			 */
			raidz_expand_pause_point = RAIDZ_EXPAND_PAUSE_NONE;

			return;
		}
	} else {
		mutex_enter(&spa_namespace_lock);
		spa_write_cachefile(ztest_spa, B_FALSE, B_FALSE, B_FALSE);
		mutex_exit(&spa_namespace_lock);
	}

	(void) raise(SIGKILL);
}

static void
ztest_record_enospc(const char *s)
{
	(void) s;
	ztest_shared->zs_enospc_count++;
}

static uint64_t
ztest_get_ashift(void)
{
	if (ztest_opts.zo_ashift == 0)
		return (SPA_MINBLOCKSHIFT + ztest_random(5));
	return (ztest_opts.zo_ashift);
}
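/*
 * dRAID distributed spares are named draid<parity>-<top-level vdev>-<spare>,
 * e.g. "draid1-2-0", which is the pattern matched below.
 */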
static boolean_t
ztest_is_draid_spare(const char *name)
{
	uint64_t spare_id = 0, parity = 0, vdev_id = 0;

	if (sscanf(name, VDEV_TYPE_DRAID "%"PRIu64"-%"PRIu64"-%"PRIu64"",
	    &parity, &vdev_id, &spare_id) == 3) {
		return (B_TRUE);
	}

	return (B_FALSE);
}

static nvlist_t *
make_vdev_file(const char *path, const char *aux, const char *pool,
    size_t size, uint64_t ashift)
{
	char *pathbuf = NULL;
	uint64_t vdev;
	nvlist_t *file;
	boolean_t draid_spare = B_FALSE;

	if (ashift == 0)
		ashift = ztest_get_ashift();

	if (path == NULL) {
		pathbuf = umem_alloc(MAXPATHLEN, UMEM_NOFAIL);
		path = pathbuf;

		if (aux != NULL) {
			vdev = ztest_shared->zs_vdev_aux;
			(void) snprintf(pathbuf, MAXPATHLEN,
			    ztest_aux_template, ztest_opts.zo_dir,
			    pool == NULL ? ztest_opts.zo_pool : pool,
			    aux, vdev);
		} else {
			vdev = ztest_shared->zs_vdev_next_leaf++;
			(void) snprintf(pathbuf, MAXPATHLEN,
			    ztest_dev_template, ztest_opts.zo_dir,
			    pool == NULL ? ztest_opts.zo_pool : pool, vdev);
		}
	} else {
		draid_spare = ztest_is_draid_spare(path);
	}

	if (size != 0 && !draid_spare) {
		int fd = open(path, O_RDWR | O_CREAT | O_TRUNC, 0666);
		if (fd == -1)
			fatal(B_TRUE, "can't open %s", path);
		if (ftruncate(fd, size) != 0)
			fatal(B_TRUE, "can't ftruncate %s", path);
		(void) close(fd);
	}

	file = fnvlist_alloc();
	fnvlist_add_string(file, ZPOOL_CONFIG_TYPE,
	    draid_spare ? VDEV_TYPE_DRAID_SPARE : VDEV_TYPE_FILE);
	fnvlist_add_string(file, ZPOOL_CONFIG_PATH, path);
	fnvlist_add_uint64(file, ZPOOL_CONFIG_ASHIFT, ashift);
	umem_free(pathbuf, MAXPATHLEN);

	return (file);
}
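/*
 * Build the nvlist for a raidz or dRAID vdev with r children; with fewer
 * than two children this degenerates to a single file vdev.
 */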
static nvlist_t *
make_vdev_raid(const char *path, const char *aux, const char *pool,
    size_t size, uint64_t ashift, int r)
{
	nvlist_t *raid, **child;
	int c;

	if (r < 2)
		return (make_vdev_file(path, aux, pool, size, ashift));
	child = umem_alloc(r * sizeof (nvlist_t *), UMEM_NOFAIL);

	for (c = 0; c < r; c++)
		child[c] = make_vdev_file(path, aux, pool, size, ashift);

	raid = fnvlist_alloc();
	fnvlist_add_string(raid, ZPOOL_CONFIG_TYPE,
	    ztest_opts.zo_raid_type);
	fnvlist_add_uint64(raid, ZPOOL_CONFIG_NPARITY,
	    ztest_opts.zo_raid_parity);
	fnvlist_add_nvlist_array(raid, ZPOOL_CONFIG_CHILDREN,
	    (const nvlist_t **)child, r);

	if (strcmp(ztest_opts.zo_raid_type, VDEV_TYPE_DRAID) == 0) {
		uint64_t ndata = ztest_opts.zo_draid_data;
		uint64_t nparity = ztest_opts.zo_raid_parity;
		uint64_t nspares = ztest_opts.zo_draid_spares;
		uint64_t children = ztest_opts.zo_raid_children;
		uint64_t ngroups = 1;

		/*
		 * Calculate the minimum number of groups required to fill a
		 * slice. This is the LCM of the stripe width (data + parity)
		 * and the number of data drives (children - spares).
		 */
		while (ngroups * (ndata + nparity) % (children - nspares) != 0)
			ngroups++;

		/* Store the basic dRAID configuration. */
		fnvlist_add_uint64(raid, ZPOOL_CONFIG_DRAID_NDATA, ndata);
		fnvlist_add_uint64(raid, ZPOOL_CONFIG_DRAID_NSPARES, nspares);
		fnvlist_add_uint64(raid, ZPOOL_CONFIG_DRAID_NGROUPS, ngroups);
	}

	for (c = 0; c < r; c++)
		fnvlist_free(child[c]);

	umem_free(child, r * sizeof (nvlist_t *));

	return (raid);
}

static nvlist_t *
make_vdev_mirror(const char *path, const char *aux, const char *pool,
    size_t size, uint64_t ashift, int r, int m)
{
	nvlist_t *mirror, **child;
	int c;

	if (m < 1)
		return (make_vdev_raid(path, aux, pool, size, ashift, r));

	child = umem_alloc(m * sizeof (nvlist_t *), UMEM_NOFAIL);

	for (c = 0; c < m; c++)
		child[c] = make_vdev_raid(path, aux, pool, size, ashift, r);

	mirror = fnvlist_alloc();
	fnvlist_add_string(mirror, ZPOOL_CONFIG_TYPE, VDEV_TYPE_MIRROR);
	fnvlist_add_nvlist_array(mirror, ZPOOL_CONFIG_CHILDREN,
	    (const nvlist_t **)child, m);

	for (c = 0; c < m; c++)
		fnvlist_free(child[c]);

	umem_free(child, m * sizeof (nvlist_t *));

	return (mirror);
}
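/*
 * Vdev trees are assembled bottom-up: file vdevs feed make_vdev_raid(),
 * raidz/draid groups feed make_vdev_mirror(), and make_vdev_root() collects
 * t such mirrors (tagged with a log or allocation-class bias when requested)
 * under a single root nvlist.
 */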
static nvlist_t *
make_vdev_root(const char *path, const char *aux, const char *pool,
    size_t size, uint64_t ashift, const char *class, int r, int m, int t)
{
	nvlist_t *root, **child;
	int c;
	boolean_t log;

	ASSERT3S(t, >, 0);

	log = (class != NULL && strcmp(class, "log") == 0);

	child = umem_alloc(t * sizeof (nvlist_t *), UMEM_NOFAIL);

	for (c = 0; c < t; c++) {
		child[c] = make_vdev_mirror(path, aux, pool, size, ashift,
		    r, m);
		fnvlist_add_uint64(child[c], ZPOOL_CONFIG_IS_LOG, log);

		if (class != NULL && class[0] != '\0') {
			ASSERT(m > 1 || log);	/* expecting a mirror */
			fnvlist_add_string(child[c],
			    ZPOOL_CONFIG_ALLOCATION_BIAS, class);
		}
	}

	root = fnvlist_alloc();
	fnvlist_add_string(root, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT);
	fnvlist_add_nvlist_array(root, aux ? aux : ZPOOL_CONFIG_CHILDREN,
	    (const nvlist_t **)child, t);

	for (c = 0; c < t; c++)
		fnvlist_free(child[c]);

	umem_free(child, t * sizeof (nvlist_t *));

	return (root);
}

/*
 * Find a random spa version.  Returns back a random spa version in the
 * range [initial_version, SPA_VERSION_FEATURES].
 */
static uint64_t
ztest_random_spa_version(uint64_t initial_version)
{
	uint64_t version = initial_version;

	if (version <= SPA_VERSION_BEFORE_FEATURES) {
		version = version +
		    ztest_random(SPA_VERSION_BEFORE_FEATURES - version + 1);
	}

	if (version > SPA_VERSION_BEFORE_FEATURES)
		version = SPA_VERSION_FEATURES;

	ASSERT(SPA_VERSION_IS_SUPPORTED(version));
	return (version);
}

static int
ztest_random_blocksize(void)
{
	ASSERT3U(ztest_spa->spa_max_ashift, !=, 0);

	/*
	 * Choose a block size >= the ashift.
	 * If the SPA supports new MAXBLOCKSIZE, test up to 1MB blocks.
	 */
	int maxbs = SPA_OLD_MAXBLOCKSHIFT;
	if (spa_maxblocksize(ztest_spa) == SPA_MAXBLOCKSIZE)
		maxbs = 20;
	uint64_t block_shift =
	    ztest_random(maxbs - ztest_spa->spa_max_ashift + 1);
	return (1 << (SPA_MINBLOCKSHIFT + block_shift));
}

static int
ztest_random_dnodesize(void)
{
	int slots;
	int max_slots = spa_maxdnodesize(ztest_spa) >> DNODE_SHIFT;

	if (max_slots == DNODE_MIN_SLOTS)
		return (DNODE_MIN_SIZE);

	/*
	 * Weight the random distribution more heavily toward smaller
	 * dnode sizes since that is more likely to reflect real-world
	 * usage.
	 */
	ASSERT3U(max_slots, >, 4);
	switch (ztest_random(10)) {
	case 0:
		slots = 5 + ztest_random(max_slots - 4);
		break;
	case 1 ... 4:
		slots = 2 + ztest_random(3);
		break;
	default:
		slots = 1;
		break;
	}

	return (slots << DNODE_SHIFT);
}

static int
ztest_random_ibshift(void)
{
	return (DN_MIN_INDBLKSHIFT +
	    ztest_random(DN_MAX_INDBLKSHIFT - DN_MIN_INDBLKSHIFT + 1));
}

static uint64_t
ztest_random_vdev_top(spa_t *spa, boolean_t log_ok)
{
	uint64_t top;
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *tvd;

	ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);

	do {
		top = ztest_random(rvd->vdev_children);
		tvd = rvd->vdev_child[top];
	} while (!vdev_is_concrete(tvd) || (tvd->vdev_islog && !log_ok) ||
	    tvd->vdev_mg == NULL || tvd->vdev_mg->mg_class == NULL);

	return (top);
}

static uint64_t
ztest_random_dsl_prop(zfs_prop_t prop)
{
	uint64_t value;

	do {
		value = zfs_prop_random_value(prop, ztest_random(-1ULL));
	} while (prop == ZFS_PROP_CHECKSUM && value == ZIO_CHECKSUM_OFF);

	return (value);
}
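/*
 * Set a dataset property, or revert to the inherited value when 'inherit'
 * is set.  ENOSPC is recorded and returned rather than treated as fatal;
 * any other failure trips an assertion.
 */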
static int
ztest_dsl_prop_set_uint64(char *osname, zfs_prop_t prop, uint64_t value,
    boolean_t inherit)
{
	const char *propname = zfs_prop_to_name(prop);
	const char *valname;
	char *setpoint;
	uint64_t curval;
	int error;

	error = dsl_prop_set_int(osname, propname,
	    (inherit ? ZPROP_SRC_NONE : ZPROP_SRC_LOCAL), value);

	if (error == ENOSPC) {
		ztest_record_enospc(FTAG);
		return (error);
	}
	ASSERT0(error);

	setpoint = umem_alloc(MAXPATHLEN, UMEM_NOFAIL);
	VERIFY0(dsl_prop_get_integer(osname, propname, &curval, setpoint));

	if (ztest_opts.zo_verbose >= 6) {
		int err;

		err = zfs_prop_index_to_string(prop, curval, &valname);
		if (err)
			(void) printf("%s %s = %llu at '%s'\n", osname,
			    propname, (unsigned long long)curval, setpoint);
		else
			(void) printf("%s %s = %s at '%s'\n",
			    osname, propname, valname, setpoint);
	}
	umem_free(setpoint, MAXPATHLEN);

	return (error);
}

static int
ztest_spa_prop_set_uint64(zpool_prop_t prop, uint64_t value)
{
	spa_t *spa = ztest_spa;
	nvlist_t *props = NULL;
	int error;

	props = fnvlist_alloc();
	fnvlist_add_uint64(props, zpool_prop_to_name(prop), value);

	error = spa_prop_set(spa, props);

	fnvlist_free(props);

	if (error == ENOSPC) {
		ztest_record_enospc(FTAG);
		return (error);
	}
	ASSERT0(error);

	return (error);
}

static int
ztest_dmu_objset_own(const char *name, dmu_objset_type_t type,
    boolean_t readonly, boolean_t decrypt, const void *tag, objset_t **osp)
{
	int err;
	char *cp = NULL;
	char ddname[ZFS_MAX_DATASET_NAME_LEN];

	strlcpy(ddname, name, sizeof (ddname));
	cp = strchr(ddname, '@');
	if (cp != NULL)
		*cp = '\0';

	err = dmu_objset_own(name, type, readonly, decrypt, tag, osp);
	while (decrypt && err == EACCES) {
		dsl_crypto_params_t *dcp;
		nvlist_t *crypto_args = fnvlist_alloc();

		fnvlist_add_uint8_array(crypto_args, "wkeydata",
		    (uint8_t *)ztest_wkeydata, WRAPPING_KEY_LEN);
		VERIFY0(dsl_crypto_params_create_nvlist(DCP_CMD_NONE, NULL,
		    crypto_args, &dcp));
		err = spa_keystore_load_wkey(ddname, dcp, B_FALSE);
		/*
		 * Note: if there was an error loading, the wkey was not
		 * consumed, and needs to be freed.
		 */
		dsl_crypto_params_free(dcp, (err != 0));
		fnvlist_free(crypto_args);

		if (err == EINVAL) {
			/*
			 * We couldn't load a key for this dataset so try
			 * the parent.  This loop will eventually hit the
			 * encryption root since ztest only makes clones
			 * as children of their origin datasets.
			 */
			cp = strrchr(ddname, '/');
			if (cp == NULL)
				return (err);

			*cp = '\0';
			err = EACCES;
			continue;
		} else if (err != 0) {
			break;
		}

		err = dmu_objset_own(name, type, readonly, decrypt, tag, osp);
		break;
	}

	return (err);
}
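/*
 * Minimal reader/writer locks built from a mutex and condition variable.
 * They stand in for per-object and per-range locks (see the rl_t comment
 * above); writers wait for all readers and any previous writer to drain.
 */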
static void
ztest_rll_init(rll_t *rll)
{
	rll->rll_writer = NULL;
	rll->rll_readers = 0;
	mutex_init(&rll->rll_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&rll->rll_cv, NULL, CV_DEFAULT, NULL);
}

static void
ztest_rll_destroy(rll_t *rll)
{
	ASSERT3P(rll->rll_writer, ==, NULL);
	ASSERT0(rll->rll_readers);
	mutex_destroy(&rll->rll_lock);
	cv_destroy(&rll->rll_cv);
}

static void
ztest_rll_lock(rll_t *rll, rl_type_t type)
{
	mutex_enter(&rll->rll_lock);

	if (type == ZTRL_READER) {
		while (rll->rll_writer != NULL)
			(void) cv_wait(&rll->rll_cv, &rll->rll_lock);
		rll->rll_readers++;
	} else {
		while (rll->rll_writer != NULL || rll->rll_readers)
			(void) cv_wait(&rll->rll_cv, &rll->rll_lock);
		rll->rll_writer = curthread;
	}

	mutex_exit(&rll->rll_lock);
}

static void
ztest_rll_unlock(rll_t *rll)
{
	mutex_enter(&rll->rll_lock);

	if (rll->rll_writer) {
		ASSERT0(rll->rll_readers);
		rll->rll_writer = NULL;
	} else {
		ASSERT3S(rll->rll_readers, >, 0);
		ASSERT3P(rll->rll_writer, ==, NULL);
		rll->rll_readers--;
	}

	if (rll->rll_writer == NULL && rll->rll_readers == 0)
		cv_broadcast(&rll->rll_cv);

	mutex_exit(&rll->rll_lock);
}

static void
ztest_object_lock(ztest_ds_t *zd, uint64_t object, rl_type_t type)
{
	rll_t *rll = &zd->zd_object_lock[object & (ZTEST_OBJECT_LOCKS - 1)];

	ztest_rll_lock(rll, type);
}

static void
ztest_object_unlock(ztest_ds_t *zd, uint64_t object)
{
	rll_t *rll = &zd->zd_object_lock[object & (ZTEST_OBJECT_LOCKS - 1)];

	ztest_rll_unlock(rll);
}

static rl_t *
ztest_range_lock(ztest_ds_t *zd, uint64_t object, uint64_t offset,
    uint64_t size, rl_type_t type)
{
	uint64_t hash = object ^ (offset % (ZTEST_RANGE_LOCKS + 1));
	rll_t *rll = &zd->zd_range_lock[hash & (ZTEST_RANGE_LOCKS - 1)];
	rl_t *rl;

	rl = umem_alloc(sizeof (*rl), UMEM_NOFAIL);
	rl->rl_object = object;
	rl->rl_offset = offset;
	rl->rl_size = size;
	rl->rl_lock = rll;

	ztest_rll_lock(rll, type);

	return (rl);
}

static void
ztest_range_unlock(rl_t *rl)
{
	rll_t *rll = rl->rl_lock;

	ztest_rll_unlock(rll);

	umem_free(rl, sizeof (*rl));
}

static void
ztest_zd_init(ztest_ds_t *zd, ztest_shared_ds_t *szd, objset_t *os)
{
	zd->zd_os = os;
	zd->zd_zilog = dmu_objset_zil(os);
	zd->zd_shared = szd;
	dmu_objset_name(os, zd->zd_name);
	int l;

	if (zd->zd_shared != NULL)
		zd->zd_shared->zd_seq = 0;

	VERIFY0(pthread_rwlock_init(&zd->zd_zilog_lock, NULL));
	mutex_init(&zd->zd_dirobj_lock, NULL, MUTEX_DEFAULT, NULL);

	for (l = 0; l < ZTEST_OBJECT_LOCKS; l++)
		ztest_rll_init(&zd->zd_object_lock[l]);

	for (l = 0; l < ZTEST_RANGE_LOCKS; l++)
		ztest_rll_init(&zd->zd_range_lock[l]);
}

static void
ztest_zd_fini(ztest_ds_t *zd)
{
	int l;

	mutex_destroy(&zd->zd_dirobj_lock);
	(void) pthread_rwlock_destroy(&zd->zd_zilog_lock);

	for (l = 0; l < ZTEST_OBJECT_LOCKS; l++)
		ztest_rll_destroy(&zd->zd_object_lock[l]);

	for (l = 0; l < ZTEST_RANGE_LOCKS; l++)
		ztest_rll_destroy(&zd->zd_range_lock[l]);
}
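/*
 * Roughly one call in ten uses TXG_NOWAIT, which exercises the ERESTART
 * retry path in ztest_tx_assign() below.
 */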
#define	TXG_MIGHTWAIT	(ztest_random(10) == 0 ? TXG_NOWAIT : TXG_WAIT)

static uint64_t
ztest_tx_assign(dmu_tx_t *tx, uint64_t txg_how, const char *tag)
{
	uint64_t txg;
	int error;

	/*
	 * Attempt to assign tx to some transaction group.
	 */
	error = dmu_tx_assign(tx, txg_how);
	if (error) {
		if (error == ERESTART) {
			ASSERT3U(txg_how, ==, TXG_NOWAIT);
			dmu_tx_wait(tx);
		} else {
			ASSERT3U(error, ==, ENOSPC);
			ztest_record_enospc(tag);
		}
		dmu_tx_abort(tx);
		return (0);
	}
	txg = dmu_tx_get_txg(tx);
	ASSERT3U(txg, !=, 0);
	return (txg);
}

static void
ztest_bt_generate(ztest_block_tag_t *bt, objset_t *os, uint64_t object,
    uint64_t dnodesize, uint64_t offset, uint64_t gen, uint64_t txg,
    uint64_t crtxg)
{
	bt->bt_magic = BT_MAGIC;
	bt->bt_objset = dmu_objset_id(os);
	bt->bt_object = object;
	bt->bt_dnodesize = dnodesize;
	bt->bt_offset = offset;
	bt->bt_gen = gen;
	bt->bt_txg = txg;
	bt->bt_crtxg = crtxg;
}

static void
ztest_bt_verify(ztest_block_tag_t *bt, objset_t *os, uint64_t object,
    uint64_t dnodesize, uint64_t offset, uint64_t gen, uint64_t txg,
    uint64_t crtxg)
{
	ASSERT3U(bt->bt_magic, ==, BT_MAGIC);
	ASSERT3U(bt->bt_objset, ==, dmu_objset_id(os));
	ASSERT3U(bt->bt_object, ==, object);
	ASSERT3U(bt->bt_dnodesize, ==, dnodesize);
	ASSERT3U(bt->bt_offset, ==, offset);
	ASSERT3U(bt->bt_gen, <=, gen);
	ASSERT3U(bt->bt_txg, <=, txg);
	ASSERT3U(bt->bt_crtxg, ==, crtxg);
}

static ztest_block_tag_t *
ztest_bt_bonus(dmu_buf_t *db)
{
	dmu_object_info_t doi;
	ztest_block_tag_t *bt;

	dmu_object_info_from_db(db, &doi);
	ASSERT3U(doi.doi_bonus_size, <=, db->db_size);
	ASSERT3U(doi.doi_bonus_size, >=, sizeof (*bt));
	bt = (void *)((char *)db->db_data + doi.doi_bonus_size - sizeof (*bt));

	return (bt);
}

/*
 * Generate a token to fill up unused bonus buffer space.  Try to make
 * it unique to the object, generation, and offset to verify that data
 * is not getting overwritten by data from other dnodes.
 */
#define	ZTEST_BONUS_FILL_TOKEN(obj, ds, gen, offset)	\
	(((ds) << 48) | ((gen) << 32) | ((obj) << 8) | (offset))

/*
 * Fill up the unused bonus buffer region before the block tag with a
 * verifiable pattern. Filling the whole bonus area with non-zero data
 * helps ensure that all dnode traversal code properly skips the
 * interior regions of large dnodes.
 */
static void
ztest_fill_unused_bonus(dmu_buf_t *db, void *end, uint64_t obj,
    objset_t *os, uint64_t gen)
{
	uint64_t *bonusp;

	ASSERT(IS_P2ALIGNED((char *)end - (char *)db->db_data, 8));

	for (bonusp = db->db_data; bonusp < (uint64_t *)end; bonusp++) {
		uint64_t token = ZTEST_BONUS_FILL_TOKEN(obj, dmu_objset_id(os),
		    gen, bonusp - (uint64_t *)db->db_data);
		*bonusp = token;
	}
}

/*
 * Verify that the unused area of a bonus buffer is filled with the
 * expected tokens.
 */
static void
ztest_verify_unused_bonus(dmu_buf_t *db, void *end, uint64_t obj,
    objset_t *os, uint64_t gen)
{
	uint64_t *bonusp;

	for (bonusp = db->db_data; bonusp < (uint64_t *)end; bonusp++) {
		uint64_t token = ZTEST_BONUS_FILL_TOKEN(obj, dmu_objset_id(os),
		    gen, bonusp - (uint64_t *)db->db_data);
		VERIFY3U(*bonusp, ==, token);
	}
}

/*
 * ZIL logging ops
 */
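/*
 * ztest repurposes otherwise-unused lr_create_t fields to carry object
 * creation parameters (type, block size, indirect block shift, bonus type,
 * and dnode size) through the ZIL.
 */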
1918 */ 1919 static void 1920 ztest_verify_unused_bonus(dmu_buf_t *db, void *end, uint64_t obj, 1921 objset_t *os, uint64_t gen) 1922 { 1923 uint64_t *bonusp; 1924 1925 for (bonusp = db->db_data; bonusp < (uint64_t *)end; bonusp++) { 1926 uint64_t token = ZTEST_BONUS_FILL_TOKEN(obj, dmu_objset_id(os), 1927 gen, bonusp - (uint64_t *)db->db_data); 1928 VERIFY3U(*bonusp, ==, token); 1929 } 1930 } 1931 1932 /* 1933 * ZIL logging ops 1934 */ 1935 1936 #define lrz_type lr_mode 1937 #define lrz_blocksize lr_uid 1938 #define lrz_ibshift lr_gid 1939 #define lrz_bonustype lr_rdev 1940 #define lrz_dnodesize lr_crtime[1] 1941 1942 static void 1943 ztest_log_create(ztest_ds_t *zd, dmu_tx_t *tx, lr_create_t *lr) 1944 { 1945 char *name = (void *)(lr + 1); /* name follows lr */ 1946 size_t namesize = strlen(name) + 1; 1947 itx_t *itx; 1948 1949 if (zil_replaying(zd->zd_zilog, tx)) 1950 return; 1951 1952 itx = zil_itx_create(TX_CREATE, sizeof (*lr) + namesize); 1953 memcpy(&itx->itx_lr + 1, &lr->lr_common + 1, 1954 sizeof (*lr) + namesize - sizeof (lr_t)); 1955 1956 zil_itx_assign(zd->zd_zilog, itx, tx); 1957 } 1958 1959 static void 1960 ztest_log_remove(ztest_ds_t *zd, dmu_tx_t *tx, lr_remove_t *lr, uint64_t object) 1961 { 1962 char *name = (void *)(lr + 1); /* name follows lr */ 1963 size_t namesize = strlen(name) + 1; 1964 itx_t *itx; 1965 1966 if (zil_replaying(zd->zd_zilog, tx)) 1967 return; 1968 1969 itx = zil_itx_create(TX_REMOVE, sizeof (*lr) + namesize); 1970 memcpy(&itx->itx_lr + 1, &lr->lr_common + 1, 1971 sizeof (*lr) + namesize - sizeof (lr_t)); 1972 1973 itx->itx_oid = object; 1974 zil_itx_assign(zd->zd_zilog, itx, tx); 1975 } 1976 1977 static void 1978 ztest_log_write(ztest_ds_t *zd, dmu_tx_t *tx, lr_write_t *lr) 1979 { 1980 itx_t *itx; 1981 itx_wr_state_t write_state = ztest_random(WR_NUM_STATES); 1982 1983 if (zil_replaying(zd->zd_zilog, tx)) 1984 return; 1985 1986 if (lr->lr_length > zil_max_log_data(zd->zd_zilog, sizeof (lr_write_t))) 1987 write_state = WR_INDIRECT; 1988 1989 itx = zil_itx_create(TX_WRITE, 1990 sizeof (*lr) + (write_state == WR_COPIED ? 
lr->lr_length : 0)); 1991 1992 if (write_state == WR_COPIED && 1993 dmu_read(zd->zd_os, lr->lr_foid, lr->lr_offset, lr->lr_length, 1994 ((lr_write_t *)&itx->itx_lr) + 1, DMU_READ_NO_PREFETCH) != 0) { 1995 zil_itx_destroy(itx); 1996 itx = zil_itx_create(TX_WRITE, sizeof (*lr)); 1997 write_state = WR_NEED_COPY; 1998 } 1999 itx->itx_private = zd; 2000 itx->itx_wr_state = write_state; 2001 itx->itx_sync = (ztest_random(8) == 0); 2002 2003 memcpy(&itx->itx_lr + 1, &lr->lr_common + 1, 2004 sizeof (*lr) - sizeof (lr_t)); 2005 2006 zil_itx_assign(zd->zd_zilog, itx, tx); 2007 } 2008 2009 static void 2010 ztest_log_truncate(ztest_ds_t *zd, dmu_tx_t *tx, lr_truncate_t *lr) 2011 { 2012 itx_t *itx; 2013 2014 if (zil_replaying(zd->zd_zilog, tx)) 2015 return; 2016 2017 itx = zil_itx_create(TX_TRUNCATE, sizeof (*lr)); 2018 memcpy(&itx->itx_lr + 1, &lr->lr_common + 1, 2019 sizeof (*lr) - sizeof (lr_t)); 2020 2021 itx->itx_sync = B_FALSE; 2022 zil_itx_assign(zd->zd_zilog, itx, tx); 2023 } 2024 2025 static void 2026 ztest_log_setattr(ztest_ds_t *zd, dmu_tx_t *tx, lr_setattr_t *lr) 2027 { 2028 itx_t *itx; 2029 2030 if (zil_replaying(zd->zd_zilog, tx)) 2031 return; 2032 2033 itx = zil_itx_create(TX_SETATTR, sizeof (*lr)); 2034 memcpy(&itx->itx_lr + 1, &lr->lr_common + 1, 2035 sizeof (*lr) - sizeof (lr_t)); 2036 2037 itx->itx_sync = B_FALSE; 2038 zil_itx_assign(zd->zd_zilog, itx, tx); 2039 } 2040 2041 /* 2042 * ZIL replay ops 2043 */ 2044 static int 2045 ztest_replay_create(void *arg1, void *arg2, boolean_t byteswap) 2046 { 2047 ztest_ds_t *zd = arg1; 2048 lr_create_t *lr = arg2; 2049 char *name = (void *)(lr + 1); /* name follows lr */ 2050 objset_t *os = zd->zd_os; 2051 ztest_block_tag_t *bbt; 2052 dmu_buf_t *db; 2053 dmu_tx_t *tx; 2054 uint64_t txg; 2055 int error = 0; 2056 int bonuslen; 2057 2058 if (byteswap) 2059 byteswap_uint64_array(lr, sizeof (*lr)); 2060 2061 ASSERT3U(lr->lr_doid, ==, ZTEST_DIROBJ); 2062 ASSERT3S(name[0], !=, '\0'); 2063 2064 tx = dmu_tx_create(os); 2065 2066 dmu_tx_hold_zap(tx, lr->lr_doid, B_TRUE, name); 2067 2068 if (lr->lrz_type == DMU_OT_ZAP_OTHER) { 2069 dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL); 2070 } else { 2071 dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); 2072 } 2073 2074 txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); 2075 if (txg == 0) 2076 return (ENOSPC); 2077 2078 ASSERT3U(dmu_objset_zil(os)->zl_replay, ==, !!lr->lr_foid); 2079 bonuslen = DN_BONUS_SIZE(lr->lrz_dnodesize); 2080 2081 if (lr->lrz_type == DMU_OT_ZAP_OTHER) { 2082 if (lr->lr_foid == 0) { 2083 lr->lr_foid = zap_create_dnsize(os, 2084 lr->lrz_type, lr->lrz_bonustype, 2085 bonuslen, lr->lrz_dnodesize, tx); 2086 } else { 2087 error = zap_create_claim_dnsize(os, lr->lr_foid, 2088 lr->lrz_type, lr->lrz_bonustype, 2089 bonuslen, lr->lrz_dnodesize, tx); 2090 } 2091 } else { 2092 if (lr->lr_foid == 0) { 2093 lr->lr_foid = dmu_object_alloc_dnsize(os, 2094 lr->lrz_type, 0, lr->lrz_bonustype, 2095 bonuslen, lr->lrz_dnodesize, tx); 2096 } else { 2097 error = dmu_object_claim_dnsize(os, lr->lr_foid, 2098 lr->lrz_type, 0, lr->lrz_bonustype, 2099 bonuslen, lr->lrz_dnodesize, tx); 2100 } 2101 } 2102 2103 if (error) { 2104 ASSERT3U(error, ==, EEXIST); 2105 ASSERT(zd->zd_zilog->zl_replay); 2106 dmu_tx_commit(tx); 2107 return (error); 2108 } 2109 2110 ASSERT3U(lr->lr_foid, !=, 0); 2111 2112 if (lr->lrz_type != DMU_OT_ZAP_OTHER) 2113 VERIFY0(dmu_object_set_blocksize(os, lr->lr_foid, 2114 lr->lrz_blocksize, lr->lrz_ibshift, tx)); 2115 2116 VERIFY0(dmu_bonus_hold(os, lr->lr_foid, FTAG, &db)); 2117 bbt = ztest_bt_bonus(db); 2118 
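/*
 * Stamp the new object's bonus buffer with a block tag (the creation txg
 * is used for both bt_txg and bt_crtxg) and fill the remaining bonus
 * space with the verifiable token pattern before linking the object into
 * the directory ZAP.
 */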
dmu_buf_will_dirty(db, tx); 2119 ztest_bt_generate(bbt, os, lr->lr_foid, lr->lrz_dnodesize, -1ULL, 2120 lr->lr_gen, txg, txg); 2121 ztest_fill_unused_bonus(db, bbt, lr->lr_foid, os, lr->lr_gen); 2122 dmu_buf_rele(db, FTAG); 2123 2124 VERIFY0(zap_add(os, lr->lr_doid, name, sizeof (uint64_t), 1, 2125 &lr->lr_foid, tx)); 2126 2127 (void) ztest_log_create(zd, tx, lr); 2128 2129 dmu_tx_commit(tx); 2130 2131 return (0); 2132 } 2133 2134 static int 2135 ztest_replay_remove(void *arg1, void *arg2, boolean_t byteswap) 2136 { 2137 ztest_ds_t *zd = arg1; 2138 lr_remove_t *lr = arg2; 2139 char *name = (void *)(lr + 1); /* name follows lr */ 2140 objset_t *os = zd->zd_os; 2141 dmu_object_info_t doi; 2142 dmu_tx_t *tx; 2143 uint64_t object, txg; 2144 2145 if (byteswap) 2146 byteswap_uint64_array(lr, sizeof (*lr)); 2147 2148 ASSERT3U(lr->lr_doid, ==, ZTEST_DIROBJ); 2149 ASSERT3S(name[0], !=, '\0'); 2150 2151 VERIFY0( 2152 zap_lookup(os, lr->lr_doid, name, sizeof (object), 1, &object)); 2153 ASSERT3U(object, !=, 0); 2154 2155 ztest_object_lock(zd, object, ZTRL_WRITER); 2156 2157 VERIFY0(dmu_object_info(os, object, &doi)); 2158 2159 tx = dmu_tx_create(os); 2160 2161 dmu_tx_hold_zap(tx, lr->lr_doid, B_FALSE, name); 2162 dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END); 2163 2164 txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); 2165 if (txg == 0) { 2166 ztest_object_unlock(zd, object); 2167 return (ENOSPC); 2168 } 2169 2170 if (doi.doi_type == DMU_OT_ZAP_OTHER) { 2171 VERIFY0(zap_destroy(os, object, tx)); 2172 } else { 2173 VERIFY0(dmu_object_free(os, object, tx)); 2174 } 2175 2176 VERIFY0(zap_remove(os, lr->lr_doid, name, tx)); 2177 2178 (void) ztest_log_remove(zd, tx, lr, object); 2179 2180 dmu_tx_commit(tx); 2181 2182 ztest_object_unlock(zd, object); 2183 2184 return (0); 2185 } 2186 2187 static int 2188 ztest_replay_write(void *arg1, void *arg2, boolean_t byteswap) 2189 { 2190 ztest_ds_t *zd = arg1; 2191 lr_write_t *lr = arg2; 2192 objset_t *os = zd->zd_os; 2193 void *data = lr + 1; /* data follows lr */ 2194 uint64_t offset, length; 2195 ztest_block_tag_t *bt = data; 2196 ztest_block_tag_t *bbt; 2197 uint64_t gen, txg, lrtxg, crtxg; 2198 dmu_object_info_t doi; 2199 dmu_tx_t *tx; 2200 dmu_buf_t *db; 2201 arc_buf_t *abuf = NULL; 2202 rl_t *rl; 2203 2204 if (byteswap) 2205 byteswap_uint64_array(lr, sizeof (*lr)); 2206 2207 offset = lr->lr_offset; 2208 length = lr->lr_length; 2209 2210 /* If it's a dmu_sync() block, write the whole block */ 2211 if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) { 2212 uint64_t blocksize = BP_GET_LSIZE(&lr->lr_blkptr); 2213 if (length < blocksize) { 2214 offset -= offset % blocksize; 2215 length = blocksize; 2216 } 2217 } 2218 2219 if (bt->bt_magic == BSWAP_64(BT_MAGIC)) 2220 byteswap_uint64_array(bt, sizeof (*bt)); 2221 2222 if (bt->bt_magic != BT_MAGIC) 2223 bt = NULL; 2224 2225 ztest_object_lock(zd, lr->lr_foid, ZTRL_READER); 2226 rl = ztest_range_lock(zd, lr->lr_foid, offset, length, ZTRL_WRITER); 2227 2228 VERIFY0(dmu_bonus_hold(os, lr->lr_foid, FTAG, &db)); 2229 2230 dmu_object_info_from_db(db, &doi); 2231 2232 bbt = ztest_bt_bonus(db); 2233 ASSERT3U(bbt->bt_magic, ==, BT_MAGIC); 2234 gen = bbt->bt_gen; 2235 crtxg = bbt->bt_crtxg; 2236 lrtxg = lr->lr_common.lrc_txg; 2237 2238 tx = dmu_tx_create(os); 2239 2240 dmu_tx_hold_write(tx, lr->lr_foid, offset, length); 2241 2242 if (ztest_random(8) == 0 && length == doi.doi_data_block_size && 2243 P2PHASE(offset, length) == 0) 2244 abuf = dmu_request_arcbuf(db, length); 2245 2246 txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); 2247 if 
(txg == 0) { 2248 if (abuf != NULL) 2249 dmu_return_arcbuf(abuf); 2250 dmu_buf_rele(db, FTAG); 2251 ztest_range_unlock(rl); 2252 ztest_object_unlock(zd, lr->lr_foid); 2253 return (ENOSPC); 2254 } 2255 2256 if (bt != NULL) { 2257 /* 2258 * Usually, verify the old data before writing new data -- 2259 * but not always, because we also want to verify correct 2260 * behavior when the data was not recently read into cache. 2261 */ 2262 ASSERT(doi.doi_data_block_size); 2263 ASSERT0(offset % doi.doi_data_block_size); 2264 if (ztest_random(4) != 0) { 2265 int prefetch = ztest_random(2) ? 2266 DMU_READ_PREFETCH : DMU_READ_NO_PREFETCH; 2267 ztest_block_tag_t rbt; 2268 2269 VERIFY(dmu_read(os, lr->lr_foid, offset, 2270 sizeof (rbt), &rbt, prefetch) == 0); 2271 if (rbt.bt_magic == BT_MAGIC) { 2272 ztest_bt_verify(&rbt, os, lr->lr_foid, 0, 2273 offset, gen, txg, crtxg); 2274 } 2275 } 2276 2277 /* 2278 * Writes can appear to be newer than the bonus buffer because 2279 * the ztest_get_data() callback does a dmu_read() of the 2280 * open-context data, which may be different than the data 2281 * as it was when the write was generated. 2282 */ 2283 if (zd->zd_zilog->zl_replay) { 2284 ztest_bt_verify(bt, os, lr->lr_foid, 0, offset, 2285 MAX(gen, bt->bt_gen), MAX(txg, lrtxg), 2286 bt->bt_crtxg); 2287 } 2288 2289 /* 2290 * Set the bt's gen/txg to the bonus buffer's gen/txg 2291 * so that all of the usual ASSERTs will work. 2292 */ 2293 ztest_bt_generate(bt, os, lr->lr_foid, 0, offset, gen, txg, 2294 crtxg); 2295 } 2296 2297 if (abuf == NULL) { 2298 dmu_write(os, lr->lr_foid, offset, length, data, tx); 2299 } else { 2300 memcpy(abuf->b_data, data, length); 2301 VERIFY0(dmu_assign_arcbuf_by_dbuf(db, offset, abuf, tx)); 2302 } 2303 2304 (void) ztest_log_write(zd, tx, lr); 2305 2306 dmu_buf_rele(db, FTAG); 2307 2308 dmu_tx_commit(tx); 2309 2310 ztest_range_unlock(rl); 2311 ztest_object_unlock(zd, lr->lr_foid); 2312 2313 return (0); 2314 } 2315 2316 static int 2317 ztest_replay_truncate(void *arg1, void *arg2, boolean_t byteswap) 2318 { 2319 ztest_ds_t *zd = arg1; 2320 lr_truncate_t *lr = arg2; 2321 objset_t *os = zd->zd_os; 2322 dmu_tx_t *tx; 2323 uint64_t txg; 2324 rl_t *rl; 2325 2326 if (byteswap) 2327 byteswap_uint64_array(lr, sizeof (*lr)); 2328 2329 ztest_object_lock(zd, lr->lr_foid, ZTRL_READER); 2330 rl = ztest_range_lock(zd, lr->lr_foid, lr->lr_offset, lr->lr_length, 2331 ZTRL_WRITER); 2332 2333 tx = dmu_tx_create(os); 2334 2335 dmu_tx_hold_free(tx, lr->lr_foid, lr->lr_offset, lr->lr_length); 2336 2337 txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); 2338 if (txg == 0) { 2339 ztest_range_unlock(rl); 2340 ztest_object_unlock(zd, lr->lr_foid); 2341 return (ENOSPC); 2342 } 2343 2344 VERIFY0(dmu_free_range(os, lr->lr_foid, lr->lr_offset, 2345 lr->lr_length, tx)); 2346 2347 (void) ztest_log_truncate(zd, tx, lr); 2348 2349 dmu_tx_commit(tx); 2350 2351 ztest_range_unlock(rl); 2352 ztest_object_unlock(zd, lr->lr_foid); 2353 2354 return (0); 2355 } 2356 2357 static int 2358 ztest_replay_setattr(void *arg1, void *arg2, boolean_t byteswap) 2359 { 2360 ztest_ds_t *zd = arg1; 2361 lr_setattr_t *lr = arg2; 2362 objset_t *os = zd->zd_os; 2363 dmu_tx_t *tx; 2364 dmu_buf_t *db; 2365 ztest_block_tag_t *bbt; 2366 uint64_t txg, lrtxg, crtxg, dnodesize; 2367 2368 if (byteswap) 2369 byteswap_uint64_array(lr, sizeof (*lr)); 2370 2371 ztest_object_lock(zd, lr->lr_foid, ZTRL_WRITER); 2372 2373 VERIFY0(dmu_bonus_hold(os, lr->lr_foid, FTAG, &db)); 2374 2375 tx = dmu_tx_create(os); 2376 dmu_tx_hold_bonus(tx, lr->lr_foid); 2377 2378 txg = 
ztest_tx_assign(tx, TXG_WAIT, FTAG); 2379 if (txg == 0) { 2380 dmu_buf_rele(db, FTAG); 2381 ztest_object_unlock(zd, lr->lr_foid); 2382 return (ENOSPC); 2383 } 2384 2385 bbt = ztest_bt_bonus(db); 2386 ASSERT3U(bbt->bt_magic, ==, BT_MAGIC); 2387 crtxg = bbt->bt_crtxg; 2388 lrtxg = lr->lr_common.lrc_txg; 2389 dnodesize = bbt->bt_dnodesize; 2390 2391 if (zd->zd_zilog->zl_replay) { 2392 ASSERT3U(lr->lr_size, !=, 0); 2393 ASSERT3U(lr->lr_mode, !=, 0); 2394 ASSERT3U(lrtxg, !=, 0); 2395 } else { 2396 /* 2397 * Randomly change the size and increment the generation. 2398 */ 2399 lr->lr_size = (ztest_random(db->db_size / sizeof (*bbt)) + 1) * 2400 sizeof (*bbt); 2401 lr->lr_mode = bbt->bt_gen + 1; 2402 ASSERT0(lrtxg); 2403 } 2404 2405 /* 2406 * Verify that the current bonus buffer is not newer than our txg. 2407 */ 2408 ztest_bt_verify(bbt, os, lr->lr_foid, dnodesize, -1ULL, lr->lr_mode, 2409 MAX(txg, lrtxg), crtxg); 2410 2411 dmu_buf_will_dirty(db, tx); 2412 2413 ASSERT3U(lr->lr_size, >=, sizeof (*bbt)); 2414 ASSERT3U(lr->lr_size, <=, db->db_size); 2415 VERIFY0(dmu_set_bonus(db, lr->lr_size, tx)); 2416 bbt = ztest_bt_bonus(db); 2417 2418 ztest_bt_generate(bbt, os, lr->lr_foid, dnodesize, -1ULL, lr->lr_mode, 2419 txg, crtxg); 2420 ztest_fill_unused_bonus(db, bbt, lr->lr_foid, os, bbt->bt_gen); 2421 dmu_buf_rele(db, FTAG); 2422 2423 (void) ztest_log_setattr(zd, tx, lr); 2424 2425 dmu_tx_commit(tx); 2426 2427 ztest_object_unlock(zd, lr->lr_foid); 2428 2429 return (0); 2430 } 2431 2432 static zil_replay_func_t *ztest_replay_vector[TX_MAX_TYPE] = { 2433 NULL, /* 0 no such transaction type */ 2434 ztest_replay_create, /* TX_CREATE */ 2435 NULL, /* TX_MKDIR */ 2436 NULL, /* TX_MKXATTR */ 2437 NULL, /* TX_SYMLINK */ 2438 ztest_replay_remove, /* TX_REMOVE */ 2439 NULL, /* TX_RMDIR */ 2440 NULL, /* TX_LINK */ 2441 NULL, /* TX_RENAME */ 2442 ztest_replay_write, /* TX_WRITE */ 2443 ztest_replay_truncate, /* TX_TRUNCATE */ 2444 ztest_replay_setattr, /* TX_SETATTR */ 2445 NULL, /* TX_ACL */ 2446 NULL, /* TX_CREATE_ACL */ 2447 NULL, /* TX_CREATE_ATTR */ 2448 NULL, /* TX_CREATE_ACL_ATTR */ 2449 NULL, /* TX_MKDIR_ACL */ 2450 NULL, /* TX_MKDIR_ATTR */ 2451 NULL, /* TX_MKDIR_ACL_ATTR */ 2452 NULL, /* TX_WRITE2 */ 2453 NULL, /* TX_SETSAXATTR */ 2454 NULL, /* TX_RENAME_EXCHANGE */ 2455 NULL, /* TX_RENAME_WHITEOUT */ 2456 }; 2457 2458 /* 2459 * ZIL get_data callbacks 2460 */ 2461 2462 static void 2463 ztest_get_done(zgd_t *zgd, int error) 2464 { 2465 (void) error; 2466 ztest_ds_t *zd = zgd->zgd_private; 2467 uint64_t object = ((rl_t *)zgd->zgd_lr)->rl_object; 2468 2469 if (zgd->zgd_db) 2470 dmu_buf_rele(zgd->zgd_db, zgd); 2471 2472 ztest_range_unlock((rl_t *)zgd->zgd_lr); 2473 ztest_object_unlock(zd, object); 2474 2475 umem_free(zgd, sizeof (*zgd)); 2476 } 2477 2478 static int 2479 ztest_get_data(void *arg, uint64_t arg2, lr_write_t *lr, char *buf, 2480 struct lwb *lwb, zio_t *zio) 2481 { 2482 (void) arg2; 2483 ztest_ds_t *zd = arg; 2484 objset_t *os = zd->zd_os; 2485 uint64_t object = lr->lr_foid; 2486 uint64_t offset = lr->lr_offset; 2487 uint64_t size = lr->lr_length; 2488 uint64_t txg = lr->lr_common.lrc_txg; 2489 uint64_t crtxg; 2490 dmu_object_info_t doi; 2491 dmu_buf_t *db; 2492 zgd_t *zgd; 2493 int error; 2494 2495 ASSERT3P(lwb, !=, NULL); 2496 ASSERT3U(size, !=, 0); 2497 2498 ztest_object_lock(zd, object, ZTRL_READER); 2499 error = dmu_bonus_hold(os, object, FTAG, &db); 2500 if (error) { 2501 ztest_object_unlock(zd, object); 2502 return (error); 2503 } 2504 2505 crtxg = ztest_bt_bonus(db)->bt_crtxg; 2506 2507 if 
(crtxg == 0 || crtxg > txg) { 2508 dmu_buf_rele(db, FTAG); 2509 ztest_object_unlock(zd, object); 2510 return (ENOENT); 2511 } 2512 2513 dmu_object_info_from_db(db, &doi); 2514 dmu_buf_rele(db, FTAG); 2515 db = NULL; 2516 2517 zgd = umem_zalloc(sizeof (*zgd), UMEM_NOFAIL); 2518 zgd->zgd_lwb = lwb; 2519 zgd->zgd_private = zd; 2520 2521 if (buf != NULL) { /* immediate write */ 2522 zgd->zgd_lr = (struct zfs_locked_range *)ztest_range_lock(zd, 2523 object, offset, size, ZTRL_READER); 2524 2525 error = dmu_read(os, object, offset, size, buf, 2526 DMU_READ_NO_PREFETCH); 2527 ASSERT0(error); 2528 } else { 2529 ASSERT3P(zio, !=, NULL); 2530 size = doi.doi_data_block_size; 2531 if (ISP2(size)) { 2532 offset = P2ALIGN(offset, size); 2533 } else { 2534 ASSERT3U(offset, <, size); 2535 offset = 0; 2536 } 2537 2538 zgd->zgd_lr = (struct zfs_locked_range *)ztest_range_lock(zd, 2539 object, offset, size, ZTRL_READER); 2540 2541 error = dmu_buf_hold_noread(os, object, offset, zgd, &db); 2542 2543 if (error == 0) { 2544 blkptr_t *bp = &lr->lr_blkptr; 2545 2546 zgd->zgd_db = db; 2547 zgd->zgd_bp = bp; 2548 2549 ASSERT3U(db->db_offset, ==, offset); 2550 ASSERT3U(db->db_size, ==, size); 2551 2552 error = dmu_sync(zio, lr->lr_common.lrc_txg, 2553 ztest_get_done, zgd); 2554 2555 if (error == 0) 2556 return (0); 2557 } 2558 } 2559 2560 ztest_get_done(zgd, error); 2561 2562 return (error); 2563 } 2564 2565 static void * 2566 ztest_lr_alloc(size_t lrsize, char *name) 2567 { 2568 char *lr; 2569 size_t namesize = name ? strlen(name) + 1 : 0; 2570 2571 lr = umem_zalloc(lrsize + namesize, UMEM_NOFAIL); 2572 2573 if (name) 2574 memcpy(lr + lrsize, name, namesize); 2575 2576 return (lr); 2577 } 2578 2579 static void 2580 ztest_lr_free(void *lr, size_t lrsize, char *name) 2581 { 2582 size_t namesize = name ? strlen(name) + 1 : 0; 2583 2584 umem_free(lr, lrsize + namesize); 2585 } 2586 2587 /* 2588 * Lookup a bunch of objects. Returns the number of objects not found. 
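 * The objects that do exist must form a prefix of the od array; a gap
 * (a missing object followed by an existing one) trips the no-gaps
 * assertion below.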
2589 */ 2590 static int 2591 ztest_lookup(ztest_ds_t *zd, ztest_od_t *od, int count) 2592 { 2593 int missing = 0; 2594 int error; 2595 int i; 2596 2597 ASSERT(MUTEX_HELD(&zd->zd_dirobj_lock)); 2598 2599 for (i = 0; i < count; i++, od++) { 2600 od->od_object = 0; 2601 error = zap_lookup(zd->zd_os, od->od_dir, od->od_name, 2602 sizeof (uint64_t), 1, &od->od_object); 2603 if (error) { 2604 ASSERT3S(error, ==, ENOENT); 2605 ASSERT0(od->od_object); 2606 missing++; 2607 } else { 2608 dmu_buf_t *db; 2609 ztest_block_tag_t *bbt; 2610 dmu_object_info_t doi; 2611 2612 ASSERT3U(od->od_object, !=, 0); 2613 ASSERT0(missing); /* there should be no gaps */ 2614 2615 ztest_object_lock(zd, od->od_object, ZTRL_READER); 2616 VERIFY0(dmu_bonus_hold(zd->zd_os, od->od_object, 2617 FTAG, &db)); 2618 dmu_object_info_from_db(db, &doi); 2619 bbt = ztest_bt_bonus(db); 2620 ASSERT3U(bbt->bt_magic, ==, BT_MAGIC); 2621 od->od_type = doi.doi_type; 2622 od->od_blocksize = doi.doi_data_block_size; 2623 od->od_gen = bbt->bt_gen; 2624 dmu_buf_rele(db, FTAG); 2625 ztest_object_unlock(zd, od->od_object); 2626 } 2627 } 2628 2629 return (missing); 2630 } 2631 2632 static int 2633 ztest_create(ztest_ds_t *zd, ztest_od_t *od, int count) 2634 { 2635 int missing = 0; 2636 int i; 2637 2638 ASSERT(MUTEX_HELD(&zd->zd_dirobj_lock)); 2639 2640 for (i = 0; i < count; i++, od++) { 2641 if (missing) { 2642 od->od_object = 0; 2643 missing++; 2644 continue; 2645 } 2646 2647 lr_create_t *lr = ztest_lr_alloc(sizeof (*lr), od->od_name); 2648 2649 lr->lr_doid = od->od_dir; 2650 lr->lr_foid = 0; /* 0 to allocate, > 0 to claim */ 2651 lr->lrz_type = od->od_crtype; 2652 lr->lrz_blocksize = od->od_crblocksize; 2653 lr->lrz_ibshift = ztest_random_ibshift(); 2654 lr->lrz_bonustype = DMU_OT_UINT64_OTHER; 2655 lr->lrz_dnodesize = od->od_crdnodesize; 2656 lr->lr_gen = od->od_crgen; 2657 lr->lr_crtime[0] = time(NULL); 2658 2659 if (ztest_replay_create(zd, lr, B_FALSE) != 0) { 2660 ASSERT0(missing); 2661 od->od_object = 0; 2662 missing++; 2663 } else { 2664 od->od_object = lr->lr_foid; 2665 od->od_type = od->od_crtype; 2666 od->od_blocksize = od->od_crblocksize; 2667 od->od_gen = od->od_crgen; 2668 ASSERT3U(od->od_object, !=, 0); 2669 } 2670 2671 ztest_lr_free(lr, sizeof (*lr), od->od_name); 2672 } 2673 2674 return (missing); 2675 } 2676 2677 static int 2678 ztest_remove(ztest_ds_t *zd, ztest_od_t *od, int count) 2679 { 2680 int missing = 0; 2681 int error; 2682 int i; 2683 2684 ASSERT(MUTEX_HELD(&zd->zd_dirobj_lock)); 2685 2686 od += count - 1; 2687 2688 for (i = count - 1; i >= 0; i--, od--) { 2689 if (missing) { 2690 missing++; 2691 continue; 2692 } 2693 2694 /* 2695 * No object was found. 
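 * (it was never created, or an earlier pass already removed it), so
 * there is nothing to free; skip this entry.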
2696 */ 2697 if (od->od_object == 0) 2698 continue; 2699 2700 lr_remove_t *lr = ztest_lr_alloc(sizeof (*lr), od->od_name); 2701 2702 lr->lr_doid = od->od_dir; 2703 2704 if ((error = ztest_replay_remove(zd, lr, B_FALSE)) != 0) { 2705 ASSERT3U(error, ==, ENOSPC); 2706 missing++; 2707 } else { 2708 od->od_object = 0; 2709 } 2710 ztest_lr_free(lr, sizeof (*lr), od->od_name); 2711 } 2712 2713 return (missing); 2714 } 2715 2716 static int 2717 ztest_write(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size, 2718 const void *data) 2719 { 2720 lr_write_t *lr; 2721 int error; 2722 2723 lr = ztest_lr_alloc(sizeof (*lr) + size, NULL); 2724 2725 lr->lr_foid = object; 2726 lr->lr_offset = offset; 2727 lr->lr_length = size; 2728 lr->lr_blkoff = 0; 2729 BP_ZERO(&lr->lr_blkptr); 2730 2731 memcpy(lr + 1, data, size); 2732 2733 error = ztest_replay_write(zd, lr, B_FALSE); 2734 2735 ztest_lr_free(lr, sizeof (*lr) + size, NULL); 2736 2737 return (error); 2738 } 2739 2740 static int 2741 ztest_truncate(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size) 2742 { 2743 lr_truncate_t *lr; 2744 int error; 2745 2746 lr = ztest_lr_alloc(sizeof (*lr), NULL); 2747 2748 lr->lr_foid = object; 2749 lr->lr_offset = offset; 2750 lr->lr_length = size; 2751 2752 error = ztest_replay_truncate(zd, lr, B_FALSE); 2753 2754 ztest_lr_free(lr, sizeof (*lr), NULL); 2755 2756 return (error); 2757 } 2758 2759 static int 2760 ztest_setattr(ztest_ds_t *zd, uint64_t object) 2761 { 2762 lr_setattr_t *lr; 2763 int error; 2764 2765 lr = ztest_lr_alloc(sizeof (*lr), NULL); 2766 2767 lr->lr_foid = object; 2768 lr->lr_size = 0; 2769 lr->lr_mode = 0; 2770 2771 error = ztest_replay_setattr(zd, lr, B_FALSE); 2772 2773 ztest_lr_free(lr, sizeof (*lr), NULL); 2774 2775 return (error); 2776 } 2777 2778 static void 2779 ztest_prealloc(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size) 2780 { 2781 objset_t *os = zd->zd_os; 2782 dmu_tx_t *tx; 2783 uint64_t txg; 2784 rl_t *rl; 2785 2786 txg_wait_synced(dmu_objset_pool(os), 0); 2787 2788 ztest_object_lock(zd, object, ZTRL_READER); 2789 rl = ztest_range_lock(zd, object, offset, size, ZTRL_WRITER); 2790 2791 tx = dmu_tx_create(os); 2792 2793 dmu_tx_hold_write(tx, object, offset, size); 2794 2795 txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); 2796 2797 if (txg != 0) { 2798 dmu_prealloc(os, object, offset, size, tx); 2799 dmu_tx_commit(tx); 2800 txg_wait_synced(dmu_objset_pool(os), txg); 2801 } else { 2802 (void) dmu_free_long_range(os, object, offset, size); 2803 } 2804 2805 ztest_range_unlock(rl); 2806 ztest_object_unlock(zd, object); 2807 } 2808 2809 static void 2810 ztest_io(ztest_ds_t *zd, uint64_t object, uint64_t offset) 2811 { 2812 int err; 2813 ztest_block_tag_t wbt; 2814 dmu_object_info_t doi; 2815 enum ztest_io_type io_type; 2816 uint64_t blocksize; 2817 void *data; 2818 2819 VERIFY0(dmu_object_info(zd->zd_os, object, &doi)); 2820 blocksize = doi.doi_data_block_size; 2821 data = umem_alloc(blocksize, UMEM_NOFAIL); 2822 2823 /* 2824 * Pick an i/o type at random, biased toward writing block tags. 
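 * Half of the time the choice is overridden to ZTEST_IO_WRITE_TAG, so
 * block-tag writes occur with probability 1/2 + 1/(2 * ZTEST_IO_TYPES).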
2825 */ 2826 io_type = ztest_random(ZTEST_IO_TYPES); 2827 if (ztest_random(2) == 0) 2828 io_type = ZTEST_IO_WRITE_TAG; 2829 2830 (void) pthread_rwlock_rdlock(&zd->zd_zilog_lock); 2831 2832 switch (io_type) { 2833 2834 case ZTEST_IO_WRITE_TAG: 2835 ztest_bt_generate(&wbt, zd->zd_os, object, doi.doi_dnodesize, 2836 offset, 0, 0, 0); 2837 (void) ztest_write(zd, object, offset, sizeof (wbt), &wbt); 2838 break; 2839 2840 case ZTEST_IO_WRITE_PATTERN: 2841 (void) memset(data, 'a' + (object + offset) % 5, blocksize); 2842 if (ztest_random(2) == 0) { 2843 /* 2844 * Induce fletcher2 collisions to ensure that 2845 * zio_ddt_collision() detects and resolves them 2846 * when using fletcher2-verify for deduplication. 2847 */ 2848 ((uint64_t *)data)[0] ^= 1ULL << 63; 2849 ((uint64_t *)data)[4] ^= 1ULL << 63; 2850 } 2851 (void) ztest_write(zd, object, offset, blocksize, data); 2852 break; 2853 2854 case ZTEST_IO_WRITE_ZEROES: 2855 memset(data, 0, blocksize); 2856 (void) ztest_write(zd, object, offset, blocksize, data); 2857 break; 2858 2859 case ZTEST_IO_TRUNCATE: 2860 (void) ztest_truncate(zd, object, offset, blocksize); 2861 break; 2862 2863 case ZTEST_IO_SETATTR: 2864 (void) ztest_setattr(zd, object); 2865 break; 2866 default: 2867 break; 2868 2869 case ZTEST_IO_REWRITE: 2870 (void) pthread_rwlock_rdlock(&ztest_name_lock); 2871 err = ztest_dsl_prop_set_uint64(zd->zd_name, 2872 ZFS_PROP_CHECKSUM, spa_dedup_checksum(ztest_spa), 2873 B_FALSE); 2874 ASSERT(err == 0 || err == ENOSPC); 2875 err = ztest_dsl_prop_set_uint64(zd->zd_name, 2876 ZFS_PROP_COMPRESSION, 2877 ztest_random_dsl_prop(ZFS_PROP_COMPRESSION), 2878 B_FALSE); 2879 ASSERT(err == 0 || err == ENOSPC); 2880 (void) pthread_rwlock_unlock(&ztest_name_lock); 2881 2882 VERIFY0(dmu_read(zd->zd_os, object, offset, blocksize, data, 2883 DMU_READ_NO_PREFETCH)); 2884 2885 (void) ztest_write(zd, object, offset, blocksize, data); 2886 break; 2887 } 2888 2889 (void) pthread_rwlock_unlock(&zd->zd_zilog_lock); 2890 2891 umem_free(data, blocksize); 2892 } 2893 2894 /* 2895 * Initialize an object description template. 2896 */ 2897 static void 2898 ztest_od_init(ztest_od_t *od, uint64_t id, const char *tag, uint64_t index, 2899 dmu_object_type_t type, uint64_t blocksize, uint64_t dnodesize, 2900 uint64_t gen) 2901 { 2902 od->od_dir = ZTEST_DIROBJ; 2903 od->od_object = 0; 2904 2905 od->od_crtype = type; 2906 od->od_crblocksize = blocksize ? blocksize : ztest_random_blocksize(); 2907 od->od_crdnodesize = dnodesize ? dnodesize : ztest_random_dnodesize(); 2908 od->od_crgen = gen; 2909 2910 od->od_type = DMU_OT_NONE; 2911 od->od_blocksize = 0; 2912 od->od_gen = 0; 2913 2914 (void) snprintf(od->od_name, sizeof (od->od_name), 2915 "%s(%"PRId64")[%"PRIu64"]", 2916 tag, id, index); 2917 } 2918 2919 /* 2920 * Lookup or create the objects for a test using the od template. 2921 * If the objects do not all exist, or if 'remove' is specified, 2922 * remove any existing objects and create new ones. Otherwise, 2923 * use the existing objects. 
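 * Returns 0 if the od array now describes usable objects, or -1 if the
 * objects could not be removed and recreated.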
2924 */ 2925 static int 2926 ztest_object_init(ztest_ds_t *zd, ztest_od_t *od, size_t size, boolean_t remove) 2927 { 2928 int count = size / sizeof (*od); 2929 int rv = 0; 2930 2931 mutex_enter(&zd->zd_dirobj_lock); 2932 if ((ztest_lookup(zd, od, count) != 0 || remove) && 2933 (ztest_remove(zd, od, count) != 0 || 2934 ztest_create(zd, od, count) != 0)) 2935 rv = -1; 2936 zd->zd_od = od; 2937 mutex_exit(&zd->zd_dirobj_lock); 2938 2939 return (rv); 2940 } 2941 2942 void 2943 ztest_zil_commit(ztest_ds_t *zd, uint64_t id) 2944 { 2945 (void) id; 2946 zilog_t *zilog = zd->zd_zilog; 2947 2948 (void) pthread_rwlock_rdlock(&zd->zd_zilog_lock); 2949 2950 zil_commit(zilog, ztest_random(ZTEST_OBJECTS)); 2951 2952 /* 2953 * Remember the committed values in zd, which is in parent/child 2954 * shared memory. If we die, the next iteration of ztest_run() 2955 * will verify that the log really does contain this record. 2956 */ 2957 mutex_enter(&zilog->zl_lock); 2958 ASSERT3P(zd->zd_shared, !=, NULL); 2959 ASSERT3U(zd->zd_shared->zd_seq, <=, zilog->zl_commit_lr_seq); 2960 zd->zd_shared->zd_seq = zilog->zl_commit_lr_seq; 2961 mutex_exit(&zilog->zl_lock); 2962 2963 (void) pthread_rwlock_unlock(&zd->zd_zilog_lock); 2964 } 2965 2966 /* 2967 * This function is designed to simulate the operations that occur during a 2968 * mount/unmount operation. We hold the dataset across these operations in an 2969 * attempt to expose any implicit assumptions about ZIL management. 2970 */ 2971 void 2972 ztest_zil_remount(ztest_ds_t *zd, uint64_t id) 2973 { 2974 (void) id; 2975 objset_t *os = zd->zd_os; 2976 2977 /* 2978 * We hold the ztest_vdev_lock so we don't cause problems with 2979 * other threads that wish to remove a log device, such as 2980 * ztest_device_removal(). 2981 */ 2982 mutex_enter(&ztest_vdev_lock); 2983 2984 /* 2985 * We grab the zd_dirobj_lock to ensure that no other thread is 2986 * updating the zil (i.e. adding in-memory log records) and the 2987 * zd_zilog_lock to block any I/O. 2988 */ 2989 mutex_enter(&zd->zd_dirobj_lock); 2990 (void) pthread_rwlock_wrlock(&zd->zd_zilog_lock); 2991 2992 /* zfsvfs_teardown() */ 2993 zil_close(zd->zd_zilog); 2994 2995 /* zfsvfs_setup() */ 2996 VERIFY3P(zil_open(os, ztest_get_data, NULL), ==, zd->zd_zilog); 2997 zil_replay(os, zd, ztest_replay_vector); 2998 2999 (void) pthread_rwlock_unlock(&zd->zd_zilog_lock); 3000 mutex_exit(&zd->zd_dirobj_lock); 3001 mutex_exit(&ztest_vdev_lock); 3002 } 3003 3004 /* 3005 * Verify that we can't destroy an active pool, create an existing pool, 3006 * or create a pool with a bad vdev spec. 3007 */ 3008 void 3009 ztest_spa_create_destroy(ztest_ds_t *zd, uint64_t id) 3010 { 3011 (void) zd, (void) id; 3012 ztest_shared_opts_t *zo = &ztest_opts; 3013 spa_t *spa; 3014 nvlist_t *nvroot; 3015 3016 if (zo->zo_mmp_test) 3017 return; 3018 3019 /* 3020 * Attempt to create using a bad file. 3021 */ 3022 nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, NULL, 0, 0, 1); 3023 VERIFY3U(ENOENT, ==, 3024 spa_create("ztest_bad_file", nvroot, NULL, NULL, NULL)); 3025 fnvlist_free(nvroot); 3026 3027 /* 3028 * Attempt to create using a bad mirror. 3029 */ 3030 nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, NULL, 0, 2, 1); 3031 VERIFY3U(ENOENT, ==, 3032 spa_create("ztest_bad_mirror", nvroot, NULL, NULL, NULL)); 3033 fnvlist_free(nvroot); 3034 3035 /* 3036 * Attempt to create an existing pool. It shouldn't matter 3037 * what's in the nvroot; we should fail with EEXIST. 
3038 */ 3039 (void) pthread_rwlock_rdlock(&ztest_name_lock); 3040 nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, NULL, 0, 0, 1); 3041 VERIFY3U(EEXIST, ==, 3042 spa_create(zo->zo_pool, nvroot, NULL, NULL, NULL)); 3043 fnvlist_free(nvroot); 3044 3045 /* 3046 * We open a reference to the spa and then we try to export it 3047 * expecting one of the following errors: 3048 * 3049 * EBUSY 3050 * Because of the reference we just opened. 3051 * 3052 * ZFS_ERR_EXPORT_IN_PROGRESS 3053 * For the case that there is another ztest thread doing 3054 * an export concurrently. 3055 */ 3056 VERIFY0(spa_open(zo->zo_pool, &spa, FTAG)); 3057 int error = spa_destroy(zo->zo_pool); 3058 if (error != EBUSY && error != ZFS_ERR_EXPORT_IN_PROGRESS) { 3059 fatal(B_FALSE, "spa_destroy(%s) returned unexpected value %d", 3060 spa->spa_name, error); 3061 } 3062 spa_close(spa, FTAG); 3063 3064 (void) pthread_rwlock_unlock(&ztest_name_lock); 3065 } 3066 3067 /* 3068 * Start and then stop the MMP threads to ensure the startup and shutdown code 3069 * works properly. Actual protection and property-related code tested via ZTS. 3070 */ 3071 void 3072 ztest_mmp_enable_disable(ztest_ds_t *zd, uint64_t id) 3073 { 3074 (void) zd, (void) id; 3075 ztest_shared_opts_t *zo = &ztest_opts; 3076 spa_t *spa = ztest_spa; 3077 3078 if (zo->zo_mmp_test) 3079 return; 3080 3081 /* 3082 * Since enabling MMP involves setting a property, it could not be done 3083 * while the pool is suspended. 3084 */ 3085 if (spa_suspended(spa)) 3086 return; 3087 3088 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 3089 mutex_enter(&spa->spa_props_lock); 3090 3091 zfs_multihost_fail_intervals = 0; 3092 3093 if (!spa_multihost(spa)) { 3094 spa->spa_multihost = B_TRUE; 3095 mmp_thread_start(spa); 3096 } 3097 3098 mutex_exit(&spa->spa_props_lock); 3099 spa_config_exit(spa, SCL_CONFIG, FTAG); 3100 3101 txg_wait_synced(spa_get_dsl(spa), 0); 3102 mmp_signal_all_threads(); 3103 txg_wait_synced(spa_get_dsl(spa), 0); 3104 3105 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 3106 mutex_enter(&spa->spa_props_lock); 3107 3108 if (spa_multihost(spa)) { 3109 mmp_thread_stop(spa); 3110 spa->spa_multihost = B_FALSE; 3111 } 3112 3113 mutex_exit(&spa->spa_props_lock); 3114 spa_config_exit(spa, SCL_CONFIG, FTAG); 3115 } 3116 3117 static int 3118 ztest_get_raidz_children(spa_t *spa) 3119 { 3120 (void) spa; 3121 vdev_t *raidvd; 3122 3123 ASSERT(MUTEX_HELD(&ztest_vdev_lock)); 3124 3125 if (ztest_opts.zo_raid_do_expand) { 3126 raidvd = ztest_spa->spa_root_vdev->vdev_child[0]; 3127 3128 ASSERT(raidvd->vdev_ops == &vdev_raidz_ops); 3129 3130 return (raidvd->vdev_children); 3131 } 3132 3133 return (ztest_opts.zo_raid_children); 3134 } 3135 3136 void 3137 ztest_spa_upgrade(ztest_ds_t *zd, uint64_t id) 3138 { 3139 (void) zd, (void) id; 3140 spa_t *spa; 3141 uint64_t initial_version = SPA_VERSION_INITIAL; 3142 uint64_t raidz_children, version, newversion; 3143 nvlist_t *nvroot, *props; 3144 char *name; 3145 3146 if (ztest_opts.zo_mmp_test) 3147 return; 3148 3149 /* dRAID added after feature flags, skip upgrade test. */ 3150 if (strcmp(ztest_opts.zo_raid_type, VDEV_TYPE_DRAID) == 0) 3151 return; 3152 3153 mutex_enter(&ztest_vdev_lock); 3154 name = kmem_asprintf("%s_upgrade", ztest_opts.zo_pool); 3155 3156 /* 3157 * Clean up from previous runs. 
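 * by destroying any leftover "<pool>_upgrade" pool from an earlier pass.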
3158 */ 3159 (void) spa_destroy(name); 3160 3161 raidz_children = ztest_get_raidz_children(ztest_spa); 3162 3163 nvroot = make_vdev_root(NULL, NULL, name, ztest_opts.zo_vdev_size, 0, 3164 NULL, raidz_children, ztest_opts.zo_mirrors, 1); 3165 3166 /* 3167 * If we're configuring a RAIDZ device then make sure that the 3168 * initial version is capable of supporting that feature. 3169 */ 3170 switch (ztest_opts.zo_raid_parity) { 3171 case 0: 3172 case 1: 3173 initial_version = SPA_VERSION_INITIAL; 3174 break; 3175 case 2: 3176 initial_version = SPA_VERSION_RAIDZ2; 3177 break; 3178 case 3: 3179 initial_version = SPA_VERSION_RAIDZ3; 3180 break; 3181 } 3182 3183 /* 3184 * Create a pool with a spa version that can be upgraded. Pick 3185 * a value between initial_version and SPA_VERSION_BEFORE_FEATURES. 3186 */ 3187 do { 3188 version = ztest_random_spa_version(initial_version); 3189 } while (version > SPA_VERSION_BEFORE_FEATURES); 3190 3191 props = fnvlist_alloc(); 3192 fnvlist_add_uint64(props, 3193 zpool_prop_to_name(ZPOOL_PROP_VERSION), version); 3194 VERIFY0(spa_create(name, nvroot, props, NULL, NULL)); 3195 fnvlist_free(nvroot); 3196 fnvlist_free(props); 3197 3198 VERIFY0(spa_open(name, &spa, FTAG)); 3199 VERIFY3U(spa_version(spa), ==, version); 3200 newversion = ztest_random_spa_version(version + 1); 3201 3202 if (ztest_opts.zo_verbose >= 4) { 3203 (void) printf("upgrading spa version from " 3204 "%"PRIu64" to %"PRIu64"\n", 3205 version, newversion); 3206 } 3207 3208 spa_upgrade(spa, newversion); 3209 VERIFY3U(spa_version(spa), >, version); 3210 VERIFY3U(spa_version(spa), ==, fnvlist_lookup_uint64(spa->spa_config, 3211 zpool_prop_to_name(ZPOOL_PROP_VERSION))); 3212 spa_close(spa, FTAG); 3213 3214 kmem_strfree(name); 3215 mutex_exit(&ztest_vdev_lock); 3216 } 3217 3218 static void 3219 ztest_spa_checkpoint(spa_t *spa) 3220 { 3221 ASSERT(MUTEX_HELD(&ztest_checkpoint_lock)); 3222 3223 int error = spa_checkpoint(spa->spa_name); 3224 3225 switch (error) { 3226 case 0: 3227 case ZFS_ERR_DEVRM_IN_PROGRESS: 3228 case ZFS_ERR_DISCARDING_CHECKPOINT: 3229 case ZFS_ERR_CHECKPOINT_EXISTS: 3230 case ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS: 3231 break; 3232 case ENOSPC: 3233 ztest_record_enospc(FTAG); 3234 break; 3235 default: 3236 fatal(B_FALSE, "spa_checkpoint(%s) = %d", spa->spa_name, error); 3237 } 3238 } 3239 3240 static void 3241 ztest_spa_discard_checkpoint(spa_t *spa) 3242 { 3243 ASSERT(MUTEX_HELD(&ztest_checkpoint_lock)); 3244 3245 int error = spa_checkpoint_discard(spa->spa_name); 3246 3247 switch (error) { 3248 case 0: 3249 case ZFS_ERR_DISCARDING_CHECKPOINT: 3250 case ZFS_ERR_NO_CHECKPOINT: 3251 break; 3252 default: 3253 fatal(B_FALSE, "spa_discard_checkpoint(%s) = %d", 3254 spa->spa_name, error); 3255 } 3256 3257 } 3258 3259 void 3260 ztest_spa_checkpoint_create_discard(ztest_ds_t *zd, uint64_t id) 3261 { 3262 (void) zd, (void) id; 3263 spa_t *spa = ztest_spa; 3264 3265 mutex_enter(&ztest_checkpoint_lock); 3266 if (ztest_random(2) == 0) { 3267 ztest_spa_checkpoint(spa); 3268 } else { 3269 ztest_spa_discard_checkpoint(spa); 3270 } 3271 mutex_exit(&ztest_checkpoint_lock); 3272 } 3273 3274 3275 static vdev_t * 3276 vdev_lookup_by_path(vdev_t *vd, const char *path) 3277 { 3278 vdev_t *mvd; 3279 int c; 3280 3281 if (vd->vdev_path != NULL && strcmp(path, vd->vdev_path) == 0) 3282 return (vd); 3283 3284 for (c = 0; c < vd->vdev_children; c++) 3285 if ((mvd = vdev_lookup_by_path(vd->vdev_child[c], path)) != 3286 NULL) 3287 return (mvd); 3288 3289 return (NULL); 3290 } 3291 3292 static int 3293 
spa_num_top_vdevs(spa_t *spa) 3294 { 3295 vdev_t *rvd = spa->spa_root_vdev; 3296 ASSERT3U(spa_config_held(spa, SCL_VDEV, RW_READER), ==, SCL_VDEV); 3297 return (rvd->vdev_children); 3298 } 3299 3300 /* 3301 * Verify that vdev_add() works as expected. 3302 */ 3303 void 3304 ztest_vdev_add_remove(ztest_ds_t *zd, uint64_t id) 3305 { 3306 (void) zd, (void) id; 3307 ztest_shared_t *zs = ztest_shared; 3308 spa_t *spa = ztest_spa; 3309 uint64_t leaves; 3310 uint64_t guid; 3311 uint64_t raidz_children; 3312 3313 nvlist_t *nvroot; 3314 int error; 3315 3316 if (ztest_opts.zo_mmp_test) 3317 return; 3318 3319 mutex_enter(&ztest_vdev_lock); 3320 raidz_children = ztest_get_raidz_children(spa); 3321 leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * raidz_children; 3322 3323 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 3324 3325 ztest_shared->zs_vdev_next_leaf = spa_num_top_vdevs(spa) * leaves; 3326 3327 /* 3328 * If we have slogs then remove them 1/4 of the time. 3329 */ 3330 if (spa_has_slogs(spa) && ztest_random(4) == 0) { 3331 metaslab_group_t *mg; 3332 3333 /* 3334 * find the first real slog in log allocation class 3335 */ 3336 mg = spa_log_class(spa)->mc_allocator[0].mca_rotor; 3337 while (!mg->mg_vd->vdev_islog) 3338 mg = mg->mg_next; 3339 3340 guid = mg->mg_vd->vdev_guid; 3341 3342 spa_config_exit(spa, SCL_VDEV, FTAG); 3343 3344 /* 3345 * We have to grab the zs_name_lock as writer to 3346 * prevent a race between removing a slog (dmu_objset_find) 3347 * and destroying a dataset. Removing the slog will 3348 * grab a reference on the dataset which may cause 3349 * dsl_destroy_head() to fail with EBUSY thus 3350 * leaving the dataset in an inconsistent state. 3351 */ 3352 pthread_rwlock_wrlock(&ztest_name_lock); 3353 error = spa_vdev_remove(spa, guid, B_FALSE); 3354 pthread_rwlock_unlock(&ztest_name_lock); 3355 3356 switch (error) { 3357 case 0: 3358 case EEXIST: /* Generic zil_reset() error */ 3359 case EBUSY: /* Replay required */ 3360 case EACCES: /* Crypto key not loaded */ 3361 case ZFS_ERR_CHECKPOINT_EXISTS: 3362 case ZFS_ERR_DISCARDING_CHECKPOINT: 3363 break; 3364 default: 3365 fatal(B_FALSE, "spa_vdev_remove() = %d", error); 3366 } 3367 } else { 3368 spa_config_exit(spa, SCL_VDEV, FTAG); 3369 3370 /* 3371 * Make 1/4 of the devices be log devices 3372 */ 3373 nvroot = make_vdev_root(NULL, NULL, NULL, 3374 ztest_opts.zo_vdev_size, 0, (ztest_random(4) == 0) ? 3375 "log" : NULL, raidz_children, zs->zs_mirrors, 3376 1); 3377 3378 error = spa_vdev_add(spa, nvroot); 3379 fnvlist_free(nvroot); 3380 3381 switch (error) { 3382 case 0: 3383 break; 3384 case ENOSPC: 3385 ztest_record_enospc("spa_vdev_add"); 3386 break; 3387 default: 3388 fatal(B_FALSE, "spa_vdev_add() = %d", error); 3389 } 3390 } 3391 3392 mutex_exit(&ztest_vdev_lock); 3393 } 3394 3395 void 3396 ztest_vdev_class_add(ztest_ds_t *zd, uint64_t id) 3397 { 3398 (void) zd, (void) id; 3399 ztest_shared_t *zs = ztest_shared; 3400 spa_t *spa = ztest_spa; 3401 uint64_t leaves; 3402 nvlist_t *nvroot; 3403 uint64_t raidz_children; 3404 const char *class = (ztest_random(2) == 0) ? 
3405 VDEV_ALLOC_BIAS_SPECIAL : VDEV_ALLOC_BIAS_DEDUP; 3406 int error; 3407 3408 /* 3409 * By default add a special vdev 50% of the time 3410 */ 3411 if ((ztest_opts.zo_special_vdevs == ZTEST_VDEV_CLASS_OFF) || 3412 (ztest_opts.zo_special_vdevs == ZTEST_VDEV_CLASS_RND && 3413 ztest_random(2) == 0)) { 3414 return; 3415 } 3416 3417 mutex_enter(&ztest_vdev_lock); 3418 3419 /* Only test with mirrors */ 3420 if (zs->zs_mirrors < 2) { 3421 mutex_exit(&ztest_vdev_lock); 3422 return; 3423 } 3424 3425 /* requires feature@allocation_classes */ 3426 if (!spa_feature_is_enabled(spa, SPA_FEATURE_ALLOCATION_CLASSES)) { 3427 mutex_exit(&ztest_vdev_lock); 3428 return; 3429 } 3430 3431 raidz_children = ztest_get_raidz_children(spa); 3432 leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * raidz_children; 3433 3434 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 3435 ztest_shared->zs_vdev_next_leaf = spa_num_top_vdevs(spa) * leaves; 3436 spa_config_exit(spa, SCL_VDEV, FTAG); 3437 3438 nvroot = make_vdev_root(NULL, NULL, NULL, ztest_opts.zo_vdev_size, 0, 3439 class, raidz_children, zs->zs_mirrors, 1); 3440 3441 error = spa_vdev_add(spa, nvroot); 3442 fnvlist_free(nvroot); 3443 3444 if (error == ENOSPC) 3445 ztest_record_enospc("spa_vdev_add"); 3446 else if (error != 0) 3447 fatal(B_FALSE, "spa_vdev_add() = %d", error); 3448 3449 /* 3450 * 50% of the time allow small blocks in the special class 3451 */ 3452 if (error == 0 && 3453 spa_special_class(spa)->mc_groups == 1 && ztest_random(2) == 0) { 3454 if (ztest_opts.zo_verbose >= 3) 3455 (void) printf("Enabling special VDEV small blocks\n"); 3456 error = ztest_dsl_prop_set_uint64(zd->zd_name, 3457 ZFS_PROP_SPECIAL_SMALL_BLOCKS, 32768, B_FALSE); 3458 ASSERT(error == 0 || error == ENOSPC); 3459 } 3460 3461 mutex_exit(&ztest_vdev_lock); 3462 3463 if (ztest_opts.zo_verbose >= 3) { 3464 metaslab_class_t *mc; 3465 3466 if (strcmp(class, VDEV_ALLOC_BIAS_SPECIAL) == 0) 3467 mc = spa_special_class(spa); 3468 else 3469 mc = spa_dedup_class(spa); 3470 (void) printf("Added a %s mirrored vdev (of %d)\n", 3471 class, (int)mc->mc_groups); 3472 } 3473 } 3474 3475 /* 3476 * Verify that adding/removing aux devices (l2arc, hot spare) works as expected. 3477 */ 3478 void 3479 ztest_vdev_aux_add_remove(ztest_ds_t *zd, uint64_t id) 3480 { 3481 (void) zd, (void) id; 3482 ztest_shared_t *zs = ztest_shared; 3483 spa_t *spa = ztest_spa; 3484 vdev_t *rvd = spa->spa_root_vdev; 3485 spa_aux_vdev_t *sav; 3486 const char *aux; 3487 char *path; 3488 uint64_t guid = 0; 3489 int error, ignore_err = 0; 3490 3491 if (ztest_opts.zo_mmp_test) 3492 return; 3493 3494 path = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); 3495 3496 if (ztest_random(2) == 0) { 3497 sav = &spa->spa_spares; 3498 aux = ZPOOL_CONFIG_SPARES; 3499 } else { 3500 sav = &spa->spa_l2cache; 3501 aux = ZPOOL_CONFIG_L2CACHE; 3502 } 3503 3504 mutex_enter(&ztest_vdev_lock); 3505 3506 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 3507 3508 if (sav->sav_count != 0 && ztest_random(4) == 0) { 3509 /* 3510 * Pick a random device to remove. 3511 */ 3512 vdev_t *svd = sav->sav_vdevs[ztest_random(sav->sav_count)]; 3513 3514 /* dRAID spares cannot be removed; try anyways to see ENOTSUP */ 3515 if (strstr(svd->vdev_path, VDEV_TYPE_DRAID) != NULL) 3516 ignore_err = ENOTSUP; 3517 3518 guid = svd->vdev_guid; 3519 } else { 3520 /* 3521 * Find an unused device we can add. 
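 * Probe candidate paths built from ztest_aux_template until we find one
 * that is neither an existing aux vdev nor part of the main vdev tree.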
3522 */ 3523 zs->zs_vdev_aux = 0; 3524 for (;;) { 3525 int c; 3526 (void) snprintf(path, MAXPATHLEN, ztest_aux_template, 3527 ztest_opts.zo_dir, ztest_opts.zo_pool, aux, 3528 zs->zs_vdev_aux); 3529 for (c = 0; c < sav->sav_count; c++) 3530 if (strcmp(sav->sav_vdevs[c]->vdev_path, 3531 path) == 0) 3532 break; 3533 if (c == sav->sav_count && 3534 vdev_lookup_by_path(rvd, path) == NULL) 3535 break; 3536 zs->zs_vdev_aux++; 3537 } 3538 } 3539 3540 spa_config_exit(spa, SCL_VDEV, FTAG); 3541 3542 if (guid == 0) { 3543 /* 3544 * Add a new device. 3545 */ 3546 nvlist_t *nvroot = make_vdev_root(NULL, aux, NULL, 3547 (ztest_opts.zo_vdev_size * 5) / 4, 0, NULL, 0, 0, 1); 3548 error = spa_vdev_add(spa, nvroot); 3549 3550 switch (error) { 3551 case 0: 3552 break; 3553 default: 3554 fatal(B_FALSE, "spa_vdev_add(%p) = %d", nvroot, error); 3555 } 3556 fnvlist_free(nvroot); 3557 } else { 3558 /* 3559 * Remove an existing device. Sometimes, dirty its 3560 * vdev state first to make sure we handle removal 3561 * of devices that have pending state changes. 3562 */ 3563 if (ztest_random(2) == 0) 3564 (void) vdev_online(spa, guid, 0, NULL); 3565 3566 error = spa_vdev_remove(spa, guid, B_FALSE); 3567 3568 switch (error) { 3569 case 0: 3570 case EBUSY: 3571 case ZFS_ERR_CHECKPOINT_EXISTS: 3572 case ZFS_ERR_DISCARDING_CHECKPOINT: 3573 break; 3574 default: 3575 if (error != ignore_err) 3576 fatal(B_FALSE, 3577 "spa_vdev_remove(%"PRIu64") = %d", 3578 guid, error); 3579 } 3580 } 3581 3582 mutex_exit(&ztest_vdev_lock); 3583 3584 umem_free(path, MAXPATHLEN); 3585 } 3586 3587 /* 3588 * split a pool if it has mirror tlvdevs 3589 */ 3590 void 3591 ztest_split_pool(ztest_ds_t *zd, uint64_t id) 3592 { 3593 (void) zd, (void) id; 3594 ztest_shared_t *zs = ztest_shared; 3595 spa_t *spa = ztest_spa; 3596 vdev_t *rvd = spa->spa_root_vdev; 3597 nvlist_t *tree, **child, *config, *split, **schild; 3598 uint_t c, children, schildren = 0, lastlogid = 0; 3599 int error = 0; 3600 3601 if (ztest_opts.zo_mmp_test) 3602 return; 3603 3604 mutex_enter(&ztest_vdev_lock); 3605 3606 /* ensure we have a usable config; mirrors of raidz aren't supported */ 3607 if (zs->zs_mirrors < 3 || ztest_opts.zo_raid_children > 1) { 3608 mutex_exit(&ztest_vdev_lock); 3609 return; 3610 } 3611 3612 /* clean up the old pool, if any */ 3613 (void) spa_destroy("splitp"); 3614 3615 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 3616 3617 /* generate a config from the existing config */ 3618 mutex_enter(&spa->spa_props_lock); 3619 tree = fnvlist_lookup_nvlist(spa->spa_config, ZPOOL_CONFIG_VDEV_TREE); 3620 mutex_exit(&spa->spa_props_lock); 3621 3622 VERIFY0(nvlist_lookup_nvlist_array(tree, ZPOOL_CONFIG_CHILDREN, 3623 &child, &children)); 3624 3625 schild = umem_alloc(rvd->vdev_children * sizeof (nvlist_t *), 3626 UMEM_NOFAIL); 3627 for (c = 0; c < children; c++) { 3628 vdev_t *tvd = rvd->vdev_child[c]; 3629 nvlist_t **mchild; 3630 uint_t mchildren; 3631 3632 if (tvd->vdev_islog || tvd->vdev_ops == &vdev_hole_ops) { 3633 schild[schildren] = fnvlist_alloc(); 3634 fnvlist_add_string(schild[schildren], 3635 ZPOOL_CONFIG_TYPE, VDEV_TYPE_HOLE); 3636 fnvlist_add_uint64(schild[schildren], 3637 ZPOOL_CONFIG_IS_HOLE, 1); 3638 if (lastlogid == 0) 3639 lastlogid = schildren; 3640 ++schildren; 3641 continue; 3642 } 3643 lastlogid = 0; 3644 VERIFY0(nvlist_lookup_nvlist_array(child[c], 3645 ZPOOL_CONFIG_CHILDREN, &mchild, &mchildren)); 3646 schild[schildren++] = fnvlist_dup(mchild[0]); 3647 } 3648 3649 /* OK, create a config that can be used to split */ 3650 split = 
fnvlist_alloc(); 3651 fnvlist_add_string(split, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT); 3652 fnvlist_add_nvlist_array(split, ZPOOL_CONFIG_CHILDREN, 3653 (const nvlist_t **)schild, lastlogid != 0 ? lastlogid : schildren); 3654 3655 config = fnvlist_alloc(); 3656 fnvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, split); 3657 3658 for (c = 0; c < schildren; c++) 3659 fnvlist_free(schild[c]); 3660 umem_free(schild, rvd->vdev_children * sizeof (nvlist_t *)); 3661 fnvlist_free(split); 3662 3663 spa_config_exit(spa, SCL_VDEV, FTAG); 3664 3665 (void) pthread_rwlock_wrlock(&ztest_name_lock); 3666 error = spa_vdev_split_mirror(spa, "splitp", config, NULL, B_FALSE); 3667 (void) pthread_rwlock_unlock(&ztest_name_lock); 3668 3669 fnvlist_free(config); 3670 3671 if (error == 0) { 3672 (void) printf("successful split - results:\n"); 3673 mutex_enter(&spa_namespace_lock); 3674 show_pool_stats(spa); 3675 show_pool_stats(spa_lookup("splitp")); 3676 mutex_exit(&spa_namespace_lock); 3677 ++zs->zs_splits; 3678 --zs->zs_mirrors; 3679 } 3680 mutex_exit(&ztest_vdev_lock); 3681 } 3682 3683 /* 3684 * Verify that we can attach and detach devices. 3685 */ 3686 void 3687 ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id) 3688 { 3689 (void) zd, (void) id; 3690 ztest_shared_t *zs = ztest_shared; 3691 spa_t *spa = ztest_spa; 3692 spa_aux_vdev_t *sav = &spa->spa_spares; 3693 vdev_t *rvd = spa->spa_root_vdev; 3694 vdev_t *oldvd, *newvd, *pvd; 3695 nvlist_t *root; 3696 uint64_t leaves; 3697 uint64_t leaf, top; 3698 uint64_t ashift = ztest_get_ashift(); 3699 uint64_t oldguid, pguid; 3700 uint64_t oldsize, newsize; 3701 uint64_t raidz_children; 3702 char *oldpath, *newpath; 3703 int replacing; 3704 int oldvd_has_siblings = B_FALSE; 3705 int newvd_is_spare = B_FALSE; 3706 int newvd_is_dspare = B_FALSE; 3707 int oldvd_is_log; 3708 int oldvd_is_special; 3709 int error, expected_error; 3710 3711 if (ztest_opts.zo_mmp_test) 3712 return; 3713 3714 oldpath = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); 3715 newpath = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); 3716 3717 mutex_enter(&ztest_vdev_lock); 3718 raidz_children = ztest_get_raidz_children(spa); 3719 leaves = MAX(zs->zs_mirrors, 1) * raidz_children; 3720 3721 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3722 3723 /* 3724 * If a vdev is in the process of being removed, its removal may 3725 * finish while we are in progress, leading to an unexpected error 3726 * value. Don't bother trying to attach while we are in the middle 3727 * of removal. 3728 */ 3729 if (ztest_device_removal_active) { 3730 spa_config_exit(spa, SCL_ALL, FTAG); 3731 goto out; 3732 } 3733 3734 /* 3735 * RAIDZ leaf VDEV mirrors are not currently supported while a 3736 * RAIDZ expansion is in progress. 3737 */ 3738 if (ztest_opts.zo_raid_do_expand) { 3739 spa_config_exit(spa, SCL_ALL, FTAG); 3740 goto out; 3741 } 3742 3743 /* 3744 * Decide whether to do an attach or a replace. 3745 */ 3746 replacing = ztest_random(2); 3747 3748 /* 3749 * Pick a random top-level vdev. 3750 */ 3751 top = ztest_random_vdev_top(spa, B_TRUE); 3752 3753 /* 3754 * Pick a random leaf within it. 3755 */ 3756 leaf = ztest_random(leaves); 3757 3758 /* 3759 * Locate this vdev. 
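 * Starting from the chosen top-level vdev, descend through the optional
 * mirror and raidz/draid layers (and any in-progress replacement) until
 * we reach a leaf vdev.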
3760 */ 3761 oldvd = rvd->vdev_child[top]; 3762 3763 /* pick a child from the mirror */ 3764 if (zs->zs_mirrors >= 1) { 3765 ASSERT3P(oldvd->vdev_ops, ==, &vdev_mirror_ops); 3766 ASSERT3U(oldvd->vdev_children, >=, zs->zs_mirrors); 3767 oldvd = oldvd->vdev_child[leaf / raidz_children]; 3768 } 3769 3770 /* pick a child out of the raidz group */ 3771 if (ztest_opts.zo_raid_children > 1) { 3772 if (strcmp(oldvd->vdev_ops->vdev_op_type, "raidz") == 0) 3773 ASSERT3P(oldvd->vdev_ops, ==, &vdev_raidz_ops); 3774 else 3775 ASSERT3P(oldvd->vdev_ops, ==, &vdev_draid_ops); 3776 oldvd = oldvd->vdev_child[leaf % raidz_children]; 3777 } 3778 3779 /* 3780 * If we're already doing an attach or replace, oldvd may be a 3781 * mirror vdev -- in which case, pick a random child. 3782 */ 3783 while (oldvd->vdev_children != 0) { 3784 oldvd_has_siblings = B_TRUE; 3785 ASSERT3U(oldvd->vdev_children, >=, 2); 3786 oldvd = oldvd->vdev_child[ztest_random(oldvd->vdev_children)]; 3787 } 3788 3789 oldguid = oldvd->vdev_guid; 3790 oldsize = vdev_get_min_asize(oldvd); 3791 oldvd_is_log = oldvd->vdev_top->vdev_islog; 3792 oldvd_is_special = 3793 oldvd->vdev_top->vdev_alloc_bias == VDEV_BIAS_SPECIAL || 3794 oldvd->vdev_top->vdev_alloc_bias == VDEV_BIAS_DEDUP; 3795 (void) strlcpy(oldpath, oldvd->vdev_path, MAXPATHLEN); 3796 pvd = oldvd->vdev_parent; 3797 pguid = pvd->vdev_guid; 3798 3799 /* 3800 * If oldvd has siblings, then half of the time, detach it. Prior 3801 * to the detach the pool is scrubbed in order to prevent creating 3802 * unrepairable blocks as a result of the data corruption injection. 3803 */ 3804 if (oldvd_has_siblings && ztest_random(2) == 0) { 3805 spa_config_exit(spa, SCL_ALL, FTAG); 3806 3807 error = ztest_scrub_impl(spa); 3808 if (error) 3809 goto out; 3810 3811 error = spa_vdev_detach(spa, oldguid, pguid, B_FALSE); 3812 if (error != 0 && error != ENODEV && error != EBUSY && 3813 error != ENOTSUP && error != ZFS_ERR_CHECKPOINT_EXISTS && 3814 error != ZFS_ERR_DISCARDING_CHECKPOINT) 3815 fatal(B_FALSE, "detach (%s) returned %d", 3816 oldpath, error); 3817 goto out; 3818 } 3819 3820 /* 3821 * For the new vdev, choose with equal probability between the two 3822 * standard paths (ending in either 'a' or 'b') or a random hot spare. 3823 */ 3824 if (sav->sav_count != 0 && ztest_random(3) == 0) { 3825 newvd = sav->sav_vdevs[ztest_random(sav->sav_count)]; 3826 newvd_is_spare = B_TRUE; 3827 3828 if (newvd->vdev_ops == &vdev_draid_spare_ops) 3829 newvd_is_dspare = B_TRUE; 3830 3831 (void) strlcpy(newpath, newvd->vdev_path, MAXPATHLEN); 3832 } else { 3833 (void) snprintf(newpath, MAXPATHLEN, ztest_dev_template, 3834 ztest_opts.zo_dir, ztest_opts.zo_pool, 3835 top * leaves + leaf); 3836 if (ztest_random(2) == 0) 3837 newpath[strlen(newpath) - 1] = 'b'; 3838 newvd = vdev_lookup_by_path(rvd, newpath); 3839 } 3840 3841 if (newvd) { 3842 /* 3843 * Reopen to ensure the vdev's asize field isn't stale. 3844 */ 3845 vdev_reopen(newvd); 3846 newsize = vdev_get_min_asize(newvd); 3847 } else { 3848 /* 3849 * Make newsize a little bigger or smaller than oldsize. 3850 * If it's smaller, the attach should fail. 3851 * If it's larger, and we're doing a replace, 3852 * we should get dynamic LUN growth when we're done. 3853 */ 3854 newsize = 10 * oldsize / (9 + ztest_random(3)); 3855 } 3856 3857 /* 3858 * If pvd is not a mirror or root, the attach should fail with ENOTSUP, 3859 * unless it's a replace; in that case any non-replacing parent is OK. 3860 * 3861 * If newvd is already part of the pool, it should fail with EBUSY. 
3862 * 3863 * If newvd is too small, it should fail with EOVERFLOW. 3864 * 3865 * If newvd is a distributed spare and it's being attached to a 3866 * dRAID which is not its parent it should fail with EINVAL. 3867 */ 3868 if (pvd->vdev_ops != &vdev_mirror_ops && 3869 pvd->vdev_ops != &vdev_root_ops && (!replacing || 3870 pvd->vdev_ops == &vdev_replacing_ops || 3871 pvd->vdev_ops == &vdev_spare_ops)) 3872 expected_error = ENOTSUP; 3873 else if (newvd_is_spare && 3874 (!replacing || oldvd_is_log || oldvd_is_special)) 3875 expected_error = ENOTSUP; 3876 else if (newvd == oldvd) 3877 expected_error = replacing ? 0 : EBUSY; 3878 else if (vdev_lookup_by_path(rvd, newpath) != NULL) 3879 expected_error = EBUSY; 3880 else if (!newvd_is_dspare && newsize < oldsize) 3881 expected_error = EOVERFLOW; 3882 else if (ashift > oldvd->vdev_top->vdev_ashift) 3883 expected_error = EDOM; 3884 else if (newvd_is_dspare && pvd != vdev_draid_spare_get_parent(newvd)) 3885 expected_error = EINVAL; 3886 else 3887 expected_error = 0; 3888 3889 spa_config_exit(spa, SCL_ALL, FTAG); 3890 3891 /* 3892 * Build the nvlist describing newpath. 3893 */ 3894 root = make_vdev_root(newpath, NULL, NULL, newvd == NULL ? newsize : 0, 3895 ashift, NULL, 0, 0, 1); 3896 3897 /* 3898 * When supported select either a healing or sequential resilver. 3899 */ 3900 boolean_t rebuilding = B_FALSE; 3901 if (pvd->vdev_ops == &vdev_mirror_ops || 3902 pvd->vdev_ops == &vdev_root_ops) { 3903 rebuilding = !!ztest_random(2); 3904 } 3905 3906 error = spa_vdev_attach(spa, oldguid, root, replacing, rebuilding); 3907 3908 fnvlist_free(root); 3909 3910 /* 3911 * If our parent was the replacing vdev, but the replace completed, 3912 * then instead of failing with ENOTSUP we may either succeed, 3913 * fail with ENODEV, or fail with EOVERFLOW. 3914 */ 3915 if (expected_error == ENOTSUP && 3916 (error == 0 || error == ENODEV || error == EOVERFLOW)) 3917 expected_error = error; 3918 3919 /* 3920 * If someone grew the LUN, the replacement may be too small. 
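 * In that case the attach returns EOVERFLOW; EBUSY is tolerated as well
 * since another thread may be racing us for the same device.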
3921 */ 3922 if (error == EOVERFLOW || error == EBUSY) 3923 expected_error = error; 3924 3925 if (error == ZFS_ERR_CHECKPOINT_EXISTS || 3926 error == ZFS_ERR_DISCARDING_CHECKPOINT || 3927 error == ZFS_ERR_RESILVER_IN_PROGRESS || 3928 error == ZFS_ERR_REBUILD_IN_PROGRESS) 3929 expected_error = error; 3930 3931 if (error != expected_error && expected_error != EBUSY) { 3932 fatal(B_FALSE, "attach (%s %"PRIu64", %s %"PRIu64", %d) " 3933 "returned %d, expected %d", 3934 oldpath, oldsize, newpath, 3935 newsize, replacing, error, expected_error); 3936 } 3937 out: 3938 mutex_exit(&ztest_vdev_lock); 3939 3940 umem_free(oldpath, MAXPATHLEN); 3941 umem_free(newpath, MAXPATHLEN); 3942 } 3943 3944 static void 3945 raidz_scratch_verify(void) 3946 { 3947 spa_t *spa; 3948 uint64_t write_size, logical_size, offset; 3949 raidz_reflow_scratch_state_t state; 3950 vdev_raidz_expand_t *vre; 3951 vdev_t *raidvd; 3952 3953 ASSERT(raidz_expand_pause_point == RAIDZ_EXPAND_PAUSE_NONE); 3954 3955 if (ztest_scratch_state->zs_raidz_scratch_verify_pause == 0) 3956 return; 3957 3958 kernel_init(SPA_MODE_READ); 3959 3960 mutex_enter(&spa_namespace_lock); 3961 spa = spa_lookup(ztest_opts.zo_pool); 3962 ASSERT(spa); 3963 spa->spa_import_flags |= ZFS_IMPORT_SKIP_MMP; 3964 mutex_exit(&spa_namespace_lock); 3965 3966 VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG)); 3967 3968 ASSERT3U(RRSS_GET_OFFSET(&spa->spa_uberblock), !=, UINT64_MAX); 3969 3970 mutex_enter(&ztest_vdev_lock); 3971 3972 spa_config_enter(spa, SCL_ALL, FTAG, RW_READER); 3973 3974 vre = spa->spa_raidz_expand; 3975 if (vre == NULL) 3976 goto out; 3977 3978 raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); 3979 offset = RRSS_GET_OFFSET(&spa->spa_uberblock); 3980 state = RRSS_GET_STATE(&spa->spa_uberblock); 3981 write_size = P2ALIGN(VDEV_BOOT_SIZE, 1 << raidvd->vdev_ashift); 3982 logical_size = write_size * raidvd->vdev_children; 3983 3984 switch (state) { 3985 /* 3986 * Initial state of reflow process. RAIDZ expansion was 3987 * requested by user, but scratch object was not created. 3988 */ 3989 case RRSS_SCRATCH_NOT_IN_USE: 3990 ASSERT3U(offset, ==, 0); 3991 break; 3992 3993 /* 3994 * Scratch object was synced and stored in boot area. 3995 */ 3996 case RRSS_SCRATCH_VALID: 3997 3998 /* 3999 * Scratch object was synced back to raidz start offset, 4000 * raidz is ready for sector by sector reflow process. 4001 */ 4002 case RRSS_SCRATCH_INVALID_SYNCED: 4003 4004 /* 4005 * Scratch object was synced back to raidz start offset 4006 * on zpool importing, raidz is ready for sector by sector 4007 * reflow process. 4008 */ 4009 case RRSS_SCRATCH_INVALID_SYNCED_ON_IMPORT: 4010 ASSERT3U(offset, ==, logical_size); 4011 break; 4012 4013 /* 4014 * Sector by sector reflow process started. 4015 */ 4016 case RRSS_SCRATCH_INVALID_SYNCED_REFLOW: 4017 ASSERT3U(offset, >=, logical_size); 4018 break; 4019 } 4020 4021 out: 4022 spa_config_exit(spa, SCL_ALL, FTAG); 4023 4024 mutex_exit(&ztest_vdev_lock); 4025 4026 ztest_scratch_state->zs_raidz_scratch_verify_pause = 0; 4027 4028 spa_close(spa, FTAG); 4029 kernel_fini(); 4030 } 4031 4032 static void 4033 ztest_scratch_thread(void *arg) 4034 { 4035 (void) arg; 4036 4037 /* wait up to 10 seconds */ 4038 for (int t = 100; t > 0; t -= 1) { 4039 if (raidz_expand_pause_point == RAIDZ_EXPAND_PAUSE_NONE) 4040 thread_exit(); 4041 4042 (void) poll(NULL, 0, 100); 4043 } 4044 4045 /* killed when the scratch area progress reached a certain point */ 4046 ztest_kill(ztest_shared); 4047 } 4048 4049 /* 4050 * Verify that we can attach raidz device. 
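 * (i.e., expand an existing raidz vdev by one child) and that the
 * expansion scratch area stays consistent if the process is killed
 * mid-reflow.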
4051 */ 4052 void 4053 ztest_vdev_raidz_attach(ztest_ds_t *zd, uint64_t id) 4054 { 4055 (void) zd, (void) id; 4056 ztest_shared_t *zs = ztest_shared; 4057 spa_t *spa = ztest_spa; 4058 uint64_t leaves, raidz_children, newsize, ashift = ztest_get_ashift(); 4059 kthread_t *scratch_thread = NULL; 4060 vdev_t *newvd, *pvd; 4061 nvlist_t *root; 4062 char *newpath = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); 4063 int error, expected_error = 0; 4064 4065 mutex_enter(&ztest_vdev_lock); 4066 4067 spa_config_enter(spa, SCL_ALL, FTAG, RW_READER); 4068 4069 /* Only allow attach when raid-kind = 'eraidz' */ 4070 if (!ztest_opts.zo_raid_do_expand) { 4071 spa_config_exit(spa, SCL_ALL, FTAG); 4072 goto out; 4073 } 4074 4075 if (ztest_opts.zo_mmp_test) { 4076 spa_config_exit(spa, SCL_ALL, FTAG); 4077 goto out; 4078 } 4079 4080 if (ztest_device_removal_active) { 4081 spa_config_exit(spa, SCL_ALL, FTAG); 4082 goto out; 4083 } 4084 4085 pvd = vdev_lookup_top(spa, 0); 4086 4087 ASSERT(pvd->vdev_ops == &vdev_raidz_ops); 4088 4089 /* 4090 * Get size of a child of the raidz group, 4091 * make sure device is a bit bigger 4092 */ 4093 newvd = pvd->vdev_child[ztest_random(pvd->vdev_children)]; 4094 newsize = 10 * vdev_get_min_asize(newvd) / (9 + ztest_random(2)); 4095 4096 /* 4097 * Get next attached leaf id 4098 */ 4099 raidz_children = ztest_get_raidz_children(spa); 4100 leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * raidz_children; 4101 zs->zs_vdev_next_leaf = spa_num_top_vdevs(spa) * leaves; 4102 4103 if (spa->spa_raidz_expand) 4104 expected_error = ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS; 4105 4106 spa_config_exit(spa, SCL_ALL, FTAG); 4107 4108 /* 4109 * Path to vdev to be attached 4110 */ 4111 (void) snprintf(newpath, MAXPATHLEN, ztest_dev_template, 4112 ztest_opts.zo_dir, ztest_opts.zo_pool, zs->zs_vdev_next_leaf); 4113 4114 /* 4115 * Build the nvlist describing newpath. 4116 */ 4117 root = make_vdev_root(newpath, NULL, NULL, newsize, ashift, NULL, 4118 0, 0, 1); 4119 4120 /* 4121 * 50% of the time, set raidz_expand_pause_point to cause 4122 * raidz_reflow_scratch_sync() to pause at a certain point and 4123 * then kill the test after 10 seconds so raidz_scratch_verify() 4124 * can confirm consistency when the pool is imported. 4125 */ 4126 if (ztest_random(2) == 0 && expected_error == 0) { 4127 raidz_expand_pause_point = 4128 ztest_random(RAIDZ_EXPAND_PAUSE_SCRATCH_POST_REFLOW_2) + 1; 4129 scratch_thread = thread_create(NULL, 0, ztest_scratch_thread, 4130 ztest_shared, 0, NULL, TS_RUN | TS_JOINABLE, defclsyspri); 4131 } 4132 4133 error = spa_vdev_attach(spa, pvd->vdev_guid, root, B_FALSE, B_FALSE); 4134 4135 nvlist_free(root); 4136 4137 if (error == EOVERFLOW || error == ENXIO || 4138 error == ZFS_ERR_CHECKPOINT_EXISTS || 4139 error == ZFS_ERR_DISCARDING_CHECKPOINT) 4140 expected_error = error; 4141 4142 if (error != 0 && error != expected_error) { 4143 fatal(0, "raidz attach (%s %"PRIu64") returned %d, expected %d", 4144 newpath, newsize, error, expected_error); 4145 } 4146 4147 if (raidz_expand_pause_point) { 4148 if (error != 0) { 4149 /* 4150 * Do not verify scratch object in case of error 4151 * returned by vdev attaching. 
4152 */ 4153 raidz_expand_pause_point = RAIDZ_EXPAND_PAUSE_NONE; 4154 } 4155 4156 VERIFY0(thread_join(scratch_thread)); 4157 } 4158 out: 4159 mutex_exit(&ztest_vdev_lock); 4160 4161 umem_free(newpath, MAXPATHLEN); 4162 } 4163 4164 void 4165 ztest_device_removal(ztest_ds_t *zd, uint64_t id) 4166 { 4167 (void) zd, (void) id; 4168 spa_t *spa = ztest_spa; 4169 vdev_t *vd; 4170 uint64_t guid; 4171 int error; 4172 4173 mutex_enter(&ztest_vdev_lock); 4174 4175 if (ztest_device_removal_active) { 4176 mutex_exit(&ztest_vdev_lock); 4177 return; 4178 } 4179 4180 /* 4181 * Remove a random top-level vdev and wait for removal to finish. 4182 */ 4183 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 4184 vd = vdev_lookup_top(spa, ztest_random_vdev_top(spa, B_FALSE)); 4185 guid = vd->vdev_guid; 4186 spa_config_exit(spa, SCL_VDEV, FTAG); 4187 4188 error = spa_vdev_remove(spa, guid, B_FALSE); 4189 if (error == 0) { 4190 ztest_device_removal_active = B_TRUE; 4191 mutex_exit(&ztest_vdev_lock); 4192 4193 /* 4194 * spa->spa_vdev_removal is created in a sync task that 4195 * is initiated via dsl_sync_task_nowait(). Since the 4196 * task may not run before spa_vdev_remove() returns, we 4197 * must wait at least 1 txg to ensure that the removal 4198 * struct has been created. 4199 */ 4200 txg_wait_synced(spa_get_dsl(spa), 0); 4201 4202 while (spa->spa_removing_phys.sr_state == DSS_SCANNING) 4203 txg_wait_synced(spa_get_dsl(spa), 0); 4204 } else { 4205 mutex_exit(&ztest_vdev_lock); 4206 return; 4207 } 4208 4209 /* 4210 * The pool needs to be scrubbed after completing device removal. 4211 * Failure to do so may result in checksum errors due to the 4212 * strategy employed by ztest_fault_inject() when selecting which 4213 * offset are redundant and can be damaged. 4214 */ 4215 error = spa_scan(spa, POOL_SCAN_SCRUB); 4216 if (error == 0) { 4217 while (dsl_scan_scrubbing(spa_get_dsl(spa))) 4218 txg_wait_synced(spa_get_dsl(spa), 0); 4219 } 4220 4221 mutex_enter(&ztest_vdev_lock); 4222 ztest_device_removal_active = B_FALSE; 4223 mutex_exit(&ztest_vdev_lock); 4224 } 4225 4226 /* 4227 * Callback function which expands the physical size of the vdev. 4228 */ 4229 static vdev_t * 4230 grow_vdev(vdev_t *vd, void *arg) 4231 { 4232 spa_t *spa __maybe_unused = vd->vdev_spa; 4233 size_t *newsize = arg; 4234 size_t fsize; 4235 int fd; 4236 4237 ASSERT3S(spa_config_held(spa, SCL_STATE, RW_READER), ==, SCL_STATE); 4238 ASSERT(vd->vdev_ops->vdev_op_leaf); 4239 4240 if ((fd = open(vd->vdev_path, O_RDWR)) == -1) 4241 return (vd); 4242 4243 fsize = lseek(fd, 0, SEEK_END); 4244 VERIFY0(ftruncate(fd, *newsize)); 4245 4246 if (ztest_opts.zo_verbose >= 6) { 4247 (void) printf("%s grew from %lu to %lu bytes\n", 4248 vd->vdev_path, (ulong_t)fsize, (ulong_t)*newsize); 4249 } 4250 (void) close(fd); 4251 return (NULL); 4252 } 4253 4254 /* 4255 * Callback function which expands a given vdev by calling vdev_online(). 
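 *
 * Together with grow_vdev() above, this is used as a vdev_walk_tree()
 * callback by ztest_vdev_LUN_growth() below, roughly:
 *
 *	(void) vdev_walk_tree(tvd, grow_vdev, &newsize);   <- step 1: relabel
 *	(void) vdev_walk_tree(tvd, online_vdev, NULL);     <- step 2: new metaslabs
 *
 * A non-NULL return from either walk means the expansion did not complete.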
4256 */ 4257 static vdev_t * 4258 online_vdev(vdev_t *vd, void *arg) 4259 { 4260 (void) arg; 4261 spa_t *spa = vd->vdev_spa; 4262 vdev_t *tvd = vd->vdev_top; 4263 uint64_t guid = vd->vdev_guid; 4264 uint64_t generation = spa->spa_config_generation + 1; 4265 vdev_state_t newstate = VDEV_STATE_UNKNOWN; 4266 int error; 4267 4268 ASSERT3S(spa_config_held(spa, SCL_STATE, RW_READER), ==, SCL_STATE); 4269 ASSERT(vd->vdev_ops->vdev_op_leaf); 4270 4271 /* Calling vdev_online will initialize the new metaslabs */ 4272 spa_config_exit(spa, SCL_STATE, spa); 4273 error = vdev_online(spa, guid, ZFS_ONLINE_EXPAND, &newstate); 4274 spa_config_enter(spa, SCL_STATE, spa, RW_READER); 4275 4276 /* 4277 * If vdev_online returned an error or the underlying vdev_open 4278 * failed then we abort the expand. The only way to know that 4279 * vdev_open fails is by checking the returned newstate. 4280 */ 4281 if (error || newstate != VDEV_STATE_HEALTHY) { 4282 if (ztest_opts.zo_verbose >= 5) { 4283 (void) printf("Unable to expand vdev, state %u, " 4284 "error %d\n", newstate, error); 4285 } 4286 return (vd); 4287 } 4288 ASSERT3U(newstate, ==, VDEV_STATE_HEALTHY); 4289 4290 /* 4291 * Since we dropped the lock we need to ensure that we're 4292 * still talking to the original vdev. It's possible this 4293 * vdev may have been detached/replaced while we were 4294 * trying to online it. 4295 */ 4296 if (generation != spa->spa_config_generation) { 4297 if (ztest_opts.zo_verbose >= 5) { 4298 (void) printf("vdev configuration has changed, " 4299 "guid %"PRIu64", state %"PRIu64", " 4300 "expected gen %"PRIu64", got gen %"PRIu64"\n", 4301 guid, 4302 tvd->vdev_state, 4303 generation, 4304 spa->spa_config_generation); 4305 } 4306 return (vd); 4307 } 4308 return (NULL); 4309 } 4310 4311 /* 4312 * Traverse the vdev tree calling the supplied function. 4313 * We continue to walk the tree until we either have walked all 4314 * children or we receive a non-NULL return from the callback. 4315 * If a NULL callback is passed, then we just return back the first 4316 * leaf vdev we encounter. 4317 */ 4318 static vdev_t * 4319 vdev_walk_tree(vdev_t *vd, vdev_t *(*func)(vdev_t *, void *), void *arg) 4320 { 4321 uint_t c; 4322 4323 if (vd->vdev_ops->vdev_op_leaf) { 4324 if (func == NULL) 4325 return (vd); 4326 else 4327 return (func(vd, arg)); 4328 } 4329 4330 for (c = 0; c < vd->vdev_children; c++) { 4331 vdev_t *cvd = vd->vdev_child[c]; 4332 if ((cvd = vdev_walk_tree(cvd, func, arg)) != NULL) 4333 return (cvd); 4334 } 4335 return (NULL); 4336 } 4337 4338 /* 4339 * Verify that dynamic LUN growth works as expected. 4340 */ 4341 void 4342 ztest_vdev_LUN_growth(ztest_ds_t *zd, uint64_t id) 4343 { 4344 (void) zd, (void) id; 4345 spa_t *spa = ztest_spa; 4346 vdev_t *vd, *tvd; 4347 metaslab_class_t *mc; 4348 metaslab_group_t *mg; 4349 size_t psize, newsize; 4350 uint64_t top; 4351 uint64_t old_class_space, new_class_space, old_ms_count, new_ms_count; 4352 4353 mutex_enter(&ztest_checkpoint_lock); 4354 mutex_enter(&ztest_vdev_lock); 4355 spa_config_enter(spa, SCL_STATE, spa, RW_READER); 4356 4357 /* 4358 * If there is a vdev removal in progress, it could complete while 4359 * we are running, in which case we would not be able to verify 4360 * that the metaslab_class space increased (because it decreases 4361 * when the device removal completes). 
4362 */ 4363 if (ztest_device_removal_active) { 4364 spa_config_exit(spa, SCL_STATE, spa); 4365 mutex_exit(&ztest_vdev_lock); 4366 mutex_exit(&ztest_checkpoint_lock); 4367 return; 4368 } 4369 4370 /* 4371 * If we are under raidz expansion, the test can failed because the 4372 * metaslabs count will not increase immediately after the vdev is 4373 * expanded. It will happen only after raidz expansion completion. 4374 */ 4375 if (spa->spa_raidz_expand) { 4376 spa_config_exit(spa, SCL_STATE, spa); 4377 mutex_exit(&ztest_vdev_lock); 4378 mutex_exit(&ztest_checkpoint_lock); 4379 return; 4380 } 4381 4382 top = ztest_random_vdev_top(spa, B_TRUE); 4383 4384 tvd = spa->spa_root_vdev->vdev_child[top]; 4385 mg = tvd->vdev_mg; 4386 mc = mg->mg_class; 4387 old_ms_count = tvd->vdev_ms_count; 4388 old_class_space = metaslab_class_get_space(mc); 4389 4390 /* 4391 * Determine the size of the first leaf vdev associated with 4392 * our top-level device. 4393 */ 4394 vd = vdev_walk_tree(tvd, NULL, NULL); 4395 ASSERT3P(vd, !=, NULL); 4396 ASSERT(vd->vdev_ops->vdev_op_leaf); 4397 4398 psize = vd->vdev_psize; 4399 4400 /* 4401 * We only try to expand the vdev if it's healthy, less than 4x its 4402 * original size, and it has a valid psize. 4403 */ 4404 if (tvd->vdev_state != VDEV_STATE_HEALTHY || 4405 psize == 0 || psize >= 4 * ztest_opts.zo_vdev_size) { 4406 spa_config_exit(spa, SCL_STATE, spa); 4407 mutex_exit(&ztest_vdev_lock); 4408 mutex_exit(&ztest_checkpoint_lock); 4409 return; 4410 } 4411 ASSERT3U(psize, >, 0); 4412 newsize = psize + MAX(psize / 8, SPA_MAXBLOCKSIZE); 4413 ASSERT3U(newsize, >, psize); 4414 4415 if (ztest_opts.zo_verbose >= 6) { 4416 (void) printf("Expanding LUN %s from %lu to %lu\n", 4417 vd->vdev_path, (ulong_t)psize, (ulong_t)newsize); 4418 } 4419 4420 /* 4421 * Growing the vdev is a two step process: 4422 * 1). expand the physical size (i.e. relabel) 4423 * 2). online the vdev to create the new metaslabs 4424 */ 4425 if (vdev_walk_tree(tvd, grow_vdev, &newsize) != NULL || 4426 vdev_walk_tree(tvd, online_vdev, NULL) != NULL || 4427 tvd->vdev_state != VDEV_STATE_HEALTHY) { 4428 if (ztest_opts.zo_verbose >= 5) { 4429 (void) printf("Could not expand LUN because " 4430 "the vdev configuration changed.\n"); 4431 } 4432 spa_config_exit(spa, SCL_STATE, spa); 4433 mutex_exit(&ztest_vdev_lock); 4434 mutex_exit(&ztest_checkpoint_lock); 4435 return; 4436 } 4437 4438 spa_config_exit(spa, SCL_STATE, spa); 4439 4440 /* 4441 * Expanding the LUN will update the config asynchronously, 4442 * thus we must wait for the async thread to complete any 4443 * pending tasks before proceeding. 4444 */ 4445 for (;;) { 4446 boolean_t done; 4447 mutex_enter(&spa->spa_async_lock); 4448 done = (spa->spa_async_thread == NULL && !spa->spa_async_tasks); 4449 mutex_exit(&spa->spa_async_lock); 4450 if (done) 4451 break; 4452 txg_wait_synced(spa_get_dsl(spa), 0); 4453 (void) poll(NULL, 0, 100); 4454 } 4455 4456 spa_config_enter(spa, SCL_STATE, spa, RW_READER); 4457 4458 tvd = spa->spa_root_vdev->vdev_child[top]; 4459 new_ms_count = tvd->vdev_ms_count; 4460 new_class_space = metaslab_class_get_space(mc); 4461 4462 if (tvd->vdev_mg != mg || mg->mg_class != mc) { 4463 if (ztest_opts.zo_verbose >= 5) { 4464 (void) printf("Could not verify LUN expansion due to " 4465 "intervening vdev offline or remove.\n"); 4466 } 4467 spa_config_exit(spa, SCL_STATE, spa); 4468 mutex_exit(&ztest_vdev_lock); 4469 mutex_exit(&ztest_checkpoint_lock); 4470 return; 4471 } 4472 4473 /* 4474 * Make sure we were able to grow the vdev. 
4475 */ 4476 if (new_ms_count <= old_ms_count) { 4477 fatal(B_FALSE, 4478 "LUN expansion failed: ms_count %"PRIu64" < %"PRIu64"\n", 4479 old_ms_count, new_ms_count); 4480 } 4481 4482 /* 4483 * Make sure we were able to grow the pool. 4484 */ 4485 if (new_class_space <= old_class_space) { 4486 fatal(B_FALSE, 4487 "LUN expansion failed: class_space %"PRIu64" < %"PRIu64"\n", 4488 old_class_space, new_class_space); 4489 } 4490 4491 if (ztest_opts.zo_verbose >= 5) { 4492 char oldnumbuf[NN_NUMBUF_SZ], newnumbuf[NN_NUMBUF_SZ]; 4493 4494 nicenum(old_class_space, oldnumbuf, sizeof (oldnumbuf)); 4495 nicenum(new_class_space, newnumbuf, sizeof (newnumbuf)); 4496 (void) printf("%s grew from %s to %s\n", 4497 spa->spa_name, oldnumbuf, newnumbuf); 4498 } 4499 4500 spa_config_exit(spa, SCL_STATE, spa); 4501 mutex_exit(&ztest_vdev_lock); 4502 mutex_exit(&ztest_checkpoint_lock); 4503 } 4504 4505 /* 4506 * Verify that dmu_objset_{create,destroy,open,close} work as expected. 4507 */ 4508 static void 4509 ztest_objset_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx) 4510 { 4511 (void) arg, (void) cr; 4512 4513 /* 4514 * Create the objects common to all ztest datasets. 4515 */ 4516 VERIFY0(zap_create_claim(os, ZTEST_DIROBJ, 4517 DMU_OT_ZAP_OTHER, DMU_OT_NONE, 0, tx)); 4518 } 4519 4520 static int 4521 ztest_dataset_create(char *dsname) 4522 { 4523 int err; 4524 uint64_t rand; 4525 dsl_crypto_params_t *dcp = NULL; 4526 4527 /* 4528 * 50% of the time, we create encrypted datasets 4529 * using a random cipher suite and a hard-coded 4530 * wrapping key. 4531 */ 4532 rand = ztest_random(2); 4533 if (rand != 0) { 4534 nvlist_t *crypto_args = fnvlist_alloc(); 4535 nvlist_t *props = fnvlist_alloc(); 4536 4537 /* slight bias towards the default cipher suite */ 4538 rand = ztest_random(ZIO_CRYPT_FUNCTIONS); 4539 if (rand < ZIO_CRYPT_AES_128_CCM) 4540 rand = ZIO_CRYPT_ON; 4541 4542 fnvlist_add_uint64(props, 4543 zfs_prop_to_name(ZFS_PROP_ENCRYPTION), rand); 4544 fnvlist_add_uint8_array(crypto_args, "wkeydata", 4545 (uint8_t *)ztest_wkeydata, WRAPPING_KEY_LEN); 4546 4547 /* 4548 * These parameters aren't really used by the kernel. They 4549 * are simply stored so that userspace knows how to load 4550 * the wrapping key. 4551 */ 4552 fnvlist_add_uint64(props, 4553 zfs_prop_to_name(ZFS_PROP_KEYFORMAT), ZFS_KEYFORMAT_RAW); 4554 fnvlist_add_string(props, 4555 zfs_prop_to_name(ZFS_PROP_KEYLOCATION), "prompt"); 4556 fnvlist_add_uint64(props, 4557 zfs_prop_to_name(ZFS_PROP_PBKDF2_SALT), 0ULL); 4558 fnvlist_add_uint64(props, 4559 zfs_prop_to_name(ZFS_PROP_PBKDF2_ITERS), 0ULL); 4560 4561 VERIFY0(dsl_crypto_params_create_nvlist(DCP_CMD_NONE, props, 4562 crypto_args, &dcp)); 4563 4564 /* 4565 * Cycle through all available encryption implementations 4566 * to verify interoperability. 
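 *
 * ("cycle" asks the ICP to rotate through its implementations on each
 * use.  As an illustrative sketch only, pinning a single implementation
 * would instead look like:
 *
 *	VERIFY0(aes_impl_set("generic"));
 *	VERIFY0(gcm_impl_set("generic"));
 *
 * where "generic" is assumed to be the name of the portable C variant.)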
4567 */ 4568 VERIFY0(gcm_impl_set("cycle")); 4569 VERIFY0(aes_impl_set("cycle")); 4570 4571 fnvlist_free(crypto_args); 4572 fnvlist_free(props); 4573 } 4574 4575 err = dmu_objset_create(dsname, DMU_OST_OTHER, 0, dcp, 4576 ztest_objset_create_cb, NULL); 4577 dsl_crypto_params_free(dcp, !!err); 4578 4579 rand = ztest_random(100); 4580 if (err || rand < 80) 4581 return (err); 4582 4583 if (ztest_opts.zo_verbose >= 5) 4584 (void) printf("Setting dataset %s to sync always\n", dsname); 4585 return (ztest_dsl_prop_set_uint64(dsname, ZFS_PROP_SYNC, 4586 ZFS_SYNC_ALWAYS, B_FALSE)); 4587 } 4588 4589 static int 4590 ztest_objset_destroy_cb(const char *name, void *arg) 4591 { 4592 (void) arg; 4593 objset_t *os; 4594 dmu_object_info_t doi; 4595 int error; 4596 4597 /* 4598 * Verify that the dataset contains a directory object. 4599 */ 4600 VERIFY0(ztest_dmu_objset_own(name, DMU_OST_OTHER, B_TRUE, 4601 B_TRUE, FTAG, &os)); 4602 error = dmu_object_info(os, ZTEST_DIROBJ, &doi); 4603 if (error != ENOENT) { 4604 /* We could have crashed in the middle of destroying it */ 4605 ASSERT0(error); 4606 ASSERT3U(doi.doi_type, ==, DMU_OT_ZAP_OTHER); 4607 ASSERT3S(doi.doi_physical_blocks_512, >=, 0); 4608 } 4609 dmu_objset_disown(os, B_TRUE, FTAG); 4610 4611 /* 4612 * Destroy the dataset. 4613 */ 4614 if (strchr(name, '@') != NULL) { 4615 error = dsl_destroy_snapshot(name, B_TRUE); 4616 if (error != ECHRNG) { 4617 /* 4618 * The program was executed, but encountered a runtime 4619 * error, such as insufficient slop, or a hold on the 4620 * dataset. 4621 */ 4622 ASSERT0(error); 4623 } 4624 } else { 4625 error = dsl_destroy_head(name); 4626 if (error == ENOSPC) { 4627 /* There could be checkpoint or insufficient slop */ 4628 ztest_record_enospc(FTAG); 4629 } else if (error != EBUSY) { 4630 /* There could be a hold on this dataset */ 4631 ASSERT0(error); 4632 } 4633 } 4634 return (0); 4635 } 4636 4637 static boolean_t 4638 ztest_snapshot_create(char *osname, uint64_t id) 4639 { 4640 char snapname[ZFS_MAX_DATASET_NAME_LEN]; 4641 int error; 4642 4643 (void) snprintf(snapname, sizeof (snapname), "%"PRIu64"", id); 4644 4645 error = dmu_objset_snapshot_one(osname, snapname); 4646 if (error == ENOSPC) { 4647 ztest_record_enospc(FTAG); 4648 return (B_FALSE); 4649 } 4650 if (error != 0 && error != EEXIST && error != ECHRNG) { 4651 fatal(B_FALSE, "ztest_snapshot_create(%s@%s) = %d", osname, 4652 snapname, error); 4653 } 4654 return (B_TRUE); 4655 } 4656 4657 static boolean_t 4658 ztest_snapshot_destroy(char *osname, uint64_t id) 4659 { 4660 char snapname[ZFS_MAX_DATASET_NAME_LEN]; 4661 int error; 4662 4663 (void) snprintf(snapname, sizeof (snapname), "%s@%"PRIu64"", 4664 osname, id); 4665 4666 error = dsl_destroy_snapshot(snapname, B_FALSE); 4667 if (error != 0 && error != ENOENT && error != ECHRNG) 4668 fatal(B_FALSE, "ztest_snapshot_destroy(%s) = %d", 4669 snapname, error); 4670 return (B_TRUE); 4671 } 4672 4673 void 4674 ztest_dmu_objset_create_destroy(ztest_ds_t *zd, uint64_t id) 4675 { 4676 (void) zd; 4677 ztest_ds_t *zdtmp; 4678 int iters; 4679 int error; 4680 objset_t *os, *os2; 4681 char name[ZFS_MAX_DATASET_NAME_LEN]; 4682 zilog_t *zilog; 4683 int i; 4684 4685 zdtmp = umem_alloc(sizeof (ztest_ds_t), UMEM_NOFAIL); 4686 4687 (void) pthread_rwlock_rdlock(&ztest_name_lock); 4688 4689 (void) snprintf(name, sizeof (name), "%s/temp_%"PRIu64"", 4690 ztest_opts.zo_pool, id); 4691 4692 /* 4693 * If this dataset exists from a previous run, process its replay log 4694 * half of the time. 
If we don't replay it, then dsl_destroy_head() 4695 * (invoked from ztest_objset_destroy_cb()) should just throw it away. 4696 */ 4697 if (ztest_random(2) == 0 && 4698 ztest_dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, 4699 B_TRUE, FTAG, &os) == 0) { 4700 ztest_zd_init(zdtmp, NULL, os); 4701 zil_replay(os, zdtmp, ztest_replay_vector); 4702 ztest_zd_fini(zdtmp); 4703 dmu_objset_disown(os, B_TRUE, FTAG); 4704 } 4705 4706 /* 4707 * There may be an old instance of the dataset we're about to 4708 * create lying around from a previous run. If so, destroy it 4709 * and all of its snapshots. 4710 */ 4711 (void) dmu_objset_find(name, ztest_objset_destroy_cb, NULL, 4712 DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS); 4713 4714 /* 4715 * Verify that the destroyed dataset is no longer in the namespace. 4716 * It may still be present if the destroy above fails with ENOSPC. 4717 */ 4718 error = ztest_dmu_objset_own(name, DMU_OST_OTHER, B_TRUE, B_TRUE, 4719 FTAG, &os); 4720 if (error == 0) { 4721 dmu_objset_disown(os, B_TRUE, FTAG); 4722 ztest_record_enospc(FTAG); 4723 goto out; 4724 } 4725 VERIFY3U(ENOENT, ==, error); 4726 4727 /* 4728 * Verify that we can create a new dataset. 4729 */ 4730 error = ztest_dataset_create(name); 4731 if (error) { 4732 if (error == ENOSPC) { 4733 ztest_record_enospc(FTAG); 4734 goto out; 4735 } 4736 fatal(B_FALSE, "dmu_objset_create(%s) = %d", name, error); 4737 } 4738 4739 VERIFY0(ztest_dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, B_TRUE, 4740 FTAG, &os)); 4741 4742 ztest_zd_init(zdtmp, NULL, os); 4743 4744 /* 4745 * Open the intent log for it. 4746 */ 4747 zilog = zil_open(os, ztest_get_data, NULL); 4748 4749 /* 4750 * Put some objects in there, do a little I/O to them, 4751 * and randomly take a couple of snapshots along the way. 4752 */ 4753 iters = ztest_random(5); 4754 for (i = 0; i < iters; i++) { 4755 ztest_dmu_object_alloc_free(zdtmp, id); 4756 if (ztest_random(iters) == 0) 4757 (void) ztest_snapshot_create(name, i); 4758 } 4759 4760 /* 4761 * Verify that we cannot create an existing dataset. 4762 */ 4763 VERIFY3U(EEXIST, ==, 4764 dmu_objset_create(name, DMU_OST_OTHER, 0, NULL, NULL, NULL)); 4765 4766 /* 4767 * Verify that we can hold an objset that is also owned. 4768 */ 4769 VERIFY0(dmu_objset_hold(name, FTAG, &os2)); 4770 dmu_objset_rele(os2, FTAG); 4771 4772 /* 4773 * Verify that we cannot own an objset that is already owned. 4774 */ 4775 VERIFY3U(EBUSY, ==, ztest_dmu_objset_own(name, DMU_OST_OTHER, 4776 B_FALSE, B_TRUE, FTAG, &os2)); 4777 4778 zil_close(zilog); 4779 dmu_objset_disown(os, B_TRUE, FTAG); 4780 ztest_zd_fini(zdtmp); 4781 out: 4782 (void) pthread_rwlock_unlock(&ztest_name_lock); 4783 4784 umem_free(zdtmp, sizeof (ztest_ds_t)); 4785 } 4786 4787 /* 4788 * Verify that dmu_snapshot_{create,destroy,open,close} work as expected. 4789 */ 4790 void 4791 ztest_dmu_snapshot_create_destroy(ztest_ds_t *zd, uint64_t id) 4792 { 4793 (void) pthread_rwlock_rdlock(&ztest_name_lock); 4794 (void) ztest_snapshot_destroy(zd->zd_name, id); 4795 (void) ztest_snapshot_create(zd->zd_name, id); 4796 (void) pthread_rwlock_unlock(&ztest_name_lock); 4797 } 4798 4799 /* 4800 * Cleanup non-standard snapshots and clones. 
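 *
 * ztest_dsl_dataset_promote_busy() below creates the following datasets,
 * which this function tears down leaf-first (c2, s3, s2, c1, s1) so that
 * no destroy fails because something still depends on it:
 *
 *	<os>@s1_<id>		snapshot of the dataset
 *	<os>/c1_<id>		clone of <os>@s1_<id>
 *	<os>/c1_<id>@s2_<id>	snapshot of the clone
 *	<os>/c1_<id>@s3_<id>	second snapshot of the clone
 *	<os>/c2_<id>		clone of <os>/c1_<id>@s3_<id>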
4801 */ 4802 static void 4803 ztest_dsl_dataset_cleanup(char *osname, uint64_t id) 4804 { 4805 char *snap1name; 4806 char *clone1name; 4807 char *snap2name; 4808 char *clone2name; 4809 char *snap3name; 4810 int error; 4811 4812 snap1name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4813 clone1name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4814 snap2name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4815 clone2name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4816 snap3name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4817 4818 (void) snprintf(snap1name, ZFS_MAX_DATASET_NAME_LEN, "%s@s1_%"PRIu64"", 4819 osname, id); 4820 (void) snprintf(clone1name, ZFS_MAX_DATASET_NAME_LEN, "%s/c1_%"PRIu64"", 4821 osname, id); 4822 (void) snprintf(snap2name, ZFS_MAX_DATASET_NAME_LEN, "%s@s2_%"PRIu64"", 4823 clone1name, id); 4824 (void) snprintf(clone2name, ZFS_MAX_DATASET_NAME_LEN, "%s/c2_%"PRIu64"", 4825 osname, id); 4826 (void) snprintf(snap3name, ZFS_MAX_DATASET_NAME_LEN, "%s@s3_%"PRIu64"", 4827 clone1name, id); 4828 4829 error = dsl_destroy_head(clone2name); 4830 if (error && error != ENOENT) 4831 fatal(B_FALSE, "dsl_destroy_head(%s) = %d", clone2name, error); 4832 error = dsl_destroy_snapshot(snap3name, B_FALSE); 4833 if (error && error != ENOENT) 4834 fatal(B_FALSE, "dsl_destroy_snapshot(%s) = %d", 4835 snap3name, error); 4836 error = dsl_destroy_snapshot(snap2name, B_FALSE); 4837 if (error && error != ENOENT) 4838 fatal(B_FALSE, "dsl_destroy_snapshot(%s) = %d", 4839 snap2name, error); 4840 error = dsl_destroy_head(clone1name); 4841 if (error && error != ENOENT) 4842 fatal(B_FALSE, "dsl_destroy_head(%s) = %d", clone1name, error); 4843 error = dsl_destroy_snapshot(snap1name, B_FALSE); 4844 if (error && error != ENOENT) 4845 fatal(B_FALSE, "dsl_destroy_snapshot(%s) = %d", 4846 snap1name, error); 4847 4848 umem_free(snap1name, ZFS_MAX_DATASET_NAME_LEN); 4849 umem_free(clone1name, ZFS_MAX_DATASET_NAME_LEN); 4850 umem_free(snap2name, ZFS_MAX_DATASET_NAME_LEN); 4851 umem_free(clone2name, ZFS_MAX_DATASET_NAME_LEN); 4852 umem_free(snap3name, ZFS_MAX_DATASET_NAME_LEN); 4853 } 4854 4855 /* 4856 * Verify dsl_dataset_promote handles EBUSY 4857 */ 4858 void 4859 ztest_dsl_dataset_promote_busy(ztest_ds_t *zd, uint64_t id) 4860 { 4861 objset_t *os; 4862 char *snap1name; 4863 char *clone1name; 4864 char *snap2name; 4865 char *clone2name; 4866 char *snap3name; 4867 char *osname = zd->zd_name; 4868 int error; 4869 4870 snap1name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4871 clone1name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4872 snap2name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4873 clone2name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4874 snap3name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4875 4876 (void) pthread_rwlock_rdlock(&ztest_name_lock); 4877 4878 ztest_dsl_dataset_cleanup(osname, id); 4879 4880 (void) snprintf(snap1name, ZFS_MAX_DATASET_NAME_LEN, "%s@s1_%"PRIu64"", 4881 osname, id); 4882 (void) snprintf(clone1name, ZFS_MAX_DATASET_NAME_LEN, "%s/c1_%"PRIu64"", 4883 osname, id); 4884 (void) snprintf(snap2name, ZFS_MAX_DATASET_NAME_LEN, "%s@s2_%"PRIu64"", 4885 clone1name, id); 4886 (void) snprintf(clone2name, ZFS_MAX_DATASET_NAME_LEN, "%s/c2_%"PRIu64"", 4887 osname, id); 4888 (void) snprintf(snap3name, ZFS_MAX_DATASET_NAME_LEN, "%s@s3_%"PRIu64"", 4889 clone1name, id); 4890 4891 error = dmu_objset_snapshot_one(osname, strchr(snap1name, '@') + 1); 4892 if (error && error != EEXIST) { 4893 if (error == ENOSPC) { 
4894 ztest_record_enospc(FTAG); 4895 goto out; 4896 } 4897 fatal(B_FALSE, "dmu_take_snapshot(%s) = %d", snap1name, error); 4898 } 4899 4900 error = dmu_objset_clone(clone1name, snap1name); 4901 if (error) { 4902 if (error == ENOSPC) { 4903 ztest_record_enospc(FTAG); 4904 goto out; 4905 } 4906 fatal(B_FALSE, "dmu_objset_create(%s) = %d", clone1name, error); 4907 } 4908 4909 error = dmu_objset_snapshot_one(clone1name, strchr(snap2name, '@') + 1); 4910 if (error && error != EEXIST) { 4911 if (error == ENOSPC) { 4912 ztest_record_enospc(FTAG); 4913 goto out; 4914 } 4915 fatal(B_FALSE, "dmu_open_snapshot(%s) = %d", snap2name, error); 4916 } 4917 4918 error = dmu_objset_snapshot_one(clone1name, strchr(snap3name, '@') + 1); 4919 if (error && error != EEXIST) { 4920 if (error == ENOSPC) { 4921 ztest_record_enospc(FTAG); 4922 goto out; 4923 } 4924 fatal(B_FALSE, "dmu_open_snapshot(%s) = %d", snap3name, error); 4925 } 4926 4927 error = dmu_objset_clone(clone2name, snap3name); 4928 if (error) { 4929 if (error == ENOSPC) { 4930 ztest_record_enospc(FTAG); 4931 goto out; 4932 } 4933 fatal(B_FALSE, "dmu_objset_create(%s) = %d", clone2name, error); 4934 } 4935 4936 error = ztest_dmu_objset_own(snap2name, DMU_OST_ANY, B_TRUE, B_TRUE, 4937 FTAG, &os); 4938 if (error) 4939 fatal(B_FALSE, "dmu_objset_own(%s) = %d", snap2name, error); 4940 error = dsl_dataset_promote(clone2name, NULL); 4941 if (error == ENOSPC) { 4942 dmu_objset_disown(os, B_TRUE, FTAG); 4943 ztest_record_enospc(FTAG); 4944 goto out; 4945 } 4946 if (error != EBUSY) 4947 fatal(B_FALSE, "dsl_dataset_promote(%s), %d, not EBUSY", 4948 clone2name, error); 4949 dmu_objset_disown(os, B_TRUE, FTAG); 4950 4951 out: 4952 ztest_dsl_dataset_cleanup(osname, id); 4953 4954 (void) pthread_rwlock_unlock(&ztest_name_lock); 4955 4956 umem_free(snap1name, ZFS_MAX_DATASET_NAME_LEN); 4957 umem_free(clone1name, ZFS_MAX_DATASET_NAME_LEN); 4958 umem_free(snap2name, ZFS_MAX_DATASET_NAME_LEN); 4959 umem_free(clone2name, ZFS_MAX_DATASET_NAME_LEN); 4960 umem_free(snap3name, ZFS_MAX_DATASET_NAME_LEN); 4961 } 4962 4963 #undef OD_ARRAY_SIZE 4964 #define OD_ARRAY_SIZE 4 4965 4966 /* 4967 * Verify that dmu_object_{alloc,free} work as expected. 4968 */ 4969 void 4970 ztest_dmu_object_alloc_free(ztest_ds_t *zd, uint64_t id) 4971 { 4972 ztest_od_t *od; 4973 int batchsize; 4974 int size; 4975 int b; 4976 4977 size = sizeof (ztest_od_t) * OD_ARRAY_SIZE; 4978 od = umem_alloc(size, UMEM_NOFAIL); 4979 batchsize = OD_ARRAY_SIZE; 4980 4981 for (b = 0; b < batchsize; b++) 4982 ztest_od_init(od + b, id, FTAG, b, DMU_OT_UINT64_OTHER, 4983 0, 0, 0); 4984 4985 /* 4986 * Destroy the previous batch of objects, create a new batch, 4987 * and do some I/O on the new objects. 4988 */ 4989 if (ztest_object_init(zd, od, size, B_TRUE) != 0) { 4990 zd->zd_od = NULL; 4991 umem_free(od, size); 4992 return; 4993 } 4994 4995 while (ztest_random(4 * batchsize) != 0) 4996 ztest_io(zd, od[ztest_random(batchsize)].od_object, 4997 ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT); 4998 4999 umem_free(od, size); 5000 } 5001 5002 /* 5003 * Rewind the global allocator to verify object allocation backfilling. 5004 */ 5005 void 5006 ztest_dmu_object_next_chunk(ztest_ds_t *zd, uint64_t id) 5007 { 5008 (void) id; 5009 objset_t *os = zd->zd_os; 5010 uint_t dnodes_per_chunk = 1 << dmu_object_alloc_chunk_shift; 5011 uint64_t object; 5012 5013 /* 5014 * Rewind the global allocator randomly back to a lower object number 5015 * to force backfilling and reclamation of recently freed dnodes. 
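 *
 * For example, assuming the default dmu_object_alloc_chunk_shift of 7
 * (128 dnodes per chunk), a random object number of 1000 rewinds
 * os_obj_next_chunk to P2ALIGN(1000, 128) == 896, and subsequent
 * allocations backfill from there.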
5016 */ 5017 mutex_enter(&os->os_obj_lock); 5018 object = ztest_random(os->os_obj_next_chunk); 5019 os->os_obj_next_chunk = P2ALIGN(object, dnodes_per_chunk); 5020 mutex_exit(&os->os_obj_lock); 5021 } 5022 5023 #undef OD_ARRAY_SIZE 5024 #define OD_ARRAY_SIZE 2 5025 5026 /* 5027 * Verify that dmu_{read,write} work as expected. 5028 */ 5029 void 5030 ztest_dmu_read_write(ztest_ds_t *zd, uint64_t id) 5031 { 5032 int size; 5033 ztest_od_t *od; 5034 5035 objset_t *os = zd->zd_os; 5036 size = sizeof (ztest_od_t) * OD_ARRAY_SIZE; 5037 od = umem_alloc(size, UMEM_NOFAIL); 5038 dmu_tx_t *tx; 5039 int freeit, error; 5040 uint64_t i, n, s, txg; 5041 bufwad_t *packbuf, *bigbuf, *pack, *bigH, *bigT; 5042 uint64_t packobj, packoff, packsize, bigobj, bigoff, bigsize; 5043 uint64_t chunksize = (1000 + ztest_random(1000)) * sizeof (uint64_t); 5044 uint64_t regions = 997; 5045 uint64_t stride = 123456789ULL; 5046 uint64_t width = 40; 5047 int free_percent = 5; 5048 5049 /* 5050 * This test uses two objects, packobj and bigobj, that are always 5051 * updated together (i.e. in the same tx) so that their contents are 5052 * in sync and can be compared. Their contents relate to each other 5053 * in a simple way: packobj is a dense array of 'bufwad' structures, 5054 * while bigobj is a sparse array of the same bufwads. Specifically, 5055 * for any index n, there are three bufwads that should be identical: 5056 * 5057 * packobj, at offset n * sizeof (bufwad_t) 5058 * bigobj, at the head of the nth chunk 5059 * bigobj, at the tail of the nth chunk 5060 * 5061 * The chunk size is arbitrary. It doesn't have to be a power of two, 5062 * and it doesn't have any relation to the object blocksize. 5063 * The only requirement is that it can hold at least two bufwads. 5064 * 5065 * Normally, we write the bufwad to each of these locations. 5066 * However, free_percent of the time we instead write zeroes to 5067 * packobj and perform a dmu_free_range() on bigobj. By comparing 5068 * bigobj to packobj, we can verify that the DMU is correctly 5069 * tracking which parts of an object are allocated and free, 5070 * and that the contents of the allocated blocks are correct. 5071 */ 5072 5073 /* 5074 * Read the directory info. If it's the first time, set things up. 5075 */ 5076 ztest_od_init(od, id, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0, chunksize); 5077 ztest_od_init(od + 1, id, FTAG, 1, DMU_OT_UINT64_OTHER, 0, 0, 5078 chunksize); 5079 5080 if (ztest_object_init(zd, od, size, B_FALSE) != 0) { 5081 umem_free(od, size); 5082 return; 5083 } 5084 5085 bigobj = od[0].od_object; 5086 packobj = od[1].od_object; 5087 chunksize = od[0].od_gen; 5088 ASSERT3U(chunksize, ==, od[1].od_gen); 5089 5090 /* 5091 * Prefetch a random chunk of the big object. 5092 * Our aim here is to get some async reads in flight 5093 * for blocks that we may free below; the DMU should 5094 * handle this race correctly. 5095 */ 5096 n = ztest_random(regions) * stride + ztest_random(width); 5097 s = 1 + ztest_random(2 * width - 1); 5098 dmu_prefetch(os, bigobj, 0, n * chunksize, s * chunksize, 5099 ZIO_PRIORITY_SYNC_READ); 5100 5101 /* 5102 * Pick a random index and compute the offsets into packobj and bigobj. 
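 *
 * For chunk size c, index n maps to the three copies of the same bufwad:
 *
 *	packobj:  byte offset n * sizeof (bufwad_t)
 *	bigobj:   byte offset n * c                            (head of chunk n)
 *	bigobj:   byte offset (n + 1) * c - sizeof (bufwad_t)  (tail of chunk n)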
5103 */ 5104 n = ztest_random(regions) * stride + ztest_random(width); 5105 s = 1 + ztest_random(width - 1); 5106 5107 packoff = n * sizeof (bufwad_t); 5108 packsize = s * sizeof (bufwad_t); 5109 5110 bigoff = n * chunksize; 5111 bigsize = s * chunksize; 5112 5113 packbuf = umem_alloc(packsize, UMEM_NOFAIL); 5114 bigbuf = umem_alloc(bigsize, UMEM_NOFAIL); 5115 5116 /* 5117 * free_percent of the time, free a range of bigobj rather than 5118 * overwriting it. 5119 */ 5120 freeit = (ztest_random(100) < free_percent); 5121 5122 /* 5123 * Read the current contents of our objects. 5124 */ 5125 error = dmu_read(os, packobj, packoff, packsize, packbuf, 5126 DMU_READ_PREFETCH); 5127 ASSERT0(error); 5128 error = dmu_read(os, bigobj, bigoff, bigsize, bigbuf, 5129 DMU_READ_PREFETCH); 5130 ASSERT0(error); 5131 5132 /* 5133 * Get a tx for the mods to both packobj and bigobj. 5134 */ 5135 tx = dmu_tx_create(os); 5136 5137 dmu_tx_hold_write(tx, packobj, packoff, packsize); 5138 5139 if (freeit) 5140 dmu_tx_hold_free(tx, bigobj, bigoff, bigsize); 5141 else 5142 dmu_tx_hold_write(tx, bigobj, bigoff, bigsize); 5143 5144 /* This accounts for setting the checksum/compression. */ 5145 dmu_tx_hold_bonus(tx, bigobj); 5146 5147 txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); 5148 if (txg == 0) { 5149 umem_free(packbuf, packsize); 5150 umem_free(bigbuf, bigsize); 5151 umem_free(od, size); 5152 return; 5153 } 5154 5155 enum zio_checksum cksum; 5156 do { 5157 cksum = (enum zio_checksum) 5158 ztest_random_dsl_prop(ZFS_PROP_CHECKSUM); 5159 } while (cksum >= ZIO_CHECKSUM_LEGACY_FUNCTIONS); 5160 dmu_object_set_checksum(os, bigobj, cksum, tx); 5161 5162 enum zio_compress comp; 5163 do { 5164 comp = (enum zio_compress) 5165 ztest_random_dsl_prop(ZFS_PROP_COMPRESSION); 5166 } while (comp >= ZIO_COMPRESS_LEGACY_FUNCTIONS); 5167 dmu_object_set_compress(os, bigobj, comp, tx); 5168 5169 /* 5170 * For each index from n to n + s, verify that the existing bufwad 5171 * in packobj matches the bufwads at the head and tail of the 5172 * corresponding chunk in bigobj. Then update all three bufwads 5173 * with the new values we want to write out. 5174 */ 5175 for (i = 0; i < s; i++) { 5176 /* LINTED */ 5177 pack = (bufwad_t *)((char *)packbuf + i * sizeof (bufwad_t)); 5178 /* LINTED */ 5179 bigH = (bufwad_t *)((char *)bigbuf + i * chunksize); 5180 /* LINTED */ 5181 bigT = (bufwad_t *)((char *)bigH + chunksize) - 1; 5182 5183 ASSERT3U((uintptr_t)bigH - (uintptr_t)bigbuf, <, bigsize); 5184 ASSERT3U((uintptr_t)bigT - (uintptr_t)bigbuf, <, bigsize); 5185 5186 if (pack->bw_txg > txg) 5187 fatal(B_FALSE, 5188 "future leak: got %"PRIx64", open txg is %"PRIx64"", 5189 pack->bw_txg, txg); 5190 5191 if (pack->bw_data != 0 && pack->bw_index != n + i) 5192 fatal(B_FALSE, "wrong index: " 5193 "got %"PRIx64", wanted %"PRIx64"+%"PRIx64"", 5194 pack->bw_index, n, i); 5195 5196 if (memcmp(pack, bigH, sizeof (bufwad_t)) != 0) 5197 fatal(B_FALSE, "pack/bigH mismatch in %p/%p", 5198 pack, bigH); 5199 5200 if (memcmp(pack, bigT, sizeof (bufwad_t)) != 0) 5201 fatal(B_FALSE, "pack/bigT mismatch in %p/%p", 5202 pack, bigT); 5203 5204 if (freeit) { 5205 memset(pack, 0, sizeof (bufwad_t)); 5206 } else { 5207 pack->bw_index = n + i; 5208 pack->bw_txg = txg; 5209 pack->bw_data = 1 + ztest_random(-2ULL); 5210 } 5211 *bigH = *pack; 5212 *bigT = *pack; 5213 } 5214 5215 /* 5216 * We've verified all the old bufwads, and made new ones. 5217 * Now write them out. 
5218 */ 5219 dmu_write(os, packobj, packoff, packsize, packbuf, tx); 5220 5221 if (freeit) { 5222 if (ztest_opts.zo_verbose >= 7) { 5223 (void) printf("freeing offset %"PRIx64" size %"PRIx64"" 5224 " txg %"PRIx64"\n", 5225 bigoff, bigsize, txg); 5226 } 5227 VERIFY0(dmu_free_range(os, bigobj, bigoff, bigsize, tx)); 5228 } else { 5229 if (ztest_opts.zo_verbose >= 7) { 5230 (void) printf("writing offset %"PRIx64" size %"PRIx64"" 5231 " txg %"PRIx64"\n", 5232 bigoff, bigsize, txg); 5233 } 5234 dmu_write(os, bigobj, bigoff, bigsize, bigbuf, tx); 5235 } 5236 5237 dmu_tx_commit(tx); 5238 5239 /* 5240 * Sanity check the stuff we just wrote. 5241 */ 5242 { 5243 void *packcheck = umem_alloc(packsize, UMEM_NOFAIL); 5244 void *bigcheck = umem_alloc(bigsize, UMEM_NOFAIL); 5245 5246 VERIFY0(dmu_read(os, packobj, packoff, 5247 packsize, packcheck, DMU_READ_PREFETCH)); 5248 VERIFY0(dmu_read(os, bigobj, bigoff, 5249 bigsize, bigcheck, DMU_READ_PREFETCH)); 5250 5251 ASSERT0(memcmp(packbuf, packcheck, packsize)); 5252 ASSERT0(memcmp(bigbuf, bigcheck, bigsize)); 5253 5254 umem_free(packcheck, packsize); 5255 umem_free(bigcheck, bigsize); 5256 } 5257 5258 umem_free(packbuf, packsize); 5259 umem_free(bigbuf, bigsize); 5260 umem_free(od, size); 5261 } 5262 5263 static void 5264 compare_and_update_pbbufs(uint64_t s, bufwad_t *packbuf, bufwad_t *bigbuf, 5265 uint64_t bigsize, uint64_t n, uint64_t chunksize, uint64_t txg) 5266 { 5267 uint64_t i; 5268 bufwad_t *pack; 5269 bufwad_t *bigH; 5270 bufwad_t *bigT; 5271 5272 /* 5273 * For each index from n to n + s, verify that the existing bufwad 5274 * in packobj matches the bufwads at the head and tail of the 5275 * corresponding chunk in bigobj. Then update all three bufwads 5276 * with the new values we want to write out. 5277 */ 5278 for (i = 0; i < s; i++) { 5279 /* LINTED */ 5280 pack = (bufwad_t *)((char *)packbuf + i * sizeof (bufwad_t)); 5281 /* LINTED */ 5282 bigH = (bufwad_t *)((char *)bigbuf + i * chunksize); 5283 /* LINTED */ 5284 bigT = (bufwad_t *)((char *)bigH + chunksize) - 1; 5285 5286 ASSERT3U((uintptr_t)bigH - (uintptr_t)bigbuf, <, bigsize); 5287 ASSERT3U((uintptr_t)bigT - (uintptr_t)bigbuf, <, bigsize); 5288 5289 if (pack->bw_txg > txg) 5290 fatal(B_FALSE, 5291 "future leak: got %"PRIx64", open txg is %"PRIx64"", 5292 pack->bw_txg, txg); 5293 5294 if (pack->bw_data != 0 && pack->bw_index != n + i) 5295 fatal(B_FALSE, "wrong index: " 5296 "got %"PRIx64", wanted %"PRIx64"+%"PRIx64"", 5297 pack->bw_index, n, i); 5298 5299 if (memcmp(pack, bigH, sizeof (bufwad_t)) != 0) 5300 fatal(B_FALSE, "pack/bigH mismatch in %p/%p", 5301 pack, bigH); 5302 5303 if (memcmp(pack, bigT, sizeof (bufwad_t)) != 0) 5304 fatal(B_FALSE, "pack/bigT mismatch in %p/%p", 5305 pack, bigT); 5306 5307 pack->bw_index = n + i; 5308 pack->bw_txg = txg; 5309 pack->bw_data = 1 + ztest_random(-2ULL); 5310 5311 *bigH = *pack; 5312 *bigT = *pack; 5313 } 5314 } 5315 5316 #undef OD_ARRAY_SIZE 5317 #define OD_ARRAY_SIZE 2 5318 5319 void 5320 ztest_dmu_read_write_zcopy(ztest_ds_t *zd, uint64_t id) 5321 { 5322 objset_t *os = zd->zd_os; 5323 ztest_od_t *od; 5324 dmu_tx_t *tx; 5325 uint64_t i; 5326 int error; 5327 int size; 5328 uint64_t n, s, txg; 5329 bufwad_t *packbuf, *bigbuf; 5330 uint64_t packobj, packoff, packsize, bigobj, bigoff, bigsize; 5331 uint64_t blocksize = ztest_random_blocksize(); 5332 uint64_t chunksize = blocksize; 5333 uint64_t regions = 997; 5334 uint64_t stride = 123456789ULL; 5335 uint64_t width = 9; 5336 dmu_buf_t *bonus_db; 5337 arc_buf_t **bigbuf_arcbufs; 5338 
dmu_object_info_t doi; 5339 5340 size = sizeof (ztest_od_t) * OD_ARRAY_SIZE; 5341 od = umem_alloc(size, UMEM_NOFAIL); 5342 5343 /* 5344 * This test uses two objects, packobj and bigobj, that are always 5345 * updated together (i.e. in the same tx) so that their contents are 5346 * in sync and can be compared. Their contents relate to each other 5347 * in a simple way: packobj is a dense array of 'bufwad' structures, 5348 * while bigobj is a sparse array of the same bufwads. Specifically, 5349 * for any index n, there are three bufwads that should be identical: 5350 * 5351 * packobj, at offset n * sizeof (bufwad_t) 5352 * bigobj, at the head of the nth chunk 5353 * bigobj, at the tail of the nth chunk 5354 * 5355 * The chunk size is set equal to bigobj block size so that 5356 * dmu_assign_arcbuf_by_dbuf() can be tested for object updates. 5357 */ 5358 5359 /* 5360 * Read the directory info. If it's the first time, set things up. 5361 */ 5362 ztest_od_init(od, id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, 0, 0); 5363 ztest_od_init(od + 1, id, FTAG, 1, DMU_OT_UINT64_OTHER, 0, 0, 5364 chunksize); 5365 5366 5367 if (ztest_object_init(zd, od, size, B_FALSE) != 0) { 5368 umem_free(od, size); 5369 return; 5370 } 5371 5372 bigobj = od[0].od_object; 5373 packobj = od[1].od_object; 5374 blocksize = od[0].od_blocksize; 5375 chunksize = blocksize; 5376 ASSERT3U(chunksize, ==, od[1].od_gen); 5377 5378 VERIFY0(dmu_object_info(os, bigobj, &doi)); 5379 VERIFY(ISP2(doi.doi_data_block_size)); 5380 VERIFY3U(chunksize, ==, doi.doi_data_block_size); 5381 VERIFY3U(chunksize, >=, 2 * sizeof (bufwad_t)); 5382 5383 /* 5384 * Pick a random index and compute the offsets into packobj and bigobj. 5385 */ 5386 n = ztest_random(regions) * stride + ztest_random(width); 5387 s = 1 + ztest_random(width - 1); 5388 5389 packoff = n * sizeof (bufwad_t); 5390 packsize = s * sizeof (bufwad_t); 5391 5392 bigoff = n * chunksize; 5393 bigsize = s * chunksize; 5394 5395 packbuf = umem_zalloc(packsize, UMEM_NOFAIL); 5396 bigbuf = umem_zalloc(bigsize, UMEM_NOFAIL); 5397 5398 VERIFY0(dmu_bonus_hold(os, bigobj, FTAG, &bonus_db)); 5399 5400 bigbuf_arcbufs = umem_zalloc(2 * s * sizeof (arc_buf_t *), UMEM_NOFAIL); 5401 5402 /* 5403 * Iteration 0 test zcopy for DB_UNCACHED dbufs. 5404 * Iteration 1 test zcopy to already referenced dbufs. 5405 * Iteration 2 test zcopy to dirty dbuf in the same txg. 5406 * Iteration 3 test zcopy to dbuf dirty in previous txg. 5407 * Iteration 4 test zcopy when dbuf is no longer dirty. 5408 * Iteration 5 test zcopy when it can't be done. 5409 * Iteration 6 one more zcopy write. 5410 */ 5411 for (i = 0; i < 7; i++) { 5412 uint64_t j; 5413 uint64_t off; 5414 5415 /* 5416 * In iteration 5 (i == 5) use arcbufs 5417 * that don't match bigobj blksz to test 5418 * dmu_assign_arcbuf_by_dbuf() when it can't directly 5419 * assign an arcbuf to a dbuf. 5420 */ 5421 for (j = 0; j < s; j++) { 5422 if (i != 5 || chunksize < (SPA_MINBLOCKSIZE * 2)) { 5423 bigbuf_arcbufs[j] = 5424 dmu_request_arcbuf(bonus_db, chunksize); 5425 } else { 5426 bigbuf_arcbufs[2 * j] = 5427 dmu_request_arcbuf(bonus_db, chunksize / 2); 5428 bigbuf_arcbufs[2 * j + 1] = 5429 dmu_request_arcbuf(bonus_db, chunksize / 2); 5430 } 5431 } 5432 5433 /* 5434 * Get a tx for the mods to both packobj and bigobj. 
5435 */ 5436 tx = dmu_tx_create(os); 5437 5438 dmu_tx_hold_write(tx, packobj, packoff, packsize); 5439 dmu_tx_hold_write(tx, bigobj, bigoff, bigsize); 5440 5441 txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); 5442 if (txg == 0) { 5443 umem_free(packbuf, packsize); 5444 umem_free(bigbuf, bigsize); 5445 for (j = 0; j < s; j++) { 5446 if (i != 5 || 5447 chunksize < (SPA_MINBLOCKSIZE * 2)) { 5448 dmu_return_arcbuf(bigbuf_arcbufs[j]); 5449 } else { 5450 dmu_return_arcbuf( 5451 bigbuf_arcbufs[2 * j]); 5452 dmu_return_arcbuf( 5453 bigbuf_arcbufs[2 * j + 1]); 5454 } 5455 } 5456 umem_free(bigbuf_arcbufs, 2 * s * sizeof (arc_buf_t *)); 5457 umem_free(od, size); 5458 dmu_buf_rele(bonus_db, FTAG); 5459 return; 5460 } 5461 5462 /* 5463 * 50% of the time don't read objects in the 1st iteration to 5464 * test dmu_assign_arcbuf_by_dbuf() for the case when there are 5465 * no existing dbufs for the specified offsets. 5466 */ 5467 if (i != 0 || ztest_random(2) != 0) { 5468 error = dmu_read(os, packobj, packoff, 5469 packsize, packbuf, DMU_READ_PREFETCH); 5470 ASSERT0(error); 5471 error = dmu_read(os, bigobj, bigoff, bigsize, 5472 bigbuf, DMU_READ_PREFETCH); 5473 ASSERT0(error); 5474 } 5475 compare_and_update_pbbufs(s, packbuf, bigbuf, bigsize, 5476 n, chunksize, txg); 5477 5478 /* 5479 * We've verified all the old bufwads, and made new ones. 5480 * Now write them out. 5481 */ 5482 dmu_write(os, packobj, packoff, packsize, packbuf, tx); 5483 if (ztest_opts.zo_verbose >= 7) { 5484 (void) printf("writing offset %"PRIx64" size %"PRIx64"" 5485 " txg %"PRIx64"\n", 5486 bigoff, bigsize, txg); 5487 } 5488 for (off = bigoff, j = 0; j < s; j++, off += chunksize) { 5489 dmu_buf_t *dbt; 5490 if (i != 5 || chunksize < (SPA_MINBLOCKSIZE * 2)) { 5491 memcpy(bigbuf_arcbufs[j]->b_data, 5492 (caddr_t)bigbuf + (off - bigoff), 5493 chunksize); 5494 } else { 5495 memcpy(bigbuf_arcbufs[2 * j]->b_data, 5496 (caddr_t)bigbuf + (off - bigoff), 5497 chunksize / 2); 5498 memcpy(bigbuf_arcbufs[2 * j + 1]->b_data, 5499 (caddr_t)bigbuf + (off - bigoff) + 5500 chunksize / 2, 5501 chunksize / 2); 5502 } 5503 5504 if (i == 1) { 5505 VERIFY(dmu_buf_hold(os, bigobj, off, 5506 FTAG, &dbt, DMU_READ_NO_PREFETCH) == 0); 5507 } 5508 if (i != 5 || chunksize < (SPA_MINBLOCKSIZE * 2)) { 5509 VERIFY0(dmu_assign_arcbuf_by_dbuf(bonus_db, 5510 off, bigbuf_arcbufs[j], tx)); 5511 } else { 5512 VERIFY0(dmu_assign_arcbuf_by_dbuf(bonus_db, 5513 off, bigbuf_arcbufs[2 * j], tx)); 5514 VERIFY0(dmu_assign_arcbuf_by_dbuf(bonus_db, 5515 off + chunksize / 2, 5516 bigbuf_arcbufs[2 * j + 1], tx)); 5517 } 5518 if (i == 1) { 5519 dmu_buf_rele(dbt, FTAG); 5520 } 5521 } 5522 dmu_tx_commit(tx); 5523 5524 /* 5525 * Sanity check the stuff we just wrote. 
5526 */ 5527 { 5528 void *packcheck = umem_alloc(packsize, UMEM_NOFAIL); 5529 void *bigcheck = umem_alloc(bigsize, UMEM_NOFAIL); 5530 5531 VERIFY0(dmu_read(os, packobj, packoff, 5532 packsize, packcheck, DMU_READ_PREFETCH)); 5533 VERIFY0(dmu_read(os, bigobj, bigoff, 5534 bigsize, bigcheck, DMU_READ_PREFETCH)); 5535 5536 ASSERT0(memcmp(packbuf, packcheck, packsize)); 5537 ASSERT0(memcmp(bigbuf, bigcheck, bigsize)); 5538 5539 umem_free(packcheck, packsize); 5540 umem_free(bigcheck, bigsize); 5541 } 5542 if (i == 2) { 5543 txg_wait_open(dmu_objset_pool(os), 0, B_TRUE); 5544 } else if (i == 3) { 5545 txg_wait_synced(dmu_objset_pool(os), 0); 5546 } 5547 } 5548 5549 dmu_buf_rele(bonus_db, FTAG); 5550 umem_free(packbuf, packsize); 5551 umem_free(bigbuf, bigsize); 5552 umem_free(bigbuf_arcbufs, 2 * s * sizeof (arc_buf_t *)); 5553 umem_free(od, size); 5554 } 5555 5556 void 5557 ztest_dmu_write_parallel(ztest_ds_t *zd, uint64_t id) 5558 { 5559 (void) id; 5560 ztest_od_t *od; 5561 5562 od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL); 5563 uint64_t offset = (1ULL << (ztest_random(20) + 43)) + 5564 (ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT); 5565 5566 /* 5567 * Have multiple threads write to large offsets in an object 5568 * to verify that parallel writes to an object -- even to the 5569 * same blocks within the object -- doesn't cause any trouble. 5570 */ 5571 ztest_od_init(od, ID_PARALLEL, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0, 0); 5572 5573 if (ztest_object_init(zd, od, sizeof (ztest_od_t), B_FALSE) != 0) 5574 return; 5575 5576 while (ztest_random(10) != 0) 5577 ztest_io(zd, od->od_object, offset); 5578 5579 umem_free(od, sizeof (ztest_od_t)); 5580 } 5581 5582 void 5583 ztest_dmu_prealloc(ztest_ds_t *zd, uint64_t id) 5584 { 5585 ztest_od_t *od; 5586 uint64_t offset = (1ULL << (ztest_random(4) + SPA_MAXBLOCKSHIFT)) + 5587 (ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT); 5588 uint64_t count = ztest_random(20) + 1; 5589 uint64_t blocksize = ztest_random_blocksize(); 5590 void *data; 5591 5592 od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL); 5593 5594 ztest_od_init(od, id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, 0, 0); 5595 5596 if (ztest_object_init(zd, od, sizeof (ztest_od_t), 5597 !ztest_random(2)) != 0) { 5598 umem_free(od, sizeof (ztest_od_t)); 5599 return; 5600 } 5601 5602 if (ztest_truncate(zd, od->od_object, offset, count * blocksize) != 0) { 5603 umem_free(od, sizeof (ztest_od_t)); 5604 return; 5605 } 5606 5607 ztest_prealloc(zd, od->od_object, offset, count * blocksize); 5608 5609 data = umem_zalloc(blocksize, UMEM_NOFAIL); 5610 5611 while (ztest_random(count) != 0) { 5612 uint64_t randoff = offset + (ztest_random(count) * blocksize); 5613 if (ztest_write(zd, od->od_object, randoff, blocksize, 5614 data) != 0) 5615 break; 5616 while (ztest_random(4) != 0) 5617 ztest_io(zd, od->od_object, randoff); 5618 } 5619 5620 umem_free(data, blocksize); 5621 umem_free(od, sizeof (ztest_od_t)); 5622 } 5623 5624 /* 5625 * Verify that zap_{create,destroy,add,remove,update} work as expected. 
5626 */ 5627 #define ZTEST_ZAP_MIN_INTS 1 5628 #define ZTEST_ZAP_MAX_INTS 4 5629 #define ZTEST_ZAP_MAX_PROPS 1000 5630 5631 void 5632 ztest_zap(ztest_ds_t *zd, uint64_t id) 5633 { 5634 objset_t *os = zd->zd_os; 5635 ztest_od_t *od; 5636 uint64_t object; 5637 uint64_t txg, last_txg; 5638 uint64_t value[ZTEST_ZAP_MAX_INTS]; 5639 uint64_t zl_ints, zl_intsize, prop; 5640 int i, ints; 5641 dmu_tx_t *tx; 5642 char propname[100], txgname[100]; 5643 int error; 5644 const char *const hc[2] = { "s.acl.h", ".s.open.h.hyLZlg" }; 5645 5646 od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL); 5647 ztest_od_init(od, id, FTAG, 0, DMU_OT_ZAP_OTHER, 0, 0, 0); 5648 5649 if (ztest_object_init(zd, od, sizeof (ztest_od_t), 5650 !ztest_random(2)) != 0) 5651 goto out; 5652 5653 object = od->od_object; 5654 5655 /* 5656 * Generate a known hash collision, and verify that 5657 * we can lookup and remove both entries. 5658 */ 5659 tx = dmu_tx_create(os); 5660 dmu_tx_hold_zap(tx, object, B_TRUE, NULL); 5661 txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); 5662 if (txg == 0) 5663 goto out; 5664 for (i = 0; i < 2; i++) { 5665 value[i] = i; 5666 VERIFY0(zap_add(os, object, hc[i], sizeof (uint64_t), 5667 1, &value[i], tx)); 5668 } 5669 for (i = 0; i < 2; i++) { 5670 VERIFY3U(EEXIST, ==, zap_add(os, object, hc[i], 5671 sizeof (uint64_t), 1, &value[i], tx)); 5672 VERIFY0( 5673 zap_length(os, object, hc[i], &zl_intsize, &zl_ints)); 5674 ASSERT3U(zl_intsize, ==, sizeof (uint64_t)); 5675 ASSERT3U(zl_ints, ==, 1); 5676 } 5677 for (i = 0; i < 2; i++) { 5678 VERIFY0(zap_remove(os, object, hc[i], tx)); 5679 } 5680 dmu_tx_commit(tx); 5681 5682 /* 5683 * Generate a bunch of random entries. 5684 */ 5685 ints = MAX(ZTEST_ZAP_MIN_INTS, object % ZTEST_ZAP_MAX_INTS); 5686 5687 prop = ztest_random(ZTEST_ZAP_MAX_PROPS); 5688 (void) sprintf(propname, "prop_%"PRIu64"", prop); 5689 (void) sprintf(txgname, "txg_%"PRIu64"", prop); 5690 memset(value, 0, sizeof (value)); 5691 last_txg = 0; 5692 5693 /* 5694 * If these zap entries already exist, validate their contents. 5695 */ 5696 error = zap_length(os, object, txgname, &zl_intsize, &zl_ints); 5697 if (error == 0) { 5698 ASSERT3U(zl_intsize, ==, sizeof (uint64_t)); 5699 ASSERT3U(zl_ints, ==, 1); 5700 5701 VERIFY0(zap_lookup(os, object, txgname, zl_intsize, 5702 zl_ints, &last_txg)); 5703 5704 VERIFY0(zap_length(os, object, propname, &zl_intsize, 5705 &zl_ints)); 5706 5707 ASSERT3U(zl_intsize, ==, sizeof (uint64_t)); 5708 ASSERT3U(zl_ints, ==, ints); 5709 5710 VERIFY0(zap_lookup(os, object, propname, zl_intsize, 5711 zl_ints, value)); 5712 5713 for (i = 0; i < ints; i++) { 5714 ASSERT3U(value[i], ==, last_txg + object + i); 5715 } 5716 } else { 5717 ASSERT3U(error, ==, ENOENT); 5718 } 5719 5720 /* 5721 * Atomically update two entries in our zap object. 5722 * The first is named txg_%llu, and contains the txg 5723 * in which the property was last updated. The second 5724 * is named prop_%llu, and the nth element of its value 5725 * should be txg + object + n. 
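 *
 * For example, with txg 100, object 7 and ints == 3, the update below
 * leaves txg_<prop> == { 100 } and prop_<prop> == { 107, 108, 109 }.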
5726 */ 5727 tx = dmu_tx_create(os); 5728 dmu_tx_hold_zap(tx, object, B_TRUE, NULL); 5729 txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); 5730 if (txg == 0) 5731 goto out; 5732 5733 if (last_txg > txg) 5734 fatal(B_FALSE, "zap future leak: old %"PRIu64" new %"PRIu64"", 5735 last_txg, txg); 5736 5737 for (i = 0; i < ints; i++) 5738 value[i] = txg + object + i; 5739 5740 VERIFY0(zap_update(os, object, txgname, sizeof (uint64_t), 5741 1, &txg, tx)); 5742 VERIFY0(zap_update(os, object, propname, sizeof (uint64_t), 5743 ints, value, tx)); 5744 5745 dmu_tx_commit(tx); 5746 5747 /* 5748 * Remove a random pair of entries. 5749 */ 5750 prop = ztest_random(ZTEST_ZAP_MAX_PROPS); 5751 (void) sprintf(propname, "prop_%"PRIu64"", prop); 5752 (void) sprintf(txgname, "txg_%"PRIu64"", prop); 5753 5754 error = zap_length(os, object, txgname, &zl_intsize, &zl_ints); 5755 5756 if (error == ENOENT) 5757 goto out; 5758 5759 ASSERT0(error); 5760 5761 tx = dmu_tx_create(os); 5762 dmu_tx_hold_zap(tx, object, B_TRUE, NULL); 5763 txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); 5764 if (txg == 0) 5765 goto out; 5766 VERIFY0(zap_remove(os, object, txgname, tx)); 5767 VERIFY0(zap_remove(os, object, propname, tx)); 5768 dmu_tx_commit(tx); 5769 out: 5770 umem_free(od, sizeof (ztest_od_t)); 5771 } 5772 5773 /* 5774 * Test case to test the upgrading of a microzap to fatzap. 5775 */ 5776 void 5777 ztest_fzap(ztest_ds_t *zd, uint64_t id) 5778 { 5779 objset_t *os = zd->zd_os; 5780 ztest_od_t *od; 5781 uint64_t object, txg, value; 5782 5783 od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL); 5784 ztest_od_init(od, id, FTAG, 0, DMU_OT_ZAP_OTHER, 0, 0, 0); 5785 5786 if (ztest_object_init(zd, od, sizeof (ztest_od_t), 5787 !ztest_random(2)) != 0) 5788 goto out; 5789 object = od->od_object; 5790 5791 /* 5792 * Add entries to this ZAP and make sure it spills over 5793 * and gets upgraded to a fatzap. Also, since we are adding 5794 * 2050 entries we should see ptrtbl growth and leaf-block split. 5795 */ 5796 for (value = 0; value < 2050; value++) { 5797 char name[ZFS_MAX_DATASET_NAME_LEN]; 5798 dmu_tx_t *tx; 5799 int error; 5800 5801 (void) snprintf(name, sizeof (name), "fzap-%"PRIu64"-%"PRIu64"", 5802 id, value); 5803 5804 tx = dmu_tx_create(os); 5805 dmu_tx_hold_zap(tx, object, B_TRUE, name); 5806 txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); 5807 if (txg == 0) 5808 goto out; 5809 error = zap_add(os, object, name, sizeof (uint64_t), 1, 5810 &value, tx); 5811 ASSERT(error == 0 || error == EEXIST); 5812 dmu_tx_commit(tx); 5813 } 5814 out: 5815 umem_free(od, sizeof (ztest_od_t)); 5816 } 5817 5818 void 5819 ztest_zap_parallel(ztest_ds_t *zd, uint64_t id) 5820 { 5821 (void) id; 5822 objset_t *os = zd->zd_os; 5823 ztest_od_t *od; 5824 uint64_t txg, object, count, wsize, wc, zl_wsize, zl_wc; 5825 dmu_tx_t *tx; 5826 int i, namelen, error; 5827 int micro = ztest_random(2); 5828 char name[20], string_value[20]; 5829 void *data; 5830 5831 od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL); 5832 ztest_od_init(od, ID_PARALLEL, FTAG, micro, DMU_OT_ZAP_OTHER, 0, 0, 0); 5833 5834 if (ztest_object_init(zd, od, sizeof (ztest_od_t), B_FALSE) != 0) { 5835 umem_free(od, sizeof (ztest_od_t)); 5836 return; 5837 } 5838 5839 object = od->od_object; 5840 5841 /* 5842 * Generate a random name of the form 'xxx.....' where each 5843 * x is a random printable character and the dots are dots. 5844 * There are 94 such characters, and the name length goes from 5845 * 6 to 20, so there are 94^3 * 15 = 12,458,760 possible names. 
5846 */ 5847 namelen = ztest_random(sizeof (name) - 5) + 5 + 1; 5848 5849 for (i = 0; i < 3; i++) 5850 name[i] = '!' + ztest_random('~' - '!' + 1); 5851 for (; i < namelen - 1; i++) 5852 name[i] = '.'; 5853 name[i] = '\0'; 5854 5855 if ((namelen & 1) || micro) { 5856 wsize = sizeof (txg); 5857 wc = 1; 5858 data = &txg; 5859 } else { 5860 wsize = 1; 5861 wc = namelen; 5862 data = string_value; 5863 } 5864 5865 count = -1ULL; 5866 VERIFY0(zap_count(os, object, &count)); 5867 ASSERT3S(count, !=, -1ULL); 5868 5869 /* 5870 * Select an operation: length, lookup, add, update, remove. 5871 */ 5872 i = ztest_random(5); 5873 5874 if (i >= 2) { 5875 tx = dmu_tx_create(os); 5876 dmu_tx_hold_zap(tx, object, B_TRUE, NULL); 5877 txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); 5878 if (txg == 0) { 5879 umem_free(od, sizeof (ztest_od_t)); 5880 return; 5881 } 5882 memcpy(string_value, name, namelen); 5883 } else { 5884 tx = NULL; 5885 txg = 0; 5886 memset(string_value, 0, namelen); 5887 } 5888 5889 switch (i) { 5890 5891 case 0: 5892 error = zap_length(os, object, name, &zl_wsize, &zl_wc); 5893 if (error == 0) { 5894 ASSERT3U(wsize, ==, zl_wsize); 5895 ASSERT3U(wc, ==, zl_wc); 5896 } else { 5897 ASSERT3U(error, ==, ENOENT); 5898 } 5899 break; 5900 5901 case 1: 5902 error = zap_lookup(os, object, name, wsize, wc, data); 5903 if (error == 0) { 5904 if (data == string_value && 5905 memcmp(name, data, namelen) != 0) 5906 fatal(B_FALSE, "name '%s' != val '%s' len %d", 5907 name, (char *)data, namelen); 5908 } else { 5909 ASSERT3U(error, ==, ENOENT); 5910 } 5911 break; 5912 5913 case 2: 5914 error = zap_add(os, object, name, wsize, wc, data, tx); 5915 ASSERT(error == 0 || error == EEXIST); 5916 break; 5917 5918 case 3: 5919 VERIFY0(zap_update(os, object, name, wsize, wc, data, tx)); 5920 break; 5921 5922 case 4: 5923 error = zap_remove(os, object, name, tx); 5924 ASSERT(error == 0 || error == ENOENT); 5925 break; 5926 } 5927 5928 if (tx != NULL) 5929 dmu_tx_commit(tx); 5930 5931 umem_free(od, sizeof (ztest_od_t)); 5932 } 5933 5934 /* 5935 * Commit callback data. 5936 */ 5937 typedef struct ztest_cb_data { 5938 list_node_t zcd_node; 5939 uint64_t zcd_txg; 5940 int zcd_expected_err; 5941 boolean_t zcd_added; 5942 boolean_t zcd_called; 5943 spa_t *zcd_spa; 5944 } ztest_cb_data_t; 5945 5946 /* This is the actual commit callback function */ 5947 static void 5948 ztest_commit_callback(void *arg, int error) 5949 { 5950 ztest_cb_data_t *data = arg; 5951 uint64_t synced_txg; 5952 5953 VERIFY3P(data, !=, NULL); 5954 VERIFY3S(data->zcd_expected_err, ==, error); 5955 VERIFY(!data->zcd_called); 5956 5957 synced_txg = spa_last_synced_txg(data->zcd_spa); 5958 if (data->zcd_txg > synced_txg) 5959 fatal(B_FALSE, 5960 "commit callback of txg %"PRIu64" called prematurely, " 5961 "last synced txg = %"PRIu64"\n", 5962 data->zcd_txg, synced_txg); 5963 5964 data->zcd_called = B_TRUE; 5965 5966 if (error == ECANCELED) { 5967 ASSERT0(data->zcd_txg); 5968 ASSERT(!data->zcd_added); 5969 5970 /* 5971 * The private callback data should be destroyed here, but 5972 * since we are going to check the zcd_called field after 5973 * dmu_tx_abort(), we will destroy it there. 
5974 */ 5975 return; 5976 } 5977 5978 ASSERT(data->zcd_added); 5979 ASSERT3U(data->zcd_txg, !=, 0); 5980 5981 (void) mutex_enter(&zcl.zcl_callbacks_lock); 5982 5983 /* See if this cb was called more quickly */ 5984 if ((synced_txg - data->zcd_txg) < zc_min_txg_delay) 5985 zc_min_txg_delay = synced_txg - data->zcd_txg; 5986 5987 /* Remove our callback from the list */ 5988 list_remove(&zcl.zcl_callbacks, data); 5989 5990 (void) mutex_exit(&zcl.zcl_callbacks_lock); 5991 5992 umem_free(data, sizeof (ztest_cb_data_t)); 5993 } 5994 5995 /* Allocate and initialize callback data structure */ 5996 static ztest_cb_data_t * 5997 ztest_create_cb_data(objset_t *os, uint64_t txg) 5998 { 5999 ztest_cb_data_t *cb_data; 6000 6001 cb_data = umem_zalloc(sizeof (ztest_cb_data_t), UMEM_NOFAIL); 6002 6003 cb_data->zcd_txg = txg; 6004 cb_data->zcd_spa = dmu_objset_spa(os); 6005 list_link_init(&cb_data->zcd_node); 6006 6007 return (cb_data); 6008 } 6009 6010 /* 6011 * Commit callback test. 6012 */ 6013 void 6014 ztest_dmu_commit_callbacks(ztest_ds_t *zd, uint64_t id) 6015 { 6016 objset_t *os = zd->zd_os; 6017 ztest_od_t *od; 6018 dmu_tx_t *tx; 6019 ztest_cb_data_t *cb_data[3], *tmp_cb; 6020 uint64_t old_txg, txg; 6021 int i, error = 0; 6022 6023 od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL); 6024 ztest_od_init(od, id, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0, 0); 6025 6026 if (ztest_object_init(zd, od, sizeof (ztest_od_t), B_FALSE) != 0) { 6027 umem_free(od, sizeof (ztest_od_t)); 6028 return; 6029 } 6030 6031 tx = dmu_tx_create(os); 6032 6033 cb_data[0] = ztest_create_cb_data(os, 0); 6034 dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[0]); 6035 6036 dmu_tx_hold_write(tx, od->od_object, 0, sizeof (uint64_t)); 6037 6038 /* Every once in a while, abort the transaction on purpose */ 6039 if (ztest_random(100) == 0) 6040 error = -1; 6041 6042 if (!error) 6043 error = dmu_tx_assign(tx, TXG_NOWAIT); 6044 6045 txg = error ? 0 : dmu_tx_get_txg(tx); 6046 6047 cb_data[0]->zcd_txg = txg; 6048 cb_data[1] = ztest_create_cb_data(os, txg); 6049 dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[1]); 6050 6051 if (error) { 6052 /* 6053 * It's not a strict requirement to call the registered 6054 * callbacks from inside dmu_tx_abort(), but that is what 6055 * is expected to happen in the current implementation, 6056 * so we check for it. 6057 */ 6058 for (i = 0; i < 2; i++) { 6059 cb_data[i]->zcd_expected_err = ECANCELED; 6060 VERIFY(!cb_data[i]->zcd_called); 6061 } 6062 6063 dmu_tx_abort(tx); 6064 6065 for (i = 0; i < 2; i++) { 6066 VERIFY(cb_data[i]->zcd_called); 6067 umem_free(cb_data[i], sizeof (ztest_cb_data_t)); 6068 } 6069 6070 umem_free(od, sizeof (ztest_od_t)); 6071 return; 6072 } 6073 6074 cb_data[2] = ztest_create_cb_data(os, txg); 6075 dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[2]); 6076 6077 /* 6078 * Read existing data to make sure there isn't a future leak.
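 * Every write below stores the currently open txg in the object, so the
 * value read back must never exceed the txg assigned to this transaction;
 * if it does, a block from a future (not-yet-synced) txg has leaked in.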
6079 */ 6080 VERIFY0(dmu_read(os, od->od_object, 0, sizeof (uint64_t), 6081 &old_txg, DMU_READ_PREFETCH)); 6082 6083 if (old_txg > txg) 6084 fatal(B_FALSE, 6085 "future leak: got %"PRIu64", open txg is %"PRIu64"", 6086 old_txg, txg); 6087 6088 dmu_write(os, od->od_object, 0, sizeof (uint64_t), &txg, tx); 6089 6090 (void) mutex_enter(&zcl.zcl_callbacks_lock); 6091 6092 /* 6093 * Since commit callbacks don't have any ordering requirement and since 6094 * it is theoretically possible for a commit callback to be called 6095 * after an arbitrary amount of time has elapsed since its txg has been 6096 * synced, it is difficult to reliably determine whether a commit 6097 * callback hasn't been called due to high load or due to a flawed 6098 * implementation. 6099 * 6100 * In practice, we will assume that if after a certain number of txgs a 6101 * commit callback hasn't been called, then most likely there's an 6102 * implementation bug. 6103 */ 6104 tmp_cb = list_head(&zcl.zcl_callbacks); 6105 if (tmp_cb != NULL && 6106 tmp_cb->zcd_txg + ZTEST_COMMIT_CB_THRESH < txg) { 6107 fatal(B_FALSE, 6108 "Commit callback threshold exceeded, " 6109 "oldest txg: %"PRIu64", open txg: %"PRIu64"\n", 6110 tmp_cb->zcd_txg, txg); 6111 } 6112 6113 /* 6114 * Let's find the place to insert our callbacks. 6115 * 6116 * Even though the list is ordered by txg, it is possible for the 6117 * insertion point to not be the end because our txg may already be 6118 * quiescing at this point and other callbacks in the open txg 6119 * (from other objsets) may have sneaked in. 6120 */ 6121 tmp_cb = list_tail(&zcl.zcl_callbacks); 6122 while (tmp_cb != NULL && tmp_cb->zcd_txg > txg) 6123 tmp_cb = list_prev(&zcl.zcl_callbacks, tmp_cb); 6124 6125 /* Add the 3 callbacks to the list */ 6126 for (i = 0; i < 3; i++) { 6127 if (tmp_cb == NULL) 6128 list_insert_head(&zcl.zcl_callbacks, cb_data[i]); 6129 else 6130 list_insert_after(&zcl.zcl_callbacks, tmp_cb, 6131 cb_data[i]); 6132 6133 cb_data[i]->zcd_added = B_TRUE; 6134 VERIFY(!cb_data[i]->zcd_called); 6135 6136 tmp_cb = cb_data[i]; 6137 } 6138 6139 zc_cb_counter += 3; 6140 6141 (void) mutex_exit(&zcl.zcl_callbacks_lock); 6142 6143 dmu_tx_commit(tx); 6144 6145 umem_free(od, sizeof (ztest_od_t)); 6146 } 6147 6148 /* 6149 * Visit each object in the dataset. Verify that its properties 6150 * are consistent with what was stored in the block tag when it was created, 6151 * and that its unused bonus buffer space has not been overwritten.
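 * Objects whose bonus buffer is too small to hold a block tag, or whose
 * tag lacks BT_MAGIC, are simply skipped.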
6152 */ 6153 void 6154 ztest_verify_dnode_bt(ztest_ds_t *zd, uint64_t id) 6155 { 6156 (void) id; 6157 objset_t *os = zd->zd_os; 6158 uint64_t obj; 6159 int err = 0; 6160 6161 for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE, 0)) { 6162 ztest_block_tag_t *bt = NULL; 6163 dmu_object_info_t doi; 6164 dmu_buf_t *db; 6165 6166 ztest_object_lock(zd, obj, ZTRL_READER); 6167 if (dmu_bonus_hold(os, obj, FTAG, &db) != 0) { 6168 ztest_object_unlock(zd, obj); 6169 continue; 6170 } 6171 6172 dmu_object_info_from_db(db, &doi); 6173 if (doi.doi_bonus_size >= sizeof (*bt)) 6174 bt = ztest_bt_bonus(db); 6175 6176 if (bt && bt->bt_magic == BT_MAGIC) { 6177 ztest_bt_verify(bt, os, obj, doi.doi_dnodesize, 6178 bt->bt_offset, bt->bt_gen, bt->bt_txg, 6179 bt->bt_crtxg); 6180 ztest_verify_unused_bonus(db, bt, obj, os, bt->bt_gen); 6181 } 6182 6183 dmu_buf_rele(db, FTAG); 6184 ztest_object_unlock(zd, obj); 6185 } 6186 } 6187 6188 void 6189 ztest_dsl_prop_get_set(ztest_ds_t *zd, uint64_t id) 6190 { 6191 (void) id; 6192 zfs_prop_t proplist[] = { 6193 ZFS_PROP_CHECKSUM, 6194 ZFS_PROP_COMPRESSION, 6195 ZFS_PROP_COPIES, 6196 ZFS_PROP_DEDUP 6197 }; 6198 6199 (void) pthread_rwlock_rdlock(&ztest_name_lock); 6200 6201 for (int p = 0; p < sizeof (proplist) / sizeof (proplist[0]); p++) { 6202 int error = ztest_dsl_prop_set_uint64(zd->zd_name, proplist[p], 6203 ztest_random_dsl_prop(proplist[p]), (int)ztest_random(2)); 6204 ASSERT(error == 0 || error == ENOSPC); 6205 } 6206 6207 int error = ztest_dsl_prop_set_uint64(zd->zd_name, ZFS_PROP_RECORDSIZE, 6208 ztest_random_blocksize(), (int)ztest_random(2)); 6209 ASSERT(error == 0 || error == ENOSPC); 6210 6211 (void) pthread_rwlock_unlock(&ztest_name_lock); 6212 } 6213 6214 void 6215 ztest_spa_prop_get_set(ztest_ds_t *zd, uint64_t id) 6216 { 6217 (void) zd, (void) id; 6218 nvlist_t *props = NULL; 6219 6220 (void) pthread_rwlock_rdlock(&ztest_name_lock); 6221 6222 (void) ztest_spa_prop_set_uint64(ZPOOL_PROP_AUTOTRIM, ztest_random(2)); 6223 6224 VERIFY0(spa_prop_get(ztest_spa, &props)); 6225 6226 if (ztest_opts.zo_verbose >= 6) 6227 dump_nvlist(props, 4); 6228 6229 fnvlist_free(props); 6230 6231 (void) pthread_rwlock_unlock(&ztest_name_lock); 6232 } 6233 6234 static int 6235 user_release_one(const char *snapname, const char *holdname) 6236 { 6237 nvlist_t *snaps, *holds; 6238 int error; 6239 6240 snaps = fnvlist_alloc(); 6241 holds = fnvlist_alloc(); 6242 fnvlist_add_boolean(holds, holdname); 6243 fnvlist_add_nvlist(snaps, snapname, holds); 6244 fnvlist_free(holds); 6245 error = dsl_dataset_user_release(snaps, NULL); 6246 fnvlist_free(snaps); 6247 return (error); 6248 } 6249 6250 /* 6251 * Test snapshot hold/release and deferred destroy. 6252 */ 6253 void 6254 ztest_dmu_snapshot_hold(ztest_ds_t *zd, uint64_t id) 6255 { 6256 int error; 6257 objset_t *os = zd->zd_os; 6258 objset_t *origin; 6259 char snapname[100]; 6260 char fullname[100]; 6261 char clonename[100]; 6262 char tag[100]; 6263 char osname[ZFS_MAX_DATASET_NAME_LEN]; 6264 nvlist_t *holds; 6265 6266 (void) pthread_rwlock_rdlock(&ztest_name_lock); 6267 6268 dmu_objset_name(os, osname); 6269 6270 (void) snprintf(snapname, sizeof (snapname), "sh1_%"PRIu64"", id); 6271 (void) snprintf(fullname, sizeof (fullname), "%s@%s", osname, snapname); 6272 (void) snprintf(clonename, sizeof (clonename), "%s/ch1_%"PRIu64"", 6273 osname, id); 6274 (void) snprintf(tag, sizeof (tag), "tag_%"PRIu64"", id); 6275 6276 /* 6277 * Clean up from any previous run. 
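 * Any of these cleanups may legitimately fail with ENOENT (or ESRCH for
 * the hold release) when the previous run never created the clone, hold,
 * or snapshot; only other errors are treated as fatal below.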
6278 */ 6279 error = dsl_destroy_head(clonename); 6280 if (error != ENOENT) 6281 ASSERT0(error); 6282 error = user_release_one(fullname, tag); 6283 if (error != ESRCH && error != ENOENT) 6284 ASSERT0(error); 6285 error = dsl_destroy_snapshot(fullname, B_FALSE); 6286 if (error != ENOENT) 6287 ASSERT0(error); 6288 6289 /* 6290 * Create snapshot, clone it, mark snap for deferred destroy, 6291 * destroy clone, verify snap was also destroyed. 6292 */ 6293 error = dmu_objset_snapshot_one(osname, snapname); 6294 if (error) { 6295 if (error == ENOSPC) { 6296 ztest_record_enospc("dmu_objset_snapshot"); 6297 goto out; 6298 } 6299 fatal(B_FALSE, "dmu_objset_snapshot(%s) = %d", fullname, error); 6300 } 6301 6302 error = dmu_objset_clone(clonename, fullname); 6303 if (error) { 6304 if (error == ENOSPC) { 6305 ztest_record_enospc("dmu_objset_clone"); 6306 goto out; 6307 } 6308 fatal(B_FALSE, "dmu_objset_clone(%s) = %d", clonename, error); 6309 } 6310 6311 error = dsl_destroy_snapshot(fullname, B_TRUE); 6312 if (error) { 6313 fatal(B_FALSE, "dsl_destroy_snapshot(%s, B_TRUE) = %d", 6314 fullname, error); 6315 } 6316 6317 error = dsl_destroy_head(clonename); 6318 if (error) 6319 fatal(B_FALSE, "dsl_destroy_head(%s) = %d", clonename, error); 6320 6321 error = dmu_objset_hold(fullname, FTAG, &origin); 6322 if (error != ENOENT) 6323 fatal(B_FALSE, "dmu_objset_hold(%s) = %d", fullname, error); 6324 6325 /* 6326 * Create snapshot, add temporary hold, verify that we can't 6327 * destroy a held snapshot, mark for deferred destroy, 6328 * release hold, verify snapshot was destroyed. 6329 */ 6330 error = dmu_objset_snapshot_one(osname, snapname); 6331 if (error) { 6332 if (error == ENOSPC) { 6333 ztest_record_enospc("dmu_objset_snapshot"); 6334 goto out; 6335 } 6336 fatal(B_FALSE, "dmu_objset_snapshot(%s) = %d", fullname, error); 6337 } 6338 6339 holds = fnvlist_alloc(); 6340 fnvlist_add_string(holds, fullname, tag); 6341 error = dsl_dataset_user_hold(holds, 0, NULL); 6342 fnvlist_free(holds); 6343 6344 if (error == ENOSPC) { 6345 ztest_record_enospc("dsl_dataset_user_hold"); 6346 goto out; 6347 } else if (error) { 6348 fatal(B_FALSE, "dsl_dataset_user_hold(%s, %s) = %u", 6349 fullname, tag, error); 6350 } 6351 6352 error = dsl_destroy_snapshot(fullname, B_FALSE); 6353 if (error != EBUSY) { 6354 fatal(B_FALSE, "dsl_destroy_snapshot(%s, B_FALSE) = %d", 6355 fullname, error); 6356 } 6357 6358 error = dsl_destroy_snapshot(fullname, B_TRUE); 6359 if (error) { 6360 fatal(B_FALSE, "dsl_destroy_snapshot(%s, B_TRUE) = %d", 6361 fullname, error); 6362 } 6363 6364 error = user_release_one(fullname, tag); 6365 if (error) 6366 fatal(B_FALSE, "user_release_one(%s, %s) = %d", 6367 fullname, tag, error); 6368 6369 VERIFY3U(dmu_objset_hold(fullname, FTAG, &origin), ==, ENOENT); 6370 6371 out: 6372 (void) pthread_rwlock_unlock(&ztest_name_lock); 6373 } 6374 6375 /* 6376 * Inject random faults into the on-disk data. 
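 * Two flavors of fault are injected here: device-level faults (closing a
 * leaf's backing file, marking it unreadable or unwritable, and random
 * online/offline activity) and data-level faults (overwriting words of a
 * leaf's backing file with garbage), always kept within what the pool's
 * redundancy can repair.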
6377 */ 6378 void 6379 ztest_fault_inject(ztest_ds_t *zd, uint64_t id) 6380 { 6381 (void) zd, (void) id; 6382 ztest_shared_t *zs = ztest_shared; 6383 spa_t *spa = ztest_spa; 6384 int fd; 6385 uint64_t offset; 6386 uint64_t leaves; 6387 uint64_t bad = 0x1990c0ffeedecadeull; 6388 uint64_t top, leaf; 6389 uint64_t raidz_children; 6390 char *path0; 6391 char *pathrand; 6392 size_t fsize; 6393 int bshift = SPA_MAXBLOCKSHIFT + 2; 6394 int iters = 1000; 6395 int maxfaults; 6396 int mirror_save; 6397 vdev_t *vd0 = NULL; 6398 uint64_t guid0 = 0; 6399 boolean_t islog = B_FALSE; 6400 boolean_t injected = B_FALSE; 6401 6402 path0 = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); 6403 pathrand = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); 6404 6405 mutex_enter(&ztest_vdev_lock); 6406 6407 /* 6408 * While device removal is in progress, fault injection must be 6409 * disabled until it completes and the pool is scrubbed. The fault 6410 * injection strategy for damaging blocks does not take into account 6411 * evacuated blocks which may have already been damaged. 6412 */ 6413 if (ztest_device_removal_active) 6414 goto out; 6415 6416 /* 6417 * The fault injection strategy for damaging blocks cannot be used 6418 * if raidz expansion is in progress. The leaves value 6419 * (attached raidz children) is variable, and the strategy for damaging 6420 * blocks would corrupt the same data blocks on different child vdevs 6421 * because of the reflow process. 6422 */ 6423 if (spa->spa_raidz_expand != NULL) 6424 goto out; 6425 6426 maxfaults = MAXFAULTS(zs); 6427 raidz_children = ztest_get_raidz_children(spa); 6428 leaves = MAX(zs->zs_mirrors, 1) * raidz_children; 6429 mirror_save = zs->zs_mirrors; 6430 6431 ASSERT3U(leaves, >=, 1); 6432 6433 /* 6434 * While ztest is running the number of leaves will not change. This 6435 * is critical for the fault injection logic as it determines where 6436 * errors can be safely injected such that they are always repairable. 6437 * 6438 * When restarting ztest a different number of leaves may be requested 6439 * which will shift the regions to be damaged. This is fine as long 6440 * as the pool has been scrubbed prior to using the new mapping. 6441 * Failure to do so can result in non-repairable damage being injected. 6442 */ 6443 if (ztest_pool_scrubbed == B_FALSE) 6444 goto out; 6445 6446 /* 6447 * Grab the name lock as reader. There are some operations 6448 * which don't like to have their vdevs changed while 6449 * they are in progress (e.g. spa_change_guid). Those 6450 * operations will have grabbed the name lock as writer. 6451 */ 6452 (void) pthread_rwlock_rdlock(&ztest_name_lock); 6453 6454 /* 6455 * We need SCL_STATE here because we're going to look at vd0->vdev_tsd. 6456 */ 6457 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 6458 6459 if (ztest_random(2) == 0) { 6460 /* 6461 * Inject errors on a normal data device or slog device. 6462 */ 6463 top = ztest_random_vdev_top(spa, B_TRUE); 6464 leaf = ztest_random(leaves) + zs->zs_splits; 6465 6466 /* 6467 * Generate paths to the first leaf in this top-level vdev, 6468 * and to the random leaf we selected. We'll induce transient 6469 * write failures and random online/offline activity on leaf 0, 6470 * and we'll write random garbage to the randomly chosen leaf.
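 * (The backing-file index is top * leaves + <leaf number>; for example,
 * with leaves = 4 and top-level vdev 2, leaf 0 of that vdev maps to
 * backing file #(8 + zs_splits) and the random leaf to #(8 + leaf).)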
6471 */ 6472 (void) snprintf(path0, MAXPATHLEN, ztest_dev_template, 6473 ztest_opts.zo_dir, ztest_opts.zo_pool, 6474 top * leaves + zs->zs_splits); 6475 (void) snprintf(pathrand, MAXPATHLEN, ztest_dev_template, 6476 ztest_opts.zo_dir, ztest_opts.zo_pool, 6477 top * leaves + leaf); 6478 6479 vd0 = vdev_lookup_by_path(spa->spa_root_vdev, path0); 6480 if (vd0 != NULL && vd0->vdev_top->vdev_islog) 6481 islog = B_TRUE; 6482 6483 /* 6484 * If the top-level vdev needs to be resilvered 6485 * then we only allow faults on the device that is 6486 * resilvering. 6487 */ 6488 if (vd0 != NULL && maxfaults != 1 && 6489 (!vdev_resilver_needed(vd0->vdev_top, NULL, NULL) || 6490 vd0->vdev_resilver_txg != 0)) { 6491 /* 6492 * Make vd0 explicitly claim to be unreadable, 6493 * or unwritable, or reach behind its back 6494 * and close the underlying fd. We can do this if 6495 * maxfaults == 0 because we'll fail and reexecute, 6496 * and we can do it if maxfaults >= 2 because we'll 6497 * have enough redundancy. If maxfaults == 1, the 6498 * combination of this with injection of random data 6499 * corruption below exceeds the pool's fault tolerance. 6500 */ 6501 vdev_file_t *vf = vd0->vdev_tsd; 6502 6503 zfs_dbgmsg("injecting fault to vdev %llu; maxfaults=%d", 6504 (long long)vd0->vdev_id, (int)maxfaults); 6505 6506 if (vf != NULL && ztest_random(3) == 0) { 6507 (void) close(vf->vf_file->f_fd); 6508 vf->vf_file->f_fd = -1; 6509 } else if (ztest_random(2) == 0) { 6510 vd0->vdev_cant_read = B_TRUE; 6511 } else { 6512 vd0->vdev_cant_write = B_TRUE; 6513 } 6514 guid0 = vd0->vdev_guid; 6515 } 6516 } else { 6517 /* 6518 * Inject errors on an l2cache device. 6519 */ 6520 spa_aux_vdev_t *sav = &spa->spa_l2cache; 6521 6522 if (sav->sav_count == 0) { 6523 spa_config_exit(spa, SCL_STATE, FTAG); 6524 (void) pthread_rwlock_unlock(&ztest_name_lock); 6525 goto out; 6526 } 6527 vd0 = sav->sav_vdevs[ztest_random(sav->sav_count)]; 6528 guid0 = vd0->vdev_guid; 6529 (void) strlcpy(path0, vd0->vdev_path, MAXPATHLEN); 6530 (void) strlcpy(pathrand, vd0->vdev_path, MAXPATHLEN); 6531 6532 leaf = 0; 6533 leaves = 1; 6534 maxfaults = INT_MAX; /* no limit on cache devices */ 6535 } 6536 6537 spa_config_exit(spa, SCL_STATE, FTAG); 6538 (void) pthread_rwlock_unlock(&ztest_name_lock); 6539 6540 /* 6541 * If we can tolerate two or more faults, or we're dealing 6542 * with a slog, randomly online/offline vd0. 6543 */ 6544 if ((maxfaults >= 2 || islog) && guid0 != 0) { 6545 if (ztest_random(10) < 6) { 6546 int flags = (ztest_random(2) == 0 ? 6547 ZFS_OFFLINE_TEMPORARY : 0); 6548 6549 /* 6550 * We have to grab the zs_name_lock as writer to 6551 * prevent a race between offlining a slog and 6552 * destroying a dataset. Offlining the slog will 6553 * grab a reference on the dataset which may cause 6554 * dsl_destroy_head() to fail with EBUSY thus 6555 * leaving the dataset in an inconsistent state. 6556 */ 6557 if (islog) 6558 (void) pthread_rwlock_wrlock(&ztest_name_lock); 6559 6560 VERIFY3U(vdev_offline(spa, guid0, flags), !=, EBUSY); 6561 6562 if (islog) 6563 (void) pthread_rwlock_unlock(&ztest_name_lock); 6564 } else { 6565 /* 6566 * Ideally we would like to be able to randomly 6567 * call vdev_[on|off]line without holding locks 6568 * to force unpredictable failures but the side 6569 * effects of vdev_[on|off]line prevent us from 6570 * doing so. 
6571 */ 6572 (void) vdev_online(spa, guid0, 0, NULL); 6573 } 6574 } 6575 6576 if (maxfaults == 0) 6577 goto out; 6578 6579 /* 6580 * We have at least single-fault tolerance, so inject data corruption. 6581 */ 6582 fd = open(pathrand, O_RDWR); 6583 6584 if (fd == -1) /* we hit a gap in the device namespace */ 6585 goto out; 6586 6587 fsize = lseek(fd, 0, SEEK_END); 6588 6589 while (--iters != 0) { 6590 /* 6591 * The offset must be chosen carefully to ensure that 6592 * we do not inject a given logical block with errors 6593 * on two different leaf devices, because ZFS can not 6594 * tolerate that (if maxfaults==1). 6595 * 6596 * To achieve this we divide each leaf device into 6597 * chunks of size (# leaves * SPA_MAXBLOCKSIZE * 4). 6598 * Each chunk is further divided into error-injection 6599 * ranges (can accept errors) and clear ranges (we do 6600 * not inject errors in those). Each error-injection 6601 * range can accept errors only for a single leaf vdev. 6602 * Error-injection ranges are separated by clear ranges. 6603 * 6604 * For example, with 3 leaves, each chunk looks like: 6605 * 0 to 32M: injection range for leaf 0 6606 * 32M to 64M: clear range - no injection allowed 6607 * 64M to 96M: injection range for leaf 1 6608 * 96M to 128M: clear range - no injection allowed 6609 * 128M to 160M: injection range for leaf 2 6610 * 160M to 192M: clear range - no injection allowed 6611 * 6612 * Each clear range must be large enough such that a 6613 * single block cannot straddle it. This way a block 6614 * can't be a target in two different injection ranges 6615 * (on different leaf vdevs). 6616 */ 6617 offset = ztest_random(fsize / (leaves << bshift)) * 6618 (leaves << bshift) + (leaf << bshift) + 6619 (ztest_random(1ULL << (bshift - 1)) & -8ULL); 6620 6621 /* 6622 * Only allow damage to the labels at one end of the vdev. 6623 * 6624 * If all labels are damaged, the device will be totally 6625 * inaccessible, which will result in loss of data, 6626 * because we also damage (parts of) the other side of 6627 * the mirror/raidz. 6628 * 6629 * Additionally, we will always have both an even and an 6630 * odd label, so that we can handle crashes in the 6631 * middle of vdev_config_sync(). 6632 */ 6633 if ((leaf & 1) == 0 && offset < VDEV_LABEL_START_SIZE) 6634 continue; 6635 6636 /* 6637 * The two end labels are stored at the "end" of the disk, but 6638 * the end of the disk (vdev_psize) is aligned to 6639 * sizeof (vdev_label_t). 
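 * (P2ALIGN() rounds fsize down to a multiple of sizeof (vdev_label_t),
 * 256 KiB, so the end-label exclusion below is computed from the same
 * boundary the label code uses.)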
6640 */ 6641 uint64_t psize = P2ALIGN(fsize, sizeof (vdev_label_t)); 6642 if ((leaf & 1) == 1 && 6643 offset + sizeof (bad) > psize - VDEV_LABEL_END_SIZE) 6644 continue; 6645 6646 if (mirror_save != zs->zs_mirrors) { 6647 (void) close(fd); 6648 goto out; 6649 } 6650 6651 if (pwrite(fd, &bad, sizeof (bad), offset) != sizeof (bad)) 6652 fatal(B_TRUE, 6653 "can't inject bad word at 0x%"PRIx64" in %s", 6654 offset, pathrand); 6655 6656 if (ztest_opts.zo_verbose >= 7) 6657 (void) printf("injected bad word into %s," 6658 " offset 0x%"PRIx64"\n", pathrand, offset); 6659 6660 injected = B_TRUE; 6661 } 6662 6663 (void) close(fd); 6664 out: 6665 mutex_exit(&ztest_vdev_lock); 6666 6667 if (injected && ztest_opts.zo_raid_do_expand) { 6668 int error = spa_scan(spa, POOL_SCAN_SCRUB); 6669 if (error == 0) { 6670 while (dsl_scan_scrubbing(spa_get_dsl(spa))) 6671 txg_wait_synced(spa_get_dsl(spa), 0); 6672 } 6673 } 6674 6675 umem_free(path0, MAXPATHLEN); 6676 umem_free(pathrand, MAXPATHLEN); 6677 } 6678 6679 /* 6680 * By design ztest will never inject uncorrectable damage into the pool. 6681 * Issue a scrub, wait for it to complete, and verify there is never any 6682 * persistent damage. 6683 * 6684 * Only after a full scrub has been completed is it safe to start injecting 6685 * data corruption. See the comment in ztest_fault_inject(). 6686 */ 6687 static int 6688 ztest_scrub_impl(spa_t *spa) 6689 { 6690 int error = spa_scan(spa, POOL_SCAN_SCRUB); 6691 if (error) 6692 return (error); 6693 6694 while (dsl_scan_scrubbing(spa_get_dsl(spa))) 6695 txg_wait_synced(spa_get_dsl(spa), 0); 6696 6697 if (spa_approx_errlog_size(spa) > 0) 6698 return (ECKSUM); 6699 6700 ztest_pool_scrubbed = B_TRUE; 6701 6702 return (0); 6703 } 6704 6705 /* 6706 * Scrub the pool. 6707 */ 6708 void 6709 ztest_scrub(ztest_ds_t *zd, uint64_t id) 6710 { 6711 (void) zd, (void) id; 6712 spa_t *spa = ztest_spa; 6713 int error; 6714 6715 /* 6716 * A scrub is already in progress due to device removal. 6717 */ 6718 if (ztest_device_removal_active) 6719 return; 6720 6721 /* 6722 * Start a scrub, wait a moment, then force a restart. 6723 */ 6724 (void) spa_scan(spa, POOL_SCAN_SCRUB); 6725 (void) poll(NULL, 0, 100); 6726 6727 error = ztest_scrub_impl(spa); 6728 if (error == EBUSY) 6729 error = 0; 6730 ASSERT0(error); 6731 } 6732 6733 /* 6734 * Change the guid for the pool.
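 * spa_change_guid() assigns a new on-disk pool guid while the in-core
 * load guid must remain unchanged; the VERIFYs below check both, and the
 * new guid is saved in the shared state for later zdb verification.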
6735 */ 6736 void 6737 ztest_reguid(ztest_ds_t *zd, uint64_t id) 6738 { 6739 (void) zd, (void) id; 6740 spa_t *spa = ztest_spa; 6741 uint64_t orig, load; 6742 int error; 6743 ztest_shared_t *zs = ztest_shared; 6744 6745 if (ztest_opts.zo_mmp_test) 6746 return; 6747 6748 orig = spa_guid(spa); 6749 load = spa_load_guid(spa); 6750 6751 (void) pthread_rwlock_wrlock(&ztest_name_lock); 6752 error = spa_change_guid(spa); 6753 zs->zs_guid = spa_guid(spa); 6754 (void) pthread_rwlock_unlock(&ztest_name_lock); 6755 6756 if (error != 0) 6757 return; 6758 6759 if (ztest_opts.zo_verbose >= 4) { 6760 (void) printf("Changed guid old %"PRIu64" -> %"PRIu64"\n", 6761 orig, spa_guid(spa)); 6762 } 6763 6764 VERIFY3U(orig, !=, spa_guid(spa)); 6765 VERIFY3U(load, ==, spa_load_guid(spa)); 6766 } 6767 6768 void 6769 ztest_blake3(ztest_ds_t *zd, uint64_t id) 6770 { 6771 (void) zd, (void) id; 6772 hrtime_t end = gethrtime() + NANOSEC; 6773 zio_cksum_salt_t salt; 6774 void *salt_ptr = &salt.zcs_bytes; 6775 struct abd *abd_data, *abd_meta; 6776 void *buf, *templ; 6777 int i, *ptr; 6778 uint32_t size; 6779 BLAKE3_CTX ctx; 6780 const zfs_impl_t *blake3 = zfs_impl_get_ops("blake3"); 6781 6782 size = ztest_random_blocksize(); 6783 buf = umem_alloc(size, UMEM_NOFAIL); 6784 abd_data = abd_alloc(size, B_FALSE); 6785 abd_meta = abd_alloc(size, B_TRUE); 6786 6787 for (i = 0, ptr = buf; i < size / sizeof (*ptr); i++, ptr++) 6788 *ptr = ztest_random(UINT_MAX); 6789 memset(salt_ptr, 'A', 32); 6790 6791 abd_copy_from_buf_off(abd_data, buf, 0, size); 6792 abd_copy_from_buf_off(abd_meta, buf, 0, size); 6793 6794 while (gethrtime() <= end) { 6795 int run_count = 100; 6796 zio_cksum_t zc_ref1, zc_ref2; 6797 zio_cksum_t zc_res1, zc_res2; 6798 6799 void *ref1 = &zc_ref1; 6800 void *ref2 = &zc_ref2; 6801 void *res1 = &zc_res1; 6802 void *res2 = &zc_res2; 6803 6804 /* BLAKE3_KEY_LEN = 32 */ 6805 VERIFY0(blake3->setname("generic")); 6806 templ = abd_checksum_blake3_tmpl_init(&salt); 6807 Blake3_InitKeyed(&ctx, salt_ptr); 6808 Blake3_Update(&ctx, buf, size); 6809 Blake3_Final(&ctx, ref1); 6810 zc_ref2 = zc_ref1; 6811 ZIO_CHECKSUM_BSWAP(&zc_ref2); 6812 abd_checksum_blake3_tmpl_free(templ); 6813 6814 VERIFY0(blake3->setname("cycle")); 6815 while (run_count-- > 0) { 6816 6817 /* Test current implementation */ 6818 Blake3_InitKeyed(&ctx, salt_ptr); 6819 Blake3_Update(&ctx, buf, size); 6820 Blake3_Final(&ctx, res1); 6821 zc_res2 = zc_res1; 6822 ZIO_CHECKSUM_BSWAP(&zc_res2); 6823 6824 VERIFY0(memcmp(ref1, res1, 32)); 6825 VERIFY0(memcmp(ref2, res2, 32)); 6826 6827 /* Test ABD - data */ 6828 templ = abd_checksum_blake3_tmpl_init(&salt); 6829 abd_checksum_blake3_native(abd_data, size, 6830 templ, &zc_res1); 6831 abd_checksum_blake3_byteswap(abd_data, size, 6832 templ, &zc_res2); 6833 6834 VERIFY0(memcmp(ref1, res1, 32)); 6835 VERIFY0(memcmp(ref2, res2, 32)); 6836 6837 /* Test ABD - metadata */ 6838 abd_checksum_blake3_native(abd_meta, size, 6839 templ, &zc_res1); 6840 abd_checksum_blake3_byteswap(abd_meta, size, 6841 templ, &zc_res2); 6842 abd_checksum_blake3_tmpl_free(templ); 6843 6844 VERIFY0(memcmp(ref1, res1, 32)); 6845 VERIFY0(memcmp(ref2, res2, 32)); 6846 6847 } 6848 } 6849 6850 abd_free(abd_data); 6851 abd_free(abd_meta); 6852 umem_free(buf, size); 6853 } 6854 6855 void 6856 ztest_fletcher(ztest_ds_t *zd, uint64_t id) 6857 { 6858 (void) zd, (void) id; 6859 hrtime_t end = gethrtime() + NANOSEC; 6860 6861 while (gethrtime() <= end) { 6862 int run_count = 100; 6863 void *buf; 6864 struct abd *abd_data, *abd_meta; 6865 uint32_t size; 6866 int 
*ptr; 6867 int i; 6868 zio_cksum_t zc_ref; 6869 zio_cksum_t zc_ref_byteswap; 6870 6871 size = ztest_random_blocksize(); 6872 6873 buf = umem_alloc(size, UMEM_NOFAIL); 6874 abd_data = abd_alloc(size, B_FALSE); 6875 abd_meta = abd_alloc(size, B_TRUE); 6876 6877 for (i = 0, ptr = buf; i < size / sizeof (*ptr); i++, ptr++) 6878 *ptr = ztest_random(UINT_MAX); 6879 6880 abd_copy_from_buf_off(abd_data, buf, 0, size); 6881 abd_copy_from_buf_off(abd_meta, buf, 0, size); 6882 6883 VERIFY0(fletcher_4_impl_set("scalar")); 6884 fletcher_4_native(buf, size, NULL, &zc_ref); 6885 fletcher_4_byteswap(buf, size, NULL, &zc_ref_byteswap); 6886 6887 VERIFY0(fletcher_4_impl_set("cycle")); 6888 while (run_count-- > 0) { 6889 zio_cksum_t zc; 6890 zio_cksum_t zc_byteswap; 6891 6892 fletcher_4_byteswap(buf, size, NULL, &zc_byteswap); 6893 fletcher_4_native(buf, size, NULL, &zc); 6894 6895 VERIFY0(memcmp(&zc, &zc_ref, sizeof (zc))); 6896 VERIFY0(memcmp(&zc_byteswap, &zc_ref_byteswap, 6897 sizeof (zc_byteswap))); 6898 6899 /* Test ABD - data */ 6900 abd_fletcher_4_byteswap(abd_data, size, NULL, 6901 &zc_byteswap); 6902 abd_fletcher_4_native(abd_data, size, NULL, &zc); 6903 6904 VERIFY0(memcmp(&zc, &zc_ref, sizeof (zc))); 6905 VERIFY0(memcmp(&zc_byteswap, &zc_ref_byteswap, 6906 sizeof (zc_byteswap))); 6907 6908 /* Test ABD - metadata */ 6909 abd_fletcher_4_byteswap(abd_meta, size, NULL, 6910 &zc_byteswap); 6911 abd_fletcher_4_native(abd_meta, size, NULL, &zc); 6912 6913 VERIFY0(memcmp(&zc, &zc_ref, sizeof (zc))); 6914 VERIFY0(memcmp(&zc_byteswap, &zc_ref_byteswap, 6915 sizeof (zc_byteswap))); 6916 6917 } 6918 6919 umem_free(buf, size); 6920 abd_free(abd_data); 6921 abd_free(abd_meta); 6922 } 6923 } 6924 6925 void 6926 ztest_fletcher_incr(ztest_ds_t *zd, uint64_t id) 6927 { 6928 (void) zd, (void) id; 6929 void *buf; 6930 size_t size; 6931 int *ptr; 6932 int i; 6933 zio_cksum_t zc_ref; 6934 zio_cksum_t zc_ref_bswap; 6935 6936 hrtime_t end = gethrtime() + NANOSEC; 6937 6938 while (gethrtime() <= end) { 6939 int run_count = 100; 6940 6941 size = ztest_random_blocksize(); 6942 buf = umem_alloc(size, UMEM_NOFAIL); 6943 6944 for (i = 0, ptr = buf; i < size / sizeof (*ptr); i++, ptr++) 6945 *ptr = ztest_random(UINT_MAX); 6946 6947 VERIFY0(fletcher_4_impl_set("scalar")); 6948 fletcher_4_native(buf, size, NULL, &zc_ref); 6949 fletcher_4_byteswap(buf, size, NULL, &zc_ref_bswap); 6950 6951 VERIFY0(fletcher_4_impl_set("cycle")); 6952 6953 while (run_count-- > 0) { 6954 zio_cksum_t zc; 6955 zio_cksum_t zc_bswap; 6956 size_t pos = 0; 6957 6958 ZIO_SET_CHECKSUM(&zc, 0, 0, 0, 0); 6959 ZIO_SET_CHECKSUM(&zc_bswap, 0, 0, 0, 0); 6960 6961 while (pos < size) { 6962 size_t inc = 64 * ztest_random(size / 67); 6963 /* sometimes add few bytes to test non-simd */ 6964 if (ztest_random(100) < 10) 6965 inc += P2ALIGN(ztest_random(64), 6966 sizeof (uint32_t)); 6967 6968 if (inc > (size - pos)) 6969 inc = size - pos; 6970 6971 fletcher_4_incremental_native(buf + pos, inc, 6972 &zc); 6973 fletcher_4_incremental_byteswap(buf + pos, inc, 6974 &zc_bswap); 6975 6976 pos += inc; 6977 } 6978 6979 VERIFY3U(pos, ==, size); 6980 6981 VERIFY(ZIO_CHECKSUM_EQUAL(zc, zc_ref)); 6982 VERIFY(ZIO_CHECKSUM_EQUAL(zc_bswap, zc_ref_bswap)); 6983 6984 /* 6985 * verify if incremental on the whole buffer is 6986 * equivalent to non-incremental version 6987 */ 6988 ZIO_SET_CHECKSUM(&zc, 0, 0, 0, 0); 6989 ZIO_SET_CHECKSUM(&zc_bswap, 0, 0, 0, 0); 6990 6991 fletcher_4_incremental_native(buf, size, &zc); 6992 fletcher_4_incremental_byteswap(buf, size, &zc_bswap); 6993 6994 
VERIFY(ZIO_CHECKSUM_EQUAL(zc, zc_ref)); 6995 VERIFY(ZIO_CHECKSUM_EQUAL(zc_bswap, zc_ref_bswap)); 6996 } 6997 6998 umem_free(buf, size); 6999 } 7000 } 7001 7002 static int 7003 ztest_set_global_vars(void) 7004 { 7005 for (size_t i = 0; i < ztest_opts.zo_gvars_count; i++) { 7006 char *kv = ztest_opts.zo_gvars[i]; 7007 VERIFY3U(strlen(kv), <=, ZO_GVARS_MAX_ARGLEN); 7008 VERIFY3U(strlen(kv), >, 0); 7009 int err = set_global_var(kv); 7010 if (ztest_opts.zo_verbose > 0) { 7011 (void) printf("setting global var %s ... %s\n", kv, 7012 err ? "failed" : "ok"); 7013 } 7014 if (err != 0) { 7015 (void) fprintf(stderr, 7016 "failed to set global var '%s'\n", kv); 7017 return (err); 7018 } 7019 } 7020 return (0); 7021 } 7022 7023 static char ** 7024 ztest_global_vars_to_zdb_args(void) 7025 { 7026 char **args = calloc(2*ztest_opts.zo_gvars_count + 1, sizeof (char *)); 7027 char **cur = args; 7028 if (args == NULL) 7029 return (NULL); 7030 for (size_t i = 0; i < ztest_opts.zo_gvars_count; i++) { 7031 *cur++ = (char *)"-o"; 7032 *cur++ = ztest_opts.zo_gvars[i]; 7033 } 7034 ASSERT3P(cur, ==, &args[2*ztest_opts.zo_gvars_count]); 7035 *cur = NULL; 7036 return (args); 7037 } 7038 7039 /* The end of strings is indicated by a NULL element */ 7040 static char * 7041 join_strings(char **strings, const char *sep) 7042 { 7043 size_t totallen = 0; 7044 for (char **sp = strings; *sp != NULL; sp++) { 7045 totallen += strlen(*sp); 7046 totallen += strlen(sep); 7047 } 7048 if (totallen > 0) { 7049 ASSERT(totallen >= strlen(sep)); 7050 totallen -= strlen(sep); 7051 } 7052 7053 size_t buflen = totallen + 1; 7054 char *o = umem_alloc(buflen, UMEM_NOFAIL); /* trailing 0 byte */ 7055 o[0] = '\0'; 7056 for (char **sp = strings; *sp != NULL; sp++) { 7057 size_t would; 7058 would = strlcat(o, *sp, buflen); 7059 VERIFY3U(would, <, buflen); 7060 if (*(sp+1) == NULL) { 7061 break; 7062 } 7063 would = strlcat(o, sep, buflen); 7064 VERIFY3U(would, <, buflen); 7065 } 7066 ASSERT3S(strlen(o), ==, totallen); 7067 return (o); 7068 } 7069 7070 static int 7071 ztest_check_path(char *path) 7072 { 7073 struct stat s; 7074 /* return true on success */ 7075 return (!stat(path, &s)); 7076 } 7077 7078 static void 7079 ztest_get_zdb_bin(char *bin, int len) 7080 { 7081 char *zdb_path; 7082 /* 7083 * Try to use $ZDB and in-tree zdb path. If not successful, just 7084 * let popen to search through PATH. 
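 * For example (hypothetical path), running "ZDB=/usr/local/sbin/zdb ztest"
 * forces that binary; when executing an in-tree .libs/ztest, the zdb built
 * next to it is tried, and as a last resort the bare name "zdb" is handed
 * to popen(3) to resolve via PATH.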
7085 */ 7086 if ((zdb_path = getenv("ZDB"))) { 7087 strlcpy(bin, zdb_path, len); /* In env */ 7088 if (!ztest_check_path(bin)) { 7089 ztest_dump_core = 0; 7090 fatal(B_TRUE, "invalid ZDB '%s'", bin); 7091 } 7092 return; 7093 } 7094 7095 VERIFY3P(realpath(getexecname(), bin), !=, NULL); 7096 if (strstr(bin, ".libs/ztest")) { 7097 strstr(bin, ".libs/ztest")[0] = '\0'; /* In-tree */ 7098 strcat(bin, "zdb"); 7099 if (ztest_check_path(bin)) 7100 return; 7101 } 7102 strcpy(bin, "zdb"); 7103 } 7104 7105 static vdev_t * 7106 ztest_random_concrete_vdev_leaf(vdev_t *vd) 7107 { 7108 if (vd == NULL) 7109 return (NULL); 7110 7111 if (vd->vdev_children == 0) 7112 return (vd); 7113 7114 vdev_t *eligible[vd->vdev_children]; 7115 int eligible_idx = 0, i; 7116 for (i = 0; i < vd->vdev_children; i++) { 7117 vdev_t *cvd = vd->vdev_child[i]; 7118 if (cvd->vdev_top->vdev_removing) 7119 continue; 7120 if (cvd->vdev_children > 0 || 7121 (vdev_is_concrete(cvd) && !cvd->vdev_detached)) { 7122 eligible[eligible_idx++] = cvd; 7123 } 7124 } 7125 VERIFY3S(eligible_idx, >, 0); 7126 7127 uint64_t child_no = ztest_random(eligible_idx); 7128 return (ztest_random_concrete_vdev_leaf(eligible[child_no])); 7129 } 7130 7131 void 7132 ztest_initialize(ztest_ds_t *zd, uint64_t id) 7133 { 7134 (void) zd, (void) id; 7135 spa_t *spa = ztest_spa; 7136 int error = 0; 7137 7138 mutex_enter(&ztest_vdev_lock); 7139 7140 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 7141 7142 /* Random leaf vdev */ 7143 vdev_t *rand_vd = ztest_random_concrete_vdev_leaf(spa->spa_root_vdev); 7144 if (rand_vd == NULL) { 7145 spa_config_exit(spa, SCL_VDEV, FTAG); 7146 mutex_exit(&ztest_vdev_lock); 7147 return; 7148 } 7149 7150 /* 7151 * The random vdev we've selected may change as soon as we 7152 * drop the spa_config_lock. We create local copies of things 7153 * we're interested in. 
7154 */ 7155 uint64_t guid = rand_vd->vdev_guid; 7156 char *path = strdup(rand_vd->vdev_path); 7157 boolean_t active = rand_vd->vdev_initialize_thread != NULL; 7158 7159 zfs_dbgmsg("vd %px, guid %llu", rand_vd, (u_longlong_t)guid); 7160 spa_config_exit(spa, SCL_VDEV, FTAG); 7161 7162 uint64_t cmd = ztest_random(POOL_INITIALIZE_FUNCS); 7163 7164 nvlist_t *vdev_guids = fnvlist_alloc(); 7165 nvlist_t *vdev_errlist = fnvlist_alloc(); 7166 fnvlist_add_uint64(vdev_guids, path, guid); 7167 error = spa_vdev_initialize(spa, vdev_guids, cmd, vdev_errlist); 7168 fnvlist_free(vdev_guids); 7169 fnvlist_free(vdev_errlist); 7170 7171 switch (cmd) { 7172 case POOL_INITIALIZE_CANCEL: 7173 if (ztest_opts.zo_verbose >= 4) { 7174 (void) printf("Cancel initialize %s", path); 7175 if (!active) 7176 (void) printf(" failed (no initialize active)"); 7177 (void) printf("\n"); 7178 } 7179 break; 7180 case POOL_INITIALIZE_START: 7181 if (ztest_opts.zo_verbose >= 4) { 7182 (void) printf("Start initialize %s", path); 7183 if (active && error == 0) 7184 (void) printf(" failed (already active)"); 7185 else if (error != 0) 7186 (void) printf(" failed (error %d)", error); 7187 (void) printf("\n"); 7188 } 7189 break; 7190 case POOL_INITIALIZE_SUSPEND: 7191 if (ztest_opts.zo_verbose >= 4) { 7192 (void) printf("Suspend initialize %s", path); 7193 if (!active) 7194 (void) printf(" failed (no initialize active)"); 7195 (void) printf("\n"); 7196 } 7197 break; 7198 } 7199 free(path); 7200 mutex_exit(&ztest_vdev_lock); 7201 } 7202 7203 void 7204 ztest_trim(ztest_ds_t *zd, uint64_t id) 7205 { 7206 (void) zd, (void) id; 7207 spa_t *spa = ztest_spa; 7208 int error = 0; 7209 7210 mutex_enter(&ztest_vdev_lock); 7211 7212 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 7213 7214 /* Random leaf vdev */ 7215 vdev_t *rand_vd = ztest_random_concrete_vdev_leaf(spa->spa_root_vdev); 7216 if (rand_vd == NULL) { 7217 spa_config_exit(spa, SCL_VDEV, FTAG); 7218 mutex_exit(&ztest_vdev_lock); 7219 return; 7220 } 7221 7222 /* 7223 * The random vdev we've selected may change as soon as we 7224 * drop the spa_config_lock. We create local copies of things 7225 * we're interested in. 
7226 */ 7227 uint64_t guid = rand_vd->vdev_guid; 7228 char *path = strdup(rand_vd->vdev_path); 7229 boolean_t active = rand_vd->vdev_trim_thread != NULL; 7230 7231 zfs_dbgmsg("vd %p, guid %llu", rand_vd, (u_longlong_t)guid); 7232 spa_config_exit(spa, SCL_VDEV, FTAG); 7233 7234 uint64_t cmd = ztest_random(POOL_TRIM_FUNCS); 7235 uint64_t rate = 1 << ztest_random(30); 7236 boolean_t partial = (ztest_random(5) > 0); 7237 boolean_t secure = (ztest_random(5) > 0); 7238 7239 nvlist_t *vdev_guids = fnvlist_alloc(); 7240 nvlist_t *vdev_errlist = fnvlist_alloc(); 7241 fnvlist_add_uint64(vdev_guids, path, guid); 7242 error = spa_vdev_trim(spa, vdev_guids, cmd, rate, partial, 7243 secure, vdev_errlist); 7244 fnvlist_free(vdev_guids); 7245 fnvlist_free(vdev_errlist); 7246 7247 switch (cmd) { 7248 case POOL_TRIM_CANCEL: 7249 if (ztest_opts.zo_verbose >= 4) { 7250 (void) printf("Cancel TRIM %s", path); 7251 if (!active) 7252 (void) printf(" failed (no TRIM active)"); 7253 (void) printf("\n"); 7254 } 7255 break; 7256 case POOL_TRIM_START: 7257 if (ztest_opts.zo_verbose >= 4) { 7258 (void) printf("Start TRIM %s", path); 7259 if (active && error == 0) 7260 (void) printf(" failed (already active)"); 7261 else if (error != 0) 7262 (void) printf(" failed (error %d)", error); 7263 (void) printf("\n"); 7264 } 7265 break; 7266 case POOL_TRIM_SUSPEND: 7267 if (ztest_opts.zo_verbose >= 4) { 7268 (void) printf("Suspend TRIM %s", path); 7269 if (!active) 7270 (void) printf(" failed (no TRIM active)"); 7271 (void) printf("\n"); 7272 } 7273 break; 7274 } 7275 free(path); 7276 mutex_exit(&ztest_vdev_lock); 7277 } 7278 7279 /* 7280 * Verify pool integrity by running zdb. 7281 */ 7282 static void 7283 ztest_run_zdb(uint64_t guid) 7284 { 7285 int status; 7286 char *bin; 7287 char *zdb; 7288 char *zbuf; 7289 const int len = MAXPATHLEN + MAXNAMELEN + 20; 7290 FILE *fp; 7291 7292 bin = umem_alloc(len, UMEM_NOFAIL); 7293 zdb = umem_alloc(len, UMEM_NOFAIL); 7294 zbuf = umem_alloc(1024, UMEM_NOFAIL); 7295 7296 ztest_get_zdb_bin(bin, len); 7297 7298 char **set_gvars_args = ztest_global_vars_to_zdb_args(); 7299 if (set_gvars_args == NULL) { 7300 fatal(B_FALSE, "Failed to allocate memory in " 7301 "ztest_global_vars_to_zdb_args(). Cannot run zdb.\n"); 7302 } 7303 char *set_gvars_args_joined = join_strings(set_gvars_args, " "); 7304 free(set_gvars_args); 7305 7306 size_t would = snprintf(zdb, len, 7307 "%s -bcc%s%s -G -d -Y -e -y %s -p %s %"PRIu64, 7308 bin, 7309 ztest_opts.zo_verbose >= 3 ? "s" : "", 7310 ztest_opts.zo_verbose >= 4 ? 
"v" : "", 7311 set_gvars_args_joined, 7312 ztest_opts.zo_dir, 7313 guid); 7314 ASSERT3U(would, <, len); 7315 7316 umem_free(set_gvars_args_joined, strlen(set_gvars_args_joined) + 1); 7317 7318 if (ztest_opts.zo_verbose >= 5) 7319 (void) printf("Executing %s\n", zdb); 7320 7321 fp = popen(zdb, "r"); 7322 7323 while (fgets(zbuf, 1024, fp) != NULL) 7324 if (ztest_opts.zo_verbose >= 3) 7325 (void) printf("%s", zbuf); 7326 7327 status = pclose(fp); 7328 7329 if (status == 0) 7330 goto out; 7331 7332 ztest_dump_core = 0; 7333 if (WIFEXITED(status)) 7334 fatal(B_FALSE, "'%s' exit code %d", zdb, WEXITSTATUS(status)); 7335 else 7336 fatal(B_FALSE, "'%s' died with signal %d", 7337 zdb, WTERMSIG(status)); 7338 out: 7339 umem_free(bin, len); 7340 umem_free(zdb, len); 7341 umem_free(zbuf, 1024); 7342 } 7343 7344 static void 7345 ztest_walk_pool_directory(const char *header) 7346 { 7347 spa_t *spa = NULL; 7348 7349 if (ztest_opts.zo_verbose >= 6) 7350 (void) puts(header); 7351 7352 mutex_enter(&spa_namespace_lock); 7353 while ((spa = spa_next(spa)) != NULL) 7354 if (ztest_opts.zo_verbose >= 6) 7355 (void) printf("\t%s\n", spa_name(spa)); 7356 mutex_exit(&spa_namespace_lock); 7357 } 7358 7359 static void 7360 ztest_spa_import_export(char *oldname, char *newname) 7361 { 7362 nvlist_t *config, *newconfig; 7363 uint64_t pool_guid; 7364 spa_t *spa; 7365 int error; 7366 7367 if (ztest_opts.zo_verbose >= 4) { 7368 (void) printf("import/export: old = %s, new = %s\n", 7369 oldname, newname); 7370 } 7371 7372 /* 7373 * Clean up from previous runs. 7374 */ 7375 (void) spa_destroy(newname); 7376 7377 /* 7378 * Get the pool's configuration and guid. 7379 */ 7380 VERIFY0(spa_open(oldname, &spa, FTAG)); 7381 7382 /* 7383 * Kick off a scrub to tickle scrub/export races. 7384 */ 7385 if (ztest_random(2) == 0) 7386 (void) spa_scan(spa, POOL_SCAN_SCRUB); 7387 7388 pool_guid = spa_guid(spa); 7389 spa_close(spa, FTAG); 7390 7391 ztest_walk_pool_directory("pools before export"); 7392 7393 /* 7394 * Export it. 7395 */ 7396 VERIFY0(spa_export(oldname, &config, B_FALSE, B_FALSE)); 7397 7398 ztest_walk_pool_directory("pools after export"); 7399 7400 /* 7401 * Try to import it. 7402 */ 7403 newconfig = spa_tryimport(config); 7404 ASSERT3P(newconfig, !=, NULL); 7405 fnvlist_free(newconfig); 7406 7407 /* 7408 * Import it under the new name. 7409 */ 7410 error = spa_import(newname, config, NULL, 0); 7411 if (error != 0) { 7412 dump_nvlist(config, 0); 7413 fatal(B_FALSE, "couldn't import pool %s as %s: error %u", 7414 oldname, newname, error); 7415 } 7416 7417 ztest_walk_pool_directory("pools after import"); 7418 7419 /* 7420 * Try to import it again -- should fail with EEXIST. 7421 */ 7422 VERIFY3U(EEXIST, ==, spa_import(newname, config, NULL, 0)); 7423 7424 /* 7425 * Try to import it under a different name -- should fail with EEXIST. 7426 */ 7427 VERIFY3U(EEXIST, ==, spa_import(oldname, config, NULL, 0)); 7428 7429 /* 7430 * Verify that the pool is no longer visible under the old name. 7431 */ 7432 VERIFY3U(ENOENT, ==, spa_open(oldname, &spa, FTAG)); 7433 7434 /* 7435 * Verify that we can open and close the pool using the new name. 
7436 */ 7437 VERIFY0(spa_open(newname, &spa, FTAG)); 7438 ASSERT3U(pool_guid, ==, spa_guid(spa)); 7439 spa_close(spa, FTAG); 7440 7441 fnvlist_free(config); 7442 } 7443 7444 static void 7445 ztest_resume(spa_t *spa) 7446 { 7447 if (spa_suspended(spa) && ztest_opts.zo_verbose >= 6) 7448 (void) printf("resuming from suspended state\n"); 7449 spa_vdev_state_enter(spa, SCL_NONE); 7450 vdev_clear(spa, NULL); 7451 (void) spa_vdev_state_exit(spa, NULL, 0); 7452 (void) zio_resume(spa); 7453 } 7454 7455 static __attribute__((noreturn)) void 7456 ztest_resume_thread(void *arg) 7457 { 7458 spa_t *spa = arg; 7459 7460 while (!ztest_exiting) { 7461 if (spa_suspended(spa)) 7462 ztest_resume(spa); 7463 (void) poll(NULL, 0, 100); 7464 7465 /* 7466 * Periodically change the zfs_compressed_arc_enabled setting. 7467 */ 7468 if (ztest_random(10) == 0) 7469 zfs_compressed_arc_enabled = ztest_random(2); 7470 7471 /* 7472 * Periodically change the zfs_abd_scatter_enabled setting. 7473 */ 7474 if (ztest_random(10) == 0) 7475 zfs_abd_scatter_enabled = ztest_random(2); 7476 } 7477 7478 thread_exit(); 7479 } 7480 7481 static __attribute__((noreturn)) void 7482 ztest_deadman_thread(void *arg) 7483 { 7484 ztest_shared_t *zs = arg; 7485 spa_t *spa = ztest_spa; 7486 hrtime_t delay, overdue, last_run = gethrtime(); 7487 7488 delay = (zs->zs_thread_stop - zs->zs_thread_start) + 7489 MSEC2NSEC(zfs_deadman_synctime_ms); 7490 7491 while (!ztest_exiting) { 7492 /* 7493 * Wait for the delay timer while checking occasionally 7494 * if we should stop. 7495 */ 7496 if (gethrtime() < last_run + delay) { 7497 (void) poll(NULL, 0, 1000); 7498 continue; 7499 } 7500 7501 /* 7502 * If the pool is suspended then fail immediately. Otherwise, 7503 * check to see if the pool is making any progress. If 7504 * vdev_deadman() discovers that there hasn't been any recent 7505 * I/Os then it will end up aborting the tests. 7506 */ 7507 if (spa_suspended(spa) || spa->spa_root_vdev == NULL) { 7508 fatal(B_FALSE, 7509 "aborting test after %llu seconds because " 7510 "pool has transitioned to a suspended state.", 7511 (u_longlong_t)zfs_deadman_synctime_ms / 1000); 7512 } 7513 vdev_deadman(spa->spa_root_vdev, FTAG); 7514 7515 /* 7516 * If the process doesn't complete within a grace period of 7517 * zfs_deadman_synctime_ms over the expected finish time, 7518 * then it may be hung and is terminated. 
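 * (For example, with zfs_deadman_synctime_ms at its typical default of
 * 600000, i.e. ten minutes, a run still going ten minutes past
 * zs_proc_stop is aborted; the actual tunable value may differ.)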
7519 */ 7520 overdue = zs->zs_proc_stop + MSEC2NSEC(zfs_deadman_synctime_ms); 7521 if (gethrtime() > overdue) { 7522 fatal(B_FALSE, 7523 "aborting test after %llu seconds because " 7524 "the process is overdue for termination.", 7525 (gethrtime() - zs->zs_proc_start) / NANOSEC); 7526 } 7527 7528 (void) printf("ztest has been running for %lld seconds\n", 7529 (gethrtime() - zs->zs_proc_start) / NANOSEC); 7530 7531 last_run = gethrtime(); 7532 delay = MSEC2NSEC(zfs_deadman_checktime_ms); 7533 } 7534 7535 thread_exit(); 7536 } 7537 7538 static void 7539 ztest_execute(int test, ztest_info_t *zi, uint64_t id) 7540 { 7541 ztest_ds_t *zd = &ztest_ds[id % ztest_opts.zo_datasets]; 7542 ztest_shared_callstate_t *zc = ZTEST_GET_SHARED_CALLSTATE(test); 7543 hrtime_t functime = gethrtime(); 7544 int i; 7545 7546 for (i = 0; i < zi->zi_iters; i++) 7547 zi->zi_func(zd, id); 7548 7549 functime = gethrtime() - functime; 7550 7551 atomic_add_64(&zc->zc_count, 1); 7552 atomic_add_64(&zc->zc_time, functime); 7553 7554 if (ztest_opts.zo_verbose >= 4) 7555 (void) printf("%6.2f sec in %s\n", 7556 (double)functime / NANOSEC, zi->zi_funcname); 7557 } 7558 7559 typedef struct ztest_raidz_expand_io { 7560 uint64_t rzx_id; 7561 uint64_t rzx_amount; 7562 uint64_t rzx_bufsize; 7563 const void *rzx_buffer; 7564 uint64_t rzx_alloc_max; 7565 spa_t *rzx_spa; 7566 } ztest_expand_io_t; 7567 7568 #undef OD_ARRAY_SIZE 7569 #define OD_ARRAY_SIZE 10 7570 7571 /* 7572 * Write a request amount of data to some dataset objects. 7573 * There will be ztest_opts.zo_threads count of these running in parallel. 7574 */ 7575 static __attribute__((noreturn)) void 7576 ztest_rzx_thread(void *arg) 7577 { 7578 ztest_expand_io_t *info = (ztest_expand_io_t *)arg; 7579 ztest_od_t *od; 7580 int batchsize; 7581 int od_size; 7582 ztest_ds_t *zd = &ztest_ds[info->rzx_id % ztest_opts.zo_datasets]; 7583 spa_t *spa = info->rzx_spa; 7584 7585 od_size = sizeof (ztest_od_t) * OD_ARRAY_SIZE; 7586 od = umem_alloc(od_size, UMEM_NOFAIL); 7587 batchsize = OD_ARRAY_SIZE; 7588 7589 /* Create objects to write to */ 7590 for (int b = 0; b < batchsize; b++) { 7591 ztest_od_init(od + b, info->rzx_id, FTAG, b, 7592 DMU_OT_UINT64_OTHER, 0, 0, 0); 7593 } 7594 if (ztest_object_init(zd, od, od_size, B_FALSE) != 0) { 7595 umem_free(od, od_size); 7596 thread_exit(); 7597 } 7598 7599 for (uint64_t offset = 0, written = 0; written < info->rzx_amount; 7600 offset += info->rzx_bufsize) { 7601 /* write to 10 objects */ 7602 for (int i = 0; i < batchsize && written < info->rzx_amount; 7603 i++) { 7604 (void) pthread_rwlock_rdlock(&zd->zd_zilog_lock); 7605 ztest_write(zd, od[i].od_object, offset, 7606 info->rzx_bufsize, info->rzx_buffer); 7607 (void) pthread_rwlock_unlock(&zd->zd_zilog_lock); 7608 written += info->rzx_bufsize; 7609 } 7610 txg_wait_synced(spa_get_dsl(spa), 0); 7611 /* due to inflation, we'll typically bail here */ 7612 if (metaslab_class_get_alloc(spa_normal_class(spa)) > 7613 info->rzx_alloc_max) { 7614 break; 7615 } 7616 } 7617 7618 /* Remove a few objects to leave some holes in allocation space */ 7619 mutex_enter(&zd->zd_dirobj_lock); 7620 (void) ztest_remove(zd, od, 2); 7621 mutex_exit(&zd->zd_dirobj_lock); 7622 7623 umem_free(od, od_size); 7624 7625 thread_exit(); 7626 } 7627 7628 static __attribute__((noreturn)) void 7629 ztest_thread(void *arg) 7630 { 7631 int rand; 7632 uint64_t id = (uintptr_t)arg; 7633 ztest_shared_t *zs = ztest_shared; 7634 uint64_t call_next; 7635 hrtime_t now; 7636 ztest_info_t *zi; 7637 ztest_shared_callstate_t *zc; 7638 7639 while 
((now = gethrtime()) < zs->zs_thread_stop) { 7640 /* 7641 * See if it's time to force a crash. 7642 */ 7643 if (now > zs->zs_thread_kill && 7644 raidz_expand_pause_point == RAIDZ_EXPAND_PAUSE_NONE) { 7645 ztest_kill(zs); 7646 } 7647 7648 /* 7649 * If we're getting ENOSPC with some regularity, stop. 7650 */ 7651 if (zs->zs_enospc_count > 10) 7652 break; 7653 7654 /* 7655 * Pick a random function to execute. 7656 */ 7657 rand = ztest_random(ZTEST_FUNCS); 7658 zi = &ztest_info[rand]; 7659 zc = ZTEST_GET_SHARED_CALLSTATE(rand); 7660 call_next = zc->zc_next; 7661 7662 if (now >= call_next && 7663 atomic_cas_64(&zc->zc_next, call_next, call_next + 7664 ztest_random(2 * zi->zi_interval[0] + 1)) == call_next) { 7665 ztest_execute(rand, zi, id); 7666 } 7667 } 7668 7669 thread_exit(); 7670 } 7671 7672 static void 7673 ztest_dataset_name(char *dsname, const char *pool, int d) 7674 { 7675 (void) snprintf(dsname, ZFS_MAX_DATASET_NAME_LEN, "%s/ds_%d", pool, d); 7676 } 7677 7678 static void 7679 ztest_dataset_destroy(int d) 7680 { 7681 char name[ZFS_MAX_DATASET_NAME_LEN]; 7682 int t; 7683 7684 ztest_dataset_name(name, ztest_opts.zo_pool, d); 7685 7686 if (ztest_opts.zo_verbose >= 3) 7687 (void) printf("Destroying %s to free up space\n", name); 7688 7689 /* 7690 * Cleanup any non-standard clones and snapshots. In general, 7691 * ztest thread t operates on dataset (t % zopt_datasets), 7692 * so there may be more than one thing to clean up. 7693 */ 7694 for (t = d; t < ztest_opts.zo_threads; 7695 t += ztest_opts.zo_datasets) 7696 ztest_dsl_dataset_cleanup(name, t); 7697 7698 (void) dmu_objset_find(name, ztest_objset_destroy_cb, NULL, 7699 DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN); 7700 } 7701 7702 static void 7703 ztest_dataset_dirobj_verify(ztest_ds_t *zd) 7704 { 7705 uint64_t usedobjs, dirobjs, scratch; 7706 7707 /* 7708 * ZTEST_DIROBJ is the object directory for the entire dataset. 7709 * Therefore, the number of objects in use should equal the 7710 * number of ZTEST_DIROBJ entries, +1 for ZTEST_DIROBJ itself. 7711 * If not, we have an object leak. 7712 * 7713 * Note that we can only check this in ztest_dataset_open(), 7714 * when the open-context and syncing-context values agree. 7715 * That's because zap_count() returns the open-context value, 7716 * while dmu_objset_space() returns the rootbp fill count. 
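 * For example, if ZTEST_DIROBJ currently holds 7 entries, then exactly 8
 * objects (those 7 plus ZTEST_DIROBJ itself) should be reported in use.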
7717 */ 7718 VERIFY0(zap_count(zd->zd_os, ZTEST_DIROBJ, &dirobjs)); 7719 dmu_objset_space(zd->zd_os, &scratch, &scratch, &usedobjs, &scratch); 7720 ASSERT3U(dirobjs + 1, ==, usedobjs); 7721 } 7722 7723 static int 7724 ztest_dataset_open(int d) 7725 { 7726 ztest_ds_t *zd = &ztest_ds[d]; 7727 uint64_t committed_seq = ZTEST_GET_SHARED_DS(d)->zd_seq; 7728 objset_t *os; 7729 zilog_t *zilog; 7730 char name[ZFS_MAX_DATASET_NAME_LEN]; 7731 int error; 7732 7733 ztest_dataset_name(name, ztest_opts.zo_pool, d); 7734 7735 (void) pthread_rwlock_rdlock(&ztest_name_lock); 7736 7737 error = ztest_dataset_create(name); 7738 if (error == ENOSPC) { 7739 (void) pthread_rwlock_unlock(&ztest_name_lock); 7740 ztest_record_enospc(FTAG); 7741 return (error); 7742 } 7743 ASSERT(error == 0 || error == EEXIST); 7744 7745 VERIFY0(ztest_dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, 7746 B_TRUE, zd, &os)); 7747 (void) pthread_rwlock_unlock(&ztest_name_lock); 7748 7749 ztest_zd_init(zd, ZTEST_GET_SHARED_DS(d), os); 7750 7751 zilog = zd->zd_zilog; 7752 7753 if (zilog->zl_header->zh_claim_lr_seq != 0 && 7754 zilog->zl_header->zh_claim_lr_seq < committed_seq) 7755 fatal(B_FALSE, "missing log records: " 7756 "claimed %"PRIu64" < committed %"PRIu64"", 7757 zilog->zl_header->zh_claim_lr_seq, committed_seq); 7758 7759 ztest_dataset_dirobj_verify(zd); 7760 7761 zil_replay(os, zd, ztest_replay_vector); 7762 7763 ztest_dataset_dirobj_verify(zd); 7764 7765 if (ztest_opts.zo_verbose >= 6) 7766 (void) printf("%s replay %"PRIu64" blocks, " 7767 "%"PRIu64" records, seq %"PRIu64"\n", 7768 zd->zd_name, 7769 zilog->zl_parse_blk_count, 7770 zilog->zl_parse_lr_count, 7771 zilog->zl_replaying_seq); 7772 7773 zilog = zil_open(os, ztest_get_data, NULL); 7774 7775 if (zilog->zl_replaying_seq != 0 && 7776 zilog->zl_replaying_seq < committed_seq) 7777 fatal(B_FALSE, "missing log records: " 7778 "replayed %"PRIu64" < committed %"PRIu64"", 7779 zilog->zl_replaying_seq, committed_seq); 7780 7781 return (0); 7782 } 7783 7784 static void 7785 ztest_dataset_close(int d) 7786 { 7787 ztest_ds_t *zd = &ztest_ds[d]; 7788 7789 zil_close(zd->zd_zilog); 7790 dmu_objset_disown(zd->zd_os, B_TRUE, zd); 7791 7792 ztest_zd_fini(zd); 7793 } 7794 7795 static int 7796 ztest_replay_zil_cb(const char *name, void *arg) 7797 { 7798 (void) arg; 7799 objset_t *os; 7800 ztest_ds_t *zdtmp; 7801 7802 VERIFY0(ztest_dmu_objset_own(name, DMU_OST_ANY, B_TRUE, 7803 B_TRUE, FTAG, &os)); 7804 7805 zdtmp = umem_alloc(sizeof (ztest_ds_t), UMEM_NOFAIL); 7806 7807 ztest_zd_init(zdtmp, NULL, os); 7808 zil_replay(os, zdtmp, ztest_replay_vector); 7809 ztest_zd_fini(zdtmp); 7810 7811 if (dmu_objset_zil(os)->zl_parse_lr_count != 0 && 7812 ztest_opts.zo_verbose >= 6) { 7813 zilog_t *zilog = dmu_objset_zil(os); 7814 7815 (void) printf("%s replay %"PRIu64" blocks, " 7816 "%"PRIu64" records, seq %"PRIu64"\n", 7817 name, 7818 zilog->zl_parse_blk_count, 7819 zilog->zl_parse_lr_count, 7820 zilog->zl_replaying_seq); 7821 } 7822 7823 umem_free(zdtmp, sizeof (ztest_ds_t)); 7824 7825 dmu_objset_disown(os, B_TRUE, FTAG); 7826 return (0); 7827 } 7828 7829 static void 7830 ztest_freeze(void) 7831 { 7832 ztest_ds_t *zd = &ztest_ds[0]; 7833 spa_t *spa; 7834 int numloops = 0; 7835 7836 /* freeze not supported during RAIDZ expansion */ 7837 if (ztest_opts.zo_raid_do_expand) 7838 return; 7839 7840 if (ztest_opts.zo_verbose >= 3) 7841 (void) printf("testing spa_freeze()...\n"); 7842 7843 raidz_scratch_verify(); 7844 kernel_init(SPA_MODE_READ | SPA_MODE_WRITE); 7845 VERIFY0(spa_open(ztest_opts.zo_pool, &spa, 
FTAG)); 7846 VERIFY0(ztest_dataset_open(0)); 7847 ztest_spa = spa; 7848 7849 /* 7850 * Force the first log block to be transactionally allocated. 7851 * We have to do this before we freeze the pool -- otherwise 7852 * the log chain won't be anchored. 7853 */ 7854 while (BP_IS_HOLE(&zd->zd_zilog->zl_header->zh_log)) { 7855 ztest_dmu_object_alloc_free(zd, 0); 7856 zil_commit(zd->zd_zilog, 0); 7857 } 7858 7859 txg_wait_synced(spa_get_dsl(spa), 0); 7860 7861 /* 7862 * Freeze the pool. This stops spa_sync() from doing anything, 7863 * so that the only way to record changes from now on is the ZIL. 7864 */ 7865 spa_freeze(spa); 7866 7867 /* 7868 * Because it is hard to predict how much space a write will actually 7869 * require beforehand, we leave ourselves some fudge space to write over 7870 * capacity. 7871 */ 7872 uint64_t capacity = metaslab_class_get_space(spa_normal_class(spa)) / 2; 7873 7874 /* 7875 * Run tests that generate log records but don't alter the pool config 7876 * or depend on DSL sync tasks (snapshots, objset create/destroy, etc). 7877 * We do a txg_wait_synced() after each iteration to force the txg 7878 * to increase well beyond the last synced value in the uberblock. 7879 * The ZIL should be OK with that. 7880 * 7881 * Run a random number of times less than zo_maxloops and ensure we do 7882 * not run out of space on the pool. 7883 */ 7884 while (ztest_random(10) != 0 && 7885 numloops++ < ztest_opts.zo_maxloops && 7886 metaslab_class_get_alloc(spa_normal_class(spa)) < capacity) { 7887 ztest_od_t od; 7888 ztest_od_init(&od, 0, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0, 0); 7889 VERIFY0(ztest_object_init(zd, &od, sizeof (od), B_FALSE)); 7890 ztest_io(zd, od.od_object, 7891 ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT); 7892 txg_wait_synced(spa_get_dsl(spa), 0); 7893 } 7894 7895 /* 7896 * Commit all of the changes we just generated. 7897 */ 7898 zil_commit(zd->zd_zilog, 0); 7899 txg_wait_synced(spa_get_dsl(spa), 0); 7900 7901 /* 7902 * Close our dataset and close the pool. 7903 */ 7904 ztest_dataset_close(0); 7905 spa_close(spa, FTAG); 7906 kernel_fini(); 7907 7908 /* 7909 * Open and close the pool and dataset to induce log replay. 7910 */ 7911 raidz_scratch_verify(); 7912 kernel_init(SPA_MODE_READ | SPA_MODE_WRITE); 7913 VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG)); 7914 ASSERT3U(spa_freeze_txg(spa), ==, UINT64_MAX); 7915 VERIFY0(ztest_dataset_open(0)); 7916 ztest_spa = spa; 7917 txg_wait_synced(spa_get_dsl(spa), 0); 7918 ztest_dataset_close(0); 7919 ztest_reguid(NULL, 0); 7920 7921 spa_close(spa, FTAG); 7922 kernel_fini(); 7923 } 7924 7925 static void 7926 ztest_import_impl(void) 7927 { 7928 importargs_t args = { 0 }; 7929 nvlist_t *cfg = NULL; 7930 int nsearch = 1; 7931 char *searchdirs[nsearch]; 7932 int flags = ZFS_IMPORT_MISSING_LOG; 7933 7934 searchdirs[0] = ztest_opts.zo_dir; 7935 args.paths = nsearch; 7936 args.path = searchdirs; 7937 args.can_be_active = B_FALSE; 7938 7939 libpc_handle_t lpch = { 7940 .lpc_lib_handle = NULL, 7941 .lpc_ops = &libzpool_config_ops, 7942 .lpc_printerr = B_TRUE 7943 }; 7944 VERIFY0(zpool_find_config(&lpch, ztest_opts.zo_pool, &cfg, &args)); 7945 VERIFY0(spa_import(ztest_opts.zo_pool, cfg, NULL, flags)); 7946 fnvlist_free(cfg); 7947 } 7948 7949 /* 7950 * Import a storage pool with the given name. 
 */
static void
ztest_import(ztest_shared_t *zs)
{
	spa_t *spa;

	mutex_init(&ztest_vdev_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&ztest_checkpoint_lock, NULL, MUTEX_DEFAULT, NULL);
	VERIFY0(pthread_rwlock_init(&ztest_name_lock, NULL));

	raidz_scratch_verify();
	kernel_init(SPA_MODE_READ | SPA_MODE_WRITE);

	ztest_import_impl();

	VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG));
	zs->zs_metaslab_sz =
	    1ULL << spa->spa_root_vdev->vdev_child[0]->vdev_ms_shift;
	zs->zs_guid = spa_guid(spa);
	spa_close(spa, FTAG);

	kernel_fini();

	if (!ztest_opts.zo_mmp_test) {
		ztest_run_zdb(zs->zs_guid);
		ztest_freeze();
		ztest_run_zdb(zs->zs_guid);
	}

	(void) pthread_rwlock_destroy(&ztest_name_lock);
	mutex_destroy(&ztest_vdev_lock);
	mutex_destroy(&ztest_checkpoint_lock);
}

/*
 * After the expansion has been killed, check that the pool is healthy.
 */
static void
ztest_raidz_expand_check(spa_t *spa)
{
	ASSERT3U(ztest_opts.zo_raidz_expand_test, ==, RAIDZ_EXPAND_KILLED);
	/*
	 * Set the pool-check-done flag; the main program will run a zdb
	 * check of the pool when we exit.
	 */
	ztest_shared_opts->zo_raidz_expand_test = RAIDZ_EXPAND_CHECKED;

	/* Wait for reflow to finish */
	if (ztest_opts.zo_verbose >= 1) {
		(void) printf("\nwaiting for reflow to finish ...\n");
	}
	pool_raidz_expand_stat_t rzx_stats;
	pool_raidz_expand_stat_t *pres = &rzx_stats;
	do {
		txg_wait_synced(spa_get_dsl(spa), 0);
		(void) poll(NULL, 0, 500); /* wait 1/2 second */

		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
		(void) spa_raidz_expand_get_stats(spa, pres);
		spa_config_exit(spa, SCL_CONFIG, FTAG);
	} while (pres->pres_state != DSS_FINISHED &&
	    pres->pres_reflowed < pres->pres_to_reflow);

	if (ztest_opts.zo_verbose >= 1) {
		(void) printf("verifying an interrupted raidz "
		    "expansion using a pool scrub ...\n");
	}
	/* Will fail here if non-recoverable corruption is detected */
	VERIFY0(ztest_scrub_impl(spa));
	if (ztest_opts.zo_verbose >= 1) {
		(void) printf("raidz expansion scrub check complete\n");
	}
}

/*
 * Start a raidz expansion test. We run some I/O on the pool for a while
 * to get some data into it, then grow the raidz vdev and kill the test
 * at the requested offset into the reflow, verifying that doing so does
 * not lead to pool corruption.
8030 */ 8031 static void 8032 ztest_raidz_expand_run(ztest_shared_t *zs, spa_t *spa) 8033 { 8034 nvlist_t *root; 8035 pool_raidz_expand_stat_t rzx_stats; 8036 pool_raidz_expand_stat_t *pres = &rzx_stats; 8037 kthread_t **run_threads; 8038 vdev_t *cvd, *rzvd = spa->spa_root_vdev->vdev_child[0]; 8039 int total_disks = rzvd->vdev_children; 8040 int data_disks = total_disks - vdev_get_nparity(rzvd); 8041 uint64_t alloc_goal; 8042 uint64_t csize; 8043 int error, t; 8044 int threads = ztest_opts.zo_threads; 8045 ztest_expand_io_t *thread_args; 8046 8047 ASSERT3U(ztest_opts.zo_raidz_expand_test, !=, RAIDZ_EXPAND_NONE); 8048 ASSERT3U(rzvd->vdev_ops, ==, &vdev_raidz_ops); 8049 ztest_opts.zo_raidz_expand_test = RAIDZ_EXPAND_STARTED; 8050 8051 /* Setup a 1 MiB buffer of random data */ 8052 uint64_t bufsize = 1024 * 1024; 8053 void *buffer = umem_alloc(bufsize, UMEM_NOFAIL); 8054 8055 if (read(ztest_fd_rand, buffer, bufsize) != bufsize) { 8056 fatal(B_TRUE, "short read from /dev/urandom"); 8057 } 8058 /* 8059 * Put some data in the pool and then attach a vdev to initiate 8060 * reflow. 8061 */ 8062 run_threads = umem_zalloc(threads * sizeof (kthread_t *), UMEM_NOFAIL); 8063 thread_args = umem_zalloc(threads * sizeof (ztest_expand_io_t), 8064 UMEM_NOFAIL); 8065 /* Aim for roughly 25% of allocatable space up to 1GB */ 8066 alloc_goal = (vdev_get_min_asize(rzvd) * data_disks) / total_disks; 8067 alloc_goal = MIN(alloc_goal >> 2, 1024*1024*1024); 8068 if (ztest_opts.zo_verbose >= 1) { 8069 (void) printf("adding data to pool '%s', goal %llu bytes\n", 8070 ztest_opts.zo_pool, (u_longlong_t)alloc_goal); 8071 } 8072 8073 /* 8074 * Kick off all the I/O generators that run in parallel. 8075 */ 8076 for (t = 0; t < threads; t++) { 8077 if (t < ztest_opts.zo_datasets && ztest_dataset_open(t) != 0) { 8078 umem_free(run_threads, threads * sizeof (kthread_t *)); 8079 umem_free(buffer, bufsize); 8080 return; 8081 } 8082 thread_args[t].rzx_id = t; 8083 thread_args[t].rzx_amount = alloc_goal / threads; 8084 thread_args[t].rzx_bufsize = bufsize; 8085 thread_args[t].rzx_buffer = buffer; 8086 thread_args[t].rzx_alloc_max = alloc_goal; 8087 thread_args[t].rzx_spa = spa; 8088 run_threads[t] = thread_create(NULL, 0, ztest_rzx_thread, 8089 &thread_args[t], 0, NULL, TS_RUN | TS_JOINABLE, 8090 defclsyspri); 8091 } 8092 8093 /* 8094 * Wait for all of the writers to complete. 8095 */ 8096 for (t = 0; t < threads; t++) 8097 VERIFY0(thread_join(run_threads[t])); 8098 8099 /* 8100 * Close all datasets. This must be done after all the threads 8101 * are joined so we can be sure none of the datasets are in-use 8102 * by any of the threads. 
8103 */ 8104 for (t = 0; t < ztest_opts.zo_threads; t++) { 8105 if (t < ztest_opts.zo_datasets) 8106 ztest_dataset_close(t); 8107 } 8108 8109 txg_wait_synced(spa_get_dsl(spa), 0); 8110 8111 zs->zs_alloc = metaslab_class_get_alloc(spa_normal_class(spa)); 8112 zs->zs_space = metaslab_class_get_space(spa_normal_class(spa)); 8113 8114 umem_free(buffer, bufsize); 8115 umem_free(run_threads, threads * sizeof (kthread_t *)); 8116 umem_free(thread_args, threads * sizeof (ztest_expand_io_t)); 8117 8118 /* Set our reflow target to 25%, 50% or 75% of allocated size */ 8119 uint_t multiple = ztest_random(3) + 1; 8120 uint64_t reflow_max = (rzvd->vdev_stat.vs_alloc * multiple) / 4; 8121 raidz_expand_max_reflow_bytes = reflow_max; 8122 8123 if (ztest_opts.zo_verbose >= 1) { 8124 (void) printf("running raidz expansion test, killing when " 8125 "reflow reaches %llu bytes (%u/4 of allocated space)\n", 8126 (u_longlong_t)reflow_max, multiple); 8127 } 8128 8129 /* XXX - do we want some I/O load during the reflow? */ 8130 8131 /* 8132 * Use a disk size that is larger than existing ones 8133 */ 8134 cvd = rzvd->vdev_child[0]; 8135 csize = vdev_get_min_asize(cvd); 8136 csize += csize / 10; 8137 /* 8138 * Path to vdev to be attached 8139 */ 8140 char *newpath = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); 8141 (void) snprintf(newpath, MAXPATHLEN, ztest_dev_template, 8142 ztest_opts.zo_dir, ztest_opts.zo_pool, rzvd->vdev_children); 8143 /* 8144 * Build the nvlist describing newpath. 8145 */ 8146 root = make_vdev_root(newpath, NULL, NULL, csize, ztest_get_ashift(), 8147 NULL, 0, 0, 1); 8148 /* 8149 * Expand the raidz vdev by attaching the new disk 8150 */ 8151 if (ztest_opts.zo_verbose >= 1) { 8152 (void) printf("expanding raidz: %d wide to %d wide with '%s'\n", 8153 (int)rzvd->vdev_children, (int)rzvd->vdev_children + 1, 8154 newpath); 8155 } 8156 error = spa_vdev_attach(spa, rzvd->vdev_guid, root, B_FALSE, B_FALSE); 8157 nvlist_free(root); 8158 if (error != 0) { 8159 fatal(0, "raidz expand: attach (%s %llu) returned %d", 8160 newpath, (long long)csize, error); 8161 } 8162 8163 /* 8164 * Wait for reflow to begin 8165 */ 8166 while (spa->spa_raidz_expand == NULL) { 8167 txg_wait_synced(spa_get_dsl(spa), 0); 8168 (void) poll(NULL, 0, 100); /* wait 1/10 second */ 8169 } 8170 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 8171 (void) spa_raidz_expand_get_stats(spa, pres); 8172 spa_config_exit(spa, SCL_CONFIG, FTAG); 8173 while (pres->pres_state != DSS_SCANNING) { 8174 txg_wait_synced(spa_get_dsl(spa), 0); 8175 (void) poll(NULL, 0, 100); /* wait 1/10 second */ 8176 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 8177 (void) spa_raidz_expand_get_stats(spa, pres); 8178 spa_config_exit(spa, SCL_CONFIG, FTAG); 8179 } 8180 8181 ASSERT3U(pres->pres_state, ==, DSS_SCANNING); 8182 ASSERT3U(pres->pres_to_reflow, !=, 0); 8183 /* 8184 * Set so when we are killed we go to raidz checking rather than 8185 * restarting test. 
8186 */ 8187 ztest_shared_opts->zo_raidz_expand_test = RAIDZ_EXPAND_KILLED; 8188 if (ztest_opts.zo_verbose >= 1) { 8189 (void) printf("raidz expansion reflow started, waiting for " 8190 "%llu bytes to be copied\n", (u_longlong_t)reflow_max); 8191 } 8192 8193 /* 8194 * Wait for reflow maximum to be reached and then kill the test 8195 */ 8196 while (pres->pres_reflowed < reflow_max) { 8197 txg_wait_synced(spa_get_dsl(spa), 0); 8198 (void) poll(NULL, 0, 100); /* wait 1/10 second */ 8199 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 8200 (void) spa_raidz_expand_get_stats(spa, pres); 8201 spa_config_exit(spa, SCL_CONFIG, FTAG); 8202 } 8203 8204 /* Reset the reflow pause before killing */ 8205 raidz_expand_max_reflow_bytes = 0; 8206 8207 if (ztest_opts.zo_verbose >= 1) { 8208 (void) printf("killing raidz expansion test after reflow " 8209 "reached %llu bytes\n", (u_longlong_t)pres->pres_reflowed); 8210 } 8211 8212 /* 8213 * Kill ourself to simulate a panic during a reflow. Our parent will 8214 * restart the test and the changed flag value will drive the test 8215 * through the scrub/check code to verify the pool is not corrupted. 8216 */ 8217 ztest_kill(zs); 8218 } 8219 8220 static void 8221 ztest_generic_run(ztest_shared_t *zs, spa_t *spa) 8222 { 8223 kthread_t **run_threads; 8224 int t; 8225 8226 run_threads = umem_zalloc(ztest_opts.zo_threads * sizeof (kthread_t *), 8227 UMEM_NOFAIL); 8228 8229 /* 8230 * Kick off all the tests that run in parallel. 8231 */ 8232 for (t = 0; t < ztest_opts.zo_threads; t++) { 8233 if (t < ztest_opts.zo_datasets && ztest_dataset_open(t) != 0) { 8234 umem_free(run_threads, ztest_opts.zo_threads * 8235 sizeof (kthread_t *)); 8236 return; 8237 } 8238 8239 run_threads[t] = thread_create(NULL, 0, ztest_thread, 8240 (void *)(uintptr_t)t, 0, NULL, TS_RUN | TS_JOINABLE, 8241 defclsyspri); 8242 } 8243 8244 /* 8245 * Wait for all of the tests to complete. 8246 */ 8247 for (t = 0; t < ztest_opts.zo_threads; t++) 8248 VERIFY0(thread_join(run_threads[t])); 8249 8250 /* 8251 * Close all datasets. This must be done after all the threads 8252 * are joined so we can be sure none of the datasets are in-use 8253 * by any of the threads. 8254 */ 8255 for (t = 0; t < ztest_opts.zo_threads; t++) { 8256 if (t < ztest_opts.zo_datasets) 8257 ztest_dataset_close(t); 8258 } 8259 8260 txg_wait_synced(spa_get_dsl(spa), 0); 8261 8262 zs->zs_alloc = metaslab_class_get_alloc(spa_normal_class(spa)); 8263 zs->zs_space = metaslab_class_get_space(spa_normal_class(spa)); 8264 8265 umem_free(run_threads, ztest_opts.zo_threads * sizeof (kthread_t *)); 8266 } 8267 8268 /* 8269 * Setup our test context and kick off threads to run tests on all datasets 8270 * in parallel. 8271 */ 8272 static void 8273 ztest_run(ztest_shared_t *zs) 8274 { 8275 spa_t *spa; 8276 objset_t *os; 8277 kthread_t *resume_thread, *deadman_thread; 8278 uint64_t object; 8279 int error; 8280 int t, d; 8281 8282 ztest_exiting = B_FALSE; 8283 8284 /* 8285 * Initialize parent/child shared state. 
8286 */ 8287 mutex_init(&ztest_vdev_lock, NULL, MUTEX_DEFAULT, NULL); 8288 mutex_init(&ztest_checkpoint_lock, NULL, MUTEX_DEFAULT, NULL); 8289 VERIFY0(pthread_rwlock_init(&ztest_name_lock, NULL)); 8290 8291 zs->zs_thread_start = gethrtime(); 8292 zs->zs_thread_stop = 8293 zs->zs_thread_start + ztest_opts.zo_passtime * NANOSEC; 8294 zs->zs_thread_stop = MIN(zs->zs_thread_stop, zs->zs_proc_stop); 8295 zs->zs_thread_kill = zs->zs_thread_stop; 8296 if (ztest_random(100) < ztest_opts.zo_killrate) { 8297 zs->zs_thread_kill -= 8298 ztest_random(ztest_opts.zo_passtime * NANOSEC); 8299 } 8300 8301 mutex_init(&zcl.zcl_callbacks_lock, NULL, MUTEX_DEFAULT, NULL); 8302 8303 list_create(&zcl.zcl_callbacks, sizeof (ztest_cb_data_t), 8304 offsetof(ztest_cb_data_t, zcd_node)); 8305 8306 /* 8307 * Open our pool. It may need to be imported first depending on 8308 * what tests were running when the previous pass was terminated. 8309 */ 8310 raidz_scratch_verify(); 8311 kernel_init(SPA_MODE_READ | SPA_MODE_WRITE); 8312 error = spa_open(ztest_opts.zo_pool, &spa, FTAG); 8313 if (error) { 8314 VERIFY3S(error, ==, ENOENT); 8315 ztest_import_impl(); 8316 VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG)); 8317 zs->zs_metaslab_sz = 8318 1ULL << spa->spa_root_vdev->vdev_child[0]->vdev_ms_shift; 8319 } 8320 8321 metaslab_preload_limit = ztest_random(20) + 1; 8322 ztest_spa = spa; 8323 8324 /* 8325 * XXX - BUGBUG raidz expansion do not run this for generic for now 8326 */ 8327 if (ztest_opts.zo_raidz_expand_test != RAIDZ_EXPAND_NONE) 8328 VERIFY0(vdev_raidz_impl_set("cycle")); 8329 8330 dmu_objset_stats_t dds; 8331 VERIFY0(ztest_dmu_objset_own(ztest_opts.zo_pool, 8332 DMU_OST_ANY, B_TRUE, B_TRUE, FTAG, &os)); 8333 dsl_pool_config_enter(dmu_objset_pool(os), FTAG); 8334 dmu_objset_fast_stat(os, &dds); 8335 dsl_pool_config_exit(dmu_objset_pool(os), FTAG); 8336 dmu_objset_disown(os, B_TRUE, FTAG); 8337 8338 /* Give the dedicated raidz expansion test more grace time */ 8339 if (ztest_opts.zo_raidz_expand_test != RAIDZ_EXPAND_NONE) 8340 zfs_deadman_synctime_ms *= 2; 8341 8342 /* 8343 * Create a thread to periodically resume suspended I/O. 8344 */ 8345 resume_thread = thread_create(NULL, 0, ztest_resume_thread, 8346 spa, 0, NULL, TS_RUN | TS_JOINABLE, defclsyspri); 8347 8348 /* 8349 * Create a deadman thread and set to panic if we hang. 8350 */ 8351 deadman_thread = thread_create(NULL, 0, ztest_deadman_thread, 8352 zs, 0, NULL, TS_RUN | TS_JOINABLE, defclsyspri); 8353 8354 spa->spa_deadman_failmode = ZIO_FAILURE_MODE_PANIC; 8355 8356 /* 8357 * Verify that we can safely inquire about any object, 8358 * whether it's allocated or not. To make it interesting, 8359 * we probe a 5-wide window around each power of two. 8360 * This hits all edge cases, including zero and the max. 8361 */ 8362 for (t = 0; t < 64; t++) { 8363 for (d = -5; d <= 5; d++) { 8364 error = dmu_object_info(spa->spa_meta_objset, 8365 (1ULL << t) + d, NULL); 8366 ASSERT(error == 0 || error == ENOENT || 8367 error == EINVAL); 8368 } 8369 } 8370 8371 /* 8372 * If we got any ENOSPC errors on the previous run, destroy something. 
8373 */ 8374 if (zs->zs_enospc_count != 0) { 8375 /* Not expecting ENOSPC errors during raidz expansion tests */ 8376 ASSERT3U(ztest_opts.zo_raidz_expand_test, ==, 8377 RAIDZ_EXPAND_NONE); 8378 8379 int d = ztest_random(ztest_opts.zo_datasets); 8380 ztest_dataset_destroy(d); 8381 } 8382 zs->zs_enospc_count = 0; 8383 8384 /* 8385 * If we were in the middle of ztest_device_removal() and were killed 8386 * we need to ensure the removal and scrub complete before running 8387 * any tests that check ztest_device_removal_active. The removal will 8388 * be restarted automatically when the spa is opened, but we need to 8389 * initiate the scrub manually if it is not already in progress. Note 8390 * that we always run the scrub whenever an indirect vdev exists 8391 * because we have no way of knowing for sure if ztest_device_removal() 8392 * fully completed its scrub before the pool was reimported. 8393 * 8394 * Does not apply for the RAIDZ expansion specific test runs 8395 */ 8396 if (ztest_opts.zo_raidz_expand_test == RAIDZ_EXPAND_NONE && 8397 (spa->spa_removing_phys.sr_state == DSS_SCANNING || 8398 spa->spa_removing_phys.sr_prev_indirect_vdev != -1)) { 8399 while (spa->spa_removing_phys.sr_state == DSS_SCANNING) 8400 txg_wait_synced(spa_get_dsl(spa), 0); 8401 8402 error = ztest_scrub_impl(spa); 8403 if (error == EBUSY) 8404 error = 0; 8405 ASSERT0(error); 8406 } 8407 8408 if (ztest_opts.zo_verbose >= 4) 8409 (void) printf("starting main threads...\n"); 8410 8411 /* 8412 * Replay all logs of all datasets in the pool. This is primarily for 8413 * temporary datasets which wouldn't otherwise get replayed, which 8414 * can trigger failures when attempting to offline a SLOG in 8415 * ztest_fault_inject(). 8416 */ 8417 (void) dmu_objset_find(ztest_opts.zo_pool, ztest_replay_zil_cb, 8418 NULL, DS_FIND_CHILDREN); 8419 8420 if (ztest_opts.zo_raidz_expand_test == RAIDZ_EXPAND_REQUESTED) 8421 ztest_raidz_expand_run(zs, spa); 8422 else if (ztest_opts.zo_raidz_expand_test == RAIDZ_EXPAND_KILLED) 8423 ztest_raidz_expand_check(spa); 8424 else 8425 ztest_generic_run(zs, spa); 8426 8427 /* Kill the resume and deadman threads */ 8428 ztest_exiting = B_TRUE; 8429 VERIFY0(thread_join(resume_thread)); 8430 VERIFY0(thread_join(deadman_thread)); 8431 ztest_resume(spa); 8432 8433 /* 8434 * Right before closing the pool, kick off a bunch of async I/O; 8435 * spa_close() should wait for it to complete. 8436 */ 8437 for (object = 1; object < 50; object++) { 8438 dmu_prefetch(spa->spa_meta_objset, object, 0, 0, 1ULL << 20, 8439 ZIO_PRIORITY_SYNC_READ); 8440 } 8441 8442 /* Verify that at least one commit cb was called in a timely fashion */ 8443 if (zc_cb_counter >= ZTEST_COMMIT_CB_MIN_REG) 8444 VERIFY0(zc_min_txg_delay); 8445 8446 spa_close(spa, FTAG); 8447 8448 /* 8449 * Verify that we can loop over all pools. 8450 */ 8451 mutex_enter(&spa_namespace_lock); 8452 for (spa = spa_next(NULL); spa != NULL; spa = spa_next(spa)) 8453 if (ztest_opts.zo_verbose > 3) 8454 (void) printf("spa_next: found %s\n", spa_name(spa)); 8455 mutex_exit(&spa_namespace_lock); 8456 8457 /* 8458 * Verify that we can export the pool and reimport it under a 8459 * different name. 
8460 */ 8461 if ((ztest_random(2) == 0) && !ztest_opts.zo_mmp_test) { 8462 char name[ZFS_MAX_DATASET_NAME_LEN]; 8463 (void) snprintf(name, sizeof (name), "%s_import", 8464 ztest_opts.zo_pool); 8465 ztest_spa_import_export(ztest_opts.zo_pool, name); 8466 ztest_spa_import_export(name, ztest_opts.zo_pool); 8467 } 8468 8469 kernel_fini(); 8470 8471 list_destroy(&zcl.zcl_callbacks); 8472 mutex_destroy(&zcl.zcl_callbacks_lock); 8473 (void) pthread_rwlock_destroy(&ztest_name_lock); 8474 mutex_destroy(&ztest_vdev_lock); 8475 mutex_destroy(&ztest_checkpoint_lock); 8476 } 8477 8478 static void 8479 print_time(hrtime_t t, char *timebuf) 8480 { 8481 hrtime_t s = t / NANOSEC; 8482 hrtime_t m = s / 60; 8483 hrtime_t h = m / 60; 8484 hrtime_t d = h / 24; 8485 8486 s -= m * 60; 8487 m -= h * 60; 8488 h -= d * 24; 8489 8490 timebuf[0] = '\0'; 8491 8492 if (d) 8493 (void) sprintf(timebuf, 8494 "%llud%02lluh%02llum%02llus", d, h, m, s); 8495 else if (h) 8496 (void) sprintf(timebuf, "%lluh%02llum%02llus", h, m, s); 8497 else if (m) 8498 (void) sprintf(timebuf, "%llum%02llus", m, s); 8499 else 8500 (void) sprintf(timebuf, "%llus", s); 8501 } 8502 8503 static nvlist_t * 8504 make_random_props(void) 8505 { 8506 nvlist_t *props; 8507 8508 props = fnvlist_alloc(); 8509 8510 if (ztest_random(2) == 0) 8511 return (props); 8512 8513 fnvlist_add_uint64(props, 8514 zpool_prop_to_name(ZPOOL_PROP_AUTOREPLACE), 1); 8515 8516 return (props); 8517 } 8518 8519 /* 8520 * Create a storage pool with the given name and initial vdev size. 8521 * Then test spa_freeze() functionality. 8522 */ 8523 static void 8524 ztest_init(ztest_shared_t *zs) 8525 { 8526 spa_t *spa; 8527 nvlist_t *nvroot, *props; 8528 int i; 8529 8530 mutex_init(&ztest_vdev_lock, NULL, MUTEX_DEFAULT, NULL); 8531 mutex_init(&ztest_checkpoint_lock, NULL, MUTEX_DEFAULT, NULL); 8532 VERIFY0(pthread_rwlock_init(&ztest_name_lock, NULL)); 8533 8534 raidz_scratch_verify(); 8535 kernel_init(SPA_MODE_READ | SPA_MODE_WRITE); 8536 8537 /* 8538 * Create the storage pool. 8539 */ 8540 (void) spa_destroy(ztest_opts.zo_pool); 8541 ztest_shared->zs_vdev_next_leaf = 0; 8542 zs->zs_splits = 0; 8543 zs->zs_mirrors = ztest_opts.zo_mirrors; 8544 nvroot = make_vdev_root(NULL, NULL, NULL, ztest_opts.zo_vdev_size, 0, 8545 NULL, ztest_opts.zo_raid_children, zs->zs_mirrors, 1); 8546 props = make_random_props(); 8547 8548 /* 8549 * We don't expect the pool to suspend unless maxfaults == 0, 8550 * in which case ztest_fault_inject() temporarily takes away 8551 * the only valid replica. 8552 */ 8553 fnvlist_add_uint64(props, 8554 zpool_prop_to_name(ZPOOL_PROP_FAILUREMODE), 8555 MAXFAULTS(zs) ? ZIO_FAILURE_MODE_PANIC : ZIO_FAILURE_MODE_WAIT); 8556 8557 for (i = 0; i < SPA_FEATURES; i++) { 8558 char *buf; 8559 8560 if (!spa_feature_table[i].fi_zfs_mod_supported) 8561 continue; 8562 8563 /* 8564 * 75% chance of using the log space map feature. We want ztest 8565 * to exercise both the code paths that use the log space map 8566 * feature and the ones that don't. 
		 */
		if (i == SPA_FEATURE_LOG_SPACEMAP && ztest_random(4) == 0)
			continue;

		VERIFY3S(-1, !=, asprintf(&buf, "feature@%s",
		    spa_feature_table[i].fi_uname));
		fnvlist_add_uint64(props, buf, 0);
		free(buf);
	}

	VERIFY0(spa_create(ztest_opts.zo_pool, nvroot, props, NULL, NULL));
	fnvlist_free(nvroot);
	fnvlist_free(props);

	VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG));
	zs->zs_metaslab_sz =
	    1ULL << spa->spa_root_vdev->vdev_child[0]->vdev_ms_shift;
	zs->zs_guid = spa_guid(spa);
	spa_close(spa, FTAG);

	kernel_fini();

	if (!ztest_opts.zo_mmp_test) {
		ztest_run_zdb(zs->zs_guid);
		ztest_freeze();
		ztest_run_zdb(zs->zs_guid);
	}

	(void) pthread_rwlock_destroy(&ztest_name_lock);
	mutex_destroy(&ztest_vdev_lock);
	mutex_destroy(&ztest_checkpoint_lock);
}

static void
setup_data_fd(void)
{
	static char ztest_name_data[] = "/tmp/ztest.data.XXXXXX";

	ztest_fd_data = mkstemp(ztest_name_data);
	ASSERT3S(ztest_fd_data, >=, 0);
	(void) unlink(ztest_name_data);
}

static int
shared_data_size(ztest_shared_hdr_t *hdr)
{
	int size;

	size = hdr->zh_hdr_size;
	size += hdr->zh_opts_size;
	size += hdr->zh_size;
	size += hdr->zh_stats_size * hdr->zh_stats_count;
	size += hdr->zh_ds_size * hdr->zh_ds_count;
	size += hdr->zh_scratch_state_size;

	return (size);
}

static void
setup_hdr(void)
{
	int size;
	ztest_shared_hdr_t *hdr;

	hdr = (void *)mmap(0, P2ROUNDUP(sizeof (*hdr), getpagesize()),
	    PROT_READ | PROT_WRITE, MAP_SHARED, ztest_fd_data, 0);
	ASSERT3P(hdr, !=, MAP_FAILED);

	VERIFY0(ftruncate(ztest_fd_data, sizeof (ztest_shared_hdr_t)));

	hdr->zh_hdr_size = sizeof (ztest_shared_hdr_t);
	hdr->zh_opts_size = sizeof (ztest_shared_opts_t);
	hdr->zh_size = sizeof (ztest_shared_t);
	hdr->zh_stats_size = sizeof (ztest_shared_callstate_t);
	hdr->zh_stats_count = ZTEST_FUNCS;
	hdr->zh_ds_size = sizeof (ztest_shared_ds_t);
	hdr->zh_ds_count = ztest_opts.zo_datasets;
	hdr->zh_scratch_state_size = sizeof (ztest_shared_scratch_state_t);

	size = shared_data_size(hdr);
	VERIFY0(ftruncate(ztest_fd_data, size));

	(void) munmap((caddr_t)hdr, P2ROUNDUP(sizeof (*hdr), getpagesize()));
}

static void
setup_data(void)
{
	int size, offset;
	ztest_shared_hdr_t *hdr;
	uint8_t *buf;

	hdr = (void *)mmap(0, P2ROUNDUP(sizeof (*hdr), getpagesize()),
	    PROT_READ, MAP_SHARED, ztest_fd_data, 0);
	ASSERT3P(hdr, !=, MAP_FAILED);

	size = shared_data_size(hdr);

	(void) munmap((caddr_t)hdr, P2ROUNDUP(sizeof (*hdr), getpagesize()));
	hdr = ztest_shared_hdr = (void *)mmap(0, P2ROUNDUP(size, getpagesize()),
	    PROT_READ | PROT_WRITE, MAP_SHARED, ztest_fd_data, 0);
	ASSERT3P(hdr, !=, MAP_FAILED);
	buf = (uint8_t *)hdr;

	offset = hdr->zh_hdr_size;
	ztest_shared_opts = (void *)&buf[offset];
	offset += hdr->zh_opts_size;
	ztest_shared = (void *)&buf[offset];
	offset += hdr->zh_size;
	ztest_shared_callstate = (void *)&buf[offset];
	offset += hdr->zh_stats_size * hdr->zh_stats_count;
	ztest_shared_ds = (void *)&buf[offset];
	offset += hdr->zh_ds_size * hdr->zh_ds_count;
	ztest_scratch_state = (void *)&buf[offset];
}

static boolean_t
exec_child(char *cmd,
    char *libpath, boolean_t ignorekill, int *statusp)
{
	pid_t pid;
	int status;
	char *cmdbuf = NULL;

	pid = fork();

	if (cmd == NULL) {
		cmdbuf = umem_alloc(MAXPATHLEN, UMEM_NOFAIL);
		(void) strlcpy(cmdbuf, getexecname(), MAXPATHLEN);
		cmd = cmdbuf;
	}

	if (pid == -1)
		fatal(B_TRUE, "fork failed");

	if (pid == 0) { /* child */
		char fd_data_str[12];

		VERIFY3S(11, >=,
		    snprintf(fd_data_str, 12, "%d", ztest_fd_data));
		VERIFY0(setenv("ZTEST_FD_DATA", fd_data_str, 1));

		if (libpath != NULL) {
			const char *curlp = getenv("LD_LIBRARY_PATH");
			if (curlp == NULL)
				VERIFY0(setenv("LD_LIBRARY_PATH", libpath, 1));
			else {
				char *newlp = NULL;
				VERIFY3S(-1, !=,
				    asprintf(&newlp, "%s:%s", libpath, curlp));
				VERIFY0(setenv("LD_LIBRARY_PATH", newlp, 1));
				free(newlp);
			}
		}
		(void) execl(cmd, cmd, (char *)NULL);
		ztest_dump_core = B_FALSE;
		fatal(B_TRUE, "exec failed: %s", cmd);
	}

	if (cmdbuf != NULL) {
		umem_free(cmdbuf, MAXPATHLEN);
		cmd = NULL;
	}

	while (waitpid(pid, &status, 0) != pid)
		continue;
	if (statusp != NULL)
		*statusp = status;

	if (WIFEXITED(status)) {
		if (WEXITSTATUS(status) != 0) {
			(void) fprintf(stderr, "child exited with code %d\n",
			    WEXITSTATUS(status));
			exit(2);
		}
		return (B_FALSE);
	} else if (WIFSIGNALED(status)) {
		if (!ignorekill || WTERMSIG(status) != SIGKILL) {
			(void) fprintf(stderr, "child died with signal %d\n",
			    WTERMSIG(status));
			exit(3);
		}
		return (B_TRUE);
	} else {
		(void) fprintf(stderr, "something strange happened to child\n");
		exit(4);
	}
}

static void
ztest_run_init(void)
{
	int i;

	ztest_shared_t *zs = ztest_shared;

	/*
	 * Blow away any existing copy of zpool.cache
	 */
	(void) remove(spa_config_path);

	if (ztest_opts.zo_init == 0) {
		if (ztest_opts.zo_verbose >= 1)
			(void) printf("Importing pool %s\n",
			    ztest_opts.zo_pool);
		ztest_import(zs);
		return;
	}

	/*
	 * Create and initialize our storage pool.
	 */
	for (i = 1; i <= ztest_opts.zo_init; i++) {
		memset(zs, 0, sizeof (*zs));
		if (ztest_opts.zo_verbose >= 3 &&
		    ztest_opts.zo_init != 1) {
			(void) printf("ztest_init(), pass %d\n", i);
		}
		ztest_init(zs);
	}
}

int
main(int argc, char **argv)
{
	int kills = 0;
	int iters = 0;
	int older = 0;
	int newer = 0;
	ztest_shared_t *zs;
	ztest_info_t *zi;
	ztest_shared_callstate_t *zc;
	char timebuf[100];
	char numbuf[NN_NUMBUF_SZ];
	char *cmd;
	boolean_t hasalt;
	int f, err;
	char *fd_data_str = getenv("ZTEST_FD_DATA");
	struct sigaction action;

	(void) setvbuf(stdout, NULL, _IOLBF, 0);

	dprintf_setup(&argc, argv);
	zfs_deadman_synctime_ms = 300000;
	zfs_deadman_checktime_ms = 30000;
	/*
	 * As two-word space map entries may not come up often (especially
	 * if pool and vdev sizes are small), we want to force at least some
	 * of them so the feature gets tested.
	 */
	zfs_force_some_double_word_sm_entries = B_TRUE;

	/*
	 * Verify that even extensively damaged split blocks with many
	 * segments can be reconstructed in a reasonable amount of time
	 * when reconstruction is known to be possible.
8822 * 8823 * Note: the lower this value is, the more damage we inflict, and 8824 * the more time ztest spends in recovering that damage. We chose 8825 * to induce damage 1/100th of the time so recovery is tested but 8826 * not so frequently that ztest doesn't get to test other code paths. 8827 */ 8828 zfs_reconstruct_indirect_damage_fraction = 100; 8829 8830 action.sa_handler = sig_handler; 8831 sigemptyset(&action.sa_mask); 8832 action.sa_flags = 0; 8833 8834 if (sigaction(SIGSEGV, &action, NULL) < 0) { 8835 (void) fprintf(stderr, "ztest: cannot catch SIGSEGV: %s.\n", 8836 strerror(errno)); 8837 exit(EXIT_FAILURE); 8838 } 8839 8840 if (sigaction(SIGABRT, &action, NULL) < 0) { 8841 (void) fprintf(stderr, "ztest: cannot catch SIGABRT: %s.\n", 8842 strerror(errno)); 8843 exit(EXIT_FAILURE); 8844 } 8845 8846 /* 8847 * Force random_get_bytes() to use /dev/urandom in order to prevent 8848 * ztest from needlessly depleting the system entropy pool. 8849 */ 8850 random_path = "/dev/urandom"; 8851 ztest_fd_rand = open(random_path, O_RDONLY | O_CLOEXEC); 8852 ASSERT3S(ztest_fd_rand, >=, 0); 8853 8854 if (!fd_data_str) { 8855 process_options(argc, argv); 8856 8857 setup_data_fd(); 8858 setup_hdr(); 8859 setup_data(); 8860 memcpy(ztest_shared_opts, &ztest_opts, 8861 sizeof (*ztest_shared_opts)); 8862 } else { 8863 ztest_fd_data = atoi(fd_data_str); 8864 setup_data(); 8865 memcpy(&ztest_opts, ztest_shared_opts, sizeof (ztest_opts)); 8866 } 8867 ASSERT3U(ztest_opts.zo_datasets, ==, ztest_shared_hdr->zh_ds_count); 8868 8869 err = ztest_set_global_vars(); 8870 if (err != 0 && !fd_data_str) { 8871 /* error message done by ztest_set_global_vars */ 8872 exit(EXIT_FAILURE); 8873 } else { 8874 /* children should not be spawned if setting gvars fails */ 8875 VERIFY3S(err, ==, 0); 8876 } 8877 8878 /* Override location of zpool.cache */ 8879 VERIFY3S(asprintf((char **)&spa_config_path, "%s/zpool.cache", 8880 ztest_opts.zo_dir), !=, -1); 8881 8882 ztest_ds = umem_alloc(ztest_opts.zo_datasets * sizeof (ztest_ds_t), 8883 UMEM_NOFAIL); 8884 zs = ztest_shared; 8885 8886 if (fd_data_str) { 8887 metaslab_force_ganging = ztest_opts.zo_metaslab_force_ganging; 8888 metaslab_df_alloc_threshold = 8889 zs->zs_metaslab_df_alloc_threshold; 8890 8891 if (zs->zs_do_init) 8892 ztest_run_init(); 8893 else 8894 ztest_run(zs); 8895 exit(0); 8896 } 8897 8898 hasalt = (strlen(ztest_opts.zo_alt_ztest) != 0); 8899 8900 if (ztest_opts.zo_verbose >= 1) { 8901 (void) printf("%"PRIu64" vdevs, %d datasets, %d threads, " 8902 "%d %s disks, parity %d, %"PRIu64" seconds...\n\n", 8903 ztest_opts.zo_vdevs, 8904 ztest_opts.zo_datasets, 8905 ztest_opts.zo_threads, 8906 ztest_opts.zo_raid_children, 8907 ztest_opts.zo_raid_type, 8908 ztest_opts.zo_raid_parity, 8909 ztest_opts.zo_time); 8910 } 8911 8912 cmd = umem_alloc(MAXNAMELEN, UMEM_NOFAIL); 8913 (void) strlcpy(cmd, getexecname(), MAXNAMELEN); 8914 8915 zs->zs_do_init = B_TRUE; 8916 if (strlen(ztest_opts.zo_alt_ztest) != 0) { 8917 if (ztest_opts.zo_verbose >= 1) { 8918 (void) printf("Executing older ztest for " 8919 "initialization: %s\n", ztest_opts.zo_alt_ztest); 8920 } 8921 VERIFY(!exec_child(ztest_opts.zo_alt_ztest, 8922 ztest_opts.zo_alt_libpath, B_FALSE, NULL)); 8923 } else { 8924 VERIFY(!exec_child(NULL, NULL, B_FALSE, NULL)); 8925 } 8926 zs->zs_do_init = B_FALSE; 8927 8928 zs->zs_proc_start = gethrtime(); 8929 zs->zs_proc_stop = zs->zs_proc_start + ztest_opts.zo_time * NANOSEC; 8930 8931 for (f = 0; f < ZTEST_FUNCS; f++) { 8932 zi = &ztest_info[f]; 8933 zc = ZTEST_GET_SHARED_CALLSTATE(f); 
		if (zs->zs_proc_start + zi->zi_interval[0] > zs->zs_proc_stop)
			zc->zc_next = UINT64_MAX;
		else
			zc->zc_next = zs->zs_proc_start +
			    ztest_random(2 * zi->zi_interval[0] + 1);
	}

	/*
	 * Run the tests in a loop. These tests include fault injection
	 * to verify that self-healing data works, and forced crashes
	 * to verify that we never lose on-disk consistency.
	 */
	while (gethrtime() < zs->zs_proc_stop) {
		int status;
		boolean_t killed;

		/*
		 * Initialize the workload counters for each function.
		 */
		for (f = 0; f < ZTEST_FUNCS; f++) {
			zc = ZTEST_GET_SHARED_CALLSTATE(f);
			zc->zc_count = 0;
			zc->zc_time = 0;
		}

		/* Set the allocation switch size */
		zs->zs_metaslab_df_alloc_threshold =
		    ztest_random(zs->zs_metaslab_sz / 4) + 1;

		if (!hasalt || ztest_random(2) == 0) {
			if (hasalt && ztest_opts.zo_verbose >= 1) {
				(void) printf("Executing newer ztest: %s\n",
				    cmd);
			}
			newer++;
			killed = exec_child(cmd, NULL, B_TRUE, &status);
		} else {
			if (hasalt && ztest_opts.zo_verbose >= 1) {
				(void) printf("Executing older ztest: %s\n",
				    ztest_opts.zo_alt_ztest);
			}
			older++;
			killed = exec_child(ztest_opts.zo_alt_ztest,
			    ztest_opts.zo_alt_libpath, B_TRUE, &status);
		}

		if (killed)
			kills++;
		iters++;

		if (ztest_opts.zo_verbose >= 1) {
			hrtime_t now = gethrtime();

			now = MIN(now, zs->zs_proc_stop);
			print_time(zs->zs_proc_stop - now, timebuf);
			nicenum(zs->zs_space, numbuf, sizeof (numbuf));

			(void) printf("Pass %3d, %8s, %3"PRIu64" ENOSPC, "
			    "%4.1f%% of %5s used, %3.0f%% done, %8s to go\n",
			    iters,
			    WIFEXITED(status) ? "Complete" : "SIGKILL",
			    zs->zs_enospc_count,
			    100.0 * zs->zs_alloc / zs->zs_space,
			    numbuf,
			    100.0 * (now - zs->zs_proc_start) /
			    (ztest_opts.zo_time * NANOSEC), timebuf);
		}

		if (ztest_opts.zo_verbose >= 2) {
			(void) printf("\nWorkload summary:\n\n");
			(void) printf("%7s %9s %s\n",
			    "Calls", "Time", "Function");
			(void) printf("%7s %9s %s\n",
			    "-----", "----", "--------");
			for (f = 0; f < ZTEST_FUNCS; f++) {
				zi = &ztest_info[f];
				zc = ZTEST_GET_SHARED_CALLSTATE(f);
				print_time(zc->zc_time, timebuf);
				(void) printf("%7"PRIu64" %9s %s\n",
				    zc->zc_count, timebuf,
				    zi->zi_funcname);
			}
			(void) printf("\n");
		}

		if (!ztest_opts.zo_mmp_test)
			ztest_run_zdb(zs->zs_guid);
		if (ztest_shared_opts->zo_raidz_expand_test ==
		    RAIDZ_EXPAND_CHECKED)
			break; /* raidz expand test complete */
	}

	if (ztest_opts.zo_verbose >= 1) {
		if (hasalt) {
			(void) printf("%d runs of older ztest: %s\n", older,
			    ztest_opts.zo_alt_ztest);
			(void) printf("%d runs of newer ztest: %s\n", newer,
			    cmd);
		}
		(void) printf("%d killed, %d completed, %.0f%% kill rate\n",
		    kills, iters - kills, (100.0 * kills) / MAX(1, iters));
	}

	umem_free(cmd, MAXNAMELEN);

	return (0);
}
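
/*
 * Illustrative sketch, compiled out by the #if 0 guard and not part of
 * ztest itself: a minimal standalone program showing the same pattern
 * that setup_data_fd(), setup_hdr(), setup_data(), and exec_child()
 * use above -- an unlinked temporary file is sized with ftruncate(),
 * mapped MAP_SHARED, and its descriptor number is handed to a
 * re-exec'ed child through an environment variable, so one shared
 * region outlives exec() without needing a named file on disk.
 * The demo_hdr_t layout and the DEMO_FD_DATA variable are hypothetical
 * stand-ins for the real ztest_shared_hdr_t layout and ZTEST_FD_DATA.
 */
#if 0
#include <sys/mman.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

typedef struct demo_hdr {
	uint64_t dh_hdr_size;	/* size of this header */
	uint64_t dh_body_size;	/* size of the payload that follows it */
} demo_hdr_t;

int
main(void)
{
	char template[] = "/tmp/demo.data.XXXXXX";
	int fd = mkstemp(template);
	if (fd < 0) {
		perror("mkstemp");
		return (1);
	}
	(void) unlink(template);	/* file lives on through the fd */

	/* First pass: map just the header page and record the layout. */
	size_t pgsz = (size_t)sysconf(_SC_PAGESIZE);
	(void) ftruncate(fd, sizeof (demo_hdr_t));
	demo_hdr_t *hdr = mmap(NULL, pgsz, PROT_READ | PROT_WRITE,
	    MAP_SHARED, fd, 0);
	if (hdr == MAP_FAILED) {
		perror("mmap");
		return (1);
	}
	hdr->dh_hdr_size = sizeof (demo_hdr_t);
	hdr->dh_body_size = 4096;

	/* Grow the file to header + payload, then remap the whole thing. */
	size_t total = hdr->dh_hdr_size + hdr->dh_body_size;
	(void) ftruncate(fd, total);
	(void) munmap(hdr, pgsz);
	uint8_t *base = mmap(NULL, total, PROT_READ | PROT_WRITE,
	    MAP_SHARED, fd, 0);
	if (base == MAP_FAILED) {
		perror("mmap");
		return (1);
	}
	hdr = (demo_hdr_t *)base;
	char *body = (char *)&base[hdr->dh_hdr_size];
	(void) snprintf(body, hdr->dh_body_size, "hello from the parent");

	/*
	 * A child started with fork() + exec() would read DEMO_FD_DATA,
	 * mmap() the same descriptor, and see this string: the open fd
	 * (without O_CLOEXEC) survives exec even though the mapping does
	 * not, which is why the child re-maps rather than inherits it.
	 */
	char fdstr[12];
	(void) snprintf(fdstr, sizeof (fdstr), "%d", fd);
	(void) setenv("DEMO_FD_DATA", fdstr, 1);
	(void) printf("shared region ready: %s\n", body);

	(void) munmap(base, total);
	(void) close(fd);
	return (0);
}
#endif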