/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
 * Copyright (c) 2013 Steven Hartland. All rights reserved.
 * Copyright (c) 2014 Integros [integros.com]
 * Copyright 2017 Joyent, Inc.
 * Copyright (c) 2017, Intel Corporation.
 */

/*
 * The objective of this program is to provide a DMU/ZAP/SPA stress test
 * that runs entirely in userland, is easy to use, and easy to extend.
 *
 * The overall design of the ztest program is as follows:
 *
 * (1) For each major functional area (e.g. adding vdevs to a pool,
 *     creating and destroying datasets, reading and writing objects, etc)
 *     we have a simple routine to test that functionality.  These
 *     individual routines do not have to do anything "stressful".
 *
 * (2) We turn these simple functionality tests into a stress test by
 *     running them all in parallel, with as many threads as desired,
 *     and spread across as many datasets, objects, and vdevs as desired.
 *
 * (3) While all this is happening, we inject faults into the pool to
 *     verify that self-healing data really works.
 *
 * (4) Every time we open a dataset, we change its checksum and compression
 *     functions.  Thus even individual objects vary from block to block
 *     in which checksum they use and whether they're compressed.
 *
 * (5) To verify that we never lose on-disk consistency after a crash,
 *     we run the entire test in a child of the main process.
 *     At random times, the child self-immolates with a SIGKILL.
 *     This is the software equivalent of pulling the power cord.
 *     The parent then runs the test again, using the existing
 *     storage pool, as many times as desired. If backwards compatibility
 *     testing is enabled ztest will sometimes run the "older" version
 *     of ztest after a SIGKILL.
 *
 * (6) To verify that we don't have future leaks or temporal incursions,
 *     many of the functional tests record the transaction group number
 *     as part of their data.  When reading old data, they verify that
 *     the transaction group number is less than the current, open txg.
 *     If you add a new test, please do this if applicable.
 *
 * (7) Threads are created with a reduced stack size, for sanity checking.
 *     Therefore, it's important not to allocate huge buffers on the stack.
 *
 * When run with no arguments, ztest runs for about five minutes and
 * produces no output if successful.  To get a little bit of information,
 * specify -V.  To get more information, specify -VV, and so on.
 *
 * To turn this into an overnight stress test, use -T to specify run time.
 *
 * You can ask for more vdevs [-v], datasets [-d], or threads [-t]
 * to increase the pool capacity, fanout, and overall stress level.
 *
 * Use the -k option to set the desired frequency of kills.
 *
 * When ztest invokes itself it passes all relevant information through a
 * temporary file which is mmap-ed in the child process. This allows shared
 * memory to survive the exec syscall. The ztest_shared_hdr_t struct is always
 * stored at offset 0 of this file and contains information on the size and
 * number of shared structures in the file. The information stored in this file
 * must remain backwards compatible with older versions of ztest so that
 * ztest can invoke them during backwards compatibility testing (-B).
 */

#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/dmu.h>
#include <sys/txg.h>
#include <sys/dbuf.h>
#include <sys/zap.h>
#include <sys/dmu_objset.h>
#include <sys/poll.h>
#include <sys/stat.h>
#include <sys/time.h>
#include <sys/wait.h>
#include <sys/mman.h>
#include <sys/resource.h>
#include <sys/zio.h>
#include <sys/zil.h>
#include <sys/zil_impl.h>
#include <sys/vdev_draid.h>
#include <sys/vdev_impl.h>
#include <sys/vdev_file.h>
#include <sys/vdev_initialize.h>
#include <sys/vdev_raidz.h>
#include <sys/vdev_trim.h>
#include <sys/spa_impl.h>
#include <sys/metaslab_impl.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_destroy.h>
#include <sys/dsl_scan.h>
#include <sys/zio_checksum.h>
#include <sys/zfs_refcount.h>
#include <sys/zfeature.h>
#include <sys/dsl_userhold.h>
#include <sys/abd.h>
#include <sys/blake3.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <getopt.h>
#include <signal.h>
#include <umem.h>
#include <ctype.h>
#include <math.h>
#include <sys/fs/zfs.h>
#include <zfs_fletcher.h>
#include <libnvpair.h>
#include <libzutil.h>
#include <sys/crypto/icp.h>
#if (__GLIBC__ && !__UCLIBC__)
#include <execinfo.h>	/* for backtrace() */
#endif

static int ztest_fd_data = -1;
static int ztest_fd_rand = -1;

typedef struct ztest_shared_hdr {
	uint64_t zh_hdr_size;
	uint64_t zh_opts_size;
	uint64_t zh_size;
	uint64_t zh_stats_size;
	uint64_t zh_stats_count;
	uint64_t zh_ds_size;
	uint64_t zh_ds_count;
} ztest_shared_hdr_t;

static ztest_shared_hdr_t *ztest_shared_hdr;

enum ztest_class_state {
	ZTEST_VDEV_CLASS_OFF,
	ZTEST_VDEV_CLASS_ON,
	ZTEST_VDEV_CLASS_RND
};

#define	ZO_GVARS_MAX_ARGLEN	((size_t)64)
#define	ZO_GVARS_MAX_COUNT	((size_t)10)

typedef struct ztest_shared_opts {
	char zo_pool[ZFS_MAX_DATASET_NAME_LEN];
	char zo_dir[ZFS_MAX_DATASET_NAME_LEN];
	char zo_alt_ztest[MAXNAMELEN];
	char zo_alt_libpath[MAXNAMELEN];
	uint64_t zo_vdevs;
	uint64_t zo_vdevtime;
	size_t zo_vdev_size;
	int zo_ashift;
	int zo_mirrors;
	int zo_raid_children;
	int zo_raid_parity;
	char zo_raid_type[8];
	int zo_draid_data;
	int zo_draid_spares;
	int zo_datasets;
	int zo_threads;
	uint64_t zo_passtime;
	uint64_t zo_killrate;
	int zo_verbose;
	int zo_init;
	uint64_t zo_time;
	uint64_t zo_maxloops;
	uint64_t zo_metaslab_force_ganging;
	int zo_mmp_test;
	int zo_special_vdevs;
	int zo_dump_dbgmsg;
	int zo_gvars_count;
	char zo_gvars[ZO_GVARS_MAX_COUNT][ZO_GVARS_MAX_ARGLEN];
} ztest_shared_opts_t;

/* Default values for command line options. */
#define	DEFAULT_POOL "ztest"
#define	DEFAULT_VDEV_DIR "/tmp"
#define	DEFAULT_VDEV_COUNT 5
#define	DEFAULT_VDEV_SIZE (SPA_MINDEVSIZE * 4)	/* 256m default size */
#define	DEFAULT_VDEV_SIZE_STR "256M"
#define	DEFAULT_ASHIFT SPA_MINBLOCKSHIFT
#define	DEFAULT_MIRRORS 2
#define	DEFAULT_RAID_CHILDREN 4
#define	DEFAULT_RAID_PARITY 1
#define	DEFAULT_DRAID_DATA 4
#define	DEFAULT_DRAID_SPARES 1
#define	DEFAULT_DATASETS_COUNT 7
#define	DEFAULT_THREADS 23
#define	DEFAULT_RUN_TIME 300 /* 300 seconds */
#define	DEFAULT_RUN_TIME_STR "300 sec"
#define	DEFAULT_PASS_TIME 60 /* 60 seconds */
#define	DEFAULT_PASS_TIME_STR "60 sec"
#define	DEFAULT_KILL_RATE 70 /* 70% kill rate */
#define	DEFAULT_KILLRATE_STR "70%"
#define	DEFAULT_INITS 1
#define	DEFAULT_MAX_LOOPS 50 /* 5 minutes */
#define	DEFAULT_FORCE_GANGING (64 << 10)
#define	DEFAULT_FORCE_GANGING_STR "64K"

/* Simplifying assumption: -1 is not a valid default. */
#define	NO_DEFAULT -1

static const ztest_shared_opts_t ztest_opts_defaults = {
	.zo_pool = DEFAULT_POOL,
	.zo_dir = DEFAULT_VDEV_DIR,
	.zo_alt_ztest = { '\0' },
	.zo_alt_libpath = { '\0' },
	.zo_vdevs = DEFAULT_VDEV_COUNT,
	.zo_ashift = DEFAULT_ASHIFT,
	.zo_mirrors = DEFAULT_MIRRORS,
	.zo_raid_children = DEFAULT_RAID_CHILDREN,
	.zo_raid_parity = DEFAULT_RAID_PARITY,
	.zo_raid_type = VDEV_TYPE_RAIDZ,
	.zo_vdev_size = DEFAULT_VDEV_SIZE,
	.zo_draid_data = DEFAULT_DRAID_DATA,	/* data drives */
	.zo_draid_spares = DEFAULT_DRAID_SPARES,	/* distributed spares */
	.zo_datasets = DEFAULT_DATASETS_COUNT,
	.zo_threads = DEFAULT_THREADS,
	.zo_passtime = DEFAULT_PASS_TIME,
	.zo_killrate = DEFAULT_KILL_RATE,
	.zo_verbose = 0,
	.zo_mmp_test = 0,
	.zo_init = DEFAULT_INITS,
	.zo_time = DEFAULT_RUN_TIME,
	.zo_maxloops = DEFAULT_MAX_LOOPS, /* max loops during spa_freeze() */
	.zo_metaslab_force_ganging = DEFAULT_FORCE_GANGING,
	.zo_special_vdevs = ZTEST_VDEV_CLASS_RND,
	.zo_gvars_count = 0,
};

extern uint64_t metaslab_force_ganging;
extern uint64_t metaslab_df_alloc_threshold;
extern unsigned long zfs_deadman_synctime_ms;
extern int metaslab_preload_limit;
extern int zfs_compressed_arc_enabled;
extern int zfs_abd_scatter_enabled;
extern int dmu_object_alloc_chunk_shift;
extern boolean_t zfs_force_some_double_word_sm_entries;
extern unsigned long zio_decompress_fail_fraction;
extern unsigned long zfs_reconstruct_indirect_damage_fraction;


static ztest_shared_opts_t *ztest_shared_opts;
static ztest_shared_opts_t ztest_opts;
static char *ztest_wkeydata = "abcdefghijklmnopqrstuvwxyz012345";

typedef struct ztest_shared_ds {
	uint64_t zd_seq;
} ztest_shared_ds_t;

static ztest_shared_ds_t *ztest_shared_ds;
#define	ZTEST_GET_SHARED_DS(d) (&ztest_shared_ds[d])

#define	BT_MAGIC	0x123456789abcdefULL
#define	MAXFAULTS(zs) \
	(MAX((zs)->zs_mirrors, 1) * (ztest_opts.zo_raid_parity + 1) - 1)

enum ztest_io_type {
	ZTEST_IO_WRITE_TAG,
	ZTEST_IO_WRITE_PATTERN,
	ZTEST_IO_WRITE_ZEROES,
	ZTEST_IO_TRUNCATE,
	ZTEST_IO_SETATTR,
	ZTEST_IO_REWRITE,
	ZTEST_IO_TYPES
};

typedef struct ztest_block_tag {
	uint64_t bt_magic;
	uint64_t bt_objset;
	uint64_t bt_object;
	uint64_t bt_dnodesize;
	uint64_t bt_offset;
	uint64_t bt_gen;
	uint64_t bt_txg;
	uint64_t bt_crtxg;
} ztest_block_tag_t;

typedef struct bufwad {
	uint64_t bw_index;
	uint64_t bw_txg;
	uint64_t bw_data;
} bufwad_t;

/*
 * It would be better to use a rangelock_t per object.  Unfortunately
 * the rangelock_t is not a drop-in replacement for rl_t, because we
 * still need to map from object ID to rangelock_t.
 */
typedef enum {
	RL_READER,
	RL_WRITER,
	RL_APPEND
} rl_type_t;

typedef struct rll {
	void *rll_writer;
	int rll_readers;
	kmutex_t rll_lock;
	kcondvar_t rll_cv;
} rll_t;

typedef struct rl {
	uint64_t rl_object;
	uint64_t rl_offset;
	uint64_t rl_size;
	rll_t *rl_lock;
} rl_t;

#define	ZTEST_RANGE_LOCKS	64
#define	ZTEST_OBJECT_LOCKS	64

/*
 * Object descriptor.  Used as a template for object lookup/create/remove.
 */
typedef struct ztest_od {
	uint64_t od_dir;
	uint64_t od_object;
	dmu_object_type_t od_type;
	dmu_object_type_t od_crtype;
	uint64_t od_blocksize;
	uint64_t od_crblocksize;
	uint64_t od_crdnodesize;
	uint64_t od_gen;
	uint64_t od_crgen;
	char od_name[ZFS_MAX_DATASET_NAME_LEN];
} ztest_od_t;

/*
 * Per-dataset state.
 */
typedef struct ztest_ds {
	ztest_shared_ds_t *zd_shared;
	objset_t *zd_os;
	pthread_rwlock_t zd_zilog_lock;
	zilog_t *zd_zilog;
	ztest_od_t *zd_od;	/* debugging aid */
	char zd_name[ZFS_MAX_DATASET_NAME_LEN];
	kmutex_t zd_dirobj_lock;
	rll_t zd_object_lock[ZTEST_OBJECT_LOCKS];
	rll_t zd_range_lock[ZTEST_RANGE_LOCKS];
} ztest_ds_t;

/*
 * Per-iteration state.
 */
typedef void ztest_func_t(ztest_ds_t *zd, uint64_t id);

typedef struct ztest_info {
	ztest_func_t	*zi_func;	/* test function */
	uint64_t	zi_iters;	/* iterations per execution */
	uint64_t	*zi_interval;	/* execute every <interval> seconds */
	const char	*zi_funcname;	/* name of test function */
} ztest_info_t;

typedef struct ztest_shared_callstate {
	uint64_t	zc_count;	/* per-pass count */
	uint64_t	zc_time;	/* per-pass time */
	uint64_t	zc_next;	/* next time to call this function */
} ztest_shared_callstate_t;

static ztest_shared_callstate_t *ztest_shared_callstate;
#define	ZTEST_GET_SHARED_CALLSTATE(c) (&ztest_shared_callstate[c])

ztest_func_t ztest_dmu_read_write;
ztest_func_t ztest_dmu_write_parallel;
ztest_func_t ztest_dmu_object_alloc_free;
ztest_func_t ztest_dmu_object_next_chunk;
ztest_func_t ztest_dmu_commit_callbacks;
ztest_func_t ztest_zap;
ztest_func_t ztest_zap_parallel;
ztest_func_t ztest_zil_commit;
ztest_func_t ztest_zil_remount;
ztest_func_t ztest_dmu_read_write_zcopy;
ztest_func_t ztest_dmu_objset_create_destroy;
ztest_func_t ztest_dmu_prealloc;
ztest_func_t ztest_fzap;
ztest_func_t ztest_dmu_snapshot_create_destroy;
ztest_func_t ztest_dsl_prop_get_set;
ztest_func_t ztest_spa_prop_get_set;
ztest_func_t ztest_spa_create_destroy;
ztest_func_t ztest_fault_inject;
ztest_func_t ztest_dmu_snapshot_hold;
ztest_func_t ztest_mmp_enable_disable;
ztest_func_t ztest_scrub;
ztest_func_t ztest_dsl_dataset_promote_busy;
ztest_func_t ztest_vdev_attach_detach;
ztest_func_t ztest_vdev_LUN_growth;
ztest_func_t ztest_vdev_add_remove;
ztest_func_t ztest_vdev_class_add;
ztest_func_t ztest_vdev_aux_add_remove;
ztest_func_t ztest_split_pool;
ztest_func_t ztest_reguid;
ztest_func_t ztest_spa_upgrade;
ztest_func_t ztest_device_removal;
ztest_func_t ztest_spa_checkpoint_create_discard;
ztest_func_t ztest_initialize;
ztest_func_t ztest_trim;
ztest_func_t ztest_blake3;
ztest_func_t ztest_fletcher;
ztest_func_t ztest_fletcher_incr;
ztest_func_t ztest_verify_dnode_bt;

uint64_t zopt_always = 0ULL * NANOSEC;		/* all the time */
uint64_t zopt_incessant = 1ULL * NANOSEC / 10;	/* every 1/10 second */
uint64_t zopt_often = 1ULL * NANOSEC;		/* every second */
uint64_t zopt_sometimes = 10ULL * NANOSEC;	/* every 10 seconds */
uint64_t zopt_rarely = 60ULL * NANOSEC;		/* every 60 seconds */

#define	ZTI_INIT(func, iters, interval) \
	{   .zi_func = (func), \
	    .zi_iters = (iters), \
	    .zi_interval = (interval), \
	    .zi_funcname = # func }

ztest_info_t ztest_info[] = {
	ZTI_INIT(ztest_dmu_read_write, 1, &zopt_always),
	ZTI_INIT(ztest_dmu_write_parallel, 10, &zopt_always),
	ZTI_INIT(ztest_dmu_object_alloc_free, 1, &zopt_always),
	ZTI_INIT(ztest_dmu_object_next_chunk, 1, &zopt_sometimes),
	ZTI_INIT(ztest_dmu_commit_callbacks, 1, &zopt_always),
	ZTI_INIT(ztest_zap, 30, &zopt_always),
	ZTI_INIT(ztest_zap_parallel, 100, &zopt_always),
	ZTI_INIT(ztest_split_pool, 1, &zopt_always),
	ZTI_INIT(ztest_zil_commit, 1, &zopt_incessant),
	ZTI_INIT(ztest_zil_remount, 1, &zopt_sometimes),
	ZTI_INIT(ztest_dmu_read_write_zcopy, 1, &zopt_often),
	ZTI_INIT(ztest_dmu_objset_create_destroy, 1, &zopt_often),
	ZTI_INIT(ztest_dsl_prop_get_set, 1, &zopt_often),
	ZTI_INIT(ztest_spa_prop_get_set, 1, &zopt_sometimes),
#if 0
	ZTI_INIT(ztest_dmu_prealloc, 1, &zopt_sometimes),
#endif
	ZTI_INIT(ztest_fzap, 1, &zopt_sometimes),
	ZTI_INIT(ztest_dmu_snapshot_create_destroy, 1, &zopt_sometimes),
	ZTI_INIT(ztest_spa_create_destroy, 1, &zopt_sometimes),
	ZTI_INIT(ztest_fault_inject, 1, &zopt_sometimes),
	ZTI_INIT(ztest_dmu_snapshot_hold, 1, &zopt_sometimes),
	ZTI_INIT(ztest_mmp_enable_disable, 1, &zopt_sometimes),
	ZTI_INIT(ztest_reguid, 1, &zopt_rarely),
	ZTI_INIT(ztest_scrub, 1, &zopt_rarely),
	ZTI_INIT(ztest_spa_upgrade, 1, &zopt_rarely),
	ZTI_INIT(ztest_dsl_dataset_promote_busy, 1, &zopt_rarely),
	ZTI_INIT(ztest_vdev_attach_detach, 1, &zopt_sometimes),
	ZTI_INIT(ztest_vdev_LUN_growth, 1, &zopt_rarely),
	ZTI_INIT(ztest_vdev_add_remove, 1, &ztest_opts.zo_vdevtime),
	ZTI_INIT(ztest_vdev_class_add, 1, &ztest_opts.zo_vdevtime),
	ZTI_INIT(ztest_vdev_aux_add_remove, 1, &ztest_opts.zo_vdevtime),
	ZTI_INIT(ztest_device_removal, 1, &zopt_sometimes),
	ZTI_INIT(ztest_spa_checkpoint_create_discard, 1, &zopt_rarely),
	ZTI_INIT(ztest_initialize, 1, &zopt_sometimes),
	ZTI_INIT(ztest_trim, 1, &zopt_sometimes),
	ZTI_INIT(ztest_blake3, 1, &zopt_rarely),
	ZTI_INIT(ztest_fletcher, 1, &zopt_rarely),
	ZTI_INIT(ztest_fletcher_incr, 1, &zopt_rarely),
	ZTI_INIT(ztest_verify_dnode_bt, 1, &zopt_sometimes),
};

#define	ZTEST_FUNCS (sizeof (ztest_info) / sizeof (ztest_info_t))

/*
 * The following struct is used to hold a list of uncalled commit callbacks.
 * The callbacks are ordered by txg number.
 */
typedef struct ztest_cb_list {
	kmutex_t zcl_callbacks_lock;
	list_t zcl_callbacks;
} ztest_cb_list_t;

/*
 * Stuff we need to share writably between parent and child.
 */
typedef struct ztest_shared {
	boolean_t zs_do_init;
	hrtime_t zs_proc_start;
	hrtime_t zs_proc_stop;
	hrtime_t zs_thread_start;
	hrtime_t zs_thread_stop;
	hrtime_t zs_thread_kill;
	uint64_t zs_enospc_count;
	uint64_t zs_vdev_next_leaf;
	uint64_t zs_vdev_aux;
	uint64_t zs_alloc;
	uint64_t zs_space;
	uint64_t zs_splits;
	uint64_t zs_mirrors;
	uint64_t zs_metaslab_sz;
	uint64_t zs_metaslab_df_alloc_threshold;
	uint64_t zs_guid;
} ztest_shared_t;

#define	ID_PARALLEL	-1ULL

static char ztest_dev_template[] = "%s/%s.%llua";
static char ztest_aux_template[] = "%s/%s.%s.%llu";
ztest_shared_t *ztest_shared;

static spa_t *ztest_spa = NULL;
static ztest_ds_t *ztest_ds;

static kmutex_t ztest_vdev_lock;
static boolean_t ztest_device_removal_active = B_FALSE;
static boolean_t ztest_pool_scrubbed = B_FALSE;
static kmutex_t ztest_checkpoint_lock;

/*
 * The ztest_name_lock protects the pool and dataset namespace used by
 * the individual tests.  To modify the namespace, consumers must grab
 * this lock as writer.  Grabbing the lock as reader will ensure that the
 * namespace does not change while the lock is held.
 */
static pthread_rwlock_t ztest_name_lock;

static boolean_t ztest_dump_core = B_TRUE;
static boolean_t ztest_exiting;

/* Global commit callback list */
static ztest_cb_list_t zcl;
/* Commit cb delay */
static uint64_t zc_min_txg_delay = UINT64_MAX;
static int zc_cb_counter = 0;

/*
 * Minimum number of commit callbacks that need to be registered for us to
 * check whether the minimum txg delay is acceptable.
 */
#define	ZTEST_COMMIT_CB_MIN_REG	100

/*
 * If a number of txgs equal to this threshold have been created after a commit
 * callback has been registered but not called, then we assume there is an
 * implementation bug.
 */
#define	ZTEST_COMMIT_CB_THRESH	(TXG_CONCURRENT_STATES + 1000)

enum ztest_object {
	ZTEST_META_DNODE = 0,
	ZTEST_DIROBJ,
	ZTEST_OBJECTS
};

static __attribute__((noreturn)) void usage(boolean_t requested);
static int ztest_scrub_impl(spa_t *spa);

/*
 * These libumem hooks provide a reasonable set of defaults for the allocator's
 * debugging facilities.
 */
const char *
_umem_debug_init(void)
{
	return ("default,verbose"); /* $UMEM_DEBUG setting */
}

const char *
_umem_logging_init(void)
{
	return ("fail,contents"); /* $UMEM_LOGGING setting */
}

static void
dump_debug_buffer(void)
{
	ssize_t ret __attribute__((unused));

	if (!ztest_opts.zo_dump_dbgmsg)
		return;

	/*
	 * We use write() instead of printf() so that this function
	 * is safe to call from a signal handler.
	 */
	ret = write(STDOUT_FILENO, "\n", 1);
	zfs_dbgmsg_print("ztest");
}

#define	BACKTRACE_SZ	100

/*
 * On a fatal signal, dump a stack backtrace and the debug buffer before
 * re-raising the signal to produce a core dump.
 */
static void sig_handler(int signo)
{
	struct sigaction action;
#if (__GLIBC__ && !__UCLIBC__) /* backtrace() is a GNU extension */
	int nptrs;
	void *buffer[BACKTRACE_SZ];

	nptrs = backtrace(buffer, BACKTRACE_SZ);
	backtrace_symbols_fd(buffer, nptrs, STDERR_FILENO);
#endif
	dump_debug_buffer();

	/*
	 * Restore default action and re-raise signal so SIGSEGV and
	 * SIGABRT can trigger a core dump.
	 */
	action.sa_handler = SIG_DFL;
	sigemptyset(&action.sa_mask);
	action.sa_flags = 0;
	(void) sigaction(signo, &action, NULL);
	raise(signo);
}

#define	FATAL_MSG_SZ	1024

char *fatal_msg;

static __attribute__((format(printf, 2, 3))) __attribute__((noreturn)) void
fatal(int do_perror, char *message, ...)
{
	va_list args;
	int save_errno = errno;
	char *buf;

	(void) fflush(stdout);
	buf = umem_alloc(FATAL_MSG_SZ, UMEM_NOFAIL);
	if (buf == NULL)
		goto out;

	va_start(args, message);
	(void) sprintf(buf, "ztest: ");
	/* LINTED */
	(void) vsprintf(buf + strlen(buf), message, args);
	va_end(args);
	if (do_perror) {
		(void) snprintf(buf + strlen(buf), FATAL_MSG_SZ - strlen(buf),
		    ": %s", strerror(save_errno));
	}
	(void) fprintf(stderr, "%s\n", buf);
	fatal_msg = buf;			/* to ease debugging */

out:
	if (ztest_dump_core)
		abort();
	else
		dump_debug_buffer();

	exit(3);
}

/*
 * Convert a size suffix ('B', 'K', 'M', 'G', 'T', 'P', 'E', 'Z') into the
 * corresponding power-of-two shift (0, 10, 20, ...).
 */
static int
str2shift(const char *buf)
{
	const char *ends = "BKMGTPEZ";
	int i;

	if (buf[0] == '\0')
		return (0);
	for (i = 0; i < strlen(ends); i++) {
		if (toupper(buf[0]) == ends[i])
			break;
	}
	if (i == strlen(ends)) {
		(void) fprintf(stderr, "ztest: invalid bytes suffix: %s\n",
		    buf);
		usage(B_FALSE);
	}
	if (buf[1] == '\0' || (toupper(buf[1]) == 'B' && buf[2] == '\0')) {
		return (10*i);
	}
	(void) fprintf(stderr, "ztest: invalid bytes suffix: %s\n", buf);
	usage(B_FALSE);
}

/*
 * Convert a human-readable number with an optional power-of-two size
 * suffix (e.g. "300", "64K", "1.5G") into a uint64_t.
 */
static uint64_t
nicenumtoull(const char *buf)
{
	char *end;
	uint64_t val;

	val = strtoull(buf, &end, 0);
	if (end == buf) {
		(void) fprintf(stderr, "ztest: bad numeric value: %s\n", buf);
		usage(B_FALSE);
	} else if (end[0] == '.') {
		double fval = strtod(buf, &end);
		fval *= pow(2, str2shift(end));
		/*
		 * UINT64_MAX is not exactly representable as a double.
		 * The closest representation is UINT64_MAX + 1, so we
		 * use a >= comparison instead of > for the bounds check.
		 */
		if (fval >= (double)UINT64_MAX) {
			(void) fprintf(stderr, "ztest: value too large: %s\n",
			    buf);
			usage(B_FALSE);
		}
		val = (uint64_t)fval;
	} else {
		int shift = str2shift(end);
		if (shift >= 64 || (val << shift) >> shift != val) {
			(void) fprintf(stderr, "ztest: value too large: %s\n",
			    buf);
			usage(B_FALSE);
		}
		val <<= shift;
	}
	return (val);
}

typedef struct ztest_option {
	const char	short_opt;
	const char	*long_opt;
	const char	*long_opt_param;
	const char	*comment;
	unsigned int	default_int;
	char		*default_str;
} ztest_option_t;

/*
 * The following option_table is used for generating the usage info as well as
 * the long and short option information for calling getopt_long().
 */
static ztest_option_t option_table[] = {
	{ 'v',	"vdevs", "INTEGER", "Number of vdevs", DEFAULT_VDEV_COUNT,
	    NULL},
	{ 's',	"vdev-size", "INTEGER", "Size of each vdev",
	    NO_DEFAULT, DEFAULT_VDEV_SIZE_STR},
	{ 'a',	"alignment-shift", "INTEGER",
	    "Alignment shift; use 0 for random", DEFAULT_ASHIFT, NULL},
	{ 'm',	"mirror-copies", "INTEGER", "Number of mirror copies",
	    DEFAULT_MIRRORS, NULL},
	{ 'r',	"raid-disks", "INTEGER", "Number of raidz/draid disks",
	    DEFAULT_RAID_CHILDREN, NULL},
	{ 'R',	"raid-parity", "INTEGER", "Raid parity",
	    DEFAULT_RAID_PARITY, NULL},
	{ 'K',	"raid-kind", "raidz|draid|random", "Raid kind",
	    NO_DEFAULT, "random"},
	{ 'D',	"draid-data", "INTEGER", "Number of draid data drives",
	    DEFAULT_DRAID_DATA, NULL},
	{ 'S',	"draid-spares", "INTEGER", "Number of draid spares",
	    DEFAULT_DRAID_SPARES, NULL},
	{ 'd',	"datasets", "INTEGER", "Number of datasets",
	    DEFAULT_DATASETS_COUNT, NULL},
	{ 't',	"threads", "INTEGER", "Number of ztest threads",
	    DEFAULT_THREADS, NULL},
	{ 'g',	"gang-block-threshold", "INTEGER",
	    "Metaslab gang block threshold",
	    NO_DEFAULT, DEFAULT_FORCE_GANGING_STR},
	{ 'i',	"init-count", "INTEGER", "Number of times to initialize pool",
	    DEFAULT_INITS, NULL},
	{ 'k',	"kill-percentage", "INTEGER", "Kill percentage",
	    NO_DEFAULT, DEFAULT_KILLRATE_STR},
	{ 'p',	"pool-name", "STRING", "Pool name",
	    NO_DEFAULT, DEFAULT_POOL},
	{ 'f',	"vdev-file-directory", "PATH", "File directory for vdev files",
	    NO_DEFAULT, DEFAULT_VDEV_DIR},
	{ 'M',	"multi-host", NULL,
	    "Multi-host; simulate pool imported on remote host",
	    NO_DEFAULT, NULL},
	{ 'E',	"use-existing-pool", NULL,
	    "Use existing pool instead of creating new one", NO_DEFAULT, NULL},
	{ 'T',	"run-time", "INTEGER", "Total run time",
	    NO_DEFAULT, DEFAULT_RUN_TIME_STR},
	{ 'P',	"pass-time", "INTEGER", "Time per pass",
	    NO_DEFAULT, DEFAULT_PASS_TIME_STR},
	{ 'F',	"freeze-loops", "INTEGER", "Max loops in spa_freeze()",
	    DEFAULT_MAX_LOOPS, NULL},
	{ 'B',	"alt-ztest", "PATH", "Alternate ztest path",
	    NO_DEFAULT, NULL},
	{ 'C',	"vdev-class-state", "on|off|random", "vdev class state",
	    NO_DEFAULT, "random"},
	{ 'o',	"option", "\"OPTION=INTEGER\"",
	    "Set global variable to an unsigned 32-bit integer value",
	    NO_DEFAULT, NULL},
	{ 'G',	"dump-debug-msg", NULL,
	    "Dump zfs_dbgmsg buffer before exiting due to an error",
	    NO_DEFAULT, NULL},
	{ 'V',	"verbose", NULL,
	    "Verbose (use multiple times for ever more verbosity)",
	    NO_DEFAULT, NULL},
	{ 'h',	"help", NULL, "Show this help",
	    NO_DEFAULT, NULL},
	{0, 0, 0, 0, 0, 0}
};

static struct option *long_opts = NULL;
static char *short_opts = NULL;

static void
init_options(void)
{
	ASSERT3P(long_opts, ==, NULL);
	ASSERT3P(short_opts, ==, NULL);

	int count = sizeof (option_table) / sizeof (option_table[0]);
	long_opts = umem_alloc(sizeof (struct option) * count, UMEM_NOFAIL);

	short_opts = umem_alloc(sizeof (char) * 2 * count, UMEM_NOFAIL);
	int short_opt_index = 0;

	for (int i = 0; i < count; i++) {
		long_opts[i].val = option_table[i].short_opt;
		long_opts[i].name = option_table[i].long_opt;
		long_opts[i].has_arg = option_table[i].long_opt_param != NULL
		    ?
required_argument : no_argument; 817 long_opts[i].flag = NULL; 818 short_opts[short_opt_index++] = option_table[i].short_opt; 819 if (option_table[i].long_opt_param != NULL) { 820 short_opts[short_opt_index++] = ':'; 821 } 822 } 823 } 824 825 static void 826 fini_options(void) 827 { 828 int count = sizeof (option_table) / sizeof (option_table[0]); 829 830 umem_free(long_opts, sizeof (struct option) * count); 831 umem_free(short_opts, sizeof (char) * 2 * count); 832 833 long_opts = NULL; 834 short_opts = NULL; 835 } 836 837 static __attribute__((noreturn)) void 838 usage(boolean_t requested) 839 { 840 char option[80]; 841 FILE *fp = requested ? stdout : stderr; 842 843 (void) fprintf(fp, "Usage: %s [OPTIONS...]\n", DEFAULT_POOL); 844 for (int i = 0; option_table[i].short_opt != 0; i++) { 845 if (option_table[i].long_opt_param != NULL) { 846 (void) sprintf(option, " -%c --%s=%s", 847 option_table[i].short_opt, 848 option_table[i].long_opt, 849 option_table[i].long_opt_param); 850 } else { 851 (void) sprintf(option, " -%c --%s", 852 option_table[i].short_opt, 853 option_table[i].long_opt); 854 } 855 (void) fprintf(fp, " %-40s%s", option, 856 option_table[i].comment); 857 858 if (option_table[i].long_opt_param != NULL) { 859 if (option_table[i].default_str != NULL) { 860 (void) fprintf(fp, " (default: %s)", 861 option_table[i].default_str); 862 } else if (option_table[i].default_int != NO_DEFAULT) { 863 (void) fprintf(fp, " (default: %u)", 864 option_table[i].default_int); 865 } 866 } 867 (void) fprintf(fp, "\n"); 868 } 869 exit(requested ? 0 : 1); 870 } 871 872 static uint64_t 873 ztest_random(uint64_t range) 874 { 875 uint64_t r; 876 877 ASSERT3S(ztest_fd_rand, >=, 0); 878 879 if (range == 0) 880 return (0); 881 882 if (read(ztest_fd_rand, &r, sizeof (r)) != sizeof (r)) 883 fatal(B_TRUE, "short read from /dev/urandom"); 884 885 return (r % range); 886 } 887 888 static void 889 ztest_parse_name_value(const char *input, ztest_shared_opts_t *zo) 890 { 891 char name[32]; 892 char *value; 893 int state = ZTEST_VDEV_CLASS_RND; 894 895 (void) strlcpy(name, input, sizeof (name)); 896 897 value = strchr(name, '='); 898 if (value == NULL) { 899 (void) fprintf(stderr, "missing value in property=value " 900 "'-C' argument (%s)\n", input); 901 usage(B_FALSE); 902 } 903 *(value) = '\0'; 904 value++; 905 906 if (strcmp(value, "on") == 0) { 907 state = ZTEST_VDEV_CLASS_ON; 908 } else if (strcmp(value, "off") == 0) { 909 state = ZTEST_VDEV_CLASS_OFF; 910 } else if (strcmp(value, "random") == 0) { 911 state = ZTEST_VDEV_CLASS_RND; 912 } else { 913 (void) fprintf(stderr, "invalid property value '%s'\n", value); 914 usage(B_FALSE); 915 } 916 917 if (strcmp(name, "special") == 0) { 918 zo->zo_special_vdevs = state; 919 } else { 920 (void) fprintf(stderr, "invalid property name '%s'\n", name); 921 usage(B_FALSE); 922 } 923 if (zo->zo_verbose >= 3) 924 (void) printf("%s vdev state is '%s'\n", name, value); 925 } 926 927 static void 928 process_options(int argc, char **argv) 929 { 930 char *path; 931 ztest_shared_opts_t *zo = &ztest_opts; 932 933 int opt; 934 uint64_t value; 935 const char *raid_kind = "random"; 936 937 memcpy(zo, &ztest_opts_defaults, sizeof (*zo)); 938 939 init_options(); 940 941 while ((opt = getopt_long(argc, argv, short_opts, long_opts, 942 NULL)) != EOF) { 943 value = 0; 944 switch (opt) { 945 case 'v': 946 case 's': 947 case 'a': 948 case 'm': 949 case 'r': 950 case 'R': 951 case 'D': 952 case 'S': 953 case 'd': 954 case 't': 955 case 'g': 956 case 'i': 957 case 'k': 958 case 'T': 959 case 
'P': 960 case 'F': 961 value = nicenumtoull(optarg); 962 } 963 switch (opt) { 964 case 'v': 965 zo->zo_vdevs = value; 966 break; 967 case 's': 968 zo->zo_vdev_size = MAX(SPA_MINDEVSIZE, value); 969 break; 970 case 'a': 971 zo->zo_ashift = value; 972 break; 973 case 'm': 974 zo->zo_mirrors = value; 975 break; 976 case 'r': 977 zo->zo_raid_children = MAX(1, value); 978 break; 979 case 'R': 980 zo->zo_raid_parity = MIN(MAX(value, 1), 3); 981 break; 982 case 'K': 983 raid_kind = optarg; 984 break; 985 case 'D': 986 zo->zo_draid_data = MAX(1, value); 987 break; 988 case 'S': 989 zo->zo_draid_spares = MAX(1, value); 990 break; 991 case 'd': 992 zo->zo_datasets = MAX(1, value); 993 break; 994 case 't': 995 zo->zo_threads = MAX(1, value); 996 break; 997 case 'g': 998 zo->zo_metaslab_force_ganging = 999 MAX(SPA_MINBLOCKSIZE << 1, value); 1000 break; 1001 case 'i': 1002 zo->zo_init = value; 1003 break; 1004 case 'k': 1005 zo->zo_killrate = value; 1006 break; 1007 case 'p': 1008 (void) strlcpy(zo->zo_pool, optarg, 1009 sizeof (zo->zo_pool)); 1010 break; 1011 case 'f': 1012 path = realpath(optarg, NULL); 1013 if (path == NULL) { 1014 (void) fprintf(stderr, "error: %s: %s\n", 1015 optarg, strerror(errno)); 1016 usage(B_FALSE); 1017 } else { 1018 (void) strlcpy(zo->zo_dir, path, 1019 sizeof (zo->zo_dir)); 1020 free(path); 1021 } 1022 break; 1023 case 'M': 1024 zo->zo_mmp_test = 1; 1025 break; 1026 case 'V': 1027 zo->zo_verbose++; 1028 break; 1029 case 'E': 1030 zo->zo_init = 0; 1031 break; 1032 case 'T': 1033 zo->zo_time = value; 1034 break; 1035 case 'P': 1036 zo->zo_passtime = MAX(1, value); 1037 break; 1038 case 'F': 1039 zo->zo_maxloops = MAX(1, value); 1040 break; 1041 case 'B': 1042 (void) strlcpy(zo->zo_alt_ztest, optarg, 1043 sizeof (zo->zo_alt_ztest)); 1044 break; 1045 case 'C': 1046 ztest_parse_name_value(optarg, zo); 1047 break; 1048 case 'o': 1049 if (zo->zo_gvars_count >= ZO_GVARS_MAX_COUNT) { 1050 (void) fprintf(stderr, 1051 "max global var count (%zu) exceeded\n", 1052 ZO_GVARS_MAX_COUNT); 1053 usage(B_FALSE); 1054 } 1055 char *v = zo->zo_gvars[zo->zo_gvars_count]; 1056 if (strlcpy(v, optarg, ZO_GVARS_MAX_ARGLEN) >= 1057 ZO_GVARS_MAX_ARGLEN) { 1058 (void) fprintf(stderr, 1059 "global var option '%s' is too long\n", 1060 optarg); 1061 usage(B_FALSE); 1062 } 1063 zo->zo_gvars_count++; 1064 break; 1065 case 'G': 1066 zo->zo_dump_dbgmsg = 1; 1067 break; 1068 case 'h': 1069 usage(B_TRUE); 1070 break; 1071 case '?': 1072 default: 1073 usage(B_FALSE); 1074 break; 1075 } 1076 } 1077 1078 fini_options(); 1079 1080 /* When raid choice is 'random' add a draid pool 50% of the time */ 1081 if (strcmp(raid_kind, "random") == 0) { 1082 raid_kind = (ztest_random(2) == 0) ? "draid" : "raidz"; 1083 1084 if (ztest_opts.zo_verbose >= 3) 1085 (void) printf("choosing RAID type '%s'\n", raid_kind); 1086 } 1087 1088 if (strcmp(raid_kind, "draid") == 0) { 1089 uint64_t min_devsize; 1090 1091 /* With fewer disk use 256M, otherwise 128M is OK */ 1092 min_devsize = (ztest_opts.zo_raid_children < 16) ? 
1093 (256ULL << 20) : (128ULL << 20); 1094 1095 /* No top-level mirrors with dRAID for now */ 1096 zo->zo_mirrors = 0; 1097 1098 /* Use more appropriate defaults for dRAID */ 1099 if (zo->zo_vdevs == ztest_opts_defaults.zo_vdevs) 1100 zo->zo_vdevs = 1; 1101 if (zo->zo_raid_children == 1102 ztest_opts_defaults.zo_raid_children) 1103 zo->zo_raid_children = 16; 1104 if (zo->zo_ashift < 12) 1105 zo->zo_ashift = 12; 1106 if (zo->zo_vdev_size < min_devsize) 1107 zo->zo_vdev_size = min_devsize; 1108 1109 if (zo->zo_draid_data + zo->zo_raid_parity > 1110 zo->zo_raid_children - zo->zo_draid_spares) { 1111 (void) fprintf(stderr, "error: too few draid " 1112 "children (%d) for stripe width (%d)\n", 1113 zo->zo_raid_children, 1114 zo->zo_draid_data + zo->zo_raid_parity); 1115 usage(B_FALSE); 1116 } 1117 1118 (void) strlcpy(zo->zo_raid_type, VDEV_TYPE_DRAID, 1119 sizeof (zo->zo_raid_type)); 1120 1121 } else /* using raidz */ { 1122 ASSERT0(strcmp(raid_kind, "raidz")); 1123 1124 zo->zo_raid_parity = MIN(zo->zo_raid_parity, 1125 zo->zo_raid_children - 1); 1126 } 1127 1128 zo->zo_vdevtime = 1129 (zo->zo_vdevs > 0 ? zo->zo_time * NANOSEC / zo->zo_vdevs : 1130 UINT64_MAX >> 2); 1131 1132 if (*zo->zo_alt_ztest) { 1133 const char *invalid_what = "ztest"; 1134 char *val = zo->zo_alt_ztest; 1135 if (0 != access(val, X_OK) || 1136 (strrchr(val, '/') == NULL && (errno = EINVAL))) 1137 goto invalid; 1138 1139 int dirlen = strrchr(val, '/') - val; 1140 strncpy(zo->zo_alt_libpath, val, dirlen); 1141 invalid_what = "library path", val = zo->zo_alt_libpath; 1142 if (strrchr(val, '/') == NULL && (errno = EINVAL)) 1143 goto invalid; 1144 *strrchr(val, '/') = '\0'; 1145 strlcat(val, "/lib", sizeof (zo->zo_alt_libpath)); 1146 1147 if (0 != access(zo->zo_alt_libpath, X_OK)) 1148 goto invalid; 1149 return; 1150 1151 invalid: 1152 ztest_dump_core = B_FALSE; 1153 fatal(B_TRUE, "invalid alternate %s %s", invalid_what, val); 1154 } 1155 } 1156 1157 static void 1158 ztest_kill(ztest_shared_t *zs) 1159 { 1160 zs->zs_alloc = metaslab_class_get_alloc(spa_normal_class(ztest_spa)); 1161 zs->zs_space = metaslab_class_get_space(spa_normal_class(ztest_spa)); 1162 1163 /* 1164 * Before we kill ourselves, make sure that the config is updated. 1165 * See comment above spa_write_cachefile(). 
1166 */ 1167 mutex_enter(&spa_namespace_lock); 1168 spa_write_cachefile(ztest_spa, B_FALSE, B_FALSE); 1169 mutex_exit(&spa_namespace_lock); 1170 1171 (void) raise(SIGKILL); 1172 } 1173 1174 static void 1175 ztest_record_enospc(const char *s) 1176 { 1177 (void) s; 1178 ztest_shared->zs_enospc_count++; 1179 } 1180 1181 static uint64_t 1182 ztest_get_ashift(void) 1183 { 1184 if (ztest_opts.zo_ashift == 0) 1185 return (SPA_MINBLOCKSHIFT + ztest_random(5)); 1186 return (ztest_opts.zo_ashift); 1187 } 1188 1189 static boolean_t 1190 ztest_is_draid_spare(const char *name) 1191 { 1192 uint64_t spare_id = 0, parity = 0, vdev_id = 0; 1193 1194 if (sscanf(name, VDEV_TYPE_DRAID "%"PRIu64"-%"PRIu64"-%"PRIu64"", 1195 &parity, &vdev_id, &spare_id) == 3) { 1196 return (B_TRUE); 1197 } 1198 1199 return (B_FALSE); 1200 } 1201 1202 static nvlist_t * 1203 make_vdev_file(char *path, char *aux, char *pool, size_t size, uint64_t ashift) 1204 { 1205 char *pathbuf; 1206 uint64_t vdev; 1207 nvlist_t *file; 1208 boolean_t draid_spare = B_FALSE; 1209 1210 pathbuf = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); 1211 1212 if (ashift == 0) 1213 ashift = ztest_get_ashift(); 1214 1215 if (path == NULL) { 1216 path = pathbuf; 1217 1218 if (aux != NULL) { 1219 vdev = ztest_shared->zs_vdev_aux; 1220 (void) snprintf(path, MAXPATHLEN, 1221 ztest_aux_template, ztest_opts.zo_dir, 1222 pool == NULL ? ztest_opts.zo_pool : pool, 1223 aux, vdev); 1224 } else { 1225 vdev = ztest_shared->zs_vdev_next_leaf++; 1226 (void) snprintf(path, MAXPATHLEN, 1227 ztest_dev_template, ztest_opts.zo_dir, 1228 pool == NULL ? ztest_opts.zo_pool : pool, vdev); 1229 } 1230 } else { 1231 draid_spare = ztest_is_draid_spare(path); 1232 } 1233 1234 if (size != 0 && !draid_spare) { 1235 int fd = open(path, O_RDWR | O_CREAT | O_TRUNC, 0666); 1236 if (fd == -1) 1237 fatal(B_TRUE, "can't open %s", path); 1238 if (ftruncate(fd, size) != 0) 1239 fatal(B_TRUE, "can't ftruncate %s", path); 1240 (void) close(fd); 1241 } 1242 1243 file = fnvlist_alloc(); 1244 fnvlist_add_string(file, ZPOOL_CONFIG_TYPE, 1245 draid_spare ? VDEV_TYPE_DRAID_SPARE : VDEV_TYPE_FILE); 1246 fnvlist_add_string(file, ZPOOL_CONFIG_PATH, path); 1247 fnvlist_add_uint64(file, ZPOOL_CONFIG_ASHIFT, ashift); 1248 umem_free(pathbuf, MAXPATHLEN); 1249 1250 return (file); 1251 } 1252 1253 static nvlist_t * 1254 make_vdev_raid(char *path, char *aux, char *pool, size_t size, 1255 uint64_t ashift, int r) 1256 { 1257 nvlist_t *raid, **child; 1258 int c; 1259 1260 if (r < 2) 1261 return (make_vdev_file(path, aux, pool, size, ashift)); 1262 child = umem_alloc(r * sizeof (nvlist_t *), UMEM_NOFAIL); 1263 1264 for (c = 0; c < r; c++) 1265 child[c] = make_vdev_file(path, aux, pool, size, ashift); 1266 1267 raid = fnvlist_alloc(); 1268 fnvlist_add_string(raid, ZPOOL_CONFIG_TYPE, 1269 ztest_opts.zo_raid_type); 1270 fnvlist_add_uint64(raid, ZPOOL_CONFIG_NPARITY, 1271 ztest_opts.zo_raid_parity); 1272 fnvlist_add_nvlist_array(raid, ZPOOL_CONFIG_CHILDREN, 1273 (const nvlist_t **)child, r); 1274 1275 if (strcmp(ztest_opts.zo_raid_type, VDEV_TYPE_DRAID) == 0) { 1276 uint64_t ndata = ztest_opts.zo_draid_data; 1277 uint64_t nparity = ztest_opts.zo_raid_parity; 1278 uint64_t nspares = ztest_opts.zo_draid_spares; 1279 uint64_t children = ztest_opts.zo_raid_children; 1280 uint64_t ngroups = 1; 1281 1282 /* 1283 * Calculate the minimum number of groups required to fill a 1284 * slice. This is the LCM of the stripe width (data + parity) 1285 * and the number of data drives (children - spares). 
1286 */ 1287 while (ngroups * (ndata + nparity) % (children - nspares) != 0) 1288 ngroups++; 1289 1290 /* Store the basic dRAID configuration. */ 1291 fnvlist_add_uint64(raid, ZPOOL_CONFIG_DRAID_NDATA, ndata); 1292 fnvlist_add_uint64(raid, ZPOOL_CONFIG_DRAID_NSPARES, nspares); 1293 fnvlist_add_uint64(raid, ZPOOL_CONFIG_DRAID_NGROUPS, ngroups); 1294 } 1295 1296 for (c = 0; c < r; c++) 1297 fnvlist_free(child[c]); 1298 1299 umem_free(child, r * sizeof (nvlist_t *)); 1300 1301 return (raid); 1302 } 1303 1304 static nvlist_t * 1305 make_vdev_mirror(char *path, char *aux, char *pool, size_t size, 1306 uint64_t ashift, int r, int m) 1307 { 1308 nvlist_t *mirror, **child; 1309 int c; 1310 1311 if (m < 1) 1312 return (make_vdev_raid(path, aux, pool, size, ashift, r)); 1313 1314 child = umem_alloc(m * sizeof (nvlist_t *), UMEM_NOFAIL); 1315 1316 for (c = 0; c < m; c++) 1317 child[c] = make_vdev_raid(path, aux, pool, size, ashift, r); 1318 1319 mirror = fnvlist_alloc(); 1320 fnvlist_add_string(mirror, ZPOOL_CONFIG_TYPE, VDEV_TYPE_MIRROR); 1321 fnvlist_add_nvlist_array(mirror, ZPOOL_CONFIG_CHILDREN, 1322 (const nvlist_t **)child, m); 1323 1324 for (c = 0; c < m; c++) 1325 fnvlist_free(child[c]); 1326 1327 umem_free(child, m * sizeof (nvlist_t *)); 1328 1329 return (mirror); 1330 } 1331 1332 static nvlist_t * 1333 make_vdev_root(char *path, char *aux, char *pool, size_t size, uint64_t ashift, 1334 const char *class, int r, int m, int t) 1335 { 1336 nvlist_t *root, **child; 1337 int c; 1338 boolean_t log; 1339 1340 ASSERT3S(t, >, 0); 1341 1342 log = (class != NULL && strcmp(class, "log") == 0); 1343 1344 child = umem_alloc(t * sizeof (nvlist_t *), UMEM_NOFAIL); 1345 1346 for (c = 0; c < t; c++) { 1347 child[c] = make_vdev_mirror(path, aux, pool, size, ashift, 1348 r, m); 1349 fnvlist_add_uint64(child[c], ZPOOL_CONFIG_IS_LOG, log); 1350 1351 if (class != NULL && class[0] != '\0') { 1352 ASSERT(m > 1 || log); /* expecting a mirror */ 1353 fnvlist_add_string(child[c], 1354 ZPOOL_CONFIG_ALLOCATION_BIAS, class); 1355 } 1356 } 1357 1358 root = fnvlist_alloc(); 1359 fnvlist_add_string(root, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT); 1360 fnvlist_add_nvlist_array(root, aux ? aux : ZPOOL_CONFIG_CHILDREN, 1361 (const nvlist_t **)child, t); 1362 1363 for (c = 0; c < t; c++) 1364 fnvlist_free(child[c]); 1365 1366 umem_free(child, t * sizeof (nvlist_t *)); 1367 1368 return (root); 1369 } 1370 1371 /* 1372 * Find a random spa version. Returns back a random spa version in the 1373 * range [initial_version, SPA_VERSION_FEATURES]. 1374 */ 1375 static uint64_t 1376 ztest_random_spa_version(uint64_t initial_version) 1377 { 1378 uint64_t version = initial_version; 1379 1380 if (version <= SPA_VERSION_BEFORE_FEATURES) { 1381 version = version + 1382 ztest_random(SPA_VERSION_BEFORE_FEATURES - version + 1); 1383 } 1384 1385 if (version > SPA_VERSION_BEFORE_FEATURES) 1386 version = SPA_VERSION_FEATURES; 1387 1388 ASSERT(SPA_VERSION_IS_SUPPORTED(version)); 1389 return (version); 1390 } 1391 1392 static int 1393 ztest_random_blocksize(void) 1394 { 1395 ASSERT3U(ztest_spa->spa_max_ashift, !=, 0); 1396 1397 /* 1398 * Choose a block size >= the ashift. 1399 * If the SPA supports new MAXBLOCKSIZE, test up to 1MB blocks. 
1400 */ 1401 int maxbs = SPA_OLD_MAXBLOCKSHIFT; 1402 if (spa_maxblocksize(ztest_spa) == SPA_MAXBLOCKSIZE) 1403 maxbs = 20; 1404 uint64_t block_shift = 1405 ztest_random(maxbs - ztest_spa->spa_max_ashift + 1); 1406 return (1 << (SPA_MINBLOCKSHIFT + block_shift)); 1407 } 1408 1409 static int 1410 ztest_random_dnodesize(void) 1411 { 1412 int slots; 1413 int max_slots = spa_maxdnodesize(ztest_spa) >> DNODE_SHIFT; 1414 1415 if (max_slots == DNODE_MIN_SLOTS) 1416 return (DNODE_MIN_SIZE); 1417 1418 /* 1419 * Weight the random distribution more heavily toward smaller 1420 * dnode sizes since that is more likely to reflect real-world 1421 * usage. 1422 */ 1423 ASSERT3U(max_slots, >, 4); 1424 switch (ztest_random(10)) { 1425 case 0: 1426 slots = 5 + ztest_random(max_slots - 4); 1427 break; 1428 case 1 ... 4: 1429 slots = 2 + ztest_random(3); 1430 break; 1431 default: 1432 slots = 1; 1433 break; 1434 } 1435 1436 return (slots << DNODE_SHIFT); 1437 } 1438 1439 static int 1440 ztest_random_ibshift(void) 1441 { 1442 return (DN_MIN_INDBLKSHIFT + 1443 ztest_random(DN_MAX_INDBLKSHIFT - DN_MIN_INDBLKSHIFT + 1)); 1444 } 1445 1446 static uint64_t 1447 ztest_random_vdev_top(spa_t *spa, boolean_t log_ok) 1448 { 1449 uint64_t top; 1450 vdev_t *rvd = spa->spa_root_vdev; 1451 vdev_t *tvd; 1452 1453 ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); 1454 1455 do { 1456 top = ztest_random(rvd->vdev_children); 1457 tvd = rvd->vdev_child[top]; 1458 } while (!vdev_is_concrete(tvd) || (tvd->vdev_islog && !log_ok) || 1459 tvd->vdev_mg == NULL || tvd->vdev_mg->mg_class == NULL); 1460 1461 return (top); 1462 } 1463 1464 static uint64_t 1465 ztest_random_dsl_prop(zfs_prop_t prop) 1466 { 1467 uint64_t value; 1468 1469 do { 1470 value = zfs_prop_random_value(prop, ztest_random(-1ULL)); 1471 } while (prop == ZFS_PROP_CHECKSUM && value == ZIO_CHECKSUM_OFF); 1472 1473 return (value); 1474 } 1475 1476 static int 1477 ztest_dsl_prop_set_uint64(char *osname, zfs_prop_t prop, uint64_t value, 1478 boolean_t inherit) 1479 { 1480 const char *propname = zfs_prop_to_name(prop); 1481 const char *valname; 1482 char *setpoint; 1483 uint64_t curval; 1484 int error; 1485 1486 error = dsl_prop_set_int(osname, propname, 1487 (inherit ? 
ZPROP_SRC_NONE : ZPROP_SRC_LOCAL), value); 1488 1489 if (error == ENOSPC) { 1490 ztest_record_enospc(FTAG); 1491 return (error); 1492 } 1493 ASSERT0(error); 1494 1495 setpoint = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); 1496 VERIFY0(dsl_prop_get_integer(osname, propname, &curval, setpoint)); 1497 1498 if (ztest_opts.zo_verbose >= 6) { 1499 int err; 1500 1501 err = zfs_prop_index_to_string(prop, curval, &valname); 1502 if (err) 1503 (void) printf("%s %s = %llu at '%s'\n", osname, 1504 propname, (unsigned long long)curval, setpoint); 1505 else 1506 (void) printf("%s %s = %s at '%s'\n", 1507 osname, propname, valname, setpoint); 1508 } 1509 umem_free(setpoint, MAXPATHLEN); 1510 1511 return (error); 1512 } 1513 1514 static int 1515 ztest_spa_prop_set_uint64(zpool_prop_t prop, uint64_t value) 1516 { 1517 spa_t *spa = ztest_spa; 1518 nvlist_t *props = NULL; 1519 int error; 1520 1521 props = fnvlist_alloc(); 1522 fnvlist_add_uint64(props, zpool_prop_to_name(prop), value); 1523 1524 error = spa_prop_set(spa, props); 1525 1526 fnvlist_free(props); 1527 1528 if (error == ENOSPC) { 1529 ztest_record_enospc(FTAG); 1530 return (error); 1531 } 1532 ASSERT0(error); 1533 1534 return (error); 1535 } 1536 1537 static int 1538 ztest_dmu_objset_own(const char *name, dmu_objset_type_t type, 1539 boolean_t readonly, boolean_t decrypt, void *tag, objset_t **osp) 1540 { 1541 int err; 1542 char *cp = NULL; 1543 char ddname[ZFS_MAX_DATASET_NAME_LEN]; 1544 1545 strcpy(ddname, name); 1546 cp = strchr(ddname, '@'); 1547 if (cp != NULL) 1548 *cp = '\0'; 1549 1550 err = dmu_objset_own(name, type, readonly, decrypt, tag, osp); 1551 while (decrypt && err == EACCES) { 1552 dsl_crypto_params_t *dcp; 1553 nvlist_t *crypto_args = fnvlist_alloc(); 1554 1555 fnvlist_add_uint8_array(crypto_args, "wkeydata", 1556 (uint8_t *)ztest_wkeydata, WRAPPING_KEY_LEN); 1557 VERIFY0(dsl_crypto_params_create_nvlist(DCP_CMD_NONE, NULL, 1558 crypto_args, &dcp)); 1559 err = spa_keystore_load_wkey(ddname, dcp, B_FALSE); 1560 /* 1561 * Note: if there was an error loading, the wkey was not 1562 * consumed, and needs to be freed. 1563 */ 1564 dsl_crypto_params_free(dcp, (err != 0)); 1565 fnvlist_free(crypto_args); 1566 1567 if (err == EINVAL) { 1568 /* 1569 * We couldn't load a key for this dataset so try 1570 * the parent. This loop will eventually hit the 1571 * encryption root since ztest only makes clones 1572 * as children of their origin datasets. 
1573 */ 1574 cp = strrchr(ddname, '/'); 1575 if (cp == NULL) 1576 return (err); 1577 1578 *cp = '\0'; 1579 err = EACCES; 1580 continue; 1581 } else if (err != 0) { 1582 break; 1583 } 1584 1585 err = dmu_objset_own(name, type, readonly, decrypt, tag, osp); 1586 break; 1587 } 1588 1589 return (err); 1590 } 1591 1592 static void 1593 ztest_rll_init(rll_t *rll) 1594 { 1595 rll->rll_writer = NULL; 1596 rll->rll_readers = 0; 1597 mutex_init(&rll->rll_lock, NULL, MUTEX_DEFAULT, NULL); 1598 cv_init(&rll->rll_cv, NULL, CV_DEFAULT, NULL); 1599 } 1600 1601 static void 1602 ztest_rll_destroy(rll_t *rll) 1603 { 1604 ASSERT3P(rll->rll_writer, ==, NULL); 1605 ASSERT0(rll->rll_readers); 1606 mutex_destroy(&rll->rll_lock); 1607 cv_destroy(&rll->rll_cv); 1608 } 1609 1610 static void 1611 ztest_rll_lock(rll_t *rll, rl_type_t type) 1612 { 1613 mutex_enter(&rll->rll_lock); 1614 1615 if (type == RL_READER) { 1616 while (rll->rll_writer != NULL) 1617 (void) cv_wait(&rll->rll_cv, &rll->rll_lock); 1618 rll->rll_readers++; 1619 } else { 1620 while (rll->rll_writer != NULL || rll->rll_readers) 1621 (void) cv_wait(&rll->rll_cv, &rll->rll_lock); 1622 rll->rll_writer = curthread; 1623 } 1624 1625 mutex_exit(&rll->rll_lock); 1626 } 1627 1628 static void 1629 ztest_rll_unlock(rll_t *rll) 1630 { 1631 mutex_enter(&rll->rll_lock); 1632 1633 if (rll->rll_writer) { 1634 ASSERT0(rll->rll_readers); 1635 rll->rll_writer = NULL; 1636 } else { 1637 ASSERT3S(rll->rll_readers, >, 0); 1638 ASSERT3P(rll->rll_writer, ==, NULL); 1639 rll->rll_readers--; 1640 } 1641 1642 if (rll->rll_writer == NULL && rll->rll_readers == 0) 1643 cv_broadcast(&rll->rll_cv); 1644 1645 mutex_exit(&rll->rll_lock); 1646 } 1647 1648 static void 1649 ztest_object_lock(ztest_ds_t *zd, uint64_t object, rl_type_t type) 1650 { 1651 rll_t *rll = &zd->zd_object_lock[object & (ZTEST_OBJECT_LOCKS - 1)]; 1652 1653 ztest_rll_lock(rll, type); 1654 } 1655 1656 static void 1657 ztest_object_unlock(ztest_ds_t *zd, uint64_t object) 1658 { 1659 rll_t *rll = &zd->zd_object_lock[object & (ZTEST_OBJECT_LOCKS - 1)]; 1660 1661 ztest_rll_unlock(rll); 1662 } 1663 1664 static rl_t * 1665 ztest_range_lock(ztest_ds_t *zd, uint64_t object, uint64_t offset, 1666 uint64_t size, rl_type_t type) 1667 { 1668 uint64_t hash = object ^ (offset % (ZTEST_RANGE_LOCKS + 1)); 1669 rll_t *rll = &zd->zd_range_lock[hash & (ZTEST_RANGE_LOCKS - 1)]; 1670 rl_t *rl; 1671 1672 rl = umem_alloc(sizeof (*rl), UMEM_NOFAIL); 1673 rl->rl_object = object; 1674 rl->rl_offset = offset; 1675 rl->rl_size = size; 1676 rl->rl_lock = rll; 1677 1678 ztest_rll_lock(rll, type); 1679 1680 return (rl); 1681 } 1682 1683 static void 1684 ztest_range_unlock(rl_t *rl) 1685 { 1686 rll_t *rll = rl->rl_lock; 1687 1688 ztest_rll_unlock(rll); 1689 1690 umem_free(rl, sizeof (*rl)); 1691 } 1692 1693 static void 1694 ztest_zd_init(ztest_ds_t *zd, ztest_shared_ds_t *szd, objset_t *os) 1695 { 1696 zd->zd_os = os; 1697 zd->zd_zilog = dmu_objset_zil(os); 1698 zd->zd_shared = szd; 1699 dmu_objset_name(os, zd->zd_name); 1700 int l; 1701 1702 if (zd->zd_shared != NULL) 1703 zd->zd_shared->zd_seq = 0; 1704 1705 VERIFY0(pthread_rwlock_init(&zd->zd_zilog_lock, NULL)); 1706 mutex_init(&zd->zd_dirobj_lock, NULL, MUTEX_DEFAULT, NULL); 1707 1708 for (l = 0; l < ZTEST_OBJECT_LOCKS; l++) 1709 ztest_rll_init(&zd->zd_object_lock[l]); 1710 1711 for (l = 0; l < ZTEST_RANGE_LOCKS; l++) 1712 ztest_rll_init(&zd->zd_range_lock[l]); 1713 } 1714 1715 static void 1716 ztest_zd_fini(ztest_ds_t *zd) 1717 { 1718 int l; 1719 1720 mutex_destroy(&zd->zd_dirobj_lock); 
1721 (void) pthread_rwlock_destroy(&zd->zd_zilog_lock); 1722 1723 for (l = 0; l < ZTEST_OBJECT_LOCKS; l++) 1724 ztest_rll_destroy(&zd->zd_object_lock[l]); 1725 1726 for (l = 0; l < ZTEST_RANGE_LOCKS; l++) 1727 ztest_rll_destroy(&zd->zd_range_lock[l]); 1728 } 1729 1730 #define TXG_MIGHTWAIT (ztest_random(10) == 0 ? TXG_NOWAIT : TXG_WAIT) 1731 1732 static uint64_t 1733 ztest_tx_assign(dmu_tx_t *tx, uint64_t txg_how, const char *tag) 1734 { 1735 uint64_t txg; 1736 int error; 1737 1738 /* 1739 * Attempt to assign tx to some transaction group. 1740 */ 1741 error = dmu_tx_assign(tx, txg_how); 1742 if (error) { 1743 if (error == ERESTART) { 1744 ASSERT3U(txg_how, ==, TXG_NOWAIT); 1745 dmu_tx_wait(tx); 1746 } else { 1747 ASSERT3U(error, ==, ENOSPC); 1748 ztest_record_enospc(tag); 1749 } 1750 dmu_tx_abort(tx); 1751 return (0); 1752 } 1753 txg = dmu_tx_get_txg(tx); 1754 ASSERT3U(txg, !=, 0); 1755 return (txg); 1756 } 1757 1758 static void 1759 ztest_bt_generate(ztest_block_tag_t *bt, objset_t *os, uint64_t object, 1760 uint64_t dnodesize, uint64_t offset, uint64_t gen, uint64_t txg, 1761 uint64_t crtxg) 1762 { 1763 bt->bt_magic = BT_MAGIC; 1764 bt->bt_objset = dmu_objset_id(os); 1765 bt->bt_object = object; 1766 bt->bt_dnodesize = dnodesize; 1767 bt->bt_offset = offset; 1768 bt->bt_gen = gen; 1769 bt->bt_txg = txg; 1770 bt->bt_crtxg = crtxg; 1771 } 1772 1773 static void 1774 ztest_bt_verify(ztest_block_tag_t *bt, objset_t *os, uint64_t object, 1775 uint64_t dnodesize, uint64_t offset, uint64_t gen, uint64_t txg, 1776 uint64_t crtxg) 1777 { 1778 ASSERT3U(bt->bt_magic, ==, BT_MAGIC); 1779 ASSERT3U(bt->bt_objset, ==, dmu_objset_id(os)); 1780 ASSERT3U(bt->bt_object, ==, object); 1781 ASSERT3U(bt->bt_dnodesize, ==, dnodesize); 1782 ASSERT3U(bt->bt_offset, ==, offset); 1783 ASSERT3U(bt->bt_gen, <=, gen); 1784 ASSERT3U(bt->bt_txg, <=, txg); 1785 ASSERT3U(bt->bt_crtxg, ==, crtxg); 1786 } 1787 1788 static ztest_block_tag_t * 1789 ztest_bt_bonus(dmu_buf_t *db) 1790 { 1791 dmu_object_info_t doi; 1792 ztest_block_tag_t *bt; 1793 1794 dmu_object_info_from_db(db, &doi); 1795 ASSERT3U(doi.doi_bonus_size, <=, db->db_size); 1796 ASSERT3U(doi.doi_bonus_size, >=, sizeof (*bt)); 1797 bt = (void *)((char *)db->db_data + doi.doi_bonus_size - sizeof (*bt)); 1798 1799 return (bt); 1800 } 1801 1802 /* 1803 * Generate a token to fill up unused bonus buffer space. Try to make 1804 * it unique to the object, generation, and offset to verify that data 1805 * is not getting overwritten by data from other dnodes. 1806 */ 1807 #define ZTEST_BONUS_FILL_TOKEN(obj, ds, gen, offset) \ 1808 (((ds) << 48) | ((gen) << 32) | ((obj) << 8) | (offset)) 1809 1810 /* 1811 * Fill up the unused bonus buffer region before the block tag with a 1812 * verifiable pattern. Filling the whole bonus area with non-zero data 1813 * helps ensure that all dnode traversal code properly skips the 1814 * interior regions of large dnodes. 1815 */ 1816 static void 1817 ztest_fill_unused_bonus(dmu_buf_t *db, void *end, uint64_t obj, 1818 objset_t *os, uint64_t gen) 1819 { 1820 uint64_t *bonusp; 1821 1822 ASSERT(IS_P2ALIGNED((char *)end - (char *)db->db_data, 8)); 1823 1824 for (bonusp = db->db_data; bonusp < (uint64_t *)end; bonusp++) { 1825 uint64_t token = ZTEST_BONUS_FILL_TOKEN(obj, dmu_objset_id(os), 1826 gen, bonusp - (uint64_t *)db->db_data); 1827 *bonusp = token; 1828 } 1829 } 1830 1831 /* 1832 * Verify that the unused area of a bonus buffer is filled with the 1833 * expected tokens. 
1834 */ 1835 static void 1836 ztest_verify_unused_bonus(dmu_buf_t *db, void *end, uint64_t obj, 1837 objset_t *os, uint64_t gen) 1838 { 1839 uint64_t *bonusp; 1840 1841 for (bonusp = db->db_data; bonusp < (uint64_t *)end; bonusp++) { 1842 uint64_t token = ZTEST_BONUS_FILL_TOKEN(obj, dmu_objset_id(os), 1843 gen, bonusp - (uint64_t *)db->db_data); 1844 VERIFY3U(*bonusp, ==, token); 1845 } 1846 } 1847 1848 /* 1849 * ZIL logging ops 1850 */ 1851 1852 #define lrz_type lr_mode 1853 #define lrz_blocksize lr_uid 1854 #define lrz_ibshift lr_gid 1855 #define lrz_bonustype lr_rdev 1856 #define lrz_dnodesize lr_crtime[1] 1857 1858 static void 1859 ztest_log_create(ztest_ds_t *zd, dmu_tx_t *tx, lr_create_t *lr) 1860 { 1861 char *name = (void *)(lr + 1); /* name follows lr */ 1862 size_t namesize = strlen(name) + 1; 1863 itx_t *itx; 1864 1865 if (zil_replaying(zd->zd_zilog, tx)) 1866 return; 1867 1868 itx = zil_itx_create(TX_CREATE, sizeof (*lr) + namesize); 1869 memcpy(&itx->itx_lr + 1, &lr->lr_common + 1, 1870 sizeof (*lr) + namesize - sizeof (lr_t)); 1871 1872 zil_itx_assign(zd->zd_zilog, itx, tx); 1873 } 1874 1875 static void 1876 ztest_log_remove(ztest_ds_t *zd, dmu_tx_t *tx, lr_remove_t *lr, uint64_t object) 1877 { 1878 char *name = (void *)(lr + 1); /* name follows lr */ 1879 size_t namesize = strlen(name) + 1; 1880 itx_t *itx; 1881 1882 if (zil_replaying(zd->zd_zilog, tx)) 1883 return; 1884 1885 itx = zil_itx_create(TX_REMOVE, sizeof (*lr) + namesize); 1886 memcpy(&itx->itx_lr + 1, &lr->lr_common + 1, 1887 sizeof (*lr) + namesize - sizeof (lr_t)); 1888 1889 itx->itx_oid = object; 1890 zil_itx_assign(zd->zd_zilog, itx, tx); 1891 } 1892 1893 static void 1894 ztest_log_write(ztest_ds_t *zd, dmu_tx_t *tx, lr_write_t *lr) 1895 { 1896 itx_t *itx; 1897 itx_wr_state_t write_state = ztest_random(WR_NUM_STATES); 1898 1899 if (zil_replaying(zd->zd_zilog, tx)) 1900 return; 1901 1902 if (lr->lr_length > zil_max_log_data(zd->zd_zilog)) 1903 write_state = WR_INDIRECT; 1904 1905 itx = zil_itx_create(TX_WRITE, 1906 sizeof (*lr) + (write_state == WR_COPIED ? 
lr->lr_length : 0)); 1907 1908 if (write_state == WR_COPIED && 1909 dmu_read(zd->zd_os, lr->lr_foid, lr->lr_offset, lr->lr_length, 1910 ((lr_write_t *)&itx->itx_lr) + 1, DMU_READ_NO_PREFETCH) != 0) { 1911 zil_itx_destroy(itx); 1912 itx = zil_itx_create(TX_WRITE, sizeof (*lr)); 1913 write_state = WR_NEED_COPY; 1914 } 1915 itx->itx_private = zd; 1916 itx->itx_wr_state = write_state; 1917 itx->itx_sync = (ztest_random(8) == 0); 1918 1919 memcpy(&itx->itx_lr + 1, &lr->lr_common + 1, 1920 sizeof (*lr) - sizeof (lr_t)); 1921 1922 zil_itx_assign(zd->zd_zilog, itx, tx); 1923 } 1924 1925 static void 1926 ztest_log_truncate(ztest_ds_t *zd, dmu_tx_t *tx, lr_truncate_t *lr) 1927 { 1928 itx_t *itx; 1929 1930 if (zil_replaying(zd->zd_zilog, tx)) 1931 return; 1932 1933 itx = zil_itx_create(TX_TRUNCATE, sizeof (*lr)); 1934 memcpy(&itx->itx_lr + 1, &lr->lr_common + 1, 1935 sizeof (*lr) - sizeof (lr_t)); 1936 1937 itx->itx_sync = B_FALSE; 1938 zil_itx_assign(zd->zd_zilog, itx, tx); 1939 } 1940 1941 static void 1942 ztest_log_setattr(ztest_ds_t *zd, dmu_tx_t *tx, lr_setattr_t *lr) 1943 { 1944 itx_t *itx; 1945 1946 if (zil_replaying(zd->zd_zilog, tx)) 1947 return; 1948 1949 itx = zil_itx_create(TX_SETATTR, sizeof (*lr)); 1950 memcpy(&itx->itx_lr + 1, &lr->lr_common + 1, 1951 sizeof (*lr) - sizeof (lr_t)); 1952 1953 itx->itx_sync = B_FALSE; 1954 zil_itx_assign(zd->zd_zilog, itx, tx); 1955 } 1956 1957 /* 1958 * ZIL replay ops 1959 */ 1960 static int 1961 ztest_replay_create(void *arg1, void *arg2, boolean_t byteswap) 1962 { 1963 ztest_ds_t *zd = arg1; 1964 lr_create_t *lr = arg2; 1965 char *name = (void *)(lr + 1); /* name follows lr */ 1966 objset_t *os = zd->zd_os; 1967 ztest_block_tag_t *bbt; 1968 dmu_buf_t *db; 1969 dmu_tx_t *tx; 1970 uint64_t txg; 1971 int error = 0; 1972 int bonuslen; 1973 1974 if (byteswap) 1975 byteswap_uint64_array(lr, sizeof (*lr)); 1976 1977 ASSERT3U(lr->lr_doid, ==, ZTEST_DIROBJ); 1978 ASSERT3S(name[0], !=, '\0'); 1979 1980 tx = dmu_tx_create(os); 1981 1982 dmu_tx_hold_zap(tx, lr->lr_doid, B_TRUE, name); 1983 1984 if (lr->lrz_type == DMU_OT_ZAP_OTHER) { 1985 dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL); 1986 } else { 1987 dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); 1988 } 1989 1990 txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); 1991 if (txg == 0) 1992 return (ENOSPC); 1993 1994 ASSERT3U(dmu_objset_zil(os)->zl_replay, ==, !!lr->lr_foid); 1995 bonuslen = DN_BONUS_SIZE(lr->lrz_dnodesize); 1996 1997 if (lr->lrz_type == DMU_OT_ZAP_OTHER) { 1998 if (lr->lr_foid == 0) { 1999 lr->lr_foid = zap_create_dnsize(os, 2000 lr->lrz_type, lr->lrz_bonustype, 2001 bonuslen, lr->lrz_dnodesize, tx); 2002 } else { 2003 error = zap_create_claim_dnsize(os, lr->lr_foid, 2004 lr->lrz_type, lr->lrz_bonustype, 2005 bonuslen, lr->lrz_dnodesize, tx); 2006 } 2007 } else { 2008 if (lr->lr_foid == 0) { 2009 lr->lr_foid = dmu_object_alloc_dnsize(os, 2010 lr->lrz_type, 0, lr->lrz_bonustype, 2011 bonuslen, lr->lrz_dnodesize, tx); 2012 } else { 2013 error = dmu_object_claim_dnsize(os, lr->lr_foid, 2014 lr->lrz_type, 0, lr->lrz_bonustype, 2015 bonuslen, lr->lrz_dnodesize, tx); 2016 } 2017 } 2018 2019 if (error) { 2020 ASSERT3U(error, ==, EEXIST); 2021 ASSERT(zd->zd_zilog->zl_replay); 2022 dmu_tx_commit(tx); 2023 return (error); 2024 } 2025 2026 ASSERT3U(lr->lr_foid, !=, 0); 2027 2028 if (lr->lrz_type != DMU_OT_ZAP_OTHER) 2029 VERIFY0(dmu_object_set_blocksize(os, lr->lr_foid, 2030 lr->lrz_blocksize, lr->lrz_ibshift, tx)); 2031 2032 VERIFY0(dmu_bonus_hold(os, lr->lr_foid, FTAG, &db)); 2033 bbt = ztest_bt_bonus(db); 2034 
dmu_buf_will_dirty(db, tx); 2035 ztest_bt_generate(bbt, os, lr->lr_foid, lr->lrz_dnodesize, -1ULL, 2036 lr->lr_gen, txg, txg); 2037 ztest_fill_unused_bonus(db, bbt, lr->lr_foid, os, lr->lr_gen); 2038 dmu_buf_rele(db, FTAG); 2039 2040 VERIFY0(zap_add(os, lr->lr_doid, name, sizeof (uint64_t), 1, 2041 &lr->lr_foid, tx)); 2042 2043 (void) ztest_log_create(zd, tx, lr); 2044 2045 dmu_tx_commit(tx); 2046 2047 return (0); 2048 } 2049 2050 static int 2051 ztest_replay_remove(void *arg1, void *arg2, boolean_t byteswap) 2052 { 2053 ztest_ds_t *zd = arg1; 2054 lr_remove_t *lr = arg2; 2055 char *name = (void *)(lr + 1); /* name follows lr */ 2056 objset_t *os = zd->zd_os; 2057 dmu_object_info_t doi; 2058 dmu_tx_t *tx; 2059 uint64_t object, txg; 2060 2061 if (byteswap) 2062 byteswap_uint64_array(lr, sizeof (*lr)); 2063 2064 ASSERT3U(lr->lr_doid, ==, ZTEST_DIROBJ); 2065 ASSERT3S(name[0], !=, '\0'); 2066 2067 VERIFY0( 2068 zap_lookup(os, lr->lr_doid, name, sizeof (object), 1, &object)); 2069 ASSERT3U(object, !=, 0); 2070 2071 ztest_object_lock(zd, object, RL_WRITER); 2072 2073 VERIFY0(dmu_object_info(os, object, &doi)); 2074 2075 tx = dmu_tx_create(os); 2076 2077 dmu_tx_hold_zap(tx, lr->lr_doid, B_FALSE, name); 2078 dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END); 2079 2080 txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); 2081 if (txg == 0) { 2082 ztest_object_unlock(zd, object); 2083 return (ENOSPC); 2084 } 2085 2086 if (doi.doi_type == DMU_OT_ZAP_OTHER) { 2087 VERIFY0(zap_destroy(os, object, tx)); 2088 } else { 2089 VERIFY0(dmu_object_free(os, object, tx)); 2090 } 2091 2092 VERIFY0(zap_remove(os, lr->lr_doid, name, tx)); 2093 2094 (void) ztest_log_remove(zd, tx, lr, object); 2095 2096 dmu_tx_commit(tx); 2097 2098 ztest_object_unlock(zd, object); 2099 2100 return (0); 2101 } 2102 2103 static int 2104 ztest_replay_write(void *arg1, void *arg2, boolean_t byteswap) 2105 { 2106 ztest_ds_t *zd = arg1; 2107 lr_write_t *lr = arg2; 2108 objset_t *os = zd->zd_os; 2109 void *data = lr + 1; /* data follows lr */ 2110 uint64_t offset, length; 2111 ztest_block_tag_t *bt = data; 2112 ztest_block_tag_t *bbt; 2113 uint64_t gen, txg, lrtxg, crtxg; 2114 dmu_object_info_t doi; 2115 dmu_tx_t *tx; 2116 dmu_buf_t *db; 2117 arc_buf_t *abuf = NULL; 2118 rl_t *rl; 2119 2120 if (byteswap) 2121 byteswap_uint64_array(lr, sizeof (*lr)); 2122 2123 offset = lr->lr_offset; 2124 length = lr->lr_length; 2125 2126 /* If it's a dmu_sync() block, write the whole block */ 2127 if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) { 2128 uint64_t blocksize = BP_GET_LSIZE(&lr->lr_blkptr); 2129 if (length < blocksize) { 2130 offset -= offset % blocksize; 2131 length = blocksize; 2132 } 2133 } 2134 2135 if (bt->bt_magic == BSWAP_64(BT_MAGIC)) 2136 byteswap_uint64_array(bt, sizeof (*bt)); 2137 2138 if (bt->bt_magic != BT_MAGIC) 2139 bt = NULL; 2140 2141 ztest_object_lock(zd, lr->lr_foid, RL_READER); 2142 rl = ztest_range_lock(zd, lr->lr_foid, offset, length, RL_WRITER); 2143 2144 VERIFY0(dmu_bonus_hold(os, lr->lr_foid, FTAG, &db)); 2145 2146 dmu_object_info_from_db(db, &doi); 2147 2148 bbt = ztest_bt_bonus(db); 2149 ASSERT3U(bbt->bt_magic, ==, BT_MAGIC); 2150 gen = bbt->bt_gen; 2151 crtxg = bbt->bt_crtxg; 2152 lrtxg = lr->lr_common.lrc_txg; 2153 2154 tx = dmu_tx_create(os); 2155 2156 dmu_tx_hold_write(tx, lr->lr_foid, offset, length); 2157 2158 if (ztest_random(8) == 0 && length == doi.doi_data_block_size && 2159 P2PHASE(offset, length) == 0) 2160 abuf = dmu_request_arcbuf(db, length); 2161 2162 txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); 2163 if (txg == 
0) { 2164 if (abuf != NULL) 2165 dmu_return_arcbuf(abuf); 2166 dmu_buf_rele(db, FTAG); 2167 ztest_range_unlock(rl); 2168 ztest_object_unlock(zd, lr->lr_foid); 2169 return (ENOSPC); 2170 } 2171 2172 if (bt != NULL) { 2173 /* 2174 * Usually, verify the old data before writing new data -- 2175 * but not always, because we also want to verify correct 2176 * behavior when the data was not recently read into cache. 2177 */ 2178 ASSERT0(offset % doi.doi_data_block_size); 2179 if (ztest_random(4) != 0) { 2180 int prefetch = ztest_random(2) ? 2181 DMU_READ_PREFETCH : DMU_READ_NO_PREFETCH; 2182 ztest_block_tag_t rbt; 2183 2184 VERIFY(dmu_read(os, lr->lr_foid, offset, 2185 sizeof (rbt), &rbt, prefetch) == 0); 2186 if (rbt.bt_magic == BT_MAGIC) { 2187 ztest_bt_verify(&rbt, os, lr->lr_foid, 0, 2188 offset, gen, txg, crtxg); 2189 } 2190 } 2191 2192 /* 2193 * Writes can appear to be newer than the bonus buffer because 2194 * the ztest_get_data() callback does a dmu_read() of the 2195 * open-context data, which may be different than the data 2196 * as it was when the write was generated. 2197 */ 2198 if (zd->zd_zilog->zl_replay) { 2199 ztest_bt_verify(bt, os, lr->lr_foid, 0, offset, 2200 MAX(gen, bt->bt_gen), MAX(txg, lrtxg), 2201 bt->bt_crtxg); 2202 } 2203 2204 /* 2205 * Set the bt's gen/txg to the bonus buffer's gen/txg 2206 * so that all of the usual ASSERTs will work. 2207 */ 2208 ztest_bt_generate(bt, os, lr->lr_foid, 0, offset, gen, txg, 2209 crtxg); 2210 } 2211 2212 if (abuf == NULL) { 2213 dmu_write(os, lr->lr_foid, offset, length, data, tx); 2214 } else { 2215 memcpy(abuf->b_data, data, length); 2216 dmu_assign_arcbuf_by_dbuf(db, offset, abuf, tx); 2217 } 2218 2219 (void) ztest_log_write(zd, tx, lr); 2220 2221 dmu_buf_rele(db, FTAG); 2222 2223 dmu_tx_commit(tx); 2224 2225 ztest_range_unlock(rl); 2226 ztest_object_unlock(zd, lr->lr_foid); 2227 2228 return (0); 2229 } 2230 2231 static int 2232 ztest_replay_truncate(void *arg1, void *arg2, boolean_t byteswap) 2233 { 2234 ztest_ds_t *zd = arg1; 2235 lr_truncate_t *lr = arg2; 2236 objset_t *os = zd->zd_os; 2237 dmu_tx_t *tx; 2238 uint64_t txg; 2239 rl_t *rl; 2240 2241 if (byteswap) 2242 byteswap_uint64_array(lr, sizeof (*lr)); 2243 2244 ztest_object_lock(zd, lr->lr_foid, RL_READER); 2245 rl = ztest_range_lock(zd, lr->lr_foid, lr->lr_offset, lr->lr_length, 2246 RL_WRITER); 2247 2248 tx = dmu_tx_create(os); 2249 2250 dmu_tx_hold_free(tx, lr->lr_foid, lr->lr_offset, lr->lr_length); 2251 2252 txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); 2253 if (txg == 0) { 2254 ztest_range_unlock(rl); 2255 ztest_object_unlock(zd, lr->lr_foid); 2256 return (ENOSPC); 2257 } 2258 2259 VERIFY0(dmu_free_range(os, lr->lr_foid, lr->lr_offset, 2260 lr->lr_length, tx)); 2261 2262 (void) ztest_log_truncate(zd, tx, lr); 2263 2264 dmu_tx_commit(tx); 2265 2266 ztest_range_unlock(rl); 2267 ztest_object_unlock(zd, lr->lr_foid); 2268 2269 return (0); 2270 } 2271 2272 static int 2273 ztest_replay_setattr(void *arg1, void *arg2, boolean_t byteswap) 2274 { 2275 ztest_ds_t *zd = arg1; 2276 lr_setattr_t *lr = arg2; 2277 objset_t *os = zd->zd_os; 2278 dmu_tx_t *tx; 2279 dmu_buf_t *db; 2280 ztest_block_tag_t *bbt; 2281 uint64_t txg, lrtxg, crtxg, dnodesize; 2282 2283 if (byteswap) 2284 byteswap_uint64_array(lr, sizeof (*lr)); 2285 2286 ztest_object_lock(zd, lr->lr_foid, RL_WRITER); 2287 2288 VERIFY0(dmu_bonus_hold(os, lr->lr_foid, FTAG, &db)); 2289 2290 tx = dmu_tx_create(os); 2291 dmu_tx_hold_bonus(tx, lr->lr_foid); 2292 2293 txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); 2294 if (txg == 0) { 2295 
dmu_buf_rele(db, FTAG); 2296 ztest_object_unlock(zd, lr->lr_foid); 2297 return (ENOSPC); 2298 } 2299 2300 bbt = ztest_bt_bonus(db); 2301 ASSERT3U(bbt->bt_magic, ==, BT_MAGIC); 2302 crtxg = bbt->bt_crtxg; 2303 lrtxg = lr->lr_common.lrc_txg; 2304 dnodesize = bbt->bt_dnodesize; 2305 2306 if (zd->zd_zilog->zl_replay) { 2307 ASSERT3U(lr->lr_size, !=, 0); 2308 ASSERT3U(lr->lr_mode, !=, 0); 2309 ASSERT3U(lrtxg, !=, 0); 2310 } else { 2311 /* 2312 * Randomly change the size and increment the generation. 2313 */ 2314 lr->lr_size = (ztest_random(db->db_size / sizeof (*bbt)) + 1) * 2315 sizeof (*bbt); 2316 lr->lr_mode = bbt->bt_gen + 1; 2317 ASSERT0(lrtxg); 2318 } 2319 2320 /* 2321 * Verify that the current bonus buffer is not newer than our txg. 2322 */ 2323 ztest_bt_verify(bbt, os, lr->lr_foid, dnodesize, -1ULL, lr->lr_mode, 2324 MAX(txg, lrtxg), crtxg); 2325 2326 dmu_buf_will_dirty(db, tx); 2327 2328 ASSERT3U(lr->lr_size, >=, sizeof (*bbt)); 2329 ASSERT3U(lr->lr_size, <=, db->db_size); 2330 VERIFY0(dmu_set_bonus(db, lr->lr_size, tx)); 2331 bbt = ztest_bt_bonus(db); 2332 2333 ztest_bt_generate(bbt, os, lr->lr_foid, dnodesize, -1ULL, lr->lr_mode, 2334 txg, crtxg); 2335 ztest_fill_unused_bonus(db, bbt, lr->lr_foid, os, bbt->bt_gen); 2336 dmu_buf_rele(db, FTAG); 2337 2338 (void) ztest_log_setattr(zd, tx, lr); 2339 2340 dmu_tx_commit(tx); 2341 2342 ztest_object_unlock(zd, lr->lr_foid); 2343 2344 return (0); 2345 } 2346 2347 zil_replay_func_t *ztest_replay_vector[TX_MAX_TYPE] = { 2348 NULL, /* 0 no such transaction type */ 2349 ztest_replay_create, /* TX_CREATE */ 2350 NULL, /* TX_MKDIR */ 2351 NULL, /* TX_MKXATTR */ 2352 NULL, /* TX_SYMLINK */ 2353 ztest_replay_remove, /* TX_REMOVE */ 2354 NULL, /* TX_RMDIR */ 2355 NULL, /* TX_LINK */ 2356 NULL, /* TX_RENAME */ 2357 ztest_replay_write, /* TX_WRITE */ 2358 ztest_replay_truncate, /* TX_TRUNCATE */ 2359 ztest_replay_setattr, /* TX_SETATTR */ 2360 NULL, /* TX_ACL */ 2361 NULL, /* TX_CREATE_ACL */ 2362 NULL, /* TX_CREATE_ATTR */ 2363 NULL, /* TX_CREATE_ACL_ATTR */ 2364 NULL, /* TX_MKDIR_ACL */ 2365 NULL, /* TX_MKDIR_ATTR */ 2366 NULL, /* TX_MKDIR_ACL_ATTR */ 2367 NULL, /* TX_WRITE2 */ 2368 NULL, /* TX_SETSAXATTR */ 2369 }; 2370 2371 /* 2372 * ZIL get_data callbacks 2373 */ 2374 2375 static void 2376 ztest_get_done(zgd_t *zgd, int error) 2377 { 2378 (void) error; 2379 ztest_ds_t *zd = zgd->zgd_private; 2380 uint64_t object = ((rl_t *)zgd->zgd_lr)->rl_object; 2381 2382 if (zgd->zgd_db) 2383 dmu_buf_rele(zgd->zgd_db, zgd); 2384 2385 ztest_range_unlock((rl_t *)zgd->zgd_lr); 2386 ztest_object_unlock(zd, object); 2387 2388 umem_free(zgd, sizeof (*zgd)); 2389 } 2390 2391 static int 2392 ztest_get_data(void *arg, uint64_t arg2, lr_write_t *lr, char *buf, 2393 struct lwb *lwb, zio_t *zio) 2394 { 2395 (void) arg2; 2396 ztest_ds_t *zd = arg; 2397 objset_t *os = zd->zd_os; 2398 uint64_t object = lr->lr_foid; 2399 uint64_t offset = lr->lr_offset; 2400 uint64_t size = lr->lr_length; 2401 uint64_t txg = lr->lr_common.lrc_txg; 2402 uint64_t crtxg; 2403 dmu_object_info_t doi; 2404 dmu_buf_t *db; 2405 zgd_t *zgd; 2406 int error; 2407 2408 ASSERT3P(lwb, !=, NULL); 2409 ASSERT3P(zio, !=, NULL); 2410 ASSERT3U(size, !=, 0); 2411 2412 ztest_object_lock(zd, object, RL_READER); 2413 error = dmu_bonus_hold(os, object, FTAG, &db); 2414 if (error) { 2415 ztest_object_unlock(zd, object); 2416 return (error); 2417 } 2418 2419 crtxg = ztest_bt_bonus(db)->bt_crtxg; 2420 2421 if (crtxg == 0 || crtxg > txg) { 2422 dmu_buf_rele(db, FTAG); 2423 ztest_object_unlock(zd, object); 2424 return 
(ENOENT); 2425 } 2426 2427 dmu_object_info_from_db(db, &doi); 2428 dmu_buf_rele(db, FTAG); 2429 db = NULL; 2430 2431 zgd = umem_zalloc(sizeof (*zgd), UMEM_NOFAIL); 2432 zgd->zgd_lwb = lwb; 2433 zgd->zgd_private = zd; 2434 2435 if (buf != NULL) { /* immediate write */ 2436 zgd->zgd_lr = (struct zfs_locked_range *)ztest_range_lock(zd, 2437 object, offset, size, RL_READER); 2438 2439 error = dmu_read(os, object, offset, size, buf, 2440 DMU_READ_NO_PREFETCH); 2441 ASSERT0(error); 2442 } else { 2443 size = doi.doi_data_block_size; 2444 if (ISP2(size)) { 2445 offset = P2ALIGN(offset, size); 2446 } else { 2447 ASSERT3U(offset, <, size); 2448 offset = 0; 2449 } 2450 2451 zgd->zgd_lr = (struct zfs_locked_range *)ztest_range_lock(zd, 2452 object, offset, size, RL_READER); 2453 2454 error = dmu_buf_hold(os, object, offset, zgd, &db, 2455 DMU_READ_NO_PREFETCH); 2456 2457 if (error == 0) { 2458 blkptr_t *bp = &lr->lr_blkptr; 2459 2460 zgd->zgd_db = db; 2461 zgd->zgd_bp = bp; 2462 2463 ASSERT3U(db->db_offset, ==, offset); 2464 ASSERT3U(db->db_size, ==, size); 2465 2466 error = dmu_sync(zio, lr->lr_common.lrc_txg, 2467 ztest_get_done, zgd); 2468 2469 if (error == 0) 2470 return (0); 2471 } 2472 } 2473 2474 ztest_get_done(zgd, error); 2475 2476 return (error); 2477 } 2478 2479 static void * 2480 ztest_lr_alloc(size_t lrsize, char *name) 2481 { 2482 char *lr; 2483 size_t namesize = name ? strlen(name) + 1 : 0; 2484 2485 lr = umem_zalloc(lrsize + namesize, UMEM_NOFAIL); 2486 2487 if (name) 2488 memcpy(lr + lrsize, name, namesize); 2489 2490 return (lr); 2491 } 2492 2493 static void 2494 ztest_lr_free(void *lr, size_t lrsize, char *name) 2495 { 2496 size_t namesize = name ? strlen(name) + 1 : 0; 2497 2498 umem_free(lr, lrsize + namesize); 2499 } 2500 2501 /* 2502 * Lookup a bunch of objects. Returns the number of objects not found. 
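 *
 * For each object that is found, the od template entry is refreshed in
 * place: od_object comes from the ZAP lookup, while od_type, od_blocksize
 * and od_gen are reloaded from the object's info and its bonus block tag.
 * Callers must hold zd_dirobj_lock, as the ASSERT below verifies.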
2503 */ 2504 static int 2505 ztest_lookup(ztest_ds_t *zd, ztest_od_t *od, int count) 2506 { 2507 int missing = 0; 2508 int error; 2509 int i; 2510 2511 ASSERT(MUTEX_HELD(&zd->zd_dirobj_lock)); 2512 2513 for (i = 0; i < count; i++, od++) { 2514 od->od_object = 0; 2515 error = zap_lookup(zd->zd_os, od->od_dir, od->od_name, 2516 sizeof (uint64_t), 1, &od->od_object); 2517 if (error) { 2518 ASSERT3S(error, ==, ENOENT); 2519 ASSERT0(od->od_object); 2520 missing++; 2521 } else { 2522 dmu_buf_t *db; 2523 ztest_block_tag_t *bbt; 2524 dmu_object_info_t doi; 2525 2526 ASSERT3U(od->od_object, !=, 0); 2527 ASSERT0(missing); /* there should be no gaps */ 2528 2529 ztest_object_lock(zd, od->od_object, RL_READER); 2530 VERIFY0(dmu_bonus_hold(zd->zd_os, od->od_object, 2531 FTAG, &db)); 2532 dmu_object_info_from_db(db, &doi); 2533 bbt = ztest_bt_bonus(db); 2534 ASSERT3U(bbt->bt_magic, ==, BT_MAGIC); 2535 od->od_type = doi.doi_type; 2536 od->od_blocksize = doi.doi_data_block_size; 2537 od->od_gen = bbt->bt_gen; 2538 dmu_buf_rele(db, FTAG); 2539 ztest_object_unlock(zd, od->od_object); 2540 } 2541 } 2542 2543 return (missing); 2544 } 2545 2546 static int 2547 ztest_create(ztest_ds_t *zd, ztest_od_t *od, int count) 2548 { 2549 int missing = 0; 2550 int i; 2551 2552 ASSERT(MUTEX_HELD(&zd->zd_dirobj_lock)); 2553 2554 for (i = 0; i < count; i++, od++) { 2555 if (missing) { 2556 od->od_object = 0; 2557 missing++; 2558 continue; 2559 } 2560 2561 lr_create_t *lr = ztest_lr_alloc(sizeof (*lr), od->od_name); 2562 2563 lr->lr_doid = od->od_dir; 2564 lr->lr_foid = 0; /* 0 to allocate, > 0 to claim */ 2565 lr->lrz_type = od->od_crtype; 2566 lr->lrz_blocksize = od->od_crblocksize; 2567 lr->lrz_ibshift = ztest_random_ibshift(); 2568 lr->lrz_bonustype = DMU_OT_UINT64_OTHER; 2569 lr->lrz_dnodesize = od->od_crdnodesize; 2570 lr->lr_gen = od->od_crgen; 2571 lr->lr_crtime[0] = time(NULL); 2572 2573 if (ztest_replay_create(zd, lr, B_FALSE) != 0) { 2574 ASSERT0(missing); 2575 od->od_object = 0; 2576 missing++; 2577 } else { 2578 od->od_object = lr->lr_foid; 2579 od->od_type = od->od_crtype; 2580 od->od_blocksize = od->od_crblocksize; 2581 od->od_gen = od->od_crgen; 2582 ASSERT3U(od->od_object, !=, 0); 2583 } 2584 2585 ztest_lr_free(lr, sizeof (*lr), od->od_name); 2586 } 2587 2588 return (missing); 2589 } 2590 2591 static int 2592 ztest_remove(ztest_ds_t *zd, ztest_od_t *od, int count) 2593 { 2594 int missing = 0; 2595 int error; 2596 int i; 2597 2598 ASSERT(MUTEX_HELD(&zd->zd_dirobj_lock)); 2599 2600 od += count - 1; 2601 2602 for (i = count - 1; i >= 0; i--, od--) { 2603 if (missing) { 2604 missing++; 2605 continue; 2606 } 2607 2608 /* 2609 * No object was found. 
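 * (ztest_lookup()/ztest_create() left od_object at zero, so there is
 * nothing for this entry to remove.)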
2610 */ 2611 if (od->od_object == 0) 2612 continue; 2613 2614 lr_remove_t *lr = ztest_lr_alloc(sizeof (*lr), od->od_name); 2615 2616 lr->lr_doid = od->od_dir; 2617 2618 if ((error = ztest_replay_remove(zd, lr, B_FALSE)) != 0) { 2619 ASSERT3U(error, ==, ENOSPC); 2620 missing++; 2621 } else { 2622 od->od_object = 0; 2623 } 2624 ztest_lr_free(lr, sizeof (*lr), od->od_name); 2625 } 2626 2627 return (missing); 2628 } 2629 2630 static int 2631 ztest_write(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size, 2632 void *data) 2633 { 2634 lr_write_t *lr; 2635 int error; 2636 2637 lr = ztest_lr_alloc(sizeof (*lr) + size, NULL); 2638 2639 lr->lr_foid = object; 2640 lr->lr_offset = offset; 2641 lr->lr_length = size; 2642 lr->lr_blkoff = 0; 2643 BP_ZERO(&lr->lr_blkptr); 2644 2645 memcpy(lr + 1, data, size); 2646 2647 error = ztest_replay_write(zd, lr, B_FALSE); 2648 2649 ztest_lr_free(lr, sizeof (*lr) + size, NULL); 2650 2651 return (error); 2652 } 2653 2654 static int 2655 ztest_truncate(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size) 2656 { 2657 lr_truncate_t *lr; 2658 int error; 2659 2660 lr = ztest_lr_alloc(sizeof (*lr), NULL); 2661 2662 lr->lr_foid = object; 2663 lr->lr_offset = offset; 2664 lr->lr_length = size; 2665 2666 error = ztest_replay_truncate(zd, lr, B_FALSE); 2667 2668 ztest_lr_free(lr, sizeof (*lr), NULL); 2669 2670 return (error); 2671 } 2672 2673 static int 2674 ztest_setattr(ztest_ds_t *zd, uint64_t object) 2675 { 2676 lr_setattr_t *lr; 2677 int error; 2678 2679 lr = ztest_lr_alloc(sizeof (*lr), NULL); 2680 2681 lr->lr_foid = object; 2682 lr->lr_size = 0; 2683 lr->lr_mode = 0; 2684 2685 error = ztest_replay_setattr(zd, lr, B_FALSE); 2686 2687 ztest_lr_free(lr, sizeof (*lr), NULL); 2688 2689 return (error); 2690 } 2691 2692 static void 2693 ztest_prealloc(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size) 2694 { 2695 objset_t *os = zd->zd_os; 2696 dmu_tx_t *tx; 2697 uint64_t txg; 2698 rl_t *rl; 2699 2700 txg_wait_synced(dmu_objset_pool(os), 0); 2701 2702 ztest_object_lock(zd, object, RL_READER); 2703 rl = ztest_range_lock(zd, object, offset, size, RL_WRITER); 2704 2705 tx = dmu_tx_create(os); 2706 2707 dmu_tx_hold_write(tx, object, offset, size); 2708 2709 txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); 2710 2711 if (txg != 0) { 2712 dmu_prealloc(os, object, offset, size, tx); 2713 dmu_tx_commit(tx); 2714 txg_wait_synced(dmu_objset_pool(os), txg); 2715 } else { 2716 (void) dmu_free_long_range(os, object, offset, size); 2717 } 2718 2719 ztest_range_unlock(rl); 2720 ztest_object_unlock(zd, object); 2721 } 2722 2723 static void 2724 ztest_io(ztest_ds_t *zd, uint64_t object, uint64_t offset) 2725 { 2726 int err; 2727 ztest_block_tag_t wbt; 2728 dmu_object_info_t doi; 2729 enum ztest_io_type io_type; 2730 uint64_t blocksize; 2731 void *data; 2732 2733 VERIFY0(dmu_object_info(zd->zd_os, object, &doi)); 2734 blocksize = doi.doi_data_block_size; 2735 data = umem_alloc(blocksize, UMEM_NOFAIL); 2736 2737 /* 2738 * Pick an i/o type at random, biased toward writing block tags. 
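 * (With N = ZTEST_IO_TYPES, the override below makes the block-tag write
 * roughly a (1/2 + 1/(2N)) pick; the remaining i/o types share the other
 * half of the probability evenly.)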
2739 */ 2740 io_type = ztest_random(ZTEST_IO_TYPES); 2741 if (ztest_random(2) == 0) 2742 io_type = ZTEST_IO_WRITE_TAG; 2743 2744 (void) pthread_rwlock_rdlock(&zd->zd_zilog_lock); 2745 2746 switch (io_type) { 2747 2748 case ZTEST_IO_WRITE_TAG: 2749 ztest_bt_generate(&wbt, zd->zd_os, object, doi.doi_dnodesize, 2750 offset, 0, 0, 0); 2751 (void) ztest_write(zd, object, offset, sizeof (wbt), &wbt); 2752 break; 2753 2754 case ZTEST_IO_WRITE_PATTERN: 2755 (void) memset(data, 'a' + (object + offset) % 5, blocksize); 2756 if (ztest_random(2) == 0) { 2757 /* 2758 * Induce fletcher2 collisions to ensure that 2759 * zio_ddt_collision() detects and resolves them 2760 * when using fletcher2-verify for deduplication. 2761 */ 2762 ((uint64_t *)data)[0] ^= 1ULL << 63; 2763 ((uint64_t *)data)[4] ^= 1ULL << 63; 2764 } 2765 (void) ztest_write(zd, object, offset, blocksize, data); 2766 break; 2767 2768 case ZTEST_IO_WRITE_ZEROES: 2769 memset(data, 0, blocksize); 2770 (void) ztest_write(zd, object, offset, blocksize, data); 2771 break; 2772 2773 case ZTEST_IO_TRUNCATE: 2774 (void) ztest_truncate(zd, object, offset, blocksize); 2775 break; 2776 2777 case ZTEST_IO_SETATTR: 2778 (void) ztest_setattr(zd, object); 2779 break; 2780 default: 2781 break; 2782 2783 case ZTEST_IO_REWRITE: 2784 (void) pthread_rwlock_rdlock(&ztest_name_lock); 2785 err = ztest_dsl_prop_set_uint64(zd->zd_name, 2786 ZFS_PROP_CHECKSUM, spa_dedup_checksum(ztest_spa), 2787 B_FALSE); 2788 VERIFY(err == 0 || err == ENOSPC); 2789 err = ztest_dsl_prop_set_uint64(zd->zd_name, 2790 ZFS_PROP_COMPRESSION, 2791 ztest_random_dsl_prop(ZFS_PROP_COMPRESSION), 2792 B_FALSE); 2793 VERIFY(err == 0 || err == ENOSPC); 2794 (void) pthread_rwlock_unlock(&ztest_name_lock); 2795 2796 VERIFY0(dmu_read(zd->zd_os, object, offset, blocksize, data, 2797 DMU_READ_NO_PREFETCH)); 2798 2799 (void) ztest_write(zd, object, offset, blocksize, data); 2800 break; 2801 } 2802 2803 (void) pthread_rwlock_unlock(&zd->zd_zilog_lock); 2804 2805 umem_free(data, blocksize); 2806 } 2807 2808 /* 2809 * Initialize an object description template. 2810 */ 2811 static void 2812 ztest_od_init(ztest_od_t *od, uint64_t id, char *tag, uint64_t index, 2813 dmu_object_type_t type, uint64_t blocksize, uint64_t dnodesize, 2814 uint64_t gen) 2815 { 2816 od->od_dir = ZTEST_DIROBJ; 2817 od->od_object = 0; 2818 2819 od->od_crtype = type; 2820 od->od_crblocksize = blocksize ? blocksize : ztest_random_blocksize(); 2821 od->od_crdnodesize = dnodesize ? dnodesize : ztest_random_dnodesize(); 2822 od->od_crgen = gen; 2823 2824 od->od_type = DMU_OT_NONE; 2825 od->od_blocksize = 0; 2826 od->od_gen = 0; 2827 2828 (void) snprintf(od->od_name, sizeof (od->od_name), 2829 "%s(%"PRId64")[%"PRIu64"]", 2830 tag, id, index); 2831 } 2832 2833 /* 2834 * Lookup or create the objects for a test using the od template. 2835 * If the objects do not all exist, or if 'remove' is specified, 2836 * remove any existing objects and create new ones. Otherwise, 2837 * use the existing objects. 
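 *
 * A typical caller builds the template first and quietly bails out if the
 * objects cannot be set up -- an illustrative sketch only, not lifted from
 * any particular test below:
 *
 *	ztest_od_t od;
 *
 *	ztest_od_init(&od, id, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0, 0);
 *	if (ztest_object_init(zd, &od, sizeof (od), B_FALSE) != 0)
 *		return;
 *	... od.od_object now names an object the test can use ...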
2838 */ 2839 static int 2840 ztest_object_init(ztest_ds_t *zd, ztest_od_t *od, size_t size, boolean_t remove) 2841 { 2842 int count = size / sizeof (*od); 2843 int rv = 0; 2844 2845 mutex_enter(&zd->zd_dirobj_lock); 2846 if ((ztest_lookup(zd, od, count) != 0 || remove) && 2847 (ztest_remove(zd, od, count) != 0 || 2848 ztest_create(zd, od, count) != 0)) 2849 rv = -1; 2850 zd->zd_od = od; 2851 mutex_exit(&zd->zd_dirobj_lock); 2852 2853 return (rv); 2854 } 2855 2856 void 2857 ztest_zil_commit(ztest_ds_t *zd, uint64_t id) 2858 { 2859 (void) id; 2860 zilog_t *zilog = zd->zd_zilog; 2861 2862 (void) pthread_rwlock_rdlock(&zd->zd_zilog_lock); 2863 2864 zil_commit(zilog, ztest_random(ZTEST_OBJECTS)); 2865 2866 /* 2867 * Remember the committed values in zd, which is in parent/child 2868 * shared memory. If we die, the next iteration of ztest_run() 2869 * will verify that the log really does contain this record. 2870 */ 2871 mutex_enter(&zilog->zl_lock); 2872 ASSERT3P(zd->zd_shared, !=, NULL); 2873 ASSERT3U(zd->zd_shared->zd_seq, <=, zilog->zl_commit_lr_seq); 2874 zd->zd_shared->zd_seq = zilog->zl_commit_lr_seq; 2875 mutex_exit(&zilog->zl_lock); 2876 2877 (void) pthread_rwlock_unlock(&zd->zd_zilog_lock); 2878 } 2879 2880 /* 2881 * This function is designed to simulate the operations that occur during a 2882 * mount/unmount operation. We hold the dataset across these operations in an 2883 * attempt to expose any implicit assumptions about ZIL management. 2884 */ 2885 void 2886 ztest_zil_remount(ztest_ds_t *zd, uint64_t id) 2887 { 2888 (void) id; 2889 objset_t *os = zd->zd_os; 2890 2891 /* 2892 * We hold the ztest_vdev_lock so we don't cause problems with 2893 * other threads that wish to remove a log device, such as 2894 * ztest_device_removal(). 2895 */ 2896 mutex_enter(&ztest_vdev_lock); 2897 2898 /* 2899 * We grab the zd_dirobj_lock to ensure that no other thread is 2900 * updating the zil (i.e. adding in-memory log records) and the 2901 * zd_zilog_lock to block any I/O. 2902 */ 2903 mutex_enter(&zd->zd_dirobj_lock); 2904 (void) pthread_rwlock_wrlock(&zd->zd_zilog_lock); 2905 2906 /* zfsvfs_teardown() */ 2907 zil_close(zd->zd_zilog); 2908 2909 /* zfsvfs_setup() */ 2910 VERIFY3P(zil_open(os, ztest_get_data), ==, zd->zd_zilog); 2911 zil_replay(os, zd, ztest_replay_vector); 2912 2913 (void) pthread_rwlock_unlock(&zd->zd_zilog_lock); 2914 mutex_exit(&zd->zd_dirobj_lock); 2915 mutex_exit(&ztest_vdev_lock); 2916 } 2917 2918 /* 2919 * Verify that we can't destroy an active pool, create an existing pool, 2920 * or create a pool with a bad vdev spec. 2921 */ 2922 void 2923 ztest_spa_create_destroy(ztest_ds_t *zd, uint64_t id) 2924 { 2925 (void) zd, (void) id; 2926 ztest_shared_opts_t *zo = &ztest_opts; 2927 spa_t *spa; 2928 nvlist_t *nvroot; 2929 2930 if (zo->zo_mmp_test) 2931 return; 2932 2933 /* 2934 * Attempt to create using a bad file. 2935 */ 2936 nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, NULL, 0, 0, 1); 2937 VERIFY3U(ENOENT, ==, 2938 spa_create("ztest_bad_file", nvroot, NULL, NULL, NULL)); 2939 fnvlist_free(nvroot); 2940 2941 /* 2942 * Attempt to create using a bad mirror. 2943 */ 2944 nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, NULL, 0, 2, 1); 2945 VERIFY3U(ENOENT, ==, 2946 spa_create("ztest_bad_mirror", nvroot, NULL, NULL, NULL)); 2947 fnvlist_free(nvroot); 2948 2949 /* 2950 * Attempt to create an existing pool. It shouldn't matter 2951 * what's in the nvroot; we should fail with EEXIST. 
2952 */ 2953 (void) pthread_rwlock_rdlock(&ztest_name_lock); 2954 nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, NULL, 0, 0, 1); 2955 VERIFY3U(EEXIST, ==, 2956 spa_create(zo->zo_pool, nvroot, NULL, NULL, NULL)); 2957 fnvlist_free(nvroot); 2958 2959 /* 2960 * We open a reference to the spa and then we try to export it 2961 * expecting one of the following errors: 2962 * 2963 * EBUSY 2964 * Because of the reference we just opened. 2965 * 2966 * ZFS_ERR_EXPORT_IN_PROGRESS 2967 * For the case that there is another ztest thread doing 2968 * an export concurrently. 2969 */ 2970 VERIFY0(spa_open(zo->zo_pool, &spa, FTAG)); 2971 int error = spa_destroy(zo->zo_pool); 2972 if (error != EBUSY && error != ZFS_ERR_EXPORT_IN_PROGRESS) { 2973 fatal(B_FALSE, "spa_destroy(%s) returned unexpected value %d", 2974 spa->spa_name, error); 2975 } 2976 spa_close(spa, FTAG); 2977 2978 (void) pthread_rwlock_unlock(&ztest_name_lock); 2979 } 2980 2981 /* 2982 * Start and then stop the MMP threads to ensure the startup and shutdown code 2983 * works properly. Actual protection and property-related code tested via ZTS. 2984 */ 2985 void 2986 ztest_mmp_enable_disable(ztest_ds_t *zd, uint64_t id) 2987 { 2988 (void) zd, (void) id; 2989 ztest_shared_opts_t *zo = &ztest_opts; 2990 spa_t *spa = ztest_spa; 2991 2992 if (zo->zo_mmp_test) 2993 return; 2994 2995 /* 2996 * Since enabling MMP involves setting a property, it could not be done 2997 * while the pool is suspended. 2998 */ 2999 if (spa_suspended(spa)) 3000 return; 3001 3002 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 3003 mutex_enter(&spa->spa_props_lock); 3004 3005 zfs_multihost_fail_intervals = 0; 3006 3007 if (!spa_multihost(spa)) { 3008 spa->spa_multihost = B_TRUE; 3009 mmp_thread_start(spa); 3010 } 3011 3012 mutex_exit(&spa->spa_props_lock); 3013 spa_config_exit(spa, SCL_CONFIG, FTAG); 3014 3015 txg_wait_synced(spa_get_dsl(spa), 0); 3016 mmp_signal_all_threads(); 3017 txg_wait_synced(spa_get_dsl(spa), 0); 3018 3019 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 3020 mutex_enter(&spa->spa_props_lock); 3021 3022 if (spa_multihost(spa)) { 3023 mmp_thread_stop(spa); 3024 spa->spa_multihost = B_FALSE; 3025 } 3026 3027 mutex_exit(&spa->spa_props_lock); 3028 spa_config_exit(spa, SCL_CONFIG, FTAG); 3029 } 3030 3031 void 3032 ztest_spa_upgrade(ztest_ds_t *zd, uint64_t id) 3033 { 3034 (void) zd, (void) id; 3035 spa_t *spa; 3036 uint64_t initial_version = SPA_VERSION_INITIAL; 3037 uint64_t version, newversion; 3038 nvlist_t *nvroot, *props; 3039 char *name; 3040 3041 if (ztest_opts.zo_mmp_test) 3042 return; 3043 3044 /* dRAID added after feature flags, skip upgrade test. */ 3045 if (strcmp(ztest_opts.zo_raid_type, VDEV_TYPE_DRAID) == 0) 3046 return; 3047 3048 mutex_enter(&ztest_vdev_lock); 3049 name = kmem_asprintf("%s_upgrade", ztest_opts.zo_pool); 3050 3051 /* 3052 * Clean up from previous runs. 3053 */ 3054 (void) spa_destroy(name); 3055 3056 nvroot = make_vdev_root(NULL, NULL, name, ztest_opts.zo_vdev_size, 0, 3057 NULL, ztest_opts.zo_raid_children, ztest_opts.zo_mirrors, 1); 3058 3059 /* 3060 * If we're configuring a RAIDZ device then make sure that the 3061 * initial version is capable of supporting that feature. 
3062 */ 3063 switch (ztest_opts.zo_raid_parity) { 3064 case 0: 3065 case 1: 3066 initial_version = SPA_VERSION_INITIAL; 3067 break; 3068 case 2: 3069 initial_version = SPA_VERSION_RAIDZ2; 3070 break; 3071 case 3: 3072 initial_version = SPA_VERSION_RAIDZ3; 3073 break; 3074 } 3075 3076 /* 3077 * Create a pool with a spa version that can be upgraded. Pick 3078 * a value between initial_version and SPA_VERSION_BEFORE_FEATURES. 3079 */ 3080 do { 3081 version = ztest_random_spa_version(initial_version); 3082 } while (version > SPA_VERSION_BEFORE_FEATURES); 3083 3084 props = fnvlist_alloc(); 3085 fnvlist_add_uint64(props, 3086 zpool_prop_to_name(ZPOOL_PROP_VERSION), version); 3087 VERIFY0(spa_create(name, nvroot, props, NULL, NULL)); 3088 fnvlist_free(nvroot); 3089 fnvlist_free(props); 3090 3091 VERIFY0(spa_open(name, &spa, FTAG)); 3092 VERIFY3U(spa_version(spa), ==, version); 3093 newversion = ztest_random_spa_version(version + 1); 3094 3095 if (ztest_opts.zo_verbose >= 4) { 3096 (void) printf("upgrading spa version from " 3097 "%"PRIu64" to %"PRIu64"\n", 3098 version, newversion); 3099 } 3100 3101 spa_upgrade(spa, newversion); 3102 VERIFY3U(spa_version(spa), >, version); 3103 VERIFY3U(spa_version(spa), ==, fnvlist_lookup_uint64(spa->spa_config, 3104 zpool_prop_to_name(ZPOOL_PROP_VERSION))); 3105 spa_close(spa, FTAG); 3106 3107 kmem_strfree(name); 3108 mutex_exit(&ztest_vdev_lock); 3109 } 3110 3111 static void 3112 ztest_spa_checkpoint(spa_t *spa) 3113 { 3114 ASSERT(MUTEX_HELD(&ztest_checkpoint_lock)); 3115 3116 int error = spa_checkpoint(spa->spa_name); 3117 3118 switch (error) { 3119 case 0: 3120 case ZFS_ERR_DEVRM_IN_PROGRESS: 3121 case ZFS_ERR_DISCARDING_CHECKPOINT: 3122 case ZFS_ERR_CHECKPOINT_EXISTS: 3123 break; 3124 case ENOSPC: 3125 ztest_record_enospc(FTAG); 3126 break; 3127 default: 3128 fatal(B_FALSE, "spa_checkpoint(%s) = %d", spa->spa_name, error); 3129 } 3130 } 3131 3132 static void 3133 ztest_spa_discard_checkpoint(spa_t *spa) 3134 { 3135 ASSERT(MUTEX_HELD(&ztest_checkpoint_lock)); 3136 3137 int error = spa_checkpoint_discard(spa->spa_name); 3138 3139 switch (error) { 3140 case 0: 3141 case ZFS_ERR_DISCARDING_CHECKPOINT: 3142 case ZFS_ERR_NO_CHECKPOINT: 3143 break; 3144 default: 3145 fatal(B_FALSE, "spa_discard_checkpoint(%s) = %d", 3146 spa->spa_name, error); 3147 } 3148 3149 } 3150 3151 void 3152 ztest_spa_checkpoint_create_discard(ztest_ds_t *zd, uint64_t id) 3153 { 3154 (void) zd, (void) id; 3155 spa_t *spa = ztest_spa; 3156 3157 mutex_enter(&ztest_checkpoint_lock); 3158 if (ztest_random(2) == 0) { 3159 ztest_spa_checkpoint(spa); 3160 } else { 3161 ztest_spa_discard_checkpoint(spa); 3162 } 3163 mutex_exit(&ztest_checkpoint_lock); 3164 } 3165 3166 3167 static vdev_t * 3168 vdev_lookup_by_path(vdev_t *vd, const char *path) 3169 { 3170 vdev_t *mvd; 3171 int c; 3172 3173 if (vd->vdev_path != NULL && strcmp(path, vd->vdev_path) == 0) 3174 return (vd); 3175 3176 for (c = 0; c < vd->vdev_children; c++) 3177 if ((mvd = vdev_lookup_by_path(vd->vdev_child[c], path)) != 3178 NULL) 3179 return (mvd); 3180 3181 return (NULL); 3182 } 3183 3184 static int 3185 spa_num_top_vdevs(spa_t *spa) 3186 { 3187 vdev_t *rvd = spa->spa_root_vdev; 3188 ASSERT3U(spa_config_held(spa, SCL_VDEV, RW_READER), ==, SCL_VDEV); 3189 return (rvd->vdev_children); 3190 } 3191 3192 /* 3193 * Verify that vdev_add() works as expected. 
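 *
 * Roughly: if the pool already has slog devices, 1/4 of the calls remove
 * one of them instead of adding anything; otherwise a new top-level vdev
 * is added, and that new vdev is itself a log device about 1/4 of the
 * time.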
3194 */ 3195 void 3196 ztest_vdev_add_remove(ztest_ds_t *zd, uint64_t id) 3197 { 3198 (void) zd, (void) id; 3199 ztest_shared_t *zs = ztest_shared; 3200 spa_t *spa = ztest_spa; 3201 uint64_t leaves; 3202 uint64_t guid; 3203 nvlist_t *nvroot; 3204 int error; 3205 3206 if (ztest_opts.zo_mmp_test) 3207 return; 3208 3209 mutex_enter(&ztest_vdev_lock); 3210 leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * 3211 ztest_opts.zo_raid_children; 3212 3213 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 3214 3215 ztest_shared->zs_vdev_next_leaf = spa_num_top_vdevs(spa) * leaves; 3216 3217 /* 3218 * If we have slogs then remove them 1/4 of the time. 3219 */ 3220 if (spa_has_slogs(spa) && ztest_random(4) == 0) { 3221 metaslab_group_t *mg; 3222 3223 /* 3224 * find the first real slog in log allocation class 3225 */ 3226 mg = spa_log_class(spa)->mc_allocator[0].mca_rotor; 3227 while (!mg->mg_vd->vdev_islog) 3228 mg = mg->mg_next; 3229 3230 guid = mg->mg_vd->vdev_guid; 3231 3232 spa_config_exit(spa, SCL_VDEV, FTAG); 3233 3234 /* 3235 * We have to grab the zs_name_lock as writer to 3236 * prevent a race between removing a slog (dmu_objset_find) 3237 * and destroying a dataset. Removing the slog will 3238 * grab a reference on the dataset which may cause 3239 * dsl_destroy_head() to fail with EBUSY thus 3240 * leaving the dataset in an inconsistent state. 3241 */ 3242 pthread_rwlock_wrlock(&ztest_name_lock); 3243 error = spa_vdev_remove(spa, guid, B_FALSE); 3244 pthread_rwlock_unlock(&ztest_name_lock); 3245 3246 switch (error) { 3247 case 0: 3248 case EEXIST: /* Generic zil_reset() error */ 3249 case EBUSY: /* Replay required */ 3250 case EACCES: /* Crypto key not loaded */ 3251 case ZFS_ERR_CHECKPOINT_EXISTS: 3252 case ZFS_ERR_DISCARDING_CHECKPOINT: 3253 break; 3254 default: 3255 fatal(B_FALSE, "spa_vdev_remove() = %d", error); 3256 } 3257 } else { 3258 spa_config_exit(spa, SCL_VDEV, FTAG); 3259 3260 /* 3261 * Make 1/4 of the devices be log devices 3262 */ 3263 nvroot = make_vdev_root(NULL, NULL, NULL, 3264 ztest_opts.zo_vdev_size, 0, (ztest_random(4) == 0) ? 3265 "log" : NULL, ztest_opts.zo_raid_children, zs->zs_mirrors, 3266 1); 3267 3268 error = spa_vdev_add(spa, nvroot); 3269 fnvlist_free(nvroot); 3270 3271 switch (error) { 3272 case 0: 3273 break; 3274 case ENOSPC: 3275 ztest_record_enospc("spa_vdev_add"); 3276 break; 3277 default: 3278 fatal(B_FALSE, "spa_vdev_add() = %d", error); 3279 } 3280 } 3281 3282 mutex_exit(&ztest_vdev_lock); 3283 } 3284 3285 void 3286 ztest_vdev_class_add(ztest_ds_t *zd, uint64_t id) 3287 { 3288 (void) zd, (void) id; 3289 ztest_shared_t *zs = ztest_shared; 3290 spa_t *spa = ztest_spa; 3291 uint64_t leaves; 3292 nvlist_t *nvroot; 3293 const char *class = (ztest_random(2) == 0) ? 
3294 VDEV_ALLOC_BIAS_SPECIAL : VDEV_ALLOC_BIAS_DEDUP; 3295 int error; 3296 3297 /* 3298 * By default add a special vdev 50% of the time 3299 */ 3300 if ((ztest_opts.zo_special_vdevs == ZTEST_VDEV_CLASS_OFF) || 3301 (ztest_opts.zo_special_vdevs == ZTEST_VDEV_CLASS_RND && 3302 ztest_random(2) == 0)) { 3303 return; 3304 } 3305 3306 mutex_enter(&ztest_vdev_lock); 3307 3308 /* Only test with mirrors */ 3309 if (zs->zs_mirrors < 2) { 3310 mutex_exit(&ztest_vdev_lock); 3311 return; 3312 } 3313 3314 /* requires feature@allocation_classes */ 3315 if (!spa_feature_is_enabled(spa, SPA_FEATURE_ALLOCATION_CLASSES)) { 3316 mutex_exit(&ztest_vdev_lock); 3317 return; 3318 } 3319 3320 leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * 3321 ztest_opts.zo_raid_children; 3322 3323 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 3324 ztest_shared->zs_vdev_next_leaf = spa_num_top_vdevs(spa) * leaves; 3325 spa_config_exit(spa, SCL_VDEV, FTAG); 3326 3327 nvroot = make_vdev_root(NULL, NULL, NULL, ztest_opts.zo_vdev_size, 0, 3328 class, ztest_opts.zo_raid_children, zs->zs_mirrors, 1); 3329 3330 error = spa_vdev_add(spa, nvroot); 3331 fnvlist_free(nvroot); 3332 3333 if (error == ENOSPC) 3334 ztest_record_enospc("spa_vdev_add"); 3335 else if (error != 0) 3336 fatal(B_FALSE, "spa_vdev_add() = %d", error); 3337 3338 /* 3339 * 50% of the time allow small blocks in the special class 3340 */ 3341 if (error == 0 && 3342 spa_special_class(spa)->mc_groups == 1 && ztest_random(2) == 0) { 3343 if (ztest_opts.zo_verbose >= 3) 3344 (void) printf("Enabling special VDEV small blocks\n"); 3345 (void) ztest_dsl_prop_set_uint64(zd->zd_name, 3346 ZFS_PROP_SPECIAL_SMALL_BLOCKS, 32768, B_FALSE); 3347 } 3348 3349 mutex_exit(&ztest_vdev_lock); 3350 3351 if (ztest_opts.zo_verbose >= 3) { 3352 metaslab_class_t *mc; 3353 3354 if (strcmp(class, VDEV_ALLOC_BIAS_SPECIAL) == 0) 3355 mc = spa_special_class(spa); 3356 else 3357 mc = spa_dedup_class(spa); 3358 (void) printf("Added a %s mirrored vdev (of %d)\n", 3359 class, (int)mc->mc_groups); 3360 } 3361 } 3362 3363 /* 3364 * Verify that adding/removing aux devices (l2arc, hot spare) works as expected. 3365 */ 3366 void 3367 ztest_vdev_aux_add_remove(ztest_ds_t *zd, uint64_t id) 3368 { 3369 (void) zd, (void) id; 3370 ztest_shared_t *zs = ztest_shared; 3371 spa_t *spa = ztest_spa; 3372 vdev_t *rvd = spa->spa_root_vdev; 3373 spa_aux_vdev_t *sav; 3374 char *aux; 3375 char *path; 3376 uint64_t guid = 0; 3377 int error, ignore_err = 0; 3378 3379 if (ztest_opts.zo_mmp_test) 3380 return; 3381 3382 path = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); 3383 3384 if (ztest_random(2) == 0) { 3385 sav = &spa->spa_spares; 3386 aux = ZPOOL_CONFIG_SPARES; 3387 } else { 3388 sav = &spa->spa_l2cache; 3389 aux = ZPOOL_CONFIG_L2CACHE; 3390 } 3391 3392 mutex_enter(&ztest_vdev_lock); 3393 3394 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 3395 3396 if (sav->sav_count != 0 && ztest_random(4) == 0) { 3397 /* 3398 * Pick a random device to remove. 3399 */ 3400 vdev_t *svd = sav->sav_vdevs[ztest_random(sav->sav_count)]; 3401 3402 /* dRAID spares cannot be removed; try anyways to see ENOTSUP */ 3403 if (strstr(svd->vdev_path, VDEV_TYPE_DRAID) != NULL) 3404 ignore_err = ENOTSUP; 3405 3406 guid = svd->vdev_guid; 3407 } else { 3408 /* 3409 * Find an unused device we can add. 
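 * Candidate paths come from ztest_aux_template (combining the test
 * directory, pool name, aux class name and the zs_vdev_aux counter);
 * the counter is bumped until the generated path matches neither an
 * existing aux vdev nor anything in the main vdev tree.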
3410 */ 3411 zs->zs_vdev_aux = 0; 3412 for (;;) { 3413 int c; 3414 (void) snprintf(path, MAXPATHLEN, ztest_aux_template, 3415 ztest_opts.zo_dir, ztest_opts.zo_pool, aux, 3416 zs->zs_vdev_aux); 3417 for (c = 0; c < sav->sav_count; c++) 3418 if (strcmp(sav->sav_vdevs[c]->vdev_path, 3419 path) == 0) 3420 break; 3421 if (c == sav->sav_count && 3422 vdev_lookup_by_path(rvd, path) == NULL) 3423 break; 3424 zs->zs_vdev_aux++; 3425 } 3426 } 3427 3428 spa_config_exit(spa, SCL_VDEV, FTAG); 3429 3430 if (guid == 0) { 3431 /* 3432 * Add a new device. 3433 */ 3434 nvlist_t *nvroot = make_vdev_root(NULL, aux, NULL, 3435 (ztest_opts.zo_vdev_size * 5) / 4, 0, NULL, 0, 0, 1); 3436 error = spa_vdev_add(spa, nvroot); 3437 3438 switch (error) { 3439 case 0: 3440 break; 3441 default: 3442 fatal(B_FALSE, "spa_vdev_add(%p) = %d", nvroot, error); 3443 } 3444 fnvlist_free(nvroot); 3445 } else { 3446 /* 3447 * Remove an existing device. Sometimes, dirty its 3448 * vdev state first to make sure we handle removal 3449 * of devices that have pending state changes. 3450 */ 3451 if (ztest_random(2) == 0) 3452 (void) vdev_online(spa, guid, 0, NULL); 3453 3454 error = spa_vdev_remove(spa, guid, B_FALSE); 3455 3456 switch (error) { 3457 case 0: 3458 case EBUSY: 3459 case ZFS_ERR_CHECKPOINT_EXISTS: 3460 case ZFS_ERR_DISCARDING_CHECKPOINT: 3461 break; 3462 default: 3463 if (error != ignore_err) 3464 fatal(B_FALSE, 3465 "spa_vdev_remove(%"PRIu64") = %d", 3466 guid, error); 3467 } 3468 } 3469 3470 mutex_exit(&ztest_vdev_lock); 3471 3472 umem_free(path, MAXPATHLEN); 3473 } 3474 3475 /* 3476 * split a pool if it has mirror tlvdevs 3477 */ 3478 void 3479 ztest_split_pool(ztest_ds_t *zd, uint64_t id) 3480 { 3481 (void) zd, (void) id; 3482 ztest_shared_t *zs = ztest_shared; 3483 spa_t *spa = ztest_spa; 3484 vdev_t *rvd = spa->spa_root_vdev; 3485 nvlist_t *tree, **child, *config, *split, **schild; 3486 uint_t c, children, schildren = 0, lastlogid = 0; 3487 int error = 0; 3488 3489 if (ztest_opts.zo_mmp_test) 3490 return; 3491 3492 mutex_enter(&ztest_vdev_lock); 3493 3494 /* ensure we have a usable config; mirrors of raidz aren't supported */ 3495 if (zs->zs_mirrors < 3 || ztest_opts.zo_raid_children > 1) { 3496 mutex_exit(&ztest_vdev_lock); 3497 return; 3498 } 3499 3500 /* clean up the old pool, if any */ 3501 (void) spa_destroy("splitp"); 3502 3503 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 3504 3505 /* generate a config from the existing config */ 3506 mutex_enter(&spa->spa_props_lock); 3507 tree = fnvlist_lookup_nvlist(spa->spa_config, ZPOOL_CONFIG_VDEV_TREE); 3508 mutex_exit(&spa->spa_props_lock); 3509 3510 VERIFY0(nvlist_lookup_nvlist_array(tree, ZPOOL_CONFIG_CHILDREN, 3511 &child, &children)); 3512 3513 schild = malloc(rvd->vdev_children * sizeof (nvlist_t *)); 3514 for (c = 0; c < children; c++) { 3515 vdev_t *tvd = rvd->vdev_child[c]; 3516 nvlist_t **mchild; 3517 uint_t mchildren; 3518 3519 if (tvd->vdev_islog || tvd->vdev_ops == &vdev_hole_ops) { 3520 schild[schildren] = fnvlist_alloc(); 3521 fnvlist_add_string(schild[schildren], 3522 ZPOOL_CONFIG_TYPE, VDEV_TYPE_HOLE); 3523 fnvlist_add_uint64(schild[schildren], 3524 ZPOOL_CONFIG_IS_HOLE, 1); 3525 if (lastlogid == 0) 3526 lastlogid = schildren; 3527 ++schildren; 3528 continue; 3529 } 3530 lastlogid = 0; 3531 VERIFY0(nvlist_lookup_nvlist_array(child[c], 3532 ZPOOL_CONFIG_CHILDREN, &mchild, &mchildren)); 3533 schild[schildren++] = fnvlist_dup(mchild[0]); 3534 } 3535 3536 /* OK, create a config that can be used to split */ 3537 split = fnvlist_alloc(); 3538 
fnvlist_add_string(split, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT); 3539 fnvlist_add_nvlist_array(split, ZPOOL_CONFIG_CHILDREN, 3540 (const nvlist_t **)schild, lastlogid != 0 ? lastlogid : schildren); 3541 3542 config = fnvlist_alloc(); 3543 fnvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, split); 3544 3545 for (c = 0; c < schildren; c++) 3546 fnvlist_free(schild[c]); 3547 free(schild); 3548 fnvlist_free(split); 3549 3550 spa_config_exit(spa, SCL_VDEV, FTAG); 3551 3552 (void) pthread_rwlock_wrlock(&ztest_name_lock); 3553 error = spa_vdev_split_mirror(spa, "splitp", config, NULL, B_FALSE); 3554 (void) pthread_rwlock_unlock(&ztest_name_lock); 3555 3556 fnvlist_free(config); 3557 3558 if (error == 0) { 3559 (void) printf("successful split - results:\n"); 3560 mutex_enter(&spa_namespace_lock); 3561 show_pool_stats(spa); 3562 show_pool_stats(spa_lookup("splitp")); 3563 mutex_exit(&spa_namespace_lock); 3564 ++zs->zs_splits; 3565 --zs->zs_mirrors; 3566 } 3567 mutex_exit(&ztest_vdev_lock); 3568 } 3569 3570 /* 3571 * Verify that we can attach and detach devices. 3572 */ 3573 void 3574 ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id) 3575 { 3576 (void) zd, (void) id; 3577 ztest_shared_t *zs = ztest_shared; 3578 spa_t *spa = ztest_spa; 3579 spa_aux_vdev_t *sav = &spa->spa_spares; 3580 vdev_t *rvd = spa->spa_root_vdev; 3581 vdev_t *oldvd, *newvd, *pvd; 3582 nvlist_t *root; 3583 uint64_t leaves; 3584 uint64_t leaf, top; 3585 uint64_t ashift = ztest_get_ashift(); 3586 uint64_t oldguid, pguid; 3587 uint64_t oldsize, newsize; 3588 char *oldpath, *newpath; 3589 int replacing; 3590 int oldvd_has_siblings = B_FALSE; 3591 int newvd_is_spare = B_FALSE; 3592 int newvd_is_dspare = B_FALSE; 3593 int oldvd_is_log; 3594 int error, expected_error; 3595 3596 if (ztest_opts.zo_mmp_test) 3597 return; 3598 3599 oldpath = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); 3600 newpath = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); 3601 3602 mutex_enter(&ztest_vdev_lock); 3603 leaves = MAX(zs->zs_mirrors, 1) * ztest_opts.zo_raid_children; 3604 3605 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3606 3607 /* 3608 * If a vdev is in the process of being removed, its removal may 3609 * finish while we are in progress, leading to an unexpected error 3610 * value. Don't bother trying to attach while we are in the middle 3611 * of removal. 3612 */ 3613 if (ztest_device_removal_active) { 3614 spa_config_exit(spa, SCL_ALL, FTAG); 3615 goto out; 3616 } 3617 3618 /* 3619 * Decide whether to do an attach or a replace. 3620 */ 3621 replacing = ztest_random(2); 3622 3623 /* 3624 * Pick a random top-level vdev. 3625 */ 3626 top = ztest_random_vdev_top(spa, B_TRUE); 3627 3628 /* 3629 * Pick a random leaf within it. 3630 */ 3631 leaf = ztest_random(leaves); 3632 3633 /* 3634 * Locate this vdev. 
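 * Starting from the chosen top-level vdev, descend through the optional
 * mirror and raidz/draid layers, then through whatever interposed vdev an
 * in-flight attach or replace may have left, until oldvd is a leaf.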
3635 */ 3636 oldvd = rvd->vdev_child[top]; 3637 3638 /* pick a child from the mirror */ 3639 if (zs->zs_mirrors >= 1) { 3640 ASSERT3P(oldvd->vdev_ops, ==, &vdev_mirror_ops); 3641 ASSERT3U(oldvd->vdev_children, >=, zs->zs_mirrors); 3642 oldvd = oldvd->vdev_child[leaf / ztest_opts.zo_raid_children]; 3643 } 3644 3645 /* pick a child out of the raidz group */ 3646 if (ztest_opts.zo_raid_children > 1) { 3647 if (strcmp(oldvd->vdev_ops->vdev_op_type, "raidz") == 0) 3648 ASSERT3P(oldvd->vdev_ops, ==, &vdev_raidz_ops); 3649 else 3650 ASSERT3P(oldvd->vdev_ops, ==, &vdev_draid_ops); 3651 ASSERT3U(oldvd->vdev_children, ==, ztest_opts.zo_raid_children); 3652 oldvd = oldvd->vdev_child[leaf % ztest_opts.zo_raid_children]; 3653 } 3654 3655 /* 3656 * If we're already doing an attach or replace, oldvd may be a 3657 * mirror vdev -- in which case, pick a random child. 3658 */ 3659 while (oldvd->vdev_children != 0) { 3660 oldvd_has_siblings = B_TRUE; 3661 ASSERT3U(oldvd->vdev_children, >=, 2); 3662 oldvd = oldvd->vdev_child[ztest_random(oldvd->vdev_children)]; 3663 } 3664 3665 oldguid = oldvd->vdev_guid; 3666 oldsize = vdev_get_min_asize(oldvd); 3667 oldvd_is_log = oldvd->vdev_top->vdev_islog; 3668 (void) strcpy(oldpath, oldvd->vdev_path); 3669 pvd = oldvd->vdev_parent; 3670 pguid = pvd->vdev_guid; 3671 3672 /* 3673 * If oldvd has siblings, then half of the time, detach it. Prior 3674 * to the detach the pool is scrubbed in order to prevent creating 3675 * unrepairable blocks as a result of the data corruption injection. 3676 */ 3677 if (oldvd_has_siblings && ztest_random(2) == 0) { 3678 spa_config_exit(spa, SCL_ALL, FTAG); 3679 3680 error = ztest_scrub_impl(spa); 3681 if (error) 3682 goto out; 3683 3684 error = spa_vdev_detach(spa, oldguid, pguid, B_FALSE); 3685 if (error != 0 && error != ENODEV && error != EBUSY && 3686 error != ENOTSUP && error != ZFS_ERR_CHECKPOINT_EXISTS && 3687 error != ZFS_ERR_DISCARDING_CHECKPOINT) 3688 fatal(B_FALSE, "detach (%s) returned %d", 3689 oldpath, error); 3690 goto out; 3691 } 3692 3693 /* 3694 * For the new vdev, choose with equal probability between the two 3695 * standard paths (ending in either 'a' or 'b') or a random hot spare. 3696 */ 3697 if (sav->sav_count != 0 && ztest_random(3) == 0) { 3698 newvd = sav->sav_vdevs[ztest_random(sav->sav_count)]; 3699 newvd_is_spare = B_TRUE; 3700 3701 if (newvd->vdev_ops == &vdev_draid_spare_ops) 3702 newvd_is_dspare = B_TRUE; 3703 3704 (void) strcpy(newpath, newvd->vdev_path); 3705 } else { 3706 (void) snprintf(newpath, MAXPATHLEN, ztest_dev_template, 3707 ztest_opts.zo_dir, ztest_opts.zo_pool, 3708 top * leaves + leaf); 3709 if (ztest_random(2) == 0) 3710 newpath[strlen(newpath) - 1] = 'b'; 3711 newvd = vdev_lookup_by_path(rvd, newpath); 3712 } 3713 3714 if (newvd) { 3715 /* 3716 * Reopen to ensure the vdev's asize field isn't stale. 3717 */ 3718 vdev_reopen(newvd); 3719 newsize = vdev_get_min_asize(newvd); 3720 } else { 3721 /* 3722 * Make newsize a little bigger or smaller than oldsize. 3723 * If it's smaller, the attach should fail. 3724 * If it's larger, and we're doing a replace, 3725 * we should get dynamic LUN growth when we're done. 3726 */ 3727 newsize = 10 * oldsize / (9 + ztest_random(3)); 3728 } 3729 3730 /* 3731 * If pvd is not a mirror or root, the attach should fail with ENOTSUP, 3732 * unless it's a replace; in that case any non-replacing parent is OK. 3733 * 3734 * If newvd is already part of the pool, it should fail with EBUSY. 3735 * 3736 * If newvd is too small, it should fail with EOVERFLOW. 
3737 * 3738 * If newvd is a distributed spare and it's being attached to a 3739 * dRAID which is not its parent it should fail with EINVAL. 3740 */ 3741 if (pvd->vdev_ops != &vdev_mirror_ops && 3742 pvd->vdev_ops != &vdev_root_ops && (!replacing || 3743 pvd->vdev_ops == &vdev_replacing_ops || 3744 pvd->vdev_ops == &vdev_spare_ops)) 3745 expected_error = ENOTSUP; 3746 else if (newvd_is_spare && (!replacing || oldvd_is_log)) 3747 expected_error = ENOTSUP; 3748 else if (newvd == oldvd) 3749 expected_error = replacing ? 0 : EBUSY; 3750 else if (vdev_lookup_by_path(rvd, newpath) != NULL) 3751 expected_error = EBUSY; 3752 else if (!newvd_is_dspare && newsize < oldsize) 3753 expected_error = EOVERFLOW; 3754 else if (ashift > oldvd->vdev_top->vdev_ashift) 3755 expected_error = EDOM; 3756 else if (newvd_is_dspare && pvd != vdev_draid_spare_get_parent(newvd)) 3757 expected_error = ENOTSUP; 3758 else 3759 expected_error = 0; 3760 3761 spa_config_exit(spa, SCL_ALL, FTAG); 3762 3763 /* 3764 * Build the nvlist describing newpath. 3765 */ 3766 root = make_vdev_root(newpath, NULL, NULL, newvd == NULL ? newsize : 0, 3767 ashift, NULL, 0, 0, 1); 3768 3769 /* 3770 * When supported select either a healing or sequential resilver. 3771 */ 3772 boolean_t rebuilding = B_FALSE; 3773 if (pvd->vdev_ops == &vdev_mirror_ops || 3774 pvd->vdev_ops == &vdev_root_ops) { 3775 rebuilding = !!ztest_random(2); 3776 } 3777 3778 error = spa_vdev_attach(spa, oldguid, root, replacing, rebuilding); 3779 3780 fnvlist_free(root); 3781 3782 /* 3783 * If our parent was the replacing vdev, but the replace completed, 3784 * then instead of failing with ENOTSUP we may either succeed, 3785 * fail with ENODEV, or fail with EOVERFLOW. 3786 */ 3787 if (expected_error == ENOTSUP && 3788 (error == 0 || error == ENODEV || error == EOVERFLOW)) 3789 expected_error = error; 3790 3791 /* 3792 * If someone grew the LUN, the replacement may be too small. 3793 */ 3794 if (error == EOVERFLOW || error == EBUSY) 3795 expected_error = error; 3796 3797 if (error == ZFS_ERR_CHECKPOINT_EXISTS || 3798 error == ZFS_ERR_DISCARDING_CHECKPOINT || 3799 error == ZFS_ERR_RESILVER_IN_PROGRESS || 3800 error == ZFS_ERR_REBUILD_IN_PROGRESS) 3801 expected_error = error; 3802 3803 if (error != expected_error && expected_error != EBUSY) { 3804 fatal(B_FALSE, "attach (%s %"PRIu64", %s %"PRIu64", %d) " 3805 "returned %d, expected %d", 3806 oldpath, oldsize, newpath, 3807 newsize, replacing, error, expected_error); 3808 } 3809 out: 3810 mutex_exit(&ztest_vdev_lock); 3811 3812 umem_free(oldpath, MAXPATHLEN); 3813 umem_free(newpath, MAXPATHLEN); 3814 } 3815 3816 void 3817 ztest_device_removal(ztest_ds_t *zd, uint64_t id) 3818 { 3819 (void) zd, (void) id; 3820 spa_t *spa = ztest_spa; 3821 vdev_t *vd; 3822 uint64_t guid; 3823 int error; 3824 3825 mutex_enter(&ztest_vdev_lock); 3826 3827 if (ztest_device_removal_active) { 3828 mutex_exit(&ztest_vdev_lock); 3829 return; 3830 } 3831 3832 /* 3833 * Remove a random top-level vdev and wait for removal to finish. 3834 */ 3835 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 3836 vd = vdev_lookup_top(spa, ztest_random_vdev_top(spa, B_FALSE)); 3837 guid = vd->vdev_guid; 3838 spa_config_exit(spa, SCL_VDEV, FTAG); 3839 3840 error = spa_vdev_remove(spa, guid, B_FALSE); 3841 if (error == 0) { 3842 ztest_device_removal_active = B_TRUE; 3843 mutex_exit(&ztest_vdev_lock); 3844 3845 /* 3846 * spa->spa_vdev_removal is created in a sync task that 3847 * is initiated via dsl_sync_task_nowait(). 
Since the 3848 * task may not run before spa_vdev_remove() returns, we 3849 * must wait at least 1 txg to ensure that the removal 3850 * struct has been created. 3851 */ 3852 txg_wait_synced(spa_get_dsl(spa), 0); 3853 3854 while (spa->spa_removing_phys.sr_state == DSS_SCANNING) 3855 txg_wait_synced(spa_get_dsl(spa), 0); 3856 } else { 3857 mutex_exit(&ztest_vdev_lock); 3858 return; 3859 } 3860 3861 /* 3862 * The pool needs to be scrubbed after completing device removal. 3863 * Failure to do so may result in checksum errors due to the 3864 * strategy employed by ztest_fault_inject() when selecting which 3865 * offset are redundant and can be damaged. 3866 */ 3867 error = spa_scan(spa, POOL_SCAN_SCRUB); 3868 if (error == 0) { 3869 while (dsl_scan_scrubbing(spa_get_dsl(spa))) 3870 txg_wait_synced(spa_get_dsl(spa), 0); 3871 } 3872 3873 mutex_enter(&ztest_vdev_lock); 3874 ztest_device_removal_active = B_FALSE; 3875 mutex_exit(&ztest_vdev_lock); 3876 } 3877 3878 /* 3879 * Callback function which expands the physical size of the vdev. 3880 */ 3881 static vdev_t * 3882 grow_vdev(vdev_t *vd, void *arg) 3883 { 3884 spa_t *spa __maybe_unused = vd->vdev_spa; 3885 size_t *newsize = arg; 3886 size_t fsize; 3887 int fd; 3888 3889 ASSERT3S(spa_config_held(spa, SCL_STATE, RW_READER), ==, SCL_STATE); 3890 ASSERT(vd->vdev_ops->vdev_op_leaf); 3891 3892 if ((fd = open(vd->vdev_path, O_RDWR)) == -1) 3893 return (vd); 3894 3895 fsize = lseek(fd, 0, SEEK_END); 3896 VERIFY0(ftruncate(fd, *newsize)); 3897 3898 if (ztest_opts.zo_verbose >= 6) { 3899 (void) printf("%s grew from %lu to %lu bytes\n", 3900 vd->vdev_path, (ulong_t)fsize, (ulong_t)*newsize); 3901 } 3902 (void) close(fd); 3903 return (NULL); 3904 } 3905 3906 /* 3907 * Callback function which expands a given vdev by calling vdev_online(). 3908 */ 3909 static vdev_t * 3910 online_vdev(vdev_t *vd, void *arg) 3911 { 3912 (void) arg; 3913 spa_t *spa = vd->vdev_spa; 3914 vdev_t *tvd = vd->vdev_top; 3915 uint64_t guid = vd->vdev_guid; 3916 uint64_t generation = spa->spa_config_generation + 1; 3917 vdev_state_t newstate = VDEV_STATE_UNKNOWN; 3918 int error; 3919 3920 ASSERT3S(spa_config_held(spa, SCL_STATE, RW_READER), ==, SCL_STATE); 3921 ASSERT(vd->vdev_ops->vdev_op_leaf); 3922 3923 /* Calling vdev_online will initialize the new metaslabs */ 3924 spa_config_exit(spa, SCL_STATE, spa); 3925 error = vdev_online(spa, guid, ZFS_ONLINE_EXPAND, &newstate); 3926 spa_config_enter(spa, SCL_STATE, spa, RW_READER); 3927 3928 /* 3929 * If vdev_online returned an error or the underlying vdev_open 3930 * failed then we abort the expand. The only way to know that 3931 * vdev_open fails is by checking the returned newstate. 3932 */ 3933 if (error || newstate != VDEV_STATE_HEALTHY) { 3934 if (ztest_opts.zo_verbose >= 5) { 3935 (void) printf("Unable to expand vdev, state %u, " 3936 "error %d\n", newstate, error); 3937 } 3938 return (vd); 3939 } 3940 ASSERT3U(newstate, ==, VDEV_STATE_HEALTHY); 3941 3942 /* 3943 * Since we dropped the lock we need to ensure that we're 3944 * still talking to the original vdev. It's possible this 3945 * vdev may have been detached/replaced while we were 3946 * trying to online it. 
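 * (The generation sampled above is the pre-drop value plus one, i.e. the
 * single bump our own vdev_online() is expected to cause; any other value
 * means the config changed underneath us and vd may be stale.)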
3947 */ 3948 if (generation != spa->spa_config_generation) { 3949 if (ztest_opts.zo_verbose >= 5) { 3950 (void) printf("vdev configuration has changed, " 3951 "guid %"PRIu64", state %"PRIu64", " 3952 "expected gen %"PRIu64", got gen %"PRIu64"\n", 3953 guid, 3954 tvd->vdev_state, 3955 generation, 3956 spa->spa_config_generation); 3957 } 3958 return (vd); 3959 } 3960 return (NULL); 3961 } 3962 3963 /* 3964 * Traverse the vdev tree calling the supplied function. 3965 * We continue to walk the tree until we either have walked all 3966 * children or we receive a non-NULL return from the callback. 3967 * If a NULL callback is passed, then we just return back the first 3968 * leaf vdev we encounter. 3969 */ 3970 static vdev_t * 3971 vdev_walk_tree(vdev_t *vd, vdev_t *(*func)(vdev_t *, void *), void *arg) 3972 { 3973 uint_t c; 3974 3975 if (vd->vdev_ops->vdev_op_leaf) { 3976 if (func == NULL) 3977 return (vd); 3978 else 3979 return (func(vd, arg)); 3980 } 3981 3982 for (c = 0; c < vd->vdev_children; c++) { 3983 vdev_t *cvd = vd->vdev_child[c]; 3984 if ((cvd = vdev_walk_tree(cvd, func, arg)) != NULL) 3985 return (cvd); 3986 } 3987 return (NULL); 3988 } 3989 3990 /* 3991 * Verify that dynamic LUN growth works as expected. 3992 */ 3993 void 3994 ztest_vdev_LUN_growth(ztest_ds_t *zd, uint64_t id) 3995 { 3996 (void) zd, (void) id; 3997 spa_t *spa = ztest_spa; 3998 vdev_t *vd, *tvd; 3999 metaslab_class_t *mc; 4000 metaslab_group_t *mg; 4001 size_t psize, newsize; 4002 uint64_t top; 4003 uint64_t old_class_space, new_class_space, old_ms_count, new_ms_count; 4004 4005 mutex_enter(&ztest_checkpoint_lock); 4006 mutex_enter(&ztest_vdev_lock); 4007 spa_config_enter(spa, SCL_STATE, spa, RW_READER); 4008 4009 /* 4010 * If there is a vdev removal in progress, it could complete while 4011 * we are running, in which case we would not be able to verify 4012 * that the metaslab_class space increased (because it decreases 4013 * when the device removal completes). 4014 */ 4015 if (ztest_device_removal_active) { 4016 spa_config_exit(spa, SCL_STATE, spa); 4017 mutex_exit(&ztest_vdev_lock); 4018 mutex_exit(&ztest_checkpoint_lock); 4019 return; 4020 } 4021 4022 top = ztest_random_vdev_top(spa, B_TRUE); 4023 4024 tvd = spa->spa_root_vdev->vdev_child[top]; 4025 mg = tvd->vdev_mg; 4026 mc = mg->mg_class; 4027 old_ms_count = tvd->vdev_ms_count; 4028 old_class_space = metaslab_class_get_space(mc); 4029 4030 /* 4031 * Determine the size of the first leaf vdev associated with 4032 * our top-level device. 4033 */ 4034 vd = vdev_walk_tree(tvd, NULL, NULL); 4035 ASSERT3P(vd, !=, NULL); 4036 ASSERT(vd->vdev_ops->vdev_op_leaf); 4037 4038 psize = vd->vdev_psize; 4039 4040 /* 4041 * We only try to expand the vdev if it's healthy, less than 4x its 4042 * original size, and it has a valid psize. 4043 */ 4044 if (tvd->vdev_state != VDEV_STATE_HEALTHY || 4045 psize == 0 || psize >= 4 * ztest_opts.zo_vdev_size) { 4046 spa_config_exit(spa, SCL_STATE, spa); 4047 mutex_exit(&ztest_vdev_lock); 4048 mutex_exit(&ztest_checkpoint_lock); 4049 return; 4050 } 4051 ASSERT3U(psize, >, 0); 4052 newsize = psize + MAX(psize / 8, SPA_MAXBLOCKSIZE); 4053 ASSERT3U(newsize, >, psize); 4054 4055 if (ztest_opts.zo_verbose >= 6) { 4056 (void) printf("Expanding LUN %s from %lu to %lu\n", 4057 vd->vdev_path, (ulong_t)psize, (ulong_t)newsize); 4058 } 4059 4060 /* 4061 * Growing the vdev is a two step process: 4062 * 1). expand the physical size (i.e. relabel) 4063 * 2). 
online the vdev to create the new metaslabs 4064 */ 4065 if (vdev_walk_tree(tvd, grow_vdev, &newsize) != NULL || 4066 vdev_walk_tree(tvd, online_vdev, NULL) != NULL || 4067 tvd->vdev_state != VDEV_STATE_HEALTHY) { 4068 if (ztest_opts.zo_verbose >= 5) { 4069 (void) printf("Could not expand LUN because " 4070 "the vdev configuration changed.\n"); 4071 } 4072 spa_config_exit(spa, SCL_STATE, spa); 4073 mutex_exit(&ztest_vdev_lock); 4074 mutex_exit(&ztest_checkpoint_lock); 4075 return; 4076 } 4077 4078 spa_config_exit(spa, SCL_STATE, spa); 4079 4080 /* 4081 * Expanding the LUN will update the config asynchronously, 4082 * thus we must wait for the async thread to complete any 4083 * pending tasks before proceeding. 4084 */ 4085 for (;;) { 4086 boolean_t done; 4087 mutex_enter(&spa->spa_async_lock); 4088 done = (spa->spa_async_thread == NULL && !spa->spa_async_tasks); 4089 mutex_exit(&spa->spa_async_lock); 4090 if (done) 4091 break; 4092 txg_wait_synced(spa_get_dsl(spa), 0); 4093 (void) poll(NULL, 0, 100); 4094 } 4095 4096 spa_config_enter(spa, SCL_STATE, spa, RW_READER); 4097 4098 tvd = spa->spa_root_vdev->vdev_child[top]; 4099 new_ms_count = tvd->vdev_ms_count; 4100 new_class_space = metaslab_class_get_space(mc); 4101 4102 if (tvd->vdev_mg != mg || mg->mg_class != mc) { 4103 if (ztest_opts.zo_verbose >= 5) { 4104 (void) printf("Could not verify LUN expansion due to " 4105 "intervening vdev offline or remove.\n"); 4106 } 4107 spa_config_exit(spa, SCL_STATE, spa); 4108 mutex_exit(&ztest_vdev_lock); 4109 mutex_exit(&ztest_checkpoint_lock); 4110 return; 4111 } 4112 4113 /* 4114 * Make sure we were able to grow the vdev. 4115 */ 4116 if (new_ms_count <= old_ms_count) { 4117 fatal(B_FALSE, 4118 "LUN expansion failed: ms_count %"PRIu64" < %"PRIu64"\n", 4119 old_ms_count, new_ms_count); 4120 } 4121 4122 /* 4123 * Make sure we were able to grow the pool. 4124 */ 4125 if (new_class_space <= old_class_space) { 4126 fatal(B_FALSE, 4127 "LUN expansion failed: class_space %"PRIu64" < %"PRIu64"\n", 4128 old_class_space, new_class_space); 4129 } 4130 4131 if (ztest_opts.zo_verbose >= 5) { 4132 char oldnumbuf[NN_NUMBUF_SZ], newnumbuf[NN_NUMBUF_SZ]; 4133 4134 nicenum(old_class_space, oldnumbuf, sizeof (oldnumbuf)); 4135 nicenum(new_class_space, newnumbuf, sizeof (newnumbuf)); 4136 (void) printf("%s grew from %s to %s\n", 4137 spa->spa_name, oldnumbuf, newnumbuf); 4138 } 4139 4140 spa_config_exit(spa, SCL_STATE, spa); 4141 mutex_exit(&ztest_vdev_lock); 4142 mutex_exit(&ztest_checkpoint_lock); 4143 } 4144 4145 /* 4146 * Verify that dmu_objset_{create,destroy,open,close} work as expected. 4147 */ 4148 static void 4149 ztest_objset_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx) 4150 { 4151 (void) arg, (void) cr; 4152 4153 /* 4154 * Create the objects common to all ztest datasets. 4155 */ 4156 VERIFY0(zap_create_claim(os, ZTEST_DIROBJ, 4157 DMU_OT_ZAP_OTHER, DMU_OT_NONE, 0, tx)); 4158 } 4159 4160 static int 4161 ztest_dataset_create(char *dsname) 4162 { 4163 int err; 4164 uint64_t rand; 4165 dsl_crypto_params_t *dcp = NULL; 4166 4167 /* 4168 * 50% of the time, we create encrypted datasets 4169 * using a random cipher suite and a hard-coded 4170 * wrapping key. 
4171 */ 4172 rand = ztest_random(2); 4173 if (rand != 0) { 4174 nvlist_t *crypto_args = fnvlist_alloc(); 4175 nvlist_t *props = fnvlist_alloc(); 4176 4177 /* slight bias towards the default cipher suite */ 4178 rand = ztest_random(ZIO_CRYPT_FUNCTIONS); 4179 if (rand < ZIO_CRYPT_AES_128_CCM) 4180 rand = ZIO_CRYPT_ON; 4181 4182 fnvlist_add_uint64(props, 4183 zfs_prop_to_name(ZFS_PROP_ENCRYPTION), rand); 4184 fnvlist_add_uint8_array(crypto_args, "wkeydata", 4185 (uint8_t *)ztest_wkeydata, WRAPPING_KEY_LEN); 4186 4187 /* 4188 * These parameters aren't really used by the kernel. They 4189 * are simply stored so that userspace knows how to load 4190 * the wrapping key. 4191 */ 4192 fnvlist_add_uint64(props, 4193 zfs_prop_to_name(ZFS_PROP_KEYFORMAT), ZFS_KEYFORMAT_RAW); 4194 fnvlist_add_string(props, 4195 zfs_prop_to_name(ZFS_PROP_KEYLOCATION), "prompt"); 4196 fnvlist_add_uint64(props, 4197 zfs_prop_to_name(ZFS_PROP_PBKDF2_SALT), 0ULL); 4198 fnvlist_add_uint64(props, 4199 zfs_prop_to_name(ZFS_PROP_PBKDF2_ITERS), 0ULL); 4200 4201 VERIFY0(dsl_crypto_params_create_nvlist(DCP_CMD_NONE, props, 4202 crypto_args, &dcp)); 4203 4204 /* 4205 * Cycle through all available encryption implementations 4206 * to verify interoperability. 4207 */ 4208 VERIFY0(gcm_impl_set("cycle")); 4209 VERIFY0(aes_impl_set("cycle")); 4210 4211 fnvlist_free(crypto_args); 4212 fnvlist_free(props); 4213 } 4214 4215 err = dmu_objset_create(dsname, DMU_OST_OTHER, 0, dcp, 4216 ztest_objset_create_cb, NULL); 4217 dsl_crypto_params_free(dcp, !!err); 4218 4219 rand = ztest_random(100); 4220 if (err || rand < 80) 4221 return (err); 4222 4223 if (ztest_opts.zo_verbose >= 5) 4224 (void) printf("Setting dataset %s to sync always\n", dsname); 4225 return (ztest_dsl_prop_set_uint64(dsname, ZFS_PROP_SYNC, 4226 ZFS_SYNC_ALWAYS, B_FALSE)); 4227 } 4228 4229 static int 4230 ztest_objset_destroy_cb(const char *name, void *arg) 4231 { 4232 (void) arg; 4233 objset_t *os; 4234 dmu_object_info_t doi; 4235 int error; 4236 4237 /* 4238 * Verify that the dataset contains a directory object. 4239 */ 4240 VERIFY0(ztest_dmu_objset_own(name, DMU_OST_OTHER, B_TRUE, 4241 B_TRUE, FTAG, &os)); 4242 error = dmu_object_info(os, ZTEST_DIROBJ, &doi); 4243 if (error != ENOENT) { 4244 /* We could have crashed in the middle of destroying it */ 4245 ASSERT0(error); 4246 ASSERT3U(doi.doi_type, ==, DMU_OT_ZAP_OTHER); 4247 ASSERT3S(doi.doi_physical_blocks_512, >=, 0); 4248 } 4249 dmu_objset_disown(os, B_TRUE, FTAG); 4250 4251 /* 4252 * Destroy the dataset. 4253 */ 4254 if (strchr(name, '@') != NULL) { 4255 error = dsl_destroy_snapshot(name, B_TRUE); 4256 if (error != ECHRNG) { 4257 /* 4258 * The program was executed, but encountered a runtime 4259 * error, such as insufficient slop, or a hold on the 4260 * dataset. 
4261 */ 4262 ASSERT0(error); 4263 } 4264 } else { 4265 error = dsl_destroy_head(name); 4266 if (error == ENOSPC) { 4267 /* There could be checkpoint or insufficient slop */ 4268 ztest_record_enospc(FTAG); 4269 } else if (error != EBUSY) { 4270 /* There could be a hold on this dataset */ 4271 ASSERT0(error); 4272 } 4273 } 4274 return (0); 4275 } 4276 4277 static boolean_t 4278 ztest_snapshot_create(char *osname, uint64_t id) 4279 { 4280 char snapname[ZFS_MAX_DATASET_NAME_LEN]; 4281 int error; 4282 4283 (void) snprintf(snapname, sizeof (snapname), "%"PRIu64"", id); 4284 4285 error = dmu_objset_snapshot_one(osname, snapname); 4286 if (error == ENOSPC) { 4287 ztest_record_enospc(FTAG); 4288 return (B_FALSE); 4289 } 4290 if (error != 0 && error != EEXIST) { 4291 fatal(B_FALSE, "ztest_snapshot_create(%s@%s) = %d", osname, 4292 snapname, error); 4293 } 4294 return (B_TRUE); 4295 } 4296 4297 static boolean_t 4298 ztest_snapshot_destroy(char *osname, uint64_t id) 4299 { 4300 char snapname[ZFS_MAX_DATASET_NAME_LEN]; 4301 int error; 4302 4303 (void) snprintf(snapname, sizeof (snapname), "%s@%"PRIu64"", 4304 osname, id); 4305 4306 error = dsl_destroy_snapshot(snapname, B_FALSE); 4307 if (error != 0 && error != ENOENT) 4308 fatal(B_FALSE, "ztest_snapshot_destroy(%s) = %d", 4309 snapname, error); 4310 return (B_TRUE); 4311 } 4312 4313 void 4314 ztest_dmu_objset_create_destroy(ztest_ds_t *zd, uint64_t id) 4315 { 4316 (void) zd; 4317 ztest_ds_t *zdtmp; 4318 int iters; 4319 int error; 4320 objset_t *os, *os2; 4321 char name[ZFS_MAX_DATASET_NAME_LEN]; 4322 zilog_t *zilog; 4323 int i; 4324 4325 zdtmp = umem_alloc(sizeof (ztest_ds_t), UMEM_NOFAIL); 4326 4327 (void) pthread_rwlock_rdlock(&ztest_name_lock); 4328 4329 (void) snprintf(name, sizeof (name), "%s/temp_%"PRIu64"", 4330 ztest_opts.zo_pool, id); 4331 4332 /* 4333 * If this dataset exists from a previous run, process its replay log 4334 * half of the time. If we don't replay it, then dsl_destroy_head() 4335 * (invoked from ztest_objset_destroy_cb()) should just throw it away. 4336 */ 4337 if (ztest_random(2) == 0 && 4338 ztest_dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, 4339 B_TRUE, FTAG, &os) == 0) { 4340 ztest_zd_init(zdtmp, NULL, os); 4341 zil_replay(os, zdtmp, ztest_replay_vector); 4342 ztest_zd_fini(zdtmp); 4343 dmu_objset_disown(os, B_TRUE, FTAG); 4344 } 4345 4346 /* 4347 * There may be an old instance of the dataset we're about to 4348 * create lying around from a previous run. If so, destroy it 4349 * and all of its snapshots. 4350 */ 4351 (void) dmu_objset_find(name, ztest_objset_destroy_cb, NULL, 4352 DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS); 4353 4354 /* 4355 * Verify that the destroyed dataset is no longer in the namespace. 4356 */ 4357 VERIFY3U(ENOENT, ==, ztest_dmu_objset_own(name, DMU_OST_OTHER, B_TRUE, 4358 B_TRUE, FTAG, &os)); 4359 4360 /* 4361 * Verify that we can create a new dataset. 4362 */ 4363 error = ztest_dataset_create(name); 4364 if (error) { 4365 if (error == ENOSPC) { 4366 ztest_record_enospc(FTAG); 4367 goto out; 4368 } 4369 fatal(B_FALSE, "dmu_objset_create(%s) = %d", name, error); 4370 } 4371 4372 VERIFY0(ztest_dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, B_TRUE, 4373 FTAG, &os)); 4374 4375 ztest_zd_init(zdtmp, NULL, os); 4376 4377 /* 4378 * Open the intent log for it. 4379 */ 4380 zilog = zil_open(os, ztest_get_data); 4381 4382 /* 4383 * Put some objects in there, do a little I/O to them, 4384 * and randomly take a couple of snapshots along the way. 
4385 */ 4386 iters = ztest_random(5); 4387 for (i = 0; i < iters; i++) { 4388 ztest_dmu_object_alloc_free(zdtmp, id); 4389 if (ztest_random(iters) == 0) 4390 (void) ztest_snapshot_create(name, i); 4391 } 4392 4393 /* 4394 * Verify that we cannot create an existing dataset. 4395 */ 4396 VERIFY3U(EEXIST, ==, 4397 dmu_objset_create(name, DMU_OST_OTHER, 0, NULL, NULL, NULL)); 4398 4399 /* 4400 * Verify that we can hold an objset that is also owned. 4401 */ 4402 VERIFY0(dmu_objset_hold(name, FTAG, &os2)); 4403 dmu_objset_rele(os2, FTAG); 4404 4405 /* 4406 * Verify that we cannot own an objset that is already owned. 4407 */ 4408 VERIFY3U(EBUSY, ==, ztest_dmu_objset_own(name, DMU_OST_OTHER, 4409 B_FALSE, B_TRUE, FTAG, &os2)); 4410 4411 zil_close(zilog); 4412 dmu_objset_disown(os, B_TRUE, FTAG); 4413 ztest_zd_fini(zdtmp); 4414 out: 4415 (void) pthread_rwlock_unlock(&ztest_name_lock); 4416 4417 umem_free(zdtmp, sizeof (ztest_ds_t)); 4418 } 4419 4420 /* 4421 * Verify that dmu_snapshot_{create,destroy,open,close} work as expected. 4422 */ 4423 void 4424 ztest_dmu_snapshot_create_destroy(ztest_ds_t *zd, uint64_t id) 4425 { 4426 (void) pthread_rwlock_rdlock(&ztest_name_lock); 4427 (void) ztest_snapshot_destroy(zd->zd_name, id); 4428 (void) ztest_snapshot_create(zd->zd_name, id); 4429 (void) pthread_rwlock_unlock(&ztest_name_lock); 4430 } 4431 4432 /* 4433 * Cleanup non-standard snapshots and clones. 4434 */ 4435 static void 4436 ztest_dsl_dataset_cleanup(char *osname, uint64_t id) 4437 { 4438 char *snap1name; 4439 char *clone1name; 4440 char *snap2name; 4441 char *clone2name; 4442 char *snap3name; 4443 int error; 4444 4445 snap1name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4446 clone1name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4447 snap2name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4448 clone2name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4449 snap3name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4450 4451 (void) snprintf(snap1name, ZFS_MAX_DATASET_NAME_LEN, "%s@s1_%"PRIu64"", 4452 osname, id); 4453 (void) snprintf(clone1name, ZFS_MAX_DATASET_NAME_LEN, "%s/c1_%"PRIu64"", 4454 osname, id); 4455 (void) snprintf(snap2name, ZFS_MAX_DATASET_NAME_LEN, "%s@s2_%"PRIu64"", 4456 clone1name, id); 4457 (void) snprintf(clone2name, ZFS_MAX_DATASET_NAME_LEN, "%s/c2_%"PRIu64"", 4458 osname, id); 4459 (void) snprintf(snap3name, ZFS_MAX_DATASET_NAME_LEN, "%s@s3_%"PRIu64"", 4460 clone1name, id); 4461 4462 error = dsl_destroy_head(clone2name); 4463 if (error && error != ENOENT) 4464 fatal(B_FALSE, "dsl_destroy_head(%s) = %d", clone2name, error); 4465 error = dsl_destroy_snapshot(snap3name, B_FALSE); 4466 if (error && error != ENOENT) 4467 fatal(B_FALSE, "dsl_destroy_snapshot(%s) = %d", 4468 snap3name, error); 4469 error = dsl_destroy_snapshot(snap2name, B_FALSE); 4470 if (error && error != ENOENT) 4471 fatal(B_FALSE, "dsl_destroy_snapshot(%s) = %d", 4472 snap2name, error); 4473 error = dsl_destroy_head(clone1name); 4474 if (error && error != ENOENT) 4475 fatal(B_FALSE, "dsl_destroy_head(%s) = %d", clone1name, error); 4476 error = dsl_destroy_snapshot(snap1name, B_FALSE); 4477 if (error && error != ENOENT) 4478 fatal(B_FALSE, "dsl_destroy_snapshot(%s) = %d", 4479 snap1name, error); 4480 4481 umem_free(snap1name, ZFS_MAX_DATASET_NAME_LEN); 4482 umem_free(clone1name, ZFS_MAX_DATASET_NAME_LEN); 4483 umem_free(snap2name, ZFS_MAX_DATASET_NAME_LEN); 4484 umem_free(clone2name, ZFS_MAX_DATASET_NAME_LEN); 4485 umem_free(snap3name, ZFS_MAX_DATASET_NAME_LEN); 4486 } 4487 4488 
/* 4489 * Verify dsl_dataset_promote handles EBUSY 4490 */ 4491 void 4492 ztest_dsl_dataset_promote_busy(ztest_ds_t *zd, uint64_t id) 4493 { 4494 objset_t *os; 4495 char *snap1name; 4496 char *clone1name; 4497 char *snap2name; 4498 char *clone2name; 4499 char *snap3name; 4500 char *osname = zd->zd_name; 4501 int error; 4502 4503 snap1name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4504 clone1name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4505 snap2name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4506 clone2name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4507 snap3name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4508 4509 (void) pthread_rwlock_rdlock(&ztest_name_lock); 4510 4511 ztest_dsl_dataset_cleanup(osname, id); 4512 4513 (void) snprintf(snap1name, ZFS_MAX_DATASET_NAME_LEN, "%s@s1_%"PRIu64"", 4514 osname, id); 4515 (void) snprintf(clone1name, ZFS_MAX_DATASET_NAME_LEN, "%s/c1_%"PRIu64"", 4516 osname, id); 4517 (void) snprintf(snap2name, ZFS_MAX_DATASET_NAME_LEN, "%s@s2_%"PRIu64"", 4518 clone1name, id); 4519 (void) snprintf(clone2name, ZFS_MAX_DATASET_NAME_LEN, "%s/c2_%"PRIu64"", 4520 osname, id); 4521 (void) snprintf(snap3name, ZFS_MAX_DATASET_NAME_LEN, "%s@s3_%"PRIu64"", 4522 clone1name, id); 4523 4524 error = dmu_objset_snapshot_one(osname, strchr(snap1name, '@') + 1); 4525 if (error && error != EEXIST) { 4526 if (error == ENOSPC) { 4527 ztest_record_enospc(FTAG); 4528 goto out; 4529 } 4530 fatal(B_FALSE, "dmu_take_snapshot(%s) = %d", snap1name, error); 4531 } 4532 4533 error = dmu_objset_clone(clone1name, snap1name); 4534 if (error) { 4535 if (error == ENOSPC) { 4536 ztest_record_enospc(FTAG); 4537 goto out; 4538 } 4539 fatal(B_FALSE, "dmu_objset_create(%s) = %d", clone1name, error); 4540 } 4541 4542 error = dmu_objset_snapshot_one(clone1name, strchr(snap2name, '@') + 1); 4543 if (error && error != EEXIST) { 4544 if (error == ENOSPC) { 4545 ztest_record_enospc(FTAG); 4546 goto out; 4547 } 4548 fatal(B_FALSE, "dmu_open_snapshot(%s) = %d", snap2name, error); 4549 } 4550 4551 error = dmu_objset_snapshot_one(clone1name, strchr(snap3name, '@') + 1); 4552 if (error && error != EEXIST) { 4553 if (error == ENOSPC) { 4554 ztest_record_enospc(FTAG); 4555 goto out; 4556 } 4557 fatal(B_FALSE, "dmu_open_snapshot(%s) = %d", snap3name, error); 4558 } 4559 4560 error = dmu_objset_clone(clone2name, snap3name); 4561 if (error) { 4562 if (error == ENOSPC) { 4563 ztest_record_enospc(FTAG); 4564 goto out; 4565 } 4566 fatal(B_FALSE, "dmu_objset_create(%s) = %d", clone2name, error); 4567 } 4568 4569 error = ztest_dmu_objset_own(snap2name, DMU_OST_ANY, B_TRUE, B_TRUE, 4570 FTAG, &os); 4571 if (error) 4572 fatal(B_FALSE, "dmu_objset_own(%s) = %d", snap2name, error); 4573 error = dsl_dataset_promote(clone2name, NULL); 4574 if (error == ENOSPC) { 4575 dmu_objset_disown(os, B_TRUE, FTAG); 4576 ztest_record_enospc(FTAG); 4577 goto out; 4578 } 4579 if (error != EBUSY) 4580 fatal(B_FALSE, "dsl_dataset_promote(%s), %d, not EBUSY", 4581 clone2name, error); 4582 dmu_objset_disown(os, B_TRUE, FTAG); 4583 4584 out: 4585 ztest_dsl_dataset_cleanup(osname, id); 4586 4587 (void) pthread_rwlock_unlock(&ztest_name_lock); 4588 4589 umem_free(snap1name, ZFS_MAX_DATASET_NAME_LEN); 4590 umem_free(clone1name, ZFS_MAX_DATASET_NAME_LEN); 4591 umem_free(snap2name, ZFS_MAX_DATASET_NAME_LEN); 4592 umem_free(clone2name, ZFS_MAX_DATASET_NAME_LEN); 4593 umem_free(snap3name, ZFS_MAX_DATASET_NAME_LEN); 4594 } 4595 4596 #undef OD_ARRAY_SIZE 4597 #define OD_ARRAY_SIZE 4 4598 4599 /* 4600 * 
Verify that dmu_object_{alloc,free} work as expected. 4601 */ 4602 void 4603 ztest_dmu_object_alloc_free(ztest_ds_t *zd, uint64_t id) 4604 { 4605 ztest_od_t *od; 4606 int batchsize; 4607 int size; 4608 int b; 4609 4610 size = sizeof (ztest_od_t) * OD_ARRAY_SIZE; 4611 od = umem_alloc(size, UMEM_NOFAIL); 4612 batchsize = OD_ARRAY_SIZE; 4613 4614 for (b = 0; b < batchsize; b++) 4615 ztest_od_init(od + b, id, FTAG, b, DMU_OT_UINT64_OTHER, 4616 0, 0, 0); 4617 4618 /* 4619 * Destroy the previous batch of objects, create a new batch, 4620 * and do some I/O on the new objects. 4621 */ 4622 if (ztest_object_init(zd, od, size, B_TRUE) != 0) 4623 return; 4624 4625 while (ztest_random(4 * batchsize) != 0) 4626 ztest_io(zd, od[ztest_random(batchsize)].od_object, 4627 ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT); 4628 4629 umem_free(od, size); 4630 } 4631 4632 /* 4633 * Rewind the global allocator to verify object allocation backfilling. 4634 */ 4635 void 4636 ztest_dmu_object_next_chunk(ztest_ds_t *zd, uint64_t id) 4637 { 4638 (void) id; 4639 objset_t *os = zd->zd_os; 4640 int dnodes_per_chunk = 1 << dmu_object_alloc_chunk_shift; 4641 uint64_t object; 4642 4643 /* 4644 * Rewind the global allocator randomly back to a lower object number 4645 * to force backfilling and reclamation of recently freed dnodes. 4646 */ 4647 mutex_enter(&os->os_obj_lock); 4648 object = ztest_random(os->os_obj_next_chunk); 4649 os->os_obj_next_chunk = P2ALIGN(object, dnodes_per_chunk); 4650 mutex_exit(&os->os_obj_lock); 4651 } 4652 4653 #undef OD_ARRAY_SIZE 4654 #define OD_ARRAY_SIZE 2 4655 4656 /* 4657 * Verify that dmu_{read,write} work as expected. 4658 */ 4659 void 4660 ztest_dmu_read_write(ztest_ds_t *zd, uint64_t id) 4661 { 4662 int size; 4663 ztest_od_t *od; 4664 4665 objset_t *os = zd->zd_os; 4666 size = sizeof (ztest_od_t) * OD_ARRAY_SIZE; 4667 od = umem_alloc(size, UMEM_NOFAIL); 4668 dmu_tx_t *tx; 4669 int freeit, error; 4670 uint64_t i, n, s, txg; 4671 bufwad_t *packbuf, *bigbuf, *pack, *bigH, *bigT; 4672 uint64_t packobj, packoff, packsize, bigobj, bigoff, bigsize; 4673 uint64_t chunksize = (1000 + ztest_random(1000)) * sizeof (uint64_t); 4674 uint64_t regions = 997; 4675 uint64_t stride = 123456789ULL; 4676 uint64_t width = 40; 4677 int free_percent = 5; 4678 4679 /* 4680 * This test uses two objects, packobj and bigobj, that are always 4681 * updated together (i.e. in the same tx) so that their contents are 4682 * in sync and can be compared. Their contents relate to each other 4683 * in a simple way: packobj is a dense array of 'bufwad' structures, 4684 * while bigobj is a sparse array of the same bufwads. Specifically, 4685 * for any index n, there are three bufwads that should be identical: 4686 * 4687 * packobj, at offset n * sizeof (bufwad_t) 4688 * bigobj, at the head of the nth chunk 4689 * bigobj, at the tail of the nth chunk 4690 * 4691 * The chunk size is arbitrary. It doesn't have to be a power of two, 4692 * and it doesn't have any relation to the object blocksize. 4693 * The only requirement is that it can hold at least two bufwads. 4694 * 4695 * Normally, we write the bufwad to each of these locations. 4696 * However, free_percent of the time we instead write zeroes to 4697 * packobj and perform a dmu_free_range() on bigobj. By comparing 4698 * bigobj to packobj, we can verify that the DMU is correctly 4699 * tracking which parts of an object are allocated and free, 4700 * and that the contents of the allocated blocks are correct. 4701 */ 4702 4703 /* 4704 * Read the directory info. 
If it's the first time, set things up. 4705 */ 4706 ztest_od_init(od, id, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0, chunksize); 4707 ztest_od_init(od + 1, id, FTAG, 1, DMU_OT_UINT64_OTHER, 0, 0, 4708 chunksize); 4709 4710 if (ztest_object_init(zd, od, size, B_FALSE) != 0) { 4711 umem_free(od, size); 4712 return; 4713 } 4714 4715 bigobj = od[0].od_object; 4716 packobj = od[1].od_object; 4717 chunksize = od[0].od_gen; 4718 ASSERT3U(chunksize, ==, od[1].od_gen); 4719 4720 /* 4721 * Prefetch a random chunk of the big object. 4722 * Our aim here is to get some async reads in flight 4723 * for blocks that we may free below; the DMU should 4724 * handle this race correctly. 4725 */ 4726 n = ztest_random(regions) * stride + ztest_random(width); 4727 s = 1 + ztest_random(2 * width - 1); 4728 dmu_prefetch(os, bigobj, 0, n * chunksize, s * chunksize, 4729 ZIO_PRIORITY_SYNC_READ); 4730 4731 /* 4732 * Pick a random index and compute the offsets into packobj and bigobj. 4733 */ 4734 n = ztest_random(regions) * stride + ztest_random(width); 4735 s = 1 + ztest_random(width - 1); 4736 4737 packoff = n * sizeof (bufwad_t); 4738 packsize = s * sizeof (bufwad_t); 4739 4740 bigoff = n * chunksize; 4741 bigsize = s * chunksize; 4742 4743 packbuf = umem_alloc(packsize, UMEM_NOFAIL); 4744 bigbuf = umem_alloc(bigsize, UMEM_NOFAIL); 4745 4746 /* 4747 * free_percent of the time, free a range of bigobj rather than 4748 * overwriting it. 4749 */ 4750 freeit = (ztest_random(100) < free_percent); 4751 4752 /* 4753 * Read the current contents of our objects. 4754 */ 4755 error = dmu_read(os, packobj, packoff, packsize, packbuf, 4756 DMU_READ_PREFETCH); 4757 ASSERT0(error); 4758 error = dmu_read(os, bigobj, bigoff, bigsize, bigbuf, 4759 DMU_READ_PREFETCH); 4760 ASSERT0(error); 4761 4762 /* 4763 * Get a tx for the mods to both packobj and bigobj. 4764 */ 4765 tx = dmu_tx_create(os); 4766 4767 dmu_tx_hold_write(tx, packobj, packoff, packsize); 4768 4769 if (freeit) 4770 dmu_tx_hold_free(tx, bigobj, bigoff, bigsize); 4771 else 4772 dmu_tx_hold_write(tx, bigobj, bigoff, bigsize); 4773 4774 /* This accounts for setting the checksum/compression. */ 4775 dmu_tx_hold_bonus(tx, bigobj); 4776 4777 txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); 4778 if (txg == 0) { 4779 umem_free(packbuf, packsize); 4780 umem_free(bigbuf, bigsize); 4781 umem_free(od, size); 4782 return; 4783 } 4784 4785 enum zio_checksum cksum; 4786 do { 4787 cksum = (enum zio_checksum) 4788 ztest_random_dsl_prop(ZFS_PROP_CHECKSUM); 4789 } while (cksum >= ZIO_CHECKSUM_LEGACY_FUNCTIONS); 4790 dmu_object_set_checksum(os, bigobj, cksum, tx); 4791 4792 enum zio_compress comp; 4793 do { 4794 comp = (enum zio_compress) 4795 ztest_random_dsl_prop(ZFS_PROP_COMPRESSION); 4796 } while (comp >= ZIO_COMPRESS_LEGACY_FUNCTIONS); 4797 dmu_object_set_compress(os, bigobj, comp, tx); 4798 4799 /* 4800 * For each index from n to n + s, verify that the existing bufwad 4801 * in packobj matches the bufwads at the head and tail of the 4802 * corresponding chunk in bigobj. Then update all three bufwads 4803 * with the new values we want to write out. 
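 *
 * Concretely, for a given index k = n + i the three copies that must
 * agree live at:
 *
 *	packobj, byte offset k * sizeof (bufwad_t)
 *	bigobj,  byte offset k * chunksize                            (chunk head)
 *	bigobj,  byte offset (k + 1) * chunksize - sizeof (bufwad_t)  (chunk tail)
 *
 * bigH and bigT below are those same head and tail copies as seen
 * through bigbuf, which was read from bigobj starting at
 * bigoff = n * chunksize.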
4804 */ 4805 for (i = 0; i < s; i++) { 4806 /* LINTED */ 4807 pack = (bufwad_t *)((char *)packbuf + i * sizeof (bufwad_t)); 4808 /* LINTED */ 4809 bigH = (bufwad_t *)((char *)bigbuf + i * chunksize); 4810 /* LINTED */ 4811 bigT = (bufwad_t *)((char *)bigH + chunksize) - 1; 4812 4813 ASSERT3U((uintptr_t)bigH - (uintptr_t)bigbuf, <, bigsize); 4814 ASSERT3U((uintptr_t)bigT - (uintptr_t)bigbuf, <, bigsize); 4815 4816 if (pack->bw_txg > txg) 4817 fatal(B_FALSE, 4818 "future leak: got %"PRIx64", open txg is %"PRIx64"", 4819 pack->bw_txg, txg); 4820 4821 if (pack->bw_data != 0 && pack->bw_index != n + i) 4822 fatal(B_FALSE, "wrong index: " 4823 "got %"PRIx64", wanted %"PRIx64"+%"PRIx64"", 4824 pack->bw_index, n, i); 4825 4826 if (memcmp(pack, bigH, sizeof (bufwad_t)) != 0) 4827 fatal(B_FALSE, "pack/bigH mismatch in %p/%p", 4828 pack, bigH); 4829 4830 if (memcmp(pack, bigT, sizeof (bufwad_t)) != 0) 4831 fatal(B_FALSE, "pack/bigT mismatch in %p/%p", 4832 pack, bigT); 4833 4834 if (freeit) { 4835 memset(pack, 0, sizeof (bufwad_t)); 4836 } else { 4837 pack->bw_index = n + i; 4838 pack->bw_txg = txg; 4839 pack->bw_data = 1 + ztest_random(-2ULL); 4840 } 4841 *bigH = *pack; 4842 *bigT = *pack; 4843 } 4844 4845 /* 4846 * We've verified all the old bufwads, and made new ones. 4847 * Now write them out. 4848 */ 4849 dmu_write(os, packobj, packoff, packsize, packbuf, tx); 4850 4851 if (freeit) { 4852 if (ztest_opts.zo_verbose >= 7) { 4853 (void) printf("freeing offset %"PRIx64" size %"PRIx64"" 4854 " txg %"PRIx64"\n", 4855 bigoff, bigsize, txg); 4856 } 4857 VERIFY0(dmu_free_range(os, bigobj, bigoff, bigsize, tx)); 4858 } else { 4859 if (ztest_opts.zo_verbose >= 7) { 4860 (void) printf("writing offset %"PRIx64" size %"PRIx64"" 4861 " txg %"PRIx64"\n", 4862 bigoff, bigsize, txg); 4863 } 4864 dmu_write(os, bigobj, bigoff, bigsize, bigbuf, tx); 4865 } 4866 4867 dmu_tx_commit(tx); 4868 4869 /* 4870 * Sanity check the stuff we just wrote. 4871 */ 4872 { 4873 void *packcheck = umem_alloc(packsize, UMEM_NOFAIL); 4874 void *bigcheck = umem_alloc(bigsize, UMEM_NOFAIL); 4875 4876 VERIFY0(dmu_read(os, packobj, packoff, 4877 packsize, packcheck, DMU_READ_PREFETCH)); 4878 VERIFY0(dmu_read(os, bigobj, bigoff, 4879 bigsize, bigcheck, DMU_READ_PREFETCH)); 4880 4881 ASSERT0(memcmp(packbuf, packcheck, packsize)); 4882 ASSERT0(memcmp(bigbuf, bigcheck, bigsize)); 4883 4884 umem_free(packcheck, packsize); 4885 umem_free(bigcheck, bigsize); 4886 } 4887 4888 umem_free(packbuf, packsize); 4889 umem_free(bigbuf, bigsize); 4890 umem_free(od, size); 4891 } 4892 4893 static void 4894 compare_and_update_pbbufs(uint64_t s, bufwad_t *packbuf, bufwad_t *bigbuf, 4895 uint64_t bigsize, uint64_t n, uint64_t chunksize, uint64_t txg) 4896 { 4897 uint64_t i; 4898 bufwad_t *pack; 4899 bufwad_t *bigH; 4900 bufwad_t *bigT; 4901 4902 /* 4903 * For each index from n to n + s, verify that the existing bufwad 4904 * in packobj matches the bufwads at the head and tail of the 4905 * corresponding chunk in bigobj. Then update all three bufwads 4906 * with the new values we want to write out. 
4907 */ 4908 for (i = 0; i < s; i++) { 4909 /* LINTED */ 4910 pack = (bufwad_t *)((char *)packbuf + i * sizeof (bufwad_t)); 4911 /* LINTED */ 4912 bigH = (bufwad_t *)((char *)bigbuf + i * chunksize); 4913 /* LINTED */ 4914 bigT = (bufwad_t *)((char *)bigH + chunksize) - 1; 4915 4916 ASSERT3U((uintptr_t)bigH - (uintptr_t)bigbuf, <, bigsize); 4917 ASSERT3U((uintptr_t)bigT - (uintptr_t)bigbuf, <, bigsize); 4918 4919 if (pack->bw_txg > txg) 4920 fatal(B_FALSE, 4921 "future leak: got %"PRIx64", open txg is %"PRIx64"", 4922 pack->bw_txg, txg); 4923 4924 if (pack->bw_data != 0 && pack->bw_index != n + i) 4925 fatal(B_FALSE, "wrong index: " 4926 "got %"PRIx64", wanted %"PRIx64"+%"PRIx64"", 4927 pack->bw_index, n, i); 4928 4929 if (memcmp(pack, bigH, sizeof (bufwad_t)) != 0) 4930 fatal(B_FALSE, "pack/bigH mismatch in %p/%p", 4931 pack, bigH); 4932 4933 if (memcmp(pack, bigT, sizeof (bufwad_t)) != 0) 4934 fatal(B_FALSE, "pack/bigT mismatch in %p/%p", 4935 pack, bigT); 4936 4937 pack->bw_index = n + i; 4938 pack->bw_txg = txg; 4939 pack->bw_data = 1 + ztest_random(-2ULL); 4940 4941 *bigH = *pack; 4942 *bigT = *pack; 4943 } 4944 } 4945 4946 #undef OD_ARRAY_SIZE 4947 #define OD_ARRAY_SIZE 2 4948 4949 void 4950 ztest_dmu_read_write_zcopy(ztest_ds_t *zd, uint64_t id) 4951 { 4952 objset_t *os = zd->zd_os; 4953 ztest_od_t *od; 4954 dmu_tx_t *tx; 4955 uint64_t i; 4956 int error; 4957 int size; 4958 uint64_t n, s, txg; 4959 bufwad_t *packbuf, *bigbuf; 4960 uint64_t packobj, packoff, packsize, bigobj, bigoff, bigsize; 4961 uint64_t blocksize = ztest_random_blocksize(); 4962 uint64_t chunksize = blocksize; 4963 uint64_t regions = 997; 4964 uint64_t stride = 123456789ULL; 4965 uint64_t width = 9; 4966 dmu_buf_t *bonus_db; 4967 arc_buf_t **bigbuf_arcbufs; 4968 dmu_object_info_t doi; 4969 4970 size = sizeof (ztest_od_t) * OD_ARRAY_SIZE; 4971 od = umem_alloc(size, UMEM_NOFAIL); 4972 4973 /* 4974 * This test uses two objects, packobj and bigobj, that are always 4975 * updated together (i.e. in the same tx) so that their contents are 4976 * in sync and can be compared. Their contents relate to each other 4977 * in a simple way: packobj is a dense array of 'bufwad' structures, 4978 * while bigobj is a sparse array of the same bufwads. Specifically, 4979 * for any index n, there are three bufwads that should be identical: 4980 * 4981 * packobj, at offset n * sizeof (bufwad_t) 4982 * bigobj, at the head of the nth chunk 4983 * bigobj, at the tail of the nth chunk 4984 * 4985 * The chunk size is set equal to bigobj block size so that 4986 * dmu_assign_arcbuf_by_dbuf() can be tested for object updates. 4987 */ 4988 4989 /* 4990 * Read the directory info. If it's the first time, set things up. 4991 */ 4992 ztest_od_init(od, id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, 0, 0); 4993 ztest_od_init(od + 1, id, FTAG, 1, DMU_OT_UINT64_OTHER, 0, 0, 4994 chunksize); 4995 4996 4997 if (ztest_object_init(zd, od, size, B_FALSE) != 0) { 4998 umem_free(od, size); 4999 return; 5000 } 5001 5002 bigobj = od[0].od_object; 5003 packobj = od[1].od_object; 5004 blocksize = od[0].od_blocksize; 5005 chunksize = blocksize; 5006 ASSERT3U(chunksize, ==, od[1].od_gen); 5007 5008 VERIFY0(dmu_object_info(os, bigobj, &doi)); 5009 VERIFY(ISP2(doi.doi_data_block_size)); 5010 VERIFY3U(chunksize, ==, doi.doi_data_block_size); 5011 VERIFY3U(chunksize, >=, 2 * sizeof (bufwad_t)); 5012 5013 /* 5014 * Pick a random index and compute the offsets into packobj and bigobj. 
5015 */ 5016 n = ztest_random(regions) * stride + ztest_random(width); 5017 s = 1 + ztest_random(width - 1); 5018 5019 packoff = n * sizeof (bufwad_t); 5020 packsize = s * sizeof (bufwad_t); 5021 5022 bigoff = n * chunksize; 5023 bigsize = s * chunksize; 5024 5025 packbuf = umem_zalloc(packsize, UMEM_NOFAIL); 5026 bigbuf = umem_zalloc(bigsize, UMEM_NOFAIL); 5027 5028 VERIFY0(dmu_bonus_hold(os, bigobj, FTAG, &bonus_db)); 5029 5030 bigbuf_arcbufs = umem_zalloc(2 * s * sizeof (arc_buf_t *), UMEM_NOFAIL); 5031 5032 /* 5033 * Iteration 0 test zcopy for DB_UNCACHED dbufs. 5034 * Iteration 1 test zcopy to already referenced dbufs. 5035 * Iteration 2 test zcopy to dirty dbuf in the same txg. 5036 * Iteration 3 test zcopy to dbuf dirty in previous txg. 5037 * Iteration 4 test zcopy when dbuf is no longer dirty. 5038 * Iteration 5 test zcopy when it can't be done. 5039 * Iteration 6 one more zcopy write. 5040 */ 5041 for (i = 0; i < 7; i++) { 5042 uint64_t j; 5043 uint64_t off; 5044 5045 /* 5046 * In iteration 5 (i == 5) use arcbufs 5047 * that don't match bigobj blksz to test 5048 * dmu_assign_arcbuf_by_dbuf() when it can't directly 5049 * assign an arcbuf to a dbuf. 5050 */ 5051 for (j = 0; j < s; j++) { 5052 if (i != 5 || chunksize < (SPA_MINBLOCKSIZE * 2)) { 5053 bigbuf_arcbufs[j] = 5054 dmu_request_arcbuf(bonus_db, chunksize); 5055 } else { 5056 bigbuf_arcbufs[2 * j] = 5057 dmu_request_arcbuf(bonus_db, chunksize / 2); 5058 bigbuf_arcbufs[2 * j + 1] = 5059 dmu_request_arcbuf(bonus_db, chunksize / 2); 5060 } 5061 } 5062 5063 /* 5064 * Get a tx for the mods to both packobj and bigobj. 5065 */ 5066 tx = dmu_tx_create(os); 5067 5068 dmu_tx_hold_write(tx, packobj, packoff, packsize); 5069 dmu_tx_hold_write(tx, bigobj, bigoff, bigsize); 5070 5071 txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); 5072 if (txg == 0) { 5073 umem_free(packbuf, packsize); 5074 umem_free(bigbuf, bigsize); 5075 for (j = 0; j < s; j++) { 5076 if (i != 5 || 5077 chunksize < (SPA_MINBLOCKSIZE * 2)) { 5078 dmu_return_arcbuf(bigbuf_arcbufs[j]); 5079 } else { 5080 dmu_return_arcbuf( 5081 bigbuf_arcbufs[2 * j]); 5082 dmu_return_arcbuf( 5083 bigbuf_arcbufs[2 * j + 1]); 5084 } 5085 } 5086 umem_free(bigbuf_arcbufs, 2 * s * sizeof (arc_buf_t *)); 5087 umem_free(od, size); 5088 dmu_buf_rele(bonus_db, FTAG); 5089 return; 5090 } 5091 5092 /* 5093 * 50% of the time don't read objects in the 1st iteration to 5094 * test dmu_assign_arcbuf_by_dbuf() for the case when there are 5095 * no existing dbufs for the specified offsets. 5096 */ 5097 if (i != 0 || ztest_random(2) != 0) { 5098 error = dmu_read(os, packobj, packoff, 5099 packsize, packbuf, DMU_READ_PREFETCH); 5100 ASSERT0(error); 5101 error = dmu_read(os, bigobj, bigoff, bigsize, 5102 bigbuf, DMU_READ_PREFETCH); 5103 ASSERT0(error); 5104 } 5105 compare_and_update_pbbufs(s, packbuf, bigbuf, bigsize, 5106 n, chunksize, txg); 5107 5108 /* 5109 * We've verified all the old bufwads, and made new ones. 5110 * Now write them out. 
5111 */ 5112 dmu_write(os, packobj, packoff, packsize, packbuf, tx); 5113 if (ztest_opts.zo_verbose >= 7) { 5114 (void) printf("writing offset %"PRIx64" size %"PRIx64"" 5115 " txg %"PRIx64"\n", 5116 bigoff, bigsize, txg); 5117 } 5118 for (off = bigoff, j = 0; j < s; j++, off += chunksize) { 5119 dmu_buf_t *dbt; 5120 if (i != 5 || chunksize < (SPA_MINBLOCKSIZE * 2)) { 5121 memcpy(bigbuf_arcbufs[j]->b_data, 5122 (caddr_t)bigbuf + (off - bigoff), 5123 chunksize); 5124 } else { 5125 memcpy(bigbuf_arcbufs[2 * j]->b_data, 5126 (caddr_t)bigbuf + (off - bigoff), 5127 chunksize / 2); 5128 memcpy(bigbuf_arcbufs[2 * j + 1]->b_data, 5129 (caddr_t)bigbuf + (off - bigoff) + 5130 chunksize / 2, 5131 chunksize / 2); 5132 } 5133 5134 if (i == 1) { 5135 VERIFY(dmu_buf_hold(os, bigobj, off, 5136 FTAG, &dbt, DMU_READ_NO_PREFETCH) == 0); 5137 } 5138 if (i != 5 || chunksize < (SPA_MINBLOCKSIZE * 2)) { 5139 VERIFY0(dmu_assign_arcbuf_by_dbuf(bonus_db, 5140 off, bigbuf_arcbufs[j], tx)); 5141 } else { 5142 VERIFY0(dmu_assign_arcbuf_by_dbuf(bonus_db, 5143 off, bigbuf_arcbufs[2 * j], tx)); 5144 VERIFY0(dmu_assign_arcbuf_by_dbuf(bonus_db, 5145 off + chunksize / 2, 5146 bigbuf_arcbufs[2 * j + 1], tx)); 5147 } 5148 if (i == 1) { 5149 dmu_buf_rele(dbt, FTAG); 5150 } 5151 } 5152 dmu_tx_commit(tx); 5153 5154 /* 5155 * Sanity check the stuff we just wrote. 5156 */ 5157 { 5158 void *packcheck = umem_alloc(packsize, UMEM_NOFAIL); 5159 void *bigcheck = umem_alloc(bigsize, UMEM_NOFAIL); 5160 5161 VERIFY0(dmu_read(os, packobj, packoff, 5162 packsize, packcheck, DMU_READ_PREFETCH)); 5163 VERIFY0(dmu_read(os, bigobj, bigoff, 5164 bigsize, bigcheck, DMU_READ_PREFETCH)); 5165 5166 ASSERT0(memcmp(packbuf, packcheck, packsize)); 5167 ASSERT0(memcmp(bigbuf, bigcheck, bigsize)); 5168 5169 umem_free(packcheck, packsize); 5170 umem_free(bigcheck, bigsize); 5171 } 5172 if (i == 2) { 5173 txg_wait_open(dmu_objset_pool(os), 0, B_TRUE); 5174 } else if (i == 3) { 5175 txg_wait_synced(dmu_objset_pool(os), 0); 5176 } 5177 } 5178 5179 dmu_buf_rele(bonus_db, FTAG); 5180 umem_free(packbuf, packsize); 5181 umem_free(bigbuf, bigsize); 5182 umem_free(bigbuf_arcbufs, 2 * s * sizeof (arc_buf_t *)); 5183 umem_free(od, size); 5184 } 5185 5186 void 5187 ztest_dmu_write_parallel(ztest_ds_t *zd, uint64_t id) 5188 { 5189 (void) id; 5190 ztest_od_t *od; 5191 5192 od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL); 5193 uint64_t offset = (1ULL << (ztest_random(20) + 43)) + 5194 (ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT); 5195 5196 /* 5197 * Have multiple threads write to large offsets in an object 5198 * to verify that parallel writes to an object -- even to the 5199 * same blocks within the object -- doesn't cause any trouble. 
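 *
 * The offset computed above starts at one of twenty power-of-two
 * points between 2^43 (8 TiB) and 2^62, plus a fuzz of up to
 * ZTEST_RANGE_LOCKS maximum-sized blocks, so the object stays sparse
 * while concurrent writers still have a reasonable chance of landing
 * on the same block.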
5200 */ 5201 ztest_od_init(od, ID_PARALLEL, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0, 0); 5202 5203 if (ztest_object_init(zd, od, sizeof (ztest_od_t), B_FALSE) != 0) 5204 return; 5205 5206 while (ztest_random(10) != 0) 5207 ztest_io(zd, od->od_object, offset); 5208 5209 umem_free(od, sizeof (ztest_od_t)); 5210 } 5211 5212 void 5213 ztest_dmu_prealloc(ztest_ds_t *zd, uint64_t id) 5214 { 5215 ztest_od_t *od; 5216 uint64_t offset = (1ULL << (ztest_random(4) + SPA_MAXBLOCKSHIFT)) + 5217 (ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT); 5218 uint64_t count = ztest_random(20) + 1; 5219 uint64_t blocksize = ztest_random_blocksize(); 5220 void *data; 5221 5222 od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL); 5223 5224 ztest_od_init(od, id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, 0, 0); 5225 5226 if (ztest_object_init(zd, od, sizeof (ztest_od_t), 5227 !ztest_random(2)) != 0) { 5228 umem_free(od, sizeof (ztest_od_t)); 5229 return; 5230 } 5231 5232 if (ztest_truncate(zd, od->od_object, offset, count * blocksize) != 0) { 5233 umem_free(od, sizeof (ztest_od_t)); 5234 return; 5235 } 5236 5237 ztest_prealloc(zd, od->od_object, offset, count * blocksize); 5238 5239 data = umem_zalloc(blocksize, UMEM_NOFAIL); 5240 5241 while (ztest_random(count) != 0) { 5242 uint64_t randoff = offset + (ztest_random(count) * blocksize); 5243 if (ztest_write(zd, od->od_object, randoff, blocksize, 5244 data) != 0) 5245 break; 5246 while (ztest_random(4) != 0) 5247 ztest_io(zd, od->od_object, randoff); 5248 } 5249 5250 umem_free(data, blocksize); 5251 umem_free(od, sizeof (ztest_od_t)); 5252 } 5253 5254 /* 5255 * Verify that zap_{create,destroy,add,remove,update} work as expected. 5256 */ 5257 #define ZTEST_ZAP_MIN_INTS 1 5258 #define ZTEST_ZAP_MAX_INTS 4 5259 #define ZTEST_ZAP_MAX_PROPS 1000 5260 5261 void 5262 ztest_zap(ztest_ds_t *zd, uint64_t id) 5263 { 5264 objset_t *os = zd->zd_os; 5265 ztest_od_t *od; 5266 uint64_t object; 5267 uint64_t txg, last_txg; 5268 uint64_t value[ZTEST_ZAP_MAX_INTS]; 5269 uint64_t zl_ints, zl_intsize, prop; 5270 int i, ints; 5271 dmu_tx_t *tx; 5272 char propname[100], txgname[100]; 5273 int error; 5274 char *hc[2] = { "s.acl.h", ".s.open.h.hyLZlg" }; 5275 5276 od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL); 5277 ztest_od_init(od, id, FTAG, 0, DMU_OT_ZAP_OTHER, 0, 0, 0); 5278 5279 if (ztest_object_init(zd, od, sizeof (ztest_od_t), 5280 !ztest_random(2)) != 0) 5281 goto out; 5282 5283 object = od->od_object; 5284 5285 /* 5286 * Generate a known hash collision, and verify that 5287 * we can lookup and remove both entries. 5288 */ 5289 tx = dmu_tx_create(os); 5290 dmu_tx_hold_zap(tx, object, B_TRUE, NULL); 5291 txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); 5292 if (txg == 0) 5293 goto out; 5294 for (i = 0; i < 2; i++) { 5295 value[i] = i; 5296 VERIFY0(zap_add(os, object, hc[i], sizeof (uint64_t), 5297 1, &value[i], tx)); 5298 } 5299 for (i = 0; i < 2; i++) { 5300 VERIFY3U(EEXIST, ==, zap_add(os, object, hc[i], 5301 sizeof (uint64_t), 1, &value[i], tx)); 5302 VERIFY0( 5303 zap_length(os, object, hc[i], &zl_intsize, &zl_ints)); 5304 ASSERT3U(zl_intsize, ==, sizeof (uint64_t)); 5305 ASSERT3U(zl_ints, ==, 1); 5306 } 5307 for (i = 0; i < 2; i++) { 5308 VERIFY0(zap_remove(os, object, hc[i], tx)); 5309 } 5310 dmu_tx_commit(tx); 5311 5312 /* 5313 * Generate a bunch of random entries. 
5314 */ 5315 ints = MAX(ZTEST_ZAP_MIN_INTS, object % ZTEST_ZAP_MAX_INTS); 5316 5317 prop = ztest_random(ZTEST_ZAP_MAX_PROPS); 5318 (void) sprintf(propname, "prop_%"PRIu64"", prop); 5319 (void) sprintf(txgname, "txg_%"PRIu64"", prop); 5320 memset(value, 0, sizeof (value)); 5321 last_txg = 0; 5322 5323 /* 5324 * If these zap entries already exist, validate their contents. 5325 */ 5326 error = zap_length(os, object, txgname, &zl_intsize, &zl_ints); 5327 if (error == 0) { 5328 ASSERT3U(zl_intsize, ==, sizeof (uint64_t)); 5329 ASSERT3U(zl_ints, ==, 1); 5330 5331 VERIFY0(zap_lookup(os, object, txgname, zl_intsize, 5332 zl_ints, &last_txg)); 5333 5334 VERIFY0(zap_length(os, object, propname, &zl_intsize, 5335 &zl_ints)); 5336 5337 ASSERT3U(zl_intsize, ==, sizeof (uint64_t)); 5338 ASSERT3U(zl_ints, ==, ints); 5339 5340 VERIFY0(zap_lookup(os, object, propname, zl_intsize, 5341 zl_ints, value)); 5342 5343 for (i = 0; i < ints; i++) { 5344 ASSERT3U(value[i], ==, last_txg + object + i); 5345 } 5346 } else { 5347 ASSERT3U(error, ==, ENOENT); 5348 } 5349 5350 /* 5351 * Atomically update two entries in our zap object. 5352 * The first is named txg_%llu, and contains the txg 5353 * in which the property was last updated. The second 5354 * is named prop_%llu, and the nth element of its value 5355 * should be txg + object + n. 5356 */ 5357 tx = dmu_tx_create(os); 5358 dmu_tx_hold_zap(tx, object, B_TRUE, NULL); 5359 txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); 5360 if (txg == 0) 5361 goto out; 5362 5363 if (last_txg > txg) 5364 fatal(B_FALSE, "zap future leak: old %"PRIu64" new %"PRIu64"", 5365 last_txg, txg); 5366 5367 for (i = 0; i < ints; i++) 5368 value[i] = txg + object + i; 5369 5370 VERIFY0(zap_update(os, object, txgname, sizeof (uint64_t), 5371 1, &txg, tx)); 5372 VERIFY0(zap_update(os, object, propname, sizeof (uint64_t), 5373 ints, value, tx)); 5374 5375 dmu_tx_commit(tx); 5376 5377 /* 5378 * Remove a random pair of entries. 5379 */ 5380 prop = ztest_random(ZTEST_ZAP_MAX_PROPS); 5381 (void) sprintf(propname, "prop_%"PRIu64"", prop); 5382 (void) sprintf(txgname, "txg_%"PRIu64"", prop); 5383 5384 error = zap_length(os, object, txgname, &zl_intsize, &zl_ints); 5385 5386 if (error == ENOENT) 5387 goto out; 5388 5389 ASSERT0(error); 5390 5391 tx = dmu_tx_create(os); 5392 dmu_tx_hold_zap(tx, object, B_TRUE, NULL); 5393 txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); 5394 if (txg == 0) 5395 goto out; 5396 VERIFY0(zap_remove(os, object, txgname, tx)); 5397 VERIFY0(zap_remove(os, object, propname, tx)); 5398 dmu_tx_commit(tx); 5399 out: 5400 umem_free(od, sizeof (ztest_od_t)); 5401 } 5402 5403 /* 5404 * Test case to test the upgrading of a microzap to fatzap. 5405 */ 5406 void 5407 ztest_fzap(ztest_ds_t *zd, uint64_t id) 5408 { 5409 objset_t *os = zd->zd_os; 5410 ztest_od_t *od; 5411 uint64_t object, txg, value; 5412 5413 od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL); 5414 ztest_od_init(od, id, FTAG, 0, DMU_OT_ZAP_OTHER, 0, 0, 0); 5415 5416 if (ztest_object_init(zd, od, sizeof (ztest_od_t), 5417 !ztest_random(2)) != 0) 5418 goto out; 5419 object = od->od_object; 5420 5421 /* 5422 * Add entries to this ZAP and make sure it spills over 5423 * and gets upgraded to a fatzap. Also, since we are adding 5424 * 2050 entries we should see ptrtbl growth and leaf-block split. 
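 *
 * (Rough arithmetic: microzap entries are 64 bytes apiece, so even at
 * the largest microzap block size of 128K a microzap holds on the
 * order of 2047 entries once the header is accounted for; inserting
 * 2050 therefore guarantees the upgrade to a fatzap.)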
5425 */ 5426 for (value = 0; value < 2050; value++) { 5427 char name[ZFS_MAX_DATASET_NAME_LEN]; 5428 dmu_tx_t *tx; 5429 int error; 5430 5431 (void) snprintf(name, sizeof (name), "fzap-%"PRIu64"-%"PRIu64"", 5432 id, value); 5433 5434 tx = dmu_tx_create(os); 5435 dmu_tx_hold_zap(tx, object, B_TRUE, name); 5436 txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); 5437 if (txg == 0) 5438 goto out; 5439 error = zap_add(os, object, name, sizeof (uint64_t), 1, 5440 &value, tx); 5441 ASSERT(error == 0 || error == EEXIST); 5442 dmu_tx_commit(tx); 5443 } 5444 out: 5445 umem_free(od, sizeof (ztest_od_t)); 5446 } 5447 5448 void 5449 ztest_zap_parallel(ztest_ds_t *zd, uint64_t id) 5450 { 5451 (void) id; 5452 objset_t *os = zd->zd_os; 5453 ztest_od_t *od; 5454 uint64_t txg, object, count, wsize, wc, zl_wsize, zl_wc; 5455 dmu_tx_t *tx; 5456 int i, namelen, error; 5457 int micro = ztest_random(2); 5458 char name[20], string_value[20]; 5459 void *data; 5460 5461 od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL); 5462 ztest_od_init(od, ID_PARALLEL, FTAG, micro, DMU_OT_ZAP_OTHER, 0, 0, 0); 5463 5464 if (ztest_object_init(zd, od, sizeof (ztest_od_t), B_FALSE) != 0) { 5465 umem_free(od, sizeof (ztest_od_t)); 5466 return; 5467 } 5468 5469 object = od->od_object; 5470 5471 /* 5472 * Generate a random name of the form 'xxx.....' where each 5473 * x is a random printable character and the dots are dots. 5474 * There are 94 such characters, and the name length goes from 5475 * 6 to 20, so there are 94^3 * 15 = 12,458,760 possible names. 5476 */ 5477 namelen = ztest_random(sizeof (name) - 5) + 5 + 1; 5478 5479 for (i = 0; i < 3; i++) 5480 name[i] = '!' + ztest_random('~' - '!' + 1); 5481 for (; i < namelen - 1; i++) 5482 name[i] = '.'; 5483 name[i] = '\0'; 5484 5485 if ((namelen & 1) || micro) { 5486 wsize = sizeof (txg); 5487 wc = 1; 5488 data = &txg; 5489 } else { 5490 wsize = 1; 5491 wc = namelen; 5492 data = string_value; 5493 } 5494 5495 count = -1ULL; 5496 VERIFY0(zap_count(os, object, &count)); 5497 ASSERT3S(count, !=, -1ULL); 5498 5499 /* 5500 * Select an operation: length, lookup, add, update, remove. 
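 *
 * i selects one of those operations in order (0 = length, 1 = lookup,
 * 2 = add, 3 = update, 4 = remove); only the last three modify the
 * ZAP, which is why a transaction is assigned only when i >= 2.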
5501 */ 5502 i = ztest_random(5); 5503 5504 if (i >= 2) { 5505 tx = dmu_tx_create(os); 5506 dmu_tx_hold_zap(tx, object, B_TRUE, NULL); 5507 txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); 5508 if (txg == 0) { 5509 umem_free(od, sizeof (ztest_od_t)); 5510 return; 5511 } 5512 memcpy(string_value, name, namelen); 5513 } else { 5514 tx = NULL; 5515 txg = 0; 5516 memset(string_value, 0, namelen); 5517 } 5518 5519 switch (i) { 5520 5521 case 0: 5522 error = zap_length(os, object, name, &zl_wsize, &zl_wc); 5523 if (error == 0) { 5524 ASSERT3U(wsize, ==, zl_wsize); 5525 ASSERT3U(wc, ==, zl_wc); 5526 } else { 5527 ASSERT3U(error, ==, ENOENT); 5528 } 5529 break; 5530 5531 case 1: 5532 error = zap_lookup(os, object, name, wsize, wc, data); 5533 if (error == 0) { 5534 if (data == string_value && 5535 memcmp(name, data, namelen) != 0) 5536 fatal(B_FALSE, "name '%s' != val '%s' len %d", 5537 name, (char *)data, namelen); 5538 } else { 5539 ASSERT3U(error, ==, ENOENT); 5540 } 5541 break; 5542 5543 case 2: 5544 error = zap_add(os, object, name, wsize, wc, data, tx); 5545 ASSERT(error == 0 || error == EEXIST); 5546 break; 5547 5548 case 3: 5549 VERIFY0(zap_update(os, object, name, wsize, wc, data, tx)); 5550 break; 5551 5552 case 4: 5553 error = zap_remove(os, object, name, tx); 5554 ASSERT(error == 0 || error == ENOENT); 5555 break; 5556 } 5557 5558 if (tx != NULL) 5559 dmu_tx_commit(tx); 5560 5561 umem_free(od, sizeof (ztest_od_t)); 5562 } 5563 5564 /* 5565 * Commit callback data. 5566 */ 5567 typedef struct ztest_cb_data { 5568 list_node_t zcd_node; 5569 uint64_t zcd_txg; 5570 int zcd_expected_err; 5571 boolean_t zcd_added; 5572 boolean_t zcd_called; 5573 spa_t *zcd_spa; 5574 } ztest_cb_data_t; 5575 5576 /* This is the actual commit callback function */ 5577 static void 5578 ztest_commit_callback(void *arg, int error) 5579 { 5580 ztest_cb_data_t *data = arg; 5581 uint64_t synced_txg; 5582 5583 VERIFY3P(data, !=, NULL); 5584 VERIFY3S(data->zcd_expected_err, ==, error); 5585 VERIFY(!data->zcd_called); 5586 5587 synced_txg = spa_last_synced_txg(data->zcd_spa); 5588 if (data->zcd_txg > synced_txg) 5589 fatal(B_FALSE, 5590 "commit callback of txg %"PRIu64" called prematurely, " 5591 "last synced txg = %"PRIu64"\n", 5592 data->zcd_txg, synced_txg); 5593 5594 data->zcd_called = B_TRUE; 5595 5596 if (error == ECANCELED) { 5597 ASSERT0(data->zcd_txg); 5598 ASSERT(!data->zcd_added); 5599 5600 /* 5601 * The private callback data should be destroyed here, but 5602 * since we are going to check the zcd_called field after 5603 * dmu_tx_abort(), we will destroy it there. 
5604 */ 5605 return; 5606 } 5607 5608 ASSERT(data->zcd_added); 5609 ASSERT3U(data->zcd_txg, !=, 0); 5610 5611 (void) mutex_enter(&zcl.zcl_callbacks_lock); 5612 5613 /* See if this cb was called more quickly */ 5614 if ((synced_txg - data->zcd_txg) < zc_min_txg_delay) 5615 zc_min_txg_delay = synced_txg - data->zcd_txg; 5616 5617 /* Remove our callback from the list */ 5618 list_remove(&zcl.zcl_callbacks, data); 5619 5620 (void) mutex_exit(&zcl.zcl_callbacks_lock); 5621 5622 umem_free(data, sizeof (ztest_cb_data_t)); 5623 } 5624 5625 /* Allocate and initialize callback data structure */ 5626 static ztest_cb_data_t * 5627 ztest_create_cb_data(objset_t *os, uint64_t txg) 5628 { 5629 ztest_cb_data_t *cb_data; 5630 5631 cb_data = umem_zalloc(sizeof (ztest_cb_data_t), UMEM_NOFAIL); 5632 5633 cb_data->zcd_txg = txg; 5634 cb_data->zcd_spa = dmu_objset_spa(os); 5635 list_link_init(&cb_data->zcd_node); 5636 5637 return (cb_data); 5638 } 5639 5640 /* 5641 * Commit callback test. 5642 */ 5643 void 5644 ztest_dmu_commit_callbacks(ztest_ds_t *zd, uint64_t id) 5645 { 5646 objset_t *os = zd->zd_os; 5647 ztest_od_t *od; 5648 dmu_tx_t *tx; 5649 ztest_cb_data_t *cb_data[3], *tmp_cb; 5650 uint64_t old_txg, txg; 5651 int i, error = 0; 5652 5653 od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL); 5654 ztest_od_init(od, id, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0, 0); 5655 5656 if (ztest_object_init(zd, od, sizeof (ztest_od_t), B_FALSE) != 0) { 5657 umem_free(od, sizeof (ztest_od_t)); 5658 return; 5659 } 5660 5661 tx = dmu_tx_create(os); 5662 5663 cb_data[0] = ztest_create_cb_data(os, 0); 5664 dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[0]); 5665 5666 dmu_tx_hold_write(tx, od->od_object, 0, sizeof (uint64_t)); 5667 5668 /* Every once in a while, abort the transaction on purpose */ 5669 if (ztest_random(100) == 0) 5670 error = -1; 5671 5672 if (!error) 5673 error = dmu_tx_assign(tx, TXG_NOWAIT); 5674 5675 txg = error ? 0 : dmu_tx_get_txg(tx); 5676 5677 cb_data[0]->zcd_txg = txg; 5678 cb_data[1] = ztest_create_cb_data(os, txg); 5679 dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[1]); 5680 5681 if (error) { 5682 /* 5683 * It's not a strict requirement to call the registered 5684 * callbacks from inside dmu_tx_abort(), but that's what 5685 * it's supposed to happen in the current implementation 5686 * so we will check for that. 5687 */ 5688 for (i = 0; i < 2; i++) { 5689 cb_data[i]->zcd_expected_err = ECANCELED; 5690 VERIFY(!cb_data[i]->zcd_called); 5691 } 5692 5693 dmu_tx_abort(tx); 5694 5695 for (i = 0; i < 2; i++) { 5696 VERIFY(cb_data[i]->zcd_called); 5697 umem_free(cb_data[i], sizeof (ztest_cb_data_t)); 5698 } 5699 5700 umem_free(od, sizeof (ztest_od_t)); 5701 return; 5702 } 5703 5704 cb_data[2] = ztest_create_cb_data(os, txg); 5705 dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[2]); 5706 5707 /* 5708 * Read existing data to make sure there isn't a future leak. 
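 *
 * That is, the uint64_t at offset 0 of this object always holds the
 * txg of the most recent write (see the dmu_write() below), so finding
 * a value greater than the currently open txg would mean data from a
 * future txg became visible in the past.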
5709 */ 5710 VERIFY0(dmu_read(os, od->od_object, 0, sizeof (uint64_t), 5711 &old_txg, DMU_READ_PREFETCH)); 5712 5713 if (old_txg > txg) 5714 fatal(B_FALSE, 5715 "future leak: got %"PRIu64", open txg is %"PRIu64"", 5716 old_txg, txg); 5717 5718 dmu_write(os, od->od_object, 0, sizeof (uint64_t), &txg, tx); 5719 5720 (void) mutex_enter(&zcl.zcl_callbacks_lock); 5721 5722 /* 5723 * Since commit callbacks don't have any ordering requirement and since 5724 * it is theoretically possible for a commit callback to be called 5725 * after an arbitrary amount of time has elapsed since its txg has been 5726 * synced, it is difficult to reliably determine whether a commit 5727 * callback hasn't been called due to high load or due to a flawed 5728 * implementation. 5729 * 5730 * In practice, we will assume that if after a certain number of txgs a 5731 * commit callback hasn't been called, then most likely there's an 5732 * implementation bug. 5733 */ 5734 tmp_cb = list_head(&zcl.zcl_callbacks); 5735 if (tmp_cb != NULL && 5736 tmp_cb->zcd_txg + ZTEST_COMMIT_CB_THRESH < txg) { 5737 fatal(B_FALSE, 5738 "Commit callback threshold exceeded, " 5739 "oldest txg: %"PRIu64", open txg: %"PRIu64"\n", 5740 tmp_cb->zcd_txg, txg); 5741 } 5742 5743 /* 5744 * Let's find the place to insert our callbacks. 5745 * 5746 * Even though the list is ordered by txg, it is possible for the 5747 * insertion point to not be the end because our txg may already be 5748 * quiescing at this point and other callbacks in the open txg 5749 * (from other objsets) may have sneaked in. 5750 */ 5751 tmp_cb = list_tail(&zcl.zcl_callbacks); 5752 while (tmp_cb != NULL && tmp_cb->zcd_txg > txg) 5753 tmp_cb = list_prev(&zcl.zcl_callbacks, tmp_cb); 5754 5755 /* Add the 3 callbacks to the list */ 5756 for (i = 0; i < 3; i++) { 5757 if (tmp_cb == NULL) 5758 list_insert_head(&zcl.zcl_callbacks, cb_data[i]); 5759 else 5760 list_insert_after(&zcl.zcl_callbacks, tmp_cb, 5761 cb_data[i]); 5762 5763 cb_data[i]->zcd_added = B_TRUE; 5764 VERIFY(!cb_data[i]->zcd_called); 5765 5766 tmp_cb = cb_data[i]; 5767 } 5768 5769 zc_cb_counter += 3; 5770 5771 (void) mutex_exit(&zcl.zcl_callbacks_lock); 5772 5773 dmu_tx_commit(tx); 5774 5775 umem_free(od, sizeof (ztest_od_t)); 5776 } 5777 5778 /* 5779 * Visit each object in the dataset. Verify that its properties 5780 * are consistent with what was stored in the block tag when it was created, 5781 * and that its unused bonus buffer space has not been overwritten.
5782 */ 5783 void 5784 ztest_verify_dnode_bt(ztest_ds_t *zd, uint64_t id) 5785 { 5786 (void) id; 5787 objset_t *os = zd->zd_os; 5788 uint64_t obj; 5789 int err = 0; 5790 5791 for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE, 0)) { 5792 ztest_block_tag_t *bt = NULL; 5793 dmu_object_info_t doi; 5794 dmu_buf_t *db; 5795 5796 ztest_object_lock(zd, obj, RL_READER); 5797 if (dmu_bonus_hold(os, obj, FTAG, &db) != 0) { 5798 ztest_object_unlock(zd, obj); 5799 continue; 5800 } 5801 5802 dmu_object_info_from_db(db, &doi); 5803 if (doi.doi_bonus_size >= sizeof (*bt)) 5804 bt = ztest_bt_bonus(db); 5805 5806 if (bt && bt->bt_magic == BT_MAGIC) { 5807 ztest_bt_verify(bt, os, obj, doi.doi_dnodesize, 5808 bt->bt_offset, bt->bt_gen, bt->bt_txg, 5809 bt->bt_crtxg); 5810 ztest_verify_unused_bonus(db, bt, obj, os, bt->bt_gen); 5811 } 5812 5813 dmu_buf_rele(db, FTAG); 5814 ztest_object_unlock(zd, obj); 5815 } 5816 } 5817 5818 void 5819 ztest_dsl_prop_get_set(ztest_ds_t *zd, uint64_t id) 5820 { 5821 (void) id; 5822 zfs_prop_t proplist[] = { 5823 ZFS_PROP_CHECKSUM, 5824 ZFS_PROP_COMPRESSION, 5825 ZFS_PROP_COPIES, 5826 ZFS_PROP_DEDUP 5827 }; 5828 5829 (void) pthread_rwlock_rdlock(&ztest_name_lock); 5830 5831 for (int p = 0; p < sizeof (proplist) / sizeof (proplist[0]); p++) 5832 (void) ztest_dsl_prop_set_uint64(zd->zd_name, proplist[p], 5833 ztest_random_dsl_prop(proplist[p]), (int)ztest_random(2)); 5834 5835 VERIFY0(ztest_dsl_prop_set_uint64(zd->zd_name, ZFS_PROP_RECORDSIZE, 5836 ztest_random_blocksize(), (int)ztest_random(2))); 5837 5838 (void) pthread_rwlock_unlock(&ztest_name_lock); 5839 } 5840 5841 void 5842 ztest_spa_prop_get_set(ztest_ds_t *zd, uint64_t id) 5843 { 5844 (void) zd, (void) id; 5845 nvlist_t *props = NULL; 5846 5847 (void) pthread_rwlock_rdlock(&ztest_name_lock); 5848 5849 (void) ztest_spa_prop_set_uint64(ZPOOL_PROP_AUTOTRIM, ztest_random(2)); 5850 5851 VERIFY0(spa_prop_get(ztest_spa, &props)); 5852 5853 if (ztest_opts.zo_verbose >= 6) 5854 dump_nvlist(props, 4); 5855 5856 fnvlist_free(props); 5857 5858 (void) pthread_rwlock_unlock(&ztest_name_lock); 5859 } 5860 5861 static int 5862 user_release_one(const char *snapname, const char *holdname) 5863 { 5864 nvlist_t *snaps, *holds; 5865 int error; 5866 5867 snaps = fnvlist_alloc(); 5868 holds = fnvlist_alloc(); 5869 fnvlist_add_boolean(holds, holdname); 5870 fnvlist_add_nvlist(snaps, snapname, holds); 5871 fnvlist_free(holds); 5872 error = dsl_dataset_user_release(snaps, NULL); 5873 fnvlist_free(snaps); 5874 return (error); 5875 } 5876 5877 /* 5878 * Test snapshot hold/release and deferred destroy. 5879 */ 5880 void 5881 ztest_dmu_snapshot_hold(ztest_ds_t *zd, uint64_t id) 5882 { 5883 int error; 5884 objset_t *os = zd->zd_os; 5885 objset_t *origin; 5886 char snapname[100]; 5887 char fullname[100]; 5888 char clonename[100]; 5889 char tag[100]; 5890 char osname[ZFS_MAX_DATASET_NAME_LEN]; 5891 nvlist_t *holds; 5892 5893 (void) pthread_rwlock_rdlock(&ztest_name_lock); 5894 5895 dmu_objset_name(os, osname); 5896 5897 (void) snprintf(snapname, sizeof (snapname), "sh1_%"PRIu64"", id); 5898 (void) snprintf(fullname, sizeof (fullname), "%s@%s", osname, snapname); 5899 (void) snprintf(clonename, sizeof (clonename), "%s/ch1_%"PRIu64"", 5900 osname, id); 5901 (void) snprintf(tag, sizeof (tag), "tag_%"PRIu64"", id); 5902 5903 /* 5904 * Clean up from any previous run. 
5905 */ 5906 error = dsl_destroy_head(clonename); 5907 if (error != ENOENT) 5908 ASSERT0(error); 5909 error = user_release_one(fullname, tag); 5910 if (error != ESRCH && error != ENOENT) 5911 ASSERT0(error); 5912 error = dsl_destroy_snapshot(fullname, B_FALSE); 5913 if (error != ENOENT) 5914 ASSERT0(error); 5915 5916 /* 5917 * Create snapshot, clone it, mark snap for deferred destroy, 5918 * destroy clone, verify snap was also destroyed. 5919 */ 5920 error = dmu_objset_snapshot_one(osname, snapname); 5921 if (error) { 5922 if (error == ENOSPC) { 5923 ztest_record_enospc("dmu_objset_snapshot"); 5924 goto out; 5925 } 5926 fatal(B_FALSE, "dmu_objset_snapshot(%s) = %d", fullname, error); 5927 } 5928 5929 error = dmu_objset_clone(clonename, fullname); 5930 if (error) { 5931 if (error == ENOSPC) { 5932 ztest_record_enospc("dmu_objset_clone"); 5933 goto out; 5934 } 5935 fatal(B_FALSE, "dmu_objset_clone(%s) = %d", clonename, error); 5936 } 5937 5938 error = dsl_destroy_snapshot(fullname, B_TRUE); 5939 if (error) { 5940 fatal(B_FALSE, "dsl_destroy_snapshot(%s, B_TRUE) = %d", 5941 fullname, error); 5942 } 5943 5944 error = dsl_destroy_head(clonename); 5945 if (error) 5946 fatal(B_FALSE, "dsl_destroy_head(%s) = %d", clonename, error); 5947 5948 error = dmu_objset_hold(fullname, FTAG, &origin); 5949 if (error != ENOENT) 5950 fatal(B_FALSE, "dmu_objset_hold(%s) = %d", fullname, error); 5951 5952 /* 5953 * Create snapshot, add temporary hold, verify that we can't 5954 * destroy a held snapshot, mark for deferred destroy, 5955 * release hold, verify snapshot was destroyed. 5956 */ 5957 error = dmu_objset_snapshot_one(osname, snapname); 5958 if (error) { 5959 if (error == ENOSPC) { 5960 ztest_record_enospc("dmu_objset_snapshot"); 5961 goto out; 5962 } 5963 fatal(B_FALSE, "dmu_objset_snapshot(%s) = %d", fullname, error); 5964 } 5965 5966 holds = fnvlist_alloc(); 5967 fnvlist_add_string(holds, fullname, tag); 5968 error = dsl_dataset_user_hold(holds, 0, NULL); 5969 fnvlist_free(holds); 5970 5971 if (error == ENOSPC) { 5972 ztest_record_enospc("dsl_dataset_user_hold"); 5973 goto out; 5974 } else if (error) { 5975 fatal(B_FALSE, "dsl_dataset_user_hold(%s, %s) = %u", 5976 fullname, tag, error); 5977 } 5978 5979 error = dsl_destroy_snapshot(fullname, B_FALSE); 5980 if (error != EBUSY) { 5981 fatal(B_FALSE, "dsl_destroy_snapshot(%s, B_FALSE) = %d", 5982 fullname, error); 5983 } 5984 5985 error = dsl_destroy_snapshot(fullname, B_TRUE); 5986 if (error) { 5987 fatal(B_FALSE, "dsl_destroy_snapshot(%s, B_TRUE) = %d", 5988 fullname, error); 5989 } 5990 5991 error = user_release_one(fullname, tag); 5992 if (error) 5993 fatal(B_FALSE, "user_release_one(%s, %s) = %d", 5994 fullname, tag, error); 5995 5996 VERIFY3U(dmu_objset_hold(fullname, FTAG, &origin), ==, ENOENT); 5997 5998 out: 5999 (void) pthread_rwlock_unlock(&ztest_name_lock); 6000 } 6001 6002 /* 6003 * Inject random faults into the on-disk data. 
6004 */ 6005 void 6006 ztest_fault_inject(ztest_ds_t *zd, uint64_t id) 6007 { 6008 (void) zd, (void) id; 6009 ztest_shared_t *zs = ztest_shared; 6010 spa_t *spa = ztest_spa; 6011 int fd; 6012 uint64_t offset; 6013 uint64_t leaves; 6014 uint64_t bad = 0x1990c0ffeedecadeull; 6015 uint64_t top, leaf; 6016 char *path0; 6017 char *pathrand; 6018 size_t fsize; 6019 int bshift = SPA_MAXBLOCKSHIFT + 2; 6020 int iters = 1000; 6021 int maxfaults; 6022 int mirror_save; 6023 vdev_t *vd0 = NULL; 6024 uint64_t guid0 = 0; 6025 boolean_t islog = B_FALSE; 6026 6027 path0 = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); 6028 pathrand = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); 6029 6030 mutex_enter(&ztest_vdev_lock); 6031 6032 /* 6033 * Device removal is in progress, fault injection must be disabled 6034 * until it completes and the pool is scrubbed. The fault injection 6035 * strategy for damaging blocks does not take in to account evacuated 6036 * blocks which may have already been damaged. 6037 */ 6038 if (ztest_device_removal_active) { 6039 mutex_exit(&ztest_vdev_lock); 6040 goto out; 6041 } 6042 6043 maxfaults = MAXFAULTS(zs); 6044 leaves = MAX(zs->zs_mirrors, 1) * ztest_opts.zo_raid_children; 6045 mirror_save = zs->zs_mirrors; 6046 mutex_exit(&ztest_vdev_lock); 6047 6048 ASSERT3U(leaves, >=, 1); 6049 6050 /* 6051 * While ztest is running the number of leaves will not change. This 6052 * is critical for the fault injection logic as it determines where 6053 * errors can be safely injected such that they are always repairable. 6054 * 6055 * When restarting ztest a different number of leaves may be requested 6056 * which will shift the regions to be damaged. This is fine as long 6057 * as the pool has been scrubbed prior to using the new mapping. 6058 * Failure to do can result in non-repairable damage being injected. 6059 */ 6060 if (ztest_pool_scrubbed == B_FALSE) 6061 goto out; 6062 6063 /* 6064 * Grab the name lock as reader. There are some operations 6065 * which don't like to have their vdevs changed while 6066 * they are in progress (i.e. spa_change_guid). Those 6067 * operations will have grabbed the name lock as writer. 6068 */ 6069 (void) pthread_rwlock_rdlock(&ztest_name_lock); 6070 6071 /* 6072 * We need SCL_STATE here because we're going to look at vd0->vdev_tsd. 6073 */ 6074 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 6075 6076 if (ztest_random(2) == 0) { 6077 /* 6078 * Inject errors on a normal data device or slog device. 6079 */ 6080 top = ztest_random_vdev_top(spa, B_TRUE); 6081 leaf = ztest_random(leaves) + zs->zs_splits; 6082 6083 /* 6084 * Generate paths to the first leaf in this top-level vdev, 6085 * and to the random leaf we selected. We'll induce transient 6086 * write failures and random online/offline activity on leaf 0, 6087 * and we'll write random garbage to the randomly chosen leaf. 6088 */ 6089 (void) snprintf(path0, MAXPATHLEN, ztest_dev_template, 6090 ztest_opts.zo_dir, ztest_opts.zo_pool, 6091 top * leaves + zs->zs_splits); 6092 (void) snprintf(pathrand, MAXPATHLEN, ztest_dev_template, 6093 ztest_opts.zo_dir, ztest_opts.zo_pool, 6094 top * leaves + leaf); 6095 6096 vd0 = vdev_lookup_by_path(spa->spa_root_vdev, path0); 6097 if (vd0 != NULL && vd0->vdev_top->vdev_islog) 6098 islog = B_TRUE; 6099 6100 /* 6101 * If the top-level vdev needs to be resilvered 6102 * then we only allow faults on the device that is 6103 * resilvering. 
6104 */ 6105 if (vd0 != NULL && maxfaults != 1 && 6106 (!vdev_resilver_needed(vd0->vdev_top, NULL, NULL) || 6107 vd0->vdev_resilver_txg != 0)) { 6108 /* 6109 * Make vd0 explicitly claim to be unreadable, 6110 * or unwritable, or reach behind its back 6111 * and close the underlying fd. We can do this if 6112 * maxfaults == 0 because we'll fail and reexecute, 6113 * and we can do it if maxfaults >= 2 because we'll 6114 * have enough redundancy. If maxfaults == 1, the 6115 * combination of this with injection of random data 6116 * corruption below exceeds the pool's fault tolerance. 6117 */ 6118 vdev_file_t *vf = vd0->vdev_tsd; 6119 6120 zfs_dbgmsg("injecting fault to vdev %llu; maxfaults=%d", 6121 (long long)vd0->vdev_id, (int)maxfaults); 6122 6123 if (vf != NULL && ztest_random(3) == 0) { 6124 (void) close(vf->vf_file->f_fd); 6125 vf->vf_file->f_fd = -1; 6126 } else if (ztest_random(2) == 0) { 6127 vd0->vdev_cant_read = B_TRUE; 6128 } else { 6129 vd0->vdev_cant_write = B_TRUE; 6130 } 6131 guid0 = vd0->vdev_guid; 6132 } 6133 } else { 6134 /* 6135 * Inject errors on an l2cache device. 6136 */ 6137 spa_aux_vdev_t *sav = &spa->spa_l2cache; 6138 6139 if (sav->sav_count == 0) { 6140 spa_config_exit(spa, SCL_STATE, FTAG); 6141 (void) pthread_rwlock_unlock(&ztest_name_lock); 6142 goto out; 6143 } 6144 vd0 = sav->sav_vdevs[ztest_random(sav->sav_count)]; 6145 guid0 = vd0->vdev_guid; 6146 (void) strcpy(path0, vd0->vdev_path); 6147 (void) strcpy(pathrand, vd0->vdev_path); 6148 6149 leaf = 0; 6150 leaves = 1; 6151 maxfaults = INT_MAX; /* no limit on cache devices */ 6152 } 6153 6154 spa_config_exit(spa, SCL_STATE, FTAG); 6155 (void) pthread_rwlock_unlock(&ztest_name_lock); 6156 6157 /* 6158 * If we can tolerate two or more faults, or we're dealing 6159 * with a slog, randomly online/offline vd0. 6160 */ 6161 if ((maxfaults >= 2 || islog) && guid0 != 0) { 6162 if (ztest_random(10) < 6) { 6163 int flags = (ztest_random(2) == 0 ? 6164 ZFS_OFFLINE_TEMPORARY : 0); 6165 6166 /* 6167 * We have to grab the zs_name_lock as writer to 6168 * prevent a race between offlining a slog and 6169 * destroying a dataset. Offlining the slog will 6170 * grab a reference on the dataset which may cause 6171 * dsl_destroy_head() to fail with EBUSY thus 6172 * leaving the dataset in an inconsistent state. 6173 */ 6174 if (islog) 6175 (void) pthread_rwlock_wrlock(&ztest_name_lock); 6176 6177 VERIFY3U(vdev_offline(spa, guid0, flags), !=, EBUSY); 6178 6179 if (islog) 6180 (void) pthread_rwlock_unlock(&ztest_name_lock); 6181 } else { 6182 /* 6183 * Ideally we would like to be able to randomly 6184 * call vdev_[on|off]line without holding locks 6185 * to force unpredictable failures but the side 6186 * effects of vdev_[on|off]line prevent us from 6187 * doing so. We grab the ztest_vdev_lock here to 6188 * prevent a race between injection testing and 6189 * aux_vdev removal. 6190 */ 6191 mutex_enter(&ztest_vdev_lock); 6192 (void) vdev_online(spa, guid0, 0, NULL); 6193 mutex_exit(&ztest_vdev_lock); 6194 } 6195 } 6196 6197 if (maxfaults == 0) 6198 goto out; 6199 6200 /* 6201 * We have at least single-fault tolerance, so inject data corruption. 
6202 */ 6203 fd = open(pathrand, O_RDWR); 6204 6205 if (fd == -1) /* we hit a gap in the device namespace */ 6206 goto out; 6207 6208 fsize = lseek(fd, 0, SEEK_END); 6209 6210 while (--iters != 0) { 6211 /* 6212 * The offset must be chosen carefully to ensure that 6213 * we do not inject a given logical block with errors 6214 * on two different leaf devices, because ZFS can not 6215 * tolerate that (if maxfaults==1). 6216 * 6217 * To achieve this we divide each leaf device into 6218 * chunks of size (# leaves * SPA_MAXBLOCKSIZE * 4). 6219 * Each chunk is further divided into error-injection 6220 * ranges (can accept errors) and clear ranges (we do 6221 * not inject errors in those). Each error-injection 6222 * range can accept errors only for a single leaf vdev. 6223 * Error-injection ranges are separated by clear ranges. 6224 * 6225 * For example, with 3 leaves, each chunk looks like: 6226 * 0 to 32M: injection range for leaf 0 6227 * 32M to 64M: clear range - no injection allowed 6228 * 64M to 96M: injection range for leaf 1 6229 * 96M to 128M: clear range - no injection allowed 6230 * 128M to 160M: injection range for leaf 2 6231 * 160M to 192M: clear range - no injection allowed 6232 * 6233 * Each clear range must be large enough such that a 6234 * single block cannot straddle it. This way a block 6235 * can't be a target in two different injection ranges 6236 * (on different leaf vdevs). 6237 */ 6238 offset = ztest_random(fsize / (leaves << bshift)) * 6239 (leaves << bshift) + (leaf << bshift) + 6240 (ztest_random(1ULL << (bshift - 1)) & -8ULL); 6241 6242 /* 6243 * Only allow damage to the labels at one end of the vdev. 6244 * 6245 * If all labels are damaged, the device will be totally 6246 * inaccessible, which will result in loss of data, 6247 * because we also damage (parts of) the other side of 6248 * the mirror/raidz. 6249 * 6250 * Additionally, we will always have both an even and an 6251 * odd label, so that we can handle crashes in the 6252 * middle of vdev_config_sync(). 6253 */ 6254 if ((leaf & 1) == 0 && offset < VDEV_LABEL_START_SIZE) 6255 continue; 6256 6257 /* 6258 * The two end labels are stored at the "end" of the disk, but 6259 * the end of the disk (vdev_psize) is aligned to 6260 * sizeof (vdev_label_t). 6261 */ 6262 uint64_t psize = P2ALIGN(fsize, sizeof (vdev_label_t)); 6263 if ((leaf & 1) == 1 && 6264 offset + sizeof (bad) > psize - VDEV_LABEL_END_SIZE) 6265 continue; 6266 6267 mutex_enter(&ztest_vdev_lock); 6268 if (mirror_save != zs->zs_mirrors) { 6269 mutex_exit(&ztest_vdev_lock); 6270 (void) close(fd); 6271 goto out; 6272 } 6273 6274 if (pwrite(fd, &bad, sizeof (bad), offset) != sizeof (bad)) 6275 fatal(B_TRUE, 6276 "can't inject bad word at 0x%"PRIx64" in %s", 6277 offset, pathrand); 6278 6279 mutex_exit(&ztest_vdev_lock); 6280 6281 if (ztest_opts.zo_verbose >= 7) 6282 (void) printf("injected bad word into %s," 6283 " offset 0x%"PRIx64"\n", pathrand, offset); 6284 } 6285 6286 (void) close(fd); 6287 out: 6288 umem_free(path0, MAXPATHLEN); 6289 umem_free(pathrand, MAXPATHLEN); 6290 } 6291 6292 /* 6293 * By design ztest will never inject uncorrectable damage in to the pool. 6294 * Issue a scrub, wait for it to complete, and verify there is never any 6295 * persistent damage. 6296 * 6297 * Only after a full scrub has been completed is it safe to start injecting 6298 * data corruption. See the comment in zfs_fault_inject(). 
6299 */ 6300 static int 6301 ztest_scrub_impl(spa_t *spa) 6302 { 6303 int error = spa_scan(spa, POOL_SCAN_SCRUB); 6304 if (error) 6305 return (error); 6306 6307 while (dsl_scan_scrubbing(spa_get_dsl(spa))) 6308 txg_wait_synced(spa_get_dsl(spa), 0); 6309 6310 if (spa_get_errlog_size(spa) > 0) 6311 return (ECKSUM); 6312 6313 ztest_pool_scrubbed = B_TRUE; 6314 6315 return (0); 6316 } 6317 6318 /* 6319 * Scrub the pool. 6320 */ 6321 void 6322 ztest_scrub(ztest_ds_t *zd, uint64_t id) 6323 { 6324 (void) zd, (void) id; 6325 spa_t *spa = ztest_spa; 6326 int error; 6327 6328 /* 6329 * Scrub in progress by device removal. 6330 */ 6331 if (ztest_device_removal_active) 6332 return; 6333 6334 /* 6335 * Start a scrub, wait a moment, then force a restart. 6336 */ 6337 (void) spa_scan(spa, POOL_SCAN_SCRUB); 6338 (void) poll(NULL, 0, 100); 6339 6340 error = ztest_scrub_impl(spa); 6341 if (error == EBUSY) 6342 error = 0; 6343 ASSERT0(error); 6344 } 6345 6346 /* 6347 * Change the guid for the pool. 6348 */ 6349 void 6350 ztest_reguid(ztest_ds_t *zd, uint64_t id) 6351 { 6352 (void) zd, (void) id; 6353 spa_t *spa = ztest_spa; 6354 uint64_t orig, load; 6355 int error; 6356 6357 if (ztest_opts.zo_mmp_test) 6358 return; 6359 6360 orig = spa_guid(spa); 6361 load = spa_load_guid(spa); 6362 6363 (void) pthread_rwlock_wrlock(&ztest_name_lock); 6364 error = spa_change_guid(spa); 6365 (void) pthread_rwlock_unlock(&ztest_name_lock); 6366 6367 if (error != 0) 6368 return; 6369 6370 if (ztest_opts.zo_verbose >= 4) { 6371 (void) printf("Changed guid old %"PRIu64" -> %"PRIu64"\n", 6372 orig, spa_guid(spa)); 6373 } 6374 6375 VERIFY3U(orig, !=, spa_guid(spa)); 6376 VERIFY3U(load, ==, spa_load_guid(spa)); 6377 } 6378 6379 void 6380 ztest_blake3(ztest_ds_t *zd, uint64_t id) 6381 { 6382 (void) zd, (void) id; 6383 hrtime_t end = gethrtime() + NANOSEC; 6384 zio_cksum_salt_t salt; 6385 void *salt_ptr = &salt.zcs_bytes; 6386 struct abd *abd_data, *abd_meta; 6387 void *buf, *templ; 6388 int i, *ptr; 6389 uint32_t size; 6390 BLAKE3_CTX ctx; 6391 6392 size = ztest_random_blocksize(); 6393 buf = umem_alloc(size, UMEM_NOFAIL); 6394 abd_data = abd_alloc(size, B_FALSE); 6395 abd_meta = abd_alloc(size, B_TRUE); 6396 6397 for (i = 0, ptr = buf; i < size / sizeof (*ptr); i++, ptr++) 6398 *ptr = ztest_random(UINT_MAX); 6399 memset(salt_ptr, 'A', 32); 6400 6401 abd_copy_from_buf_off(abd_data, buf, 0, size); 6402 abd_copy_from_buf_off(abd_meta, buf, 0, size); 6403 6404 while (gethrtime() <= end) { 6405 int run_count = 100; 6406 zio_cksum_t zc_ref1, zc_ref2; 6407 zio_cksum_t zc_res1, zc_res2; 6408 6409 void *ref1 = &zc_ref1; 6410 void *ref2 = &zc_ref2; 6411 void *res1 = &zc_res1; 6412 void *res2 = &zc_res2; 6413 6414 /* BLAKE3_KEY_LEN = 32 */ 6415 VERIFY0(blake3_set_impl_name("generic")); 6416 templ = abd_checksum_blake3_tmpl_init(&salt); 6417 Blake3_InitKeyed(&ctx, salt_ptr); 6418 Blake3_Update(&ctx, buf, size); 6419 Blake3_Final(&ctx, ref1); 6420 zc_ref2 = zc_ref1; 6421 ZIO_CHECKSUM_BSWAP(&zc_ref2); 6422 abd_checksum_blake3_tmpl_free(templ); 6423 6424 VERIFY0(blake3_set_impl_name("cycle")); 6425 while (run_count-- > 0) { 6426 6427 /* Test current implementation */ 6428 Blake3_InitKeyed(&ctx, salt_ptr); 6429 Blake3_Update(&ctx, buf, size); 6430 Blake3_Final(&ctx, res1); 6431 zc_res2 = zc_res1; 6432 ZIO_CHECKSUM_BSWAP(&zc_res2); 6433 6434 VERIFY0(memcmp(ref1, res1, 32)); 6435 VERIFY0(memcmp(ref2, res2, 32)); 6436 6437 /* Test ABD - data */ 6438 templ = abd_checksum_blake3_tmpl_init(&salt); 6439 abd_checksum_blake3_native(abd_data, size, 6440 
templ, &zc_res1); 6441 abd_checksum_blake3_byteswap(abd_data, size, 6442 templ, &zc_res2); 6443 6444 VERIFY0(memcmp(ref1, res1, 32)); 6445 VERIFY0(memcmp(ref2, res2, 32)); 6446 6447 /* Test ABD - metadata */ 6448 abd_checksum_blake3_native(abd_meta, size, 6449 templ, &zc_res1); 6450 abd_checksum_blake3_byteswap(abd_meta, size, 6451 templ, &zc_res2); 6452 abd_checksum_blake3_tmpl_free(templ); 6453 6454 VERIFY0(memcmp(ref1, res1, 32)); 6455 VERIFY0(memcmp(ref2, res2, 32)); 6456 6457 } 6458 } 6459 6460 abd_free(abd_data); 6461 abd_free(abd_meta); 6462 umem_free(buf, size); 6463 } 6464 6465 void 6466 ztest_fletcher(ztest_ds_t *zd, uint64_t id) 6467 { 6468 (void) zd, (void) id; 6469 hrtime_t end = gethrtime() + NANOSEC; 6470 6471 while (gethrtime() <= end) { 6472 int run_count = 100; 6473 void *buf; 6474 struct abd *abd_data, *abd_meta; 6475 uint32_t size; 6476 int *ptr; 6477 int i; 6478 zio_cksum_t zc_ref; 6479 zio_cksum_t zc_ref_byteswap; 6480 6481 size = ztest_random_blocksize(); 6482 6483 buf = umem_alloc(size, UMEM_NOFAIL); 6484 abd_data = abd_alloc(size, B_FALSE); 6485 abd_meta = abd_alloc(size, B_TRUE); 6486 6487 for (i = 0, ptr = buf; i < size / sizeof (*ptr); i++, ptr++) 6488 *ptr = ztest_random(UINT_MAX); 6489 6490 abd_copy_from_buf_off(abd_data, buf, 0, size); 6491 abd_copy_from_buf_off(abd_meta, buf, 0, size); 6492 6493 VERIFY0(fletcher_4_impl_set("scalar")); 6494 fletcher_4_native(buf, size, NULL, &zc_ref); 6495 fletcher_4_byteswap(buf, size, NULL, &zc_ref_byteswap); 6496 6497 VERIFY0(fletcher_4_impl_set("cycle")); 6498 while (run_count-- > 0) { 6499 zio_cksum_t zc; 6500 zio_cksum_t zc_byteswap; 6501 6502 fletcher_4_byteswap(buf, size, NULL, &zc_byteswap); 6503 fletcher_4_native(buf, size, NULL, &zc); 6504 6505 VERIFY0(memcmp(&zc, &zc_ref, sizeof (zc))); 6506 VERIFY0(memcmp(&zc_byteswap, &zc_ref_byteswap, 6507 sizeof (zc_byteswap))); 6508 6509 /* Test ABD - data */ 6510 abd_fletcher_4_byteswap(abd_data, size, NULL, 6511 &zc_byteswap); 6512 abd_fletcher_4_native(abd_data, size, NULL, &zc); 6513 6514 VERIFY0(memcmp(&zc, &zc_ref, sizeof (zc))); 6515 VERIFY0(memcmp(&zc_byteswap, &zc_ref_byteswap, 6516 sizeof (zc_byteswap))); 6517 6518 /* Test ABD - metadata */ 6519 abd_fletcher_4_byteswap(abd_meta, size, NULL, 6520 &zc_byteswap); 6521 abd_fletcher_4_native(abd_meta, size, NULL, &zc); 6522 6523 VERIFY0(memcmp(&zc, &zc_ref, sizeof (zc))); 6524 VERIFY0(memcmp(&zc_byteswap, &zc_ref_byteswap, 6525 sizeof (zc_byteswap))); 6526 6527 } 6528 6529 umem_free(buf, size); 6530 abd_free(abd_data); 6531 abd_free(abd_meta); 6532 } 6533 } 6534 6535 void 6536 ztest_fletcher_incr(ztest_ds_t *zd, uint64_t id) 6537 { 6538 (void) zd, (void) id; 6539 void *buf; 6540 size_t size; 6541 int *ptr; 6542 int i; 6543 zio_cksum_t zc_ref; 6544 zio_cksum_t zc_ref_bswap; 6545 6546 hrtime_t end = gethrtime() + NANOSEC; 6547 6548 while (gethrtime() <= end) { 6549 int run_count = 100; 6550 6551 size = ztest_random_blocksize(); 6552 buf = umem_alloc(size, UMEM_NOFAIL); 6553 6554 for (i = 0, ptr = buf; i < size / sizeof (*ptr); i++, ptr++) 6555 *ptr = ztest_random(UINT_MAX); 6556 6557 VERIFY0(fletcher_4_impl_set("scalar")); 6558 fletcher_4_native(buf, size, NULL, &zc_ref); 6559 fletcher_4_byteswap(buf, size, NULL, &zc_ref_bswap); 6560 6561 VERIFY0(fletcher_4_impl_set("cycle")); 6562 6563 while (run_count-- > 0) { 6564 zio_cksum_t zc; 6565 zio_cksum_t zc_bswap; 6566 size_t pos = 0; 6567 6568 ZIO_SET_CHECKSUM(&zc, 0, 0, 0, 0); 6569 ZIO_SET_CHECKSUM(&zc_bswap, 0, 0, 0, 0); 6570 6571 while (pos < size) { 6572 size_t inc = 64 * 
ztest_random(size / 67); 6573 /* sometimes add few bytes to test non-simd */ 6574 if (ztest_random(100) < 10) 6575 inc += P2ALIGN(ztest_random(64), 6576 sizeof (uint32_t)); 6577 6578 if (inc > (size - pos)) 6579 inc = size - pos; 6580 6581 fletcher_4_incremental_native(buf + pos, inc, 6582 &zc); 6583 fletcher_4_incremental_byteswap(buf + pos, inc, 6584 &zc_bswap); 6585 6586 pos += inc; 6587 } 6588 6589 VERIFY3U(pos, ==, size); 6590 6591 VERIFY(ZIO_CHECKSUM_EQUAL(zc, zc_ref)); 6592 VERIFY(ZIO_CHECKSUM_EQUAL(zc_bswap, zc_ref_bswap)); 6593 6594 /* 6595 * verify if incremental on the whole buffer is 6596 * equivalent to non-incremental version 6597 */ 6598 ZIO_SET_CHECKSUM(&zc, 0, 0, 0, 0); 6599 ZIO_SET_CHECKSUM(&zc_bswap, 0, 0, 0, 0); 6600 6601 fletcher_4_incremental_native(buf, size, &zc); 6602 fletcher_4_incremental_byteswap(buf, size, &zc_bswap); 6603 6604 VERIFY(ZIO_CHECKSUM_EQUAL(zc, zc_ref)); 6605 VERIFY(ZIO_CHECKSUM_EQUAL(zc_bswap, zc_ref_bswap)); 6606 } 6607 6608 umem_free(buf, size); 6609 } 6610 } 6611 6612 static int 6613 ztest_set_global_vars(void) 6614 { 6615 for (size_t i = 0; i < ztest_opts.zo_gvars_count; i++) { 6616 char *kv = ztest_opts.zo_gvars[i]; 6617 VERIFY3U(strlen(kv), <=, ZO_GVARS_MAX_ARGLEN); 6618 VERIFY3U(strlen(kv), >, 0); 6619 int err = set_global_var(kv); 6620 if (ztest_opts.zo_verbose > 0) { 6621 (void) printf("setting global var %s ... %s\n", kv, 6622 err ? "failed" : "ok"); 6623 } 6624 if (err != 0) { 6625 (void) fprintf(stderr, 6626 "failed to set global var '%s'\n", kv); 6627 return (err); 6628 } 6629 } 6630 return (0); 6631 } 6632 6633 static char ** 6634 ztest_global_vars_to_zdb_args(void) 6635 { 6636 char **args = calloc(2*ztest_opts.zo_gvars_count + 1, sizeof (char *)); 6637 char **cur = args; 6638 for (size_t i = 0; i < ztest_opts.zo_gvars_count; i++) { 6639 char *kv = ztest_opts.zo_gvars[i]; 6640 *cur = "-o"; 6641 cur++; 6642 *cur = strdup(kv); 6643 cur++; 6644 } 6645 ASSERT3P(cur, ==, &args[2*ztest_opts.zo_gvars_count]); 6646 *cur = NULL; 6647 return (args); 6648 } 6649 6650 /* The end of strings is indicated by a NULL element */ 6651 static char * 6652 join_strings(char **strings, const char *sep) 6653 { 6654 size_t totallen = 0; 6655 for (char **sp = strings; *sp != NULL; sp++) { 6656 totallen += strlen(*sp); 6657 totallen += strlen(sep); 6658 } 6659 if (totallen > 0) { 6660 ASSERT(totallen >= strlen(sep)); 6661 totallen -= strlen(sep); 6662 } 6663 6664 size_t buflen = totallen + 1; 6665 char *o = malloc(buflen); /* trailing 0 byte */ 6666 o[0] = '\0'; 6667 for (char **sp = strings; *sp != NULL; sp++) { 6668 size_t would; 6669 would = strlcat(o, *sp, buflen); 6670 VERIFY3U(would, <, buflen); 6671 if (*(sp+1) == NULL) { 6672 break; 6673 } 6674 would = strlcat(o, sep, buflen); 6675 VERIFY3U(would, <, buflen); 6676 } 6677 ASSERT3S(strlen(o), ==, totallen); 6678 return (o); 6679 } 6680 6681 static int 6682 ztest_check_path(char *path) 6683 { 6684 struct stat s; 6685 /* return true on success */ 6686 return (!stat(path, &s)); 6687 } 6688 6689 static void 6690 ztest_get_zdb_bin(char *bin, int len) 6691 { 6692 char *zdb_path; 6693 /* 6694 * Try to use $ZDB and in-tree zdb path. If not successful, just 6695 * let popen to search through PATH. 
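	 *
	 * For example (hypothetical build paths): if this binary resolves
	 * to /build/cmd/ztest/.libs/ztest, the in-tree candidate becomes
	 * /build/cmd/ztest/zdb; if that path doesn't exist either, the
	 * bare name "zdb" is returned and popen() finds it via PATH.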
6696 */ 6697 if ((zdb_path = getenv("ZDB"))) { 6698 strlcpy(bin, zdb_path, len); /* In env */ 6699 if (!ztest_check_path(bin)) { 6700 ztest_dump_core = 0; 6701 fatal(B_TRUE, "invalid ZDB '%s'", bin); 6702 } 6703 return; 6704 } 6705 6706 VERIFY3P(realpath(getexecname(), bin), !=, NULL); 6707 if (strstr(bin, ".libs/ztest")) { 6708 strstr(bin, ".libs/ztest")[0] = '\0'; /* In-tree */ 6709 strcat(bin, "zdb"); 6710 if (ztest_check_path(bin)) 6711 return; 6712 } 6713 strcpy(bin, "zdb"); 6714 } 6715 6716 static vdev_t * 6717 ztest_random_concrete_vdev_leaf(vdev_t *vd) 6718 { 6719 if (vd == NULL) 6720 return (NULL); 6721 6722 if (vd->vdev_children == 0) 6723 return (vd); 6724 6725 vdev_t *eligible[vd->vdev_children]; 6726 int eligible_idx = 0, i; 6727 for (i = 0; i < vd->vdev_children; i++) { 6728 vdev_t *cvd = vd->vdev_child[i]; 6729 if (cvd->vdev_top->vdev_removing) 6730 continue; 6731 if (cvd->vdev_children > 0 || 6732 (vdev_is_concrete(cvd) && !cvd->vdev_detached)) { 6733 eligible[eligible_idx++] = cvd; 6734 } 6735 } 6736 VERIFY3S(eligible_idx, >, 0); 6737 6738 uint64_t child_no = ztest_random(eligible_idx); 6739 return (ztest_random_concrete_vdev_leaf(eligible[child_no])); 6740 } 6741 6742 void 6743 ztest_initialize(ztest_ds_t *zd, uint64_t id) 6744 { 6745 (void) zd, (void) id; 6746 spa_t *spa = ztest_spa; 6747 int error = 0; 6748 6749 mutex_enter(&ztest_vdev_lock); 6750 6751 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 6752 6753 /* Random leaf vdev */ 6754 vdev_t *rand_vd = ztest_random_concrete_vdev_leaf(spa->spa_root_vdev); 6755 if (rand_vd == NULL) { 6756 spa_config_exit(spa, SCL_VDEV, FTAG); 6757 mutex_exit(&ztest_vdev_lock); 6758 return; 6759 } 6760 6761 /* 6762 * The random vdev we've selected may change as soon as we 6763 * drop the spa_config_lock. We create local copies of things 6764 * we're interested in. 
6765 */ 6766 uint64_t guid = rand_vd->vdev_guid; 6767 char *path = strdup(rand_vd->vdev_path); 6768 boolean_t active = rand_vd->vdev_initialize_thread != NULL; 6769 6770 zfs_dbgmsg("vd %px, guid %llu", rand_vd, (u_longlong_t)guid); 6771 spa_config_exit(spa, SCL_VDEV, FTAG); 6772 6773 uint64_t cmd = ztest_random(POOL_INITIALIZE_FUNCS); 6774 6775 nvlist_t *vdev_guids = fnvlist_alloc(); 6776 nvlist_t *vdev_errlist = fnvlist_alloc(); 6777 fnvlist_add_uint64(vdev_guids, path, guid); 6778 error = spa_vdev_initialize(spa, vdev_guids, cmd, vdev_errlist); 6779 fnvlist_free(vdev_guids); 6780 fnvlist_free(vdev_errlist); 6781 6782 switch (cmd) { 6783 case POOL_INITIALIZE_CANCEL: 6784 if (ztest_opts.zo_verbose >= 4) { 6785 (void) printf("Cancel initialize %s", path); 6786 if (!active) 6787 (void) printf(" failed (no initialize active)"); 6788 (void) printf("\n"); 6789 } 6790 break; 6791 case POOL_INITIALIZE_START: 6792 if (ztest_opts.zo_verbose >= 4) { 6793 (void) printf("Start initialize %s", path); 6794 if (active && error == 0) 6795 (void) printf(" failed (already active)"); 6796 else if (error != 0) 6797 (void) printf(" failed (error %d)", error); 6798 (void) printf("\n"); 6799 } 6800 break; 6801 case POOL_INITIALIZE_SUSPEND: 6802 if (ztest_opts.zo_verbose >= 4) { 6803 (void) printf("Suspend initialize %s", path); 6804 if (!active) 6805 (void) printf(" failed (no initialize active)"); 6806 (void) printf("\n"); 6807 } 6808 break; 6809 } 6810 free(path); 6811 mutex_exit(&ztest_vdev_lock); 6812 } 6813 6814 void 6815 ztest_trim(ztest_ds_t *zd, uint64_t id) 6816 { 6817 (void) zd, (void) id; 6818 spa_t *spa = ztest_spa; 6819 int error = 0; 6820 6821 mutex_enter(&ztest_vdev_lock); 6822 6823 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 6824 6825 /* Random leaf vdev */ 6826 vdev_t *rand_vd = ztest_random_concrete_vdev_leaf(spa->spa_root_vdev); 6827 if (rand_vd == NULL) { 6828 spa_config_exit(spa, SCL_VDEV, FTAG); 6829 mutex_exit(&ztest_vdev_lock); 6830 return; 6831 } 6832 6833 /* 6834 * The random vdev we've selected may change as soon as we 6835 * drop the spa_config_lock. We create local copies of things 6836 * we're interested in. 
6837 */ 6838 uint64_t guid = rand_vd->vdev_guid; 6839 char *path = strdup(rand_vd->vdev_path); 6840 boolean_t active = rand_vd->vdev_trim_thread != NULL; 6841 6842 zfs_dbgmsg("vd %p, guid %llu", rand_vd, (u_longlong_t)guid); 6843 spa_config_exit(spa, SCL_VDEV, FTAG); 6844 6845 uint64_t cmd = ztest_random(POOL_TRIM_FUNCS); 6846 uint64_t rate = 1 << ztest_random(30); 6847 boolean_t partial = (ztest_random(5) > 0); 6848 boolean_t secure = (ztest_random(5) > 0); 6849 6850 nvlist_t *vdev_guids = fnvlist_alloc(); 6851 nvlist_t *vdev_errlist = fnvlist_alloc(); 6852 fnvlist_add_uint64(vdev_guids, path, guid); 6853 error = spa_vdev_trim(spa, vdev_guids, cmd, rate, partial, 6854 secure, vdev_errlist); 6855 fnvlist_free(vdev_guids); 6856 fnvlist_free(vdev_errlist); 6857 6858 switch (cmd) { 6859 case POOL_TRIM_CANCEL: 6860 if (ztest_opts.zo_verbose >= 4) { 6861 (void) printf("Cancel TRIM %s", path); 6862 if (!active) 6863 (void) printf(" failed (no TRIM active)"); 6864 (void) printf("\n"); 6865 } 6866 break; 6867 case POOL_TRIM_START: 6868 if (ztest_opts.zo_verbose >= 4) { 6869 (void) printf("Start TRIM %s", path); 6870 if (active && error == 0) 6871 (void) printf(" failed (already active)"); 6872 else if (error != 0) 6873 (void) printf(" failed (error %d)", error); 6874 (void) printf("\n"); 6875 } 6876 break; 6877 case POOL_TRIM_SUSPEND: 6878 if (ztest_opts.zo_verbose >= 4) { 6879 (void) printf("Suspend TRIM %s", path); 6880 if (!active) 6881 (void) printf(" failed (no TRIM active)"); 6882 (void) printf("\n"); 6883 } 6884 break; 6885 } 6886 free(path); 6887 mutex_exit(&ztest_vdev_lock); 6888 } 6889 6890 /* 6891 * Verify pool integrity by running zdb. 6892 */ 6893 static void 6894 ztest_run_zdb(char *pool) 6895 { 6896 int status; 6897 char *bin; 6898 char *zdb; 6899 char *zbuf; 6900 const int len = MAXPATHLEN + MAXNAMELEN + 20; 6901 FILE *fp; 6902 6903 bin = umem_alloc(len, UMEM_NOFAIL); 6904 zdb = umem_alloc(len, UMEM_NOFAIL); 6905 zbuf = umem_alloc(1024, UMEM_NOFAIL); 6906 6907 ztest_get_zdb_bin(bin, len); 6908 6909 char **set_gvars_args = ztest_global_vars_to_zdb_args(); 6910 char *set_gvars_args_joined = join_strings(set_gvars_args, " "); 6911 free(set_gvars_args); 6912 6913 size_t would = snprintf(zdb, len, 6914 "%s -bcc%s%s -G -d -Y -e -y %s -p %s %s", 6915 bin, 6916 ztest_opts.zo_verbose >= 3 ? "s" : "", 6917 ztest_opts.zo_verbose >= 4 ? 
"v" : "", 6918 set_gvars_args_joined, 6919 ztest_opts.zo_dir, 6920 pool); 6921 ASSERT3U(would, <, len); 6922 6923 free(set_gvars_args_joined); 6924 6925 if (ztest_opts.zo_verbose >= 5) 6926 (void) printf("Executing %s\n", zdb); 6927 6928 fp = popen(zdb, "r"); 6929 6930 while (fgets(zbuf, 1024, fp) != NULL) 6931 if (ztest_opts.zo_verbose >= 3) 6932 (void) printf("%s", zbuf); 6933 6934 status = pclose(fp); 6935 6936 if (status == 0) 6937 goto out; 6938 6939 ztest_dump_core = 0; 6940 if (WIFEXITED(status)) 6941 fatal(B_FALSE, "'%s' exit code %d", zdb, WEXITSTATUS(status)); 6942 else 6943 fatal(B_FALSE, "'%s' died with signal %d", 6944 zdb, WTERMSIG(status)); 6945 out: 6946 umem_free(bin, len); 6947 umem_free(zdb, len); 6948 umem_free(zbuf, 1024); 6949 } 6950 6951 static void 6952 ztest_walk_pool_directory(char *header) 6953 { 6954 spa_t *spa = NULL; 6955 6956 if (ztest_opts.zo_verbose >= 6) 6957 (void) printf("%s\n", header); 6958 6959 mutex_enter(&spa_namespace_lock); 6960 while ((spa = spa_next(spa)) != NULL) 6961 if (ztest_opts.zo_verbose >= 6) 6962 (void) printf("\t%s\n", spa_name(spa)); 6963 mutex_exit(&spa_namespace_lock); 6964 } 6965 6966 static void 6967 ztest_spa_import_export(char *oldname, char *newname) 6968 { 6969 nvlist_t *config, *newconfig; 6970 uint64_t pool_guid; 6971 spa_t *spa; 6972 int error; 6973 6974 if (ztest_opts.zo_verbose >= 4) { 6975 (void) printf("import/export: old = %s, new = %s\n", 6976 oldname, newname); 6977 } 6978 6979 /* 6980 * Clean up from previous runs. 6981 */ 6982 (void) spa_destroy(newname); 6983 6984 /* 6985 * Get the pool's configuration and guid. 6986 */ 6987 VERIFY0(spa_open(oldname, &spa, FTAG)); 6988 6989 /* 6990 * Kick off a scrub to tickle scrub/export races. 6991 */ 6992 if (ztest_random(2) == 0) 6993 (void) spa_scan(spa, POOL_SCAN_SCRUB); 6994 6995 pool_guid = spa_guid(spa); 6996 spa_close(spa, FTAG); 6997 6998 ztest_walk_pool_directory("pools before export"); 6999 7000 /* 7001 * Export it. 7002 */ 7003 VERIFY0(spa_export(oldname, &config, B_FALSE, B_FALSE)); 7004 7005 ztest_walk_pool_directory("pools after export"); 7006 7007 /* 7008 * Try to import it. 7009 */ 7010 newconfig = spa_tryimport(config); 7011 ASSERT3P(newconfig, !=, NULL); 7012 fnvlist_free(newconfig); 7013 7014 /* 7015 * Import it under the new name. 7016 */ 7017 error = spa_import(newname, config, NULL, 0); 7018 if (error != 0) { 7019 dump_nvlist(config, 0); 7020 fatal(B_FALSE, "couldn't import pool %s as %s: error %u", 7021 oldname, newname, error); 7022 } 7023 7024 ztest_walk_pool_directory("pools after import"); 7025 7026 /* 7027 * Try to import it again -- should fail with EEXIST. 7028 */ 7029 VERIFY3U(EEXIST, ==, spa_import(newname, config, NULL, 0)); 7030 7031 /* 7032 * Try to import it under a different name -- should fail with EEXIST. 7033 */ 7034 VERIFY3U(EEXIST, ==, spa_import(oldname, config, NULL, 0)); 7035 7036 /* 7037 * Verify that the pool is no longer visible under the old name. 7038 */ 7039 VERIFY3U(ENOENT, ==, spa_open(oldname, &spa, FTAG)); 7040 7041 /* 7042 * Verify that we can open and close the pool using the new name. 
7043 */ 7044 VERIFY0(spa_open(newname, &spa, FTAG)); 7045 ASSERT3U(pool_guid, ==, spa_guid(spa)); 7046 spa_close(spa, FTAG); 7047 7048 fnvlist_free(config); 7049 } 7050 7051 static void 7052 ztest_resume(spa_t *spa) 7053 { 7054 if (spa_suspended(spa) && ztest_opts.zo_verbose >= 6) 7055 (void) printf("resuming from suspended state\n"); 7056 spa_vdev_state_enter(spa, SCL_NONE); 7057 vdev_clear(spa, NULL); 7058 (void) spa_vdev_state_exit(spa, NULL, 0); 7059 (void) zio_resume(spa); 7060 } 7061 7062 static __attribute__((noreturn)) void 7063 ztest_resume_thread(void *arg) 7064 { 7065 spa_t *spa = arg; 7066 7067 while (!ztest_exiting) { 7068 if (spa_suspended(spa)) 7069 ztest_resume(spa); 7070 (void) poll(NULL, 0, 100); 7071 7072 /* 7073 * Periodically change the zfs_compressed_arc_enabled setting. 7074 */ 7075 if (ztest_random(10) == 0) 7076 zfs_compressed_arc_enabled = ztest_random(2); 7077 7078 /* 7079 * Periodically change the zfs_abd_scatter_enabled setting. 7080 */ 7081 if (ztest_random(10) == 0) 7082 zfs_abd_scatter_enabled = ztest_random(2); 7083 } 7084 7085 thread_exit(); 7086 } 7087 7088 static __attribute__((noreturn)) void 7089 ztest_deadman_thread(void *arg) 7090 { 7091 ztest_shared_t *zs = arg; 7092 spa_t *spa = ztest_spa; 7093 hrtime_t delay, overdue, last_run = gethrtime(); 7094 7095 delay = (zs->zs_thread_stop - zs->zs_thread_start) + 7096 MSEC2NSEC(zfs_deadman_synctime_ms); 7097 7098 while (!ztest_exiting) { 7099 /* 7100 * Wait for the delay timer while checking occasionally 7101 * if we should stop. 7102 */ 7103 if (gethrtime() < last_run + delay) { 7104 (void) poll(NULL, 0, 1000); 7105 continue; 7106 } 7107 7108 /* 7109 * If the pool is suspended then fail immediately. Otherwise, 7110 * check to see if the pool is making any progress. If 7111 * vdev_deadman() discovers that there hasn't been any recent 7112 * I/Os then it will end up aborting the tests. 7113 */ 7114 if (spa_suspended(spa) || spa->spa_root_vdev == NULL) { 7115 fatal(B_FALSE, 7116 "aborting test after %lu seconds because " 7117 "pool has transitioned to a suspended state.", 7118 zfs_deadman_synctime_ms / 1000); 7119 } 7120 vdev_deadman(spa->spa_root_vdev, FTAG); 7121 7122 /* 7123 * If the process doesn't complete within a grace period of 7124 * zfs_deadman_synctime_ms over the expected finish time, 7125 * then it may be hung and is terminated. 
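		 *
		 * For example, with the zfs_deadman_synctime_ms of 300000
		 * set in main(), the process gets roughly an extra five
		 * minutes beyond zs_proc_stop before being declared hung.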
7126 */ 7127 overdue = zs->zs_proc_stop + MSEC2NSEC(zfs_deadman_synctime_ms); 7128 if (gethrtime() > overdue) { 7129 fatal(B_FALSE, 7130 "aborting test after %llu seconds because " 7131 "the process is overdue for termination.", 7132 (gethrtime() - zs->zs_proc_start) / NANOSEC); 7133 } 7134 7135 (void) printf("ztest has been running for %lld seconds\n", 7136 (gethrtime() - zs->zs_proc_start) / NANOSEC); 7137 7138 last_run = gethrtime(); 7139 delay = MSEC2NSEC(zfs_deadman_checktime_ms); 7140 } 7141 7142 thread_exit(); 7143 } 7144 7145 static void 7146 ztest_execute(int test, ztest_info_t *zi, uint64_t id) 7147 { 7148 ztest_ds_t *zd = &ztest_ds[id % ztest_opts.zo_datasets]; 7149 ztest_shared_callstate_t *zc = ZTEST_GET_SHARED_CALLSTATE(test); 7150 hrtime_t functime = gethrtime(); 7151 int i; 7152 7153 for (i = 0; i < zi->zi_iters; i++) 7154 zi->zi_func(zd, id); 7155 7156 functime = gethrtime() - functime; 7157 7158 atomic_add_64(&zc->zc_count, 1); 7159 atomic_add_64(&zc->zc_time, functime); 7160 7161 if (ztest_opts.zo_verbose >= 4) 7162 (void) printf("%6.2f sec in %s\n", 7163 (double)functime / NANOSEC, zi->zi_funcname); 7164 } 7165 7166 static __attribute__((noreturn)) void 7167 ztest_thread(void *arg) 7168 { 7169 int rand; 7170 uint64_t id = (uintptr_t)arg; 7171 ztest_shared_t *zs = ztest_shared; 7172 uint64_t call_next; 7173 hrtime_t now; 7174 ztest_info_t *zi; 7175 ztest_shared_callstate_t *zc; 7176 7177 while ((now = gethrtime()) < zs->zs_thread_stop) { 7178 /* 7179 * See if it's time to force a crash. 7180 */ 7181 if (now > zs->zs_thread_kill) 7182 ztest_kill(zs); 7183 7184 /* 7185 * If we're getting ENOSPC with some regularity, stop. 7186 */ 7187 if (zs->zs_enospc_count > 10) 7188 break; 7189 7190 /* 7191 * Pick a random function to execute. 7192 */ 7193 rand = ztest_random(ZTEST_FUNCS); 7194 zi = &ztest_info[rand]; 7195 zc = ZTEST_GET_SHARED_CALLSTATE(rand); 7196 call_next = zc->zc_next; 7197 7198 if (now >= call_next && 7199 atomic_cas_64(&zc->zc_next, call_next, call_next + 7200 ztest_random(2 * zi->zi_interval[0] + 1)) == call_next) { 7201 ztest_execute(rand, zi, id); 7202 } 7203 } 7204 7205 thread_exit(); 7206 } 7207 7208 static void 7209 ztest_dataset_name(char *dsname, char *pool, int d) 7210 { 7211 (void) snprintf(dsname, ZFS_MAX_DATASET_NAME_LEN, "%s/ds_%d", pool, d); 7212 } 7213 7214 static void 7215 ztest_dataset_destroy(int d) 7216 { 7217 char name[ZFS_MAX_DATASET_NAME_LEN]; 7218 int t; 7219 7220 ztest_dataset_name(name, ztest_opts.zo_pool, d); 7221 7222 if (ztest_opts.zo_verbose >= 3) 7223 (void) printf("Destroying %s to free up space\n", name); 7224 7225 /* 7226 * Cleanup any non-standard clones and snapshots. In general, 7227 * ztest thread t operates on dataset (t % zopt_datasets), 7228 * so there may be more than one thing to clean up. 7229 */ 7230 for (t = d; t < ztest_opts.zo_threads; 7231 t += ztest_opts.zo_datasets) 7232 ztest_dsl_dataset_cleanup(name, t); 7233 7234 (void) dmu_objset_find(name, ztest_objset_destroy_cb, NULL, 7235 DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN); 7236 } 7237 7238 static void 7239 ztest_dataset_dirobj_verify(ztest_ds_t *zd) 7240 { 7241 uint64_t usedobjs, dirobjs, scratch; 7242 7243 /* 7244 * ZTEST_DIROBJ is the object directory for the entire dataset. 7245 * Therefore, the number of objects in use should equal the 7246 * number of ZTEST_DIROBJ entries, +1 for ZTEST_DIROBJ itself. 7247 * If not, we have an object leak. 
7248 * 7249 * Note that we can only check this in ztest_dataset_open(), 7250 * when the open-context and syncing-context values agree. 7251 * That's because zap_count() returns the open-context value, 7252 * while dmu_objset_space() returns the rootbp fill count. 7253 */ 7254 VERIFY0(zap_count(zd->zd_os, ZTEST_DIROBJ, &dirobjs)); 7255 dmu_objset_space(zd->zd_os, &scratch, &scratch, &usedobjs, &scratch); 7256 ASSERT3U(dirobjs + 1, ==, usedobjs); 7257 } 7258 7259 static int 7260 ztest_dataset_open(int d) 7261 { 7262 ztest_ds_t *zd = &ztest_ds[d]; 7263 uint64_t committed_seq = ZTEST_GET_SHARED_DS(d)->zd_seq; 7264 objset_t *os; 7265 zilog_t *zilog; 7266 char name[ZFS_MAX_DATASET_NAME_LEN]; 7267 int error; 7268 7269 ztest_dataset_name(name, ztest_opts.zo_pool, d); 7270 7271 (void) pthread_rwlock_rdlock(&ztest_name_lock); 7272 7273 error = ztest_dataset_create(name); 7274 if (error == ENOSPC) { 7275 (void) pthread_rwlock_unlock(&ztest_name_lock); 7276 ztest_record_enospc(FTAG); 7277 return (error); 7278 } 7279 ASSERT(error == 0 || error == EEXIST); 7280 7281 VERIFY0(ztest_dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, 7282 B_TRUE, zd, &os)); 7283 (void) pthread_rwlock_unlock(&ztest_name_lock); 7284 7285 ztest_zd_init(zd, ZTEST_GET_SHARED_DS(d), os); 7286 7287 zilog = zd->zd_zilog; 7288 7289 if (zilog->zl_header->zh_claim_lr_seq != 0 && 7290 zilog->zl_header->zh_claim_lr_seq < committed_seq) 7291 fatal(B_FALSE, "missing log records: " 7292 "claimed %"PRIu64" < committed %"PRIu64"", 7293 zilog->zl_header->zh_claim_lr_seq, committed_seq); 7294 7295 ztest_dataset_dirobj_verify(zd); 7296 7297 zil_replay(os, zd, ztest_replay_vector); 7298 7299 ztest_dataset_dirobj_verify(zd); 7300 7301 if (ztest_opts.zo_verbose >= 6) 7302 (void) printf("%s replay %"PRIu64" blocks, " 7303 "%"PRIu64" records, seq %"PRIu64"\n", 7304 zd->zd_name, 7305 zilog->zl_parse_blk_count, 7306 zilog->zl_parse_lr_count, 7307 zilog->zl_replaying_seq); 7308 7309 zilog = zil_open(os, ztest_get_data); 7310 7311 if (zilog->zl_replaying_seq != 0 && 7312 zilog->zl_replaying_seq < committed_seq) 7313 fatal(B_FALSE, "missing log records: " 7314 "replayed %"PRIu64" < committed %"PRIu64"", 7315 zilog->zl_replaying_seq, committed_seq); 7316 7317 return (0); 7318 } 7319 7320 static void 7321 ztest_dataset_close(int d) 7322 { 7323 ztest_ds_t *zd = &ztest_ds[d]; 7324 7325 zil_close(zd->zd_zilog); 7326 dmu_objset_disown(zd->zd_os, B_TRUE, zd); 7327 7328 ztest_zd_fini(zd); 7329 } 7330 7331 static int 7332 ztest_replay_zil_cb(const char *name, void *arg) 7333 { 7334 (void) arg; 7335 objset_t *os; 7336 ztest_ds_t *zdtmp; 7337 7338 VERIFY0(ztest_dmu_objset_own(name, DMU_OST_ANY, B_TRUE, 7339 B_TRUE, FTAG, &os)); 7340 7341 zdtmp = umem_alloc(sizeof (ztest_ds_t), UMEM_NOFAIL); 7342 7343 ztest_zd_init(zdtmp, NULL, os); 7344 zil_replay(os, zdtmp, ztest_replay_vector); 7345 ztest_zd_fini(zdtmp); 7346 7347 if (dmu_objset_zil(os)->zl_parse_lr_count != 0 && 7348 ztest_opts.zo_verbose >= 6) { 7349 zilog_t *zilog = dmu_objset_zil(os); 7350 7351 (void) printf("%s replay %"PRIu64" blocks, " 7352 "%"PRIu64" records, seq %"PRIu64"\n", 7353 name, 7354 zilog->zl_parse_blk_count, 7355 zilog->zl_parse_lr_count, 7356 zilog->zl_replaying_seq); 7357 } 7358 7359 umem_free(zdtmp, sizeof (ztest_ds_t)); 7360 7361 dmu_objset_disown(os, B_TRUE, FTAG); 7362 return (0); 7363 } 7364 7365 static void 7366 ztest_freeze(void) 7367 { 7368 ztest_ds_t *zd = &ztest_ds[0]; 7369 spa_t *spa; 7370 int numloops = 0; 7371 7372 if (ztest_opts.zo_verbose >= 3) 7373 (void) printf("testing 
spa_freeze()...\n"); 7374 7375 kernel_init(SPA_MODE_READ | SPA_MODE_WRITE); 7376 VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG)); 7377 VERIFY0(ztest_dataset_open(0)); 7378 ztest_spa = spa; 7379 7380 /* 7381 * Force the first log block to be transactionally allocated. 7382 * We have to do this before we freeze the pool -- otherwise 7383 * the log chain won't be anchored. 7384 */ 7385 while (BP_IS_HOLE(&zd->zd_zilog->zl_header->zh_log)) { 7386 ztest_dmu_object_alloc_free(zd, 0); 7387 zil_commit(zd->zd_zilog, 0); 7388 } 7389 7390 txg_wait_synced(spa_get_dsl(spa), 0); 7391 7392 /* 7393 * Freeze the pool. This stops spa_sync() from doing anything, 7394 * so that the only way to record changes from now on is the ZIL. 7395 */ 7396 spa_freeze(spa); 7397 7398 /* 7399 * Because it is hard to predict how much space a write will actually 7400 * require beforehand, we leave ourselves some fudge space to write over 7401 * capacity. 7402 */ 7403 uint64_t capacity = metaslab_class_get_space(spa_normal_class(spa)) / 2; 7404 7405 /* 7406 * Run tests that generate log records but don't alter the pool config 7407 * or depend on DSL sync tasks (snapshots, objset create/destroy, etc). 7408 * We do a txg_wait_synced() after each iteration to force the txg 7409 * to increase well beyond the last synced value in the uberblock. 7410 * The ZIL should be OK with that. 7411 * 7412 * Run a random number of times less than zo_maxloops and ensure we do 7413 * not run out of space on the pool. 7414 */ 7415 while (ztest_random(10) != 0 && 7416 numloops++ < ztest_opts.zo_maxloops && 7417 metaslab_class_get_alloc(spa_normal_class(spa)) < capacity) { 7418 ztest_od_t od; 7419 ztest_od_init(&od, 0, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0, 0); 7420 VERIFY0(ztest_object_init(zd, &od, sizeof (od), B_FALSE)); 7421 ztest_io(zd, od.od_object, 7422 ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT); 7423 txg_wait_synced(spa_get_dsl(spa), 0); 7424 } 7425 7426 /* 7427 * Commit all of the changes we just generated. 7428 */ 7429 zil_commit(zd->zd_zilog, 0); 7430 txg_wait_synced(spa_get_dsl(spa), 0); 7431 7432 /* 7433 * Close our dataset and close the pool. 7434 */ 7435 ztest_dataset_close(0); 7436 spa_close(spa, FTAG); 7437 kernel_fini(); 7438 7439 /* 7440 * Open and close the pool and dataset to induce log replay. 7441 */ 7442 kernel_init(SPA_MODE_READ | SPA_MODE_WRITE); 7443 VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG)); 7444 ASSERT3U(spa_freeze_txg(spa), ==, UINT64_MAX); 7445 VERIFY0(ztest_dataset_open(0)); 7446 ztest_spa = spa; 7447 txg_wait_synced(spa_get_dsl(spa), 0); 7448 ztest_dataset_close(0); 7449 ztest_reguid(NULL, 0); 7450 7451 spa_close(spa, FTAG); 7452 kernel_fini(); 7453 } 7454 7455 static void 7456 ztest_import_impl(void) 7457 { 7458 importargs_t args = { 0 }; 7459 nvlist_t *cfg = NULL; 7460 int nsearch = 1; 7461 char *searchdirs[nsearch]; 7462 int flags = ZFS_IMPORT_MISSING_LOG; 7463 7464 searchdirs[0] = ztest_opts.zo_dir; 7465 args.paths = nsearch; 7466 args.path = searchdirs; 7467 args.can_be_active = B_FALSE; 7468 7469 VERIFY0(zpool_find_config(NULL, ztest_opts.zo_pool, &cfg, &args, 7470 &libzpool_config_ops)); 7471 VERIFY0(spa_import(ztest_opts.zo_pool, cfg, NULL, flags)); 7472 fnvlist_free(cfg); 7473 } 7474 7475 /* 7476 * Import a storage pool with the given name. 
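 * The pool is located by searching the vdev file directory (zo_dir)
 * with zpool_find_config() -- see ztest_import_impl() above -- and the
 * import tolerates a missing log device (ZFS_IMPORT_MISSING_LOG).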
7477 */ 7478 static void 7479 ztest_import(ztest_shared_t *zs) 7480 { 7481 spa_t *spa; 7482 7483 mutex_init(&ztest_vdev_lock, NULL, MUTEX_DEFAULT, NULL); 7484 mutex_init(&ztest_checkpoint_lock, NULL, MUTEX_DEFAULT, NULL); 7485 VERIFY0(pthread_rwlock_init(&ztest_name_lock, NULL)); 7486 7487 kernel_init(SPA_MODE_READ | SPA_MODE_WRITE); 7488 7489 ztest_import_impl(); 7490 7491 VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG)); 7492 zs->zs_metaslab_sz = 7493 1ULL << spa->spa_root_vdev->vdev_child[0]->vdev_ms_shift; 7494 spa_close(spa, FTAG); 7495 7496 kernel_fini(); 7497 7498 if (!ztest_opts.zo_mmp_test) { 7499 ztest_run_zdb(ztest_opts.zo_pool); 7500 ztest_freeze(); 7501 ztest_run_zdb(ztest_opts.zo_pool); 7502 } 7503 7504 (void) pthread_rwlock_destroy(&ztest_name_lock); 7505 mutex_destroy(&ztest_vdev_lock); 7506 mutex_destroy(&ztest_checkpoint_lock); 7507 } 7508 7509 /* 7510 * Kick off threads to run tests on all datasets in parallel. 7511 */ 7512 static void 7513 ztest_run(ztest_shared_t *zs) 7514 { 7515 spa_t *spa; 7516 objset_t *os; 7517 kthread_t *resume_thread, *deadman_thread; 7518 kthread_t **run_threads; 7519 uint64_t object; 7520 int error; 7521 int t, d; 7522 7523 ztest_exiting = B_FALSE; 7524 7525 /* 7526 * Initialize parent/child shared state. 7527 */ 7528 mutex_init(&ztest_vdev_lock, NULL, MUTEX_DEFAULT, NULL); 7529 mutex_init(&ztest_checkpoint_lock, NULL, MUTEX_DEFAULT, NULL); 7530 VERIFY0(pthread_rwlock_init(&ztest_name_lock, NULL)); 7531 7532 zs->zs_thread_start = gethrtime(); 7533 zs->zs_thread_stop = 7534 zs->zs_thread_start + ztest_opts.zo_passtime * NANOSEC; 7535 zs->zs_thread_stop = MIN(zs->zs_thread_stop, zs->zs_proc_stop); 7536 zs->zs_thread_kill = zs->zs_thread_stop; 7537 if (ztest_random(100) < ztest_opts.zo_killrate) { 7538 zs->zs_thread_kill -= 7539 ztest_random(ztest_opts.zo_passtime * NANOSEC); 7540 } 7541 7542 mutex_init(&zcl.zcl_callbacks_lock, NULL, MUTEX_DEFAULT, NULL); 7543 7544 list_create(&zcl.zcl_callbacks, sizeof (ztest_cb_data_t), 7545 offsetof(ztest_cb_data_t, zcd_node)); 7546 7547 /* 7548 * Open our pool. It may need to be imported first depending on 7549 * what tests were running when the previous pass was terminated. 7550 */ 7551 kernel_init(SPA_MODE_READ | SPA_MODE_WRITE); 7552 error = spa_open(ztest_opts.zo_pool, &spa, FTAG); 7553 if (error) { 7554 VERIFY3S(error, ==, ENOENT); 7555 ztest_import_impl(); 7556 VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG)); 7557 zs->zs_metaslab_sz = 7558 1ULL << spa->spa_root_vdev->vdev_child[0]->vdev_ms_shift; 7559 } 7560 7561 metaslab_preload_limit = ztest_random(20) + 1; 7562 ztest_spa = spa; 7563 7564 VERIFY0(vdev_raidz_impl_set("cycle")); 7565 7566 dmu_objset_stats_t dds; 7567 VERIFY0(ztest_dmu_objset_own(ztest_opts.zo_pool, 7568 DMU_OST_ANY, B_TRUE, B_TRUE, FTAG, &os)); 7569 dsl_pool_config_enter(dmu_objset_pool(os), FTAG); 7570 dmu_objset_fast_stat(os, &dds); 7571 dsl_pool_config_exit(dmu_objset_pool(os), FTAG); 7572 zs->zs_guid = dds.dds_guid; 7573 dmu_objset_disown(os, B_TRUE, FTAG); 7574 7575 /* 7576 * Create a thread to periodically resume suspended I/O. 7577 */ 7578 resume_thread = thread_create(NULL, 0, ztest_resume_thread, 7579 spa, 0, NULL, TS_RUN | TS_JOINABLE, defclsyspri); 7580 7581 /* 7582 * Create a deadman thread and set to panic if we hang. 
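	 * spa_deadman_failmode is set to ZIO_FAILURE_MODE_PANIC just below,
	 * so a hung I/O triggers an immediate panic rather than a silent
	 * stall.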
7583 */ 7584 deadman_thread = thread_create(NULL, 0, ztest_deadman_thread, 7585 zs, 0, NULL, TS_RUN | TS_JOINABLE, defclsyspri); 7586 7587 spa->spa_deadman_failmode = ZIO_FAILURE_MODE_PANIC; 7588 7589 /* 7590 * Verify that we can safely inquire about any object, 7591 * whether it's allocated or not. To make it interesting, 7592 * we probe a 5-wide window around each power of two. 7593 * This hits all edge cases, including zero and the max. 7594 */ 7595 for (t = 0; t < 64; t++) { 7596 for (d = -5; d <= 5; d++) { 7597 error = dmu_object_info(spa->spa_meta_objset, 7598 (1ULL << t) + d, NULL); 7599 ASSERT(error == 0 || error == ENOENT || 7600 error == EINVAL); 7601 } 7602 } 7603 7604 /* 7605 * If we got any ENOSPC errors on the previous run, destroy something. 7606 */ 7607 if (zs->zs_enospc_count != 0) { 7608 int d = ztest_random(ztest_opts.zo_datasets); 7609 ztest_dataset_destroy(d); 7610 } 7611 zs->zs_enospc_count = 0; 7612 7613 /* 7614 * If we were in the middle of ztest_device_removal() and were killed 7615 * we need to ensure the removal and scrub complete before running 7616 * any tests that check ztest_device_removal_active. The removal will 7617 * be restarted automatically when the spa is opened, but we need to 7618 * initiate the scrub manually if it is not already in progress. Note 7619 * that we always run the scrub whenever an indirect vdev exists 7620 * because we have no way of knowing for sure if ztest_device_removal() 7621 * fully completed its scrub before the pool was reimported. 7622 */ 7623 if (spa->spa_removing_phys.sr_state == DSS_SCANNING || 7624 spa->spa_removing_phys.sr_prev_indirect_vdev != -1) { 7625 while (spa->spa_removing_phys.sr_state == DSS_SCANNING) 7626 txg_wait_synced(spa_get_dsl(spa), 0); 7627 7628 error = ztest_scrub_impl(spa); 7629 if (error == EBUSY) 7630 error = 0; 7631 ASSERT0(error); 7632 } 7633 7634 run_threads = umem_zalloc(ztest_opts.zo_threads * sizeof (kthread_t *), 7635 UMEM_NOFAIL); 7636 7637 if (ztest_opts.zo_verbose >= 4) 7638 (void) printf("starting main threads...\n"); 7639 7640 /* 7641 * Replay all logs of all datasets in the pool. This is primarily for 7642 * temporary datasets which wouldn't otherwise get replayed, which 7643 * can trigger failures when attempting to offline a SLOG in 7644 * ztest_fault_inject(). 7645 */ 7646 (void) dmu_objset_find(ztest_opts.zo_pool, ztest_replay_zil_cb, 7647 NULL, DS_FIND_CHILDREN); 7648 7649 /* 7650 * Kick off all the tests that run in parallel. 7651 */ 7652 for (t = 0; t < ztest_opts.zo_threads; t++) { 7653 if (t < ztest_opts.zo_datasets && ztest_dataset_open(t) != 0) { 7654 umem_free(run_threads, ztest_opts.zo_threads * 7655 sizeof (kthread_t *)); 7656 return; 7657 } 7658 7659 run_threads[t] = thread_create(NULL, 0, ztest_thread, 7660 (void *)(uintptr_t)t, 0, NULL, TS_RUN | TS_JOINABLE, 7661 defclsyspri); 7662 } 7663 7664 /* 7665 * Wait for all of the tests to complete. 7666 */ 7667 for (t = 0; t < ztest_opts.zo_threads; t++) 7668 VERIFY0(thread_join(run_threads[t])); 7669 7670 /* 7671 * Close all datasets. This must be done after all the threads 7672 * are joined so we can be sure none of the datasets are in-use 7673 * by any of the threads. 
7674 */ 7675 for (t = 0; t < ztest_opts.zo_threads; t++) { 7676 if (t < ztest_opts.zo_datasets) 7677 ztest_dataset_close(t); 7678 } 7679 7680 txg_wait_synced(spa_get_dsl(spa), 0); 7681 7682 zs->zs_alloc = metaslab_class_get_alloc(spa_normal_class(spa)); 7683 zs->zs_space = metaslab_class_get_space(spa_normal_class(spa)); 7684 7685 umem_free(run_threads, ztest_opts.zo_threads * sizeof (kthread_t *)); 7686 7687 /* Kill the resume and deadman threads */ 7688 ztest_exiting = B_TRUE; 7689 VERIFY0(thread_join(resume_thread)); 7690 VERIFY0(thread_join(deadman_thread)); 7691 ztest_resume(spa); 7692 7693 /* 7694 * Right before closing the pool, kick off a bunch of async I/O; 7695 * spa_close() should wait for it to complete. 7696 */ 7697 for (object = 1; object < 50; object++) { 7698 dmu_prefetch(spa->spa_meta_objset, object, 0, 0, 1ULL << 20, 7699 ZIO_PRIORITY_SYNC_READ); 7700 } 7701 7702 /* Verify that at least one commit cb was called in a timely fashion */ 7703 if (zc_cb_counter >= ZTEST_COMMIT_CB_MIN_REG) 7704 VERIFY0(zc_min_txg_delay); 7705 7706 spa_close(spa, FTAG); 7707 7708 /* 7709 * Verify that we can loop over all pools. 7710 */ 7711 mutex_enter(&spa_namespace_lock); 7712 for (spa = spa_next(NULL); spa != NULL; spa = spa_next(spa)) 7713 if (ztest_opts.zo_verbose > 3) 7714 (void) printf("spa_next: found %s\n", spa_name(spa)); 7715 mutex_exit(&spa_namespace_lock); 7716 7717 /* 7718 * Verify that we can export the pool and reimport it under a 7719 * different name. 7720 */ 7721 if ((ztest_random(2) == 0) && !ztest_opts.zo_mmp_test) { 7722 char name[ZFS_MAX_DATASET_NAME_LEN]; 7723 (void) snprintf(name, sizeof (name), "%s_import", 7724 ztest_opts.zo_pool); 7725 ztest_spa_import_export(ztest_opts.zo_pool, name); 7726 ztest_spa_import_export(name, ztest_opts.zo_pool); 7727 } 7728 7729 kernel_fini(); 7730 7731 list_destroy(&zcl.zcl_callbacks); 7732 mutex_destroy(&zcl.zcl_callbacks_lock); 7733 (void) pthread_rwlock_destroy(&ztest_name_lock); 7734 mutex_destroy(&ztest_vdev_lock); 7735 mutex_destroy(&ztest_checkpoint_lock); 7736 } 7737 7738 static void 7739 print_time(hrtime_t t, char *timebuf) 7740 { 7741 hrtime_t s = t / NANOSEC; 7742 hrtime_t m = s / 60; 7743 hrtime_t h = m / 60; 7744 hrtime_t d = h / 24; 7745 7746 s -= m * 60; 7747 m -= h * 60; 7748 h -= d * 24; 7749 7750 timebuf[0] = '\0'; 7751 7752 if (d) 7753 (void) sprintf(timebuf, 7754 "%llud%02lluh%02llum%02llus", d, h, m, s); 7755 else if (h) 7756 (void) sprintf(timebuf, "%lluh%02llum%02llus", h, m, s); 7757 else if (m) 7758 (void) sprintf(timebuf, "%llum%02llus", m, s); 7759 else 7760 (void) sprintf(timebuf, "%llus", s); 7761 } 7762 7763 static nvlist_t * 7764 make_random_props(void) 7765 { 7766 nvlist_t *props; 7767 7768 props = fnvlist_alloc(); 7769 7770 if (ztest_random(2) == 0) 7771 return (props); 7772 7773 fnvlist_add_uint64(props, 7774 zpool_prop_to_name(ZPOOL_PROP_AUTOREPLACE), 1); 7775 7776 return (props); 7777 } 7778 7779 /* 7780 * Create a storage pool with the given name and initial vdev size. 7781 * Then test spa_freeze() functionality. 7782 */ 7783 static void 7784 ztest_init(ztest_shared_t *zs) 7785 { 7786 spa_t *spa; 7787 nvlist_t *nvroot, *props; 7788 int i; 7789 7790 mutex_init(&ztest_vdev_lock, NULL, MUTEX_DEFAULT, NULL); 7791 mutex_init(&ztest_checkpoint_lock, NULL, MUTEX_DEFAULT, NULL); 7792 VERIFY0(pthread_rwlock_init(&ztest_name_lock, NULL)); 7793 7794 kernel_init(SPA_MODE_READ | SPA_MODE_WRITE); 7795 7796 /* 7797 * Create the storage pool. 
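	 * Any existing pool with this name is destroyed first; the new root
	 * vdev tree is laid out by make_vdev_root() from zo_vdev_size,
	 * zo_raid_children, and zs_mirrors, and make_random_props() chooses
	 * the initial pool properties (plus the failmode forced below).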
7798 */ 7799 (void) spa_destroy(ztest_opts.zo_pool); 7800 ztest_shared->zs_vdev_next_leaf = 0; 7801 zs->zs_splits = 0; 7802 zs->zs_mirrors = ztest_opts.zo_mirrors; 7803 nvroot = make_vdev_root(NULL, NULL, NULL, ztest_opts.zo_vdev_size, 0, 7804 NULL, ztest_opts.zo_raid_children, zs->zs_mirrors, 1); 7805 props = make_random_props(); 7806 7807 /* 7808 * We don't expect the pool to suspend unless maxfaults == 0, 7809 * in which case ztest_fault_inject() temporarily takes away 7810 * the only valid replica. 7811 */ 7812 fnvlist_add_uint64(props, 7813 zpool_prop_to_name(ZPOOL_PROP_FAILUREMODE), 7814 MAXFAULTS(zs) ? ZIO_FAILURE_MODE_PANIC : ZIO_FAILURE_MODE_WAIT); 7815 7816 for (i = 0; i < SPA_FEATURES; i++) { 7817 char *buf; 7818 7819 if (!spa_feature_table[i].fi_zfs_mod_supported) 7820 continue; 7821 7822 /* 7823 * 75% chance of using the log space map feature. We want ztest 7824 * to exercise both the code paths that use the log space map 7825 * feature and the ones that don't. 7826 */ 7827 if (i == SPA_FEATURE_LOG_SPACEMAP && ztest_random(4) == 0) 7828 continue; 7829 7830 VERIFY3S(-1, !=, asprintf(&buf, "feature@%s", 7831 spa_feature_table[i].fi_uname)); 7832 fnvlist_add_uint64(props, buf, 0); 7833 free(buf); 7834 } 7835 7836 VERIFY0(spa_create(ztest_opts.zo_pool, nvroot, props, NULL, NULL)); 7837 fnvlist_free(nvroot); 7838 fnvlist_free(props); 7839 7840 VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG)); 7841 zs->zs_metaslab_sz = 7842 1ULL << spa->spa_root_vdev->vdev_child[0]->vdev_ms_shift; 7843 spa_close(spa, FTAG); 7844 7845 kernel_fini(); 7846 7847 if (!ztest_opts.zo_mmp_test) { 7848 ztest_run_zdb(ztest_opts.zo_pool); 7849 ztest_freeze(); 7850 ztest_run_zdb(ztest_opts.zo_pool); 7851 } 7852 7853 (void) pthread_rwlock_destroy(&ztest_name_lock); 7854 mutex_destroy(&ztest_vdev_lock); 7855 mutex_destroy(&ztest_checkpoint_lock); 7856 } 7857 7858 static void 7859 setup_data_fd(void) 7860 { 7861 static char ztest_name_data[] = "/tmp/ztest.data.XXXXXX"; 7862 7863 ztest_fd_data = mkstemp(ztest_name_data); 7864 ASSERT3S(ztest_fd_data, >=, 0); 7865 (void) unlink(ztest_name_data); 7866 } 7867 7868 static int 7869 shared_data_size(ztest_shared_hdr_t *hdr) 7870 { 7871 int size; 7872 7873 size = hdr->zh_hdr_size; 7874 size += hdr->zh_opts_size; 7875 size += hdr->zh_size; 7876 size += hdr->zh_stats_size * hdr->zh_stats_count; 7877 size += hdr->zh_ds_size * hdr->zh_ds_count; 7878 7879 return (size); 7880 } 7881 7882 static void 7883 setup_hdr(void) 7884 { 7885 int size; 7886 ztest_shared_hdr_t *hdr; 7887 7888 hdr = (void *)mmap(0, P2ROUNDUP(sizeof (*hdr), getpagesize()), 7889 PROT_READ | PROT_WRITE, MAP_SHARED, ztest_fd_data, 0); 7890 ASSERT3P(hdr, !=, MAP_FAILED); 7891 7892 VERIFY0(ftruncate(ztest_fd_data, sizeof (ztest_shared_hdr_t))); 7893 7894 hdr->zh_hdr_size = sizeof (ztest_shared_hdr_t); 7895 hdr->zh_opts_size = sizeof (ztest_shared_opts_t); 7896 hdr->zh_size = sizeof (ztest_shared_t); 7897 hdr->zh_stats_size = sizeof (ztest_shared_callstate_t); 7898 hdr->zh_stats_count = ZTEST_FUNCS; 7899 hdr->zh_ds_size = sizeof (ztest_shared_ds_t); 7900 hdr->zh_ds_count = ztest_opts.zo_datasets; 7901 7902 size = shared_data_size(hdr); 7903 VERIFY0(ftruncate(ztest_fd_data, size)); 7904 7905 (void) munmap((caddr_t)hdr, P2ROUNDUP(sizeof (*hdr), getpagesize())); 7906 } 7907 7908 static void 7909 setup_data(void) 7910 { 7911 int size, offset; 7912 ztest_shared_hdr_t *hdr; 7913 uint8_t *buf; 7914 7915 hdr = (void *)mmap(0, P2ROUNDUP(sizeof (*hdr), getpagesize()), 7916 PROT_READ, MAP_SHARED, ztest_fd_data, 0); 7917 
ASSERT3P(hdr, !=, MAP_FAILED); 7918 7919 size = shared_data_size(hdr); 7920 7921 (void) munmap((caddr_t)hdr, P2ROUNDUP(sizeof (*hdr), getpagesize())); 7922 hdr = ztest_shared_hdr = (void *)mmap(0, P2ROUNDUP(size, getpagesize()), 7923 PROT_READ | PROT_WRITE, MAP_SHARED, ztest_fd_data, 0); 7924 ASSERT3P(hdr, !=, MAP_FAILED); 7925 buf = (uint8_t *)hdr; 7926 7927 offset = hdr->zh_hdr_size; 7928 ztest_shared_opts = (void *)&buf[offset]; 7929 offset += hdr->zh_opts_size; 7930 ztest_shared = (void *)&buf[offset]; 7931 offset += hdr->zh_size; 7932 ztest_shared_callstate = (void *)&buf[offset]; 7933 offset += hdr->zh_stats_size * hdr->zh_stats_count; 7934 ztest_shared_ds = (void *)&buf[offset]; 7935 } 7936 7937 static boolean_t 7938 exec_child(char *cmd, char *libpath, boolean_t ignorekill, int *statusp) 7939 { 7940 pid_t pid; 7941 int status; 7942 char *cmdbuf = NULL; 7943 7944 pid = fork(); 7945 7946 if (cmd == NULL) { 7947 cmdbuf = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); 7948 (void) strlcpy(cmdbuf, getexecname(), MAXPATHLEN); 7949 cmd = cmdbuf; 7950 } 7951 7952 if (pid == -1) 7953 fatal(B_TRUE, "fork failed"); 7954 7955 if (pid == 0) { /* child */ 7956 char fd_data_str[12]; 7957 7958 VERIFY3S(11, >=, 7959 snprintf(fd_data_str, 12, "%d", ztest_fd_data)); 7960 VERIFY0(setenv("ZTEST_FD_DATA", fd_data_str, 1)); 7961 7962 if (libpath != NULL) { 7963 const char *curlp = getenv("LD_LIBRARY_PATH"); 7964 if (curlp == NULL) 7965 VERIFY0(setenv("LD_LIBRARY_PATH", libpath, 1)); 7966 else { 7967 char *newlp = NULL; 7968 VERIFY3S(-1, !=, 7969 asprintf(&newlp, "%s:%s", libpath, curlp)); 7970 VERIFY0(setenv("LD_LIBRARY_PATH", newlp, 1)); 7971 } 7972 } 7973 (void) execl(cmd, cmd, (char *)NULL); 7974 ztest_dump_core = B_FALSE; 7975 fatal(B_TRUE, "exec failed: %s", cmd); 7976 } 7977 7978 if (cmdbuf != NULL) { 7979 umem_free(cmdbuf, MAXPATHLEN); 7980 cmd = NULL; 7981 } 7982 7983 while (waitpid(pid, &status, 0) != pid) 7984 continue; 7985 if (statusp != NULL) 7986 *statusp = status; 7987 7988 if (WIFEXITED(status)) { 7989 if (WEXITSTATUS(status) != 0) { 7990 (void) fprintf(stderr, "child exited with code %d\n", 7991 WEXITSTATUS(status)); 7992 exit(2); 7993 } 7994 return (B_FALSE); 7995 } else if (WIFSIGNALED(status)) { 7996 if (!ignorekill || WTERMSIG(status) != SIGKILL) { 7997 (void) fprintf(stderr, "child died with signal %d\n", 7998 WTERMSIG(status)); 7999 exit(3); 8000 } 8001 return (B_TRUE); 8002 } else { 8003 (void) fprintf(stderr, "something strange happened to child\n"); 8004 exit(4); 8005 } 8006 } 8007 8008 static void 8009 ztest_run_init(void) 8010 { 8011 int i; 8012 8013 ztest_shared_t *zs = ztest_shared; 8014 8015 /* 8016 * Blow away any existing copy of zpool.cache 8017 */ 8018 (void) remove(spa_config_path); 8019 8020 if (ztest_opts.zo_init == 0) { 8021 if (ztest_opts.zo_verbose >= 1) 8022 (void) printf("Importing pool %s\n", 8023 ztest_opts.zo_pool); 8024 ztest_import(zs); 8025 return; 8026 } 8027 8028 /* 8029 * Create and initialize our storage pool. 
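	 * ztest_init() is run zo_init times; each pass destroys whatever
	 * pool the previous pass created and builds a new one, so only the
	 * pool from the final pass survives for the main run to use.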
	 */
	for (i = 1; i <= ztest_opts.zo_init; i++) {
		memset(zs, 0, sizeof (*zs));
		if (ztest_opts.zo_verbose >= 3 &&
		    ztest_opts.zo_init != 1) {
			(void) printf("ztest_init(), pass %d\n", i);
		}
		ztest_init(zs);
	}
}

int
main(int argc, char **argv)
{
	int kills = 0;
	int iters = 0;
	int older = 0;
	int newer = 0;
	ztest_shared_t *zs;
	ztest_info_t *zi;
	ztest_shared_callstate_t *zc;
	char timebuf[100];
	char numbuf[NN_NUMBUF_SZ];
	char *cmd;
	boolean_t hasalt;
	int f, err;
	char *fd_data_str = getenv("ZTEST_FD_DATA");
	struct sigaction action;

	(void) setvbuf(stdout, NULL, _IOLBF, 0);

	dprintf_setup(&argc, argv);
	zfs_deadman_synctime_ms = 300000;
	zfs_deadman_checktime_ms = 30000;
	/*
	 * As two-word space map entries may not come up often (especially
	 * if pool and vdev sizes are small), we want to force at least some
	 * of them so the feature gets tested.
	 */
	zfs_force_some_double_word_sm_entries = B_TRUE;

	/*
	 * Verify that even extensively damaged split blocks with many
	 * segments can be reconstructed in a reasonable amount of time
	 * when reconstruction is known to be possible.
	 *
	 * Note: the lower this value is, the more damage we inflict, and
	 * the more time ztest spends in recovering that damage. We chose
	 * to induce damage 1/100th of the time so recovery is tested but
	 * not so frequently that ztest doesn't get to test other code paths.
	 */
	zfs_reconstruct_indirect_damage_fraction = 100;

	action.sa_handler = sig_handler;
	sigemptyset(&action.sa_mask);
	action.sa_flags = 0;

	if (sigaction(SIGSEGV, &action, NULL) < 0) {
		(void) fprintf(stderr, "ztest: cannot catch SIGSEGV: %s.\n",
		    strerror(errno));
		exit(EXIT_FAILURE);
	}

	if (sigaction(SIGABRT, &action, NULL) < 0) {
		(void) fprintf(stderr, "ztest: cannot catch SIGABRT: %s.\n",
		    strerror(errno));
		exit(EXIT_FAILURE);
	}

	/*
	 * Force random_get_bytes() to use /dev/urandom in order to prevent
	 * ztest from needlessly depleting the system entropy pool.
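	 * Unlike /dev/random, /dev/urandom never blocks waiting for entropy,
	 * so reads here cannot stall the test.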
	 */
	random_path = "/dev/urandom";
	ztest_fd_rand = open(random_path, O_RDONLY | O_CLOEXEC);
	ASSERT3S(ztest_fd_rand, >=, 0);

	if (!fd_data_str) {
		process_options(argc, argv);

		setup_data_fd();
		setup_hdr();
		setup_data();
		memcpy(ztest_shared_opts, &ztest_opts,
		    sizeof (*ztest_shared_opts));
	} else {
		ztest_fd_data = atoi(fd_data_str);
		setup_data();
		memcpy(&ztest_opts, ztest_shared_opts, sizeof (ztest_opts));
	}
	ASSERT3U(ztest_opts.zo_datasets, ==, ztest_shared_hdr->zh_ds_count);

	err = ztest_set_global_vars();
	if (err != 0 && !fd_data_str) {
		/* error message done by ztest_set_global_vars */
		exit(EXIT_FAILURE);
	} else {
		/* children should not be spawned if setting gvars fails */
		VERIFY3S(err, ==, 0);
	}

	/* Override location of zpool.cache */
	VERIFY3S(asprintf((char **)&spa_config_path, "%s/zpool.cache",
	    ztest_opts.zo_dir), !=, -1);

	ztest_ds = umem_alloc(ztest_opts.zo_datasets * sizeof (ztest_ds_t),
	    UMEM_NOFAIL);
	zs = ztest_shared;

	if (fd_data_str) {
		metaslab_force_ganging = ztest_opts.zo_metaslab_force_ganging;
		metaslab_df_alloc_threshold =
		    zs->zs_metaslab_df_alloc_threshold;

		if (zs->zs_do_init)
			ztest_run_init();
		else
			ztest_run(zs);
		exit(0);
	}

	hasalt = (strlen(ztest_opts.zo_alt_ztest) != 0);

	if (ztest_opts.zo_verbose >= 1) {
		(void) printf("%"PRIu64" vdevs, %d datasets, %d threads, "
		    "%d %s disks, %"PRIu64" seconds...\n\n",
		    ztest_opts.zo_vdevs,
		    ztest_opts.zo_datasets,
		    ztest_opts.zo_threads,
		    ztest_opts.zo_raid_children,
		    ztest_opts.zo_raid_type,
		    ztest_opts.zo_time);
	}

	cmd = umem_alloc(MAXNAMELEN, UMEM_NOFAIL);
	(void) strlcpy(cmd, getexecname(), MAXNAMELEN);

	zs->zs_do_init = B_TRUE;
	if (strlen(ztest_opts.zo_alt_ztest) != 0) {
		if (ztest_opts.zo_verbose >= 1) {
			(void) printf("Executing older ztest for "
			    "initialization: %s\n", ztest_opts.zo_alt_ztest);
		}
		VERIFY(!exec_child(ztest_opts.zo_alt_ztest,
		    ztest_opts.zo_alt_libpath, B_FALSE, NULL));
	} else {
		VERIFY(!exec_child(NULL, NULL, B_FALSE, NULL));
	}
	zs->zs_do_init = B_FALSE;

	zs->zs_proc_start = gethrtime();
	zs->zs_proc_stop = zs->zs_proc_start + ztest_opts.zo_time * NANOSEC;

	for (f = 0; f < ZTEST_FUNCS; f++) {
		zi = &ztest_info[f];
		zc = ZTEST_GET_SHARED_CALLSTATE(f);
		if (zs->zs_proc_start + zi->zi_interval[0] > zs->zs_proc_stop)
			zc->zc_next = UINT64_MAX;
		else
			zc->zc_next = zs->zs_proc_start +
			    ztest_random(2 * zi->zi_interval[0] + 1);
	}

	/*
	 * Run the tests in a loop. These tests include fault injection
	 * to verify that self-healing data works, and forced crashes
	 * to verify that we never lose on-disk consistency.
	 */
	while (gethrtime() < zs->zs_proc_stop) {
		int status;
		boolean_t killed;

		/*
		 * Initialize the workload counters for each function.
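		 * The child process accumulates per-function call counts and
		 * elapsed times in the shared call-state area; they are
		 * reported in the workload summary below once the child exits.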
		 */
		for (f = 0; f < ZTEST_FUNCS; f++) {
			zc = ZTEST_GET_SHARED_CALLSTATE(f);
			zc->zc_count = 0;
			zc->zc_time = 0;
		}

		/* Set the allocation switch size */
		zs->zs_metaslab_df_alloc_threshold =
		    ztest_random(zs->zs_metaslab_sz / 4) + 1;

		if (!hasalt || ztest_random(2) == 0) {
			if (hasalt && ztest_opts.zo_verbose >= 1) {
				(void) printf("Executing newer ztest: %s\n",
				    cmd);
			}
			newer++;
			killed = exec_child(cmd, NULL, B_TRUE, &status);
		} else {
			if (hasalt && ztest_opts.zo_verbose >= 1) {
				(void) printf("Executing older ztest: %s\n",
				    ztest_opts.zo_alt_ztest);
			}
			older++;
			killed = exec_child(ztest_opts.zo_alt_ztest,
			    ztest_opts.zo_alt_libpath, B_TRUE, &status);
		}

		if (killed)
			kills++;
		iters++;

		if (ztest_opts.zo_verbose >= 1) {
			hrtime_t now = gethrtime();

			now = MIN(now, zs->zs_proc_stop);
			print_time(zs->zs_proc_stop - now, timebuf);
			nicenum(zs->zs_space, numbuf, sizeof (numbuf));

			(void) printf("Pass %3d, %8s, %3"PRIu64" ENOSPC, "
			    "%4.1f%% of %5s used, %3.0f%% done, %8s to go\n",
			    iters,
			    WIFEXITED(status) ? "Complete" : "SIGKILL",
			    zs->zs_enospc_count,
			    100.0 * zs->zs_alloc / zs->zs_space,
			    numbuf,
			    100.0 * (now - zs->zs_proc_start) /
			    (ztest_opts.zo_time * NANOSEC), timebuf);
		}

		if (ztest_opts.zo_verbose >= 2) {
			(void) printf("\nWorkload summary:\n\n");
			(void) printf("%7s %9s %s\n",
			    "Calls", "Time", "Function");
			(void) printf("%7s %9s %s\n",
			    "-----", "----", "--------");
			for (f = 0; f < ZTEST_FUNCS; f++) {
				zi = &ztest_info[f];
				zc = ZTEST_GET_SHARED_CALLSTATE(f);
				print_time(zc->zc_time, timebuf);
				(void) printf("%7"PRIu64" %9s %s\n",
				    zc->zc_count, timebuf,
				    zi->zi_funcname);
			}
			(void) printf("\n");
		}

		if (!ztest_opts.zo_mmp_test)
			ztest_run_zdb(ztest_opts.zo_pool);
	}

	if (ztest_opts.zo_verbose >= 1) {
		if (hasalt) {
			(void) printf("%d runs of older ztest: %s\n", older,
			    ztest_opts.zo_alt_ztest);
			(void) printf("%d runs of newer ztest: %s\n", newer,
			    cmd);
		}
		(void) printf("%d killed, %d completed, %.0f%% kill rate\n",
		    kills, iters - kills, (100.0 * kills) / MAX(1, iters));
	}

	umem_free(cmd, MAXNAMELEN);

	return (0);
}