1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or https://opensource.org/licenses/CDDL-1.0. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright (c) 2011, 2018 by Delphix. All rights reserved. 24 * Copyright 2011 Nexenta Systems, Inc. All rights reserved. 25 * Copyright (c) 2013 Steven Hartland. All rights reserved. 26 * Copyright (c) 2014 Integros [integros.com] 27 * Copyright 2017 Joyent, Inc. 28 * Copyright (c) 2017, Intel Corporation. 29 */ 30 31 /* 32 * The objective of this program is to provide a DMU/ZAP/SPA stress test 33 * that runs entirely in userland, is easy to use, and easy to extend. 34 * 35 * The overall design of the ztest program is as follows: 36 * 37 * (1) For each major functional area (e.g. adding vdevs to a pool, 38 * creating and destroying datasets, reading and writing objects, etc) 39 * we have a simple routine to test that functionality. These 40 * individual routines do not have to do anything "stressful". 41 * 42 * (2) We turn these simple functionality tests into a stress test by 43 * running them all in parallel, with as many threads as desired, 44 * and spread across as many datasets, objects, and vdevs as desired. 
45 * 46 * (3) While all this is happening, we inject faults into the pool to 47 * verify that self-healing data really works. 48 * 49 * (4) Every time we open a dataset, we change its checksum and compression 50 * functions. Thus even individual objects vary from block to block 51 * in which checksum they use and whether they're compressed. 52 * 53 * (5) To verify that we never lose on-disk consistency after a crash, 54 * we run the entire test in a child of the main process. 55 * At random times, the child self-immolates with a SIGKILL. 56 * This is the software equivalent of pulling the power cord. 57 * The parent then runs the test again, using the existing 58 * storage pool, as many times as desired. If backwards compatibility 59 * testing is enabled ztest will sometimes run the "older" version 60 * of ztest after a SIGKILL. 61 * 62 * (6) To verify that we don't have future leaks or temporal incursions, 63 * many of the functional tests record the transaction group number 64 * as part of their data. When reading old data, they verify that 65 * the transaction group number is less than the current, open txg. 66 * If you add a new test, please do this if applicable. 67 * 68 * (7) Threads are created with a reduced stack size, for sanity checking. 69 * Therefore, it's important not to allocate huge buffers on the stack. 70 * 71 * When run with no arguments, ztest runs for about five minutes and 72 * produces no output if successful. To get a little bit of information, 73 * specify -V. To get more information, specify -VV, and so on. 74 * 75 * To turn this into an overnight stress test, use -T to specify run time. 76 * 77 * You can ask more vdevs [-v], datasets [-d], or threads [-t] 78 * to increase the pool capacity, fanout, and overall stress level. 79 * 80 * Use the -k option to set the desired frequency of kills. 81 * 82 * When ztest invokes itself it passes all relevant information through a 83 * temporary file which is mmap-ed in the child process. 
This allows shared
 * memory to survive the exec syscall. The ztest_shared_hdr_t struct is always
 * stored at offset 0 of this file and contains information on the size and
 * number of shared structures in the file. The information stored in this file
 * must remain backwards compatible with older versions of ztest so that
 * ztest can invoke them during backwards compatibility testing (-B).
 */

#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/dmu.h>
#include <sys/txg.h>
#include <sys/dbuf.h>
#include <sys/zap.h>
#include <sys/dmu_objset.h>
#include <sys/poll.h>
#include <sys/stat.h>
#include <sys/time.h>
#include <sys/wait.h>
#include <sys/mman.h>
#include <sys/resource.h>
#include <sys/zio.h>
#include <sys/zil.h>
#include <sys/zil_impl.h>
#include <sys/vdev_draid.h>
#include <sys/vdev_impl.h>
#include <sys/vdev_file.h>
#include <sys/vdev_initialize.h>
#include <sys/vdev_raidz.h>
#include <sys/vdev_trim.h>
#include <sys/spa_impl.h>
#include <sys/metaslab_impl.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_destroy.h>
#include <sys/dsl_scan.h>
#include <sys/zio_checksum.h>
#include <sys/zfs_refcount.h>
#include <sys/zfeature.h>
#include <sys/dsl_userhold.h>
#include <sys/abd.h>
#include <sys/blake3.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <getopt.h>
#include <signal.h>
#include <umem.h>
#include <ctype.h>
#include <math.h>
#include <sys/fs/zfs.h>
#include <zfs_fletcher.h>
#include <libnvpair.h>
#include <libzutil.h>
#include <sys/crypto/icp.h>
#include <sys/zfs_impl.h>
#if (__GLIBC__ && !__UCLIBC__)
#include <execinfo.h> /* for backtrace() */
#endif

/*
 * Process-wide file descriptors, initialised to -1.
 * ztest_random() asserts that ztest_fd_rand is >= 0 and reads its random
 * bytes from it (its failure message refers to /dev/urandom).
 * NOTE(review): ztest_fd_data is presumably the descriptor of the shared
 * temporary file described above -- confirm against its users.
 */
static int ztest_fd_data = -1;
static int ztest_fd_rand = -1;

/*
 * Header of the shared file described above.  It is stored at offset 0 and
 * records the size and number of the shared structures in the file.  The
 * layout must stay backwards compatible with older ztest binaries (-B).
 */
typedef struct ztest_shared_hdr {
	uint64_t	zh_hdr_size;
	uint64_t	zh_opts_size;
	uint64_t	zh_size;
	uint64_t	zh_stats_size;
	uint64_t	zh_stats_count;
	uint64_t	zh_ds_size;
	uint64_t	zh_ds_count;
} ztest_shared_hdr_t;

static ztest_shared_hdr_t *ztest_shared_hdr;

/*
 * Values stored in zo_special_vdevs; selected with the -C option
 * ("special=on", "special=off" or "special=random").
 */
enum ztest_class_state {
	ZTEST_VDEV_CLASS_OFF,
	ZTEST_VDEV_CLASS_ON,
	ZTEST_VDEV_CLASS_RND
};

/* Limits for -o arguments: longest accepted string and how many are kept. */
#define	ZO_GVARS_MAX_ARGLEN ((size_t)64)
#define	ZO_GVARS_MAX_COUNT ((size_t)10)

/*
 * Command line options.  process_options() starts from ztest_opts_defaults
 * and overrides individual fields; the option letter that sets each field
 * is noted where process_options() shows one.
 */
typedef struct ztest_shared_opts {
	char zo_pool[ZFS_MAX_DATASET_NAME_LEN];		/* -p */
	char zo_dir[ZFS_MAX_DATASET_NAME_LEN];		/* -f (realpath) */
	char zo_alt_ztest[MAXNAMELEN];			/* -B */
	char zo_alt_libpath[MAXNAMELEN];
	uint64_t zo_vdevs;				/* -v */
	uint64_t zo_vdevtime;
	size_t zo_vdev_size;				/* -s */
	int zo_ashift;					/* -a */
	int zo_mirrors;					/* -m */
	int zo_raid_children;				/* -r */
	int zo_raid_parity;				/* -R, clamped to 1..3 */
	char zo_raid_type[8];				/* derived from -K */
	int zo_draid_data;				/* -D */
	int zo_draid_spares;				/* -S */
	int zo_datasets;				/* -d */
	int zo_threads;					/* -t */
	uint64_t zo_passtime;				/* -P */
	uint64_t zo_killrate;				/* -k */
	int zo_verbose;					/* -V, counted */
	int zo_init;					/* -i; -E sets 0 */
	uint64_t zo_time;				/* -T */
	uint64_t zo_maxloops;				/* -F */
	uint64_t zo_metaslab_force_ganging;		/* -g */
	int zo_mmp_test;				/* -M */
	int zo_special_vdevs;				/* -C special=... */
	int zo_dump_dbgmsg;				/* -G */
	int zo_gvars_count;				/* number of -o seen */
	char zo_gvars[ZO_GVARS_MAX_COUNT][ZO_GVARS_MAX_ARGLEN];	/* -o */
} ztest_shared_opts_t;

/* Default values for command line options.
*/
/*
 * The *_STR macros are the human-readable forms that option_table hands to
 * usage() for options whose numeric default is listed as NO_DEFAULT.
 */
#define	DEFAULT_POOL "ztest"
#define	DEFAULT_VDEV_DIR "/tmp"
#define	DEFAULT_VDEV_COUNT 5
#define	DEFAULT_VDEV_SIZE (SPA_MINDEVSIZE * 4)	/* 256m default size */
#define	DEFAULT_VDEV_SIZE_STR "256M"
#define	DEFAULT_ASHIFT SPA_MINBLOCKSHIFT
#define	DEFAULT_MIRRORS 2
#define	DEFAULT_RAID_CHILDREN 4
#define	DEFAULT_RAID_PARITY 1
#define	DEFAULT_DRAID_DATA 4
#define	DEFAULT_DRAID_SPARES 1
#define	DEFAULT_DATASETS_COUNT 7
#define	DEFAULT_THREADS 23
#define	DEFAULT_RUN_TIME 300		/* 300 seconds */
#define	DEFAULT_RUN_TIME_STR "300 sec"
#define	DEFAULT_PASS_TIME 60		/* 60 seconds */
#define	DEFAULT_PASS_TIME_STR "60 sec"
#define	DEFAULT_KILL_RATE 70		/* 70% kill rate */
#define	DEFAULT_KILLRATE_STR "70%"
#define	DEFAULT_INITS 1
#define	DEFAULT_MAX_LOOPS 50		/* max loops during spa_freeze() */
#define	DEFAULT_FORCE_GANGING (64 << 10)
#define	DEFAULT_FORCE_GANGING_STR "64K"

/* Simplifying assumption: -1 is not a valid default.
*/
#define	NO_DEFAULT -1

/*
 * Option values in effect before the command line is parsed;
 * process_options() copies this into ztest_opts.  Members without an
 * initializer here (e.g. zo_vdevtime, zo_dump_dbgmsg) start out as zero.
 */
static const ztest_shared_opts_t ztest_opts_defaults = {
	.zo_pool = DEFAULT_POOL,
	.zo_dir = DEFAULT_VDEV_DIR,
	.zo_alt_ztest = { '\0' },
	.zo_alt_libpath = { '\0' },
	.zo_vdevs = DEFAULT_VDEV_COUNT,
	.zo_ashift = DEFAULT_ASHIFT,
	.zo_mirrors = DEFAULT_MIRRORS,
	.zo_raid_children = DEFAULT_RAID_CHILDREN,
	.zo_raid_parity = DEFAULT_RAID_PARITY,
	.zo_raid_type = VDEV_TYPE_RAIDZ,
	.zo_vdev_size = DEFAULT_VDEV_SIZE,
	.zo_draid_data = DEFAULT_DRAID_DATA,		/* data drives */
	.zo_draid_spares = DEFAULT_DRAID_SPARES,	/* distributed spares */
	.zo_datasets = DEFAULT_DATASETS_COUNT,
	.zo_threads = DEFAULT_THREADS,
	.zo_passtime = DEFAULT_PASS_TIME,
	.zo_killrate = DEFAULT_KILL_RATE,
	.zo_verbose = 0,
	.zo_mmp_test = 0,
	.zo_init = DEFAULT_INITS,
	.zo_time = DEFAULT_RUN_TIME,
	.zo_maxloops = DEFAULT_MAX_LOOPS, /* max loops during spa_freeze() */
	.zo_metaslab_force_ganging = DEFAULT_FORCE_GANGING,
	.zo_special_vdevs = ZTEST_VDEV_CLASS_RND,
	.zo_gvars_count = 0,
};

/* Tunables defined elsewhere that this program reads or overrides. */
extern uint64_t metaslab_force_ganging;
extern uint64_t metaslab_df_alloc_threshold;
extern uint64_t zfs_deadman_synctime_ms;
extern uint_t metaslab_preload_limit;
extern int zfs_compressed_arc_enabled;
extern int zfs_abd_scatter_enabled;
extern uint_t dmu_object_alloc_chunk_shift;
extern boolean_t zfs_force_some_double_word_sm_entries;
extern unsigned long zio_decompress_fail_fraction;
extern unsigned long zfs_reconstruct_indirect_damage_fraction;


static ztest_shared_opts_t *ztest_shared_opts;
/* Working copy of the options; filled in by process_options(). */
static ztest_shared_opts_t ztest_opts;
/* Fixed 32-character key string. */
static const char *const ztest_wkeydata = "abcdefghijklmnopqrstuvwxyz012345";

typedef struct ztest_shared_ds {
	uint64_t	zd_seq;
} ztest_shared_ds_t;

static ztest_shared_ds_t *ztest_shared_ds;
/* Address of the d'th element of the ztest_shared_ds array. */
#define	ZTEST_GET_SHARED_DS(d) (&ztest_shared_ds[d])

#define	BT_MAGIC	0x123456789abcdefULL
/* max(zs_mirrors, 1) * (raid parity + 1) - 1, for a ztest_shared_t *zs. */
#define	MAXFAULTS(zs) \
	(MAX((zs)->zs_mirrors, 1) * (ztest_opts.zo_raid_parity + 1) - 1)

/* ZTEST_IO_TYPES is last, so it equals the number of I/O types. */
enum ztest_io_type {
	ZTEST_IO_WRITE_TAG,
	ZTEST_IO_WRITE_PATTERN,
	ZTEST_IO_WRITE_ZEROES,
	ZTEST_IO_TRUNCATE,
	ZTEST_IO_SETATTR,
	ZTEST_IO_REWRITE,
	ZTEST_IO_TYPES
};

typedef struct ztest_block_tag {
	uint64_t	bt_magic;
	uint64_t	bt_objset;
	uint64_t	bt_object;
	uint64_t	bt_dnodesize;
	uint64_t	bt_offset;
	uint64_t	bt_gen;
	uint64_t	bt_txg;
	uint64_t	bt_crtxg;
} ztest_block_tag_t;

typedef struct bufwad {
	uint64_t	bw_index;
	uint64_t	bw_txg;
	uint64_t	bw_data;
} bufwad_t;

/*
 * It would be better to use a rangelock_t per object.  Unfortunately
 * the rangelock_t is not a drop-in replacement for rl_t, because we
 * still need to map from object ID to rangelock_t.
 */
typedef enum {
	RL_READER,
	RL_WRITER,
	RL_APPEND
} rl_type_t;

typedef struct rll {
	void		*rll_writer;
	int		rll_readers;
	kmutex_t	rll_lock;
	kcondvar_t	rll_cv;
} rll_t;

typedef struct rl {
	uint64_t	rl_object;
	uint64_t	rl_offset;
	uint64_t	rl_size;
	rll_t		*rl_lock;
} rl_t;

/* Sizes of the per-dataset zd_range_lock[] and zd_object_lock[] arrays. */
#define	ZTEST_RANGE_LOCKS	64
#define	ZTEST_OBJECT_LOCKS	64

/*
 * Object descriptor.  Used as a template for object lookup/create/remove.
 */
typedef struct ztest_od {
	uint64_t	od_dir;
	uint64_t	od_object;
	dmu_object_type_t od_type;
	dmu_object_type_t od_crtype;
	uint64_t	od_blocksize;
	uint64_t	od_crblocksize;
	uint64_t	od_crdnodesize;
	uint64_t	od_gen;
	uint64_t	od_crgen;
	char		od_name[ZFS_MAX_DATASET_NAME_LEN];
} ztest_od_t;

/*
 * Per-dataset state.
*/
typedef struct ztest_ds {
	ztest_shared_ds_t *zd_shared;
	objset_t	*zd_os;
	pthread_rwlock_t zd_zilog_lock;
	zilog_t		*zd_zilog;
	ztest_od_t	*zd_od;		/* debugging aid */
	char		zd_name[ZFS_MAX_DATASET_NAME_LEN];
	kmutex_t	zd_dirobj_lock;
	rll_t		zd_object_lock[ZTEST_OBJECT_LOCKS];
	rll_t		zd_range_lock[ZTEST_RANGE_LOCKS];
} ztest_ds_t;

/*
 * Per-iteration state.
 */
/* Signature shared by every test routine listed in ztest_info[]. */
typedef void ztest_func_t(ztest_ds_t *zd, uint64_t id);

typedef struct ztest_info {
	ztest_func_t	*zi_func;	/* test function */
	uint64_t	zi_iters;	/* iterations per execution */
	uint64_t	*zi_interval;	/* execute every <interval> seconds */
	const char	*zi_funcname;	/* name of test function */
} ztest_info_t;

typedef struct ztest_shared_callstate {
	uint64_t	zc_count;	/* per-pass count */
	uint64_t	zc_time;	/* per-pass time */
	uint64_t	zc_next;	/* next time to call this function */
} ztest_shared_callstate_t;

static ztest_shared_callstate_t *ztest_shared_callstate;
/* Address of the c'th element of the ztest_shared_callstate array. */
#define	ZTEST_GET_SHARED_CALLSTATE(c) (&ztest_shared_callstate[c])

/* Test routines; each one is registered in ztest_info[] below. */
ztest_func_t ztest_dmu_read_write;
ztest_func_t ztest_dmu_write_parallel;
ztest_func_t ztest_dmu_object_alloc_free;
ztest_func_t ztest_dmu_object_next_chunk;
ztest_func_t ztest_dmu_commit_callbacks;
ztest_func_t ztest_zap;
ztest_func_t ztest_zap_parallel;
ztest_func_t ztest_zil_commit;
ztest_func_t ztest_zil_remount;
ztest_func_t ztest_dmu_read_write_zcopy;
ztest_func_t ztest_dmu_objset_create_destroy;
ztest_func_t ztest_dmu_prealloc;
ztest_func_t ztest_fzap;
ztest_func_t ztest_dmu_snapshot_create_destroy;
ztest_func_t ztest_dsl_prop_get_set;
ztest_func_t ztest_spa_prop_get_set;
ztest_func_t ztest_spa_create_destroy;
ztest_func_t ztest_fault_inject;
ztest_func_t ztest_dmu_snapshot_hold;
ztest_func_t ztest_mmp_enable_disable;
ztest_func_t ztest_scrub;
ztest_func_t ztest_dsl_dataset_promote_busy;
ztest_func_t ztest_vdev_attach_detach;
ztest_func_t ztest_vdev_LUN_growth;
ztest_func_t ztest_vdev_add_remove;
ztest_func_t ztest_vdev_class_add;
ztest_func_t ztest_vdev_aux_add_remove;
ztest_func_t ztest_split_pool;
ztest_func_t ztest_reguid;
ztest_func_t ztest_spa_upgrade;
ztest_func_t ztest_device_removal;
ztest_func_t ztest_spa_checkpoint_create_discard;
ztest_func_t ztest_initialize;
ztest_func_t ztest_trim;
ztest_func_t ztest_blake3;
ztest_func_t ztest_fletcher;
ztest_func_t ztest_fletcher_incr;
ztest_func_t ztest_verify_dnode_bt;

/*
 * Call intervals that ztest_info[] entries point at.  The values are
 * multiples of NANOSEC, i.e. they are stored in nanoseconds.
 */
static uint64_t zopt_always = 0ULL * NANOSEC;		/* all the time */
static uint64_t zopt_incessant = 1ULL * NANOSEC / 10;	/* every 1/10 second */
static uint64_t zopt_often = 1ULL * NANOSEC;		/* every second */
static uint64_t zopt_sometimes = 10ULL * NANOSEC;	/* every 10 seconds */
static uint64_t zopt_rarely = 60ULL * NANOSEC;		/* every 60 seconds */

/*
 * Build one ztest_info_t initializer; the function's name is stringified
 * into zi_funcname.
 */
#define	ZTI_INIT(func, iters, interval) \
	{   .zi_func = (func), \
	    .zi_iters = (iters), \
	    .zi_interval = (interval), \
	    .zi_funcname = # func }

/*
 * Table of test routines.  The vdev add/remove entries take their interval
 * from ztest_opts.zo_vdevtime rather than from one of the fixed zopt_*
 * values.
 */
static ztest_info_t ztest_info[] = {
	ZTI_INIT(ztest_dmu_read_write, 1, &zopt_always),
	ZTI_INIT(ztest_dmu_write_parallel, 10, &zopt_always),
	ZTI_INIT(ztest_dmu_object_alloc_free, 1, &zopt_always),
	ZTI_INIT(ztest_dmu_object_next_chunk, 1, &zopt_sometimes),
	ZTI_INIT(ztest_dmu_commit_callbacks, 1, &zopt_always),
	ZTI_INIT(ztest_zap, 30, &zopt_always),
	ZTI_INIT(ztest_zap_parallel, 100, &zopt_always),
	ZTI_INIT(ztest_split_pool, 1, &zopt_sometimes),
	ZTI_INIT(ztest_zil_commit, 1, &zopt_incessant),
	ZTI_INIT(ztest_zil_remount, 1, &zopt_sometimes),
	ZTI_INIT(ztest_dmu_read_write_zcopy, 1, &zopt_often),
	ZTI_INIT(ztest_dmu_objset_create_destroy, 1, &zopt_often),
	ZTI_INIT(ztest_dsl_prop_get_set, 1, &zopt_often),
	ZTI_INIT(ztest_spa_prop_get_set, 1, &zopt_sometimes),
#if 0
	ZTI_INIT(ztest_dmu_prealloc, 1, &zopt_sometimes),
#endif
	ZTI_INIT(ztest_fzap, 1, &zopt_sometimes),
	ZTI_INIT(ztest_dmu_snapshot_create_destroy, 1, &zopt_sometimes),
	ZTI_INIT(ztest_spa_create_destroy, 1, &zopt_sometimes),
	ZTI_INIT(ztest_fault_inject, 1, &zopt_sometimes),
	ZTI_INIT(ztest_dmu_snapshot_hold, 1, &zopt_sometimes),
	ZTI_INIT(ztest_mmp_enable_disable, 1, &zopt_sometimes),
	ZTI_INIT(ztest_reguid, 1, &zopt_rarely),
	ZTI_INIT(ztest_scrub, 1, &zopt_rarely),
	ZTI_INIT(ztest_spa_upgrade, 1, &zopt_rarely),
	ZTI_INIT(ztest_dsl_dataset_promote_busy, 1, &zopt_rarely),
	ZTI_INIT(ztest_vdev_attach_detach, 1, &zopt_sometimes),
	ZTI_INIT(ztest_vdev_LUN_growth, 1, &zopt_rarely),
	ZTI_INIT(ztest_vdev_add_remove, 1, &ztest_opts.zo_vdevtime),
	ZTI_INIT(ztest_vdev_class_add, 1, &ztest_opts.zo_vdevtime),
	ZTI_INIT(ztest_vdev_aux_add_remove, 1, &ztest_opts.zo_vdevtime),
	ZTI_INIT(ztest_device_removal, 1, &zopt_sometimes),
	ZTI_INIT(ztest_spa_checkpoint_create_discard, 1, &zopt_rarely),
	ZTI_INIT(ztest_initialize, 1, &zopt_sometimes),
	ZTI_INIT(ztest_trim, 1, &zopt_sometimes),
	ZTI_INIT(ztest_blake3, 1, &zopt_rarely),
	ZTI_INIT(ztest_fletcher, 1, &zopt_rarely),
	ZTI_INIT(ztest_fletcher_incr, 1, &zopt_rarely),
	ZTI_INIT(ztest_verify_dnode_bt, 1, &zopt_sometimes),
};

/* Number of entries in ztest_info[]. */
#define	ZTEST_FUNCS	(sizeof (ztest_info) / sizeof (ztest_info_t))

/*
 * The following struct is used to hold a list of uncalled commit callbacks.
 * The callbacks are ordered by txg number.
 */
typedef struct ztest_cb_list {
	kmutex_t	zcl_callbacks_lock;
	list_t		zcl_callbacks;
} ztest_cb_list_t;

/*
 * Stuff we need to share writably between parent and child.
*/
typedef struct ztest_shared {
	boolean_t	zs_do_init;
	hrtime_t	zs_proc_start;
	hrtime_t	zs_proc_stop;
	hrtime_t	zs_thread_start;
	hrtime_t	zs_thread_stop;
	hrtime_t	zs_thread_kill;
	uint64_t	zs_enospc_count;
	uint64_t	zs_vdev_next_leaf;
	uint64_t	zs_vdev_aux;
	uint64_t	zs_alloc;
	uint64_t	zs_space;
	uint64_t	zs_splits;
	uint64_t	zs_mirrors;	/* read by MAXFAULTS() */
	uint64_t	zs_metaslab_sz;
	uint64_t	zs_metaslab_df_alloc_threshold;
	uint64_t	zs_guid;
} ztest_shared_t;

/* All-ones 64-bit value (-1ULL). */
#define	ID_PARALLEL	-1ULL

/* printf-style templates for vdev file names. */
static char ztest_dev_template[] = "%s/%s.%llua";
static char ztest_aux_template[] = "%s/%s.%s.%llu";
static ztest_shared_t *ztest_shared;

static spa_t *ztest_spa = NULL;
static ztest_ds_t *ztest_ds;

static kmutex_t ztest_vdev_lock;
static boolean_t ztest_device_removal_active = B_FALSE;
static boolean_t ztest_pool_scrubbed = B_FALSE;
static kmutex_t ztest_checkpoint_lock;

/*
 * The ztest_name_lock protects the pool and dataset namespace used by
 * the individual tests.  To modify the namespace, consumers must grab
 * this lock as writer.  Grabbing the lock as reader will ensure that the
 * namespace does not change while the lock is held.
 */
static pthread_rwlock_t ztest_name_lock;

/*
 * When set, fatal() ends the process with abort(); otherwise it dumps the
 * debug buffer and calls exit(3).
 */
static boolean_t ztest_dump_core = B_TRUE;
static boolean_t ztest_exiting;

/* Global commit callback list */
static ztest_cb_list_t zcl;
/* Commit cb delay */
static uint64_t zc_min_txg_delay = UINT64_MAX;
static int zc_cb_counter = 0;

/*
 * Minimum number of commit callbacks that need to be registered for us to check
 * whether the minimum txg delay is acceptable.
 */
#define	ZTEST_COMMIT_CB_MIN_REG	100

/*
 * If a number of txgs equal to this threshold have been created after a commit
 * callback has been registered but not called, then we assume there is an
 * implementation bug.
556 */ 557 #define ZTEST_COMMIT_CB_THRESH (TXG_CONCURRENT_STATES + 1000) 558 559 enum ztest_object { 560 ZTEST_META_DNODE = 0, 561 ZTEST_DIROBJ, 562 ZTEST_OBJECTS 563 }; 564 565 static __attribute__((noreturn)) void usage(boolean_t requested); 566 static int ztest_scrub_impl(spa_t *spa); 567 568 /* 569 * These libumem hooks provide a reasonable set of defaults for the allocator's 570 * debugging facilities. 571 */ 572 const char * 573 _umem_debug_init(void) 574 { 575 return ("default,verbose"); /* $UMEM_DEBUG setting */ 576 } 577 578 const char * 579 _umem_logging_init(void) 580 { 581 return ("fail,contents"); /* $UMEM_LOGGING setting */ 582 } 583 584 static void 585 dump_debug_buffer(void) 586 { 587 ssize_t ret __attribute__((unused)); 588 589 if (!ztest_opts.zo_dump_dbgmsg) 590 return; 591 592 /* 593 * We use write() instead of printf() so that this function 594 * is safe to call from a signal handler. 595 */ 596 ret = write(STDOUT_FILENO, "\n", 1); 597 zfs_dbgmsg_print("ztest"); 598 } 599 600 #define BACKTRACE_SZ 100 601 602 static void sig_handler(int signo) 603 { 604 struct sigaction action; 605 #if (__GLIBC__ && !__UCLIBC__) /* backtrace() is a GNU extension */ 606 int nptrs; 607 void *buffer[BACKTRACE_SZ]; 608 609 nptrs = backtrace(buffer, BACKTRACE_SZ); 610 backtrace_symbols_fd(buffer, nptrs, STDERR_FILENO); 611 #endif 612 dump_debug_buffer(); 613 614 /* 615 * Restore default action and re-raise signal so SIGSEGV and 616 * SIGABRT can trigger a core dump. 617 */ 618 action.sa_handler = SIG_DFL; 619 sigemptyset(&action.sa_mask); 620 action.sa_flags = 0; 621 (void) sigaction(signo, &action, NULL); 622 raise(signo); 623 } 624 625 #define FATAL_MSG_SZ 1024 626 627 static const char *fatal_msg; 628 629 static __attribute__((format(printf, 2, 3))) __attribute__((noreturn)) void 630 fatal(int do_perror, const char *message, ...) 
631 { 632 va_list args; 633 int save_errno = errno; 634 char *buf; 635 636 (void) fflush(stdout); 637 buf = umem_alloc(FATAL_MSG_SZ, UMEM_NOFAIL); 638 if (buf == NULL) 639 goto out; 640 641 va_start(args, message); 642 (void) sprintf(buf, "ztest: "); 643 /* LINTED */ 644 (void) vsprintf(buf + strlen(buf), message, args); 645 va_end(args); 646 if (do_perror) { 647 (void) snprintf(buf + strlen(buf), FATAL_MSG_SZ - strlen(buf), 648 ": %s", strerror(save_errno)); 649 } 650 (void) fprintf(stderr, "%s\n", buf); 651 fatal_msg = buf; /* to ease debugging */ 652 653 out: 654 if (ztest_dump_core) 655 abort(); 656 else 657 dump_debug_buffer(); 658 659 exit(3); 660 } 661 662 static int 663 str2shift(const char *buf) 664 { 665 const char *ends = "BKMGTPEZ"; 666 int i; 667 668 if (buf[0] == '\0') 669 return (0); 670 for (i = 0; i < strlen(ends); i++) { 671 if (toupper(buf[0]) == ends[i]) 672 break; 673 } 674 if (i == strlen(ends)) { 675 (void) fprintf(stderr, "ztest: invalid bytes suffix: %s\n", 676 buf); 677 usage(B_FALSE); 678 } 679 if (buf[1] == '\0' || (toupper(buf[1]) == 'B' && buf[2] == '\0')) { 680 return (10*i); 681 } 682 (void) fprintf(stderr, "ztest: invalid bytes suffix: %s\n", buf); 683 usage(B_FALSE); 684 } 685 686 static uint64_t 687 nicenumtoull(const char *buf) 688 { 689 char *end; 690 uint64_t val; 691 692 val = strtoull(buf, &end, 0); 693 if (end == buf) { 694 (void) fprintf(stderr, "ztest: bad numeric value: %s\n", buf); 695 usage(B_FALSE); 696 } else if (end[0] == '.') { 697 double fval = strtod(buf, &end); 698 fval *= pow(2, str2shift(end)); 699 /* 700 * UINT64_MAX is not exactly representable as a double. 701 * The closest representation is UINT64_MAX + 1, so we 702 * use a >= comparison instead of > for the bounds check. 
703 */ 704 if (fval >= (double)UINT64_MAX) { 705 (void) fprintf(stderr, "ztest: value too large: %s\n", 706 buf); 707 usage(B_FALSE); 708 } 709 val = (uint64_t)fval; 710 } else { 711 int shift = str2shift(end); 712 if (shift >= 64 || (val << shift) >> shift != val) { 713 (void) fprintf(stderr, "ztest: value too large: %s\n", 714 buf); 715 usage(B_FALSE); 716 } 717 val <<= shift; 718 } 719 return (val); 720 } 721 722 typedef struct ztest_option { 723 const char short_opt; 724 const char *long_opt; 725 const char *long_opt_param; 726 const char *comment; 727 unsigned int default_int; 728 const char *default_str; 729 } ztest_option_t; 730 731 /* 732 * The following option_table is used for generating the usage info as well as 733 * the long and short option information for calling getopt_long(). 734 */ 735 static ztest_option_t option_table[] = { 736 { 'v', "vdevs", "INTEGER", "Number of vdevs", DEFAULT_VDEV_COUNT, 737 NULL}, 738 { 's', "vdev-size", "INTEGER", "Size of each vdev", 739 NO_DEFAULT, DEFAULT_VDEV_SIZE_STR}, 740 { 'a', "alignment-shift", "INTEGER", 741 "Alignment shift; use 0 for random", DEFAULT_ASHIFT, NULL}, 742 { 'm', "mirror-copies", "INTEGER", "Number of mirror copies", 743 DEFAULT_MIRRORS, NULL}, 744 { 'r', "raid-disks", "INTEGER", "Number of raidz/draid disks", 745 DEFAULT_RAID_CHILDREN, NULL}, 746 { 'R', "raid-parity", "INTEGER", "Raid parity", 747 DEFAULT_RAID_PARITY, NULL}, 748 { 'K', "raid-kind", "raidz|draid|random", "Raid kind", 749 NO_DEFAULT, "random"}, 750 { 'D', "draid-data", "INTEGER", "Number of draid data drives", 751 DEFAULT_DRAID_DATA, NULL}, 752 { 'S', "draid-spares", "INTEGER", "Number of draid spares", 753 DEFAULT_DRAID_SPARES, NULL}, 754 { 'd', "datasets", "INTEGER", "Number of datasets", 755 DEFAULT_DATASETS_COUNT, NULL}, 756 { 't', "threads", "INTEGER", "Number of ztest threads", 757 DEFAULT_THREADS, NULL}, 758 { 'g', "gang-block-threshold", "INTEGER", 759 "Metaslab gang block threshold", 760 NO_DEFAULT, 
DEFAULT_FORCE_GANGING_STR}, 761 { 'i', "init-count", "INTEGER", "Number of times to initialize pool", 762 DEFAULT_INITS, NULL}, 763 { 'k', "kill-percentage", "INTEGER", "Kill percentage", 764 NO_DEFAULT, DEFAULT_KILLRATE_STR}, 765 { 'p', "pool-name", "STRING", "Pool name", 766 NO_DEFAULT, DEFAULT_POOL}, 767 { 'f', "vdev-file-directory", "PATH", "File directory for vdev files", 768 NO_DEFAULT, DEFAULT_VDEV_DIR}, 769 { 'M', "multi-host", NULL, 770 "Multi-host; simulate pool imported on remote host", 771 NO_DEFAULT, NULL}, 772 { 'E', "use-existing-pool", NULL, 773 "Use existing pool instead of creating new one", NO_DEFAULT, NULL}, 774 { 'T', "run-time", "INTEGER", "Total run time", 775 NO_DEFAULT, DEFAULT_RUN_TIME_STR}, 776 { 'P', "pass-time", "INTEGER", "Time per pass", 777 NO_DEFAULT, DEFAULT_PASS_TIME_STR}, 778 { 'F', "freeze-loops", "INTEGER", "Max loops in spa_freeze()", 779 DEFAULT_MAX_LOOPS, NULL}, 780 { 'B', "alt-ztest", "PATH", "Alternate ztest path", 781 NO_DEFAULT, NULL}, 782 { 'C', "vdev-class-state", "on|off|random", "vdev class state", 783 NO_DEFAULT, "random"}, 784 { 'o', "option", "\"OPTION=INTEGER\"", 785 "Set global variable to an unsigned 32-bit integer value", 786 NO_DEFAULT, NULL}, 787 { 'G', "dump-debug-msg", NULL, 788 "Dump zfs_dbgmsg buffer before exiting due to an error", 789 NO_DEFAULT, NULL}, 790 { 'V', "verbose", NULL, 791 "Verbose (use multiple times for ever more verbosity)", 792 NO_DEFAULT, NULL}, 793 { 'h', "help", NULL, "Show this help", 794 NO_DEFAULT, NULL}, 795 {0, 0, 0, 0, 0, 0} 796 }; 797 798 static struct option *long_opts = NULL; 799 static char *short_opts = NULL; 800 801 static void 802 init_options(void) 803 { 804 ASSERT3P(long_opts, ==, NULL); 805 ASSERT3P(short_opts, ==, NULL); 806 807 int count = sizeof (option_table) / sizeof (option_table[0]); 808 long_opts = umem_alloc(sizeof (struct option) * count, UMEM_NOFAIL); 809 810 short_opts = umem_alloc(sizeof (char) * 2 * count, UMEM_NOFAIL); 811 int short_opt_index = 0; 812 
813 for (int i = 0; i < count; i++) { 814 long_opts[i].val = option_table[i].short_opt; 815 long_opts[i].name = option_table[i].long_opt; 816 long_opts[i].has_arg = option_table[i].long_opt_param != NULL 817 ? required_argument : no_argument; 818 long_opts[i].flag = NULL; 819 short_opts[short_opt_index++] = option_table[i].short_opt; 820 if (option_table[i].long_opt_param != NULL) { 821 short_opts[short_opt_index++] = ':'; 822 } 823 } 824 } 825 826 static void 827 fini_options(void) 828 { 829 int count = sizeof (option_table) / sizeof (option_table[0]); 830 831 umem_free(long_opts, sizeof (struct option) * count); 832 umem_free(short_opts, sizeof (char) * 2 * count); 833 834 long_opts = NULL; 835 short_opts = NULL; 836 } 837 838 static __attribute__((noreturn)) void 839 usage(boolean_t requested) 840 { 841 char option[80]; 842 FILE *fp = requested ? stdout : stderr; 843 844 (void) fprintf(fp, "Usage: %s [OPTIONS...]\n", DEFAULT_POOL); 845 for (int i = 0; option_table[i].short_opt != 0; i++) { 846 if (option_table[i].long_opt_param != NULL) { 847 (void) sprintf(option, " -%c --%s=%s", 848 option_table[i].short_opt, 849 option_table[i].long_opt, 850 option_table[i].long_opt_param); 851 } else { 852 (void) sprintf(option, " -%c --%s", 853 option_table[i].short_opt, 854 option_table[i].long_opt); 855 } 856 (void) fprintf(fp, " %-40s%s", option, 857 option_table[i].comment); 858 859 if (option_table[i].long_opt_param != NULL) { 860 if (option_table[i].default_str != NULL) { 861 (void) fprintf(fp, " (default: %s)", 862 option_table[i].default_str); 863 } else if (option_table[i].default_int != NO_DEFAULT) { 864 (void) fprintf(fp, " (default: %u)", 865 option_table[i].default_int); 866 } 867 } 868 (void) fprintf(fp, "\n"); 869 } 870 exit(requested ? 
0 : 1); 871 } 872 873 static uint64_t 874 ztest_random(uint64_t range) 875 { 876 uint64_t r; 877 878 ASSERT3S(ztest_fd_rand, >=, 0); 879 880 if (range == 0) 881 return (0); 882 883 if (read(ztest_fd_rand, &r, sizeof (r)) != sizeof (r)) 884 fatal(B_TRUE, "short read from /dev/urandom"); 885 886 return (r % range); 887 } 888 889 static void 890 ztest_parse_name_value(const char *input, ztest_shared_opts_t *zo) 891 { 892 char name[32]; 893 char *value; 894 int state = ZTEST_VDEV_CLASS_RND; 895 896 (void) strlcpy(name, input, sizeof (name)); 897 898 value = strchr(name, '='); 899 if (value == NULL) { 900 (void) fprintf(stderr, "missing value in property=value " 901 "'-C' argument (%s)\n", input); 902 usage(B_FALSE); 903 } 904 *(value) = '\0'; 905 value++; 906 907 if (strcmp(value, "on") == 0) { 908 state = ZTEST_VDEV_CLASS_ON; 909 } else if (strcmp(value, "off") == 0) { 910 state = ZTEST_VDEV_CLASS_OFF; 911 } else if (strcmp(value, "random") == 0) { 912 state = ZTEST_VDEV_CLASS_RND; 913 } else { 914 (void) fprintf(stderr, "invalid property value '%s'\n", value); 915 usage(B_FALSE); 916 } 917 918 if (strcmp(name, "special") == 0) { 919 zo->zo_special_vdevs = state; 920 } else { 921 (void) fprintf(stderr, "invalid property name '%s'\n", name); 922 usage(B_FALSE); 923 } 924 if (zo->zo_verbose >= 3) 925 (void) printf("%s vdev state is '%s'\n", name, value); 926 } 927 928 static void 929 process_options(int argc, char **argv) 930 { 931 char *path; 932 ztest_shared_opts_t *zo = &ztest_opts; 933 934 int opt; 935 uint64_t value; 936 const char *raid_kind = "random"; 937 938 memcpy(zo, &ztest_opts_defaults, sizeof (*zo)); 939 940 init_options(); 941 942 while ((opt = getopt_long(argc, argv, short_opts, long_opts, 943 NULL)) != EOF) { 944 value = 0; 945 switch (opt) { 946 case 'v': 947 case 's': 948 case 'a': 949 case 'm': 950 case 'r': 951 case 'R': 952 case 'D': 953 case 'S': 954 case 'd': 955 case 't': 956 case 'g': 957 case 'i': 958 case 'k': 959 case 'T': 960 case 'P': 961 
case 'F': 962 value = nicenumtoull(optarg); 963 } 964 switch (opt) { 965 case 'v': 966 zo->zo_vdevs = value; 967 break; 968 case 's': 969 zo->zo_vdev_size = MAX(SPA_MINDEVSIZE, value); 970 break; 971 case 'a': 972 zo->zo_ashift = value; 973 break; 974 case 'm': 975 zo->zo_mirrors = value; 976 break; 977 case 'r': 978 zo->zo_raid_children = MAX(1, value); 979 break; 980 case 'R': 981 zo->zo_raid_parity = MIN(MAX(value, 1), 3); 982 break; 983 case 'K': 984 raid_kind = optarg; 985 break; 986 case 'D': 987 zo->zo_draid_data = MAX(1, value); 988 break; 989 case 'S': 990 zo->zo_draid_spares = MAX(1, value); 991 break; 992 case 'd': 993 zo->zo_datasets = MAX(1, value); 994 break; 995 case 't': 996 zo->zo_threads = MAX(1, value); 997 break; 998 case 'g': 999 zo->zo_metaslab_force_ganging = 1000 MAX(SPA_MINBLOCKSIZE << 1, value); 1001 break; 1002 case 'i': 1003 zo->zo_init = value; 1004 break; 1005 case 'k': 1006 zo->zo_killrate = value; 1007 break; 1008 case 'p': 1009 (void) strlcpy(zo->zo_pool, optarg, 1010 sizeof (zo->zo_pool)); 1011 break; 1012 case 'f': 1013 path = realpath(optarg, NULL); 1014 if (path == NULL) { 1015 (void) fprintf(stderr, "error: %s: %s\n", 1016 optarg, strerror(errno)); 1017 usage(B_FALSE); 1018 } else { 1019 (void) strlcpy(zo->zo_dir, path, 1020 sizeof (zo->zo_dir)); 1021 free(path); 1022 } 1023 break; 1024 case 'M': 1025 zo->zo_mmp_test = 1; 1026 break; 1027 case 'V': 1028 zo->zo_verbose++; 1029 break; 1030 case 'E': 1031 zo->zo_init = 0; 1032 break; 1033 case 'T': 1034 zo->zo_time = value; 1035 break; 1036 case 'P': 1037 zo->zo_passtime = MAX(1, value); 1038 break; 1039 case 'F': 1040 zo->zo_maxloops = MAX(1, value); 1041 break; 1042 case 'B': 1043 (void) strlcpy(zo->zo_alt_ztest, optarg, 1044 sizeof (zo->zo_alt_ztest)); 1045 break; 1046 case 'C': 1047 ztest_parse_name_value(optarg, zo); 1048 break; 1049 case 'o': 1050 if (zo->zo_gvars_count >= ZO_GVARS_MAX_COUNT) { 1051 (void) fprintf(stderr, 1052 "max global var count (%zu) exceeded\n", 1053 
ZO_GVARS_MAX_COUNT); 1054 usage(B_FALSE); 1055 } 1056 char *v = zo->zo_gvars[zo->zo_gvars_count]; 1057 if (strlcpy(v, optarg, ZO_GVARS_MAX_ARGLEN) >= 1058 ZO_GVARS_MAX_ARGLEN) { 1059 (void) fprintf(stderr, 1060 "global var option '%s' is too long\n", 1061 optarg); 1062 usage(B_FALSE); 1063 } 1064 zo->zo_gvars_count++; 1065 break; 1066 case 'G': 1067 zo->zo_dump_dbgmsg = 1; 1068 break; 1069 case 'h': 1070 usage(B_TRUE); 1071 break; 1072 case '?': 1073 default: 1074 usage(B_FALSE); 1075 break; 1076 } 1077 } 1078 1079 fini_options(); 1080 1081 /* When raid choice is 'random' add a draid pool 50% of the time */ 1082 if (strcmp(raid_kind, "random") == 0) { 1083 raid_kind = (ztest_random(2) == 0) ? "draid" : "raidz"; 1084 1085 if (ztest_opts.zo_verbose >= 3) 1086 (void) printf("choosing RAID type '%s'\n", raid_kind); 1087 } 1088 1089 if (strcmp(raid_kind, "draid") == 0) { 1090 uint64_t min_devsize; 1091 1092 /* With fewer disk use 256M, otherwise 128M is OK */ 1093 min_devsize = (ztest_opts.zo_raid_children < 16) ? 
1094 (256ULL << 20) : (128ULL << 20); 1095 1096 /* No top-level mirrors with dRAID for now */ 1097 zo->zo_mirrors = 0; 1098 1099 /* Use more appropriate defaults for dRAID */ 1100 if (zo->zo_vdevs == ztest_opts_defaults.zo_vdevs) 1101 zo->zo_vdevs = 1; 1102 if (zo->zo_raid_children == 1103 ztest_opts_defaults.zo_raid_children) 1104 zo->zo_raid_children = 16; 1105 if (zo->zo_ashift < 12) 1106 zo->zo_ashift = 12; 1107 if (zo->zo_vdev_size < min_devsize) 1108 zo->zo_vdev_size = min_devsize; 1109 1110 if (zo->zo_draid_data + zo->zo_raid_parity > 1111 zo->zo_raid_children - zo->zo_draid_spares) { 1112 (void) fprintf(stderr, "error: too few draid " 1113 "children (%d) for stripe width (%d)\n", 1114 zo->zo_raid_children, 1115 zo->zo_draid_data + zo->zo_raid_parity); 1116 usage(B_FALSE); 1117 } 1118 1119 (void) strlcpy(zo->zo_raid_type, VDEV_TYPE_DRAID, 1120 sizeof (zo->zo_raid_type)); 1121 1122 } else /* using raidz */ { 1123 ASSERT0(strcmp(raid_kind, "raidz")); 1124 1125 zo->zo_raid_parity = MIN(zo->zo_raid_parity, 1126 zo->zo_raid_children - 1); 1127 } 1128 1129 zo->zo_vdevtime = 1130 (zo->zo_vdevs > 0 ? 
zo->zo_time * NANOSEC / zo->zo_vdevs : 1131 UINT64_MAX >> 2); 1132 1133 if (*zo->zo_alt_ztest) { 1134 const char *invalid_what = "ztest"; 1135 char *val = zo->zo_alt_ztest; 1136 if (0 != access(val, X_OK) || 1137 (strrchr(val, '/') == NULL && (errno == EINVAL))) 1138 goto invalid; 1139 1140 int dirlen = strrchr(val, '/') - val; 1141 strlcpy(zo->zo_alt_libpath, val, 1142 MIN(sizeof (zo->zo_alt_libpath), dirlen + 1)); 1143 invalid_what = "library path", val = zo->zo_alt_libpath; 1144 if (strrchr(val, '/') == NULL && (errno == EINVAL)) 1145 goto invalid; 1146 *strrchr(val, '/') = '\0'; 1147 strlcat(val, "/lib", sizeof (zo->zo_alt_libpath)); 1148 1149 if (0 != access(zo->zo_alt_libpath, X_OK)) 1150 goto invalid; 1151 return; 1152 1153 invalid: 1154 ztest_dump_core = B_FALSE; 1155 fatal(B_TRUE, "invalid alternate %s %s", invalid_what, val); 1156 } 1157 } 1158 1159 static void 1160 ztest_kill(ztest_shared_t *zs) 1161 { 1162 zs->zs_alloc = metaslab_class_get_alloc(spa_normal_class(ztest_spa)); 1163 zs->zs_space = metaslab_class_get_space(spa_normal_class(ztest_spa)); 1164 1165 /* 1166 * Before we kill ourselves, make sure that the config is updated. 1167 * See comment above spa_write_cachefile(). 
1168 */ 1169 mutex_enter(&spa_namespace_lock); 1170 spa_write_cachefile(ztest_spa, B_FALSE, B_FALSE, B_FALSE); 1171 mutex_exit(&spa_namespace_lock); 1172 1173 (void) raise(SIGKILL); 1174 } 1175 1176 static void 1177 ztest_record_enospc(const char *s) 1178 { 1179 (void) s; 1180 ztest_shared->zs_enospc_count++; 1181 } 1182 1183 static uint64_t 1184 ztest_get_ashift(void) 1185 { 1186 if (ztest_opts.zo_ashift == 0) 1187 return (SPA_MINBLOCKSHIFT + ztest_random(5)); 1188 return (ztest_opts.zo_ashift); 1189 } 1190 1191 static boolean_t 1192 ztest_is_draid_spare(const char *name) 1193 { 1194 uint64_t spare_id = 0, parity = 0, vdev_id = 0; 1195 1196 if (sscanf(name, VDEV_TYPE_DRAID "%"PRIu64"-%"PRIu64"-%"PRIu64"", 1197 &parity, &vdev_id, &spare_id) == 3) { 1198 return (B_TRUE); 1199 } 1200 1201 return (B_FALSE); 1202 } 1203 1204 static nvlist_t * 1205 make_vdev_file(const char *path, const char *aux, const char *pool, 1206 size_t size, uint64_t ashift) 1207 { 1208 char *pathbuf = NULL; 1209 uint64_t vdev; 1210 nvlist_t *file; 1211 boolean_t draid_spare = B_FALSE; 1212 1213 1214 if (ashift == 0) 1215 ashift = ztest_get_ashift(); 1216 1217 if (path == NULL) { 1218 pathbuf = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); 1219 path = pathbuf; 1220 1221 if (aux != NULL) { 1222 vdev = ztest_shared->zs_vdev_aux; 1223 (void) snprintf(pathbuf, MAXPATHLEN, 1224 ztest_aux_template, ztest_opts.zo_dir, 1225 pool == NULL ? ztest_opts.zo_pool : pool, 1226 aux, vdev); 1227 } else { 1228 vdev = ztest_shared->zs_vdev_next_leaf++; 1229 (void) snprintf(pathbuf, MAXPATHLEN, 1230 ztest_dev_template, ztest_opts.zo_dir, 1231 pool == NULL ? 
ztest_opts.zo_pool : pool, vdev); 1232 } 1233 } else { 1234 draid_spare = ztest_is_draid_spare(path); 1235 } 1236 1237 if (size != 0 && !draid_spare) { 1238 int fd = open(path, O_RDWR | O_CREAT | O_TRUNC, 0666); 1239 if (fd == -1) 1240 fatal(B_TRUE, "can't open %s", path); 1241 if (ftruncate(fd, size) != 0) 1242 fatal(B_TRUE, "can't ftruncate %s", path); 1243 (void) close(fd); 1244 } 1245 1246 file = fnvlist_alloc(); 1247 fnvlist_add_string(file, ZPOOL_CONFIG_TYPE, 1248 draid_spare ? VDEV_TYPE_DRAID_SPARE : VDEV_TYPE_FILE); 1249 fnvlist_add_string(file, ZPOOL_CONFIG_PATH, path); 1250 fnvlist_add_uint64(file, ZPOOL_CONFIG_ASHIFT, ashift); 1251 umem_free(pathbuf, MAXPATHLEN); 1252 1253 return (file); 1254 } 1255 1256 static nvlist_t * 1257 make_vdev_raid(const char *path, const char *aux, const char *pool, size_t size, 1258 uint64_t ashift, int r) 1259 { 1260 nvlist_t *raid, **child; 1261 int c; 1262 1263 if (r < 2) 1264 return (make_vdev_file(path, aux, pool, size, ashift)); 1265 child = umem_alloc(r * sizeof (nvlist_t *), UMEM_NOFAIL); 1266 1267 for (c = 0; c < r; c++) 1268 child[c] = make_vdev_file(path, aux, pool, size, ashift); 1269 1270 raid = fnvlist_alloc(); 1271 fnvlist_add_string(raid, ZPOOL_CONFIG_TYPE, 1272 ztest_opts.zo_raid_type); 1273 fnvlist_add_uint64(raid, ZPOOL_CONFIG_NPARITY, 1274 ztest_opts.zo_raid_parity); 1275 fnvlist_add_nvlist_array(raid, ZPOOL_CONFIG_CHILDREN, 1276 (const nvlist_t **)child, r); 1277 1278 if (strcmp(ztest_opts.zo_raid_type, VDEV_TYPE_DRAID) == 0) { 1279 uint64_t ndata = ztest_opts.zo_draid_data; 1280 uint64_t nparity = ztest_opts.zo_raid_parity; 1281 uint64_t nspares = ztest_opts.zo_draid_spares; 1282 uint64_t children = ztest_opts.zo_raid_children; 1283 uint64_t ngroups = 1; 1284 1285 /* 1286 * Calculate the minimum number of groups required to fill a 1287 * slice. This is the LCM of the stripe width (data + parity) 1288 * and the number of data drives (children - spares). 
1289 */ 1290 while (ngroups * (ndata + nparity) % (children - nspares) != 0) 1291 ngroups++; 1292 1293 /* Store the basic dRAID configuration. */ 1294 fnvlist_add_uint64(raid, ZPOOL_CONFIG_DRAID_NDATA, ndata); 1295 fnvlist_add_uint64(raid, ZPOOL_CONFIG_DRAID_NSPARES, nspares); 1296 fnvlist_add_uint64(raid, ZPOOL_CONFIG_DRAID_NGROUPS, ngroups); 1297 } 1298 1299 for (c = 0; c < r; c++) 1300 fnvlist_free(child[c]); 1301 1302 umem_free(child, r * sizeof (nvlist_t *)); 1303 1304 return (raid); 1305 } 1306 1307 static nvlist_t * 1308 make_vdev_mirror(const char *path, const char *aux, const char *pool, 1309 size_t size, uint64_t ashift, int r, int m) 1310 { 1311 nvlist_t *mirror, **child; 1312 int c; 1313 1314 if (m < 1) 1315 return (make_vdev_raid(path, aux, pool, size, ashift, r)); 1316 1317 child = umem_alloc(m * sizeof (nvlist_t *), UMEM_NOFAIL); 1318 1319 for (c = 0; c < m; c++) 1320 child[c] = make_vdev_raid(path, aux, pool, size, ashift, r); 1321 1322 mirror = fnvlist_alloc(); 1323 fnvlist_add_string(mirror, ZPOOL_CONFIG_TYPE, VDEV_TYPE_MIRROR); 1324 fnvlist_add_nvlist_array(mirror, ZPOOL_CONFIG_CHILDREN, 1325 (const nvlist_t **)child, m); 1326 1327 for (c = 0; c < m; c++) 1328 fnvlist_free(child[c]); 1329 1330 umem_free(child, m * sizeof (nvlist_t *)); 1331 1332 return (mirror); 1333 } 1334 1335 static nvlist_t * 1336 make_vdev_root(const char *path, const char *aux, const char *pool, size_t size, 1337 uint64_t ashift, const char *class, int r, int m, int t) 1338 { 1339 nvlist_t *root, **child; 1340 int c; 1341 boolean_t log; 1342 1343 ASSERT3S(t, >, 0); 1344 1345 log = (class != NULL && strcmp(class, "log") == 0); 1346 1347 child = umem_alloc(t * sizeof (nvlist_t *), UMEM_NOFAIL); 1348 1349 for (c = 0; c < t; c++) { 1350 child[c] = make_vdev_mirror(path, aux, pool, size, ashift, 1351 r, m); 1352 fnvlist_add_uint64(child[c], ZPOOL_CONFIG_IS_LOG, log); 1353 1354 if (class != NULL && class[0] != '\0') { 1355 ASSERT(m > 1 || log); /* expecting a mirror */ 1356 
fnvlist_add_string(child[c], 1357 ZPOOL_CONFIG_ALLOCATION_BIAS, class); 1358 } 1359 } 1360 1361 root = fnvlist_alloc(); 1362 fnvlist_add_string(root, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT); 1363 fnvlist_add_nvlist_array(root, aux ? aux : ZPOOL_CONFIG_CHILDREN, 1364 (const nvlist_t **)child, t); 1365 1366 for (c = 0; c < t; c++) 1367 fnvlist_free(child[c]); 1368 1369 umem_free(child, t * sizeof (nvlist_t *)); 1370 1371 return (root); 1372 } 1373 1374 /* 1375 * Find a random spa version. Returns back a random spa version in the 1376 * range [initial_version, SPA_VERSION_FEATURES]. 1377 */ 1378 static uint64_t 1379 ztest_random_spa_version(uint64_t initial_version) 1380 { 1381 uint64_t version = initial_version; 1382 1383 if (version <= SPA_VERSION_BEFORE_FEATURES) { 1384 version = version + 1385 ztest_random(SPA_VERSION_BEFORE_FEATURES - version + 1); 1386 } 1387 1388 if (version > SPA_VERSION_BEFORE_FEATURES) 1389 version = SPA_VERSION_FEATURES; 1390 1391 ASSERT(SPA_VERSION_IS_SUPPORTED(version)); 1392 return (version); 1393 } 1394 1395 static int 1396 ztest_random_blocksize(void) 1397 { 1398 ASSERT3U(ztest_spa->spa_max_ashift, !=, 0); 1399 1400 /* 1401 * Choose a block size >= the ashift. 1402 * If the SPA supports new MAXBLOCKSIZE, test up to 1MB blocks. 1403 */ 1404 int maxbs = SPA_OLD_MAXBLOCKSHIFT; 1405 if (spa_maxblocksize(ztest_spa) == SPA_MAXBLOCKSIZE) 1406 maxbs = 20; 1407 uint64_t block_shift = 1408 ztest_random(maxbs - ztest_spa->spa_max_ashift + 1); 1409 return (1 << (SPA_MINBLOCKSHIFT + block_shift)); 1410 } 1411 1412 static int 1413 ztest_random_dnodesize(void) 1414 { 1415 int slots; 1416 int max_slots = spa_maxdnodesize(ztest_spa) >> DNODE_SHIFT; 1417 1418 if (max_slots == DNODE_MIN_SLOTS) 1419 return (DNODE_MIN_SIZE); 1420 1421 /* 1422 * Weight the random distribution more heavily toward smaller 1423 * dnode sizes since that is more likely to reflect real-world 1424 * usage. 
1425 */ 1426 ASSERT3U(max_slots, >, 4); 1427 switch (ztest_random(10)) { 1428 case 0: 1429 slots = 5 + ztest_random(max_slots - 4); 1430 break; 1431 case 1 ... 4: 1432 slots = 2 + ztest_random(3); 1433 break; 1434 default: 1435 slots = 1; 1436 break; 1437 } 1438 1439 return (slots << DNODE_SHIFT); 1440 } 1441 1442 static int 1443 ztest_random_ibshift(void) 1444 { 1445 return (DN_MIN_INDBLKSHIFT + 1446 ztest_random(DN_MAX_INDBLKSHIFT - DN_MIN_INDBLKSHIFT + 1)); 1447 } 1448 1449 static uint64_t 1450 ztest_random_vdev_top(spa_t *spa, boolean_t log_ok) 1451 { 1452 uint64_t top; 1453 vdev_t *rvd = spa->spa_root_vdev; 1454 vdev_t *tvd; 1455 1456 ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); 1457 1458 do { 1459 top = ztest_random(rvd->vdev_children); 1460 tvd = rvd->vdev_child[top]; 1461 } while (!vdev_is_concrete(tvd) || (tvd->vdev_islog && !log_ok) || 1462 tvd->vdev_mg == NULL || tvd->vdev_mg->mg_class == NULL); 1463 1464 return (top); 1465 } 1466 1467 static uint64_t 1468 ztest_random_dsl_prop(zfs_prop_t prop) 1469 { 1470 uint64_t value; 1471 1472 do { 1473 value = zfs_prop_random_value(prop, ztest_random(-1ULL)); 1474 } while (prop == ZFS_PROP_CHECKSUM && value == ZIO_CHECKSUM_OFF); 1475 1476 return (value); 1477 } 1478 1479 static int 1480 ztest_dsl_prop_set_uint64(char *osname, zfs_prop_t prop, uint64_t value, 1481 boolean_t inherit) 1482 { 1483 const char *propname = zfs_prop_to_name(prop); 1484 const char *valname; 1485 char *setpoint; 1486 uint64_t curval; 1487 int error; 1488 1489 error = dsl_prop_set_int(osname, propname, 1490 (inherit ? 
ZPROP_SRC_NONE : ZPROP_SRC_LOCAL), value); 1491 1492 if (error == ENOSPC) { 1493 ztest_record_enospc(FTAG); 1494 return (error); 1495 } 1496 ASSERT0(error); 1497 1498 setpoint = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); 1499 VERIFY0(dsl_prop_get_integer(osname, propname, &curval, setpoint)); 1500 1501 if (ztest_opts.zo_verbose >= 6) { 1502 int err; 1503 1504 err = zfs_prop_index_to_string(prop, curval, &valname); 1505 if (err) 1506 (void) printf("%s %s = %llu at '%s'\n", osname, 1507 propname, (unsigned long long)curval, setpoint); 1508 else 1509 (void) printf("%s %s = %s at '%s'\n", 1510 osname, propname, valname, setpoint); 1511 } 1512 umem_free(setpoint, MAXPATHLEN); 1513 1514 return (error); 1515 } 1516 1517 static int 1518 ztest_spa_prop_set_uint64(zpool_prop_t prop, uint64_t value) 1519 { 1520 spa_t *spa = ztest_spa; 1521 nvlist_t *props = NULL; 1522 int error; 1523 1524 props = fnvlist_alloc(); 1525 fnvlist_add_uint64(props, zpool_prop_to_name(prop), value); 1526 1527 error = spa_prop_set(spa, props); 1528 1529 fnvlist_free(props); 1530 1531 if (error == ENOSPC) { 1532 ztest_record_enospc(FTAG); 1533 return (error); 1534 } 1535 ASSERT0(error); 1536 1537 return (error); 1538 } 1539 1540 static int 1541 ztest_dmu_objset_own(const char *name, dmu_objset_type_t type, 1542 boolean_t readonly, boolean_t decrypt, const void *tag, objset_t **osp) 1543 { 1544 int err; 1545 char *cp = NULL; 1546 char ddname[ZFS_MAX_DATASET_NAME_LEN]; 1547 1548 strlcpy(ddname, name, sizeof (ddname)); 1549 cp = strchr(ddname, '@'); 1550 if (cp != NULL) 1551 *cp = '\0'; 1552 1553 err = dmu_objset_own(name, type, readonly, decrypt, tag, osp); 1554 while (decrypt && err == EACCES) { 1555 dsl_crypto_params_t *dcp; 1556 nvlist_t *crypto_args = fnvlist_alloc(); 1557 1558 fnvlist_add_uint8_array(crypto_args, "wkeydata", 1559 (uint8_t *)ztest_wkeydata, WRAPPING_KEY_LEN); 1560 VERIFY0(dsl_crypto_params_create_nvlist(DCP_CMD_NONE, NULL, 1561 crypto_args, &dcp)); 1562 err = 
spa_keystore_load_wkey(ddname, dcp, B_FALSE); 1563 /* 1564 * Note: if there was an error loading, the wkey was not 1565 * consumed, and needs to be freed. 1566 */ 1567 dsl_crypto_params_free(dcp, (err != 0)); 1568 fnvlist_free(crypto_args); 1569 1570 if (err == EINVAL) { 1571 /* 1572 * We couldn't load a key for this dataset so try 1573 * the parent. This loop will eventually hit the 1574 * encryption root since ztest only makes clones 1575 * as children of their origin datasets. 1576 */ 1577 cp = strrchr(ddname, '/'); 1578 if (cp == NULL) 1579 return (err); 1580 1581 *cp = '\0'; 1582 err = EACCES; 1583 continue; 1584 } else if (err != 0) { 1585 break; 1586 } 1587 1588 err = dmu_objset_own(name, type, readonly, decrypt, tag, osp); 1589 break; 1590 } 1591 1592 return (err); 1593 } 1594 1595 static void 1596 ztest_rll_init(rll_t *rll) 1597 { 1598 rll->rll_writer = NULL; 1599 rll->rll_readers = 0; 1600 mutex_init(&rll->rll_lock, NULL, MUTEX_DEFAULT, NULL); 1601 cv_init(&rll->rll_cv, NULL, CV_DEFAULT, NULL); 1602 } 1603 1604 static void 1605 ztest_rll_destroy(rll_t *rll) 1606 { 1607 ASSERT3P(rll->rll_writer, ==, NULL); 1608 ASSERT0(rll->rll_readers); 1609 mutex_destroy(&rll->rll_lock); 1610 cv_destroy(&rll->rll_cv); 1611 } 1612 1613 static void 1614 ztest_rll_lock(rll_t *rll, rl_type_t type) 1615 { 1616 mutex_enter(&rll->rll_lock); 1617 1618 if (type == RL_READER) { 1619 while (rll->rll_writer != NULL) 1620 (void) cv_wait(&rll->rll_cv, &rll->rll_lock); 1621 rll->rll_readers++; 1622 } else { 1623 while (rll->rll_writer != NULL || rll->rll_readers) 1624 (void) cv_wait(&rll->rll_cv, &rll->rll_lock); 1625 rll->rll_writer = curthread; 1626 } 1627 1628 mutex_exit(&rll->rll_lock); 1629 } 1630 1631 static void 1632 ztest_rll_unlock(rll_t *rll) 1633 { 1634 mutex_enter(&rll->rll_lock); 1635 1636 if (rll->rll_writer) { 1637 ASSERT0(rll->rll_readers); 1638 rll->rll_writer = NULL; 1639 } else { 1640 ASSERT3S(rll->rll_readers, >, 0); 1641 ASSERT3P(rll->rll_writer, ==, NULL); 1642 
rll->rll_readers--; 1643 } 1644 1645 if (rll->rll_writer == NULL && rll->rll_readers == 0) 1646 cv_broadcast(&rll->rll_cv); 1647 1648 mutex_exit(&rll->rll_lock); 1649 } 1650 1651 static void 1652 ztest_object_lock(ztest_ds_t *zd, uint64_t object, rl_type_t type) 1653 { 1654 rll_t *rll = &zd->zd_object_lock[object & (ZTEST_OBJECT_LOCKS - 1)]; 1655 1656 ztest_rll_lock(rll, type); 1657 } 1658 1659 static void 1660 ztest_object_unlock(ztest_ds_t *zd, uint64_t object) 1661 { 1662 rll_t *rll = &zd->zd_object_lock[object & (ZTEST_OBJECT_LOCKS - 1)]; 1663 1664 ztest_rll_unlock(rll); 1665 } 1666 1667 static rl_t * 1668 ztest_range_lock(ztest_ds_t *zd, uint64_t object, uint64_t offset, 1669 uint64_t size, rl_type_t type) 1670 { 1671 uint64_t hash = object ^ (offset % (ZTEST_RANGE_LOCKS + 1)); 1672 rll_t *rll = &zd->zd_range_lock[hash & (ZTEST_RANGE_LOCKS - 1)]; 1673 rl_t *rl; 1674 1675 rl = umem_alloc(sizeof (*rl), UMEM_NOFAIL); 1676 rl->rl_object = object; 1677 rl->rl_offset = offset; 1678 rl->rl_size = size; 1679 rl->rl_lock = rll; 1680 1681 ztest_rll_lock(rll, type); 1682 1683 return (rl); 1684 } 1685 1686 static void 1687 ztest_range_unlock(rl_t *rl) 1688 { 1689 rll_t *rll = rl->rl_lock; 1690 1691 ztest_rll_unlock(rll); 1692 1693 umem_free(rl, sizeof (*rl)); 1694 } 1695 1696 static void 1697 ztest_zd_init(ztest_ds_t *zd, ztest_shared_ds_t *szd, objset_t *os) 1698 { 1699 zd->zd_os = os; 1700 zd->zd_zilog = dmu_objset_zil(os); 1701 zd->zd_shared = szd; 1702 dmu_objset_name(os, zd->zd_name); 1703 int l; 1704 1705 if (zd->zd_shared != NULL) 1706 zd->zd_shared->zd_seq = 0; 1707 1708 VERIFY0(pthread_rwlock_init(&zd->zd_zilog_lock, NULL)); 1709 mutex_init(&zd->zd_dirobj_lock, NULL, MUTEX_DEFAULT, NULL); 1710 1711 for (l = 0; l < ZTEST_OBJECT_LOCKS; l++) 1712 ztest_rll_init(&zd->zd_object_lock[l]); 1713 1714 for (l = 0; l < ZTEST_RANGE_LOCKS; l++) 1715 ztest_rll_init(&zd->zd_range_lock[l]); 1716 } 1717 1718 static void 1719 ztest_zd_fini(ztest_ds_t *zd) 1720 { 1721 int l; 1722 
1723 mutex_destroy(&zd->zd_dirobj_lock); 1724 (void) pthread_rwlock_destroy(&zd->zd_zilog_lock); 1725 1726 for (l = 0; l < ZTEST_OBJECT_LOCKS; l++) 1727 ztest_rll_destroy(&zd->zd_object_lock[l]); 1728 1729 for (l = 0; l < ZTEST_RANGE_LOCKS; l++) 1730 ztest_rll_destroy(&zd->zd_range_lock[l]); 1731 } 1732 1733 #define TXG_MIGHTWAIT (ztest_random(10) == 0 ? TXG_NOWAIT : TXG_WAIT) 1734 1735 static uint64_t 1736 ztest_tx_assign(dmu_tx_t *tx, uint64_t txg_how, const char *tag) 1737 { 1738 uint64_t txg; 1739 int error; 1740 1741 /* 1742 * Attempt to assign tx to some transaction group. 1743 */ 1744 error = dmu_tx_assign(tx, txg_how); 1745 if (error) { 1746 if (error == ERESTART) { 1747 ASSERT3U(txg_how, ==, TXG_NOWAIT); 1748 dmu_tx_wait(tx); 1749 } else { 1750 ASSERT3U(error, ==, ENOSPC); 1751 ztest_record_enospc(tag); 1752 } 1753 dmu_tx_abort(tx); 1754 return (0); 1755 } 1756 txg = dmu_tx_get_txg(tx); 1757 ASSERT3U(txg, !=, 0); 1758 return (txg); 1759 } 1760 1761 static void 1762 ztest_bt_generate(ztest_block_tag_t *bt, objset_t *os, uint64_t object, 1763 uint64_t dnodesize, uint64_t offset, uint64_t gen, uint64_t txg, 1764 uint64_t crtxg) 1765 { 1766 bt->bt_magic = BT_MAGIC; 1767 bt->bt_objset = dmu_objset_id(os); 1768 bt->bt_object = object; 1769 bt->bt_dnodesize = dnodesize; 1770 bt->bt_offset = offset; 1771 bt->bt_gen = gen; 1772 bt->bt_txg = txg; 1773 bt->bt_crtxg = crtxg; 1774 } 1775 1776 static void 1777 ztest_bt_verify(ztest_block_tag_t *bt, objset_t *os, uint64_t object, 1778 uint64_t dnodesize, uint64_t offset, uint64_t gen, uint64_t txg, 1779 uint64_t crtxg) 1780 { 1781 ASSERT3U(bt->bt_magic, ==, BT_MAGIC); 1782 ASSERT3U(bt->bt_objset, ==, dmu_objset_id(os)); 1783 ASSERT3U(bt->bt_object, ==, object); 1784 ASSERT3U(bt->bt_dnodesize, ==, dnodesize); 1785 ASSERT3U(bt->bt_offset, ==, offset); 1786 ASSERT3U(bt->bt_gen, <=, gen); 1787 ASSERT3U(bt->bt_txg, <=, txg); 1788 ASSERT3U(bt->bt_crtxg, ==, crtxg); 1789 } 1790 1791 static ztest_block_tag_t * 1792 
ztest_bt_bonus(dmu_buf_t *db) 1793 { 1794 dmu_object_info_t doi; 1795 ztest_block_tag_t *bt; 1796 1797 dmu_object_info_from_db(db, &doi); 1798 ASSERT3U(doi.doi_bonus_size, <=, db->db_size); 1799 ASSERT3U(doi.doi_bonus_size, >=, sizeof (*bt)); 1800 bt = (void *)((char *)db->db_data + doi.doi_bonus_size - sizeof (*bt)); 1801 1802 return (bt); 1803 } 1804 1805 /* 1806 * Generate a token to fill up unused bonus buffer space. Try to make 1807 * it unique to the object, generation, and offset to verify that data 1808 * is not getting overwritten by data from other dnodes. 1809 */ 1810 #define ZTEST_BONUS_FILL_TOKEN(obj, ds, gen, offset) \ 1811 (((ds) << 48) | ((gen) << 32) | ((obj) << 8) | (offset)) 1812 1813 /* 1814 * Fill up the unused bonus buffer region before the block tag with a 1815 * verifiable pattern. Filling the whole bonus area with non-zero data 1816 * helps ensure that all dnode traversal code properly skips the 1817 * interior regions of large dnodes. 1818 */ 1819 static void 1820 ztest_fill_unused_bonus(dmu_buf_t *db, void *end, uint64_t obj, 1821 objset_t *os, uint64_t gen) 1822 { 1823 uint64_t *bonusp; 1824 1825 ASSERT(IS_P2ALIGNED((char *)end - (char *)db->db_data, 8)); 1826 1827 for (bonusp = db->db_data; bonusp < (uint64_t *)end; bonusp++) { 1828 uint64_t token = ZTEST_BONUS_FILL_TOKEN(obj, dmu_objset_id(os), 1829 gen, bonusp - (uint64_t *)db->db_data); 1830 *bonusp = token; 1831 } 1832 } 1833 1834 /* 1835 * Verify that the unused area of a bonus buffer is filled with the 1836 * expected tokens. 
1837 */ 1838 static void 1839 ztest_verify_unused_bonus(dmu_buf_t *db, void *end, uint64_t obj, 1840 objset_t *os, uint64_t gen) 1841 { 1842 uint64_t *bonusp; 1843 1844 for (bonusp = db->db_data; bonusp < (uint64_t *)end; bonusp++) { 1845 uint64_t token = ZTEST_BONUS_FILL_TOKEN(obj, dmu_objset_id(os), 1846 gen, bonusp - (uint64_t *)db->db_data); 1847 VERIFY3U(*bonusp, ==, token); 1848 } 1849 } 1850 1851 /* 1852 * ZIL logging ops 1853 */ 1854 1855 #define lrz_type lr_mode 1856 #define lrz_blocksize lr_uid 1857 #define lrz_ibshift lr_gid 1858 #define lrz_bonustype lr_rdev 1859 #define lrz_dnodesize lr_crtime[1] 1860 1861 static void 1862 ztest_log_create(ztest_ds_t *zd, dmu_tx_t *tx, lr_create_t *lr) 1863 { 1864 char *name = (void *)(lr + 1); /* name follows lr */ 1865 size_t namesize = strlen(name) + 1; 1866 itx_t *itx; 1867 1868 if (zil_replaying(zd->zd_zilog, tx)) 1869 return; 1870 1871 itx = zil_itx_create(TX_CREATE, sizeof (*lr) + namesize); 1872 memcpy(&itx->itx_lr + 1, &lr->lr_common + 1, 1873 sizeof (*lr) + namesize - sizeof (lr_t)); 1874 1875 zil_itx_assign(zd->zd_zilog, itx, tx); 1876 } 1877 1878 static void 1879 ztest_log_remove(ztest_ds_t *zd, dmu_tx_t *tx, lr_remove_t *lr, uint64_t object) 1880 { 1881 char *name = (void *)(lr + 1); /* name follows lr */ 1882 size_t namesize = strlen(name) + 1; 1883 itx_t *itx; 1884 1885 if (zil_replaying(zd->zd_zilog, tx)) 1886 return; 1887 1888 itx = zil_itx_create(TX_REMOVE, sizeof (*lr) + namesize); 1889 memcpy(&itx->itx_lr + 1, &lr->lr_common + 1, 1890 sizeof (*lr) + namesize - sizeof (lr_t)); 1891 1892 itx->itx_oid = object; 1893 zil_itx_assign(zd->zd_zilog, itx, tx); 1894 } 1895 1896 static void 1897 ztest_log_write(ztest_ds_t *zd, dmu_tx_t *tx, lr_write_t *lr) 1898 { 1899 itx_t *itx; 1900 itx_wr_state_t write_state = ztest_random(WR_NUM_STATES); 1901 1902 if (zil_replaying(zd->zd_zilog, tx)) 1903 return; 1904 1905 if (lr->lr_length > zil_max_log_data(zd->zd_zilog, sizeof (lr_write_t))) 1906 write_state = 
WR_INDIRECT; 1907 1908 itx = zil_itx_create(TX_WRITE, 1909 sizeof (*lr) + (write_state == WR_COPIED ? lr->lr_length : 0)); 1910 1911 if (write_state == WR_COPIED && 1912 dmu_read(zd->zd_os, lr->lr_foid, lr->lr_offset, lr->lr_length, 1913 ((lr_write_t *)&itx->itx_lr) + 1, DMU_READ_NO_PREFETCH) != 0) { 1914 zil_itx_destroy(itx); 1915 itx = zil_itx_create(TX_WRITE, sizeof (*lr)); 1916 write_state = WR_NEED_COPY; 1917 } 1918 itx->itx_private = zd; 1919 itx->itx_wr_state = write_state; 1920 itx->itx_sync = (ztest_random(8) == 0); 1921 1922 memcpy(&itx->itx_lr + 1, &lr->lr_common + 1, 1923 sizeof (*lr) - sizeof (lr_t)); 1924 1925 zil_itx_assign(zd->zd_zilog, itx, tx); 1926 } 1927 1928 static void 1929 ztest_log_truncate(ztest_ds_t *zd, dmu_tx_t *tx, lr_truncate_t *lr) 1930 { 1931 itx_t *itx; 1932 1933 if (zil_replaying(zd->zd_zilog, tx)) 1934 return; 1935 1936 itx = zil_itx_create(TX_TRUNCATE, sizeof (*lr)); 1937 memcpy(&itx->itx_lr + 1, &lr->lr_common + 1, 1938 sizeof (*lr) - sizeof (lr_t)); 1939 1940 itx->itx_sync = B_FALSE; 1941 zil_itx_assign(zd->zd_zilog, itx, tx); 1942 } 1943 1944 static void 1945 ztest_log_setattr(ztest_ds_t *zd, dmu_tx_t *tx, lr_setattr_t *lr) 1946 { 1947 itx_t *itx; 1948 1949 if (zil_replaying(zd->zd_zilog, tx)) 1950 return; 1951 1952 itx = zil_itx_create(TX_SETATTR, sizeof (*lr)); 1953 memcpy(&itx->itx_lr + 1, &lr->lr_common + 1, 1954 sizeof (*lr) - sizeof (lr_t)); 1955 1956 itx->itx_sync = B_FALSE; 1957 zil_itx_assign(zd->zd_zilog, itx, tx); 1958 } 1959 1960 /* 1961 * ZIL replay ops 1962 */ 1963 static int 1964 ztest_replay_create(void *arg1, void *arg2, boolean_t byteswap) 1965 { 1966 ztest_ds_t *zd = arg1; 1967 lr_create_t *lr = arg2; 1968 char *name = (void *)(lr + 1); /* name follows lr */ 1969 objset_t *os = zd->zd_os; 1970 ztest_block_tag_t *bbt; 1971 dmu_buf_t *db; 1972 dmu_tx_t *tx; 1973 uint64_t txg; 1974 int error = 0; 1975 int bonuslen; 1976 1977 if (byteswap) 1978 byteswap_uint64_array(lr, sizeof (*lr)); 1979 1980 
ASSERT3U(lr->lr_doid, ==, ZTEST_DIROBJ); 1981 ASSERT3S(name[0], !=, '\0'); 1982 1983 tx = dmu_tx_create(os); 1984 1985 dmu_tx_hold_zap(tx, lr->lr_doid, B_TRUE, name); 1986 1987 if (lr->lrz_type == DMU_OT_ZAP_OTHER) { 1988 dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL); 1989 } else { 1990 dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); 1991 } 1992 1993 txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); 1994 if (txg == 0) 1995 return (ENOSPC); 1996 1997 ASSERT3U(dmu_objset_zil(os)->zl_replay, ==, !!lr->lr_foid); 1998 bonuslen = DN_BONUS_SIZE(lr->lrz_dnodesize); 1999 2000 if (lr->lrz_type == DMU_OT_ZAP_OTHER) { 2001 if (lr->lr_foid == 0) { 2002 lr->lr_foid = zap_create_dnsize(os, 2003 lr->lrz_type, lr->lrz_bonustype, 2004 bonuslen, lr->lrz_dnodesize, tx); 2005 } else { 2006 error = zap_create_claim_dnsize(os, lr->lr_foid, 2007 lr->lrz_type, lr->lrz_bonustype, 2008 bonuslen, lr->lrz_dnodesize, tx); 2009 } 2010 } else { 2011 if (lr->lr_foid == 0) { 2012 lr->lr_foid = dmu_object_alloc_dnsize(os, 2013 lr->lrz_type, 0, lr->lrz_bonustype, 2014 bonuslen, lr->lrz_dnodesize, tx); 2015 } else { 2016 error = dmu_object_claim_dnsize(os, lr->lr_foid, 2017 lr->lrz_type, 0, lr->lrz_bonustype, 2018 bonuslen, lr->lrz_dnodesize, tx); 2019 } 2020 } 2021 2022 if (error) { 2023 ASSERT3U(error, ==, EEXIST); 2024 ASSERT(zd->zd_zilog->zl_replay); 2025 dmu_tx_commit(tx); 2026 return (error); 2027 } 2028 2029 ASSERT3U(lr->lr_foid, !=, 0); 2030 2031 if (lr->lrz_type != DMU_OT_ZAP_OTHER) 2032 VERIFY0(dmu_object_set_blocksize(os, lr->lr_foid, 2033 lr->lrz_blocksize, lr->lrz_ibshift, tx)); 2034 2035 VERIFY0(dmu_bonus_hold(os, lr->lr_foid, FTAG, &db)); 2036 bbt = ztest_bt_bonus(db); 2037 dmu_buf_will_dirty(db, tx); 2038 ztest_bt_generate(bbt, os, lr->lr_foid, lr->lrz_dnodesize, -1ULL, 2039 lr->lr_gen, txg, txg); 2040 ztest_fill_unused_bonus(db, bbt, lr->lr_foid, os, lr->lr_gen); 2041 dmu_buf_rele(db, FTAG); 2042 2043 VERIFY0(zap_add(os, lr->lr_doid, name, sizeof (uint64_t), 1, 2044 &lr->lr_foid, tx)); 2045 2046 
(void) ztest_log_create(zd, tx, lr); 2047 2048 dmu_tx_commit(tx); 2049 2050 return (0); 2051 } 2052 2053 static int 2054 ztest_replay_remove(void *arg1, void *arg2, boolean_t byteswap) 2055 { 2056 ztest_ds_t *zd = arg1; 2057 lr_remove_t *lr = arg2; 2058 char *name = (void *)(lr + 1); /* name follows lr */ 2059 objset_t *os = zd->zd_os; 2060 dmu_object_info_t doi; 2061 dmu_tx_t *tx; 2062 uint64_t object, txg; 2063 2064 if (byteswap) 2065 byteswap_uint64_array(lr, sizeof (*lr)); 2066 2067 ASSERT3U(lr->lr_doid, ==, ZTEST_DIROBJ); 2068 ASSERT3S(name[0], !=, '\0'); 2069 2070 VERIFY0( 2071 zap_lookup(os, lr->lr_doid, name, sizeof (object), 1, &object)); 2072 ASSERT3U(object, !=, 0); 2073 2074 ztest_object_lock(zd, object, RL_WRITER); 2075 2076 VERIFY0(dmu_object_info(os, object, &doi)); 2077 2078 tx = dmu_tx_create(os); 2079 2080 dmu_tx_hold_zap(tx, lr->lr_doid, B_FALSE, name); 2081 dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END); 2082 2083 txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); 2084 if (txg == 0) { 2085 ztest_object_unlock(zd, object); 2086 return (ENOSPC); 2087 } 2088 2089 if (doi.doi_type == DMU_OT_ZAP_OTHER) { 2090 VERIFY0(zap_destroy(os, object, tx)); 2091 } else { 2092 VERIFY0(dmu_object_free(os, object, tx)); 2093 } 2094 2095 VERIFY0(zap_remove(os, lr->lr_doid, name, tx)); 2096 2097 (void) ztest_log_remove(zd, tx, lr, object); 2098 2099 dmu_tx_commit(tx); 2100 2101 ztest_object_unlock(zd, object); 2102 2103 return (0); 2104 } 2105 2106 static int 2107 ztest_replay_write(void *arg1, void *arg2, boolean_t byteswap) 2108 { 2109 ztest_ds_t *zd = arg1; 2110 lr_write_t *lr = arg2; 2111 objset_t *os = zd->zd_os; 2112 void *data = lr + 1; /* data follows lr */ 2113 uint64_t offset, length; 2114 ztest_block_tag_t *bt = data; 2115 ztest_block_tag_t *bbt; 2116 uint64_t gen, txg, lrtxg, crtxg; 2117 dmu_object_info_t doi; 2118 dmu_tx_t *tx; 2119 dmu_buf_t *db; 2120 arc_buf_t *abuf = NULL; 2121 rl_t *rl; 2122 2123 if (byteswap) 2124 byteswap_uint64_array(lr, sizeof (*lr)); 
2125 2126 offset = lr->lr_offset; 2127 length = lr->lr_length; 2128 2129 /* If it's a dmu_sync() block, write the whole block */ 2130 if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) { 2131 uint64_t blocksize = BP_GET_LSIZE(&lr->lr_blkptr); 2132 if (length < blocksize) { 2133 offset -= offset % blocksize; 2134 length = blocksize; 2135 } 2136 } 2137 2138 if (bt->bt_magic == BSWAP_64(BT_MAGIC)) 2139 byteswap_uint64_array(bt, sizeof (*bt)); 2140 2141 if (bt->bt_magic != BT_MAGIC) 2142 bt = NULL; 2143 2144 ztest_object_lock(zd, lr->lr_foid, RL_READER); 2145 rl = ztest_range_lock(zd, lr->lr_foid, offset, length, RL_WRITER); 2146 2147 VERIFY0(dmu_bonus_hold(os, lr->lr_foid, FTAG, &db)); 2148 2149 dmu_object_info_from_db(db, &doi); 2150 2151 bbt = ztest_bt_bonus(db); 2152 ASSERT3U(bbt->bt_magic, ==, BT_MAGIC); 2153 gen = bbt->bt_gen; 2154 crtxg = bbt->bt_crtxg; 2155 lrtxg = lr->lr_common.lrc_txg; 2156 2157 tx = dmu_tx_create(os); 2158 2159 dmu_tx_hold_write(tx, lr->lr_foid, offset, length); 2160 2161 if (ztest_random(8) == 0 && length == doi.doi_data_block_size && 2162 P2PHASE(offset, length) == 0) 2163 abuf = dmu_request_arcbuf(db, length); 2164 2165 txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); 2166 if (txg == 0) { 2167 if (abuf != NULL) 2168 dmu_return_arcbuf(abuf); 2169 dmu_buf_rele(db, FTAG); 2170 ztest_range_unlock(rl); 2171 ztest_object_unlock(zd, lr->lr_foid); 2172 return (ENOSPC); 2173 } 2174 2175 if (bt != NULL) { 2176 /* 2177 * Usually, verify the old data before writing new data -- 2178 * but not always, because we also want to verify correct 2179 * behavior when the data was not recently read into cache. 2180 */ 2181 ASSERT(doi.doi_data_block_size); 2182 ASSERT0(offset % doi.doi_data_block_size); 2183 if (ztest_random(4) != 0) { 2184 int prefetch = ztest_random(2) ? 
2185 DMU_READ_PREFETCH : DMU_READ_NO_PREFETCH; 2186 ztest_block_tag_t rbt; 2187 2188 VERIFY(dmu_read(os, lr->lr_foid, offset, 2189 sizeof (rbt), &rbt, prefetch) == 0); 2190 if (rbt.bt_magic == BT_MAGIC) { 2191 ztest_bt_verify(&rbt, os, lr->lr_foid, 0, 2192 offset, gen, txg, crtxg); 2193 } 2194 } 2195 2196 /* 2197 * Writes can appear to be newer than the bonus buffer because 2198 * the ztest_get_data() callback does a dmu_read() of the 2199 * open-context data, which may be different than the data 2200 * as it was when the write was generated. 2201 */ 2202 if (zd->zd_zilog->zl_replay) { 2203 ztest_bt_verify(bt, os, lr->lr_foid, 0, offset, 2204 MAX(gen, bt->bt_gen), MAX(txg, lrtxg), 2205 bt->bt_crtxg); 2206 } 2207 2208 /* 2209 * Set the bt's gen/txg to the bonus buffer's gen/txg 2210 * so that all of the usual ASSERTs will work. 2211 */ 2212 ztest_bt_generate(bt, os, lr->lr_foid, 0, offset, gen, txg, 2213 crtxg); 2214 } 2215 2216 if (abuf == NULL) { 2217 dmu_write(os, lr->lr_foid, offset, length, data, tx); 2218 } else { 2219 memcpy(abuf->b_data, data, length); 2220 VERIFY0(dmu_assign_arcbuf_by_dbuf(db, offset, abuf, tx)); 2221 } 2222 2223 (void) ztest_log_write(zd, tx, lr); 2224 2225 dmu_buf_rele(db, FTAG); 2226 2227 dmu_tx_commit(tx); 2228 2229 ztest_range_unlock(rl); 2230 ztest_object_unlock(zd, lr->lr_foid); 2231 2232 return (0); 2233 } 2234 2235 static int 2236 ztest_replay_truncate(void *arg1, void *arg2, boolean_t byteswap) 2237 { 2238 ztest_ds_t *zd = arg1; 2239 lr_truncate_t *lr = arg2; 2240 objset_t *os = zd->zd_os; 2241 dmu_tx_t *tx; 2242 uint64_t txg; 2243 rl_t *rl; 2244 2245 if (byteswap) 2246 byteswap_uint64_array(lr, sizeof (*lr)); 2247 2248 ztest_object_lock(zd, lr->lr_foid, RL_READER); 2249 rl = ztest_range_lock(zd, lr->lr_foid, lr->lr_offset, lr->lr_length, 2250 RL_WRITER); 2251 2252 tx = dmu_tx_create(os); 2253 2254 dmu_tx_hold_free(tx, lr->lr_foid, lr->lr_offset, lr->lr_length); 2255 2256 txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); 2257 if (txg == 
0) { 2258 ztest_range_unlock(rl); 2259 ztest_object_unlock(zd, lr->lr_foid); 2260 return (ENOSPC); 2261 } 2262 2263 VERIFY0(dmu_free_range(os, lr->lr_foid, lr->lr_offset, 2264 lr->lr_length, tx)); 2265 2266 (void) ztest_log_truncate(zd, tx, lr); 2267 2268 dmu_tx_commit(tx); 2269 2270 ztest_range_unlock(rl); 2271 ztest_object_unlock(zd, lr->lr_foid); 2272 2273 return (0); 2274 } 2275 2276 static int 2277 ztest_replay_setattr(void *arg1, void *arg2, boolean_t byteswap) 2278 { 2279 ztest_ds_t *zd = arg1; 2280 lr_setattr_t *lr = arg2; 2281 objset_t *os = zd->zd_os; 2282 dmu_tx_t *tx; 2283 dmu_buf_t *db; 2284 ztest_block_tag_t *bbt; 2285 uint64_t txg, lrtxg, crtxg, dnodesize; 2286 2287 if (byteswap) 2288 byteswap_uint64_array(lr, sizeof (*lr)); 2289 2290 ztest_object_lock(zd, lr->lr_foid, RL_WRITER); 2291 2292 VERIFY0(dmu_bonus_hold(os, lr->lr_foid, FTAG, &db)); 2293 2294 tx = dmu_tx_create(os); 2295 dmu_tx_hold_bonus(tx, lr->lr_foid); 2296 2297 txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); 2298 if (txg == 0) { 2299 dmu_buf_rele(db, FTAG); 2300 ztest_object_unlock(zd, lr->lr_foid); 2301 return (ENOSPC); 2302 } 2303 2304 bbt = ztest_bt_bonus(db); 2305 ASSERT3U(bbt->bt_magic, ==, BT_MAGIC); 2306 crtxg = bbt->bt_crtxg; 2307 lrtxg = lr->lr_common.lrc_txg; 2308 dnodesize = bbt->bt_dnodesize; 2309 2310 if (zd->zd_zilog->zl_replay) { 2311 ASSERT3U(lr->lr_size, !=, 0); 2312 ASSERT3U(lr->lr_mode, !=, 0); 2313 ASSERT3U(lrtxg, !=, 0); 2314 } else { 2315 /* 2316 * Randomly change the size and increment the generation. 2317 */ 2318 lr->lr_size = (ztest_random(db->db_size / sizeof (*bbt)) + 1) * 2319 sizeof (*bbt); 2320 lr->lr_mode = bbt->bt_gen + 1; 2321 ASSERT0(lrtxg); 2322 } 2323 2324 /* 2325 * Verify that the current bonus buffer is not newer than our txg. 
2326 */ 2327 ztest_bt_verify(bbt, os, lr->lr_foid, dnodesize, -1ULL, lr->lr_mode, 2328 MAX(txg, lrtxg), crtxg); 2329 2330 dmu_buf_will_dirty(db, tx); 2331 2332 ASSERT3U(lr->lr_size, >=, sizeof (*bbt)); 2333 ASSERT3U(lr->lr_size, <=, db->db_size); 2334 VERIFY0(dmu_set_bonus(db, lr->lr_size, tx)); 2335 bbt = ztest_bt_bonus(db); 2336 2337 ztest_bt_generate(bbt, os, lr->lr_foid, dnodesize, -1ULL, lr->lr_mode, 2338 txg, crtxg); 2339 ztest_fill_unused_bonus(db, bbt, lr->lr_foid, os, bbt->bt_gen); 2340 dmu_buf_rele(db, FTAG); 2341 2342 (void) ztest_log_setattr(zd, tx, lr); 2343 2344 dmu_tx_commit(tx); 2345 2346 ztest_object_unlock(zd, lr->lr_foid); 2347 2348 return (0); 2349 } 2350 2351 static zil_replay_func_t *ztest_replay_vector[TX_MAX_TYPE] = { 2352 NULL, /* 0 no such transaction type */ 2353 ztest_replay_create, /* TX_CREATE */ 2354 NULL, /* TX_MKDIR */ 2355 NULL, /* TX_MKXATTR */ 2356 NULL, /* TX_SYMLINK */ 2357 ztest_replay_remove, /* TX_REMOVE */ 2358 NULL, /* TX_RMDIR */ 2359 NULL, /* TX_LINK */ 2360 NULL, /* TX_RENAME */ 2361 ztest_replay_write, /* TX_WRITE */ 2362 ztest_replay_truncate, /* TX_TRUNCATE */ 2363 ztest_replay_setattr, /* TX_SETATTR */ 2364 NULL, /* TX_ACL */ 2365 NULL, /* TX_CREATE_ACL */ 2366 NULL, /* TX_CREATE_ATTR */ 2367 NULL, /* TX_CREATE_ACL_ATTR */ 2368 NULL, /* TX_MKDIR_ACL */ 2369 NULL, /* TX_MKDIR_ATTR */ 2370 NULL, /* TX_MKDIR_ACL_ATTR */ 2371 NULL, /* TX_WRITE2 */ 2372 NULL, /* TX_SETSAXATTR */ 2373 NULL, /* TX_RENAME_EXCHANGE */ 2374 NULL, /* TX_RENAME_WHITEOUT */ 2375 }; 2376 2377 /* 2378 * ZIL get_data callbacks 2379 */ 2380 2381 static void 2382 ztest_get_done(zgd_t *zgd, int error) 2383 { 2384 (void) error; 2385 ztest_ds_t *zd = zgd->zgd_private; 2386 uint64_t object = ((rl_t *)zgd->zgd_lr)->rl_object; 2387 2388 if (zgd->zgd_db) 2389 dmu_buf_rele(zgd->zgd_db, zgd); 2390 2391 ztest_range_unlock((rl_t *)zgd->zgd_lr); 2392 ztest_object_unlock(zd, object); 2393 2394 umem_free(zgd, sizeof (*zgd)); 2395 } 2396 2397 static int 2398 
ztest_get_data(void *arg, uint64_t arg2, lr_write_t *lr, char *buf, 2399 struct lwb *lwb, zio_t *zio) 2400 { 2401 (void) arg2; 2402 ztest_ds_t *zd = arg; 2403 objset_t *os = zd->zd_os; 2404 uint64_t object = lr->lr_foid; 2405 uint64_t offset = lr->lr_offset; 2406 uint64_t size = lr->lr_length; 2407 uint64_t txg = lr->lr_common.lrc_txg; 2408 uint64_t crtxg; 2409 dmu_object_info_t doi; 2410 dmu_buf_t *db; 2411 zgd_t *zgd; 2412 int error; 2413 2414 ASSERT3P(lwb, !=, NULL); 2415 ASSERT3U(size, !=, 0); 2416 2417 ztest_object_lock(zd, object, RL_READER); 2418 error = dmu_bonus_hold(os, object, FTAG, &db); 2419 if (error) { 2420 ztest_object_unlock(zd, object); 2421 return (error); 2422 } 2423 2424 crtxg = ztest_bt_bonus(db)->bt_crtxg; 2425 2426 if (crtxg == 0 || crtxg > txg) { 2427 dmu_buf_rele(db, FTAG); 2428 ztest_object_unlock(zd, object); 2429 return (ENOENT); 2430 } 2431 2432 dmu_object_info_from_db(db, &doi); 2433 dmu_buf_rele(db, FTAG); 2434 db = NULL; 2435 2436 zgd = umem_zalloc(sizeof (*zgd), UMEM_NOFAIL); 2437 zgd->zgd_lwb = lwb; 2438 zgd->zgd_private = zd; 2439 2440 if (buf != NULL) { /* immediate write */ 2441 zgd->zgd_lr = (struct zfs_locked_range *)ztest_range_lock(zd, 2442 object, offset, size, RL_READER); 2443 2444 error = dmu_read(os, object, offset, size, buf, 2445 DMU_READ_NO_PREFETCH); 2446 ASSERT0(error); 2447 } else { 2448 ASSERT3P(zio, !=, NULL); 2449 size = doi.doi_data_block_size; 2450 if (ISP2(size)) { 2451 offset = P2ALIGN(offset, size); 2452 } else { 2453 ASSERT3U(offset, <, size); 2454 offset = 0; 2455 } 2456 2457 zgd->zgd_lr = (struct zfs_locked_range *)ztest_range_lock(zd, 2458 object, offset, size, RL_READER); 2459 2460 error = dmu_buf_hold(os, object, offset, zgd, &db, 2461 DMU_READ_NO_PREFETCH); 2462 2463 if (error == 0) { 2464 blkptr_t *bp = &lr->lr_blkptr; 2465 2466 zgd->zgd_db = db; 2467 zgd->zgd_bp = bp; 2468 2469 ASSERT3U(db->db_offset, ==, offset); 2470 ASSERT3U(db->db_size, ==, size); 2471 2472 error = dmu_sync(zio, 
lr->lr_common.lrc_txg, 2473 ztest_get_done, zgd); 2474 2475 if (error == 0) 2476 return (0); 2477 } 2478 } 2479 2480 ztest_get_done(zgd, error); 2481 2482 return (error); 2483 } 2484 2485 static void * 2486 ztest_lr_alloc(size_t lrsize, char *name) 2487 { 2488 char *lr; 2489 size_t namesize = name ? strlen(name) + 1 : 0; 2490 2491 lr = umem_zalloc(lrsize + namesize, UMEM_NOFAIL); 2492 2493 if (name) 2494 memcpy(lr + lrsize, name, namesize); 2495 2496 return (lr); 2497 } 2498 2499 static void 2500 ztest_lr_free(void *lr, size_t lrsize, char *name) 2501 { 2502 size_t namesize = name ? strlen(name) + 1 : 0; 2503 2504 umem_free(lr, lrsize + namesize); 2505 } 2506 2507 /* 2508 * Lookup a bunch of objects. Returns the number of objects not found. 2509 */ 2510 static int 2511 ztest_lookup(ztest_ds_t *zd, ztest_od_t *od, int count) 2512 { 2513 int missing = 0; 2514 int error; 2515 int i; 2516 2517 ASSERT(MUTEX_HELD(&zd->zd_dirobj_lock)); 2518 2519 for (i = 0; i < count; i++, od++) { 2520 od->od_object = 0; 2521 error = zap_lookup(zd->zd_os, od->od_dir, od->od_name, 2522 sizeof (uint64_t), 1, &od->od_object); 2523 if (error) { 2524 ASSERT3S(error, ==, ENOENT); 2525 ASSERT0(od->od_object); 2526 missing++; 2527 } else { 2528 dmu_buf_t *db; 2529 ztest_block_tag_t *bbt; 2530 dmu_object_info_t doi; 2531 2532 ASSERT3U(od->od_object, !=, 0); 2533 ASSERT0(missing); /* there should be no gaps */ 2534 2535 ztest_object_lock(zd, od->od_object, RL_READER); 2536 VERIFY0(dmu_bonus_hold(zd->zd_os, od->od_object, 2537 FTAG, &db)); 2538 dmu_object_info_from_db(db, &doi); 2539 bbt = ztest_bt_bonus(db); 2540 ASSERT3U(bbt->bt_magic, ==, BT_MAGIC); 2541 od->od_type = doi.doi_type; 2542 od->od_blocksize = doi.doi_data_block_size; 2543 od->od_gen = bbt->bt_gen; 2544 dmu_buf_rele(db, FTAG); 2545 ztest_object_unlock(zd, od->od_object); 2546 } 2547 } 2548 2549 return (missing); 2550 } 2551 2552 static int 2553 ztest_create(ztest_ds_t *zd, ztest_od_t *od, int count) 2554 { 2555 int missing = 0; 2556 
int i; 2557 2558 ASSERT(MUTEX_HELD(&zd->zd_dirobj_lock)); 2559 2560 for (i = 0; i < count; i++, od++) { 2561 if (missing) { 2562 od->od_object = 0; 2563 missing++; 2564 continue; 2565 } 2566 2567 lr_create_t *lr = ztest_lr_alloc(sizeof (*lr), od->od_name); 2568 2569 lr->lr_doid = od->od_dir; 2570 lr->lr_foid = 0; /* 0 to allocate, > 0 to claim */ 2571 lr->lrz_type = od->od_crtype; 2572 lr->lrz_blocksize = od->od_crblocksize; 2573 lr->lrz_ibshift = ztest_random_ibshift(); 2574 lr->lrz_bonustype = DMU_OT_UINT64_OTHER; 2575 lr->lrz_dnodesize = od->od_crdnodesize; 2576 lr->lr_gen = od->od_crgen; 2577 lr->lr_crtime[0] = time(NULL); 2578 2579 if (ztest_replay_create(zd, lr, B_FALSE) != 0) { 2580 ASSERT0(missing); 2581 od->od_object = 0; 2582 missing++; 2583 } else { 2584 od->od_object = lr->lr_foid; 2585 od->od_type = od->od_crtype; 2586 od->od_blocksize = od->od_crblocksize; 2587 od->od_gen = od->od_crgen; 2588 ASSERT3U(od->od_object, !=, 0); 2589 } 2590 2591 ztest_lr_free(lr, sizeof (*lr), od->od_name); 2592 } 2593 2594 return (missing); 2595 } 2596 2597 static int 2598 ztest_remove(ztest_ds_t *zd, ztest_od_t *od, int count) 2599 { 2600 int missing = 0; 2601 int error; 2602 int i; 2603 2604 ASSERT(MUTEX_HELD(&zd->zd_dirobj_lock)); 2605 2606 od += count - 1; 2607 2608 for (i = count - 1; i >= 0; i--, od--) { 2609 if (missing) { 2610 missing++; 2611 continue; 2612 } 2613 2614 /* 2615 * No object was found. 
2616 */ 2617 if (od->od_object == 0) 2618 continue; 2619 2620 lr_remove_t *lr = ztest_lr_alloc(sizeof (*lr), od->od_name); 2621 2622 lr->lr_doid = od->od_dir; 2623 2624 if ((error = ztest_replay_remove(zd, lr, B_FALSE)) != 0) { 2625 ASSERT3U(error, ==, ENOSPC); 2626 missing++; 2627 } else { 2628 od->od_object = 0; 2629 } 2630 ztest_lr_free(lr, sizeof (*lr), od->od_name); 2631 } 2632 2633 return (missing); 2634 } 2635 2636 static int 2637 ztest_write(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size, 2638 void *data) 2639 { 2640 lr_write_t *lr; 2641 int error; 2642 2643 lr = ztest_lr_alloc(sizeof (*lr) + size, NULL); 2644 2645 lr->lr_foid = object; 2646 lr->lr_offset = offset; 2647 lr->lr_length = size; 2648 lr->lr_blkoff = 0; 2649 BP_ZERO(&lr->lr_blkptr); 2650 2651 memcpy(lr + 1, data, size); 2652 2653 error = ztest_replay_write(zd, lr, B_FALSE); 2654 2655 ztest_lr_free(lr, sizeof (*lr) + size, NULL); 2656 2657 return (error); 2658 } 2659 2660 static int 2661 ztest_truncate(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size) 2662 { 2663 lr_truncate_t *lr; 2664 int error; 2665 2666 lr = ztest_lr_alloc(sizeof (*lr), NULL); 2667 2668 lr->lr_foid = object; 2669 lr->lr_offset = offset; 2670 lr->lr_length = size; 2671 2672 error = ztest_replay_truncate(zd, lr, B_FALSE); 2673 2674 ztest_lr_free(lr, sizeof (*lr), NULL); 2675 2676 return (error); 2677 } 2678 2679 static int 2680 ztest_setattr(ztest_ds_t *zd, uint64_t object) 2681 { 2682 lr_setattr_t *lr; 2683 int error; 2684 2685 lr = ztest_lr_alloc(sizeof (*lr), NULL); 2686 2687 lr->lr_foid = object; 2688 lr->lr_size = 0; 2689 lr->lr_mode = 0; 2690 2691 error = ztest_replay_setattr(zd, lr, B_FALSE); 2692 2693 ztest_lr_free(lr, sizeof (*lr), NULL); 2694 2695 return (error); 2696 } 2697 2698 static void 2699 ztest_prealloc(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size) 2700 { 2701 objset_t *os = zd->zd_os; 2702 dmu_tx_t *tx; 2703 uint64_t txg; 2704 rl_t *rl; 2705 2706 
txg_wait_synced(dmu_objset_pool(os), 0); 2707 2708 ztest_object_lock(zd, object, RL_READER); 2709 rl = ztest_range_lock(zd, object, offset, size, RL_WRITER); 2710 2711 tx = dmu_tx_create(os); 2712 2713 dmu_tx_hold_write(tx, object, offset, size); 2714 2715 txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); 2716 2717 if (txg != 0) { 2718 dmu_prealloc(os, object, offset, size, tx); 2719 dmu_tx_commit(tx); 2720 txg_wait_synced(dmu_objset_pool(os), txg); 2721 } else { 2722 (void) dmu_free_long_range(os, object, offset, size); 2723 } 2724 2725 ztest_range_unlock(rl); 2726 ztest_object_unlock(zd, object); 2727 } 2728 2729 static void 2730 ztest_io(ztest_ds_t *zd, uint64_t object, uint64_t offset) 2731 { 2732 int err; 2733 ztest_block_tag_t wbt; 2734 dmu_object_info_t doi; 2735 enum ztest_io_type io_type; 2736 uint64_t blocksize; 2737 void *data; 2738 2739 VERIFY0(dmu_object_info(zd->zd_os, object, &doi)); 2740 blocksize = doi.doi_data_block_size; 2741 data = umem_alloc(blocksize, UMEM_NOFAIL); 2742 2743 /* 2744 * Pick an i/o type at random, biased toward writing block tags. 2745 */ 2746 io_type = ztest_random(ZTEST_IO_TYPES); 2747 if (ztest_random(2) == 0) 2748 io_type = ZTEST_IO_WRITE_TAG; 2749 2750 (void) pthread_rwlock_rdlock(&zd->zd_zilog_lock); 2751 2752 switch (io_type) { 2753 2754 case ZTEST_IO_WRITE_TAG: 2755 ztest_bt_generate(&wbt, zd->zd_os, object, doi.doi_dnodesize, 2756 offset, 0, 0, 0); 2757 (void) ztest_write(zd, object, offset, sizeof (wbt), &wbt); 2758 break; 2759 2760 case ZTEST_IO_WRITE_PATTERN: 2761 (void) memset(data, 'a' + (object + offset) % 5, blocksize); 2762 if (ztest_random(2) == 0) { 2763 /* 2764 * Induce fletcher2 collisions to ensure that 2765 * zio_ddt_collision() detects and resolves them 2766 * when using fletcher2-verify for deduplication. 
2767 */ 2768 ((uint64_t *)data)[0] ^= 1ULL << 63; 2769 ((uint64_t *)data)[4] ^= 1ULL << 63; 2770 } 2771 (void) ztest_write(zd, object, offset, blocksize, data); 2772 break; 2773 2774 case ZTEST_IO_WRITE_ZEROES: 2775 memset(data, 0, blocksize); 2776 (void) ztest_write(zd, object, offset, blocksize, data); 2777 break; 2778 2779 case ZTEST_IO_TRUNCATE: 2780 (void) ztest_truncate(zd, object, offset, blocksize); 2781 break; 2782 2783 case ZTEST_IO_SETATTR: 2784 (void) ztest_setattr(zd, object); 2785 break; 2786 default: 2787 break; 2788 2789 case ZTEST_IO_REWRITE: 2790 (void) pthread_rwlock_rdlock(&ztest_name_lock); 2791 err = ztest_dsl_prop_set_uint64(zd->zd_name, 2792 ZFS_PROP_CHECKSUM, spa_dedup_checksum(ztest_spa), 2793 B_FALSE); 2794 ASSERT(err == 0 || err == ENOSPC); 2795 err = ztest_dsl_prop_set_uint64(zd->zd_name, 2796 ZFS_PROP_COMPRESSION, 2797 ztest_random_dsl_prop(ZFS_PROP_COMPRESSION), 2798 B_FALSE); 2799 ASSERT(err == 0 || err == ENOSPC); 2800 (void) pthread_rwlock_unlock(&ztest_name_lock); 2801 2802 VERIFY0(dmu_read(zd->zd_os, object, offset, blocksize, data, 2803 DMU_READ_NO_PREFETCH)); 2804 2805 (void) ztest_write(zd, object, offset, blocksize, data); 2806 break; 2807 } 2808 2809 (void) pthread_rwlock_unlock(&zd->zd_zilog_lock); 2810 2811 umem_free(data, blocksize); 2812 } 2813 2814 /* 2815 * Initialize an object description template. 2816 */ 2817 static void 2818 ztest_od_init(ztest_od_t *od, uint64_t id, const char *tag, uint64_t index, 2819 dmu_object_type_t type, uint64_t blocksize, uint64_t dnodesize, 2820 uint64_t gen) 2821 { 2822 od->od_dir = ZTEST_DIROBJ; 2823 od->od_object = 0; 2824 2825 od->od_crtype = type; 2826 od->od_crblocksize = blocksize ? blocksize : ztest_random_blocksize(); 2827 od->od_crdnodesize = dnodesize ? 
dnodesize : ztest_random_dnodesize(); 2828 od->od_crgen = gen; 2829 2830 od->od_type = DMU_OT_NONE; 2831 od->od_blocksize = 0; 2832 od->od_gen = 0; 2833 2834 (void) snprintf(od->od_name, sizeof (od->od_name), 2835 "%s(%"PRId64")[%"PRIu64"]", 2836 tag, id, index); 2837 } 2838 2839 /* 2840 * Lookup or create the objects for a test using the od template. 2841 * If the objects do not all exist, or if 'remove' is specified, 2842 * remove any existing objects and create new ones. Otherwise, 2843 * use the existing objects. 2844 */ 2845 static int 2846 ztest_object_init(ztest_ds_t *zd, ztest_od_t *od, size_t size, boolean_t remove) 2847 { 2848 int count = size / sizeof (*od); 2849 int rv = 0; 2850 2851 mutex_enter(&zd->zd_dirobj_lock); 2852 if ((ztest_lookup(zd, od, count) != 0 || remove) && 2853 (ztest_remove(zd, od, count) != 0 || 2854 ztest_create(zd, od, count) != 0)) 2855 rv = -1; 2856 zd->zd_od = od; 2857 mutex_exit(&zd->zd_dirobj_lock); 2858 2859 return (rv); 2860 } 2861 2862 void 2863 ztest_zil_commit(ztest_ds_t *zd, uint64_t id) 2864 { 2865 (void) id; 2866 zilog_t *zilog = zd->zd_zilog; 2867 2868 (void) pthread_rwlock_rdlock(&zd->zd_zilog_lock); 2869 2870 zil_commit(zilog, ztest_random(ZTEST_OBJECTS)); 2871 2872 /* 2873 * Remember the committed values in zd, which is in parent/child 2874 * shared memory. If we die, the next iteration of ztest_run() 2875 * will verify that the log really does contain this record. 2876 */ 2877 mutex_enter(&zilog->zl_lock); 2878 ASSERT3P(zd->zd_shared, !=, NULL); 2879 ASSERT3U(zd->zd_shared->zd_seq, <=, zilog->zl_commit_lr_seq); 2880 zd->zd_shared->zd_seq = zilog->zl_commit_lr_seq; 2881 mutex_exit(&zilog->zl_lock); 2882 2883 (void) pthread_rwlock_unlock(&zd->zd_zilog_lock); 2884 } 2885 2886 /* 2887 * This function is designed to simulate the operations that occur during a 2888 * mount/unmount operation. We hold the dataset across these operations in an 2889 * attempt to expose any implicit assumptions about ZIL management. 
2890 */ 2891 void 2892 ztest_zil_remount(ztest_ds_t *zd, uint64_t id) 2893 { 2894 (void) id; 2895 objset_t *os = zd->zd_os; 2896 2897 /* 2898 * We hold the ztest_vdev_lock so we don't cause problems with 2899 * other threads that wish to remove a log device, such as 2900 * ztest_device_removal(). 2901 */ 2902 mutex_enter(&ztest_vdev_lock); 2903 2904 /* 2905 * We grab the zd_dirobj_lock to ensure that no other thread is 2906 * updating the zil (i.e. adding in-memory log records) and the 2907 * zd_zilog_lock to block any I/O. 2908 */ 2909 mutex_enter(&zd->zd_dirobj_lock); 2910 (void) pthread_rwlock_wrlock(&zd->zd_zilog_lock); 2911 2912 /* zfsvfs_teardown() */ 2913 zil_close(zd->zd_zilog); 2914 2915 /* zfsvfs_setup() */ 2916 VERIFY3P(zil_open(os, ztest_get_data, NULL), ==, zd->zd_zilog); 2917 zil_replay(os, zd, ztest_replay_vector); 2918 2919 (void) pthread_rwlock_unlock(&zd->zd_zilog_lock); 2920 mutex_exit(&zd->zd_dirobj_lock); 2921 mutex_exit(&ztest_vdev_lock); 2922 } 2923 2924 /* 2925 * Verify that we can't destroy an active pool, create an existing pool, 2926 * or create a pool with a bad vdev spec. 2927 */ 2928 void 2929 ztest_spa_create_destroy(ztest_ds_t *zd, uint64_t id) 2930 { 2931 (void) zd, (void) id; 2932 ztest_shared_opts_t *zo = &ztest_opts; 2933 spa_t *spa; 2934 nvlist_t *nvroot; 2935 2936 if (zo->zo_mmp_test) 2937 return; 2938 2939 /* 2940 * Attempt to create using a bad file. 2941 */ 2942 nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, NULL, 0, 0, 1); 2943 VERIFY3U(ENOENT, ==, 2944 spa_create("ztest_bad_file", nvroot, NULL, NULL, NULL)); 2945 fnvlist_free(nvroot); 2946 2947 /* 2948 * Attempt to create using a bad mirror. 2949 */ 2950 nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, NULL, 0, 2, 1); 2951 VERIFY3U(ENOENT, ==, 2952 spa_create("ztest_bad_mirror", nvroot, NULL, NULL, NULL)); 2953 fnvlist_free(nvroot); 2954 2955 /* 2956 * Attempt to create an existing pool. 
It shouldn't matter 2957 * what's in the nvroot; we should fail with EEXIST. 2958 */ 2959 (void) pthread_rwlock_rdlock(&ztest_name_lock); 2960 nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, NULL, 0, 0, 1); 2961 VERIFY3U(EEXIST, ==, 2962 spa_create(zo->zo_pool, nvroot, NULL, NULL, NULL)); 2963 fnvlist_free(nvroot); 2964 2965 /* 2966 * We open a reference to the spa and then we try to export it 2967 * expecting one of the following errors: 2968 * 2969 * EBUSY 2970 * Because of the reference we just opened. 2971 * 2972 * ZFS_ERR_EXPORT_IN_PROGRESS 2973 * For the case that there is another ztest thread doing 2974 * an export concurrently. 2975 */ 2976 VERIFY0(spa_open(zo->zo_pool, &spa, FTAG)); 2977 int error = spa_destroy(zo->zo_pool); 2978 if (error != EBUSY && error != ZFS_ERR_EXPORT_IN_PROGRESS) { 2979 fatal(B_FALSE, "spa_destroy(%s) returned unexpected value %d", 2980 spa->spa_name, error); 2981 } 2982 spa_close(spa, FTAG); 2983 2984 (void) pthread_rwlock_unlock(&ztest_name_lock); 2985 } 2986 2987 /* 2988 * Start and then stop the MMP threads to ensure the startup and shutdown code 2989 * works properly. Actual protection and property-related code tested via ZTS. 2990 */ 2991 void 2992 ztest_mmp_enable_disable(ztest_ds_t *zd, uint64_t id) 2993 { 2994 (void) zd, (void) id; 2995 ztest_shared_opts_t *zo = &ztest_opts; 2996 spa_t *spa = ztest_spa; 2997 2998 if (zo->zo_mmp_test) 2999 return; 3000 3001 /* 3002 * Since enabling MMP involves setting a property, it could not be done 3003 * while the pool is suspended. 
3004 */ 3005 if (spa_suspended(spa)) 3006 return; 3007 3008 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 3009 mutex_enter(&spa->spa_props_lock); 3010 3011 zfs_multihost_fail_intervals = 0; 3012 3013 if (!spa_multihost(spa)) { 3014 spa->spa_multihost = B_TRUE; 3015 mmp_thread_start(spa); 3016 } 3017 3018 mutex_exit(&spa->spa_props_lock); 3019 spa_config_exit(spa, SCL_CONFIG, FTAG); 3020 3021 txg_wait_synced(spa_get_dsl(spa), 0); 3022 mmp_signal_all_threads(); 3023 txg_wait_synced(spa_get_dsl(spa), 0); 3024 3025 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 3026 mutex_enter(&spa->spa_props_lock); 3027 3028 if (spa_multihost(spa)) { 3029 mmp_thread_stop(spa); 3030 spa->spa_multihost = B_FALSE; 3031 } 3032 3033 mutex_exit(&spa->spa_props_lock); 3034 spa_config_exit(spa, SCL_CONFIG, FTAG); 3035 } 3036 3037 void 3038 ztest_spa_upgrade(ztest_ds_t *zd, uint64_t id) 3039 { 3040 (void) zd, (void) id; 3041 spa_t *spa; 3042 uint64_t initial_version = SPA_VERSION_INITIAL; 3043 uint64_t version, newversion; 3044 nvlist_t *nvroot, *props; 3045 char *name; 3046 3047 if (ztest_opts.zo_mmp_test) 3048 return; 3049 3050 /* dRAID added after feature flags, skip upgrade test. */ 3051 if (strcmp(ztest_opts.zo_raid_type, VDEV_TYPE_DRAID) == 0) 3052 return; 3053 3054 mutex_enter(&ztest_vdev_lock); 3055 name = kmem_asprintf("%s_upgrade", ztest_opts.zo_pool); 3056 3057 /* 3058 * Clean up from previous runs. 3059 */ 3060 (void) spa_destroy(name); 3061 3062 nvroot = make_vdev_root(NULL, NULL, name, ztest_opts.zo_vdev_size, 0, 3063 NULL, ztest_opts.zo_raid_children, ztest_opts.zo_mirrors, 1); 3064 3065 /* 3066 * If we're configuring a RAIDZ device then make sure that the 3067 * initial version is capable of supporting that feature. 
3068 */ 3069 switch (ztest_opts.zo_raid_parity) { 3070 case 0: 3071 case 1: 3072 initial_version = SPA_VERSION_INITIAL; 3073 break; 3074 case 2: 3075 initial_version = SPA_VERSION_RAIDZ2; 3076 break; 3077 case 3: 3078 initial_version = SPA_VERSION_RAIDZ3; 3079 break; 3080 } 3081 3082 /* 3083 * Create a pool with a spa version that can be upgraded. Pick 3084 * a value between initial_version and SPA_VERSION_BEFORE_FEATURES. 3085 */ 3086 do { 3087 version = ztest_random_spa_version(initial_version); 3088 } while (version > SPA_VERSION_BEFORE_FEATURES); 3089 3090 props = fnvlist_alloc(); 3091 fnvlist_add_uint64(props, 3092 zpool_prop_to_name(ZPOOL_PROP_VERSION), version); 3093 VERIFY0(spa_create(name, nvroot, props, NULL, NULL)); 3094 fnvlist_free(nvroot); 3095 fnvlist_free(props); 3096 3097 VERIFY0(spa_open(name, &spa, FTAG)); 3098 VERIFY3U(spa_version(spa), ==, version); 3099 newversion = ztest_random_spa_version(version + 1); 3100 3101 if (ztest_opts.zo_verbose >= 4) { 3102 (void) printf("upgrading spa version from " 3103 "%"PRIu64" to %"PRIu64"\n", 3104 version, newversion); 3105 } 3106 3107 spa_upgrade(spa, newversion); 3108 VERIFY3U(spa_version(spa), >, version); 3109 VERIFY3U(spa_version(spa), ==, fnvlist_lookup_uint64(spa->spa_config, 3110 zpool_prop_to_name(ZPOOL_PROP_VERSION))); 3111 spa_close(spa, FTAG); 3112 3113 kmem_strfree(name); 3114 mutex_exit(&ztest_vdev_lock); 3115 } 3116 3117 static void 3118 ztest_spa_checkpoint(spa_t *spa) 3119 { 3120 ASSERT(MUTEX_HELD(&ztest_checkpoint_lock)); 3121 3122 int error = spa_checkpoint(spa->spa_name); 3123 3124 switch (error) { 3125 case 0: 3126 case ZFS_ERR_DEVRM_IN_PROGRESS: 3127 case ZFS_ERR_DISCARDING_CHECKPOINT: 3128 case ZFS_ERR_CHECKPOINT_EXISTS: 3129 break; 3130 case ENOSPC: 3131 ztest_record_enospc(FTAG); 3132 break; 3133 default: 3134 fatal(B_FALSE, "spa_checkpoint(%s) = %d", spa->spa_name, error); 3135 } 3136 } 3137 3138 static void 3139 ztest_spa_discard_checkpoint(spa_t *spa) 3140 { 3141 
ASSERT(MUTEX_HELD(&ztest_checkpoint_lock)); 3142 3143 int error = spa_checkpoint_discard(spa->spa_name); 3144 3145 switch (error) { 3146 case 0: 3147 case ZFS_ERR_DISCARDING_CHECKPOINT: 3148 case ZFS_ERR_NO_CHECKPOINT: 3149 break; 3150 default: 3151 fatal(B_FALSE, "spa_discard_checkpoint(%s) = %d", 3152 spa->spa_name, error); 3153 } 3154 3155 } 3156 3157 void 3158 ztest_spa_checkpoint_create_discard(ztest_ds_t *zd, uint64_t id) 3159 { 3160 (void) zd, (void) id; 3161 spa_t *spa = ztest_spa; 3162 3163 mutex_enter(&ztest_checkpoint_lock); 3164 if (ztest_random(2) == 0) { 3165 ztest_spa_checkpoint(spa); 3166 } else { 3167 ztest_spa_discard_checkpoint(spa); 3168 } 3169 mutex_exit(&ztest_checkpoint_lock); 3170 } 3171 3172 3173 static vdev_t * 3174 vdev_lookup_by_path(vdev_t *vd, const char *path) 3175 { 3176 vdev_t *mvd; 3177 int c; 3178 3179 if (vd->vdev_path != NULL && strcmp(path, vd->vdev_path) == 0) 3180 return (vd); 3181 3182 for (c = 0; c < vd->vdev_children; c++) 3183 if ((mvd = vdev_lookup_by_path(vd->vdev_child[c], path)) != 3184 NULL) 3185 return (mvd); 3186 3187 return (NULL); 3188 } 3189 3190 static int 3191 spa_num_top_vdevs(spa_t *spa) 3192 { 3193 vdev_t *rvd = spa->spa_root_vdev; 3194 ASSERT3U(spa_config_held(spa, SCL_VDEV, RW_READER), ==, SCL_VDEV); 3195 return (rvd->vdev_children); 3196 } 3197 3198 /* 3199 * Verify that vdev_add() works as expected. 
3200 */ 3201 void 3202 ztest_vdev_add_remove(ztest_ds_t *zd, uint64_t id) 3203 { 3204 (void) zd, (void) id; 3205 ztest_shared_t *zs = ztest_shared; 3206 spa_t *spa = ztest_spa; 3207 uint64_t leaves; 3208 uint64_t guid; 3209 nvlist_t *nvroot; 3210 int error; 3211 3212 if (ztest_opts.zo_mmp_test) 3213 return; 3214 3215 mutex_enter(&ztest_vdev_lock); 3216 leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * 3217 ztest_opts.zo_raid_children; 3218 3219 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 3220 3221 ztest_shared->zs_vdev_next_leaf = spa_num_top_vdevs(spa) * leaves; 3222 3223 /* 3224 * If we have slogs then remove them 1/4 of the time. 3225 */ 3226 if (spa_has_slogs(spa) && ztest_random(4) == 0) { 3227 metaslab_group_t *mg; 3228 3229 /* 3230 * find the first real slog in log allocation class 3231 */ 3232 mg = spa_log_class(spa)->mc_allocator[0].mca_rotor; 3233 while (!mg->mg_vd->vdev_islog) 3234 mg = mg->mg_next; 3235 3236 guid = mg->mg_vd->vdev_guid; 3237 3238 spa_config_exit(spa, SCL_VDEV, FTAG); 3239 3240 /* 3241 * We have to grab the zs_name_lock as writer to 3242 * prevent a race between removing a slog (dmu_objset_find) 3243 * and destroying a dataset. Removing the slog will 3244 * grab a reference on the dataset which may cause 3245 * dsl_destroy_head() to fail with EBUSY thus 3246 * leaving the dataset in an inconsistent state. 
3247 */ 3248 pthread_rwlock_wrlock(&ztest_name_lock); 3249 error = spa_vdev_remove(spa, guid, B_FALSE); 3250 pthread_rwlock_unlock(&ztest_name_lock); 3251 3252 switch (error) { 3253 case 0: 3254 case EEXIST: /* Generic zil_reset() error */ 3255 case EBUSY: /* Replay required */ 3256 case EACCES: /* Crypto key not loaded */ 3257 case ZFS_ERR_CHECKPOINT_EXISTS: 3258 case ZFS_ERR_DISCARDING_CHECKPOINT: 3259 break; 3260 default: 3261 fatal(B_FALSE, "spa_vdev_remove() = %d", error); 3262 } 3263 } else { 3264 spa_config_exit(spa, SCL_VDEV, FTAG); 3265 3266 /* 3267 * Make 1/4 of the devices be log devices 3268 */ 3269 nvroot = make_vdev_root(NULL, NULL, NULL, 3270 ztest_opts.zo_vdev_size, 0, (ztest_random(4) == 0) ? 3271 "log" : NULL, ztest_opts.zo_raid_children, zs->zs_mirrors, 3272 1); 3273 3274 error = spa_vdev_add(spa, nvroot); 3275 fnvlist_free(nvroot); 3276 3277 switch (error) { 3278 case 0: 3279 break; 3280 case ENOSPC: 3281 ztest_record_enospc("spa_vdev_add"); 3282 break; 3283 default: 3284 fatal(B_FALSE, "spa_vdev_add() = %d", error); 3285 } 3286 } 3287 3288 mutex_exit(&ztest_vdev_lock); 3289 } 3290 3291 void 3292 ztest_vdev_class_add(ztest_ds_t *zd, uint64_t id) 3293 { 3294 (void) zd, (void) id; 3295 ztest_shared_t *zs = ztest_shared; 3296 spa_t *spa = ztest_spa; 3297 uint64_t leaves; 3298 nvlist_t *nvroot; 3299 const char *class = (ztest_random(2) == 0) ? 
3300 VDEV_ALLOC_BIAS_SPECIAL : VDEV_ALLOC_BIAS_DEDUP; 3301 int error; 3302 3303 /* 3304 * By default add a special vdev 50% of the time 3305 */ 3306 if ((ztest_opts.zo_special_vdevs == ZTEST_VDEV_CLASS_OFF) || 3307 (ztest_opts.zo_special_vdevs == ZTEST_VDEV_CLASS_RND && 3308 ztest_random(2) == 0)) { 3309 return; 3310 } 3311 3312 mutex_enter(&ztest_vdev_lock); 3313 3314 /* Only test with mirrors */ 3315 if (zs->zs_mirrors < 2) { 3316 mutex_exit(&ztest_vdev_lock); 3317 return; 3318 } 3319 3320 /* requires feature@allocation_classes */ 3321 if (!spa_feature_is_enabled(spa, SPA_FEATURE_ALLOCATION_CLASSES)) { 3322 mutex_exit(&ztest_vdev_lock); 3323 return; 3324 } 3325 3326 leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * 3327 ztest_opts.zo_raid_children; 3328 3329 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 3330 ztest_shared->zs_vdev_next_leaf = spa_num_top_vdevs(spa) * leaves; 3331 spa_config_exit(spa, SCL_VDEV, FTAG); 3332 3333 nvroot = make_vdev_root(NULL, NULL, NULL, ztest_opts.zo_vdev_size, 0, 3334 class, ztest_opts.zo_raid_children, zs->zs_mirrors, 1); 3335 3336 error = spa_vdev_add(spa, nvroot); 3337 fnvlist_free(nvroot); 3338 3339 if (error == ENOSPC) 3340 ztest_record_enospc("spa_vdev_add"); 3341 else if (error != 0) 3342 fatal(B_FALSE, "spa_vdev_add() = %d", error); 3343 3344 /* 3345 * 50% of the time allow small blocks in the special class 3346 */ 3347 if (error == 0 && 3348 spa_special_class(spa)->mc_groups == 1 && ztest_random(2) == 0) { 3349 if (ztest_opts.zo_verbose >= 3) 3350 (void) printf("Enabling special VDEV small blocks\n"); 3351 error = ztest_dsl_prop_set_uint64(zd->zd_name, 3352 ZFS_PROP_SPECIAL_SMALL_BLOCKS, 32768, B_FALSE); 3353 ASSERT(error == 0 || error == ENOSPC); 3354 } 3355 3356 mutex_exit(&ztest_vdev_lock); 3357 3358 if (ztest_opts.zo_verbose >= 3) { 3359 metaslab_class_t *mc; 3360 3361 if (strcmp(class, VDEV_ALLOC_BIAS_SPECIAL) == 0) 3362 mc = spa_special_class(spa); 3363 else 3364 mc = spa_dedup_class(spa); 3365 (void) 
printf("Added a %s mirrored vdev (of %d)\n", 3366 class, (int)mc->mc_groups); 3367 } 3368 } 3369 3370 /* 3371 * Verify that adding/removing aux devices (l2arc, hot spare) works as expected. 3372 */ 3373 void 3374 ztest_vdev_aux_add_remove(ztest_ds_t *zd, uint64_t id) 3375 { 3376 (void) zd, (void) id; 3377 ztest_shared_t *zs = ztest_shared; 3378 spa_t *spa = ztest_spa; 3379 vdev_t *rvd = spa->spa_root_vdev; 3380 spa_aux_vdev_t *sav; 3381 const char *aux; 3382 char *path; 3383 uint64_t guid = 0; 3384 int error, ignore_err = 0; 3385 3386 if (ztest_opts.zo_mmp_test) 3387 return; 3388 3389 path = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); 3390 3391 if (ztest_random(2) == 0) { 3392 sav = &spa->spa_spares; 3393 aux = ZPOOL_CONFIG_SPARES; 3394 } else { 3395 sav = &spa->spa_l2cache; 3396 aux = ZPOOL_CONFIG_L2CACHE; 3397 } 3398 3399 mutex_enter(&ztest_vdev_lock); 3400 3401 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 3402 3403 if (sav->sav_count != 0 && ztest_random(4) == 0) { 3404 /* 3405 * Pick a random device to remove. 3406 */ 3407 vdev_t *svd = sav->sav_vdevs[ztest_random(sav->sav_count)]; 3408 3409 /* dRAID spares cannot be removed; try anyways to see ENOTSUP */ 3410 if (strstr(svd->vdev_path, VDEV_TYPE_DRAID) != NULL) 3411 ignore_err = ENOTSUP; 3412 3413 guid = svd->vdev_guid; 3414 } else { 3415 /* 3416 * Find an unused device we can add. 3417 */ 3418 zs->zs_vdev_aux = 0; 3419 for (;;) { 3420 int c; 3421 (void) snprintf(path, MAXPATHLEN, ztest_aux_template, 3422 ztest_opts.zo_dir, ztest_opts.zo_pool, aux, 3423 zs->zs_vdev_aux); 3424 for (c = 0; c < sav->sav_count; c++) 3425 if (strcmp(sav->sav_vdevs[c]->vdev_path, 3426 path) == 0) 3427 break; 3428 if (c == sav->sav_count && 3429 vdev_lookup_by_path(rvd, path) == NULL) 3430 break; 3431 zs->zs_vdev_aux++; 3432 } 3433 } 3434 3435 spa_config_exit(spa, SCL_VDEV, FTAG); 3436 3437 if (guid == 0) { 3438 /* 3439 * Add a new device. 
3440 */ 3441 nvlist_t *nvroot = make_vdev_root(NULL, aux, NULL, 3442 (ztest_opts.zo_vdev_size * 5) / 4, 0, NULL, 0, 0, 1); 3443 error = spa_vdev_add(spa, nvroot); 3444 3445 switch (error) { 3446 case 0: 3447 break; 3448 default: 3449 fatal(B_FALSE, "spa_vdev_add(%p) = %d", nvroot, error); 3450 } 3451 fnvlist_free(nvroot); 3452 } else { 3453 /* 3454 * Remove an existing device. Sometimes, dirty its 3455 * vdev state first to make sure we handle removal 3456 * of devices that have pending state changes. 3457 */ 3458 if (ztest_random(2) == 0) 3459 (void) vdev_online(spa, guid, 0, NULL); 3460 3461 error = spa_vdev_remove(spa, guid, B_FALSE); 3462 3463 switch (error) { 3464 case 0: 3465 case EBUSY: 3466 case ZFS_ERR_CHECKPOINT_EXISTS: 3467 case ZFS_ERR_DISCARDING_CHECKPOINT: 3468 break; 3469 default: 3470 if (error != ignore_err) 3471 fatal(B_FALSE, 3472 "spa_vdev_remove(%"PRIu64") = %d", 3473 guid, error); 3474 } 3475 } 3476 3477 mutex_exit(&ztest_vdev_lock); 3478 3479 umem_free(path, MAXPATHLEN); 3480 } 3481 3482 /* 3483 * split a pool if it has mirror tlvdevs 3484 */ 3485 void 3486 ztest_split_pool(ztest_ds_t *zd, uint64_t id) 3487 { 3488 (void) zd, (void) id; 3489 ztest_shared_t *zs = ztest_shared; 3490 spa_t *spa = ztest_spa; 3491 vdev_t *rvd = spa->spa_root_vdev; 3492 nvlist_t *tree, **child, *config, *split, **schild; 3493 uint_t c, children, schildren = 0, lastlogid = 0; 3494 int error = 0; 3495 3496 if (ztest_opts.zo_mmp_test) 3497 return; 3498 3499 mutex_enter(&ztest_vdev_lock); 3500 3501 /* ensure we have a usable config; mirrors of raidz aren't supported */ 3502 if (zs->zs_mirrors < 3 || ztest_opts.zo_raid_children > 1) { 3503 mutex_exit(&ztest_vdev_lock); 3504 return; 3505 } 3506 3507 /* clean up the old pool, if any */ 3508 (void) spa_destroy("splitp"); 3509 3510 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 3511 3512 /* generate a config from the existing config */ 3513 mutex_enter(&spa->spa_props_lock); 3514 tree = 
fnvlist_lookup_nvlist(spa->spa_config, ZPOOL_CONFIG_VDEV_TREE); 3515 mutex_exit(&spa->spa_props_lock); 3516 3517 VERIFY0(nvlist_lookup_nvlist_array(tree, ZPOOL_CONFIG_CHILDREN, 3518 &child, &children)); 3519 3520 schild = umem_alloc(rvd->vdev_children * sizeof (nvlist_t *), 3521 UMEM_NOFAIL); 3522 for (c = 0; c < children; c++) { 3523 vdev_t *tvd = rvd->vdev_child[c]; 3524 nvlist_t **mchild; 3525 uint_t mchildren; 3526 3527 if (tvd->vdev_islog || tvd->vdev_ops == &vdev_hole_ops) { 3528 schild[schildren] = fnvlist_alloc(); 3529 fnvlist_add_string(schild[schildren], 3530 ZPOOL_CONFIG_TYPE, VDEV_TYPE_HOLE); 3531 fnvlist_add_uint64(schild[schildren], 3532 ZPOOL_CONFIG_IS_HOLE, 1); 3533 if (lastlogid == 0) 3534 lastlogid = schildren; 3535 ++schildren; 3536 continue; 3537 } 3538 lastlogid = 0; 3539 VERIFY0(nvlist_lookup_nvlist_array(child[c], 3540 ZPOOL_CONFIG_CHILDREN, &mchild, &mchildren)); 3541 schild[schildren++] = fnvlist_dup(mchild[0]); 3542 } 3543 3544 /* OK, create a config that can be used to split */ 3545 split = fnvlist_alloc(); 3546 fnvlist_add_string(split, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT); 3547 fnvlist_add_nvlist_array(split, ZPOOL_CONFIG_CHILDREN, 3548 (const nvlist_t **)schild, lastlogid != 0 ? 
lastlogid : schildren); 3549 3550 config = fnvlist_alloc(); 3551 fnvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, split); 3552 3553 for (c = 0; c < schildren; c++) 3554 fnvlist_free(schild[c]); 3555 umem_free(schild, rvd->vdev_children * sizeof (nvlist_t *)); 3556 fnvlist_free(split); 3557 3558 spa_config_exit(spa, SCL_VDEV, FTAG); 3559 3560 (void) pthread_rwlock_wrlock(&ztest_name_lock); 3561 error = spa_vdev_split_mirror(spa, "splitp", config, NULL, B_FALSE); 3562 (void) pthread_rwlock_unlock(&ztest_name_lock); 3563 3564 fnvlist_free(config); 3565 3566 if (error == 0) { 3567 (void) printf("successful split - results:\n"); 3568 mutex_enter(&spa_namespace_lock); 3569 show_pool_stats(spa); 3570 show_pool_stats(spa_lookup("splitp")); 3571 mutex_exit(&spa_namespace_lock); 3572 ++zs->zs_splits; 3573 --zs->zs_mirrors; 3574 } 3575 mutex_exit(&ztest_vdev_lock); 3576 } 3577 3578 /* 3579 * Verify that we can attach and detach devices. 3580 */ 3581 void 3582 ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id) 3583 { 3584 (void) zd, (void) id; 3585 ztest_shared_t *zs = ztest_shared; 3586 spa_t *spa = ztest_spa; 3587 spa_aux_vdev_t *sav = &spa->spa_spares; 3588 vdev_t *rvd = spa->spa_root_vdev; 3589 vdev_t *oldvd, *newvd, *pvd; 3590 nvlist_t *root; 3591 uint64_t leaves; 3592 uint64_t leaf, top; 3593 uint64_t ashift = ztest_get_ashift(); 3594 uint64_t oldguid, pguid; 3595 uint64_t oldsize, newsize; 3596 char *oldpath, *newpath; 3597 int replacing; 3598 int oldvd_has_siblings = B_FALSE; 3599 int newvd_is_spare = B_FALSE; 3600 int newvd_is_dspare = B_FALSE; 3601 int oldvd_is_log; 3602 int oldvd_is_special; 3603 int error, expected_error; 3604 3605 if (ztest_opts.zo_mmp_test) 3606 return; 3607 3608 oldpath = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); 3609 newpath = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); 3610 3611 mutex_enter(&ztest_vdev_lock); 3612 leaves = MAX(zs->zs_mirrors, 1) * ztest_opts.zo_raid_children; 3613 3614 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3615 3616 /* 
3617 * If a vdev is in the process of being removed, its removal may 3618 * finish while we are in progress, leading to an unexpected error 3619 * value. Don't bother trying to attach while we are in the middle 3620 * of removal. 3621 */ 3622 if (ztest_device_removal_active) { 3623 spa_config_exit(spa, SCL_ALL, FTAG); 3624 goto out; 3625 } 3626 3627 /* 3628 * Decide whether to do an attach or a replace. 3629 */ 3630 replacing = ztest_random(2); 3631 3632 /* 3633 * Pick a random top-level vdev. 3634 */ 3635 top = ztest_random_vdev_top(spa, B_TRUE); 3636 3637 /* 3638 * Pick a random leaf within it. 3639 */ 3640 leaf = ztest_random(leaves); 3641 3642 /* 3643 * Locate this vdev. 3644 */ 3645 oldvd = rvd->vdev_child[top]; 3646 3647 /* pick a child from the mirror */ 3648 if (zs->zs_mirrors >= 1) { 3649 ASSERT3P(oldvd->vdev_ops, ==, &vdev_mirror_ops); 3650 ASSERT3U(oldvd->vdev_children, >=, zs->zs_mirrors); 3651 oldvd = oldvd->vdev_child[leaf / ztest_opts.zo_raid_children]; 3652 } 3653 3654 /* pick a child out of the raidz group */ 3655 if (ztest_opts.zo_raid_children > 1) { 3656 if (strcmp(oldvd->vdev_ops->vdev_op_type, "raidz") == 0) 3657 ASSERT3P(oldvd->vdev_ops, ==, &vdev_raidz_ops); 3658 else 3659 ASSERT3P(oldvd->vdev_ops, ==, &vdev_draid_ops); 3660 ASSERT3U(oldvd->vdev_children, ==, ztest_opts.zo_raid_children); 3661 oldvd = oldvd->vdev_child[leaf % ztest_opts.zo_raid_children]; 3662 } 3663 3664 /* 3665 * If we're already doing an attach or replace, oldvd may be a 3666 * mirror vdev -- in which case, pick a random child. 
3667 */ 3668 while (oldvd->vdev_children != 0) { 3669 oldvd_has_siblings = B_TRUE; 3670 ASSERT3U(oldvd->vdev_children, >=, 2); 3671 oldvd = oldvd->vdev_child[ztest_random(oldvd->vdev_children)]; 3672 } 3673 3674 oldguid = oldvd->vdev_guid; 3675 oldsize = vdev_get_min_asize(oldvd); 3676 oldvd_is_log = oldvd->vdev_top->vdev_islog; 3677 oldvd_is_special = 3678 oldvd->vdev_top->vdev_alloc_bias == VDEV_BIAS_SPECIAL || 3679 oldvd->vdev_top->vdev_alloc_bias == VDEV_BIAS_DEDUP; 3680 (void) strlcpy(oldpath, oldvd->vdev_path, MAXPATHLEN); 3681 pvd = oldvd->vdev_parent; 3682 pguid = pvd->vdev_guid; 3683 3684 /* 3685 * If oldvd has siblings, then half of the time, detach it. Prior 3686 * to the detach the pool is scrubbed in order to prevent creating 3687 * unrepairable blocks as a result of the data corruption injection. 3688 */ 3689 if (oldvd_has_siblings && ztest_random(2) == 0) { 3690 spa_config_exit(spa, SCL_ALL, FTAG); 3691 3692 error = ztest_scrub_impl(spa); 3693 if (error) 3694 goto out; 3695 3696 error = spa_vdev_detach(spa, oldguid, pguid, B_FALSE); 3697 if (error != 0 && error != ENODEV && error != EBUSY && 3698 error != ENOTSUP && error != ZFS_ERR_CHECKPOINT_EXISTS && 3699 error != ZFS_ERR_DISCARDING_CHECKPOINT) 3700 fatal(B_FALSE, "detach (%s) returned %d", 3701 oldpath, error); 3702 goto out; 3703 } 3704 3705 /* 3706 * For the new vdev, choose with equal probability between the two 3707 * standard paths (ending in either 'a' or 'b') or a random hot spare. 
3708 */ 3709 if (sav->sav_count != 0 && ztest_random(3) == 0) { 3710 newvd = sav->sav_vdevs[ztest_random(sav->sav_count)]; 3711 newvd_is_spare = B_TRUE; 3712 3713 if (newvd->vdev_ops == &vdev_draid_spare_ops) 3714 newvd_is_dspare = B_TRUE; 3715 3716 (void) strlcpy(newpath, newvd->vdev_path, MAXPATHLEN); 3717 } else { 3718 (void) snprintf(newpath, MAXPATHLEN, ztest_dev_template, 3719 ztest_opts.zo_dir, ztest_opts.zo_pool, 3720 top * leaves + leaf); 3721 if (ztest_random(2) == 0) 3722 newpath[strlen(newpath) - 1] = 'b'; 3723 newvd = vdev_lookup_by_path(rvd, newpath); 3724 } 3725 3726 if (newvd) { 3727 /* 3728 * Reopen to ensure the vdev's asize field isn't stale. 3729 */ 3730 vdev_reopen(newvd); 3731 newsize = vdev_get_min_asize(newvd); 3732 } else { 3733 /* 3734 * Make newsize a little bigger or smaller than oldsize. 3735 * If it's smaller, the attach should fail. 3736 * If it's larger, and we're doing a replace, 3737 * we should get dynamic LUN growth when we're done. 3738 */ 3739 newsize = 10 * oldsize / (9 + ztest_random(3)); 3740 } 3741 3742 /* 3743 * If pvd is not a mirror or root, the attach should fail with ENOTSUP, 3744 * unless it's a replace; in that case any non-replacing parent is OK. 3745 * 3746 * If newvd is already part of the pool, it should fail with EBUSY. 3747 * 3748 * If newvd is too small, it should fail with EOVERFLOW. 3749 * 3750 * If newvd is a distributed spare and it's being attached to a 3751 * dRAID which is not its parent it should fail with EINVAL. 3752 */ 3753 if (pvd->vdev_ops != &vdev_mirror_ops && 3754 pvd->vdev_ops != &vdev_root_ops && (!replacing || 3755 pvd->vdev_ops == &vdev_replacing_ops || 3756 pvd->vdev_ops == &vdev_spare_ops)) 3757 expected_error = ENOTSUP; 3758 else if (newvd_is_spare && 3759 (!replacing || oldvd_is_log || oldvd_is_special)) 3760 expected_error = ENOTSUP; 3761 else if (newvd == oldvd) 3762 expected_error = replacing ? 
0 : EBUSY; 3763 else if (vdev_lookup_by_path(rvd, newpath) != NULL) 3764 expected_error = EBUSY; 3765 else if (!newvd_is_dspare && newsize < oldsize) 3766 expected_error = EOVERFLOW; 3767 else if (ashift > oldvd->vdev_top->vdev_ashift) 3768 expected_error = EDOM; 3769 else if (newvd_is_dspare && pvd != vdev_draid_spare_get_parent(newvd)) 3770 expected_error = ENOTSUP; 3771 else 3772 expected_error = 0; 3773 3774 spa_config_exit(spa, SCL_ALL, FTAG); 3775 3776 /* 3777 * Build the nvlist describing newpath. 3778 */ 3779 root = make_vdev_root(newpath, NULL, NULL, newvd == NULL ? newsize : 0, 3780 ashift, NULL, 0, 0, 1); 3781 3782 /* 3783 * When supported select either a healing or sequential resilver. 3784 */ 3785 boolean_t rebuilding = B_FALSE; 3786 if (pvd->vdev_ops == &vdev_mirror_ops || 3787 pvd->vdev_ops == &vdev_root_ops) { 3788 rebuilding = !!ztest_random(2); 3789 } 3790 3791 error = spa_vdev_attach(spa, oldguid, root, replacing, rebuilding); 3792 3793 fnvlist_free(root); 3794 3795 /* 3796 * If our parent was the replacing vdev, but the replace completed, 3797 * then instead of failing with ENOTSUP we may either succeed, 3798 * fail with ENODEV, or fail with EOVERFLOW. 3799 */ 3800 if (expected_error == ENOTSUP && 3801 (error == 0 || error == ENODEV || error == EOVERFLOW)) 3802 expected_error = error; 3803 3804 /* 3805 * If someone grew the LUN, the replacement may be too small. 
3806 */ 3807 if (error == EOVERFLOW || error == EBUSY) 3808 expected_error = error; 3809 3810 if (error == ZFS_ERR_CHECKPOINT_EXISTS || 3811 error == ZFS_ERR_DISCARDING_CHECKPOINT || 3812 error == ZFS_ERR_RESILVER_IN_PROGRESS || 3813 error == ZFS_ERR_REBUILD_IN_PROGRESS) 3814 expected_error = error; 3815 3816 if (error != expected_error && expected_error != EBUSY) { 3817 fatal(B_FALSE, "attach (%s %"PRIu64", %s %"PRIu64", %d) " 3818 "returned %d, expected %d", 3819 oldpath, oldsize, newpath, 3820 newsize, replacing, error, expected_error); 3821 } 3822 out: 3823 mutex_exit(&ztest_vdev_lock); 3824 3825 umem_free(oldpath, MAXPATHLEN); 3826 umem_free(newpath, MAXPATHLEN); 3827 } 3828 3829 void 3830 ztest_device_removal(ztest_ds_t *zd, uint64_t id) 3831 { 3832 (void) zd, (void) id; 3833 spa_t *spa = ztest_spa; 3834 vdev_t *vd; 3835 uint64_t guid; 3836 int error; 3837 3838 mutex_enter(&ztest_vdev_lock); 3839 3840 if (ztest_device_removal_active) { 3841 mutex_exit(&ztest_vdev_lock); 3842 return; 3843 } 3844 3845 /* 3846 * Remove a random top-level vdev and wait for removal to finish. 3847 */ 3848 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 3849 vd = vdev_lookup_top(spa, ztest_random_vdev_top(spa, B_FALSE)); 3850 guid = vd->vdev_guid; 3851 spa_config_exit(spa, SCL_VDEV, FTAG); 3852 3853 error = spa_vdev_remove(spa, guid, B_FALSE); 3854 if (error == 0) { 3855 ztest_device_removal_active = B_TRUE; 3856 mutex_exit(&ztest_vdev_lock); 3857 3858 /* 3859 * spa->spa_vdev_removal is created in a sync task that 3860 * is initiated via dsl_sync_task_nowait(). Since the 3861 * task may not run before spa_vdev_remove() returns, we 3862 * must wait at least 1 txg to ensure that the removal 3863 * struct has been created. 
3864 */ 3865 txg_wait_synced(spa_get_dsl(spa), 0); 3866 3867 while (spa->spa_removing_phys.sr_state == DSS_SCANNING) 3868 txg_wait_synced(spa_get_dsl(spa), 0); 3869 } else { 3870 mutex_exit(&ztest_vdev_lock); 3871 return; 3872 } 3873 3874 /* 3875 * The pool needs to be scrubbed after completing device removal. 3876 * Failure to do so may result in checksum errors due to the 3877 * strategy employed by ztest_fault_inject() when selecting which 3878 * offset are redundant and can be damaged. 3879 */ 3880 error = spa_scan(spa, POOL_SCAN_SCRUB); 3881 if (error == 0) { 3882 while (dsl_scan_scrubbing(spa_get_dsl(spa))) 3883 txg_wait_synced(spa_get_dsl(spa), 0); 3884 } 3885 3886 mutex_enter(&ztest_vdev_lock); 3887 ztest_device_removal_active = B_FALSE; 3888 mutex_exit(&ztest_vdev_lock); 3889 } 3890 3891 /* 3892 * Callback function which expands the physical size of the vdev. 3893 */ 3894 static vdev_t * 3895 grow_vdev(vdev_t *vd, void *arg) 3896 { 3897 spa_t *spa __maybe_unused = vd->vdev_spa; 3898 size_t *newsize = arg; 3899 size_t fsize; 3900 int fd; 3901 3902 ASSERT3S(spa_config_held(spa, SCL_STATE, RW_READER), ==, SCL_STATE); 3903 ASSERT(vd->vdev_ops->vdev_op_leaf); 3904 3905 if ((fd = open(vd->vdev_path, O_RDWR)) == -1) 3906 return (vd); 3907 3908 fsize = lseek(fd, 0, SEEK_END); 3909 VERIFY0(ftruncate(fd, *newsize)); 3910 3911 if (ztest_opts.zo_verbose >= 6) { 3912 (void) printf("%s grew from %lu to %lu bytes\n", 3913 vd->vdev_path, (ulong_t)fsize, (ulong_t)*newsize); 3914 } 3915 (void) close(fd); 3916 return (NULL); 3917 } 3918 3919 /* 3920 * Callback function which expands a given vdev by calling vdev_online(). 
3921 */ 3922 static vdev_t * 3923 online_vdev(vdev_t *vd, void *arg) 3924 { 3925 (void) arg; 3926 spa_t *spa = vd->vdev_spa; 3927 vdev_t *tvd = vd->vdev_top; 3928 uint64_t guid = vd->vdev_guid; 3929 uint64_t generation = spa->spa_config_generation + 1; 3930 vdev_state_t newstate = VDEV_STATE_UNKNOWN; 3931 int error; 3932 3933 ASSERT3S(spa_config_held(spa, SCL_STATE, RW_READER), ==, SCL_STATE); 3934 ASSERT(vd->vdev_ops->vdev_op_leaf); 3935 3936 /* Calling vdev_online will initialize the new metaslabs */ 3937 spa_config_exit(spa, SCL_STATE, spa); 3938 error = vdev_online(spa, guid, ZFS_ONLINE_EXPAND, &newstate); 3939 spa_config_enter(spa, SCL_STATE, spa, RW_READER); 3940 3941 /* 3942 * If vdev_online returned an error or the underlying vdev_open 3943 * failed then we abort the expand. The only way to know that 3944 * vdev_open fails is by checking the returned newstate. 3945 */ 3946 if (error || newstate != VDEV_STATE_HEALTHY) { 3947 if (ztest_opts.zo_verbose >= 5) { 3948 (void) printf("Unable to expand vdev, state %u, " 3949 "error %d\n", newstate, error); 3950 } 3951 return (vd); 3952 } 3953 ASSERT3U(newstate, ==, VDEV_STATE_HEALTHY); 3954 3955 /* 3956 * Since we dropped the lock we need to ensure that we're 3957 * still talking to the original vdev. It's possible this 3958 * vdev may have been detached/replaced while we were 3959 * trying to online it. 3960 */ 3961 if (generation != spa->spa_config_generation) { 3962 if (ztest_opts.zo_verbose >= 5) { 3963 (void) printf("vdev configuration has changed, " 3964 "guid %"PRIu64", state %"PRIu64", " 3965 "expected gen %"PRIu64", got gen %"PRIu64"\n", 3966 guid, 3967 tvd->vdev_state, 3968 generation, 3969 spa->spa_config_generation); 3970 } 3971 return (vd); 3972 } 3973 return (NULL); 3974 } 3975 3976 /* 3977 * Traverse the vdev tree calling the supplied function. 3978 * We continue to walk the tree until we either have walked all 3979 * children or we receive a non-NULL return from the callback. 
3980 * If a NULL callback is passed, then we just return back the first 3981 * leaf vdev we encounter. 3982 */ 3983 static vdev_t * 3984 vdev_walk_tree(vdev_t *vd, vdev_t *(*func)(vdev_t *, void *), void *arg) 3985 { 3986 uint_t c; 3987 3988 if (vd->vdev_ops->vdev_op_leaf) { 3989 if (func == NULL) 3990 return (vd); 3991 else 3992 return (func(vd, arg)); 3993 } 3994 3995 for (c = 0; c < vd->vdev_children; c++) { 3996 vdev_t *cvd = vd->vdev_child[c]; 3997 if ((cvd = vdev_walk_tree(cvd, func, arg)) != NULL) 3998 return (cvd); 3999 } 4000 return (NULL); 4001 } 4002 4003 /* 4004 * Verify that dynamic LUN growth works as expected. 4005 */ 4006 void 4007 ztest_vdev_LUN_growth(ztest_ds_t *zd, uint64_t id) 4008 { 4009 (void) zd, (void) id; 4010 spa_t *spa = ztest_spa; 4011 vdev_t *vd, *tvd; 4012 metaslab_class_t *mc; 4013 metaslab_group_t *mg; 4014 size_t psize, newsize; 4015 uint64_t top; 4016 uint64_t old_class_space, new_class_space, old_ms_count, new_ms_count; 4017 4018 mutex_enter(&ztest_checkpoint_lock); 4019 mutex_enter(&ztest_vdev_lock); 4020 spa_config_enter(spa, SCL_STATE, spa, RW_READER); 4021 4022 /* 4023 * If there is a vdev removal in progress, it could complete while 4024 * we are running, in which case we would not be able to verify 4025 * that the metaslab_class space increased (because it decreases 4026 * when the device removal completes). 4027 */ 4028 if (ztest_device_removal_active) { 4029 spa_config_exit(spa, SCL_STATE, spa); 4030 mutex_exit(&ztest_vdev_lock); 4031 mutex_exit(&ztest_checkpoint_lock); 4032 return; 4033 } 4034 4035 top = ztest_random_vdev_top(spa, B_TRUE); 4036 4037 tvd = spa->spa_root_vdev->vdev_child[top]; 4038 mg = tvd->vdev_mg; 4039 mc = mg->mg_class; 4040 old_ms_count = tvd->vdev_ms_count; 4041 old_class_space = metaslab_class_get_space(mc); 4042 4043 /* 4044 * Determine the size of the first leaf vdev associated with 4045 * our top-level device. 
4046 */ 4047 vd = vdev_walk_tree(tvd, NULL, NULL); 4048 ASSERT3P(vd, !=, NULL); 4049 ASSERT(vd->vdev_ops->vdev_op_leaf); 4050 4051 psize = vd->vdev_psize; 4052 4053 /* 4054 * We only try to expand the vdev if it's healthy, less than 4x its 4055 * original size, and it has a valid psize. 4056 */ 4057 if (tvd->vdev_state != VDEV_STATE_HEALTHY || 4058 psize == 0 || psize >= 4 * ztest_opts.zo_vdev_size) { 4059 spa_config_exit(spa, SCL_STATE, spa); 4060 mutex_exit(&ztest_vdev_lock); 4061 mutex_exit(&ztest_checkpoint_lock); 4062 return; 4063 } 4064 ASSERT3U(psize, >, 0); 4065 newsize = psize + MAX(psize / 8, SPA_MAXBLOCKSIZE); 4066 ASSERT3U(newsize, >, psize); 4067 4068 if (ztest_opts.zo_verbose >= 6) { 4069 (void) printf("Expanding LUN %s from %lu to %lu\n", 4070 vd->vdev_path, (ulong_t)psize, (ulong_t)newsize); 4071 } 4072 4073 /* 4074 * Growing the vdev is a two step process: 4075 * 1). expand the physical size (i.e. relabel) 4076 * 2). online the vdev to create the new metaslabs 4077 */ 4078 if (vdev_walk_tree(tvd, grow_vdev, &newsize) != NULL || 4079 vdev_walk_tree(tvd, online_vdev, NULL) != NULL || 4080 tvd->vdev_state != VDEV_STATE_HEALTHY) { 4081 if (ztest_opts.zo_verbose >= 5) { 4082 (void) printf("Could not expand LUN because " 4083 "the vdev configuration changed.\n"); 4084 } 4085 spa_config_exit(spa, SCL_STATE, spa); 4086 mutex_exit(&ztest_vdev_lock); 4087 mutex_exit(&ztest_checkpoint_lock); 4088 return; 4089 } 4090 4091 spa_config_exit(spa, SCL_STATE, spa); 4092 4093 /* 4094 * Expanding the LUN will update the config asynchronously, 4095 * thus we must wait for the async thread to complete any 4096 * pending tasks before proceeding. 
4097 */ 4098 for (;;) { 4099 boolean_t done; 4100 mutex_enter(&spa->spa_async_lock); 4101 done = (spa->spa_async_thread == NULL && !spa->spa_async_tasks); 4102 mutex_exit(&spa->spa_async_lock); 4103 if (done) 4104 break; 4105 txg_wait_synced(spa_get_dsl(spa), 0); 4106 (void) poll(NULL, 0, 100); 4107 } 4108 4109 spa_config_enter(spa, SCL_STATE, spa, RW_READER); 4110 4111 tvd = spa->spa_root_vdev->vdev_child[top]; 4112 new_ms_count = tvd->vdev_ms_count; 4113 new_class_space = metaslab_class_get_space(mc); 4114 4115 if (tvd->vdev_mg != mg || mg->mg_class != mc) { 4116 if (ztest_opts.zo_verbose >= 5) { 4117 (void) printf("Could not verify LUN expansion due to " 4118 "intervening vdev offline or remove.\n"); 4119 } 4120 spa_config_exit(spa, SCL_STATE, spa); 4121 mutex_exit(&ztest_vdev_lock); 4122 mutex_exit(&ztest_checkpoint_lock); 4123 return; 4124 } 4125 4126 /* 4127 * Make sure we were able to grow the vdev. 4128 */ 4129 if (new_ms_count <= old_ms_count) { 4130 fatal(B_FALSE, 4131 "LUN expansion failed: ms_count %"PRIu64" < %"PRIu64"\n", 4132 old_ms_count, new_ms_count); 4133 } 4134 4135 /* 4136 * Make sure we were able to grow the pool. 4137 */ 4138 if (new_class_space <= old_class_space) { 4139 fatal(B_FALSE, 4140 "LUN expansion failed: class_space %"PRIu64" < %"PRIu64"\n", 4141 old_class_space, new_class_space); 4142 } 4143 4144 if (ztest_opts.zo_verbose >= 5) { 4145 char oldnumbuf[NN_NUMBUF_SZ], newnumbuf[NN_NUMBUF_SZ]; 4146 4147 nicenum(old_class_space, oldnumbuf, sizeof (oldnumbuf)); 4148 nicenum(new_class_space, newnumbuf, sizeof (newnumbuf)); 4149 (void) printf("%s grew from %s to %s\n", 4150 spa->spa_name, oldnumbuf, newnumbuf); 4151 } 4152 4153 spa_config_exit(spa, SCL_STATE, spa); 4154 mutex_exit(&ztest_vdev_lock); 4155 mutex_exit(&ztest_checkpoint_lock); 4156 } 4157 4158 /* 4159 * Verify that dmu_objset_{create,destroy,open,close} work as expected. 
4160 */ 4161 static void 4162 ztest_objset_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx) 4163 { 4164 (void) arg, (void) cr; 4165 4166 /* 4167 * Create the objects common to all ztest datasets. 4168 */ 4169 VERIFY0(zap_create_claim(os, ZTEST_DIROBJ, 4170 DMU_OT_ZAP_OTHER, DMU_OT_NONE, 0, tx)); 4171 } 4172 4173 static int 4174 ztest_dataset_create(char *dsname) 4175 { 4176 int err; 4177 uint64_t rand; 4178 dsl_crypto_params_t *dcp = NULL; 4179 4180 /* 4181 * 50% of the time, we create encrypted datasets 4182 * using a random cipher suite and a hard-coded 4183 * wrapping key. 4184 */ 4185 rand = ztest_random(2); 4186 if (rand != 0) { 4187 nvlist_t *crypto_args = fnvlist_alloc(); 4188 nvlist_t *props = fnvlist_alloc(); 4189 4190 /* slight bias towards the default cipher suite */ 4191 rand = ztest_random(ZIO_CRYPT_FUNCTIONS); 4192 if (rand < ZIO_CRYPT_AES_128_CCM) 4193 rand = ZIO_CRYPT_ON; 4194 4195 fnvlist_add_uint64(props, 4196 zfs_prop_to_name(ZFS_PROP_ENCRYPTION), rand); 4197 fnvlist_add_uint8_array(crypto_args, "wkeydata", 4198 (uint8_t *)ztest_wkeydata, WRAPPING_KEY_LEN); 4199 4200 /* 4201 * These parameters aren't really used by the kernel. They 4202 * are simply stored so that userspace knows how to load 4203 * the wrapping key. 4204 */ 4205 fnvlist_add_uint64(props, 4206 zfs_prop_to_name(ZFS_PROP_KEYFORMAT), ZFS_KEYFORMAT_RAW); 4207 fnvlist_add_string(props, 4208 zfs_prop_to_name(ZFS_PROP_KEYLOCATION), "prompt"); 4209 fnvlist_add_uint64(props, 4210 zfs_prop_to_name(ZFS_PROP_PBKDF2_SALT), 0ULL); 4211 fnvlist_add_uint64(props, 4212 zfs_prop_to_name(ZFS_PROP_PBKDF2_ITERS), 0ULL); 4213 4214 VERIFY0(dsl_crypto_params_create_nvlist(DCP_CMD_NONE, props, 4215 crypto_args, &dcp)); 4216 4217 /* 4218 * Cycle through all available encryption implementations 4219 * to verify interoperability. 
4220 */ 4221 VERIFY0(gcm_impl_set("cycle")); 4222 VERIFY0(aes_impl_set("cycle")); 4223 4224 fnvlist_free(crypto_args); 4225 fnvlist_free(props); 4226 } 4227 4228 err = dmu_objset_create(dsname, DMU_OST_OTHER, 0, dcp, 4229 ztest_objset_create_cb, NULL); 4230 dsl_crypto_params_free(dcp, !!err); 4231 4232 rand = ztest_random(100); 4233 if (err || rand < 80) 4234 return (err); 4235 4236 if (ztest_opts.zo_verbose >= 5) 4237 (void) printf("Setting dataset %s to sync always\n", dsname); 4238 return (ztest_dsl_prop_set_uint64(dsname, ZFS_PROP_SYNC, 4239 ZFS_SYNC_ALWAYS, B_FALSE)); 4240 } 4241 4242 static int 4243 ztest_objset_destroy_cb(const char *name, void *arg) 4244 { 4245 (void) arg; 4246 objset_t *os; 4247 dmu_object_info_t doi; 4248 int error; 4249 4250 /* 4251 * Verify that the dataset contains a directory object. 4252 */ 4253 VERIFY0(ztest_dmu_objset_own(name, DMU_OST_OTHER, B_TRUE, 4254 B_TRUE, FTAG, &os)); 4255 error = dmu_object_info(os, ZTEST_DIROBJ, &doi); 4256 if (error != ENOENT) { 4257 /* We could have crashed in the middle of destroying it */ 4258 ASSERT0(error); 4259 ASSERT3U(doi.doi_type, ==, DMU_OT_ZAP_OTHER); 4260 ASSERT3S(doi.doi_physical_blocks_512, >=, 0); 4261 } 4262 dmu_objset_disown(os, B_TRUE, FTAG); 4263 4264 /* 4265 * Destroy the dataset. 4266 */ 4267 if (strchr(name, '@') != NULL) { 4268 error = dsl_destroy_snapshot(name, B_TRUE); 4269 if (error != ECHRNG) { 4270 /* 4271 * The program was executed, but encountered a runtime 4272 * error, such as insufficient slop, or a hold on the 4273 * dataset. 
4274 */ 4275 ASSERT0(error); 4276 } 4277 } else { 4278 error = dsl_destroy_head(name); 4279 if (error == ENOSPC) { 4280 /* There could be checkpoint or insufficient slop */ 4281 ztest_record_enospc(FTAG); 4282 } else if (error != EBUSY) { 4283 /* There could be a hold on this dataset */ 4284 ASSERT0(error); 4285 } 4286 } 4287 return (0); 4288 } 4289 4290 static boolean_t 4291 ztest_snapshot_create(char *osname, uint64_t id) 4292 { 4293 char snapname[ZFS_MAX_DATASET_NAME_LEN]; 4294 int error; 4295 4296 (void) snprintf(snapname, sizeof (snapname), "%"PRIu64"", id); 4297 4298 error = dmu_objset_snapshot_one(osname, snapname); 4299 if (error == ENOSPC) { 4300 ztest_record_enospc(FTAG); 4301 return (B_FALSE); 4302 } 4303 if (error != 0 && error != EEXIST && error != ECHRNG) { 4304 fatal(B_FALSE, "ztest_snapshot_create(%s@%s) = %d", osname, 4305 snapname, error); 4306 } 4307 return (B_TRUE); 4308 } 4309 4310 static boolean_t 4311 ztest_snapshot_destroy(char *osname, uint64_t id) 4312 { 4313 char snapname[ZFS_MAX_DATASET_NAME_LEN]; 4314 int error; 4315 4316 (void) snprintf(snapname, sizeof (snapname), "%s@%"PRIu64"", 4317 osname, id); 4318 4319 error = dsl_destroy_snapshot(snapname, B_FALSE); 4320 if (error != 0 && error != ENOENT && error != ECHRNG) 4321 fatal(B_FALSE, "ztest_snapshot_destroy(%s) = %d", 4322 snapname, error); 4323 return (B_TRUE); 4324 } 4325 4326 void 4327 ztest_dmu_objset_create_destroy(ztest_ds_t *zd, uint64_t id) 4328 { 4329 (void) zd; 4330 ztest_ds_t *zdtmp; 4331 int iters; 4332 int error; 4333 objset_t *os, *os2; 4334 char name[ZFS_MAX_DATASET_NAME_LEN]; 4335 zilog_t *zilog; 4336 int i; 4337 4338 zdtmp = umem_alloc(sizeof (ztest_ds_t), UMEM_NOFAIL); 4339 4340 (void) pthread_rwlock_rdlock(&ztest_name_lock); 4341 4342 (void) snprintf(name, sizeof (name), "%s/temp_%"PRIu64"", 4343 ztest_opts.zo_pool, id); 4344 4345 /* 4346 * If this dataset exists from a previous run, process its replay log 4347 * half of the time. 
If we don't replay it, then dsl_destroy_head() 4348 * (invoked from ztest_objset_destroy_cb()) should just throw it away. 4349 */ 4350 if (ztest_random(2) == 0 && 4351 ztest_dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, 4352 B_TRUE, FTAG, &os) == 0) { 4353 ztest_zd_init(zdtmp, NULL, os); 4354 zil_replay(os, zdtmp, ztest_replay_vector); 4355 ztest_zd_fini(zdtmp); 4356 dmu_objset_disown(os, B_TRUE, FTAG); 4357 } 4358 4359 /* 4360 * There may be an old instance of the dataset we're about to 4361 * create lying around from a previous run. If so, destroy it 4362 * and all of its snapshots. 4363 */ 4364 (void) dmu_objset_find(name, ztest_objset_destroy_cb, NULL, 4365 DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS); 4366 4367 /* 4368 * Verify that the destroyed dataset is no longer in the namespace. 4369 * It may still be present if the destroy above fails with ENOSPC. 4370 */ 4371 error = ztest_dmu_objset_own(name, DMU_OST_OTHER, B_TRUE, B_TRUE, 4372 FTAG, &os); 4373 if (error == 0) { 4374 dmu_objset_disown(os, B_TRUE, FTAG); 4375 ztest_record_enospc(FTAG); 4376 goto out; 4377 } 4378 VERIFY3U(ENOENT, ==, error); 4379 4380 /* 4381 * Verify that we can create a new dataset. 4382 */ 4383 error = ztest_dataset_create(name); 4384 if (error) { 4385 if (error == ENOSPC) { 4386 ztest_record_enospc(FTAG); 4387 goto out; 4388 } 4389 fatal(B_FALSE, "dmu_objset_create(%s) = %d", name, error); 4390 } 4391 4392 VERIFY0(ztest_dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, B_TRUE, 4393 FTAG, &os)); 4394 4395 ztest_zd_init(zdtmp, NULL, os); 4396 4397 /* 4398 * Open the intent log for it. 4399 */ 4400 zilog = zil_open(os, ztest_get_data, NULL); 4401 4402 /* 4403 * Put some objects in there, do a little I/O to them, 4404 * and randomly take a couple of snapshots along the way. 
4405 */ 4406 iters = ztest_random(5); 4407 for (i = 0; i < iters; i++) { 4408 ztest_dmu_object_alloc_free(zdtmp, id); 4409 if (ztest_random(iters) == 0) 4410 (void) ztest_snapshot_create(name, i); 4411 } 4412 4413 /* 4414 * Verify that we cannot create an existing dataset. 4415 */ 4416 VERIFY3U(EEXIST, ==, 4417 dmu_objset_create(name, DMU_OST_OTHER, 0, NULL, NULL, NULL)); 4418 4419 /* 4420 * Verify that we can hold an objset that is also owned. 4421 */ 4422 VERIFY0(dmu_objset_hold(name, FTAG, &os2)); 4423 dmu_objset_rele(os2, FTAG); 4424 4425 /* 4426 * Verify that we cannot own an objset that is already owned. 4427 */ 4428 VERIFY3U(EBUSY, ==, ztest_dmu_objset_own(name, DMU_OST_OTHER, 4429 B_FALSE, B_TRUE, FTAG, &os2)); 4430 4431 zil_close(zilog); 4432 dmu_objset_disown(os, B_TRUE, FTAG); 4433 ztest_zd_fini(zdtmp); 4434 out: 4435 (void) pthread_rwlock_unlock(&ztest_name_lock); 4436 4437 umem_free(zdtmp, sizeof (ztest_ds_t)); 4438 } 4439 4440 /* 4441 * Verify that dmu_snapshot_{create,destroy,open,close} work as expected. 4442 */ 4443 void 4444 ztest_dmu_snapshot_create_destroy(ztest_ds_t *zd, uint64_t id) 4445 { 4446 (void) pthread_rwlock_rdlock(&ztest_name_lock); 4447 (void) ztest_snapshot_destroy(zd->zd_name, id); 4448 (void) ztest_snapshot_create(zd->zd_name, id); 4449 (void) pthread_rwlock_unlock(&ztest_name_lock); 4450 } 4451 4452 /* 4453 * Cleanup non-standard snapshots and clones. 
4454 */ 4455 static void 4456 ztest_dsl_dataset_cleanup(char *osname, uint64_t id) 4457 { 4458 char *snap1name; 4459 char *clone1name; 4460 char *snap2name; 4461 char *clone2name; 4462 char *snap3name; 4463 int error; 4464 4465 snap1name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4466 clone1name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4467 snap2name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4468 clone2name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4469 snap3name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4470 4471 (void) snprintf(snap1name, ZFS_MAX_DATASET_NAME_LEN, "%s@s1_%"PRIu64"", 4472 osname, id); 4473 (void) snprintf(clone1name, ZFS_MAX_DATASET_NAME_LEN, "%s/c1_%"PRIu64"", 4474 osname, id); 4475 (void) snprintf(snap2name, ZFS_MAX_DATASET_NAME_LEN, "%s@s2_%"PRIu64"", 4476 clone1name, id); 4477 (void) snprintf(clone2name, ZFS_MAX_DATASET_NAME_LEN, "%s/c2_%"PRIu64"", 4478 osname, id); 4479 (void) snprintf(snap3name, ZFS_MAX_DATASET_NAME_LEN, "%s@s3_%"PRIu64"", 4480 clone1name, id); 4481 4482 error = dsl_destroy_head(clone2name); 4483 if (error && error != ENOENT) 4484 fatal(B_FALSE, "dsl_destroy_head(%s) = %d", clone2name, error); 4485 error = dsl_destroy_snapshot(snap3name, B_FALSE); 4486 if (error && error != ENOENT) 4487 fatal(B_FALSE, "dsl_destroy_snapshot(%s) = %d", 4488 snap3name, error); 4489 error = dsl_destroy_snapshot(snap2name, B_FALSE); 4490 if (error && error != ENOENT) 4491 fatal(B_FALSE, "dsl_destroy_snapshot(%s) = %d", 4492 snap2name, error); 4493 error = dsl_destroy_head(clone1name); 4494 if (error && error != ENOENT) 4495 fatal(B_FALSE, "dsl_destroy_head(%s) = %d", clone1name, error); 4496 error = dsl_destroy_snapshot(snap1name, B_FALSE); 4497 if (error && error != ENOENT) 4498 fatal(B_FALSE, "dsl_destroy_snapshot(%s) = %d", 4499 snap1name, error); 4500 4501 umem_free(snap1name, ZFS_MAX_DATASET_NAME_LEN); 4502 umem_free(clone1name, ZFS_MAX_DATASET_NAME_LEN); 4503 umem_free(snap2name, 
ZFS_MAX_DATASET_NAME_LEN); 4504 umem_free(clone2name, ZFS_MAX_DATASET_NAME_LEN); 4505 umem_free(snap3name, ZFS_MAX_DATASET_NAME_LEN); 4506 } 4507 4508 /* 4509 * Verify dsl_dataset_promote handles EBUSY 4510 */ 4511 void 4512 ztest_dsl_dataset_promote_busy(ztest_ds_t *zd, uint64_t id) 4513 { 4514 objset_t *os; 4515 char *snap1name; 4516 char *clone1name; 4517 char *snap2name; 4518 char *clone2name; 4519 char *snap3name; 4520 char *osname = zd->zd_name; 4521 int error; 4522 4523 snap1name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4524 clone1name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4525 snap2name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4526 clone2name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4527 snap3name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4528 4529 (void) pthread_rwlock_rdlock(&ztest_name_lock); 4530 4531 ztest_dsl_dataset_cleanup(osname, id); 4532 4533 (void) snprintf(snap1name, ZFS_MAX_DATASET_NAME_LEN, "%s@s1_%"PRIu64"", 4534 osname, id); 4535 (void) snprintf(clone1name, ZFS_MAX_DATASET_NAME_LEN, "%s/c1_%"PRIu64"", 4536 osname, id); 4537 (void) snprintf(snap2name, ZFS_MAX_DATASET_NAME_LEN, "%s@s2_%"PRIu64"", 4538 clone1name, id); 4539 (void) snprintf(clone2name, ZFS_MAX_DATASET_NAME_LEN, "%s/c2_%"PRIu64"", 4540 osname, id); 4541 (void) snprintf(snap3name, ZFS_MAX_DATASET_NAME_LEN, "%s@s3_%"PRIu64"", 4542 clone1name, id); 4543 4544 error = dmu_objset_snapshot_one(osname, strchr(snap1name, '@') + 1); 4545 if (error && error != EEXIST) { 4546 if (error == ENOSPC) { 4547 ztest_record_enospc(FTAG); 4548 goto out; 4549 } 4550 fatal(B_FALSE, "dmu_take_snapshot(%s) = %d", snap1name, error); 4551 } 4552 4553 error = dmu_objset_clone(clone1name, snap1name); 4554 if (error) { 4555 if (error == ENOSPC) { 4556 ztest_record_enospc(FTAG); 4557 goto out; 4558 } 4559 fatal(B_FALSE, "dmu_objset_create(%s) = %d", clone1name, error); 4560 } 4561 4562 error = dmu_objset_snapshot_one(clone1name, strchr(snap2name, 
'@') + 1); 4563 if (error && error != EEXIST) { 4564 if (error == ENOSPC) { 4565 ztest_record_enospc(FTAG); 4566 goto out; 4567 } 4568 fatal(B_FALSE, "dmu_open_snapshot(%s) = %d", snap2name, error); 4569 } 4570 4571 error = dmu_objset_snapshot_one(clone1name, strchr(snap3name, '@') + 1); 4572 if (error && error != EEXIST) { 4573 if (error == ENOSPC) { 4574 ztest_record_enospc(FTAG); 4575 goto out; 4576 } 4577 fatal(B_FALSE, "dmu_open_snapshot(%s) = %d", snap3name, error); 4578 } 4579 4580 error = dmu_objset_clone(clone2name, snap3name); 4581 if (error) { 4582 if (error == ENOSPC) { 4583 ztest_record_enospc(FTAG); 4584 goto out; 4585 } 4586 fatal(B_FALSE, "dmu_objset_create(%s) = %d", clone2name, error); 4587 } 4588 4589 error = ztest_dmu_objset_own(snap2name, DMU_OST_ANY, B_TRUE, B_TRUE, 4590 FTAG, &os); 4591 if (error) 4592 fatal(B_FALSE, "dmu_objset_own(%s) = %d", snap2name, error); 4593 error = dsl_dataset_promote(clone2name, NULL); 4594 if (error == ENOSPC) { 4595 dmu_objset_disown(os, B_TRUE, FTAG); 4596 ztest_record_enospc(FTAG); 4597 goto out; 4598 } 4599 if (error != EBUSY) 4600 fatal(B_FALSE, "dsl_dataset_promote(%s), %d, not EBUSY", 4601 clone2name, error); 4602 dmu_objset_disown(os, B_TRUE, FTAG); 4603 4604 out: 4605 ztest_dsl_dataset_cleanup(osname, id); 4606 4607 (void) pthread_rwlock_unlock(&ztest_name_lock); 4608 4609 umem_free(snap1name, ZFS_MAX_DATASET_NAME_LEN); 4610 umem_free(clone1name, ZFS_MAX_DATASET_NAME_LEN); 4611 umem_free(snap2name, ZFS_MAX_DATASET_NAME_LEN); 4612 umem_free(clone2name, ZFS_MAX_DATASET_NAME_LEN); 4613 umem_free(snap3name, ZFS_MAX_DATASET_NAME_LEN); 4614 } 4615 4616 #undef OD_ARRAY_SIZE 4617 #define OD_ARRAY_SIZE 4 4618 4619 /* 4620 * Verify that dmu_object_{alloc,free} work as expected. 
4621 */ 4622 void 4623 ztest_dmu_object_alloc_free(ztest_ds_t *zd, uint64_t id) 4624 { 4625 ztest_od_t *od; 4626 int batchsize; 4627 int size; 4628 int b; 4629 4630 size = sizeof (ztest_od_t) * OD_ARRAY_SIZE; 4631 od = umem_alloc(size, UMEM_NOFAIL); 4632 batchsize = OD_ARRAY_SIZE; 4633 4634 for (b = 0; b < batchsize; b++) 4635 ztest_od_init(od + b, id, FTAG, b, DMU_OT_UINT64_OTHER, 4636 0, 0, 0); 4637 4638 /* 4639 * Destroy the previous batch of objects, create a new batch, 4640 * and do some I/O on the new objects. 4641 */ 4642 if (ztest_object_init(zd, od, size, B_TRUE) != 0) { 4643 zd->zd_od = NULL; 4644 umem_free(od, size); 4645 return; 4646 } 4647 4648 while (ztest_random(4 * batchsize) != 0) 4649 ztest_io(zd, od[ztest_random(batchsize)].od_object, 4650 ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT); 4651 4652 umem_free(od, size); 4653 } 4654 4655 /* 4656 * Rewind the global allocator to verify object allocation backfilling. 4657 */ 4658 void 4659 ztest_dmu_object_next_chunk(ztest_ds_t *zd, uint64_t id) 4660 { 4661 (void) id; 4662 objset_t *os = zd->zd_os; 4663 uint_t dnodes_per_chunk = 1 << dmu_object_alloc_chunk_shift; 4664 uint64_t object; 4665 4666 /* 4667 * Rewind the global allocator randomly back to a lower object number 4668 * to force backfilling and reclamation of recently freed dnodes. 4669 */ 4670 mutex_enter(&os->os_obj_lock); 4671 object = ztest_random(os->os_obj_next_chunk); 4672 os->os_obj_next_chunk = P2ALIGN(object, dnodes_per_chunk); 4673 mutex_exit(&os->os_obj_lock); 4674 } 4675 4676 #undef OD_ARRAY_SIZE 4677 #define OD_ARRAY_SIZE 2 4678 4679 /* 4680 * Verify that dmu_{read,write} work as expected. 
4681 */ 4682 void 4683 ztest_dmu_read_write(ztest_ds_t *zd, uint64_t id) 4684 { 4685 int size; 4686 ztest_od_t *od; 4687 4688 objset_t *os = zd->zd_os; 4689 size = sizeof (ztest_od_t) * OD_ARRAY_SIZE; 4690 od = umem_alloc(size, UMEM_NOFAIL); 4691 dmu_tx_t *tx; 4692 int freeit, error; 4693 uint64_t i, n, s, txg; 4694 bufwad_t *packbuf, *bigbuf, *pack, *bigH, *bigT; 4695 uint64_t packobj, packoff, packsize, bigobj, bigoff, bigsize; 4696 uint64_t chunksize = (1000 + ztest_random(1000)) * sizeof (uint64_t); 4697 uint64_t regions = 997; 4698 uint64_t stride = 123456789ULL; 4699 uint64_t width = 40; 4700 int free_percent = 5; 4701 4702 /* 4703 * This test uses two objects, packobj and bigobj, that are always 4704 * updated together (i.e. in the same tx) so that their contents are 4705 * in sync and can be compared. Their contents relate to each other 4706 * in a simple way: packobj is a dense array of 'bufwad' structures, 4707 * while bigobj is a sparse array of the same bufwads. Specifically, 4708 * for any index n, there are three bufwads that should be identical: 4709 * 4710 * packobj, at offset n * sizeof (bufwad_t) 4711 * bigobj, at the head of the nth chunk 4712 * bigobj, at the tail of the nth chunk 4713 * 4714 * The chunk size is arbitrary. It doesn't have to be a power of two, 4715 * and it doesn't have any relation to the object blocksize. 4716 * The only requirement is that it can hold at least two bufwads. 4717 * 4718 * Normally, we write the bufwad to each of these locations. 4719 * However, free_percent of the time we instead write zeroes to 4720 * packobj and perform a dmu_free_range() on bigobj. By comparing 4721 * bigobj to packobj, we can verify that the DMU is correctly 4722 * tracking which parts of an object are allocated and free, 4723 * and that the contents of the allocated blocks are correct. 4724 */ 4725 4726 /* 4727 * Read the directory info. If it's the first time, set things up. 
4728 */ 4729 ztest_od_init(od, id, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0, chunksize); 4730 ztest_od_init(od + 1, id, FTAG, 1, DMU_OT_UINT64_OTHER, 0, 0, 4731 chunksize); 4732 4733 if (ztest_object_init(zd, od, size, B_FALSE) != 0) { 4734 umem_free(od, size); 4735 return; 4736 } 4737 4738 bigobj = od[0].od_object; 4739 packobj = od[1].od_object; 4740 chunksize = od[0].od_gen; 4741 ASSERT3U(chunksize, ==, od[1].od_gen); 4742 4743 /* 4744 * Prefetch a random chunk of the big object. 4745 * Our aim here is to get some async reads in flight 4746 * for blocks that we may free below; the DMU should 4747 * handle this race correctly. 4748 */ 4749 n = ztest_random(regions) * stride + ztest_random(width); 4750 s = 1 + ztest_random(2 * width - 1); 4751 dmu_prefetch(os, bigobj, 0, n * chunksize, s * chunksize, 4752 ZIO_PRIORITY_SYNC_READ); 4753 4754 /* 4755 * Pick a random index and compute the offsets into packobj and bigobj. 4756 */ 4757 n = ztest_random(regions) * stride + ztest_random(width); 4758 s = 1 + ztest_random(width - 1); 4759 4760 packoff = n * sizeof (bufwad_t); 4761 packsize = s * sizeof (bufwad_t); 4762 4763 bigoff = n * chunksize; 4764 bigsize = s * chunksize; 4765 4766 packbuf = umem_alloc(packsize, UMEM_NOFAIL); 4767 bigbuf = umem_alloc(bigsize, UMEM_NOFAIL); 4768 4769 /* 4770 * free_percent of the time, free a range of bigobj rather than 4771 * overwriting it. 4772 */ 4773 freeit = (ztest_random(100) < free_percent); 4774 4775 /* 4776 * Read the current contents of our objects. 4777 */ 4778 error = dmu_read(os, packobj, packoff, packsize, packbuf, 4779 DMU_READ_PREFETCH); 4780 ASSERT0(error); 4781 error = dmu_read(os, bigobj, bigoff, bigsize, bigbuf, 4782 DMU_READ_PREFETCH); 4783 ASSERT0(error); 4784 4785 /* 4786 * Get a tx for the mods to both packobj and bigobj. 
4787 */ 4788 tx = dmu_tx_create(os); 4789 4790 dmu_tx_hold_write(tx, packobj, packoff, packsize); 4791 4792 if (freeit) 4793 dmu_tx_hold_free(tx, bigobj, bigoff, bigsize); 4794 else 4795 dmu_tx_hold_write(tx, bigobj, bigoff, bigsize); 4796 4797 /* This accounts for setting the checksum/compression. */ 4798 dmu_tx_hold_bonus(tx, bigobj); 4799 4800 txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); 4801 if (txg == 0) { 4802 umem_free(packbuf, packsize); 4803 umem_free(bigbuf, bigsize); 4804 umem_free(od, size); 4805 return; 4806 } 4807 4808 enum zio_checksum cksum; 4809 do { 4810 cksum = (enum zio_checksum) 4811 ztest_random_dsl_prop(ZFS_PROP_CHECKSUM); 4812 } while (cksum >= ZIO_CHECKSUM_LEGACY_FUNCTIONS); 4813 dmu_object_set_checksum(os, bigobj, cksum, tx); 4814 4815 enum zio_compress comp; 4816 do { 4817 comp = (enum zio_compress) 4818 ztest_random_dsl_prop(ZFS_PROP_COMPRESSION); 4819 } while (comp >= ZIO_COMPRESS_LEGACY_FUNCTIONS); 4820 dmu_object_set_compress(os, bigobj, comp, tx); 4821 4822 /* 4823 * For each index from n to n + s, verify that the existing bufwad 4824 * in packobj matches the bufwads at the head and tail of the 4825 * corresponding chunk in bigobj. Then update all three bufwads 4826 * with the new values we want to write out. 
4827 */ 4828 for (i = 0; i < s; i++) { 4829 /* LINTED */ 4830 pack = (bufwad_t *)((char *)packbuf + i * sizeof (bufwad_t)); 4831 /* LINTED */ 4832 bigH = (bufwad_t *)((char *)bigbuf + i * chunksize); 4833 /* LINTED */ 4834 bigT = (bufwad_t *)((char *)bigH + chunksize) - 1; 4835 4836 ASSERT3U((uintptr_t)bigH - (uintptr_t)bigbuf, <, bigsize); 4837 ASSERT3U((uintptr_t)bigT - (uintptr_t)bigbuf, <, bigsize); 4838 4839 if (pack->bw_txg > txg) 4840 fatal(B_FALSE, 4841 "future leak: got %"PRIx64", open txg is %"PRIx64"", 4842 pack->bw_txg, txg); 4843 4844 if (pack->bw_data != 0 && pack->bw_index != n + i) 4845 fatal(B_FALSE, "wrong index: " 4846 "got %"PRIx64", wanted %"PRIx64"+%"PRIx64"", 4847 pack->bw_index, n, i); 4848 4849 if (memcmp(pack, bigH, sizeof (bufwad_t)) != 0) 4850 fatal(B_FALSE, "pack/bigH mismatch in %p/%p", 4851 pack, bigH); 4852 4853 if (memcmp(pack, bigT, sizeof (bufwad_t)) != 0) 4854 fatal(B_FALSE, "pack/bigT mismatch in %p/%p", 4855 pack, bigT); 4856 4857 if (freeit) { 4858 memset(pack, 0, sizeof (bufwad_t)); 4859 } else { 4860 pack->bw_index = n + i; 4861 pack->bw_txg = txg; 4862 pack->bw_data = 1 + ztest_random(-2ULL); 4863 } 4864 *bigH = *pack; 4865 *bigT = *pack; 4866 } 4867 4868 /* 4869 * We've verified all the old bufwads, and made new ones. 4870 * Now write them out. 4871 */ 4872 dmu_write(os, packobj, packoff, packsize, packbuf, tx); 4873 4874 if (freeit) { 4875 if (ztest_opts.zo_verbose >= 7) { 4876 (void) printf("freeing offset %"PRIx64" size %"PRIx64"" 4877 " txg %"PRIx64"\n", 4878 bigoff, bigsize, txg); 4879 } 4880 VERIFY0(dmu_free_range(os, bigobj, bigoff, bigsize, tx)); 4881 } else { 4882 if (ztest_opts.zo_verbose >= 7) { 4883 (void) printf("writing offset %"PRIx64" size %"PRIx64"" 4884 " txg %"PRIx64"\n", 4885 bigoff, bigsize, txg); 4886 } 4887 dmu_write(os, bigobj, bigoff, bigsize, bigbuf, tx); 4888 } 4889 4890 dmu_tx_commit(tx); 4891 4892 /* 4893 * Sanity check the stuff we just wrote. 
4894 */ 4895 { 4896 void *packcheck = umem_alloc(packsize, UMEM_NOFAIL); 4897 void *bigcheck = umem_alloc(bigsize, UMEM_NOFAIL); 4898 4899 VERIFY0(dmu_read(os, packobj, packoff, 4900 packsize, packcheck, DMU_READ_PREFETCH)); 4901 VERIFY0(dmu_read(os, bigobj, bigoff, 4902 bigsize, bigcheck, DMU_READ_PREFETCH)); 4903 4904 ASSERT0(memcmp(packbuf, packcheck, packsize)); 4905 ASSERT0(memcmp(bigbuf, bigcheck, bigsize)); 4906 4907 umem_free(packcheck, packsize); 4908 umem_free(bigcheck, bigsize); 4909 } 4910 4911 umem_free(packbuf, packsize); 4912 umem_free(bigbuf, bigsize); 4913 umem_free(od, size); 4914 } 4915 4916 static void 4917 compare_and_update_pbbufs(uint64_t s, bufwad_t *packbuf, bufwad_t *bigbuf, 4918 uint64_t bigsize, uint64_t n, uint64_t chunksize, uint64_t txg) 4919 { 4920 uint64_t i; 4921 bufwad_t *pack; 4922 bufwad_t *bigH; 4923 bufwad_t *bigT; 4924 4925 /* 4926 * For each index from n to n + s, verify that the existing bufwad 4927 * in packobj matches the bufwads at the head and tail of the 4928 * corresponding chunk in bigobj. Then update all three bufwads 4929 * with the new values we want to write out. 
4930 */ 4931 for (i = 0; i < s; i++) { 4932 /* LINTED */ 4933 pack = (bufwad_t *)((char *)packbuf + i * sizeof (bufwad_t)); 4934 /* LINTED */ 4935 bigH = (bufwad_t *)((char *)bigbuf + i * chunksize); 4936 /* LINTED */ 4937 bigT = (bufwad_t *)((char *)bigH + chunksize) - 1; 4938 4939 ASSERT3U((uintptr_t)bigH - (uintptr_t)bigbuf, <, bigsize); 4940 ASSERT3U((uintptr_t)bigT - (uintptr_t)bigbuf, <, bigsize); 4941 4942 if (pack->bw_txg > txg) 4943 fatal(B_FALSE, 4944 "future leak: got %"PRIx64", open txg is %"PRIx64"", 4945 pack->bw_txg, txg); 4946 4947 if (pack->bw_data != 0 && pack->bw_index != n + i) 4948 fatal(B_FALSE, "wrong index: " 4949 "got %"PRIx64", wanted %"PRIx64"+%"PRIx64"", 4950 pack->bw_index, n, i); 4951 4952 if (memcmp(pack, bigH, sizeof (bufwad_t)) != 0) 4953 fatal(B_FALSE, "pack/bigH mismatch in %p/%p", 4954 pack, bigH); 4955 4956 if (memcmp(pack, bigT, sizeof (bufwad_t)) != 0) 4957 fatal(B_FALSE, "pack/bigT mismatch in %p/%p", 4958 pack, bigT); 4959 4960 pack->bw_index = n + i; 4961 pack->bw_txg = txg; 4962 pack->bw_data = 1 + ztest_random(-2ULL); 4963 4964 *bigH = *pack; 4965 *bigT = *pack; 4966 } 4967 } 4968 4969 #undef OD_ARRAY_SIZE 4970 #define OD_ARRAY_SIZE 2 4971 4972 void 4973 ztest_dmu_read_write_zcopy(ztest_ds_t *zd, uint64_t id) 4974 { 4975 objset_t *os = zd->zd_os; 4976 ztest_od_t *od; 4977 dmu_tx_t *tx; 4978 uint64_t i; 4979 int error; 4980 int size; 4981 uint64_t n, s, txg; 4982 bufwad_t *packbuf, *bigbuf; 4983 uint64_t packobj, packoff, packsize, bigobj, bigoff, bigsize; 4984 uint64_t blocksize = ztest_random_blocksize(); 4985 uint64_t chunksize = blocksize; 4986 uint64_t regions = 997; 4987 uint64_t stride = 123456789ULL; 4988 uint64_t width = 9; 4989 dmu_buf_t *bonus_db; 4990 arc_buf_t **bigbuf_arcbufs; 4991 dmu_object_info_t doi; 4992 4993 size = sizeof (ztest_od_t) * OD_ARRAY_SIZE; 4994 od = umem_alloc(size, UMEM_NOFAIL); 4995 4996 /* 4997 * This test uses two objects, packobj and bigobj, that are always 4998 * updated together (i.e. 
in the same tx) so that their contents are 4999 * in sync and can be compared. Their contents relate to each other 5000 * in a simple way: packobj is a dense array of 'bufwad' structures, 5001 * while bigobj is a sparse array of the same bufwads. Specifically, 5002 * for any index n, there are three bufwads that should be identical: 5003 * 5004 * packobj, at offset n * sizeof (bufwad_t) 5005 * bigobj, at the head of the nth chunk 5006 * bigobj, at the tail of the nth chunk 5007 * 5008 * The chunk size is set equal to bigobj block size so that 5009 * dmu_assign_arcbuf_by_dbuf() can be tested for object updates. 5010 */ 5011 5012 /* 5013 * Read the directory info. If it's the first time, set things up. 5014 */ 5015 ztest_od_init(od, id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, 0, 0); 5016 ztest_od_init(od + 1, id, FTAG, 1, DMU_OT_UINT64_OTHER, 0, 0, 5017 chunksize); 5018 5019 5020 if (ztest_object_init(zd, od, size, B_FALSE) != 0) { 5021 umem_free(od, size); 5022 return; 5023 } 5024 5025 bigobj = od[0].od_object; 5026 packobj = od[1].od_object; 5027 blocksize = od[0].od_blocksize; 5028 chunksize = blocksize; 5029 ASSERT3U(chunksize, ==, od[1].od_gen); 5030 5031 VERIFY0(dmu_object_info(os, bigobj, &doi)); 5032 VERIFY(ISP2(doi.doi_data_block_size)); 5033 VERIFY3U(chunksize, ==, doi.doi_data_block_size); 5034 VERIFY3U(chunksize, >=, 2 * sizeof (bufwad_t)); 5035 5036 /* 5037 * Pick a random index and compute the offsets into packobj and bigobj. 
5038 */ 5039 n = ztest_random(regions) * stride + ztest_random(width); 5040 s = 1 + ztest_random(width - 1); 5041 5042 packoff = n * sizeof (bufwad_t); 5043 packsize = s * sizeof (bufwad_t); 5044 5045 bigoff = n * chunksize; 5046 bigsize = s * chunksize; 5047 5048 packbuf = umem_zalloc(packsize, UMEM_NOFAIL); 5049 bigbuf = umem_zalloc(bigsize, UMEM_NOFAIL); 5050 5051 VERIFY0(dmu_bonus_hold(os, bigobj, FTAG, &bonus_db)); 5052 5053 bigbuf_arcbufs = umem_zalloc(2 * s * sizeof (arc_buf_t *), UMEM_NOFAIL); 5054 5055 /* 5056 * Iteration 0 test zcopy for DB_UNCACHED dbufs. 5057 * Iteration 1 test zcopy to already referenced dbufs. 5058 * Iteration 2 test zcopy to dirty dbuf in the same txg. 5059 * Iteration 3 test zcopy to dbuf dirty in previous txg. 5060 * Iteration 4 test zcopy when dbuf is no longer dirty. 5061 * Iteration 5 test zcopy when it can't be done. 5062 * Iteration 6 one more zcopy write. 5063 */ 5064 for (i = 0; i < 7; i++) { 5065 uint64_t j; 5066 uint64_t off; 5067 5068 /* 5069 * In iteration 5 (i == 5) use arcbufs 5070 * that don't match bigobj blksz to test 5071 * dmu_assign_arcbuf_by_dbuf() when it can't directly 5072 * assign an arcbuf to a dbuf. 5073 */ 5074 for (j = 0; j < s; j++) { 5075 if (i != 5 || chunksize < (SPA_MINBLOCKSIZE * 2)) { 5076 bigbuf_arcbufs[j] = 5077 dmu_request_arcbuf(bonus_db, chunksize); 5078 } else { 5079 bigbuf_arcbufs[2 * j] = 5080 dmu_request_arcbuf(bonus_db, chunksize / 2); 5081 bigbuf_arcbufs[2 * j + 1] = 5082 dmu_request_arcbuf(bonus_db, chunksize / 2); 5083 } 5084 } 5085 5086 /* 5087 * Get a tx for the mods to both packobj and bigobj. 
5088 */ 5089 tx = dmu_tx_create(os); 5090 5091 dmu_tx_hold_write(tx, packobj, packoff, packsize); 5092 dmu_tx_hold_write(tx, bigobj, bigoff, bigsize); 5093 5094 txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); 5095 if (txg == 0) { 5096 umem_free(packbuf, packsize); 5097 umem_free(bigbuf, bigsize); 5098 for (j = 0; j < s; j++) { 5099 if (i != 5 || 5100 chunksize < (SPA_MINBLOCKSIZE * 2)) { 5101 dmu_return_arcbuf(bigbuf_arcbufs[j]); 5102 } else { 5103 dmu_return_arcbuf( 5104 bigbuf_arcbufs[2 * j]); 5105 dmu_return_arcbuf( 5106 bigbuf_arcbufs[2 * j + 1]); 5107 } 5108 } 5109 umem_free(bigbuf_arcbufs, 2 * s * sizeof (arc_buf_t *)); 5110 umem_free(od, size); 5111 dmu_buf_rele(bonus_db, FTAG); 5112 return; 5113 } 5114 5115 /* 5116 * 50% of the time don't read objects in the 1st iteration to 5117 * test dmu_assign_arcbuf_by_dbuf() for the case when there are 5118 * no existing dbufs for the specified offsets. 5119 */ 5120 if (i != 0 || ztest_random(2) != 0) { 5121 error = dmu_read(os, packobj, packoff, 5122 packsize, packbuf, DMU_READ_PREFETCH); 5123 ASSERT0(error); 5124 error = dmu_read(os, bigobj, bigoff, bigsize, 5125 bigbuf, DMU_READ_PREFETCH); 5126 ASSERT0(error); 5127 } 5128 compare_and_update_pbbufs(s, packbuf, bigbuf, bigsize, 5129 n, chunksize, txg); 5130 5131 /* 5132 * We've verified all the old bufwads, and made new ones. 5133 * Now write them out. 
5134 */ 5135 dmu_write(os, packobj, packoff, packsize, packbuf, tx); 5136 if (ztest_opts.zo_verbose >= 7) { 5137 (void) printf("writing offset %"PRIx64" size %"PRIx64"" 5138 " txg %"PRIx64"\n", 5139 bigoff, bigsize, txg); 5140 } 5141 for (off = bigoff, j = 0; j < s; j++, off += chunksize) { 5142 dmu_buf_t *dbt; 5143 if (i != 5 || chunksize < (SPA_MINBLOCKSIZE * 2)) { 5144 memcpy(bigbuf_arcbufs[j]->b_data, 5145 (caddr_t)bigbuf + (off - bigoff), 5146 chunksize); 5147 } else { 5148 memcpy(bigbuf_arcbufs[2 * j]->b_data, 5149 (caddr_t)bigbuf + (off - bigoff), 5150 chunksize / 2); 5151 memcpy(bigbuf_arcbufs[2 * j + 1]->b_data, 5152 (caddr_t)bigbuf + (off - bigoff) + 5153 chunksize / 2, 5154 chunksize / 2); 5155 } 5156 5157 if (i == 1) { 5158 VERIFY(dmu_buf_hold(os, bigobj, off, 5159 FTAG, &dbt, DMU_READ_NO_PREFETCH) == 0); 5160 } 5161 if (i != 5 || chunksize < (SPA_MINBLOCKSIZE * 2)) { 5162 VERIFY0(dmu_assign_arcbuf_by_dbuf(bonus_db, 5163 off, bigbuf_arcbufs[j], tx)); 5164 } else { 5165 VERIFY0(dmu_assign_arcbuf_by_dbuf(bonus_db, 5166 off, bigbuf_arcbufs[2 * j], tx)); 5167 VERIFY0(dmu_assign_arcbuf_by_dbuf(bonus_db, 5168 off + chunksize / 2, 5169 bigbuf_arcbufs[2 * j + 1], tx)); 5170 } 5171 if (i == 1) { 5172 dmu_buf_rele(dbt, FTAG); 5173 } 5174 } 5175 dmu_tx_commit(tx); 5176 5177 /* 5178 * Sanity check the stuff we just wrote. 
5179 */ 5180 { 5181 void *packcheck = umem_alloc(packsize, UMEM_NOFAIL); 5182 void *bigcheck = umem_alloc(bigsize, UMEM_NOFAIL); 5183 5184 VERIFY0(dmu_read(os, packobj, packoff, 5185 packsize, packcheck, DMU_READ_PREFETCH)); 5186 VERIFY0(dmu_read(os, bigobj, bigoff, 5187 bigsize, bigcheck, DMU_READ_PREFETCH)); 5188 5189 ASSERT0(memcmp(packbuf, packcheck, packsize)); 5190 ASSERT0(memcmp(bigbuf, bigcheck, bigsize)); 5191 5192 umem_free(packcheck, packsize); 5193 umem_free(bigcheck, bigsize); 5194 } 5195 if (i == 2) { 5196 txg_wait_open(dmu_objset_pool(os), 0, B_TRUE); 5197 } else if (i == 3) { 5198 txg_wait_synced(dmu_objset_pool(os), 0); 5199 } 5200 } 5201 5202 dmu_buf_rele(bonus_db, FTAG); 5203 umem_free(packbuf, packsize); 5204 umem_free(bigbuf, bigsize); 5205 umem_free(bigbuf_arcbufs, 2 * s * sizeof (arc_buf_t *)); 5206 umem_free(od, size); 5207 } 5208 5209 void 5210 ztest_dmu_write_parallel(ztest_ds_t *zd, uint64_t id) 5211 { 5212 (void) id; 5213 ztest_od_t *od; 5214 5215 od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL); 5216 uint64_t offset = (1ULL << (ztest_random(20) + 43)) + 5217 (ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT); 5218 5219 /* 5220 * Have multiple threads write to large offsets in an object 5221 * to verify that parallel writes to an object -- even to the 5222 * same blocks within the object -- doesn't cause any trouble. 
5223 */ 5224 ztest_od_init(od, ID_PARALLEL, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0, 0); 5225 5226 if (ztest_object_init(zd, od, sizeof (ztest_od_t), B_FALSE) != 0) 5227 return; 5228 5229 while (ztest_random(10) != 0) 5230 ztest_io(zd, od->od_object, offset); 5231 5232 umem_free(od, sizeof (ztest_od_t)); 5233 } 5234 5235 void 5236 ztest_dmu_prealloc(ztest_ds_t *zd, uint64_t id) 5237 { 5238 ztest_od_t *od; 5239 uint64_t offset = (1ULL << (ztest_random(4) + SPA_MAXBLOCKSHIFT)) + 5240 (ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT); 5241 uint64_t count = ztest_random(20) + 1; 5242 uint64_t blocksize = ztest_random_blocksize(); 5243 void *data; 5244 5245 od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL); 5246 5247 ztest_od_init(od, id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, 0, 0); 5248 5249 if (ztest_object_init(zd, od, sizeof (ztest_od_t), 5250 !ztest_random(2)) != 0) { 5251 umem_free(od, sizeof (ztest_od_t)); 5252 return; 5253 } 5254 5255 if (ztest_truncate(zd, od->od_object, offset, count * blocksize) != 0) { 5256 umem_free(od, sizeof (ztest_od_t)); 5257 return; 5258 } 5259 5260 ztest_prealloc(zd, od->od_object, offset, count * blocksize); 5261 5262 data = umem_zalloc(blocksize, UMEM_NOFAIL); 5263 5264 while (ztest_random(count) != 0) { 5265 uint64_t randoff = offset + (ztest_random(count) * blocksize); 5266 if (ztest_write(zd, od->od_object, randoff, blocksize, 5267 data) != 0) 5268 break; 5269 while (ztest_random(4) != 0) 5270 ztest_io(zd, od->od_object, randoff); 5271 } 5272 5273 umem_free(data, blocksize); 5274 umem_free(od, sizeof (ztest_od_t)); 5275 } 5276 5277 /* 5278 * Verify that zap_{create,destroy,add,remove,update} work as expected. 
5279 */ 5280 #define ZTEST_ZAP_MIN_INTS 1 5281 #define ZTEST_ZAP_MAX_INTS 4 5282 #define ZTEST_ZAP_MAX_PROPS 1000 5283 5284 void 5285 ztest_zap(ztest_ds_t *zd, uint64_t id) 5286 { 5287 objset_t *os = zd->zd_os; 5288 ztest_od_t *od; 5289 uint64_t object; 5290 uint64_t txg, last_txg; 5291 uint64_t value[ZTEST_ZAP_MAX_INTS]; 5292 uint64_t zl_ints, zl_intsize, prop; 5293 int i, ints; 5294 dmu_tx_t *tx; 5295 char propname[100], txgname[100]; 5296 int error; 5297 const char *const hc[2] = { "s.acl.h", ".s.open.h.hyLZlg" }; 5298 5299 od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL); 5300 ztest_od_init(od, id, FTAG, 0, DMU_OT_ZAP_OTHER, 0, 0, 0); 5301 5302 if (ztest_object_init(zd, od, sizeof (ztest_od_t), 5303 !ztest_random(2)) != 0) 5304 goto out; 5305 5306 object = od->od_object; 5307 5308 /* 5309 * Generate a known hash collision, and verify that 5310 * we can lookup and remove both entries. 5311 */ 5312 tx = dmu_tx_create(os); 5313 dmu_tx_hold_zap(tx, object, B_TRUE, NULL); 5314 txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); 5315 if (txg == 0) 5316 goto out; 5317 for (i = 0; i < 2; i++) { 5318 value[i] = i; 5319 VERIFY0(zap_add(os, object, hc[i], sizeof (uint64_t), 5320 1, &value[i], tx)); 5321 } 5322 for (i = 0; i < 2; i++) { 5323 VERIFY3U(EEXIST, ==, zap_add(os, object, hc[i], 5324 sizeof (uint64_t), 1, &value[i], tx)); 5325 VERIFY0( 5326 zap_length(os, object, hc[i], &zl_intsize, &zl_ints)); 5327 ASSERT3U(zl_intsize, ==, sizeof (uint64_t)); 5328 ASSERT3U(zl_ints, ==, 1); 5329 } 5330 for (i = 0; i < 2; i++) { 5331 VERIFY0(zap_remove(os, object, hc[i], tx)); 5332 } 5333 dmu_tx_commit(tx); 5334 5335 /* 5336 * Generate a bunch of random entries. 
5337 */ 5338 ints = MAX(ZTEST_ZAP_MIN_INTS, object % ZTEST_ZAP_MAX_INTS); 5339 5340 prop = ztest_random(ZTEST_ZAP_MAX_PROPS); 5341 (void) sprintf(propname, "prop_%"PRIu64"", prop); 5342 (void) sprintf(txgname, "txg_%"PRIu64"", prop); 5343 memset(value, 0, sizeof (value)); 5344 last_txg = 0; 5345 5346 /* 5347 * If these zap entries already exist, validate their contents. 5348 */ 5349 error = zap_length(os, object, txgname, &zl_intsize, &zl_ints); 5350 if (error == 0) { 5351 ASSERT3U(zl_intsize, ==, sizeof (uint64_t)); 5352 ASSERT3U(zl_ints, ==, 1); 5353 5354 VERIFY0(zap_lookup(os, object, txgname, zl_intsize, 5355 zl_ints, &last_txg)); 5356 5357 VERIFY0(zap_length(os, object, propname, &zl_intsize, 5358 &zl_ints)); 5359 5360 ASSERT3U(zl_intsize, ==, sizeof (uint64_t)); 5361 ASSERT3U(zl_ints, ==, ints); 5362 5363 VERIFY0(zap_lookup(os, object, propname, zl_intsize, 5364 zl_ints, value)); 5365 5366 for (i = 0; i < ints; i++) { 5367 ASSERT3U(value[i], ==, last_txg + object + i); 5368 } 5369 } else { 5370 ASSERT3U(error, ==, ENOENT); 5371 } 5372 5373 /* 5374 * Atomically update two entries in our zap object. 5375 * The first is named txg_%llu, and contains the txg 5376 * in which the property was last updated. The second 5377 * is named prop_%llu, and the nth element of its value 5378 * should be txg + object + n. 5379 */ 5380 tx = dmu_tx_create(os); 5381 dmu_tx_hold_zap(tx, object, B_TRUE, NULL); 5382 txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); 5383 if (txg == 0) 5384 goto out; 5385 5386 if (last_txg > txg) 5387 fatal(B_FALSE, "zap future leak: old %"PRIu64" new %"PRIu64"", 5388 last_txg, txg); 5389 5390 for (i = 0; i < ints; i++) 5391 value[i] = txg + object + i; 5392 5393 VERIFY0(zap_update(os, object, txgname, sizeof (uint64_t), 5394 1, &txg, tx)); 5395 VERIFY0(zap_update(os, object, propname, sizeof (uint64_t), 5396 ints, value, tx)); 5397 5398 dmu_tx_commit(tx); 5399 5400 /* 5401 * Remove a random pair of entries. 
5402 */ 5403 prop = ztest_random(ZTEST_ZAP_MAX_PROPS); 5404 (void) sprintf(propname, "prop_%"PRIu64"", prop); 5405 (void) sprintf(txgname, "txg_%"PRIu64"", prop); 5406 5407 error = zap_length(os, object, txgname, &zl_intsize, &zl_ints); 5408 5409 if (error == ENOENT) 5410 goto out; 5411 5412 ASSERT0(error); 5413 5414 tx = dmu_tx_create(os); 5415 dmu_tx_hold_zap(tx, object, B_TRUE, NULL); 5416 txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); 5417 if (txg == 0) 5418 goto out; 5419 VERIFY0(zap_remove(os, object, txgname, tx)); 5420 VERIFY0(zap_remove(os, object, propname, tx)); 5421 dmu_tx_commit(tx); 5422 out: 5423 umem_free(od, sizeof (ztest_od_t)); 5424 } 5425 5426 /* 5427 * Test case to test the upgrading of a microzap to fatzap. 5428 */ 5429 void 5430 ztest_fzap(ztest_ds_t *zd, uint64_t id) 5431 { 5432 objset_t *os = zd->zd_os; 5433 ztest_od_t *od; 5434 uint64_t object, txg, value; 5435 5436 od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL); 5437 ztest_od_init(od, id, FTAG, 0, DMU_OT_ZAP_OTHER, 0, 0, 0); 5438 5439 if (ztest_object_init(zd, od, sizeof (ztest_od_t), 5440 !ztest_random(2)) != 0) 5441 goto out; 5442 object = od->od_object; 5443 5444 /* 5445 * Add entries to this ZAP and make sure it spills over 5446 * and gets upgraded to a fatzap. Also, since we are adding 5447 * 2050 entries we should see ptrtbl growth and leaf-block split. 
5448 */ 5449 for (value = 0; value < 2050; value++) { 5450 char name[ZFS_MAX_DATASET_NAME_LEN]; 5451 dmu_tx_t *tx; 5452 int error; 5453 5454 (void) snprintf(name, sizeof (name), "fzap-%"PRIu64"-%"PRIu64"", 5455 id, value); 5456 5457 tx = dmu_tx_create(os); 5458 dmu_tx_hold_zap(tx, object, B_TRUE, name); 5459 txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); 5460 if (txg == 0) 5461 goto out; 5462 error = zap_add(os, object, name, sizeof (uint64_t), 1, 5463 &value, tx); 5464 ASSERT(error == 0 || error == EEXIST); 5465 dmu_tx_commit(tx); 5466 } 5467 out: 5468 umem_free(od, sizeof (ztest_od_t)); 5469 } 5470 5471 void 5472 ztest_zap_parallel(ztest_ds_t *zd, uint64_t id) 5473 { 5474 (void) id; 5475 objset_t *os = zd->zd_os; 5476 ztest_od_t *od; 5477 uint64_t txg, object, count, wsize, wc, zl_wsize, zl_wc; 5478 dmu_tx_t *tx; 5479 int i, namelen, error; 5480 int micro = ztest_random(2); 5481 char name[20], string_value[20]; 5482 void *data; 5483 5484 od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL); 5485 ztest_od_init(od, ID_PARALLEL, FTAG, micro, DMU_OT_ZAP_OTHER, 0, 0, 0); 5486 5487 if (ztest_object_init(zd, od, sizeof (ztest_od_t), B_FALSE) != 0) { 5488 umem_free(od, sizeof (ztest_od_t)); 5489 return; 5490 } 5491 5492 object = od->od_object; 5493 5494 /* 5495 * Generate a random name of the form 'xxx.....' where each 5496 * x is a random printable character and the dots are dots. 5497 * There are 94 such characters, and the name length goes from 5498 * 6 to 20, so there are 94^3 * 15 = 12,458,760 possible names. 5499 */ 5500 namelen = ztest_random(sizeof (name) - 5) + 5 + 1; 5501 5502 for (i = 0; i < 3; i++) 5503 name[i] = '!' + ztest_random('~' - '!' 
+ 1); 5504 for (; i < namelen - 1; i++) 5505 name[i] = '.'; 5506 name[i] = '\0'; 5507 5508 if ((namelen & 1) || micro) { 5509 wsize = sizeof (txg); 5510 wc = 1; 5511 data = &txg; 5512 } else { 5513 wsize = 1; 5514 wc = namelen; 5515 data = string_value; 5516 } 5517 5518 count = -1ULL; 5519 VERIFY0(zap_count(os, object, &count)); 5520 ASSERT3S(count, !=, -1ULL); 5521 5522 /* 5523 * Select an operation: length, lookup, add, update, remove. 5524 */ 5525 i = ztest_random(5); 5526 5527 if (i >= 2) { 5528 tx = dmu_tx_create(os); 5529 dmu_tx_hold_zap(tx, object, B_TRUE, NULL); 5530 txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); 5531 if (txg == 0) { 5532 umem_free(od, sizeof (ztest_od_t)); 5533 return; 5534 } 5535 memcpy(string_value, name, namelen); 5536 } else { 5537 tx = NULL; 5538 txg = 0; 5539 memset(string_value, 0, namelen); 5540 } 5541 5542 switch (i) { 5543 5544 case 0: 5545 error = zap_length(os, object, name, &zl_wsize, &zl_wc); 5546 if (error == 0) { 5547 ASSERT3U(wsize, ==, zl_wsize); 5548 ASSERT3U(wc, ==, zl_wc); 5549 } else { 5550 ASSERT3U(error, ==, ENOENT); 5551 } 5552 break; 5553 5554 case 1: 5555 error = zap_lookup(os, object, name, wsize, wc, data); 5556 if (error == 0) { 5557 if (data == string_value && 5558 memcmp(name, data, namelen) != 0) 5559 fatal(B_FALSE, "name '%s' != val '%s' len %d", 5560 name, (char *)data, namelen); 5561 } else { 5562 ASSERT3U(error, ==, ENOENT); 5563 } 5564 break; 5565 5566 case 2: 5567 error = zap_add(os, object, name, wsize, wc, data, tx); 5568 ASSERT(error == 0 || error == EEXIST); 5569 break; 5570 5571 case 3: 5572 VERIFY0(zap_update(os, object, name, wsize, wc, data, tx)); 5573 break; 5574 5575 case 4: 5576 error = zap_remove(os, object, name, tx); 5577 ASSERT(error == 0 || error == ENOENT); 5578 break; 5579 } 5580 5581 if (tx != NULL) 5582 dmu_tx_commit(tx); 5583 5584 umem_free(od, sizeof (ztest_od_t)); 5585 } 5586 5587 /* 5588 * Commit callback data. 
5589 */ 5590 typedef struct ztest_cb_data { 5591 list_node_t zcd_node; 5592 uint64_t zcd_txg; 5593 int zcd_expected_err; 5594 boolean_t zcd_added; 5595 boolean_t zcd_called; 5596 spa_t *zcd_spa; 5597 } ztest_cb_data_t; 5598 5599 /* This is the actual commit callback function */ 5600 static void 5601 ztest_commit_callback(void *arg, int error) 5602 { 5603 ztest_cb_data_t *data = arg; 5604 uint64_t synced_txg; 5605 5606 VERIFY3P(data, !=, NULL); 5607 VERIFY3S(data->zcd_expected_err, ==, error); 5608 VERIFY(!data->zcd_called); 5609 5610 synced_txg = spa_last_synced_txg(data->zcd_spa); 5611 if (data->zcd_txg > synced_txg) 5612 fatal(B_FALSE, 5613 "commit callback of txg %"PRIu64" called prematurely, " 5614 "last synced txg = %"PRIu64"\n", 5615 data->zcd_txg, synced_txg); 5616 5617 data->zcd_called = B_TRUE; 5618 5619 if (error == ECANCELED) { 5620 ASSERT0(data->zcd_txg); 5621 ASSERT(!data->zcd_added); 5622 5623 /* 5624 * The private callback data should be destroyed here, but 5625 * since we are going to check the zcd_called field after 5626 * dmu_tx_abort(), we will destroy it there. 
5627 */ 5628 return; 5629 } 5630 5631 ASSERT(data->zcd_added); 5632 ASSERT3U(data->zcd_txg, !=, 0); 5633 5634 (void) mutex_enter(&zcl.zcl_callbacks_lock); 5635 5636 /* See if this cb was called more quickly */ 5637 if ((synced_txg - data->zcd_txg) < zc_min_txg_delay) 5638 zc_min_txg_delay = synced_txg - data->zcd_txg; 5639 5640 /* Remove our callback from the list */ 5641 list_remove(&zcl.zcl_callbacks, data); 5642 5643 (void) mutex_exit(&zcl.zcl_callbacks_lock); 5644 5645 umem_free(data, sizeof (ztest_cb_data_t)); 5646 } 5647 5648 /* Allocate and initialize callback data structure */ 5649 static ztest_cb_data_t * 5650 ztest_create_cb_data(objset_t *os, uint64_t txg) 5651 { 5652 ztest_cb_data_t *cb_data; 5653 5654 cb_data = umem_zalloc(sizeof (ztest_cb_data_t), UMEM_NOFAIL); 5655 5656 cb_data->zcd_txg = txg; 5657 cb_data->zcd_spa = dmu_objset_spa(os); 5658 list_link_init(&cb_data->zcd_node); 5659 5660 return (cb_data); 5661 } 5662 5663 /* 5664 * Commit callback test. 5665 */ 5666 void 5667 ztest_dmu_commit_callbacks(ztest_ds_t *zd, uint64_t id) 5668 { 5669 objset_t *os = zd->zd_os; 5670 ztest_od_t *od; 5671 dmu_tx_t *tx; 5672 ztest_cb_data_t *cb_data[3], *tmp_cb; 5673 uint64_t old_txg, txg; 5674 int i, error = 0; 5675 5676 od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL); 5677 ztest_od_init(od, id, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0, 0); 5678 5679 if (ztest_object_init(zd, od, sizeof (ztest_od_t), B_FALSE) != 0) { 5680 umem_free(od, sizeof (ztest_od_t)); 5681 return; 5682 } 5683 5684 tx = dmu_tx_create(os); 5685 5686 cb_data[0] = ztest_create_cb_data(os, 0); 5687 dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[0]); 5688 5689 dmu_tx_hold_write(tx, od->od_object, 0, sizeof (uint64_t)); 5690 5691 /* Every once in a while, abort the transaction on purpose */ 5692 if (ztest_random(100) == 0) 5693 error = -1; 5694 5695 if (!error) 5696 error = dmu_tx_assign(tx, TXG_NOWAIT); 5697 5698 txg = error ? 
0 : dmu_tx_get_txg(tx); 5699 5700 cb_data[0]->zcd_txg = txg; 5701 cb_data[1] = ztest_create_cb_data(os, txg); 5702 dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[1]); 5703 5704 if (error) { 5705 /* 5706 * It's not a strict requirement to call the registered 5707 * callbacks from inside dmu_tx_abort(), but that's what 5708 * it's supposed to happen in the current implementation 5709 * so we will check for that. 5710 */ 5711 for (i = 0; i < 2; i++) { 5712 cb_data[i]->zcd_expected_err = ECANCELED; 5713 VERIFY(!cb_data[i]->zcd_called); 5714 } 5715 5716 dmu_tx_abort(tx); 5717 5718 for (i = 0; i < 2; i++) { 5719 VERIFY(cb_data[i]->zcd_called); 5720 umem_free(cb_data[i], sizeof (ztest_cb_data_t)); 5721 } 5722 5723 umem_free(od, sizeof (ztest_od_t)); 5724 return; 5725 } 5726 5727 cb_data[2] = ztest_create_cb_data(os, txg); 5728 dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[2]); 5729 5730 /* 5731 * Read existing data to make sure there isn't a future leak. 5732 */ 5733 VERIFY0(dmu_read(os, od->od_object, 0, sizeof (uint64_t), 5734 &old_txg, DMU_READ_PREFETCH)); 5735 5736 if (old_txg > txg) 5737 fatal(B_FALSE, 5738 "future leak: got %"PRIu64", open txg is %"PRIu64"", 5739 old_txg, txg); 5740 5741 dmu_write(os, od->od_object, 0, sizeof (uint64_t), &txg, tx); 5742 5743 (void) mutex_enter(&zcl.zcl_callbacks_lock); 5744 5745 /* 5746 * Since commit callbacks don't have any ordering requirement and since 5747 * it is theoretically possible for a commit callback to be called 5748 * after an arbitrary amount of time has elapsed since its txg has been 5749 * synced, it is difficult to reliably determine whether a commit 5750 * callback hasn't been called due to high load or due to a flawed 5751 * implementation. 5752 * 5753 * In practice, we will assume that if after a certain number of txgs a 5754 * commit callback hasn't been called, then most likely there's an 5755 * implementation bug.. 
5756 */ 5757 tmp_cb = list_head(&zcl.zcl_callbacks); 5758 if (tmp_cb != NULL && 5759 tmp_cb->zcd_txg + ZTEST_COMMIT_CB_THRESH < txg) { 5760 fatal(B_FALSE, 5761 "Commit callback threshold exceeded, " 5762 "oldest txg: %"PRIu64", open txg: %"PRIu64"\n", 5763 tmp_cb->zcd_txg, txg); 5764 } 5765 5766 /* 5767 * Let's find the place to insert our callbacks. 5768 * 5769 * Even though the list is ordered by txg, it is possible for the 5770 * insertion point to not be the end because our txg may already be 5771 * quiescing at this point and other callbacks in the open txg 5772 * (from other objsets) may have sneaked in. 5773 */ 5774 tmp_cb = list_tail(&zcl.zcl_callbacks); 5775 while (tmp_cb != NULL && tmp_cb->zcd_txg > txg) 5776 tmp_cb = list_prev(&zcl.zcl_callbacks, tmp_cb); 5777 5778 /* Add the 3 callbacks to the list */ 5779 for (i = 0; i < 3; i++) { 5780 if (tmp_cb == NULL) 5781 list_insert_head(&zcl.zcl_callbacks, cb_data[i]); 5782 else 5783 list_insert_after(&zcl.zcl_callbacks, tmp_cb, 5784 cb_data[i]); 5785 5786 cb_data[i]->zcd_added = B_TRUE; 5787 VERIFY(!cb_data[i]->zcd_called); 5788 5789 tmp_cb = cb_data[i]; 5790 } 5791 5792 zc_cb_counter += 3; 5793 5794 (void) mutex_exit(&zcl.zcl_callbacks_lock); 5795 5796 dmu_tx_commit(tx); 5797 5798 umem_free(od, sizeof (ztest_od_t)); 5799 } 5800 5801 /* 5802 * Visit each object in the dataset. Verify that its properties 5803 * are consistent what was stored in the block tag when it was created, 5804 * and that its unused bonus buffer space has not been overwritten. 
5805 */ 5806 void 5807 ztest_verify_dnode_bt(ztest_ds_t *zd, uint64_t id) 5808 { 5809 (void) id; 5810 objset_t *os = zd->zd_os; 5811 uint64_t obj; 5812 int err = 0; 5813 5814 for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE, 0)) { 5815 ztest_block_tag_t *bt = NULL; 5816 dmu_object_info_t doi; 5817 dmu_buf_t *db; 5818 5819 ztest_object_lock(zd, obj, RL_READER); 5820 if (dmu_bonus_hold(os, obj, FTAG, &db) != 0) { 5821 ztest_object_unlock(zd, obj); 5822 continue; 5823 } 5824 5825 dmu_object_info_from_db(db, &doi); 5826 if (doi.doi_bonus_size >= sizeof (*bt)) 5827 bt = ztest_bt_bonus(db); 5828 5829 if (bt && bt->bt_magic == BT_MAGIC) { 5830 ztest_bt_verify(bt, os, obj, doi.doi_dnodesize, 5831 bt->bt_offset, bt->bt_gen, bt->bt_txg, 5832 bt->bt_crtxg); 5833 ztest_verify_unused_bonus(db, bt, obj, os, bt->bt_gen); 5834 } 5835 5836 dmu_buf_rele(db, FTAG); 5837 ztest_object_unlock(zd, obj); 5838 } 5839 } 5840 5841 void 5842 ztest_dsl_prop_get_set(ztest_ds_t *zd, uint64_t id) 5843 { 5844 (void) id; 5845 zfs_prop_t proplist[] = { 5846 ZFS_PROP_CHECKSUM, 5847 ZFS_PROP_COMPRESSION, 5848 ZFS_PROP_COPIES, 5849 ZFS_PROP_DEDUP 5850 }; 5851 5852 (void) pthread_rwlock_rdlock(&ztest_name_lock); 5853 5854 for (int p = 0; p < sizeof (proplist) / sizeof (proplist[0]); p++) { 5855 int error = ztest_dsl_prop_set_uint64(zd->zd_name, proplist[p], 5856 ztest_random_dsl_prop(proplist[p]), (int)ztest_random(2)); 5857 ASSERT(error == 0 || error == ENOSPC); 5858 } 5859 5860 int error = ztest_dsl_prop_set_uint64(zd->zd_name, ZFS_PROP_RECORDSIZE, 5861 ztest_random_blocksize(), (int)ztest_random(2)); 5862 ASSERT(error == 0 || error == ENOSPC); 5863 5864 (void) pthread_rwlock_unlock(&ztest_name_lock); 5865 } 5866 5867 void 5868 ztest_spa_prop_get_set(ztest_ds_t *zd, uint64_t id) 5869 { 5870 (void) zd, (void) id; 5871 nvlist_t *props = NULL; 5872 5873 (void) pthread_rwlock_rdlock(&ztest_name_lock); 5874 5875 (void) ztest_spa_prop_set_uint64(ZPOOL_PROP_AUTOTRIM, ztest_random(2)); 5876 5877 
VERIFY0(spa_prop_get(ztest_spa, &props)); 5878 5879 if (ztest_opts.zo_verbose >= 6) 5880 dump_nvlist(props, 4); 5881 5882 fnvlist_free(props); 5883 5884 (void) pthread_rwlock_unlock(&ztest_name_lock); 5885 } 5886 5887 static int 5888 user_release_one(const char *snapname, const char *holdname) 5889 { 5890 nvlist_t *snaps, *holds; 5891 int error; 5892 5893 snaps = fnvlist_alloc(); 5894 holds = fnvlist_alloc(); 5895 fnvlist_add_boolean(holds, holdname); 5896 fnvlist_add_nvlist(snaps, snapname, holds); 5897 fnvlist_free(holds); 5898 error = dsl_dataset_user_release(snaps, NULL); 5899 fnvlist_free(snaps); 5900 return (error); 5901 } 5902 5903 /* 5904 * Test snapshot hold/release and deferred destroy. 5905 */ 5906 void 5907 ztest_dmu_snapshot_hold(ztest_ds_t *zd, uint64_t id) 5908 { 5909 int error; 5910 objset_t *os = zd->zd_os; 5911 objset_t *origin; 5912 char snapname[100]; 5913 char fullname[100]; 5914 char clonename[100]; 5915 char tag[100]; 5916 char osname[ZFS_MAX_DATASET_NAME_LEN]; 5917 nvlist_t *holds; 5918 5919 (void) pthread_rwlock_rdlock(&ztest_name_lock); 5920 5921 dmu_objset_name(os, osname); 5922 5923 (void) snprintf(snapname, sizeof (snapname), "sh1_%"PRIu64"", id); 5924 (void) snprintf(fullname, sizeof (fullname), "%s@%s", osname, snapname); 5925 (void) snprintf(clonename, sizeof (clonename), "%s/ch1_%"PRIu64"", 5926 osname, id); 5927 (void) snprintf(tag, sizeof (tag), "tag_%"PRIu64"", id); 5928 5929 /* 5930 * Clean up from any previous run. 5931 */ 5932 error = dsl_destroy_head(clonename); 5933 if (error != ENOENT) 5934 ASSERT0(error); 5935 error = user_release_one(fullname, tag); 5936 if (error != ESRCH && error != ENOENT) 5937 ASSERT0(error); 5938 error = dsl_destroy_snapshot(fullname, B_FALSE); 5939 if (error != ENOENT) 5940 ASSERT0(error); 5941 5942 /* 5943 * Create snapshot, clone it, mark snap for deferred destroy, 5944 * destroy clone, verify snap was also destroyed. 
5945 */ 5946 error = dmu_objset_snapshot_one(osname, snapname); 5947 if (error) { 5948 if (error == ENOSPC) { 5949 ztest_record_enospc("dmu_objset_snapshot"); 5950 goto out; 5951 } 5952 fatal(B_FALSE, "dmu_objset_snapshot(%s) = %d", fullname, error); 5953 } 5954 5955 error = dmu_objset_clone(clonename, fullname); 5956 if (error) { 5957 if (error == ENOSPC) { 5958 ztest_record_enospc("dmu_objset_clone"); 5959 goto out; 5960 } 5961 fatal(B_FALSE, "dmu_objset_clone(%s) = %d", clonename, error); 5962 } 5963 5964 error = dsl_destroy_snapshot(fullname, B_TRUE); 5965 if (error) { 5966 fatal(B_FALSE, "dsl_destroy_snapshot(%s, B_TRUE) = %d", 5967 fullname, error); 5968 } 5969 5970 error = dsl_destroy_head(clonename); 5971 if (error) 5972 fatal(B_FALSE, "dsl_destroy_head(%s) = %d", clonename, error); 5973 5974 error = dmu_objset_hold(fullname, FTAG, &origin); 5975 if (error != ENOENT) 5976 fatal(B_FALSE, "dmu_objset_hold(%s) = %d", fullname, error); 5977 5978 /* 5979 * Create snapshot, add temporary hold, verify that we can't 5980 * destroy a held snapshot, mark for deferred destroy, 5981 * release hold, verify snapshot was destroyed. 
5982 */ 5983 error = dmu_objset_snapshot_one(osname, snapname); 5984 if (error) { 5985 if (error == ENOSPC) { 5986 ztest_record_enospc("dmu_objset_snapshot"); 5987 goto out; 5988 } 5989 fatal(B_FALSE, "dmu_objset_snapshot(%s) = %d", fullname, error); 5990 } 5991 5992 holds = fnvlist_alloc(); 5993 fnvlist_add_string(holds, fullname, tag); 5994 error = dsl_dataset_user_hold(holds, 0, NULL); 5995 fnvlist_free(holds); 5996 5997 if (error == ENOSPC) { 5998 ztest_record_enospc("dsl_dataset_user_hold"); 5999 goto out; 6000 } else if (error) { 6001 fatal(B_FALSE, "dsl_dataset_user_hold(%s, %s) = %u", 6002 fullname, tag, error); 6003 } 6004 6005 error = dsl_destroy_snapshot(fullname, B_FALSE); 6006 if (error != EBUSY) { 6007 fatal(B_FALSE, "dsl_destroy_snapshot(%s, B_FALSE) = %d", 6008 fullname, error); 6009 } 6010 6011 error = dsl_destroy_snapshot(fullname, B_TRUE); 6012 if (error) { 6013 fatal(B_FALSE, "dsl_destroy_snapshot(%s, B_TRUE) = %d", 6014 fullname, error); 6015 } 6016 6017 error = user_release_one(fullname, tag); 6018 if (error) 6019 fatal(B_FALSE, "user_release_one(%s, %s) = %d", 6020 fullname, tag, error); 6021 6022 VERIFY3U(dmu_objset_hold(fullname, FTAG, &origin), ==, ENOENT); 6023 6024 out: 6025 (void) pthread_rwlock_unlock(&ztest_name_lock); 6026 } 6027 6028 /* 6029 * Inject random faults into the on-disk data. 
6030 */ 6031 void 6032 ztest_fault_inject(ztest_ds_t *zd, uint64_t id) 6033 { 6034 (void) zd, (void) id; 6035 ztest_shared_t *zs = ztest_shared; 6036 spa_t *spa = ztest_spa; 6037 int fd; 6038 uint64_t offset; 6039 uint64_t leaves; 6040 uint64_t bad = 0x1990c0ffeedecadeull; 6041 uint64_t top, leaf; 6042 char *path0; 6043 char *pathrand; 6044 size_t fsize; 6045 int bshift = SPA_MAXBLOCKSHIFT + 2; 6046 int iters = 1000; 6047 int maxfaults; 6048 int mirror_save; 6049 vdev_t *vd0 = NULL; 6050 uint64_t guid0 = 0; 6051 boolean_t islog = B_FALSE; 6052 6053 path0 = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); 6054 pathrand = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); 6055 6056 mutex_enter(&ztest_vdev_lock); 6057 6058 /* 6059 * Device removal is in progress, fault injection must be disabled 6060 * until it completes and the pool is scrubbed. The fault injection 6061 * strategy for damaging blocks does not take in to account evacuated 6062 * blocks which may have already been damaged. 6063 */ 6064 if (ztest_device_removal_active) { 6065 mutex_exit(&ztest_vdev_lock); 6066 goto out; 6067 } 6068 6069 maxfaults = MAXFAULTS(zs); 6070 leaves = MAX(zs->zs_mirrors, 1) * ztest_opts.zo_raid_children; 6071 mirror_save = zs->zs_mirrors; 6072 mutex_exit(&ztest_vdev_lock); 6073 6074 ASSERT3U(leaves, >=, 1); 6075 6076 /* 6077 * While ztest is running the number of leaves will not change. This 6078 * is critical for the fault injection logic as it determines where 6079 * errors can be safely injected such that they are always repairable. 6080 * 6081 * When restarting ztest a different number of leaves may be requested 6082 * which will shift the regions to be damaged. This is fine as long 6083 * as the pool has been scrubbed prior to using the new mapping. 6084 * Failure to do can result in non-repairable damage being injected. 6085 */ 6086 if (ztest_pool_scrubbed == B_FALSE) 6087 goto out; 6088 6089 /* 6090 * Grab the name lock as reader. 
There are some operations 6091 * which don't like to have their vdevs changed while 6092 * they are in progress (i.e. spa_change_guid). Those 6093 * operations will have grabbed the name lock as writer. 6094 */ 6095 (void) pthread_rwlock_rdlock(&ztest_name_lock); 6096 6097 /* 6098 * We need SCL_STATE here because we're going to look at vd0->vdev_tsd. 6099 */ 6100 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 6101 6102 if (ztest_random(2) == 0) { 6103 /* 6104 * Inject errors on a normal data device or slog device. 6105 */ 6106 top = ztest_random_vdev_top(spa, B_TRUE); 6107 leaf = ztest_random(leaves) + zs->zs_splits; 6108 6109 /* 6110 * Generate paths to the first leaf in this top-level vdev, 6111 * and to the random leaf we selected. We'll induce transient 6112 * write failures and random online/offline activity on leaf 0, 6113 * and we'll write random garbage to the randomly chosen leaf. 6114 */ 6115 (void) snprintf(path0, MAXPATHLEN, ztest_dev_template, 6116 ztest_opts.zo_dir, ztest_opts.zo_pool, 6117 top * leaves + zs->zs_splits); 6118 (void) snprintf(pathrand, MAXPATHLEN, ztest_dev_template, 6119 ztest_opts.zo_dir, ztest_opts.zo_pool, 6120 top * leaves + leaf); 6121 6122 vd0 = vdev_lookup_by_path(spa->spa_root_vdev, path0); 6123 if (vd0 != NULL && vd0->vdev_top->vdev_islog) 6124 islog = B_TRUE; 6125 6126 /* 6127 * If the top-level vdev needs to be resilvered 6128 * then we only allow faults on the device that is 6129 * resilvering. 6130 */ 6131 if (vd0 != NULL && maxfaults != 1 && 6132 (!vdev_resilver_needed(vd0->vdev_top, NULL, NULL) || 6133 vd0->vdev_resilver_txg != 0)) { 6134 /* 6135 * Make vd0 explicitly claim to be unreadable, 6136 * or unwritable, or reach behind its back 6137 * and close the underlying fd. We can do this if 6138 * maxfaults == 0 because we'll fail and reexecute, 6139 * and we can do it if maxfaults >= 2 because we'll 6140 * have enough redundancy. 
If maxfaults == 1, the 6141 * combination of this with injection of random data 6142 * corruption below exceeds the pool's fault tolerance. 6143 */ 6144 vdev_file_t *vf = vd0->vdev_tsd; 6145 6146 zfs_dbgmsg("injecting fault to vdev %llu; maxfaults=%d", 6147 (long long)vd0->vdev_id, (int)maxfaults); 6148 6149 if (vf != NULL && ztest_random(3) == 0) { 6150 (void) close(vf->vf_file->f_fd); 6151 vf->vf_file->f_fd = -1; 6152 } else if (ztest_random(2) == 0) { 6153 vd0->vdev_cant_read = B_TRUE; 6154 } else { 6155 vd0->vdev_cant_write = B_TRUE; 6156 } 6157 guid0 = vd0->vdev_guid; 6158 } 6159 } else { 6160 /* 6161 * Inject errors on an l2cache device. 6162 */ 6163 spa_aux_vdev_t *sav = &spa->spa_l2cache; 6164 6165 if (sav->sav_count == 0) { 6166 spa_config_exit(spa, SCL_STATE, FTAG); 6167 (void) pthread_rwlock_unlock(&ztest_name_lock); 6168 goto out; 6169 } 6170 vd0 = sav->sav_vdevs[ztest_random(sav->sav_count)]; 6171 guid0 = vd0->vdev_guid; 6172 (void) strlcpy(path0, vd0->vdev_path, MAXPATHLEN); 6173 (void) strlcpy(pathrand, vd0->vdev_path, MAXPATHLEN); 6174 6175 leaf = 0; 6176 leaves = 1; 6177 maxfaults = INT_MAX; /* no limit on cache devices */ 6178 } 6179 6180 spa_config_exit(spa, SCL_STATE, FTAG); 6181 (void) pthread_rwlock_unlock(&ztest_name_lock); 6182 6183 /* 6184 * If we can tolerate two or more faults, or we're dealing 6185 * with a slog, randomly online/offline vd0. 6186 */ 6187 if ((maxfaults >= 2 || islog) && guid0 != 0) { 6188 if (ztest_random(10) < 6) { 6189 int flags = (ztest_random(2) == 0 ? 6190 ZFS_OFFLINE_TEMPORARY : 0); 6191 6192 /* 6193 * We have to grab the zs_name_lock as writer to 6194 * prevent a race between offlining a slog and 6195 * destroying a dataset. Offlining the slog will 6196 * grab a reference on the dataset which may cause 6197 * dsl_destroy_head() to fail with EBUSY thus 6198 * leaving the dataset in an inconsistent state. 
6199 */ 6200 if (islog) 6201 (void) pthread_rwlock_wrlock(&ztest_name_lock); 6202 6203 VERIFY3U(vdev_offline(spa, guid0, flags), !=, EBUSY); 6204 6205 if (islog) 6206 (void) pthread_rwlock_unlock(&ztest_name_lock); 6207 } else { 6208 /* 6209 * Ideally we would like to be able to randomly 6210 * call vdev_[on|off]line without holding locks 6211 * to force unpredictable failures but the side 6212 * effects of vdev_[on|off]line prevent us from 6213 * doing so. We grab the ztest_vdev_lock here to 6214 * prevent a race between injection testing and 6215 * aux_vdev removal. 6216 */ 6217 mutex_enter(&ztest_vdev_lock); 6218 (void) vdev_online(spa, guid0, 0, NULL); 6219 mutex_exit(&ztest_vdev_lock); 6220 } 6221 } 6222 6223 if (maxfaults == 0) 6224 goto out; 6225 6226 /* 6227 * We have at least single-fault tolerance, so inject data corruption. 6228 */ 6229 fd = open(pathrand, O_RDWR); 6230 6231 if (fd == -1) /* we hit a gap in the device namespace */ 6232 goto out; 6233 6234 fsize = lseek(fd, 0, SEEK_END); 6235 6236 while (--iters != 0) { 6237 /* 6238 * The offset must be chosen carefully to ensure that 6239 * we do not inject a given logical block with errors 6240 * on two different leaf devices, because ZFS can not 6241 * tolerate that (if maxfaults==1). 6242 * 6243 * To achieve this we divide each leaf device into 6244 * chunks of size (# leaves * SPA_MAXBLOCKSIZE * 4). 6245 * Each chunk is further divided into error-injection 6246 * ranges (can accept errors) and clear ranges (we do 6247 * not inject errors in those). Each error-injection 6248 * range can accept errors only for a single leaf vdev. 6249 * Error-injection ranges are separated by clear ranges. 
6250 * 6251 * For example, with 3 leaves, each chunk looks like: 6252 * 0 to 32M: injection range for leaf 0 6253 * 32M to 64M: clear range - no injection allowed 6254 * 64M to 96M: injection range for leaf 1 6255 * 96M to 128M: clear range - no injection allowed 6256 * 128M to 160M: injection range for leaf 2 6257 * 160M to 192M: clear range - no injection allowed 6258 * 6259 * Each clear range must be large enough such that a 6260 * single block cannot straddle it. This way a block 6261 * can't be a target in two different injection ranges 6262 * (on different leaf vdevs). 6263 */ 6264 offset = ztest_random(fsize / (leaves << bshift)) * 6265 (leaves << bshift) + (leaf << bshift) + 6266 (ztest_random(1ULL << (bshift - 1)) & -8ULL); 6267 6268 /* 6269 * Only allow damage to the labels at one end of the vdev. 6270 * 6271 * If all labels are damaged, the device will be totally 6272 * inaccessible, which will result in loss of data, 6273 * because we also damage (parts of) the other side of 6274 * the mirror/raidz. 6275 * 6276 * Additionally, we will always have both an even and an 6277 * odd label, so that we can handle crashes in the 6278 * middle of vdev_config_sync(). 6279 */ 6280 if ((leaf & 1) == 0 && offset < VDEV_LABEL_START_SIZE) 6281 continue; 6282 6283 /* 6284 * The two end labels are stored at the "end" of the disk, but 6285 * the end of the disk (vdev_psize) is aligned to 6286 * sizeof (vdev_label_t). 
6287 */ 6288 uint64_t psize = P2ALIGN(fsize, sizeof (vdev_label_t)); 6289 if ((leaf & 1) == 1 && 6290 offset + sizeof (bad) > psize - VDEV_LABEL_END_SIZE) 6291 continue; 6292 6293 mutex_enter(&ztest_vdev_lock); 6294 if (mirror_save != zs->zs_mirrors) { 6295 mutex_exit(&ztest_vdev_lock); 6296 (void) close(fd); 6297 goto out; 6298 } 6299 6300 if (pwrite(fd, &bad, sizeof (bad), offset) != sizeof (bad)) 6301 fatal(B_TRUE, 6302 "can't inject bad word at 0x%"PRIx64" in %s", 6303 offset, pathrand); 6304 6305 mutex_exit(&ztest_vdev_lock); 6306 6307 if (ztest_opts.zo_verbose >= 7) 6308 (void) printf("injected bad word into %s," 6309 " offset 0x%"PRIx64"\n", pathrand, offset); 6310 } 6311 6312 (void) close(fd); 6313 out: 6314 umem_free(path0, MAXPATHLEN); 6315 umem_free(pathrand, MAXPATHLEN); 6316 } 6317 6318 /* 6319 * By design ztest will never inject uncorrectable damage in to the pool. 6320 * Issue a scrub, wait for it to complete, and verify there is never any 6321 * persistent damage. 6322 * 6323 * Only after a full scrub has been completed is it safe to start injecting 6324 * data corruption. See the comment in zfs_fault_inject(). 6325 */ 6326 static int 6327 ztest_scrub_impl(spa_t *spa) 6328 { 6329 int error = spa_scan(spa, POOL_SCAN_SCRUB); 6330 if (error) 6331 return (error); 6332 6333 while (dsl_scan_scrubbing(spa_get_dsl(spa))) 6334 txg_wait_synced(spa_get_dsl(spa), 0); 6335 6336 if (spa_approx_errlog_size(spa) > 0) 6337 return (ECKSUM); 6338 6339 ztest_pool_scrubbed = B_TRUE; 6340 6341 return (0); 6342 } 6343 6344 /* 6345 * Scrub the pool. 6346 */ 6347 void 6348 ztest_scrub(ztest_ds_t *zd, uint64_t id) 6349 { 6350 (void) zd, (void) id; 6351 spa_t *spa = ztest_spa; 6352 int error; 6353 6354 /* 6355 * Scrub in progress by device removal. 6356 */ 6357 if (ztest_device_removal_active) 6358 return; 6359 6360 /* 6361 * Start a scrub, wait a moment, then force a restart. 
6362 */ 6363 (void) spa_scan(spa, POOL_SCAN_SCRUB); 6364 (void) poll(NULL, 0, 100); 6365 6366 error = ztest_scrub_impl(spa); 6367 if (error == EBUSY) 6368 error = 0; 6369 ASSERT0(error); 6370 } 6371 6372 /* 6373 * Change the guid for the pool. 6374 */ 6375 void 6376 ztest_reguid(ztest_ds_t *zd, uint64_t id) 6377 { 6378 (void) zd, (void) id; 6379 spa_t *spa = ztest_spa; 6380 uint64_t orig, load; 6381 int error; 6382 6383 if (ztest_opts.zo_mmp_test) 6384 return; 6385 6386 orig = spa_guid(spa); 6387 load = spa_load_guid(spa); 6388 6389 (void) pthread_rwlock_wrlock(&ztest_name_lock); 6390 error = spa_change_guid(spa); 6391 (void) pthread_rwlock_unlock(&ztest_name_lock); 6392 6393 if (error != 0) 6394 return; 6395 6396 if (ztest_opts.zo_verbose >= 4) { 6397 (void) printf("Changed guid old %"PRIu64" -> %"PRIu64"\n", 6398 orig, spa_guid(spa)); 6399 } 6400 6401 VERIFY3U(orig, !=, spa_guid(spa)); 6402 VERIFY3U(load, ==, spa_load_guid(spa)); 6403 } 6404 6405 void 6406 ztest_blake3(ztest_ds_t *zd, uint64_t id) 6407 { 6408 (void) zd, (void) id; 6409 hrtime_t end = gethrtime() + NANOSEC; 6410 zio_cksum_salt_t salt; 6411 void *salt_ptr = &salt.zcs_bytes; 6412 struct abd *abd_data, *abd_meta; 6413 void *buf, *templ; 6414 int i, *ptr; 6415 uint32_t size; 6416 BLAKE3_CTX ctx; 6417 const zfs_impl_t *blake3 = zfs_impl_get_ops("blake3"); 6418 6419 size = ztest_random_blocksize(); 6420 buf = umem_alloc(size, UMEM_NOFAIL); 6421 abd_data = abd_alloc(size, B_FALSE); 6422 abd_meta = abd_alloc(size, B_TRUE); 6423 6424 for (i = 0, ptr = buf; i < size / sizeof (*ptr); i++, ptr++) 6425 *ptr = ztest_random(UINT_MAX); 6426 memset(salt_ptr, 'A', 32); 6427 6428 abd_copy_from_buf_off(abd_data, buf, 0, size); 6429 abd_copy_from_buf_off(abd_meta, buf, 0, size); 6430 6431 while (gethrtime() <= end) { 6432 int run_count = 100; 6433 zio_cksum_t zc_ref1, zc_ref2; 6434 zio_cksum_t zc_res1, zc_res2; 6435 6436 void *ref1 = &zc_ref1; 6437 void *ref2 = &zc_ref2; 6438 void *res1 = &zc_res1; 6439 void *res2 = 
&zc_res2; 6440 6441 /* BLAKE3_KEY_LEN = 32 */ 6442 VERIFY0(blake3->setname("generic")); 6443 templ = abd_checksum_blake3_tmpl_init(&salt); 6444 Blake3_InitKeyed(&ctx, salt_ptr); 6445 Blake3_Update(&ctx, buf, size); 6446 Blake3_Final(&ctx, ref1); 6447 zc_ref2 = zc_ref1; 6448 ZIO_CHECKSUM_BSWAP(&zc_ref2); 6449 abd_checksum_blake3_tmpl_free(templ); 6450 6451 VERIFY0(blake3->setname("cycle")); 6452 while (run_count-- > 0) { 6453 6454 /* Test current implementation */ 6455 Blake3_InitKeyed(&ctx, salt_ptr); 6456 Blake3_Update(&ctx, buf, size); 6457 Blake3_Final(&ctx, res1); 6458 zc_res2 = zc_res1; 6459 ZIO_CHECKSUM_BSWAP(&zc_res2); 6460 6461 VERIFY0(memcmp(ref1, res1, 32)); 6462 VERIFY0(memcmp(ref2, res2, 32)); 6463 6464 /* Test ABD - data */ 6465 templ = abd_checksum_blake3_tmpl_init(&salt); 6466 abd_checksum_blake3_native(abd_data, size, 6467 templ, &zc_res1); 6468 abd_checksum_blake3_byteswap(abd_data, size, 6469 templ, &zc_res2); 6470 6471 VERIFY0(memcmp(ref1, res1, 32)); 6472 VERIFY0(memcmp(ref2, res2, 32)); 6473 6474 /* Test ABD - metadata */ 6475 abd_checksum_blake3_native(abd_meta, size, 6476 templ, &zc_res1); 6477 abd_checksum_blake3_byteswap(abd_meta, size, 6478 templ, &zc_res2); 6479 abd_checksum_blake3_tmpl_free(templ); 6480 6481 VERIFY0(memcmp(ref1, res1, 32)); 6482 VERIFY0(memcmp(ref2, res2, 32)); 6483 6484 } 6485 } 6486 6487 abd_free(abd_data); 6488 abd_free(abd_meta); 6489 umem_free(buf, size); 6490 } 6491 6492 void 6493 ztest_fletcher(ztest_ds_t *zd, uint64_t id) 6494 { 6495 (void) zd, (void) id; 6496 hrtime_t end = gethrtime() + NANOSEC; 6497 6498 while (gethrtime() <= end) { 6499 int run_count = 100; 6500 void *buf; 6501 struct abd *abd_data, *abd_meta; 6502 uint32_t size; 6503 int *ptr; 6504 int i; 6505 zio_cksum_t zc_ref; 6506 zio_cksum_t zc_ref_byteswap; 6507 6508 size = ztest_random_blocksize(); 6509 6510 buf = umem_alloc(size, UMEM_NOFAIL); 6511 abd_data = abd_alloc(size, B_FALSE); 6512 abd_meta = abd_alloc(size, B_TRUE); 6513 6514 for (i = 0, ptr 
= buf; i < size / sizeof (*ptr); i++, ptr++) 6515 *ptr = ztest_random(UINT_MAX); 6516 6517 abd_copy_from_buf_off(abd_data, buf, 0, size); 6518 abd_copy_from_buf_off(abd_meta, buf, 0, size); 6519 6520 VERIFY0(fletcher_4_impl_set("scalar")); 6521 fletcher_4_native(buf, size, NULL, &zc_ref); 6522 fletcher_4_byteswap(buf, size, NULL, &zc_ref_byteswap); 6523 6524 VERIFY0(fletcher_4_impl_set("cycle")); 6525 while (run_count-- > 0) { 6526 zio_cksum_t zc; 6527 zio_cksum_t zc_byteswap; 6528 6529 fletcher_4_byteswap(buf, size, NULL, &zc_byteswap); 6530 fletcher_4_native(buf, size, NULL, &zc); 6531 6532 VERIFY0(memcmp(&zc, &zc_ref, sizeof (zc))); 6533 VERIFY0(memcmp(&zc_byteswap, &zc_ref_byteswap, 6534 sizeof (zc_byteswap))); 6535 6536 /* Test ABD - data */ 6537 abd_fletcher_4_byteswap(abd_data, size, NULL, 6538 &zc_byteswap); 6539 abd_fletcher_4_native(abd_data, size, NULL, &zc); 6540 6541 VERIFY0(memcmp(&zc, &zc_ref, sizeof (zc))); 6542 VERIFY0(memcmp(&zc_byteswap, &zc_ref_byteswap, 6543 sizeof (zc_byteswap))); 6544 6545 /* Test ABD - metadata */ 6546 abd_fletcher_4_byteswap(abd_meta, size, NULL, 6547 &zc_byteswap); 6548 abd_fletcher_4_native(abd_meta, size, NULL, &zc); 6549 6550 VERIFY0(memcmp(&zc, &zc_ref, sizeof (zc))); 6551 VERIFY0(memcmp(&zc_byteswap, &zc_ref_byteswap, 6552 sizeof (zc_byteswap))); 6553 6554 } 6555 6556 umem_free(buf, size); 6557 abd_free(abd_data); 6558 abd_free(abd_meta); 6559 } 6560 } 6561 6562 void 6563 ztest_fletcher_incr(ztest_ds_t *zd, uint64_t id) 6564 { 6565 (void) zd, (void) id; 6566 void *buf; 6567 size_t size; 6568 int *ptr; 6569 int i; 6570 zio_cksum_t zc_ref; 6571 zio_cksum_t zc_ref_bswap; 6572 6573 hrtime_t end = gethrtime() + NANOSEC; 6574 6575 while (gethrtime() <= end) { 6576 int run_count = 100; 6577 6578 size = ztest_random_blocksize(); 6579 buf = umem_alloc(size, UMEM_NOFAIL); 6580 6581 for (i = 0, ptr = buf; i < size / sizeof (*ptr); i++, ptr++) 6582 *ptr = ztest_random(UINT_MAX); 6583 6584 VERIFY0(fletcher_4_impl_set("scalar")); 
6585 fletcher_4_native(buf, size, NULL, &zc_ref); 6586 fletcher_4_byteswap(buf, size, NULL, &zc_ref_bswap); 6587 6588 VERIFY0(fletcher_4_impl_set("cycle")); 6589 6590 while (run_count-- > 0) { 6591 zio_cksum_t zc; 6592 zio_cksum_t zc_bswap; 6593 size_t pos = 0; 6594 6595 ZIO_SET_CHECKSUM(&zc, 0, 0, 0, 0); 6596 ZIO_SET_CHECKSUM(&zc_bswap, 0, 0, 0, 0); 6597 6598 while (pos < size) { 6599 size_t inc = 64 * ztest_random(size / 67); 6600 /* sometimes add few bytes to test non-simd */ 6601 if (ztest_random(100) < 10) 6602 inc += P2ALIGN(ztest_random(64), 6603 sizeof (uint32_t)); 6604 6605 if (inc > (size - pos)) 6606 inc = size - pos; 6607 6608 fletcher_4_incremental_native(buf + pos, inc, 6609 &zc); 6610 fletcher_4_incremental_byteswap(buf + pos, inc, 6611 &zc_bswap); 6612 6613 pos += inc; 6614 } 6615 6616 VERIFY3U(pos, ==, size); 6617 6618 VERIFY(ZIO_CHECKSUM_EQUAL(zc, zc_ref)); 6619 VERIFY(ZIO_CHECKSUM_EQUAL(zc_bswap, zc_ref_bswap)); 6620 6621 /* 6622 * verify if incremental on the whole buffer is 6623 * equivalent to non-incremental version 6624 */ 6625 ZIO_SET_CHECKSUM(&zc, 0, 0, 0, 0); 6626 ZIO_SET_CHECKSUM(&zc_bswap, 0, 0, 0, 0); 6627 6628 fletcher_4_incremental_native(buf, size, &zc); 6629 fletcher_4_incremental_byteswap(buf, size, &zc_bswap); 6630 6631 VERIFY(ZIO_CHECKSUM_EQUAL(zc, zc_ref)); 6632 VERIFY(ZIO_CHECKSUM_EQUAL(zc_bswap, zc_ref_bswap)); 6633 } 6634 6635 umem_free(buf, size); 6636 } 6637 } 6638 6639 static int 6640 ztest_set_global_vars(void) 6641 { 6642 for (size_t i = 0; i < ztest_opts.zo_gvars_count; i++) { 6643 char *kv = ztest_opts.zo_gvars[i]; 6644 VERIFY3U(strlen(kv), <=, ZO_GVARS_MAX_ARGLEN); 6645 VERIFY3U(strlen(kv), >, 0); 6646 int err = set_global_var(kv); 6647 if (ztest_opts.zo_verbose > 0) { 6648 (void) printf("setting global var %s ... %s\n", kv, 6649 err ? 
"failed" : "ok"); 6650 } 6651 if (err != 0) { 6652 (void) fprintf(stderr, 6653 "failed to set global var '%s'\n", kv); 6654 return (err); 6655 } 6656 } 6657 return (0); 6658 } 6659 6660 static char ** 6661 ztest_global_vars_to_zdb_args(void) 6662 { 6663 char **args = calloc(2*ztest_opts.zo_gvars_count + 1, sizeof (char *)); 6664 char **cur = args; 6665 if (args == NULL) 6666 return (NULL); 6667 for (size_t i = 0; i < ztest_opts.zo_gvars_count; i++) { 6668 *cur++ = (char *)"-o"; 6669 *cur++ = ztest_opts.zo_gvars[i]; 6670 } 6671 ASSERT3P(cur, ==, &args[2*ztest_opts.zo_gvars_count]); 6672 *cur = NULL; 6673 return (args); 6674 } 6675 6676 /* The end of strings is indicated by a NULL element */ 6677 static char * 6678 join_strings(char **strings, const char *sep) 6679 { 6680 size_t totallen = 0; 6681 for (char **sp = strings; *sp != NULL; sp++) { 6682 totallen += strlen(*sp); 6683 totallen += strlen(sep); 6684 } 6685 if (totallen > 0) { 6686 ASSERT(totallen >= strlen(sep)); 6687 totallen -= strlen(sep); 6688 } 6689 6690 size_t buflen = totallen + 1; 6691 char *o = umem_alloc(buflen, UMEM_NOFAIL); /* trailing 0 byte */ 6692 o[0] = '\0'; 6693 for (char **sp = strings; *sp != NULL; sp++) { 6694 size_t would; 6695 would = strlcat(o, *sp, buflen); 6696 VERIFY3U(would, <, buflen); 6697 if (*(sp+1) == NULL) { 6698 break; 6699 } 6700 would = strlcat(o, sep, buflen); 6701 VERIFY3U(would, <, buflen); 6702 } 6703 ASSERT3S(strlen(o), ==, totallen); 6704 return (o); 6705 } 6706 6707 static int 6708 ztest_check_path(char *path) 6709 { 6710 struct stat s; 6711 /* return true on success */ 6712 return (!stat(path, &s)); 6713 } 6714 6715 static void 6716 ztest_get_zdb_bin(char *bin, int len) 6717 { 6718 char *zdb_path; 6719 /* 6720 * Try to use $ZDB and in-tree zdb path. If not successful, just 6721 * let popen to search through PATH. 
6722 */ 6723 if ((zdb_path = getenv("ZDB"))) { 6724 strlcpy(bin, zdb_path, len); /* In env */ 6725 if (!ztest_check_path(bin)) { 6726 ztest_dump_core = 0; 6727 fatal(B_TRUE, "invalid ZDB '%s'", bin); 6728 } 6729 return; 6730 } 6731 6732 VERIFY3P(realpath(getexecname(), bin), !=, NULL); 6733 if (strstr(bin, ".libs/ztest")) { 6734 strstr(bin, ".libs/ztest")[0] = '\0'; /* In-tree */ 6735 strcat(bin, "zdb"); 6736 if (ztest_check_path(bin)) 6737 return; 6738 } 6739 strcpy(bin, "zdb"); 6740 } 6741 6742 static vdev_t * 6743 ztest_random_concrete_vdev_leaf(vdev_t *vd) 6744 { 6745 if (vd == NULL) 6746 return (NULL); 6747 6748 if (vd->vdev_children == 0) 6749 return (vd); 6750 6751 vdev_t *eligible[vd->vdev_children]; 6752 int eligible_idx = 0, i; 6753 for (i = 0; i < vd->vdev_children; i++) { 6754 vdev_t *cvd = vd->vdev_child[i]; 6755 if (cvd->vdev_top->vdev_removing) 6756 continue; 6757 if (cvd->vdev_children > 0 || 6758 (vdev_is_concrete(cvd) && !cvd->vdev_detached)) { 6759 eligible[eligible_idx++] = cvd; 6760 } 6761 } 6762 VERIFY3S(eligible_idx, >, 0); 6763 6764 uint64_t child_no = ztest_random(eligible_idx); 6765 return (ztest_random_concrete_vdev_leaf(eligible[child_no])); 6766 } 6767 6768 void 6769 ztest_initialize(ztest_ds_t *zd, uint64_t id) 6770 { 6771 (void) zd, (void) id; 6772 spa_t *spa = ztest_spa; 6773 int error = 0; 6774 6775 mutex_enter(&ztest_vdev_lock); 6776 6777 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 6778 6779 /* Random leaf vdev */ 6780 vdev_t *rand_vd = ztest_random_concrete_vdev_leaf(spa->spa_root_vdev); 6781 if (rand_vd == NULL) { 6782 spa_config_exit(spa, SCL_VDEV, FTAG); 6783 mutex_exit(&ztest_vdev_lock); 6784 return; 6785 } 6786 6787 /* 6788 * The random vdev we've selected may change as soon as we 6789 * drop the spa_config_lock. We create local copies of things 6790 * we're interested in. 
6791 */ 6792 uint64_t guid = rand_vd->vdev_guid; 6793 char *path = strdup(rand_vd->vdev_path); 6794 boolean_t active = rand_vd->vdev_initialize_thread != NULL; 6795 6796 zfs_dbgmsg("vd %px, guid %llu", rand_vd, (u_longlong_t)guid); 6797 spa_config_exit(spa, SCL_VDEV, FTAG); 6798 6799 uint64_t cmd = ztest_random(POOL_INITIALIZE_FUNCS); 6800 6801 nvlist_t *vdev_guids = fnvlist_alloc(); 6802 nvlist_t *vdev_errlist = fnvlist_alloc(); 6803 fnvlist_add_uint64(vdev_guids, path, guid); 6804 error = spa_vdev_initialize(spa, vdev_guids, cmd, vdev_errlist); 6805 fnvlist_free(vdev_guids); 6806 fnvlist_free(vdev_errlist); 6807 6808 switch (cmd) { 6809 case POOL_INITIALIZE_CANCEL: 6810 if (ztest_opts.zo_verbose >= 4) { 6811 (void) printf("Cancel initialize %s", path); 6812 if (!active) 6813 (void) printf(" failed (no initialize active)"); 6814 (void) printf("\n"); 6815 } 6816 break; 6817 case POOL_INITIALIZE_START: 6818 if (ztest_opts.zo_verbose >= 4) { 6819 (void) printf("Start initialize %s", path); 6820 if (active && error == 0) 6821 (void) printf(" failed (already active)"); 6822 else if (error != 0) 6823 (void) printf(" failed (error %d)", error); 6824 (void) printf("\n"); 6825 } 6826 break; 6827 case POOL_INITIALIZE_SUSPEND: 6828 if (ztest_opts.zo_verbose >= 4) { 6829 (void) printf("Suspend initialize %s", path); 6830 if (!active) 6831 (void) printf(" failed (no initialize active)"); 6832 (void) printf("\n"); 6833 } 6834 break; 6835 } 6836 free(path); 6837 mutex_exit(&ztest_vdev_lock); 6838 } 6839 6840 void 6841 ztest_trim(ztest_ds_t *zd, uint64_t id) 6842 { 6843 (void) zd, (void) id; 6844 spa_t *spa = ztest_spa; 6845 int error = 0; 6846 6847 mutex_enter(&ztest_vdev_lock); 6848 6849 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 6850 6851 /* Random leaf vdev */ 6852 vdev_t *rand_vd = ztest_random_concrete_vdev_leaf(spa->spa_root_vdev); 6853 if (rand_vd == NULL) { 6854 spa_config_exit(spa, SCL_VDEV, FTAG); 6855 mutex_exit(&ztest_vdev_lock); 6856 return; 6857 } 6858 6859 
/* 6860 * The random vdev we've selected may change as soon as we 6861 * drop the spa_config_lock. We create local copies of things 6862 * we're interested in. 6863 */ 6864 uint64_t guid = rand_vd->vdev_guid; 6865 char *path = strdup(rand_vd->vdev_path); 6866 boolean_t active = rand_vd->vdev_trim_thread != NULL; 6867 6868 zfs_dbgmsg("vd %p, guid %llu", rand_vd, (u_longlong_t)guid); 6869 spa_config_exit(spa, SCL_VDEV, FTAG); 6870 6871 uint64_t cmd = ztest_random(POOL_TRIM_FUNCS); 6872 uint64_t rate = 1 << ztest_random(30); 6873 boolean_t partial = (ztest_random(5) > 0); 6874 boolean_t secure = (ztest_random(5) > 0); 6875 6876 nvlist_t *vdev_guids = fnvlist_alloc(); 6877 nvlist_t *vdev_errlist = fnvlist_alloc(); 6878 fnvlist_add_uint64(vdev_guids, path, guid); 6879 error = spa_vdev_trim(spa, vdev_guids, cmd, rate, partial, 6880 secure, vdev_errlist); 6881 fnvlist_free(vdev_guids); 6882 fnvlist_free(vdev_errlist); 6883 6884 switch (cmd) { 6885 case POOL_TRIM_CANCEL: 6886 if (ztest_opts.zo_verbose >= 4) { 6887 (void) printf("Cancel TRIM %s", path); 6888 if (!active) 6889 (void) printf(" failed (no TRIM active)"); 6890 (void) printf("\n"); 6891 } 6892 break; 6893 case POOL_TRIM_START: 6894 if (ztest_opts.zo_verbose >= 4) { 6895 (void) printf("Start TRIM %s", path); 6896 if (active && error == 0) 6897 (void) printf(" failed (already active)"); 6898 else if (error != 0) 6899 (void) printf(" failed (error %d)", error); 6900 (void) printf("\n"); 6901 } 6902 break; 6903 case POOL_TRIM_SUSPEND: 6904 if (ztest_opts.zo_verbose >= 4) { 6905 (void) printf("Suspend TRIM %s", path); 6906 if (!active) 6907 (void) printf(" failed (no TRIM active)"); 6908 (void) printf("\n"); 6909 } 6910 break; 6911 } 6912 free(path); 6913 mutex_exit(&ztest_vdev_lock); 6914 } 6915 6916 /* 6917 * Verify pool integrity by running zdb. 
6918 */ 6919 static void 6920 ztest_run_zdb(const char *pool) 6921 { 6922 int status; 6923 char *bin; 6924 char *zdb; 6925 char *zbuf; 6926 const int len = MAXPATHLEN + MAXNAMELEN + 20; 6927 FILE *fp; 6928 6929 bin = umem_alloc(len, UMEM_NOFAIL); 6930 zdb = umem_alloc(len, UMEM_NOFAIL); 6931 zbuf = umem_alloc(1024, UMEM_NOFAIL); 6932 6933 ztest_get_zdb_bin(bin, len); 6934 6935 char **set_gvars_args = ztest_global_vars_to_zdb_args(); 6936 if (set_gvars_args == NULL) { 6937 fatal(B_FALSE, "Failed to allocate memory in " 6938 "ztest_global_vars_to_zdb_args(). Cannot run zdb.\n"); 6939 } 6940 char *set_gvars_args_joined = join_strings(set_gvars_args, " "); 6941 free(set_gvars_args); 6942 6943 size_t would = snprintf(zdb, len, 6944 "%s -bcc%s%s -G -d -Y -e -y %s -p %s %s", 6945 bin, 6946 ztest_opts.zo_verbose >= 3 ? "s" : "", 6947 ztest_opts.zo_verbose >= 4 ? "v" : "", 6948 set_gvars_args_joined, 6949 ztest_opts.zo_dir, 6950 pool); 6951 ASSERT3U(would, <, len); 6952 6953 umem_free(set_gvars_args_joined, strlen(set_gvars_args_joined) + 1); 6954 6955 if (ztest_opts.zo_verbose >= 5) 6956 (void) printf("Executing %s\n", zdb); 6957 6958 fp = popen(zdb, "r"); 6959 6960 while (fgets(zbuf, 1024, fp) != NULL) 6961 if (ztest_opts.zo_verbose >= 3) 6962 (void) printf("%s", zbuf); 6963 6964 status = pclose(fp); 6965 6966 if (status == 0) 6967 goto out; 6968 6969 ztest_dump_core = 0; 6970 if (WIFEXITED(status)) 6971 fatal(B_FALSE, "'%s' exit code %d", zdb, WEXITSTATUS(status)); 6972 else 6973 fatal(B_FALSE, "'%s' died with signal %d", 6974 zdb, WTERMSIG(status)); 6975 out: 6976 umem_free(bin, len); 6977 umem_free(zdb, len); 6978 umem_free(zbuf, 1024); 6979 } 6980 6981 static void 6982 ztest_walk_pool_directory(const char *header) 6983 { 6984 spa_t *spa = NULL; 6985 6986 if (ztest_opts.zo_verbose >= 6) 6987 (void) puts(header); 6988 6989 mutex_enter(&spa_namespace_lock); 6990 while ((spa = spa_next(spa)) != NULL) 6991 if (ztest_opts.zo_verbose >= 6) 6992 (void) printf("\t%s\n", 
spa_name(spa)); 6993 mutex_exit(&spa_namespace_lock); 6994 } 6995 6996 static void 6997 ztest_spa_import_export(char *oldname, char *newname) 6998 { 6999 nvlist_t *config, *newconfig; 7000 uint64_t pool_guid; 7001 spa_t *spa; 7002 int error; 7003 7004 if (ztest_opts.zo_verbose >= 4) { 7005 (void) printf("import/export: old = %s, new = %s\n", 7006 oldname, newname); 7007 } 7008 7009 /* 7010 * Clean up from previous runs. 7011 */ 7012 (void) spa_destroy(newname); 7013 7014 /* 7015 * Get the pool's configuration and guid. 7016 */ 7017 VERIFY0(spa_open(oldname, &spa, FTAG)); 7018 7019 /* 7020 * Kick off a scrub to tickle scrub/export races. 7021 */ 7022 if (ztest_random(2) == 0) 7023 (void) spa_scan(spa, POOL_SCAN_SCRUB); 7024 7025 pool_guid = spa_guid(spa); 7026 spa_close(spa, FTAG); 7027 7028 ztest_walk_pool_directory("pools before export"); 7029 7030 /* 7031 * Export it. 7032 */ 7033 VERIFY0(spa_export(oldname, &config, B_FALSE, B_FALSE)); 7034 7035 ztest_walk_pool_directory("pools after export"); 7036 7037 /* 7038 * Try to import it. 7039 */ 7040 newconfig = spa_tryimport(config); 7041 ASSERT3P(newconfig, !=, NULL); 7042 fnvlist_free(newconfig); 7043 7044 /* 7045 * Import it under the new name. 7046 */ 7047 error = spa_import(newname, config, NULL, 0); 7048 if (error != 0) { 7049 dump_nvlist(config, 0); 7050 fatal(B_FALSE, "couldn't import pool %s as %s: error %u", 7051 oldname, newname, error); 7052 } 7053 7054 ztest_walk_pool_directory("pools after import"); 7055 7056 /* 7057 * Try to import it again -- should fail with EEXIST. 7058 */ 7059 VERIFY3U(EEXIST, ==, spa_import(newname, config, NULL, 0)); 7060 7061 /* 7062 * Try to import it under a different name -- should fail with EEXIST. 7063 */ 7064 VERIFY3U(EEXIST, ==, spa_import(oldname, config, NULL, 0)); 7065 7066 /* 7067 * Verify that the pool is no longer visible under the old name. 
7068 */ 7069 VERIFY3U(ENOENT, ==, spa_open(oldname, &spa, FTAG)); 7070 7071 /* 7072 * Verify that we can open and close the pool using the new name. 7073 */ 7074 VERIFY0(spa_open(newname, &spa, FTAG)); 7075 ASSERT3U(pool_guid, ==, spa_guid(spa)); 7076 spa_close(spa, FTAG); 7077 7078 fnvlist_free(config); 7079 } 7080 7081 static void 7082 ztest_resume(spa_t *spa) 7083 { 7084 if (spa_suspended(spa) && ztest_opts.zo_verbose >= 6) 7085 (void) printf("resuming from suspended state\n"); 7086 spa_vdev_state_enter(spa, SCL_NONE); 7087 vdev_clear(spa, NULL); 7088 (void) spa_vdev_state_exit(spa, NULL, 0); 7089 (void) zio_resume(spa); 7090 } 7091 7092 static __attribute__((noreturn)) void 7093 ztest_resume_thread(void *arg) 7094 { 7095 spa_t *spa = arg; 7096 7097 while (!ztest_exiting) { 7098 if (spa_suspended(spa)) 7099 ztest_resume(spa); 7100 (void) poll(NULL, 0, 100); 7101 7102 /* 7103 * Periodically change the zfs_compressed_arc_enabled setting. 7104 */ 7105 if (ztest_random(10) == 0) 7106 zfs_compressed_arc_enabled = ztest_random(2); 7107 7108 /* 7109 * Periodically change the zfs_abd_scatter_enabled setting. 7110 */ 7111 if (ztest_random(10) == 0) 7112 zfs_abd_scatter_enabled = ztest_random(2); 7113 } 7114 7115 thread_exit(); 7116 } 7117 7118 static __attribute__((noreturn)) void 7119 ztest_deadman_thread(void *arg) 7120 { 7121 ztest_shared_t *zs = arg; 7122 spa_t *spa = ztest_spa; 7123 hrtime_t delay, overdue, last_run = gethrtime(); 7124 7125 delay = (zs->zs_thread_stop - zs->zs_thread_start) + 7126 MSEC2NSEC(zfs_deadman_synctime_ms); 7127 7128 while (!ztest_exiting) { 7129 /* 7130 * Wait for the delay timer while checking occasionally 7131 * if we should stop. 7132 */ 7133 if (gethrtime() < last_run + delay) { 7134 (void) poll(NULL, 0, 1000); 7135 continue; 7136 } 7137 7138 /* 7139 * If the pool is suspended then fail immediately. Otherwise, 7140 * check to see if the pool is making any progress. 
If 7141 * vdev_deadman() discovers that there hasn't been any recent 7142 * I/Os then it will end up aborting the tests. 7143 */ 7144 if (spa_suspended(spa) || spa->spa_root_vdev == NULL) { 7145 fatal(B_FALSE, 7146 "aborting test after %llu seconds because " 7147 "pool has transitioned to a suspended state.", 7148 (u_longlong_t)zfs_deadman_synctime_ms / 1000); 7149 } 7150 vdev_deadman(spa->spa_root_vdev, FTAG); 7151 7152 /* 7153 * If the process doesn't complete within a grace period of 7154 * zfs_deadman_synctime_ms over the expected finish time, 7155 * then it may be hung and is terminated. 7156 */ 7157 overdue = zs->zs_proc_stop + MSEC2NSEC(zfs_deadman_synctime_ms); 7158 if (gethrtime() > overdue) { 7159 fatal(B_FALSE, 7160 "aborting test after %llu seconds because " 7161 "the process is overdue for termination.", 7162 (gethrtime() - zs->zs_proc_start) / NANOSEC); 7163 } 7164 7165 (void) printf("ztest has been running for %lld seconds\n", 7166 (gethrtime() - zs->zs_proc_start) / NANOSEC); 7167 7168 last_run = gethrtime(); 7169 delay = MSEC2NSEC(zfs_deadman_checktime_ms); 7170 } 7171 7172 thread_exit(); 7173 } 7174 7175 static void 7176 ztest_execute(int test, ztest_info_t *zi, uint64_t id) 7177 { 7178 ztest_ds_t *zd = &ztest_ds[id % ztest_opts.zo_datasets]; 7179 ztest_shared_callstate_t *zc = ZTEST_GET_SHARED_CALLSTATE(test); 7180 hrtime_t functime = gethrtime(); 7181 int i; 7182 7183 for (i = 0; i < zi->zi_iters; i++) 7184 zi->zi_func(zd, id); 7185 7186 functime = gethrtime() - functime; 7187 7188 atomic_add_64(&zc->zc_count, 1); 7189 atomic_add_64(&zc->zc_time, functime); 7190 7191 if (ztest_opts.zo_verbose >= 4) 7192 (void) printf("%6.2f sec in %s\n", 7193 (double)functime / NANOSEC, zi->zi_funcname); 7194 } 7195 7196 static __attribute__((noreturn)) void 7197 ztest_thread(void *arg) 7198 { 7199 int rand; 7200 uint64_t id = (uintptr_t)arg; 7201 ztest_shared_t *zs = ztest_shared; 7202 uint64_t call_next; 7203 hrtime_t now; 7204 ztest_info_t *zi; 7205 
ztest_shared_callstate_t *zc; 7206 7207 while ((now = gethrtime()) < zs->zs_thread_stop) { 7208 /* 7209 * See if it's time to force a crash. 7210 */ 7211 if (now > zs->zs_thread_kill) 7212 ztest_kill(zs); 7213 7214 /* 7215 * If we're getting ENOSPC with some regularity, stop. 7216 */ 7217 if (zs->zs_enospc_count > 10) 7218 break; 7219 7220 /* 7221 * Pick a random function to execute. 7222 */ 7223 rand = ztest_random(ZTEST_FUNCS); 7224 zi = &ztest_info[rand]; 7225 zc = ZTEST_GET_SHARED_CALLSTATE(rand); 7226 call_next = zc->zc_next; 7227 7228 if (now >= call_next && 7229 atomic_cas_64(&zc->zc_next, call_next, call_next + 7230 ztest_random(2 * zi->zi_interval[0] + 1)) == call_next) { 7231 ztest_execute(rand, zi, id); 7232 } 7233 } 7234 7235 thread_exit(); 7236 } 7237 7238 static void 7239 ztest_dataset_name(char *dsname, const char *pool, int d) 7240 { 7241 (void) snprintf(dsname, ZFS_MAX_DATASET_NAME_LEN, "%s/ds_%d", pool, d); 7242 } 7243 7244 static void 7245 ztest_dataset_destroy(int d) 7246 { 7247 char name[ZFS_MAX_DATASET_NAME_LEN]; 7248 int t; 7249 7250 ztest_dataset_name(name, ztest_opts.zo_pool, d); 7251 7252 if (ztest_opts.zo_verbose >= 3) 7253 (void) printf("Destroying %s to free up space\n", name); 7254 7255 /* 7256 * Cleanup any non-standard clones and snapshots. In general, 7257 * ztest thread t operates on dataset (t % zopt_datasets), 7258 * so there may be more than one thing to clean up. 7259 */ 7260 for (t = d; t < ztest_opts.zo_threads; 7261 t += ztest_opts.zo_datasets) 7262 ztest_dsl_dataset_cleanup(name, t); 7263 7264 (void) dmu_objset_find(name, ztest_objset_destroy_cb, NULL, 7265 DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN); 7266 } 7267 7268 static void 7269 ztest_dataset_dirobj_verify(ztest_ds_t *zd) 7270 { 7271 uint64_t usedobjs, dirobjs, scratch; 7272 7273 /* 7274 * ZTEST_DIROBJ is the object directory for the entire dataset. 
7275 * Therefore, the number of objects in use should equal the 7276 * number of ZTEST_DIROBJ entries, +1 for ZTEST_DIROBJ itself. 7277 * If not, we have an object leak. 7278 * 7279 * Note that we can only check this in ztest_dataset_open(), 7280 * when the open-context and syncing-context values agree. 7281 * That's because zap_count() returns the open-context value, 7282 * while dmu_objset_space() returns the rootbp fill count. 7283 */ 7284 VERIFY0(zap_count(zd->zd_os, ZTEST_DIROBJ, &dirobjs)); 7285 dmu_objset_space(zd->zd_os, &scratch, &scratch, &usedobjs, &scratch); 7286 ASSERT3U(dirobjs + 1, ==, usedobjs); 7287 } 7288 7289 static int 7290 ztest_dataset_open(int d) 7291 { 7292 ztest_ds_t *zd = &ztest_ds[d]; 7293 uint64_t committed_seq = ZTEST_GET_SHARED_DS(d)->zd_seq; 7294 objset_t *os; 7295 zilog_t *zilog; 7296 char name[ZFS_MAX_DATASET_NAME_LEN]; 7297 int error; 7298 7299 ztest_dataset_name(name, ztest_opts.zo_pool, d); 7300 7301 (void) pthread_rwlock_rdlock(&ztest_name_lock); 7302 7303 error = ztest_dataset_create(name); 7304 if (error == ENOSPC) { 7305 (void) pthread_rwlock_unlock(&ztest_name_lock); 7306 ztest_record_enospc(FTAG); 7307 return (error); 7308 } 7309 ASSERT(error == 0 || error == EEXIST); 7310 7311 VERIFY0(ztest_dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, 7312 B_TRUE, zd, &os)); 7313 (void) pthread_rwlock_unlock(&ztest_name_lock); 7314 7315 ztest_zd_init(zd, ZTEST_GET_SHARED_DS(d), os); 7316 7317 zilog = zd->zd_zilog; 7318 7319 if (zilog->zl_header->zh_claim_lr_seq != 0 && 7320 zilog->zl_header->zh_claim_lr_seq < committed_seq) 7321 fatal(B_FALSE, "missing log records: " 7322 "claimed %"PRIu64" < committed %"PRIu64"", 7323 zilog->zl_header->zh_claim_lr_seq, committed_seq); 7324 7325 ztest_dataset_dirobj_verify(zd); 7326 7327 zil_replay(os, zd, ztest_replay_vector); 7328 7329 ztest_dataset_dirobj_verify(zd); 7330 7331 if (ztest_opts.zo_verbose >= 6) 7332 (void) printf("%s replay %"PRIu64" blocks, " 7333 "%"PRIu64" records, seq %"PRIu64"\n", 
7334 zd->zd_name, 7335 zilog->zl_parse_blk_count, 7336 zilog->zl_parse_lr_count, 7337 zilog->zl_replaying_seq); 7338 7339 zilog = zil_open(os, ztest_get_data, NULL); 7340 7341 if (zilog->zl_replaying_seq != 0 && 7342 zilog->zl_replaying_seq < committed_seq) 7343 fatal(B_FALSE, "missing log records: " 7344 "replayed %"PRIu64" < committed %"PRIu64"", 7345 zilog->zl_replaying_seq, committed_seq); 7346 7347 return (0); 7348 } 7349 7350 static void 7351 ztest_dataset_close(int d) 7352 { 7353 ztest_ds_t *zd = &ztest_ds[d]; 7354 7355 zil_close(zd->zd_zilog); 7356 dmu_objset_disown(zd->zd_os, B_TRUE, zd); 7357 7358 ztest_zd_fini(zd); 7359 } 7360 7361 static int 7362 ztest_replay_zil_cb(const char *name, void *arg) 7363 { 7364 (void) arg; 7365 objset_t *os; 7366 ztest_ds_t *zdtmp; 7367 7368 VERIFY0(ztest_dmu_objset_own(name, DMU_OST_ANY, B_TRUE, 7369 B_TRUE, FTAG, &os)); 7370 7371 zdtmp = umem_alloc(sizeof (ztest_ds_t), UMEM_NOFAIL); 7372 7373 ztest_zd_init(zdtmp, NULL, os); 7374 zil_replay(os, zdtmp, ztest_replay_vector); 7375 ztest_zd_fini(zdtmp); 7376 7377 if (dmu_objset_zil(os)->zl_parse_lr_count != 0 && 7378 ztest_opts.zo_verbose >= 6) { 7379 zilog_t *zilog = dmu_objset_zil(os); 7380 7381 (void) printf("%s replay %"PRIu64" blocks, " 7382 "%"PRIu64" records, seq %"PRIu64"\n", 7383 name, 7384 zilog->zl_parse_blk_count, 7385 zilog->zl_parse_lr_count, 7386 zilog->zl_replaying_seq); 7387 } 7388 7389 umem_free(zdtmp, sizeof (ztest_ds_t)); 7390 7391 dmu_objset_disown(os, B_TRUE, FTAG); 7392 return (0); 7393 } 7394 7395 static void 7396 ztest_freeze(void) 7397 { 7398 ztest_ds_t *zd = &ztest_ds[0]; 7399 spa_t *spa; 7400 int numloops = 0; 7401 7402 if (ztest_opts.zo_verbose >= 3) 7403 (void) printf("testing spa_freeze()...\n"); 7404 7405 kernel_init(SPA_MODE_READ | SPA_MODE_WRITE); 7406 VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG)); 7407 VERIFY0(ztest_dataset_open(0)); 7408 ztest_spa = spa; 7409 7410 /* 7411 * Force the first log block to be transactionally allocated. 
7412 * We have to do this before we freeze the pool -- otherwise 7413 * the log chain won't be anchored. 7414 */ 7415 while (BP_IS_HOLE(&zd->zd_zilog->zl_header->zh_log)) { 7416 ztest_dmu_object_alloc_free(zd, 0); 7417 zil_commit(zd->zd_zilog, 0); 7418 } 7419 7420 txg_wait_synced(spa_get_dsl(spa), 0); 7421 7422 /* 7423 * Freeze the pool. This stops spa_sync() from doing anything, 7424 * so that the only way to record changes from now on is the ZIL. 7425 */ 7426 spa_freeze(spa); 7427 7428 /* 7429 * Because it is hard to predict how much space a write will actually 7430 * require beforehand, we leave ourselves some fudge space to write over 7431 * capacity. 7432 */ 7433 uint64_t capacity = metaslab_class_get_space(spa_normal_class(spa)) / 2; 7434 7435 /* 7436 * Run tests that generate log records but don't alter the pool config 7437 * or depend on DSL sync tasks (snapshots, objset create/destroy, etc). 7438 * We do a txg_wait_synced() after each iteration to force the txg 7439 * to increase well beyond the last synced value in the uberblock. 7440 * The ZIL should be OK with that. 7441 * 7442 * Run a random number of times less than zo_maxloops and ensure we do 7443 * not run out of space on the pool. 7444 */ 7445 while (ztest_random(10) != 0 && 7446 numloops++ < ztest_opts.zo_maxloops && 7447 metaslab_class_get_alloc(spa_normal_class(spa)) < capacity) { 7448 ztest_od_t od; 7449 ztest_od_init(&od, 0, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0, 0); 7450 VERIFY0(ztest_object_init(zd, &od, sizeof (od), B_FALSE)); 7451 ztest_io(zd, od.od_object, 7452 ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT); 7453 txg_wait_synced(spa_get_dsl(spa), 0); 7454 } 7455 7456 /* 7457 * Commit all of the changes we just generated. 7458 */ 7459 zil_commit(zd->zd_zilog, 0); 7460 txg_wait_synced(spa_get_dsl(spa), 0); 7461 7462 /* 7463 * Close our dataset and close the pool. 
7464 */ 7465 ztest_dataset_close(0); 7466 spa_close(spa, FTAG); 7467 kernel_fini(); 7468 7469 /* 7470 * Open and close the pool and dataset to induce log replay. 7471 */ 7472 kernel_init(SPA_MODE_READ | SPA_MODE_WRITE); 7473 VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG)); 7474 ASSERT3U(spa_freeze_txg(spa), ==, UINT64_MAX); 7475 VERIFY0(ztest_dataset_open(0)); 7476 ztest_spa = spa; 7477 txg_wait_synced(spa_get_dsl(spa), 0); 7478 ztest_dataset_close(0); 7479 ztest_reguid(NULL, 0); 7480 7481 spa_close(spa, FTAG); 7482 kernel_fini(); 7483 } 7484 7485 static void 7486 ztest_import_impl(void) 7487 { 7488 importargs_t args = { 0 }; 7489 nvlist_t *cfg = NULL; 7490 int nsearch = 1; 7491 char *searchdirs[nsearch]; 7492 int flags = ZFS_IMPORT_MISSING_LOG; 7493 7494 searchdirs[0] = ztest_opts.zo_dir; 7495 args.paths = nsearch; 7496 args.path = searchdirs; 7497 args.can_be_active = B_FALSE; 7498 7499 libpc_handle_t lpch = { 7500 .lpc_lib_handle = NULL, 7501 .lpc_ops = &libzpool_config_ops, 7502 .lpc_printerr = B_TRUE 7503 }; 7504 VERIFY0(zpool_find_config(&lpch, ztest_opts.zo_pool, &cfg, &args)); 7505 VERIFY0(spa_import(ztest_opts.zo_pool, cfg, NULL, flags)); 7506 fnvlist_free(cfg); 7507 } 7508 7509 /* 7510 * Import a storage pool with the given name. 
7511 */ 7512 static void 7513 ztest_import(ztest_shared_t *zs) 7514 { 7515 spa_t *spa; 7516 7517 mutex_init(&ztest_vdev_lock, NULL, MUTEX_DEFAULT, NULL); 7518 mutex_init(&ztest_checkpoint_lock, NULL, MUTEX_DEFAULT, NULL); 7519 VERIFY0(pthread_rwlock_init(&ztest_name_lock, NULL)); 7520 7521 kernel_init(SPA_MODE_READ | SPA_MODE_WRITE); 7522 7523 ztest_import_impl(); 7524 7525 VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG)); 7526 zs->zs_metaslab_sz = 7527 1ULL << spa->spa_root_vdev->vdev_child[0]->vdev_ms_shift; 7528 spa_close(spa, FTAG); 7529 7530 kernel_fini(); 7531 7532 if (!ztest_opts.zo_mmp_test) { 7533 ztest_run_zdb(ztest_opts.zo_pool); 7534 ztest_freeze(); 7535 ztest_run_zdb(ztest_opts.zo_pool); 7536 } 7537 7538 (void) pthread_rwlock_destroy(&ztest_name_lock); 7539 mutex_destroy(&ztest_vdev_lock); 7540 mutex_destroy(&ztest_checkpoint_lock); 7541 } 7542 7543 /* 7544 * Kick off threads to run tests on all datasets in parallel. 7545 */ 7546 static void 7547 ztest_run(ztest_shared_t *zs) 7548 { 7549 spa_t *spa; 7550 objset_t *os; 7551 kthread_t *resume_thread, *deadman_thread; 7552 kthread_t **run_threads; 7553 uint64_t object; 7554 int error; 7555 int t, d; 7556 7557 ztest_exiting = B_FALSE; 7558 7559 /* 7560 * Initialize parent/child shared state. 
7561 */ 7562 mutex_init(&ztest_vdev_lock, NULL, MUTEX_DEFAULT, NULL); 7563 mutex_init(&ztest_checkpoint_lock, NULL, MUTEX_DEFAULT, NULL); 7564 VERIFY0(pthread_rwlock_init(&ztest_name_lock, NULL)); 7565 7566 zs->zs_thread_start = gethrtime(); 7567 zs->zs_thread_stop = 7568 zs->zs_thread_start + ztest_opts.zo_passtime * NANOSEC; 7569 zs->zs_thread_stop = MIN(zs->zs_thread_stop, zs->zs_proc_stop); 7570 zs->zs_thread_kill = zs->zs_thread_stop; 7571 if (ztest_random(100) < ztest_opts.zo_killrate) { 7572 zs->zs_thread_kill -= 7573 ztest_random(ztest_opts.zo_passtime * NANOSEC); 7574 } 7575 7576 mutex_init(&zcl.zcl_callbacks_lock, NULL, MUTEX_DEFAULT, NULL); 7577 7578 list_create(&zcl.zcl_callbacks, sizeof (ztest_cb_data_t), 7579 offsetof(ztest_cb_data_t, zcd_node)); 7580 7581 /* 7582 * Open our pool. It may need to be imported first depending on 7583 * what tests were running when the previous pass was terminated. 7584 */ 7585 kernel_init(SPA_MODE_READ | SPA_MODE_WRITE); 7586 error = spa_open(ztest_opts.zo_pool, &spa, FTAG); 7587 if (error) { 7588 VERIFY3S(error, ==, ENOENT); 7589 ztest_import_impl(); 7590 VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG)); 7591 zs->zs_metaslab_sz = 7592 1ULL << spa->spa_root_vdev->vdev_child[0]->vdev_ms_shift; 7593 } 7594 7595 metaslab_preload_limit = ztest_random(20) + 1; 7596 ztest_spa = spa; 7597 7598 VERIFY0(vdev_raidz_impl_set("cycle")); 7599 7600 dmu_objset_stats_t dds; 7601 VERIFY0(ztest_dmu_objset_own(ztest_opts.zo_pool, 7602 DMU_OST_ANY, B_TRUE, B_TRUE, FTAG, &os)); 7603 dsl_pool_config_enter(dmu_objset_pool(os), FTAG); 7604 dmu_objset_fast_stat(os, &dds); 7605 dsl_pool_config_exit(dmu_objset_pool(os), FTAG); 7606 zs->zs_guid = dds.dds_guid; 7607 dmu_objset_disown(os, B_TRUE, FTAG); 7608 7609 /* 7610 * Create a thread to periodically resume suspended I/O. 
7611 */ 7612 resume_thread = thread_create(NULL, 0, ztest_resume_thread, 7613 spa, 0, NULL, TS_RUN | TS_JOINABLE, defclsyspri); 7614 7615 /* 7616 * Create a deadman thread and set to panic if we hang. 7617 */ 7618 deadman_thread = thread_create(NULL, 0, ztest_deadman_thread, 7619 zs, 0, NULL, TS_RUN | TS_JOINABLE, defclsyspri); 7620 7621 spa->spa_deadman_failmode = ZIO_FAILURE_MODE_PANIC; 7622 7623 /* 7624 * Verify that we can safely inquire about any object, 7625 * whether it's allocated or not. To make it interesting, 7626 * we probe a 5-wide window around each power of two. 7627 * This hits all edge cases, including zero and the max. 7628 */ 7629 for (t = 0; t < 64; t++) { 7630 for (d = -5; d <= 5; d++) { 7631 error = dmu_object_info(spa->spa_meta_objset, 7632 (1ULL << t) + d, NULL); 7633 ASSERT(error == 0 || error == ENOENT || 7634 error == EINVAL); 7635 } 7636 } 7637 7638 /* 7639 * If we got any ENOSPC errors on the previous run, destroy something. 7640 */ 7641 if (zs->zs_enospc_count != 0) { 7642 int d = ztest_random(ztest_opts.zo_datasets); 7643 ztest_dataset_destroy(d); 7644 } 7645 zs->zs_enospc_count = 0; 7646 7647 /* 7648 * If we were in the middle of ztest_device_removal() and were killed 7649 * we need to ensure the removal and scrub complete before running 7650 * any tests that check ztest_device_removal_active. The removal will 7651 * be restarted automatically when the spa is opened, but we need to 7652 * initiate the scrub manually if it is not already in progress. Note 7653 * that we always run the scrub whenever an indirect vdev exists 7654 * because we have no way of knowing for sure if ztest_device_removal() 7655 * fully completed its scrub before the pool was reimported. 
7656 */ 7657 if (spa->spa_removing_phys.sr_state == DSS_SCANNING || 7658 spa->spa_removing_phys.sr_prev_indirect_vdev != -1) { 7659 while (spa->spa_removing_phys.sr_state == DSS_SCANNING) 7660 txg_wait_synced(spa_get_dsl(spa), 0); 7661 7662 error = ztest_scrub_impl(spa); 7663 if (error == EBUSY) 7664 error = 0; 7665 ASSERT0(error); 7666 } 7667 7668 run_threads = umem_zalloc(ztest_opts.zo_threads * sizeof (kthread_t *), 7669 UMEM_NOFAIL); 7670 7671 if (ztest_opts.zo_verbose >= 4) 7672 (void) printf("starting main threads...\n"); 7673 7674 /* 7675 * Replay all logs of all datasets in the pool. This is primarily for 7676 * temporary datasets which wouldn't otherwise get replayed, which 7677 * can trigger failures when attempting to offline a SLOG in 7678 * ztest_fault_inject(). 7679 */ 7680 (void) dmu_objset_find(ztest_opts.zo_pool, ztest_replay_zil_cb, 7681 NULL, DS_FIND_CHILDREN); 7682 7683 /* 7684 * Kick off all the tests that run in parallel. 7685 */ 7686 for (t = 0; t < ztest_opts.zo_threads; t++) { 7687 if (t < ztest_opts.zo_datasets && ztest_dataset_open(t) != 0) { 7688 umem_free(run_threads, ztest_opts.zo_threads * 7689 sizeof (kthread_t *)); 7690 return; 7691 } 7692 7693 run_threads[t] = thread_create(NULL, 0, ztest_thread, 7694 (void *)(uintptr_t)t, 0, NULL, TS_RUN | TS_JOINABLE, 7695 defclsyspri); 7696 } 7697 7698 /* 7699 * Wait for all of the tests to complete. 7700 */ 7701 for (t = 0; t < ztest_opts.zo_threads; t++) 7702 VERIFY0(thread_join(run_threads[t])); 7703 7704 /* 7705 * Close all datasets. This must be done after all the threads 7706 * are joined so we can be sure none of the datasets are in-use 7707 * by any of the threads. 
7708 */ 7709 for (t = 0; t < ztest_opts.zo_threads; t++) { 7710 if (t < ztest_opts.zo_datasets) 7711 ztest_dataset_close(t); 7712 } 7713 7714 txg_wait_synced(spa_get_dsl(spa), 0); 7715 7716 zs->zs_alloc = metaslab_class_get_alloc(spa_normal_class(spa)); 7717 zs->zs_space = metaslab_class_get_space(spa_normal_class(spa)); 7718 7719 umem_free(run_threads, ztest_opts.zo_threads * sizeof (kthread_t *)); 7720 7721 /* Kill the resume and deadman threads */ 7722 ztest_exiting = B_TRUE; 7723 VERIFY0(thread_join(resume_thread)); 7724 VERIFY0(thread_join(deadman_thread)); 7725 ztest_resume(spa); 7726 7727 /* 7728 * Right before closing the pool, kick off a bunch of async I/O; 7729 * spa_close() should wait for it to complete. 7730 */ 7731 for (object = 1; object < 50; object++) { 7732 dmu_prefetch(spa->spa_meta_objset, object, 0, 0, 1ULL << 20, 7733 ZIO_PRIORITY_SYNC_READ); 7734 } 7735 7736 /* Verify that at least one commit cb was called in a timely fashion */ 7737 if (zc_cb_counter >= ZTEST_COMMIT_CB_MIN_REG) 7738 VERIFY0(zc_min_txg_delay); 7739 7740 spa_close(spa, FTAG); 7741 7742 /* 7743 * Verify that we can loop over all pools. 7744 */ 7745 mutex_enter(&spa_namespace_lock); 7746 for (spa = spa_next(NULL); spa != NULL; spa = spa_next(spa)) 7747 if (ztest_opts.zo_verbose > 3) 7748 (void) printf("spa_next: found %s\n", spa_name(spa)); 7749 mutex_exit(&spa_namespace_lock); 7750 7751 /* 7752 * Verify that we can export the pool and reimport it under a 7753 * different name. 
7754 */ 7755 if ((ztest_random(2) == 0) && !ztest_opts.zo_mmp_test) { 7756 char name[ZFS_MAX_DATASET_NAME_LEN]; 7757 (void) snprintf(name, sizeof (name), "%s_import", 7758 ztest_opts.zo_pool); 7759 ztest_spa_import_export(ztest_opts.zo_pool, name); 7760 ztest_spa_import_export(name, ztest_opts.zo_pool); 7761 } 7762 7763 kernel_fini(); 7764 7765 list_destroy(&zcl.zcl_callbacks); 7766 mutex_destroy(&zcl.zcl_callbacks_lock); 7767 (void) pthread_rwlock_destroy(&ztest_name_lock); 7768 mutex_destroy(&ztest_vdev_lock); 7769 mutex_destroy(&ztest_checkpoint_lock); 7770 } 7771 7772 static void 7773 print_time(hrtime_t t, char *timebuf) 7774 { 7775 hrtime_t s = t / NANOSEC; 7776 hrtime_t m = s / 60; 7777 hrtime_t h = m / 60; 7778 hrtime_t d = h / 24; 7779 7780 s -= m * 60; 7781 m -= h * 60; 7782 h -= d * 24; 7783 7784 timebuf[0] = '\0'; 7785 7786 if (d) 7787 (void) sprintf(timebuf, 7788 "%llud%02lluh%02llum%02llus", d, h, m, s); 7789 else if (h) 7790 (void) sprintf(timebuf, "%lluh%02llum%02llus", h, m, s); 7791 else if (m) 7792 (void) sprintf(timebuf, "%llum%02llus", m, s); 7793 else 7794 (void) sprintf(timebuf, "%llus", s); 7795 } 7796 7797 static nvlist_t * 7798 make_random_props(void) 7799 { 7800 nvlist_t *props; 7801 7802 props = fnvlist_alloc(); 7803 7804 if (ztest_random(2) == 0) 7805 return (props); 7806 7807 fnvlist_add_uint64(props, 7808 zpool_prop_to_name(ZPOOL_PROP_AUTOREPLACE), 1); 7809 7810 return (props); 7811 } 7812 7813 /* 7814 * Create a storage pool with the given name and initial vdev size. 7815 * Then test spa_freeze() functionality. 7816 */ 7817 static void 7818 ztest_init(ztest_shared_t *zs) 7819 { 7820 spa_t *spa; 7821 nvlist_t *nvroot, *props; 7822 int i; 7823 7824 mutex_init(&ztest_vdev_lock, NULL, MUTEX_DEFAULT, NULL); 7825 mutex_init(&ztest_checkpoint_lock, NULL, MUTEX_DEFAULT, NULL); 7826 VERIFY0(pthread_rwlock_init(&ztest_name_lock, NULL)); 7827 7828 kernel_init(SPA_MODE_READ | SPA_MODE_WRITE); 7829 7830 /* 7831 * Create the storage pool. 
7832 */ 7833 (void) spa_destroy(ztest_opts.zo_pool); 7834 ztest_shared->zs_vdev_next_leaf = 0; 7835 zs->zs_splits = 0; 7836 zs->zs_mirrors = ztest_opts.zo_mirrors; 7837 nvroot = make_vdev_root(NULL, NULL, NULL, ztest_opts.zo_vdev_size, 0, 7838 NULL, ztest_opts.zo_raid_children, zs->zs_mirrors, 1); 7839 props = make_random_props(); 7840 7841 /* 7842 * We don't expect the pool to suspend unless maxfaults == 0, 7843 * in which case ztest_fault_inject() temporarily takes away 7844 * the only valid replica. 7845 */ 7846 fnvlist_add_uint64(props, 7847 zpool_prop_to_name(ZPOOL_PROP_FAILUREMODE), 7848 MAXFAULTS(zs) ? ZIO_FAILURE_MODE_PANIC : ZIO_FAILURE_MODE_WAIT); 7849 7850 for (i = 0; i < SPA_FEATURES; i++) { 7851 char *buf; 7852 7853 if (!spa_feature_table[i].fi_zfs_mod_supported) 7854 continue; 7855 7856 /* 7857 * 75% chance of using the log space map feature. We want ztest 7858 * to exercise both the code paths that use the log space map 7859 * feature and the ones that don't. 7860 */ 7861 if (i == SPA_FEATURE_LOG_SPACEMAP && ztest_random(4) == 0) 7862 continue; 7863 7864 VERIFY3S(-1, !=, asprintf(&buf, "feature@%s", 7865 spa_feature_table[i].fi_uname)); 7866 fnvlist_add_uint64(props, buf, 0); 7867 free(buf); 7868 } 7869 7870 VERIFY0(spa_create(ztest_opts.zo_pool, nvroot, props, NULL, NULL)); 7871 fnvlist_free(nvroot); 7872 fnvlist_free(props); 7873 7874 VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG)); 7875 zs->zs_metaslab_sz = 7876 1ULL << spa->spa_root_vdev->vdev_child[0]->vdev_ms_shift; 7877 spa_close(spa, FTAG); 7878 7879 kernel_fini(); 7880 7881 if (!ztest_opts.zo_mmp_test) { 7882 ztest_run_zdb(ztest_opts.zo_pool); 7883 ztest_freeze(); 7884 ztest_run_zdb(ztest_opts.zo_pool); 7885 } 7886 7887 (void) pthread_rwlock_destroy(&ztest_name_lock); 7888 mutex_destroy(&ztest_vdev_lock); 7889 mutex_destroy(&ztest_checkpoint_lock); 7890 } 7891 7892 static void 7893 setup_data_fd(void) 7894 { 7895 static char ztest_name_data[] = "/tmp/ztest.data.XXXXXX"; 7896 7897 
ztest_fd_data = mkstemp(ztest_name_data); 7898 ASSERT3S(ztest_fd_data, >=, 0); 7899 (void) unlink(ztest_name_data); 7900 } 7901 7902 static int 7903 shared_data_size(ztest_shared_hdr_t *hdr) 7904 { 7905 int size; 7906 7907 size = hdr->zh_hdr_size; 7908 size += hdr->zh_opts_size; 7909 size += hdr->zh_size; 7910 size += hdr->zh_stats_size * hdr->zh_stats_count; 7911 size += hdr->zh_ds_size * hdr->zh_ds_count; 7912 7913 return (size); 7914 } 7915 7916 static void 7917 setup_hdr(void) 7918 { 7919 int size; 7920 ztest_shared_hdr_t *hdr; 7921 7922 hdr = (void *)mmap(0, P2ROUNDUP(sizeof (*hdr), getpagesize()), 7923 PROT_READ | PROT_WRITE, MAP_SHARED, ztest_fd_data, 0); 7924 ASSERT3P(hdr, !=, MAP_FAILED); 7925 7926 VERIFY0(ftruncate(ztest_fd_data, sizeof (ztest_shared_hdr_t))); 7927 7928 hdr->zh_hdr_size = sizeof (ztest_shared_hdr_t); 7929 hdr->zh_opts_size = sizeof (ztest_shared_opts_t); 7930 hdr->zh_size = sizeof (ztest_shared_t); 7931 hdr->zh_stats_size = sizeof (ztest_shared_callstate_t); 7932 hdr->zh_stats_count = ZTEST_FUNCS; 7933 hdr->zh_ds_size = sizeof (ztest_shared_ds_t); 7934 hdr->zh_ds_count = ztest_opts.zo_datasets; 7935 7936 size = shared_data_size(hdr); 7937 VERIFY0(ftruncate(ztest_fd_data, size)); 7938 7939 (void) munmap((caddr_t)hdr, P2ROUNDUP(sizeof (*hdr), getpagesize())); 7940 } 7941 7942 static void 7943 setup_data(void) 7944 { 7945 int size, offset; 7946 ztest_shared_hdr_t *hdr; 7947 uint8_t *buf; 7948 7949 hdr = (void *)mmap(0, P2ROUNDUP(sizeof (*hdr), getpagesize()), 7950 PROT_READ, MAP_SHARED, ztest_fd_data, 0); 7951 ASSERT3P(hdr, !=, MAP_FAILED); 7952 7953 size = shared_data_size(hdr); 7954 7955 (void) munmap((caddr_t)hdr, P2ROUNDUP(sizeof (*hdr), getpagesize())); 7956 hdr = ztest_shared_hdr = (void *)mmap(0, P2ROUNDUP(size, getpagesize()), 7957 PROT_READ | PROT_WRITE, MAP_SHARED, ztest_fd_data, 0); 7958 ASSERT3P(hdr, !=, MAP_FAILED); 7959 buf = (uint8_t *)hdr; 7960 7961 offset = hdr->zh_hdr_size; 7962 ztest_shared_opts = (void *)&buf[offset]; 
7963 offset += hdr->zh_opts_size; 7964 ztest_shared = (void *)&buf[offset]; 7965 offset += hdr->zh_size; 7966 ztest_shared_callstate = (void *)&buf[offset]; 7967 offset += hdr->zh_stats_size * hdr->zh_stats_count; 7968 ztest_shared_ds = (void *)&buf[offset]; 7969 } 7970 7971 static boolean_t 7972 exec_child(char *cmd, char *libpath, boolean_t ignorekill, int *statusp) 7973 { 7974 pid_t pid; 7975 int status; 7976 char *cmdbuf = NULL; 7977 7978 pid = fork(); 7979 7980 if (cmd == NULL) { 7981 cmdbuf = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); 7982 (void) strlcpy(cmdbuf, getexecname(), MAXPATHLEN); 7983 cmd = cmdbuf; 7984 } 7985 7986 if (pid == -1) 7987 fatal(B_TRUE, "fork failed"); 7988 7989 if (pid == 0) { /* child */ 7990 char fd_data_str[12]; 7991 7992 VERIFY3S(11, >=, 7993 snprintf(fd_data_str, 12, "%d", ztest_fd_data)); 7994 VERIFY0(setenv("ZTEST_FD_DATA", fd_data_str, 1)); 7995 7996 if (libpath != NULL) { 7997 const char *curlp = getenv("LD_LIBRARY_PATH"); 7998 if (curlp == NULL) 7999 VERIFY0(setenv("LD_LIBRARY_PATH", libpath, 1)); 8000 else { 8001 char *newlp = NULL; 8002 VERIFY3S(-1, !=, 8003 asprintf(&newlp, "%s:%s", libpath, curlp)); 8004 VERIFY0(setenv("LD_LIBRARY_PATH", newlp, 1)); 8005 free(newlp); 8006 } 8007 } 8008 (void) execl(cmd, cmd, (char *)NULL); 8009 ztest_dump_core = B_FALSE; 8010 fatal(B_TRUE, "exec failed: %s", cmd); 8011 } 8012 8013 if (cmdbuf != NULL) { 8014 umem_free(cmdbuf, MAXPATHLEN); 8015 cmd = NULL; 8016 } 8017 8018 while (waitpid(pid, &status, 0) != pid) 8019 continue; 8020 if (statusp != NULL) 8021 *statusp = status; 8022 8023 if (WIFEXITED(status)) { 8024 if (WEXITSTATUS(status) != 0) { 8025 (void) fprintf(stderr, "child exited with code %d\n", 8026 WEXITSTATUS(status)); 8027 exit(2); 8028 } 8029 return (B_FALSE); 8030 } else if (WIFSIGNALED(status)) { 8031 if (!ignorekill || WTERMSIG(status) != SIGKILL) { 8032 (void) fprintf(stderr, "child died with signal %d\n", 8033 WTERMSIG(status)); 8034 exit(3); 8035 } 8036 return (B_TRUE); 8037 } 
else { 8038 (void) fprintf(stderr, "something strange happened to child\n"); 8039 exit(4); 8040 } 8041 } 8042 8043 static void 8044 ztest_run_init(void) 8045 { 8046 int i; 8047 8048 ztest_shared_t *zs = ztest_shared; 8049 8050 /* 8051 * Blow away any existing copy of zpool.cache 8052 */ 8053 (void) remove(spa_config_path); 8054 8055 if (ztest_opts.zo_init == 0) { 8056 if (ztest_opts.zo_verbose >= 1) 8057 (void) printf("Importing pool %s\n", 8058 ztest_opts.zo_pool); 8059 ztest_import(zs); 8060 return; 8061 } 8062 8063 /* 8064 * Create and initialize our storage pool. 8065 */ 8066 for (i = 1; i <= ztest_opts.zo_init; i++) { 8067 memset(zs, 0, sizeof (*zs)); 8068 if (ztest_opts.zo_verbose >= 3 && 8069 ztest_opts.zo_init != 1) { 8070 (void) printf("ztest_init(), pass %d\n", i); 8071 } 8072 ztest_init(zs); 8073 } 8074 } 8075 8076 int 8077 main(int argc, char **argv) 8078 { 8079 int kills = 0; 8080 int iters = 0; 8081 int older = 0; 8082 int newer = 0; 8083 ztest_shared_t *zs; 8084 ztest_info_t *zi; 8085 ztest_shared_callstate_t *zc; 8086 char timebuf[100]; 8087 char numbuf[NN_NUMBUF_SZ]; 8088 char *cmd; 8089 boolean_t hasalt; 8090 int f, err; 8091 char *fd_data_str = getenv("ZTEST_FD_DATA"); 8092 struct sigaction action; 8093 8094 (void) setvbuf(stdout, NULL, _IOLBF, 0); 8095 8096 dprintf_setup(&argc, argv); 8097 zfs_deadman_synctime_ms = 300000; 8098 zfs_deadman_checktime_ms = 30000; 8099 /* 8100 * As two-word space map entries may not come up often (especially 8101 * if pool and vdev sizes are small) we want to force at least some 8102 * of them so the feature get tested. 8103 */ 8104 zfs_force_some_double_word_sm_entries = B_TRUE; 8105 8106 /* 8107 * Verify that even extensively damaged split blocks with many 8108 * segments can be reconstructed in a reasonable amount of time 8109 * when reconstruction is known to be possible. 8110 * 8111 * Note: the lower this value is, the more damage we inflict, and 8112 * the more time ztest spends in recovering that damage. 
We chose 8113 * to induce damage 1/100th of the time so recovery is tested but 8114 * not so frequently that ztest doesn't get to test other code paths. 8115 */ 8116 zfs_reconstruct_indirect_damage_fraction = 100; 8117 8118 action.sa_handler = sig_handler; 8119 sigemptyset(&action.sa_mask); 8120 action.sa_flags = 0; 8121 8122 if (sigaction(SIGSEGV, &action, NULL) < 0) { 8123 (void) fprintf(stderr, "ztest: cannot catch SIGSEGV: %s.\n", 8124 strerror(errno)); 8125 exit(EXIT_FAILURE); 8126 } 8127 8128 if (sigaction(SIGABRT, &action, NULL) < 0) { 8129 (void) fprintf(stderr, "ztest: cannot catch SIGABRT: %s.\n", 8130 strerror(errno)); 8131 exit(EXIT_FAILURE); 8132 } 8133 8134 /* 8135 * Force random_get_bytes() to use /dev/urandom in order to prevent 8136 * ztest from needlessly depleting the system entropy pool. 8137 */ 8138 random_path = "/dev/urandom"; 8139 ztest_fd_rand = open(random_path, O_RDONLY | O_CLOEXEC); 8140 ASSERT3S(ztest_fd_rand, >=, 0); 8141 8142 if (!fd_data_str) { 8143 process_options(argc, argv); 8144 8145 setup_data_fd(); 8146 setup_hdr(); 8147 setup_data(); 8148 memcpy(ztest_shared_opts, &ztest_opts, 8149 sizeof (*ztest_shared_opts)); 8150 } else { 8151 ztest_fd_data = atoi(fd_data_str); 8152 setup_data(); 8153 memcpy(&ztest_opts, ztest_shared_opts, sizeof (ztest_opts)); 8154 } 8155 ASSERT3U(ztest_opts.zo_datasets, ==, ztest_shared_hdr->zh_ds_count); 8156 8157 err = ztest_set_global_vars(); 8158 if (err != 0 && !fd_data_str) { 8159 /* error message done by ztest_set_global_vars */ 8160 exit(EXIT_FAILURE); 8161 } else { 8162 /* children should not be spawned if setting gvars fails */ 8163 VERIFY3S(err, ==, 0); 8164 } 8165 8166 /* Override location of zpool.cache */ 8167 VERIFY3S(asprintf((char **)&spa_config_path, "%s/zpool.cache", 8168 ztest_opts.zo_dir), !=, -1); 8169 8170 ztest_ds = umem_alloc(ztest_opts.zo_datasets * sizeof (ztest_ds_t), 8171 UMEM_NOFAIL); 8172 zs = ztest_shared; 8173 8174 if (fd_data_str) { 8175 metaslab_force_ganging = 
ztest_opts.zo_metaslab_force_ganging; 8176 metaslab_df_alloc_threshold = 8177 zs->zs_metaslab_df_alloc_threshold; 8178 8179 if (zs->zs_do_init) 8180 ztest_run_init(); 8181 else 8182 ztest_run(zs); 8183 exit(0); 8184 } 8185 8186 hasalt = (strlen(ztest_opts.zo_alt_ztest) != 0); 8187 8188 if (ztest_opts.zo_verbose >= 1) { 8189 (void) printf("%"PRIu64" vdevs, %d datasets, %d threads," 8190 "%d %s disks, %"PRIu64" seconds...\n\n", 8191 ztest_opts.zo_vdevs, 8192 ztest_opts.zo_datasets, 8193 ztest_opts.zo_threads, 8194 ztest_opts.zo_raid_children, 8195 ztest_opts.zo_raid_type, 8196 ztest_opts.zo_time); 8197 } 8198 8199 cmd = umem_alloc(MAXNAMELEN, UMEM_NOFAIL); 8200 (void) strlcpy(cmd, getexecname(), MAXNAMELEN); 8201 8202 zs->zs_do_init = B_TRUE; 8203 if (strlen(ztest_opts.zo_alt_ztest) != 0) { 8204 if (ztest_opts.zo_verbose >= 1) { 8205 (void) printf("Executing older ztest for " 8206 "initialization: %s\n", ztest_opts.zo_alt_ztest); 8207 } 8208 VERIFY(!exec_child(ztest_opts.zo_alt_ztest, 8209 ztest_opts.zo_alt_libpath, B_FALSE, NULL)); 8210 } else { 8211 VERIFY(!exec_child(NULL, NULL, B_FALSE, NULL)); 8212 } 8213 zs->zs_do_init = B_FALSE; 8214 8215 zs->zs_proc_start = gethrtime(); 8216 zs->zs_proc_stop = zs->zs_proc_start + ztest_opts.zo_time * NANOSEC; 8217 8218 for (f = 0; f < ZTEST_FUNCS; f++) { 8219 zi = &ztest_info[f]; 8220 zc = ZTEST_GET_SHARED_CALLSTATE(f); 8221 if (zs->zs_proc_start + zi->zi_interval[0] > zs->zs_proc_stop) 8222 zc->zc_next = UINT64_MAX; 8223 else 8224 zc->zc_next = zs->zs_proc_start + 8225 ztest_random(2 * zi->zi_interval[0] + 1); 8226 } 8227 8228 /* 8229 * Run the tests in a loop. These tests include fault injection 8230 * to verify that self-healing data works, and forced crashes 8231 * to verify that we never lose on-disk consistency. 8232 */ 8233 while (gethrtime() < zs->zs_proc_stop) { 8234 int status; 8235 boolean_t killed; 8236 8237 /* 8238 * Initialize the workload counters for each function. 
8239 */ 8240 for (f = 0; f < ZTEST_FUNCS; f++) { 8241 zc = ZTEST_GET_SHARED_CALLSTATE(f); 8242 zc->zc_count = 0; 8243 zc->zc_time = 0; 8244 } 8245 8246 /* Set the allocation switch size */ 8247 zs->zs_metaslab_df_alloc_threshold = 8248 ztest_random(zs->zs_metaslab_sz / 4) + 1; 8249 8250 if (!hasalt || ztest_random(2) == 0) { 8251 if (hasalt && ztest_opts.zo_verbose >= 1) { 8252 (void) printf("Executing newer ztest: %s\n", 8253 cmd); 8254 } 8255 newer++; 8256 killed = exec_child(cmd, NULL, B_TRUE, &status); 8257 } else { 8258 if (hasalt && ztest_opts.zo_verbose >= 1) { 8259 (void) printf("Executing older ztest: %s\n", 8260 ztest_opts.zo_alt_ztest); 8261 } 8262 older++; 8263 killed = exec_child(ztest_opts.zo_alt_ztest, 8264 ztest_opts.zo_alt_libpath, B_TRUE, &status); 8265 } 8266 8267 if (killed) 8268 kills++; 8269 iters++; 8270 8271 if (ztest_opts.zo_verbose >= 1) { 8272 hrtime_t now = gethrtime(); 8273 8274 now = MIN(now, zs->zs_proc_stop); 8275 print_time(zs->zs_proc_stop - now, timebuf); 8276 nicenum(zs->zs_space, numbuf, sizeof (numbuf)); 8277 8278 (void) printf("Pass %3d, %8s, %3"PRIu64" ENOSPC, " 8279 "%4.1f%% of %5s used, %3.0f%% done, %8s to go\n", 8280 iters, 8281 WIFEXITED(status) ? 
"Complete" : "SIGKILL", 8282 zs->zs_enospc_count, 8283 100.0 * zs->zs_alloc / zs->zs_space, 8284 numbuf, 8285 100.0 * (now - zs->zs_proc_start) / 8286 (ztest_opts.zo_time * NANOSEC), timebuf); 8287 } 8288 8289 if (ztest_opts.zo_verbose >= 2) { 8290 (void) printf("\nWorkload summary:\n\n"); 8291 (void) printf("%7s %9s %s\n", 8292 "Calls", "Time", "Function"); 8293 (void) printf("%7s %9s %s\n", 8294 "-----", "----", "--------"); 8295 for (f = 0; f < ZTEST_FUNCS; f++) { 8296 zi = &ztest_info[f]; 8297 zc = ZTEST_GET_SHARED_CALLSTATE(f); 8298 print_time(zc->zc_time, timebuf); 8299 (void) printf("%7"PRIu64" %9s %s\n", 8300 zc->zc_count, timebuf, 8301 zi->zi_funcname); 8302 } 8303 (void) printf("\n"); 8304 } 8305 8306 if (!ztest_opts.zo_mmp_test) 8307 ztest_run_zdb(ztest_opts.zo_pool); 8308 } 8309 8310 if (ztest_opts.zo_verbose >= 1) { 8311 if (hasalt) { 8312 (void) printf("%d runs of older ztest: %s\n", older, 8313 ztest_opts.zo_alt_ztest); 8314 (void) printf("%d runs of newer ztest: %s\n", newer, 8315 cmd); 8316 } 8317 (void) printf("%d killed, %d completed, %.0f%% kill rate\n", 8318 kills, iters - kills, (100.0 * kills) / MAX(1, iters)); 8319 } 8320 8321 umem_free(cmd, MAXNAMELEN); 8322 8323 return (0); 8324 } 8325