/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
 * Copyright (c) 2013 Steven Hartland. All rights reserved.
 * Copyright (c) 2014 Integros [integros.com]
 * Copyright 2017 Joyent, Inc.
 * Copyright (c) 2017, Intel Corporation.
 */

/*
 * The objective of this program is to provide a DMU/ZAP/SPA stress test
 * that runs entirely in userland, is easy to use, and easy to extend.
 *
 * The overall design of the ztest program is as follows:
 *
 * (1) For each major functional area (e.g. adding vdevs to a pool,
 *     creating and destroying datasets, reading and writing objects, etc)
 *     we have a simple routine to test that functionality.  These
 *     individual routines do not have to do anything "stressful".
 *
 * (2) We turn these simple functionality tests into a stress test by
 *     running them all in parallel, with as many threads as desired,
 *     and spread across as many datasets, objects, and vdevs as desired.
 *
 * (3) While all this is happening, we inject faults into the pool to
 *     verify that self-healing data really works.
 *
 * (4) Every time we open a dataset, we change its checksum and compression
 *     functions.  Thus even individual objects vary from block to block
 *     in which checksum they use and whether they're compressed.
 *
 * (5) To verify that we never lose on-disk consistency after a crash,
 *     we run the entire test in a child of the main process.
 *     At random times, the child self-immolates with a SIGKILL.
 *     This is the software equivalent of pulling the power cord.
 *     The parent then runs the test again, using the existing
 *     storage pool, as many times as desired.  If backwards compatibility
 *     testing is enabled, ztest will sometimes run the "older" version
 *     of ztest after a SIGKILL.
 *
 * (6) To verify that we don't have future leaks or temporal incursions,
 *     many of the functional tests record the transaction group number
 *     as part of their data.  When reading old data, they verify that
 *     the transaction group number is less than the current, open txg.
 *     If you add a new test, please do this if applicable.
 *
 * (7) Threads are created with a reduced stack size, for sanity checking.
 *     Therefore, it's important not to allocate huge buffers on the stack.
 *
 * When run with no arguments, ztest runs for about five minutes and
 * produces no output if successful.  To get a little bit of information,
 * specify -V.  To get more information, specify -VV, and so on.
 *
 * To turn this into an overnight stress test, use -T to specify run time.
 *
 * You can ask for more vdevs [-v], datasets [-d], or threads [-t]
 * to increase the pool capacity, fanout, and overall stress level.
 *
 * Use the -k option to set the desired frequency of kills.
 *
 * When ztest invokes itself it passes all relevant information through a
 * temporary file which is mmap-ed in the child process.  This allows shared
 * memory to survive the exec syscall.  The ztest_shared_hdr_t struct is always
 * stored at offset 0 of this file and contains information on the size and
 * number of shared structures in the file.  The information stored in this
 * file must remain backwards compatible with older versions of ztest so that
 * ztest can invoke them during backwards compatibility testing (-B).
 */
#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/dmu.h>
#include <sys/txg.h>
#include <sys/dbuf.h>
#include <sys/zap.h>
#include <sys/dmu_objset.h>
#include <sys/poll.h>
#include <sys/stat.h>
#include <sys/time.h>
#include <sys/wait.h>
#include <sys/mman.h>
#include <sys/resource.h>
#include <sys/zio.h>
#include <sys/zil.h>
#include <sys/zil_impl.h>
#include <sys/vdev_draid.h>
#include <sys/vdev_impl.h>
#include <sys/vdev_file.h>
#include <sys/vdev_initialize.h>
#include <sys/vdev_raidz.h>
#include <sys/vdev_trim.h>
#include <sys/spa_impl.h>
#include <sys/metaslab_impl.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_destroy.h>
#include <sys/dsl_scan.h>
#include <sys/zio_checksum.h>
#include <sys/zfs_refcount.h>
#include <sys/zfeature.h>
#include <sys/dsl_userhold.h>
#include <sys/abd.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <getopt.h>
#include <signal.h>
#include <umem.h>
#include <ctype.h>
#include <math.h>
#include <sys/fs/zfs.h>
#include <zfs_fletcher.h>
#include <libnvpair.h>
#include <libzutil.h>
#include <sys/crypto/icp.h>
#if (__GLIBC__ && !__UCLIBC__)
#include <execinfo.h>	/* for backtrace() */
#endif

static int ztest_fd_data = -1;
static int ztest_fd_rand = -1;

typedef struct ztest_shared_hdr {
	uint64_t	zh_hdr_size;
	uint64_t	zh_opts_size;
	uint64_t	zh_size;
	uint64_t	zh_stats_size;
	uint64_t	zh_stats_count;
	uint64_t	zh_ds_size;
	uint64_t	zh_ds_count;
} ztest_shared_hdr_t;

static ztest_shared_hdr_t *ztest_shared_hdr;
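
/*
 * Illustrative sketch (not ztest's actual setup code) of the general
 * technique described in the big comment above: shared state survives
 * exec(2) because both parent and re-exec'ed child mmap() the same
 * temporary file MAP_SHARED, with the inherited fd number handed to the
 * child out of band.  All names below are hypothetical.
 */
#if 0
static void *
share_state_over_exec(size_t size)
{
	char name[] = "/tmp/zt_shared.XXXXXX";
	int fd = mkstemp(name);			/* backing file */

	if (fd == -1)
		return (NULL);
	(void) unlink(name);			/* keep it anonymous */
	if (ftruncate(fd, size) != 0)
		return (NULL);

	/*
	 * MAP_SHARED mappings of the same file observe each other's
	 * stores, and a plain fd (no FD_CLOEXEC) survives exec(), so
	 * the child can simply mmap() the inherited fd again.
	 */
	return (mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED,
	    fd, 0));
}
#endif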
enum ztest_class_state {
	ZTEST_VDEV_CLASS_OFF,
	ZTEST_VDEV_CLASS_ON,
	ZTEST_VDEV_CLASS_RND
};

#define	ZO_GVARS_MAX_ARGLEN	((size_t)64)
#define	ZO_GVARS_MAX_COUNT	((size_t)10)

typedef struct ztest_shared_opts {
	char zo_pool[ZFS_MAX_DATASET_NAME_LEN];
	char zo_dir[ZFS_MAX_DATASET_NAME_LEN];
	char zo_alt_ztest[MAXNAMELEN];
	char zo_alt_libpath[MAXNAMELEN];
	uint64_t zo_vdevs;
	uint64_t zo_vdevtime;
	size_t zo_vdev_size;
	int zo_ashift;
	int zo_mirrors;
	int zo_raid_children;
	int zo_raid_parity;
	char zo_raid_type[8];
	int zo_draid_data;
	int zo_draid_spares;
	int zo_datasets;
	int zo_threads;
	uint64_t zo_passtime;
	uint64_t zo_killrate;
	int zo_verbose;
	int zo_init;
	uint64_t zo_time;
	uint64_t zo_maxloops;
	uint64_t zo_metaslab_force_ganging;
	int zo_mmp_test;
	int zo_special_vdevs;
	int zo_dump_dbgmsg;
	int zo_gvars_count;
	char zo_gvars[ZO_GVARS_MAX_COUNT][ZO_GVARS_MAX_ARGLEN];
} ztest_shared_opts_t;

/* Default values for command line options. */
#define	DEFAULT_POOL "ztest"
#define	DEFAULT_VDEV_DIR "/tmp"
#define	DEFAULT_VDEV_COUNT 5
#define	DEFAULT_VDEV_SIZE (SPA_MINDEVSIZE * 4)	/* 256m default size */
#define	DEFAULT_VDEV_SIZE_STR "256M"
#define	DEFAULT_ASHIFT SPA_MINBLOCKSHIFT
#define	DEFAULT_MIRRORS 2
#define	DEFAULT_RAID_CHILDREN 4
#define	DEFAULT_RAID_PARITY 1
#define	DEFAULT_DRAID_DATA 4
#define	DEFAULT_DRAID_SPARES 1
#define	DEFAULT_DATASETS_COUNT 7
#define	DEFAULT_THREADS 23
#define	DEFAULT_RUN_TIME 300		/* 300 seconds */
#define	DEFAULT_RUN_TIME_STR "300 sec"
#define	DEFAULT_PASS_TIME 60		/* 60 seconds */
#define	DEFAULT_PASS_TIME_STR "60 sec"
#define	DEFAULT_KILL_RATE 70		/* 70% kill rate */
#define	DEFAULT_KILLRATE_STR "70%"
#define	DEFAULT_INITS 1
#define	DEFAULT_MAX_LOOPS 50		/* 5 minutes */
#define	DEFAULT_FORCE_GANGING (64 << 10)
#define	DEFAULT_FORCE_GANGING_STR "64K"

/* Simplifying assumption: -1 is not a valid default. */
#define	NO_DEFAULT -1

static const ztest_shared_opts_t ztest_opts_defaults = {
	.zo_pool = DEFAULT_POOL,
	.zo_dir = DEFAULT_VDEV_DIR,
	.zo_alt_ztest = { '\0' },
	.zo_alt_libpath = { '\0' },
	.zo_vdevs = DEFAULT_VDEV_COUNT,
	.zo_ashift = DEFAULT_ASHIFT,
	.zo_mirrors = DEFAULT_MIRRORS,
	.zo_raid_children = DEFAULT_RAID_CHILDREN,
	.zo_raid_parity = DEFAULT_RAID_PARITY,
	.zo_raid_type = VDEV_TYPE_RAIDZ,
	.zo_vdev_size = DEFAULT_VDEV_SIZE,
	.zo_draid_data = DEFAULT_DRAID_DATA,		/* data drives */
	.zo_draid_spares = DEFAULT_DRAID_SPARES,	/* distributed spares */
	.zo_datasets = DEFAULT_DATASETS_COUNT,
	.zo_threads = DEFAULT_THREADS,
	.zo_passtime = DEFAULT_PASS_TIME,
	.zo_killrate = DEFAULT_KILL_RATE,
	.zo_verbose = 0,
	.zo_mmp_test = 0,
	.zo_init = DEFAULT_INITS,
	.zo_time = DEFAULT_RUN_TIME,
	.zo_maxloops = DEFAULT_MAX_LOOPS, /* max loops during spa_freeze() */
	.zo_metaslab_force_ganging = DEFAULT_FORCE_GANGING,
	.zo_special_vdevs = ZTEST_VDEV_CLASS_RND,
	.zo_gvars_count = 0,
};

extern uint64_t metaslab_force_ganging;
extern uint64_t metaslab_df_alloc_threshold;
extern unsigned long zfs_deadman_synctime_ms;
extern int metaslab_preload_limit;
extern int zfs_compressed_arc_enabled;
extern int zfs_abd_scatter_enabled;
extern int dmu_object_alloc_chunk_shift;
extern boolean_t zfs_force_some_double_word_sm_entries;
extern unsigned long zio_decompress_fail_fraction;
extern unsigned long zfs_reconstruct_indirect_damage_fraction;

static ztest_shared_opts_t *ztest_shared_opts;
static ztest_shared_opts_t ztest_opts;
static char *ztest_wkeydata = "abcdefghijklmnopqrstuvwxyz012345";

typedef struct ztest_shared_ds {
	uint64_t	zd_seq;
} ztest_shared_ds_t;

static ztest_shared_ds_t *ztest_shared_ds;
#define	ZTEST_GET_SHARED_DS(d) (&ztest_shared_ds[d])

#define	BT_MAGIC	0x123456789abcdefULL
#define	MAXFAULTS(zs) \
	(MAX((zs)->zs_mirrors, 1) * (ztest_opts.zo_raid_parity + 1) - 1)
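
/*
 * Worked example for MAXFAULTS() above: with the defaults (-m 2 mirror
 * copies, -R 1 raidz parity) it evaluates to
 * MAX(2, 1) * (1 + 1) - 1 = 3, i.e. up to three leaf vdevs may be
 * faulted before the pool runs out of redundancy.
 */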
enum ztest_io_type {
	ZTEST_IO_WRITE_TAG,
	ZTEST_IO_WRITE_PATTERN,
	ZTEST_IO_WRITE_ZEROES,
	ZTEST_IO_TRUNCATE,
	ZTEST_IO_SETATTR,
	ZTEST_IO_REWRITE,
	ZTEST_IO_TYPES
};

typedef struct ztest_block_tag {
	uint64_t	bt_magic;
	uint64_t	bt_objset;
	uint64_t	bt_object;
	uint64_t	bt_dnodesize;
	uint64_t	bt_offset;
	uint64_t	bt_gen;
	uint64_t	bt_txg;
	uint64_t	bt_crtxg;
} ztest_block_tag_t;

typedef struct bufwad {
	uint64_t	bw_index;
	uint64_t	bw_txg;
	uint64_t	bw_data;
} bufwad_t;

/*
 * It would be better to use a rangelock_t per object.  Unfortunately
 * the rangelock_t is not a drop-in replacement for rl_t, because we
 * still need to map from object ID to rangelock_t.
 */
typedef enum {
	RL_READER,
	RL_WRITER,
	RL_APPEND
} rl_type_t;

typedef struct rll {
	void		*rll_writer;
	int		rll_readers;
	kmutex_t	rll_lock;
	kcondvar_t	rll_cv;
} rll_t;

typedef struct rl {
	uint64_t	rl_object;
	uint64_t	rl_offset;
	uint64_t	rl_size;
	rll_t		*rl_lock;
} rl_t;

#define	ZTEST_RANGE_LOCKS	64
#define	ZTEST_OBJECT_LOCKS	64

/*
 * Object descriptor.  Used as a template for object lookup/create/remove.
 */
typedef struct ztest_od {
	uint64_t	od_dir;
	uint64_t	od_object;
	dmu_object_type_t od_type;
	dmu_object_type_t od_crtype;
	uint64_t	od_blocksize;
	uint64_t	od_crblocksize;
	uint64_t	od_crdnodesize;
	uint64_t	od_gen;
	uint64_t	od_crgen;
	char		od_name[ZFS_MAX_DATASET_NAME_LEN];
} ztest_od_t;

/*
 * Per-dataset state.
 */
typedef struct ztest_ds {
	ztest_shared_ds_t *zd_shared;
	objset_t	*zd_os;
	pthread_rwlock_t zd_zilog_lock;
	zilog_t		*zd_zilog;
	ztest_od_t	*zd_od;		/* debugging aid */
	char		zd_name[ZFS_MAX_DATASET_NAME_LEN];
	kmutex_t	zd_dirobj_lock;
	rll_t		zd_object_lock[ZTEST_OBJECT_LOCKS];
	rll_t		zd_range_lock[ZTEST_RANGE_LOCKS];
} ztest_ds_t;

/*
 * Per-iteration state.
 */
typedef void ztest_func_t(ztest_ds_t *zd, uint64_t id);

typedef struct ztest_info {
	ztest_func_t	*zi_func;	/* test function */
	uint64_t	zi_iters;	/* iterations per execution */
	uint64_t	*zi_interval;	/* execute every <interval> seconds */
	const char	*zi_funcname;	/* name of test function */
} ztest_info_t;

typedef struct ztest_shared_callstate {
	uint64_t	zc_count;	/* per-pass count */
	uint64_t	zc_time;	/* per-pass time */
	uint64_t	zc_next;	/* next time to call this function */
} ztest_shared_callstate_t;

static ztest_shared_callstate_t *ztest_shared_callstate;
#define	ZTEST_GET_SHARED_CALLSTATE(c) (&ztest_shared_callstate[c])

ztest_func_t ztest_dmu_read_write;
ztest_func_t ztest_dmu_write_parallel;
ztest_func_t ztest_dmu_object_alloc_free;
ztest_func_t ztest_dmu_object_next_chunk;
ztest_func_t ztest_dmu_commit_callbacks;
ztest_func_t ztest_zap;
ztest_func_t ztest_zap_parallel;
ztest_func_t ztest_zil_commit;
ztest_func_t ztest_zil_remount;
ztest_func_t ztest_dmu_read_write_zcopy;
ztest_func_t ztest_dmu_objset_create_destroy;
ztest_func_t ztest_dmu_prealloc;
ztest_func_t ztest_fzap;
ztest_func_t ztest_dmu_snapshot_create_destroy;
ztest_func_t ztest_dsl_prop_get_set;
ztest_func_t ztest_spa_prop_get_set;
ztest_func_t ztest_spa_create_destroy;
ztest_func_t ztest_fault_inject;
ztest_func_t ztest_dmu_snapshot_hold;
ztest_func_t ztest_mmp_enable_disable;
ztest_func_t ztest_scrub;
ztest_func_t ztest_dsl_dataset_promote_busy;
ztest_func_t ztest_vdev_attach_detach;
ztest_func_t ztest_vdev_LUN_growth;
ztest_func_t ztest_vdev_add_remove;
ztest_func_t ztest_vdev_class_add;
ztest_func_t ztest_vdev_aux_add_remove;
ztest_func_t ztest_split_pool;
ztest_func_t ztest_reguid;
ztest_func_t ztest_spa_upgrade;
ztest_func_t ztest_device_removal;
ztest_func_t ztest_spa_checkpoint_create_discard;
ztest_func_t ztest_initialize;
ztest_func_t ztest_trim;
ztest_func_t ztest_fletcher;
ztest_func_t ztest_fletcher_incr;
ztest_func_t ztest_verify_dnode_bt;

uint64_t zopt_always = 0ULL * NANOSEC;		/* all the time */
uint64_t zopt_incessant = 1ULL * NANOSEC / 10;	/* every 1/10 second */
uint64_t zopt_often = 1ULL * NANOSEC;		/* every second */
uint64_t zopt_sometimes = 10ULL * NANOSEC;	/* every 10 seconds */
uint64_t zopt_rarely = 60ULL * NANOSEC;		/* every 60 seconds */

#define	ZTI_INIT(func, iters, interval) \
	{   .zi_func = (func), \
	    .zi_iters = (iters), \
	    .zi_interval = (interval), \
	    .zi_funcname = # func }

ztest_info_t ztest_info[] = {
	ZTI_INIT(ztest_dmu_read_write, 1, &zopt_always),
	ZTI_INIT(ztest_dmu_write_parallel, 10, &zopt_always),
	ZTI_INIT(ztest_dmu_object_alloc_free, 1, &zopt_always),
	ZTI_INIT(ztest_dmu_object_next_chunk, 1, &zopt_sometimes),
	ZTI_INIT(ztest_dmu_commit_callbacks, 1, &zopt_always),
	ZTI_INIT(ztest_zap, 30, &zopt_always),
	ZTI_INIT(ztest_zap_parallel, 100, &zopt_always),
	ZTI_INIT(ztest_split_pool, 1, &zopt_always),
	ZTI_INIT(ztest_zil_commit, 1, &zopt_incessant),
	ZTI_INIT(ztest_zil_remount, 1, &zopt_sometimes),
	ZTI_INIT(ztest_dmu_read_write_zcopy, 1, &zopt_often),
	ZTI_INIT(ztest_dmu_objset_create_destroy, 1, &zopt_often),
	ZTI_INIT(ztest_dsl_prop_get_set, 1, &zopt_often),
	ZTI_INIT(ztest_spa_prop_get_set, 1, &zopt_sometimes),
#if 0
	ZTI_INIT(ztest_dmu_prealloc, 1, &zopt_sometimes),
#endif
	ZTI_INIT(ztest_fzap, 1, &zopt_sometimes),
	ZTI_INIT(ztest_dmu_snapshot_create_destroy, 1, &zopt_sometimes),
	ZTI_INIT(ztest_spa_create_destroy, 1, &zopt_sometimes),
	ZTI_INIT(ztest_fault_inject, 1, &zopt_sometimes),
	ZTI_INIT(ztest_dmu_snapshot_hold, 1, &zopt_sometimes),
	ZTI_INIT(ztest_mmp_enable_disable, 1, &zopt_sometimes),
	ZTI_INIT(ztest_reguid, 1, &zopt_rarely),
	ZTI_INIT(ztest_scrub, 1, &zopt_rarely),
	ZTI_INIT(ztest_spa_upgrade, 1, &zopt_rarely),
	ZTI_INIT(ztest_dsl_dataset_promote_busy, 1, &zopt_rarely),
	ZTI_INIT(ztest_vdev_attach_detach, 1, &zopt_sometimes),
	ZTI_INIT(ztest_vdev_LUN_growth, 1, &zopt_rarely),
	ZTI_INIT(ztest_vdev_add_remove, 1, &ztest_opts.zo_vdevtime),
	ZTI_INIT(ztest_vdev_class_add, 1, &ztest_opts.zo_vdevtime),
	ZTI_INIT(ztest_vdev_aux_add_remove, 1, &ztest_opts.zo_vdevtime),
	ZTI_INIT(ztest_device_removal, 1, &zopt_sometimes),
	ZTI_INIT(ztest_spa_checkpoint_create_discard, 1, &zopt_rarely),
	ZTI_INIT(ztest_initialize, 1, &zopt_sometimes),
	ZTI_INIT(ztest_trim, 1, &zopt_sometimes),
	ZTI_INIT(ztest_fletcher, 1, &zopt_rarely),
	ZTI_INIT(ztest_fletcher_incr, 1, &zopt_rarely),
	ZTI_INIT(ztest_verify_dnode_bt, 1, &zopt_sometimes),
};

#define	ZTEST_FUNCS	(sizeof (ztest_info) / sizeof (ztest_info_t))

/*
 * The following struct is used to hold a list of uncalled commit callbacks.
 * The callbacks are ordered by txg number.
 */
typedef struct ztest_cb_list {
	kmutex_t	zcl_callbacks_lock;
	list_t		zcl_callbacks;
} ztest_cb_list_t;
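
/*
 * Hedged sketch of how the ztest_info[] table above is meant to drive
 * execution (the actual per-thread loop lives further down in this
 * file; `f', `zd', and `id' are illustrative): a test only fires once
 * its shared zc_next deadline has passed, and is then rescheduled
 * roughly zi_interval nanoseconds into the future.
 */
#if 0
	ztest_info_t *zi = &ztest_info[f];
	ztest_shared_callstate_t *zc = ZTEST_GET_SHARED_CALLSTATE(f);
	hrtime_t now = gethrtime();

	if (now >= zc->zc_next) {
		zc->zc_next = now + *zi->zi_interval;
		for (uint64_t i = 0; i < zi->zi_iters; i++)
			zi->zi_func(zd, id);	/* one test iteration */
	}
#endif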
/*
 * Stuff we need to share writably between parent and child.
 */
typedef struct ztest_shared {
	boolean_t	zs_do_init;
	hrtime_t	zs_proc_start;
	hrtime_t	zs_proc_stop;
	hrtime_t	zs_thread_start;
	hrtime_t	zs_thread_stop;
	hrtime_t	zs_thread_kill;
	uint64_t	zs_enospc_count;
	uint64_t	zs_vdev_next_leaf;
	uint64_t	zs_vdev_aux;
	uint64_t	zs_alloc;
	uint64_t	zs_space;
	uint64_t	zs_splits;
	uint64_t	zs_mirrors;
	uint64_t	zs_metaslab_sz;
	uint64_t	zs_metaslab_df_alloc_threshold;
	uint64_t	zs_guid;
} ztest_shared_t;

#define	ID_PARALLEL	-1ULL

static char ztest_dev_template[] = "%s/%s.%llua";
static char ztest_aux_template[] = "%s/%s.%s.%llu";
ztest_shared_t *ztest_shared;

static spa_t *ztest_spa = NULL;
static ztest_ds_t *ztest_ds;

static kmutex_t ztest_vdev_lock;
static boolean_t ztest_device_removal_active = B_FALSE;
static boolean_t ztest_pool_scrubbed = B_FALSE;
static kmutex_t ztest_checkpoint_lock;

/*
 * The ztest_name_lock protects the pool and dataset namespace used by
 * the individual tests.  To modify the namespace, consumers must grab
 * this lock as writer.  Grabbing the lock as reader will ensure that the
 * namespace does not change while the lock is held.
 */
static pthread_rwlock_t ztest_name_lock;

static boolean_t ztest_dump_core = B_TRUE;
static boolean_t ztest_exiting;

/* Global commit callback list */
static ztest_cb_list_t zcl;
/* Commit cb delay */
static uint64_t zc_min_txg_delay = UINT64_MAX;
static int zc_cb_counter = 0;

/*
 * Minimum number of commit callbacks that need to be registered for us to
 * check whether the minimum txg delay is acceptable.
 */
#define	ZTEST_COMMIT_CB_MIN_REG	100

/*
 * If a number of txgs equal to this threshold have been created after a commit
 * callback has been registered but not called, then we assume there is an
 * implementation bug.
 */
#define	ZTEST_COMMIT_CB_THRESH	(TXG_CONCURRENT_STATES + 1000)

enum ztest_object {
	ZTEST_META_DNODE = 0,
	ZTEST_DIROBJ,
	ZTEST_OBJECTS
};

static __attribute__((noreturn)) void usage(boolean_t requested);
static int ztest_scrub_impl(spa_t *spa);

/*
 * These libumem hooks provide a reasonable set of defaults for the allocator's
 * debugging facilities.
 */
const char *
_umem_debug_init(void)
{
	return ("default,verbose"); /* $UMEM_DEBUG setting */
}

const char *
_umem_logging_init(void)
{
	return ("fail,contents"); /* $UMEM_LOGGING setting */
}

static void
dump_debug_buffer(void)
{
	ssize_t ret __attribute__((unused));

	if (!ztest_opts.zo_dump_dbgmsg)
		return;

	/*
	 * We use write() instead of printf() so that this function
	 * is safe to call from a signal handler.
	 */
	ret = write(STDOUT_FILENO, "\n", 1);
	zfs_dbgmsg_print("ztest");
}

#define	BACKTRACE_SZ	100

static void
sig_handler(int signo)
{
	struct sigaction action;
#if (__GLIBC__ && !__UCLIBC__) /* backtrace() is a GNU extension */
	int nptrs;
	void *buffer[BACKTRACE_SZ];

	nptrs = backtrace(buffer, BACKTRACE_SZ);
	backtrace_symbols_fd(buffer, nptrs, STDERR_FILENO);
#endif
	dump_debug_buffer();

	/*
	 * Restore default action and re-raise signal so SIGSEGV and
	 * SIGABRT can trigger a core dump.
	 */
	action.sa_handler = SIG_DFL;
	sigemptyset(&action.sa_mask);
	action.sa_flags = 0;
	(void) sigaction(signo, &action, NULL);
	raise(signo);
}

#define	FATAL_MSG_SZ	1024

char *fatal_msg;

static __attribute__((format(printf, 2, 3))) __attribute__((noreturn)) void
fatal(int do_perror, char *message, ...)
{
	va_list args;
	int save_errno = errno;
	char *buf;

	(void) fflush(stdout);
	buf = umem_alloc(FATAL_MSG_SZ, UMEM_NOFAIL);
	if (buf == NULL)
		goto out;

	va_start(args, message);
	(void) sprintf(buf, "ztest: ");
	/* LINTED */
	(void) vsprintf(buf + strlen(buf), message, args);
	va_end(args);
	if (do_perror) {
		(void) snprintf(buf + strlen(buf), FATAL_MSG_SZ - strlen(buf),
		    ": %s", strerror(save_errno));
	}
	(void) fprintf(stderr, "%s\n", buf);
	fatal_msg = buf;			/* to ease debugging */

out:
	if (ztest_dump_core)
		abort();
	else
		dump_debug_buffer();

	exit(3);
}

static int
str2shift(const char *buf)
{
	const char *ends = "BKMGTPEZ";
	int i;

	if (buf[0] == '\0')
		return (0);
	for (i = 0; i < strlen(ends); i++) {
		if (toupper(buf[0]) == ends[i])
			break;
	}
	if (i == strlen(ends)) {
		(void) fprintf(stderr, "ztest: invalid bytes suffix: %s\n",
		    buf);
		usage(B_FALSE);
	}
	if (buf[1] == '\0' || (toupper(buf[1]) == 'B' && buf[2] == '\0')) {
		return (10*i);
	}
	(void) fprintf(stderr, "ztest: invalid bytes suffix: %s\n", buf);
	usage(B_FALSE);
}

static uint64_t
nicenumtoull(const char *buf)
{
	char *end;
	uint64_t val;

	val = strtoull(buf, &end, 0);
	if (end == buf) {
		(void) fprintf(stderr, "ztest: bad numeric value: %s\n", buf);
		usage(B_FALSE);
	} else if (end[0] == '.') {
		double fval = strtod(buf, &end);
		fval *= pow(2, str2shift(end));
		/*
		 * UINT64_MAX is not exactly representable as a double.
		 * The closest representation is UINT64_MAX + 1, so we
		 * use a >= comparison instead of > for the bounds check.
		 */
		if (fval >= (double)UINT64_MAX) {
			(void) fprintf(stderr, "ztest: value too large: %s\n",
			    buf);
			usage(B_FALSE);
		}
		val = (uint64_t)fval;
	} else {
		int shift = str2shift(end);
		if (shift >= 64 || (val << shift) >> shift != val) {
			(void) fprintf(stderr, "ztest: value too large: %s\n",
			    buf);
			usage(B_FALSE);
		}
		val <<= shift;
	}
	return (val);
}
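
/*
 * Worked examples for nicenumtoull() and str2shift() above:
 * "8" -> 8; "512K" -> 512 << 10 = 524288; "1.5G" takes the strtod()
 * path, 1.5 * 2^30 = 1610612736.  A bare "K".."Z" suffix and a
 * trailing "B" ("64KB") are both accepted.
 */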
typedef struct ztest_option {
	const char	short_opt;
	const char	*long_opt;
	const char	*long_opt_param;
	const char	*comment;
	unsigned int	default_int;
	char		*default_str;
} ztest_option_t;

/*
 * The following option_table is used for generating the usage info as well as
 * the long and short option information for calling getopt_long().
 */
static ztest_option_t option_table[] = {
	{ 'v',	"vdevs", "INTEGER", "Number of vdevs", DEFAULT_VDEV_COUNT,
	    NULL},
	{ 's',	"vdev-size", "INTEGER", "Size of each vdev",
	    NO_DEFAULT, DEFAULT_VDEV_SIZE_STR},
	{ 'a',	"alignment-shift", "INTEGER",
	    "Alignment shift; use 0 for random", DEFAULT_ASHIFT, NULL},
	{ 'm',	"mirror-copies", "INTEGER", "Number of mirror copies",
	    DEFAULT_MIRRORS, NULL},
	{ 'r',	"raid-disks", "INTEGER", "Number of raidz/draid disks",
	    DEFAULT_RAID_CHILDREN, NULL},
	{ 'R',	"raid-parity", "INTEGER", "Raid parity",
	    DEFAULT_RAID_PARITY, NULL},
	{ 'K',	"raid-kind", "raidz|draid|random", "Raid kind",
	    NO_DEFAULT, "random"},
	{ 'D',	"draid-data", "INTEGER", "Number of draid data drives",
	    DEFAULT_DRAID_DATA, NULL},
	{ 'S',	"draid-spares", "INTEGER", "Number of draid spares",
	    DEFAULT_DRAID_SPARES, NULL},
	{ 'd',	"datasets", "INTEGER", "Number of datasets",
	    DEFAULT_DATASETS_COUNT, NULL},
	{ 't',	"threads", "INTEGER", "Number of ztest threads",
	    DEFAULT_THREADS, NULL},
	{ 'g',	"gang-block-threshold", "INTEGER",
	    "Metaslab gang block threshold",
	    NO_DEFAULT, DEFAULT_FORCE_GANGING_STR},
	{ 'i',	"init-count", "INTEGER", "Number of times to initialize pool",
	    DEFAULT_INITS, NULL},
	{ 'k',	"kill-percentage", "INTEGER", "Kill percentage",
	    NO_DEFAULT, DEFAULT_KILLRATE_STR},
	{ 'p',	"pool-name", "STRING", "Pool name",
	    NO_DEFAULT, DEFAULT_POOL},
	{ 'f',	"vdev-file-directory", "PATH", "File directory for vdev files",
	    NO_DEFAULT, DEFAULT_VDEV_DIR},
	{ 'M',	"multi-host", NULL,
	    "Multi-host; simulate pool imported on remote host",
	    NO_DEFAULT, NULL},
	{ 'E',	"use-existing-pool", NULL,
	    "Use existing pool instead of creating new one", NO_DEFAULT, NULL},
	{ 'T',	"run-time", "INTEGER", "Total run time",
	    NO_DEFAULT, DEFAULT_RUN_TIME_STR},
	{ 'P',	"pass-time", "INTEGER", "Time per pass",
	    NO_DEFAULT, DEFAULT_PASS_TIME_STR},
	{ 'F',	"freeze-loops", "INTEGER", "Max loops in spa_freeze()",
	    DEFAULT_MAX_LOOPS, NULL},
	{ 'B',	"alt-ztest", "PATH", "Alternate ztest path",
	    NO_DEFAULT, NULL},
	{ 'C',	"vdev-class-state", "on|off|random", "vdev class state",
	    NO_DEFAULT, "random"},
	{ 'o',	"option", "\"OPTION=INTEGER\"",
	    "Set global variable to an unsigned 32-bit integer value",
	    NO_DEFAULT, NULL},
	{ 'G',	"dump-debug-msg", NULL,
	    "Dump zfs_dbgmsg buffer before exiting due to an error",
	    NO_DEFAULT, NULL},
	{ 'V',	"verbose", NULL,
	    "Verbose (use multiple times for ever more verbosity)",
	    NO_DEFAULT, NULL},
	{ 'h',	"help", NULL, "Show this help",
	    NO_DEFAULT, NULL},
	{0, 0, 0, 0, 0, 0}
};

static struct option *long_opts = NULL;
static char *short_opts = NULL;

static void
init_options(void)
{
	ASSERT3P(long_opts, ==, NULL);
	ASSERT3P(short_opts, ==, NULL);

	int count = sizeof (option_table) / sizeof (option_table[0]);
	long_opts = umem_alloc(sizeof (struct option) * count, UMEM_NOFAIL);

	short_opts = umem_alloc(sizeof (char) * 2 * count, UMEM_NOFAIL);
	int short_opt_index = 0;

	for (int i = 0; i < count; i++) {
		long_opts[i].val = option_table[i].short_opt;
		long_opts[i].name = option_table[i].long_opt;
		long_opts[i].has_arg = option_table[i].long_opt_param != NULL
		    ? required_argument : no_argument;
		long_opts[i].flag = NULL;
		short_opts[short_opt_index++] = option_table[i].short_opt;
		if (option_table[i].long_opt_param != NULL) {
			short_opts[short_opt_index++] = ':';
		}
	}
}
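
/*
 * For the option_table above, init_options() yields the short-option
 * string "v:s:a:m:r:R:K:D:S:d:t:g:i:k:p:f:MET:P:F:B:C:o:GVh" (a ':'
 * follows every option that takes a parameter).  Note that the loop
 * also copies the all-zero sentinel entry, which both terminates
 * long_opts for getopt_long() and NUL-terminates short_opts.
 */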
static void
fini_options(void)
{
	int count = sizeof (option_table) / sizeof (option_table[0]);

	umem_free(long_opts, sizeof (struct option) * count);
	umem_free(short_opts, sizeof (char) * 2 * count);

	long_opts = NULL;
	short_opts = NULL;
}

static __attribute__((noreturn)) void
usage(boolean_t requested)
{
	char option[80];
	FILE *fp = requested ? stdout : stderr;

	(void) fprintf(fp, "Usage: %s [OPTIONS...]\n", DEFAULT_POOL);
	for (int i = 0; option_table[i].short_opt != 0; i++) {
		if (option_table[i].long_opt_param != NULL) {
			(void) sprintf(option, "  -%c --%s=%s",
			    option_table[i].short_opt,
			    option_table[i].long_opt,
			    option_table[i].long_opt_param);
		} else {
			(void) sprintf(option, "  -%c --%s",
			    option_table[i].short_opt,
			    option_table[i].long_opt);
		}
		(void) fprintf(fp, " %-40s%s", option,
		    option_table[i].comment);

		if (option_table[i].long_opt_param != NULL) {
			if (option_table[i].default_str != NULL) {
				(void) fprintf(fp, " (default: %s)",
				    option_table[i].default_str);
			} else if (option_table[i].default_int != NO_DEFAULT) {
				(void) fprintf(fp, " (default: %u)",
				    option_table[i].default_int);
			}
		}
		(void) fprintf(fp, "\n");
	}
	exit(requested ? 0 : 1);
}

static uint64_t
ztest_random(uint64_t range)
{
	uint64_t r;

	ASSERT3S(ztest_fd_rand, >=, 0);

	if (range == 0)
		return (0);

	if (read(ztest_fd_rand, &r, sizeof (r)) != sizeof (r))
		fatal(B_TRUE, "short read from /dev/urandom");

	return (r % range);
}

static void
ztest_parse_name_value(const char *input, ztest_shared_opts_t *zo)
{
	char name[32];
	char *value;
	int state = ZTEST_VDEV_CLASS_RND;

	(void) strlcpy(name, input, sizeof (name));

	value = strchr(name, '=');
	if (value == NULL) {
		(void) fprintf(stderr, "missing value in property=value "
		    "'-C' argument (%s)\n", input);
		usage(B_FALSE);
	}
	*(value) = '\0';
	value++;

	if (strcmp(value, "on") == 0) {
		state = ZTEST_VDEV_CLASS_ON;
	} else if (strcmp(value, "off") == 0) {
		state = ZTEST_VDEV_CLASS_OFF;
	} else if (strcmp(value, "random") == 0) {
		state = ZTEST_VDEV_CLASS_RND;
	} else {
		(void) fprintf(stderr, "invalid property value '%s'\n", value);
		usage(B_FALSE);
	}

	if (strcmp(name, "special") == 0) {
		zo->zo_special_vdevs = state;
	} else {
		(void) fprintf(stderr, "invalid property name '%s'\n", name);
		usage(B_FALSE);
	}
	if (zo->zo_verbose >= 3)
		(void) printf("%s vdev state is '%s'\n", name, value);
}

static void
process_options(int argc, char **argv)
{
	char *path;
	ztest_shared_opts_t *zo = &ztest_opts;

	int opt;
	uint64_t value;
	const char *raid_kind = "random";

	memcpy(zo, &ztest_opts_defaults, sizeof (*zo));

	init_options();

	while ((opt = getopt_long(argc, argv, short_opts, long_opts,
	    NULL)) != EOF) {
		value = 0;
		switch (opt) {
		case 'v':
		case 's':
		case 'a':
		case 'm':
		case 'r':
		case 'R':
		case 'D':
		case 'S':
		case 'd':
		case 't':
		case 'g':
		case 'i':
		case 'k':
		case 'T':
		case 'P':
		case 'F':
			value = nicenumtoull(optarg);
		}
		switch (opt) {
		case 'v':
			zo->zo_vdevs = value;
			break;
		case 's':
			zo->zo_vdev_size = MAX(SPA_MINDEVSIZE, value);
			break;
		case 'a':
			zo->zo_ashift = value;
			break;
		case 'm':
			zo->zo_mirrors = value;
			break;
		case 'r':
			zo->zo_raid_children = MAX(1, value);
			break;
		case 'R':
			zo->zo_raid_parity = MIN(MAX(value, 1), 3);
			break;
		case 'K':
			raid_kind = optarg;
			break;
		case 'D':
			zo->zo_draid_data = MAX(1, value);
			break;
		case 'S':
			zo->zo_draid_spares = MAX(1, value);
			break;
		case 'd':
			zo->zo_datasets = MAX(1, value);
			break;
		case 't':
			zo->zo_threads = MAX(1, value);
			break;
		case 'g':
			zo->zo_metaslab_force_ganging =
			    MAX(SPA_MINBLOCKSIZE << 1, value);
			break;
		case 'i':
			zo->zo_init = value;
			break;
		case 'k':
			zo->zo_killrate = value;
			break;
		case 'p':
			(void) strlcpy(zo->zo_pool, optarg,
			    sizeof (zo->zo_pool));
			break;
		case 'f':
			path = realpath(optarg, NULL);
			if (path == NULL) {
				(void) fprintf(stderr, "error: %s: %s\n",
				    optarg, strerror(errno));
				usage(B_FALSE);
			} else {
				(void) strlcpy(zo->zo_dir, path,
				    sizeof (zo->zo_dir));
				free(path);
			}
			break;
		case 'M':
			zo->zo_mmp_test = 1;
			break;
		case 'V':
			zo->zo_verbose++;
			break;
		case 'E':
			zo->zo_init = 0;
			break;
		case 'T':
			zo->zo_time = value;
			break;
		case 'P':
			zo->zo_passtime = MAX(1, value);
			break;
		case 'F':
			zo->zo_maxloops = MAX(1, value);
			break;
		case 'B':
			(void) strlcpy(zo->zo_alt_ztest, optarg,
			    sizeof (zo->zo_alt_ztest));
			break;
		case 'C':
			ztest_parse_name_value(optarg, zo);
			break;
		case 'o':
			if (zo->zo_gvars_count >= ZO_GVARS_MAX_COUNT) {
				(void) fprintf(stderr,
				    "max global var count (%zu) exceeded\n",
				    ZO_GVARS_MAX_COUNT);
				usage(B_FALSE);
			}
			char *v = zo->zo_gvars[zo->zo_gvars_count];
			if (strlcpy(v, optarg, ZO_GVARS_MAX_ARGLEN) >=
			    ZO_GVARS_MAX_ARGLEN) {
				(void) fprintf(stderr,
				    "global var option '%s' is too long\n",
				    optarg);
				usage(B_FALSE);
			}
			zo->zo_gvars_count++;
			break;
		case 'G':
			zo->zo_dump_dbgmsg = 1;
			break;
		case 'h':
			usage(B_TRUE);
			break;
		case '?':
		default:
			usage(B_FALSE);
			break;
		}
	}

	fini_options();

	/* When raid choice is 'random', add a draid pool 50% of the time */
	if (strcmp(raid_kind, "random") == 0) {
		raid_kind = (ztest_random(2) == 0) ? "draid" : "raidz";

		if (ztest_opts.zo_verbose >= 3)
			(void) printf("choosing RAID type '%s'\n", raid_kind);
	}

	if (strcmp(raid_kind, "draid") == 0) {
		uint64_t min_devsize;

		/* With fewer disks use 256M, otherwise 128M is OK */
		min_devsize = (ztest_opts.zo_raid_children < 16) ?
		    (256ULL << 20) : (128ULL << 20);

		/* No top-level mirrors with dRAID for now */
		zo->zo_mirrors = 0;

		/* Use more appropriate defaults for dRAID */
		if (zo->zo_vdevs == ztest_opts_defaults.zo_vdevs)
			zo->zo_vdevs = 1;
		if (zo->zo_raid_children ==
		    ztest_opts_defaults.zo_raid_children)
			zo->zo_raid_children = 16;
		if (zo->zo_ashift < 12)
			zo->zo_ashift = 12;
		if (zo->zo_vdev_size < min_devsize)
			zo->zo_vdev_size = min_devsize;

		if (zo->zo_draid_data + zo->zo_raid_parity >
		    zo->zo_raid_children - zo->zo_draid_spares) {
			(void) fprintf(stderr, "error: too few draid "
			    "children (%d) for stripe width (%d)\n",
			    zo->zo_raid_children,
			    zo->zo_draid_data + zo->zo_raid_parity);
			usage(B_FALSE);
		}

		(void) strlcpy(zo->zo_raid_type, VDEV_TYPE_DRAID,
		    sizeof (zo->zo_raid_type));

	} else /* using raidz */ {
		ASSERT0(strcmp(raid_kind, "raidz"));

		zo->zo_raid_parity = MIN(zo->zo_raid_parity,
		    zo->zo_raid_children - 1);
	}

	zo->zo_vdevtime =
	    (zo->zo_vdevs > 0 ? zo->zo_time * NANOSEC / zo->zo_vdevs :
	    UINT64_MAX >> 2);

	if (*zo->zo_alt_ztest) {
		const char *invalid_what = "ztest";
		char *val = zo->zo_alt_ztest;
		if (0 != access(val, X_OK) ||
		    (strrchr(val, '/') == NULL && (errno = EINVAL)))
			goto invalid;

		int dirlen = strrchr(val, '/') - val;
		strncpy(zo->zo_alt_libpath, val, dirlen);
		invalid_what = "library path", val = zo->zo_alt_libpath;
		if (strrchr(val, '/') == NULL && (errno = EINVAL))
			goto invalid;
		*strrchr(val, '/') = '\0';
		strlcat(val, "/lib", sizeof (zo->zo_alt_libpath));

		if (0 != access(zo->zo_alt_libpath, X_OK))
			goto invalid;
		return;

invalid:
		ztest_dump_core = B_FALSE;
		fatal(B_TRUE, "invalid alternate %s %s", invalid_what, val);
	}
}

static void
ztest_kill(ztest_shared_t *zs)
{
	zs->zs_alloc = metaslab_class_get_alloc(spa_normal_class(ztest_spa));
	zs->zs_space = metaslab_class_get_space(spa_normal_class(ztest_spa));

	/*
	 * Before we kill ourselves, make sure that the config is updated.
	 * See comment above spa_write_cachefile().
	 */
	mutex_enter(&spa_namespace_lock);
	spa_write_cachefile(ztest_spa, B_FALSE, B_FALSE);
	mutex_exit(&spa_namespace_lock);

	(void) raise(SIGKILL);
}

static void
ztest_record_enospc(const char *s)
{
	(void) s;
	ztest_shared->zs_enospc_count++;
}

static uint64_t
ztest_get_ashift(void)
{
	if (ztest_opts.zo_ashift == 0)
		return (SPA_MINBLOCKSHIFT + ztest_random(5));
	return (ztest_opts.zo_ashift);
}

static boolean_t
ztest_is_draid_spare(const char *name)
{
	uint64_t spare_id = 0, parity = 0, vdev_id = 0;

	if (sscanf(name, VDEV_TYPE_DRAID "%"PRIu64"-%"PRIu64"-%"PRIu64"",
	    &parity, &vdev_id, &spare_id) == 3) {
		return (B_TRUE);
	}

	return (B_FALSE);
}

static nvlist_t *
make_vdev_file(char *path, char *aux, char *pool, size_t size, uint64_t ashift)
{
	char *pathbuf;
	uint64_t vdev;
	nvlist_t *file;
	boolean_t draid_spare = B_FALSE;

	pathbuf = umem_alloc(MAXPATHLEN, UMEM_NOFAIL);

	if (ashift == 0)
		ashift = ztest_get_ashift();

	if (path == NULL) {
		path = pathbuf;

		if (aux != NULL) {
			vdev = ztest_shared->zs_vdev_aux;
			(void) snprintf(path, MAXPATHLEN,
			    ztest_aux_template, ztest_opts.zo_dir,
			    pool == NULL ? ztest_opts.zo_pool : pool,
			    aux, vdev);
		} else {
			vdev = ztest_shared->zs_vdev_next_leaf++;
			(void) snprintf(path, MAXPATHLEN,
			    ztest_dev_template, ztest_opts.zo_dir,
			    pool == NULL ? ztest_opts.zo_pool : pool, vdev);
		}
	} else {
		draid_spare = ztest_is_draid_spare(path);
	}

	if (size != 0 && !draid_spare) {
		int fd = open(path, O_RDWR | O_CREAT | O_TRUNC, 0666);
		if (fd == -1)
			fatal(B_TRUE, "can't open %s", path);
		if (ftruncate(fd, size) != 0)
			fatal(B_TRUE, "can't ftruncate %s", path);
		(void) close(fd);
	}

	file = fnvlist_alloc();
	fnvlist_add_string(file, ZPOOL_CONFIG_TYPE,
	    draid_spare ? VDEV_TYPE_DRAID_SPARE : VDEV_TYPE_FILE);
	fnvlist_add_string(file, ZPOOL_CONFIG_PATH, path);
	fnvlist_add_uint64(file, ZPOOL_CONFIG_ASHIFT, ashift);
	umem_free(pathbuf, MAXPATHLEN);

	return (file);
}

static nvlist_t *
make_vdev_raid(char *path, char *aux, char *pool, size_t size,
    uint64_t ashift, int r)
{
	nvlist_t *raid, **child;
	int c;

	if (r < 2)
		return (make_vdev_file(path, aux, pool, size, ashift));
	child = umem_alloc(r * sizeof (nvlist_t *), UMEM_NOFAIL);

	for (c = 0; c < r; c++)
		child[c] = make_vdev_file(path, aux, pool, size, ashift);

	raid = fnvlist_alloc();
	fnvlist_add_string(raid, ZPOOL_CONFIG_TYPE,
	    ztest_opts.zo_raid_type);
	fnvlist_add_uint64(raid, ZPOOL_CONFIG_NPARITY,
	    ztest_opts.zo_raid_parity);
	fnvlist_add_nvlist_array(raid, ZPOOL_CONFIG_CHILDREN,
	    (const nvlist_t **)child, r);

	if (strcmp(ztest_opts.zo_raid_type, VDEV_TYPE_DRAID) == 0) {
		uint64_t ndata = ztest_opts.zo_draid_data;
		uint64_t nparity = ztest_opts.zo_raid_parity;
		uint64_t nspares = ztest_opts.zo_draid_spares;
		uint64_t children = ztest_opts.zo_raid_children;
		uint64_t ngroups = 1;

		/*
		 * Calculate the minimum number of groups required to fill a
		 * slice.  This is the LCM of the stripe width (data + parity)
		 * and the number of data drives (children - spares).
		 */
		while (ngroups * (ndata + nparity) % (children - nspares) != 0)
			ngroups++;

		/* Store the basic dRAID configuration. */
		fnvlist_add_uint64(raid, ZPOOL_CONFIG_DRAID_NDATA, ndata);
		fnvlist_add_uint64(raid, ZPOOL_CONFIG_DRAID_NSPARES, nspares);
		fnvlist_add_uint64(raid, ZPOOL_CONFIG_DRAID_NGROUPS, ngroups);
	}

	for (c = 0; c < r; c++)
		fnvlist_free(child[c]);

	umem_free(child, r * sizeof (nvlist_t *));

	return (raid);
}
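
/*
 * Worked example of the ngroups calculation above: with the dRAID
 * defaults (16 children, 1 spare, 4 data + 1 parity) the stripe width
 * is 5 and there are 15 data drives, so ngroups grows until
 * ngroups * 5 is a multiple of 15 -- i.e. ngroups = 3.
 */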
static nvlist_t *
make_vdev_mirror(char *path, char *aux, char *pool, size_t size,
    uint64_t ashift, int r, int m)
{
	nvlist_t *mirror, **child;
	int c;

	if (m < 1)
		return (make_vdev_raid(path, aux, pool, size, ashift, r));

	child = umem_alloc(m * sizeof (nvlist_t *), UMEM_NOFAIL);

	for (c = 0; c < m; c++)
		child[c] = make_vdev_raid(path, aux, pool, size, ashift, r);

	mirror = fnvlist_alloc();
	fnvlist_add_string(mirror, ZPOOL_CONFIG_TYPE, VDEV_TYPE_MIRROR);
	fnvlist_add_nvlist_array(mirror, ZPOOL_CONFIG_CHILDREN,
	    (const nvlist_t **)child, m);

	for (c = 0; c < m; c++)
		fnvlist_free(child[c]);

	umem_free(child, m * sizeof (nvlist_t *));

	return (mirror);
}

static nvlist_t *
make_vdev_root(char *path, char *aux, char *pool, size_t size, uint64_t ashift,
    const char *class, int r, int m, int t)
{
	nvlist_t *root, **child;
	int c;
	boolean_t log;

	ASSERT3S(t, >, 0);

	log = (class != NULL && strcmp(class, "log") == 0);

	child = umem_alloc(t * sizeof (nvlist_t *), UMEM_NOFAIL);

	for (c = 0; c < t; c++) {
		child[c] = make_vdev_mirror(path, aux, pool, size, ashift,
		    r, m);
		fnvlist_add_uint64(child[c], ZPOOL_CONFIG_IS_LOG, log);

		if (class != NULL && class[0] != '\0') {
			ASSERT(m > 1 || log);	/* expecting a mirror */
			fnvlist_add_string(child[c],
			    ZPOOL_CONFIG_ALLOCATION_BIAS, class);
		}
	}

	root = fnvlist_alloc();
	fnvlist_add_string(root, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT);
	fnvlist_add_nvlist_array(root, aux ? aux : ZPOOL_CONFIG_CHILDREN,
	    (const nvlist_t **)child, t);

	for (c = 0; c < t; c++)
		fnvlist_free(child[c]);

	umem_free(child, t * sizeof (nvlist_t *));

	return (root);
}
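
/*
 * The three constructors above nest: make_vdev_root() ->
 * make_vdev_mirror() -> make_vdev_raid() -> make_vdev_file().
 * Worked example with the defaults (-m 2 -r 4 -R 1): each of the t
 * top-level children is a 2-way mirror whose sides are 4-disk raidz1
 * vdevs, i.e.
 *
 *	root
 *	  mirror		(m = 2 copies)
 *	    raidz1		(r = 4 files each)
 *	      file
 */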
/*
 * Find a random spa version.  Returns a random spa version in the
 * range [initial_version, SPA_VERSION_FEATURES].
 */
static uint64_t
ztest_random_spa_version(uint64_t initial_version)
{
	uint64_t version = initial_version;

	if (version <= SPA_VERSION_BEFORE_FEATURES) {
		version = version +
		    ztest_random(SPA_VERSION_BEFORE_FEATURES - version + 1);
	}

	if (version > SPA_VERSION_BEFORE_FEATURES)
		version = SPA_VERSION_FEATURES;

	ASSERT(SPA_VERSION_IS_SUPPORTED(version));
	return (version);
}

static int
ztest_random_blocksize(void)
{
	ASSERT3U(ztest_spa->spa_max_ashift, !=, 0);

	/*
	 * Choose a block size >= the ashift.
	 * If the SPA supports new MAXBLOCKSIZE, test up to 1MB blocks.
	 */
	int maxbs = SPA_OLD_MAXBLOCKSHIFT;
	if (spa_maxblocksize(ztest_spa) == SPA_MAXBLOCKSIZE)
		maxbs = 20;
	uint64_t block_shift =
	    ztest_random(maxbs - ztest_spa->spa_max_ashift + 1);
	return (1 << (SPA_MINBLOCKSHIFT + block_shift));
}

static int
ztest_random_dnodesize(void)
{
	int slots;
	int max_slots = spa_maxdnodesize(ztest_spa) >> DNODE_SHIFT;

	if (max_slots == DNODE_MIN_SLOTS)
		return (DNODE_MIN_SIZE);

	/*
	 * Weight the random distribution more heavily toward smaller
	 * dnode sizes since that is more likely to reflect real-world
	 * usage.
	 */
	ASSERT3U(max_slots, >, 4);
	switch (ztest_random(10)) {
	case 0:
		slots = 5 + ztest_random(max_slots - 4);
		break;
	case 1 ... 4:
		slots = 2 + ztest_random(3);
		break;
	default:
		slots = 1;
		break;
	}

	return (slots << DNODE_SHIFT);
}

static int
ztest_random_ibshift(void)
{
	return (DN_MIN_INDBLKSHIFT +
	    ztest_random(DN_MAX_INDBLKSHIFT - DN_MIN_INDBLKSHIFT + 1));
}

static uint64_t
ztest_random_vdev_top(spa_t *spa, boolean_t log_ok)
{
	uint64_t top;
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *tvd;

	ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);

	do {
		top = ztest_random(rvd->vdev_children);
		tvd = rvd->vdev_child[top];
	} while (!vdev_is_concrete(tvd) || (tvd->vdev_islog && !log_ok) ||
	    tvd->vdev_mg == NULL || tvd->vdev_mg->mg_class == NULL);

	return (top);
}

static uint64_t
ztest_random_dsl_prop(zfs_prop_t prop)
{
	uint64_t value;

	do {
		value = zfs_prop_random_value(prop, ztest_random(-1ULL));
	} while (prop == ZFS_PROP_CHECKSUM && value == ZIO_CHECKSUM_OFF);

	return (value);
}
static int
ztest_dsl_prop_set_uint64(char *osname, zfs_prop_t prop, uint64_t value,
    boolean_t inherit)
{
	const char *propname = zfs_prop_to_name(prop);
	const char *valname;
	char *setpoint;
	uint64_t curval;
	int error;

	error = dsl_prop_set_int(osname, propname,
	    (inherit ? ZPROP_SRC_NONE : ZPROP_SRC_LOCAL), value);

	if (error == ENOSPC) {
		ztest_record_enospc(FTAG);
		return (error);
	}
	ASSERT0(error);

	setpoint = umem_alloc(MAXPATHLEN, UMEM_NOFAIL);
	VERIFY0(dsl_prop_get_integer(osname, propname, &curval, setpoint));

	if (ztest_opts.zo_verbose >= 6) {
		int err;

		err = zfs_prop_index_to_string(prop, curval, &valname);
		if (err)
			(void) printf("%s %s = %llu at '%s'\n", osname,
			    propname, (unsigned long long)curval, setpoint);
		else
			(void) printf("%s %s = %s at '%s'\n",
			    osname, propname, valname, setpoint);
	}
	umem_free(setpoint, MAXPATHLEN);

	return (error);
}

static int
ztest_spa_prop_set_uint64(zpool_prop_t prop, uint64_t value)
{
	spa_t *spa = ztest_spa;
	nvlist_t *props = NULL;
	int error;

	props = fnvlist_alloc();
	fnvlist_add_uint64(props, zpool_prop_to_name(prop), value);

	error = spa_prop_set(spa, props);

	fnvlist_free(props);

	if (error == ENOSPC) {
		ztest_record_enospc(FTAG);
		return (error);
	}
	ASSERT0(error);

	return (error);
}

static int
ztest_dmu_objset_own(const char *name, dmu_objset_type_t type,
    boolean_t readonly, boolean_t decrypt, void *tag, objset_t **osp)
{
	int err;
	char *cp = NULL;
	char ddname[ZFS_MAX_DATASET_NAME_LEN];

	strcpy(ddname, name);
	cp = strchr(ddname, '@');
	if (cp != NULL)
		*cp = '\0';

	err = dmu_objset_own(name, type, readonly, decrypt, tag, osp);
	while (decrypt && err == EACCES) {
		dsl_crypto_params_t *dcp;
		nvlist_t *crypto_args = fnvlist_alloc();

		fnvlist_add_uint8_array(crypto_args, "wkeydata",
		    (uint8_t *)ztest_wkeydata, WRAPPING_KEY_LEN);
		VERIFY0(dsl_crypto_params_create_nvlist(DCP_CMD_NONE, NULL,
		    crypto_args, &dcp));
		err = spa_keystore_load_wkey(ddname, dcp, B_FALSE);
		/*
		 * Note: if there was an error loading, the wkey was not
		 * consumed, and needs to be freed.
		 */
		dsl_crypto_params_free(dcp, (err != 0));
		fnvlist_free(crypto_args);

		if (err == EINVAL) {
			/*
			 * We couldn't load a key for this dataset so try
			 * the parent.  This loop will eventually hit the
			 * encryption root since ztest only makes clones
			 * as children of their origin datasets.
			 */
			cp = strrchr(ddname, '/');
			if (cp == NULL)
				return (err);

			*cp = '\0';
			err = EACCES;
			continue;
		} else if (err != 0) {
			break;
		}

		err = dmu_objset_own(name, type, readonly, decrypt, tag, osp);
		break;
	}

	return (err);
}
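
/*
 * Typical caller pattern for the wrapper above (a hedged sketch; the
 * dataset name and error handling are illustrative):
 */
#if 0
	objset_t *os;

	if (ztest_dmu_objset_own("ztest/ds_0", DMU_OST_OTHER,
	    B_FALSE, B_TRUE, FTAG, &os) == 0) {
		/* ... use the objset ... */
		dmu_objset_disown(os, B_TRUE, FTAG);
	}
#endif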
static void
ztest_rll_init(rll_t *rll)
{
	rll->rll_writer = NULL;
	rll->rll_readers = 0;
	mutex_init(&rll->rll_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&rll->rll_cv, NULL, CV_DEFAULT, NULL);
}

static void
ztest_rll_destroy(rll_t *rll)
{
	ASSERT3P(rll->rll_writer, ==, NULL);
	ASSERT0(rll->rll_readers);
	mutex_destroy(&rll->rll_lock);
	cv_destroy(&rll->rll_cv);
}

static void
ztest_rll_lock(rll_t *rll, rl_type_t type)
{
	mutex_enter(&rll->rll_lock);

	if (type == RL_READER) {
		while (rll->rll_writer != NULL)
			(void) cv_wait(&rll->rll_cv, &rll->rll_lock);
		rll->rll_readers++;
	} else {
		while (rll->rll_writer != NULL || rll->rll_readers)
			(void) cv_wait(&rll->rll_cv, &rll->rll_lock);
		rll->rll_writer = curthread;
	}

	mutex_exit(&rll->rll_lock);
}

static void
ztest_rll_unlock(rll_t *rll)
{
	mutex_enter(&rll->rll_lock);

	if (rll->rll_writer) {
		ASSERT0(rll->rll_readers);
		rll->rll_writer = NULL;
	} else {
		ASSERT3S(rll->rll_readers, >, 0);
		ASSERT3P(rll->rll_writer, ==, NULL);
		rll->rll_readers--;
	}

	if (rll->rll_writer == NULL && rll->rll_readers == 0)
		cv_broadcast(&rll->rll_cv);

	mutex_exit(&rll->rll_lock);
}

static void
ztest_object_lock(ztest_ds_t *zd, uint64_t object, rl_type_t type)
{
	rll_t *rll = &zd->zd_object_lock[object & (ZTEST_OBJECT_LOCKS - 1)];

	ztest_rll_lock(rll, type);
}

static void
ztest_object_unlock(ztest_ds_t *zd, uint64_t object)
{
	rll_t *rll = &zd->zd_object_lock[object & (ZTEST_OBJECT_LOCKS - 1)];

	ztest_rll_unlock(rll);
}

static rl_t *
ztest_range_lock(ztest_ds_t *zd, uint64_t object, uint64_t offset,
    uint64_t size, rl_type_t type)
{
	uint64_t hash = object ^ (offset % (ZTEST_RANGE_LOCKS + 1));
	rll_t *rll = &zd->zd_range_lock[hash & (ZTEST_RANGE_LOCKS - 1)];
	rl_t *rl;

	rl = umem_alloc(sizeof (*rl), UMEM_NOFAIL);
	rl->rl_object = object;
	rl->rl_offset = offset;
	rl->rl_size = size;
	rl->rl_lock = rll;

	ztest_rll_lock(rll, type);

	return (rl);
}

static void
ztest_range_unlock(rl_t *rl)
{
	rll_t *rll = rl->rl_lock;

	ztest_rll_unlock(rll);

	umem_free(rl, sizeof (*rl));
}
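
/*
 * Hedged usage sketch for the object/range locks above: readers may
 * overlap, writers are exclusive, and every lock call must be paired
 * with an unlock (`object', `offset', and `size' are illustrative).
 */
#if 0
	ztest_object_lock(zd, object, RL_READER);
	rl_t *rl = ztest_range_lock(zd, object, offset, size, RL_WRITER);

	/* ... read the dnode, write the locked range ... */

	ztest_range_unlock(rl);
	ztest_object_unlock(zd, object);
#endif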
static void
ztest_zd_init(ztest_ds_t *zd, ztest_shared_ds_t *szd, objset_t *os)
{
	zd->zd_os = os;
	zd->zd_zilog = dmu_objset_zil(os);
	zd->zd_shared = szd;
	dmu_objset_name(os, zd->zd_name);
	int l;

	if (zd->zd_shared != NULL)
		zd->zd_shared->zd_seq = 0;

	VERIFY0(pthread_rwlock_init(&zd->zd_zilog_lock, NULL));
	mutex_init(&zd->zd_dirobj_lock, NULL, MUTEX_DEFAULT, NULL);

	for (l = 0; l < ZTEST_OBJECT_LOCKS; l++)
		ztest_rll_init(&zd->zd_object_lock[l]);

	for (l = 0; l < ZTEST_RANGE_LOCKS; l++)
		ztest_rll_init(&zd->zd_range_lock[l]);
}

static void
ztest_zd_fini(ztest_ds_t *zd)
{
	int l;

	mutex_destroy(&zd->zd_dirobj_lock);
	(void) pthread_rwlock_destroy(&zd->zd_zilog_lock);

	for (l = 0; l < ZTEST_OBJECT_LOCKS; l++)
		ztest_rll_destroy(&zd->zd_object_lock[l]);

	for (l = 0; l < ZTEST_RANGE_LOCKS; l++)
		ztest_rll_destroy(&zd->zd_range_lock[l]);
}

#define	TXG_MIGHTWAIT	(ztest_random(10) == 0 ? TXG_NOWAIT : TXG_WAIT)

static uint64_t
ztest_tx_assign(dmu_tx_t *tx, uint64_t txg_how, const char *tag)
{
	uint64_t txg;
	int error;

	/*
	 * Attempt to assign tx to some transaction group.
	 */
	error = dmu_tx_assign(tx, txg_how);
	if (error) {
		if (error == ERESTART) {
			ASSERT3U(txg_how, ==, TXG_NOWAIT);
			dmu_tx_wait(tx);
		} else {
			ASSERT3U(error, ==, ENOSPC);
			ztest_record_enospc(tag);
		}
		dmu_tx_abort(tx);
		return (0);
	}
	txg = dmu_tx_get_txg(tx);
	ASSERT3U(txg, !=, 0);
	return (txg);
}
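
/*
 * Typical transaction pattern built on ztest_tx_assign() (a sketch;
 * the hold and its arguments are illustrative).  A return of 0 means
 * the tx was already aborted -- either ENOSPC was recorded, or for
 * TXG_NOWAIT the txg was ERESTART'ed and the caller should back off.
 */
#if 0
	dmu_tx_t *tx = dmu_tx_create(os);

	dmu_tx_hold_write(tx, object, offset, size);
	txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG);
	if (txg == 0)
		return;			/* no txg; tx already aborted */
	/* ... do the writes bound to this txg ... */
	dmu_tx_commit(tx);
#endif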
static void
ztest_bt_generate(ztest_block_tag_t *bt, objset_t *os, uint64_t object,
    uint64_t dnodesize, uint64_t offset, uint64_t gen, uint64_t txg,
    uint64_t crtxg)
{
	bt->bt_magic = BT_MAGIC;
	bt->bt_objset = dmu_objset_id(os);
	bt->bt_object = object;
	bt->bt_dnodesize = dnodesize;
	bt->bt_offset = offset;
	bt->bt_gen = gen;
	bt->bt_txg = txg;
	bt->bt_crtxg = crtxg;
}

static void
ztest_bt_verify(ztest_block_tag_t *bt, objset_t *os, uint64_t object,
    uint64_t dnodesize, uint64_t offset, uint64_t gen, uint64_t txg,
    uint64_t crtxg)
{
	ASSERT3U(bt->bt_magic, ==, BT_MAGIC);
	ASSERT3U(bt->bt_objset, ==, dmu_objset_id(os));
	ASSERT3U(bt->bt_object, ==, object);
	ASSERT3U(bt->bt_dnodesize, ==, dnodesize);
	ASSERT3U(bt->bt_offset, ==, offset);
	ASSERT3U(bt->bt_gen, <=, gen);
	ASSERT3U(bt->bt_txg, <=, txg);
	ASSERT3U(bt->bt_crtxg, ==, crtxg);
}

static ztest_block_tag_t *
ztest_bt_bonus(dmu_buf_t *db)
{
	dmu_object_info_t doi;
	ztest_block_tag_t *bt;

	dmu_object_info_from_db(db, &doi);
	ASSERT3U(doi.doi_bonus_size, <=, db->db_size);
	ASSERT3U(doi.doi_bonus_size, >=, sizeof (*bt));
	bt = (void *)((char *)db->db_data + doi.doi_bonus_size - sizeof (*bt));

	return (bt);
}

/*
 * Generate a token to fill up unused bonus buffer space.  Try to make
 * it unique to the object, generation, and offset to verify that data
 * is not getting overwritten by data from other dnodes.
 */
#define	ZTEST_BONUS_FILL_TOKEN(obj, ds, gen, offset)	\
	(((ds) << 48) | ((gen) << 32) | ((obj) << 8) | (offset))

/*
 * Fill up the unused bonus buffer region before the block tag with a
 * verifiable pattern.  Filling the whole bonus area with non-zero data
 * helps ensure that all dnode traversal code properly skips the
 * interior regions of large dnodes.
 */
static void
ztest_fill_unused_bonus(dmu_buf_t *db, void *end, uint64_t obj,
    objset_t *os, uint64_t gen)
{
	uint64_t *bonusp;

	ASSERT(IS_P2ALIGNED((char *)end - (char *)db->db_data, 8));

	for (bonusp = db->db_data; bonusp < (uint64_t *)end; bonusp++) {
		uint64_t token = ZTEST_BONUS_FILL_TOKEN(obj, dmu_objset_id(os),
		    gen, bonusp - (uint64_t *)db->db_data);
		*bonusp = token;
	}
}

/*
 * Verify that the unused area of a bonus buffer is filled with the
 * expected tokens.
 */
static void
ztest_verify_unused_bonus(dmu_buf_t *db, void *end, uint64_t obj,
    objset_t *os, uint64_t gen)
{
	uint64_t *bonusp;

	for (bonusp = db->db_data; bonusp < (uint64_t *)end; bonusp++) {
		uint64_t token = ZTEST_BONUS_FILL_TOKEN(obj, dmu_objset_id(os),
		    gen, bonusp - (uint64_t *)db->db_data);
		VERIFY3U(*bonusp, ==, token);
	}
}
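
/*
 * Worked example of ZTEST_BONUS_FILL_TOKEN(): for obj 3 in objset 5
 * with gen 7 at word offset 2, the token is
 * (5 << 48) | (7 << 32) | (3 << 8) | 2 = 0x0005000700000302, so every
 * 64-bit word in the bonus buffer encodes where it was written.
 */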
/*
 * ZIL logging ops
 */

/*
 * lr_create_t has no spare fields for the object's blocksize, indirect
 * block shift, bonus type, or dnode size, so ztest overloads fields it
 * does not otherwise use (mode, uid, gid, rdev, crtime[1]) to carry
 * them through the log.
 */
#define	lrz_type	lr_mode
#define	lrz_blocksize	lr_uid
#define	lrz_ibshift	lr_gid
#define	lrz_bonustype	lr_rdev
#define	lrz_dnodesize	lr_crtime[1]

static void
ztest_log_create(ztest_ds_t *zd, dmu_tx_t *tx, lr_create_t *lr)
{
	char *name = (void *)(lr + 1);		/* name follows lr */
	size_t namesize = strlen(name) + 1;
	itx_t *itx;

	if (zil_replaying(zd->zd_zilog, tx))
		return;

	itx = zil_itx_create(TX_CREATE, sizeof (*lr) + namesize);
	memcpy(&itx->itx_lr + 1, &lr->lr_common + 1,
	    sizeof (*lr) + namesize - sizeof (lr_t));

	zil_itx_assign(zd->zd_zilog, itx, tx);
}

static void
ztest_log_remove(ztest_ds_t *zd, dmu_tx_t *tx, lr_remove_t *lr, uint64_t object)
{
	char *name = (void *)(lr + 1);		/* name follows lr */
	size_t namesize = strlen(name) + 1;
	itx_t *itx;

	if (zil_replaying(zd->zd_zilog, tx))
		return;

	itx = zil_itx_create(TX_REMOVE, sizeof (*lr) + namesize);
	memcpy(&itx->itx_lr + 1, &lr->lr_common + 1,
	    sizeof (*lr) + namesize - sizeof (lr_t));

	itx->itx_oid = object;
	zil_itx_assign(zd->zd_zilog, itx, tx);
}

static void
ztest_log_write(ztest_ds_t *zd, dmu_tx_t *tx, lr_write_t *lr)
{
	itx_t *itx;
	itx_wr_state_t write_state = ztest_random(WR_NUM_STATES);

	if (zil_replaying(zd->zd_zilog, tx))
		return;

	if (lr->lr_length > zil_max_log_data(zd->zd_zilog))
		write_state = WR_INDIRECT;

	itx = zil_itx_create(TX_WRITE,
	    sizeof (*lr) + (write_state == WR_COPIED ? lr->lr_length : 0));

	if (write_state == WR_COPIED &&
	    dmu_read(zd->zd_os, lr->lr_foid, lr->lr_offset, lr->lr_length,
	    ((lr_write_t *)&itx->itx_lr) + 1, DMU_READ_NO_PREFETCH) != 0) {
		zil_itx_destroy(itx);
		itx = zil_itx_create(TX_WRITE, sizeof (*lr));
		write_state = WR_NEED_COPY;
	}
	itx->itx_private = zd;
	itx->itx_wr_state = write_state;
	itx->itx_sync = (ztest_random(8) == 0);

	memcpy(&itx->itx_lr + 1, &lr->lr_common + 1,
	    sizeof (*lr) - sizeof (lr_t));

	zil_itx_assign(zd->zd_zilog, itx, tx);
}

static void
ztest_log_truncate(ztest_ds_t *zd, dmu_tx_t *tx, lr_truncate_t *lr)
{
	itx_t *itx;

	if (zil_replaying(zd->zd_zilog, tx))
		return;

	itx = zil_itx_create(TX_TRUNCATE, sizeof (*lr));
	memcpy(&itx->itx_lr + 1, &lr->lr_common + 1,
	    sizeof (*lr) - sizeof (lr_t));

	itx->itx_sync = B_FALSE;
	zil_itx_assign(zd->zd_zilog, itx, tx);
}

static void
ztest_log_setattr(ztest_ds_t *zd, dmu_tx_t *tx, lr_setattr_t *lr)
{
	itx_t *itx;

	if (zil_replaying(zd->zd_zilog, tx))
		return;

	itx = zil_itx_create(TX_SETATTR, sizeof (*lr));
	memcpy(&itx->itx_lr + 1, &lr->lr_common + 1,
	    sizeof (*lr) - sizeof (lr_t));

	itx->itx_sync = B_FALSE;
	zil_itx_assign(zd->zd_zilog, itx, tx);
}
static int
ztest_replay_create(void *arg1, void *arg2, boolean_t byteswap)
{
	ztest_ds_t *zd = arg1;
	lr_create_t *lr = arg2;
	char *name = (void *)(lr + 1);		/* name follows lr */
	objset_t *os = zd->zd_os;
	ztest_block_tag_t *bbt;
	dmu_buf_t *db;
	dmu_tx_t *tx;
	uint64_t txg;
	int error = 0;
	int bonuslen;

	if (byteswap)
		byteswap_uint64_array(lr, sizeof (*lr));

	ASSERT3U(lr->lr_doid, ==, ZTEST_DIROBJ);
	ASSERT3S(name[0], !=, '\0');

	tx = dmu_tx_create(os);

	dmu_tx_hold_zap(tx, lr->lr_doid, B_TRUE, name);

	if (lr->lrz_type == DMU_OT_ZAP_OTHER) {
		dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
	} else {
		dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
	}

	txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
	if (txg == 0)
		return (ENOSPC);

	ASSERT3U(dmu_objset_zil(os)->zl_replay, ==, !!lr->lr_foid);
	bonuslen = DN_BONUS_SIZE(lr->lrz_dnodesize);

	if (lr->lrz_type == DMU_OT_ZAP_OTHER) {
		if (lr->lr_foid == 0) {
			lr->lr_foid = zap_create_dnsize(os,
			    lr->lrz_type, lr->lrz_bonustype,
			    bonuslen, lr->lrz_dnodesize, tx);
		} else {
			error = zap_create_claim_dnsize(os, lr->lr_foid,
			    lr->lrz_type, lr->lrz_bonustype,
			    bonuslen, lr->lrz_dnodesize, tx);
		}
	} else {
		if (lr->lr_foid == 0) {
			lr->lr_foid = dmu_object_alloc_dnsize(os,
			    lr->lrz_type, 0, lr->lrz_bonustype,
			    bonuslen, lr->lrz_dnodesize, tx);
		} else {
			error = dmu_object_claim_dnsize(os, lr->lr_foid,
			    lr->lrz_type, 0, lr->lrz_bonustype,
			    bonuslen, lr->lrz_dnodesize, tx);
		}
	}

	if (error) {
		ASSERT3U(error, ==, EEXIST);
		ASSERT(zd->zd_zilog->zl_replay);
		dmu_tx_commit(tx);
		return (error);
	}

	ASSERT3U(lr->lr_foid, !=, 0);

	if (lr->lrz_type != DMU_OT_ZAP_OTHER)
		VERIFY0(dmu_object_set_blocksize(os, lr->lr_foid,
		    lr->lrz_blocksize, lr->lrz_ibshift, tx));

	VERIFY0(dmu_bonus_hold(os, lr->lr_foid, FTAG, &db));
	bbt = ztest_bt_bonus(db);
	dmu_buf_will_dirty(db, tx);
	ztest_bt_generate(bbt, os, lr->lr_foid, lr->lrz_dnodesize, -1ULL,
	    lr->lr_gen, txg, txg);
	ztest_fill_unused_bonus(db, bbt, lr->lr_foid, os, lr->lr_gen);
	dmu_buf_rele(db, FTAG);

	VERIFY0(zap_add(os, lr->lr_doid, name, sizeof (uint64_t), 1,
	    &lr->lr_foid, tx));

	(void) ztest_log_create(zd, tx, lr);

	dmu_tx_commit(tx);

	return (0);
}

static int
ztest_replay_remove(void *arg1, void *arg2, boolean_t byteswap)
{
	ztest_ds_t *zd = arg1;
	lr_remove_t *lr = arg2;
	char *name = (void *)(lr + 1);		/* name follows lr */
	objset_t *os = zd->zd_os;
	dmu_object_info_t doi;
	dmu_tx_t *tx;
	uint64_t object, txg;

	if (byteswap)
		byteswap_uint64_array(lr, sizeof (*lr));

	ASSERT3U(lr->lr_doid, ==, ZTEST_DIROBJ);
	ASSERT3S(name[0], !=, '\0');

	VERIFY0(
	    zap_lookup(os, lr->lr_doid, name, sizeof (object), 1, &object));
	ASSERT3U(object, !=, 0);

	ztest_object_lock(zd, object, RL_WRITER);

	VERIFY0(dmu_object_info(os, object, &doi));

	tx = dmu_tx_create(os);

	dmu_tx_hold_zap(tx, lr->lr_doid, B_FALSE, name);
	dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END);

	txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
	if (txg == 0) {
		ztest_object_unlock(zd, object);
		return (ENOSPC);
	}

	if (doi.doi_type == DMU_OT_ZAP_OTHER) {
		VERIFY0(zap_destroy(os, object, tx));
	} else {
		VERIFY0(dmu_object_free(os, object, tx));
	}

	VERIFY0(zap_remove(os, lr->lr_doid, name, tx));

	(void) ztest_log_remove(zd, tx, lr, object);

	dmu_tx_commit(tx);

	ztest_object_unlock(zd, object);

	return (0);
}
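
/*
 * The payload of a TX_WRITE record may begin with a ztest_block_tag_t
 * (written by ZTEST_IO_WRITE_TAG); when its magic number matches, the tag
 * is verified against the object's bonus-buffer tag and refreshed before
 * the data is written out.
 */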
static int
ztest_replay_write(void *arg1, void *arg2, boolean_t byteswap)
{
	ztest_ds_t *zd = arg1;
	lr_write_t *lr = arg2;
	objset_t *os = zd->zd_os;
	void *data = lr + 1;			/* data follows lr */
	uint64_t offset, length;
	ztest_block_tag_t *bt = data;
	ztest_block_tag_t *bbt;
	uint64_t gen, txg, lrtxg, crtxg;
	dmu_object_info_t doi;
	dmu_tx_t *tx;
	dmu_buf_t *db;
	arc_buf_t *abuf = NULL;
	rl_t *rl;

	if (byteswap)
		byteswap_uint64_array(lr, sizeof (*lr));

	offset = lr->lr_offset;
	length = lr->lr_length;

	/* If it's a dmu_sync() block, write the whole block */
	if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) {
		uint64_t blocksize = BP_GET_LSIZE(&lr->lr_blkptr);
		if (length < blocksize) {
			offset -= offset % blocksize;
			length = blocksize;
		}
	}

	if (bt->bt_magic == BSWAP_64(BT_MAGIC))
		byteswap_uint64_array(bt, sizeof (*bt));

	if (bt->bt_magic != BT_MAGIC)
		bt = NULL;

	ztest_object_lock(zd, lr->lr_foid, RL_READER);
	rl = ztest_range_lock(zd, lr->lr_foid, offset, length, RL_WRITER);

	VERIFY0(dmu_bonus_hold(os, lr->lr_foid, FTAG, &db));

	dmu_object_info_from_db(db, &doi);

	bbt = ztest_bt_bonus(db);
	ASSERT3U(bbt->bt_magic, ==, BT_MAGIC);
	gen = bbt->bt_gen;
	crtxg = bbt->bt_crtxg;
	lrtxg = lr->lr_common.lrc_txg;

	tx = dmu_tx_create(os);

	dmu_tx_hold_write(tx, lr->lr_foid, offset, length);

	if (ztest_random(8) == 0 && length == doi.doi_data_block_size &&
	    P2PHASE(offset, length) == 0)
		abuf = dmu_request_arcbuf(db, length);

	txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
	if (txg == 0) {
		if (abuf != NULL)
			dmu_return_arcbuf(abuf);
		dmu_buf_rele(db, FTAG);
		ztest_range_unlock(rl);
		ztest_object_unlock(zd, lr->lr_foid);
		return (ENOSPC);
	}

	if (bt != NULL) {
		/*
		 * Usually, verify the old data before writing new data --
		 * but not always, because we also want to verify correct
		 * behavior when the data was not recently read into cache.
		 */
		ASSERT0(offset % doi.doi_data_block_size);
		if (ztest_random(4) != 0) {
			int prefetch = ztest_random(2) ?
			    DMU_READ_PREFETCH : DMU_READ_NO_PREFETCH;
			ztest_block_tag_t rbt;

			VERIFY(dmu_read(os, lr->lr_foid, offset,
			    sizeof (rbt), &rbt, prefetch) == 0);
			if (rbt.bt_magic == BT_MAGIC) {
				ztest_bt_verify(&rbt, os, lr->lr_foid, 0,
				    offset, gen, txg, crtxg);
			}
		}

		/*
		 * Writes can appear to be newer than the bonus buffer because
		 * the ztest_get_data() callback does a dmu_read() of the
		 * open-context data, which may be different from the data
		 * as it was when the write was generated.
		 */
		if (zd->zd_zilog->zl_replay) {
			ztest_bt_verify(bt, os, lr->lr_foid, 0, offset,
			    MAX(gen, bt->bt_gen), MAX(txg, lrtxg),
			    bt->bt_crtxg);
		}

		/*
		 * Set the bt's gen/txg to the bonus buffer's gen/txg
		 * so that all of the usual ASSERTs will work.
		 */
		ztest_bt_generate(bt, os, lr->lr_foid, 0, offset, gen, txg,
		    crtxg);
	}

	if (abuf == NULL) {
		dmu_write(os, lr->lr_foid, offset, length, data, tx);
	} else {
		memcpy(abuf->b_data, data, length);
		dmu_assign_arcbuf_by_dbuf(db, offset, abuf, tx);
	}

	(void) ztest_log_write(zd, tx, lr);

	dmu_buf_rele(db, FTAG);

	dmu_tx_commit(tx);

	ztest_range_unlock(rl);
	ztest_object_unlock(zd, lr->lr_foid);

	return (0);
}

static int
ztest_replay_truncate(void *arg1, void *arg2, boolean_t byteswap)
{
	ztest_ds_t *zd = arg1;
	lr_truncate_t *lr = arg2;
	objset_t *os = zd->zd_os;
	dmu_tx_t *tx;
	uint64_t txg;
	rl_t *rl;

	if (byteswap)
		byteswap_uint64_array(lr, sizeof (*lr));

	ztest_object_lock(zd, lr->lr_foid, RL_READER);
	rl = ztest_range_lock(zd, lr->lr_foid, lr->lr_offset, lr->lr_length,
	    RL_WRITER);

	tx = dmu_tx_create(os);

	dmu_tx_hold_free(tx, lr->lr_foid, lr->lr_offset, lr->lr_length);

	txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
	if (txg == 0) {
		ztest_range_unlock(rl);
		ztest_object_unlock(zd, lr->lr_foid);
		return (ENOSPC);
	}

	VERIFY0(dmu_free_range(os, lr->lr_foid, lr->lr_offset,
	    lr->lr_length, tx));

	(void) ztest_log_truncate(zd, tx, lr);

	dmu_tx_commit(tx);

	ztest_range_unlock(rl);
	ztest_object_unlock(zd, lr->lr_foid);

	return (0);
}

static int
ztest_replay_setattr(void *arg1, void *arg2, boolean_t byteswap)
{
	ztest_ds_t *zd = arg1;
	lr_setattr_t *lr = arg2;
	objset_t *os = zd->zd_os;
	dmu_tx_t *tx;
	dmu_buf_t *db;
	ztest_block_tag_t *bbt;
	uint64_t txg, lrtxg, crtxg, dnodesize;

	if (byteswap)
		byteswap_uint64_array(lr, sizeof (*lr));

	ztest_object_lock(zd, lr->lr_foid, RL_WRITER);

	VERIFY0(dmu_bonus_hold(os, lr->lr_foid, FTAG, &db));

	tx = dmu_tx_create(os);
	dmu_tx_hold_bonus(tx, lr->lr_foid);

	txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
	if (txg == 0) {
		dmu_buf_rele(db, FTAG);
		ztest_object_unlock(zd, lr->lr_foid);
		return (ENOSPC);
	}

	bbt = ztest_bt_bonus(db);
	ASSERT3U(bbt->bt_magic, ==, BT_MAGIC);
	crtxg = bbt->bt_crtxg;
	lrtxg = lr->lr_common.lrc_txg;
	dnodesize = bbt->bt_dnodesize;

	if (zd->zd_zilog->zl_replay) {
		ASSERT3U(lr->lr_size, !=, 0);
		ASSERT3U(lr->lr_mode, !=, 0);
		ASSERT3U(lrtxg, !=, 0);
	} else {
		/*
		 * Randomly change the size and increment the generation.
		 */
		lr->lr_size = (ztest_random(db->db_size / sizeof (*bbt)) + 1) *
		    sizeof (*bbt);
		lr->lr_mode = bbt->bt_gen + 1;
		ASSERT0(lrtxg);
	}

	/*
	 * Verify that the current bonus buffer is not newer than our txg.
	 */
	ztest_bt_verify(bbt, os, lr->lr_foid, dnodesize, -1ULL, lr->lr_mode,
	    MAX(txg, lrtxg), crtxg);

	dmu_buf_will_dirty(db, tx);

	ASSERT3U(lr->lr_size, >=, sizeof (*bbt));
	ASSERT3U(lr->lr_size, <=, db->db_size);
	VERIFY0(dmu_set_bonus(db, lr->lr_size, tx));
	bbt = ztest_bt_bonus(db);

	ztest_bt_generate(bbt, os, lr->lr_foid, dnodesize, -1ULL, lr->lr_mode,
	    txg, crtxg);
	ztest_fill_unused_bonus(db, bbt, lr->lr_foid, os, bbt->bt_gen);
	dmu_buf_rele(db, FTAG);

	(void) ztest_log_setattr(zd, tx, lr);

	dmu_tx_commit(tx);

	ztest_object_unlock(zd, lr->lr_foid);

	return (0);
}
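
/*
 * The NULL entries below are ZPL-specific record types (mkdir, link,
 * rename, ACLs, ...) that ztest never generates, so no replay handler
 * is needed for them.
 */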
zil_replay_func_t *ztest_replay_vector[TX_MAX_TYPE] = {
	NULL,			/* 0 no such transaction type */
	ztest_replay_create,	/* TX_CREATE */
	NULL,			/* TX_MKDIR */
	NULL,			/* TX_MKXATTR */
	NULL,			/* TX_SYMLINK */
	ztest_replay_remove,	/* TX_REMOVE */
	NULL,			/* TX_RMDIR */
	NULL,			/* TX_LINK */
	NULL,			/* TX_RENAME */
	ztest_replay_write,	/* TX_WRITE */
	ztest_replay_truncate,	/* TX_TRUNCATE */
	ztest_replay_setattr,	/* TX_SETATTR */
	NULL,			/* TX_ACL */
	NULL,			/* TX_CREATE_ACL */
	NULL,			/* TX_CREATE_ATTR */
	NULL,			/* TX_CREATE_ACL_ATTR */
	NULL,			/* TX_MKDIR_ACL */
	NULL,			/* TX_MKDIR_ATTR */
	NULL,			/* TX_MKDIR_ACL_ATTR */
	NULL,			/* TX_WRITE2 */
	NULL,			/* TX_SETSAXATTR */
};

/*
 * ZIL get_data callbacks
 */

static void
ztest_get_done(zgd_t *zgd, int error)
{
	(void) error;
	ztest_ds_t *zd = zgd->zgd_private;
	uint64_t object = ((rl_t *)zgd->zgd_lr)->rl_object;

	if (zgd->zgd_db)
		dmu_buf_rele(zgd->zgd_db, zgd);

	ztest_range_unlock((rl_t *)zgd->zgd_lr);
	ztest_object_unlock(zd, object);

	umem_free(zgd, sizeof (*zgd));
}
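
/*
 * Analog of the ZPL's zfs_get_data(): called when the ZIL needs to flush
 * a WR_NEED_COPY or WR_INDIRECT write record.  For an immediate write the
 * data is copied into 'buf'; otherwise dmu_sync() is used to write the
 * block and fill in the record's block pointer.  All locks and holds are
 * released through ztest_get_done().
 */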
static int
ztest_get_data(void *arg, uint64_t arg2, lr_write_t *lr, char *buf,
    struct lwb *lwb, zio_t *zio)
{
	(void) arg2;
	ztest_ds_t *zd = arg;
	objset_t *os = zd->zd_os;
	uint64_t object = lr->lr_foid;
	uint64_t offset = lr->lr_offset;
	uint64_t size = lr->lr_length;
	uint64_t txg = lr->lr_common.lrc_txg;
	uint64_t crtxg;
	dmu_object_info_t doi;
	dmu_buf_t *db;
	zgd_t *zgd;
	int error;

	ASSERT3P(lwb, !=, NULL);
	ASSERT3P(zio, !=, NULL);
	ASSERT3U(size, !=, 0);

	ztest_object_lock(zd, object, RL_READER);
	error = dmu_bonus_hold(os, object, FTAG, &db);
	if (error) {
		ztest_object_unlock(zd, object);
		return (error);
	}

	crtxg = ztest_bt_bonus(db)->bt_crtxg;

	if (crtxg == 0 || crtxg > txg) {
		dmu_buf_rele(db, FTAG);
		ztest_object_unlock(zd, object);
		return (ENOENT);
	}

	dmu_object_info_from_db(db, &doi);
	dmu_buf_rele(db, FTAG);
	db = NULL;

	zgd = umem_zalloc(sizeof (*zgd), UMEM_NOFAIL);
	zgd->zgd_lwb = lwb;
	zgd->zgd_private = zd;

	if (buf != NULL) {	/* immediate write */
		zgd->zgd_lr = (struct zfs_locked_range *)ztest_range_lock(zd,
		    object, offset, size, RL_READER);

		error = dmu_read(os, object, offset, size, buf,
		    DMU_READ_NO_PREFETCH);
		ASSERT0(error);
	} else {
		size = doi.doi_data_block_size;
		if (ISP2(size)) {
			offset = P2ALIGN(offset, size);
		} else {
			ASSERT3U(offset, <, size);
			offset = 0;
		}

		zgd->zgd_lr = (struct zfs_locked_range *)ztest_range_lock(zd,
		    object, offset, size, RL_READER);

		error = dmu_buf_hold(os, object, offset, zgd, &db,
		    DMU_READ_NO_PREFETCH);

		if (error == 0) {
			blkptr_t *bp = &lr->lr_blkptr;

			zgd->zgd_db = db;
			zgd->zgd_bp = bp;

			ASSERT3U(db->db_offset, ==, offset);
			ASSERT3U(db->db_size, ==, size);

			error = dmu_sync(zio, lr->lr_common.lrc_txg,
			    ztest_get_done, zgd);

			if (error == 0)
				return (0);
		}
	}

	ztest_get_done(zgd, error);

	return (error);
}

static void *
ztest_lr_alloc(size_t lrsize, char *name)
{
	char *lr;
	size_t namesize = name ? strlen(name) + 1 : 0;

	lr = umem_zalloc(lrsize + namesize, UMEM_NOFAIL);

	if (name)
		memcpy(lr + lrsize, name, namesize);

	return (lr);
}

static void
ztest_lr_free(void *lr, size_t lrsize, char *name)
{
	size_t namesize = name ? strlen(name) + 1 : 0;

	umem_free(lr, lrsize + namesize);
}

/*
 * Lookup a bunch of objects.  Returns the number of objects not found.
 */
static int
ztest_lookup(ztest_ds_t *zd, ztest_od_t *od, int count)
{
	int missing = 0;
	int error;
	int i;

	ASSERT(MUTEX_HELD(&zd->zd_dirobj_lock));

	for (i = 0; i < count; i++, od++) {
		od->od_object = 0;
		error = zap_lookup(zd->zd_os, od->od_dir, od->od_name,
		    sizeof (uint64_t), 1, &od->od_object);
		if (error) {
			ASSERT3S(error, ==, ENOENT);
			ASSERT0(od->od_object);
			missing++;
		} else {
			dmu_buf_t *db;
			ztest_block_tag_t *bbt;
			dmu_object_info_t doi;

			ASSERT3U(od->od_object, !=, 0);
			ASSERT0(missing);	/* there should be no gaps */

			ztest_object_lock(zd, od->od_object, RL_READER);
			VERIFY0(dmu_bonus_hold(zd->zd_os, od->od_object,
			    FTAG, &db));
			dmu_object_info_from_db(db, &doi);
			bbt = ztest_bt_bonus(db);
			ASSERT3U(bbt->bt_magic, ==, BT_MAGIC);
			od->od_type = doi.doi_type;
			od->od_blocksize = doi.doi_data_block_size;
			od->od_gen = bbt->bt_gen;
			dmu_buf_rele(db, FTAG);
			ztest_object_unlock(zd, od->od_object);
		}
	}

	return (missing);
}

static int
ztest_create(ztest_ds_t *zd, ztest_od_t *od, int count)
{
	int missing = 0;
	int i;

	ASSERT(MUTEX_HELD(&zd->zd_dirobj_lock));

	for (i = 0; i < count; i++, od++) {
		if (missing) {
			od->od_object = 0;
			missing++;
			continue;
		}

		lr_create_t *lr = ztest_lr_alloc(sizeof (*lr), od->od_name);

		lr->lr_doid = od->od_dir;
		lr->lr_foid = 0;	/* 0 to allocate, > 0 to claim */
		lr->lrz_type = od->od_crtype;
		lr->lrz_blocksize = od->od_crblocksize;
		lr->lrz_ibshift = ztest_random_ibshift();
		lr->lrz_bonustype = DMU_OT_UINT64_OTHER;
		lr->lrz_dnodesize = od->od_crdnodesize;
		lr->lr_gen = od->od_crgen;
		lr->lr_crtime[0] = time(NULL);

		if (ztest_replay_create(zd, lr, B_FALSE) != 0) {
			ASSERT0(missing);
			od->od_object = 0;
			missing++;
		} else {
			od->od_object = lr->lr_foid;
			od->od_type = od->od_crtype;
			od->od_blocksize = od->od_crblocksize;
			od->od_gen = od->od_crgen;
			ASSERT3U(od->od_object, !=, 0);
		}

		ztest_lr_free(lr, sizeof (*lr), od->od_name);
	}

	return (missing);
}

static int
ztest_remove(ztest_ds_t *zd, ztest_od_t *od, int count)
{
	int missing = 0;
	int error;
	int i;

	ASSERT(MUTEX_HELD(&zd->zd_dirobj_lock));

	od += count - 1;

	for (i = count - 1; i >= 0; i--, od--) {
		if (missing) {
			missing++;
			continue;
		}

		/*
		 * No object was found.
		 */
		if (od->od_object == 0)
			continue;

		lr_remove_t *lr = ztest_lr_alloc(sizeof (*lr), od->od_name);

		lr->lr_doid = od->od_dir;

		if ((error = ztest_replay_remove(zd, lr, B_FALSE)) != 0) {
			ASSERT3U(error, ==, ENOSPC);
			missing++;
		} else {
			od->od_object = 0;
		}
		ztest_lr_free(lr, sizeof (*lr), od->od_name);
	}

	return (missing);
}
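
/*
 * Open-context wrappers that build a log record on the heap, apply it via
 * the corresponding ztest_replay_*() function, and free it again.
 */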
static int
ztest_write(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size,
    void *data)
{
	lr_write_t *lr;
	int error;

	lr = ztest_lr_alloc(sizeof (*lr) + size, NULL);

	lr->lr_foid = object;
	lr->lr_offset = offset;
	lr->lr_length = size;
	lr->lr_blkoff = 0;
	BP_ZERO(&lr->lr_blkptr);

	memcpy(lr + 1, data, size);

	error = ztest_replay_write(zd, lr, B_FALSE);

	ztest_lr_free(lr, sizeof (*lr) + size, NULL);

	return (error);
}

static int
ztest_truncate(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size)
{
	lr_truncate_t *lr;
	int error;

	lr = ztest_lr_alloc(sizeof (*lr), NULL);

	lr->lr_foid = object;
	lr->lr_offset = offset;
	lr->lr_length = size;

	error = ztest_replay_truncate(zd, lr, B_FALSE);

	ztest_lr_free(lr, sizeof (*lr), NULL);

	return (error);
}

static int
ztest_setattr(ztest_ds_t *zd, uint64_t object)
{
	lr_setattr_t *lr;
	int error;

	lr = ztest_lr_alloc(sizeof (*lr), NULL);

	lr->lr_foid = object;
	lr->lr_size = 0;
	lr->lr_mode = 0;

	error = ztest_replay_setattr(zd, lr, B_FALSE);

	ztest_lr_free(lr, sizeof (*lr), NULL);

	return (error);
}

static void
ztest_prealloc(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size)
{
	objset_t *os = zd->zd_os;
	dmu_tx_t *tx;
	uint64_t txg;
	rl_t *rl;

	txg_wait_synced(dmu_objset_pool(os), 0);

	ztest_object_lock(zd, object, RL_READER);
	rl = ztest_range_lock(zd, object, offset, size, RL_WRITER);

	tx = dmu_tx_create(os);

	dmu_tx_hold_write(tx, object, offset, size);

	txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);

	if (txg != 0) {
		dmu_prealloc(os, object, offset, size, tx);
		dmu_tx_commit(tx);
		txg_wait_synced(dmu_objset_pool(os), txg);
	} else {
		(void) dmu_free_long_range(os, object, offset, size);
	}

	ztest_range_unlock(rl);
	ztest_object_unlock(zd, object);
}
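
/*
 * Perform a single random i/o against the given object, holding
 * zd_zilog_lock as reader so a concurrent remount can't close the
 * ZIL underneath us.
 */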
static void
ztest_io(ztest_ds_t *zd, uint64_t object, uint64_t offset)
{
	int err;
	ztest_block_tag_t wbt;
	dmu_object_info_t doi;
	enum ztest_io_type io_type;
	uint64_t blocksize;
	void *data;

	VERIFY0(dmu_object_info(zd->zd_os, object, &doi));
	blocksize = doi.doi_data_block_size;
	data = umem_alloc(blocksize, UMEM_NOFAIL);

	/*
	 * Pick an i/o type at random, biased toward writing block tags.
	 */
	io_type = ztest_random(ZTEST_IO_TYPES);
	if (ztest_random(2) == 0)
		io_type = ZTEST_IO_WRITE_TAG;

	(void) pthread_rwlock_rdlock(&zd->zd_zilog_lock);

	switch (io_type) {

	case ZTEST_IO_WRITE_TAG:
		ztest_bt_generate(&wbt, zd->zd_os, object, doi.doi_dnodesize,
		    offset, 0, 0, 0);
		(void) ztest_write(zd, object, offset, sizeof (wbt), &wbt);
		break;

	case ZTEST_IO_WRITE_PATTERN:
		(void) memset(data, 'a' + (object + offset) % 5, blocksize);
		if (ztest_random(2) == 0) {
			/*
			 * Induce fletcher2 collisions to ensure that
			 * zio_ddt_collision() detects and resolves them
			 * when using fletcher2-verify for deduplication.
			 */
			((uint64_t *)data)[0] ^= 1ULL << 63;
			((uint64_t *)data)[4] ^= 1ULL << 63;
		}
		(void) ztest_write(zd, object, offset, blocksize, data);
		break;

	case ZTEST_IO_WRITE_ZEROES:
		memset(data, 0, blocksize);
		(void) ztest_write(zd, object, offset, blocksize, data);
		break;

	case ZTEST_IO_TRUNCATE:
		(void) ztest_truncate(zd, object, offset, blocksize);
		break;

	case ZTEST_IO_SETATTR:
		(void) ztest_setattr(zd, object);
		break;

	case ZTEST_IO_REWRITE:
		(void) pthread_rwlock_rdlock(&ztest_name_lock);
		err = ztest_dsl_prop_set_uint64(zd->zd_name,
		    ZFS_PROP_CHECKSUM, spa_dedup_checksum(ztest_spa),
		    B_FALSE);
		VERIFY(err == 0 || err == ENOSPC);
		err = ztest_dsl_prop_set_uint64(zd->zd_name,
		    ZFS_PROP_COMPRESSION,
		    ztest_random_dsl_prop(ZFS_PROP_COMPRESSION),
		    B_FALSE);
		VERIFY(err == 0 || err == ENOSPC);
		(void) pthread_rwlock_unlock(&ztest_name_lock);

		VERIFY0(dmu_read(zd->zd_os, object, offset, blocksize, data,
		    DMU_READ_NO_PREFETCH));

		(void) ztest_write(zd, object, offset, blocksize, data);
		break;

	default:
		break;
	}

	(void) pthread_rwlock_unlock(&zd->zd_zilog_lock);

	umem_free(data, blocksize);
}
/*
 * Initialize an object description template.
 */
static void
ztest_od_init(ztest_od_t *od, uint64_t id, char *tag, uint64_t index,
    dmu_object_type_t type, uint64_t blocksize, uint64_t dnodesize,
    uint64_t gen)
{
	od->od_dir = ZTEST_DIROBJ;
	od->od_object = 0;

	od->od_crtype = type;
	od->od_crblocksize = blocksize ? blocksize : ztest_random_blocksize();
	od->od_crdnodesize = dnodesize ? dnodesize : ztest_random_dnodesize();
	od->od_crgen = gen;

	od->od_type = DMU_OT_NONE;
	od->od_blocksize = 0;
	od->od_gen = 0;

	(void) snprintf(od->od_name, sizeof (od->od_name),
	    "%s(%"PRId64")[%"PRIu64"]",
	    tag, id, index);
}

/*
 * Lookup or create the objects for a test using the od template.
 * If the objects do not all exist, or if 'remove' is specified,
 * remove any existing objects and create new ones.  Otherwise,
 * use the existing objects.
 */
static int
ztest_object_init(ztest_ds_t *zd, ztest_od_t *od, size_t size, boolean_t remove)
{
	int count = size / sizeof (*od);
	int rv = 0;

	mutex_enter(&zd->zd_dirobj_lock);
	if ((ztest_lookup(zd, od, count) != 0 || remove) &&
	    (ztest_remove(zd, od, count) != 0 ||
	    ztest_create(zd, od, count) != 0))
		rv = -1;
	zd->zd_od = od;
	mutex_exit(&zd->zd_dirobj_lock);

	return (rv);
}

void
ztest_zil_commit(ztest_ds_t *zd, uint64_t id)
{
	(void) id;
	zilog_t *zilog = zd->zd_zilog;

	(void) pthread_rwlock_rdlock(&zd->zd_zilog_lock);

	zil_commit(zilog, ztest_random(ZTEST_OBJECTS));

	/*
	 * Remember the committed values in zd, which is in parent/child
	 * shared memory.  If we die, the next iteration of ztest_run()
	 * will verify that the log really does contain this record.
	 */
	mutex_enter(&zilog->zl_lock);
	ASSERT3P(zd->zd_shared, !=, NULL);
	ASSERT3U(zd->zd_shared->zd_seq, <=, zilog->zl_commit_lr_seq);
	zd->zd_shared->zd_seq = zilog->zl_commit_lr_seq;
	mutex_exit(&zilog->zl_lock);

	(void) pthread_rwlock_unlock(&zd->zd_zilog_lock);
}

/*
 * This function is designed to simulate the operations that occur during a
 * mount/unmount operation.  We hold the dataset across these operations in an
 * attempt to expose any implicit assumptions about ZIL management.
 */
void
ztest_zil_remount(ztest_ds_t *zd, uint64_t id)
{
	(void) id;
	objset_t *os = zd->zd_os;

	/*
	 * We hold the ztest_vdev_lock so we don't cause problems with
	 * other threads that wish to remove a log device, such as
	 * ztest_device_removal().
	 */
	mutex_enter(&ztest_vdev_lock);

	/*
	 * We grab the zd_dirobj_lock to ensure that no other thread is
	 * updating the zil (i.e. adding in-memory log records) and the
	 * zd_zilog_lock to block any I/O.
	 */
	mutex_enter(&zd->zd_dirobj_lock);
	(void) pthread_rwlock_wrlock(&zd->zd_zilog_lock);

	/* zfsvfs_teardown() */
	zil_close(zd->zd_zilog);

	/* zfsvfs_setup() */
	VERIFY3P(zil_open(os, ztest_get_data), ==, zd->zd_zilog);
	zil_replay(os, zd, ztest_replay_vector);

	(void) pthread_rwlock_unlock(&zd->zd_zilog_lock);
	mutex_exit(&zd->zd_dirobj_lock);
	mutex_exit(&ztest_vdev_lock);
}

/*
 * Verify that we can't destroy an active pool, create an existing pool,
 * or create a pool with a bad vdev spec.
 */
void
ztest_spa_create_destroy(ztest_ds_t *zd, uint64_t id)
{
	(void) zd, (void) id;
	ztest_shared_opts_t *zo = &ztest_opts;
	spa_t *spa;
	nvlist_t *nvroot;

	if (zo->zo_mmp_test)
		return;

	/*
	 * Attempt to create using a bad file.
	 */
	nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, NULL, 0, 0, 1);
	VERIFY3U(ENOENT, ==,
	    spa_create("ztest_bad_file", nvroot, NULL, NULL, NULL));
	fnvlist_free(nvroot);

	/*
	 * Attempt to create using a bad mirror.
	 */
	nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, NULL, 0, 2, 1);
	VERIFY3U(ENOENT, ==,
	    spa_create("ztest_bad_mirror", nvroot, NULL, NULL, NULL));
	fnvlist_free(nvroot);

	/*
	 * Attempt to create an existing pool.  It shouldn't matter
	 * what's in the nvroot; we should fail with EEXIST.
	 */
	(void) pthread_rwlock_rdlock(&ztest_name_lock);
	nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, NULL, 0, 0, 1);
	VERIFY3U(EEXIST, ==,
	    spa_create(zo->zo_pool, nvroot, NULL, NULL, NULL));
	fnvlist_free(nvroot);

	/*
	 * We open a reference to the spa and then we try to destroy it
	 * (spa_destroy() exports the pool internally), expecting one of
	 * the following errors:
	 *
	 * EBUSY
	 *	Because of the reference we just opened.
	 *
	 * ZFS_ERR_EXPORT_IN_PROGRESS
	 *	For the case that there is another ztest thread doing
	 *	an export concurrently.
	 */
	VERIFY0(spa_open(zo->zo_pool, &spa, FTAG));
	int error = spa_destroy(zo->zo_pool);
	if (error != EBUSY && error != ZFS_ERR_EXPORT_IN_PROGRESS) {
		fatal(B_FALSE, "spa_destroy(%s) returned unexpected value %d",
		    spa->spa_name, error);
	}
	spa_close(spa, FTAG);

	(void) pthread_rwlock_unlock(&ztest_name_lock);
}

/*
 * Start and then stop the MMP threads to ensure the startup and shutdown code
 * works properly.  Actual protection and property-related code is tested via
 * ZTS.
 */
void
ztest_mmp_enable_disable(ztest_ds_t *zd, uint64_t id)
{
	(void) zd, (void) id;
	ztest_shared_opts_t *zo = &ztest_opts;
	spa_t *spa = ztest_spa;

	if (zo->zo_mmp_test)
		return;

	/*
	 * Since enabling MMP involves setting a property, it cannot be done
	 * while the pool is suspended.
	 */
	if (spa_suspended(spa))
		return;

	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
	mutex_enter(&spa->spa_props_lock);
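
	/*
	 * Setting zfs_multihost_fail_intervals to 0 means MMP write failures
	 * will not suspend the pool, so a slow or failed mmp write can't
	 * wedge the test.
	 */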
	zfs_multihost_fail_intervals = 0;

	if (!spa_multihost(spa)) {
		spa->spa_multihost = B_TRUE;
		mmp_thread_start(spa);
	}

	mutex_exit(&spa->spa_props_lock);
	spa_config_exit(spa, SCL_CONFIG, FTAG);

	txg_wait_synced(spa_get_dsl(spa), 0);
	mmp_signal_all_threads();
	txg_wait_synced(spa_get_dsl(spa), 0);

	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
	mutex_enter(&spa->spa_props_lock);

	if (spa_multihost(spa)) {
		mmp_thread_stop(spa);
		spa->spa_multihost = B_FALSE;
	}

	mutex_exit(&spa->spa_props_lock);
	spa_config_exit(spa, SCL_CONFIG, FTAG);
}

void
ztest_spa_upgrade(ztest_ds_t *zd, uint64_t id)
{
	(void) zd, (void) id;
	spa_t *spa;
	uint64_t initial_version = SPA_VERSION_INITIAL;
	uint64_t version, newversion;
	nvlist_t *nvroot, *props;
	char *name;

	if (ztest_opts.zo_mmp_test)
		return;

	/* dRAID added after feature flags, skip upgrade test. */
	if (strcmp(ztest_opts.zo_raid_type, VDEV_TYPE_DRAID) == 0)
		return;

	mutex_enter(&ztest_vdev_lock);
	name = kmem_asprintf("%s_upgrade", ztest_opts.zo_pool);

	/*
	 * Clean up from previous runs.
	 */
	(void) spa_destroy(name);

	nvroot = make_vdev_root(NULL, NULL, name, ztest_opts.zo_vdev_size, 0,
	    NULL, ztest_opts.zo_raid_children, ztest_opts.zo_mirrors, 1);

	/*
	 * If we're configuring a RAIDZ device then make sure that the
	 * initial version is capable of supporting that feature.
	 */
	switch (ztest_opts.zo_raid_parity) {
	case 0:
	case 1:
		initial_version = SPA_VERSION_INITIAL;
		break;
	case 2:
		initial_version = SPA_VERSION_RAIDZ2;
		break;
	case 3:
		initial_version = SPA_VERSION_RAIDZ3;
		break;
	}

	/*
	 * Create a pool with a spa version that can be upgraded.  Pick
	 * a value between initial_version and SPA_VERSION_BEFORE_FEATURES.
	 */
	do {
		version = ztest_random_spa_version(initial_version);
	} while (version > SPA_VERSION_BEFORE_FEATURES);

	props = fnvlist_alloc();
	fnvlist_add_uint64(props,
	    zpool_prop_to_name(ZPOOL_PROP_VERSION), version);
	VERIFY0(spa_create(name, nvroot, props, NULL, NULL));
	fnvlist_free(nvroot);
	fnvlist_free(props);

	VERIFY0(spa_open(name, &spa, FTAG));
	VERIFY3U(spa_version(spa), ==, version);
	newversion = ztest_random_spa_version(version + 1);

	if (ztest_opts.zo_verbose >= 4) {
		(void) printf("upgrading spa version from "
		    "%"PRIu64" to %"PRIu64"\n",
		    version, newversion);
	}

	spa_upgrade(spa, newversion);
	VERIFY3U(spa_version(spa), >, version);
	VERIFY3U(spa_version(spa), ==, fnvlist_lookup_uint64(spa->spa_config,
	    zpool_prop_to_name(ZPOOL_PROP_VERSION)));
	spa_close(spa, FTAG);

	kmem_strfree(name);
	mutex_exit(&ztest_vdev_lock);
}

static void
ztest_spa_checkpoint(spa_t *spa)
{
	ASSERT(MUTEX_HELD(&ztest_checkpoint_lock));

	int error = spa_checkpoint(spa->spa_name);

	switch (error) {
	case 0:
	case ZFS_ERR_DEVRM_IN_PROGRESS:
	case ZFS_ERR_DISCARDING_CHECKPOINT:
	case ZFS_ERR_CHECKPOINT_EXISTS:
		break;
	case ENOSPC:
		ztest_record_enospc(FTAG);
		break;
	default:
		fatal(B_FALSE, "spa_checkpoint(%s) = %d", spa->spa_name, error);
	}
}

static void
ztest_spa_discard_checkpoint(spa_t *spa)
{
	ASSERT(MUTEX_HELD(&ztest_checkpoint_lock));

	int error = spa_checkpoint_discard(spa->spa_name);

	switch (error) {
	case 0:
	case ZFS_ERR_DISCARDING_CHECKPOINT:
	case ZFS_ERR_NO_CHECKPOINT:
		break;
	default:
		fatal(B_FALSE, "spa_discard_checkpoint(%s) = %d",
		    spa->spa_name, error);
	}
}

void
ztest_spa_checkpoint_create_discard(ztest_ds_t *zd, uint64_t id)
{
	(void) zd, (void) id;
	spa_t *spa = ztest_spa;

	mutex_enter(&ztest_checkpoint_lock);
	if (ztest_random(2) == 0) {
		ztest_spa_checkpoint(spa);
	} else {
		ztest_spa_discard_checkpoint(spa);
	}
	mutex_exit(&ztest_checkpoint_lock);
}

static vdev_t *
vdev_lookup_by_path(vdev_t *vd, const char *path)
{
	vdev_t *mvd;
	int c;

	if (vd->vdev_path != NULL && strcmp(path, vd->vdev_path) == 0)
		return (vd);

	for (c = 0; c < vd->vdev_children; c++)
		if ((mvd = vdev_lookup_by_path(vd->vdev_child[c], path)) !=
		    NULL)
			return (mvd);

	return (NULL);
}

static int
spa_num_top_vdevs(spa_t *spa)
{
	vdev_t *rvd = spa->spa_root_vdev;
	ASSERT3U(spa_config_held(spa, SCL_VDEV, RW_READER), ==, SCL_VDEV);
	return (rvd->vdev_children);
}
/*
 * Verify that vdev_add() works as expected.
 */
void
ztest_vdev_add_remove(ztest_ds_t *zd, uint64_t id)
{
	(void) zd, (void) id;
	ztest_shared_t *zs = ztest_shared;
	spa_t *spa = ztest_spa;
	uint64_t leaves;
	uint64_t guid;
	nvlist_t *nvroot;
	int error;

	if (ztest_opts.zo_mmp_test)
		return;

	mutex_enter(&ztest_vdev_lock);
	leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) *
	    ztest_opts.zo_raid_children;

	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);

	ztest_shared->zs_vdev_next_leaf = spa_num_top_vdevs(spa) * leaves;

	/*
	 * If we have slogs then remove them 1/4 of the time.
	 */
	if (spa_has_slogs(spa) && ztest_random(4) == 0) {
		metaslab_group_t *mg;

		/*
		 * find the first real slog in log allocation class
		 */
		mg = spa_log_class(spa)->mc_allocator[0].mca_rotor;
		while (!mg->mg_vd->vdev_islog)
			mg = mg->mg_next;

		guid = mg->mg_vd->vdev_guid;

		spa_config_exit(spa, SCL_VDEV, FTAG);

		/*
		 * We have to grab the zs_name_lock as writer to
		 * prevent a race between removing a slog (dmu_objset_find)
		 * and destroying a dataset.  Removing the slog will
		 * grab a reference on the dataset which may cause
		 * dsl_destroy_head() to fail with EBUSY thus
		 * leaving the dataset in an inconsistent state.
		 */
		pthread_rwlock_wrlock(&ztest_name_lock);
		error = spa_vdev_remove(spa, guid, B_FALSE);
		pthread_rwlock_unlock(&ztest_name_lock);

		switch (error) {
		case 0:
		case EEXIST:	/* Generic zil_reset() error */
		case EBUSY:	/* Replay required */
		case EACCES:	/* Crypto key not loaded */
		case ZFS_ERR_CHECKPOINT_EXISTS:
		case ZFS_ERR_DISCARDING_CHECKPOINT:
			break;
		default:
			fatal(B_FALSE, "spa_vdev_remove() = %d", error);
		}
	} else {
		spa_config_exit(spa, SCL_VDEV, FTAG);

		/*
		 * Make 1/4 of the devices be log devices
		 */
		nvroot = make_vdev_root(NULL, NULL, NULL,
		    ztest_opts.zo_vdev_size, 0, (ztest_random(4) == 0) ?
		    "log" : NULL, ztest_opts.zo_raid_children, zs->zs_mirrors,
		    1);

		error = spa_vdev_add(spa, nvroot);
		fnvlist_free(nvroot);

		switch (error) {
		case 0:
			break;
		case ENOSPC:
			ztest_record_enospc("spa_vdev_add");
			break;
		default:
			fatal(B_FALSE, "spa_vdev_add() = %d", error);
		}
	}

	mutex_exit(&ztest_vdev_lock);
}

void
ztest_vdev_class_add(ztest_ds_t *zd, uint64_t id)
{
	(void) zd, (void) id;
	ztest_shared_t *zs = ztest_shared;
	spa_t *spa = ztest_spa;
	uint64_t leaves;
	nvlist_t *nvroot;
	const char *class = (ztest_random(2) == 0) ?
	    VDEV_ALLOC_BIAS_SPECIAL : VDEV_ALLOC_BIAS_DEDUP;
	int error;

	/*
	 * By default add a special vdev 50% of the time
	 */
	if ((ztest_opts.zo_special_vdevs == ZTEST_VDEV_CLASS_OFF) ||
	    (ztest_opts.zo_special_vdevs == ZTEST_VDEV_CLASS_RND &&
	    ztest_random(2) == 0)) {
		return;
	}

	mutex_enter(&ztest_vdev_lock);

	/* Only test with mirrors */
	if (zs->zs_mirrors < 2) {
		mutex_exit(&ztest_vdev_lock);
		return;
	}

	/* requires feature@allocation_classes */
	if (!spa_feature_is_enabled(spa, SPA_FEATURE_ALLOCATION_CLASSES)) {
		mutex_exit(&ztest_vdev_lock);
		return;
	}

	leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) *
	    ztest_opts.zo_raid_children;

	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
	ztest_shared->zs_vdev_next_leaf = spa_num_top_vdevs(spa) * leaves;
	spa_config_exit(spa, SCL_VDEV, FTAG);

	nvroot = make_vdev_root(NULL, NULL, NULL, ztest_opts.zo_vdev_size, 0,
	    class, ztest_opts.zo_raid_children, zs->zs_mirrors, 1);

	error = spa_vdev_add(spa, nvroot);
	fnvlist_free(nvroot);

	if (error == ENOSPC)
		ztest_record_enospc("spa_vdev_add");
	else if (error != 0)
		fatal(B_FALSE, "spa_vdev_add() = %d", error);

	/*
	 * 50% of the time allow small blocks in the special class
	 */
	if (error == 0 &&
	    spa_special_class(spa)->mc_groups == 1 && ztest_random(2) == 0) {
		if (ztest_opts.zo_verbose >= 3)
			(void) printf("Enabling special VDEV small blocks\n");
		(void) ztest_dsl_prop_set_uint64(zd->zd_name,
		    ZFS_PROP_SPECIAL_SMALL_BLOCKS, 32768, B_FALSE);
	}

	mutex_exit(&ztest_vdev_lock);

	if (ztest_opts.zo_verbose >= 3) {
		metaslab_class_t *mc;

		if (strcmp(class, VDEV_ALLOC_BIAS_SPECIAL) == 0)
			mc = spa_special_class(spa);
		else
			mc = spa_dedup_class(spa);
		(void) printf("Added a %s mirrored vdev (of %d)\n",
		    class, (int)mc->mc_groups);
	}
}
/*
 * Verify that adding/removing aux devices (l2arc, hot spare) works as
 * expected.
 */
void
ztest_vdev_aux_add_remove(ztest_ds_t *zd, uint64_t id)
{
	(void) zd, (void) id;
	ztest_shared_t *zs = ztest_shared;
	spa_t *spa = ztest_spa;
	vdev_t *rvd = spa->spa_root_vdev;
	spa_aux_vdev_t *sav;
	char *aux;
	char *path;
	uint64_t guid = 0;
	int error, ignore_err = 0;

	if (ztest_opts.zo_mmp_test)
		return;

	path = umem_alloc(MAXPATHLEN, UMEM_NOFAIL);

	if (ztest_random(2) == 0) {
		sav = &spa->spa_spares;
		aux = ZPOOL_CONFIG_SPARES;
	} else {
		sav = &spa->spa_l2cache;
		aux = ZPOOL_CONFIG_L2CACHE;
	}

	mutex_enter(&ztest_vdev_lock);

	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);

	if (sav->sav_count != 0 && ztest_random(4) == 0) {
		/*
		 * Pick a random device to remove.
		 */
		vdev_t *svd = sav->sav_vdevs[ztest_random(sav->sav_count)];

		/* dRAID spares cannot be removed; try anyway to see ENOTSUP */
		if (strstr(svd->vdev_path, VDEV_TYPE_DRAID) != NULL)
			ignore_err = ENOTSUP;

		guid = svd->vdev_guid;
	} else {
		/*
		 * Find an unused device we can add.
		 */
		zs->zs_vdev_aux = 0;
		for (;;) {
			int c;
			(void) snprintf(path, MAXPATHLEN, ztest_aux_template,
			    ztest_opts.zo_dir, ztest_opts.zo_pool, aux,
			    zs->zs_vdev_aux);
			for (c = 0; c < sav->sav_count; c++)
				if (strcmp(sav->sav_vdevs[c]->vdev_path,
				    path) == 0)
					break;
			if (c == sav->sav_count &&
			    vdev_lookup_by_path(rvd, path) == NULL)
				break;
			zs->zs_vdev_aux++;
		}
	}

	spa_config_exit(spa, SCL_VDEV, FTAG);

	if (guid == 0) {
		/*
		 * Add a new device.
		 */
		nvlist_t *nvroot = make_vdev_root(NULL, aux, NULL,
		    (ztest_opts.zo_vdev_size * 5) / 4, 0, NULL, 0, 0, 1);
		error = spa_vdev_add(spa, nvroot);

		switch (error) {
		case 0:
			break;
		default:
			fatal(B_FALSE, "spa_vdev_add(%p) = %d", nvroot, error);
		}
		fnvlist_free(nvroot);
	} else {
		/*
		 * Remove an existing device.  Sometimes, dirty its
		 * vdev state first to make sure we handle removal
		 * of devices that have pending state changes.
		 */
		if (ztest_random(2) == 0)
			(void) vdev_online(spa, guid, 0, NULL);

		error = spa_vdev_remove(spa, guid, B_FALSE);

		switch (error) {
		case 0:
		case EBUSY:
		case ZFS_ERR_CHECKPOINT_EXISTS:
		case ZFS_ERR_DISCARDING_CHECKPOINT:
			break;
		default:
			if (error != ignore_err)
				fatal(B_FALSE,
				    "spa_vdev_remove(%"PRIu64") = %d",
				    guid, error);
		}
	}

	mutex_exit(&ztest_vdev_lock);

	umem_free(path, MAXPATHLEN);
}

/*
 * Split a pool if it has mirror top-level vdevs.
 */
void
ztest_split_pool(ztest_ds_t *zd, uint64_t id)
{
	(void) zd, (void) id;
	ztest_shared_t *zs = ztest_shared;
	spa_t *spa = ztest_spa;
	vdev_t *rvd = spa->spa_root_vdev;
	nvlist_t *tree, **child, *config, *split, **schild;
	uint_t c, children, schildren = 0, lastlogid = 0;
	int error = 0;

	if (ztest_opts.zo_mmp_test)
		return;

	mutex_enter(&ztest_vdev_lock);

	/* ensure we have a usable config; mirrors of raidz aren't supported */
	if (zs->zs_mirrors < 3 || ztest_opts.zo_raid_children > 1) {
		mutex_exit(&ztest_vdev_lock);
		return;
	}

	/* clean up the old pool, if any */
	(void) spa_destroy("splitp");

	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);

	/* generate a config from the existing config */
	mutex_enter(&spa->spa_props_lock);
	tree = fnvlist_lookup_nvlist(spa->spa_config, ZPOOL_CONFIG_VDEV_TREE);
	mutex_exit(&spa->spa_props_lock);

	VERIFY0(nvlist_lookup_nvlist_array(tree, ZPOOL_CONFIG_CHILDREN,
	    &child, &children));

	schild = malloc(rvd->vdev_children * sizeof (nvlist_t *));
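
	/*
	 * Build the split config: each mirror contributes its first child,
	 * while log and hole top-level vdevs become explicit holes.
	 * lastlogid remembers where a trailing run of logs/holes begins so
	 * those entries can be trimmed off the end of the config below.
	 */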
	for (c = 0; c < children; c++) {
		vdev_t *tvd = rvd->vdev_child[c];
		nvlist_t **mchild;
		uint_t mchildren;

		if (tvd->vdev_islog || tvd->vdev_ops == &vdev_hole_ops) {
			schild[schildren] = fnvlist_alloc();
			fnvlist_add_string(schild[schildren],
			    ZPOOL_CONFIG_TYPE, VDEV_TYPE_HOLE);
			fnvlist_add_uint64(schild[schildren],
			    ZPOOL_CONFIG_IS_HOLE, 1);
			if (lastlogid == 0)
				lastlogid = schildren;
			++schildren;
			continue;
		}
		lastlogid = 0;
		VERIFY0(nvlist_lookup_nvlist_array(child[c],
		    ZPOOL_CONFIG_CHILDREN, &mchild, &mchildren));
		schild[schildren++] = fnvlist_dup(mchild[0]);
	}

	/* OK, create a config that can be used to split */
	split = fnvlist_alloc();
	fnvlist_add_string(split, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT);
	fnvlist_add_nvlist_array(split, ZPOOL_CONFIG_CHILDREN,
	    (const nvlist_t **)schild, lastlogid != 0 ? lastlogid : schildren);

	config = fnvlist_alloc();
	fnvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, split);

	for (c = 0; c < schildren; c++)
		fnvlist_free(schild[c]);
	free(schild);
	fnvlist_free(split);

	spa_config_exit(spa, SCL_VDEV, FTAG);

	(void) pthread_rwlock_wrlock(&ztest_name_lock);
	error = spa_vdev_split_mirror(spa, "splitp", config, NULL, B_FALSE);
	(void) pthread_rwlock_unlock(&ztest_name_lock);

	fnvlist_free(config);

	if (error == 0) {
		(void) printf("successful split - results:\n");
		mutex_enter(&spa_namespace_lock);
		show_pool_stats(spa);
		show_pool_stats(spa_lookup("splitp"));
		mutex_exit(&spa_namespace_lock);
		++zs->zs_splits;
		--zs->zs_mirrors;
	}
	mutex_exit(&ztest_vdev_lock);
}

/*
 * Verify that we can attach and detach devices.
 */
void
ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id)
{
	(void) zd, (void) id;
	ztest_shared_t *zs = ztest_shared;
	spa_t *spa = ztest_spa;
	spa_aux_vdev_t *sav = &spa->spa_spares;
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *oldvd, *newvd, *pvd;
	nvlist_t *root;
	uint64_t leaves;
	uint64_t leaf, top;
	uint64_t ashift = ztest_get_ashift();
	uint64_t oldguid, pguid;
	uint64_t oldsize, newsize;
	char *oldpath, *newpath;
	int replacing;
	int oldvd_has_siblings = B_FALSE;
	int newvd_is_spare = B_FALSE;
	int newvd_is_dspare = B_FALSE;
	int oldvd_is_log;
	int error, expected_error;

	if (ztest_opts.zo_mmp_test)
		return;

	oldpath = umem_alloc(MAXPATHLEN, UMEM_NOFAIL);
	newpath = umem_alloc(MAXPATHLEN, UMEM_NOFAIL);

	mutex_enter(&ztest_vdev_lock);
	leaves = MAX(zs->zs_mirrors, 1) * ztest_opts.zo_raid_children;

	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);

	/*
	 * If a vdev is in the process of being removed, its removal may
	 * finish while we are in progress, leading to an unexpected error
	 * value.  Don't bother trying to attach while we are in the middle
	 * of removal.
	 */
	if (ztest_device_removal_active) {
		spa_config_exit(spa, SCL_ALL, FTAG);
		goto out;
	}

	/*
	 * Decide whether to do an attach or a replace.
	 */
	replacing = ztest_random(2);

	/*
	 * Pick a random top-level vdev.
	 */
	top = ztest_random_vdev_top(spa, B_TRUE);

	/*
	 * Pick a random leaf within it.
	 */
	leaf = ztest_random(leaves);

	/*
	 * Locate this vdev.
	 */
	oldvd = rvd->vdev_child[top];

	/* pick a child from the mirror */
	if (zs->zs_mirrors >= 1) {
		ASSERT3P(oldvd->vdev_ops, ==, &vdev_mirror_ops);
		ASSERT3U(oldvd->vdev_children, >=, zs->zs_mirrors);
		oldvd = oldvd->vdev_child[leaf / ztest_opts.zo_raid_children];
	}

	/* pick a child out of the raidz group */
	if (ztest_opts.zo_raid_children > 1) {
		if (strcmp(oldvd->vdev_ops->vdev_op_type, "raidz") == 0)
			ASSERT3P(oldvd->vdev_ops, ==, &vdev_raidz_ops);
		else
			ASSERT3P(oldvd->vdev_ops, ==, &vdev_draid_ops);
		ASSERT3U(oldvd->vdev_children, ==, ztest_opts.zo_raid_children);
		oldvd = oldvd->vdev_child[leaf % ztest_opts.zo_raid_children];
	}

	/*
	 * If we're already doing an attach or replace, oldvd may be a
	 * mirror vdev -- in which case, pick a random child.
	 */
	while (oldvd->vdev_children != 0) {
		oldvd_has_siblings = B_TRUE;
		ASSERT3U(oldvd->vdev_children, >=, 2);
		oldvd = oldvd->vdev_child[ztest_random(oldvd->vdev_children)];
	}

	oldguid = oldvd->vdev_guid;
	oldsize = vdev_get_min_asize(oldvd);
	oldvd_is_log = oldvd->vdev_top->vdev_islog;
	(void) strcpy(oldpath, oldvd->vdev_path);
	pvd = oldvd->vdev_parent;
	pguid = pvd->vdev_guid;

	/*
	 * If oldvd has siblings, then half of the time, detach it.  Prior
	 * to the detach the pool is scrubbed in order to prevent creating
	 * unrepairable blocks as a result of the data corruption injection.
	 */
	if (oldvd_has_siblings && ztest_random(2) == 0) {
		spa_config_exit(spa, SCL_ALL, FTAG);

		error = ztest_scrub_impl(spa);
		if (error)
			goto out;

		error = spa_vdev_detach(spa, oldguid, pguid, B_FALSE);
		if (error != 0 && error != ENODEV && error != EBUSY &&
		    error != ENOTSUP && error != ZFS_ERR_CHECKPOINT_EXISTS &&
		    error != ZFS_ERR_DISCARDING_CHECKPOINT)
			fatal(B_FALSE, "detach (%s) returned %d",
			    oldpath, error);
		goto out;
	}

	/*
	 * For the new vdev, choose with equal probability between the two
	 * standard paths (ending in either 'a' or 'b') or a random hot spare.
	 */
	if (sav->sav_count != 0 && ztest_random(3) == 0) {
		newvd = sav->sav_vdevs[ztest_random(sav->sav_count)];
		newvd_is_spare = B_TRUE;

		if (newvd->vdev_ops == &vdev_draid_spare_ops)
			newvd_is_dspare = B_TRUE;

		(void) strcpy(newpath, newvd->vdev_path);
	} else {
		(void) snprintf(newpath, MAXPATHLEN, ztest_dev_template,
		    ztest_opts.zo_dir, ztest_opts.zo_pool,
		    top * leaves + leaf);
		if (ztest_random(2) == 0)
			newpath[strlen(newpath) - 1] = 'b';
		newvd = vdev_lookup_by_path(rvd, newpath);
	}

	if (newvd) {
		/*
		 * Reopen to ensure the vdev's asize field isn't stale.
		 */
		vdev_reopen(newvd);
		newsize = vdev_get_min_asize(newvd);
	} else {
		/*
		 * Make newsize a little bigger or smaller than oldsize.
		 * If it's smaller, the attach should fail.
		 * If it's larger, and we're doing a replace,
		 * we should get dynamic LUN growth when we're done.
		 */
		newsize = 10 * oldsize / (9 + ztest_random(3));
	}

	/*
	 * If pvd is not a mirror or root, the attach should fail with ENOTSUP,
	 * unless it's a replace; in that case any non-replacing parent is OK.
	 *
	 * If newvd is already part of the pool, it should fail with EBUSY.
	 *
	 * If newvd is too small, it should fail with EOVERFLOW.
	 *
	 * If newvd is a distributed spare and it's being attached to a
	 * dRAID which is not its parent it should fail with ENOTSUP.
	 */
	if (pvd->vdev_ops != &vdev_mirror_ops &&
	    pvd->vdev_ops != &vdev_root_ops && (!replacing ||
	    pvd->vdev_ops == &vdev_replacing_ops ||
	    pvd->vdev_ops == &vdev_spare_ops))
		expected_error = ENOTSUP;
	else if (newvd_is_spare && (!replacing || oldvd_is_log))
		expected_error = ENOTSUP;
	else if (newvd == oldvd)
		expected_error = replacing ? 0 : EBUSY;
	else if (vdev_lookup_by_path(rvd, newpath) != NULL)
		expected_error = EBUSY;
	else if (!newvd_is_dspare && newsize < oldsize)
		expected_error = EOVERFLOW;
	else if (ashift > oldvd->vdev_top->vdev_ashift)
		expected_error = EDOM;
	else if (newvd_is_dspare && pvd != vdev_draid_spare_get_parent(newvd))
		expected_error = ENOTSUP;
	else
		expected_error = 0;

	spa_config_exit(spa, SCL_ALL, FTAG);

	/*
	 * Build the nvlist describing newpath.
	 */
	root = make_vdev_root(newpath, NULL, NULL, newvd == NULL ? newsize : 0,
	    ashift, NULL, 0, 0, 1);

	/*
	 * When supported select either a healing or sequential resilver.
	 */
	boolean_t rebuilding = B_FALSE;
	if (pvd->vdev_ops == &vdev_mirror_ops ||
	    pvd->vdev_ops == &vdev_root_ops) {
		rebuilding = !!ztest_random(2);
	}

	error = spa_vdev_attach(spa, oldguid, root, replacing, rebuilding);

	fnvlist_free(root);

	/*
	 * If our parent was the replacing vdev, but the replace completed,
	 * then instead of failing with ENOTSUP we may either succeed,
	 * fail with ENODEV, or fail with EOVERFLOW.
	 */
	if (expected_error == ENOTSUP &&
	    (error == 0 || error == ENODEV || error == EOVERFLOW))
		expected_error = error;

	/*
	 * If someone grew the LUN, the replacement may be too small.
	 */
	if (error == EOVERFLOW || error == EBUSY)
		expected_error = error;

	if (error == ZFS_ERR_CHECKPOINT_EXISTS ||
	    error == ZFS_ERR_DISCARDING_CHECKPOINT ||
	    error == ZFS_ERR_RESILVER_IN_PROGRESS ||
	    error == ZFS_ERR_REBUILD_IN_PROGRESS)
		expected_error = error;

	if (error != expected_error && expected_error != EBUSY) {
		fatal(B_FALSE, "attach (%s %"PRIu64", %s %"PRIu64", %d) "
		    "returned %d, expected %d",
		    oldpath, oldsize, newpath,
		    newsize, replacing, error, expected_error);
	}
out:
	mutex_exit(&ztest_vdev_lock);

	umem_free(oldpath, MAXPATHLEN);
	umem_free(newpath, MAXPATHLEN);
}

void
ztest_device_removal(ztest_ds_t *zd, uint64_t id)
{
	(void) zd, (void) id;
	spa_t *spa = ztest_spa;
	vdev_t *vd;
	uint64_t guid;
	int error;

	mutex_enter(&ztest_vdev_lock);

	if (ztest_device_removal_active) {
		mutex_exit(&ztest_vdev_lock);
		return;
	}

	/*
	 * Remove a random top-level vdev and wait for removal to finish.
	 */
	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
	vd = vdev_lookup_top(spa, ztest_random_vdev_top(spa, B_FALSE));
	guid = vd->vdev_guid;
	spa_config_exit(spa, SCL_VDEV, FTAG);

	error = spa_vdev_remove(spa, guid, B_FALSE);
	if (error == 0) {
		ztest_device_removal_active = B_TRUE;
		mutex_exit(&ztest_vdev_lock);

		/*
		 * spa->spa_vdev_removal is created in a sync task that
		 * is initiated via dsl_sync_task_nowait().  Since the
		 * task may not run before spa_vdev_remove() returns, we
		 * must wait at least 1 txg to ensure that the removal
		 * struct has been created.
		 */
		txg_wait_synced(spa_get_dsl(spa), 0);

		while (spa->spa_removing_phys.sr_state == DSS_SCANNING)
			txg_wait_synced(spa_get_dsl(spa), 0);
	} else {
		mutex_exit(&ztest_vdev_lock);
		return;
	}

	/*
	 * The pool needs to be scrubbed after completing device removal.
	 * Failure to do so may result in checksum errors due to the
	 * strategy employed by ztest_fault_inject() when selecting which
	 * offsets are redundant and can be damaged.
	 */
	error = spa_scan(spa, POOL_SCAN_SCRUB);
	if (error == 0) {
		while (dsl_scan_scrubbing(spa_get_dsl(spa)))
			txg_wait_synced(spa_get_dsl(spa), 0);
	}

	mutex_enter(&ztest_vdev_lock);
	ztest_device_removal_active = B_FALSE;
	mutex_exit(&ztest_vdev_lock);
}

/*
 * Callback function which expands the physical size of the vdev.
 */
static vdev_t *
grow_vdev(vdev_t *vd, void *arg)
{
	spa_t *spa __maybe_unused = vd->vdev_spa;
	size_t *newsize = arg;
	size_t fsize;
	int fd;

	ASSERT3S(spa_config_held(spa, SCL_STATE, RW_READER), ==, SCL_STATE);
	ASSERT(vd->vdev_ops->vdev_op_leaf);

	if ((fd = open(vd->vdev_path, O_RDWR)) == -1)
		return (vd);

	fsize = lseek(fd, 0, SEEK_END);
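	/* Grow the backing file; the vdev sees the new size once reopened. */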
	VERIFY0(ftruncate(fd, *newsize));

	if (ztest_opts.zo_verbose >= 6) {
		(void) printf("%s grew from %lu to %lu bytes\n",
		    vd->vdev_path, (ulong_t)fsize, (ulong_t)*newsize);
	}
	(void) close(fd);
	return (NULL);
}

/*
 * Callback function which expands a given vdev by calling vdev_online().
 */
static vdev_t *
online_vdev(vdev_t *vd, void *arg)
{
	(void) arg;
	spa_t *spa = vd->vdev_spa;
	vdev_t *tvd = vd->vdev_top;
	uint64_t guid = vd->vdev_guid;
	uint64_t generation = spa->spa_config_generation + 1;
	vdev_state_t newstate = VDEV_STATE_UNKNOWN;
	int error;

	ASSERT3S(spa_config_held(spa, SCL_STATE, RW_READER), ==, SCL_STATE);
	ASSERT(vd->vdev_ops->vdev_op_leaf);

	/* Calling vdev_online will initialize the new metaslabs */
	spa_config_exit(spa, SCL_STATE, spa);
	error = vdev_online(spa, guid, ZFS_ONLINE_EXPAND, &newstate);
	spa_config_enter(spa, SCL_STATE, spa, RW_READER);

	/*
	 * If vdev_online returned an error or the underlying vdev_open
	 * failed then we abort the expand.  The only way to know that
	 * vdev_open failed is by checking the returned newstate.
	 */
	if (error || newstate != VDEV_STATE_HEALTHY) {
		if (ztest_opts.zo_verbose >= 5) {
			(void) printf("Unable to expand vdev, state %u, "
			    "error %d\n", newstate, error);
		}
		return (vd);
	}
	ASSERT3U(newstate, ==, VDEV_STATE_HEALTHY);

	/*
	 * Since we dropped the lock we need to ensure that we're
	 * still talking to the original vdev.  It's possible this
	 * vdev may have been detached/replaced while we were
	 * trying to online it.
	 */
	if (generation != spa->spa_config_generation) {
		if (ztest_opts.zo_verbose >= 5) {
			(void) printf("vdev configuration has changed, "
			    "guid %"PRIu64", state %"PRIu64", "
			    "expected gen %"PRIu64", got gen %"PRIu64"\n",
			    guid,
			    (uint64_t)tvd->vdev_state,
			    generation,
			    spa->spa_config_generation);
		}
		return (vd);
	}
	return (NULL);
}

/*
 * Traverse the vdev tree calling the supplied function.
 * We continue to walk the tree until we either have walked all
 * children or we receive a non-NULL return from the callback.
 * If a NULL callback is passed, then we just return the first
 * leaf vdev we encounter.
 */
static vdev_t *
vdev_walk_tree(vdev_t *vd, vdev_t *(*func)(vdev_t *, void *), void *arg)
{
	uint_t c;

	if (vd->vdev_ops->vdev_op_leaf) {
		if (func == NULL)
			return (vd);
		else
			return (func(vd, arg));
	}

	for (c = 0; c < vd->vdev_children; c++) {
		vdev_t *cvd = vd->vdev_child[c];
		if ((cvd = vdev_walk_tree(cvd, func, arg)) != NULL)
			return (cvd);
	}
	return (NULL);
}
online the vdev to create the new metaslabs 4061 */ 4062 if (vdev_walk_tree(tvd, grow_vdev, &newsize) != NULL || 4063 vdev_walk_tree(tvd, online_vdev, NULL) != NULL || 4064 tvd->vdev_state != VDEV_STATE_HEALTHY) { 4065 if (ztest_opts.zo_verbose >= 5) { 4066 (void) printf("Could not expand LUN because " 4067 "the vdev configuration changed.\n"); 4068 } 4069 spa_config_exit(spa, SCL_STATE, spa); 4070 mutex_exit(&ztest_vdev_lock); 4071 mutex_exit(&ztest_checkpoint_lock); 4072 return; 4073 } 4074 4075 spa_config_exit(spa, SCL_STATE, spa); 4076 4077 /* 4078 * Expanding the LUN will update the config asynchronously, 4079 * thus we must wait for the async thread to complete any 4080 * pending tasks before proceeding. 4081 */ 4082 for (;;) { 4083 boolean_t done; 4084 mutex_enter(&spa->spa_async_lock); 4085 done = (spa->spa_async_thread == NULL && !spa->spa_async_tasks); 4086 mutex_exit(&spa->spa_async_lock); 4087 if (done) 4088 break; 4089 txg_wait_synced(spa_get_dsl(spa), 0); 4090 (void) poll(NULL, 0, 100); 4091 } 4092 4093 spa_config_enter(spa, SCL_STATE, spa, RW_READER); 4094 4095 tvd = spa->spa_root_vdev->vdev_child[top]; 4096 new_ms_count = tvd->vdev_ms_count; 4097 new_class_space = metaslab_class_get_space(mc); 4098 4099 if (tvd->vdev_mg != mg || mg->mg_class != mc) { 4100 if (ztest_opts.zo_verbose >= 5) { 4101 (void) printf("Could not verify LUN expansion due to " 4102 "intervening vdev offline or remove.\n"); 4103 } 4104 spa_config_exit(spa, SCL_STATE, spa); 4105 mutex_exit(&ztest_vdev_lock); 4106 mutex_exit(&ztest_checkpoint_lock); 4107 return; 4108 } 4109 4110 /* 4111 * Make sure we were able to grow the vdev. 4112 */ 4113 if (new_ms_count <= old_ms_count) { 4114 fatal(B_FALSE, 4115 "LUN expansion failed: ms_count %"PRIu64" <= %"PRIu64"\n", 4116 new_ms_count, old_ms_count); 4117 } 4118 4119 /* 4120 * Make sure we were able to grow the pool. 4121 */ 4122 if (new_class_space <= old_class_space) { 4123 fatal(B_FALSE, 4124 "LUN expansion failed: class_space %"PRIu64" <= %"PRIu64"\n", 4125 new_class_space, old_class_space); 4126 } 4127 4128 if (ztest_opts.zo_verbose >= 5) { 4129 char oldnumbuf[NN_NUMBUF_SZ], newnumbuf[NN_NUMBUF_SZ]; 4130 4131 nicenum(old_class_space, oldnumbuf, sizeof (oldnumbuf)); 4132 nicenum(new_class_space, newnumbuf, sizeof (newnumbuf)); 4133 (void) printf("%s grew from %s to %s\n", 4134 spa->spa_name, oldnumbuf, newnumbuf); 4135 } 4136 4137 spa_config_exit(spa, SCL_STATE, spa); 4138 mutex_exit(&ztest_vdev_lock); 4139 mutex_exit(&ztest_checkpoint_lock); 4140 } 4141 4142 /* 4143 * Verify that dmu_objset_{create,destroy,open,close} work as expected. 4144 */ 4145 static void 4146 ztest_objset_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx) 4147 { 4148 (void) arg, (void) cr; 4149 4150 /* 4151 * Create the objects common to all ztest datasets. 4152 */ 4153 VERIFY0(zap_create_claim(os, ZTEST_DIROBJ, 4154 DMU_OT_ZAP_OTHER, DMU_OT_NONE, 0, tx)); 4155 } 4156 4157 static int 4158 ztest_dataset_create(char *dsname) 4159 { 4160 int err; 4161 uint64_t rand; 4162 dsl_crypto_params_t *dcp = NULL; 4163 4164 /* 4165 * 50% of the time, we create encrypted datasets 4166 * using a random cipher suite and a hard-coded 4167 * wrapping key.
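 *
 * (A raw-format wrapping key is always WRAPPING_KEY_LEN (32) bytes;
 * the keyformat, keylocation, and pbkdf2 properties added below are
 * recorded purely for userspace's benefit, as noted further down.)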
4168 */ 4169 rand = ztest_random(2); 4170 if (rand != 0) { 4171 nvlist_t *crypto_args = fnvlist_alloc(); 4172 nvlist_t *props = fnvlist_alloc(); 4173 4174 /* slight bias towards the default cipher suite */ 4175 rand = ztest_random(ZIO_CRYPT_FUNCTIONS); 4176 if (rand < ZIO_CRYPT_AES_128_CCM) 4177 rand = ZIO_CRYPT_ON; 4178 4179 fnvlist_add_uint64(props, 4180 zfs_prop_to_name(ZFS_PROP_ENCRYPTION), rand); 4181 fnvlist_add_uint8_array(crypto_args, "wkeydata", 4182 (uint8_t *)ztest_wkeydata, WRAPPING_KEY_LEN); 4183 4184 /* 4185 * These parameters aren't really used by the kernel. They 4186 * are simply stored so that userspace knows how to load 4187 * the wrapping key. 4188 */ 4189 fnvlist_add_uint64(props, 4190 zfs_prop_to_name(ZFS_PROP_KEYFORMAT), ZFS_KEYFORMAT_RAW); 4191 fnvlist_add_string(props, 4192 zfs_prop_to_name(ZFS_PROP_KEYLOCATION), "prompt"); 4193 fnvlist_add_uint64(props, 4194 zfs_prop_to_name(ZFS_PROP_PBKDF2_SALT), 0ULL); 4195 fnvlist_add_uint64(props, 4196 zfs_prop_to_name(ZFS_PROP_PBKDF2_ITERS), 0ULL); 4197 4198 VERIFY0(dsl_crypto_params_create_nvlist(DCP_CMD_NONE, props, 4199 crypto_args, &dcp)); 4200 4201 /* 4202 * Cycle through all available encryption implementations 4203 * to verify interoperability. 4204 */ 4205 VERIFY0(gcm_impl_set("cycle")); 4206 VERIFY0(aes_impl_set("cycle")); 4207 4208 fnvlist_free(crypto_args); 4209 fnvlist_free(props); 4210 } 4211 4212 err = dmu_objset_create(dsname, DMU_OST_OTHER, 0, dcp, 4213 ztest_objset_create_cb, NULL); 4214 dsl_crypto_params_free(dcp, !!err); 4215 4216 rand = ztest_random(100); 4217 if (err || rand < 80) 4218 return (err); 4219 4220 if (ztest_opts.zo_verbose >= 5) 4221 (void) printf("Setting dataset %s to sync always\n", dsname); 4222 return (ztest_dsl_prop_set_uint64(dsname, ZFS_PROP_SYNC, 4223 ZFS_SYNC_ALWAYS, B_FALSE)); 4224 } 4225 4226 static int 4227 ztest_objset_destroy_cb(const char *name, void *arg) 4228 { 4229 (void) arg; 4230 objset_t *os; 4231 dmu_object_info_t doi; 4232 int error; 4233 4234 /* 4235 * Verify that the dataset contains a directory object. 4236 */ 4237 VERIFY0(ztest_dmu_objset_own(name, DMU_OST_OTHER, B_TRUE, 4238 B_TRUE, FTAG, &os)); 4239 error = dmu_object_info(os, ZTEST_DIROBJ, &doi); 4240 if (error != ENOENT) { 4241 /* We could have crashed in the middle of destroying it */ 4242 ASSERT0(error); 4243 ASSERT3U(doi.doi_type, ==, DMU_OT_ZAP_OTHER); 4244 ASSERT3S(doi.doi_physical_blocks_512, >=, 0); 4245 } 4246 dmu_objset_disown(os, B_TRUE, FTAG); 4247 4248 /* 4249 * Destroy the dataset. 4250 */ 4251 if (strchr(name, '@') != NULL) { 4252 error = dsl_destroy_snapshot(name, B_TRUE); 4253 if (error != ECHRNG) { 4254 /* 4255 * The program was executed, but encountered a runtime 4256 * error, such as insufficient slop, or a hold on the 4257 * dataset. 
4258 */ 4259 ASSERT0(error); 4260 } 4261 } else { 4262 error = dsl_destroy_head(name); 4263 if (error == ENOSPC) { 4264 /* There could be a checkpoint or insufficient slop */ 4265 ztest_record_enospc(FTAG); 4266 } else if (error != EBUSY) { 4267 /* There could be a hold on this dataset */ 4268 ASSERT0(error); 4269 } 4270 } 4271 return (0); 4272 } 4273 4274 static boolean_t 4275 ztest_snapshot_create(char *osname, uint64_t id) 4276 { 4277 char snapname[ZFS_MAX_DATASET_NAME_LEN]; 4278 int error; 4279 4280 (void) snprintf(snapname, sizeof (snapname), "%"PRIu64"", id); 4281 4282 error = dmu_objset_snapshot_one(osname, snapname); 4283 if (error == ENOSPC) { 4284 ztest_record_enospc(FTAG); 4285 return (B_FALSE); 4286 } 4287 if (error != 0 && error != EEXIST) { 4288 fatal(B_FALSE, "ztest_snapshot_create(%s@%s) = %d", osname, 4289 snapname, error); 4290 } 4291 return (B_TRUE); 4292 } 4293 4294 static boolean_t 4295 ztest_snapshot_destroy(char *osname, uint64_t id) 4296 { 4297 char snapname[ZFS_MAX_DATASET_NAME_LEN]; 4298 int error; 4299 4300 (void) snprintf(snapname, sizeof (snapname), "%s@%"PRIu64"", 4301 osname, id); 4302 4303 error = dsl_destroy_snapshot(snapname, B_FALSE); 4304 if (error != 0 && error != ENOENT) 4305 fatal(B_FALSE, "ztest_snapshot_destroy(%s) = %d", 4306 snapname, error); 4307 return (B_TRUE); 4308 } 4309 4310 void 4311 ztest_dmu_objset_create_destroy(ztest_ds_t *zd, uint64_t id) 4312 { 4313 (void) zd; 4314 ztest_ds_t *zdtmp; 4315 int iters; 4316 int error; 4317 objset_t *os, *os2; 4318 char name[ZFS_MAX_DATASET_NAME_LEN]; 4319 zilog_t *zilog; 4320 int i; 4321 4322 zdtmp = umem_alloc(sizeof (ztest_ds_t), UMEM_NOFAIL); 4323 4324 (void) pthread_rwlock_rdlock(&ztest_name_lock); 4325 4326 (void) snprintf(name, sizeof (name), "%s/temp_%"PRIu64"", 4327 ztest_opts.zo_pool, id); 4328 4329 /* 4330 * If this dataset exists from a previous run, process its replay log 4331 * half of the time. If we don't replay it, then dsl_destroy_head() 4332 * (invoked from ztest_objset_destroy_cb()) should just throw it away. 4333 */ 4334 if (ztest_random(2) == 0 && 4335 ztest_dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, 4336 B_TRUE, FTAG, &os) == 0) { 4337 ztest_zd_init(zdtmp, NULL, os); 4338 zil_replay(os, zdtmp, ztest_replay_vector); 4339 ztest_zd_fini(zdtmp); 4340 dmu_objset_disown(os, B_TRUE, FTAG); 4341 } 4342 4343 /* 4344 * There may be an old instance of the dataset we're about to 4345 * create lying around from a previous run. If so, destroy it 4346 * and all of its snapshots. 4347 */ 4348 (void) dmu_objset_find(name, ztest_objset_destroy_cb, NULL, 4349 DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS); 4350 4351 /* 4352 * Verify that the destroyed dataset is no longer in the namespace. 4353 */ 4354 VERIFY3U(ENOENT, ==, ztest_dmu_objset_own(name, DMU_OST_OTHER, B_TRUE, 4355 B_TRUE, FTAG, &os)); 4356 4357 /* 4358 * Verify that we can create a new dataset. 4359 */ 4360 error = ztest_dataset_create(name); 4361 if (error) { 4362 if (error == ENOSPC) { 4363 ztest_record_enospc(FTAG); 4364 goto out; 4365 } 4366 fatal(B_FALSE, "dmu_objset_create(%s) = %d", name, error); 4367 } 4368 4369 VERIFY0(ztest_dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, B_TRUE, 4370 FTAG, &os)); 4371 4372 ztest_zd_init(zdtmp, NULL, os); 4373 4374 /* 4375 * Open the intent log for it. 4376 */ 4377 zilog = zil_open(os, ztest_get_data); 4378 4379 /* 4380 * Put some objects in there, do a little I/O to them, 4381 * and randomly take a couple of snapshots along the way.
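 *
 * (iters is drawn from [0, 5), so the loop below may run zero times;
 * each pass that does run also snapshots the dataset with
 * probability 1/iters.)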
4382 */ 4383 iters = ztest_random(5); 4384 for (i = 0; i < iters; i++) { 4385 ztest_dmu_object_alloc_free(zdtmp, id); 4386 if (ztest_random(iters) == 0) 4387 (void) ztest_snapshot_create(name, i); 4388 } 4389 4390 /* 4391 * Verify that we cannot create an existing dataset. 4392 */ 4393 VERIFY3U(EEXIST, ==, 4394 dmu_objset_create(name, DMU_OST_OTHER, 0, NULL, NULL, NULL)); 4395 4396 /* 4397 * Verify that we can hold an objset that is also owned. 4398 */ 4399 VERIFY0(dmu_objset_hold(name, FTAG, &os2)); 4400 dmu_objset_rele(os2, FTAG); 4401 4402 /* 4403 * Verify that we cannot own an objset that is already owned. 4404 */ 4405 VERIFY3U(EBUSY, ==, ztest_dmu_objset_own(name, DMU_OST_OTHER, 4406 B_FALSE, B_TRUE, FTAG, &os2)); 4407 4408 zil_close(zilog); 4409 dmu_objset_disown(os, B_TRUE, FTAG); 4410 ztest_zd_fini(zdtmp); 4411 out: 4412 (void) pthread_rwlock_unlock(&ztest_name_lock); 4413 4414 umem_free(zdtmp, sizeof (ztest_ds_t)); 4415 } 4416 4417 /* 4418 * Verify that dmu_snapshot_{create,destroy,open,close} work as expected. 4419 */ 4420 void 4421 ztest_dmu_snapshot_create_destroy(ztest_ds_t *zd, uint64_t id) 4422 { 4423 (void) pthread_rwlock_rdlock(&ztest_name_lock); 4424 (void) ztest_snapshot_destroy(zd->zd_name, id); 4425 (void) ztest_snapshot_create(zd->zd_name, id); 4426 (void) pthread_rwlock_unlock(&ztest_name_lock); 4427 } 4428 4429 /* 4430 * Cleanup non-standard snapshots and clones. 4431 */ 4432 static void 4433 ztest_dsl_dataset_cleanup(char *osname, uint64_t id) 4434 { 4435 char *snap1name; 4436 char *clone1name; 4437 char *snap2name; 4438 char *clone2name; 4439 char *snap3name; 4440 int error; 4441 4442 snap1name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4443 clone1name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4444 snap2name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4445 clone2name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4446 snap3name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4447 4448 (void) snprintf(snap1name, ZFS_MAX_DATASET_NAME_LEN, "%s@s1_%"PRIu64"", 4449 osname, id); 4450 (void) snprintf(clone1name, ZFS_MAX_DATASET_NAME_LEN, "%s/c1_%"PRIu64"", 4451 osname, id); 4452 (void) snprintf(snap2name, ZFS_MAX_DATASET_NAME_LEN, "%s@s2_%"PRIu64"", 4453 clone1name, id); 4454 (void) snprintf(clone2name, ZFS_MAX_DATASET_NAME_LEN, "%s/c2_%"PRIu64"", 4455 osname, id); 4456 (void) snprintf(snap3name, ZFS_MAX_DATASET_NAME_LEN, "%s@s3_%"PRIu64"", 4457 clone1name, id); 4458 4459 error = dsl_destroy_head(clone2name); 4460 if (error && error != ENOENT) 4461 fatal(B_FALSE, "dsl_destroy_head(%s) = %d", clone2name, error); 4462 error = dsl_destroy_snapshot(snap3name, B_FALSE); 4463 if (error && error != ENOENT) 4464 fatal(B_FALSE, "dsl_destroy_snapshot(%s) = %d", 4465 snap3name, error); 4466 error = dsl_destroy_snapshot(snap2name, B_FALSE); 4467 if (error && error != ENOENT) 4468 fatal(B_FALSE, "dsl_destroy_snapshot(%s) = %d", 4469 snap2name, error); 4470 error = dsl_destroy_head(clone1name); 4471 if (error && error != ENOENT) 4472 fatal(B_FALSE, "dsl_destroy_head(%s) = %d", clone1name, error); 4473 error = dsl_destroy_snapshot(snap1name, B_FALSE); 4474 if (error && error != ENOENT) 4475 fatal(B_FALSE, "dsl_destroy_snapshot(%s) = %d", 4476 snap1name, error); 4477 4478 umem_free(snap1name, ZFS_MAX_DATASET_NAME_LEN); 4479 umem_free(clone1name, ZFS_MAX_DATASET_NAME_LEN); 4480 umem_free(snap2name, ZFS_MAX_DATASET_NAME_LEN); 4481 umem_free(clone2name, ZFS_MAX_DATASET_NAME_LEN); 4482 umem_free(snap3name, ZFS_MAX_DATASET_NAME_LEN); 4483 } 4484 4485 
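/*
 * The promote test below builds the following chain of datasets and
 * snapshots (every name is suffixed with the worker id):
 *
 *	<osname>@s1  <-- <osname>/c1		(c1 is a clone of s1)
 *	<osname>/c1@s2
 *	<osname>/c1@s3  <-- <osname>/c2		(c2 is a clone of s3)
 *
 * Promoting c2 would transfer c1@s2 to it, so while s2 is owned the
 * promote must fail with EBUSY.
 */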
/* 4486 * Verify dsl_dataset_promote handles EBUSY 4487 */ 4488 void 4489 ztest_dsl_dataset_promote_busy(ztest_ds_t *zd, uint64_t id) 4490 { 4491 objset_t *os; 4492 char *snap1name; 4493 char *clone1name; 4494 char *snap2name; 4495 char *clone2name; 4496 char *snap3name; 4497 char *osname = zd->zd_name; 4498 int error; 4499 4500 snap1name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4501 clone1name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4502 snap2name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4503 clone2name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4504 snap3name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4505 4506 (void) pthread_rwlock_rdlock(&ztest_name_lock); 4507 4508 ztest_dsl_dataset_cleanup(osname, id); 4509 4510 (void) snprintf(snap1name, ZFS_MAX_DATASET_NAME_LEN, "%s@s1_%"PRIu64"", 4511 osname, id); 4512 (void) snprintf(clone1name, ZFS_MAX_DATASET_NAME_LEN, "%s/c1_%"PRIu64"", 4513 osname, id); 4514 (void) snprintf(snap2name, ZFS_MAX_DATASET_NAME_LEN, "%s@s2_%"PRIu64"", 4515 clone1name, id); 4516 (void) snprintf(clone2name, ZFS_MAX_DATASET_NAME_LEN, "%s/c2_%"PRIu64"", 4517 osname, id); 4518 (void) snprintf(snap3name, ZFS_MAX_DATASET_NAME_LEN, "%s@s3_%"PRIu64"", 4519 clone1name, id); 4520 4521 error = dmu_objset_snapshot_one(osname, strchr(snap1name, '@') + 1); 4522 if (error && error != EEXIST) { 4523 if (error == ENOSPC) { 4524 ztest_record_enospc(FTAG); 4525 goto out; 4526 } 4527 fatal(B_FALSE, "dmu_objset_snapshot_one(%s) = %d", snap1name, error); 4528 } 4529 4530 error = dmu_objset_clone(clone1name, snap1name); 4531 if (error) { 4532 if (error == ENOSPC) { 4533 ztest_record_enospc(FTAG); 4534 goto out; 4535 } 4536 fatal(B_FALSE, "dmu_objset_clone(%s) = %d", clone1name, error); 4537 } 4538 4539 error = dmu_objset_snapshot_one(clone1name, strchr(snap2name, '@') + 1); 4540 if (error && error != EEXIST) { 4541 if (error == ENOSPC) { 4542 ztest_record_enospc(FTAG); 4543 goto out; 4544 } 4545 fatal(B_FALSE, "dmu_objset_snapshot_one(%s) = %d", snap2name, error); 4546 } 4547 4548 error = dmu_objset_snapshot_one(clone1name, strchr(snap3name, '@') + 1); 4549 if (error && error != EEXIST) { 4550 if (error == ENOSPC) { 4551 ztest_record_enospc(FTAG); 4552 goto out; 4553 } 4554 fatal(B_FALSE, "dmu_objset_snapshot_one(%s) = %d", snap3name, error); 4555 } 4556 4557 error = dmu_objset_clone(clone2name, snap3name); 4558 if (error) { 4559 if (error == ENOSPC) { 4560 ztest_record_enospc(FTAG); 4561 goto out; 4562 } 4563 fatal(B_FALSE, "dmu_objset_clone(%s) = %d", clone2name, error); 4564 } 4565 4566 error = ztest_dmu_objset_own(snap2name, DMU_OST_ANY, B_TRUE, B_TRUE, 4567 FTAG, &os); 4568 if (error) 4569 fatal(B_FALSE, "dmu_objset_own(%s) = %d", snap2name, error); 4570 error = dsl_dataset_promote(clone2name, NULL); 4571 if (error == ENOSPC) { 4572 dmu_objset_disown(os, B_TRUE, FTAG); 4573 ztest_record_enospc(FTAG); 4574 goto out; 4575 } 4576 if (error != EBUSY) 4577 fatal(B_FALSE, "dsl_dataset_promote(%s), %d, not EBUSY", 4578 clone2name, error); 4579 dmu_objset_disown(os, B_TRUE, FTAG); 4580 4581 out: 4582 ztest_dsl_dataset_cleanup(osname, id); 4583 4584 (void) pthread_rwlock_unlock(&ztest_name_lock); 4585 4586 umem_free(snap1name, ZFS_MAX_DATASET_NAME_LEN); 4587 umem_free(clone1name, ZFS_MAX_DATASET_NAME_LEN); 4588 umem_free(snap2name, ZFS_MAX_DATASET_NAME_LEN); 4589 umem_free(clone2name, ZFS_MAX_DATASET_NAME_LEN); 4590 umem_free(snap3name, ZFS_MAX_DATASET_NAME_LEN); 4591 } 4592 4593 #undef OD_ARRAY_SIZE 4594 #define OD_ARRAY_SIZE 4 4595 4596 /* 4597 *
Verify that dmu_object_{alloc,free} work as expected. 4598 */ 4599 void 4600 ztest_dmu_object_alloc_free(ztest_ds_t *zd, uint64_t id) 4601 { 4602 ztest_od_t *od; 4603 int batchsize; 4604 int size; 4605 int b; 4606 4607 size = sizeof (ztest_od_t) * OD_ARRAY_SIZE; 4608 od = umem_alloc(size, UMEM_NOFAIL); 4609 batchsize = OD_ARRAY_SIZE; 4610 4611 for (b = 0; b < batchsize; b++) 4612 ztest_od_init(od + b, id, FTAG, b, DMU_OT_UINT64_OTHER, 4613 0, 0, 0); 4614 4615 /* 4616 * Destroy the previous batch of objects, create a new batch, 4617 * and do some I/O on the new objects. 4618 */ 4619 if (ztest_object_init(zd, od, size, B_TRUE) != 0) 4620 return; 4621 4622 while (ztest_random(4 * batchsize) != 0) 4623 ztest_io(zd, od[ztest_random(batchsize)].od_object, 4624 ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT); 4625 4626 umem_free(od, size); 4627 } 4628 4629 /* 4630 * Rewind the global allocator to verify object allocation backfilling. 4631 */ 4632 void 4633 ztest_dmu_object_next_chunk(ztest_ds_t *zd, uint64_t id) 4634 { 4635 (void) id; 4636 objset_t *os = zd->zd_os; 4637 int dnodes_per_chunk = 1 << dmu_object_alloc_chunk_shift; 4638 uint64_t object; 4639 4640 /* 4641 * Rewind the global allocator randomly back to a lower object number 4642 * to force backfilling and reclamation of recently freed dnodes. 4643 */ 4644 mutex_enter(&os->os_obj_lock); 4645 object = ztest_random(os->os_obj_next_chunk); 4646 os->os_obj_next_chunk = P2ALIGN(object, dnodes_per_chunk); 4647 mutex_exit(&os->os_obj_lock); 4648 } 4649 4650 #undef OD_ARRAY_SIZE 4651 #define OD_ARRAY_SIZE 2 4652 4653 /* 4654 * Verify that dmu_{read,write} work as expected. 4655 */ 4656 void 4657 ztest_dmu_read_write(ztest_ds_t *zd, uint64_t id) 4658 { 4659 int size; 4660 ztest_od_t *od; 4661 4662 objset_t *os = zd->zd_os; 4663 size = sizeof (ztest_od_t) * OD_ARRAY_SIZE; 4664 od = umem_alloc(size, UMEM_NOFAIL); 4665 dmu_tx_t *tx; 4666 int freeit, error; 4667 uint64_t i, n, s, txg; 4668 bufwad_t *packbuf, *bigbuf, *pack, *bigH, *bigT; 4669 uint64_t packobj, packoff, packsize, bigobj, bigoff, bigsize; 4670 uint64_t chunksize = (1000 + ztest_random(1000)) * sizeof (uint64_t); 4671 uint64_t regions = 997; 4672 uint64_t stride = 123456789ULL; 4673 uint64_t width = 40; 4674 int free_percent = 5; 4675 4676 /* 4677 * This test uses two objects, packobj and bigobj, that are always 4678 * updated together (i.e. in the same tx) so that their contents are 4679 * in sync and can be compared. Their contents relate to each other 4680 * in a simple way: packobj is a dense array of 'bufwad' structures, 4681 * while bigobj is a sparse array of the same bufwads. Specifically, 4682 * for any index n, there are three bufwads that should be identical: 4683 * 4684 * packobj, at offset n * sizeof (bufwad_t) 4685 * bigobj, at the head of the nth chunk 4686 * bigobj, at the tail of the nth chunk 4687 * 4688 * The chunk size is arbitrary. It doesn't have to be a power of two, 4689 * and it doesn't have any relation to the object blocksize. 4690 * The only requirement is that it can hold at least two bufwads. 4691 * 4692 * Normally, we write the bufwad to each of these locations. 4693 * However, free_percent of the time we instead write zeroes to 4694 * packobj and perform a dmu_free_range() on bigobj. By comparing 4695 * bigobj to packobj, we can verify that the DMU is correctly 4696 * tracking which parts of an object are allocated and free, 4697 * and that the contents of the allocated blocks are correct. 4698 */ 4699 4700 /* 4701 * Read the directory info. 
If it's the first time, set things up. 4702 */ 4703 ztest_od_init(od, id, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0, chunksize); 4704 ztest_od_init(od + 1, id, FTAG, 1, DMU_OT_UINT64_OTHER, 0, 0, 4705 chunksize); 4706 4707 if (ztest_object_init(zd, od, size, B_FALSE) != 0) { 4708 umem_free(od, size); 4709 return; 4710 } 4711 4712 bigobj = od[0].od_object; 4713 packobj = od[1].od_object; 4714 chunksize = od[0].od_gen; 4715 ASSERT3U(chunksize, ==, od[1].od_gen); 4716 4717 /* 4718 * Prefetch a random chunk of the big object. 4719 * Our aim here is to get some async reads in flight 4720 * for blocks that we may free below; the DMU should 4721 * handle this race correctly. 4722 */ 4723 n = ztest_random(regions) * stride + ztest_random(width); 4724 s = 1 + ztest_random(2 * width - 1); 4725 dmu_prefetch(os, bigobj, 0, n * chunksize, s * chunksize, 4726 ZIO_PRIORITY_SYNC_READ); 4727 4728 /* 4729 * Pick a random index and compute the offsets into packobj and bigobj. 4730 */ 4731 n = ztest_random(regions) * stride + ztest_random(width); 4732 s = 1 + ztest_random(width - 1); 4733 4734 packoff = n * sizeof (bufwad_t); 4735 packsize = s * sizeof (bufwad_t); 4736 4737 bigoff = n * chunksize; 4738 bigsize = s * chunksize; 4739 4740 packbuf = umem_alloc(packsize, UMEM_NOFAIL); 4741 bigbuf = umem_alloc(bigsize, UMEM_NOFAIL); 4742 4743 /* 4744 * free_percent of the time, free a range of bigobj rather than 4745 * overwriting it. 4746 */ 4747 freeit = (ztest_random(100) < free_percent); 4748 4749 /* 4750 * Read the current contents of our objects. 4751 */ 4752 error = dmu_read(os, packobj, packoff, packsize, packbuf, 4753 DMU_READ_PREFETCH); 4754 ASSERT0(error); 4755 error = dmu_read(os, bigobj, bigoff, bigsize, bigbuf, 4756 DMU_READ_PREFETCH); 4757 ASSERT0(error); 4758 4759 /* 4760 * Get a tx for the mods to both packobj and bigobj. 4761 */ 4762 tx = dmu_tx_create(os); 4763 4764 dmu_tx_hold_write(tx, packobj, packoff, packsize); 4765 4766 if (freeit) 4767 dmu_tx_hold_free(tx, bigobj, bigoff, bigsize); 4768 else 4769 dmu_tx_hold_write(tx, bigobj, bigoff, bigsize); 4770 4771 /* This accounts for setting the checksum/compression. */ 4772 dmu_tx_hold_bonus(tx, bigobj); 4773 4774 txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); 4775 if (txg == 0) { 4776 umem_free(packbuf, packsize); 4777 umem_free(bigbuf, bigsize); 4778 umem_free(od, size); 4779 return; 4780 } 4781 4782 enum zio_checksum cksum; 4783 do { 4784 cksum = (enum zio_checksum) 4785 ztest_random_dsl_prop(ZFS_PROP_CHECKSUM); 4786 } while (cksum >= ZIO_CHECKSUM_LEGACY_FUNCTIONS); 4787 dmu_object_set_checksum(os, bigobj, cksum, tx); 4788 4789 enum zio_compress comp; 4790 do { 4791 comp = (enum zio_compress) 4792 ztest_random_dsl_prop(ZFS_PROP_COMPRESSION); 4793 } while (comp >= ZIO_COMPRESS_LEGACY_FUNCTIONS); 4794 dmu_object_set_compress(os, bigobj, comp, tx); 4795 4796 /* 4797 * For each index from n to n + s, verify that the existing bufwad 4798 * in packobj matches the bufwads at the head and tail of the 4799 * corresponding chunk in bigobj. Then update all three bufwads 4800 * with the new values we want to write out. 
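 *
 * For example, with chunksize = 8192 and i = 2, the three copies of
 * bufwad n + 2 live at byte 2 * sizeof (bufwad_t) of packbuf, at
 * byte 16384 of bigbuf (the chunk head), and at byte
 * 24576 - sizeof (bufwad_t) of bigbuf (the chunk tail).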
4801 */ 4802 for (i = 0; i < s; i++) { 4803 /* LINTED */ 4804 pack = (bufwad_t *)((char *)packbuf + i * sizeof (bufwad_t)); 4805 /* LINTED */ 4806 bigH = (bufwad_t *)((char *)bigbuf + i * chunksize); 4807 /* LINTED */ 4808 bigT = (bufwad_t *)((char *)bigH + chunksize) - 1; 4809 4810 ASSERT3U((uintptr_t)bigH - (uintptr_t)bigbuf, <, bigsize); 4811 ASSERT3U((uintptr_t)bigT - (uintptr_t)bigbuf, <, bigsize); 4812 4813 if (pack->bw_txg > txg) 4814 fatal(B_FALSE, 4815 "future leak: got %"PRIx64", open txg is %"PRIx64"", 4816 pack->bw_txg, txg); 4817 4818 if (pack->bw_data != 0 && pack->bw_index != n + i) 4819 fatal(B_FALSE, "wrong index: " 4820 "got %"PRIx64", wanted %"PRIx64"+%"PRIx64"", 4821 pack->bw_index, n, i); 4822 4823 if (memcmp(pack, bigH, sizeof (bufwad_t)) != 0) 4824 fatal(B_FALSE, "pack/bigH mismatch in %p/%p", 4825 pack, bigH); 4826 4827 if (memcmp(pack, bigT, sizeof (bufwad_t)) != 0) 4828 fatal(B_FALSE, "pack/bigT mismatch in %p/%p", 4829 pack, bigT); 4830 4831 if (freeit) { 4832 memset(pack, 0, sizeof (bufwad_t)); 4833 } else { 4834 pack->bw_index = n + i; 4835 pack->bw_txg = txg; 4836 pack->bw_data = 1 + ztest_random(-2ULL); 4837 } 4838 *bigH = *pack; 4839 *bigT = *pack; 4840 } 4841 4842 /* 4843 * We've verified all the old bufwads, and made new ones. 4844 * Now write them out. 4845 */ 4846 dmu_write(os, packobj, packoff, packsize, packbuf, tx); 4847 4848 if (freeit) { 4849 if (ztest_opts.zo_verbose >= 7) { 4850 (void) printf("freeing offset %"PRIx64" size %"PRIx64"" 4851 " txg %"PRIx64"\n", 4852 bigoff, bigsize, txg); 4853 } 4854 VERIFY0(dmu_free_range(os, bigobj, bigoff, bigsize, tx)); 4855 } else { 4856 if (ztest_opts.zo_verbose >= 7) { 4857 (void) printf("writing offset %"PRIx64" size %"PRIx64"" 4858 " txg %"PRIx64"\n", 4859 bigoff, bigsize, txg); 4860 } 4861 dmu_write(os, bigobj, bigoff, bigsize, bigbuf, tx); 4862 } 4863 4864 dmu_tx_commit(tx); 4865 4866 /* 4867 * Sanity check the stuff we just wrote. 4868 */ 4869 { 4870 void *packcheck = umem_alloc(packsize, UMEM_NOFAIL); 4871 void *bigcheck = umem_alloc(bigsize, UMEM_NOFAIL); 4872 4873 VERIFY0(dmu_read(os, packobj, packoff, 4874 packsize, packcheck, DMU_READ_PREFETCH)); 4875 VERIFY0(dmu_read(os, bigobj, bigoff, 4876 bigsize, bigcheck, DMU_READ_PREFETCH)); 4877 4878 ASSERT0(memcmp(packbuf, packcheck, packsize)); 4879 ASSERT0(memcmp(bigbuf, bigcheck, bigsize)); 4880 4881 umem_free(packcheck, packsize); 4882 umem_free(bigcheck, bigsize); 4883 } 4884 4885 umem_free(packbuf, packsize); 4886 umem_free(bigbuf, bigsize); 4887 umem_free(od, size); 4888 } 4889 4890 static void 4891 compare_and_update_pbbufs(uint64_t s, bufwad_t *packbuf, bufwad_t *bigbuf, 4892 uint64_t bigsize, uint64_t n, uint64_t chunksize, uint64_t txg) 4893 { 4894 uint64_t i; 4895 bufwad_t *pack; 4896 bufwad_t *bigH; 4897 bufwad_t *bigT; 4898 4899 /* 4900 * For each index from n to n + s, verify that the existing bufwad 4901 * in packobj matches the bufwads at the head and tail of the 4902 * corresponding chunk in bigobj. Then update all three bufwads 4903 * with the new values we want to write out. 
4904 */ 4905 for (i = 0; i < s; i++) { 4906 /* LINTED */ 4907 pack = (bufwad_t *)((char *)packbuf + i * sizeof (bufwad_t)); 4908 /* LINTED */ 4909 bigH = (bufwad_t *)((char *)bigbuf + i * chunksize); 4910 /* LINTED */ 4911 bigT = (bufwad_t *)((char *)bigH + chunksize) - 1; 4912 4913 ASSERT3U((uintptr_t)bigH - (uintptr_t)bigbuf, <, bigsize); 4914 ASSERT3U((uintptr_t)bigT - (uintptr_t)bigbuf, <, bigsize); 4915 4916 if (pack->bw_txg > txg) 4917 fatal(B_FALSE, 4918 "future leak: got %"PRIx64", open txg is %"PRIx64"", 4919 pack->bw_txg, txg); 4920 4921 if (pack->bw_data != 0 && pack->bw_index != n + i) 4922 fatal(B_FALSE, "wrong index: " 4923 "got %"PRIx64", wanted %"PRIx64"+%"PRIx64"", 4924 pack->bw_index, n, i); 4925 4926 if (memcmp(pack, bigH, sizeof (bufwad_t)) != 0) 4927 fatal(B_FALSE, "pack/bigH mismatch in %p/%p", 4928 pack, bigH); 4929 4930 if (memcmp(pack, bigT, sizeof (bufwad_t)) != 0) 4931 fatal(B_FALSE, "pack/bigT mismatch in %p/%p", 4932 pack, bigT); 4933 4934 pack->bw_index = n + i; 4935 pack->bw_txg = txg; 4936 pack->bw_data = 1 + ztest_random(-2ULL); 4937 4938 *bigH = *pack; 4939 *bigT = *pack; 4940 } 4941 } 4942 4943 #undef OD_ARRAY_SIZE 4944 #define OD_ARRAY_SIZE 2 4945 4946 void 4947 ztest_dmu_read_write_zcopy(ztest_ds_t *zd, uint64_t id) 4948 { 4949 objset_t *os = zd->zd_os; 4950 ztest_od_t *od; 4951 dmu_tx_t *tx; 4952 uint64_t i; 4953 int error; 4954 int size; 4955 uint64_t n, s, txg; 4956 bufwad_t *packbuf, *bigbuf; 4957 uint64_t packobj, packoff, packsize, bigobj, bigoff, bigsize; 4958 uint64_t blocksize = ztest_random_blocksize(); 4959 uint64_t chunksize = blocksize; 4960 uint64_t regions = 997; 4961 uint64_t stride = 123456789ULL; 4962 uint64_t width = 9; 4963 dmu_buf_t *bonus_db; 4964 arc_buf_t **bigbuf_arcbufs; 4965 dmu_object_info_t doi; 4966 4967 size = sizeof (ztest_od_t) * OD_ARRAY_SIZE; 4968 od = umem_alloc(size, UMEM_NOFAIL); 4969 4970 /* 4971 * This test uses two objects, packobj and bigobj, that are always 4972 * updated together (i.e. in the same tx) so that their contents are 4973 * in sync and can be compared. Their contents relate to each other 4974 * in a simple way: packobj is a dense array of 'bufwad' structures, 4975 * while bigobj is a sparse array of the same bufwads. Specifically, 4976 * for any index n, there are three bufwads that should be identical: 4977 * 4978 * packobj, at offset n * sizeof (bufwad_t) 4979 * bigobj, at the head of the nth chunk 4980 * bigobj, at the tail of the nth chunk 4981 * 4982 * The chunk size is set equal to bigobj block size so that 4983 * dmu_assign_arcbuf_by_dbuf() can be tested for object updates. 4984 */ 4985 4986 /* 4987 * Read the directory info. If it's the first time, set things up. 4988 */ 4989 ztest_od_init(od, id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, 0, 0); 4990 ztest_od_init(od + 1, id, FTAG, 1, DMU_OT_UINT64_OTHER, 0, 0, 4991 chunksize); 4992 4993 4994 if (ztest_object_init(zd, od, size, B_FALSE) != 0) { 4995 umem_free(od, size); 4996 return; 4997 } 4998 4999 bigobj = od[0].od_object; 5000 packobj = od[1].od_object; 5001 blocksize = od[0].od_blocksize; 5002 chunksize = blocksize; 5003 ASSERT3U(chunksize, ==, od[1].od_gen); 5004 5005 VERIFY0(dmu_object_info(os, bigobj, &doi)); 5006 VERIFY(ISP2(doi.doi_data_block_size)); 5007 VERIFY3U(chunksize, ==, doi.doi_data_block_size); 5008 VERIFY3U(chunksize, >=, 2 * sizeof (bufwad_t)); 5009 5010 /* 5011 * Pick a random index and compute the offsets into packobj and bigobj. 
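 *
 * (n picks the starting bufwad index and s the run length, so both
 * objects are exercised over the same logical window: bufwads
 * [n, n + s) of packobj and chunks [n, n + s) of bigobj.)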
5012 */ 5013 n = ztest_random(regions) * stride + ztest_random(width); 5014 s = 1 + ztest_random(width - 1); 5015 5016 packoff = n * sizeof (bufwad_t); 5017 packsize = s * sizeof (bufwad_t); 5018 5019 bigoff = n * chunksize; 5020 bigsize = s * chunksize; 5021 5022 packbuf = umem_zalloc(packsize, UMEM_NOFAIL); 5023 bigbuf = umem_zalloc(bigsize, UMEM_NOFAIL); 5024 5025 VERIFY0(dmu_bonus_hold(os, bigobj, FTAG, &bonus_db)); 5026 5027 bigbuf_arcbufs = umem_zalloc(2 * s * sizeof (arc_buf_t *), UMEM_NOFAIL); 5028 5029 /* 5030 * Iteration 0 tests zcopy for DB_UNCACHED dbufs. 5031 * Iteration 1 tests zcopy to already referenced dbufs. 5032 * Iteration 2 tests zcopy to a dbuf dirtied in the same txg. 5033 * Iteration 3 tests zcopy to a dbuf dirtied in the previous txg. 5034 * Iteration 4 tests zcopy when the dbuf is no longer dirty. 5035 * Iteration 5 tests zcopy when it can't be done. 5036 * Iteration 6 is one more zcopy write. 5037 */ 5038 for (i = 0; i < 7; i++) { 5039 uint64_t j; 5040 uint64_t off; 5041 5042 /* 5043 * In iteration 5 (i == 5) use arcbufs 5044 * that don't match bigobj blksz to test 5045 * dmu_assign_arcbuf_by_dbuf() when it can't directly 5046 * assign an arcbuf to a dbuf. 5047 */ 5048 for (j = 0; j < s; j++) { 5049 if (i != 5 || chunksize < (SPA_MINBLOCKSIZE * 2)) { 5050 bigbuf_arcbufs[j] = 5051 dmu_request_arcbuf(bonus_db, chunksize); 5052 } else { 5053 bigbuf_arcbufs[2 * j] = 5054 dmu_request_arcbuf(bonus_db, chunksize / 2); 5055 bigbuf_arcbufs[2 * j + 1] = 5056 dmu_request_arcbuf(bonus_db, chunksize / 2); 5057 } 5058 } 5059 5060 /* 5061 * Get a tx for the mods to both packobj and bigobj. 5062 */ 5063 tx = dmu_tx_create(os); 5064 5065 dmu_tx_hold_write(tx, packobj, packoff, packsize); 5066 dmu_tx_hold_write(tx, bigobj, bigoff, bigsize); 5067 5068 txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); 5069 if (txg == 0) { 5070 umem_free(packbuf, packsize); 5071 umem_free(bigbuf, bigsize); 5072 for (j = 0; j < s; j++) { 5073 if (i != 5 || 5074 chunksize < (SPA_MINBLOCKSIZE * 2)) { 5075 dmu_return_arcbuf(bigbuf_arcbufs[j]); 5076 } else { 5077 dmu_return_arcbuf( 5078 bigbuf_arcbufs[2 * j]); 5079 dmu_return_arcbuf( 5080 bigbuf_arcbufs[2 * j + 1]); 5081 } 5082 } 5083 umem_free(bigbuf_arcbufs, 2 * s * sizeof (arc_buf_t *)); 5084 umem_free(od, size); 5085 dmu_buf_rele(bonus_db, FTAG); 5086 return; 5087 } 5088 5089 /* 5090 * 50% of the time don't read objects in the 1st iteration to 5091 * test dmu_assign_arcbuf_by_dbuf() for the case when there are 5092 * no existing dbufs for the specified offsets. 5093 */ 5094 if (i != 0 || ztest_random(2) != 0) { 5095 error = dmu_read(os, packobj, packoff, 5096 packsize, packbuf, DMU_READ_PREFETCH); 5097 ASSERT0(error); 5098 error = dmu_read(os, bigobj, bigoff, bigsize, 5099 bigbuf, DMU_READ_PREFETCH); 5100 ASSERT0(error); 5101 } 5102 compare_and_update_pbbufs(s, packbuf, bigbuf, bigsize, 5103 n, chunksize, txg); 5104 5105 /* 5106 * We've verified all the old bufwads, and made new ones. 5107 * Now write them out.
5108 */ 5109 dmu_write(os, packobj, packoff, packsize, packbuf, tx); 5110 if (ztest_opts.zo_verbose >= 7) { 5111 (void) printf("writing offset %"PRIx64" size %"PRIx64"" 5112 " txg %"PRIx64"\n", 5113 bigoff, bigsize, txg); 5114 } 5115 for (off = bigoff, j = 0; j < s; j++, off += chunksize) { 5116 dmu_buf_t *dbt; 5117 if (i != 5 || chunksize < (SPA_MINBLOCKSIZE * 2)) { 5118 memcpy(bigbuf_arcbufs[j]->b_data, 5119 (caddr_t)bigbuf + (off - bigoff), 5120 chunksize); 5121 } else { 5122 memcpy(bigbuf_arcbufs[2 * j]->b_data, 5123 (caddr_t)bigbuf + (off - bigoff), 5124 chunksize / 2); 5125 memcpy(bigbuf_arcbufs[2 * j + 1]->b_data, 5126 (caddr_t)bigbuf + (off - bigoff) + 5127 chunksize / 2, 5128 chunksize / 2); 5129 } 5130 5131 if (i == 1) { 5132 VERIFY(dmu_buf_hold(os, bigobj, off, 5133 FTAG, &dbt, DMU_READ_NO_PREFETCH) == 0); 5134 } 5135 if (i != 5 || chunksize < (SPA_MINBLOCKSIZE * 2)) { 5136 VERIFY0(dmu_assign_arcbuf_by_dbuf(bonus_db, 5137 off, bigbuf_arcbufs[j], tx)); 5138 } else { 5139 VERIFY0(dmu_assign_arcbuf_by_dbuf(bonus_db, 5140 off, bigbuf_arcbufs[2 * j], tx)); 5141 VERIFY0(dmu_assign_arcbuf_by_dbuf(bonus_db, 5142 off + chunksize / 2, 5143 bigbuf_arcbufs[2 * j + 1], tx)); 5144 } 5145 if (i == 1) { 5146 dmu_buf_rele(dbt, FTAG); 5147 } 5148 } 5149 dmu_tx_commit(tx); 5150 5151 /* 5152 * Sanity check the stuff we just wrote. 5153 */ 5154 { 5155 void *packcheck = umem_alloc(packsize, UMEM_NOFAIL); 5156 void *bigcheck = umem_alloc(bigsize, UMEM_NOFAIL); 5157 5158 VERIFY0(dmu_read(os, packobj, packoff, 5159 packsize, packcheck, DMU_READ_PREFETCH)); 5160 VERIFY0(dmu_read(os, bigobj, bigoff, 5161 bigsize, bigcheck, DMU_READ_PREFETCH)); 5162 5163 ASSERT0(memcmp(packbuf, packcheck, packsize)); 5164 ASSERT0(memcmp(bigbuf, bigcheck, bigsize)); 5165 5166 umem_free(packcheck, packsize); 5167 umem_free(bigcheck, bigsize); 5168 } 5169 if (i == 2) { 5170 txg_wait_open(dmu_objset_pool(os), 0, B_TRUE); 5171 } else if (i == 3) { 5172 txg_wait_synced(dmu_objset_pool(os), 0); 5173 } 5174 } 5175 5176 dmu_buf_rele(bonus_db, FTAG); 5177 umem_free(packbuf, packsize); 5178 umem_free(bigbuf, bigsize); 5179 umem_free(bigbuf_arcbufs, 2 * s * sizeof (arc_buf_t *)); 5180 umem_free(od, size); 5181 } 5182 5183 void 5184 ztest_dmu_write_parallel(ztest_ds_t *zd, uint64_t id) 5185 { 5186 (void) id; 5187 ztest_od_t *od; 5188 5189 od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL); 5190 uint64_t offset = (1ULL << (ztest_random(20) + 43)) + 5191 (ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT); 5192 5193 /* 5194 * Have multiple threads write to large offsets in an object 5195 * to verify that parallel writes to an object -- even to the 5196 * same blocks within the object -- don't cause any trouble.
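 *
 * (The offset chosen above is deliberately huge -- 2^43..2^62 plus a
 * random multiple of SPA_MAXBLOCKSIZE -- so all the threads contend
 * on a sparse region far beyond anything previously allocated.)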
5197 */ 5198 ztest_od_init(od, ID_PARALLEL, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0, 0); 5199 5200 if (ztest_object_init(zd, od, sizeof (ztest_od_t), B_FALSE) != 0) 5201 return; 5202 5203 while (ztest_random(10) != 0) 5204 ztest_io(zd, od->od_object, offset); 5205 5206 umem_free(od, sizeof (ztest_od_t)); 5207 } 5208 5209 void 5210 ztest_dmu_prealloc(ztest_ds_t *zd, uint64_t id) 5211 { 5212 ztest_od_t *od; 5213 uint64_t offset = (1ULL << (ztest_random(4) + SPA_MAXBLOCKSHIFT)) + 5214 (ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT); 5215 uint64_t count = ztest_random(20) + 1; 5216 uint64_t blocksize = ztest_random_blocksize(); 5217 void *data; 5218 5219 od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL); 5220 5221 ztest_od_init(od, id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, 0, 0); 5222 5223 if (ztest_object_init(zd, od, sizeof (ztest_od_t), 5224 !ztest_random(2)) != 0) { 5225 umem_free(od, sizeof (ztest_od_t)); 5226 return; 5227 } 5228 5229 if (ztest_truncate(zd, od->od_object, offset, count * blocksize) != 0) { 5230 umem_free(od, sizeof (ztest_od_t)); 5231 return; 5232 } 5233 5234 ztest_prealloc(zd, od->od_object, offset, count * blocksize); 5235 5236 data = umem_zalloc(blocksize, UMEM_NOFAIL); 5237 5238 while (ztest_random(count) != 0) { 5239 uint64_t randoff = offset + (ztest_random(count) * blocksize); 5240 if (ztest_write(zd, od->od_object, randoff, blocksize, 5241 data) != 0) 5242 break; 5243 while (ztest_random(4) != 0) 5244 ztest_io(zd, od->od_object, randoff); 5245 } 5246 5247 umem_free(data, blocksize); 5248 umem_free(od, sizeof (ztest_od_t)); 5249 } 5250 5251 /* 5252 * Verify that zap_{create,destroy,add,remove,update} work as expected. 5253 */ 5254 #define ZTEST_ZAP_MIN_INTS 1 5255 #define ZTEST_ZAP_MAX_INTS 4 5256 #define ZTEST_ZAP_MAX_PROPS 1000 5257 5258 void 5259 ztest_zap(ztest_ds_t *zd, uint64_t id) 5260 { 5261 objset_t *os = zd->zd_os; 5262 ztest_od_t *od; 5263 uint64_t object; 5264 uint64_t txg, last_txg; 5265 uint64_t value[ZTEST_ZAP_MAX_INTS]; 5266 uint64_t zl_ints, zl_intsize, prop; 5267 int i, ints; 5268 dmu_tx_t *tx; 5269 char propname[100], txgname[100]; 5270 int error; 5271 char *hc[2] = { "s.acl.h", ".s.open.h.hyLZlg" }; 5272 5273 od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL); 5274 ztest_od_init(od, id, FTAG, 0, DMU_OT_ZAP_OTHER, 0, 0, 0); 5275 5276 if (ztest_object_init(zd, od, sizeof (ztest_od_t), 5277 !ztest_random(2)) != 0) 5278 goto out; 5279 5280 object = od->od_object; 5281 5282 /* 5283 * Generate a known hash collision, and verify that 5284 * we can lookup and remove both entries. 5285 */ 5286 tx = dmu_tx_create(os); 5287 dmu_tx_hold_zap(tx, object, B_TRUE, NULL); 5288 txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); 5289 if (txg == 0) 5290 goto out; 5291 for (i = 0; i < 2; i++) { 5292 value[i] = i; 5293 VERIFY0(zap_add(os, object, hc[i], sizeof (uint64_t), 5294 1, &value[i], tx)); 5295 } 5296 for (i = 0; i < 2; i++) { 5297 VERIFY3U(EEXIST, ==, zap_add(os, object, hc[i], 5298 sizeof (uint64_t), 1, &value[i], tx)); 5299 VERIFY0( 5300 zap_length(os, object, hc[i], &zl_intsize, &zl_ints)); 5301 ASSERT3U(zl_intsize, ==, sizeof (uint64_t)); 5302 ASSERT3U(zl_ints, ==, 1); 5303 } 5304 for (i = 0; i < 2; i++) { 5305 VERIFY0(zap_remove(os, object, hc[i], tx)); 5306 } 5307 dmu_tx_commit(tx); 5308 5309 /* 5310 * Generate a bunch of random entries. 
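 *
 * Entries come in pairs: txg_<prop> records the txg of the last
 * update, and the nth element of prop_<prop> must then equal
 * last_txg + object + n, which is exactly what the validation
 * below checks.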
5311 */ 5312 ints = MAX(ZTEST_ZAP_MIN_INTS, object % ZTEST_ZAP_MAX_INTS); 5313 5314 prop = ztest_random(ZTEST_ZAP_MAX_PROPS); 5315 (void) sprintf(propname, "prop_%"PRIu64"", prop); 5316 (void) sprintf(txgname, "txg_%"PRIu64"", prop); 5317 memset(value, 0, sizeof (value)); 5318 last_txg = 0; 5319 5320 /* 5321 * If these zap entries already exist, validate their contents. 5322 */ 5323 error = zap_length(os, object, txgname, &zl_intsize, &zl_ints); 5324 if (error == 0) { 5325 ASSERT3U(zl_intsize, ==, sizeof (uint64_t)); 5326 ASSERT3U(zl_ints, ==, 1); 5327 5328 VERIFY0(zap_lookup(os, object, txgname, zl_intsize, 5329 zl_ints, &last_txg)); 5330 5331 VERIFY0(zap_length(os, object, propname, &zl_intsize, 5332 &zl_ints)); 5333 5334 ASSERT3U(zl_intsize, ==, sizeof (uint64_t)); 5335 ASSERT3U(zl_ints, ==, ints); 5336 5337 VERIFY0(zap_lookup(os, object, propname, zl_intsize, 5338 zl_ints, value)); 5339 5340 for (i = 0; i < ints; i++) { 5341 ASSERT3U(value[i], ==, last_txg + object + i); 5342 } 5343 } else { 5344 ASSERT3U(error, ==, ENOENT); 5345 } 5346 5347 /* 5348 * Atomically update two entries in our zap object. 5349 * The first is named txg_%llu, and contains the txg 5350 * in which the property was last updated. The second 5351 * is named prop_%llu, and the nth element of its value 5352 * should be txg + object + n. 5353 */ 5354 tx = dmu_tx_create(os); 5355 dmu_tx_hold_zap(tx, object, B_TRUE, NULL); 5356 txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); 5357 if (txg == 0) 5358 goto out; 5359 5360 if (last_txg > txg) 5361 fatal(B_FALSE, "zap future leak: old %"PRIu64" new %"PRIu64"", 5362 last_txg, txg); 5363 5364 for (i = 0; i < ints; i++) 5365 value[i] = txg + object + i; 5366 5367 VERIFY0(zap_update(os, object, txgname, sizeof (uint64_t), 5368 1, &txg, tx)); 5369 VERIFY0(zap_update(os, object, propname, sizeof (uint64_t), 5370 ints, value, tx)); 5371 5372 dmu_tx_commit(tx); 5373 5374 /* 5375 * Remove a random pair of entries. 5376 */ 5377 prop = ztest_random(ZTEST_ZAP_MAX_PROPS); 5378 (void) sprintf(propname, "prop_%"PRIu64"", prop); 5379 (void) sprintf(txgname, "txg_%"PRIu64"", prop); 5380 5381 error = zap_length(os, object, txgname, &zl_intsize, &zl_ints); 5382 5383 if (error == ENOENT) 5384 goto out; 5385 5386 ASSERT0(error); 5387 5388 tx = dmu_tx_create(os); 5389 dmu_tx_hold_zap(tx, object, B_TRUE, NULL); 5390 txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); 5391 if (txg == 0) 5392 goto out; 5393 VERIFY0(zap_remove(os, object, txgname, tx)); 5394 VERIFY0(zap_remove(os, object, propname, tx)); 5395 dmu_tx_commit(tx); 5396 out: 5397 umem_free(od, sizeof (ztest_od_t)); 5398 } 5399 5400 /* 5401 * Test case to test the upgrading of a microzap to fatzap. 5402 */ 5403 void 5404 ztest_fzap(ztest_ds_t *zd, uint64_t id) 5405 { 5406 objset_t *os = zd->zd_os; 5407 ztest_od_t *od; 5408 uint64_t object, txg, value; 5409 5410 od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL); 5411 ztest_od_init(od, id, FTAG, 0, DMU_OT_ZAP_OTHER, 0, 0, 0); 5412 5413 if (ztest_object_init(zd, od, sizeof (ztest_od_t), 5414 !ztest_random(2)) != 0) 5415 goto out; 5416 object = od->od_object; 5417 5418 /* 5419 * Add entries to this ZAP and make sure it spills over 5420 * and gets upgraded to a fatzap. Also, since we are adding 5421 * 2050 entries we should see ptrtbl growth and leaf-block split. 
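 *
 * (A microzap packs fixed-size 64-byte entries into a single block,
 * so a few thousand adds are more than enough to force the
 * mzap -> fatzap upgrade and subsequent pointer-table growth.)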
5422 */ 5423 for (value = 0; value < 2050; value++) { 5424 char name[ZFS_MAX_DATASET_NAME_LEN]; 5425 dmu_tx_t *tx; 5426 int error; 5427 5428 (void) snprintf(name, sizeof (name), "fzap-%"PRIu64"-%"PRIu64"", 5429 id, value); 5430 5431 tx = dmu_tx_create(os); 5432 dmu_tx_hold_zap(tx, object, B_TRUE, name); 5433 txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); 5434 if (txg == 0) 5435 goto out; 5436 error = zap_add(os, object, name, sizeof (uint64_t), 1, 5437 &value, tx); 5438 ASSERT(error == 0 || error == EEXIST); 5439 dmu_tx_commit(tx); 5440 } 5441 out: 5442 umem_free(od, sizeof (ztest_od_t)); 5443 } 5444 5445 void 5446 ztest_zap_parallel(ztest_ds_t *zd, uint64_t id) 5447 { 5448 (void) id; 5449 objset_t *os = zd->zd_os; 5450 ztest_od_t *od; 5451 uint64_t txg, object, count, wsize, wc, zl_wsize, zl_wc; 5452 dmu_tx_t *tx; 5453 int i, namelen, error; 5454 int micro = ztest_random(2); 5455 char name[20], string_value[20]; 5456 void *data; 5457 5458 od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL); 5459 ztest_od_init(od, ID_PARALLEL, FTAG, micro, DMU_OT_ZAP_OTHER, 0, 0, 0); 5460 5461 if (ztest_object_init(zd, od, sizeof (ztest_od_t), B_FALSE) != 0) { 5462 umem_free(od, sizeof (ztest_od_t)); 5463 return; 5464 } 5465 5466 object = od->od_object; 5467 5468 /* 5469 * Generate a random name of the form 'xxx.....' where each 5470 * x is a random printable character and the dots are dots. 5471 * There are 94 such characters, and the name length goes from 5472 * 6 to 20, so there are 94^3 * 15 = 12,458,760 possible names. 5473 */ 5474 namelen = ztest_random(sizeof (name) - 5) + 5 + 1; 5475 5476 for (i = 0; i < 3; i++) 5477 name[i] = '!' + ztest_random('~' - '!' + 1); 5478 for (; i < namelen - 1; i++) 5479 name[i] = '.'; 5480 name[i] = '\0'; 5481 5482 if ((namelen & 1) || micro) { 5483 wsize = sizeof (txg); 5484 wc = 1; 5485 data = &txg; 5486 } else { 5487 wsize = 1; 5488 wc = namelen; 5489 data = string_value; 5490 } 5491 5492 count = -1ULL; 5493 VERIFY0(zap_count(os, object, &count)); 5494 ASSERT3S(count, !=, -1ULL); 5495 5496 /* 5497 * Select an operation: length, lookup, add, update, remove. 
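 *
 * Cases 0 (length) and 1 (lookup) are read-only and need no tx;
 * cases 2-4 (add, update, remove) mutate the object and therefore
 * assign a tx first.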
5498 */ 5499 i = ztest_random(5); 5500 5501 if (i >= 2) { 5502 tx = dmu_tx_create(os); 5503 dmu_tx_hold_zap(tx, object, B_TRUE, NULL); 5504 txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); 5505 if (txg == 0) { 5506 umem_free(od, sizeof (ztest_od_t)); 5507 return; 5508 } 5509 memcpy(string_value, name, namelen); 5510 } else { 5511 tx = NULL; 5512 txg = 0; 5513 memset(string_value, 0, namelen); 5514 } 5515 5516 switch (i) { 5517 5518 case 0: 5519 error = zap_length(os, object, name, &zl_wsize, &zl_wc); 5520 if (error == 0) { 5521 ASSERT3U(wsize, ==, zl_wsize); 5522 ASSERT3U(wc, ==, zl_wc); 5523 } else { 5524 ASSERT3U(error, ==, ENOENT); 5525 } 5526 break; 5527 5528 case 1: 5529 error = zap_lookup(os, object, name, wsize, wc, data); 5530 if (error == 0) { 5531 if (data == string_value && 5532 memcmp(name, data, namelen) != 0) 5533 fatal(B_FALSE, "name '%s' != val '%s' len %d", 5534 name, (char *)data, namelen); 5535 } else { 5536 ASSERT3U(error, ==, ENOENT); 5537 } 5538 break; 5539 5540 case 2: 5541 error = zap_add(os, object, name, wsize, wc, data, tx); 5542 ASSERT(error == 0 || error == EEXIST); 5543 break; 5544 5545 case 3: 5546 VERIFY0(zap_update(os, object, name, wsize, wc, data, tx)); 5547 break; 5548 5549 case 4: 5550 error = zap_remove(os, object, name, tx); 5551 ASSERT(error == 0 || error == ENOENT); 5552 break; 5553 } 5554 5555 if (tx != NULL) 5556 dmu_tx_commit(tx); 5557 5558 umem_free(od, sizeof (ztest_od_t)); 5559 } 5560 5561 /* 5562 * Commit callback data. 5563 */ 5564 typedef struct ztest_cb_data { 5565 list_node_t zcd_node; 5566 uint64_t zcd_txg; 5567 int zcd_expected_err; 5568 boolean_t zcd_added; 5569 boolean_t zcd_called; 5570 spa_t *zcd_spa; 5571 } ztest_cb_data_t; 5572 5573 /* This is the actual commit callback function */ 5574 static void 5575 ztest_commit_callback(void *arg, int error) 5576 { 5577 ztest_cb_data_t *data = arg; 5578 uint64_t synced_txg; 5579 5580 VERIFY3P(data, !=, NULL); 5581 VERIFY3S(data->zcd_expected_err, ==, error); 5582 VERIFY(!data->zcd_called); 5583 5584 synced_txg = spa_last_synced_txg(data->zcd_spa); 5585 if (data->zcd_txg > synced_txg) 5586 fatal(B_FALSE, 5587 "commit callback of txg %"PRIu64" called prematurely, " 5588 "last synced txg = %"PRIu64"\n", 5589 data->zcd_txg, synced_txg); 5590 5591 data->zcd_called = B_TRUE; 5592 5593 if (error == ECANCELED) { 5594 ASSERT0(data->zcd_txg); 5595 ASSERT(!data->zcd_added); 5596 5597 /* 5598 * The private callback data should be destroyed here, but 5599 * since we are going to check the zcd_called field after 5600 * dmu_tx_abort(), we will destroy it there. 
5601 */ 5602 return; 5603 } 5604 5605 ASSERT(data->zcd_added); 5606 ASSERT3U(data->zcd_txg, !=, 0); 5607 5608 (void) mutex_enter(&zcl.zcl_callbacks_lock); 5609 5610 /* Track the minimum delay between txg sync and callback */ 5611 if ((synced_txg - data->zcd_txg) < zc_min_txg_delay) 5612 zc_min_txg_delay = synced_txg - data->zcd_txg; 5613 5614 /* Remove our callback from the list */ 5615 list_remove(&zcl.zcl_callbacks, data); 5616 5617 (void) mutex_exit(&zcl.zcl_callbacks_lock); 5618 5619 umem_free(data, sizeof (ztest_cb_data_t)); 5620 } 5621 5622 /* Allocate and initialize callback data structure */ 5623 static ztest_cb_data_t * 5624 ztest_create_cb_data(objset_t *os, uint64_t txg) 5625 { 5626 ztest_cb_data_t *cb_data; 5627 5628 cb_data = umem_zalloc(sizeof (ztest_cb_data_t), UMEM_NOFAIL); 5629 5630 cb_data->zcd_txg = txg; 5631 cb_data->zcd_spa = dmu_objset_spa(os); 5632 list_link_init(&cb_data->zcd_node); 5633 5634 return (cb_data); 5635 } 5636 5637 /* 5638 * Commit callback test. 5639 */ 5640 void 5641 ztest_dmu_commit_callbacks(ztest_ds_t *zd, uint64_t id) 5642 { 5643 objset_t *os = zd->zd_os; 5644 ztest_od_t *od; 5645 dmu_tx_t *tx; 5646 ztest_cb_data_t *cb_data[3], *tmp_cb; 5647 uint64_t old_txg, txg; 5648 int i, error = 0; 5649 5650 od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL); 5651 ztest_od_init(od, id, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0, 0); 5652 5653 if (ztest_object_init(zd, od, sizeof (ztest_od_t), B_FALSE) != 0) { 5654 umem_free(od, sizeof (ztest_od_t)); 5655 return; 5656 } 5657 5658 tx = dmu_tx_create(os); 5659 5660 cb_data[0] = ztest_create_cb_data(os, 0); 5661 dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[0]); 5662 5663 dmu_tx_hold_write(tx, od->od_object, 0, sizeof (uint64_t)); 5664 5665 /* Every once in a while, abort the transaction on purpose */ 5666 if (ztest_random(100) == 0) 5667 error = -1; 5668 5669 if (!error) 5670 error = dmu_tx_assign(tx, TXG_NOWAIT); 5671 5672 txg = error ? 0 : dmu_tx_get_txg(tx); 5673 5674 cb_data[0]->zcd_txg = txg; 5675 cb_data[1] = ztest_create_cb_data(os, txg); 5676 dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[1]); 5677 5678 if (error) { 5679 /* 5680 * It's not a strict requirement to call the registered 5681 * callbacks from inside dmu_tx_abort(), but that's how 5682 * it's supposed to happen in the current implementation, 5683 * so we will check for that. 5684 */ 5685 for (i = 0; i < 2; i++) { 5686 cb_data[i]->zcd_expected_err = ECANCELED; 5687 VERIFY(!cb_data[i]->zcd_called); 5688 } 5689 5690 dmu_tx_abort(tx); 5691 5692 for (i = 0; i < 2; i++) { 5693 VERIFY(cb_data[i]->zcd_called); 5694 umem_free(cb_data[i], sizeof (ztest_cb_data_t)); 5695 } 5696 5697 umem_free(od, sizeof (ztest_od_t)); 5698 return; 5699 } 5700 5701 cb_data[2] = ztest_create_cb_data(os, txg); 5702 dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[2]); 5703 5704 /* 5705 * Read existing data to make sure there isn't a future leak.
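 *
 * (A "future leak" would mean the object already contains a txg
 * newer than the one just assigned to us, i.e. data apparently
 * written by a transaction group that hasn't happened yet.)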
5706 */ 5707 VERIFY0(dmu_read(os, od->od_object, 0, sizeof (uint64_t), 5708 &old_txg, DMU_READ_PREFETCH)); 5709 5710 if (old_txg > txg) 5711 fatal(B_FALSE, 5712 "future leak: got %"PRIu64", open txg is %"PRIu64"", 5713 old_txg, txg); 5714 5715 dmu_write(os, od->od_object, 0, sizeof (uint64_t), &txg, tx); 5716 5717 (void) mutex_enter(&zcl.zcl_callbacks_lock); 5718 5719 /* 5720 * Since commit callbacks don't have any ordering requirement and since 5721 * it is theoretically possible for a commit callback to be called 5722 * after an arbitrary amount of time has elapsed since its txg has been 5723 * synced, it is difficult to reliably determine whether a commit 5724 * callback hasn't been called due to high load or due to a flawed 5725 * implementation. 5726 * 5727 * In practice, we will assume that if after a certain number of txgs a 5728 * commit callback hasn't been called, then most likely there's an 5729 * implementation bug. 5730 */ 5731 tmp_cb = list_head(&zcl.zcl_callbacks); 5732 if (tmp_cb != NULL && 5733 tmp_cb->zcd_txg + ZTEST_COMMIT_CB_THRESH < txg) { 5734 fatal(B_FALSE, 5735 "Commit callback threshold exceeded, " 5736 "oldest txg: %"PRIu64", open txg: %"PRIu64"\n", 5737 tmp_cb->zcd_txg, txg); 5738 } 5739 5740 /* 5741 * Let's find the place to insert our callbacks. 5742 * 5743 * Even though the list is ordered by txg, it is possible for the 5744 * insertion point to not be the end because our txg may already be 5745 * quiescing at this point and other callbacks in the open txg 5746 * (from other objsets) may have sneaked in. 5747 */ 5748 tmp_cb = list_tail(&zcl.zcl_callbacks); 5749 while (tmp_cb != NULL && tmp_cb->zcd_txg > txg) 5750 tmp_cb = list_prev(&zcl.zcl_callbacks, tmp_cb); 5751 5752 /* Add the 3 callbacks to the list */ 5753 for (i = 0; i < 3; i++) { 5754 if (tmp_cb == NULL) 5755 list_insert_head(&zcl.zcl_callbacks, cb_data[i]); 5756 else 5757 list_insert_after(&zcl.zcl_callbacks, tmp_cb, 5758 cb_data[i]); 5759 5760 cb_data[i]->zcd_added = B_TRUE; 5761 VERIFY(!cb_data[i]->zcd_called); 5762 5763 tmp_cb = cb_data[i]; 5764 } 5765 5766 zc_cb_counter += 3; 5767 5768 (void) mutex_exit(&zcl.zcl_callbacks_lock); 5769 5770 dmu_tx_commit(tx); 5771 5772 umem_free(od, sizeof (ztest_od_t)); 5773 } 5774 5775 /* 5776 * Visit each object in the dataset. Verify that its properties 5777 * are consistent with what was stored in the block tag when it was 5778 * created, and that its unused bonus buffer space has not been overwritten.
5779 */ 5780 void 5781 ztest_verify_dnode_bt(ztest_ds_t *zd, uint64_t id) 5782 { 5783 (void) id; 5784 objset_t *os = zd->zd_os; 5785 uint64_t obj; 5786 int err = 0; 5787 5788 for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE, 0)) { 5789 ztest_block_tag_t *bt = NULL; 5790 dmu_object_info_t doi; 5791 dmu_buf_t *db; 5792 5793 ztest_object_lock(zd, obj, RL_READER); 5794 if (dmu_bonus_hold(os, obj, FTAG, &db) != 0) { 5795 ztest_object_unlock(zd, obj); 5796 continue; 5797 } 5798 5799 dmu_object_info_from_db(db, &doi); 5800 if (doi.doi_bonus_size >= sizeof (*bt)) 5801 bt = ztest_bt_bonus(db); 5802 5803 if (bt && bt->bt_magic == BT_MAGIC) { 5804 ztest_bt_verify(bt, os, obj, doi.doi_dnodesize, 5805 bt->bt_offset, bt->bt_gen, bt->bt_txg, 5806 bt->bt_crtxg); 5807 ztest_verify_unused_bonus(db, bt, obj, os, bt->bt_gen); 5808 } 5809 5810 dmu_buf_rele(db, FTAG); 5811 ztest_object_unlock(zd, obj); 5812 } 5813 } 5814 5815 void 5816 ztest_dsl_prop_get_set(ztest_ds_t *zd, uint64_t id) 5817 { 5818 (void) id; 5819 zfs_prop_t proplist[] = { 5820 ZFS_PROP_CHECKSUM, 5821 ZFS_PROP_COMPRESSION, 5822 ZFS_PROP_COPIES, 5823 ZFS_PROP_DEDUP 5824 }; 5825 5826 (void) pthread_rwlock_rdlock(&ztest_name_lock); 5827 5828 for (int p = 0; p < sizeof (proplist) / sizeof (proplist[0]); p++) 5829 (void) ztest_dsl_prop_set_uint64(zd->zd_name, proplist[p], 5830 ztest_random_dsl_prop(proplist[p]), (int)ztest_random(2)); 5831 5832 VERIFY0(ztest_dsl_prop_set_uint64(zd->zd_name, ZFS_PROP_RECORDSIZE, 5833 ztest_random_blocksize(), (int)ztest_random(2))); 5834 5835 (void) pthread_rwlock_unlock(&ztest_name_lock); 5836 } 5837 5838 void 5839 ztest_spa_prop_get_set(ztest_ds_t *zd, uint64_t id) 5840 { 5841 (void) zd, (void) id; 5842 nvlist_t *props = NULL; 5843 5844 (void) pthread_rwlock_rdlock(&ztest_name_lock); 5845 5846 (void) ztest_spa_prop_set_uint64(ZPOOL_PROP_AUTOTRIM, ztest_random(2)); 5847 5848 VERIFY0(spa_prop_get(ztest_spa, &props)); 5849 5850 if (ztest_opts.zo_verbose >= 6) 5851 dump_nvlist(props, 4); 5852 5853 fnvlist_free(props); 5854 5855 (void) pthread_rwlock_unlock(&ztest_name_lock); 5856 } 5857 5858 static int 5859 user_release_one(const char *snapname, const char *holdname) 5860 { 5861 nvlist_t *snaps, *holds; 5862 int error; 5863 5864 snaps = fnvlist_alloc(); 5865 holds = fnvlist_alloc(); 5866 fnvlist_add_boolean(holds, holdname); 5867 fnvlist_add_nvlist(snaps, snapname, holds); 5868 fnvlist_free(holds); 5869 error = dsl_dataset_user_release(snaps, NULL); 5870 fnvlist_free(snaps); 5871 return (error); 5872 } 5873 5874 /* 5875 * Test snapshot hold/release and deferred destroy. 5876 */ 5877 void 5878 ztest_dmu_snapshot_hold(ztest_ds_t *zd, uint64_t id) 5879 { 5880 int error; 5881 objset_t *os = zd->zd_os; 5882 objset_t *origin; 5883 char snapname[100]; 5884 char fullname[100]; 5885 char clonename[100]; 5886 char tag[100]; 5887 char osname[ZFS_MAX_DATASET_NAME_LEN]; 5888 nvlist_t *holds; 5889 5890 (void) pthread_rwlock_rdlock(&ztest_name_lock); 5891 5892 dmu_objset_name(os, osname); 5893 5894 (void) snprintf(snapname, sizeof (snapname), "sh1_%"PRIu64"", id); 5895 (void) snprintf(fullname, sizeof (fullname), "%s@%s", osname, snapname); 5896 (void) snprintf(clonename, sizeof (clonename), "%s/ch1_%"PRIu64"", 5897 osname, id); 5898 (void) snprintf(tag, sizeof (tag), "tag_%"PRIu64"", id); 5899 5900 /* 5901 * Clean up from any previous run. 
5902 */ 5903 error = dsl_destroy_head(clonename); 5904 if (error != ENOENT) 5905 ASSERT0(error); 5906 error = user_release_one(fullname, tag); 5907 if (error != ESRCH && error != ENOENT) 5908 ASSERT0(error); 5909 error = dsl_destroy_snapshot(fullname, B_FALSE); 5910 if (error != ENOENT) 5911 ASSERT0(error); 5912 5913 /* 5914 * Create snapshot, clone it, mark snap for deferred destroy, 5915 * destroy clone, verify snap was also destroyed. 5916 */ 5917 error = dmu_objset_snapshot_one(osname, snapname); 5918 if (error) { 5919 if (error == ENOSPC) { 5920 ztest_record_enospc("dmu_objset_snapshot"); 5921 goto out; 5922 } 5923 fatal(B_FALSE, "dmu_objset_snapshot(%s) = %d", fullname, error); 5924 } 5925 5926 error = dmu_objset_clone(clonename, fullname); 5927 if (error) { 5928 if (error == ENOSPC) { 5929 ztest_record_enospc("dmu_objset_clone"); 5930 goto out; 5931 } 5932 fatal(B_FALSE, "dmu_objset_clone(%s) = %d", clonename, error); 5933 } 5934 5935 error = dsl_destroy_snapshot(fullname, B_TRUE); 5936 if (error) { 5937 fatal(B_FALSE, "dsl_destroy_snapshot(%s, B_TRUE) = %d", 5938 fullname, error); 5939 } 5940 5941 error = dsl_destroy_head(clonename); 5942 if (error) 5943 fatal(B_FALSE, "dsl_destroy_head(%s) = %d", clonename, error); 5944 5945 error = dmu_objset_hold(fullname, FTAG, &origin); 5946 if (error != ENOENT) 5947 fatal(B_FALSE, "dmu_objset_hold(%s) = %d", fullname, error); 5948 5949 /* 5950 * Create snapshot, add temporary hold, verify that we can't 5951 * destroy a held snapshot, mark for deferred destroy, 5952 * release hold, verify snapshot was destroyed. 5953 */ 5954 error = dmu_objset_snapshot_one(osname, snapname); 5955 if (error) { 5956 if (error == ENOSPC) { 5957 ztest_record_enospc("dmu_objset_snapshot"); 5958 goto out; 5959 } 5960 fatal(B_FALSE, "dmu_objset_snapshot(%s) = %d", fullname, error); 5961 } 5962 5963 holds = fnvlist_alloc(); 5964 fnvlist_add_string(holds, fullname, tag); 5965 error = dsl_dataset_user_hold(holds, 0, NULL); 5966 fnvlist_free(holds); 5967 5968 if (error == ENOSPC) { 5969 ztest_record_enospc("dsl_dataset_user_hold"); 5970 goto out; 5971 } else if (error) { 5972 fatal(B_FALSE, "dsl_dataset_user_hold(%s, %s) = %u", 5973 fullname, tag, error); 5974 } 5975 5976 error = dsl_destroy_snapshot(fullname, B_FALSE); 5977 if (error != EBUSY) { 5978 fatal(B_FALSE, "dsl_destroy_snapshot(%s, B_FALSE) = %d", 5979 fullname, error); 5980 } 5981 5982 error = dsl_destroy_snapshot(fullname, B_TRUE); 5983 if (error) { 5984 fatal(B_FALSE, "dsl_destroy_snapshot(%s, B_TRUE) = %d", 5985 fullname, error); 5986 } 5987 5988 error = user_release_one(fullname, tag); 5989 if (error) 5990 fatal(B_FALSE, "user_release_one(%s, %s) = %d", 5991 fullname, tag, error); 5992 5993 VERIFY3U(dmu_objset_hold(fullname, FTAG, &origin), ==, ENOENT); 5994 5995 out: 5996 (void) pthread_rwlock_unlock(&ztest_name_lock); 5997 } 5998 5999 /* 6000 * Inject random faults into the on-disk data. 
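 *
 * Faults come in three flavors: transient failures on the first leaf
 * of a top-level vdev (dropped file descriptor, cant_read,
 * cant_write), random online/offline activity, and random garbage
 * written over redundant copies on a randomly chosen leaf or l2cache
 * device.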
6001 */ 6002 void 6003 ztest_fault_inject(ztest_ds_t *zd, uint64_t id) 6004 { 6005 (void) zd, (void) id; 6006 ztest_shared_t *zs = ztest_shared; 6007 spa_t *spa = ztest_spa; 6008 int fd; 6009 uint64_t offset; 6010 uint64_t leaves; 6011 uint64_t bad = 0x1990c0ffeedecadeull; 6012 uint64_t top, leaf; 6013 char *path0; 6014 char *pathrand; 6015 size_t fsize; 6016 int bshift = SPA_MAXBLOCKSHIFT + 2; 6017 int iters = 1000; 6018 int maxfaults; 6019 int mirror_save; 6020 vdev_t *vd0 = NULL; 6021 uint64_t guid0 = 0; 6022 boolean_t islog = B_FALSE; 6023 6024 path0 = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); 6025 pathrand = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); 6026 6027 mutex_enter(&ztest_vdev_lock); 6028 6029 /* 6030 * While device removal is in progress, fault injection must be 6031 * disabled until it completes and the pool is scrubbed. The fault 6032 * injection strategy for damaging blocks does not take into account 6033 * evacuated blocks which may have already been damaged. 6034 */ 6035 if (ztest_device_removal_active) { 6036 mutex_exit(&ztest_vdev_lock); 6037 goto out; 6038 } 6039 6040 maxfaults = MAXFAULTS(zs); 6041 leaves = MAX(zs->zs_mirrors, 1) * ztest_opts.zo_raid_children; 6042 mirror_save = zs->zs_mirrors; 6043 mutex_exit(&ztest_vdev_lock); 6044 6045 ASSERT3U(leaves, >=, 1); 6046 6047 /* 6048 * While ztest is running the number of leaves will not change. This 6049 * is critical for the fault injection logic as it determines where 6050 * errors can be safely injected such that they are always repairable. 6051 * 6052 * When restarting ztest a different number of leaves may be requested 6053 * which will shift the regions to be damaged. This is fine as long 6054 * as the pool has been scrubbed prior to using the new mapping. 6055 * Failure to do so can result in non-repairable damage being injected. 6056 */ 6057 if (ztest_pool_scrubbed == B_FALSE) 6058 goto out; 6059 6060 /* 6061 * Grab the name lock as reader. There are some operations 6062 * which don't like to have their vdevs changed while 6063 * they are in progress (e.g. spa_change_guid). Those 6064 * operations will have grabbed the name lock as writer. 6065 */ 6066 (void) pthread_rwlock_rdlock(&ztest_name_lock); 6067 6068 /* 6069 * We need SCL_STATE here because we're going to look at vd0->vdev_tsd. 6070 */ 6071 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 6072 6073 if (ztest_random(2) == 0) { 6074 /* 6075 * Inject errors on a normal data device or slog device. 6076 */ 6077 top = ztest_random_vdev_top(spa, B_TRUE); 6078 leaf = ztest_random(leaves) + zs->zs_splits; 6079 6080 /* 6081 * Generate paths to the first leaf in this top-level vdev, 6082 * and to the random leaf we selected. We'll induce transient 6083 * write failures and random online/offline activity on leaf 0, 6084 * and we'll write random garbage to the randomly chosen leaf. 6085 */ 6086 (void) snprintf(path0, MAXPATHLEN, ztest_dev_template, 6087 ztest_opts.zo_dir, ztest_opts.zo_pool, 6088 top * leaves + zs->zs_splits); 6089 (void) snprintf(pathrand, MAXPATHLEN, ztest_dev_template, 6090 ztest_opts.zo_dir, ztest_opts.zo_pool, 6091 top * leaves + leaf); 6092 6093 vd0 = vdev_lookup_by_path(spa->spa_root_vdev, path0); 6094 if (vd0 != NULL && vd0->vdev_top->vdev_islog) 6095 islog = B_TRUE; 6096 6097 /* 6098 * If the top-level vdev needs to be resilvered 6099 * then we only allow faults on the device that is 6100 * resilvering.
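 * Faulting a second device while one is still being rebuilt could
 * exceed the top-level vdev's redundancy, so injection is limited
 * to the leaf whose vdev_resilver_txg is already set.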
6101 */ 6102 if (vd0 != NULL && maxfaults != 1 && 6103 (!vdev_resilver_needed(vd0->vdev_top, NULL, NULL) || 6104 vd0->vdev_resilver_txg != 0)) { 6105 /* 6106 * Make vd0 explicitly claim to be unreadable, 6107 * or unwritable, or reach behind its back 6108 * and close the underlying fd. We can do this if 6109 * maxfaults == 0 because we'll fail and reexecute, 6110 * and we can do it if maxfaults >= 2 because we'll 6111 * have enough redundancy. If maxfaults == 1, the 6112 * combination of this with injection of random data 6113 * corruption below exceeds the pool's fault tolerance. 6114 */ 6115 vdev_file_t *vf = vd0->vdev_tsd; 6116 6117 zfs_dbgmsg("injecting fault to vdev %llu; maxfaults=%d", 6118 (u_longlong_t)vd0->vdev_id, (int)maxfaults); 6119 6120 if (vf != NULL && ztest_random(3) == 0) { 6121 (void) close(vf->vf_file->f_fd); 6122 vf->vf_file->f_fd = -1; 6123 } else if (ztest_random(2) == 0) { 6124 vd0->vdev_cant_read = B_TRUE; 6125 } else { 6126 vd0->vdev_cant_write = B_TRUE; 6127 } 6128 guid0 = vd0->vdev_guid; 6129 } 6130 } else { 6131 /* 6132 * Inject errors on an l2cache device. 6133 */ 6134 spa_aux_vdev_t *sav = &spa->spa_l2cache; 6135 6136 if (sav->sav_count == 0) { 6137 spa_config_exit(spa, SCL_STATE, FTAG); 6138 (void) pthread_rwlock_unlock(&ztest_name_lock); 6139 goto out; 6140 } 6141 vd0 = sav->sav_vdevs[ztest_random(sav->sav_count)]; 6142 guid0 = vd0->vdev_guid; 6143 (void) strcpy(path0, vd0->vdev_path); 6144 (void) strcpy(pathrand, vd0->vdev_path); 6145 6146 leaf = 0; 6147 leaves = 1; 6148 maxfaults = INT_MAX; /* no limit on cache devices */ 6149 } 6150 6151 spa_config_exit(spa, SCL_STATE, FTAG); 6152 (void) pthread_rwlock_unlock(&ztest_name_lock); 6153 6154 /* 6155 * If we can tolerate two or more faults, or we're dealing 6156 * with a slog, randomly online/offline vd0. 6157 */ 6158 if ((maxfaults >= 2 || islog) && guid0 != 0) { 6159 if (ztest_random(10) < 6) { 6160 int flags = (ztest_random(2) == 0 ? 6161 ZFS_OFFLINE_TEMPORARY : 0); 6162 6163 /* 6164 * We have to grab the ztest_name_lock as writer to 6165 * prevent a race between offlining a slog and 6166 * destroying a dataset. Offlining the slog will 6167 * grab a reference on the dataset which may cause 6168 * dsl_destroy_head() to fail with EBUSY thus 6169 * leaving the dataset in an inconsistent state. 6170 */ 6171 if (islog) 6172 (void) pthread_rwlock_wrlock(&ztest_name_lock); 6173 6174 VERIFY3U(vdev_offline(spa, guid0, flags), !=, EBUSY); 6175 6176 if (islog) 6177 (void) pthread_rwlock_unlock(&ztest_name_lock); 6178 } else { 6179 /* 6180 * Ideally we would like to be able to randomly 6181 * call vdev_[on|off]line without holding locks 6182 * to force unpredictable failures but the side 6183 * effects of vdev_[on|off]line prevent us from 6184 * doing so. We grab the ztest_vdev_lock here to 6185 * prevent a race between injection testing and 6186 * aux_vdev removal. 6187 */ 6188 mutex_enter(&ztest_vdev_lock); 6189 (void) vdev_online(spa, guid0, 0, NULL); 6190 mutex_exit(&ztest_vdev_lock); 6191 } 6192 } 6193 6194 if (maxfaults == 0) 6195 goto out; 6196 6197 /* 6198 * We have at least single-fault tolerance, so inject data corruption.
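 * The corruption is applied directly to the randomly chosen leaf's
 * backing file: open() it, then pwrite() up to ~1000 bad words at
 * offsets chosen by the chunk arithmetic below, skipping the vdev
 * labels so the device itself remains importable.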
6199 */ 6200 fd = open(pathrand, O_RDWR); 6201 6202 if (fd == -1) /* we hit a gap in the device namespace */ 6203 goto out; 6204 6205 fsize = lseek(fd, 0, SEEK_END); 6206 6207 while (--iters != 0) { 6208 /* 6209 * The offset must be chosen carefully to ensure that 6210 * we do not inject a given logical block with errors 6211 * on two different leaf devices, because ZFS cannot 6212 * tolerate that (if maxfaults==1). 6213 * 6214 * To achieve this we divide each leaf device into 6215 * chunks of size (# leaves * SPA_MAXBLOCKSIZE * 4). 6216 * Each chunk is further divided into error-injection 6217 * ranges (can accept errors) and clear ranges (we do 6218 * not inject errors in those). Each error-injection 6219 * range can accept errors only for a single leaf vdev. 6220 * Error-injection ranges are separated by clear ranges. 6221 * 6222 * For example, with 3 leaves, each chunk looks like: 6223 * 0 to 32M: injection range for leaf 0 6224 * 32M to 64M: clear range - no injection allowed 6225 * 64M to 96M: injection range for leaf 1 6226 * 96M to 128M: clear range - no injection allowed 6227 * 128M to 160M: injection range for leaf 2 6228 * 160M to 192M: clear range - no injection allowed 6229 * 6230 * Each clear range must be large enough such that a 6231 * single block cannot straddle it. This way a block 6232 * can't be a target in two different injection ranges 6233 * (on different leaf vdevs). 6234 */ 6235 offset = ztest_random(fsize / (leaves << bshift)) * 6236 (leaves << bshift) + (leaf << bshift) + 6237 (ztest_random(1ULL << (bshift - 1)) & -8ULL); 6238 6239 /* 6240 * Only allow damage to the labels at one end of the vdev. 6241 * 6242 * If all labels are damaged, the device will be totally 6243 * inaccessible, which will result in loss of data, 6244 * because we also damage (parts of) the other side of 6245 * the mirror/raidz. 6246 * 6247 * Additionally, we will always have both an even and an 6248 * odd label, so that we can handle crashes in the 6249 * middle of vdev_config_sync(). 6250 */ 6251 if ((leaf & 1) == 0 && offset < VDEV_LABEL_START_SIZE) 6252 continue; 6253 6254 /* 6255 * The two end labels are stored at the "end" of the disk, but 6256 * the end of the disk (vdev_psize) is aligned to 6257 * sizeof (vdev_label_t). 6258 */ 6259 uint64_t psize = P2ALIGN(fsize, sizeof (vdev_label_t)); 6260 if ((leaf & 1) == 1 && 6261 offset + sizeof (bad) > psize - VDEV_LABEL_END_SIZE) 6262 continue; 6263 6264 mutex_enter(&ztest_vdev_lock); 6265 if (mirror_save != zs->zs_mirrors) { 6266 mutex_exit(&ztest_vdev_lock); 6267 (void) close(fd); 6268 goto out; 6269 } 6270 6271 if (pwrite(fd, &bad, sizeof (bad), offset) != sizeof (bad)) 6272 fatal(B_TRUE, 6273 "can't inject bad word at 0x%"PRIx64" in %s", 6274 offset, pathrand); 6275 6276 mutex_exit(&ztest_vdev_lock); 6277 6278 if (ztest_opts.zo_verbose >= 7) 6279 (void) printf("injected bad word into %s," 6280 " offset 0x%"PRIx64"\n", pathrand, offset); 6281 } 6282 6283 (void) close(fd); 6284 out: 6285 umem_free(path0, MAXPATHLEN); 6286 umem_free(pathrand, MAXPATHLEN); 6287 } 6288 6289 /* 6290 * By design ztest will never inject uncorrectable damage into the pool. 6291 * Issue a scrub, wait for it to complete, and verify there is never any 6292 * persistent damage. 6293 * 6294 * Only after a full scrub has been completed is it safe to start injecting 6295 * data corruption. See the comment in ztest_fault_inject().
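 *
 * If the error log is still non-empty once the scrub completes, the
 * damage was evidently not self-healed, and ECKSUM is returned so the
 * caller can fail loudly.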
6296 */ 6297 static int 6298 ztest_scrub_impl(spa_t *spa) 6299 { 6300 int error = spa_scan(spa, POOL_SCAN_SCRUB); 6301 if (error) 6302 return (error); 6303 6304 while (dsl_scan_scrubbing(spa_get_dsl(spa))) 6305 txg_wait_synced(spa_get_dsl(spa), 0); 6306 6307 if (spa_get_errlog_size(spa) > 0) 6308 return (ECKSUM); 6309 6310 ztest_pool_scrubbed = B_TRUE; 6311 6312 return (0); 6313 } 6314 6315 /* 6316 * Scrub the pool. 6317 */ 6318 void 6319 ztest_scrub(ztest_ds_t *zd, uint64_t id) 6320 { 6321 (void) zd, (void) id; 6322 spa_t *spa = ztest_spa; 6323 int error; 6324 6325 /* 6326 * A scrub may already be in progress as part of device removal. 6327 */ 6328 if (ztest_device_removal_active) 6329 return; 6330 6331 /* 6332 * Start a scrub, wait a moment, then force a restart. 6333 */ 6334 (void) spa_scan(spa, POOL_SCAN_SCRUB); 6335 (void) poll(NULL, 0, 100); 6336 6337 error = ztest_scrub_impl(spa); 6338 if (error == EBUSY) 6339 error = 0; 6340 ASSERT0(error); 6341 } 6342 6343 /* 6344 * Change the guid for the pool. 6345 */ 6346 void 6347 ztest_reguid(ztest_ds_t *zd, uint64_t id) 6348 { 6349 (void) zd, (void) id; 6350 spa_t *spa = ztest_spa; 6351 uint64_t orig, load; 6352 int error; 6353 6354 if (ztest_opts.zo_mmp_test) 6355 return; 6356 6357 orig = spa_guid(spa); 6358 load = spa_load_guid(spa); 6359 6360 (void) pthread_rwlock_wrlock(&ztest_name_lock); 6361 error = spa_change_guid(spa); 6362 (void) pthread_rwlock_unlock(&ztest_name_lock); 6363 6364 if (error != 0) 6365 return; 6366 6367 if (ztest_opts.zo_verbose >= 4) { 6368 (void) printf("Changed guid old %"PRIu64" -> %"PRIu64"\n", 6369 orig, spa_guid(spa)); 6370 } 6371 6372 VERIFY3U(orig, !=, spa_guid(spa)); 6373 VERIFY3U(load, ==, spa_load_guid(spa)); 6374 } 6375 6376 void 6377 ztest_fletcher(ztest_ds_t *zd, uint64_t id) 6378 { 6379 (void) zd, (void) id; 6380 hrtime_t end = gethrtime() + NANOSEC; 6381 6382 while (gethrtime() <= end) { 6383 int run_count = 100; 6384 void *buf; 6385 struct abd *abd_data, *abd_meta; 6386 uint32_t size; 6387 int *ptr; 6388 int i; 6389 zio_cksum_t zc_ref; 6390 zio_cksum_t zc_ref_byteswap; 6391 6392 size = ztest_random_blocksize(); 6393 6394 buf = umem_alloc(size, UMEM_NOFAIL); 6395 abd_data = abd_alloc(size, B_FALSE); 6396 abd_meta = abd_alloc(size, B_TRUE); 6397 6398 for (i = 0, ptr = buf; i < size / sizeof (*ptr); i++, ptr++) 6399 *ptr = ztest_random(UINT_MAX); 6400 6401 abd_copy_from_buf_off(abd_data, buf, 0, size); 6402 abd_copy_from_buf_off(abd_meta, buf, 0, size); 6403 6404 VERIFY0(fletcher_4_impl_set("scalar")); 6405 fletcher_4_native(buf, size, NULL, &zc_ref); 6406 fletcher_4_byteswap(buf, size, NULL, &zc_ref_byteswap); 6407 6408 VERIFY0(fletcher_4_impl_set("cycle")); 6409 while (run_count-- > 0) { 6410 zio_cksum_t zc; 6411 zio_cksum_t zc_byteswap; 6412 6413 fletcher_4_byteswap(buf, size, NULL, &zc_byteswap); 6414 fletcher_4_native(buf, size, NULL, &zc); 6415 6416 VERIFY0(memcmp(&zc, &zc_ref, sizeof (zc))); 6417 VERIFY0(memcmp(&zc_byteswap, &zc_ref_byteswap, 6418 sizeof (zc_byteswap))); 6419 6420 /* Test ABD - data */ 6421 abd_fletcher_4_byteswap(abd_data, size, NULL, 6422 &zc_byteswap); 6423 abd_fletcher_4_native(abd_data, size, NULL, &zc); 6424 6425 VERIFY0(memcmp(&zc, &zc_ref, sizeof (zc))); 6426 VERIFY0(memcmp(&zc_byteswap, &zc_ref_byteswap, 6427 sizeof (zc_byteswap))); 6428 6429 /* Test ABD - metadata */ 6430 abd_fletcher_4_byteswap(abd_meta, size, NULL, 6431 &zc_byteswap); 6432 abd_fletcher_4_native(abd_meta, size, NULL, &zc); 6433 6434 VERIFY0(memcmp(&zc, &zc_ref, sizeof (zc))); 6435 VERIFY0(memcmp(&zc_byteswap, &zc_ref_byteswap,
6436 sizeof (zc_byteswap))); 6437 6438 } 6439 6440 umem_free(buf, size); 6441 abd_free(abd_data); 6442 abd_free(abd_meta); 6443 } 6444 } 6445 6446 void 6447 ztest_fletcher_incr(ztest_ds_t *zd, uint64_t id) 6448 { 6449 (void) zd, (void) id; 6450 void *buf; 6451 size_t size; 6452 int *ptr; 6453 int i; 6454 zio_cksum_t zc_ref; 6455 zio_cksum_t zc_ref_bswap; 6456 6457 hrtime_t end = gethrtime() + NANOSEC; 6458 6459 while (gethrtime() <= end) { 6460 int run_count = 100; 6461 6462 size = ztest_random_blocksize(); 6463 buf = umem_alloc(size, UMEM_NOFAIL); 6464 6465 for (i = 0, ptr = buf; i < size / sizeof (*ptr); i++, ptr++) 6466 *ptr = ztest_random(UINT_MAX); 6467 6468 VERIFY0(fletcher_4_impl_set("scalar")); 6469 fletcher_4_native(buf, size, NULL, &zc_ref); 6470 fletcher_4_byteswap(buf, size, NULL, &zc_ref_bswap); 6471 6472 VERIFY0(fletcher_4_impl_set("cycle")); 6473 6474 while (run_count-- > 0) { 6475 zio_cksum_t zc; 6476 zio_cksum_t zc_bswap; 6477 size_t pos = 0; 6478 6479 ZIO_SET_CHECKSUM(&zc, 0, 0, 0, 0); 6480 ZIO_SET_CHECKSUM(&zc_bswap, 0, 0, 0, 0); 6481 6482 while (pos < size) { 6483 size_t inc = 64 * ztest_random(size / 67); 6484 /* sometimes add a few bytes to test the non-SIMD path */ 6485 if (ztest_random(100) < 10) 6486 inc += P2ALIGN(ztest_random(64), 6487 sizeof (uint32_t)); 6488 6489 if (inc > (size - pos)) 6490 inc = size - pos; 6491 6492 fletcher_4_incremental_native(buf + pos, inc, 6493 &zc); 6494 fletcher_4_incremental_byteswap(buf + pos, inc, 6495 &zc_bswap); 6496 6497 pos += inc; 6498 } 6499 6500 VERIFY3U(pos, ==, size); 6501 6502 VERIFY(ZIO_CHECKSUM_EQUAL(zc, zc_ref)); 6503 VERIFY(ZIO_CHECKSUM_EQUAL(zc_bswap, zc_ref_bswap)); 6504 6505 /* 6506 * Verify that incremental computation over the whole 6507 * buffer is equivalent to the non-incremental version. 6508 */ 6509 ZIO_SET_CHECKSUM(&zc, 0, 0, 0, 0); 6510 ZIO_SET_CHECKSUM(&zc_bswap, 0, 0, 0, 0); 6511 6512 fletcher_4_incremental_native(buf, size, &zc); 6513 fletcher_4_incremental_byteswap(buf, size, &zc_bswap); 6514 6515 VERIFY(ZIO_CHECKSUM_EQUAL(zc, zc_ref)); 6516 VERIFY(ZIO_CHECKSUM_EQUAL(zc_bswap, zc_ref_bswap)); 6517 } 6518 6519 umem_free(buf, size); 6520 } 6521 } 6522 6523 static int 6524 ztest_set_global_vars(void) 6525 { 6526 for (size_t i = 0; i < ztest_opts.zo_gvars_count; i++) { 6527 char *kv = ztest_opts.zo_gvars[i]; 6528 VERIFY3U(strlen(kv), <=, ZO_GVARS_MAX_ARGLEN); 6529 VERIFY3U(strlen(kv), >, 0); 6530 int err = set_global_var(kv); 6531 if (ztest_opts.zo_verbose > 0) { 6532 (void) printf("setting global var %s ... %s\n", kv, 6533 err ?
"failed" : "ok"); 6534 } 6535 if (err != 0) { 6536 (void) fprintf(stderr, 6537 "failed to set global var '%s'\n", kv); 6538 return (err); 6539 } 6540 } 6541 return (0); 6542 } 6543 6544 static char ** 6545 ztest_global_vars_to_zdb_args(void) 6546 { 6547 char **args = calloc(2*ztest_opts.zo_gvars_count + 1, sizeof (char *)); 6548 char **cur = args; 6549 for (size_t i = 0; i < ztest_opts.zo_gvars_count; i++) { 6550 char *kv = ztest_opts.zo_gvars[i]; 6551 *cur = "-o"; 6552 cur++; 6553 *cur = strdup(kv); 6554 cur++; 6555 } 6556 ASSERT3P(cur, ==, &args[2*ztest_opts.zo_gvars_count]); 6557 *cur = NULL; 6558 return (args); 6559 } 6560 6561 /* The end of strings is indicated by a NULL element */ 6562 static char * 6563 join_strings(char **strings, const char *sep) 6564 { 6565 size_t totallen = 0; 6566 for (char **sp = strings; *sp != NULL; sp++) { 6567 totallen += strlen(*sp); 6568 totallen += strlen(sep); 6569 } 6570 if (totallen > 0) { 6571 ASSERT(totallen >= strlen(sep)); 6572 totallen -= strlen(sep); 6573 } 6574 6575 size_t buflen = totallen + 1; 6576 char *o = malloc(buflen); /* trailing 0 byte */ 6577 o[0] = '\0'; 6578 for (char **sp = strings; *sp != NULL; sp++) { 6579 size_t would; 6580 would = strlcat(o, *sp, buflen); 6581 VERIFY3U(would, <, buflen); 6582 if (*(sp+1) == NULL) { 6583 break; 6584 } 6585 would = strlcat(o, sep, buflen); 6586 VERIFY3U(would, <, buflen); 6587 } 6588 ASSERT3S(strlen(o), ==, totallen); 6589 return (o); 6590 } 6591 6592 static int 6593 ztest_check_path(char *path) 6594 { 6595 struct stat s; 6596 /* return true on success */ 6597 return (!stat(path, &s)); 6598 } 6599 6600 static void 6601 ztest_get_zdb_bin(char *bin, int len) 6602 { 6603 char *zdb_path; 6604 /* 6605 * Try to use $ZDB and in-tree zdb path. If not successful, just 6606 * let popen to search through PATH. 
6607 */ 6608 if ((zdb_path = getenv("ZDB"))) { 6609 strlcpy(bin, zdb_path, len); /* In env */ 6610 if (!ztest_check_path(bin)) { 6611 ztest_dump_core = 0; 6612 fatal(B_TRUE, "invalid ZDB '%s'", bin); 6613 } 6614 return; 6615 } 6616 6617 VERIFY3P(realpath(getexecname(), bin), !=, NULL); 6618 if (strstr(bin, ".libs/ztest")) { 6619 strstr(bin, ".libs/ztest")[0] = '\0'; /* In-tree */ 6620 strcat(bin, "zdb"); 6621 if (ztest_check_path(bin)) 6622 return; 6623 } 6624 strcpy(bin, "zdb"); 6625 } 6626 6627 static vdev_t * 6628 ztest_random_concrete_vdev_leaf(vdev_t *vd) 6629 { 6630 if (vd == NULL) 6631 return (NULL); 6632 6633 if (vd->vdev_children == 0) 6634 return (vd); 6635 6636 vdev_t *eligible[vd->vdev_children]; 6637 int eligible_idx = 0, i; 6638 for (i = 0; i < vd->vdev_children; i++) { 6639 vdev_t *cvd = vd->vdev_child[i]; 6640 if (cvd->vdev_top->vdev_removing) 6641 continue; 6642 if (cvd->vdev_children > 0 || 6643 (vdev_is_concrete(cvd) && !cvd->vdev_detached)) { 6644 eligible[eligible_idx++] = cvd; 6645 } 6646 } 6647 VERIFY3S(eligible_idx, >, 0); 6648 6649 uint64_t child_no = ztest_random(eligible_idx); 6650 return (ztest_random_concrete_vdev_leaf(eligible[child_no])); 6651 } 6652 6653 void 6654 ztest_initialize(ztest_ds_t *zd, uint64_t id) 6655 { 6656 (void) zd, (void) id; 6657 spa_t *spa = ztest_spa; 6658 int error = 0; 6659 6660 mutex_enter(&ztest_vdev_lock); 6661 6662 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 6663 6664 /* Random leaf vdev */ 6665 vdev_t *rand_vd = ztest_random_concrete_vdev_leaf(spa->spa_root_vdev); 6666 if (rand_vd == NULL) { 6667 spa_config_exit(spa, SCL_VDEV, FTAG); 6668 mutex_exit(&ztest_vdev_lock); 6669 return; 6670 } 6671 6672 /* 6673 * The random vdev we've selected may change as soon as we 6674 * drop the spa_config_lock. We create local copies of things 6675 * we're interested in. 
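 * Specifically, the guid, path, and whether an initialize thread is
 * active are snapshotted here; dereferencing rand_vd after
 * spa_config_exit() would be unsafe.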
6676 */ 6677 uint64_t guid = rand_vd->vdev_guid; 6678 char *path = strdup(rand_vd->vdev_path); 6679 boolean_t active = rand_vd->vdev_initialize_thread != NULL; 6680 6681 zfs_dbgmsg("vd %px, guid %llu", rand_vd, (u_longlong_t)guid); 6682 spa_config_exit(spa, SCL_VDEV, FTAG); 6683 6684 uint64_t cmd = ztest_random(POOL_INITIALIZE_FUNCS); 6685 6686 nvlist_t *vdev_guids = fnvlist_alloc(); 6687 nvlist_t *vdev_errlist = fnvlist_alloc(); 6688 fnvlist_add_uint64(vdev_guids, path, guid); 6689 error = spa_vdev_initialize(spa, vdev_guids, cmd, vdev_errlist); 6690 fnvlist_free(vdev_guids); 6691 fnvlist_free(vdev_errlist); 6692 6693 switch (cmd) { 6694 case POOL_INITIALIZE_CANCEL: 6695 if (ztest_opts.zo_verbose >= 4) { 6696 (void) printf("Cancel initialize %s", path); 6697 if (!active) 6698 (void) printf(" failed (no initialize active)"); 6699 (void) printf("\n"); 6700 } 6701 break; 6702 case POOL_INITIALIZE_START: 6703 if (ztest_opts.zo_verbose >= 4) { 6704 (void) printf("Start initialize %s", path); 6705 if (active && error == 0) 6706 (void) printf(" failed (already active)"); 6707 else if (error != 0) 6708 (void) printf(" failed (error %d)", error); 6709 (void) printf("\n"); 6710 } 6711 break; 6712 case POOL_INITIALIZE_SUSPEND: 6713 if (ztest_opts.zo_verbose >= 4) { 6714 (void) printf("Suspend initialize %s", path); 6715 if (!active) 6716 (void) printf(" failed (no initialize active)"); 6717 (void) printf("\n"); 6718 } 6719 break; 6720 } 6721 free(path); 6722 mutex_exit(&ztest_vdev_lock); 6723 } 6724 6725 void 6726 ztest_trim(ztest_ds_t *zd, uint64_t id) 6727 { 6728 (void) zd, (void) id; 6729 spa_t *spa = ztest_spa; 6730 int error = 0; 6731 6732 mutex_enter(&ztest_vdev_lock); 6733 6734 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 6735 6736 /* Random leaf vdev */ 6737 vdev_t *rand_vd = ztest_random_concrete_vdev_leaf(spa->spa_root_vdev); 6738 if (rand_vd == NULL) { 6739 spa_config_exit(spa, SCL_VDEV, FTAG); 6740 mutex_exit(&ztest_vdev_lock); 6741 return; 6742 } 6743 6744 /* 6745 * The random vdev we've selected may change as soon as we 6746 * drop the spa_config_lock. We create local copies of things 6747 * we're interested in. 
6748 */ 6749 uint64_t guid = rand_vd->vdev_guid; 6750 char *path = strdup(rand_vd->vdev_path); 6751 boolean_t active = rand_vd->vdev_trim_thread != NULL; 6752 6753 zfs_dbgmsg("vd %p, guid %llu", rand_vd, (u_longlong_t)guid); 6754 spa_config_exit(spa, SCL_VDEV, FTAG); 6755 6756 uint64_t cmd = ztest_random(POOL_TRIM_FUNCS); 6757 uint64_t rate = 1 << ztest_random(30); 6758 boolean_t partial = (ztest_random(5) > 0); 6759 boolean_t secure = (ztest_random(5) > 0); 6760 6761 nvlist_t *vdev_guids = fnvlist_alloc(); 6762 nvlist_t *vdev_errlist = fnvlist_alloc(); 6763 fnvlist_add_uint64(vdev_guids, path, guid); 6764 error = spa_vdev_trim(spa, vdev_guids, cmd, rate, partial, 6765 secure, vdev_errlist); 6766 fnvlist_free(vdev_guids); 6767 fnvlist_free(vdev_errlist); 6768 6769 switch (cmd) { 6770 case POOL_TRIM_CANCEL: 6771 if (ztest_opts.zo_verbose >= 4) { 6772 (void) printf("Cancel TRIM %s", path); 6773 if (!active) 6774 (void) printf(" failed (no TRIM active)"); 6775 (void) printf("\n"); 6776 } 6777 break; 6778 case POOL_TRIM_START: 6779 if (ztest_opts.zo_verbose >= 4) { 6780 (void) printf("Start TRIM %s", path); 6781 if (active && error == 0) 6782 (void) printf(" failed (already active)"); 6783 else if (error != 0) 6784 (void) printf(" failed (error %d)", error); 6785 (void) printf("\n"); 6786 } 6787 break; 6788 case POOL_TRIM_SUSPEND: 6789 if (ztest_opts.zo_verbose >= 4) { 6790 (void) printf("Suspend TRIM %s", path); 6791 if (!active) 6792 (void) printf(" failed (no TRIM active)"); 6793 (void) printf("\n"); 6794 } 6795 break; 6796 } 6797 free(path); 6798 mutex_exit(&ztest_vdev_lock); 6799 } 6800 6801 /* 6802 * Verify pool integrity by running zdb. 6803 */ 6804 static void 6805 ztest_run_zdb(char *pool) 6806 { 6807 int status; 6808 char *bin; 6809 char *zdb; 6810 char *zbuf; 6811 const int len = MAXPATHLEN + MAXNAMELEN + 20; 6812 FILE *fp; 6813 6814 bin = umem_alloc(len, UMEM_NOFAIL); 6815 zdb = umem_alloc(len, UMEM_NOFAIL); 6816 zbuf = umem_alloc(1024, UMEM_NOFAIL); 6817 6818 ztest_get_zdb_bin(bin, len); 6819 6820 char **set_gvars_args = ztest_global_vars_to_zdb_args(); 6821 char *set_gvars_args_joined = join_strings(set_gvars_args, " "); 6822 free(set_gvars_args); 6823 6824 size_t would = snprintf(zdb, len, 6825 "%s -bcc%s%s -G -d -Y -e -y %s -p %s %s", 6826 bin, 6827 ztest_opts.zo_verbose >= 3 ? "s" : "", 6828 ztest_opts.zo_verbose >= 4 ? 
"v" : "", 6829 set_gvars_args_joined, 6830 ztest_opts.zo_dir, 6831 pool); 6832 ASSERT3U(would, <, len); 6833 6834 free(set_gvars_args_joined); 6835 6836 if (ztest_opts.zo_verbose >= 5) 6837 (void) printf("Executing %s\n", zdb); 6838 6839 fp = popen(zdb, "r"); 6840 6841 while (fgets(zbuf, 1024, fp) != NULL) 6842 if (ztest_opts.zo_verbose >= 3) 6843 (void) printf("%s", zbuf); 6844 6845 status = pclose(fp); 6846 6847 if (status == 0) 6848 goto out; 6849 6850 ztest_dump_core = 0; 6851 if (WIFEXITED(status)) 6852 fatal(B_FALSE, "'%s' exit code %d", zdb, WEXITSTATUS(status)); 6853 else 6854 fatal(B_FALSE, "'%s' died with signal %d", 6855 zdb, WTERMSIG(status)); 6856 out: 6857 umem_free(bin, len); 6858 umem_free(zdb, len); 6859 umem_free(zbuf, 1024); 6860 } 6861 6862 static void 6863 ztest_walk_pool_directory(char *header) 6864 { 6865 spa_t *spa = NULL; 6866 6867 if (ztest_opts.zo_verbose >= 6) 6868 (void) printf("%s\n", header); 6869 6870 mutex_enter(&spa_namespace_lock); 6871 while ((spa = spa_next(spa)) != NULL) 6872 if (ztest_opts.zo_verbose >= 6) 6873 (void) printf("\t%s\n", spa_name(spa)); 6874 mutex_exit(&spa_namespace_lock); 6875 } 6876 6877 static void 6878 ztest_spa_import_export(char *oldname, char *newname) 6879 { 6880 nvlist_t *config, *newconfig; 6881 uint64_t pool_guid; 6882 spa_t *spa; 6883 int error; 6884 6885 if (ztest_opts.zo_verbose >= 4) { 6886 (void) printf("import/export: old = %s, new = %s\n", 6887 oldname, newname); 6888 } 6889 6890 /* 6891 * Clean up from previous runs. 6892 */ 6893 (void) spa_destroy(newname); 6894 6895 /* 6896 * Get the pool's configuration and guid. 6897 */ 6898 VERIFY0(spa_open(oldname, &spa, FTAG)); 6899 6900 /* 6901 * Kick off a scrub to tickle scrub/export races. 6902 */ 6903 if (ztest_random(2) == 0) 6904 (void) spa_scan(spa, POOL_SCAN_SCRUB); 6905 6906 pool_guid = spa_guid(spa); 6907 spa_close(spa, FTAG); 6908 6909 ztest_walk_pool_directory("pools before export"); 6910 6911 /* 6912 * Export it. 6913 */ 6914 VERIFY0(spa_export(oldname, &config, B_FALSE, B_FALSE)); 6915 6916 ztest_walk_pool_directory("pools after export"); 6917 6918 /* 6919 * Try to import it. 6920 */ 6921 newconfig = spa_tryimport(config); 6922 ASSERT3P(newconfig, !=, NULL); 6923 fnvlist_free(newconfig); 6924 6925 /* 6926 * Import it under the new name. 6927 */ 6928 error = spa_import(newname, config, NULL, 0); 6929 if (error != 0) { 6930 dump_nvlist(config, 0); 6931 fatal(B_FALSE, "couldn't import pool %s as %s: error %u", 6932 oldname, newname, error); 6933 } 6934 6935 ztest_walk_pool_directory("pools after import"); 6936 6937 /* 6938 * Try to import it again -- should fail with EEXIST. 6939 */ 6940 VERIFY3U(EEXIST, ==, spa_import(newname, config, NULL, 0)); 6941 6942 /* 6943 * Try to import it under a different name -- should fail with EEXIST. 6944 */ 6945 VERIFY3U(EEXIST, ==, spa_import(oldname, config, NULL, 0)); 6946 6947 /* 6948 * Verify that the pool is no longer visible under the old name. 6949 */ 6950 VERIFY3U(ENOENT, ==, spa_open(oldname, &spa, FTAG)); 6951 6952 /* 6953 * Verify that we can open and close the pool using the new name. 
6954 */ 6955 VERIFY0(spa_open(newname, &spa, FTAG)); 6956 ASSERT3U(pool_guid, ==, spa_guid(spa)); 6957 spa_close(spa, FTAG); 6958 6959 fnvlist_free(config); 6960 } 6961 6962 static void 6963 ztest_resume(spa_t *spa) 6964 { 6965 if (spa_suspended(spa) && ztest_opts.zo_verbose >= 6) 6966 (void) printf("resuming from suspended state\n"); 6967 spa_vdev_state_enter(spa, SCL_NONE); 6968 vdev_clear(spa, NULL); 6969 (void) spa_vdev_state_exit(spa, NULL, 0); 6970 (void) zio_resume(spa); 6971 } 6972 6973 static __attribute__((noreturn)) void 6974 ztest_resume_thread(void *arg) 6975 { 6976 spa_t *spa = arg; 6977 6978 while (!ztest_exiting) { 6979 if (spa_suspended(spa)) 6980 ztest_resume(spa); 6981 (void) poll(NULL, 0, 100); 6982 6983 /* 6984 * Periodically change the zfs_compressed_arc_enabled setting. 6985 */ 6986 if (ztest_random(10) == 0) 6987 zfs_compressed_arc_enabled = ztest_random(2); 6988 6989 /* 6990 * Periodically change the zfs_abd_scatter_enabled setting. 6991 */ 6992 if (ztest_random(10) == 0) 6993 zfs_abd_scatter_enabled = ztest_random(2); 6994 } 6995 6996 thread_exit(); 6997 } 6998 6999 static __attribute__((noreturn)) void 7000 ztest_deadman_thread(void *arg) 7001 { 7002 ztest_shared_t *zs = arg; 7003 spa_t *spa = ztest_spa; 7004 hrtime_t delay, overdue, last_run = gethrtime(); 7005 7006 delay = (zs->zs_thread_stop - zs->zs_thread_start) + 7007 MSEC2NSEC(zfs_deadman_synctime_ms); 7008 7009 while (!ztest_exiting) { 7010 /* 7011 * Wait for the delay timer while checking occasionally 7012 * if we should stop. 7013 */ 7014 if (gethrtime() < last_run + delay) { 7015 (void) poll(NULL, 0, 1000); 7016 continue; 7017 } 7018 7019 /* 7020 * If the pool is suspended then fail immediately. Otherwise, 7021 * check to see if the pool is making any progress. If 7022 * vdev_deadman() discovers that there hasn't been any recent 7023 * I/O then it will end up aborting the tests. 7024 */ 7025 if (spa_suspended(spa) || spa->spa_root_vdev == NULL) { 7026 fatal(B_FALSE, 7027 "aborting test after %lu seconds because " 7028 "pool has transitioned to a suspended state.", 7029 zfs_deadman_synctime_ms / 1000); 7030 } 7031 vdev_deadman(spa->spa_root_vdev, FTAG); 7032 7033 /* 7034 * If the process doesn't complete within a grace period of 7035 * zfs_deadman_synctime_ms over the expected finish time, 7036 * then it may be hung and is terminated.
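 * The grace period is measured from zs_proc_stop, so a pass gets its
 * full configured runtime plus one zfs_deadman_synctime_ms interval
 * (five minutes by default here) before it is killed.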
7037 */ 7038 overdue = zs->zs_proc_stop + MSEC2NSEC(zfs_deadman_synctime_ms); 7039 if (gethrtime() > overdue) { 7040 fatal(B_FALSE, 7041 "aborting test after %llu seconds because " 7042 "the process is overdue for termination.", 7043 (gethrtime() - zs->zs_proc_start) / NANOSEC); 7044 } 7045 7046 (void) printf("ztest has been running for %lld seconds\n", 7047 (gethrtime() - zs->zs_proc_start) / NANOSEC); 7048 7049 last_run = gethrtime(); 7050 delay = MSEC2NSEC(zfs_deadman_checktime_ms); 7051 } 7052 7053 thread_exit(); 7054 } 7055 7056 static void 7057 ztest_execute(int test, ztest_info_t *zi, uint64_t id) 7058 { 7059 ztest_ds_t *zd = &ztest_ds[id % ztest_opts.zo_datasets]; 7060 ztest_shared_callstate_t *zc = ZTEST_GET_SHARED_CALLSTATE(test); 7061 hrtime_t functime = gethrtime(); 7062 int i; 7063 7064 for (i = 0; i < zi->zi_iters; i++) 7065 zi->zi_func(zd, id); 7066 7067 functime = gethrtime() - functime; 7068 7069 atomic_add_64(&zc->zc_count, 1); 7070 atomic_add_64(&zc->zc_time, functime); 7071 7072 if (ztest_opts.zo_verbose >= 4) 7073 (void) printf("%6.2f sec in %s\n", 7074 (double)functime / NANOSEC, zi->zi_funcname); 7075 } 7076 7077 static __attribute__((noreturn)) void 7078 ztest_thread(void *arg) 7079 { 7080 int rand; 7081 uint64_t id = (uintptr_t)arg; 7082 ztest_shared_t *zs = ztest_shared; 7083 uint64_t call_next; 7084 hrtime_t now; 7085 ztest_info_t *zi; 7086 ztest_shared_callstate_t *zc; 7087 7088 while ((now = gethrtime()) < zs->zs_thread_stop) { 7089 /* 7090 * See if it's time to force a crash. 7091 */ 7092 if (now > zs->zs_thread_kill) 7093 ztest_kill(zs); 7094 7095 /* 7096 * If we're getting ENOSPC with some regularity, stop. 7097 */ 7098 if (zs->zs_enospc_count > 10) 7099 break; 7100 7101 /* 7102 * Pick a random function to execute. 7103 */ 7104 rand = ztest_random(ZTEST_FUNCS); 7105 zi = &ztest_info[rand]; 7106 zc = ZTEST_GET_SHARED_CALLSTATE(rand); 7107 call_next = zc->zc_next; 7108 7109 if (now >= call_next && 7110 atomic_cas_64(&zc->zc_next, call_next, call_next + 7111 ztest_random(2 * zi->zi_interval[0] + 1)) == call_next) { 7112 ztest_execute(rand, zi, id); 7113 } 7114 } 7115 7116 thread_exit(); 7117 } 7118 7119 static void 7120 ztest_dataset_name(char *dsname, char *pool, int d) 7121 { 7122 (void) snprintf(dsname, ZFS_MAX_DATASET_NAME_LEN, "%s/ds_%d", pool, d); 7123 } 7124 7125 static void 7126 ztest_dataset_destroy(int d) 7127 { 7128 char name[ZFS_MAX_DATASET_NAME_LEN]; 7129 int t; 7130 7131 ztest_dataset_name(name, ztest_opts.zo_pool, d); 7132 7133 if (ztest_opts.zo_verbose >= 3) 7134 (void) printf("Destroying %s to free up space\n", name); 7135 7136 /* 7137 * Cleanup any non-standard clones and snapshots. In general, 7138 * ztest thread t operates on dataset (t % zopt_datasets), 7139 * so there may be more than one thing to clean up. 7140 */ 7141 for (t = d; t < ztest_opts.zo_threads; 7142 t += ztest_opts.zo_datasets) 7143 ztest_dsl_dataset_cleanup(name, t); 7144 7145 (void) dmu_objset_find(name, ztest_objset_destroy_cb, NULL, 7146 DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN); 7147 } 7148 7149 static void 7150 ztest_dataset_dirobj_verify(ztest_ds_t *zd) 7151 { 7152 uint64_t usedobjs, dirobjs, scratch; 7153 7154 /* 7155 * ZTEST_DIROBJ is the object directory for the entire dataset. 7156 * Therefore, the number of objects in use should equal the 7157 * number of ZTEST_DIROBJ entries, +1 for ZTEST_DIROBJ itself. 7158 * If not, we have an object leak. 
7159 * 7160 * Note that we can only check this in ztest_dataset_open(), 7161 * when the open-context and syncing-context values agree. 7162 * That's because zap_count() returns the open-context value, 7163 * while dmu_objset_space() returns the rootbp fill count. 7164 */ 7165 VERIFY0(zap_count(zd->zd_os, ZTEST_DIROBJ, &dirobjs)); 7166 dmu_objset_space(zd->zd_os, &scratch, &scratch, &usedobjs, &scratch); 7167 ASSERT3U(dirobjs + 1, ==, usedobjs); 7168 } 7169 7170 static int 7171 ztest_dataset_open(int d) 7172 { 7173 ztest_ds_t *zd = &ztest_ds[d]; 7174 uint64_t committed_seq = ZTEST_GET_SHARED_DS(d)->zd_seq; 7175 objset_t *os; 7176 zilog_t *zilog; 7177 char name[ZFS_MAX_DATASET_NAME_LEN]; 7178 int error; 7179 7180 ztest_dataset_name(name, ztest_opts.zo_pool, d); 7181 7182 (void) pthread_rwlock_rdlock(&ztest_name_lock); 7183 7184 error = ztest_dataset_create(name); 7185 if (error == ENOSPC) { 7186 (void) pthread_rwlock_unlock(&ztest_name_lock); 7187 ztest_record_enospc(FTAG); 7188 return (error); 7189 } 7190 ASSERT(error == 0 || error == EEXIST); 7191 7192 VERIFY0(ztest_dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, 7193 B_TRUE, zd, &os)); 7194 (void) pthread_rwlock_unlock(&ztest_name_lock); 7195 7196 ztest_zd_init(zd, ZTEST_GET_SHARED_DS(d), os); 7197 7198 zilog = zd->zd_zilog; 7199 7200 if (zilog->zl_header->zh_claim_lr_seq != 0 && 7201 zilog->zl_header->zh_claim_lr_seq < committed_seq) 7202 fatal(B_FALSE, "missing log records: " 7203 "claimed %"PRIu64" < committed %"PRIu64"", 7204 zilog->zl_header->zh_claim_lr_seq, committed_seq); 7205 7206 ztest_dataset_dirobj_verify(zd); 7207 7208 zil_replay(os, zd, ztest_replay_vector); 7209 7210 ztest_dataset_dirobj_verify(zd); 7211 7212 if (ztest_opts.zo_verbose >= 6) 7213 (void) printf("%s replay %"PRIu64" blocks, " 7214 "%"PRIu64" records, seq %"PRIu64"\n", 7215 zd->zd_name, 7216 zilog->zl_parse_blk_count, 7217 zilog->zl_parse_lr_count, 7218 zilog->zl_replaying_seq); 7219 7220 zilog = zil_open(os, ztest_get_data); 7221 7222 if (zilog->zl_replaying_seq != 0 && 7223 zilog->zl_replaying_seq < committed_seq) 7224 fatal(B_FALSE, "missing log records: " 7225 "replayed %"PRIu64" < committed %"PRIu64"", 7226 zilog->zl_replaying_seq, committed_seq); 7227 7228 return (0); 7229 } 7230 7231 static void 7232 ztest_dataset_close(int d) 7233 { 7234 ztest_ds_t *zd = &ztest_ds[d]; 7235 7236 zil_close(zd->zd_zilog); 7237 dmu_objset_disown(zd->zd_os, B_TRUE, zd); 7238 7239 ztest_zd_fini(zd); 7240 } 7241 7242 static int 7243 ztest_replay_zil_cb(const char *name, void *arg) 7244 { 7245 (void) arg; 7246 objset_t *os; 7247 ztest_ds_t *zdtmp; 7248 7249 VERIFY0(ztest_dmu_objset_own(name, DMU_OST_ANY, B_TRUE, 7250 B_TRUE, FTAG, &os)); 7251 7252 zdtmp = umem_alloc(sizeof (ztest_ds_t), UMEM_NOFAIL); 7253 7254 ztest_zd_init(zdtmp, NULL, os); 7255 zil_replay(os, zdtmp, ztest_replay_vector); 7256 ztest_zd_fini(zdtmp); 7257 7258 if (dmu_objset_zil(os)->zl_parse_lr_count != 0 && 7259 ztest_opts.zo_verbose >= 6) { 7260 zilog_t *zilog = dmu_objset_zil(os); 7261 7262 (void) printf("%s replay %"PRIu64" blocks, " 7263 "%"PRIu64" records, seq %"PRIu64"\n", 7264 name, 7265 zilog->zl_parse_blk_count, 7266 zilog->zl_parse_lr_count, 7267 zilog->zl_replaying_seq); 7268 } 7269 7270 umem_free(zdtmp, sizeof (ztest_ds_t)); 7271 7272 dmu_objset_disown(os, B_TRUE, FTAG); 7273 return (0); 7274 } 7275 7276 static void 7277 ztest_freeze(void) 7278 { 7279 ztest_ds_t *zd = &ztest_ds[0]; 7280 spa_t *spa; 7281 int numloops = 0; 7282 7283 if (ztest_opts.zo_verbose >= 3) 7284 (void) printf("testing 
spa_freeze()...\n"); 7285 7286 kernel_init(SPA_MODE_READ | SPA_MODE_WRITE); 7287 VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG)); 7288 VERIFY0(ztest_dataset_open(0)); 7289 ztest_spa = spa; 7290 7291 /* 7292 * Force the first log block to be transactionally allocated. 7293 * We have to do this before we freeze the pool -- otherwise 7294 * the log chain won't be anchored. 7295 */ 7296 while (BP_IS_HOLE(&zd->zd_zilog->zl_header->zh_log)) { 7297 ztest_dmu_object_alloc_free(zd, 0); 7298 zil_commit(zd->zd_zilog, 0); 7299 } 7300 7301 txg_wait_synced(spa_get_dsl(spa), 0); 7302 7303 /* 7304 * Freeze the pool. This stops spa_sync() from doing anything, 7305 * so that the only way to record changes from now on is the ZIL. 7306 */ 7307 spa_freeze(spa); 7308 7309 /* 7310 * Because it is hard to predict how much space a write will actually 7311 * require beforehand, we leave ourselves some fudge space to write over 7312 * capacity. 7313 */ 7314 uint64_t capacity = metaslab_class_get_space(spa_normal_class(spa)) / 2; 7315 7316 /* 7317 * Run tests that generate log records but don't alter the pool config 7318 * or depend on DSL sync tasks (snapshots, objset create/destroy, etc). 7319 * We do a txg_wait_synced() after each iteration to force the txg 7320 * to increase well beyond the last synced value in the uberblock. 7321 * The ZIL should be OK with that. 7322 * 7323 * Run a random number of times less than zo_maxloops and ensure we do 7324 * not run out of space on the pool. 7325 */ 7326 while (ztest_random(10) != 0 && 7327 numloops++ < ztest_opts.zo_maxloops && 7328 metaslab_class_get_alloc(spa_normal_class(spa)) < capacity) { 7329 ztest_od_t od; 7330 ztest_od_init(&od, 0, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0, 0); 7331 VERIFY0(ztest_object_init(zd, &od, sizeof (od), B_FALSE)); 7332 ztest_io(zd, od.od_object, 7333 ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT); 7334 txg_wait_synced(spa_get_dsl(spa), 0); 7335 } 7336 7337 /* 7338 * Commit all of the changes we just generated. 7339 */ 7340 zil_commit(zd->zd_zilog, 0); 7341 txg_wait_synced(spa_get_dsl(spa), 0); 7342 7343 /* 7344 * Close our dataset and close the pool. 7345 */ 7346 ztest_dataset_close(0); 7347 spa_close(spa, FTAG); 7348 kernel_fini(); 7349 7350 /* 7351 * Open and close the pool and dataset to induce log replay. 7352 */ 7353 kernel_init(SPA_MODE_READ | SPA_MODE_WRITE); 7354 VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG)); 7355 ASSERT3U(spa_freeze_txg(spa), ==, UINT64_MAX); 7356 VERIFY0(ztest_dataset_open(0)); 7357 ztest_spa = spa; 7358 txg_wait_synced(spa_get_dsl(spa), 0); 7359 ztest_dataset_close(0); 7360 ztest_reguid(NULL, 0); 7361 7362 spa_close(spa, FTAG); 7363 kernel_fini(); 7364 } 7365 7366 static void 7367 ztest_import_impl(void) 7368 { 7369 importargs_t args = { 0 }; 7370 nvlist_t *cfg = NULL; 7371 int nsearch = 1; 7372 char *searchdirs[nsearch]; 7373 int flags = ZFS_IMPORT_MISSING_LOG; 7374 7375 searchdirs[0] = ztest_opts.zo_dir; 7376 args.paths = nsearch; 7377 args.path = searchdirs; 7378 args.can_be_active = B_FALSE; 7379 7380 VERIFY0(zpool_find_config(NULL, ztest_opts.zo_pool, &cfg, &args, 7381 &libzpool_config_ops)); 7382 VERIFY0(spa_import(ztest_opts.zo_pool, cfg, NULL, flags)); 7383 fnvlist_free(cfg); 7384 } 7385 7386 /* 7387 * Import a storage pool with the given name. 
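 * The configuration is discovered by scanning ztest_opts.zo_dir with
 * zpool_find_config(), and the import itself is done by
 * ztest_import_impl() above using the ZFS_IMPORT_MISSING_LOG flag.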
7388 */ 7389 static void 7390 ztest_import(ztest_shared_t *zs) 7391 { 7392 spa_t *spa; 7393 7394 mutex_init(&ztest_vdev_lock, NULL, MUTEX_DEFAULT, NULL); 7395 mutex_init(&ztest_checkpoint_lock, NULL, MUTEX_DEFAULT, NULL); 7396 VERIFY0(pthread_rwlock_init(&ztest_name_lock, NULL)); 7397 7398 kernel_init(SPA_MODE_READ | SPA_MODE_WRITE); 7399 7400 ztest_import_impl(); 7401 7402 VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG)); 7403 zs->zs_metaslab_sz = 7404 1ULL << spa->spa_root_vdev->vdev_child[0]->vdev_ms_shift; 7405 spa_close(spa, FTAG); 7406 7407 kernel_fini(); 7408 7409 if (!ztest_opts.zo_mmp_test) { 7410 ztest_run_zdb(ztest_opts.zo_pool); 7411 ztest_freeze(); 7412 ztest_run_zdb(ztest_opts.zo_pool); 7413 } 7414 7415 (void) pthread_rwlock_destroy(&ztest_name_lock); 7416 mutex_destroy(&ztest_vdev_lock); 7417 mutex_destroy(&ztest_checkpoint_lock); 7418 } 7419 7420 /* 7421 * Kick off threads to run tests on all datasets in parallel. 7422 */ 7423 static void 7424 ztest_run(ztest_shared_t *zs) 7425 { 7426 spa_t *spa; 7427 objset_t *os; 7428 kthread_t *resume_thread, *deadman_thread; 7429 kthread_t **run_threads; 7430 uint64_t object; 7431 int error; 7432 int t, d; 7433 7434 ztest_exiting = B_FALSE; 7435 7436 /* 7437 * Initialize parent/child shared state. 7438 */ 7439 mutex_init(&ztest_vdev_lock, NULL, MUTEX_DEFAULT, NULL); 7440 mutex_init(&ztest_checkpoint_lock, NULL, MUTEX_DEFAULT, NULL); 7441 VERIFY0(pthread_rwlock_init(&ztest_name_lock, NULL)); 7442 7443 zs->zs_thread_start = gethrtime(); 7444 zs->zs_thread_stop = 7445 zs->zs_thread_start + ztest_opts.zo_passtime * NANOSEC; 7446 zs->zs_thread_stop = MIN(zs->zs_thread_stop, zs->zs_proc_stop); 7447 zs->zs_thread_kill = zs->zs_thread_stop; 7448 if (ztest_random(100) < ztest_opts.zo_killrate) { 7449 zs->zs_thread_kill -= 7450 ztest_random(ztest_opts.zo_passtime * NANOSEC); 7451 } 7452 7453 mutex_init(&zcl.zcl_callbacks_lock, NULL, MUTEX_DEFAULT, NULL); 7454 7455 list_create(&zcl.zcl_callbacks, sizeof (ztest_cb_data_t), 7456 offsetof(ztest_cb_data_t, zcd_node)); 7457 7458 /* 7459 * Open our pool. It may need to be imported first depending on 7460 * what tests were running when the previous pass was terminated. 7461 */ 7462 kernel_init(SPA_MODE_READ | SPA_MODE_WRITE); 7463 error = spa_open(ztest_opts.zo_pool, &spa, FTAG); 7464 if (error) { 7465 VERIFY3S(error, ==, ENOENT); 7466 ztest_import_impl(); 7467 VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG)); 7468 zs->zs_metaslab_sz = 7469 1ULL << spa->spa_root_vdev->vdev_child[0]->vdev_ms_shift; 7470 } 7471 7472 metaslab_preload_limit = ztest_random(20) + 1; 7473 ztest_spa = spa; 7474 7475 VERIFY0(vdev_raidz_impl_set("cycle")); 7476 7477 dmu_objset_stats_t dds; 7478 VERIFY0(ztest_dmu_objset_own(ztest_opts.zo_pool, 7479 DMU_OST_ANY, B_TRUE, B_TRUE, FTAG, &os)); 7480 dsl_pool_config_enter(dmu_objset_pool(os), FTAG); 7481 dmu_objset_fast_stat(os, &dds); 7482 dsl_pool_config_exit(dmu_objset_pool(os), FTAG); 7483 zs->zs_guid = dds.dds_guid; 7484 dmu_objset_disown(os, B_TRUE, FTAG); 7485 7486 /* 7487 * Create a thread to periodically resume suspended I/O. 7488 */ 7489 resume_thread = thread_create(NULL, 0, ztest_resume_thread, 7490 spa, 0, NULL, TS_RUN | TS_JOINABLE, defclsyspri); 7491 7492 /* 7493 * Create a deadman thread and set to panic if we hang. 
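 * In addition to the userland deadman thread, the spa's own deadman
 * is armed below (ZIO_FAILURE_MODE_PANIC) so that a hung I/O produces
 * a panic rather than a silent stall.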
7494 */ 7495 deadman_thread = thread_create(NULL, 0, ztest_deadman_thread, 7496 zs, 0, NULL, TS_RUN | TS_JOINABLE, defclsyspri); 7497 7498 spa->spa_deadman_failmode = ZIO_FAILURE_MODE_PANIC; 7499 7500 /* 7501 * Verify that we can safely inquire about any object, 7502 * whether it's allocated or not. To make it interesting, 7503 * we probe a 5-wide window around each power of two. 7504 * This hits all edge cases, including zero and the max. 7505 */ 7506 for (t = 0; t < 64; t++) { 7507 for (d = -5; d <= 5; d++) { 7508 error = dmu_object_info(spa->spa_meta_objset, 7509 (1ULL << t) + d, NULL); 7510 ASSERT(error == 0 || error == ENOENT || 7511 error == EINVAL); 7512 } 7513 } 7514 7515 /* 7516 * If we got any ENOSPC errors on the previous run, destroy something. 7517 */ 7518 if (zs->zs_enospc_count != 0) { 7519 int d = ztest_random(ztest_opts.zo_datasets); 7520 ztest_dataset_destroy(d); 7521 } 7522 zs->zs_enospc_count = 0; 7523 7524 /* 7525 * If we were in the middle of ztest_device_removal() and were killed 7526 * we need to ensure the removal and scrub complete before running 7527 * any tests that check ztest_device_removal_active. The removal will 7528 * be restarted automatically when the spa is opened, but we need to 7529 * initiate the scrub manually if it is not already in progress. Note 7530 * that we always run the scrub whenever an indirect vdev exists 7531 * because we have no way of knowing for sure if ztest_device_removal() 7532 * fully completed its scrub before the pool was reimported. 7533 */ 7534 if (spa->spa_removing_phys.sr_state == DSS_SCANNING || 7535 spa->spa_removing_phys.sr_prev_indirect_vdev != -1) { 7536 while (spa->spa_removing_phys.sr_state == DSS_SCANNING) 7537 txg_wait_synced(spa_get_dsl(spa), 0); 7538 7539 error = ztest_scrub_impl(spa); 7540 if (error == EBUSY) 7541 error = 0; 7542 ASSERT0(error); 7543 } 7544 7545 run_threads = umem_zalloc(ztest_opts.zo_threads * sizeof (kthread_t *), 7546 UMEM_NOFAIL); 7547 7548 if (ztest_opts.zo_verbose >= 4) 7549 (void) printf("starting main threads...\n"); 7550 7551 /* 7552 * Replay all logs of all datasets in the pool. This is primarily for 7553 * temporary datasets which wouldn't otherwise get replayed, which 7554 * can trigger failures when attempting to offline a SLOG in 7555 * ztest_fault_inject(). 7556 */ 7557 (void) dmu_objset_find(ztest_opts.zo_pool, ztest_replay_zil_cb, 7558 NULL, DS_FIND_CHILDREN); 7559 7560 /* 7561 * Kick off all the tests that run in parallel. 7562 */ 7563 for (t = 0; t < ztest_opts.zo_threads; t++) { 7564 if (t < ztest_opts.zo_datasets && ztest_dataset_open(t) != 0) { 7565 umem_free(run_threads, ztest_opts.zo_threads * 7566 sizeof (kthread_t *)); 7567 return; 7568 } 7569 7570 run_threads[t] = thread_create(NULL, 0, ztest_thread, 7571 (void *)(uintptr_t)t, 0, NULL, TS_RUN | TS_JOINABLE, 7572 defclsyspri); 7573 } 7574 7575 /* 7576 * Wait for all of the tests to complete. 7577 */ 7578 for (t = 0; t < ztest_opts.zo_threads; t++) 7579 VERIFY0(thread_join(run_threads[t])); 7580 7581 /* 7582 * Close all datasets. This must be done after all the threads 7583 * are joined so we can be sure none of the datasets are in-use 7584 * by any of the threads. 
7585 */ 7586 for (t = 0; t < ztest_opts.zo_threads; t++) { 7587 if (t < ztest_opts.zo_datasets) 7588 ztest_dataset_close(t); 7589 } 7590 7591 txg_wait_synced(spa_get_dsl(spa), 0); 7592 7593 zs->zs_alloc = metaslab_class_get_alloc(spa_normal_class(spa)); 7594 zs->zs_space = metaslab_class_get_space(spa_normal_class(spa)); 7595 7596 umem_free(run_threads, ztest_opts.zo_threads * sizeof (kthread_t *)); 7597 7598 /* Kill the resume and deadman threads */ 7599 ztest_exiting = B_TRUE; 7600 VERIFY0(thread_join(resume_thread)); 7601 VERIFY0(thread_join(deadman_thread)); 7602 ztest_resume(spa); 7603 7604 /* 7605 * Right before closing the pool, kick off a bunch of async I/O; 7606 * spa_close() should wait for it to complete. 7607 */ 7608 for (object = 1; object < 50; object++) { 7609 dmu_prefetch(spa->spa_meta_objset, object, 0, 0, 1ULL << 20, 7610 ZIO_PRIORITY_SYNC_READ); 7611 } 7612 7613 /* Verify that at least one commit cb was called in a timely fashion */ 7614 if (zc_cb_counter >= ZTEST_COMMIT_CB_MIN_REG) 7615 VERIFY0(zc_min_txg_delay); 7616 7617 spa_close(spa, FTAG); 7618 7619 /* 7620 * Verify that we can loop over all pools. 7621 */ 7622 mutex_enter(&spa_namespace_lock); 7623 for (spa = spa_next(NULL); spa != NULL; spa = spa_next(spa)) 7624 if (ztest_opts.zo_verbose > 3) 7625 (void) printf("spa_next: found %s\n", spa_name(spa)); 7626 mutex_exit(&spa_namespace_lock); 7627 7628 /* 7629 * Verify that we can export the pool and reimport it under a 7630 * different name. 7631 */ 7632 if ((ztest_random(2) == 0) && !ztest_opts.zo_mmp_test) { 7633 char name[ZFS_MAX_DATASET_NAME_LEN]; 7634 (void) snprintf(name, sizeof (name), "%s_import", 7635 ztest_opts.zo_pool); 7636 ztest_spa_import_export(ztest_opts.zo_pool, name); 7637 ztest_spa_import_export(name, ztest_opts.zo_pool); 7638 } 7639 7640 kernel_fini(); 7641 7642 list_destroy(&zcl.zcl_callbacks); 7643 mutex_destroy(&zcl.zcl_callbacks_lock); 7644 (void) pthread_rwlock_destroy(&ztest_name_lock); 7645 mutex_destroy(&ztest_vdev_lock); 7646 mutex_destroy(&ztest_checkpoint_lock); 7647 } 7648 7649 static void 7650 print_time(hrtime_t t, char *timebuf) 7651 { 7652 hrtime_t s = t / NANOSEC; 7653 hrtime_t m = s / 60; 7654 hrtime_t h = m / 60; 7655 hrtime_t d = h / 24; 7656 7657 s -= m * 60; 7658 m -= h * 60; 7659 h -= d * 24; 7660 7661 timebuf[0] = '\0'; 7662 7663 if (d) 7664 (void) sprintf(timebuf, 7665 "%llud%02lluh%02llum%02llus", d, h, m, s); 7666 else if (h) 7667 (void) sprintf(timebuf, "%lluh%02llum%02llus", h, m, s); 7668 else if (m) 7669 (void) sprintf(timebuf, "%llum%02llus", m, s); 7670 else 7671 (void) sprintf(timebuf, "%llus", s); 7672 } 7673 7674 static nvlist_t * 7675 make_random_props(void) 7676 { 7677 nvlist_t *props; 7678 7679 props = fnvlist_alloc(); 7680 7681 if (ztest_random(2) == 0) 7682 return (props); 7683 7684 fnvlist_add_uint64(props, 7685 zpool_prop_to_name(ZPOOL_PROP_AUTOREPLACE), 1); 7686 7687 return (props); 7688 } 7689 7690 /* 7691 * Create a storage pool with the given name and initial vdev size. 7692 * Then test spa_freeze() functionality. 7693 */ 7694 static void 7695 ztest_init(ztest_shared_t *zs) 7696 { 7697 spa_t *spa; 7698 nvlist_t *nvroot, *props; 7699 int i; 7700 7701 mutex_init(&ztest_vdev_lock, NULL, MUTEX_DEFAULT, NULL); 7702 mutex_init(&ztest_checkpoint_lock, NULL, MUTEX_DEFAULT, NULL); 7703 VERIFY0(pthread_rwlock_init(&ztest_name_lock, NULL)); 7704 7705 kernel_init(SPA_MODE_READ | SPA_MODE_WRITE); 7706 7707 /* 7708 * Create the storage pool. 
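 * make_vdev_root() assembles the root vdev from zo_vdev_size,
 * zo_raid_children, and zs_mirrors; random pool properties are added,
 * and every supported feature is explicitly enabled (the log spacemap
 * feature only ~75% of the time) before spa_create() is called.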
7709 */ 7710 (void) spa_destroy(ztest_opts.zo_pool); 7711 ztest_shared->zs_vdev_next_leaf = 0; 7712 zs->zs_splits = 0; 7713 zs->zs_mirrors = ztest_opts.zo_mirrors; 7714 nvroot = make_vdev_root(NULL, NULL, NULL, ztest_opts.zo_vdev_size, 0, 7715 NULL, ztest_opts.zo_raid_children, zs->zs_mirrors, 1); 7716 props = make_random_props(); 7717 7718 /* 7719 * We don't expect the pool to suspend unless maxfaults == 0, 7720 * in which case ztest_fault_inject() temporarily takes away 7721 * the only valid replica. 7722 */ 7723 fnvlist_add_uint64(props, 7724 zpool_prop_to_name(ZPOOL_PROP_FAILUREMODE), 7725 MAXFAULTS(zs) ? ZIO_FAILURE_MODE_PANIC : ZIO_FAILURE_MODE_WAIT); 7726 7727 for (i = 0; i < SPA_FEATURES; i++) { 7728 char *buf; 7729 7730 if (!spa_feature_table[i].fi_zfs_mod_supported) 7731 continue; 7732 7733 /* 7734 * 75% chance of using the log space map feature. We want ztest 7735 * to exercise both the code paths that use the log space map 7736 * feature and the ones that don't. 7737 */ 7738 if (i == SPA_FEATURE_LOG_SPACEMAP && ztest_random(4) == 0) 7739 continue; 7740 7741 VERIFY3S(-1, !=, asprintf(&buf, "feature@%s", 7742 spa_feature_table[i].fi_uname)); 7743 fnvlist_add_uint64(props, buf, 0); 7744 free(buf); 7745 } 7746 7747 VERIFY0(spa_create(ztest_opts.zo_pool, nvroot, props, NULL, NULL)); 7748 fnvlist_free(nvroot); 7749 fnvlist_free(props); 7750 7751 VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG)); 7752 zs->zs_metaslab_sz = 7753 1ULL << spa->spa_root_vdev->vdev_child[0]->vdev_ms_shift; 7754 spa_close(spa, FTAG); 7755 7756 kernel_fini(); 7757 7758 if (!ztest_opts.zo_mmp_test) { 7759 ztest_run_zdb(ztest_opts.zo_pool); 7760 ztest_freeze(); 7761 ztest_run_zdb(ztest_opts.zo_pool); 7762 } 7763 7764 (void) pthread_rwlock_destroy(&ztest_name_lock); 7765 mutex_destroy(&ztest_vdev_lock); 7766 mutex_destroy(&ztest_checkpoint_lock); 7767 } 7768 7769 static void 7770 setup_data_fd(void) 7771 { 7772 static char ztest_name_data[] = "/tmp/ztest.data.XXXXXX"; 7773 7774 ztest_fd_data = mkstemp(ztest_name_data); 7775 ASSERT3S(ztest_fd_data, >=, 0); 7776 (void) unlink(ztest_name_data); 7777 } 7778 7779 static int 7780 shared_data_size(ztest_shared_hdr_t *hdr) 7781 { 7782 int size; 7783 7784 size = hdr->zh_hdr_size; 7785 size += hdr->zh_opts_size; 7786 size += hdr->zh_size; 7787 size += hdr->zh_stats_size * hdr->zh_stats_count; 7788 size += hdr->zh_ds_size * hdr->zh_ds_count; 7789 7790 return (size); 7791 } 7792 7793 static void 7794 setup_hdr(void) 7795 { 7796 int size; 7797 ztest_shared_hdr_t *hdr; 7798 7799 hdr = (void *)mmap(0, P2ROUNDUP(sizeof (*hdr), getpagesize()), 7800 PROT_READ | PROT_WRITE, MAP_SHARED, ztest_fd_data, 0); 7801 ASSERT3P(hdr, !=, MAP_FAILED); 7802 7803 VERIFY0(ftruncate(ztest_fd_data, sizeof (ztest_shared_hdr_t))); 7804 7805 hdr->zh_hdr_size = sizeof (ztest_shared_hdr_t); 7806 hdr->zh_opts_size = sizeof (ztest_shared_opts_t); 7807 hdr->zh_size = sizeof (ztest_shared_t); 7808 hdr->zh_stats_size = sizeof (ztest_shared_callstate_t); 7809 hdr->zh_stats_count = ZTEST_FUNCS; 7810 hdr->zh_ds_size = sizeof (ztest_shared_ds_t); 7811 hdr->zh_ds_count = ztest_opts.zo_datasets; 7812 7813 size = shared_data_size(hdr); 7814 VERIFY0(ftruncate(ztest_fd_data, size)); 7815 7816 (void) munmap((caddr_t)hdr, P2ROUNDUP(sizeof (*hdr), getpagesize())); 7817 } 7818 7819 static void 7820 setup_data(void) 7821 { 7822 int size, offset; 7823 ztest_shared_hdr_t *hdr; 7824 uint8_t *buf; 7825 7826 hdr = (void *)mmap(0, P2ROUNDUP(sizeof (*hdr), getpagesize()), 7827 PROT_READ, MAP_SHARED, ztest_fd_data, 0); 7828 
ASSERT3P(hdr, !=, MAP_FAILED); 7829 7830 size = shared_data_size(hdr); 7831 7832 (void) munmap((caddr_t)hdr, P2ROUNDUP(sizeof (*hdr), getpagesize())); 7833 hdr = ztest_shared_hdr = (void *)mmap(0, P2ROUNDUP(size, getpagesize()), 7834 PROT_READ | PROT_WRITE, MAP_SHARED, ztest_fd_data, 0); 7835 ASSERT3P(hdr, !=, MAP_FAILED); 7836 buf = (uint8_t *)hdr; 7837 7838 offset = hdr->zh_hdr_size; 7839 ztest_shared_opts = (void *)&buf[offset]; 7840 offset += hdr->zh_opts_size; 7841 ztest_shared = (void *)&buf[offset]; 7842 offset += hdr->zh_size; 7843 ztest_shared_callstate = (void *)&buf[offset]; 7844 offset += hdr->zh_stats_size * hdr->zh_stats_count; 7845 ztest_shared_ds = (void *)&buf[offset]; 7846 } 7847 7848 static boolean_t 7849 exec_child(char *cmd, char *libpath, boolean_t ignorekill, int *statusp) 7850 { 7851 pid_t pid; 7852 int status; 7853 char *cmdbuf = NULL; 7854 7855 pid = fork(); 7856 7857 if (cmd == NULL) { 7858 cmdbuf = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); 7859 (void) strlcpy(cmdbuf, getexecname(), MAXPATHLEN); 7860 cmd = cmdbuf; 7861 } 7862 7863 if (pid == -1) 7864 fatal(B_TRUE, "fork failed"); 7865 7866 if (pid == 0) { /* child */ 7867 char fd_data_str[12]; 7868 7869 VERIFY3S(11, >=, 7870 snprintf(fd_data_str, 12, "%d", ztest_fd_data)); 7871 VERIFY0(setenv("ZTEST_FD_DATA", fd_data_str, 1)); 7872 7873 if (libpath != NULL) { 7874 const char *curlp = getenv("LD_LIBRARY_PATH"); 7875 if (curlp == NULL) 7876 VERIFY0(setenv("LD_LIBRARY_PATH", libpath, 1)); 7877 else { 7878 char *newlp = NULL; 7879 VERIFY3S(-1, !=, 7880 asprintf(&newlp, "%s:%s", libpath, curlp)); 7881 VERIFY0(setenv("LD_LIBRARY_PATH", newlp, 1)); 7882 } 7883 } 7884 (void) execl(cmd, cmd, (char *)NULL); 7885 ztest_dump_core = B_FALSE; 7886 fatal(B_TRUE, "exec failed: %s", cmd); 7887 } 7888 7889 if (cmdbuf != NULL) { 7890 umem_free(cmdbuf, MAXPATHLEN); 7891 cmd = NULL; 7892 } 7893 7894 while (waitpid(pid, &status, 0) != pid) 7895 continue; 7896 if (statusp != NULL) 7897 *statusp = status; 7898 7899 if (WIFEXITED(status)) { 7900 if (WEXITSTATUS(status) != 0) { 7901 (void) fprintf(stderr, "child exited with code %d\n", 7902 WEXITSTATUS(status)); 7903 exit(2); 7904 } 7905 return (B_FALSE); 7906 } else if (WIFSIGNALED(status)) { 7907 if (!ignorekill || WTERMSIG(status) != SIGKILL) { 7908 (void) fprintf(stderr, "child died with signal %d\n", 7909 WTERMSIG(status)); 7910 exit(3); 7911 } 7912 return (B_TRUE); 7913 } else { 7914 (void) fprintf(stderr, "something strange happened to child\n"); 7915 exit(4); 7916 } 7917 } 7918 7919 static void 7920 ztest_run_init(void) 7921 { 7922 int i; 7923 7924 ztest_shared_t *zs = ztest_shared; 7925 7926 /* 7927 * Blow away any existing copy of zpool.cache 7928 */ 7929 (void) remove(spa_config_path); 7930 7931 if (ztest_opts.zo_init == 0) { 7932 if (ztest_opts.zo_verbose >= 1) 7933 (void) printf("Importing pool %s\n", 7934 ztest_opts.zo_pool); 7935 ztest_import(zs); 7936 return; 7937 } 7938 7939 /* 7940 * Create and initialize our storage pool. 
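 * zo_init selects how many create/freeze/verify passes to make; each
 * pass clears the shared state and rebuilds the pool from scratch via
 * ztest_init().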
7941 */ 7942 for (i = 1; i <= ztest_opts.zo_init; i++) { 7943 memset(zs, 0, sizeof (*zs)); 7944 if (ztest_opts.zo_verbose >= 3 && 7945 ztest_opts.zo_init != 1) { 7946 (void) printf("ztest_init(), pass %d\n", i); 7947 } 7948 ztest_init(zs); 7949 } 7950 } 7951 7952 int 7953 main(int argc, char **argv) 7954 { 7955 int kills = 0; 7956 int iters = 0; 7957 int older = 0; 7958 int newer = 0; 7959 ztest_shared_t *zs; 7960 ztest_info_t *zi; 7961 ztest_shared_callstate_t *zc; 7962 char timebuf[100]; 7963 char numbuf[NN_NUMBUF_SZ]; 7964 char *cmd; 7965 boolean_t hasalt; 7966 int f, err; 7967 char *fd_data_str = getenv("ZTEST_FD_DATA"); 7968 struct sigaction action; 7969 7970 (void) setvbuf(stdout, NULL, _IOLBF, 0); 7971 7972 dprintf_setup(&argc, argv); 7973 zfs_deadman_synctime_ms = 300000; 7974 zfs_deadman_checktime_ms = 30000; 7975 /* 7976 * As two-word space map entries may not come up often (especially 7977 * if pool and vdev sizes are small), we want to force at least some 7978 * of them so the feature gets tested. 7979 */ 7980 zfs_force_some_double_word_sm_entries = B_TRUE; 7981 7982 /* 7983 * Verify that even extensively damaged split blocks with many 7984 * segments can be reconstructed in a reasonable amount of time 7985 * when reconstruction is known to be possible. 7986 * 7987 * Note: the lower this value is, the more damage we inflict, and 7988 * the more time ztest spends in recovering that damage. We chose 7989 * to induce damage 1/100th of the time so recovery is tested but 7990 * not so frequently that ztest doesn't get to test other code paths. 7991 */ 7992 zfs_reconstruct_indirect_damage_fraction = 100; 7993 7994 action.sa_handler = sig_handler; 7995 sigemptyset(&action.sa_mask); 7996 action.sa_flags = 0; 7997 7998 if (sigaction(SIGSEGV, &action, NULL) < 0) { 7999 (void) fprintf(stderr, "ztest: cannot catch SIGSEGV: %s.\n", 8000 strerror(errno)); 8001 exit(EXIT_FAILURE); 8002 } 8003 8004 if (sigaction(SIGABRT, &action, NULL) < 0) { 8005 (void) fprintf(stderr, "ztest: cannot catch SIGABRT: %s.\n", 8006 strerror(errno)); 8007 exit(EXIT_FAILURE); 8008 } 8009 8010 /* 8011 * Force random_get_bytes() to use /dev/urandom in order to prevent 8012 * ztest from needlessly depleting the system entropy pool.
int
main(int argc, char **argv)
{
	int kills = 0;
	int iters = 0;
	int older = 0;
	int newer = 0;
	ztest_shared_t *zs;
	ztest_info_t *zi;
	ztest_shared_callstate_t *zc;
	char timebuf[100];
	char numbuf[NN_NUMBUF_SZ];
	char *cmd;
	boolean_t hasalt;
	int f, err;
	char *fd_data_str = getenv("ZTEST_FD_DATA");
	struct sigaction action;

	(void) setvbuf(stdout, NULL, _IOLBF, 0);

	dprintf_setup(&argc, argv);
	zfs_deadman_synctime_ms = 300000;
	zfs_deadman_checktime_ms = 30000;
	/*
	 * As two-word space map entries may not come up often (especially
	 * if pool and vdev sizes are small), we want to force at least some
	 * of them so the feature gets tested.
	 */
	zfs_force_some_double_word_sm_entries = B_TRUE;

	/*
	 * Verify that even extensively damaged split blocks with many
	 * segments can be reconstructed in a reasonable amount of time
	 * when reconstruction is known to be possible.
	 *
	 * Note: the lower this value is, the more damage we inflict, and
	 * the more time ztest spends in recovering that damage. We chose
	 * to induce damage 1/100th of the time so recovery is tested but
	 * not so frequently that ztest doesn't get to test other code paths.
	 */
	zfs_reconstruct_indirect_damage_fraction = 100;

	action.sa_handler = sig_handler;
	sigemptyset(&action.sa_mask);
	action.sa_flags = 0;

	if (sigaction(SIGSEGV, &action, NULL) < 0) {
		(void) fprintf(stderr, "ztest: cannot catch SIGSEGV: %s.\n",
		    strerror(errno));
		exit(EXIT_FAILURE);
	}

	if (sigaction(SIGABRT, &action, NULL) < 0) {
		(void) fprintf(stderr, "ztest: cannot catch SIGABRT: %s.\n",
		    strerror(errno));
		exit(EXIT_FAILURE);
	}

	/*
	 * Force random_get_bytes() to use /dev/urandom in order to prevent
	 * ztest from needlessly depleting the system entropy pool.
	 */
	random_path = "/dev/urandom";
	ztest_fd_rand = open(random_path, O_RDONLY | O_CLOEXEC);
	ASSERT3S(ztest_fd_rand, >=, 0);

	if (!fd_data_str) {
		process_options(argc, argv);

		setup_data_fd();
		setup_hdr();
		setup_data();
		memcpy(ztest_shared_opts, &ztest_opts,
		    sizeof (*ztest_shared_opts));
	} else {
		ztest_fd_data = atoi(fd_data_str);
		setup_data();
		memcpy(&ztest_opts, ztest_shared_opts, sizeof (ztest_opts));
	}
	ASSERT3U(ztest_opts.zo_datasets, ==, ztest_shared_hdr->zh_ds_count);

	err = ztest_set_global_vars();
	if (err != 0 && !fd_data_str) {
		/* error message done by ztest_set_global_vars */
		exit(EXIT_FAILURE);
	} else {
		/* children should not be spawned if setting gvars fails */
		VERIFY3S(err, ==, 0);
	}

	/* Override location of zpool.cache */
	VERIFY3S(asprintf((char **)&spa_config_path, "%s/zpool.cache",
	    ztest_opts.zo_dir), !=, -1);

	ztest_ds = umem_alloc(ztest_opts.zo_datasets * sizeof (ztest_ds_t),
	    UMEM_NOFAIL);
	zs = ztest_shared;

	if (fd_data_str) {
		metaslab_force_ganging = ztest_opts.zo_metaslab_force_ganging;
		metaslab_df_alloc_threshold =
		    zs->zs_metaslab_df_alloc_threshold;

		if (zs->zs_do_init)
			ztest_run_init();
		else
			ztest_run(zs);
		exit(0);
	}

	hasalt = (strlen(ztest_opts.zo_alt_ztest) != 0);

	if (ztest_opts.zo_verbose >= 1) {
		(void) printf("%"PRIu64" vdevs, %d datasets, %d threads, "
		    "%d %s disks, %"PRIu64" seconds...\n\n",
		    ztest_opts.zo_vdevs,
		    ztest_opts.zo_datasets,
		    ztest_opts.zo_threads,
		    ztest_opts.zo_raid_children,
		    ztest_opts.zo_raid_type,
		    ztest_opts.zo_time);
	}

	cmd = umem_alloc(MAXNAMELEN, UMEM_NOFAIL);
	(void) strlcpy(cmd, getexecname(), MAXNAMELEN);

	zs->zs_do_init = B_TRUE;
	if (hasalt) {
		if (ztest_opts.zo_verbose >= 1) {
			(void) printf("Executing older ztest for "
			    "initialization: %s\n", ztest_opts.zo_alt_ztest);
		}
		VERIFY(!exec_child(ztest_opts.zo_alt_ztest,
		    ztest_opts.zo_alt_libpath, B_FALSE, NULL));
	} else {
		VERIFY(!exec_child(NULL, NULL, B_FALSE, NULL));
	}
	zs->zs_do_init = B_FALSE;

	zs->zs_proc_start = gethrtime();
	zs->zs_proc_stop = zs->zs_proc_start + ztest_opts.zo_time * NANOSEC;

	for (f = 0; f < ZTEST_FUNCS; f++) {
		zi = &ztest_info[f];
		zc = ZTEST_GET_SHARED_CALLSTATE(f);
		if (zs->zs_proc_start + zi->zi_interval[0] > zs->zs_proc_stop)
			zc->zc_next = UINT64_MAX;
		else
			zc->zc_next = zs->zs_proc_start +
			    ztest_random(2 * zi->zi_interval[0] + 1);
	}
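
	/*
	 * Scheduling note: ztest_random(n) returns a uniform value in
	 * [0, n), so the first firing time chosen above lands within
	 * twice the function's minimum interval of zs_proc_start and
	 * averages zi_interval[0].
	 */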
8115 */ 8116 for (f = 0; f < ZTEST_FUNCS; f++) { 8117 zc = ZTEST_GET_SHARED_CALLSTATE(f); 8118 zc->zc_count = 0; 8119 zc->zc_time = 0; 8120 } 8121 8122 /* Set the allocation switch size */ 8123 zs->zs_metaslab_df_alloc_threshold = 8124 ztest_random(zs->zs_metaslab_sz / 4) + 1; 8125 8126 if (!hasalt || ztest_random(2) == 0) { 8127 if (hasalt && ztest_opts.zo_verbose >= 1) { 8128 (void) printf("Executing newer ztest: %s\n", 8129 cmd); 8130 } 8131 newer++; 8132 killed = exec_child(cmd, NULL, B_TRUE, &status); 8133 } else { 8134 if (hasalt && ztest_opts.zo_verbose >= 1) { 8135 (void) printf("Executing older ztest: %s\n", 8136 ztest_opts.zo_alt_ztest); 8137 } 8138 older++; 8139 killed = exec_child(ztest_opts.zo_alt_ztest, 8140 ztest_opts.zo_alt_libpath, B_TRUE, &status); 8141 } 8142 8143 if (killed) 8144 kills++; 8145 iters++; 8146 8147 if (ztest_opts.zo_verbose >= 1) { 8148 hrtime_t now = gethrtime(); 8149 8150 now = MIN(now, zs->zs_proc_stop); 8151 print_time(zs->zs_proc_stop - now, timebuf); 8152 nicenum(zs->zs_space, numbuf, sizeof (numbuf)); 8153 8154 (void) printf("Pass %3d, %8s, %3"PRIu64" ENOSPC, " 8155 "%4.1f%% of %5s used, %3.0f%% done, %8s to go\n", 8156 iters, 8157 WIFEXITED(status) ? "Complete" : "SIGKILL", 8158 zs->zs_enospc_count, 8159 100.0 * zs->zs_alloc / zs->zs_space, 8160 numbuf, 8161 100.0 * (now - zs->zs_proc_start) / 8162 (ztest_opts.zo_time * NANOSEC), timebuf); 8163 } 8164 8165 if (ztest_opts.zo_verbose >= 2) { 8166 (void) printf("\nWorkload summary:\n\n"); 8167 (void) printf("%7s %9s %s\n", 8168 "Calls", "Time", "Function"); 8169 (void) printf("%7s %9s %s\n", 8170 "-----", "----", "--------"); 8171 for (f = 0; f < ZTEST_FUNCS; f++) { 8172 zi = &ztest_info[f]; 8173 zc = ZTEST_GET_SHARED_CALLSTATE(f); 8174 print_time(zc->zc_time, timebuf); 8175 (void) printf("%7"PRIu64" %9s %s\n", 8176 zc->zc_count, timebuf, 8177 zi->zi_funcname); 8178 } 8179 (void) printf("\n"); 8180 } 8181 8182 if (!ztest_opts.zo_mmp_test) 8183 ztest_run_zdb(ztest_opts.zo_pool); 8184 } 8185 8186 if (ztest_opts.zo_verbose >= 1) { 8187 if (hasalt) { 8188 (void) printf("%d runs of older ztest: %s\n", older, 8189 ztest_opts.zo_alt_ztest); 8190 (void) printf("%d runs of newer ztest: %s\n", newer, 8191 cmd); 8192 } 8193 (void) printf("%d killed, %d completed, %.0f%% kill rate\n", 8194 kills, iters - kills, (100.0 * kills) / MAX(1, iters)); 8195 } 8196 8197 umem_free(cmd, MAXNAMELEN); 8198 8199 return (0); 8200 } 8201