1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright (c) 2011, 2018 by Delphix. All rights reserved. 24 * Copyright 2011 Nexenta Systems, Inc. All rights reserved. 25 * Copyright (c) 2013 Steven Hartland. All rights reserved. 26 * Copyright (c) 2014 Integros [integros.com] 27 * Copyright 2017 Joyent, Inc. 28 * Copyright (c) 2017, Intel Corporation. 29 */ 30 31 /* 32 * The objective of this program is to provide a DMU/ZAP/SPA stress test 33 * that runs entirely in userland, is easy to use, and easy to extend. 34 * 35 * The overall design of the ztest program is as follows: 36 * 37 * (1) For each major functional area (e.g. adding vdevs to a pool, 38 * creating and destroying datasets, reading and writing objects, etc) 39 * we have a simple routine to test that functionality. These 40 * individual routines do not have to do anything "stressful". 41 * 42 * (2) We turn these simple functionality tests into a stress test by 43 * running them all in parallel, with as many threads as desired, 44 * and spread across as many datasets, objects, and vdevs as desired. 
 *
 * (3) While all this is happening, we inject faults into the pool to
 *     verify that self-healing data really works.
 *
 * (4) Every time we open a dataset, we change its checksum and compression
 *     functions.  Thus even individual objects vary from block to block
 *     in which checksum they use and whether they're compressed.
 *
 * (5) To verify that we never lose on-disk consistency after a crash,
 *     we run the entire test in a child of the main process.
 *     At random times, the child self-immolates with a SIGKILL.
 *     This is the software equivalent of pulling the power cord.
 *     The parent then runs the test again, using the existing
 *     storage pool, as many times as desired.  If backwards compatibility
 *     testing is enabled, ztest will sometimes run the "older" version
 *     of ztest after a SIGKILL.
 *
 * (6) To verify that we don't have future leaks or temporal incursions,
 *     many of the functional tests record the transaction group number
 *     as part of their data.  When reading old data, they verify that
 *     the transaction group number is less than the current, open txg.
 *     If you add a new test, please do this if applicable.
 *
 * (7) Threads are created with a reduced stack size, for sanity checking.
 *     Therefore, it's important not to allocate huge buffers on the stack.
 *
 * When run with no arguments, ztest runs for about five minutes and
 * produces no output if successful.  To get a little bit of information,
 * specify -V.  To get more information, specify -VV, and so on.
 *
 * To turn this into an overnight stress test, use -T to specify run time.
 *
 * You can ask for more vdevs [-v], datasets [-d], or threads [-t]
 * to increase the pool capacity, fanout, and overall stress level.
 *
 * Use the -k option to set the desired frequency of kills.
 *
 * When ztest invokes itself it passes all relevant information through a
 * temporary file which is mmap-ed in the child process.
This allows shared
 * memory to survive the exec syscall.  The ztest_shared_hdr_t struct is always
 * stored at offset 0 of this file and contains information on the size and
 * number of shared structures in the file.  The information stored in this file
 * must remain backwards compatible with older versions of ztest so that
 * ztest can invoke them during backwards compatibility testing (-B).
 */

#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/dmu.h>
#include <sys/txg.h>
#include <sys/dbuf.h>
#include <sys/zap.h>
#include <sys/dmu_objset.h>
#include <sys/poll.h>
#include <sys/stat.h>
#include <sys/time.h>
#include <sys/wait.h>
#include <sys/mman.h>
#include <sys/resource.h>
#include <sys/zio.h>
#include <sys/zil.h>
#include <sys/zil_impl.h>
#include <sys/vdev_draid.h>
#include <sys/vdev_impl.h>
#include <sys/vdev_file.h>
#include <sys/vdev_initialize.h>
#include <sys/vdev_raidz.h>
#include <sys/vdev_trim.h>
#include <sys/spa_impl.h>
#include <sys/metaslab_impl.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_destroy.h>
#include <sys/dsl_scan.h>
#include <sys/zio_checksum.h>
#include <sys/zfs_refcount.h>
#include <sys/zfeature.h>
#include <sys/dsl_userhold.h>
#include <sys/abd.h>
#include <sys/blake3.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <getopt.h>
#include <signal.h>
#include <umem.h>
#include <ctype.h>
#include <math.h>
#include <sys/fs/zfs.h>
#include <zfs_fletcher.h>
#include <libnvpair.h>
#include <libzutil.h>
#include <sys/crypto/icp.h>
#if (__GLIBC__ && !__UCLIBC__)
#include <execinfo.h> /* for backtrace() */
#endif

/*
 * File descriptors, initialized to -1 (not open).  ztest_random() reads its
 * random bytes from ztest_fd_rand; presumably ztest_fd_data refers to the
 * shared temporary file described above -- TODO confirm against the code
 * that opens it.
 */
static int ztest_fd_data = -1;
static int ztest_fd_rand = -1;

/*
 * Header stored at offset 0 of the shared file (see the comment at the top
 * of this file).  It records the size and number of the shared structures
 * that follow it.  Because older ztest binaries may be handed this file, the
 * layout must stay backwards compatible: do not reorder or resize fields.
 */
typedef struct ztest_shared_hdr {
	uint64_t zh_hdr_size;
	uint64_t zh_opts_size;
	uint64_t zh_size;
	uint64_t zh_stats_size;
	uint64_t zh_stats_count;
	uint64_t zh_ds_size;
	uint64_t zh_ds_count;
} ztest_shared_hdr_t;

static ztest_shared_hdr_t *ztest_shared_hdr;

/*
 * Tri-state selected with the -C option ("on", "off" or "random"); see
 * ztest_parse_name_value(), which stores it in zo_special_vdevs.
 */
enum ztest_class_state {
	ZTEST_VDEV_CLASS_OFF,
	ZTEST_VDEV_CLASS_ON,
	ZTEST_VDEV_CLASS_RND
};

/* Limits on "-o" global variable arguments: length of each, and how many. */
#define	ZO_GVARS_MAX_ARGLEN ((size_t)64)
#define	ZO_GVARS_MAX_COUNT ((size_t)10)

/*
 * Command line options, as filled in by process_options() starting from
 * ztest_opts_defaults.  The type name and ztest_shared_hdr.zh_opts_size
 * suggest this struct is also placed in the shared file, in which case its
 * layout is subject to the same compatibility rule as the header -- TODO
 * confirm against the shared-memory setup code.
 */
typedef struct ztest_shared_opts {
	char zo_pool[ZFS_MAX_DATASET_NAME_LEN];
	char zo_dir[ZFS_MAX_DATASET_NAME_LEN];
	char zo_alt_ztest[MAXNAMELEN];
	char zo_alt_libpath[MAXNAMELEN];
	uint64_t zo_vdevs;
	uint64_t zo_vdevtime;
	size_t zo_vdev_size;
	int zo_ashift;
	int zo_mirrors;
	int zo_raid_children;
	int zo_raid_parity;
	char zo_raid_type[8];
	int zo_draid_data;
	int zo_draid_spares;
	int zo_datasets;
	int zo_threads;
	uint64_t zo_passtime;
	uint64_t zo_killrate;
	int zo_verbose;
	int zo_init;
	uint64_t zo_time;
	uint64_t zo_maxloops;
	uint64_t zo_metaslab_force_ganging;
	int zo_mmp_test;
	int zo_special_vdevs;		/* an enum ztest_class_state value */
	int zo_dump_dbgmsg;
	int zo_gvars_count;		/* number of zo_gvars[] slots in use */
	char zo_gvars[ZO_GVARS_MAX_COUNT][ZO_GVARS_MAX_ARGLEN];
} ztest_shared_opts_t;

/* Default values for command line options.
*/
/*
 * Each numeric default that cannot be shown as a plain integer in the usage
 * text has a matching *_STR macro; option_table uses the string form as the
 * displayed default.  Keep each pair in sync when changing a value.
 */
#define	DEFAULT_POOL "ztest"
#define	DEFAULT_VDEV_DIR "/tmp"
#define	DEFAULT_VDEV_COUNT 5
#define	DEFAULT_VDEV_SIZE (SPA_MINDEVSIZE * 4)	/* 256m default size */
#define	DEFAULT_VDEV_SIZE_STR "256M"
#define	DEFAULT_ASHIFT SPA_MINBLOCKSHIFT
#define	DEFAULT_MIRRORS 2
#define	DEFAULT_RAID_CHILDREN 4
#define	DEFAULT_RAID_PARITY 1
#define	DEFAULT_DRAID_DATA 4
#define	DEFAULT_DRAID_SPARES 1
#define	DEFAULT_DATASETS_COUNT 7
#define	DEFAULT_THREADS 23
#define	DEFAULT_RUN_TIME 300	/* 300 seconds */
#define	DEFAULT_RUN_TIME_STR "300 sec"
#define	DEFAULT_PASS_TIME 60	/* 60 seconds */
#define	DEFAULT_PASS_TIME_STR "60 sec"
#define	DEFAULT_KILL_RATE 70	/* 70% kill rate */
#define	DEFAULT_KILLRATE_STR "70%"
#define	DEFAULT_INITS 1
/*
 * NOTE(review): this value initializes zo_maxloops and is described in
 * option_table as "Max loops in spa_freeze()", i.e. a loop count; the
 * "5 minutes" remark below may be stale -- confirm.
 */
#define	DEFAULT_MAX_LOOPS 50	/* 5 minutes */
#define	DEFAULT_FORCE_GANGING (64 << 10)
#define	DEFAULT_FORCE_GANGING_STR "64K"

/* Simplifying assumption: -1 is not a valid default.
*/
#define	NO_DEFAULT -1

/*
 * Built-in option values.  process_options() copies this struct into
 * ztest_opts before parsing the command line, and later compares some
 * fields against it to detect options the user left at their default.
 */
static const ztest_shared_opts_t ztest_opts_defaults = {
	.zo_pool = DEFAULT_POOL,
	.zo_dir = DEFAULT_VDEV_DIR,
	.zo_alt_ztest = { '\0' },
	.zo_alt_libpath = { '\0' },
	.zo_vdevs = DEFAULT_VDEV_COUNT,
	.zo_ashift = DEFAULT_ASHIFT,
	.zo_mirrors = DEFAULT_MIRRORS,
	.zo_raid_children = DEFAULT_RAID_CHILDREN,
	.zo_raid_parity = DEFAULT_RAID_PARITY,
	.zo_raid_type = VDEV_TYPE_RAIDZ,
	.zo_vdev_size = DEFAULT_VDEV_SIZE,
	.zo_draid_data = DEFAULT_DRAID_DATA,		/* data drives */
	.zo_draid_spares = DEFAULT_DRAID_SPARES,	/* distributed spares */
	.zo_datasets = DEFAULT_DATASETS_COUNT,
	.zo_threads = DEFAULT_THREADS,
	.zo_passtime = DEFAULT_PASS_TIME,
	.zo_killrate = DEFAULT_KILL_RATE,
	.zo_verbose = 0,
	.zo_mmp_test = 0,
	.zo_init = DEFAULT_INITS,
	.zo_time = DEFAULT_RUN_TIME,
	.zo_maxloops = DEFAULT_MAX_LOOPS, /* max loops during spa_freeze() */
	.zo_metaslab_force_ganging = DEFAULT_FORCE_GANGING,
	.zo_special_vdevs = ZTEST_VDEV_CLASS_RND,
	.zo_gvars_count = 0,
};

/* Variables defined outside this file that ztest reads or adjusts. */
extern uint64_t metaslab_force_ganging;
extern uint64_t metaslab_df_alloc_threshold;
extern unsigned long zfs_deadman_synctime_ms;
extern int metaslab_preload_limit;
extern int zfs_compressed_arc_enabled;
extern int zfs_abd_scatter_enabled;
extern int dmu_object_alloc_chunk_shift;
extern boolean_t zfs_force_some_double_word_sm_entries;
extern unsigned long zio_decompress_fail_fraction;
extern unsigned long zfs_reconstruct_indirect_damage_fraction;


static ztest_shared_opts_t *ztest_shared_opts;
/* Process-local copy of the options; filled in by process_options(). */
static ztest_shared_opts_t ztest_opts;
static const char *const ztest_wkeydata = "abcdefghijklmnopqrstuvwxyz012345";

typedef struct ztest_shared_ds {
	uint64_t zd_seq;
} ztest_shared_ds_t;

static ztest_shared_ds_t *ztest_shared_ds;
/* Address of the d'th element of the ztest_shared_ds array. */
#define	ZTEST_GET_SHARED_DS(d) (&ztest_shared_ds[d])

#define	BT_MAGIC 0x123456789abcdefULL
/*
 * max(zs_mirrors, 1) * (raid parity + 1) - 1, computed from the shared state
 * 'zs' and the current options.  Note that 'zs' is evaluated by MAX(), which
 * may expand its arguments more than once.
 */
#define	MAXFAULTS(zs) \
	(MAX((zs)->zs_mirrors, 1) * (ztest_opts.zo_raid_parity + 1) - 1)

/* Kinds of I/O operation; ZTEST_IO_TYPES is the number of kinds. */
enum ztest_io_type {
	ZTEST_IO_WRITE_TAG,
	ZTEST_IO_WRITE_PATTERN,
	ZTEST_IO_WRITE_ZEROES,
	ZTEST_IO_TRUNCATE,
	ZTEST_IO_SETATTR,
	ZTEST_IO_REWRITE,
	ZTEST_IO_TYPES
};

/*
 * Self-describing tag of eight 64-bit fields.  Presumably written into test
 * data so that it can be validated when read back (cf. BT_MAGIC and item (6)
 * of the design notes at the top of the file) -- TODO confirm against users.
 */
typedef struct ztest_block_tag {
	uint64_t bt_magic;
	uint64_t bt_objset;
	uint64_t bt_object;
	uint64_t bt_dnodesize;
	uint64_t bt_offset;
	uint64_t bt_gen;
	uint64_t bt_txg;
	uint64_t bt_crtxg;
} ztest_block_tag_t;

typedef struct bufwad {
	uint64_t bw_index;
	uint64_t bw_txg;
	uint64_t bw_data;
} bufwad_t;

/*
 * It would be better to use a rangelock_t per object.  Unfortunately
 * the rangelock_t is not a drop-in replacement for rl_t, because we
 * still need to map from object ID to rangelock_t.
 */
typedef enum {
	RL_READER,
	RL_WRITER,
	RL_APPEND
} rl_type_t;

/* One lock slot: a writer pointer and reader count, with a mutex and cv. */
typedef struct rll {
	void *rll_writer;
	int rll_readers;
	kmutex_t rll_lock;
	kcondvar_t rll_cv;
} rll_t;

/* A held range: object, offset and size, plus the rll_t slot backing it. */
typedef struct rl {
	uint64_t rl_object;
	uint64_t rl_offset;
	uint64_t rl_size;
	rll_t *rl_lock;
} rl_t;

/* Sizes of the per-dataset zd_range_lock[] and zd_object_lock[] arrays. */
#define	ZTEST_RANGE_LOCKS 64
#define	ZTEST_OBJECT_LOCKS 64

/*
 * Object descriptor.  Used as a template for object lookup/create/remove.
 */
typedef struct ztest_od {
	uint64_t od_dir;
	uint64_t od_object;
	dmu_object_type_t od_type;
	dmu_object_type_t od_crtype;
	uint64_t od_blocksize;
	uint64_t od_crblocksize;
	uint64_t od_crdnodesize;
	uint64_t od_gen;
	uint64_t od_crgen;
	char od_name[ZFS_MAX_DATASET_NAME_LEN];
} ztest_od_t;

/*
 * Per-dataset state.
 */
typedef struct ztest_ds {
	ztest_shared_ds_t *zd_shared;
	objset_t *zd_os;
	pthread_rwlock_t zd_zilog_lock;
	zilog_t *zd_zilog;
	ztest_od_t *zd_od;		/* debugging aid */
	char zd_name[ZFS_MAX_DATASET_NAME_LEN];
	kmutex_t zd_dirobj_lock;
	rll_t zd_object_lock[ZTEST_OBJECT_LOCKS];
	rll_t zd_range_lock[ZTEST_RANGE_LOCKS];
} ztest_ds_t;

/*
 * Per-iteration state.
 */
/* Signature shared by every test routine listed in ztest_info[]. */
typedef void ztest_func_t(ztest_ds_t *zd, uint64_t id);

typedef struct ztest_info {
	ztest_func_t *zi_func;		/* test function */
	uint64_t zi_iters;		/* iterations per execution */
	uint64_t *zi_interval;		/* execute every <interval> seconds */
	const char *zi_funcname;	/* name of test function */
} ztest_info_t;

typedef struct ztest_shared_callstate {
	uint64_t zc_count;	/* per-pass count */
	uint64_t zc_time;	/* per-pass time */
	uint64_t zc_next;	/* next time to call this function */
} ztest_shared_callstate_t;

static ztest_shared_callstate_t *ztest_shared_callstate;
/* Address of the c'th element of the ztest_shared_callstate array. */
#define	ZTEST_GET_SHARED_CALLSTATE(c) (&ztest_shared_callstate[c])

/* Forward declarations of the test routines referenced by ztest_info[]. */
ztest_func_t ztest_dmu_read_write;
ztest_func_t ztest_dmu_write_parallel;
ztest_func_t ztest_dmu_object_alloc_free;
ztest_func_t ztest_dmu_object_next_chunk;
ztest_func_t ztest_dmu_commit_callbacks;
ztest_func_t ztest_zap;
ztest_func_t ztest_zap_parallel;
ztest_func_t ztest_zil_commit;
ztest_func_t ztest_zil_remount;
ztest_func_t ztest_dmu_read_write_zcopy;
ztest_func_t ztest_dmu_objset_create_destroy;
ztest_func_t ztest_dmu_prealloc;
ztest_func_t ztest_fzap;
ztest_func_t ztest_dmu_snapshot_create_destroy;
ztest_func_t ztest_dsl_prop_get_set;
ztest_func_t ztest_spa_prop_get_set;
ztest_func_t ztest_spa_create_destroy;
ztest_func_t ztest_fault_inject;
ztest_func_t ztest_dmu_snapshot_hold;
ztest_func_t ztest_mmp_enable_disable;
ztest_func_t ztest_scrub;
ztest_func_t ztest_dsl_dataset_promote_busy;
ztest_func_t ztest_vdev_attach_detach;
ztest_func_t ztest_vdev_LUN_growth;
ztest_func_t ztest_vdev_add_remove;
ztest_func_t ztest_vdev_class_add;
ztest_func_t ztest_vdev_aux_add_remove;
ztest_func_t ztest_split_pool;
ztest_func_t ztest_reguid;
ztest_func_t ztest_spa_upgrade;
ztest_func_t ztest_device_removal;
ztest_func_t ztest_spa_checkpoint_create_discard;
ztest_func_t ztest_initialize;
ztest_func_t ztest_trim;
ztest_func_t ztest_blake3;
ztest_func_t ztest_fletcher;
ztest_func_t ztest_fletcher_incr;
ztest_func_t ztest_verify_dnode_bt;

/*
 * Scheduling intervals, expressed as multiples of NANOSEC.  ztest_info[]
 * stores pointers to these rather than copies of their values.
 */
uint64_t zopt_always = 0ULL * NANOSEC;		/* all the time */
uint64_t zopt_incessant = 1ULL * NANOSEC / 10;	/* every 1/10 second */
uint64_t zopt_often = 1ULL * NANOSEC;		/* every second */
uint64_t zopt_sometimes = 10ULL * NANOSEC;	/* every 10 seconds */
uint64_t zopt_rarely = 60ULL * NANOSEC;		/* every 60 seconds */

/* Build one ztest_info_t entry; zi_funcname is the stringified 'func'. */
#define	ZTI_INIT(func, iters, interval) \
	{   .zi_func = (func), \
	    .zi_iters = (iters), \
	    .zi_interval = (interval), \
	    .zi_funcname = # func }

/*
 * Table of test routines with their iteration counts and intervals.  The
 * three vdev add/remove entries point at ztest_opts.zo_vdevtime, so their
 * interval is whatever process_options() computes for that field.
 */
ztest_info_t ztest_info[] = {
	ZTI_INIT(ztest_dmu_read_write, 1, &zopt_always),
	ZTI_INIT(ztest_dmu_write_parallel, 10, &zopt_always),
	ZTI_INIT(ztest_dmu_object_alloc_free, 1, &zopt_always),
	ZTI_INIT(ztest_dmu_object_next_chunk, 1, &zopt_sometimes),
	ZTI_INIT(ztest_dmu_commit_callbacks, 1, &zopt_always),
	ZTI_INIT(ztest_zap, 30, &zopt_always),
	ZTI_INIT(ztest_zap_parallel, 100, &zopt_always),
	ZTI_INIT(ztest_split_pool, 1, &zopt_always),
	ZTI_INIT(ztest_zil_commit, 1, &zopt_incessant),
	ZTI_INIT(ztest_zil_remount, 1, &zopt_sometimes),
	ZTI_INIT(ztest_dmu_read_write_zcopy, 1, &zopt_often),
	ZTI_INIT(ztest_dmu_objset_create_destroy, 1, &zopt_often),
	ZTI_INIT(ztest_dsl_prop_get_set, 1, &zopt_often),
	ZTI_INIT(ztest_spa_prop_get_set, 1, &zopt_sometimes),
#if 0
	ZTI_INIT(ztest_dmu_prealloc, 1, &zopt_sometimes),
#endif
	ZTI_INIT(ztest_fzap, 1, &zopt_sometimes),
	ZTI_INIT(ztest_dmu_snapshot_create_destroy, 1, &zopt_sometimes),
	ZTI_INIT(ztest_spa_create_destroy, 1, &zopt_sometimes),
	ZTI_INIT(ztest_fault_inject, 1, &zopt_sometimes),
	ZTI_INIT(ztest_dmu_snapshot_hold, 1, &zopt_sometimes),
	ZTI_INIT(ztest_mmp_enable_disable, 1, &zopt_sometimes),
	ZTI_INIT(ztest_reguid, 1, &zopt_rarely),
	ZTI_INIT(ztest_scrub, 1, &zopt_rarely),
	ZTI_INIT(ztest_spa_upgrade, 1, &zopt_rarely),
	ZTI_INIT(ztest_dsl_dataset_promote_busy, 1, &zopt_rarely),
	ZTI_INIT(ztest_vdev_attach_detach, 1, &zopt_sometimes),
	ZTI_INIT(ztest_vdev_LUN_growth, 1, &zopt_rarely),
	ZTI_INIT(ztest_vdev_add_remove, 1, &ztest_opts.zo_vdevtime),
	ZTI_INIT(ztest_vdev_class_add, 1, &ztest_opts.zo_vdevtime),
	ZTI_INIT(ztest_vdev_aux_add_remove, 1, &ztest_opts.zo_vdevtime),
	ZTI_INIT(ztest_device_removal, 1, &zopt_sometimes),
	ZTI_INIT(ztest_spa_checkpoint_create_discard, 1, &zopt_rarely),
	ZTI_INIT(ztest_initialize, 1, &zopt_sometimes),
	ZTI_INIT(ztest_trim, 1, &zopt_sometimes),
	ZTI_INIT(ztest_blake3, 1, &zopt_rarely),
	ZTI_INIT(ztest_fletcher, 1, &zopt_rarely),
	ZTI_INIT(ztest_fletcher_incr, 1, &zopt_rarely),
	ZTI_INIT(ztest_verify_dnode_bt, 1, &zopt_sometimes),
};

/* Number of entries in ztest_info[]. */
#define	ZTEST_FUNCS (sizeof (ztest_info) / sizeof (ztest_info_t))

/*
 * The following struct is used to hold a list of uncalled commit callbacks.
 * The callbacks are ordered by txg number.
 */
typedef struct ztest_cb_list {
	kmutex_t zcl_callbacks_lock;
	list_t zcl_callbacks;
} ztest_cb_list_t;

/*
 * Stuff we need to share writably between parent and child.
 */
typedef struct ztest_shared {
	boolean_t zs_do_init;
	hrtime_t zs_proc_start;
	hrtime_t zs_proc_stop;
	hrtime_t zs_thread_start;
	hrtime_t zs_thread_stop;
	hrtime_t zs_thread_kill;
	uint64_t zs_enospc_count;
	uint64_t zs_vdev_next_leaf;
	uint64_t zs_vdev_aux;
	uint64_t zs_alloc;
	uint64_t zs_space;
	uint64_t zs_splits;
	uint64_t zs_mirrors;		/* read by MAXFAULTS() */
	uint64_t zs_metaslab_sz;
	uint64_t zs_metaslab_df_alloc_threshold;
	uint64_t zs_guid;
} ztest_shared_t;

/*
 * NOTE(review): the expansion is not parenthesized, so this macro is only
 * safe where operator precedence cannot re-associate the unary minus.
 */
#define	ID_PARALLEL -1ULL

/* printf-style templates; the arguments are supplied by the callers. */
static char ztest_dev_template[] = "%s/%s.%llua";
static char ztest_aux_template[] = "%s/%s.%s.%llu";
ztest_shared_t *ztest_shared;

static spa_t *ztest_spa = NULL;
static ztest_ds_t *ztest_ds;

static kmutex_t ztest_vdev_lock;
static boolean_t ztest_device_removal_active = B_FALSE;
static boolean_t ztest_pool_scrubbed = B_FALSE;
static kmutex_t ztest_checkpoint_lock;

/*
 * The ztest_name_lock protects the pool and dataset namespace used by
 * the individual tests. To modify the namespace, consumers must grab
 * this lock as writer. Grabbing the lock as reader will ensure that the
 * namespace does not change while the lock is held.
 */
static pthread_rwlock_t ztest_name_lock;

/* When set, fatal() calls abort() instead of exiting with status 3. */
static boolean_t ztest_dump_core = B_TRUE;
static boolean_t ztest_exiting;

/* Global commit callback list */
static ztest_cb_list_t zcl;
/* Commit cb delay */
static uint64_t zc_min_txg_delay = UINT64_MAX;
static int zc_cb_counter = 0;

/*
 * Minimum number of commit callbacks that need to be registered for us to check
 * whether the minimum txg delay is acceptable.
 */
#define	ZTEST_COMMIT_CB_MIN_REG 100

/*
 * If a number of txgs equal to this threshold have been created after a commit
 * callback has been registered but not called, then we assume there is an
 * implementation bug.
555 */ 556 #define ZTEST_COMMIT_CB_THRESH (TXG_CONCURRENT_STATES + 1000) 557 558 enum ztest_object { 559 ZTEST_META_DNODE = 0, 560 ZTEST_DIROBJ, 561 ZTEST_OBJECTS 562 }; 563 564 static __attribute__((noreturn)) void usage(boolean_t requested); 565 static int ztest_scrub_impl(spa_t *spa); 566 567 /* 568 * These libumem hooks provide a reasonable set of defaults for the allocator's 569 * debugging facilities. 570 */ 571 const char * 572 _umem_debug_init(void) 573 { 574 return ("default,verbose"); /* $UMEM_DEBUG setting */ 575 } 576 577 const char * 578 _umem_logging_init(void) 579 { 580 return ("fail,contents"); /* $UMEM_LOGGING setting */ 581 } 582 583 static void 584 dump_debug_buffer(void) 585 { 586 ssize_t ret __attribute__((unused)); 587 588 if (!ztest_opts.zo_dump_dbgmsg) 589 return; 590 591 /* 592 * We use write() instead of printf() so that this function 593 * is safe to call from a signal handler. 594 */ 595 ret = write(STDOUT_FILENO, "\n", 1); 596 zfs_dbgmsg_print("ztest"); 597 } 598 599 #define BACKTRACE_SZ 100 600 601 static void sig_handler(int signo) 602 { 603 struct sigaction action; 604 #if (__GLIBC__ && !__UCLIBC__) /* backtrace() is a GNU extension */ 605 int nptrs; 606 void *buffer[BACKTRACE_SZ]; 607 608 nptrs = backtrace(buffer, BACKTRACE_SZ); 609 backtrace_symbols_fd(buffer, nptrs, STDERR_FILENO); 610 #endif 611 dump_debug_buffer(); 612 613 /* 614 * Restore default action and re-raise signal so SIGSEGV and 615 * SIGABRT can trigger a core dump. 616 */ 617 action.sa_handler = SIG_DFL; 618 sigemptyset(&action.sa_mask); 619 action.sa_flags = 0; 620 (void) sigaction(signo, &action, NULL); 621 raise(signo); 622 } 623 624 #define FATAL_MSG_SZ 1024 625 626 static const char *fatal_msg; 627 628 static __attribute__((format(printf, 2, 3))) __attribute__((noreturn)) void 629 fatal(int do_perror, const char *message, ...) 
630 { 631 va_list args; 632 int save_errno = errno; 633 char *buf; 634 635 (void) fflush(stdout); 636 buf = umem_alloc(FATAL_MSG_SZ, UMEM_NOFAIL); 637 if (buf == NULL) 638 goto out; 639 640 va_start(args, message); 641 (void) sprintf(buf, "ztest: "); 642 /* LINTED */ 643 (void) vsprintf(buf + strlen(buf), message, args); 644 va_end(args); 645 if (do_perror) { 646 (void) snprintf(buf + strlen(buf), FATAL_MSG_SZ - strlen(buf), 647 ": %s", strerror(save_errno)); 648 } 649 (void) fprintf(stderr, "%s\n", buf); 650 fatal_msg = buf; /* to ease debugging */ 651 652 out: 653 if (ztest_dump_core) 654 abort(); 655 else 656 dump_debug_buffer(); 657 658 exit(3); 659 } 660 661 static int 662 str2shift(const char *buf) 663 { 664 const char *ends = "BKMGTPEZ"; 665 int i; 666 667 if (buf[0] == '\0') 668 return (0); 669 for (i = 0; i < strlen(ends); i++) { 670 if (toupper(buf[0]) == ends[i]) 671 break; 672 } 673 if (i == strlen(ends)) { 674 (void) fprintf(stderr, "ztest: invalid bytes suffix: %s\n", 675 buf); 676 usage(B_FALSE); 677 } 678 if (buf[1] == '\0' || (toupper(buf[1]) == 'B' && buf[2] == '\0')) { 679 return (10*i); 680 } 681 (void) fprintf(stderr, "ztest: invalid bytes suffix: %s\n", buf); 682 usage(B_FALSE); 683 } 684 685 static uint64_t 686 nicenumtoull(const char *buf) 687 { 688 char *end; 689 uint64_t val; 690 691 val = strtoull(buf, &end, 0); 692 if (end == buf) { 693 (void) fprintf(stderr, "ztest: bad numeric value: %s\n", buf); 694 usage(B_FALSE); 695 } else if (end[0] == '.') { 696 double fval = strtod(buf, &end); 697 fval *= pow(2, str2shift(end)); 698 /* 699 * UINT64_MAX is not exactly representable as a double. 700 * The closest representation is UINT64_MAX + 1, so we 701 * use a >= comparison instead of > for the bounds check. 
702 */ 703 if (fval >= (double)UINT64_MAX) { 704 (void) fprintf(stderr, "ztest: value too large: %s\n", 705 buf); 706 usage(B_FALSE); 707 } 708 val = (uint64_t)fval; 709 } else { 710 int shift = str2shift(end); 711 if (shift >= 64 || (val << shift) >> shift != val) { 712 (void) fprintf(stderr, "ztest: value too large: %s\n", 713 buf); 714 usage(B_FALSE); 715 } 716 val <<= shift; 717 } 718 return (val); 719 } 720 721 typedef struct ztest_option { 722 const char short_opt; 723 const char *long_opt; 724 const char *long_opt_param; 725 const char *comment; 726 unsigned int default_int; 727 const char *default_str; 728 } ztest_option_t; 729 730 /* 731 * The following option_table is used for generating the usage info as well as 732 * the long and short option information for calling getopt_long(). 733 */ 734 static ztest_option_t option_table[] = { 735 { 'v', "vdevs", "INTEGER", "Number of vdevs", DEFAULT_VDEV_COUNT, 736 NULL}, 737 { 's', "vdev-size", "INTEGER", "Size of each vdev", 738 NO_DEFAULT, DEFAULT_VDEV_SIZE_STR}, 739 { 'a', "alignment-shift", "INTEGER", 740 "Alignment shift; use 0 for random", DEFAULT_ASHIFT, NULL}, 741 { 'm', "mirror-copies", "INTEGER", "Number of mirror copies", 742 DEFAULT_MIRRORS, NULL}, 743 { 'r', "raid-disks", "INTEGER", "Number of raidz/draid disks", 744 DEFAULT_RAID_CHILDREN, NULL}, 745 { 'R', "raid-parity", "INTEGER", "Raid parity", 746 DEFAULT_RAID_PARITY, NULL}, 747 { 'K', "raid-kind", "raidz|draid|random", "Raid kind", 748 NO_DEFAULT, "random"}, 749 { 'D', "draid-data", "INTEGER", "Number of draid data drives", 750 DEFAULT_DRAID_DATA, NULL}, 751 { 'S', "draid-spares", "INTEGER", "Number of draid spares", 752 DEFAULT_DRAID_SPARES, NULL}, 753 { 'd', "datasets", "INTEGER", "Number of datasets", 754 DEFAULT_DATASETS_COUNT, NULL}, 755 { 't', "threads", "INTEGER", "Number of ztest threads", 756 DEFAULT_THREADS, NULL}, 757 { 'g', "gang-block-threshold", "INTEGER", 758 "Metaslab gang block threshold", 759 NO_DEFAULT, 
DEFAULT_FORCE_GANGING_STR}, 760 { 'i', "init-count", "INTEGER", "Number of times to initialize pool", 761 DEFAULT_INITS, NULL}, 762 { 'k', "kill-percentage", "INTEGER", "Kill percentage", 763 NO_DEFAULT, DEFAULT_KILLRATE_STR}, 764 { 'p', "pool-name", "STRING", "Pool name", 765 NO_DEFAULT, DEFAULT_POOL}, 766 { 'f', "vdev-file-directory", "PATH", "File directory for vdev files", 767 NO_DEFAULT, DEFAULT_VDEV_DIR}, 768 { 'M', "multi-host", NULL, 769 "Multi-host; simulate pool imported on remote host", 770 NO_DEFAULT, NULL}, 771 { 'E', "use-existing-pool", NULL, 772 "Use existing pool instead of creating new one", NO_DEFAULT, NULL}, 773 { 'T', "run-time", "INTEGER", "Total run time", 774 NO_DEFAULT, DEFAULT_RUN_TIME_STR}, 775 { 'P', "pass-time", "INTEGER", "Time per pass", 776 NO_DEFAULT, DEFAULT_PASS_TIME_STR}, 777 { 'F', "freeze-loops", "INTEGER", "Max loops in spa_freeze()", 778 DEFAULT_MAX_LOOPS, NULL}, 779 { 'B', "alt-ztest", "PATH", "Alternate ztest path", 780 NO_DEFAULT, NULL}, 781 { 'C', "vdev-class-state", "on|off|random", "vdev class state", 782 NO_DEFAULT, "random"}, 783 { 'o', "option", "\"OPTION=INTEGER\"", 784 "Set global variable to an unsigned 32-bit integer value", 785 NO_DEFAULT, NULL}, 786 { 'G', "dump-debug-msg", NULL, 787 "Dump zfs_dbgmsg buffer before exiting due to an error", 788 NO_DEFAULT, NULL}, 789 { 'V', "verbose", NULL, 790 "Verbose (use multiple times for ever more verbosity)", 791 NO_DEFAULT, NULL}, 792 { 'h', "help", NULL, "Show this help", 793 NO_DEFAULT, NULL}, 794 {0, 0, 0, 0, 0, 0} 795 }; 796 797 static struct option *long_opts = NULL; 798 static char *short_opts = NULL; 799 800 static void 801 init_options(void) 802 { 803 ASSERT3P(long_opts, ==, NULL); 804 ASSERT3P(short_opts, ==, NULL); 805 806 int count = sizeof (option_table) / sizeof (option_table[0]); 807 long_opts = umem_alloc(sizeof (struct option) * count, UMEM_NOFAIL); 808 809 short_opts = umem_alloc(sizeof (char) * 2 * count, UMEM_NOFAIL); 810 int short_opt_index = 0; 811 
812 for (int i = 0; i < count; i++) { 813 long_opts[i].val = option_table[i].short_opt; 814 long_opts[i].name = option_table[i].long_opt; 815 long_opts[i].has_arg = option_table[i].long_opt_param != NULL 816 ? required_argument : no_argument; 817 long_opts[i].flag = NULL; 818 short_opts[short_opt_index++] = option_table[i].short_opt; 819 if (option_table[i].long_opt_param != NULL) { 820 short_opts[short_opt_index++] = ':'; 821 } 822 } 823 } 824 825 static void 826 fini_options(void) 827 { 828 int count = sizeof (option_table) / sizeof (option_table[0]); 829 830 umem_free(long_opts, sizeof (struct option) * count); 831 umem_free(short_opts, sizeof (char) * 2 * count); 832 833 long_opts = NULL; 834 short_opts = NULL; 835 } 836 837 static __attribute__((noreturn)) void 838 usage(boolean_t requested) 839 { 840 char option[80]; 841 FILE *fp = requested ? stdout : stderr; 842 843 (void) fprintf(fp, "Usage: %s [OPTIONS...]\n", DEFAULT_POOL); 844 for (int i = 0; option_table[i].short_opt != 0; i++) { 845 if (option_table[i].long_opt_param != NULL) { 846 (void) sprintf(option, " -%c --%s=%s", 847 option_table[i].short_opt, 848 option_table[i].long_opt, 849 option_table[i].long_opt_param); 850 } else { 851 (void) sprintf(option, " -%c --%s", 852 option_table[i].short_opt, 853 option_table[i].long_opt); 854 } 855 (void) fprintf(fp, " %-40s%s", option, 856 option_table[i].comment); 857 858 if (option_table[i].long_opt_param != NULL) { 859 if (option_table[i].default_str != NULL) { 860 (void) fprintf(fp, " (default: %s)", 861 option_table[i].default_str); 862 } else if (option_table[i].default_int != NO_DEFAULT) { 863 (void) fprintf(fp, " (default: %u)", 864 option_table[i].default_int); 865 } 866 } 867 (void) fprintf(fp, "\n"); 868 } 869 exit(requested ? 
0 : 1); 870 } 871 872 static uint64_t 873 ztest_random(uint64_t range) 874 { 875 uint64_t r; 876 877 ASSERT3S(ztest_fd_rand, >=, 0); 878 879 if (range == 0) 880 return (0); 881 882 if (read(ztest_fd_rand, &r, sizeof (r)) != sizeof (r)) 883 fatal(B_TRUE, "short read from /dev/urandom"); 884 885 return (r % range); 886 } 887 888 static void 889 ztest_parse_name_value(const char *input, ztest_shared_opts_t *zo) 890 { 891 char name[32]; 892 char *value; 893 int state = ZTEST_VDEV_CLASS_RND; 894 895 (void) strlcpy(name, input, sizeof (name)); 896 897 value = strchr(name, '='); 898 if (value == NULL) { 899 (void) fprintf(stderr, "missing value in property=value " 900 "'-C' argument (%s)\n", input); 901 usage(B_FALSE); 902 } 903 *(value) = '\0'; 904 value++; 905 906 if (strcmp(value, "on") == 0) { 907 state = ZTEST_VDEV_CLASS_ON; 908 } else if (strcmp(value, "off") == 0) { 909 state = ZTEST_VDEV_CLASS_OFF; 910 } else if (strcmp(value, "random") == 0) { 911 state = ZTEST_VDEV_CLASS_RND; 912 } else { 913 (void) fprintf(stderr, "invalid property value '%s'\n", value); 914 usage(B_FALSE); 915 } 916 917 if (strcmp(name, "special") == 0) { 918 zo->zo_special_vdevs = state; 919 } else { 920 (void) fprintf(stderr, "invalid property name '%s'\n", name); 921 usage(B_FALSE); 922 } 923 if (zo->zo_verbose >= 3) 924 (void) printf("%s vdev state is '%s'\n", name, value); 925 } 926 927 static void 928 process_options(int argc, char **argv) 929 { 930 char *path; 931 ztest_shared_opts_t *zo = &ztest_opts; 932 933 int opt; 934 uint64_t value; 935 const char *raid_kind = "random"; 936 937 memcpy(zo, &ztest_opts_defaults, sizeof (*zo)); 938 939 init_options(); 940 941 while ((opt = getopt_long(argc, argv, short_opts, long_opts, 942 NULL)) != EOF) { 943 value = 0; 944 switch (opt) { 945 case 'v': 946 case 's': 947 case 'a': 948 case 'm': 949 case 'r': 950 case 'R': 951 case 'D': 952 case 'S': 953 case 'd': 954 case 't': 955 case 'g': 956 case 'i': 957 case 'k': 958 case 'T': 959 case 'P': 960 
case 'F': 961 value = nicenumtoull(optarg); 962 } 963 switch (opt) { 964 case 'v': 965 zo->zo_vdevs = value; 966 break; 967 case 's': 968 zo->zo_vdev_size = MAX(SPA_MINDEVSIZE, value); 969 break; 970 case 'a': 971 zo->zo_ashift = value; 972 break; 973 case 'm': 974 zo->zo_mirrors = value; 975 break; 976 case 'r': 977 zo->zo_raid_children = MAX(1, value); 978 break; 979 case 'R': 980 zo->zo_raid_parity = MIN(MAX(value, 1), 3); 981 break; 982 case 'K': 983 raid_kind = optarg; 984 break; 985 case 'D': 986 zo->zo_draid_data = MAX(1, value); 987 break; 988 case 'S': 989 zo->zo_draid_spares = MAX(1, value); 990 break; 991 case 'd': 992 zo->zo_datasets = MAX(1, value); 993 break; 994 case 't': 995 zo->zo_threads = MAX(1, value); 996 break; 997 case 'g': 998 zo->zo_metaslab_force_ganging = 999 MAX(SPA_MINBLOCKSIZE << 1, value); 1000 break; 1001 case 'i': 1002 zo->zo_init = value; 1003 break; 1004 case 'k': 1005 zo->zo_killrate = value; 1006 break; 1007 case 'p': 1008 (void) strlcpy(zo->zo_pool, optarg, 1009 sizeof (zo->zo_pool)); 1010 break; 1011 case 'f': 1012 path = realpath(optarg, NULL); 1013 if (path == NULL) { 1014 (void) fprintf(stderr, "error: %s: %s\n", 1015 optarg, strerror(errno)); 1016 usage(B_FALSE); 1017 } else { 1018 (void) strlcpy(zo->zo_dir, path, 1019 sizeof (zo->zo_dir)); 1020 free(path); 1021 } 1022 break; 1023 case 'M': 1024 zo->zo_mmp_test = 1; 1025 break; 1026 case 'V': 1027 zo->zo_verbose++; 1028 break; 1029 case 'E': 1030 zo->zo_init = 0; 1031 break; 1032 case 'T': 1033 zo->zo_time = value; 1034 break; 1035 case 'P': 1036 zo->zo_passtime = MAX(1, value); 1037 break; 1038 case 'F': 1039 zo->zo_maxloops = MAX(1, value); 1040 break; 1041 case 'B': 1042 (void) strlcpy(zo->zo_alt_ztest, optarg, 1043 sizeof (zo->zo_alt_ztest)); 1044 break; 1045 case 'C': 1046 ztest_parse_name_value(optarg, zo); 1047 break; 1048 case 'o': 1049 if (zo->zo_gvars_count >= ZO_GVARS_MAX_COUNT) { 1050 (void) fprintf(stderr, 1051 "max global var count (%zu) exceeded\n", 1052 
ZO_GVARS_MAX_COUNT); 1053 usage(B_FALSE); 1054 } 1055 char *v = zo->zo_gvars[zo->zo_gvars_count]; 1056 if (strlcpy(v, optarg, ZO_GVARS_MAX_ARGLEN) >= 1057 ZO_GVARS_MAX_ARGLEN) { 1058 (void) fprintf(stderr, 1059 "global var option '%s' is too long\n", 1060 optarg); 1061 usage(B_FALSE); 1062 } 1063 zo->zo_gvars_count++; 1064 break; 1065 case 'G': 1066 zo->zo_dump_dbgmsg = 1; 1067 break; 1068 case 'h': 1069 usage(B_TRUE); 1070 break; 1071 case '?': 1072 default: 1073 usage(B_FALSE); 1074 break; 1075 } 1076 } 1077 1078 fini_options(); 1079 1080 /* When raid choice is 'random' add a draid pool 50% of the time */ 1081 if (strcmp(raid_kind, "random") == 0) { 1082 raid_kind = (ztest_random(2) == 0) ? "draid" : "raidz"; 1083 1084 if (ztest_opts.zo_verbose >= 3) 1085 (void) printf("choosing RAID type '%s'\n", raid_kind); 1086 } 1087 1088 if (strcmp(raid_kind, "draid") == 0) { 1089 uint64_t min_devsize; 1090 1091 /* With fewer disk use 256M, otherwise 128M is OK */ 1092 min_devsize = (ztest_opts.zo_raid_children < 16) ? 
1093 (256ULL << 20) : (128ULL << 20); 1094 1095 /* No top-level mirrors with dRAID for now */ 1096 zo->zo_mirrors = 0; 1097 1098 /* Use more appropriate defaults for dRAID */ 1099 if (zo->zo_vdevs == ztest_opts_defaults.zo_vdevs) 1100 zo->zo_vdevs = 1; 1101 if (zo->zo_raid_children == 1102 ztest_opts_defaults.zo_raid_children) 1103 zo->zo_raid_children = 16; 1104 if (zo->zo_ashift < 12) 1105 zo->zo_ashift = 12; 1106 if (zo->zo_vdev_size < min_devsize) 1107 zo->zo_vdev_size = min_devsize; 1108 1109 if (zo->zo_draid_data + zo->zo_raid_parity > 1110 zo->zo_raid_children - zo->zo_draid_spares) { 1111 (void) fprintf(stderr, "error: too few draid " 1112 "children (%d) for stripe width (%d)\n", 1113 zo->zo_raid_children, 1114 zo->zo_draid_data + zo->zo_raid_parity); 1115 usage(B_FALSE); 1116 } 1117 1118 (void) strlcpy(zo->zo_raid_type, VDEV_TYPE_DRAID, 1119 sizeof (zo->zo_raid_type)); 1120 1121 } else /* using raidz */ { 1122 ASSERT0(strcmp(raid_kind, "raidz")); 1123 1124 zo->zo_raid_parity = MIN(zo->zo_raid_parity, 1125 zo->zo_raid_children - 1); 1126 } 1127 1128 zo->zo_vdevtime = 1129 (zo->zo_vdevs > 0 ? 
zo->zo_time * NANOSEC / zo->zo_vdevs : 1130 UINT64_MAX >> 2); 1131 1132 if (*zo->zo_alt_ztest) { 1133 const char *invalid_what = "ztest"; 1134 char *val = zo->zo_alt_ztest; 1135 if (0 != access(val, X_OK) || 1136 (strrchr(val, '/') == NULL && (errno = EINVAL))) 1137 goto invalid; 1138 1139 int dirlen = strrchr(val, '/') - val; 1140 strncpy(zo->zo_alt_libpath, val, dirlen); 1141 invalid_what = "library path", val = zo->zo_alt_libpath; 1142 if (strrchr(val, '/') == NULL && (errno = EINVAL)) 1143 goto invalid; 1144 *strrchr(val, '/') = '\0'; 1145 strlcat(val, "/lib", sizeof (zo->zo_alt_libpath)); 1146 1147 if (0 != access(zo->zo_alt_libpath, X_OK)) 1148 goto invalid; 1149 return; 1150 1151 invalid: 1152 ztest_dump_core = B_FALSE; 1153 fatal(B_TRUE, "invalid alternate %s %s", invalid_what, val); 1154 } 1155 } 1156 1157 static void 1158 ztest_kill(ztest_shared_t *zs) 1159 { 1160 zs->zs_alloc = metaslab_class_get_alloc(spa_normal_class(ztest_spa)); 1161 zs->zs_space = metaslab_class_get_space(spa_normal_class(ztest_spa)); 1162 1163 /* 1164 * Before we kill ourselves, make sure that the config is updated. 1165 * See comment above spa_write_cachefile(). 
*/
	mutex_enter(&spa_namespace_lock);
	spa_write_cachefile(ztest_spa, B_FALSE, B_FALSE);
	mutex_exit(&spa_namespace_lock);

	(void) raise(SIGKILL);
}

/*
 * Count one ENOSPC occurrence in the shared state.  The tag argument is
 * accepted for the callers' convenience but is not used.
 */
static void
ztest_record_enospc(const char *s)
{
	(void) s;
	ztest_shared->zs_enospc_count++;
}

/*
 * Return the ashift to use for a new vdev: the configured zo_ashift, or,
 * when that is 0, a random value in
 * [SPA_MINBLOCKSHIFT, SPA_MINBLOCKSHIFT + 4].
 */
static uint64_t
ztest_get_ashift(void)
{
	if (ztest_opts.zo_ashift == 0)
		return (SPA_MINBLOCKSHIFT + ztest_random(5));
	return (ztest_opts.zo_ashift);
}

/*
 * Return B_TRUE if name scans as VDEV_TYPE_DRAID immediately followed by
 * three unsigned numbers separated by '-', B_FALSE otherwise.  The parsed
 * numbers themselves are discarded.
 */
static boolean_t
ztest_is_draid_spare(const char *name)
{
	uint64_t spare_id = 0, parity = 0, vdev_id = 0;

	if (sscanf(name, VDEV_TYPE_DRAID "%"PRIu64"-%"PRIu64"-%"PRIu64"",
	    &parity, &vdev_id, &spare_id) == 3) {
		return (B_TRUE);
	}

	return (B_FALSE);
}

/*
 * Build the nvlist describing a single leaf vdev.
 *
 * path:   backing path, or NULL to generate one from ztest_aux_template
 *         (when aux is non-NULL) or ztest_dev_template.
 * aux:    aux name used in the generated path, or NULL.
 * pool:   pool name for the generated path; NULL means ztest_opts.zo_pool.
 * size:   if non-zero (and the vdev is not a dRAID spare) the backing file
 *         is created/truncated and sized with ftruncate().
 * ashift: 0 means "choose via ztest_get_ashift()".
 *
 * A caller-supplied path that looks like a dRAID spare name produces a
 * VDEV_TYPE_DRAID_SPARE entry and no backing file.  Failure to open or
 * size the file is fatal.  Returns the newly allocated nvlist.
 */
static nvlist_t *
make_vdev_file(const char *path, const char *aux, const char *pool,
    size_t size, uint64_t ashift)
{
	char *pathbuf = NULL;
	uint64_t vdev;
	nvlist_t *file;
	boolean_t draid_spare = B_FALSE;


	if (ashift == 0)
		ashift = ztest_get_ashift();

	if (path == NULL) {
		pathbuf = umem_alloc(MAXPATHLEN, UMEM_NOFAIL);
		path = pathbuf;

		if (aux != NULL) {
			vdev = ztest_shared->zs_vdev_aux;
			(void) snprintf(pathbuf, MAXPATHLEN,
			    ztest_aux_template, ztest_opts.zo_dir,
			    pool == NULL ? ztest_opts.zo_pool : pool,
			    aux, vdev);
		} else {
			/* Each generated leaf consumes the next leaf id. */
			vdev = ztest_shared->zs_vdev_next_leaf++;
			(void) snprintf(pathbuf, MAXPATHLEN,
			    ztest_dev_template, ztest_opts.zo_dir,
			    pool == NULL ? ztest_opts.zo_pool : pool, vdev);
		}
	} else {
		draid_spare = ztest_is_draid_spare(path);
	}

	if (size != 0 && !draid_spare) {
		int fd = open(path, O_RDWR | O_CREAT | O_TRUNC, 0666);
		if (fd == -1)
			fatal(B_TRUE, "can't open %s", path);
		if (ftruncate(fd, size) != 0)
			fatal(B_TRUE, "can't ftruncate %s", path);
		(void) close(fd);
	}

	file = fnvlist_alloc();
	fnvlist_add_string(file, ZPOOL_CONFIG_TYPE,
	    draid_spare ? VDEV_TYPE_DRAID_SPARE : VDEV_TYPE_FILE);
	fnvlist_add_string(file, ZPOOL_CONFIG_PATH, path);
	fnvlist_add_uint64(file, ZPOOL_CONFIG_ASHIFT, ashift);
	/* pathbuf is NULL here when the caller supplied the path. */
	umem_free(pathbuf, MAXPATHLEN);

	return (file);
}

/*
 * Build a RAID vdev (type taken from ztest_opts.zo_raid_type) with r leaf
 * children created by make_vdev_file().  With r < 2 a single leaf vdev is
 * returned instead.  For dRAID the data/spare/group counts are added too.
 */
static nvlist_t *
make_vdev_raid(const char *path, const char *aux, const char *pool, size_t size,
    uint64_t ashift, int r)
{
	nvlist_t *raid, **child;
	int c;

	if (r < 2)
		return (make_vdev_file(path, aux, pool, size, ashift));
	child = umem_alloc(r * sizeof (nvlist_t *), UMEM_NOFAIL);

	for (c = 0; c < r; c++)
		child[c] = make_vdev_file(path, aux, pool, size, ashift);

	raid = fnvlist_alloc();
	fnvlist_add_string(raid, ZPOOL_CONFIG_TYPE,
	    ztest_opts.zo_raid_type);
	fnvlist_add_uint64(raid, ZPOOL_CONFIG_NPARITY,
	    ztest_opts.zo_raid_parity);
	fnvlist_add_nvlist_array(raid, ZPOOL_CONFIG_CHILDREN,
	    (const nvlist_t **)child, r);

	if (strcmp(ztest_opts.zo_raid_type, VDEV_TYPE_DRAID) == 0) {
		uint64_t ndata = ztest_opts.zo_draid_data;
		uint64_t nparity = ztest_opts.zo_raid_parity;
		uint64_t nspares = ztest_opts.zo_draid_spares;
		uint64_t children = ztest_opts.zo_raid_children;
		uint64_t ngroups = 1;

		/*
		 * Calculate the minimum number of groups required to fill a
		 * slice. This is the LCM of the stripe width (data + parity)
		 * and the number of data drives (children - spares).
*/
		while (ngroups * (ndata + nparity) % (children - nspares) != 0)
			ngroups++;

		/* Store the basic dRAID configuration. */
		fnvlist_add_uint64(raid, ZPOOL_CONFIG_DRAID_NDATA, ndata);
		fnvlist_add_uint64(raid, ZPOOL_CONFIG_DRAID_NSPARES, nspares);
		fnvlist_add_uint64(raid, ZPOOL_CONFIG_DRAID_NGROUPS, ngroups);
	}

	/* Release our child nvlists and the temporary array. */
	for (c = 0; c < r; c++)
		fnvlist_free(child[c]);

	umem_free(child, r * sizeof (nvlist_t *));

	return (raid);
}

/*
 * Build a mirror vdev with m children, each produced by make_vdev_raid()
 * with r leaves.  With m < 1 no mirror is created and a single
 * make_vdev_raid() result is returned.
 */
static nvlist_t *
make_vdev_mirror(const char *path, const char *aux, const char *pool,
    size_t size, uint64_t ashift, int r, int m)
{
	nvlist_t *mirror, **child;
	int c;

	if (m < 1)
		return (make_vdev_raid(path, aux, pool, size, ashift, r));

	child = umem_alloc(m * sizeof (nvlist_t *), UMEM_NOFAIL);

	for (c = 0; c < m; c++)
		child[c] = make_vdev_raid(path, aux, pool, size, ashift, r);

	mirror = fnvlist_alloc();
	fnvlist_add_string(mirror, ZPOOL_CONFIG_TYPE, VDEV_TYPE_MIRROR);
	fnvlist_add_nvlist_array(mirror, ZPOOL_CONFIG_CHILDREN,
	    (const nvlist_t **)child, m);

	for (c = 0; c < m; c++)
		fnvlist_free(child[c]);

	umem_free(child, m * sizeof (nvlist_t *));

	return (mirror);
}

/*
 * Build a root vdev nvlist with t top-level children, each produced by
 * make_vdev_mirror(..., r, m).
 *
 * Every child gets ZPOOL_CONFIG_IS_LOG set according to whether class is
 * "log"; a non-empty class is also stored as the child's allocation bias.
 * The child array is stored under the name given by aux when aux is
 * non-NULL, and under ZPOOL_CONFIG_CHILDREN otherwise.  t must be > 0.
 */
static nvlist_t *
make_vdev_root(const char *path, const char *aux, const char *pool, size_t size,
    uint64_t ashift, const char *class, int r, int m, int t)
{
	nvlist_t *root, **child;
	int c;
	boolean_t log;

	ASSERT3S(t, >, 0);

	log = (class != NULL && strcmp(class, "log") == 0);

	child = umem_alloc(t * sizeof (nvlist_t *), UMEM_NOFAIL);

	for (c = 0; c < t; c++) {
		child[c] = make_vdev_mirror(path, aux, pool, size, ashift,
		    r, m);
		fnvlist_add_uint64(child[c], ZPOOL_CONFIG_IS_LOG, log);

		if (class != NULL && class[0] != '\0') {
			ASSERT(m > 1 || log);   /* expecting a mirror */
			fnvlist_add_string(child[c],
			    ZPOOL_CONFIG_ALLOCATION_BIAS, class);
		}
	}

	root = fnvlist_alloc();
	fnvlist_add_string(root, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT);
	fnvlist_add_nvlist_array(root, aux ? aux : ZPOOL_CONFIG_CHILDREN,
	    (const nvlist_t **)child, t);

	for (c = 0; c < t; c++)
		fnvlist_free(child[c]);

	umem_free(child, t * sizeof (nvlist_t *));

	return (root);
}

/*
 * Find a random spa version. Returns back a random spa version in the
 * range [initial_version, SPA_VERSION_FEATURES].
 */
static uint64_t
ztest_random_spa_version(uint64_t initial_version)
{
	uint64_t version = initial_version;

	if (version <= SPA_VERSION_BEFORE_FEATURES) {
		version = version +
		    ztest_random(SPA_VERSION_BEFORE_FEATURES - version + 1);
	}

	/* Anything past the last legacy version is the features version. */
	if (version > SPA_VERSION_BEFORE_FEATURES)
		version = SPA_VERSION_FEATURES;

	ASSERT(SPA_VERSION_IS_SUPPORTED(version));
	return (version);
}

/*
 * Return a random power-of-two block size, computed as
 * 1 << (SPA_MINBLOCKSHIFT + k) with k drawn from
 * [0, maxbs - spa_max_ashift].
 */
static int
ztest_random_blocksize(void)
{
	ASSERT3U(ztest_spa->spa_max_ashift, !=, 0);

	/*
	 * Choose a block size >= the ashift.
	 * If the SPA supports new MAXBLOCKSIZE, test up to 1MB blocks.
	 */
	int maxbs = SPA_OLD_MAXBLOCKSHIFT;
	if (spa_maxblocksize(ztest_spa) == SPA_MAXBLOCKSIZE)
		maxbs = 20;
	uint64_t block_shift =
	    ztest_random(maxbs - ztest_spa->spa_max_ashift + 1);
	return (1 << (SPA_MINBLOCKSHIFT + block_shift));
}

/*
 * Return a random dnode size in bytes (a slot count shifted by
 * DNODE_SHIFT).  If the pool only supports the minimum slot count,
 * DNODE_MIN_SIZE is returned.
 */
static int
ztest_random_dnodesize(void)
{
	int slots;
	int max_slots = spa_maxdnodesize(ztest_spa) >> DNODE_SHIFT;

	if (max_slots == DNODE_MIN_SLOTS)
		return (DNODE_MIN_SIZE);

	/*
	 * Weight the random distribution more heavily toward smaller
	 * dnode sizes since that is more likely to reflect real-world
	 * usage.
*/
	ASSERT3U(max_slots, >, 4);
	/* 1 in 10: large (5+ slots); 4 in 10: 2-4 slots; else 1 slot. */
	switch (ztest_random(10)) {
	case 0:
		slots = 5 + ztest_random(max_slots - 4);
		break;
	case 1 ... 4:
		slots = 2 + ztest_random(3);
		break;
	default:
		slots = 1;
		break;
	}

	return (slots << DNODE_SHIFT);
}

/*
 * Return a random indirect block shift in
 * [DN_MIN_INDBLKSHIFT, DN_MAX_INDBLKSHIFT].
 */
static int
ztest_random_ibshift(void)
{
	return (DN_MIN_INDBLKSHIFT +
	    ztest_random(DN_MAX_INDBLKSHIFT - DN_MIN_INDBLKSHIFT + 1));
}

/*
 * Return the index of a randomly chosen top-level vdev of spa.
 *
 * Candidates are re-drawn until one is concrete, has a metaslab group
 * with a class, and, unless log_ok is set, is not a log device.  The
 * caller must hold the spa config lock (asserted).
 */
static uint64_t
ztest_random_vdev_top(spa_t *spa, boolean_t log_ok)
{
	uint64_t top;
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *tvd;

	ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);

	do {
		top = ztest_random(rvd->vdev_children);
		tvd = rvd->vdev_child[top];
	} while (!vdev_is_concrete(tvd) || (tvd->vdev_islog && !log_ok) ||
	    tvd->vdev_mg == NULL || tvd->vdev_mg->mg_class == NULL);

	return (top);
}

/*
 * Return a random value for the given dataset property.  For the
 * checksum property, ZIO_CHECKSUM_OFF is never returned.
 */
static uint64_t
ztest_random_dsl_prop(zfs_prop_t prop)
{
	uint64_t value;

	do {
		value = zfs_prop_random_value(prop, ztest_random(-1ULL));
	} while (prop == ZFS_PROP_CHECKSUM && value == ZIO_CHECKSUM_OFF);

	return (value);
}

/*
 * Set an integer dataset property on osname, using source ZPROP_SRC_NONE
 * when inherit is set and ZPROP_SRC_LOCAL otherwise, then read the
 * effective value back (printing it at verbosity >= 6).
 *
 * Returns ENOSPC (after recording it) if the set ran out of space; any
 * other failure trips ASSERT0, so otherwise the return value is 0.
 */
static int
ztest_dsl_prop_set_uint64(char *osname, zfs_prop_t prop, uint64_t value,
    boolean_t inherit)
{
	const char *propname = zfs_prop_to_name(prop);
	const char *valname;
	char *setpoint;
	uint64_t curval;
	int error;

	error = dsl_prop_set_int(osname, propname,
	    (inherit ?
ZPROP_SRC_NONE : ZPROP_SRC_LOCAL), value); 1489 1490 if (error == ENOSPC) { 1491 ztest_record_enospc(FTAG); 1492 return (error); 1493 } 1494 ASSERT0(error); 1495 1496 setpoint = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); 1497 VERIFY0(dsl_prop_get_integer(osname, propname, &curval, setpoint)); 1498 1499 if (ztest_opts.zo_verbose >= 6) { 1500 int err; 1501 1502 err = zfs_prop_index_to_string(prop, curval, &valname); 1503 if (err) 1504 (void) printf("%s %s = %llu at '%s'\n", osname, 1505 propname, (unsigned long long)curval, setpoint); 1506 else 1507 (void) printf("%s %s = %s at '%s'\n", 1508 osname, propname, valname, setpoint); 1509 } 1510 umem_free(setpoint, MAXPATHLEN); 1511 1512 return (error); 1513 } 1514 1515 static int 1516 ztest_spa_prop_set_uint64(zpool_prop_t prop, uint64_t value) 1517 { 1518 spa_t *spa = ztest_spa; 1519 nvlist_t *props = NULL; 1520 int error; 1521 1522 props = fnvlist_alloc(); 1523 fnvlist_add_uint64(props, zpool_prop_to_name(prop), value); 1524 1525 error = spa_prop_set(spa, props); 1526 1527 fnvlist_free(props); 1528 1529 if (error == ENOSPC) { 1530 ztest_record_enospc(FTAG); 1531 return (error); 1532 } 1533 ASSERT0(error); 1534 1535 return (error); 1536 } 1537 1538 static int 1539 ztest_dmu_objset_own(const char *name, dmu_objset_type_t type, 1540 boolean_t readonly, boolean_t decrypt, const void *tag, objset_t **osp) 1541 { 1542 int err; 1543 char *cp = NULL; 1544 char ddname[ZFS_MAX_DATASET_NAME_LEN]; 1545 1546 strcpy(ddname, name); 1547 cp = strchr(ddname, '@'); 1548 if (cp != NULL) 1549 *cp = '\0'; 1550 1551 err = dmu_objset_own(name, type, readonly, decrypt, tag, osp); 1552 while (decrypt && err == EACCES) { 1553 dsl_crypto_params_t *dcp; 1554 nvlist_t *crypto_args = fnvlist_alloc(); 1555 1556 fnvlist_add_uint8_array(crypto_args, "wkeydata", 1557 (uint8_t *)ztest_wkeydata, WRAPPING_KEY_LEN); 1558 VERIFY0(dsl_crypto_params_create_nvlist(DCP_CMD_NONE, NULL, 1559 crypto_args, &dcp)); 1560 err = spa_keystore_load_wkey(ddname, dcp, B_FALSE); 
1561 /* 1562 * Note: if there was an error loading, the wkey was not 1563 * consumed, and needs to be freed. 1564 */ 1565 dsl_crypto_params_free(dcp, (err != 0)); 1566 fnvlist_free(crypto_args); 1567 1568 if (err == EINVAL) { 1569 /* 1570 * We couldn't load a key for this dataset so try 1571 * the parent. This loop will eventually hit the 1572 * encryption root since ztest only makes clones 1573 * as children of their origin datasets. 1574 */ 1575 cp = strrchr(ddname, '/'); 1576 if (cp == NULL) 1577 return (err); 1578 1579 *cp = '\0'; 1580 err = EACCES; 1581 continue; 1582 } else if (err != 0) { 1583 break; 1584 } 1585 1586 err = dmu_objset_own(name, type, readonly, decrypt, tag, osp); 1587 break; 1588 } 1589 1590 return (err); 1591 } 1592 1593 static void 1594 ztest_rll_init(rll_t *rll) 1595 { 1596 rll->rll_writer = NULL; 1597 rll->rll_readers = 0; 1598 mutex_init(&rll->rll_lock, NULL, MUTEX_DEFAULT, NULL); 1599 cv_init(&rll->rll_cv, NULL, CV_DEFAULT, NULL); 1600 } 1601 1602 static void 1603 ztest_rll_destroy(rll_t *rll) 1604 { 1605 ASSERT3P(rll->rll_writer, ==, NULL); 1606 ASSERT0(rll->rll_readers); 1607 mutex_destroy(&rll->rll_lock); 1608 cv_destroy(&rll->rll_cv); 1609 } 1610 1611 static void 1612 ztest_rll_lock(rll_t *rll, rl_type_t type) 1613 { 1614 mutex_enter(&rll->rll_lock); 1615 1616 if (type == RL_READER) { 1617 while (rll->rll_writer != NULL) 1618 (void) cv_wait(&rll->rll_cv, &rll->rll_lock); 1619 rll->rll_readers++; 1620 } else { 1621 while (rll->rll_writer != NULL || rll->rll_readers) 1622 (void) cv_wait(&rll->rll_cv, &rll->rll_lock); 1623 rll->rll_writer = curthread; 1624 } 1625 1626 mutex_exit(&rll->rll_lock); 1627 } 1628 1629 static void 1630 ztest_rll_unlock(rll_t *rll) 1631 { 1632 mutex_enter(&rll->rll_lock); 1633 1634 if (rll->rll_writer) { 1635 ASSERT0(rll->rll_readers); 1636 rll->rll_writer = NULL; 1637 } else { 1638 ASSERT3S(rll->rll_readers, >, 0); 1639 ASSERT3P(rll->rll_writer, ==, NULL); 1640 rll->rll_readers--; 1641 } 1642 1643 if 
(rll->rll_writer == NULL && rll->rll_readers == 0)
		cv_broadcast(&rll->rll_cv);

	mutex_exit(&rll->rll_lock);
}

/*
 * Take the per-dataset object lock for object.  Objects are mapped onto
 * the zd_object_lock array by masking with (ZTEST_OBJECT_LOCKS - 1), so
 * different objects may share a lock.
 */
static void
ztest_object_lock(ztest_ds_t *zd, uint64_t object, rl_type_t type)
{
	rll_t *rll = &zd->zd_object_lock[object & (ZTEST_OBJECT_LOCKS - 1)];

	ztest_rll_lock(rll, type);
}

/* Release the object lock taken by ztest_object_lock() for object. */
static void
ztest_object_unlock(ztest_ds_t *zd, uint64_t object)
{
	rll_t *rll = &zd->zd_object_lock[object & (ZTEST_OBJECT_LOCKS - 1)];

	ztest_rll_unlock(rll);
}

/*
 * Lock a range of an object and return a handle for ztest_range_unlock().
 *
 * The (object, offset) pair is hashed onto one entry of zd_range_lock and
 * that whole entry is locked; size is only recorded in the returned rl_t.
 */
static rl_t *
ztest_range_lock(ztest_ds_t *zd, uint64_t object, uint64_t offset,
    uint64_t size, rl_type_t type)
{
	uint64_t hash = object ^ (offset % (ZTEST_RANGE_LOCKS + 1));
	rll_t *rll = &zd->zd_range_lock[hash & (ZTEST_RANGE_LOCKS - 1)];
	rl_t *rl;

	rl = umem_alloc(sizeof (*rl), UMEM_NOFAIL);
	rl->rl_object = object;
	rl->rl_offset = offset;
	rl->rl_size = size;
	rl->rl_lock = rll;

	ztest_rll_lock(rll, type);

	return (rl);
}

/* Release a range lock and free the handle from ztest_range_lock(). */
static void
ztest_range_unlock(rl_t *rl)
{
	rll_t *rll = rl->rl_lock;

	ztest_rll_unlock(rll);

	umem_free(rl, sizeof (*rl));
}

/*
 * Initialize the per-dataset state zd for objset os: cache the objset,
 * its ZIL and name, reset the shared sequence number (when szd is given)
 * and initialize all of the dataset's locks.
 */
static void
ztest_zd_init(ztest_ds_t *zd, ztest_shared_ds_t *szd, objset_t *os)
{
	zd->zd_os = os;
	zd->zd_zilog = dmu_objset_zil(os);
	zd->zd_shared = szd;
	dmu_objset_name(os, zd->zd_name);
	int l;

	if (zd->zd_shared != NULL)
		zd->zd_shared->zd_seq = 0;

	VERIFY0(pthread_rwlock_init(&zd->zd_zilog_lock, NULL));
	mutex_init(&zd->zd_dirobj_lock, NULL, MUTEX_DEFAULT, NULL);

	for (l = 0; l < ZTEST_OBJECT_LOCKS; l++)
		ztest_rll_init(&zd->zd_object_lock[l]);

	for (l = 0; l < ZTEST_RANGE_LOCKS; l++)
		ztest_rll_init(&zd->zd_range_lock[l]);
}

/* Destroy the locks set up by ztest_zd_init(). */
static void
ztest_zd_fini(ztest_ds_t *zd)
{
	int l;

	mutex_destroy(&zd->zd_dirobj_lock);
	(void) pthread_rwlock_destroy(&zd->zd_zilog_lock);

	for (l = 0; l < ZTEST_OBJECT_LOCKS; l++)
		ztest_rll_destroy(&zd->zd_object_lock[l]);

	for (l = 0; l < ZTEST_RANGE_LOCKS; l++)
		ztest_rll_destroy(&zd->zd_range_lock[l]);
}

/* Evaluates to TXG_NOWAIT one time in ten and to TXG_WAIT otherwise. */
#define	TXG_MIGHTWAIT	(ztest_random(10) == 0 ? TXG_NOWAIT : TXG_WAIT)

/*
 * Assign tx to a transaction group.
 *
 * Returns the (non-zero) txg on success.  On failure the tx is aborted
 * and 0 is returned: ERESTART (only expected with TXG_NOWAIT) waits via
 * dmu_tx_wait() first; the only other expected error is ENOSPC, which is
 * recorded against tag.
 */
static uint64_t
ztest_tx_assign(dmu_tx_t *tx, uint64_t txg_how, const char *tag)
{
	uint64_t txg;
	int error;

	/*
	 * Attempt to assign tx to some transaction group.
	 */
	error = dmu_tx_assign(tx, txg_how);
	if (error) {
		if (error == ERESTART) {
			ASSERT3U(txg_how, ==, TXG_NOWAIT);
			dmu_tx_wait(tx);
		} else {
			ASSERT3U(error, ==, ENOSPC);
			ztest_record_enospc(tag);
		}
		dmu_tx_abort(tx);
		return (0);
	}
	txg = dmu_tx_get_txg(tx);
	ASSERT3U(txg, !=, 0);
	return (txg);
}

/* Fill in every field of the block tag bt from the arguments. */
static void
ztest_bt_generate(ztest_block_tag_t *bt, objset_t *os, uint64_t object,
    uint64_t dnodesize, uint64_t offset, uint64_t gen, uint64_t txg,
    uint64_t crtxg)
{
	bt->bt_magic = BT_MAGIC;
	bt->bt_objset = dmu_objset_id(os);
	bt->bt_object = object;
	bt->bt_dnodesize = dnodesize;
	bt->bt_offset = offset;
	bt->bt_gen = gen;
	bt->bt_txg = txg;
	bt->bt_crtxg = crtxg;
}

/*
 * Assert that the block tag bt is consistent with the arguments.  The
 * identity fields must match exactly; bt_gen and bt_txg may be older
 * than (but not newer than) gen and txg.
 */
static void
ztest_bt_verify(ztest_block_tag_t *bt, objset_t *os, uint64_t object,
    uint64_t dnodesize, uint64_t offset, uint64_t gen, uint64_t txg,
    uint64_t crtxg)
{
	ASSERT3U(bt->bt_magic, ==, BT_MAGIC);
	ASSERT3U(bt->bt_objset, ==, dmu_objset_id(os));
	ASSERT3U(bt->bt_object, ==, object);
	ASSERT3U(bt->bt_dnodesize, ==, dnodesize);
	ASSERT3U(bt->bt_offset, ==, offset);
	ASSERT3U(bt->bt_gen, <=, gen);
	ASSERT3U(bt->bt_txg, <=, txg);
	ASSERT3U(bt->bt_crtxg, ==, crtxg);
}

/*
 * Return a pointer to the block tag stored in the last sizeof (*bt)
 * bytes of the bonus buffer db.
 */
static ztest_block_tag_t *
ztest_bt_bonus(dmu_buf_t *db)
{
dmu_object_info_t doi;
	ztest_block_tag_t *bt;

	dmu_object_info_from_db(db, &doi);
	ASSERT3U(doi.doi_bonus_size, <=, db->db_size);
	ASSERT3U(doi.doi_bonus_size, >=, sizeof (*bt));
	/* The tag occupies the tail end of the in-use bonus area. */
	bt = (void *)((char *)db->db_data + doi.doi_bonus_size - sizeof (*bt));

	return (bt);
}

/*
 * Generate a token to fill up unused bonus buffer space.  Try to make
 * it unique to the object, generation, and offset to verify that data
 * is not getting overwritten by data from other dnodes.
 */
#define	ZTEST_BONUS_FILL_TOKEN(obj, ds, gen, offset)	\
	(((ds) << 48) | ((gen) << 32) | ((obj) << 8) | (offset))

/*
 * Fill up the unused bonus buffer region before the block tag with a
 * verifiable pattern.  Filling the whole bonus area with non-zero data
 * helps ensure that all dnode traversal code properly skips the
 * interior regions of large dnodes.
 */
static void
ztest_fill_unused_bonus(dmu_buf_t *db, void *end, uint64_t obj,
    objset_t *os, uint64_t gen)
{
	uint64_t *bonusp;

	ASSERT(IS_P2ALIGNED((char *)end - (char *)db->db_data, 8));

	/* One token per 64-bit word; the word index is the token offset. */
	for (bonusp = db->db_data; bonusp < (uint64_t *)end; bonusp++) {
		uint64_t token = ZTEST_BONUS_FILL_TOKEN(obj, dmu_objset_id(os),
		    gen, bonusp - (uint64_t *)db->db_data);
		*bonusp = token;
	}
}

/*
 * Verify that the unused area of a bonus buffer is filled with the
 * expected tokens.
 */
static void
ztest_verify_unused_bonus(dmu_buf_t *db, void *end, uint64_t obj,
    objset_t *os, uint64_t gen)
{
	uint64_t *bonusp;

	for (bonusp = db->db_data; bonusp < (uint64_t *)end; bonusp++) {
		uint64_t token = ZTEST_BONUS_FILL_TOKEN(obj, dmu_objset_id(os),
		    gen, bonusp - (uint64_t *)db->db_data);
		VERIFY3U(*bonusp, ==, token);
	}
}

/*
 * ZIL logging ops
 */

/* ztest-specific names for fields of the create log record. */
#define	lrz_type	lr_mode
#define	lrz_blocksize	lr_uid
#define	lrz_ibshift	lr_gid
#define	lrz_bonustype	lr_rdev
#define	lrz_dnodesize	lr_crtime[1]

/*
 * Log a TX_CREATE record for lr (and the name that follows it) to the
 * dataset's ZIL.  Does nothing while the ZIL is being replayed.
 */
static void
ztest_log_create(ztest_ds_t *zd, dmu_tx_t *tx, lr_create_t *lr)
{
	char *name = (void *)(lr + 1);		/* name follows lr */
	size_t namesize = strlen(name) + 1;
	itx_t *itx;

	if (zil_replaying(zd->zd_zilog, tx))
		return;

	itx = zil_itx_create(TX_CREATE, sizeof (*lr) + namesize);
	/* Copy everything after the common lr_t header, name included. */
	memcpy(&itx->itx_lr + 1, &lr->lr_common + 1,
	    sizeof (*lr) + namesize - sizeof (lr_t));

	zil_itx_assign(zd->zd_zilog, itx, tx);
}

/*
 * Log a TX_REMOVE record for lr (and the name that follows it), tagging
 * the itx with the removed object's id.  Does nothing during replay.
 */
static void
ztest_log_remove(ztest_ds_t *zd, dmu_tx_t *tx, lr_remove_t *lr, uint64_t object)
{
	char *name = (void *)(lr + 1);		/* name follows lr */
	size_t namesize = strlen(name) + 1;
	itx_t *itx;

	if (zil_replaying(zd->zd_zilog, tx))
		return;

	itx = zil_itx_create(TX_REMOVE, sizeof (*lr) + namesize);
	memcpy(&itx->itx_lr + 1, &lr->lr_common + 1,
	    sizeof (*lr) + namesize - sizeof (lr_t));

	itx->itx_oid = object;
	zil_itx_assign(zd->zd_zilog, itx, tx);
}

/*
 * Log a TX_WRITE record for lr using a randomly chosen write state.
 * Writes longer than zil_max_log_data() are forced to WR_INDIRECT.  For
 * WR_COPIED the data is read back from the object into the itx; if that
 * read fails the record falls back to WR_NEED_COPY.  Does nothing during
 * replay.
 */
static void
ztest_log_write(ztest_ds_t *zd, dmu_tx_t *tx, lr_write_t *lr)
{
	itx_t *itx;
	itx_wr_state_t write_state = ztest_random(WR_NUM_STATES);

	if (zil_replaying(zd->zd_zilog, tx))
		return;

	if (lr->lr_length > zil_max_log_data(zd->zd_zilog))
		write_state = WR_INDIRECT;

	itx =
zil_itx_create(TX_WRITE,
	    sizeof (*lr) + (write_state == WR_COPIED ? lr->lr_length : 0));

	if (write_state == WR_COPIED &&
	    dmu_read(zd->zd_os, lr->lr_foid, lr->lr_offset, lr->lr_length,
	    ((lr_write_t *)&itx->itx_lr) + 1, DMU_READ_NO_PREFETCH) != 0) {
		zil_itx_destroy(itx);
		itx = zil_itx_create(TX_WRITE, sizeof (*lr));
		write_state = WR_NEED_COPY;
	}
	itx->itx_private = zd;
	itx->itx_wr_state = write_state;
	itx->itx_sync = (ztest_random(8) == 0);

	memcpy(&itx->itx_lr + 1, &lr->lr_common + 1,
	    sizeof (*lr) - sizeof (lr_t));

	zil_itx_assign(zd->zd_zilog, itx, tx);
}

/*
 * Log a TX_TRUNCATE record for lr as an asynchronous itx.  Does nothing
 * during replay.
 */
static void
ztest_log_truncate(ztest_ds_t *zd, dmu_tx_t *tx, lr_truncate_t *lr)
{
	itx_t *itx;

	if (zil_replaying(zd->zd_zilog, tx))
		return;

	itx = zil_itx_create(TX_TRUNCATE, sizeof (*lr));
	memcpy(&itx->itx_lr + 1, &lr->lr_common + 1,
	    sizeof (*lr) - sizeof (lr_t));

	itx->itx_sync = B_FALSE;
	zil_itx_assign(zd->zd_zilog, itx, tx);
}

/*
 * Log a TX_SETATTR record for lr as an asynchronous itx.  Does nothing
 * during replay.
 */
static void
ztest_log_setattr(ztest_ds_t *zd, dmu_tx_t *tx, lr_setattr_t *lr)
{
	itx_t *itx;

	if (zil_replaying(zd->zd_zilog, tx))
		return;

	itx = zil_itx_create(TX_SETATTR, sizeof (*lr));
	memcpy(&itx->itx_lr + 1, &lr->lr_common + 1,
	    sizeof (*lr) - sizeof (lr_t));

	itx->itx_sync = B_FALSE;
	zil_itx_assign(zd->zd_zilog, itx, tx);
}

/*
 * ZIL replay ops
 */

/*
 * Create (or, when lr_foid is already set, claim) the object described by
 * lr, stamp its bonus buffer with a block tag and fill pattern, add it to
 * the directory ZAP under the name that follows lr, and log the create.
 *
 * arg1 is the ztest_ds_t, arg2 the lr_create_t.  Returns 0 on success,
 * ENOSPC if the tx could not be assigned, or the claim error (expected
 * to be EEXIST, during replay only).
 */
static int
ztest_replay_create(void *arg1, void *arg2, boolean_t byteswap)
{
	ztest_ds_t *zd = arg1;
	lr_create_t *lr = arg2;
	char *name = (void *)(lr + 1);		/* name follows lr */
	objset_t *os = zd->zd_os;
	ztest_block_tag_t *bbt;
	dmu_buf_t *db;
	dmu_tx_t *tx;
	uint64_t txg;
	int error = 0;
	int bonuslen;

	if (byteswap)
		byteswap_uint64_array(lr, sizeof (*lr));

	ASSERT3U(lr->lr_doid, ==, ZTEST_DIROBJ);
	ASSERT3S(name[0], !=, '\0');

	tx = dmu_tx_create(os);

	dmu_tx_hold_zap(tx, lr->lr_doid, B_TRUE, name);

	if (lr->lrz_type == DMU_OT_ZAP_OTHER) {
		dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
	} else {
		dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
	}

	txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
	if (txg == 0)
		return (ENOSPC);

	/* A preset object id is expected exactly when replaying. */
	ASSERT3U(dmu_objset_zil(os)->zl_replay, ==, !!lr->lr_foid);
	bonuslen = DN_BONUS_SIZE(lr->lrz_dnodesize);

	if (lr->lrz_type == DMU_OT_ZAP_OTHER) {
		if (lr->lr_foid == 0) {
			lr->lr_foid = zap_create_dnsize(os,
			    lr->lrz_type, lr->lrz_bonustype,
			    bonuslen, lr->lrz_dnodesize, tx);
		} else {
			error = zap_create_claim_dnsize(os, lr->lr_foid,
			    lr->lrz_type, lr->lrz_bonustype,
			    bonuslen, lr->lrz_dnodesize, tx);
		}
	} else {
		if (lr->lr_foid == 0) {
			lr->lr_foid = dmu_object_alloc_dnsize(os,
			    lr->lrz_type, 0, lr->lrz_bonustype,
			    bonuslen, lr->lrz_dnodesize, tx);
		} else {
			error = dmu_object_claim_dnsize(os, lr->lr_foid,
			    lr->lrz_type, 0, lr->lrz_bonustype,
			    bonuslen, lr->lrz_dnodesize, tx);
		}
	}

	if (error) {
		ASSERT3U(error, ==, EEXIST);
		ASSERT(zd->zd_zilog->zl_replay);
		dmu_tx_commit(tx);
		return (error);
	}

	ASSERT3U(lr->lr_foid, !=, 0);

	if (lr->lrz_type != DMU_OT_ZAP_OTHER)
		VERIFY0(dmu_object_set_blocksize(os, lr->lr_foid,
		    lr->lrz_blocksize, lr->lrz_ibshift, tx));

	VERIFY0(dmu_bonus_hold(os, lr->lr_foid, FTAG, &db));
	bbt = ztest_bt_bonus(db);
	dmu_buf_will_dirty(db, tx);
	ztest_bt_generate(bbt, os, lr->lr_foid, lr->lrz_dnodesize, -1ULL,
	    lr->lr_gen, txg, txg);
	ztest_fill_unused_bonus(db, bbt, lr->lr_foid, os, lr->lr_gen);
	dmu_buf_rele(db, FTAG);

	VERIFY0(zap_add(os, lr->lr_doid, name, sizeof (uint64_t), 1,
	    &lr->lr_foid, tx));

	(void) ztest_log_create(zd, tx, lr);

	dmu_tx_commit(tx);

	return (0);
}

/*
 * Remove the object named by the string following lr from the directory
 * ZAP and destroy it, under the object's writer lock, then log the
 * remove.
 *
 * arg1 is the ztest_ds_t, arg2 the lr_remove_t.  Returns 0 on success or
 * ENOSPC if the tx could not be assigned.
 */
static int
ztest_replay_remove(void *arg1, void *arg2, boolean_t byteswap)
{
	ztest_ds_t *zd = arg1;
	lr_remove_t *lr = arg2;
	char *name = (void *)(lr + 1);		/* name follows lr */
	objset_t *os = zd->zd_os;
	dmu_object_info_t doi;
	dmu_tx_t *tx;
	uint64_t object, txg;

	if (byteswap)
		byteswap_uint64_array(lr, sizeof (*lr));

	ASSERT3U(lr->lr_doid, ==, ZTEST_DIROBJ);
	ASSERT3S(name[0], !=, '\0');

	VERIFY0(
	    zap_lookup(os, lr->lr_doid, name, sizeof (object), 1, &object));
	ASSERT3U(object, !=, 0);

	ztest_object_lock(zd, object, RL_WRITER);

	VERIFY0(dmu_object_info(os, object, &doi));

	tx = dmu_tx_create(os);

	dmu_tx_hold_zap(tx, lr->lr_doid, B_FALSE, name);
	dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END);

	txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
	if (txg == 0) {
		ztest_object_unlock(zd, object);
		return (ENOSPC);
	}

	if (doi.doi_type == DMU_OT_ZAP_OTHER) {
		VERIFY0(zap_destroy(os, object, tx));
	} else {
		VERIFY0(dmu_object_free(os, object, tx));
	}

	VERIFY0(zap_remove(os, lr->lr_doid, name, tx));

	(void) ztest_log_remove(zd, tx, lr, object);

	dmu_tx_commit(tx);

	ztest_object_unlock(zd, object);

	return (0);
}

/*
 * Write the data that follows lr to the object lr_foid, verifying and
 * regenerating the embedded block tag when the data starts with one, and
 * log the write.
 *
 * arg1 is the ztest_ds_t, arg2 the lr_write_t.  Returns 0 on success or
 * ENOSPC if the tx could not be assigned.
 */
static int
ztest_replay_write(void *arg1, void *arg2, boolean_t byteswap)
{
	ztest_ds_t *zd = arg1;
	lr_write_t *lr = arg2;
	objset_t *os = zd->zd_os;
	void *data = lr + 1;			/* data follows lr */
	uint64_t offset, length;
	ztest_block_tag_t *bt = data;
	ztest_block_tag_t *bbt;
	uint64_t gen, txg, lrtxg, crtxg;
	dmu_object_info_t doi;
	dmu_tx_t *tx;
	dmu_buf_t *db;
	arc_buf_t *abuf = NULL;
	rl_t *rl;

	if (byteswap)
		byteswap_uint64_array(lr, sizeof (*lr));

	offset = lr->lr_offset;
	length =
lr->lr_length;

	/* If it's a dmu_sync() block, write the whole block */
	if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) {
		uint64_t blocksize = BP_GET_LSIZE(&lr->lr_blkptr);
		if (length < blocksize) {
			offset -= offset % blocksize;
			length = blocksize;
		}
	}

	/* The payload may begin with a block tag in either byte order. */
	if (bt->bt_magic == BSWAP_64(BT_MAGIC))
		byteswap_uint64_array(bt, sizeof (*bt));

	if (bt->bt_magic != BT_MAGIC)
		bt = NULL;

	ztest_object_lock(zd, lr->lr_foid, RL_READER);
	rl = ztest_range_lock(zd, lr->lr_foid, offset, length, RL_WRITER);

	VERIFY0(dmu_bonus_hold(os, lr->lr_foid, FTAG, &db));

	dmu_object_info_from_db(db, &doi);

	bbt = ztest_bt_bonus(db);
	ASSERT3U(bbt->bt_magic, ==, BT_MAGIC);
	gen = bbt->bt_gen;
	crtxg = bbt->bt_crtxg;
	lrtxg = lr->lr_common.lrc_txg;

	tx = dmu_tx_create(os);

	dmu_tx_hold_write(tx, lr->lr_foid, offset, length);

	/*
	 * Occasionally, for an aligned write of exactly one data block,
	 * borrow an ARC buffer so the write goes through the
	 * dmu_assign_arcbuf_by_dbuf() path below instead of dmu_write().
	 */
	if (ztest_random(8) == 0 && length == doi.doi_data_block_size &&
	    P2PHASE(offset, length) == 0)
		abuf = dmu_request_arcbuf(db, length);

	txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
	if (txg == 0) {
		/* Undo everything acquired so far, in reverse order. */
		if (abuf != NULL)
			dmu_return_arcbuf(abuf);
		dmu_buf_rele(db, FTAG);
		ztest_range_unlock(rl);
		ztest_object_unlock(zd, lr->lr_foid);
		return (ENOSPC);
	}

	if (bt != NULL) {
		/*
		 * Usually, verify the old data before writing new data --
		 * but not always, because we also want to verify correct
		 * behavior when the data was not recently read into cache.
		 */
		ASSERT0(offset % doi.doi_data_block_size);
		if (ztest_random(4) != 0) {
			int prefetch = ztest_random(2) ?
			    DMU_READ_PREFETCH : DMU_READ_NO_PREFETCH;
			ztest_block_tag_t rbt;

			VERIFY(dmu_read(os, lr->lr_foid, offset,
			    sizeof (rbt), &rbt, prefetch) == 0);
			if (rbt.bt_magic == BT_MAGIC) {
				ztest_bt_verify(&rbt, os, lr->lr_foid, 0,
				    offset, gen, txg, crtxg);
			}
		}

		/*
		 * Writes can appear to be newer than the bonus buffer because
		 * the ztest_get_data() callback does a dmu_read() of the
		 * open-context data, which may be different than the data
		 * as it was when the write was generated.
		 */
		if (zd->zd_zilog->zl_replay) {
			ztest_bt_verify(bt, os, lr->lr_foid, 0, offset,
			    MAX(gen, bt->bt_gen), MAX(txg, lrtxg),
			    bt->bt_crtxg);
		}

		/*
		 * Set the bt's gen/txg to the bonus buffer's gen/txg
		 * so that all of the usual ASSERTs will work.
		 */
		ztest_bt_generate(bt, os, lr->lr_foid, 0, offset, gen, txg,
		    crtxg);
	}

	if (abuf == NULL) {
		dmu_write(os, lr->lr_foid, offset, length, data, tx);
	} else {
		memcpy(abuf->b_data, data, length);
		dmu_assign_arcbuf_by_dbuf(db, offset, abuf, tx);
	}

	(void) ztest_log_write(zd, tx, lr);

	dmu_buf_rele(db, FTAG);

	dmu_tx_commit(tx);

	ztest_range_unlock(rl);
	ztest_object_unlock(zd, lr->lr_foid);

	return (0);
}

/*
 * Free the range [lr_offset, lr_offset + lr_length) of object lr_foid
 * and log the truncate.
 *
 * arg1 is the ztest_ds_t, arg2 the lr_truncate_t.  Returns 0 on success
 * or ENOSPC if the tx could not be assigned.
 */
static int
ztest_replay_truncate(void *arg1, void *arg2, boolean_t byteswap)
{
	ztest_ds_t *zd = arg1;
	lr_truncate_t *lr = arg2;
	objset_t *os = zd->zd_os;
	dmu_tx_t *tx;
	uint64_t txg;
	rl_t *rl;

	if (byteswap)
		byteswap_uint64_array(lr, sizeof (*lr));

	ztest_object_lock(zd, lr->lr_foid, RL_READER);
	rl = ztest_range_lock(zd, lr->lr_foid, lr->lr_offset, lr->lr_length,
	    RL_WRITER);

	tx = dmu_tx_create(os);

	dmu_tx_hold_free(tx, lr->lr_foid, lr->lr_offset, lr->lr_length);

	txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
	if (txg == 0) {
ztest_range_unlock(rl); 2256 ztest_object_unlock(zd, lr->lr_foid); 2257 return (ENOSPC); 2258 } 2259 2260 VERIFY0(dmu_free_range(os, lr->lr_foid, lr->lr_offset, 2261 lr->lr_length, tx)); 2262 2263 (void) ztest_log_truncate(zd, tx, lr); 2264 2265 dmu_tx_commit(tx); 2266 2267 ztest_range_unlock(rl); 2268 ztest_object_unlock(zd, lr->lr_foid); 2269 2270 return (0); 2271 } 2272 2273 static int 2274 ztest_replay_setattr(void *arg1, void *arg2, boolean_t byteswap) 2275 { 2276 ztest_ds_t *zd = arg1; 2277 lr_setattr_t *lr = arg2; 2278 objset_t *os = zd->zd_os; 2279 dmu_tx_t *tx; 2280 dmu_buf_t *db; 2281 ztest_block_tag_t *bbt; 2282 uint64_t txg, lrtxg, crtxg, dnodesize; 2283 2284 if (byteswap) 2285 byteswap_uint64_array(lr, sizeof (*lr)); 2286 2287 ztest_object_lock(zd, lr->lr_foid, RL_WRITER); 2288 2289 VERIFY0(dmu_bonus_hold(os, lr->lr_foid, FTAG, &db)); 2290 2291 tx = dmu_tx_create(os); 2292 dmu_tx_hold_bonus(tx, lr->lr_foid); 2293 2294 txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); 2295 if (txg == 0) { 2296 dmu_buf_rele(db, FTAG); 2297 ztest_object_unlock(zd, lr->lr_foid); 2298 return (ENOSPC); 2299 } 2300 2301 bbt = ztest_bt_bonus(db); 2302 ASSERT3U(bbt->bt_magic, ==, BT_MAGIC); 2303 crtxg = bbt->bt_crtxg; 2304 lrtxg = lr->lr_common.lrc_txg; 2305 dnodesize = bbt->bt_dnodesize; 2306 2307 if (zd->zd_zilog->zl_replay) { 2308 ASSERT3U(lr->lr_size, !=, 0); 2309 ASSERT3U(lr->lr_mode, !=, 0); 2310 ASSERT3U(lrtxg, !=, 0); 2311 } else { 2312 /* 2313 * Randomly change the size and increment the generation. 2314 */ 2315 lr->lr_size = (ztest_random(db->db_size / sizeof (*bbt)) + 1) * 2316 sizeof (*bbt); 2317 lr->lr_mode = bbt->bt_gen + 1; 2318 ASSERT0(lrtxg); 2319 } 2320 2321 /* 2322 * Verify that the current bonus buffer is not newer than our txg. 
2323 */ 2324 ztest_bt_verify(bbt, os, lr->lr_foid, dnodesize, -1ULL, lr->lr_mode, 2325 MAX(txg, lrtxg), crtxg); 2326 2327 dmu_buf_will_dirty(db, tx); 2328 2329 ASSERT3U(lr->lr_size, >=, sizeof (*bbt)); 2330 ASSERT3U(lr->lr_size, <=, db->db_size); 2331 VERIFY0(dmu_set_bonus(db, lr->lr_size, tx)); 2332 bbt = ztest_bt_bonus(db); 2333 2334 ztest_bt_generate(bbt, os, lr->lr_foid, dnodesize, -1ULL, lr->lr_mode, 2335 txg, crtxg); 2336 ztest_fill_unused_bonus(db, bbt, lr->lr_foid, os, bbt->bt_gen); 2337 dmu_buf_rele(db, FTAG); 2338 2339 (void) ztest_log_setattr(zd, tx, lr); 2340 2341 dmu_tx_commit(tx); 2342 2343 ztest_object_unlock(zd, lr->lr_foid); 2344 2345 return (0); 2346 } 2347 2348 zil_replay_func_t *ztest_replay_vector[TX_MAX_TYPE] = { 2349 NULL, /* 0 no such transaction type */ 2350 ztest_replay_create, /* TX_CREATE */ 2351 NULL, /* TX_MKDIR */ 2352 NULL, /* TX_MKXATTR */ 2353 NULL, /* TX_SYMLINK */ 2354 ztest_replay_remove, /* TX_REMOVE */ 2355 NULL, /* TX_RMDIR */ 2356 NULL, /* TX_LINK */ 2357 NULL, /* TX_RENAME */ 2358 ztest_replay_write, /* TX_WRITE */ 2359 ztest_replay_truncate, /* TX_TRUNCATE */ 2360 ztest_replay_setattr, /* TX_SETATTR */ 2361 NULL, /* TX_ACL */ 2362 NULL, /* TX_CREATE_ACL */ 2363 NULL, /* TX_CREATE_ATTR */ 2364 NULL, /* TX_CREATE_ACL_ATTR */ 2365 NULL, /* TX_MKDIR_ACL */ 2366 NULL, /* TX_MKDIR_ATTR */ 2367 NULL, /* TX_MKDIR_ACL_ATTR */ 2368 NULL, /* TX_WRITE2 */ 2369 NULL, /* TX_SETSAXATTR */ 2370 }; 2371 2372 /* 2373 * ZIL get_data callbacks 2374 */ 2375 2376 static void 2377 ztest_get_done(zgd_t *zgd, int error) 2378 { 2379 (void) error; 2380 ztest_ds_t *zd = zgd->zgd_private; 2381 uint64_t object = ((rl_t *)zgd->zgd_lr)->rl_object; 2382 2383 if (zgd->zgd_db) 2384 dmu_buf_rele(zgd->zgd_db, zgd); 2385 2386 ztest_range_unlock((rl_t *)zgd->zgd_lr); 2387 ztest_object_unlock(zd, object); 2388 2389 umem_free(zgd, sizeof (*zgd)); 2390 } 2391 2392 static int 2393 ztest_get_data(void *arg, uint64_t arg2, lr_write_t *lr, char *buf, 2394 struct lwb 
*lwb, zio_t *zio) 2395 { 2396 (void) arg2; 2397 ztest_ds_t *zd = arg; 2398 objset_t *os = zd->zd_os; 2399 uint64_t object = lr->lr_foid; 2400 uint64_t offset = lr->lr_offset; 2401 uint64_t size = lr->lr_length; 2402 uint64_t txg = lr->lr_common.lrc_txg; 2403 uint64_t crtxg; 2404 dmu_object_info_t doi; 2405 dmu_buf_t *db; 2406 zgd_t *zgd; 2407 int error; 2408 2409 ASSERT3P(lwb, !=, NULL); 2410 ASSERT3P(zio, !=, NULL); 2411 ASSERT3U(size, !=, 0); 2412 2413 ztest_object_lock(zd, object, RL_READER); 2414 error = dmu_bonus_hold(os, object, FTAG, &db); 2415 if (error) { 2416 ztest_object_unlock(zd, object); 2417 return (error); 2418 } 2419 2420 crtxg = ztest_bt_bonus(db)->bt_crtxg; 2421 2422 if (crtxg == 0 || crtxg > txg) { 2423 dmu_buf_rele(db, FTAG); 2424 ztest_object_unlock(zd, object); 2425 return (ENOENT); 2426 } 2427 2428 dmu_object_info_from_db(db, &doi); 2429 dmu_buf_rele(db, FTAG); 2430 db = NULL; 2431 2432 zgd = umem_zalloc(sizeof (*zgd), UMEM_NOFAIL); 2433 zgd->zgd_lwb = lwb; 2434 zgd->zgd_private = zd; 2435 2436 if (buf != NULL) { /* immediate write */ 2437 zgd->zgd_lr = (struct zfs_locked_range *)ztest_range_lock(zd, 2438 object, offset, size, RL_READER); 2439 2440 error = dmu_read(os, object, offset, size, buf, 2441 DMU_READ_NO_PREFETCH); 2442 ASSERT0(error); 2443 } else { 2444 size = doi.doi_data_block_size; 2445 if (ISP2(size)) { 2446 offset = P2ALIGN(offset, size); 2447 } else { 2448 ASSERT3U(offset, <, size); 2449 offset = 0; 2450 } 2451 2452 zgd->zgd_lr = (struct zfs_locked_range *)ztest_range_lock(zd, 2453 object, offset, size, RL_READER); 2454 2455 error = dmu_buf_hold(os, object, offset, zgd, &db, 2456 DMU_READ_NO_PREFETCH); 2457 2458 if (error == 0) { 2459 blkptr_t *bp = &lr->lr_blkptr; 2460 2461 zgd->zgd_db = db; 2462 zgd->zgd_bp = bp; 2463 2464 ASSERT3U(db->db_offset, ==, offset); 2465 ASSERT3U(db->db_size, ==, size); 2466 2467 error = dmu_sync(zio, lr->lr_common.lrc_txg, 2468 ztest_get_done, zgd); 2469 2470 if (error == 0) 2471 return (0); 2472 
} 2473 } 2474 2475 ztest_get_done(zgd, error); 2476 2477 return (error); 2478 } 2479 2480 static void * 2481 ztest_lr_alloc(size_t lrsize, char *name) 2482 { 2483 char *lr; 2484 size_t namesize = name ? strlen(name) + 1 : 0; 2485 2486 lr = umem_zalloc(lrsize + namesize, UMEM_NOFAIL); 2487 2488 if (name) 2489 memcpy(lr + lrsize, name, namesize); 2490 2491 return (lr); 2492 } 2493 2494 static void 2495 ztest_lr_free(void *lr, size_t lrsize, char *name) 2496 { 2497 size_t namesize = name ? strlen(name) + 1 : 0; 2498 2499 umem_free(lr, lrsize + namesize); 2500 } 2501 2502 /* 2503 * Lookup a bunch of objects. Returns the number of objects not found. 2504 */ 2505 static int 2506 ztest_lookup(ztest_ds_t *zd, ztest_od_t *od, int count) 2507 { 2508 int missing = 0; 2509 int error; 2510 int i; 2511 2512 ASSERT(MUTEX_HELD(&zd->zd_dirobj_lock)); 2513 2514 for (i = 0; i < count; i++, od++) { 2515 od->od_object = 0; 2516 error = zap_lookup(zd->zd_os, od->od_dir, od->od_name, 2517 sizeof (uint64_t), 1, &od->od_object); 2518 if (error) { 2519 ASSERT3S(error, ==, ENOENT); 2520 ASSERT0(od->od_object); 2521 missing++; 2522 } else { 2523 dmu_buf_t *db; 2524 ztest_block_tag_t *bbt; 2525 dmu_object_info_t doi; 2526 2527 ASSERT3U(od->od_object, !=, 0); 2528 ASSERT0(missing); /* there should be no gaps */ 2529 2530 ztest_object_lock(zd, od->od_object, RL_READER); 2531 VERIFY0(dmu_bonus_hold(zd->zd_os, od->od_object, 2532 FTAG, &db)); 2533 dmu_object_info_from_db(db, &doi); 2534 bbt = ztest_bt_bonus(db); 2535 ASSERT3U(bbt->bt_magic, ==, BT_MAGIC); 2536 od->od_type = doi.doi_type; 2537 od->od_blocksize = doi.doi_data_block_size; 2538 od->od_gen = bbt->bt_gen; 2539 dmu_buf_rele(db, FTAG); 2540 ztest_object_unlock(zd, od->od_object); 2541 } 2542 } 2543 2544 return (missing); 2545 } 2546 2547 static int 2548 ztest_create(ztest_ds_t *zd, ztest_od_t *od, int count) 2549 { 2550 int missing = 0; 2551 int i; 2552 2553 ASSERT(MUTEX_HELD(&zd->zd_dirobj_lock)); 2554 2555 for (i = 0; i < count; i++, 
od++) { 2556 if (missing) { 2557 od->od_object = 0; 2558 missing++; 2559 continue; 2560 } 2561 2562 lr_create_t *lr = ztest_lr_alloc(sizeof (*lr), od->od_name); 2563 2564 lr->lr_doid = od->od_dir; 2565 lr->lr_foid = 0; /* 0 to allocate, > 0 to claim */ 2566 lr->lrz_type = od->od_crtype; 2567 lr->lrz_blocksize = od->od_crblocksize; 2568 lr->lrz_ibshift = ztest_random_ibshift(); 2569 lr->lrz_bonustype = DMU_OT_UINT64_OTHER; 2570 lr->lrz_dnodesize = od->od_crdnodesize; 2571 lr->lr_gen = od->od_crgen; 2572 lr->lr_crtime[0] = time(NULL); 2573 2574 if (ztest_replay_create(zd, lr, B_FALSE) != 0) { 2575 ASSERT0(missing); 2576 od->od_object = 0; 2577 missing++; 2578 } else { 2579 od->od_object = lr->lr_foid; 2580 od->od_type = od->od_crtype; 2581 od->od_blocksize = od->od_crblocksize; 2582 od->od_gen = od->od_crgen; 2583 ASSERT3U(od->od_object, !=, 0); 2584 } 2585 2586 ztest_lr_free(lr, sizeof (*lr), od->od_name); 2587 } 2588 2589 return (missing); 2590 } 2591 2592 static int 2593 ztest_remove(ztest_ds_t *zd, ztest_od_t *od, int count) 2594 { 2595 int missing = 0; 2596 int error; 2597 int i; 2598 2599 ASSERT(MUTEX_HELD(&zd->zd_dirobj_lock)); 2600 2601 od += count - 1; 2602 2603 for (i = count - 1; i >= 0; i--, od--) { 2604 if (missing) { 2605 missing++; 2606 continue; 2607 } 2608 2609 /* 2610 * No object was found. 
2611 */ 2612 if (od->od_object == 0) 2613 continue; 2614 2615 lr_remove_t *lr = ztest_lr_alloc(sizeof (*lr), od->od_name); 2616 2617 lr->lr_doid = od->od_dir; 2618 2619 if ((error = ztest_replay_remove(zd, lr, B_FALSE)) != 0) { 2620 ASSERT3U(error, ==, ENOSPC); 2621 missing++; 2622 } else { 2623 od->od_object = 0; 2624 } 2625 ztest_lr_free(lr, sizeof (*lr), od->od_name); 2626 } 2627 2628 return (missing); 2629 } 2630 2631 static int 2632 ztest_write(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size, 2633 void *data) 2634 { 2635 lr_write_t *lr; 2636 int error; 2637 2638 lr = ztest_lr_alloc(sizeof (*lr) + size, NULL); 2639 2640 lr->lr_foid = object; 2641 lr->lr_offset = offset; 2642 lr->lr_length = size; 2643 lr->lr_blkoff = 0; 2644 BP_ZERO(&lr->lr_blkptr); 2645 2646 memcpy(lr + 1, data, size); 2647 2648 error = ztest_replay_write(zd, lr, B_FALSE); 2649 2650 ztest_lr_free(lr, sizeof (*lr) + size, NULL); 2651 2652 return (error); 2653 } 2654 2655 static int 2656 ztest_truncate(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size) 2657 { 2658 lr_truncate_t *lr; 2659 int error; 2660 2661 lr = ztest_lr_alloc(sizeof (*lr), NULL); 2662 2663 lr->lr_foid = object; 2664 lr->lr_offset = offset; 2665 lr->lr_length = size; 2666 2667 error = ztest_replay_truncate(zd, lr, B_FALSE); 2668 2669 ztest_lr_free(lr, sizeof (*lr), NULL); 2670 2671 return (error); 2672 } 2673 2674 static int 2675 ztest_setattr(ztest_ds_t *zd, uint64_t object) 2676 { 2677 lr_setattr_t *lr; 2678 int error; 2679 2680 lr = ztest_lr_alloc(sizeof (*lr), NULL); 2681 2682 lr->lr_foid = object; 2683 lr->lr_size = 0; 2684 lr->lr_mode = 0; 2685 2686 error = ztest_replay_setattr(zd, lr, B_FALSE); 2687 2688 ztest_lr_free(lr, sizeof (*lr), NULL); 2689 2690 return (error); 2691 } 2692 2693 static void 2694 ztest_prealloc(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size) 2695 { 2696 objset_t *os = zd->zd_os; 2697 dmu_tx_t *tx; 2698 uint64_t txg; 2699 rl_t *rl; 2700 2701 
txg_wait_synced(dmu_objset_pool(os), 0); 2702 2703 ztest_object_lock(zd, object, RL_READER); 2704 rl = ztest_range_lock(zd, object, offset, size, RL_WRITER); 2705 2706 tx = dmu_tx_create(os); 2707 2708 dmu_tx_hold_write(tx, object, offset, size); 2709 2710 txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); 2711 2712 if (txg != 0) { 2713 dmu_prealloc(os, object, offset, size, tx); 2714 dmu_tx_commit(tx); 2715 txg_wait_synced(dmu_objset_pool(os), txg); 2716 } else { 2717 (void) dmu_free_long_range(os, object, offset, size); 2718 } 2719 2720 ztest_range_unlock(rl); 2721 ztest_object_unlock(zd, object); 2722 } 2723 2724 static void 2725 ztest_io(ztest_ds_t *zd, uint64_t object, uint64_t offset) 2726 { 2727 int err; 2728 ztest_block_tag_t wbt; 2729 dmu_object_info_t doi; 2730 enum ztest_io_type io_type; 2731 uint64_t blocksize; 2732 void *data; 2733 2734 VERIFY0(dmu_object_info(zd->zd_os, object, &doi)); 2735 blocksize = doi.doi_data_block_size; 2736 data = umem_alloc(blocksize, UMEM_NOFAIL); 2737 2738 /* 2739 * Pick an i/o type at random, biased toward writing block tags. 2740 */ 2741 io_type = ztest_random(ZTEST_IO_TYPES); 2742 if (ztest_random(2) == 0) 2743 io_type = ZTEST_IO_WRITE_TAG; 2744 2745 (void) pthread_rwlock_rdlock(&zd->zd_zilog_lock); 2746 2747 switch (io_type) { 2748 2749 case ZTEST_IO_WRITE_TAG: 2750 ztest_bt_generate(&wbt, zd->zd_os, object, doi.doi_dnodesize, 2751 offset, 0, 0, 0); 2752 (void) ztest_write(zd, object, offset, sizeof (wbt), &wbt); 2753 break; 2754 2755 case ZTEST_IO_WRITE_PATTERN: 2756 (void) memset(data, 'a' + (object + offset) % 5, blocksize); 2757 if (ztest_random(2) == 0) { 2758 /* 2759 * Induce fletcher2 collisions to ensure that 2760 * zio_ddt_collision() detects and resolves them 2761 * when using fletcher2-verify for deduplication. 
2762 */ 2763 ((uint64_t *)data)[0] ^= 1ULL << 63; 2764 ((uint64_t *)data)[4] ^= 1ULL << 63; 2765 } 2766 (void) ztest_write(zd, object, offset, blocksize, data); 2767 break; 2768 2769 case ZTEST_IO_WRITE_ZEROES: 2770 memset(data, 0, blocksize); 2771 (void) ztest_write(zd, object, offset, blocksize, data); 2772 break; 2773 2774 case ZTEST_IO_TRUNCATE: 2775 (void) ztest_truncate(zd, object, offset, blocksize); 2776 break; 2777 2778 case ZTEST_IO_SETATTR: 2779 (void) ztest_setattr(zd, object); 2780 break; 2781 default: 2782 break; 2783 2784 case ZTEST_IO_REWRITE: 2785 (void) pthread_rwlock_rdlock(&ztest_name_lock); 2786 err = ztest_dsl_prop_set_uint64(zd->zd_name, 2787 ZFS_PROP_CHECKSUM, spa_dedup_checksum(ztest_spa), 2788 B_FALSE); 2789 VERIFY(err == 0 || err == ENOSPC); 2790 err = ztest_dsl_prop_set_uint64(zd->zd_name, 2791 ZFS_PROP_COMPRESSION, 2792 ztest_random_dsl_prop(ZFS_PROP_COMPRESSION), 2793 B_FALSE); 2794 VERIFY(err == 0 || err == ENOSPC); 2795 (void) pthread_rwlock_unlock(&ztest_name_lock); 2796 2797 VERIFY0(dmu_read(zd->zd_os, object, offset, blocksize, data, 2798 DMU_READ_NO_PREFETCH)); 2799 2800 (void) ztest_write(zd, object, offset, blocksize, data); 2801 break; 2802 } 2803 2804 (void) pthread_rwlock_unlock(&zd->zd_zilog_lock); 2805 2806 umem_free(data, blocksize); 2807 } 2808 2809 /* 2810 * Initialize an object description template. 2811 */ 2812 static void 2813 ztest_od_init(ztest_od_t *od, uint64_t id, const char *tag, uint64_t index, 2814 dmu_object_type_t type, uint64_t blocksize, uint64_t dnodesize, 2815 uint64_t gen) 2816 { 2817 od->od_dir = ZTEST_DIROBJ; 2818 od->od_object = 0; 2819 2820 od->od_crtype = type; 2821 od->od_crblocksize = blocksize ? blocksize : ztest_random_blocksize(); 2822 od->od_crdnodesize = dnodesize ? 
dnodesize : ztest_random_dnodesize(); 2823 od->od_crgen = gen; 2824 2825 od->od_type = DMU_OT_NONE; 2826 od->od_blocksize = 0; 2827 od->od_gen = 0; 2828 2829 (void) snprintf(od->od_name, sizeof (od->od_name), 2830 "%s(%"PRId64")[%"PRIu64"]", 2831 tag, id, index); 2832 } 2833 2834 /* 2835 * Lookup or create the objects for a test using the od template. 2836 * If the objects do not all exist, or if 'remove' is specified, 2837 * remove any existing objects and create new ones. Otherwise, 2838 * use the existing objects. 2839 */ 2840 static int 2841 ztest_object_init(ztest_ds_t *zd, ztest_od_t *od, size_t size, boolean_t remove) 2842 { 2843 int count = size / sizeof (*od); 2844 int rv = 0; 2845 2846 mutex_enter(&zd->zd_dirobj_lock); 2847 if ((ztest_lookup(zd, od, count) != 0 || remove) && 2848 (ztest_remove(zd, od, count) != 0 || 2849 ztest_create(zd, od, count) != 0)) 2850 rv = -1; 2851 zd->zd_od = od; 2852 mutex_exit(&zd->zd_dirobj_lock); 2853 2854 return (rv); 2855 } 2856 2857 void 2858 ztest_zil_commit(ztest_ds_t *zd, uint64_t id) 2859 { 2860 (void) id; 2861 zilog_t *zilog = zd->zd_zilog; 2862 2863 (void) pthread_rwlock_rdlock(&zd->zd_zilog_lock); 2864 2865 zil_commit(zilog, ztest_random(ZTEST_OBJECTS)); 2866 2867 /* 2868 * Remember the committed values in zd, which is in parent/child 2869 * shared memory. If we die, the next iteration of ztest_run() 2870 * will verify that the log really does contain this record. 2871 */ 2872 mutex_enter(&zilog->zl_lock); 2873 ASSERT3P(zd->zd_shared, !=, NULL); 2874 ASSERT3U(zd->zd_shared->zd_seq, <=, zilog->zl_commit_lr_seq); 2875 zd->zd_shared->zd_seq = zilog->zl_commit_lr_seq; 2876 mutex_exit(&zilog->zl_lock); 2877 2878 (void) pthread_rwlock_unlock(&zd->zd_zilog_lock); 2879 } 2880 2881 /* 2882 * This function is designed to simulate the operations that occur during a 2883 * mount/unmount operation. We hold the dataset across these operations in an 2884 * attempt to expose any implicit assumptions about ZIL management. 
2885 */ 2886 void 2887 ztest_zil_remount(ztest_ds_t *zd, uint64_t id) 2888 { 2889 (void) id; 2890 objset_t *os = zd->zd_os; 2891 2892 /* 2893 * We hold the ztest_vdev_lock so we don't cause problems with 2894 * other threads that wish to remove a log device, such as 2895 * ztest_device_removal(). 2896 */ 2897 mutex_enter(&ztest_vdev_lock); 2898 2899 /* 2900 * We grab the zd_dirobj_lock to ensure that no other thread is 2901 * updating the zil (i.e. adding in-memory log records) and the 2902 * zd_zilog_lock to block any I/O. 2903 */ 2904 mutex_enter(&zd->zd_dirobj_lock); 2905 (void) pthread_rwlock_wrlock(&zd->zd_zilog_lock); 2906 2907 /* zfsvfs_teardown() */ 2908 zil_close(zd->zd_zilog); 2909 2910 /* zfsvfs_setup() */ 2911 VERIFY3P(zil_open(os, ztest_get_data), ==, zd->zd_zilog); 2912 zil_replay(os, zd, ztest_replay_vector); 2913 2914 (void) pthread_rwlock_unlock(&zd->zd_zilog_lock); 2915 mutex_exit(&zd->zd_dirobj_lock); 2916 mutex_exit(&ztest_vdev_lock); 2917 } 2918 2919 /* 2920 * Verify that we can't destroy an active pool, create an existing pool, 2921 * or create a pool with a bad vdev spec. 2922 */ 2923 void 2924 ztest_spa_create_destroy(ztest_ds_t *zd, uint64_t id) 2925 { 2926 (void) zd, (void) id; 2927 ztest_shared_opts_t *zo = &ztest_opts; 2928 spa_t *spa; 2929 nvlist_t *nvroot; 2930 2931 if (zo->zo_mmp_test) 2932 return; 2933 2934 /* 2935 * Attempt to create using a bad file. 2936 */ 2937 nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, NULL, 0, 0, 1); 2938 VERIFY3U(ENOENT, ==, 2939 spa_create("ztest_bad_file", nvroot, NULL, NULL, NULL)); 2940 fnvlist_free(nvroot); 2941 2942 /* 2943 * Attempt to create using a bad mirror. 2944 */ 2945 nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, NULL, 0, 2, 1); 2946 VERIFY3U(ENOENT, ==, 2947 spa_create("ztest_bad_mirror", nvroot, NULL, NULL, NULL)); 2948 fnvlist_free(nvroot); 2949 2950 /* 2951 * Attempt to create an existing pool. It shouldn't matter 2952 * what's in the nvroot; we should fail with EEXIST. 
2953 */ 2954 (void) pthread_rwlock_rdlock(&ztest_name_lock); 2955 nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, NULL, 0, 0, 1); 2956 VERIFY3U(EEXIST, ==, 2957 spa_create(zo->zo_pool, nvroot, NULL, NULL, NULL)); 2958 fnvlist_free(nvroot); 2959 2960 /* 2961 * We open a reference to the spa and then we try to export it 2962 * expecting one of the following errors: 2963 * 2964 * EBUSY 2965 * Because of the reference we just opened. 2966 * 2967 * ZFS_ERR_EXPORT_IN_PROGRESS 2968 * For the case that there is another ztest thread doing 2969 * an export concurrently. 2970 */ 2971 VERIFY0(spa_open(zo->zo_pool, &spa, FTAG)); 2972 int error = spa_destroy(zo->zo_pool); 2973 if (error != EBUSY && error != ZFS_ERR_EXPORT_IN_PROGRESS) { 2974 fatal(B_FALSE, "spa_destroy(%s) returned unexpected value %d", 2975 spa->spa_name, error); 2976 } 2977 spa_close(spa, FTAG); 2978 2979 (void) pthread_rwlock_unlock(&ztest_name_lock); 2980 } 2981 2982 /* 2983 * Start and then stop the MMP threads to ensure the startup and shutdown code 2984 * works properly. Actual protection and property-related code tested via ZTS. 2985 */ 2986 void 2987 ztest_mmp_enable_disable(ztest_ds_t *zd, uint64_t id) 2988 { 2989 (void) zd, (void) id; 2990 ztest_shared_opts_t *zo = &ztest_opts; 2991 spa_t *spa = ztest_spa; 2992 2993 if (zo->zo_mmp_test) 2994 return; 2995 2996 /* 2997 * Since enabling MMP involves setting a property, it could not be done 2998 * while the pool is suspended. 
2999 */ 3000 if (spa_suspended(spa)) 3001 return; 3002 3003 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 3004 mutex_enter(&spa->spa_props_lock); 3005 3006 zfs_multihost_fail_intervals = 0; 3007 3008 if (!spa_multihost(spa)) { 3009 spa->spa_multihost = B_TRUE; 3010 mmp_thread_start(spa); 3011 } 3012 3013 mutex_exit(&spa->spa_props_lock); 3014 spa_config_exit(spa, SCL_CONFIG, FTAG); 3015 3016 txg_wait_synced(spa_get_dsl(spa), 0); 3017 mmp_signal_all_threads(); 3018 txg_wait_synced(spa_get_dsl(spa), 0); 3019 3020 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 3021 mutex_enter(&spa->spa_props_lock); 3022 3023 if (spa_multihost(spa)) { 3024 mmp_thread_stop(spa); 3025 spa->spa_multihost = B_FALSE; 3026 } 3027 3028 mutex_exit(&spa->spa_props_lock); 3029 spa_config_exit(spa, SCL_CONFIG, FTAG); 3030 } 3031 3032 void 3033 ztest_spa_upgrade(ztest_ds_t *zd, uint64_t id) 3034 { 3035 (void) zd, (void) id; 3036 spa_t *spa; 3037 uint64_t initial_version = SPA_VERSION_INITIAL; 3038 uint64_t version, newversion; 3039 nvlist_t *nvroot, *props; 3040 char *name; 3041 3042 if (ztest_opts.zo_mmp_test) 3043 return; 3044 3045 /* dRAID added after feature flags, skip upgrade test. */ 3046 if (strcmp(ztest_opts.zo_raid_type, VDEV_TYPE_DRAID) == 0) 3047 return; 3048 3049 mutex_enter(&ztest_vdev_lock); 3050 name = kmem_asprintf("%s_upgrade", ztest_opts.zo_pool); 3051 3052 /* 3053 * Clean up from previous runs. 3054 */ 3055 (void) spa_destroy(name); 3056 3057 nvroot = make_vdev_root(NULL, NULL, name, ztest_opts.zo_vdev_size, 0, 3058 NULL, ztest_opts.zo_raid_children, ztest_opts.zo_mirrors, 1); 3059 3060 /* 3061 * If we're configuring a RAIDZ device then make sure that the 3062 * initial version is capable of supporting that feature. 
3063 */ 3064 switch (ztest_opts.zo_raid_parity) { 3065 case 0: 3066 case 1: 3067 initial_version = SPA_VERSION_INITIAL; 3068 break; 3069 case 2: 3070 initial_version = SPA_VERSION_RAIDZ2; 3071 break; 3072 case 3: 3073 initial_version = SPA_VERSION_RAIDZ3; 3074 break; 3075 } 3076 3077 /* 3078 * Create a pool with a spa version that can be upgraded. Pick 3079 * a value between initial_version and SPA_VERSION_BEFORE_FEATURES. 3080 */ 3081 do { 3082 version = ztest_random_spa_version(initial_version); 3083 } while (version > SPA_VERSION_BEFORE_FEATURES); 3084 3085 props = fnvlist_alloc(); 3086 fnvlist_add_uint64(props, 3087 zpool_prop_to_name(ZPOOL_PROP_VERSION), version); 3088 VERIFY0(spa_create(name, nvroot, props, NULL, NULL)); 3089 fnvlist_free(nvroot); 3090 fnvlist_free(props); 3091 3092 VERIFY0(spa_open(name, &spa, FTAG)); 3093 VERIFY3U(spa_version(spa), ==, version); 3094 newversion = ztest_random_spa_version(version + 1); 3095 3096 if (ztest_opts.zo_verbose >= 4) { 3097 (void) printf("upgrading spa version from " 3098 "%"PRIu64" to %"PRIu64"\n", 3099 version, newversion); 3100 } 3101 3102 spa_upgrade(spa, newversion); 3103 VERIFY3U(spa_version(spa), >, version); 3104 VERIFY3U(spa_version(spa), ==, fnvlist_lookup_uint64(spa->spa_config, 3105 zpool_prop_to_name(ZPOOL_PROP_VERSION))); 3106 spa_close(spa, FTAG); 3107 3108 kmem_strfree(name); 3109 mutex_exit(&ztest_vdev_lock); 3110 } 3111 3112 static void 3113 ztest_spa_checkpoint(spa_t *spa) 3114 { 3115 ASSERT(MUTEX_HELD(&ztest_checkpoint_lock)); 3116 3117 int error = spa_checkpoint(spa->spa_name); 3118 3119 switch (error) { 3120 case 0: 3121 case ZFS_ERR_DEVRM_IN_PROGRESS: 3122 case ZFS_ERR_DISCARDING_CHECKPOINT: 3123 case ZFS_ERR_CHECKPOINT_EXISTS: 3124 break; 3125 case ENOSPC: 3126 ztest_record_enospc(FTAG); 3127 break; 3128 default: 3129 fatal(B_FALSE, "spa_checkpoint(%s) = %d", spa->spa_name, error); 3130 } 3131 } 3132 3133 static void 3134 ztest_spa_discard_checkpoint(spa_t *spa) 3135 { 3136 
ASSERT(MUTEX_HELD(&ztest_checkpoint_lock)); 3137 3138 int error = spa_checkpoint_discard(spa->spa_name); 3139 3140 switch (error) { 3141 case 0: 3142 case ZFS_ERR_DISCARDING_CHECKPOINT: 3143 case ZFS_ERR_NO_CHECKPOINT: 3144 break; 3145 default: 3146 fatal(B_FALSE, "spa_discard_checkpoint(%s) = %d", 3147 spa->spa_name, error); 3148 } 3149 3150 } 3151 3152 void 3153 ztest_spa_checkpoint_create_discard(ztest_ds_t *zd, uint64_t id) 3154 { 3155 (void) zd, (void) id; 3156 spa_t *spa = ztest_spa; 3157 3158 mutex_enter(&ztest_checkpoint_lock); 3159 if (ztest_random(2) == 0) { 3160 ztest_spa_checkpoint(spa); 3161 } else { 3162 ztest_spa_discard_checkpoint(spa); 3163 } 3164 mutex_exit(&ztest_checkpoint_lock); 3165 } 3166 3167 3168 static vdev_t * 3169 vdev_lookup_by_path(vdev_t *vd, const char *path) 3170 { 3171 vdev_t *mvd; 3172 int c; 3173 3174 if (vd->vdev_path != NULL && strcmp(path, vd->vdev_path) == 0) 3175 return (vd); 3176 3177 for (c = 0; c < vd->vdev_children; c++) 3178 if ((mvd = vdev_lookup_by_path(vd->vdev_child[c], path)) != 3179 NULL) 3180 return (mvd); 3181 3182 return (NULL); 3183 } 3184 3185 static int 3186 spa_num_top_vdevs(spa_t *spa) 3187 { 3188 vdev_t *rvd = spa->spa_root_vdev; 3189 ASSERT3U(spa_config_held(spa, SCL_VDEV, RW_READER), ==, SCL_VDEV); 3190 return (rvd->vdev_children); 3191 } 3192 3193 /* 3194 * Verify that vdev_add() works as expected. 
3195 */ 3196 void 3197 ztest_vdev_add_remove(ztest_ds_t *zd, uint64_t id) 3198 { 3199 (void) zd, (void) id; 3200 ztest_shared_t *zs = ztest_shared; 3201 spa_t *spa = ztest_spa; 3202 uint64_t leaves; 3203 uint64_t guid; 3204 nvlist_t *nvroot; 3205 int error; 3206 3207 if (ztest_opts.zo_mmp_test) 3208 return; 3209 3210 mutex_enter(&ztest_vdev_lock); 3211 leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * 3212 ztest_opts.zo_raid_children; 3213 3214 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 3215 3216 ztest_shared->zs_vdev_next_leaf = spa_num_top_vdevs(spa) * leaves; 3217 3218 /* 3219 * If we have slogs then remove them 1/4 of the time. 3220 */ 3221 if (spa_has_slogs(spa) && ztest_random(4) == 0) { 3222 metaslab_group_t *mg; 3223 3224 /* 3225 * find the first real slog in log allocation class 3226 */ 3227 mg = spa_log_class(spa)->mc_allocator[0].mca_rotor; 3228 while (!mg->mg_vd->vdev_islog) 3229 mg = mg->mg_next; 3230 3231 guid = mg->mg_vd->vdev_guid; 3232 3233 spa_config_exit(spa, SCL_VDEV, FTAG); 3234 3235 /* 3236 * We have to grab the zs_name_lock as writer to 3237 * prevent a race between removing a slog (dmu_objset_find) 3238 * and destroying a dataset. Removing the slog will 3239 * grab a reference on the dataset which may cause 3240 * dsl_destroy_head() to fail with EBUSY thus 3241 * leaving the dataset in an inconsistent state. 
3242 */ 3243 pthread_rwlock_wrlock(&ztest_name_lock); 3244 error = spa_vdev_remove(spa, guid, B_FALSE); 3245 pthread_rwlock_unlock(&ztest_name_lock); 3246 3247 switch (error) { 3248 case 0: 3249 case EEXIST: /* Generic zil_reset() error */ 3250 case EBUSY: /* Replay required */ 3251 case EACCES: /* Crypto key not loaded */ 3252 case ZFS_ERR_CHECKPOINT_EXISTS: 3253 case ZFS_ERR_DISCARDING_CHECKPOINT: 3254 break; 3255 default: 3256 fatal(B_FALSE, "spa_vdev_remove() = %d", error); 3257 } 3258 } else { 3259 spa_config_exit(spa, SCL_VDEV, FTAG); 3260 3261 /* 3262 * Make 1/4 of the devices be log devices 3263 */ 3264 nvroot = make_vdev_root(NULL, NULL, NULL, 3265 ztest_opts.zo_vdev_size, 0, (ztest_random(4) == 0) ? 3266 "log" : NULL, ztest_opts.zo_raid_children, zs->zs_mirrors, 3267 1); 3268 3269 error = spa_vdev_add(spa, nvroot); 3270 fnvlist_free(nvroot); 3271 3272 switch (error) { 3273 case 0: 3274 break; 3275 case ENOSPC: 3276 ztest_record_enospc("spa_vdev_add"); 3277 break; 3278 default: 3279 fatal(B_FALSE, "spa_vdev_add() = %d", error); 3280 } 3281 } 3282 3283 mutex_exit(&ztest_vdev_lock); 3284 } 3285 3286 void 3287 ztest_vdev_class_add(ztest_ds_t *zd, uint64_t id) 3288 { 3289 (void) zd, (void) id; 3290 ztest_shared_t *zs = ztest_shared; 3291 spa_t *spa = ztest_spa; 3292 uint64_t leaves; 3293 nvlist_t *nvroot; 3294 const char *class = (ztest_random(2) == 0) ? 
3295 VDEV_ALLOC_BIAS_SPECIAL : VDEV_ALLOC_BIAS_DEDUP; 3296 int error; 3297 3298 /* 3299 * By default add a special vdev 50% of the time 3300 */ 3301 if ((ztest_opts.zo_special_vdevs == ZTEST_VDEV_CLASS_OFF) || 3302 (ztest_opts.zo_special_vdevs == ZTEST_VDEV_CLASS_RND && 3303 ztest_random(2) == 0)) { 3304 return; 3305 } 3306 3307 mutex_enter(&ztest_vdev_lock); 3308 3309 /* Only test with mirrors */ 3310 if (zs->zs_mirrors < 2) { 3311 mutex_exit(&ztest_vdev_lock); 3312 return; 3313 } 3314 3315 /* requires feature@allocation_classes */ 3316 if (!spa_feature_is_enabled(spa, SPA_FEATURE_ALLOCATION_CLASSES)) { 3317 mutex_exit(&ztest_vdev_lock); 3318 return; 3319 } 3320 3321 leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * 3322 ztest_opts.zo_raid_children; 3323 3324 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 3325 ztest_shared->zs_vdev_next_leaf = spa_num_top_vdevs(spa) * leaves; 3326 spa_config_exit(spa, SCL_VDEV, FTAG); 3327 3328 nvroot = make_vdev_root(NULL, NULL, NULL, ztest_opts.zo_vdev_size, 0, 3329 class, ztest_opts.zo_raid_children, zs->zs_mirrors, 1); 3330 3331 error = spa_vdev_add(spa, nvroot); 3332 fnvlist_free(nvroot); 3333 3334 if (error == ENOSPC) 3335 ztest_record_enospc("spa_vdev_add"); 3336 else if (error != 0) 3337 fatal(B_FALSE, "spa_vdev_add() = %d", error); 3338 3339 /* 3340 * 50% of the time allow small blocks in the special class 3341 */ 3342 if (error == 0 && 3343 spa_special_class(spa)->mc_groups == 1 && ztest_random(2) == 0) { 3344 if (ztest_opts.zo_verbose >= 3) 3345 (void) printf("Enabling special VDEV small blocks\n"); 3346 (void) ztest_dsl_prop_set_uint64(zd->zd_name, 3347 ZFS_PROP_SPECIAL_SMALL_BLOCKS, 32768, B_FALSE); 3348 } 3349 3350 mutex_exit(&ztest_vdev_lock); 3351 3352 if (ztest_opts.zo_verbose >= 3) { 3353 metaslab_class_t *mc; 3354 3355 if (strcmp(class, VDEV_ALLOC_BIAS_SPECIAL) == 0) 3356 mc = spa_special_class(spa); 3357 else 3358 mc = spa_dedup_class(spa); 3359 (void) printf("Added a %s mirrored vdev (of %d)\n", 3360 class, 
(int)mc->mc_groups); 3361 } 3362 } 3363 3364 /* 3365 * Verify that adding/removing aux devices (l2arc, hot spare) works as expected. 3366 */ 3367 void 3368 ztest_vdev_aux_add_remove(ztest_ds_t *zd, uint64_t id) 3369 { 3370 (void) zd, (void) id; 3371 ztest_shared_t *zs = ztest_shared; 3372 spa_t *spa = ztest_spa; 3373 vdev_t *rvd = spa->spa_root_vdev; 3374 spa_aux_vdev_t *sav; 3375 const char *aux; 3376 char *path; 3377 uint64_t guid = 0; 3378 int error, ignore_err = 0; 3379 3380 if (ztest_opts.zo_mmp_test) 3381 return; 3382 3383 path = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); 3384 3385 if (ztest_random(2) == 0) { 3386 sav = &spa->spa_spares; 3387 aux = ZPOOL_CONFIG_SPARES; 3388 } else { 3389 sav = &spa->spa_l2cache; 3390 aux = ZPOOL_CONFIG_L2CACHE; 3391 } 3392 3393 mutex_enter(&ztest_vdev_lock); 3394 3395 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 3396 3397 if (sav->sav_count != 0 && ztest_random(4) == 0) { 3398 /* 3399 * Pick a random device to remove. 3400 */ 3401 vdev_t *svd = sav->sav_vdevs[ztest_random(sav->sav_count)]; 3402 3403 /* dRAID spares cannot be removed; try anyways to see ENOTSUP */ 3404 if (strstr(svd->vdev_path, VDEV_TYPE_DRAID) != NULL) 3405 ignore_err = ENOTSUP; 3406 3407 guid = svd->vdev_guid; 3408 } else { 3409 /* 3410 * Find an unused device we can add. 3411 */ 3412 zs->zs_vdev_aux = 0; 3413 for (;;) { 3414 int c; 3415 (void) snprintf(path, MAXPATHLEN, ztest_aux_template, 3416 ztest_opts.zo_dir, ztest_opts.zo_pool, aux, 3417 zs->zs_vdev_aux); 3418 for (c = 0; c < sav->sav_count; c++) 3419 if (strcmp(sav->sav_vdevs[c]->vdev_path, 3420 path) == 0) 3421 break; 3422 if (c == sav->sav_count && 3423 vdev_lookup_by_path(rvd, path) == NULL) 3424 break; 3425 zs->zs_vdev_aux++; 3426 } 3427 } 3428 3429 spa_config_exit(spa, SCL_VDEV, FTAG); 3430 3431 if (guid == 0) { 3432 /* 3433 * Add a new device. 
3434 */ 3435 nvlist_t *nvroot = make_vdev_root(NULL, aux, NULL, 3436 (ztest_opts.zo_vdev_size * 5) / 4, 0, NULL, 0, 0, 1); 3437 error = spa_vdev_add(spa, nvroot); 3438 3439 switch (error) { 3440 case 0: 3441 break; 3442 default: 3443 fatal(B_FALSE, "spa_vdev_add(%p) = %d", nvroot, error); 3444 } 3445 fnvlist_free(nvroot); 3446 } else { 3447 /* 3448 * Remove an existing device. Sometimes, dirty its 3449 * vdev state first to make sure we handle removal 3450 * of devices that have pending state changes. 3451 */ 3452 if (ztest_random(2) == 0) 3453 (void) vdev_online(spa, guid, 0, NULL); 3454 3455 error = spa_vdev_remove(spa, guid, B_FALSE); 3456 3457 switch (error) { 3458 case 0: 3459 case EBUSY: 3460 case ZFS_ERR_CHECKPOINT_EXISTS: 3461 case ZFS_ERR_DISCARDING_CHECKPOINT: 3462 break; 3463 default: 3464 if (error != ignore_err) 3465 fatal(B_FALSE, 3466 "spa_vdev_remove(%"PRIu64") = %d", 3467 guid, error); 3468 } 3469 } 3470 3471 mutex_exit(&ztest_vdev_lock); 3472 3473 umem_free(path, MAXPATHLEN); 3474 } 3475 3476 /* 3477 * split a pool if it has mirror tlvdevs 3478 */ 3479 void 3480 ztest_split_pool(ztest_ds_t *zd, uint64_t id) 3481 { 3482 (void) zd, (void) id; 3483 ztest_shared_t *zs = ztest_shared; 3484 spa_t *spa = ztest_spa; 3485 vdev_t *rvd = spa->spa_root_vdev; 3486 nvlist_t *tree, **child, *config, *split, **schild; 3487 uint_t c, children, schildren = 0, lastlogid = 0; 3488 int error = 0; 3489 3490 if (ztest_opts.zo_mmp_test) 3491 return; 3492 3493 mutex_enter(&ztest_vdev_lock); 3494 3495 /* ensure we have a usable config; mirrors of raidz aren't supported */ 3496 if (zs->zs_mirrors < 3 || ztest_opts.zo_raid_children > 1) { 3497 mutex_exit(&ztest_vdev_lock); 3498 return; 3499 } 3500 3501 /* clean up the old pool, if any */ 3502 (void) spa_destroy("splitp"); 3503 3504 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 3505 3506 /* generate a config from the existing config */ 3507 mutex_enter(&spa->spa_props_lock); 3508 tree = 
fnvlist_lookup_nvlist(spa->spa_config, ZPOOL_CONFIG_VDEV_TREE); 3509 mutex_exit(&spa->spa_props_lock); 3510 3511 VERIFY0(nvlist_lookup_nvlist_array(tree, ZPOOL_CONFIG_CHILDREN, 3512 &child, &children)); 3513 3514 schild = malloc(rvd->vdev_children * sizeof (nvlist_t *)); 3515 for (c = 0; c < children; c++) { 3516 vdev_t *tvd = rvd->vdev_child[c]; 3517 nvlist_t **mchild; 3518 uint_t mchildren; 3519 3520 if (tvd->vdev_islog || tvd->vdev_ops == &vdev_hole_ops) { 3521 schild[schildren] = fnvlist_alloc(); 3522 fnvlist_add_string(schild[schildren], 3523 ZPOOL_CONFIG_TYPE, VDEV_TYPE_HOLE); 3524 fnvlist_add_uint64(schild[schildren], 3525 ZPOOL_CONFIG_IS_HOLE, 1); 3526 if (lastlogid == 0) 3527 lastlogid = schildren; 3528 ++schildren; 3529 continue; 3530 } 3531 lastlogid = 0; 3532 VERIFY0(nvlist_lookup_nvlist_array(child[c], 3533 ZPOOL_CONFIG_CHILDREN, &mchild, &mchildren)); 3534 schild[schildren++] = fnvlist_dup(mchild[0]); 3535 } 3536 3537 /* OK, create a config that can be used to split */ 3538 split = fnvlist_alloc(); 3539 fnvlist_add_string(split, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT); 3540 fnvlist_add_nvlist_array(split, ZPOOL_CONFIG_CHILDREN, 3541 (const nvlist_t **)schild, lastlogid != 0 ? 
lastlogid : schildren); 3542 3543 config = fnvlist_alloc(); 3544 fnvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, split); 3545 3546 for (c = 0; c < schildren; c++) 3547 fnvlist_free(schild[c]); 3548 free(schild); 3549 fnvlist_free(split); 3550 3551 spa_config_exit(spa, SCL_VDEV, FTAG); 3552 3553 (void) pthread_rwlock_wrlock(&ztest_name_lock); 3554 error = spa_vdev_split_mirror(spa, "splitp", config, NULL, B_FALSE); 3555 (void) pthread_rwlock_unlock(&ztest_name_lock); 3556 3557 fnvlist_free(config); 3558 3559 if (error == 0) { 3560 (void) printf("successful split - results:\n"); 3561 mutex_enter(&spa_namespace_lock); 3562 show_pool_stats(spa); 3563 show_pool_stats(spa_lookup("splitp")); 3564 mutex_exit(&spa_namespace_lock); 3565 ++zs->zs_splits; 3566 --zs->zs_mirrors; 3567 } 3568 mutex_exit(&ztest_vdev_lock); 3569 } 3570 3571 /* 3572 * Verify that we can attach and detach devices. 3573 */ 3574 void 3575 ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id) 3576 { 3577 (void) zd, (void) id; 3578 ztest_shared_t *zs = ztest_shared; 3579 spa_t *spa = ztest_spa; 3580 spa_aux_vdev_t *sav = &spa->spa_spares; 3581 vdev_t *rvd = spa->spa_root_vdev; 3582 vdev_t *oldvd, *newvd, *pvd; 3583 nvlist_t *root; 3584 uint64_t leaves; 3585 uint64_t leaf, top; 3586 uint64_t ashift = ztest_get_ashift(); 3587 uint64_t oldguid, pguid; 3588 uint64_t oldsize, newsize; 3589 char *oldpath, *newpath; 3590 int replacing; 3591 int oldvd_has_siblings = B_FALSE; 3592 int newvd_is_spare = B_FALSE; 3593 int newvd_is_dspare = B_FALSE; 3594 int oldvd_is_log; 3595 int error, expected_error; 3596 3597 if (ztest_opts.zo_mmp_test) 3598 return; 3599 3600 oldpath = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); 3601 newpath = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); 3602 3603 mutex_enter(&ztest_vdev_lock); 3604 leaves = MAX(zs->zs_mirrors, 1) * ztest_opts.zo_raid_children; 3605 3606 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3607 3608 /* 3609 * If a vdev is in the process of being removed, its removal may 3610 * 
finish while we are in progress, leading to an unexpected error 3611 * value. Don't bother trying to attach while we are in the middle 3612 * of removal. 3613 */ 3614 if (ztest_device_removal_active) { 3615 spa_config_exit(spa, SCL_ALL, FTAG); 3616 goto out; 3617 } 3618 3619 /* 3620 * Decide whether to do an attach or a replace. 3621 */ 3622 replacing = ztest_random(2); 3623 3624 /* 3625 * Pick a random top-level vdev. 3626 */ 3627 top = ztest_random_vdev_top(spa, B_TRUE); 3628 3629 /* 3630 * Pick a random leaf within it. 3631 */ 3632 leaf = ztest_random(leaves); 3633 3634 /* 3635 * Locate this vdev. 3636 */ 3637 oldvd = rvd->vdev_child[top]; 3638 3639 /* pick a child from the mirror */ 3640 if (zs->zs_mirrors >= 1) { 3641 ASSERT3P(oldvd->vdev_ops, ==, &vdev_mirror_ops); 3642 ASSERT3U(oldvd->vdev_children, >=, zs->zs_mirrors); 3643 oldvd = oldvd->vdev_child[leaf / ztest_opts.zo_raid_children]; 3644 } 3645 3646 /* pick a child out of the raidz group */ 3647 if (ztest_opts.zo_raid_children > 1) { 3648 if (strcmp(oldvd->vdev_ops->vdev_op_type, "raidz") == 0) 3649 ASSERT3P(oldvd->vdev_ops, ==, &vdev_raidz_ops); 3650 else 3651 ASSERT3P(oldvd->vdev_ops, ==, &vdev_draid_ops); 3652 ASSERT3U(oldvd->vdev_children, ==, ztest_opts.zo_raid_children); 3653 oldvd = oldvd->vdev_child[leaf % ztest_opts.zo_raid_children]; 3654 } 3655 3656 /* 3657 * If we're already doing an attach or replace, oldvd may be a 3658 * mirror vdev -- in which case, pick a random child. 3659 */ 3660 while (oldvd->vdev_children != 0) { 3661 oldvd_has_siblings = B_TRUE; 3662 ASSERT3U(oldvd->vdev_children, >=, 2); 3663 oldvd = oldvd->vdev_child[ztest_random(oldvd->vdev_children)]; 3664 } 3665 3666 oldguid = oldvd->vdev_guid; 3667 oldsize = vdev_get_min_asize(oldvd); 3668 oldvd_is_log = oldvd->vdev_top->vdev_islog; 3669 (void) strcpy(oldpath, oldvd->vdev_path); 3670 pvd = oldvd->vdev_parent; 3671 pguid = pvd->vdev_guid; 3672 3673 /* 3674 * If oldvd has siblings, then half of the time, detach it. 
Prior 3675 * to the detach the pool is scrubbed in order to prevent creating 3676 * unrepairable blocks as a result of the data corruption injection. 3677 */ 3678 if (oldvd_has_siblings && ztest_random(2) == 0) { 3679 spa_config_exit(spa, SCL_ALL, FTAG); 3680 3681 error = ztest_scrub_impl(spa); 3682 if (error) 3683 goto out; 3684 3685 error = spa_vdev_detach(spa, oldguid, pguid, B_FALSE); 3686 if (error != 0 && error != ENODEV && error != EBUSY && 3687 error != ENOTSUP && error != ZFS_ERR_CHECKPOINT_EXISTS && 3688 error != ZFS_ERR_DISCARDING_CHECKPOINT) 3689 fatal(B_FALSE, "detach (%s) returned %d", 3690 oldpath, error); 3691 goto out; 3692 } 3693 3694 /* 3695 * For the new vdev, choose with equal probability between the two 3696 * standard paths (ending in either 'a' or 'b') or a random hot spare. 3697 */ 3698 if (sav->sav_count != 0 && ztest_random(3) == 0) { 3699 newvd = sav->sav_vdevs[ztest_random(sav->sav_count)]; 3700 newvd_is_spare = B_TRUE; 3701 3702 if (newvd->vdev_ops == &vdev_draid_spare_ops) 3703 newvd_is_dspare = B_TRUE; 3704 3705 (void) strcpy(newpath, newvd->vdev_path); 3706 } else { 3707 (void) snprintf(newpath, MAXPATHLEN, ztest_dev_template, 3708 ztest_opts.zo_dir, ztest_opts.zo_pool, 3709 top * leaves + leaf); 3710 if (ztest_random(2) == 0) 3711 newpath[strlen(newpath) - 1] = 'b'; 3712 newvd = vdev_lookup_by_path(rvd, newpath); 3713 } 3714 3715 if (newvd) { 3716 /* 3717 * Reopen to ensure the vdev's asize field isn't stale. 3718 */ 3719 vdev_reopen(newvd); 3720 newsize = vdev_get_min_asize(newvd); 3721 } else { 3722 /* 3723 * Make newsize a little bigger or smaller than oldsize. 3724 * If it's smaller, the attach should fail. 3725 * If it's larger, and we're doing a replace, 3726 * we should get dynamic LUN growth when we're done. 
3727 */ 3728 newsize = 10 * oldsize / (9 + ztest_random(3)); 3729 } 3730 3731 /* 3732 * If pvd is not a mirror or root, the attach should fail with ENOTSUP, 3733 * unless it's a replace; in that case any non-replacing parent is OK. 3734 * 3735 * If newvd is already part of the pool, it should fail with EBUSY. 3736 * 3737 * If newvd is too small, it should fail with EOVERFLOW. 3738 * 3739 * If newvd is a distributed spare and it's being attached to a 3740 * dRAID which is not its parent it should fail with EINVAL. 3741 */ 3742 if (pvd->vdev_ops != &vdev_mirror_ops && 3743 pvd->vdev_ops != &vdev_root_ops && (!replacing || 3744 pvd->vdev_ops == &vdev_replacing_ops || 3745 pvd->vdev_ops == &vdev_spare_ops)) 3746 expected_error = ENOTSUP; 3747 else if (newvd_is_spare && (!replacing || oldvd_is_log)) 3748 expected_error = ENOTSUP; 3749 else if (newvd == oldvd) 3750 expected_error = replacing ? 0 : EBUSY; 3751 else if (vdev_lookup_by_path(rvd, newpath) != NULL) 3752 expected_error = EBUSY; 3753 else if (!newvd_is_dspare && newsize < oldsize) 3754 expected_error = EOVERFLOW; 3755 else if (ashift > oldvd->vdev_top->vdev_ashift) 3756 expected_error = EDOM; 3757 else if (newvd_is_dspare && pvd != vdev_draid_spare_get_parent(newvd)) 3758 expected_error = ENOTSUP; 3759 else 3760 expected_error = 0; 3761 3762 spa_config_exit(spa, SCL_ALL, FTAG); 3763 3764 /* 3765 * Build the nvlist describing newpath. 3766 */ 3767 root = make_vdev_root(newpath, NULL, NULL, newvd == NULL ? newsize : 0, 3768 ashift, NULL, 0, 0, 1); 3769 3770 /* 3771 * When supported select either a healing or sequential resilver. 
3772 */ 3773 boolean_t rebuilding = B_FALSE; 3774 if (pvd->vdev_ops == &vdev_mirror_ops || 3775 pvd->vdev_ops == &vdev_root_ops) { 3776 rebuilding = !!ztest_random(2); 3777 } 3778 3779 error = spa_vdev_attach(spa, oldguid, root, replacing, rebuilding); 3780 3781 fnvlist_free(root); 3782 3783 /* 3784 * If our parent was the replacing vdev, but the replace completed, 3785 * then instead of failing with ENOTSUP we may either succeed, 3786 * fail with ENODEV, or fail with EOVERFLOW. 3787 */ 3788 if (expected_error == ENOTSUP && 3789 (error == 0 || error == ENODEV || error == EOVERFLOW)) 3790 expected_error = error; 3791 3792 /* 3793 * If someone grew the LUN, the replacement may be too small. 3794 */ 3795 if (error == EOVERFLOW || error == EBUSY) 3796 expected_error = error; 3797 3798 if (error == ZFS_ERR_CHECKPOINT_EXISTS || 3799 error == ZFS_ERR_DISCARDING_CHECKPOINT || 3800 error == ZFS_ERR_RESILVER_IN_PROGRESS || 3801 error == ZFS_ERR_REBUILD_IN_PROGRESS) 3802 expected_error = error; 3803 3804 if (error != expected_error && expected_error != EBUSY) { 3805 fatal(B_FALSE, "attach (%s %"PRIu64", %s %"PRIu64", %d) " 3806 "returned %d, expected %d", 3807 oldpath, oldsize, newpath, 3808 newsize, replacing, error, expected_error); 3809 } 3810 out: 3811 mutex_exit(&ztest_vdev_lock); 3812 3813 umem_free(oldpath, MAXPATHLEN); 3814 umem_free(newpath, MAXPATHLEN); 3815 } 3816 3817 void 3818 ztest_device_removal(ztest_ds_t *zd, uint64_t id) 3819 { 3820 (void) zd, (void) id; 3821 spa_t *spa = ztest_spa; 3822 vdev_t *vd; 3823 uint64_t guid; 3824 int error; 3825 3826 mutex_enter(&ztest_vdev_lock); 3827 3828 if (ztest_device_removal_active) { 3829 mutex_exit(&ztest_vdev_lock); 3830 return; 3831 } 3832 3833 /* 3834 * Remove a random top-level vdev and wait for removal to finish. 
3835 */ 3836 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 3837 vd = vdev_lookup_top(spa, ztest_random_vdev_top(spa, B_FALSE)); 3838 guid = vd->vdev_guid; 3839 spa_config_exit(spa, SCL_VDEV, FTAG); 3840 3841 error = spa_vdev_remove(spa, guid, B_FALSE); 3842 if (error == 0) { 3843 ztest_device_removal_active = B_TRUE; 3844 mutex_exit(&ztest_vdev_lock); 3845 3846 /* 3847 * spa->spa_vdev_removal is created in a sync task that 3848 * is initiated via dsl_sync_task_nowait(). Since the 3849 * task may not run before spa_vdev_remove() returns, we 3850 * must wait at least 1 txg to ensure that the removal 3851 * struct has been created. 3852 */ 3853 txg_wait_synced(spa_get_dsl(spa), 0); 3854 3855 while (spa->spa_removing_phys.sr_state == DSS_SCANNING) 3856 txg_wait_synced(spa_get_dsl(spa), 0); 3857 } else { 3858 mutex_exit(&ztest_vdev_lock); 3859 return; 3860 } 3861 3862 /* 3863 * The pool needs to be scrubbed after completing device removal. 3864 * Failure to do so may result in checksum errors due to the 3865 * strategy employed by ztest_fault_inject() when selecting which 3866 * offset are redundant and can be damaged. 3867 */ 3868 error = spa_scan(spa, POOL_SCAN_SCRUB); 3869 if (error == 0) { 3870 while (dsl_scan_scrubbing(spa_get_dsl(spa))) 3871 txg_wait_synced(spa_get_dsl(spa), 0); 3872 } 3873 3874 mutex_enter(&ztest_vdev_lock); 3875 ztest_device_removal_active = B_FALSE; 3876 mutex_exit(&ztest_vdev_lock); 3877 } 3878 3879 /* 3880 * Callback function which expands the physical size of the vdev. 
3881 */ 3882 static vdev_t * 3883 grow_vdev(vdev_t *vd, void *arg) 3884 { 3885 spa_t *spa __maybe_unused = vd->vdev_spa; 3886 size_t *newsize = arg; 3887 size_t fsize; 3888 int fd; 3889 3890 ASSERT3S(spa_config_held(spa, SCL_STATE, RW_READER), ==, SCL_STATE); 3891 ASSERT(vd->vdev_ops->vdev_op_leaf); 3892 3893 if ((fd = open(vd->vdev_path, O_RDWR)) == -1) 3894 return (vd); 3895 3896 fsize = lseek(fd, 0, SEEK_END); 3897 VERIFY0(ftruncate(fd, *newsize)); 3898 3899 if (ztest_opts.zo_verbose >= 6) { 3900 (void) printf("%s grew from %lu to %lu bytes\n", 3901 vd->vdev_path, (ulong_t)fsize, (ulong_t)*newsize); 3902 } 3903 (void) close(fd); 3904 return (NULL); 3905 } 3906 3907 /* 3908 * Callback function which expands a given vdev by calling vdev_online(). 3909 */ 3910 static vdev_t * 3911 online_vdev(vdev_t *vd, void *arg) 3912 { 3913 (void) arg; 3914 spa_t *spa = vd->vdev_spa; 3915 vdev_t *tvd = vd->vdev_top; 3916 uint64_t guid = vd->vdev_guid; 3917 uint64_t generation = spa->spa_config_generation + 1; 3918 vdev_state_t newstate = VDEV_STATE_UNKNOWN; 3919 int error; 3920 3921 ASSERT3S(spa_config_held(spa, SCL_STATE, RW_READER), ==, SCL_STATE); 3922 ASSERT(vd->vdev_ops->vdev_op_leaf); 3923 3924 /* Calling vdev_online will initialize the new metaslabs */ 3925 spa_config_exit(spa, SCL_STATE, spa); 3926 error = vdev_online(spa, guid, ZFS_ONLINE_EXPAND, &newstate); 3927 spa_config_enter(spa, SCL_STATE, spa, RW_READER); 3928 3929 /* 3930 * If vdev_online returned an error or the underlying vdev_open 3931 * failed then we abort the expand. The only way to know that 3932 * vdev_open fails is by checking the returned newstate. 
3933 */ 3934 if (error || newstate != VDEV_STATE_HEALTHY) { 3935 if (ztest_opts.zo_verbose >= 5) { 3936 (void) printf("Unable to expand vdev, state %u, " 3937 "error %d\n", newstate, error); 3938 } 3939 return (vd); 3940 } 3941 ASSERT3U(newstate, ==, VDEV_STATE_HEALTHY); 3942 3943 /* 3944 * Since we dropped the lock we need to ensure that we're 3945 * still talking to the original vdev. It's possible this 3946 * vdev may have been detached/replaced while we were 3947 * trying to online it. 3948 */ 3949 if (generation != spa->spa_config_generation) { 3950 if (ztest_opts.zo_verbose >= 5) { 3951 (void) printf("vdev configuration has changed, " 3952 "guid %"PRIu64", state %"PRIu64", " 3953 "expected gen %"PRIu64", got gen %"PRIu64"\n", 3954 guid, 3955 tvd->vdev_state, 3956 generation, 3957 spa->spa_config_generation); 3958 } 3959 return (vd); 3960 } 3961 return (NULL); 3962 } 3963 3964 /* 3965 * Traverse the vdev tree calling the supplied function. 3966 * We continue to walk the tree until we either have walked all 3967 * children or we receive a non-NULL return from the callback. 3968 * If a NULL callback is passed, then we just return back the first 3969 * leaf vdev we encounter. 3970 */ 3971 static vdev_t * 3972 vdev_walk_tree(vdev_t *vd, vdev_t *(*func)(vdev_t *, void *), void *arg) 3973 { 3974 uint_t c; 3975 3976 if (vd->vdev_ops->vdev_op_leaf) { 3977 if (func == NULL) 3978 return (vd); 3979 else 3980 return (func(vd, arg)); 3981 } 3982 3983 for (c = 0; c < vd->vdev_children; c++) { 3984 vdev_t *cvd = vd->vdev_child[c]; 3985 if ((cvd = vdev_walk_tree(cvd, func, arg)) != NULL) 3986 return (cvd); 3987 } 3988 return (NULL); 3989 } 3990 3991 /* 3992 * Verify that dynamic LUN growth works as expected. 
3993 */ 3994 void 3995 ztest_vdev_LUN_growth(ztest_ds_t *zd, uint64_t id) 3996 { 3997 (void) zd, (void) id; 3998 spa_t *spa = ztest_spa; 3999 vdev_t *vd, *tvd; 4000 metaslab_class_t *mc; 4001 metaslab_group_t *mg; 4002 size_t psize, newsize; 4003 uint64_t top; 4004 uint64_t old_class_space, new_class_space, old_ms_count, new_ms_count; 4005 4006 mutex_enter(&ztest_checkpoint_lock); 4007 mutex_enter(&ztest_vdev_lock); 4008 spa_config_enter(spa, SCL_STATE, spa, RW_READER); 4009 4010 /* 4011 * If there is a vdev removal in progress, it could complete while 4012 * we are running, in which case we would not be able to verify 4013 * that the metaslab_class space increased (because it decreases 4014 * when the device removal completes). 4015 */ 4016 if (ztest_device_removal_active) { 4017 spa_config_exit(spa, SCL_STATE, spa); 4018 mutex_exit(&ztest_vdev_lock); 4019 mutex_exit(&ztest_checkpoint_lock); 4020 return; 4021 } 4022 4023 top = ztest_random_vdev_top(spa, B_TRUE); 4024 4025 tvd = spa->spa_root_vdev->vdev_child[top]; 4026 mg = tvd->vdev_mg; 4027 mc = mg->mg_class; 4028 old_ms_count = tvd->vdev_ms_count; 4029 old_class_space = metaslab_class_get_space(mc); 4030 4031 /* 4032 * Determine the size of the first leaf vdev associated with 4033 * our top-level device. 4034 */ 4035 vd = vdev_walk_tree(tvd, NULL, NULL); 4036 ASSERT3P(vd, !=, NULL); 4037 ASSERT(vd->vdev_ops->vdev_op_leaf); 4038 4039 psize = vd->vdev_psize; 4040 4041 /* 4042 * We only try to expand the vdev if it's healthy, less than 4x its 4043 * original size, and it has a valid psize. 
4044 */ 4045 if (tvd->vdev_state != VDEV_STATE_HEALTHY || 4046 psize == 0 || psize >= 4 * ztest_opts.zo_vdev_size) { 4047 spa_config_exit(spa, SCL_STATE, spa); 4048 mutex_exit(&ztest_vdev_lock); 4049 mutex_exit(&ztest_checkpoint_lock); 4050 return; 4051 } 4052 ASSERT3U(psize, >, 0); 4053 newsize = psize + MAX(psize / 8, SPA_MAXBLOCKSIZE); 4054 ASSERT3U(newsize, >, psize); 4055 4056 if (ztest_opts.zo_verbose >= 6) { 4057 (void) printf("Expanding LUN %s from %lu to %lu\n", 4058 vd->vdev_path, (ulong_t)psize, (ulong_t)newsize); 4059 } 4060 4061 /* 4062 * Growing the vdev is a two step process: 4063 * 1). expand the physical size (i.e. relabel) 4064 * 2). online the vdev to create the new metaslabs 4065 */ 4066 if (vdev_walk_tree(tvd, grow_vdev, &newsize) != NULL || 4067 vdev_walk_tree(tvd, online_vdev, NULL) != NULL || 4068 tvd->vdev_state != VDEV_STATE_HEALTHY) { 4069 if (ztest_opts.zo_verbose >= 5) { 4070 (void) printf("Could not expand LUN because " 4071 "the vdev configuration changed.\n"); 4072 } 4073 spa_config_exit(spa, SCL_STATE, spa); 4074 mutex_exit(&ztest_vdev_lock); 4075 mutex_exit(&ztest_checkpoint_lock); 4076 return; 4077 } 4078 4079 spa_config_exit(spa, SCL_STATE, spa); 4080 4081 /* 4082 * Expanding the LUN will update the config asynchronously, 4083 * thus we must wait for the async thread to complete any 4084 * pending tasks before proceeding. 
4085 */ 4086 for (;;) { 4087 boolean_t done; 4088 mutex_enter(&spa->spa_async_lock); 4089 done = (spa->spa_async_thread == NULL && !spa->spa_async_tasks); 4090 mutex_exit(&spa->spa_async_lock); 4091 if (done) 4092 break; 4093 txg_wait_synced(spa_get_dsl(spa), 0); 4094 (void) poll(NULL, 0, 100); 4095 } 4096 4097 spa_config_enter(spa, SCL_STATE, spa, RW_READER); 4098 4099 tvd = spa->spa_root_vdev->vdev_child[top]; 4100 new_ms_count = tvd->vdev_ms_count; 4101 new_class_space = metaslab_class_get_space(mc); 4102 4103 if (tvd->vdev_mg != mg || mg->mg_class != mc) { 4104 if (ztest_opts.zo_verbose >= 5) { 4105 (void) printf("Could not verify LUN expansion due to " 4106 "intervening vdev offline or remove.\n"); 4107 } 4108 spa_config_exit(spa, SCL_STATE, spa); 4109 mutex_exit(&ztest_vdev_lock); 4110 mutex_exit(&ztest_checkpoint_lock); 4111 return; 4112 } 4113 4114 /* 4115 * Make sure we were able to grow the vdev. 4116 */ 4117 if (new_ms_count <= old_ms_count) { 4118 fatal(B_FALSE, 4119 "LUN expansion failed: ms_count %"PRIu64" < %"PRIu64"\n", 4120 old_ms_count, new_ms_count); 4121 } 4122 4123 /* 4124 * Make sure we were able to grow the pool. 4125 */ 4126 if (new_class_space <= old_class_space) { 4127 fatal(B_FALSE, 4128 "LUN expansion failed: class_space %"PRIu64" < %"PRIu64"\n", 4129 old_class_space, new_class_space); 4130 } 4131 4132 if (ztest_opts.zo_verbose >= 5) { 4133 char oldnumbuf[NN_NUMBUF_SZ], newnumbuf[NN_NUMBUF_SZ]; 4134 4135 nicenum(old_class_space, oldnumbuf, sizeof (oldnumbuf)); 4136 nicenum(new_class_space, newnumbuf, sizeof (newnumbuf)); 4137 (void) printf("%s grew from %s to %s\n", 4138 spa->spa_name, oldnumbuf, newnumbuf); 4139 } 4140 4141 spa_config_exit(spa, SCL_STATE, spa); 4142 mutex_exit(&ztest_vdev_lock); 4143 mutex_exit(&ztest_checkpoint_lock); 4144 } 4145 4146 /* 4147 * Verify that dmu_objset_{create,destroy,open,close} work as expected. 
4148 */ 4149 static void 4150 ztest_objset_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx) 4151 { 4152 (void) arg, (void) cr; 4153 4154 /* 4155 * Create the objects common to all ztest datasets. 4156 */ 4157 VERIFY0(zap_create_claim(os, ZTEST_DIROBJ, 4158 DMU_OT_ZAP_OTHER, DMU_OT_NONE, 0, tx)); 4159 } 4160 4161 static int 4162 ztest_dataset_create(char *dsname) 4163 { 4164 int err; 4165 uint64_t rand; 4166 dsl_crypto_params_t *dcp = NULL; 4167 4168 /* 4169 * 50% of the time, we create encrypted datasets 4170 * using a random cipher suite and a hard-coded 4171 * wrapping key. 4172 */ 4173 rand = ztest_random(2); 4174 if (rand != 0) { 4175 nvlist_t *crypto_args = fnvlist_alloc(); 4176 nvlist_t *props = fnvlist_alloc(); 4177 4178 /* slight bias towards the default cipher suite */ 4179 rand = ztest_random(ZIO_CRYPT_FUNCTIONS); 4180 if (rand < ZIO_CRYPT_AES_128_CCM) 4181 rand = ZIO_CRYPT_ON; 4182 4183 fnvlist_add_uint64(props, 4184 zfs_prop_to_name(ZFS_PROP_ENCRYPTION), rand); 4185 fnvlist_add_uint8_array(crypto_args, "wkeydata", 4186 (uint8_t *)ztest_wkeydata, WRAPPING_KEY_LEN); 4187 4188 /* 4189 * These parameters aren't really used by the kernel. They 4190 * are simply stored so that userspace knows how to load 4191 * the wrapping key. 4192 */ 4193 fnvlist_add_uint64(props, 4194 zfs_prop_to_name(ZFS_PROP_KEYFORMAT), ZFS_KEYFORMAT_RAW); 4195 fnvlist_add_string(props, 4196 zfs_prop_to_name(ZFS_PROP_KEYLOCATION), "prompt"); 4197 fnvlist_add_uint64(props, 4198 zfs_prop_to_name(ZFS_PROP_PBKDF2_SALT), 0ULL); 4199 fnvlist_add_uint64(props, 4200 zfs_prop_to_name(ZFS_PROP_PBKDF2_ITERS), 0ULL); 4201 4202 VERIFY0(dsl_crypto_params_create_nvlist(DCP_CMD_NONE, props, 4203 crypto_args, &dcp)); 4204 4205 /* 4206 * Cycle through all available encryption implementations 4207 * to verify interoperability. 
4208 */ 4209 VERIFY0(gcm_impl_set("cycle")); 4210 VERIFY0(aes_impl_set("cycle")); 4211 4212 fnvlist_free(crypto_args); 4213 fnvlist_free(props); 4214 } 4215 4216 err = dmu_objset_create(dsname, DMU_OST_OTHER, 0, dcp, 4217 ztest_objset_create_cb, NULL); 4218 dsl_crypto_params_free(dcp, !!err); 4219 4220 rand = ztest_random(100); 4221 if (err || rand < 80) 4222 return (err); 4223 4224 if (ztest_opts.zo_verbose >= 5) 4225 (void) printf("Setting dataset %s to sync always\n", dsname); 4226 return (ztest_dsl_prop_set_uint64(dsname, ZFS_PROP_SYNC, 4227 ZFS_SYNC_ALWAYS, B_FALSE)); 4228 } 4229 4230 static int 4231 ztest_objset_destroy_cb(const char *name, void *arg) 4232 { 4233 (void) arg; 4234 objset_t *os; 4235 dmu_object_info_t doi; 4236 int error; 4237 4238 /* 4239 * Verify that the dataset contains a directory object. 4240 */ 4241 VERIFY0(ztest_dmu_objset_own(name, DMU_OST_OTHER, B_TRUE, 4242 B_TRUE, FTAG, &os)); 4243 error = dmu_object_info(os, ZTEST_DIROBJ, &doi); 4244 if (error != ENOENT) { 4245 /* We could have crashed in the middle of destroying it */ 4246 ASSERT0(error); 4247 ASSERT3U(doi.doi_type, ==, DMU_OT_ZAP_OTHER); 4248 ASSERT3S(doi.doi_physical_blocks_512, >=, 0); 4249 } 4250 dmu_objset_disown(os, B_TRUE, FTAG); 4251 4252 /* 4253 * Destroy the dataset. 4254 */ 4255 if (strchr(name, '@') != NULL) { 4256 error = dsl_destroy_snapshot(name, B_TRUE); 4257 if (error != ECHRNG) { 4258 /* 4259 * The program was executed, but encountered a runtime 4260 * error, such as insufficient slop, or a hold on the 4261 * dataset. 
4262 */ 4263 ASSERT0(error); 4264 } 4265 } else { 4266 error = dsl_destroy_head(name); 4267 if (error == ENOSPC) { 4268 /* There could be checkpoint or insufficient slop */ 4269 ztest_record_enospc(FTAG); 4270 } else if (error != EBUSY) { 4271 /* There could be a hold on this dataset */ 4272 ASSERT0(error); 4273 } 4274 } 4275 return (0); 4276 } 4277 4278 static boolean_t 4279 ztest_snapshot_create(char *osname, uint64_t id) 4280 { 4281 char snapname[ZFS_MAX_DATASET_NAME_LEN]; 4282 int error; 4283 4284 (void) snprintf(snapname, sizeof (snapname), "%"PRIu64"", id); 4285 4286 error = dmu_objset_snapshot_one(osname, snapname); 4287 if (error == ENOSPC) { 4288 ztest_record_enospc(FTAG); 4289 return (B_FALSE); 4290 } 4291 if (error != 0 && error != EEXIST) { 4292 fatal(B_FALSE, "ztest_snapshot_create(%s@%s) = %d", osname, 4293 snapname, error); 4294 } 4295 return (B_TRUE); 4296 } 4297 4298 static boolean_t 4299 ztest_snapshot_destroy(char *osname, uint64_t id) 4300 { 4301 char snapname[ZFS_MAX_DATASET_NAME_LEN]; 4302 int error; 4303 4304 (void) snprintf(snapname, sizeof (snapname), "%s@%"PRIu64"", 4305 osname, id); 4306 4307 error = dsl_destroy_snapshot(snapname, B_FALSE); 4308 if (error != 0 && error != ENOENT) 4309 fatal(B_FALSE, "ztest_snapshot_destroy(%s) = %d", 4310 snapname, error); 4311 return (B_TRUE); 4312 } 4313 4314 void 4315 ztest_dmu_objset_create_destroy(ztest_ds_t *zd, uint64_t id) 4316 { 4317 (void) zd; 4318 ztest_ds_t *zdtmp; 4319 int iters; 4320 int error; 4321 objset_t *os, *os2; 4322 char name[ZFS_MAX_DATASET_NAME_LEN]; 4323 zilog_t *zilog; 4324 int i; 4325 4326 zdtmp = umem_alloc(sizeof (ztest_ds_t), UMEM_NOFAIL); 4327 4328 (void) pthread_rwlock_rdlock(&ztest_name_lock); 4329 4330 (void) snprintf(name, sizeof (name), "%s/temp_%"PRIu64"", 4331 ztest_opts.zo_pool, id); 4332 4333 /* 4334 * If this dataset exists from a previous run, process its replay log 4335 * half of the time. 
If we don't replay it, then dsl_destroy_head() 4336 * (invoked from ztest_objset_destroy_cb()) should just throw it away. 4337 */ 4338 if (ztest_random(2) == 0 && 4339 ztest_dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, 4340 B_TRUE, FTAG, &os) == 0) { 4341 ztest_zd_init(zdtmp, NULL, os); 4342 zil_replay(os, zdtmp, ztest_replay_vector); 4343 ztest_zd_fini(zdtmp); 4344 dmu_objset_disown(os, B_TRUE, FTAG); 4345 } 4346 4347 /* 4348 * There may be an old instance of the dataset we're about to 4349 * create lying around from a previous run. If so, destroy it 4350 * and all of its snapshots. 4351 */ 4352 (void) dmu_objset_find(name, ztest_objset_destroy_cb, NULL, 4353 DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS); 4354 4355 /* 4356 * Verify that the destroyed dataset is no longer in the namespace. 4357 */ 4358 VERIFY3U(ENOENT, ==, ztest_dmu_objset_own(name, DMU_OST_OTHER, B_TRUE, 4359 B_TRUE, FTAG, &os)); 4360 4361 /* 4362 * Verify that we can create a new dataset. 4363 */ 4364 error = ztest_dataset_create(name); 4365 if (error) { 4366 if (error == ENOSPC) { 4367 ztest_record_enospc(FTAG); 4368 goto out; 4369 } 4370 fatal(B_FALSE, "dmu_objset_create(%s) = %d", name, error); 4371 } 4372 4373 VERIFY0(ztest_dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, B_TRUE, 4374 FTAG, &os)); 4375 4376 ztest_zd_init(zdtmp, NULL, os); 4377 4378 /* 4379 * Open the intent log for it. 4380 */ 4381 zilog = zil_open(os, ztest_get_data); 4382 4383 /* 4384 * Put some objects in there, do a little I/O to them, 4385 * and randomly take a couple of snapshots along the way. 4386 */ 4387 iters = ztest_random(5); 4388 for (i = 0; i < iters; i++) { 4389 ztest_dmu_object_alloc_free(zdtmp, id); 4390 if (ztest_random(iters) == 0) 4391 (void) ztest_snapshot_create(name, i); 4392 } 4393 4394 /* 4395 * Verify that we cannot create an existing dataset. 
4396 */ 4397 VERIFY3U(EEXIST, ==, 4398 dmu_objset_create(name, DMU_OST_OTHER, 0, NULL, NULL, NULL)); 4399 4400 /* 4401 * Verify that we can hold an objset that is also owned. 4402 */ 4403 VERIFY0(dmu_objset_hold(name, FTAG, &os2)); 4404 dmu_objset_rele(os2, FTAG); 4405 4406 /* 4407 * Verify that we cannot own an objset that is already owned. 4408 */ 4409 VERIFY3U(EBUSY, ==, ztest_dmu_objset_own(name, DMU_OST_OTHER, 4410 B_FALSE, B_TRUE, FTAG, &os2)); 4411 4412 zil_close(zilog); 4413 dmu_objset_disown(os, B_TRUE, FTAG); 4414 ztest_zd_fini(zdtmp); 4415 out: 4416 (void) pthread_rwlock_unlock(&ztest_name_lock); 4417 4418 umem_free(zdtmp, sizeof (ztest_ds_t)); 4419 } 4420 4421 /* 4422 * Verify that dmu_snapshot_{create,destroy,open,close} work as expected. 4423 */ 4424 void 4425 ztest_dmu_snapshot_create_destroy(ztest_ds_t *zd, uint64_t id) 4426 { 4427 (void) pthread_rwlock_rdlock(&ztest_name_lock); 4428 (void) ztest_snapshot_destroy(zd->zd_name, id); 4429 (void) ztest_snapshot_create(zd->zd_name, id); 4430 (void) pthread_rwlock_unlock(&ztest_name_lock); 4431 } 4432 4433 /* 4434 * Cleanup non-standard snapshots and clones. 
4435 */ 4436 static void 4437 ztest_dsl_dataset_cleanup(char *osname, uint64_t id) 4438 { 4439 char *snap1name; 4440 char *clone1name; 4441 char *snap2name; 4442 char *clone2name; 4443 char *snap3name; 4444 int error; 4445 4446 snap1name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4447 clone1name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4448 snap2name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4449 clone2name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4450 snap3name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4451 4452 (void) snprintf(snap1name, ZFS_MAX_DATASET_NAME_LEN, "%s@s1_%"PRIu64"", 4453 osname, id); 4454 (void) snprintf(clone1name, ZFS_MAX_DATASET_NAME_LEN, "%s/c1_%"PRIu64"", 4455 osname, id); 4456 (void) snprintf(snap2name, ZFS_MAX_DATASET_NAME_LEN, "%s@s2_%"PRIu64"", 4457 clone1name, id); 4458 (void) snprintf(clone2name, ZFS_MAX_DATASET_NAME_LEN, "%s/c2_%"PRIu64"", 4459 osname, id); 4460 (void) snprintf(snap3name, ZFS_MAX_DATASET_NAME_LEN, "%s@s3_%"PRIu64"", 4461 clone1name, id); 4462 4463 error = dsl_destroy_head(clone2name); 4464 if (error && error != ENOENT) 4465 fatal(B_FALSE, "dsl_destroy_head(%s) = %d", clone2name, error); 4466 error = dsl_destroy_snapshot(snap3name, B_FALSE); 4467 if (error && error != ENOENT) 4468 fatal(B_FALSE, "dsl_destroy_snapshot(%s) = %d", 4469 snap3name, error); 4470 error = dsl_destroy_snapshot(snap2name, B_FALSE); 4471 if (error && error != ENOENT) 4472 fatal(B_FALSE, "dsl_destroy_snapshot(%s) = %d", 4473 snap2name, error); 4474 error = dsl_destroy_head(clone1name); 4475 if (error && error != ENOENT) 4476 fatal(B_FALSE, "dsl_destroy_head(%s) = %d", clone1name, error); 4477 error = dsl_destroy_snapshot(snap1name, B_FALSE); 4478 if (error && error != ENOENT) 4479 fatal(B_FALSE, "dsl_destroy_snapshot(%s) = %d", 4480 snap1name, error); 4481 4482 umem_free(snap1name, ZFS_MAX_DATASET_NAME_LEN); 4483 umem_free(clone1name, ZFS_MAX_DATASET_NAME_LEN); 4484 umem_free(snap2name, 
ZFS_MAX_DATASET_NAME_LEN); 4485 umem_free(clone2name, ZFS_MAX_DATASET_NAME_LEN); 4486 umem_free(snap3name, ZFS_MAX_DATASET_NAME_LEN); 4487 } 4488 4489 /* 4490 * Verify dsl_dataset_promote handles EBUSY 4491 */ 4492 void 4493 ztest_dsl_dataset_promote_busy(ztest_ds_t *zd, uint64_t id) 4494 { 4495 objset_t *os; 4496 char *snap1name; 4497 char *clone1name; 4498 char *snap2name; 4499 char *clone2name; 4500 char *snap3name; 4501 char *osname = zd->zd_name; 4502 int error; 4503 4504 snap1name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4505 clone1name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4506 snap2name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4507 clone2name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4508 snap3name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4509 4510 (void) pthread_rwlock_rdlock(&ztest_name_lock); 4511 4512 ztest_dsl_dataset_cleanup(osname, id); 4513 4514 (void) snprintf(snap1name, ZFS_MAX_DATASET_NAME_LEN, "%s@s1_%"PRIu64"", 4515 osname, id); 4516 (void) snprintf(clone1name, ZFS_MAX_DATASET_NAME_LEN, "%s/c1_%"PRIu64"", 4517 osname, id); 4518 (void) snprintf(snap2name, ZFS_MAX_DATASET_NAME_LEN, "%s@s2_%"PRIu64"", 4519 clone1name, id); 4520 (void) snprintf(clone2name, ZFS_MAX_DATASET_NAME_LEN, "%s/c2_%"PRIu64"", 4521 osname, id); 4522 (void) snprintf(snap3name, ZFS_MAX_DATASET_NAME_LEN, "%s@s3_%"PRIu64"", 4523 clone1name, id); 4524 4525 error = dmu_objset_snapshot_one(osname, strchr(snap1name, '@') + 1); 4526 if (error && error != EEXIST) { 4527 if (error == ENOSPC) { 4528 ztest_record_enospc(FTAG); 4529 goto out; 4530 } 4531 fatal(B_FALSE, "dmu_take_snapshot(%s) = %d", snap1name, error); 4532 } 4533 4534 error = dmu_objset_clone(clone1name, snap1name); 4535 if (error) { 4536 if (error == ENOSPC) { 4537 ztest_record_enospc(FTAG); 4538 goto out; 4539 } 4540 fatal(B_FALSE, "dmu_objset_create(%s) = %d", clone1name, error); 4541 } 4542 4543 error = dmu_objset_snapshot_one(clone1name, strchr(snap2name, 
'@') + 1); 4544 if (error && error != EEXIST) { 4545 if (error == ENOSPC) { 4546 ztest_record_enospc(FTAG); 4547 goto out; 4548 } 4549 fatal(B_FALSE, "dmu_open_snapshot(%s) = %d", snap2name, error); 4550 } 4551 4552 error = dmu_objset_snapshot_one(clone1name, strchr(snap3name, '@') + 1); 4553 if (error && error != EEXIST) { 4554 if (error == ENOSPC) { 4555 ztest_record_enospc(FTAG); 4556 goto out; 4557 } 4558 fatal(B_FALSE, "dmu_open_snapshot(%s) = %d", snap3name, error); 4559 } 4560 4561 error = dmu_objset_clone(clone2name, snap3name); 4562 if (error) { 4563 if (error == ENOSPC) { 4564 ztest_record_enospc(FTAG); 4565 goto out; 4566 } 4567 fatal(B_FALSE, "dmu_objset_create(%s) = %d", clone2name, error); 4568 } 4569 4570 error = ztest_dmu_objset_own(snap2name, DMU_OST_ANY, B_TRUE, B_TRUE, 4571 FTAG, &os); 4572 if (error) 4573 fatal(B_FALSE, "dmu_objset_own(%s) = %d", snap2name, error); 4574 error = dsl_dataset_promote(clone2name, NULL); 4575 if (error == ENOSPC) { 4576 dmu_objset_disown(os, B_TRUE, FTAG); 4577 ztest_record_enospc(FTAG); 4578 goto out; 4579 } 4580 if (error != EBUSY) 4581 fatal(B_FALSE, "dsl_dataset_promote(%s), %d, not EBUSY", 4582 clone2name, error); 4583 dmu_objset_disown(os, B_TRUE, FTAG); 4584 4585 out: 4586 ztest_dsl_dataset_cleanup(osname, id); 4587 4588 (void) pthread_rwlock_unlock(&ztest_name_lock); 4589 4590 umem_free(snap1name, ZFS_MAX_DATASET_NAME_LEN); 4591 umem_free(clone1name, ZFS_MAX_DATASET_NAME_LEN); 4592 umem_free(snap2name, ZFS_MAX_DATASET_NAME_LEN); 4593 umem_free(clone2name, ZFS_MAX_DATASET_NAME_LEN); 4594 umem_free(snap3name, ZFS_MAX_DATASET_NAME_LEN); 4595 } 4596 4597 #undef OD_ARRAY_SIZE 4598 #define OD_ARRAY_SIZE 4 4599 4600 /* 4601 * Verify that dmu_object_{alloc,free} work as expected. 
4602 */ 4603 void 4604 ztest_dmu_object_alloc_free(ztest_ds_t *zd, uint64_t id) 4605 { 4606 ztest_od_t *od; 4607 int batchsize; 4608 int size; 4609 int b; 4610 4611 size = sizeof (ztest_od_t) * OD_ARRAY_SIZE; 4612 od = umem_alloc(size, UMEM_NOFAIL); 4613 batchsize = OD_ARRAY_SIZE; 4614 4615 for (b = 0; b < batchsize; b++) 4616 ztest_od_init(od + b, id, FTAG, b, DMU_OT_UINT64_OTHER, 4617 0, 0, 0); 4618 4619 /* 4620 * Destroy the previous batch of objects, create a new batch, 4621 * and do some I/O on the new objects. 4622 */ 4623 if (ztest_object_init(zd, od, size, B_TRUE) != 0) 4624 return; 4625 4626 while (ztest_random(4 * batchsize) != 0) 4627 ztest_io(zd, od[ztest_random(batchsize)].od_object, 4628 ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT); 4629 4630 umem_free(od, size); 4631 } 4632 4633 /* 4634 * Rewind the global allocator to verify object allocation backfilling. 4635 */ 4636 void 4637 ztest_dmu_object_next_chunk(ztest_ds_t *zd, uint64_t id) 4638 { 4639 (void) id; 4640 objset_t *os = zd->zd_os; 4641 int dnodes_per_chunk = 1 << dmu_object_alloc_chunk_shift; 4642 uint64_t object; 4643 4644 /* 4645 * Rewind the global allocator randomly back to a lower object number 4646 * to force backfilling and reclamation of recently freed dnodes. 4647 */ 4648 mutex_enter(&os->os_obj_lock); 4649 object = ztest_random(os->os_obj_next_chunk); 4650 os->os_obj_next_chunk = P2ALIGN(object, dnodes_per_chunk); 4651 mutex_exit(&os->os_obj_lock); 4652 } 4653 4654 #undef OD_ARRAY_SIZE 4655 #define OD_ARRAY_SIZE 2 4656 4657 /* 4658 * Verify that dmu_{read,write} work as expected. 
4659 */ 4660 void 4661 ztest_dmu_read_write(ztest_ds_t *zd, uint64_t id) 4662 { 4663 int size; 4664 ztest_od_t *od; 4665 4666 objset_t *os = zd->zd_os; 4667 size = sizeof (ztest_od_t) * OD_ARRAY_SIZE; 4668 od = umem_alloc(size, UMEM_NOFAIL); 4669 dmu_tx_t *tx; 4670 int freeit, error; 4671 uint64_t i, n, s, txg; 4672 bufwad_t *packbuf, *bigbuf, *pack, *bigH, *bigT; 4673 uint64_t packobj, packoff, packsize, bigobj, bigoff, bigsize; 4674 uint64_t chunksize = (1000 + ztest_random(1000)) * sizeof (uint64_t); 4675 uint64_t regions = 997; 4676 uint64_t stride = 123456789ULL; 4677 uint64_t width = 40; 4678 int free_percent = 5; 4679 4680 /* 4681 * This test uses two objects, packobj and bigobj, that are always 4682 * updated together (i.e. in the same tx) so that their contents are 4683 * in sync and can be compared. Their contents relate to each other 4684 * in a simple way: packobj is a dense array of 'bufwad' structures, 4685 * while bigobj is a sparse array of the same bufwads. Specifically, 4686 * for any index n, there are three bufwads that should be identical: 4687 * 4688 * packobj, at offset n * sizeof (bufwad_t) 4689 * bigobj, at the head of the nth chunk 4690 * bigobj, at the tail of the nth chunk 4691 * 4692 * The chunk size is arbitrary. It doesn't have to be a power of two, 4693 * and it doesn't have any relation to the object blocksize. 4694 * The only requirement is that it can hold at least two bufwads. 4695 * 4696 * Normally, we write the bufwad to each of these locations. 4697 * However, free_percent of the time we instead write zeroes to 4698 * packobj and perform a dmu_free_range() on bigobj. By comparing 4699 * bigobj to packobj, we can verify that the DMU is correctly 4700 * tracking which parts of an object are allocated and free, 4701 * and that the contents of the allocated blocks are correct. 4702 */ 4703 4704 /* 4705 * Read the directory info. If it's the first time, set things up. 
4706 */ 4707 ztest_od_init(od, id, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0, chunksize); 4708 ztest_od_init(od + 1, id, FTAG, 1, DMU_OT_UINT64_OTHER, 0, 0, 4709 chunksize); 4710 4711 if (ztest_object_init(zd, od, size, B_FALSE) != 0) { 4712 umem_free(od, size); 4713 return; 4714 } 4715 4716 bigobj = od[0].od_object; 4717 packobj = od[1].od_object; 4718 chunksize = od[0].od_gen; 4719 ASSERT3U(chunksize, ==, od[1].od_gen); 4720 4721 /* 4722 * Prefetch a random chunk of the big object. 4723 * Our aim here is to get some async reads in flight 4724 * for blocks that we may free below; the DMU should 4725 * handle this race correctly. 4726 */ 4727 n = ztest_random(regions) * stride + ztest_random(width); 4728 s = 1 + ztest_random(2 * width - 1); 4729 dmu_prefetch(os, bigobj, 0, n * chunksize, s * chunksize, 4730 ZIO_PRIORITY_SYNC_READ); 4731 4732 /* 4733 * Pick a random index and compute the offsets into packobj and bigobj. 4734 */ 4735 n = ztest_random(regions) * stride + ztest_random(width); 4736 s = 1 + ztest_random(width - 1); 4737 4738 packoff = n * sizeof (bufwad_t); 4739 packsize = s * sizeof (bufwad_t); 4740 4741 bigoff = n * chunksize; 4742 bigsize = s * chunksize; 4743 4744 packbuf = umem_alloc(packsize, UMEM_NOFAIL); 4745 bigbuf = umem_alloc(bigsize, UMEM_NOFAIL); 4746 4747 /* 4748 * free_percent of the time, free a range of bigobj rather than 4749 * overwriting it. 4750 */ 4751 freeit = (ztest_random(100) < free_percent); 4752 4753 /* 4754 * Read the current contents of our objects. 4755 */ 4756 error = dmu_read(os, packobj, packoff, packsize, packbuf, 4757 DMU_READ_PREFETCH); 4758 ASSERT0(error); 4759 error = dmu_read(os, bigobj, bigoff, bigsize, bigbuf, 4760 DMU_READ_PREFETCH); 4761 ASSERT0(error); 4762 4763 /* 4764 * Get a tx for the mods to both packobj and bigobj. 
4765 */ 4766 tx = dmu_tx_create(os); 4767 4768 dmu_tx_hold_write(tx, packobj, packoff, packsize); 4769 4770 if (freeit) 4771 dmu_tx_hold_free(tx, bigobj, bigoff, bigsize); 4772 else 4773 dmu_tx_hold_write(tx, bigobj, bigoff, bigsize); 4774 4775 /* This accounts for setting the checksum/compression. */ 4776 dmu_tx_hold_bonus(tx, bigobj); 4777 4778 txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); 4779 if (txg == 0) { 4780 umem_free(packbuf, packsize); 4781 umem_free(bigbuf, bigsize); 4782 umem_free(od, size); 4783 return; 4784 } 4785 4786 enum zio_checksum cksum; 4787 do { 4788 cksum = (enum zio_checksum) 4789 ztest_random_dsl_prop(ZFS_PROP_CHECKSUM); 4790 } while (cksum >= ZIO_CHECKSUM_LEGACY_FUNCTIONS); 4791 dmu_object_set_checksum(os, bigobj, cksum, tx); 4792 4793 enum zio_compress comp; 4794 do { 4795 comp = (enum zio_compress) 4796 ztest_random_dsl_prop(ZFS_PROP_COMPRESSION); 4797 } while (comp >= ZIO_COMPRESS_LEGACY_FUNCTIONS); 4798 dmu_object_set_compress(os, bigobj, comp, tx); 4799 4800 /* 4801 * For each index from n to n + s, verify that the existing bufwad 4802 * in packobj matches the bufwads at the head and tail of the 4803 * corresponding chunk in bigobj. Then update all three bufwads 4804 * with the new values we want to write out. 
4805 */ 4806 for (i = 0; i < s; i++) { 4807 /* LINTED */ 4808 pack = (bufwad_t *)((char *)packbuf + i * sizeof (bufwad_t)); 4809 /* LINTED */ 4810 bigH = (bufwad_t *)((char *)bigbuf + i * chunksize); 4811 /* LINTED */ 4812 bigT = (bufwad_t *)((char *)bigH + chunksize) - 1; 4813 4814 ASSERT3U((uintptr_t)bigH - (uintptr_t)bigbuf, <, bigsize); 4815 ASSERT3U((uintptr_t)bigT - (uintptr_t)bigbuf, <, bigsize); 4816 4817 if (pack->bw_txg > txg) 4818 fatal(B_FALSE, 4819 "future leak: got %"PRIx64", open txg is %"PRIx64"", 4820 pack->bw_txg, txg); 4821 4822 if (pack->bw_data != 0 && pack->bw_index != n + i) 4823 fatal(B_FALSE, "wrong index: " 4824 "got %"PRIx64", wanted %"PRIx64"+%"PRIx64"", 4825 pack->bw_index, n, i); 4826 4827 if (memcmp(pack, bigH, sizeof (bufwad_t)) != 0) 4828 fatal(B_FALSE, "pack/bigH mismatch in %p/%p", 4829 pack, bigH); 4830 4831 if (memcmp(pack, bigT, sizeof (bufwad_t)) != 0) 4832 fatal(B_FALSE, "pack/bigT mismatch in %p/%p", 4833 pack, bigT); 4834 4835 if (freeit) { 4836 memset(pack, 0, sizeof (bufwad_t)); 4837 } else { 4838 pack->bw_index = n + i; 4839 pack->bw_txg = txg; 4840 pack->bw_data = 1 + ztest_random(-2ULL); 4841 } 4842 *bigH = *pack; 4843 *bigT = *pack; 4844 } 4845 4846 /* 4847 * We've verified all the old bufwads, and made new ones. 4848 * Now write them out. 4849 */ 4850 dmu_write(os, packobj, packoff, packsize, packbuf, tx); 4851 4852 if (freeit) { 4853 if (ztest_opts.zo_verbose >= 7) { 4854 (void) printf("freeing offset %"PRIx64" size %"PRIx64"" 4855 " txg %"PRIx64"\n", 4856 bigoff, bigsize, txg); 4857 } 4858 VERIFY0(dmu_free_range(os, bigobj, bigoff, bigsize, tx)); 4859 } else { 4860 if (ztest_opts.zo_verbose >= 7) { 4861 (void) printf("writing offset %"PRIx64" size %"PRIx64"" 4862 " txg %"PRIx64"\n", 4863 bigoff, bigsize, txg); 4864 } 4865 dmu_write(os, bigobj, bigoff, bigsize, bigbuf, tx); 4866 } 4867 4868 dmu_tx_commit(tx); 4869 4870 /* 4871 * Sanity check the stuff we just wrote. 
4872 */ 4873 { 4874 void *packcheck = umem_alloc(packsize, UMEM_NOFAIL); 4875 void *bigcheck = umem_alloc(bigsize, UMEM_NOFAIL); 4876 4877 VERIFY0(dmu_read(os, packobj, packoff, 4878 packsize, packcheck, DMU_READ_PREFETCH)); 4879 VERIFY0(dmu_read(os, bigobj, bigoff, 4880 bigsize, bigcheck, DMU_READ_PREFETCH)); 4881 4882 ASSERT0(memcmp(packbuf, packcheck, packsize)); 4883 ASSERT0(memcmp(bigbuf, bigcheck, bigsize)); 4884 4885 umem_free(packcheck, packsize); 4886 umem_free(bigcheck, bigsize); 4887 } 4888 4889 umem_free(packbuf, packsize); 4890 umem_free(bigbuf, bigsize); 4891 umem_free(od, size); 4892 } 4893 4894 static void 4895 compare_and_update_pbbufs(uint64_t s, bufwad_t *packbuf, bufwad_t *bigbuf, 4896 uint64_t bigsize, uint64_t n, uint64_t chunksize, uint64_t txg) 4897 { 4898 uint64_t i; 4899 bufwad_t *pack; 4900 bufwad_t *bigH; 4901 bufwad_t *bigT; 4902 4903 /* 4904 * For each index from n to n + s, verify that the existing bufwad 4905 * in packobj matches the bufwads at the head and tail of the 4906 * corresponding chunk in bigobj. Then update all three bufwads 4907 * with the new values we want to write out. 
4908 */ 4909 for (i = 0; i < s; i++) { 4910 /* LINTED */ 4911 pack = (bufwad_t *)((char *)packbuf + i * sizeof (bufwad_t)); 4912 /* LINTED */ 4913 bigH = (bufwad_t *)((char *)bigbuf + i * chunksize); 4914 /* LINTED */ 4915 bigT = (bufwad_t *)((char *)bigH + chunksize) - 1; 4916 4917 ASSERT3U((uintptr_t)bigH - (uintptr_t)bigbuf, <, bigsize); 4918 ASSERT3U((uintptr_t)bigT - (uintptr_t)bigbuf, <, bigsize); 4919 4920 if (pack->bw_txg > txg) 4921 fatal(B_FALSE, 4922 "future leak: got %"PRIx64", open txg is %"PRIx64"", 4923 pack->bw_txg, txg); 4924 4925 if (pack->bw_data != 0 && pack->bw_index != n + i) 4926 fatal(B_FALSE, "wrong index: " 4927 "got %"PRIx64", wanted %"PRIx64"+%"PRIx64"", 4928 pack->bw_index, n, i); 4929 4930 if (memcmp(pack, bigH, sizeof (bufwad_t)) != 0) 4931 fatal(B_FALSE, "pack/bigH mismatch in %p/%p", 4932 pack, bigH); 4933 4934 if (memcmp(pack, bigT, sizeof (bufwad_t)) != 0) 4935 fatal(B_FALSE, "pack/bigT mismatch in %p/%p", 4936 pack, bigT); 4937 4938 pack->bw_index = n + i; 4939 pack->bw_txg = txg; 4940 pack->bw_data = 1 + ztest_random(-2ULL); 4941 4942 *bigH = *pack; 4943 *bigT = *pack; 4944 } 4945 } 4946 4947 #undef OD_ARRAY_SIZE 4948 #define OD_ARRAY_SIZE 2 4949 4950 void 4951 ztest_dmu_read_write_zcopy(ztest_ds_t *zd, uint64_t id) 4952 { 4953 objset_t *os = zd->zd_os; 4954 ztest_od_t *od; 4955 dmu_tx_t *tx; 4956 uint64_t i; 4957 int error; 4958 int size; 4959 uint64_t n, s, txg; 4960 bufwad_t *packbuf, *bigbuf; 4961 uint64_t packobj, packoff, packsize, bigobj, bigoff, bigsize; 4962 uint64_t blocksize = ztest_random_blocksize(); 4963 uint64_t chunksize = blocksize; 4964 uint64_t regions = 997; 4965 uint64_t stride = 123456789ULL; 4966 uint64_t width = 9; 4967 dmu_buf_t *bonus_db; 4968 arc_buf_t **bigbuf_arcbufs; 4969 dmu_object_info_t doi; 4970 4971 size = sizeof (ztest_od_t) * OD_ARRAY_SIZE; 4972 od = umem_alloc(size, UMEM_NOFAIL); 4973 4974 /* 4975 * This test uses two objects, packobj and bigobj, that are always 4976 * updated together (i.e. 
in the same tx) so that their contents are 4977 * in sync and can be compared. Their contents relate to each other 4978 * in a simple way: packobj is a dense array of 'bufwad' structures, 4979 * while bigobj is a sparse array of the same bufwads. Specifically, 4980 * for any index n, there are three bufwads that should be identical: 4981 * 4982 * packobj, at offset n * sizeof (bufwad_t) 4983 * bigobj, at the head of the nth chunk 4984 * bigobj, at the tail of the nth chunk 4985 * 4986 * The chunk size is set equal to bigobj block size so that 4987 * dmu_assign_arcbuf_by_dbuf() can be tested for object updates. 4988 */ 4989 4990 /* 4991 * Read the directory info. If it's the first time, set things up. 4992 */ 4993 ztest_od_init(od, id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, 0, 0); 4994 ztest_od_init(od + 1, id, FTAG, 1, DMU_OT_UINT64_OTHER, 0, 0, 4995 chunksize); 4996 4997 4998 if (ztest_object_init(zd, od, size, B_FALSE) != 0) { 4999 umem_free(od, size); 5000 return; 5001 } 5002 5003 bigobj = od[0].od_object; 5004 packobj = od[1].od_object; 5005 blocksize = od[0].od_blocksize; 5006 chunksize = blocksize; 5007 ASSERT3U(chunksize, ==, od[1].od_gen); 5008 5009 VERIFY0(dmu_object_info(os, bigobj, &doi)); 5010 VERIFY(ISP2(doi.doi_data_block_size)); 5011 VERIFY3U(chunksize, ==, doi.doi_data_block_size); 5012 VERIFY3U(chunksize, >=, 2 * sizeof (bufwad_t)); 5013 5014 /* 5015 * Pick a random index and compute the offsets into packobj and bigobj. 
5016 */ 5017 n = ztest_random(regions) * stride + ztest_random(width); 5018 s = 1 + ztest_random(width - 1); 5019 5020 packoff = n * sizeof (bufwad_t); 5021 packsize = s * sizeof (bufwad_t); 5022 5023 bigoff = n * chunksize; 5024 bigsize = s * chunksize; 5025 5026 packbuf = umem_zalloc(packsize, UMEM_NOFAIL); 5027 bigbuf = umem_zalloc(bigsize, UMEM_NOFAIL); 5028 5029 VERIFY0(dmu_bonus_hold(os, bigobj, FTAG, &bonus_db)); 5030 5031 bigbuf_arcbufs = umem_zalloc(2 * s * sizeof (arc_buf_t *), UMEM_NOFAIL); 5032 5033 /* 5034 * Iteration 0 test zcopy for DB_UNCACHED dbufs. 5035 * Iteration 1 test zcopy to already referenced dbufs. 5036 * Iteration 2 test zcopy to dirty dbuf in the same txg. 5037 * Iteration 3 test zcopy to dbuf dirty in previous txg. 5038 * Iteration 4 test zcopy when dbuf is no longer dirty. 5039 * Iteration 5 test zcopy when it can't be done. 5040 * Iteration 6 one more zcopy write. 5041 */ 5042 for (i = 0; i < 7; i++) { 5043 uint64_t j; 5044 uint64_t off; 5045 5046 /* 5047 * In iteration 5 (i == 5) use arcbufs 5048 * that don't match bigobj blksz to test 5049 * dmu_assign_arcbuf_by_dbuf() when it can't directly 5050 * assign an arcbuf to a dbuf. 5051 */ 5052 for (j = 0; j < s; j++) { 5053 if (i != 5 || chunksize < (SPA_MINBLOCKSIZE * 2)) { 5054 bigbuf_arcbufs[j] = 5055 dmu_request_arcbuf(bonus_db, chunksize); 5056 } else { 5057 bigbuf_arcbufs[2 * j] = 5058 dmu_request_arcbuf(bonus_db, chunksize / 2); 5059 bigbuf_arcbufs[2 * j + 1] = 5060 dmu_request_arcbuf(bonus_db, chunksize / 2); 5061 } 5062 } 5063 5064 /* 5065 * Get a tx for the mods to both packobj and bigobj. 
5066 */ 5067 tx = dmu_tx_create(os); 5068 5069 dmu_tx_hold_write(tx, packobj, packoff, packsize); 5070 dmu_tx_hold_write(tx, bigobj, bigoff, bigsize); 5071 5072 txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); 5073 if (txg == 0) { 5074 umem_free(packbuf, packsize); 5075 umem_free(bigbuf, bigsize); 5076 for (j = 0; j < s; j++) { 5077 if (i != 5 || 5078 chunksize < (SPA_MINBLOCKSIZE * 2)) { 5079 dmu_return_arcbuf(bigbuf_arcbufs[j]); 5080 } else { 5081 dmu_return_arcbuf( 5082 bigbuf_arcbufs[2 * j]); 5083 dmu_return_arcbuf( 5084 bigbuf_arcbufs[2 * j + 1]); 5085 } 5086 } 5087 umem_free(bigbuf_arcbufs, 2 * s * sizeof (arc_buf_t *)); 5088 umem_free(od, size); 5089 dmu_buf_rele(bonus_db, FTAG); 5090 return; 5091 } 5092 5093 /* 5094 * 50% of the time don't read objects in the 1st iteration to 5095 * test dmu_assign_arcbuf_by_dbuf() for the case when there are 5096 * no existing dbufs for the specified offsets. 5097 */ 5098 if (i != 0 || ztest_random(2) != 0) { 5099 error = dmu_read(os, packobj, packoff, 5100 packsize, packbuf, DMU_READ_PREFETCH); 5101 ASSERT0(error); 5102 error = dmu_read(os, bigobj, bigoff, bigsize, 5103 bigbuf, DMU_READ_PREFETCH); 5104 ASSERT0(error); 5105 } 5106 compare_and_update_pbbufs(s, packbuf, bigbuf, bigsize, 5107 n, chunksize, txg); 5108 5109 /* 5110 * We've verified all the old bufwads, and made new ones. 5111 * Now write them out. 
5112 */ 5113 dmu_write(os, packobj, packoff, packsize, packbuf, tx); 5114 if (ztest_opts.zo_verbose >= 7) { 5115 (void) printf("writing offset %"PRIx64" size %"PRIx64"" 5116 " txg %"PRIx64"\n", 5117 bigoff, bigsize, txg); 5118 } 5119 for (off = bigoff, j = 0; j < s; j++, off += chunksize) { 5120 dmu_buf_t *dbt; 5121 if (i != 5 || chunksize < (SPA_MINBLOCKSIZE * 2)) { 5122 memcpy(bigbuf_arcbufs[j]->b_data, 5123 (caddr_t)bigbuf + (off - bigoff), 5124 chunksize); 5125 } else { 5126 memcpy(bigbuf_arcbufs[2 * j]->b_data, 5127 (caddr_t)bigbuf + (off - bigoff), 5128 chunksize / 2); 5129 memcpy(bigbuf_arcbufs[2 * j + 1]->b_data, 5130 (caddr_t)bigbuf + (off - bigoff) + 5131 chunksize / 2, 5132 chunksize / 2); 5133 } 5134 5135 if (i == 1) { 5136 VERIFY(dmu_buf_hold(os, bigobj, off, 5137 FTAG, &dbt, DMU_READ_NO_PREFETCH) == 0); 5138 } 5139 if (i != 5 || chunksize < (SPA_MINBLOCKSIZE * 2)) { 5140 VERIFY0(dmu_assign_arcbuf_by_dbuf(bonus_db, 5141 off, bigbuf_arcbufs[j], tx)); 5142 } else { 5143 VERIFY0(dmu_assign_arcbuf_by_dbuf(bonus_db, 5144 off, bigbuf_arcbufs[2 * j], tx)); 5145 VERIFY0(dmu_assign_arcbuf_by_dbuf(bonus_db, 5146 off + chunksize / 2, 5147 bigbuf_arcbufs[2 * j + 1], tx)); 5148 } 5149 if (i == 1) { 5150 dmu_buf_rele(dbt, FTAG); 5151 } 5152 } 5153 dmu_tx_commit(tx); 5154 5155 /* 5156 * Sanity check the stuff we just wrote. 
5157 */ 5158 { 5159 void *packcheck = umem_alloc(packsize, UMEM_NOFAIL); 5160 void *bigcheck = umem_alloc(bigsize, UMEM_NOFAIL); 5161 5162 VERIFY0(dmu_read(os, packobj, packoff, 5163 packsize, packcheck, DMU_READ_PREFETCH)); 5164 VERIFY0(dmu_read(os, bigobj, bigoff, 5165 bigsize, bigcheck, DMU_READ_PREFETCH)); 5166 5167 ASSERT0(memcmp(packbuf, packcheck, packsize)); 5168 ASSERT0(memcmp(bigbuf, bigcheck, bigsize)); 5169 5170 umem_free(packcheck, packsize); 5171 umem_free(bigcheck, bigsize); 5172 } 5173 if (i == 2) { 5174 txg_wait_open(dmu_objset_pool(os), 0, B_TRUE); 5175 } else if (i == 3) { 5176 txg_wait_synced(dmu_objset_pool(os), 0); 5177 } 5178 } 5179 5180 dmu_buf_rele(bonus_db, FTAG); 5181 umem_free(packbuf, packsize); 5182 umem_free(bigbuf, bigsize); 5183 umem_free(bigbuf_arcbufs, 2 * s * sizeof (arc_buf_t *)); 5184 umem_free(od, size); 5185 } 5186 5187 void 5188 ztest_dmu_write_parallel(ztest_ds_t *zd, uint64_t id) 5189 { 5190 (void) id; 5191 ztest_od_t *od; 5192 5193 od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL); 5194 uint64_t offset = (1ULL << (ztest_random(20) + 43)) + 5195 (ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT); 5196 5197 /* 5198 * Have multiple threads write to large offsets in an object 5199 * to verify that parallel writes to an object -- even to the 5200 * same blocks within the object -- doesn't cause any trouble. 
5201 */ 5202 ztest_od_init(od, ID_PARALLEL, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0, 0); 5203 5204 if (ztest_object_init(zd, od, sizeof (ztest_od_t), B_FALSE) != 0) 5205 return; 5206 5207 while (ztest_random(10) != 0) 5208 ztest_io(zd, od->od_object, offset); 5209 5210 umem_free(od, sizeof (ztest_od_t)); 5211 } 5212 5213 void 5214 ztest_dmu_prealloc(ztest_ds_t *zd, uint64_t id) 5215 { 5216 ztest_od_t *od; 5217 uint64_t offset = (1ULL << (ztest_random(4) + SPA_MAXBLOCKSHIFT)) + 5218 (ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT); 5219 uint64_t count = ztest_random(20) + 1; 5220 uint64_t blocksize = ztest_random_blocksize(); 5221 void *data; 5222 5223 od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL); 5224 5225 ztest_od_init(od, id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, 0, 0); 5226 5227 if (ztest_object_init(zd, od, sizeof (ztest_od_t), 5228 !ztest_random(2)) != 0) { 5229 umem_free(od, sizeof (ztest_od_t)); 5230 return; 5231 } 5232 5233 if (ztest_truncate(zd, od->od_object, offset, count * blocksize) != 0) { 5234 umem_free(od, sizeof (ztest_od_t)); 5235 return; 5236 } 5237 5238 ztest_prealloc(zd, od->od_object, offset, count * blocksize); 5239 5240 data = umem_zalloc(blocksize, UMEM_NOFAIL); 5241 5242 while (ztest_random(count) != 0) { 5243 uint64_t randoff = offset + (ztest_random(count) * blocksize); 5244 if (ztest_write(zd, od->od_object, randoff, blocksize, 5245 data) != 0) 5246 break; 5247 while (ztest_random(4) != 0) 5248 ztest_io(zd, od->od_object, randoff); 5249 } 5250 5251 umem_free(data, blocksize); 5252 umem_free(od, sizeof (ztest_od_t)); 5253 } 5254 5255 /* 5256 * Verify that zap_{create,destroy,add,remove,update} work as expected. 
5257 */ 5258 #define ZTEST_ZAP_MIN_INTS 1 5259 #define ZTEST_ZAP_MAX_INTS 4 5260 #define ZTEST_ZAP_MAX_PROPS 1000 5261 5262 void 5263 ztest_zap(ztest_ds_t *zd, uint64_t id) 5264 { 5265 objset_t *os = zd->zd_os; 5266 ztest_od_t *od; 5267 uint64_t object; 5268 uint64_t txg, last_txg; 5269 uint64_t value[ZTEST_ZAP_MAX_INTS]; 5270 uint64_t zl_ints, zl_intsize, prop; 5271 int i, ints; 5272 dmu_tx_t *tx; 5273 char propname[100], txgname[100]; 5274 int error; 5275 const char *const hc[2] = { "s.acl.h", ".s.open.h.hyLZlg" }; 5276 5277 od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL); 5278 ztest_od_init(od, id, FTAG, 0, DMU_OT_ZAP_OTHER, 0, 0, 0); 5279 5280 if (ztest_object_init(zd, od, sizeof (ztest_od_t), 5281 !ztest_random(2)) != 0) 5282 goto out; 5283 5284 object = od->od_object; 5285 5286 /* 5287 * Generate a known hash collision, and verify that 5288 * we can lookup and remove both entries. 5289 */ 5290 tx = dmu_tx_create(os); 5291 dmu_tx_hold_zap(tx, object, B_TRUE, NULL); 5292 txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); 5293 if (txg == 0) 5294 goto out; 5295 for (i = 0; i < 2; i++) { 5296 value[i] = i; 5297 VERIFY0(zap_add(os, object, hc[i], sizeof (uint64_t), 5298 1, &value[i], tx)); 5299 } 5300 for (i = 0; i < 2; i++) { 5301 VERIFY3U(EEXIST, ==, zap_add(os, object, hc[i], 5302 sizeof (uint64_t), 1, &value[i], tx)); 5303 VERIFY0( 5304 zap_length(os, object, hc[i], &zl_intsize, &zl_ints)); 5305 ASSERT3U(zl_intsize, ==, sizeof (uint64_t)); 5306 ASSERT3U(zl_ints, ==, 1); 5307 } 5308 for (i = 0; i < 2; i++) { 5309 VERIFY0(zap_remove(os, object, hc[i], tx)); 5310 } 5311 dmu_tx_commit(tx); 5312 5313 /* 5314 * Generate a bunch of random entries. 
5315 */ 5316 ints = MAX(ZTEST_ZAP_MIN_INTS, object % ZTEST_ZAP_MAX_INTS); 5317 5318 prop = ztest_random(ZTEST_ZAP_MAX_PROPS); 5319 (void) sprintf(propname, "prop_%"PRIu64"", prop); 5320 (void) sprintf(txgname, "txg_%"PRIu64"", prop); 5321 memset(value, 0, sizeof (value)); 5322 last_txg = 0; 5323 5324 /* 5325 * If these zap entries already exist, validate their contents. 5326 */ 5327 error = zap_length(os, object, txgname, &zl_intsize, &zl_ints); 5328 if (error == 0) { 5329 ASSERT3U(zl_intsize, ==, sizeof (uint64_t)); 5330 ASSERT3U(zl_ints, ==, 1); 5331 5332 VERIFY0(zap_lookup(os, object, txgname, zl_intsize, 5333 zl_ints, &last_txg)); 5334 5335 VERIFY0(zap_length(os, object, propname, &zl_intsize, 5336 &zl_ints)); 5337 5338 ASSERT3U(zl_intsize, ==, sizeof (uint64_t)); 5339 ASSERT3U(zl_ints, ==, ints); 5340 5341 VERIFY0(zap_lookup(os, object, propname, zl_intsize, 5342 zl_ints, value)); 5343 5344 for (i = 0; i < ints; i++) { 5345 ASSERT3U(value[i], ==, last_txg + object + i); 5346 } 5347 } else { 5348 ASSERT3U(error, ==, ENOENT); 5349 } 5350 5351 /* 5352 * Atomically update two entries in our zap object. 5353 * The first is named txg_%llu, and contains the txg 5354 * in which the property was last updated. The second 5355 * is named prop_%llu, and the nth element of its value 5356 * should be txg + object + n. 5357 */ 5358 tx = dmu_tx_create(os); 5359 dmu_tx_hold_zap(tx, object, B_TRUE, NULL); 5360 txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); 5361 if (txg == 0) 5362 goto out; 5363 5364 if (last_txg > txg) 5365 fatal(B_FALSE, "zap future leak: old %"PRIu64" new %"PRIu64"", 5366 last_txg, txg); 5367 5368 for (i = 0; i < ints; i++) 5369 value[i] = txg + object + i; 5370 5371 VERIFY0(zap_update(os, object, txgname, sizeof (uint64_t), 5372 1, &txg, tx)); 5373 VERIFY0(zap_update(os, object, propname, sizeof (uint64_t), 5374 ints, value, tx)); 5375 5376 dmu_tx_commit(tx); 5377 5378 /* 5379 * Remove a random pair of entries. 
5380 */ 5381 prop = ztest_random(ZTEST_ZAP_MAX_PROPS); 5382 (void) sprintf(propname, "prop_%"PRIu64"", prop); 5383 (void) sprintf(txgname, "txg_%"PRIu64"", prop); 5384 5385 error = zap_length(os, object, txgname, &zl_intsize, &zl_ints); 5386 5387 if (error == ENOENT) 5388 goto out; 5389 5390 ASSERT0(error); 5391 5392 tx = dmu_tx_create(os); 5393 dmu_tx_hold_zap(tx, object, B_TRUE, NULL); 5394 txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); 5395 if (txg == 0) 5396 goto out; 5397 VERIFY0(zap_remove(os, object, txgname, tx)); 5398 VERIFY0(zap_remove(os, object, propname, tx)); 5399 dmu_tx_commit(tx); 5400 out: 5401 umem_free(od, sizeof (ztest_od_t)); 5402 } 5403 5404 /* 5405 * Test case to test the upgrading of a microzap to fatzap. 5406 */ 5407 void 5408 ztest_fzap(ztest_ds_t *zd, uint64_t id) 5409 { 5410 objset_t *os = zd->zd_os; 5411 ztest_od_t *od; 5412 uint64_t object, txg, value; 5413 5414 od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL); 5415 ztest_od_init(od, id, FTAG, 0, DMU_OT_ZAP_OTHER, 0, 0, 0); 5416 5417 if (ztest_object_init(zd, od, sizeof (ztest_od_t), 5418 !ztest_random(2)) != 0) 5419 goto out; 5420 object = od->od_object; 5421 5422 /* 5423 * Add entries to this ZAP and make sure it spills over 5424 * and gets upgraded to a fatzap. Also, since we are adding 5425 * 2050 entries we should see ptrtbl growth and leaf-block split. 
5426 */ 5427 for (value = 0; value < 2050; value++) { 5428 char name[ZFS_MAX_DATASET_NAME_LEN]; 5429 dmu_tx_t *tx; 5430 int error; 5431 5432 (void) snprintf(name, sizeof (name), "fzap-%"PRIu64"-%"PRIu64"", 5433 id, value); 5434 5435 tx = dmu_tx_create(os); 5436 dmu_tx_hold_zap(tx, object, B_TRUE, name); 5437 txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); 5438 if (txg == 0) 5439 goto out; 5440 error = zap_add(os, object, name, sizeof (uint64_t), 1, 5441 &value, tx); 5442 ASSERT(error == 0 || error == EEXIST); 5443 dmu_tx_commit(tx); 5444 } 5445 out: 5446 umem_free(od, sizeof (ztest_od_t)); 5447 } 5448 5449 void 5450 ztest_zap_parallel(ztest_ds_t *zd, uint64_t id) 5451 { 5452 (void) id; 5453 objset_t *os = zd->zd_os; 5454 ztest_od_t *od; 5455 uint64_t txg, object, count, wsize, wc, zl_wsize, zl_wc; 5456 dmu_tx_t *tx; 5457 int i, namelen, error; 5458 int micro = ztest_random(2); 5459 char name[20], string_value[20]; 5460 void *data; 5461 5462 od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL); 5463 ztest_od_init(od, ID_PARALLEL, FTAG, micro, DMU_OT_ZAP_OTHER, 0, 0, 0); 5464 5465 if (ztest_object_init(zd, od, sizeof (ztest_od_t), B_FALSE) != 0) { 5466 umem_free(od, sizeof (ztest_od_t)); 5467 return; 5468 } 5469 5470 object = od->od_object; 5471 5472 /* 5473 * Generate a random name of the form 'xxx.....' where each 5474 * x is a random printable character and the dots are dots. 5475 * There are 94 such characters, and the name length goes from 5476 * 6 to 20, so there are 94^3 * 15 = 12,458,760 possible names. 5477 */ 5478 namelen = ztest_random(sizeof (name) - 5) + 5 + 1; 5479 5480 for (i = 0; i < 3; i++) 5481 name[i] = '!' + ztest_random('~' - '!' 
+ 1); 5482 for (; i < namelen - 1; i++) 5483 name[i] = '.'; 5484 name[i] = '\0'; 5485 5486 if ((namelen & 1) || micro) { 5487 wsize = sizeof (txg); 5488 wc = 1; 5489 data = &txg; 5490 } else { 5491 wsize = 1; 5492 wc = namelen; 5493 data = string_value; 5494 } 5495 5496 count = -1ULL; 5497 VERIFY0(zap_count(os, object, &count)); 5498 ASSERT3S(count, !=, -1ULL); 5499 5500 /* 5501 * Select an operation: length, lookup, add, update, remove. 5502 */ 5503 i = ztest_random(5); 5504 5505 if (i >= 2) { 5506 tx = dmu_tx_create(os); 5507 dmu_tx_hold_zap(tx, object, B_TRUE, NULL); 5508 txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); 5509 if (txg == 0) { 5510 umem_free(od, sizeof (ztest_od_t)); 5511 return; 5512 } 5513 memcpy(string_value, name, namelen); 5514 } else { 5515 tx = NULL; 5516 txg = 0; 5517 memset(string_value, 0, namelen); 5518 } 5519 5520 switch (i) { 5521 5522 case 0: 5523 error = zap_length(os, object, name, &zl_wsize, &zl_wc); 5524 if (error == 0) { 5525 ASSERT3U(wsize, ==, zl_wsize); 5526 ASSERT3U(wc, ==, zl_wc); 5527 } else { 5528 ASSERT3U(error, ==, ENOENT); 5529 } 5530 break; 5531 5532 case 1: 5533 error = zap_lookup(os, object, name, wsize, wc, data); 5534 if (error == 0) { 5535 if (data == string_value && 5536 memcmp(name, data, namelen) != 0) 5537 fatal(B_FALSE, "name '%s' != val '%s' len %d", 5538 name, (char *)data, namelen); 5539 } else { 5540 ASSERT3U(error, ==, ENOENT); 5541 } 5542 break; 5543 5544 case 2: 5545 error = zap_add(os, object, name, wsize, wc, data, tx); 5546 ASSERT(error == 0 || error == EEXIST); 5547 break; 5548 5549 case 3: 5550 VERIFY0(zap_update(os, object, name, wsize, wc, data, tx)); 5551 break; 5552 5553 case 4: 5554 error = zap_remove(os, object, name, tx); 5555 ASSERT(error == 0 || error == ENOENT); 5556 break; 5557 } 5558 5559 if (tx != NULL) 5560 dmu_tx_commit(tx); 5561 5562 umem_free(od, sizeof (ztest_od_t)); 5563 } 5564 5565 /* 5566 * Commit callback data. 
5567 */ 5568 typedef struct ztest_cb_data { 5569 list_node_t zcd_node; 5570 uint64_t zcd_txg; 5571 int zcd_expected_err; 5572 boolean_t zcd_added; 5573 boolean_t zcd_called; 5574 spa_t *zcd_spa; 5575 } ztest_cb_data_t; 5576 5577 /* This is the actual commit callback function */ 5578 static void 5579 ztest_commit_callback(void *arg, int error) 5580 { 5581 ztest_cb_data_t *data = arg; 5582 uint64_t synced_txg; 5583 5584 VERIFY3P(data, !=, NULL); 5585 VERIFY3S(data->zcd_expected_err, ==, error); 5586 VERIFY(!data->zcd_called); 5587 5588 synced_txg = spa_last_synced_txg(data->zcd_spa); 5589 if (data->zcd_txg > synced_txg) 5590 fatal(B_FALSE, 5591 "commit callback of txg %"PRIu64" called prematurely, " 5592 "last synced txg = %"PRIu64"\n", 5593 data->zcd_txg, synced_txg); 5594 5595 data->zcd_called = B_TRUE; 5596 5597 if (error == ECANCELED) { 5598 ASSERT0(data->zcd_txg); 5599 ASSERT(!data->zcd_added); 5600 5601 /* 5602 * The private callback data should be destroyed here, but 5603 * since we are going to check the zcd_called field after 5604 * dmu_tx_abort(), we will destroy it there. 
5605 */ 5606 return; 5607 } 5608 5609 ASSERT(data->zcd_added); 5610 ASSERT3U(data->zcd_txg, !=, 0); 5611 5612 (void) mutex_enter(&zcl.zcl_callbacks_lock); 5613 5614 /* See if this cb was called more quickly */ 5615 if ((synced_txg - data->zcd_txg) < zc_min_txg_delay) 5616 zc_min_txg_delay = synced_txg - data->zcd_txg; 5617 5618 /* Remove our callback from the list */ 5619 list_remove(&zcl.zcl_callbacks, data); 5620 5621 (void) mutex_exit(&zcl.zcl_callbacks_lock); 5622 5623 umem_free(data, sizeof (ztest_cb_data_t)); 5624 } 5625 5626 /* Allocate and initialize callback data structure */ 5627 static ztest_cb_data_t * 5628 ztest_create_cb_data(objset_t *os, uint64_t txg) 5629 { 5630 ztest_cb_data_t *cb_data; 5631 5632 cb_data = umem_zalloc(sizeof (ztest_cb_data_t), UMEM_NOFAIL); 5633 5634 cb_data->zcd_txg = txg; 5635 cb_data->zcd_spa = dmu_objset_spa(os); 5636 list_link_init(&cb_data->zcd_node); 5637 5638 return (cb_data); 5639 } 5640 5641 /* 5642 * Commit callback test. 5643 */ 5644 void 5645 ztest_dmu_commit_callbacks(ztest_ds_t *zd, uint64_t id) 5646 { 5647 objset_t *os = zd->zd_os; 5648 ztest_od_t *od; 5649 dmu_tx_t *tx; 5650 ztest_cb_data_t *cb_data[3], *tmp_cb; 5651 uint64_t old_txg, txg; 5652 int i, error = 0; 5653 5654 od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL); 5655 ztest_od_init(od, id, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0, 0); 5656 5657 if (ztest_object_init(zd, od, sizeof (ztest_od_t), B_FALSE) != 0) { 5658 umem_free(od, sizeof (ztest_od_t)); 5659 return; 5660 } 5661 5662 tx = dmu_tx_create(os); 5663 5664 cb_data[0] = ztest_create_cb_data(os, 0); 5665 dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[0]); 5666 5667 dmu_tx_hold_write(tx, od->od_object, 0, sizeof (uint64_t)); 5668 5669 /* Every once in a while, abort the transaction on purpose */ 5670 if (ztest_random(100) == 0) 5671 error = -1; 5672 5673 if (!error) 5674 error = dmu_tx_assign(tx, TXG_NOWAIT); 5675 5676 txg = error ? 
0 : dmu_tx_get_txg(tx); 5677 5678 cb_data[0]->zcd_txg = txg; 5679 cb_data[1] = ztest_create_cb_data(os, txg); 5680 dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[1]); 5681 5682 if (error) { 5683 /* 5684 * It's not a strict requirement to call the registered 5685 * callbacks from inside dmu_tx_abort(), but that's what 5686 * it's supposed to happen in the current implementation 5687 * so we will check for that. 5688 */ 5689 for (i = 0; i < 2; i++) { 5690 cb_data[i]->zcd_expected_err = ECANCELED; 5691 VERIFY(!cb_data[i]->zcd_called); 5692 } 5693 5694 dmu_tx_abort(tx); 5695 5696 for (i = 0; i < 2; i++) { 5697 VERIFY(cb_data[i]->zcd_called); 5698 umem_free(cb_data[i], sizeof (ztest_cb_data_t)); 5699 } 5700 5701 umem_free(od, sizeof (ztest_od_t)); 5702 return; 5703 } 5704 5705 cb_data[2] = ztest_create_cb_data(os, txg); 5706 dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[2]); 5707 5708 /* 5709 * Read existing data to make sure there isn't a future leak. 5710 */ 5711 VERIFY0(dmu_read(os, od->od_object, 0, sizeof (uint64_t), 5712 &old_txg, DMU_READ_PREFETCH)); 5713 5714 if (old_txg > txg) 5715 fatal(B_FALSE, 5716 "future leak: got %"PRIu64", open txg is %"PRIu64"", 5717 old_txg, txg); 5718 5719 dmu_write(os, od->od_object, 0, sizeof (uint64_t), &txg, tx); 5720 5721 (void) mutex_enter(&zcl.zcl_callbacks_lock); 5722 5723 /* 5724 * Since commit callbacks don't have any ordering requirement and since 5725 * it is theoretically possible for a commit callback to be called 5726 * after an arbitrary amount of time has elapsed since its txg has been 5727 * synced, it is difficult to reliably determine whether a commit 5728 * callback hasn't been called due to high load or due to a flawed 5729 * implementation. 5730 * 5731 * In practice, we will assume that if after a certain number of txgs a 5732 * commit callback hasn't been called, then most likely there's an 5733 * implementation bug.. 
5734 */ 5735 tmp_cb = list_head(&zcl.zcl_callbacks); 5736 if (tmp_cb != NULL && 5737 tmp_cb->zcd_txg + ZTEST_COMMIT_CB_THRESH < txg) { 5738 fatal(B_FALSE, 5739 "Commit callback threshold exceeded, " 5740 "oldest txg: %"PRIu64", open txg: %"PRIu64"\n", 5741 tmp_cb->zcd_txg, txg); 5742 } 5743 5744 /* 5745 * Let's find the place to insert our callbacks. 5746 * 5747 * Even though the list is ordered by txg, it is possible for the 5748 * insertion point to not be the end because our txg may already be 5749 * quiescing at this point and other callbacks in the open txg 5750 * (from other objsets) may have sneaked in. 5751 */ 5752 tmp_cb = list_tail(&zcl.zcl_callbacks); 5753 while (tmp_cb != NULL && tmp_cb->zcd_txg > txg) 5754 tmp_cb = list_prev(&zcl.zcl_callbacks, tmp_cb); 5755 5756 /* Add the 3 callbacks to the list */ 5757 for (i = 0; i < 3; i++) { 5758 if (tmp_cb == NULL) 5759 list_insert_head(&zcl.zcl_callbacks, cb_data[i]); 5760 else 5761 list_insert_after(&zcl.zcl_callbacks, tmp_cb, 5762 cb_data[i]); 5763 5764 cb_data[i]->zcd_added = B_TRUE; 5765 VERIFY(!cb_data[i]->zcd_called); 5766 5767 tmp_cb = cb_data[i]; 5768 } 5769 5770 zc_cb_counter += 3; 5771 5772 (void) mutex_exit(&zcl.zcl_callbacks_lock); 5773 5774 dmu_tx_commit(tx); 5775 5776 umem_free(od, sizeof (ztest_od_t)); 5777 } 5778 5779 /* 5780 * Visit each object in the dataset. Verify that its properties 5781 * are consistent what was stored in the block tag when it was created, 5782 * and that its unused bonus buffer space has not been overwritten. 
5783 */ 5784 void 5785 ztest_verify_dnode_bt(ztest_ds_t *zd, uint64_t id) 5786 { 5787 (void) id; 5788 objset_t *os = zd->zd_os; 5789 uint64_t obj; 5790 int err = 0; 5791 5792 for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE, 0)) { 5793 ztest_block_tag_t *bt = NULL; 5794 dmu_object_info_t doi; 5795 dmu_buf_t *db; 5796 5797 ztest_object_lock(zd, obj, RL_READER); 5798 if (dmu_bonus_hold(os, obj, FTAG, &db) != 0) { 5799 ztest_object_unlock(zd, obj); 5800 continue; 5801 } 5802 5803 dmu_object_info_from_db(db, &doi); 5804 if (doi.doi_bonus_size >= sizeof (*bt)) 5805 bt = ztest_bt_bonus(db); 5806 5807 if (bt && bt->bt_magic == BT_MAGIC) { 5808 ztest_bt_verify(bt, os, obj, doi.doi_dnodesize, 5809 bt->bt_offset, bt->bt_gen, bt->bt_txg, 5810 bt->bt_crtxg); 5811 ztest_verify_unused_bonus(db, bt, obj, os, bt->bt_gen); 5812 } 5813 5814 dmu_buf_rele(db, FTAG); 5815 ztest_object_unlock(zd, obj); 5816 } 5817 } 5818 5819 void 5820 ztest_dsl_prop_get_set(ztest_ds_t *zd, uint64_t id) 5821 { 5822 (void) id; 5823 zfs_prop_t proplist[] = { 5824 ZFS_PROP_CHECKSUM, 5825 ZFS_PROP_COMPRESSION, 5826 ZFS_PROP_COPIES, 5827 ZFS_PROP_DEDUP 5828 }; 5829 5830 (void) pthread_rwlock_rdlock(&ztest_name_lock); 5831 5832 for (int p = 0; p < sizeof (proplist) / sizeof (proplist[0]); p++) 5833 (void) ztest_dsl_prop_set_uint64(zd->zd_name, proplist[p], 5834 ztest_random_dsl_prop(proplist[p]), (int)ztest_random(2)); 5835 5836 VERIFY0(ztest_dsl_prop_set_uint64(zd->zd_name, ZFS_PROP_RECORDSIZE, 5837 ztest_random_blocksize(), (int)ztest_random(2))); 5838 5839 (void) pthread_rwlock_unlock(&ztest_name_lock); 5840 } 5841 5842 void 5843 ztest_spa_prop_get_set(ztest_ds_t *zd, uint64_t id) 5844 { 5845 (void) zd, (void) id; 5846 nvlist_t *props = NULL; 5847 5848 (void) pthread_rwlock_rdlock(&ztest_name_lock); 5849 5850 (void) ztest_spa_prop_set_uint64(ZPOOL_PROP_AUTOTRIM, ztest_random(2)); 5851 5852 VERIFY0(spa_prop_get(ztest_spa, &props)); 5853 5854 if (ztest_opts.zo_verbose >= 6) 5855 
dump_nvlist(props, 4); 5856 5857 fnvlist_free(props); 5858 5859 (void) pthread_rwlock_unlock(&ztest_name_lock); 5860 } 5861 5862 static int 5863 user_release_one(const char *snapname, const char *holdname) 5864 { 5865 nvlist_t *snaps, *holds; 5866 int error; 5867 5868 snaps = fnvlist_alloc(); 5869 holds = fnvlist_alloc(); 5870 fnvlist_add_boolean(holds, holdname); 5871 fnvlist_add_nvlist(snaps, snapname, holds); 5872 fnvlist_free(holds); 5873 error = dsl_dataset_user_release(snaps, NULL); 5874 fnvlist_free(snaps); 5875 return (error); 5876 } 5877 5878 /* 5879 * Test snapshot hold/release and deferred destroy. 5880 */ 5881 void 5882 ztest_dmu_snapshot_hold(ztest_ds_t *zd, uint64_t id) 5883 { 5884 int error; 5885 objset_t *os = zd->zd_os; 5886 objset_t *origin; 5887 char snapname[100]; 5888 char fullname[100]; 5889 char clonename[100]; 5890 char tag[100]; 5891 char osname[ZFS_MAX_DATASET_NAME_LEN]; 5892 nvlist_t *holds; 5893 5894 (void) pthread_rwlock_rdlock(&ztest_name_lock); 5895 5896 dmu_objset_name(os, osname); 5897 5898 (void) snprintf(snapname, sizeof (snapname), "sh1_%"PRIu64"", id); 5899 (void) snprintf(fullname, sizeof (fullname), "%s@%s", osname, snapname); 5900 (void) snprintf(clonename, sizeof (clonename), "%s/ch1_%"PRIu64"", 5901 osname, id); 5902 (void) snprintf(tag, sizeof (tag), "tag_%"PRIu64"", id); 5903 5904 /* 5905 * Clean up from any previous run. 5906 */ 5907 error = dsl_destroy_head(clonename); 5908 if (error != ENOENT) 5909 ASSERT0(error); 5910 error = user_release_one(fullname, tag); 5911 if (error != ESRCH && error != ENOENT) 5912 ASSERT0(error); 5913 error = dsl_destroy_snapshot(fullname, B_FALSE); 5914 if (error != ENOENT) 5915 ASSERT0(error); 5916 5917 /* 5918 * Create snapshot, clone it, mark snap for deferred destroy, 5919 * destroy clone, verify snap was also destroyed. 
5920 */ 5921 error = dmu_objset_snapshot_one(osname, snapname); 5922 if (error) { 5923 if (error == ENOSPC) { 5924 ztest_record_enospc("dmu_objset_snapshot"); 5925 goto out; 5926 } 5927 fatal(B_FALSE, "dmu_objset_snapshot(%s) = %d", fullname, error); 5928 } 5929 5930 error = dmu_objset_clone(clonename, fullname); 5931 if (error) { 5932 if (error == ENOSPC) { 5933 ztest_record_enospc("dmu_objset_clone"); 5934 goto out; 5935 } 5936 fatal(B_FALSE, "dmu_objset_clone(%s) = %d", clonename, error); 5937 } 5938 5939 error = dsl_destroy_snapshot(fullname, B_TRUE); 5940 if (error) { 5941 fatal(B_FALSE, "dsl_destroy_snapshot(%s, B_TRUE) = %d", 5942 fullname, error); 5943 } 5944 5945 error = dsl_destroy_head(clonename); 5946 if (error) 5947 fatal(B_FALSE, "dsl_destroy_head(%s) = %d", clonename, error); 5948 5949 error = dmu_objset_hold(fullname, FTAG, &origin); 5950 if (error != ENOENT) 5951 fatal(B_FALSE, "dmu_objset_hold(%s) = %d", fullname, error); 5952 5953 /* 5954 * Create snapshot, add temporary hold, verify that we can't 5955 * destroy a held snapshot, mark for deferred destroy, 5956 * release hold, verify snapshot was destroyed. 
5957 */ 5958 error = dmu_objset_snapshot_one(osname, snapname); 5959 if (error) { 5960 if (error == ENOSPC) { 5961 ztest_record_enospc("dmu_objset_snapshot"); 5962 goto out; 5963 } 5964 fatal(B_FALSE, "dmu_objset_snapshot(%s) = %d", fullname, error); 5965 } 5966 5967 holds = fnvlist_alloc(); 5968 fnvlist_add_string(holds, fullname, tag); 5969 error = dsl_dataset_user_hold(holds, 0, NULL); 5970 fnvlist_free(holds); 5971 5972 if (error == ENOSPC) { 5973 ztest_record_enospc("dsl_dataset_user_hold"); 5974 goto out; 5975 } else if (error) { 5976 fatal(B_FALSE, "dsl_dataset_user_hold(%s, %s) = %u", 5977 fullname, tag, error); 5978 } 5979 5980 error = dsl_destroy_snapshot(fullname, B_FALSE); 5981 if (error != EBUSY) { 5982 fatal(B_FALSE, "dsl_destroy_snapshot(%s, B_FALSE) = %d", 5983 fullname, error); 5984 } 5985 5986 error = dsl_destroy_snapshot(fullname, B_TRUE); 5987 if (error) { 5988 fatal(B_FALSE, "dsl_destroy_snapshot(%s, B_TRUE) = %d", 5989 fullname, error); 5990 } 5991 5992 error = user_release_one(fullname, tag); 5993 if (error) 5994 fatal(B_FALSE, "user_release_one(%s, %s) = %d", 5995 fullname, tag, error); 5996 5997 VERIFY3U(dmu_objset_hold(fullname, FTAG, &origin), ==, ENOENT); 5998 5999 out: 6000 (void) pthread_rwlock_unlock(&ztest_name_lock); 6001 } 6002 6003 /* 6004 * Inject random faults into the on-disk data. 
6005 */ 6006 void 6007 ztest_fault_inject(ztest_ds_t *zd, uint64_t id) 6008 { 6009 (void) zd, (void) id; 6010 ztest_shared_t *zs = ztest_shared; 6011 spa_t *spa = ztest_spa; 6012 int fd; 6013 uint64_t offset; 6014 uint64_t leaves; 6015 uint64_t bad = 0x1990c0ffeedecadeull; 6016 uint64_t top, leaf; 6017 char *path0; 6018 char *pathrand; 6019 size_t fsize; 6020 int bshift = SPA_MAXBLOCKSHIFT + 2; 6021 int iters = 1000; 6022 int maxfaults; 6023 int mirror_save; 6024 vdev_t *vd0 = NULL; 6025 uint64_t guid0 = 0; 6026 boolean_t islog = B_FALSE; 6027 6028 path0 = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); 6029 pathrand = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); 6030 6031 mutex_enter(&ztest_vdev_lock); 6032 6033 /* 6034 * Device removal is in progress, fault injection must be disabled 6035 * until it completes and the pool is scrubbed. The fault injection 6036 * strategy for damaging blocks does not take in to account evacuated 6037 * blocks which may have already been damaged. 6038 */ 6039 if (ztest_device_removal_active) { 6040 mutex_exit(&ztest_vdev_lock); 6041 goto out; 6042 } 6043 6044 maxfaults = MAXFAULTS(zs); 6045 leaves = MAX(zs->zs_mirrors, 1) * ztest_opts.zo_raid_children; 6046 mirror_save = zs->zs_mirrors; 6047 mutex_exit(&ztest_vdev_lock); 6048 6049 ASSERT3U(leaves, >=, 1); 6050 6051 /* 6052 * While ztest is running the number of leaves will not change. This 6053 * is critical for the fault injection logic as it determines where 6054 * errors can be safely injected such that they are always repairable. 6055 * 6056 * When restarting ztest a different number of leaves may be requested 6057 * which will shift the regions to be damaged. This is fine as long 6058 * as the pool has been scrubbed prior to using the new mapping. 6059 * Failure to do can result in non-repairable damage being injected. 6060 */ 6061 if (ztest_pool_scrubbed == B_FALSE) 6062 goto out; 6063 6064 /* 6065 * Grab the name lock as reader. 
There are some operations 6066 * which don't like to have their vdevs changed while 6067 * they are in progress (i.e. spa_change_guid). Those 6068 * operations will have grabbed the name lock as writer. 6069 */ 6070 (void) pthread_rwlock_rdlock(&ztest_name_lock); 6071 6072 /* 6073 * We need SCL_STATE here because we're going to look at vd0->vdev_tsd. 6074 */ 6075 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 6076 6077 if (ztest_random(2) == 0) { 6078 /* 6079 * Inject errors on a normal data device or slog device. 6080 */ 6081 top = ztest_random_vdev_top(spa, B_TRUE); 6082 leaf = ztest_random(leaves) + zs->zs_splits; 6083 6084 /* 6085 * Generate paths to the first leaf in this top-level vdev, 6086 * and to the random leaf we selected. We'll induce transient 6087 * write failures and random online/offline activity on leaf 0, 6088 * and we'll write random garbage to the randomly chosen leaf. 6089 */ 6090 (void) snprintf(path0, MAXPATHLEN, ztest_dev_template, 6091 ztest_opts.zo_dir, ztest_opts.zo_pool, 6092 top * leaves + zs->zs_splits); 6093 (void) snprintf(pathrand, MAXPATHLEN, ztest_dev_template, 6094 ztest_opts.zo_dir, ztest_opts.zo_pool, 6095 top * leaves + leaf); 6096 6097 vd0 = vdev_lookup_by_path(spa->spa_root_vdev, path0); 6098 if (vd0 != NULL && vd0->vdev_top->vdev_islog) 6099 islog = B_TRUE; 6100 6101 /* 6102 * If the top-level vdev needs to be resilvered 6103 * then we only allow faults on the device that is 6104 * resilvering. 6105 */ 6106 if (vd0 != NULL && maxfaults != 1 && 6107 (!vdev_resilver_needed(vd0->vdev_top, NULL, NULL) || 6108 vd0->vdev_resilver_txg != 0)) { 6109 /* 6110 * Make vd0 explicitly claim to be unreadable, 6111 * or unwritable, or reach behind its back 6112 * and close the underlying fd. We can do this if 6113 * maxfaults == 0 because we'll fail and reexecute, 6114 * and we can do it if maxfaults >= 2 because we'll 6115 * have enough redundancy. 
If maxfaults == 1, the 6116 * combination of this with injection of random data 6117 * corruption below exceeds the pool's fault tolerance. 6118 */ 6119 vdev_file_t *vf = vd0->vdev_tsd; 6120 6121 zfs_dbgmsg("injecting fault to vdev %llu; maxfaults=%d", 6122 (long long)vd0->vdev_id, (int)maxfaults); 6123 6124 if (vf != NULL && ztest_random(3) == 0) { 6125 (void) close(vf->vf_file->f_fd); 6126 vf->vf_file->f_fd = -1; 6127 } else if (ztest_random(2) == 0) { 6128 vd0->vdev_cant_read = B_TRUE; 6129 } else { 6130 vd0->vdev_cant_write = B_TRUE; 6131 } 6132 guid0 = vd0->vdev_guid; 6133 } 6134 } else { 6135 /* 6136 * Inject errors on an l2cache device. 6137 */ 6138 spa_aux_vdev_t *sav = &spa->spa_l2cache; 6139 6140 if (sav->sav_count == 0) { 6141 spa_config_exit(spa, SCL_STATE, FTAG); 6142 (void) pthread_rwlock_unlock(&ztest_name_lock); 6143 goto out; 6144 } 6145 vd0 = sav->sav_vdevs[ztest_random(sav->sav_count)]; 6146 guid0 = vd0->vdev_guid; 6147 (void) strcpy(path0, vd0->vdev_path); 6148 (void) strcpy(pathrand, vd0->vdev_path); 6149 6150 leaf = 0; 6151 leaves = 1; 6152 maxfaults = INT_MAX; /* no limit on cache devices */ 6153 } 6154 6155 spa_config_exit(spa, SCL_STATE, FTAG); 6156 (void) pthread_rwlock_unlock(&ztest_name_lock); 6157 6158 /* 6159 * If we can tolerate two or more faults, or we're dealing 6160 * with a slog, randomly online/offline vd0. 6161 */ 6162 if ((maxfaults >= 2 || islog) && guid0 != 0) { 6163 if (ztest_random(10) < 6) { 6164 int flags = (ztest_random(2) == 0 ? 6165 ZFS_OFFLINE_TEMPORARY : 0); 6166 6167 /* 6168 * We have to grab the zs_name_lock as writer to 6169 * prevent a race between offlining a slog and 6170 * destroying a dataset. Offlining the slog will 6171 * grab a reference on the dataset which may cause 6172 * dsl_destroy_head() to fail with EBUSY thus 6173 * leaving the dataset in an inconsistent state. 
6174 */ 6175 if (islog) 6176 (void) pthread_rwlock_wrlock(&ztest_name_lock); 6177 6178 VERIFY3U(vdev_offline(spa, guid0, flags), !=, EBUSY); 6179 6180 if (islog) 6181 (void) pthread_rwlock_unlock(&ztest_name_lock); 6182 } else { 6183 /* 6184 * Ideally we would like to be able to randomly 6185 * call vdev_[on|off]line without holding locks 6186 * to force unpredictable failures but the side 6187 * effects of vdev_[on|off]line prevent us from 6188 * doing so. We grab the ztest_vdev_lock here to 6189 * prevent a race between injection testing and 6190 * aux_vdev removal. 6191 */ 6192 mutex_enter(&ztest_vdev_lock); 6193 (void) vdev_online(spa, guid0, 0, NULL); 6194 mutex_exit(&ztest_vdev_lock); 6195 } 6196 } 6197 6198 if (maxfaults == 0) 6199 goto out; 6200 6201 /* 6202 * We have at least single-fault tolerance, so inject data corruption. 6203 */ 6204 fd = open(pathrand, O_RDWR); 6205 6206 if (fd == -1) /* we hit a gap in the device namespace */ 6207 goto out; 6208 6209 fsize = lseek(fd, 0, SEEK_END); 6210 6211 while (--iters != 0) { 6212 /* 6213 * The offset must be chosen carefully to ensure that 6214 * we do not inject a given logical block with errors 6215 * on two different leaf devices, because ZFS can not 6216 * tolerate that (if maxfaults==1). 6217 * 6218 * To achieve this we divide each leaf device into 6219 * chunks of size (# leaves * SPA_MAXBLOCKSIZE * 4). 6220 * Each chunk is further divided into error-injection 6221 * ranges (can accept errors) and clear ranges (we do 6222 * not inject errors in those). Each error-injection 6223 * range can accept errors only for a single leaf vdev. 6224 * Error-injection ranges are separated by clear ranges. 
6225 * 6226 * For example, with 3 leaves, each chunk looks like: 6227 * 0 to 32M: injection range for leaf 0 6228 * 32M to 64M: clear range - no injection allowed 6229 * 64M to 96M: injection range for leaf 1 6230 * 96M to 128M: clear range - no injection allowed 6231 * 128M to 160M: injection range for leaf 2 6232 * 160M to 192M: clear range - no injection allowed 6233 * 6234 * Each clear range must be large enough such that a 6235 * single block cannot straddle it. This way a block 6236 * can't be a target in two different injection ranges 6237 * (on different leaf vdevs). 6238 */ 6239 offset = ztest_random(fsize / (leaves << bshift)) * 6240 (leaves << bshift) + (leaf << bshift) + 6241 (ztest_random(1ULL << (bshift - 1)) & -8ULL); 6242 6243 /* 6244 * Only allow damage to the labels at one end of the vdev. 6245 * 6246 * If all labels are damaged, the device will be totally 6247 * inaccessible, which will result in loss of data, 6248 * because we also damage (parts of) the other side of 6249 * the mirror/raidz. 6250 * 6251 * Additionally, we will always have both an even and an 6252 * odd label, so that we can handle crashes in the 6253 * middle of vdev_config_sync(). 6254 */ 6255 if ((leaf & 1) == 0 && offset < VDEV_LABEL_START_SIZE) 6256 continue; 6257 6258 /* 6259 * The two end labels are stored at the "end" of the disk, but 6260 * the end of the disk (vdev_psize) is aligned to 6261 * sizeof (vdev_label_t). 
6262 */ 6263 uint64_t psize = P2ALIGN(fsize, sizeof (vdev_label_t)); 6264 if ((leaf & 1) == 1 && 6265 offset + sizeof (bad) > psize - VDEV_LABEL_END_SIZE) 6266 continue; 6267 6268 mutex_enter(&ztest_vdev_lock); 6269 if (mirror_save != zs->zs_mirrors) { 6270 mutex_exit(&ztest_vdev_lock); 6271 (void) close(fd); 6272 goto out; 6273 } 6274 6275 if (pwrite(fd, &bad, sizeof (bad), offset) != sizeof (bad)) 6276 fatal(B_TRUE, 6277 "can't inject bad word at 0x%"PRIx64" in %s", 6278 offset, pathrand); 6279 6280 mutex_exit(&ztest_vdev_lock); 6281 6282 if (ztest_opts.zo_verbose >= 7) 6283 (void) printf("injected bad word into %s," 6284 " offset 0x%"PRIx64"\n", pathrand, offset); 6285 } 6286 6287 (void) close(fd); 6288 out: 6289 umem_free(path0, MAXPATHLEN); 6290 umem_free(pathrand, MAXPATHLEN); 6291 } 6292 6293 /* 6294 * By design ztest will never inject uncorrectable damage in to the pool. 6295 * Issue a scrub, wait for it to complete, and verify there is never any 6296 * persistent damage. 6297 * 6298 * Only after a full scrub has been completed is it safe to start injecting 6299 * data corruption. See the comment in zfs_fault_inject(). 6300 */ 6301 static int 6302 ztest_scrub_impl(spa_t *spa) 6303 { 6304 int error = spa_scan(spa, POOL_SCAN_SCRUB); 6305 if (error) 6306 return (error); 6307 6308 while (dsl_scan_scrubbing(spa_get_dsl(spa))) 6309 txg_wait_synced(spa_get_dsl(spa), 0); 6310 6311 if (spa_get_errlog_size(spa) > 0) 6312 return (ECKSUM); 6313 6314 ztest_pool_scrubbed = B_TRUE; 6315 6316 return (0); 6317 } 6318 6319 /* 6320 * Scrub the pool. 6321 */ 6322 void 6323 ztest_scrub(ztest_ds_t *zd, uint64_t id) 6324 { 6325 (void) zd, (void) id; 6326 spa_t *spa = ztest_spa; 6327 int error; 6328 6329 /* 6330 * Scrub in progress by device removal. 6331 */ 6332 if (ztest_device_removal_active) 6333 return; 6334 6335 /* 6336 * Start a scrub, wait a moment, then force a restart. 
6337 */ 6338 (void) spa_scan(spa, POOL_SCAN_SCRUB); 6339 (void) poll(NULL, 0, 100); 6340 6341 error = ztest_scrub_impl(spa); 6342 if (error == EBUSY) 6343 error = 0; 6344 ASSERT0(error); 6345 } 6346 6347 /* 6348 * Change the guid for the pool. 6349 */ 6350 void 6351 ztest_reguid(ztest_ds_t *zd, uint64_t id) 6352 { 6353 (void) zd, (void) id; 6354 spa_t *spa = ztest_spa; 6355 uint64_t orig, load; 6356 int error; 6357 6358 if (ztest_opts.zo_mmp_test) 6359 return; 6360 6361 orig = spa_guid(spa); 6362 load = spa_load_guid(spa); 6363 6364 (void) pthread_rwlock_wrlock(&ztest_name_lock); 6365 error = spa_change_guid(spa); 6366 (void) pthread_rwlock_unlock(&ztest_name_lock); 6367 6368 if (error != 0) 6369 return; 6370 6371 if (ztest_opts.zo_verbose >= 4) { 6372 (void) printf("Changed guid old %"PRIu64" -> %"PRIu64"\n", 6373 orig, spa_guid(spa)); 6374 } 6375 6376 VERIFY3U(orig, !=, spa_guid(spa)); 6377 VERIFY3U(load, ==, spa_load_guid(spa)); 6378 } 6379 6380 void 6381 ztest_blake3(ztest_ds_t *zd, uint64_t id) 6382 { 6383 (void) zd, (void) id; 6384 hrtime_t end = gethrtime() + NANOSEC; 6385 zio_cksum_salt_t salt; 6386 void *salt_ptr = &salt.zcs_bytes; 6387 struct abd *abd_data, *abd_meta; 6388 void *buf, *templ; 6389 int i, *ptr; 6390 uint32_t size; 6391 BLAKE3_CTX ctx; 6392 6393 size = ztest_random_blocksize(); 6394 buf = umem_alloc(size, UMEM_NOFAIL); 6395 abd_data = abd_alloc(size, B_FALSE); 6396 abd_meta = abd_alloc(size, B_TRUE); 6397 6398 for (i = 0, ptr = buf; i < size / sizeof (*ptr); i++, ptr++) 6399 *ptr = ztest_random(UINT_MAX); 6400 memset(salt_ptr, 'A', 32); 6401 6402 abd_copy_from_buf_off(abd_data, buf, 0, size); 6403 abd_copy_from_buf_off(abd_meta, buf, 0, size); 6404 6405 while (gethrtime() <= end) { 6406 int run_count = 100; 6407 zio_cksum_t zc_ref1, zc_ref2; 6408 zio_cksum_t zc_res1, zc_res2; 6409 6410 void *ref1 = &zc_ref1; 6411 void *ref2 = &zc_ref2; 6412 void *res1 = &zc_res1; 6413 void *res2 = &zc_res2; 6414 6415 /* BLAKE3_KEY_LEN = 32 */ 6416 
VERIFY0(blake3_set_impl_name("generic")); 6417 templ = abd_checksum_blake3_tmpl_init(&salt); 6418 Blake3_InitKeyed(&ctx, salt_ptr); 6419 Blake3_Update(&ctx, buf, size); 6420 Blake3_Final(&ctx, ref1); 6421 zc_ref2 = zc_ref1; 6422 ZIO_CHECKSUM_BSWAP(&zc_ref2); 6423 abd_checksum_blake3_tmpl_free(templ); 6424 6425 VERIFY0(blake3_set_impl_name("cycle")); 6426 while (run_count-- > 0) { 6427 6428 /* Test current implementation */ 6429 Blake3_InitKeyed(&ctx, salt_ptr); 6430 Blake3_Update(&ctx, buf, size); 6431 Blake3_Final(&ctx, res1); 6432 zc_res2 = zc_res1; 6433 ZIO_CHECKSUM_BSWAP(&zc_res2); 6434 6435 VERIFY0(memcmp(ref1, res1, 32)); 6436 VERIFY0(memcmp(ref2, res2, 32)); 6437 6438 /* Test ABD - data */ 6439 templ = abd_checksum_blake3_tmpl_init(&salt); 6440 abd_checksum_blake3_native(abd_data, size, 6441 templ, &zc_res1); 6442 abd_checksum_blake3_byteswap(abd_data, size, 6443 templ, &zc_res2); 6444 6445 VERIFY0(memcmp(ref1, res1, 32)); 6446 VERIFY0(memcmp(ref2, res2, 32)); 6447 6448 /* Test ABD - metadata */ 6449 abd_checksum_blake3_native(abd_meta, size, 6450 templ, &zc_res1); 6451 abd_checksum_blake3_byteswap(abd_meta, size, 6452 templ, &zc_res2); 6453 abd_checksum_blake3_tmpl_free(templ); 6454 6455 VERIFY0(memcmp(ref1, res1, 32)); 6456 VERIFY0(memcmp(ref2, res2, 32)); 6457 6458 } 6459 } 6460 6461 abd_free(abd_data); 6462 abd_free(abd_meta); 6463 umem_free(buf, size); 6464 } 6465 6466 void 6467 ztest_fletcher(ztest_ds_t *zd, uint64_t id) 6468 { 6469 (void) zd, (void) id; 6470 hrtime_t end = gethrtime() + NANOSEC; 6471 6472 while (gethrtime() <= end) { 6473 int run_count = 100; 6474 void *buf; 6475 struct abd *abd_data, *abd_meta; 6476 uint32_t size; 6477 int *ptr; 6478 int i; 6479 zio_cksum_t zc_ref; 6480 zio_cksum_t zc_ref_byteswap; 6481 6482 size = ztest_random_blocksize(); 6483 6484 buf = umem_alloc(size, UMEM_NOFAIL); 6485 abd_data = abd_alloc(size, B_FALSE); 6486 abd_meta = abd_alloc(size, B_TRUE); 6487 6488 for (i = 0, ptr = buf; i < size / sizeof (*ptr); i++, 
ptr++) 6489 *ptr = ztest_random(UINT_MAX); 6490 6491 abd_copy_from_buf_off(abd_data, buf, 0, size); 6492 abd_copy_from_buf_off(abd_meta, buf, 0, size); 6493 6494 VERIFY0(fletcher_4_impl_set("scalar")); 6495 fletcher_4_native(buf, size, NULL, &zc_ref); 6496 fletcher_4_byteswap(buf, size, NULL, &zc_ref_byteswap); 6497 6498 VERIFY0(fletcher_4_impl_set("cycle")); 6499 while (run_count-- > 0) { 6500 zio_cksum_t zc; 6501 zio_cksum_t zc_byteswap; 6502 6503 fletcher_4_byteswap(buf, size, NULL, &zc_byteswap); 6504 fletcher_4_native(buf, size, NULL, &zc); 6505 6506 VERIFY0(memcmp(&zc, &zc_ref, sizeof (zc))); 6507 VERIFY0(memcmp(&zc_byteswap, &zc_ref_byteswap, 6508 sizeof (zc_byteswap))); 6509 6510 /* Test ABD - data */ 6511 abd_fletcher_4_byteswap(abd_data, size, NULL, 6512 &zc_byteswap); 6513 abd_fletcher_4_native(abd_data, size, NULL, &zc); 6514 6515 VERIFY0(memcmp(&zc, &zc_ref, sizeof (zc))); 6516 VERIFY0(memcmp(&zc_byteswap, &zc_ref_byteswap, 6517 sizeof (zc_byteswap))); 6518 6519 /* Test ABD - metadata */ 6520 abd_fletcher_4_byteswap(abd_meta, size, NULL, 6521 &zc_byteswap); 6522 abd_fletcher_4_native(abd_meta, size, NULL, &zc); 6523 6524 VERIFY0(memcmp(&zc, &zc_ref, sizeof (zc))); 6525 VERIFY0(memcmp(&zc_byteswap, &zc_ref_byteswap, 6526 sizeof (zc_byteswap))); 6527 6528 } 6529 6530 umem_free(buf, size); 6531 abd_free(abd_data); 6532 abd_free(abd_meta); 6533 } 6534 } 6535 6536 void 6537 ztest_fletcher_incr(ztest_ds_t *zd, uint64_t id) 6538 { 6539 (void) zd, (void) id; 6540 void *buf; 6541 size_t size; 6542 int *ptr; 6543 int i; 6544 zio_cksum_t zc_ref; 6545 zio_cksum_t zc_ref_bswap; 6546 6547 hrtime_t end = gethrtime() + NANOSEC; 6548 6549 while (gethrtime() <= end) { 6550 int run_count = 100; 6551 6552 size = ztest_random_blocksize(); 6553 buf = umem_alloc(size, UMEM_NOFAIL); 6554 6555 for (i = 0, ptr = buf; i < size / sizeof (*ptr); i++, ptr++) 6556 *ptr = ztest_random(UINT_MAX); 6557 6558 VERIFY0(fletcher_4_impl_set("scalar")); 6559 fletcher_4_native(buf, size, NULL, 
&zc_ref); 6560 fletcher_4_byteswap(buf, size, NULL, &zc_ref_bswap); 6561 6562 VERIFY0(fletcher_4_impl_set("cycle")); 6563 6564 while (run_count-- > 0) { 6565 zio_cksum_t zc; 6566 zio_cksum_t zc_bswap; 6567 size_t pos = 0; 6568 6569 ZIO_SET_CHECKSUM(&zc, 0, 0, 0, 0); 6570 ZIO_SET_CHECKSUM(&zc_bswap, 0, 0, 0, 0); 6571 6572 while (pos < size) { 6573 size_t inc = 64 * ztest_random(size / 67); 6574 /* sometimes add few bytes to test non-simd */ 6575 if (ztest_random(100) < 10) 6576 inc += P2ALIGN(ztest_random(64), 6577 sizeof (uint32_t)); 6578 6579 if (inc > (size - pos)) 6580 inc = size - pos; 6581 6582 fletcher_4_incremental_native(buf + pos, inc, 6583 &zc); 6584 fletcher_4_incremental_byteswap(buf + pos, inc, 6585 &zc_bswap); 6586 6587 pos += inc; 6588 } 6589 6590 VERIFY3U(pos, ==, size); 6591 6592 VERIFY(ZIO_CHECKSUM_EQUAL(zc, zc_ref)); 6593 VERIFY(ZIO_CHECKSUM_EQUAL(zc_bswap, zc_ref_bswap)); 6594 6595 /* 6596 * verify if incremental on the whole buffer is 6597 * equivalent to non-incremental version 6598 */ 6599 ZIO_SET_CHECKSUM(&zc, 0, 0, 0, 0); 6600 ZIO_SET_CHECKSUM(&zc_bswap, 0, 0, 0, 0); 6601 6602 fletcher_4_incremental_native(buf, size, &zc); 6603 fletcher_4_incremental_byteswap(buf, size, &zc_bswap); 6604 6605 VERIFY(ZIO_CHECKSUM_EQUAL(zc, zc_ref)); 6606 VERIFY(ZIO_CHECKSUM_EQUAL(zc_bswap, zc_ref_bswap)); 6607 } 6608 6609 umem_free(buf, size); 6610 } 6611 } 6612 6613 static int 6614 ztest_set_global_vars(void) 6615 { 6616 for (size_t i = 0; i < ztest_opts.zo_gvars_count; i++) { 6617 char *kv = ztest_opts.zo_gvars[i]; 6618 VERIFY3U(strlen(kv), <=, ZO_GVARS_MAX_ARGLEN); 6619 VERIFY3U(strlen(kv), >, 0); 6620 int err = set_global_var(kv); 6621 if (ztest_opts.zo_verbose > 0) { 6622 (void) printf("setting global var %s ... %s\n", kv, 6623 err ? 
"failed" : "ok"); 6624 } 6625 if (err != 0) { 6626 (void) fprintf(stderr, 6627 "failed to set global var '%s'\n", kv); 6628 return (err); 6629 } 6630 } 6631 return (0); 6632 } 6633 6634 static char ** 6635 ztest_global_vars_to_zdb_args(void) 6636 { 6637 char **args = calloc(2*ztest_opts.zo_gvars_count + 1, sizeof (char *)); 6638 char **cur = args; 6639 for (size_t i = 0; i < ztest_opts.zo_gvars_count; i++) { 6640 *cur++ = (char *)"-o"; 6641 *cur++ = ztest_opts.zo_gvars[i]; 6642 } 6643 ASSERT3P(cur, ==, &args[2*ztest_opts.zo_gvars_count]); 6644 *cur = NULL; 6645 return (args); 6646 } 6647 6648 /* The end of strings is indicated by a NULL element */ 6649 static char * 6650 join_strings(char **strings, const char *sep) 6651 { 6652 size_t totallen = 0; 6653 for (char **sp = strings; *sp != NULL; sp++) { 6654 totallen += strlen(*sp); 6655 totallen += strlen(sep); 6656 } 6657 if (totallen > 0) { 6658 ASSERT(totallen >= strlen(sep)); 6659 totallen -= strlen(sep); 6660 } 6661 6662 size_t buflen = totallen + 1; 6663 char *o = malloc(buflen); /* trailing 0 byte */ 6664 o[0] = '\0'; 6665 for (char **sp = strings; *sp != NULL; sp++) { 6666 size_t would; 6667 would = strlcat(o, *sp, buflen); 6668 VERIFY3U(would, <, buflen); 6669 if (*(sp+1) == NULL) { 6670 break; 6671 } 6672 would = strlcat(o, sep, buflen); 6673 VERIFY3U(would, <, buflen); 6674 } 6675 ASSERT3S(strlen(o), ==, totallen); 6676 return (o); 6677 } 6678 6679 static int 6680 ztest_check_path(char *path) 6681 { 6682 struct stat s; 6683 /* return true on success */ 6684 return (!stat(path, &s)); 6685 } 6686 6687 static void 6688 ztest_get_zdb_bin(char *bin, int len) 6689 { 6690 char *zdb_path; 6691 /* 6692 * Try to use $ZDB and in-tree zdb path. If not successful, just 6693 * let popen to search through PATH. 
6694 */ 6695 if ((zdb_path = getenv("ZDB"))) { 6696 strlcpy(bin, zdb_path, len); /* In env */ 6697 if (!ztest_check_path(bin)) { 6698 ztest_dump_core = 0; 6699 fatal(B_TRUE, "invalid ZDB '%s'", bin); 6700 } 6701 return; 6702 } 6703 6704 VERIFY3P(realpath(getexecname(), bin), !=, NULL); 6705 if (strstr(bin, ".libs/ztest")) { 6706 strstr(bin, ".libs/ztest")[0] = '\0'; /* In-tree */ 6707 strcat(bin, "zdb"); 6708 if (ztest_check_path(bin)) 6709 return; 6710 } 6711 strcpy(bin, "zdb"); 6712 } 6713 6714 static vdev_t * 6715 ztest_random_concrete_vdev_leaf(vdev_t *vd) 6716 { 6717 if (vd == NULL) 6718 return (NULL); 6719 6720 if (vd->vdev_children == 0) 6721 return (vd); 6722 6723 vdev_t *eligible[vd->vdev_children]; 6724 int eligible_idx = 0, i; 6725 for (i = 0; i < vd->vdev_children; i++) { 6726 vdev_t *cvd = vd->vdev_child[i]; 6727 if (cvd->vdev_top->vdev_removing) 6728 continue; 6729 if (cvd->vdev_children > 0 || 6730 (vdev_is_concrete(cvd) && !cvd->vdev_detached)) { 6731 eligible[eligible_idx++] = cvd; 6732 } 6733 } 6734 VERIFY3S(eligible_idx, >, 0); 6735 6736 uint64_t child_no = ztest_random(eligible_idx); 6737 return (ztest_random_concrete_vdev_leaf(eligible[child_no])); 6738 } 6739 6740 void 6741 ztest_initialize(ztest_ds_t *zd, uint64_t id) 6742 { 6743 (void) zd, (void) id; 6744 spa_t *spa = ztest_spa; 6745 int error = 0; 6746 6747 mutex_enter(&ztest_vdev_lock); 6748 6749 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 6750 6751 /* Random leaf vdev */ 6752 vdev_t *rand_vd = ztest_random_concrete_vdev_leaf(spa->spa_root_vdev); 6753 if (rand_vd == NULL) { 6754 spa_config_exit(spa, SCL_VDEV, FTAG); 6755 mutex_exit(&ztest_vdev_lock); 6756 return; 6757 } 6758 6759 /* 6760 * The random vdev we've selected may change as soon as we 6761 * drop the spa_config_lock. We create local copies of things 6762 * we're interested in. 
6763 */ 6764 uint64_t guid = rand_vd->vdev_guid; 6765 char *path = strdup(rand_vd->vdev_path); 6766 boolean_t active = rand_vd->vdev_initialize_thread != NULL; 6767 6768 zfs_dbgmsg("vd %px, guid %llu", rand_vd, (u_longlong_t)guid); 6769 spa_config_exit(spa, SCL_VDEV, FTAG); 6770 6771 uint64_t cmd = ztest_random(POOL_INITIALIZE_FUNCS); 6772 6773 nvlist_t *vdev_guids = fnvlist_alloc(); 6774 nvlist_t *vdev_errlist = fnvlist_alloc(); 6775 fnvlist_add_uint64(vdev_guids, path, guid); 6776 error = spa_vdev_initialize(spa, vdev_guids, cmd, vdev_errlist); 6777 fnvlist_free(vdev_guids); 6778 fnvlist_free(vdev_errlist); 6779 6780 switch (cmd) { 6781 case POOL_INITIALIZE_CANCEL: 6782 if (ztest_opts.zo_verbose >= 4) { 6783 (void) printf("Cancel initialize %s", path); 6784 if (!active) 6785 (void) printf(" failed (no initialize active)"); 6786 (void) printf("\n"); 6787 } 6788 break; 6789 case POOL_INITIALIZE_START: 6790 if (ztest_opts.zo_verbose >= 4) { 6791 (void) printf("Start initialize %s", path); 6792 if (active && error == 0) 6793 (void) printf(" failed (already active)"); 6794 else if (error != 0) 6795 (void) printf(" failed (error %d)", error); 6796 (void) printf("\n"); 6797 } 6798 break; 6799 case POOL_INITIALIZE_SUSPEND: 6800 if (ztest_opts.zo_verbose >= 4) { 6801 (void) printf("Suspend initialize %s", path); 6802 if (!active) 6803 (void) printf(" failed (no initialize active)"); 6804 (void) printf("\n"); 6805 } 6806 break; 6807 } 6808 free(path); 6809 mutex_exit(&ztest_vdev_lock); 6810 } 6811 6812 void 6813 ztest_trim(ztest_ds_t *zd, uint64_t id) 6814 { 6815 (void) zd, (void) id; 6816 spa_t *spa = ztest_spa; 6817 int error = 0; 6818 6819 mutex_enter(&ztest_vdev_lock); 6820 6821 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 6822 6823 /* Random leaf vdev */ 6824 vdev_t *rand_vd = ztest_random_concrete_vdev_leaf(spa->spa_root_vdev); 6825 if (rand_vd == NULL) { 6826 spa_config_exit(spa, SCL_VDEV, FTAG); 6827 mutex_exit(&ztest_vdev_lock); 6828 return; 6829 } 6830 6831 
/* 6832 * The random vdev we've selected may change as soon as we 6833 * drop the spa_config_lock. We create local copies of things 6834 * we're interested in. 6835 */ 6836 uint64_t guid = rand_vd->vdev_guid; 6837 char *path = strdup(rand_vd->vdev_path); 6838 boolean_t active = rand_vd->vdev_trim_thread != NULL; 6839 6840 zfs_dbgmsg("vd %p, guid %llu", rand_vd, (u_longlong_t)guid); 6841 spa_config_exit(spa, SCL_VDEV, FTAG); 6842 6843 uint64_t cmd = ztest_random(POOL_TRIM_FUNCS); 6844 uint64_t rate = 1 << ztest_random(30); 6845 boolean_t partial = (ztest_random(5) > 0); 6846 boolean_t secure = (ztest_random(5) > 0); 6847 6848 nvlist_t *vdev_guids = fnvlist_alloc(); 6849 nvlist_t *vdev_errlist = fnvlist_alloc(); 6850 fnvlist_add_uint64(vdev_guids, path, guid); 6851 error = spa_vdev_trim(spa, vdev_guids, cmd, rate, partial, 6852 secure, vdev_errlist); 6853 fnvlist_free(vdev_guids); 6854 fnvlist_free(vdev_errlist); 6855 6856 switch (cmd) { 6857 case POOL_TRIM_CANCEL: 6858 if (ztest_opts.zo_verbose >= 4) { 6859 (void) printf("Cancel TRIM %s", path); 6860 if (!active) 6861 (void) printf(" failed (no TRIM active)"); 6862 (void) printf("\n"); 6863 } 6864 break; 6865 case POOL_TRIM_START: 6866 if (ztest_opts.zo_verbose >= 4) { 6867 (void) printf("Start TRIM %s", path); 6868 if (active && error == 0) 6869 (void) printf(" failed (already active)"); 6870 else if (error != 0) 6871 (void) printf(" failed (error %d)", error); 6872 (void) printf("\n"); 6873 } 6874 break; 6875 case POOL_TRIM_SUSPEND: 6876 if (ztest_opts.zo_verbose >= 4) { 6877 (void) printf("Suspend TRIM %s", path); 6878 if (!active) 6879 (void) printf(" failed (no TRIM active)"); 6880 (void) printf("\n"); 6881 } 6882 break; 6883 } 6884 free(path); 6885 mutex_exit(&ztest_vdev_lock); 6886 } 6887 6888 /* 6889 * Verify pool integrity by running zdb. 
6890 */ 6891 static void 6892 ztest_run_zdb(const char *pool) 6893 { 6894 int status; 6895 char *bin; 6896 char *zdb; 6897 char *zbuf; 6898 const int len = MAXPATHLEN + MAXNAMELEN + 20; 6899 FILE *fp; 6900 6901 bin = umem_alloc(len, UMEM_NOFAIL); 6902 zdb = umem_alloc(len, UMEM_NOFAIL); 6903 zbuf = umem_alloc(1024, UMEM_NOFAIL); 6904 6905 ztest_get_zdb_bin(bin, len); 6906 6907 char **set_gvars_args = ztest_global_vars_to_zdb_args(); 6908 char *set_gvars_args_joined = join_strings(set_gvars_args, " "); 6909 free(set_gvars_args); 6910 6911 size_t would = snprintf(zdb, len, 6912 "%s -bcc%s%s -G -d -Y -e -y %s -p %s %s", 6913 bin, 6914 ztest_opts.zo_verbose >= 3 ? "s" : "", 6915 ztest_opts.zo_verbose >= 4 ? "v" : "", 6916 set_gvars_args_joined, 6917 ztest_opts.zo_dir, 6918 pool); 6919 ASSERT3U(would, <, len); 6920 6921 free(set_gvars_args_joined); 6922 6923 if (ztest_opts.zo_verbose >= 5) 6924 (void) printf("Executing %s\n", zdb); 6925 6926 fp = popen(zdb, "r"); 6927 6928 while (fgets(zbuf, 1024, fp) != NULL) 6929 if (ztest_opts.zo_verbose >= 3) 6930 (void) printf("%s", zbuf); 6931 6932 status = pclose(fp); 6933 6934 if (status == 0) 6935 goto out; 6936 6937 ztest_dump_core = 0; 6938 if (WIFEXITED(status)) 6939 fatal(B_FALSE, "'%s' exit code %d", zdb, WEXITSTATUS(status)); 6940 else 6941 fatal(B_FALSE, "'%s' died with signal %d", 6942 zdb, WTERMSIG(status)); 6943 out: 6944 umem_free(bin, len); 6945 umem_free(zdb, len); 6946 umem_free(zbuf, 1024); 6947 } 6948 6949 static void 6950 ztest_walk_pool_directory(const char *header) 6951 { 6952 spa_t *spa = NULL; 6953 6954 if (ztest_opts.zo_verbose >= 6) 6955 (void) puts(header); 6956 6957 mutex_enter(&spa_namespace_lock); 6958 while ((spa = spa_next(spa)) != NULL) 6959 if (ztest_opts.zo_verbose >= 6) 6960 (void) printf("\t%s\n", spa_name(spa)); 6961 mutex_exit(&spa_namespace_lock); 6962 } 6963 6964 static void 6965 ztest_spa_import_export(char *oldname, char *newname) 6966 { 6967 nvlist_t *config, *newconfig; 6968 uint64_t 
pool_guid; 6969 spa_t *spa; 6970 int error; 6971 6972 if (ztest_opts.zo_verbose >= 4) { 6973 (void) printf("import/export: old = %s, new = %s\n", 6974 oldname, newname); 6975 } 6976 6977 /* 6978 * Clean up from previous runs. 6979 */ 6980 (void) spa_destroy(newname); 6981 6982 /* 6983 * Get the pool's configuration and guid. 6984 */ 6985 VERIFY0(spa_open(oldname, &spa, FTAG)); 6986 6987 /* 6988 * Kick off a scrub to tickle scrub/export races. 6989 */ 6990 if (ztest_random(2) == 0) 6991 (void) spa_scan(spa, POOL_SCAN_SCRUB); 6992 6993 pool_guid = spa_guid(spa); 6994 spa_close(spa, FTAG); 6995 6996 ztest_walk_pool_directory("pools before export"); 6997 6998 /* 6999 * Export it. 7000 */ 7001 VERIFY0(spa_export(oldname, &config, B_FALSE, B_FALSE)); 7002 7003 ztest_walk_pool_directory("pools after export"); 7004 7005 /* 7006 * Try to import it. 7007 */ 7008 newconfig = spa_tryimport(config); 7009 ASSERT3P(newconfig, !=, NULL); 7010 fnvlist_free(newconfig); 7011 7012 /* 7013 * Import it under the new name. 7014 */ 7015 error = spa_import(newname, config, NULL, 0); 7016 if (error != 0) { 7017 dump_nvlist(config, 0); 7018 fatal(B_FALSE, "couldn't import pool %s as %s: error %u", 7019 oldname, newname, error); 7020 } 7021 7022 ztest_walk_pool_directory("pools after import"); 7023 7024 /* 7025 * Try to import it again -- should fail with EEXIST. 7026 */ 7027 VERIFY3U(EEXIST, ==, spa_import(newname, config, NULL, 0)); 7028 7029 /* 7030 * Try to import it under a different name -- should fail with EEXIST. 7031 */ 7032 VERIFY3U(EEXIST, ==, spa_import(oldname, config, NULL, 0)); 7033 7034 /* 7035 * Verify that the pool is no longer visible under the old name. 7036 */ 7037 VERIFY3U(ENOENT, ==, spa_open(oldname, &spa, FTAG)); 7038 7039 /* 7040 * Verify that we can open and close the pool using the new name. 
7041 */ 7042 VERIFY0(spa_open(newname, &spa, FTAG)); 7043 ASSERT3U(pool_guid, ==, spa_guid(spa)); 7044 spa_close(spa, FTAG); 7045 7046 fnvlist_free(config); 7047 } 7048 7049 static void 7050 ztest_resume(spa_t *spa) 7051 { 7052 if (spa_suspended(spa) && ztest_opts.zo_verbose >= 6) 7053 (void) printf("resuming from suspended state\n"); 7054 spa_vdev_state_enter(spa, SCL_NONE); 7055 vdev_clear(spa, NULL); 7056 (void) spa_vdev_state_exit(spa, NULL, 0); 7057 (void) zio_resume(spa); 7058 } 7059 7060 static __attribute__((noreturn)) void 7061 ztest_resume_thread(void *arg) 7062 { 7063 spa_t *spa = arg; 7064 7065 while (!ztest_exiting) { 7066 if (spa_suspended(spa)) 7067 ztest_resume(spa); 7068 (void) poll(NULL, 0, 100); 7069 7070 /* 7071 * Periodically change the zfs_compressed_arc_enabled setting. 7072 */ 7073 if (ztest_random(10) == 0) 7074 zfs_compressed_arc_enabled = ztest_random(2); 7075 7076 /* 7077 * Periodically change the zfs_abd_scatter_enabled setting. 7078 */ 7079 if (ztest_random(10) == 0) 7080 zfs_abd_scatter_enabled = ztest_random(2); 7081 } 7082 7083 thread_exit(); 7084 } 7085 7086 static __attribute__((noreturn)) void 7087 ztest_deadman_thread(void *arg) 7088 { 7089 ztest_shared_t *zs = arg; 7090 spa_t *spa = ztest_spa; 7091 hrtime_t delay, overdue, last_run = gethrtime(); 7092 7093 delay = (zs->zs_thread_stop - zs->zs_thread_start) + 7094 MSEC2NSEC(zfs_deadman_synctime_ms); 7095 7096 while (!ztest_exiting) { 7097 /* 7098 * Wait for the delay timer while checking occasionally 7099 * if we should stop. 7100 */ 7101 if (gethrtime() < last_run + delay) { 7102 (void) poll(NULL, 0, 1000); 7103 continue; 7104 } 7105 7106 /* 7107 * If the pool is suspended then fail immediately. Otherwise, 7108 * check to see if the pool is making any progress. If 7109 * vdev_deadman() discovers that there hasn't been any recent 7110 * I/Os then it will end up aborting the tests. 
7111 */ 7112 if (spa_suspended(spa) || spa->spa_root_vdev == NULL) { 7113 fatal(B_FALSE, 7114 "aborting test after %lu seconds because " 7115 "pool has transitioned to a suspended state.", 7116 zfs_deadman_synctime_ms / 1000); 7117 } 7118 vdev_deadman(spa->spa_root_vdev, FTAG); 7119 7120 /* 7121 * If the process doesn't complete within a grace period of 7122 * zfs_deadman_synctime_ms over the expected finish time, 7123 * then it may be hung and is terminated. 7124 */ 7125 overdue = zs->zs_proc_stop + MSEC2NSEC(zfs_deadman_synctime_ms); 7126 if (gethrtime() > overdue) { 7127 fatal(B_FALSE, 7128 "aborting test after %llu seconds because " 7129 "the process is overdue for termination.", 7130 (gethrtime() - zs->zs_proc_start) / NANOSEC); 7131 } 7132 7133 (void) printf("ztest has been running for %lld seconds\n", 7134 (gethrtime() - zs->zs_proc_start) / NANOSEC); 7135 7136 last_run = gethrtime(); 7137 delay = MSEC2NSEC(zfs_deadman_checktime_ms); 7138 } 7139 7140 thread_exit(); 7141 } 7142 7143 static void 7144 ztest_execute(int test, ztest_info_t *zi, uint64_t id) 7145 { 7146 ztest_ds_t *zd = &ztest_ds[id % ztest_opts.zo_datasets]; 7147 ztest_shared_callstate_t *zc = ZTEST_GET_SHARED_CALLSTATE(test); 7148 hrtime_t functime = gethrtime(); 7149 int i; 7150 7151 for (i = 0; i < zi->zi_iters; i++) 7152 zi->zi_func(zd, id); 7153 7154 functime = gethrtime() - functime; 7155 7156 atomic_add_64(&zc->zc_count, 1); 7157 atomic_add_64(&zc->zc_time, functime); 7158 7159 if (ztest_opts.zo_verbose >= 4) 7160 (void) printf("%6.2f sec in %s\n", 7161 (double)functime / NANOSEC, zi->zi_funcname); 7162 } 7163 7164 static __attribute__((noreturn)) void 7165 ztest_thread(void *arg) 7166 { 7167 int rand; 7168 uint64_t id = (uintptr_t)arg; 7169 ztest_shared_t *zs = ztest_shared; 7170 uint64_t call_next; 7171 hrtime_t now; 7172 ztest_info_t *zi; 7173 ztest_shared_callstate_t *zc; 7174 7175 while ((now = gethrtime()) < zs->zs_thread_stop) { 7176 /* 7177 * See if it's time to force a crash. 
7178 */ 7179 if (now > zs->zs_thread_kill) 7180 ztest_kill(zs); 7181 7182 /* 7183 * If we're getting ENOSPC with some regularity, stop. 7184 */ 7185 if (zs->zs_enospc_count > 10) 7186 break; 7187 7188 /* 7189 * Pick a random function to execute. 7190 */ 7191 rand = ztest_random(ZTEST_FUNCS); 7192 zi = &ztest_info[rand]; 7193 zc = ZTEST_GET_SHARED_CALLSTATE(rand); 7194 call_next = zc->zc_next; 7195 7196 if (now >= call_next && 7197 atomic_cas_64(&zc->zc_next, call_next, call_next + 7198 ztest_random(2 * zi->zi_interval[0] + 1)) == call_next) { 7199 ztest_execute(rand, zi, id); 7200 } 7201 } 7202 7203 thread_exit(); 7204 } 7205 7206 static void 7207 ztest_dataset_name(char *dsname, const char *pool, int d) 7208 { 7209 (void) snprintf(dsname, ZFS_MAX_DATASET_NAME_LEN, "%s/ds_%d", pool, d); 7210 } 7211 7212 static void 7213 ztest_dataset_destroy(int d) 7214 { 7215 char name[ZFS_MAX_DATASET_NAME_LEN]; 7216 int t; 7217 7218 ztest_dataset_name(name, ztest_opts.zo_pool, d); 7219 7220 if (ztest_opts.zo_verbose >= 3) 7221 (void) printf("Destroying %s to free up space\n", name); 7222 7223 /* 7224 * Cleanup any non-standard clones and snapshots. In general, 7225 * ztest thread t operates on dataset (t % zopt_datasets), 7226 * so there may be more than one thing to clean up. 7227 */ 7228 for (t = d; t < ztest_opts.zo_threads; 7229 t += ztest_opts.zo_datasets) 7230 ztest_dsl_dataset_cleanup(name, t); 7231 7232 (void) dmu_objset_find(name, ztest_objset_destroy_cb, NULL, 7233 DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN); 7234 } 7235 7236 static void 7237 ztest_dataset_dirobj_verify(ztest_ds_t *zd) 7238 { 7239 uint64_t usedobjs, dirobjs, scratch; 7240 7241 /* 7242 * ZTEST_DIROBJ is the object directory for the entire dataset. 7243 * Therefore, the number of objects in use should equal the 7244 * number of ZTEST_DIROBJ entries, +1 for ZTEST_DIROBJ itself. 7245 * If not, we have an object leak. 
7246 * 7247 * Note that we can only check this in ztest_dataset_open(), 7248 * when the open-context and syncing-context values agree. 7249 * That's because zap_count() returns the open-context value, 7250 * while dmu_objset_space() returns the rootbp fill count. 7251 */ 7252 VERIFY0(zap_count(zd->zd_os, ZTEST_DIROBJ, &dirobjs)); 7253 dmu_objset_space(zd->zd_os, &scratch, &scratch, &usedobjs, &scratch); 7254 ASSERT3U(dirobjs + 1, ==, usedobjs); 7255 } 7256 7257 static int 7258 ztest_dataset_open(int d) 7259 { 7260 ztest_ds_t *zd = &ztest_ds[d]; 7261 uint64_t committed_seq = ZTEST_GET_SHARED_DS(d)->zd_seq; 7262 objset_t *os; 7263 zilog_t *zilog; 7264 char name[ZFS_MAX_DATASET_NAME_LEN]; 7265 int error; 7266 7267 ztest_dataset_name(name, ztest_opts.zo_pool, d); 7268 7269 (void) pthread_rwlock_rdlock(&ztest_name_lock); 7270 7271 error = ztest_dataset_create(name); 7272 if (error == ENOSPC) { 7273 (void) pthread_rwlock_unlock(&ztest_name_lock); 7274 ztest_record_enospc(FTAG); 7275 return (error); 7276 } 7277 ASSERT(error == 0 || error == EEXIST); 7278 7279 VERIFY0(ztest_dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, 7280 B_TRUE, zd, &os)); 7281 (void) pthread_rwlock_unlock(&ztest_name_lock); 7282 7283 ztest_zd_init(zd, ZTEST_GET_SHARED_DS(d), os); 7284 7285 zilog = zd->zd_zilog; 7286 7287 if (zilog->zl_header->zh_claim_lr_seq != 0 && 7288 zilog->zl_header->zh_claim_lr_seq < committed_seq) 7289 fatal(B_FALSE, "missing log records: " 7290 "claimed %"PRIu64" < committed %"PRIu64"", 7291 zilog->zl_header->zh_claim_lr_seq, committed_seq); 7292 7293 ztest_dataset_dirobj_verify(zd); 7294 7295 zil_replay(os, zd, ztest_replay_vector); 7296 7297 ztest_dataset_dirobj_verify(zd); 7298 7299 if (ztest_opts.zo_verbose >= 6) 7300 (void) printf("%s replay %"PRIu64" blocks, " 7301 "%"PRIu64" records, seq %"PRIu64"\n", 7302 zd->zd_name, 7303 zilog->zl_parse_blk_count, 7304 zilog->zl_parse_lr_count, 7305 zilog->zl_replaying_seq); 7306 7307 zilog = zil_open(os, ztest_get_data); 7308 7309 if 
(zilog->zl_replaying_seq != 0 && 7310 zilog->zl_replaying_seq < committed_seq) 7311 fatal(B_FALSE, "missing log records: " 7312 "replayed %"PRIu64" < committed %"PRIu64"", 7313 zilog->zl_replaying_seq, committed_seq); 7314 7315 return (0); 7316 } 7317 7318 static void 7319 ztest_dataset_close(int d) 7320 { 7321 ztest_ds_t *zd = &ztest_ds[d]; 7322 7323 zil_close(zd->zd_zilog); 7324 dmu_objset_disown(zd->zd_os, B_TRUE, zd); 7325 7326 ztest_zd_fini(zd); 7327 } 7328 7329 static int 7330 ztest_replay_zil_cb(const char *name, void *arg) 7331 { 7332 (void) arg; 7333 objset_t *os; 7334 ztest_ds_t *zdtmp; 7335 7336 VERIFY0(ztest_dmu_objset_own(name, DMU_OST_ANY, B_TRUE, 7337 B_TRUE, FTAG, &os)); 7338 7339 zdtmp = umem_alloc(sizeof (ztest_ds_t), UMEM_NOFAIL); 7340 7341 ztest_zd_init(zdtmp, NULL, os); 7342 zil_replay(os, zdtmp, ztest_replay_vector); 7343 ztest_zd_fini(zdtmp); 7344 7345 if (dmu_objset_zil(os)->zl_parse_lr_count != 0 && 7346 ztest_opts.zo_verbose >= 6) { 7347 zilog_t *zilog = dmu_objset_zil(os); 7348 7349 (void) printf("%s replay %"PRIu64" blocks, " 7350 "%"PRIu64" records, seq %"PRIu64"\n", 7351 name, 7352 zilog->zl_parse_blk_count, 7353 zilog->zl_parse_lr_count, 7354 zilog->zl_replaying_seq); 7355 } 7356 7357 umem_free(zdtmp, sizeof (ztest_ds_t)); 7358 7359 dmu_objset_disown(os, B_TRUE, FTAG); 7360 return (0); 7361 } 7362 7363 static void 7364 ztest_freeze(void) 7365 { 7366 ztest_ds_t *zd = &ztest_ds[0]; 7367 spa_t *spa; 7368 int numloops = 0; 7369 7370 if (ztest_opts.zo_verbose >= 3) 7371 (void) printf("testing spa_freeze()...\n"); 7372 7373 kernel_init(SPA_MODE_READ | SPA_MODE_WRITE); 7374 VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG)); 7375 VERIFY0(ztest_dataset_open(0)); 7376 ztest_spa = spa; 7377 7378 /* 7379 * Force the first log block to be transactionally allocated. 7380 * We have to do this before we freeze the pool -- otherwise 7381 * the log chain won't be anchored. 
7382 */ 7383 while (BP_IS_HOLE(&zd->zd_zilog->zl_header->zh_log)) { 7384 ztest_dmu_object_alloc_free(zd, 0); 7385 zil_commit(zd->zd_zilog, 0); 7386 } 7387 7388 txg_wait_synced(spa_get_dsl(spa), 0); 7389 7390 /* 7391 * Freeze the pool. This stops spa_sync() from doing anything, 7392 * so that the only way to record changes from now on is the ZIL. 7393 */ 7394 spa_freeze(spa); 7395 7396 /* 7397 * Because it is hard to predict how much space a write will actually 7398 * require beforehand, we leave ourselves some fudge space to write over 7399 * capacity. 7400 */ 7401 uint64_t capacity = metaslab_class_get_space(spa_normal_class(spa)) / 2; 7402 7403 /* 7404 * Run tests that generate log records but don't alter the pool config 7405 * or depend on DSL sync tasks (snapshots, objset create/destroy, etc). 7406 * We do a txg_wait_synced() after each iteration to force the txg 7407 * to increase well beyond the last synced value in the uberblock. 7408 * The ZIL should be OK with that. 7409 * 7410 * Run a random number of times less than zo_maxloops and ensure we do 7411 * not run out of space on the pool. 7412 */ 7413 while (ztest_random(10) != 0 && 7414 numloops++ < ztest_opts.zo_maxloops && 7415 metaslab_class_get_alloc(spa_normal_class(spa)) < capacity) { 7416 ztest_od_t od; 7417 ztest_od_init(&od, 0, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0, 0); 7418 VERIFY0(ztest_object_init(zd, &od, sizeof (od), B_FALSE)); 7419 ztest_io(zd, od.od_object, 7420 ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT); 7421 txg_wait_synced(spa_get_dsl(spa), 0); 7422 } 7423 7424 /* 7425 * Commit all of the changes we just generated. 7426 */ 7427 zil_commit(zd->zd_zilog, 0); 7428 txg_wait_synced(spa_get_dsl(spa), 0); 7429 7430 /* 7431 * Close our dataset and close the pool. 7432 */ 7433 ztest_dataset_close(0); 7434 spa_close(spa, FTAG); 7435 kernel_fini(); 7436 7437 /* 7438 * Open and close the pool and dataset to induce log replay. 
7439 */ 7440 kernel_init(SPA_MODE_READ | SPA_MODE_WRITE); 7441 VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG)); 7442 ASSERT3U(spa_freeze_txg(spa), ==, UINT64_MAX); 7443 VERIFY0(ztest_dataset_open(0)); 7444 ztest_spa = spa; 7445 txg_wait_synced(spa_get_dsl(spa), 0); 7446 ztest_dataset_close(0); 7447 ztest_reguid(NULL, 0); 7448 7449 spa_close(spa, FTAG); 7450 kernel_fini(); 7451 } 7452 7453 static void 7454 ztest_import_impl(void) 7455 { 7456 importargs_t args = { 0 }; 7457 nvlist_t *cfg = NULL; 7458 int nsearch = 1; 7459 char *searchdirs[nsearch]; 7460 int flags = ZFS_IMPORT_MISSING_LOG; 7461 7462 searchdirs[0] = ztest_opts.zo_dir; 7463 args.paths = nsearch; 7464 args.path = searchdirs; 7465 args.can_be_active = B_FALSE; 7466 7467 VERIFY0(zpool_find_config(NULL, ztest_opts.zo_pool, &cfg, &args, 7468 &libzpool_config_ops)); 7469 VERIFY0(spa_import(ztest_opts.zo_pool, cfg, NULL, flags)); 7470 fnvlist_free(cfg); 7471 } 7472 7473 /* 7474 * Import a storage pool with the given name. 7475 */ 7476 static void 7477 ztest_import(ztest_shared_t *zs) 7478 { 7479 spa_t *spa; 7480 7481 mutex_init(&ztest_vdev_lock, NULL, MUTEX_DEFAULT, NULL); 7482 mutex_init(&ztest_checkpoint_lock, NULL, MUTEX_DEFAULT, NULL); 7483 VERIFY0(pthread_rwlock_init(&ztest_name_lock, NULL)); 7484 7485 kernel_init(SPA_MODE_READ | SPA_MODE_WRITE); 7486 7487 ztest_import_impl(); 7488 7489 VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG)); 7490 zs->zs_metaslab_sz = 7491 1ULL << spa->spa_root_vdev->vdev_child[0]->vdev_ms_shift; 7492 spa_close(spa, FTAG); 7493 7494 kernel_fini(); 7495 7496 if (!ztest_opts.zo_mmp_test) { 7497 ztest_run_zdb(ztest_opts.zo_pool); 7498 ztest_freeze(); 7499 ztest_run_zdb(ztest_opts.zo_pool); 7500 } 7501 7502 (void) pthread_rwlock_destroy(&ztest_name_lock); 7503 mutex_destroy(&ztest_vdev_lock); 7504 mutex_destroy(&ztest_checkpoint_lock); 7505 } 7506 7507 /* 7508 * Kick off threads to run tests on all datasets in parallel. 
*/
/*
 * One test pass: computes the pass stop/kill times in zs, opens (or
 * imports and opens) the pool, starts the resume and deadman threads,
 * starts zo_threads worker threads, waits for them, closes the
 * datasets, records pool allocation statistics in zs, and finally
 * performs a few namespace and import/export checks before tearing the
 * kernel emulation down.
 */
static void
ztest_run(ztest_shared_t *zs)
{
	spa_t *spa;
	objset_t *os;
	kthread_t *resume_thread, *deadman_thread;
	kthread_t **run_threads;
	uint64_t object;
	int error;
	int t, d;

	ztest_exiting = B_FALSE;

	/*
	 * Initialize parent/child shared state.
	 */
	mutex_init(&ztest_vdev_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&ztest_checkpoint_lock, NULL, MUTEX_DEFAULT, NULL);
	VERIFY0(pthread_rwlock_init(&ztest_name_lock, NULL));

	/*
	 * The pass runs for zo_passtime seconds but never past the
	 * process-wide stop time.  With probability zo_killrate percent
	 * the kill time is moved to a random point before the stop time.
	 */
	zs->zs_thread_start = gethrtime();
	zs->zs_thread_stop =
	    zs->zs_thread_start + ztest_opts.zo_passtime * NANOSEC;
	zs->zs_thread_stop = MIN(zs->zs_thread_stop, zs->zs_proc_stop);
	zs->zs_thread_kill = zs->zs_thread_stop;
	if (ztest_random(100) < ztest_opts.zo_killrate) {
		zs->zs_thread_kill -=
		    ztest_random(ztest_opts.zo_passtime * NANOSEC);
	}

	mutex_init(&zcl.zcl_callbacks_lock, NULL, MUTEX_DEFAULT, NULL);

	list_create(&zcl.zcl_callbacks, sizeof (ztest_cb_data_t),
	    offsetof(ztest_cb_data_t, zcd_node));

	/*
	 * Open our pool.  It may need to be imported first depending on
	 * what tests were running when the previous pass was terminated.
	 */
	kernel_init(SPA_MODE_READ | SPA_MODE_WRITE);
	error = spa_open(ztest_opts.zo_pool, &spa, FTAG);
	if (error) {
		/* ENOENT is the only open failure that is tolerated. */
		VERIFY3S(error, ==, ENOENT);
		ztest_import_impl();
		VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG));
		zs->zs_metaslab_sz =
		    1ULL << spa->spa_root_vdev->vdev_child[0]->vdev_ms_shift;
	}

	metaslab_preload_limit = ztest_random(20) + 1;
	ztest_spa = spa;

	VERIFY0(vdev_raidz_impl_set("cycle"));

	/* Record the guid of the pool's root objset in shared state. */
	dmu_objset_stats_t dds;
	VERIFY0(ztest_dmu_objset_own(ztest_opts.zo_pool,
	    DMU_OST_ANY, B_TRUE, B_TRUE, FTAG, &os));
	dsl_pool_config_enter(dmu_objset_pool(os), FTAG);
	dmu_objset_fast_stat(os, &dds);
	dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
	zs->zs_guid = dds.dds_guid;
	dmu_objset_disown(os, B_TRUE, FTAG);

	/*
	 * Create a thread to periodically resume suspended I/O.
	 */
	resume_thread = thread_create(NULL, 0, ztest_resume_thread,
	    spa, 0, NULL, TS_RUN | TS_JOINABLE, defclsyspri);

	/*
	 * Create a deadman thread and set to panic if we hang.
	 */
	deadman_thread = thread_create(NULL, 0, ztest_deadman_thread,
	    zs, 0, NULL, TS_RUN | TS_JOINABLE, defclsyspri);

	spa->spa_deadman_failmode = ZIO_FAILURE_MODE_PANIC;

	/*
	 * Verify that we can safely inquire about any object,
	 * whether it's allocated or not.  To make it interesting,
	 * we probe a 5-wide window around each power of two.
	 * This hits all edge cases, including zero and the max.
	 */
	for (t = 0; t < 64; t++) {
		for (d = -5; d <= 5; d++) {
			error = dmu_object_info(spa->spa_meta_objset,
			    (1ULL << t) + d, NULL);
			ASSERT(error == 0 || error == ENOENT ||
			    error == EINVAL);
		}
	}

	/*
	 * If we got any ENOSPC errors on the previous run, destroy something.
	 */
	if (zs->zs_enospc_count != 0) {
		/* Note: this 'd' shadows the function-scope 'd' above. */
		int d = ztest_random(ztest_opts.zo_datasets);
		ztest_dataset_destroy(d);
	}
	zs->zs_enospc_count = 0;

	/*
	 * If we were in the middle of ztest_device_removal() and were killed
	 * we need to ensure the removal and scrub complete before running
	 * any tests that check ztest_device_removal_active.  The removal will
	 * be restarted automatically when the spa is opened, but we need to
	 * initiate the scrub manually if it is not already in progress.  Note
	 * that we always run the scrub whenever an indirect vdev exists
	 * because we have no way of knowing for sure if ztest_device_removal()
	 * fully completed its scrub before the pool was reimported.
	 */
	if (spa->spa_removing_phys.sr_state == DSS_SCANNING ||
	    spa->spa_removing_phys.sr_prev_indirect_vdev != -1) {
		while (spa->spa_removing_phys.sr_state == DSS_SCANNING)
			txg_wait_synced(spa_get_dsl(spa), 0);

		error = ztest_scrub_impl(spa);
		/* EBUSY from the scrub attempt is tolerated. */
		if (error == EBUSY)
			error = 0;
		ASSERT0(error);
	}

	run_threads = umem_zalloc(ztest_opts.zo_threads * sizeof (kthread_t *),
	    UMEM_NOFAIL);

	if (ztest_opts.zo_verbose >= 4)
		(void) printf("starting main threads...\n");

	/*
	 * Replay all logs of all datasets in the pool. This is primarily for
	 * temporary datasets which wouldn't otherwise get replayed, which
	 * can trigger failures when attempting to offline a SLOG in
	 * ztest_fault_inject().
	 */
	(void) dmu_objset_find(ztest_opts.zo_pool, ztest_replay_zil_cb,
	    NULL, DS_FIND_CHILDREN);

	/*
	 * Kick off all the tests that run in parallel.
	 */
	for (t = 0; t < ztest_opts.zo_threads; t++) {
		/*
		 * NOTE(review): this early return frees only run_threads.
		 * Worker threads already started, the resume and deadman
		 * threads, datasets opened so far, the pool hold, and the
		 * locks initialized above are not torn down on this path --
		 * presumably acceptable because the caller exits shortly
		 * afterwards; confirm against the caller.
		 */
		if (t < ztest_opts.zo_datasets && ztest_dataset_open(t) != 0) {
			umem_free(run_threads, ztest_opts.zo_threads *
			    sizeof (kthread_t *));
			return;
		}

		run_threads[t] = thread_create(NULL, 0, ztest_thread,
		    (void *)(uintptr_t)t, 0, NULL, TS_RUN | TS_JOINABLE,
		    defclsyspri);
	}

	/*
	 * Wait for all of the tests to complete.
	 */
	for (t = 0; t < ztest_opts.zo_threads; t++)
		VERIFY0(thread_join(run_threads[t]));

	/*
	 * Close all datasets. This must be done after all the threads
	 * are joined so we can be sure none of the datasets are in-use
	 * by any of the threads.
	 */
	for (t = 0; t < ztest_opts.zo_threads; t++) {
		if (t < ztest_opts.zo_datasets)
			ztest_dataset_close(t);
	}

	txg_wait_synced(spa_get_dsl(spa), 0);

	zs->zs_alloc = metaslab_class_get_alloc(spa_normal_class(spa));
	zs->zs_space = metaslab_class_get_space(spa_normal_class(spa));

	umem_free(run_threads, ztest_opts.zo_threads * sizeof (kthread_t *));

	/* Kill the resume and deadman threads */
	ztest_exiting = B_TRUE;
	VERIFY0(thread_join(resume_thread));
	VERIFY0(thread_join(deadman_thread));
	ztest_resume(spa);

	/*
	 * Right before closing the pool, kick off a bunch of async I/O;
	 * spa_close() should wait for it to complete.
	 */
	for (object = 1; object < 50; object++) {
		dmu_prefetch(spa->spa_meta_objset, object, 0, 0, 1ULL << 20,
		    ZIO_PRIORITY_SYNC_READ);
	}

	/* Verify that at least one commit cb was called in a timely fashion */
	if (zc_cb_counter >= ZTEST_COMMIT_CB_MIN_REG)
		VERIFY0(zc_min_txg_delay);

	spa_close(spa, FTAG);

	/*
	 * Verify that we can loop over all pools.
	 */
	mutex_enter(&spa_namespace_lock);
	for (spa = spa_next(NULL); spa != NULL; spa = spa_next(spa))
		if (ztest_opts.zo_verbose > 3)
			(void) printf("spa_next: found %s\n", spa_name(spa));
	mutex_exit(&spa_namespace_lock);

	/*
	 * Verify that we can export the pool and reimport it under a
	 * different name.
	 */
	if ((ztest_random(2) == 0) && !ztest_opts.zo_mmp_test) {
		char name[ZFS_MAX_DATASET_NAME_LEN];
		(void) snprintf(name, sizeof (name), "%s_import",
		    ztest_opts.zo_pool);
		/* Rename to "<pool>_import" and then back again. */
		ztest_spa_import_export(ztest_opts.zo_pool, name);
		ztest_spa_import_export(name, ztest_opts.zo_pool);
	}

	kernel_fini();

	list_destroy(&zcl.zcl_callbacks);
	mutex_destroy(&zcl.zcl_callbacks_lock);
	(void) pthread_rwlock_destroy(&ztest_name_lock);
	mutex_destroy(&ztest_vdev_lock);
	mutex_destroy(&ztest_checkpoint_lock);
}

/*
 * Format the duration t into timebuf as days/hours/minutes/seconds,
 * omitting leading units that are zero (e.g. "5m07s", "42s").  t is
 * divided by NANOSEC to obtain seconds.
 *
 * The output is written with sprintf(), so timebuf must be large enough
 * for the longest form; no size is passed in.
 *
 * NOTE(review): the hrtime_t values are printed with %llu; presumably
 * hrtime_t is a 64-bit type for which this is harmless -- confirm.
 */
static void
print_time(hrtime_t t, char *timebuf)
{
	hrtime_t s = t / NANOSEC;
	hrtime_t m = s / 60;
	hrtime_t h = m / 60;
	hrtime_t d = h / 24;

	/* Reduce each unit to its remainder within the next larger unit. */
	s -= m * 60;
	m -= h * 60;
	h -= d * 24;

	timebuf[0] = '\0';

	if (d)
		(void) sprintf(timebuf,
		    "%llud%02lluh%02llum%02llus", d, h, m, s);
	else if (h)
		(void) sprintf(timebuf, "%lluh%02llum%02llus", h, m, s);
	else if (m)
		(void) sprintf(timebuf, "%llum%02llus", m, s);
	else
		(void) sprintf(timebuf, "%llus", s);
}

/*
 * Allocate a pool property list that is empty half of the time and
 * otherwise has autoreplace set to 1.  The caller owns the returned
 * nvlist.
 */
static nvlist_t *
make_random_props(void)
{
	nvlist_t *props;

	props = fnvlist_alloc();

	if (ztest_random(2) == 0)
		return (props);

	fnvlist_add_uint64(props,
	    zpool_prop_to_name(ZPOOL_PROP_AUTOREPLACE), 1);

	return (props);
}

/*
 * Create a storage pool with the given name and initial vdev size.
 * Then test spa_freeze() functionality.
7780 */ 7781 static void 7782 ztest_init(ztest_shared_t *zs) 7783 { 7784 spa_t *spa; 7785 nvlist_t *nvroot, *props; 7786 int i; 7787 7788 mutex_init(&ztest_vdev_lock, NULL, MUTEX_DEFAULT, NULL); 7789 mutex_init(&ztest_checkpoint_lock, NULL, MUTEX_DEFAULT, NULL); 7790 VERIFY0(pthread_rwlock_init(&ztest_name_lock, NULL)); 7791 7792 kernel_init(SPA_MODE_READ | SPA_MODE_WRITE); 7793 7794 /* 7795 * Create the storage pool. 7796 */ 7797 (void) spa_destroy(ztest_opts.zo_pool); 7798 ztest_shared->zs_vdev_next_leaf = 0; 7799 zs->zs_splits = 0; 7800 zs->zs_mirrors = ztest_opts.zo_mirrors; 7801 nvroot = make_vdev_root(NULL, NULL, NULL, ztest_opts.zo_vdev_size, 0, 7802 NULL, ztest_opts.zo_raid_children, zs->zs_mirrors, 1); 7803 props = make_random_props(); 7804 7805 /* 7806 * We don't expect the pool to suspend unless maxfaults == 0, 7807 * in which case ztest_fault_inject() temporarily takes away 7808 * the only valid replica. 7809 */ 7810 fnvlist_add_uint64(props, 7811 zpool_prop_to_name(ZPOOL_PROP_FAILUREMODE), 7812 MAXFAULTS(zs) ? ZIO_FAILURE_MODE_PANIC : ZIO_FAILURE_MODE_WAIT); 7813 7814 for (i = 0; i < SPA_FEATURES; i++) { 7815 char *buf; 7816 7817 if (!spa_feature_table[i].fi_zfs_mod_supported) 7818 continue; 7819 7820 /* 7821 * 75% chance of using the log space map feature. We want ztest 7822 * to exercise both the code paths that use the log space map 7823 * feature and the ones that don't. 
7824 */ 7825 if (i == SPA_FEATURE_LOG_SPACEMAP && ztest_random(4) == 0) 7826 continue; 7827 7828 VERIFY3S(-1, !=, asprintf(&buf, "feature@%s", 7829 spa_feature_table[i].fi_uname)); 7830 fnvlist_add_uint64(props, buf, 0); 7831 free(buf); 7832 } 7833 7834 VERIFY0(spa_create(ztest_opts.zo_pool, nvroot, props, NULL, NULL)); 7835 fnvlist_free(nvroot); 7836 fnvlist_free(props); 7837 7838 VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG)); 7839 zs->zs_metaslab_sz = 7840 1ULL << spa->spa_root_vdev->vdev_child[0]->vdev_ms_shift; 7841 spa_close(spa, FTAG); 7842 7843 kernel_fini(); 7844 7845 if (!ztest_opts.zo_mmp_test) { 7846 ztest_run_zdb(ztest_opts.zo_pool); 7847 ztest_freeze(); 7848 ztest_run_zdb(ztest_opts.zo_pool); 7849 } 7850 7851 (void) pthread_rwlock_destroy(&ztest_name_lock); 7852 mutex_destroy(&ztest_vdev_lock); 7853 mutex_destroy(&ztest_checkpoint_lock); 7854 } 7855 7856 static void 7857 setup_data_fd(void) 7858 { 7859 static char ztest_name_data[] = "/tmp/ztest.data.XXXXXX"; 7860 7861 ztest_fd_data = mkstemp(ztest_name_data); 7862 ASSERT3S(ztest_fd_data, >=, 0); 7863 (void) unlink(ztest_name_data); 7864 } 7865 7866 static int 7867 shared_data_size(ztest_shared_hdr_t *hdr) 7868 { 7869 int size; 7870 7871 size = hdr->zh_hdr_size; 7872 size += hdr->zh_opts_size; 7873 size += hdr->zh_size; 7874 size += hdr->zh_stats_size * hdr->zh_stats_count; 7875 size += hdr->zh_ds_size * hdr->zh_ds_count; 7876 7877 return (size); 7878 } 7879 7880 static void 7881 setup_hdr(void) 7882 { 7883 int size; 7884 ztest_shared_hdr_t *hdr; 7885 7886 hdr = (void *)mmap(0, P2ROUNDUP(sizeof (*hdr), getpagesize()), 7887 PROT_READ | PROT_WRITE, MAP_SHARED, ztest_fd_data, 0); 7888 ASSERT3P(hdr, !=, MAP_FAILED); 7889 7890 VERIFY0(ftruncate(ztest_fd_data, sizeof (ztest_shared_hdr_t))); 7891 7892 hdr->zh_hdr_size = sizeof (ztest_shared_hdr_t); 7893 hdr->zh_opts_size = sizeof (ztest_shared_opts_t); 7894 hdr->zh_size = sizeof (ztest_shared_t); 7895 hdr->zh_stats_size = sizeof 
(ztest_shared_callstate_t); 7896 hdr->zh_stats_count = ZTEST_FUNCS; 7897 hdr->zh_ds_size = sizeof (ztest_shared_ds_t); 7898 hdr->zh_ds_count = ztest_opts.zo_datasets; 7899 7900 size = shared_data_size(hdr); 7901 VERIFY0(ftruncate(ztest_fd_data, size)); 7902 7903 (void) munmap((caddr_t)hdr, P2ROUNDUP(sizeof (*hdr), getpagesize())); 7904 } 7905 7906 static void 7907 setup_data(void) 7908 { 7909 int size, offset; 7910 ztest_shared_hdr_t *hdr; 7911 uint8_t *buf; 7912 7913 hdr = (void *)mmap(0, P2ROUNDUP(sizeof (*hdr), getpagesize()), 7914 PROT_READ, MAP_SHARED, ztest_fd_data, 0); 7915 ASSERT3P(hdr, !=, MAP_FAILED); 7916 7917 size = shared_data_size(hdr); 7918 7919 (void) munmap((caddr_t)hdr, P2ROUNDUP(sizeof (*hdr), getpagesize())); 7920 hdr = ztest_shared_hdr = (void *)mmap(0, P2ROUNDUP(size, getpagesize()), 7921 PROT_READ | PROT_WRITE, MAP_SHARED, ztest_fd_data, 0); 7922 ASSERT3P(hdr, !=, MAP_FAILED); 7923 buf = (uint8_t *)hdr; 7924 7925 offset = hdr->zh_hdr_size; 7926 ztest_shared_opts = (void *)&buf[offset]; 7927 offset += hdr->zh_opts_size; 7928 ztest_shared = (void *)&buf[offset]; 7929 offset += hdr->zh_size; 7930 ztest_shared_callstate = (void *)&buf[offset]; 7931 offset += hdr->zh_stats_size * hdr->zh_stats_count; 7932 ztest_shared_ds = (void *)&buf[offset]; 7933 } 7934 7935 static boolean_t 7936 exec_child(char *cmd, char *libpath, boolean_t ignorekill, int *statusp) 7937 { 7938 pid_t pid; 7939 int status; 7940 char *cmdbuf = NULL; 7941 7942 pid = fork(); 7943 7944 if (cmd == NULL) { 7945 cmdbuf = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); 7946 (void) strlcpy(cmdbuf, getexecname(), MAXPATHLEN); 7947 cmd = cmdbuf; 7948 } 7949 7950 if (pid == -1) 7951 fatal(B_TRUE, "fork failed"); 7952 7953 if (pid == 0) { /* child */ 7954 char fd_data_str[12]; 7955 7956 VERIFY3S(11, >=, 7957 snprintf(fd_data_str, 12, "%d", ztest_fd_data)); 7958 VERIFY0(setenv("ZTEST_FD_DATA", fd_data_str, 1)); 7959 7960 if (libpath != NULL) { 7961 const char *curlp = getenv("LD_LIBRARY_PATH"); 7962 
if (curlp == NULL) 7963 VERIFY0(setenv("LD_LIBRARY_PATH", libpath, 1)); 7964 else { 7965 char *newlp = NULL; 7966 VERIFY3S(-1, !=, 7967 asprintf(&newlp, "%s:%s", libpath, curlp)); 7968 VERIFY0(setenv("LD_LIBRARY_PATH", newlp, 1)); 7969 } 7970 } 7971 (void) execl(cmd, cmd, (char *)NULL); 7972 ztest_dump_core = B_FALSE; 7973 fatal(B_TRUE, "exec failed: %s", cmd); 7974 } 7975 7976 if (cmdbuf != NULL) { 7977 umem_free(cmdbuf, MAXPATHLEN); 7978 cmd = NULL; 7979 } 7980 7981 while (waitpid(pid, &status, 0) != pid) 7982 continue; 7983 if (statusp != NULL) 7984 *statusp = status; 7985 7986 if (WIFEXITED(status)) { 7987 if (WEXITSTATUS(status) != 0) { 7988 (void) fprintf(stderr, "child exited with code %d\n", 7989 WEXITSTATUS(status)); 7990 exit(2); 7991 } 7992 return (B_FALSE); 7993 } else if (WIFSIGNALED(status)) { 7994 if (!ignorekill || WTERMSIG(status) != SIGKILL) { 7995 (void) fprintf(stderr, "child died with signal %d\n", 7996 WTERMSIG(status)); 7997 exit(3); 7998 } 7999 return (B_TRUE); 8000 } else { 8001 (void) fprintf(stderr, "something strange happened to child\n"); 8002 exit(4); 8003 } 8004 } 8005 8006 static void 8007 ztest_run_init(void) 8008 { 8009 int i; 8010 8011 ztest_shared_t *zs = ztest_shared; 8012 8013 /* 8014 * Blow away any existing copy of zpool.cache 8015 */ 8016 (void) remove(spa_config_path); 8017 8018 if (ztest_opts.zo_init == 0) { 8019 if (ztest_opts.zo_verbose >= 1) 8020 (void) printf("Importing pool %s\n", 8021 ztest_opts.zo_pool); 8022 ztest_import(zs); 8023 return; 8024 } 8025 8026 /* 8027 * Create and initialize our storage pool. 
8028 */ 8029 for (i = 1; i <= ztest_opts.zo_init; i++) { 8030 memset(zs, 0, sizeof (*zs)); 8031 if (ztest_opts.zo_verbose >= 3 && 8032 ztest_opts.zo_init != 1) { 8033 (void) printf("ztest_init(), pass %d\n", i); 8034 } 8035 ztest_init(zs); 8036 } 8037 } 8038 8039 int 8040 main(int argc, char **argv) 8041 { 8042 int kills = 0; 8043 int iters = 0; 8044 int older = 0; 8045 int newer = 0; 8046 ztest_shared_t *zs; 8047 ztest_info_t *zi; 8048 ztest_shared_callstate_t *zc; 8049 char timebuf[100]; 8050 char numbuf[NN_NUMBUF_SZ]; 8051 char *cmd; 8052 boolean_t hasalt; 8053 int f, err; 8054 char *fd_data_str = getenv("ZTEST_FD_DATA"); 8055 struct sigaction action; 8056 8057 (void) setvbuf(stdout, NULL, _IOLBF, 0); 8058 8059 dprintf_setup(&argc, argv); 8060 zfs_deadman_synctime_ms = 300000; 8061 zfs_deadman_checktime_ms = 30000; 8062 /* 8063 * As two-word space map entries may not come up often (especially 8064 * if pool and vdev sizes are small) we want to force at least some 8065 * of them so the feature get tested. 8066 */ 8067 zfs_force_some_double_word_sm_entries = B_TRUE; 8068 8069 /* 8070 * Verify that even extensively damaged split blocks with many 8071 * segments can be reconstructed in a reasonable amount of time 8072 * when reconstruction is known to be possible. 8073 * 8074 * Note: the lower this value is, the more damage we inflict, and 8075 * the more time ztest spends in recovering that damage. We chose 8076 * to induce damage 1/100th of the time so recovery is tested but 8077 * not so frequently that ztest doesn't get to test other code paths. 
8078 */ 8079 zfs_reconstruct_indirect_damage_fraction = 100; 8080 8081 action.sa_handler = sig_handler; 8082 sigemptyset(&action.sa_mask); 8083 action.sa_flags = 0; 8084 8085 if (sigaction(SIGSEGV, &action, NULL) < 0) { 8086 (void) fprintf(stderr, "ztest: cannot catch SIGSEGV: %s.\n", 8087 strerror(errno)); 8088 exit(EXIT_FAILURE); 8089 } 8090 8091 if (sigaction(SIGABRT, &action, NULL) < 0) { 8092 (void) fprintf(stderr, "ztest: cannot catch SIGABRT: %s.\n", 8093 strerror(errno)); 8094 exit(EXIT_FAILURE); 8095 } 8096 8097 /* 8098 * Force random_get_bytes() to use /dev/urandom in order to prevent 8099 * ztest from needlessly depleting the system entropy pool. 8100 */ 8101 random_path = "/dev/urandom"; 8102 ztest_fd_rand = open(random_path, O_RDONLY | O_CLOEXEC); 8103 ASSERT3S(ztest_fd_rand, >=, 0); 8104 8105 if (!fd_data_str) { 8106 process_options(argc, argv); 8107 8108 setup_data_fd(); 8109 setup_hdr(); 8110 setup_data(); 8111 memcpy(ztest_shared_opts, &ztest_opts, 8112 sizeof (*ztest_shared_opts)); 8113 } else { 8114 ztest_fd_data = atoi(fd_data_str); 8115 setup_data(); 8116 memcpy(&ztest_opts, ztest_shared_opts, sizeof (ztest_opts)); 8117 } 8118 ASSERT3U(ztest_opts.zo_datasets, ==, ztest_shared_hdr->zh_ds_count); 8119 8120 err = ztest_set_global_vars(); 8121 if (err != 0 && !fd_data_str) { 8122 /* error message done by ztest_set_global_vars */ 8123 exit(EXIT_FAILURE); 8124 } else { 8125 /* children should not be spawned if setting gvars fails */ 8126 VERIFY3S(err, ==, 0); 8127 } 8128 8129 /* Override location of zpool.cache */ 8130 VERIFY3S(asprintf((char **)&spa_config_path, "%s/zpool.cache", 8131 ztest_opts.zo_dir), !=, -1); 8132 8133 ztest_ds = umem_alloc(ztest_opts.zo_datasets * sizeof (ztest_ds_t), 8134 UMEM_NOFAIL); 8135 zs = ztest_shared; 8136 8137 if (fd_data_str) { 8138 metaslab_force_ganging = ztest_opts.zo_metaslab_force_ganging; 8139 metaslab_df_alloc_threshold = 8140 zs->zs_metaslab_df_alloc_threshold; 8141 8142 if (zs->zs_do_init) 8143 
ztest_run_init(); 8144 else 8145 ztest_run(zs); 8146 exit(0); 8147 } 8148 8149 hasalt = (strlen(ztest_opts.zo_alt_ztest) != 0); 8150 8151 if (ztest_opts.zo_verbose >= 1) { 8152 (void) printf("%"PRIu64" vdevs, %d datasets, %d threads," 8153 "%d %s disks, %"PRIu64" seconds...\n\n", 8154 ztest_opts.zo_vdevs, 8155 ztest_opts.zo_datasets, 8156 ztest_opts.zo_threads, 8157 ztest_opts.zo_raid_children, 8158 ztest_opts.zo_raid_type, 8159 ztest_opts.zo_time); 8160 } 8161 8162 cmd = umem_alloc(MAXNAMELEN, UMEM_NOFAIL); 8163 (void) strlcpy(cmd, getexecname(), MAXNAMELEN); 8164 8165 zs->zs_do_init = B_TRUE; 8166 if (strlen(ztest_opts.zo_alt_ztest) != 0) { 8167 if (ztest_opts.zo_verbose >= 1) { 8168 (void) printf("Executing older ztest for " 8169 "initialization: %s\n", ztest_opts.zo_alt_ztest); 8170 } 8171 VERIFY(!exec_child(ztest_opts.zo_alt_ztest, 8172 ztest_opts.zo_alt_libpath, B_FALSE, NULL)); 8173 } else { 8174 VERIFY(!exec_child(NULL, NULL, B_FALSE, NULL)); 8175 } 8176 zs->zs_do_init = B_FALSE; 8177 8178 zs->zs_proc_start = gethrtime(); 8179 zs->zs_proc_stop = zs->zs_proc_start + ztest_opts.zo_time * NANOSEC; 8180 8181 for (f = 0; f < ZTEST_FUNCS; f++) { 8182 zi = &ztest_info[f]; 8183 zc = ZTEST_GET_SHARED_CALLSTATE(f); 8184 if (zs->zs_proc_start + zi->zi_interval[0] > zs->zs_proc_stop) 8185 zc->zc_next = UINT64_MAX; 8186 else 8187 zc->zc_next = zs->zs_proc_start + 8188 ztest_random(2 * zi->zi_interval[0] + 1); 8189 } 8190 8191 /* 8192 * Run the tests in a loop. These tests include fault injection 8193 * to verify that self-healing data works, and forced crashes 8194 * to verify that we never lose on-disk consistency. 8195 */ 8196 while (gethrtime() < zs->zs_proc_stop) { 8197 int status; 8198 boolean_t killed; 8199 8200 /* 8201 * Initialize the workload counters for each function. 
8202 */ 8203 for (f = 0; f < ZTEST_FUNCS; f++) { 8204 zc = ZTEST_GET_SHARED_CALLSTATE(f); 8205 zc->zc_count = 0; 8206 zc->zc_time = 0; 8207 } 8208 8209 /* Set the allocation switch size */ 8210 zs->zs_metaslab_df_alloc_threshold = 8211 ztest_random(zs->zs_metaslab_sz / 4) + 1; 8212 8213 if (!hasalt || ztest_random(2) == 0) { 8214 if (hasalt && ztest_opts.zo_verbose >= 1) { 8215 (void) printf("Executing newer ztest: %s\n", 8216 cmd); 8217 } 8218 newer++; 8219 killed = exec_child(cmd, NULL, B_TRUE, &status); 8220 } else { 8221 if (hasalt && ztest_opts.zo_verbose >= 1) { 8222 (void) printf("Executing older ztest: %s\n", 8223 ztest_opts.zo_alt_ztest); 8224 } 8225 older++; 8226 killed = exec_child(ztest_opts.zo_alt_ztest, 8227 ztest_opts.zo_alt_libpath, B_TRUE, &status); 8228 } 8229 8230 if (killed) 8231 kills++; 8232 iters++; 8233 8234 if (ztest_opts.zo_verbose >= 1) { 8235 hrtime_t now = gethrtime(); 8236 8237 now = MIN(now, zs->zs_proc_stop); 8238 print_time(zs->zs_proc_stop - now, timebuf); 8239 nicenum(zs->zs_space, numbuf, sizeof (numbuf)); 8240 8241 (void) printf("Pass %3d, %8s, %3"PRIu64" ENOSPC, " 8242 "%4.1f%% of %5s used, %3.0f%% done, %8s to go\n", 8243 iters, 8244 WIFEXITED(status) ? 
"Complete" : "SIGKILL", 8245 zs->zs_enospc_count, 8246 100.0 * zs->zs_alloc / zs->zs_space, 8247 numbuf, 8248 100.0 * (now - zs->zs_proc_start) / 8249 (ztest_opts.zo_time * NANOSEC), timebuf); 8250 } 8251 8252 if (ztest_opts.zo_verbose >= 2) { 8253 (void) printf("\nWorkload summary:\n\n"); 8254 (void) printf("%7s %9s %s\n", 8255 "Calls", "Time", "Function"); 8256 (void) printf("%7s %9s %s\n", 8257 "-----", "----", "--------"); 8258 for (f = 0; f < ZTEST_FUNCS; f++) { 8259 zi = &ztest_info[f]; 8260 zc = ZTEST_GET_SHARED_CALLSTATE(f); 8261 print_time(zc->zc_time, timebuf); 8262 (void) printf("%7"PRIu64" %9s %s\n", 8263 zc->zc_count, timebuf, 8264 zi->zi_funcname); 8265 } 8266 (void) printf("\n"); 8267 } 8268 8269 if (!ztest_opts.zo_mmp_test) 8270 ztest_run_zdb(ztest_opts.zo_pool); 8271 } 8272 8273 if (ztest_opts.zo_verbose >= 1) { 8274 if (hasalt) { 8275 (void) printf("%d runs of older ztest: %s\n", older, 8276 ztest_opts.zo_alt_ztest); 8277 (void) printf("%d runs of newer ztest: %s\n", newer, 8278 cmd); 8279 } 8280 (void) printf("%d killed, %d completed, %.0f%% kill rate\n", 8281 kills, iters - kills, (100.0 * kills) / MAX(1, iters)); 8282 } 8283 8284 umem_free(cmd, MAXNAMELEN); 8285 8286 return (0); 8287 } 8288