1 // SPDX-License-Identifier: CDDL-1.0 2 /* 3 * CDDL HEADER START 4 * 5 * The contents of this file are subject to the terms of the 6 * Common Development and Distribution License (the "License"). 7 * You may not use this file except in compliance with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or https://opensource.org/licenses/CDDL-1.0. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22 /* 23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 24 * Copyright (c) 2011, 2024 by Delphix. All rights reserved. 25 * Copyright 2011 Nexenta Systems, Inc. All rights reserved. 26 * Copyright (c) 2013 Steven Hartland. All rights reserved. 27 * Copyright (c) 2014 Integros [integros.com] 28 * Copyright 2017 Joyent, Inc. 29 * Copyright (c) 2017, Intel Corporation. 30 * Copyright (c) 2023, Klara, Inc. 31 */ 32 33 /* 34 * The objective of this program is to provide a DMU/ZAP/SPA stress test 35 * that runs entirely in userland, is easy to use, and easy to extend. 36 * 37 * The overall design of the ztest program is as follows: 38 * 39 * (1) For each major functional area (e.g. adding vdevs to a pool, 40 * creating and destroying datasets, reading and writing objects, etc) 41 * we have a simple routine to test that functionality. These 42 * individual routines do not have to do anything "stressful". 43 * 44 * (2) We turn these simple functionality tests into a stress test by 45 * running them all in parallel, with as many threads as desired, 46 * and spread across as many datasets, objects, and vdevs as desired. 47 * 48 * (3) While all this is happening, we inject faults into the pool to 49 * verify that self-healing data really works. 50 * 51 * (4) Every time we open a dataset, we change its checksum and compression 52 * functions. Thus even individual objects vary from block to block 53 * in which checksum they use and whether they're compressed. 54 * 55 * (5) To verify that we never lose on-disk consistency after a crash, 56 * we run the entire test in a child of the main process. 57 * At random times, the child self-immolates with a SIGKILL. 58 * This is the software equivalent of pulling the power cord. 59 * The parent then runs the test again, using the existing 60 * storage pool, as many times as desired. If backwards compatibility 61 * testing is enabled ztest will sometimes run the "older" version 62 * of ztest after a SIGKILL. 63 * 64 * (6) To verify that we don't have future leaks or temporal incursions, 65 * many of the functional tests record the transaction group number 66 * as part of their data. When reading old data, they verify that 67 * the transaction group number is less than the current, open txg. 68 * If you add a new test, please do this if applicable. 69 * 70 * (7) Threads are created with a reduced stack size, for sanity checking. 71 * Therefore, it's important not to allocate huge buffers on the stack. 72 * 73 * When run with no arguments, ztest runs for about five minutes and 74 * produces no output if successful. 
To get a little bit of information, 75 * specify -V. To get more information, specify -VV, and so on. 76 * 77 * To turn this into an overnight stress test, use -T to specify run time. 78 * 79 * You can ask more vdevs [-v], datasets [-d], or threads [-t] 80 * to increase the pool capacity, fanout, and overall stress level. 81 * 82 * Use the -k option to set the desired frequency of kills. 83 * 84 * When ztest invokes itself it passes all relevant information through a 85 * temporary file which is mmap-ed in the child process. This allows shared 86 * memory to survive the exec syscall. The ztest_shared_hdr_t struct is always 87 * stored at offset 0 of this file and contains information on the size and 88 * number of shared structures in the file. The information stored in this file 89 * must remain backwards compatible with older versions of ztest so that 90 * ztest can invoke them during backwards compatibility testing (-B). 91 */ 92 93 #include <sys/zfs_context.h> 94 #include <sys/spa.h> 95 #include <sys/dmu.h> 96 #include <sys/txg.h> 97 #include <sys/dbuf.h> 98 #include <sys/zap.h> 99 #include <sys/dmu_objset.h> 100 #include <sys/poll.h> 101 #include <sys/stat.h> 102 #include <sys/time.h> 103 #include <sys/wait.h> 104 #include <sys/mman.h> 105 #include <sys/resource.h> 106 #include <sys/zio.h> 107 #include <sys/zil.h> 108 #include <sys/zil_impl.h> 109 #include <sys/vdev_draid.h> 110 #include <sys/vdev_impl.h> 111 #include <sys/vdev_file.h> 112 #include <sys/vdev_initialize.h> 113 #include <sys/vdev_raidz.h> 114 #include <sys/vdev_trim.h> 115 #include <sys/spa_impl.h> 116 #include <sys/metaslab_impl.h> 117 #include <sys/dsl_prop.h> 118 #include <sys/dsl_dataset.h> 119 #include <sys/dsl_destroy.h> 120 #include <sys/dsl_scan.h> 121 #include <sys/zio_checksum.h> 122 #include <sys/zfs_refcount.h> 123 #include <sys/zfeature.h> 124 #include <sys/dsl_userhold.h> 125 #include <sys/abd.h> 126 #include <sys/blake3.h> 127 #include <stdio.h> 128 #include <stdlib.h> 129 #include <unistd.h> 130 #include <getopt.h> 131 #include <signal.h> 132 #include <umem.h> 133 #include <ctype.h> 134 #include <math.h> 135 #include <sys/fs/zfs.h> 136 #include <zfs_fletcher.h> 137 #include <libnvpair.h> 138 #include <libzutil.h> 139 #include <sys/crypto/icp.h> 140 #include <sys/zfs_impl.h> 141 #include <sys/backtrace.h> 142 #include <libzpool.h> 143 #include <libspl.h> 144 145 static int ztest_fd_data = -1; 146 147 typedef struct ztest_shared_hdr { 148 uint64_t zh_hdr_size; 149 uint64_t zh_opts_size; 150 uint64_t zh_size; 151 uint64_t zh_stats_size; 152 uint64_t zh_stats_count; 153 uint64_t zh_ds_size; 154 uint64_t zh_ds_count; 155 uint64_t zh_scratch_state_size; 156 } ztest_shared_hdr_t; 157 158 static ztest_shared_hdr_t *ztest_shared_hdr; 159 160 enum ztest_class_state { 161 ZTEST_VDEV_CLASS_OFF, 162 ZTEST_VDEV_CLASS_ON, 163 ZTEST_VDEV_CLASS_RND 164 }; 165 166 /* Dedicated RAIDZ Expansion test states */ 167 typedef enum { 168 RAIDZ_EXPAND_NONE, /* Default is none, must opt-in */ 169 RAIDZ_EXPAND_REQUESTED, /* The '-X' option was used */ 170 RAIDZ_EXPAND_STARTED, /* Testing has commenced */ 171 RAIDZ_EXPAND_KILLED, /* Reached the proccess kill */ 172 RAIDZ_EXPAND_CHECKED, /* Pool scrub verification done */ 173 } raidz_expand_test_state_t; 174 175 176 #define ZO_GVARS_MAX_ARGLEN ((size_t)64) 177 #define ZO_GVARS_MAX_COUNT ((size_t)10) 178 179 typedef struct ztest_shared_opts { 180 char zo_pool[ZFS_MAX_DATASET_NAME_LEN]; 181 char zo_dir[ZFS_MAX_DATASET_NAME_LEN]; 182 char zo_alt_ztest[MAXNAMELEN]; 183 char 
zo_alt_libpath[MAXNAMELEN]; 184 uint64_t zo_vdevs; 185 uint64_t zo_vdevtime; 186 size_t zo_vdev_size; 187 int zo_ashift; 188 int zo_mirrors; 189 int zo_raid_do_expand; 190 int zo_raid_children; 191 int zo_raid_parity; 192 char zo_raid_type[8]; 193 int zo_draid_data; 194 int zo_draid_spares; 195 int zo_datasets; 196 int zo_threads; 197 uint64_t zo_passtime; 198 uint64_t zo_killrate; 199 int zo_verbose; 200 int zo_init; 201 uint64_t zo_time; 202 uint64_t zo_maxloops; 203 uint64_t zo_metaslab_force_ganging; 204 raidz_expand_test_state_t zo_raidz_expand_test; 205 int zo_mmp_test; 206 int zo_special_vdevs; 207 int zo_dump_dbgmsg; 208 int zo_gvars_count; 209 char zo_gvars[ZO_GVARS_MAX_COUNT][ZO_GVARS_MAX_ARGLEN]; 210 } ztest_shared_opts_t; 211 212 /* Default values for command line options. */ 213 #define DEFAULT_POOL "ztest" 214 #define DEFAULT_VDEV_DIR "/tmp" 215 #define DEFAULT_VDEV_COUNT 5 216 #define DEFAULT_VDEV_SIZE (SPA_MINDEVSIZE * 4) /* 256m default size */ 217 #define DEFAULT_VDEV_SIZE_STR "256M" 218 #define DEFAULT_ASHIFT SPA_MINBLOCKSHIFT 219 #define DEFAULT_MIRRORS 2 220 #define DEFAULT_RAID_CHILDREN 4 221 #define DEFAULT_RAID_PARITY 1 222 #define DEFAULT_DRAID_DATA 4 223 #define DEFAULT_DRAID_SPARES 1 224 #define DEFAULT_DATASETS_COUNT 7 225 #define DEFAULT_THREADS 23 226 #define DEFAULT_RUN_TIME 300 /* 300 seconds */ 227 #define DEFAULT_RUN_TIME_STR "300 sec" 228 #define DEFAULT_PASS_TIME 60 /* 60 seconds */ 229 #define DEFAULT_PASS_TIME_STR "60 sec" 230 #define DEFAULT_KILL_RATE 70 /* 70% kill rate */ 231 #define DEFAULT_KILLRATE_STR "70%" 232 #define DEFAULT_INITS 1 233 #define DEFAULT_MAX_LOOPS 50 /* 5 minutes */ 234 #define DEFAULT_FORCE_GANGING (64 << 10) 235 #define DEFAULT_FORCE_GANGING_STR "64K" 236 237 /* Simplifying assumption: -1 is not a valid default. 
*/ 238 #define NO_DEFAULT -1 239 240 static const ztest_shared_opts_t ztest_opts_defaults = { 241 .zo_pool = DEFAULT_POOL, 242 .zo_dir = DEFAULT_VDEV_DIR, 243 .zo_alt_ztest = { '\0' }, 244 .zo_alt_libpath = { '\0' }, 245 .zo_vdevs = DEFAULT_VDEV_COUNT, 246 .zo_ashift = DEFAULT_ASHIFT, 247 .zo_mirrors = DEFAULT_MIRRORS, 248 .zo_raid_children = DEFAULT_RAID_CHILDREN, 249 .zo_raid_parity = DEFAULT_RAID_PARITY, 250 .zo_raid_type = VDEV_TYPE_RAIDZ, 251 .zo_vdev_size = DEFAULT_VDEV_SIZE, 252 .zo_draid_data = DEFAULT_DRAID_DATA, /* data drives */ 253 .zo_draid_spares = DEFAULT_DRAID_SPARES, /* distributed spares */ 254 .zo_datasets = DEFAULT_DATASETS_COUNT, 255 .zo_threads = DEFAULT_THREADS, 256 .zo_passtime = DEFAULT_PASS_TIME, 257 .zo_killrate = DEFAULT_KILL_RATE, 258 .zo_verbose = 0, 259 .zo_mmp_test = 0, 260 .zo_init = DEFAULT_INITS, 261 .zo_time = DEFAULT_RUN_TIME, 262 .zo_maxloops = DEFAULT_MAX_LOOPS, /* max loops during spa_freeze() */ 263 .zo_metaslab_force_ganging = DEFAULT_FORCE_GANGING, 264 .zo_special_vdevs = ZTEST_VDEV_CLASS_RND, 265 .zo_gvars_count = 0, 266 .zo_raidz_expand_test = RAIDZ_EXPAND_NONE, 267 }; 268 269 extern uint64_t metaslab_force_ganging; 270 extern uint64_t metaslab_df_alloc_threshold; 271 extern uint64_t zfs_deadman_synctime_ms; 272 extern uint_t metaslab_preload_limit; 273 extern int zfs_compressed_arc_enabled; 274 extern int zfs_abd_scatter_enabled; 275 extern uint_t dmu_object_alloc_chunk_shift; 276 extern boolean_t zfs_force_some_double_word_sm_entries; 277 extern unsigned long zfs_reconstruct_indirect_damage_fraction; 278 extern uint64_t raidz_expand_max_reflow_bytes; 279 extern uint_t raidz_expand_pause_point; 280 extern boolean_t ddt_prune_artificial_age; 281 extern boolean_t ddt_dump_prune_histogram; 282 283 284 static ztest_shared_opts_t *ztest_shared_opts; 285 static ztest_shared_opts_t ztest_opts; 286 static const char *const ztest_wkeydata = "abcdefghijklmnopqrstuvwxyz012345"; 287 288 typedef struct ztest_shared_ds { 289 uint64_t zd_seq; 290 } ztest_shared_ds_t; 291 292 static ztest_shared_ds_t *ztest_shared_ds; 293 #define ZTEST_GET_SHARED_DS(d) (&ztest_shared_ds[d]) 294 295 typedef struct ztest_scratch_state { 296 uint64_t zs_raidz_scratch_verify_pause; 297 } ztest_shared_scratch_state_t; 298 299 static ztest_shared_scratch_state_t *ztest_scratch_state; 300 301 #define BT_MAGIC 0x123456789abcdefULL 302 #define MAXFAULTS(zs) \ 303 (MAX((zs)->zs_mirrors, 1) * (ztest_opts.zo_raid_parity + 1) - 1) 304 305 enum ztest_io_type { 306 ZTEST_IO_WRITE_TAG, 307 ZTEST_IO_WRITE_PATTERN, 308 ZTEST_IO_WRITE_ZEROES, 309 ZTEST_IO_TRUNCATE, 310 ZTEST_IO_SETATTR, 311 ZTEST_IO_REWRITE, 312 ZTEST_IO_TYPES 313 }; 314 315 typedef struct ztest_block_tag { 316 uint64_t bt_magic; 317 uint64_t bt_objset; 318 uint64_t bt_object; 319 uint64_t bt_dnodesize; 320 uint64_t bt_offset; 321 uint64_t bt_gen; 322 uint64_t bt_txg; 323 uint64_t bt_crtxg; 324 } ztest_block_tag_t; 325 326 typedef struct bufwad { 327 uint64_t bw_index; 328 uint64_t bw_txg; 329 uint64_t bw_data; 330 } bufwad_t; 331 332 /* 333 * It would be better to use a rangelock_t per object. Unfortunately 334 * the rangelock_t is not a drop-in replacement for rl_t, because we 335 * still need to map from object ID to rangelock_t. 
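 *
 * Instead, each dataset keeps a small fixed array of reader/writer
 * locks (rll_t) and an object/offset pair is hashed onto one of them,
 * roughly:
 *
 *   rll_t *rll = &zd->zd_range_lock[
 *       (object ^ (offset % (ZTEST_RANGE_LOCKS + 1))) &
 *       (ZTEST_RANGE_LOCKS - 1)];
 *
 * (see ztest_range_lock() below; ztest_object_lock() hashes on
 * object & (ZTEST_OBJECT_LOCKS - 1) the same way).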
336 */ 337 typedef enum { 338 ZTRL_READER, 339 ZTRL_WRITER, 340 ZTRL_APPEND 341 } rl_type_t; 342 343 typedef struct rll { 344 void *rll_writer; 345 int rll_readers; 346 kmutex_t rll_lock; 347 kcondvar_t rll_cv; 348 } rll_t; 349 350 typedef struct rl { 351 uint64_t rl_object; 352 uint64_t rl_offset; 353 uint64_t rl_size; 354 rll_t *rl_lock; 355 } rl_t; 356 357 #define ZTEST_RANGE_LOCKS 64 358 #define ZTEST_OBJECT_LOCKS 64 359 360 /* 361 * Object descriptor. Used as a template for object lookup/create/remove. 362 */ 363 typedef struct ztest_od { 364 uint64_t od_dir; 365 uint64_t od_object; 366 dmu_object_type_t od_type; 367 dmu_object_type_t od_crtype; 368 uint64_t od_blocksize; 369 uint64_t od_crblocksize; 370 uint64_t od_crdnodesize; 371 uint64_t od_gen; 372 uint64_t od_crgen; 373 char od_name[ZFS_MAX_DATASET_NAME_LEN]; 374 } ztest_od_t; 375 376 /* 377 * Per-dataset state. 378 */ 379 typedef struct ztest_ds { 380 ztest_shared_ds_t *zd_shared; 381 objset_t *zd_os; 382 pthread_rwlock_t zd_zilog_lock; 383 zilog_t *zd_zilog; 384 ztest_od_t *zd_od; /* debugging aid */ 385 char zd_name[ZFS_MAX_DATASET_NAME_LEN]; 386 kmutex_t zd_dirobj_lock; 387 rll_t zd_object_lock[ZTEST_OBJECT_LOCKS]; 388 rll_t zd_range_lock[ZTEST_RANGE_LOCKS]; 389 } ztest_ds_t; 390 391 /* 392 * Per-iteration state. 393 */ 394 typedef void ztest_func_t(ztest_ds_t *zd, uint64_t id); 395 396 typedef struct ztest_info { 397 ztest_func_t *zi_func; /* test function */ 398 uint64_t zi_iters; /* iterations per execution */ 399 uint64_t *zi_interval; /* execute every <interval> seconds */ 400 const char *zi_funcname; /* name of test function */ 401 } ztest_info_t; 402 403 typedef struct ztest_shared_callstate { 404 uint64_t zc_count; /* per-pass count */ 405 uint64_t zc_time; /* per-pass time */ 406 uint64_t zc_next; /* next time to call this function */ 407 } ztest_shared_callstate_t; 408 409 static ztest_shared_callstate_t *ztest_shared_callstate; 410 #define ZTEST_GET_SHARED_CALLSTATE(c) (&ztest_shared_callstate[c]) 411 412 ztest_func_t ztest_dmu_read_write; 413 ztest_func_t ztest_dmu_write_parallel; 414 ztest_func_t ztest_dmu_object_alloc_free; 415 ztest_func_t ztest_dmu_object_next_chunk; 416 ztest_func_t ztest_dmu_commit_callbacks; 417 ztest_func_t ztest_zap; 418 ztest_func_t ztest_zap_parallel; 419 ztest_func_t ztest_zil_commit; 420 ztest_func_t ztest_zil_remount; 421 ztest_func_t ztest_dmu_read_write_zcopy; 422 ztest_func_t ztest_dmu_objset_create_destroy; 423 ztest_func_t ztest_dmu_prealloc; 424 ztest_func_t ztest_fzap; 425 ztest_func_t ztest_dmu_snapshot_create_destroy; 426 ztest_func_t ztest_dsl_prop_get_set; 427 ztest_func_t ztest_spa_prop_get_set; 428 ztest_func_t ztest_spa_create_destroy; 429 ztest_func_t ztest_fault_inject; 430 ztest_func_t ztest_dmu_snapshot_hold; 431 ztest_func_t ztest_mmp_enable_disable; 432 ztest_func_t ztest_scrub; 433 ztest_func_t ztest_dsl_dataset_promote_busy; 434 ztest_func_t ztest_vdev_attach_detach; 435 ztest_func_t ztest_vdev_raidz_attach; 436 ztest_func_t ztest_vdev_LUN_growth; 437 ztest_func_t ztest_vdev_add_remove; 438 ztest_func_t ztest_vdev_class_add; 439 ztest_func_t ztest_vdev_aux_add_remove; 440 ztest_func_t ztest_split_pool; 441 ztest_func_t ztest_reguid; 442 ztest_func_t ztest_spa_upgrade; 443 ztest_func_t ztest_device_removal; 444 ztest_func_t ztest_spa_checkpoint_create_discard; 445 ztest_func_t ztest_initialize; 446 ztest_func_t ztest_trim; 447 ztest_func_t ztest_blake3; 448 ztest_func_t ztest_fletcher; 449 ztest_func_t ztest_fletcher_incr; 450 ztest_func_t ztest_verify_dnode_bt; 
451 ztest_func_t ztest_pool_prefetch_ddt; 452 ztest_func_t ztest_ddt_prune; 453 454 static uint64_t zopt_always = 0ULL * NANOSEC; /* all the time */ 455 static uint64_t zopt_incessant = 1ULL * NANOSEC / 10; /* every 1/10 second */ 456 static uint64_t zopt_often = 1ULL * NANOSEC; /* every second */ 457 static uint64_t zopt_sometimes = 10ULL * NANOSEC; /* every 10 seconds */ 458 static uint64_t zopt_rarely = 60ULL * NANOSEC; /* every 60 seconds */ 459 460 #define ZTI_INIT(func, iters, interval) \ 461 { .zi_func = (func), \ 462 .zi_iters = (iters), \ 463 .zi_interval = (interval), \ 464 .zi_funcname = # func } 465 466 static ztest_info_t ztest_info[] = { 467 ZTI_INIT(ztest_dmu_read_write, 1, &zopt_always), 468 ZTI_INIT(ztest_dmu_write_parallel, 10, &zopt_always), 469 ZTI_INIT(ztest_dmu_object_alloc_free, 1, &zopt_always), 470 ZTI_INIT(ztest_dmu_object_next_chunk, 1, &zopt_sometimes), 471 ZTI_INIT(ztest_dmu_commit_callbacks, 1, &zopt_always), 472 ZTI_INIT(ztest_zap, 30, &zopt_always), 473 ZTI_INIT(ztest_zap_parallel, 100, &zopt_always), 474 ZTI_INIT(ztest_split_pool, 1, &zopt_sometimes), 475 ZTI_INIT(ztest_zil_commit, 1, &zopt_incessant), 476 ZTI_INIT(ztest_zil_remount, 1, &zopt_sometimes), 477 ZTI_INIT(ztest_dmu_read_write_zcopy, 1, &zopt_often), 478 ZTI_INIT(ztest_dmu_objset_create_destroy, 1, &zopt_often), 479 ZTI_INIT(ztest_dsl_prop_get_set, 1, &zopt_often), 480 ZTI_INIT(ztest_spa_prop_get_set, 1, &zopt_sometimes), 481 #if 0 482 ZTI_INIT(ztest_dmu_prealloc, 1, &zopt_sometimes), 483 #endif 484 ZTI_INIT(ztest_fzap, 1, &zopt_sometimes), 485 ZTI_INIT(ztest_dmu_snapshot_create_destroy, 1, &zopt_sometimes), 486 ZTI_INIT(ztest_spa_create_destroy, 1, &zopt_sometimes), 487 ZTI_INIT(ztest_fault_inject, 1, &zopt_sometimes), 488 ZTI_INIT(ztest_dmu_snapshot_hold, 1, &zopt_sometimes), 489 ZTI_INIT(ztest_mmp_enable_disable, 1, &zopt_sometimes), 490 ZTI_INIT(ztest_reguid, 1, &zopt_rarely), 491 ZTI_INIT(ztest_scrub, 1, &zopt_rarely), 492 ZTI_INIT(ztest_spa_upgrade, 1, &zopt_rarely), 493 ZTI_INIT(ztest_dsl_dataset_promote_busy, 1, &zopt_rarely), 494 ZTI_INIT(ztest_vdev_attach_detach, 1, &zopt_sometimes), 495 ZTI_INIT(ztest_vdev_raidz_attach, 1, &zopt_sometimes), 496 ZTI_INIT(ztest_vdev_LUN_growth, 1, &zopt_rarely), 497 ZTI_INIT(ztest_vdev_add_remove, 1, &ztest_opts.zo_vdevtime), 498 ZTI_INIT(ztest_vdev_class_add, 1, &ztest_opts.zo_vdevtime), 499 ZTI_INIT(ztest_vdev_aux_add_remove, 1, &ztest_opts.zo_vdevtime), 500 ZTI_INIT(ztest_device_removal, 1, &zopt_sometimes), 501 ZTI_INIT(ztest_spa_checkpoint_create_discard, 1, &zopt_rarely), 502 ZTI_INIT(ztest_initialize, 1, &zopt_sometimes), 503 ZTI_INIT(ztest_trim, 1, &zopt_sometimes), 504 ZTI_INIT(ztest_blake3, 1, &zopt_rarely), 505 ZTI_INIT(ztest_fletcher, 1, &zopt_rarely), 506 ZTI_INIT(ztest_fletcher_incr, 1, &zopt_rarely), 507 ZTI_INIT(ztest_verify_dnode_bt, 1, &zopt_sometimes), 508 ZTI_INIT(ztest_pool_prefetch_ddt, 1, &zopt_rarely), 509 ZTI_INIT(ztest_ddt_prune, 1, &zopt_rarely), 510 }; 511 512 #define ZTEST_FUNCS (sizeof (ztest_info) / sizeof (ztest_info_t)) 513 514 /* 515 * The following struct is used to hold a list of uncalled commit callbacks. 516 * The callbacks are ordered by txg number. 517 */ 518 typedef struct ztest_cb_list { 519 kmutex_t zcl_callbacks_lock; 520 list_t zcl_callbacks; 521 } ztest_cb_list_t; 522 523 /* 524 * Stuff we need to share writably between parent and child. 
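 *
 * This is one of the shared regions carried in the mmap-ed temporary
 * file described at the top of this file (its size is presumably what
 * zh_size in ztest_shared_hdr_t records), so its contents survive the
 * exec of each child ztest process.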
525 */ 526 typedef struct ztest_shared { 527 boolean_t zs_do_init; 528 hrtime_t zs_proc_start; 529 hrtime_t zs_proc_stop; 530 hrtime_t zs_thread_start; 531 hrtime_t zs_thread_stop; 532 hrtime_t zs_thread_kill; 533 uint64_t zs_enospc_count; 534 uint64_t zs_vdev_next_leaf; 535 uint64_t zs_vdev_aux; 536 uint64_t zs_alloc; 537 uint64_t zs_space; 538 uint64_t zs_splits; 539 uint64_t zs_mirrors; 540 uint64_t zs_metaslab_sz; 541 uint64_t zs_metaslab_df_alloc_threshold; 542 uint64_t zs_guid; 543 } ztest_shared_t; 544 545 #define ID_PARALLEL -1ULL 546 547 static char ztest_dev_template[] = "%s/%s.%llua"; 548 static char ztest_aux_template[] = "%s/%s.%s.%llu"; 549 static ztest_shared_t *ztest_shared; 550 551 static spa_t *ztest_spa = NULL; 552 static ztest_ds_t *ztest_ds; 553 554 static kmutex_t ztest_vdev_lock; 555 static boolean_t ztest_device_removal_active = B_FALSE; 556 static boolean_t ztest_pool_scrubbed = B_FALSE; 557 static kmutex_t ztest_checkpoint_lock; 558 559 /* 560 * The ztest_name_lock protects the pool and dataset namespace used by 561 * the individual tests. To modify the namespace, consumers must grab 562 * this lock as writer. Grabbing the lock as reader will ensure that the 563 * namespace does not change while the lock is held. 564 */ 565 static pthread_rwlock_t ztest_name_lock; 566 567 static boolean_t ztest_dump_core = B_TRUE; 568 static boolean_t ztest_exiting; 569 570 /* Global commit callback list */ 571 static ztest_cb_list_t zcl; 572 /* Commit cb delay */ 573 static uint64_t zc_min_txg_delay = UINT64_MAX; 574 static int zc_cb_counter = 0; 575 576 /* 577 * Minimum number of commit callbacks that need to be registered for us to check 578 * whether the minimum txg delay is acceptable. 579 */ 580 #define ZTEST_COMMIT_CB_MIN_REG 100 581 582 /* 583 * If a number of txgs equal to this threshold have been created after a commit 584 * callback has been registered but not called, then we assume there is an 585 * implementation bug. 586 */ 587 #define ZTEST_COMMIT_CB_THRESH (TXG_CONCURRENT_STATES + 1000) 588 589 enum ztest_object { 590 ZTEST_META_DNODE = 0, 591 ZTEST_DIROBJ, 592 ZTEST_OBJECTS 593 }; 594 595 static __attribute__((noreturn)) void usage(boolean_t requested); 596 static int ztest_scrub_impl(spa_t *spa); 597 598 /* 599 * These libumem hooks provide a reasonable set of defaults for the allocator's 600 * debugging facilities. 601 */ 602 const char * 603 _umem_debug_init(void) 604 { 605 return ("default,verbose"); /* $UMEM_DEBUG setting */ 606 } 607 608 const char * 609 _umem_logging_init(void) 610 { 611 return ("fail,contents"); /* $UMEM_LOGGING setting */ 612 } 613 614 static void 615 dump_debug_buffer(void) 616 { 617 ssize_t ret __attribute__((unused)); 618 619 if (!ztest_opts.zo_dump_dbgmsg) 620 return; 621 622 /* 623 * We use write() instead of printf() so that this function 624 * is safe to call from a signal handler. 625 */ 626 ret = write(STDERR_FILENO, "\n", 1); 627 zfs_dbgmsg_print(STDERR_FILENO, "ztest"); 628 } 629 630 static void sig_handler(int signo) 631 { 632 struct sigaction action; 633 634 libspl_backtrace(STDERR_FILENO); 635 dump_debug_buffer(); 636 637 /* 638 * Restore default action and re-raise signal so SIGSEGV and 639 * SIGABRT can trigger a core dump. 
640 */ 641 action.sa_handler = SIG_DFL; 642 sigemptyset(&action.sa_mask); 643 action.sa_flags = 0; 644 (void) sigaction(signo, &action, NULL); 645 raise(signo); 646 } 647 648 #define FATAL_MSG_SZ 1024 649 650 static const char *fatal_msg; 651 652 static __attribute__((format(printf, 2, 3))) __attribute__((noreturn)) void 653 fatal(int do_perror, const char *message, ...) 654 { 655 va_list args; 656 int save_errno = errno; 657 char *buf; 658 659 (void) fflush(stdout); 660 buf = umem_alloc(FATAL_MSG_SZ, UMEM_NOFAIL); 661 if (buf == NULL) 662 goto out; 663 664 va_start(args, message); 665 (void) sprintf(buf, "ztest: "); 666 /* LINTED */ 667 (void) vsprintf(buf + strlen(buf), message, args); 668 va_end(args); 669 if (do_perror) { 670 (void) snprintf(buf + strlen(buf), FATAL_MSG_SZ - strlen(buf), 671 ": %s", strerror(save_errno)); 672 } 673 (void) fprintf(stderr, "%s\n", buf); 674 fatal_msg = buf; /* to ease debugging */ 675 676 out: 677 if (ztest_dump_core) 678 abort(); 679 else 680 dump_debug_buffer(); 681 682 exit(3); 683 } 684 685 static int 686 str2shift(const char *buf) 687 { 688 const char *ends = "BKMGTPEZ"; 689 int i, len; 690 691 if (buf[0] == '\0') 692 return (0); 693 694 len = strlen(ends); 695 for (i = 0; i < len; i++) { 696 if (toupper(buf[0]) == ends[i]) 697 break; 698 } 699 if (i == len) { 700 (void) fprintf(stderr, "ztest: invalid bytes suffix: %s\n", 701 buf); 702 usage(B_FALSE); 703 } 704 if (buf[1] == '\0' || (toupper(buf[1]) == 'B' && buf[2] == '\0')) { 705 return (10*i); 706 } 707 (void) fprintf(stderr, "ztest: invalid bytes suffix: %s\n", buf); 708 usage(B_FALSE); 709 } 710 711 static uint64_t 712 nicenumtoull(const char *buf) 713 { 714 char *end; 715 uint64_t val; 716 717 val = strtoull(buf, &end, 0); 718 if (end == buf) { 719 (void) fprintf(stderr, "ztest: bad numeric value: %s\n", buf); 720 usage(B_FALSE); 721 } else if (end[0] == '.') { 722 double fval = strtod(buf, &end); 723 fval *= pow(2, str2shift(end)); 724 /* 725 * UINT64_MAX is not exactly representable as a double. 726 * The closest representation is UINT64_MAX + 1, so we 727 * use a >= comparison instead of > for the bounds check. 728 */ 729 if (fval >= (double)UINT64_MAX) { 730 (void) fprintf(stderr, "ztest: value too large: %s\n", 731 buf); 732 usage(B_FALSE); 733 } 734 val = (uint64_t)fval; 735 } else { 736 int shift = str2shift(end); 737 if (shift >= 64 || (val << shift) >> shift != val) { 738 (void) fprintf(stderr, "ztest: value too large: %s\n", 739 buf); 740 usage(B_FALSE); 741 } 742 val <<= shift; 743 } 744 return (val); 745 } 746 747 typedef struct ztest_option { 748 const char short_opt; 749 const char *long_opt; 750 const char *long_opt_param; 751 const char *comment; 752 unsigned int default_int; 753 const char *default_str; 754 } ztest_option_t; 755 756 /* 757 * The following option_table is used for generating the usage info as well as 758 * the long and short option information for calling getopt_long(). 
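 *
 * For example, the entry below for 'v' / "vdevs" (which takes an INTEGER
 * argument) is turned by init_options() into a struct option of
 * { .name = "vdevs", .has_arg = required_argument, .val = 'v' } and
 * contributes "v:" to the short-option string; entries whose
 * long_opt_param is NULL become no_argument options with no ':'.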
759 */ 760 static ztest_option_t option_table[] = { 761 { 'v', "vdevs", "INTEGER", "Number of vdevs", DEFAULT_VDEV_COUNT, 762 NULL}, 763 { 's', "vdev-size", "INTEGER", "Size of each vdev", 764 NO_DEFAULT, DEFAULT_VDEV_SIZE_STR}, 765 { 'a', "alignment-shift", "INTEGER", 766 "Alignment shift; use 0 for random", DEFAULT_ASHIFT, NULL}, 767 { 'm', "mirror-copies", "INTEGER", "Number of mirror copies", 768 DEFAULT_MIRRORS, NULL}, 769 { 'r', "raid-disks", "INTEGER", "Number of raidz/draid disks", 770 DEFAULT_RAID_CHILDREN, NULL}, 771 { 'R', "raid-parity", "INTEGER", "Raid parity", 772 DEFAULT_RAID_PARITY, NULL}, 773 { 'K', "raid-kind", "raidz|eraidz|draid|random", "Raid kind", 774 NO_DEFAULT, "random"}, 775 { 'D', "draid-data", "INTEGER", "Number of draid data drives", 776 DEFAULT_DRAID_DATA, NULL}, 777 { 'S', "draid-spares", "INTEGER", "Number of draid spares", 778 DEFAULT_DRAID_SPARES, NULL}, 779 { 'd', "datasets", "INTEGER", "Number of datasets", 780 DEFAULT_DATASETS_COUNT, NULL}, 781 { 't', "threads", "INTEGER", "Number of ztest threads", 782 DEFAULT_THREADS, NULL}, 783 { 'g', "gang-block-threshold", "INTEGER", 784 "Metaslab gang block threshold", 785 NO_DEFAULT, DEFAULT_FORCE_GANGING_STR}, 786 { 'i', "init-count", "INTEGER", "Number of times to initialize pool", 787 DEFAULT_INITS, NULL}, 788 { 'k', "kill-percentage", "INTEGER", "Kill percentage", 789 NO_DEFAULT, DEFAULT_KILLRATE_STR}, 790 { 'p', "pool-name", "STRING", "Pool name", 791 NO_DEFAULT, DEFAULT_POOL}, 792 { 'f', "vdev-file-directory", "PATH", "File directory for vdev files", 793 NO_DEFAULT, DEFAULT_VDEV_DIR}, 794 { 'M', "multi-host", NULL, 795 "Multi-host; simulate pool imported on remote host", 796 NO_DEFAULT, NULL}, 797 { 'E', "use-existing-pool", NULL, 798 "Use existing pool instead of creating new one", NO_DEFAULT, NULL}, 799 { 'T', "run-time", "INTEGER", "Total run time", 800 NO_DEFAULT, DEFAULT_RUN_TIME_STR}, 801 { 'P', "pass-time", "INTEGER", "Time per pass", 802 NO_DEFAULT, DEFAULT_PASS_TIME_STR}, 803 { 'F', "freeze-loops", "INTEGER", "Max loops in spa_freeze()", 804 DEFAULT_MAX_LOOPS, NULL}, 805 { 'B', "alt-ztest", "PATH", "Alternate ztest path", 806 NO_DEFAULT, NULL}, 807 { 'C', "vdev-class-state", "on|off|random", "vdev class state", 808 NO_DEFAULT, "random"}, 809 { 'X', "raidz-expansion", NULL, 810 "Perform a dedicated raidz expansion test", 811 NO_DEFAULT, NULL}, 812 { 'o', "option", "\"NAME=VALUE\"", 813 "Set the named tunable to the given value", 814 NO_DEFAULT, NULL}, 815 { 'G', "dump-debug-msg", NULL, 816 "Dump zfs_dbgmsg buffer before exiting due to an error", 817 NO_DEFAULT, NULL}, 818 { 'V', "verbose", NULL, 819 "Verbose (use multiple times for ever more verbosity)", 820 NO_DEFAULT, NULL}, 821 { 'h', "help", NULL, "Show this help", 822 NO_DEFAULT, NULL}, 823 {0, 0, 0, 0, 0, 0} 824 }; 825 826 static struct option *long_opts = NULL; 827 static char *short_opts = NULL; 828 829 static void 830 init_options(void) 831 { 832 ASSERT0P(long_opts); 833 ASSERT0P(short_opts); 834 835 int count = sizeof (option_table) / sizeof (option_table[0]); 836 long_opts = umem_alloc(sizeof (struct option) * count, UMEM_NOFAIL); 837 838 short_opts = umem_alloc(sizeof (char) * 2 * count, UMEM_NOFAIL); 839 int short_opt_index = 0; 840 841 for (int i = 0; i < count; i++) { 842 long_opts[i].val = option_table[i].short_opt; 843 long_opts[i].name = option_table[i].long_opt; 844 long_opts[i].has_arg = option_table[i].long_opt_param != NULL 845 ? 
required_argument : no_argument; 846 long_opts[i].flag = NULL; 847 short_opts[short_opt_index++] = option_table[i].short_opt; 848 if (option_table[i].long_opt_param != NULL) { 849 short_opts[short_opt_index++] = ':'; 850 } 851 } 852 } 853 854 static void 855 fini_options(void) 856 { 857 int count = sizeof (option_table) / sizeof (option_table[0]); 858 859 umem_free(long_opts, sizeof (struct option) * count); 860 umem_free(short_opts, sizeof (char) * 2 * count); 861 862 long_opts = NULL; 863 short_opts = NULL; 864 } 865 866 static __attribute__((noreturn)) void 867 usage(boolean_t requested) 868 { 869 char option[80]; 870 FILE *fp = requested ? stdout : stderr; 871 872 (void) fprintf(fp, "Usage: %s [OPTIONS...]\n", DEFAULT_POOL); 873 for (int i = 0; option_table[i].short_opt != 0; i++) { 874 if (option_table[i].long_opt_param != NULL) { 875 (void) sprintf(option, " -%c --%s=%s", 876 option_table[i].short_opt, 877 option_table[i].long_opt, 878 option_table[i].long_opt_param); 879 } else { 880 (void) sprintf(option, " -%c --%s", 881 option_table[i].short_opt, 882 option_table[i].long_opt); 883 } 884 (void) fprintf(fp, " %-43s%s", option, 885 option_table[i].comment); 886 887 if (option_table[i].long_opt_param != NULL) { 888 if (option_table[i].default_str != NULL) { 889 (void) fprintf(fp, " (default: %s)", 890 option_table[i].default_str); 891 } else if (option_table[i].default_int != NO_DEFAULT) { 892 (void) fprintf(fp, " (default: %u)", 893 option_table[i].default_int); 894 } 895 } 896 (void) fprintf(fp, "\n"); 897 } 898 exit(requested ? 0 : 1); 899 } 900 901 static uint64_t 902 ztest_random(uint64_t range) 903 { 904 uint64_t r; 905 906 if (range == 0) 907 return (0); 908 909 random_get_pseudo_bytes((uint8_t *)&r, sizeof (r)); 910 911 return (r % range); 912 } 913 914 static void 915 ztest_parse_name_value(const char *input, ztest_shared_opts_t *zo) 916 { 917 char name[32]; 918 char *value; 919 int state; 920 921 (void) strlcpy(name, input, sizeof (name)); 922 923 value = strchr(name, '='); 924 if (value == NULL) { 925 (void) fprintf(stderr, "missing value in property=value " 926 "'-C' argument (%s)\n", input); 927 usage(B_FALSE); 928 } 929 *(value) = '\0'; 930 value++; 931 932 if (strcmp(value, "on") == 0) { 933 state = ZTEST_VDEV_CLASS_ON; 934 } else if (strcmp(value, "off") == 0) { 935 state = ZTEST_VDEV_CLASS_OFF; 936 } else if (strcmp(value, "random") == 0) { 937 state = ZTEST_VDEV_CLASS_RND; 938 } else { 939 (void) fprintf(stderr, "invalid property value '%s'\n", value); 940 usage(B_FALSE); 941 } 942 943 if (strcmp(name, "special") == 0) { 944 zo->zo_special_vdevs = state; 945 } else { 946 (void) fprintf(stderr, "invalid property name '%s'\n", name); 947 usage(B_FALSE); 948 } 949 if (zo->zo_verbose >= 3) 950 (void) printf("%s vdev state is '%s'\n", name, value); 951 } 952 953 static void 954 process_options(int argc, char **argv) 955 { 956 char *path; 957 ztest_shared_opts_t *zo = &ztest_opts; 958 959 int opt; 960 uint64_t value; 961 const char *raid_kind = "random"; 962 963 memcpy(zo, &ztest_opts_defaults, sizeof (*zo)); 964 965 init_options(); 966 967 while ((opt = getopt_long(argc, argv, short_opts, long_opts, 968 NULL)) != EOF) { 969 value = 0; 970 switch (opt) { 971 case 'v': 972 case 's': 973 case 'a': 974 case 'm': 975 case 'r': 976 case 'R': 977 case 'D': 978 case 'S': 979 case 'd': 980 case 't': 981 case 'g': 982 case 'i': 983 case 'k': 984 case 'T': 985 case 'P': 986 case 'F': 987 value = nicenumtoull(optarg); 988 } 989 switch (opt) { 990 case 'v': 991 zo->zo_vdevs = value; 
992 break; 993 case 's': 994 zo->zo_vdev_size = MAX(SPA_MINDEVSIZE, value); 995 break; 996 case 'a': 997 zo->zo_ashift = value; 998 break; 999 case 'm': 1000 zo->zo_mirrors = value; 1001 break; 1002 case 'r': 1003 zo->zo_raid_children = MAX(1, value); 1004 break; 1005 case 'R': 1006 zo->zo_raid_parity = MIN(MAX(value, 1), 3); 1007 break; 1008 case 'K': 1009 raid_kind = optarg; 1010 break; 1011 case 'D': 1012 zo->zo_draid_data = MAX(1, value); 1013 break; 1014 case 'S': 1015 zo->zo_draid_spares = MAX(1, value); 1016 break; 1017 case 'd': 1018 zo->zo_datasets = MAX(1, value); 1019 break; 1020 case 't': 1021 zo->zo_threads = MAX(1, value); 1022 break; 1023 case 'g': 1024 zo->zo_metaslab_force_ganging = 1025 MAX(SPA_MINBLOCKSIZE << 1, value); 1026 break; 1027 case 'i': 1028 zo->zo_init = value; 1029 break; 1030 case 'k': 1031 zo->zo_killrate = value; 1032 break; 1033 case 'p': 1034 (void) strlcpy(zo->zo_pool, optarg, 1035 sizeof (zo->zo_pool)); 1036 break; 1037 case 'f': 1038 path = realpath(optarg, NULL); 1039 if (path == NULL) { 1040 (void) fprintf(stderr, "error: %s: %s\n", 1041 optarg, strerror(errno)); 1042 usage(B_FALSE); 1043 } else { 1044 (void) strlcpy(zo->zo_dir, path, 1045 sizeof (zo->zo_dir)); 1046 free(path); 1047 } 1048 break; 1049 case 'M': 1050 zo->zo_mmp_test = 1; 1051 break; 1052 case 'V': 1053 zo->zo_verbose++; 1054 break; 1055 case 'X': 1056 zo->zo_raidz_expand_test = RAIDZ_EXPAND_REQUESTED; 1057 break; 1058 case 'E': 1059 zo->zo_init = 0; 1060 break; 1061 case 'T': 1062 zo->zo_time = value; 1063 break; 1064 case 'P': 1065 zo->zo_passtime = MAX(1, value); 1066 break; 1067 case 'F': 1068 zo->zo_maxloops = MAX(1, value); 1069 break; 1070 case 'B': 1071 (void) strlcpy(zo->zo_alt_ztest, optarg, 1072 sizeof (zo->zo_alt_ztest)); 1073 break; 1074 case 'C': 1075 ztest_parse_name_value(optarg, zo); 1076 break; 1077 case 'o': 1078 if (zo->zo_gvars_count >= ZO_GVARS_MAX_COUNT) { 1079 (void) fprintf(stderr, 1080 "max global var count (%zu) exceeded\n", 1081 ZO_GVARS_MAX_COUNT); 1082 usage(B_FALSE); 1083 } 1084 char *v = zo->zo_gvars[zo->zo_gvars_count]; 1085 if (strlcpy(v, optarg, ZO_GVARS_MAX_ARGLEN) >= 1086 ZO_GVARS_MAX_ARGLEN) { 1087 (void) fprintf(stderr, 1088 "global var option '%s' is too long\n", 1089 optarg); 1090 usage(B_FALSE); 1091 } 1092 zo->zo_gvars_count++; 1093 break; 1094 case 'G': 1095 zo->zo_dump_dbgmsg = 1; 1096 break; 1097 case 'h': 1098 usage(B_TRUE); 1099 break; 1100 case '?': 1101 default: 1102 usage(B_FALSE); 1103 break; 1104 } 1105 } 1106 1107 fini_options(); 1108 1109 /* Force compatible options for raidz expansion run */ 1110 if (zo->zo_raidz_expand_test == RAIDZ_EXPAND_REQUESTED) { 1111 zo->zo_mmp_test = 0; 1112 zo->zo_mirrors = 0; 1113 zo->zo_vdevs = 1; 1114 zo->zo_vdev_size = DEFAULT_VDEV_SIZE * 2; 1115 zo->zo_raid_do_expand = B_FALSE; 1116 raid_kind = "raidz"; 1117 } 1118 1119 if (strcmp(raid_kind, "random") == 0) { 1120 switch (ztest_random(3)) { 1121 case 0: 1122 raid_kind = "raidz"; 1123 break; 1124 case 1: 1125 raid_kind = "eraidz"; 1126 break; 1127 case 2: 1128 raid_kind = "draid"; 1129 break; 1130 } 1131 1132 if (ztest_opts.zo_verbose >= 3) 1133 (void) printf("choosing RAID type '%s'\n", raid_kind); 1134 } 1135 1136 if (strcmp(raid_kind, "draid") == 0) { 1137 uint64_t min_devsize; 1138 1139 /* With fewer disk use 256M, otherwise 128M is OK */ 1140 min_devsize = (ztest_opts.zo_raid_children < 16) ? 
1141 (256ULL << 20) : (128ULL << 20); 1142 1143 /* No top-level mirrors with dRAID for now */ 1144 zo->zo_mirrors = 0; 1145 1146 /* Use more appropriate defaults for dRAID */ 1147 if (zo->zo_vdevs == ztest_opts_defaults.zo_vdevs) 1148 zo->zo_vdevs = 1; 1149 if (zo->zo_raid_children == 1150 ztest_opts_defaults.zo_raid_children) 1151 zo->zo_raid_children = 16; 1152 if (zo->zo_ashift < 12) 1153 zo->zo_ashift = 12; 1154 if (zo->zo_vdev_size < min_devsize) 1155 zo->zo_vdev_size = min_devsize; 1156 1157 if (zo->zo_draid_data + zo->zo_raid_parity > 1158 zo->zo_raid_children - zo->zo_draid_spares) { 1159 (void) fprintf(stderr, "error: too few draid " 1160 "children (%d) for stripe width (%d)\n", 1161 zo->zo_raid_children, 1162 zo->zo_draid_data + zo->zo_raid_parity); 1163 usage(B_FALSE); 1164 } 1165 1166 (void) strlcpy(zo->zo_raid_type, VDEV_TYPE_DRAID, 1167 sizeof (zo->zo_raid_type)); 1168 1169 } else if (strcmp(raid_kind, "eraidz") == 0) { 1170 /* using eraidz (expandable raidz) */ 1171 zo->zo_raid_do_expand = B_TRUE; 1172 1173 /* tests expect top-level to be raidz */ 1174 zo->zo_mirrors = 0; 1175 zo->zo_vdevs = 1; 1176 1177 /* Make sure parity is less than data columns */ 1178 zo->zo_raid_parity = MIN(zo->zo_raid_parity, 1179 zo->zo_raid_children - 1); 1180 1181 } else /* using raidz */ { 1182 ASSERT0(strcmp(raid_kind, "raidz")); 1183 1184 zo->zo_raid_parity = MIN(zo->zo_raid_parity, 1185 zo->zo_raid_children - 1); 1186 } 1187 1188 zo->zo_vdevtime = 1189 (zo->zo_vdevs > 0 ? zo->zo_time * NANOSEC / zo->zo_vdevs : 1190 UINT64_MAX >> 2); 1191 1192 if (*zo->zo_alt_ztest) { 1193 const char *invalid_what = "ztest"; 1194 char *val = zo->zo_alt_ztest; 1195 if (0 != access(val, X_OK) || 1196 (strrchr(val, '/') == NULL && (errno == EINVAL))) 1197 goto invalid; 1198 1199 int dirlen = strrchr(val, '/') - val; 1200 strlcpy(zo->zo_alt_libpath, val, 1201 MIN(sizeof (zo->zo_alt_libpath), dirlen + 1)); 1202 invalid_what = "library path", val = zo->zo_alt_libpath; 1203 if (strrchr(val, '/') == NULL && (errno == EINVAL)) 1204 goto invalid; 1205 *strrchr(val, '/') = '\0'; 1206 strlcat(val, "/lib", sizeof (zo->zo_alt_libpath)); 1207 1208 if (0 != access(zo->zo_alt_libpath, X_OK)) 1209 goto invalid; 1210 return; 1211 1212 invalid: 1213 ztest_dump_core = B_FALSE; 1214 fatal(B_TRUE, "invalid alternate %s %s", invalid_what, val); 1215 } 1216 } 1217 1218 static void 1219 ztest_kill(ztest_shared_t *zs) 1220 { 1221 zs->zs_alloc = metaslab_class_get_alloc(spa_normal_class(ztest_spa)); 1222 zs->zs_space = metaslab_class_get_space(spa_normal_class(ztest_spa)); 1223 1224 /* 1225 * Before we kill ourselves, make sure that the config is updated. 1226 * See comment above spa_write_cachefile(). 1227 */ 1228 if (raidz_expand_pause_point != RAIDZ_EXPAND_PAUSE_NONE) { 1229 if (spa_namespace_tryenter(FTAG)) { 1230 spa_write_cachefile(ztest_spa, B_FALSE, B_FALSE, 1231 B_FALSE); 1232 spa_namespace_exit(FTAG); 1233 1234 ztest_scratch_state->zs_raidz_scratch_verify_pause = 1235 raidz_expand_pause_point; 1236 } else { 1237 /* 1238 * Do not verify scratch object in case if 1239 * spa_namespace_lock cannot be acquired, 1240 * it can cause deadlock in spa_config_update(). 
1241 */ 1242 raidz_expand_pause_point = RAIDZ_EXPAND_PAUSE_NONE; 1243 1244 return; 1245 } 1246 } else { 1247 spa_namespace_enter(FTAG); 1248 spa_write_cachefile(ztest_spa, B_FALSE, B_FALSE, B_FALSE); 1249 spa_namespace_exit(FTAG); 1250 } 1251 1252 (void) raise(SIGKILL); 1253 } 1254 1255 static void 1256 ztest_record_enospc(const char *s) 1257 { 1258 (void) s; 1259 ztest_shared->zs_enospc_count++; 1260 } 1261 1262 static uint64_t 1263 ztest_get_ashift(void) 1264 { 1265 if (ztest_opts.zo_ashift == 0) 1266 return (SPA_MINBLOCKSHIFT + ztest_random(5)); 1267 return (ztest_opts.zo_ashift); 1268 } 1269 1270 static boolean_t 1271 ztest_is_draid_spare(const char *name) 1272 { 1273 uint64_t spare_id = 0, parity = 0, vdev_id = 0; 1274 1275 if (sscanf(name, VDEV_TYPE_DRAID "%"PRIu64"-%"PRIu64"-%"PRIu64"", 1276 &parity, &vdev_id, &spare_id) == 3) { 1277 return (B_TRUE); 1278 } 1279 1280 return (B_FALSE); 1281 } 1282 1283 static nvlist_t * 1284 make_vdev_file(const char *path, const char *aux, const char *pool, 1285 size_t size, uint64_t ashift) 1286 { 1287 char *pathbuf = NULL; 1288 uint64_t vdev; 1289 nvlist_t *file; 1290 boolean_t draid_spare = B_FALSE; 1291 1292 1293 if (ashift == 0) 1294 ashift = ztest_get_ashift(); 1295 1296 if (path == NULL) { 1297 pathbuf = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); 1298 path = pathbuf; 1299 1300 if (aux != NULL) { 1301 vdev = ztest_shared->zs_vdev_aux; 1302 (void) snprintf(pathbuf, MAXPATHLEN, 1303 ztest_aux_template, ztest_opts.zo_dir, 1304 pool == NULL ? ztest_opts.zo_pool : pool, 1305 aux, vdev); 1306 } else { 1307 vdev = ztest_shared->zs_vdev_next_leaf++; 1308 (void) snprintf(pathbuf, MAXPATHLEN, 1309 ztest_dev_template, ztest_opts.zo_dir, 1310 pool == NULL ? ztest_opts.zo_pool : pool, vdev); 1311 } 1312 } else { 1313 draid_spare = ztest_is_draid_spare(path); 1314 } 1315 1316 if (size != 0 && !draid_spare) { 1317 int fd = open(path, O_RDWR | O_CREAT | O_TRUNC, 0666); 1318 if (fd == -1) 1319 fatal(B_TRUE, "can't open %s", path); 1320 if (ftruncate(fd, size) != 0) 1321 fatal(B_TRUE, "can't ftruncate %s", path); 1322 (void) close(fd); 1323 } 1324 1325 file = fnvlist_alloc(); 1326 fnvlist_add_string(file, ZPOOL_CONFIG_TYPE, 1327 draid_spare ? VDEV_TYPE_DRAID_SPARE : VDEV_TYPE_FILE); 1328 fnvlist_add_string(file, ZPOOL_CONFIG_PATH, path); 1329 fnvlist_add_uint64(file, ZPOOL_CONFIG_ASHIFT, ashift); 1330 umem_free(pathbuf, MAXPATHLEN); 1331 1332 return (file); 1333 } 1334 1335 static nvlist_t * 1336 make_vdev_raid(const char *path, const char *aux, const char *pool, size_t size, 1337 uint64_t ashift, int r) 1338 { 1339 nvlist_t *raid, **child; 1340 int c; 1341 1342 if (r < 2) 1343 return (make_vdev_file(path, aux, pool, size, ashift)); 1344 child = umem_alloc(r * sizeof (nvlist_t *), UMEM_NOFAIL); 1345 1346 for (c = 0; c < r; c++) 1347 child[c] = make_vdev_file(path, aux, pool, size, ashift); 1348 1349 raid = fnvlist_alloc(); 1350 fnvlist_add_string(raid, ZPOOL_CONFIG_TYPE, 1351 ztest_opts.zo_raid_type); 1352 fnvlist_add_uint64(raid, ZPOOL_CONFIG_NPARITY, 1353 ztest_opts.zo_raid_parity); 1354 fnvlist_add_nvlist_array(raid, ZPOOL_CONFIG_CHILDREN, 1355 (const nvlist_t **)child, r); 1356 1357 if (strcmp(ztest_opts.zo_raid_type, VDEV_TYPE_DRAID) == 0) { 1358 uint64_t ndata = ztest_opts.zo_draid_data; 1359 uint64_t nparity = ztest_opts.zo_raid_parity; 1360 uint64_t nspares = ztest_opts.zo_draid_spares; 1361 uint64_t children = ztest_opts.zo_raid_children; 1362 uint64_t ngroups = 1; 1363 1364 /* 1365 * Calculate the minimum number of groups required to fill a 1366 * slice. 
This is the LCM of the stripe width (data + parity) 1367 * and the number of data drives (children - spares). 1368 */ 1369 while (ngroups * (ndata + nparity) % (children - nspares) != 0) 1370 ngroups++; 1371 1372 /* Store the basic dRAID configuration. */ 1373 fnvlist_add_uint64(raid, ZPOOL_CONFIG_DRAID_NDATA, ndata); 1374 fnvlist_add_uint64(raid, ZPOOL_CONFIG_DRAID_NSPARES, nspares); 1375 fnvlist_add_uint64(raid, ZPOOL_CONFIG_DRAID_NGROUPS, ngroups); 1376 } 1377 1378 for (c = 0; c < r; c++) 1379 fnvlist_free(child[c]); 1380 1381 umem_free(child, r * sizeof (nvlist_t *)); 1382 1383 return (raid); 1384 } 1385 1386 static nvlist_t * 1387 make_vdev_mirror(const char *path, const char *aux, const char *pool, 1388 size_t size, uint64_t ashift, int r, int m) 1389 { 1390 nvlist_t *mirror, **child; 1391 int c; 1392 1393 if (m < 1) 1394 return (make_vdev_raid(path, aux, pool, size, ashift, r)); 1395 1396 child = umem_alloc(m * sizeof (nvlist_t *), UMEM_NOFAIL); 1397 1398 for (c = 0; c < m; c++) 1399 child[c] = make_vdev_raid(path, aux, pool, size, ashift, r); 1400 1401 mirror = fnvlist_alloc(); 1402 fnvlist_add_string(mirror, ZPOOL_CONFIG_TYPE, VDEV_TYPE_MIRROR); 1403 fnvlist_add_nvlist_array(mirror, ZPOOL_CONFIG_CHILDREN, 1404 (const nvlist_t **)child, m); 1405 1406 for (c = 0; c < m; c++) 1407 fnvlist_free(child[c]); 1408 1409 umem_free(child, m * sizeof (nvlist_t *)); 1410 1411 return (mirror); 1412 } 1413 1414 static nvlist_t * 1415 make_vdev_root(const char *path, const char *aux, const char *pool, size_t size, 1416 uint64_t ashift, const char *class, int r, int m, int t) 1417 { 1418 nvlist_t *root, **child; 1419 int c; 1420 boolean_t log; 1421 1422 ASSERT3S(t, >, 0); 1423 1424 log = (class != NULL && strcmp(class, "log") == 0); 1425 1426 child = umem_alloc(t * sizeof (nvlist_t *), UMEM_NOFAIL); 1427 1428 for (c = 0; c < t; c++) { 1429 child[c] = make_vdev_mirror(path, aux, pool, size, ashift, 1430 r, m); 1431 fnvlist_add_uint64(child[c], ZPOOL_CONFIG_IS_LOG, log); 1432 1433 if (class != NULL && class[0] != '\0') { 1434 ASSERT(m > 1 || log); /* expecting a mirror */ 1435 fnvlist_add_string(child[c], 1436 ZPOOL_CONFIG_ALLOCATION_BIAS, class); 1437 } 1438 } 1439 1440 root = fnvlist_alloc(); 1441 fnvlist_add_string(root, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT); 1442 fnvlist_add_nvlist_array(root, aux ? aux : ZPOOL_CONFIG_CHILDREN, 1443 (const nvlist_t **)child, t); 1444 1445 for (c = 0; c < t; c++) 1446 fnvlist_free(child[c]); 1447 1448 umem_free(child, t * sizeof (nvlist_t *)); 1449 1450 return (root); 1451 } 1452 1453 /* 1454 * Find a random spa version. Returns back a random spa version in the 1455 * range [initial_version, SPA_VERSION_FEATURES]. 1456 */ 1457 static uint64_t 1458 ztest_random_spa_version(uint64_t initial_version) 1459 { 1460 uint64_t version = initial_version; 1461 1462 if (version <= SPA_VERSION_BEFORE_FEATURES) { 1463 version = version + 1464 ztest_random(SPA_VERSION_BEFORE_FEATURES - version + 1); 1465 } 1466 1467 if (version > SPA_VERSION_BEFORE_FEATURES) 1468 version = SPA_VERSION_FEATURES; 1469 1470 ASSERT(SPA_VERSION_IS_SUPPORTED(version)); 1471 return (version); 1472 } 1473 1474 static int 1475 ztest_random_blocksize(void) 1476 { 1477 ASSERT3U(ztest_spa->spa_max_ashift, !=, 0); 1478 1479 /* 1480 * Choose a block size >= the ashift. 1481 * If the SPA supports new MAXBLOCKSIZE, test up to 1MB blocks. 
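 *
 * For example, with spa_max_ashift == SPA_MINBLOCKSHIFT (9) and
 * spa_maxblocksize() reporting SPA_MAXBLOCKSIZE (so maxbs == 20),
 * block_shift is drawn from [0, 11] and the returned size ranges
 * from 512 bytes up to 1MB.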
1482 */ 1483 int maxbs = SPA_OLD_MAXBLOCKSHIFT; 1484 if (spa_maxblocksize(ztest_spa) == SPA_MAXBLOCKSIZE) 1485 maxbs = 20; 1486 uint64_t block_shift = 1487 ztest_random(maxbs - ztest_spa->spa_max_ashift + 1); 1488 return (1 << (SPA_MINBLOCKSHIFT + block_shift)); 1489 } 1490 1491 static int 1492 ztest_random_dnodesize(void) 1493 { 1494 int slots; 1495 int max_slots = spa_maxdnodesize(ztest_spa) >> DNODE_SHIFT; 1496 1497 if (max_slots == DNODE_MIN_SLOTS) 1498 return (DNODE_MIN_SIZE); 1499 1500 /* 1501 * Weight the random distribution more heavily toward smaller 1502 * dnode sizes since that is more likely to reflect real-world 1503 * usage. 1504 */ 1505 ASSERT3U(max_slots, >, 4); 1506 switch (ztest_random(10)) { 1507 case 0: 1508 slots = 5 + ztest_random(max_slots - 4); 1509 break; 1510 case 1 ... 4: 1511 slots = 2 + ztest_random(3); 1512 break; 1513 default: 1514 slots = 1; 1515 break; 1516 } 1517 1518 return (slots << DNODE_SHIFT); 1519 } 1520 1521 static int 1522 ztest_random_ibshift(void) 1523 { 1524 return (DN_MIN_INDBLKSHIFT + 1525 ztest_random(DN_MAX_INDBLKSHIFT - DN_MIN_INDBLKSHIFT + 1)); 1526 } 1527 1528 static uint64_t 1529 ztest_random_vdev_top(spa_t *spa, boolean_t log_ok) 1530 { 1531 uint64_t top; 1532 vdev_t *rvd = spa->spa_root_vdev; 1533 vdev_t *tvd; 1534 1535 ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); 1536 1537 do { 1538 top = ztest_random(rvd->vdev_children); 1539 tvd = rvd->vdev_child[top]; 1540 } while (!vdev_is_concrete(tvd) || (tvd->vdev_islog && !log_ok) || 1541 tvd->vdev_mg == NULL || tvd->vdev_mg->mg_class == NULL); 1542 1543 return (top); 1544 } 1545 1546 static uint64_t 1547 ztest_random_dsl_prop(zfs_prop_t prop) 1548 { 1549 uint64_t value; 1550 1551 do { 1552 value = zfs_prop_random_value(prop, ztest_random(-1ULL)); 1553 } while (prop == ZFS_PROP_CHECKSUM && value == ZIO_CHECKSUM_OFF); 1554 1555 return (value); 1556 } 1557 1558 static int 1559 ztest_dsl_prop_set_uint64(char *osname, zfs_prop_t prop, uint64_t value, 1560 boolean_t inherit) 1561 { 1562 const char *propname = zfs_prop_to_name(prop); 1563 const char *valname; 1564 char *setpoint; 1565 uint64_t curval; 1566 int error; 1567 1568 error = dsl_prop_set_int(osname, propname, 1569 (inherit ? 
ZPROP_SRC_NONE : ZPROP_SRC_LOCAL), value); 1570 1571 if (error == ENOSPC) { 1572 ztest_record_enospc(FTAG); 1573 return (error); 1574 } 1575 ASSERT0(error); 1576 1577 setpoint = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); 1578 VERIFY0(dsl_prop_get_integer(osname, propname, &curval, setpoint)); 1579 1580 if (ztest_opts.zo_verbose >= 6) { 1581 int err; 1582 1583 err = zfs_prop_index_to_string(prop, curval, &valname); 1584 if (err) 1585 (void) printf("%s %s = %llu at '%s'\n", osname, 1586 propname, (unsigned long long)curval, setpoint); 1587 else 1588 (void) printf("%s %s = %s at '%s'\n", 1589 osname, propname, valname, setpoint); 1590 } 1591 umem_free(setpoint, MAXPATHLEN); 1592 1593 return (error); 1594 } 1595 1596 static int 1597 ztest_spa_prop_set_uint64(zpool_prop_t prop, uint64_t value) 1598 { 1599 spa_t *spa = ztest_spa; 1600 nvlist_t *props = NULL; 1601 int error; 1602 1603 props = fnvlist_alloc(); 1604 fnvlist_add_uint64(props, zpool_prop_to_name(prop), value); 1605 1606 error = spa_prop_set(spa, props); 1607 1608 fnvlist_free(props); 1609 1610 if (error == ENOSPC) { 1611 ztest_record_enospc(FTAG); 1612 return (error); 1613 } 1614 ASSERT0(error); 1615 1616 return (error); 1617 } 1618 1619 static int 1620 ztest_dmu_objset_own(const char *name, dmu_objset_type_t type, 1621 boolean_t readonly, boolean_t decrypt, const void *tag, objset_t **osp) 1622 { 1623 int err; 1624 char *cp = NULL; 1625 char ddname[ZFS_MAX_DATASET_NAME_LEN]; 1626 1627 strlcpy(ddname, name, sizeof (ddname)); 1628 cp = strchr(ddname, '@'); 1629 if (cp != NULL) 1630 *cp = '\0'; 1631 1632 err = dmu_objset_own(name, type, readonly, decrypt, tag, osp); 1633 while (decrypt && err == EACCES) { 1634 dsl_crypto_params_t *dcp; 1635 nvlist_t *crypto_args = fnvlist_alloc(); 1636 1637 fnvlist_add_uint8_array(crypto_args, "wkeydata", 1638 (uint8_t *)ztest_wkeydata, WRAPPING_KEY_LEN); 1639 VERIFY0(dsl_crypto_params_create_nvlist(DCP_CMD_NONE, NULL, 1640 crypto_args, &dcp)); 1641 err = spa_keystore_load_wkey(ddname, dcp, B_FALSE); 1642 /* 1643 * Note: if there was an error loading, the wkey was not 1644 * consumed, and needs to be freed. 1645 */ 1646 dsl_crypto_params_free(dcp, (err != 0)); 1647 fnvlist_free(crypto_args); 1648 1649 if (err == EINVAL) { 1650 /* 1651 * We couldn't load a key for this dataset so try 1652 * the parent. This loop will eventually hit the 1653 * encryption root since ztest only makes clones 1654 * as children of their origin datasets. 
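 *
 * For illustration (dataset names here are hypothetical): if the
 * key load for "ztest/parent/clone" fails with EINVAL, the next
 * pass of the loop tries "ztest/parent", and then "ztest", which
 * must be the encryption root.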
1655 */ 1656 cp = strrchr(ddname, '/'); 1657 if (cp == NULL) 1658 return (err); 1659 1660 *cp = '\0'; 1661 err = EACCES; 1662 continue; 1663 } else if (err != 0) { 1664 break; 1665 } 1666 1667 err = dmu_objset_own(name, type, readonly, decrypt, tag, osp); 1668 break; 1669 } 1670 1671 return (err); 1672 } 1673 1674 static void 1675 ztest_rll_init(rll_t *rll) 1676 { 1677 rll->rll_writer = NULL; 1678 rll->rll_readers = 0; 1679 mutex_init(&rll->rll_lock, NULL, MUTEX_DEFAULT, NULL); 1680 cv_init(&rll->rll_cv, NULL, CV_DEFAULT, NULL); 1681 } 1682 1683 static void 1684 ztest_rll_destroy(rll_t *rll) 1685 { 1686 ASSERT0P(rll->rll_writer); 1687 ASSERT0(rll->rll_readers); 1688 mutex_destroy(&rll->rll_lock); 1689 cv_destroy(&rll->rll_cv); 1690 } 1691 1692 static void 1693 ztest_rll_lock(rll_t *rll, rl_type_t type) 1694 { 1695 mutex_enter(&rll->rll_lock); 1696 1697 if (type == ZTRL_READER) { 1698 while (rll->rll_writer != NULL) 1699 (void) cv_wait(&rll->rll_cv, &rll->rll_lock); 1700 rll->rll_readers++; 1701 } else { 1702 while (rll->rll_writer != NULL || rll->rll_readers) 1703 (void) cv_wait(&rll->rll_cv, &rll->rll_lock); 1704 rll->rll_writer = curthread; 1705 } 1706 1707 mutex_exit(&rll->rll_lock); 1708 } 1709 1710 static void 1711 ztest_rll_unlock(rll_t *rll) 1712 { 1713 mutex_enter(&rll->rll_lock); 1714 1715 if (rll->rll_writer) { 1716 ASSERT0(rll->rll_readers); 1717 rll->rll_writer = NULL; 1718 } else { 1719 ASSERT3S(rll->rll_readers, >, 0); 1720 ASSERT0P(rll->rll_writer); 1721 rll->rll_readers--; 1722 } 1723 1724 if (rll->rll_writer == NULL && rll->rll_readers == 0) 1725 cv_broadcast(&rll->rll_cv); 1726 1727 mutex_exit(&rll->rll_lock); 1728 } 1729 1730 static void 1731 ztest_object_lock(ztest_ds_t *zd, uint64_t object, rl_type_t type) 1732 { 1733 rll_t *rll = &zd->zd_object_lock[object & (ZTEST_OBJECT_LOCKS - 1)]; 1734 1735 ztest_rll_lock(rll, type); 1736 } 1737 1738 static void 1739 ztest_object_unlock(ztest_ds_t *zd, uint64_t object) 1740 { 1741 rll_t *rll = &zd->zd_object_lock[object & (ZTEST_OBJECT_LOCKS - 1)]; 1742 1743 ztest_rll_unlock(rll); 1744 } 1745 1746 static rl_t * 1747 ztest_range_lock(ztest_ds_t *zd, uint64_t object, uint64_t offset, 1748 uint64_t size, rl_type_t type) 1749 { 1750 uint64_t hash = object ^ (offset % (ZTEST_RANGE_LOCKS + 1)); 1751 rll_t *rll = &zd->zd_range_lock[hash & (ZTEST_RANGE_LOCKS - 1)]; 1752 rl_t *rl; 1753 1754 rl = umem_alloc(sizeof (*rl), UMEM_NOFAIL); 1755 rl->rl_object = object; 1756 rl->rl_offset = offset; 1757 rl->rl_size = size; 1758 rl->rl_lock = rll; 1759 1760 ztest_rll_lock(rll, type); 1761 1762 return (rl); 1763 } 1764 1765 static void 1766 ztest_range_unlock(rl_t *rl) 1767 { 1768 rll_t *rll = rl->rl_lock; 1769 1770 ztest_rll_unlock(rll); 1771 1772 umem_free(rl, sizeof (*rl)); 1773 } 1774 1775 static void 1776 ztest_zd_init(ztest_ds_t *zd, ztest_shared_ds_t *szd, objset_t *os) 1777 { 1778 zd->zd_os = os; 1779 zd->zd_zilog = dmu_objset_zil(os); 1780 zd->zd_shared = szd; 1781 dmu_objset_name(os, zd->zd_name); 1782 int l; 1783 1784 if (zd->zd_shared != NULL) 1785 zd->zd_shared->zd_seq = 0; 1786 1787 VERIFY0(pthread_rwlock_init(&zd->zd_zilog_lock, NULL)); 1788 mutex_init(&zd->zd_dirobj_lock, NULL, MUTEX_DEFAULT, NULL); 1789 1790 for (l = 0; l < ZTEST_OBJECT_LOCKS; l++) 1791 ztest_rll_init(&zd->zd_object_lock[l]); 1792 1793 for (l = 0; l < ZTEST_RANGE_LOCKS; l++) 1794 ztest_rll_init(&zd->zd_range_lock[l]); 1795 } 1796 1797 static void 1798 ztest_zd_fini(ztest_ds_t *zd) 1799 { 1800 int l; 1801 1802 mutex_destroy(&zd->zd_dirobj_lock); 1803 (void) 
pthread_rwlock_destroy(&zd->zd_zilog_lock); 1804 1805 for (l = 0; l < ZTEST_OBJECT_LOCKS; l++) 1806 ztest_rll_destroy(&zd->zd_object_lock[l]); 1807 1808 for (l = 0; l < ZTEST_RANGE_LOCKS; l++) 1809 ztest_rll_destroy(&zd->zd_range_lock[l]); 1810 } 1811 1812 #define DMU_TX_MIGHTWAIT \ 1813 (ztest_random(10) == 0 ? DMU_TX_NOWAIT : DMU_TX_WAIT) 1814 1815 static uint64_t 1816 ztest_tx_assign(dmu_tx_t *tx, dmu_tx_flag_t txg_how, const char *tag) 1817 { 1818 uint64_t txg; 1819 int error; 1820 1821 /* 1822 * Attempt to assign tx to some transaction group. 1823 */ 1824 error = dmu_tx_assign(tx, txg_how); 1825 if (error) { 1826 if (error == ERESTART) { 1827 ASSERT3U(txg_how, ==, DMU_TX_NOWAIT); 1828 dmu_tx_wait(tx); 1829 } else if (error == ENOSPC) { 1830 ztest_record_enospc(tag); 1831 } else { 1832 ASSERT(error == EDQUOT || error == EIO); 1833 } 1834 dmu_tx_abort(tx); 1835 return (0); 1836 } 1837 txg = dmu_tx_get_txg(tx); 1838 ASSERT3U(txg, !=, 0); 1839 return (txg); 1840 } 1841 1842 static void 1843 ztest_bt_generate(ztest_block_tag_t *bt, objset_t *os, uint64_t object, 1844 uint64_t dnodesize, uint64_t offset, uint64_t gen, uint64_t txg, 1845 uint64_t crtxg) 1846 { 1847 bt->bt_magic = BT_MAGIC; 1848 bt->bt_objset = dmu_objset_id(os); 1849 bt->bt_object = object; 1850 bt->bt_dnodesize = dnodesize; 1851 bt->bt_offset = offset; 1852 bt->bt_gen = gen; 1853 bt->bt_txg = txg; 1854 bt->bt_crtxg = crtxg; 1855 } 1856 1857 static void 1858 ztest_bt_verify(ztest_block_tag_t *bt, objset_t *os, uint64_t object, 1859 uint64_t dnodesize, uint64_t offset, uint64_t gen, uint64_t txg, 1860 uint64_t crtxg) 1861 { 1862 ASSERT3U(bt->bt_magic, ==, BT_MAGIC); 1863 ASSERT3U(bt->bt_objset, ==, dmu_objset_id(os)); 1864 ASSERT3U(bt->bt_object, ==, object); 1865 ASSERT3U(bt->bt_dnodesize, ==, dnodesize); 1866 ASSERT3U(bt->bt_offset, ==, offset); 1867 ASSERT3U(bt->bt_gen, <=, gen); 1868 ASSERT3U(bt->bt_txg, <=, txg); 1869 ASSERT3U(bt->bt_crtxg, ==, crtxg); 1870 } 1871 1872 static ztest_block_tag_t * 1873 ztest_bt_bonus(dmu_buf_t *db) 1874 { 1875 dmu_object_info_t doi; 1876 ztest_block_tag_t *bt; 1877 1878 dmu_object_info_from_db(db, &doi); 1879 ASSERT3U(doi.doi_bonus_size, <=, db->db_size); 1880 ASSERT3U(doi.doi_bonus_size, >=, sizeof (*bt)); 1881 bt = (void *)((char *)db->db_data + doi.doi_bonus_size - sizeof (*bt)); 1882 1883 return (bt); 1884 } 1885 1886 /* 1887 * Generate a token to fill up unused bonus buffer space. Try to make 1888 * it unique to the object, generation, and offset to verify that data 1889 * is not getting overwritten by data from other dnodes. 1890 */ 1891 #define ZTEST_BONUS_FILL_TOKEN(obj, ds, gen, offset) \ 1892 (((ds) << 48) | ((gen) << 32) | ((obj) << 8) | (offset)) 1893 1894 /* 1895 * Fill up the unused bonus buffer region before the block tag with a 1896 * verifiable pattern. Filling the whole bonus area with non-zero data 1897 * helps ensure that all dnode traversal code properly skips the 1898 * interior regions of large dnodes. 1899 */ 1900 static void 1901 ztest_fill_unused_bonus(dmu_buf_t *db, void *end, uint64_t obj, 1902 objset_t *os, uint64_t gen) 1903 { 1904 uint64_t *bonusp; 1905 1906 ASSERT(IS_P2ALIGNED((char *)end - (char *)db->db_data, 8)); 1907 1908 for (bonusp = db->db_data; bonusp < (uint64_t *)end; bonusp++) { 1909 uint64_t token = ZTEST_BONUS_FILL_TOKEN(obj, dmu_objset_id(os), 1910 gen, bonusp - (uint64_t *)db->db_data); 1911 *bonusp = token; 1912 } 1913 } 1914 1915 /* 1916 * Verify that the unused area of a bonus buffer is filled with the 1917 * expected tokens. 
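 *
 * For example, ZTEST_BONUS_FILL_TOKEN(obj = 5, ds = 3, gen = 7,
 * offset = 2) above evaluates to (3 << 48) | (7 << 32) | (5 << 8) | 2,
 * i.e. 0x0003000700000502, so each word records which object, objset,
 * and generation wrote it, and at which index.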
1918 */ 1919 static void 1920 ztest_verify_unused_bonus(dmu_buf_t *db, void *end, uint64_t obj, 1921 objset_t *os, uint64_t gen) 1922 { 1923 uint64_t *bonusp; 1924 1925 for (bonusp = db->db_data; bonusp < (uint64_t *)end; bonusp++) { 1926 uint64_t token = ZTEST_BONUS_FILL_TOKEN(obj, dmu_objset_id(os), 1927 gen, bonusp - (uint64_t *)db->db_data); 1928 VERIFY3U(*bonusp, ==, token); 1929 } 1930 } 1931 1932 /* 1933 * ZIL logging ops 1934 */ 1935 1936 #define lrz_type lr_mode 1937 #define lrz_blocksize lr_uid 1938 #define lrz_ibshift lr_gid 1939 #define lrz_bonustype lr_rdev 1940 #define lrz_dnodesize lr_crtime[1] 1941 1942 static void 1943 ztest_log_create(ztest_ds_t *zd, dmu_tx_t *tx, lr_create_t *lr) 1944 { 1945 char *name = (char *)&lr->lr_data[0]; /* name follows lr */ 1946 size_t namesize = strlen(name) + 1; 1947 itx_t *itx; 1948 1949 if (zil_replaying(zd->zd_zilog, tx)) 1950 return; 1951 1952 itx = zil_itx_create(TX_CREATE, sizeof (*lr) + namesize); 1953 memcpy(&itx->itx_lr + 1, &lr->lr_create.lr_common + 1, 1954 sizeof (*lr) + namesize - sizeof (lr_t)); 1955 1956 zil_itx_assign(zd->zd_zilog, itx, tx); 1957 } 1958 1959 static void 1960 ztest_log_remove(ztest_ds_t *zd, dmu_tx_t *tx, lr_remove_t *lr, uint64_t object) 1961 { 1962 char *name = (char *)&lr->lr_data[0]; /* name follows lr */ 1963 size_t namesize = strlen(name) + 1; 1964 itx_t *itx; 1965 1966 if (zil_replaying(zd->zd_zilog, tx)) 1967 return; 1968 1969 itx = zil_itx_create(TX_REMOVE, sizeof (*lr) + namesize); 1970 memcpy(&itx->itx_lr + 1, &lr->lr_common + 1, 1971 sizeof (*lr) + namesize - sizeof (lr_t)); 1972 1973 itx->itx_oid = object; 1974 zil_itx_assign(zd->zd_zilog, itx, tx); 1975 } 1976 1977 static void 1978 ztest_log_write(ztest_ds_t *zd, dmu_tx_t *tx, lr_write_t *lr) 1979 { 1980 itx_t *itx; 1981 itx_wr_state_t write_state = ztest_random(WR_NUM_STATES); 1982 1983 if (zil_replaying(zd->zd_zilog, tx)) 1984 return; 1985 1986 if (lr->lr_length > zil_max_log_data(zd->zd_zilog, sizeof (lr_write_t))) 1987 write_state = WR_INDIRECT; 1988 1989 itx = zil_itx_create(TX_WRITE, 1990 sizeof (*lr) + (write_state == WR_COPIED ? 
lr->lr_length : 0)); 1991 1992 if (write_state == WR_COPIED && 1993 dmu_read(zd->zd_os, lr->lr_foid, lr->lr_offset, lr->lr_length, 1994 ((lr_write_t *)&itx->itx_lr) + 1, DMU_READ_NO_PREFETCH | 1995 DMU_KEEP_CACHING) != 0) { 1996 zil_itx_destroy(itx, 0); 1997 itx = zil_itx_create(TX_WRITE, sizeof (*lr)); 1998 write_state = WR_NEED_COPY; 1999 } 2000 itx->itx_private = zd; 2001 itx->itx_wr_state = write_state; 2002 itx->itx_sync = (ztest_random(8) == 0); 2003 2004 memcpy(&itx->itx_lr + 1, &lr->lr_common + 1, 2005 sizeof (*lr) - sizeof (lr_t)); 2006 2007 zil_itx_assign(zd->zd_zilog, itx, tx); 2008 } 2009 2010 static void 2011 ztest_log_truncate(ztest_ds_t *zd, dmu_tx_t *tx, lr_truncate_t *lr) 2012 { 2013 itx_t *itx; 2014 2015 if (zil_replaying(zd->zd_zilog, tx)) 2016 return; 2017 2018 itx = zil_itx_create(TX_TRUNCATE, sizeof (*lr)); 2019 memcpy(&itx->itx_lr + 1, &lr->lr_common + 1, 2020 sizeof (*lr) - sizeof (lr_t)); 2021 2022 itx->itx_sync = B_FALSE; 2023 zil_itx_assign(zd->zd_zilog, itx, tx); 2024 } 2025 2026 static void 2027 ztest_log_setattr(ztest_ds_t *zd, dmu_tx_t *tx, lr_setattr_t *lr) 2028 { 2029 itx_t *itx; 2030 2031 if (zil_replaying(zd->zd_zilog, tx)) 2032 return; 2033 2034 itx = zil_itx_create(TX_SETATTR, sizeof (*lr)); 2035 memcpy(&itx->itx_lr + 1, &lr->lr_common + 1, 2036 sizeof (*lr) - sizeof (lr_t)); 2037 2038 itx->itx_sync = B_FALSE; 2039 zil_itx_assign(zd->zd_zilog, itx, tx); 2040 } 2041 2042 /* 2043 * ZIL replay ops 2044 */ 2045 static int 2046 ztest_replay_create(void *arg1, void *arg2, boolean_t byteswap) 2047 { 2048 ztest_ds_t *zd = arg1; 2049 lr_create_t *lrc = arg2; 2050 _lr_create_t *lr = &lrc->lr_create; 2051 char *name = (char *)&lrc->lr_data[0]; /* name follows lr */ 2052 objset_t *os = zd->zd_os; 2053 ztest_block_tag_t *bbt; 2054 dmu_buf_t *db; 2055 dmu_tx_t *tx; 2056 uint64_t txg; 2057 int error = 0; 2058 int bonuslen; 2059 2060 if (byteswap) 2061 byteswap_uint64_array(lr, sizeof (*lr)); 2062 2063 ASSERT3U(lr->lr_doid, ==, ZTEST_DIROBJ); 2064 ASSERT3S(name[0], !=, '\0'); 2065 2066 tx = dmu_tx_create(os); 2067 2068 dmu_tx_hold_zap(tx, lr->lr_doid, B_TRUE, name); 2069 2070 if (lr->lrz_type == DMU_OT_ZAP_OTHER) { 2071 dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL); 2072 } else { 2073 dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); 2074 } 2075 2076 txg = ztest_tx_assign(tx, DMU_TX_WAIT, FTAG); 2077 if (txg == 0) 2078 return (ENOSPC); 2079 2080 ASSERT3U(dmu_objset_zil(os)->zl_replay, ==, !!lr->lr_foid); 2081 bonuslen = DN_BONUS_SIZE(lr->lrz_dnodesize); 2082 2083 if (lr->lrz_type == DMU_OT_ZAP_OTHER) { 2084 if (lr->lr_foid == 0) { 2085 lr->lr_foid = zap_create_dnsize(os, 2086 lr->lrz_type, lr->lrz_bonustype, 2087 bonuslen, lr->lrz_dnodesize, tx); 2088 } else { 2089 error = zap_create_claim_dnsize(os, lr->lr_foid, 2090 lr->lrz_type, lr->lrz_bonustype, 2091 bonuslen, lr->lrz_dnodesize, tx); 2092 } 2093 } else { 2094 if (lr->lr_foid == 0) { 2095 lr->lr_foid = dmu_object_alloc_dnsize(os, 2096 lr->lrz_type, 0, lr->lrz_bonustype, 2097 bonuslen, lr->lrz_dnodesize, tx); 2098 } else { 2099 error = dmu_object_claim_dnsize(os, lr->lr_foid, 2100 lr->lrz_type, 0, lr->lrz_bonustype, 2101 bonuslen, lr->lrz_dnodesize, tx); 2102 } 2103 } 2104 2105 if (error) { 2106 ASSERT3U(error, ==, EEXIST); 2107 ASSERT(zd->zd_zilog->zl_replay); 2108 dmu_tx_commit(tx); 2109 return (error); 2110 } 2111 2112 ASSERT3U(lr->lr_foid, !=, 0); 2113 2114 if (lr->lrz_type != DMU_OT_ZAP_OTHER) 2115 VERIFY0(dmu_object_set_blocksize(os, lr->lr_foid, 2116 lr->lrz_blocksize, lr->lrz_ibshift, tx)); 2117 2118 
VERIFY0(dmu_bonus_hold(os, lr->lr_foid, FTAG, &db)); 2119 bbt = ztest_bt_bonus(db); 2120 dmu_buf_will_dirty(db, tx); 2121 ztest_bt_generate(bbt, os, lr->lr_foid, lr->lrz_dnodesize, -1ULL, 2122 lr->lr_gen, txg, txg); 2123 ztest_fill_unused_bonus(db, bbt, lr->lr_foid, os, lr->lr_gen); 2124 dmu_buf_rele(db, FTAG); 2125 2126 VERIFY0(zap_add(os, lr->lr_doid, name, sizeof (uint64_t), 1, 2127 &lr->lr_foid, tx)); 2128 2129 (void) ztest_log_create(zd, tx, lrc); 2130 2131 dmu_tx_commit(tx); 2132 2133 return (0); 2134 } 2135 2136 static int 2137 ztest_replay_remove(void *arg1, void *arg2, boolean_t byteswap) 2138 { 2139 ztest_ds_t *zd = arg1; 2140 lr_remove_t *lr = arg2; 2141 char *name = (char *)&lr->lr_data[0]; /* name follows lr */ 2142 objset_t *os = zd->zd_os; 2143 dmu_object_info_t doi; 2144 dmu_tx_t *tx; 2145 uint64_t object, txg; 2146 2147 if (byteswap) 2148 byteswap_uint64_array(lr, sizeof (*lr)); 2149 2150 ASSERT3U(lr->lr_doid, ==, ZTEST_DIROBJ); 2151 ASSERT3S(name[0], !=, '\0'); 2152 2153 VERIFY0( 2154 zap_lookup(os, lr->lr_doid, name, sizeof (object), 1, &object)); 2155 ASSERT3U(object, !=, 0); 2156 2157 ztest_object_lock(zd, object, ZTRL_WRITER); 2158 2159 VERIFY0(dmu_object_info(os, object, &doi)); 2160 2161 tx = dmu_tx_create(os); 2162 2163 dmu_tx_hold_zap(tx, lr->lr_doid, B_FALSE, name); 2164 dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END); 2165 2166 txg = ztest_tx_assign(tx, DMU_TX_WAIT, FTAG); 2167 if (txg == 0) { 2168 ztest_object_unlock(zd, object); 2169 return (ENOSPC); 2170 } 2171 2172 if (doi.doi_type == DMU_OT_ZAP_OTHER) { 2173 VERIFY0(zap_destroy(os, object, tx)); 2174 } else { 2175 VERIFY0(dmu_object_free(os, object, tx)); 2176 } 2177 2178 VERIFY0(zap_remove(os, lr->lr_doid, name, tx)); 2179 2180 (void) ztest_log_remove(zd, tx, lr, object); 2181 2182 dmu_tx_commit(tx); 2183 2184 ztest_object_unlock(zd, object); 2185 2186 return (0); 2187 } 2188 2189 static int 2190 ztest_replay_write(void *arg1, void *arg2, boolean_t byteswap) 2191 { 2192 ztest_ds_t *zd = arg1; 2193 lr_write_t *lr = arg2; 2194 objset_t *os = zd->zd_os; 2195 uint8_t *data = &lr->lr_data[0]; /* data follows lr */ 2196 uint64_t offset, length; 2197 ztest_block_tag_t *bt = (ztest_block_tag_t *)data; 2198 ztest_block_tag_t *bbt; 2199 uint64_t gen, txg, lrtxg, crtxg; 2200 dmu_object_info_t doi; 2201 dmu_tx_t *tx; 2202 dmu_buf_t *db; 2203 arc_buf_t *abuf = NULL; 2204 rl_t *rl; 2205 2206 if (byteswap) 2207 byteswap_uint64_array(lr, sizeof (*lr)); 2208 2209 offset = lr->lr_offset; 2210 length = lr->lr_length; 2211 2212 /* If it's a dmu_sync() block, write the whole block */ 2213 if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) { 2214 uint64_t blocksize = BP_GET_LSIZE(&lr->lr_blkptr); 2215 if (length < blocksize) { 2216 offset -= offset % blocksize; 2217 length = blocksize; 2218 } 2219 } 2220 2221 if (bt->bt_magic == BSWAP_64(BT_MAGIC)) 2222 byteswap_uint64_array(bt, sizeof (*bt)); 2223 2224 if (bt->bt_magic != BT_MAGIC) 2225 bt = NULL; 2226 2227 ztest_object_lock(zd, lr->lr_foid, ZTRL_READER); 2228 rl = ztest_range_lock(zd, lr->lr_foid, offset, length, ZTRL_WRITER); 2229 2230 VERIFY0(dmu_bonus_hold(os, lr->lr_foid, FTAG, &db)); 2231 2232 dmu_object_info_from_db(db, &doi); 2233 2234 bbt = ztest_bt_bonus(db); 2235 ASSERT3U(bbt->bt_magic, ==, BT_MAGIC); 2236 gen = bbt->bt_gen; 2237 crtxg = bbt->bt_crtxg; 2238 lrtxg = lr->lr_common.lrc_txg; 2239 2240 tx = dmu_tx_create(os); 2241 2242 dmu_tx_hold_write(tx, lr->lr_foid, offset, length); 2243 2244 if (ztest_random(8) == 0 && length == doi.doi_data_block_size && 2245 
P2PHASE(offset, length) == 0) 2246 abuf = dmu_request_arcbuf(db, length); 2247 2248 txg = ztest_tx_assign(tx, DMU_TX_WAIT, FTAG); 2249 if (txg == 0) { 2250 if (abuf != NULL) 2251 dmu_return_arcbuf(abuf); 2252 dmu_buf_rele(db, FTAG); 2253 ztest_range_unlock(rl); 2254 ztest_object_unlock(zd, lr->lr_foid); 2255 return (ENOSPC); 2256 } 2257 2258 if (bt != NULL) { 2259 /* 2260 * Usually, verify the old data before writing new data -- 2261 * but not always, because we also want to verify correct 2262 * behavior when the data was not recently read into cache. 2263 */ 2264 ASSERT(doi.doi_data_block_size); 2265 ASSERT0(offset % doi.doi_data_block_size); 2266 if (ztest_random(4) != 0) { 2267 dmu_flags_t flags = ztest_random(2) ? 2268 DMU_READ_PREFETCH : DMU_READ_NO_PREFETCH; 2269 2270 /* 2271 * We will randomly set when to do O_DIRECT on a read. 2272 */ 2273 if (ztest_random(4) == 0) 2274 flags |= DMU_DIRECTIO; 2275 2276 ztest_block_tag_t rbt; 2277 2278 VERIFY0(dmu_read(os, lr->lr_foid, offset, 2279 sizeof (rbt), &rbt, flags)); 2280 if (rbt.bt_magic == BT_MAGIC) { 2281 ztest_bt_verify(&rbt, os, lr->lr_foid, 0, 2282 offset, gen, txg, crtxg); 2283 } 2284 } 2285 2286 /* 2287 * Writes can appear to be newer than the bonus buffer because 2288 * the ztest_get_data() callback does a dmu_read() of the 2289 * open-context data, which may be different than the data 2290 * as it was when the write was generated. 2291 */ 2292 if (zd->zd_zilog->zl_replay) { 2293 ztest_bt_verify(bt, os, lr->lr_foid, 0, offset, 2294 MAX(gen, bt->bt_gen), MAX(txg, lrtxg), 2295 bt->bt_crtxg); 2296 } 2297 2298 /* 2299 * Set the bt's gen/txg to the bonus buffer's gen/txg 2300 * so that all of the usual ASSERTs will work. 2301 */ 2302 ztest_bt_generate(bt, os, lr->lr_foid, 0, offset, gen, txg, 2303 crtxg); 2304 } 2305 2306 if (abuf == NULL) { 2307 dmu_write(os, lr->lr_foid, offset, length, data, tx, 2308 DMU_READ_PREFETCH); 2309 } else { 2310 memcpy(abuf->b_data, data, length); 2311 VERIFY0(dmu_assign_arcbuf_by_dbuf(db, offset, abuf, tx, 0)); 2312 } 2313 2314 (void) ztest_log_write(zd, tx, lr); 2315 2316 dmu_buf_rele(db, FTAG); 2317 2318 dmu_tx_commit(tx); 2319 2320 ztest_range_unlock(rl); 2321 ztest_object_unlock(zd, lr->lr_foid); 2322 2323 return (0); 2324 } 2325 2326 static int 2327 ztest_replay_truncate(void *arg1, void *arg2, boolean_t byteswap) 2328 { 2329 ztest_ds_t *zd = arg1; 2330 lr_truncate_t *lr = arg2; 2331 objset_t *os = zd->zd_os; 2332 dmu_tx_t *tx; 2333 uint64_t txg; 2334 rl_t *rl; 2335 2336 if (byteswap) 2337 byteswap_uint64_array(lr, sizeof (*lr)); 2338 2339 ztest_object_lock(zd, lr->lr_foid, ZTRL_READER); 2340 rl = ztest_range_lock(zd, lr->lr_foid, lr->lr_offset, lr->lr_length, 2341 ZTRL_WRITER); 2342 2343 tx = dmu_tx_create(os); 2344 2345 dmu_tx_hold_free(tx, lr->lr_foid, lr->lr_offset, lr->lr_length); 2346 2347 txg = ztest_tx_assign(tx, DMU_TX_WAIT, FTAG); 2348 if (txg == 0) { 2349 ztest_range_unlock(rl); 2350 ztest_object_unlock(zd, lr->lr_foid); 2351 return (ENOSPC); 2352 } 2353 2354 VERIFY0(dmu_free_range(os, lr->lr_foid, lr->lr_offset, 2355 lr->lr_length, tx)); 2356 2357 (void) ztest_log_truncate(zd, tx, lr); 2358 2359 dmu_tx_commit(tx); 2360 2361 ztest_range_unlock(rl); 2362 ztest_object_unlock(zd, lr->lr_foid); 2363 2364 return (0); 2365 } 2366 2367 static int 2368 ztest_replay_setattr(void *arg1, void *arg2, boolean_t byteswap) 2369 { 2370 ztest_ds_t *zd = arg1; 2371 lr_setattr_t *lr = arg2; 2372 objset_t *os = zd->zd_os; 2373 dmu_tx_t *tx; 2374 dmu_buf_t *db; 2375 ztest_block_tag_t *bbt; 2376 uint64_t txg, 
lrtxg, crtxg, dnodesize; 2377 2378 if (byteswap) 2379 byteswap_uint64_array(lr, sizeof (*lr)); 2380 2381 ztest_object_lock(zd, lr->lr_foid, ZTRL_WRITER); 2382 2383 VERIFY0(dmu_bonus_hold(os, lr->lr_foid, FTAG, &db)); 2384 2385 tx = dmu_tx_create(os); 2386 dmu_tx_hold_bonus(tx, lr->lr_foid); 2387 2388 txg = ztest_tx_assign(tx, DMU_TX_WAIT, FTAG); 2389 if (txg == 0) { 2390 dmu_buf_rele(db, FTAG); 2391 ztest_object_unlock(zd, lr->lr_foid); 2392 return (ENOSPC); 2393 } 2394 2395 bbt = ztest_bt_bonus(db); 2396 ASSERT3U(bbt->bt_magic, ==, BT_MAGIC); 2397 crtxg = bbt->bt_crtxg; 2398 lrtxg = lr->lr_common.lrc_txg; 2399 dnodesize = bbt->bt_dnodesize; 2400 2401 if (zd->zd_zilog->zl_replay) { 2402 ASSERT3U(lr->lr_size, !=, 0); 2403 ASSERT3U(lr->lr_mode, !=, 0); 2404 ASSERT3U(lrtxg, !=, 0); 2405 } else { 2406 /* 2407 * Randomly change the size and increment the generation. 2408 */ 2409 lr->lr_size = (ztest_random(db->db_size / sizeof (*bbt)) + 1) * 2410 sizeof (*bbt); 2411 lr->lr_mode = bbt->bt_gen + 1; 2412 ASSERT0(lrtxg); 2413 } 2414 2415 /* 2416 * Verify that the current bonus buffer is not newer than our txg. 2417 */ 2418 ztest_bt_verify(bbt, os, lr->lr_foid, dnodesize, -1ULL, lr->lr_mode, 2419 MAX(txg, lrtxg), crtxg); 2420 2421 dmu_buf_will_dirty(db, tx); 2422 2423 ASSERT3U(lr->lr_size, >=, sizeof (*bbt)); 2424 ASSERT3U(lr->lr_size, <=, db->db_size); 2425 VERIFY0(dmu_set_bonus(db, lr->lr_size, tx)); 2426 bbt = ztest_bt_bonus(db); 2427 2428 ztest_bt_generate(bbt, os, lr->lr_foid, dnodesize, -1ULL, lr->lr_mode, 2429 txg, crtxg); 2430 ztest_fill_unused_bonus(db, bbt, lr->lr_foid, os, bbt->bt_gen); 2431 dmu_buf_rele(db, FTAG); 2432 2433 (void) ztest_log_setattr(zd, tx, lr); 2434 2435 dmu_tx_commit(tx); 2436 2437 ztest_object_unlock(zd, lr->lr_foid); 2438 2439 return (0); 2440 } 2441 2442 static zil_replay_func_t *ztest_replay_vector[TX_MAX_TYPE] = { 2443 NULL, /* 0 no such transaction type */ 2444 ztest_replay_create, /* TX_CREATE */ 2445 NULL, /* TX_MKDIR */ 2446 NULL, /* TX_MKXATTR */ 2447 NULL, /* TX_SYMLINK */ 2448 ztest_replay_remove, /* TX_REMOVE */ 2449 NULL, /* TX_RMDIR */ 2450 NULL, /* TX_LINK */ 2451 NULL, /* TX_RENAME */ 2452 ztest_replay_write, /* TX_WRITE */ 2453 ztest_replay_truncate, /* TX_TRUNCATE */ 2454 ztest_replay_setattr, /* TX_SETATTR */ 2455 NULL, /* TX_ACL */ 2456 NULL, /* TX_CREATE_ACL */ 2457 NULL, /* TX_CREATE_ATTR */ 2458 NULL, /* TX_CREATE_ACL_ATTR */ 2459 NULL, /* TX_MKDIR_ACL */ 2460 NULL, /* TX_MKDIR_ATTR */ 2461 NULL, /* TX_MKDIR_ACL_ATTR */ 2462 NULL, /* TX_WRITE2 */ 2463 NULL, /* TX_SETSAXATTR */ 2464 NULL, /* TX_RENAME_EXCHANGE */ 2465 NULL, /* TX_RENAME_WHITEOUT */ 2466 }; 2467 2468 /* 2469 * ZIL get_data callbacks 2470 */ 2471 2472 static void 2473 ztest_get_done(zgd_t *zgd, int error) 2474 { 2475 (void) error; 2476 ztest_ds_t *zd = zgd->zgd_private; 2477 uint64_t object = ((rl_t *)zgd->zgd_lr)->rl_object; 2478 2479 if (zgd->zgd_db) 2480 dmu_buf_rele(zgd->zgd_db, zgd); 2481 2482 ztest_range_unlock((rl_t *)zgd->zgd_lr); 2483 ztest_object_unlock(zd, object); 2484 2485 umem_free(zgd, sizeof (*zgd)); 2486 } 2487 2488 static int 2489 ztest_get_data(void *arg, uint64_t arg2, lr_write_t *lr, char *buf, 2490 struct lwb *lwb, zio_t *zio) 2491 { 2492 (void) arg2; 2493 ztest_ds_t *zd = arg; 2494 objset_t *os = zd->zd_os; 2495 uint64_t object = lr->lr_foid; 2496 uint64_t offset = lr->lr_offset; 2497 uint64_t size = lr->lr_length; 2498 uint64_t txg = lr->lr_common.lrc_txg; 2499 uint64_t crtxg; 2500 dmu_object_info_t doi; 2501 dmu_buf_t *db; 2502 zgd_t *zgd; 2503 int error; 
2504 2505 ASSERT3P(lwb, !=, NULL); 2506 ASSERT3U(size, !=, 0); 2507 2508 ztest_object_lock(zd, object, ZTRL_READER); 2509 error = dmu_bonus_hold(os, object, FTAG, &db); 2510 if (error) { 2511 ztest_object_unlock(zd, object); 2512 return (error); 2513 } 2514 2515 crtxg = ztest_bt_bonus(db)->bt_crtxg; 2516 2517 if (crtxg == 0 || crtxg > txg) { 2518 dmu_buf_rele(db, FTAG); 2519 ztest_object_unlock(zd, object); 2520 return (ENOENT); 2521 } 2522 2523 dmu_object_info_from_db(db, &doi); 2524 dmu_buf_rele(db, FTAG); 2525 db = NULL; 2526 2527 zgd = umem_zalloc(sizeof (*zgd), UMEM_NOFAIL); 2528 zgd->zgd_lwb = lwb; 2529 zgd->zgd_private = zd; 2530 2531 if (buf != NULL) { /* immediate write */ 2532 zgd->zgd_lr = (struct zfs_locked_range *)ztest_range_lock(zd, 2533 object, offset, size, ZTRL_READER); 2534 2535 error = dmu_read(os, object, offset, size, buf, 2536 DMU_READ_NO_PREFETCH | DMU_KEEP_CACHING); 2537 ASSERT0(error); 2538 } else { 2539 ASSERT3P(zio, !=, NULL); 2540 size = doi.doi_data_block_size; 2541 if (ISP2(size)) { 2542 offset = P2ALIGN_TYPED(offset, size, uint64_t); 2543 } else { 2544 ASSERT3U(offset, <, size); 2545 offset = 0; 2546 } 2547 2548 zgd->zgd_lr = (struct zfs_locked_range *)ztest_range_lock(zd, 2549 object, offset, size, ZTRL_READER); 2550 2551 error = dmu_buf_hold_noread(os, object, offset, zgd, &db); 2552 if (error == 0) { 2553 blkptr_t *bp = &lr->lr_blkptr; 2554 2555 zgd->zgd_db = db; 2556 zgd->zgd_bp = bp; 2557 2558 ASSERT3U(db->db_offset, ==, offset); 2559 ASSERT3U(db->db_size, ==, size); 2560 2561 error = dmu_sync(zio, lr->lr_common.lrc_txg, 2562 ztest_get_done, zgd); 2563 2564 if (error == 0) 2565 return (0); 2566 } 2567 } 2568 2569 ztest_get_done(zgd, error); 2570 2571 return (error); 2572 } 2573 2574 static void * 2575 ztest_lr_alloc(size_t lrsize, char *name) 2576 { 2577 char *lr; 2578 size_t namesize = name ? strlen(name) + 1 : 0; 2579 2580 lr = umem_zalloc(lrsize + namesize, UMEM_NOFAIL); 2581 2582 if (name) 2583 memcpy(lr + lrsize, name, namesize); 2584 2585 return (lr); 2586 } 2587 2588 static void 2589 ztest_lr_free(void *lr, size_t lrsize, char *name) 2590 { 2591 size_t namesize = name ? strlen(name) + 1 : 0; 2592 2593 umem_free(lr, lrsize + namesize); 2594 } 2595 2596 /* 2597 * Lookup a bunch of objects. Returns the number of objects not found. 
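 * The caller must hold zd->zd_dirobj_lock.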
2598 */ 2599 static int 2600 ztest_lookup(ztest_ds_t *zd, ztest_od_t *od, int count) 2601 { 2602 int missing = 0; 2603 int error; 2604 int i; 2605 2606 ASSERT(MUTEX_HELD(&zd->zd_dirobj_lock)); 2607 2608 for (i = 0; i < count; i++, od++) { 2609 od->od_object = 0; 2610 error = zap_lookup(zd->zd_os, od->od_dir, od->od_name, 2611 sizeof (uint64_t), 1, &od->od_object); 2612 if (error) { 2613 ASSERT3S(error, ==, ENOENT); 2614 ASSERT0(od->od_object); 2615 missing++; 2616 } else { 2617 dmu_buf_t *db; 2618 ztest_block_tag_t *bbt; 2619 dmu_object_info_t doi; 2620 2621 ASSERT3U(od->od_object, !=, 0); 2622 ASSERT0(missing); /* there should be no gaps */ 2623 2624 ztest_object_lock(zd, od->od_object, ZTRL_READER); 2625 VERIFY0(dmu_bonus_hold(zd->zd_os, od->od_object, 2626 FTAG, &db)); 2627 dmu_object_info_from_db(db, &doi); 2628 bbt = ztest_bt_bonus(db); 2629 ASSERT3U(bbt->bt_magic, ==, BT_MAGIC); 2630 od->od_type = doi.doi_type; 2631 od->od_blocksize = doi.doi_data_block_size; 2632 od->od_gen = bbt->bt_gen; 2633 dmu_buf_rele(db, FTAG); 2634 ztest_object_unlock(zd, od->od_object); 2635 } 2636 } 2637 2638 return (missing); 2639 } 2640 2641 static int 2642 ztest_create(ztest_ds_t *zd, ztest_od_t *od, int count) 2643 { 2644 int missing = 0; 2645 int i; 2646 2647 ASSERT(MUTEX_HELD(&zd->zd_dirobj_lock)); 2648 2649 for (i = 0; i < count; i++, od++) { 2650 if (missing) { 2651 od->od_object = 0; 2652 missing++; 2653 continue; 2654 } 2655 2656 lr_create_t *lrc = ztest_lr_alloc(sizeof (*lrc), od->od_name); 2657 _lr_create_t *lr = &lrc->lr_create; 2658 2659 lr->lr_doid = od->od_dir; 2660 lr->lr_foid = 0; /* 0 to allocate, > 0 to claim */ 2661 lr->lrz_type = od->od_crtype; 2662 lr->lrz_blocksize = od->od_crblocksize; 2663 lr->lrz_ibshift = ztest_random_ibshift(); 2664 lr->lrz_bonustype = DMU_OT_UINT64_OTHER; 2665 lr->lrz_dnodesize = od->od_crdnodesize; 2666 lr->lr_gen = od->od_crgen; 2667 lr->lr_crtime[0] = time(NULL); 2668 2669 if (ztest_replay_create(zd, lr, B_FALSE) != 0) { 2670 ASSERT0(missing); 2671 od->od_object = 0; 2672 missing++; 2673 } else { 2674 od->od_object = lr->lr_foid; 2675 od->od_type = od->od_crtype; 2676 od->od_blocksize = od->od_crblocksize; 2677 od->od_gen = od->od_crgen; 2678 ASSERT3U(od->od_object, !=, 0); 2679 } 2680 2681 ztest_lr_free(lr, sizeof (*lr), od->od_name); 2682 } 2683 2684 return (missing); 2685 } 2686 2687 static int 2688 ztest_remove(ztest_ds_t *zd, ztest_od_t *od, int count) 2689 { 2690 int missing = 0; 2691 int error; 2692 int i; 2693 2694 ASSERT(MUTEX_HELD(&zd->zd_dirobj_lock)); 2695 2696 od += count - 1; 2697 2698 for (i = count - 1; i >= 0; i--, od--) { 2699 if (missing) { 2700 missing++; 2701 continue; 2702 } 2703 2704 /* 2705 * No object was found. 
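 * There is nothing to remove, so just skip this entry.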
2706 */ 2707 if (od->od_object == 0) 2708 continue; 2709 2710 lr_remove_t *lr = ztest_lr_alloc(sizeof (*lr), od->od_name); 2711 2712 lr->lr_doid = od->od_dir; 2713 2714 if ((error = ztest_replay_remove(zd, lr, B_FALSE)) != 0) { 2715 ASSERT3U(error, ==, ENOSPC); 2716 missing++; 2717 } else { 2718 od->od_object = 0; 2719 } 2720 ztest_lr_free(lr, sizeof (*lr), od->od_name); 2721 } 2722 2723 return (missing); 2724 } 2725 2726 static int 2727 ztest_write(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size, 2728 const void *data) 2729 { 2730 lr_write_t *lr; 2731 int error; 2732 2733 lr = ztest_lr_alloc(sizeof (*lr) + size, NULL); 2734 2735 lr->lr_foid = object; 2736 lr->lr_offset = offset; 2737 lr->lr_length = size; 2738 lr->lr_blkoff = 0; 2739 BP_ZERO(&lr->lr_blkptr); 2740 2741 memcpy(&lr->lr_data[0], data, size); 2742 2743 error = ztest_replay_write(zd, lr, B_FALSE); 2744 2745 ztest_lr_free(lr, sizeof (*lr) + size, NULL); 2746 2747 return (error); 2748 } 2749 2750 static int 2751 ztest_truncate(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size) 2752 { 2753 lr_truncate_t *lr; 2754 int error; 2755 2756 lr = ztest_lr_alloc(sizeof (*lr), NULL); 2757 2758 lr->lr_foid = object; 2759 lr->lr_offset = offset; 2760 lr->lr_length = size; 2761 2762 error = ztest_replay_truncate(zd, lr, B_FALSE); 2763 2764 ztest_lr_free(lr, sizeof (*lr), NULL); 2765 2766 return (error); 2767 } 2768 2769 static int 2770 ztest_setattr(ztest_ds_t *zd, uint64_t object) 2771 { 2772 lr_setattr_t *lr; 2773 int error; 2774 2775 lr = ztest_lr_alloc(sizeof (*lr), NULL); 2776 2777 lr->lr_foid = object; 2778 lr->lr_size = 0; 2779 lr->lr_mode = 0; 2780 2781 error = ztest_replay_setattr(zd, lr, B_FALSE); 2782 2783 ztest_lr_free(lr, sizeof (*lr), NULL); 2784 2785 return (error); 2786 } 2787 2788 static void 2789 ztest_prealloc(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size) 2790 { 2791 objset_t *os = zd->zd_os; 2792 dmu_tx_t *tx; 2793 uint64_t txg; 2794 rl_t *rl; 2795 2796 txg_wait_synced(dmu_objset_pool(os), 0); 2797 2798 ztest_object_lock(zd, object, ZTRL_READER); 2799 rl = ztest_range_lock(zd, object, offset, size, ZTRL_WRITER); 2800 2801 tx = dmu_tx_create(os); 2802 2803 dmu_tx_hold_write(tx, object, offset, size); 2804 2805 txg = ztest_tx_assign(tx, DMU_TX_WAIT, FTAG); 2806 2807 if (txg != 0) { 2808 dmu_prealloc(os, object, offset, size, tx); 2809 dmu_tx_commit(tx); 2810 txg_wait_synced(dmu_objset_pool(os), txg); 2811 } else { 2812 (void) dmu_free_long_range(os, object, offset, size); 2813 } 2814 2815 ztest_range_unlock(rl); 2816 ztest_object_unlock(zd, object); 2817 } 2818 2819 static void 2820 ztest_io(ztest_ds_t *zd, uint64_t object, uint64_t offset) 2821 { 2822 int err; 2823 ztest_block_tag_t wbt; 2824 dmu_object_info_t doi; 2825 enum ztest_io_type io_type; 2826 uint64_t blocksize; 2827 void *data; 2828 dmu_flags_t dmu_read_flags = DMU_READ_NO_PREFETCH; 2829 2830 /* 2831 * We will randomly set when to do O_DIRECT on a read. 2832 */ 2833 if (ztest_random(4) == 0) 2834 dmu_read_flags |= DMU_DIRECTIO; 2835 2836 VERIFY0(dmu_object_info(zd->zd_os, object, &doi)); 2837 blocksize = doi.doi_data_block_size; 2838 data = umem_alloc(blocksize, UMEM_NOFAIL); 2839 2840 /* 2841 * Pick an i/o type at random, biased toward writing block tags. 
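 * Half of the time we force ZTEST_IO_WRITE_TAG so that verifiable
 * block tags are written frequently.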
2842 */ 2843 io_type = ztest_random(ZTEST_IO_TYPES); 2844 if (ztest_random(2) == 0) 2845 io_type = ZTEST_IO_WRITE_TAG; 2846 2847 (void) pthread_rwlock_rdlock(&zd->zd_zilog_lock); 2848 2849 switch (io_type) { 2850 2851 case ZTEST_IO_WRITE_TAG: 2852 ztest_bt_generate(&wbt, zd->zd_os, object, doi.doi_dnodesize, 2853 offset, 0, 0, 0); 2854 (void) ztest_write(zd, object, offset, sizeof (wbt), &wbt); 2855 break; 2856 2857 case ZTEST_IO_WRITE_PATTERN: 2858 (void) memset(data, 'a' + (object + offset) % 5, blocksize); 2859 if (ztest_random(2) == 0) { 2860 /* 2861 * Induce fletcher2 collisions to ensure that 2862 * zio_ddt_collision() detects and resolves them 2863 * when using fletcher2-verify for deduplication. 2864 */ 2865 ((uint64_t *)data)[0] ^= 1ULL << 63; 2866 ((uint64_t *)data)[4] ^= 1ULL << 63; 2867 } 2868 (void) ztest_write(zd, object, offset, blocksize, data); 2869 break; 2870 2871 case ZTEST_IO_WRITE_ZEROES: 2872 memset(data, 0, blocksize); 2873 (void) ztest_write(zd, object, offset, blocksize, data); 2874 break; 2875 2876 case ZTEST_IO_TRUNCATE: 2877 (void) ztest_truncate(zd, object, offset, blocksize); 2878 break; 2879 2880 case ZTEST_IO_SETATTR: 2881 (void) ztest_setattr(zd, object); 2882 break; 2883 default: 2884 break; 2885 2886 case ZTEST_IO_REWRITE: 2887 (void) pthread_rwlock_rdlock(&ztest_name_lock); 2888 err = ztest_dsl_prop_set_uint64(zd->zd_name, 2889 ZFS_PROP_CHECKSUM, spa_dedup_checksum(ztest_spa), 2890 B_FALSE); 2891 ASSERT(err == 0 || err == ENOSPC); 2892 err = ztest_dsl_prop_set_uint64(zd->zd_name, 2893 ZFS_PROP_COMPRESSION, 2894 ztest_random_dsl_prop(ZFS_PROP_COMPRESSION), 2895 B_FALSE); 2896 ASSERT(err == 0 || err == ENOSPC); 2897 (void) pthread_rwlock_unlock(&ztest_name_lock); 2898 2899 VERIFY0(dmu_read(zd->zd_os, object, offset, blocksize, data, 2900 dmu_read_flags)); 2901 2902 (void) ztest_write(zd, object, offset, blocksize, data); 2903 break; 2904 } 2905 2906 (void) pthread_rwlock_unlock(&zd->zd_zilog_lock); 2907 2908 umem_free(data, blocksize); 2909 } 2910 2911 /* 2912 * Initialize an object description template. 2913 */ 2914 static void 2915 ztest_od_init(ztest_od_t *od, uint64_t id, const char *tag, uint64_t index, 2916 dmu_object_type_t type, uint64_t blocksize, uint64_t dnodesize, 2917 uint64_t gen) 2918 { 2919 od->od_dir = ZTEST_DIROBJ; 2920 od->od_object = 0; 2921 2922 od->od_crtype = type; 2923 od->od_crblocksize = blocksize ? blocksize : ztest_random_blocksize(); 2924 od->od_crdnodesize = dnodesize ? dnodesize : ztest_random_dnodesize(); 2925 od->od_crgen = gen; 2926 2927 od->od_type = DMU_OT_NONE; 2928 od->od_blocksize = 0; 2929 od->od_gen = 0; 2930 2931 (void) snprintf(od->od_name, sizeof (od->od_name), 2932 "%s(%"PRId64")[%"PRIu64"]", 2933 tag, id, index); 2934 } 2935 2936 /* 2937 * Lookup or create the objects for a test using the od template. 2938 * If the objects do not all exist, or if 'remove' is specified, 2939 * remove any existing objects and create new ones. Otherwise, 2940 * use the existing objects. 
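 * Returns 0 on success, or -1 if the objects could not be created.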
2941 */ 2942 static int 2943 ztest_object_init(ztest_ds_t *zd, ztest_od_t *od, size_t size, boolean_t remove) 2944 { 2945 int count = size / sizeof (*od); 2946 int rv = 0; 2947 2948 mutex_enter(&zd->zd_dirobj_lock); 2949 if ((ztest_lookup(zd, od, count) != 0 || remove) && 2950 (ztest_remove(zd, od, count) != 0 || 2951 ztest_create(zd, od, count) != 0)) 2952 rv = -1; 2953 zd->zd_od = od; 2954 mutex_exit(&zd->zd_dirobj_lock); 2955 2956 return (rv); 2957 } 2958 2959 void 2960 ztest_zil_commit(ztest_ds_t *zd, uint64_t id) 2961 { 2962 (void) id; 2963 zilog_t *zilog = zd->zd_zilog; 2964 2965 (void) pthread_rwlock_rdlock(&zd->zd_zilog_lock); 2966 2967 VERIFY0(zil_commit(zilog, ztest_random(ZTEST_OBJECTS))); 2968 2969 /* 2970 * Remember the committed values in zd, which is in parent/child 2971 * shared memory. If we die, the next iteration of ztest_run() 2972 * will verify that the log really does contain this record. 2973 */ 2974 mutex_enter(&zilog->zl_lock); 2975 ASSERT3P(zd->zd_shared, !=, NULL); 2976 ASSERT3U(zd->zd_shared->zd_seq, <=, zilog->zl_commit_lr_seq); 2977 zd->zd_shared->zd_seq = zilog->zl_commit_lr_seq; 2978 mutex_exit(&zilog->zl_lock); 2979 2980 (void) pthread_rwlock_unlock(&zd->zd_zilog_lock); 2981 } 2982 2983 /* 2984 * This function is designed to simulate the operations that occur during a 2985 * mount/unmount operation. We hold the dataset across these operations in an 2986 * attempt to expose any implicit assumptions about ZIL management. 2987 */ 2988 void 2989 ztest_zil_remount(ztest_ds_t *zd, uint64_t id) 2990 { 2991 (void) id; 2992 objset_t *os = zd->zd_os; 2993 2994 /* 2995 * We hold the ztest_vdev_lock so we don't cause problems with 2996 * other threads that wish to remove a log device, such as 2997 * ztest_device_removal(). 2998 */ 2999 mutex_enter(&ztest_vdev_lock); 3000 3001 /* 3002 * We grab the zd_dirobj_lock to ensure that no other thread is 3003 * updating the zil (i.e. adding in-memory log records) and the 3004 * zd_zilog_lock to block any I/O. 3005 */ 3006 mutex_enter(&zd->zd_dirobj_lock); 3007 (void) pthread_rwlock_wrlock(&zd->zd_zilog_lock); 3008 3009 /* zfsvfs_teardown() */ 3010 zil_close(zd->zd_zilog); 3011 3012 /* zfsvfs_setup() */ 3013 VERIFY3P(zil_open(os, ztest_get_data, NULL), ==, zd->zd_zilog); 3014 zil_replay(os, zd, ztest_replay_vector); 3015 3016 (void) pthread_rwlock_unlock(&zd->zd_zilog_lock); 3017 mutex_exit(&zd->zd_dirobj_lock); 3018 mutex_exit(&ztest_vdev_lock); 3019 } 3020 3021 /* 3022 * Verify that we can't destroy an active pool, create an existing pool, 3023 * or create a pool with a bad vdev spec. 3024 */ 3025 void 3026 ztest_spa_create_destroy(ztest_ds_t *zd, uint64_t id) 3027 { 3028 (void) zd, (void) id; 3029 ztest_shared_opts_t *zo = &ztest_opts; 3030 spa_t *spa; 3031 nvlist_t *nvroot; 3032 3033 if (zo->zo_mmp_test) 3034 return; 3035 3036 /* 3037 * Attempt to create using a bad file. 3038 */ 3039 nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, NULL, 0, 0, 1); 3040 VERIFY3U(ENOENT, ==, 3041 spa_create("ztest_bad_file", nvroot, NULL, NULL, NULL)); 3042 fnvlist_free(nvroot); 3043 3044 /* 3045 * Attempt to create using a bad mirror. 3046 */ 3047 nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, NULL, 0, 2, 1); 3048 VERIFY3U(ENOENT, ==, 3049 spa_create("ztest_bad_mirror", nvroot, NULL, NULL, NULL)); 3050 fnvlist_free(nvroot); 3051 3052 /* 3053 * Attempt to create an existing pool. It shouldn't matter 3054 * what's in the nvroot; we should fail with EEXIST. 
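 * (We reuse the same bogus device path as above; the name collision
 * alone should produce the error.)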
3055 */ 3056 (void) pthread_rwlock_rdlock(&ztest_name_lock); 3057 nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, NULL, 0, 0, 1); 3058 VERIFY3U(EEXIST, ==, 3059 spa_create(zo->zo_pool, nvroot, NULL, NULL, NULL)); 3060 fnvlist_free(nvroot); 3061 3062 /* 3063 * We open a reference to the spa and then we try to export it 3064 * expecting one of the following errors: 3065 * 3066 * EBUSY 3067 * Because of the reference we just opened. 3068 * 3069 * ZFS_ERR_EXPORT_IN_PROGRESS 3070 * For the case that there is another ztest thread doing 3071 * an export concurrently. 3072 */ 3073 VERIFY0(spa_open(zo->zo_pool, &spa, FTAG)); 3074 int error = spa_destroy(zo->zo_pool); 3075 if (error != EBUSY && error != ZFS_ERR_EXPORT_IN_PROGRESS) { 3076 fatal(B_FALSE, "spa_destroy(%s) returned unexpected value %d", 3077 spa->spa_name, error); 3078 } 3079 spa_close(spa, FTAG); 3080 3081 (void) pthread_rwlock_unlock(&ztest_name_lock); 3082 } 3083 3084 /* 3085 * Start and then stop the MMP threads to ensure the startup and shutdown code 3086 * works properly. Actual protection and property-related code tested via ZTS. 3087 */ 3088 void 3089 ztest_mmp_enable_disable(ztest_ds_t *zd, uint64_t id) 3090 { 3091 (void) zd, (void) id; 3092 ztest_shared_opts_t *zo = &ztest_opts; 3093 spa_t *spa = ztest_spa; 3094 3095 if (zo->zo_mmp_test) 3096 return; 3097 3098 /* 3099 * Since enabling MMP involves setting a property, it could not be done 3100 * while the pool is suspended. 3101 */ 3102 if (spa_suspended(spa)) 3103 return; 3104 3105 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 3106 mutex_enter(&spa->spa_props_lock); 3107 3108 zfs_multihost_fail_intervals = 0; 3109 3110 if (!spa_multihost(spa)) { 3111 spa->spa_multihost = B_TRUE; 3112 mmp_thread_start(spa); 3113 } 3114 3115 mutex_exit(&spa->spa_props_lock); 3116 spa_config_exit(spa, SCL_CONFIG, FTAG); 3117 3118 txg_wait_synced(spa_get_dsl(spa), 0); 3119 mmp_signal_all_threads(); 3120 txg_wait_synced(spa_get_dsl(spa), 0); 3121 3122 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 3123 mutex_enter(&spa->spa_props_lock); 3124 3125 if (spa_multihost(spa)) { 3126 mmp_thread_stop(spa); 3127 spa->spa_multihost = B_FALSE; 3128 } 3129 3130 mutex_exit(&spa->spa_props_lock); 3131 spa_config_exit(spa, SCL_CONFIG, FTAG); 3132 } 3133 3134 static int 3135 ztest_get_raidz_children(spa_t *spa) 3136 { 3137 (void) spa; 3138 vdev_t *raidvd; 3139 3140 ASSERT(MUTEX_HELD(&ztest_vdev_lock)); 3141 3142 if (ztest_opts.zo_raid_do_expand) { 3143 raidvd = ztest_spa->spa_root_vdev->vdev_child[0]; 3144 3145 ASSERT(raidvd->vdev_ops == &vdev_raidz_ops); 3146 3147 return (raidvd->vdev_children); 3148 } 3149 3150 return (ztest_opts.zo_raid_children); 3151 } 3152 3153 void 3154 ztest_spa_upgrade(ztest_ds_t *zd, uint64_t id) 3155 { 3156 (void) zd, (void) id; 3157 spa_t *spa; 3158 uint64_t initial_version = SPA_VERSION_INITIAL; 3159 uint64_t raidz_children, version, newversion; 3160 nvlist_t *nvroot, *props; 3161 char *name; 3162 3163 if (ztest_opts.zo_mmp_test) 3164 return; 3165 3166 /* dRAID added after feature flags, skip upgrade test. */ 3167 if (strcmp(ztest_opts.zo_raid_type, VDEV_TYPE_DRAID) == 0) 3168 return; 3169 3170 mutex_enter(&ztest_vdev_lock); 3171 name = kmem_asprintf("%s_upgrade", ztest_opts.zo_pool); 3172 3173 /* 3174 * Clean up from previous runs. 
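 * A pool with this name may have been left behind by an earlier
 * invocation, so destroy it and ignore any error.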
3175 */ 3176 (void) spa_destroy(name); 3177 3178 raidz_children = ztest_get_raidz_children(ztest_spa); 3179 3180 nvroot = make_vdev_root(NULL, NULL, name, ztest_opts.zo_vdev_size, 0, 3181 NULL, raidz_children, ztest_opts.zo_mirrors, 1); 3182 3183 /* 3184 * If we're configuring a RAIDZ device then make sure that the 3185 * initial version is capable of supporting that feature. 3186 */ 3187 switch (ztest_opts.zo_raid_parity) { 3188 case 0: 3189 case 1: 3190 initial_version = SPA_VERSION_INITIAL; 3191 break; 3192 case 2: 3193 initial_version = SPA_VERSION_RAIDZ2; 3194 break; 3195 case 3: 3196 initial_version = SPA_VERSION_RAIDZ3; 3197 break; 3198 } 3199 3200 /* 3201 * Create a pool with a spa version that can be upgraded. Pick 3202 * a value between initial_version and SPA_VERSION_BEFORE_FEATURES. 3203 */ 3204 do { 3205 version = ztest_random_spa_version(initial_version); 3206 } while (version > SPA_VERSION_BEFORE_FEATURES); 3207 3208 props = fnvlist_alloc(); 3209 fnvlist_add_uint64(props, 3210 zpool_prop_to_name(ZPOOL_PROP_VERSION), version); 3211 VERIFY0(spa_create(name, nvroot, props, NULL, NULL)); 3212 fnvlist_free(nvroot); 3213 fnvlist_free(props); 3214 3215 VERIFY0(spa_open(name, &spa, FTAG)); 3216 VERIFY3U(spa_version(spa), ==, version); 3217 newversion = ztest_random_spa_version(version + 1); 3218 3219 if (ztest_opts.zo_verbose >= 4) { 3220 (void) printf("upgrading spa version from " 3221 "%"PRIu64" to %"PRIu64"\n", 3222 version, newversion); 3223 } 3224 3225 spa_upgrade(spa, newversion); 3226 VERIFY3U(spa_version(spa), >, version); 3227 VERIFY3U(spa_version(spa), ==, fnvlist_lookup_uint64(spa->spa_config, 3228 zpool_prop_to_name(ZPOOL_PROP_VERSION))); 3229 spa_close(spa, FTAG); 3230 3231 kmem_strfree(name); 3232 mutex_exit(&ztest_vdev_lock); 3233 } 3234 3235 static void 3236 ztest_spa_checkpoint(spa_t *spa) 3237 { 3238 ASSERT(MUTEX_HELD(&ztest_checkpoint_lock)); 3239 3240 int error = spa_checkpoint(spa->spa_name); 3241 3242 switch (error) { 3243 case 0: 3244 case ZFS_ERR_DEVRM_IN_PROGRESS: 3245 case ZFS_ERR_DISCARDING_CHECKPOINT: 3246 case ZFS_ERR_CHECKPOINT_EXISTS: 3247 case ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS: 3248 break; 3249 case ENOSPC: 3250 ztest_record_enospc(FTAG); 3251 break; 3252 default: 3253 fatal(B_FALSE, "spa_checkpoint(%s) = %d", spa->spa_name, error); 3254 } 3255 } 3256 3257 static void 3258 ztest_spa_discard_checkpoint(spa_t *spa) 3259 { 3260 ASSERT(MUTEX_HELD(&ztest_checkpoint_lock)); 3261 3262 int error = spa_checkpoint_discard(spa->spa_name); 3263 3264 switch (error) { 3265 case 0: 3266 case ZFS_ERR_DISCARDING_CHECKPOINT: 3267 case ZFS_ERR_NO_CHECKPOINT: 3268 break; 3269 default: 3270 fatal(B_FALSE, "spa_discard_checkpoint(%s) = %d", 3271 spa->spa_name, error); 3272 } 3273 3274 } 3275 3276 void 3277 ztest_spa_checkpoint_create_discard(ztest_ds_t *zd, uint64_t id) 3278 { 3279 (void) zd, (void) id; 3280 spa_t *spa = ztest_spa; 3281 3282 mutex_enter(&ztest_checkpoint_lock); 3283 if (ztest_random(2) == 0) { 3284 ztest_spa_checkpoint(spa); 3285 } else { 3286 ztest_spa_discard_checkpoint(spa); 3287 } 3288 mutex_exit(&ztest_checkpoint_lock); 3289 } 3290 3291 3292 static vdev_t * 3293 vdev_lookup_by_path(vdev_t *vd, const char *path) 3294 { 3295 vdev_t *mvd; 3296 int c; 3297 3298 if (vd->vdev_path != NULL && strcmp(path, vd->vdev_path) == 0) 3299 return (vd); 3300 3301 for (c = 0; c < vd->vdev_children; c++) 3302 if ((mvd = vdev_lookup_by_path(vd->vdev_child[c], path)) != 3303 NULL) 3304 return (mvd); 3305 3306 return (NULL); 3307 } 3308 3309 static int 3310 
spa_num_top_vdevs(spa_t *spa) 3311 { 3312 vdev_t *rvd = spa->spa_root_vdev; 3313 ASSERT3U(spa_config_held(spa, SCL_VDEV, RW_READER), ==, SCL_VDEV); 3314 return (rvd->vdev_children); 3315 } 3316 3317 /* 3318 * Verify that vdev_add() works as expected. 3319 */ 3320 void 3321 ztest_vdev_add_remove(ztest_ds_t *zd, uint64_t id) 3322 { 3323 (void) zd, (void) id; 3324 ztest_shared_t *zs = ztest_shared; 3325 spa_t *spa = ztest_spa; 3326 uint64_t leaves; 3327 uint64_t guid; 3328 uint64_t raidz_children; 3329 3330 nvlist_t *nvroot; 3331 int error; 3332 3333 if (ztest_opts.zo_mmp_test) 3334 return; 3335 3336 mutex_enter(&ztest_vdev_lock); 3337 raidz_children = ztest_get_raidz_children(spa); 3338 leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * raidz_children; 3339 3340 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 3341 3342 ztest_shared->zs_vdev_next_leaf = spa_num_top_vdevs(spa) * leaves; 3343 3344 /* 3345 * If we have slogs then remove them 1/4 of the time. 3346 */ 3347 if (spa_has_slogs(spa) && ztest_random(4) == 0) { 3348 metaslab_group_t *mg; 3349 3350 /* 3351 * find the first real slog in log allocation class 3352 */ 3353 mg = spa_log_class(spa)->mc_allocator[0].mca_rotor; 3354 while (!mg->mg_vd->vdev_islog) 3355 mg = mg->mg_next; 3356 3357 guid = mg->mg_vd->vdev_guid; 3358 3359 spa_config_exit(spa, SCL_VDEV, FTAG); 3360 3361 /* 3362 * We have to grab the zs_name_lock as writer to 3363 * prevent a race between removing a slog (dmu_objset_find) 3364 * and destroying a dataset. Removing the slog will 3365 * grab a reference on the dataset which may cause 3366 * dsl_destroy_head() to fail with EBUSY thus 3367 * leaving the dataset in an inconsistent state. 3368 */ 3369 pthread_rwlock_wrlock(&ztest_name_lock); 3370 error = spa_vdev_remove(spa, guid, B_FALSE); 3371 pthread_rwlock_unlock(&ztest_name_lock); 3372 3373 switch (error) { 3374 case 0: 3375 case EEXIST: /* Generic zil_reset() error */ 3376 case EBUSY: /* Replay required */ 3377 case EACCES: /* Crypto key not loaded */ 3378 case ZFS_ERR_CHECKPOINT_EXISTS: 3379 case ZFS_ERR_DISCARDING_CHECKPOINT: 3380 break; 3381 default: 3382 fatal(B_FALSE, "spa_vdev_remove() = %d", error); 3383 } 3384 } else { 3385 spa_config_exit(spa, SCL_VDEV, FTAG); 3386 3387 /* 3388 * Make 1/4 of the devices be log devices 3389 */ 3390 nvroot = make_vdev_root(NULL, NULL, NULL, 3391 ztest_opts.zo_vdev_size, 0, (ztest_random(4) == 0) ? 3392 "log" : NULL, raidz_children, zs->zs_mirrors, 3393 1); 3394 3395 error = spa_vdev_add(spa, nvroot, B_FALSE); 3396 fnvlist_free(nvroot); 3397 3398 switch (error) { 3399 case 0: 3400 break; 3401 case ENOSPC: 3402 ztest_record_enospc("spa_vdev_add"); 3403 break; 3404 default: 3405 fatal(B_FALSE, "spa_vdev_add() = %d", error); 3406 } 3407 } 3408 3409 mutex_exit(&ztest_vdev_lock); 3410 } 3411 3412 void 3413 ztest_vdev_class_add(ztest_ds_t *zd, uint64_t id) 3414 { 3415 (void) zd, (void) id; 3416 ztest_shared_t *zs = ztest_shared; 3417 spa_t *spa = ztest_spa; 3418 uint64_t leaves; 3419 nvlist_t *nvroot; 3420 uint64_t raidz_children; 3421 const char *class = (ztest_random(2) == 0) ? 
3422 VDEV_ALLOC_BIAS_SPECIAL : VDEV_ALLOC_BIAS_DEDUP; 3423 int error; 3424 3425 /* 3426 * By default add a special vdev 50% of the time 3427 */ 3428 if ((ztest_opts.zo_special_vdevs == ZTEST_VDEV_CLASS_OFF) || 3429 (ztest_opts.zo_special_vdevs == ZTEST_VDEV_CLASS_RND && 3430 ztest_random(2) == 0)) { 3431 return; 3432 } 3433 3434 mutex_enter(&ztest_vdev_lock); 3435 3436 /* Only test with mirrors */ 3437 if (zs->zs_mirrors < 2) { 3438 mutex_exit(&ztest_vdev_lock); 3439 return; 3440 } 3441 3442 /* requires feature@allocation_classes */ 3443 if (!spa_feature_is_enabled(spa, SPA_FEATURE_ALLOCATION_CLASSES)) { 3444 mutex_exit(&ztest_vdev_lock); 3445 return; 3446 } 3447 3448 raidz_children = ztest_get_raidz_children(spa); 3449 leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * raidz_children; 3450 3451 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 3452 ztest_shared->zs_vdev_next_leaf = spa_num_top_vdevs(spa) * leaves; 3453 spa_config_exit(spa, SCL_VDEV, FTAG); 3454 3455 nvroot = make_vdev_root(NULL, NULL, NULL, ztest_opts.zo_vdev_size, 0, 3456 class, raidz_children, zs->zs_mirrors, 1); 3457 3458 error = spa_vdev_add(spa, nvroot, B_FALSE); 3459 fnvlist_free(nvroot); 3460 3461 if (error == ENOSPC) 3462 ztest_record_enospc("spa_vdev_add"); 3463 else if (error != 0) 3464 fatal(B_FALSE, "spa_vdev_add() = %d", error); 3465 3466 /* 3467 * 50% of the time allow small blocks in the special class 3468 */ 3469 if (error == 0 && 3470 spa_special_class(spa)->mc_groups == 1 && ztest_random(2) == 0) { 3471 if (ztest_opts.zo_verbose >= 3) 3472 (void) printf("Enabling special VDEV small blocks\n"); 3473 error = ztest_dsl_prop_set_uint64(zd->zd_name, 3474 ZFS_PROP_SPECIAL_SMALL_BLOCKS, 32768, B_FALSE); 3475 ASSERT(error == 0 || error == ENOSPC); 3476 } 3477 3478 mutex_exit(&ztest_vdev_lock); 3479 3480 if (ztest_opts.zo_verbose >= 3) { 3481 metaslab_class_t *mc; 3482 3483 if (strcmp(class, VDEV_ALLOC_BIAS_SPECIAL) == 0) 3484 mc = spa_special_class(spa); 3485 else 3486 mc = spa_dedup_class(spa); 3487 (void) printf("Added a %s mirrored vdev (of %d)\n", 3488 class, (int)mc->mc_groups); 3489 } 3490 } 3491 3492 /* 3493 * Verify that adding/removing aux devices (l2arc, hot spare) works as expected. 3494 */ 3495 void 3496 ztest_vdev_aux_add_remove(ztest_ds_t *zd, uint64_t id) 3497 { 3498 (void) zd, (void) id; 3499 ztest_shared_t *zs = ztest_shared; 3500 spa_t *spa = ztest_spa; 3501 vdev_t *rvd = spa->spa_root_vdev; 3502 spa_aux_vdev_t *sav; 3503 const char *aux; 3504 char *path; 3505 uint64_t guid = 0; 3506 int error, ignore_err = 0; 3507 3508 if (ztest_opts.zo_mmp_test) 3509 return; 3510 3511 path = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); 3512 3513 if (ztest_random(2) == 0) { 3514 sav = &spa->spa_spares; 3515 aux = ZPOOL_CONFIG_SPARES; 3516 } else { 3517 sav = &spa->spa_l2cache; 3518 aux = ZPOOL_CONFIG_L2CACHE; 3519 } 3520 3521 mutex_enter(&ztest_vdev_lock); 3522 3523 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 3524 3525 if (sav->sav_count != 0 && ztest_random(4) == 0) { 3526 /* 3527 * Pick a random device to remove. 3528 */ 3529 vdev_t *svd = sav->sav_vdevs[ztest_random(sav->sav_count)]; 3530 3531 /* dRAID spares cannot be removed; try anyways to see ENOTSUP */ 3532 if (strstr(svd->vdev_path, VDEV_TYPE_DRAID) != NULL) 3533 ignore_err = ENOTSUP; 3534 3535 guid = svd->vdev_guid; 3536 } else { 3537 /* 3538 * Find an unused device we can add. 
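 * Probe successive aux paths until we find one that is neither an
 * existing aux vdev nor part of the pool's vdev tree.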
3539 */ 3540 zs->zs_vdev_aux = 0; 3541 for (;;) { 3542 int c; 3543 (void) snprintf(path, MAXPATHLEN, ztest_aux_template, 3544 ztest_opts.zo_dir, ztest_opts.zo_pool, aux, 3545 zs->zs_vdev_aux); 3546 for (c = 0; c < sav->sav_count; c++) 3547 if (strcmp(sav->sav_vdevs[c]->vdev_path, 3548 path) == 0) 3549 break; 3550 if (c == sav->sav_count && 3551 vdev_lookup_by_path(rvd, path) == NULL) 3552 break; 3553 zs->zs_vdev_aux++; 3554 } 3555 } 3556 3557 spa_config_exit(spa, SCL_VDEV, FTAG); 3558 3559 if (guid == 0) { 3560 /* 3561 * Add a new device. 3562 */ 3563 nvlist_t *nvroot = make_vdev_root(NULL, aux, NULL, 3564 (ztest_opts.zo_vdev_size * 5) / 4, 0, NULL, 0, 0, 1); 3565 error = spa_vdev_add(spa, nvroot, B_FALSE); 3566 3567 switch (error) { 3568 case 0: 3569 break; 3570 default: 3571 fatal(B_FALSE, "spa_vdev_add(%p) = %d", nvroot, error); 3572 } 3573 fnvlist_free(nvroot); 3574 } else { 3575 /* 3576 * Remove an existing device. Sometimes, dirty its 3577 * vdev state first to make sure we handle removal 3578 * of devices that have pending state changes. 3579 */ 3580 if (ztest_random(2) == 0) 3581 (void) vdev_online(spa, guid, 0, NULL); 3582 3583 error = spa_vdev_remove(spa, guid, B_FALSE); 3584 3585 switch (error) { 3586 case 0: 3587 case EBUSY: 3588 case ZFS_ERR_CHECKPOINT_EXISTS: 3589 case ZFS_ERR_DISCARDING_CHECKPOINT: 3590 break; 3591 default: 3592 if (error != ignore_err) 3593 fatal(B_FALSE, 3594 "spa_vdev_remove(%"PRIu64") = %d", 3595 guid, error); 3596 } 3597 } 3598 3599 mutex_exit(&ztest_vdev_lock); 3600 3601 umem_free(path, MAXPATHLEN); 3602 } 3603 3604 /* 3605 * split a pool if it has mirror tlvdevs 3606 */ 3607 void 3608 ztest_split_pool(ztest_ds_t *zd, uint64_t id) 3609 { 3610 (void) zd, (void) id; 3611 ztest_shared_t *zs = ztest_shared; 3612 spa_t *spa = ztest_spa; 3613 vdev_t *rvd = spa->spa_root_vdev; 3614 nvlist_t *tree, **child, *config, *split, **schild; 3615 uint_t c, children, schildren = 0, lastlogid = 0; 3616 int error = 0; 3617 3618 if (ztest_opts.zo_mmp_test) 3619 return; 3620 3621 mutex_enter(&ztest_vdev_lock); 3622 3623 /* ensure we have a usable config; mirrors of raidz aren't supported */ 3624 if (zs->zs_mirrors < 3 || ztest_opts.zo_raid_children > 1) { 3625 mutex_exit(&ztest_vdev_lock); 3626 return; 3627 } 3628 3629 /* clean up the old pool, if any */ 3630 (void) spa_destroy("splitp"); 3631 3632 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 3633 3634 /* generate a config from the existing config */ 3635 mutex_enter(&spa->spa_props_lock); 3636 tree = fnvlist_lookup_nvlist(spa->spa_config, ZPOOL_CONFIG_VDEV_TREE); 3637 mutex_exit(&spa->spa_props_lock); 3638 3639 VERIFY0(nvlist_lookup_nvlist_array(tree, ZPOOL_CONFIG_CHILDREN, 3640 &child, &children)); 3641 3642 schild = umem_alloc(rvd->vdev_children * sizeof (nvlist_t *), 3643 UMEM_NOFAIL); 3644 for (c = 0; c < children; c++) { 3645 vdev_t *tvd = rvd->vdev_child[c]; 3646 nvlist_t **mchild; 3647 uint_t mchildren; 3648 3649 if (tvd->vdev_islog || tvd->vdev_ops == &vdev_hole_ops) { 3650 schild[schildren] = fnvlist_alloc(); 3651 fnvlist_add_string(schild[schildren], 3652 ZPOOL_CONFIG_TYPE, VDEV_TYPE_HOLE); 3653 fnvlist_add_uint64(schild[schildren], 3654 ZPOOL_CONFIG_IS_HOLE, 1); 3655 if (lastlogid == 0) 3656 lastlogid = schildren; 3657 ++schildren; 3658 continue; 3659 } 3660 lastlogid = 0; 3661 VERIFY0(nvlist_lookup_nvlist_array(child[c], 3662 ZPOOL_CONFIG_CHILDREN, &mchild, &mchildren)); 3663 schild[schildren++] = fnvlist_dup(mchild[0]); 3664 } 3665 3666 /* OK, create a config that can be used to split */ 3667 split = 
fnvlist_alloc(); 3668 fnvlist_add_string(split, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT); 3669 fnvlist_add_nvlist_array(split, ZPOOL_CONFIG_CHILDREN, 3670 (const nvlist_t **)schild, lastlogid != 0 ? lastlogid : schildren); 3671 3672 config = fnvlist_alloc(); 3673 fnvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, split); 3674 3675 for (c = 0; c < schildren; c++) 3676 fnvlist_free(schild[c]); 3677 umem_free(schild, rvd->vdev_children * sizeof (nvlist_t *)); 3678 fnvlist_free(split); 3679 3680 spa_config_exit(spa, SCL_VDEV, FTAG); 3681 3682 (void) pthread_rwlock_wrlock(&ztest_name_lock); 3683 error = spa_vdev_split_mirror(spa, "splitp", config, NULL, B_FALSE); 3684 (void) pthread_rwlock_unlock(&ztest_name_lock); 3685 3686 fnvlist_free(config); 3687 3688 if (error == 0) { 3689 (void) printf("successful split - results:\n"); 3690 spa_namespace_enter(FTAG); 3691 show_pool_stats(spa); 3692 show_pool_stats(spa_lookup("splitp")); 3693 spa_namespace_exit(FTAG); 3694 ++zs->zs_splits; 3695 --zs->zs_mirrors; 3696 } 3697 mutex_exit(&ztest_vdev_lock); 3698 } 3699 3700 /* 3701 * Verify that we can attach and detach devices. 3702 */ 3703 void 3704 ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id) 3705 { 3706 (void) zd, (void) id; 3707 ztest_shared_t *zs = ztest_shared; 3708 spa_t *spa = ztest_spa; 3709 spa_aux_vdev_t *sav = &spa->spa_spares; 3710 vdev_t *rvd = spa->spa_root_vdev; 3711 vdev_t *oldvd, *newvd, *pvd; 3712 nvlist_t *root; 3713 uint64_t leaves; 3714 uint64_t leaf, top; 3715 uint64_t ashift = ztest_get_ashift(); 3716 uint64_t oldguid, pguid; 3717 uint64_t oldsize, newsize; 3718 uint64_t raidz_children; 3719 char *oldpath, *newpath; 3720 int replacing; 3721 int oldvd_has_siblings = B_FALSE; 3722 int newvd_is_spare = B_FALSE; 3723 int newvd_is_dspare = B_FALSE; 3724 int oldvd_is_log; 3725 int oldvd_is_special; 3726 int error, expected_error; 3727 3728 if (ztest_opts.zo_mmp_test) 3729 return; 3730 3731 oldpath = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); 3732 newpath = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); 3733 3734 mutex_enter(&ztest_vdev_lock); 3735 raidz_children = ztest_get_raidz_children(spa); 3736 leaves = MAX(zs->zs_mirrors, 1) * raidz_children; 3737 3738 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3739 3740 /* 3741 * If a vdev is in the process of being removed, its removal may 3742 * finish while we are in progress, leading to an unexpected error 3743 * value. Don't bother trying to attach while we are in the middle 3744 * of removal. 3745 */ 3746 if (ztest_device_removal_active) { 3747 spa_config_exit(spa, SCL_ALL, FTAG); 3748 goto out; 3749 } 3750 3751 /* 3752 * RAIDZ leaf VDEV mirrors are not currently supported while a 3753 * RAIDZ expansion is in progress. 3754 */ 3755 if (ztest_opts.zo_raid_do_expand) { 3756 spa_config_exit(spa, SCL_ALL, FTAG); 3757 goto out; 3758 } 3759 3760 /* 3761 * Decide whether to do an attach or a replace. 3762 */ 3763 replacing = ztest_random(2); 3764 3765 /* 3766 * Pick a random top-level vdev. 3767 */ 3768 top = ztest_random_vdev_top(spa, B_TRUE); 3769 3770 /* 3771 * Pick a random leaf within it. 3772 */ 3773 leaf = ztest_random(leaves); 3774 3775 /* 3776 * Locate this vdev. 
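 * Starting from the chosen top-level vdev, descend through any mirror
 * and raidz layers until we reach the leaf to operate on.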
3777 */ 3778 oldvd = rvd->vdev_child[top]; 3779 3780 /* pick a child from the mirror */ 3781 if (zs->zs_mirrors >= 1) { 3782 ASSERT3P(oldvd->vdev_ops, ==, &vdev_mirror_ops); 3783 ASSERT3U(oldvd->vdev_children, >=, zs->zs_mirrors); 3784 oldvd = oldvd->vdev_child[leaf / raidz_children]; 3785 } 3786 3787 /* pick a child out of the raidz group */ 3788 if (ztest_opts.zo_raid_children > 1) { 3789 if (strcmp(oldvd->vdev_ops->vdev_op_type, "raidz") == 0) 3790 ASSERT3P(oldvd->vdev_ops, ==, &vdev_raidz_ops); 3791 else 3792 ASSERT3P(oldvd->vdev_ops, ==, &vdev_draid_ops); 3793 oldvd = oldvd->vdev_child[leaf % raidz_children]; 3794 } 3795 3796 /* 3797 * If we're already doing an attach or replace, oldvd may be a 3798 * mirror vdev -- in which case, pick a random child. 3799 */ 3800 while (oldvd->vdev_children != 0) { 3801 oldvd_has_siblings = B_TRUE; 3802 ASSERT3U(oldvd->vdev_children, >=, 2); 3803 oldvd = oldvd->vdev_child[ztest_random(oldvd->vdev_children)]; 3804 } 3805 3806 oldguid = oldvd->vdev_guid; 3807 oldsize = vdev_get_min_asize(oldvd); 3808 oldvd_is_log = oldvd->vdev_top->vdev_islog; 3809 oldvd_is_special = 3810 oldvd->vdev_top->vdev_alloc_bias == VDEV_BIAS_SPECIAL || 3811 oldvd->vdev_top->vdev_alloc_bias == VDEV_BIAS_DEDUP; 3812 (void) strlcpy(oldpath, oldvd->vdev_path, MAXPATHLEN); 3813 pvd = oldvd->vdev_parent; 3814 pguid = pvd->vdev_guid; 3815 3816 /* 3817 * If oldvd has siblings, then half of the time, detach it. Prior 3818 * to the detach the pool is scrubbed in order to prevent creating 3819 * unrepairable blocks as a result of the data corruption injection. 3820 */ 3821 if (oldvd_has_siblings && ztest_random(2) == 0) { 3822 spa_config_exit(spa, SCL_ALL, FTAG); 3823 3824 error = ztest_scrub_impl(spa); 3825 if (error) 3826 goto out; 3827 3828 error = spa_vdev_detach(spa, oldguid, pguid, B_FALSE); 3829 if (error != 0 && error != ENODEV && error != EBUSY && 3830 error != ENOTSUP && error != ZFS_ERR_CHECKPOINT_EXISTS && 3831 error != ZFS_ERR_DISCARDING_CHECKPOINT) 3832 fatal(B_FALSE, "detach (%s) returned %d", 3833 oldpath, error); 3834 goto out; 3835 } 3836 3837 /* 3838 * For the new vdev, choose with equal probability between the two 3839 * standard paths (ending in either 'a' or 'b') or a random hot spare. 3840 */ 3841 if (sav->sav_count != 0 && ztest_random(3) == 0) { 3842 newvd = sav->sav_vdevs[ztest_random(sav->sav_count)]; 3843 newvd_is_spare = B_TRUE; 3844 3845 if (newvd->vdev_ops == &vdev_draid_spare_ops) 3846 newvd_is_dspare = B_TRUE; 3847 3848 (void) strlcpy(newpath, newvd->vdev_path, MAXPATHLEN); 3849 } else { 3850 (void) snprintf(newpath, MAXPATHLEN, ztest_dev_template, 3851 ztest_opts.zo_dir, ztest_opts.zo_pool, 3852 top * leaves + leaf); 3853 if (ztest_random(2) == 0) 3854 newpath[strlen(newpath) - 1] = 'b'; 3855 newvd = vdev_lookup_by_path(rvd, newpath); 3856 } 3857 3858 if (newvd) { 3859 /* 3860 * Reopen to ensure the vdev's asize field isn't stale. 3861 */ 3862 vdev_reopen(newvd); 3863 newsize = vdev_get_min_asize(newvd); 3864 } else { 3865 /* 3866 * Make newsize a little bigger or smaller than oldsize. 3867 * If it's smaller, the attach should fail. 3868 * If it's larger, and we're doing a replace, 3869 * we should get dynamic LUN growth when we're done. 3870 */ 3871 newsize = 10 * oldsize / (9 + ztest_random(3)); 3872 } 3873 3874 /* 3875 * If pvd is not a mirror or root, the attach should fail with ENOTSUP, 3876 * unless it's a replace; in that case any non-replacing parent is OK. 3877 * 3878 * If newvd is already part of the pool, it should fail with EBUSY. 
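 *
 * If the requested ashift is larger than the top-level vdev's ashift,
 * it should fail with EDOM.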
3879 * 3880 * If newvd is too small, it should fail with EOVERFLOW. 3881 * 3882 * If newvd is a distributed spare and it's being attached to a 3883 * dRAID which is not its parent it should fail with ENOTSUP. 3884 */ 3885 if (pvd->vdev_ops != &vdev_mirror_ops && 3886 pvd->vdev_ops != &vdev_root_ops && (!replacing || 3887 pvd->vdev_ops == &vdev_replacing_ops || 3888 pvd->vdev_ops == &vdev_spare_ops)) 3889 expected_error = ENOTSUP; 3890 else if (newvd_is_spare && 3891 (!replacing || oldvd_is_log || oldvd_is_special)) 3892 expected_error = ENOTSUP; 3893 else if (newvd == oldvd) 3894 expected_error = replacing ? 0 : EBUSY; 3895 else if (vdev_lookup_by_path(rvd, newpath) != NULL) 3896 expected_error = EBUSY; 3897 else if (!newvd_is_dspare && newsize < oldsize) 3898 expected_error = EOVERFLOW; 3899 else if (ashift > oldvd->vdev_top->vdev_ashift) 3900 expected_error = EDOM; 3901 else if (newvd_is_dspare && pvd != vdev_draid_spare_get_parent(newvd)) 3902 expected_error = ENOTSUP; 3903 else 3904 expected_error = 0; 3905 3906 spa_config_exit(spa, SCL_ALL, FTAG); 3907 3908 /* 3909 * Build the nvlist describing newpath. 3910 */ 3911 root = make_vdev_root(newpath, NULL, NULL, newvd == NULL ? newsize : 0, 3912 ashift, NULL, 0, 0, 1); 3913 3914 /* 3915 * When supported select either a healing or sequential resilver. 3916 */ 3917 boolean_t rebuilding = B_FALSE; 3918 if (pvd->vdev_ops == &vdev_mirror_ops || 3919 pvd->vdev_ops == &vdev_root_ops) { 3920 rebuilding = !!ztest_random(2); 3921 } 3922 3923 error = spa_vdev_attach(spa, oldguid, root, replacing, rebuilding); 3924 3925 fnvlist_free(root); 3926 3927 /* 3928 * If our parent was the replacing vdev, but the replace completed, 3929 * then instead of failing with ENOTSUP we may either succeed, 3930 * fail with ENODEV, or fail with EOVERFLOW. 3931 */ 3932 if (expected_error == ENOTSUP && 3933 (error == 0 || error == ENODEV || error == EOVERFLOW)) 3934 expected_error = error; 3935 3936 /* 3937 * If someone grew the LUN, the replacement may be too small. 
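 * Accept EOVERFLOW (and EBUSY) here rather than treating them as
 * test failures.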
3938 */ 3939 if (error == EOVERFLOW || error == EBUSY) 3940 expected_error = error; 3941 3942 if (error == ZFS_ERR_CHECKPOINT_EXISTS || 3943 error == ZFS_ERR_DISCARDING_CHECKPOINT || 3944 error == ZFS_ERR_RESILVER_IN_PROGRESS || 3945 error == ZFS_ERR_REBUILD_IN_PROGRESS) 3946 expected_error = error; 3947 3948 if (error != expected_error && expected_error != EBUSY) { 3949 fatal(B_FALSE, "attach (%s %"PRIu64", %s %"PRIu64", %d) " 3950 "returned %d, expected %d", 3951 oldpath, oldsize, newpath, 3952 newsize, replacing, error, expected_error); 3953 } 3954 out: 3955 mutex_exit(&ztest_vdev_lock); 3956 3957 umem_free(oldpath, MAXPATHLEN); 3958 umem_free(newpath, MAXPATHLEN); 3959 } 3960 3961 static void 3962 raidz_scratch_verify(void) 3963 { 3964 spa_t *spa; 3965 uint64_t write_size, logical_size, offset; 3966 raidz_reflow_scratch_state_t state; 3967 vdev_raidz_expand_t *vre; 3968 vdev_t *raidvd; 3969 3970 ASSERT(raidz_expand_pause_point == RAIDZ_EXPAND_PAUSE_NONE); 3971 3972 if (ztest_scratch_state->zs_raidz_scratch_verify_pause == 0) 3973 return; 3974 3975 kernel_init(SPA_MODE_READ); 3976 3977 spa_namespace_enter(FTAG); 3978 spa = spa_lookup(ztest_opts.zo_pool); 3979 ASSERT(spa); 3980 spa->spa_import_flags |= ZFS_IMPORT_SKIP_MMP; 3981 spa_namespace_exit(FTAG); 3982 3983 VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG)); 3984 3985 ASSERT3U(RRSS_GET_OFFSET(&spa->spa_uberblock), !=, UINT64_MAX); 3986 3987 mutex_enter(&ztest_vdev_lock); 3988 3989 spa_config_enter(spa, SCL_ALL, FTAG, RW_READER); 3990 3991 vre = spa->spa_raidz_expand; 3992 if (vre == NULL) 3993 goto out; 3994 3995 raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); 3996 offset = RRSS_GET_OFFSET(&spa->spa_uberblock); 3997 state = RRSS_GET_STATE(&spa->spa_uberblock); 3998 write_size = P2ALIGN_TYPED(VDEV_BOOT_SIZE, 1 << raidvd->vdev_ashift, 3999 uint64_t); 4000 logical_size = write_size * raidvd->vdev_children; 4001 4002 switch (state) { 4003 /* 4004 * Initial state of reflow process. RAIDZ expansion was 4005 * requested by user, but scratch object was not created. 4006 */ 4007 case RRSS_SCRATCH_NOT_IN_USE: 4008 ASSERT0(offset); 4009 break; 4010 4011 /* 4012 * Scratch object was synced and stored in boot area. 4013 */ 4014 case RRSS_SCRATCH_VALID: 4015 4016 /* 4017 * Scratch object was synced back to raidz start offset, 4018 * raidz is ready for sector by sector reflow process. 4019 */ 4020 case RRSS_SCRATCH_INVALID_SYNCED: 4021 4022 /* 4023 * Scratch object was synced back to raidz start offset 4024 * on zpool importing, raidz is ready for sector by sector 4025 * reflow process. 4026 */ 4027 case RRSS_SCRATCH_INVALID_SYNCED_ON_IMPORT: 4028 ASSERT3U(offset, ==, logical_size); 4029 break; 4030 4031 /* 4032 * Sector by sector reflow process started. 4033 */ 4034 case RRSS_SCRATCH_INVALID_SYNCED_REFLOW: 4035 ASSERT3U(offset, >=, logical_size); 4036 break; 4037 } 4038 4039 out: 4040 spa_config_exit(spa, SCL_ALL, FTAG); 4041 4042 mutex_exit(&ztest_vdev_lock); 4043 4044 ztest_scratch_state->zs_raidz_scratch_verify_pause = 0; 4045 4046 spa_close(spa, FTAG); 4047 kernel_fini(); 4048 } 4049 4050 static void 4051 ztest_scratch_thread(void *arg) 4052 { 4053 (void) arg; 4054 4055 /* wait up to 10 seconds */ 4056 for (int t = 100; t > 0; t -= 1) { 4057 if (raidz_expand_pause_point == RAIDZ_EXPAND_PAUSE_NONE) 4058 thread_exit(); 4059 4060 (void) poll(NULL, 0, 100); 4061 } 4062 4063 /* killed when the scratch area progress reached a certain point */ 4064 ztest_kill(ztest_shared); 4065 } 4066 4067 /* 4068 * Verify that we can attach raidz device. 
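 * This exercises RAIDZ expansion.  Sometimes a reflow pause point is
 * set and the child process is killed so that raidz_scratch_verify()
 * can check scratch area consistency when the pool is next imported.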
4069 */ 4070 void 4071 ztest_vdev_raidz_attach(ztest_ds_t *zd, uint64_t id) 4072 { 4073 (void) zd, (void) id; 4074 ztest_shared_t *zs = ztest_shared; 4075 spa_t *spa = ztest_spa; 4076 uint64_t leaves, raidz_children, newsize, ashift = ztest_get_ashift(); 4077 kthread_t *scratch_thread = NULL; 4078 vdev_t *newvd, *pvd; 4079 nvlist_t *root; 4080 char *newpath = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); 4081 int error, expected_error = 0; 4082 4083 mutex_enter(&ztest_vdev_lock); 4084 4085 spa_config_enter(spa, SCL_ALL, FTAG, RW_READER); 4086 4087 /* Only allow attach when raid-kind = 'eraidz' */ 4088 if (!ztest_opts.zo_raid_do_expand) { 4089 spa_config_exit(spa, SCL_ALL, FTAG); 4090 goto out; 4091 } 4092 4093 if (ztest_opts.zo_mmp_test) { 4094 spa_config_exit(spa, SCL_ALL, FTAG); 4095 goto out; 4096 } 4097 4098 if (ztest_device_removal_active) { 4099 spa_config_exit(spa, SCL_ALL, FTAG); 4100 goto out; 4101 } 4102 4103 pvd = vdev_lookup_top(spa, 0); 4104 4105 ASSERT(pvd->vdev_ops == &vdev_raidz_ops); 4106 4107 /* 4108 * Get size of a child of the raidz group, 4109 * make sure device is a bit bigger 4110 */ 4111 newvd = pvd->vdev_child[ztest_random(pvd->vdev_children)]; 4112 newsize = 10 * vdev_get_min_asize(newvd) / (9 + ztest_random(2)); 4113 4114 /* 4115 * Get next attached leaf id 4116 */ 4117 raidz_children = ztest_get_raidz_children(spa); 4118 leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * raidz_children; 4119 zs->zs_vdev_next_leaf = spa_num_top_vdevs(spa) * leaves; 4120 4121 if (spa->spa_raidz_expand) 4122 expected_error = ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS; 4123 4124 spa_config_exit(spa, SCL_ALL, FTAG); 4125 4126 /* 4127 * Path to vdev to be attached 4128 */ 4129 (void) snprintf(newpath, MAXPATHLEN, ztest_dev_template, 4130 ztest_opts.zo_dir, ztest_opts.zo_pool, zs->zs_vdev_next_leaf); 4131 4132 /* 4133 * Build the nvlist describing newpath. 4134 */ 4135 root = make_vdev_root(newpath, NULL, NULL, newsize, ashift, NULL, 4136 0, 0, 1); 4137 4138 /* 4139 * 50% of the time, set raidz_expand_pause_point to cause 4140 * raidz_reflow_scratch_sync() to pause at a certain point and 4141 * then kill the test after 10 seconds so raidz_scratch_verify() 4142 * can confirm consistency when the pool is imported. 4143 */ 4144 if (ztest_random(2) == 0 && expected_error == 0) { 4145 raidz_expand_pause_point = 4146 ztest_random(RAIDZ_EXPAND_PAUSE_SCRATCH_POST_REFLOW_2) + 1; 4147 scratch_thread = thread_create(NULL, 0, ztest_scratch_thread, 4148 ztest_shared, 0, NULL, TS_RUN | TS_JOINABLE, defclsyspri); 4149 } 4150 4151 error = spa_vdev_attach(spa, pvd->vdev_guid, root, B_FALSE, B_FALSE); 4152 4153 nvlist_free(root); 4154 4155 if (error == EOVERFLOW || error == ENXIO || 4156 error == ZFS_ERR_CHECKPOINT_EXISTS || 4157 error == ZFS_ERR_DISCARDING_CHECKPOINT) 4158 expected_error = error; 4159 4160 if (error != 0 && error != expected_error) { 4161 fatal(0, "raidz attach (%s %"PRIu64") returned %d, expected %d", 4162 newpath, newsize, error, expected_error); 4163 } 4164 4165 if (raidz_expand_pause_point) { 4166 if (error != 0) { 4167 /* 4168 * Do not verify scratch object in case of error 4169 * returned by vdev attaching. 
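 * Resetting the pause point also lets ztest_scratch_thread() notice and
 * exit on its own instead of killing the process.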
4170 */ 4171 raidz_expand_pause_point = RAIDZ_EXPAND_PAUSE_NONE; 4172 } 4173 4174 VERIFY0(thread_join(scratch_thread)); 4175 } 4176 out: 4177 mutex_exit(&ztest_vdev_lock); 4178 4179 umem_free(newpath, MAXPATHLEN); 4180 } 4181 4182 void 4183 ztest_device_removal(ztest_ds_t *zd, uint64_t id) 4184 { 4185 (void) zd, (void) id; 4186 spa_t *spa = ztest_spa; 4187 vdev_t *vd; 4188 uint64_t guid; 4189 int error; 4190 4191 mutex_enter(&ztest_vdev_lock); 4192 4193 if (ztest_device_removal_active) { 4194 mutex_exit(&ztest_vdev_lock); 4195 return; 4196 } 4197 4198 /* 4199 * Remove a random top-level vdev and wait for removal to finish. 4200 */ 4201 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 4202 vd = vdev_lookup_top(spa, ztest_random_vdev_top(spa, B_FALSE)); 4203 guid = vd->vdev_guid; 4204 spa_config_exit(spa, SCL_VDEV, FTAG); 4205 4206 error = spa_vdev_remove(spa, guid, B_FALSE); 4207 if (error == 0) { 4208 ztest_device_removal_active = B_TRUE; 4209 mutex_exit(&ztest_vdev_lock); 4210 4211 /* 4212 * spa->spa_vdev_removal is created in a sync task that 4213 * is initiated via dsl_sync_task_nowait(). Since the 4214 * task may not run before spa_vdev_remove() returns, we 4215 * must wait at least 1 txg to ensure that the removal 4216 * struct has been created. 4217 */ 4218 txg_wait_synced(spa_get_dsl(spa), 0); 4219 4220 while (spa->spa_removing_phys.sr_state == DSS_SCANNING) 4221 txg_wait_synced(spa_get_dsl(spa), 0); 4222 } else { 4223 mutex_exit(&ztest_vdev_lock); 4224 return; 4225 } 4226 4227 /* 4228 * The pool needs to be scrubbed after completing device removal. 4229 * Failure to do so may result in checksum errors due to the 4230 * strategy employed by ztest_fault_inject() when selecting which 4231 * offset are redundant and can be damaged. 4232 */ 4233 error = spa_scan(spa, POOL_SCAN_SCRUB); 4234 if (error == 0) { 4235 while (dsl_scan_scrubbing(spa_get_dsl(spa))) 4236 txg_wait_synced(spa_get_dsl(spa), 0); 4237 } 4238 4239 mutex_enter(&ztest_vdev_lock); 4240 ztest_device_removal_active = B_FALSE; 4241 mutex_exit(&ztest_vdev_lock); 4242 } 4243 4244 /* 4245 * Callback function which expands the physical size of the vdev. 4246 */ 4247 static vdev_t * 4248 grow_vdev(vdev_t *vd, void *arg) 4249 { 4250 spa_t *spa __maybe_unused = vd->vdev_spa; 4251 size_t *newsize = arg; 4252 size_t fsize; 4253 int fd; 4254 4255 ASSERT3S(spa_config_held(spa, SCL_STATE, RW_READER), ==, SCL_STATE); 4256 ASSERT(vd->vdev_ops->vdev_op_leaf); 4257 4258 if ((fd = open(vd->vdev_path, O_RDWR)) == -1) 4259 return (vd); 4260 4261 fsize = lseek(fd, 0, SEEK_END); 4262 VERIFY0(ftruncate(fd, *newsize)); 4263 4264 if (ztest_opts.zo_verbose >= 6) { 4265 (void) printf("%s grew from %lu to %lu bytes\n", 4266 vd->vdev_path, (ulong_t)fsize, (ulong_t)*newsize); 4267 } 4268 (void) close(fd); 4269 return (NULL); 4270 } 4271 4272 /* 4273 * Callback function which expands a given vdev by calling vdev_online(). 
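 *
 * Like grow_vdev() above, this returns NULL on success and the vdev itself
 * on failure, so vdev_walk_tree() stops walking as soon as one leaf cannot
 * be expanded.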
4274 */ 4275 static vdev_t * 4276 online_vdev(vdev_t *vd, void *arg) 4277 { 4278 (void) arg; 4279 spa_t *spa = vd->vdev_spa; 4280 vdev_t *tvd = vd->vdev_top; 4281 uint64_t guid = vd->vdev_guid; 4282 uint64_t generation = spa->spa_config_generation + 1; 4283 vdev_state_t newstate = VDEV_STATE_UNKNOWN; 4284 int error; 4285 4286 ASSERT3S(spa_config_held(spa, SCL_STATE, RW_READER), ==, SCL_STATE); 4287 ASSERT(vd->vdev_ops->vdev_op_leaf); 4288 4289 /* Calling vdev_online will initialize the new metaslabs */ 4290 spa_config_exit(spa, SCL_STATE, spa); 4291 error = vdev_online(spa, guid, ZFS_ONLINE_EXPAND, &newstate); 4292 spa_config_enter(spa, SCL_STATE, spa, RW_READER); 4293 4294 /* 4295 * If vdev_online returned an error or the underlying vdev_open 4296 * failed then we abort the expand. The only way to know that 4297 * vdev_open fails is by checking the returned newstate. 4298 */ 4299 if (error || newstate != VDEV_STATE_HEALTHY) { 4300 if (ztest_opts.zo_verbose >= 5) { 4301 (void) printf("Unable to expand vdev, state %u, " 4302 "error %d\n", newstate, error); 4303 } 4304 return (vd); 4305 } 4306 ASSERT3U(newstate, ==, VDEV_STATE_HEALTHY); 4307 4308 /* 4309 * Since we dropped the lock we need to ensure that we're 4310 * still talking to the original vdev. It's possible this 4311 * vdev may have been detached/replaced while we were 4312 * trying to online it. 4313 */ 4314 if (generation != spa->spa_config_generation) { 4315 if (ztest_opts.zo_verbose >= 5) { 4316 (void) printf("vdev configuration has changed, " 4317 "guid %"PRIu64", state %"PRIu64", " 4318 "expected gen %"PRIu64", got gen %"PRIu64"\n", 4319 guid, 4320 tvd->vdev_state, 4321 generation, 4322 spa->spa_config_generation); 4323 } 4324 return (vd); 4325 } 4326 return (NULL); 4327 } 4328 4329 /* 4330 * Traverse the vdev tree calling the supplied function. 4331 * We continue to walk the tree until we either have walked all 4332 * children or we receive a non-NULL return from the callback. 4333 * If a NULL callback is passed, then we just return back the first 4334 * leaf vdev we encounter. 4335 */ 4336 static vdev_t * 4337 vdev_walk_tree(vdev_t *vd, vdev_t *(*func)(vdev_t *, void *), void *arg) 4338 { 4339 uint_t c; 4340 4341 if (vd->vdev_ops->vdev_op_leaf) { 4342 if (func == NULL) 4343 return (vd); 4344 else 4345 return (func(vd, arg)); 4346 } 4347 4348 for (c = 0; c < vd->vdev_children; c++) { 4349 vdev_t *cvd = vd->vdev_child[c]; 4350 if ((cvd = vdev_walk_tree(cvd, func, arg)) != NULL) 4351 return (cvd); 4352 } 4353 return (NULL); 4354 } 4355 4356 /* 4357 * Verify that dynamic LUN growth works as expected. 4358 */ 4359 void 4360 ztest_vdev_LUN_growth(ztest_ds_t *zd, uint64_t id) 4361 { 4362 (void) zd, (void) id; 4363 spa_t *spa = ztest_spa; 4364 vdev_t *vd, *tvd; 4365 metaslab_class_t *mc; 4366 metaslab_group_t *mg; 4367 size_t psize, newsize; 4368 uint64_t top; 4369 uint64_t old_class_space, new_class_space, old_ms_count, new_ms_count; 4370 4371 mutex_enter(&ztest_checkpoint_lock); 4372 mutex_enter(&ztest_vdev_lock); 4373 spa_config_enter(spa, SCL_STATE, spa, RW_READER); 4374 4375 /* 4376 * If there is a vdev removal in progress, it could complete while 4377 * we are running, in which case we would not be able to verify 4378 * that the metaslab_class space increased (because it decreases 4379 * when the device removal completes). 
4380 */ 4381 if (ztest_device_removal_active) { 4382 spa_config_exit(spa, SCL_STATE, spa); 4383 mutex_exit(&ztest_vdev_lock); 4384 mutex_exit(&ztest_checkpoint_lock); 4385 return; 4386 } 4387 4388 /* 4389 * If we are under raidz expansion, the test can fail because the 4390 * metaslab count will not increase immediately after the vdev is 4391 * expanded. That only happens once the raidz expansion completes. 4392 */ 4393 if (spa->spa_raidz_expand) { 4394 spa_config_exit(spa, SCL_STATE, spa); 4395 mutex_exit(&ztest_vdev_lock); 4396 mutex_exit(&ztest_checkpoint_lock); 4397 return; 4398 } 4399 4400 top = ztest_random_vdev_top(spa, B_TRUE); 4401 4402 tvd = spa->spa_root_vdev->vdev_child[top]; 4403 mg = tvd->vdev_mg; 4404 mc = mg->mg_class; 4405 old_ms_count = tvd->vdev_ms_count; 4406 old_class_space = metaslab_class_get_space(mc); 4407 4408 /* 4409 * Determine the size of the first leaf vdev associated with 4410 * our top-level device. 4411 */ 4412 vd = vdev_walk_tree(tvd, NULL, NULL); 4413 ASSERT3P(vd, !=, NULL); 4414 ASSERT(vd->vdev_ops->vdev_op_leaf); 4415 4416 psize = vd->vdev_psize; 4417 4418 /* 4419 * We only try to expand the vdev if it's healthy, less than 4x its 4420 * original size, and it has a valid psize. 4421 */ 4422 if (tvd->vdev_state != VDEV_STATE_HEALTHY || 4423 psize == 0 || psize >= 4 * ztest_opts.zo_vdev_size) { 4424 spa_config_exit(spa, SCL_STATE, spa); 4425 mutex_exit(&ztest_vdev_lock); 4426 mutex_exit(&ztest_checkpoint_lock); 4427 return; 4428 } 4429 ASSERT3U(psize, >, 0); 4430 newsize = psize + MAX(psize / 8, SPA_MAXBLOCKSIZE); 4431 ASSERT3U(newsize, >, psize); 4432 4433 if (ztest_opts.zo_verbose >= 6) { 4434 (void) printf("Expanding LUN %s from %lu to %lu\n", 4435 vd->vdev_path, (ulong_t)psize, (ulong_t)newsize); 4436 } 4437 4438 /* 4439 * Growing the vdev is a two-step process: 4440 * 1). expand the physical size (i.e. relabel) 4441 * 2). online the vdev to create the new metaslabs 4442 */ 4443 if (vdev_walk_tree(tvd, grow_vdev, &newsize) != NULL || 4444 vdev_walk_tree(tvd, online_vdev, NULL) != NULL || 4445 tvd->vdev_state != VDEV_STATE_HEALTHY) { 4446 if (ztest_opts.zo_verbose >= 5) { 4447 (void) printf("Could not expand LUN because " 4448 "the vdev configuration changed.\n"); 4449 } 4450 spa_config_exit(spa, SCL_STATE, spa); 4451 mutex_exit(&ztest_vdev_lock); 4452 mutex_exit(&ztest_checkpoint_lock); 4453 return; 4454 } 4455 4456 spa_config_exit(spa, SCL_STATE, spa); 4457 4458 /* 4459 * Expanding the LUN will update the config asynchronously, 4460 * thus we must wait for the async thread to complete any 4461 * pending tasks before proceeding. 4462 */ 4463 for (;;) { 4464 boolean_t done; 4465 mutex_enter(&spa->spa_async_lock); 4466 done = (spa->spa_async_thread == NULL && !spa->spa_async_tasks); 4467 mutex_exit(&spa->spa_async_lock); 4468 if (done) 4469 break; 4470 txg_wait_synced(spa_get_dsl(spa), 0); 4471 (void) poll(NULL, 0, 100); 4472 } 4473 4474 spa_config_enter(spa, SCL_STATE, spa, RW_READER); 4475 4476 tvd = spa->spa_root_vdev->vdev_child[top]; 4477 new_ms_count = tvd->vdev_ms_count; 4478 new_class_space = metaslab_class_get_space(mc); 4479 4480 if (tvd->vdev_mg != mg || mg->mg_class != mc) { 4481 if (ztest_opts.zo_verbose >= 5) { 4482 (void) printf("Could not verify LUN expansion due to " 4483 "intervening vdev offline or remove.\n"); 4484 } 4485 spa_config_exit(spa, SCL_STATE, spa); 4486 mutex_exit(&ztest_vdev_lock); 4487 mutex_exit(&ztest_checkpoint_lock); 4488 return; 4489 } 4490 4491 /* 4492 * Make sure we were able to grow the vdev.
4493 */ 4494 if (new_ms_count <= old_ms_count) { 4495 fatal(B_FALSE, 4496 "LUN expansion failed: ms_count %"PRIu64" < %"PRIu64"\n", 4497 old_ms_count, new_ms_count); 4498 } 4499 4500 /* 4501 * Make sure we were able to grow the pool. 4502 */ 4503 if (new_class_space <= old_class_space) { 4504 fatal(B_FALSE, 4505 "LUN expansion failed: class_space %"PRIu64" < %"PRIu64"\n", 4506 old_class_space, new_class_space); 4507 } 4508 4509 if (ztest_opts.zo_verbose >= 5) { 4510 char oldnumbuf[NN_NUMBUF_SZ], newnumbuf[NN_NUMBUF_SZ]; 4511 4512 nicenum(old_class_space, oldnumbuf, sizeof (oldnumbuf)); 4513 nicenum(new_class_space, newnumbuf, sizeof (newnumbuf)); 4514 (void) printf("%s grew from %s to %s\n", 4515 spa->spa_name, oldnumbuf, newnumbuf); 4516 } 4517 4518 spa_config_exit(spa, SCL_STATE, spa); 4519 mutex_exit(&ztest_vdev_lock); 4520 mutex_exit(&ztest_checkpoint_lock); 4521 } 4522 4523 /* 4524 * Verify that dmu_objset_{create,destroy,open,close} work as expected. 4525 */ 4526 static void 4527 ztest_objset_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx) 4528 { 4529 (void) arg, (void) cr; 4530 4531 /* 4532 * Create the objects common to all ztest datasets. 4533 */ 4534 VERIFY0(zap_create_claim(os, ZTEST_DIROBJ, 4535 DMU_OT_ZAP_OTHER, DMU_OT_NONE, 0, tx)); 4536 } 4537 4538 static int 4539 ztest_dataset_create(char *dsname) 4540 { 4541 int err; 4542 uint64_t rand; 4543 dsl_crypto_params_t *dcp = NULL; 4544 4545 /* 4546 * 50% of the time, we create encrypted datasets 4547 * using a random cipher suite and a hard-coded 4548 * wrapping key. 4549 */ 4550 rand = ztest_random(2); 4551 if (rand != 0) { 4552 nvlist_t *crypto_args = fnvlist_alloc(); 4553 nvlist_t *props = fnvlist_alloc(); 4554 4555 /* slight bias towards the default cipher suite */ 4556 rand = ztest_random(ZIO_CRYPT_FUNCTIONS); 4557 if (rand < ZIO_CRYPT_AES_128_CCM) 4558 rand = ZIO_CRYPT_ON; 4559 4560 fnvlist_add_uint64(props, 4561 zfs_prop_to_name(ZFS_PROP_ENCRYPTION), rand); 4562 fnvlist_add_uint8_array(crypto_args, "wkeydata", 4563 (uint8_t *)ztest_wkeydata, WRAPPING_KEY_LEN); 4564 4565 /* 4566 * These parameters aren't really used by the kernel. They 4567 * are simply stored so that userspace knows how to load 4568 * the wrapping key. 4569 */ 4570 fnvlist_add_uint64(props, 4571 zfs_prop_to_name(ZFS_PROP_KEYFORMAT), ZFS_KEYFORMAT_RAW); 4572 fnvlist_add_string(props, 4573 zfs_prop_to_name(ZFS_PROP_KEYLOCATION), "prompt"); 4574 fnvlist_add_uint64(props, 4575 zfs_prop_to_name(ZFS_PROP_PBKDF2_SALT), 0ULL); 4576 fnvlist_add_uint64(props, 4577 zfs_prop_to_name(ZFS_PROP_PBKDF2_ITERS), 0ULL); 4578 4579 VERIFY0(dsl_crypto_params_create_nvlist(DCP_CMD_NONE, props, 4580 crypto_args, &dcp)); 4581 4582 /* 4583 * Cycle through all available encryption implementations 4584 * to verify interoperability. 
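 *
 * The 'cycle' selection asks the ICP to rotate through every compiled-in
 * AES/GCM implementation on successive operations, so data encrypted with
 * one implementation is routinely decrypted with another.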
4585 */ 4586 VERIFY0(gcm_impl_set("cycle")); 4587 VERIFY0(aes_impl_set("cycle")); 4588 4589 fnvlist_free(crypto_args); 4590 fnvlist_free(props); 4591 } 4592 4593 err = dmu_objset_create(dsname, DMU_OST_OTHER, 0, dcp, 4594 ztest_objset_create_cb, NULL); 4595 dsl_crypto_params_free(dcp, !!err); 4596 4597 rand = ztest_random(100); 4598 if (err || rand < 80) 4599 return (err); 4600 4601 if (ztest_opts.zo_verbose >= 5) 4602 (void) printf("Setting dataset %s to sync always\n", dsname); 4603 return (ztest_dsl_prop_set_uint64(dsname, ZFS_PROP_SYNC, 4604 ZFS_SYNC_ALWAYS, B_FALSE)); 4605 } 4606 4607 static int 4608 ztest_objset_destroy_cb(const char *name, void *arg) 4609 { 4610 (void) arg; 4611 objset_t *os; 4612 dmu_object_info_t doi; 4613 int error; 4614 4615 /* 4616 * Verify that the dataset contains a directory object. 4617 */ 4618 VERIFY0(ztest_dmu_objset_own(name, DMU_OST_OTHER, B_TRUE, 4619 B_TRUE, FTAG, &os)); 4620 error = dmu_object_info(os, ZTEST_DIROBJ, &doi); 4621 if (error != ENOENT) { 4622 /* We could have crashed in the middle of destroying it */ 4623 ASSERT0(error); 4624 ASSERT3U(doi.doi_type, ==, DMU_OT_ZAP_OTHER); 4625 ASSERT3S(doi.doi_physical_blocks_512, >=, 0); 4626 } 4627 dmu_objset_disown(os, B_TRUE, FTAG); 4628 4629 /* 4630 * Destroy the dataset. 4631 */ 4632 if (strchr(name, '@') != NULL) { 4633 error = dsl_destroy_snapshot(name, B_TRUE); 4634 if (error != ECHRNG) { 4635 /* 4636 * The program was executed, but encountered a runtime 4637 * error, such as insufficient slop, or a hold on the 4638 * dataset. 4639 */ 4640 ASSERT0(error); 4641 } 4642 } else { 4643 error = dsl_destroy_head(name); 4644 if (error == ENOSPC) { 4645 /* There could be checkpoint or insufficient slop */ 4646 ztest_record_enospc(FTAG); 4647 } else if (error != EBUSY) { 4648 /* There could be a hold on this dataset */ 4649 ASSERT0(error); 4650 } 4651 } 4652 return (0); 4653 } 4654 4655 static boolean_t 4656 ztest_snapshot_create(char *osname, uint64_t id) 4657 { 4658 char snapname[ZFS_MAX_DATASET_NAME_LEN]; 4659 int error; 4660 4661 (void) snprintf(snapname, sizeof (snapname), "%"PRIu64"", id); 4662 4663 error = dmu_objset_snapshot_one(osname, snapname); 4664 if (error == ENOSPC) { 4665 ztest_record_enospc(FTAG); 4666 return (B_FALSE); 4667 } 4668 if (error != 0 && error != EEXIST && error != ECHRNG) { 4669 fatal(B_FALSE, "ztest_snapshot_create(%s@%s) = %d", osname, 4670 snapname, error); 4671 } 4672 return (B_TRUE); 4673 } 4674 4675 static boolean_t 4676 ztest_snapshot_destroy(char *osname, uint64_t id) 4677 { 4678 char snapname[ZFS_MAX_DATASET_NAME_LEN]; 4679 int error; 4680 4681 (void) snprintf(snapname, sizeof (snapname), "%s@%"PRIu64"", 4682 osname, id); 4683 4684 error = dsl_destroy_snapshot(snapname, B_FALSE); 4685 if (error != 0 && error != ENOENT && error != ECHRNG) 4686 fatal(B_FALSE, "ztest_snapshot_destroy(%s) = %d", 4687 snapname, error); 4688 return (B_TRUE); 4689 } 4690 4691 void 4692 ztest_dmu_objset_create_destroy(ztest_ds_t *zd, uint64_t id) 4693 { 4694 (void) zd; 4695 ztest_ds_t *zdtmp; 4696 int iters; 4697 int error; 4698 objset_t *os, *os2; 4699 char name[ZFS_MAX_DATASET_NAME_LEN]; 4700 zilog_t *zilog; 4701 int i; 4702 4703 zdtmp = umem_alloc(sizeof (ztest_ds_t), UMEM_NOFAIL); 4704 4705 (void) pthread_rwlock_rdlock(&ztest_name_lock); 4706 4707 (void) snprintf(name, sizeof (name), "%s/temp_%"PRIu64"", 4708 ztest_opts.zo_pool, id); 4709 4710 /* 4711 * If this dataset exists from a previous run, process its replay log 4712 * half of the time. 
If we don't replay it, then dsl_destroy_head() 4713 * (invoked from ztest_objset_destroy_cb()) should just throw it away. 4714 */ 4715 if (ztest_random(2) == 0 && 4716 ztest_dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, 4717 B_TRUE, FTAG, &os) == 0) { 4718 ztest_zd_init(zdtmp, NULL, os); 4719 zil_replay(os, zdtmp, ztest_replay_vector); 4720 ztest_zd_fini(zdtmp); 4721 dmu_objset_disown(os, B_TRUE, FTAG); 4722 } 4723 4724 /* 4725 * There may be an old instance of the dataset we're about to 4726 * create lying around from a previous run. If so, destroy it 4727 * and all of its snapshots. 4728 */ 4729 (void) dmu_objset_find(name, ztest_objset_destroy_cb, NULL, 4730 DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS); 4731 4732 /* 4733 * Verify that the destroyed dataset is no longer in the namespace. 4734 * It may still be present if the destroy above fails with ENOSPC. 4735 */ 4736 error = ztest_dmu_objset_own(name, DMU_OST_OTHER, B_TRUE, B_TRUE, 4737 FTAG, &os); 4738 if (error == 0) { 4739 dmu_objset_disown(os, B_TRUE, FTAG); 4740 ztest_record_enospc(FTAG); 4741 goto out; 4742 } 4743 VERIFY3U(ENOENT, ==, error); 4744 4745 /* 4746 * Verify that we can create a new dataset. 4747 */ 4748 error = ztest_dataset_create(name); 4749 if (error) { 4750 if (error == ENOSPC) { 4751 ztest_record_enospc(FTAG); 4752 goto out; 4753 } 4754 fatal(B_FALSE, "dmu_objset_create(%s) = %d", name, error); 4755 } 4756 4757 VERIFY0(ztest_dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, B_TRUE, 4758 FTAG, &os)); 4759 4760 ztest_zd_init(zdtmp, NULL, os); 4761 4762 /* 4763 * Open the intent log for it. 4764 */ 4765 zilog = zil_open(os, ztest_get_data, NULL); 4766 4767 /* 4768 * Put some objects in there, do a little I/O to them, 4769 * and randomly take a couple of snapshots along the way. 4770 */ 4771 iters = ztest_random(5); 4772 for (i = 0; i < iters; i++) { 4773 ztest_dmu_object_alloc_free(zdtmp, id); 4774 if (ztest_random(iters) == 0) 4775 (void) ztest_snapshot_create(name, i); 4776 } 4777 4778 /* 4779 * Verify that we cannot create an existing dataset. 4780 */ 4781 VERIFY3U(EEXIST, ==, 4782 dmu_objset_create(name, DMU_OST_OTHER, 0, NULL, NULL, NULL)); 4783 4784 /* 4785 * Verify that we can hold an objset that is also owned. 4786 */ 4787 VERIFY0(dmu_objset_hold(name, FTAG, &os2)); 4788 dmu_objset_rele(os2, FTAG); 4789 4790 /* 4791 * Verify that we cannot own an objset that is already owned. 4792 */ 4793 VERIFY3U(EBUSY, ==, ztest_dmu_objset_own(name, DMU_OST_OTHER, 4794 B_FALSE, B_TRUE, FTAG, &os2)); 4795 4796 zil_close(zilog); 4797 dmu_objset_disown(os, B_TRUE, FTAG); 4798 ztest_zd_fini(zdtmp); 4799 out: 4800 (void) pthread_rwlock_unlock(&ztest_name_lock); 4801 4802 umem_free(zdtmp, sizeof (ztest_ds_t)); 4803 } 4804 4805 /* 4806 * Verify that dmu_snapshot_{create,destroy,open,close} work as expected. 4807 */ 4808 void 4809 ztest_dmu_snapshot_create_destroy(ztest_ds_t *zd, uint64_t id) 4810 { 4811 (void) pthread_rwlock_rdlock(&ztest_name_lock); 4812 (void) ztest_snapshot_destroy(zd->zd_name, id); 4813 (void) ztest_snapshot_create(zd->zd_name, id); 4814 (void) pthread_rwlock_unlock(&ztest_name_lock); 4815 } 4816 4817 /* 4818 * Cleanup non-standard snapshots and clones. 
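 * This is called both before and after the promote test below, so each
 * destroy must tolerate ENOENT for names that were never created.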
4819 */ 4820 static void 4821 ztest_dsl_dataset_cleanup(char *osname, uint64_t id) 4822 { 4823 char *snap1name; 4824 char *clone1name; 4825 char *snap2name; 4826 char *clone2name; 4827 char *snap3name; 4828 int error; 4829 4830 snap1name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4831 clone1name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4832 snap2name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4833 clone2name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4834 snap3name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4835 4836 (void) snprintf(snap1name, ZFS_MAX_DATASET_NAME_LEN, "%s@s1_%"PRIu64"", 4837 osname, id); 4838 (void) snprintf(clone1name, ZFS_MAX_DATASET_NAME_LEN, "%s/c1_%"PRIu64"", 4839 osname, id); 4840 (void) snprintf(snap2name, ZFS_MAX_DATASET_NAME_LEN, "%s@s2_%"PRIu64"", 4841 clone1name, id); 4842 (void) snprintf(clone2name, ZFS_MAX_DATASET_NAME_LEN, "%s/c2_%"PRIu64"", 4843 osname, id); 4844 (void) snprintf(snap3name, ZFS_MAX_DATASET_NAME_LEN, "%s@s3_%"PRIu64"", 4845 clone1name, id); 4846 4847 error = dsl_destroy_head(clone2name); 4848 if (error && error != ENOENT) 4849 fatal(B_FALSE, "dsl_destroy_head(%s) = %d", clone2name, error); 4850 error = dsl_destroy_snapshot(snap3name, B_FALSE); 4851 if (error && error != ENOENT) 4852 fatal(B_FALSE, "dsl_destroy_snapshot(%s) = %d", 4853 snap3name, error); 4854 error = dsl_destroy_snapshot(snap2name, B_FALSE); 4855 if (error && error != ENOENT) 4856 fatal(B_FALSE, "dsl_destroy_snapshot(%s) = %d", 4857 snap2name, error); 4858 error = dsl_destroy_head(clone1name); 4859 if (error && error != ENOENT) 4860 fatal(B_FALSE, "dsl_destroy_head(%s) = %d", clone1name, error); 4861 error = dsl_destroy_snapshot(snap1name, B_FALSE); 4862 if (error && error != ENOENT) 4863 fatal(B_FALSE, "dsl_destroy_snapshot(%s) = %d", 4864 snap1name, error); 4865 4866 umem_free(snap1name, ZFS_MAX_DATASET_NAME_LEN); 4867 umem_free(clone1name, ZFS_MAX_DATASET_NAME_LEN); 4868 umem_free(snap2name, ZFS_MAX_DATASET_NAME_LEN); 4869 umem_free(clone2name, ZFS_MAX_DATASET_NAME_LEN); 4870 umem_free(snap3name, ZFS_MAX_DATASET_NAME_LEN); 4871 } 4872 4873 /* 4874 * Verify dsl_dataset_promote handles EBUSY 4875 */ 4876 void 4877 ztest_dsl_dataset_promote_busy(ztest_ds_t *zd, uint64_t id) 4878 { 4879 objset_t *os; 4880 char *snap1name; 4881 char *clone1name; 4882 char *snap2name; 4883 char *clone2name; 4884 char *snap3name; 4885 char *osname = zd->zd_name; 4886 int error; 4887 4888 snap1name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4889 clone1name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4890 snap2name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4891 clone2name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4892 snap3name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4893 4894 (void) pthread_rwlock_rdlock(&ztest_name_lock); 4895 4896 ztest_dsl_dataset_cleanup(osname, id); 4897 4898 (void) snprintf(snap1name, ZFS_MAX_DATASET_NAME_LEN, "%s@s1_%"PRIu64"", 4899 osname, id); 4900 (void) snprintf(clone1name, ZFS_MAX_DATASET_NAME_LEN, "%s/c1_%"PRIu64"", 4901 osname, id); 4902 (void) snprintf(snap2name, ZFS_MAX_DATASET_NAME_LEN, "%s@s2_%"PRIu64"", 4903 clone1name, id); 4904 (void) snprintf(clone2name, ZFS_MAX_DATASET_NAME_LEN, "%s/c2_%"PRIu64"", 4905 osname, id); 4906 (void) snprintf(snap3name, ZFS_MAX_DATASET_NAME_LEN, "%s@s3_%"PRIu64"", 4907 clone1name, id); 4908 4909 error = dmu_objset_snapshot_one(osname, strchr(snap1name, '@') + 1); 4910 if (error && error != EEXIST) { 4911 if (error == ENOSPC) { 
4912 ztest_record_enospc(FTAG); 4913 goto out; 4914 } 4915 fatal(B_FALSE, "dmu_take_snapshot(%s) = %d", snap1name, error); 4916 } 4917 4918 error = dsl_dataset_clone(clone1name, snap1name); 4919 if (error) { 4920 if (error == ENOSPC) { 4921 ztest_record_enospc(FTAG); 4922 goto out; 4923 } 4924 fatal(B_FALSE, "dmu_objset_create(%s) = %d", clone1name, error); 4925 } 4926 4927 error = dmu_objset_snapshot_one(clone1name, strchr(snap2name, '@') + 1); 4928 if (error && error != EEXIST) { 4929 if (error == ENOSPC) { 4930 ztest_record_enospc(FTAG); 4931 goto out; 4932 } 4933 fatal(B_FALSE, "dmu_open_snapshot(%s) = %d", snap2name, error); 4934 } 4935 4936 error = dmu_objset_snapshot_one(clone1name, strchr(snap3name, '@') + 1); 4937 if (error && error != EEXIST) { 4938 if (error == ENOSPC) { 4939 ztest_record_enospc(FTAG); 4940 goto out; 4941 } 4942 fatal(B_FALSE, "dmu_open_snapshot(%s) = %d", snap3name, error); 4943 } 4944 4945 error = dsl_dataset_clone(clone2name, snap3name); 4946 if (error) { 4947 if (error == ENOSPC) { 4948 ztest_record_enospc(FTAG); 4949 goto out; 4950 } 4951 fatal(B_FALSE, "dmu_objset_create(%s) = %d", clone2name, error); 4952 } 4953 4954 error = ztest_dmu_objset_own(snap2name, DMU_OST_ANY, B_TRUE, B_TRUE, 4955 FTAG, &os); 4956 if (error) 4957 fatal(B_FALSE, "dmu_objset_own(%s) = %d", snap2name, error); 4958 error = dsl_dataset_promote(clone2name, NULL); 4959 if (error == ENOSPC) { 4960 dmu_objset_disown(os, B_TRUE, FTAG); 4961 ztest_record_enospc(FTAG); 4962 goto out; 4963 } 4964 if (error != EBUSY) 4965 fatal(B_FALSE, "dsl_dataset_promote(%s), %d, not EBUSY", 4966 clone2name, error); 4967 dmu_objset_disown(os, B_TRUE, FTAG); 4968 4969 out: 4970 ztest_dsl_dataset_cleanup(osname, id); 4971 4972 (void) pthread_rwlock_unlock(&ztest_name_lock); 4973 4974 umem_free(snap1name, ZFS_MAX_DATASET_NAME_LEN); 4975 umem_free(clone1name, ZFS_MAX_DATASET_NAME_LEN); 4976 umem_free(snap2name, ZFS_MAX_DATASET_NAME_LEN); 4977 umem_free(clone2name, ZFS_MAX_DATASET_NAME_LEN); 4978 umem_free(snap3name, ZFS_MAX_DATASET_NAME_LEN); 4979 } 4980 4981 #undef OD_ARRAY_SIZE 4982 #define OD_ARRAY_SIZE 4 4983 4984 /* 4985 * Verify that dmu_object_{alloc,free} work as expected. 4986 */ 4987 void 4988 ztest_dmu_object_alloc_free(ztest_ds_t *zd, uint64_t id) 4989 { 4990 ztest_od_t *od; 4991 int batchsize; 4992 int size; 4993 int b; 4994 4995 size = sizeof (ztest_od_t) * OD_ARRAY_SIZE; 4996 od = umem_alloc(size, UMEM_NOFAIL); 4997 batchsize = OD_ARRAY_SIZE; 4998 4999 for (b = 0; b < batchsize; b++) 5000 ztest_od_init(od + b, id, FTAG, b, DMU_OT_UINT64_OTHER, 5001 0, 0, 0); 5002 5003 /* 5004 * Destroy the previous batch of objects, create a new batch, 5005 * and do some I/O on the new objects. 5006 */ 5007 if (ztest_object_init(zd, od, size, B_TRUE) != 0) { 5008 zd->zd_od = NULL; 5009 umem_free(od, size); 5010 return; 5011 } 5012 5013 while (ztest_random(4 * batchsize) != 0) 5014 ztest_io(zd, od[ztest_random(batchsize)].od_object, 5015 ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT); 5016 5017 umem_free(od, size); 5018 } 5019 5020 /* 5021 * Rewind the global allocator to verify object allocation backfilling. 5022 */ 5023 void 5024 ztest_dmu_object_next_chunk(ztest_ds_t *zd, uint64_t id) 5025 { 5026 (void) id; 5027 objset_t *os = zd->zd_os; 5028 uint_t dnodes_per_chunk = 1 << dmu_object_alloc_chunk_shift; 5029 uint64_t object; 5030 5031 /* 5032 * Rewind the global allocator randomly back to a lower object number 5033 * to force backfilling and reclamation of recently freed dnodes. 
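 * os_obj_next_chunk is only an allocation hint, so moving it backwards
 * should be harmless; dmu_object_alloc() simply searches forward again,
 * skipping any chunks that are still fully in use.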
5034 */ 5035 mutex_enter(&os->os_obj_lock); 5036 object = ztest_random(os->os_obj_next_chunk); 5037 os->os_obj_next_chunk = P2ALIGN_TYPED(object, dnodes_per_chunk, 5038 uint64_t); 5039 mutex_exit(&os->os_obj_lock); 5040 } 5041 5042 #undef OD_ARRAY_SIZE 5043 #define OD_ARRAY_SIZE 2 5044 5045 /* 5046 * Verify that dmu_{read,write} work as expected. 5047 */ 5048 void 5049 ztest_dmu_read_write(ztest_ds_t *zd, uint64_t id) 5050 { 5051 int size; 5052 ztest_od_t *od; 5053 5054 objset_t *os = zd->zd_os; 5055 size = sizeof (ztest_od_t) * OD_ARRAY_SIZE; 5056 od = umem_alloc(size, UMEM_NOFAIL); 5057 dmu_tx_t *tx; 5058 int freeit, error; 5059 uint64_t i, n, s, txg; 5060 bufwad_t *packbuf, *bigbuf, *pack, *bigH, *bigT; 5061 uint64_t packobj, packoff, packsize, bigobj, bigoff, bigsize; 5062 uint64_t chunksize = (1000 + ztest_random(1000)) * sizeof (uint64_t); 5063 uint64_t regions = 997; 5064 uint64_t stride = 123456789ULL; 5065 uint64_t width = 40; 5066 int free_percent = 5; 5067 dmu_flags_t dmu_read_flags = DMU_READ_PREFETCH; 5068 5069 /* 5070 * We will randomly set when to do O_DIRECT on a read. 5071 */ 5072 if (ztest_random(4) == 0) 5073 dmu_read_flags |= DMU_DIRECTIO; 5074 5075 /* 5076 * This test uses two objects, packobj and bigobj, that are always 5077 * updated together (i.e. in the same tx) so that their contents are 5078 * in sync and can be compared. Their contents relate to each other 5079 * in a simple way: packobj is a dense array of 'bufwad' structures, 5080 * while bigobj is a sparse array of the same bufwads. Specifically, 5081 * for any index n, there are three bufwads that should be identical: 5082 * 5083 * packobj, at offset n * sizeof (bufwad_t) 5084 * bigobj, at the head of the nth chunk 5085 * bigobj, at the tail of the nth chunk 5086 * 5087 * The chunk size is arbitrary. It doesn't have to be a power of two, 5088 * and it doesn't have any relation to the object blocksize. 5089 * The only requirement is that it can hold at least two bufwads. 5090 * 5091 * Normally, we write the bufwad to each of these locations. 5092 * However, free_percent of the time we instead write zeroes to 5093 * packobj and perform a dmu_free_range() on bigobj. By comparing 5094 * bigobj to packobj, we can verify that the DMU is correctly 5095 * tracking which parts of an object are allocated and free, 5096 * and that the contents of the allocated blocks are correct. 5097 */ 5098 5099 /* 5100 * Read the directory info. If it's the first time, set things up. 5101 */ 5102 ztest_od_init(od, id, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0, chunksize); 5103 ztest_od_init(od + 1, id, FTAG, 1, DMU_OT_UINT64_OTHER, 0, 0, 5104 chunksize); 5105 5106 if (ztest_object_init(zd, od, size, B_FALSE) != 0) { 5107 umem_free(od, size); 5108 return; 5109 } 5110 5111 bigobj = od[0].od_object; 5112 packobj = od[1].od_object; 5113 chunksize = od[0].od_gen; 5114 ASSERT3U(chunksize, ==, od[1].od_gen); 5115 5116 /* 5117 * Prefetch a random chunk of the big object. 5118 * Our aim here is to get some async reads in flight 5119 * for blocks that we may free below; the DMU should 5120 * handle this race correctly. 5121 */ 5122 n = ztest_random(regions) * stride + ztest_random(width); 5123 s = 1 + ztest_random(2 * width - 1); 5124 dmu_prefetch(os, bigobj, 0, n * chunksize, s * chunksize, 5125 ZIO_PRIORITY_SYNC_READ); 5126 5127 /* 5128 * Pick a random index and compute the offsets into packobj and bigobj. 
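 *
 * For example (hypothetical values), n = 1000 and s = 3 would give
 * packoff = 1000 * sizeof (bufwad_t), packsize = 3 * sizeof (bufwad_t),
 * bigoff = 1000 * chunksize and bigsize = 3 * chunksize -- the same three
 * indices viewed through both objects.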
5129 */ 5130 n = ztest_random(regions) * stride + ztest_random(width); 5131 s = 1 + ztest_random(width - 1); 5132 5133 packoff = n * sizeof (bufwad_t); 5134 packsize = s * sizeof (bufwad_t); 5135 5136 bigoff = n * chunksize; 5137 bigsize = s * chunksize; 5138 5139 packbuf = umem_alloc(packsize, UMEM_NOFAIL); 5140 bigbuf = umem_alloc(bigsize, UMEM_NOFAIL); 5141 5142 /* 5143 * free_percent of the time, free a range of bigobj rather than 5144 * overwriting it. 5145 */ 5146 freeit = (ztest_random(100) < free_percent); 5147 5148 /* 5149 * Read the current contents of our objects. 5150 */ 5151 error = dmu_read(os, packobj, packoff, packsize, packbuf, 5152 dmu_read_flags); 5153 ASSERT0(error); 5154 error = dmu_read(os, bigobj, bigoff, bigsize, bigbuf, 5155 dmu_read_flags); 5156 ASSERT0(error); 5157 5158 /* 5159 * Get a tx for the mods to both packobj and bigobj. 5160 */ 5161 tx = dmu_tx_create(os); 5162 5163 dmu_tx_hold_write(tx, packobj, packoff, packsize); 5164 5165 if (freeit) 5166 dmu_tx_hold_free(tx, bigobj, bigoff, bigsize); 5167 else 5168 dmu_tx_hold_write(tx, bigobj, bigoff, bigsize); 5169 5170 /* This accounts for setting the checksum/compression. */ 5171 dmu_tx_hold_bonus(tx, bigobj); 5172 5173 txg = ztest_tx_assign(tx, DMU_TX_MIGHTWAIT, FTAG); 5174 if (txg == 0) { 5175 umem_free(packbuf, packsize); 5176 umem_free(bigbuf, bigsize); 5177 umem_free(od, size); 5178 return; 5179 } 5180 5181 enum zio_checksum cksum; 5182 do { 5183 cksum = (enum zio_checksum) 5184 ztest_random_dsl_prop(ZFS_PROP_CHECKSUM); 5185 } while (cksum >= ZIO_CHECKSUM_LEGACY_FUNCTIONS); 5186 dmu_object_set_checksum(os, bigobj, cksum, tx); 5187 5188 enum zio_compress comp; 5189 do { 5190 comp = (enum zio_compress) 5191 ztest_random_dsl_prop(ZFS_PROP_COMPRESSION); 5192 } while (comp >= ZIO_COMPRESS_LEGACY_FUNCTIONS); 5193 dmu_object_set_compress(os, bigobj, comp, tx); 5194 5195 /* 5196 * For each index from n to n + s, verify that the existing bufwad 5197 * in packobj matches the bufwads at the head and tail of the 5198 * corresponding chunk in bigobj. Then update all three bufwads 5199 * with the new values we want to write out. 5200 */ 5201 for (i = 0; i < s; i++) { 5202 /* LINTED */ 5203 pack = (bufwad_t *)((char *)packbuf + i * sizeof (bufwad_t)); 5204 /* LINTED */ 5205 bigH = (bufwad_t *)((char *)bigbuf + i * chunksize); 5206 /* LINTED */ 5207 bigT = (bufwad_t *)((char *)bigH + chunksize) - 1; 5208 5209 ASSERT3U((uintptr_t)bigH - (uintptr_t)bigbuf, <, bigsize); 5210 ASSERT3U((uintptr_t)bigT - (uintptr_t)bigbuf, <, bigsize); 5211 5212 if (pack->bw_txg > txg) 5213 fatal(B_FALSE, 5214 "future leak: got %"PRIx64", open txg is %"PRIx64"", 5215 pack->bw_txg, txg); 5216 5217 if (pack->bw_data != 0 && pack->bw_index != n + i) 5218 fatal(B_FALSE, "wrong index: " 5219 "got %"PRIx64", wanted %"PRIx64"+%"PRIx64"", 5220 pack->bw_index, n, i); 5221 5222 if (memcmp(pack, bigH, sizeof (bufwad_t)) != 0) 5223 fatal(B_FALSE, "pack/bigH mismatch in %p/%p", 5224 pack, bigH); 5225 5226 if (memcmp(pack, bigT, sizeof (bufwad_t)) != 0) 5227 fatal(B_FALSE, "pack/bigT mismatch in %p/%p", 5228 pack, bigT); 5229 5230 if (freeit) { 5231 memset(pack, 0, sizeof (bufwad_t)); 5232 } else { 5233 pack->bw_index = n + i; 5234 pack->bw_txg = txg; 5235 pack->bw_data = 1 + ztest_random(-2ULL); 5236 } 5237 *bigH = *pack; 5238 *bigT = *pack; 5239 } 5240 5241 /* 5242 * We've verified all the old bufwads, and made new ones. 5243 * Now write them out. 
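 * packobj is always rewritten with dmu_write(); bigobj is either freed
 * with dmu_free_range() (the freeit case) or rewritten with the same data.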
5244 */ 5245 dmu_write(os, packobj, packoff, packsize, packbuf, tx, 5246 DMU_READ_PREFETCH); 5247 5248 if (freeit) { 5249 if (ztest_opts.zo_verbose >= 7) { 5250 (void) printf("freeing offset %"PRIx64" size %"PRIx64"" 5251 " txg %"PRIx64"\n", 5252 bigoff, bigsize, txg); 5253 } 5254 VERIFY0(dmu_free_range(os, bigobj, bigoff, bigsize, tx)); 5255 } else { 5256 if (ztest_opts.zo_verbose >= 7) { 5257 (void) printf("writing offset %"PRIx64" size %"PRIx64"" 5258 " txg %"PRIx64"\n", 5259 bigoff, bigsize, txg); 5260 } 5261 dmu_write(os, bigobj, bigoff, bigsize, bigbuf, tx, 5262 DMU_READ_PREFETCH); 5263 } 5264 5265 dmu_tx_commit(tx); 5266 5267 /* 5268 * Sanity check the stuff we just wrote. 5269 */ 5270 { 5271 void *packcheck = umem_alloc(packsize, UMEM_NOFAIL); 5272 void *bigcheck = umem_alloc(bigsize, UMEM_NOFAIL); 5273 5274 VERIFY0(dmu_read(os, packobj, packoff, 5275 packsize, packcheck, dmu_read_flags)); 5276 VERIFY0(dmu_read(os, bigobj, bigoff, 5277 bigsize, bigcheck, dmu_read_flags)); 5278 5279 ASSERT0(memcmp(packbuf, packcheck, packsize)); 5280 ASSERT0(memcmp(bigbuf, bigcheck, bigsize)); 5281 5282 umem_free(packcheck, packsize); 5283 umem_free(bigcheck, bigsize); 5284 } 5285 5286 umem_free(packbuf, packsize); 5287 umem_free(bigbuf, bigsize); 5288 umem_free(od, size); 5289 } 5290 5291 static void 5292 compare_and_update_pbbufs(uint64_t s, bufwad_t *packbuf, bufwad_t *bigbuf, 5293 uint64_t bigsize, uint64_t n, uint64_t chunksize, uint64_t txg) 5294 { 5295 uint64_t i; 5296 bufwad_t *pack; 5297 bufwad_t *bigH; 5298 bufwad_t *bigT; 5299 5300 /* 5301 * For each index from n to n + s, verify that the existing bufwad 5302 * in packobj matches the bufwads at the head and tail of the 5303 * corresponding chunk in bigobj. Then update all three bufwads 5304 * with the new values we want to write out. 
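 *
 * (This is the same check-and-update loop that ztest_dmu_read_write()
 * performs inline; it is factored out here for the zcopy test below.)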
5305 */ 5306 for (i = 0; i < s; i++) { 5307 /* LINTED */ 5308 pack = (bufwad_t *)((char *)packbuf + i * sizeof (bufwad_t)); 5309 /* LINTED */ 5310 bigH = (bufwad_t *)((char *)bigbuf + i * chunksize); 5311 /* LINTED */ 5312 bigT = (bufwad_t *)((char *)bigH + chunksize) - 1; 5313 5314 ASSERT3U((uintptr_t)bigH - (uintptr_t)bigbuf, <, bigsize); 5315 ASSERT3U((uintptr_t)bigT - (uintptr_t)bigbuf, <, bigsize); 5316 5317 if (pack->bw_txg > txg) 5318 fatal(B_FALSE, 5319 "future leak: got %"PRIx64", open txg is %"PRIx64"", 5320 pack->bw_txg, txg); 5321 5322 if (pack->bw_data != 0 && pack->bw_index != n + i) 5323 fatal(B_FALSE, "wrong index: " 5324 "got %"PRIx64", wanted %"PRIx64"+%"PRIx64"", 5325 pack->bw_index, n, i); 5326 5327 if (memcmp(pack, bigH, sizeof (bufwad_t)) != 0) 5328 fatal(B_FALSE, "pack/bigH mismatch in %p/%p", 5329 pack, bigH); 5330 5331 if (memcmp(pack, bigT, sizeof (bufwad_t)) != 0) 5332 fatal(B_FALSE, "pack/bigT mismatch in %p/%p", 5333 pack, bigT); 5334 5335 pack->bw_index = n + i; 5336 pack->bw_txg = txg; 5337 pack->bw_data = 1 + ztest_random(-2ULL); 5338 5339 *bigH = *pack; 5340 *bigT = *pack; 5341 } 5342 } 5343 5344 #undef OD_ARRAY_SIZE 5345 #define OD_ARRAY_SIZE 2 5346 5347 void 5348 ztest_dmu_read_write_zcopy(ztest_ds_t *zd, uint64_t id) 5349 { 5350 objset_t *os = zd->zd_os; 5351 ztest_od_t *od; 5352 dmu_tx_t *tx; 5353 uint64_t i; 5354 int error; 5355 int size; 5356 uint64_t n, s, txg; 5357 bufwad_t *packbuf, *bigbuf; 5358 uint64_t packobj, packoff, packsize, bigobj, bigoff, bigsize; 5359 uint64_t blocksize = ztest_random_blocksize(); 5360 uint64_t chunksize = blocksize; 5361 uint64_t regions = 997; 5362 uint64_t stride = 123456789ULL; 5363 uint64_t width = 9; 5364 dmu_buf_t *bonus_db; 5365 arc_buf_t **bigbuf_arcbufs; 5366 dmu_object_info_t doi; 5367 uint32_t dmu_read_flags = DMU_READ_PREFETCH; 5368 5369 /* 5370 * We will randomly set when to do O_DIRECT on a read. 5371 */ 5372 if (ztest_random(4) == 0) 5373 dmu_read_flags |= DMU_DIRECTIO; 5374 5375 size = sizeof (ztest_od_t) * OD_ARRAY_SIZE; 5376 od = umem_alloc(size, UMEM_NOFAIL); 5377 5378 /* 5379 * This test uses two objects, packobj and bigobj, that are always 5380 * updated together (i.e. in the same tx) so that their contents are 5381 * in sync and can be compared. Their contents relate to each other 5382 * in a simple way: packobj is a dense array of 'bufwad' structures, 5383 * while bigobj is a sparse array of the same bufwads. Specifically, 5384 * for any index n, there are three bufwads that should be identical: 5385 * 5386 * packobj, at offset n * sizeof (bufwad_t) 5387 * bigobj, at the head of the nth chunk 5388 * bigobj, at the tail of the nth chunk 5389 * 5390 * The chunk size is set equal to bigobj block size so that 5391 * dmu_assign_arcbuf_by_dbuf() can be tested for object updates. 5392 */ 5393 5394 /* 5395 * Read the directory info. If it's the first time, set things up. 
5396 */ 5397 ztest_od_init(od, id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, 0, 0); 5398 ztest_od_init(od + 1, id, FTAG, 1, DMU_OT_UINT64_OTHER, 0, 0, 5399 chunksize); 5400 5401 5402 if (ztest_object_init(zd, od, size, B_FALSE) != 0) { 5403 umem_free(od, size); 5404 return; 5405 } 5406 5407 bigobj = od[0].od_object; 5408 packobj = od[1].od_object; 5409 blocksize = od[0].od_blocksize; 5410 chunksize = blocksize; 5411 ASSERT3U(chunksize, ==, od[1].od_gen); 5412 5413 VERIFY0(dmu_object_info(os, bigobj, &doi)); 5414 VERIFY(ISP2(doi.doi_data_block_size)); 5415 VERIFY3U(chunksize, ==, doi.doi_data_block_size); 5416 VERIFY3U(chunksize, >=, 2 * sizeof (bufwad_t)); 5417 5418 /* 5419 * Pick a random index and compute the offsets into packobj and bigobj. 5420 */ 5421 n = ztest_random(regions) * stride + ztest_random(width); 5422 s = 1 + ztest_random(width - 1); 5423 5424 packoff = n * sizeof (bufwad_t); 5425 packsize = s * sizeof (bufwad_t); 5426 5427 bigoff = n * chunksize; 5428 bigsize = s * chunksize; 5429 5430 packbuf = umem_zalloc(packsize, UMEM_NOFAIL); 5431 bigbuf = umem_zalloc(bigsize, UMEM_NOFAIL); 5432 5433 VERIFY0(dmu_bonus_hold(os, bigobj, FTAG, &bonus_db)); 5434 5435 bigbuf_arcbufs = umem_zalloc(2 * s * sizeof (arc_buf_t *), UMEM_NOFAIL); 5436 5437 /* 5438 * Iteration 0 test zcopy for DB_UNCACHED dbufs. 5439 * Iteration 1 test zcopy to already referenced dbufs. 5440 * Iteration 2 test zcopy to dirty dbuf in the same txg. 5441 * Iteration 3 test zcopy to dbuf dirty in previous txg. 5442 * Iteration 4 test zcopy when dbuf is no longer dirty. 5443 * Iteration 5 test zcopy when it can't be done. 5444 * Iteration 6 one more zcopy write. 5445 */ 5446 for (i = 0; i < 7; i++) { 5447 uint64_t j; 5448 uint64_t off; 5449 5450 /* 5451 * In iteration 5 (i == 5) use arcbufs 5452 * that don't match bigobj blksz to test 5453 * dmu_assign_arcbuf_by_dbuf() when it can't directly 5454 * assign an arcbuf to a dbuf. 5455 */ 5456 for (j = 0; j < s; j++) { 5457 if (i != 5 || chunksize < (SPA_MINBLOCKSIZE * 2)) { 5458 bigbuf_arcbufs[j] = 5459 dmu_request_arcbuf(bonus_db, chunksize); 5460 } else { 5461 bigbuf_arcbufs[2 * j] = 5462 dmu_request_arcbuf(bonus_db, chunksize / 2); 5463 bigbuf_arcbufs[2 * j + 1] = 5464 dmu_request_arcbuf(bonus_db, chunksize / 2); 5465 } 5466 } 5467 5468 /* 5469 * Get a tx for the mods to both packobj and bigobj. 5470 */ 5471 tx = dmu_tx_create(os); 5472 5473 dmu_tx_hold_write(tx, packobj, packoff, packsize); 5474 dmu_tx_hold_write(tx, bigobj, bigoff, bigsize); 5475 5476 txg = ztest_tx_assign(tx, DMU_TX_MIGHTWAIT, FTAG); 5477 if (txg == 0) { 5478 umem_free(packbuf, packsize); 5479 umem_free(bigbuf, bigsize); 5480 for (j = 0; j < s; j++) { 5481 if (i != 5 || 5482 chunksize < (SPA_MINBLOCKSIZE * 2)) { 5483 dmu_return_arcbuf(bigbuf_arcbufs[j]); 5484 } else { 5485 dmu_return_arcbuf( 5486 bigbuf_arcbufs[2 * j]); 5487 dmu_return_arcbuf( 5488 bigbuf_arcbufs[2 * j + 1]); 5489 } 5490 } 5491 umem_free(bigbuf_arcbufs, 2 * s * sizeof (arc_buf_t *)); 5492 umem_free(od, size); 5493 dmu_buf_rele(bonus_db, FTAG); 5494 return; 5495 } 5496 5497 /* 5498 * 50% of the time don't read objects in the 1st iteration to 5499 * test dmu_assign_arcbuf_by_dbuf() for the case when there are 5500 * no existing dbufs for the specified offsets. 
5501 */ 5502 if (i != 0 || ztest_random(2) != 0) { 5503 error = dmu_read(os, packobj, packoff, 5504 packsize, packbuf, dmu_read_flags); 5505 ASSERT0(error); 5506 error = dmu_read(os, bigobj, bigoff, bigsize, 5507 bigbuf, dmu_read_flags); 5508 ASSERT0(error); 5509 } 5510 compare_and_update_pbbufs(s, packbuf, bigbuf, bigsize, 5511 n, chunksize, txg); 5512 5513 /* 5514 * We've verified all the old bufwads, and made new ones. 5515 * Now write them out. 5516 */ 5517 dmu_write(os, packobj, packoff, packsize, packbuf, tx, 5518 DMU_READ_PREFETCH); 5519 if (ztest_opts.zo_verbose >= 7) { 5520 (void) printf("writing offset %"PRIx64" size %"PRIx64"" 5521 " txg %"PRIx64"\n", 5522 bigoff, bigsize, txg); 5523 } 5524 for (off = bigoff, j = 0; j < s; j++, off += chunksize) { 5525 dmu_buf_t *dbt; 5526 if (i != 5 || chunksize < (SPA_MINBLOCKSIZE * 2)) { 5527 memcpy(bigbuf_arcbufs[j]->b_data, 5528 (caddr_t)bigbuf + (off - bigoff), 5529 chunksize); 5530 } else { 5531 memcpy(bigbuf_arcbufs[2 * j]->b_data, 5532 (caddr_t)bigbuf + (off - bigoff), 5533 chunksize / 2); 5534 memcpy(bigbuf_arcbufs[2 * j + 1]->b_data, 5535 (caddr_t)bigbuf + (off - bigoff) + 5536 chunksize / 2, 5537 chunksize / 2); 5538 } 5539 5540 if (i == 1) { 5541 VERIFY0(dmu_buf_hold(os, bigobj, off, 5542 FTAG, &dbt, DMU_READ_NO_PREFETCH)); 5543 } 5544 if (i != 5 || chunksize < (SPA_MINBLOCKSIZE * 2)) { 5545 VERIFY0(dmu_assign_arcbuf_by_dbuf(bonus_db, 5546 off, bigbuf_arcbufs[j], tx, 0)); 5547 } else { 5548 VERIFY0(dmu_assign_arcbuf_by_dbuf(bonus_db, 5549 off, bigbuf_arcbufs[2 * j], tx, 0)); 5550 VERIFY0(dmu_assign_arcbuf_by_dbuf(bonus_db, 5551 off + chunksize / 2, 5552 bigbuf_arcbufs[2 * j + 1], tx, 0)); 5553 } 5554 if (i == 1) { 5555 dmu_buf_rele(dbt, FTAG); 5556 } 5557 } 5558 dmu_tx_commit(tx); 5559 5560 /* 5561 * Sanity check the stuff we just wrote. 5562 */ 5563 { 5564 void *packcheck = umem_alloc(packsize, UMEM_NOFAIL); 5565 void *bigcheck = umem_alloc(bigsize, UMEM_NOFAIL); 5566 5567 VERIFY0(dmu_read(os, packobj, packoff, 5568 packsize, packcheck, dmu_read_flags)); 5569 VERIFY0(dmu_read(os, bigobj, bigoff, 5570 bigsize, bigcheck, dmu_read_flags)); 5571 5572 ASSERT0(memcmp(packbuf, packcheck, packsize)); 5573 ASSERT0(memcmp(bigbuf, bigcheck, bigsize)); 5574 5575 umem_free(packcheck, packsize); 5576 umem_free(bigcheck, bigsize); 5577 } 5578 if (i == 2) { 5579 txg_wait_open(dmu_objset_pool(os), 0, B_TRUE); 5580 } else if (i == 3) { 5581 txg_wait_synced(dmu_objset_pool(os), 0); 5582 } 5583 } 5584 5585 dmu_buf_rele(bonus_db, FTAG); 5586 umem_free(packbuf, packsize); 5587 umem_free(bigbuf, bigsize); 5588 umem_free(bigbuf_arcbufs, 2 * s * sizeof (arc_buf_t *)); 5589 umem_free(od, size); 5590 } 5591 5592 void 5593 ztest_dmu_write_parallel(ztest_ds_t *zd, uint64_t id) 5594 { 5595 (void) id; 5596 ztest_od_t *od; 5597 5598 od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL); 5599 uint64_t offset = (1ULL << (ztest_random(20) + 43)) + 5600 (ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT); 5601 5602 /* 5603 * Have multiple threads write to large offsets in an object 5604 * to verify that parallel writes to an object -- even to the 5605 * same blocks within the object -- doesn't cause any trouble. 
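 *
 * The offset starts at least 2^43 bytes (8 TiB) into the object, so the
 * writes land in an otherwise sparse region, and every thread passes
 * ID_PARALLEL so they all operate on the same shared object.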
5606 */ 5607 ztest_od_init(od, ID_PARALLEL, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0, 0); 5608 5609 if (ztest_object_init(zd, od, sizeof (ztest_od_t), B_FALSE) != 0) 5610 return; 5611 5612 while (ztest_random(10) != 0) 5613 ztest_io(zd, od->od_object, offset); 5614 5615 umem_free(od, sizeof (ztest_od_t)); 5616 } 5617 5618 void 5619 ztest_dmu_prealloc(ztest_ds_t *zd, uint64_t id) 5620 { 5621 ztest_od_t *od; 5622 uint64_t offset = (1ULL << (ztest_random(4) + SPA_MAXBLOCKSHIFT)) + 5623 (ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT); 5624 uint64_t count = ztest_random(20) + 1; 5625 uint64_t blocksize = ztest_random_blocksize(); 5626 void *data; 5627 5628 od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL); 5629 5630 ztest_od_init(od, id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, 0, 0); 5631 5632 if (ztest_object_init(zd, od, sizeof (ztest_od_t), 5633 !ztest_random(2)) != 0) { 5634 umem_free(od, sizeof (ztest_od_t)); 5635 return; 5636 } 5637 5638 if (ztest_truncate(zd, od->od_object, offset, count * blocksize) != 0) { 5639 umem_free(od, sizeof (ztest_od_t)); 5640 return; 5641 } 5642 5643 ztest_prealloc(zd, od->od_object, offset, count * blocksize); 5644 5645 data = umem_zalloc(blocksize, UMEM_NOFAIL); 5646 5647 while (ztest_random(count) != 0) { 5648 uint64_t randoff = offset + (ztest_random(count) * blocksize); 5649 if (ztest_write(zd, od->od_object, randoff, blocksize, 5650 data) != 0) 5651 break; 5652 while (ztest_random(4) != 0) 5653 ztest_io(zd, od->od_object, randoff); 5654 } 5655 5656 umem_free(data, blocksize); 5657 umem_free(od, sizeof (ztest_od_t)); 5658 } 5659 5660 /* 5661 * Verify that zap_{create,destroy,add,remove,update} work as expected. 5662 */ 5663 #define ZTEST_ZAP_MIN_INTS 1 5664 #define ZTEST_ZAP_MAX_INTS 4 5665 #define ZTEST_ZAP_MAX_PROPS 1000 5666 5667 void 5668 ztest_zap(ztest_ds_t *zd, uint64_t id) 5669 { 5670 objset_t *os = zd->zd_os; 5671 ztest_od_t *od; 5672 uint64_t object; 5673 uint64_t txg, last_txg; 5674 uint64_t value[ZTEST_ZAP_MAX_INTS]; 5675 uint64_t zl_ints, zl_intsize, prop; 5676 int i, ints; 5677 dmu_tx_t *tx; 5678 char propname[100], txgname[100]; 5679 int error; 5680 const char *const hc[2] = { "s.acl.h", ".s.open.h.hyLZlg" }; 5681 5682 od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL); 5683 ztest_od_init(od, id, FTAG, 0, DMU_OT_ZAP_OTHER, 0, 0, 0); 5684 5685 if (ztest_object_init(zd, od, sizeof (ztest_od_t), 5686 !ztest_random(2)) != 0) 5687 goto out; 5688 5689 object = od->od_object; 5690 5691 /* 5692 * Generate a known hash collision, and verify that 5693 * we can lookup and remove both entries. 5694 */ 5695 tx = dmu_tx_create(os); 5696 dmu_tx_hold_zap(tx, object, B_TRUE, NULL); 5697 txg = ztest_tx_assign(tx, DMU_TX_MIGHTWAIT, FTAG); 5698 if (txg == 0) 5699 goto out; 5700 for (i = 0; i < 2; i++) { 5701 value[i] = i; 5702 VERIFY0(zap_add(os, object, hc[i], sizeof (uint64_t), 5703 1, &value[i], tx)); 5704 } 5705 for (i = 0; i < 2; i++) { 5706 VERIFY3U(EEXIST, ==, zap_add(os, object, hc[i], 5707 sizeof (uint64_t), 1, &value[i], tx)); 5708 VERIFY0( 5709 zap_length(os, object, hc[i], &zl_intsize, &zl_ints)); 5710 ASSERT3U(zl_intsize, ==, sizeof (uint64_t)); 5711 ASSERT3U(zl_ints, ==, 1); 5712 } 5713 for (i = 0; i < 2; i++) { 5714 VERIFY0(zap_remove(os, object, hc[i], tx)); 5715 } 5716 dmu_tx_commit(tx); 5717 5718 /* 5719 * Generate a bunch of random entries. 
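 * Each property is stored as a pair of entries, prop_<n> and txg_<n>, so
 * that a value can always be cross-checked against the txg in which it
 * was last written.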
5720 */ 5721 ints = MAX(ZTEST_ZAP_MIN_INTS, object % ZTEST_ZAP_MAX_INTS); 5722 5723 prop = ztest_random(ZTEST_ZAP_MAX_PROPS); 5724 (void) sprintf(propname, "prop_%"PRIu64"", prop); 5725 (void) sprintf(txgname, "txg_%"PRIu64"", prop); 5726 memset(value, 0, sizeof (value)); 5727 last_txg = 0; 5728 5729 /* 5730 * If these zap entries already exist, validate their contents. 5731 */ 5732 error = zap_length(os, object, txgname, &zl_intsize, &zl_ints); 5733 if (error == 0) { 5734 ASSERT3U(zl_intsize, ==, sizeof (uint64_t)); 5735 ASSERT3U(zl_ints, ==, 1); 5736 5737 VERIFY0(zap_lookup(os, object, txgname, zl_intsize, 5738 zl_ints, &last_txg)); 5739 5740 VERIFY0(zap_length(os, object, propname, &zl_intsize, 5741 &zl_ints)); 5742 5743 ASSERT3U(zl_intsize, ==, sizeof (uint64_t)); 5744 ASSERT3U(zl_ints, ==, ints); 5745 5746 VERIFY0(zap_lookup(os, object, propname, zl_intsize, 5747 zl_ints, value)); 5748 5749 for (i = 0; i < ints; i++) { 5750 ASSERT3U(value[i], ==, last_txg + object + i); 5751 } 5752 } else { 5753 ASSERT3U(error, ==, ENOENT); 5754 } 5755 5756 /* 5757 * Atomically update two entries in our zap object. 5758 * The first is named txg_%llu, and contains the txg 5759 * in which the property was last updated. The second 5760 * is named prop_%llu, and the nth element of its value 5761 * should be txg + object + n. 5762 */ 5763 tx = dmu_tx_create(os); 5764 dmu_tx_hold_zap(tx, object, B_TRUE, NULL); 5765 txg = ztest_tx_assign(tx, DMU_TX_MIGHTWAIT, FTAG); 5766 if (txg == 0) 5767 goto out; 5768 5769 if (last_txg > txg) 5770 fatal(B_FALSE, "zap future leak: old %"PRIu64" new %"PRIu64"", 5771 last_txg, txg); 5772 5773 for (i = 0; i < ints; i++) 5774 value[i] = txg + object + i; 5775 5776 VERIFY0(zap_update(os, object, txgname, sizeof (uint64_t), 5777 1, &txg, tx)); 5778 VERIFY0(zap_update(os, object, propname, sizeof (uint64_t), 5779 ints, value, tx)); 5780 5781 dmu_tx_commit(tx); 5782 5783 /* 5784 * Remove a random pair of entries. 5785 */ 5786 prop = ztest_random(ZTEST_ZAP_MAX_PROPS); 5787 (void) sprintf(propname, "prop_%"PRIu64"", prop); 5788 (void) sprintf(txgname, "txg_%"PRIu64"", prop); 5789 5790 error = zap_length(os, object, txgname, &zl_intsize, &zl_ints); 5791 5792 if (error == ENOENT) 5793 goto out; 5794 5795 ASSERT0(error); 5796 5797 tx = dmu_tx_create(os); 5798 dmu_tx_hold_zap(tx, object, B_TRUE, NULL); 5799 txg = ztest_tx_assign(tx, DMU_TX_MIGHTWAIT, FTAG); 5800 if (txg == 0) 5801 goto out; 5802 VERIFY0(zap_remove(os, object, txgname, tx)); 5803 VERIFY0(zap_remove(os, object, propname, tx)); 5804 dmu_tx_commit(tx); 5805 out: 5806 umem_free(od, sizeof (ztest_od_t)); 5807 } 5808 5809 /* 5810 * Test case to test the upgrading of a microzap to fatzap. 5811 */ 5812 void 5813 ztest_fzap(ztest_ds_t *zd, uint64_t id) 5814 { 5815 objset_t *os = zd->zd_os; 5816 ztest_od_t *od; 5817 uint64_t object, txg, value; 5818 5819 od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL); 5820 ztest_od_init(od, id, FTAG, 0, DMU_OT_ZAP_OTHER, 0, 0, 0); 5821 5822 if (ztest_object_init(zd, od, sizeof (ztest_od_t), 5823 !ztest_random(2)) != 0) 5824 goto out; 5825 object = od->od_object; 5826 5827 /* 5828 * Add entries to this ZAP and make sure it spills over 5829 * and gets upgraded to a fatzap. Also, since we are adding 5830 * 2050 entries we should see ptrtbl growth and leaf-block split. 
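 * A microzap is limited to a single block, so a few thousand entries are
 * more than enough to force the upgrade and subsequent fatzap growth.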
5831 */ 5832 for (value = 0; value < 2050; value++) { 5833 char name[ZFS_MAX_DATASET_NAME_LEN]; 5834 dmu_tx_t *tx; 5835 int error; 5836 5837 (void) snprintf(name, sizeof (name), "fzap-%"PRIu64"-%"PRIu64"", 5838 id, value); 5839 5840 tx = dmu_tx_create(os); 5841 dmu_tx_hold_zap(tx, object, B_TRUE, name); 5842 txg = ztest_tx_assign(tx, DMU_TX_MIGHTWAIT, FTAG); 5843 if (txg == 0) 5844 goto out; 5845 error = zap_add(os, object, name, sizeof (uint64_t), 1, 5846 &value, tx); 5847 ASSERT(error == 0 || error == EEXIST); 5848 dmu_tx_commit(tx); 5849 } 5850 out: 5851 umem_free(od, sizeof (ztest_od_t)); 5852 } 5853 5854 void 5855 ztest_zap_parallel(ztest_ds_t *zd, uint64_t id) 5856 { 5857 (void) id; 5858 objset_t *os = zd->zd_os; 5859 ztest_od_t *od; 5860 uint64_t txg, object, count, wsize, wc, zl_wsize, zl_wc; 5861 dmu_tx_t *tx; 5862 int i, namelen, error; 5863 int micro = ztest_random(2); 5864 char name[20], string_value[20]; 5865 void *data; 5866 5867 od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL); 5868 ztest_od_init(od, ID_PARALLEL, FTAG, micro, DMU_OT_ZAP_OTHER, 0, 0, 0); 5869 5870 if (ztest_object_init(zd, od, sizeof (ztest_od_t), B_FALSE) != 0) { 5871 umem_free(od, sizeof (ztest_od_t)); 5872 return; 5873 } 5874 5875 object = od->od_object; 5876 5877 /* 5878 * Generate a random name of the form 'xxx.....' where each 5879 * x is a random printable character and the dots are dots. 5880 * There are 94 such characters, and the name length goes from 5881 * 6 to 20, so there are 94^3 * 15 = 12,458,760 possible names. 5882 */ 5883 namelen = ztest_random(sizeof (name) - 5) + 5 + 1; 5884 5885 for (i = 0; i < 3; i++) 5886 name[i] = '!' + ztest_random('~' - '!' + 1); 5887 for (; i < namelen - 1; i++) 5888 name[i] = '.'; 5889 name[i] = '\0'; 5890 5891 if ((namelen & 1) || micro) { 5892 wsize = sizeof (txg); 5893 wc = 1; 5894 data = &txg; 5895 } else { 5896 wsize = 1; 5897 wc = namelen; 5898 data = string_value; 5899 } 5900 5901 count = -1ULL; 5902 VERIFY0(zap_count(os, object, &count)); 5903 ASSERT3S(count, !=, -1ULL); 5904 5905 /* 5906 * Select an operation: length, lookup, add, update, remove. 
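 * i maps to: 0 = length, 1 = lookup, 2 = add, 3 = update, 4 = remove.
 * Only the last three modify the ZAP, so only they need a transaction.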
5907 */ 5908 i = ztest_random(5); 5909 5910 if (i >= 2) { 5911 tx = dmu_tx_create(os); 5912 dmu_tx_hold_zap(tx, object, B_TRUE, NULL); 5913 txg = ztest_tx_assign(tx, DMU_TX_MIGHTWAIT, FTAG); 5914 if (txg == 0) { 5915 umem_free(od, sizeof (ztest_od_t)); 5916 return; 5917 } 5918 memcpy(string_value, name, namelen); 5919 } else { 5920 tx = NULL; 5921 txg = 0; 5922 memset(string_value, 0, namelen); 5923 } 5924 5925 switch (i) { 5926 5927 case 0: 5928 error = zap_length(os, object, name, &zl_wsize, &zl_wc); 5929 if (error == 0) { 5930 ASSERT3U(wsize, ==, zl_wsize); 5931 ASSERT3U(wc, ==, zl_wc); 5932 } else { 5933 ASSERT3U(error, ==, ENOENT); 5934 } 5935 break; 5936 5937 case 1: 5938 error = zap_lookup(os, object, name, wsize, wc, data); 5939 if (error == 0) { 5940 if (data == string_value && 5941 memcmp(name, data, namelen) != 0) 5942 fatal(B_FALSE, "name '%s' != val '%s' len %d", 5943 name, (char *)data, namelen); 5944 } else { 5945 ASSERT3U(error, ==, ENOENT); 5946 } 5947 break; 5948 5949 case 2: 5950 error = zap_add(os, object, name, wsize, wc, data, tx); 5951 ASSERT(error == 0 || error == EEXIST); 5952 break; 5953 5954 case 3: 5955 VERIFY0(zap_update(os, object, name, wsize, wc, data, tx)); 5956 break; 5957 5958 case 4: 5959 error = zap_remove(os, object, name, tx); 5960 ASSERT(error == 0 || error == ENOENT); 5961 break; 5962 } 5963 5964 if (tx != NULL) 5965 dmu_tx_commit(tx); 5966 5967 umem_free(od, sizeof (ztest_od_t)); 5968 } 5969 5970 /* 5971 * Commit callback data. 5972 */ 5973 typedef struct ztest_cb_data { 5974 list_node_t zcd_node; 5975 uint64_t zcd_txg; 5976 int zcd_expected_err; 5977 boolean_t zcd_added; 5978 boolean_t zcd_called; 5979 spa_t *zcd_spa; 5980 } ztest_cb_data_t; 5981 5982 /* This is the actual commit callback function */ 5983 static void 5984 ztest_commit_callback(void *arg, int error) 5985 { 5986 ztest_cb_data_t *data = arg; 5987 uint64_t synced_txg; 5988 5989 VERIFY3P(data, !=, NULL); 5990 VERIFY3S(data->zcd_expected_err, ==, error); 5991 VERIFY(!data->zcd_called); 5992 5993 synced_txg = spa_last_synced_txg(data->zcd_spa); 5994 if (data->zcd_txg > synced_txg) 5995 fatal(B_FALSE, 5996 "commit callback of txg %"PRIu64" called prematurely, " 5997 "last synced txg = %"PRIu64"\n", 5998 data->zcd_txg, synced_txg); 5999 6000 data->zcd_called = B_TRUE; 6001 6002 if (error == ECANCELED) { 6003 ASSERT0(data->zcd_txg); 6004 ASSERT(!data->zcd_added); 6005 6006 /* 6007 * The private callback data should be destroyed here, but 6008 * since we are going to check the zcd_called field after 6009 * dmu_tx_abort(), we will destroy it there. 
6010 */
6011 return;
6012 }
6013
6014 ASSERT(data->zcd_added);
6015 ASSERT3U(data->zcd_txg, !=, 0);
6016
6017 (void) mutex_enter(&zcl.zcl_callbacks_lock);
6018
6019 /* See if this cb was called more quickly */
6020 if ((synced_txg - data->zcd_txg) < zc_min_txg_delay)
6021 zc_min_txg_delay = synced_txg - data->zcd_txg;
6022
6023 /* Remove our callback from the list */
6024 list_remove(&zcl.zcl_callbacks, data);
6025
6026 (void) mutex_exit(&zcl.zcl_callbacks_lock);
6027
6028 umem_free(data, sizeof (ztest_cb_data_t));
6029 }
6030
6031 /* Allocate and initialize callback data structure */
6032 static ztest_cb_data_t *
6033 ztest_create_cb_data(objset_t *os, uint64_t txg)
6034 {
6035 ztest_cb_data_t *cb_data;
6036
6037 cb_data = umem_zalloc(sizeof (ztest_cb_data_t), UMEM_NOFAIL);
6038
6039 cb_data->zcd_txg = txg;
6040 cb_data->zcd_spa = dmu_objset_spa(os);
6041 list_link_init(&cb_data->zcd_node);
6042
6043 return (cb_data);
6044 }
6045
6046 /*
6047 * Commit callback test.
6048 */
6049 void
6050 ztest_dmu_commit_callbacks(ztest_ds_t *zd, uint64_t id)
6051 {
6052 objset_t *os = zd->zd_os;
6053 ztest_od_t *od;
6054 dmu_tx_t *tx;
6055 ztest_cb_data_t *cb_data[3], *tmp_cb;
6056 uint64_t old_txg, txg;
6057 int i, error = 0;
6058
6059 od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL);
6060 ztest_od_init(od, id, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0, 0);
6061
6062 if (ztest_object_init(zd, od, sizeof (ztest_od_t), B_FALSE) != 0) {
6063 umem_free(od, sizeof (ztest_od_t));
6064 return;
6065 }
6066
6067 tx = dmu_tx_create(os);
6068
6069 cb_data[0] = ztest_create_cb_data(os, 0);
6070 dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[0]);
6071
6072 dmu_tx_hold_write(tx, od->od_object, 0, sizeof (uint64_t));
6073
6074 /* Every once in a while, abort the transaction on purpose */
6075 if (ztest_random(100) == 0)
6076 error = -1;
6077
6078 if (!error)
6079 error = dmu_tx_assign(tx, DMU_TX_NOWAIT);
6080
6081 txg = error ? 0 : dmu_tx_get_txg(tx);
6082
6083 cb_data[0]->zcd_txg = txg;
6084 cb_data[1] = ztest_create_cb_data(os, txg);
6085 dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[1]);
6086
6087 if (error) {
6088 /*
6089 * It's not a strict requirement to call the registered
6090 * callbacks from inside dmu_tx_abort(), but that's what
6091 * is supposed to happen in the current implementation,
6092 * so we will check for that.
6093 */
6094 for (i = 0; i < 2; i++) {
6095 cb_data[i]->zcd_expected_err = ECANCELED;
6096 VERIFY(!cb_data[i]->zcd_called);
6097 }
6098
6099 dmu_tx_abort(tx);
6100
6101 for (i = 0; i < 2; i++) {
6102 VERIFY(cb_data[i]->zcd_called);
6103 umem_free(cb_data[i], sizeof (ztest_cb_data_t));
6104 }
6105
6106 umem_free(od, sizeof (ztest_od_t));
6107 return;
6108 }
6109
6110 cb_data[2] = ztest_create_cb_data(os, txg);
6111 dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[2]);
6112
6113 /*
6114 * Read existing data to make sure there isn't a future leak.
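*
* The uint64_t at offset 0 holds the txg in which it was last written;
* if that value is greater than our currently open txg, data from a
* "future" txg has leaked into what we can already read.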
6115 */
6116 VERIFY0(dmu_read(os, od->od_object, 0, sizeof (uint64_t),
6117 &old_txg, DMU_READ_PREFETCH));
6118
6119 if (old_txg > txg)
6120 fatal(B_FALSE,
6121 "future leak: got %"PRIu64", open txg is %"PRIu64"",
6122 old_txg, txg);
6123
6124 dmu_write(os, od->od_object, 0, sizeof (uint64_t), &txg, tx,
6125 DMU_READ_PREFETCH);
6126
6127 (void) mutex_enter(&zcl.zcl_callbacks_lock);
6128
6129 /*
6130 * Since commit callbacks don't have any ordering requirement and since
6131 * it is theoretically possible for a commit callback to be called
6132 * after an arbitrary amount of time has elapsed since its txg has been
6133 * synced, it is difficult to reliably determine whether a commit
6134 * callback hasn't been called due to high load or due to a flawed
6135 * implementation.
6136 *
6137 * In practice, we will assume that if a commit callback hasn't been
6138 * called after a certain number of txgs, then most likely there's an
6139 * implementation bug.
6140 */
6141 tmp_cb = list_head(&zcl.zcl_callbacks);
6142 if (tmp_cb != NULL &&
6143 tmp_cb->zcd_txg + ZTEST_COMMIT_CB_THRESH < txg) {
6144 fatal(B_FALSE,
6145 "Commit callback threshold exceeded, "
6146 "oldest txg: %"PRIu64", open txg: %"PRIu64"\n",
6147 tmp_cb->zcd_txg, txg);
6148 }
6149
6150 /*
6151 * Let's find the place to insert our callbacks.
6152 *
6153 * Even though the list is ordered by txg, it is possible for the
6154 * insertion point to not be the end because our txg may already be
6155 * quiescing at this point and other callbacks in the open txg
6156 * (from other objsets) may have sneaked in.
6157 */
6158 tmp_cb = list_tail(&zcl.zcl_callbacks);
6159 while (tmp_cb != NULL && tmp_cb->zcd_txg > txg)
6160 tmp_cb = list_prev(&zcl.zcl_callbacks, tmp_cb);
6161
6162 /* Add the 3 callbacks to the list */
6163 for (i = 0; i < 3; i++) {
6164 if (tmp_cb == NULL)
6165 list_insert_head(&zcl.zcl_callbacks, cb_data[i]);
6166 else
6167 list_insert_after(&zcl.zcl_callbacks, tmp_cb,
6168 cb_data[i]);
6169
6170 cb_data[i]->zcd_added = B_TRUE;
6171 VERIFY(!cb_data[i]->zcd_called);
6172
6173 tmp_cb = cb_data[i];
6174 }
6175
6176 zc_cb_counter += 3;
6177
6178 (void) mutex_exit(&zcl.zcl_callbacks_lock);
6179
6180 dmu_tx_commit(tx);
6181
6182 umem_free(od, sizeof (ztest_od_t));
6183 }
6184
6185 /*
6186 * Visit each object in the dataset. Verify that its properties
6187 * are consistent with what was stored in the block tag when it was created,
6188 * and that its unused bonus buffer space has not been overwritten.
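*
* Objects whose bonus buffer is too small to hold a block tag, or whose
* tag has never been initialized (bt_magic != BT_MAGIC), are skipped.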
6189 */ 6190 void 6191 ztest_verify_dnode_bt(ztest_ds_t *zd, uint64_t id) 6192 { 6193 (void) id; 6194 objset_t *os = zd->zd_os; 6195 uint64_t obj; 6196 int err = 0; 6197 6198 for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE, 0)) { 6199 ztest_block_tag_t *bt = NULL; 6200 dmu_object_info_t doi; 6201 dmu_buf_t *db; 6202 6203 ztest_object_lock(zd, obj, ZTRL_READER); 6204 if (dmu_bonus_hold(os, obj, FTAG, &db) != 0) { 6205 ztest_object_unlock(zd, obj); 6206 continue; 6207 } 6208 6209 dmu_object_info_from_db(db, &doi); 6210 if (doi.doi_bonus_size >= sizeof (*bt)) 6211 bt = ztest_bt_bonus(db); 6212 6213 if (bt && bt->bt_magic == BT_MAGIC) { 6214 ztest_bt_verify(bt, os, obj, doi.doi_dnodesize, 6215 bt->bt_offset, bt->bt_gen, bt->bt_txg, 6216 bt->bt_crtxg); 6217 ztest_verify_unused_bonus(db, bt, obj, os, bt->bt_gen); 6218 } 6219 6220 dmu_buf_rele(db, FTAG); 6221 ztest_object_unlock(zd, obj); 6222 } 6223 } 6224 6225 void 6226 ztest_dsl_prop_get_set(ztest_ds_t *zd, uint64_t id) 6227 { 6228 (void) id; 6229 zfs_prop_t proplist[] = { 6230 ZFS_PROP_CHECKSUM, 6231 ZFS_PROP_COMPRESSION, 6232 ZFS_PROP_COPIES, 6233 ZFS_PROP_DEDUP 6234 }; 6235 6236 (void) pthread_rwlock_rdlock(&ztest_name_lock); 6237 6238 for (int p = 0; p < sizeof (proplist) / sizeof (proplist[0]); p++) { 6239 int error = ztest_dsl_prop_set_uint64(zd->zd_name, proplist[p], 6240 ztest_random_dsl_prop(proplist[p]), (int)ztest_random(2)); 6241 ASSERT(error == 0 || error == ENOSPC); 6242 } 6243 6244 int error = ztest_dsl_prop_set_uint64(zd->zd_name, ZFS_PROP_RECORDSIZE, 6245 ztest_random_blocksize(), (int)ztest_random(2)); 6246 ASSERT(error == 0 || error == ENOSPC); 6247 6248 (void) pthread_rwlock_unlock(&ztest_name_lock); 6249 } 6250 6251 void 6252 ztest_spa_prop_get_set(ztest_ds_t *zd, uint64_t id) 6253 { 6254 (void) zd, (void) id; 6255 6256 (void) pthread_rwlock_rdlock(&ztest_name_lock); 6257 6258 (void) ztest_spa_prop_set_uint64(ZPOOL_PROP_AUTOTRIM, ztest_random(2)); 6259 6260 nvlist_t *props = fnvlist_alloc(); 6261 6262 VERIFY0(spa_prop_get(ztest_spa, props)); 6263 6264 if (ztest_opts.zo_verbose >= 6) 6265 dump_nvlist(props, 4); 6266 6267 fnvlist_free(props); 6268 6269 (void) pthread_rwlock_unlock(&ztest_name_lock); 6270 } 6271 6272 static int 6273 user_release_one(const char *snapname, const char *holdname) 6274 { 6275 nvlist_t *snaps, *holds; 6276 int error; 6277 6278 snaps = fnvlist_alloc(); 6279 holds = fnvlist_alloc(); 6280 fnvlist_add_boolean(holds, holdname); 6281 fnvlist_add_nvlist(snaps, snapname, holds); 6282 fnvlist_free(holds); 6283 error = dsl_dataset_user_release(snaps, NULL); 6284 fnvlist_free(snaps); 6285 return (error); 6286 } 6287 6288 /* 6289 * Test snapshot hold/release and deferred destroy. 6290 */ 6291 void 6292 ztest_dmu_snapshot_hold(ztest_ds_t *zd, uint64_t id) 6293 { 6294 int error; 6295 objset_t *os = zd->zd_os; 6296 objset_t *origin; 6297 char snapname[100]; 6298 char fullname[100]; 6299 char clonename[100]; 6300 char tag[100]; 6301 char osname[ZFS_MAX_DATASET_NAME_LEN]; 6302 nvlist_t *holds; 6303 6304 (void) pthread_rwlock_rdlock(&ztest_name_lock); 6305 6306 dmu_objset_name(os, osname); 6307 6308 (void) snprintf(snapname, sizeof (snapname), "sh1_%"PRIu64"", id); 6309 (void) snprintf(fullname, sizeof (fullname), "%s@%s", osname, snapname); 6310 (void) snprintf(clonename, sizeof (clonename), "%s/ch1_%"PRIu64"", 6311 osname, id); 6312 (void) snprintf(tag, sizeof (tag), "tag_%"PRIu64"", id); 6313 6314 /* 6315 * Clean up from any previous run. 
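*
* The clone, the user hold, and the snapshot may or may not exist
* depending on where a previous run stopped, so only "not found"
* errors (ENOENT/ESRCH) are tolerated here.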
6316 */ 6317 error = dsl_destroy_head(clonename); 6318 if (error != ENOENT) 6319 ASSERT0(error); 6320 error = user_release_one(fullname, tag); 6321 if (error != ESRCH && error != ENOENT) 6322 ASSERT0(error); 6323 error = dsl_destroy_snapshot(fullname, B_FALSE); 6324 if (error != ENOENT) 6325 ASSERT0(error); 6326 6327 /* 6328 * Create snapshot, clone it, mark snap for deferred destroy, 6329 * destroy clone, verify snap was also destroyed. 6330 */ 6331 error = dmu_objset_snapshot_one(osname, snapname); 6332 if (error) { 6333 if (error == ENOSPC) { 6334 ztest_record_enospc("dmu_objset_snapshot"); 6335 goto out; 6336 } 6337 fatal(B_FALSE, "dmu_objset_snapshot(%s) = %d", fullname, error); 6338 } 6339 6340 error = dsl_dataset_clone(clonename, fullname); 6341 if (error) { 6342 if (error == ENOSPC) { 6343 ztest_record_enospc("dsl_dataset_clone"); 6344 goto out; 6345 } 6346 fatal(B_FALSE, "dsl_dataset_clone(%s) = %d", clonename, error); 6347 } 6348 6349 error = dsl_destroy_snapshot(fullname, B_TRUE); 6350 if (error) { 6351 fatal(B_FALSE, "dsl_destroy_snapshot(%s, B_TRUE) = %d", 6352 fullname, error); 6353 } 6354 6355 error = dsl_destroy_head(clonename); 6356 if (error) 6357 fatal(B_FALSE, "dsl_destroy_head(%s) = %d", clonename, error); 6358 6359 error = dmu_objset_hold(fullname, FTAG, &origin); 6360 if (error != ENOENT) 6361 fatal(B_FALSE, "dmu_objset_hold(%s) = %d", fullname, error); 6362 6363 /* 6364 * Create snapshot, add temporary hold, verify that we can't 6365 * destroy a held snapshot, mark for deferred destroy, 6366 * release hold, verify snapshot was destroyed. 6367 */ 6368 error = dmu_objset_snapshot_one(osname, snapname); 6369 if (error) { 6370 if (error == ENOSPC) { 6371 ztest_record_enospc("dmu_objset_snapshot"); 6372 goto out; 6373 } 6374 fatal(B_FALSE, "dmu_objset_snapshot(%s) = %d", fullname, error); 6375 } 6376 6377 holds = fnvlist_alloc(); 6378 fnvlist_add_string(holds, fullname, tag); 6379 error = dsl_dataset_user_hold(holds, 0, NULL); 6380 fnvlist_free(holds); 6381 6382 if (error == ENOSPC) { 6383 ztest_record_enospc("dsl_dataset_user_hold"); 6384 goto out; 6385 } else if (error) { 6386 fatal(B_FALSE, "dsl_dataset_user_hold(%s, %s) = %u", 6387 fullname, tag, error); 6388 } 6389 6390 error = dsl_destroy_snapshot(fullname, B_FALSE); 6391 if (error != EBUSY) { 6392 fatal(B_FALSE, "dsl_destroy_snapshot(%s, B_FALSE) = %d", 6393 fullname, error); 6394 } 6395 6396 error = dsl_destroy_snapshot(fullname, B_TRUE); 6397 if (error) { 6398 fatal(B_FALSE, "dsl_destroy_snapshot(%s, B_TRUE) = %d", 6399 fullname, error); 6400 } 6401 6402 error = user_release_one(fullname, tag); 6403 if (error) 6404 fatal(B_FALSE, "user_release_one(%s, %s) = %d", 6405 fullname, tag, error); 6406 6407 VERIFY3U(dmu_objset_hold(fullname, FTAG, &origin), ==, ENOENT); 6408 6409 out: 6410 (void) pthread_rwlock_unlock(&ztest_name_lock); 6411 } 6412 6413 /* 6414 * Inject random faults into the on-disk data. 
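*
* Two flavors of damage are injected: transient device failures
* (marking a leaf unreadable/unwritable or closing its backing file)
* on the first leaf of a top-level vdev, and bad words written
* directly into the backing file of a randomly chosen leaf or
* l2cache device.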
6415 */
6416 void
6417 ztest_fault_inject(ztest_ds_t *zd, uint64_t id)
6418 {
6419 (void) zd, (void) id;
6420 ztest_shared_t *zs = ztest_shared;
6421 spa_t *spa = ztest_spa;
6422 int fd;
6423 uint64_t offset;
6424 uint64_t leaves;
6425 uint64_t bad = 0x1990c0ffeedecadeull;
6426 uint64_t top, leaf;
6427 uint64_t raidz_children;
6428 char *path0;
6429 char *pathrand;
6430 size_t fsize;
6431 int bshift = SPA_MAXBLOCKSHIFT + 2;
6432 int iters = 1000;
6433 int maxfaults;
6434 int mirror_save;
6435 vdev_t *vd0 = NULL;
6436 uint64_t guid0 = 0;
6437 boolean_t islog = B_FALSE;
6438 boolean_t injected = B_FALSE;
6439
6440 path0 = umem_alloc(MAXPATHLEN, UMEM_NOFAIL);
6441 pathrand = umem_alloc(MAXPATHLEN, UMEM_NOFAIL);
6442
6443 mutex_enter(&ztest_vdev_lock);
6444
6445 /*
6446 * While device removal is in progress, fault injection must be disabled
6447 * until it completes and the pool is scrubbed. The fault injection
6448 * strategy for damaging blocks does not take into account evacuated
6449 * blocks which may have already been damaged.
6450 */
6451 if (ztest_device_removal_active)
6452 goto out;
6453
6454 /*
6455 * The fault injection strategy for damaging blocks cannot be used
6456 * if raidz expansion is in progress. The leaves value
6457 * (attached raidz children) is variable and the strategy for damaging
6458 * blocks would corrupt the same data blocks on different child vdevs
6459 * because of the reflow process.
6460 */
6461 if (spa->spa_raidz_expand != NULL)
6462 goto out;
6463
6464 maxfaults = MAXFAULTS(zs);
6465 raidz_children = ztest_get_raidz_children(spa);
6466 leaves = MAX(zs->zs_mirrors, 1) * raidz_children;
6467 mirror_save = zs->zs_mirrors;
6468
6469 ASSERT3U(leaves, >=, 1);
6470
6471 /*
6472 * While ztest is running the number of leaves will not change. This
6473 * is critical for the fault injection logic as it determines where
6474 * errors can be safely injected such that they are always repairable.
6475 *
6476 * When restarting ztest a different number of leaves may be requested
6477 * which will shift the regions to be damaged. This is fine as long
6478 * as the pool has been scrubbed prior to using the new mapping.
6479 * Failure to do so can result in non-repairable damage being injected.
6480 */
6481 if (ztest_pool_scrubbed == B_FALSE)
6482 goto out;
6483
6484 /*
6485 * Grab the name lock as reader. There are some operations
6486 * which don't like to have their vdevs changed while
6487 * they are in progress (e.g. spa_change_guid). Those
6488 * operations will have grabbed the name lock as writer.
6489 */
6490 (void) pthread_rwlock_rdlock(&ztest_name_lock);
6491
6492 /*
6493 * We need SCL_STATE here because we're going to look at vd0->vdev_tsd.
6494 */
6495 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
6496
6497 if (ztest_random(2) == 0) {
6498 /*
6499 * Inject errors on a normal data device or slog device.
6500 */
6501 top = ztest_random_vdev_top(spa, B_TRUE);
6502 leaf = ztest_random(leaves) + zs->zs_splits;
6503
6504 /*
6505 * Generate paths to the first leaf in this top-level vdev,
6506 * and to the random leaf we selected. We'll induce transient
6507 * write failures and random online/offline activity on leaf 0,
6508 * and we'll write random garbage to the randomly chosen leaf.
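*
* Both paths are generated from ztest_dev_template, so the leaf index
* (top * leaves + leaf) maps directly to a file under zo_dir.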
6509 */ 6510 (void) snprintf(path0, MAXPATHLEN, ztest_dev_template, 6511 ztest_opts.zo_dir, ztest_opts.zo_pool, 6512 top * leaves + zs->zs_splits); 6513 (void) snprintf(pathrand, MAXPATHLEN, ztest_dev_template, 6514 ztest_opts.zo_dir, ztest_opts.zo_pool, 6515 top * leaves + leaf); 6516 6517 vd0 = vdev_lookup_by_path(spa->spa_root_vdev, path0); 6518 if (vd0 != NULL && vd0->vdev_top->vdev_islog) 6519 islog = B_TRUE; 6520 6521 /* 6522 * If the top-level vdev needs to be resilvered 6523 * then we only allow faults on the device that is 6524 * resilvering. 6525 */ 6526 if (vd0 != NULL && maxfaults != 1 && 6527 (!vdev_resilver_needed(vd0->vdev_top, NULL, NULL) || 6528 vd0->vdev_resilver_txg != 0)) { 6529 /* 6530 * Make vd0 explicitly claim to be unreadable, 6531 * or unwritable, or reach behind its back 6532 * and close the underlying fd. We can do this if 6533 * maxfaults == 0 because we'll fail and reexecute, 6534 * and we can do it if maxfaults >= 2 because we'll 6535 * have enough redundancy. If maxfaults == 1, the 6536 * combination of this with injection of random data 6537 * corruption below exceeds the pool's fault tolerance. 6538 */ 6539 vdev_file_t *vf = vd0->vdev_tsd; 6540 6541 zfs_dbgmsg("injecting fault to vdev %llu; maxfaults=%d", 6542 (long long)vd0->vdev_id, (int)maxfaults); 6543 6544 if (vf != NULL && ztest_random(3) == 0) { 6545 (void) close(vf->vf_file->f_fd); 6546 vf->vf_file->f_fd = -1; 6547 } else if (ztest_random(2) == 0) { 6548 vd0->vdev_cant_read = B_TRUE; 6549 } else { 6550 vd0->vdev_cant_write = B_TRUE; 6551 } 6552 guid0 = vd0->vdev_guid; 6553 } 6554 } else { 6555 /* 6556 * Inject errors on an l2cache device. 6557 */ 6558 spa_aux_vdev_t *sav = &spa->spa_l2cache; 6559 6560 if (sav->sav_count == 0) { 6561 spa_config_exit(spa, SCL_STATE, FTAG); 6562 (void) pthread_rwlock_unlock(&ztest_name_lock); 6563 goto out; 6564 } 6565 vd0 = sav->sav_vdevs[ztest_random(sav->sav_count)]; 6566 guid0 = vd0->vdev_guid; 6567 (void) strlcpy(path0, vd0->vdev_path, MAXPATHLEN); 6568 (void) strlcpy(pathrand, vd0->vdev_path, MAXPATHLEN); 6569 6570 leaf = 0; 6571 leaves = 1; 6572 maxfaults = INT_MAX; /* no limit on cache devices */ 6573 } 6574 6575 spa_config_exit(spa, SCL_STATE, FTAG); 6576 (void) pthread_rwlock_unlock(&ztest_name_lock); 6577 6578 /* 6579 * If we can tolerate two or more faults, or we're dealing 6580 * with a slog, randomly online/offline vd0. 6581 */ 6582 if ((maxfaults >= 2 || islog) && guid0 != 0) { 6583 if (ztest_random(10) < 6) { 6584 int flags = (ztest_random(2) == 0 ? 6585 ZFS_OFFLINE_TEMPORARY : 0); 6586 6587 /* 6588 * We have to grab the zs_name_lock as writer to 6589 * prevent a race between offlining a slog and 6590 * destroying a dataset. Offlining the slog will 6591 * grab a reference on the dataset which may cause 6592 * dsl_destroy_head() to fail with EBUSY thus 6593 * leaving the dataset in an inconsistent state. 6594 */ 6595 if (islog) 6596 (void) pthread_rwlock_wrlock(&ztest_name_lock); 6597 6598 VERIFY3U(vdev_offline(spa, guid0, flags), !=, EBUSY); 6599 6600 if (islog) 6601 (void) pthread_rwlock_unlock(&ztest_name_lock); 6602 } else { 6603 /* 6604 * Ideally we would like to be able to randomly 6605 * call vdev_[on|off]line without holding locks 6606 * to force unpredictable failures but the side 6607 * effects of vdev_[on|off]line prevent us from 6608 * doing so. 
6609 */ 6610 (void) vdev_online(spa, guid0, 0, NULL); 6611 } 6612 } 6613 6614 if (maxfaults == 0) 6615 goto out; 6616 6617 /* 6618 * We have at least single-fault tolerance, so inject data corruption. 6619 */ 6620 fd = open(pathrand, O_RDWR); 6621 6622 if (fd == -1) /* we hit a gap in the device namespace */ 6623 goto out; 6624 6625 fsize = lseek(fd, 0, SEEK_END); 6626 6627 while (--iters != 0) { 6628 /* 6629 * The offset must be chosen carefully to ensure that 6630 * we do not inject a given logical block with errors 6631 * on two different leaf devices, because ZFS can not 6632 * tolerate that (if maxfaults==1). 6633 * 6634 * To achieve this we divide each leaf device into 6635 * chunks of size (# leaves * SPA_MAXBLOCKSIZE * 4). 6636 * Each chunk is further divided into error-injection 6637 * ranges (can accept errors) and clear ranges (we do 6638 * not inject errors in those). Each error-injection 6639 * range can accept errors only for a single leaf vdev. 6640 * Error-injection ranges are separated by clear ranges. 6641 * 6642 * For example, with 3 leaves, each chunk looks like: 6643 * 0 to 32M: injection range for leaf 0 6644 * 32M to 64M: clear range - no injection allowed 6645 * 64M to 96M: injection range for leaf 1 6646 * 96M to 128M: clear range - no injection allowed 6647 * 128M to 160M: injection range for leaf 2 6648 * 160M to 192M: clear range - no injection allowed 6649 * 6650 * Each clear range must be large enough such that a 6651 * single block cannot straddle it. This way a block 6652 * can't be a target in two different injection ranges 6653 * (on different leaf vdevs). 6654 */ 6655 offset = ztest_random(fsize / (leaves << bshift)) * 6656 (leaves << bshift) + (leaf << bshift) + 6657 (ztest_random(1ULL << (bshift - 1)) & -8ULL); 6658 6659 /* 6660 * Only allow damage to the labels at one end of the vdev. 6661 * 6662 * If all labels are damaged, the device will be totally 6663 * inaccessible, which will result in loss of data, 6664 * because we also damage (parts of) the other side of 6665 * the mirror/raidz. 6666 * 6667 * Additionally, we will always have both an even and an 6668 * odd label, so that we can handle crashes in the 6669 * middle of vdev_config_sync(). 6670 */ 6671 if ((leaf & 1) == 0 && offset < VDEV_LABEL_START_SIZE) 6672 continue; 6673 6674 /* 6675 * The two end labels are stored at the "end" of the disk, but 6676 * the end of the disk (vdev_psize) is aligned to 6677 * sizeof (vdev_label_t). 
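*
* Compute that aligned size below and skip any offset that would land
* in the two trailing labels (the last VDEV_LABEL_END_SIZE bytes) when
* we are damaging an odd-numbered leaf.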
6678 */
6679 uint64_t psize = P2ALIGN_TYPED(fsize, sizeof (vdev_label_t),
6680 uint64_t);
6681 if ((leaf & 1) == 1 &&
6682 offset + sizeof (bad) > psize - VDEV_LABEL_END_SIZE)
6683 continue;
6684
6685 if (mirror_save != zs->zs_mirrors) {
6686 (void) close(fd);
6687 goto out;
6688 }
6689
6690 if (pwrite(fd, &bad, sizeof (bad), offset) != sizeof (bad))
6691 fatal(B_TRUE,
6692 "can't inject bad word at 0x%"PRIx64" in %s",
6693 offset, pathrand);
6694
6695 if (ztest_opts.zo_verbose >= 7)
6696 (void) printf("injected bad word into %s,"
6697 " offset 0x%"PRIx64"\n", pathrand, offset);
6698
6699 injected = B_TRUE;
6700 }
6701
6702 (void) close(fd);
6703 out:
6704 mutex_exit(&ztest_vdev_lock);
6705
6706 if (injected && ztest_opts.zo_raid_do_expand) {
6707 int error = spa_scan(spa, POOL_SCAN_SCRUB);
6708 if (error == 0) {
6709 while (dsl_scan_scrubbing(spa_get_dsl(spa)))
6710 txg_wait_synced(spa_get_dsl(spa), 0);
6711 }
6712 }
6713
6714 umem_free(path0, MAXPATHLEN);
6715 umem_free(pathrand, MAXPATHLEN);
6716 }
6717
6718 /*
6719 * By design ztest will never inject uncorrectable damage into the pool.
6720 * Issue a scrub, wait for it to complete, and verify there is never any
6721 * persistent damage.
6722 *
6723 * Only after a full scrub has been completed is it safe to start injecting
6724 * data corruption. See the comment in ztest_fault_inject().
6725 *
6726 * EBUSY may be returned for the following six cases. It is the caller's
6727 * responsibility to handle them accordingly.
6728 *
6729 * Current state Requested
6730 * 1. Normal Scrub Running Normal Scrub or Error Scrub
6731 * 2. Normal Scrub Paused Error Scrub
6732 * 3. Normal Scrub Paused Pause Normal Scrub
6733 * 4. Error Scrub Running Normal Scrub or Error Scrub
6734 * 5. Error Scrub Paused Pause Error Scrub
6735 * 6. Resilvering Anything else
6736 */
6737 static int
6738 ztest_scrub_impl(spa_t *spa)
6739 {
6740 int error = spa_scan(spa, POOL_SCAN_SCRUB);
6741 if (error)
6742 return (error);
6743
6744 while (dsl_scan_scrubbing(spa_get_dsl(spa)))
6745 txg_wait_synced(spa_get_dsl(spa), 0);
6746
6747 if (spa_approx_errlog_size(spa) > 0)
6748 return (ECKSUM);
6749
6750 ztest_pool_scrubbed = B_TRUE;
6751
6752 return (0);
6753 }
6754
6755 /*
6756 * Scrub the pool.
6757 */
6758 void
6759 ztest_scrub(ztest_ds_t *zd, uint64_t id)
6760 {
6761 (void) zd, (void) id;
6762 spa_t *spa = ztest_spa;
6763 int error;
6764
6765 /*
6766 * A scrub is already in progress as part of device removal.
6767 */
6768 if (ztest_device_removal_active)
6769 return;
6770
6771 /*
6772 * Start a scrub, wait a moment, then force a restart.
6773 */
6774 (void) spa_scan(spa, POOL_SCAN_SCRUB);
6775 (void) poll(NULL, 0, 100);
6776
6777 error = ztest_scrub_impl(spa);
6778 if (error == EBUSY)
6779 error = 0;
6780 ASSERT0(error);
6781 }
6782
6783 /*
6784 * Change the guid for the pool.
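*
* Only the pool guid should change; the load guid must remain stable
* across spa_change_guid(), which is verified at the end.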
6785 */ 6786 void 6787 ztest_reguid(ztest_ds_t *zd, uint64_t id) 6788 { 6789 (void) zd, (void) id; 6790 spa_t *spa = ztest_spa; 6791 uint64_t orig, load; 6792 int error; 6793 ztest_shared_t *zs = ztest_shared; 6794 6795 if (ztest_opts.zo_mmp_test) 6796 return; 6797 6798 orig = spa_guid(spa); 6799 load = spa_load_guid(spa); 6800 6801 (void) pthread_rwlock_wrlock(&ztest_name_lock); 6802 error = spa_change_guid(spa, NULL); 6803 zs->zs_guid = spa_guid(spa); 6804 (void) pthread_rwlock_unlock(&ztest_name_lock); 6805 6806 if (error != 0) 6807 return; 6808 6809 if (ztest_opts.zo_verbose >= 4) { 6810 (void) printf("Changed guid old %"PRIu64" -> %"PRIu64"\n", 6811 orig, spa_guid(spa)); 6812 } 6813 6814 VERIFY3U(orig, !=, spa_guid(spa)); 6815 VERIFY3U(load, ==, spa_load_guid(spa)); 6816 } 6817 6818 void 6819 ztest_blake3(ztest_ds_t *zd, uint64_t id) 6820 { 6821 (void) zd, (void) id; 6822 hrtime_t end = gethrtime() + NANOSEC; 6823 zio_cksum_salt_t salt; 6824 void *salt_ptr = &salt.zcs_bytes; 6825 struct abd *abd_data, *abd_meta; 6826 void *buf, *templ; 6827 int i, *ptr; 6828 uint32_t size; 6829 BLAKE3_CTX ctx; 6830 const zfs_impl_t *blake3 = zfs_impl_get_ops("blake3"); 6831 6832 size = ztest_random_blocksize(); 6833 buf = umem_alloc(size, UMEM_NOFAIL); 6834 abd_data = abd_alloc(size, B_FALSE); 6835 abd_meta = abd_alloc(size, B_TRUE); 6836 6837 for (i = 0, ptr = buf; i < size / sizeof (*ptr); i++, ptr++) 6838 *ptr = ztest_random(UINT_MAX); 6839 memset(salt_ptr, 'A', 32); 6840 6841 abd_copy_from_buf_off(abd_data, buf, 0, size); 6842 abd_copy_from_buf_off(abd_meta, buf, 0, size); 6843 6844 while (gethrtime() <= end) { 6845 int run_count = 100; 6846 zio_cksum_t zc_ref1, zc_ref2; 6847 zio_cksum_t zc_res1, zc_res2; 6848 6849 void *ref1 = &zc_ref1; 6850 void *ref2 = &zc_ref2; 6851 void *res1 = &zc_res1; 6852 void *res2 = &zc_res2; 6853 6854 /* BLAKE3_KEY_LEN = 32 */ 6855 VERIFY0(blake3->setname("generic")); 6856 templ = abd_checksum_blake3_tmpl_init(&salt); 6857 Blake3_InitKeyed(&ctx, salt_ptr); 6858 Blake3_Update(&ctx, buf, size); 6859 Blake3_Final(&ctx, ref1); 6860 zc_ref2 = zc_ref1; 6861 ZIO_CHECKSUM_BSWAP(&zc_ref2); 6862 abd_checksum_blake3_tmpl_free(templ); 6863 6864 VERIFY0(blake3->setname("cycle")); 6865 while (run_count-- > 0) { 6866 6867 /* Test current implementation */ 6868 Blake3_InitKeyed(&ctx, salt_ptr); 6869 Blake3_Update(&ctx, buf, size); 6870 Blake3_Final(&ctx, res1); 6871 zc_res2 = zc_res1; 6872 ZIO_CHECKSUM_BSWAP(&zc_res2); 6873 6874 VERIFY0(memcmp(ref1, res1, 32)); 6875 VERIFY0(memcmp(ref2, res2, 32)); 6876 6877 /* Test ABD - data */ 6878 templ = abd_checksum_blake3_tmpl_init(&salt); 6879 abd_checksum_blake3_native(abd_data, size, 6880 templ, &zc_res1); 6881 abd_checksum_blake3_byteswap(abd_data, size, 6882 templ, &zc_res2); 6883 6884 VERIFY0(memcmp(ref1, res1, 32)); 6885 VERIFY0(memcmp(ref2, res2, 32)); 6886 6887 /* Test ABD - metadata */ 6888 abd_checksum_blake3_native(abd_meta, size, 6889 templ, &zc_res1); 6890 abd_checksum_blake3_byteswap(abd_meta, size, 6891 templ, &zc_res2); 6892 abd_checksum_blake3_tmpl_free(templ); 6893 6894 VERIFY0(memcmp(ref1, res1, 32)); 6895 VERIFY0(memcmp(ref2, res2, 32)); 6896 6897 } 6898 } 6899 6900 abd_free(abd_data); 6901 abd_free(abd_meta); 6902 umem_free(buf, size); 6903 } 6904 6905 void 6906 ztest_fletcher(ztest_ds_t *zd, uint64_t id) 6907 { 6908 (void) zd, (void) id; 6909 hrtime_t end = gethrtime() + NANOSEC; 6910 6911 while (gethrtime() <= end) { 6912 int run_count = 100; 6913 void *buf; 6914 struct abd *abd_data, *abd_meta; 6915 uint32_t size; 6916 
int *ptr; 6917 int i; 6918 zio_cksum_t zc_ref; 6919 zio_cksum_t zc_ref_byteswap; 6920 6921 size = ztest_random_blocksize(); 6922 6923 buf = umem_alloc(size, UMEM_NOFAIL); 6924 abd_data = abd_alloc(size, B_FALSE); 6925 abd_meta = abd_alloc(size, B_TRUE); 6926 6927 for (i = 0, ptr = buf; i < size / sizeof (*ptr); i++, ptr++) 6928 *ptr = ztest_random(UINT_MAX); 6929 6930 abd_copy_from_buf_off(abd_data, buf, 0, size); 6931 abd_copy_from_buf_off(abd_meta, buf, 0, size); 6932 6933 VERIFY0(fletcher_4_impl_set("scalar")); 6934 fletcher_4_native(buf, size, NULL, &zc_ref); 6935 fletcher_4_byteswap(buf, size, NULL, &zc_ref_byteswap); 6936 6937 VERIFY0(fletcher_4_impl_set("cycle")); 6938 while (run_count-- > 0) { 6939 zio_cksum_t zc; 6940 zio_cksum_t zc_byteswap; 6941 6942 fletcher_4_byteswap(buf, size, NULL, &zc_byteswap); 6943 fletcher_4_native(buf, size, NULL, &zc); 6944 6945 VERIFY0(memcmp(&zc, &zc_ref, sizeof (zc))); 6946 VERIFY0(memcmp(&zc_byteswap, &zc_ref_byteswap, 6947 sizeof (zc_byteswap))); 6948 6949 /* Test ABD - data */ 6950 abd_fletcher_4_byteswap(abd_data, size, NULL, 6951 &zc_byteswap); 6952 abd_fletcher_4_native(abd_data, size, NULL, &zc); 6953 6954 VERIFY0(memcmp(&zc, &zc_ref, sizeof (zc))); 6955 VERIFY0(memcmp(&zc_byteswap, &zc_ref_byteswap, 6956 sizeof (zc_byteswap))); 6957 6958 /* Test ABD - metadata */ 6959 abd_fletcher_4_byteswap(abd_meta, size, NULL, 6960 &zc_byteswap); 6961 abd_fletcher_4_native(abd_meta, size, NULL, &zc); 6962 6963 VERIFY0(memcmp(&zc, &zc_ref, sizeof (zc))); 6964 VERIFY0(memcmp(&zc_byteswap, &zc_ref_byteswap, 6965 sizeof (zc_byteswap))); 6966 6967 } 6968 6969 umem_free(buf, size); 6970 abd_free(abd_data); 6971 abd_free(abd_meta); 6972 } 6973 } 6974 6975 void 6976 ztest_fletcher_incr(ztest_ds_t *zd, uint64_t id) 6977 { 6978 (void) zd, (void) id; 6979 void *buf; 6980 size_t size; 6981 int *ptr; 6982 int i; 6983 zio_cksum_t zc_ref; 6984 zio_cksum_t zc_ref_bswap; 6985 6986 hrtime_t end = gethrtime() + NANOSEC; 6987 6988 while (gethrtime() <= end) { 6989 int run_count = 100; 6990 6991 size = ztest_random_blocksize(); 6992 buf = umem_alloc(size, UMEM_NOFAIL); 6993 6994 for (i = 0, ptr = buf; i < size / sizeof (*ptr); i++, ptr++) 6995 *ptr = ztest_random(UINT_MAX); 6996 6997 VERIFY0(fletcher_4_impl_set("scalar")); 6998 fletcher_4_native(buf, size, NULL, &zc_ref); 6999 fletcher_4_byteswap(buf, size, NULL, &zc_ref_bswap); 7000 7001 VERIFY0(fletcher_4_impl_set("cycle")); 7002 7003 while (run_count-- > 0) { 7004 zio_cksum_t zc; 7005 zio_cksum_t zc_bswap; 7006 size_t pos = 0; 7007 7008 ZIO_SET_CHECKSUM(&zc, 0, 0, 0, 0); 7009 ZIO_SET_CHECKSUM(&zc_bswap, 0, 0, 0, 0); 7010 7011 while (pos < size) { 7012 size_t inc = 64 * ztest_random(size / 67); 7013 /* sometimes add few bytes to test non-simd */ 7014 if (ztest_random(100) < 10) 7015 inc += P2ALIGN_TYPED(ztest_random(64), 7016 sizeof (uint32_t), uint64_t); 7017 7018 if (inc > (size - pos)) 7019 inc = size - pos; 7020 7021 fletcher_4_incremental_native(buf + pos, inc, 7022 &zc); 7023 fletcher_4_incremental_byteswap(buf + pos, inc, 7024 &zc_bswap); 7025 7026 pos += inc; 7027 } 7028 7029 VERIFY3U(pos, ==, size); 7030 7031 VERIFY(ZIO_CHECKSUM_EQUAL(zc, zc_ref)); 7032 VERIFY(ZIO_CHECKSUM_EQUAL(zc_bswap, zc_ref_bswap)); 7033 7034 /* 7035 * verify if incremental on the whole buffer is 7036 * equivalent to non-incremental version 7037 */ 7038 ZIO_SET_CHECKSUM(&zc, 0, 0, 0, 0); 7039 ZIO_SET_CHECKSUM(&zc_bswap, 0, 0, 0, 0); 7040 7041 fletcher_4_incremental_native(buf, size, &zc); 7042 fletcher_4_incremental_byteswap(buf, size, 
&zc_bswap);
7043
7044 VERIFY(ZIO_CHECKSUM_EQUAL(zc, zc_ref));
7045 VERIFY(ZIO_CHECKSUM_EQUAL(zc_bswap, zc_ref_bswap));
7046 }
7047
7048 umem_free(buf, size);
7049 }
7050 }
7051
7052 void
7053 ztest_pool_prefetch_ddt(ztest_ds_t *zd, uint64_t id)
7054 {
7055 (void) zd, (void) id;
7056 spa_t *spa;
7057
7058 (void) pthread_rwlock_rdlock(&ztest_name_lock);
7059 VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG));
7060
7061 ddt_prefetch_all(spa);
7062
7063 spa_close(spa, FTAG);
7064 (void) pthread_rwlock_unlock(&ztest_name_lock);
7065 }
7066
7067 static int
7068 ztest_set_global_vars(void)
7069 {
7070 for (size_t i = 0; i < ztest_opts.zo_gvars_count; i++) {
7071 char *kv = ztest_opts.zo_gvars[i];
7072 VERIFY3U(strlen(kv), <=, ZO_GVARS_MAX_ARGLEN);
7073 VERIFY3U(strlen(kv), >, 0);
7074 int err = handle_tunable_option(kv, B_TRUE);
7075 if (ztest_opts.zo_verbose > 0) {
7076 (void) printf("setting global var %s ... %s\n", kv,
7077 err ? "failed" : "ok");
7078 }
7079 if (err != 0) {
7080 (void) fprintf(stderr,
7081 "failed to set global var '%s'\n", kv);
7082 return (err);
7083 }
7084 }
7085 return (0);
7086 }
7087
7088 static char **
7089 ztest_global_vars_to_zdb_args(void)
7090 {
7091 char **args = calloc(2*ztest_opts.zo_gvars_count + 1, sizeof (char *));
7092 char **cur = args;
7093 if (args == NULL)
7094 return (NULL);
7095 for (size_t i = 0; i < ztest_opts.zo_gvars_count; i++) {
7096 *cur++ = (char *)"-o";
7097 *cur++ = ztest_opts.zo_gvars[i];
7098 }
7099 ASSERT3P(cur, ==, &args[2*ztest_opts.zo_gvars_count]);
7100 *cur = NULL;
7101 return (args);
7102 }
7103
7104 /* The end of strings is indicated by a NULL element */
7105 static char *
7106 join_strings(char **strings, const char *sep)
7107 {
7108 size_t totallen = 0;
7109 for (char **sp = strings; *sp != NULL; sp++) {
7110 totallen += strlen(*sp);
7111 totallen += strlen(sep);
7112 }
7113 if (totallen > 0) {
7114 ASSERT(totallen >= strlen(sep));
7115 totallen -= strlen(sep);
7116 }
7117
7118 size_t buflen = totallen + 1;
7119 char *o = umem_alloc(buflen, UMEM_NOFAIL); /* trailing 0 byte */
7120 o[0] = '\0';
7121 for (char **sp = strings; *sp != NULL; sp++) {
7122 size_t would;
7123 would = strlcat(o, *sp, buflen);
7124 VERIFY3U(would, <, buflen);
7125 if (*(sp+1) == NULL) {
7126 break;
7127 }
7128 would = strlcat(o, sep, buflen);
7129 VERIFY3U(would, <, buflen);
7130 }
7131 ASSERT3S(strlen(o), ==, totallen);
7132 return (o);
7133 }
7134
7135 static int
7136 ztest_check_path(char *path)
7137 {
7138 struct stat s;
7139 /* return true on success */
7140 return (!stat(path, &s));
7141 }
7142
7143 static void
7144 ztest_get_zdb_bin(char *bin, int len)
7145 {
7146 char *zdb_path;
7147 /*
7148 * Try $ZDB first, then the in-tree zdb path. If neither is usable,
7149 * just let popen() search PATH.
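*
* The in-tree case is detected by finding ".libs/ztest" in our own
* executable path and substituting the zdb binary from the same build
* directory.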
7150 */ 7151 if ((zdb_path = getenv("ZDB"))) { 7152 strlcpy(bin, zdb_path, len); /* In env */ 7153 if (!ztest_check_path(bin)) { 7154 ztest_dump_core = 0; 7155 fatal(B_TRUE, "invalid ZDB '%s'", bin); 7156 } 7157 return; 7158 } 7159 7160 VERIFY3P(realpath(getexecname(), bin), !=, NULL); 7161 if (strstr(bin, ".libs/ztest")) { 7162 strstr(bin, ".libs/ztest")[0] = '\0'; /* In-tree */ 7163 strcat(bin, "zdb"); 7164 if (ztest_check_path(bin)) 7165 return; 7166 } 7167 strcpy(bin, "zdb"); 7168 } 7169 7170 static vdev_t * 7171 ztest_random_concrete_vdev_leaf(vdev_t *vd) 7172 { 7173 if (vd == NULL) 7174 return (NULL); 7175 7176 if (vd->vdev_children == 0) 7177 return (vd); 7178 7179 vdev_t *eligible[vd->vdev_children]; 7180 int eligible_idx = 0, i; 7181 for (i = 0; i < vd->vdev_children; i++) { 7182 vdev_t *cvd = vd->vdev_child[i]; 7183 if (cvd->vdev_top->vdev_removing) 7184 continue; 7185 if (cvd->vdev_children > 0 || 7186 (vdev_is_concrete(cvd) && !cvd->vdev_detached)) { 7187 eligible[eligible_idx++] = cvd; 7188 } 7189 } 7190 VERIFY3S(eligible_idx, >, 0); 7191 7192 uint64_t child_no = ztest_random(eligible_idx); 7193 return (ztest_random_concrete_vdev_leaf(eligible[child_no])); 7194 } 7195 7196 void 7197 ztest_initialize(ztest_ds_t *zd, uint64_t id) 7198 { 7199 (void) zd, (void) id; 7200 spa_t *spa = ztest_spa; 7201 int error = 0; 7202 7203 mutex_enter(&ztest_vdev_lock); 7204 7205 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 7206 7207 /* Random leaf vdev */ 7208 vdev_t *rand_vd = ztest_random_concrete_vdev_leaf(spa->spa_root_vdev); 7209 if (rand_vd == NULL) { 7210 spa_config_exit(spa, SCL_VDEV, FTAG); 7211 mutex_exit(&ztest_vdev_lock); 7212 return; 7213 } 7214 7215 /* 7216 * The random vdev we've selected may change as soon as we 7217 * drop the spa_config_lock. We create local copies of things 7218 * we're interested in. 
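*
* Specifically the guid, the vdev path (used as the nvlist key passed
* to spa_vdev_initialize()), and whether an initialize thread is
* already running on this vdev.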
7219 */ 7220 uint64_t guid = rand_vd->vdev_guid; 7221 char *path = strdup(rand_vd->vdev_path); 7222 boolean_t active = rand_vd->vdev_initialize_thread != NULL; 7223 7224 zfs_dbgmsg("vd %px, guid %llu", rand_vd, (u_longlong_t)guid); 7225 spa_config_exit(spa, SCL_VDEV, FTAG); 7226 7227 uint64_t cmd = ztest_random(POOL_INITIALIZE_FUNCS); 7228 7229 nvlist_t *vdev_guids = fnvlist_alloc(); 7230 nvlist_t *vdev_errlist = fnvlist_alloc(); 7231 fnvlist_add_uint64(vdev_guids, path, guid); 7232 error = spa_vdev_initialize(spa, vdev_guids, cmd, vdev_errlist); 7233 fnvlist_free(vdev_guids); 7234 fnvlist_free(vdev_errlist); 7235 7236 switch (cmd) { 7237 case POOL_INITIALIZE_CANCEL: 7238 if (ztest_opts.zo_verbose >= 4) { 7239 (void) printf("Cancel initialize %s", path); 7240 if (!active) 7241 (void) printf(" failed (no initialize active)"); 7242 (void) printf("\n"); 7243 } 7244 break; 7245 case POOL_INITIALIZE_START: 7246 if (ztest_opts.zo_verbose >= 4) { 7247 (void) printf("Start initialize %s", path); 7248 if (active && error == 0) 7249 (void) printf(" failed (already active)"); 7250 else if (error != 0) 7251 (void) printf(" failed (error %d)", error); 7252 (void) printf("\n"); 7253 } 7254 break; 7255 case POOL_INITIALIZE_SUSPEND: 7256 if (ztest_opts.zo_verbose >= 4) { 7257 (void) printf("Suspend initialize %s", path); 7258 if (!active) 7259 (void) printf(" failed (no initialize active)"); 7260 (void) printf("\n"); 7261 } 7262 break; 7263 } 7264 free(path); 7265 mutex_exit(&ztest_vdev_lock); 7266 } 7267 7268 void 7269 ztest_trim(ztest_ds_t *zd, uint64_t id) 7270 { 7271 (void) zd, (void) id; 7272 spa_t *spa = ztest_spa; 7273 int error = 0; 7274 7275 mutex_enter(&ztest_vdev_lock); 7276 7277 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 7278 7279 /* Random leaf vdev */ 7280 vdev_t *rand_vd = ztest_random_concrete_vdev_leaf(spa->spa_root_vdev); 7281 if (rand_vd == NULL) { 7282 spa_config_exit(spa, SCL_VDEV, FTAG); 7283 mutex_exit(&ztest_vdev_lock); 7284 return; 7285 } 7286 7287 /* 7288 * The random vdev we've selected may change as soon as we 7289 * drop the spa_config_lock. We create local copies of things 7290 * we're interested in. 
7291 */ 7292 uint64_t guid = rand_vd->vdev_guid; 7293 char *path = strdup(rand_vd->vdev_path); 7294 boolean_t active = rand_vd->vdev_trim_thread != NULL; 7295 7296 zfs_dbgmsg("vd %p, guid %llu", rand_vd, (u_longlong_t)guid); 7297 spa_config_exit(spa, SCL_VDEV, FTAG); 7298 7299 uint64_t cmd = ztest_random(POOL_TRIM_FUNCS); 7300 uint64_t rate = 1 << ztest_random(30); 7301 boolean_t partial = (ztest_random(5) > 0); 7302 boolean_t secure = (ztest_random(5) > 0); 7303 7304 nvlist_t *vdev_guids = fnvlist_alloc(); 7305 nvlist_t *vdev_errlist = fnvlist_alloc(); 7306 fnvlist_add_uint64(vdev_guids, path, guid); 7307 error = spa_vdev_trim(spa, vdev_guids, cmd, rate, partial, 7308 secure, vdev_errlist); 7309 fnvlist_free(vdev_guids); 7310 fnvlist_free(vdev_errlist); 7311 7312 switch (cmd) { 7313 case POOL_TRIM_CANCEL: 7314 if (ztest_opts.zo_verbose >= 4) { 7315 (void) printf("Cancel TRIM %s", path); 7316 if (!active) 7317 (void) printf(" failed (no TRIM active)"); 7318 (void) printf("\n"); 7319 } 7320 break; 7321 case POOL_TRIM_START: 7322 if (ztest_opts.zo_verbose >= 4) { 7323 (void) printf("Start TRIM %s", path); 7324 if (active && error == 0) 7325 (void) printf(" failed (already active)"); 7326 else if (error != 0) 7327 (void) printf(" failed (error %d)", error); 7328 (void) printf("\n"); 7329 } 7330 break; 7331 case POOL_TRIM_SUSPEND: 7332 if (ztest_opts.zo_verbose >= 4) { 7333 (void) printf("Suspend TRIM %s", path); 7334 if (!active) 7335 (void) printf(" failed (no TRIM active)"); 7336 (void) printf("\n"); 7337 } 7338 break; 7339 } 7340 free(path); 7341 mutex_exit(&ztest_vdev_lock); 7342 } 7343 7344 void 7345 ztest_ddt_prune(ztest_ds_t *zd, uint64_t id) 7346 { 7347 (void) zd, (void) id; 7348 7349 spa_t *spa = ztest_spa; 7350 uint64_t pct = ztest_random(15) + 1; 7351 7352 (void) ddt_prune_unique_entries(spa, ZPOOL_DDT_PRUNE_PERCENTAGE, pct); 7353 } 7354 7355 /* 7356 * Verify pool integrity by running zdb. 7357 */ 7358 static void 7359 ztest_run_zdb(uint64_t guid) 7360 { 7361 int status; 7362 char *bin; 7363 char *zdb; 7364 char *zbuf; 7365 const int len = MAXPATHLEN + MAXNAMELEN + 20; 7366 FILE *fp; 7367 7368 bin = umem_alloc(len, UMEM_NOFAIL); 7369 zdb = umem_alloc(len, UMEM_NOFAIL); 7370 zbuf = umem_alloc(1024, UMEM_NOFAIL); 7371 7372 ztest_get_zdb_bin(bin, len); 7373 7374 char **set_gvars_args = ztest_global_vars_to_zdb_args(); 7375 if (set_gvars_args == NULL) { 7376 fatal(B_FALSE, "Failed to allocate memory in " 7377 "ztest_global_vars_to_zdb_args(). Cannot run zdb.\n"); 7378 } 7379 char *set_gvars_args_joined = join_strings(set_gvars_args, " "); 7380 free(set_gvars_args); 7381 7382 size_t would = snprintf(zdb, len, 7383 "%s -bcc%s%s -G -d -Y -e -y %s -p %s %"PRIu64, 7384 bin, 7385 ztest_opts.zo_verbose >= 3 ? "s" : "", 7386 ztest_opts.zo_verbose >= 4 ? 
"v" : "", 7387 set_gvars_args_joined, 7388 ztest_opts.zo_dir, 7389 guid); 7390 ASSERT3U(would, <, len); 7391 7392 umem_free(set_gvars_args_joined, strlen(set_gvars_args_joined) + 1); 7393 7394 if (ztest_opts.zo_verbose >= 5) 7395 (void) printf("Executing %s\n", zdb); 7396 7397 fp = popen(zdb, "r"); 7398 7399 while (fgets(zbuf, 1024, fp) != NULL) 7400 if (ztest_opts.zo_verbose >= 3) 7401 (void) printf("%s", zbuf); 7402 7403 status = pclose(fp); 7404 7405 if (status == 0) 7406 goto out; 7407 7408 ztest_dump_core = 0; 7409 if (WIFEXITED(status)) 7410 fatal(B_FALSE, "'%s' exit code %d", zdb, WEXITSTATUS(status)); 7411 else 7412 fatal(B_FALSE, "'%s' died with signal %d", 7413 zdb, WTERMSIG(status)); 7414 out: 7415 umem_free(bin, len); 7416 umem_free(zdb, len); 7417 umem_free(zbuf, 1024); 7418 } 7419 7420 static void 7421 ztest_walk_pool_directory(const char *header) 7422 { 7423 spa_t *spa = NULL; 7424 7425 if (ztest_opts.zo_verbose >= 6) 7426 (void) puts(header); 7427 7428 spa_namespace_enter(FTAG); 7429 while ((spa = spa_next(spa)) != NULL) 7430 if (ztest_opts.zo_verbose >= 6) 7431 (void) printf("\t%s\n", spa_name(spa)); 7432 spa_namespace_exit(FTAG); 7433 } 7434 7435 static void 7436 ztest_spa_import_export(char *oldname, char *newname) 7437 { 7438 nvlist_t *config, *newconfig; 7439 uint64_t pool_guid; 7440 spa_t *spa; 7441 int error; 7442 7443 if (ztest_opts.zo_verbose >= 4) { 7444 (void) printf("import/export: old = %s, new = %s\n", 7445 oldname, newname); 7446 } 7447 7448 /* 7449 * Clean up from previous runs. 7450 */ 7451 (void) spa_destroy(newname); 7452 7453 /* 7454 * Get the pool's configuration and guid. 7455 */ 7456 VERIFY0(spa_open(oldname, &spa, FTAG)); 7457 7458 /* 7459 * Kick off a scrub to tickle scrub/export races. 7460 */ 7461 if (ztest_random(2) == 0) 7462 (void) spa_scan(spa, POOL_SCAN_SCRUB); 7463 7464 pool_guid = spa_guid(spa); 7465 spa_close(spa, FTAG); 7466 7467 ztest_walk_pool_directory("pools before export"); 7468 7469 /* 7470 * Export it. 7471 */ 7472 VERIFY0(spa_export(oldname, &config, B_FALSE, B_FALSE)); 7473 7474 ztest_walk_pool_directory("pools after export"); 7475 7476 /* 7477 * Try to import it. 7478 */ 7479 newconfig = spa_tryimport(config); 7480 ASSERT3P(newconfig, !=, NULL); 7481 fnvlist_free(newconfig); 7482 7483 /* 7484 * Import it under the new name. 7485 */ 7486 error = spa_import(newname, config, NULL, 0); 7487 if (error != 0) { 7488 dump_nvlist(config, 0); 7489 fatal(B_FALSE, "couldn't import pool %s as %s: error %u", 7490 oldname, newname, error); 7491 } 7492 7493 ztest_walk_pool_directory("pools after import"); 7494 7495 /* 7496 * Try to import it again -- should fail with EEXIST. 7497 */ 7498 VERIFY3U(EEXIST, ==, spa_import(newname, config, NULL, 0)); 7499 7500 /* 7501 * Try to import it under a different name -- should fail with EEXIST. 7502 */ 7503 VERIFY3U(EEXIST, ==, spa_import(oldname, config, NULL, 0)); 7504 7505 /* 7506 * Verify that the pool is no longer visible under the old name. 7507 */ 7508 VERIFY3U(ENOENT, ==, spa_open(oldname, &spa, FTAG)); 7509 7510 /* 7511 * Verify that we can open and close the pool using the new name. 
7512 */ 7513 VERIFY0(spa_open(newname, &spa, FTAG)); 7514 ASSERT3U(pool_guid, ==, spa_guid(spa)); 7515 spa_close(spa, FTAG); 7516 7517 fnvlist_free(config); 7518 } 7519 7520 static void 7521 ztest_resume(spa_t *spa) 7522 { 7523 if (spa_suspended(spa) && ztest_opts.zo_verbose >= 6) 7524 (void) printf("resuming from suspended state\n"); 7525 spa_vdev_state_enter(spa, SCL_NONE); 7526 vdev_clear(spa, NULL); 7527 (void) spa_vdev_state_exit(spa, NULL, 0); 7528 (void) zio_resume(spa); 7529 } 7530 7531 static __attribute__((noreturn)) void 7532 ztest_resume_thread(void *arg) 7533 { 7534 spa_t *spa = arg; 7535 7536 /* 7537 * Synthesize aged DDT entries for ddt prune testing 7538 */ 7539 ddt_prune_artificial_age = B_TRUE; 7540 if (ztest_opts.zo_verbose >= 3) 7541 ddt_dump_prune_histogram = B_TRUE; 7542 7543 while (!ztest_exiting) { 7544 if (spa_suspended(spa)) 7545 ztest_resume(spa); 7546 (void) poll(NULL, 0, 100); 7547 7548 /* 7549 * Periodically change the zfs_compressed_arc_enabled setting. 7550 */ 7551 if (ztest_random(10) == 0) 7552 zfs_compressed_arc_enabled = ztest_random(2); 7553 7554 /* 7555 * Periodically change the zfs_abd_scatter_enabled setting. 7556 */ 7557 if (ztest_random(10) == 0) 7558 zfs_abd_scatter_enabled = ztest_random(2); 7559 } 7560 7561 thread_exit(); 7562 } 7563 7564 static __attribute__((noreturn)) void 7565 ztest_deadman_thread(void *arg) 7566 { 7567 ztest_shared_t *zs = arg; 7568 spa_t *spa = ztest_spa; 7569 hrtime_t delay, overdue, last_run = gethrtime(); 7570 7571 delay = (zs->zs_thread_stop - zs->zs_thread_start) + 7572 MSEC2NSEC(zfs_deadman_synctime_ms); 7573 7574 while (!ztest_exiting) { 7575 /* 7576 * Wait for the delay timer while checking occasionally 7577 * if we should stop. 7578 */ 7579 if (gethrtime() < last_run + delay) { 7580 (void) poll(NULL, 0, 1000); 7581 continue; 7582 } 7583 7584 /* 7585 * If the pool is suspended then fail immediately. Otherwise, 7586 * check to see if the pool is making any progress. If 7587 * vdev_deadman() discovers that there hasn't been any recent 7588 * I/Os then it will end up aborting the tests. 7589 */ 7590 if (spa_suspended(spa) || spa->spa_root_vdev == NULL) { 7591 fatal(B_FALSE, 7592 "aborting test after %llu seconds because " 7593 "pool has transitioned to a suspended state.", 7594 (u_longlong_t)zfs_deadman_synctime_ms / 1000); 7595 } 7596 vdev_deadman(spa->spa_root_vdev, FTAG); 7597 7598 /* 7599 * If the process doesn't complete within a grace period of 7600 * zfs_deadman_synctime_ms over the expected finish time, 7601 * then it may be hung and is terminated. 
7602 */ 7603 overdue = zs->zs_proc_stop + MSEC2NSEC(zfs_deadman_synctime_ms); 7604 if (gethrtime() > overdue) { 7605 fatal(B_FALSE, 7606 "aborting test after %llu seconds because " 7607 "the process is overdue for termination.", 7608 (gethrtime() - zs->zs_proc_start) / NANOSEC); 7609 } 7610 7611 (void) printf("ztest has been running for %lld seconds\n", 7612 (gethrtime() - zs->zs_proc_start) / NANOSEC); 7613 7614 last_run = gethrtime(); 7615 delay = MSEC2NSEC(zfs_deadman_checktime_ms); 7616 } 7617 7618 thread_exit(); 7619 } 7620 7621 static void 7622 ztest_execute(int test, ztest_info_t *zi, uint64_t id) 7623 { 7624 ztest_ds_t *zd = &ztest_ds[id % ztest_opts.zo_datasets]; 7625 ztest_shared_callstate_t *zc = ZTEST_GET_SHARED_CALLSTATE(test); 7626 hrtime_t functime = gethrtime(); 7627 int i; 7628 7629 for (i = 0; i < zi->zi_iters; i++) 7630 zi->zi_func(zd, id); 7631 7632 functime = gethrtime() - functime; 7633 7634 atomic_add_64(&zc->zc_count, 1); 7635 atomic_add_64(&zc->zc_time, functime); 7636 7637 if (ztest_opts.zo_verbose >= 4) 7638 (void) printf("%6.2f sec in %s\n", 7639 (double)functime / NANOSEC, zi->zi_funcname); 7640 } 7641 7642 typedef struct ztest_raidz_expand_io { 7643 uint64_t rzx_id; 7644 uint64_t rzx_amount; 7645 uint64_t rzx_bufsize; 7646 const void *rzx_buffer; 7647 uint64_t rzx_alloc_max; 7648 spa_t *rzx_spa; 7649 } ztest_expand_io_t; 7650 7651 #undef OD_ARRAY_SIZE 7652 #define OD_ARRAY_SIZE 10 7653 7654 /* 7655 * Write a request amount of data to some dataset objects. 7656 * There will be ztest_opts.zo_threads count of these running in parallel. 7657 */ 7658 static __attribute__((noreturn)) void 7659 ztest_rzx_thread(void *arg) 7660 { 7661 ztest_expand_io_t *info = (ztest_expand_io_t *)arg; 7662 ztest_od_t *od; 7663 int batchsize; 7664 int od_size; 7665 ztest_ds_t *zd = &ztest_ds[info->rzx_id % ztest_opts.zo_datasets]; 7666 spa_t *spa = info->rzx_spa; 7667 7668 od_size = sizeof (ztest_od_t) * OD_ARRAY_SIZE; 7669 od = umem_alloc(od_size, UMEM_NOFAIL); 7670 batchsize = OD_ARRAY_SIZE; 7671 7672 /* Create objects to write to */ 7673 for (int b = 0; b < batchsize; b++) { 7674 ztest_od_init(od + b, info->rzx_id, FTAG, b, 7675 DMU_OT_UINT64_OTHER, 0, 0, 0); 7676 } 7677 if (ztest_object_init(zd, od, od_size, B_FALSE) != 0) { 7678 umem_free(od, od_size); 7679 thread_exit(); 7680 } 7681 7682 for (uint64_t offset = 0, written = 0; written < info->rzx_amount; 7683 offset += info->rzx_bufsize) { 7684 /* write to 10 objects */ 7685 for (int i = 0; i < batchsize && written < info->rzx_amount; 7686 i++) { 7687 (void) pthread_rwlock_rdlock(&zd->zd_zilog_lock); 7688 ztest_write(zd, od[i].od_object, offset, 7689 info->rzx_bufsize, info->rzx_buffer); 7690 (void) pthread_rwlock_unlock(&zd->zd_zilog_lock); 7691 written += info->rzx_bufsize; 7692 } 7693 txg_wait_synced(spa_get_dsl(spa), 0); 7694 /* due to inflation, we'll typically bail here */ 7695 if (metaslab_class_get_alloc(spa_normal_class(spa)) > 7696 info->rzx_alloc_max) { 7697 break; 7698 } 7699 } 7700 7701 /* Remove a few objects to leave some holes in allocation space */ 7702 mutex_enter(&zd->zd_dirobj_lock); 7703 (void) ztest_remove(zd, od, 2); 7704 mutex_exit(&zd->zd_dirobj_lock); 7705 7706 umem_free(od, od_size); 7707 7708 thread_exit(); 7709 } 7710 7711 static __attribute__((noreturn)) void 7712 ztest_thread(void *arg) 7713 { 7714 int rand; 7715 uint64_t id = (uintptr_t)arg; 7716 ztest_shared_t *zs = ztest_shared; 7717 uint64_t call_next; 7718 hrtime_t now; 7719 ztest_info_t *zi; 7720 ztest_shared_callstate_t *zc; 7721 7722 while 
((now = gethrtime()) < zs->zs_thread_stop) { 7723 /* 7724 * See if it's time to force a crash. 7725 */ 7726 if (now > zs->zs_thread_kill && 7727 raidz_expand_pause_point == RAIDZ_EXPAND_PAUSE_NONE) { 7728 ztest_kill(zs); 7729 } 7730 7731 /* 7732 * If we're getting ENOSPC with some regularity, stop. 7733 */ 7734 if (zs->zs_enospc_count > 10) 7735 break; 7736 7737 /* 7738 * Pick a random function to execute. 7739 */ 7740 rand = ztest_random(ZTEST_FUNCS); 7741 zi = &ztest_info[rand]; 7742 zc = ZTEST_GET_SHARED_CALLSTATE(rand); 7743 call_next = zc->zc_next; 7744 7745 if (now >= call_next && 7746 atomic_cas_64(&zc->zc_next, call_next, call_next + 7747 ztest_random(2 * zi->zi_interval[0] + 1)) == call_next) { 7748 ztest_execute(rand, zi, id); 7749 } 7750 } 7751 7752 thread_exit(); 7753 } 7754 7755 static void 7756 ztest_dataset_name(char *dsname, const char *pool, int d) 7757 { 7758 (void) snprintf(dsname, ZFS_MAX_DATASET_NAME_LEN, "%s/ds_%d", pool, d); 7759 } 7760 7761 static void 7762 ztest_dataset_destroy(int d) 7763 { 7764 char name[ZFS_MAX_DATASET_NAME_LEN]; 7765 int t; 7766 7767 ztest_dataset_name(name, ztest_opts.zo_pool, d); 7768 7769 if (ztest_opts.zo_verbose >= 3) 7770 (void) printf("Destroying %s to free up space\n", name); 7771 7772 /* 7773 * Cleanup any non-standard clones and snapshots. In general, 7774 * ztest thread t operates on dataset (t % zopt_datasets), 7775 * so there may be more than one thing to clean up. 7776 */ 7777 for (t = d; t < ztest_opts.zo_threads; 7778 t += ztest_opts.zo_datasets) 7779 ztest_dsl_dataset_cleanup(name, t); 7780 7781 (void) dmu_objset_find(name, ztest_objset_destroy_cb, NULL, 7782 DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN); 7783 } 7784 7785 static void 7786 ztest_dataset_dirobj_verify(ztest_ds_t *zd) 7787 { 7788 uint64_t usedobjs, dirobjs, scratch; 7789 7790 /* 7791 * ZTEST_DIROBJ is the object directory for the entire dataset. 7792 * Therefore, the number of objects in use should equal the 7793 * number of ZTEST_DIROBJ entries, +1 for ZTEST_DIROBJ itself. 7794 * If not, we have an object leak. 7795 * 7796 * Note that we can only check this in ztest_dataset_open(), 7797 * when the open-context and syncing-context values agree. 7798 * That's because zap_count() returns the open-context value, 7799 * while dmu_objset_space() returns the rootbp fill count. 
7800 */ 7801 VERIFY0(zap_count(zd->zd_os, ZTEST_DIROBJ, &dirobjs)); 7802 dmu_objset_space(zd->zd_os, &scratch, &scratch, &usedobjs, &scratch); 7803 ASSERT3U(dirobjs + 1, ==, usedobjs); 7804 } 7805 7806 static int 7807 ztest_dataset_open(int d) 7808 { 7809 ztest_ds_t *zd = &ztest_ds[d]; 7810 uint64_t committed_seq = ZTEST_GET_SHARED_DS(d)->zd_seq; 7811 objset_t *os; 7812 zilog_t *zilog; 7813 char name[ZFS_MAX_DATASET_NAME_LEN]; 7814 int error; 7815 7816 ztest_dataset_name(name, ztest_opts.zo_pool, d); 7817 7818 if (ztest_opts.zo_verbose >= 6) 7819 (void) printf("Opening %s\n", name); 7820 7821 (void) pthread_rwlock_rdlock(&ztest_name_lock); 7822 7823 error = ztest_dataset_create(name); 7824 if (error == ENOSPC) { 7825 (void) pthread_rwlock_unlock(&ztest_name_lock); 7826 ztest_record_enospc(FTAG); 7827 return (error); 7828 } 7829 ASSERT(error == 0 || error == EEXIST); 7830 7831 VERIFY0(ztest_dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, 7832 B_TRUE, zd, &os)); 7833 (void) pthread_rwlock_unlock(&ztest_name_lock); 7834 7835 ztest_zd_init(zd, ZTEST_GET_SHARED_DS(d), os); 7836 7837 zilog = zd->zd_zilog; 7838 7839 if (zilog->zl_header->zh_claim_lr_seq != 0 && 7840 zilog->zl_header->zh_claim_lr_seq < committed_seq) 7841 fatal(B_FALSE, "missing log records: " 7842 "claimed %"PRIu64" < committed %"PRIu64"", 7843 zilog->zl_header->zh_claim_lr_seq, committed_seq); 7844 7845 ztest_dataset_dirobj_verify(zd); 7846 7847 zil_replay(os, zd, ztest_replay_vector); 7848 7849 ztest_dataset_dirobj_verify(zd); 7850 7851 if (ztest_opts.zo_verbose >= 6) 7852 (void) printf("%s replay %"PRIu64" blocks, " 7853 "%"PRIu64" records, seq %"PRIu64"\n", 7854 zd->zd_name, 7855 zilog->zl_parse_blk_count, 7856 zilog->zl_parse_lr_count, 7857 zilog->zl_replaying_seq); 7858 7859 zilog = zil_open(os, ztest_get_data, NULL); 7860 7861 if (zilog->zl_replaying_seq != 0 && 7862 zilog->zl_replaying_seq < committed_seq) 7863 fatal(B_FALSE, "missing log records: " 7864 "replayed %"PRIu64" < committed %"PRIu64"", 7865 zilog->zl_replaying_seq, committed_seq); 7866 7867 return (0); 7868 } 7869 7870 static void 7871 ztest_dataset_close(int d) 7872 { 7873 ztest_ds_t *zd = &ztest_ds[d]; 7874 7875 zil_close(zd->zd_zilog); 7876 dmu_objset_disown(zd->zd_os, B_TRUE, zd); 7877 7878 ztest_zd_fini(zd); 7879 } 7880 7881 static int 7882 ztest_replay_zil_cb(const char *name, void *arg) 7883 { 7884 (void) arg; 7885 objset_t *os; 7886 ztest_ds_t *zdtmp; 7887 7888 VERIFY0(ztest_dmu_objset_own(name, DMU_OST_ANY, B_TRUE, 7889 B_TRUE, FTAG, &os)); 7890 7891 zdtmp = umem_alloc(sizeof (ztest_ds_t), UMEM_NOFAIL); 7892 7893 ztest_zd_init(zdtmp, NULL, os); 7894 zil_replay(os, zdtmp, ztest_replay_vector); 7895 ztest_zd_fini(zdtmp); 7896 7897 if (dmu_objset_zil(os)->zl_parse_lr_count != 0 && 7898 ztest_opts.zo_verbose >= 6) { 7899 zilog_t *zilog = dmu_objset_zil(os); 7900 7901 (void) printf("%s replay %"PRIu64" blocks, " 7902 "%"PRIu64" records, seq %"PRIu64"\n", 7903 name, 7904 zilog->zl_parse_blk_count, 7905 zilog->zl_parse_lr_count, 7906 zilog->zl_replaying_seq); 7907 } 7908 7909 umem_free(zdtmp, sizeof (ztest_ds_t)); 7910 7911 dmu_objset_disown(os, B_TRUE, FTAG); 7912 return (0); 7913 } 7914 7915 static void 7916 ztest_freeze(void) 7917 { 7918 ztest_ds_t *zd = &ztest_ds[0]; 7919 spa_t *spa; 7920 int numloops = 0; 7921 7922 /* freeze not supported during RAIDZ expansion */ 7923 if (ztest_opts.zo_raid_do_expand) 7924 return; 7925 7926 if (ztest_opts.zo_verbose >= 3) 7927 (void) printf("testing spa_freeze()...\n"); 7928 7929 raidz_scratch_verify(); 7930 
kernel_init(SPA_MODE_READ | SPA_MODE_WRITE); 7931 VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG)); 7932 VERIFY0(ztest_dataset_open(0)); 7933 ztest_spa = spa; 7934 7935 /* 7936 * Force the first log block to be transactionally allocated. 7937 * We have to do this before we freeze the pool -- otherwise 7938 * the log chain won't be anchored. 7939 */ 7940 while (BP_IS_HOLE(&zd->zd_zilog->zl_header->zh_log)) { 7941 ztest_dmu_object_alloc_free(zd, 0); 7942 VERIFY0(zil_commit(zd->zd_zilog, 0)); 7943 } 7944 7945 txg_wait_synced(spa_get_dsl(spa), 0); 7946 7947 /* 7948 * Freeze the pool. This stops spa_sync() from doing anything, 7949 * so that the only way to record changes from now on is the ZIL. 7950 */ 7951 spa_freeze(spa); 7952 7953 /* 7954 * Because it is hard to predict how much space a write will actually 7955 * require beforehand, we leave ourselves some fudge space to write over 7956 * capacity. 7957 */ 7958 uint64_t capacity = metaslab_class_get_space(spa_normal_class(spa)) / 2; 7959 7960 /* 7961 * Run tests that generate log records but don't alter the pool config 7962 * or depend on DSL sync tasks (snapshots, objset create/destroy, etc). 7963 * We do a txg_wait_synced() after each iteration to force the txg 7964 * to increase well beyond the last synced value in the uberblock. 7965 * The ZIL should be OK with that. 7966 * 7967 * Run a random number of times less than zo_maxloops and ensure we do 7968 * not run out of space on the pool. 7969 */ 7970 while (ztest_random(10) != 0 && 7971 numloops++ < ztest_opts.zo_maxloops && 7972 metaslab_class_get_alloc(spa_normal_class(spa)) < capacity) { 7973 ztest_od_t od; 7974 ztest_od_init(&od, 0, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0, 0); 7975 VERIFY0(ztest_object_init(zd, &od, sizeof (od), B_FALSE)); 7976 ztest_io(zd, od.od_object, 7977 ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT); 7978 txg_wait_synced(spa_get_dsl(spa), 0); 7979 } 7980 7981 /* 7982 * Commit all of the changes we just generated. 7983 */ 7984 VERIFY0(zil_commit(zd->zd_zilog, 0)); 7985 txg_wait_synced(spa_get_dsl(spa), 0); 7986 7987 /* 7988 * Close our dataset and close the pool. 7989 */ 7990 ztest_dataset_close(0); 7991 spa_close(spa, FTAG); 7992 kernel_fini(); 7993 7994 /* 7995 * Open and close the pool and dataset to induce log replay. 7996 */ 7997 raidz_scratch_verify(); 7998 kernel_init(SPA_MODE_READ | SPA_MODE_WRITE); 7999 VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG)); 8000 ASSERT3U(spa_freeze_txg(spa), ==, UINT64_MAX); 8001 VERIFY0(ztest_dataset_open(0)); 8002 ztest_spa = spa; 8003 txg_wait_synced(spa_get_dsl(spa), 0); 8004 ztest_dataset_close(0); 8005 ztest_reguid(NULL, 0); 8006 8007 spa_close(spa, FTAG); 8008 kernel_fini(); 8009 } 8010 8011 static void 8012 ztest_import_impl(void) 8013 { 8014 importargs_t args = { 0 }; 8015 nvlist_t *cfg = NULL; 8016 int nsearch = 1; 8017 char *searchdirs[nsearch]; 8018 int flags = ZFS_IMPORT_MISSING_LOG; 8019 8020 searchdirs[0] = ztest_opts.zo_dir; 8021 args.paths = nsearch; 8022 args.path = searchdirs; 8023 args.can_be_active = B_FALSE; 8024 8025 libpc_handle_t lpch = { 8026 .lpc_lib_handle = NULL, 8027 .lpc_ops = &libzpool_config_ops, 8028 .lpc_printerr = B_TRUE 8029 }; 8030 VERIFY0(zpool_find_config(&lpch, ztest_opts.zo_pool, &cfg, &args)); 8031 VERIFY0(spa_import(ztest_opts.zo_pool, cfg, NULL, flags)); 8032 fnvlist_free(cfg); 8033 } 8034 8035 /* 8036 * Import a storage pool with the given name. 
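*
* The pool is located and imported via ztest_import_impl(), after which
* we record its metaslab size and guid and, unless this is an MMP test,
* rerun the zdb verification and spa_freeze() checks against it.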
8037 */ 8038 static void 8039 ztest_import(ztest_shared_t *zs) 8040 { 8041 spa_t *spa; 8042 8043 mutex_init(&ztest_vdev_lock, NULL, MUTEX_DEFAULT, NULL); 8044 mutex_init(&ztest_checkpoint_lock, NULL, MUTEX_DEFAULT, NULL); 8045 VERIFY0(pthread_rwlock_init(&ztest_name_lock, NULL)); 8046 8047 raidz_scratch_verify(); 8048 kernel_init(SPA_MODE_READ | SPA_MODE_WRITE); 8049 8050 ztest_import_impl(); 8051 8052 VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG)); 8053 zs->zs_metaslab_sz = 8054 1ULL << spa->spa_root_vdev->vdev_child[0]->vdev_ms_shift; 8055 zs->zs_guid = spa_guid(spa); 8056 spa_close(spa, FTAG); 8057 8058 kernel_fini(); 8059 8060 if (!ztest_opts.zo_mmp_test) { 8061 ztest_run_zdb(zs->zs_guid); 8062 ztest_freeze(); 8063 ztest_run_zdb(zs->zs_guid); 8064 } 8065 8066 (void) pthread_rwlock_destroy(&ztest_name_lock); 8067 mutex_destroy(&ztest_vdev_lock); 8068 mutex_destroy(&ztest_checkpoint_lock); 8069 } 8070 8071 /* 8072 * After the expansion was killed, check that the pool is healthy 8073 */ 8074 static void 8075 ztest_raidz_expand_check(spa_t *spa) 8076 { 8077 ASSERT3U(ztest_opts.zo_raidz_expand_test, ==, RAIDZ_EXPAND_KILLED); 8078 /* 8079 * Set pool check done flag, main program will run a zdb check 8080 * of the pool when we exit. 8081 */ 8082 ztest_shared_opts->zo_raidz_expand_test = RAIDZ_EXPAND_CHECKED; 8083 8084 /* Wait for reflow to finish */ 8085 if (ztest_opts.zo_verbose >= 1) { 8086 (void) printf("\nwaiting for reflow to finish ...\n"); 8087 } 8088 pool_raidz_expand_stat_t rzx_stats; 8089 pool_raidz_expand_stat_t *pres = &rzx_stats; 8090 do { 8091 txg_wait_synced(spa_get_dsl(spa), 0); 8092 (void) poll(NULL, 0, 500); /* wait 1/2 second */ 8093 8094 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 8095 (void) spa_raidz_expand_get_stats(spa, pres); 8096 spa_config_exit(spa, SCL_CONFIG, FTAG); 8097 } while (pres->pres_state != DSS_FINISHED && 8098 pres->pres_reflowed < pres->pres_to_reflow); 8099 8100 if (ztest_opts.zo_verbose >= 1) { 8101 (void) printf("verifying an interrupted raidz " 8102 "expansion using a pool scrub ...\n"); 8103 } 8104 8105 /* Will fail here if there is non-recoverable corruption detected */ 8106 int error = ztest_scrub_impl(spa); 8107 if (error == EBUSY) 8108 error = 0; 8109 8110 VERIFY0(error); 8111 8112 if (ztest_opts.zo_verbose >= 1) { 8113 (void) printf("raidz expansion scrub check complete\n"); 8114 } 8115 } 8116 8117 /* 8118 * Start a raidz expansion test. We run some I/O on the pool for a while 8119 * to get some data in the pool. Then we grow the raidz and 8120 * kill the test at the requested offset into the reflow, verifying that 8121 * doing such does not lead to pool corruption. 
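 * After the kill, the parent restarts the test; ztest_run() sees the
 * shared zo_raidz_expand_test state RAIDZ_EXPAND_KILLED and runs
 * ztest_raidz_expand_check() to scrub-verify the reopened pool.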
8122 */
8123 static void
8124 ztest_raidz_expand_run(ztest_shared_t *zs, spa_t *spa)
8125 {
8126 nvlist_t *root;
8127 pool_raidz_expand_stat_t rzx_stats;
8128 pool_raidz_expand_stat_t *pres = &rzx_stats;
8129 kthread_t **run_threads;
8130 vdev_t *cvd, *rzvd = spa->spa_root_vdev->vdev_child[0];
8131 int total_disks = rzvd->vdev_children;
8132 int data_disks = total_disks - vdev_get_nparity(rzvd);
8133 uint64_t alloc_goal;
8134 uint64_t csize;
8135 int error, t;
8136 int threads = ztest_opts.zo_threads;
8137 ztest_expand_io_t *thread_args;
8138
8139 ASSERT3U(ztest_opts.zo_raidz_expand_test, !=, RAIDZ_EXPAND_NONE);
8140 ASSERT3P(rzvd->vdev_ops, ==, &vdev_raidz_ops);
8141 ztest_opts.zo_raidz_expand_test = RAIDZ_EXPAND_STARTED;
8142
8143 /* Set up a 1 MiB buffer of random data */
8144 uint64_t bufsize = 1024 * 1024;
8145 void *buffer = umem_alloc(bufsize, UMEM_NOFAIL);
8146 random_get_pseudo_bytes((uint8_t *)buffer, bufsize);
8147
8148 /*
8149 * Put some data in the pool and then attach a vdev to initiate
8150 * reflow.
8151 */
8152 run_threads = umem_zalloc(threads * sizeof (kthread_t *), UMEM_NOFAIL);
8153 thread_args = umem_zalloc(threads * sizeof (ztest_expand_io_t),
8154 UMEM_NOFAIL);
8155 /* Aim for roughly 25% of allocatable space up to 1GB */
8156 alloc_goal = (vdev_get_min_asize(rzvd) * data_disks) / total_disks;
8157 alloc_goal = MIN(alloc_goal >> 2, 1024*1024*1024);
8158 if (ztest_opts.zo_verbose >= 1) {
8159 (void) printf("adding data to pool '%s', goal %llu bytes\n",
8160 ztest_opts.zo_pool, (u_longlong_t)alloc_goal);
8161 }
8162
8163 /*
8164 * Kick off all the I/O generators that run in parallel.
8165 */
8166 for (t = 0; t < threads; t++) {
8167 if (t < ztest_opts.zo_datasets && ztest_dataset_open(t) != 0) {
8168 umem_free(run_threads, threads * sizeof (kthread_t *));
8169 umem_free(buffer, bufsize);
umem_free(thread_args, threads * sizeof (ztest_expand_io_t));
8170 return;
8171 }
8172 thread_args[t].rzx_id = t;
8173 thread_args[t].rzx_amount = alloc_goal / threads;
8174 thread_args[t].rzx_bufsize = bufsize;
8175 thread_args[t].rzx_buffer = buffer;
8176 thread_args[t].rzx_alloc_max = alloc_goal;
8177 thread_args[t].rzx_spa = spa;
8178 run_threads[t] = thread_create(NULL, 0, ztest_rzx_thread,
8179 &thread_args[t], 0, NULL, TS_RUN | TS_JOINABLE,
8180 defclsyspri);
8181 }
8182
8183 /*
8184 * Wait for all of the writers to complete.
8185 */
8186 for (t = 0; t < threads; t++)
8187 VERIFY0(thread_join(run_threads[t]));
8188
8189 /*
8190 * Close all datasets. This must be done after all the threads
8191 * are joined so we can be sure none of the datasets are in-use
8192 * by any of the threads.
8193 */ 8194 for (t = 0; t < ztest_opts.zo_threads; t++) { 8195 if (t < ztest_opts.zo_datasets) 8196 ztest_dataset_close(t); 8197 } 8198 8199 txg_wait_synced(spa_get_dsl(spa), 0); 8200 8201 zs->zs_alloc = metaslab_class_get_alloc(spa_normal_class(spa)); 8202 zs->zs_space = metaslab_class_get_space(spa_normal_class(spa)); 8203 8204 umem_free(buffer, bufsize); 8205 umem_free(run_threads, threads * sizeof (kthread_t *)); 8206 umem_free(thread_args, threads * sizeof (ztest_expand_io_t)); 8207 8208 /* Set our reflow target to 25%, 50% or 75% of allocated size */ 8209 uint_t multiple = ztest_random(3) + 1; 8210 uint64_t reflow_max = (rzvd->vdev_stat.vs_alloc * multiple) / 4; 8211 raidz_expand_max_reflow_bytes = reflow_max; 8212 8213 if (ztest_opts.zo_verbose >= 1) { 8214 (void) printf("running raidz expansion test, killing when " 8215 "reflow reaches %llu bytes (%u/4 of allocated space)\n", 8216 (u_longlong_t)reflow_max, multiple); 8217 } 8218 8219 /* XXX - do we want some I/O load during the reflow? */ 8220 8221 /* 8222 * Use a disk size that is larger than existing ones 8223 */ 8224 cvd = rzvd->vdev_child[0]; 8225 csize = vdev_get_min_asize(cvd); 8226 csize += csize / 10; 8227 /* 8228 * Path to vdev to be attached 8229 */ 8230 char *newpath = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); 8231 (void) snprintf(newpath, MAXPATHLEN, ztest_dev_template, 8232 ztest_opts.zo_dir, ztest_opts.zo_pool, rzvd->vdev_children); 8233 /* 8234 * Build the nvlist describing newpath. 8235 */ 8236 root = make_vdev_root(newpath, NULL, NULL, csize, ztest_get_ashift(), 8237 NULL, 0, 0, 1); 8238 /* 8239 * Expand the raidz vdev by attaching the new disk 8240 */ 8241 if (ztest_opts.zo_verbose >= 1) { 8242 (void) printf("expanding raidz: %d wide to %d wide with '%s'\n", 8243 (int)rzvd->vdev_children, (int)rzvd->vdev_children + 1, 8244 newpath); 8245 } 8246 error = spa_vdev_attach(spa, rzvd->vdev_guid, root, B_FALSE, B_FALSE); 8247 nvlist_free(root); 8248 if (error != 0) { 8249 fatal(0, "raidz expand: attach (%s %llu) returned %d", 8250 newpath, (long long)csize, error); 8251 } 8252 8253 /* 8254 * Wait for reflow to begin 8255 */ 8256 while (spa->spa_raidz_expand == NULL) { 8257 txg_wait_synced(spa_get_dsl(spa), 0); 8258 (void) poll(NULL, 0, 100); /* wait 1/10 second */ 8259 } 8260 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 8261 (void) spa_raidz_expand_get_stats(spa, pres); 8262 spa_config_exit(spa, SCL_CONFIG, FTAG); 8263 while (pres->pres_state != DSS_SCANNING) { 8264 txg_wait_synced(spa_get_dsl(spa), 0); 8265 (void) poll(NULL, 0, 100); /* wait 1/10 second */ 8266 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 8267 (void) spa_raidz_expand_get_stats(spa, pres); 8268 spa_config_exit(spa, SCL_CONFIG, FTAG); 8269 } 8270 8271 ASSERT3U(pres->pres_state, ==, DSS_SCANNING); 8272 ASSERT3U(pres->pres_to_reflow, !=, 0); 8273 /* 8274 * Set so when we are killed we go to raidz checking rather than 8275 * restarting test. 
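 * The flag lives in the mmap-ed shared options, so the new value
 * survives the SIGKILL and is seen by the next ztest child.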
8276 */ 8277 ztest_shared_opts->zo_raidz_expand_test = RAIDZ_EXPAND_KILLED; 8278 if (ztest_opts.zo_verbose >= 1) { 8279 (void) printf("raidz expansion reflow started, waiting for " 8280 "%llu bytes to be copied\n", (u_longlong_t)reflow_max); 8281 } 8282 8283 /* 8284 * Wait for reflow maximum to be reached and then kill the test 8285 */ 8286 while (pres->pres_reflowed < reflow_max) { 8287 txg_wait_synced(spa_get_dsl(spa), 0); 8288 (void) poll(NULL, 0, 100); /* wait 1/10 second */ 8289 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 8290 (void) spa_raidz_expand_get_stats(spa, pres); 8291 spa_config_exit(spa, SCL_CONFIG, FTAG); 8292 } 8293 8294 /* Reset the reflow pause before killing */ 8295 raidz_expand_max_reflow_bytes = 0; 8296 8297 if (ztest_opts.zo_verbose >= 1) { 8298 (void) printf("killing raidz expansion test after reflow " 8299 "reached %llu bytes\n", (u_longlong_t)pres->pres_reflowed); 8300 } 8301 8302 /* 8303 * Kill ourself to simulate a panic during a reflow. Our parent will 8304 * restart the test and the changed flag value will drive the test 8305 * through the scrub/check code to verify the pool is not corrupted. 8306 */ 8307 ztest_kill(zs); 8308 } 8309 8310 static void 8311 ztest_generic_run(ztest_shared_t *zs, spa_t *spa) 8312 { 8313 kthread_t **run_threads; 8314 int i, ndatasets; 8315 8316 run_threads = umem_zalloc(ztest_opts.zo_threads * sizeof (kthread_t *), 8317 UMEM_NOFAIL); 8318 8319 /* 8320 * Actual number of datasets to be used. 8321 */ 8322 ndatasets = MIN(ztest_opts.zo_datasets, ztest_opts.zo_threads); 8323 8324 /* 8325 * Prepare the datasets first. 8326 */ 8327 for (i = 0; i < ndatasets; i++) 8328 VERIFY0(ztest_dataset_open(i)); 8329 8330 /* 8331 * Kick off all the tests that run in parallel. 8332 */ 8333 for (i = 0; i < ztest_opts.zo_threads; i++) { 8334 run_threads[i] = thread_create(NULL, 0, ztest_thread, 8335 (void *)(uintptr_t)i, 0, NULL, TS_RUN | TS_JOINABLE, 8336 defclsyspri); 8337 } 8338 8339 /* 8340 * Wait for all of the tests to complete. 8341 */ 8342 for (i = 0; i < ztest_opts.zo_threads; i++) 8343 VERIFY0(thread_join(run_threads[i])); 8344 8345 /* 8346 * Close all datasets. This must be done after all the threads 8347 * are joined so we can be sure none of the datasets are in-use 8348 * by any of the threads. 8349 */ 8350 for (i = 0; i < ndatasets; i++) 8351 ztest_dataset_close(i); 8352 8353 txg_wait_synced(spa_get_dsl(spa), 0); 8354 8355 zs->zs_alloc = metaslab_class_get_alloc(spa_normal_class(spa)); 8356 zs->zs_space = metaslab_class_get_space(spa_normal_class(spa)); 8357 8358 umem_free(run_threads, ztest_opts.zo_threads * sizeof (kthread_t *)); 8359 } 8360 8361 /* 8362 * Setup our test context and kick off threads to run tests on all datasets 8363 * in parallel. 8364 */ 8365 static void 8366 ztest_run(ztest_shared_t *zs) 8367 { 8368 spa_t *spa; 8369 objset_t *os; 8370 kthread_t *resume_thread, *deadman_thread; 8371 uint64_t object; 8372 int error; 8373 int t, d; 8374 8375 ztest_exiting = B_FALSE; 8376 8377 /* 8378 * Initialize parent/child shared state. 
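 * This covers the locks used across the tests and the per-pass timing:
 * each pass runs for at most zo_passtime seconds (bounded by the overall
 * stop time), and zo_killrate percent of the time an earlier, random
 * kill time is chosen for the pass.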
8379 */ 8380 mutex_init(&ztest_vdev_lock, NULL, MUTEX_DEFAULT, NULL); 8381 mutex_init(&ztest_checkpoint_lock, NULL, MUTEX_DEFAULT, NULL); 8382 VERIFY0(pthread_rwlock_init(&ztest_name_lock, NULL)); 8383 8384 zs->zs_thread_start = gethrtime(); 8385 zs->zs_thread_stop = 8386 zs->zs_thread_start + ztest_opts.zo_passtime * NANOSEC; 8387 zs->zs_thread_stop = MIN(zs->zs_thread_stop, zs->zs_proc_stop); 8388 zs->zs_thread_kill = zs->zs_thread_stop; 8389 if (ztest_random(100) < ztest_opts.zo_killrate) { 8390 zs->zs_thread_kill -= 8391 ztest_random(ztest_opts.zo_passtime * NANOSEC); 8392 } 8393 8394 mutex_init(&zcl.zcl_callbacks_lock, NULL, MUTEX_DEFAULT, NULL); 8395 8396 list_create(&zcl.zcl_callbacks, sizeof (ztest_cb_data_t), 8397 offsetof(ztest_cb_data_t, zcd_node)); 8398 8399 /* 8400 * Open our pool. It may need to be imported first depending on 8401 * what tests were running when the previous pass was terminated. 8402 */ 8403 raidz_scratch_verify(); 8404 kernel_init(SPA_MODE_READ | SPA_MODE_WRITE); 8405 error = spa_open(ztest_opts.zo_pool, &spa, FTAG); 8406 if (error) { 8407 VERIFY3S(error, ==, ENOENT); 8408 ztest_import_impl(); 8409 VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG)); 8410 zs->zs_metaslab_sz = 8411 1ULL << spa->spa_root_vdev->vdev_child[0]->vdev_ms_shift; 8412 } 8413 8414 metaslab_preload_limit = ztest_random(20) + 1; 8415 ztest_spa = spa; 8416 8417 /* 8418 * XXX - BUGBUG raidz expansion do not run this for generic for now 8419 */ 8420 if (ztest_opts.zo_raidz_expand_test != RAIDZ_EXPAND_NONE) 8421 VERIFY0(vdev_raidz_impl_set("cycle")); 8422 8423 dmu_objset_stats_t dds; 8424 VERIFY0(ztest_dmu_objset_own(ztest_opts.zo_pool, 8425 DMU_OST_ANY, B_TRUE, B_TRUE, FTAG, &os)); 8426 dsl_pool_config_enter(dmu_objset_pool(os), FTAG); 8427 dmu_objset_fast_stat(os, &dds); 8428 dsl_pool_config_exit(dmu_objset_pool(os), FTAG); 8429 dmu_objset_disown(os, B_TRUE, FTAG); 8430 8431 /* Give the dedicated raidz expansion test more grace time */ 8432 if (ztest_opts.zo_raidz_expand_test != RAIDZ_EXPAND_NONE) 8433 zfs_deadman_synctime_ms *= 2; 8434 8435 /* 8436 * Create a thread to periodically resume suspended I/O. 8437 */ 8438 resume_thread = thread_create(NULL, 0, ztest_resume_thread, 8439 spa, 0, NULL, TS_RUN | TS_JOINABLE, defclsyspri); 8440 8441 /* 8442 * Create a deadman thread and set to panic if we hang. 8443 */ 8444 deadman_thread = thread_create(NULL, 0, ztest_deadman_thread, 8445 zs, 0, NULL, TS_RUN | TS_JOINABLE, defclsyspri); 8446 8447 spa->spa_deadman_failmode = ZIO_FAILURE_MODE_PANIC; 8448 8449 /* 8450 * Verify that we can safely inquire about any object, 8451 * whether it's allocated or not. To make it interesting, 8452 * we probe a 5-wide window around each power of two. 8453 * This hits all edge cases, including zero and the max. 8454 */ 8455 for (t = 0; t < 64; t++) { 8456 for (d = -5; d <= 5; d++) { 8457 error = dmu_object_info(spa->spa_meta_objset, 8458 (1ULL << t) + d, NULL); 8459 ASSERT(error == 0 || error == ENOENT || 8460 error == EINVAL); 8461 } 8462 } 8463 8464 /* 8465 * If we got any ENOSPC errors on the previous run, destroy something. 
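 * Pick a random dataset and destroy it to free up space; the dedicated
 * raidz expansion runs are not expected to hit ENOSPC at all.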
8466 */ 8467 if (zs->zs_enospc_count != 0) { 8468 /* Not expecting ENOSPC errors during raidz expansion tests */ 8469 ASSERT3U(ztest_opts.zo_raidz_expand_test, ==, 8470 RAIDZ_EXPAND_NONE); 8471 8472 int d = ztest_random(ztest_opts.zo_datasets); 8473 ztest_dataset_destroy(d); 8474 txg_wait_synced(spa_get_dsl(spa), 0); 8475 } 8476 zs->zs_enospc_count = 0; 8477 8478 /* 8479 * If we were in the middle of ztest_device_removal() and were killed 8480 * we need to ensure the removal and scrub complete before running 8481 * any tests that check ztest_device_removal_active. The removal will 8482 * be restarted automatically when the spa is opened, but we need to 8483 * initiate the scrub manually if it is not already in progress. Note 8484 * that we always run the scrub whenever an indirect vdev exists 8485 * because we have no way of knowing for sure if ztest_device_removal() 8486 * fully completed its scrub before the pool was reimported. 8487 * 8488 * Does not apply for the RAIDZ expansion specific test runs 8489 */ 8490 if (ztest_opts.zo_raidz_expand_test == RAIDZ_EXPAND_NONE && 8491 (spa->spa_removing_phys.sr_state == DSS_SCANNING || 8492 spa->spa_removing_phys.sr_prev_indirect_vdev != -1)) { 8493 while (spa->spa_removing_phys.sr_state == DSS_SCANNING) 8494 txg_wait_synced(spa_get_dsl(spa), 0); 8495 8496 error = ztest_scrub_impl(spa); 8497 if (error == EBUSY) 8498 error = 0; 8499 ASSERT0(error); 8500 } 8501 8502 if (ztest_opts.zo_verbose >= 4) 8503 (void) printf("starting main threads...\n"); 8504 8505 /* 8506 * Replay all logs of all datasets in the pool. This is primarily for 8507 * temporary datasets which wouldn't otherwise get replayed, which 8508 * can trigger failures when attempting to offline a SLOG in 8509 * ztest_fault_inject(). 8510 */ 8511 (void) dmu_objset_find(ztest_opts.zo_pool, ztest_replay_zil_cb, 8512 NULL, DS_FIND_CHILDREN); 8513 8514 if (ztest_opts.zo_raidz_expand_test == RAIDZ_EXPAND_REQUESTED) 8515 ztest_raidz_expand_run(zs, spa); 8516 else if (ztest_opts.zo_raidz_expand_test == RAIDZ_EXPAND_KILLED) 8517 ztest_raidz_expand_check(spa); 8518 else 8519 ztest_generic_run(zs, spa); 8520 8521 /* Kill the resume and deadman threads */ 8522 ztest_exiting = B_TRUE; 8523 VERIFY0(thread_join(resume_thread)); 8524 VERIFY0(thread_join(deadman_thread)); 8525 ztest_resume(spa); 8526 8527 /* 8528 * Right before closing the pool, kick off a bunch of async I/O; 8529 * spa_close() should wait for it to complete. 8530 */ 8531 for (object = 1; object < 50; object++) { 8532 dmu_prefetch(spa->spa_meta_objset, object, 0, 0, 1ULL << 20, 8533 ZIO_PRIORITY_SYNC_READ); 8534 } 8535 8536 /* Verify that at least one commit cb was called in a timely fashion */ 8537 if (zc_cb_counter >= ZTEST_COMMIT_CB_MIN_REG) 8538 VERIFY0(zc_min_txg_delay); 8539 8540 spa_close(spa, FTAG); 8541 8542 /* 8543 * Verify that we can loop over all pools. 8544 */ 8545 spa_namespace_enter(FTAG); 8546 for (spa = spa_next(NULL); spa != NULL; spa = spa_next(spa)) 8547 if (ztest_opts.zo_verbose > 3) 8548 (void) printf("spa_next: found %s\n", spa_name(spa)); 8549 spa_namespace_exit(FTAG); 8550 8551 /* 8552 * Verify that we can export the pool and reimport it under a 8553 * different name. 
8554 */ 8555 if ((ztest_random(2) == 0) && !ztest_opts.zo_mmp_test) { 8556 char name[ZFS_MAX_DATASET_NAME_LEN]; 8557 (void) snprintf(name, sizeof (name), "%s_import", 8558 ztest_opts.zo_pool); 8559 ztest_spa_import_export(ztest_opts.zo_pool, name); 8560 ztest_spa_import_export(name, ztest_opts.zo_pool); 8561 } 8562 8563 kernel_fini(); 8564 8565 list_destroy(&zcl.zcl_callbacks); 8566 mutex_destroy(&zcl.zcl_callbacks_lock); 8567 (void) pthread_rwlock_destroy(&ztest_name_lock); 8568 mutex_destroy(&ztest_vdev_lock); 8569 mutex_destroy(&ztest_checkpoint_lock); 8570 } 8571 8572 static void 8573 print_time(hrtime_t t, char *timebuf) 8574 { 8575 hrtime_t s = t / NANOSEC; 8576 hrtime_t m = s / 60; 8577 hrtime_t h = m / 60; 8578 hrtime_t d = h / 24; 8579 8580 s -= m * 60; 8581 m -= h * 60; 8582 h -= d * 24; 8583 8584 timebuf[0] = '\0'; 8585 8586 if (d) 8587 (void) sprintf(timebuf, 8588 "%llud%02lluh%02llum%02llus", d, h, m, s); 8589 else if (h) 8590 (void) sprintf(timebuf, "%lluh%02llum%02llus", h, m, s); 8591 else if (m) 8592 (void) sprintf(timebuf, "%llum%02llus", m, s); 8593 else 8594 (void) sprintf(timebuf, "%llus", s); 8595 } 8596 8597 static nvlist_t * 8598 make_random_pool_props(void) 8599 { 8600 nvlist_t *props; 8601 8602 props = fnvlist_alloc(); 8603 8604 /* Twenty percent of the time enable ZPOOL_PROP_DEDUP_TABLE_QUOTA */ 8605 if (ztest_random(5) == 0) { 8606 fnvlist_add_uint64(props, 8607 zpool_prop_to_name(ZPOOL_PROP_DEDUP_TABLE_QUOTA), 8608 2 * 1024 * 1024); 8609 } 8610 8611 /* Fifty percent of the time enable ZPOOL_PROP_AUTOREPLACE */ 8612 if (ztest_random(2) == 0) { 8613 fnvlist_add_uint64(props, 8614 zpool_prop_to_name(ZPOOL_PROP_AUTOREPLACE), 1); 8615 } 8616 8617 return (props); 8618 } 8619 8620 /* 8621 * Create a storage pool with the given name and initial vdev size. 8622 * Then test spa_freeze() functionality. 8623 */ 8624 static void 8625 ztest_init(ztest_shared_t *zs) 8626 { 8627 spa_t *spa; 8628 nvlist_t *nvroot, *props; 8629 int i; 8630 8631 mutex_init(&ztest_vdev_lock, NULL, MUTEX_DEFAULT, NULL); 8632 mutex_init(&ztest_checkpoint_lock, NULL, MUTEX_DEFAULT, NULL); 8633 VERIFY0(pthread_rwlock_init(&ztest_name_lock, NULL)); 8634 8635 raidz_scratch_verify(); 8636 kernel_init(SPA_MODE_READ | SPA_MODE_WRITE); 8637 8638 /* 8639 * Create the storage pool. 8640 */ 8641 (void) spa_destroy(ztest_opts.zo_pool); 8642 ztest_shared->zs_vdev_next_leaf = 0; 8643 zs->zs_splits = 0; 8644 zs->zs_mirrors = ztest_opts.zo_mirrors; 8645 nvroot = make_vdev_root(NULL, NULL, NULL, ztest_opts.zo_vdev_size, 0, 8646 NULL, ztest_opts.zo_raid_children, zs->zs_mirrors, 1); 8647 props = make_random_pool_props(); 8648 8649 /* 8650 * We don't expect the pool to suspend unless maxfaults == 0, 8651 * in which case ztest_fault_inject() temporarily takes away 8652 * the only valid replica. 8653 */ 8654 fnvlist_add_uint64(props, 8655 zpool_prop_to_name(ZPOOL_PROP_FAILUREMODE), 8656 MAXFAULTS(zs) ? ZIO_FAILURE_MODE_PANIC : ZIO_FAILURE_MODE_WAIT); 8657 8658 for (i = 0; i < SPA_FEATURES; i++) { 8659 char *buf; 8660 8661 if (!spa_feature_table[i].fi_zfs_mod_supported) 8662 continue; 8663 8664 /* 8665 * 75% chance of using the log space map feature. We want ztest 8666 * to exercise both the code paths that use the log space map 8667 * feature and the ones that don't. 
8668 */ 8669 if (i == SPA_FEATURE_LOG_SPACEMAP && ztest_random(4) == 0) 8670 continue; 8671 8672 /* 8673 * split 50/50 between legacy and fast dedup 8674 */ 8675 if (i == SPA_FEATURE_FAST_DEDUP && ztest_random(2) != 0) 8676 continue; 8677 8678 VERIFY3S(-1, !=, asprintf(&buf, "feature@%s", 8679 spa_feature_table[i].fi_uname)); 8680 fnvlist_add_uint64(props, buf, 0); 8681 free(buf); 8682 } 8683 8684 VERIFY0(spa_create(ztest_opts.zo_pool, nvroot, props, NULL, NULL)); 8685 fnvlist_free(nvroot); 8686 fnvlist_free(props); 8687 8688 VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG)); 8689 zs->zs_metaslab_sz = 8690 1ULL << spa->spa_root_vdev->vdev_child[0]->vdev_ms_shift; 8691 zs->zs_guid = spa_guid(spa); 8692 spa_close(spa, FTAG); 8693 8694 kernel_fini(); 8695 8696 if (!ztest_opts.zo_mmp_test) { 8697 ztest_run_zdb(zs->zs_guid); 8698 ztest_freeze(); 8699 ztest_run_zdb(zs->zs_guid); 8700 } 8701 8702 (void) pthread_rwlock_destroy(&ztest_name_lock); 8703 mutex_destroy(&ztest_vdev_lock); 8704 mutex_destroy(&ztest_checkpoint_lock); 8705 } 8706 8707 static void 8708 setup_data_fd(void) 8709 { 8710 static char ztest_name_data[] = "/tmp/ztest.data.XXXXXX"; 8711 8712 ztest_fd_data = mkstemp(ztest_name_data); 8713 ASSERT3S(ztest_fd_data, >=, 0); 8714 (void) unlink(ztest_name_data); 8715 } 8716 8717 static int 8718 shared_data_size(ztest_shared_hdr_t *hdr) 8719 { 8720 int size; 8721 8722 size = hdr->zh_hdr_size; 8723 size += hdr->zh_opts_size; 8724 size += hdr->zh_size; 8725 size += hdr->zh_stats_size * hdr->zh_stats_count; 8726 size += hdr->zh_ds_size * hdr->zh_ds_count; 8727 size += hdr->zh_scratch_state_size; 8728 8729 return (size); 8730 } 8731 8732 static void 8733 setup_hdr(void) 8734 { 8735 int size; 8736 ztest_shared_hdr_t *hdr; 8737 8738 hdr = (void *)mmap(0, P2ROUNDUP(sizeof (*hdr), getpagesize()), 8739 PROT_READ | PROT_WRITE, MAP_SHARED, ztest_fd_data, 0); 8740 ASSERT3P(hdr, !=, MAP_FAILED); 8741 8742 VERIFY0(ftruncate(ztest_fd_data, sizeof (ztest_shared_hdr_t))); 8743 8744 hdr->zh_hdr_size = sizeof (ztest_shared_hdr_t); 8745 hdr->zh_opts_size = sizeof (ztest_shared_opts_t); 8746 hdr->zh_size = sizeof (ztest_shared_t); 8747 hdr->zh_stats_size = sizeof (ztest_shared_callstate_t); 8748 hdr->zh_stats_count = ZTEST_FUNCS; 8749 hdr->zh_ds_size = sizeof (ztest_shared_ds_t); 8750 hdr->zh_ds_count = ztest_opts.zo_datasets; 8751 hdr->zh_scratch_state_size = sizeof (ztest_shared_scratch_state_t); 8752 8753 size = shared_data_size(hdr); 8754 VERIFY0(ftruncate(ztest_fd_data, size)); 8755 8756 (void) munmap((caddr_t)hdr, P2ROUNDUP(sizeof (*hdr), getpagesize())); 8757 } 8758 8759 static void 8760 setup_data(void) 8761 { 8762 int size, offset; 8763 ztest_shared_hdr_t *hdr; 8764 uint8_t *buf; 8765 8766 hdr = (void *)mmap(0, P2ROUNDUP(sizeof (*hdr), getpagesize()), 8767 PROT_READ, MAP_SHARED, ztest_fd_data, 0); 8768 ASSERT3P(hdr, !=, MAP_FAILED); 8769 8770 size = shared_data_size(hdr); 8771 8772 (void) munmap((caddr_t)hdr, P2ROUNDUP(sizeof (*hdr), getpagesize())); 8773 hdr = ztest_shared_hdr = (void *)mmap(0, P2ROUNDUP(size, getpagesize()), 8774 PROT_READ | PROT_WRITE, MAP_SHARED, ztest_fd_data, 0); 8775 ASSERT3P(hdr, !=, MAP_FAILED); 8776 buf = (uint8_t *)hdr; 8777 8778 offset = hdr->zh_hdr_size; 8779 ztest_shared_opts = (void *)&buf[offset]; 8780 offset += hdr->zh_opts_size; 8781 ztest_shared = (void *)&buf[offset]; 8782 offset += hdr->zh_size; 8783 ztest_shared_callstate = (void *)&buf[offset]; 8784 offset += hdr->zh_stats_size * hdr->zh_stats_count; 8785 ztest_shared_ds = (void *)&buf[offset]; 8786 offset += 
hdr->zh_ds_size * hdr->zh_ds_count; 8787 ztest_scratch_state = (void *)&buf[offset]; 8788 } 8789 8790 static boolean_t 8791 exec_child(char *cmd, char *libpath, boolean_t ignorekill, int *statusp) 8792 { 8793 pid_t pid; 8794 int status; 8795 char *cmdbuf = NULL; 8796 8797 pid = fork(); 8798 8799 if (cmd == NULL) { 8800 cmdbuf = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); 8801 (void) strlcpy(cmdbuf, getexecname(), MAXPATHLEN); 8802 cmd = cmdbuf; 8803 } 8804 8805 if (pid == -1) 8806 fatal(B_TRUE, "fork failed"); 8807 8808 if (pid == 0) { /* child */ 8809 char fd_data_str[12]; 8810 8811 VERIFY3S(11, >=, 8812 snprintf(fd_data_str, 12, "%d", ztest_fd_data)); 8813 VERIFY0(setenv("ZTEST_FD_DATA", fd_data_str, 1)); 8814 8815 if (libpath != NULL) { 8816 const char *curlp = getenv("LD_LIBRARY_PATH"); 8817 if (curlp == NULL) 8818 VERIFY0(setenv("LD_LIBRARY_PATH", libpath, 1)); 8819 else { 8820 char *newlp = NULL; 8821 VERIFY3S(-1, !=, 8822 asprintf(&newlp, "%s:%s", libpath, curlp)); 8823 VERIFY0(setenv("LD_LIBRARY_PATH", newlp, 1)); 8824 free(newlp); 8825 } 8826 } 8827 (void) execl(cmd, cmd, (char *)NULL); 8828 ztest_dump_core = B_FALSE; 8829 fatal(B_TRUE, "exec failed: %s", cmd); 8830 } 8831 8832 if (cmdbuf != NULL) { 8833 umem_free(cmdbuf, MAXPATHLEN); 8834 cmd = NULL; 8835 } 8836 8837 while (waitpid(pid, &status, 0) != pid) 8838 continue; 8839 if (statusp != NULL) 8840 *statusp = status; 8841 8842 if (WIFEXITED(status)) { 8843 if (WEXITSTATUS(status) != 0) { 8844 (void) fprintf(stderr, "child exited with code %d\n", 8845 WEXITSTATUS(status)); 8846 exit(2); 8847 } 8848 return (B_FALSE); 8849 } else if (WIFSIGNALED(status)) { 8850 if (!ignorekill || WTERMSIG(status) != SIGKILL) { 8851 (void) fprintf(stderr, "child died with signal %d\n", 8852 WTERMSIG(status)); 8853 exit(3); 8854 } 8855 return (B_TRUE); 8856 } else { 8857 (void) fprintf(stderr, "something strange happened to child\n"); 8858 exit(4); 8859 } 8860 } 8861 8862 static void 8863 ztest_run_init(void) 8864 { 8865 int i; 8866 8867 ztest_shared_t *zs = ztest_shared; 8868 8869 /* 8870 * Blow away any existing copy of zpool.cache 8871 */ 8872 (void) remove(spa_config_path); 8873 8874 if (ztest_opts.zo_init == 0) { 8875 if (ztest_opts.zo_verbose >= 1) 8876 (void) printf("Importing pool %s\n", 8877 ztest_opts.zo_pool); 8878 ztest_import(zs); 8879 return; 8880 } 8881 8882 /* 8883 * Create and initialize our storage pool. 8884 */ 8885 for (i = 1; i <= ztest_opts.zo_init; i++) { 8886 memset(zs, 0, sizeof (*zs)); 8887 if (ztest_opts.zo_verbose >= 3 && 8888 ztest_opts.zo_init != 1) { 8889 (void) printf("ztest_init(), pass %d\n", i); 8890 } 8891 ztest_init(zs); 8892 } 8893 } 8894 8895 int 8896 main(int argc, char **argv) 8897 { 8898 int kills = 0; 8899 int iters = 0; 8900 int older = 0; 8901 int newer = 0; 8902 ztest_shared_t *zs; 8903 ztest_info_t *zi; 8904 ztest_shared_callstate_t *zc; 8905 char timebuf[100]; 8906 char numbuf[NN_NUMBUF_SZ]; 8907 char *cmd; 8908 boolean_t hasalt; 8909 int f, err; 8910 char *fd_data_str = getenv("ZTEST_FD_DATA"); 8911 struct sigaction action; 8912 8913 (void) setvbuf(stdout, NULL, _IOLBF, 0); 8914 8915 dprintf_setup(&argc, argv); 8916 zfs_deadman_synctime_ms = 300000; 8917 zfs_deadman_checktime_ms = 30000; 8918 /* 8919 * As two-word space map entries may not come up often (especially 8920 * if pool and vdev sizes are small) we want to force at least some 8921 * of them so the feature get tested. 
8922 */ 8923 zfs_force_some_double_word_sm_entries = B_TRUE; 8924 8925 /* 8926 * Verify that even extensively damaged split blocks with many 8927 * segments can be reconstructed in a reasonable amount of time 8928 * when reconstruction is known to be possible. 8929 * 8930 * Note: the lower this value is, the more damage we inflict, and 8931 * the more time ztest spends in recovering that damage. We chose 8932 * to induce damage 1/100th of the time so recovery is tested but 8933 * not so frequently that ztest doesn't get to test other code paths. 8934 */ 8935 zfs_reconstruct_indirect_damage_fraction = 100; 8936 8937 action.sa_handler = sig_handler; 8938 sigemptyset(&action.sa_mask); 8939 action.sa_flags = 0; 8940 8941 if (sigaction(SIGSEGV, &action, NULL) < 0) { 8942 (void) fprintf(stderr, "ztest: cannot catch SIGSEGV: %s.\n", 8943 strerror(errno)); 8944 exit(EXIT_FAILURE); 8945 } 8946 8947 if (sigaction(SIGABRT, &action, NULL) < 0) { 8948 (void) fprintf(stderr, "ztest: cannot catch SIGABRT: %s.\n", 8949 strerror(errno)); 8950 exit(EXIT_FAILURE); 8951 } 8952 8953 libspl_init(); 8954 8955 /* 8956 * Force random_get_bytes() to use /dev/urandom in order to prevent 8957 * ztest from needlessly depleting the system entropy pool. 8958 */ 8959 random_force_pseudo(B_TRUE); 8960 8961 if (!fd_data_str) { 8962 process_options(argc, argv); 8963 8964 setup_data_fd(); 8965 setup_hdr(); 8966 setup_data(); 8967 memcpy(ztest_shared_opts, &ztest_opts, 8968 sizeof (*ztest_shared_opts)); 8969 } else { 8970 ztest_fd_data = atoi(fd_data_str); 8971 setup_data(); 8972 memcpy(&ztest_opts, ztest_shared_opts, sizeof (ztest_opts)); 8973 } 8974 ASSERT3U(ztest_opts.zo_datasets, ==, ztest_shared_hdr->zh_ds_count); 8975 8976 err = ztest_set_global_vars(); 8977 if (err != 0 && !fd_data_str) { 8978 /* error message done by ztest_set_global_vars */ 8979 exit(EXIT_FAILURE); 8980 } else { 8981 /* children should not be spawned if setting gvars fails */ 8982 VERIFY0(err); 8983 } 8984 8985 /* Override location of zpool.cache */ 8986 VERIFY3S(asprintf((char **)&spa_config_path, "%s/zpool.cache", 8987 ztest_opts.zo_dir), !=, -1); 8988 8989 ztest_ds = umem_alloc(ztest_opts.zo_datasets * sizeof (ztest_ds_t), 8990 UMEM_NOFAIL); 8991 zs = ztest_shared; 8992 8993 if (fd_data_str) { 8994 metaslab_force_ganging = ztest_opts.zo_metaslab_force_ganging; 8995 metaslab_df_alloc_threshold = 8996 zs->zs_metaslab_df_alloc_threshold; 8997 8998 if (zs->zs_do_init) 8999 ztest_run_init(); 9000 else 9001 ztest_run(zs); 9002 exit(0); 9003 } 9004 9005 hasalt = (strlen(ztest_opts.zo_alt_ztest) != 0); 9006 9007 if (ztest_opts.zo_verbose >= 1) { 9008 (void) printf("%"PRIu64" vdevs, %d datasets, %d threads, " 9009 "%d %s disks, parity %d, %"PRIu64" seconds...\n\n", 9010 ztest_opts.zo_vdevs, 9011 ztest_opts.zo_datasets, 9012 ztest_opts.zo_threads, 9013 ztest_opts.zo_raid_children, 9014 ztest_opts.zo_raid_type, 9015 ztest_opts.zo_raid_parity, 9016 ztest_opts.zo_time); 9017 } 9018 9019 cmd = umem_alloc(MAXNAMELEN, UMEM_NOFAIL); 9020 (void) strlcpy(cmd, getexecname(), MAXNAMELEN); 9021 9022 zs->zs_do_init = B_TRUE; 9023 if (strlen(ztest_opts.zo_alt_ztest) != 0) { 9024 if (ztest_opts.zo_verbose >= 1) { 9025 (void) printf("Executing older ztest for " 9026 "initialization: %s\n", ztest_opts.zo_alt_ztest); 9027 } 9028 VERIFY(!exec_child(ztest_opts.zo_alt_ztest, 9029 ztest_opts.zo_alt_libpath, B_FALSE, NULL)); 9030 } else { 9031 VERIFY(!exec_child(NULL, NULL, B_FALSE, NULL)); 9032 } 9033 zs->zs_do_init = B_FALSE; 9034 9035 zs->zs_proc_start = gethrtime(); 9036 
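/* The overall run is limited to zo_time seconds from this point. */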
zs->zs_proc_stop = zs->zs_proc_start + ztest_opts.zo_time * NANOSEC; 9037 9038 for (f = 0; f < ZTEST_FUNCS; f++) { 9039 zi = &ztest_info[f]; 9040 zc = ZTEST_GET_SHARED_CALLSTATE(f); 9041 if (zs->zs_proc_start + zi->zi_interval[0] > zs->zs_proc_stop) 9042 zc->zc_next = UINT64_MAX; 9043 else 9044 zc->zc_next = zs->zs_proc_start + 9045 ztest_random(2 * zi->zi_interval[0] + 1); 9046 } 9047 9048 /* 9049 * Run the tests in a loop. These tests include fault injection 9050 * to verify that self-healing data works, and forced crashes 9051 * to verify that we never lose on-disk consistency. 9052 */ 9053 while (gethrtime() < zs->zs_proc_stop) { 9054 int status; 9055 boolean_t killed; 9056 9057 /* 9058 * Initialize the workload counters for each function. 9059 */ 9060 for (f = 0; f < ZTEST_FUNCS; f++) { 9061 zc = ZTEST_GET_SHARED_CALLSTATE(f); 9062 zc->zc_count = 0; 9063 zc->zc_time = 0; 9064 } 9065 9066 /* Set the allocation switch size */ 9067 zs->zs_metaslab_df_alloc_threshold = 9068 ztest_random(zs->zs_metaslab_sz / 4) + 1; 9069 9070 if (!hasalt || ztest_random(2) == 0) { 9071 if (hasalt && ztest_opts.zo_verbose >= 1) { 9072 (void) printf("Executing newer ztest: %s\n", 9073 cmd); 9074 } 9075 newer++; 9076 killed = exec_child(cmd, NULL, B_TRUE, &status); 9077 } else { 9078 if (hasalt && ztest_opts.zo_verbose >= 1) { 9079 (void) printf("Executing older ztest: %s\n", 9080 ztest_opts.zo_alt_ztest); 9081 } 9082 older++; 9083 killed = exec_child(ztest_opts.zo_alt_ztest, 9084 ztest_opts.zo_alt_libpath, B_TRUE, &status); 9085 } 9086 9087 if (killed) 9088 kills++; 9089 iters++; 9090 9091 if (ztest_opts.zo_verbose >= 1) { 9092 hrtime_t now = gethrtime(); 9093 9094 now = MIN(now, zs->zs_proc_stop); 9095 print_time(zs->zs_proc_stop - now, timebuf); 9096 nicenum(zs->zs_space, numbuf, sizeof (numbuf)); 9097 9098 (void) printf("Pass %3d, %8s, %3"PRIu64" ENOSPC, " 9099 "%4.1f%% of %5s used, %3.0f%% done, %8s to go\n", 9100 iters, 9101 WIFEXITED(status) ? "Complete" : "SIGKILL", 9102 zs->zs_enospc_count, 9103 100.0 * zs->zs_alloc / zs->zs_space, 9104 numbuf, 9105 100.0 * (now - zs->zs_proc_start) / 9106 (ztest_opts.zo_time * NANOSEC), timebuf); 9107 } 9108 9109 if (ztest_opts.zo_verbose >= 2) { 9110 (void) printf("\nWorkload summary:\n\n"); 9111 (void) printf("%7s %9s %s\n", 9112 "Calls", "Time", "Function"); 9113 (void) printf("%7s %9s %s\n", 9114 "-----", "----", "--------"); 9115 for (f = 0; f < ZTEST_FUNCS; f++) { 9116 zi = &ztest_info[f]; 9117 zc = ZTEST_GET_SHARED_CALLSTATE(f); 9118 print_time(zc->zc_time, timebuf); 9119 (void) printf("%7"PRIu64" %9s %s\n", 9120 zc->zc_count, timebuf, 9121 zi->zi_funcname); 9122 } 9123 (void) printf("\n"); 9124 } 9125 9126 if (!ztest_opts.zo_mmp_test) 9127 ztest_run_zdb(zs->zs_guid); 9128 if (ztest_shared_opts->zo_raidz_expand_test == 9129 RAIDZ_EXPAND_CHECKED) 9130 break; /* raidz expand test complete */ 9131 } 9132 9133 if (ztest_opts.zo_verbose >= 1) { 9134 if (hasalt) { 9135 (void) printf("%d runs of older ztest: %s\n", older, 9136 ztest_opts.zo_alt_ztest); 9137 (void) printf("%d runs of newer ztest: %s\n", newer, 9138 cmd); 9139 } 9140 (void) printf("%d killed, %d completed, %.0f%% kill rate\n", 9141 kills, iters - kills, (100.0 * kills) / MAX(1, iters)); 9142 } 9143 9144 umem_free(cmd, MAXNAMELEN); 9145 9146 return (0); 9147 } 9148