// SPDX-License-Identifier: CDDL-1.0
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2011, 2024 by Delphix. All rights reserved.
 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
 * Copyright (c) 2013 Steven Hartland. All rights reserved.
 * Copyright (c) 2014 Integros [integros.com]
 * Copyright 2017 Joyent, Inc.
 * Copyright (c) 2017, Intel Corporation.
 * Copyright (c) 2023, Klara, Inc.
 */

/*
 * The objective of this program is to provide a DMU/ZAP/SPA stress test
 * that runs entirely in userland, is easy to use, and easy to extend.
 *
 * The overall design of the ztest program is as follows:
 *
 * (1) For each major functional area (e.g. adding vdevs to a pool,
 *     creating and destroying datasets, reading and writing objects, etc)
 *     we have a simple routine to test that functionality.  These
 *     individual routines do not have to do anything "stressful".
 *
 * (2) We turn these simple functionality tests into a stress test by
 *     running them all in parallel, with as many threads as desired,
 *     and spread across as many datasets, objects, and vdevs as desired.
 *
 * (3) While all this is happening, we inject faults into the pool to
 *     verify that self-healing data really works.
 *
 * (4) Every time we open a dataset, we change its checksum and compression
 *     functions.  Thus even individual objects vary from block to block
 *     in which checksum they use and whether they're compressed.
 *
 * (5) To verify that we never lose on-disk consistency after a crash,
 *     we run the entire test in a child of the main process.
 *     At random times, the child self-immolates with a SIGKILL.
 *     This is the software equivalent of pulling the power cord.
 *     The parent then runs the test again, using the existing
 *     storage pool, as many times as desired.  If backwards compatibility
 *     testing is enabled, ztest will sometimes run the "older" version
 *     of ztest after a SIGKILL.
 *
 * (6) To verify that we don't have future leaks or temporal incursions,
 *     many of the functional tests record the transaction group number
 *     as part of their data.  When reading old data, they verify that
 *     the transaction group number is less than the current, open txg.
 *     If you add a new test, please do this if applicable.
 *
 * (7) Threads are created with a reduced stack size, for sanity checking.
 *     Therefore, it's important not to allocate huge buffers on the stack.
 *
 * When run with no arguments, ztest runs for about five minutes and
 * produces no output if successful.  To get a little bit of information,
 * specify -V.  To get more information, specify -VV, and so on.
 *
 * To turn this into an overnight stress test, use -T to specify run time.
 *
 * You can ask for more vdevs [-v], datasets [-d], or threads [-t]
 * to increase the pool capacity, fanout, and overall stress level.
 *
 * Use the -k option to set the desired frequency of kills.
 *
 * When ztest invokes itself it passes all relevant information through a
 * temporary file which is mmap-ed in the child process.  This allows shared
 * memory to survive the exec syscall.  The ztest_shared_hdr_t struct is always
 * stored at offset 0 of this file and contains information on the size and
 * number of shared structures in the file.  The information stored in this
 * file must remain backwards compatible with older versions of ztest so that
 * ztest can invoke them during backwards compatibility testing (-B).
 */
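
/*
 * For example (illustrative only; the flag values below are arbitrary and
 * the directory is assumed to already exist), a verbose one-hour run with
 * 32 threads and 7 datasets, keeping the backing vdev files under
 * /var/tmp/ztest, could be requested with:
 *
 *	ztest -VV -T 3600 -t 32 -d 7 -f /var/tmp/ztest
 *
 * See option_table[] below for the complete set of flags and their defaults.
 */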

#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/dmu.h>
#include <sys/txg.h>
#include <sys/dbuf.h>
#include <sys/zap.h>
#include <sys/dmu_objset.h>
#include <sys/poll.h>
#include <sys/stat.h>
#include <sys/time.h>
#include <sys/wait.h>
#include <sys/mman.h>
#include <sys/resource.h>
#include <sys/zio.h>
#include <sys/zil.h>
#include <sys/zil_impl.h>
#include <sys/vdev_draid.h>
#include <sys/vdev_impl.h>
#include <sys/vdev_file.h>
#include <sys/vdev_initialize.h>
#include <sys/vdev_raidz.h>
#include <sys/vdev_trim.h>
#include <sys/spa_impl.h>
#include <sys/metaslab_impl.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_destroy.h>
#include <sys/dsl_scan.h>
#include <sys/zio_checksum.h>
#include <sys/zfs_refcount.h>
#include <sys/zfeature.h>
#include <sys/dsl_userhold.h>
#include <sys/abd.h>
#include <sys/blake3.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <getopt.h>
#include <signal.h>
#include <umem.h>
#include <ctype.h>
#include <math.h>
#include <sys/fs/zfs.h>
#include <zfs_fletcher.h>
#include <libnvpair.h>
#include <libzutil.h>
#include <sys/crypto/icp.h>
#include <sys/zfs_impl.h>
#include <sys/backtrace.h>

static int ztest_fd_data = -1;
static int ztest_fd_rand = -1;

typedef struct ztest_shared_hdr {
	uint64_t	zh_hdr_size;
	uint64_t	zh_opts_size;
	uint64_t	zh_size;
	uint64_t	zh_stats_size;
	uint64_t	zh_stats_count;
	uint64_t	zh_ds_size;
	uint64_t	zh_ds_count;
	uint64_t	zh_scratch_state_size;
} ztest_shared_hdr_t;

static ztest_shared_hdr_t *ztest_shared_hdr;

enum ztest_class_state {
	ZTEST_VDEV_CLASS_OFF,
	ZTEST_VDEV_CLASS_ON,
	ZTEST_VDEV_CLASS_RND
};

/* Dedicated RAIDZ Expansion test states */
typedef enum {
	RAIDZ_EXPAND_NONE,		/* Default is none, must opt-in */
	RAIDZ_EXPAND_REQUESTED,		/* The '-X' option was used */
	RAIDZ_EXPAND_STARTED,		/* Testing has commenced */
	RAIDZ_EXPAND_KILLED,		/* Reached the process kill */
	RAIDZ_EXPAND_CHECKED,		/* Pool scrub verification done */
} raidz_expand_test_state_t;

#define	ZO_GVARS_MAX_ARGLEN	((size_t)64)
#define	ZO_GVARS_MAX_COUNT	((size_t)10)

typedef struct ztest_shared_opts {
	char zo_pool[ZFS_MAX_DATASET_NAME_LEN];
	char zo_dir[ZFS_MAX_DATASET_NAME_LEN];
	char zo_alt_ztest[MAXNAMELEN];
	char zo_alt_libpath[MAXNAMELEN];
	uint64_t zo_vdevs;
	uint64_t zo_vdevtime;
	size_t zo_vdev_size;
	int zo_ashift;
	int zo_mirrors;
	int zo_raid_do_expand;
	int zo_raid_children;
	int zo_raid_parity;
	char zo_raid_type[8];
	int zo_draid_data;
	int zo_draid_spares;
	int zo_datasets;
	int zo_threads;
	uint64_t zo_passtime;
	uint64_t zo_killrate;
	int zo_verbose;
	int zo_init;
	uint64_t zo_time;
	uint64_t zo_maxloops;
	uint64_t zo_metaslab_force_ganging;
	raidz_expand_test_state_t zo_raidz_expand_test;
	int zo_mmp_test;
	int zo_special_vdevs;
	int zo_dump_dbgmsg;
	int zo_gvars_count;
	char zo_gvars[ZO_GVARS_MAX_COUNT][ZO_GVARS_MAX_ARGLEN];
} ztest_shared_opts_t;

/* Default values for command line options. */
#define	DEFAULT_POOL "ztest"
#define	DEFAULT_VDEV_DIR "/tmp"
#define	DEFAULT_VDEV_COUNT 5
#define	DEFAULT_VDEV_SIZE (SPA_MINDEVSIZE * 4)	/* 256m default size */
#define	DEFAULT_VDEV_SIZE_STR "256M"
#define	DEFAULT_ASHIFT SPA_MINBLOCKSHIFT
#define	DEFAULT_MIRRORS 2
#define	DEFAULT_RAID_CHILDREN 4
#define	DEFAULT_RAID_PARITY 1
#define	DEFAULT_DRAID_DATA 4
#define	DEFAULT_DRAID_SPARES 1
#define	DEFAULT_DATASETS_COUNT 7
#define	DEFAULT_THREADS 23
#define	DEFAULT_RUN_TIME 300		/* 300 seconds */
#define	DEFAULT_RUN_TIME_STR "300 sec"
#define	DEFAULT_PASS_TIME 60		/* 60 seconds */
#define	DEFAULT_PASS_TIME_STR "60 sec"
#define	DEFAULT_KILL_RATE 70		/* 70% kill rate */
#define	DEFAULT_KILLRATE_STR "70%"
#define	DEFAULT_INITS 1
#define	DEFAULT_MAX_LOOPS 50		/* 5 minutes */
#define	DEFAULT_FORCE_GANGING (64 << 10)
#define	DEFAULT_FORCE_GANGING_STR "64K"

/* Simplifying assumption: -1 is not a valid default. */
#define	NO_DEFAULT -1

static const ztest_shared_opts_t ztest_opts_defaults = {
	.zo_pool = DEFAULT_POOL,
	.zo_dir = DEFAULT_VDEV_DIR,
	.zo_alt_ztest = { '\0' },
	.zo_alt_libpath = { '\0' },
	.zo_vdevs = DEFAULT_VDEV_COUNT,
	.zo_ashift = DEFAULT_ASHIFT,
	.zo_mirrors = DEFAULT_MIRRORS,
	.zo_raid_children = DEFAULT_RAID_CHILDREN,
	.zo_raid_parity = DEFAULT_RAID_PARITY,
	.zo_raid_type = VDEV_TYPE_RAIDZ,
	.zo_vdev_size = DEFAULT_VDEV_SIZE,
	.zo_draid_data = DEFAULT_DRAID_DATA,		/* data drives */
	.zo_draid_spares = DEFAULT_DRAID_SPARES,	/* distributed spares */
	.zo_datasets = DEFAULT_DATASETS_COUNT,
	.zo_threads = DEFAULT_THREADS,
	.zo_passtime = DEFAULT_PASS_TIME,
	.zo_killrate = DEFAULT_KILL_RATE,
	.zo_verbose = 0,
	.zo_mmp_test = 0,
	.zo_init = DEFAULT_INITS,
	.zo_time = DEFAULT_RUN_TIME,
	.zo_maxloops = DEFAULT_MAX_LOOPS,	/* max loops during spa_freeze() */
	.zo_metaslab_force_ganging = DEFAULT_FORCE_GANGING,
	.zo_special_vdevs = ZTEST_VDEV_CLASS_RND,
	.zo_gvars_count = 0,
	.zo_raidz_expand_test = RAIDZ_EXPAND_NONE,
};

extern uint64_t metaslab_force_ganging;
extern uint64_t metaslab_df_alloc_threshold;
extern uint64_t zfs_deadman_synctime_ms;
extern uint_t metaslab_preload_limit;
extern int zfs_compressed_arc_enabled;
extern int zfs_abd_scatter_enabled;
extern uint_t dmu_object_alloc_chunk_shift;
extern boolean_t zfs_force_some_double_word_sm_entries;
extern unsigned long zio_decompress_fail_fraction;
extern unsigned long zfs_reconstruct_indirect_damage_fraction;
extern uint64_t raidz_expand_max_reflow_bytes;
extern uint_t raidz_expand_pause_point;
extern boolean_t ddt_prune_artificial_age;
extern boolean_t ddt_dump_prune_histogram;

static ztest_shared_opts_t *ztest_shared_opts;
static ztest_shared_opts_t ztest_opts;
static const char *const ztest_wkeydata = "abcdefghijklmnopqrstuvwxyz012345";

typedef struct ztest_shared_ds {
	uint64_t	zd_seq;
} ztest_shared_ds_t;

static ztest_shared_ds_t *ztest_shared_ds;
#define	ZTEST_GET_SHARED_DS(d) (&ztest_shared_ds[d])

typedef struct ztest_scratch_state {
	uint64_t	zs_raidz_scratch_verify_pause;
} ztest_shared_scratch_state_t;

static ztest_shared_scratch_state_t *ztest_scratch_state;

#define	BT_MAGIC	0x123456789abcdefULL
#define	MAXFAULTS(zs) \
	(MAX((zs)->zs_mirrors, 1) * (ztest_opts.zo_raid_parity + 1) - 1)

enum ztest_io_type {
	ZTEST_IO_WRITE_TAG,
	ZTEST_IO_WRITE_PATTERN,
	ZTEST_IO_WRITE_ZEROES,
	ZTEST_IO_TRUNCATE,
	ZTEST_IO_SETATTR,
	ZTEST_IO_REWRITE,
	ZTEST_IO_TYPES
};

typedef struct ztest_block_tag {
	uint64_t	bt_magic;
	uint64_t	bt_objset;
	uint64_t	bt_object;
	uint64_t	bt_dnodesize;
	uint64_t	bt_offset;
	uint64_t	bt_gen;
	uint64_t	bt_txg;
	uint64_t	bt_crtxg;
} ztest_block_tag_t;

typedef struct bufwad {
	uint64_t	bw_index;
	uint64_t	bw_txg;
	uint64_t	bw_data;
} bufwad_t;

/*
 * It would be better to use a rangelock_t per object.  Unfortunately
 * the rangelock_t is not a drop-in replacement for rl_t, because we
 * still need to map from object ID to rangelock_t.
 */
typedef enum {
	ZTRL_READER,
	ZTRL_WRITER,
	ZTRL_APPEND
} rl_type_t;

typedef struct rll {
	void		*rll_writer;
	int		rll_readers;
	kmutex_t	rll_lock;
	kcondvar_t	rll_cv;
} rll_t;

typedef struct rl {
	uint64_t	rl_object;
	uint64_t	rl_offset;
	uint64_t	rl_size;
	rll_t		*rl_lock;
} rl_t;

#define	ZTEST_RANGE_LOCKS	64
#define	ZTEST_OBJECT_LOCKS	64

/*
 * Object descriptor.  Used as a template for object lookup/create/remove.
 */
typedef struct ztest_od {
	uint64_t	od_dir;
	uint64_t	od_object;
	dmu_object_type_t od_type;
	dmu_object_type_t od_crtype;
	uint64_t	od_blocksize;
	uint64_t	od_crblocksize;
	uint64_t	od_crdnodesize;
	uint64_t	od_gen;
	uint64_t	od_crgen;
	char		od_name[ZFS_MAX_DATASET_NAME_LEN];
} ztest_od_t;

/*
 * Per-dataset state.
 */
typedef struct ztest_ds {
	ztest_shared_ds_t *zd_shared;
	objset_t	*zd_os;
	pthread_rwlock_t zd_zilog_lock;
	zilog_t		*zd_zilog;
	ztest_od_t	*zd_od;		/* debugging aid */
	char		zd_name[ZFS_MAX_DATASET_NAME_LEN];
	kmutex_t	zd_dirobj_lock;
	rll_t		zd_object_lock[ZTEST_OBJECT_LOCKS];
	rll_t		zd_range_lock[ZTEST_RANGE_LOCKS];
} ztest_ds_t;

/*
 * Per-iteration state.
 */
typedef void ztest_func_t(ztest_ds_t *zd, uint64_t id);

typedef struct ztest_info {
	ztest_func_t	*zi_func;	/* test function */
	uint64_t	zi_iters;	/* iterations per execution */
	uint64_t	*zi_interval;	/* execute every <interval> seconds */
	const char	*zi_funcname;	/* name of test function */
} ztest_info_t;

typedef struct ztest_shared_callstate {
	uint64_t	zc_count;	/* per-pass count */
	uint64_t	zc_time;	/* per-pass time */
	uint64_t	zc_next;	/* next time to call this function */
} ztest_shared_callstate_t;

static ztest_shared_callstate_t *ztest_shared_callstate;
#define	ZTEST_GET_SHARED_CALLSTATE(c) (&ztest_shared_callstate[c])

ztest_func_t ztest_dmu_read_write;
ztest_func_t ztest_dmu_write_parallel;
ztest_func_t ztest_dmu_object_alloc_free;
ztest_func_t ztest_dmu_object_next_chunk;
ztest_func_t ztest_dmu_commit_callbacks;
ztest_func_t ztest_zap;
ztest_func_t ztest_zap_parallel;
ztest_func_t ztest_zil_commit;
ztest_func_t ztest_zil_remount;
ztest_func_t ztest_dmu_read_write_zcopy;
ztest_func_t ztest_dmu_objset_create_destroy;
ztest_func_t ztest_dmu_prealloc;
ztest_func_t ztest_fzap;
ztest_func_t ztest_dmu_snapshot_create_destroy;
ztest_func_t ztest_dsl_prop_get_set;
ztest_func_t ztest_spa_prop_get_set;
ztest_func_t ztest_spa_create_destroy;
ztest_func_t ztest_fault_inject;
ztest_func_t ztest_dmu_snapshot_hold;
ztest_func_t ztest_mmp_enable_disable;
ztest_func_t ztest_scrub;
ztest_func_t ztest_dsl_dataset_promote_busy;
ztest_func_t ztest_vdev_attach_detach;
ztest_func_t ztest_vdev_raidz_attach;
ztest_func_t ztest_vdev_LUN_growth;
ztest_func_t ztest_vdev_add_remove;
ztest_func_t ztest_vdev_class_add;
ztest_func_t ztest_vdev_aux_add_remove;
ztest_func_t ztest_split_pool;
ztest_func_t ztest_reguid;
ztest_func_t ztest_spa_upgrade;
ztest_func_t ztest_device_removal;
ztest_func_t ztest_spa_checkpoint_create_discard;
ztest_func_t ztest_initialize;
ztest_func_t ztest_trim;
ztest_func_t ztest_blake3;
ztest_func_t ztest_fletcher;
ztest_func_t ztest_fletcher_incr;
ztest_func_t ztest_verify_dnode_bt;
ztest_func_t ztest_pool_prefetch_ddt;
ztest_func_t ztest_ddt_prune;

static uint64_t zopt_always = 0ULL * NANOSEC;		/* all the time */
static uint64_t zopt_incessant = 1ULL * NANOSEC / 10;	/* every 1/10 second */
static uint64_t zopt_often = 1ULL * NANOSEC;		/* every second */
static uint64_t zopt_sometimes = 10ULL * NANOSEC;	/* every 10 seconds */
static uint64_t zopt_rarely = 60ULL * NANOSEC;		/* every 60 seconds */

#define	ZTI_INIT(func, iters, interval) \
	{   .zi_func = (func), \
	    .zi_iters = (iters), \
	    .zi_interval = (interval), \
	    .zi_funcname = # func }

static ztest_info_t ztest_info[] = {
	ZTI_INIT(ztest_dmu_read_write, 1, &zopt_always),
	ZTI_INIT(ztest_dmu_write_parallel, 10, &zopt_always),
	ZTI_INIT(ztest_dmu_object_alloc_free, 1, &zopt_always),
	ZTI_INIT(ztest_dmu_object_next_chunk, 1, &zopt_sometimes),
	ZTI_INIT(ztest_dmu_commit_callbacks, 1, &zopt_always),
	ZTI_INIT(ztest_zap, 30, &zopt_always),
	ZTI_INIT(ztest_zap_parallel, 100, &zopt_always),
	ZTI_INIT(ztest_split_pool, 1, &zopt_sometimes),
	ZTI_INIT(ztest_zil_commit, 1, &zopt_incessant),
	ZTI_INIT(ztest_zil_remount, 1, &zopt_sometimes),
	ZTI_INIT(ztest_dmu_read_write_zcopy, 1, &zopt_often),
	ZTI_INIT(ztest_dmu_objset_create_destroy, 1, &zopt_often),
	ZTI_INIT(ztest_dsl_prop_get_set, 1, &zopt_often),
	ZTI_INIT(ztest_spa_prop_get_set, 1, &zopt_sometimes),
#if 0
	ZTI_INIT(ztest_dmu_prealloc, 1, &zopt_sometimes),
#endif
	ZTI_INIT(ztest_fzap, 1, &zopt_sometimes),
	ZTI_INIT(ztest_dmu_snapshot_create_destroy, 1, &zopt_sometimes),
	ZTI_INIT(ztest_spa_create_destroy, 1, &zopt_sometimes),
	ZTI_INIT(ztest_fault_inject, 1, &zopt_sometimes),
	ZTI_INIT(ztest_dmu_snapshot_hold, 1, &zopt_sometimes),
	ZTI_INIT(ztest_mmp_enable_disable, 1, &zopt_sometimes),
	ZTI_INIT(ztest_reguid, 1, &zopt_rarely),
	ZTI_INIT(ztest_scrub, 1, &zopt_rarely),
	ZTI_INIT(ztest_spa_upgrade, 1, &zopt_rarely),
	ZTI_INIT(ztest_dsl_dataset_promote_busy, 1, &zopt_rarely),
	ZTI_INIT(ztest_vdev_attach_detach, 1, &zopt_sometimes),
	ZTI_INIT(ztest_vdev_raidz_attach, 1, &zopt_sometimes),
	ZTI_INIT(ztest_vdev_LUN_growth, 1, &zopt_rarely),
	ZTI_INIT(ztest_vdev_add_remove, 1, &ztest_opts.zo_vdevtime),
	ZTI_INIT(ztest_vdev_class_add, 1, &ztest_opts.zo_vdevtime),
	ZTI_INIT(ztest_vdev_aux_add_remove, 1, &ztest_opts.zo_vdevtime),
	ZTI_INIT(ztest_device_removal, 1, &zopt_sometimes),
	ZTI_INIT(ztest_spa_checkpoint_create_discard, 1, &zopt_rarely),
	ZTI_INIT(ztest_initialize, 1, &zopt_sometimes),
	ZTI_INIT(ztest_trim, 1, &zopt_sometimes),
	ZTI_INIT(ztest_blake3, 1, &zopt_rarely),
	ZTI_INIT(ztest_fletcher, 1, &zopt_rarely),
	ZTI_INIT(ztest_fletcher_incr, 1, &zopt_rarely),
	ZTI_INIT(ztest_verify_dnode_bt, 1, &zopt_sometimes),
	ZTI_INIT(ztest_pool_prefetch_ddt, 1, &zopt_rarely),
	ZTI_INIT(ztest_ddt_prune, 1, &zopt_rarely),
};

#define	ZTEST_FUNCS	(sizeof (ztest_info) / sizeof (ztest_info_t))

/*
 * The following struct is used to hold a list of uncalled commit callbacks.
 * The callbacks are ordered by txg number.
 */
typedef struct ztest_cb_list {
	kmutex_t	zcl_callbacks_lock;
	list_t		zcl_callbacks;
} ztest_cb_list_t;

/*
 * Stuff we need to share writably between parent and child.
 */
typedef struct ztest_shared {
	boolean_t	zs_do_init;
	hrtime_t	zs_proc_start;
	hrtime_t	zs_proc_stop;
	hrtime_t	zs_thread_start;
	hrtime_t	zs_thread_stop;
	hrtime_t	zs_thread_kill;
	uint64_t	zs_enospc_count;
	uint64_t	zs_vdev_next_leaf;
	uint64_t	zs_vdev_aux;
	uint64_t	zs_alloc;
	uint64_t	zs_space;
	uint64_t	zs_splits;
	uint64_t	zs_mirrors;
	uint64_t	zs_metaslab_sz;
	uint64_t	zs_metaslab_df_alloc_threshold;
	uint64_t	zs_guid;
} ztest_shared_t;

#define	ID_PARALLEL	-1ULL

static char ztest_dev_template[] = "%s/%s.%llua";
static char ztest_aux_template[] = "%s/%s.%s.%llu";
static ztest_shared_t *ztest_shared;

static spa_t *ztest_spa = NULL;
static ztest_ds_t *ztest_ds;

static kmutex_t ztest_vdev_lock;
static boolean_t ztest_device_removal_active = B_FALSE;
static boolean_t ztest_pool_scrubbed = B_FALSE;
static kmutex_t ztest_checkpoint_lock;

/*
 * The ztest_name_lock protects the pool and dataset namespace used by
 * the individual tests.  To modify the namespace, consumers must grab
 * this lock as writer.  Grabbing the lock as reader will ensure that the
 * namespace does not change while the lock is held.
 */
static pthread_rwlock_t ztest_name_lock;

static boolean_t ztest_dump_core = B_TRUE;
static boolean_t ztest_exiting;

/* Global commit callback list */
static ztest_cb_list_t zcl;
/* Commit cb delay */
static uint64_t zc_min_txg_delay = UINT64_MAX;
static int zc_cb_counter = 0;

/*
 * Minimum number of commit callbacks that need to be registered for us to
 * check whether the minimum txg delay is acceptable.
 */
#define	ZTEST_COMMIT_CB_MIN_REG	100

/*
 * If a number of txgs equal to this threshold have been created after a commit
 * callback has been registered but not called, then we assume there is an
 * implementation bug.
 */
#define	ZTEST_COMMIT_CB_THRESH	(TXG_CONCURRENT_STATES + 1000)

enum ztest_object {
	ZTEST_META_DNODE = 0,
	ZTEST_DIROBJ,
	ZTEST_OBJECTS
};

static __attribute__((noreturn)) void usage(boolean_t requested);
static int ztest_scrub_impl(spa_t *spa);

/*
 * These libumem hooks provide a reasonable set of defaults for the allocator's
 * debugging facilities.
 */
const char *
_umem_debug_init(void)
{
	return ("default,verbose");	/* $UMEM_DEBUG setting */
}

const char *
_umem_logging_init(void)
{
	return ("fail,contents");	/* $UMEM_LOGGING setting */
}

static void
dump_debug_buffer(void)
{
	ssize_t ret __attribute__((unused));

	if (!ztest_opts.zo_dump_dbgmsg)
		return;

	/*
	 * We use write() instead of printf() so that this function
	 * is safe to call from a signal handler.
	 */
	ret = write(STDERR_FILENO, "\n", 1);
	zfs_dbgmsg_print(STDERR_FILENO, "ztest");
}

static void sig_handler(int signo)
{
	struct sigaction action;

	libspl_backtrace(STDERR_FILENO);
	dump_debug_buffer();

	/*
	 * Restore default action and re-raise signal so SIGSEGV and
	 * SIGABRT can trigger a core dump.
	 */
	action.sa_handler = SIG_DFL;
	sigemptyset(&action.sa_mask);
	action.sa_flags = 0;
	(void) sigaction(signo, &action, NULL);
	raise(signo);
}

#define	FATAL_MSG_SZ	1024

static const char *fatal_msg;

static __attribute__((format(printf, 2, 3))) __attribute__((noreturn)) void
fatal(int do_perror, const char *message, ...)
{
	va_list args;
	int save_errno = errno;
	char *buf;

	(void) fflush(stdout);
	buf = umem_alloc(FATAL_MSG_SZ, UMEM_NOFAIL);
	if (buf == NULL)
		goto out;

	va_start(args, message);
	(void) sprintf(buf, "ztest: ");
	/* LINTED */
	(void) vsprintf(buf + strlen(buf), message, args);
	va_end(args);
	if (do_perror) {
		(void) snprintf(buf + strlen(buf), FATAL_MSG_SZ - strlen(buf),
		    ": %s", strerror(save_errno));
	}
	(void) fprintf(stderr, "%s\n", buf);
	fatal_msg = buf;			/* to ease debugging */

out:
	if (ztest_dump_core)
		abort();
	else
		dump_debug_buffer();

	exit(3);
}

static int
str2shift(const char *buf)
{
	const char *ends = "BKMGTPEZ";
	int i, len;

	if (buf[0] == '\0')
		return (0);

	len = strlen(ends);
	for (i = 0; i < len; i++) {
		if (toupper(buf[0]) == ends[i])
			break;
	}
	if (i == len) {
		(void) fprintf(stderr, "ztest: invalid bytes suffix: %s\n",
		    buf);
		usage(B_FALSE);
	}
	if (buf[1] == '\0' || (toupper(buf[1]) == 'B' && buf[2] == '\0')) {
		return (10*i);
	}
	(void) fprintf(stderr, "ztest: invalid bytes suffix: %s\n", buf);
	usage(B_FALSE);
}

static uint64_t
nicenumtoull(const char *buf)
{
	char *end;
	uint64_t val;

	val = strtoull(buf, &end, 0);
	if (end == buf) {
		(void) fprintf(stderr, "ztest: bad numeric value: %s\n", buf);
		usage(B_FALSE);
	} else if (end[0] == '.') {
		double fval = strtod(buf, &end);
		fval *= pow(2, str2shift(end));
		/*
		 * UINT64_MAX is not exactly representable as a double.
		 * The closest representation is UINT64_MAX + 1, so we
		 * use a >= comparison instead of > for the bounds check.
		 */
		if (fval >= (double)UINT64_MAX) {
			(void) fprintf(stderr, "ztest: value too large: %s\n",
			    buf);
			usage(B_FALSE);
		}
		val = (uint64_t)fval;
	} else {
		int shift = str2shift(end);
		if (shift >= 64 || (val << shift) >> shift != val) {
			(void) fprintf(stderr, "ztest: value too large: %s\n",
			    buf);
			usage(B_FALSE);
		}
		val <<= shift;
	}
	return (val);
}
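
/*
 * A few illustrative values accepted by nicenumtoull()/str2shift() above
 * (suffixes are powers of 1024, and an optional trailing 'B' is allowed):
 *
 *	"128"	-> 128
 *	"64K"	-> 64 * 1024
 *	"1.5G"	-> (uint64_t)(1.5 * (1ULL << 30))
 */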

typedef struct ztest_option {
	const char	short_opt;
	const char	*long_opt;
	const char	*long_opt_param;
	const char	*comment;
	unsigned int	default_int;
	const char	*default_str;
} ztest_option_t;

/*
 * The following option_table is used for generating the usage info as well as
 * the long and short option information for calling getopt_long().
 */
static ztest_option_t option_table[] = {
	{ 'v',	"vdevs", "INTEGER", "Number of vdevs", DEFAULT_VDEV_COUNT,
	    NULL},
	{ 's',	"vdev-size", "INTEGER", "Size of each vdev",
	    NO_DEFAULT, DEFAULT_VDEV_SIZE_STR},
	{ 'a',	"alignment-shift", "INTEGER",
	    "Alignment shift; use 0 for random", DEFAULT_ASHIFT, NULL},
	{ 'm',	"mirror-copies", "INTEGER", "Number of mirror copies",
	    DEFAULT_MIRRORS, NULL},
	{ 'r',	"raid-disks", "INTEGER", "Number of raidz/draid disks",
	    DEFAULT_RAID_CHILDREN, NULL},
	{ 'R',	"raid-parity", "INTEGER", "Raid parity",
	    DEFAULT_RAID_PARITY, NULL},
	{ 'K',	"raid-kind", "raidz|eraidz|draid|random", "Raid kind",
	    NO_DEFAULT, "random"},
	{ 'D',	"draid-data", "INTEGER", "Number of draid data drives",
	    DEFAULT_DRAID_DATA, NULL},
	{ 'S',	"draid-spares", "INTEGER", "Number of draid spares",
	    DEFAULT_DRAID_SPARES, NULL},
	{ 'd',	"datasets", "INTEGER", "Number of datasets",
	    DEFAULT_DATASETS_COUNT, NULL},
	{ 't',	"threads", "INTEGER", "Number of ztest threads",
	    DEFAULT_THREADS, NULL},
	{ 'g',	"gang-block-threshold", "INTEGER",
	    "Metaslab gang block threshold",
	    NO_DEFAULT, DEFAULT_FORCE_GANGING_STR},
	{ 'i',	"init-count", "INTEGER", "Number of times to initialize pool",
	    DEFAULT_INITS, NULL},
	{ 'k',	"kill-percentage", "INTEGER", "Kill percentage",
	    NO_DEFAULT, DEFAULT_KILLRATE_STR},
	{ 'p',	"pool-name", "STRING", "Pool name",
	    NO_DEFAULT, DEFAULT_POOL},
	{ 'f',	"vdev-file-directory", "PATH", "File directory for vdev files",
	    NO_DEFAULT, DEFAULT_VDEV_DIR},
	{ 'M',	"multi-host", NULL,
	    "Multi-host; simulate pool imported on remote host",
	    NO_DEFAULT, NULL},
	{ 'E',	"use-existing-pool", NULL,
	    "Use existing pool instead of creating new one", NO_DEFAULT, NULL},
	{ 'T',	"run-time", "INTEGER", "Total run time",
	    NO_DEFAULT, DEFAULT_RUN_TIME_STR},
	{ 'P',	"pass-time", "INTEGER", "Time per pass",
	    NO_DEFAULT, DEFAULT_PASS_TIME_STR},
	{ 'F',	"freeze-loops", "INTEGER", "Max loops in spa_freeze()",
	    DEFAULT_MAX_LOOPS, NULL},
	{ 'B',	"alt-ztest", "PATH", "Alternate ztest path",
	    NO_DEFAULT, NULL},
	{ 'C',	"vdev-class-state", "on|off|random", "vdev class state",
	    NO_DEFAULT, "random"},
	{ 'X',	"raidz-expansion", NULL,
	    "Perform a dedicated raidz expansion test",
	    NO_DEFAULT, NULL},
	{ 'o',	"option", "\"OPTION=INTEGER\"",
	    "Set global variable to an unsigned 32-bit integer value",
	    NO_DEFAULT, NULL},
	{ 'G',	"dump-debug-msg", NULL,
	    "Dump zfs_dbgmsg buffer before exiting due to an error",
	    NO_DEFAULT, NULL},
	{ 'V',	"verbose", NULL,
	    "Verbose (use multiple times for ever more verbosity)",
	    NO_DEFAULT, NULL},
	{ 'h',	"help", NULL, "Show this help",
	    NO_DEFAULT, NULL},
	{0, 0, 0, 0, 0, 0}
};

static struct option *long_opts = NULL;
static char *short_opts = NULL;

static void
init_options(void)
{
	ASSERT3P(long_opts, ==, NULL);
	ASSERT3P(short_opts, ==, NULL);

	int count = sizeof (option_table) / sizeof (option_table[0]);
	long_opts = umem_alloc(sizeof (struct option) * count, UMEM_NOFAIL);

	short_opts = umem_alloc(sizeof (char) * 2 * count, UMEM_NOFAIL);
	int short_opt_index = 0;

	for (int i = 0; i < count; i++) {
		long_opts[i].val = option_table[i].short_opt;
		long_opts[i].name = option_table[i].long_opt;
		long_opts[i].has_arg = option_table[i].long_opt_param != NULL ?
		    required_argument : no_argument;
		long_opts[i].flag = NULL;
		short_opts[short_opt_index++] = option_table[i].short_opt;
		if (option_table[i].long_opt_param != NULL) {
			short_opts[short_opt_index++] = ':';
		}
	}
}

static void
fini_options(void)
{
	int count = sizeof (option_table) / sizeof (option_table[0]);

	umem_free(long_opts, sizeof (struct option) * count);
	umem_free(short_opts, sizeof (char) * 2 * count);

	long_opts = NULL;
	short_opts = NULL;
}

static __attribute__((noreturn)) void
usage(boolean_t requested)
{
	char option[80];
	FILE *fp = requested ? stdout : stderr;

	(void) fprintf(fp, "Usage: %s [OPTIONS...]\n", DEFAULT_POOL);
	for (int i = 0; option_table[i].short_opt != 0; i++) {
		if (option_table[i].long_opt_param != NULL) {
			(void) sprintf(option, "  -%c --%s=%s",
			    option_table[i].short_opt,
			    option_table[i].long_opt,
			    option_table[i].long_opt_param);
		} else {
			(void) sprintf(option, "  -%c --%s",
			    option_table[i].short_opt,
			    option_table[i].long_opt);
		}
		(void) fprintf(fp, "  %-43s%s", option,
		    option_table[i].comment);

		if (option_table[i].long_opt_param != NULL) {
			if (option_table[i].default_str != NULL) {
				(void) fprintf(fp, " (default: %s)",
				    option_table[i].default_str);
			} else if (option_table[i].default_int != NO_DEFAULT) {
				(void) fprintf(fp, " (default: %u)",
				    option_table[i].default_int);
			}
		}
		(void) fprintf(fp, "\n");
	}
	exit(requested ? 0 : 1);
}

static uint64_t
ztest_random(uint64_t range)
{
	uint64_t r;

	ASSERT3S(ztest_fd_rand, >=, 0);

	if (range == 0)
		return (0);

	if (read(ztest_fd_rand, &r, sizeof (r)) != sizeof (r))
		fatal(B_TRUE, "short read from /dev/urandom");

	return (r % range);
}

static void
ztest_parse_name_value(const char *input, ztest_shared_opts_t *zo)
{
	char name[32];
	char *value;
	int state = ZTEST_VDEV_CLASS_RND;

	(void) strlcpy(name, input, sizeof (name));

	value = strchr(name, '=');
	if (value == NULL) {
		(void) fprintf(stderr, "missing value in property=value "
		    "'-C' argument (%s)\n", input);
		usage(B_FALSE);
	}
	*(value) = '\0';
	value++;

	if (strcmp(value, "on") == 0) {
		state = ZTEST_VDEV_CLASS_ON;
	} else if (strcmp(value, "off") == 0) {
		state = ZTEST_VDEV_CLASS_OFF;
	} else if (strcmp(value, "random") == 0) {
		state = ZTEST_VDEV_CLASS_RND;
	} else {
		(void) fprintf(stderr, "invalid property value '%s'\n", value);
		usage(B_FALSE);
	}

	if (strcmp(name, "special") == 0) {
		zo->zo_special_vdevs = state;
	} else {
		(void) fprintf(stderr, "invalid property name '%s'\n", name);
		usage(B_FALSE);
	}
	if (zo->zo_verbose >= 3)
		(void) printf("%s vdev state is '%s'\n", name, value);
}

static void
process_options(int argc, char **argv)
{
	char *path;
	ztest_shared_opts_t *zo = &ztest_opts;

	int opt;
	uint64_t value;
	const char *raid_kind = "random";

	memcpy(zo, &ztest_opts_defaults, sizeof (*zo));

	init_options();

	while ((opt = getopt_long(argc, argv, short_opts, long_opts,
	    NULL)) != EOF) {
		value = 0;
		switch (opt) {
		case 'v':
		case 's':
		case 'a':
		case 'm':
		case 'r':
		case 'R':
		case 'D':
		case 'S':
		case 'd':
		case 't':
		case 'g':
		case 'i':
		case 'k':
		case 'T':
		case 'P':
		case 'F':
			value = nicenumtoull(optarg);
		}
		switch (opt) {
		case 'v':
			zo->zo_vdevs = value;
			break;
		case 's':
			zo->zo_vdev_size = MAX(SPA_MINDEVSIZE, value);
			break;
		case 'a':
			zo->zo_ashift = value;
			break;
		case 'm':
			zo->zo_mirrors = value;
			break;
		case 'r':
			zo->zo_raid_children = MAX(1, value);
			break;
		case 'R':
			zo->zo_raid_parity = MIN(MAX(value, 1), 3);
			break;
		case 'K':
			raid_kind = optarg;
			break;
		case 'D':
			zo->zo_draid_data = MAX(1, value);
			break;
		case 'S':
			zo->zo_draid_spares = MAX(1, value);
			break;
		case 'd':
			zo->zo_datasets = MAX(1, value);
			break;
		case 't':
			zo->zo_threads = MAX(1, value);
			break;
		case 'g':
			zo->zo_metaslab_force_ganging =
			    MAX(SPA_MINBLOCKSIZE << 1, value);
			break;
		case 'i':
			zo->zo_init = value;
			break;
		case 'k':
			zo->zo_killrate = value;
			break;
		case 'p':
			(void) strlcpy(zo->zo_pool, optarg,
			    sizeof (zo->zo_pool));
			break;
		case 'f':
			path = realpath(optarg, NULL);
			if (path == NULL) {
				(void) fprintf(stderr, "error: %s: %s\n",
				    optarg, strerror(errno));
				usage(B_FALSE);
			} else {
				(void) strlcpy(zo->zo_dir, path,
				    sizeof (zo->zo_dir));
				free(path);
			}
			break;
		case 'M':
			zo->zo_mmp_test = 1;
			break;
		case 'V':
			zo->zo_verbose++;
			break;
		case 'X':
			zo->zo_raidz_expand_test = RAIDZ_EXPAND_REQUESTED;
			break;
		case 'E':
			zo->zo_init = 0;
			break;
		case 'T':
			zo->zo_time = value;
			break;
		case 'P':
			zo->zo_passtime = MAX(1, value);
			break;
		case 'F':
			zo->zo_maxloops = MAX(1, value);
			break;
		case 'B':
			(void) strlcpy(zo->zo_alt_ztest, optarg,
			    sizeof (zo->zo_alt_ztest));
			break;
		case 'C':
			ztest_parse_name_value(optarg, zo);
			break;
		case 'o':
			if (zo->zo_gvars_count >= ZO_GVARS_MAX_COUNT) {
				(void) fprintf(stderr,
				    "max global var count (%zu) exceeded\n",
				    ZO_GVARS_MAX_COUNT);
				usage(B_FALSE);
			}
			char *v = zo->zo_gvars[zo->zo_gvars_count];
			if (strlcpy(v, optarg, ZO_GVARS_MAX_ARGLEN) >=
			    ZO_GVARS_MAX_ARGLEN) {
				(void) fprintf(stderr,
				    "global var option '%s' is too long\n",
				    optarg);
				usage(B_FALSE);
			}
			zo->zo_gvars_count++;
			break;
		case 'G':
			zo->zo_dump_dbgmsg = 1;
			break;
		case 'h':
			usage(B_TRUE);
			break;
		case '?':
		default:
			usage(B_FALSE);
			break;
		}
	}

	fini_options();

	/* Force compatible options for raidz expansion run */
	if (zo->zo_raidz_expand_test == RAIDZ_EXPAND_REQUESTED) {
		zo->zo_mmp_test = 0;
		zo->zo_mirrors = 0;
		zo->zo_vdevs = 1;
		zo->zo_vdev_size = DEFAULT_VDEV_SIZE * 2;
		zo->zo_raid_do_expand = B_FALSE;
		raid_kind = "raidz";
	}

	if (strcmp(raid_kind, "random") == 0) {
		switch (ztest_random(3)) {
		case 0:
			raid_kind = "raidz";
			break;
		case 1:
			raid_kind = "eraidz";
			break;
		case 2:
			raid_kind = "draid";
			break;
		}

		if (ztest_opts.zo_verbose >= 3)
			(void) printf("choosing RAID type '%s'\n", raid_kind);
	}

	if (strcmp(raid_kind, "draid") == 0) {
		uint64_t min_devsize;

		/* With fewer disks use 256M, otherwise 128M is OK */
		min_devsize = (ztest_opts.zo_raid_children < 16) ?
		    (256ULL << 20) : (128ULL << 20);

		/* No top-level mirrors with dRAID for now */
		zo->zo_mirrors = 0;

		/* Use more appropriate defaults for dRAID */
		if (zo->zo_vdevs == ztest_opts_defaults.zo_vdevs)
			zo->zo_vdevs = 1;
		if (zo->zo_raid_children ==
		    ztest_opts_defaults.zo_raid_children)
			zo->zo_raid_children = 16;
		if (zo->zo_ashift < 12)
			zo->zo_ashift = 12;
		if (zo->zo_vdev_size < min_devsize)
			zo->zo_vdev_size = min_devsize;

		if (zo->zo_draid_data + zo->zo_raid_parity >
		    zo->zo_raid_children - zo->zo_draid_spares) {
			(void) fprintf(stderr, "error: too few draid "
			    "children (%d) for stripe width (%d)\n",
			    zo->zo_raid_children,
			    zo->zo_draid_data + zo->zo_raid_parity);
			usage(B_FALSE);
		}

		(void) strlcpy(zo->zo_raid_type, VDEV_TYPE_DRAID,
		    sizeof (zo->zo_raid_type));

	} else if (strcmp(raid_kind, "eraidz") == 0) {
		/* using eraidz (expandable raidz) */
		zo->zo_raid_do_expand = B_TRUE;

		/* tests expect top-level to be raidz */
		zo->zo_mirrors = 0;
		zo->zo_vdevs = 1;

		/* Make sure parity is less than data columns */
		zo->zo_raid_parity = MIN(zo->zo_raid_parity,
		    zo->zo_raid_children - 1);

	} else /* using raidz */ {
		ASSERT0(strcmp(raid_kind, "raidz"));

		zo->zo_raid_parity = MIN(zo->zo_raid_parity,
		    zo->zo_raid_children - 1);
	}

	zo->zo_vdevtime =
	    (zo->zo_vdevs > 0 ? zo->zo_time * NANOSEC / zo->zo_vdevs :
	    UINT64_MAX >> 2);

	if (*zo->zo_alt_ztest) {
		const char *invalid_what = "ztest";
		char *val = zo->zo_alt_ztest;
		if (0 != access(val, X_OK) ||
		    (strrchr(val, '/') == NULL && (errno == EINVAL)))
			goto invalid;

		int dirlen = strrchr(val, '/') - val;
		strlcpy(zo->zo_alt_libpath, val,
		    MIN(sizeof (zo->zo_alt_libpath), dirlen + 1));
		invalid_what = "library path", val = zo->zo_alt_libpath;
		if (strrchr(val, '/') == NULL && (errno == EINVAL))
			goto invalid;
		*strrchr(val, '/') = '\0';
		strlcat(val, "/lib", sizeof (zo->zo_alt_libpath));

		if (0 != access(zo->zo_alt_libpath, X_OK))
			goto invalid;
		return;

invalid:
		ztest_dump_core = B_FALSE;
		fatal(B_TRUE, "invalid alternate %s %s", invalid_what, val);
	}
}

static void
ztest_kill(ztest_shared_t *zs)
{
	zs->zs_alloc = metaslab_class_get_alloc(spa_normal_class(ztest_spa));
	zs->zs_space = metaslab_class_get_space(spa_normal_class(ztest_spa));

	/*
	 * Before we kill ourselves, make sure that the config is updated.
	 * See comment above spa_write_cachefile().
	 */
	if (raidz_expand_pause_point != RAIDZ_EXPAND_PAUSE_NONE) {
		if (mutex_tryenter(&spa_namespace_lock)) {
			spa_write_cachefile(ztest_spa, B_FALSE, B_FALSE,
			    B_FALSE);
			mutex_exit(&spa_namespace_lock);

			ztest_scratch_state->zs_raidz_scratch_verify_pause =
			    raidz_expand_pause_point;
		} else {
			/*
			 * Do not verify the scratch object if the
			 * spa_namespace_lock cannot be acquired; doing so
			 * can cause a deadlock in spa_config_update().
			 */
			raidz_expand_pause_point = RAIDZ_EXPAND_PAUSE_NONE;

			return;
		}
	} else {
		mutex_enter(&spa_namespace_lock);
		spa_write_cachefile(ztest_spa, B_FALSE, B_FALSE, B_FALSE);
		mutex_exit(&spa_namespace_lock);
	}

	(void) raise(SIGKILL);
}

static void
ztest_record_enospc(const char *s)
{
	(void) s;
	ztest_shared->zs_enospc_count++;
}

static uint64_t
ztest_get_ashift(void)
{
	if (ztest_opts.zo_ashift == 0)
		return (SPA_MINBLOCKSHIFT + ztest_random(5));
	return (ztest_opts.zo_ashift);
}

static boolean_t
ztest_is_draid_spare(const char *name)
{
	uint64_t spare_id = 0, parity = 0, vdev_id = 0;

	if (sscanf(name, VDEV_TYPE_DRAID "%"PRIu64"-%"PRIu64"-%"PRIu64"",
	    &parity, &vdev_id, &spare_id) == 3) {
		return (B_TRUE);
	}

	return (B_FALSE);
}

static nvlist_t *
make_vdev_file(const char *path, const char *aux, const char *pool,
    size_t size, uint64_t ashift)
{
	char *pathbuf = NULL;
	uint64_t vdev;
	nvlist_t *file;
	boolean_t draid_spare = B_FALSE;

	if (ashift == 0)
		ashift = ztest_get_ashift();

	if (path == NULL) {
		pathbuf = umem_alloc(MAXPATHLEN, UMEM_NOFAIL);
		path = pathbuf;

		if (aux != NULL) {
			vdev = ztest_shared->zs_vdev_aux;
			(void) snprintf(pathbuf, MAXPATHLEN,
			    ztest_aux_template, ztest_opts.zo_dir,
			    pool == NULL ? ztest_opts.zo_pool : pool,
			    aux, vdev);
		} else {
			vdev = ztest_shared->zs_vdev_next_leaf++;
			(void) snprintf(pathbuf, MAXPATHLEN,
			    ztest_dev_template, ztest_opts.zo_dir,
			    pool == NULL ? ztest_opts.zo_pool : pool, vdev);
		}
	} else {
		draid_spare = ztest_is_draid_spare(path);
	}

	if (size != 0 && !draid_spare) {
		int fd = open(path, O_RDWR | O_CREAT | O_TRUNC, 0666);
		if (fd == -1)
			fatal(B_TRUE, "can't open %s", path);
		if (ftruncate(fd, size) != 0)
			fatal(B_TRUE, "can't ftruncate %s", path);
		(void) close(fd);
	}

	file = fnvlist_alloc();
	fnvlist_add_string(file, ZPOOL_CONFIG_TYPE,
	    draid_spare ? VDEV_TYPE_DRAID_SPARE : VDEV_TYPE_FILE);
	fnvlist_add_string(file, ZPOOL_CONFIG_PATH, path);
	fnvlist_add_uint64(file, ZPOOL_CONFIG_ASHIFT, ashift);
	umem_free(pathbuf, MAXPATHLEN);

	return (file);
}
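
/*
 * A sketch of how the make_vdev_*() helpers nest (make_vdev_file() above,
 * the rest below); this describes the call structure only, not additional
 * functionality:
 *
 *	make_vdev_root()
 *	    make_vdev_mirror()		one per top-level vdev (t)
 *		make_vdev_raid()	one per mirror copy (m)
 *		    make_vdev_file()	one per raidz/draid child (r)
 *
 * so a single make_vdev_root() call builds the complete nvlist tree that
 * callers hand to the pool configuration routines.
 */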

static nvlist_t *
make_vdev_raid(const char *path, const char *aux, const char *pool, size_t size,
    uint64_t ashift, int r)
{
	nvlist_t *raid, **child;
	int c;

	if (r < 2)
		return (make_vdev_file(path, aux, pool, size, ashift));

	child = umem_alloc(r * sizeof (nvlist_t *), UMEM_NOFAIL);

	for (c = 0; c < r; c++)
		child[c] = make_vdev_file(path, aux, pool, size, ashift);

	raid = fnvlist_alloc();
	fnvlist_add_string(raid, ZPOOL_CONFIG_TYPE,
	    ztest_opts.zo_raid_type);
	fnvlist_add_uint64(raid, ZPOOL_CONFIG_NPARITY,
	    ztest_opts.zo_raid_parity);
	fnvlist_add_nvlist_array(raid, ZPOOL_CONFIG_CHILDREN,
	    (const nvlist_t **)child, r);

	if (strcmp(ztest_opts.zo_raid_type, VDEV_TYPE_DRAID) == 0) {
		uint64_t ndata = ztest_opts.zo_draid_data;
		uint64_t nparity = ztest_opts.zo_raid_parity;
		uint64_t nspares = ztest_opts.zo_draid_spares;
		uint64_t children = ztest_opts.zo_raid_children;
		uint64_t ngroups = 1;

		/*
		 * Calculate the minimum number of groups required to fill a
		 * slice.  This is the LCM of the stripe width (data + parity)
		 * and the number of data drives (children - spares).
		 */
		while (ngroups * (ndata + nparity) % (children - nspares) != 0)
			ngroups++;

		/* Store the basic dRAID configuration. */
		fnvlist_add_uint64(raid, ZPOOL_CONFIG_DRAID_NDATA, ndata);
		fnvlist_add_uint64(raid, ZPOOL_CONFIG_DRAID_NSPARES, nspares);
		fnvlist_add_uint64(raid, ZPOOL_CONFIG_DRAID_NGROUPS, ngroups);
	}

	for (c = 0; c < r; c++)
		fnvlist_free(child[c]);

	umem_free(child, r * sizeof (nvlist_t *));

	return (raid);
}

static nvlist_t *
make_vdev_mirror(const char *path, const char *aux, const char *pool,
    size_t size, uint64_t ashift, int r, int m)
{
	nvlist_t *mirror, **child;
	int c;

	if (m < 1)
		return (make_vdev_raid(path, aux, pool, size, ashift, r));

	child = umem_alloc(m * sizeof (nvlist_t *), UMEM_NOFAIL);

	for (c = 0; c < m; c++)
		child[c] = make_vdev_raid(path, aux, pool, size, ashift, r);

	mirror = fnvlist_alloc();
	fnvlist_add_string(mirror, ZPOOL_CONFIG_TYPE, VDEV_TYPE_MIRROR);
	fnvlist_add_nvlist_array(mirror, ZPOOL_CONFIG_CHILDREN,
	    (const nvlist_t **)child, m);

	for (c = 0; c < m; c++)
		fnvlist_free(child[c]);

	umem_free(child, m * sizeof (nvlist_t *));

	return (mirror);
}

static nvlist_t *
make_vdev_root(const char *path, const char *aux, const char *pool, size_t size,
    uint64_t ashift, const char *class, int r, int m, int t)
{
	nvlist_t *root, **child;
	int c;
	boolean_t log;

	ASSERT3S(t, >, 0);

	log = (class != NULL && strcmp(class, "log") == 0);

	child = umem_alloc(t * sizeof (nvlist_t *), UMEM_NOFAIL);

	for (c = 0; c < t; c++) {
		child[c] = make_vdev_mirror(path, aux, pool, size, ashift,
		    r, m);
		fnvlist_add_uint64(child[c], ZPOOL_CONFIG_IS_LOG, log);

		if (class != NULL && class[0] != '\0') {
			ASSERT(m > 1 || log);	/* expecting a mirror */
			fnvlist_add_string(child[c],
			    ZPOOL_CONFIG_ALLOCATION_BIAS, class);
		}
	}

	root = fnvlist_alloc();
	fnvlist_add_string(root, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT);
	fnvlist_add_nvlist_array(root, aux ? aux : ZPOOL_CONFIG_CHILDREN,
	    (const nvlist_t **)child, t);

	for (c = 0; c < t; c++)
		fnvlist_free(child[c]);

	umem_free(child, t * sizeof (nvlist_t *));

	return (root);
}

/*
 * Find a random spa version.  Returns back a random spa version in the
 * range [initial_version, SPA_VERSION_FEATURES].
 */
static uint64_t
ztest_random_spa_version(uint64_t initial_version)
{
	uint64_t version = initial_version;

	if (version <= SPA_VERSION_BEFORE_FEATURES) {
		version = version +
		    ztest_random(SPA_VERSION_BEFORE_FEATURES - version + 1);
	}

	if (version > SPA_VERSION_BEFORE_FEATURES)
		version = SPA_VERSION_FEATURES;

	ASSERT(SPA_VERSION_IS_SUPPORTED(version));
	return (version);
}

static int
ztest_random_blocksize(void)
{
	ASSERT3U(ztest_spa->spa_max_ashift, !=, 0);

	/*
	 * Choose a block size >= the ashift.
	 * If the SPA supports new MAXBLOCKSIZE, test up to 1MB blocks.
	 */
	int maxbs = SPA_OLD_MAXBLOCKSHIFT;
	if (spa_maxblocksize(ztest_spa) == SPA_MAXBLOCKSIZE)
		maxbs = 20;
	uint64_t block_shift =
	    ztest_random(maxbs - ztest_spa->spa_max_ashift + 1);
	return (1 << (SPA_MINBLOCKSHIFT + block_shift));
}

static int
ztest_random_dnodesize(void)
{
	int slots;
	int max_slots = spa_maxdnodesize(ztest_spa) >> DNODE_SHIFT;

	if (max_slots == DNODE_MIN_SLOTS)
		return (DNODE_MIN_SIZE);

	/*
	 * Weight the random distribution more heavily toward smaller
	 * dnode sizes since that is more likely to reflect real-world
	 * usage.
	 */
	ASSERT3U(max_slots, >, 4);
	switch (ztest_random(10)) {
	case 0:
		slots = 5 + ztest_random(max_slots - 4);
		break;
	case 1 ... 4:
		slots = 2 + ztest_random(3);
		break;
	default:
		slots = 1;
		break;
	}

	return (slots << DNODE_SHIFT);
}

static int
ztest_random_ibshift(void)
{
	return (DN_MIN_INDBLKSHIFT +
	    ztest_random(DN_MAX_INDBLKSHIFT - DN_MIN_INDBLKSHIFT + 1));
}

static uint64_t
ztest_random_vdev_top(spa_t *spa, boolean_t log_ok)
{
	uint64_t top;
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *tvd;

	ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);

	do {
		top = ztest_random(rvd->vdev_children);
		tvd = rvd->vdev_child[top];
	} while (!vdev_is_concrete(tvd) || (tvd->vdev_islog && !log_ok) ||
	    tvd->vdev_mg == NULL || tvd->vdev_mg->mg_class == NULL);

	return (top);
}

static uint64_t
ztest_random_dsl_prop(zfs_prop_t prop)
{
	uint64_t value;

	do {
		value = zfs_prop_random_value(prop, ztest_random(-1ULL));
	} while (prop == ZFS_PROP_CHECKSUM && value == ZIO_CHECKSUM_OFF);

	return (value);
}

static int
ztest_dsl_prop_set_uint64(char *osname, zfs_prop_t prop, uint64_t value,
    boolean_t inherit)
{
	const char *propname = zfs_prop_to_name(prop);
	const char *valname;
	char *setpoint;
	uint64_t curval;
	int error;

	error = dsl_prop_set_int(osname, propname,
	    (inherit ? ZPROP_SRC_NONE : ZPROP_SRC_LOCAL), value);

	if (error == ENOSPC) {
		ztest_record_enospc(FTAG);
		return (error);
	}
	ASSERT0(error);

	setpoint = umem_alloc(MAXPATHLEN, UMEM_NOFAIL);
	VERIFY0(dsl_prop_get_integer(osname, propname, &curval, setpoint));

	if (ztest_opts.zo_verbose >= 6) {
		int err;

		err = zfs_prop_index_to_string(prop, curval, &valname);
		if (err)
			(void) printf("%s %s = %llu at '%s'\n", osname,
			    propname, (unsigned long long)curval, setpoint);
		else
			(void) printf("%s %s = %s at '%s'\n",
			    osname, propname, valname, setpoint);
	}
	umem_free(setpoint, MAXPATHLEN);

	return (error);
}

static int
ztest_spa_prop_set_uint64(zpool_prop_t prop, uint64_t value)
{
	spa_t *spa = ztest_spa;
	nvlist_t *props = NULL;
	int error;

	props = fnvlist_alloc();
	fnvlist_add_uint64(props, zpool_prop_to_name(prop), value);

	error = spa_prop_set(spa, props);

	fnvlist_free(props);

	if (error == ENOSPC) {
		ztest_record_enospc(FTAG);
		return (error);
	}
	ASSERT0(error);

	return (error);
}

static int
ztest_dmu_objset_own(const char *name, dmu_objset_type_t type,
    boolean_t readonly, boolean_t decrypt, const void *tag, objset_t **osp)
{
	int err;
	char *cp = NULL;
	char ddname[ZFS_MAX_DATASET_NAME_LEN];

	strlcpy(ddname, name, sizeof (ddname));
	cp = strchr(ddname, '@');
	if (cp != NULL)
		*cp = '\0';

	err = dmu_objset_own(name, type, readonly, decrypt, tag, osp);
	while (decrypt && err == EACCES) {
		dsl_crypto_params_t *dcp;
		nvlist_t *crypto_args = fnvlist_alloc();

		fnvlist_add_uint8_array(crypto_args, "wkeydata",
		    (uint8_t *)ztest_wkeydata, WRAPPING_KEY_LEN);
		VERIFY0(dsl_crypto_params_create_nvlist(DCP_CMD_NONE, NULL,
		    crypto_args, &dcp));
		err = spa_keystore_load_wkey(ddname, dcp, B_FALSE);
		/*
		 * Note: if there was an error loading, the wkey was not
		 * consumed, and needs to be freed.
		 */
		dsl_crypto_params_free(dcp, (err != 0));
		fnvlist_free(crypto_args);

		if (err == EINVAL) {
			/*
			 * We couldn't load a key for this dataset so try
			 * the parent.  This loop will eventually hit the
			 * encryption root since ztest only makes clones
			 * as children of their origin datasets.
			 */
			cp = strrchr(ddname, '/');
			if (cp == NULL)
				return (err);

			*cp = '\0';
			err = EACCES;
			continue;
		} else if (err != 0) {
			break;
		}

		err = dmu_objset_own(name, type, readonly, decrypt, tag, osp);
		break;
	}

	return (err);
}

static void
ztest_rll_init(rll_t *rll)
{
	rll->rll_writer = NULL;
	rll->rll_readers = 0;
	mutex_init(&rll->rll_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&rll->rll_cv, NULL, CV_DEFAULT, NULL);
}

static void
ztest_rll_destroy(rll_t *rll)
{
	ASSERT3P(rll->rll_writer, ==, NULL);
	ASSERT0(rll->rll_readers);
	mutex_destroy(&rll->rll_lock);
	cv_destroy(&rll->rll_cv);
}

static void
ztest_rll_lock(rll_t *rll, rl_type_t type)
{
	mutex_enter(&rll->rll_lock);

	if (type == ZTRL_READER) {
		while (rll->rll_writer != NULL)
			(void) cv_wait(&rll->rll_cv, &rll->rll_lock);
		rll->rll_readers++;
	} else {
		while (rll->rll_writer != NULL || rll->rll_readers)
			(void) cv_wait(&rll->rll_cv, &rll->rll_lock);
		rll->rll_writer = curthread;
	}

	mutex_exit(&rll->rll_lock);
}

static void
ztest_rll_unlock(rll_t *rll)
{
	mutex_enter(&rll->rll_lock);

	if (rll->rll_writer) {
		ASSERT0(rll->rll_readers);
		rll->rll_writer = NULL;
	} else {
		ASSERT3S(rll->rll_readers, >, 0);
		ASSERT3P(rll->rll_writer, ==, NULL);
		rll->rll_readers--;
	}

	if (rll->rll_writer == NULL && rll->rll_readers == 0)
		cv_broadcast(&rll->rll_cv);

	mutex_exit(&rll->rll_lock);
}

static void
ztest_object_lock(ztest_ds_t *zd, uint64_t object, rl_type_t type)
{
	rll_t *rll = &zd->zd_object_lock[object & (ZTEST_OBJECT_LOCKS - 1)];

	ztest_rll_lock(rll, type);
}

static void
ztest_object_unlock(ztest_ds_t *zd, uint64_t object)
{
	rll_t *rll = &zd->zd_object_lock[object & (ZTEST_OBJECT_LOCKS - 1)];

	ztest_rll_unlock(rll);
}

static rl_t *
ztest_range_lock(ztest_ds_t *zd, uint64_t object, uint64_t offset,
    uint64_t size, rl_type_t type)
{
	uint64_t hash = object ^ (offset % (ZTEST_RANGE_LOCKS + 1));
	rll_t *rll = &zd->zd_range_lock[hash & (ZTEST_RANGE_LOCKS - 1)];
	rl_t *rl;

	rl = umem_alloc(sizeof (*rl), UMEM_NOFAIL);
	rl->rl_object = object;
	rl->rl_offset = offset;
	rl->rl_size = size;
	rl->rl_lock = rll;

	ztest_rll_lock(rll, type);

	return (rl);
}

static void
ztest_range_unlock(rl_t *rl)
{
	rll_t *rll = rl->rl_lock;

	ztest_rll_unlock(rll);

	umem_free(rl, sizeof (*rl));
}

static void
ztest_zd_init(ztest_ds_t *zd, ztest_shared_ds_t *szd, objset_t *os)
{
	zd->zd_os = os;
	zd->zd_zilog = dmu_objset_zil(os);
	zd->zd_shared = szd;
	dmu_objset_name(os, zd->zd_name);
	int l;

	if (zd->zd_shared != NULL)
		zd->zd_shared->zd_seq = 0;

	VERIFY0(pthread_rwlock_init(&zd->zd_zilog_lock, NULL));
	mutex_init(&zd->zd_dirobj_lock, NULL, MUTEX_DEFAULT, NULL);

	for (l = 0; l < ZTEST_OBJECT_LOCKS; l++)
		ztest_rll_init(&zd->zd_object_lock[l]);

	for (l = 0; l < ZTEST_RANGE_LOCKS; l++)
		ztest_rll_init(&zd->zd_range_lock[l]);
}

static void
ztest_zd_fini(ztest_ds_t *zd)
{
	int l;

	mutex_destroy(&zd->zd_dirobj_lock);
	(void) pthread_rwlock_destroy(&zd->zd_zilog_lock);

	for (l = 0; l < ZTEST_OBJECT_LOCKS; l++)
		ztest_rll_destroy(&zd->zd_object_lock[l]);

	for (l = 0; l < ZTEST_RANGE_LOCKS; l++)
		ztest_rll_destroy(&zd->zd_range_lock[l]);
}
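
/*
 * Transaction assignment helpers.  ztest_tx_assign() below returns the
 * assigned txg, or 0 if the tx could not be assigned (ENOSPC, or an
 * ERESTART when called with DMU_TX_NOWAIT); in the 0 case the tx has
 * already been aborted, so no further cleanup of it is needed.
 */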

#define	DMU_TX_MIGHTWAIT \
	(ztest_random(10) == 0 ? DMU_TX_NOWAIT : DMU_TX_WAIT)

static uint64_t
ztest_tx_assign(dmu_tx_t *tx, uint64_t txg_how, const char *tag)
{
	uint64_t txg;
	int error;

	/*
	 * Attempt to assign tx to some transaction group.
	 */
	error = dmu_tx_assign(tx, txg_how);
	if (error) {
		if (error == ERESTART) {
			ASSERT3U(txg_how, ==, DMU_TX_NOWAIT);
			dmu_tx_wait(tx);
		} else {
			ASSERT3U(error, ==, ENOSPC);
			ztest_record_enospc(tag);
		}
		dmu_tx_abort(tx);
		return (0);
	}
	txg = dmu_tx_get_txg(tx);
	ASSERT3U(txg, !=, 0);
	return (txg);
}

static void
ztest_bt_generate(ztest_block_tag_t *bt, objset_t *os, uint64_t object,
    uint64_t dnodesize, uint64_t offset, uint64_t gen, uint64_t txg,
    uint64_t crtxg)
{
	bt->bt_magic = BT_MAGIC;
	bt->bt_objset = dmu_objset_id(os);
	bt->bt_object = object;
	bt->bt_dnodesize = dnodesize;
	bt->bt_offset = offset;
	bt->bt_gen = gen;
	bt->bt_txg = txg;
	bt->bt_crtxg = crtxg;
}

static void
ztest_bt_verify(ztest_block_tag_t *bt, objset_t *os, uint64_t object,
    uint64_t dnodesize, uint64_t offset, uint64_t gen, uint64_t txg,
    uint64_t crtxg)
{
	ASSERT3U(bt->bt_magic, ==, BT_MAGIC);
	ASSERT3U(bt->bt_objset, ==, dmu_objset_id(os));
	ASSERT3U(bt->bt_object, ==, object);
	ASSERT3U(bt->bt_dnodesize, ==, dnodesize);
	ASSERT3U(bt->bt_offset, ==, offset);
	ASSERT3U(bt->bt_gen, <=, gen);
	ASSERT3U(bt->bt_txg, <=, txg);
	ASSERT3U(bt->bt_crtxg, ==, crtxg);
}

static ztest_block_tag_t *
ztest_bt_bonus(dmu_buf_t *db)
{
	dmu_object_info_t doi;
	ztest_block_tag_t *bt;

	dmu_object_info_from_db(db, &doi);
	ASSERT3U(doi.doi_bonus_size, <=, db->db_size);
	ASSERT3U(doi.doi_bonus_size, >=, sizeof (*bt));
	bt = (void *)((char *)db->db_data + doi.doi_bonus_size - sizeof (*bt));

	return (bt);
}

/*
 * Generate a token to fill up unused bonus buffer space.  Try to make
 * it unique to the object, generation, and offset to verify that data
 * is not getting overwritten by data from other dnodes.
 */
#define	ZTEST_BONUS_FILL_TOKEN(obj, ds, gen, offset)	\
	(((ds) << 48) | ((gen) << 32) | ((obj) << 8) | (offset))

/*
 * Fill up the unused bonus buffer region before the block tag with a
 * verifiable pattern.  Filling the whole bonus area with non-zero data
 * helps ensure that all dnode traversal code properly skips the
 * interior regions of large dnodes.
 */
static void
ztest_fill_unused_bonus(dmu_buf_t *db, void *end, uint64_t obj,
    objset_t *os, uint64_t gen)
{
	uint64_t *bonusp;

	ASSERT(IS_P2ALIGNED((char *)end - (char *)db->db_data, 8));

	for (bonusp = db->db_data; bonusp < (uint64_t *)end; bonusp++) {
		uint64_t token = ZTEST_BONUS_FILL_TOKEN(obj, dmu_objset_id(os),
		    gen, bonusp - (uint64_t *)db->db_data);
		*bonusp = token;
	}
}
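
/*
 * For reference, ZTEST_BONUS_FILL_TOKEN() above packs its arguments as
 * (ds << 48) | (gen << 32) | (obj << 8) | offset, without masking, e.g.:
 *
 *	ZTEST_BONUS_FILL_TOKEN(1, 3, 2, 0) == 0x0003000200000100ULL
 *
 * so the value is unique per (objset, generation, object, offset) for
 * small inputs rather than being a strict bit-field layout.
 */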
1920 */ 1921 static void 1922 ztest_verify_unused_bonus(dmu_buf_t *db, void *end, uint64_t obj, 1923 objset_t *os, uint64_t gen) 1924 { 1925 uint64_t *bonusp; 1926 1927 for (bonusp = db->db_data; bonusp < (uint64_t *)end; bonusp++) { 1928 uint64_t token = ZTEST_BONUS_FILL_TOKEN(obj, dmu_objset_id(os), 1929 gen, bonusp - (uint64_t *)db->db_data); 1930 VERIFY3U(*bonusp, ==, token); 1931 } 1932 } 1933 1934 /* 1935 * ZIL logging ops 1936 */ 1937 1938 #define lrz_type lr_mode 1939 #define lrz_blocksize lr_uid 1940 #define lrz_ibshift lr_gid 1941 #define lrz_bonustype lr_rdev 1942 #define lrz_dnodesize lr_crtime[1] 1943 1944 static void 1945 ztest_log_create(ztest_ds_t *zd, dmu_tx_t *tx, lr_create_t *lr) 1946 { 1947 char *name = (char *)&lr->lr_data[0]; /* name follows lr */ 1948 size_t namesize = strlen(name) + 1; 1949 itx_t *itx; 1950 1951 if (zil_replaying(zd->zd_zilog, tx)) 1952 return; 1953 1954 itx = zil_itx_create(TX_CREATE, sizeof (*lr) + namesize); 1955 memcpy(&itx->itx_lr + 1, &lr->lr_create.lr_common + 1, 1956 sizeof (*lr) + namesize - sizeof (lr_t)); 1957 1958 zil_itx_assign(zd->zd_zilog, itx, tx); 1959 } 1960 1961 static void 1962 ztest_log_remove(ztest_ds_t *zd, dmu_tx_t *tx, lr_remove_t *lr, uint64_t object) 1963 { 1964 char *name = (char *)&lr->lr_data[0]; /* name follows lr */ 1965 size_t namesize = strlen(name) + 1; 1966 itx_t *itx; 1967 1968 if (zil_replaying(zd->zd_zilog, tx)) 1969 return; 1970 1971 itx = zil_itx_create(TX_REMOVE, sizeof (*lr) + namesize); 1972 memcpy(&itx->itx_lr + 1, &lr->lr_common + 1, 1973 sizeof (*lr) + namesize - sizeof (lr_t)); 1974 1975 itx->itx_oid = object; 1976 zil_itx_assign(zd->zd_zilog, itx, tx); 1977 } 1978 1979 static void 1980 ztest_log_write(ztest_ds_t *zd, dmu_tx_t *tx, lr_write_t *lr) 1981 { 1982 itx_t *itx; 1983 itx_wr_state_t write_state = ztest_random(WR_NUM_STATES); 1984 1985 if (zil_replaying(zd->zd_zilog, tx)) 1986 return; 1987 1988 if (lr->lr_length > zil_max_log_data(zd->zd_zilog, sizeof (lr_write_t))) 1989 write_state = WR_INDIRECT; 1990 1991 itx = zil_itx_create(TX_WRITE, 1992 sizeof (*lr) + (write_state == WR_COPIED ? 
lr->lr_length : 0)); 1993 1994 if (write_state == WR_COPIED && 1995 dmu_read(zd->zd_os, lr->lr_foid, lr->lr_offset, lr->lr_length, 1996 ((lr_write_t *)&itx->itx_lr) + 1, DMU_READ_NO_PREFETCH | 1997 DMU_KEEP_CACHING) != 0) { 1998 zil_itx_destroy(itx); 1999 itx = zil_itx_create(TX_WRITE, sizeof (*lr)); 2000 write_state = WR_NEED_COPY; 2001 } 2002 itx->itx_private = zd; 2003 itx->itx_wr_state = write_state; 2004 itx->itx_sync = (ztest_random(8) == 0); 2005 2006 memcpy(&itx->itx_lr + 1, &lr->lr_common + 1, 2007 sizeof (*lr) - sizeof (lr_t)); 2008 2009 zil_itx_assign(zd->zd_zilog, itx, tx); 2010 } 2011 2012 static void 2013 ztest_log_truncate(ztest_ds_t *zd, dmu_tx_t *tx, lr_truncate_t *lr) 2014 { 2015 itx_t *itx; 2016 2017 if (zil_replaying(zd->zd_zilog, tx)) 2018 return; 2019 2020 itx = zil_itx_create(TX_TRUNCATE, sizeof (*lr)); 2021 memcpy(&itx->itx_lr + 1, &lr->lr_common + 1, 2022 sizeof (*lr) - sizeof (lr_t)); 2023 2024 itx->itx_sync = B_FALSE; 2025 zil_itx_assign(zd->zd_zilog, itx, tx); 2026 } 2027 2028 static void 2029 ztest_log_setattr(ztest_ds_t *zd, dmu_tx_t *tx, lr_setattr_t *lr) 2030 { 2031 itx_t *itx; 2032 2033 if (zil_replaying(zd->zd_zilog, tx)) 2034 return; 2035 2036 itx = zil_itx_create(TX_SETATTR, sizeof (*lr)); 2037 memcpy(&itx->itx_lr + 1, &lr->lr_common + 1, 2038 sizeof (*lr) - sizeof (lr_t)); 2039 2040 itx->itx_sync = B_FALSE; 2041 zil_itx_assign(zd->zd_zilog, itx, tx); 2042 } 2043 2044 /* 2045 * ZIL replay ops 2046 */ 2047 static int 2048 ztest_replay_create(void *arg1, void *arg2, boolean_t byteswap) 2049 { 2050 ztest_ds_t *zd = arg1; 2051 lr_create_t *lrc = arg2; 2052 _lr_create_t *lr = &lrc->lr_create; 2053 char *name = (char *)&lrc->lr_data[0]; /* name follows lr */ 2054 objset_t *os = zd->zd_os; 2055 ztest_block_tag_t *bbt; 2056 dmu_buf_t *db; 2057 dmu_tx_t *tx; 2058 uint64_t txg; 2059 int error = 0; 2060 int bonuslen; 2061 2062 if (byteswap) 2063 byteswap_uint64_array(lr, sizeof (*lr)); 2064 2065 ASSERT3U(lr->lr_doid, ==, ZTEST_DIROBJ); 2066 ASSERT3S(name[0], !=, '\0'); 2067 2068 tx = dmu_tx_create(os); 2069 2070 dmu_tx_hold_zap(tx, lr->lr_doid, B_TRUE, name); 2071 2072 if (lr->lrz_type == DMU_OT_ZAP_OTHER) { 2073 dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL); 2074 } else { 2075 dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); 2076 } 2077 2078 txg = ztest_tx_assign(tx, DMU_TX_WAIT, FTAG); 2079 if (txg == 0) 2080 return (ENOSPC); 2081 2082 ASSERT3U(dmu_objset_zil(os)->zl_replay, ==, !!lr->lr_foid); 2083 bonuslen = DN_BONUS_SIZE(lr->lrz_dnodesize); 2084 2085 if (lr->lrz_type == DMU_OT_ZAP_OTHER) { 2086 if (lr->lr_foid == 0) { 2087 lr->lr_foid = zap_create_dnsize(os, 2088 lr->lrz_type, lr->lrz_bonustype, 2089 bonuslen, lr->lrz_dnodesize, tx); 2090 } else { 2091 error = zap_create_claim_dnsize(os, lr->lr_foid, 2092 lr->lrz_type, lr->lrz_bonustype, 2093 bonuslen, lr->lrz_dnodesize, tx); 2094 } 2095 } else { 2096 if (lr->lr_foid == 0) { 2097 lr->lr_foid = dmu_object_alloc_dnsize(os, 2098 lr->lrz_type, 0, lr->lrz_bonustype, 2099 bonuslen, lr->lrz_dnodesize, tx); 2100 } else { 2101 error = dmu_object_claim_dnsize(os, lr->lr_foid, 2102 lr->lrz_type, 0, lr->lrz_bonustype, 2103 bonuslen, lr->lrz_dnodesize, tx); 2104 } 2105 } 2106 2107 if (error) { 2108 ASSERT3U(error, ==, EEXIST); 2109 ASSERT(zd->zd_zilog->zl_replay); 2110 dmu_tx_commit(tx); 2111 return (error); 2112 } 2113 2114 ASSERT3U(lr->lr_foid, !=, 0); 2115 2116 if (lr->lrz_type != DMU_OT_ZAP_OTHER) 2117 VERIFY0(dmu_object_set_blocksize(os, lr->lr_foid, 2118 lr->lrz_blocksize, lr->lrz_ibshift, tx)); 2119 2120 
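	/*
	 * Stamp a block tag at the tail of the new object's bonus buffer
	 * and fill the remaining bonus space with the token pattern, so
	 * later reads can verify the object's generation and creation txg.
	 */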
VERIFY0(dmu_bonus_hold(os, lr->lr_foid, FTAG, &db)); 2121 bbt = ztest_bt_bonus(db); 2122 dmu_buf_will_dirty(db, tx); 2123 ztest_bt_generate(bbt, os, lr->lr_foid, lr->lrz_dnodesize, -1ULL, 2124 lr->lr_gen, txg, txg); 2125 ztest_fill_unused_bonus(db, bbt, lr->lr_foid, os, lr->lr_gen); 2126 dmu_buf_rele(db, FTAG); 2127 2128 VERIFY0(zap_add(os, lr->lr_doid, name, sizeof (uint64_t), 1, 2129 &lr->lr_foid, tx)); 2130 2131 (void) ztest_log_create(zd, tx, lrc); 2132 2133 dmu_tx_commit(tx); 2134 2135 return (0); 2136 } 2137 2138 static int 2139 ztest_replay_remove(void *arg1, void *arg2, boolean_t byteswap) 2140 { 2141 ztest_ds_t *zd = arg1; 2142 lr_remove_t *lr = arg2; 2143 char *name = (char *)&lr->lr_data[0]; /* name follows lr */ 2144 objset_t *os = zd->zd_os; 2145 dmu_object_info_t doi; 2146 dmu_tx_t *tx; 2147 uint64_t object, txg; 2148 2149 if (byteswap) 2150 byteswap_uint64_array(lr, sizeof (*lr)); 2151 2152 ASSERT3U(lr->lr_doid, ==, ZTEST_DIROBJ); 2153 ASSERT3S(name[0], !=, '\0'); 2154 2155 VERIFY0( 2156 zap_lookup(os, lr->lr_doid, name, sizeof (object), 1, &object)); 2157 ASSERT3U(object, !=, 0); 2158 2159 ztest_object_lock(zd, object, ZTRL_WRITER); 2160 2161 VERIFY0(dmu_object_info(os, object, &doi)); 2162 2163 tx = dmu_tx_create(os); 2164 2165 dmu_tx_hold_zap(tx, lr->lr_doid, B_FALSE, name); 2166 dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END); 2167 2168 txg = ztest_tx_assign(tx, DMU_TX_WAIT, FTAG); 2169 if (txg == 0) { 2170 ztest_object_unlock(zd, object); 2171 return (ENOSPC); 2172 } 2173 2174 if (doi.doi_type == DMU_OT_ZAP_OTHER) { 2175 VERIFY0(zap_destroy(os, object, tx)); 2176 } else { 2177 VERIFY0(dmu_object_free(os, object, tx)); 2178 } 2179 2180 VERIFY0(zap_remove(os, lr->lr_doid, name, tx)); 2181 2182 (void) ztest_log_remove(zd, tx, lr, object); 2183 2184 dmu_tx_commit(tx); 2185 2186 ztest_object_unlock(zd, object); 2187 2188 return (0); 2189 } 2190 2191 static int 2192 ztest_replay_write(void *arg1, void *arg2, boolean_t byteswap) 2193 { 2194 ztest_ds_t *zd = arg1; 2195 lr_write_t *lr = arg2; 2196 objset_t *os = zd->zd_os; 2197 uint8_t *data = &lr->lr_data[0]; /* data follows lr */ 2198 uint64_t offset, length; 2199 ztest_block_tag_t *bt = (ztest_block_tag_t *)data; 2200 ztest_block_tag_t *bbt; 2201 uint64_t gen, txg, lrtxg, crtxg; 2202 dmu_object_info_t doi; 2203 dmu_tx_t *tx; 2204 dmu_buf_t *db; 2205 arc_buf_t *abuf = NULL; 2206 rl_t *rl; 2207 2208 if (byteswap) 2209 byteswap_uint64_array(lr, sizeof (*lr)); 2210 2211 offset = lr->lr_offset; 2212 length = lr->lr_length; 2213 2214 /* If it's a dmu_sync() block, write the whole block */ 2215 if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) { 2216 uint64_t blocksize = BP_GET_LSIZE(&lr->lr_blkptr); 2217 if (length < blocksize) { 2218 offset -= offset % blocksize; 2219 length = blocksize; 2220 } 2221 } 2222 2223 if (bt->bt_magic == BSWAP_64(BT_MAGIC)) 2224 byteswap_uint64_array(bt, sizeof (*bt)); 2225 2226 if (bt->bt_magic != BT_MAGIC) 2227 bt = NULL; 2228 2229 ztest_object_lock(zd, lr->lr_foid, ZTRL_READER); 2230 rl = ztest_range_lock(zd, lr->lr_foid, offset, length, ZTRL_WRITER); 2231 2232 VERIFY0(dmu_bonus_hold(os, lr->lr_foid, FTAG, &db)); 2233 2234 dmu_object_info_from_db(db, &doi); 2235 2236 bbt = ztest_bt_bonus(db); 2237 ASSERT3U(bbt->bt_magic, ==, BT_MAGIC); 2238 gen = bbt->bt_gen; 2239 crtxg = bbt->bt_crtxg; 2240 lrtxg = lr->lr_common.lrc_txg; 2241 2242 tx = dmu_tx_create(os); 2243 2244 dmu_tx_hold_write(tx, lr->lr_foid, offset, length); 2245 2246 if (ztest_random(8) == 0 && length == doi.doi_data_block_size && 2247 
P2PHASE(offset, length) == 0) 2248 abuf = dmu_request_arcbuf(db, length); 2249 2250 txg = ztest_tx_assign(tx, DMU_TX_WAIT, FTAG); 2251 if (txg == 0) { 2252 if (abuf != NULL) 2253 dmu_return_arcbuf(abuf); 2254 dmu_buf_rele(db, FTAG); 2255 ztest_range_unlock(rl); 2256 ztest_object_unlock(zd, lr->lr_foid); 2257 return (ENOSPC); 2258 } 2259 2260 if (bt != NULL) { 2261 /* 2262 * Usually, verify the old data before writing new data -- 2263 * but not always, because we also want to verify correct 2264 * behavior when the data was not recently read into cache. 2265 */ 2266 ASSERT(doi.doi_data_block_size); 2267 ASSERT0(offset % doi.doi_data_block_size); 2268 if (ztest_random(4) != 0) { 2269 dmu_flags_t flags = ztest_random(2) ? 2270 DMU_READ_PREFETCH : DMU_READ_NO_PREFETCH; 2271 2272 /* 2273 * We will randomly set when to do O_DIRECT on a read. 2274 */ 2275 if (ztest_random(4) == 0) 2276 flags |= DMU_DIRECTIO; 2277 2278 ztest_block_tag_t rbt; 2279 2280 VERIFY(dmu_read(os, lr->lr_foid, offset, 2281 sizeof (rbt), &rbt, flags) == 0); 2282 if (rbt.bt_magic == BT_MAGIC) { 2283 ztest_bt_verify(&rbt, os, lr->lr_foid, 0, 2284 offset, gen, txg, crtxg); 2285 } 2286 } 2287 2288 /* 2289 * Writes can appear to be newer than the bonus buffer because 2290 * the ztest_get_data() callback does a dmu_read() of the 2291 * open-context data, which may be different than the data 2292 * as it was when the write was generated. 2293 */ 2294 if (zd->zd_zilog->zl_replay) { 2295 ztest_bt_verify(bt, os, lr->lr_foid, 0, offset, 2296 MAX(gen, bt->bt_gen), MAX(txg, lrtxg), 2297 bt->bt_crtxg); 2298 } 2299 2300 /* 2301 * Set the bt's gen/txg to the bonus buffer's gen/txg 2302 * so that all of the usual ASSERTs will work. 2303 */ 2304 ztest_bt_generate(bt, os, lr->lr_foid, 0, offset, gen, txg, 2305 crtxg); 2306 } 2307 2308 if (abuf == NULL) { 2309 dmu_write(os, lr->lr_foid, offset, length, data, tx); 2310 } else { 2311 memcpy(abuf->b_data, data, length); 2312 VERIFY0(dmu_assign_arcbuf_by_dbuf(db, offset, abuf, tx, 0)); 2313 } 2314 2315 (void) ztest_log_write(zd, tx, lr); 2316 2317 dmu_buf_rele(db, FTAG); 2318 2319 dmu_tx_commit(tx); 2320 2321 ztest_range_unlock(rl); 2322 ztest_object_unlock(zd, lr->lr_foid); 2323 2324 return (0); 2325 } 2326 2327 static int 2328 ztest_replay_truncate(void *arg1, void *arg2, boolean_t byteswap) 2329 { 2330 ztest_ds_t *zd = arg1; 2331 lr_truncate_t *lr = arg2; 2332 objset_t *os = zd->zd_os; 2333 dmu_tx_t *tx; 2334 uint64_t txg; 2335 rl_t *rl; 2336 2337 if (byteswap) 2338 byteswap_uint64_array(lr, sizeof (*lr)); 2339 2340 ztest_object_lock(zd, lr->lr_foid, ZTRL_READER); 2341 rl = ztest_range_lock(zd, lr->lr_foid, lr->lr_offset, lr->lr_length, 2342 ZTRL_WRITER); 2343 2344 tx = dmu_tx_create(os); 2345 2346 dmu_tx_hold_free(tx, lr->lr_foid, lr->lr_offset, lr->lr_length); 2347 2348 txg = ztest_tx_assign(tx, DMU_TX_WAIT, FTAG); 2349 if (txg == 0) { 2350 ztest_range_unlock(rl); 2351 ztest_object_unlock(zd, lr->lr_foid); 2352 return (ENOSPC); 2353 } 2354 2355 VERIFY0(dmu_free_range(os, lr->lr_foid, lr->lr_offset, 2356 lr->lr_length, tx)); 2357 2358 (void) ztest_log_truncate(zd, tx, lr); 2359 2360 dmu_tx_commit(tx); 2361 2362 ztest_range_unlock(rl); 2363 ztest_object_unlock(zd, lr->lr_foid); 2364 2365 return (0); 2366 } 2367 2368 static int 2369 ztest_replay_setattr(void *arg1, void *arg2, boolean_t byteswap) 2370 { 2371 ztest_ds_t *zd = arg1; 2372 lr_setattr_t *lr = arg2; 2373 objset_t *os = zd->zd_os; 2374 dmu_tx_t *tx; 2375 dmu_buf_t *db; 2376 ztest_block_tag_t *bbt; 2377 uint64_t txg, lrtxg, crtxg, 
dnodesize; 2378 2379 if (byteswap) 2380 byteswap_uint64_array(lr, sizeof (*lr)); 2381 2382 ztest_object_lock(zd, lr->lr_foid, ZTRL_WRITER); 2383 2384 VERIFY0(dmu_bonus_hold(os, lr->lr_foid, FTAG, &db)); 2385 2386 tx = dmu_tx_create(os); 2387 dmu_tx_hold_bonus(tx, lr->lr_foid); 2388 2389 txg = ztest_tx_assign(tx, DMU_TX_WAIT, FTAG); 2390 if (txg == 0) { 2391 dmu_buf_rele(db, FTAG); 2392 ztest_object_unlock(zd, lr->lr_foid); 2393 return (ENOSPC); 2394 } 2395 2396 bbt = ztest_bt_bonus(db); 2397 ASSERT3U(bbt->bt_magic, ==, BT_MAGIC); 2398 crtxg = bbt->bt_crtxg; 2399 lrtxg = lr->lr_common.lrc_txg; 2400 dnodesize = bbt->bt_dnodesize; 2401 2402 if (zd->zd_zilog->zl_replay) { 2403 ASSERT3U(lr->lr_size, !=, 0); 2404 ASSERT3U(lr->lr_mode, !=, 0); 2405 ASSERT3U(lrtxg, !=, 0); 2406 } else { 2407 /* 2408 * Randomly change the size and increment the generation. 2409 */ 2410 lr->lr_size = (ztest_random(db->db_size / sizeof (*bbt)) + 1) * 2411 sizeof (*bbt); 2412 lr->lr_mode = bbt->bt_gen + 1; 2413 ASSERT0(lrtxg); 2414 } 2415 2416 /* 2417 * Verify that the current bonus buffer is not newer than our txg. 2418 */ 2419 ztest_bt_verify(bbt, os, lr->lr_foid, dnodesize, -1ULL, lr->lr_mode, 2420 MAX(txg, lrtxg), crtxg); 2421 2422 dmu_buf_will_dirty(db, tx); 2423 2424 ASSERT3U(lr->lr_size, >=, sizeof (*bbt)); 2425 ASSERT3U(lr->lr_size, <=, db->db_size); 2426 VERIFY0(dmu_set_bonus(db, lr->lr_size, tx)); 2427 bbt = ztest_bt_bonus(db); 2428 2429 ztest_bt_generate(bbt, os, lr->lr_foid, dnodesize, -1ULL, lr->lr_mode, 2430 txg, crtxg); 2431 ztest_fill_unused_bonus(db, bbt, lr->lr_foid, os, bbt->bt_gen); 2432 dmu_buf_rele(db, FTAG); 2433 2434 (void) ztest_log_setattr(zd, tx, lr); 2435 2436 dmu_tx_commit(tx); 2437 2438 ztest_object_unlock(zd, lr->lr_foid); 2439 2440 return (0); 2441 } 2442 2443 static zil_replay_func_t *ztest_replay_vector[TX_MAX_TYPE] = { 2444 NULL, /* 0 no such transaction type */ 2445 ztest_replay_create, /* TX_CREATE */ 2446 NULL, /* TX_MKDIR */ 2447 NULL, /* TX_MKXATTR */ 2448 NULL, /* TX_SYMLINK */ 2449 ztest_replay_remove, /* TX_REMOVE */ 2450 NULL, /* TX_RMDIR */ 2451 NULL, /* TX_LINK */ 2452 NULL, /* TX_RENAME */ 2453 ztest_replay_write, /* TX_WRITE */ 2454 ztest_replay_truncate, /* TX_TRUNCATE */ 2455 ztest_replay_setattr, /* TX_SETATTR */ 2456 NULL, /* TX_ACL */ 2457 NULL, /* TX_CREATE_ACL */ 2458 NULL, /* TX_CREATE_ATTR */ 2459 NULL, /* TX_CREATE_ACL_ATTR */ 2460 NULL, /* TX_MKDIR_ACL */ 2461 NULL, /* TX_MKDIR_ATTR */ 2462 NULL, /* TX_MKDIR_ACL_ATTR */ 2463 NULL, /* TX_WRITE2 */ 2464 NULL, /* TX_SETSAXATTR */ 2465 NULL, /* TX_RENAME_EXCHANGE */ 2466 NULL, /* TX_RENAME_WHITEOUT */ 2467 }; 2468 2469 /* 2470 * ZIL get_data callbacks 2471 */ 2472 2473 static void 2474 ztest_get_done(zgd_t *zgd, int error) 2475 { 2476 (void) error; 2477 ztest_ds_t *zd = zgd->zgd_private; 2478 uint64_t object = ((rl_t *)zgd->zgd_lr)->rl_object; 2479 2480 if (zgd->zgd_db) 2481 dmu_buf_rele(zgd->zgd_db, zgd); 2482 2483 ztest_range_unlock((rl_t *)zgd->zgd_lr); 2484 ztest_object_unlock(zd, object); 2485 2486 umem_free(zgd, sizeof (*zgd)); 2487 } 2488 2489 static int 2490 ztest_get_data(void *arg, uint64_t arg2, lr_write_t *lr, char *buf, 2491 struct lwb *lwb, zio_t *zio) 2492 { 2493 (void) arg2; 2494 ztest_ds_t *zd = arg; 2495 objset_t *os = zd->zd_os; 2496 uint64_t object = lr->lr_foid; 2497 uint64_t offset = lr->lr_offset; 2498 uint64_t size = lr->lr_length; 2499 uint64_t txg = lr->lr_common.lrc_txg; 2500 uint64_t crtxg; 2501 dmu_object_info_t doi; 2502 dmu_buf_t *db; 2503 zgd_t *zgd; 2504 int error; 2505 2506 
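	/*
	 * The ZIL invokes this callback in two ways: with a non-NULL
	 * 'buf' it wants the data copied directly into the log record
	 * (the WR_NEED_COPY case); with buf == NULL it wants the block
	 * written via dmu_sync() so the record can reference it by
	 * block pointer (the WR_INDIRECT case).
	 */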
ASSERT3P(lwb, !=, NULL); 2507 ASSERT3U(size, !=, 0); 2508 2509 ztest_object_lock(zd, object, ZTRL_READER); 2510 error = dmu_bonus_hold(os, object, FTAG, &db); 2511 if (error) { 2512 ztest_object_unlock(zd, object); 2513 return (error); 2514 } 2515 2516 crtxg = ztest_bt_bonus(db)->bt_crtxg; 2517 2518 if (crtxg == 0 || crtxg > txg) { 2519 dmu_buf_rele(db, FTAG); 2520 ztest_object_unlock(zd, object); 2521 return (ENOENT); 2522 } 2523 2524 dmu_object_info_from_db(db, &doi); 2525 dmu_buf_rele(db, FTAG); 2526 db = NULL; 2527 2528 zgd = umem_zalloc(sizeof (*zgd), UMEM_NOFAIL); 2529 zgd->zgd_lwb = lwb; 2530 zgd->zgd_private = zd; 2531 2532 if (buf != NULL) { /* immediate write */ 2533 zgd->zgd_lr = (struct zfs_locked_range *)ztest_range_lock(zd, 2534 object, offset, size, ZTRL_READER); 2535 2536 error = dmu_read(os, object, offset, size, buf, 2537 DMU_READ_NO_PREFETCH | DMU_KEEP_CACHING); 2538 ASSERT0(error); 2539 } else { 2540 ASSERT3P(zio, !=, NULL); 2541 size = doi.doi_data_block_size; 2542 if (ISP2(size)) { 2543 offset = P2ALIGN_TYPED(offset, size, uint64_t); 2544 } else { 2545 ASSERT3U(offset, <, size); 2546 offset = 0; 2547 } 2548 2549 zgd->zgd_lr = (struct zfs_locked_range *)ztest_range_lock(zd, 2550 object, offset, size, ZTRL_READER); 2551 2552 error = dmu_buf_hold_noread(os, object, offset, zgd, &db); 2553 if (error == 0) { 2554 blkptr_t *bp = &lr->lr_blkptr; 2555 2556 zgd->zgd_db = db; 2557 zgd->zgd_bp = bp; 2558 2559 ASSERT3U(db->db_offset, ==, offset); 2560 ASSERT3U(db->db_size, ==, size); 2561 2562 error = dmu_sync(zio, lr->lr_common.lrc_txg, 2563 ztest_get_done, zgd); 2564 2565 if (error == 0) 2566 return (0); 2567 } 2568 } 2569 2570 ztest_get_done(zgd, error); 2571 2572 return (error); 2573 } 2574 2575 static void * 2576 ztest_lr_alloc(size_t lrsize, char *name) 2577 { 2578 char *lr; 2579 size_t namesize = name ? strlen(name) + 1 : 0; 2580 2581 lr = umem_zalloc(lrsize + namesize, UMEM_NOFAIL); 2582 2583 if (name) 2584 memcpy(lr + lrsize, name, namesize); 2585 2586 return (lr); 2587 } 2588 2589 static void 2590 ztest_lr_free(void *lr, size_t lrsize, char *name) 2591 { 2592 size_t namesize = name ? strlen(name) + 1 : 0; 2593 2594 umem_free(lr, lrsize + namesize); 2595 } 2596 2597 /* 2598 * Lookup a bunch of objects. Returns the number of objects not found. 
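 * For each object that is found, od_object, od_type, od_blocksize and
 * od_gen are refreshed from the on-disk dnode and its bonus block tag;
 * any missing objects must form a contiguous tail of the array.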
2599 */ 2600 static int 2601 ztest_lookup(ztest_ds_t *zd, ztest_od_t *od, int count) 2602 { 2603 int missing = 0; 2604 int error; 2605 int i; 2606 2607 ASSERT(MUTEX_HELD(&zd->zd_dirobj_lock)); 2608 2609 for (i = 0; i < count; i++, od++) { 2610 od->od_object = 0; 2611 error = zap_lookup(zd->zd_os, od->od_dir, od->od_name, 2612 sizeof (uint64_t), 1, &od->od_object); 2613 if (error) { 2614 ASSERT3S(error, ==, ENOENT); 2615 ASSERT0(od->od_object); 2616 missing++; 2617 } else { 2618 dmu_buf_t *db; 2619 ztest_block_tag_t *bbt; 2620 dmu_object_info_t doi; 2621 2622 ASSERT3U(od->od_object, !=, 0); 2623 ASSERT0(missing); /* there should be no gaps */ 2624 2625 ztest_object_lock(zd, od->od_object, ZTRL_READER); 2626 VERIFY0(dmu_bonus_hold(zd->zd_os, od->od_object, 2627 FTAG, &db)); 2628 dmu_object_info_from_db(db, &doi); 2629 bbt = ztest_bt_bonus(db); 2630 ASSERT3U(bbt->bt_magic, ==, BT_MAGIC); 2631 od->od_type = doi.doi_type; 2632 od->od_blocksize = doi.doi_data_block_size; 2633 od->od_gen = bbt->bt_gen; 2634 dmu_buf_rele(db, FTAG); 2635 ztest_object_unlock(zd, od->od_object); 2636 } 2637 } 2638 2639 return (missing); 2640 } 2641 2642 static int 2643 ztest_create(ztest_ds_t *zd, ztest_od_t *od, int count) 2644 { 2645 int missing = 0; 2646 int i; 2647 2648 ASSERT(MUTEX_HELD(&zd->zd_dirobj_lock)); 2649 2650 for (i = 0; i < count; i++, od++) { 2651 if (missing) { 2652 od->od_object = 0; 2653 missing++; 2654 continue; 2655 } 2656 2657 lr_create_t *lrc = ztest_lr_alloc(sizeof (*lrc), od->od_name); 2658 _lr_create_t *lr = &lrc->lr_create; 2659 2660 lr->lr_doid = od->od_dir; 2661 lr->lr_foid = 0; /* 0 to allocate, > 0 to claim */ 2662 lr->lrz_type = od->od_crtype; 2663 lr->lrz_blocksize = od->od_crblocksize; 2664 lr->lrz_ibshift = ztest_random_ibshift(); 2665 lr->lrz_bonustype = DMU_OT_UINT64_OTHER; 2666 lr->lrz_dnodesize = od->od_crdnodesize; 2667 lr->lr_gen = od->od_crgen; 2668 lr->lr_crtime[0] = time(NULL); 2669 2670 if (ztest_replay_create(zd, lr, B_FALSE) != 0) { 2671 ASSERT0(missing); 2672 od->od_object = 0; 2673 missing++; 2674 } else { 2675 od->od_object = lr->lr_foid; 2676 od->od_type = od->od_crtype; 2677 od->od_blocksize = od->od_crblocksize; 2678 od->od_gen = od->od_crgen; 2679 ASSERT3U(od->od_object, !=, 0); 2680 } 2681 2682 ztest_lr_free(lr, sizeof (*lr), od->od_name); 2683 } 2684 2685 return (missing); 2686 } 2687 2688 static int 2689 ztest_remove(ztest_ds_t *zd, ztest_od_t *od, int count) 2690 { 2691 int missing = 0; 2692 int error; 2693 int i; 2694 2695 ASSERT(MUTEX_HELD(&zd->zd_dirobj_lock)); 2696 2697 od += count - 1; 2698 2699 for (i = count - 1; i >= 0; i--, od--) { 2700 if (missing) { 2701 missing++; 2702 continue; 2703 } 2704 2705 /* 2706 * No object was found. 
2707 */ 2708 if (od->od_object == 0) 2709 continue; 2710 2711 lr_remove_t *lr = ztest_lr_alloc(sizeof (*lr), od->od_name); 2712 2713 lr->lr_doid = od->od_dir; 2714 2715 if ((error = ztest_replay_remove(zd, lr, B_FALSE)) != 0) { 2716 ASSERT3U(error, ==, ENOSPC); 2717 missing++; 2718 } else { 2719 od->od_object = 0; 2720 } 2721 ztest_lr_free(lr, sizeof (*lr), od->od_name); 2722 } 2723 2724 return (missing); 2725 } 2726 2727 static int 2728 ztest_write(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size, 2729 const void *data) 2730 { 2731 lr_write_t *lr; 2732 int error; 2733 2734 lr = ztest_lr_alloc(sizeof (*lr) + size, NULL); 2735 2736 lr->lr_foid = object; 2737 lr->lr_offset = offset; 2738 lr->lr_length = size; 2739 lr->lr_blkoff = 0; 2740 BP_ZERO(&lr->lr_blkptr); 2741 2742 memcpy(&lr->lr_data[0], data, size); 2743 2744 error = ztest_replay_write(zd, lr, B_FALSE); 2745 2746 ztest_lr_free(lr, sizeof (*lr) + size, NULL); 2747 2748 return (error); 2749 } 2750 2751 static int 2752 ztest_truncate(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size) 2753 { 2754 lr_truncate_t *lr; 2755 int error; 2756 2757 lr = ztest_lr_alloc(sizeof (*lr), NULL); 2758 2759 lr->lr_foid = object; 2760 lr->lr_offset = offset; 2761 lr->lr_length = size; 2762 2763 error = ztest_replay_truncate(zd, lr, B_FALSE); 2764 2765 ztest_lr_free(lr, sizeof (*lr), NULL); 2766 2767 return (error); 2768 } 2769 2770 static int 2771 ztest_setattr(ztest_ds_t *zd, uint64_t object) 2772 { 2773 lr_setattr_t *lr; 2774 int error; 2775 2776 lr = ztest_lr_alloc(sizeof (*lr), NULL); 2777 2778 lr->lr_foid = object; 2779 lr->lr_size = 0; 2780 lr->lr_mode = 0; 2781 2782 error = ztest_replay_setattr(zd, lr, B_FALSE); 2783 2784 ztest_lr_free(lr, sizeof (*lr), NULL); 2785 2786 return (error); 2787 } 2788 2789 static void 2790 ztest_prealloc(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size) 2791 { 2792 objset_t *os = zd->zd_os; 2793 dmu_tx_t *tx; 2794 uint64_t txg; 2795 rl_t *rl; 2796 2797 txg_wait_synced(dmu_objset_pool(os), 0); 2798 2799 ztest_object_lock(zd, object, ZTRL_READER); 2800 rl = ztest_range_lock(zd, object, offset, size, ZTRL_WRITER); 2801 2802 tx = dmu_tx_create(os); 2803 2804 dmu_tx_hold_write(tx, object, offset, size); 2805 2806 txg = ztest_tx_assign(tx, DMU_TX_WAIT, FTAG); 2807 2808 if (txg != 0) { 2809 dmu_prealloc(os, object, offset, size, tx); 2810 dmu_tx_commit(tx); 2811 txg_wait_synced(dmu_objset_pool(os), txg); 2812 } else { 2813 (void) dmu_free_long_range(os, object, offset, size); 2814 } 2815 2816 ztest_range_unlock(rl); 2817 ztest_object_unlock(zd, object); 2818 } 2819 2820 static void 2821 ztest_io(ztest_ds_t *zd, uint64_t object, uint64_t offset) 2822 { 2823 int err; 2824 ztest_block_tag_t wbt; 2825 dmu_object_info_t doi; 2826 enum ztest_io_type io_type; 2827 uint64_t blocksize; 2828 void *data; 2829 dmu_flags_t dmu_read_flags = DMU_READ_NO_PREFETCH; 2830 2831 /* 2832 * We will randomly set when to do O_DIRECT on a read. 2833 */ 2834 if (ztest_random(4) == 0) 2835 dmu_read_flags |= DMU_DIRECTIO; 2836 2837 VERIFY0(dmu_object_info(zd->zd_os, object, &doi)); 2838 blocksize = doi.doi_data_block_size; 2839 data = umem_alloc(blocksize, UMEM_NOFAIL); 2840 2841 /* 2842 * Pick an i/o type at random, biased toward writing block tags. 
2843 */ 2844 io_type = ztest_random(ZTEST_IO_TYPES); 2845 if (ztest_random(2) == 0) 2846 io_type = ZTEST_IO_WRITE_TAG; 2847 2848 (void) pthread_rwlock_rdlock(&zd->zd_zilog_lock); 2849 2850 switch (io_type) { 2851 2852 case ZTEST_IO_WRITE_TAG: 2853 ztest_bt_generate(&wbt, zd->zd_os, object, doi.doi_dnodesize, 2854 offset, 0, 0, 0); 2855 (void) ztest_write(zd, object, offset, sizeof (wbt), &wbt); 2856 break; 2857 2858 case ZTEST_IO_WRITE_PATTERN: 2859 (void) memset(data, 'a' + (object + offset) % 5, blocksize); 2860 if (ztest_random(2) == 0) { 2861 /* 2862 * Induce fletcher2 collisions to ensure that 2863 * zio_ddt_collision() detects and resolves them 2864 * when using fletcher2-verify for deduplication. 2865 */ 2866 ((uint64_t *)data)[0] ^= 1ULL << 63; 2867 ((uint64_t *)data)[4] ^= 1ULL << 63; 2868 } 2869 (void) ztest_write(zd, object, offset, blocksize, data); 2870 break; 2871 2872 case ZTEST_IO_WRITE_ZEROES: 2873 memset(data, 0, blocksize); 2874 (void) ztest_write(zd, object, offset, blocksize, data); 2875 break; 2876 2877 case ZTEST_IO_TRUNCATE: 2878 (void) ztest_truncate(zd, object, offset, blocksize); 2879 break; 2880 2881 case ZTEST_IO_SETATTR: 2882 (void) ztest_setattr(zd, object); 2883 break; 2884 default: 2885 break; 2886 2887 case ZTEST_IO_REWRITE: 2888 (void) pthread_rwlock_rdlock(&ztest_name_lock); 2889 err = ztest_dsl_prop_set_uint64(zd->zd_name, 2890 ZFS_PROP_CHECKSUM, spa_dedup_checksum(ztest_spa), 2891 B_FALSE); 2892 ASSERT(err == 0 || err == ENOSPC); 2893 err = ztest_dsl_prop_set_uint64(zd->zd_name, 2894 ZFS_PROP_COMPRESSION, 2895 ztest_random_dsl_prop(ZFS_PROP_COMPRESSION), 2896 B_FALSE); 2897 ASSERT(err == 0 || err == ENOSPC); 2898 (void) pthread_rwlock_unlock(&ztest_name_lock); 2899 2900 VERIFY0(dmu_read(zd->zd_os, object, offset, blocksize, data, 2901 dmu_read_flags)); 2902 2903 (void) ztest_write(zd, object, offset, blocksize, data); 2904 break; 2905 } 2906 2907 (void) pthread_rwlock_unlock(&zd->zd_zilog_lock); 2908 2909 umem_free(data, blocksize); 2910 } 2911 2912 /* 2913 * Initialize an object description template. 2914 */ 2915 static void 2916 ztest_od_init(ztest_od_t *od, uint64_t id, const char *tag, uint64_t index, 2917 dmu_object_type_t type, uint64_t blocksize, uint64_t dnodesize, 2918 uint64_t gen) 2919 { 2920 od->od_dir = ZTEST_DIROBJ; 2921 od->od_object = 0; 2922 2923 od->od_crtype = type; 2924 od->od_crblocksize = blocksize ? blocksize : ztest_random_blocksize(); 2925 od->od_crdnodesize = dnodesize ? dnodesize : ztest_random_dnodesize(); 2926 od->od_crgen = gen; 2927 2928 od->od_type = DMU_OT_NONE; 2929 od->od_blocksize = 0; 2930 od->od_gen = 0; 2931 2932 (void) snprintf(od->od_name, sizeof (od->od_name), 2933 "%s(%"PRId64")[%"PRIu64"]", 2934 tag, id, index); 2935 } 2936 2937 /* 2938 * Lookup or create the objects for a test using the od template. 2939 * If the objects do not all exist, or if 'remove' is specified, 2940 * remove any existing objects and create new ones. Otherwise, 2941 * use the existing objects. 
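 * Returns 0 on success and -1 if the existing objects could not all be
 * removed and recreated (for example, when object creation hits ENOSPC).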
2942 */ 2943 static int 2944 ztest_object_init(ztest_ds_t *zd, ztest_od_t *od, size_t size, boolean_t remove) 2945 { 2946 int count = size / sizeof (*od); 2947 int rv = 0; 2948 2949 mutex_enter(&zd->zd_dirobj_lock); 2950 if ((ztest_lookup(zd, od, count) != 0 || remove) && 2951 (ztest_remove(zd, od, count) != 0 || 2952 ztest_create(zd, od, count) != 0)) 2953 rv = -1; 2954 zd->zd_od = od; 2955 mutex_exit(&zd->zd_dirobj_lock); 2956 2957 return (rv); 2958 } 2959 2960 void 2961 ztest_zil_commit(ztest_ds_t *zd, uint64_t id) 2962 { 2963 (void) id; 2964 zilog_t *zilog = zd->zd_zilog; 2965 2966 (void) pthread_rwlock_rdlock(&zd->zd_zilog_lock); 2967 2968 zil_commit(zilog, ztest_random(ZTEST_OBJECTS)); 2969 2970 /* 2971 * Remember the committed values in zd, which is in parent/child 2972 * shared memory. If we die, the next iteration of ztest_run() 2973 * will verify that the log really does contain this record. 2974 */ 2975 mutex_enter(&zilog->zl_lock); 2976 ASSERT3P(zd->zd_shared, !=, NULL); 2977 ASSERT3U(zd->zd_shared->zd_seq, <=, zilog->zl_commit_lr_seq); 2978 zd->zd_shared->zd_seq = zilog->zl_commit_lr_seq; 2979 mutex_exit(&zilog->zl_lock); 2980 2981 (void) pthread_rwlock_unlock(&zd->zd_zilog_lock); 2982 } 2983 2984 /* 2985 * This function is designed to simulate the operations that occur during a 2986 * mount/unmount operation. We hold the dataset across these operations in an 2987 * attempt to expose any implicit assumptions about ZIL management. 2988 */ 2989 void 2990 ztest_zil_remount(ztest_ds_t *zd, uint64_t id) 2991 { 2992 (void) id; 2993 objset_t *os = zd->zd_os; 2994 2995 /* 2996 * We hold the ztest_vdev_lock so we don't cause problems with 2997 * other threads that wish to remove a log device, such as 2998 * ztest_device_removal(). 2999 */ 3000 mutex_enter(&ztest_vdev_lock); 3001 3002 /* 3003 * We grab the zd_dirobj_lock to ensure that no other thread is 3004 * updating the zil (i.e. adding in-memory log records) and the 3005 * zd_zilog_lock to block any I/O. 3006 */ 3007 mutex_enter(&zd->zd_dirobj_lock); 3008 (void) pthread_rwlock_wrlock(&zd->zd_zilog_lock); 3009 3010 /* zfsvfs_teardown() */ 3011 zil_close(zd->zd_zilog); 3012 3013 /* zfsvfs_setup() */ 3014 VERIFY3P(zil_open(os, ztest_get_data, NULL), ==, zd->zd_zilog); 3015 zil_replay(os, zd, ztest_replay_vector); 3016 3017 (void) pthread_rwlock_unlock(&zd->zd_zilog_lock); 3018 mutex_exit(&zd->zd_dirobj_lock); 3019 mutex_exit(&ztest_vdev_lock); 3020 } 3021 3022 /* 3023 * Verify that we can't destroy an active pool, create an existing pool, 3024 * or create a pool with a bad vdev spec. 3025 */ 3026 void 3027 ztest_spa_create_destroy(ztest_ds_t *zd, uint64_t id) 3028 { 3029 (void) zd, (void) id; 3030 ztest_shared_opts_t *zo = &ztest_opts; 3031 spa_t *spa; 3032 nvlist_t *nvroot; 3033 3034 if (zo->zo_mmp_test) 3035 return; 3036 3037 /* 3038 * Attempt to create using a bad file. 3039 */ 3040 nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, NULL, 0, 0, 1); 3041 VERIFY3U(ENOENT, ==, 3042 spa_create("ztest_bad_file", nvroot, NULL, NULL, NULL)); 3043 fnvlist_free(nvroot); 3044 3045 /* 3046 * Attempt to create using a bad mirror. 3047 */ 3048 nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, NULL, 0, 2, 1); 3049 VERIFY3U(ENOENT, ==, 3050 spa_create("ztest_bad_mirror", nvroot, NULL, NULL, NULL)); 3051 fnvlist_free(nvroot); 3052 3053 /* 3054 * Attempt to create an existing pool. It shouldn't matter 3055 * what's in the nvroot; we should fail with EEXIST. 
3056 */ 3057 (void) pthread_rwlock_rdlock(&ztest_name_lock); 3058 nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, NULL, 0, 0, 1); 3059 VERIFY3U(EEXIST, ==, 3060 spa_create(zo->zo_pool, nvroot, NULL, NULL, NULL)); 3061 fnvlist_free(nvroot); 3062 3063 /* 3064 * We open a reference to the spa and then we try to export it 3065 * expecting one of the following errors: 3066 * 3067 * EBUSY 3068 * Because of the reference we just opened. 3069 * 3070 * ZFS_ERR_EXPORT_IN_PROGRESS 3071 * For the case that there is another ztest thread doing 3072 * an export concurrently. 3073 */ 3074 VERIFY0(spa_open(zo->zo_pool, &spa, FTAG)); 3075 int error = spa_destroy(zo->zo_pool); 3076 if (error != EBUSY && error != ZFS_ERR_EXPORT_IN_PROGRESS) { 3077 fatal(B_FALSE, "spa_destroy(%s) returned unexpected value %d", 3078 spa->spa_name, error); 3079 } 3080 spa_close(spa, FTAG); 3081 3082 (void) pthread_rwlock_unlock(&ztest_name_lock); 3083 } 3084 3085 /* 3086 * Start and then stop the MMP threads to ensure the startup and shutdown code 3087 * works properly. Actual protection and property-related code tested via ZTS. 3088 */ 3089 void 3090 ztest_mmp_enable_disable(ztest_ds_t *zd, uint64_t id) 3091 { 3092 (void) zd, (void) id; 3093 ztest_shared_opts_t *zo = &ztest_opts; 3094 spa_t *spa = ztest_spa; 3095 3096 if (zo->zo_mmp_test) 3097 return; 3098 3099 /* 3100 * Since enabling MMP involves setting a property, it could not be done 3101 * while the pool is suspended. 3102 */ 3103 if (spa_suspended(spa)) 3104 return; 3105 3106 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 3107 mutex_enter(&spa->spa_props_lock); 3108 3109 zfs_multihost_fail_intervals = 0; 3110 3111 if (!spa_multihost(spa)) { 3112 spa->spa_multihost = B_TRUE; 3113 mmp_thread_start(spa); 3114 } 3115 3116 mutex_exit(&spa->spa_props_lock); 3117 spa_config_exit(spa, SCL_CONFIG, FTAG); 3118 3119 txg_wait_synced(spa_get_dsl(spa), 0); 3120 mmp_signal_all_threads(); 3121 txg_wait_synced(spa_get_dsl(spa), 0); 3122 3123 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 3124 mutex_enter(&spa->spa_props_lock); 3125 3126 if (spa_multihost(spa)) { 3127 mmp_thread_stop(spa); 3128 spa->spa_multihost = B_FALSE; 3129 } 3130 3131 mutex_exit(&spa->spa_props_lock); 3132 spa_config_exit(spa, SCL_CONFIG, FTAG); 3133 } 3134 3135 static int 3136 ztest_get_raidz_children(spa_t *spa) 3137 { 3138 (void) spa; 3139 vdev_t *raidvd; 3140 3141 ASSERT(MUTEX_HELD(&ztest_vdev_lock)); 3142 3143 if (ztest_opts.zo_raid_do_expand) { 3144 raidvd = ztest_spa->spa_root_vdev->vdev_child[0]; 3145 3146 ASSERT(raidvd->vdev_ops == &vdev_raidz_ops); 3147 3148 return (raidvd->vdev_children); 3149 } 3150 3151 return (ztest_opts.zo_raid_children); 3152 } 3153 3154 void 3155 ztest_spa_upgrade(ztest_ds_t *zd, uint64_t id) 3156 { 3157 (void) zd, (void) id; 3158 spa_t *spa; 3159 uint64_t initial_version = SPA_VERSION_INITIAL; 3160 uint64_t raidz_children, version, newversion; 3161 nvlist_t *nvroot, *props; 3162 char *name; 3163 3164 if (ztest_opts.zo_mmp_test) 3165 return; 3166 3167 /* dRAID added after feature flags, skip upgrade test. */ 3168 if (strcmp(ztest_opts.zo_raid_type, VDEV_TYPE_DRAID) == 0) 3169 return; 3170 3171 mutex_enter(&ztest_vdev_lock); 3172 name = kmem_asprintf("%s_upgrade", ztest_opts.zo_pool); 3173 3174 /* 3175 * Clean up from previous runs. 
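	 * A prior iteration may have left a "<pool>_upgrade" pool behind,
	 * so destroy it (ignoring any error) before creating a new one.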
3176 */ 3177 (void) spa_destroy(name); 3178 3179 raidz_children = ztest_get_raidz_children(ztest_spa); 3180 3181 nvroot = make_vdev_root(NULL, NULL, name, ztest_opts.zo_vdev_size, 0, 3182 NULL, raidz_children, ztest_opts.zo_mirrors, 1); 3183 3184 /* 3185 * If we're configuring a RAIDZ device then make sure that the 3186 * initial version is capable of supporting that feature. 3187 */ 3188 switch (ztest_opts.zo_raid_parity) { 3189 case 0: 3190 case 1: 3191 initial_version = SPA_VERSION_INITIAL; 3192 break; 3193 case 2: 3194 initial_version = SPA_VERSION_RAIDZ2; 3195 break; 3196 case 3: 3197 initial_version = SPA_VERSION_RAIDZ3; 3198 break; 3199 } 3200 3201 /* 3202 * Create a pool with a spa version that can be upgraded. Pick 3203 * a value between initial_version and SPA_VERSION_BEFORE_FEATURES. 3204 */ 3205 do { 3206 version = ztest_random_spa_version(initial_version); 3207 } while (version > SPA_VERSION_BEFORE_FEATURES); 3208 3209 props = fnvlist_alloc(); 3210 fnvlist_add_uint64(props, 3211 zpool_prop_to_name(ZPOOL_PROP_VERSION), version); 3212 VERIFY0(spa_create(name, nvroot, props, NULL, NULL)); 3213 fnvlist_free(nvroot); 3214 fnvlist_free(props); 3215 3216 VERIFY0(spa_open(name, &spa, FTAG)); 3217 VERIFY3U(spa_version(spa), ==, version); 3218 newversion = ztest_random_spa_version(version + 1); 3219 3220 if (ztest_opts.zo_verbose >= 4) { 3221 (void) printf("upgrading spa version from " 3222 "%"PRIu64" to %"PRIu64"\n", 3223 version, newversion); 3224 } 3225 3226 spa_upgrade(spa, newversion); 3227 VERIFY3U(spa_version(spa), >, version); 3228 VERIFY3U(spa_version(spa), ==, fnvlist_lookup_uint64(spa->spa_config, 3229 zpool_prop_to_name(ZPOOL_PROP_VERSION))); 3230 spa_close(spa, FTAG); 3231 3232 kmem_strfree(name); 3233 mutex_exit(&ztest_vdev_lock); 3234 } 3235 3236 static void 3237 ztest_spa_checkpoint(spa_t *spa) 3238 { 3239 ASSERT(MUTEX_HELD(&ztest_checkpoint_lock)); 3240 3241 int error = spa_checkpoint(spa->spa_name); 3242 3243 switch (error) { 3244 case 0: 3245 case ZFS_ERR_DEVRM_IN_PROGRESS: 3246 case ZFS_ERR_DISCARDING_CHECKPOINT: 3247 case ZFS_ERR_CHECKPOINT_EXISTS: 3248 case ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS: 3249 break; 3250 case ENOSPC: 3251 ztest_record_enospc(FTAG); 3252 break; 3253 default: 3254 fatal(B_FALSE, "spa_checkpoint(%s) = %d", spa->spa_name, error); 3255 } 3256 } 3257 3258 static void 3259 ztest_spa_discard_checkpoint(spa_t *spa) 3260 { 3261 ASSERT(MUTEX_HELD(&ztest_checkpoint_lock)); 3262 3263 int error = spa_checkpoint_discard(spa->spa_name); 3264 3265 switch (error) { 3266 case 0: 3267 case ZFS_ERR_DISCARDING_CHECKPOINT: 3268 case ZFS_ERR_NO_CHECKPOINT: 3269 break; 3270 default: 3271 fatal(B_FALSE, "spa_discard_checkpoint(%s) = %d", 3272 spa->spa_name, error); 3273 } 3274 3275 } 3276 3277 void 3278 ztest_spa_checkpoint_create_discard(ztest_ds_t *zd, uint64_t id) 3279 { 3280 (void) zd, (void) id; 3281 spa_t *spa = ztest_spa; 3282 3283 mutex_enter(&ztest_checkpoint_lock); 3284 if (ztest_random(2) == 0) { 3285 ztest_spa_checkpoint(spa); 3286 } else { 3287 ztest_spa_discard_checkpoint(spa); 3288 } 3289 mutex_exit(&ztest_checkpoint_lock); 3290 } 3291 3292 3293 static vdev_t * 3294 vdev_lookup_by_path(vdev_t *vd, const char *path) 3295 { 3296 vdev_t *mvd; 3297 int c; 3298 3299 if (vd->vdev_path != NULL && strcmp(path, vd->vdev_path) == 0) 3300 return (vd); 3301 3302 for (c = 0; c < vd->vdev_children; c++) 3303 if ((mvd = vdev_lookup_by_path(vd->vdev_child[c], path)) != 3304 NULL) 3305 return (mvd); 3306 3307 return (NULL); 3308 } 3309 3310 static int 3311 
spa_num_top_vdevs(spa_t *spa) 3312 { 3313 vdev_t *rvd = spa->spa_root_vdev; 3314 ASSERT3U(spa_config_held(spa, SCL_VDEV, RW_READER), ==, SCL_VDEV); 3315 return (rvd->vdev_children); 3316 } 3317 3318 /* 3319 * Verify that vdev_add() works as expected. 3320 */ 3321 void 3322 ztest_vdev_add_remove(ztest_ds_t *zd, uint64_t id) 3323 { 3324 (void) zd, (void) id; 3325 ztest_shared_t *zs = ztest_shared; 3326 spa_t *spa = ztest_spa; 3327 uint64_t leaves; 3328 uint64_t guid; 3329 uint64_t raidz_children; 3330 3331 nvlist_t *nvroot; 3332 int error; 3333 3334 if (ztest_opts.zo_mmp_test) 3335 return; 3336 3337 mutex_enter(&ztest_vdev_lock); 3338 raidz_children = ztest_get_raidz_children(spa); 3339 leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * raidz_children; 3340 3341 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 3342 3343 ztest_shared->zs_vdev_next_leaf = spa_num_top_vdevs(spa) * leaves; 3344 3345 /* 3346 * If we have slogs then remove them 1/4 of the time. 3347 */ 3348 if (spa_has_slogs(spa) && ztest_random(4) == 0) { 3349 metaslab_group_t *mg; 3350 3351 /* 3352 * find the first real slog in log allocation class 3353 */ 3354 mg = spa_log_class(spa)->mc_allocator[0].mca_rotor; 3355 while (!mg->mg_vd->vdev_islog) 3356 mg = mg->mg_next; 3357 3358 guid = mg->mg_vd->vdev_guid; 3359 3360 spa_config_exit(spa, SCL_VDEV, FTAG); 3361 3362 /* 3363 * We have to grab the zs_name_lock as writer to 3364 * prevent a race between removing a slog (dmu_objset_find) 3365 * and destroying a dataset. Removing the slog will 3366 * grab a reference on the dataset which may cause 3367 * dsl_destroy_head() to fail with EBUSY thus 3368 * leaving the dataset in an inconsistent state. 3369 */ 3370 pthread_rwlock_wrlock(&ztest_name_lock); 3371 error = spa_vdev_remove(spa, guid, B_FALSE); 3372 pthread_rwlock_unlock(&ztest_name_lock); 3373 3374 switch (error) { 3375 case 0: 3376 case EEXIST: /* Generic zil_reset() error */ 3377 case EBUSY: /* Replay required */ 3378 case EACCES: /* Crypto key not loaded */ 3379 case ZFS_ERR_CHECKPOINT_EXISTS: 3380 case ZFS_ERR_DISCARDING_CHECKPOINT: 3381 break; 3382 default: 3383 fatal(B_FALSE, "spa_vdev_remove() = %d", error); 3384 } 3385 } else { 3386 spa_config_exit(spa, SCL_VDEV, FTAG); 3387 3388 /* 3389 * Make 1/4 of the devices be log devices 3390 */ 3391 nvroot = make_vdev_root(NULL, NULL, NULL, 3392 ztest_opts.zo_vdev_size, 0, (ztest_random(4) == 0) ? 3393 "log" : NULL, raidz_children, zs->zs_mirrors, 3394 1); 3395 3396 error = spa_vdev_add(spa, nvroot, B_FALSE); 3397 fnvlist_free(nvroot); 3398 3399 switch (error) { 3400 case 0: 3401 break; 3402 case ENOSPC: 3403 ztest_record_enospc("spa_vdev_add"); 3404 break; 3405 default: 3406 fatal(B_FALSE, "spa_vdev_add() = %d", error); 3407 } 3408 } 3409 3410 mutex_exit(&ztest_vdev_lock); 3411 } 3412 3413 void 3414 ztest_vdev_class_add(ztest_ds_t *zd, uint64_t id) 3415 { 3416 (void) zd, (void) id; 3417 ztest_shared_t *zs = ztest_shared; 3418 spa_t *spa = ztest_spa; 3419 uint64_t leaves; 3420 nvlist_t *nvroot; 3421 uint64_t raidz_children; 3422 const char *class = (ztest_random(2) == 0) ? 
3423 VDEV_ALLOC_BIAS_SPECIAL : VDEV_ALLOC_BIAS_DEDUP; 3424 int error; 3425 3426 /* 3427 * By default add a special vdev 50% of the time 3428 */ 3429 if ((ztest_opts.zo_special_vdevs == ZTEST_VDEV_CLASS_OFF) || 3430 (ztest_opts.zo_special_vdevs == ZTEST_VDEV_CLASS_RND && 3431 ztest_random(2) == 0)) { 3432 return; 3433 } 3434 3435 mutex_enter(&ztest_vdev_lock); 3436 3437 /* Only test with mirrors */ 3438 if (zs->zs_mirrors < 2) { 3439 mutex_exit(&ztest_vdev_lock); 3440 return; 3441 } 3442 3443 /* requires feature@allocation_classes */ 3444 if (!spa_feature_is_enabled(spa, SPA_FEATURE_ALLOCATION_CLASSES)) { 3445 mutex_exit(&ztest_vdev_lock); 3446 return; 3447 } 3448 3449 raidz_children = ztest_get_raidz_children(spa); 3450 leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * raidz_children; 3451 3452 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 3453 ztest_shared->zs_vdev_next_leaf = spa_num_top_vdevs(spa) * leaves; 3454 spa_config_exit(spa, SCL_VDEV, FTAG); 3455 3456 nvroot = make_vdev_root(NULL, NULL, NULL, ztest_opts.zo_vdev_size, 0, 3457 class, raidz_children, zs->zs_mirrors, 1); 3458 3459 error = spa_vdev_add(spa, nvroot, B_FALSE); 3460 fnvlist_free(nvroot); 3461 3462 if (error == ENOSPC) 3463 ztest_record_enospc("spa_vdev_add"); 3464 else if (error != 0) 3465 fatal(B_FALSE, "spa_vdev_add() = %d", error); 3466 3467 /* 3468 * 50% of the time allow small blocks in the special class 3469 */ 3470 if (error == 0 && 3471 spa_special_class(spa)->mc_groups == 1 && ztest_random(2) == 0) { 3472 if (ztest_opts.zo_verbose >= 3) 3473 (void) printf("Enabling special VDEV small blocks\n"); 3474 error = ztest_dsl_prop_set_uint64(zd->zd_name, 3475 ZFS_PROP_SPECIAL_SMALL_BLOCKS, 32768, B_FALSE); 3476 ASSERT(error == 0 || error == ENOSPC); 3477 } 3478 3479 mutex_exit(&ztest_vdev_lock); 3480 3481 if (ztest_opts.zo_verbose >= 3) { 3482 metaslab_class_t *mc; 3483 3484 if (strcmp(class, VDEV_ALLOC_BIAS_SPECIAL) == 0) 3485 mc = spa_special_class(spa); 3486 else 3487 mc = spa_dedup_class(spa); 3488 (void) printf("Added a %s mirrored vdev (of %d)\n", 3489 class, (int)mc->mc_groups); 3490 } 3491 } 3492 3493 /* 3494 * Verify that adding/removing aux devices (l2arc, hot spare) works as expected. 3495 */ 3496 void 3497 ztest_vdev_aux_add_remove(ztest_ds_t *zd, uint64_t id) 3498 { 3499 (void) zd, (void) id; 3500 ztest_shared_t *zs = ztest_shared; 3501 spa_t *spa = ztest_spa; 3502 vdev_t *rvd = spa->spa_root_vdev; 3503 spa_aux_vdev_t *sav; 3504 const char *aux; 3505 char *path; 3506 uint64_t guid = 0; 3507 int error, ignore_err = 0; 3508 3509 if (ztest_opts.zo_mmp_test) 3510 return; 3511 3512 path = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); 3513 3514 if (ztest_random(2) == 0) { 3515 sav = &spa->spa_spares; 3516 aux = ZPOOL_CONFIG_SPARES; 3517 } else { 3518 sav = &spa->spa_l2cache; 3519 aux = ZPOOL_CONFIG_L2CACHE; 3520 } 3521 3522 mutex_enter(&ztest_vdev_lock); 3523 3524 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 3525 3526 if (sav->sav_count != 0 && ztest_random(4) == 0) { 3527 /* 3528 * Pick a random device to remove. 3529 */ 3530 vdev_t *svd = sav->sav_vdevs[ztest_random(sav->sav_count)]; 3531 3532 /* dRAID spares cannot be removed; try anyways to see ENOTSUP */ 3533 if (strstr(svd->vdev_path, VDEV_TYPE_DRAID) != NULL) 3534 ignore_err = ENOTSUP; 3535 3536 guid = svd->vdev_guid; 3537 } else { 3538 /* 3539 * Find an unused device we can add. 
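		 * Keep generating candidate paths from the aux template
		 * until we find one that is neither already in this aux
		 * list nor present anywhere in the main vdev tree.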
3540 */ 3541 zs->zs_vdev_aux = 0; 3542 for (;;) { 3543 int c; 3544 (void) snprintf(path, MAXPATHLEN, ztest_aux_template, 3545 ztest_opts.zo_dir, ztest_opts.zo_pool, aux, 3546 zs->zs_vdev_aux); 3547 for (c = 0; c < sav->sav_count; c++) 3548 if (strcmp(sav->sav_vdevs[c]->vdev_path, 3549 path) == 0) 3550 break; 3551 if (c == sav->sav_count && 3552 vdev_lookup_by_path(rvd, path) == NULL) 3553 break; 3554 zs->zs_vdev_aux++; 3555 } 3556 } 3557 3558 spa_config_exit(spa, SCL_VDEV, FTAG); 3559 3560 if (guid == 0) { 3561 /* 3562 * Add a new device. 3563 */ 3564 nvlist_t *nvroot = make_vdev_root(NULL, aux, NULL, 3565 (ztest_opts.zo_vdev_size * 5) / 4, 0, NULL, 0, 0, 1); 3566 error = spa_vdev_add(spa, nvroot, B_FALSE); 3567 3568 switch (error) { 3569 case 0: 3570 break; 3571 default: 3572 fatal(B_FALSE, "spa_vdev_add(%p) = %d", nvroot, error); 3573 } 3574 fnvlist_free(nvroot); 3575 } else { 3576 /* 3577 * Remove an existing device. Sometimes, dirty its 3578 * vdev state first to make sure we handle removal 3579 * of devices that have pending state changes. 3580 */ 3581 if (ztest_random(2) == 0) 3582 (void) vdev_online(spa, guid, 0, NULL); 3583 3584 error = spa_vdev_remove(spa, guid, B_FALSE); 3585 3586 switch (error) { 3587 case 0: 3588 case EBUSY: 3589 case ZFS_ERR_CHECKPOINT_EXISTS: 3590 case ZFS_ERR_DISCARDING_CHECKPOINT: 3591 break; 3592 default: 3593 if (error != ignore_err) 3594 fatal(B_FALSE, 3595 "spa_vdev_remove(%"PRIu64") = %d", 3596 guid, error); 3597 } 3598 } 3599 3600 mutex_exit(&ztest_vdev_lock); 3601 3602 umem_free(path, MAXPATHLEN); 3603 } 3604 3605 /* 3606 * split a pool if it has mirror tlvdevs 3607 */ 3608 void 3609 ztest_split_pool(ztest_ds_t *zd, uint64_t id) 3610 { 3611 (void) zd, (void) id; 3612 ztest_shared_t *zs = ztest_shared; 3613 spa_t *spa = ztest_spa; 3614 vdev_t *rvd = spa->spa_root_vdev; 3615 nvlist_t *tree, **child, *config, *split, **schild; 3616 uint_t c, children, schildren = 0, lastlogid = 0; 3617 int error = 0; 3618 3619 if (ztest_opts.zo_mmp_test) 3620 return; 3621 3622 mutex_enter(&ztest_vdev_lock); 3623 3624 /* ensure we have a usable config; mirrors of raidz aren't supported */ 3625 if (zs->zs_mirrors < 3 || ztest_opts.zo_raid_children > 1) { 3626 mutex_exit(&ztest_vdev_lock); 3627 return; 3628 } 3629 3630 /* clean up the old pool, if any */ 3631 (void) spa_destroy("splitp"); 3632 3633 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 3634 3635 /* generate a config from the existing config */ 3636 mutex_enter(&spa->spa_props_lock); 3637 tree = fnvlist_lookup_nvlist(spa->spa_config, ZPOOL_CONFIG_VDEV_TREE); 3638 mutex_exit(&spa->spa_props_lock); 3639 3640 VERIFY0(nvlist_lookup_nvlist_array(tree, ZPOOL_CONFIG_CHILDREN, 3641 &child, &children)); 3642 3643 schild = umem_alloc(rvd->vdev_children * sizeof (nvlist_t *), 3644 UMEM_NOFAIL); 3645 for (c = 0; c < children; c++) { 3646 vdev_t *tvd = rvd->vdev_child[c]; 3647 nvlist_t **mchild; 3648 uint_t mchildren; 3649 3650 if (tvd->vdev_islog || tvd->vdev_ops == &vdev_hole_ops) { 3651 schild[schildren] = fnvlist_alloc(); 3652 fnvlist_add_string(schild[schildren], 3653 ZPOOL_CONFIG_TYPE, VDEV_TYPE_HOLE); 3654 fnvlist_add_uint64(schild[schildren], 3655 ZPOOL_CONFIG_IS_HOLE, 1); 3656 if (lastlogid == 0) 3657 lastlogid = schildren; 3658 ++schildren; 3659 continue; 3660 } 3661 lastlogid = 0; 3662 VERIFY0(nvlist_lookup_nvlist_array(child[c], 3663 ZPOOL_CONFIG_CHILDREN, &mchild, &mchildren)); 3664 schild[schildren++] = fnvlist_dup(mchild[0]); 3665 } 3666 3667 /* OK, create a config that can be used to split */ 3668 split = 
fnvlist_alloc(); 3669 fnvlist_add_string(split, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT); 3670 fnvlist_add_nvlist_array(split, ZPOOL_CONFIG_CHILDREN, 3671 (const nvlist_t **)schild, lastlogid != 0 ? lastlogid : schildren); 3672 3673 config = fnvlist_alloc(); 3674 fnvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, split); 3675 3676 for (c = 0; c < schildren; c++) 3677 fnvlist_free(schild[c]); 3678 umem_free(schild, rvd->vdev_children * sizeof (nvlist_t *)); 3679 fnvlist_free(split); 3680 3681 spa_config_exit(spa, SCL_VDEV, FTAG); 3682 3683 (void) pthread_rwlock_wrlock(&ztest_name_lock); 3684 error = spa_vdev_split_mirror(spa, "splitp", config, NULL, B_FALSE); 3685 (void) pthread_rwlock_unlock(&ztest_name_lock); 3686 3687 fnvlist_free(config); 3688 3689 if (error == 0) { 3690 (void) printf("successful split - results:\n"); 3691 mutex_enter(&spa_namespace_lock); 3692 show_pool_stats(spa); 3693 show_pool_stats(spa_lookup("splitp")); 3694 mutex_exit(&spa_namespace_lock); 3695 ++zs->zs_splits; 3696 --zs->zs_mirrors; 3697 } 3698 mutex_exit(&ztest_vdev_lock); 3699 } 3700 3701 /* 3702 * Verify that we can attach and detach devices. 3703 */ 3704 void 3705 ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id) 3706 { 3707 (void) zd, (void) id; 3708 ztest_shared_t *zs = ztest_shared; 3709 spa_t *spa = ztest_spa; 3710 spa_aux_vdev_t *sav = &spa->spa_spares; 3711 vdev_t *rvd = spa->spa_root_vdev; 3712 vdev_t *oldvd, *newvd, *pvd; 3713 nvlist_t *root; 3714 uint64_t leaves; 3715 uint64_t leaf, top; 3716 uint64_t ashift = ztest_get_ashift(); 3717 uint64_t oldguid, pguid; 3718 uint64_t oldsize, newsize; 3719 uint64_t raidz_children; 3720 char *oldpath, *newpath; 3721 int replacing; 3722 int oldvd_has_siblings = B_FALSE; 3723 int newvd_is_spare = B_FALSE; 3724 int newvd_is_dspare = B_FALSE; 3725 int oldvd_is_log; 3726 int oldvd_is_special; 3727 int error, expected_error; 3728 3729 if (ztest_opts.zo_mmp_test) 3730 return; 3731 3732 oldpath = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); 3733 newpath = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); 3734 3735 mutex_enter(&ztest_vdev_lock); 3736 raidz_children = ztest_get_raidz_children(spa); 3737 leaves = MAX(zs->zs_mirrors, 1) * raidz_children; 3738 3739 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3740 3741 /* 3742 * If a vdev is in the process of being removed, its removal may 3743 * finish while we are in progress, leading to an unexpected error 3744 * value. Don't bother trying to attach while we are in the middle 3745 * of removal. 3746 */ 3747 if (ztest_device_removal_active) { 3748 spa_config_exit(spa, SCL_ALL, FTAG); 3749 goto out; 3750 } 3751 3752 /* 3753 * RAIDZ leaf VDEV mirrors are not currently supported while a 3754 * RAIDZ expansion is in progress. 3755 */ 3756 if (ztest_opts.zo_raid_do_expand) { 3757 spa_config_exit(spa, SCL_ALL, FTAG); 3758 goto out; 3759 } 3760 3761 /* 3762 * Decide whether to do an attach or a replace. 3763 */ 3764 replacing = ztest_random(2); 3765 3766 /* 3767 * Pick a random top-level vdev. 3768 */ 3769 top = ztest_random_vdev_top(spa, B_TRUE); 3770 3771 /* 3772 * Pick a random leaf within it. 3773 */ 3774 leaf = ztest_random(leaves); 3775 3776 /* 3777 * Locate this vdev. 
3778 */ 3779 oldvd = rvd->vdev_child[top]; 3780 3781 /* pick a child from the mirror */ 3782 if (zs->zs_mirrors >= 1) { 3783 ASSERT3P(oldvd->vdev_ops, ==, &vdev_mirror_ops); 3784 ASSERT3U(oldvd->vdev_children, >=, zs->zs_mirrors); 3785 oldvd = oldvd->vdev_child[leaf / raidz_children]; 3786 } 3787 3788 /* pick a child out of the raidz group */ 3789 if (ztest_opts.zo_raid_children > 1) { 3790 if (strcmp(oldvd->vdev_ops->vdev_op_type, "raidz") == 0) 3791 ASSERT3P(oldvd->vdev_ops, ==, &vdev_raidz_ops); 3792 else 3793 ASSERT3P(oldvd->vdev_ops, ==, &vdev_draid_ops); 3794 oldvd = oldvd->vdev_child[leaf % raidz_children]; 3795 } 3796 3797 /* 3798 * If we're already doing an attach or replace, oldvd may be a 3799 * mirror vdev -- in which case, pick a random child. 3800 */ 3801 while (oldvd->vdev_children != 0) { 3802 oldvd_has_siblings = B_TRUE; 3803 ASSERT3U(oldvd->vdev_children, >=, 2); 3804 oldvd = oldvd->vdev_child[ztest_random(oldvd->vdev_children)]; 3805 } 3806 3807 oldguid = oldvd->vdev_guid; 3808 oldsize = vdev_get_min_asize(oldvd); 3809 oldvd_is_log = oldvd->vdev_top->vdev_islog; 3810 oldvd_is_special = 3811 oldvd->vdev_top->vdev_alloc_bias == VDEV_BIAS_SPECIAL || 3812 oldvd->vdev_top->vdev_alloc_bias == VDEV_BIAS_DEDUP; 3813 (void) strlcpy(oldpath, oldvd->vdev_path, MAXPATHLEN); 3814 pvd = oldvd->vdev_parent; 3815 pguid = pvd->vdev_guid; 3816 3817 /* 3818 * If oldvd has siblings, then half of the time, detach it. Prior 3819 * to the detach the pool is scrubbed in order to prevent creating 3820 * unrepairable blocks as a result of the data corruption injection. 3821 */ 3822 if (oldvd_has_siblings && ztest_random(2) == 0) { 3823 spa_config_exit(spa, SCL_ALL, FTAG); 3824 3825 error = ztest_scrub_impl(spa); 3826 if (error) 3827 goto out; 3828 3829 error = spa_vdev_detach(spa, oldguid, pguid, B_FALSE); 3830 if (error != 0 && error != ENODEV && error != EBUSY && 3831 error != ENOTSUP && error != ZFS_ERR_CHECKPOINT_EXISTS && 3832 error != ZFS_ERR_DISCARDING_CHECKPOINT) 3833 fatal(B_FALSE, "detach (%s) returned %d", 3834 oldpath, error); 3835 goto out; 3836 } 3837 3838 /* 3839 * For the new vdev, choose with equal probability between the two 3840 * standard paths (ending in either 'a' or 'b') or a random hot spare. 3841 */ 3842 if (sav->sav_count != 0 && ztest_random(3) == 0) { 3843 newvd = sav->sav_vdevs[ztest_random(sav->sav_count)]; 3844 newvd_is_spare = B_TRUE; 3845 3846 if (newvd->vdev_ops == &vdev_draid_spare_ops) 3847 newvd_is_dspare = B_TRUE; 3848 3849 (void) strlcpy(newpath, newvd->vdev_path, MAXPATHLEN); 3850 } else { 3851 (void) snprintf(newpath, MAXPATHLEN, ztest_dev_template, 3852 ztest_opts.zo_dir, ztest_opts.zo_pool, 3853 top * leaves + leaf); 3854 if (ztest_random(2) == 0) 3855 newpath[strlen(newpath) - 1] = 'b'; 3856 newvd = vdev_lookup_by_path(rvd, newpath); 3857 } 3858 3859 if (newvd) { 3860 /* 3861 * Reopen to ensure the vdev's asize field isn't stale. 3862 */ 3863 vdev_reopen(newvd); 3864 newsize = vdev_get_min_asize(newvd); 3865 } else { 3866 /* 3867 * Make newsize a little bigger or smaller than oldsize. 3868 * If it's smaller, the attach should fail. 3869 * If it's larger, and we're doing a replace, 3870 * we should get dynamic LUN growth when we're done. 3871 */ 3872 newsize = 10 * oldsize / (9 + ztest_random(3)); 3873 } 3874 3875 /* 3876 * If pvd is not a mirror or root, the attach should fail with ENOTSUP, 3877 * unless it's a replace; in that case any non-replacing parent is OK. 3878 * 3879 * If newvd is already part of the pool, it should fail with EBUSY. 
3880 * 3881 * If newvd is too small, it should fail with EOVERFLOW. 3882 * 3883 * If newvd is a distributed spare and it's being attached to a 3884 * dRAID which is not its parent it should fail with EINVAL. 3885 */ 3886 if (pvd->vdev_ops != &vdev_mirror_ops && 3887 pvd->vdev_ops != &vdev_root_ops && (!replacing || 3888 pvd->vdev_ops == &vdev_replacing_ops || 3889 pvd->vdev_ops == &vdev_spare_ops)) 3890 expected_error = ENOTSUP; 3891 else if (newvd_is_spare && 3892 (!replacing || oldvd_is_log || oldvd_is_special)) 3893 expected_error = ENOTSUP; 3894 else if (newvd == oldvd) 3895 expected_error = replacing ? 0 : EBUSY; 3896 else if (vdev_lookup_by_path(rvd, newpath) != NULL) 3897 expected_error = EBUSY; 3898 else if (!newvd_is_dspare && newsize < oldsize) 3899 expected_error = EOVERFLOW; 3900 else if (ashift > oldvd->vdev_top->vdev_ashift) 3901 expected_error = EDOM; 3902 else if (newvd_is_dspare && pvd != vdev_draid_spare_get_parent(newvd)) 3903 expected_error = EINVAL; 3904 else 3905 expected_error = 0; 3906 3907 spa_config_exit(spa, SCL_ALL, FTAG); 3908 3909 /* 3910 * Build the nvlist describing newpath. 3911 */ 3912 root = make_vdev_root(newpath, NULL, NULL, newvd == NULL ? newsize : 0, 3913 ashift, NULL, 0, 0, 1); 3914 3915 /* 3916 * When supported select either a healing or sequential resilver. 3917 */ 3918 boolean_t rebuilding = B_FALSE; 3919 if (pvd->vdev_ops == &vdev_mirror_ops || 3920 pvd->vdev_ops == &vdev_root_ops) { 3921 rebuilding = !!ztest_random(2); 3922 } 3923 3924 error = spa_vdev_attach(spa, oldguid, root, replacing, rebuilding); 3925 3926 fnvlist_free(root); 3927 3928 /* 3929 * If our parent was the replacing vdev, but the replace completed, 3930 * then instead of failing with ENOTSUP we may either succeed, 3931 * fail with ENODEV, or fail with EOVERFLOW. 3932 */ 3933 if (expected_error == ENOTSUP && 3934 (error == 0 || error == ENODEV || error == EOVERFLOW)) 3935 expected_error = error; 3936 3937 /* 3938 * If someone grew the LUN, the replacement may be too small. 
3939 */ 3940 if (error == EOVERFLOW || error == EBUSY) 3941 expected_error = error; 3942 3943 if (error == ZFS_ERR_CHECKPOINT_EXISTS || 3944 error == ZFS_ERR_DISCARDING_CHECKPOINT || 3945 error == ZFS_ERR_RESILVER_IN_PROGRESS || 3946 error == ZFS_ERR_REBUILD_IN_PROGRESS) 3947 expected_error = error; 3948 3949 if (error != expected_error && expected_error != EBUSY) { 3950 fatal(B_FALSE, "attach (%s %"PRIu64", %s %"PRIu64", %d) " 3951 "returned %d, expected %d", 3952 oldpath, oldsize, newpath, 3953 newsize, replacing, error, expected_error); 3954 } 3955 out: 3956 mutex_exit(&ztest_vdev_lock); 3957 3958 umem_free(oldpath, MAXPATHLEN); 3959 umem_free(newpath, MAXPATHLEN); 3960 } 3961 3962 static void 3963 raidz_scratch_verify(void) 3964 { 3965 spa_t *spa; 3966 uint64_t write_size, logical_size, offset; 3967 raidz_reflow_scratch_state_t state; 3968 vdev_raidz_expand_t *vre; 3969 vdev_t *raidvd; 3970 3971 ASSERT(raidz_expand_pause_point == RAIDZ_EXPAND_PAUSE_NONE); 3972 3973 if (ztest_scratch_state->zs_raidz_scratch_verify_pause == 0) 3974 return; 3975 3976 kernel_init(SPA_MODE_READ); 3977 3978 mutex_enter(&spa_namespace_lock); 3979 spa = spa_lookup(ztest_opts.zo_pool); 3980 ASSERT(spa); 3981 spa->spa_import_flags |= ZFS_IMPORT_SKIP_MMP; 3982 mutex_exit(&spa_namespace_lock); 3983 3984 VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG)); 3985 3986 ASSERT3U(RRSS_GET_OFFSET(&spa->spa_uberblock), !=, UINT64_MAX); 3987 3988 mutex_enter(&ztest_vdev_lock); 3989 3990 spa_config_enter(spa, SCL_ALL, FTAG, RW_READER); 3991 3992 vre = spa->spa_raidz_expand; 3993 if (vre == NULL) 3994 goto out; 3995 3996 raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); 3997 offset = RRSS_GET_OFFSET(&spa->spa_uberblock); 3998 state = RRSS_GET_STATE(&spa->spa_uberblock); 3999 write_size = P2ALIGN_TYPED(VDEV_BOOT_SIZE, 1 << raidvd->vdev_ashift, 4000 uint64_t); 4001 logical_size = write_size * raidvd->vdev_children; 4002 4003 switch (state) { 4004 /* 4005 * Initial state of reflow process. RAIDZ expansion was 4006 * requested by user, but scratch object was not created. 4007 */ 4008 case RRSS_SCRATCH_NOT_IN_USE: 4009 ASSERT3U(offset, ==, 0); 4010 break; 4011 4012 /* 4013 * Scratch object was synced and stored in boot area. 4014 */ 4015 case RRSS_SCRATCH_VALID: 4016 4017 /* 4018 * Scratch object was synced back to raidz start offset, 4019 * raidz is ready for sector by sector reflow process. 4020 */ 4021 case RRSS_SCRATCH_INVALID_SYNCED: 4022 4023 /* 4024 * Scratch object was synced back to raidz start offset 4025 * on zpool importing, raidz is ready for sector by sector 4026 * reflow process. 4027 */ 4028 case RRSS_SCRATCH_INVALID_SYNCED_ON_IMPORT: 4029 ASSERT3U(offset, ==, logical_size); 4030 break; 4031 4032 /* 4033 * Sector by sector reflow process started. 
4034 */ 4035 case RRSS_SCRATCH_INVALID_SYNCED_REFLOW: 4036 ASSERT3U(offset, >=, logical_size); 4037 break; 4038 } 4039 4040 out: 4041 spa_config_exit(spa, SCL_ALL, FTAG); 4042 4043 mutex_exit(&ztest_vdev_lock); 4044 4045 ztest_scratch_state->zs_raidz_scratch_verify_pause = 0; 4046 4047 spa_close(spa, FTAG); 4048 kernel_fini(); 4049 } 4050 4051 static void 4052 ztest_scratch_thread(void *arg) 4053 { 4054 (void) arg; 4055 4056 /* wait up to 10 seconds */ 4057 for (int t = 100; t > 0; t -= 1) { 4058 if (raidz_expand_pause_point == RAIDZ_EXPAND_PAUSE_NONE) 4059 thread_exit(); 4060 4061 (void) poll(NULL, 0, 100); 4062 } 4063 4064 /* killed when the scratch area progress reached a certain point */ 4065 ztest_kill(ztest_shared); 4066 } 4067 4068 /* 4069 * Verify that we can attach raidz device. 4070 */ 4071 void 4072 ztest_vdev_raidz_attach(ztest_ds_t *zd, uint64_t id) 4073 { 4074 (void) zd, (void) id; 4075 ztest_shared_t *zs = ztest_shared; 4076 spa_t *spa = ztest_spa; 4077 uint64_t leaves, raidz_children, newsize, ashift = ztest_get_ashift(); 4078 kthread_t *scratch_thread = NULL; 4079 vdev_t *newvd, *pvd; 4080 nvlist_t *root; 4081 char *newpath = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); 4082 int error, expected_error = 0; 4083 4084 mutex_enter(&ztest_vdev_lock); 4085 4086 spa_config_enter(spa, SCL_ALL, FTAG, RW_READER); 4087 4088 /* Only allow attach when raid-kind = 'eraidz' */ 4089 if (!ztest_opts.zo_raid_do_expand) { 4090 spa_config_exit(spa, SCL_ALL, FTAG); 4091 goto out; 4092 } 4093 4094 if (ztest_opts.zo_mmp_test) { 4095 spa_config_exit(spa, SCL_ALL, FTAG); 4096 goto out; 4097 } 4098 4099 if (ztest_device_removal_active) { 4100 spa_config_exit(spa, SCL_ALL, FTAG); 4101 goto out; 4102 } 4103 4104 pvd = vdev_lookup_top(spa, 0); 4105 4106 ASSERT(pvd->vdev_ops == &vdev_raidz_ops); 4107 4108 /* 4109 * Get size of a child of the raidz group, 4110 * make sure device is a bit bigger 4111 */ 4112 newvd = pvd->vdev_child[ztest_random(pvd->vdev_children)]; 4113 newsize = 10 * vdev_get_min_asize(newvd) / (9 + ztest_random(2)); 4114 4115 /* 4116 * Get next attached leaf id 4117 */ 4118 raidz_children = ztest_get_raidz_children(spa); 4119 leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * raidz_children; 4120 zs->zs_vdev_next_leaf = spa_num_top_vdevs(spa) * leaves; 4121 4122 if (spa->spa_raidz_expand) 4123 expected_error = ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS; 4124 4125 spa_config_exit(spa, SCL_ALL, FTAG); 4126 4127 /* 4128 * Path to vdev to be attached 4129 */ 4130 (void) snprintf(newpath, MAXPATHLEN, ztest_dev_template, 4131 ztest_opts.zo_dir, ztest_opts.zo_pool, zs->zs_vdev_next_leaf); 4132 4133 /* 4134 * Build the nvlist describing newpath. 4135 */ 4136 root = make_vdev_root(newpath, NULL, NULL, newsize, ashift, NULL, 4137 0, 0, 1); 4138 4139 /* 4140 * 50% of the time, set raidz_expand_pause_point to cause 4141 * raidz_reflow_scratch_sync() to pause at a certain point and 4142 * then kill the test after 10 seconds so raidz_scratch_verify() 4143 * can confirm consistency when the pool is imported. 
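* The pause point is chosen uniformly from the defined scratch pause points, so over many runs every phase of the scratch sync gets exercised.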
4144 */ 4145 if (ztest_random(2) == 0 && expected_error == 0) { 4146 raidz_expand_pause_point = 4147 ztest_random(RAIDZ_EXPAND_PAUSE_SCRATCH_POST_REFLOW_2) + 1; 4148 scratch_thread = thread_create(NULL, 0, ztest_scratch_thread, 4149 ztest_shared, 0, NULL, TS_RUN | TS_JOINABLE, defclsyspri); 4150 } 4151 4152 error = spa_vdev_attach(spa, pvd->vdev_guid, root, B_FALSE, B_FALSE); 4153 4154 nvlist_free(root); 4155 4156 if (error == EOVERFLOW || error == ENXIO || 4157 error == ZFS_ERR_CHECKPOINT_EXISTS || 4158 error == ZFS_ERR_DISCARDING_CHECKPOINT) 4159 expected_error = error; 4160 4161 if (error != 0 && error != expected_error) { 4162 fatal(0, "raidz attach (%s %"PRIu64") returned %d, expected %d", 4163 newpath, newsize, error, expected_error); 4164 } 4165 4166 if (raidz_expand_pause_point) { 4167 if (error != 0) { 4168 /* 4169 * Do not verify scratch object in case of error 4170 * returned by vdev attaching. 4171 */ 4172 raidz_expand_pause_point = RAIDZ_EXPAND_PAUSE_NONE; 4173 } 4174 4175 VERIFY0(thread_join(scratch_thread)); 4176 } 4177 out: 4178 mutex_exit(&ztest_vdev_lock); 4179 4180 umem_free(newpath, MAXPATHLEN); 4181 } 4182 4183 void 4184 ztest_device_removal(ztest_ds_t *zd, uint64_t id) 4185 { 4186 (void) zd, (void) id; 4187 spa_t *spa = ztest_spa; 4188 vdev_t *vd; 4189 uint64_t guid; 4190 int error; 4191 4192 mutex_enter(&ztest_vdev_lock); 4193 4194 if (ztest_device_removal_active) { 4195 mutex_exit(&ztest_vdev_lock); 4196 return; 4197 } 4198 4199 /* 4200 * Remove a random top-level vdev and wait for removal to finish. 4201 */ 4202 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 4203 vd = vdev_lookup_top(spa, ztest_random_vdev_top(spa, B_FALSE)); 4204 guid = vd->vdev_guid; 4205 spa_config_exit(spa, SCL_VDEV, FTAG); 4206 4207 error = spa_vdev_remove(spa, guid, B_FALSE); 4208 if (error == 0) { 4209 ztest_device_removal_active = B_TRUE; 4210 mutex_exit(&ztest_vdev_lock); 4211 4212 /* 4213 * spa->spa_vdev_removal is created in a sync task that 4214 * is initiated via dsl_sync_task_nowait(). Since the 4215 * task may not run before spa_vdev_remove() returns, we 4216 * must wait at least 1 txg to ensure that the removal 4217 * struct has been created. 4218 */ 4219 txg_wait_synced(spa_get_dsl(spa), 0); 4220 4221 while (spa->spa_removing_phys.sr_state == DSS_SCANNING) 4222 txg_wait_synced(spa_get_dsl(spa), 0); 4223 } else { 4224 mutex_exit(&ztest_vdev_lock); 4225 return; 4226 } 4227 4228 /* 4229 * The pool needs to be scrubbed after completing device removal. 4230 * Failure to do so may result in checksum errors due to the 4231 * strategy employed by ztest_fault_inject() when selecting which 4232 * offset are redundant and can be damaged. 4233 */ 4234 error = spa_scan(spa, POOL_SCAN_SCRUB); 4235 if (error == 0) { 4236 while (dsl_scan_scrubbing(spa_get_dsl(spa))) 4237 txg_wait_synced(spa_get_dsl(spa), 0); 4238 } 4239 4240 mutex_enter(&ztest_vdev_lock); 4241 ztest_device_removal_active = B_FALSE; 4242 mutex_exit(&ztest_vdev_lock); 4243 } 4244 4245 /* 4246 * Callback function which expands the physical size of the vdev. 
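* Returns NULL on success so that vdev_walk_tree() keeps walking; returns the vdev itself if its backing file could not be opened, which stops the walk and signals failure to the caller.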
4247 */ 4248 static vdev_t * 4249 grow_vdev(vdev_t *vd, void *arg) 4250 { 4251 spa_t *spa __maybe_unused = vd->vdev_spa; 4252 size_t *newsize = arg; 4253 size_t fsize; 4254 int fd; 4255 4256 ASSERT3S(spa_config_held(spa, SCL_STATE, RW_READER), ==, SCL_STATE); 4257 ASSERT(vd->vdev_ops->vdev_op_leaf); 4258 4259 if ((fd = open(vd->vdev_path, O_RDWR)) == -1) 4260 return (vd); 4261 4262 fsize = lseek(fd, 0, SEEK_END); 4263 VERIFY0(ftruncate(fd, *newsize)); 4264 4265 if (ztest_opts.zo_verbose >= 6) { 4266 (void) printf("%s grew from %lu to %lu bytes\n", 4267 vd->vdev_path, (ulong_t)fsize, (ulong_t)*newsize); 4268 } 4269 (void) close(fd); 4270 return (NULL); 4271 } 4272 4273 /* 4274 * Callback function which expands a given vdev by calling vdev_online(). 4275 */ 4276 static vdev_t * 4277 online_vdev(vdev_t *vd, void *arg) 4278 { 4279 (void) arg; 4280 spa_t *spa = vd->vdev_spa; 4281 vdev_t *tvd = vd->vdev_top; 4282 uint64_t guid = vd->vdev_guid; 4283 uint64_t generation = spa->spa_config_generation + 1; 4284 vdev_state_t newstate = VDEV_STATE_UNKNOWN; 4285 int error; 4286 4287 ASSERT3S(spa_config_held(spa, SCL_STATE, RW_READER), ==, SCL_STATE); 4288 ASSERT(vd->vdev_ops->vdev_op_leaf); 4289 4290 /* Calling vdev_online will initialize the new metaslabs */ 4291 spa_config_exit(spa, SCL_STATE, spa); 4292 error = vdev_online(spa, guid, ZFS_ONLINE_EXPAND, &newstate); 4293 spa_config_enter(spa, SCL_STATE, spa, RW_READER); 4294 4295 /* 4296 * If vdev_online returned an error or the underlying vdev_open 4297 * failed then we abort the expand. The only way to know that 4298 * vdev_open fails is by checking the returned newstate. 4299 */ 4300 if (error || newstate != VDEV_STATE_HEALTHY) { 4301 if (ztest_opts.zo_verbose >= 5) { 4302 (void) printf("Unable to expand vdev, state %u, " 4303 "error %d\n", newstate, error); 4304 } 4305 return (vd); 4306 } 4307 ASSERT3U(newstate, ==, VDEV_STATE_HEALTHY); 4308 4309 /* 4310 * Since we dropped the lock we need to ensure that we're 4311 * still talking to the original vdev. It's possible this 4312 * vdev may have been detached/replaced while we were 4313 * trying to online it. 4314 */ 4315 if (generation != spa->spa_config_generation) { 4316 if (ztest_opts.zo_verbose >= 5) { 4317 (void) printf("vdev configuration has changed, " 4318 "guid %"PRIu64", state %"PRIu64", " 4319 "expected gen %"PRIu64", got gen %"PRIu64"\n", 4320 guid, 4321 tvd->vdev_state, 4322 generation, 4323 spa->spa_config_generation); 4324 } 4325 return (vd); 4326 } 4327 return (NULL); 4328 } 4329 4330 /* 4331 * Traverse the vdev tree calling the supplied function. 4332 * We continue to walk the tree until we either have walked all 4333 * children or we receive a non-NULL return from the callback. 4334 * If a NULL callback is passed, then we just return back the first 4335 * leaf vdev we encounter. 4336 */ 4337 static vdev_t * 4338 vdev_walk_tree(vdev_t *vd, vdev_t *(*func)(vdev_t *, void *), void *arg) 4339 { 4340 uint_t c; 4341 4342 if (vd->vdev_ops->vdev_op_leaf) { 4343 if (func == NULL) 4344 return (vd); 4345 else 4346 return (func(vd, arg)); 4347 } 4348 4349 for (c = 0; c < vd->vdev_children; c++) { 4350 vdev_t *cvd = vd->vdev_child[c]; 4351 if ((cvd = vdev_walk_tree(cvd, func, arg)) != NULL) 4352 return (cvd); 4353 } 4354 return (NULL); 4355 } 4356 4357 /* 4358 * Verify that dynamic LUN growth works as expected. 
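* The test grows the file backing a leaf vdev, onlines it with ZFS_ONLINE_EXPAND, and then checks that both the top-level vdev's metaslab count and the metaslab class space have increased.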
4359 */ 4360 void 4361 ztest_vdev_LUN_growth(ztest_ds_t *zd, uint64_t id) 4362 { 4363 (void) zd, (void) id; 4364 spa_t *spa = ztest_spa; 4365 vdev_t *vd, *tvd; 4366 metaslab_class_t *mc; 4367 metaslab_group_t *mg; 4368 size_t psize, newsize; 4369 uint64_t top; 4370 uint64_t old_class_space, new_class_space, old_ms_count, new_ms_count; 4371 4372 mutex_enter(&ztest_checkpoint_lock); 4373 mutex_enter(&ztest_vdev_lock); 4374 spa_config_enter(spa, SCL_STATE, spa, RW_READER); 4375 4376 /* 4377 * If there is a vdev removal in progress, it could complete while 4378 * we are running, in which case we would not be able to verify 4379 * that the metaslab_class space increased (because it decreases 4380 * when the device removal completes). 4381 */ 4382 if (ztest_device_removal_active) { 4383 spa_config_exit(spa, SCL_STATE, spa); 4384 mutex_exit(&ztest_vdev_lock); 4385 mutex_exit(&ztest_checkpoint_lock); 4386 return; 4387 } 4388 4389 /* 4390 * If a raidz expansion is in progress, the test can fail because the 4391 * metaslab count will not increase immediately after the vdev is 4392 * expanded. It increases only after the raidz expansion completes. 4393 */ 4394 if (spa->spa_raidz_expand) { 4395 spa_config_exit(spa, SCL_STATE, spa); 4396 mutex_exit(&ztest_vdev_lock); 4397 mutex_exit(&ztest_checkpoint_lock); 4398 return; 4399 } 4400 4401 top = ztest_random_vdev_top(spa, B_TRUE); 4402 4403 tvd = spa->spa_root_vdev->vdev_child[top]; 4404 mg = tvd->vdev_mg; 4405 mc = mg->mg_class; 4406 old_ms_count = tvd->vdev_ms_count; 4407 old_class_space = metaslab_class_get_space(mc); 4408 4409 /* 4410 * Determine the size of the first leaf vdev associated with 4411 * our top-level device. 4412 */ 4413 vd = vdev_walk_tree(tvd, NULL, NULL); 4414 ASSERT3P(vd, !=, NULL); 4415 ASSERT(vd->vdev_ops->vdev_op_leaf); 4416 4417 psize = vd->vdev_psize; 4418 4419 /* 4420 * We only try to expand the vdev if it's healthy, less than 4x its 4421 * original size, and it has a valid psize. 4422 */ 4423 if (tvd->vdev_state != VDEV_STATE_HEALTHY || 4424 psize == 0 || psize >= 4 * ztest_opts.zo_vdev_size) { 4425 spa_config_exit(spa, SCL_STATE, spa); 4426 mutex_exit(&ztest_vdev_lock); 4427 mutex_exit(&ztest_checkpoint_lock); 4428 return; 4429 } 4430 ASSERT3U(psize, >, 0); 4431 newsize = psize + MAX(psize / 8, SPA_MAXBLOCKSIZE); 4432 ASSERT3U(newsize, >, psize); 4433 4434 if (ztest_opts.zo_verbose >= 6) { 4435 (void) printf("Expanding LUN %s from %lu to %lu\n", 4436 vd->vdev_path, (ulong_t)psize, (ulong_t)newsize); 4437 } 4438 4439 /* 4440 * Growing the vdev is a two-step process: 4441 * 1) expand the physical size (i.e. relabel) 4442 * 2) online the vdev to create the new metaslabs 4443 */ 4444 if (vdev_walk_tree(tvd, grow_vdev, &newsize) != NULL || 4445 vdev_walk_tree(tvd, online_vdev, NULL) != NULL || 4446 tvd->vdev_state != VDEV_STATE_HEALTHY) { 4447 if (ztest_opts.zo_verbose >= 5) { 4448 (void) printf("Could not expand LUN because " 4449 "the vdev configuration changed.\n"); 4450 } 4451 spa_config_exit(spa, SCL_STATE, spa); 4452 mutex_exit(&ztest_vdev_lock); 4453 mutex_exit(&ztest_checkpoint_lock); 4454 return; 4455 } 4456 4457 spa_config_exit(spa, SCL_STATE, spa); 4458 4459 /* 4460 * Expanding the LUN will update the config asynchronously, 4461 * thus we must wait for the async thread to complete any 4462 * pending tasks before proceeding.
4463 */ 4464 for (;;) { 4465 boolean_t done; 4466 mutex_enter(&spa->spa_async_lock); 4467 done = (spa->spa_async_thread == NULL && !spa->spa_async_tasks); 4468 mutex_exit(&spa->spa_async_lock); 4469 if (done) 4470 break; 4471 txg_wait_synced(spa_get_dsl(spa), 0); 4472 (void) poll(NULL, 0, 100); 4473 } 4474 4475 spa_config_enter(spa, SCL_STATE, spa, RW_READER); 4476 4477 tvd = spa->spa_root_vdev->vdev_child[top]; 4478 new_ms_count = tvd->vdev_ms_count; 4479 new_class_space = metaslab_class_get_space(mc); 4480 4481 if (tvd->vdev_mg != mg || mg->mg_class != mc) { 4482 if (ztest_opts.zo_verbose >= 5) { 4483 (void) printf("Could not verify LUN expansion due to " 4484 "intervening vdev offline or remove.\n"); 4485 } 4486 spa_config_exit(spa, SCL_STATE, spa); 4487 mutex_exit(&ztest_vdev_lock); 4488 mutex_exit(&ztest_checkpoint_lock); 4489 return; 4490 } 4491 4492 /* 4493 * Make sure we were able to grow the vdev. 4494 */ 4495 if (new_ms_count <= old_ms_count) { 4496 fatal(B_FALSE, 4497 "LUN expansion failed: ms_count %"PRIu64" < %"PRIu64"\n", 4498 old_ms_count, new_ms_count); 4499 } 4500 4501 /* 4502 * Make sure we were able to grow the pool. 4503 */ 4504 if (new_class_space <= old_class_space) { 4505 fatal(B_FALSE, 4506 "LUN expansion failed: class_space %"PRIu64" < %"PRIu64"\n", 4507 old_class_space, new_class_space); 4508 } 4509 4510 if (ztest_opts.zo_verbose >= 5) { 4511 char oldnumbuf[NN_NUMBUF_SZ], newnumbuf[NN_NUMBUF_SZ]; 4512 4513 nicenum(old_class_space, oldnumbuf, sizeof (oldnumbuf)); 4514 nicenum(new_class_space, newnumbuf, sizeof (newnumbuf)); 4515 (void) printf("%s grew from %s to %s\n", 4516 spa->spa_name, oldnumbuf, newnumbuf); 4517 } 4518 4519 spa_config_exit(spa, SCL_STATE, spa); 4520 mutex_exit(&ztest_vdev_lock); 4521 mutex_exit(&ztest_checkpoint_lock); 4522 } 4523 4524 /* 4525 * Verify that dmu_objset_{create,destroy,open,close} work as expected. 4526 */ 4527 static void 4528 ztest_objset_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx) 4529 { 4530 (void) arg, (void) cr; 4531 4532 /* 4533 * Create the objects common to all ztest datasets. 4534 */ 4535 VERIFY0(zap_create_claim(os, ZTEST_DIROBJ, 4536 DMU_OT_ZAP_OTHER, DMU_OT_NONE, 0, tx)); 4537 } 4538 4539 static int 4540 ztest_dataset_create(char *dsname) 4541 { 4542 int err; 4543 uint64_t rand; 4544 dsl_crypto_params_t *dcp = NULL; 4545 4546 /* 4547 * 50% of the time, we create encrypted datasets 4548 * using a random cipher suite and a hard-coded 4549 * wrapping key. 4550 */ 4551 rand = ztest_random(2); 4552 if (rand != 0) { 4553 nvlist_t *crypto_args = fnvlist_alloc(); 4554 nvlist_t *props = fnvlist_alloc(); 4555 4556 /* slight bias towards the default cipher suite */ 4557 rand = ztest_random(ZIO_CRYPT_FUNCTIONS); 4558 if (rand < ZIO_CRYPT_AES_128_CCM) 4559 rand = ZIO_CRYPT_ON; 4560 4561 fnvlist_add_uint64(props, 4562 zfs_prop_to_name(ZFS_PROP_ENCRYPTION), rand); 4563 fnvlist_add_uint8_array(crypto_args, "wkeydata", 4564 (uint8_t *)ztest_wkeydata, WRAPPING_KEY_LEN); 4565 4566 /* 4567 * These parameters aren't really used by the kernel. They 4568 * are simply stored so that userspace knows how to load 4569 * the wrapping key. 
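* A raw key format does not involve PBKDF2, so the salt and iteration count below are simply set to zero.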
4570 */ 4571 fnvlist_add_uint64(props, 4572 zfs_prop_to_name(ZFS_PROP_KEYFORMAT), ZFS_KEYFORMAT_RAW); 4573 fnvlist_add_string(props, 4574 zfs_prop_to_name(ZFS_PROP_KEYLOCATION), "prompt"); 4575 fnvlist_add_uint64(props, 4576 zfs_prop_to_name(ZFS_PROP_PBKDF2_SALT), 0ULL); 4577 fnvlist_add_uint64(props, 4578 zfs_prop_to_name(ZFS_PROP_PBKDF2_ITERS), 0ULL); 4579 4580 VERIFY0(dsl_crypto_params_create_nvlist(DCP_CMD_NONE, props, 4581 crypto_args, &dcp)); 4582 4583 /* 4584 * Cycle through all available encryption implementations 4585 * to verify interoperability. 4586 */ 4587 VERIFY0(gcm_impl_set("cycle")); 4588 VERIFY0(aes_impl_set("cycle")); 4589 4590 fnvlist_free(crypto_args); 4591 fnvlist_free(props); 4592 } 4593 4594 err = dmu_objset_create(dsname, DMU_OST_OTHER, 0, dcp, 4595 ztest_objset_create_cb, NULL); 4596 dsl_crypto_params_free(dcp, !!err); 4597 4598 rand = ztest_random(100); 4599 if (err || rand < 80) 4600 return (err); 4601 4602 if (ztest_opts.zo_verbose >= 5) 4603 (void) printf("Setting dataset %s to sync always\n", dsname); 4604 return (ztest_dsl_prop_set_uint64(dsname, ZFS_PROP_SYNC, 4605 ZFS_SYNC_ALWAYS, B_FALSE)); 4606 } 4607 4608 static int 4609 ztest_objset_destroy_cb(const char *name, void *arg) 4610 { 4611 (void) arg; 4612 objset_t *os; 4613 dmu_object_info_t doi; 4614 int error; 4615 4616 /* 4617 * Verify that the dataset contains a directory object. 4618 */ 4619 VERIFY0(ztest_dmu_objset_own(name, DMU_OST_OTHER, B_TRUE, 4620 B_TRUE, FTAG, &os)); 4621 error = dmu_object_info(os, ZTEST_DIROBJ, &doi); 4622 if (error != ENOENT) { 4623 /* We could have crashed in the middle of destroying it */ 4624 ASSERT0(error); 4625 ASSERT3U(doi.doi_type, ==, DMU_OT_ZAP_OTHER); 4626 ASSERT3S(doi.doi_physical_blocks_512, >=, 0); 4627 } 4628 dmu_objset_disown(os, B_TRUE, FTAG); 4629 4630 /* 4631 * Destroy the dataset. 4632 */ 4633 if (strchr(name, '@') != NULL) { 4634 error = dsl_destroy_snapshot(name, B_TRUE); 4635 if (error != ECHRNG) { 4636 /* 4637 * The program was executed, but encountered a runtime 4638 * error, such as insufficient slop, or a hold on the 4639 * dataset. 
4640 */ 4641 ASSERT0(error); 4642 } 4643 } else { 4644 error = dsl_destroy_head(name); 4645 if (error == ENOSPC) { 4646 /* There could be checkpoint or insufficient slop */ 4647 ztest_record_enospc(FTAG); 4648 } else if (error != EBUSY) { 4649 /* There could be a hold on this dataset */ 4650 ASSERT0(error); 4651 } 4652 } 4653 return (0); 4654 } 4655 4656 static boolean_t 4657 ztest_snapshot_create(char *osname, uint64_t id) 4658 { 4659 char snapname[ZFS_MAX_DATASET_NAME_LEN]; 4660 int error; 4661 4662 (void) snprintf(snapname, sizeof (snapname), "%"PRIu64"", id); 4663 4664 error = dmu_objset_snapshot_one(osname, snapname); 4665 if (error == ENOSPC) { 4666 ztest_record_enospc(FTAG); 4667 return (B_FALSE); 4668 } 4669 if (error != 0 && error != EEXIST && error != ECHRNG) { 4670 fatal(B_FALSE, "ztest_snapshot_create(%s@%s) = %d", osname, 4671 snapname, error); 4672 } 4673 return (B_TRUE); 4674 } 4675 4676 static boolean_t 4677 ztest_snapshot_destroy(char *osname, uint64_t id) 4678 { 4679 char snapname[ZFS_MAX_DATASET_NAME_LEN]; 4680 int error; 4681 4682 (void) snprintf(snapname, sizeof (snapname), "%s@%"PRIu64"", 4683 osname, id); 4684 4685 error = dsl_destroy_snapshot(snapname, B_FALSE); 4686 if (error != 0 && error != ENOENT && error != ECHRNG) 4687 fatal(B_FALSE, "ztest_snapshot_destroy(%s) = %d", 4688 snapname, error); 4689 return (B_TRUE); 4690 } 4691 4692 void 4693 ztest_dmu_objset_create_destroy(ztest_ds_t *zd, uint64_t id) 4694 { 4695 (void) zd; 4696 ztest_ds_t *zdtmp; 4697 int iters; 4698 int error; 4699 objset_t *os, *os2; 4700 char name[ZFS_MAX_DATASET_NAME_LEN]; 4701 zilog_t *zilog; 4702 int i; 4703 4704 zdtmp = umem_alloc(sizeof (ztest_ds_t), UMEM_NOFAIL); 4705 4706 (void) pthread_rwlock_rdlock(&ztest_name_lock); 4707 4708 (void) snprintf(name, sizeof (name), "%s/temp_%"PRIu64"", 4709 ztest_opts.zo_pool, id); 4710 4711 /* 4712 * If this dataset exists from a previous run, process its replay log 4713 * half of the time. If we don't replay it, then dsl_destroy_head() 4714 * (invoked from ztest_objset_destroy_cb()) should just throw it away. 4715 */ 4716 if (ztest_random(2) == 0 && 4717 ztest_dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, 4718 B_TRUE, FTAG, &os) == 0) { 4719 ztest_zd_init(zdtmp, NULL, os); 4720 zil_replay(os, zdtmp, ztest_replay_vector); 4721 ztest_zd_fini(zdtmp); 4722 dmu_objset_disown(os, B_TRUE, FTAG); 4723 } 4724 4725 /* 4726 * There may be an old instance of the dataset we're about to 4727 * create lying around from a previous run. If so, destroy it 4728 * and all of its snapshots. 4729 */ 4730 (void) dmu_objset_find(name, ztest_objset_destroy_cb, NULL, 4731 DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS); 4732 4733 /* 4734 * Verify that the destroyed dataset is no longer in the namespace. 4735 * It may still be present if the destroy above fails with ENOSPC. 4736 */ 4737 error = ztest_dmu_objset_own(name, DMU_OST_OTHER, B_TRUE, B_TRUE, 4738 FTAG, &os); 4739 if (error == 0) { 4740 dmu_objset_disown(os, B_TRUE, FTAG); 4741 ztest_record_enospc(FTAG); 4742 goto out; 4743 } 4744 VERIFY3U(ENOENT, ==, error); 4745 4746 /* 4747 * Verify that we can create a new dataset. 4748 */ 4749 error = ztest_dataset_create(name); 4750 if (error) { 4751 if (error == ENOSPC) { 4752 ztest_record_enospc(FTAG); 4753 goto out; 4754 } 4755 fatal(B_FALSE, "dmu_objset_create(%s) = %d", name, error); 4756 } 4757 4758 VERIFY0(ztest_dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, B_TRUE, 4759 FTAG, &os)); 4760 4761 ztest_zd_init(zdtmp, NULL, os); 4762 4763 /* 4764 * Open the intent log for it. 
4765 */ 4766 zilog = zil_open(os, ztest_get_data, NULL); 4767 4768 /* 4769 * Put some objects in there, do a little I/O to them, 4770 * and randomly take a couple of snapshots along the way. 4771 */ 4772 iters = ztest_random(5); 4773 for (i = 0; i < iters; i++) { 4774 ztest_dmu_object_alloc_free(zdtmp, id); 4775 if (ztest_random(iters) == 0) 4776 (void) ztest_snapshot_create(name, i); 4777 } 4778 4779 /* 4780 * Verify that we cannot create an existing dataset. 4781 */ 4782 VERIFY3U(EEXIST, ==, 4783 dmu_objset_create(name, DMU_OST_OTHER, 0, NULL, NULL, NULL)); 4784 4785 /* 4786 * Verify that we can hold an objset that is also owned. 4787 */ 4788 VERIFY0(dmu_objset_hold(name, FTAG, &os2)); 4789 dmu_objset_rele(os2, FTAG); 4790 4791 /* 4792 * Verify that we cannot own an objset that is already owned. 4793 */ 4794 VERIFY3U(EBUSY, ==, ztest_dmu_objset_own(name, DMU_OST_OTHER, 4795 B_FALSE, B_TRUE, FTAG, &os2)); 4796 4797 zil_close(zilog); 4798 dmu_objset_disown(os, B_TRUE, FTAG); 4799 ztest_zd_fini(zdtmp); 4800 out: 4801 (void) pthread_rwlock_unlock(&ztest_name_lock); 4802 4803 umem_free(zdtmp, sizeof (ztest_ds_t)); 4804 } 4805 4806 /* 4807 * Verify that dmu_snapshot_{create,destroy,open,close} work as expected. 4808 */ 4809 void 4810 ztest_dmu_snapshot_create_destroy(ztest_ds_t *zd, uint64_t id) 4811 { 4812 (void) pthread_rwlock_rdlock(&ztest_name_lock); 4813 (void) ztest_snapshot_destroy(zd->zd_name, id); 4814 (void) ztest_snapshot_create(zd->zd_name, id); 4815 (void) pthread_rwlock_unlock(&ztest_name_lock); 4816 } 4817 4818 /* 4819 * Cleanup non-standard snapshots and clones. 4820 */ 4821 static void 4822 ztest_dsl_dataset_cleanup(char *osname, uint64_t id) 4823 { 4824 char *snap1name; 4825 char *clone1name; 4826 char *snap2name; 4827 char *clone2name; 4828 char *snap3name; 4829 int error; 4830 4831 snap1name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4832 clone1name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4833 snap2name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4834 clone2name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4835 snap3name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4836 4837 (void) snprintf(snap1name, ZFS_MAX_DATASET_NAME_LEN, "%s@s1_%"PRIu64"", 4838 osname, id); 4839 (void) snprintf(clone1name, ZFS_MAX_DATASET_NAME_LEN, "%s/c1_%"PRIu64"", 4840 osname, id); 4841 (void) snprintf(snap2name, ZFS_MAX_DATASET_NAME_LEN, "%s@s2_%"PRIu64"", 4842 clone1name, id); 4843 (void) snprintf(clone2name, ZFS_MAX_DATASET_NAME_LEN, "%s/c2_%"PRIu64"", 4844 osname, id); 4845 (void) snprintf(snap3name, ZFS_MAX_DATASET_NAME_LEN, "%s@s3_%"PRIu64"", 4846 clone1name, id); 4847 4848 error = dsl_destroy_head(clone2name); 4849 if (error && error != ENOENT) 4850 fatal(B_FALSE, "dsl_destroy_head(%s) = %d", clone2name, error); 4851 error = dsl_destroy_snapshot(snap3name, B_FALSE); 4852 if (error && error != ENOENT) 4853 fatal(B_FALSE, "dsl_destroy_snapshot(%s) = %d", 4854 snap3name, error); 4855 error = dsl_destroy_snapshot(snap2name, B_FALSE); 4856 if (error && error != ENOENT) 4857 fatal(B_FALSE, "dsl_destroy_snapshot(%s) = %d", 4858 snap2name, error); 4859 error = dsl_destroy_head(clone1name); 4860 if (error && error != ENOENT) 4861 fatal(B_FALSE, "dsl_destroy_head(%s) = %d", clone1name, error); 4862 error = dsl_destroy_snapshot(snap1name, B_FALSE); 4863 if (error && error != ENOENT) 4864 fatal(B_FALSE, "dsl_destroy_snapshot(%s) = %d", 4865 snap1name, error); 4866 4867 umem_free(snap1name, ZFS_MAX_DATASET_NAME_LEN); 4868 umem_free(clone1name, 
ZFS_MAX_DATASET_NAME_LEN); 4869 umem_free(snap2name, ZFS_MAX_DATASET_NAME_LEN); 4870 umem_free(clone2name, ZFS_MAX_DATASET_NAME_LEN); 4871 umem_free(snap3name, ZFS_MAX_DATASET_NAME_LEN); 4872 } 4873 4874 /* 4875 * Verify dsl_dataset_promote handles EBUSY 4876 */ 4877 void 4878 ztest_dsl_dataset_promote_busy(ztest_ds_t *zd, uint64_t id) 4879 { 4880 objset_t *os; 4881 char *snap1name; 4882 char *clone1name; 4883 char *snap2name; 4884 char *clone2name; 4885 char *snap3name; 4886 char *osname = zd->zd_name; 4887 int error; 4888 4889 snap1name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4890 clone1name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4891 snap2name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4892 clone2name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4893 snap3name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4894 4895 (void) pthread_rwlock_rdlock(&ztest_name_lock); 4896 4897 ztest_dsl_dataset_cleanup(osname, id); 4898 4899 (void) snprintf(snap1name, ZFS_MAX_DATASET_NAME_LEN, "%s@s1_%"PRIu64"", 4900 osname, id); 4901 (void) snprintf(clone1name, ZFS_MAX_DATASET_NAME_LEN, "%s/c1_%"PRIu64"", 4902 osname, id); 4903 (void) snprintf(snap2name, ZFS_MAX_DATASET_NAME_LEN, "%s@s2_%"PRIu64"", 4904 clone1name, id); 4905 (void) snprintf(clone2name, ZFS_MAX_DATASET_NAME_LEN, "%s/c2_%"PRIu64"", 4906 osname, id); 4907 (void) snprintf(snap3name, ZFS_MAX_DATASET_NAME_LEN, "%s@s3_%"PRIu64"", 4908 clone1name, id); 4909 4910 error = dmu_objset_snapshot_one(osname, strchr(snap1name, '@') + 1); 4911 if (error && error != EEXIST) { 4912 if (error == ENOSPC) { 4913 ztest_record_enospc(FTAG); 4914 goto out; 4915 } 4916 fatal(B_FALSE, "dmu_take_snapshot(%s) = %d", snap1name, error); 4917 } 4918 4919 error = dmu_objset_clone(clone1name, snap1name); 4920 if (error) { 4921 if (error == ENOSPC) { 4922 ztest_record_enospc(FTAG); 4923 goto out; 4924 } 4925 fatal(B_FALSE, "dmu_objset_create(%s) = %d", clone1name, error); 4926 } 4927 4928 error = dmu_objset_snapshot_one(clone1name, strchr(snap2name, '@') + 1); 4929 if (error && error != EEXIST) { 4930 if (error == ENOSPC) { 4931 ztest_record_enospc(FTAG); 4932 goto out; 4933 } 4934 fatal(B_FALSE, "dmu_open_snapshot(%s) = %d", snap2name, error); 4935 } 4936 4937 error = dmu_objset_snapshot_one(clone1name, strchr(snap3name, '@') + 1); 4938 if (error && error != EEXIST) { 4939 if (error == ENOSPC) { 4940 ztest_record_enospc(FTAG); 4941 goto out; 4942 } 4943 fatal(B_FALSE, "dmu_open_snapshot(%s) = %d", snap3name, error); 4944 } 4945 4946 error = dmu_objset_clone(clone2name, snap3name); 4947 if (error) { 4948 if (error == ENOSPC) { 4949 ztest_record_enospc(FTAG); 4950 goto out; 4951 } 4952 fatal(B_FALSE, "dmu_objset_create(%s) = %d", clone2name, error); 4953 } 4954 4955 error = ztest_dmu_objset_own(snap2name, DMU_OST_ANY, B_TRUE, B_TRUE, 4956 FTAG, &os); 4957 if (error) 4958 fatal(B_FALSE, "dmu_objset_own(%s) = %d", snap2name, error); 4959 error = dsl_dataset_promote(clone2name, NULL); 4960 if (error == ENOSPC) { 4961 dmu_objset_disown(os, B_TRUE, FTAG); 4962 ztest_record_enospc(FTAG); 4963 goto out; 4964 } 4965 if (error != EBUSY) 4966 fatal(B_FALSE, "dsl_dataset_promote(%s), %d, not EBUSY", 4967 clone2name, error); 4968 dmu_objset_disown(os, B_TRUE, FTAG); 4969 4970 out: 4971 ztest_dsl_dataset_cleanup(osname, id); 4972 4973 (void) pthread_rwlock_unlock(&ztest_name_lock); 4974 4975 umem_free(snap1name, ZFS_MAX_DATASET_NAME_LEN); 4976 umem_free(clone1name, ZFS_MAX_DATASET_NAME_LEN); 4977 umem_free(snap2name, 
ZFS_MAX_DATASET_NAME_LEN); 4978 umem_free(clone2name, ZFS_MAX_DATASET_NAME_LEN); 4979 umem_free(snap3name, ZFS_MAX_DATASET_NAME_LEN); 4980 } 4981 4982 #undef OD_ARRAY_SIZE 4983 #define OD_ARRAY_SIZE 4 4984 4985 /* 4986 * Verify that dmu_object_{alloc,free} work as expected. 4987 */ 4988 void 4989 ztest_dmu_object_alloc_free(ztest_ds_t *zd, uint64_t id) 4990 { 4991 ztest_od_t *od; 4992 int batchsize; 4993 int size; 4994 int b; 4995 4996 size = sizeof (ztest_od_t) * OD_ARRAY_SIZE; 4997 od = umem_alloc(size, UMEM_NOFAIL); 4998 batchsize = OD_ARRAY_SIZE; 4999 5000 for (b = 0; b < batchsize; b++) 5001 ztest_od_init(od + b, id, FTAG, b, DMU_OT_UINT64_OTHER, 5002 0, 0, 0); 5003 5004 /* 5005 * Destroy the previous batch of objects, create a new batch, 5006 * and do some I/O on the new objects. 5007 */ 5008 if (ztest_object_init(zd, od, size, B_TRUE) != 0) { 5009 zd->zd_od = NULL; 5010 umem_free(od, size); 5011 return; 5012 } 5013 5014 while (ztest_random(4 * batchsize) != 0) 5015 ztest_io(zd, od[ztest_random(batchsize)].od_object, 5016 ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT); 5017 5018 umem_free(od, size); 5019 } 5020 5021 /* 5022 * Rewind the global allocator to verify object allocation backfilling. 5023 */ 5024 void 5025 ztest_dmu_object_next_chunk(ztest_ds_t *zd, uint64_t id) 5026 { 5027 (void) id; 5028 objset_t *os = zd->zd_os; 5029 uint_t dnodes_per_chunk = 1 << dmu_object_alloc_chunk_shift; 5030 uint64_t object; 5031 5032 /* 5033 * Rewind the global allocator randomly back to a lower object number 5034 * to force backfilling and reclamation of recently freed dnodes. 5035 */ 5036 mutex_enter(&os->os_obj_lock); 5037 object = ztest_random(os->os_obj_next_chunk); 5038 os->os_obj_next_chunk = P2ALIGN_TYPED(object, dnodes_per_chunk, 5039 uint64_t); 5040 mutex_exit(&os->os_obj_lock); 5041 } 5042 5043 #undef OD_ARRAY_SIZE 5044 #define OD_ARRAY_SIZE 2 5045 5046 /* 5047 * Verify that dmu_{read,write} work as expected. 5048 */ 5049 void 5050 ztest_dmu_read_write(ztest_ds_t *zd, uint64_t id) 5051 { 5052 int size; 5053 ztest_od_t *od; 5054 5055 objset_t *os = zd->zd_os; 5056 size = sizeof (ztest_od_t) * OD_ARRAY_SIZE; 5057 od = umem_alloc(size, UMEM_NOFAIL); 5058 dmu_tx_t *tx; 5059 int freeit, error; 5060 uint64_t i, n, s, txg; 5061 bufwad_t *packbuf, *bigbuf, *pack, *bigH, *bigT; 5062 uint64_t packobj, packoff, packsize, bigobj, bigoff, bigsize; 5063 uint64_t chunksize = (1000 + ztest_random(1000)) * sizeof (uint64_t); 5064 uint64_t regions = 997; 5065 uint64_t stride = 123456789ULL; 5066 uint64_t width = 40; 5067 int free_percent = 5; 5068 dmu_flags_t dmu_read_flags = DMU_READ_PREFETCH; 5069 5070 /* 5071 * We will randomly set when to do O_DIRECT on a read. 5072 */ 5073 if (ztest_random(4) == 0) 5074 dmu_read_flags |= DMU_DIRECTIO; 5075 5076 /* 5077 * This test uses two objects, packobj and bigobj, that are always 5078 * updated together (i.e. in the same tx) so that their contents are 5079 * in sync and can be compared. Their contents relate to each other 5080 * in a simple way: packobj is a dense array of 'bufwad' structures, 5081 * while bigobj is a sparse array of the same bufwads. Specifically, 5082 * for any index n, there are three bufwads that should be identical: 5083 * 5084 * packobj, at offset n * sizeof (bufwad_t) 5085 * bigobj, at the head of the nth chunk 5086 * bigobj, at the tail of the nth chunk 5087 * 5088 * The chunk size is arbitrary. It doesn't have to be a power of two, 5089 * and it doesn't have any relation to the object blocksize. 
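* For example, with a chunk size of 8000 bytes, the three copies of bufwad n live at offset n * sizeof (bufwad_t) in packobj, at offset n * 8000 in bigobj (the head of chunk n), and at offset (n + 1) * 8000 - sizeof (bufwad_t) in bigobj (the tail of chunk n).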
5090 * The only requirement is that it can hold at least two bufwads. 5091 * 5092 * Normally, we write the bufwad to each of these locations. 5093 * However, free_percent of the time we instead write zeroes to 5094 * packobj and perform a dmu_free_range() on bigobj. By comparing 5095 * bigobj to packobj, we can verify that the DMU is correctly 5096 * tracking which parts of an object are allocated and free, 5097 * and that the contents of the allocated blocks are correct. 5098 */ 5099 5100 /* 5101 * Read the directory info. If it's the first time, set things up. 5102 */ 5103 ztest_od_init(od, id, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0, chunksize); 5104 ztest_od_init(od + 1, id, FTAG, 1, DMU_OT_UINT64_OTHER, 0, 0, 5105 chunksize); 5106 5107 if (ztest_object_init(zd, od, size, B_FALSE) != 0) { 5108 umem_free(od, size); 5109 return; 5110 } 5111 5112 bigobj = od[0].od_object; 5113 packobj = od[1].od_object; 5114 chunksize = od[0].od_gen; 5115 ASSERT3U(chunksize, ==, od[1].od_gen); 5116 5117 /* 5118 * Prefetch a random chunk of the big object. 5119 * Our aim here is to get some async reads in flight 5120 * for blocks that we may free below; the DMU should 5121 * handle this race correctly. 5122 */ 5123 n = ztest_random(regions) * stride + ztest_random(width); 5124 s = 1 + ztest_random(2 * width - 1); 5125 dmu_prefetch(os, bigobj, 0, n * chunksize, s * chunksize, 5126 ZIO_PRIORITY_SYNC_READ); 5127 5128 /* 5129 * Pick a random index and compute the offsets into packobj and bigobj. 5130 */ 5131 n = ztest_random(regions) * stride + ztest_random(width); 5132 s = 1 + ztest_random(width - 1); 5133 5134 packoff = n * sizeof (bufwad_t); 5135 packsize = s * sizeof (bufwad_t); 5136 5137 bigoff = n * chunksize; 5138 bigsize = s * chunksize; 5139 5140 packbuf = umem_alloc(packsize, UMEM_NOFAIL); 5141 bigbuf = umem_alloc(bigsize, UMEM_NOFAIL); 5142 5143 /* 5144 * free_percent of the time, free a range of bigobj rather than 5145 * overwriting it. 5146 */ 5147 freeit = (ztest_random(100) < free_percent); 5148 5149 /* 5150 * Read the current contents of our objects. 5151 */ 5152 error = dmu_read(os, packobj, packoff, packsize, packbuf, 5153 dmu_read_flags); 5154 ASSERT0(error); 5155 error = dmu_read(os, bigobj, bigoff, bigsize, bigbuf, 5156 dmu_read_flags); 5157 ASSERT0(error); 5158 5159 /* 5160 * Get a tx for the mods to both packobj and bigobj. 5161 */ 5162 tx = dmu_tx_create(os); 5163 5164 dmu_tx_hold_write(tx, packobj, packoff, packsize); 5165 5166 if (freeit) 5167 dmu_tx_hold_free(tx, bigobj, bigoff, bigsize); 5168 else 5169 dmu_tx_hold_write(tx, bigobj, bigoff, bigsize); 5170 5171 /* This accounts for setting the checksum/compression. */ 5172 dmu_tx_hold_bonus(tx, bigobj); 5173 5174 txg = ztest_tx_assign(tx, DMU_TX_MIGHTWAIT, FTAG); 5175 if (txg == 0) { 5176 umem_free(packbuf, packsize); 5177 umem_free(bigbuf, bigsize); 5178 umem_free(od, size); 5179 return; 5180 } 5181 5182 enum zio_checksum cksum; 5183 do { 5184 cksum = (enum zio_checksum) 5185 ztest_random_dsl_prop(ZFS_PROP_CHECKSUM); 5186 } while (cksum >= ZIO_CHECKSUM_LEGACY_FUNCTIONS); 5187 dmu_object_set_checksum(os, bigobj, cksum, tx); 5188 5189 enum zio_compress comp; 5190 do { 5191 comp = (enum zio_compress) 5192 ztest_random_dsl_prop(ZFS_PROP_COMPRESSION); 5193 } while (comp >= ZIO_COMPRESS_LEGACY_FUNCTIONS); 5194 dmu_object_set_compress(os, bigobj, comp, tx); 5195 5196 /* 5197 * For each index from n to n + s, verify that the existing bufwad 5198 * in packobj matches the bufwads at the head and tail of the 5199 * corresponding chunk in bigobj. 
Then update all three bufwads 5200 * with the new values we want to write out. 5201 */ 5202 for (i = 0; i < s; i++) { 5203 /* LINTED */ 5204 pack = (bufwad_t *)((char *)packbuf + i * sizeof (bufwad_t)); 5205 /* LINTED */ 5206 bigH = (bufwad_t *)((char *)bigbuf + i * chunksize); 5207 /* LINTED */ 5208 bigT = (bufwad_t *)((char *)bigH + chunksize) - 1; 5209 5210 ASSERT3U((uintptr_t)bigH - (uintptr_t)bigbuf, <, bigsize); 5211 ASSERT3U((uintptr_t)bigT - (uintptr_t)bigbuf, <, bigsize); 5212 5213 if (pack->bw_txg > txg) 5214 fatal(B_FALSE, 5215 "future leak: got %"PRIx64", open txg is %"PRIx64"", 5216 pack->bw_txg, txg); 5217 5218 if (pack->bw_data != 0 && pack->bw_index != n + i) 5219 fatal(B_FALSE, "wrong index: " 5220 "got %"PRIx64", wanted %"PRIx64"+%"PRIx64"", 5221 pack->bw_index, n, i); 5222 5223 if (memcmp(pack, bigH, sizeof (bufwad_t)) != 0) 5224 fatal(B_FALSE, "pack/bigH mismatch in %p/%p", 5225 pack, bigH); 5226 5227 if (memcmp(pack, bigT, sizeof (bufwad_t)) != 0) 5228 fatal(B_FALSE, "pack/bigT mismatch in %p/%p", 5229 pack, bigT); 5230 5231 if (freeit) { 5232 memset(pack, 0, sizeof (bufwad_t)); 5233 } else { 5234 pack->bw_index = n + i; 5235 pack->bw_txg = txg; 5236 pack->bw_data = 1 + ztest_random(-2ULL); 5237 } 5238 *bigH = *pack; 5239 *bigT = *pack; 5240 } 5241 5242 /* 5243 * We've verified all the old bufwads, and made new ones. 5244 * Now write them out. 5245 */ 5246 dmu_write(os, packobj, packoff, packsize, packbuf, tx); 5247 5248 if (freeit) { 5249 if (ztest_opts.zo_verbose >= 7) { 5250 (void) printf("freeing offset %"PRIx64" size %"PRIx64"" 5251 " txg %"PRIx64"\n", 5252 bigoff, bigsize, txg); 5253 } 5254 VERIFY0(dmu_free_range(os, bigobj, bigoff, bigsize, tx)); 5255 } else { 5256 if (ztest_opts.zo_verbose >= 7) { 5257 (void) printf("writing offset %"PRIx64" size %"PRIx64"" 5258 " txg %"PRIx64"\n", 5259 bigoff, bigsize, txg); 5260 } 5261 dmu_write(os, bigobj, bigoff, bigsize, bigbuf, tx); 5262 } 5263 5264 dmu_tx_commit(tx); 5265 5266 /* 5267 * Sanity check the stuff we just wrote. 5268 */ 5269 { 5270 void *packcheck = umem_alloc(packsize, UMEM_NOFAIL); 5271 void *bigcheck = umem_alloc(bigsize, UMEM_NOFAIL); 5272 5273 VERIFY0(dmu_read(os, packobj, packoff, 5274 packsize, packcheck, dmu_read_flags)); 5275 VERIFY0(dmu_read(os, bigobj, bigoff, 5276 bigsize, bigcheck, dmu_read_flags)); 5277 5278 ASSERT0(memcmp(packbuf, packcheck, packsize)); 5279 ASSERT0(memcmp(bigbuf, bigcheck, bigsize)); 5280 5281 umem_free(packcheck, packsize); 5282 umem_free(bigcheck, bigsize); 5283 } 5284 5285 umem_free(packbuf, packsize); 5286 umem_free(bigbuf, bigsize); 5287 umem_free(od, size); 5288 } 5289 5290 static void 5291 compare_and_update_pbbufs(uint64_t s, bufwad_t *packbuf, bufwad_t *bigbuf, 5292 uint64_t bigsize, uint64_t n, uint64_t chunksize, uint64_t txg) 5293 { 5294 uint64_t i; 5295 bufwad_t *pack; 5296 bufwad_t *bigH; 5297 bufwad_t *bigT; 5298 5299 /* 5300 * For each index from n to n + s, verify that the existing bufwad 5301 * in packobj matches the bufwads at the head and tail of the 5302 * corresponding chunk in bigobj. Then update all three bufwads 5303 * with the new values we want to write out. 
5304 */ 5305 for (i = 0; i < s; i++) { 5306 /* LINTED */ 5307 pack = (bufwad_t *)((char *)packbuf + i * sizeof (bufwad_t)); 5308 /* LINTED */ 5309 bigH = (bufwad_t *)((char *)bigbuf + i * chunksize); 5310 /* LINTED */ 5311 bigT = (bufwad_t *)((char *)bigH + chunksize) - 1; 5312 5313 ASSERT3U((uintptr_t)bigH - (uintptr_t)bigbuf, <, bigsize); 5314 ASSERT3U((uintptr_t)bigT - (uintptr_t)bigbuf, <, bigsize); 5315 5316 if (pack->bw_txg > txg) 5317 fatal(B_FALSE, 5318 "future leak: got %"PRIx64", open txg is %"PRIx64"", 5319 pack->bw_txg, txg); 5320 5321 if (pack->bw_data != 0 && pack->bw_index != n + i) 5322 fatal(B_FALSE, "wrong index: " 5323 "got %"PRIx64", wanted %"PRIx64"+%"PRIx64"", 5324 pack->bw_index, n, i); 5325 5326 if (memcmp(pack, bigH, sizeof (bufwad_t)) != 0) 5327 fatal(B_FALSE, "pack/bigH mismatch in %p/%p", 5328 pack, bigH); 5329 5330 if (memcmp(pack, bigT, sizeof (bufwad_t)) != 0) 5331 fatal(B_FALSE, "pack/bigT mismatch in %p/%p", 5332 pack, bigT); 5333 5334 pack->bw_index = n + i; 5335 pack->bw_txg = txg; 5336 pack->bw_data = 1 + ztest_random(-2ULL); 5337 5338 *bigH = *pack; 5339 *bigT = *pack; 5340 } 5341 } 5342 5343 #undef OD_ARRAY_SIZE 5344 #define OD_ARRAY_SIZE 2 5345 5346 void 5347 ztest_dmu_read_write_zcopy(ztest_ds_t *zd, uint64_t id) 5348 { 5349 objset_t *os = zd->zd_os; 5350 ztest_od_t *od; 5351 dmu_tx_t *tx; 5352 uint64_t i; 5353 int error; 5354 int size; 5355 uint64_t n, s, txg; 5356 bufwad_t *packbuf, *bigbuf; 5357 uint64_t packobj, packoff, packsize, bigobj, bigoff, bigsize; 5358 uint64_t blocksize = ztest_random_blocksize(); 5359 uint64_t chunksize = blocksize; 5360 uint64_t regions = 997; 5361 uint64_t stride = 123456789ULL; 5362 uint64_t width = 9; 5363 dmu_buf_t *bonus_db; 5364 arc_buf_t **bigbuf_arcbufs; 5365 dmu_object_info_t doi; 5366 uint32_t dmu_read_flags = DMU_READ_PREFETCH; 5367 5368 /* 5369 * We will randomly set when to do O_DIRECT on a read. 5370 */ 5371 if (ztest_random(4) == 0) 5372 dmu_read_flags |= DMU_DIRECTIO; 5373 5374 size = sizeof (ztest_od_t) * OD_ARRAY_SIZE; 5375 od = umem_alloc(size, UMEM_NOFAIL); 5376 5377 /* 5378 * This test uses two objects, packobj and bigobj, that are always 5379 * updated together (i.e. in the same tx) so that their contents are 5380 * in sync and can be compared. Their contents relate to each other 5381 * in a simple way: packobj is a dense array of 'bufwad' structures, 5382 * while bigobj is a sparse array of the same bufwads. Specifically, 5383 * for any index n, there are three bufwads that should be identical: 5384 * 5385 * packobj, at offset n * sizeof (bufwad_t) 5386 * bigobj, at the head of the nth chunk 5387 * bigobj, at the tail of the nth chunk 5388 * 5389 * The chunk size is set equal to bigobj block size so that 5390 * dmu_assign_arcbuf_by_dbuf() can be tested for object updates. 5391 */ 5392 5393 /* 5394 * Read the directory info. If it's the first time, set things up. 
5395 */ 5396 ztest_od_init(od, id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, 0, 0); 5397 ztest_od_init(od + 1, id, FTAG, 1, DMU_OT_UINT64_OTHER, 0, 0, 5398 chunksize); 5399 5400 5401 if (ztest_object_init(zd, od, size, B_FALSE) != 0) { 5402 umem_free(od, size); 5403 return; 5404 } 5405 5406 bigobj = od[0].od_object; 5407 packobj = od[1].od_object; 5408 blocksize = od[0].od_blocksize; 5409 chunksize = blocksize; 5410 ASSERT3U(chunksize, ==, od[1].od_gen); 5411 5412 VERIFY0(dmu_object_info(os, bigobj, &doi)); 5413 VERIFY(ISP2(doi.doi_data_block_size)); 5414 VERIFY3U(chunksize, ==, doi.doi_data_block_size); 5415 VERIFY3U(chunksize, >=, 2 * sizeof (bufwad_t)); 5416 5417 /* 5418 * Pick a random index and compute the offsets into packobj and bigobj. 5419 */ 5420 n = ztest_random(regions) * stride + ztest_random(width); 5421 s = 1 + ztest_random(width - 1); 5422 5423 packoff = n * sizeof (bufwad_t); 5424 packsize = s * sizeof (bufwad_t); 5425 5426 bigoff = n * chunksize; 5427 bigsize = s * chunksize; 5428 5429 packbuf = umem_zalloc(packsize, UMEM_NOFAIL); 5430 bigbuf = umem_zalloc(bigsize, UMEM_NOFAIL); 5431 5432 VERIFY0(dmu_bonus_hold(os, bigobj, FTAG, &bonus_db)); 5433 5434 bigbuf_arcbufs = umem_zalloc(2 * s * sizeof (arc_buf_t *), UMEM_NOFAIL); 5435 5436 /* 5437 * Iteration 0 test zcopy for DB_UNCACHED dbufs. 5438 * Iteration 1 test zcopy to already referenced dbufs. 5439 * Iteration 2 test zcopy to dirty dbuf in the same txg. 5440 * Iteration 3 test zcopy to dbuf dirty in previous txg. 5441 * Iteration 4 test zcopy when dbuf is no longer dirty. 5442 * Iteration 5 test zcopy when it can't be done. 5443 * Iteration 6 one more zcopy write. 5444 */ 5445 for (i = 0; i < 7; i++) { 5446 uint64_t j; 5447 uint64_t off; 5448 5449 /* 5450 * In iteration 5 (i == 5) use arcbufs 5451 * that don't match bigobj blksz to test 5452 * dmu_assign_arcbuf_by_dbuf() when it can't directly 5453 * assign an arcbuf to a dbuf. 5454 */ 5455 for (j = 0; j < s; j++) { 5456 if (i != 5 || chunksize < (SPA_MINBLOCKSIZE * 2)) { 5457 bigbuf_arcbufs[j] = 5458 dmu_request_arcbuf(bonus_db, chunksize); 5459 } else { 5460 bigbuf_arcbufs[2 * j] = 5461 dmu_request_arcbuf(bonus_db, chunksize / 2); 5462 bigbuf_arcbufs[2 * j + 1] = 5463 dmu_request_arcbuf(bonus_db, chunksize / 2); 5464 } 5465 } 5466 5467 /* 5468 * Get a tx for the mods to both packobj and bigobj. 5469 */ 5470 tx = dmu_tx_create(os); 5471 5472 dmu_tx_hold_write(tx, packobj, packoff, packsize); 5473 dmu_tx_hold_write(tx, bigobj, bigoff, bigsize); 5474 5475 txg = ztest_tx_assign(tx, DMU_TX_MIGHTWAIT, FTAG); 5476 if (txg == 0) { 5477 umem_free(packbuf, packsize); 5478 umem_free(bigbuf, bigsize); 5479 for (j = 0; j < s; j++) { 5480 if (i != 5 || 5481 chunksize < (SPA_MINBLOCKSIZE * 2)) { 5482 dmu_return_arcbuf(bigbuf_arcbufs[j]); 5483 } else { 5484 dmu_return_arcbuf( 5485 bigbuf_arcbufs[2 * j]); 5486 dmu_return_arcbuf( 5487 bigbuf_arcbufs[2 * j + 1]); 5488 } 5489 } 5490 umem_free(bigbuf_arcbufs, 2 * s * sizeof (arc_buf_t *)); 5491 umem_free(od, size); 5492 dmu_buf_rele(bonus_db, FTAG); 5493 return; 5494 } 5495 5496 /* 5497 * 50% of the time don't read objects in the 1st iteration to 5498 * test dmu_assign_arcbuf_by_dbuf() for the case when there are 5499 * no existing dbufs for the specified offsets. 
5500 */ 5501 if (i != 0 || ztest_random(2) != 0) { 5502 error = dmu_read(os, packobj, packoff, 5503 packsize, packbuf, dmu_read_flags); 5504 ASSERT0(error); 5505 error = dmu_read(os, bigobj, bigoff, bigsize, 5506 bigbuf, dmu_read_flags); 5507 ASSERT0(error); 5508 } 5509 compare_and_update_pbbufs(s, packbuf, bigbuf, bigsize, 5510 n, chunksize, txg); 5511 5512 /* 5513 * We've verified all the old bufwads, and made new ones. 5514 * Now write them out. 5515 */ 5516 dmu_write(os, packobj, packoff, packsize, packbuf, tx); 5517 if (ztest_opts.zo_verbose >= 7) { 5518 (void) printf("writing offset %"PRIx64" size %"PRIx64"" 5519 " txg %"PRIx64"\n", 5520 bigoff, bigsize, txg); 5521 } 5522 for (off = bigoff, j = 0; j < s; j++, off += chunksize) { 5523 dmu_buf_t *dbt; 5524 if (i != 5 || chunksize < (SPA_MINBLOCKSIZE * 2)) { 5525 memcpy(bigbuf_arcbufs[j]->b_data, 5526 (caddr_t)bigbuf + (off - bigoff), 5527 chunksize); 5528 } else { 5529 memcpy(bigbuf_arcbufs[2 * j]->b_data, 5530 (caddr_t)bigbuf + (off - bigoff), 5531 chunksize / 2); 5532 memcpy(bigbuf_arcbufs[2 * j + 1]->b_data, 5533 (caddr_t)bigbuf + (off - bigoff) + 5534 chunksize / 2, 5535 chunksize / 2); 5536 } 5537 5538 if (i == 1) { 5539 VERIFY(dmu_buf_hold(os, bigobj, off, 5540 FTAG, &dbt, DMU_READ_NO_PREFETCH) == 0); 5541 } 5542 if (i != 5 || chunksize < (SPA_MINBLOCKSIZE * 2)) { 5543 VERIFY0(dmu_assign_arcbuf_by_dbuf(bonus_db, 5544 off, bigbuf_arcbufs[j], tx, 0)); 5545 } else { 5546 VERIFY0(dmu_assign_arcbuf_by_dbuf(bonus_db, 5547 off, bigbuf_arcbufs[2 * j], tx, 0)); 5548 VERIFY0(dmu_assign_arcbuf_by_dbuf(bonus_db, 5549 off + chunksize / 2, 5550 bigbuf_arcbufs[2 * j + 1], tx, 0)); 5551 } 5552 if (i == 1) { 5553 dmu_buf_rele(dbt, FTAG); 5554 } 5555 } 5556 dmu_tx_commit(tx); 5557 5558 /* 5559 * Sanity check the stuff we just wrote. 5560 */ 5561 { 5562 void *packcheck = umem_alloc(packsize, UMEM_NOFAIL); 5563 void *bigcheck = umem_alloc(bigsize, UMEM_NOFAIL); 5564 5565 VERIFY0(dmu_read(os, packobj, packoff, 5566 packsize, packcheck, dmu_read_flags)); 5567 VERIFY0(dmu_read(os, bigobj, bigoff, 5568 bigsize, bigcheck, dmu_read_flags)); 5569 5570 ASSERT0(memcmp(packbuf, packcheck, packsize)); 5571 ASSERT0(memcmp(bigbuf, bigcheck, bigsize)); 5572 5573 umem_free(packcheck, packsize); 5574 umem_free(bigcheck, bigsize); 5575 } 5576 if (i == 2) { 5577 txg_wait_open(dmu_objset_pool(os), 0, B_TRUE); 5578 } else if (i == 3) { 5579 txg_wait_synced(dmu_objset_pool(os), 0); 5580 } 5581 } 5582 5583 dmu_buf_rele(bonus_db, FTAG); 5584 umem_free(packbuf, packsize); 5585 umem_free(bigbuf, bigsize); 5586 umem_free(bigbuf_arcbufs, 2 * s * sizeof (arc_buf_t *)); 5587 umem_free(od, size); 5588 } 5589 5590 void 5591 ztest_dmu_write_parallel(ztest_ds_t *zd, uint64_t id) 5592 { 5593 (void) id; 5594 ztest_od_t *od; 5595 5596 od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL); 5597 uint64_t offset = (1ULL << (ztest_random(20) + 43)) + 5598 (ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT); 5599 5600 /* 5601 * Have multiple threads write to large offsets in an object 5602 * to verify that parallel writes to an object -- even to the 5603 * same blocks within the object -- doesn't cause any trouble. 
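* The offsets used here are at least 1ULL << 43 (8 TiB into the object), so the shared object stays sparse no matter how many threads keep writing to it.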
5604 */ 5605 ztest_od_init(od, ID_PARALLEL, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0, 0); 5606 5607 if (ztest_object_init(zd, od, sizeof (ztest_od_t), B_FALSE) != 0) 5608 return; 5609 5610 while (ztest_random(10) != 0) 5611 ztest_io(zd, od->od_object, offset); 5612 5613 umem_free(od, sizeof (ztest_od_t)); 5614 } 5615 5616 void 5617 ztest_dmu_prealloc(ztest_ds_t *zd, uint64_t id) 5618 { 5619 ztest_od_t *od; 5620 uint64_t offset = (1ULL << (ztest_random(4) + SPA_MAXBLOCKSHIFT)) + 5621 (ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT); 5622 uint64_t count = ztest_random(20) + 1; 5623 uint64_t blocksize = ztest_random_blocksize(); 5624 void *data; 5625 5626 od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL); 5627 5628 ztest_od_init(od, id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, 0, 0); 5629 5630 if (ztest_object_init(zd, od, sizeof (ztest_od_t), 5631 !ztest_random(2)) != 0) { 5632 umem_free(od, sizeof (ztest_od_t)); 5633 return; 5634 } 5635 5636 if (ztest_truncate(zd, od->od_object, offset, count * blocksize) != 0) { 5637 umem_free(od, sizeof (ztest_od_t)); 5638 return; 5639 } 5640 5641 ztest_prealloc(zd, od->od_object, offset, count * blocksize); 5642 5643 data = umem_zalloc(blocksize, UMEM_NOFAIL); 5644 5645 while (ztest_random(count) != 0) { 5646 uint64_t randoff = offset + (ztest_random(count) * blocksize); 5647 if (ztest_write(zd, od->od_object, randoff, blocksize, 5648 data) != 0) 5649 break; 5650 while (ztest_random(4) != 0) 5651 ztest_io(zd, od->od_object, randoff); 5652 } 5653 5654 umem_free(data, blocksize); 5655 umem_free(od, sizeof (ztest_od_t)); 5656 } 5657 5658 /* 5659 * Verify that zap_{create,destroy,add,remove,update} work as expected. 5660 */ 5661 #define ZTEST_ZAP_MIN_INTS 1 5662 #define ZTEST_ZAP_MAX_INTS 4 5663 #define ZTEST_ZAP_MAX_PROPS 1000 5664 5665 void 5666 ztest_zap(ztest_ds_t *zd, uint64_t id) 5667 { 5668 objset_t *os = zd->zd_os; 5669 ztest_od_t *od; 5670 uint64_t object; 5671 uint64_t txg, last_txg; 5672 uint64_t value[ZTEST_ZAP_MAX_INTS]; 5673 uint64_t zl_ints, zl_intsize, prop; 5674 int i, ints; 5675 dmu_tx_t *tx; 5676 char propname[100], txgname[100]; 5677 int error; 5678 const char *const hc[2] = { "s.acl.h", ".s.open.h.hyLZlg" }; 5679 5680 od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL); 5681 ztest_od_init(od, id, FTAG, 0, DMU_OT_ZAP_OTHER, 0, 0, 0); 5682 5683 if (ztest_object_init(zd, od, sizeof (ztest_od_t), 5684 !ztest_random(2)) != 0) 5685 goto out; 5686 5687 object = od->od_object; 5688 5689 /* 5690 * Generate a known hash collision, and verify that 5691 * we can lookup and remove both entries. 5692 */ 5693 tx = dmu_tx_create(os); 5694 dmu_tx_hold_zap(tx, object, B_TRUE, NULL); 5695 txg = ztest_tx_assign(tx, DMU_TX_MIGHTWAIT, FTAG); 5696 if (txg == 0) 5697 goto out; 5698 for (i = 0; i < 2; i++) { 5699 value[i] = i; 5700 VERIFY0(zap_add(os, object, hc[i], sizeof (uint64_t), 5701 1, &value[i], tx)); 5702 } 5703 for (i = 0; i < 2; i++) { 5704 VERIFY3U(EEXIST, ==, zap_add(os, object, hc[i], 5705 sizeof (uint64_t), 1, &value[i], tx)); 5706 VERIFY0( 5707 zap_length(os, object, hc[i], &zl_intsize, &zl_ints)); 5708 ASSERT3U(zl_intsize, ==, sizeof (uint64_t)); 5709 ASSERT3U(zl_ints, ==, 1); 5710 } 5711 for (i = 0; i < 2; i++) { 5712 VERIFY0(zap_remove(os, object, hc[i], tx)); 5713 } 5714 dmu_tx_commit(tx); 5715 5716 /* 5717 * Generate a bunch of random entries. 
5718 */ 5719 ints = MAX(ZTEST_ZAP_MIN_INTS, object % ZTEST_ZAP_MAX_INTS); 5720 5721 prop = ztest_random(ZTEST_ZAP_MAX_PROPS); 5722 (void) sprintf(propname, "prop_%"PRIu64"", prop); 5723 (void) sprintf(txgname, "txg_%"PRIu64"", prop); 5724 memset(value, 0, sizeof (value)); 5725 last_txg = 0; 5726 5727 /* 5728 * If these zap entries already exist, validate their contents. 5729 */ 5730 error = zap_length(os, object, txgname, &zl_intsize, &zl_ints); 5731 if (error == 0) { 5732 ASSERT3U(zl_intsize, ==, sizeof (uint64_t)); 5733 ASSERT3U(zl_ints, ==, 1); 5734 5735 VERIFY0(zap_lookup(os, object, txgname, zl_intsize, 5736 zl_ints, &last_txg)); 5737 5738 VERIFY0(zap_length(os, object, propname, &zl_intsize, 5739 &zl_ints)); 5740 5741 ASSERT3U(zl_intsize, ==, sizeof (uint64_t)); 5742 ASSERT3U(zl_ints, ==, ints); 5743 5744 VERIFY0(zap_lookup(os, object, propname, zl_intsize, 5745 zl_ints, value)); 5746 5747 for (i = 0; i < ints; i++) { 5748 ASSERT3U(value[i], ==, last_txg + object + i); 5749 } 5750 } else { 5751 ASSERT3U(error, ==, ENOENT); 5752 } 5753 5754 /* 5755 * Atomically update two entries in our zap object. 5756 * The first is named txg_%llu, and contains the txg 5757 * in which the property was last updated. The second 5758 * is named prop_%llu, and the nth element of its value 5759 * should be txg + object + n. 5760 */ 5761 tx = dmu_tx_create(os); 5762 dmu_tx_hold_zap(tx, object, B_TRUE, NULL); 5763 txg = ztest_tx_assign(tx, DMU_TX_MIGHTWAIT, FTAG); 5764 if (txg == 0) 5765 goto out; 5766 5767 if (last_txg > txg) 5768 fatal(B_FALSE, "zap future leak: old %"PRIu64" new %"PRIu64"", 5769 last_txg, txg); 5770 5771 for (i = 0; i < ints; i++) 5772 value[i] = txg + object + i; 5773 5774 VERIFY0(zap_update(os, object, txgname, sizeof (uint64_t), 5775 1, &txg, tx)); 5776 VERIFY0(zap_update(os, object, propname, sizeof (uint64_t), 5777 ints, value, tx)); 5778 5779 dmu_tx_commit(tx); 5780 5781 /* 5782 * Remove a random pair of entries. 5783 */ 5784 prop = ztest_random(ZTEST_ZAP_MAX_PROPS); 5785 (void) sprintf(propname, "prop_%"PRIu64"", prop); 5786 (void) sprintf(txgname, "txg_%"PRIu64"", prop); 5787 5788 error = zap_length(os, object, txgname, &zl_intsize, &zl_ints); 5789 5790 if (error == ENOENT) 5791 goto out; 5792 5793 ASSERT0(error); 5794 5795 tx = dmu_tx_create(os); 5796 dmu_tx_hold_zap(tx, object, B_TRUE, NULL); 5797 txg = ztest_tx_assign(tx, DMU_TX_MIGHTWAIT, FTAG); 5798 if (txg == 0) 5799 goto out; 5800 VERIFY0(zap_remove(os, object, txgname, tx)); 5801 VERIFY0(zap_remove(os, object, propname, tx)); 5802 dmu_tx_commit(tx); 5803 out: 5804 umem_free(od, sizeof (ztest_od_t)); 5805 } 5806 5807 /* 5808 * Test case to test the upgrading of a microzap to fatzap. 5809 */ 5810 void 5811 ztest_fzap(ztest_ds_t *zd, uint64_t id) 5812 { 5813 objset_t *os = zd->zd_os; 5814 ztest_od_t *od; 5815 uint64_t object, txg, value; 5816 5817 od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL); 5818 ztest_od_init(od, id, FTAG, 0, DMU_OT_ZAP_OTHER, 0, 0, 0); 5819 5820 if (ztest_object_init(zd, od, sizeof (ztest_od_t), 5821 !ztest_random(2)) != 0) 5822 goto out; 5823 object = od->od_object; 5824 5825 /* 5826 * Add entries to this ZAP and make sure it spills over 5827 * and gets upgraded to a fatzap. Also, since we are adding 5828 * 2050 entries we should see ptrtbl growth and leaf-block split. 
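* (A microzap is limited to a single block, so a few thousand entries is more than enough to force the upgrade.)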
5829 */ 5830 for (value = 0; value < 2050; value++) { 5831 char name[ZFS_MAX_DATASET_NAME_LEN]; 5832 dmu_tx_t *tx; 5833 int error; 5834 5835 (void) snprintf(name, sizeof (name), "fzap-%"PRIu64"-%"PRIu64"", 5836 id, value); 5837 5838 tx = dmu_tx_create(os); 5839 dmu_tx_hold_zap(tx, object, B_TRUE, name); 5840 txg = ztest_tx_assign(tx, DMU_TX_MIGHTWAIT, FTAG); 5841 if (txg == 0) 5842 goto out; 5843 error = zap_add(os, object, name, sizeof (uint64_t), 1, 5844 &value, tx); 5845 ASSERT(error == 0 || error == EEXIST); 5846 dmu_tx_commit(tx); 5847 } 5848 out: 5849 umem_free(od, sizeof (ztest_od_t)); 5850 } 5851 5852 void 5853 ztest_zap_parallel(ztest_ds_t *zd, uint64_t id) 5854 { 5855 (void) id; 5856 objset_t *os = zd->zd_os; 5857 ztest_od_t *od; 5858 uint64_t txg, object, count, wsize, wc, zl_wsize, zl_wc; 5859 dmu_tx_t *tx; 5860 int i, namelen, error; 5861 int micro = ztest_random(2); 5862 char name[20], string_value[20]; 5863 void *data; 5864 5865 od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL); 5866 ztest_od_init(od, ID_PARALLEL, FTAG, micro, DMU_OT_ZAP_OTHER, 0, 0, 0); 5867 5868 if (ztest_object_init(zd, od, sizeof (ztest_od_t), B_FALSE) != 0) { 5869 umem_free(od, sizeof (ztest_od_t)); 5870 return; 5871 } 5872 5873 object = od->od_object; 5874 5875 /* 5876 * Generate a random name of the form 'xxx.....' where each 5877 * x is a random printable character and the dots are dots. 5878 * There are 94 such characters, and the name length goes from 5879 * 6 to 20, so there are 94^3 * 15 = 12,458,760 possible names. 5880 */ 5881 namelen = ztest_random(sizeof (name) - 5) + 5 + 1; 5882 5883 for (i = 0; i < 3; i++) 5884 name[i] = '!' + ztest_random('~' - '!' + 1); 5885 for (; i < namelen - 1; i++) 5886 name[i] = '.'; 5887 name[i] = '\0'; 5888 5889 if ((namelen & 1) || micro) { 5890 wsize = sizeof (txg); 5891 wc = 1; 5892 data = &txg; 5893 } else { 5894 wsize = 1; 5895 wc = namelen; 5896 data = string_value; 5897 } 5898 5899 count = -1ULL; 5900 VERIFY0(zap_count(os, object, &count)); 5901 ASSERT3S(count, !=, -1ULL); 5902 5903 /* 5904 * Select an operation: length, lookup, add, update, remove. 
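* Operations 0 and 1 (length, lookup) are read-only and run without a transaction; operations 2-4 modify the ZAP and use the transaction assigned below.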
5905 */ 5906 i = ztest_random(5); 5907 5908 if (i >= 2) { 5909 tx = dmu_tx_create(os); 5910 dmu_tx_hold_zap(tx, object, B_TRUE, NULL); 5911 txg = ztest_tx_assign(tx, DMU_TX_MIGHTWAIT, FTAG); 5912 if (txg == 0) { 5913 umem_free(od, sizeof (ztest_od_t)); 5914 return; 5915 } 5916 memcpy(string_value, name, namelen); 5917 } else { 5918 tx = NULL; 5919 txg = 0; 5920 memset(string_value, 0, namelen); 5921 } 5922 5923 switch (i) { 5924 5925 case 0: 5926 error = zap_length(os, object, name, &zl_wsize, &zl_wc); 5927 if (error == 0) { 5928 ASSERT3U(wsize, ==, zl_wsize); 5929 ASSERT3U(wc, ==, zl_wc); 5930 } else { 5931 ASSERT3U(error, ==, ENOENT); 5932 } 5933 break; 5934 5935 case 1: 5936 error = zap_lookup(os, object, name, wsize, wc, data); 5937 if (error == 0) { 5938 if (data == string_value && 5939 memcmp(name, data, namelen) != 0) 5940 fatal(B_FALSE, "name '%s' != val '%s' len %d", 5941 name, (char *)data, namelen); 5942 } else { 5943 ASSERT3U(error, ==, ENOENT); 5944 } 5945 break; 5946 5947 case 2: 5948 error = zap_add(os, object, name, wsize, wc, data, tx); 5949 ASSERT(error == 0 || error == EEXIST); 5950 break; 5951 5952 case 3: 5953 VERIFY0(zap_update(os, object, name, wsize, wc, data, tx)); 5954 break; 5955 5956 case 4: 5957 error = zap_remove(os, object, name, tx); 5958 ASSERT(error == 0 || error == ENOENT); 5959 break; 5960 } 5961 5962 if (tx != NULL) 5963 dmu_tx_commit(tx); 5964 5965 umem_free(od, sizeof (ztest_od_t)); 5966 } 5967 5968 /* 5969 * Commit callback data. 5970 */ 5971 typedef struct ztest_cb_data { 5972 list_node_t zcd_node; 5973 uint64_t zcd_txg; 5974 int zcd_expected_err; 5975 boolean_t zcd_added; 5976 boolean_t zcd_called; 5977 spa_t *zcd_spa; 5978 } ztest_cb_data_t; 5979 5980 /* This is the actual commit callback function */ 5981 static void 5982 ztest_commit_callback(void *arg, int error) 5983 { 5984 ztest_cb_data_t *data = arg; 5985 uint64_t synced_txg; 5986 5987 VERIFY3P(data, !=, NULL); 5988 VERIFY3S(data->zcd_expected_err, ==, error); 5989 VERIFY(!data->zcd_called); 5990 5991 synced_txg = spa_last_synced_txg(data->zcd_spa); 5992 if (data->zcd_txg > synced_txg) 5993 fatal(B_FALSE, 5994 "commit callback of txg %"PRIu64" called prematurely, " 5995 "last synced txg = %"PRIu64"\n", 5996 data->zcd_txg, synced_txg); 5997 5998 data->zcd_called = B_TRUE; 5999 6000 if (error == ECANCELED) { 6001 ASSERT0(data->zcd_txg); 6002 ASSERT(!data->zcd_added); 6003 6004 /* 6005 * The private callback data should be destroyed here, but 6006 * since we are going to check the zcd_called field after 6007 * dmu_tx_abort(), we will destroy it there. 
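 * (Callbacks on an aborted transaction are invoked with ECANCELED from
 * within dmu_tx_abort() itself, so the caller can still inspect
 * zcd_called afterwards before freeing the data.)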
6008 */ 6009 return; 6010 } 6011 6012 ASSERT(data->zcd_added); 6013 ASSERT3U(data->zcd_txg, !=, 0); 6014
6015 (void) mutex_enter(&zcl.zcl_callbacks_lock); 6016 6017 /* See if this cb was called more quickly */
6018 if ((synced_txg - data->zcd_txg) < zc_min_txg_delay) 6019 zc_min_txg_delay = synced_txg - data->zcd_txg; 6020
6021 /* Remove our callback from the list */ 6022 list_remove(&zcl.zcl_callbacks, data); 6023
6024 (void) mutex_exit(&zcl.zcl_callbacks_lock); 6025 6026 umem_free(data, sizeof (ztest_cb_data_t)); 6027 } 6028
6029 /* Allocate and initialize callback data structure */ 6030 static ztest_cb_data_t *
6031 ztest_create_cb_data(objset_t *os, uint64_t txg) 6032 { 6033 ztest_cb_data_t *cb_data; 6034
6035 cb_data = umem_zalloc(sizeof (ztest_cb_data_t), UMEM_NOFAIL); 6036 6037 cb_data->zcd_txg = txg;
6038 cb_data->zcd_spa = dmu_objset_spa(os); 6039 list_link_init(&cb_data->zcd_node); 6040
6041 return (cb_data); 6042 } 6043 6044 /* 6045 * Commit callback test. 6046 */ 6047 void
6048 ztest_dmu_commit_callbacks(ztest_ds_t *zd, uint64_t id) 6049 { 6050 objset_t *os = zd->zd_os;
6051 ztest_od_t *od; 6052 dmu_tx_t *tx; 6053 ztest_cb_data_t *cb_data[3], *tmp_cb; 6054 uint64_t old_txg, txg;
6055 int i, error = 0; 6056 6057 od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL);
6058 ztest_od_init(od, id, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0, 0); 6059
6060 if (ztest_object_init(zd, od, sizeof (ztest_od_t), B_FALSE) != 0) {
6061 umem_free(od, sizeof (ztest_od_t)); 6062 return; 6063 } 6064 6065 tx = dmu_tx_create(os); 6066
6067 cb_data[0] = ztest_create_cb_data(os, 0); 6068 dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[0]); 6069
6070 dmu_tx_hold_write(tx, od->od_object, 0, sizeof (uint64_t)); 6071
6072 /* Every once in a while, abort the transaction on purpose */ 6073 if (ztest_random(100) == 0) 6074 error = -1; 6075
6076 if (!error) 6077 error = dmu_tx_assign(tx, DMU_TX_NOWAIT); 6078 6079 txg = error ? 0 : dmu_tx_get_txg(tx); 6080
6081 cb_data[0]->zcd_txg = txg; 6082 cb_data[1] = ztest_create_cb_data(os, txg);
6083 dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[1]); 6084 6085 if (error) { 6086 /*
6087 * It's not a strict requirement to call the registered 6088 * callbacks from inside dmu_tx_abort(), but that's what
6089 * is supposed to happen in the current implementation, 6090 * so we will check for that. 6091 */
6092 for (i = 0; i < 2; i++) { 6093 cb_data[i]->zcd_expected_err = ECANCELED; 6094 VERIFY(!cb_data[i]->zcd_called); 6095 } 6096
6097 dmu_tx_abort(tx); 6098 6099 for (i = 0; i < 2; i++) { 6100 VERIFY(cb_data[i]->zcd_called);
6101 umem_free(cb_data[i], sizeof (ztest_cb_data_t)); 6102 } 6103 6104 umem_free(od, sizeof (ztest_od_t)); 6105 return; 6106 } 6107
6108 cb_data[2] = ztest_create_cb_data(os, txg); 6109 dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[2]); 6110
6111 /* 6112 * Read existing data to make sure there isn't a future leak.
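 * (The object's first word always holds the txg of the last write, so
 * reading back a value greater than the txg just assigned would
 * indicate a future leak.)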
6113 */ 6114 VERIFY0(dmu_read(os, od->od_object, 0, sizeof (uint64_t), 6115 &old_txg, DMU_READ_PREFETCH)); 6116
6117 if (old_txg > txg) 6118 fatal(B_FALSE, 6119 "future leak: got %"PRIu64", open txg is %"PRIu64"", 6120 old_txg, txg); 6121
6122 dmu_write(os, od->od_object, 0, sizeof (uint64_t), &txg, tx); 6123
6124 (void) mutex_enter(&zcl.zcl_callbacks_lock); 6125 6126 /*
6127 * Since commit callbacks don't have any ordering requirement and since
6128 * it is theoretically possible for a commit callback to be called
6129 * after an arbitrary amount of time has elapsed since its txg has been
6130 * synced, it is difficult to reliably determine whether a commit
6131 * callback hasn't been called due to high load or due to a flawed
6132 * implementation. 6133 *
6134 * In practice, we will assume that if after a certain number of txgs a
6135 * commit callback hasn't been called, then most likely there's an
6136 * implementation bug. 6137 */ 6138 tmp_cb = list_head(&zcl.zcl_callbacks);
6139 if (tmp_cb != NULL && 6140 tmp_cb->zcd_txg + ZTEST_COMMIT_CB_THRESH < txg) { 6141 fatal(B_FALSE,
6142 "Commit callback threshold exceeded, " 6143 "oldest txg: %"PRIu64", open txg: %"PRIu64"\n",
6144 tmp_cb->zcd_txg, txg); 6145 } 6146 6147 /* 6148 * Let's find the place to insert our callbacks. 6149 *
6150 * Even though the list is ordered by txg, it is possible for the
6151 * insertion point to not be the end because our txg may already be
6152 * quiescing at this point and other callbacks in the open txg
6153 * (from other objsets) may have sneaked in. 6154 */ 6155 tmp_cb = list_tail(&zcl.zcl_callbacks);
6156 while (tmp_cb != NULL && tmp_cb->zcd_txg > txg) 6157 tmp_cb = list_prev(&zcl.zcl_callbacks, tmp_cb); 6158
6159 /* Add the 3 callbacks to the list */ 6160 for (i = 0; i < 3; i++) { 6161 if (tmp_cb == NULL)
6162 list_insert_head(&zcl.zcl_callbacks, cb_data[i]); 6163 else 6164 list_insert_after(&zcl.zcl_callbacks, tmp_cb,
6165 cb_data[i]); 6166 6167 cb_data[i]->zcd_added = B_TRUE; 6168 VERIFY(!cb_data[i]->zcd_called); 6169
6170 tmp_cb = cb_data[i]; 6171 } 6172 6173 zc_cb_counter += 3; 6174
6175 (void) mutex_exit(&zcl.zcl_callbacks_lock); 6176 6177 dmu_tx_commit(tx); 6178
6179 umem_free(od, sizeof (ztest_od_t)); 6180 } 6181 6182 /*
6183 * Visit each object in the dataset. Verify that its properties
6184 * are consistent with what was stored in the block tag when it was created,
6185 * and that its unused bonus buffer space has not been overwritten.
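 * (Objects whose bonus buffer is too small to hold a block tag, or
 * whose tag lacks BT_MAGIC, are skipped rather than treated as errors.)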
6186 */ 6187 void 6188 ztest_verify_dnode_bt(ztest_ds_t *zd, uint64_t id) 6189 { 6190 (void) id; 6191 objset_t *os = zd->zd_os; 6192 uint64_t obj; 6193 int err = 0; 6194 6195 for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE, 0)) { 6196 ztest_block_tag_t *bt = NULL; 6197 dmu_object_info_t doi; 6198 dmu_buf_t *db; 6199 6200 ztest_object_lock(zd, obj, ZTRL_READER); 6201 if (dmu_bonus_hold(os, obj, FTAG, &db) != 0) { 6202 ztest_object_unlock(zd, obj); 6203 continue; 6204 } 6205 6206 dmu_object_info_from_db(db, &doi); 6207 if (doi.doi_bonus_size >= sizeof (*bt)) 6208 bt = ztest_bt_bonus(db); 6209 6210 if (bt && bt->bt_magic == BT_MAGIC) { 6211 ztest_bt_verify(bt, os, obj, doi.doi_dnodesize, 6212 bt->bt_offset, bt->bt_gen, bt->bt_txg, 6213 bt->bt_crtxg); 6214 ztest_verify_unused_bonus(db, bt, obj, os, bt->bt_gen); 6215 } 6216 6217 dmu_buf_rele(db, FTAG); 6218 ztest_object_unlock(zd, obj); 6219 } 6220 } 6221 6222 void 6223 ztest_dsl_prop_get_set(ztest_ds_t *zd, uint64_t id) 6224 { 6225 (void) id; 6226 zfs_prop_t proplist[] = { 6227 ZFS_PROP_CHECKSUM, 6228 ZFS_PROP_COMPRESSION, 6229 ZFS_PROP_COPIES, 6230 ZFS_PROP_DEDUP 6231 }; 6232 6233 (void) pthread_rwlock_rdlock(&ztest_name_lock); 6234 6235 for (int p = 0; p < sizeof (proplist) / sizeof (proplist[0]); p++) { 6236 int error = ztest_dsl_prop_set_uint64(zd->zd_name, proplist[p], 6237 ztest_random_dsl_prop(proplist[p]), (int)ztest_random(2)); 6238 ASSERT(error == 0 || error == ENOSPC); 6239 } 6240 6241 int error = ztest_dsl_prop_set_uint64(zd->zd_name, ZFS_PROP_RECORDSIZE, 6242 ztest_random_blocksize(), (int)ztest_random(2)); 6243 ASSERT(error == 0 || error == ENOSPC); 6244 6245 (void) pthread_rwlock_unlock(&ztest_name_lock); 6246 } 6247 6248 void 6249 ztest_spa_prop_get_set(ztest_ds_t *zd, uint64_t id) 6250 { 6251 (void) zd, (void) id; 6252 6253 (void) pthread_rwlock_rdlock(&ztest_name_lock); 6254 6255 (void) ztest_spa_prop_set_uint64(ZPOOL_PROP_AUTOTRIM, ztest_random(2)); 6256 6257 nvlist_t *props = fnvlist_alloc(); 6258 6259 VERIFY0(spa_prop_get(ztest_spa, props)); 6260 6261 if (ztest_opts.zo_verbose >= 6) 6262 dump_nvlist(props, 4); 6263 6264 fnvlist_free(props); 6265 6266 (void) pthread_rwlock_unlock(&ztest_name_lock); 6267 } 6268 6269 static int 6270 user_release_one(const char *snapname, const char *holdname) 6271 { 6272 nvlist_t *snaps, *holds; 6273 int error; 6274 6275 snaps = fnvlist_alloc(); 6276 holds = fnvlist_alloc(); 6277 fnvlist_add_boolean(holds, holdname); 6278 fnvlist_add_nvlist(snaps, snapname, holds); 6279 fnvlist_free(holds); 6280 error = dsl_dataset_user_release(snaps, NULL); 6281 fnvlist_free(snaps); 6282 return (error); 6283 } 6284 6285 /* 6286 * Test snapshot hold/release and deferred destroy. 6287 */ 6288 void 6289 ztest_dmu_snapshot_hold(ztest_ds_t *zd, uint64_t id) 6290 { 6291 int error; 6292 objset_t *os = zd->zd_os; 6293 objset_t *origin; 6294 char snapname[100]; 6295 char fullname[100]; 6296 char clonename[100]; 6297 char tag[100]; 6298 char osname[ZFS_MAX_DATASET_NAME_LEN]; 6299 nvlist_t *holds; 6300 6301 (void) pthread_rwlock_rdlock(&ztest_name_lock); 6302 6303 dmu_objset_name(os, osname); 6304 6305 (void) snprintf(snapname, sizeof (snapname), "sh1_%"PRIu64"", id); 6306 (void) snprintf(fullname, sizeof (fullname), "%s@%s", osname, snapname); 6307 (void) snprintf(clonename, sizeof (clonename), "%s/ch1_%"PRIu64"", 6308 osname, id); 6309 (void) snprintf(tag, sizeof (tag), "tag_%"PRIu64"", id); 6310 6311 /* 6312 * Clean up from any previous run. 
6313 */ 6314 error = dsl_destroy_head(clonename); 6315 if (error != ENOENT) 6316 ASSERT0(error); 6317 error = user_release_one(fullname, tag); 6318 if (error != ESRCH && error != ENOENT) 6319 ASSERT0(error); 6320 error = dsl_destroy_snapshot(fullname, B_FALSE); 6321 if (error != ENOENT) 6322 ASSERT0(error); 6323 6324 /* 6325 * Create snapshot, clone it, mark snap for deferred destroy, 6326 * destroy clone, verify snap was also destroyed. 6327 */ 6328 error = dmu_objset_snapshot_one(osname, snapname); 6329 if (error) { 6330 if (error == ENOSPC) { 6331 ztest_record_enospc("dmu_objset_snapshot"); 6332 goto out; 6333 } 6334 fatal(B_FALSE, "dmu_objset_snapshot(%s) = %d", fullname, error); 6335 } 6336 6337 error = dmu_objset_clone(clonename, fullname); 6338 if (error) { 6339 if (error == ENOSPC) { 6340 ztest_record_enospc("dmu_objset_clone"); 6341 goto out; 6342 } 6343 fatal(B_FALSE, "dmu_objset_clone(%s) = %d", clonename, error); 6344 } 6345 6346 error = dsl_destroy_snapshot(fullname, B_TRUE); 6347 if (error) { 6348 fatal(B_FALSE, "dsl_destroy_snapshot(%s, B_TRUE) = %d", 6349 fullname, error); 6350 } 6351 6352 error = dsl_destroy_head(clonename); 6353 if (error) 6354 fatal(B_FALSE, "dsl_destroy_head(%s) = %d", clonename, error); 6355 6356 error = dmu_objset_hold(fullname, FTAG, &origin); 6357 if (error != ENOENT) 6358 fatal(B_FALSE, "dmu_objset_hold(%s) = %d", fullname, error); 6359 6360 /* 6361 * Create snapshot, add temporary hold, verify that we can't 6362 * destroy a held snapshot, mark for deferred destroy, 6363 * release hold, verify snapshot was destroyed. 6364 */ 6365 error = dmu_objset_snapshot_one(osname, snapname); 6366 if (error) { 6367 if (error == ENOSPC) { 6368 ztest_record_enospc("dmu_objset_snapshot"); 6369 goto out; 6370 } 6371 fatal(B_FALSE, "dmu_objset_snapshot(%s) = %d", fullname, error); 6372 } 6373 6374 holds = fnvlist_alloc(); 6375 fnvlist_add_string(holds, fullname, tag); 6376 error = dsl_dataset_user_hold(holds, 0, NULL); 6377 fnvlist_free(holds); 6378 6379 if (error == ENOSPC) { 6380 ztest_record_enospc("dsl_dataset_user_hold"); 6381 goto out; 6382 } else if (error) { 6383 fatal(B_FALSE, "dsl_dataset_user_hold(%s, %s) = %u", 6384 fullname, tag, error); 6385 } 6386 6387 error = dsl_destroy_snapshot(fullname, B_FALSE); 6388 if (error != EBUSY) { 6389 fatal(B_FALSE, "dsl_destroy_snapshot(%s, B_FALSE) = %d", 6390 fullname, error); 6391 } 6392 6393 error = dsl_destroy_snapshot(fullname, B_TRUE); 6394 if (error) { 6395 fatal(B_FALSE, "dsl_destroy_snapshot(%s, B_TRUE) = %d", 6396 fullname, error); 6397 } 6398 6399 error = user_release_one(fullname, tag); 6400 if (error) 6401 fatal(B_FALSE, "user_release_one(%s, %s) = %d", 6402 fullname, tag, error); 6403 6404 VERIFY3U(dmu_objset_hold(fullname, FTAG, &origin), ==, ENOENT); 6405 6406 out: 6407 (void) pthread_rwlock_unlock(&ztest_name_lock); 6408 } 6409 6410 /* 6411 * Inject random faults into the on-disk data. 
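 * (Two kinds of damage are used below: transient device-level faults on
 * the first leaf of a top-level vdev, and random data corruption written
 * directly to a randomly chosen leaf, constrained so the pool's
 * redundancy can always repair it.)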
6412 */ 6413 void 6414 ztest_fault_inject(ztest_ds_t *zd, uint64_t id) 6415 { 6416 (void) zd, (void) id;
6417 ztest_shared_t *zs = ztest_shared; 6418 spa_t *spa = ztest_spa; 6419 int fd; 6420 uint64_t offset;
6421 uint64_t leaves; 6422 uint64_t bad = 0x1990c0ffeedecadeull; 6423 uint64_t top, leaf; 6424 uint64_t raidz_children;
6425 char *path0; 6426 char *pathrand; 6427 size_t fsize; 6428 int bshift = SPA_MAXBLOCKSHIFT + 2; 6429 int iters = 1000;
6430 int maxfaults; 6431 int mirror_save; 6432 vdev_t *vd0 = NULL; 6433 uint64_t guid0 = 0;
6434 boolean_t islog = B_FALSE; 6435 boolean_t injected = B_FALSE; 6436
6437 path0 = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); 6438 pathrand = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); 6439
6440 mutex_enter(&ztest_vdev_lock); 6441 6442 /*
6443 * If device removal is in progress, fault injection must be disabled
6444 * until it completes and the pool is scrubbed. The fault injection
6445 * strategy for damaging blocks does not take into account evacuated
6446 * blocks which may have already been damaged. 6447 */ 6448 if (ztest_device_removal_active) 6449 goto out; 6450
6451 /* 6452 * The fault injection strategy for damaging blocks cannot be used
6453 * if raidz expansion is in progress. The leaves value
6454 * (attached raidz children) is variable and the strategy for damaging
6455 * blocks would corrupt the same data blocks on different child vdevs
6456 * because of the reflow process. 6457 */ 6458 if (spa->spa_raidz_expand != NULL) 6459 goto out; 6460
6461 maxfaults = MAXFAULTS(zs); 6462 raidz_children = ztest_get_raidz_children(spa);
6463 leaves = MAX(zs->zs_mirrors, 1) * raidz_children; 6464 mirror_save = zs->zs_mirrors; 6465
6466 ASSERT3U(leaves, >=, 1); 6467 6468 /*
6469 * While ztest is running the number of leaves will not change. This
6470 * is critical for the fault injection logic as it determines where
6471 * errors can be safely injected such that they are always repairable. 6472 *
6473 * When restarting ztest a different number of leaves may be requested
6474 * which will shift the regions to be damaged. This is fine as long
6475 * as the pool has been scrubbed prior to using the new mapping.
6476 * Failure to do so can result in non-repairable damage being injected. 6477 */
6478 if (ztest_pool_scrubbed == B_FALSE) 6479 goto out; 6480
6481 /* 6482 * Grab the name lock as reader. There are some operations
6483 * which don't like to have their vdevs changed while
6484 * they are in progress (e.g. spa_change_guid). Those
6485 * operations will have grabbed the name lock as writer. 6486 */
6487 (void) pthread_rwlock_rdlock(&ztest_name_lock); 6488
6489 /* 6490 * We need SCL_STATE here because we're going to look at vd0->vdev_tsd. 6491 */
6492 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 6493 6494 if (ztest_random(2) == 0) { 6495 /*
6496 * Inject errors on a normal data device or slog device. 6497 */
6498 top = ztest_random_vdev_top(spa, B_TRUE); 6499 leaf = ztest_random(leaves) + zs->zs_splits; 6500
6501 /* 6502 * Generate paths to the first leaf in this top-level vdev,
6503 * and to the random leaf we selected. We'll induce transient
6504 * write failures and random online/offline activity on leaf 0,
6505 * and we'll write random garbage to the randomly chosen leaf.
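 * (How aggressive we can be depends on maxfaults: with maxfaults >= 2,
 * or when dealing with a slog, vd0 may additionally be taken offline
 * and back online further below.)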
6506 */ 6507 (void) snprintf(path0, MAXPATHLEN, ztest_dev_template, 6508 ztest_opts.zo_dir, ztest_opts.zo_pool, 6509 top * leaves + zs->zs_splits); 6510 (void) snprintf(pathrand, MAXPATHLEN, ztest_dev_template, 6511 ztest_opts.zo_dir, ztest_opts.zo_pool, 6512 top * leaves + leaf); 6513 6514 vd0 = vdev_lookup_by_path(spa->spa_root_vdev, path0); 6515 if (vd0 != NULL && vd0->vdev_top->vdev_islog) 6516 islog = B_TRUE; 6517 6518 /* 6519 * If the top-level vdev needs to be resilvered 6520 * then we only allow faults on the device that is 6521 * resilvering. 6522 */ 6523 if (vd0 != NULL && maxfaults != 1 && 6524 (!vdev_resilver_needed(vd0->vdev_top, NULL, NULL) || 6525 vd0->vdev_resilver_txg != 0)) { 6526 /* 6527 * Make vd0 explicitly claim to be unreadable, 6528 * or unwritable, or reach behind its back 6529 * and close the underlying fd. We can do this if 6530 * maxfaults == 0 because we'll fail and reexecute, 6531 * and we can do it if maxfaults >= 2 because we'll 6532 * have enough redundancy. If maxfaults == 1, the 6533 * combination of this with injection of random data 6534 * corruption below exceeds the pool's fault tolerance. 6535 */ 6536 vdev_file_t *vf = vd0->vdev_tsd; 6537 6538 zfs_dbgmsg("injecting fault to vdev %llu; maxfaults=%d", 6539 (long long)vd0->vdev_id, (int)maxfaults); 6540 6541 if (vf != NULL && ztest_random(3) == 0) { 6542 (void) close(vf->vf_file->f_fd); 6543 vf->vf_file->f_fd = -1; 6544 } else if (ztest_random(2) == 0) { 6545 vd0->vdev_cant_read = B_TRUE; 6546 } else { 6547 vd0->vdev_cant_write = B_TRUE; 6548 } 6549 guid0 = vd0->vdev_guid; 6550 } 6551 } else { 6552 /* 6553 * Inject errors on an l2cache device. 6554 */ 6555 spa_aux_vdev_t *sav = &spa->spa_l2cache; 6556 6557 if (sav->sav_count == 0) { 6558 spa_config_exit(spa, SCL_STATE, FTAG); 6559 (void) pthread_rwlock_unlock(&ztest_name_lock); 6560 goto out; 6561 } 6562 vd0 = sav->sav_vdevs[ztest_random(sav->sav_count)]; 6563 guid0 = vd0->vdev_guid; 6564 (void) strlcpy(path0, vd0->vdev_path, MAXPATHLEN); 6565 (void) strlcpy(pathrand, vd0->vdev_path, MAXPATHLEN); 6566 6567 leaf = 0; 6568 leaves = 1; 6569 maxfaults = INT_MAX; /* no limit on cache devices */ 6570 } 6571 6572 spa_config_exit(spa, SCL_STATE, FTAG); 6573 (void) pthread_rwlock_unlock(&ztest_name_lock); 6574 6575 /* 6576 * If we can tolerate two or more faults, or we're dealing 6577 * with a slog, randomly online/offline vd0. 6578 */ 6579 if ((maxfaults >= 2 || islog) && guid0 != 0) { 6580 if (ztest_random(10) < 6) { 6581 int flags = (ztest_random(2) == 0 ? 6582 ZFS_OFFLINE_TEMPORARY : 0); 6583 6584 /* 6585 * We have to grab the zs_name_lock as writer to 6586 * prevent a race between offlining a slog and 6587 * destroying a dataset. Offlining the slog will 6588 * grab a reference on the dataset which may cause 6589 * dsl_destroy_head() to fail with EBUSY thus 6590 * leaving the dataset in an inconsistent state. 6591 */ 6592 if (islog) 6593 (void) pthread_rwlock_wrlock(&ztest_name_lock); 6594 6595 VERIFY3U(vdev_offline(spa, guid0, flags), !=, EBUSY); 6596 6597 if (islog) 6598 (void) pthread_rwlock_unlock(&ztest_name_lock); 6599 } else { 6600 /* 6601 * Ideally we would like to be able to randomly 6602 * call vdev_[on|off]line without holding locks 6603 * to force unpredictable failures but the side 6604 * effects of vdev_[on|off]line prevent us from 6605 * doing so. 
6606 */ 6607 (void) vdev_online(spa, guid0, 0, NULL); 6608 } 6609 } 6610 6611 if (maxfaults == 0) 6612 goto out; 6613 6614 /* 6615 * We have at least single-fault tolerance, so inject data corruption. 6616 */ 6617 fd = open(pathrand, O_RDWR); 6618 6619 if (fd == -1) /* we hit a gap in the device namespace */ 6620 goto out; 6621 6622 fsize = lseek(fd, 0, SEEK_END); 6623 6624 while (--iters != 0) { 6625 /* 6626 * The offset must be chosen carefully to ensure that 6627 * we do not inject a given logical block with errors 6628 * on two different leaf devices, because ZFS can not 6629 * tolerate that (if maxfaults==1). 6630 * 6631 * To achieve this we divide each leaf device into 6632 * chunks of size (# leaves * SPA_MAXBLOCKSIZE * 4). 6633 * Each chunk is further divided into error-injection 6634 * ranges (can accept errors) and clear ranges (we do 6635 * not inject errors in those). Each error-injection 6636 * range can accept errors only for a single leaf vdev. 6637 * Error-injection ranges are separated by clear ranges. 6638 * 6639 * For example, with 3 leaves, each chunk looks like: 6640 * 0 to 32M: injection range for leaf 0 6641 * 32M to 64M: clear range - no injection allowed 6642 * 64M to 96M: injection range for leaf 1 6643 * 96M to 128M: clear range - no injection allowed 6644 * 128M to 160M: injection range for leaf 2 6645 * 160M to 192M: clear range - no injection allowed 6646 * 6647 * Each clear range must be large enough such that a 6648 * single block cannot straddle it. This way a block 6649 * can't be a target in two different injection ranges 6650 * (on different leaf vdevs). 6651 */ 6652 offset = ztest_random(fsize / (leaves << bshift)) * 6653 (leaves << bshift) + (leaf << bshift) + 6654 (ztest_random(1ULL << (bshift - 1)) & -8ULL); 6655 6656 /* 6657 * Only allow damage to the labels at one end of the vdev. 6658 * 6659 * If all labels are damaged, the device will be totally 6660 * inaccessible, which will result in loss of data, 6661 * because we also damage (parts of) the other side of 6662 * the mirror/raidz. 6663 * 6664 * Additionally, we will always have both an even and an 6665 * odd label, so that we can handle crashes in the 6666 * middle of vdev_config_sync(). 6667 */ 6668 if ((leaf & 1) == 0 && offset < VDEV_LABEL_START_SIZE) 6669 continue; 6670 6671 /* 6672 * The two end labels are stored at the "end" of the disk, but 6673 * the end of the disk (vdev_psize) is aligned to 6674 * sizeof (vdev_label_t). 
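 * (Accordingly, fsize is aligned down to a vdev_label_t boundary and
 * writes on odd-numbered leaves that would land within
 * VDEV_LABEL_END_SIZE of that point are skipped.)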
6675 */ 6676 uint64_t psize = P2ALIGN_TYPED(fsize, sizeof (vdev_label_t), 6677 uint64_t); 6678 if ((leaf & 1) == 1 &&
6679 offset + sizeof (bad) > psize - VDEV_LABEL_END_SIZE) 6680 continue; 6681
6682 if (mirror_save != zs->zs_mirrors) { 6683 (void) close(fd); 6684 goto out; 6685 } 6686
6687 if (pwrite(fd, &bad, sizeof (bad), offset) != sizeof (bad)) 6688 fatal(B_TRUE,
6689 "can't inject bad word at 0x%"PRIx64" in %s", 6690 offset, pathrand); 6691
6692 if (ztest_opts.zo_verbose >= 7) 6693 (void) printf("injected bad word into %s,"
6694 " offset 0x%"PRIx64"\n", pathrand, offset); 6695 6696 injected = B_TRUE; 6697 } 6698
6699 (void) close(fd); 6700 out: 6701 mutex_exit(&ztest_vdev_lock); 6702
6703 if (injected && ztest_opts.zo_raid_do_expand) { 6704 int error = spa_scan(spa, POOL_SCAN_SCRUB);
6705 if (error == 0) { 6706 while (dsl_scan_scrubbing(spa_get_dsl(spa)))
6707 txg_wait_synced(spa_get_dsl(spa), 0); 6708 } 6709 } 6710
6711 umem_free(path0, MAXPATHLEN); 6712 umem_free(pathrand, MAXPATHLEN); 6713 } 6714
6715 /* 6716 * By design ztest will never inject uncorrectable damage into the pool.
6717 * Issue a scrub, wait for it to complete, and verify there is never any
6718 * persistent damage. 6719 *
6720 * Only after a full scrub has been completed is it safe to start injecting
6721 * data corruption. See the comment in ztest_fault_inject(). 6722 *
6723 * EBUSY may be returned for the following six cases. It's the caller's
6724 * responsibility to handle them accordingly. 6725 *
6726 * Current state Requested
6727 * 1. Normal Scrub Running Normal Scrub or Error Scrub
6728 * 2. Normal Scrub Paused Error Scrub
6729 * 3. Normal Scrub Paused Pause Normal Scrub
6730 * 4. Error Scrub Running Normal Scrub or Error Scrub
6731 * 5. Error Scrub Paused Pause Error Scrub
6732 * 6. Resilvering Anything else 6733 */ 6734 static int 6735 ztest_scrub_impl(spa_t *spa) 6736 {
6737 int error = spa_scan(spa, POOL_SCAN_SCRUB); 6738 if (error) 6739 return (error); 6740
6741 while (dsl_scan_scrubbing(spa_get_dsl(spa))) 6742 txg_wait_synced(spa_get_dsl(spa), 0); 6743
6744 if (spa_approx_errlog_size(spa) > 0) 6745 return (ECKSUM); 6746 6747 ztest_pool_scrubbed = B_TRUE; 6748
6749 return (0); 6750 } 6751 6752 /* 6753 * Scrub the pool. 6754 */ 6755 void
6756 ztest_scrub(ztest_ds_t *zd, uint64_t id) 6757 { 6758 (void) zd, (void) id; 6759 spa_t *spa = ztest_spa;
6760 int error; 6761 6762 /*
6763 * A scrub is already in progress due to device removal; skip this one. 6764 */
6765 if (ztest_device_removal_active) 6766 return; 6767
6768 /* 6769 * Start a scrub, wait a moment, then force a restart. 6770 */
6771 (void) spa_scan(spa, POOL_SCAN_SCRUB); 6772 (void) poll(NULL, 0, 100); 6773
6774 error = ztest_scrub_impl(spa); 6775 if (error == EBUSY) 6776 error = 0; 6777 ASSERT0(error); 6778 } 6779
6780 /* 6781 * Change the guid for the pool.
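 * (Only the pool guid should change; the in-core load guid must remain
 * the same, and both conditions are verified below.)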
6782 */ 6783 void 6784 ztest_reguid(ztest_ds_t *zd, uint64_t id) 6785 { 6786 (void) zd, (void) id; 6787 spa_t *spa = ztest_spa; 6788 uint64_t orig, load; 6789 int error; 6790 ztest_shared_t *zs = ztest_shared; 6791 6792 if (ztest_opts.zo_mmp_test) 6793 return; 6794 6795 orig = spa_guid(spa); 6796 load = spa_load_guid(spa); 6797 6798 (void) pthread_rwlock_wrlock(&ztest_name_lock); 6799 error = spa_change_guid(spa, NULL); 6800 zs->zs_guid = spa_guid(spa); 6801 (void) pthread_rwlock_unlock(&ztest_name_lock); 6802 6803 if (error != 0) 6804 return; 6805 6806 if (ztest_opts.zo_verbose >= 4) { 6807 (void) printf("Changed guid old %"PRIu64" -> %"PRIu64"\n", 6808 orig, spa_guid(spa)); 6809 } 6810 6811 VERIFY3U(orig, !=, spa_guid(spa)); 6812 VERIFY3U(load, ==, spa_load_guid(spa)); 6813 } 6814 6815 void 6816 ztest_blake3(ztest_ds_t *zd, uint64_t id) 6817 { 6818 (void) zd, (void) id; 6819 hrtime_t end = gethrtime() + NANOSEC; 6820 zio_cksum_salt_t salt; 6821 void *salt_ptr = &salt.zcs_bytes; 6822 struct abd *abd_data, *abd_meta; 6823 void *buf, *templ; 6824 int i, *ptr; 6825 uint32_t size; 6826 BLAKE3_CTX ctx; 6827 const zfs_impl_t *blake3 = zfs_impl_get_ops("blake3"); 6828 6829 size = ztest_random_blocksize(); 6830 buf = umem_alloc(size, UMEM_NOFAIL); 6831 abd_data = abd_alloc(size, B_FALSE); 6832 abd_meta = abd_alloc(size, B_TRUE); 6833 6834 for (i = 0, ptr = buf; i < size / sizeof (*ptr); i++, ptr++) 6835 *ptr = ztest_random(UINT_MAX); 6836 memset(salt_ptr, 'A', 32); 6837 6838 abd_copy_from_buf_off(abd_data, buf, 0, size); 6839 abd_copy_from_buf_off(abd_meta, buf, 0, size); 6840 6841 while (gethrtime() <= end) { 6842 int run_count = 100; 6843 zio_cksum_t zc_ref1, zc_ref2; 6844 zio_cksum_t zc_res1, zc_res2; 6845 6846 void *ref1 = &zc_ref1; 6847 void *ref2 = &zc_ref2; 6848 void *res1 = &zc_res1; 6849 void *res2 = &zc_res2; 6850 6851 /* BLAKE3_KEY_LEN = 32 */ 6852 VERIFY0(blake3->setname("generic")); 6853 templ = abd_checksum_blake3_tmpl_init(&salt); 6854 Blake3_InitKeyed(&ctx, salt_ptr); 6855 Blake3_Update(&ctx, buf, size); 6856 Blake3_Final(&ctx, ref1); 6857 zc_ref2 = zc_ref1; 6858 ZIO_CHECKSUM_BSWAP(&zc_ref2); 6859 abd_checksum_blake3_tmpl_free(templ); 6860 6861 VERIFY0(blake3->setname("cycle")); 6862 while (run_count-- > 0) { 6863 6864 /* Test current implementation */ 6865 Blake3_InitKeyed(&ctx, salt_ptr); 6866 Blake3_Update(&ctx, buf, size); 6867 Blake3_Final(&ctx, res1); 6868 zc_res2 = zc_res1; 6869 ZIO_CHECKSUM_BSWAP(&zc_res2); 6870 6871 VERIFY0(memcmp(ref1, res1, 32)); 6872 VERIFY0(memcmp(ref2, res2, 32)); 6873 6874 /* Test ABD - data */ 6875 templ = abd_checksum_blake3_tmpl_init(&salt); 6876 abd_checksum_blake3_native(abd_data, size, 6877 templ, &zc_res1); 6878 abd_checksum_blake3_byteswap(abd_data, size, 6879 templ, &zc_res2); 6880 6881 VERIFY0(memcmp(ref1, res1, 32)); 6882 VERIFY0(memcmp(ref2, res2, 32)); 6883 6884 /* Test ABD - metadata */ 6885 abd_checksum_blake3_native(abd_meta, size, 6886 templ, &zc_res1); 6887 abd_checksum_blake3_byteswap(abd_meta, size, 6888 templ, &zc_res2); 6889 abd_checksum_blake3_tmpl_free(templ); 6890 6891 VERIFY0(memcmp(ref1, res1, 32)); 6892 VERIFY0(memcmp(ref2, res2, 32)); 6893 6894 } 6895 } 6896 6897 abd_free(abd_data); 6898 abd_free(abd_meta); 6899 umem_free(buf, size); 6900 } 6901 6902 void 6903 ztest_fletcher(ztest_ds_t *zd, uint64_t id) 6904 { 6905 (void) zd, (void) id; 6906 hrtime_t end = gethrtime() + NANOSEC; 6907 6908 while (gethrtime() <= end) { 6909 int run_count = 100; 6910 void *buf; 6911 struct abd *abd_data, *abd_meta; 6912 uint32_t size; 6913 
int *ptr; 6914 int i; 6915 zio_cksum_t zc_ref; 6916 zio_cksum_t zc_ref_byteswap; 6917 6918 size = ztest_random_blocksize(); 6919 6920 buf = umem_alloc(size, UMEM_NOFAIL); 6921 abd_data = abd_alloc(size, B_FALSE); 6922 abd_meta = abd_alloc(size, B_TRUE); 6923 6924 for (i = 0, ptr = buf; i < size / sizeof (*ptr); i++, ptr++) 6925 *ptr = ztest_random(UINT_MAX); 6926 6927 abd_copy_from_buf_off(abd_data, buf, 0, size); 6928 abd_copy_from_buf_off(abd_meta, buf, 0, size); 6929 6930 VERIFY0(fletcher_4_impl_set("scalar")); 6931 fletcher_4_native(buf, size, NULL, &zc_ref); 6932 fletcher_4_byteswap(buf, size, NULL, &zc_ref_byteswap); 6933 6934 VERIFY0(fletcher_4_impl_set("cycle")); 6935 while (run_count-- > 0) { 6936 zio_cksum_t zc; 6937 zio_cksum_t zc_byteswap; 6938 6939 fletcher_4_byteswap(buf, size, NULL, &zc_byteswap); 6940 fletcher_4_native(buf, size, NULL, &zc); 6941 6942 VERIFY0(memcmp(&zc, &zc_ref, sizeof (zc))); 6943 VERIFY0(memcmp(&zc_byteswap, &zc_ref_byteswap, 6944 sizeof (zc_byteswap))); 6945 6946 /* Test ABD - data */ 6947 abd_fletcher_4_byteswap(abd_data, size, NULL, 6948 &zc_byteswap); 6949 abd_fletcher_4_native(abd_data, size, NULL, &zc); 6950 6951 VERIFY0(memcmp(&zc, &zc_ref, sizeof (zc))); 6952 VERIFY0(memcmp(&zc_byteswap, &zc_ref_byteswap, 6953 sizeof (zc_byteswap))); 6954 6955 /* Test ABD - metadata */ 6956 abd_fletcher_4_byteswap(abd_meta, size, NULL, 6957 &zc_byteswap); 6958 abd_fletcher_4_native(abd_meta, size, NULL, &zc); 6959 6960 VERIFY0(memcmp(&zc, &zc_ref, sizeof (zc))); 6961 VERIFY0(memcmp(&zc_byteswap, &zc_ref_byteswap, 6962 sizeof (zc_byteswap))); 6963 6964 } 6965 6966 umem_free(buf, size); 6967 abd_free(abd_data); 6968 abd_free(abd_meta); 6969 } 6970 } 6971 6972 void 6973 ztest_fletcher_incr(ztest_ds_t *zd, uint64_t id) 6974 { 6975 (void) zd, (void) id; 6976 void *buf; 6977 size_t size; 6978 int *ptr; 6979 int i; 6980 zio_cksum_t zc_ref; 6981 zio_cksum_t zc_ref_bswap; 6982 6983 hrtime_t end = gethrtime() + NANOSEC; 6984 6985 while (gethrtime() <= end) { 6986 int run_count = 100; 6987 6988 size = ztest_random_blocksize(); 6989 buf = umem_alloc(size, UMEM_NOFAIL); 6990 6991 for (i = 0, ptr = buf; i < size / sizeof (*ptr); i++, ptr++) 6992 *ptr = ztest_random(UINT_MAX); 6993 6994 VERIFY0(fletcher_4_impl_set("scalar")); 6995 fletcher_4_native(buf, size, NULL, &zc_ref); 6996 fletcher_4_byteswap(buf, size, NULL, &zc_ref_bswap); 6997 6998 VERIFY0(fletcher_4_impl_set("cycle")); 6999 7000 while (run_count-- > 0) { 7001 zio_cksum_t zc; 7002 zio_cksum_t zc_bswap; 7003 size_t pos = 0; 7004 7005 ZIO_SET_CHECKSUM(&zc, 0, 0, 0, 0); 7006 ZIO_SET_CHECKSUM(&zc_bswap, 0, 0, 0, 0); 7007 7008 while (pos < size) { 7009 size_t inc = 64 * ztest_random(size / 67); 7010 /* sometimes add few bytes to test non-simd */ 7011 if (ztest_random(100) < 10) 7012 inc += P2ALIGN_TYPED(ztest_random(64), 7013 sizeof (uint32_t), uint64_t); 7014 7015 if (inc > (size - pos)) 7016 inc = size - pos; 7017 7018 fletcher_4_incremental_native(buf + pos, inc, 7019 &zc); 7020 fletcher_4_incremental_byteswap(buf + pos, inc, 7021 &zc_bswap); 7022 7023 pos += inc; 7024 } 7025 7026 VERIFY3U(pos, ==, size); 7027 7028 VERIFY(ZIO_CHECKSUM_EQUAL(zc, zc_ref)); 7029 VERIFY(ZIO_CHECKSUM_EQUAL(zc_bswap, zc_ref_bswap)); 7030 7031 /* 7032 * verify if incremental on the whole buffer is 7033 * equivalent to non-incremental version 7034 */ 7035 ZIO_SET_CHECKSUM(&zc, 0, 0, 0, 0); 7036 ZIO_SET_CHECKSUM(&zc_bswap, 0, 0, 0, 0); 7037 7038 fletcher_4_incremental_native(buf, size, &zc); 7039 fletcher_4_incremental_byteswap(buf, size, 
&zc_bswap); 7040 7041 VERIFY(ZIO_CHECKSUM_EQUAL(zc, zc_ref)); 7042 VERIFY(ZIO_CHECKSUM_EQUAL(zc_bswap, zc_ref_bswap)); 7043 } 7044 7045 umem_free(buf, size); 7046 } 7047 } 7048 7049 void 7050 ztest_pool_prefetch_ddt(ztest_ds_t *zd, uint64_t id) 7051 { 7052 (void) zd, (void) id; 7053 spa_t *spa; 7054 7055 (void) pthread_rwlock_rdlock(&ztest_name_lock); 7056 VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG)); 7057 7058 ddt_prefetch_all(spa); 7059 7060 spa_close(spa, FTAG); 7061 (void) pthread_rwlock_unlock(&ztest_name_lock); 7062 } 7063 7064 static int 7065 ztest_set_global_vars(void) 7066 { 7067 for (size_t i = 0; i < ztest_opts.zo_gvars_count; i++) { 7068 char *kv = ztest_opts.zo_gvars[i]; 7069 VERIFY3U(strlen(kv), <=, ZO_GVARS_MAX_ARGLEN); 7070 VERIFY3U(strlen(kv), >, 0); 7071 int err = set_global_var(kv); 7072 if (ztest_opts.zo_verbose > 0) { 7073 (void) printf("setting global var %s ... %s\n", kv, 7074 err ? "failed" : "ok"); 7075 } 7076 if (err != 0) { 7077 (void) fprintf(stderr, 7078 "failed to set global var '%s'\n", kv); 7079 return (err); 7080 } 7081 } 7082 return (0); 7083 } 7084 7085 static char ** 7086 ztest_global_vars_to_zdb_args(void) 7087 { 7088 char **args = calloc(2*ztest_opts.zo_gvars_count + 1, sizeof (char *)); 7089 char **cur = args; 7090 if (args == NULL) 7091 return (NULL); 7092 for (size_t i = 0; i < ztest_opts.zo_gvars_count; i++) { 7093 *cur++ = (char *)"-o"; 7094 *cur++ = ztest_opts.zo_gvars[i]; 7095 } 7096 ASSERT3P(cur, ==, &args[2*ztest_opts.zo_gvars_count]); 7097 *cur = NULL; 7098 return (args); 7099 } 7100 7101 /* The end of strings is indicated by a NULL element */ 7102 static char * 7103 join_strings(char **strings, const char *sep) 7104 { 7105 size_t totallen = 0; 7106 for (char **sp = strings; *sp != NULL; sp++) { 7107 totallen += strlen(*sp); 7108 totallen += strlen(sep); 7109 } 7110 if (totallen > 0) { 7111 ASSERT(totallen >= strlen(sep)); 7112 totallen -= strlen(sep); 7113 } 7114 7115 size_t buflen = totallen + 1; 7116 char *o = umem_alloc(buflen, UMEM_NOFAIL); /* trailing 0 byte */ 7117 o[0] = '\0'; 7118 for (char **sp = strings; *sp != NULL; sp++) { 7119 size_t would; 7120 would = strlcat(o, *sp, buflen); 7121 VERIFY3U(would, <, buflen); 7122 if (*(sp+1) == NULL) { 7123 break; 7124 } 7125 would = strlcat(o, sep, buflen); 7126 VERIFY3U(would, <, buflen); 7127 } 7128 ASSERT3S(strlen(o), ==, totallen); 7129 return (o); 7130 } 7131 7132 static int 7133 ztest_check_path(char *path) 7134 { 7135 struct stat s; 7136 /* return true on success */ 7137 return (!stat(path, &s)); 7138 } 7139 7140 static void 7141 ztest_get_zdb_bin(char *bin, int len) 7142 { 7143 char *zdb_path; 7144 /* 7145 * Try to use $ZDB and in-tree zdb path. If not successful, just 7146 * let popen to search through PATH. 
7147 */ 7148 if ((zdb_path = getenv("ZDB"))) { 7149 strlcpy(bin, zdb_path, len); /* In env */ 7150 if (!ztest_check_path(bin)) { 7151 ztest_dump_core = 0; 7152 fatal(B_TRUE, "invalid ZDB '%s'", bin); 7153 } 7154 return; 7155 } 7156 7157 VERIFY3P(realpath(getexecname(), bin), !=, NULL); 7158 if (strstr(bin, ".libs/ztest")) { 7159 strstr(bin, ".libs/ztest")[0] = '\0'; /* In-tree */ 7160 strcat(bin, "zdb"); 7161 if (ztest_check_path(bin)) 7162 return; 7163 } 7164 strcpy(bin, "zdb"); 7165 } 7166 7167 static vdev_t * 7168 ztest_random_concrete_vdev_leaf(vdev_t *vd) 7169 { 7170 if (vd == NULL) 7171 return (NULL); 7172 7173 if (vd->vdev_children == 0) 7174 return (vd); 7175 7176 vdev_t *eligible[vd->vdev_children]; 7177 int eligible_idx = 0, i; 7178 for (i = 0; i < vd->vdev_children; i++) { 7179 vdev_t *cvd = vd->vdev_child[i]; 7180 if (cvd->vdev_top->vdev_removing) 7181 continue; 7182 if (cvd->vdev_children > 0 || 7183 (vdev_is_concrete(cvd) && !cvd->vdev_detached)) { 7184 eligible[eligible_idx++] = cvd; 7185 } 7186 } 7187 VERIFY3S(eligible_idx, >, 0); 7188 7189 uint64_t child_no = ztest_random(eligible_idx); 7190 return (ztest_random_concrete_vdev_leaf(eligible[child_no])); 7191 } 7192 7193 void 7194 ztest_initialize(ztest_ds_t *zd, uint64_t id) 7195 { 7196 (void) zd, (void) id; 7197 spa_t *spa = ztest_spa; 7198 int error = 0; 7199 7200 mutex_enter(&ztest_vdev_lock); 7201 7202 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 7203 7204 /* Random leaf vdev */ 7205 vdev_t *rand_vd = ztest_random_concrete_vdev_leaf(spa->spa_root_vdev); 7206 if (rand_vd == NULL) { 7207 spa_config_exit(spa, SCL_VDEV, FTAG); 7208 mutex_exit(&ztest_vdev_lock); 7209 return; 7210 } 7211 7212 /* 7213 * The random vdev we've selected may change as soon as we 7214 * drop the spa_config_lock. We create local copies of things 7215 * we're interested in. 
7216 */ 7217 uint64_t guid = rand_vd->vdev_guid; 7218 char *path = strdup(rand_vd->vdev_path); 7219 boolean_t active = rand_vd->vdev_initialize_thread != NULL; 7220 7221 zfs_dbgmsg("vd %px, guid %llu", rand_vd, (u_longlong_t)guid); 7222 spa_config_exit(spa, SCL_VDEV, FTAG); 7223 7224 uint64_t cmd = ztest_random(POOL_INITIALIZE_FUNCS); 7225 7226 nvlist_t *vdev_guids = fnvlist_alloc(); 7227 nvlist_t *vdev_errlist = fnvlist_alloc(); 7228 fnvlist_add_uint64(vdev_guids, path, guid); 7229 error = spa_vdev_initialize(spa, vdev_guids, cmd, vdev_errlist); 7230 fnvlist_free(vdev_guids); 7231 fnvlist_free(vdev_errlist); 7232 7233 switch (cmd) { 7234 case POOL_INITIALIZE_CANCEL: 7235 if (ztest_opts.zo_verbose >= 4) { 7236 (void) printf("Cancel initialize %s", path); 7237 if (!active) 7238 (void) printf(" failed (no initialize active)"); 7239 (void) printf("\n"); 7240 } 7241 break; 7242 case POOL_INITIALIZE_START: 7243 if (ztest_opts.zo_verbose >= 4) { 7244 (void) printf("Start initialize %s", path); 7245 if (active && error == 0) 7246 (void) printf(" failed (already active)"); 7247 else if (error != 0) 7248 (void) printf(" failed (error %d)", error); 7249 (void) printf("\n"); 7250 } 7251 break; 7252 case POOL_INITIALIZE_SUSPEND: 7253 if (ztest_opts.zo_verbose >= 4) { 7254 (void) printf("Suspend initialize %s", path); 7255 if (!active) 7256 (void) printf(" failed (no initialize active)"); 7257 (void) printf("\n"); 7258 } 7259 break; 7260 } 7261 free(path); 7262 mutex_exit(&ztest_vdev_lock); 7263 } 7264 7265 void 7266 ztest_trim(ztest_ds_t *zd, uint64_t id) 7267 { 7268 (void) zd, (void) id; 7269 spa_t *spa = ztest_spa; 7270 int error = 0; 7271 7272 mutex_enter(&ztest_vdev_lock); 7273 7274 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 7275 7276 /* Random leaf vdev */ 7277 vdev_t *rand_vd = ztest_random_concrete_vdev_leaf(spa->spa_root_vdev); 7278 if (rand_vd == NULL) { 7279 spa_config_exit(spa, SCL_VDEV, FTAG); 7280 mutex_exit(&ztest_vdev_lock); 7281 return; 7282 } 7283 7284 /* 7285 * The random vdev we've selected may change as soon as we 7286 * drop the spa_config_lock. We create local copies of things 7287 * we're interested in. 
7288 */ 7289 uint64_t guid = rand_vd->vdev_guid; 7290 char *path = strdup(rand_vd->vdev_path); 7291 boolean_t active = rand_vd->vdev_trim_thread != NULL; 7292 7293 zfs_dbgmsg("vd %p, guid %llu", rand_vd, (u_longlong_t)guid); 7294 spa_config_exit(spa, SCL_VDEV, FTAG); 7295 7296 uint64_t cmd = ztest_random(POOL_TRIM_FUNCS); 7297 uint64_t rate = 1 << ztest_random(30); 7298 boolean_t partial = (ztest_random(5) > 0); 7299 boolean_t secure = (ztest_random(5) > 0); 7300 7301 nvlist_t *vdev_guids = fnvlist_alloc(); 7302 nvlist_t *vdev_errlist = fnvlist_alloc(); 7303 fnvlist_add_uint64(vdev_guids, path, guid); 7304 error = spa_vdev_trim(spa, vdev_guids, cmd, rate, partial, 7305 secure, vdev_errlist); 7306 fnvlist_free(vdev_guids); 7307 fnvlist_free(vdev_errlist); 7308 7309 switch (cmd) { 7310 case POOL_TRIM_CANCEL: 7311 if (ztest_opts.zo_verbose >= 4) { 7312 (void) printf("Cancel TRIM %s", path); 7313 if (!active) 7314 (void) printf(" failed (no TRIM active)"); 7315 (void) printf("\n"); 7316 } 7317 break; 7318 case POOL_TRIM_START: 7319 if (ztest_opts.zo_verbose >= 4) { 7320 (void) printf("Start TRIM %s", path); 7321 if (active && error == 0) 7322 (void) printf(" failed (already active)"); 7323 else if (error != 0) 7324 (void) printf(" failed (error %d)", error); 7325 (void) printf("\n"); 7326 } 7327 break; 7328 case POOL_TRIM_SUSPEND: 7329 if (ztest_opts.zo_verbose >= 4) { 7330 (void) printf("Suspend TRIM %s", path); 7331 if (!active) 7332 (void) printf(" failed (no TRIM active)"); 7333 (void) printf("\n"); 7334 } 7335 break; 7336 } 7337 free(path); 7338 mutex_exit(&ztest_vdev_lock); 7339 } 7340 7341 void 7342 ztest_ddt_prune(ztest_ds_t *zd, uint64_t id) 7343 { 7344 (void) zd, (void) id; 7345 7346 spa_t *spa = ztest_spa; 7347 uint64_t pct = ztest_random(15) + 1; 7348 7349 (void) ddt_prune_unique_entries(spa, ZPOOL_DDT_PRUNE_PERCENTAGE, pct); 7350 } 7351 7352 /* 7353 * Verify pool integrity by running zdb. 7354 */ 7355 static void 7356 ztest_run_zdb(uint64_t guid) 7357 { 7358 int status; 7359 char *bin; 7360 char *zdb; 7361 char *zbuf; 7362 const int len = MAXPATHLEN + MAXNAMELEN + 20; 7363 FILE *fp; 7364 7365 bin = umem_alloc(len, UMEM_NOFAIL); 7366 zdb = umem_alloc(len, UMEM_NOFAIL); 7367 zbuf = umem_alloc(1024, UMEM_NOFAIL); 7368 7369 ztest_get_zdb_bin(bin, len); 7370 7371 char **set_gvars_args = ztest_global_vars_to_zdb_args(); 7372 if (set_gvars_args == NULL) { 7373 fatal(B_FALSE, "Failed to allocate memory in " 7374 "ztest_global_vars_to_zdb_args(). Cannot run zdb.\n"); 7375 } 7376 char *set_gvars_args_joined = join_strings(set_gvars_args, " "); 7377 free(set_gvars_args); 7378 7379 size_t would = snprintf(zdb, len, 7380 "%s -bcc%s%s -G -d -Y -e -y %s -p %s %"PRIu64, 7381 bin, 7382 ztest_opts.zo_verbose >= 3 ? "s" : "", 7383 ztest_opts.zo_verbose >= 4 ? 
"v" : "", 7384 set_gvars_args_joined, 7385 ztest_opts.zo_dir, 7386 guid); 7387 ASSERT3U(would, <, len); 7388 7389 umem_free(set_gvars_args_joined, strlen(set_gvars_args_joined) + 1); 7390 7391 if (ztest_opts.zo_verbose >= 5) 7392 (void) printf("Executing %s\n", zdb); 7393 7394 fp = popen(zdb, "r"); 7395 7396 while (fgets(zbuf, 1024, fp) != NULL) 7397 if (ztest_opts.zo_verbose >= 3) 7398 (void) printf("%s", zbuf); 7399 7400 status = pclose(fp); 7401 7402 if (status == 0) 7403 goto out; 7404 7405 ztest_dump_core = 0; 7406 if (WIFEXITED(status)) 7407 fatal(B_FALSE, "'%s' exit code %d", zdb, WEXITSTATUS(status)); 7408 else 7409 fatal(B_FALSE, "'%s' died with signal %d", 7410 zdb, WTERMSIG(status)); 7411 out: 7412 umem_free(bin, len); 7413 umem_free(zdb, len); 7414 umem_free(zbuf, 1024); 7415 } 7416 7417 static void 7418 ztest_walk_pool_directory(const char *header) 7419 { 7420 spa_t *spa = NULL; 7421 7422 if (ztest_opts.zo_verbose >= 6) 7423 (void) puts(header); 7424 7425 mutex_enter(&spa_namespace_lock); 7426 while ((spa = spa_next(spa)) != NULL) 7427 if (ztest_opts.zo_verbose >= 6) 7428 (void) printf("\t%s\n", spa_name(spa)); 7429 mutex_exit(&spa_namespace_lock); 7430 } 7431 7432 static void 7433 ztest_spa_import_export(char *oldname, char *newname) 7434 { 7435 nvlist_t *config, *newconfig; 7436 uint64_t pool_guid; 7437 spa_t *spa; 7438 int error; 7439 7440 if (ztest_opts.zo_verbose >= 4) { 7441 (void) printf("import/export: old = %s, new = %s\n", 7442 oldname, newname); 7443 } 7444 7445 /* 7446 * Clean up from previous runs. 7447 */ 7448 (void) spa_destroy(newname); 7449 7450 /* 7451 * Get the pool's configuration and guid. 7452 */ 7453 VERIFY0(spa_open(oldname, &spa, FTAG)); 7454 7455 /* 7456 * Kick off a scrub to tickle scrub/export races. 7457 */ 7458 if (ztest_random(2) == 0) 7459 (void) spa_scan(spa, POOL_SCAN_SCRUB); 7460 7461 pool_guid = spa_guid(spa); 7462 spa_close(spa, FTAG); 7463 7464 ztest_walk_pool_directory("pools before export"); 7465 7466 /* 7467 * Export it. 7468 */ 7469 VERIFY0(spa_export(oldname, &config, B_FALSE, B_FALSE)); 7470 7471 ztest_walk_pool_directory("pools after export"); 7472 7473 /* 7474 * Try to import it. 7475 */ 7476 newconfig = spa_tryimport(config); 7477 ASSERT3P(newconfig, !=, NULL); 7478 fnvlist_free(newconfig); 7479 7480 /* 7481 * Import it under the new name. 7482 */ 7483 error = spa_import(newname, config, NULL, 0); 7484 if (error != 0) { 7485 dump_nvlist(config, 0); 7486 fatal(B_FALSE, "couldn't import pool %s as %s: error %u", 7487 oldname, newname, error); 7488 } 7489 7490 ztest_walk_pool_directory("pools after import"); 7491 7492 /* 7493 * Try to import it again -- should fail with EEXIST. 7494 */ 7495 VERIFY3U(EEXIST, ==, spa_import(newname, config, NULL, 0)); 7496 7497 /* 7498 * Try to import it under a different name -- should fail with EEXIST. 7499 */ 7500 VERIFY3U(EEXIST, ==, spa_import(oldname, config, NULL, 0)); 7501 7502 /* 7503 * Verify that the pool is no longer visible under the old name. 7504 */ 7505 VERIFY3U(ENOENT, ==, spa_open(oldname, &spa, FTAG)); 7506 7507 /* 7508 * Verify that we can open and close the pool using the new name. 
7509 */ 7510 VERIFY0(spa_open(newname, &spa, FTAG)); 7511 ASSERT3U(pool_guid, ==, spa_guid(spa)); 7512 spa_close(spa, FTAG); 7513 7514 fnvlist_free(config); 7515 } 7516 7517 static void 7518 ztest_resume(spa_t *spa) 7519 { 7520 if (spa_suspended(spa) && ztest_opts.zo_verbose >= 6) 7521 (void) printf("resuming from suspended state\n"); 7522 spa_vdev_state_enter(spa, SCL_NONE); 7523 vdev_clear(spa, NULL); 7524 (void) spa_vdev_state_exit(spa, NULL, 0); 7525 (void) zio_resume(spa); 7526 } 7527 7528 static __attribute__((noreturn)) void 7529 ztest_resume_thread(void *arg) 7530 { 7531 spa_t *spa = arg; 7532 7533 /* 7534 * Synthesize aged DDT entries for ddt prune testing 7535 */ 7536 ddt_prune_artificial_age = B_TRUE; 7537 if (ztest_opts.zo_verbose >= 3) 7538 ddt_dump_prune_histogram = B_TRUE; 7539 7540 while (!ztest_exiting) { 7541 if (spa_suspended(spa)) 7542 ztest_resume(spa); 7543 (void) poll(NULL, 0, 100); 7544 7545 /* 7546 * Periodically change the zfs_compressed_arc_enabled setting. 7547 */ 7548 if (ztest_random(10) == 0) 7549 zfs_compressed_arc_enabled = ztest_random(2); 7550 7551 /* 7552 * Periodically change the zfs_abd_scatter_enabled setting. 7553 */ 7554 if (ztest_random(10) == 0) 7555 zfs_abd_scatter_enabled = ztest_random(2); 7556 } 7557 7558 thread_exit(); 7559 } 7560 7561 static __attribute__((noreturn)) void 7562 ztest_deadman_thread(void *arg) 7563 { 7564 ztest_shared_t *zs = arg; 7565 spa_t *spa = ztest_spa; 7566 hrtime_t delay, overdue, last_run = gethrtime(); 7567 7568 delay = (zs->zs_thread_stop - zs->zs_thread_start) + 7569 MSEC2NSEC(zfs_deadman_synctime_ms); 7570 7571 while (!ztest_exiting) { 7572 /* 7573 * Wait for the delay timer while checking occasionally 7574 * if we should stop. 7575 */ 7576 if (gethrtime() < last_run + delay) { 7577 (void) poll(NULL, 0, 1000); 7578 continue; 7579 } 7580 7581 /* 7582 * If the pool is suspended then fail immediately. Otherwise, 7583 * check to see if the pool is making any progress. If 7584 * vdev_deadman() discovers that there hasn't been any recent 7585 * I/Os then it will end up aborting the tests. 7586 */ 7587 if (spa_suspended(spa) || spa->spa_root_vdev == NULL) { 7588 fatal(B_FALSE, 7589 "aborting test after %llu seconds because " 7590 "pool has transitioned to a suspended state.", 7591 (u_longlong_t)zfs_deadman_synctime_ms / 1000); 7592 } 7593 vdev_deadman(spa->spa_root_vdev, FTAG); 7594 7595 /* 7596 * If the process doesn't complete within a grace period of 7597 * zfs_deadman_synctime_ms over the expected finish time, 7598 * then it may be hung and is terminated. 
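 * (Concretely, once gethrtime() exceeds zs_proc_stop plus the deadman
 * synctime the run is declared hung and aborted.)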
7599 */ 7600 overdue = zs->zs_proc_stop + MSEC2NSEC(zfs_deadman_synctime_ms); 7601 if (gethrtime() > overdue) { 7602 fatal(B_FALSE, 7603 "aborting test after %llu seconds because " 7604 "the process is overdue for termination.", 7605 (gethrtime() - zs->zs_proc_start) / NANOSEC); 7606 } 7607 7608 (void) printf("ztest has been running for %lld seconds\n", 7609 (gethrtime() - zs->zs_proc_start) / NANOSEC); 7610 7611 last_run = gethrtime(); 7612 delay = MSEC2NSEC(zfs_deadman_checktime_ms); 7613 } 7614 7615 thread_exit(); 7616 } 7617 7618 static void 7619 ztest_execute(int test, ztest_info_t *zi, uint64_t id) 7620 { 7621 ztest_ds_t *zd = &ztest_ds[id % ztest_opts.zo_datasets]; 7622 ztest_shared_callstate_t *zc = ZTEST_GET_SHARED_CALLSTATE(test); 7623 hrtime_t functime = gethrtime(); 7624 int i; 7625 7626 for (i = 0; i < zi->zi_iters; i++) 7627 zi->zi_func(zd, id); 7628 7629 functime = gethrtime() - functime; 7630 7631 atomic_add_64(&zc->zc_count, 1); 7632 atomic_add_64(&zc->zc_time, functime); 7633 7634 if (ztest_opts.zo_verbose >= 4) 7635 (void) printf("%6.2f sec in %s\n", 7636 (double)functime / NANOSEC, zi->zi_funcname); 7637 } 7638 7639 typedef struct ztest_raidz_expand_io { 7640 uint64_t rzx_id; 7641 uint64_t rzx_amount; 7642 uint64_t rzx_bufsize; 7643 const void *rzx_buffer; 7644 uint64_t rzx_alloc_max; 7645 spa_t *rzx_spa; 7646 } ztest_expand_io_t; 7647 7648 #undef OD_ARRAY_SIZE 7649 #define OD_ARRAY_SIZE 10 7650 7651 /* 7652 * Write a request amount of data to some dataset objects. 7653 * There will be ztest_opts.zo_threads count of these running in parallel. 7654 */ 7655 static __attribute__((noreturn)) void 7656 ztest_rzx_thread(void *arg) 7657 { 7658 ztest_expand_io_t *info = (ztest_expand_io_t *)arg; 7659 ztest_od_t *od; 7660 int batchsize; 7661 int od_size; 7662 ztest_ds_t *zd = &ztest_ds[info->rzx_id % ztest_opts.zo_datasets]; 7663 spa_t *spa = info->rzx_spa; 7664 7665 od_size = sizeof (ztest_od_t) * OD_ARRAY_SIZE; 7666 od = umem_alloc(od_size, UMEM_NOFAIL); 7667 batchsize = OD_ARRAY_SIZE; 7668 7669 /* Create objects to write to */ 7670 for (int b = 0; b < batchsize; b++) { 7671 ztest_od_init(od + b, info->rzx_id, FTAG, b, 7672 DMU_OT_UINT64_OTHER, 0, 0, 0); 7673 } 7674 if (ztest_object_init(zd, od, od_size, B_FALSE) != 0) { 7675 umem_free(od, od_size); 7676 thread_exit(); 7677 } 7678 7679 for (uint64_t offset = 0, written = 0; written < info->rzx_amount; 7680 offset += info->rzx_bufsize) { 7681 /* write to 10 objects */ 7682 for (int i = 0; i < batchsize && written < info->rzx_amount; 7683 i++) { 7684 (void) pthread_rwlock_rdlock(&zd->zd_zilog_lock); 7685 ztest_write(zd, od[i].od_object, offset, 7686 info->rzx_bufsize, info->rzx_buffer); 7687 (void) pthread_rwlock_unlock(&zd->zd_zilog_lock); 7688 written += info->rzx_bufsize; 7689 } 7690 txg_wait_synced(spa_get_dsl(spa), 0); 7691 /* due to inflation, we'll typically bail here */ 7692 if (metaslab_class_get_alloc(spa_normal_class(spa)) > 7693 info->rzx_alloc_max) { 7694 break; 7695 } 7696 } 7697 7698 /* Remove a few objects to leave some holes in allocation space */ 7699 mutex_enter(&zd->zd_dirobj_lock); 7700 (void) ztest_remove(zd, od, 2); 7701 mutex_exit(&zd->zd_dirobj_lock); 7702 7703 umem_free(od, od_size); 7704 7705 thread_exit(); 7706 } 7707 7708 static __attribute__((noreturn)) void 7709 ztest_thread(void *arg) 7710 { 7711 int rand; 7712 uint64_t id = (uintptr_t)arg; 7713 ztest_shared_t *zs = ztest_shared; 7714 uint64_t call_next; 7715 hrtime_t now; 7716 ztest_info_t *zi; 7717 ztest_shared_callstate_t *zc; 7718 7719 while 
((now = gethrtime()) < zs->zs_thread_stop) { 7720 /* 7721 * See if it's time to force a crash. 7722 */ 7723 if (now > zs->zs_thread_kill && 7724 raidz_expand_pause_point == RAIDZ_EXPAND_PAUSE_NONE) { 7725 ztest_kill(zs); 7726 } 7727 7728 /* 7729 * If we're getting ENOSPC with some regularity, stop. 7730 */ 7731 if (zs->zs_enospc_count > 10) 7732 break; 7733 7734 /* 7735 * Pick a random function to execute. 7736 */ 7737 rand = ztest_random(ZTEST_FUNCS); 7738 zi = &ztest_info[rand]; 7739 zc = ZTEST_GET_SHARED_CALLSTATE(rand); 7740 call_next = zc->zc_next; 7741 7742 if (now >= call_next && 7743 atomic_cas_64(&zc->zc_next, call_next, call_next + 7744 ztest_random(2 * zi->zi_interval[0] + 1)) == call_next) { 7745 ztest_execute(rand, zi, id); 7746 } 7747 } 7748 7749 thread_exit(); 7750 } 7751 7752 static void 7753 ztest_dataset_name(char *dsname, const char *pool, int d) 7754 { 7755 (void) snprintf(dsname, ZFS_MAX_DATASET_NAME_LEN, "%s/ds_%d", pool, d); 7756 } 7757 7758 static void 7759 ztest_dataset_destroy(int d) 7760 { 7761 char name[ZFS_MAX_DATASET_NAME_LEN]; 7762 int t; 7763 7764 ztest_dataset_name(name, ztest_opts.zo_pool, d); 7765 7766 if (ztest_opts.zo_verbose >= 3) 7767 (void) printf("Destroying %s to free up space\n", name); 7768 7769 /* 7770 * Cleanup any non-standard clones and snapshots. In general, 7771 * ztest thread t operates on dataset (t % zopt_datasets), 7772 * so there may be more than one thing to clean up. 7773 */ 7774 for (t = d; t < ztest_opts.zo_threads; 7775 t += ztest_opts.zo_datasets) 7776 ztest_dsl_dataset_cleanup(name, t); 7777 7778 (void) dmu_objset_find(name, ztest_objset_destroy_cb, NULL, 7779 DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN); 7780 } 7781 7782 static void 7783 ztest_dataset_dirobj_verify(ztest_ds_t *zd) 7784 { 7785 uint64_t usedobjs, dirobjs, scratch; 7786 7787 /* 7788 * ZTEST_DIROBJ is the object directory for the entire dataset. 7789 * Therefore, the number of objects in use should equal the 7790 * number of ZTEST_DIROBJ entries, +1 for ZTEST_DIROBJ itself. 7791 * If not, we have an object leak. 7792 * 7793 * Note that we can only check this in ztest_dataset_open(), 7794 * when the open-context and syncing-context values agree. 7795 * That's because zap_count() returns the open-context value, 7796 * while dmu_objset_space() returns the rootbp fill count. 
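 * (This is why the check below requires dirobjs + 1 == usedobjs.)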
7797 */ 7798 VERIFY0(zap_count(zd->zd_os, ZTEST_DIROBJ, &dirobjs)); 7799 dmu_objset_space(zd->zd_os, &scratch, &scratch, &usedobjs, &scratch); 7800 ASSERT3U(dirobjs + 1, ==, usedobjs); 7801 } 7802 7803 static int 7804 ztest_dataset_open(int d) 7805 { 7806 ztest_ds_t *zd = &ztest_ds[d]; 7807 uint64_t committed_seq = ZTEST_GET_SHARED_DS(d)->zd_seq; 7808 objset_t *os; 7809 zilog_t *zilog; 7810 char name[ZFS_MAX_DATASET_NAME_LEN]; 7811 int error; 7812 7813 ztest_dataset_name(name, ztest_opts.zo_pool, d); 7814 7815 (void) pthread_rwlock_rdlock(&ztest_name_lock); 7816 7817 error = ztest_dataset_create(name); 7818 if (error == ENOSPC) { 7819 (void) pthread_rwlock_unlock(&ztest_name_lock); 7820 ztest_record_enospc(FTAG); 7821 return (error); 7822 } 7823 ASSERT(error == 0 || error == EEXIST); 7824 7825 VERIFY0(ztest_dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, 7826 B_TRUE, zd, &os)); 7827 (void) pthread_rwlock_unlock(&ztest_name_lock); 7828 7829 ztest_zd_init(zd, ZTEST_GET_SHARED_DS(d), os); 7830 7831 zilog = zd->zd_zilog; 7832 7833 if (zilog->zl_header->zh_claim_lr_seq != 0 && 7834 zilog->zl_header->zh_claim_lr_seq < committed_seq) 7835 fatal(B_FALSE, "missing log records: " 7836 "claimed %"PRIu64" < committed %"PRIu64"", 7837 zilog->zl_header->zh_claim_lr_seq, committed_seq); 7838 7839 ztest_dataset_dirobj_verify(zd); 7840 7841 zil_replay(os, zd, ztest_replay_vector); 7842 7843 ztest_dataset_dirobj_verify(zd); 7844 7845 if (ztest_opts.zo_verbose >= 6) 7846 (void) printf("%s replay %"PRIu64" blocks, " 7847 "%"PRIu64" records, seq %"PRIu64"\n", 7848 zd->zd_name, 7849 zilog->zl_parse_blk_count, 7850 zilog->zl_parse_lr_count, 7851 zilog->zl_replaying_seq); 7852 7853 zilog = zil_open(os, ztest_get_data, NULL); 7854 7855 if (zilog->zl_replaying_seq != 0 && 7856 zilog->zl_replaying_seq < committed_seq) 7857 fatal(B_FALSE, "missing log records: " 7858 "replayed %"PRIu64" < committed %"PRIu64"", 7859 zilog->zl_replaying_seq, committed_seq); 7860 7861 return (0); 7862 } 7863 7864 static void 7865 ztest_dataset_close(int d) 7866 { 7867 ztest_ds_t *zd = &ztest_ds[d]; 7868 7869 zil_close(zd->zd_zilog); 7870 dmu_objset_disown(zd->zd_os, B_TRUE, zd); 7871 7872 ztest_zd_fini(zd); 7873 } 7874 7875 static int 7876 ztest_replay_zil_cb(const char *name, void *arg) 7877 { 7878 (void) arg; 7879 objset_t *os; 7880 ztest_ds_t *zdtmp; 7881 7882 VERIFY0(ztest_dmu_objset_own(name, DMU_OST_ANY, B_TRUE, 7883 B_TRUE, FTAG, &os)); 7884 7885 zdtmp = umem_alloc(sizeof (ztest_ds_t), UMEM_NOFAIL); 7886 7887 ztest_zd_init(zdtmp, NULL, os); 7888 zil_replay(os, zdtmp, ztest_replay_vector); 7889 ztest_zd_fini(zdtmp); 7890 7891 if (dmu_objset_zil(os)->zl_parse_lr_count != 0 && 7892 ztest_opts.zo_verbose >= 6) { 7893 zilog_t *zilog = dmu_objset_zil(os); 7894 7895 (void) printf("%s replay %"PRIu64" blocks, " 7896 "%"PRIu64" records, seq %"PRIu64"\n", 7897 name, 7898 zilog->zl_parse_blk_count, 7899 zilog->zl_parse_lr_count, 7900 zilog->zl_replaying_seq); 7901 } 7902 7903 umem_free(zdtmp, sizeof (ztest_ds_t)); 7904 7905 dmu_objset_disown(os, B_TRUE, FTAG); 7906 return (0); 7907 } 7908 7909 static void 7910 ztest_freeze(void) 7911 { 7912 ztest_ds_t *zd = &ztest_ds[0]; 7913 spa_t *spa; 7914 int numloops = 0; 7915 7916 /* freeze not supported during RAIDZ expansion */ 7917 if (ztest_opts.zo_raid_do_expand) 7918 return; 7919 7920 if (ztest_opts.zo_verbose >= 3) 7921 (void) printf("testing spa_freeze()...\n"); 7922 7923 raidz_scratch_verify(); 7924 kernel_init(SPA_MODE_READ | SPA_MODE_WRITE); 7925 VERIFY0(spa_open(ztest_opts.zo_pool, &spa, 
FTAG)); 7926 VERIFY0(ztest_dataset_open(0)); 7927 ztest_spa = spa; 7928 7929 /* 7930 * Force the first log block to be transactionally allocated. 7931 * We have to do this before we freeze the pool -- otherwise 7932 * the log chain won't be anchored. 7933 */ 7934 while (BP_IS_HOLE(&zd->zd_zilog->zl_header->zh_log)) { 7935 ztest_dmu_object_alloc_free(zd, 0); 7936 zil_commit(zd->zd_zilog, 0); 7937 } 7938 7939 txg_wait_synced(spa_get_dsl(spa), 0); 7940 7941 /* 7942 * Freeze the pool. This stops spa_sync() from doing anything, 7943 * so that the only way to record changes from now on is the ZIL. 7944 */ 7945 spa_freeze(spa); 7946 7947 /* 7948 * Because it is hard to predict how much space a write will actually 7949 * require beforehand, we leave ourselves some fudge space to write over 7950 * capacity. 7951 */ 7952 uint64_t capacity = metaslab_class_get_space(spa_normal_class(spa)) / 2; 7953 7954 /* 7955 * Run tests that generate log records but don't alter the pool config 7956 * or depend on DSL sync tasks (snapshots, objset create/destroy, etc). 7957 * We do a txg_wait_synced() after each iteration to force the txg 7958 * to increase well beyond the last synced value in the uberblock. 7959 * The ZIL should be OK with that. 7960 * 7961 * Run a random number of times less than zo_maxloops and ensure we do 7962 * not run out of space on the pool. 7963 */ 7964 while (ztest_random(10) != 0 && 7965 numloops++ < ztest_opts.zo_maxloops && 7966 metaslab_class_get_alloc(spa_normal_class(spa)) < capacity) { 7967 ztest_od_t od; 7968 ztest_od_init(&od, 0, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0, 0); 7969 VERIFY0(ztest_object_init(zd, &od, sizeof (od), B_FALSE)); 7970 ztest_io(zd, od.od_object, 7971 ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT); 7972 txg_wait_synced(spa_get_dsl(spa), 0); 7973 } 7974 7975 /* 7976 * Commit all of the changes we just generated. 7977 */ 7978 zil_commit(zd->zd_zilog, 0); 7979 txg_wait_synced(spa_get_dsl(spa), 0); 7980 7981 /* 7982 * Close our dataset and close the pool. 7983 */ 7984 ztest_dataset_close(0); 7985 spa_close(spa, FTAG); 7986 kernel_fini(); 7987 7988 /* 7989 * Open and close the pool and dataset to induce log replay. 7990 */ 7991 raidz_scratch_verify(); 7992 kernel_init(SPA_MODE_READ | SPA_MODE_WRITE); 7993 VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG)); 7994 ASSERT3U(spa_freeze_txg(spa), ==, UINT64_MAX); 7995 VERIFY0(ztest_dataset_open(0)); 7996 ztest_spa = spa; 7997 txg_wait_synced(spa_get_dsl(spa), 0); 7998 ztest_dataset_close(0); 7999 ztest_reguid(NULL, 0); 8000 8001 spa_close(spa, FTAG); 8002 kernel_fini(); 8003 } 8004 8005 static void 8006 ztest_import_impl(void) 8007 { 8008 importargs_t args = { 0 }; 8009 nvlist_t *cfg = NULL; 8010 int nsearch = 1; 8011 char *searchdirs[nsearch]; 8012 int flags = ZFS_IMPORT_MISSING_LOG; 8013 8014 searchdirs[0] = ztest_opts.zo_dir; 8015 args.paths = nsearch; 8016 args.path = searchdirs; 8017 args.can_be_active = B_FALSE; 8018 8019 libpc_handle_t lpch = { 8020 .lpc_lib_handle = NULL, 8021 .lpc_ops = &libzpool_config_ops, 8022 .lpc_printerr = B_TRUE 8023 }; 8024 VERIFY0(zpool_find_config(&lpch, ztest_opts.zo_pool, &cfg, &args)); 8025 VERIFY0(spa_import(ztest_opts.zo_pool, cfg, NULL, flags)); 8026 fnvlist_free(cfg); 8027 } 8028 8029 /* 8030 * Import a storage pool with the given name. 
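 *
 * Besides the import itself (done by ztest_import_impl() above), this
 * records the metaslab size and pool guid in the shared state and,
 * unless this is an MMP test run, cross-checks the pool with zdb and
 * exercises spa_freeze() before returning.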
8031 */ 8032 static void 8033 ztest_import(ztest_shared_t *zs) 8034 { 8035 spa_t *spa; 8036 8037 mutex_init(&ztest_vdev_lock, NULL, MUTEX_DEFAULT, NULL); 8038 mutex_init(&ztest_checkpoint_lock, NULL, MUTEX_DEFAULT, NULL); 8039 VERIFY0(pthread_rwlock_init(&ztest_name_lock, NULL)); 8040 8041 raidz_scratch_verify(); 8042 kernel_init(SPA_MODE_READ | SPA_MODE_WRITE); 8043 8044 ztest_import_impl(); 8045 8046 VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG)); 8047 zs->zs_metaslab_sz = 8048 1ULL << spa->spa_root_vdev->vdev_child[0]->vdev_ms_shift; 8049 zs->zs_guid = spa_guid(spa); 8050 spa_close(spa, FTAG); 8051 8052 kernel_fini(); 8053 8054 if (!ztest_opts.zo_mmp_test) { 8055 ztest_run_zdb(zs->zs_guid); 8056 ztest_freeze(); 8057 ztest_run_zdb(zs->zs_guid); 8058 } 8059 8060 (void) pthread_rwlock_destroy(&ztest_name_lock); 8061 mutex_destroy(&ztest_vdev_lock); 8062 mutex_destroy(&ztest_checkpoint_lock); 8063 } 8064 8065 /* 8066 * After the expansion was killed, check that the pool is healthy 8067 */ 8068 static void 8069 ztest_raidz_expand_check(spa_t *spa) 8070 { 8071 ASSERT3U(ztest_opts.zo_raidz_expand_test, ==, RAIDZ_EXPAND_KILLED); 8072 /* 8073 * Set pool check done flag, main program will run a zdb check 8074 * of the pool when we exit. 8075 */ 8076 ztest_shared_opts->zo_raidz_expand_test = RAIDZ_EXPAND_CHECKED; 8077 8078 /* Wait for reflow to finish */ 8079 if (ztest_opts.zo_verbose >= 1) { 8080 (void) printf("\nwaiting for reflow to finish ...\n"); 8081 } 8082 pool_raidz_expand_stat_t rzx_stats; 8083 pool_raidz_expand_stat_t *pres = &rzx_stats; 8084 do { 8085 txg_wait_synced(spa_get_dsl(spa), 0); 8086 (void) poll(NULL, 0, 500); /* wait 1/2 second */ 8087 8088 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 8089 (void) spa_raidz_expand_get_stats(spa, pres); 8090 spa_config_exit(spa, SCL_CONFIG, FTAG); 8091 } while (pres->pres_state != DSS_FINISHED && 8092 pres->pres_reflowed < pres->pres_to_reflow); 8093 8094 if (ztest_opts.zo_verbose >= 1) { 8095 (void) printf("verifying an interrupted raidz " 8096 "expansion using a pool scrub ...\n"); 8097 } 8098 8099 /* Will fail here if there is non-recoverable corruption detected */ 8100 int error = ztest_scrub_impl(spa); 8101 if (error == EBUSY) 8102 error = 0; 8103 8104 VERIFY0(error); 8105 8106 if (ztest_opts.zo_verbose >= 1) { 8107 (void) printf("raidz expansion scrub check complete\n"); 8108 } 8109 } 8110 8111 /* 8112 * Start a raidz expansion test. We run some I/O on the pool for a while 8113 * to get some data in the pool. Then we grow the raidz and 8114 * kill the test at the requested offset into the reflow, verifying that 8115 * doing such does not lead to pool corruption. 
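 *
 * The shared zo_raidz_expand_test state advances as the test progresses:
 * STARTED once this function begins, KILLED once the reflow is under way
 * (so a subsequent pass knows to verify rather than restart), and
 * CHECKED when ztest_raidz_expand_check() runs on the following pass.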
8116  */
8117 static void
8118 ztest_raidz_expand_run(ztest_shared_t *zs, spa_t *spa)
8119 {
8120 	nvlist_t *root;
8121 	pool_raidz_expand_stat_t rzx_stats;
8122 	pool_raidz_expand_stat_t *pres = &rzx_stats;
8123 	kthread_t **run_threads;
8124 	vdev_t *cvd, *rzvd = spa->spa_root_vdev->vdev_child[0];
8125 	int total_disks = rzvd->vdev_children;
8126 	int data_disks = total_disks - vdev_get_nparity(rzvd);
8127 	uint64_t alloc_goal;
8128 	uint64_t csize;
8129 	int error, t;
8130 	int threads = ztest_opts.zo_threads;
8131 	ztest_expand_io_t *thread_args;
8132 
8133 	ASSERT3U(ztest_opts.zo_raidz_expand_test, !=, RAIDZ_EXPAND_NONE);
8134 	ASSERT3P(rzvd->vdev_ops, ==, &vdev_raidz_ops);
8135 	ztest_opts.zo_raidz_expand_test = RAIDZ_EXPAND_STARTED;
8136 
8137 	/* Set up a 1 MiB buffer of random data */
8138 	uint64_t bufsize = 1024 * 1024;
8139 	void *buffer = umem_alloc(bufsize, UMEM_NOFAIL);
8140 
8141 	if (read(ztest_fd_rand, buffer, bufsize) != bufsize) {
8142 		fatal(B_TRUE, "short read from /dev/urandom");
8143 	}
8144 	/*
8145 	 * Put some data in the pool and then attach a vdev to initiate
8146 	 * reflow.
8147 	 */
8148 	run_threads = umem_zalloc(threads * sizeof (kthread_t *), UMEM_NOFAIL);
8149 	thread_args = umem_zalloc(threads * sizeof (ztest_expand_io_t),
8150 	    UMEM_NOFAIL);
8151 	/* Aim for roughly 25% of allocatable space, up to 1 GiB */
8152 	alloc_goal = (vdev_get_min_asize(rzvd) * data_disks) / total_disks;
8153 	alloc_goal = MIN(alloc_goal >> 2, 1024*1024*1024);
8154 	if (ztest_opts.zo_verbose >= 1) {
8155 		(void) printf("adding data to pool '%s', goal %llu bytes\n",
8156 		    ztest_opts.zo_pool, (u_longlong_t)alloc_goal);
8157 	}
8158 
8159 	/*
8160 	 * Kick off all the I/O generators that run in parallel.
8161 	 */
8162 	for (t = 0; t < threads; t++) {
8163 		if (t < ztest_opts.zo_datasets && ztest_dataset_open(t) != 0) {
8164 			umem_free(run_threads, threads * sizeof (kthread_t *));
8165 			umem_free(buffer, bufsize);
8166 			return;
8167 		}
8168 		thread_args[t].rzx_id = t;
8169 		thread_args[t].rzx_amount = alloc_goal / threads;
8170 		thread_args[t].rzx_bufsize = bufsize;
8171 		thread_args[t].rzx_buffer = buffer;
8172 		thread_args[t].rzx_alloc_max = alloc_goal;
8173 		thread_args[t].rzx_spa = spa;
8174 		run_threads[t] = thread_create(NULL, 0, ztest_rzx_thread,
8175 		    &thread_args[t], 0, NULL, TS_RUN | TS_JOINABLE,
8176 		    defclsyspri);
8177 	}
8178 
8179 	/*
8180 	 * Wait for all of the writers to complete.
8181 	 */
8182 	for (t = 0; t < threads; t++)
8183 		VERIFY0(thread_join(run_threads[t]));
8184 
8185 	/*
8186 	 * Close all datasets. This must be done after all the threads
8187 	 * are joined so we can be sure none of the datasets are in-use
8188 	 * by any of the threads.
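	 * Only the first zo_datasets threads opened a dataset above, so
	 * only those need to be closed here.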
8189 */ 8190 for (t = 0; t < ztest_opts.zo_threads; t++) { 8191 if (t < ztest_opts.zo_datasets) 8192 ztest_dataset_close(t); 8193 } 8194 8195 txg_wait_synced(spa_get_dsl(spa), 0); 8196 8197 zs->zs_alloc = metaslab_class_get_alloc(spa_normal_class(spa)); 8198 zs->zs_space = metaslab_class_get_space(spa_normal_class(spa)); 8199 8200 umem_free(buffer, bufsize); 8201 umem_free(run_threads, threads * sizeof (kthread_t *)); 8202 umem_free(thread_args, threads * sizeof (ztest_expand_io_t)); 8203 8204 /* Set our reflow target to 25%, 50% or 75% of allocated size */ 8205 uint_t multiple = ztest_random(3) + 1; 8206 uint64_t reflow_max = (rzvd->vdev_stat.vs_alloc * multiple) / 4; 8207 raidz_expand_max_reflow_bytes = reflow_max; 8208 8209 if (ztest_opts.zo_verbose >= 1) { 8210 (void) printf("running raidz expansion test, killing when " 8211 "reflow reaches %llu bytes (%u/4 of allocated space)\n", 8212 (u_longlong_t)reflow_max, multiple); 8213 } 8214 8215 /* XXX - do we want some I/O load during the reflow? */ 8216 8217 /* 8218 * Use a disk size that is larger than existing ones 8219 */ 8220 cvd = rzvd->vdev_child[0]; 8221 csize = vdev_get_min_asize(cvd); 8222 csize += csize / 10; 8223 /* 8224 * Path to vdev to be attached 8225 */ 8226 char *newpath = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); 8227 (void) snprintf(newpath, MAXPATHLEN, ztest_dev_template, 8228 ztest_opts.zo_dir, ztest_opts.zo_pool, rzvd->vdev_children); 8229 /* 8230 * Build the nvlist describing newpath. 8231 */ 8232 root = make_vdev_root(newpath, NULL, NULL, csize, ztest_get_ashift(), 8233 NULL, 0, 0, 1); 8234 /* 8235 * Expand the raidz vdev by attaching the new disk 8236 */ 8237 if (ztest_opts.zo_verbose >= 1) { 8238 (void) printf("expanding raidz: %d wide to %d wide with '%s'\n", 8239 (int)rzvd->vdev_children, (int)rzvd->vdev_children + 1, 8240 newpath); 8241 } 8242 error = spa_vdev_attach(spa, rzvd->vdev_guid, root, B_FALSE, B_FALSE); 8243 nvlist_free(root); 8244 if (error != 0) { 8245 fatal(0, "raidz expand: attach (%s %llu) returned %d", 8246 newpath, (long long)csize, error); 8247 } 8248 8249 /* 8250 * Wait for reflow to begin 8251 */ 8252 while (spa->spa_raidz_expand == NULL) { 8253 txg_wait_synced(spa_get_dsl(spa), 0); 8254 (void) poll(NULL, 0, 100); /* wait 1/10 second */ 8255 } 8256 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 8257 (void) spa_raidz_expand_get_stats(spa, pres); 8258 spa_config_exit(spa, SCL_CONFIG, FTAG); 8259 while (pres->pres_state != DSS_SCANNING) { 8260 txg_wait_synced(spa_get_dsl(spa), 0); 8261 (void) poll(NULL, 0, 100); /* wait 1/10 second */ 8262 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 8263 (void) spa_raidz_expand_get_stats(spa, pres); 8264 spa_config_exit(spa, SCL_CONFIG, FTAG); 8265 } 8266 8267 ASSERT3U(pres->pres_state, ==, DSS_SCANNING); 8268 ASSERT3U(pres->pres_to_reflow, !=, 0); 8269 /* 8270 * Set so when we are killed we go to raidz checking rather than 8271 * restarting test. 
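 * The flag lives in the mmap-ed shared options, so the new value
 * survives the SIGKILL below; on the next pass ztest_run() sees
 * RAIDZ_EXPAND_KILLED and calls ztest_raidz_expand_check() instead of
 * running the generic workload.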
8272 */ 8273 ztest_shared_opts->zo_raidz_expand_test = RAIDZ_EXPAND_KILLED; 8274 if (ztest_opts.zo_verbose >= 1) { 8275 (void) printf("raidz expansion reflow started, waiting for " 8276 "%llu bytes to be copied\n", (u_longlong_t)reflow_max); 8277 } 8278 8279 /* 8280 * Wait for reflow maximum to be reached and then kill the test 8281 */ 8282 while (pres->pres_reflowed < reflow_max) { 8283 txg_wait_synced(spa_get_dsl(spa), 0); 8284 (void) poll(NULL, 0, 100); /* wait 1/10 second */ 8285 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 8286 (void) spa_raidz_expand_get_stats(spa, pres); 8287 spa_config_exit(spa, SCL_CONFIG, FTAG); 8288 } 8289 8290 /* Reset the reflow pause before killing */ 8291 raidz_expand_max_reflow_bytes = 0; 8292 8293 if (ztest_opts.zo_verbose >= 1) { 8294 (void) printf("killing raidz expansion test after reflow " 8295 "reached %llu bytes\n", (u_longlong_t)pres->pres_reflowed); 8296 } 8297 8298 /* 8299 * Kill ourself to simulate a panic during a reflow. Our parent will 8300 * restart the test and the changed flag value will drive the test 8301 * through the scrub/check code to verify the pool is not corrupted. 8302 */ 8303 ztest_kill(zs); 8304 } 8305 8306 static void 8307 ztest_generic_run(ztest_shared_t *zs, spa_t *spa) 8308 { 8309 kthread_t **run_threads; 8310 int t; 8311 8312 run_threads = umem_zalloc(ztest_opts.zo_threads * sizeof (kthread_t *), 8313 UMEM_NOFAIL); 8314 8315 /* 8316 * Kick off all the tests that run in parallel. 8317 */ 8318 for (t = 0; t < ztest_opts.zo_threads; t++) { 8319 if (t < ztest_opts.zo_datasets && ztest_dataset_open(t) != 0) { 8320 umem_free(run_threads, ztest_opts.zo_threads * 8321 sizeof (kthread_t *)); 8322 return; 8323 } 8324 8325 run_threads[t] = thread_create(NULL, 0, ztest_thread, 8326 (void *)(uintptr_t)t, 0, NULL, TS_RUN | TS_JOINABLE, 8327 defclsyspri); 8328 } 8329 8330 /* 8331 * Wait for all of the tests to complete. 8332 */ 8333 for (t = 0; t < ztest_opts.zo_threads; t++) 8334 VERIFY0(thread_join(run_threads[t])); 8335 8336 /* 8337 * Close all datasets. This must be done after all the threads 8338 * are joined so we can be sure none of the datasets are in-use 8339 * by any of the threads. 8340 */ 8341 for (t = 0; t < ztest_opts.zo_threads; t++) { 8342 if (t < ztest_opts.zo_datasets) 8343 ztest_dataset_close(t); 8344 } 8345 8346 txg_wait_synced(spa_get_dsl(spa), 0); 8347 8348 zs->zs_alloc = metaslab_class_get_alloc(spa_normal_class(spa)); 8349 zs->zs_space = metaslab_class_get_space(spa_normal_class(spa)); 8350 8351 umem_free(run_threads, ztest_opts.zo_threads * sizeof (kthread_t *)); 8352 } 8353 8354 /* 8355 * Setup our test context and kick off threads to run tests on all datasets 8356 * in parallel. 8357 */ 8358 static void 8359 ztest_run(ztest_shared_t *zs) 8360 { 8361 spa_t *spa; 8362 objset_t *os; 8363 kthread_t *resume_thread, *deadman_thread; 8364 uint64_t object; 8365 int error; 8366 int t, d; 8367 8368 ztest_exiting = B_FALSE; 8369 8370 /* 8371 * Initialize parent/child shared state. 
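 * Among the fields set here, zs_thread_kill controls when ztest_thread()
 * calls ztest_kill(): on roughly zo_killrate percent of passes it is
 * pulled back from zs_thread_stop by a random fraction of the pass time.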
8372 */ 8373 mutex_init(&ztest_vdev_lock, NULL, MUTEX_DEFAULT, NULL); 8374 mutex_init(&ztest_checkpoint_lock, NULL, MUTEX_DEFAULT, NULL); 8375 VERIFY0(pthread_rwlock_init(&ztest_name_lock, NULL)); 8376 8377 zs->zs_thread_start = gethrtime(); 8378 zs->zs_thread_stop = 8379 zs->zs_thread_start + ztest_opts.zo_passtime * NANOSEC; 8380 zs->zs_thread_stop = MIN(zs->zs_thread_stop, zs->zs_proc_stop); 8381 zs->zs_thread_kill = zs->zs_thread_stop; 8382 if (ztest_random(100) < ztest_opts.zo_killrate) { 8383 zs->zs_thread_kill -= 8384 ztest_random(ztest_opts.zo_passtime * NANOSEC); 8385 } 8386 8387 mutex_init(&zcl.zcl_callbacks_lock, NULL, MUTEX_DEFAULT, NULL); 8388 8389 list_create(&zcl.zcl_callbacks, sizeof (ztest_cb_data_t), 8390 offsetof(ztest_cb_data_t, zcd_node)); 8391 8392 /* 8393 * Open our pool. It may need to be imported first depending on 8394 * what tests were running when the previous pass was terminated. 8395 */ 8396 raidz_scratch_verify(); 8397 kernel_init(SPA_MODE_READ | SPA_MODE_WRITE); 8398 error = spa_open(ztest_opts.zo_pool, &spa, FTAG); 8399 if (error) { 8400 VERIFY3S(error, ==, ENOENT); 8401 ztest_import_impl(); 8402 VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG)); 8403 zs->zs_metaslab_sz = 8404 1ULL << spa->spa_root_vdev->vdev_child[0]->vdev_ms_shift; 8405 } 8406 8407 metaslab_preload_limit = ztest_random(20) + 1; 8408 ztest_spa = spa; 8409 8410 /* 8411 * XXX - BUGBUG raidz expansion do not run this for generic for now 8412 */ 8413 if (ztest_opts.zo_raidz_expand_test != RAIDZ_EXPAND_NONE) 8414 VERIFY0(vdev_raidz_impl_set("cycle")); 8415 8416 dmu_objset_stats_t dds; 8417 VERIFY0(ztest_dmu_objset_own(ztest_opts.zo_pool, 8418 DMU_OST_ANY, B_TRUE, B_TRUE, FTAG, &os)); 8419 dsl_pool_config_enter(dmu_objset_pool(os), FTAG); 8420 dmu_objset_fast_stat(os, &dds); 8421 dsl_pool_config_exit(dmu_objset_pool(os), FTAG); 8422 dmu_objset_disown(os, B_TRUE, FTAG); 8423 8424 /* Give the dedicated raidz expansion test more grace time */ 8425 if (ztest_opts.zo_raidz_expand_test != RAIDZ_EXPAND_NONE) 8426 zfs_deadman_synctime_ms *= 2; 8427 8428 /* 8429 * Create a thread to periodically resume suspended I/O. 8430 */ 8431 resume_thread = thread_create(NULL, 0, ztest_resume_thread, 8432 spa, 0, NULL, TS_RUN | TS_JOINABLE, defclsyspri); 8433 8434 /* 8435 * Create a deadman thread and set to panic if we hang. 8436 */ 8437 deadman_thread = thread_create(NULL, 0, ztest_deadman_thread, 8438 zs, 0, NULL, TS_RUN | TS_JOINABLE, defclsyspri); 8439 8440 spa->spa_deadman_failmode = ZIO_FAILURE_MODE_PANIC; 8441 8442 /* 8443 * Verify that we can safely inquire about any object, 8444 * whether it's allocated or not. To make it interesting, 8445 * we probe a 5-wide window around each power of two. 8446 * This hits all edge cases, including zero and the max. 8447 */ 8448 for (t = 0; t < 64; t++) { 8449 for (d = -5; d <= 5; d++) { 8450 error = dmu_object_info(spa->spa_meta_objset, 8451 (1ULL << t) + d, NULL); 8452 ASSERT(error == 0 || error == ENOENT || 8453 error == EINVAL); 8454 } 8455 } 8456 8457 /* 8458 * If we got any ENOSPC errors on the previous run, destroy something. 
8459 */ 8460 if (zs->zs_enospc_count != 0) { 8461 /* Not expecting ENOSPC errors during raidz expansion tests */ 8462 ASSERT3U(ztest_opts.zo_raidz_expand_test, ==, 8463 RAIDZ_EXPAND_NONE); 8464 8465 int d = ztest_random(ztest_opts.zo_datasets); 8466 ztest_dataset_destroy(d); 8467 } 8468 zs->zs_enospc_count = 0; 8469 8470 /* 8471 * If we were in the middle of ztest_device_removal() and were killed 8472 * we need to ensure the removal and scrub complete before running 8473 * any tests that check ztest_device_removal_active. The removal will 8474 * be restarted automatically when the spa is opened, but we need to 8475 * initiate the scrub manually if it is not already in progress. Note 8476 * that we always run the scrub whenever an indirect vdev exists 8477 * because we have no way of knowing for sure if ztest_device_removal() 8478 * fully completed its scrub before the pool was reimported. 8479 * 8480 * Does not apply for the RAIDZ expansion specific test runs 8481 */ 8482 if (ztest_opts.zo_raidz_expand_test == RAIDZ_EXPAND_NONE && 8483 (spa->spa_removing_phys.sr_state == DSS_SCANNING || 8484 spa->spa_removing_phys.sr_prev_indirect_vdev != -1)) { 8485 while (spa->spa_removing_phys.sr_state == DSS_SCANNING) 8486 txg_wait_synced(spa_get_dsl(spa), 0); 8487 8488 error = ztest_scrub_impl(spa); 8489 if (error == EBUSY) 8490 error = 0; 8491 ASSERT0(error); 8492 } 8493 8494 if (ztest_opts.zo_verbose >= 4) 8495 (void) printf("starting main threads...\n"); 8496 8497 /* 8498 * Replay all logs of all datasets in the pool. This is primarily for 8499 * temporary datasets which wouldn't otherwise get replayed, which 8500 * can trigger failures when attempting to offline a SLOG in 8501 * ztest_fault_inject(). 8502 */ 8503 (void) dmu_objset_find(ztest_opts.zo_pool, ztest_replay_zil_cb, 8504 NULL, DS_FIND_CHILDREN); 8505 8506 if (ztest_opts.zo_raidz_expand_test == RAIDZ_EXPAND_REQUESTED) 8507 ztest_raidz_expand_run(zs, spa); 8508 else if (ztest_opts.zo_raidz_expand_test == RAIDZ_EXPAND_KILLED) 8509 ztest_raidz_expand_check(spa); 8510 else 8511 ztest_generic_run(zs, spa); 8512 8513 /* Kill the resume and deadman threads */ 8514 ztest_exiting = B_TRUE; 8515 VERIFY0(thread_join(resume_thread)); 8516 VERIFY0(thread_join(deadman_thread)); 8517 ztest_resume(spa); 8518 8519 /* 8520 * Right before closing the pool, kick off a bunch of async I/O; 8521 * spa_close() should wait for it to complete. 8522 */ 8523 for (object = 1; object < 50; object++) { 8524 dmu_prefetch(spa->spa_meta_objset, object, 0, 0, 1ULL << 20, 8525 ZIO_PRIORITY_SYNC_READ); 8526 } 8527 8528 /* Verify that at least one commit cb was called in a timely fashion */ 8529 if (zc_cb_counter >= ZTEST_COMMIT_CB_MIN_REG) 8530 VERIFY0(zc_min_txg_delay); 8531 8532 spa_close(spa, FTAG); 8533 8534 /* 8535 * Verify that we can loop over all pools. 8536 */ 8537 mutex_enter(&spa_namespace_lock); 8538 for (spa = spa_next(NULL); spa != NULL; spa = spa_next(spa)) 8539 if (ztest_opts.zo_verbose > 3) 8540 (void) printf("spa_next: found %s\n", spa_name(spa)); 8541 mutex_exit(&spa_namespace_lock); 8542 8543 /* 8544 * Verify that we can export the pool and reimport it under a 8545 * different name. 
8546 */ 8547 if ((ztest_random(2) == 0) && !ztest_opts.zo_mmp_test) { 8548 char name[ZFS_MAX_DATASET_NAME_LEN]; 8549 (void) snprintf(name, sizeof (name), "%s_import", 8550 ztest_opts.zo_pool); 8551 ztest_spa_import_export(ztest_opts.zo_pool, name); 8552 ztest_spa_import_export(name, ztest_opts.zo_pool); 8553 } 8554 8555 kernel_fini(); 8556 8557 list_destroy(&zcl.zcl_callbacks); 8558 mutex_destroy(&zcl.zcl_callbacks_lock); 8559 (void) pthread_rwlock_destroy(&ztest_name_lock); 8560 mutex_destroy(&ztest_vdev_lock); 8561 mutex_destroy(&ztest_checkpoint_lock); 8562 } 8563 8564 static void 8565 print_time(hrtime_t t, char *timebuf) 8566 { 8567 hrtime_t s = t / NANOSEC; 8568 hrtime_t m = s / 60; 8569 hrtime_t h = m / 60; 8570 hrtime_t d = h / 24; 8571 8572 s -= m * 60; 8573 m -= h * 60; 8574 h -= d * 24; 8575 8576 timebuf[0] = '\0'; 8577 8578 if (d) 8579 (void) sprintf(timebuf, 8580 "%llud%02lluh%02llum%02llus", d, h, m, s); 8581 else if (h) 8582 (void) sprintf(timebuf, "%lluh%02llum%02llus", h, m, s); 8583 else if (m) 8584 (void) sprintf(timebuf, "%llum%02llus", m, s); 8585 else 8586 (void) sprintf(timebuf, "%llus", s); 8587 } 8588 8589 static nvlist_t * 8590 make_random_pool_props(void) 8591 { 8592 nvlist_t *props; 8593 8594 props = fnvlist_alloc(); 8595 8596 /* Twenty percent of the time enable ZPOOL_PROP_DEDUP_TABLE_QUOTA */ 8597 if (ztest_random(5) == 0) { 8598 fnvlist_add_uint64(props, 8599 zpool_prop_to_name(ZPOOL_PROP_DEDUP_TABLE_QUOTA), 8600 2 * 1024 * 1024); 8601 } 8602 8603 /* Fifty percent of the time enable ZPOOL_PROP_AUTOREPLACE */ 8604 if (ztest_random(2) == 0) { 8605 fnvlist_add_uint64(props, 8606 zpool_prop_to_name(ZPOOL_PROP_AUTOREPLACE), 1); 8607 } 8608 8609 return (props); 8610 } 8611 8612 /* 8613 * Create a storage pool with the given name and initial vdev size. 8614 * Then test spa_freeze() functionality. 8615 */ 8616 static void 8617 ztest_init(ztest_shared_t *zs) 8618 { 8619 spa_t *spa; 8620 nvlist_t *nvroot, *props; 8621 int i; 8622 8623 mutex_init(&ztest_vdev_lock, NULL, MUTEX_DEFAULT, NULL); 8624 mutex_init(&ztest_checkpoint_lock, NULL, MUTEX_DEFAULT, NULL); 8625 VERIFY0(pthread_rwlock_init(&ztest_name_lock, NULL)); 8626 8627 raidz_scratch_verify(); 8628 kernel_init(SPA_MODE_READ | SPA_MODE_WRITE); 8629 8630 /* 8631 * Create the storage pool. 8632 */ 8633 (void) spa_destroy(ztest_opts.zo_pool); 8634 ztest_shared->zs_vdev_next_leaf = 0; 8635 zs->zs_splits = 0; 8636 zs->zs_mirrors = ztest_opts.zo_mirrors; 8637 nvroot = make_vdev_root(NULL, NULL, NULL, ztest_opts.zo_vdev_size, 0, 8638 NULL, ztest_opts.zo_raid_children, zs->zs_mirrors, 1); 8639 props = make_random_pool_props(); 8640 8641 /* 8642 * We don't expect the pool to suspend unless maxfaults == 0, 8643 * in which case ztest_fault_inject() temporarily takes away 8644 * the only valid replica. 8645 */ 8646 fnvlist_add_uint64(props, 8647 zpool_prop_to_name(ZPOOL_PROP_FAILUREMODE), 8648 MAXFAULTS(zs) ? ZIO_FAILURE_MODE_PANIC : ZIO_FAILURE_MODE_WAIT); 8649 8650 for (i = 0; i < SPA_FEATURES; i++) { 8651 char *buf; 8652 8653 if (!spa_feature_table[i].fi_zfs_mod_supported) 8654 continue; 8655 8656 /* 8657 * 75% chance of using the log space map feature. We want ztest 8658 * to exercise both the code paths that use the log space map 8659 * feature and the ones that don't. 
8660 */ 8661 if (i == SPA_FEATURE_LOG_SPACEMAP && ztest_random(4) == 0) 8662 continue; 8663 8664 /* 8665 * split 50/50 between legacy and fast dedup 8666 */ 8667 if (i == SPA_FEATURE_FAST_DEDUP && ztest_random(2) != 0) 8668 continue; 8669 8670 VERIFY3S(-1, !=, asprintf(&buf, "feature@%s", 8671 spa_feature_table[i].fi_uname)); 8672 fnvlist_add_uint64(props, buf, 0); 8673 free(buf); 8674 } 8675 8676 VERIFY0(spa_create(ztest_opts.zo_pool, nvroot, props, NULL, NULL)); 8677 fnvlist_free(nvroot); 8678 fnvlist_free(props); 8679 8680 VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG)); 8681 zs->zs_metaslab_sz = 8682 1ULL << spa->spa_root_vdev->vdev_child[0]->vdev_ms_shift; 8683 zs->zs_guid = spa_guid(spa); 8684 spa_close(spa, FTAG); 8685 8686 kernel_fini(); 8687 8688 if (!ztest_opts.zo_mmp_test) { 8689 ztest_run_zdb(zs->zs_guid); 8690 ztest_freeze(); 8691 ztest_run_zdb(zs->zs_guid); 8692 } 8693 8694 (void) pthread_rwlock_destroy(&ztest_name_lock); 8695 mutex_destroy(&ztest_vdev_lock); 8696 mutex_destroy(&ztest_checkpoint_lock); 8697 } 8698 8699 static void 8700 setup_data_fd(void) 8701 { 8702 static char ztest_name_data[] = "/tmp/ztest.data.XXXXXX"; 8703 8704 ztest_fd_data = mkstemp(ztest_name_data); 8705 ASSERT3S(ztest_fd_data, >=, 0); 8706 (void) unlink(ztest_name_data); 8707 } 8708 8709 static int 8710 shared_data_size(ztest_shared_hdr_t *hdr) 8711 { 8712 int size; 8713 8714 size = hdr->zh_hdr_size; 8715 size += hdr->zh_opts_size; 8716 size += hdr->zh_size; 8717 size += hdr->zh_stats_size * hdr->zh_stats_count; 8718 size += hdr->zh_ds_size * hdr->zh_ds_count; 8719 size += hdr->zh_scratch_state_size; 8720 8721 return (size); 8722 } 8723 8724 static void 8725 setup_hdr(void) 8726 { 8727 int size; 8728 ztest_shared_hdr_t *hdr; 8729 8730 hdr = (void *)mmap(0, P2ROUNDUP(sizeof (*hdr), getpagesize()), 8731 PROT_READ | PROT_WRITE, MAP_SHARED, ztest_fd_data, 0); 8732 ASSERT3P(hdr, !=, MAP_FAILED); 8733 8734 VERIFY0(ftruncate(ztest_fd_data, sizeof (ztest_shared_hdr_t))); 8735 8736 hdr->zh_hdr_size = sizeof (ztest_shared_hdr_t); 8737 hdr->zh_opts_size = sizeof (ztest_shared_opts_t); 8738 hdr->zh_size = sizeof (ztest_shared_t); 8739 hdr->zh_stats_size = sizeof (ztest_shared_callstate_t); 8740 hdr->zh_stats_count = ZTEST_FUNCS; 8741 hdr->zh_ds_size = sizeof (ztest_shared_ds_t); 8742 hdr->zh_ds_count = ztest_opts.zo_datasets; 8743 hdr->zh_scratch_state_size = sizeof (ztest_shared_scratch_state_t); 8744 8745 size = shared_data_size(hdr); 8746 VERIFY0(ftruncate(ztest_fd_data, size)); 8747 8748 (void) munmap((caddr_t)hdr, P2ROUNDUP(sizeof (*hdr), getpagesize())); 8749 } 8750 8751 static void 8752 setup_data(void) 8753 { 8754 int size, offset; 8755 ztest_shared_hdr_t *hdr; 8756 uint8_t *buf; 8757 8758 hdr = (void *)mmap(0, P2ROUNDUP(sizeof (*hdr), getpagesize()), 8759 PROT_READ, MAP_SHARED, ztest_fd_data, 0); 8760 ASSERT3P(hdr, !=, MAP_FAILED); 8761 8762 size = shared_data_size(hdr); 8763 8764 (void) munmap((caddr_t)hdr, P2ROUNDUP(sizeof (*hdr), getpagesize())); 8765 hdr = ztest_shared_hdr = (void *)mmap(0, P2ROUNDUP(size, getpagesize()), 8766 PROT_READ | PROT_WRITE, MAP_SHARED, ztest_fd_data, 0); 8767 ASSERT3P(hdr, !=, MAP_FAILED); 8768 buf = (uint8_t *)hdr; 8769 8770 offset = hdr->zh_hdr_size; 8771 ztest_shared_opts = (void *)&buf[offset]; 8772 offset += hdr->zh_opts_size; 8773 ztest_shared = (void *)&buf[offset]; 8774 offset += hdr->zh_size; 8775 ztest_shared_callstate = (void *)&buf[offset]; 8776 offset += hdr->zh_stats_size * hdr->zh_stats_count; 8777 ztest_shared_ds = (void *)&buf[offset]; 8778 offset += 
hdr->zh_ds_size * hdr->zh_ds_count; 8779 ztest_scratch_state = (void *)&buf[offset]; 8780 } 8781 8782 static boolean_t 8783 exec_child(char *cmd, char *libpath, boolean_t ignorekill, int *statusp) 8784 { 8785 pid_t pid; 8786 int status; 8787 char *cmdbuf = NULL; 8788 8789 pid = fork(); 8790 8791 if (cmd == NULL) { 8792 cmdbuf = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); 8793 (void) strlcpy(cmdbuf, getexecname(), MAXPATHLEN); 8794 cmd = cmdbuf; 8795 } 8796 8797 if (pid == -1) 8798 fatal(B_TRUE, "fork failed"); 8799 8800 if (pid == 0) { /* child */ 8801 char fd_data_str[12]; 8802 8803 VERIFY3S(11, >=, 8804 snprintf(fd_data_str, 12, "%d", ztest_fd_data)); 8805 VERIFY0(setenv("ZTEST_FD_DATA", fd_data_str, 1)); 8806 8807 if (libpath != NULL) { 8808 const char *curlp = getenv("LD_LIBRARY_PATH"); 8809 if (curlp == NULL) 8810 VERIFY0(setenv("LD_LIBRARY_PATH", libpath, 1)); 8811 else { 8812 char *newlp = NULL; 8813 VERIFY3S(-1, !=, 8814 asprintf(&newlp, "%s:%s", libpath, curlp)); 8815 VERIFY0(setenv("LD_LIBRARY_PATH", newlp, 1)); 8816 free(newlp); 8817 } 8818 } 8819 (void) execl(cmd, cmd, (char *)NULL); 8820 ztest_dump_core = B_FALSE; 8821 fatal(B_TRUE, "exec failed: %s", cmd); 8822 } 8823 8824 if (cmdbuf != NULL) { 8825 umem_free(cmdbuf, MAXPATHLEN); 8826 cmd = NULL; 8827 } 8828 8829 while (waitpid(pid, &status, 0) != pid) 8830 continue; 8831 if (statusp != NULL) 8832 *statusp = status; 8833 8834 if (WIFEXITED(status)) { 8835 if (WEXITSTATUS(status) != 0) { 8836 (void) fprintf(stderr, "child exited with code %d\n", 8837 WEXITSTATUS(status)); 8838 exit(2); 8839 } 8840 return (B_FALSE); 8841 } else if (WIFSIGNALED(status)) { 8842 if (!ignorekill || WTERMSIG(status) != SIGKILL) { 8843 (void) fprintf(stderr, "child died with signal %d\n", 8844 WTERMSIG(status)); 8845 exit(3); 8846 } 8847 return (B_TRUE); 8848 } else { 8849 (void) fprintf(stderr, "something strange happened to child\n"); 8850 exit(4); 8851 } 8852 } 8853 8854 static void 8855 ztest_run_init(void) 8856 { 8857 int i; 8858 8859 ztest_shared_t *zs = ztest_shared; 8860 8861 /* 8862 * Blow away any existing copy of zpool.cache 8863 */ 8864 (void) remove(spa_config_path); 8865 8866 if (ztest_opts.zo_init == 0) { 8867 if (ztest_opts.zo_verbose >= 1) 8868 (void) printf("Importing pool %s\n", 8869 ztest_opts.zo_pool); 8870 ztest_import(zs); 8871 return; 8872 } 8873 8874 /* 8875 * Create and initialize our storage pool. 8876 */ 8877 for (i = 1; i <= ztest_opts.zo_init; i++) { 8878 memset(zs, 0, sizeof (*zs)); 8879 if (ztest_opts.zo_verbose >= 3 && 8880 ztest_opts.zo_init != 1) { 8881 (void) printf("ztest_init(), pass %d\n", i); 8882 } 8883 ztest_init(zs); 8884 } 8885 } 8886 8887 int 8888 main(int argc, char **argv) 8889 { 8890 int kills = 0; 8891 int iters = 0; 8892 int older = 0; 8893 int newer = 0; 8894 ztest_shared_t *zs; 8895 ztest_info_t *zi; 8896 ztest_shared_callstate_t *zc; 8897 char timebuf[100]; 8898 char numbuf[NN_NUMBUF_SZ]; 8899 char *cmd; 8900 boolean_t hasalt; 8901 int f, err; 8902 char *fd_data_str = getenv("ZTEST_FD_DATA"); 8903 struct sigaction action; 8904 8905 (void) setvbuf(stdout, NULL, _IOLBF, 0); 8906 8907 dprintf_setup(&argc, argv); 8908 zfs_deadman_synctime_ms = 300000; 8909 zfs_deadman_checktime_ms = 30000; 8910 /* 8911 * As two-word space map entries may not come up often (especially 8912 * if pool and vdev sizes are small) we want to force at least some 8913 * of them so the feature get tested. 
8914 */ 8915 zfs_force_some_double_word_sm_entries = B_TRUE; 8916 8917 /* 8918 * Verify that even extensively damaged split blocks with many 8919 * segments can be reconstructed in a reasonable amount of time 8920 * when reconstruction is known to be possible. 8921 * 8922 * Note: the lower this value is, the more damage we inflict, and 8923 * the more time ztest spends in recovering that damage. We chose 8924 * to induce damage 1/100th of the time so recovery is tested but 8925 * not so frequently that ztest doesn't get to test other code paths. 8926 */ 8927 zfs_reconstruct_indirect_damage_fraction = 100; 8928 8929 action.sa_handler = sig_handler; 8930 sigemptyset(&action.sa_mask); 8931 action.sa_flags = 0; 8932 8933 if (sigaction(SIGSEGV, &action, NULL) < 0) { 8934 (void) fprintf(stderr, "ztest: cannot catch SIGSEGV: %s.\n", 8935 strerror(errno)); 8936 exit(EXIT_FAILURE); 8937 } 8938 8939 if (sigaction(SIGABRT, &action, NULL) < 0) { 8940 (void) fprintf(stderr, "ztest: cannot catch SIGABRT: %s.\n", 8941 strerror(errno)); 8942 exit(EXIT_FAILURE); 8943 } 8944 8945 /* 8946 * Force random_get_bytes() to use /dev/urandom in order to prevent 8947 * ztest from needlessly depleting the system entropy pool. 8948 */ 8949 random_path = "/dev/urandom"; 8950 ztest_fd_rand = open(random_path, O_RDONLY | O_CLOEXEC); 8951 ASSERT3S(ztest_fd_rand, >=, 0); 8952 8953 if (!fd_data_str) { 8954 process_options(argc, argv); 8955 8956 setup_data_fd(); 8957 setup_hdr(); 8958 setup_data(); 8959 memcpy(ztest_shared_opts, &ztest_opts, 8960 sizeof (*ztest_shared_opts)); 8961 } else { 8962 ztest_fd_data = atoi(fd_data_str); 8963 setup_data(); 8964 memcpy(&ztest_opts, ztest_shared_opts, sizeof (ztest_opts)); 8965 } 8966 ASSERT3U(ztest_opts.zo_datasets, ==, ztest_shared_hdr->zh_ds_count); 8967 8968 err = ztest_set_global_vars(); 8969 if (err != 0 && !fd_data_str) { 8970 /* error message done by ztest_set_global_vars */ 8971 exit(EXIT_FAILURE); 8972 } else { 8973 /* children should not be spawned if setting gvars fails */ 8974 VERIFY3S(err, ==, 0); 8975 } 8976 8977 /* Override location of zpool.cache */ 8978 VERIFY3S(asprintf((char **)&spa_config_path, "%s/zpool.cache", 8979 ztest_opts.zo_dir), !=, -1); 8980 8981 ztest_ds = umem_alloc(ztest_opts.zo_datasets * sizeof (ztest_ds_t), 8982 UMEM_NOFAIL); 8983 zs = ztest_shared; 8984 8985 if (fd_data_str) { 8986 metaslab_force_ganging = ztest_opts.zo_metaslab_force_ganging; 8987 metaslab_df_alloc_threshold = 8988 zs->zs_metaslab_df_alloc_threshold; 8989 8990 if (zs->zs_do_init) 8991 ztest_run_init(); 8992 else 8993 ztest_run(zs); 8994 exit(0); 8995 } 8996 8997 hasalt = (strlen(ztest_opts.zo_alt_ztest) != 0); 8998 8999 if (ztest_opts.zo_verbose >= 1) { 9000 (void) printf("%"PRIu64" vdevs, %d datasets, %d threads, " 9001 "%d %s disks, parity %d, %"PRIu64" seconds...\n\n", 9002 ztest_opts.zo_vdevs, 9003 ztest_opts.zo_datasets, 9004 ztest_opts.zo_threads, 9005 ztest_opts.zo_raid_children, 9006 ztest_opts.zo_raid_type, 9007 ztest_opts.zo_raid_parity, 9008 ztest_opts.zo_time); 9009 } 9010 9011 cmd = umem_alloc(MAXNAMELEN, UMEM_NOFAIL); 9012 (void) strlcpy(cmd, getexecname(), MAXNAMELEN); 9013 9014 zs->zs_do_init = B_TRUE; 9015 if (strlen(ztest_opts.zo_alt_ztest) != 0) { 9016 if (ztest_opts.zo_verbose >= 1) { 9017 (void) printf("Executing older ztest for " 9018 "initialization: %s\n", ztest_opts.zo_alt_ztest); 9019 } 9020 VERIFY(!exec_child(ztest_opts.zo_alt_ztest, 9021 ztest_opts.zo_alt_libpath, B_FALSE, NULL)); 9022 } else { 9023 VERIFY(!exec_child(NULL, NULL, B_FALSE, NULL)); 9024 } 
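	/*
	 * The child that just ran had zs_do_init set, so it took the
	 * ztest_run_init() path and created (or imported) the pool.
	 * Clear the flag so that the children forked in the loop below
	 * run ztest_run() against that pool instead.
	 */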
9025 zs->zs_do_init = B_FALSE; 9026 9027 zs->zs_proc_start = gethrtime(); 9028 zs->zs_proc_stop = zs->zs_proc_start + ztest_opts.zo_time * NANOSEC; 9029 9030 for (f = 0; f < ZTEST_FUNCS; f++) { 9031 zi = &ztest_info[f]; 9032 zc = ZTEST_GET_SHARED_CALLSTATE(f); 9033 if (zs->zs_proc_start + zi->zi_interval[0] > zs->zs_proc_stop) 9034 zc->zc_next = UINT64_MAX; 9035 else 9036 zc->zc_next = zs->zs_proc_start + 9037 ztest_random(2 * zi->zi_interval[0] + 1); 9038 } 9039 9040 /* 9041 * Run the tests in a loop. These tests include fault injection 9042 * to verify that self-healing data works, and forced crashes 9043 * to verify that we never lose on-disk consistency. 9044 */ 9045 while (gethrtime() < zs->zs_proc_stop) { 9046 int status; 9047 boolean_t killed; 9048 9049 /* 9050 * Initialize the workload counters for each function. 9051 */ 9052 for (f = 0; f < ZTEST_FUNCS; f++) { 9053 zc = ZTEST_GET_SHARED_CALLSTATE(f); 9054 zc->zc_count = 0; 9055 zc->zc_time = 0; 9056 } 9057 9058 /* Set the allocation switch size */ 9059 zs->zs_metaslab_df_alloc_threshold = 9060 ztest_random(zs->zs_metaslab_sz / 4) + 1; 9061 9062 if (!hasalt || ztest_random(2) == 0) { 9063 if (hasalt && ztest_opts.zo_verbose >= 1) { 9064 (void) printf("Executing newer ztest: %s\n", 9065 cmd); 9066 } 9067 newer++; 9068 killed = exec_child(cmd, NULL, B_TRUE, &status); 9069 } else { 9070 if (hasalt && ztest_opts.zo_verbose >= 1) { 9071 (void) printf("Executing older ztest: %s\n", 9072 ztest_opts.zo_alt_ztest); 9073 } 9074 older++; 9075 killed = exec_child(ztest_opts.zo_alt_ztest, 9076 ztest_opts.zo_alt_libpath, B_TRUE, &status); 9077 } 9078 9079 if (killed) 9080 kills++; 9081 iters++; 9082 9083 if (ztest_opts.zo_verbose >= 1) { 9084 hrtime_t now = gethrtime(); 9085 9086 now = MIN(now, zs->zs_proc_stop); 9087 print_time(zs->zs_proc_stop - now, timebuf); 9088 nicenum(zs->zs_space, numbuf, sizeof (numbuf)); 9089 9090 (void) printf("Pass %3d, %8s, %3"PRIu64" ENOSPC, " 9091 "%4.1f%% of %5s used, %3.0f%% done, %8s to go\n", 9092 iters, 9093 WIFEXITED(status) ? "Complete" : "SIGKILL", 9094 zs->zs_enospc_count, 9095 100.0 * zs->zs_alloc / zs->zs_space, 9096 numbuf, 9097 100.0 * (now - zs->zs_proc_start) / 9098 (ztest_opts.zo_time * NANOSEC), timebuf); 9099 } 9100 9101 if (ztest_opts.zo_verbose >= 2) { 9102 (void) printf("\nWorkload summary:\n\n"); 9103 (void) printf("%7s %9s %s\n", 9104 "Calls", "Time", "Function"); 9105 (void) printf("%7s %9s %s\n", 9106 "-----", "----", "--------"); 9107 for (f = 0; f < ZTEST_FUNCS; f++) { 9108 zi = &ztest_info[f]; 9109 zc = ZTEST_GET_SHARED_CALLSTATE(f); 9110 print_time(zc->zc_time, timebuf); 9111 (void) printf("%7"PRIu64" %9s %s\n", 9112 zc->zc_count, timebuf, 9113 zi->zi_funcname); 9114 } 9115 (void) printf("\n"); 9116 } 9117 9118 if (!ztest_opts.zo_mmp_test) 9119 ztest_run_zdb(zs->zs_guid); 9120 if (ztest_shared_opts->zo_raidz_expand_test == 9121 RAIDZ_EXPAND_CHECKED) 9122 break; /* raidz expand test complete */ 9123 } 9124 9125 if (ztest_opts.zo_verbose >= 1) { 9126 if (hasalt) { 9127 (void) printf("%d runs of older ztest: %s\n", older, 9128 ztest_opts.zo_alt_ztest); 9129 (void) printf("%d runs of newer ztest: %s\n", newer, 9130 cmd); 9131 } 9132 (void) printf("%d killed, %d completed, %.0f%% kill rate\n", 9133 kills, iters - kills, (100.0 * kills) / MAX(1, iters)); 9134 } 9135 9136 umem_free(cmd, MAXNAMELEN); 9137 9138 return (0); 9139 } 9140