// SPDX-License-Identifier: CDDL-1.0
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2011, 2024 by Delphix. All rights reserved.
 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
 * Copyright (c) 2013 Steven Hartland. All rights reserved.
 * Copyright (c) 2014 Integros [integros.com]
 * Copyright 2017 Joyent, Inc.
 * Copyright (c) 2017, Intel Corporation.
 * Copyright (c) 2023, Klara, Inc.
 */

/*
 * The objective of this program is to provide a DMU/ZAP/SPA stress test
 * that runs entirely in userland, is easy to use, and easy to extend.
 *
 * The overall design of the ztest program is as follows:
 *
 * (1) For each major functional area (e.g. adding vdevs to a pool,
 *     creating and destroying datasets, reading and writing objects, etc)
 *     we have a simple routine to test that functionality.  These
 *     individual routines do not have to do anything "stressful".
 *
 * (2) We turn these simple functionality tests into a stress test by
 *     running them all in parallel, with as many threads as desired,
 *     and spread across as many datasets, objects, and vdevs as desired.
 *
 * (3) While all this is happening, we inject faults into the pool to
 *     verify that self-healing data really works.
 *
 * (4) Every time we open a dataset, we change its checksum and compression
 *     functions.  Thus even individual objects vary from block to block
 *     in which checksum they use and whether they're compressed.
 *
 * (5) To verify that we never lose on-disk consistency after a crash,
 *     we run the entire test in a child of the main process.
 *     At random times, the child self-immolates with a SIGKILL.
 *     This is the software equivalent of pulling the power cord.
 *     The parent then runs the test again, using the existing
 *     storage pool, as many times as desired. If backwards compatibility
 *     testing is enabled ztest will sometimes run the "older" version
 *     of ztest after a SIGKILL.
 *
 * (6) To verify that we don't have future leaks or temporal incursions,
 *     many of the functional tests record the transaction group number
 *     as part of their data.  When reading old data, they verify that
 *     the transaction group number is less than the current, open txg.
 *     If you add a new test, please do this if applicable.
 *
 * (7) Threads are created with a reduced stack size, for sanity checking.
 *     Therefore, it's important not to allocate huge buffers on the stack.
 *
 * When run with no arguments, ztest runs for about five minutes and
 * produces no output if successful.
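 *
 * For example, running "ztest" with no options at all exercises the
 * compile-time defaults listed further below: five 256M file vdevs under
 * /tmp, seven datasets, and 23 worker threads.
 *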
 * To get a little bit of information, specify -V.  To get more information,
 * specify -VV, and so on.
 *
 * To turn this into an overnight stress test, use -T to specify run time.
 *
 * You can ask more vdevs [-v], datasets [-d], or threads [-t]
 * to increase the pool capacity, fanout, and overall stress level.
 *
 * Use the -k option to set the desired frequency of kills.
 *
 * When ztest invokes itself it passes all relevant information through a
 * temporary file which is mmap-ed in the child process. This allows shared
 * memory to survive the exec syscall. The ztest_shared_hdr_t struct is always
 * stored at offset 0 of this file and contains information on the size and
 * number of shared structures in the file. The information stored in this file
 * must remain backwards compatible with older versions of ztest so that
 * ztest can invoke them during backwards compatibility testing (-B).
 */

#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/dmu.h>
#include <sys/txg.h>
#include <sys/dbuf.h>
#include <sys/zap.h>
#include <sys/dmu_objset.h>
#include <sys/poll.h>
#include <sys/stat.h>
#include <sys/time.h>
#include <sys/wait.h>
#include <sys/mman.h>
#include <sys/resource.h>
#include <sys/zio.h>
#include <sys/zil.h>
#include <sys/zil_impl.h>
#include <sys/vdev_draid.h>
#include <sys/vdev_impl.h>
#include <sys/vdev_file.h>
#include <sys/vdev_initialize.h>
#include <sys/vdev_raidz.h>
#include <sys/vdev_trim.h>
#include <sys/spa_impl.h>
#include <sys/metaslab_impl.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_destroy.h>
#include <sys/dsl_scan.h>
#include <sys/zio_checksum.h>
#include <sys/zfs_refcount.h>
#include <sys/zfeature.h>
#include <sys/dsl_userhold.h>
#include <sys/abd.h>
#include <sys/blake3.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <getopt.h>
#include <signal.h>
#include <umem.h>
#include <ctype.h>
#include <math.h>
#include <sys/fs/zfs.h>
#include <zfs_fletcher.h>
#include <libnvpair.h>
#include <libzutil.h>
#include <sys/crypto/icp.h>
#include <sys/zfs_impl.h>
#include <sys/backtrace.h>

static int ztest_fd_data = -1;
static int ztest_fd_rand = -1;

typedef struct ztest_shared_hdr {
	uint64_t	zh_hdr_size;
	uint64_t	zh_opts_size;
	uint64_t	zh_size;
	uint64_t	zh_stats_size;
	uint64_t	zh_stats_count;
	uint64_t	zh_ds_size;
	uint64_t	zh_ds_count;
	uint64_t	zh_scratch_state_size;
} ztest_shared_hdr_t;

static ztest_shared_hdr_t *ztest_shared_hdr;

enum ztest_class_state {
	ZTEST_VDEV_CLASS_OFF,
	ZTEST_VDEV_CLASS_ON,
	ZTEST_VDEV_CLASS_RND
};

/* Dedicated RAIDZ Expansion test states */
typedef enum {
	RAIDZ_EXPAND_NONE,		/* Default is none, must opt-in */
	RAIDZ_EXPAND_REQUESTED,		/* The '-X' option was used */
	RAIDZ_EXPAND_STARTED,		/* Testing has commenced */
	RAIDZ_EXPAND_KILLED,		/* Reached the process kill */
	RAIDZ_EXPAND_CHECKED,		/* Pool scrub verification done */
} raidz_expand_test_state_t;

#define	ZO_GVARS_MAX_ARGLEN	((size_t)64)
#define	ZO_GVARS_MAX_COUNT	((size_t)10)

typedef struct ztest_shared_opts {
	char zo_pool[ZFS_MAX_DATASET_NAME_LEN];
	char zo_dir[ZFS_MAX_DATASET_NAME_LEN];
	char zo_alt_ztest[MAXNAMELEN];
	char zo_alt_libpath[MAXNAMELEN];
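	/*
	 * zo_vdevs is the number of top-level vdevs to create; zo_vdevtime
	 * is derived from it and the total run time during option processing
	 * and sets how often the vdev add/remove style tests run.
	 */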
183 uint64_t zo_vdevs; 184 uint64_t zo_vdevtime; 185 size_t zo_vdev_size; 186 int zo_ashift; 187 int zo_mirrors; 188 int zo_raid_do_expand; 189 int zo_raid_children; 190 int zo_raid_parity; 191 char zo_raid_type[8]; 192 int zo_draid_data; 193 int zo_draid_spares; 194 int zo_datasets; 195 int zo_threads; 196 uint64_t zo_passtime; 197 uint64_t zo_killrate; 198 int zo_verbose; 199 int zo_init; 200 uint64_t zo_time; 201 uint64_t zo_maxloops; 202 uint64_t zo_metaslab_force_ganging; 203 raidz_expand_test_state_t zo_raidz_expand_test; 204 int zo_mmp_test; 205 int zo_special_vdevs; 206 int zo_dump_dbgmsg; 207 int zo_gvars_count; 208 char zo_gvars[ZO_GVARS_MAX_COUNT][ZO_GVARS_MAX_ARGLEN]; 209 } ztest_shared_opts_t; 210 211 /* Default values for command line options. */ 212 #define DEFAULT_POOL "ztest" 213 #define DEFAULT_VDEV_DIR "/tmp" 214 #define DEFAULT_VDEV_COUNT 5 215 #define DEFAULT_VDEV_SIZE (SPA_MINDEVSIZE * 4) /* 256m default size */ 216 #define DEFAULT_VDEV_SIZE_STR "256M" 217 #define DEFAULT_ASHIFT SPA_MINBLOCKSHIFT 218 #define DEFAULT_MIRRORS 2 219 #define DEFAULT_RAID_CHILDREN 4 220 #define DEFAULT_RAID_PARITY 1 221 #define DEFAULT_DRAID_DATA 4 222 #define DEFAULT_DRAID_SPARES 1 223 #define DEFAULT_DATASETS_COUNT 7 224 #define DEFAULT_THREADS 23 225 #define DEFAULT_RUN_TIME 300 /* 300 seconds */ 226 #define DEFAULT_RUN_TIME_STR "300 sec" 227 #define DEFAULT_PASS_TIME 60 /* 60 seconds */ 228 #define DEFAULT_PASS_TIME_STR "60 sec" 229 #define DEFAULT_KILL_RATE 70 /* 70% kill rate */ 230 #define DEFAULT_KILLRATE_STR "70%" 231 #define DEFAULT_INITS 1 232 #define DEFAULT_MAX_LOOPS 50 /* 5 minutes */ 233 #define DEFAULT_FORCE_GANGING (64 << 10) 234 #define DEFAULT_FORCE_GANGING_STR "64K" 235 236 /* Simplifying assumption: -1 is not a valid default. */ 237 #define NO_DEFAULT -1 238 239 static const ztest_shared_opts_t ztest_opts_defaults = { 240 .zo_pool = DEFAULT_POOL, 241 .zo_dir = DEFAULT_VDEV_DIR, 242 .zo_alt_ztest = { '\0' }, 243 .zo_alt_libpath = { '\0' }, 244 .zo_vdevs = DEFAULT_VDEV_COUNT, 245 .zo_ashift = DEFAULT_ASHIFT, 246 .zo_mirrors = DEFAULT_MIRRORS, 247 .zo_raid_children = DEFAULT_RAID_CHILDREN, 248 .zo_raid_parity = DEFAULT_RAID_PARITY, 249 .zo_raid_type = VDEV_TYPE_RAIDZ, 250 .zo_vdev_size = DEFAULT_VDEV_SIZE, 251 .zo_draid_data = DEFAULT_DRAID_DATA, /* data drives */ 252 .zo_draid_spares = DEFAULT_DRAID_SPARES, /* distributed spares */ 253 .zo_datasets = DEFAULT_DATASETS_COUNT, 254 .zo_threads = DEFAULT_THREADS, 255 .zo_passtime = DEFAULT_PASS_TIME, 256 .zo_killrate = DEFAULT_KILL_RATE, 257 .zo_verbose = 0, 258 .zo_mmp_test = 0, 259 .zo_init = DEFAULT_INITS, 260 .zo_time = DEFAULT_RUN_TIME, 261 .zo_maxloops = DEFAULT_MAX_LOOPS, /* max loops during spa_freeze() */ 262 .zo_metaslab_force_ganging = DEFAULT_FORCE_GANGING, 263 .zo_special_vdevs = ZTEST_VDEV_CLASS_RND, 264 .zo_gvars_count = 0, 265 .zo_raidz_expand_test = RAIDZ_EXPAND_NONE, 266 }; 267 268 extern uint64_t metaslab_force_ganging; 269 extern uint64_t metaslab_df_alloc_threshold; 270 extern uint64_t zfs_deadman_synctime_ms; 271 extern uint_t metaslab_preload_limit; 272 extern int zfs_compressed_arc_enabled; 273 extern int zfs_abd_scatter_enabled; 274 extern uint_t dmu_object_alloc_chunk_shift; 275 extern boolean_t zfs_force_some_double_word_sm_entries; 276 extern unsigned long zio_decompress_fail_fraction; 277 extern unsigned long zfs_reconstruct_indirect_damage_fraction; 278 extern uint64_t raidz_expand_max_reflow_bytes; 279 extern uint_t raidz_expand_pause_point; 280 extern boolean_t ddt_prune_artificial_age; 281 
extern boolean_t ddt_dump_prune_histogram; 282 283 284 static ztest_shared_opts_t *ztest_shared_opts; 285 static ztest_shared_opts_t ztest_opts; 286 static const char *const ztest_wkeydata = "abcdefghijklmnopqrstuvwxyz012345"; 287 288 typedef struct ztest_shared_ds { 289 uint64_t zd_seq; 290 } ztest_shared_ds_t; 291 292 static ztest_shared_ds_t *ztest_shared_ds; 293 #define ZTEST_GET_SHARED_DS(d) (&ztest_shared_ds[d]) 294 295 typedef struct ztest_scratch_state { 296 uint64_t zs_raidz_scratch_verify_pause; 297 } ztest_shared_scratch_state_t; 298 299 static ztest_shared_scratch_state_t *ztest_scratch_state; 300 301 #define BT_MAGIC 0x123456789abcdefULL 302 #define MAXFAULTS(zs) \ 303 (MAX((zs)->zs_mirrors, 1) * (ztest_opts.zo_raid_parity + 1) - 1) 304 305 enum ztest_io_type { 306 ZTEST_IO_WRITE_TAG, 307 ZTEST_IO_WRITE_PATTERN, 308 ZTEST_IO_WRITE_ZEROES, 309 ZTEST_IO_TRUNCATE, 310 ZTEST_IO_SETATTR, 311 ZTEST_IO_REWRITE, 312 ZTEST_IO_TYPES 313 }; 314 315 typedef struct ztest_block_tag { 316 uint64_t bt_magic; 317 uint64_t bt_objset; 318 uint64_t bt_object; 319 uint64_t bt_dnodesize; 320 uint64_t bt_offset; 321 uint64_t bt_gen; 322 uint64_t bt_txg; 323 uint64_t bt_crtxg; 324 } ztest_block_tag_t; 325 326 typedef struct bufwad { 327 uint64_t bw_index; 328 uint64_t bw_txg; 329 uint64_t bw_data; 330 } bufwad_t; 331 332 /* 333 * It would be better to use a rangelock_t per object. Unfortunately 334 * the rangelock_t is not a drop-in replacement for rl_t, because we 335 * still need to map from object ID to rangelock_t. 336 */ 337 typedef enum { 338 ZTRL_READER, 339 ZTRL_WRITER, 340 ZTRL_APPEND 341 } rl_type_t; 342 343 typedef struct rll { 344 void *rll_writer; 345 int rll_readers; 346 kmutex_t rll_lock; 347 kcondvar_t rll_cv; 348 } rll_t; 349 350 typedef struct rl { 351 uint64_t rl_object; 352 uint64_t rl_offset; 353 uint64_t rl_size; 354 rll_t *rl_lock; 355 } rl_t; 356 357 #define ZTEST_RANGE_LOCKS 64 358 #define ZTEST_OBJECT_LOCKS 64 359 360 /* 361 * Object descriptor. Used as a template for object lookup/create/remove. 362 */ 363 typedef struct ztest_od { 364 uint64_t od_dir; 365 uint64_t od_object; 366 dmu_object_type_t od_type; 367 dmu_object_type_t od_crtype; 368 uint64_t od_blocksize; 369 uint64_t od_crblocksize; 370 uint64_t od_crdnodesize; 371 uint64_t od_gen; 372 uint64_t od_crgen; 373 char od_name[ZFS_MAX_DATASET_NAME_LEN]; 374 } ztest_od_t; 375 376 /* 377 * Per-dataset state. 378 */ 379 typedef struct ztest_ds { 380 ztest_shared_ds_t *zd_shared; 381 objset_t *zd_os; 382 pthread_rwlock_t zd_zilog_lock; 383 zilog_t *zd_zilog; 384 ztest_od_t *zd_od; /* debugging aid */ 385 char zd_name[ZFS_MAX_DATASET_NAME_LEN]; 386 kmutex_t zd_dirobj_lock; 387 rll_t zd_object_lock[ZTEST_OBJECT_LOCKS]; 388 rll_t zd_range_lock[ZTEST_RANGE_LOCKS]; 389 } ztest_ds_t; 390 391 /* 392 * Per-iteration state. 
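 * Each test is a ztest_func_t: it receives the per-dataset state and a
 * caller-supplied 64-bit id.  (ID_PARALLEL, defined further below, is the
 * id passed by tests that deliberately share objects across threads.)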
393 */ 394 typedef void ztest_func_t(ztest_ds_t *zd, uint64_t id); 395 396 typedef struct ztest_info { 397 ztest_func_t *zi_func; /* test function */ 398 uint64_t zi_iters; /* iterations per execution */ 399 uint64_t *zi_interval; /* execute every <interval> seconds */ 400 const char *zi_funcname; /* name of test function */ 401 } ztest_info_t; 402 403 typedef struct ztest_shared_callstate { 404 uint64_t zc_count; /* per-pass count */ 405 uint64_t zc_time; /* per-pass time */ 406 uint64_t zc_next; /* next time to call this function */ 407 } ztest_shared_callstate_t; 408 409 static ztest_shared_callstate_t *ztest_shared_callstate; 410 #define ZTEST_GET_SHARED_CALLSTATE(c) (&ztest_shared_callstate[c]) 411 412 ztest_func_t ztest_dmu_read_write; 413 ztest_func_t ztest_dmu_write_parallel; 414 ztest_func_t ztest_dmu_object_alloc_free; 415 ztest_func_t ztest_dmu_object_next_chunk; 416 ztest_func_t ztest_dmu_commit_callbacks; 417 ztest_func_t ztest_zap; 418 ztest_func_t ztest_zap_parallel; 419 ztest_func_t ztest_zil_commit; 420 ztest_func_t ztest_zil_remount; 421 ztest_func_t ztest_dmu_read_write_zcopy; 422 ztest_func_t ztest_dmu_objset_create_destroy; 423 ztest_func_t ztest_dmu_prealloc; 424 ztest_func_t ztest_fzap; 425 ztest_func_t ztest_dmu_snapshot_create_destroy; 426 ztest_func_t ztest_dsl_prop_get_set; 427 ztest_func_t ztest_spa_prop_get_set; 428 ztest_func_t ztest_spa_create_destroy; 429 ztest_func_t ztest_fault_inject; 430 ztest_func_t ztest_dmu_snapshot_hold; 431 ztest_func_t ztest_mmp_enable_disable; 432 ztest_func_t ztest_scrub; 433 ztest_func_t ztest_dsl_dataset_promote_busy; 434 ztest_func_t ztest_vdev_attach_detach; 435 ztest_func_t ztest_vdev_raidz_attach; 436 ztest_func_t ztest_vdev_LUN_growth; 437 ztest_func_t ztest_vdev_add_remove; 438 ztest_func_t ztest_vdev_class_add; 439 ztest_func_t ztest_vdev_aux_add_remove; 440 ztest_func_t ztest_split_pool; 441 ztest_func_t ztest_reguid; 442 ztest_func_t ztest_spa_upgrade; 443 ztest_func_t ztest_device_removal; 444 ztest_func_t ztest_spa_checkpoint_create_discard; 445 ztest_func_t ztest_initialize; 446 ztest_func_t ztest_trim; 447 ztest_func_t ztest_blake3; 448 ztest_func_t ztest_fletcher; 449 ztest_func_t ztest_fletcher_incr; 450 ztest_func_t ztest_verify_dnode_bt; 451 ztest_func_t ztest_pool_prefetch_ddt; 452 ztest_func_t ztest_ddt_prune; 453 454 static uint64_t zopt_always = 0ULL * NANOSEC; /* all the time */ 455 static uint64_t zopt_incessant = 1ULL * NANOSEC / 10; /* every 1/10 second */ 456 static uint64_t zopt_often = 1ULL * NANOSEC; /* every second */ 457 static uint64_t zopt_sometimes = 10ULL * NANOSEC; /* every 10 seconds */ 458 static uint64_t zopt_rarely = 60ULL * NANOSEC; /* every 60 seconds */ 459 460 #define ZTI_INIT(func, iters, interval) \ 461 { .zi_func = (func), \ 462 .zi_iters = (iters), \ 463 .zi_interval = (interval), \ 464 .zi_funcname = # func } 465 466 static ztest_info_t ztest_info[] = { 467 ZTI_INIT(ztest_dmu_read_write, 1, &zopt_always), 468 ZTI_INIT(ztest_dmu_write_parallel, 10, &zopt_always), 469 ZTI_INIT(ztest_dmu_object_alloc_free, 1, &zopt_always), 470 ZTI_INIT(ztest_dmu_object_next_chunk, 1, &zopt_sometimes), 471 ZTI_INIT(ztest_dmu_commit_callbacks, 1, &zopt_always), 472 ZTI_INIT(ztest_zap, 30, &zopt_always), 473 ZTI_INIT(ztest_zap_parallel, 100, &zopt_always), 474 ZTI_INIT(ztest_split_pool, 1, &zopt_sometimes), 475 ZTI_INIT(ztest_zil_commit, 1, &zopt_incessant), 476 ZTI_INIT(ztest_zil_remount, 1, &zopt_sometimes), 477 ZTI_INIT(ztest_dmu_read_write_zcopy, 1, &zopt_often), 478 
ZTI_INIT(ztest_dmu_objset_create_destroy, 1, &zopt_often), 479 ZTI_INIT(ztest_dsl_prop_get_set, 1, &zopt_often), 480 ZTI_INIT(ztest_spa_prop_get_set, 1, &zopt_sometimes), 481 #if 0 482 ZTI_INIT(ztest_dmu_prealloc, 1, &zopt_sometimes), 483 #endif 484 ZTI_INIT(ztest_fzap, 1, &zopt_sometimes), 485 ZTI_INIT(ztest_dmu_snapshot_create_destroy, 1, &zopt_sometimes), 486 ZTI_INIT(ztest_spa_create_destroy, 1, &zopt_sometimes), 487 ZTI_INIT(ztest_fault_inject, 1, &zopt_sometimes), 488 ZTI_INIT(ztest_dmu_snapshot_hold, 1, &zopt_sometimes), 489 ZTI_INIT(ztest_mmp_enable_disable, 1, &zopt_sometimes), 490 ZTI_INIT(ztest_reguid, 1, &zopt_rarely), 491 ZTI_INIT(ztest_scrub, 1, &zopt_rarely), 492 ZTI_INIT(ztest_spa_upgrade, 1, &zopt_rarely), 493 ZTI_INIT(ztest_dsl_dataset_promote_busy, 1, &zopt_rarely), 494 ZTI_INIT(ztest_vdev_attach_detach, 1, &zopt_sometimes), 495 ZTI_INIT(ztest_vdev_raidz_attach, 1, &zopt_sometimes), 496 ZTI_INIT(ztest_vdev_LUN_growth, 1, &zopt_rarely), 497 ZTI_INIT(ztest_vdev_add_remove, 1, &ztest_opts.zo_vdevtime), 498 ZTI_INIT(ztest_vdev_class_add, 1, &ztest_opts.zo_vdevtime), 499 ZTI_INIT(ztest_vdev_aux_add_remove, 1, &ztest_opts.zo_vdevtime), 500 ZTI_INIT(ztest_device_removal, 1, &zopt_sometimes), 501 ZTI_INIT(ztest_spa_checkpoint_create_discard, 1, &zopt_rarely), 502 ZTI_INIT(ztest_initialize, 1, &zopt_sometimes), 503 ZTI_INIT(ztest_trim, 1, &zopt_sometimes), 504 ZTI_INIT(ztest_blake3, 1, &zopt_rarely), 505 ZTI_INIT(ztest_fletcher, 1, &zopt_rarely), 506 ZTI_INIT(ztest_fletcher_incr, 1, &zopt_rarely), 507 ZTI_INIT(ztest_verify_dnode_bt, 1, &zopt_sometimes), 508 ZTI_INIT(ztest_pool_prefetch_ddt, 1, &zopt_rarely), 509 ZTI_INIT(ztest_ddt_prune, 1, &zopt_rarely), 510 }; 511 512 #define ZTEST_FUNCS (sizeof (ztest_info) / sizeof (ztest_info_t)) 513 514 /* 515 * The following struct is used to hold a list of uncalled commit callbacks. 516 * The callbacks are ordered by txg number. 517 */ 518 typedef struct ztest_cb_list { 519 kmutex_t zcl_callbacks_lock; 520 list_t zcl_callbacks; 521 } ztest_cb_list_t; 522 523 /* 524 * Stuff we need to share writably between parent and child. 525 */ 526 typedef struct ztest_shared { 527 boolean_t zs_do_init; 528 hrtime_t zs_proc_start; 529 hrtime_t zs_proc_stop; 530 hrtime_t zs_thread_start; 531 hrtime_t zs_thread_stop; 532 hrtime_t zs_thread_kill; 533 uint64_t zs_enospc_count; 534 uint64_t zs_vdev_next_leaf; 535 uint64_t zs_vdev_aux; 536 uint64_t zs_alloc; 537 uint64_t zs_space; 538 uint64_t zs_splits; 539 uint64_t zs_mirrors; 540 uint64_t zs_metaslab_sz; 541 uint64_t zs_metaslab_df_alloc_threshold; 542 uint64_t zs_guid; 543 } ztest_shared_t; 544 545 #define ID_PARALLEL -1ULL 546 547 static char ztest_dev_template[] = "%s/%s.%llua"; 548 static char ztest_aux_template[] = "%s/%s.%s.%llu"; 549 static ztest_shared_t *ztest_shared; 550 551 static spa_t *ztest_spa = NULL; 552 static ztest_ds_t *ztest_ds; 553 554 static kmutex_t ztest_vdev_lock; 555 static boolean_t ztest_device_removal_active = B_FALSE; 556 static boolean_t ztest_pool_scrubbed = B_FALSE; 557 static kmutex_t ztest_checkpoint_lock; 558 559 /* 560 * The ztest_name_lock protects the pool and dataset namespace used by 561 * the individual tests. To modify the namespace, consumers must grab 562 * this lock as writer. Grabbing the lock as reader will ensure that the 563 * namespace does not change while the lock is held. 
564 */ 565 static pthread_rwlock_t ztest_name_lock; 566 567 static boolean_t ztest_dump_core = B_TRUE; 568 static boolean_t ztest_exiting; 569 570 /* Global commit callback list */ 571 static ztest_cb_list_t zcl; 572 /* Commit cb delay */ 573 static uint64_t zc_min_txg_delay = UINT64_MAX; 574 static int zc_cb_counter = 0; 575 576 /* 577 * Minimum number of commit callbacks that need to be registered for us to check 578 * whether the minimum txg delay is acceptable. 579 */ 580 #define ZTEST_COMMIT_CB_MIN_REG 100 581 582 /* 583 * If a number of txgs equal to this threshold have been created after a commit 584 * callback has been registered but not called, then we assume there is an 585 * implementation bug. 586 */ 587 #define ZTEST_COMMIT_CB_THRESH (TXG_CONCURRENT_STATES + 1000) 588 589 enum ztest_object { 590 ZTEST_META_DNODE = 0, 591 ZTEST_DIROBJ, 592 ZTEST_OBJECTS 593 }; 594 595 static __attribute__((noreturn)) void usage(boolean_t requested); 596 static int ztest_scrub_impl(spa_t *spa); 597 598 /* 599 * These libumem hooks provide a reasonable set of defaults for the allocator's 600 * debugging facilities. 601 */ 602 const char * 603 _umem_debug_init(void) 604 { 605 return ("default,verbose"); /* $UMEM_DEBUG setting */ 606 } 607 608 const char * 609 _umem_logging_init(void) 610 { 611 return ("fail,contents"); /* $UMEM_LOGGING setting */ 612 } 613 614 static void 615 dump_debug_buffer(void) 616 { 617 ssize_t ret __attribute__((unused)); 618 619 if (!ztest_opts.zo_dump_dbgmsg) 620 return; 621 622 /* 623 * We use write() instead of printf() so that this function 624 * is safe to call from a signal handler. 625 */ 626 ret = write(STDERR_FILENO, "\n", 1); 627 zfs_dbgmsg_print(STDERR_FILENO, "ztest"); 628 } 629 630 static void sig_handler(int signo) 631 { 632 struct sigaction action; 633 634 libspl_backtrace(STDERR_FILENO); 635 dump_debug_buffer(); 636 637 /* 638 * Restore default action and re-raise signal so SIGSEGV and 639 * SIGABRT can trigger a core dump. 640 */ 641 action.sa_handler = SIG_DFL; 642 sigemptyset(&action.sa_mask); 643 action.sa_flags = 0; 644 (void) sigaction(signo, &action, NULL); 645 raise(signo); 646 } 647 648 #define FATAL_MSG_SZ 1024 649 650 static const char *fatal_msg; 651 652 static __attribute__((format(printf, 2, 3))) __attribute__((noreturn)) void 653 fatal(int do_perror, const char *message, ...) 
654 { 655 va_list args; 656 int save_errno = errno; 657 char *buf; 658 659 (void) fflush(stdout); 660 buf = umem_alloc(FATAL_MSG_SZ, UMEM_NOFAIL); 661 if (buf == NULL) 662 goto out; 663 664 va_start(args, message); 665 (void) sprintf(buf, "ztest: "); 666 /* LINTED */ 667 (void) vsprintf(buf + strlen(buf), message, args); 668 va_end(args); 669 if (do_perror) { 670 (void) snprintf(buf + strlen(buf), FATAL_MSG_SZ - strlen(buf), 671 ": %s", strerror(save_errno)); 672 } 673 (void) fprintf(stderr, "%s\n", buf); 674 fatal_msg = buf; /* to ease debugging */ 675 676 out: 677 if (ztest_dump_core) 678 abort(); 679 else 680 dump_debug_buffer(); 681 682 exit(3); 683 } 684 685 static int 686 str2shift(const char *buf) 687 { 688 const char *ends = "BKMGTPEZ"; 689 int i, len; 690 691 if (buf[0] == '\0') 692 return (0); 693 694 len = strlen(ends); 695 for (i = 0; i < len; i++) { 696 if (toupper(buf[0]) == ends[i]) 697 break; 698 } 699 if (i == len) { 700 (void) fprintf(stderr, "ztest: invalid bytes suffix: %s\n", 701 buf); 702 usage(B_FALSE); 703 } 704 if (buf[1] == '\0' || (toupper(buf[1]) == 'B' && buf[2] == '\0')) { 705 return (10*i); 706 } 707 (void) fprintf(stderr, "ztest: invalid bytes suffix: %s\n", buf); 708 usage(B_FALSE); 709 } 710 711 static uint64_t 712 nicenumtoull(const char *buf) 713 { 714 char *end; 715 uint64_t val; 716 717 val = strtoull(buf, &end, 0); 718 if (end == buf) { 719 (void) fprintf(stderr, "ztest: bad numeric value: %s\n", buf); 720 usage(B_FALSE); 721 } else if (end[0] == '.') { 722 double fval = strtod(buf, &end); 723 fval *= pow(2, str2shift(end)); 724 /* 725 * UINT64_MAX is not exactly representable as a double. 726 * The closest representation is UINT64_MAX + 1, so we 727 * use a >= comparison instead of > for the bounds check. 728 */ 729 if (fval >= (double)UINT64_MAX) { 730 (void) fprintf(stderr, "ztest: value too large: %s\n", 731 buf); 732 usage(B_FALSE); 733 } 734 val = (uint64_t)fval; 735 } else { 736 int shift = str2shift(end); 737 if (shift >= 64 || (val << shift) >> shift != val) { 738 (void) fprintf(stderr, "ztest: value too large: %s\n", 739 buf); 740 usage(B_FALSE); 741 } 742 val <<= shift; 743 } 744 return (val); 745 } 746 747 typedef struct ztest_option { 748 const char short_opt; 749 const char *long_opt; 750 const char *long_opt_param; 751 const char *comment; 752 unsigned int default_int; 753 const char *default_str; 754 } ztest_option_t; 755 756 /* 757 * The following option_table is used for generating the usage info as well as 758 * the long and short option information for calling getopt_long(). 
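 *
 * Numeric arguments are parsed with nicenumtoull() above, so the usual
 * size suffixes are accepted; for example "-s 128K" yields 131072 bytes.
 * Options that take a parameter get a trailing ':' appended to their
 * short-option character when init_options() builds the getopt string.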
759 */ 760 static ztest_option_t option_table[] = { 761 { 'v', "vdevs", "INTEGER", "Number of vdevs", DEFAULT_VDEV_COUNT, 762 NULL}, 763 { 's', "vdev-size", "INTEGER", "Size of each vdev", 764 NO_DEFAULT, DEFAULT_VDEV_SIZE_STR}, 765 { 'a', "alignment-shift", "INTEGER", 766 "Alignment shift; use 0 for random", DEFAULT_ASHIFT, NULL}, 767 { 'm', "mirror-copies", "INTEGER", "Number of mirror copies", 768 DEFAULT_MIRRORS, NULL}, 769 { 'r', "raid-disks", "INTEGER", "Number of raidz/draid disks", 770 DEFAULT_RAID_CHILDREN, NULL}, 771 { 'R', "raid-parity", "INTEGER", "Raid parity", 772 DEFAULT_RAID_PARITY, NULL}, 773 { 'K', "raid-kind", "raidz|eraidz|draid|random", "Raid kind", 774 NO_DEFAULT, "random"}, 775 { 'D', "draid-data", "INTEGER", "Number of draid data drives", 776 DEFAULT_DRAID_DATA, NULL}, 777 { 'S', "draid-spares", "INTEGER", "Number of draid spares", 778 DEFAULT_DRAID_SPARES, NULL}, 779 { 'd', "datasets", "INTEGER", "Number of datasets", 780 DEFAULT_DATASETS_COUNT, NULL}, 781 { 't', "threads", "INTEGER", "Number of ztest threads", 782 DEFAULT_THREADS, NULL}, 783 { 'g', "gang-block-threshold", "INTEGER", 784 "Metaslab gang block threshold", 785 NO_DEFAULT, DEFAULT_FORCE_GANGING_STR}, 786 { 'i', "init-count", "INTEGER", "Number of times to initialize pool", 787 DEFAULT_INITS, NULL}, 788 { 'k', "kill-percentage", "INTEGER", "Kill percentage", 789 NO_DEFAULT, DEFAULT_KILLRATE_STR}, 790 { 'p', "pool-name", "STRING", "Pool name", 791 NO_DEFAULT, DEFAULT_POOL}, 792 { 'f', "vdev-file-directory", "PATH", "File directory for vdev files", 793 NO_DEFAULT, DEFAULT_VDEV_DIR}, 794 { 'M', "multi-host", NULL, 795 "Multi-host; simulate pool imported on remote host", 796 NO_DEFAULT, NULL}, 797 { 'E', "use-existing-pool", NULL, 798 "Use existing pool instead of creating new one", NO_DEFAULT, NULL}, 799 { 'T', "run-time", "INTEGER", "Total run time", 800 NO_DEFAULT, DEFAULT_RUN_TIME_STR}, 801 { 'P', "pass-time", "INTEGER", "Time per pass", 802 NO_DEFAULT, DEFAULT_PASS_TIME_STR}, 803 { 'F', "freeze-loops", "INTEGER", "Max loops in spa_freeze()", 804 DEFAULT_MAX_LOOPS, NULL}, 805 { 'B', "alt-ztest", "PATH", "Alternate ztest path", 806 NO_DEFAULT, NULL}, 807 { 'C', "vdev-class-state", "on|off|random", "vdev class state", 808 NO_DEFAULT, "random"}, 809 { 'X', "raidz-expansion", NULL, 810 "Perform a dedicated raidz expansion test", 811 NO_DEFAULT, NULL}, 812 { 'o', "option", "\"NAME=VALUE\"", 813 "Set the named tunable to the given value", 814 NO_DEFAULT, NULL}, 815 { 'G', "dump-debug-msg", NULL, 816 "Dump zfs_dbgmsg buffer before exiting due to an error", 817 NO_DEFAULT, NULL}, 818 { 'V', "verbose", NULL, 819 "Verbose (use multiple times for ever more verbosity)", 820 NO_DEFAULT, NULL}, 821 { 'h', "help", NULL, "Show this help", 822 NO_DEFAULT, NULL}, 823 {0, 0, 0, 0, 0, 0} 824 }; 825 826 static struct option *long_opts = NULL; 827 static char *short_opts = NULL; 828 829 static void 830 init_options(void) 831 { 832 ASSERT3P(long_opts, ==, NULL); 833 ASSERT3P(short_opts, ==, NULL); 834 835 int count = sizeof (option_table) / sizeof (option_table[0]); 836 long_opts = umem_alloc(sizeof (struct option) * count, UMEM_NOFAIL); 837 838 short_opts = umem_alloc(sizeof (char) * 2 * count, UMEM_NOFAIL); 839 int short_opt_index = 0; 840 841 for (int i = 0; i < count; i++) { 842 long_opts[i].val = option_table[i].short_opt; 843 long_opts[i].name = option_table[i].long_opt; 844 long_opts[i].has_arg = option_table[i].long_opt_param != NULL 845 ? 
required_argument : no_argument; 846 long_opts[i].flag = NULL; 847 short_opts[short_opt_index++] = option_table[i].short_opt; 848 if (option_table[i].long_opt_param != NULL) { 849 short_opts[short_opt_index++] = ':'; 850 } 851 } 852 } 853 854 static void 855 fini_options(void) 856 { 857 int count = sizeof (option_table) / sizeof (option_table[0]); 858 859 umem_free(long_opts, sizeof (struct option) * count); 860 umem_free(short_opts, sizeof (char) * 2 * count); 861 862 long_opts = NULL; 863 short_opts = NULL; 864 } 865 866 static __attribute__((noreturn)) void 867 usage(boolean_t requested) 868 { 869 char option[80]; 870 FILE *fp = requested ? stdout : stderr; 871 872 (void) fprintf(fp, "Usage: %s [OPTIONS...]\n", DEFAULT_POOL); 873 for (int i = 0; option_table[i].short_opt != 0; i++) { 874 if (option_table[i].long_opt_param != NULL) { 875 (void) sprintf(option, " -%c --%s=%s", 876 option_table[i].short_opt, 877 option_table[i].long_opt, 878 option_table[i].long_opt_param); 879 } else { 880 (void) sprintf(option, " -%c --%s", 881 option_table[i].short_opt, 882 option_table[i].long_opt); 883 } 884 (void) fprintf(fp, " %-43s%s", option, 885 option_table[i].comment); 886 887 if (option_table[i].long_opt_param != NULL) { 888 if (option_table[i].default_str != NULL) { 889 (void) fprintf(fp, " (default: %s)", 890 option_table[i].default_str); 891 } else if (option_table[i].default_int != NO_DEFAULT) { 892 (void) fprintf(fp, " (default: %u)", 893 option_table[i].default_int); 894 } 895 } 896 (void) fprintf(fp, "\n"); 897 } 898 exit(requested ? 0 : 1); 899 } 900 901 static uint64_t 902 ztest_random(uint64_t range) 903 { 904 uint64_t r; 905 906 ASSERT3S(ztest_fd_rand, >=, 0); 907 908 if (range == 0) 909 return (0); 910 911 if (read(ztest_fd_rand, &r, sizeof (r)) != sizeof (r)) 912 fatal(B_TRUE, "short read from /dev/urandom"); 913 914 return (r % range); 915 } 916 917 static void 918 ztest_parse_name_value(const char *input, ztest_shared_opts_t *zo) 919 { 920 char name[32]; 921 char *value; 922 int state; 923 924 (void) strlcpy(name, input, sizeof (name)); 925 926 value = strchr(name, '='); 927 if (value == NULL) { 928 (void) fprintf(stderr, "missing value in property=value " 929 "'-C' argument (%s)\n", input); 930 usage(B_FALSE); 931 } 932 *(value) = '\0'; 933 value++; 934 935 if (strcmp(value, "on") == 0) { 936 state = ZTEST_VDEV_CLASS_ON; 937 } else if (strcmp(value, "off") == 0) { 938 state = ZTEST_VDEV_CLASS_OFF; 939 } else if (strcmp(value, "random") == 0) { 940 state = ZTEST_VDEV_CLASS_RND; 941 } else { 942 (void) fprintf(stderr, "invalid property value '%s'\n", value); 943 usage(B_FALSE); 944 } 945 946 if (strcmp(name, "special") == 0) { 947 zo->zo_special_vdevs = state; 948 } else { 949 (void) fprintf(stderr, "invalid property name '%s'\n", name); 950 usage(B_FALSE); 951 } 952 if (zo->zo_verbose >= 3) 953 (void) printf("%s vdev state is '%s'\n", name, value); 954 } 955 956 static void 957 process_options(int argc, char **argv) 958 { 959 char *path; 960 ztest_shared_opts_t *zo = &ztest_opts; 961 962 int opt; 963 uint64_t value; 964 const char *raid_kind = "random"; 965 966 memcpy(zo, &ztest_opts_defaults, sizeof (*zo)); 967 968 init_options(); 969 970 while ((opt = getopt_long(argc, argv, short_opts, long_opts, 971 NULL)) != EOF) { 972 value = 0; 973 switch (opt) { 974 case 'v': 975 case 's': 976 case 'a': 977 case 'm': 978 case 'r': 979 case 'R': 980 case 'D': 981 case 'S': 982 case 'd': 983 case 't': 984 case 'g': 985 case 'i': 986 case 'k': 987 case 'T': 988 case 'P': 989 case 'F': 990 
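			/*
			 * Every option letter listed above takes a numeric
			 * argument; parse it once here and let the second
			 * switch below consume the result.
			 */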
value = nicenumtoull(optarg); 991 } 992 switch (opt) { 993 case 'v': 994 zo->zo_vdevs = value; 995 break; 996 case 's': 997 zo->zo_vdev_size = MAX(SPA_MINDEVSIZE, value); 998 break; 999 case 'a': 1000 zo->zo_ashift = value; 1001 break; 1002 case 'm': 1003 zo->zo_mirrors = value; 1004 break; 1005 case 'r': 1006 zo->zo_raid_children = MAX(1, value); 1007 break; 1008 case 'R': 1009 zo->zo_raid_parity = MIN(MAX(value, 1), 3); 1010 break; 1011 case 'K': 1012 raid_kind = optarg; 1013 break; 1014 case 'D': 1015 zo->zo_draid_data = MAX(1, value); 1016 break; 1017 case 'S': 1018 zo->zo_draid_spares = MAX(1, value); 1019 break; 1020 case 'd': 1021 zo->zo_datasets = MAX(1, value); 1022 break; 1023 case 't': 1024 zo->zo_threads = MAX(1, value); 1025 break; 1026 case 'g': 1027 zo->zo_metaslab_force_ganging = 1028 MAX(SPA_MINBLOCKSIZE << 1, value); 1029 break; 1030 case 'i': 1031 zo->zo_init = value; 1032 break; 1033 case 'k': 1034 zo->zo_killrate = value; 1035 break; 1036 case 'p': 1037 (void) strlcpy(zo->zo_pool, optarg, 1038 sizeof (zo->zo_pool)); 1039 break; 1040 case 'f': 1041 path = realpath(optarg, NULL); 1042 if (path == NULL) { 1043 (void) fprintf(stderr, "error: %s: %s\n", 1044 optarg, strerror(errno)); 1045 usage(B_FALSE); 1046 } else { 1047 (void) strlcpy(zo->zo_dir, path, 1048 sizeof (zo->zo_dir)); 1049 free(path); 1050 } 1051 break; 1052 case 'M': 1053 zo->zo_mmp_test = 1; 1054 break; 1055 case 'V': 1056 zo->zo_verbose++; 1057 break; 1058 case 'X': 1059 zo->zo_raidz_expand_test = RAIDZ_EXPAND_REQUESTED; 1060 break; 1061 case 'E': 1062 zo->zo_init = 0; 1063 break; 1064 case 'T': 1065 zo->zo_time = value; 1066 break; 1067 case 'P': 1068 zo->zo_passtime = MAX(1, value); 1069 break; 1070 case 'F': 1071 zo->zo_maxloops = MAX(1, value); 1072 break; 1073 case 'B': 1074 (void) strlcpy(zo->zo_alt_ztest, optarg, 1075 sizeof (zo->zo_alt_ztest)); 1076 break; 1077 case 'C': 1078 ztest_parse_name_value(optarg, zo); 1079 break; 1080 case 'o': 1081 if (zo->zo_gvars_count >= ZO_GVARS_MAX_COUNT) { 1082 (void) fprintf(stderr, 1083 "max global var count (%zu) exceeded\n", 1084 ZO_GVARS_MAX_COUNT); 1085 usage(B_FALSE); 1086 } 1087 char *v = zo->zo_gvars[zo->zo_gvars_count]; 1088 if (strlcpy(v, optarg, ZO_GVARS_MAX_ARGLEN) >= 1089 ZO_GVARS_MAX_ARGLEN) { 1090 (void) fprintf(stderr, 1091 "global var option '%s' is too long\n", 1092 optarg); 1093 usage(B_FALSE); 1094 } 1095 zo->zo_gvars_count++; 1096 break; 1097 case 'G': 1098 zo->zo_dump_dbgmsg = 1; 1099 break; 1100 case 'h': 1101 usage(B_TRUE); 1102 break; 1103 case '?': 1104 default: 1105 usage(B_FALSE); 1106 break; 1107 } 1108 } 1109 1110 fini_options(); 1111 1112 /* Force compatible options for raidz expansion run */ 1113 if (zo->zo_raidz_expand_test == RAIDZ_EXPAND_REQUESTED) { 1114 zo->zo_mmp_test = 0; 1115 zo->zo_mirrors = 0; 1116 zo->zo_vdevs = 1; 1117 zo->zo_vdev_size = DEFAULT_VDEV_SIZE * 2; 1118 zo->zo_raid_do_expand = B_FALSE; 1119 raid_kind = "raidz"; 1120 } 1121 1122 if (strcmp(raid_kind, "random") == 0) { 1123 switch (ztest_random(3)) { 1124 case 0: 1125 raid_kind = "raidz"; 1126 break; 1127 case 1: 1128 raid_kind = "eraidz"; 1129 break; 1130 case 2: 1131 raid_kind = "draid"; 1132 break; 1133 } 1134 1135 if (ztest_opts.zo_verbose >= 3) 1136 (void) printf("choosing RAID type '%s'\n", raid_kind); 1137 } 1138 1139 if (strcmp(raid_kind, "draid") == 0) { 1140 uint64_t min_devsize; 1141 1142 /* With fewer disk use 256M, otherwise 128M is OK */ 1143 min_devsize = (ztest_opts.zo_raid_children < 16) ? 
1144 (256ULL << 20) : (128ULL << 20); 1145 1146 /* No top-level mirrors with dRAID for now */ 1147 zo->zo_mirrors = 0; 1148 1149 /* Use more appropriate defaults for dRAID */ 1150 if (zo->zo_vdevs == ztest_opts_defaults.zo_vdevs) 1151 zo->zo_vdevs = 1; 1152 if (zo->zo_raid_children == 1153 ztest_opts_defaults.zo_raid_children) 1154 zo->zo_raid_children = 16; 1155 if (zo->zo_ashift < 12) 1156 zo->zo_ashift = 12; 1157 if (zo->zo_vdev_size < min_devsize) 1158 zo->zo_vdev_size = min_devsize; 1159 1160 if (zo->zo_draid_data + zo->zo_raid_parity > 1161 zo->zo_raid_children - zo->zo_draid_spares) { 1162 (void) fprintf(stderr, "error: too few draid " 1163 "children (%d) for stripe width (%d)\n", 1164 zo->zo_raid_children, 1165 zo->zo_draid_data + zo->zo_raid_parity); 1166 usage(B_FALSE); 1167 } 1168 1169 (void) strlcpy(zo->zo_raid_type, VDEV_TYPE_DRAID, 1170 sizeof (zo->zo_raid_type)); 1171 1172 } else if (strcmp(raid_kind, "eraidz") == 0) { 1173 /* using eraidz (expandable raidz) */ 1174 zo->zo_raid_do_expand = B_TRUE; 1175 1176 /* tests expect top-level to be raidz */ 1177 zo->zo_mirrors = 0; 1178 zo->zo_vdevs = 1; 1179 1180 /* Make sure parity is less than data columns */ 1181 zo->zo_raid_parity = MIN(zo->zo_raid_parity, 1182 zo->zo_raid_children - 1); 1183 1184 } else /* using raidz */ { 1185 ASSERT0(strcmp(raid_kind, "raidz")); 1186 1187 zo->zo_raid_parity = MIN(zo->zo_raid_parity, 1188 zo->zo_raid_children - 1); 1189 } 1190 1191 zo->zo_vdevtime = 1192 (zo->zo_vdevs > 0 ? zo->zo_time * NANOSEC / zo->zo_vdevs : 1193 UINT64_MAX >> 2); 1194 1195 if (*zo->zo_alt_ztest) { 1196 const char *invalid_what = "ztest"; 1197 char *val = zo->zo_alt_ztest; 1198 if (0 != access(val, X_OK) || 1199 (strrchr(val, '/') == NULL && (errno == EINVAL))) 1200 goto invalid; 1201 1202 int dirlen = strrchr(val, '/') - val; 1203 strlcpy(zo->zo_alt_libpath, val, 1204 MIN(sizeof (zo->zo_alt_libpath), dirlen + 1)); 1205 invalid_what = "library path", val = zo->zo_alt_libpath; 1206 if (strrchr(val, '/') == NULL && (errno == EINVAL)) 1207 goto invalid; 1208 *strrchr(val, '/') = '\0'; 1209 strlcat(val, "/lib", sizeof (zo->zo_alt_libpath)); 1210 1211 if (0 != access(zo->zo_alt_libpath, X_OK)) 1212 goto invalid; 1213 return; 1214 1215 invalid: 1216 ztest_dump_core = B_FALSE; 1217 fatal(B_TRUE, "invalid alternate %s %s", invalid_what, val); 1218 } 1219 } 1220 1221 static void 1222 ztest_kill(ztest_shared_t *zs) 1223 { 1224 zs->zs_alloc = metaslab_class_get_alloc(spa_normal_class(ztest_spa)); 1225 zs->zs_space = metaslab_class_get_space(spa_normal_class(ztest_spa)); 1226 1227 /* 1228 * Before we kill ourselves, make sure that the config is updated. 1229 * See comment above spa_write_cachefile(). 1230 */ 1231 if (raidz_expand_pause_point != RAIDZ_EXPAND_PAUSE_NONE) { 1232 if (mutex_tryenter(&spa_namespace_lock)) { 1233 spa_write_cachefile(ztest_spa, B_FALSE, B_FALSE, 1234 B_FALSE); 1235 mutex_exit(&spa_namespace_lock); 1236 1237 ztest_scratch_state->zs_raidz_scratch_verify_pause = 1238 raidz_expand_pause_point; 1239 } else { 1240 /* 1241 * Do not verify scratch object in case if 1242 * spa_namespace_lock cannot be acquired, 1243 * it can cause deadlock in spa_config_update(). 
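			 * (The pause point is cleared below and the shared
			 * zs_raidz_scratch_verify_pause state is left unset,
			 * so no scratch verification is attempted for this
			 * pass.)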
1244 */ 1245 raidz_expand_pause_point = RAIDZ_EXPAND_PAUSE_NONE; 1246 1247 return; 1248 } 1249 } else { 1250 mutex_enter(&spa_namespace_lock); 1251 spa_write_cachefile(ztest_spa, B_FALSE, B_FALSE, B_FALSE); 1252 mutex_exit(&spa_namespace_lock); 1253 } 1254 1255 (void) raise(SIGKILL); 1256 } 1257 1258 static void 1259 ztest_record_enospc(const char *s) 1260 { 1261 (void) s; 1262 ztest_shared->zs_enospc_count++; 1263 } 1264 1265 static uint64_t 1266 ztest_get_ashift(void) 1267 { 1268 if (ztest_opts.zo_ashift == 0) 1269 return (SPA_MINBLOCKSHIFT + ztest_random(5)); 1270 return (ztest_opts.zo_ashift); 1271 } 1272 1273 static boolean_t 1274 ztest_is_draid_spare(const char *name) 1275 { 1276 uint64_t spare_id = 0, parity = 0, vdev_id = 0; 1277 1278 if (sscanf(name, VDEV_TYPE_DRAID "%"PRIu64"-%"PRIu64"-%"PRIu64"", 1279 &parity, &vdev_id, &spare_id) == 3) { 1280 return (B_TRUE); 1281 } 1282 1283 return (B_FALSE); 1284 } 1285 1286 static nvlist_t * 1287 make_vdev_file(const char *path, const char *aux, const char *pool, 1288 size_t size, uint64_t ashift) 1289 { 1290 char *pathbuf = NULL; 1291 uint64_t vdev; 1292 nvlist_t *file; 1293 boolean_t draid_spare = B_FALSE; 1294 1295 1296 if (ashift == 0) 1297 ashift = ztest_get_ashift(); 1298 1299 if (path == NULL) { 1300 pathbuf = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); 1301 path = pathbuf; 1302 1303 if (aux != NULL) { 1304 vdev = ztest_shared->zs_vdev_aux; 1305 (void) snprintf(pathbuf, MAXPATHLEN, 1306 ztest_aux_template, ztest_opts.zo_dir, 1307 pool == NULL ? ztest_opts.zo_pool : pool, 1308 aux, vdev); 1309 } else { 1310 vdev = ztest_shared->zs_vdev_next_leaf++; 1311 (void) snprintf(pathbuf, MAXPATHLEN, 1312 ztest_dev_template, ztest_opts.zo_dir, 1313 pool == NULL ? ztest_opts.zo_pool : pool, vdev); 1314 } 1315 } else { 1316 draid_spare = ztest_is_draid_spare(path); 1317 } 1318 1319 if (size != 0 && !draid_spare) { 1320 int fd = open(path, O_RDWR | O_CREAT | O_TRUNC, 0666); 1321 if (fd == -1) 1322 fatal(B_TRUE, "can't open %s", path); 1323 if (ftruncate(fd, size) != 0) 1324 fatal(B_TRUE, "can't ftruncate %s", path); 1325 (void) close(fd); 1326 } 1327 1328 file = fnvlist_alloc(); 1329 fnvlist_add_string(file, ZPOOL_CONFIG_TYPE, 1330 draid_spare ? 
VDEV_TYPE_DRAID_SPARE : VDEV_TYPE_FILE); 1331 fnvlist_add_string(file, ZPOOL_CONFIG_PATH, path); 1332 fnvlist_add_uint64(file, ZPOOL_CONFIG_ASHIFT, ashift); 1333 umem_free(pathbuf, MAXPATHLEN); 1334 1335 return (file); 1336 } 1337 1338 static nvlist_t * 1339 make_vdev_raid(const char *path, const char *aux, const char *pool, size_t size, 1340 uint64_t ashift, int r) 1341 { 1342 nvlist_t *raid, **child; 1343 int c; 1344 1345 if (r < 2) 1346 return (make_vdev_file(path, aux, pool, size, ashift)); 1347 child = umem_alloc(r * sizeof (nvlist_t *), UMEM_NOFAIL); 1348 1349 for (c = 0; c < r; c++) 1350 child[c] = make_vdev_file(path, aux, pool, size, ashift); 1351 1352 raid = fnvlist_alloc(); 1353 fnvlist_add_string(raid, ZPOOL_CONFIG_TYPE, 1354 ztest_opts.zo_raid_type); 1355 fnvlist_add_uint64(raid, ZPOOL_CONFIG_NPARITY, 1356 ztest_opts.zo_raid_parity); 1357 fnvlist_add_nvlist_array(raid, ZPOOL_CONFIG_CHILDREN, 1358 (const nvlist_t **)child, r); 1359 1360 if (strcmp(ztest_opts.zo_raid_type, VDEV_TYPE_DRAID) == 0) { 1361 uint64_t ndata = ztest_opts.zo_draid_data; 1362 uint64_t nparity = ztest_opts.zo_raid_parity; 1363 uint64_t nspares = ztest_opts.zo_draid_spares; 1364 uint64_t children = ztest_opts.zo_raid_children; 1365 uint64_t ngroups = 1; 1366 1367 /* 1368 * Calculate the minimum number of groups required to fill a 1369 * slice. This is the LCM of the stripe width (data + parity) 1370 * and the number of data drives (children - spares). 1371 */ 1372 while (ngroups * (ndata + nparity) % (children - nspares) != 0) 1373 ngroups++; 1374 1375 /* Store the basic dRAID configuration. */ 1376 fnvlist_add_uint64(raid, ZPOOL_CONFIG_DRAID_NDATA, ndata); 1377 fnvlist_add_uint64(raid, ZPOOL_CONFIG_DRAID_NSPARES, nspares); 1378 fnvlist_add_uint64(raid, ZPOOL_CONFIG_DRAID_NGROUPS, ngroups); 1379 } 1380 1381 for (c = 0; c < r; c++) 1382 fnvlist_free(child[c]); 1383 1384 umem_free(child, r * sizeof (nvlist_t *)); 1385 1386 return (raid); 1387 } 1388 1389 static nvlist_t * 1390 make_vdev_mirror(const char *path, const char *aux, const char *pool, 1391 size_t size, uint64_t ashift, int r, int m) 1392 { 1393 nvlist_t *mirror, **child; 1394 int c; 1395 1396 if (m < 1) 1397 return (make_vdev_raid(path, aux, pool, size, ashift, r)); 1398 1399 child = umem_alloc(m * sizeof (nvlist_t *), UMEM_NOFAIL); 1400 1401 for (c = 0; c < m; c++) 1402 child[c] = make_vdev_raid(path, aux, pool, size, ashift, r); 1403 1404 mirror = fnvlist_alloc(); 1405 fnvlist_add_string(mirror, ZPOOL_CONFIG_TYPE, VDEV_TYPE_MIRROR); 1406 fnvlist_add_nvlist_array(mirror, ZPOOL_CONFIG_CHILDREN, 1407 (const nvlist_t **)child, m); 1408 1409 for (c = 0; c < m; c++) 1410 fnvlist_free(child[c]); 1411 1412 umem_free(child, m * sizeof (nvlist_t *)); 1413 1414 return (mirror); 1415 } 1416 1417 static nvlist_t * 1418 make_vdev_root(const char *path, const char *aux, const char *pool, size_t size, 1419 uint64_t ashift, const char *class, int r, int m, int t) 1420 { 1421 nvlist_t *root, **child; 1422 int c; 1423 boolean_t log; 1424 1425 ASSERT3S(t, >, 0); 1426 1427 log = (class != NULL && strcmp(class, "log") == 0); 1428 1429 child = umem_alloc(t * sizeof (nvlist_t *), UMEM_NOFAIL); 1430 1431 for (c = 0; c < t; c++) { 1432 child[c] = make_vdev_mirror(path, aux, pool, size, ashift, 1433 r, m); 1434 fnvlist_add_uint64(child[c], ZPOOL_CONFIG_IS_LOG, log); 1435 1436 if (class != NULL && class[0] != '\0') { 1437 ASSERT(m > 1 || log); /* expecting a mirror */ 1438 fnvlist_add_string(child[c], 1439 ZPOOL_CONFIG_ALLOCATION_BIAS, class); 1440 } 1441 } 1442 1443 root = 
fnvlist_alloc(); 1444 fnvlist_add_string(root, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT); 1445 fnvlist_add_nvlist_array(root, aux ? aux : ZPOOL_CONFIG_CHILDREN, 1446 (const nvlist_t **)child, t); 1447 1448 for (c = 0; c < t; c++) 1449 fnvlist_free(child[c]); 1450 1451 umem_free(child, t * sizeof (nvlist_t *)); 1452 1453 return (root); 1454 } 1455 1456 /* 1457 * Find a random spa version. Returns back a random spa version in the 1458 * range [initial_version, SPA_VERSION_FEATURES]. 1459 */ 1460 static uint64_t 1461 ztest_random_spa_version(uint64_t initial_version) 1462 { 1463 uint64_t version = initial_version; 1464 1465 if (version <= SPA_VERSION_BEFORE_FEATURES) { 1466 version = version + 1467 ztest_random(SPA_VERSION_BEFORE_FEATURES - version + 1); 1468 } 1469 1470 if (version > SPA_VERSION_BEFORE_FEATURES) 1471 version = SPA_VERSION_FEATURES; 1472 1473 ASSERT(SPA_VERSION_IS_SUPPORTED(version)); 1474 return (version); 1475 } 1476 1477 static int 1478 ztest_random_blocksize(void) 1479 { 1480 ASSERT3U(ztest_spa->spa_max_ashift, !=, 0); 1481 1482 /* 1483 * Choose a block size >= the ashift. 1484 * If the SPA supports new MAXBLOCKSIZE, test up to 1MB blocks. 1485 */ 1486 int maxbs = SPA_OLD_MAXBLOCKSHIFT; 1487 if (spa_maxblocksize(ztest_spa) == SPA_MAXBLOCKSIZE) 1488 maxbs = 20; 1489 uint64_t block_shift = 1490 ztest_random(maxbs - ztest_spa->spa_max_ashift + 1); 1491 return (1 << (SPA_MINBLOCKSHIFT + block_shift)); 1492 } 1493 1494 static int 1495 ztest_random_dnodesize(void) 1496 { 1497 int slots; 1498 int max_slots = spa_maxdnodesize(ztest_spa) >> DNODE_SHIFT; 1499 1500 if (max_slots == DNODE_MIN_SLOTS) 1501 return (DNODE_MIN_SIZE); 1502 1503 /* 1504 * Weight the random distribution more heavily toward smaller 1505 * dnode sizes since that is more likely to reflect real-world 1506 * usage. 1507 */ 1508 ASSERT3U(max_slots, >, 4); 1509 switch (ztest_random(10)) { 1510 case 0: 1511 slots = 5 + ztest_random(max_slots - 4); 1512 break; 1513 case 1 ... 4: 1514 slots = 2 + ztest_random(3); 1515 break; 1516 default: 1517 slots = 1; 1518 break; 1519 } 1520 1521 return (slots << DNODE_SHIFT); 1522 } 1523 1524 static int 1525 ztest_random_ibshift(void) 1526 { 1527 return (DN_MIN_INDBLKSHIFT + 1528 ztest_random(DN_MAX_INDBLKSHIFT - DN_MIN_INDBLKSHIFT + 1)); 1529 } 1530 1531 static uint64_t 1532 ztest_random_vdev_top(spa_t *spa, boolean_t log_ok) 1533 { 1534 uint64_t top; 1535 vdev_t *rvd = spa->spa_root_vdev; 1536 vdev_t *tvd; 1537 1538 ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); 1539 1540 do { 1541 top = ztest_random(rvd->vdev_children); 1542 tvd = rvd->vdev_child[top]; 1543 } while (!vdev_is_concrete(tvd) || (tvd->vdev_islog && !log_ok) || 1544 tvd->vdev_mg == NULL || tvd->vdev_mg->mg_class == NULL); 1545 1546 return (top); 1547 } 1548 1549 static uint64_t 1550 ztest_random_dsl_prop(zfs_prop_t prop) 1551 { 1552 uint64_t value; 1553 1554 do { 1555 value = zfs_prop_random_value(prop, ztest_random(-1ULL)); 1556 } while (prop == ZFS_PROP_CHECKSUM && value == ZIO_CHECKSUM_OFF); 1557 1558 return (value); 1559 } 1560 1561 static int 1562 ztest_dsl_prop_set_uint64(char *osname, zfs_prop_t prop, uint64_t value, 1563 boolean_t inherit) 1564 { 1565 const char *propname = zfs_prop_to_name(prop); 1566 const char *valname; 1567 char *setpoint; 1568 uint64_t curval; 1569 int error; 1570 1571 error = dsl_prop_set_int(osname, propname, 1572 (inherit ? 
ZPROP_SRC_NONE : ZPROP_SRC_LOCAL), value); 1573 1574 if (error == ENOSPC) { 1575 ztest_record_enospc(FTAG); 1576 return (error); 1577 } 1578 ASSERT0(error); 1579 1580 setpoint = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); 1581 VERIFY0(dsl_prop_get_integer(osname, propname, &curval, setpoint)); 1582 1583 if (ztest_opts.zo_verbose >= 6) { 1584 int err; 1585 1586 err = zfs_prop_index_to_string(prop, curval, &valname); 1587 if (err) 1588 (void) printf("%s %s = %llu at '%s'\n", osname, 1589 propname, (unsigned long long)curval, setpoint); 1590 else 1591 (void) printf("%s %s = %s at '%s'\n", 1592 osname, propname, valname, setpoint); 1593 } 1594 umem_free(setpoint, MAXPATHLEN); 1595 1596 return (error); 1597 } 1598 1599 static int 1600 ztest_spa_prop_set_uint64(zpool_prop_t prop, uint64_t value) 1601 { 1602 spa_t *spa = ztest_spa; 1603 nvlist_t *props = NULL; 1604 int error; 1605 1606 props = fnvlist_alloc(); 1607 fnvlist_add_uint64(props, zpool_prop_to_name(prop), value); 1608 1609 error = spa_prop_set(spa, props); 1610 1611 fnvlist_free(props); 1612 1613 if (error == ENOSPC) { 1614 ztest_record_enospc(FTAG); 1615 return (error); 1616 } 1617 ASSERT0(error); 1618 1619 return (error); 1620 } 1621 1622 static int 1623 ztest_dmu_objset_own(const char *name, dmu_objset_type_t type, 1624 boolean_t readonly, boolean_t decrypt, const void *tag, objset_t **osp) 1625 { 1626 int err; 1627 char *cp = NULL; 1628 char ddname[ZFS_MAX_DATASET_NAME_LEN]; 1629 1630 strlcpy(ddname, name, sizeof (ddname)); 1631 cp = strchr(ddname, '@'); 1632 if (cp != NULL) 1633 *cp = '\0'; 1634 1635 err = dmu_objset_own(name, type, readonly, decrypt, tag, osp); 1636 while (decrypt && err == EACCES) { 1637 dsl_crypto_params_t *dcp; 1638 nvlist_t *crypto_args = fnvlist_alloc(); 1639 1640 fnvlist_add_uint8_array(crypto_args, "wkeydata", 1641 (uint8_t *)ztest_wkeydata, WRAPPING_KEY_LEN); 1642 VERIFY0(dsl_crypto_params_create_nvlist(DCP_CMD_NONE, NULL, 1643 crypto_args, &dcp)); 1644 err = spa_keystore_load_wkey(ddname, dcp, B_FALSE); 1645 /* 1646 * Note: if there was an error loading, the wkey was not 1647 * consumed, and needs to be freed. 1648 */ 1649 dsl_crypto_params_free(dcp, (err != 0)); 1650 fnvlist_free(crypto_args); 1651 1652 if (err == EINVAL) { 1653 /* 1654 * We couldn't load a key for this dataset so try 1655 * the parent. This loop will eventually hit the 1656 * encryption root since ztest only makes clones 1657 * as children of their origin datasets. 
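			 * Each retry strips the last path component from
			 * ddname (just below) and loops with EACCES, walking
			 * one level closer to that encryption root.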
1658 */ 1659 cp = strrchr(ddname, '/'); 1660 if (cp == NULL) 1661 return (err); 1662 1663 *cp = '\0'; 1664 err = EACCES; 1665 continue; 1666 } else if (err != 0) { 1667 break; 1668 } 1669 1670 err = dmu_objset_own(name, type, readonly, decrypt, tag, osp); 1671 break; 1672 } 1673 1674 return (err); 1675 } 1676 1677 static void 1678 ztest_rll_init(rll_t *rll) 1679 { 1680 rll->rll_writer = NULL; 1681 rll->rll_readers = 0; 1682 mutex_init(&rll->rll_lock, NULL, MUTEX_DEFAULT, NULL); 1683 cv_init(&rll->rll_cv, NULL, CV_DEFAULT, NULL); 1684 } 1685 1686 static void 1687 ztest_rll_destroy(rll_t *rll) 1688 { 1689 ASSERT3P(rll->rll_writer, ==, NULL); 1690 ASSERT0(rll->rll_readers); 1691 mutex_destroy(&rll->rll_lock); 1692 cv_destroy(&rll->rll_cv); 1693 } 1694 1695 static void 1696 ztest_rll_lock(rll_t *rll, rl_type_t type) 1697 { 1698 mutex_enter(&rll->rll_lock); 1699 1700 if (type == ZTRL_READER) { 1701 while (rll->rll_writer != NULL) 1702 (void) cv_wait(&rll->rll_cv, &rll->rll_lock); 1703 rll->rll_readers++; 1704 } else { 1705 while (rll->rll_writer != NULL || rll->rll_readers) 1706 (void) cv_wait(&rll->rll_cv, &rll->rll_lock); 1707 rll->rll_writer = curthread; 1708 } 1709 1710 mutex_exit(&rll->rll_lock); 1711 } 1712 1713 static void 1714 ztest_rll_unlock(rll_t *rll) 1715 { 1716 mutex_enter(&rll->rll_lock); 1717 1718 if (rll->rll_writer) { 1719 ASSERT0(rll->rll_readers); 1720 rll->rll_writer = NULL; 1721 } else { 1722 ASSERT3S(rll->rll_readers, >, 0); 1723 ASSERT3P(rll->rll_writer, ==, NULL); 1724 rll->rll_readers--; 1725 } 1726 1727 if (rll->rll_writer == NULL && rll->rll_readers == 0) 1728 cv_broadcast(&rll->rll_cv); 1729 1730 mutex_exit(&rll->rll_lock); 1731 } 1732 1733 static void 1734 ztest_object_lock(ztest_ds_t *zd, uint64_t object, rl_type_t type) 1735 { 1736 rll_t *rll = &zd->zd_object_lock[object & (ZTEST_OBJECT_LOCKS - 1)]; 1737 1738 ztest_rll_lock(rll, type); 1739 } 1740 1741 static void 1742 ztest_object_unlock(ztest_ds_t *zd, uint64_t object) 1743 { 1744 rll_t *rll = &zd->zd_object_lock[object & (ZTEST_OBJECT_LOCKS - 1)]; 1745 1746 ztest_rll_unlock(rll); 1747 } 1748 1749 static rl_t * 1750 ztest_range_lock(ztest_ds_t *zd, uint64_t object, uint64_t offset, 1751 uint64_t size, rl_type_t type) 1752 { 1753 uint64_t hash = object ^ (offset % (ZTEST_RANGE_LOCKS + 1)); 1754 rll_t *rll = &zd->zd_range_lock[hash & (ZTEST_RANGE_LOCKS - 1)]; 1755 rl_t *rl; 1756 1757 rl = umem_alloc(sizeof (*rl), UMEM_NOFAIL); 1758 rl->rl_object = object; 1759 rl->rl_offset = offset; 1760 rl->rl_size = size; 1761 rl->rl_lock = rll; 1762 1763 ztest_rll_lock(rll, type); 1764 1765 return (rl); 1766 } 1767 1768 static void 1769 ztest_range_unlock(rl_t *rl) 1770 { 1771 rll_t *rll = rl->rl_lock; 1772 1773 ztest_rll_unlock(rll); 1774 1775 umem_free(rl, sizeof (*rl)); 1776 } 1777 1778 static void 1779 ztest_zd_init(ztest_ds_t *zd, ztest_shared_ds_t *szd, objset_t *os) 1780 { 1781 zd->zd_os = os; 1782 zd->zd_zilog = dmu_objset_zil(os); 1783 zd->zd_shared = szd; 1784 dmu_objset_name(os, zd->zd_name); 1785 int l; 1786 1787 if (zd->zd_shared != NULL) 1788 zd->zd_shared->zd_seq = 0; 1789 1790 VERIFY0(pthread_rwlock_init(&zd->zd_zilog_lock, NULL)); 1791 mutex_init(&zd->zd_dirobj_lock, NULL, MUTEX_DEFAULT, NULL); 1792 1793 for (l = 0; l < ZTEST_OBJECT_LOCKS; l++) 1794 ztest_rll_init(&zd->zd_object_lock[l]); 1795 1796 for (l = 0; l < ZTEST_RANGE_LOCKS; l++) 1797 ztest_rll_init(&zd->zd_range_lock[l]); 1798 } 1799 1800 static void 1801 ztest_zd_fini(ztest_ds_t *zd) 1802 { 1803 int l; 1804 1805 mutex_destroy(&zd->zd_dirobj_lock); 
1806 (void) pthread_rwlock_destroy(&zd->zd_zilog_lock); 1807 1808 for (l = 0; l < ZTEST_OBJECT_LOCKS; l++) 1809 ztest_rll_destroy(&zd->zd_object_lock[l]); 1810 1811 for (l = 0; l < ZTEST_RANGE_LOCKS; l++) 1812 ztest_rll_destroy(&zd->zd_range_lock[l]); 1813 } 1814 1815 #define DMU_TX_MIGHTWAIT \ 1816 (ztest_random(10) == 0 ? DMU_TX_NOWAIT : DMU_TX_WAIT) 1817 1818 static uint64_t 1819 ztest_tx_assign(dmu_tx_t *tx, dmu_tx_flag_t txg_how, const char *tag) 1820 { 1821 uint64_t txg; 1822 int error; 1823 1824 /* 1825 * Attempt to assign tx to some transaction group. 1826 */ 1827 error = dmu_tx_assign(tx, txg_how); 1828 if (error) { 1829 if (error == ERESTART) { 1830 ASSERT3U(txg_how, ==, DMU_TX_NOWAIT); 1831 dmu_tx_wait(tx); 1832 } else if (error == ENOSPC) { 1833 ztest_record_enospc(tag); 1834 } else { 1835 ASSERT(error == EDQUOT || error == EIO); 1836 } 1837 dmu_tx_abort(tx); 1838 return (0); 1839 } 1840 txg = dmu_tx_get_txg(tx); 1841 ASSERT3U(txg, !=, 0); 1842 return (txg); 1843 } 1844 1845 static void 1846 ztest_bt_generate(ztest_block_tag_t *bt, objset_t *os, uint64_t object, 1847 uint64_t dnodesize, uint64_t offset, uint64_t gen, uint64_t txg, 1848 uint64_t crtxg) 1849 { 1850 bt->bt_magic = BT_MAGIC; 1851 bt->bt_objset = dmu_objset_id(os); 1852 bt->bt_object = object; 1853 bt->bt_dnodesize = dnodesize; 1854 bt->bt_offset = offset; 1855 bt->bt_gen = gen; 1856 bt->bt_txg = txg; 1857 bt->bt_crtxg = crtxg; 1858 } 1859 1860 static void 1861 ztest_bt_verify(ztest_block_tag_t *bt, objset_t *os, uint64_t object, 1862 uint64_t dnodesize, uint64_t offset, uint64_t gen, uint64_t txg, 1863 uint64_t crtxg) 1864 { 1865 ASSERT3U(bt->bt_magic, ==, BT_MAGIC); 1866 ASSERT3U(bt->bt_objset, ==, dmu_objset_id(os)); 1867 ASSERT3U(bt->bt_object, ==, object); 1868 ASSERT3U(bt->bt_dnodesize, ==, dnodesize); 1869 ASSERT3U(bt->bt_offset, ==, offset); 1870 ASSERT3U(bt->bt_gen, <=, gen); 1871 ASSERT3U(bt->bt_txg, <=, txg); 1872 ASSERT3U(bt->bt_crtxg, ==, crtxg); 1873 } 1874 1875 static ztest_block_tag_t * 1876 ztest_bt_bonus(dmu_buf_t *db) 1877 { 1878 dmu_object_info_t doi; 1879 ztest_block_tag_t *bt; 1880 1881 dmu_object_info_from_db(db, &doi); 1882 ASSERT3U(doi.doi_bonus_size, <=, db->db_size); 1883 ASSERT3U(doi.doi_bonus_size, >=, sizeof (*bt)); 1884 bt = (void *)((char *)db->db_data + doi.doi_bonus_size - sizeof (*bt)); 1885 1886 return (bt); 1887 } 1888 1889 /* 1890 * Generate a token to fill up unused bonus buffer space. Try to make 1891 * it unique to the object, generation, and offset to verify that data 1892 * is not getting overwritten by data from other dnodes. 1893 */ 1894 #define ZTEST_BONUS_FILL_TOKEN(obj, ds, gen, offset) \ 1895 (((ds) << 48) | ((gen) << 32) | ((obj) << 8) | (offset)) 1896 1897 /* 1898 * Fill up the unused bonus buffer region before the block tag with a 1899 * verifiable pattern. Filling the whole bonus area with non-zero data 1900 * helps ensure that all dnode traversal code properly skips the 1901 * interior regions of large dnodes. 1902 */ 1903 static void 1904 ztest_fill_unused_bonus(dmu_buf_t *db, void *end, uint64_t obj, 1905 objset_t *os, uint64_t gen) 1906 { 1907 uint64_t *bonusp; 1908 1909 ASSERT(IS_P2ALIGNED((char *)end - (char *)db->db_data, 8)); 1910 1911 for (bonusp = db->db_data; bonusp < (uint64_t *)end; bonusp++) { 1912 uint64_t token = ZTEST_BONUS_FILL_TOKEN(obj, dmu_objset_id(os), 1913 gen, bonusp - (uint64_t *)db->db_data); 1914 *bonusp = token; 1915 } 1916 } 1917 1918 /* 1919 * Verify that the unused area of a bonus buffer is filled with the 1920 * expected tokens. 
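 * (Each 64-bit token packs the dataset id, object generation, object
 * number, and word offset, exactly as laid out by ZTEST_BONUS_FILL_TOKEN()
 * above.)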
1921 */ 1922 static void 1923 ztest_verify_unused_bonus(dmu_buf_t *db, void *end, uint64_t obj, 1924 objset_t *os, uint64_t gen) 1925 { 1926 uint64_t *bonusp; 1927 1928 for (bonusp = db->db_data; bonusp < (uint64_t *)end; bonusp++) { 1929 uint64_t token = ZTEST_BONUS_FILL_TOKEN(obj, dmu_objset_id(os), 1930 gen, bonusp - (uint64_t *)db->db_data); 1931 VERIFY3U(*bonusp, ==, token); 1932 } 1933 } 1934 1935 /* 1936 * ZIL logging ops 1937 */ 1938 1939 #define lrz_type lr_mode 1940 #define lrz_blocksize lr_uid 1941 #define lrz_ibshift lr_gid 1942 #define lrz_bonustype lr_rdev 1943 #define lrz_dnodesize lr_crtime[1] 1944 1945 static void 1946 ztest_log_create(ztest_ds_t *zd, dmu_tx_t *tx, lr_create_t *lr) 1947 { 1948 char *name = (char *)&lr->lr_data[0]; /* name follows lr */ 1949 size_t namesize = strlen(name) + 1; 1950 itx_t *itx; 1951 1952 if (zil_replaying(zd->zd_zilog, tx)) 1953 return; 1954 1955 itx = zil_itx_create(TX_CREATE, sizeof (*lr) + namesize); 1956 memcpy(&itx->itx_lr + 1, &lr->lr_create.lr_common + 1, 1957 sizeof (*lr) + namesize - sizeof (lr_t)); 1958 1959 zil_itx_assign(zd->zd_zilog, itx, tx); 1960 } 1961 1962 static void 1963 ztest_log_remove(ztest_ds_t *zd, dmu_tx_t *tx, lr_remove_t *lr, uint64_t object) 1964 { 1965 char *name = (char *)&lr->lr_data[0]; /* name follows lr */ 1966 size_t namesize = strlen(name) + 1; 1967 itx_t *itx; 1968 1969 if (zil_replaying(zd->zd_zilog, tx)) 1970 return; 1971 1972 itx = zil_itx_create(TX_REMOVE, sizeof (*lr) + namesize); 1973 memcpy(&itx->itx_lr + 1, &lr->lr_common + 1, 1974 sizeof (*lr) + namesize - sizeof (lr_t)); 1975 1976 itx->itx_oid = object; 1977 zil_itx_assign(zd->zd_zilog, itx, tx); 1978 } 1979 1980 static void 1981 ztest_log_write(ztest_ds_t *zd, dmu_tx_t *tx, lr_write_t *lr) 1982 { 1983 itx_t *itx; 1984 itx_wr_state_t write_state = ztest_random(WR_NUM_STATES); 1985 1986 if (zil_replaying(zd->zd_zilog, tx)) 1987 return; 1988 1989 if (lr->lr_length > zil_max_log_data(zd->zd_zilog, sizeof (lr_write_t))) 1990 write_state = WR_INDIRECT; 1991 1992 itx = zil_itx_create(TX_WRITE, 1993 sizeof (*lr) + (write_state == WR_COPIED ? 
lr->lr_length : 0)); 1994 1995 if (write_state == WR_COPIED && 1996 dmu_read(zd->zd_os, lr->lr_foid, lr->lr_offset, lr->lr_length, 1997 ((lr_write_t *)&itx->itx_lr) + 1, DMU_READ_NO_PREFETCH | 1998 DMU_KEEP_CACHING) != 0) { 1999 zil_itx_destroy(itx); 2000 itx = zil_itx_create(TX_WRITE, sizeof (*lr)); 2001 write_state = WR_NEED_COPY; 2002 } 2003 itx->itx_private = zd; 2004 itx->itx_wr_state = write_state; 2005 itx->itx_sync = (ztest_random(8) == 0); 2006 2007 memcpy(&itx->itx_lr + 1, &lr->lr_common + 1, 2008 sizeof (*lr) - sizeof (lr_t)); 2009 2010 zil_itx_assign(zd->zd_zilog, itx, tx); 2011 } 2012 2013 static void 2014 ztest_log_truncate(ztest_ds_t *zd, dmu_tx_t *tx, lr_truncate_t *lr) 2015 { 2016 itx_t *itx; 2017 2018 if (zil_replaying(zd->zd_zilog, tx)) 2019 return; 2020 2021 itx = zil_itx_create(TX_TRUNCATE, sizeof (*lr)); 2022 memcpy(&itx->itx_lr + 1, &lr->lr_common + 1, 2023 sizeof (*lr) - sizeof (lr_t)); 2024 2025 itx->itx_sync = B_FALSE; 2026 zil_itx_assign(zd->zd_zilog, itx, tx); 2027 } 2028 2029 static void 2030 ztest_log_setattr(ztest_ds_t *zd, dmu_tx_t *tx, lr_setattr_t *lr) 2031 { 2032 itx_t *itx; 2033 2034 if (zil_replaying(zd->zd_zilog, tx)) 2035 return; 2036 2037 itx = zil_itx_create(TX_SETATTR, sizeof (*lr)); 2038 memcpy(&itx->itx_lr + 1, &lr->lr_common + 1, 2039 sizeof (*lr) - sizeof (lr_t)); 2040 2041 itx->itx_sync = B_FALSE; 2042 zil_itx_assign(zd->zd_zilog, itx, tx); 2043 } 2044 2045 /* 2046 * ZIL replay ops 2047 */ 2048 static int 2049 ztest_replay_create(void *arg1, void *arg2, boolean_t byteswap) 2050 { 2051 ztest_ds_t *zd = arg1; 2052 lr_create_t *lrc = arg2; 2053 _lr_create_t *lr = &lrc->lr_create; 2054 char *name = (char *)&lrc->lr_data[0]; /* name follows lr */ 2055 objset_t *os = zd->zd_os; 2056 ztest_block_tag_t *bbt; 2057 dmu_buf_t *db; 2058 dmu_tx_t *tx; 2059 uint64_t txg; 2060 int error = 0; 2061 int bonuslen; 2062 2063 if (byteswap) 2064 byteswap_uint64_array(lr, sizeof (*lr)); 2065 2066 ASSERT3U(lr->lr_doid, ==, ZTEST_DIROBJ); 2067 ASSERT3S(name[0], !=, '\0'); 2068 2069 tx = dmu_tx_create(os); 2070 2071 dmu_tx_hold_zap(tx, lr->lr_doid, B_TRUE, name); 2072 2073 if (lr->lrz_type == DMU_OT_ZAP_OTHER) { 2074 dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL); 2075 } else { 2076 dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); 2077 } 2078 2079 txg = ztest_tx_assign(tx, DMU_TX_WAIT, FTAG); 2080 if (txg == 0) 2081 return (ENOSPC); 2082 2083 ASSERT3U(dmu_objset_zil(os)->zl_replay, ==, !!lr->lr_foid); 2084 bonuslen = DN_BONUS_SIZE(lr->lrz_dnodesize); 2085 2086 if (lr->lrz_type == DMU_OT_ZAP_OTHER) { 2087 if (lr->lr_foid == 0) { 2088 lr->lr_foid = zap_create_dnsize(os, 2089 lr->lrz_type, lr->lrz_bonustype, 2090 bonuslen, lr->lrz_dnodesize, tx); 2091 } else { 2092 error = zap_create_claim_dnsize(os, lr->lr_foid, 2093 lr->lrz_type, lr->lrz_bonustype, 2094 bonuslen, lr->lrz_dnodesize, tx); 2095 } 2096 } else { 2097 if (lr->lr_foid == 0) { 2098 lr->lr_foid = dmu_object_alloc_dnsize(os, 2099 lr->lrz_type, 0, lr->lrz_bonustype, 2100 bonuslen, lr->lrz_dnodesize, tx); 2101 } else { 2102 error = dmu_object_claim_dnsize(os, lr->lr_foid, 2103 lr->lrz_type, 0, lr->lrz_bonustype, 2104 bonuslen, lr->lrz_dnodesize, tx); 2105 } 2106 } 2107 2108 if (error) { 2109 ASSERT3U(error, ==, EEXIST); 2110 ASSERT(zd->zd_zilog->zl_replay); 2111 dmu_tx_commit(tx); 2112 return (error); 2113 } 2114 2115 ASSERT3U(lr->lr_foid, !=, 0); 2116 2117 if (lr->lrz_type != DMU_OT_ZAP_OTHER) 2118 VERIFY0(dmu_object_set_blocksize(os, lr->lr_foid, 2119 lr->lrz_blocksize, lr->lrz_ibshift, tx)); 2120 2121 
VERIFY0(dmu_bonus_hold(os, lr->lr_foid, FTAG, &db)); 2122 bbt = ztest_bt_bonus(db); 2123 dmu_buf_will_dirty(db, tx); 2124 ztest_bt_generate(bbt, os, lr->lr_foid, lr->lrz_dnodesize, -1ULL, 2125 lr->lr_gen, txg, txg); 2126 ztest_fill_unused_bonus(db, bbt, lr->lr_foid, os, lr->lr_gen); 2127 dmu_buf_rele(db, FTAG); 2128 2129 VERIFY0(zap_add(os, lr->lr_doid, name, sizeof (uint64_t), 1, 2130 &lr->lr_foid, tx)); 2131 2132 (void) ztest_log_create(zd, tx, lrc); 2133 2134 dmu_tx_commit(tx); 2135 2136 return (0); 2137 } 2138 2139 static int 2140 ztest_replay_remove(void *arg1, void *arg2, boolean_t byteswap) 2141 { 2142 ztest_ds_t *zd = arg1; 2143 lr_remove_t *lr = arg2; 2144 char *name = (char *)&lr->lr_data[0]; /* name follows lr */ 2145 objset_t *os = zd->zd_os; 2146 dmu_object_info_t doi; 2147 dmu_tx_t *tx; 2148 uint64_t object, txg; 2149 2150 if (byteswap) 2151 byteswap_uint64_array(lr, sizeof (*lr)); 2152 2153 ASSERT3U(lr->lr_doid, ==, ZTEST_DIROBJ); 2154 ASSERT3S(name[0], !=, '\0'); 2155 2156 VERIFY0( 2157 zap_lookup(os, lr->lr_doid, name, sizeof (object), 1, &object)); 2158 ASSERT3U(object, !=, 0); 2159 2160 ztest_object_lock(zd, object, ZTRL_WRITER); 2161 2162 VERIFY0(dmu_object_info(os, object, &doi)); 2163 2164 tx = dmu_tx_create(os); 2165 2166 dmu_tx_hold_zap(tx, lr->lr_doid, B_FALSE, name); 2167 dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END); 2168 2169 txg = ztest_tx_assign(tx, DMU_TX_WAIT, FTAG); 2170 if (txg == 0) { 2171 ztest_object_unlock(zd, object); 2172 return (ENOSPC); 2173 } 2174 2175 if (doi.doi_type == DMU_OT_ZAP_OTHER) { 2176 VERIFY0(zap_destroy(os, object, tx)); 2177 } else { 2178 VERIFY0(dmu_object_free(os, object, tx)); 2179 } 2180 2181 VERIFY0(zap_remove(os, lr->lr_doid, name, tx)); 2182 2183 (void) ztest_log_remove(zd, tx, lr, object); 2184 2185 dmu_tx_commit(tx); 2186 2187 ztest_object_unlock(zd, object); 2188 2189 return (0); 2190 } 2191 2192 static int 2193 ztest_replay_write(void *arg1, void *arg2, boolean_t byteswap) 2194 { 2195 ztest_ds_t *zd = arg1; 2196 lr_write_t *lr = arg2; 2197 objset_t *os = zd->zd_os; 2198 uint8_t *data = &lr->lr_data[0]; /* data follows lr */ 2199 uint64_t offset, length; 2200 ztest_block_tag_t *bt = (ztest_block_tag_t *)data; 2201 ztest_block_tag_t *bbt; 2202 uint64_t gen, txg, lrtxg, crtxg; 2203 dmu_object_info_t doi; 2204 dmu_tx_t *tx; 2205 dmu_buf_t *db; 2206 arc_buf_t *abuf = NULL; 2207 rl_t *rl; 2208 2209 if (byteswap) 2210 byteswap_uint64_array(lr, sizeof (*lr)); 2211 2212 offset = lr->lr_offset; 2213 length = lr->lr_length; 2214 2215 /* If it's a dmu_sync() block, write the whole block */ 2216 if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) { 2217 uint64_t blocksize = BP_GET_LSIZE(&lr->lr_blkptr); 2218 if (length < blocksize) { 2219 offset -= offset % blocksize; 2220 length = blocksize; 2221 } 2222 } 2223 2224 if (bt->bt_magic == BSWAP_64(BT_MAGIC)) 2225 byteswap_uint64_array(bt, sizeof (*bt)); 2226 2227 if (bt->bt_magic != BT_MAGIC) 2228 bt = NULL; 2229 2230 ztest_object_lock(zd, lr->lr_foid, ZTRL_READER); 2231 rl = ztest_range_lock(zd, lr->lr_foid, offset, length, ZTRL_WRITER); 2232 2233 VERIFY0(dmu_bonus_hold(os, lr->lr_foid, FTAG, &db)); 2234 2235 dmu_object_info_from_db(db, &doi); 2236 2237 bbt = ztest_bt_bonus(db); 2238 ASSERT3U(bbt->bt_magic, ==, BT_MAGIC); 2239 gen = bbt->bt_gen; 2240 crtxg = bbt->bt_crtxg; 2241 lrtxg = lr->lr_common.lrc_txg; 2242 2243 tx = dmu_tx_create(os); 2244 2245 dmu_tx_hold_write(tx, lr->lr_foid, offset, length); 2246 2247 if (ztest_random(8) == 0 && length == doi.doi_data_block_size && 2248 
P2PHASE(offset, length) == 0) 2249 abuf = dmu_request_arcbuf(db, length); 2250 2251 txg = ztest_tx_assign(tx, DMU_TX_WAIT, FTAG); 2252 if (txg == 0) { 2253 if (abuf != NULL) 2254 dmu_return_arcbuf(abuf); 2255 dmu_buf_rele(db, FTAG); 2256 ztest_range_unlock(rl); 2257 ztest_object_unlock(zd, lr->lr_foid); 2258 return (ENOSPC); 2259 } 2260 2261 if (bt != NULL) { 2262 /* 2263 * Usually, verify the old data before writing new data -- 2264 * but not always, because we also want to verify correct 2265 * behavior when the data was not recently read into cache. 2266 */ 2267 ASSERT(doi.doi_data_block_size); 2268 ASSERT0(offset % doi.doi_data_block_size); 2269 if (ztest_random(4) != 0) { 2270 dmu_flags_t flags = ztest_random(2) ? 2271 DMU_READ_PREFETCH : DMU_READ_NO_PREFETCH; 2272 2273 /* 2274 * We will randomly set when to do O_DIRECT on a read. 2275 */ 2276 if (ztest_random(4) == 0) 2277 flags |= DMU_DIRECTIO; 2278 2279 ztest_block_tag_t rbt; 2280 2281 VERIFY(dmu_read(os, lr->lr_foid, offset, 2282 sizeof (rbt), &rbt, flags) == 0); 2283 if (rbt.bt_magic == BT_MAGIC) { 2284 ztest_bt_verify(&rbt, os, lr->lr_foid, 0, 2285 offset, gen, txg, crtxg); 2286 } 2287 } 2288 2289 /* 2290 * Writes can appear to be newer than the bonus buffer because 2291 * the ztest_get_data() callback does a dmu_read() of the 2292 * open-context data, which may be different than the data 2293 * as it was when the write was generated. 2294 */ 2295 if (zd->zd_zilog->zl_replay) { 2296 ztest_bt_verify(bt, os, lr->lr_foid, 0, offset, 2297 MAX(gen, bt->bt_gen), MAX(txg, lrtxg), 2298 bt->bt_crtxg); 2299 } 2300 2301 /* 2302 * Set the bt's gen/txg to the bonus buffer's gen/txg 2303 * so that all of the usual ASSERTs will work. 2304 */ 2305 ztest_bt_generate(bt, os, lr->lr_foid, 0, offset, gen, txg, 2306 crtxg); 2307 } 2308 2309 if (abuf == NULL) { 2310 dmu_write(os, lr->lr_foid, offset, length, data, tx); 2311 } else { 2312 memcpy(abuf->b_data, data, length); 2313 VERIFY0(dmu_assign_arcbuf_by_dbuf(db, offset, abuf, tx, 0)); 2314 } 2315 2316 (void) ztest_log_write(zd, tx, lr); 2317 2318 dmu_buf_rele(db, FTAG); 2319 2320 dmu_tx_commit(tx); 2321 2322 ztest_range_unlock(rl); 2323 ztest_object_unlock(zd, lr->lr_foid); 2324 2325 return (0); 2326 } 2327 2328 static int 2329 ztest_replay_truncate(void *arg1, void *arg2, boolean_t byteswap) 2330 { 2331 ztest_ds_t *zd = arg1; 2332 lr_truncate_t *lr = arg2; 2333 objset_t *os = zd->zd_os; 2334 dmu_tx_t *tx; 2335 uint64_t txg; 2336 rl_t *rl; 2337 2338 if (byteswap) 2339 byteswap_uint64_array(lr, sizeof (*lr)); 2340 2341 ztest_object_lock(zd, lr->lr_foid, ZTRL_READER); 2342 rl = ztest_range_lock(zd, lr->lr_foid, lr->lr_offset, lr->lr_length, 2343 ZTRL_WRITER); 2344 2345 tx = dmu_tx_create(os); 2346 2347 dmu_tx_hold_free(tx, lr->lr_foid, lr->lr_offset, lr->lr_length); 2348 2349 txg = ztest_tx_assign(tx, DMU_TX_WAIT, FTAG); 2350 if (txg == 0) { 2351 ztest_range_unlock(rl); 2352 ztest_object_unlock(zd, lr->lr_foid); 2353 return (ENOSPC); 2354 } 2355 2356 VERIFY0(dmu_free_range(os, lr->lr_foid, lr->lr_offset, 2357 lr->lr_length, tx)); 2358 2359 (void) ztest_log_truncate(zd, tx, lr); 2360 2361 dmu_tx_commit(tx); 2362 2363 ztest_range_unlock(rl); 2364 ztest_object_unlock(zd, lr->lr_foid); 2365 2366 return (0); 2367 } 2368 2369 static int 2370 ztest_replay_setattr(void *arg1, void *arg2, boolean_t byteswap) 2371 { 2372 ztest_ds_t *zd = arg1; 2373 lr_setattr_t *lr = arg2; 2374 objset_t *os = zd->zd_os; 2375 dmu_tx_t *tx; 2376 dmu_buf_t *db; 2377 ztest_block_tag_t *bbt; 2378 uint64_t txg, lrtxg, crtxg, 
dnodesize; 2379 2380 if (byteswap) 2381 byteswap_uint64_array(lr, sizeof (*lr)); 2382 2383 ztest_object_lock(zd, lr->lr_foid, ZTRL_WRITER); 2384 2385 VERIFY0(dmu_bonus_hold(os, lr->lr_foid, FTAG, &db)); 2386 2387 tx = dmu_tx_create(os); 2388 dmu_tx_hold_bonus(tx, lr->lr_foid); 2389 2390 txg = ztest_tx_assign(tx, DMU_TX_WAIT, FTAG); 2391 if (txg == 0) { 2392 dmu_buf_rele(db, FTAG); 2393 ztest_object_unlock(zd, lr->lr_foid); 2394 return (ENOSPC); 2395 } 2396 2397 bbt = ztest_bt_bonus(db); 2398 ASSERT3U(bbt->bt_magic, ==, BT_MAGIC); 2399 crtxg = bbt->bt_crtxg; 2400 lrtxg = lr->lr_common.lrc_txg; 2401 dnodesize = bbt->bt_dnodesize; 2402 2403 if (zd->zd_zilog->zl_replay) { 2404 ASSERT3U(lr->lr_size, !=, 0); 2405 ASSERT3U(lr->lr_mode, !=, 0); 2406 ASSERT3U(lrtxg, !=, 0); 2407 } else { 2408 /* 2409 * Randomly change the size and increment the generation. 2410 */ 2411 lr->lr_size = (ztest_random(db->db_size / sizeof (*bbt)) + 1) * 2412 sizeof (*bbt); 2413 lr->lr_mode = bbt->bt_gen + 1; 2414 ASSERT0(lrtxg); 2415 } 2416 2417 /* 2418 * Verify that the current bonus buffer is not newer than our txg. 2419 */ 2420 ztest_bt_verify(bbt, os, lr->lr_foid, dnodesize, -1ULL, lr->lr_mode, 2421 MAX(txg, lrtxg), crtxg); 2422 2423 dmu_buf_will_dirty(db, tx); 2424 2425 ASSERT3U(lr->lr_size, >=, sizeof (*bbt)); 2426 ASSERT3U(lr->lr_size, <=, db->db_size); 2427 VERIFY0(dmu_set_bonus(db, lr->lr_size, tx)); 2428 bbt = ztest_bt_bonus(db); 2429 2430 ztest_bt_generate(bbt, os, lr->lr_foid, dnodesize, -1ULL, lr->lr_mode, 2431 txg, crtxg); 2432 ztest_fill_unused_bonus(db, bbt, lr->lr_foid, os, bbt->bt_gen); 2433 dmu_buf_rele(db, FTAG); 2434 2435 (void) ztest_log_setattr(zd, tx, lr); 2436 2437 dmu_tx_commit(tx); 2438 2439 ztest_object_unlock(zd, lr->lr_foid); 2440 2441 return (0); 2442 } 2443 2444 static zil_replay_func_t *ztest_replay_vector[TX_MAX_TYPE] = { 2445 NULL, /* 0 no such transaction type */ 2446 ztest_replay_create, /* TX_CREATE */ 2447 NULL, /* TX_MKDIR */ 2448 NULL, /* TX_MKXATTR */ 2449 NULL, /* TX_SYMLINK */ 2450 ztest_replay_remove, /* TX_REMOVE */ 2451 NULL, /* TX_RMDIR */ 2452 NULL, /* TX_LINK */ 2453 NULL, /* TX_RENAME */ 2454 ztest_replay_write, /* TX_WRITE */ 2455 ztest_replay_truncate, /* TX_TRUNCATE */ 2456 ztest_replay_setattr, /* TX_SETATTR */ 2457 NULL, /* TX_ACL */ 2458 NULL, /* TX_CREATE_ACL */ 2459 NULL, /* TX_CREATE_ATTR */ 2460 NULL, /* TX_CREATE_ACL_ATTR */ 2461 NULL, /* TX_MKDIR_ACL */ 2462 NULL, /* TX_MKDIR_ATTR */ 2463 NULL, /* TX_MKDIR_ACL_ATTR */ 2464 NULL, /* TX_WRITE2 */ 2465 NULL, /* TX_SETSAXATTR */ 2466 NULL, /* TX_RENAME_EXCHANGE */ 2467 NULL, /* TX_RENAME_WHITEOUT */ 2468 }; 2469 2470 /* 2471 * ZIL get_data callbacks 2472 */ 2473 2474 static void 2475 ztest_get_done(zgd_t *zgd, int error) 2476 { 2477 (void) error; 2478 ztest_ds_t *zd = zgd->zgd_private; 2479 uint64_t object = ((rl_t *)zgd->zgd_lr)->rl_object; 2480 2481 if (zgd->zgd_db) 2482 dmu_buf_rele(zgd->zgd_db, zgd); 2483 2484 ztest_range_unlock((rl_t *)zgd->zgd_lr); 2485 ztest_object_unlock(zd, object); 2486 2487 umem_free(zgd, sizeof (*zgd)); 2488 } 2489 2490 static int 2491 ztest_get_data(void *arg, uint64_t arg2, lr_write_t *lr, char *buf, 2492 struct lwb *lwb, zio_t *zio) 2493 { 2494 (void) arg2; 2495 ztest_ds_t *zd = arg; 2496 objset_t *os = zd->zd_os; 2497 uint64_t object = lr->lr_foid; 2498 uint64_t offset = lr->lr_offset; 2499 uint64_t size = lr->lr_length; 2500 uint64_t txg = lr->lr_common.lrc_txg; 2501 uint64_t crtxg; 2502 dmu_object_info_t doi; 2503 dmu_buf_t *db; 2504 zgd_t *zgd; 2505 int error; 2506 2507 
ASSERT3P(lwb, !=, NULL); 2508 ASSERT3U(size, !=, 0); 2509 2510 ztest_object_lock(zd, object, ZTRL_READER); 2511 error = dmu_bonus_hold(os, object, FTAG, &db); 2512 if (error) { 2513 ztest_object_unlock(zd, object); 2514 return (error); 2515 } 2516 2517 crtxg = ztest_bt_bonus(db)->bt_crtxg; 2518 2519 if (crtxg == 0 || crtxg > txg) { 2520 dmu_buf_rele(db, FTAG); 2521 ztest_object_unlock(zd, object); 2522 return (ENOENT); 2523 } 2524 2525 dmu_object_info_from_db(db, &doi); 2526 dmu_buf_rele(db, FTAG); 2527 db = NULL; 2528 2529 zgd = umem_zalloc(sizeof (*zgd), UMEM_NOFAIL); 2530 zgd->zgd_lwb = lwb; 2531 zgd->zgd_private = zd; 2532 2533 if (buf != NULL) { /* immediate write */ 2534 zgd->zgd_lr = (struct zfs_locked_range *)ztest_range_lock(zd, 2535 object, offset, size, ZTRL_READER); 2536 2537 error = dmu_read(os, object, offset, size, buf, 2538 DMU_READ_NO_PREFETCH | DMU_KEEP_CACHING); 2539 ASSERT0(error); 2540 } else { 2541 ASSERT3P(zio, !=, NULL); 2542 size = doi.doi_data_block_size; 2543 if (ISP2(size)) { 2544 offset = P2ALIGN_TYPED(offset, size, uint64_t); 2545 } else { 2546 ASSERT3U(offset, <, size); 2547 offset = 0; 2548 } 2549 2550 zgd->zgd_lr = (struct zfs_locked_range *)ztest_range_lock(zd, 2551 object, offset, size, ZTRL_READER); 2552 2553 error = dmu_buf_hold_noread(os, object, offset, zgd, &db); 2554 if (error == 0) { 2555 blkptr_t *bp = &lr->lr_blkptr; 2556 2557 zgd->zgd_db = db; 2558 zgd->zgd_bp = bp; 2559 2560 ASSERT3U(db->db_offset, ==, offset); 2561 ASSERT3U(db->db_size, ==, size); 2562 2563 error = dmu_sync(zio, lr->lr_common.lrc_txg, 2564 ztest_get_done, zgd); 2565 2566 if (error == 0) 2567 return (0); 2568 } 2569 } 2570 2571 ztest_get_done(zgd, error); 2572 2573 return (error); 2574 } 2575 2576 static void * 2577 ztest_lr_alloc(size_t lrsize, char *name) 2578 { 2579 char *lr; 2580 size_t namesize = name ? strlen(name) + 1 : 0; 2581 2582 lr = umem_zalloc(lrsize + namesize, UMEM_NOFAIL); 2583 2584 if (name) 2585 memcpy(lr + lrsize, name, namesize); 2586 2587 return (lr); 2588 } 2589 2590 static void 2591 ztest_lr_free(void *lr, size_t lrsize, char *name) 2592 { 2593 size_t namesize = name ? strlen(name) + 1 : 0; 2594 2595 umem_free(lr, lrsize + namesize); 2596 } 2597 2598 /* 2599 * Lookup a bunch of objects. Returns the number of objects not found. 
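* The caller must hold zd_dirobj_lock, and the objects are expected to be contiguous: once one is missing, no later object in the template should exist (see the "no gaps" assertion below).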
2600 */ 2601 static int 2602 ztest_lookup(ztest_ds_t *zd, ztest_od_t *od, int count) 2603 { 2604 int missing = 0; 2605 int error; 2606 int i; 2607 2608 ASSERT(MUTEX_HELD(&zd->zd_dirobj_lock)); 2609 2610 for (i = 0; i < count; i++, od++) { 2611 od->od_object = 0; 2612 error = zap_lookup(zd->zd_os, od->od_dir, od->od_name, 2613 sizeof (uint64_t), 1, &od->od_object); 2614 if (error) { 2615 ASSERT3S(error, ==, ENOENT); 2616 ASSERT0(od->od_object); 2617 missing++; 2618 } else { 2619 dmu_buf_t *db; 2620 ztest_block_tag_t *bbt; 2621 dmu_object_info_t doi; 2622 2623 ASSERT3U(od->od_object, !=, 0); 2624 ASSERT0(missing); /* there should be no gaps */ 2625 2626 ztest_object_lock(zd, od->od_object, ZTRL_READER); 2627 VERIFY0(dmu_bonus_hold(zd->zd_os, od->od_object, 2628 FTAG, &db)); 2629 dmu_object_info_from_db(db, &doi); 2630 bbt = ztest_bt_bonus(db); 2631 ASSERT3U(bbt->bt_magic, ==, BT_MAGIC); 2632 od->od_type = doi.doi_type; 2633 od->od_blocksize = doi.doi_data_block_size; 2634 od->od_gen = bbt->bt_gen; 2635 dmu_buf_rele(db, FTAG); 2636 ztest_object_unlock(zd, od->od_object); 2637 } 2638 } 2639 2640 return (missing); 2641 } 2642 2643 static int 2644 ztest_create(ztest_ds_t *zd, ztest_od_t *od, int count) 2645 { 2646 int missing = 0; 2647 int i; 2648 2649 ASSERT(MUTEX_HELD(&zd->zd_dirobj_lock)); 2650 2651 for (i = 0; i < count; i++, od++) { 2652 if (missing) { 2653 od->od_object = 0; 2654 missing++; 2655 continue; 2656 } 2657 2658 lr_create_t *lrc = ztest_lr_alloc(sizeof (*lrc), od->od_name); 2659 _lr_create_t *lr = &lrc->lr_create; 2660 2661 lr->lr_doid = od->od_dir; 2662 lr->lr_foid = 0; /* 0 to allocate, > 0 to claim */ 2663 lr->lrz_type = od->od_crtype; 2664 lr->lrz_blocksize = od->od_crblocksize; 2665 lr->lrz_ibshift = ztest_random_ibshift(); 2666 lr->lrz_bonustype = DMU_OT_UINT64_OTHER; 2667 lr->lrz_dnodesize = od->od_crdnodesize; 2668 lr->lr_gen = od->od_crgen; 2669 lr->lr_crtime[0] = time(NULL); 2670 2671 if (ztest_replay_create(zd, lr, B_FALSE) != 0) { 2672 ASSERT0(missing); 2673 od->od_object = 0; 2674 missing++; 2675 } else { 2676 od->od_object = lr->lr_foid; 2677 od->od_type = od->od_crtype; 2678 od->od_blocksize = od->od_crblocksize; 2679 od->od_gen = od->od_crgen; 2680 ASSERT3U(od->od_object, !=, 0); 2681 } 2682 2683 ztest_lr_free(lr, sizeof (*lr), od->od_name); 2684 } 2685 2686 return (missing); 2687 } 2688 2689 static int 2690 ztest_remove(ztest_ds_t *zd, ztest_od_t *od, int count) 2691 { 2692 int missing = 0; 2693 int error; 2694 int i; 2695 2696 ASSERT(MUTEX_HELD(&zd->zd_dirobj_lock)); 2697 2698 od += count - 1; 2699 2700 for (i = count - 1; i >= 0; i--, od--) { 2701 if (missing) { 2702 missing++; 2703 continue; 2704 } 2705 2706 /* 2707 * No object was found. 
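* (the preceding lookup or create left od_object at zero), so there is nothing to remove for this entry.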
2708 */ 2709 if (od->od_object == 0) 2710 continue; 2711 2712 lr_remove_t *lr = ztest_lr_alloc(sizeof (*lr), od->od_name); 2713 2714 lr->lr_doid = od->od_dir; 2715 2716 if ((error = ztest_replay_remove(zd, lr, B_FALSE)) != 0) { 2717 ASSERT3U(error, ==, ENOSPC); 2718 missing++; 2719 } else { 2720 od->od_object = 0; 2721 } 2722 ztest_lr_free(lr, sizeof (*lr), od->od_name); 2723 } 2724 2725 return (missing); 2726 } 2727 2728 static int 2729 ztest_write(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size, 2730 const void *data) 2731 { 2732 lr_write_t *lr; 2733 int error; 2734 2735 lr = ztest_lr_alloc(sizeof (*lr) + size, NULL); 2736 2737 lr->lr_foid = object; 2738 lr->lr_offset = offset; 2739 lr->lr_length = size; 2740 lr->lr_blkoff = 0; 2741 BP_ZERO(&lr->lr_blkptr); 2742 2743 memcpy(&lr->lr_data[0], data, size); 2744 2745 error = ztest_replay_write(zd, lr, B_FALSE); 2746 2747 ztest_lr_free(lr, sizeof (*lr) + size, NULL); 2748 2749 return (error); 2750 } 2751 2752 static int 2753 ztest_truncate(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size) 2754 { 2755 lr_truncate_t *lr; 2756 int error; 2757 2758 lr = ztest_lr_alloc(sizeof (*lr), NULL); 2759 2760 lr->lr_foid = object; 2761 lr->lr_offset = offset; 2762 lr->lr_length = size; 2763 2764 error = ztest_replay_truncate(zd, lr, B_FALSE); 2765 2766 ztest_lr_free(lr, sizeof (*lr), NULL); 2767 2768 return (error); 2769 } 2770 2771 static int 2772 ztest_setattr(ztest_ds_t *zd, uint64_t object) 2773 { 2774 lr_setattr_t *lr; 2775 int error; 2776 2777 lr = ztest_lr_alloc(sizeof (*lr), NULL); 2778 2779 lr->lr_foid = object; 2780 lr->lr_size = 0; 2781 lr->lr_mode = 0; 2782 2783 error = ztest_replay_setattr(zd, lr, B_FALSE); 2784 2785 ztest_lr_free(lr, sizeof (*lr), NULL); 2786 2787 return (error); 2788 } 2789 2790 static void 2791 ztest_prealloc(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size) 2792 { 2793 objset_t *os = zd->zd_os; 2794 dmu_tx_t *tx; 2795 uint64_t txg; 2796 rl_t *rl; 2797 2798 txg_wait_synced(dmu_objset_pool(os), 0); 2799 2800 ztest_object_lock(zd, object, ZTRL_READER); 2801 rl = ztest_range_lock(zd, object, offset, size, ZTRL_WRITER); 2802 2803 tx = dmu_tx_create(os); 2804 2805 dmu_tx_hold_write(tx, object, offset, size); 2806 2807 txg = ztest_tx_assign(tx, DMU_TX_WAIT, FTAG); 2808 2809 if (txg != 0) { 2810 dmu_prealloc(os, object, offset, size, tx); 2811 dmu_tx_commit(tx); 2812 txg_wait_synced(dmu_objset_pool(os), txg); 2813 } else { 2814 (void) dmu_free_long_range(os, object, offset, size); 2815 } 2816 2817 ztest_range_unlock(rl); 2818 ztest_object_unlock(zd, object); 2819 } 2820 2821 static void 2822 ztest_io(ztest_ds_t *zd, uint64_t object, uint64_t offset) 2823 { 2824 int err; 2825 ztest_block_tag_t wbt; 2826 dmu_object_info_t doi; 2827 enum ztest_io_type io_type; 2828 uint64_t blocksize; 2829 void *data; 2830 dmu_flags_t dmu_read_flags = DMU_READ_NO_PREFETCH; 2831 2832 /* 2833 * We will randomly set when to do O_DIRECT on a read. 2834 */ 2835 if (ztest_random(4) == 0) 2836 dmu_read_flags |= DMU_DIRECTIO; 2837 2838 VERIFY0(dmu_object_info(zd->zd_os, object, &doi)); 2839 blocksize = doi.doi_data_block_size; 2840 data = umem_alloc(blocksize, UMEM_NOFAIL); 2841 2842 /* 2843 * Pick an i/o type at random, biased toward writing block tags. 
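* Half of the time the choice is overridden to ZTEST_IO_WRITE_TAG; otherwise each of the ZTEST_IO_TYPES is equally likely.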
2844 */ 2845 io_type = ztest_random(ZTEST_IO_TYPES); 2846 if (ztest_random(2) == 0) 2847 io_type = ZTEST_IO_WRITE_TAG; 2848 2849 (void) pthread_rwlock_rdlock(&zd->zd_zilog_lock); 2850 2851 switch (io_type) { 2852 2853 case ZTEST_IO_WRITE_TAG: 2854 ztest_bt_generate(&wbt, zd->zd_os, object, doi.doi_dnodesize, 2855 offset, 0, 0, 0); 2856 (void) ztest_write(zd, object, offset, sizeof (wbt), &wbt); 2857 break; 2858 2859 case ZTEST_IO_WRITE_PATTERN: 2860 (void) memset(data, 'a' + (object + offset) % 5, blocksize); 2861 if (ztest_random(2) == 0) { 2862 /* 2863 * Induce fletcher2 collisions to ensure that 2864 * zio_ddt_collision() detects and resolves them 2865 * when using fletcher2-verify for deduplication. 2866 */ 2867 ((uint64_t *)data)[0] ^= 1ULL << 63; 2868 ((uint64_t *)data)[4] ^= 1ULL << 63; 2869 } 2870 (void) ztest_write(zd, object, offset, blocksize, data); 2871 break; 2872 2873 case ZTEST_IO_WRITE_ZEROES: 2874 memset(data, 0, blocksize); 2875 (void) ztest_write(zd, object, offset, blocksize, data); 2876 break; 2877 2878 case ZTEST_IO_TRUNCATE: 2879 (void) ztest_truncate(zd, object, offset, blocksize); 2880 break; 2881 2882 case ZTEST_IO_SETATTR: 2883 (void) ztest_setattr(zd, object); 2884 break; 2885 default: 2886 break; 2887 2888 case ZTEST_IO_REWRITE: 2889 (void) pthread_rwlock_rdlock(&ztest_name_lock); 2890 err = ztest_dsl_prop_set_uint64(zd->zd_name, 2891 ZFS_PROP_CHECKSUM, spa_dedup_checksum(ztest_spa), 2892 B_FALSE); 2893 ASSERT(err == 0 || err == ENOSPC); 2894 err = ztest_dsl_prop_set_uint64(zd->zd_name, 2895 ZFS_PROP_COMPRESSION, 2896 ztest_random_dsl_prop(ZFS_PROP_COMPRESSION), 2897 B_FALSE); 2898 ASSERT(err == 0 || err == ENOSPC); 2899 (void) pthread_rwlock_unlock(&ztest_name_lock); 2900 2901 VERIFY0(dmu_read(zd->zd_os, object, offset, blocksize, data, 2902 dmu_read_flags)); 2903 2904 (void) ztest_write(zd, object, offset, blocksize, data); 2905 break; 2906 } 2907 2908 (void) pthread_rwlock_unlock(&zd->zd_zilog_lock); 2909 2910 umem_free(data, blocksize); 2911 } 2912 2913 /* 2914 * Initialize an object description template. 2915 */ 2916 static void 2917 ztest_od_init(ztest_od_t *od, uint64_t id, const char *tag, uint64_t index, 2918 dmu_object_type_t type, uint64_t blocksize, uint64_t dnodesize, 2919 uint64_t gen) 2920 { 2921 od->od_dir = ZTEST_DIROBJ; 2922 od->od_object = 0; 2923 2924 od->od_crtype = type; 2925 od->od_crblocksize = blocksize ? blocksize : ztest_random_blocksize(); 2926 od->od_crdnodesize = dnodesize ? dnodesize : ztest_random_dnodesize(); 2927 od->od_crgen = gen; 2928 2929 od->od_type = DMU_OT_NONE; 2930 od->od_blocksize = 0; 2931 od->od_gen = 0; 2932 2933 (void) snprintf(od->od_name, sizeof (od->od_name), 2934 "%s(%"PRId64")[%"PRIu64"]", 2935 tag, id, index); 2936 } 2937 2938 /* 2939 * Lookup or create the objects for a test using the od template. 2940 * If the objects do not all exist, or if 'remove' is specified, 2941 * remove any existing objects and create new ones. Otherwise, 2942 * use the existing objects. 
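* Returns 0 on success and -1 if the full set could not be recreated (e.g. after ENOSPC), in which case the caller would typically skip its test.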
2943 */ 2944 static int 2945 ztest_object_init(ztest_ds_t *zd, ztest_od_t *od, size_t size, boolean_t remove) 2946 { 2947 int count = size / sizeof (*od); 2948 int rv = 0; 2949 2950 mutex_enter(&zd->zd_dirobj_lock); 2951 if ((ztest_lookup(zd, od, count) != 0 || remove) && 2952 (ztest_remove(zd, od, count) != 0 || 2953 ztest_create(zd, od, count) != 0)) 2954 rv = -1; 2955 zd->zd_od = od; 2956 mutex_exit(&zd->zd_dirobj_lock); 2957 2958 return (rv); 2959 } 2960 2961 void 2962 ztest_zil_commit(ztest_ds_t *zd, uint64_t id) 2963 { 2964 (void) id; 2965 zilog_t *zilog = zd->zd_zilog; 2966 2967 (void) pthread_rwlock_rdlock(&zd->zd_zilog_lock); 2968 2969 zil_commit(zilog, ztest_random(ZTEST_OBJECTS)); 2970 2971 /* 2972 * Remember the committed values in zd, which is in parent/child 2973 * shared memory. If we die, the next iteration of ztest_run() 2974 * will verify that the log really does contain this record. 2975 */ 2976 mutex_enter(&zilog->zl_lock); 2977 ASSERT3P(zd->zd_shared, !=, NULL); 2978 ASSERT3U(zd->zd_shared->zd_seq, <=, zilog->zl_commit_lr_seq); 2979 zd->zd_shared->zd_seq = zilog->zl_commit_lr_seq; 2980 mutex_exit(&zilog->zl_lock); 2981 2982 (void) pthread_rwlock_unlock(&zd->zd_zilog_lock); 2983 } 2984 2985 /* 2986 * This function is designed to simulate the operations that occur during a 2987 * mount/unmount operation. We hold the dataset across these operations in an 2988 * attempt to expose any implicit assumptions about ZIL management. 2989 */ 2990 void 2991 ztest_zil_remount(ztest_ds_t *zd, uint64_t id) 2992 { 2993 (void) id; 2994 objset_t *os = zd->zd_os; 2995 2996 /* 2997 * We hold the ztest_vdev_lock so we don't cause problems with 2998 * other threads that wish to remove a log device, such as 2999 * ztest_device_removal(). 3000 */ 3001 mutex_enter(&ztest_vdev_lock); 3002 3003 /* 3004 * We grab the zd_dirobj_lock to ensure that no other thread is 3005 * updating the zil (i.e. adding in-memory log records) and the 3006 * zd_zilog_lock to block any I/O. 3007 */ 3008 mutex_enter(&zd->zd_dirobj_lock); 3009 (void) pthread_rwlock_wrlock(&zd->zd_zilog_lock); 3010 3011 /* zfsvfs_teardown() */ 3012 zil_close(zd->zd_zilog); 3013 3014 /* zfsvfs_setup() */ 3015 VERIFY3P(zil_open(os, ztest_get_data, NULL), ==, zd->zd_zilog); 3016 zil_replay(os, zd, ztest_replay_vector); 3017 3018 (void) pthread_rwlock_unlock(&zd->zd_zilog_lock); 3019 mutex_exit(&zd->zd_dirobj_lock); 3020 mutex_exit(&ztest_vdev_lock); 3021 } 3022 3023 /* 3024 * Verify that we can't destroy an active pool, create an existing pool, 3025 * or create a pool with a bad vdev spec. 3026 */ 3027 void 3028 ztest_spa_create_destroy(ztest_ds_t *zd, uint64_t id) 3029 { 3030 (void) zd, (void) id; 3031 ztest_shared_opts_t *zo = &ztest_opts; 3032 spa_t *spa; 3033 nvlist_t *nvroot; 3034 3035 if (zo->zo_mmp_test) 3036 return; 3037 3038 /* 3039 * Attempt to create using a bad file. 3040 */ 3041 nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, NULL, 0, 0, 1); 3042 VERIFY3U(ENOENT, ==, 3043 spa_create("ztest_bad_file", nvroot, NULL, NULL, NULL)); 3044 fnvlist_free(nvroot); 3045 3046 /* 3047 * Attempt to create using a bad mirror. 3048 */ 3049 nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, NULL, 0, 2, 1); 3050 VERIFY3U(ENOENT, ==, 3051 spa_create("ztest_bad_mirror", nvroot, NULL, NULL, NULL)); 3052 fnvlist_free(nvroot); 3053 3054 /* 3055 * Attempt to create an existing pool. It shouldn't matter 3056 * what's in the nvroot; we should fail with EEXIST. 
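* The name lock is taken as reader across this check, presumably so that naming operations in other threads cannot race with it.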
3057 */ 3058 (void) pthread_rwlock_rdlock(&ztest_name_lock); 3059 nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, NULL, 0, 0, 1); 3060 VERIFY3U(EEXIST, ==, 3061 spa_create(zo->zo_pool, nvroot, NULL, NULL, NULL)); 3062 fnvlist_free(nvroot); 3063 3064 /* 3065 * We open a reference to the spa and then we try to export it 3066 * expecting one of the following errors: 3067 * 3068 * EBUSY 3069 * Because of the reference we just opened. 3070 * 3071 * ZFS_ERR_EXPORT_IN_PROGRESS 3072 * For the case that there is another ztest thread doing 3073 * an export concurrently. 3074 */ 3075 VERIFY0(spa_open(zo->zo_pool, &spa, FTAG)); 3076 int error = spa_destroy(zo->zo_pool); 3077 if (error != EBUSY && error != ZFS_ERR_EXPORT_IN_PROGRESS) { 3078 fatal(B_FALSE, "spa_destroy(%s) returned unexpected value %d", 3079 spa->spa_name, error); 3080 } 3081 spa_close(spa, FTAG); 3082 3083 (void) pthread_rwlock_unlock(&ztest_name_lock); 3084 } 3085 3086 /* 3087 * Start and then stop the MMP threads to ensure the startup and shutdown code 3088 * works properly. Actual protection and property-related code tested via ZTS. 3089 */ 3090 void 3091 ztest_mmp_enable_disable(ztest_ds_t *zd, uint64_t id) 3092 { 3093 (void) zd, (void) id; 3094 ztest_shared_opts_t *zo = &ztest_opts; 3095 spa_t *spa = ztest_spa; 3096 3097 if (zo->zo_mmp_test) 3098 return; 3099 3100 /* 3101 * Since enabling MMP involves setting a property, it could not be done 3102 * while the pool is suspended. 3103 */ 3104 if (spa_suspended(spa)) 3105 return; 3106 3107 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 3108 mutex_enter(&spa->spa_props_lock); 3109 3110 zfs_multihost_fail_intervals = 0; 3111 3112 if (!spa_multihost(spa)) { 3113 spa->spa_multihost = B_TRUE; 3114 mmp_thread_start(spa); 3115 } 3116 3117 mutex_exit(&spa->spa_props_lock); 3118 spa_config_exit(spa, SCL_CONFIG, FTAG); 3119 3120 txg_wait_synced(spa_get_dsl(spa), 0); 3121 mmp_signal_all_threads(); 3122 txg_wait_synced(spa_get_dsl(spa), 0); 3123 3124 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 3125 mutex_enter(&spa->spa_props_lock); 3126 3127 if (spa_multihost(spa)) { 3128 mmp_thread_stop(spa); 3129 spa->spa_multihost = B_FALSE; 3130 } 3131 3132 mutex_exit(&spa->spa_props_lock); 3133 spa_config_exit(spa, SCL_CONFIG, FTAG); 3134 } 3135 3136 static int 3137 ztest_get_raidz_children(spa_t *spa) 3138 { 3139 (void) spa; 3140 vdev_t *raidvd; 3141 3142 ASSERT(MUTEX_HELD(&ztest_vdev_lock)); 3143 3144 if (ztest_opts.zo_raid_do_expand) { 3145 raidvd = ztest_spa->spa_root_vdev->vdev_child[0]; 3146 3147 ASSERT(raidvd->vdev_ops == &vdev_raidz_ops); 3148 3149 return (raidvd->vdev_children); 3150 } 3151 3152 return (ztest_opts.zo_raid_children); 3153 } 3154 3155 void 3156 ztest_spa_upgrade(ztest_ds_t *zd, uint64_t id) 3157 { 3158 (void) zd, (void) id; 3159 spa_t *spa; 3160 uint64_t initial_version = SPA_VERSION_INITIAL; 3161 uint64_t raidz_children, version, newversion; 3162 nvlist_t *nvroot, *props; 3163 char *name; 3164 3165 if (ztest_opts.zo_mmp_test) 3166 return; 3167 3168 /* dRAID added after feature flags, skip upgrade test. */ 3169 if (strcmp(ztest_opts.zo_raid_type, VDEV_TYPE_DRAID) == 0) 3170 return; 3171 3172 mutex_enter(&ztest_vdev_lock); 3173 name = kmem_asprintf("%s_upgrade", ztest_opts.zo_pool); 3174 3175 /* 3176 * Clean up from previous runs. 
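* A prior invocation may have been killed while its temporary "<pool>_upgrade" pool still existed, so destroy any leftover (errors ignored) before recreating it.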
3177 */ 3178 (void) spa_destroy(name); 3179 3180 raidz_children = ztest_get_raidz_children(ztest_spa); 3181 3182 nvroot = make_vdev_root(NULL, NULL, name, ztest_opts.zo_vdev_size, 0, 3183 NULL, raidz_children, ztest_opts.zo_mirrors, 1); 3184 3185 /* 3186 * If we're configuring a RAIDZ device then make sure that the 3187 * initial version is capable of supporting that feature. 3188 */ 3189 switch (ztest_opts.zo_raid_parity) { 3190 case 0: 3191 case 1: 3192 initial_version = SPA_VERSION_INITIAL; 3193 break; 3194 case 2: 3195 initial_version = SPA_VERSION_RAIDZ2; 3196 break; 3197 case 3: 3198 initial_version = SPA_VERSION_RAIDZ3; 3199 break; 3200 } 3201 3202 /* 3203 * Create a pool with a spa version that can be upgraded. Pick 3204 * a value between initial_version and SPA_VERSION_BEFORE_FEATURES. 3205 */ 3206 do { 3207 version = ztest_random_spa_version(initial_version); 3208 } while (version > SPA_VERSION_BEFORE_FEATURES); 3209 3210 props = fnvlist_alloc(); 3211 fnvlist_add_uint64(props, 3212 zpool_prop_to_name(ZPOOL_PROP_VERSION), version); 3213 VERIFY0(spa_create(name, nvroot, props, NULL, NULL)); 3214 fnvlist_free(nvroot); 3215 fnvlist_free(props); 3216 3217 VERIFY0(spa_open(name, &spa, FTAG)); 3218 VERIFY3U(spa_version(spa), ==, version); 3219 newversion = ztest_random_spa_version(version + 1); 3220 3221 if (ztest_opts.zo_verbose >= 4) { 3222 (void) printf("upgrading spa version from " 3223 "%"PRIu64" to %"PRIu64"\n", 3224 version, newversion); 3225 } 3226 3227 spa_upgrade(spa, newversion); 3228 VERIFY3U(spa_version(spa), >, version); 3229 VERIFY3U(spa_version(spa), ==, fnvlist_lookup_uint64(spa->spa_config, 3230 zpool_prop_to_name(ZPOOL_PROP_VERSION))); 3231 spa_close(spa, FTAG); 3232 3233 kmem_strfree(name); 3234 mutex_exit(&ztest_vdev_lock); 3235 } 3236 3237 static void 3238 ztest_spa_checkpoint(spa_t *spa) 3239 { 3240 ASSERT(MUTEX_HELD(&ztest_checkpoint_lock)); 3241 3242 int error = spa_checkpoint(spa->spa_name); 3243 3244 switch (error) { 3245 case 0: 3246 case ZFS_ERR_DEVRM_IN_PROGRESS: 3247 case ZFS_ERR_DISCARDING_CHECKPOINT: 3248 case ZFS_ERR_CHECKPOINT_EXISTS: 3249 case ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS: 3250 break; 3251 case ENOSPC: 3252 ztest_record_enospc(FTAG); 3253 break; 3254 default: 3255 fatal(B_FALSE, "spa_checkpoint(%s) = %d", spa->spa_name, error); 3256 } 3257 } 3258 3259 static void 3260 ztest_spa_discard_checkpoint(spa_t *spa) 3261 { 3262 ASSERT(MUTEX_HELD(&ztest_checkpoint_lock)); 3263 3264 int error = spa_checkpoint_discard(spa->spa_name); 3265 3266 switch (error) { 3267 case 0: 3268 case ZFS_ERR_DISCARDING_CHECKPOINT: 3269 case ZFS_ERR_NO_CHECKPOINT: 3270 break; 3271 default: 3272 fatal(B_FALSE, "spa_discard_checkpoint(%s) = %d", 3273 spa->spa_name, error); 3274 } 3275 3276 } 3277 3278 void 3279 ztest_spa_checkpoint_create_discard(ztest_ds_t *zd, uint64_t id) 3280 { 3281 (void) zd, (void) id; 3282 spa_t *spa = ztest_spa; 3283 3284 mutex_enter(&ztest_checkpoint_lock); 3285 if (ztest_random(2) == 0) { 3286 ztest_spa_checkpoint(spa); 3287 } else { 3288 ztest_spa_discard_checkpoint(spa); 3289 } 3290 mutex_exit(&ztest_checkpoint_lock); 3291 } 3292 3293 3294 static vdev_t * 3295 vdev_lookup_by_path(vdev_t *vd, const char *path) 3296 { 3297 vdev_t *mvd; 3298 int c; 3299 3300 if (vd->vdev_path != NULL && strcmp(path, vd->vdev_path) == 0) 3301 return (vd); 3302 3303 for (c = 0; c < vd->vdev_children; c++) 3304 if ((mvd = vdev_lookup_by_path(vd->vdev_child[c], path)) != 3305 NULL) 3306 return (mvd); 3307 3308 return (NULL); 3309 } 3310 3311 static int 3312 
spa_num_top_vdevs(spa_t *spa) 3313 { 3314 vdev_t *rvd = spa->spa_root_vdev; 3315 ASSERT3U(spa_config_held(spa, SCL_VDEV, RW_READER), ==, SCL_VDEV); 3316 return (rvd->vdev_children); 3317 } 3318 3319 /* 3320 * Verify that vdev_add() works as expected. 3321 */ 3322 void 3323 ztest_vdev_add_remove(ztest_ds_t *zd, uint64_t id) 3324 { 3325 (void) zd, (void) id; 3326 ztest_shared_t *zs = ztest_shared; 3327 spa_t *spa = ztest_spa; 3328 uint64_t leaves; 3329 uint64_t guid; 3330 uint64_t raidz_children; 3331 3332 nvlist_t *nvroot; 3333 int error; 3334 3335 if (ztest_opts.zo_mmp_test) 3336 return; 3337 3338 mutex_enter(&ztest_vdev_lock); 3339 raidz_children = ztest_get_raidz_children(spa); 3340 leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * raidz_children; 3341 3342 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 3343 3344 ztest_shared->zs_vdev_next_leaf = spa_num_top_vdevs(spa) * leaves; 3345 3346 /* 3347 * If we have slogs then remove them 1/4 of the time. 3348 */ 3349 if (spa_has_slogs(spa) && ztest_random(4) == 0) { 3350 metaslab_group_t *mg; 3351 3352 /* 3353 * find the first real slog in log allocation class 3354 */ 3355 mg = spa_log_class(spa)->mc_allocator[0].mca_rotor; 3356 while (!mg->mg_vd->vdev_islog) 3357 mg = mg->mg_next; 3358 3359 guid = mg->mg_vd->vdev_guid; 3360 3361 spa_config_exit(spa, SCL_VDEV, FTAG); 3362 3363 /* 3364 * We have to grab the zs_name_lock as writer to 3365 * prevent a race between removing a slog (dmu_objset_find) 3366 * and destroying a dataset. Removing the slog will 3367 * grab a reference on the dataset which may cause 3368 * dsl_destroy_head() to fail with EBUSY thus 3369 * leaving the dataset in an inconsistent state. 3370 */ 3371 pthread_rwlock_wrlock(&ztest_name_lock); 3372 error = spa_vdev_remove(spa, guid, B_FALSE); 3373 pthread_rwlock_unlock(&ztest_name_lock); 3374 3375 switch (error) { 3376 case 0: 3377 case EEXIST: /* Generic zil_reset() error */ 3378 case EBUSY: /* Replay required */ 3379 case EACCES: /* Crypto key not loaded */ 3380 case ZFS_ERR_CHECKPOINT_EXISTS: 3381 case ZFS_ERR_DISCARDING_CHECKPOINT: 3382 break; 3383 default: 3384 fatal(B_FALSE, "spa_vdev_remove() = %d", error); 3385 } 3386 } else { 3387 spa_config_exit(spa, SCL_VDEV, FTAG); 3388 3389 /* 3390 * Make 1/4 of the devices be log devices 3391 */ 3392 nvroot = make_vdev_root(NULL, NULL, NULL, 3393 ztest_opts.zo_vdev_size, 0, (ztest_random(4) == 0) ? 3394 "log" : NULL, raidz_children, zs->zs_mirrors, 3395 1); 3396 3397 error = spa_vdev_add(spa, nvroot, B_FALSE); 3398 fnvlist_free(nvroot); 3399 3400 switch (error) { 3401 case 0: 3402 break; 3403 case ENOSPC: 3404 ztest_record_enospc("spa_vdev_add"); 3405 break; 3406 default: 3407 fatal(B_FALSE, "spa_vdev_add() = %d", error); 3408 } 3409 } 3410 3411 mutex_exit(&ztest_vdev_lock); 3412 } 3413 3414 void 3415 ztest_vdev_class_add(ztest_ds_t *zd, uint64_t id) 3416 { 3417 (void) zd, (void) id; 3418 ztest_shared_t *zs = ztest_shared; 3419 spa_t *spa = ztest_spa; 3420 uint64_t leaves; 3421 nvlist_t *nvroot; 3422 uint64_t raidz_children; 3423 const char *class = (ztest_random(2) == 0) ? 
3424 VDEV_ALLOC_BIAS_SPECIAL : VDEV_ALLOC_BIAS_DEDUP; 3425 int error; 3426 3427 /* 3428 * By default add a special vdev 50% of the time 3429 */ 3430 if ((ztest_opts.zo_special_vdevs == ZTEST_VDEV_CLASS_OFF) || 3431 (ztest_opts.zo_special_vdevs == ZTEST_VDEV_CLASS_RND && 3432 ztest_random(2) == 0)) { 3433 return; 3434 } 3435 3436 mutex_enter(&ztest_vdev_lock); 3437 3438 /* Only test with mirrors */ 3439 if (zs->zs_mirrors < 2) { 3440 mutex_exit(&ztest_vdev_lock); 3441 return; 3442 } 3443 3444 /* requires feature@allocation_classes */ 3445 if (!spa_feature_is_enabled(spa, SPA_FEATURE_ALLOCATION_CLASSES)) { 3446 mutex_exit(&ztest_vdev_lock); 3447 return; 3448 } 3449 3450 raidz_children = ztest_get_raidz_children(spa); 3451 leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * raidz_children; 3452 3453 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 3454 ztest_shared->zs_vdev_next_leaf = spa_num_top_vdevs(spa) * leaves; 3455 spa_config_exit(spa, SCL_VDEV, FTAG); 3456 3457 nvroot = make_vdev_root(NULL, NULL, NULL, ztest_opts.zo_vdev_size, 0, 3458 class, raidz_children, zs->zs_mirrors, 1); 3459 3460 error = spa_vdev_add(spa, nvroot, B_FALSE); 3461 fnvlist_free(nvroot); 3462 3463 if (error == ENOSPC) 3464 ztest_record_enospc("spa_vdev_add"); 3465 else if (error != 0) 3466 fatal(B_FALSE, "spa_vdev_add() = %d", error); 3467 3468 /* 3469 * 50% of the time allow small blocks in the special class 3470 */ 3471 if (error == 0 && 3472 spa_special_class(spa)->mc_groups == 1 && ztest_random(2) == 0) { 3473 if (ztest_opts.zo_verbose >= 3) 3474 (void) printf("Enabling special VDEV small blocks\n"); 3475 error = ztest_dsl_prop_set_uint64(zd->zd_name, 3476 ZFS_PROP_SPECIAL_SMALL_BLOCKS, 32768, B_FALSE); 3477 ASSERT(error == 0 || error == ENOSPC); 3478 } 3479 3480 mutex_exit(&ztest_vdev_lock); 3481 3482 if (ztest_opts.zo_verbose >= 3) { 3483 metaslab_class_t *mc; 3484 3485 if (strcmp(class, VDEV_ALLOC_BIAS_SPECIAL) == 0) 3486 mc = spa_special_class(spa); 3487 else 3488 mc = spa_dedup_class(spa); 3489 (void) printf("Added a %s mirrored vdev (of %d)\n", 3490 class, (int)mc->mc_groups); 3491 } 3492 } 3493 3494 /* 3495 * Verify that adding/removing aux devices (l2arc, hot spare) works as expected. 3496 */ 3497 void 3498 ztest_vdev_aux_add_remove(ztest_ds_t *zd, uint64_t id) 3499 { 3500 (void) zd, (void) id; 3501 ztest_shared_t *zs = ztest_shared; 3502 spa_t *spa = ztest_spa; 3503 vdev_t *rvd = spa->spa_root_vdev; 3504 spa_aux_vdev_t *sav; 3505 const char *aux; 3506 char *path; 3507 uint64_t guid = 0; 3508 int error, ignore_err = 0; 3509 3510 if (ztest_opts.zo_mmp_test) 3511 return; 3512 3513 path = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); 3514 3515 if (ztest_random(2) == 0) { 3516 sav = &spa->spa_spares; 3517 aux = ZPOOL_CONFIG_SPARES; 3518 } else { 3519 sav = &spa->spa_l2cache; 3520 aux = ZPOOL_CONFIG_L2CACHE; 3521 } 3522 3523 mutex_enter(&ztest_vdev_lock); 3524 3525 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 3526 3527 if (sav->sav_count != 0 && ztest_random(4) == 0) { 3528 /* 3529 * Pick a random device to remove. 3530 */ 3531 vdev_t *svd = sav->sav_vdevs[ztest_random(sav->sav_count)]; 3532 3533 /* dRAID spares cannot be removed; try anyways to see ENOTSUP */ 3534 if (strstr(svd->vdev_path, VDEV_TYPE_DRAID) != NULL) 3535 ignore_err = ENOTSUP; 3536 3537 guid = svd->vdev_guid; 3538 } else { 3539 /* 3540 * Find an unused device we can add. 
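* Iterate candidate aux paths (indexed by zs_vdev_aux) until we find one that is neither already an aux vdev nor present anywhere in the main vdev tree.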
3541 */ 3542 zs->zs_vdev_aux = 0; 3543 for (;;) { 3544 int c; 3545 (void) snprintf(path, MAXPATHLEN, ztest_aux_template, 3546 ztest_opts.zo_dir, ztest_opts.zo_pool, aux, 3547 zs->zs_vdev_aux); 3548 for (c = 0; c < sav->sav_count; c++) 3549 if (strcmp(sav->sav_vdevs[c]->vdev_path, 3550 path) == 0) 3551 break; 3552 if (c == sav->sav_count && 3553 vdev_lookup_by_path(rvd, path) == NULL) 3554 break; 3555 zs->zs_vdev_aux++; 3556 } 3557 } 3558 3559 spa_config_exit(spa, SCL_VDEV, FTAG); 3560 3561 if (guid == 0) { 3562 /* 3563 * Add a new device. 3564 */ 3565 nvlist_t *nvroot = make_vdev_root(NULL, aux, NULL, 3566 (ztest_opts.zo_vdev_size * 5) / 4, 0, NULL, 0, 0, 1); 3567 error = spa_vdev_add(spa, nvroot, B_FALSE); 3568 3569 switch (error) { 3570 case 0: 3571 break; 3572 default: 3573 fatal(B_FALSE, "spa_vdev_add(%p) = %d", nvroot, error); 3574 } 3575 fnvlist_free(nvroot); 3576 } else { 3577 /* 3578 * Remove an existing device. Sometimes, dirty its 3579 * vdev state first to make sure we handle removal 3580 * of devices that have pending state changes. 3581 */ 3582 if (ztest_random(2) == 0) 3583 (void) vdev_online(spa, guid, 0, NULL); 3584 3585 error = spa_vdev_remove(spa, guid, B_FALSE); 3586 3587 switch (error) { 3588 case 0: 3589 case EBUSY: 3590 case ZFS_ERR_CHECKPOINT_EXISTS: 3591 case ZFS_ERR_DISCARDING_CHECKPOINT: 3592 break; 3593 default: 3594 if (error != ignore_err) 3595 fatal(B_FALSE, 3596 "spa_vdev_remove(%"PRIu64") = %d", 3597 guid, error); 3598 } 3599 } 3600 3601 mutex_exit(&ztest_vdev_lock); 3602 3603 umem_free(path, MAXPATHLEN); 3604 } 3605 3606 /* 3607 * split a pool if it has mirror tlvdevs 3608 */ 3609 void 3610 ztest_split_pool(ztest_ds_t *zd, uint64_t id) 3611 { 3612 (void) zd, (void) id; 3613 ztest_shared_t *zs = ztest_shared; 3614 spa_t *spa = ztest_spa; 3615 vdev_t *rvd = spa->spa_root_vdev; 3616 nvlist_t *tree, **child, *config, *split, **schild; 3617 uint_t c, children, schildren = 0, lastlogid = 0; 3618 int error = 0; 3619 3620 if (ztest_opts.zo_mmp_test) 3621 return; 3622 3623 mutex_enter(&ztest_vdev_lock); 3624 3625 /* ensure we have a usable config; mirrors of raidz aren't supported */ 3626 if (zs->zs_mirrors < 3 || ztest_opts.zo_raid_children > 1) { 3627 mutex_exit(&ztest_vdev_lock); 3628 return; 3629 } 3630 3631 /* clean up the old pool, if any */ 3632 (void) spa_destroy("splitp"); 3633 3634 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 3635 3636 /* generate a config from the existing config */ 3637 mutex_enter(&spa->spa_props_lock); 3638 tree = fnvlist_lookup_nvlist(spa->spa_config, ZPOOL_CONFIG_VDEV_TREE); 3639 mutex_exit(&spa->spa_props_lock); 3640 3641 VERIFY0(nvlist_lookup_nvlist_array(tree, ZPOOL_CONFIG_CHILDREN, 3642 &child, &children)); 3643 3644 schild = umem_alloc(rvd->vdev_children * sizeof (nvlist_t *), 3645 UMEM_NOFAIL); 3646 for (c = 0; c < children; c++) { 3647 vdev_t *tvd = rvd->vdev_child[c]; 3648 nvlist_t **mchild; 3649 uint_t mchildren; 3650 3651 if (tvd->vdev_islog || tvd->vdev_ops == &vdev_hole_ops) { 3652 schild[schildren] = fnvlist_alloc(); 3653 fnvlist_add_string(schild[schildren], 3654 ZPOOL_CONFIG_TYPE, VDEV_TYPE_HOLE); 3655 fnvlist_add_uint64(schild[schildren], 3656 ZPOOL_CONFIG_IS_HOLE, 1); 3657 if (lastlogid == 0) 3658 lastlogid = schildren; 3659 ++schildren; 3660 continue; 3661 } 3662 lastlogid = 0; 3663 VERIFY0(nvlist_lookup_nvlist_array(child[c], 3664 ZPOOL_CONFIG_CHILDREN, &mchild, &mchildren)); 3665 schild[schildren++] = fnvlist_dup(mchild[0]); 3666 } 3667 3668 /* OK, create a config that can be used to split */ 3669 split = 
fnvlist_alloc(); 3670 fnvlist_add_string(split, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT); 3671 fnvlist_add_nvlist_array(split, ZPOOL_CONFIG_CHILDREN, 3672 (const nvlist_t **)schild, lastlogid != 0 ? lastlogid : schildren); 3673 3674 config = fnvlist_alloc(); 3675 fnvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, split); 3676 3677 for (c = 0; c < schildren; c++) 3678 fnvlist_free(schild[c]); 3679 umem_free(schild, rvd->vdev_children * sizeof (nvlist_t *)); 3680 fnvlist_free(split); 3681 3682 spa_config_exit(spa, SCL_VDEV, FTAG); 3683 3684 (void) pthread_rwlock_wrlock(&ztest_name_lock); 3685 error = spa_vdev_split_mirror(spa, "splitp", config, NULL, B_FALSE); 3686 (void) pthread_rwlock_unlock(&ztest_name_lock); 3687 3688 fnvlist_free(config); 3689 3690 if (error == 0) { 3691 (void) printf("successful split - results:\n"); 3692 mutex_enter(&spa_namespace_lock); 3693 show_pool_stats(spa); 3694 show_pool_stats(spa_lookup("splitp")); 3695 mutex_exit(&spa_namespace_lock); 3696 ++zs->zs_splits; 3697 --zs->zs_mirrors; 3698 } 3699 mutex_exit(&ztest_vdev_lock); 3700 } 3701 3702 /* 3703 * Verify that we can attach and detach devices. 3704 */ 3705 void 3706 ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id) 3707 { 3708 (void) zd, (void) id; 3709 ztest_shared_t *zs = ztest_shared; 3710 spa_t *spa = ztest_spa; 3711 spa_aux_vdev_t *sav = &spa->spa_spares; 3712 vdev_t *rvd = spa->spa_root_vdev; 3713 vdev_t *oldvd, *newvd, *pvd; 3714 nvlist_t *root; 3715 uint64_t leaves; 3716 uint64_t leaf, top; 3717 uint64_t ashift = ztest_get_ashift(); 3718 uint64_t oldguid, pguid; 3719 uint64_t oldsize, newsize; 3720 uint64_t raidz_children; 3721 char *oldpath, *newpath; 3722 int replacing; 3723 int oldvd_has_siblings = B_FALSE; 3724 int newvd_is_spare = B_FALSE; 3725 int newvd_is_dspare = B_FALSE; 3726 int oldvd_is_log; 3727 int oldvd_is_special; 3728 int error, expected_error; 3729 3730 if (ztest_opts.zo_mmp_test) 3731 return; 3732 3733 oldpath = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); 3734 newpath = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); 3735 3736 mutex_enter(&ztest_vdev_lock); 3737 raidz_children = ztest_get_raidz_children(spa); 3738 leaves = MAX(zs->zs_mirrors, 1) * raidz_children; 3739 3740 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3741 3742 /* 3743 * If a vdev is in the process of being removed, its removal may 3744 * finish while we are in progress, leading to an unexpected error 3745 * value. Don't bother trying to attach while we are in the middle 3746 * of removal. 3747 */ 3748 if (ztest_device_removal_active) { 3749 spa_config_exit(spa, SCL_ALL, FTAG); 3750 goto out; 3751 } 3752 3753 /* 3754 * RAIDZ leaf VDEV mirrors are not currently supported while a 3755 * RAIDZ expansion is in progress. 3756 */ 3757 if (ztest_opts.zo_raid_do_expand) { 3758 spa_config_exit(spa, SCL_ALL, FTAG); 3759 goto out; 3760 } 3761 3762 /* 3763 * Decide whether to do an attach or a replace. 3764 */ 3765 replacing = ztest_random(2); 3766 3767 /* 3768 * Pick a random top-level vdev. 3769 */ 3770 top = ztest_random_vdev_top(spa, B_TRUE); 3771 3772 /* 3773 * Pick a random leaf within it. 3774 */ 3775 leaf = ztest_random(leaves); 3776 3777 /* 3778 * Locate this vdev. 
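* Starting from the selected top-level vdev, descend through the mirror and raidz/draid layers (and, below, any interior vdev left by an in-progress attach or replace) to reach a leaf.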
3779 */ 3780 oldvd = rvd->vdev_child[top]; 3781 3782 /* pick a child from the mirror */ 3783 if (zs->zs_mirrors >= 1) { 3784 ASSERT3P(oldvd->vdev_ops, ==, &vdev_mirror_ops); 3785 ASSERT3U(oldvd->vdev_children, >=, zs->zs_mirrors); 3786 oldvd = oldvd->vdev_child[leaf / raidz_children]; 3787 } 3788 3789 /* pick a child out of the raidz group */ 3790 if (ztest_opts.zo_raid_children > 1) { 3791 if (strcmp(oldvd->vdev_ops->vdev_op_type, "raidz") == 0) 3792 ASSERT3P(oldvd->vdev_ops, ==, &vdev_raidz_ops); 3793 else 3794 ASSERT3P(oldvd->vdev_ops, ==, &vdev_draid_ops); 3795 oldvd = oldvd->vdev_child[leaf % raidz_children]; 3796 } 3797 3798 /* 3799 * If we're already doing an attach or replace, oldvd may be a 3800 * mirror vdev -- in which case, pick a random child. 3801 */ 3802 while (oldvd->vdev_children != 0) { 3803 oldvd_has_siblings = B_TRUE; 3804 ASSERT3U(oldvd->vdev_children, >=, 2); 3805 oldvd = oldvd->vdev_child[ztest_random(oldvd->vdev_children)]; 3806 } 3807 3808 oldguid = oldvd->vdev_guid; 3809 oldsize = vdev_get_min_asize(oldvd); 3810 oldvd_is_log = oldvd->vdev_top->vdev_islog; 3811 oldvd_is_special = 3812 oldvd->vdev_top->vdev_alloc_bias == VDEV_BIAS_SPECIAL || 3813 oldvd->vdev_top->vdev_alloc_bias == VDEV_BIAS_DEDUP; 3814 (void) strlcpy(oldpath, oldvd->vdev_path, MAXPATHLEN); 3815 pvd = oldvd->vdev_parent; 3816 pguid = pvd->vdev_guid; 3817 3818 /* 3819 * If oldvd has siblings, then half of the time, detach it. Prior 3820 * to the detach the pool is scrubbed in order to prevent creating 3821 * unrepairable blocks as a result of the data corruption injection. 3822 */ 3823 if (oldvd_has_siblings && ztest_random(2) == 0) { 3824 spa_config_exit(spa, SCL_ALL, FTAG); 3825 3826 error = ztest_scrub_impl(spa); 3827 if (error) 3828 goto out; 3829 3830 error = spa_vdev_detach(spa, oldguid, pguid, B_FALSE); 3831 if (error != 0 && error != ENODEV && error != EBUSY && 3832 error != ENOTSUP && error != ZFS_ERR_CHECKPOINT_EXISTS && 3833 error != ZFS_ERR_DISCARDING_CHECKPOINT) 3834 fatal(B_FALSE, "detach (%s) returned %d", 3835 oldpath, error); 3836 goto out; 3837 } 3838 3839 /* 3840 * For the new vdev, choose with equal probability between the two 3841 * standard paths (ending in either 'a' or 'b') or a random hot spare. 3842 */ 3843 if (sav->sav_count != 0 && ztest_random(3) == 0) { 3844 newvd = sav->sav_vdevs[ztest_random(sav->sav_count)]; 3845 newvd_is_spare = B_TRUE; 3846 3847 if (newvd->vdev_ops == &vdev_draid_spare_ops) 3848 newvd_is_dspare = B_TRUE; 3849 3850 (void) strlcpy(newpath, newvd->vdev_path, MAXPATHLEN); 3851 } else { 3852 (void) snprintf(newpath, MAXPATHLEN, ztest_dev_template, 3853 ztest_opts.zo_dir, ztest_opts.zo_pool, 3854 top * leaves + leaf); 3855 if (ztest_random(2) == 0) 3856 newpath[strlen(newpath) - 1] = 'b'; 3857 newvd = vdev_lookup_by_path(rvd, newpath); 3858 } 3859 3860 if (newvd) { 3861 /* 3862 * Reopen to ensure the vdev's asize field isn't stale. 3863 */ 3864 vdev_reopen(newvd); 3865 newsize = vdev_get_min_asize(newvd); 3866 } else { 3867 /* 3868 * Make newsize a little bigger or smaller than oldsize. 3869 * If it's smaller, the attach should fail. 3870 * If it's larger, and we're doing a replace, 3871 * we should get dynamic LUN growth when we're done. 3872 */ 3873 newsize = 10 * oldsize / (9 + ztest_random(3)); 3874 } 3875 3876 /* 3877 * If pvd is not a mirror or root, the attach should fail with ENOTSUP, 3878 * unless it's a replace; in that case any non-replacing parent is OK. 3879 * 3880 * If newvd is already part of the pool, it should fail with EBUSY. 
3881 * 3882 * If newvd is too small, it should fail with EOVERFLOW. 3883 * 3884 * If newvd is a distributed spare and it's being attached to a 3885 * dRAID which is not its parent it should fail with ENOTSUP. 3886 */ 3887 if (pvd->vdev_ops != &vdev_mirror_ops && 3888 pvd->vdev_ops != &vdev_root_ops && (!replacing || 3889 pvd->vdev_ops == &vdev_replacing_ops || 3890 pvd->vdev_ops == &vdev_spare_ops)) 3891 expected_error = ENOTSUP; 3892 else if (newvd_is_spare && 3893 (!replacing || oldvd_is_log || oldvd_is_special)) 3894 expected_error = ENOTSUP; 3895 else if (newvd == oldvd) 3896 expected_error = replacing ? 0 : EBUSY; 3897 else if (vdev_lookup_by_path(rvd, newpath) != NULL) 3898 expected_error = EBUSY; 3899 else if (!newvd_is_dspare && newsize < oldsize) 3900 expected_error = EOVERFLOW; 3901 else if (ashift > oldvd->vdev_top->vdev_ashift) 3902 expected_error = EDOM; 3903 else if (newvd_is_dspare && pvd != vdev_draid_spare_get_parent(newvd)) 3904 expected_error = ENOTSUP; 3905 else 3906 expected_error = 0; 3907 3908 spa_config_exit(spa, SCL_ALL, FTAG); 3909 3910 /* 3911 * Build the nvlist describing newpath. 3912 */ 3913 root = make_vdev_root(newpath, NULL, NULL, newvd == NULL ? newsize : 0, 3914 ashift, NULL, 0, 0, 1); 3915 3916 /* 3917 * When supported select either a healing or sequential resilver. 3918 */ 3919 boolean_t rebuilding = B_FALSE; 3920 if (pvd->vdev_ops == &vdev_mirror_ops || 3921 pvd->vdev_ops == &vdev_root_ops) { 3922 rebuilding = !!ztest_random(2); 3923 } 3924 3925 error = spa_vdev_attach(spa, oldguid, root, replacing, rebuilding); 3926 3927 fnvlist_free(root); 3928 3929 /* 3930 * If our parent was the replacing vdev, but the replace completed, 3931 * then instead of failing with ENOTSUP we may either succeed, 3932 * fail with ENODEV, or fail with EOVERFLOW. 3933 */ 3934 if (expected_error == ENOTSUP && 3935 (error == 0 || error == ENODEV || error == EOVERFLOW)) 3936 expected_error = error; 3937 3938 /* 3939 * If someone grew the LUN, the replacement may be too small. 
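* EBUSY is accepted here as well, e.g. when another thread has raced us onto the same device.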
3940 */ 3941 if (error == EOVERFLOW || error == EBUSY) 3942 expected_error = error; 3943 3944 if (error == ZFS_ERR_CHECKPOINT_EXISTS || 3945 error == ZFS_ERR_DISCARDING_CHECKPOINT || 3946 error == ZFS_ERR_RESILVER_IN_PROGRESS || 3947 error == ZFS_ERR_REBUILD_IN_PROGRESS) 3948 expected_error = error; 3949 3950 if (error != expected_error && expected_error != EBUSY) { 3951 fatal(B_FALSE, "attach (%s %"PRIu64", %s %"PRIu64", %d) " 3952 "returned %d, expected %d", 3953 oldpath, oldsize, newpath, 3954 newsize, replacing, error, expected_error); 3955 } 3956 out: 3957 mutex_exit(&ztest_vdev_lock); 3958 3959 umem_free(oldpath, MAXPATHLEN); 3960 umem_free(newpath, MAXPATHLEN); 3961 } 3962 3963 static void 3964 raidz_scratch_verify(void) 3965 { 3966 spa_t *spa; 3967 uint64_t write_size, logical_size, offset; 3968 raidz_reflow_scratch_state_t state; 3969 vdev_raidz_expand_t *vre; 3970 vdev_t *raidvd; 3971 3972 ASSERT(raidz_expand_pause_point == RAIDZ_EXPAND_PAUSE_NONE); 3973 3974 if (ztest_scratch_state->zs_raidz_scratch_verify_pause == 0) 3975 return; 3976 3977 kernel_init(SPA_MODE_READ); 3978 3979 mutex_enter(&spa_namespace_lock); 3980 spa = spa_lookup(ztest_opts.zo_pool); 3981 ASSERT(spa); 3982 spa->spa_import_flags |= ZFS_IMPORT_SKIP_MMP; 3983 mutex_exit(&spa_namespace_lock); 3984 3985 VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG)); 3986 3987 ASSERT3U(RRSS_GET_OFFSET(&spa->spa_uberblock), !=, UINT64_MAX); 3988 3989 mutex_enter(&ztest_vdev_lock); 3990 3991 spa_config_enter(spa, SCL_ALL, FTAG, RW_READER); 3992 3993 vre = spa->spa_raidz_expand; 3994 if (vre == NULL) 3995 goto out; 3996 3997 raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); 3998 offset = RRSS_GET_OFFSET(&spa->spa_uberblock); 3999 state = RRSS_GET_STATE(&spa->spa_uberblock); 4000 write_size = P2ALIGN_TYPED(VDEV_BOOT_SIZE, 1 << raidvd->vdev_ashift, 4001 uint64_t); 4002 logical_size = write_size * raidvd->vdev_children; 4003 4004 switch (state) { 4005 /* 4006 * Initial state of reflow process. RAIDZ expansion was 4007 * requested by user, but scratch object was not created. 4008 */ 4009 case RRSS_SCRATCH_NOT_IN_USE: 4010 ASSERT3U(offset, ==, 0); 4011 break; 4012 4013 /* 4014 * Scratch object was synced and stored in boot area. 4015 */ 4016 case RRSS_SCRATCH_VALID: 4017 4018 /* 4019 * Scratch object was synced back to raidz start offset, 4020 * raidz is ready for sector by sector reflow process. 4021 */ 4022 case RRSS_SCRATCH_INVALID_SYNCED: 4023 4024 /* 4025 * Scratch object was synced back to raidz start offset 4026 * on zpool importing, raidz is ready for sector by sector 4027 * reflow process. 4028 */ 4029 case RRSS_SCRATCH_INVALID_SYNCED_ON_IMPORT: 4030 ASSERT3U(offset, ==, logical_size); 4031 break; 4032 4033 /* 4034 * Sector by sector reflow process started. 
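* The saved offset should have advanced to at least the logical size covered by the scratch area by this point.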
4035 */ 4036 case RRSS_SCRATCH_INVALID_SYNCED_REFLOW: 4037 ASSERT3U(offset, >=, logical_size); 4038 break; 4039 } 4040 4041 out: 4042 spa_config_exit(spa, SCL_ALL, FTAG); 4043 4044 mutex_exit(&ztest_vdev_lock); 4045 4046 ztest_scratch_state->zs_raidz_scratch_verify_pause = 0; 4047 4048 spa_close(spa, FTAG); 4049 kernel_fini(); 4050 } 4051 4052 static void 4053 ztest_scratch_thread(void *arg) 4054 { 4055 (void) arg; 4056 4057 /* wait up to 10 seconds */ 4058 for (int t = 100; t > 0; t -= 1) { 4059 if (raidz_expand_pause_point == RAIDZ_EXPAND_PAUSE_NONE) 4060 thread_exit(); 4061 4062 (void) poll(NULL, 0, 100); 4063 } 4064 4065 /* killed when the scratch area progress reached a certain point */ 4066 ztest_kill(ztest_shared); 4067 } 4068 4069 /* 4070 * Verify that we can attach raidz device. 4071 */ 4072 void 4073 ztest_vdev_raidz_attach(ztest_ds_t *zd, uint64_t id) 4074 { 4075 (void) zd, (void) id; 4076 ztest_shared_t *zs = ztest_shared; 4077 spa_t *spa = ztest_spa; 4078 uint64_t leaves, raidz_children, newsize, ashift = ztest_get_ashift(); 4079 kthread_t *scratch_thread = NULL; 4080 vdev_t *newvd, *pvd; 4081 nvlist_t *root; 4082 char *newpath = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); 4083 int error, expected_error = 0; 4084 4085 mutex_enter(&ztest_vdev_lock); 4086 4087 spa_config_enter(spa, SCL_ALL, FTAG, RW_READER); 4088 4089 /* Only allow attach when raid-kind = 'eraidz' */ 4090 if (!ztest_opts.zo_raid_do_expand) { 4091 spa_config_exit(spa, SCL_ALL, FTAG); 4092 goto out; 4093 } 4094 4095 if (ztest_opts.zo_mmp_test) { 4096 spa_config_exit(spa, SCL_ALL, FTAG); 4097 goto out; 4098 } 4099 4100 if (ztest_device_removal_active) { 4101 spa_config_exit(spa, SCL_ALL, FTAG); 4102 goto out; 4103 } 4104 4105 pvd = vdev_lookup_top(spa, 0); 4106 4107 ASSERT(pvd->vdev_ops == &vdev_raidz_ops); 4108 4109 /* 4110 * Get size of a child of the raidz group, 4111 * make sure device is a bit bigger 4112 */ 4113 newvd = pvd->vdev_child[ztest_random(pvd->vdev_children)]; 4114 newsize = 10 * vdev_get_min_asize(newvd) / (9 + ztest_random(2)); 4115 4116 /* 4117 * Get next attached leaf id 4118 */ 4119 raidz_children = ztest_get_raidz_children(spa); 4120 leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * raidz_children; 4121 zs->zs_vdev_next_leaf = spa_num_top_vdevs(spa) * leaves; 4122 4123 if (spa->spa_raidz_expand) 4124 expected_error = ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS; 4125 4126 spa_config_exit(spa, SCL_ALL, FTAG); 4127 4128 /* 4129 * Path to vdev to be attached 4130 */ 4131 (void) snprintf(newpath, MAXPATHLEN, ztest_dev_template, 4132 ztest_opts.zo_dir, ztest_opts.zo_pool, zs->zs_vdev_next_leaf); 4133 4134 /* 4135 * Build the nvlist describing newpath. 4136 */ 4137 root = make_vdev_root(newpath, NULL, NULL, newsize, ashift, NULL, 4138 0, 0, 1); 4139 4140 /* 4141 * 50% of the time, set raidz_expand_pause_point to cause 4142 * raidz_reflow_scratch_sync() to pause at a certain point and 4143 * then kill the test after 10 seconds so raidz_scratch_verify() 4144 * can confirm consistency when the pool is imported. 
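 *
 * Concretely: the pause point is drawn uniformly from
 * [1, RAIDZ_EXPAND_PAUSE_SCRATCH_POST_REFLOW_2], and
 * ztest_scratch_thread() polls for up to 10 seconds before
 * calling ztest_kill(); if the attach itself fails we clear the
 * pause point below so that the kill is skipped.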
4145 */ 4146 if (ztest_random(2) == 0 && expected_error == 0) { 4147 raidz_expand_pause_point = 4148 ztest_random(RAIDZ_EXPAND_PAUSE_SCRATCH_POST_REFLOW_2) + 1; 4149 scratch_thread = thread_create(NULL, 0, ztest_scratch_thread, 4150 ztest_shared, 0, NULL, TS_RUN | TS_JOINABLE, defclsyspri); 4151 } 4152 4153 error = spa_vdev_attach(spa, pvd->vdev_guid, root, B_FALSE, B_FALSE); 4154 4155 nvlist_free(root); 4156 4157 if (error == EOVERFLOW || error == ENXIO || 4158 error == ZFS_ERR_CHECKPOINT_EXISTS || 4159 error == ZFS_ERR_DISCARDING_CHECKPOINT) 4160 expected_error = error; 4161 4162 if (error != 0 && error != expected_error) { 4163 fatal(0, "raidz attach (%s %"PRIu64") returned %d, expected %d", 4164 newpath, newsize, error, expected_error); 4165 } 4166 4167 if (raidz_expand_pause_point) { 4168 if (error != 0) { 4169 /* 4170 * Do not verify scratch object in case of error 4171 * returned by vdev attaching. 4172 */ 4173 raidz_expand_pause_point = RAIDZ_EXPAND_PAUSE_NONE; 4174 } 4175 4176 VERIFY0(thread_join(scratch_thread)); 4177 } 4178 out: 4179 mutex_exit(&ztest_vdev_lock); 4180 4181 umem_free(newpath, MAXPATHLEN); 4182 } 4183 4184 void 4185 ztest_device_removal(ztest_ds_t *zd, uint64_t id) 4186 { 4187 (void) zd, (void) id; 4188 spa_t *spa = ztest_spa; 4189 vdev_t *vd; 4190 uint64_t guid; 4191 int error; 4192 4193 mutex_enter(&ztest_vdev_lock); 4194 4195 if (ztest_device_removal_active) { 4196 mutex_exit(&ztest_vdev_lock); 4197 return; 4198 } 4199 4200 /* 4201 * Remove a random top-level vdev and wait for removal to finish. 4202 */ 4203 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 4204 vd = vdev_lookup_top(spa, ztest_random_vdev_top(spa, B_FALSE)); 4205 guid = vd->vdev_guid; 4206 spa_config_exit(spa, SCL_VDEV, FTAG); 4207 4208 error = spa_vdev_remove(spa, guid, B_FALSE); 4209 if (error == 0) { 4210 ztest_device_removal_active = B_TRUE; 4211 mutex_exit(&ztest_vdev_lock); 4212 4213 /* 4214 * spa->spa_vdev_removal is created in a sync task that 4215 * is initiated via dsl_sync_task_nowait(). Since the 4216 * task may not run before spa_vdev_remove() returns, we 4217 * must wait at least 1 txg to ensure that the removal 4218 * struct has been created. 4219 */ 4220 txg_wait_synced(spa_get_dsl(spa), 0); 4221 4222 while (spa->spa_removing_phys.sr_state == DSS_SCANNING) 4223 txg_wait_synced(spa_get_dsl(spa), 0); 4224 } else { 4225 mutex_exit(&ztest_vdev_lock); 4226 return; 4227 } 4228 4229 /* 4230 * The pool needs to be scrubbed after completing device removal. 4231 * Failure to do so may result in checksum errors due to the 4232 * strategy employed by ztest_fault_inject() when selecting which 4233 * offset are redundant and can be damaged. 4234 */ 4235 error = spa_scan(spa, POOL_SCAN_SCRUB); 4236 if (error == 0) { 4237 while (dsl_scan_scrubbing(spa_get_dsl(spa))) 4238 txg_wait_synced(spa_get_dsl(spa), 0); 4239 } 4240 4241 mutex_enter(&ztest_vdev_lock); 4242 ztest_device_removal_active = B_FALSE; 4243 mutex_exit(&ztest_vdev_lock); 4244 } 4245 4246 /* 4247 * Callback function which expands the physical size of the vdev. 
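 * The vdevs used by ztest are file-backed, so "growing" one is
 * just an ftruncate() of the backing file to the new size; the
 * extra space only becomes usable after the vdev is onlined
 * again with ZFS_ONLINE_EXPAND (see online_vdev() below).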
4248 */ 4249 static vdev_t * 4250 grow_vdev(vdev_t *vd, void *arg) 4251 { 4252 spa_t *spa __maybe_unused = vd->vdev_spa; 4253 size_t *newsize = arg; 4254 size_t fsize; 4255 int fd; 4256 4257 ASSERT3S(spa_config_held(spa, SCL_STATE, RW_READER), ==, SCL_STATE); 4258 ASSERT(vd->vdev_ops->vdev_op_leaf); 4259 4260 if ((fd = open(vd->vdev_path, O_RDWR)) == -1) 4261 return (vd); 4262 4263 fsize = lseek(fd, 0, SEEK_END); 4264 VERIFY0(ftruncate(fd, *newsize)); 4265 4266 if (ztest_opts.zo_verbose >= 6) { 4267 (void) printf("%s grew from %lu to %lu bytes\n", 4268 vd->vdev_path, (ulong_t)fsize, (ulong_t)*newsize); 4269 } 4270 (void) close(fd); 4271 return (NULL); 4272 } 4273 4274 /* 4275 * Callback function which expands a given vdev by calling vdev_online(). 4276 */ 4277 static vdev_t * 4278 online_vdev(vdev_t *vd, void *arg) 4279 { 4280 (void) arg; 4281 spa_t *spa = vd->vdev_spa; 4282 vdev_t *tvd = vd->vdev_top; 4283 uint64_t guid = vd->vdev_guid; 4284 uint64_t generation = spa->spa_config_generation + 1; 4285 vdev_state_t newstate = VDEV_STATE_UNKNOWN; 4286 int error; 4287 4288 ASSERT3S(spa_config_held(spa, SCL_STATE, RW_READER), ==, SCL_STATE); 4289 ASSERT(vd->vdev_ops->vdev_op_leaf); 4290 4291 /* Calling vdev_online will initialize the new metaslabs */ 4292 spa_config_exit(spa, SCL_STATE, spa); 4293 error = vdev_online(spa, guid, ZFS_ONLINE_EXPAND, &newstate); 4294 spa_config_enter(spa, SCL_STATE, spa, RW_READER); 4295 4296 /* 4297 * If vdev_online returned an error or the underlying vdev_open 4298 * failed then we abort the expand. The only way to know that 4299 * vdev_open fails is by checking the returned newstate. 4300 */ 4301 if (error || newstate != VDEV_STATE_HEALTHY) { 4302 if (ztest_opts.zo_verbose >= 5) { 4303 (void) printf("Unable to expand vdev, state %u, " 4304 "error %d\n", newstate, error); 4305 } 4306 return (vd); 4307 } 4308 ASSERT3U(newstate, ==, VDEV_STATE_HEALTHY); 4309 4310 /* 4311 * Since we dropped the lock we need to ensure that we're 4312 * still talking to the original vdev. It's possible this 4313 * vdev may have been detached/replaced while we were 4314 * trying to online it. 4315 */ 4316 if (generation != spa->spa_config_generation) { 4317 if (ztest_opts.zo_verbose >= 5) { 4318 (void) printf("vdev configuration has changed, " 4319 "guid %"PRIu64", state %"PRIu64", " 4320 "expected gen %"PRIu64", got gen %"PRIu64"\n", 4321 guid, 4322 tvd->vdev_state, 4323 generation, 4324 spa->spa_config_generation); 4325 } 4326 return (vd); 4327 } 4328 return (NULL); 4329 } 4330 4331 /* 4332 * Traverse the vdev tree calling the supplied function. 4333 * We continue to walk the tree until we either have walked all 4334 * children or we receive a non-NULL return from the callback. 4335 * If a NULL callback is passed, then we just return back the first 4336 * leaf vdev we encounter. 4337 */ 4338 static vdev_t * 4339 vdev_walk_tree(vdev_t *vd, vdev_t *(*func)(vdev_t *, void *), void *arg) 4340 { 4341 uint_t c; 4342 4343 if (vd->vdev_ops->vdev_op_leaf) { 4344 if (func == NULL) 4345 return (vd); 4346 else 4347 return (func(vd, arg)); 4348 } 4349 4350 for (c = 0; c < vd->vdev_children; c++) { 4351 vdev_t *cvd = vd->vdev_child[c]; 4352 if ((cvd = vdev_walk_tree(cvd, func, arg)) != NULL) 4353 return (cvd); 4354 } 4355 return (NULL); 4356 } 4357 4358 /* 4359 * Verify that dynamic LUN growth works as expected. 
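 *
 * Roughly: pick a random top-level vdev, note its metaslab count
 * and its class's space, grow each backing file (grow_vdev()),
 * online the leaves with ZFS_ONLINE_EXPAND (online_vdev()), wait
 * for the async config update to settle, and finally check that
 * both the metaslab count and the class space increased.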
4360 */ 4361 void 4362 ztest_vdev_LUN_growth(ztest_ds_t *zd, uint64_t id) 4363 { 4364 (void) zd, (void) id; 4365 spa_t *spa = ztest_spa; 4366 vdev_t *vd, *tvd; 4367 metaslab_class_t *mc; 4368 metaslab_group_t *mg; 4369 size_t psize, newsize; 4370 uint64_t top; 4371 uint64_t old_class_space, new_class_space, old_ms_count, new_ms_count; 4372 4373 mutex_enter(&ztest_checkpoint_lock); 4374 mutex_enter(&ztest_vdev_lock); 4375 spa_config_enter(spa, SCL_STATE, spa, RW_READER); 4376 4377 /* 4378 * If there is a vdev removal in progress, it could complete while 4379 * we are running, in which case we would not be able to verify 4380 * that the metaslab_class space increased (because it decreases 4381 * when the device removal completes). 4382 */ 4383 if (ztest_device_removal_active) { 4384 spa_config_exit(spa, SCL_STATE, spa); 4385 mutex_exit(&ztest_vdev_lock); 4386 mutex_exit(&ztest_checkpoint_lock); 4387 return; 4388 } 4389 4390 /* 4391 * If we are under raidz expansion, the test can failed because the 4392 * metaslabs count will not increase immediately after the vdev is 4393 * expanded. It will happen only after raidz expansion completion. 4394 */ 4395 if (spa->spa_raidz_expand) { 4396 spa_config_exit(spa, SCL_STATE, spa); 4397 mutex_exit(&ztest_vdev_lock); 4398 mutex_exit(&ztest_checkpoint_lock); 4399 return; 4400 } 4401 4402 top = ztest_random_vdev_top(spa, B_TRUE); 4403 4404 tvd = spa->spa_root_vdev->vdev_child[top]; 4405 mg = tvd->vdev_mg; 4406 mc = mg->mg_class; 4407 old_ms_count = tvd->vdev_ms_count; 4408 old_class_space = metaslab_class_get_space(mc); 4409 4410 /* 4411 * Determine the size of the first leaf vdev associated with 4412 * our top-level device. 4413 */ 4414 vd = vdev_walk_tree(tvd, NULL, NULL); 4415 ASSERT3P(vd, !=, NULL); 4416 ASSERT(vd->vdev_ops->vdev_op_leaf); 4417 4418 psize = vd->vdev_psize; 4419 4420 /* 4421 * We only try to expand the vdev if it's healthy, less than 4x its 4422 * original size, and it has a valid psize. 4423 */ 4424 if (tvd->vdev_state != VDEV_STATE_HEALTHY || 4425 psize == 0 || psize >= 4 * ztest_opts.zo_vdev_size) { 4426 spa_config_exit(spa, SCL_STATE, spa); 4427 mutex_exit(&ztest_vdev_lock); 4428 mutex_exit(&ztest_checkpoint_lock); 4429 return; 4430 } 4431 ASSERT3U(psize, >, 0); 4432 newsize = psize + MAX(psize / 8, SPA_MAXBLOCKSIZE); 4433 ASSERT3U(newsize, >, psize); 4434 4435 if (ztest_opts.zo_verbose >= 6) { 4436 (void) printf("Expanding LUN %s from %lu to %lu\n", 4437 vd->vdev_path, (ulong_t)psize, (ulong_t)newsize); 4438 } 4439 4440 /* 4441 * Growing the vdev is a two step process: 4442 * 1). expand the physical size (i.e. relabel) 4443 * 2). online the vdev to create the new metaslabs 4444 */ 4445 if (vdev_walk_tree(tvd, grow_vdev, &newsize) != NULL || 4446 vdev_walk_tree(tvd, online_vdev, NULL) != NULL || 4447 tvd->vdev_state != VDEV_STATE_HEALTHY) { 4448 if (ztest_opts.zo_verbose >= 5) { 4449 (void) printf("Could not expand LUN because " 4450 "the vdev configuration changed.\n"); 4451 } 4452 spa_config_exit(spa, SCL_STATE, spa); 4453 mutex_exit(&ztest_vdev_lock); 4454 mutex_exit(&ztest_checkpoint_lock); 4455 return; 4456 } 4457 4458 spa_config_exit(spa, SCL_STATE, spa); 4459 4460 /* 4461 * Expanding the LUN will update the config asynchronously, 4462 * thus we must wait for the async thread to complete any 4463 * pending tasks before proceeding. 
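 * We consider things settled once spa_async_thread is NULL and
 * no spa_async_tasks remain (checked under spa_async_lock),
 * forcing a txg sync and sleeping briefly between checks.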
4464 */ 4465 for (;;) { 4466 boolean_t done; 4467 mutex_enter(&spa->spa_async_lock); 4468 done = (spa->spa_async_thread == NULL && !spa->spa_async_tasks); 4469 mutex_exit(&spa->spa_async_lock); 4470 if (done) 4471 break; 4472 txg_wait_synced(spa_get_dsl(spa), 0); 4473 (void) poll(NULL, 0, 100); 4474 } 4475 4476 spa_config_enter(spa, SCL_STATE, spa, RW_READER); 4477 4478 tvd = spa->spa_root_vdev->vdev_child[top]; 4479 new_ms_count = tvd->vdev_ms_count; 4480 new_class_space = metaslab_class_get_space(mc); 4481 4482 if (tvd->vdev_mg != mg || mg->mg_class != mc) { 4483 if (ztest_opts.zo_verbose >= 5) { 4484 (void) printf("Could not verify LUN expansion due to " 4485 "intervening vdev offline or remove.\n"); 4486 } 4487 spa_config_exit(spa, SCL_STATE, spa); 4488 mutex_exit(&ztest_vdev_lock); 4489 mutex_exit(&ztest_checkpoint_lock); 4490 return; 4491 } 4492 4493 /* 4494 * Make sure we were able to grow the vdev. 4495 */ 4496 if (new_ms_count <= old_ms_count) { 4497 fatal(B_FALSE, 4498 "LUN expansion failed: ms_count %"PRIu64" < %"PRIu64"\n", 4499 old_ms_count, new_ms_count); 4500 } 4501 4502 /* 4503 * Make sure we were able to grow the pool. 4504 */ 4505 if (new_class_space <= old_class_space) { 4506 fatal(B_FALSE, 4507 "LUN expansion failed: class_space %"PRIu64" < %"PRIu64"\n", 4508 old_class_space, new_class_space); 4509 } 4510 4511 if (ztest_opts.zo_verbose >= 5) { 4512 char oldnumbuf[NN_NUMBUF_SZ], newnumbuf[NN_NUMBUF_SZ]; 4513 4514 nicenum(old_class_space, oldnumbuf, sizeof (oldnumbuf)); 4515 nicenum(new_class_space, newnumbuf, sizeof (newnumbuf)); 4516 (void) printf("%s grew from %s to %s\n", 4517 spa->spa_name, oldnumbuf, newnumbuf); 4518 } 4519 4520 spa_config_exit(spa, SCL_STATE, spa); 4521 mutex_exit(&ztest_vdev_lock); 4522 mutex_exit(&ztest_checkpoint_lock); 4523 } 4524 4525 /* 4526 * Verify that dmu_objset_{create,destroy,open,close} work as expected. 4527 */ 4528 static void 4529 ztest_objset_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx) 4530 { 4531 (void) arg, (void) cr; 4532 4533 /* 4534 * Create the objects common to all ztest datasets. 4535 */ 4536 VERIFY0(zap_create_claim(os, ZTEST_DIROBJ, 4537 DMU_OT_ZAP_OTHER, DMU_OT_NONE, 0, tx)); 4538 } 4539 4540 static int 4541 ztest_dataset_create(char *dsname) 4542 { 4543 int err; 4544 uint64_t rand; 4545 dsl_crypto_params_t *dcp = NULL; 4546 4547 /* 4548 * 50% of the time, we create encrypted datasets 4549 * using a random cipher suite and a hard-coded 4550 * wrapping key. 4551 */ 4552 rand = ztest_random(2); 4553 if (rand != 0) { 4554 nvlist_t *crypto_args = fnvlist_alloc(); 4555 nvlist_t *props = fnvlist_alloc(); 4556 4557 /* slight bias towards the default cipher suite */ 4558 rand = ztest_random(ZIO_CRYPT_FUNCTIONS); 4559 if (rand < ZIO_CRYPT_AES_128_CCM) 4560 rand = ZIO_CRYPT_ON; 4561 4562 fnvlist_add_uint64(props, 4563 zfs_prop_to_name(ZFS_PROP_ENCRYPTION), rand); 4564 fnvlist_add_uint8_array(crypto_args, "wkeydata", 4565 (uint8_t *)ztest_wkeydata, WRAPPING_KEY_LEN); 4566 4567 /* 4568 * These parameters aren't really used by the kernel. They 4569 * are simply stored so that userspace knows how to load 4570 * the wrapping key. 
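 * Taken together these properties are roughly the in-kernel
 * equivalent of something like
 *   zfs create -o encryption=<suite> -o keyformat=raw \
 *       -o keylocation=prompt <dataset>
 * with the raw wrapping key supplied directly (the PBKDF2 salt
 * and iteration count are meaningless for keyformat=raw, so they
 * are simply set to 0).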
4571 */ 4572 fnvlist_add_uint64(props, 4573 zfs_prop_to_name(ZFS_PROP_KEYFORMAT), ZFS_KEYFORMAT_RAW); 4574 fnvlist_add_string(props, 4575 zfs_prop_to_name(ZFS_PROP_KEYLOCATION), "prompt"); 4576 fnvlist_add_uint64(props, 4577 zfs_prop_to_name(ZFS_PROP_PBKDF2_SALT), 0ULL); 4578 fnvlist_add_uint64(props, 4579 zfs_prop_to_name(ZFS_PROP_PBKDF2_ITERS), 0ULL); 4580 4581 VERIFY0(dsl_crypto_params_create_nvlist(DCP_CMD_NONE, props, 4582 crypto_args, &dcp)); 4583 4584 /* 4585 * Cycle through all available encryption implementations 4586 * to verify interoperability. 4587 */ 4588 VERIFY0(gcm_impl_set("cycle")); 4589 VERIFY0(aes_impl_set("cycle")); 4590 4591 fnvlist_free(crypto_args); 4592 fnvlist_free(props); 4593 } 4594 4595 err = dmu_objset_create(dsname, DMU_OST_OTHER, 0, dcp, 4596 ztest_objset_create_cb, NULL); 4597 dsl_crypto_params_free(dcp, !!err); 4598 4599 rand = ztest_random(100); 4600 if (err || rand < 80) 4601 return (err); 4602 4603 if (ztest_opts.zo_verbose >= 5) 4604 (void) printf("Setting dataset %s to sync always\n", dsname); 4605 return (ztest_dsl_prop_set_uint64(dsname, ZFS_PROP_SYNC, 4606 ZFS_SYNC_ALWAYS, B_FALSE)); 4607 } 4608 4609 static int 4610 ztest_objset_destroy_cb(const char *name, void *arg) 4611 { 4612 (void) arg; 4613 objset_t *os; 4614 dmu_object_info_t doi; 4615 int error; 4616 4617 /* 4618 * Verify that the dataset contains a directory object. 4619 */ 4620 VERIFY0(ztest_dmu_objset_own(name, DMU_OST_OTHER, B_TRUE, 4621 B_TRUE, FTAG, &os)); 4622 error = dmu_object_info(os, ZTEST_DIROBJ, &doi); 4623 if (error != ENOENT) { 4624 /* We could have crashed in the middle of destroying it */ 4625 ASSERT0(error); 4626 ASSERT3U(doi.doi_type, ==, DMU_OT_ZAP_OTHER); 4627 ASSERT3S(doi.doi_physical_blocks_512, >=, 0); 4628 } 4629 dmu_objset_disown(os, B_TRUE, FTAG); 4630 4631 /* 4632 * Destroy the dataset. 4633 */ 4634 if (strchr(name, '@') != NULL) { 4635 error = dsl_destroy_snapshot(name, B_TRUE); 4636 if (error != ECHRNG) { 4637 /* 4638 * The program was executed, but encountered a runtime 4639 * error, such as insufficient slop, or a hold on the 4640 * dataset. 
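 * ("The program" refers to the channel program that implements
 * snapshot destruction; ECHRNG is how a runtime failure of that
 * program is reported, which is why it is excluded from the
 * assertion below.)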
4641 */ 4642 ASSERT0(error); 4643 } 4644 } else { 4645 error = dsl_destroy_head(name); 4646 if (error == ENOSPC) { 4647 /* There could be checkpoint or insufficient slop */ 4648 ztest_record_enospc(FTAG); 4649 } else if (error != EBUSY) { 4650 /* There could be a hold on this dataset */ 4651 ASSERT0(error); 4652 } 4653 } 4654 return (0); 4655 } 4656 4657 static boolean_t 4658 ztest_snapshot_create(char *osname, uint64_t id) 4659 { 4660 char snapname[ZFS_MAX_DATASET_NAME_LEN]; 4661 int error; 4662 4663 (void) snprintf(snapname, sizeof (snapname), "%"PRIu64"", id); 4664 4665 error = dmu_objset_snapshot_one(osname, snapname); 4666 if (error == ENOSPC) { 4667 ztest_record_enospc(FTAG); 4668 return (B_FALSE); 4669 } 4670 if (error != 0 && error != EEXIST && error != ECHRNG) { 4671 fatal(B_FALSE, "ztest_snapshot_create(%s@%s) = %d", osname, 4672 snapname, error); 4673 } 4674 return (B_TRUE); 4675 } 4676 4677 static boolean_t 4678 ztest_snapshot_destroy(char *osname, uint64_t id) 4679 { 4680 char snapname[ZFS_MAX_DATASET_NAME_LEN]; 4681 int error; 4682 4683 (void) snprintf(snapname, sizeof (snapname), "%s@%"PRIu64"", 4684 osname, id); 4685 4686 error = dsl_destroy_snapshot(snapname, B_FALSE); 4687 if (error != 0 && error != ENOENT && error != ECHRNG) 4688 fatal(B_FALSE, "ztest_snapshot_destroy(%s) = %d", 4689 snapname, error); 4690 return (B_TRUE); 4691 } 4692 4693 void 4694 ztest_dmu_objset_create_destroy(ztest_ds_t *zd, uint64_t id) 4695 { 4696 (void) zd; 4697 ztest_ds_t *zdtmp; 4698 int iters; 4699 int error; 4700 objset_t *os, *os2; 4701 char name[ZFS_MAX_DATASET_NAME_LEN]; 4702 zilog_t *zilog; 4703 int i; 4704 4705 zdtmp = umem_alloc(sizeof (ztest_ds_t), UMEM_NOFAIL); 4706 4707 (void) pthread_rwlock_rdlock(&ztest_name_lock); 4708 4709 (void) snprintf(name, sizeof (name), "%s/temp_%"PRIu64"", 4710 ztest_opts.zo_pool, id); 4711 4712 /* 4713 * If this dataset exists from a previous run, process its replay log 4714 * half of the time. If we don't replay it, then dsl_destroy_head() 4715 * (invoked from ztest_objset_destroy_cb()) should just throw it away. 4716 */ 4717 if (ztest_random(2) == 0 && 4718 ztest_dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, 4719 B_TRUE, FTAG, &os) == 0) { 4720 ztest_zd_init(zdtmp, NULL, os); 4721 zil_replay(os, zdtmp, ztest_replay_vector); 4722 ztest_zd_fini(zdtmp); 4723 dmu_objset_disown(os, B_TRUE, FTAG); 4724 } 4725 4726 /* 4727 * There may be an old instance of the dataset we're about to 4728 * create lying around from a previous run. If so, destroy it 4729 * and all of its snapshots. 4730 */ 4731 (void) dmu_objset_find(name, ztest_objset_destroy_cb, NULL, 4732 DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS); 4733 4734 /* 4735 * Verify that the destroyed dataset is no longer in the namespace. 4736 * It may still be present if the destroy above fails with ENOSPC. 4737 */ 4738 error = ztest_dmu_objset_own(name, DMU_OST_OTHER, B_TRUE, B_TRUE, 4739 FTAG, &os); 4740 if (error == 0) { 4741 dmu_objset_disown(os, B_TRUE, FTAG); 4742 ztest_record_enospc(FTAG); 4743 goto out; 4744 } 4745 VERIFY3U(ENOENT, ==, error); 4746 4747 /* 4748 * Verify that we can create a new dataset. 4749 */ 4750 error = ztest_dataset_create(name); 4751 if (error) { 4752 if (error == ENOSPC) { 4753 ztest_record_enospc(FTAG); 4754 goto out; 4755 } 4756 fatal(B_FALSE, "dmu_objset_create(%s) = %d", name, error); 4757 } 4758 4759 VERIFY0(ztest_dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, B_TRUE, 4760 FTAG, &os)); 4761 4762 ztest_zd_init(zdtmp, NULL, os); 4763 4764 /* 4765 * Open the intent log for it. 
4766 */ 4767 zilog = zil_open(os, ztest_get_data, NULL); 4768 4769 /* 4770 * Put some objects in there, do a little I/O to them, 4771 * and randomly take a couple of snapshots along the way. 4772 */ 4773 iters = ztest_random(5); 4774 for (i = 0; i < iters; i++) { 4775 ztest_dmu_object_alloc_free(zdtmp, id); 4776 if (ztest_random(iters) == 0) 4777 (void) ztest_snapshot_create(name, i); 4778 } 4779 4780 /* 4781 * Verify that we cannot create an existing dataset. 4782 */ 4783 VERIFY3U(EEXIST, ==, 4784 dmu_objset_create(name, DMU_OST_OTHER, 0, NULL, NULL, NULL)); 4785 4786 /* 4787 * Verify that we can hold an objset that is also owned. 4788 */ 4789 VERIFY0(dmu_objset_hold(name, FTAG, &os2)); 4790 dmu_objset_rele(os2, FTAG); 4791 4792 /* 4793 * Verify that we cannot own an objset that is already owned. 4794 */ 4795 VERIFY3U(EBUSY, ==, ztest_dmu_objset_own(name, DMU_OST_OTHER, 4796 B_FALSE, B_TRUE, FTAG, &os2)); 4797 4798 zil_close(zilog); 4799 dmu_objset_disown(os, B_TRUE, FTAG); 4800 ztest_zd_fini(zdtmp); 4801 out: 4802 (void) pthread_rwlock_unlock(&ztest_name_lock); 4803 4804 umem_free(zdtmp, sizeof (ztest_ds_t)); 4805 } 4806 4807 /* 4808 * Verify that dmu_snapshot_{create,destroy,open,close} work as expected. 4809 */ 4810 void 4811 ztest_dmu_snapshot_create_destroy(ztest_ds_t *zd, uint64_t id) 4812 { 4813 (void) pthread_rwlock_rdlock(&ztest_name_lock); 4814 (void) ztest_snapshot_destroy(zd->zd_name, id); 4815 (void) ztest_snapshot_create(zd->zd_name, id); 4816 (void) pthread_rwlock_unlock(&ztest_name_lock); 4817 } 4818 4819 /* 4820 * Cleanup non-standard snapshots and clones. 4821 */ 4822 static void 4823 ztest_dsl_dataset_cleanup(char *osname, uint64_t id) 4824 { 4825 char *snap1name; 4826 char *clone1name; 4827 char *snap2name; 4828 char *clone2name; 4829 char *snap3name; 4830 int error; 4831 4832 snap1name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4833 clone1name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4834 snap2name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4835 clone2name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4836 snap3name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4837 4838 (void) snprintf(snap1name, ZFS_MAX_DATASET_NAME_LEN, "%s@s1_%"PRIu64"", 4839 osname, id); 4840 (void) snprintf(clone1name, ZFS_MAX_DATASET_NAME_LEN, "%s/c1_%"PRIu64"", 4841 osname, id); 4842 (void) snprintf(snap2name, ZFS_MAX_DATASET_NAME_LEN, "%s@s2_%"PRIu64"", 4843 clone1name, id); 4844 (void) snprintf(clone2name, ZFS_MAX_DATASET_NAME_LEN, "%s/c2_%"PRIu64"", 4845 osname, id); 4846 (void) snprintf(snap3name, ZFS_MAX_DATASET_NAME_LEN, "%s@s3_%"PRIu64"", 4847 clone1name, id); 4848 4849 error = dsl_destroy_head(clone2name); 4850 if (error && error != ENOENT) 4851 fatal(B_FALSE, "dsl_destroy_head(%s) = %d", clone2name, error); 4852 error = dsl_destroy_snapshot(snap3name, B_FALSE); 4853 if (error && error != ENOENT) 4854 fatal(B_FALSE, "dsl_destroy_snapshot(%s) = %d", 4855 snap3name, error); 4856 error = dsl_destroy_snapshot(snap2name, B_FALSE); 4857 if (error && error != ENOENT) 4858 fatal(B_FALSE, "dsl_destroy_snapshot(%s) = %d", 4859 snap2name, error); 4860 error = dsl_destroy_head(clone1name); 4861 if (error && error != ENOENT) 4862 fatal(B_FALSE, "dsl_destroy_head(%s) = %d", clone1name, error); 4863 error = dsl_destroy_snapshot(snap1name, B_FALSE); 4864 if (error && error != ENOENT) 4865 fatal(B_FALSE, "dsl_destroy_snapshot(%s) = %d", 4866 snap1name, error); 4867 4868 umem_free(snap1name, ZFS_MAX_DATASET_NAME_LEN); 4869 umem_free(clone1name, 
ZFS_MAX_DATASET_NAME_LEN); 4870 umem_free(snap2name, ZFS_MAX_DATASET_NAME_LEN); 4871 umem_free(clone2name, ZFS_MAX_DATASET_NAME_LEN); 4872 umem_free(snap3name, ZFS_MAX_DATASET_NAME_LEN); 4873 } 4874 4875 /* 4876 * Verify dsl_dataset_promote handles EBUSY 4877 */ 4878 void 4879 ztest_dsl_dataset_promote_busy(ztest_ds_t *zd, uint64_t id) 4880 { 4881 objset_t *os; 4882 char *snap1name; 4883 char *clone1name; 4884 char *snap2name; 4885 char *clone2name; 4886 char *snap3name; 4887 char *osname = zd->zd_name; 4888 int error; 4889 4890 snap1name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4891 clone1name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4892 snap2name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4893 clone2name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4894 snap3name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4895 4896 (void) pthread_rwlock_rdlock(&ztest_name_lock); 4897 4898 ztest_dsl_dataset_cleanup(osname, id); 4899 4900 (void) snprintf(snap1name, ZFS_MAX_DATASET_NAME_LEN, "%s@s1_%"PRIu64"", 4901 osname, id); 4902 (void) snprintf(clone1name, ZFS_MAX_DATASET_NAME_LEN, "%s/c1_%"PRIu64"", 4903 osname, id); 4904 (void) snprintf(snap2name, ZFS_MAX_DATASET_NAME_LEN, "%s@s2_%"PRIu64"", 4905 clone1name, id); 4906 (void) snprintf(clone2name, ZFS_MAX_DATASET_NAME_LEN, "%s/c2_%"PRIu64"", 4907 osname, id); 4908 (void) snprintf(snap3name, ZFS_MAX_DATASET_NAME_LEN, "%s@s3_%"PRIu64"", 4909 clone1name, id); 4910 4911 error = dmu_objset_snapshot_one(osname, strchr(snap1name, '@') + 1); 4912 if (error && error != EEXIST) { 4913 if (error == ENOSPC) { 4914 ztest_record_enospc(FTAG); 4915 goto out; 4916 } 4917 fatal(B_FALSE, "dmu_take_snapshot(%s) = %d", snap1name, error); 4918 } 4919 4920 error = dsl_dataset_clone(clone1name, snap1name); 4921 if (error) { 4922 if (error == ENOSPC) { 4923 ztest_record_enospc(FTAG); 4924 goto out; 4925 } 4926 fatal(B_FALSE, "dmu_objset_create(%s) = %d", clone1name, error); 4927 } 4928 4929 error = dmu_objset_snapshot_one(clone1name, strchr(snap2name, '@') + 1); 4930 if (error && error != EEXIST) { 4931 if (error == ENOSPC) { 4932 ztest_record_enospc(FTAG); 4933 goto out; 4934 } 4935 fatal(B_FALSE, "dmu_open_snapshot(%s) = %d", snap2name, error); 4936 } 4937 4938 error = dmu_objset_snapshot_one(clone1name, strchr(snap3name, '@') + 1); 4939 if (error && error != EEXIST) { 4940 if (error == ENOSPC) { 4941 ztest_record_enospc(FTAG); 4942 goto out; 4943 } 4944 fatal(B_FALSE, "dmu_open_snapshot(%s) = %d", snap3name, error); 4945 } 4946 4947 error = dsl_dataset_clone(clone2name, snap3name); 4948 if (error) { 4949 if (error == ENOSPC) { 4950 ztest_record_enospc(FTAG); 4951 goto out; 4952 } 4953 fatal(B_FALSE, "dmu_objset_create(%s) = %d", clone2name, error); 4954 } 4955 4956 error = ztest_dmu_objset_own(snap2name, DMU_OST_ANY, B_TRUE, B_TRUE, 4957 FTAG, &os); 4958 if (error) 4959 fatal(B_FALSE, "dmu_objset_own(%s) = %d", snap2name, error); 4960 error = dsl_dataset_promote(clone2name, NULL); 4961 if (error == ENOSPC) { 4962 dmu_objset_disown(os, B_TRUE, FTAG); 4963 ztest_record_enospc(FTAG); 4964 goto out; 4965 } 4966 if (error != EBUSY) 4967 fatal(B_FALSE, "dsl_dataset_promote(%s), %d, not EBUSY", 4968 clone2name, error); 4969 dmu_objset_disown(os, B_TRUE, FTAG); 4970 4971 out: 4972 ztest_dsl_dataset_cleanup(osname, id); 4973 4974 (void) pthread_rwlock_unlock(&ztest_name_lock); 4975 4976 umem_free(snap1name, ZFS_MAX_DATASET_NAME_LEN); 4977 umem_free(clone1name, ZFS_MAX_DATASET_NAME_LEN); 4978 umem_free(snap2name, 
ZFS_MAX_DATASET_NAME_LEN); 4979 umem_free(clone2name, ZFS_MAX_DATASET_NAME_LEN); 4980 umem_free(snap3name, ZFS_MAX_DATASET_NAME_LEN); 4981 } 4982 4983 #undef OD_ARRAY_SIZE 4984 #define OD_ARRAY_SIZE 4 4985 4986 /* 4987 * Verify that dmu_object_{alloc,free} work as expected. 4988 */ 4989 void 4990 ztest_dmu_object_alloc_free(ztest_ds_t *zd, uint64_t id) 4991 { 4992 ztest_od_t *od; 4993 int batchsize; 4994 int size; 4995 int b; 4996 4997 size = sizeof (ztest_od_t) * OD_ARRAY_SIZE; 4998 od = umem_alloc(size, UMEM_NOFAIL); 4999 batchsize = OD_ARRAY_SIZE; 5000 5001 for (b = 0; b < batchsize; b++) 5002 ztest_od_init(od + b, id, FTAG, b, DMU_OT_UINT64_OTHER, 5003 0, 0, 0); 5004 5005 /* 5006 * Destroy the previous batch of objects, create a new batch, 5007 * and do some I/O on the new objects. 5008 */ 5009 if (ztest_object_init(zd, od, size, B_TRUE) != 0) { 5010 zd->zd_od = NULL; 5011 umem_free(od, size); 5012 return; 5013 } 5014 5015 while (ztest_random(4 * batchsize) != 0) 5016 ztest_io(zd, od[ztest_random(batchsize)].od_object, 5017 ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT); 5018 5019 umem_free(od, size); 5020 } 5021 5022 /* 5023 * Rewind the global allocator to verify object allocation backfilling. 5024 */ 5025 void 5026 ztest_dmu_object_next_chunk(ztest_ds_t *zd, uint64_t id) 5027 { 5028 (void) id; 5029 objset_t *os = zd->zd_os; 5030 uint_t dnodes_per_chunk = 1 << dmu_object_alloc_chunk_shift; 5031 uint64_t object; 5032 5033 /* 5034 * Rewind the global allocator randomly back to a lower object number 5035 * to force backfilling and reclamation of recently freed dnodes. 5036 */ 5037 mutex_enter(&os->os_obj_lock); 5038 object = ztest_random(os->os_obj_next_chunk); 5039 os->os_obj_next_chunk = P2ALIGN_TYPED(object, dnodes_per_chunk, 5040 uint64_t); 5041 mutex_exit(&os->os_obj_lock); 5042 } 5043 5044 #undef OD_ARRAY_SIZE 5045 #define OD_ARRAY_SIZE 2 5046 5047 /* 5048 * Verify that dmu_{read,write} work as expected. 5049 */ 5050 void 5051 ztest_dmu_read_write(ztest_ds_t *zd, uint64_t id) 5052 { 5053 int size; 5054 ztest_od_t *od; 5055 5056 objset_t *os = zd->zd_os; 5057 size = sizeof (ztest_od_t) * OD_ARRAY_SIZE; 5058 od = umem_alloc(size, UMEM_NOFAIL); 5059 dmu_tx_t *tx; 5060 int freeit, error; 5061 uint64_t i, n, s, txg; 5062 bufwad_t *packbuf, *bigbuf, *pack, *bigH, *bigT; 5063 uint64_t packobj, packoff, packsize, bigobj, bigoff, bigsize; 5064 uint64_t chunksize = (1000 + ztest_random(1000)) * sizeof (uint64_t); 5065 uint64_t regions = 997; 5066 uint64_t stride = 123456789ULL; 5067 uint64_t width = 40; 5068 int free_percent = 5; 5069 dmu_flags_t dmu_read_flags = DMU_READ_PREFETCH; 5070 5071 /* 5072 * We will randomly set when to do O_DIRECT on a read. 5073 */ 5074 if (ztest_random(4) == 0) 5075 dmu_read_flags |= DMU_DIRECTIO; 5076 5077 /* 5078 * This test uses two objects, packobj and bigobj, that are always 5079 * updated together (i.e. in the same tx) so that their contents are 5080 * in sync and can be compared. Their contents relate to each other 5081 * in a simple way: packobj is a dense array of 'bufwad' structures, 5082 * while bigobj is a sparse array of the same bufwads. Specifically, 5083 * for any index n, there are three bufwads that should be identical: 5084 * 5085 * packobj, at offset n * sizeof (bufwad_t) 5086 * bigobj, at the head of the nth chunk 5087 * bigobj, at the tail of the nth chunk 5088 * 5089 * The chunk size is arbitrary. It doesn't have to be a power of two, 5090 * and it doesn't have any relation to the object blocksize. 
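 * For example (sizes illustrative only), with a chunk that holds
 * four bufwads, index n = 2 maps to:
 *
 *	packobj: | bw0 | bw1 | bw2 | bw3 | ...    bw2 at 2 * sizeof (bufwad_t)
 *	bigobj:  | chunk0 | chunk1 | bw2 .. bw2 | ...    head and tail of chunk 2
 *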
5091 * The only requirement is that it can hold at least two bufwads. 5092 * 5093 * Normally, we write the bufwad to each of these locations. 5094 * However, free_percent of the time we instead write zeroes to 5095 * packobj and perform a dmu_free_range() on bigobj. By comparing 5096 * bigobj to packobj, we can verify that the DMU is correctly 5097 * tracking which parts of an object are allocated and free, 5098 * and that the contents of the allocated blocks are correct. 5099 */ 5100 5101 /* 5102 * Read the directory info. If it's the first time, set things up. 5103 */ 5104 ztest_od_init(od, id, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0, chunksize); 5105 ztest_od_init(od + 1, id, FTAG, 1, DMU_OT_UINT64_OTHER, 0, 0, 5106 chunksize); 5107 5108 if (ztest_object_init(zd, od, size, B_FALSE) != 0) { 5109 umem_free(od, size); 5110 return; 5111 } 5112 5113 bigobj = od[0].od_object; 5114 packobj = od[1].od_object; 5115 chunksize = od[0].od_gen; 5116 ASSERT3U(chunksize, ==, od[1].od_gen); 5117 5118 /* 5119 * Prefetch a random chunk of the big object. 5120 * Our aim here is to get some async reads in flight 5121 * for blocks that we may free below; the DMU should 5122 * handle this race correctly. 5123 */ 5124 n = ztest_random(regions) * stride + ztest_random(width); 5125 s = 1 + ztest_random(2 * width - 1); 5126 dmu_prefetch(os, bigobj, 0, n * chunksize, s * chunksize, 5127 ZIO_PRIORITY_SYNC_READ); 5128 5129 /* 5130 * Pick a random index and compute the offsets into packobj and bigobj. 5131 */ 5132 n = ztest_random(regions) * stride + ztest_random(width); 5133 s = 1 + ztest_random(width - 1); 5134 5135 packoff = n * sizeof (bufwad_t); 5136 packsize = s * sizeof (bufwad_t); 5137 5138 bigoff = n * chunksize; 5139 bigsize = s * chunksize; 5140 5141 packbuf = umem_alloc(packsize, UMEM_NOFAIL); 5142 bigbuf = umem_alloc(bigsize, UMEM_NOFAIL); 5143 5144 /* 5145 * free_percent of the time, free a range of bigobj rather than 5146 * overwriting it. 5147 */ 5148 freeit = (ztest_random(100) < free_percent); 5149 5150 /* 5151 * Read the current contents of our objects. 5152 */ 5153 error = dmu_read(os, packobj, packoff, packsize, packbuf, 5154 dmu_read_flags); 5155 ASSERT0(error); 5156 error = dmu_read(os, bigobj, bigoff, bigsize, bigbuf, 5157 dmu_read_flags); 5158 ASSERT0(error); 5159 5160 /* 5161 * Get a tx for the mods to both packobj and bigobj. 5162 */ 5163 tx = dmu_tx_create(os); 5164 5165 dmu_tx_hold_write(tx, packobj, packoff, packsize); 5166 5167 if (freeit) 5168 dmu_tx_hold_free(tx, bigobj, bigoff, bigsize); 5169 else 5170 dmu_tx_hold_write(tx, bigobj, bigoff, bigsize); 5171 5172 /* This accounts for setting the checksum/compression. */ 5173 dmu_tx_hold_bonus(tx, bigobj); 5174 5175 txg = ztest_tx_assign(tx, DMU_TX_MIGHTWAIT, FTAG); 5176 if (txg == 0) { 5177 umem_free(packbuf, packsize); 5178 umem_free(bigbuf, bigsize); 5179 umem_free(od, size); 5180 return; 5181 } 5182 5183 enum zio_checksum cksum; 5184 do { 5185 cksum = (enum zio_checksum) 5186 ztest_random_dsl_prop(ZFS_PROP_CHECKSUM); 5187 } while (cksum >= ZIO_CHECKSUM_LEGACY_FUNCTIONS); 5188 dmu_object_set_checksum(os, bigobj, cksum, tx); 5189 5190 enum zio_compress comp; 5191 do { 5192 comp = (enum zio_compress) 5193 ztest_random_dsl_prop(ZFS_PROP_COMPRESSION); 5194 } while (comp >= ZIO_COMPRESS_LEGACY_FUNCTIONS); 5195 dmu_object_set_compress(os, bigobj, comp, tx); 5196 5197 /* 5198 * For each index from n to n + s, verify that the existing bufwad 5199 * in packobj matches the bufwads at the head and tail of the 5200 * corresponding chunk in bigobj. 
Then update all three bufwads 5201 * with the new values we want to write out. 5202 */ 5203 for (i = 0; i < s; i++) { 5204 /* LINTED */ 5205 pack = (bufwad_t *)((char *)packbuf + i * sizeof (bufwad_t)); 5206 /* LINTED */ 5207 bigH = (bufwad_t *)((char *)bigbuf + i * chunksize); 5208 /* LINTED */ 5209 bigT = (bufwad_t *)((char *)bigH + chunksize) - 1; 5210 5211 ASSERT3U((uintptr_t)bigH - (uintptr_t)bigbuf, <, bigsize); 5212 ASSERT3U((uintptr_t)bigT - (uintptr_t)bigbuf, <, bigsize); 5213 5214 if (pack->bw_txg > txg) 5215 fatal(B_FALSE, 5216 "future leak: got %"PRIx64", open txg is %"PRIx64"", 5217 pack->bw_txg, txg); 5218 5219 if (pack->bw_data != 0 && pack->bw_index != n + i) 5220 fatal(B_FALSE, "wrong index: " 5221 "got %"PRIx64", wanted %"PRIx64"+%"PRIx64"", 5222 pack->bw_index, n, i); 5223 5224 if (memcmp(pack, bigH, sizeof (bufwad_t)) != 0) 5225 fatal(B_FALSE, "pack/bigH mismatch in %p/%p", 5226 pack, bigH); 5227 5228 if (memcmp(pack, bigT, sizeof (bufwad_t)) != 0) 5229 fatal(B_FALSE, "pack/bigT mismatch in %p/%p", 5230 pack, bigT); 5231 5232 if (freeit) { 5233 memset(pack, 0, sizeof (bufwad_t)); 5234 } else { 5235 pack->bw_index = n + i; 5236 pack->bw_txg = txg; 5237 pack->bw_data = 1 + ztest_random(-2ULL); 5238 } 5239 *bigH = *pack; 5240 *bigT = *pack; 5241 } 5242 5243 /* 5244 * We've verified all the old bufwads, and made new ones. 5245 * Now write them out. 5246 */ 5247 dmu_write(os, packobj, packoff, packsize, packbuf, tx); 5248 5249 if (freeit) { 5250 if (ztest_opts.zo_verbose >= 7) { 5251 (void) printf("freeing offset %"PRIx64" size %"PRIx64"" 5252 " txg %"PRIx64"\n", 5253 bigoff, bigsize, txg); 5254 } 5255 VERIFY0(dmu_free_range(os, bigobj, bigoff, bigsize, tx)); 5256 } else { 5257 if (ztest_opts.zo_verbose >= 7) { 5258 (void) printf("writing offset %"PRIx64" size %"PRIx64"" 5259 " txg %"PRIx64"\n", 5260 bigoff, bigsize, txg); 5261 } 5262 dmu_write(os, bigobj, bigoff, bigsize, bigbuf, tx); 5263 } 5264 5265 dmu_tx_commit(tx); 5266 5267 /* 5268 * Sanity check the stuff we just wrote. 5269 */ 5270 { 5271 void *packcheck = umem_alloc(packsize, UMEM_NOFAIL); 5272 void *bigcheck = umem_alloc(bigsize, UMEM_NOFAIL); 5273 5274 VERIFY0(dmu_read(os, packobj, packoff, 5275 packsize, packcheck, dmu_read_flags)); 5276 VERIFY0(dmu_read(os, bigobj, bigoff, 5277 bigsize, bigcheck, dmu_read_flags)); 5278 5279 ASSERT0(memcmp(packbuf, packcheck, packsize)); 5280 ASSERT0(memcmp(bigbuf, bigcheck, bigsize)); 5281 5282 umem_free(packcheck, packsize); 5283 umem_free(bigcheck, bigsize); 5284 } 5285 5286 umem_free(packbuf, packsize); 5287 umem_free(bigbuf, bigsize); 5288 umem_free(od, size); 5289 } 5290 5291 static void 5292 compare_and_update_pbbufs(uint64_t s, bufwad_t *packbuf, bufwad_t *bigbuf, 5293 uint64_t bigsize, uint64_t n, uint64_t chunksize, uint64_t txg) 5294 { 5295 uint64_t i; 5296 bufwad_t *pack; 5297 bufwad_t *bigH; 5298 bufwad_t *bigT; 5299 5300 /* 5301 * For each index from n to n + s, verify that the existing bufwad 5302 * in packobj matches the bufwads at the head and tail of the 5303 * corresponding chunk in bigobj. Then update all three bufwads 5304 * with the new values we want to write out. 
5305 */ 5306 for (i = 0; i < s; i++) { 5307 /* LINTED */ 5308 pack = (bufwad_t *)((char *)packbuf + i * sizeof (bufwad_t)); 5309 /* LINTED */ 5310 bigH = (bufwad_t *)((char *)bigbuf + i * chunksize); 5311 /* LINTED */ 5312 bigT = (bufwad_t *)((char *)bigH + chunksize) - 1; 5313 5314 ASSERT3U((uintptr_t)bigH - (uintptr_t)bigbuf, <, bigsize); 5315 ASSERT3U((uintptr_t)bigT - (uintptr_t)bigbuf, <, bigsize); 5316 5317 if (pack->bw_txg > txg) 5318 fatal(B_FALSE, 5319 "future leak: got %"PRIx64", open txg is %"PRIx64"", 5320 pack->bw_txg, txg); 5321 5322 if (pack->bw_data != 0 && pack->bw_index != n + i) 5323 fatal(B_FALSE, "wrong index: " 5324 "got %"PRIx64", wanted %"PRIx64"+%"PRIx64"", 5325 pack->bw_index, n, i); 5326 5327 if (memcmp(pack, bigH, sizeof (bufwad_t)) != 0) 5328 fatal(B_FALSE, "pack/bigH mismatch in %p/%p", 5329 pack, bigH); 5330 5331 if (memcmp(pack, bigT, sizeof (bufwad_t)) != 0) 5332 fatal(B_FALSE, "pack/bigT mismatch in %p/%p", 5333 pack, bigT); 5334 5335 pack->bw_index = n + i; 5336 pack->bw_txg = txg; 5337 pack->bw_data = 1 + ztest_random(-2ULL); 5338 5339 *bigH = *pack; 5340 *bigT = *pack; 5341 } 5342 } 5343 5344 #undef OD_ARRAY_SIZE 5345 #define OD_ARRAY_SIZE 2 5346 5347 void 5348 ztest_dmu_read_write_zcopy(ztest_ds_t *zd, uint64_t id) 5349 { 5350 objset_t *os = zd->zd_os; 5351 ztest_od_t *od; 5352 dmu_tx_t *tx; 5353 uint64_t i; 5354 int error; 5355 int size; 5356 uint64_t n, s, txg; 5357 bufwad_t *packbuf, *bigbuf; 5358 uint64_t packobj, packoff, packsize, bigobj, bigoff, bigsize; 5359 uint64_t blocksize = ztest_random_blocksize(); 5360 uint64_t chunksize = blocksize; 5361 uint64_t regions = 997; 5362 uint64_t stride = 123456789ULL; 5363 uint64_t width = 9; 5364 dmu_buf_t *bonus_db; 5365 arc_buf_t **bigbuf_arcbufs; 5366 dmu_object_info_t doi; 5367 uint32_t dmu_read_flags = DMU_READ_PREFETCH; 5368 5369 /* 5370 * We will randomly set when to do O_DIRECT on a read. 5371 */ 5372 if (ztest_random(4) == 0) 5373 dmu_read_flags |= DMU_DIRECTIO; 5374 5375 size = sizeof (ztest_od_t) * OD_ARRAY_SIZE; 5376 od = umem_alloc(size, UMEM_NOFAIL); 5377 5378 /* 5379 * This test uses two objects, packobj and bigobj, that are always 5380 * updated together (i.e. in the same tx) so that their contents are 5381 * in sync and can be compared. Their contents relate to each other 5382 * in a simple way: packobj is a dense array of 'bufwad' structures, 5383 * while bigobj is a sparse array of the same bufwads. Specifically, 5384 * for any index n, there are three bufwads that should be identical: 5385 * 5386 * packobj, at offset n * sizeof (bufwad_t) 5387 * bigobj, at the head of the nth chunk 5388 * bigobj, at the tail of the nth chunk 5389 * 5390 * The chunk size is set equal to bigobj block size so that 5391 * dmu_assign_arcbuf_by_dbuf() can be tested for object updates. 5392 */ 5393 5394 /* 5395 * Read the directory info. If it's the first time, set things up. 
5396 */ 5397 ztest_od_init(od, id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, 0, 0); 5398 ztest_od_init(od + 1, id, FTAG, 1, DMU_OT_UINT64_OTHER, 0, 0, 5399 chunksize); 5400 5401 5402 if (ztest_object_init(zd, od, size, B_FALSE) != 0) { 5403 umem_free(od, size); 5404 return; 5405 } 5406 5407 bigobj = od[0].od_object; 5408 packobj = od[1].od_object; 5409 blocksize = od[0].od_blocksize; 5410 chunksize = blocksize; 5411 ASSERT3U(chunksize, ==, od[1].od_gen); 5412 5413 VERIFY0(dmu_object_info(os, bigobj, &doi)); 5414 VERIFY(ISP2(doi.doi_data_block_size)); 5415 VERIFY3U(chunksize, ==, doi.doi_data_block_size); 5416 VERIFY3U(chunksize, >=, 2 * sizeof (bufwad_t)); 5417 5418 /* 5419 * Pick a random index and compute the offsets into packobj and bigobj. 5420 */ 5421 n = ztest_random(regions) * stride + ztest_random(width); 5422 s = 1 + ztest_random(width - 1); 5423 5424 packoff = n * sizeof (bufwad_t); 5425 packsize = s * sizeof (bufwad_t); 5426 5427 bigoff = n * chunksize; 5428 bigsize = s * chunksize; 5429 5430 packbuf = umem_zalloc(packsize, UMEM_NOFAIL); 5431 bigbuf = umem_zalloc(bigsize, UMEM_NOFAIL); 5432 5433 VERIFY0(dmu_bonus_hold(os, bigobj, FTAG, &bonus_db)); 5434 5435 bigbuf_arcbufs = umem_zalloc(2 * s * sizeof (arc_buf_t *), UMEM_NOFAIL); 5436 5437 /* 5438 * Iteration 0 test zcopy for DB_UNCACHED dbufs. 5439 * Iteration 1 test zcopy to already referenced dbufs. 5440 * Iteration 2 test zcopy to dirty dbuf in the same txg. 5441 * Iteration 3 test zcopy to dbuf dirty in previous txg. 5442 * Iteration 4 test zcopy when dbuf is no longer dirty. 5443 * Iteration 5 test zcopy when it can't be done. 5444 * Iteration 6 one more zcopy write. 5445 */ 5446 for (i = 0; i < 7; i++) { 5447 uint64_t j; 5448 uint64_t off; 5449 5450 /* 5451 * In iteration 5 (i == 5) use arcbufs 5452 * that don't match bigobj blksz to test 5453 * dmu_assign_arcbuf_by_dbuf() when it can't directly 5454 * assign an arcbuf to a dbuf. 5455 */ 5456 for (j = 0; j < s; j++) { 5457 if (i != 5 || chunksize < (SPA_MINBLOCKSIZE * 2)) { 5458 bigbuf_arcbufs[j] = 5459 dmu_request_arcbuf(bonus_db, chunksize); 5460 } else { 5461 bigbuf_arcbufs[2 * j] = 5462 dmu_request_arcbuf(bonus_db, chunksize / 2); 5463 bigbuf_arcbufs[2 * j + 1] = 5464 dmu_request_arcbuf(bonus_db, chunksize / 2); 5465 } 5466 } 5467 5468 /* 5469 * Get a tx for the mods to both packobj and bigobj. 5470 */ 5471 tx = dmu_tx_create(os); 5472 5473 dmu_tx_hold_write(tx, packobj, packoff, packsize); 5474 dmu_tx_hold_write(tx, bigobj, bigoff, bigsize); 5475 5476 txg = ztest_tx_assign(tx, DMU_TX_MIGHTWAIT, FTAG); 5477 if (txg == 0) { 5478 umem_free(packbuf, packsize); 5479 umem_free(bigbuf, bigsize); 5480 for (j = 0; j < s; j++) { 5481 if (i != 5 || 5482 chunksize < (SPA_MINBLOCKSIZE * 2)) { 5483 dmu_return_arcbuf(bigbuf_arcbufs[j]); 5484 } else { 5485 dmu_return_arcbuf( 5486 bigbuf_arcbufs[2 * j]); 5487 dmu_return_arcbuf( 5488 bigbuf_arcbufs[2 * j + 1]); 5489 } 5490 } 5491 umem_free(bigbuf_arcbufs, 2 * s * sizeof (arc_buf_t *)); 5492 umem_free(od, size); 5493 dmu_buf_rele(bonus_db, FTAG); 5494 return; 5495 } 5496 5497 /* 5498 * 50% of the time don't read objects in the 1st iteration to 5499 * test dmu_assign_arcbuf_by_dbuf() for the case when there are 5500 * no existing dbufs for the specified offsets. 
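 * (On every later iteration we always pre-read, so the
 * compare_and_update_pbbufs() call below starts from the bufwads
 * written by the previous pass.)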
5501 */ 5502 if (i != 0 || ztest_random(2) != 0) { 5503 error = dmu_read(os, packobj, packoff, 5504 packsize, packbuf, dmu_read_flags); 5505 ASSERT0(error); 5506 error = dmu_read(os, bigobj, bigoff, bigsize, 5507 bigbuf, dmu_read_flags); 5508 ASSERT0(error); 5509 } 5510 compare_and_update_pbbufs(s, packbuf, bigbuf, bigsize, 5511 n, chunksize, txg); 5512 5513 /* 5514 * We've verified all the old bufwads, and made new ones. 5515 * Now write them out. 5516 */ 5517 dmu_write(os, packobj, packoff, packsize, packbuf, tx); 5518 if (ztest_opts.zo_verbose >= 7) { 5519 (void) printf("writing offset %"PRIx64" size %"PRIx64"" 5520 " txg %"PRIx64"\n", 5521 bigoff, bigsize, txg); 5522 } 5523 for (off = bigoff, j = 0; j < s; j++, off += chunksize) { 5524 dmu_buf_t *dbt; 5525 if (i != 5 || chunksize < (SPA_MINBLOCKSIZE * 2)) { 5526 memcpy(bigbuf_arcbufs[j]->b_data, 5527 (caddr_t)bigbuf + (off - bigoff), 5528 chunksize); 5529 } else { 5530 memcpy(bigbuf_arcbufs[2 * j]->b_data, 5531 (caddr_t)bigbuf + (off - bigoff), 5532 chunksize / 2); 5533 memcpy(bigbuf_arcbufs[2 * j + 1]->b_data, 5534 (caddr_t)bigbuf + (off - bigoff) + 5535 chunksize / 2, 5536 chunksize / 2); 5537 } 5538 5539 if (i == 1) { 5540 VERIFY(dmu_buf_hold(os, bigobj, off, 5541 FTAG, &dbt, DMU_READ_NO_PREFETCH) == 0); 5542 } 5543 if (i != 5 || chunksize < (SPA_MINBLOCKSIZE * 2)) { 5544 VERIFY0(dmu_assign_arcbuf_by_dbuf(bonus_db, 5545 off, bigbuf_arcbufs[j], tx, 0)); 5546 } else { 5547 VERIFY0(dmu_assign_arcbuf_by_dbuf(bonus_db, 5548 off, bigbuf_arcbufs[2 * j], tx, 0)); 5549 VERIFY0(dmu_assign_arcbuf_by_dbuf(bonus_db, 5550 off + chunksize / 2, 5551 bigbuf_arcbufs[2 * j + 1], tx, 0)); 5552 } 5553 if (i == 1) { 5554 dmu_buf_rele(dbt, FTAG); 5555 } 5556 } 5557 dmu_tx_commit(tx); 5558 5559 /* 5560 * Sanity check the stuff we just wrote. 5561 */ 5562 { 5563 void *packcheck = umem_alloc(packsize, UMEM_NOFAIL); 5564 void *bigcheck = umem_alloc(bigsize, UMEM_NOFAIL); 5565 5566 VERIFY0(dmu_read(os, packobj, packoff, 5567 packsize, packcheck, dmu_read_flags)); 5568 VERIFY0(dmu_read(os, bigobj, bigoff, 5569 bigsize, bigcheck, dmu_read_flags)); 5570 5571 ASSERT0(memcmp(packbuf, packcheck, packsize)); 5572 ASSERT0(memcmp(bigbuf, bigcheck, bigsize)); 5573 5574 umem_free(packcheck, packsize); 5575 umem_free(bigcheck, bigsize); 5576 } 5577 if (i == 2) { 5578 txg_wait_open(dmu_objset_pool(os), 0, B_TRUE); 5579 } else if (i == 3) { 5580 txg_wait_synced(dmu_objset_pool(os), 0); 5581 } 5582 } 5583 5584 dmu_buf_rele(bonus_db, FTAG); 5585 umem_free(packbuf, packsize); 5586 umem_free(bigbuf, bigsize); 5587 umem_free(bigbuf_arcbufs, 2 * s * sizeof (arc_buf_t *)); 5588 umem_free(od, size); 5589 } 5590 5591 void 5592 ztest_dmu_write_parallel(ztest_ds_t *zd, uint64_t id) 5593 { 5594 (void) id; 5595 ztest_od_t *od; 5596 5597 od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL); 5598 uint64_t offset = (1ULL << (ztest_random(20) + 43)) + 5599 (ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT); 5600 5601 /* 5602 * Have multiple threads write to large offsets in an object 5603 * to verify that parallel writes to an object -- even to the 5604 * same blocks within the object -- doesn't cause any trouble. 
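 * Every caller passes ID_PARALLEL below, so all threads share a
 * single object; the offset chosen above starts at 8TB or more
 * (1ULL << 43), presumably to stay well clear of anything else
 * written to that shared object.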
5605 */ 5606 ztest_od_init(od, ID_PARALLEL, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0, 0); 5607 5608 if (ztest_object_init(zd, od, sizeof (ztest_od_t), B_FALSE) != 0) 5609 return; 5610 5611 while (ztest_random(10) != 0) 5612 ztest_io(zd, od->od_object, offset); 5613 5614 umem_free(od, sizeof (ztest_od_t)); 5615 } 5616 5617 void 5618 ztest_dmu_prealloc(ztest_ds_t *zd, uint64_t id) 5619 { 5620 ztest_od_t *od; 5621 uint64_t offset = (1ULL << (ztest_random(4) + SPA_MAXBLOCKSHIFT)) + 5622 (ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT); 5623 uint64_t count = ztest_random(20) + 1; 5624 uint64_t blocksize = ztest_random_blocksize(); 5625 void *data; 5626 5627 od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL); 5628 5629 ztest_od_init(od, id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, 0, 0); 5630 5631 if (ztest_object_init(zd, od, sizeof (ztest_od_t), 5632 !ztest_random(2)) != 0) { 5633 umem_free(od, sizeof (ztest_od_t)); 5634 return; 5635 } 5636 5637 if (ztest_truncate(zd, od->od_object, offset, count * blocksize) != 0) { 5638 umem_free(od, sizeof (ztest_od_t)); 5639 return; 5640 } 5641 5642 ztest_prealloc(zd, od->od_object, offset, count * blocksize); 5643 5644 data = umem_zalloc(blocksize, UMEM_NOFAIL); 5645 5646 while (ztest_random(count) != 0) { 5647 uint64_t randoff = offset + (ztest_random(count) * blocksize); 5648 if (ztest_write(zd, od->od_object, randoff, blocksize, 5649 data) != 0) 5650 break; 5651 while (ztest_random(4) != 0) 5652 ztest_io(zd, od->od_object, randoff); 5653 } 5654 5655 umem_free(data, blocksize); 5656 umem_free(od, sizeof (ztest_od_t)); 5657 } 5658 5659 /* 5660 * Verify that zap_{create,destroy,add,remove,update} work as expected. 5661 */ 5662 #define ZTEST_ZAP_MIN_INTS 1 5663 #define ZTEST_ZAP_MAX_INTS 4 5664 #define ZTEST_ZAP_MAX_PROPS 1000 5665 5666 void 5667 ztest_zap(ztest_ds_t *zd, uint64_t id) 5668 { 5669 objset_t *os = zd->zd_os; 5670 ztest_od_t *od; 5671 uint64_t object; 5672 uint64_t txg, last_txg; 5673 uint64_t value[ZTEST_ZAP_MAX_INTS]; 5674 uint64_t zl_ints, zl_intsize, prop; 5675 int i, ints; 5676 dmu_tx_t *tx; 5677 char propname[100], txgname[100]; 5678 int error; 5679 const char *const hc[2] = { "s.acl.h", ".s.open.h.hyLZlg" }; 5680 5681 od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL); 5682 ztest_od_init(od, id, FTAG, 0, DMU_OT_ZAP_OTHER, 0, 0, 0); 5683 5684 if (ztest_object_init(zd, od, sizeof (ztest_od_t), 5685 !ztest_random(2)) != 0) 5686 goto out; 5687 5688 object = od->od_object; 5689 5690 /* 5691 * Generate a known hash collision, and verify that 5692 * we can lookup and remove both entries. 5693 */ 5694 tx = dmu_tx_create(os); 5695 dmu_tx_hold_zap(tx, object, B_TRUE, NULL); 5696 txg = ztest_tx_assign(tx, DMU_TX_MIGHTWAIT, FTAG); 5697 if (txg == 0) 5698 goto out; 5699 for (i = 0; i < 2; i++) { 5700 value[i] = i; 5701 VERIFY0(zap_add(os, object, hc[i], sizeof (uint64_t), 5702 1, &value[i], tx)); 5703 } 5704 for (i = 0; i < 2; i++) { 5705 VERIFY3U(EEXIST, ==, zap_add(os, object, hc[i], 5706 sizeof (uint64_t), 1, &value[i], tx)); 5707 VERIFY0( 5708 zap_length(os, object, hc[i], &zl_intsize, &zl_ints)); 5709 ASSERT3U(zl_intsize, ==, sizeof (uint64_t)); 5710 ASSERT3U(zl_ints, ==, 1); 5711 } 5712 for (i = 0; i < 2; i++) { 5713 VERIFY0(zap_remove(os, object, hc[i], tx)); 5714 } 5715 dmu_tx_commit(tx); 5716 5717 /* 5718 * Generate a bunch of random entries. 
5719 */ 5720 ints = MAX(ZTEST_ZAP_MIN_INTS, object % ZTEST_ZAP_MAX_INTS); 5721 5722 prop = ztest_random(ZTEST_ZAP_MAX_PROPS); 5723 (void) sprintf(propname, "prop_%"PRIu64"", prop); 5724 (void) sprintf(txgname, "txg_%"PRIu64"", prop); 5725 memset(value, 0, sizeof (value)); 5726 last_txg = 0; 5727 5728 /* 5729 * If these zap entries already exist, validate their contents. 5730 */ 5731 error = zap_length(os, object, txgname, &zl_intsize, &zl_ints); 5732 if (error == 0) { 5733 ASSERT3U(zl_intsize, ==, sizeof (uint64_t)); 5734 ASSERT3U(zl_ints, ==, 1); 5735 5736 VERIFY0(zap_lookup(os, object, txgname, zl_intsize, 5737 zl_ints, &last_txg)); 5738 5739 VERIFY0(zap_length(os, object, propname, &zl_intsize, 5740 &zl_ints)); 5741 5742 ASSERT3U(zl_intsize, ==, sizeof (uint64_t)); 5743 ASSERT3U(zl_ints, ==, ints); 5744 5745 VERIFY0(zap_lookup(os, object, propname, zl_intsize, 5746 zl_ints, value)); 5747 5748 for (i = 0; i < ints; i++) { 5749 ASSERT3U(value[i], ==, last_txg + object + i); 5750 } 5751 } else { 5752 ASSERT3U(error, ==, ENOENT); 5753 } 5754 5755 /* 5756 * Atomically update two entries in our zap object. 5757 * The first is named txg_%llu, and contains the txg 5758 * in which the property was last updated. The second 5759 * is named prop_%llu, and the nth element of its value 5760 * should be txg + object + n. 5761 */ 5762 tx = dmu_tx_create(os); 5763 dmu_tx_hold_zap(tx, object, B_TRUE, NULL); 5764 txg = ztest_tx_assign(tx, DMU_TX_MIGHTWAIT, FTAG); 5765 if (txg == 0) 5766 goto out; 5767 5768 if (last_txg > txg) 5769 fatal(B_FALSE, "zap future leak: old %"PRIu64" new %"PRIu64"", 5770 last_txg, txg); 5771 5772 for (i = 0; i < ints; i++) 5773 value[i] = txg + object + i; 5774 5775 VERIFY0(zap_update(os, object, txgname, sizeof (uint64_t), 5776 1, &txg, tx)); 5777 VERIFY0(zap_update(os, object, propname, sizeof (uint64_t), 5778 ints, value, tx)); 5779 5780 dmu_tx_commit(tx); 5781 5782 /* 5783 * Remove a random pair of entries. 5784 */ 5785 prop = ztest_random(ZTEST_ZAP_MAX_PROPS); 5786 (void) sprintf(propname, "prop_%"PRIu64"", prop); 5787 (void) sprintf(txgname, "txg_%"PRIu64"", prop); 5788 5789 error = zap_length(os, object, txgname, &zl_intsize, &zl_ints); 5790 5791 if (error == ENOENT) 5792 goto out; 5793 5794 ASSERT0(error); 5795 5796 tx = dmu_tx_create(os); 5797 dmu_tx_hold_zap(tx, object, B_TRUE, NULL); 5798 txg = ztest_tx_assign(tx, DMU_TX_MIGHTWAIT, FTAG); 5799 if (txg == 0) 5800 goto out; 5801 VERIFY0(zap_remove(os, object, txgname, tx)); 5802 VERIFY0(zap_remove(os, object, propname, tx)); 5803 dmu_tx_commit(tx); 5804 out: 5805 umem_free(od, sizeof (ztest_od_t)); 5806 } 5807 5808 /* 5809 * Test case to test the upgrading of a microzap to fatzap. 5810 */ 5811 void 5812 ztest_fzap(ztest_ds_t *zd, uint64_t id) 5813 { 5814 objset_t *os = zd->zd_os; 5815 ztest_od_t *od; 5816 uint64_t object, txg, value; 5817 5818 od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL); 5819 ztest_od_init(od, id, FTAG, 0, DMU_OT_ZAP_OTHER, 0, 0, 0); 5820 5821 if (ztest_object_init(zd, od, sizeof (ztest_od_t), 5822 !ztest_random(2)) != 0) 5823 goto out; 5824 object = od->od_object; 5825 5826 /* 5827 * Add entries to this ZAP and make sure it spills over 5828 * and gets upgraded to a fatzap. Also, since we are adding 5829 * 2050 entries we should see ptrtbl growth and leaf-block split. 
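 * (A microzap entry is 64 bytes and a microzap is limited to a
 * single 128K block, so it tops out at roughly 2047 entries;
 * 2050 is just past that limit and guarantees the upgrade.)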
5830 */ 5831 for (value = 0; value < 2050; value++) { 5832 char name[ZFS_MAX_DATASET_NAME_LEN]; 5833 dmu_tx_t *tx; 5834 int error; 5835 5836 (void) snprintf(name, sizeof (name), "fzap-%"PRIu64"-%"PRIu64"", 5837 id, value); 5838 5839 tx = dmu_tx_create(os); 5840 dmu_tx_hold_zap(tx, object, B_TRUE, name); 5841 txg = ztest_tx_assign(tx, DMU_TX_MIGHTWAIT, FTAG); 5842 if (txg == 0) 5843 goto out; 5844 error = zap_add(os, object, name, sizeof (uint64_t), 1, 5845 &value, tx); 5846 ASSERT(error == 0 || error == EEXIST); 5847 dmu_tx_commit(tx); 5848 } 5849 out: 5850 umem_free(od, sizeof (ztest_od_t)); 5851 } 5852 5853 void 5854 ztest_zap_parallel(ztest_ds_t *zd, uint64_t id) 5855 { 5856 (void) id; 5857 objset_t *os = zd->zd_os; 5858 ztest_od_t *od; 5859 uint64_t txg, object, count, wsize, wc, zl_wsize, zl_wc; 5860 dmu_tx_t *tx; 5861 int i, namelen, error; 5862 int micro = ztest_random(2); 5863 char name[20], string_value[20]; 5864 void *data; 5865 5866 od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL); 5867 ztest_od_init(od, ID_PARALLEL, FTAG, micro, DMU_OT_ZAP_OTHER, 0, 0, 0); 5868 5869 if (ztest_object_init(zd, od, sizeof (ztest_od_t), B_FALSE) != 0) { 5870 umem_free(od, sizeof (ztest_od_t)); 5871 return; 5872 } 5873 5874 object = od->od_object; 5875 5876 /* 5877 * Generate a random name of the form 'xxx.....' where each 5878 * x is a random printable character and the dots are dots. 5879 * There are 94 such characters, and the name length goes from 5880 * 6 to 20, so there are 94^3 * 15 = 12,458,760 possible names. 5881 */ 5882 namelen = ztest_random(sizeof (name) - 5) + 5 + 1; 5883 5884 for (i = 0; i < 3; i++) 5885 name[i] = '!' + ztest_random('~' - '!' + 1); 5886 for (; i < namelen - 1; i++) 5887 name[i] = '.'; 5888 name[i] = '\0'; 5889 5890 if ((namelen & 1) || micro) { 5891 wsize = sizeof (txg); 5892 wc = 1; 5893 data = &txg; 5894 } else { 5895 wsize = 1; 5896 wc = namelen; 5897 data = string_value; 5898 } 5899 5900 count = -1ULL; 5901 VERIFY0(zap_count(os, object, &count)); 5902 ASSERT3S(count, !=, -1ULL); 5903 5904 /* 5905 * Select an operation: length, lookup, add, update, remove. 
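 * (0 = length, 1 = lookup, 2 = add, 3 = update, 4 = remove;
 * only the mutating operations, i >= 2, need a transaction.)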
5906 */ 5907 i = ztest_random(5); 5908 5909 if (i >= 2) { 5910 tx = dmu_tx_create(os); 5911 dmu_tx_hold_zap(tx, object, B_TRUE, NULL); 5912 txg = ztest_tx_assign(tx, DMU_TX_MIGHTWAIT, FTAG); 5913 if (txg == 0) { 5914 umem_free(od, sizeof (ztest_od_t)); 5915 return; 5916 } 5917 memcpy(string_value, name, namelen); 5918 } else { 5919 tx = NULL; 5920 txg = 0; 5921 memset(string_value, 0, namelen); 5922 } 5923 5924 switch (i) { 5925 5926 case 0: 5927 error = zap_length(os, object, name, &zl_wsize, &zl_wc); 5928 if (error == 0) { 5929 ASSERT3U(wsize, ==, zl_wsize); 5930 ASSERT3U(wc, ==, zl_wc); 5931 } else { 5932 ASSERT3U(error, ==, ENOENT); 5933 } 5934 break; 5935 5936 case 1: 5937 error = zap_lookup(os, object, name, wsize, wc, data); 5938 if (error == 0) { 5939 if (data == string_value && 5940 memcmp(name, data, namelen) != 0) 5941 fatal(B_FALSE, "name '%s' != val '%s' len %d", 5942 name, (char *)data, namelen); 5943 } else { 5944 ASSERT3U(error, ==, ENOENT); 5945 } 5946 break; 5947 5948 case 2: 5949 error = zap_add(os, object, name, wsize, wc, data, tx); 5950 ASSERT(error == 0 || error == EEXIST); 5951 break; 5952 5953 case 3: 5954 VERIFY0(zap_update(os, object, name, wsize, wc, data, tx)); 5955 break; 5956 5957 case 4: 5958 error = zap_remove(os, object, name, tx); 5959 ASSERT(error == 0 || error == ENOENT); 5960 break; 5961 } 5962 5963 if (tx != NULL) 5964 dmu_tx_commit(tx); 5965 5966 umem_free(od, sizeof (ztest_od_t)); 5967 } 5968 5969 /* 5970 * Commit callback data. 5971 */ 5972 typedef struct ztest_cb_data { 5973 list_node_t zcd_node; 5974 uint64_t zcd_txg; 5975 int zcd_expected_err; 5976 boolean_t zcd_added; 5977 boolean_t zcd_called; 5978 spa_t *zcd_spa; 5979 } ztest_cb_data_t; 5980 5981 /* This is the actual commit callback function */ 5982 static void 5983 ztest_commit_callback(void *arg, int error) 5984 { 5985 ztest_cb_data_t *data = arg; 5986 uint64_t synced_txg; 5987 5988 VERIFY3P(data, !=, NULL); 5989 VERIFY3S(data->zcd_expected_err, ==, error); 5990 VERIFY(!data->zcd_called); 5991 5992 synced_txg = spa_last_synced_txg(data->zcd_spa); 5993 if (data->zcd_txg > synced_txg) 5994 fatal(B_FALSE, 5995 "commit callback of txg %"PRIu64" called prematurely, " 5996 "last synced txg = %"PRIu64"\n", 5997 data->zcd_txg, synced_txg); 5998 5999 data->zcd_called = B_TRUE; 6000 6001 if (error == ECANCELED) { 6002 ASSERT0(data->zcd_txg); 6003 ASSERT(!data->zcd_added); 6004 6005 /* 6006 * The private callback data should be destroyed here, but 6007 * since we are going to check the zcd_called field after 6008 * dmu_tx_abort(), we will destroy it there. 
6009 */
6010 return;
6011 }
6012
6013 ASSERT(data->zcd_added);
6014 ASSERT3U(data->zcd_txg, !=, 0);
6015
6016 (void) mutex_enter(&zcl.zcl_callbacks_lock);
6017
6018 /* See if this cb was called sooner than any previous one */
6019 if ((synced_txg - data->zcd_txg) < zc_min_txg_delay)
6020 zc_min_txg_delay = synced_txg - data->zcd_txg;
6021
6022 /* Remove our callback from the list */
6023 list_remove(&zcl.zcl_callbacks, data);
6024
6025 (void) mutex_exit(&zcl.zcl_callbacks_lock);
6026
6027 umem_free(data, sizeof (ztest_cb_data_t));
6028 }
6029
6030 /* Allocate and initialize callback data structure */
6031 static ztest_cb_data_t *
6032 ztest_create_cb_data(objset_t *os, uint64_t txg)
6033 {
6034 ztest_cb_data_t *cb_data;
6035
6036 cb_data = umem_zalloc(sizeof (ztest_cb_data_t), UMEM_NOFAIL);
6037
6038 cb_data->zcd_txg = txg;
6039 cb_data->zcd_spa = dmu_objset_spa(os);
6040 list_link_init(&cb_data->zcd_node);
6041
6042 return (cb_data);
6043 }
6044
6045 /*
6046 * Commit callback test.
6047 */
6048 void
6049 ztest_dmu_commit_callbacks(ztest_ds_t *zd, uint64_t id)
6050 {
6051 objset_t *os = zd->zd_os;
6052 ztest_od_t *od;
6053 dmu_tx_t *tx;
6054 ztest_cb_data_t *cb_data[3], *tmp_cb;
6055 uint64_t old_txg, txg;
6056 int i, error = 0;
6057
6058 od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL);
6059 ztest_od_init(od, id, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0, 0);
6060
6061 if (ztest_object_init(zd, od, sizeof (ztest_od_t), B_FALSE) != 0) {
6062 umem_free(od, sizeof (ztest_od_t));
6063 return;
6064 }
6065
6066 tx = dmu_tx_create(os);
6067
6068 cb_data[0] = ztest_create_cb_data(os, 0);
6069 dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[0]);
6070
6071 dmu_tx_hold_write(tx, od->od_object, 0, sizeof (uint64_t));
6072
6073 /* Every once in a while, abort the transaction on purpose */
6074 if (ztest_random(100) == 0)
6075 error = -1;
6076
6077 if (!error)
6078 error = dmu_tx_assign(tx, DMU_TX_NOWAIT);
6079
6080 txg = error ? 0 : dmu_tx_get_txg(tx);
6081
6082 cb_data[0]->zcd_txg = txg;
6083 cb_data[1] = ztest_create_cb_data(os, txg);
6084 dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[1]);
6085
6086 if (error) {
6087 /*
6088 * It's not a strict requirement to call the registered
6089 * callbacks from inside dmu_tx_abort(), but that's what
6090 * is supposed to happen in the current implementation,
6091 * so we will check for that.
6092 */
6093 for (i = 0; i < 2; i++) {
6094 cb_data[i]->zcd_expected_err = ECANCELED;
6095 VERIFY(!cb_data[i]->zcd_called);
6096 }
6097
6098 dmu_tx_abort(tx);
6099
6100 for (i = 0; i < 2; i++) {
6101 VERIFY(cb_data[i]->zcd_called);
6102 umem_free(cb_data[i], sizeof (ztest_cb_data_t));
6103 }
6104
6105 umem_free(od, sizeof (ztest_od_t));
6106 return;
6107 }
6108
6109 cb_data[2] = ztest_create_cb_data(os, txg);
6110 dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[2]);
6111
6112 /*
6113 * Read existing data to make sure there isn't a future leak.
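 * (A "future leak" would mean the txg recorded by an earlier write
 * is greater than the currently open txg, i.e. the data appears to
 * come from the future; that condition is fatal below.)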
6114 */
6115 VERIFY0(dmu_read(os, od->od_object, 0, sizeof (uint64_t),
6116 &old_txg, DMU_READ_PREFETCH));
6117
6118 if (old_txg > txg)
6119 fatal(B_FALSE,
6120 "future leak: got %"PRIu64", open txg is %"PRIu64"",
6121 old_txg, txg);
6122
6123 dmu_write(os, od->od_object, 0, sizeof (uint64_t), &txg, tx);
6124
6125 (void) mutex_enter(&zcl.zcl_callbacks_lock);
6126
6127 /*
6128 * Since commit callbacks don't have any ordering requirement and since
6129 * it is theoretically possible for a commit callback to be called
6130 * after an arbitrary amount of time has elapsed since its txg has been
6131 * synced, it is difficult to reliably determine whether a commit
6132 * callback hasn't been called due to high load or due to a flawed
6133 * implementation.
6134 *
6135 * In practice, we will assume that if after a certain number of txgs a
6136 * commit callback hasn't been called, then most likely there's an
6137 * implementation bug.
6138 */
6139 tmp_cb = list_head(&zcl.zcl_callbacks);
6140 if (tmp_cb != NULL &&
6141 tmp_cb->zcd_txg + ZTEST_COMMIT_CB_THRESH < txg) {
6142 fatal(B_FALSE,
6143 "Commit callback threshold exceeded, "
6144 "oldest txg: %"PRIu64", open txg: %"PRIu64"\n",
6145 tmp_cb->zcd_txg, txg);
6146 }
6147
6148 /*
6149 * Let's find the place to insert our callbacks.
6150 *
6151 * Even though the list is ordered by txg, it is possible for the
6152 * insertion point to not be the end because our txg may already be
6153 * quiescing at this point and other callbacks in the open txg
6154 * (from other objsets) may have sneaked in.
6155 */
6156 tmp_cb = list_tail(&zcl.zcl_callbacks);
6157 while (tmp_cb != NULL && tmp_cb->zcd_txg > txg)
6158 tmp_cb = list_prev(&zcl.zcl_callbacks, tmp_cb);
6159
6160 /* Add the 3 callbacks to the list */
6161 for (i = 0; i < 3; i++) {
6162 if (tmp_cb == NULL)
6163 list_insert_head(&zcl.zcl_callbacks, cb_data[i]);
6164 else
6165 list_insert_after(&zcl.zcl_callbacks, tmp_cb,
6166 cb_data[i]);
6167
6168 cb_data[i]->zcd_added = B_TRUE;
6169 VERIFY(!cb_data[i]->zcd_called);
6170
6171 tmp_cb = cb_data[i];
6172 }
6173
6174 zc_cb_counter += 3;
6175
6176 (void) mutex_exit(&zcl.zcl_callbacks_lock);
6177
6178 dmu_tx_commit(tx);
6179
6180 umem_free(od, sizeof (ztest_od_t));
6181 }
6182
6183 /*
6184 * Visit each object in the dataset. Verify that its properties
6185 * are consistent with what was stored in the block tag when it was created,
6186 * and that its unused bonus buffer space has not been overwritten.
6187 */ 6188 void 6189 ztest_verify_dnode_bt(ztest_ds_t *zd, uint64_t id) 6190 { 6191 (void) id; 6192 objset_t *os = zd->zd_os; 6193 uint64_t obj; 6194 int err = 0; 6195 6196 for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE, 0)) { 6197 ztest_block_tag_t *bt = NULL; 6198 dmu_object_info_t doi; 6199 dmu_buf_t *db; 6200 6201 ztest_object_lock(zd, obj, ZTRL_READER); 6202 if (dmu_bonus_hold(os, obj, FTAG, &db) != 0) { 6203 ztest_object_unlock(zd, obj); 6204 continue; 6205 } 6206 6207 dmu_object_info_from_db(db, &doi); 6208 if (doi.doi_bonus_size >= sizeof (*bt)) 6209 bt = ztest_bt_bonus(db); 6210 6211 if (bt && bt->bt_magic == BT_MAGIC) { 6212 ztest_bt_verify(bt, os, obj, doi.doi_dnodesize, 6213 bt->bt_offset, bt->bt_gen, bt->bt_txg, 6214 bt->bt_crtxg); 6215 ztest_verify_unused_bonus(db, bt, obj, os, bt->bt_gen); 6216 } 6217 6218 dmu_buf_rele(db, FTAG); 6219 ztest_object_unlock(zd, obj); 6220 } 6221 } 6222 6223 void 6224 ztest_dsl_prop_get_set(ztest_ds_t *zd, uint64_t id) 6225 { 6226 (void) id; 6227 zfs_prop_t proplist[] = { 6228 ZFS_PROP_CHECKSUM, 6229 ZFS_PROP_COMPRESSION, 6230 ZFS_PROP_COPIES, 6231 ZFS_PROP_DEDUP 6232 }; 6233 6234 (void) pthread_rwlock_rdlock(&ztest_name_lock); 6235 6236 for (int p = 0; p < sizeof (proplist) / sizeof (proplist[0]); p++) { 6237 int error = ztest_dsl_prop_set_uint64(zd->zd_name, proplist[p], 6238 ztest_random_dsl_prop(proplist[p]), (int)ztest_random(2)); 6239 ASSERT(error == 0 || error == ENOSPC); 6240 } 6241 6242 int error = ztest_dsl_prop_set_uint64(zd->zd_name, ZFS_PROP_RECORDSIZE, 6243 ztest_random_blocksize(), (int)ztest_random(2)); 6244 ASSERT(error == 0 || error == ENOSPC); 6245 6246 (void) pthread_rwlock_unlock(&ztest_name_lock); 6247 } 6248 6249 void 6250 ztest_spa_prop_get_set(ztest_ds_t *zd, uint64_t id) 6251 { 6252 (void) zd, (void) id; 6253 6254 (void) pthread_rwlock_rdlock(&ztest_name_lock); 6255 6256 (void) ztest_spa_prop_set_uint64(ZPOOL_PROP_AUTOTRIM, ztest_random(2)); 6257 6258 nvlist_t *props = fnvlist_alloc(); 6259 6260 VERIFY0(spa_prop_get(ztest_spa, props)); 6261 6262 if (ztest_opts.zo_verbose >= 6) 6263 dump_nvlist(props, 4); 6264 6265 fnvlist_free(props); 6266 6267 (void) pthread_rwlock_unlock(&ztest_name_lock); 6268 } 6269 6270 static int 6271 user_release_one(const char *snapname, const char *holdname) 6272 { 6273 nvlist_t *snaps, *holds; 6274 int error; 6275 6276 snaps = fnvlist_alloc(); 6277 holds = fnvlist_alloc(); 6278 fnvlist_add_boolean(holds, holdname); 6279 fnvlist_add_nvlist(snaps, snapname, holds); 6280 fnvlist_free(holds); 6281 error = dsl_dataset_user_release(snaps, NULL); 6282 fnvlist_free(snaps); 6283 return (error); 6284 } 6285 6286 /* 6287 * Test snapshot hold/release and deferred destroy. 6288 */ 6289 void 6290 ztest_dmu_snapshot_hold(ztest_ds_t *zd, uint64_t id) 6291 { 6292 int error; 6293 objset_t *os = zd->zd_os; 6294 objset_t *origin; 6295 char snapname[100]; 6296 char fullname[100]; 6297 char clonename[100]; 6298 char tag[100]; 6299 char osname[ZFS_MAX_DATASET_NAME_LEN]; 6300 nvlist_t *holds; 6301 6302 (void) pthread_rwlock_rdlock(&ztest_name_lock); 6303 6304 dmu_objset_name(os, osname); 6305 6306 (void) snprintf(snapname, sizeof (snapname), "sh1_%"PRIu64"", id); 6307 (void) snprintf(fullname, sizeof (fullname), "%s@%s", osname, snapname); 6308 (void) snprintf(clonename, sizeof (clonename), "%s/ch1_%"PRIu64"", 6309 osname, id); 6310 (void) snprintf(tag, sizeof (tag), "tag_%"PRIu64"", id); 6311 6312 /* 6313 * Clean up from any previous run. 
6314 */ 6315 error = dsl_destroy_head(clonename); 6316 if (error != ENOENT) 6317 ASSERT0(error); 6318 error = user_release_one(fullname, tag); 6319 if (error != ESRCH && error != ENOENT) 6320 ASSERT0(error); 6321 error = dsl_destroy_snapshot(fullname, B_FALSE); 6322 if (error != ENOENT) 6323 ASSERT0(error); 6324 6325 /* 6326 * Create snapshot, clone it, mark snap for deferred destroy, 6327 * destroy clone, verify snap was also destroyed. 6328 */ 6329 error = dmu_objset_snapshot_one(osname, snapname); 6330 if (error) { 6331 if (error == ENOSPC) { 6332 ztest_record_enospc("dmu_objset_snapshot"); 6333 goto out; 6334 } 6335 fatal(B_FALSE, "dmu_objset_snapshot(%s) = %d", fullname, error); 6336 } 6337 6338 error = dsl_dataset_clone(clonename, fullname); 6339 if (error) { 6340 if (error == ENOSPC) { 6341 ztest_record_enospc("dsl_dataset_clone"); 6342 goto out; 6343 } 6344 fatal(B_FALSE, "dsl_dataset_clone(%s) = %d", clonename, error); 6345 } 6346 6347 error = dsl_destroy_snapshot(fullname, B_TRUE); 6348 if (error) { 6349 fatal(B_FALSE, "dsl_destroy_snapshot(%s, B_TRUE) = %d", 6350 fullname, error); 6351 } 6352 6353 error = dsl_destroy_head(clonename); 6354 if (error) 6355 fatal(B_FALSE, "dsl_destroy_head(%s) = %d", clonename, error); 6356 6357 error = dmu_objset_hold(fullname, FTAG, &origin); 6358 if (error != ENOENT) 6359 fatal(B_FALSE, "dmu_objset_hold(%s) = %d", fullname, error); 6360 6361 /* 6362 * Create snapshot, add temporary hold, verify that we can't 6363 * destroy a held snapshot, mark for deferred destroy, 6364 * release hold, verify snapshot was destroyed. 6365 */ 6366 error = dmu_objset_snapshot_one(osname, snapname); 6367 if (error) { 6368 if (error == ENOSPC) { 6369 ztest_record_enospc("dmu_objset_snapshot"); 6370 goto out; 6371 } 6372 fatal(B_FALSE, "dmu_objset_snapshot(%s) = %d", fullname, error); 6373 } 6374 6375 holds = fnvlist_alloc(); 6376 fnvlist_add_string(holds, fullname, tag); 6377 error = dsl_dataset_user_hold(holds, 0, NULL); 6378 fnvlist_free(holds); 6379 6380 if (error == ENOSPC) { 6381 ztest_record_enospc("dsl_dataset_user_hold"); 6382 goto out; 6383 } else if (error) { 6384 fatal(B_FALSE, "dsl_dataset_user_hold(%s, %s) = %u", 6385 fullname, tag, error); 6386 } 6387 6388 error = dsl_destroy_snapshot(fullname, B_FALSE); 6389 if (error != EBUSY) { 6390 fatal(B_FALSE, "dsl_destroy_snapshot(%s, B_FALSE) = %d", 6391 fullname, error); 6392 } 6393 6394 error = dsl_destroy_snapshot(fullname, B_TRUE); 6395 if (error) { 6396 fatal(B_FALSE, "dsl_destroy_snapshot(%s, B_TRUE) = %d", 6397 fullname, error); 6398 } 6399 6400 error = user_release_one(fullname, tag); 6401 if (error) 6402 fatal(B_FALSE, "user_release_one(%s, %s) = %d", 6403 fullname, tag, error); 6404 6405 VERIFY3U(dmu_objset_hold(fullname, FTAG, &origin), ==, ENOENT); 6406 6407 out: 6408 (void) pthread_rwlock_unlock(&ztest_name_lock); 6409 } 6410 6411 /* 6412 * Inject random faults into the on-disk data. 
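 * Broadly, three kinds of faults are injected: closing the file
 * descriptor behind a file vdev's back, flagging a vdev unreadable
 * or unwritable, and overwriting on-disk data with a bad word on a
 * carefully chosen leaf device.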
6413 */
6414 void
6415 ztest_fault_inject(ztest_ds_t *zd, uint64_t id)
6416 {
6417 (void) zd, (void) id;
6418 ztest_shared_t *zs = ztest_shared;
6419 spa_t *spa = ztest_spa;
6420 int fd;
6421 uint64_t offset;
6422 uint64_t leaves;
6423 uint64_t bad = 0x1990c0ffeedecadeull;
6424 uint64_t top, leaf;
6425 uint64_t raidz_children;
6426 char *path0;
6427 char *pathrand;
6428 size_t fsize;
6429 int bshift = SPA_MAXBLOCKSHIFT + 2;
6430 int iters = 1000;
6431 int maxfaults;
6432 int mirror_save;
6433 vdev_t *vd0 = NULL;
6434 uint64_t guid0 = 0;
6435 boolean_t islog = B_FALSE;
6436 boolean_t injected = B_FALSE;
6437
6438 path0 = umem_alloc(MAXPATHLEN, UMEM_NOFAIL);
6439 pathrand = umem_alloc(MAXPATHLEN, UMEM_NOFAIL);
6440
6441 mutex_enter(&ztest_vdev_lock);
6442
6443 /*
6444 * While device removal is in progress, fault injection must be disabled
6445 * until it completes and the pool is scrubbed. The fault injection
6446 * strategy for damaging blocks does not take into account evacuated
6447 * blocks which may have already been damaged.
6448 */
6449 if (ztest_device_removal_active)
6450 goto out;
6451
6452 /*
6453 * The fault injection strategy for damaging blocks cannot be used
6454 * if raidz expansion is in progress. The leaves value
6455 * (attached raidz children) is variable, and the strategy for damaging
6456 * blocks would corrupt the same data blocks on different child vdevs
6457 * because of the reflow process.
6458 */
6459 if (spa->spa_raidz_expand != NULL)
6460 goto out;
6461
6462 maxfaults = MAXFAULTS(zs);
6463 raidz_children = ztest_get_raidz_children(spa);
6464 leaves = MAX(zs->zs_mirrors, 1) * raidz_children;
6465 mirror_save = zs->zs_mirrors;
6466
6467 ASSERT3U(leaves, >=, 1);
6468
6469 /*
6470 * While ztest is running the number of leaves will not change. This
6471 * is critical for the fault injection logic as it determines where
6472 * errors can be safely injected such that they are always repairable.
6473 *
6474 * When restarting ztest a different number of leaves may be requested
6475 * which will shift the regions to be damaged. This is fine as long
6476 * as the pool has been scrubbed prior to using the new mapping.
6477 * Failure to do so can result in non-repairable damage being injected.
6478 */
6479 if (ztest_pool_scrubbed == B_FALSE)
6480 goto out;
6481
6482 /*
6483 * Grab the name lock as reader. There are some operations
6484 * which don't like to have their vdevs changed while
6485 * they are in progress (e.g. spa_change_guid). Those
6486 * operations will have grabbed the name lock as writer.
6487 */
6488 (void) pthread_rwlock_rdlock(&ztest_name_lock);
6489
6490 /*
6491 * We need SCL_STATE here because we're going to look at vd0->vdev_tsd.
6492 */
6493 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
6494
6495 if (ztest_random(2) == 0) {
6496 /*
6497 * Inject errors on a normal data device or slog device.
6498 */
6499 top = ztest_random_vdev_top(spa, B_TRUE);
6500 leaf = ztest_random(leaves) + zs->zs_splits;
6501
6502 /*
6503 * Generate paths to the first leaf in this top-level vdev,
6504 * and to the random leaf we selected. We'll induce transient
6505 * write failures and random online/offline activity on leaf 0,
6506 * and we'll write random garbage to the randomly chosen leaf.
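 * (For illustration, assuming leaves == 4 and no split devices:
 * top-level vdev 2 has its first leaf at device index 2 * 4 + 0 = 8,
 * and a random leaf value of 3 maps to device index 2 * 4 + 3 = 11.)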
6507 */ 6508 (void) snprintf(path0, MAXPATHLEN, ztest_dev_template, 6509 ztest_opts.zo_dir, ztest_opts.zo_pool, 6510 top * leaves + zs->zs_splits); 6511 (void) snprintf(pathrand, MAXPATHLEN, ztest_dev_template, 6512 ztest_opts.zo_dir, ztest_opts.zo_pool, 6513 top * leaves + leaf); 6514 6515 vd0 = vdev_lookup_by_path(spa->spa_root_vdev, path0); 6516 if (vd0 != NULL && vd0->vdev_top->vdev_islog) 6517 islog = B_TRUE; 6518 6519 /* 6520 * If the top-level vdev needs to be resilvered 6521 * then we only allow faults on the device that is 6522 * resilvering. 6523 */ 6524 if (vd0 != NULL && maxfaults != 1 && 6525 (!vdev_resilver_needed(vd0->vdev_top, NULL, NULL) || 6526 vd0->vdev_resilver_txg != 0)) { 6527 /* 6528 * Make vd0 explicitly claim to be unreadable, 6529 * or unwritable, or reach behind its back 6530 * and close the underlying fd. We can do this if 6531 * maxfaults == 0 because we'll fail and reexecute, 6532 * and we can do it if maxfaults >= 2 because we'll 6533 * have enough redundancy. If maxfaults == 1, the 6534 * combination of this with injection of random data 6535 * corruption below exceeds the pool's fault tolerance. 6536 */ 6537 vdev_file_t *vf = vd0->vdev_tsd; 6538 6539 zfs_dbgmsg("injecting fault to vdev %llu; maxfaults=%d", 6540 (long long)vd0->vdev_id, (int)maxfaults); 6541 6542 if (vf != NULL && ztest_random(3) == 0) { 6543 (void) close(vf->vf_file->f_fd); 6544 vf->vf_file->f_fd = -1; 6545 } else if (ztest_random(2) == 0) { 6546 vd0->vdev_cant_read = B_TRUE; 6547 } else { 6548 vd0->vdev_cant_write = B_TRUE; 6549 } 6550 guid0 = vd0->vdev_guid; 6551 } 6552 } else { 6553 /* 6554 * Inject errors on an l2cache device. 6555 */ 6556 spa_aux_vdev_t *sav = &spa->spa_l2cache; 6557 6558 if (sav->sav_count == 0) { 6559 spa_config_exit(spa, SCL_STATE, FTAG); 6560 (void) pthread_rwlock_unlock(&ztest_name_lock); 6561 goto out; 6562 } 6563 vd0 = sav->sav_vdevs[ztest_random(sav->sav_count)]; 6564 guid0 = vd0->vdev_guid; 6565 (void) strlcpy(path0, vd0->vdev_path, MAXPATHLEN); 6566 (void) strlcpy(pathrand, vd0->vdev_path, MAXPATHLEN); 6567 6568 leaf = 0; 6569 leaves = 1; 6570 maxfaults = INT_MAX; /* no limit on cache devices */ 6571 } 6572 6573 spa_config_exit(spa, SCL_STATE, FTAG); 6574 (void) pthread_rwlock_unlock(&ztest_name_lock); 6575 6576 /* 6577 * If we can tolerate two or more faults, or we're dealing 6578 * with a slog, randomly online/offline vd0. 6579 */ 6580 if ((maxfaults >= 2 || islog) && guid0 != 0) { 6581 if (ztest_random(10) < 6) { 6582 int flags = (ztest_random(2) == 0 ? 6583 ZFS_OFFLINE_TEMPORARY : 0); 6584 6585 /* 6586 * We have to grab the zs_name_lock as writer to 6587 * prevent a race between offlining a slog and 6588 * destroying a dataset. Offlining the slog will 6589 * grab a reference on the dataset which may cause 6590 * dsl_destroy_head() to fail with EBUSY thus 6591 * leaving the dataset in an inconsistent state. 6592 */ 6593 if (islog) 6594 (void) pthread_rwlock_wrlock(&ztest_name_lock); 6595 6596 VERIFY3U(vdev_offline(spa, guid0, flags), !=, EBUSY); 6597 6598 if (islog) 6599 (void) pthread_rwlock_unlock(&ztest_name_lock); 6600 } else { 6601 /* 6602 * Ideally we would like to be able to randomly 6603 * call vdev_[on|off]line without holding locks 6604 * to force unpredictable failures but the side 6605 * effects of vdev_[on|off]line prevent us from 6606 * doing so. 
6607 */ 6608 (void) vdev_online(spa, guid0, 0, NULL); 6609 } 6610 } 6611 6612 if (maxfaults == 0) 6613 goto out; 6614 6615 /* 6616 * We have at least single-fault tolerance, so inject data corruption. 6617 */ 6618 fd = open(pathrand, O_RDWR); 6619 6620 if (fd == -1) /* we hit a gap in the device namespace */ 6621 goto out; 6622 6623 fsize = lseek(fd, 0, SEEK_END); 6624 6625 while (--iters != 0) { 6626 /* 6627 * The offset must be chosen carefully to ensure that 6628 * we do not inject a given logical block with errors 6629 * on two different leaf devices, because ZFS can not 6630 * tolerate that (if maxfaults==1). 6631 * 6632 * To achieve this we divide each leaf device into 6633 * chunks of size (# leaves * SPA_MAXBLOCKSIZE * 4). 6634 * Each chunk is further divided into error-injection 6635 * ranges (can accept errors) and clear ranges (we do 6636 * not inject errors in those). Each error-injection 6637 * range can accept errors only for a single leaf vdev. 6638 * Error-injection ranges are separated by clear ranges. 6639 * 6640 * For example, with 3 leaves, each chunk looks like: 6641 * 0 to 32M: injection range for leaf 0 6642 * 32M to 64M: clear range - no injection allowed 6643 * 64M to 96M: injection range for leaf 1 6644 * 96M to 128M: clear range - no injection allowed 6645 * 128M to 160M: injection range for leaf 2 6646 * 160M to 192M: clear range - no injection allowed 6647 * 6648 * Each clear range must be large enough such that a 6649 * single block cannot straddle it. This way a block 6650 * can't be a target in two different injection ranges 6651 * (on different leaf vdevs). 6652 */ 6653 offset = ztest_random(fsize / (leaves << bshift)) * 6654 (leaves << bshift) + (leaf << bshift) + 6655 (ztest_random(1ULL << (bshift - 1)) & -8ULL); 6656 6657 /* 6658 * Only allow damage to the labels at one end of the vdev. 6659 * 6660 * If all labels are damaged, the device will be totally 6661 * inaccessible, which will result in loss of data, 6662 * because we also damage (parts of) the other side of 6663 * the mirror/raidz. 6664 * 6665 * Additionally, we will always have both an even and an 6666 * odd label, so that we can handle crashes in the 6667 * middle of vdev_config_sync(). 6668 */ 6669 if ((leaf & 1) == 0 && offset < VDEV_LABEL_START_SIZE) 6670 continue; 6671 6672 /* 6673 * The two end labels are stored at the "end" of the disk, but 6674 * the end of the disk (vdev_psize) is aligned to 6675 * sizeof (vdev_label_t). 
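 * (That is why fsize is rounded down to a multiple of the label size
 * below before checking whether the write would land inside the
 * trailing VDEV_LABEL_END_SIZE region of an odd-numbered leaf.)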
6676 */
6677 uint64_t psize = P2ALIGN_TYPED(fsize, sizeof (vdev_label_t),
6678 uint64_t);
6679 if ((leaf & 1) == 1 &&
6680 offset + sizeof (bad) > psize - VDEV_LABEL_END_SIZE)
6681 continue;
6682
6683 if (mirror_save != zs->zs_mirrors) {
6684 (void) close(fd);
6685 goto out;
6686 }
6687
6688 if (pwrite(fd, &bad, sizeof (bad), offset) != sizeof (bad))
6689 fatal(B_TRUE,
6690 "can't inject bad word at 0x%"PRIx64" in %s",
6691 offset, pathrand);
6692
6693 if (ztest_opts.zo_verbose >= 7)
6694 (void) printf("injected bad word into %s,"
6695 " offset 0x%"PRIx64"\n", pathrand, offset);
6696
6697 injected = B_TRUE;
6698 }
6699
6700 (void) close(fd);
6701 out:
6702 mutex_exit(&ztest_vdev_lock);
6703
6704 if (injected && ztest_opts.zo_raid_do_expand) {
6705 int error = spa_scan(spa, POOL_SCAN_SCRUB);
6706 if (error == 0) {
6707 while (dsl_scan_scrubbing(spa_get_dsl(spa)))
6708 txg_wait_synced(spa_get_dsl(spa), 0);
6709 }
6710 }
6711
6712 umem_free(path0, MAXPATHLEN);
6713 umem_free(pathrand, MAXPATHLEN);
6714 }
6715
6716 /*
6717 * By design ztest will never inject uncorrectable damage into the pool.
6718 * Issue a scrub, wait for it to complete, and verify there is never any
6719 * persistent damage.
6720 *
6721 * Only after a full scrub has been completed is it safe to start injecting
6722 * data corruption. See the comment in ztest_fault_inject().
6723 *
6724 * EBUSY may be returned for the following six cases. It is the caller's
6725 * responsibility to handle them accordingly.
6726 *
6727 * Current state Requested
6728 * 1. Normal Scrub Running Normal Scrub or Error Scrub
6729 * 2. Normal Scrub Paused Error Scrub
6730 * 3. Normal Scrub Paused Pause Normal Scrub
6731 * 4. Error Scrub Running Normal Scrub or Error Scrub
6732 * 5. Error Scrub Paused Pause Error Scrub
6733 * 6. Resilvering Anything else
6734 */
6735 static int
6736 ztest_scrub_impl(spa_t *spa)
6737 {
6738 int error = spa_scan(spa, POOL_SCAN_SCRUB);
6739 if (error)
6740 return (error);
6741
6742 while (dsl_scan_scrubbing(spa_get_dsl(spa)))
6743 txg_wait_synced(spa_get_dsl(spa), 0);
6744
6745 if (spa_approx_errlog_size(spa) > 0)
6746 return (ECKSUM);
6747
6748 ztest_pool_scrubbed = B_TRUE;
6749
6750 return (0);
6751 }
6752
6753 /*
6754 * Scrub the pool.
6755 */
6756 void
6757 ztest_scrub(ztest_ds_t *zd, uint64_t id)
6758 {
6759 (void) zd, (void) id;
6760 spa_t *spa = ztest_spa;
6761 int error;
6762
6763 /*
6764 * A scrub is already being performed as part of device removal.
6765 */
6766 if (ztest_device_removal_active)
6767 return;
6768
6769 /*
6770 * Start a scrub, wait a moment, then force a restart.
6771 */
6772 (void) spa_scan(spa, POOL_SCAN_SCRUB);
6773 (void) poll(NULL, 0, 100);
6774
6775 error = ztest_scrub_impl(spa);
6776 if (error == EBUSY)
6777 error = 0;
6778 ASSERT0(error);
6779 }
6780
6781 /*
6782 * Change the guid for the pool.
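 * Both the pool guid and the load guid are sampled first; a
 * successful spa_change_guid() must change the former while leaving
 * the latter untouched, which the VERIFYs below confirm.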
6783 */ 6784 void 6785 ztest_reguid(ztest_ds_t *zd, uint64_t id) 6786 { 6787 (void) zd, (void) id; 6788 spa_t *spa = ztest_spa; 6789 uint64_t orig, load; 6790 int error; 6791 ztest_shared_t *zs = ztest_shared; 6792 6793 if (ztest_opts.zo_mmp_test) 6794 return; 6795 6796 orig = spa_guid(spa); 6797 load = spa_load_guid(spa); 6798 6799 (void) pthread_rwlock_wrlock(&ztest_name_lock); 6800 error = spa_change_guid(spa, NULL); 6801 zs->zs_guid = spa_guid(spa); 6802 (void) pthread_rwlock_unlock(&ztest_name_lock); 6803 6804 if (error != 0) 6805 return; 6806 6807 if (ztest_opts.zo_verbose >= 4) { 6808 (void) printf("Changed guid old %"PRIu64" -> %"PRIu64"\n", 6809 orig, spa_guid(spa)); 6810 } 6811 6812 VERIFY3U(orig, !=, spa_guid(spa)); 6813 VERIFY3U(load, ==, spa_load_guid(spa)); 6814 } 6815 6816 void 6817 ztest_blake3(ztest_ds_t *zd, uint64_t id) 6818 { 6819 (void) zd, (void) id; 6820 hrtime_t end = gethrtime() + NANOSEC; 6821 zio_cksum_salt_t salt; 6822 void *salt_ptr = &salt.zcs_bytes; 6823 struct abd *abd_data, *abd_meta; 6824 void *buf, *templ; 6825 int i, *ptr; 6826 uint32_t size; 6827 BLAKE3_CTX ctx; 6828 const zfs_impl_t *blake3 = zfs_impl_get_ops("blake3"); 6829 6830 size = ztest_random_blocksize(); 6831 buf = umem_alloc(size, UMEM_NOFAIL); 6832 abd_data = abd_alloc(size, B_FALSE); 6833 abd_meta = abd_alloc(size, B_TRUE); 6834 6835 for (i = 0, ptr = buf; i < size / sizeof (*ptr); i++, ptr++) 6836 *ptr = ztest_random(UINT_MAX); 6837 memset(salt_ptr, 'A', 32); 6838 6839 abd_copy_from_buf_off(abd_data, buf, 0, size); 6840 abd_copy_from_buf_off(abd_meta, buf, 0, size); 6841 6842 while (gethrtime() <= end) { 6843 int run_count = 100; 6844 zio_cksum_t zc_ref1, zc_ref2; 6845 zio_cksum_t zc_res1, zc_res2; 6846 6847 void *ref1 = &zc_ref1; 6848 void *ref2 = &zc_ref2; 6849 void *res1 = &zc_res1; 6850 void *res2 = &zc_res2; 6851 6852 /* BLAKE3_KEY_LEN = 32 */ 6853 VERIFY0(blake3->setname("generic")); 6854 templ = abd_checksum_blake3_tmpl_init(&salt); 6855 Blake3_InitKeyed(&ctx, salt_ptr); 6856 Blake3_Update(&ctx, buf, size); 6857 Blake3_Final(&ctx, ref1); 6858 zc_ref2 = zc_ref1; 6859 ZIO_CHECKSUM_BSWAP(&zc_ref2); 6860 abd_checksum_blake3_tmpl_free(templ); 6861 6862 VERIFY0(blake3->setname("cycle")); 6863 while (run_count-- > 0) { 6864 6865 /* Test current implementation */ 6866 Blake3_InitKeyed(&ctx, salt_ptr); 6867 Blake3_Update(&ctx, buf, size); 6868 Blake3_Final(&ctx, res1); 6869 zc_res2 = zc_res1; 6870 ZIO_CHECKSUM_BSWAP(&zc_res2); 6871 6872 VERIFY0(memcmp(ref1, res1, 32)); 6873 VERIFY0(memcmp(ref2, res2, 32)); 6874 6875 /* Test ABD - data */ 6876 templ = abd_checksum_blake3_tmpl_init(&salt); 6877 abd_checksum_blake3_native(abd_data, size, 6878 templ, &zc_res1); 6879 abd_checksum_blake3_byteswap(abd_data, size, 6880 templ, &zc_res2); 6881 6882 VERIFY0(memcmp(ref1, res1, 32)); 6883 VERIFY0(memcmp(ref2, res2, 32)); 6884 6885 /* Test ABD - metadata */ 6886 abd_checksum_blake3_native(abd_meta, size, 6887 templ, &zc_res1); 6888 abd_checksum_blake3_byteswap(abd_meta, size, 6889 templ, &zc_res2); 6890 abd_checksum_blake3_tmpl_free(templ); 6891 6892 VERIFY0(memcmp(ref1, res1, 32)); 6893 VERIFY0(memcmp(ref2, res2, 32)); 6894 6895 } 6896 } 6897 6898 abd_free(abd_data); 6899 abd_free(abd_meta); 6900 umem_free(buf, size); 6901 } 6902 6903 void 6904 ztest_fletcher(ztest_ds_t *zd, uint64_t id) 6905 { 6906 (void) zd, (void) id; 6907 hrtime_t end = gethrtime() + NANOSEC; 6908 6909 while (gethrtime() <= end) { 6910 int run_count = 100; 6911 void *buf; 6912 struct abd *abd_data, *abd_meta; 6913 uint32_t size; 6914 
int *ptr; 6915 int i; 6916 zio_cksum_t zc_ref; 6917 zio_cksum_t zc_ref_byteswap; 6918 6919 size = ztest_random_blocksize(); 6920 6921 buf = umem_alloc(size, UMEM_NOFAIL); 6922 abd_data = abd_alloc(size, B_FALSE); 6923 abd_meta = abd_alloc(size, B_TRUE); 6924 6925 for (i = 0, ptr = buf; i < size / sizeof (*ptr); i++, ptr++) 6926 *ptr = ztest_random(UINT_MAX); 6927 6928 abd_copy_from_buf_off(abd_data, buf, 0, size); 6929 abd_copy_from_buf_off(abd_meta, buf, 0, size); 6930 6931 VERIFY0(fletcher_4_impl_set("scalar")); 6932 fletcher_4_native(buf, size, NULL, &zc_ref); 6933 fletcher_4_byteswap(buf, size, NULL, &zc_ref_byteswap); 6934 6935 VERIFY0(fletcher_4_impl_set("cycle")); 6936 while (run_count-- > 0) { 6937 zio_cksum_t zc; 6938 zio_cksum_t zc_byteswap; 6939 6940 fletcher_4_byteswap(buf, size, NULL, &zc_byteswap); 6941 fletcher_4_native(buf, size, NULL, &zc); 6942 6943 VERIFY0(memcmp(&zc, &zc_ref, sizeof (zc))); 6944 VERIFY0(memcmp(&zc_byteswap, &zc_ref_byteswap, 6945 sizeof (zc_byteswap))); 6946 6947 /* Test ABD - data */ 6948 abd_fletcher_4_byteswap(abd_data, size, NULL, 6949 &zc_byteswap); 6950 abd_fletcher_4_native(abd_data, size, NULL, &zc); 6951 6952 VERIFY0(memcmp(&zc, &zc_ref, sizeof (zc))); 6953 VERIFY0(memcmp(&zc_byteswap, &zc_ref_byteswap, 6954 sizeof (zc_byteswap))); 6955 6956 /* Test ABD - metadata */ 6957 abd_fletcher_4_byteswap(abd_meta, size, NULL, 6958 &zc_byteswap); 6959 abd_fletcher_4_native(abd_meta, size, NULL, &zc); 6960 6961 VERIFY0(memcmp(&zc, &zc_ref, sizeof (zc))); 6962 VERIFY0(memcmp(&zc_byteswap, &zc_ref_byteswap, 6963 sizeof (zc_byteswap))); 6964 6965 } 6966 6967 umem_free(buf, size); 6968 abd_free(abd_data); 6969 abd_free(abd_meta); 6970 } 6971 } 6972 6973 void 6974 ztest_fletcher_incr(ztest_ds_t *zd, uint64_t id) 6975 { 6976 (void) zd, (void) id; 6977 void *buf; 6978 size_t size; 6979 int *ptr; 6980 int i; 6981 zio_cksum_t zc_ref; 6982 zio_cksum_t zc_ref_bswap; 6983 6984 hrtime_t end = gethrtime() + NANOSEC; 6985 6986 while (gethrtime() <= end) { 6987 int run_count = 100; 6988 6989 size = ztest_random_blocksize(); 6990 buf = umem_alloc(size, UMEM_NOFAIL); 6991 6992 for (i = 0, ptr = buf; i < size / sizeof (*ptr); i++, ptr++) 6993 *ptr = ztest_random(UINT_MAX); 6994 6995 VERIFY0(fletcher_4_impl_set("scalar")); 6996 fletcher_4_native(buf, size, NULL, &zc_ref); 6997 fletcher_4_byteswap(buf, size, NULL, &zc_ref_bswap); 6998 6999 VERIFY0(fletcher_4_impl_set("cycle")); 7000 7001 while (run_count-- > 0) { 7002 zio_cksum_t zc; 7003 zio_cksum_t zc_bswap; 7004 size_t pos = 0; 7005 7006 ZIO_SET_CHECKSUM(&zc, 0, 0, 0, 0); 7007 ZIO_SET_CHECKSUM(&zc_bswap, 0, 0, 0, 0); 7008 7009 while (pos < size) { 7010 size_t inc = 64 * ztest_random(size / 67); 7011 /* sometimes add few bytes to test non-simd */ 7012 if (ztest_random(100) < 10) 7013 inc += P2ALIGN_TYPED(ztest_random(64), 7014 sizeof (uint32_t), uint64_t); 7015 7016 if (inc > (size - pos)) 7017 inc = size - pos; 7018 7019 fletcher_4_incremental_native(buf + pos, inc, 7020 &zc); 7021 fletcher_4_incremental_byteswap(buf + pos, inc, 7022 &zc_bswap); 7023 7024 pos += inc; 7025 } 7026 7027 VERIFY3U(pos, ==, size); 7028 7029 VERIFY(ZIO_CHECKSUM_EQUAL(zc, zc_ref)); 7030 VERIFY(ZIO_CHECKSUM_EQUAL(zc_bswap, zc_ref_bswap)); 7031 7032 /* 7033 * verify if incremental on the whole buffer is 7034 * equivalent to non-incremental version 7035 */ 7036 ZIO_SET_CHECKSUM(&zc, 0, 0, 0, 0); 7037 ZIO_SET_CHECKSUM(&zc_bswap, 0, 0, 0, 0); 7038 7039 fletcher_4_incremental_native(buf, size, &zc); 7040 fletcher_4_incremental_byteswap(buf, size, 
&zc_bswap); 7041 7042 VERIFY(ZIO_CHECKSUM_EQUAL(zc, zc_ref)); 7043 VERIFY(ZIO_CHECKSUM_EQUAL(zc_bswap, zc_ref_bswap)); 7044 } 7045 7046 umem_free(buf, size); 7047 } 7048 } 7049 7050 void 7051 ztest_pool_prefetch_ddt(ztest_ds_t *zd, uint64_t id) 7052 { 7053 (void) zd, (void) id; 7054 spa_t *spa; 7055 7056 (void) pthread_rwlock_rdlock(&ztest_name_lock); 7057 VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG)); 7058 7059 ddt_prefetch_all(spa); 7060 7061 spa_close(spa, FTAG); 7062 (void) pthread_rwlock_unlock(&ztest_name_lock); 7063 } 7064 7065 static int 7066 ztest_set_global_vars(void) 7067 { 7068 for (size_t i = 0; i < ztest_opts.zo_gvars_count; i++) { 7069 char *kv = ztest_opts.zo_gvars[i]; 7070 VERIFY3U(strlen(kv), <=, ZO_GVARS_MAX_ARGLEN); 7071 VERIFY3U(strlen(kv), >, 0); 7072 int err = handle_tunable_option(kv, B_TRUE); 7073 if (ztest_opts.zo_verbose > 0) { 7074 (void) printf("setting global var %s ... %s\n", kv, 7075 err ? "failed" : "ok"); 7076 } 7077 if (err != 0) { 7078 (void) fprintf(stderr, 7079 "failed to set global var '%s'\n", kv); 7080 return (err); 7081 } 7082 } 7083 return (0); 7084 } 7085 7086 static char ** 7087 ztest_global_vars_to_zdb_args(void) 7088 { 7089 char **args = calloc(2*ztest_opts.zo_gvars_count + 1, sizeof (char *)); 7090 char **cur = args; 7091 if (args == NULL) 7092 return (NULL); 7093 for (size_t i = 0; i < ztest_opts.zo_gvars_count; i++) { 7094 *cur++ = (char *)"-o"; 7095 *cur++ = ztest_opts.zo_gvars[i]; 7096 } 7097 ASSERT3P(cur, ==, &args[2*ztest_opts.zo_gvars_count]); 7098 *cur = NULL; 7099 return (args); 7100 } 7101 7102 /* The end of strings is indicated by a NULL element */ 7103 static char * 7104 join_strings(char **strings, const char *sep) 7105 { 7106 size_t totallen = 0; 7107 for (char **sp = strings; *sp != NULL; sp++) { 7108 totallen += strlen(*sp); 7109 totallen += strlen(sep); 7110 } 7111 if (totallen > 0) { 7112 ASSERT(totallen >= strlen(sep)); 7113 totallen -= strlen(sep); 7114 } 7115 7116 size_t buflen = totallen + 1; 7117 char *o = umem_alloc(buflen, UMEM_NOFAIL); /* trailing 0 byte */ 7118 o[0] = '\0'; 7119 for (char **sp = strings; *sp != NULL; sp++) { 7120 size_t would; 7121 would = strlcat(o, *sp, buflen); 7122 VERIFY3U(would, <, buflen); 7123 if (*(sp+1) == NULL) { 7124 break; 7125 } 7126 would = strlcat(o, sep, buflen); 7127 VERIFY3U(would, <, buflen); 7128 } 7129 ASSERT3S(strlen(o), ==, totallen); 7130 return (o); 7131 } 7132 7133 static int 7134 ztest_check_path(char *path) 7135 { 7136 struct stat s; 7137 /* return true on success */ 7138 return (!stat(path, &s)); 7139 } 7140 7141 static void 7142 ztest_get_zdb_bin(char *bin, int len) 7143 { 7144 char *zdb_path; 7145 /* 7146 * Try to use $ZDB and in-tree zdb path. If not successful, just 7147 * let popen to search through PATH. 
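 * (The effective lookup order is: an explicit $ZDB environment
 * variable, then a zdb binary sitting next to an in-tree ztest under
 * .libs/, and finally a bare "zdb" resolved via PATH.)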
7148 */ 7149 if ((zdb_path = getenv("ZDB"))) { 7150 strlcpy(bin, zdb_path, len); /* In env */ 7151 if (!ztest_check_path(bin)) { 7152 ztest_dump_core = 0; 7153 fatal(B_TRUE, "invalid ZDB '%s'", bin); 7154 } 7155 return; 7156 } 7157 7158 VERIFY3P(realpath(getexecname(), bin), !=, NULL); 7159 if (strstr(bin, ".libs/ztest")) { 7160 strstr(bin, ".libs/ztest")[0] = '\0'; /* In-tree */ 7161 strcat(bin, "zdb"); 7162 if (ztest_check_path(bin)) 7163 return; 7164 } 7165 strcpy(bin, "zdb"); 7166 } 7167 7168 static vdev_t * 7169 ztest_random_concrete_vdev_leaf(vdev_t *vd) 7170 { 7171 if (vd == NULL) 7172 return (NULL); 7173 7174 if (vd->vdev_children == 0) 7175 return (vd); 7176 7177 vdev_t *eligible[vd->vdev_children]; 7178 int eligible_idx = 0, i; 7179 for (i = 0; i < vd->vdev_children; i++) { 7180 vdev_t *cvd = vd->vdev_child[i]; 7181 if (cvd->vdev_top->vdev_removing) 7182 continue; 7183 if (cvd->vdev_children > 0 || 7184 (vdev_is_concrete(cvd) && !cvd->vdev_detached)) { 7185 eligible[eligible_idx++] = cvd; 7186 } 7187 } 7188 VERIFY3S(eligible_idx, >, 0); 7189 7190 uint64_t child_no = ztest_random(eligible_idx); 7191 return (ztest_random_concrete_vdev_leaf(eligible[child_no])); 7192 } 7193 7194 void 7195 ztest_initialize(ztest_ds_t *zd, uint64_t id) 7196 { 7197 (void) zd, (void) id; 7198 spa_t *spa = ztest_spa; 7199 int error = 0; 7200 7201 mutex_enter(&ztest_vdev_lock); 7202 7203 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 7204 7205 /* Random leaf vdev */ 7206 vdev_t *rand_vd = ztest_random_concrete_vdev_leaf(spa->spa_root_vdev); 7207 if (rand_vd == NULL) { 7208 spa_config_exit(spa, SCL_VDEV, FTAG); 7209 mutex_exit(&ztest_vdev_lock); 7210 return; 7211 } 7212 7213 /* 7214 * The random vdev we've selected may change as soon as we 7215 * drop the spa_config_lock. We create local copies of things 7216 * we're interested in. 
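 * (Specifically the guid, the vdev path and whether an initialize
 * thread is already active; once the lock is dropped, rand_vd itself
 * must not be dereferenced again.)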
7217 */ 7218 uint64_t guid = rand_vd->vdev_guid; 7219 char *path = strdup(rand_vd->vdev_path); 7220 boolean_t active = rand_vd->vdev_initialize_thread != NULL; 7221 7222 zfs_dbgmsg("vd %px, guid %llu", rand_vd, (u_longlong_t)guid); 7223 spa_config_exit(spa, SCL_VDEV, FTAG); 7224 7225 uint64_t cmd = ztest_random(POOL_INITIALIZE_FUNCS); 7226 7227 nvlist_t *vdev_guids = fnvlist_alloc(); 7228 nvlist_t *vdev_errlist = fnvlist_alloc(); 7229 fnvlist_add_uint64(vdev_guids, path, guid); 7230 error = spa_vdev_initialize(spa, vdev_guids, cmd, vdev_errlist); 7231 fnvlist_free(vdev_guids); 7232 fnvlist_free(vdev_errlist); 7233 7234 switch (cmd) { 7235 case POOL_INITIALIZE_CANCEL: 7236 if (ztest_opts.zo_verbose >= 4) { 7237 (void) printf("Cancel initialize %s", path); 7238 if (!active) 7239 (void) printf(" failed (no initialize active)"); 7240 (void) printf("\n"); 7241 } 7242 break; 7243 case POOL_INITIALIZE_START: 7244 if (ztest_opts.zo_verbose >= 4) { 7245 (void) printf("Start initialize %s", path); 7246 if (active && error == 0) 7247 (void) printf(" failed (already active)"); 7248 else if (error != 0) 7249 (void) printf(" failed (error %d)", error); 7250 (void) printf("\n"); 7251 } 7252 break; 7253 case POOL_INITIALIZE_SUSPEND: 7254 if (ztest_opts.zo_verbose >= 4) { 7255 (void) printf("Suspend initialize %s", path); 7256 if (!active) 7257 (void) printf(" failed (no initialize active)"); 7258 (void) printf("\n"); 7259 } 7260 break; 7261 } 7262 free(path); 7263 mutex_exit(&ztest_vdev_lock); 7264 } 7265 7266 void 7267 ztest_trim(ztest_ds_t *zd, uint64_t id) 7268 { 7269 (void) zd, (void) id; 7270 spa_t *spa = ztest_spa; 7271 int error = 0; 7272 7273 mutex_enter(&ztest_vdev_lock); 7274 7275 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 7276 7277 /* Random leaf vdev */ 7278 vdev_t *rand_vd = ztest_random_concrete_vdev_leaf(spa->spa_root_vdev); 7279 if (rand_vd == NULL) { 7280 spa_config_exit(spa, SCL_VDEV, FTAG); 7281 mutex_exit(&ztest_vdev_lock); 7282 return; 7283 } 7284 7285 /* 7286 * The random vdev we've selected may change as soon as we 7287 * drop the spa_config_lock. We create local copies of things 7288 * we're interested in. 
7289 */ 7290 uint64_t guid = rand_vd->vdev_guid; 7291 char *path = strdup(rand_vd->vdev_path); 7292 boolean_t active = rand_vd->vdev_trim_thread != NULL; 7293 7294 zfs_dbgmsg("vd %p, guid %llu", rand_vd, (u_longlong_t)guid); 7295 spa_config_exit(spa, SCL_VDEV, FTAG); 7296 7297 uint64_t cmd = ztest_random(POOL_TRIM_FUNCS); 7298 uint64_t rate = 1 << ztest_random(30); 7299 boolean_t partial = (ztest_random(5) > 0); 7300 boolean_t secure = (ztest_random(5) > 0); 7301 7302 nvlist_t *vdev_guids = fnvlist_alloc(); 7303 nvlist_t *vdev_errlist = fnvlist_alloc(); 7304 fnvlist_add_uint64(vdev_guids, path, guid); 7305 error = spa_vdev_trim(spa, vdev_guids, cmd, rate, partial, 7306 secure, vdev_errlist); 7307 fnvlist_free(vdev_guids); 7308 fnvlist_free(vdev_errlist); 7309 7310 switch (cmd) { 7311 case POOL_TRIM_CANCEL: 7312 if (ztest_opts.zo_verbose >= 4) { 7313 (void) printf("Cancel TRIM %s", path); 7314 if (!active) 7315 (void) printf(" failed (no TRIM active)"); 7316 (void) printf("\n"); 7317 } 7318 break; 7319 case POOL_TRIM_START: 7320 if (ztest_opts.zo_verbose >= 4) { 7321 (void) printf("Start TRIM %s", path); 7322 if (active && error == 0) 7323 (void) printf(" failed (already active)"); 7324 else if (error != 0) 7325 (void) printf(" failed (error %d)", error); 7326 (void) printf("\n"); 7327 } 7328 break; 7329 case POOL_TRIM_SUSPEND: 7330 if (ztest_opts.zo_verbose >= 4) { 7331 (void) printf("Suspend TRIM %s", path); 7332 if (!active) 7333 (void) printf(" failed (no TRIM active)"); 7334 (void) printf("\n"); 7335 } 7336 break; 7337 } 7338 free(path); 7339 mutex_exit(&ztest_vdev_lock); 7340 } 7341 7342 void 7343 ztest_ddt_prune(ztest_ds_t *zd, uint64_t id) 7344 { 7345 (void) zd, (void) id; 7346 7347 spa_t *spa = ztest_spa; 7348 uint64_t pct = ztest_random(15) + 1; 7349 7350 (void) ddt_prune_unique_entries(spa, ZPOOL_DDT_PRUNE_PERCENTAGE, pct); 7351 } 7352 7353 /* 7354 * Verify pool integrity by running zdb. 7355 */ 7356 static void 7357 ztest_run_zdb(uint64_t guid) 7358 { 7359 int status; 7360 char *bin; 7361 char *zdb; 7362 char *zbuf; 7363 const int len = MAXPATHLEN + MAXNAMELEN + 20; 7364 FILE *fp; 7365 7366 bin = umem_alloc(len, UMEM_NOFAIL); 7367 zdb = umem_alloc(len, UMEM_NOFAIL); 7368 zbuf = umem_alloc(1024, UMEM_NOFAIL); 7369 7370 ztest_get_zdb_bin(bin, len); 7371 7372 char **set_gvars_args = ztest_global_vars_to_zdb_args(); 7373 if (set_gvars_args == NULL) { 7374 fatal(B_FALSE, "Failed to allocate memory in " 7375 "ztest_global_vars_to_zdb_args(). Cannot run zdb.\n"); 7376 } 7377 char *set_gvars_args_joined = join_strings(set_gvars_args, " "); 7378 free(set_gvars_args); 7379 7380 size_t would = snprintf(zdb, len, 7381 "%s -bcc%s%s -G -d -Y -e -y %s -p %s %"PRIu64, 7382 bin, 7383 ztest_opts.zo_verbose >= 3 ? "s" : "", 7384 ztest_opts.zo_verbose >= 4 ? 
"v" : "", 7385 set_gvars_args_joined, 7386 ztest_opts.zo_dir, 7387 guid); 7388 ASSERT3U(would, <, len); 7389 7390 umem_free(set_gvars_args_joined, strlen(set_gvars_args_joined) + 1); 7391 7392 if (ztest_opts.zo_verbose >= 5) 7393 (void) printf("Executing %s\n", zdb); 7394 7395 fp = popen(zdb, "r"); 7396 7397 while (fgets(zbuf, 1024, fp) != NULL) 7398 if (ztest_opts.zo_verbose >= 3) 7399 (void) printf("%s", zbuf); 7400 7401 status = pclose(fp); 7402 7403 if (status == 0) 7404 goto out; 7405 7406 ztest_dump_core = 0; 7407 if (WIFEXITED(status)) 7408 fatal(B_FALSE, "'%s' exit code %d", zdb, WEXITSTATUS(status)); 7409 else 7410 fatal(B_FALSE, "'%s' died with signal %d", 7411 zdb, WTERMSIG(status)); 7412 out: 7413 umem_free(bin, len); 7414 umem_free(zdb, len); 7415 umem_free(zbuf, 1024); 7416 } 7417 7418 static void 7419 ztest_walk_pool_directory(const char *header) 7420 { 7421 spa_t *spa = NULL; 7422 7423 if (ztest_opts.zo_verbose >= 6) 7424 (void) puts(header); 7425 7426 mutex_enter(&spa_namespace_lock); 7427 while ((spa = spa_next(spa)) != NULL) 7428 if (ztest_opts.zo_verbose >= 6) 7429 (void) printf("\t%s\n", spa_name(spa)); 7430 mutex_exit(&spa_namespace_lock); 7431 } 7432 7433 static void 7434 ztest_spa_import_export(char *oldname, char *newname) 7435 { 7436 nvlist_t *config, *newconfig; 7437 uint64_t pool_guid; 7438 spa_t *spa; 7439 int error; 7440 7441 if (ztest_opts.zo_verbose >= 4) { 7442 (void) printf("import/export: old = %s, new = %s\n", 7443 oldname, newname); 7444 } 7445 7446 /* 7447 * Clean up from previous runs. 7448 */ 7449 (void) spa_destroy(newname); 7450 7451 /* 7452 * Get the pool's configuration and guid. 7453 */ 7454 VERIFY0(spa_open(oldname, &spa, FTAG)); 7455 7456 /* 7457 * Kick off a scrub to tickle scrub/export races. 7458 */ 7459 if (ztest_random(2) == 0) 7460 (void) spa_scan(spa, POOL_SCAN_SCRUB); 7461 7462 pool_guid = spa_guid(spa); 7463 spa_close(spa, FTAG); 7464 7465 ztest_walk_pool_directory("pools before export"); 7466 7467 /* 7468 * Export it. 7469 */ 7470 VERIFY0(spa_export(oldname, &config, B_FALSE, B_FALSE)); 7471 7472 ztest_walk_pool_directory("pools after export"); 7473 7474 /* 7475 * Try to import it. 7476 */ 7477 newconfig = spa_tryimport(config); 7478 ASSERT3P(newconfig, !=, NULL); 7479 fnvlist_free(newconfig); 7480 7481 /* 7482 * Import it under the new name. 7483 */ 7484 error = spa_import(newname, config, NULL, 0); 7485 if (error != 0) { 7486 dump_nvlist(config, 0); 7487 fatal(B_FALSE, "couldn't import pool %s as %s: error %u", 7488 oldname, newname, error); 7489 } 7490 7491 ztest_walk_pool_directory("pools after import"); 7492 7493 /* 7494 * Try to import it again -- should fail with EEXIST. 7495 */ 7496 VERIFY3U(EEXIST, ==, spa_import(newname, config, NULL, 0)); 7497 7498 /* 7499 * Try to import it under a different name -- should fail with EEXIST. 7500 */ 7501 VERIFY3U(EEXIST, ==, spa_import(oldname, config, NULL, 0)); 7502 7503 /* 7504 * Verify that the pool is no longer visible under the old name. 7505 */ 7506 VERIFY3U(ENOENT, ==, spa_open(oldname, &spa, FTAG)); 7507 7508 /* 7509 * Verify that we can open and close the pool using the new name. 
7510 */ 7511 VERIFY0(spa_open(newname, &spa, FTAG)); 7512 ASSERT3U(pool_guid, ==, spa_guid(spa)); 7513 spa_close(spa, FTAG); 7514 7515 fnvlist_free(config); 7516 } 7517 7518 static void 7519 ztest_resume(spa_t *spa) 7520 { 7521 if (spa_suspended(spa) && ztest_opts.zo_verbose >= 6) 7522 (void) printf("resuming from suspended state\n"); 7523 spa_vdev_state_enter(spa, SCL_NONE); 7524 vdev_clear(spa, NULL); 7525 (void) spa_vdev_state_exit(spa, NULL, 0); 7526 (void) zio_resume(spa); 7527 } 7528 7529 static __attribute__((noreturn)) void 7530 ztest_resume_thread(void *arg) 7531 { 7532 spa_t *spa = arg; 7533 7534 /* 7535 * Synthesize aged DDT entries for ddt prune testing 7536 */ 7537 ddt_prune_artificial_age = B_TRUE; 7538 if (ztest_opts.zo_verbose >= 3) 7539 ddt_dump_prune_histogram = B_TRUE; 7540 7541 while (!ztest_exiting) { 7542 if (spa_suspended(spa)) 7543 ztest_resume(spa); 7544 (void) poll(NULL, 0, 100); 7545 7546 /* 7547 * Periodically change the zfs_compressed_arc_enabled setting. 7548 */ 7549 if (ztest_random(10) == 0) 7550 zfs_compressed_arc_enabled = ztest_random(2); 7551 7552 /* 7553 * Periodically change the zfs_abd_scatter_enabled setting. 7554 */ 7555 if (ztest_random(10) == 0) 7556 zfs_abd_scatter_enabled = ztest_random(2); 7557 } 7558 7559 thread_exit(); 7560 } 7561 7562 static __attribute__((noreturn)) void 7563 ztest_deadman_thread(void *arg) 7564 { 7565 ztest_shared_t *zs = arg; 7566 spa_t *spa = ztest_spa; 7567 hrtime_t delay, overdue, last_run = gethrtime(); 7568 7569 delay = (zs->zs_thread_stop - zs->zs_thread_start) + 7570 MSEC2NSEC(zfs_deadman_synctime_ms); 7571 7572 while (!ztest_exiting) { 7573 /* 7574 * Wait for the delay timer while checking occasionally 7575 * if we should stop. 7576 */ 7577 if (gethrtime() < last_run + delay) { 7578 (void) poll(NULL, 0, 1000); 7579 continue; 7580 } 7581 7582 /* 7583 * If the pool is suspended then fail immediately. Otherwise, 7584 * check to see if the pool is making any progress. If 7585 * vdev_deadman() discovers that there hasn't been any recent 7586 * I/Os then it will end up aborting the tests. 7587 */ 7588 if (spa_suspended(spa) || spa->spa_root_vdev == NULL) { 7589 fatal(B_FALSE, 7590 "aborting test after %llu seconds because " 7591 "pool has transitioned to a suspended state.", 7592 (u_longlong_t)zfs_deadman_synctime_ms / 1000); 7593 } 7594 vdev_deadman(spa->spa_root_vdev, FTAG); 7595 7596 /* 7597 * If the process doesn't complete within a grace period of 7598 * zfs_deadman_synctime_ms over the expected finish time, 7599 * then it may be hung and is terminated. 
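 * (In other words, the hard deadline is zs_proc_stop plus
 * zfs_deadman_synctime_ms; past that point the run is assumed to be
 * hung and fatal() is called.)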
7600 */ 7601 overdue = zs->zs_proc_stop + MSEC2NSEC(zfs_deadman_synctime_ms); 7602 if (gethrtime() > overdue) { 7603 fatal(B_FALSE, 7604 "aborting test after %llu seconds because " 7605 "the process is overdue for termination.", 7606 (gethrtime() - zs->zs_proc_start) / NANOSEC); 7607 } 7608 7609 (void) printf("ztest has been running for %lld seconds\n", 7610 (gethrtime() - zs->zs_proc_start) / NANOSEC); 7611 7612 last_run = gethrtime(); 7613 delay = MSEC2NSEC(zfs_deadman_checktime_ms); 7614 } 7615 7616 thread_exit(); 7617 } 7618 7619 static void 7620 ztest_execute(int test, ztest_info_t *zi, uint64_t id) 7621 { 7622 ztest_ds_t *zd = &ztest_ds[id % ztest_opts.zo_datasets]; 7623 ztest_shared_callstate_t *zc = ZTEST_GET_SHARED_CALLSTATE(test); 7624 hrtime_t functime = gethrtime(); 7625 int i; 7626 7627 for (i = 0; i < zi->zi_iters; i++) 7628 zi->zi_func(zd, id); 7629 7630 functime = gethrtime() - functime; 7631 7632 atomic_add_64(&zc->zc_count, 1); 7633 atomic_add_64(&zc->zc_time, functime); 7634 7635 if (ztest_opts.zo_verbose >= 4) 7636 (void) printf("%6.2f sec in %s\n", 7637 (double)functime / NANOSEC, zi->zi_funcname); 7638 } 7639 7640 typedef struct ztest_raidz_expand_io { 7641 uint64_t rzx_id; 7642 uint64_t rzx_amount; 7643 uint64_t rzx_bufsize; 7644 const void *rzx_buffer; 7645 uint64_t rzx_alloc_max; 7646 spa_t *rzx_spa; 7647 } ztest_expand_io_t; 7648 7649 #undef OD_ARRAY_SIZE 7650 #define OD_ARRAY_SIZE 10 7651 7652 /* 7653 * Write a request amount of data to some dataset objects. 7654 * There will be ztest_opts.zo_threads count of these running in parallel. 7655 */ 7656 static __attribute__((noreturn)) void 7657 ztest_rzx_thread(void *arg) 7658 { 7659 ztest_expand_io_t *info = (ztest_expand_io_t *)arg; 7660 ztest_od_t *od; 7661 int batchsize; 7662 int od_size; 7663 ztest_ds_t *zd = &ztest_ds[info->rzx_id % ztest_opts.zo_datasets]; 7664 spa_t *spa = info->rzx_spa; 7665 7666 od_size = sizeof (ztest_od_t) * OD_ARRAY_SIZE; 7667 od = umem_alloc(od_size, UMEM_NOFAIL); 7668 batchsize = OD_ARRAY_SIZE; 7669 7670 /* Create objects to write to */ 7671 for (int b = 0; b < batchsize; b++) { 7672 ztest_od_init(od + b, info->rzx_id, FTAG, b, 7673 DMU_OT_UINT64_OTHER, 0, 0, 0); 7674 } 7675 if (ztest_object_init(zd, od, od_size, B_FALSE) != 0) { 7676 umem_free(od, od_size); 7677 thread_exit(); 7678 } 7679 7680 for (uint64_t offset = 0, written = 0; written < info->rzx_amount; 7681 offset += info->rzx_bufsize) { 7682 /* write to 10 objects */ 7683 for (int i = 0; i < batchsize && written < info->rzx_amount; 7684 i++) { 7685 (void) pthread_rwlock_rdlock(&zd->zd_zilog_lock); 7686 ztest_write(zd, od[i].od_object, offset, 7687 info->rzx_bufsize, info->rzx_buffer); 7688 (void) pthread_rwlock_unlock(&zd->zd_zilog_lock); 7689 written += info->rzx_bufsize; 7690 } 7691 txg_wait_synced(spa_get_dsl(spa), 0); 7692 /* due to inflation, we'll typically bail here */ 7693 if (metaslab_class_get_alloc(spa_normal_class(spa)) > 7694 info->rzx_alloc_max) { 7695 break; 7696 } 7697 } 7698 7699 /* Remove a few objects to leave some holes in allocation space */ 7700 mutex_enter(&zd->zd_dirobj_lock); 7701 (void) ztest_remove(zd, od, 2); 7702 mutex_exit(&zd->zd_dirobj_lock); 7703 7704 umem_free(od, od_size); 7705 7706 thread_exit(); 7707 } 7708 7709 static __attribute__((noreturn)) void 7710 ztest_thread(void *arg) 7711 { 7712 int rand; 7713 uint64_t id = (uintptr_t)arg; 7714 ztest_shared_t *zs = ztest_shared; 7715 uint64_t call_next; 7716 hrtime_t now; 7717 ztest_info_t *zi; 7718 ztest_shared_callstate_t *zc; 7719 7720 while 
((now = gethrtime()) < zs->zs_thread_stop) { 7721 /* 7722 * See if it's time to force a crash. 7723 */ 7724 if (now > zs->zs_thread_kill && 7725 raidz_expand_pause_point == RAIDZ_EXPAND_PAUSE_NONE) { 7726 ztest_kill(zs); 7727 } 7728 7729 /* 7730 * If we're getting ENOSPC with some regularity, stop. 7731 */ 7732 if (zs->zs_enospc_count > 10) 7733 break; 7734 7735 /* 7736 * Pick a random function to execute. 7737 */ 7738 rand = ztest_random(ZTEST_FUNCS); 7739 zi = &ztest_info[rand]; 7740 zc = ZTEST_GET_SHARED_CALLSTATE(rand); 7741 call_next = zc->zc_next; 7742 7743 if (now >= call_next && 7744 atomic_cas_64(&zc->zc_next, call_next, call_next + 7745 ztest_random(2 * zi->zi_interval[0] + 1)) == call_next) { 7746 ztest_execute(rand, zi, id); 7747 } 7748 } 7749 7750 thread_exit(); 7751 } 7752 7753 static void 7754 ztest_dataset_name(char *dsname, const char *pool, int d) 7755 { 7756 (void) snprintf(dsname, ZFS_MAX_DATASET_NAME_LEN, "%s/ds_%d", pool, d); 7757 } 7758 7759 static void 7760 ztest_dataset_destroy(int d) 7761 { 7762 char name[ZFS_MAX_DATASET_NAME_LEN]; 7763 int t; 7764 7765 ztest_dataset_name(name, ztest_opts.zo_pool, d); 7766 7767 if (ztest_opts.zo_verbose >= 3) 7768 (void) printf("Destroying %s to free up space\n", name); 7769 7770 /* 7771 * Cleanup any non-standard clones and snapshots. In general, 7772 * ztest thread t operates on dataset (t % zopt_datasets), 7773 * so there may be more than one thing to clean up. 7774 */ 7775 for (t = d; t < ztest_opts.zo_threads; 7776 t += ztest_opts.zo_datasets) 7777 ztest_dsl_dataset_cleanup(name, t); 7778 7779 (void) dmu_objset_find(name, ztest_objset_destroy_cb, NULL, 7780 DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN); 7781 } 7782 7783 static void 7784 ztest_dataset_dirobj_verify(ztest_ds_t *zd) 7785 { 7786 uint64_t usedobjs, dirobjs, scratch; 7787 7788 /* 7789 * ZTEST_DIROBJ is the object directory for the entire dataset. 7790 * Therefore, the number of objects in use should equal the 7791 * number of ZTEST_DIROBJ entries, +1 for ZTEST_DIROBJ itself. 7792 * If not, we have an object leak. 7793 * 7794 * Note that we can only check this in ztest_dataset_open(), 7795 * when the open-context and syncing-context values agree. 7796 * That's because zap_count() returns the open-context value, 7797 * while dmu_objset_space() returns the rootbp fill count. 
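 * (When the two views agree, the invariant reduces to the check
 * below: dirobjs + 1 == usedobjs, the extra 1 accounting for
 * ZTEST_DIROBJ itself.)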
7798 */ 7799 VERIFY0(zap_count(zd->zd_os, ZTEST_DIROBJ, &dirobjs)); 7800 dmu_objset_space(zd->zd_os, &scratch, &scratch, &usedobjs, &scratch); 7801 ASSERT3U(dirobjs + 1, ==, usedobjs); 7802 } 7803 7804 static int 7805 ztest_dataset_open(int d) 7806 { 7807 ztest_ds_t *zd = &ztest_ds[d]; 7808 uint64_t committed_seq = ZTEST_GET_SHARED_DS(d)->zd_seq; 7809 objset_t *os; 7810 zilog_t *zilog; 7811 char name[ZFS_MAX_DATASET_NAME_LEN]; 7812 int error; 7813 7814 ztest_dataset_name(name, ztest_opts.zo_pool, d); 7815 7816 if (ztest_opts.zo_verbose >= 6) 7817 (void) printf("Opening %s\n", name); 7818 7819 (void) pthread_rwlock_rdlock(&ztest_name_lock); 7820 7821 error = ztest_dataset_create(name); 7822 if (error == ENOSPC) { 7823 (void) pthread_rwlock_unlock(&ztest_name_lock); 7824 ztest_record_enospc(FTAG); 7825 return (error); 7826 } 7827 ASSERT(error == 0 || error == EEXIST); 7828 7829 VERIFY0(ztest_dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, 7830 B_TRUE, zd, &os)); 7831 (void) pthread_rwlock_unlock(&ztest_name_lock); 7832 7833 ztest_zd_init(zd, ZTEST_GET_SHARED_DS(d), os); 7834 7835 zilog = zd->zd_zilog; 7836 7837 if (zilog->zl_header->zh_claim_lr_seq != 0 && 7838 zilog->zl_header->zh_claim_lr_seq < committed_seq) 7839 fatal(B_FALSE, "missing log records: " 7840 "claimed %"PRIu64" < committed %"PRIu64"", 7841 zilog->zl_header->zh_claim_lr_seq, committed_seq); 7842 7843 ztest_dataset_dirobj_verify(zd); 7844 7845 zil_replay(os, zd, ztest_replay_vector); 7846 7847 ztest_dataset_dirobj_verify(zd); 7848 7849 if (ztest_opts.zo_verbose >= 6) 7850 (void) printf("%s replay %"PRIu64" blocks, " 7851 "%"PRIu64" records, seq %"PRIu64"\n", 7852 zd->zd_name, 7853 zilog->zl_parse_blk_count, 7854 zilog->zl_parse_lr_count, 7855 zilog->zl_replaying_seq); 7856 7857 zilog = zil_open(os, ztest_get_data, NULL); 7858 7859 if (zilog->zl_replaying_seq != 0 && 7860 zilog->zl_replaying_seq < committed_seq) 7861 fatal(B_FALSE, "missing log records: " 7862 "replayed %"PRIu64" < committed %"PRIu64"", 7863 zilog->zl_replaying_seq, committed_seq); 7864 7865 return (0); 7866 } 7867 7868 static void 7869 ztest_dataset_close(int d) 7870 { 7871 ztest_ds_t *zd = &ztest_ds[d]; 7872 7873 zil_close(zd->zd_zilog); 7874 dmu_objset_disown(zd->zd_os, B_TRUE, zd); 7875 7876 ztest_zd_fini(zd); 7877 } 7878 7879 static int 7880 ztest_replay_zil_cb(const char *name, void *arg) 7881 { 7882 (void) arg; 7883 objset_t *os; 7884 ztest_ds_t *zdtmp; 7885 7886 VERIFY0(ztest_dmu_objset_own(name, DMU_OST_ANY, B_TRUE, 7887 B_TRUE, FTAG, &os)); 7888 7889 zdtmp = umem_alloc(sizeof (ztest_ds_t), UMEM_NOFAIL); 7890 7891 ztest_zd_init(zdtmp, NULL, os); 7892 zil_replay(os, zdtmp, ztest_replay_vector); 7893 ztest_zd_fini(zdtmp); 7894 7895 if (dmu_objset_zil(os)->zl_parse_lr_count != 0 && 7896 ztest_opts.zo_verbose >= 6) { 7897 zilog_t *zilog = dmu_objset_zil(os); 7898 7899 (void) printf("%s replay %"PRIu64" blocks, " 7900 "%"PRIu64" records, seq %"PRIu64"\n", 7901 name, 7902 zilog->zl_parse_blk_count, 7903 zilog->zl_parse_lr_count, 7904 zilog->zl_replaying_seq); 7905 } 7906 7907 umem_free(zdtmp, sizeof (ztest_ds_t)); 7908 7909 dmu_objset_disown(os, B_TRUE, FTAG); 7910 return (0); 7911 } 7912 7913 static void 7914 ztest_freeze(void) 7915 { 7916 ztest_ds_t *zd = &ztest_ds[0]; 7917 spa_t *spa; 7918 int numloops = 0; 7919 7920 /* freeze not supported during RAIDZ expansion */ 7921 if (ztest_opts.zo_raid_do_expand) 7922 return; 7923 7924 if (ztest_opts.zo_verbose >= 3) 7925 (void) printf("testing spa_freeze()...\n"); 7926 7927 raidz_scratch_verify(); 7928 
kernel_init(SPA_MODE_READ | SPA_MODE_WRITE); 7929 VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG)); 7930 VERIFY0(ztest_dataset_open(0)); 7931 ztest_spa = spa; 7932 7933 /* 7934 * Force the first log block to be transactionally allocated. 7935 * We have to do this before we freeze the pool -- otherwise 7936 * the log chain won't be anchored. 7937 */ 7938 while (BP_IS_HOLE(&zd->zd_zilog->zl_header->zh_log)) { 7939 ztest_dmu_object_alloc_free(zd, 0); 7940 zil_commit(zd->zd_zilog, 0); 7941 } 7942 7943 txg_wait_synced(spa_get_dsl(spa), 0); 7944 7945 /* 7946 * Freeze the pool. This stops spa_sync() from doing anything, 7947 * so that the only way to record changes from now on is the ZIL. 7948 */ 7949 spa_freeze(spa); 7950 7951 /* 7952 * Because it is hard to predict how much space a write will actually 7953 * require beforehand, we leave ourselves some fudge space to write over 7954 * capacity. 7955 */ 7956 uint64_t capacity = metaslab_class_get_space(spa_normal_class(spa)) / 2; 7957 7958 /* 7959 * Run tests that generate log records but don't alter the pool config 7960 * or depend on DSL sync tasks (snapshots, objset create/destroy, etc). 7961 * We do a txg_wait_synced() after each iteration to force the txg 7962 * to increase well beyond the last synced value in the uberblock. 7963 * The ZIL should be OK with that. 7964 * 7965 * Run a random number of times less than zo_maxloops and ensure we do 7966 * not run out of space on the pool. 7967 */ 7968 while (ztest_random(10) != 0 && 7969 numloops++ < ztest_opts.zo_maxloops && 7970 metaslab_class_get_alloc(spa_normal_class(spa)) < capacity) { 7971 ztest_od_t od; 7972 ztest_od_init(&od, 0, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0, 0); 7973 VERIFY0(ztest_object_init(zd, &od, sizeof (od), B_FALSE)); 7974 ztest_io(zd, od.od_object, 7975 ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT); 7976 txg_wait_synced(spa_get_dsl(spa), 0); 7977 } 7978 7979 /* 7980 * Commit all of the changes we just generated. 7981 */ 7982 zil_commit(zd->zd_zilog, 0); 7983 txg_wait_synced(spa_get_dsl(spa), 0); 7984 7985 /* 7986 * Close our dataset and close the pool. 7987 */ 7988 ztest_dataset_close(0); 7989 spa_close(spa, FTAG); 7990 kernel_fini(); 7991 7992 /* 7993 * Open and close the pool and dataset to induce log replay. 7994 */ 7995 raidz_scratch_verify(); 7996 kernel_init(SPA_MODE_READ | SPA_MODE_WRITE); 7997 VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG)); 7998 ASSERT3U(spa_freeze_txg(spa), ==, UINT64_MAX); 7999 VERIFY0(ztest_dataset_open(0)); 8000 ztest_spa = spa; 8001 txg_wait_synced(spa_get_dsl(spa), 0); 8002 ztest_dataset_close(0); 8003 ztest_reguid(NULL, 0); 8004 8005 spa_close(spa, FTAG); 8006 kernel_fini(); 8007 } 8008 8009 static void 8010 ztest_import_impl(void) 8011 { 8012 importargs_t args = { 0 }; 8013 nvlist_t *cfg = NULL; 8014 int nsearch = 1; 8015 char *searchdirs[nsearch]; 8016 int flags = ZFS_IMPORT_MISSING_LOG; 8017 8018 searchdirs[0] = ztest_opts.zo_dir; 8019 args.paths = nsearch; 8020 args.path = searchdirs; 8021 args.can_be_active = B_FALSE; 8022 8023 libpc_handle_t lpch = { 8024 .lpc_lib_handle = NULL, 8025 .lpc_ops = &libzpool_config_ops, 8026 .lpc_printerr = B_TRUE 8027 }; 8028 VERIFY0(zpool_find_config(&lpch, ztest_opts.zo_pool, &cfg, &args)); 8029 VERIFY0(spa_import(ztest_opts.zo_pool, cfg, NULL, flags)); 8030 fnvlist_free(cfg); 8031 } 8032 8033 /* 8034 * Import a storage pool with the given name. 
8035 */ 8036 static void 8037 ztest_import(ztest_shared_t *zs) 8038 { 8039 spa_t *spa; 8040 8041 mutex_init(&ztest_vdev_lock, NULL, MUTEX_DEFAULT, NULL); 8042 mutex_init(&ztest_checkpoint_lock, NULL, MUTEX_DEFAULT, NULL); 8043 VERIFY0(pthread_rwlock_init(&ztest_name_lock, NULL)); 8044 8045 raidz_scratch_verify(); 8046 kernel_init(SPA_MODE_READ | SPA_MODE_WRITE); 8047 8048 ztest_import_impl(); 8049 8050 VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG)); 8051 zs->zs_metaslab_sz = 8052 1ULL << spa->spa_root_vdev->vdev_child[0]->vdev_ms_shift; 8053 zs->zs_guid = spa_guid(spa); 8054 spa_close(spa, FTAG); 8055 8056 kernel_fini(); 8057 8058 if (!ztest_opts.zo_mmp_test) { 8059 ztest_run_zdb(zs->zs_guid); 8060 ztest_freeze(); 8061 ztest_run_zdb(zs->zs_guid); 8062 } 8063 8064 (void) pthread_rwlock_destroy(&ztest_name_lock); 8065 mutex_destroy(&ztest_vdev_lock); 8066 mutex_destroy(&ztest_checkpoint_lock); 8067 } 8068 8069 /* 8070 * After the expansion was killed, check that the pool is healthy 8071 */ 8072 static void 8073 ztest_raidz_expand_check(spa_t *spa) 8074 { 8075 ASSERT3U(ztest_opts.zo_raidz_expand_test, ==, RAIDZ_EXPAND_KILLED); 8076 /* 8077 * Set pool check done flag, main program will run a zdb check 8078 * of the pool when we exit. 8079 */ 8080 ztest_shared_opts->zo_raidz_expand_test = RAIDZ_EXPAND_CHECKED; 8081 8082 /* Wait for reflow to finish */ 8083 if (ztest_opts.zo_verbose >= 1) { 8084 (void) printf("\nwaiting for reflow to finish ...\n"); 8085 } 8086 pool_raidz_expand_stat_t rzx_stats; 8087 pool_raidz_expand_stat_t *pres = &rzx_stats; 8088 do { 8089 txg_wait_synced(spa_get_dsl(spa), 0); 8090 (void) poll(NULL, 0, 500); /* wait 1/2 second */ 8091 8092 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 8093 (void) spa_raidz_expand_get_stats(spa, pres); 8094 spa_config_exit(spa, SCL_CONFIG, FTAG); 8095 } while (pres->pres_state != DSS_FINISHED && 8096 pres->pres_reflowed < pres->pres_to_reflow); 8097 8098 if (ztest_opts.zo_verbose >= 1) { 8099 (void) printf("verifying an interrupted raidz " 8100 "expansion using a pool scrub ...\n"); 8101 } 8102 8103 /* Will fail here if there is non-recoverable corruption detected */ 8104 int error = ztest_scrub_impl(spa); 8105 if (error == EBUSY) 8106 error = 0; 8107 8108 VERIFY0(error); 8109 8110 if (ztest_opts.zo_verbose >= 1) { 8111 (void) printf("raidz expansion scrub check complete\n"); 8112 } 8113 } 8114 8115 /* 8116 * Start a raidz expansion test. We run some I/O on the pool for a while 8117 * to get some data in the pool. Then we grow the raidz and 8118 * kill the test at the requested offset into the reflow, verifying that 8119 * doing such does not lead to pool corruption. 
8120 */ 8121 static void 8122 ztest_raidz_expand_run(ztest_shared_t *zs, spa_t *spa) 8123 { 8124 nvlist_t *root; 8125 pool_raidz_expand_stat_t rzx_stats; 8126 pool_raidz_expand_stat_t *pres = &rzx_stats; 8127 kthread_t **run_threads; 8128 vdev_t *cvd, *rzvd = spa->spa_root_vdev->vdev_child[0]; 8129 int total_disks = rzvd->vdev_children; 8130 int data_disks = total_disks - vdev_get_nparity(rzvd); 8131 uint64_t alloc_goal; 8132 uint64_t csize; 8133 int error, t; 8134 int threads = ztest_opts.zo_threads; 8135 ztest_expand_io_t *thread_args; 8136 8137 ASSERT3U(ztest_opts.zo_raidz_expand_test, !=, RAIDZ_EXPAND_NONE); 8138 ASSERT3P(rzvd->vdev_ops, ==, &vdev_raidz_ops); 8139 ztest_opts.zo_raidz_expand_test = RAIDZ_EXPAND_STARTED; 8140 8141 /* Setup a 1 MiB buffer of random data */ 8142 uint64_t bufsize = 1024 * 1024; 8143 void *buffer = umem_alloc(bufsize, UMEM_NOFAIL); 8144 8145 if (read(ztest_fd_rand, buffer, bufsize) != bufsize) { 8146 fatal(B_TRUE, "short read from /dev/urandom"); 8147 } 8148 /* 8149 * Put some data in the pool and then attach a vdev to initiate 8150 * reflow. 8151 */ 8152 run_threads = umem_zalloc(threads * sizeof (kthread_t *), UMEM_NOFAIL); 8153 thread_args = umem_zalloc(threads * sizeof (ztest_expand_io_t), 8154 UMEM_NOFAIL); 8155 /* Aim for roughly 25% of allocatable space up to 1GB */ 8156 alloc_goal = (vdev_get_min_asize(rzvd) * data_disks) / total_disks; 8157 alloc_goal = MIN(alloc_goal >> 2, 1024*1024*1024); 8158 if (ztest_opts.zo_verbose >= 1) { 8159 (void) printf("adding data to pool '%s', goal %llu bytes\n", 8160 ztest_opts.zo_pool, (u_longlong_t)alloc_goal); 8161 } 8162 8163 /* 8164 * Kick off all the I/O generators that run in parallel. 8165 */ 8166 for (t = 0; t < threads; t++) { 8167 if (t < ztest_opts.zo_datasets && ztest_dataset_open(t) != 0) { 8168 umem_free(run_threads, threads * sizeof (kthread_t *)); 8169 umem_free(buffer, bufsize); 8170 return; 8171 } 8172 thread_args[t].rzx_id = t; 8173 thread_args[t].rzx_amount = alloc_goal / threads; 8174 thread_args[t].rzx_bufsize = bufsize; 8175 thread_args[t].rzx_buffer = buffer; 8176 thread_args[t].rzx_alloc_max = alloc_goal; 8177 thread_args[t].rzx_spa = spa; 8178 run_threads[t] = thread_create(NULL, 0, ztest_rzx_thread, 8179 &thread_args[t], 0, NULL, TS_RUN | TS_JOINABLE, 8180 defclsyspri); 8181 } 8182 8183 /* 8184 * Wait for all of the writers to complete. 8185 */ 8186 for (t = 0; t < threads; t++) 8187 VERIFY0(thread_join(run_threads[t])); 8188 8189 /* 8190 * Close all datasets. This must be done after all the threads 8191 * are joined so we can be sure none of the datasets are in-use 8192 * by any of the threads. 
8193 */ 8194 for (t = 0; t < ztest_opts.zo_threads; t++) { 8195 if (t < ztest_opts.zo_datasets) 8196 ztest_dataset_close(t); 8197 } 8198 8199 txg_wait_synced(spa_get_dsl(spa), 0); 8200 8201 zs->zs_alloc = metaslab_class_get_alloc(spa_normal_class(spa)); 8202 zs->zs_space = metaslab_class_get_space(spa_normal_class(spa)); 8203 8204 umem_free(buffer, bufsize); 8205 umem_free(run_threads, threads * sizeof (kthread_t *)); 8206 umem_free(thread_args, threads * sizeof (ztest_expand_io_t)); 8207 8208 /* Set our reflow target to 25%, 50% or 75% of allocated size */ 8209 uint_t multiple = ztest_random(3) + 1; 8210 uint64_t reflow_max = (rzvd->vdev_stat.vs_alloc * multiple) / 4; 8211 raidz_expand_max_reflow_bytes = reflow_max; 8212 8213 if (ztest_opts.zo_verbose >= 1) { 8214 (void) printf("running raidz expansion test, killing when " 8215 "reflow reaches %llu bytes (%u/4 of allocated space)\n", 8216 (u_longlong_t)reflow_max, multiple); 8217 } 8218 8219 /* XXX - do we want some I/O load during the reflow? */ 8220 8221 /* 8222 * Use a disk size that is larger than existing ones 8223 */ 8224 cvd = rzvd->vdev_child[0]; 8225 csize = vdev_get_min_asize(cvd); 8226 csize += csize / 10; 8227 /* 8228 * Path to vdev to be attached 8229 */ 8230 char *newpath = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); 8231 (void) snprintf(newpath, MAXPATHLEN, ztest_dev_template, 8232 ztest_opts.zo_dir, ztest_opts.zo_pool, rzvd->vdev_children); 8233 /* 8234 * Build the nvlist describing newpath. 8235 */ 8236 root = make_vdev_root(newpath, NULL, NULL, csize, ztest_get_ashift(), 8237 NULL, 0, 0, 1); 8238 /* 8239 * Expand the raidz vdev by attaching the new disk 8240 */ 8241 if (ztest_opts.zo_verbose >= 1) { 8242 (void) printf("expanding raidz: %d wide to %d wide with '%s'\n", 8243 (int)rzvd->vdev_children, (int)rzvd->vdev_children + 1, 8244 newpath); 8245 } 8246 error = spa_vdev_attach(spa, rzvd->vdev_guid, root, B_FALSE, B_FALSE); 8247 nvlist_free(root); 8248 if (error != 0) { 8249 fatal(0, "raidz expand: attach (%s %llu) returned %d", 8250 newpath, (long long)csize, error); 8251 } 8252 8253 /* 8254 * Wait for reflow to begin 8255 */ 8256 while (spa->spa_raidz_expand == NULL) { 8257 txg_wait_synced(spa_get_dsl(spa), 0); 8258 (void) poll(NULL, 0, 100); /* wait 1/10 second */ 8259 } 8260 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 8261 (void) spa_raidz_expand_get_stats(spa, pres); 8262 spa_config_exit(spa, SCL_CONFIG, FTAG); 8263 while (pres->pres_state != DSS_SCANNING) { 8264 txg_wait_synced(spa_get_dsl(spa), 0); 8265 (void) poll(NULL, 0, 100); /* wait 1/10 second */ 8266 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 8267 (void) spa_raidz_expand_get_stats(spa, pres); 8268 spa_config_exit(spa, SCL_CONFIG, FTAG); 8269 } 8270 8271 ASSERT3U(pres->pres_state, ==, DSS_SCANNING); 8272 ASSERT3U(pres->pres_to_reflow, !=, 0); 8273 /* 8274 * Set so when we are killed we go to raidz checking rather than 8275 * restarting test. 
8276 */ 8277 ztest_shared_opts->zo_raidz_expand_test = RAIDZ_EXPAND_KILLED; 8278 if (ztest_opts.zo_verbose >= 1) { 8279 (void) printf("raidz expansion reflow started, waiting for " 8280 "%llu bytes to be copied\n", (u_longlong_t)reflow_max); 8281 } 8282 8283 /* 8284 * Wait for reflow maximum to be reached and then kill the test 8285 */ 8286 while (pres->pres_reflowed < reflow_max) { 8287 txg_wait_synced(spa_get_dsl(spa), 0); 8288 (void) poll(NULL, 0, 100); /* wait 1/10 second */ 8289 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 8290 (void) spa_raidz_expand_get_stats(spa, pres); 8291 spa_config_exit(spa, SCL_CONFIG, FTAG); 8292 } 8293 8294 /* Reset the reflow pause before killing */ 8295 raidz_expand_max_reflow_bytes = 0; 8296 8297 if (ztest_opts.zo_verbose >= 1) { 8298 (void) printf("killing raidz expansion test after reflow " 8299 "reached %llu bytes\n", (u_longlong_t)pres->pres_reflowed); 8300 } 8301 8302 /* 8303 * Kill ourself to simulate a panic during a reflow. Our parent will 8304 * restart the test and the changed flag value will drive the test 8305 * through the scrub/check code to verify the pool is not corrupted. 8306 */ 8307 ztest_kill(zs); 8308 } 8309 8310 static void 8311 ztest_generic_run(ztest_shared_t *zs, spa_t *spa) 8312 { 8313 kthread_t **run_threads; 8314 int i, ndatasets; 8315 8316 run_threads = umem_zalloc(ztest_opts.zo_threads * sizeof (kthread_t *), 8317 UMEM_NOFAIL); 8318 8319 /* 8320 * Actual number of datasets to be used. 8321 */ 8322 ndatasets = MIN(ztest_opts.zo_datasets, ztest_opts.zo_threads); 8323 8324 /* 8325 * Prepare the datasets first. 8326 */ 8327 for (i = 0; i < ndatasets; i++) 8328 VERIFY0(ztest_dataset_open(i)); 8329 8330 /* 8331 * Kick off all the tests that run in parallel. 8332 */ 8333 for (i = 0; i < ztest_opts.zo_threads; i++) { 8334 run_threads[i] = thread_create(NULL, 0, ztest_thread, 8335 (void *)(uintptr_t)i, 0, NULL, TS_RUN | TS_JOINABLE, 8336 defclsyspri); 8337 } 8338 8339 /* 8340 * Wait for all of the tests to complete. 8341 */ 8342 for (i = 0; i < ztest_opts.zo_threads; i++) 8343 VERIFY0(thread_join(run_threads[i])); 8344 8345 /* 8346 * Close all datasets. This must be done after all the threads 8347 * are joined so we can be sure none of the datasets are in-use 8348 * by any of the threads. 8349 */ 8350 for (i = 0; i < ndatasets; i++) 8351 ztest_dataset_close(i); 8352 8353 txg_wait_synced(spa_get_dsl(spa), 0); 8354 8355 zs->zs_alloc = metaslab_class_get_alloc(spa_normal_class(spa)); 8356 zs->zs_space = metaslab_class_get_space(spa_normal_class(spa)); 8357 8358 umem_free(run_threads, ztest_opts.zo_threads * sizeof (kthread_t *)); 8359 } 8360 8361 /* 8362 * Setup our test context and kick off threads to run tests on all datasets 8363 * in parallel. 8364 */ 8365 static void 8366 ztest_run(ztest_shared_t *zs) 8367 { 8368 spa_t *spa; 8369 objset_t *os; 8370 kthread_t *resume_thread, *deadman_thread; 8371 uint64_t object; 8372 int error; 8373 int t, d; 8374 8375 ztest_exiting = B_FALSE; 8376 8377 /* 8378 * Initialize parent/child shared state. 
8379 */ 8380 mutex_init(&ztest_vdev_lock, NULL, MUTEX_DEFAULT, NULL); 8381 mutex_init(&ztest_checkpoint_lock, NULL, MUTEX_DEFAULT, NULL); 8382 VERIFY0(pthread_rwlock_init(&ztest_name_lock, NULL)); 8383 8384 zs->zs_thread_start = gethrtime(); 8385 zs->zs_thread_stop = 8386 zs->zs_thread_start + ztest_opts.zo_passtime * NANOSEC; 8387 zs->zs_thread_stop = MIN(zs->zs_thread_stop, zs->zs_proc_stop); 8388 zs->zs_thread_kill = zs->zs_thread_stop; 8389 if (ztest_random(100) < ztest_opts.zo_killrate) { 8390 zs->zs_thread_kill -= 8391 ztest_random(ztest_opts.zo_passtime * NANOSEC); 8392 } 8393 8394 mutex_init(&zcl.zcl_callbacks_lock, NULL, MUTEX_DEFAULT, NULL); 8395 8396 list_create(&zcl.zcl_callbacks, sizeof (ztest_cb_data_t), 8397 offsetof(ztest_cb_data_t, zcd_node)); 8398 8399 /* 8400 * Open our pool. It may need to be imported first depending on 8401 * what tests were running when the previous pass was terminated. 8402 */ 8403 raidz_scratch_verify(); 8404 kernel_init(SPA_MODE_READ | SPA_MODE_WRITE); 8405 error = spa_open(ztest_opts.zo_pool, &spa, FTAG); 8406 if (error) { 8407 VERIFY3S(error, ==, ENOENT); 8408 ztest_import_impl(); 8409 VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG)); 8410 zs->zs_metaslab_sz = 8411 1ULL << spa->spa_root_vdev->vdev_child[0]->vdev_ms_shift; 8412 } 8413 8414 metaslab_preload_limit = ztest_random(20) + 1; 8415 ztest_spa = spa; 8416 8417 /* 8418 * XXX - BUGBUG: cycle the raidz implementation only for the raidz expansion test; not yet enabled for the generic run. 8419 */ 8420 if (ztest_opts.zo_raidz_expand_test != RAIDZ_EXPAND_NONE) 8421 VERIFY0(vdev_raidz_impl_set("cycle")); 8422 8423 dmu_objset_stats_t dds; 8424 VERIFY0(ztest_dmu_objset_own(ztest_opts.zo_pool, 8425 DMU_OST_ANY, B_TRUE, B_TRUE, FTAG, &os)); 8426 dsl_pool_config_enter(dmu_objset_pool(os), FTAG); 8427 dmu_objset_fast_stat(os, &dds); 8428 dsl_pool_config_exit(dmu_objset_pool(os), FTAG); 8429 dmu_objset_disown(os, B_TRUE, FTAG); 8430 8431 /* Give the dedicated raidz expansion test more grace time */ 8432 if (ztest_opts.zo_raidz_expand_test != RAIDZ_EXPAND_NONE) 8433 zfs_deadman_synctime_ms *= 2; 8434 8435 /* 8436 * Create a thread to periodically resume suspended I/O. 8437 */ 8438 resume_thread = thread_create(NULL, 0, ztest_resume_thread, 8439 spa, 0, NULL, TS_RUN | TS_JOINABLE, defclsyspri); 8440 8441 /* 8442 * Create a deadman thread and set it to panic if we hang. 8443 */ 8444 deadman_thread = thread_create(NULL, 0, ztest_deadman_thread, 8445 zs, 0, NULL, TS_RUN | TS_JOINABLE, defclsyspri); 8446 8447 spa->spa_deadman_failmode = ZIO_FAILURE_MODE_PANIC; 8448 8449 /* 8450 * Verify that we can safely inquire about any object, 8451 * whether it's allocated or not. To make it interesting, 8452 * we probe a +/-5 window around each power of two. 8453 * This hits all edge cases, including zero and the max. 8454 */ 8455 for (t = 0; t < 64; t++) { 8456 for (d = -5; d <= 5; d++) { 8457 error = dmu_object_info(spa->spa_meta_objset, 8458 (1ULL << t) + d, NULL); 8459 ASSERT(error == 0 || error == ENOENT || 8460 error == EINVAL); 8461 } 8462 } 8463 8464 /* 8465 * If we got any ENOSPC errors on the previous run, destroy something.
8466 */ 8467 if (zs->zs_enospc_count != 0) { 8468 /* Not expecting ENOSPC errors during raidz expansion tests */ 8469 ASSERT3U(ztest_opts.zo_raidz_expand_test, ==, 8470 RAIDZ_EXPAND_NONE); 8471 8472 int d = ztest_random(ztest_opts.zo_datasets); 8473 ztest_dataset_destroy(d); 8474 txg_wait_synced(spa_get_dsl(spa), 0); 8475 } 8476 zs->zs_enospc_count = 0; 8477 8478 /* 8479 * If we were in the middle of ztest_device_removal() and were killed 8480 * we need to ensure the removal and scrub complete before running 8481 * any tests that check ztest_device_removal_active. The removal will 8482 * be restarted automatically when the spa is opened, but we need to 8483 * initiate the scrub manually if it is not already in progress. Note 8484 * that we always run the scrub whenever an indirect vdev exists 8485 * because we have no way of knowing for sure if ztest_device_removal() 8486 * fully completed its scrub before the pool was reimported. 8487 * 8488 * Does not apply for the RAIDZ expansion specific test runs 8489 */ 8490 if (ztest_opts.zo_raidz_expand_test == RAIDZ_EXPAND_NONE && 8491 (spa->spa_removing_phys.sr_state == DSS_SCANNING || 8492 spa->spa_removing_phys.sr_prev_indirect_vdev != -1)) { 8493 while (spa->spa_removing_phys.sr_state == DSS_SCANNING) 8494 txg_wait_synced(spa_get_dsl(spa), 0); 8495 8496 error = ztest_scrub_impl(spa); 8497 if (error == EBUSY) 8498 error = 0; 8499 ASSERT0(error); 8500 } 8501 8502 if (ztest_opts.zo_verbose >= 4) 8503 (void) printf("starting main threads...\n"); 8504 8505 /* 8506 * Replay all logs of all datasets in the pool. This is primarily for 8507 * temporary datasets which wouldn't otherwise get replayed, which 8508 * can trigger failures when attempting to offline a SLOG in 8509 * ztest_fault_inject(). 8510 */ 8511 (void) dmu_objset_find(ztest_opts.zo_pool, ztest_replay_zil_cb, 8512 NULL, DS_FIND_CHILDREN); 8513 8514 if (ztest_opts.zo_raidz_expand_test == RAIDZ_EXPAND_REQUESTED) 8515 ztest_raidz_expand_run(zs, spa); 8516 else if (ztest_opts.zo_raidz_expand_test == RAIDZ_EXPAND_KILLED) 8517 ztest_raidz_expand_check(spa); 8518 else 8519 ztest_generic_run(zs, spa); 8520 8521 /* Kill the resume and deadman threads */ 8522 ztest_exiting = B_TRUE; 8523 VERIFY0(thread_join(resume_thread)); 8524 VERIFY0(thread_join(deadman_thread)); 8525 ztest_resume(spa); 8526 8527 /* 8528 * Right before closing the pool, kick off a bunch of async I/O; 8529 * spa_close() should wait for it to complete. 8530 */ 8531 for (object = 1; object < 50; object++) { 8532 dmu_prefetch(spa->spa_meta_objset, object, 0, 0, 1ULL << 20, 8533 ZIO_PRIORITY_SYNC_READ); 8534 } 8535 8536 /* Verify that at least one commit cb was called in a timely fashion */ 8537 if (zc_cb_counter >= ZTEST_COMMIT_CB_MIN_REG) 8538 VERIFY0(zc_min_txg_delay); 8539 8540 spa_close(spa, FTAG); 8541 8542 /* 8543 * Verify that we can loop over all pools. 8544 */ 8545 mutex_enter(&spa_namespace_lock); 8546 for (spa = spa_next(NULL); spa != NULL; spa = spa_next(spa)) 8547 if (ztest_opts.zo_verbose > 3) 8548 (void) printf("spa_next: found %s\n", spa_name(spa)); 8549 mutex_exit(&spa_namespace_lock); 8550 8551 /* 8552 * Verify that we can export the pool and reimport it under a 8553 * different name. 
8554 */ 8555 if ((ztest_random(2) == 0) && !ztest_opts.zo_mmp_test) { 8556 char name[ZFS_MAX_DATASET_NAME_LEN]; 8557 (void) snprintf(name, sizeof (name), "%s_import", 8558 ztest_opts.zo_pool); 8559 ztest_spa_import_export(ztest_opts.zo_pool, name); 8560 ztest_spa_import_export(name, ztest_opts.zo_pool); 8561 } 8562 8563 kernel_fini(); 8564 8565 list_destroy(&zcl.zcl_callbacks); 8566 mutex_destroy(&zcl.zcl_callbacks_lock); 8567 (void) pthread_rwlock_destroy(&ztest_name_lock); 8568 mutex_destroy(&ztest_vdev_lock); 8569 mutex_destroy(&ztest_checkpoint_lock); 8570 } 8571 8572 static void 8573 print_time(hrtime_t t, char *timebuf) 8574 { 8575 hrtime_t s = t / NANOSEC; 8576 hrtime_t m = s / 60; 8577 hrtime_t h = m / 60; 8578 hrtime_t d = h / 24; 8579 8580 s -= m * 60; 8581 m -= h * 60; 8582 h -= d * 24; 8583 8584 timebuf[0] = '\0'; 8585 8586 if (d) 8587 (void) sprintf(timebuf, 8588 "%llud%02lluh%02llum%02llus", d, h, m, s); 8589 else if (h) 8590 (void) sprintf(timebuf, "%lluh%02llum%02llus", h, m, s); 8591 else if (m) 8592 (void) sprintf(timebuf, "%llum%02llus", m, s); 8593 else 8594 (void) sprintf(timebuf, "%llus", s); 8595 } 8596 8597 static nvlist_t * 8598 make_random_pool_props(void) 8599 { 8600 nvlist_t *props; 8601 8602 props = fnvlist_alloc(); 8603 8604 /* Twenty percent of the time enable ZPOOL_PROP_DEDUP_TABLE_QUOTA */ 8605 if (ztest_random(5) == 0) { 8606 fnvlist_add_uint64(props, 8607 zpool_prop_to_name(ZPOOL_PROP_DEDUP_TABLE_QUOTA), 8608 2 * 1024 * 1024); 8609 } 8610 8611 /* Fifty percent of the time enable ZPOOL_PROP_AUTOREPLACE */ 8612 if (ztest_random(2) == 0) { 8613 fnvlist_add_uint64(props, 8614 zpool_prop_to_name(ZPOOL_PROP_AUTOREPLACE), 1); 8615 } 8616 8617 return (props); 8618 } 8619 8620 /* 8621 * Create a storage pool with the given name and initial vdev size. 8622 * Then test spa_freeze() functionality. 8623 */ 8624 static void 8625 ztest_init(ztest_shared_t *zs) 8626 { 8627 spa_t *spa; 8628 nvlist_t *nvroot, *props; 8629 int i; 8630 8631 mutex_init(&ztest_vdev_lock, NULL, MUTEX_DEFAULT, NULL); 8632 mutex_init(&ztest_checkpoint_lock, NULL, MUTEX_DEFAULT, NULL); 8633 VERIFY0(pthread_rwlock_init(&ztest_name_lock, NULL)); 8634 8635 raidz_scratch_verify(); 8636 kernel_init(SPA_MODE_READ | SPA_MODE_WRITE); 8637 8638 /* 8639 * Create the storage pool. 8640 */ 8641 (void) spa_destroy(ztest_opts.zo_pool); 8642 ztest_shared->zs_vdev_next_leaf = 0; 8643 zs->zs_splits = 0; 8644 zs->zs_mirrors = ztest_opts.zo_mirrors; 8645 nvroot = make_vdev_root(NULL, NULL, NULL, ztest_opts.zo_vdev_size, 0, 8646 NULL, ztest_opts.zo_raid_children, zs->zs_mirrors, 1); 8647 props = make_random_pool_props(); 8648 8649 /* 8650 * We don't expect the pool to suspend unless maxfaults == 0, 8651 * in which case ztest_fault_inject() temporarily takes away 8652 * the only valid replica. 8653 */ 8654 fnvlist_add_uint64(props, 8655 zpool_prop_to_name(ZPOOL_PROP_FAILUREMODE), 8656 MAXFAULTS(zs) ? ZIO_FAILURE_MODE_PANIC : ZIO_FAILURE_MODE_WAIT); 8657 8658 for (i = 0; i < SPA_FEATURES; i++) { 8659 char *buf; 8660 8661 if (!spa_feature_table[i].fi_zfs_mod_supported) 8662 continue; 8663 8664 /* 8665 * 75% chance of using the log space map feature. We want ztest 8666 * to exercise both the code paths that use the log space map 8667 * feature and the ones that don't. 
8668 */ 8669 if (i == SPA_FEATURE_LOG_SPACEMAP && ztest_random(4) == 0) 8670 continue; 8671 8672 /* 8673 * split 50/50 between legacy and fast dedup 8674 */ 8675 if (i == SPA_FEATURE_FAST_DEDUP && ztest_random(2) != 0) 8676 continue; 8677 8678 VERIFY3S(-1, !=, asprintf(&buf, "feature@%s", 8679 spa_feature_table[i].fi_uname)); 8680 fnvlist_add_uint64(props, buf, 0); 8681 free(buf); 8682 } 8683 8684 VERIFY0(spa_create(ztest_opts.zo_pool, nvroot, props, NULL, NULL)); 8685 fnvlist_free(nvroot); 8686 fnvlist_free(props); 8687 8688 VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG)); 8689 zs->zs_metaslab_sz = 8690 1ULL << spa->spa_root_vdev->vdev_child[0]->vdev_ms_shift; 8691 zs->zs_guid = spa_guid(spa); 8692 spa_close(spa, FTAG); 8693 8694 kernel_fini(); 8695 8696 if (!ztest_opts.zo_mmp_test) { 8697 ztest_run_zdb(zs->zs_guid); 8698 ztest_freeze(); 8699 ztest_run_zdb(zs->zs_guid); 8700 } 8701 8702 (void) pthread_rwlock_destroy(&ztest_name_lock); 8703 mutex_destroy(&ztest_vdev_lock); 8704 mutex_destroy(&ztest_checkpoint_lock); 8705 } 8706 8707 static void 8708 setup_data_fd(void) 8709 { 8710 static char ztest_name_data[] = "/tmp/ztest.data.XXXXXX"; 8711 8712 ztest_fd_data = mkstemp(ztest_name_data); 8713 ASSERT3S(ztest_fd_data, >=, 0); 8714 (void) unlink(ztest_name_data); 8715 } 8716 8717 static int 8718 shared_data_size(ztest_shared_hdr_t *hdr) 8719 { 8720 int size; 8721 8722 size = hdr->zh_hdr_size; 8723 size += hdr->zh_opts_size; 8724 size += hdr->zh_size; 8725 size += hdr->zh_stats_size * hdr->zh_stats_count; 8726 size += hdr->zh_ds_size * hdr->zh_ds_count; 8727 size += hdr->zh_scratch_state_size; 8728 8729 return (size); 8730 } 8731 8732 static void 8733 setup_hdr(void) 8734 { 8735 int size; 8736 ztest_shared_hdr_t *hdr; 8737 8738 hdr = (void *)mmap(0, P2ROUNDUP(sizeof (*hdr), getpagesize()), 8739 PROT_READ | PROT_WRITE, MAP_SHARED, ztest_fd_data, 0); 8740 ASSERT3P(hdr, !=, MAP_FAILED); 8741 8742 VERIFY0(ftruncate(ztest_fd_data, sizeof (ztest_shared_hdr_t))); 8743 8744 hdr->zh_hdr_size = sizeof (ztest_shared_hdr_t); 8745 hdr->zh_opts_size = sizeof (ztest_shared_opts_t); 8746 hdr->zh_size = sizeof (ztest_shared_t); 8747 hdr->zh_stats_size = sizeof (ztest_shared_callstate_t); 8748 hdr->zh_stats_count = ZTEST_FUNCS; 8749 hdr->zh_ds_size = sizeof (ztest_shared_ds_t); 8750 hdr->zh_ds_count = ztest_opts.zo_datasets; 8751 hdr->zh_scratch_state_size = sizeof (ztest_shared_scratch_state_t); 8752 8753 size = shared_data_size(hdr); 8754 VERIFY0(ftruncate(ztest_fd_data, size)); 8755 8756 (void) munmap((caddr_t)hdr, P2ROUNDUP(sizeof (*hdr), getpagesize())); 8757 } 8758 8759 static void 8760 setup_data(void) 8761 { 8762 int size, offset; 8763 ztest_shared_hdr_t *hdr; 8764 uint8_t *buf; 8765 8766 hdr = (void *)mmap(0, P2ROUNDUP(sizeof (*hdr), getpagesize()), 8767 PROT_READ, MAP_SHARED, ztest_fd_data, 0); 8768 ASSERT3P(hdr, !=, MAP_FAILED); 8769 8770 size = shared_data_size(hdr); 8771 8772 (void) munmap((caddr_t)hdr, P2ROUNDUP(sizeof (*hdr), getpagesize())); 8773 hdr = ztest_shared_hdr = (void *)mmap(0, P2ROUNDUP(size, getpagesize()), 8774 PROT_READ | PROT_WRITE, MAP_SHARED, ztest_fd_data, 0); 8775 ASSERT3P(hdr, !=, MAP_FAILED); 8776 buf = (uint8_t *)hdr; 8777 8778 offset = hdr->zh_hdr_size; 8779 ztest_shared_opts = (void *)&buf[offset]; 8780 offset += hdr->zh_opts_size; 8781 ztest_shared = (void *)&buf[offset]; 8782 offset += hdr->zh_size; 8783 ztest_shared_callstate = (void *)&buf[offset]; 8784 offset += hdr->zh_stats_size * hdr->zh_stats_count; 8785 ztest_shared_ds = (void *)&buf[offset]; 8786 offset += 
hdr->zh_ds_size * hdr->zh_ds_count; 8787 ztest_scratch_state = (void *)&buf[offset]; 8788 } 8789 8790 static boolean_t 8791 exec_child(char *cmd, char *libpath, boolean_t ignorekill, int *statusp) 8792 { 8793 pid_t pid; 8794 int status; 8795 char *cmdbuf = NULL; 8796 8797 pid = fork(); 8798 8799 if (cmd == NULL) { 8800 cmdbuf = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); 8801 (void) strlcpy(cmdbuf, getexecname(), MAXPATHLEN); 8802 cmd = cmdbuf; 8803 } 8804 8805 if (pid == -1) 8806 fatal(B_TRUE, "fork failed"); 8807 8808 if (pid == 0) { /* child */ 8809 char fd_data_str[12]; 8810 8811 VERIFY3S(11, >=, 8812 snprintf(fd_data_str, 12, "%d", ztest_fd_data)); 8813 VERIFY0(setenv("ZTEST_FD_DATA", fd_data_str, 1)); 8814 8815 if (libpath != NULL) { 8816 const char *curlp = getenv("LD_LIBRARY_PATH"); 8817 if (curlp == NULL) 8818 VERIFY0(setenv("LD_LIBRARY_PATH", libpath, 1)); 8819 else { 8820 char *newlp = NULL; 8821 VERIFY3S(-1, !=, 8822 asprintf(&newlp, "%s:%s", libpath, curlp)); 8823 VERIFY0(setenv("LD_LIBRARY_PATH", newlp, 1)); 8824 free(newlp); 8825 } 8826 } 8827 (void) execl(cmd, cmd, (char *)NULL); 8828 ztest_dump_core = B_FALSE; 8829 fatal(B_TRUE, "exec failed: %s", cmd); 8830 } 8831 8832 if (cmdbuf != NULL) { 8833 umem_free(cmdbuf, MAXPATHLEN); 8834 cmd = NULL; 8835 } 8836 8837 while (waitpid(pid, &status, 0) != pid) 8838 continue; 8839 if (statusp != NULL) 8840 *statusp = status; 8841 8842 if (WIFEXITED(status)) { 8843 if (WEXITSTATUS(status) != 0) { 8844 (void) fprintf(stderr, "child exited with code %d\n", 8845 WEXITSTATUS(status)); 8846 exit(2); 8847 } 8848 return (B_FALSE); 8849 } else if (WIFSIGNALED(status)) { 8850 if (!ignorekill || WTERMSIG(status) != SIGKILL) { 8851 (void) fprintf(stderr, "child died with signal %d\n", 8852 WTERMSIG(status)); 8853 exit(3); 8854 } 8855 return (B_TRUE); 8856 } else { 8857 (void) fprintf(stderr, "something strange happened to child\n"); 8858 exit(4); 8859 } 8860 } 8861 8862 static void 8863 ztest_run_init(void) 8864 { 8865 int i; 8866 8867 ztest_shared_t *zs = ztest_shared; 8868 8869 /* 8870 * Blow away any existing copy of zpool.cache 8871 */ 8872 (void) remove(spa_config_path); 8873 8874 if (ztest_opts.zo_init == 0) { 8875 if (ztest_opts.zo_verbose >= 1) 8876 (void) printf("Importing pool %s\n", 8877 ztest_opts.zo_pool); 8878 ztest_import(zs); 8879 return; 8880 } 8881 8882 /* 8883 * Create and initialize our storage pool. 8884 */ 8885 for (i = 1; i <= ztest_opts.zo_init; i++) { 8886 memset(zs, 0, sizeof (*zs)); 8887 if (ztest_opts.zo_verbose >= 3 && 8888 ztest_opts.zo_init != 1) { 8889 (void) printf("ztest_init(), pass %d\n", i); 8890 } 8891 ztest_init(zs); 8892 } 8893 } 8894 8895 int 8896 main(int argc, char **argv) 8897 { 8898 int kills = 0; 8899 int iters = 0; 8900 int older = 0; 8901 int newer = 0; 8902 ztest_shared_t *zs; 8903 ztest_info_t *zi; 8904 ztest_shared_callstate_t *zc; 8905 char timebuf[100]; 8906 char numbuf[NN_NUMBUF_SZ]; 8907 char *cmd; 8908 boolean_t hasalt; 8909 int f, err; 8910 char *fd_data_str = getenv("ZTEST_FD_DATA"); 8911 struct sigaction action; 8912 8913 (void) setvbuf(stdout, NULL, _IOLBF, 0); 8914 8915 dprintf_setup(&argc, argv); 8916 zfs_deadman_synctime_ms = 300000; 8917 zfs_deadman_checktime_ms = 30000; 8918 /* 8919 * As two-word space map entries may not come up often (especially 8920 * if pool and vdev sizes are small) we want to force at least some 8921 * of them so the feature gets tested.
8922 */ 8923 zfs_force_some_double_word_sm_entries = B_TRUE; 8924 8925 /* 8926 * Verify that even extensively damaged split blocks with many 8927 * segments can be reconstructed in a reasonable amount of time 8928 * when reconstruction is known to be possible. 8929 * 8930 * Note: the lower this value is, the more damage we inflict, and 8931 * the more time ztest spends in recovering that damage. We chose 8932 * to induce damage 1/100th of the time so recovery is tested but 8933 * not so frequently that ztest doesn't get to test other code paths. 8934 */ 8935 zfs_reconstruct_indirect_damage_fraction = 100; 8936 8937 action.sa_handler = sig_handler; 8938 sigemptyset(&action.sa_mask); 8939 action.sa_flags = 0; 8940 8941 if (sigaction(SIGSEGV, &action, NULL) < 0) { 8942 (void) fprintf(stderr, "ztest: cannot catch SIGSEGV: %s.\n", 8943 strerror(errno)); 8944 exit(EXIT_FAILURE); 8945 } 8946 8947 if (sigaction(SIGABRT, &action, NULL) < 0) { 8948 (void) fprintf(stderr, "ztest: cannot catch SIGABRT: %s.\n", 8949 strerror(errno)); 8950 exit(EXIT_FAILURE); 8951 } 8952 8953 /* 8954 * Force random_get_bytes() to use /dev/urandom in order to prevent 8955 * ztest from needlessly depleting the system entropy pool. 8956 */ 8957 random_path = "/dev/urandom"; 8958 ztest_fd_rand = open(random_path, O_RDONLY | O_CLOEXEC); 8959 ASSERT3S(ztest_fd_rand, >=, 0); 8960 8961 if (!fd_data_str) { 8962 process_options(argc, argv); 8963 8964 setup_data_fd(); 8965 setup_hdr(); 8966 setup_data(); 8967 memcpy(ztest_shared_opts, &ztest_opts, 8968 sizeof (*ztest_shared_opts)); 8969 } else { 8970 ztest_fd_data = atoi(fd_data_str); 8971 setup_data(); 8972 memcpy(&ztest_opts, ztest_shared_opts, sizeof (ztest_opts)); 8973 } 8974 ASSERT3U(ztest_opts.zo_datasets, ==, ztest_shared_hdr->zh_ds_count); 8975 8976 err = ztest_set_global_vars(); 8977 if (err != 0 && !fd_data_str) { 8978 /* error message done by ztest_set_global_vars */ 8979 exit(EXIT_FAILURE); 8980 } else { 8981 /* children should not be spawned if setting gvars fails */ 8982 VERIFY3S(err, ==, 0); 8983 } 8984 8985 /* Override location of zpool.cache */ 8986 VERIFY3S(asprintf((char **)&spa_config_path, "%s/zpool.cache", 8987 ztest_opts.zo_dir), !=, -1); 8988 8989 ztest_ds = umem_alloc(ztest_opts.zo_datasets * sizeof (ztest_ds_t), 8990 UMEM_NOFAIL); 8991 zs = ztest_shared; 8992 8993 if (fd_data_str) { 8994 metaslab_force_ganging = ztest_opts.zo_metaslab_force_ganging; 8995 metaslab_df_alloc_threshold = 8996 zs->zs_metaslab_df_alloc_threshold; 8997 8998 if (zs->zs_do_init) 8999 ztest_run_init(); 9000 else 9001 ztest_run(zs); 9002 exit(0); 9003 } 9004 9005 hasalt = (strlen(ztest_opts.zo_alt_ztest) != 0); 9006 9007 if (ztest_opts.zo_verbose >= 1) { 9008 (void) printf("%"PRIu64" vdevs, %d datasets, %d threads, " 9009 "%d %s disks, parity %d, %"PRIu64" seconds...\n\n", 9010 ztest_opts.zo_vdevs, 9011 ztest_opts.zo_datasets, 9012 ztest_opts.zo_threads, 9013 ztest_opts.zo_raid_children, 9014 ztest_opts.zo_raid_type, 9015 ztest_opts.zo_raid_parity, 9016 ztest_opts.zo_time); 9017 } 9018 9019 cmd = umem_alloc(MAXNAMELEN, UMEM_NOFAIL); 9020 (void) strlcpy(cmd, getexecname(), MAXNAMELEN); 9021 9022 zs->zs_do_init = B_TRUE; 9023 if (strlen(ztest_opts.zo_alt_ztest) != 0) { 9024 if (ztest_opts.zo_verbose >= 1) { 9025 (void) printf("Executing older ztest for " 9026 "initialization: %s\n", ztest_opts.zo_alt_ztest); 9027 } 9028 VERIFY(!exec_child(ztest_opts.zo_alt_ztest, 9029 ztest_opts.zo_alt_libpath, B_FALSE, NULL)); 9030 } else { 9031 VERIFY(!exec_child(NULL, NULL, B_FALSE, NULL)); 9032 } 
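	/*
	 * The initialization pass is done; clear zs_do_init so that later
	 * child invocations run the stress workload (ztest_run()) rather
	 * than ztest_run_init().
	 */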
9033 zs->zs_do_init = B_FALSE; 9034 9035 zs->zs_proc_start = gethrtime(); 9036 zs->zs_proc_stop = zs->zs_proc_start + ztest_opts.zo_time * NANOSEC; 9037 9038 for (f = 0; f < ZTEST_FUNCS; f++) { 9039 zi = &ztest_info[f]; 9040 zc = ZTEST_GET_SHARED_CALLSTATE(f); 9041 if (zs->zs_proc_start + zi->zi_interval[0] > zs->zs_proc_stop) 9042 zc->zc_next = UINT64_MAX; 9043 else 9044 zc->zc_next = zs->zs_proc_start + 9045 ztest_random(2 * zi->zi_interval[0] + 1); 9046 } 9047 9048 /* 9049 * Run the tests in a loop. These tests include fault injection 9050 * to verify that self-healing data works, and forced crashes 9051 * to verify that we never lose on-disk consistency. 9052 */ 9053 while (gethrtime() < zs->zs_proc_stop) { 9054 int status; 9055 boolean_t killed; 9056 9057 /* 9058 * Initialize the workload counters for each function. 9059 */ 9060 for (f = 0; f < ZTEST_FUNCS; f++) { 9061 zc = ZTEST_GET_SHARED_CALLSTATE(f); 9062 zc->zc_count = 0; 9063 zc->zc_time = 0; 9064 } 9065 9066 /* Set the allocation switch size */ 9067 zs->zs_metaslab_df_alloc_threshold = 9068 ztest_random(zs->zs_metaslab_sz / 4) + 1; 9069 9070 if (!hasalt || ztest_random(2) == 0) { 9071 if (hasalt && ztest_opts.zo_verbose >= 1) { 9072 (void) printf("Executing newer ztest: %s\n", 9073 cmd); 9074 } 9075 newer++; 9076 killed = exec_child(cmd, NULL, B_TRUE, &status); 9077 } else { 9078 if (hasalt && ztest_opts.zo_verbose >= 1) { 9079 (void) printf("Executing older ztest: %s\n", 9080 ztest_opts.zo_alt_ztest); 9081 } 9082 older++; 9083 killed = exec_child(ztest_opts.zo_alt_ztest, 9084 ztest_opts.zo_alt_libpath, B_TRUE, &status); 9085 } 9086 9087 if (killed) 9088 kills++; 9089 iters++; 9090 9091 if (ztest_opts.zo_verbose >= 1) { 9092 hrtime_t now = gethrtime(); 9093 9094 now = MIN(now, zs->zs_proc_stop); 9095 print_time(zs->zs_proc_stop - now, timebuf); 9096 nicenum(zs->zs_space, numbuf, sizeof (numbuf)); 9097 9098 (void) printf("Pass %3d, %8s, %3"PRIu64" ENOSPC, " 9099 "%4.1f%% of %5s used, %3.0f%% done, %8s to go\n", 9100 iters, 9101 WIFEXITED(status) ? "Complete" : "SIGKILL", 9102 zs->zs_enospc_count, 9103 100.0 * zs->zs_alloc / zs->zs_space, 9104 numbuf, 9105 100.0 * (now - zs->zs_proc_start) / 9106 (ztest_opts.zo_time * NANOSEC), timebuf); 9107 } 9108 9109 if (ztest_opts.zo_verbose >= 2) { 9110 (void) printf("\nWorkload summary:\n\n"); 9111 (void) printf("%7s %9s %s\n", 9112 "Calls", "Time", "Function"); 9113 (void) printf("%7s %9s %s\n", 9114 "-----", "----", "--------"); 9115 for (f = 0; f < ZTEST_FUNCS; f++) { 9116 zi = &ztest_info[f]; 9117 zc = ZTEST_GET_SHARED_CALLSTATE(f); 9118 print_time(zc->zc_time, timebuf); 9119 (void) printf("%7"PRIu64" %9s %s\n", 9120 zc->zc_count, timebuf, 9121 zi->zi_funcname); 9122 } 9123 (void) printf("\n"); 9124 } 9125 9126 if (!ztest_opts.zo_mmp_test) 9127 ztest_run_zdb(zs->zs_guid); 9128 if (ztest_shared_opts->zo_raidz_expand_test == 9129 RAIDZ_EXPAND_CHECKED) 9130 break; /* raidz expand test complete */ 9131 } 9132 9133 if (ztest_opts.zo_verbose >= 1) { 9134 if (hasalt) { 9135 (void) printf("%d runs of older ztest: %s\n", older, 9136 ztest_opts.zo_alt_ztest); 9137 (void) printf("%d runs of newer ztest: %s\n", newer, 9138 cmd); 9139 } 9140 (void) printf("%d killed, %d completed, %.0f%% kill rate\n", 9141 kills, iters - kills, (100.0 * kills) / MAX(1, iters)); 9142 } 9143 9144 umem_free(cmd, MAXNAMELEN); 9145 9146 return (0); 9147 } 9148