1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or https://opensource.org/licenses/CDDL-1.0. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright (c) 2011, 2024 by Delphix. All rights reserved. 24 * Copyright 2011 Nexenta Systems, Inc. All rights reserved. 25 * Copyright (c) 2013 Steven Hartland. All rights reserved. 26 * Copyright (c) 2014 Integros [integros.com] 27 * Copyright 2017 Joyent, Inc. 28 * Copyright (c) 2017, Intel Corporation. 29 * Copyright (c) 2023, Klara, Inc. 30 */ 31 32 /* 33 * The objective of this program is to provide a DMU/ZAP/SPA stress test 34 * that runs entirely in userland, is easy to use, and easy to extend. 35 * 36 * The overall design of the ztest program is as follows: 37 * 38 * (1) For each major functional area (e.g. adding vdevs to a pool, 39 * creating and destroying datasets, reading and writing objects, etc) 40 * we have a simple routine to test that functionality. These 41 * individual routines do not have to do anything "stressful". 42 * 43 * (2) We turn these simple functionality tests into a stress test by 44 * running them all in parallel, with as many threads as desired, 45 * and spread across as many datasets, objects, and vdevs as desired. 46 * 47 * (3) While all this is happening, we inject faults into the pool to 48 * verify that self-healing data really works. 49 * 50 * (4) Every time we open a dataset, we change its checksum and compression 51 * functions. Thus even individual objects vary from block to block 52 * in which checksum they use and whether they're compressed. 53 * 54 * (5) To verify that we never lose on-disk consistency after a crash, 55 * we run the entire test in a child of the main process. 56 * At random times, the child self-immolates with a SIGKILL. 57 * This is the software equivalent of pulling the power cord. 58 * The parent then runs the test again, using the existing 59 * storage pool, as many times as desired. If backwards compatibility 60 * testing is enabled ztest will sometimes run the "older" version 61 * of ztest after a SIGKILL. 62 * 63 * (6) To verify that we don't have future leaks or temporal incursions, 64 * many of the functional tests record the transaction group number 65 * as part of their data. When reading old data, they verify that 66 * the transaction group number is less than the current, open txg. 67 * If you add a new test, please do this if applicable. 68 * 69 * (7) Threads are created with a reduced stack size, for sanity checking. 70 * Therefore, it's important not to allocate huge buffers on the stack. 71 * 72 * When run with no arguments, ztest runs for about five minutes and 73 * produces no output if successful. To get a little bit of information, 74 * specify -V. 
To get more information, specify -VV, and so on. 75 * 76 * To turn this into an overnight stress test, use -T to specify run time. 77 * 78 * You can ask for more vdevs [-v], datasets [-d], or threads [-t] 79 * to increase the pool capacity, fanout, and overall stress level. 80 * 81 * Use the -k option to set the desired frequency of kills. 82 * 83 * When ztest invokes itself it passes all relevant information through a 84 * temporary file which is mmap-ed in the child process. This allows shared 85 * memory to survive the exec syscall. The ztest_shared_hdr_t struct is always 86 * stored at offset 0 of this file and contains information on the size and 87 * number of shared structures in the file. The information stored in this file 88 * must remain backwards compatible with older versions of ztest so that 89 * ztest can invoke them during backwards compatibility testing (-B). 90 */ 91 92 #include <sys/zfs_context.h> 93 #include <sys/spa.h> 94 #include <sys/dmu.h> 95 #include <sys/txg.h> 96 #include <sys/dbuf.h> 97 #include <sys/zap.h> 98 #include <sys/dmu_objset.h> 99 #include <sys/poll.h> 100 #include <sys/stat.h> 101 #include <sys/time.h> 102 #include <sys/wait.h> 103 #include <sys/mman.h> 104 #include <sys/resource.h> 105 #include <sys/zio.h> 106 #include <sys/zil.h> 107 #include <sys/zil_impl.h> 108 #include <sys/vdev_draid.h> 109 #include <sys/vdev_impl.h> 110 #include <sys/vdev_file.h> 111 #include <sys/vdev_initialize.h> 112 #include <sys/vdev_raidz.h> 113 #include <sys/vdev_trim.h> 114 #include <sys/spa_impl.h> 115 #include <sys/metaslab_impl.h> 116 #include <sys/dsl_prop.h> 117 #include <sys/dsl_dataset.h> 118 #include <sys/dsl_destroy.h> 119 #include <sys/dsl_scan.h> 120 #include <sys/zio_checksum.h> 121 #include <sys/zfs_refcount.h> 122 #include <sys/zfeature.h> 123 #include <sys/dsl_userhold.h> 124 #include <sys/abd.h> 125 #include <sys/blake3.h> 126 #include <stdio.h> 127 #include <stdlib.h> 128 #include <unistd.h> 129 #include <getopt.h> 130 #include <signal.h> 131 #include <umem.h> 132 #include <ctype.h> 133 #include <math.h> 134 #include <sys/fs/zfs.h> 135 #include <zfs_fletcher.h> 136 #include <libnvpair.h> 137 #include <libzutil.h> 138 #include <sys/crypto/icp.h> 139 #include <sys/zfs_impl.h> 140 #include <sys/backtrace.h> 141 142 static int ztest_fd_data = -1; 143 static int ztest_fd_rand = -1; 144 145 typedef struct ztest_shared_hdr { 146 uint64_t zh_hdr_size; 147 uint64_t zh_opts_size; 148 uint64_t zh_size; 149 uint64_t zh_stats_size; 150 uint64_t zh_stats_count; 151 uint64_t zh_ds_size; 152 uint64_t zh_ds_count; 153 uint64_t zh_scratch_state_size; 154 } ztest_shared_hdr_t; 155 156 static ztest_shared_hdr_t *ztest_shared_hdr; 157 158 enum ztest_class_state { 159 ZTEST_VDEV_CLASS_OFF, 160 ZTEST_VDEV_CLASS_ON, 161 ZTEST_VDEV_CLASS_RND 162 }; 163 164 /* Dedicated RAIDZ Expansion test states */ 165 typedef enum { 166 RAIDZ_EXPAND_NONE, /* Default is none, must opt-in */ 167 RAIDZ_EXPAND_REQUESTED, /* The '-X' option was used */ 168 RAIDZ_EXPAND_STARTED, /* Testing has commenced */ 169 RAIDZ_EXPAND_KILLED, /* Reached the process kill */ 170 RAIDZ_EXPAND_CHECKED, /* Pool scrub verification done */ 171 } raidz_expand_test_state_t; 172 173 174 #define ZO_GVARS_MAX_ARGLEN ((size_t)64) 175 #define ZO_GVARS_MAX_COUNT ((size_t)10) 176 177 typedef struct ztest_shared_opts { 178 char zo_pool[ZFS_MAX_DATASET_NAME_LEN]; 179 char zo_dir[ZFS_MAX_DATASET_NAME_LEN]; 180 char zo_alt_ztest[MAXNAMELEN]; 181 char zo_alt_libpath[MAXNAMELEN]; 182 uint64_t zo_vdevs; 183 uint64_t zo_vdevtime; 184
size_t zo_vdev_size; 185 int zo_ashift; 186 int zo_mirrors; 187 int zo_raid_do_expand; 188 int zo_raid_children; 189 int zo_raid_parity; 190 char zo_raid_type[8]; 191 int zo_draid_data; 192 int zo_draid_spares; 193 int zo_datasets; 194 int zo_threads; 195 uint64_t zo_passtime; 196 uint64_t zo_killrate; 197 int zo_verbose; 198 int zo_init; 199 uint64_t zo_time; 200 uint64_t zo_maxloops; 201 uint64_t zo_metaslab_force_ganging; 202 raidz_expand_test_state_t zo_raidz_expand_test; 203 int zo_mmp_test; 204 int zo_special_vdevs; 205 int zo_dump_dbgmsg; 206 int zo_gvars_count; 207 char zo_gvars[ZO_GVARS_MAX_COUNT][ZO_GVARS_MAX_ARGLEN]; 208 } ztest_shared_opts_t; 209 210 /* Default values for command line options. */ 211 #define DEFAULT_POOL "ztest" 212 #define DEFAULT_VDEV_DIR "/tmp" 213 #define DEFAULT_VDEV_COUNT 5 214 #define DEFAULT_VDEV_SIZE (SPA_MINDEVSIZE * 4) /* 256m default size */ 215 #define DEFAULT_VDEV_SIZE_STR "256M" 216 #define DEFAULT_ASHIFT SPA_MINBLOCKSHIFT 217 #define DEFAULT_MIRRORS 2 218 #define DEFAULT_RAID_CHILDREN 4 219 #define DEFAULT_RAID_PARITY 1 220 #define DEFAULT_DRAID_DATA 4 221 #define DEFAULT_DRAID_SPARES 1 222 #define DEFAULT_DATASETS_COUNT 7 223 #define DEFAULT_THREADS 23 224 #define DEFAULT_RUN_TIME 300 /* 300 seconds */ 225 #define DEFAULT_RUN_TIME_STR "300 sec" 226 #define DEFAULT_PASS_TIME 60 /* 60 seconds */ 227 #define DEFAULT_PASS_TIME_STR "60 sec" 228 #define DEFAULT_KILL_RATE 70 /* 70% kill rate */ 229 #define DEFAULT_KILLRATE_STR "70%" 230 #define DEFAULT_INITS 1 231 #define DEFAULT_MAX_LOOPS 50 /* 5 minutes */ 232 #define DEFAULT_FORCE_GANGING (64 << 10) 233 #define DEFAULT_FORCE_GANGING_STR "64K" 234 235 /* Simplifying assumption: -1 is not a valid default. */ 236 #define NO_DEFAULT -1 237 238 static const ztest_shared_opts_t ztest_opts_defaults = { 239 .zo_pool = DEFAULT_POOL, 240 .zo_dir = DEFAULT_VDEV_DIR, 241 .zo_alt_ztest = { '\0' }, 242 .zo_alt_libpath = { '\0' }, 243 .zo_vdevs = DEFAULT_VDEV_COUNT, 244 .zo_ashift = DEFAULT_ASHIFT, 245 .zo_mirrors = DEFAULT_MIRRORS, 246 .zo_raid_children = DEFAULT_RAID_CHILDREN, 247 .zo_raid_parity = DEFAULT_RAID_PARITY, 248 .zo_raid_type = VDEV_TYPE_RAIDZ, 249 .zo_vdev_size = DEFAULT_VDEV_SIZE, 250 .zo_draid_data = DEFAULT_DRAID_DATA, /* data drives */ 251 .zo_draid_spares = DEFAULT_DRAID_SPARES, /* distributed spares */ 252 .zo_datasets = DEFAULT_DATASETS_COUNT, 253 .zo_threads = DEFAULT_THREADS, 254 .zo_passtime = DEFAULT_PASS_TIME, 255 .zo_killrate = DEFAULT_KILL_RATE, 256 .zo_verbose = 0, 257 .zo_mmp_test = 0, 258 .zo_init = DEFAULT_INITS, 259 .zo_time = DEFAULT_RUN_TIME, 260 .zo_maxloops = DEFAULT_MAX_LOOPS, /* max loops during spa_freeze() */ 261 .zo_metaslab_force_ganging = DEFAULT_FORCE_GANGING, 262 .zo_special_vdevs = ZTEST_VDEV_CLASS_RND, 263 .zo_gvars_count = 0, 264 .zo_raidz_expand_test = RAIDZ_EXPAND_NONE, 265 }; 266 267 extern uint64_t metaslab_force_ganging; 268 extern uint64_t metaslab_df_alloc_threshold; 269 extern uint64_t zfs_deadman_synctime_ms; 270 extern uint_t metaslab_preload_limit; 271 extern int zfs_compressed_arc_enabled; 272 extern int zfs_abd_scatter_enabled; 273 extern uint_t dmu_object_alloc_chunk_shift; 274 extern boolean_t zfs_force_some_double_word_sm_entries; 275 extern unsigned long zio_decompress_fail_fraction; 276 extern unsigned long zfs_reconstruct_indirect_damage_fraction; 277 extern uint64_t raidz_expand_max_reflow_bytes; 278 extern uint_t raidz_expand_pause_point; 279 280 281 static ztest_shared_opts_t *ztest_shared_opts; 282 static ztest_shared_opts_t ztest_opts; 283 
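/* Raw wrapping key material handed to spa_keystore_load_wkey() by ztest_dmu_objset_own() when an encrypted dataset's key must be loaded. */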
static const char *const ztest_wkeydata = "abcdefghijklmnopqrstuvwxyz012345"; 284 285 typedef struct ztest_shared_ds { 286 uint64_t zd_seq; 287 } ztest_shared_ds_t; 288 289 static ztest_shared_ds_t *ztest_shared_ds; 290 #define ZTEST_GET_SHARED_DS(d) (&ztest_shared_ds[d]) 291 292 typedef struct ztest_scratch_state { 293 uint64_t zs_raidz_scratch_verify_pause; 294 } ztest_shared_scratch_state_t; 295 296 static ztest_shared_scratch_state_t *ztest_scratch_state; 297 298 #define BT_MAGIC 0x123456789abcdefULL 299 #define MAXFAULTS(zs) \ 300 (MAX((zs)->zs_mirrors, 1) * (ztest_opts.zo_raid_parity + 1) - 1) 301 302 enum ztest_io_type { 303 ZTEST_IO_WRITE_TAG, 304 ZTEST_IO_WRITE_PATTERN, 305 ZTEST_IO_WRITE_ZEROES, 306 ZTEST_IO_TRUNCATE, 307 ZTEST_IO_SETATTR, 308 ZTEST_IO_REWRITE, 309 ZTEST_IO_TYPES 310 }; 311 312 typedef struct ztest_block_tag { 313 uint64_t bt_magic; 314 uint64_t bt_objset; 315 uint64_t bt_object; 316 uint64_t bt_dnodesize; 317 uint64_t bt_offset; 318 uint64_t bt_gen; 319 uint64_t bt_txg; 320 uint64_t bt_crtxg; 321 } ztest_block_tag_t; 322 323 typedef struct bufwad { 324 uint64_t bw_index; 325 uint64_t bw_txg; 326 uint64_t bw_data; 327 } bufwad_t; 328 329 /* 330 * It would be better to use a rangelock_t per object. Unfortunately 331 * the rangelock_t is not a drop-in replacement for rl_t, because we 332 * still need to map from object ID to rangelock_t. 333 */ 334 typedef enum { 335 ZTRL_READER, 336 ZTRL_WRITER, 337 ZTRL_APPEND 338 } rl_type_t; 339 340 typedef struct rll { 341 void *rll_writer; 342 int rll_readers; 343 kmutex_t rll_lock; 344 kcondvar_t rll_cv; 345 } rll_t; 346 347 typedef struct rl { 348 uint64_t rl_object; 349 uint64_t rl_offset; 350 uint64_t rl_size; 351 rll_t *rl_lock; 352 } rl_t; 353 354 #define ZTEST_RANGE_LOCKS 64 355 #define ZTEST_OBJECT_LOCKS 64 356 357 /* 358 * Object descriptor. Used as a template for object lookup/create/remove. 359 */ 360 typedef struct ztest_od { 361 uint64_t od_dir; 362 uint64_t od_object; 363 dmu_object_type_t od_type; 364 dmu_object_type_t od_crtype; 365 uint64_t od_blocksize; 366 uint64_t od_crblocksize; 367 uint64_t od_crdnodesize; 368 uint64_t od_gen; 369 uint64_t od_crgen; 370 char od_name[ZFS_MAX_DATASET_NAME_LEN]; 371 } ztest_od_t; 372 373 /* 374 * Per-dataset state. 375 */ 376 typedef struct ztest_ds { 377 ztest_shared_ds_t *zd_shared; 378 objset_t *zd_os; 379 pthread_rwlock_t zd_zilog_lock; 380 zilog_t *zd_zilog; 381 ztest_od_t *zd_od; /* debugging aid */ 382 char zd_name[ZFS_MAX_DATASET_NAME_LEN]; 383 kmutex_t zd_dirobj_lock; 384 rll_t zd_object_lock[ZTEST_OBJECT_LOCKS]; 385 rll_t zd_range_lock[ZTEST_RANGE_LOCKS]; 386 } ztest_ds_t; 387 388 /* 389 * Per-iteration state. 
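* Each test is a ztest_func_t that receives the per-dataset state and an id; tests are registered in ztest_info[] below together with an iteration count (zi_iters) and a calling interval (zi_interval).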
390 */ 391 typedef void ztest_func_t(ztest_ds_t *zd, uint64_t id); 392 393 typedef struct ztest_info { 394 ztest_func_t *zi_func; /* test function */ 395 uint64_t zi_iters; /* iterations per execution */ 396 uint64_t *zi_interval; /* execute every <interval> seconds */ 397 const char *zi_funcname; /* name of test function */ 398 } ztest_info_t; 399 400 typedef struct ztest_shared_callstate { 401 uint64_t zc_count; /* per-pass count */ 402 uint64_t zc_time; /* per-pass time */ 403 uint64_t zc_next; /* next time to call this function */ 404 } ztest_shared_callstate_t; 405 406 static ztest_shared_callstate_t *ztest_shared_callstate; 407 #define ZTEST_GET_SHARED_CALLSTATE(c) (&ztest_shared_callstate[c]) 408 409 ztest_func_t ztest_dmu_read_write; 410 ztest_func_t ztest_dmu_write_parallel; 411 ztest_func_t ztest_dmu_object_alloc_free; 412 ztest_func_t ztest_dmu_object_next_chunk; 413 ztest_func_t ztest_dmu_commit_callbacks; 414 ztest_func_t ztest_zap; 415 ztest_func_t ztest_zap_parallel; 416 ztest_func_t ztest_zil_commit; 417 ztest_func_t ztest_zil_remount; 418 ztest_func_t ztest_dmu_read_write_zcopy; 419 ztest_func_t ztest_dmu_objset_create_destroy; 420 ztest_func_t ztest_dmu_prealloc; 421 ztest_func_t ztest_fzap; 422 ztest_func_t ztest_dmu_snapshot_create_destroy; 423 ztest_func_t ztest_dsl_prop_get_set; 424 ztest_func_t ztest_spa_prop_get_set; 425 ztest_func_t ztest_spa_create_destroy; 426 ztest_func_t ztest_fault_inject; 427 ztest_func_t ztest_dmu_snapshot_hold; 428 ztest_func_t ztest_mmp_enable_disable; 429 ztest_func_t ztest_scrub; 430 ztest_func_t ztest_dsl_dataset_promote_busy; 431 ztest_func_t ztest_vdev_attach_detach; 432 ztest_func_t ztest_vdev_raidz_attach; 433 ztest_func_t ztest_vdev_LUN_growth; 434 ztest_func_t ztest_vdev_add_remove; 435 ztest_func_t ztest_vdev_class_add; 436 ztest_func_t ztest_vdev_aux_add_remove; 437 ztest_func_t ztest_split_pool; 438 ztest_func_t ztest_reguid; 439 ztest_func_t ztest_spa_upgrade; 440 ztest_func_t ztest_device_removal; 441 ztest_func_t ztest_spa_checkpoint_create_discard; 442 ztest_func_t ztest_initialize; 443 ztest_func_t ztest_trim; 444 ztest_func_t ztest_blake3; 445 ztest_func_t ztest_fletcher; 446 ztest_func_t ztest_fletcher_incr; 447 ztest_func_t ztest_verify_dnode_bt; 448 ztest_func_t ztest_pool_prefetch_ddt; 449 450 static uint64_t zopt_always = 0ULL * NANOSEC; /* all the time */ 451 static uint64_t zopt_incessant = 1ULL * NANOSEC / 10; /* every 1/10 second */ 452 static uint64_t zopt_often = 1ULL * NANOSEC; /* every second */ 453 static uint64_t zopt_sometimes = 10ULL * NANOSEC; /* every 10 seconds */ 454 static uint64_t zopt_rarely = 60ULL * NANOSEC; /* every 60 seconds */ 455 456 #define ZTI_INIT(func, iters, interval) \ 457 { .zi_func = (func), \ 458 .zi_iters = (iters), \ 459 .zi_interval = (interval), \ 460 .zi_funcname = # func } 461 462 static ztest_info_t ztest_info[] = { 463 ZTI_INIT(ztest_dmu_read_write, 1, &zopt_always), 464 ZTI_INIT(ztest_dmu_write_parallel, 10, &zopt_always), 465 ZTI_INIT(ztest_dmu_object_alloc_free, 1, &zopt_always), 466 ZTI_INIT(ztest_dmu_object_next_chunk, 1, &zopt_sometimes), 467 ZTI_INIT(ztest_dmu_commit_callbacks, 1, &zopt_always), 468 ZTI_INIT(ztest_zap, 30, &zopt_always), 469 ZTI_INIT(ztest_zap_parallel, 100, &zopt_always), 470 ZTI_INIT(ztest_split_pool, 1, &zopt_sometimes), 471 ZTI_INIT(ztest_zil_commit, 1, &zopt_incessant), 472 ZTI_INIT(ztest_zil_remount, 1, &zopt_sometimes), 473 ZTI_INIT(ztest_dmu_read_write_zcopy, 1, &zopt_often), 474 ZTI_INIT(ztest_dmu_objset_create_destroy, 1, &zopt_often), 475 
ZTI_INIT(ztest_dsl_prop_get_set, 1, &zopt_often), 476 ZTI_INIT(ztest_spa_prop_get_set, 1, &zopt_sometimes), 477 #if 0 478 ZTI_INIT(ztest_dmu_prealloc, 1, &zopt_sometimes), 479 #endif 480 ZTI_INIT(ztest_fzap, 1, &zopt_sometimes), 481 ZTI_INIT(ztest_dmu_snapshot_create_destroy, 1, &zopt_sometimes), 482 ZTI_INIT(ztest_spa_create_destroy, 1, &zopt_sometimes), 483 ZTI_INIT(ztest_fault_inject, 1, &zopt_sometimes), 484 ZTI_INIT(ztest_dmu_snapshot_hold, 1, &zopt_sometimes), 485 ZTI_INIT(ztest_mmp_enable_disable, 1, &zopt_sometimes), 486 ZTI_INIT(ztest_reguid, 1, &zopt_rarely), 487 ZTI_INIT(ztest_scrub, 1, &zopt_rarely), 488 ZTI_INIT(ztest_spa_upgrade, 1, &zopt_rarely), 489 ZTI_INIT(ztest_dsl_dataset_promote_busy, 1, &zopt_rarely), 490 ZTI_INIT(ztest_vdev_attach_detach, 1, &zopt_sometimes), 491 ZTI_INIT(ztest_vdev_raidz_attach, 1, &zopt_sometimes), 492 ZTI_INIT(ztest_vdev_LUN_growth, 1, &zopt_rarely), 493 ZTI_INIT(ztest_vdev_add_remove, 1, &ztest_opts.zo_vdevtime), 494 ZTI_INIT(ztest_vdev_class_add, 1, &ztest_opts.zo_vdevtime), 495 ZTI_INIT(ztest_vdev_aux_add_remove, 1, &ztest_opts.zo_vdevtime), 496 ZTI_INIT(ztest_device_removal, 1, &zopt_sometimes), 497 ZTI_INIT(ztest_spa_checkpoint_create_discard, 1, &zopt_rarely), 498 ZTI_INIT(ztest_initialize, 1, &zopt_sometimes), 499 ZTI_INIT(ztest_trim, 1, &zopt_sometimes), 500 ZTI_INIT(ztest_blake3, 1, &zopt_rarely), 501 ZTI_INIT(ztest_fletcher, 1, &zopt_rarely), 502 ZTI_INIT(ztest_fletcher_incr, 1, &zopt_rarely), 503 ZTI_INIT(ztest_verify_dnode_bt, 1, &zopt_sometimes), 504 ZTI_INIT(ztest_pool_prefetch_ddt, 1, &zopt_rarely), 505 }; 506 507 #define ZTEST_FUNCS (sizeof (ztest_info) / sizeof (ztest_info_t)) 508 509 /* 510 * The following struct is used to hold a list of uncalled commit callbacks. 511 * The callbacks are ordered by txg number. 512 */ 513 typedef struct ztest_cb_list { 514 kmutex_t zcl_callbacks_lock; 515 list_t zcl_callbacks; 516 } ztest_cb_list_t; 517 518 /* 519 * Stuff we need to share writably between parent and child. 520 */ 521 typedef struct ztest_shared { 522 boolean_t zs_do_init; 523 hrtime_t zs_proc_start; 524 hrtime_t zs_proc_stop; 525 hrtime_t zs_thread_start; 526 hrtime_t zs_thread_stop; 527 hrtime_t zs_thread_kill; 528 uint64_t zs_enospc_count; 529 uint64_t zs_vdev_next_leaf; 530 uint64_t zs_vdev_aux; 531 uint64_t zs_alloc; 532 uint64_t zs_space; 533 uint64_t zs_splits; 534 uint64_t zs_mirrors; 535 uint64_t zs_metaslab_sz; 536 uint64_t zs_metaslab_df_alloc_threshold; 537 uint64_t zs_guid; 538 } ztest_shared_t; 539 540 #define ID_PARALLEL -1ULL 541 542 static char ztest_dev_template[] = "%s/%s.%llua"; 543 static char ztest_aux_template[] = "%s/%s.%s.%llu"; 544 static ztest_shared_t *ztest_shared; 545 546 static spa_t *ztest_spa = NULL; 547 static ztest_ds_t *ztest_ds; 548 549 static kmutex_t ztest_vdev_lock; 550 static boolean_t ztest_device_removal_active = B_FALSE; 551 static boolean_t ztest_pool_scrubbed = B_FALSE; 552 static kmutex_t ztest_checkpoint_lock; 553 554 /* 555 * The ztest_name_lock protects the pool and dataset namespace used by 556 * the individual tests. To modify the namespace, consumers must grab 557 * this lock as writer. Grabbing the lock as reader will ensure that the 558 * namespace does not change while the lock is held. 
559 */ 560 static pthread_rwlock_t ztest_name_lock; 561 562 static boolean_t ztest_dump_core = B_TRUE; 563 static boolean_t ztest_exiting; 564 565 /* Global commit callback list */ 566 static ztest_cb_list_t zcl; 567 /* Commit cb delay */ 568 static uint64_t zc_min_txg_delay = UINT64_MAX; 569 static int zc_cb_counter = 0; 570 571 /* 572 * Minimum number of commit callbacks that need to be registered for us to check 573 * whether the minimum txg delay is acceptable. 574 */ 575 #define ZTEST_COMMIT_CB_MIN_REG 100 576 577 /* 578 * If a number of txgs equal to this threshold have been created after a commit 579 * callback has been registered but not called, then we assume there is an 580 * implementation bug. 581 */ 582 #define ZTEST_COMMIT_CB_THRESH (TXG_CONCURRENT_STATES + 1000) 583 584 enum ztest_object { 585 ZTEST_META_DNODE = 0, 586 ZTEST_DIROBJ, 587 ZTEST_OBJECTS 588 }; 589 590 static __attribute__((noreturn)) void usage(boolean_t requested); 591 static int ztest_scrub_impl(spa_t *spa); 592 593 /* 594 * These libumem hooks provide a reasonable set of defaults for the allocator's 595 * debugging facilities. 596 */ 597 const char * 598 _umem_debug_init(void) 599 { 600 return ("default,verbose"); /* $UMEM_DEBUG setting */ 601 } 602 603 const char * 604 _umem_logging_init(void) 605 { 606 return ("fail,contents"); /* $UMEM_LOGGING setting */ 607 } 608 609 static void 610 dump_debug_buffer(void) 611 { 612 ssize_t ret __attribute__((unused)); 613 614 if (!ztest_opts.zo_dump_dbgmsg) 615 return; 616 617 /* 618 * We use write() instead of printf() so that this function 619 * is safe to call from a signal handler. 620 */ 621 ret = write(STDERR_FILENO, "\n", 1); 622 zfs_dbgmsg_print(STDERR_FILENO, "ztest"); 623 } 624 625 static void sig_handler(int signo) 626 { 627 struct sigaction action; 628 629 libspl_backtrace(STDERR_FILENO); 630 dump_debug_buffer(); 631 632 /* 633 * Restore default action and re-raise signal so SIGSEGV and 634 * SIGABRT can trigger a core dump. 635 */ 636 action.sa_handler = SIG_DFL; 637 sigemptyset(&action.sa_mask); 638 action.sa_flags = 0; 639 (void) sigaction(signo, &action, NULL); 640 raise(signo); 641 } 642 643 #define FATAL_MSG_SZ 1024 644 645 static const char *fatal_msg; 646 647 static __attribute__((format(printf, 2, 3))) __attribute__((noreturn)) void 648 fatal(int do_perror, const char *message, ...) 
649 { 650 va_list args; 651 int save_errno = errno; 652 char *buf; 653 654 (void) fflush(stdout); 655 buf = umem_alloc(FATAL_MSG_SZ, UMEM_NOFAIL); 656 if (buf == NULL) 657 goto out; 658 659 va_start(args, message); 660 (void) sprintf(buf, "ztest: "); 661 /* LINTED */ 662 (void) vsprintf(buf + strlen(buf), message, args); 663 va_end(args); 664 if (do_perror) { 665 (void) snprintf(buf + strlen(buf), FATAL_MSG_SZ - strlen(buf), 666 ": %s", strerror(save_errno)); 667 } 668 (void) fprintf(stderr, "%s\n", buf); 669 fatal_msg = buf; /* to ease debugging */ 670 671 out: 672 if (ztest_dump_core) 673 abort(); 674 else 675 dump_debug_buffer(); 676 677 exit(3); 678 } 679 680 static int 681 str2shift(const char *buf) 682 { 683 const char *ends = "BKMGTPEZ"; 684 int i; 685 686 if (buf[0] == '\0') 687 return (0); 688 for (i = 0; i < strlen(ends); i++) { 689 if (toupper(buf[0]) == ends[i]) 690 break; 691 } 692 if (i == strlen(ends)) { 693 (void) fprintf(stderr, "ztest: invalid bytes suffix: %s\n", 694 buf); 695 usage(B_FALSE); 696 } 697 if (buf[1] == '\0' || (toupper(buf[1]) == 'B' && buf[2] == '\0')) { 698 return (10*i); 699 } 700 (void) fprintf(stderr, "ztest: invalid bytes suffix: %s\n", buf); 701 usage(B_FALSE); 702 } 703 704 static uint64_t 705 nicenumtoull(const char *buf) 706 { 707 char *end; 708 uint64_t val; 709 710 val = strtoull(buf, &end, 0); 711 if (end == buf) { 712 (void) fprintf(stderr, "ztest: bad numeric value: %s\n", buf); 713 usage(B_FALSE); 714 } else if (end[0] == '.') { 715 double fval = strtod(buf, &end); 716 fval *= pow(2, str2shift(end)); 717 /* 718 * UINT64_MAX is not exactly representable as a double. 719 * The closest representation is UINT64_MAX + 1, so we 720 * use a >= comparison instead of > for the bounds check. 721 */ 722 if (fval >= (double)UINT64_MAX) { 723 (void) fprintf(stderr, "ztest: value too large: %s\n", 724 buf); 725 usage(B_FALSE); 726 } 727 val = (uint64_t)fval; 728 } else { 729 int shift = str2shift(end); 730 if (shift >= 64 || (val << shift) >> shift != val) { 731 (void) fprintf(stderr, "ztest: value too large: %s\n", 732 buf); 733 usage(B_FALSE); 734 } 735 val <<= shift; 736 } 737 return (val); 738 } 739 740 typedef struct ztest_option { 741 const char short_opt; 742 const char *long_opt; 743 const char *long_opt_param; 744 const char *comment; 745 unsigned int default_int; 746 const char *default_str; 747 } ztest_option_t; 748 749 /* 750 * The following option_table is used for generating the usage info as well as 751 * the long and short option information for calling getopt_long(). 
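* Each entry supplies the short option character, the long option name, an optional parameter name (options with a parameter take a required argument), a help comment, and the default value reported by usage().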
752 */ 753 static ztest_option_t option_table[] = { 754 { 'v', "vdevs", "INTEGER", "Number of vdevs", DEFAULT_VDEV_COUNT, 755 NULL}, 756 { 's', "vdev-size", "INTEGER", "Size of each vdev", 757 NO_DEFAULT, DEFAULT_VDEV_SIZE_STR}, 758 { 'a', "alignment-shift", "INTEGER", 759 "Alignment shift; use 0 for random", DEFAULT_ASHIFT, NULL}, 760 { 'm', "mirror-copies", "INTEGER", "Number of mirror copies", 761 DEFAULT_MIRRORS, NULL}, 762 { 'r', "raid-disks", "INTEGER", "Number of raidz/draid disks", 763 DEFAULT_RAID_CHILDREN, NULL}, 764 { 'R', "raid-parity", "INTEGER", "Raid parity", 765 DEFAULT_RAID_PARITY, NULL}, 766 { 'K', "raid-kind", "raidz|eraidz|draid|random", "Raid kind", 767 NO_DEFAULT, "random"}, 768 { 'D', "draid-data", "INTEGER", "Number of draid data drives", 769 DEFAULT_DRAID_DATA, NULL}, 770 { 'S', "draid-spares", "INTEGER", "Number of draid spares", 771 DEFAULT_DRAID_SPARES, NULL}, 772 { 'd', "datasets", "INTEGER", "Number of datasets", 773 DEFAULT_DATASETS_COUNT, NULL}, 774 { 't', "threads", "INTEGER", "Number of ztest threads", 775 DEFAULT_THREADS, NULL}, 776 { 'g', "gang-block-threshold", "INTEGER", 777 "Metaslab gang block threshold", 778 NO_DEFAULT, DEFAULT_FORCE_GANGING_STR}, 779 { 'i', "init-count", "INTEGER", "Number of times to initialize pool", 780 DEFAULT_INITS, NULL}, 781 { 'k', "kill-percentage", "INTEGER", "Kill percentage", 782 NO_DEFAULT, DEFAULT_KILLRATE_STR}, 783 { 'p', "pool-name", "STRING", "Pool name", 784 NO_DEFAULT, DEFAULT_POOL}, 785 { 'f', "vdev-file-directory", "PATH", "File directory for vdev files", 786 NO_DEFAULT, DEFAULT_VDEV_DIR}, 787 { 'M', "multi-host", NULL, 788 "Multi-host; simulate pool imported on remote host", 789 NO_DEFAULT, NULL}, 790 { 'E', "use-existing-pool", NULL, 791 "Use existing pool instead of creating new one", NO_DEFAULT, NULL}, 792 { 'T', "run-time", "INTEGER", "Total run time", 793 NO_DEFAULT, DEFAULT_RUN_TIME_STR}, 794 { 'P', "pass-time", "INTEGER", "Time per pass", 795 NO_DEFAULT, DEFAULT_PASS_TIME_STR}, 796 { 'F', "freeze-loops", "INTEGER", "Max loops in spa_freeze()", 797 DEFAULT_MAX_LOOPS, NULL}, 798 { 'B', "alt-ztest", "PATH", "Alternate ztest path", 799 NO_DEFAULT, NULL}, 800 { 'C', "vdev-class-state", "on|off|random", "vdev class state", 801 NO_DEFAULT, "random"}, 802 { 'X', "raidz-expansion", NULL, 803 "Perform a dedicated raidz expansion test", 804 NO_DEFAULT, NULL}, 805 { 'o', "option", "\"OPTION=INTEGER\"", 806 "Set global variable to an unsigned 32-bit integer value", 807 NO_DEFAULT, NULL}, 808 { 'G', "dump-debug-msg", NULL, 809 "Dump zfs_dbgmsg buffer before exiting due to an error", 810 NO_DEFAULT, NULL}, 811 { 'V', "verbose", NULL, 812 "Verbose (use multiple times for ever more verbosity)", 813 NO_DEFAULT, NULL}, 814 { 'h', "help", NULL, "Show this help", 815 NO_DEFAULT, NULL}, 816 {0, 0, 0, 0, 0, 0} 817 }; 818 819 static struct option *long_opts = NULL; 820 static char *short_opts = NULL; 821 822 static void 823 init_options(void) 824 { 825 ASSERT3P(long_opts, ==, NULL); 826 ASSERT3P(short_opts, ==, NULL); 827 828 int count = sizeof (option_table) / sizeof (option_table[0]); 829 long_opts = umem_alloc(sizeof (struct option) * count, UMEM_NOFAIL); 830 831 short_opts = umem_alloc(sizeof (char) * 2 * count, UMEM_NOFAIL); 832 int short_opt_index = 0; 833 834 for (int i = 0; i < count; i++) { 835 long_opts[i].val = option_table[i].short_opt; 836 long_opts[i].name = option_table[i].long_opt; 837 long_opts[i].has_arg = option_table[i].long_opt_param != NULL 838 ? 
required_argument : no_argument; 839 long_opts[i].flag = NULL; 840 short_opts[short_opt_index++] = option_table[i].short_opt; 841 if (option_table[i].long_opt_param != NULL) { 842 short_opts[short_opt_index++] = ':'; 843 } 844 } 845 } 846 847 static void 848 fini_options(void) 849 { 850 int count = sizeof (option_table) / sizeof (option_table[0]); 851 852 umem_free(long_opts, sizeof (struct option) * count); 853 umem_free(short_opts, sizeof (char) * 2 * count); 854 855 long_opts = NULL; 856 short_opts = NULL; 857 } 858 859 static __attribute__((noreturn)) void 860 usage(boolean_t requested) 861 { 862 char option[80]; 863 FILE *fp = requested ? stdout : stderr; 864 865 (void) fprintf(fp, "Usage: %s [OPTIONS...]\n", DEFAULT_POOL); 866 for (int i = 0; option_table[i].short_opt != 0; i++) { 867 if (option_table[i].long_opt_param != NULL) { 868 (void) sprintf(option, " -%c --%s=%s", 869 option_table[i].short_opt, 870 option_table[i].long_opt, 871 option_table[i].long_opt_param); 872 } else { 873 (void) sprintf(option, " -%c --%s", 874 option_table[i].short_opt, 875 option_table[i].long_opt); 876 } 877 (void) fprintf(fp, " %-43s%s", option, 878 option_table[i].comment); 879 880 if (option_table[i].long_opt_param != NULL) { 881 if (option_table[i].default_str != NULL) { 882 (void) fprintf(fp, " (default: %s)", 883 option_table[i].default_str); 884 } else if (option_table[i].default_int != NO_DEFAULT) { 885 (void) fprintf(fp, " (default: %u)", 886 option_table[i].default_int); 887 } 888 } 889 (void) fprintf(fp, "\n"); 890 } 891 exit(requested ? 0 : 1); 892 } 893 894 static uint64_t 895 ztest_random(uint64_t range) 896 { 897 uint64_t r; 898 899 ASSERT3S(ztest_fd_rand, >=, 0); 900 901 if (range == 0) 902 return (0); 903 904 if (read(ztest_fd_rand, &r, sizeof (r)) != sizeof (r)) 905 fatal(B_TRUE, "short read from /dev/urandom"); 906 907 return (r % range); 908 } 909 910 static void 911 ztest_parse_name_value(const char *input, ztest_shared_opts_t *zo) 912 { 913 char name[32]; 914 char *value; 915 int state = ZTEST_VDEV_CLASS_RND; 916 917 (void) strlcpy(name, input, sizeof (name)); 918 919 value = strchr(name, '='); 920 if (value == NULL) { 921 (void) fprintf(stderr, "missing value in property=value " 922 "'-C' argument (%s)\n", input); 923 usage(B_FALSE); 924 } 925 *(value) = '\0'; 926 value++; 927 928 if (strcmp(value, "on") == 0) { 929 state = ZTEST_VDEV_CLASS_ON; 930 } else if (strcmp(value, "off") == 0) { 931 state = ZTEST_VDEV_CLASS_OFF; 932 } else if (strcmp(value, "random") == 0) { 933 state = ZTEST_VDEV_CLASS_RND; 934 } else { 935 (void) fprintf(stderr, "invalid property value '%s'\n", value); 936 usage(B_FALSE); 937 } 938 939 if (strcmp(name, "special") == 0) { 940 zo->zo_special_vdevs = state; 941 } else { 942 (void) fprintf(stderr, "invalid property name '%s'\n", name); 943 usage(B_FALSE); 944 } 945 if (zo->zo_verbose >= 3) 946 (void) printf("%s vdev state is '%s'\n", name, value); 947 } 948 949 static void 950 process_options(int argc, char **argv) 951 { 952 char *path; 953 ztest_shared_opts_t *zo = &ztest_opts; 954 955 int opt; 956 uint64_t value; 957 const char *raid_kind = "random"; 958 959 memcpy(zo, &ztest_opts_defaults, sizeof (*zo)); 960 961 init_options(); 962 963 while ((opt = getopt_long(argc, argv, short_opts, long_opts, 964 NULL)) != EOF) { 965 value = 0; 966 switch (opt) { 967 case 'v': 968 case 's': 969 case 'a': 970 case 'm': 971 case 'r': 972 case 'R': 973 case 'D': 974 case 'S': 975 case 'd': 976 case 't': 977 case 'g': 978 case 'i': 979 case 'k': 980 case 'T': 981 case 
'P': 982 case 'F': 983 value = nicenumtoull(optarg); 984 } 985 switch (opt) { 986 case 'v': 987 zo->zo_vdevs = value; 988 break; 989 case 's': 990 zo->zo_vdev_size = MAX(SPA_MINDEVSIZE, value); 991 break; 992 case 'a': 993 zo->zo_ashift = value; 994 break; 995 case 'm': 996 zo->zo_mirrors = value; 997 break; 998 case 'r': 999 zo->zo_raid_children = MAX(1, value); 1000 break; 1001 case 'R': 1002 zo->zo_raid_parity = MIN(MAX(value, 1), 3); 1003 break; 1004 case 'K': 1005 raid_kind = optarg; 1006 break; 1007 case 'D': 1008 zo->zo_draid_data = MAX(1, value); 1009 break; 1010 case 'S': 1011 zo->zo_draid_spares = MAX(1, value); 1012 break; 1013 case 'd': 1014 zo->zo_datasets = MAX(1, value); 1015 break; 1016 case 't': 1017 zo->zo_threads = MAX(1, value); 1018 break; 1019 case 'g': 1020 zo->zo_metaslab_force_ganging = 1021 MAX(SPA_MINBLOCKSIZE << 1, value); 1022 break; 1023 case 'i': 1024 zo->zo_init = value; 1025 break; 1026 case 'k': 1027 zo->zo_killrate = value; 1028 break; 1029 case 'p': 1030 (void) strlcpy(zo->zo_pool, optarg, 1031 sizeof (zo->zo_pool)); 1032 break; 1033 case 'f': 1034 path = realpath(optarg, NULL); 1035 if (path == NULL) { 1036 (void) fprintf(stderr, "error: %s: %s\n", 1037 optarg, strerror(errno)); 1038 usage(B_FALSE); 1039 } else { 1040 (void) strlcpy(zo->zo_dir, path, 1041 sizeof (zo->zo_dir)); 1042 free(path); 1043 } 1044 break; 1045 case 'M': 1046 zo->zo_mmp_test = 1; 1047 break; 1048 case 'V': 1049 zo->zo_verbose++; 1050 break; 1051 case 'X': 1052 zo->zo_raidz_expand_test = RAIDZ_EXPAND_REQUESTED; 1053 break; 1054 case 'E': 1055 zo->zo_init = 0; 1056 break; 1057 case 'T': 1058 zo->zo_time = value; 1059 break; 1060 case 'P': 1061 zo->zo_passtime = MAX(1, value); 1062 break; 1063 case 'F': 1064 zo->zo_maxloops = MAX(1, value); 1065 break; 1066 case 'B': 1067 (void) strlcpy(zo->zo_alt_ztest, optarg, 1068 sizeof (zo->zo_alt_ztest)); 1069 break; 1070 case 'C': 1071 ztest_parse_name_value(optarg, zo); 1072 break; 1073 case 'o': 1074 if (zo->zo_gvars_count >= ZO_GVARS_MAX_COUNT) { 1075 (void) fprintf(stderr, 1076 "max global var count (%zu) exceeded\n", 1077 ZO_GVARS_MAX_COUNT); 1078 usage(B_FALSE); 1079 } 1080 char *v = zo->zo_gvars[zo->zo_gvars_count]; 1081 if (strlcpy(v, optarg, ZO_GVARS_MAX_ARGLEN) >= 1082 ZO_GVARS_MAX_ARGLEN) { 1083 (void) fprintf(stderr, 1084 "global var option '%s' is too long\n", 1085 optarg); 1086 usage(B_FALSE); 1087 } 1088 zo->zo_gvars_count++; 1089 break; 1090 case 'G': 1091 zo->zo_dump_dbgmsg = 1; 1092 break; 1093 case 'h': 1094 usage(B_TRUE); 1095 break; 1096 case '?': 1097 default: 1098 usage(B_FALSE); 1099 break; 1100 } 1101 } 1102 1103 fini_options(); 1104 1105 /* Force compatible options for raidz expansion run */ 1106 if (zo->zo_raidz_expand_test == RAIDZ_EXPAND_REQUESTED) { 1107 zo->zo_mmp_test = 0; 1108 zo->zo_mirrors = 0; 1109 zo->zo_vdevs = 1; 1110 zo->zo_vdev_size = DEFAULT_VDEV_SIZE * 2; 1111 zo->zo_raid_do_expand = B_FALSE; 1112 raid_kind = "raidz"; 1113 } 1114 1115 if (strcmp(raid_kind, "random") == 0) { 1116 switch (ztest_random(3)) { 1117 case 0: 1118 raid_kind = "raidz"; 1119 break; 1120 case 1: 1121 raid_kind = "eraidz"; 1122 break; 1123 case 2: 1124 raid_kind = "draid"; 1125 break; 1126 } 1127 1128 if (ztest_opts.zo_verbose >= 3) 1129 (void) printf("choosing RAID type '%s'\n", raid_kind); 1130 } 1131 1132 if (strcmp(raid_kind, "draid") == 0) { 1133 uint64_t min_devsize; 1134 1135 /* With fewer disk use 256M, otherwise 128M is OK */ 1136 min_devsize = (ztest_opts.zo_raid_children < 16) ? 
1137 (256ULL << 20) : (128ULL << 20); 1138 1139 /* No top-level mirrors with dRAID for now */ 1140 zo->zo_mirrors = 0; 1141 1142 /* Use more appropriate defaults for dRAID */ 1143 if (zo->zo_vdevs == ztest_opts_defaults.zo_vdevs) 1144 zo->zo_vdevs = 1; 1145 if (zo->zo_raid_children == 1146 ztest_opts_defaults.zo_raid_children) 1147 zo->zo_raid_children = 16; 1148 if (zo->zo_ashift < 12) 1149 zo->zo_ashift = 12; 1150 if (zo->zo_vdev_size < min_devsize) 1151 zo->zo_vdev_size = min_devsize; 1152 1153 if (zo->zo_draid_data + zo->zo_raid_parity > 1154 zo->zo_raid_children - zo->zo_draid_spares) { 1155 (void) fprintf(stderr, "error: too few draid " 1156 "children (%d) for stripe width (%d)\n", 1157 zo->zo_raid_children, 1158 zo->zo_draid_data + zo->zo_raid_parity); 1159 usage(B_FALSE); 1160 } 1161 1162 (void) strlcpy(zo->zo_raid_type, VDEV_TYPE_DRAID, 1163 sizeof (zo->zo_raid_type)); 1164 1165 } else if (strcmp(raid_kind, "eraidz") == 0) { 1166 /* using eraidz (expandable raidz) */ 1167 zo->zo_raid_do_expand = B_TRUE; 1168 1169 /* tests expect top-level to be raidz */ 1170 zo->zo_mirrors = 0; 1171 zo->zo_vdevs = 1; 1172 1173 /* Make sure parity is less than data columns */ 1174 zo->zo_raid_parity = MIN(zo->zo_raid_parity, 1175 zo->zo_raid_children - 1); 1176 1177 } else /* using raidz */ { 1178 ASSERT0(strcmp(raid_kind, "raidz")); 1179 1180 zo->zo_raid_parity = MIN(zo->zo_raid_parity, 1181 zo->zo_raid_children - 1); 1182 } 1183 1184 zo->zo_vdevtime = 1185 (zo->zo_vdevs > 0 ? zo->zo_time * NANOSEC / zo->zo_vdevs : 1186 UINT64_MAX >> 2); 1187 1188 if (*zo->zo_alt_ztest) { 1189 const char *invalid_what = "ztest"; 1190 char *val = zo->zo_alt_ztest; 1191 if (0 != access(val, X_OK) || 1192 (strrchr(val, '/') == NULL && (errno == EINVAL))) 1193 goto invalid; 1194 1195 int dirlen = strrchr(val, '/') - val; 1196 strlcpy(zo->zo_alt_libpath, val, 1197 MIN(sizeof (zo->zo_alt_libpath), dirlen + 1)); 1198 invalid_what = "library path", val = zo->zo_alt_libpath; 1199 if (strrchr(val, '/') == NULL && (errno == EINVAL)) 1200 goto invalid; 1201 *strrchr(val, '/') = '\0'; 1202 strlcat(val, "/lib", sizeof (zo->zo_alt_libpath)); 1203 1204 if (0 != access(zo->zo_alt_libpath, X_OK)) 1205 goto invalid; 1206 return; 1207 1208 invalid: 1209 ztest_dump_core = B_FALSE; 1210 fatal(B_TRUE, "invalid alternate %s %s", invalid_what, val); 1211 } 1212 } 1213 1214 static void 1215 ztest_kill(ztest_shared_t *zs) 1216 { 1217 zs->zs_alloc = metaslab_class_get_alloc(spa_normal_class(ztest_spa)); 1218 zs->zs_space = metaslab_class_get_space(spa_normal_class(ztest_spa)); 1219 1220 /* 1221 * Before we kill ourselves, make sure that the config is updated. 1222 * See comment above spa_write_cachefile(). 1223 */ 1224 if (raidz_expand_pause_point != RAIDZ_EXPAND_PAUSE_NONE) { 1225 if (mutex_tryenter(&spa_namespace_lock)) { 1226 spa_write_cachefile(ztest_spa, B_FALSE, B_FALSE, 1227 B_FALSE); 1228 mutex_exit(&spa_namespace_lock); 1229 1230 ztest_scratch_state->zs_raidz_scratch_verify_pause = 1231 raidz_expand_pause_point; 1232 } else { 1233 /* 1234 * Do not verify the scratch object if spa_namespace_lock 1235 * cannot be acquired, since doing so can cause a deadlock 1236 * in spa_config_update().
1237 */ 1238 raidz_expand_pause_point = RAIDZ_EXPAND_PAUSE_NONE; 1239 1240 return; 1241 } 1242 } else { 1243 mutex_enter(&spa_namespace_lock); 1244 spa_write_cachefile(ztest_spa, B_FALSE, B_FALSE, B_FALSE); 1245 mutex_exit(&spa_namespace_lock); 1246 } 1247 1248 (void) raise(SIGKILL); 1249 } 1250 1251 static void 1252 ztest_record_enospc(const char *s) 1253 { 1254 (void) s; 1255 ztest_shared->zs_enospc_count++; 1256 } 1257 1258 static uint64_t 1259 ztest_get_ashift(void) 1260 { 1261 if (ztest_opts.zo_ashift == 0) 1262 return (SPA_MINBLOCKSHIFT + ztest_random(5)); 1263 return (ztest_opts.zo_ashift); 1264 } 1265 1266 static boolean_t 1267 ztest_is_draid_spare(const char *name) 1268 { 1269 uint64_t spare_id = 0, parity = 0, vdev_id = 0; 1270 1271 if (sscanf(name, VDEV_TYPE_DRAID "%"PRIu64"-%"PRIu64"-%"PRIu64"", 1272 &parity, &vdev_id, &spare_id) == 3) { 1273 return (B_TRUE); 1274 } 1275 1276 return (B_FALSE); 1277 } 1278 1279 static nvlist_t * 1280 make_vdev_file(const char *path, const char *aux, const char *pool, 1281 size_t size, uint64_t ashift) 1282 { 1283 char *pathbuf = NULL; 1284 uint64_t vdev; 1285 nvlist_t *file; 1286 boolean_t draid_spare = B_FALSE; 1287 1288 1289 if (ashift == 0) 1290 ashift = ztest_get_ashift(); 1291 1292 if (path == NULL) { 1293 pathbuf = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); 1294 path = pathbuf; 1295 1296 if (aux != NULL) { 1297 vdev = ztest_shared->zs_vdev_aux; 1298 (void) snprintf(pathbuf, MAXPATHLEN, 1299 ztest_aux_template, ztest_opts.zo_dir, 1300 pool == NULL ? ztest_opts.zo_pool : pool, 1301 aux, vdev); 1302 } else { 1303 vdev = ztest_shared->zs_vdev_next_leaf++; 1304 (void) snprintf(pathbuf, MAXPATHLEN, 1305 ztest_dev_template, ztest_opts.zo_dir, 1306 pool == NULL ? ztest_opts.zo_pool : pool, vdev); 1307 } 1308 } else { 1309 draid_spare = ztest_is_draid_spare(path); 1310 } 1311 1312 if (size != 0 && !draid_spare) { 1313 int fd = open(path, O_RDWR | O_CREAT | O_TRUNC, 0666); 1314 if (fd == -1) 1315 fatal(B_TRUE, "can't open %s", path); 1316 if (ftruncate(fd, size) != 0) 1317 fatal(B_TRUE, "can't ftruncate %s", path); 1318 (void) close(fd); 1319 } 1320 1321 file = fnvlist_alloc(); 1322 fnvlist_add_string(file, ZPOOL_CONFIG_TYPE, 1323 draid_spare ? 
VDEV_TYPE_DRAID_SPARE : VDEV_TYPE_FILE); 1324 fnvlist_add_string(file, ZPOOL_CONFIG_PATH, path); 1325 fnvlist_add_uint64(file, ZPOOL_CONFIG_ASHIFT, ashift); 1326 umem_free(pathbuf, MAXPATHLEN); 1327 1328 return (file); 1329 } 1330 1331 static nvlist_t * 1332 make_vdev_raid(const char *path, const char *aux, const char *pool, size_t size, 1333 uint64_t ashift, int r) 1334 { 1335 nvlist_t *raid, **child; 1336 int c; 1337 1338 if (r < 2) 1339 return (make_vdev_file(path, aux, pool, size, ashift)); 1340 child = umem_alloc(r * sizeof (nvlist_t *), UMEM_NOFAIL); 1341 1342 for (c = 0; c < r; c++) 1343 child[c] = make_vdev_file(path, aux, pool, size, ashift); 1344 1345 raid = fnvlist_alloc(); 1346 fnvlist_add_string(raid, ZPOOL_CONFIG_TYPE, 1347 ztest_opts.zo_raid_type); 1348 fnvlist_add_uint64(raid, ZPOOL_CONFIG_NPARITY, 1349 ztest_opts.zo_raid_parity); 1350 fnvlist_add_nvlist_array(raid, ZPOOL_CONFIG_CHILDREN, 1351 (const nvlist_t **)child, r); 1352 1353 if (strcmp(ztest_opts.zo_raid_type, VDEV_TYPE_DRAID) == 0) { 1354 uint64_t ndata = ztest_opts.zo_draid_data; 1355 uint64_t nparity = ztest_opts.zo_raid_parity; 1356 uint64_t nspares = ztest_opts.zo_draid_spares; 1357 uint64_t children = ztest_opts.zo_raid_children; 1358 uint64_t ngroups = 1; 1359 1360 /* 1361 * Calculate the minimum number of groups required to fill a 1362 * slice. This is the LCM of the stripe width (data + parity) 1363 * and the number of data drives (children - spares). 1364 */ 1365 while (ngroups * (ndata + nparity) % (children - nspares) != 0) 1366 ngroups++; 1367 1368 /* Store the basic dRAID configuration. */ 1369 fnvlist_add_uint64(raid, ZPOOL_CONFIG_DRAID_NDATA, ndata); 1370 fnvlist_add_uint64(raid, ZPOOL_CONFIG_DRAID_NSPARES, nspares); 1371 fnvlist_add_uint64(raid, ZPOOL_CONFIG_DRAID_NGROUPS, ngroups); 1372 } 1373 1374 for (c = 0; c < r; c++) 1375 fnvlist_free(child[c]); 1376 1377 umem_free(child, r * sizeof (nvlist_t *)); 1378 1379 return (raid); 1380 } 1381 1382 static nvlist_t * 1383 make_vdev_mirror(const char *path, const char *aux, const char *pool, 1384 size_t size, uint64_t ashift, int r, int m) 1385 { 1386 nvlist_t *mirror, **child; 1387 int c; 1388 1389 if (m < 1) 1390 return (make_vdev_raid(path, aux, pool, size, ashift, r)); 1391 1392 child = umem_alloc(m * sizeof (nvlist_t *), UMEM_NOFAIL); 1393 1394 for (c = 0; c < m; c++) 1395 child[c] = make_vdev_raid(path, aux, pool, size, ashift, r); 1396 1397 mirror = fnvlist_alloc(); 1398 fnvlist_add_string(mirror, ZPOOL_CONFIG_TYPE, VDEV_TYPE_MIRROR); 1399 fnvlist_add_nvlist_array(mirror, ZPOOL_CONFIG_CHILDREN, 1400 (const nvlist_t **)child, m); 1401 1402 for (c = 0; c < m; c++) 1403 fnvlist_free(child[c]); 1404 1405 umem_free(child, m * sizeof (nvlist_t *)); 1406 1407 return (mirror); 1408 } 1409 1410 static nvlist_t * 1411 make_vdev_root(const char *path, const char *aux, const char *pool, size_t size, 1412 uint64_t ashift, const char *class, int r, int m, int t) 1413 { 1414 nvlist_t *root, **child; 1415 int c; 1416 boolean_t log; 1417 1418 ASSERT3S(t, >, 0); 1419 1420 log = (class != NULL && strcmp(class, "log") == 0); 1421 1422 child = umem_alloc(t * sizeof (nvlist_t *), UMEM_NOFAIL); 1423 1424 for (c = 0; c < t; c++) { 1425 child[c] = make_vdev_mirror(path, aux, pool, size, ashift, 1426 r, m); 1427 fnvlist_add_uint64(child[c], ZPOOL_CONFIG_IS_LOG, log); 1428 1429 if (class != NULL && class[0] != '\0') { 1430 ASSERT(m > 1 || log); /* expecting a mirror */ 1431 fnvlist_add_string(child[c], 1432 ZPOOL_CONFIG_ALLOCATION_BIAS, class); 1433 } 1434 } 1435 1436 root = 
fnvlist_alloc(); 1437 fnvlist_add_string(root, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT); 1438 fnvlist_add_nvlist_array(root, aux ? aux : ZPOOL_CONFIG_CHILDREN, 1439 (const nvlist_t **)child, t); 1440 1441 for (c = 0; c < t; c++) 1442 fnvlist_free(child[c]); 1443 1444 umem_free(child, t * sizeof (nvlist_t *)); 1445 1446 return (root); 1447 } 1448 1449 /* 1450 * Find a random spa version. Returns back a random spa version in the 1451 * range [initial_version, SPA_VERSION_FEATURES]. 1452 */ 1453 static uint64_t 1454 ztest_random_spa_version(uint64_t initial_version) 1455 { 1456 uint64_t version = initial_version; 1457 1458 if (version <= SPA_VERSION_BEFORE_FEATURES) { 1459 version = version + 1460 ztest_random(SPA_VERSION_BEFORE_FEATURES - version + 1); 1461 } 1462 1463 if (version > SPA_VERSION_BEFORE_FEATURES) 1464 version = SPA_VERSION_FEATURES; 1465 1466 ASSERT(SPA_VERSION_IS_SUPPORTED(version)); 1467 return (version); 1468 } 1469 1470 static int 1471 ztest_random_blocksize(void) 1472 { 1473 ASSERT3U(ztest_spa->spa_max_ashift, !=, 0); 1474 1475 /* 1476 * Choose a block size >= the ashift. 1477 * If the SPA supports new MAXBLOCKSIZE, test up to 1MB blocks. 1478 */ 1479 int maxbs = SPA_OLD_MAXBLOCKSHIFT; 1480 if (spa_maxblocksize(ztest_spa) == SPA_MAXBLOCKSIZE) 1481 maxbs = 20; 1482 uint64_t block_shift = 1483 ztest_random(maxbs - ztest_spa->spa_max_ashift + 1); 1484 return (1 << (SPA_MINBLOCKSHIFT + block_shift)); 1485 } 1486 1487 static int 1488 ztest_random_dnodesize(void) 1489 { 1490 int slots; 1491 int max_slots = spa_maxdnodesize(ztest_spa) >> DNODE_SHIFT; 1492 1493 if (max_slots == DNODE_MIN_SLOTS) 1494 return (DNODE_MIN_SIZE); 1495 1496 /* 1497 * Weight the random distribution more heavily toward smaller 1498 * dnode sizes since that is more likely to reflect real-world 1499 * usage. 1500 */ 1501 ASSERT3U(max_slots, >, 4); 1502 switch (ztest_random(10)) { 1503 case 0: 1504 slots = 5 + ztest_random(max_slots - 4); 1505 break; 1506 case 1 ... 4: 1507 slots = 2 + ztest_random(3); 1508 break; 1509 default: 1510 slots = 1; 1511 break; 1512 } 1513 1514 return (slots << DNODE_SHIFT); 1515 } 1516 1517 static int 1518 ztest_random_ibshift(void) 1519 { 1520 return (DN_MIN_INDBLKSHIFT + 1521 ztest_random(DN_MAX_INDBLKSHIFT - DN_MIN_INDBLKSHIFT + 1)); 1522 } 1523 1524 static uint64_t 1525 ztest_random_vdev_top(spa_t *spa, boolean_t log_ok) 1526 { 1527 uint64_t top; 1528 vdev_t *rvd = spa->spa_root_vdev; 1529 vdev_t *tvd; 1530 1531 ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); 1532 1533 do { 1534 top = ztest_random(rvd->vdev_children); 1535 tvd = rvd->vdev_child[top]; 1536 } while (!vdev_is_concrete(tvd) || (tvd->vdev_islog && !log_ok) || 1537 tvd->vdev_mg == NULL || tvd->vdev_mg->mg_class == NULL); 1538 1539 return (top); 1540 } 1541 1542 static uint64_t 1543 ztest_random_dsl_prop(zfs_prop_t prop) 1544 { 1545 uint64_t value; 1546 1547 do { 1548 value = zfs_prop_random_value(prop, ztest_random(-1ULL)); 1549 } while (prop == ZFS_PROP_CHECKSUM && value == ZIO_CHECKSUM_OFF); 1550 1551 return (value); 1552 } 1553 1554 static int 1555 ztest_dsl_prop_set_uint64(char *osname, zfs_prop_t prop, uint64_t value, 1556 boolean_t inherit) 1557 { 1558 const char *propname = zfs_prop_to_name(prop); 1559 const char *valname; 1560 char *setpoint; 1561 uint64_t curval; 1562 int error; 1563 1564 error = dsl_prop_set_int(osname, propname, 1565 (inherit ? 
ZPROP_SRC_NONE : ZPROP_SRC_LOCAL), value); 1566 1567 if (error == ENOSPC) { 1568 ztest_record_enospc(FTAG); 1569 return (error); 1570 } 1571 ASSERT0(error); 1572 1573 setpoint = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); 1574 VERIFY0(dsl_prop_get_integer(osname, propname, &curval, setpoint)); 1575 1576 if (ztest_opts.zo_verbose >= 6) { 1577 int err; 1578 1579 err = zfs_prop_index_to_string(prop, curval, &valname); 1580 if (err) 1581 (void) printf("%s %s = %llu at '%s'\n", osname, 1582 propname, (unsigned long long)curval, setpoint); 1583 else 1584 (void) printf("%s %s = %s at '%s'\n", 1585 osname, propname, valname, setpoint); 1586 } 1587 umem_free(setpoint, MAXPATHLEN); 1588 1589 return (error); 1590 } 1591 1592 static int 1593 ztest_spa_prop_set_uint64(zpool_prop_t prop, uint64_t value) 1594 { 1595 spa_t *spa = ztest_spa; 1596 nvlist_t *props = NULL; 1597 int error; 1598 1599 props = fnvlist_alloc(); 1600 fnvlist_add_uint64(props, zpool_prop_to_name(prop), value); 1601 1602 error = spa_prop_set(spa, props); 1603 1604 fnvlist_free(props); 1605 1606 if (error == ENOSPC) { 1607 ztest_record_enospc(FTAG); 1608 return (error); 1609 } 1610 ASSERT0(error); 1611 1612 return (error); 1613 } 1614 1615 static int 1616 ztest_dmu_objset_own(const char *name, dmu_objset_type_t type, 1617 boolean_t readonly, boolean_t decrypt, const void *tag, objset_t **osp) 1618 { 1619 int err; 1620 char *cp = NULL; 1621 char ddname[ZFS_MAX_DATASET_NAME_LEN]; 1622 1623 strlcpy(ddname, name, sizeof (ddname)); 1624 cp = strchr(ddname, '@'); 1625 if (cp != NULL) 1626 *cp = '\0'; 1627 1628 err = dmu_objset_own(name, type, readonly, decrypt, tag, osp); 1629 while (decrypt && err == EACCES) { 1630 dsl_crypto_params_t *dcp; 1631 nvlist_t *crypto_args = fnvlist_alloc(); 1632 1633 fnvlist_add_uint8_array(crypto_args, "wkeydata", 1634 (uint8_t *)ztest_wkeydata, WRAPPING_KEY_LEN); 1635 VERIFY0(dsl_crypto_params_create_nvlist(DCP_CMD_NONE, NULL, 1636 crypto_args, &dcp)); 1637 err = spa_keystore_load_wkey(ddname, dcp, B_FALSE); 1638 /* 1639 * Note: if there was an error loading, the wkey was not 1640 * consumed, and needs to be freed. 1641 */ 1642 dsl_crypto_params_free(dcp, (err != 0)); 1643 fnvlist_free(crypto_args); 1644 1645 if (err == EINVAL) { 1646 /* 1647 * We couldn't load a key for this dataset so try 1648 * the parent. This loop will eventually hit the 1649 * encryption root since ztest only makes clones 1650 * as children of their origin datasets. 
1651 */ 1652 cp = strrchr(ddname, '/'); 1653 if (cp == NULL) 1654 return (err); 1655 1656 *cp = '\0'; 1657 err = EACCES; 1658 continue; 1659 } else if (err != 0) { 1660 break; 1661 } 1662 1663 err = dmu_objset_own(name, type, readonly, decrypt, tag, osp); 1664 break; 1665 } 1666 1667 return (err); 1668 } 1669 1670 static void 1671 ztest_rll_init(rll_t *rll) 1672 { 1673 rll->rll_writer = NULL; 1674 rll->rll_readers = 0; 1675 mutex_init(&rll->rll_lock, NULL, MUTEX_DEFAULT, NULL); 1676 cv_init(&rll->rll_cv, NULL, CV_DEFAULT, NULL); 1677 } 1678 1679 static void 1680 ztest_rll_destroy(rll_t *rll) 1681 { 1682 ASSERT3P(rll->rll_writer, ==, NULL); 1683 ASSERT0(rll->rll_readers); 1684 mutex_destroy(&rll->rll_lock); 1685 cv_destroy(&rll->rll_cv); 1686 } 1687 1688 static void 1689 ztest_rll_lock(rll_t *rll, rl_type_t type) 1690 { 1691 mutex_enter(&rll->rll_lock); 1692 1693 if (type == ZTRL_READER) { 1694 while (rll->rll_writer != NULL) 1695 (void) cv_wait(&rll->rll_cv, &rll->rll_lock); 1696 rll->rll_readers++; 1697 } else { 1698 while (rll->rll_writer != NULL || rll->rll_readers) 1699 (void) cv_wait(&rll->rll_cv, &rll->rll_lock); 1700 rll->rll_writer = curthread; 1701 } 1702 1703 mutex_exit(&rll->rll_lock); 1704 } 1705 1706 static void 1707 ztest_rll_unlock(rll_t *rll) 1708 { 1709 mutex_enter(&rll->rll_lock); 1710 1711 if (rll->rll_writer) { 1712 ASSERT0(rll->rll_readers); 1713 rll->rll_writer = NULL; 1714 } else { 1715 ASSERT3S(rll->rll_readers, >, 0); 1716 ASSERT3P(rll->rll_writer, ==, NULL); 1717 rll->rll_readers--; 1718 } 1719 1720 if (rll->rll_writer == NULL && rll->rll_readers == 0) 1721 cv_broadcast(&rll->rll_cv); 1722 1723 mutex_exit(&rll->rll_lock); 1724 } 1725 1726 static void 1727 ztest_object_lock(ztest_ds_t *zd, uint64_t object, rl_type_t type) 1728 { 1729 rll_t *rll = &zd->zd_object_lock[object & (ZTEST_OBJECT_LOCKS - 1)]; 1730 1731 ztest_rll_lock(rll, type); 1732 } 1733 1734 static void 1735 ztest_object_unlock(ztest_ds_t *zd, uint64_t object) 1736 { 1737 rll_t *rll = &zd->zd_object_lock[object & (ZTEST_OBJECT_LOCKS - 1)]; 1738 1739 ztest_rll_unlock(rll); 1740 } 1741 1742 static rl_t * 1743 ztest_range_lock(ztest_ds_t *zd, uint64_t object, uint64_t offset, 1744 uint64_t size, rl_type_t type) 1745 { 1746 uint64_t hash = object ^ (offset % (ZTEST_RANGE_LOCKS + 1)); 1747 rll_t *rll = &zd->zd_range_lock[hash & (ZTEST_RANGE_LOCKS - 1)]; 1748 rl_t *rl; 1749 1750 rl = umem_alloc(sizeof (*rl), UMEM_NOFAIL); 1751 rl->rl_object = object; 1752 rl->rl_offset = offset; 1753 rl->rl_size = size; 1754 rl->rl_lock = rll; 1755 1756 ztest_rll_lock(rll, type); 1757 1758 return (rl); 1759 } 1760 1761 static void 1762 ztest_range_unlock(rl_t *rl) 1763 { 1764 rll_t *rll = rl->rl_lock; 1765 1766 ztest_rll_unlock(rll); 1767 1768 umem_free(rl, sizeof (*rl)); 1769 } 1770 1771 static void 1772 ztest_zd_init(ztest_ds_t *zd, ztest_shared_ds_t *szd, objset_t *os) 1773 { 1774 zd->zd_os = os; 1775 zd->zd_zilog = dmu_objset_zil(os); 1776 zd->zd_shared = szd; 1777 dmu_objset_name(os, zd->zd_name); 1778 int l; 1779 1780 if (zd->zd_shared != NULL) 1781 zd->zd_shared->zd_seq = 0; 1782 1783 VERIFY0(pthread_rwlock_init(&zd->zd_zilog_lock, NULL)); 1784 mutex_init(&zd->zd_dirobj_lock, NULL, MUTEX_DEFAULT, NULL); 1785 1786 for (l = 0; l < ZTEST_OBJECT_LOCKS; l++) 1787 ztest_rll_init(&zd->zd_object_lock[l]); 1788 1789 for (l = 0; l < ZTEST_RANGE_LOCKS; l++) 1790 ztest_rll_init(&zd->zd_range_lock[l]); 1791 } 1792 1793 static void 1794 ztest_zd_fini(ztest_ds_t *zd) 1795 { 1796 int l; 1797 1798 mutex_destroy(&zd->zd_dirobj_lock); 
1799 (void) pthread_rwlock_destroy(&zd->zd_zilog_lock); 1800 1801 for (l = 0; l < ZTEST_OBJECT_LOCKS; l++) 1802 ztest_rll_destroy(&zd->zd_object_lock[l]); 1803 1804 for (l = 0; l < ZTEST_RANGE_LOCKS; l++) 1805 ztest_rll_destroy(&zd->zd_range_lock[l]); 1806 } 1807 1808 #define TXG_MIGHTWAIT (ztest_random(10) == 0 ? TXG_NOWAIT : TXG_WAIT) 1809 1810 static uint64_t 1811 ztest_tx_assign(dmu_tx_t *tx, uint64_t txg_how, const char *tag) 1812 { 1813 uint64_t txg; 1814 int error; 1815 1816 /* 1817 * Attempt to assign tx to some transaction group. 1818 */ 1819 error = dmu_tx_assign(tx, txg_how); 1820 if (error) { 1821 if (error == ERESTART) { 1822 ASSERT3U(txg_how, ==, TXG_NOWAIT); 1823 dmu_tx_wait(tx); 1824 } else { 1825 ASSERT3U(error, ==, ENOSPC); 1826 ztest_record_enospc(tag); 1827 } 1828 dmu_tx_abort(tx); 1829 return (0); 1830 } 1831 txg = dmu_tx_get_txg(tx); 1832 ASSERT3U(txg, !=, 0); 1833 return (txg); 1834 } 1835 1836 static void 1837 ztest_bt_generate(ztest_block_tag_t *bt, objset_t *os, uint64_t object, 1838 uint64_t dnodesize, uint64_t offset, uint64_t gen, uint64_t txg, 1839 uint64_t crtxg) 1840 { 1841 bt->bt_magic = BT_MAGIC; 1842 bt->bt_objset = dmu_objset_id(os); 1843 bt->bt_object = object; 1844 bt->bt_dnodesize = dnodesize; 1845 bt->bt_offset = offset; 1846 bt->bt_gen = gen; 1847 bt->bt_txg = txg; 1848 bt->bt_crtxg = crtxg; 1849 } 1850 1851 static void 1852 ztest_bt_verify(ztest_block_tag_t *bt, objset_t *os, uint64_t object, 1853 uint64_t dnodesize, uint64_t offset, uint64_t gen, uint64_t txg, 1854 uint64_t crtxg) 1855 { 1856 ASSERT3U(bt->bt_magic, ==, BT_MAGIC); 1857 ASSERT3U(bt->bt_objset, ==, dmu_objset_id(os)); 1858 ASSERT3U(bt->bt_object, ==, object); 1859 ASSERT3U(bt->bt_dnodesize, ==, dnodesize); 1860 ASSERT3U(bt->bt_offset, ==, offset); 1861 ASSERT3U(bt->bt_gen, <=, gen); 1862 ASSERT3U(bt->bt_txg, <=, txg); 1863 ASSERT3U(bt->bt_crtxg, ==, crtxg); 1864 } 1865 1866 static ztest_block_tag_t * 1867 ztest_bt_bonus(dmu_buf_t *db) 1868 { 1869 dmu_object_info_t doi; 1870 ztest_block_tag_t *bt; 1871 1872 dmu_object_info_from_db(db, &doi); 1873 ASSERT3U(doi.doi_bonus_size, <=, db->db_size); 1874 ASSERT3U(doi.doi_bonus_size, >=, sizeof (*bt)); 1875 bt = (void *)((char *)db->db_data + doi.doi_bonus_size - sizeof (*bt)); 1876 1877 return (bt); 1878 } 1879 1880 /* 1881 * Generate a token to fill up unused bonus buffer space. Try to make 1882 * it unique to the object, generation, and offset to verify that data 1883 * is not getting overwritten by data from other dnodes. 1884 */ 1885 #define ZTEST_BONUS_FILL_TOKEN(obj, ds, gen, offset) \ 1886 (((ds) << 48) | ((gen) << 32) | ((obj) << 8) | (offset)) 1887 1888 /* 1889 * Fill up the unused bonus buffer region before the block tag with a 1890 * verifiable pattern. Filling the whole bonus area with non-zero data 1891 * helps ensure that all dnode traversal code properly skips the 1892 * interior regions of large dnodes. 1893 */ 1894 static void 1895 ztest_fill_unused_bonus(dmu_buf_t *db, void *end, uint64_t obj, 1896 objset_t *os, uint64_t gen) 1897 { 1898 uint64_t *bonusp; 1899 1900 ASSERT(IS_P2ALIGNED((char *)end - (char *)db->db_data, 8)); 1901 1902 for (bonusp = db->db_data; bonusp < (uint64_t *)end; bonusp++) { 1903 uint64_t token = ZTEST_BONUS_FILL_TOKEN(obj, dmu_objset_id(os), 1904 gen, bonusp - (uint64_t *)db->db_data); 1905 *bonusp = token; 1906 } 1907 } 1908 1909 /* 1910 * Verify that the unused area of a bonus buffer is filled with the 1911 * expected tokens. 
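* Each token is produced by ZTEST_BONUS_FILL_TOKEN() and encodes the dataset id, object generation, object number, and word offset of the slot being checked.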
1912 */ 1913 static void 1914 ztest_verify_unused_bonus(dmu_buf_t *db, void *end, uint64_t obj, 1915 objset_t *os, uint64_t gen) 1916 { 1917 uint64_t *bonusp; 1918 1919 for (bonusp = db->db_data; bonusp < (uint64_t *)end; bonusp++) { 1920 uint64_t token = ZTEST_BONUS_FILL_TOKEN(obj, dmu_objset_id(os), 1921 gen, bonusp - (uint64_t *)db->db_data); 1922 VERIFY3U(*bonusp, ==, token); 1923 } 1924 } 1925 1926 /* 1927 * ZIL logging ops 1928 */ 1929 1930 #define lrz_type lr_mode 1931 #define lrz_blocksize lr_uid 1932 #define lrz_ibshift lr_gid 1933 #define lrz_bonustype lr_rdev 1934 #define lrz_dnodesize lr_crtime[1] 1935 1936 static void 1937 ztest_log_create(ztest_ds_t *zd, dmu_tx_t *tx, lr_create_t *lr) 1938 { 1939 char *name = (void *)(lr + 1); /* name follows lr */ 1940 size_t namesize = strlen(name) + 1; 1941 itx_t *itx; 1942 1943 if (zil_replaying(zd->zd_zilog, tx)) 1944 return; 1945 1946 itx = zil_itx_create(TX_CREATE, sizeof (*lr) + namesize); 1947 memcpy(&itx->itx_lr + 1, &lr->lr_common + 1, 1948 sizeof (*lr) + namesize - sizeof (lr_t)); 1949 1950 zil_itx_assign(zd->zd_zilog, itx, tx); 1951 } 1952 1953 static void 1954 ztest_log_remove(ztest_ds_t *zd, dmu_tx_t *tx, lr_remove_t *lr, uint64_t object) 1955 { 1956 char *name = (void *)(lr + 1); /* name follows lr */ 1957 size_t namesize = strlen(name) + 1; 1958 itx_t *itx; 1959 1960 if (zil_replaying(zd->zd_zilog, tx)) 1961 return; 1962 1963 itx = zil_itx_create(TX_REMOVE, sizeof (*lr) + namesize); 1964 memcpy(&itx->itx_lr + 1, &lr->lr_common + 1, 1965 sizeof (*lr) + namesize - sizeof (lr_t)); 1966 1967 itx->itx_oid = object; 1968 zil_itx_assign(zd->zd_zilog, itx, tx); 1969 } 1970 1971 static void 1972 ztest_log_write(ztest_ds_t *zd, dmu_tx_t *tx, lr_write_t *lr) 1973 { 1974 itx_t *itx; 1975 itx_wr_state_t write_state = ztest_random(WR_NUM_STATES); 1976 1977 if (zil_replaying(zd->zd_zilog, tx)) 1978 return; 1979 1980 if (lr->lr_length > zil_max_log_data(zd->zd_zilog, sizeof (lr_write_t))) 1981 write_state = WR_INDIRECT; 1982 1983 itx = zil_itx_create(TX_WRITE, 1984 sizeof (*lr) + (write_state == WR_COPIED ? 
lr->lr_length : 0)); 1985 1986 if (write_state == WR_COPIED && 1987 dmu_read(zd->zd_os, lr->lr_foid, lr->lr_offset, lr->lr_length, 1988 ((lr_write_t *)&itx->itx_lr) + 1, DMU_READ_NO_PREFETCH) != 0) { 1989 zil_itx_destroy(itx); 1990 itx = zil_itx_create(TX_WRITE, sizeof (*lr)); 1991 write_state = WR_NEED_COPY; 1992 } 1993 itx->itx_private = zd; 1994 itx->itx_wr_state = write_state; 1995 itx->itx_sync = (ztest_random(8) == 0); 1996 1997 memcpy(&itx->itx_lr + 1, &lr->lr_common + 1, 1998 sizeof (*lr) - sizeof (lr_t)); 1999 2000 zil_itx_assign(zd->zd_zilog, itx, tx); 2001 } 2002 2003 static void 2004 ztest_log_truncate(ztest_ds_t *zd, dmu_tx_t *tx, lr_truncate_t *lr) 2005 { 2006 itx_t *itx; 2007 2008 if (zil_replaying(zd->zd_zilog, tx)) 2009 return; 2010 2011 itx = zil_itx_create(TX_TRUNCATE, sizeof (*lr)); 2012 memcpy(&itx->itx_lr + 1, &lr->lr_common + 1, 2013 sizeof (*lr) - sizeof (lr_t)); 2014 2015 itx->itx_sync = B_FALSE; 2016 zil_itx_assign(zd->zd_zilog, itx, tx); 2017 } 2018 2019 static void 2020 ztest_log_setattr(ztest_ds_t *zd, dmu_tx_t *tx, lr_setattr_t *lr) 2021 { 2022 itx_t *itx; 2023 2024 if (zil_replaying(zd->zd_zilog, tx)) 2025 return; 2026 2027 itx = zil_itx_create(TX_SETATTR, sizeof (*lr)); 2028 memcpy(&itx->itx_lr + 1, &lr->lr_common + 1, 2029 sizeof (*lr) - sizeof (lr_t)); 2030 2031 itx->itx_sync = B_FALSE; 2032 zil_itx_assign(zd->zd_zilog, itx, tx); 2033 } 2034 2035 /* 2036 * ZIL replay ops 2037 */ 2038 static int 2039 ztest_replay_create(void *arg1, void *arg2, boolean_t byteswap) 2040 { 2041 ztest_ds_t *zd = arg1; 2042 lr_create_t *lr = arg2; 2043 char *name = (void *)(lr + 1); /* name follows lr */ 2044 objset_t *os = zd->zd_os; 2045 ztest_block_tag_t *bbt; 2046 dmu_buf_t *db; 2047 dmu_tx_t *tx; 2048 uint64_t txg; 2049 int error = 0; 2050 int bonuslen; 2051 2052 if (byteswap) 2053 byteswap_uint64_array(lr, sizeof (*lr)); 2054 2055 ASSERT3U(lr->lr_doid, ==, ZTEST_DIROBJ); 2056 ASSERT3S(name[0], !=, '\0'); 2057 2058 tx = dmu_tx_create(os); 2059 2060 dmu_tx_hold_zap(tx, lr->lr_doid, B_TRUE, name); 2061 2062 if (lr->lrz_type == DMU_OT_ZAP_OTHER) { 2063 dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL); 2064 } else { 2065 dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); 2066 } 2067 2068 txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); 2069 if (txg == 0) 2070 return (ENOSPC); 2071 2072 ASSERT3U(dmu_objset_zil(os)->zl_replay, ==, !!lr->lr_foid); 2073 bonuslen = DN_BONUS_SIZE(lr->lrz_dnodesize); 2074 2075 if (lr->lrz_type == DMU_OT_ZAP_OTHER) { 2076 if (lr->lr_foid == 0) { 2077 lr->lr_foid = zap_create_dnsize(os, 2078 lr->lrz_type, lr->lrz_bonustype, 2079 bonuslen, lr->lrz_dnodesize, tx); 2080 } else { 2081 error = zap_create_claim_dnsize(os, lr->lr_foid, 2082 lr->lrz_type, lr->lrz_bonustype, 2083 bonuslen, lr->lrz_dnodesize, tx); 2084 } 2085 } else { 2086 if (lr->lr_foid == 0) { 2087 lr->lr_foid = dmu_object_alloc_dnsize(os, 2088 lr->lrz_type, 0, lr->lrz_bonustype, 2089 bonuslen, lr->lrz_dnodesize, tx); 2090 } else { 2091 error = dmu_object_claim_dnsize(os, lr->lr_foid, 2092 lr->lrz_type, 0, lr->lrz_bonustype, 2093 bonuslen, lr->lrz_dnodesize, tx); 2094 } 2095 } 2096 2097 if (error) { 2098 ASSERT3U(error, ==, EEXIST); 2099 ASSERT(zd->zd_zilog->zl_replay); 2100 dmu_tx_commit(tx); 2101 return (error); 2102 } 2103 2104 ASSERT3U(lr->lr_foid, !=, 0); 2105 2106 if (lr->lrz_type != DMU_OT_ZAP_OTHER) 2107 VERIFY0(dmu_object_set_blocksize(os, lr->lr_foid, 2108 lr->lrz_blocksize, lr->lrz_ibshift, tx)); 2109 2110 VERIFY0(dmu_bonus_hold(os, lr->lr_foid, FTAG, &db)); 2111 bbt = ztest_bt_bonus(db); 2112 
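/*
 * Stamp the new object's bonus buffer with a block tag and fill the
 * remaining bonus space with the verifiable token pattern.
 */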
dmu_buf_will_dirty(db, tx); 2113 ztest_bt_generate(bbt, os, lr->lr_foid, lr->lrz_dnodesize, -1ULL, 2114 lr->lr_gen, txg, txg); 2115 ztest_fill_unused_bonus(db, bbt, lr->lr_foid, os, lr->lr_gen); 2116 dmu_buf_rele(db, FTAG); 2117 2118 VERIFY0(zap_add(os, lr->lr_doid, name, sizeof (uint64_t), 1, 2119 &lr->lr_foid, tx)); 2120 2121 (void) ztest_log_create(zd, tx, lr); 2122 2123 dmu_tx_commit(tx); 2124 2125 return (0); 2126 } 2127 2128 static int 2129 ztest_replay_remove(void *arg1, void *arg2, boolean_t byteswap) 2130 { 2131 ztest_ds_t *zd = arg1; 2132 lr_remove_t *lr = arg2; 2133 char *name = (void *)(lr + 1); /* name follows lr */ 2134 objset_t *os = zd->zd_os; 2135 dmu_object_info_t doi; 2136 dmu_tx_t *tx; 2137 uint64_t object, txg; 2138 2139 if (byteswap) 2140 byteswap_uint64_array(lr, sizeof (*lr)); 2141 2142 ASSERT3U(lr->lr_doid, ==, ZTEST_DIROBJ); 2143 ASSERT3S(name[0], !=, '\0'); 2144 2145 VERIFY0( 2146 zap_lookup(os, lr->lr_doid, name, sizeof (object), 1, &object)); 2147 ASSERT3U(object, !=, 0); 2148 2149 ztest_object_lock(zd, object, ZTRL_WRITER); 2150 2151 VERIFY0(dmu_object_info(os, object, &doi)); 2152 2153 tx = dmu_tx_create(os); 2154 2155 dmu_tx_hold_zap(tx, lr->lr_doid, B_FALSE, name); 2156 dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END); 2157 2158 txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); 2159 if (txg == 0) { 2160 ztest_object_unlock(zd, object); 2161 return (ENOSPC); 2162 } 2163 2164 if (doi.doi_type == DMU_OT_ZAP_OTHER) { 2165 VERIFY0(zap_destroy(os, object, tx)); 2166 } else { 2167 VERIFY0(dmu_object_free(os, object, tx)); 2168 } 2169 2170 VERIFY0(zap_remove(os, lr->lr_doid, name, tx)); 2171 2172 (void) ztest_log_remove(zd, tx, lr, object); 2173 2174 dmu_tx_commit(tx); 2175 2176 ztest_object_unlock(zd, object); 2177 2178 return (0); 2179 } 2180 2181 static int 2182 ztest_replay_write(void *arg1, void *arg2, boolean_t byteswap) 2183 { 2184 ztest_ds_t *zd = arg1; 2185 lr_write_t *lr = arg2; 2186 objset_t *os = zd->zd_os; 2187 void *data = lr + 1; /* data follows lr */ 2188 uint64_t offset, length; 2189 ztest_block_tag_t *bt = data; 2190 ztest_block_tag_t *bbt; 2191 uint64_t gen, txg, lrtxg, crtxg; 2192 dmu_object_info_t doi; 2193 dmu_tx_t *tx; 2194 dmu_buf_t *db; 2195 arc_buf_t *abuf = NULL; 2196 rl_t *rl; 2197 2198 if (byteswap) 2199 byteswap_uint64_array(lr, sizeof (*lr)); 2200 2201 offset = lr->lr_offset; 2202 length = lr->lr_length; 2203 2204 /* If it's a dmu_sync() block, write the whole block */ 2205 if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) { 2206 uint64_t blocksize = BP_GET_LSIZE(&lr->lr_blkptr); 2207 if (length < blocksize) { 2208 offset -= offset % blocksize; 2209 length = blocksize; 2210 } 2211 } 2212 2213 if (bt->bt_magic == BSWAP_64(BT_MAGIC)) 2214 byteswap_uint64_array(bt, sizeof (*bt)); 2215 2216 if (bt->bt_magic != BT_MAGIC) 2217 bt = NULL; 2218 2219 ztest_object_lock(zd, lr->lr_foid, ZTRL_READER); 2220 rl = ztest_range_lock(zd, lr->lr_foid, offset, length, ZTRL_WRITER); 2221 2222 VERIFY0(dmu_bonus_hold(os, lr->lr_foid, FTAG, &db)); 2223 2224 dmu_object_info_from_db(db, &doi); 2225 2226 bbt = ztest_bt_bonus(db); 2227 ASSERT3U(bbt->bt_magic, ==, BT_MAGIC); 2228 gen = bbt->bt_gen; 2229 crtxg = bbt->bt_crtxg; 2230 lrtxg = lr->lr_common.lrc_txg; 2231 2232 tx = dmu_tx_create(os); 2233 2234 dmu_tx_hold_write(tx, lr->lr_foid, offset, length); 2235 2236 if (ztest_random(8) == 0 && length == doi.doi_data_block_size && 2237 P2PHASE(offset, length) == 0) 2238 abuf = dmu_request_arcbuf(db, length); 2239 2240 txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); 2241 if 
(txg == 0) { 2242 if (abuf != NULL) 2243 dmu_return_arcbuf(abuf); 2244 dmu_buf_rele(db, FTAG); 2245 ztest_range_unlock(rl); 2246 ztest_object_unlock(zd, lr->lr_foid); 2247 return (ENOSPC); 2248 } 2249 2250 if (bt != NULL) { 2251 /* 2252 * Usually, verify the old data before writing new data -- 2253 * but not always, because we also want to verify correct 2254 * behavior when the data was not recently read into cache. 2255 */ 2256 ASSERT(doi.doi_data_block_size); 2257 ASSERT0(offset % doi.doi_data_block_size); 2258 if (ztest_random(4) != 0) { 2259 int prefetch = ztest_random(2) ? 2260 DMU_READ_PREFETCH : DMU_READ_NO_PREFETCH; 2261 ztest_block_tag_t rbt; 2262 2263 VERIFY(dmu_read(os, lr->lr_foid, offset, 2264 sizeof (rbt), &rbt, prefetch) == 0); 2265 if (rbt.bt_magic == BT_MAGIC) { 2266 ztest_bt_verify(&rbt, os, lr->lr_foid, 0, 2267 offset, gen, txg, crtxg); 2268 } 2269 } 2270 2271 /* 2272 * Writes can appear to be newer than the bonus buffer because 2273 * the ztest_get_data() callback does a dmu_read() of the 2274 * open-context data, which may be different than the data 2275 * as it was when the write was generated. 2276 */ 2277 if (zd->zd_zilog->zl_replay) { 2278 ztest_bt_verify(bt, os, lr->lr_foid, 0, offset, 2279 MAX(gen, bt->bt_gen), MAX(txg, lrtxg), 2280 bt->bt_crtxg); 2281 } 2282 2283 /* 2284 * Set the bt's gen/txg to the bonus buffer's gen/txg 2285 * so that all of the usual ASSERTs will work. 2286 */ 2287 ztest_bt_generate(bt, os, lr->lr_foid, 0, offset, gen, txg, 2288 crtxg); 2289 } 2290 2291 if (abuf == NULL) { 2292 dmu_write(os, lr->lr_foid, offset, length, data, tx); 2293 } else { 2294 memcpy(abuf->b_data, data, length); 2295 VERIFY0(dmu_assign_arcbuf_by_dbuf(db, offset, abuf, tx)); 2296 } 2297 2298 (void) ztest_log_write(zd, tx, lr); 2299 2300 dmu_buf_rele(db, FTAG); 2301 2302 dmu_tx_commit(tx); 2303 2304 ztest_range_unlock(rl); 2305 ztest_object_unlock(zd, lr->lr_foid); 2306 2307 return (0); 2308 } 2309 2310 static int 2311 ztest_replay_truncate(void *arg1, void *arg2, boolean_t byteswap) 2312 { 2313 ztest_ds_t *zd = arg1; 2314 lr_truncate_t *lr = arg2; 2315 objset_t *os = zd->zd_os; 2316 dmu_tx_t *tx; 2317 uint64_t txg; 2318 rl_t *rl; 2319 2320 if (byteswap) 2321 byteswap_uint64_array(lr, sizeof (*lr)); 2322 2323 ztest_object_lock(zd, lr->lr_foid, ZTRL_READER); 2324 rl = ztest_range_lock(zd, lr->lr_foid, lr->lr_offset, lr->lr_length, 2325 ZTRL_WRITER); 2326 2327 tx = dmu_tx_create(os); 2328 2329 dmu_tx_hold_free(tx, lr->lr_foid, lr->lr_offset, lr->lr_length); 2330 2331 txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); 2332 if (txg == 0) { 2333 ztest_range_unlock(rl); 2334 ztest_object_unlock(zd, lr->lr_foid); 2335 return (ENOSPC); 2336 } 2337 2338 VERIFY0(dmu_free_range(os, lr->lr_foid, lr->lr_offset, 2339 lr->lr_length, tx)); 2340 2341 (void) ztest_log_truncate(zd, tx, lr); 2342 2343 dmu_tx_commit(tx); 2344 2345 ztest_range_unlock(rl); 2346 ztest_object_unlock(zd, lr->lr_foid); 2347 2348 return (0); 2349 } 2350 2351 static int 2352 ztest_replay_setattr(void *arg1, void *arg2, boolean_t byteswap) 2353 { 2354 ztest_ds_t *zd = arg1; 2355 lr_setattr_t *lr = arg2; 2356 objset_t *os = zd->zd_os; 2357 dmu_tx_t *tx; 2358 dmu_buf_t *db; 2359 ztest_block_tag_t *bbt; 2360 uint64_t txg, lrtxg, crtxg, dnodesize; 2361 2362 if (byteswap) 2363 byteswap_uint64_array(lr, sizeof (*lr)); 2364 2365 ztest_object_lock(zd, lr->lr_foid, ZTRL_WRITER); 2366 2367 VERIFY0(dmu_bonus_hold(os, lr->lr_foid, FTAG, &db)); 2368 2369 tx = dmu_tx_create(os); 2370 dmu_tx_hold_bonus(tx, lr->lr_foid); 2371 2372 txg = 
ztest_tx_assign(tx, TXG_WAIT, FTAG); 2373 if (txg == 0) { 2374 dmu_buf_rele(db, FTAG); 2375 ztest_object_unlock(zd, lr->lr_foid); 2376 return (ENOSPC); 2377 } 2378 2379 bbt = ztest_bt_bonus(db); 2380 ASSERT3U(bbt->bt_magic, ==, BT_MAGIC); 2381 crtxg = bbt->bt_crtxg; 2382 lrtxg = lr->lr_common.lrc_txg; 2383 dnodesize = bbt->bt_dnodesize; 2384 2385 if (zd->zd_zilog->zl_replay) { 2386 ASSERT3U(lr->lr_size, !=, 0); 2387 ASSERT3U(lr->lr_mode, !=, 0); 2388 ASSERT3U(lrtxg, !=, 0); 2389 } else { 2390 /* 2391 * Randomly change the size and increment the generation. 2392 */ 2393 lr->lr_size = (ztest_random(db->db_size / sizeof (*bbt)) + 1) * 2394 sizeof (*bbt); 2395 lr->lr_mode = bbt->bt_gen + 1; 2396 ASSERT0(lrtxg); 2397 } 2398 2399 /* 2400 * Verify that the current bonus buffer is not newer than our txg. 2401 */ 2402 ztest_bt_verify(bbt, os, lr->lr_foid, dnodesize, -1ULL, lr->lr_mode, 2403 MAX(txg, lrtxg), crtxg); 2404 2405 dmu_buf_will_dirty(db, tx); 2406 2407 ASSERT3U(lr->lr_size, >=, sizeof (*bbt)); 2408 ASSERT3U(lr->lr_size, <=, db->db_size); 2409 VERIFY0(dmu_set_bonus(db, lr->lr_size, tx)); 2410 bbt = ztest_bt_bonus(db); 2411 2412 ztest_bt_generate(bbt, os, lr->lr_foid, dnodesize, -1ULL, lr->lr_mode, 2413 txg, crtxg); 2414 ztest_fill_unused_bonus(db, bbt, lr->lr_foid, os, bbt->bt_gen); 2415 dmu_buf_rele(db, FTAG); 2416 2417 (void) ztest_log_setattr(zd, tx, lr); 2418 2419 dmu_tx_commit(tx); 2420 2421 ztest_object_unlock(zd, lr->lr_foid); 2422 2423 return (0); 2424 } 2425 2426 static zil_replay_func_t *ztest_replay_vector[TX_MAX_TYPE] = { 2427 NULL, /* 0 no such transaction type */ 2428 ztest_replay_create, /* TX_CREATE */ 2429 NULL, /* TX_MKDIR */ 2430 NULL, /* TX_MKXATTR */ 2431 NULL, /* TX_SYMLINK */ 2432 ztest_replay_remove, /* TX_REMOVE */ 2433 NULL, /* TX_RMDIR */ 2434 NULL, /* TX_LINK */ 2435 NULL, /* TX_RENAME */ 2436 ztest_replay_write, /* TX_WRITE */ 2437 ztest_replay_truncate, /* TX_TRUNCATE */ 2438 ztest_replay_setattr, /* TX_SETATTR */ 2439 NULL, /* TX_ACL */ 2440 NULL, /* TX_CREATE_ACL */ 2441 NULL, /* TX_CREATE_ATTR */ 2442 NULL, /* TX_CREATE_ACL_ATTR */ 2443 NULL, /* TX_MKDIR_ACL */ 2444 NULL, /* TX_MKDIR_ATTR */ 2445 NULL, /* TX_MKDIR_ACL_ATTR */ 2446 NULL, /* TX_WRITE2 */ 2447 NULL, /* TX_SETSAXATTR */ 2448 NULL, /* TX_RENAME_EXCHANGE */ 2449 NULL, /* TX_RENAME_WHITEOUT */ 2450 }; 2451 2452 /* 2453 * ZIL get_data callbacks 2454 */ 2455 2456 static void 2457 ztest_get_done(zgd_t *zgd, int error) 2458 { 2459 (void) error; 2460 ztest_ds_t *zd = zgd->zgd_private; 2461 uint64_t object = ((rl_t *)zgd->zgd_lr)->rl_object; 2462 2463 if (zgd->zgd_db) 2464 dmu_buf_rele(zgd->zgd_db, zgd); 2465 2466 ztest_range_unlock((rl_t *)zgd->zgd_lr); 2467 ztest_object_unlock(zd, object); 2468 2469 umem_free(zgd, sizeof (*zgd)); 2470 } 2471 2472 static int 2473 ztest_get_data(void *arg, uint64_t arg2, lr_write_t *lr, char *buf, 2474 struct lwb *lwb, zio_t *zio) 2475 { 2476 (void) arg2; 2477 ztest_ds_t *zd = arg; 2478 objset_t *os = zd->zd_os; 2479 uint64_t object = lr->lr_foid; 2480 uint64_t offset = lr->lr_offset; 2481 uint64_t size = lr->lr_length; 2482 uint64_t txg = lr->lr_common.lrc_txg; 2483 uint64_t crtxg; 2484 dmu_object_info_t doi; 2485 dmu_buf_t *db; 2486 zgd_t *zgd; 2487 int error; 2488 2489 ASSERT3P(lwb, !=, NULL); 2490 ASSERT3U(size, !=, 0); 2491 2492 ztest_object_lock(zd, object, ZTRL_READER); 2493 error = dmu_bonus_hold(os, object, FTAG, &db); 2494 if (error) { 2495 ztest_object_unlock(zd, object); 2496 return (error); 2497 } 2498 2499 crtxg = ztest_bt_bonus(db)->bt_crtxg; 2500 2501 if 
(crtxg == 0 || crtxg > txg) { 2502 dmu_buf_rele(db, FTAG); 2503 ztest_object_unlock(zd, object); 2504 return (ENOENT); 2505 } 2506 2507 dmu_object_info_from_db(db, &doi); 2508 dmu_buf_rele(db, FTAG); 2509 db = NULL; 2510 2511 zgd = umem_zalloc(sizeof (*zgd), UMEM_NOFAIL); 2512 zgd->zgd_lwb = lwb; 2513 zgd->zgd_private = zd; 2514 2515 if (buf != NULL) { /* immediate write */ 2516 zgd->zgd_lr = (struct zfs_locked_range *)ztest_range_lock(zd, 2517 object, offset, size, ZTRL_READER); 2518 2519 error = dmu_read(os, object, offset, size, buf, 2520 DMU_READ_NO_PREFETCH); 2521 ASSERT0(error); 2522 } else { 2523 ASSERT3P(zio, !=, NULL); 2524 size = doi.doi_data_block_size; 2525 if (ISP2(size)) { 2526 offset = P2ALIGN_TYPED(offset, size, uint64_t); 2527 } else { 2528 ASSERT3U(offset, <, size); 2529 offset = 0; 2530 } 2531 2532 zgd->zgd_lr = (struct zfs_locked_range *)ztest_range_lock(zd, 2533 object, offset, size, ZTRL_READER); 2534 2535 error = dmu_buf_hold_noread(os, object, offset, zgd, &db); 2536 2537 if (error == 0) { 2538 blkptr_t *bp = &lr->lr_blkptr; 2539 2540 zgd->zgd_db = db; 2541 zgd->zgd_bp = bp; 2542 2543 ASSERT3U(db->db_offset, ==, offset); 2544 ASSERT3U(db->db_size, ==, size); 2545 2546 error = dmu_sync(zio, lr->lr_common.lrc_txg, 2547 ztest_get_done, zgd); 2548 2549 if (error == 0) 2550 return (0); 2551 } 2552 } 2553 2554 ztest_get_done(zgd, error); 2555 2556 return (error); 2557 } 2558 2559 static void * 2560 ztest_lr_alloc(size_t lrsize, char *name) 2561 { 2562 char *lr; 2563 size_t namesize = name ? strlen(name) + 1 : 0; 2564 2565 lr = umem_zalloc(lrsize + namesize, UMEM_NOFAIL); 2566 2567 if (name) 2568 memcpy(lr + lrsize, name, namesize); 2569 2570 return (lr); 2571 } 2572 2573 static void 2574 ztest_lr_free(void *lr, size_t lrsize, char *name) 2575 { 2576 size_t namesize = name ? strlen(name) + 1 : 0; 2577 2578 umem_free(lr, lrsize + namesize); 2579 } 2580 2581 /* 2582 * Lookup a bunch of objects. Returns the number of objects not found. 
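 * Found objects must form a contiguous prefix of the template array;
 * once one lookup fails, every later entry should be missing as well.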
2583 */ 2584 static int 2585 ztest_lookup(ztest_ds_t *zd, ztest_od_t *od, int count) 2586 { 2587 int missing = 0; 2588 int error; 2589 int i; 2590 2591 ASSERT(MUTEX_HELD(&zd->zd_dirobj_lock)); 2592 2593 for (i = 0; i < count; i++, od++) { 2594 od->od_object = 0; 2595 error = zap_lookup(zd->zd_os, od->od_dir, od->od_name, 2596 sizeof (uint64_t), 1, &od->od_object); 2597 if (error) { 2598 ASSERT3S(error, ==, ENOENT); 2599 ASSERT0(od->od_object); 2600 missing++; 2601 } else { 2602 dmu_buf_t *db; 2603 ztest_block_tag_t *bbt; 2604 dmu_object_info_t doi; 2605 2606 ASSERT3U(od->od_object, !=, 0); 2607 ASSERT0(missing); /* there should be no gaps */ 2608 2609 ztest_object_lock(zd, od->od_object, ZTRL_READER); 2610 VERIFY0(dmu_bonus_hold(zd->zd_os, od->od_object, 2611 FTAG, &db)); 2612 dmu_object_info_from_db(db, &doi); 2613 bbt = ztest_bt_bonus(db); 2614 ASSERT3U(bbt->bt_magic, ==, BT_MAGIC); 2615 od->od_type = doi.doi_type; 2616 od->od_blocksize = doi.doi_data_block_size; 2617 od->od_gen = bbt->bt_gen; 2618 dmu_buf_rele(db, FTAG); 2619 ztest_object_unlock(zd, od->od_object); 2620 } 2621 } 2622 2623 return (missing); 2624 } 2625 2626 static int 2627 ztest_create(ztest_ds_t *zd, ztest_od_t *od, int count) 2628 { 2629 int missing = 0; 2630 int i; 2631 2632 ASSERT(MUTEX_HELD(&zd->zd_dirobj_lock)); 2633 2634 for (i = 0; i < count; i++, od++) { 2635 if (missing) { 2636 od->od_object = 0; 2637 missing++; 2638 continue; 2639 } 2640 2641 lr_create_t *lr = ztest_lr_alloc(sizeof (*lr), od->od_name); 2642 2643 lr->lr_doid = od->od_dir; 2644 lr->lr_foid = 0; /* 0 to allocate, > 0 to claim */ 2645 lr->lrz_type = od->od_crtype; 2646 lr->lrz_blocksize = od->od_crblocksize; 2647 lr->lrz_ibshift = ztest_random_ibshift(); 2648 lr->lrz_bonustype = DMU_OT_UINT64_OTHER; 2649 lr->lrz_dnodesize = od->od_crdnodesize; 2650 lr->lr_gen = od->od_crgen; 2651 lr->lr_crtime[0] = time(NULL); 2652 2653 if (ztest_replay_create(zd, lr, B_FALSE) != 0) { 2654 ASSERT0(missing); 2655 od->od_object = 0; 2656 missing++; 2657 } else { 2658 od->od_object = lr->lr_foid; 2659 od->od_type = od->od_crtype; 2660 od->od_blocksize = od->od_crblocksize; 2661 od->od_gen = od->od_crgen; 2662 ASSERT3U(od->od_object, !=, 0); 2663 } 2664 2665 ztest_lr_free(lr, sizeof (*lr), od->od_name); 2666 } 2667 2668 return (missing); 2669 } 2670 2671 static int 2672 ztest_remove(ztest_ds_t *zd, ztest_od_t *od, int count) 2673 { 2674 int missing = 0; 2675 int error; 2676 int i; 2677 2678 ASSERT(MUTEX_HELD(&zd->zd_dirobj_lock)); 2679 2680 od += count - 1; 2681 2682 for (i = count - 1; i >= 0; i--, od--) { 2683 if (missing) { 2684 missing++; 2685 continue; 2686 } 2687 2688 /* 2689 * No object was found. 
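 * There is nothing to remove for this entry, so skip it.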
2690 */ 2691 if (od->od_object == 0) 2692 continue; 2693 2694 lr_remove_t *lr = ztest_lr_alloc(sizeof (*lr), od->od_name); 2695 2696 lr->lr_doid = od->od_dir; 2697 2698 if ((error = ztest_replay_remove(zd, lr, B_FALSE)) != 0) { 2699 ASSERT3U(error, ==, ENOSPC); 2700 missing++; 2701 } else { 2702 od->od_object = 0; 2703 } 2704 ztest_lr_free(lr, sizeof (*lr), od->od_name); 2705 } 2706 2707 return (missing); 2708 } 2709 2710 static int 2711 ztest_write(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size, 2712 const void *data) 2713 { 2714 lr_write_t *lr; 2715 int error; 2716 2717 lr = ztest_lr_alloc(sizeof (*lr) + size, NULL); 2718 2719 lr->lr_foid = object; 2720 lr->lr_offset = offset; 2721 lr->lr_length = size; 2722 lr->lr_blkoff = 0; 2723 BP_ZERO(&lr->lr_blkptr); 2724 2725 memcpy(lr + 1, data, size); 2726 2727 error = ztest_replay_write(zd, lr, B_FALSE); 2728 2729 ztest_lr_free(lr, sizeof (*lr) + size, NULL); 2730 2731 return (error); 2732 } 2733 2734 static int 2735 ztest_truncate(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size) 2736 { 2737 lr_truncate_t *lr; 2738 int error; 2739 2740 lr = ztest_lr_alloc(sizeof (*lr), NULL); 2741 2742 lr->lr_foid = object; 2743 lr->lr_offset = offset; 2744 lr->lr_length = size; 2745 2746 error = ztest_replay_truncate(zd, lr, B_FALSE); 2747 2748 ztest_lr_free(lr, sizeof (*lr), NULL); 2749 2750 return (error); 2751 } 2752 2753 static int 2754 ztest_setattr(ztest_ds_t *zd, uint64_t object) 2755 { 2756 lr_setattr_t *lr; 2757 int error; 2758 2759 lr = ztest_lr_alloc(sizeof (*lr), NULL); 2760 2761 lr->lr_foid = object; 2762 lr->lr_size = 0; 2763 lr->lr_mode = 0; 2764 2765 error = ztest_replay_setattr(zd, lr, B_FALSE); 2766 2767 ztest_lr_free(lr, sizeof (*lr), NULL); 2768 2769 return (error); 2770 } 2771 2772 static void 2773 ztest_prealloc(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size) 2774 { 2775 objset_t *os = zd->zd_os; 2776 dmu_tx_t *tx; 2777 uint64_t txg; 2778 rl_t *rl; 2779 2780 txg_wait_synced(dmu_objset_pool(os), 0); 2781 2782 ztest_object_lock(zd, object, ZTRL_READER); 2783 rl = ztest_range_lock(zd, object, offset, size, ZTRL_WRITER); 2784 2785 tx = dmu_tx_create(os); 2786 2787 dmu_tx_hold_write(tx, object, offset, size); 2788 2789 txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); 2790 2791 if (txg != 0) { 2792 dmu_prealloc(os, object, offset, size, tx); 2793 dmu_tx_commit(tx); 2794 txg_wait_synced(dmu_objset_pool(os), txg); 2795 } else { 2796 (void) dmu_free_long_range(os, object, offset, size); 2797 } 2798 2799 ztest_range_unlock(rl); 2800 ztest_object_unlock(zd, object); 2801 } 2802 2803 static void 2804 ztest_io(ztest_ds_t *zd, uint64_t object, uint64_t offset) 2805 { 2806 int err; 2807 ztest_block_tag_t wbt; 2808 dmu_object_info_t doi; 2809 enum ztest_io_type io_type; 2810 uint64_t blocksize; 2811 void *data; 2812 2813 VERIFY0(dmu_object_info(zd->zd_os, object, &doi)); 2814 blocksize = doi.doi_data_block_size; 2815 data = umem_alloc(blocksize, UMEM_NOFAIL); 2816 2817 /* 2818 * Pick an i/o type at random, biased toward writing block tags. 
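 * Half of the time the random choice is overridden to ZTEST_IO_WRITE_TAG
 * so that verifiable block tags are written frequently.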
2819 */ 2820 io_type = ztest_random(ZTEST_IO_TYPES); 2821 if (ztest_random(2) == 0) 2822 io_type = ZTEST_IO_WRITE_TAG; 2823 2824 (void) pthread_rwlock_rdlock(&zd->zd_zilog_lock); 2825 2826 switch (io_type) { 2827 2828 case ZTEST_IO_WRITE_TAG: 2829 ztest_bt_generate(&wbt, zd->zd_os, object, doi.doi_dnodesize, 2830 offset, 0, 0, 0); 2831 (void) ztest_write(zd, object, offset, sizeof (wbt), &wbt); 2832 break; 2833 2834 case ZTEST_IO_WRITE_PATTERN: 2835 (void) memset(data, 'a' + (object + offset) % 5, blocksize); 2836 if (ztest_random(2) == 0) { 2837 /* 2838 * Induce fletcher2 collisions to ensure that 2839 * zio_ddt_collision() detects and resolves them 2840 * when using fletcher2-verify for deduplication. 2841 */ 2842 ((uint64_t *)data)[0] ^= 1ULL << 63; 2843 ((uint64_t *)data)[4] ^= 1ULL << 63; 2844 } 2845 (void) ztest_write(zd, object, offset, blocksize, data); 2846 break; 2847 2848 case ZTEST_IO_WRITE_ZEROES: 2849 memset(data, 0, blocksize); 2850 (void) ztest_write(zd, object, offset, blocksize, data); 2851 break; 2852 2853 case ZTEST_IO_TRUNCATE: 2854 (void) ztest_truncate(zd, object, offset, blocksize); 2855 break; 2856 2857 case ZTEST_IO_SETATTR: 2858 (void) ztest_setattr(zd, object); 2859 break; 2860 default: 2861 break; 2862 2863 case ZTEST_IO_REWRITE: 2864 (void) pthread_rwlock_rdlock(&ztest_name_lock); 2865 err = ztest_dsl_prop_set_uint64(zd->zd_name, 2866 ZFS_PROP_CHECKSUM, spa_dedup_checksum(ztest_spa), 2867 B_FALSE); 2868 ASSERT(err == 0 || err == ENOSPC); 2869 err = ztest_dsl_prop_set_uint64(zd->zd_name, 2870 ZFS_PROP_COMPRESSION, 2871 ztest_random_dsl_prop(ZFS_PROP_COMPRESSION), 2872 B_FALSE); 2873 ASSERT(err == 0 || err == ENOSPC); 2874 (void) pthread_rwlock_unlock(&ztest_name_lock); 2875 2876 VERIFY0(dmu_read(zd->zd_os, object, offset, blocksize, data, 2877 DMU_READ_NO_PREFETCH)); 2878 2879 (void) ztest_write(zd, object, offset, blocksize, data); 2880 break; 2881 } 2882 2883 (void) pthread_rwlock_unlock(&zd->zd_zilog_lock); 2884 2885 umem_free(data, blocksize); 2886 } 2887 2888 /* 2889 * Initialize an object description template. 2890 */ 2891 static void 2892 ztest_od_init(ztest_od_t *od, uint64_t id, const char *tag, uint64_t index, 2893 dmu_object_type_t type, uint64_t blocksize, uint64_t dnodesize, 2894 uint64_t gen) 2895 { 2896 od->od_dir = ZTEST_DIROBJ; 2897 od->od_object = 0; 2898 2899 od->od_crtype = type; 2900 od->od_crblocksize = blocksize ? blocksize : ztest_random_blocksize(); 2901 od->od_crdnodesize = dnodesize ? dnodesize : ztest_random_dnodesize(); 2902 od->od_crgen = gen; 2903 2904 od->od_type = DMU_OT_NONE; 2905 od->od_blocksize = 0; 2906 od->od_gen = 0; 2907 2908 (void) snprintf(od->od_name, sizeof (od->od_name), 2909 "%s(%"PRId64")[%"PRIu64"]", 2910 tag, id, index); 2911 } 2912 2913 /* 2914 * Lookup or create the objects for a test using the od template. 2915 * If the objects do not all exist, or if 'remove' is specified, 2916 * remove any existing objects and create new ones. Otherwise, 2917 * use the existing objects. 
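 * Returns 0 on success, or -1 if the existing objects could not be
 * removed or new ones created.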
2918 */ 2919 static int 2920 ztest_object_init(ztest_ds_t *zd, ztest_od_t *od, size_t size, boolean_t remove) 2921 { 2922 int count = size / sizeof (*od); 2923 int rv = 0; 2924 2925 mutex_enter(&zd->zd_dirobj_lock); 2926 if ((ztest_lookup(zd, od, count) != 0 || remove) && 2927 (ztest_remove(zd, od, count) != 0 || 2928 ztest_create(zd, od, count) != 0)) 2929 rv = -1; 2930 zd->zd_od = od; 2931 mutex_exit(&zd->zd_dirobj_lock); 2932 2933 return (rv); 2934 } 2935 2936 void 2937 ztest_zil_commit(ztest_ds_t *zd, uint64_t id) 2938 { 2939 (void) id; 2940 zilog_t *zilog = zd->zd_zilog; 2941 2942 (void) pthread_rwlock_rdlock(&zd->zd_zilog_lock); 2943 2944 zil_commit(zilog, ztest_random(ZTEST_OBJECTS)); 2945 2946 /* 2947 * Remember the committed values in zd, which is in parent/child 2948 * shared memory. If we die, the next iteration of ztest_run() 2949 * will verify that the log really does contain this record. 2950 */ 2951 mutex_enter(&zilog->zl_lock); 2952 ASSERT3P(zd->zd_shared, !=, NULL); 2953 ASSERT3U(zd->zd_shared->zd_seq, <=, zilog->zl_commit_lr_seq); 2954 zd->zd_shared->zd_seq = zilog->zl_commit_lr_seq; 2955 mutex_exit(&zilog->zl_lock); 2956 2957 (void) pthread_rwlock_unlock(&zd->zd_zilog_lock); 2958 } 2959 2960 /* 2961 * This function is designed to simulate the operations that occur during a 2962 * mount/unmount operation. We hold the dataset across these operations in an 2963 * attempt to expose any implicit assumptions about ZIL management. 2964 */ 2965 void 2966 ztest_zil_remount(ztest_ds_t *zd, uint64_t id) 2967 { 2968 (void) id; 2969 objset_t *os = zd->zd_os; 2970 2971 /* 2972 * We hold the ztest_vdev_lock so we don't cause problems with 2973 * other threads that wish to remove a log device, such as 2974 * ztest_device_removal(). 2975 */ 2976 mutex_enter(&ztest_vdev_lock); 2977 2978 /* 2979 * We grab the zd_dirobj_lock to ensure that no other thread is 2980 * updating the zil (i.e. adding in-memory log records) and the 2981 * zd_zilog_lock to block any I/O. 2982 */ 2983 mutex_enter(&zd->zd_dirobj_lock); 2984 (void) pthread_rwlock_wrlock(&zd->zd_zilog_lock); 2985 2986 /* zfsvfs_teardown() */ 2987 zil_close(zd->zd_zilog); 2988 2989 /* zfsvfs_setup() */ 2990 VERIFY3P(zil_open(os, ztest_get_data, NULL), ==, zd->zd_zilog); 2991 zil_replay(os, zd, ztest_replay_vector); 2992 2993 (void) pthread_rwlock_unlock(&zd->zd_zilog_lock); 2994 mutex_exit(&zd->zd_dirobj_lock); 2995 mutex_exit(&ztest_vdev_lock); 2996 } 2997 2998 /* 2999 * Verify that we can't destroy an active pool, create an existing pool, 3000 * or create a pool with a bad vdev spec. 3001 */ 3002 void 3003 ztest_spa_create_destroy(ztest_ds_t *zd, uint64_t id) 3004 { 3005 (void) zd, (void) id; 3006 ztest_shared_opts_t *zo = &ztest_opts; 3007 spa_t *spa; 3008 nvlist_t *nvroot; 3009 3010 if (zo->zo_mmp_test) 3011 return; 3012 3013 /* 3014 * Attempt to create using a bad file. 3015 */ 3016 nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, NULL, 0, 0, 1); 3017 VERIFY3U(ENOENT, ==, 3018 spa_create("ztest_bad_file", nvroot, NULL, NULL, NULL)); 3019 fnvlist_free(nvroot); 3020 3021 /* 3022 * Attempt to create using a bad mirror. 3023 */ 3024 nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, NULL, 0, 2, 1); 3025 VERIFY3U(ENOENT, ==, 3026 spa_create("ztest_bad_mirror", nvroot, NULL, NULL, NULL)); 3027 fnvlist_free(nvroot); 3028 3029 /* 3030 * Attempt to create an existing pool. It shouldn't matter 3031 * what's in the nvroot; we should fail with EEXIST. 
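 * Reusing the bogus '/dev/bogus' spec from above is sufficient here.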
3032 */ 3033 (void) pthread_rwlock_rdlock(&ztest_name_lock); 3034 nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, NULL, 0, 0, 1); 3035 VERIFY3U(EEXIST, ==, 3036 spa_create(zo->zo_pool, nvroot, NULL, NULL, NULL)); 3037 fnvlist_free(nvroot); 3038 3039 /* 3040 * We open a reference to the spa and then we try to export it 3041 * expecting one of the following errors: 3042 * 3043 * EBUSY 3044 * Because of the reference we just opened. 3045 * 3046 * ZFS_ERR_EXPORT_IN_PROGRESS 3047 * For the case that there is another ztest thread doing 3048 * an export concurrently. 3049 */ 3050 VERIFY0(spa_open(zo->zo_pool, &spa, FTAG)); 3051 int error = spa_destroy(zo->zo_pool); 3052 if (error != EBUSY && error != ZFS_ERR_EXPORT_IN_PROGRESS) { 3053 fatal(B_FALSE, "spa_destroy(%s) returned unexpected value %d", 3054 spa->spa_name, error); 3055 } 3056 spa_close(spa, FTAG); 3057 3058 (void) pthread_rwlock_unlock(&ztest_name_lock); 3059 } 3060 3061 /* 3062 * Start and then stop the MMP threads to ensure the startup and shutdown code 3063 * works properly. Actual protection and property-related code tested via ZTS. 3064 */ 3065 void 3066 ztest_mmp_enable_disable(ztest_ds_t *zd, uint64_t id) 3067 { 3068 (void) zd, (void) id; 3069 ztest_shared_opts_t *zo = &ztest_opts; 3070 spa_t *spa = ztest_spa; 3071 3072 if (zo->zo_mmp_test) 3073 return; 3074 3075 /* 3076 * Since enabling MMP involves setting a property, it could not be done 3077 * while the pool is suspended. 3078 */ 3079 if (spa_suspended(spa)) 3080 return; 3081 3082 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 3083 mutex_enter(&spa->spa_props_lock); 3084 3085 zfs_multihost_fail_intervals = 0; 3086 3087 if (!spa_multihost(spa)) { 3088 spa->spa_multihost = B_TRUE; 3089 mmp_thread_start(spa); 3090 } 3091 3092 mutex_exit(&spa->spa_props_lock); 3093 spa_config_exit(spa, SCL_CONFIG, FTAG); 3094 3095 txg_wait_synced(spa_get_dsl(spa), 0); 3096 mmp_signal_all_threads(); 3097 txg_wait_synced(spa_get_dsl(spa), 0); 3098 3099 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 3100 mutex_enter(&spa->spa_props_lock); 3101 3102 if (spa_multihost(spa)) { 3103 mmp_thread_stop(spa); 3104 spa->spa_multihost = B_FALSE; 3105 } 3106 3107 mutex_exit(&spa->spa_props_lock); 3108 spa_config_exit(spa, SCL_CONFIG, FTAG); 3109 } 3110 3111 static int 3112 ztest_get_raidz_children(spa_t *spa) 3113 { 3114 (void) spa; 3115 vdev_t *raidvd; 3116 3117 ASSERT(MUTEX_HELD(&ztest_vdev_lock)); 3118 3119 if (ztest_opts.zo_raid_do_expand) { 3120 raidvd = ztest_spa->spa_root_vdev->vdev_child[0]; 3121 3122 ASSERT(raidvd->vdev_ops == &vdev_raidz_ops); 3123 3124 return (raidvd->vdev_children); 3125 } 3126 3127 return (ztest_opts.zo_raid_children); 3128 } 3129 3130 void 3131 ztest_spa_upgrade(ztest_ds_t *zd, uint64_t id) 3132 { 3133 (void) zd, (void) id; 3134 spa_t *spa; 3135 uint64_t initial_version = SPA_VERSION_INITIAL; 3136 uint64_t raidz_children, version, newversion; 3137 nvlist_t *nvroot, *props; 3138 char *name; 3139 3140 if (ztest_opts.zo_mmp_test) 3141 return; 3142 3143 /* dRAID added after feature flags, skip upgrade test. */ 3144 if (strcmp(ztest_opts.zo_raid_type, VDEV_TYPE_DRAID) == 0) 3145 return; 3146 3147 mutex_enter(&ztest_vdev_lock); 3148 name = kmem_asprintf("%s_upgrade", ztest_opts.zo_pool); 3149 3150 /* 3151 * Clean up from previous runs. 
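 * Any '<pool>_upgrade' pool left over from a previous run is destroyed
 * before a fresh one is created below.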
3152 */ 3153 (void) spa_destroy(name); 3154 3155 raidz_children = ztest_get_raidz_children(ztest_spa); 3156 3157 nvroot = make_vdev_root(NULL, NULL, name, ztest_opts.zo_vdev_size, 0, 3158 NULL, raidz_children, ztest_opts.zo_mirrors, 1); 3159 3160 /* 3161 * If we're configuring a RAIDZ device then make sure that the 3162 * initial version is capable of supporting that feature. 3163 */ 3164 switch (ztest_opts.zo_raid_parity) { 3165 case 0: 3166 case 1: 3167 initial_version = SPA_VERSION_INITIAL; 3168 break; 3169 case 2: 3170 initial_version = SPA_VERSION_RAIDZ2; 3171 break; 3172 case 3: 3173 initial_version = SPA_VERSION_RAIDZ3; 3174 break; 3175 } 3176 3177 /* 3178 * Create a pool with a spa version that can be upgraded. Pick 3179 * a value between initial_version and SPA_VERSION_BEFORE_FEATURES. 3180 */ 3181 do { 3182 version = ztest_random_spa_version(initial_version); 3183 } while (version > SPA_VERSION_BEFORE_FEATURES); 3184 3185 props = fnvlist_alloc(); 3186 fnvlist_add_uint64(props, 3187 zpool_prop_to_name(ZPOOL_PROP_VERSION), version); 3188 VERIFY0(spa_create(name, nvroot, props, NULL, NULL)); 3189 fnvlist_free(nvroot); 3190 fnvlist_free(props); 3191 3192 VERIFY0(spa_open(name, &spa, FTAG)); 3193 VERIFY3U(spa_version(spa), ==, version); 3194 newversion = ztest_random_spa_version(version + 1); 3195 3196 if (ztest_opts.zo_verbose >= 4) { 3197 (void) printf("upgrading spa version from " 3198 "%"PRIu64" to %"PRIu64"\n", 3199 version, newversion); 3200 } 3201 3202 spa_upgrade(spa, newversion); 3203 VERIFY3U(spa_version(spa), >, version); 3204 VERIFY3U(spa_version(spa), ==, fnvlist_lookup_uint64(spa->spa_config, 3205 zpool_prop_to_name(ZPOOL_PROP_VERSION))); 3206 spa_close(spa, FTAG); 3207 3208 kmem_strfree(name); 3209 mutex_exit(&ztest_vdev_lock); 3210 } 3211 3212 static void 3213 ztest_spa_checkpoint(spa_t *spa) 3214 { 3215 ASSERT(MUTEX_HELD(&ztest_checkpoint_lock)); 3216 3217 int error = spa_checkpoint(spa->spa_name); 3218 3219 switch (error) { 3220 case 0: 3221 case ZFS_ERR_DEVRM_IN_PROGRESS: 3222 case ZFS_ERR_DISCARDING_CHECKPOINT: 3223 case ZFS_ERR_CHECKPOINT_EXISTS: 3224 case ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS: 3225 break; 3226 case ENOSPC: 3227 ztest_record_enospc(FTAG); 3228 break; 3229 default: 3230 fatal(B_FALSE, "spa_checkpoint(%s) = %d", spa->spa_name, error); 3231 } 3232 } 3233 3234 static void 3235 ztest_spa_discard_checkpoint(spa_t *spa) 3236 { 3237 ASSERT(MUTEX_HELD(&ztest_checkpoint_lock)); 3238 3239 int error = spa_checkpoint_discard(spa->spa_name); 3240 3241 switch (error) { 3242 case 0: 3243 case ZFS_ERR_DISCARDING_CHECKPOINT: 3244 case ZFS_ERR_NO_CHECKPOINT: 3245 break; 3246 default: 3247 fatal(B_FALSE, "spa_discard_checkpoint(%s) = %d", 3248 spa->spa_name, error); 3249 } 3250 3251 } 3252 3253 void 3254 ztest_spa_checkpoint_create_discard(ztest_ds_t *zd, uint64_t id) 3255 { 3256 (void) zd, (void) id; 3257 spa_t *spa = ztest_spa; 3258 3259 mutex_enter(&ztest_checkpoint_lock); 3260 if (ztest_random(2) == 0) { 3261 ztest_spa_checkpoint(spa); 3262 } else { 3263 ztest_spa_discard_checkpoint(spa); 3264 } 3265 mutex_exit(&ztest_checkpoint_lock); 3266 } 3267 3268 3269 static vdev_t * 3270 vdev_lookup_by_path(vdev_t *vd, const char *path) 3271 { 3272 vdev_t *mvd; 3273 int c; 3274 3275 if (vd->vdev_path != NULL && strcmp(path, vd->vdev_path) == 0) 3276 return (vd); 3277 3278 for (c = 0; c < vd->vdev_children; c++) 3279 if ((mvd = vdev_lookup_by_path(vd->vdev_child[c], path)) != 3280 NULL) 3281 return (mvd); 3282 3283 return (NULL); 3284 } 3285 3286 static int 3287 
spa_num_top_vdevs(spa_t *spa) 3288 { 3289 vdev_t *rvd = spa->spa_root_vdev; 3290 ASSERT3U(spa_config_held(spa, SCL_VDEV, RW_READER), ==, SCL_VDEV); 3291 return (rvd->vdev_children); 3292 } 3293 3294 /* 3295 * Verify that vdev_add() works as expected. 3296 */ 3297 void 3298 ztest_vdev_add_remove(ztest_ds_t *zd, uint64_t id) 3299 { 3300 (void) zd, (void) id; 3301 ztest_shared_t *zs = ztest_shared; 3302 spa_t *spa = ztest_spa; 3303 uint64_t leaves; 3304 uint64_t guid; 3305 uint64_t raidz_children; 3306 3307 nvlist_t *nvroot; 3308 int error; 3309 3310 if (ztest_opts.zo_mmp_test) 3311 return; 3312 3313 mutex_enter(&ztest_vdev_lock); 3314 raidz_children = ztest_get_raidz_children(spa); 3315 leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * raidz_children; 3316 3317 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 3318 3319 ztest_shared->zs_vdev_next_leaf = spa_num_top_vdevs(spa) * leaves; 3320 3321 /* 3322 * If we have slogs then remove them 1/4 of the time. 3323 */ 3324 if (spa_has_slogs(spa) && ztest_random(4) == 0) { 3325 metaslab_group_t *mg; 3326 3327 /* 3328 * find the first real slog in log allocation class 3329 */ 3330 mg = spa_log_class(spa)->mc_allocator[0].mca_rotor; 3331 while (!mg->mg_vd->vdev_islog) 3332 mg = mg->mg_next; 3333 3334 guid = mg->mg_vd->vdev_guid; 3335 3336 spa_config_exit(spa, SCL_VDEV, FTAG); 3337 3338 /* 3339 * We have to grab the zs_name_lock as writer to 3340 * prevent a race between removing a slog (dmu_objset_find) 3341 * and destroying a dataset. Removing the slog will 3342 * grab a reference on the dataset which may cause 3343 * dsl_destroy_head() to fail with EBUSY thus 3344 * leaving the dataset in an inconsistent state. 3345 */ 3346 pthread_rwlock_wrlock(&ztest_name_lock); 3347 error = spa_vdev_remove(spa, guid, B_FALSE); 3348 pthread_rwlock_unlock(&ztest_name_lock); 3349 3350 switch (error) { 3351 case 0: 3352 case EEXIST: /* Generic zil_reset() error */ 3353 case EBUSY: /* Replay required */ 3354 case EACCES: /* Crypto key not loaded */ 3355 case ZFS_ERR_CHECKPOINT_EXISTS: 3356 case ZFS_ERR_DISCARDING_CHECKPOINT: 3357 break; 3358 default: 3359 fatal(B_FALSE, "spa_vdev_remove() = %d", error); 3360 } 3361 } else { 3362 spa_config_exit(spa, SCL_VDEV, FTAG); 3363 3364 /* 3365 * Make 1/4 of the devices be log devices 3366 */ 3367 nvroot = make_vdev_root(NULL, NULL, NULL, 3368 ztest_opts.zo_vdev_size, 0, (ztest_random(4) == 0) ? 3369 "log" : NULL, raidz_children, zs->zs_mirrors, 3370 1); 3371 3372 error = spa_vdev_add(spa, nvroot, B_FALSE); 3373 fnvlist_free(nvroot); 3374 3375 switch (error) { 3376 case 0: 3377 break; 3378 case ENOSPC: 3379 ztest_record_enospc("spa_vdev_add"); 3380 break; 3381 default: 3382 fatal(B_FALSE, "spa_vdev_add() = %d", error); 3383 } 3384 } 3385 3386 mutex_exit(&ztest_vdev_lock); 3387 } 3388 3389 void 3390 ztest_vdev_class_add(ztest_ds_t *zd, uint64_t id) 3391 { 3392 (void) zd, (void) id; 3393 ztest_shared_t *zs = ztest_shared; 3394 spa_t *spa = ztest_spa; 3395 uint64_t leaves; 3396 nvlist_t *nvroot; 3397 uint64_t raidz_children; 3398 const char *class = (ztest_random(2) == 0) ? 
3399 VDEV_ALLOC_BIAS_SPECIAL : VDEV_ALLOC_BIAS_DEDUP; 3400 int error; 3401 3402 /* 3403 * By default add a special vdev 50% of the time 3404 */ 3405 if ((ztest_opts.zo_special_vdevs == ZTEST_VDEV_CLASS_OFF) || 3406 (ztest_opts.zo_special_vdevs == ZTEST_VDEV_CLASS_RND && 3407 ztest_random(2) == 0)) { 3408 return; 3409 } 3410 3411 mutex_enter(&ztest_vdev_lock); 3412 3413 /* Only test with mirrors */ 3414 if (zs->zs_mirrors < 2) { 3415 mutex_exit(&ztest_vdev_lock); 3416 return; 3417 } 3418 3419 /* requires feature@allocation_classes */ 3420 if (!spa_feature_is_enabled(spa, SPA_FEATURE_ALLOCATION_CLASSES)) { 3421 mutex_exit(&ztest_vdev_lock); 3422 return; 3423 } 3424 3425 raidz_children = ztest_get_raidz_children(spa); 3426 leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * raidz_children; 3427 3428 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 3429 ztest_shared->zs_vdev_next_leaf = spa_num_top_vdevs(spa) * leaves; 3430 spa_config_exit(spa, SCL_VDEV, FTAG); 3431 3432 nvroot = make_vdev_root(NULL, NULL, NULL, ztest_opts.zo_vdev_size, 0, 3433 class, raidz_children, zs->zs_mirrors, 1); 3434 3435 error = spa_vdev_add(spa, nvroot, B_FALSE); 3436 fnvlist_free(nvroot); 3437 3438 if (error == ENOSPC) 3439 ztest_record_enospc("spa_vdev_add"); 3440 else if (error != 0) 3441 fatal(B_FALSE, "spa_vdev_add() = %d", error); 3442 3443 /* 3444 * 50% of the time allow small blocks in the special class 3445 */ 3446 if (error == 0 && 3447 spa_special_class(spa)->mc_groups == 1 && ztest_random(2) == 0) { 3448 if (ztest_opts.zo_verbose >= 3) 3449 (void) printf("Enabling special VDEV small blocks\n"); 3450 error = ztest_dsl_prop_set_uint64(zd->zd_name, 3451 ZFS_PROP_SPECIAL_SMALL_BLOCKS, 32768, B_FALSE); 3452 ASSERT(error == 0 || error == ENOSPC); 3453 } 3454 3455 mutex_exit(&ztest_vdev_lock); 3456 3457 if (ztest_opts.zo_verbose >= 3) { 3458 metaslab_class_t *mc; 3459 3460 if (strcmp(class, VDEV_ALLOC_BIAS_SPECIAL) == 0) 3461 mc = spa_special_class(spa); 3462 else 3463 mc = spa_dedup_class(spa); 3464 (void) printf("Added a %s mirrored vdev (of %d)\n", 3465 class, (int)mc->mc_groups); 3466 } 3467 } 3468 3469 /* 3470 * Verify that adding/removing aux devices (l2arc, hot spare) works as expected. 3471 */ 3472 void 3473 ztest_vdev_aux_add_remove(ztest_ds_t *zd, uint64_t id) 3474 { 3475 (void) zd, (void) id; 3476 ztest_shared_t *zs = ztest_shared; 3477 spa_t *spa = ztest_spa; 3478 vdev_t *rvd = spa->spa_root_vdev; 3479 spa_aux_vdev_t *sav; 3480 const char *aux; 3481 char *path; 3482 uint64_t guid = 0; 3483 int error, ignore_err = 0; 3484 3485 if (ztest_opts.zo_mmp_test) 3486 return; 3487 3488 path = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); 3489 3490 if (ztest_random(2) == 0) { 3491 sav = &spa->spa_spares; 3492 aux = ZPOOL_CONFIG_SPARES; 3493 } else { 3494 sav = &spa->spa_l2cache; 3495 aux = ZPOOL_CONFIG_L2CACHE; 3496 } 3497 3498 mutex_enter(&ztest_vdev_lock); 3499 3500 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 3501 3502 if (sav->sav_count != 0 && ztest_random(4) == 0) { 3503 /* 3504 * Pick a random device to remove. 3505 */ 3506 vdev_t *svd = sav->sav_vdevs[ztest_random(sav->sav_count)]; 3507 3508 /* dRAID spares cannot be removed; try anyways to see ENOTSUP */ 3509 if (strstr(svd->vdev_path, VDEV_TYPE_DRAID) != NULL) 3510 ignore_err = ENOTSUP; 3511 3512 guid = svd->vdev_guid; 3513 } else { 3514 /* 3515 * Find an unused device we can add. 
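 * Iterate over candidate aux paths until we find one that is neither an
 * existing aux vdev nor present in the main vdev tree.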
3516 */ 3517 zs->zs_vdev_aux = 0; 3518 for (;;) { 3519 int c; 3520 (void) snprintf(path, MAXPATHLEN, ztest_aux_template, 3521 ztest_opts.zo_dir, ztest_opts.zo_pool, aux, 3522 zs->zs_vdev_aux); 3523 for (c = 0; c < sav->sav_count; c++) 3524 if (strcmp(sav->sav_vdevs[c]->vdev_path, 3525 path) == 0) 3526 break; 3527 if (c == sav->sav_count && 3528 vdev_lookup_by_path(rvd, path) == NULL) 3529 break; 3530 zs->zs_vdev_aux++; 3531 } 3532 } 3533 3534 spa_config_exit(spa, SCL_VDEV, FTAG); 3535 3536 if (guid == 0) { 3537 /* 3538 * Add a new device. 3539 */ 3540 nvlist_t *nvroot = make_vdev_root(NULL, aux, NULL, 3541 (ztest_opts.zo_vdev_size * 5) / 4, 0, NULL, 0, 0, 1); 3542 error = spa_vdev_add(spa, nvroot, B_FALSE); 3543 3544 switch (error) { 3545 case 0: 3546 break; 3547 default: 3548 fatal(B_FALSE, "spa_vdev_add(%p) = %d", nvroot, error); 3549 } 3550 fnvlist_free(nvroot); 3551 } else { 3552 /* 3553 * Remove an existing device. Sometimes, dirty its 3554 * vdev state first to make sure we handle removal 3555 * of devices that have pending state changes. 3556 */ 3557 if (ztest_random(2) == 0) 3558 (void) vdev_online(spa, guid, 0, NULL); 3559 3560 error = spa_vdev_remove(spa, guid, B_FALSE); 3561 3562 switch (error) { 3563 case 0: 3564 case EBUSY: 3565 case ZFS_ERR_CHECKPOINT_EXISTS: 3566 case ZFS_ERR_DISCARDING_CHECKPOINT: 3567 break; 3568 default: 3569 if (error != ignore_err) 3570 fatal(B_FALSE, 3571 "spa_vdev_remove(%"PRIu64") = %d", 3572 guid, error); 3573 } 3574 } 3575 3576 mutex_exit(&ztest_vdev_lock); 3577 3578 umem_free(path, MAXPATHLEN); 3579 } 3580 3581 /* 3582 * split a pool if it has mirror tlvdevs 3583 */ 3584 void 3585 ztest_split_pool(ztest_ds_t *zd, uint64_t id) 3586 { 3587 (void) zd, (void) id; 3588 ztest_shared_t *zs = ztest_shared; 3589 spa_t *spa = ztest_spa; 3590 vdev_t *rvd = spa->spa_root_vdev; 3591 nvlist_t *tree, **child, *config, *split, **schild; 3592 uint_t c, children, schildren = 0, lastlogid = 0; 3593 int error = 0; 3594 3595 if (ztest_opts.zo_mmp_test) 3596 return; 3597 3598 mutex_enter(&ztest_vdev_lock); 3599 3600 /* ensure we have a usable config; mirrors of raidz aren't supported */ 3601 if (zs->zs_mirrors < 3 || ztest_opts.zo_raid_children > 1) { 3602 mutex_exit(&ztest_vdev_lock); 3603 return; 3604 } 3605 3606 /* clean up the old pool, if any */ 3607 (void) spa_destroy("splitp"); 3608 3609 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 3610 3611 /* generate a config from the existing config */ 3612 mutex_enter(&spa->spa_props_lock); 3613 tree = fnvlist_lookup_nvlist(spa->spa_config, ZPOOL_CONFIG_VDEV_TREE); 3614 mutex_exit(&spa->spa_props_lock); 3615 3616 VERIFY0(nvlist_lookup_nvlist_array(tree, ZPOOL_CONFIG_CHILDREN, 3617 &child, &children)); 3618 3619 schild = umem_alloc(rvd->vdev_children * sizeof (nvlist_t *), 3620 UMEM_NOFAIL); 3621 for (c = 0; c < children; c++) { 3622 vdev_t *tvd = rvd->vdev_child[c]; 3623 nvlist_t **mchild; 3624 uint_t mchildren; 3625 3626 if (tvd->vdev_islog || tvd->vdev_ops == &vdev_hole_ops) { 3627 schild[schildren] = fnvlist_alloc(); 3628 fnvlist_add_string(schild[schildren], 3629 ZPOOL_CONFIG_TYPE, VDEV_TYPE_HOLE); 3630 fnvlist_add_uint64(schild[schildren], 3631 ZPOOL_CONFIG_IS_HOLE, 1); 3632 if (lastlogid == 0) 3633 lastlogid = schildren; 3634 ++schildren; 3635 continue; 3636 } 3637 lastlogid = 0; 3638 VERIFY0(nvlist_lookup_nvlist_array(child[c], 3639 ZPOOL_CONFIG_CHILDREN, &mchild, &mchildren)); 3640 schild[schildren++] = fnvlist_dup(mchild[0]); 3641 } 3642 3643 /* OK, create a config that can be used to split */ 3644 split = 
fnvlist_alloc(); 3645 fnvlist_add_string(split, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT); 3646 fnvlist_add_nvlist_array(split, ZPOOL_CONFIG_CHILDREN, 3647 (const nvlist_t **)schild, lastlogid != 0 ? lastlogid : schildren); 3648 3649 config = fnvlist_alloc(); 3650 fnvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, split); 3651 3652 for (c = 0; c < schildren; c++) 3653 fnvlist_free(schild[c]); 3654 umem_free(schild, rvd->vdev_children * sizeof (nvlist_t *)); 3655 fnvlist_free(split); 3656 3657 spa_config_exit(spa, SCL_VDEV, FTAG); 3658 3659 (void) pthread_rwlock_wrlock(&ztest_name_lock); 3660 error = spa_vdev_split_mirror(spa, "splitp", config, NULL, B_FALSE); 3661 (void) pthread_rwlock_unlock(&ztest_name_lock); 3662 3663 fnvlist_free(config); 3664 3665 if (error == 0) { 3666 (void) printf("successful split - results:\n"); 3667 mutex_enter(&spa_namespace_lock); 3668 show_pool_stats(spa); 3669 show_pool_stats(spa_lookup("splitp")); 3670 mutex_exit(&spa_namespace_lock); 3671 ++zs->zs_splits; 3672 --zs->zs_mirrors; 3673 } 3674 mutex_exit(&ztest_vdev_lock); 3675 } 3676 3677 /* 3678 * Verify that we can attach and detach devices. 3679 */ 3680 void 3681 ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id) 3682 { 3683 (void) zd, (void) id; 3684 ztest_shared_t *zs = ztest_shared; 3685 spa_t *spa = ztest_spa; 3686 spa_aux_vdev_t *sav = &spa->spa_spares; 3687 vdev_t *rvd = spa->spa_root_vdev; 3688 vdev_t *oldvd, *newvd, *pvd; 3689 nvlist_t *root; 3690 uint64_t leaves; 3691 uint64_t leaf, top; 3692 uint64_t ashift = ztest_get_ashift(); 3693 uint64_t oldguid, pguid; 3694 uint64_t oldsize, newsize; 3695 uint64_t raidz_children; 3696 char *oldpath, *newpath; 3697 int replacing; 3698 int oldvd_has_siblings = B_FALSE; 3699 int newvd_is_spare = B_FALSE; 3700 int newvd_is_dspare = B_FALSE; 3701 int oldvd_is_log; 3702 int oldvd_is_special; 3703 int error, expected_error; 3704 3705 if (ztest_opts.zo_mmp_test) 3706 return; 3707 3708 oldpath = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); 3709 newpath = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); 3710 3711 mutex_enter(&ztest_vdev_lock); 3712 raidz_children = ztest_get_raidz_children(spa); 3713 leaves = MAX(zs->zs_mirrors, 1) * raidz_children; 3714 3715 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3716 3717 /* 3718 * If a vdev is in the process of being removed, its removal may 3719 * finish while we are in progress, leading to an unexpected error 3720 * value. Don't bother trying to attach while we are in the middle 3721 * of removal. 3722 */ 3723 if (ztest_device_removal_active) { 3724 spa_config_exit(spa, SCL_ALL, FTAG); 3725 goto out; 3726 } 3727 3728 /* 3729 * RAIDZ leaf VDEV mirrors are not currently supported while a 3730 * RAIDZ expansion is in progress. 3731 */ 3732 if (ztest_opts.zo_raid_do_expand) { 3733 spa_config_exit(spa, SCL_ALL, FTAG); 3734 goto out; 3735 } 3736 3737 /* 3738 * Decide whether to do an attach or a replace. 3739 */ 3740 replacing = ztest_random(2); 3741 3742 /* 3743 * Pick a random top-level vdev. 3744 */ 3745 top = ztest_random_vdev_top(spa, B_TRUE); 3746 3747 /* 3748 * Pick a random leaf within it. 3749 */ 3750 leaf = ztest_random(leaves); 3751 3752 /* 3753 * Locate this vdev. 
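 * Start at the selected top-level vdev and walk down through any mirror
 * and raidz layers to the chosen leaf.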
3754 */ 3755 oldvd = rvd->vdev_child[top]; 3756 3757 /* pick a child from the mirror */ 3758 if (zs->zs_mirrors >= 1) { 3759 ASSERT3P(oldvd->vdev_ops, ==, &vdev_mirror_ops); 3760 ASSERT3U(oldvd->vdev_children, >=, zs->zs_mirrors); 3761 oldvd = oldvd->vdev_child[leaf / raidz_children]; 3762 } 3763 3764 /* pick a child out of the raidz group */ 3765 if (ztest_opts.zo_raid_children > 1) { 3766 if (strcmp(oldvd->vdev_ops->vdev_op_type, "raidz") == 0) 3767 ASSERT3P(oldvd->vdev_ops, ==, &vdev_raidz_ops); 3768 else 3769 ASSERT3P(oldvd->vdev_ops, ==, &vdev_draid_ops); 3770 oldvd = oldvd->vdev_child[leaf % raidz_children]; 3771 } 3772 3773 /* 3774 * If we're already doing an attach or replace, oldvd may be a 3775 * mirror vdev -- in which case, pick a random child. 3776 */ 3777 while (oldvd->vdev_children != 0) { 3778 oldvd_has_siblings = B_TRUE; 3779 ASSERT3U(oldvd->vdev_children, >=, 2); 3780 oldvd = oldvd->vdev_child[ztest_random(oldvd->vdev_children)]; 3781 } 3782 3783 oldguid = oldvd->vdev_guid; 3784 oldsize = vdev_get_min_asize(oldvd); 3785 oldvd_is_log = oldvd->vdev_top->vdev_islog; 3786 oldvd_is_special = 3787 oldvd->vdev_top->vdev_alloc_bias == VDEV_BIAS_SPECIAL || 3788 oldvd->vdev_top->vdev_alloc_bias == VDEV_BIAS_DEDUP; 3789 (void) strlcpy(oldpath, oldvd->vdev_path, MAXPATHLEN); 3790 pvd = oldvd->vdev_parent; 3791 pguid = pvd->vdev_guid; 3792 3793 /* 3794 * If oldvd has siblings, then half of the time, detach it. Prior 3795 * to the detach the pool is scrubbed in order to prevent creating 3796 * unrepairable blocks as a result of the data corruption injection. 3797 */ 3798 if (oldvd_has_siblings && ztest_random(2) == 0) { 3799 spa_config_exit(spa, SCL_ALL, FTAG); 3800 3801 error = ztest_scrub_impl(spa); 3802 if (error) 3803 goto out; 3804 3805 error = spa_vdev_detach(spa, oldguid, pguid, B_FALSE); 3806 if (error != 0 && error != ENODEV && error != EBUSY && 3807 error != ENOTSUP && error != ZFS_ERR_CHECKPOINT_EXISTS && 3808 error != ZFS_ERR_DISCARDING_CHECKPOINT) 3809 fatal(B_FALSE, "detach (%s) returned %d", 3810 oldpath, error); 3811 goto out; 3812 } 3813 3814 /* 3815 * For the new vdev, choose with equal probability between the two 3816 * standard paths (ending in either 'a' or 'b') or a random hot spare. 3817 */ 3818 if (sav->sav_count != 0 && ztest_random(3) == 0) { 3819 newvd = sav->sav_vdevs[ztest_random(sav->sav_count)]; 3820 newvd_is_spare = B_TRUE; 3821 3822 if (newvd->vdev_ops == &vdev_draid_spare_ops) 3823 newvd_is_dspare = B_TRUE; 3824 3825 (void) strlcpy(newpath, newvd->vdev_path, MAXPATHLEN); 3826 } else { 3827 (void) snprintf(newpath, MAXPATHLEN, ztest_dev_template, 3828 ztest_opts.zo_dir, ztest_opts.zo_pool, 3829 top * leaves + leaf); 3830 if (ztest_random(2) == 0) 3831 newpath[strlen(newpath) - 1] = 'b'; 3832 newvd = vdev_lookup_by_path(rvd, newpath); 3833 } 3834 3835 if (newvd) { 3836 /* 3837 * Reopen to ensure the vdev's asize field isn't stale. 3838 */ 3839 vdev_reopen(newvd); 3840 newsize = vdev_get_min_asize(newvd); 3841 } else { 3842 /* 3843 * Make newsize a little bigger or smaller than oldsize. 3844 * If it's smaller, the attach should fail. 3845 * If it's larger, and we're doing a replace, 3846 * we should get dynamic LUN growth when we're done. 3847 */ 3848 newsize = 10 * oldsize / (9 + ztest_random(3)); 3849 } 3850 3851 /* 3852 * If pvd is not a mirror or root, the attach should fail with ENOTSUP, 3853 * unless it's a replace; in that case any non-replacing parent is OK. 3854 * 3855 * If newvd is already part of the pool, it should fail with EBUSY. 
3856 * 3857 * If newvd is too small, it should fail with EOVERFLOW. 3858 * 3859 * If newvd is a distributed spare and it's being attached to a 3860 * dRAID which is not its parent it should fail with EINVAL. 3861 */ 3862 if (pvd->vdev_ops != &vdev_mirror_ops && 3863 pvd->vdev_ops != &vdev_root_ops && (!replacing || 3864 pvd->vdev_ops == &vdev_replacing_ops || 3865 pvd->vdev_ops == &vdev_spare_ops)) 3866 expected_error = ENOTSUP; 3867 else if (newvd_is_spare && 3868 (!replacing || oldvd_is_log || oldvd_is_special)) 3869 expected_error = ENOTSUP; 3870 else if (newvd == oldvd) 3871 expected_error = replacing ? 0 : EBUSY; 3872 else if (vdev_lookup_by_path(rvd, newpath) != NULL) 3873 expected_error = EBUSY; 3874 else if (!newvd_is_dspare && newsize < oldsize) 3875 expected_error = EOVERFLOW; 3876 else if (ashift > oldvd->vdev_top->vdev_ashift) 3877 expected_error = EDOM; 3878 else if (newvd_is_dspare && pvd != vdev_draid_spare_get_parent(newvd)) 3879 expected_error = EINVAL; 3880 else 3881 expected_error = 0; 3882 3883 spa_config_exit(spa, SCL_ALL, FTAG); 3884 3885 /* 3886 * Build the nvlist describing newpath. 3887 */ 3888 root = make_vdev_root(newpath, NULL, NULL, newvd == NULL ? newsize : 0, 3889 ashift, NULL, 0, 0, 1); 3890 3891 /* 3892 * When supported select either a healing or sequential resilver. 3893 */ 3894 boolean_t rebuilding = B_FALSE; 3895 if (pvd->vdev_ops == &vdev_mirror_ops || 3896 pvd->vdev_ops == &vdev_root_ops) { 3897 rebuilding = !!ztest_random(2); 3898 } 3899 3900 error = spa_vdev_attach(spa, oldguid, root, replacing, rebuilding); 3901 3902 fnvlist_free(root); 3903 3904 /* 3905 * If our parent was the replacing vdev, but the replace completed, 3906 * then instead of failing with ENOTSUP we may either succeed, 3907 * fail with ENODEV, or fail with EOVERFLOW. 3908 */ 3909 if (expected_error == ENOTSUP && 3910 (error == 0 || error == ENODEV || error == EOVERFLOW)) 3911 expected_error = error; 3912 3913 /* 3914 * If someone grew the LUN, the replacement may be too small. 
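 * In that case accept EOVERFLOW; EBUSY is likewise treated as expected.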
3915 */ 3916 if (error == EOVERFLOW || error == EBUSY) 3917 expected_error = error; 3918 3919 if (error == ZFS_ERR_CHECKPOINT_EXISTS || 3920 error == ZFS_ERR_DISCARDING_CHECKPOINT || 3921 error == ZFS_ERR_RESILVER_IN_PROGRESS || 3922 error == ZFS_ERR_REBUILD_IN_PROGRESS) 3923 expected_error = error; 3924 3925 if (error != expected_error && expected_error != EBUSY) { 3926 fatal(B_FALSE, "attach (%s %"PRIu64", %s %"PRIu64", %d) " 3927 "returned %d, expected %d", 3928 oldpath, oldsize, newpath, 3929 newsize, replacing, error, expected_error); 3930 } 3931 out: 3932 mutex_exit(&ztest_vdev_lock); 3933 3934 umem_free(oldpath, MAXPATHLEN); 3935 umem_free(newpath, MAXPATHLEN); 3936 } 3937 3938 static void 3939 raidz_scratch_verify(void) 3940 { 3941 spa_t *spa; 3942 uint64_t write_size, logical_size, offset; 3943 raidz_reflow_scratch_state_t state; 3944 vdev_raidz_expand_t *vre; 3945 vdev_t *raidvd; 3946 3947 ASSERT(raidz_expand_pause_point == RAIDZ_EXPAND_PAUSE_NONE); 3948 3949 if (ztest_scratch_state->zs_raidz_scratch_verify_pause == 0) 3950 return; 3951 3952 kernel_init(SPA_MODE_READ); 3953 3954 mutex_enter(&spa_namespace_lock); 3955 spa = spa_lookup(ztest_opts.zo_pool); 3956 ASSERT(spa); 3957 spa->spa_import_flags |= ZFS_IMPORT_SKIP_MMP; 3958 mutex_exit(&spa_namespace_lock); 3959 3960 VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG)); 3961 3962 ASSERT3U(RRSS_GET_OFFSET(&spa->spa_uberblock), !=, UINT64_MAX); 3963 3964 mutex_enter(&ztest_vdev_lock); 3965 3966 spa_config_enter(spa, SCL_ALL, FTAG, RW_READER); 3967 3968 vre = spa->spa_raidz_expand; 3969 if (vre == NULL) 3970 goto out; 3971 3972 raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); 3973 offset = RRSS_GET_OFFSET(&spa->spa_uberblock); 3974 state = RRSS_GET_STATE(&spa->spa_uberblock); 3975 write_size = P2ALIGN_TYPED(VDEV_BOOT_SIZE, 1 << raidvd->vdev_ashift, 3976 uint64_t); 3977 logical_size = write_size * raidvd->vdev_children; 3978 3979 switch (state) { 3980 /* 3981 * Initial state of reflow process. RAIDZ expansion was 3982 * requested by user, but scratch object was not created. 3983 */ 3984 case RRSS_SCRATCH_NOT_IN_USE: 3985 ASSERT3U(offset, ==, 0); 3986 break; 3987 3988 /* 3989 * Scratch object was synced and stored in boot area. 3990 */ 3991 case RRSS_SCRATCH_VALID: 3992 3993 /* 3994 * Scratch object was synced back to raidz start offset, 3995 * raidz is ready for sector by sector reflow process. 3996 */ 3997 case RRSS_SCRATCH_INVALID_SYNCED: 3998 3999 /* 4000 * Scratch object was synced back to raidz start offset 4001 * on zpool importing, raidz is ready for sector by sector 4002 * reflow process. 4003 */ 4004 case RRSS_SCRATCH_INVALID_SYNCED_ON_IMPORT: 4005 ASSERT3U(offset, ==, logical_size); 4006 break; 4007 4008 /* 4009 * Sector by sector reflow process started. 
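 * The recorded offset must have advanced to at least the logical size
 * of the scratch area.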
4010 */ 4011 case RRSS_SCRATCH_INVALID_SYNCED_REFLOW: 4012 ASSERT3U(offset, >=, logical_size); 4013 break; 4014 } 4015 4016 out: 4017 spa_config_exit(spa, SCL_ALL, FTAG); 4018 4019 mutex_exit(&ztest_vdev_lock); 4020 4021 ztest_scratch_state->zs_raidz_scratch_verify_pause = 0; 4022 4023 spa_close(spa, FTAG); 4024 kernel_fini(); 4025 } 4026 4027 static void 4028 ztest_scratch_thread(void *arg) 4029 { 4030 (void) arg; 4031 4032 /* wait up to 10 seconds */ 4033 for (int t = 100; t > 0; t -= 1) { 4034 if (raidz_expand_pause_point == RAIDZ_EXPAND_PAUSE_NONE) 4035 thread_exit(); 4036 4037 (void) poll(NULL, 0, 100); 4038 } 4039 4040 /* killed when the scratch area progress reached a certain point */ 4041 ztest_kill(ztest_shared); 4042 } 4043 4044 /* 4045 * Verify that we can attach raidz device. 4046 */ 4047 void 4048 ztest_vdev_raidz_attach(ztest_ds_t *zd, uint64_t id) 4049 { 4050 (void) zd, (void) id; 4051 ztest_shared_t *zs = ztest_shared; 4052 spa_t *spa = ztest_spa; 4053 uint64_t leaves, raidz_children, newsize, ashift = ztest_get_ashift(); 4054 kthread_t *scratch_thread = NULL; 4055 vdev_t *newvd, *pvd; 4056 nvlist_t *root; 4057 char *newpath = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); 4058 int error, expected_error = 0; 4059 4060 mutex_enter(&ztest_vdev_lock); 4061 4062 spa_config_enter(spa, SCL_ALL, FTAG, RW_READER); 4063 4064 /* Only allow attach when raid-kind = 'eraidz' */ 4065 if (!ztest_opts.zo_raid_do_expand) { 4066 spa_config_exit(spa, SCL_ALL, FTAG); 4067 goto out; 4068 } 4069 4070 if (ztest_opts.zo_mmp_test) { 4071 spa_config_exit(spa, SCL_ALL, FTAG); 4072 goto out; 4073 } 4074 4075 if (ztest_device_removal_active) { 4076 spa_config_exit(spa, SCL_ALL, FTAG); 4077 goto out; 4078 } 4079 4080 pvd = vdev_lookup_top(spa, 0); 4081 4082 ASSERT(pvd->vdev_ops == &vdev_raidz_ops); 4083 4084 /* 4085 * Get size of a child of the raidz group, 4086 * make sure device is a bit bigger 4087 */ 4088 newvd = pvd->vdev_child[ztest_random(pvd->vdev_children)]; 4089 newsize = 10 * vdev_get_min_asize(newvd) / (9 + ztest_random(2)); 4090 4091 /* 4092 * Get next attached leaf id 4093 */ 4094 raidz_children = ztest_get_raidz_children(spa); 4095 leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * raidz_children; 4096 zs->zs_vdev_next_leaf = spa_num_top_vdevs(spa) * leaves; 4097 4098 if (spa->spa_raidz_expand) 4099 expected_error = ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS; 4100 4101 spa_config_exit(spa, SCL_ALL, FTAG); 4102 4103 /* 4104 * Path to vdev to be attached 4105 */ 4106 (void) snprintf(newpath, MAXPATHLEN, ztest_dev_template, 4107 ztest_opts.zo_dir, ztest_opts.zo_pool, zs->zs_vdev_next_leaf); 4108 4109 /* 4110 * Build the nvlist describing newpath. 4111 */ 4112 root = make_vdev_root(newpath, NULL, NULL, newsize, ashift, NULL, 4113 0, 0, 1); 4114 4115 /* 4116 * 50% of the time, set raidz_expand_pause_point to cause 4117 * raidz_reflow_scratch_sync() to pause at a certain point and 4118 * then kill the test after 10 seconds so raidz_scratch_verify() 4119 * can confirm consistency when the pool is imported. 
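 * The pause point is chosen at random from the scratch-phase pause
 * states; ztest_scratch_thread() (above) supplies the 10 second
 * timeout before issuing the kill.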
4120 */ 4121 if (ztest_random(2) == 0 && expected_error == 0) { 4122 raidz_expand_pause_point = 4123 ztest_random(RAIDZ_EXPAND_PAUSE_SCRATCH_POST_REFLOW_2) + 1; 4124 scratch_thread = thread_create(NULL, 0, ztest_scratch_thread, 4125 ztest_shared, 0, NULL, TS_RUN | TS_JOINABLE, defclsyspri); 4126 } 4127 4128 error = spa_vdev_attach(spa, pvd->vdev_guid, root, B_FALSE, B_FALSE); 4129 4130 nvlist_free(root); 4131 4132 if (error == EOVERFLOW || error == ENXIO || 4133 error == ZFS_ERR_CHECKPOINT_EXISTS || 4134 error == ZFS_ERR_DISCARDING_CHECKPOINT) 4135 expected_error = error; 4136 4137 if (error != 0 && error != expected_error) { 4138 fatal(0, "raidz attach (%s %"PRIu64") returned %d, expected %d", 4139 newpath, newsize, error, expected_error); 4140 } 4141 4142 if (raidz_expand_pause_point) { 4143 if (error != 0) { 4144 /* 4145 * Do not verify scratch object in case of error 4146 * returned by vdev attaching. 4147 */ 4148 raidz_expand_pause_point = RAIDZ_EXPAND_PAUSE_NONE; 4149 } 4150 4151 VERIFY0(thread_join(scratch_thread)); 4152 } 4153 out: 4154 mutex_exit(&ztest_vdev_lock); 4155 4156 umem_free(newpath, MAXPATHLEN); 4157 } 4158 4159 void 4160 ztest_device_removal(ztest_ds_t *zd, uint64_t id) 4161 { 4162 (void) zd, (void) id; 4163 spa_t *spa = ztest_spa; 4164 vdev_t *vd; 4165 uint64_t guid; 4166 int error; 4167 4168 mutex_enter(&ztest_vdev_lock); 4169 4170 if (ztest_device_removal_active) { 4171 mutex_exit(&ztest_vdev_lock); 4172 return; 4173 } 4174 4175 /* 4176 * Remove a random top-level vdev and wait for removal to finish. 4177 */ 4178 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 4179 vd = vdev_lookup_top(spa, ztest_random_vdev_top(spa, B_FALSE)); 4180 guid = vd->vdev_guid; 4181 spa_config_exit(spa, SCL_VDEV, FTAG); 4182 4183 error = spa_vdev_remove(spa, guid, B_FALSE); 4184 if (error == 0) { 4185 ztest_device_removal_active = B_TRUE; 4186 mutex_exit(&ztest_vdev_lock); 4187 4188 /* 4189 * spa->spa_vdev_removal is created in a sync task that 4190 * is initiated via dsl_sync_task_nowait(). Since the 4191 * task may not run before spa_vdev_remove() returns, we 4192 * must wait at least 1 txg to ensure that the removal 4193 * struct has been created. 4194 */ 4195 txg_wait_synced(spa_get_dsl(spa), 0); 4196 4197 while (spa->spa_removing_phys.sr_state == DSS_SCANNING) 4198 txg_wait_synced(spa_get_dsl(spa), 0); 4199 } else { 4200 mutex_exit(&ztest_vdev_lock); 4201 return; 4202 } 4203 4204 /* 4205 * The pool needs to be scrubbed after completing device removal. 4206 * Failure to do so may result in checksum errors due to the 4207 * strategy employed by ztest_fault_inject() when selecting which 4208 * offset are redundant and can be damaged. 4209 */ 4210 error = spa_scan(spa, POOL_SCAN_SCRUB); 4211 if (error == 0) { 4212 while (dsl_scan_scrubbing(spa_get_dsl(spa))) 4213 txg_wait_synced(spa_get_dsl(spa), 0); 4214 } 4215 4216 mutex_enter(&ztest_vdev_lock); 4217 ztest_device_removal_active = B_FALSE; 4218 mutex_exit(&ztest_vdev_lock); 4219 } 4220 4221 /* 4222 * Callback function which expands the physical size of the vdev. 
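 * The backing file is simply ftruncate()d to the requested size.
 * As with the other tree-walk callbacks, returning NULL means
 * success (keep walking) while returning the vdev aborts the walk.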
4223 */ 4224 static vdev_t * 4225 grow_vdev(vdev_t *vd, void *arg) 4226 { 4227 spa_t *spa __maybe_unused = vd->vdev_spa; 4228 size_t *newsize = arg; 4229 size_t fsize; 4230 int fd; 4231 4232 ASSERT3S(spa_config_held(spa, SCL_STATE, RW_READER), ==, SCL_STATE); 4233 ASSERT(vd->vdev_ops->vdev_op_leaf); 4234 4235 if ((fd = open(vd->vdev_path, O_RDWR)) == -1) 4236 return (vd); 4237 4238 fsize = lseek(fd, 0, SEEK_END); 4239 VERIFY0(ftruncate(fd, *newsize)); 4240 4241 if (ztest_opts.zo_verbose >= 6) { 4242 (void) printf("%s grew from %lu to %lu bytes\n", 4243 vd->vdev_path, (ulong_t)fsize, (ulong_t)*newsize); 4244 } 4245 (void) close(fd); 4246 return (NULL); 4247 } 4248 4249 /* 4250 * Callback function which expands a given vdev by calling vdev_online(). 4251 */ 4252 static vdev_t * 4253 online_vdev(vdev_t *vd, void *arg) 4254 { 4255 (void) arg; 4256 spa_t *spa = vd->vdev_spa; 4257 vdev_t *tvd = vd->vdev_top; 4258 uint64_t guid = vd->vdev_guid; 4259 uint64_t generation = spa->spa_config_generation + 1; 4260 vdev_state_t newstate = VDEV_STATE_UNKNOWN; 4261 int error; 4262 4263 ASSERT3S(spa_config_held(spa, SCL_STATE, RW_READER), ==, SCL_STATE); 4264 ASSERT(vd->vdev_ops->vdev_op_leaf); 4265 4266 /* Calling vdev_online will initialize the new metaslabs */ 4267 spa_config_exit(spa, SCL_STATE, spa); 4268 error = vdev_online(spa, guid, ZFS_ONLINE_EXPAND, &newstate); 4269 spa_config_enter(spa, SCL_STATE, spa, RW_READER); 4270 4271 /* 4272 * If vdev_online returned an error or the underlying vdev_open 4273 * failed then we abort the expand. The only way to know that 4274 * vdev_open fails is by checking the returned newstate. 4275 */ 4276 if (error || newstate != VDEV_STATE_HEALTHY) { 4277 if (ztest_opts.zo_verbose >= 5) { 4278 (void) printf("Unable to expand vdev, state %u, " 4279 "error %d\n", newstate, error); 4280 } 4281 return (vd); 4282 } 4283 ASSERT3U(newstate, ==, VDEV_STATE_HEALTHY); 4284 4285 /* 4286 * Since we dropped the lock we need to ensure that we're 4287 * still talking to the original vdev. It's possible this 4288 * vdev may have been detached/replaced while we were 4289 * trying to online it. 4290 */ 4291 if (generation != spa->spa_config_generation) { 4292 if (ztest_opts.zo_verbose >= 5) { 4293 (void) printf("vdev configuration has changed, " 4294 "guid %"PRIu64", state %"PRIu64", " 4295 "expected gen %"PRIu64", got gen %"PRIu64"\n", 4296 guid, 4297 tvd->vdev_state, 4298 generation, 4299 spa->spa_config_generation); 4300 } 4301 return (vd); 4302 } 4303 return (NULL); 4304 } 4305 4306 /* 4307 * Traverse the vdev tree calling the supplied function. 4308 * We continue to walk the tree until we either have walked all 4309 * children or we receive a non-NULL return from the callback. 4310 * If a NULL callback is passed, then we just return back the first 4311 * leaf vdev we encounter. 4312 */ 4313 static vdev_t * 4314 vdev_walk_tree(vdev_t *vd, vdev_t *(*func)(vdev_t *, void *), void *arg) 4315 { 4316 uint_t c; 4317 4318 if (vd->vdev_ops->vdev_op_leaf) { 4319 if (func == NULL) 4320 return (vd); 4321 else 4322 return (func(vd, arg)); 4323 } 4324 4325 for (c = 0; c < vd->vdev_children; c++) { 4326 vdev_t *cvd = vd->vdev_child[c]; 4327 if ((cvd = vdev_walk_tree(cvd, func, arg)) != NULL) 4328 return (cvd); 4329 } 4330 return (NULL); 4331 } 4332 4333 /* 4334 * Verify that dynamic LUN growth works as expected. 
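 * The test grows the files backing a random top-level vdev
 * (grow_vdev), onlines its leaves with ZFS_ONLINE_EXPAND
 * (online_vdev), and then checks that both the metaslab count and
 * the metaslab class space have increased.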
4335 */ 4336 void 4337 ztest_vdev_LUN_growth(ztest_ds_t *zd, uint64_t id) 4338 { 4339 (void) zd, (void) id; 4340 spa_t *spa = ztest_spa; 4341 vdev_t *vd, *tvd; 4342 metaslab_class_t *mc; 4343 metaslab_group_t *mg; 4344 size_t psize, newsize; 4345 uint64_t top; 4346 uint64_t old_class_space, new_class_space, old_ms_count, new_ms_count; 4347 4348 mutex_enter(&ztest_checkpoint_lock); 4349 mutex_enter(&ztest_vdev_lock); 4350 spa_config_enter(spa, SCL_STATE, spa, RW_READER); 4351 4352 /* 4353 * If there is a vdev removal in progress, it could complete while 4354 * we are running, in which case we would not be able to verify 4355 * that the metaslab_class space increased (because it decreases 4356 * when the device removal completes). 4357 */ 4358 if (ztest_device_removal_active) { 4359 spa_config_exit(spa, SCL_STATE, spa); 4360 mutex_exit(&ztest_vdev_lock); 4361 mutex_exit(&ztest_checkpoint_lock); 4362 return; 4363 } 4364 4365 /* 4366 * If a raidz expansion is in progress, the test can fail because the 4367 * metaslab count will not increase immediately after the vdev is 4368 * expanded; it increases only once the raidz expansion completes. 4369 */ 4370 if (spa->spa_raidz_expand) { 4371 spa_config_exit(spa, SCL_STATE, spa); 4372 mutex_exit(&ztest_vdev_lock); 4373 mutex_exit(&ztest_checkpoint_lock); 4374 return; 4375 } 4376 4377 top = ztest_random_vdev_top(spa, B_TRUE); 4378 4379 tvd = spa->spa_root_vdev->vdev_child[top]; 4380 mg = tvd->vdev_mg; 4381 mc = mg->mg_class; 4382 old_ms_count = tvd->vdev_ms_count; 4383 old_class_space = metaslab_class_get_space(mc); 4384 4385 /* 4386 * Determine the size of the first leaf vdev associated with 4387 * our top-level device. 4388 */ 4389 vd = vdev_walk_tree(tvd, NULL, NULL); 4390 ASSERT3P(vd, !=, NULL); 4391 ASSERT(vd->vdev_ops->vdev_op_leaf); 4392 4393 psize = vd->vdev_psize; 4394 4395 /* 4396 * We only try to expand the vdev if it's healthy, less than 4x its 4397 * original size, and it has a valid psize. 4398 */ 4399 if (tvd->vdev_state != VDEV_STATE_HEALTHY || 4400 psize == 0 || psize >= 4 * ztest_opts.zo_vdev_size) { 4401 spa_config_exit(spa, SCL_STATE, spa); 4402 mutex_exit(&ztest_vdev_lock); 4403 mutex_exit(&ztest_checkpoint_lock); 4404 return; 4405 } 4406 ASSERT3U(psize, >, 0); 4407 newsize = psize + MAX(psize / 8, SPA_MAXBLOCKSIZE); 4408 ASSERT3U(newsize, >, psize); 4409 4410 if (ztest_opts.zo_verbose >= 6) { 4411 (void) printf("Expanding LUN %s from %lu to %lu\n", 4412 vd->vdev_path, (ulong_t)psize, (ulong_t)newsize); 4413 } 4414 4415 /* 4416 * Growing the vdev is a two step process: 4417 * 1). expand the physical size (i.e. relabel) 4418 * 2). online the vdev to create the new metaslabs 4419 */ 4420 if (vdev_walk_tree(tvd, grow_vdev, &newsize) != NULL || 4421 vdev_walk_tree(tvd, online_vdev, NULL) != NULL || 4422 tvd->vdev_state != VDEV_STATE_HEALTHY) { 4423 if (ztest_opts.zo_verbose >= 5) { 4424 (void) printf("Could not expand LUN because " 4425 "the vdev configuration changed.\n"); 4426 } 4427 spa_config_exit(spa, SCL_STATE, spa); 4428 mutex_exit(&ztest_vdev_lock); 4429 mutex_exit(&ztest_checkpoint_lock); 4430 return; 4431 } 4432 4433 spa_config_exit(spa, SCL_STATE, spa); 4434 4435 /* 4436 * Expanding the LUN will update the config asynchronously, 4437 * thus we must wait for the async thread to complete any 4438 * pending tasks before proceeding.
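 * The loop below polls spa_async_thread and spa_async_tasks under
 * spa_async_lock until both are clear.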
4439 */ 4440 for (;;) { 4441 boolean_t done; 4442 mutex_enter(&spa->spa_async_lock); 4443 done = (spa->spa_async_thread == NULL && !spa->spa_async_tasks); 4444 mutex_exit(&spa->spa_async_lock); 4445 if (done) 4446 break; 4447 txg_wait_synced(spa_get_dsl(spa), 0); 4448 (void) poll(NULL, 0, 100); 4449 } 4450 4451 spa_config_enter(spa, SCL_STATE, spa, RW_READER); 4452 4453 tvd = spa->spa_root_vdev->vdev_child[top]; 4454 new_ms_count = tvd->vdev_ms_count; 4455 new_class_space = metaslab_class_get_space(mc); 4456 4457 if (tvd->vdev_mg != mg || mg->mg_class != mc) { 4458 if (ztest_opts.zo_verbose >= 5) { 4459 (void) printf("Could not verify LUN expansion due to " 4460 "intervening vdev offline or remove.\n"); 4461 } 4462 spa_config_exit(spa, SCL_STATE, spa); 4463 mutex_exit(&ztest_vdev_lock); 4464 mutex_exit(&ztest_checkpoint_lock); 4465 return; 4466 } 4467 4468 /* 4469 * Make sure we were able to grow the vdev. 4470 */ 4471 if (new_ms_count <= old_ms_count) { 4472 fatal(B_FALSE, 4473 "LUN expansion failed: ms_count %"PRIu64" < %"PRIu64"\n", 4474 old_ms_count, new_ms_count); 4475 } 4476 4477 /* 4478 * Make sure we were able to grow the pool. 4479 */ 4480 if (new_class_space <= old_class_space) { 4481 fatal(B_FALSE, 4482 "LUN expansion failed: class_space %"PRIu64" < %"PRIu64"\n", 4483 old_class_space, new_class_space); 4484 } 4485 4486 if (ztest_opts.zo_verbose >= 5) { 4487 char oldnumbuf[NN_NUMBUF_SZ], newnumbuf[NN_NUMBUF_SZ]; 4488 4489 nicenum(old_class_space, oldnumbuf, sizeof (oldnumbuf)); 4490 nicenum(new_class_space, newnumbuf, sizeof (newnumbuf)); 4491 (void) printf("%s grew from %s to %s\n", 4492 spa->spa_name, oldnumbuf, newnumbuf); 4493 } 4494 4495 spa_config_exit(spa, SCL_STATE, spa); 4496 mutex_exit(&ztest_vdev_lock); 4497 mutex_exit(&ztest_checkpoint_lock); 4498 } 4499 4500 /* 4501 * Verify that dmu_objset_{create,destroy,open,close} work as expected. 4502 */ 4503 static void 4504 ztest_objset_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx) 4505 { 4506 (void) arg, (void) cr; 4507 4508 /* 4509 * Create the objects common to all ztest datasets. 4510 */ 4511 VERIFY0(zap_create_claim(os, ZTEST_DIROBJ, 4512 DMU_OT_ZAP_OTHER, DMU_OT_NONE, 0, tx)); 4513 } 4514 4515 static int 4516 ztest_dataset_create(char *dsname) 4517 { 4518 int err; 4519 uint64_t rand; 4520 dsl_crypto_params_t *dcp = NULL; 4521 4522 /* 4523 * 50% of the time, we create encrypted datasets 4524 * using a random cipher suite and a hard-coded 4525 * wrapping key. 4526 */ 4527 rand = ztest_random(2); 4528 if (rand != 0) { 4529 nvlist_t *crypto_args = fnvlist_alloc(); 4530 nvlist_t *props = fnvlist_alloc(); 4531 4532 /* slight bias towards the default cipher suite */ 4533 rand = ztest_random(ZIO_CRYPT_FUNCTIONS); 4534 if (rand < ZIO_CRYPT_AES_128_CCM) 4535 rand = ZIO_CRYPT_ON; 4536 4537 fnvlist_add_uint64(props, 4538 zfs_prop_to_name(ZFS_PROP_ENCRYPTION), rand); 4539 fnvlist_add_uint8_array(crypto_args, "wkeydata", 4540 (uint8_t *)ztest_wkeydata, WRAPPING_KEY_LEN); 4541 4542 /* 4543 * These parameters aren't really used by the kernel. They 4544 * are simply stored so that userspace knows how to load 4545 * the wrapping key. 
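 * Here keyformat=raw, keylocation=prompt, and zeroed PBKDF2
 * salt/iterations are recorded purely as bookkeeping for the
 * hard-coded raw wrapping key used by ztest.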
4546 */ 4547 fnvlist_add_uint64(props, 4548 zfs_prop_to_name(ZFS_PROP_KEYFORMAT), ZFS_KEYFORMAT_RAW); 4549 fnvlist_add_string(props, 4550 zfs_prop_to_name(ZFS_PROP_KEYLOCATION), "prompt"); 4551 fnvlist_add_uint64(props, 4552 zfs_prop_to_name(ZFS_PROP_PBKDF2_SALT), 0ULL); 4553 fnvlist_add_uint64(props, 4554 zfs_prop_to_name(ZFS_PROP_PBKDF2_ITERS), 0ULL); 4555 4556 VERIFY0(dsl_crypto_params_create_nvlist(DCP_CMD_NONE, props, 4557 crypto_args, &dcp)); 4558 4559 /* 4560 * Cycle through all available encryption implementations 4561 * to verify interoperability. 4562 */ 4563 VERIFY0(gcm_impl_set("cycle")); 4564 VERIFY0(aes_impl_set("cycle")); 4565 4566 fnvlist_free(crypto_args); 4567 fnvlist_free(props); 4568 } 4569 4570 err = dmu_objset_create(dsname, DMU_OST_OTHER, 0, dcp, 4571 ztest_objset_create_cb, NULL); 4572 dsl_crypto_params_free(dcp, !!err); 4573 4574 rand = ztest_random(100); 4575 if (err || rand < 80) 4576 return (err); 4577 4578 if (ztest_opts.zo_verbose >= 5) 4579 (void) printf("Setting dataset %s to sync always\n", dsname); 4580 return (ztest_dsl_prop_set_uint64(dsname, ZFS_PROP_SYNC, 4581 ZFS_SYNC_ALWAYS, B_FALSE)); 4582 } 4583 4584 static int 4585 ztest_objset_destroy_cb(const char *name, void *arg) 4586 { 4587 (void) arg; 4588 objset_t *os; 4589 dmu_object_info_t doi; 4590 int error; 4591 4592 /* 4593 * Verify that the dataset contains a directory object. 4594 */ 4595 VERIFY0(ztest_dmu_objset_own(name, DMU_OST_OTHER, B_TRUE, 4596 B_TRUE, FTAG, &os)); 4597 error = dmu_object_info(os, ZTEST_DIROBJ, &doi); 4598 if (error != ENOENT) { 4599 /* We could have crashed in the middle of destroying it */ 4600 ASSERT0(error); 4601 ASSERT3U(doi.doi_type, ==, DMU_OT_ZAP_OTHER); 4602 ASSERT3S(doi.doi_physical_blocks_512, >=, 0); 4603 } 4604 dmu_objset_disown(os, B_TRUE, FTAG); 4605 4606 /* 4607 * Destroy the dataset. 4608 */ 4609 if (strchr(name, '@') != NULL) { 4610 error = dsl_destroy_snapshot(name, B_TRUE); 4611 if (error != ECHRNG) { 4612 /* 4613 * The program was executed, but encountered a runtime 4614 * error, such as insufficient slop, or a hold on the 4615 * dataset. 
4616 */ 4617 ASSERT0(error); 4618 } 4619 } else { 4620 error = dsl_destroy_head(name); 4621 if (error == ENOSPC) { 4622 /* There could be checkpoint or insufficient slop */ 4623 ztest_record_enospc(FTAG); 4624 } else if (error != EBUSY) { 4625 /* There could be a hold on this dataset */ 4626 ASSERT0(error); 4627 } 4628 } 4629 return (0); 4630 } 4631 4632 static boolean_t 4633 ztest_snapshot_create(char *osname, uint64_t id) 4634 { 4635 char snapname[ZFS_MAX_DATASET_NAME_LEN]; 4636 int error; 4637 4638 (void) snprintf(snapname, sizeof (snapname), "%"PRIu64"", id); 4639 4640 error = dmu_objset_snapshot_one(osname, snapname); 4641 if (error == ENOSPC) { 4642 ztest_record_enospc(FTAG); 4643 return (B_FALSE); 4644 } 4645 if (error != 0 && error != EEXIST && error != ECHRNG) { 4646 fatal(B_FALSE, "ztest_snapshot_create(%s@%s) = %d", osname, 4647 snapname, error); 4648 } 4649 return (B_TRUE); 4650 } 4651 4652 static boolean_t 4653 ztest_snapshot_destroy(char *osname, uint64_t id) 4654 { 4655 char snapname[ZFS_MAX_DATASET_NAME_LEN]; 4656 int error; 4657 4658 (void) snprintf(snapname, sizeof (snapname), "%s@%"PRIu64"", 4659 osname, id); 4660 4661 error = dsl_destroy_snapshot(snapname, B_FALSE); 4662 if (error != 0 && error != ENOENT && error != ECHRNG) 4663 fatal(B_FALSE, "ztest_snapshot_destroy(%s) = %d", 4664 snapname, error); 4665 return (B_TRUE); 4666 } 4667 4668 void 4669 ztest_dmu_objset_create_destroy(ztest_ds_t *zd, uint64_t id) 4670 { 4671 (void) zd; 4672 ztest_ds_t *zdtmp; 4673 int iters; 4674 int error; 4675 objset_t *os, *os2; 4676 char name[ZFS_MAX_DATASET_NAME_LEN]; 4677 zilog_t *zilog; 4678 int i; 4679 4680 zdtmp = umem_alloc(sizeof (ztest_ds_t), UMEM_NOFAIL); 4681 4682 (void) pthread_rwlock_rdlock(&ztest_name_lock); 4683 4684 (void) snprintf(name, sizeof (name), "%s/temp_%"PRIu64"", 4685 ztest_opts.zo_pool, id); 4686 4687 /* 4688 * If this dataset exists from a previous run, process its replay log 4689 * half of the time. If we don't replay it, then dsl_destroy_head() 4690 * (invoked from ztest_objset_destroy_cb()) should just throw it away. 4691 */ 4692 if (ztest_random(2) == 0 && 4693 ztest_dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, 4694 B_TRUE, FTAG, &os) == 0) { 4695 ztest_zd_init(zdtmp, NULL, os); 4696 zil_replay(os, zdtmp, ztest_replay_vector); 4697 ztest_zd_fini(zdtmp); 4698 dmu_objset_disown(os, B_TRUE, FTAG); 4699 } 4700 4701 /* 4702 * There may be an old instance of the dataset we're about to 4703 * create lying around from a previous run. If so, destroy it 4704 * and all of its snapshots. 4705 */ 4706 (void) dmu_objset_find(name, ztest_objset_destroy_cb, NULL, 4707 DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS); 4708 4709 /* 4710 * Verify that the destroyed dataset is no longer in the namespace. 4711 * It may still be present if the destroy above fails with ENOSPC. 4712 */ 4713 error = ztest_dmu_objset_own(name, DMU_OST_OTHER, B_TRUE, B_TRUE, 4714 FTAG, &os); 4715 if (error == 0) { 4716 dmu_objset_disown(os, B_TRUE, FTAG); 4717 ztest_record_enospc(FTAG); 4718 goto out; 4719 } 4720 VERIFY3U(ENOENT, ==, error); 4721 4722 /* 4723 * Verify that we can create a new dataset. 4724 */ 4725 error = ztest_dataset_create(name); 4726 if (error) { 4727 if (error == ENOSPC) { 4728 ztest_record_enospc(FTAG); 4729 goto out; 4730 } 4731 fatal(B_FALSE, "dmu_objset_create(%s) = %d", name, error); 4732 } 4733 4734 VERIFY0(ztest_dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, B_TRUE, 4735 FTAG, &os)); 4736 4737 ztest_zd_init(zdtmp, NULL, os); 4738 4739 /* 4740 * Open the intent log for it. 
4741 */ 4742 zilog = zil_open(os, ztest_get_data, NULL); 4743 4744 /* 4745 * Put some objects in there, do a little I/O to them, 4746 * and randomly take a couple of snapshots along the way. 4747 */ 4748 iters = ztest_random(5); 4749 for (i = 0; i < iters; i++) { 4750 ztest_dmu_object_alloc_free(zdtmp, id); 4751 if (ztest_random(iters) == 0) 4752 (void) ztest_snapshot_create(name, i); 4753 } 4754 4755 /* 4756 * Verify that we cannot create an existing dataset. 4757 */ 4758 VERIFY3U(EEXIST, ==, 4759 dmu_objset_create(name, DMU_OST_OTHER, 0, NULL, NULL, NULL)); 4760 4761 /* 4762 * Verify that we can hold an objset that is also owned. 4763 */ 4764 VERIFY0(dmu_objset_hold(name, FTAG, &os2)); 4765 dmu_objset_rele(os2, FTAG); 4766 4767 /* 4768 * Verify that we cannot own an objset that is already owned. 4769 */ 4770 VERIFY3U(EBUSY, ==, ztest_dmu_objset_own(name, DMU_OST_OTHER, 4771 B_FALSE, B_TRUE, FTAG, &os2)); 4772 4773 zil_close(zilog); 4774 dmu_objset_disown(os, B_TRUE, FTAG); 4775 ztest_zd_fini(zdtmp); 4776 out: 4777 (void) pthread_rwlock_unlock(&ztest_name_lock); 4778 4779 umem_free(zdtmp, sizeof (ztest_ds_t)); 4780 } 4781 4782 /* 4783 * Verify that dmu_snapshot_{create,destroy,open,close} work as expected. 4784 */ 4785 void 4786 ztest_dmu_snapshot_create_destroy(ztest_ds_t *zd, uint64_t id) 4787 { 4788 (void) pthread_rwlock_rdlock(&ztest_name_lock); 4789 (void) ztest_snapshot_destroy(zd->zd_name, id); 4790 (void) ztest_snapshot_create(zd->zd_name, id); 4791 (void) pthread_rwlock_unlock(&ztest_name_lock); 4792 } 4793 4794 /* 4795 * Cleanup non-standard snapshots and clones. 4796 */ 4797 static void 4798 ztest_dsl_dataset_cleanup(char *osname, uint64_t id) 4799 { 4800 char *snap1name; 4801 char *clone1name; 4802 char *snap2name; 4803 char *clone2name; 4804 char *snap3name; 4805 int error; 4806 4807 snap1name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4808 clone1name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4809 snap2name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4810 clone2name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4811 snap3name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4812 4813 (void) snprintf(snap1name, ZFS_MAX_DATASET_NAME_LEN, "%s@s1_%"PRIu64"", 4814 osname, id); 4815 (void) snprintf(clone1name, ZFS_MAX_DATASET_NAME_LEN, "%s/c1_%"PRIu64"", 4816 osname, id); 4817 (void) snprintf(snap2name, ZFS_MAX_DATASET_NAME_LEN, "%s@s2_%"PRIu64"", 4818 clone1name, id); 4819 (void) snprintf(clone2name, ZFS_MAX_DATASET_NAME_LEN, "%s/c2_%"PRIu64"", 4820 osname, id); 4821 (void) snprintf(snap3name, ZFS_MAX_DATASET_NAME_LEN, "%s@s3_%"PRIu64"", 4822 clone1name, id); 4823 4824 error = dsl_destroy_head(clone2name); 4825 if (error && error != ENOENT) 4826 fatal(B_FALSE, "dsl_destroy_head(%s) = %d", clone2name, error); 4827 error = dsl_destroy_snapshot(snap3name, B_FALSE); 4828 if (error && error != ENOENT) 4829 fatal(B_FALSE, "dsl_destroy_snapshot(%s) = %d", 4830 snap3name, error); 4831 error = dsl_destroy_snapshot(snap2name, B_FALSE); 4832 if (error && error != ENOENT) 4833 fatal(B_FALSE, "dsl_destroy_snapshot(%s) = %d", 4834 snap2name, error); 4835 error = dsl_destroy_head(clone1name); 4836 if (error && error != ENOENT) 4837 fatal(B_FALSE, "dsl_destroy_head(%s) = %d", clone1name, error); 4838 error = dsl_destroy_snapshot(snap1name, B_FALSE); 4839 if (error && error != ENOENT) 4840 fatal(B_FALSE, "dsl_destroy_snapshot(%s) = %d", 4841 snap1name, error); 4842 4843 umem_free(snap1name, ZFS_MAX_DATASET_NAME_LEN); 4844 umem_free(clone1name, 
ZFS_MAX_DATASET_NAME_LEN); 4845 umem_free(snap2name, ZFS_MAX_DATASET_NAME_LEN); 4846 umem_free(clone2name, ZFS_MAX_DATASET_NAME_LEN); 4847 umem_free(snap3name, ZFS_MAX_DATASET_NAME_LEN); 4848 } 4849 4850 /* 4851 * Verify dsl_dataset_promote handles EBUSY 4852 */ 4853 void 4854 ztest_dsl_dataset_promote_busy(ztest_ds_t *zd, uint64_t id) 4855 { 4856 objset_t *os; 4857 char *snap1name; 4858 char *clone1name; 4859 char *snap2name; 4860 char *clone2name; 4861 char *snap3name; 4862 char *osname = zd->zd_name; 4863 int error; 4864 4865 snap1name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4866 clone1name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4867 snap2name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4868 clone2name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4869 snap3name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4870 4871 (void) pthread_rwlock_rdlock(&ztest_name_lock); 4872 4873 ztest_dsl_dataset_cleanup(osname, id); 4874 4875 (void) snprintf(snap1name, ZFS_MAX_DATASET_NAME_LEN, "%s@s1_%"PRIu64"", 4876 osname, id); 4877 (void) snprintf(clone1name, ZFS_MAX_DATASET_NAME_LEN, "%s/c1_%"PRIu64"", 4878 osname, id); 4879 (void) snprintf(snap2name, ZFS_MAX_DATASET_NAME_LEN, "%s@s2_%"PRIu64"", 4880 clone1name, id); 4881 (void) snprintf(clone2name, ZFS_MAX_DATASET_NAME_LEN, "%s/c2_%"PRIu64"", 4882 osname, id); 4883 (void) snprintf(snap3name, ZFS_MAX_DATASET_NAME_LEN, "%s@s3_%"PRIu64"", 4884 clone1name, id); 4885 4886 error = dmu_objset_snapshot_one(osname, strchr(snap1name, '@') + 1); 4887 if (error && error != EEXIST) { 4888 if (error == ENOSPC) { 4889 ztest_record_enospc(FTAG); 4890 goto out; 4891 } 4892 fatal(B_FALSE, "dmu_take_snapshot(%s) = %d", snap1name, error); 4893 } 4894 4895 error = dmu_objset_clone(clone1name, snap1name); 4896 if (error) { 4897 if (error == ENOSPC) { 4898 ztest_record_enospc(FTAG); 4899 goto out; 4900 } 4901 fatal(B_FALSE, "dmu_objset_create(%s) = %d", clone1name, error); 4902 } 4903 4904 error = dmu_objset_snapshot_one(clone1name, strchr(snap2name, '@') + 1); 4905 if (error && error != EEXIST) { 4906 if (error == ENOSPC) { 4907 ztest_record_enospc(FTAG); 4908 goto out; 4909 } 4910 fatal(B_FALSE, "dmu_open_snapshot(%s) = %d", snap2name, error); 4911 } 4912 4913 error = dmu_objset_snapshot_one(clone1name, strchr(snap3name, '@') + 1); 4914 if (error && error != EEXIST) { 4915 if (error == ENOSPC) { 4916 ztest_record_enospc(FTAG); 4917 goto out; 4918 } 4919 fatal(B_FALSE, "dmu_open_snapshot(%s) = %d", snap3name, error); 4920 } 4921 4922 error = dmu_objset_clone(clone2name, snap3name); 4923 if (error) { 4924 if (error == ENOSPC) { 4925 ztest_record_enospc(FTAG); 4926 goto out; 4927 } 4928 fatal(B_FALSE, "dmu_objset_create(%s) = %d", clone2name, error); 4929 } 4930 4931 error = ztest_dmu_objset_own(snap2name, DMU_OST_ANY, B_TRUE, B_TRUE, 4932 FTAG, &os); 4933 if (error) 4934 fatal(B_FALSE, "dmu_objset_own(%s) = %d", snap2name, error); 4935 error = dsl_dataset_promote(clone2name, NULL); 4936 if (error == ENOSPC) { 4937 dmu_objset_disown(os, B_TRUE, FTAG); 4938 ztest_record_enospc(FTAG); 4939 goto out; 4940 } 4941 if (error != EBUSY) 4942 fatal(B_FALSE, "dsl_dataset_promote(%s), %d, not EBUSY", 4943 clone2name, error); 4944 dmu_objset_disown(os, B_TRUE, FTAG); 4945 4946 out: 4947 ztest_dsl_dataset_cleanup(osname, id); 4948 4949 (void) pthread_rwlock_unlock(&ztest_name_lock); 4950 4951 umem_free(snap1name, ZFS_MAX_DATASET_NAME_LEN); 4952 umem_free(clone1name, ZFS_MAX_DATASET_NAME_LEN); 4953 umem_free(snap2name, 
ZFS_MAX_DATASET_NAME_LEN); 4954 umem_free(clone2name, ZFS_MAX_DATASET_NAME_LEN); 4955 umem_free(snap3name, ZFS_MAX_DATASET_NAME_LEN); 4956 } 4957 4958 #undef OD_ARRAY_SIZE 4959 #define OD_ARRAY_SIZE 4 4960 4961 /* 4962 * Verify that dmu_object_{alloc,free} work as expected. 4963 */ 4964 void 4965 ztest_dmu_object_alloc_free(ztest_ds_t *zd, uint64_t id) 4966 { 4967 ztest_od_t *od; 4968 int batchsize; 4969 int size; 4970 int b; 4971 4972 size = sizeof (ztest_od_t) * OD_ARRAY_SIZE; 4973 od = umem_alloc(size, UMEM_NOFAIL); 4974 batchsize = OD_ARRAY_SIZE; 4975 4976 for (b = 0; b < batchsize; b++) 4977 ztest_od_init(od + b, id, FTAG, b, DMU_OT_UINT64_OTHER, 4978 0, 0, 0); 4979 4980 /* 4981 * Destroy the previous batch of objects, create a new batch, 4982 * and do some I/O on the new objects. 4983 */ 4984 if (ztest_object_init(zd, od, size, B_TRUE) != 0) { 4985 zd->zd_od = NULL; 4986 umem_free(od, size); 4987 return; 4988 } 4989 4990 while (ztest_random(4 * batchsize) != 0) 4991 ztest_io(zd, od[ztest_random(batchsize)].od_object, 4992 ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT); 4993 4994 umem_free(od, size); 4995 } 4996 4997 /* 4998 * Rewind the global allocator to verify object allocation backfilling. 4999 */ 5000 void 5001 ztest_dmu_object_next_chunk(ztest_ds_t *zd, uint64_t id) 5002 { 5003 (void) id; 5004 objset_t *os = zd->zd_os; 5005 uint_t dnodes_per_chunk = 1 << dmu_object_alloc_chunk_shift; 5006 uint64_t object; 5007 5008 /* 5009 * Rewind the global allocator randomly back to a lower object number 5010 * to force backfilling and reclamation of recently freed dnodes. 5011 */ 5012 mutex_enter(&os->os_obj_lock); 5013 object = ztest_random(os->os_obj_next_chunk); 5014 os->os_obj_next_chunk = P2ALIGN_TYPED(object, dnodes_per_chunk, 5015 uint64_t); 5016 mutex_exit(&os->os_obj_lock); 5017 } 5018 5019 #undef OD_ARRAY_SIZE 5020 #define OD_ARRAY_SIZE 2 5021 5022 /* 5023 * Verify that dmu_{read,write} work as expected. 5024 */ 5025 void 5026 ztest_dmu_read_write(ztest_ds_t *zd, uint64_t id) 5027 { 5028 int size; 5029 ztest_od_t *od; 5030 5031 objset_t *os = zd->zd_os; 5032 size = sizeof (ztest_od_t) * OD_ARRAY_SIZE; 5033 od = umem_alloc(size, UMEM_NOFAIL); 5034 dmu_tx_t *tx; 5035 int freeit, error; 5036 uint64_t i, n, s, txg; 5037 bufwad_t *packbuf, *bigbuf, *pack, *bigH, *bigT; 5038 uint64_t packobj, packoff, packsize, bigobj, bigoff, bigsize; 5039 uint64_t chunksize = (1000 + ztest_random(1000)) * sizeof (uint64_t); 5040 uint64_t regions = 997; 5041 uint64_t stride = 123456789ULL; 5042 uint64_t width = 40; 5043 int free_percent = 5; 5044 5045 /* 5046 * This test uses two objects, packobj and bigobj, that are always 5047 * updated together (i.e. in the same tx) so that their contents are 5048 * in sync and can be compared. Their contents relate to each other 5049 * in a simple way: packobj is a dense array of 'bufwad' structures, 5050 * while bigobj is a sparse array of the same bufwads. Specifically, 5051 * for any index n, there are three bufwads that should be identical: 5052 * 5053 * packobj, at offset n * sizeof (bufwad_t) 5054 * bigobj, at the head of the nth chunk 5055 * bigobj, at the tail of the nth chunk 5056 * 5057 * The chunk size is arbitrary. It doesn't have to be a power of two, 5058 * and it doesn't have any relation to the object blocksize. 5059 * The only requirement is that it can hold at least two bufwads. 5060 * 5061 * Normally, we write the bufwad to each of these locations. 
5062 * However, free_percent of the time we instead write zeroes to 5063 * packobj and perform a dmu_free_range() on bigobj. By comparing 5064 * bigobj to packobj, we can verify that the DMU is correctly 5065 * tracking which parts of an object are allocated and free, 5066 * and that the contents of the allocated blocks are correct. 5067 */ 5068 5069 /* 5070 * Read the directory info. If it's the first time, set things up. 5071 */ 5072 ztest_od_init(od, id, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0, chunksize); 5073 ztest_od_init(od + 1, id, FTAG, 1, DMU_OT_UINT64_OTHER, 0, 0, 5074 chunksize); 5075 5076 if (ztest_object_init(zd, od, size, B_FALSE) != 0) { 5077 umem_free(od, size); 5078 return; 5079 } 5080 5081 bigobj = od[0].od_object; 5082 packobj = od[1].od_object; 5083 chunksize = od[0].od_gen; 5084 ASSERT3U(chunksize, ==, od[1].od_gen); 5085 5086 /* 5087 * Prefetch a random chunk of the big object. 5088 * Our aim here is to get some async reads in flight 5089 * for blocks that we may free below; the DMU should 5090 * handle this race correctly. 5091 */ 5092 n = ztest_random(regions) * stride + ztest_random(width); 5093 s = 1 + ztest_random(2 * width - 1); 5094 dmu_prefetch(os, bigobj, 0, n * chunksize, s * chunksize, 5095 ZIO_PRIORITY_SYNC_READ); 5096 5097 /* 5098 * Pick a random index and compute the offsets into packobj and bigobj. 5099 */ 5100 n = ztest_random(regions) * stride + ztest_random(width); 5101 s = 1 + ztest_random(width - 1); 5102 5103 packoff = n * sizeof (bufwad_t); 5104 packsize = s * sizeof (bufwad_t); 5105 5106 bigoff = n * chunksize; 5107 bigsize = s * chunksize; 5108 5109 packbuf = umem_alloc(packsize, UMEM_NOFAIL); 5110 bigbuf = umem_alloc(bigsize, UMEM_NOFAIL); 5111 5112 /* 5113 * free_percent of the time, free a range of bigobj rather than 5114 * overwriting it. 5115 */ 5116 freeit = (ztest_random(100) < free_percent); 5117 5118 /* 5119 * Read the current contents of our objects. 5120 */ 5121 error = dmu_read(os, packobj, packoff, packsize, packbuf, 5122 DMU_READ_PREFETCH); 5123 ASSERT0(error); 5124 error = dmu_read(os, bigobj, bigoff, bigsize, bigbuf, 5125 DMU_READ_PREFETCH); 5126 ASSERT0(error); 5127 5128 /* 5129 * Get a tx for the mods to both packobj and bigobj. 5130 */ 5131 tx = dmu_tx_create(os); 5132 5133 dmu_tx_hold_write(tx, packobj, packoff, packsize); 5134 5135 if (freeit) 5136 dmu_tx_hold_free(tx, bigobj, bigoff, bigsize); 5137 else 5138 dmu_tx_hold_write(tx, bigobj, bigoff, bigsize); 5139 5140 /* This accounts for setting the checksum/compression. */ 5141 dmu_tx_hold_bonus(tx, bigobj); 5142 5143 txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); 5144 if (txg == 0) { 5145 umem_free(packbuf, packsize); 5146 umem_free(bigbuf, bigsize); 5147 umem_free(od, size); 5148 return; 5149 } 5150 5151 enum zio_checksum cksum; 5152 do { 5153 cksum = (enum zio_checksum) 5154 ztest_random_dsl_prop(ZFS_PROP_CHECKSUM); 5155 } while (cksum >= ZIO_CHECKSUM_LEGACY_FUNCTIONS); 5156 dmu_object_set_checksum(os, bigobj, cksum, tx); 5157 5158 enum zio_compress comp; 5159 do { 5160 comp = (enum zio_compress) 5161 ztest_random_dsl_prop(ZFS_PROP_COMPRESSION); 5162 } while (comp >= ZIO_COMPRESS_LEGACY_FUNCTIONS); 5163 dmu_object_set_compress(os, bigobj, comp, tx); 5164 5165 /* 5166 * For each index from n to n + s, verify that the existing bufwad 5167 * in packobj matches the bufwads at the head and tail of the 5168 * corresponding chunk in bigobj. Then update all three bufwads 5169 * with the new values we want to write out. 
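 * A bufwad whose bw_data is zero marks a region that was previously
 * freed (or never written); the index check is skipped for it, but
 * the head and tail copies must still match packobj exactly.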
5170 */ 5171 for (i = 0; i < s; i++) { 5172 /* LINTED */ 5173 pack = (bufwad_t *)((char *)packbuf + i * sizeof (bufwad_t)); 5174 /* LINTED */ 5175 bigH = (bufwad_t *)((char *)bigbuf + i * chunksize); 5176 /* LINTED */ 5177 bigT = (bufwad_t *)((char *)bigH + chunksize) - 1; 5178 5179 ASSERT3U((uintptr_t)bigH - (uintptr_t)bigbuf, <, bigsize); 5180 ASSERT3U((uintptr_t)bigT - (uintptr_t)bigbuf, <, bigsize); 5181 5182 if (pack->bw_txg > txg) 5183 fatal(B_FALSE, 5184 "future leak: got %"PRIx64", open txg is %"PRIx64"", 5185 pack->bw_txg, txg); 5186 5187 if (pack->bw_data != 0 && pack->bw_index != n + i) 5188 fatal(B_FALSE, "wrong index: " 5189 "got %"PRIx64", wanted %"PRIx64"+%"PRIx64"", 5190 pack->bw_index, n, i); 5191 5192 if (memcmp(pack, bigH, sizeof (bufwad_t)) != 0) 5193 fatal(B_FALSE, "pack/bigH mismatch in %p/%p", 5194 pack, bigH); 5195 5196 if (memcmp(pack, bigT, sizeof (bufwad_t)) != 0) 5197 fatal(B_FALSE, "pack/bigT mismatch in %p/%p", 5198 pack, bigT); 5199 5200 if (freeit) { 5201 memset(pack, 0, sizeof (bufwad_t)); 5202 } else { 5203 pack->bw_index = n + i; 5204 pack->bw_txg = txg; 5205 pack->bw_data = 1 + ztest_random(-2ULL); 5206 } 5207 *bigH = *pack; 5208 *bigT = *pack; 5209 } 5210 5211 /* 5212 * We've verified all the old bufwads, and made new ones. 5213 * Now write them out. 5214 */ 5215 dmu_write(os, packobj, packoff, packsize, packbuf, tx); 5216 5217 if (freeit) { 5218 if (ztest_opts.zo_verbose >= 7) { 5219 (void) printf("freeing offset %"PRIx64" size %"PRIx64"" 5220 " txg %"PRIx64"\n", 5221 bigoff, bigsize, txg); 5222 } 5223 VERIFY0(dmu_free_range(os, bigobj, bigoff, bigsize, tx)); 5224 } else { 5225 if (ztest_opts.zo_verbose >= 7) { 5226 (void) printf("writing offset %"PRIx64" size %"PRIx64"" 5227 " txg %"PRIx64"\n", 5228 bigoff, bigsize, txg); 5229 } 5230 dmu_write(os, bigobj, bigoff, bigsize, bigbuf, tx); 5231 } 5232 5233 dmu_tx_commit(tx); 5234 5235 /* 5236 * Sanity check the stuff we just wrote. 5237 */ 5238 { 5239 void *packcheck = umem_alloc(packsize, UMEM_NOFAIL); 5240 void *bigcheck = umem_alloc(bigsize, UMEM_NOFAIL); 5241 5242 VERIFY0(dmu_read(os, packobj, packoff, 5243 packsize, packcheck, DMU_READ_PREFETCH)); 5244 VERIFY0(dmu_read(os, bigobj, bigoff, 5245 bigsize, bigcheck, DMU_READ_PREFETCH)); 5246 5247 ASSERT0(memcmp(packbuf, packcheck, packsize)); 5248 ASSERT0(memcmp(bigbuf, bigcheck, bigsize)); 5249 5250 umem_free(packcheck, packsize); 5251 umem_free(bigcheck, bigsize); 5252 } 5253 5254 umem_free(packbuf, packsize); 5255 umem_free(bigbuf, bigsize); 5256 umem_free(od, size); 5257 } 5258 5259 static void 5260 compare_and_update_pbbufs(uint64_t s, bufwad_t *packbuf, bufwad_t *bigbuf, 5261 uint64_t bigsize, uint64_t n, uint64_t chunksize, uint64_t txg) 5262 { 5263 uint64_t i; 5264 bufwad_t *pack; 5265 bufwad_t *bigH; 5266 bufwad_t *bigT; 5267 5268 /* 5269 * For each index from n to n + s, verify that the existing bufwad 5270 * in packobj matches the bufwads at the head and tail of the 5271 * corresponding chunk in bigobj. Then update all three bufwads 5272 * with the new values we want to write out. 
5273 */ 5274 for (i = 0; i < s; i++) { 5275 /* LINTED */ 5276 pack = (bufwad_t *)((char *)packbuf + i * sizeof (bufwad_t)); 5277 /* LINTED */ 5278 bigH = (bufwad_t *)((char *)bigbuf + i * chunksize); 5279 /* LINTED */ 5280 bigT = (bufwad_t *)((char *)bigH + chunksize) - 1; 5281 5282 ASSERT3U((uintptr_t)bigH - (uintptr_t)bigbuf, <, bigsize); 5283 ASSERT3U((uintptr_t)bigT - (uintptr_t)bigbuf, <, bigsize); 5284 5285 if (pack->bw_txg > txg) 5286 fatal(B_FALSE, 5287 "future leak: got %"PRIx64", open txg is %"PRIx64"", 5288 pack->bw_txg, txg); 5289 5290 if (pack->bw_data != 0 && pack->bw_index != n + i) 5291 fatal(B_FALSE, "wrong index: " 5292 "got %"PRIx64", wanted %"PRIx64"+%"PRIx64"", 5293 pack->bw_index, n, i); 5294 5295 if (memcmp(pack, bigH, sizeof (bufwad_t)) != 0) 5296 fatal(B_FALSE, "pack/bigH mismatch in %p/%p", 5297 pack, bigH); 5298 5299 if (memcmp(pack, bigT, sizeof (bufwad_t)) != 0) 5300 fatal(B_FALSE, "pack/bigT mismatch in %p/%p", 5301 pack, bigT); 5302 5303 pack->bw_index = n + i; 5304 pack->bw_txg = txg; 5305 pack->bw_data = 1 + ztest_random(-2ULL); 5306 5307 *bigH = *pack; 5308 *bigT = *pack; 5309 } 5310 } 5311 5312 #undef OD_ARRAY_SIZE 5313 #define OD_ARRAY_SIZE 2 5314 5315 void 5316 ztest_dmu_read_write_zcopy(ztest_ds_t *zd, uint64_t id) 5317 { 5318 objset_t *os = zd->zd_os; 5319 ztest_od_t *od; 5320 dmu_tx_t *tx; 5321 uint64_t i; 5322 int error; 5323 int size; 5324 uint64_t n, s, txg; 5325 bufwad_t *packbuf, *bigbuf; 5326 uint64_t packobj, packoff, packsize, bigobj, bigoff, bigsize; 5327 uint64_t blocksize = ztest_random_blocksize(); 5328 uint64_t chunksize = blocksize; 5329 uint64_t regions = 997; 5330 uint64_t stride = 123456789ULL; 5331 uint64_t width = 9; 5332 dmu_buf_t *bonus_db; 5333 arc_buf_t **bigbuf_arcbufs; 5334 dmu_object_info_t doi; 5335 5336 size = sizeof (ztest_od_t) * OD_ARRAY_SIZE; 5337 od = umem_alloc(size, UMEM_NOFAIL); 5338 5339 /* 5340 * This test uses two objects, packobj and bigobj, that are always 5341 * updated together (i.e. in the same tx) so that their contents are 5342 * in sync and can be compared. Their contents relate to each other 5343 * in a simple way: packobj is a dense array of 'bufwad' structures, 5344 * while bigobj is a sparse array of the same bufwads. Specifically, 5345 * for any index n, there are three bufwads that should be identical: 5346 * 5347 * packobj, at offset n * sizeof (bufwad_t) 5348 * bigobj, at the head of the nth chunk 5349 * bigobj, at the tail of the nth chunk 5350 * 5351 * The chunk size is set equal to bigobj block size so that 5352 * dmu_assign_arcbuf_by_dbuf() can be tested for object updates. 5353 */ 5354 5355 /* 5356 * Read the directory info. If it's the first time, set things up. 5357 */ 5358 ztest_od_init(od, id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, 0, 0); 5359 ztest_od_init(od + 1, id, FTAG, 1, DMU_OT_UINT64_OTHER, 0, 0, 5360 chunksize); 5361 5362 5363 if (ztest_object_init(zd, od, size, B_FALSE) != 0) { 5364 umem_free(od, size); 5365 return; 5366 } 5367 5368 bigobj = od[0].od_object; 5369 packobj = od[1].od_object; 5370 blocksize = od[0].od_blocksize; 5371 chunksize = blocksize; 5372 ASSERT3U(chunksize, ==, od[1].od_gen); 5373 5374 VERIFY0(dmu_object_info(os, bigobj, &doi)); 5375 VERIFY(ISP2(doi.doi_data_block_size)); 5376 VERIFY3U(chunksize, ==, doi.doi_data_block_size); 5377 VERIFY3U(chunksize, >=, 2 * sizeof (bufwad_t)); 5378 5379 /* 5380 * Pick a random index and compute the offsets into packobj and bigobj. 
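 * For index n, the bufwad lives at offset n * sizeof (bufwad_t) in
 * packobj and at offsets n * chunksize (head) and
 * (n + 1) * chunksize - sizeof (bufwad_t) (tail) in bigobj.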
5381 */ 5382 n = ztest_random(regions) * stride + ztest_random(width); 5383 s = 1 + ztest_random(width - 1); 5384 5385 packoff = n * sizeof (bufwad_t); 5386 packsize = s * sizeof (bufwad_t); 5387 5388 bigoff = n * chunksize; 5389 bigsize = s * chunksize; 5390 5391 packbuf = umem_zalloc(packsize, UMEM_NOFAIL); 5392 bigbuf = umem_zalloc(bigsize, UMEM_NOFAIL); 5393 5394 VERIFY0(dmu_bonus_hold(os, bigobj, FTAG, &bonus_db)); 5395 5396 bigbuf_arcbufs = umem_zalloc(2 * s * sizeof (arc_buf_t *), UMEM_NOFAIL); 5397 5398 /* 5399 * Iteration 0 test zcopy for DB_UNCACHED dbufs. 5400 * Iteration 1 test zcopy to already referenced dbufs. 5401 * Iteration 2 test zcopy to dirty dbuf in the same txg. 5402 * Iteration 3 test zcopy to dbuf dirty in previous txg. 5403 * Iteration 4 test zcopy when dbuf is no longer dirty. 5404 * Iteration 5 test zcopy when it can't be done. 5405 * Iteration 6 one more zcopy write. 5406 */ 5407 for (i = 0; i < 7; i++) { 5408 uint64_t j; 5409 uint64_t off; 5410 5411 /* 5412 * In iteration 5 (i == 5) use arcbufs 5413 * that don't match bigobj blksz to test 5414 * dmu_assign_arcbuf_by_dbuf() when it can't directly 5415 * assign an arcbuf to a dbuf. 5416 */ 5417 for (j = 0; j < s; j++) { 5418 if (i != 5 || chunksize < (SPA_MINBLOCKSIZE * 2)) { 5419 bigbuf_arcbufs[j] = 5420 dmu_request_arcbuf(bonus_db, chunksize); 5421 } else { 5422 bigbuf_arcbufs[2 * j] = 5423 dmu_request_arcbuf(bonus_db, chunksize / 2); 5424 bigbuf_arcbufs[2 * j + 1] = 5425 dmu_request_arcbuf(bonus_db, chunksize / 2); 5426 } 5427 } 5428 5429 /* 5430 * Get a tx for the mods to both packobj and bigobj. 5431 */ 5432 tx = dmu_tx_create(os); 5433 5434 dmu_tx_hold_write(tx, packobj, packoff, packsize); 5435 dmu_tx_hold_write(tx, bigobj, bigoff, bigsize); 5436 5437 txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); 5438 if (txg == 0) { 5439 umem_free(packbuf, packsize); 5440 umem_free(bigbuf, bigsize); 5441 for (j = 0; j < s; j++) { 5442 if (i != 5 || 5443 chunksize < (SPA_MINBLOCKSIZE * 2)) { 5444 dmu_return_arcbuf(bigbuf_arcbufs[j]); 5445 } else { 5446 dmu_return_arcbuf( 5447 bigbuf_arcbufs[2 * j]); 5448 dmu_return_arcbuf( 5449 bigbuf_arcbufs[2 * j + 1]); 5450 } 5451 } 5452 umem_free(bigbuf_arcbufs, 2 * s * sizeof (arc_buf_t *)); 5453 umem_free(od, size); 5454 dmu_buf_rele(bonus_db, FTAG); 5455 return; 5456 } 5457 5458 /* 5459 * 50% of the time don't read objects in the 1st iteration to 5460 * test dmu_assign_arcbuf_by_dbuf() for the case when there are 5461 * no existing dbufs for the specified offsets. 5462 */ 5463 if (i != 0 || ztest_random(2) != 0) { 5464 error = dmu_read(os, packobj, packoff, 5465 packsize, packbuf, DMU_READ_PREFETCH); 5466 ASSERT0(error); 5467 error = dmu_read(os, bigobj, bigoff, bigsize, 5468 bigbuf, DMU_READ_PREFETCH); 5469 ASSERT0(error); 5470 } 5471 compare_and_update_pbbufs(s, packbuf, bigbuf, bigsize, 5472 n, chunksize, txg); 5473 5474 /* 5475 * We've verified all the old bufwads, and made new ones. 5476 * Now write them out. 
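 * packobj is written with an ordinary dmu_write(); bigobj is
 * written by copying into the loaned ARC buffers and assigning them
 * with dmu_assign_arcbuf_by_dbuf() (split into two half-size
 * buffers in iteration 5, where a direct assignment isn't possible).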
5477 */ 5478 dmu_write(os, packobj, packoff, packsize, packbuf, tx); 5479 if (ztest_opts.zo_verbose >= 7) { 5480 (void) printf("writing offset %"PRIx64" size %"PRIx64"" 5481 " txg %"PRIx64"\n", 5482 bigoff, bigsize, txg); 5483 } 5484 for (off = bigoff, j = 0; j < s; j++, off += chunksize) { 5485 dmu_buf_t *dbt; 5486 if (i != 5 || chunksize < (SPA_MINBLOCKSIZE * 2)) { 5487 memcpy(bigbuf_arcbufs[j]->b_data, 5488 (caddr_t)bigbuf + (off - bigoff), 5489 chunksize); 5490 } else { 5491 memcpy(bigbuf_arcbufs[2 * j]->b_data, 5492 (caddr_t)bigbuf + (off - bigoff), 5493 chunksize / 2); 5494 memcpy(bigbuf_arcbufs[2 * j + 1]->b_data, 5495 (caddr_t)bigbuf + (off - bigoff) + 5496 chunksize / 2, 5497 chunksize / 2); 5498 } 5499 5500 if (i == 1) { 5501 VERIFY(dmu_buf_hold(os, bigobj, off, 5502 FTAG, &dbt, DMU_READ_NO_PREFETCH) == 0); 5503 } 5504 if (i != 5 || chunksize < (SPA_MINBLOCKSIZE * 2)) { 5505 VERIFY0(dmu_assign_arcbuf_by_dbuf(bonus_db, 5506 off, bigbuf_arcbufs[j], tx)); 5507 } else { 5508 VERIFY0(dmu_assign_arcbuf_by_dbuf(bonus_db, 5509 off, bigbuf_arcbufs[2 * j], tx)); 5510 VERIFY0(dmu_assign_arcbuf_by_dbuf(bonus_db, 5511 off + chunksize / 2, 5512 bigbuf_arcbufs[2 * j + 1], tx)); 5513 } 5514 if (i == 1) { 5515 dmu_buf_rele(dbt, FTAG); 5516 } 5517 } 5518 dmu_tx_commit(tx); 5519 5520 /* 5521 * Sanity check the stuff we just wrote. 5522 */ 5523 { 5524 void *packcheck = umem_alloc(packsize, UMEM_NOFAIL); 5525 void *bigcheck = umem_alloc(bigsize, UMEM_NOFAIL); 5526 5527 VERIFY0(dmu_read(os, packobj, packoff, 5528 packsize, packcheck, DMU_READ_PREFETCH)); 5529 VERIFY0(dmu_read(os, bigobj, bigoff, 5530 bigsize, bigcheck, DMU_READ_PREFETCH)); 5531 5532 ASSERT0(memcmp(packbuf, packcheck, packsize)); 5533 ASSERT0(memcmp(bigbuf, bigcheck, bigsize)); 5534 5535 umem_free(packcheck, packsize); 5536 umem_free(bigcheck, bigsize); 5537 } 5538 if (i == 2) { 5539 txg_wait_open(dmu_objset_pool(os), 0, B_TRUE); 5540 } else if (i == 3) { 5541 txg_wait_synced(dmu_objset_pool(os), 0); 5542 } 5543 } 5544 5545 dmu_buf_rele(bonus_db, FTAG); 5546 umem_free(packbuf, packsize); 5547 umem_free(bigbuf, bigsize); 5548 umem_free(bigbuf_arcbufs, 2 * s * sizeof (arc_buf_t *)); 5549 umem_free(od, size); 5550 } 5551 5552 void 5553 ztest_dmu_write_parallel(ztest_ds_t *zd, uint64_t id) 5554 { 5555 (void) id; 5556 ztest_od_t *od; 5557 5558 od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL); 5559 uint64_t offset = (1ULL << (ztest_random(20) + 43)) + 5560 (ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT); 5561 5562 /* 5563 * Have multiple threads write to large offsets in an object 5564 * to verify that parallel writes to an object -- even to the 5565 * same blocks within the object -- doesn't cause any trouble. 
5566 */ 5567 ztest_od_init(od, ID_PARALLEL, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0, 0); 5568 5569 if (ztest_object_init(zd, od, sizeof (ztest_od_t), B_FALSE) != 0) 5570 return; 5571 5572 while (ztest_random(10) != 0) 5573 ztest_io(zd, od->od_object, offset); 5574 5575 umem_free(od, sizeof (ztest_od_t)); 5576 } 5577 5578 void 5579 ztest_dmu_prealloc(ztest_ds_t *zd, uint64_t id) 5580 { 5581 ztest_od_t *od; 5582 uint64_t offset = (1ULL << (ztest_random(4) + SPA_MAXBLOCKSHIFT)) + 5583 (ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT); 5584 uint64_t count = ztest_random(20) + 1; 5585 uint64_t blocksize = ztest_random_blocksize(); 5586 void *data; 5587 5588 od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL); 5589 5590 ztest_od_init(od, id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, 0, 0); 5591 5592 if (ztest_object_init(zd, od, sizeof (ztest_od_t), 5593 !ztest_random(2)) != 0) { 5594 umem_free(od, sizeof (ztest_od_t)); 5595 return; 5596 } 5597 5598 if (ztest_truncate(zd, od->od_object, offset, count * blocksize) != 0) { 5599 umem_free(od, sizeof (ztest_od_t)); 5600 return; 5601 } 5602 5603 ztest_prealloc(zd, od->od_object, offset, count * blocksize); 5604 5605 data = umem_zalloc(blocksize, UMEM_NOFAIL); 5606 5607 while (ztest_random(count) != 0) { 5608 uint64_t randoff = offset + (ztest_random(count) * blocksize); 5609 if (ztest_write(zd, od->od_object, randoff, blocksize, 5610 data) != 0) 5611 break; 5612 while (ztest_random(4) != 0) 5613 ztest_io(zd, od->od_object, randoff); 5614 } 5615 5616 umem_free(data, blocksize); 5617 umem_free(od, sizeof (ztest_od_t)); 5618 } 5619 5620 /* 5621 * Verify that zap_{create,destroy,add,remove,update} work as expected. 5622 */ 5623 #define ZTEST_ZAP_MIN_INTS 1 5624 #define ZTEST_ZAP_MAX_INTS 4 5625 #define ZTEST_ZAP_MAX_PROPS 1000 5626 5627 void 5628 ztest_zap(ztest_ds_t *zd, uint64_t id) 5629 { 5630 objset_t *os = zd->zd_os; 5631 ztest_od_t *od; 5632 uint64_t object; 5633 uint64_t txg, last_txg; 5634 uint64_t value[ZTEST_ZAP_MAX_INTS]; 5635 uint64_t zl_ints, zl_intsize, prop; 5636 int i, ints; 5637 dmu_tx_t *tx; 5638 char propname[100], txgname[100]; 5639 int error; 5640 const char *const hc[2] = { "s.acl.h", ".s.open.h.hyLZlg" }; 5641 5642 od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL); 5643 ztest_od_init(od, id, FTAG, 0, DMU_OT_ZAP_OTHER, 0, 0, 0); 5644 5645 if (ztest_object_init(zd, od, sizeof (ztest_od_t), 5646 !ztest_random(2)) != 0) 5647 goto out; 5648 5649 object = od->od_object; 5650 5651 /* 5652 * Generate a known hash collision, and verify that 5653 * we can lookup and remove both entries. 5654 */ 5655 tx = dmu_tx_create(os); 5656 dmu_tx_hold_zap(tx, object, B_TRUE, NULL); 5657 txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); 5658 if (txg == 0) 5659 goto out; 5660 for (i = 0; i < 2; i++) { 5661 value[i] = i; 5662 VERIFY0(zap_add(os, object, hc[i], sizeof (uint64_t), 5663 1, &value[i], tx)); 5664 } 5665 for (i = 0; i < 2; i++) { 5666 VERIFY3U(EEXIST, ==, zap_add(os, object, hc[i], 5667 sizeof (uint64_t), 1, &value[i], tx)); 5668 VERIFY0( 5669 zap_length(os, object, hc[i], &zl_intsize, &zl_ints)); 5670 ASSERT3U(zl_intsize, ==, sizeof (uint64_t)); 5671 ASSERT3U(zl_ints, ==, 1); 5672 } 5673 for (i = 0; i < 2; i++) { 5674 VERIFY0(zap_remove(os, object, hc[i], tx)); 5675 } 5676 dmu_tx_commit(tx); 5677 5678 /* 5679 * Generate a bunch of random entries. 
5680 */ 5681 ints = MAX(ZTEST_ZAP_MIN_INTS, object % ZTEST_ZAP_MAX_INTS); 5682 5683 prop = ztest_random(ZTEST_ZAP_MAX_PROPS); 5684 (void) sprintf(propname, "prop_%"PRIu64"", prop); 5685 (void) sprintf(txgname, "txg_%"PRIu64"", prop); 5686 memset(value, 0, sizeof (value)); 5687 last_txg = 0; 5688 5689 /* 5690 * If these zap entries already exist, validate their contents. 5691 */ 5692 error = zap_length(os, object, txgname, &zl_intsize, &zl_ints); 5693 if (error == 0) { 5694 ASSERT3U(zl_intsize, ==, sizeof (uint64_t)); 5695 ASSERT3U(zl_ints, ==, 1); 5696 5697 VERIFY0(zap_lookup(os, object, txgname, zl_intsize, 5698 zl_ints, &last_txg)); 5699 5700 VERIFY0(zap_length(os, object, propname, &zl_intsize, 5701 &zl_ints)); 5702 5703 ASSERT3U(zl_intsize, ==, sizeof (uint64_t)); 5704 ASSERT3U(zl_ints, ==, ints); 5705 5706 VERIFY0(zap_lookup(os, object, propname, zl_intsize, 5707 zl_ints, value)); 5708 5709 for (i = 0; i < ints; i++) { 5710 ASSERT3U(value[i], ==, last_txg + object + i); 5711 } 5712 } else { 5713 ASSERT3U(error, ==, ENOENT); 5714 } 5715 5716 /* 5717 * Atomically update two entries in our zap object. 5718 * The first is named txg_%llu, and contains the txg 5719 * in which the property was last updated. The second 5720 * is named prop_%llu, and the nth element of its value 5721 * should be txg + object + n. 5722 */ 5723 tx = dmu_tx_create(os); 5724 dmu_tx_hold_zap(tx, object, B_TRUE, NULL); 5725 txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); 5726 if (txg == 0) 5727 goto out; 5728 5729 if (last_txg > txg) 5730 fatal(B_FALSE, "zap future leak: old %"PRIu64" new %"PRIu64"", 5731 last_txg, txg); 5732 5733 for (i = 0; i < ints; i++) 5734 value[i] = txg + object + i; 5735 5736 VERIFY0(zap_update(os, object, txgname, sizeof (uint64_t), 5737 1, &txg, tx)); 5738 VERIFY0(zap_update(os, object, propname, sizeof (uint64_t), 5739 ints, value, tx)); 5740 5741 dmu_tx_commit(tx); 5742 5743 /* 5744 * Remove a random pair of entries. 5745 */ 5746 prop = ztest_random(ZTEST_ZAP_MAX_PROPS); 5747 (void) sprintf(propname, "prop_%"PRIu64"", prop); 5748 (void) sprintf(txgname, "txg_%"PRIu64"", prop); 5749 5750 error = zap_length(os, object, txgname, &zl_intsize, &zl_ints); 5751 5752 if (error == ENOENT) 5753 goto out; 5754 5755 ASSERT0(error); 5756 5757 tx = dmu_tx_create(os); 5758 dmu_tx_hold_zap(tx, object, B_TRUE, NULL); 5759 txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); 5760 if (txg == 0) 5761 goto out; 5762 VERIFY0(zap_remove(os, object, txgname, tx)); 5763 VERIFY0(zap_remove(os, object, propname, tx)); 5764 dmu_tx_commit(tx); 5765 out: 5766 umem_free(od, sizeof (ztest_od_t)); 5767 } 5768 5769 /* 5770 * Test case to test the upgrading of a microzap to fatzap. 5771 */ 5772 void 5773 ztest_fzap(ztest_ds_t *zd, uint64_t id) 5774 { 5775 objset_t *os = zd->zd_os; 5776 ztest_od_t *od; 5777 uint64_t object, txg, value; 5778 5779 od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL); 5780 ztest_od_init(od, id, FTAG, 0, DMU_OT_ZAP_OTHER, 0, 0, 0); 5781 5782 if (ztest_object_init(zd, od, sizeof (ztest_od_t), 5783 !ztest_random(2)) != 0) 5784 goto out; 5785 object = od->od_object; 5786 5787 /* 5788 * Add entries to this ZAP and make sure it spills over 5789 * and gets upgraded to a fatzap. Also, since we are adding 5790 * 2050 entries we should see ptrtbl growth and leaf-block split. 
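 * (Each microzap entry takes 64 bytes, so a microzap tops out at
 * roughly two thousand entries; 2050 additions are therefore enough
 * to force the fatzap conversion.)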
5791 */ 5792 for (value = 0; value < 2050; value++) { 5793 char name[ZFS_MAX_DATASET_NAME_LEN]; 5794 dmu_tx_t *tx; 5795 int error; 5796 5797 (void) snprintf(name, sizeof (name), "fzap-%"PRIu64"-%"PRIu64"", 5798 id, value); 5799 5800 tx = dmu_tx_create(os); 5801 dmu_tx_hold_zap(tx, object, B_TRUE, name); 5802 txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); 5803 if (txg == 0) 5804 goto out; 5805 error = zap_add(os, object, name, sizeof (uint64_t), 1, 5806 &value, tx); 5807 ASSERT(error == 0 || error == EEXIST); 5808 dmu_tx_commit(tx); 5809 } 5810 out: 5811 umem_free(od, sizeof (ztest_od_t)); 5812 } 5813 5814 void 5815 ztest_zap_parallel(ztest_ds_t *zd, uint64_t id) 5816 { 5817 (void) id; 5818 objset_t *os = zd->zd_os; 5819 ztest_od_t *od; 5820 uint64_t txg, object, count, wsize, wc, zl_wsize, zl_wc; 5821 dmu_tx_t *tx; 5822 int i, namelen, error; 5823 int micro = ztest_random(2); 5824 char name[20], string_value[20]; 5825 void *data; 5826 5827 od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL); 5828 ztest_od_init(od, ID_PARALLEL, FTAG, micro, DMU_OT_ZAP_OTHER, 0, 0, 0); 5829 5830 if (ztest_object_init(zd, od, sizeof (ztest_od_t), B_FALSE) != 0) { 5831 umem_free(od, sizeof (ztest_od_t)); 5832 return; 5833 } 5834 5835 object = od->od_object; 5836 5837 /* 5838 * Generate a random name of the form 'xxx.....' where each 5839 * x is a random printable character and the dots are dots. 5840 * There are 94 such characters, and the name length goes from 5841 * 6 to 20, so there are 94^3 * 15 = 12,458,760 possible names. 5842 */ 5843 namelen = ztest_random(sizeof (name) - 5) + 5 + 1; 5844 5845 for (i = 0; i < 3; i++) 5846 name[i] = '!' + ztest_random('~' - '!' + 1); 5847 for (; i < namelen - 1; i++) 5848 name[i] = '.'; 5849 name[i] = '\0'; 5850 5851 if ((namelen & 1) || micro) { 5852 wsize = sizeof (txg); 5853 wc = 1; 5854 data = &txg; 5855 } else { 5856 wsize = 1; 5857 wc = namelen; 5858 data = string_value; 5859 } 5860 5861 count = -1ULL; 5862 VERIFY0(zap_count(os, object, &count)); 5863 ASSERT3S(count, !=, -1ULL); 5864 5865 /* 5866 * Select an operation: length, lookup, add, update, remove. 
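 * i maps to: 0 = zap_length, 1 = zap_lookup, 2 = zap_add,
 * 3 = zap_update, 4 = zap_remove; only the mutating operations
 * (i >= 2) need a tx.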
5867 */ 5868 i = ztest_random(5); 5869 5870 if (i >= 2) { 5871 tx = dmu_tx_create(os); 5872 dmu_tx_hold_zap(tx, object, B_TRUE, NULL); 5873 txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); 5874 if (txg == 0) { 5875 umem_free(od, sizeof (ztest_od_t)); 5876 return; 5877 } 5878 memcpy(string_value, name, namelen); 5879 } else { 5880 tx = NULL; 5881 txg = 0; 5882 memset(string_value, 0, namelen); 5883 } 5884 5885 switch (i) { 5886 5887 case 0: 5888 error = zap_length(os, object, name, &zl_wsize, &zl_wc); 5889 if (error == 0) { 5890 ASSERT3U(wsize, ==, zl_wsize); 5891 ASSERT3U(wc, ==, zl_wc); 5892 } else { 5893 ASSERT3U(error, ==, ENOENT); 5894 } 5895 break; 5896 5897 case 1: 5898 error = zap_lookup(os, object, name, wsize, wc, data); 5899 if (error == 0) { 5900 if (data == string_value && 5901 memcmp(name, data, namelen) != 0) 5902 fatal(B_FALSE, "name '%s' != val '%s' len %d", 5903 name, (char *)data, namelen); 5904 } else { 5905 ASSERT3U(error, ==, ENOENT); 5906 } 5907 break; 5908 5909 case 2: 5910 error = zap_add(os, object, name, wsize, wc, data, tx); 5911 ASSERT(error == 0 || error == EEXIST); 5912 break; 5913 5914 case 3: 5915 VERIFY0(zap_update(os, object, name, wsize, wc, data, tx)); 5916 break; 5917 5918 case 4: 5919 error = zap_remove(os, object, name, tx); 5920 ASSERT(error == 0 || error == ENOENT); 5921 break; 5922 } 5923 5924 if (tx != NULL) 5925 dmu_tx_commit(tx); 5926 5927 umem_free(od, sizeof (ztest_od_t)); 5928 } 5929 5930 /* 5931 * Commit callback data. 5932 */ 5933 typedef struct ztest_cb_data { 5934 list_node_t zcd_node; 5935 uint64_t zcd_txg; 5936 int zcd_expected_err; 5937 boolean_t zcd_added; 5938 boolean_t zcd_called; 5939 spa_t *zcd_spa; 5940 } ztest_cb_data_t; 5941 5942 /* This is the actual commit callback function */ 5943 static void 5944 ztest_commit_callback(void *arg, int error) 5945 { 5946 ztest_cb_data_t *data = arg; 5947 uint64_t synced_txg; 5948 5949 VERIFY3P(data, !=, NULL); 5950 VERIFY3S(data->zcd_expected_err, ==, error); 5951 VERIFY(!data->zcd_called); 5952 5953 synced_txg = spa_last_synced_txg(data->zcd_spa); 5954 if (data->zcd_txg > synced_txg) 5955 fatal(B_FALSE, 5956 "commit callback of txg %"PRIu64" called prematurely, " 5957 "last synced txg = %"PRIu64"\n", 5958 data->zcd_txg, synced_txg); 5959 5960 data->zcd_called = B_TRUE; 5961 5962 if (error == ECANCELED) { 5963 ASSERT0(data->zcd_txg); 5964 ASSERT(!data->zcd_added); 5965 5966 /* 5967 * The private callback data should be destroyed here, but 5968 * since we are going to check the zcd_called field after 5969 * dmu_tx_abort(), we will destroy it there. 
5970 */ 5971 return; 5972 } 5973 5974 ASSERT(data->zcd_added); 5975 ASSERT3U(data->zcd_txg, !=, 0); 5976 5977 (void) mutex_enter(&zcl.zcl_callbacks_lock); 5978 5979 /* See if this cb was called more quickly */ 5980 if ((synced_txg - data->zcd_txg) < zc_min_txg_delay) 5981 zc_min_txg_delay = synced_txg - data->zcd_txg; 5982 5983 /* Remove our callback from the list */ 5984 list_remove(&zcl.zcl_callbacks, data); 5985 5986 (void) mutex_exit(&zcl.zcl_callbacks_lock); 5987 5988 umem_free(data, sizeof (ztest_cb_data_t)); 5989 } 5990 5991 /* Allocate and initialize callback data structure */ 5992 static ztest_cb_data_t * 5993 ztest_create_cb_data(objset_t *os, uint64_t txg) 5994 { 5995 ztest_cb_data_t *cb_data; 5996 5997 cb_data = umem_zalloc(sizeof (ztest_cb_data_t), UMEM_NOFAIL); 5998 5999 cb_data->zcd_txg = txg; 6000 cb_data->zcd_spa = dmu_objset_spa(os); 6001 list_link_init(&cb_data->zcd_node); 6002 6003 return (cb_data); 6004 } 6005 6006 /* 6007 * Commit callback test. 6008 */ 6009 void 6010 ztest_dmu_commit_callbacks(ztest_ds_t *zd, uint64_t id) 6011 { 6012 objset_t *os = zd->zd_os; 6013 ztest_od_t *od; 6014 dmu_tx_t *tx; 6015 ztest_cb_data_t *cb_data[3], *tmp_cb; 6016 uint64_t old_txg, txg; 6017 int i, error = 0; 6018 6019 od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL); 6020 ztest_od_init(od, id, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0, 0); 6021 6022 if (ztest_object_init(zd, od, sizeof (ztest_od_t), B_FALSE) != 0) { 6023 umem_free(od, sizeof (ztest_od_t)); 6024 return; 6025 } 6026 6027 tx = dmu_tx_create(os); 6028 6029 cb_data[0] = ztest_create_cb_data(os, 0); 6030 dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[0]); 6031 6032 dmu_tx_hold_write(tx, od->od_object, 0, sizeof (uint64_t)); 6033 6034 /* Every once in a while, abort the transaction on purpose */ 6035 if (ztest_random(100) == 0) 6036 error = -1; 6037 6038 if (!error) 6039 error = dmu_tx_assign(tx, TXG_NOWAIT); 6040 6041 txg = error ? 0 : dmu_tx_get_txg(tx); 6042 6043 cb_data[0]->zcd_txg = txg; 6044 cb_data[1] = ztest_create_cb_data(os, txg); 6045 dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[1]); 6046 6047 if (error) { 6048 /* 6049 * It's not a strict requirement to call the registered 6050 * callbacks from inside dmu_tx_abort(), but that's what 6051 * it's supposed to happen in the current implementation 6052 * so we will check for that. 6053 */ 6054 for (i = 0; i < 2; i++) { 6055 cb_data[i]->zcd_expected_err = ECANCELED; 6056 VERIFY(!cb_data[i]->zcd_called); 6057 } 6058 6059 dmu_tx_abort(tx); 6060 6061 for (i = 0; i < 2; i++) { 6062 VERIFY(cb_data[i]->zcd_called); 6063 umem_free(cb_data[i], sizeof (ztest_cb_data_t)); 6064 } 6065 6066 umem_free(od, sizeof (ztest_od_t)); 6067 return; 6068 } 6069 6070 cb_data[2] = ztest_create_cb_data(os, txg); 6071 dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[2]); 6072 6073 /* 6074 * Read existing data to make sure there isn't a future leak. 
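 * The uint64 at offset 0 is the txg in which it was last written (see
 * the dmu_write() below), so it must never exceed the open txg.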
6075 */ 6076 VERIFY0(dmu_read(os, od->od_object, 0, sizeof (uint64_t), 6077 &old_txg, DMU_READ_PREFETCH)); 6078 6079 if (old_txg > txg) 6080 fatal(B_FALSE, 6081 "future leak: got %"PRIu64", open txg is %"PRIu64"", 6082 old_txg, txg); 6083 6084 dmu_write(os, od->od_object, 0, sizeof (uint64_t), &txg, tx); 6085 6086 (void) mutex_enter(&zcl.zcl_callbacks_lock); 6087 6088 /* 6089 * Since commit callbacks don't have any ordering requirement and since 6090 * it is theoretically possible for a commit callback to be called 6091 * after an arbitrary amount of time has elapsed since its txg has been 6092 * synced, it is difficult to reliably determine whether a commit 6093 * callback hasn't been called due to high load or due to a flawed 6094 * implementation. 6095 * 6096 * In practice, we will assume that if after a certain number of txgs a 6097 * commit callback hasn't been called, then most likely there's an 6098 * implementation bug.. 6099 */ 6100 tmp_cb = list_head(&zcl.zcl_callbacks); 6101 if (tmp_cb != NULL && 6102 tmp_cb->zcd_txg + ZTEST_COMMIT_CB_THRESH < txg) { 6103 fatal(B_FALSE, 6104 "Commit callback threshold exceeded, " 6105 "oldest txg: %"PRIu64", open txg: %"PRIu64"\n", 6106 tmp_cb->zcd_txg, txg); 6107 } 6108 6109 /* 6110 * Let's find the place to insert our callbacks. 6111 * 6112 * Even though the list is ordered by txg, it is possible for the 6113 * insertion point to not be the end because our txg may already be 6114 * quiescing at this point and other callbacks in the open txg 6115 * (from other objsets) may have sneaked in. 6116 */ 6117 tmp_cb = list_tail(&zcl.zcl_callbacks); 6118 while (tmp_cb != NULL && tmp_cb->zcd_txg > txg) 6119 tmp_cb = list_prev(&zcl.zcl_callbacks, tmp_cb); 6120 6121 /* Add the 3 callbacks to the list */ 6122 for (i = 0; i < 3; i++) { 6123 if (tmp_cb == NULL) 6124 list_insert_head(&zcl.zcl_callbacks, cb_data[i]); 6125 else 6126 list_insert_after(&zcl.zcl_callbacks, tmp_cb, 6127 cb_data[i]); 6128 6129 cb_data[i]->zcd_added = B_TRUE; 6130 VERIFY(!cb_data[i]->zcd_called); 6131 6132 tmp_cb = cb_data[i]; 6133 } 6134 6135 zc_cb_counter += 3; 6136 6137 (void) mutex_exit(&zcl.zcl_callbacks_lock); 6138 6139 dmu_tx_commit(tx); 6140 6141 umem_free(od, sizeof (ztest_od_t)); 6142 } 6143 6144 /* 6145 * Visit each object in the dataset. Verify that its properties 6146 * are consistent what was stored in the block tag when it was created, 6147 * and that its unused bonus buffer space has not been overwritten. 
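 * Objects whose bonus buffer does not hold a valid block tag are
 * simply skipped.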
6148 */ 6149 void 6150 ztest_verify_dnode_bt(ztest_ds_t *zd, uint64_t id) 6151 { 6152 (void) id; 6153 objset_t *os = zd->zd_os; 6154 uint64_t obj; 6155 int err = 0; 6156 6157 for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE, 0)) { 6158 ztest_block_tag_t *bt = NULL; 6159 dmu_object_info_t doi; 6160 dmu_buf_t *db; 6161 6162 ztest_object_lock(zd, obj, ZTRL_READER); 6163 if (dmu_bonus_hold(os, obj, FTAG, &db) != 0) { 6164 ztest_object_unlock(zd, obj); 6165 continue; 6166 } 6167 6168 dmu_object_info_from_db(db, &doi); 6169 if (doi.doi_bonus_size >= sizeof (*bt)) 6170 bt = ztest_bt_bonus(db); 6171 6172 if (bt && bt->bt_magic == BT_MAGIC) { 6173 ztest_bt_verify(bt, os, obj, doi.doi_dnodesize, 6174 bt->bt_offset, bt->bt_gen, bt->bt_txg, 6175 bt->bt_crtxg); 6176 ztest_verify_unused_bonus(db, bt, obj, os, bt->bt_gen); 6177 } 6178 6179 dmu_buf_rele(db, FTAG); 6180 ztest_object_unlock(zd, obj); 6181 } 6182 } 6183 6184 void 6185 ztest_dsl_prop_get_set(ztest_ds_t *zd, uint64_t id) 6186 { 6187 (void) id; 6188 zfs_prop_t proplist[] = { 6189 ZFS_PROP_CHECKSUM, 6190 ZFS_PROP_COMPRESSION, 6191 ZFS_PROP_COPIES, 6192 ZFS_PROP_DEDUP 6193 }; 6194 6195 (void) pthread_rwlock_rdlock(&ztest_name_lock); 6196 6197 for (int p = 0; p < sizeof (proplist) / sizeof (proplist[0]); p++) { 6198 int error = ztest_dsl_prop_set_uint64(zd->zd_name, proplist[p], 6199 ztest_random_dsl_prop(proplist[p]), (int)ztest_random(2)); 6200 ASSERT(error == 0 || error == ENOSPC); 6201 } 6202 6203 int error = ztest_dsl_prop_set_uint64(zd->zd_name, ZFS_PROP_RECORDSIZE, 6204 ztest_random_blocksize(), (int)ztest_random(2)); 6205 ASSERT(error == 0 || error == ENOSPC); 6206 6207 (void) pthread_rwlock_unlock(&ztest_name_lock); 6208 } 6209 6210 void 6211 ztest_spa_prop_get_set(ztest_ds_t *zd, uint64_t id) 6212 { 6213 (void) zd, (void) id; 6214 nvlist_t *props = NULL; 6215 6216 (void) pthread_rwlock_rdlock(&ztest_name_lock); 6217 6218 (void) ztest_spa_prop_set_uint64(ZPOOL_PROP_AUTOTRIM, ztest_random(2)); 6219 6220 VERIFY0(spa_prop_get(ztest_spa, &props)); 6221 6222 if (ztest_opts.zo_verbose >= 6) 6223 dump_nvlist(props, 4); 6224 6225 fnvlist_free(props); 6226 6227 (void) pthread_rwlock_unlock(&ztest_name_lock); 6228 } 6229 6230 static int 6231 user_release_one(const char *snapname, const char *holdname) 6232 { 6233 nvlist_t *snaps, *holds; 6234 int error; 6235 6236 snaps = fnvlist_alloc(); 6237 holds = fnvlist_alloc(); 6238 fnvlist_add_boolean(holds, holdname); 6239 fnvlist_add_nvlist(snaps, snapname, holds); 6240 fnvlist_free(holds); 6241 error = dsl_dataset_user_release(snaps, NULL); 6242 fnvlist_free(snaps); 6243 return (error); 6244 } 6245 6246 /* 6247 * Test snapshot hold/release and deferred destroy. 6248 */ 6249 void 6250 ztest_dmu_snapshot_hold(ztest_ds_t *zd, uint64_t id) 6251 { 6252 int error; 6253 objset_t *os = zd->zd_os; 6254 objset_t *origin; 6255 char snapname[100]; 6256 char fullname[100]; 6257 char clonename[100]; 6258 char tag[100]; 6259 char osname[ZFS_MAX_DATASET_NAME_LEN]; 6260 nvlist_t *holds; 6261 6262 (void) pthread_rwlock_rdlock(&ztest_name_lock); 6263 6264 dmu_objset_name(os, osname); 6265 6266 (void) snprintf(snapname, sizeof (snapname), "sh1_%"PRIu64"", id); 6267 (void) snprintf(fullname, sizeof (fullname), "%s@%s", osname, snapname); 6268 (void) snprintf(clonename, sizeof (clonename), "%s/ch1_%"PRIu64"", 6269 osname, id); 6270 (void) snprintf(tag, sizeof (tag), "tag_%"PRIu64"", id); 6271 6272 /* 6273 * Clean up from any previous run. 
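 * ENOENT (or ESRCH for the hold release) just means there was nothing
 * left over from a previous run; any other error is unexpected.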
6274 */ 6275 error = dsl_destroy_head(clonename); 6276 if (error != ENOENT) 6277 ASSERT0(error); 6278 error = user_release_one(fullname, tag); 6279 if (error != ESRCH && error != ENOENT) 6280 ASSERT0(error); 6281 error = dsl_destroy_snapshot(fullname, B_FALSE); 6282 if (error != ENOENT) 6283 ASSERT0(error); 6284 6285 /* 6286 * Create snapshot, clone it, mark snap for deferred destroy, 6287 * destroy clone, verify snap was also destroyed. 6288 */ 6289 error = dmu_objset_snapshot_one(osname, snapname); 6290 if (error) { 6291 if (error == ENOSPC) { 6292 ztest_record_enospc("dmu_objset_snapshot"); 6293 goto out; 6294 } 6295 fatal(B_FALSE, "dmu_objset_snapshot(%s) = %d", fullname, error); 6296 } 6297 6298 error = dmu_objset_clone(clonename, fullname); 6299 if (error) { 6300 if (error == ENOSPC) { 6301 ztest_record_enospc("dmu_objset_clone"); 6302 goto out; 6303 } 6304 fatal(B_FALSE, "dmu_objset_clone(%s) = %d", clonename, error); 6305 } 6306 6307 error = dsl_destroy_snapshot(fullname, B_TRUE); 6308 if (error) { 6309 fatal(B_FALSE, "dsl_destroy_snapshot(%s, B_TRUE) = %d", 6310 fullname, error); 6311 } 6312 6313 error = dsl_destroy_head(clonename); 6314 if (error) 6315 fatal(B_FALSE, "dsl_destroy_head(%s) = %d", clonename, error); 6316 6317 error = dmu_objset_hold(fullname, FTAG, &origin); 6318 if (error != ENOENT) 6319 fatal(B_FALSE, "dmu_objset_hold(%s) = %d", fullname, error); 6320 6321 /* 6322 * Create snapshot, add temporary hold, verify that we can't 6323 * destroy a held snapshot, mark for deferred destroy, 6324 * release hold, verify snapshot was destroyed. 6325 */ 6326 error = dmu_objset_snapshot_one(osname, snapname); 6327 if (error) { 6328 if (error == ENOSPC) { 6329 ztest_record_enospc("dmu_objset_snapshot"); 6330 goto out; 6331 } 6332 fatal(B_FALSE, "dmu_objset_snapshot(%s) = %d", fullname, error); 6333 } 6334 6335 holds = fnvlist_alloc(); 6336 fnvlist_add_string(holds, fullname, tag); 6337 error = dsl_dataset_user_hold(holds, 0, NULL); 6338 fnvlist_free(holds); 6339 6340 if (error == ENOSPC) { 6341 ztest_record_enospc("dsl_dataset_user_hold"); 6342 goto out; 6343 } else if (error) { 6344 fatal(B_FALSE, "dsl_dataset_user_hold(%s, %s) = %u", 6345 fullname, tag, error); 6346 } 6347 6348 error = dsl_destroy_snapshot(fullname, B_FALSE); 6349 if (error != EBUSY) { 6350 fatal(B_FALSE, "dsl_destroy_snapshot(%s, B_FALSE) = %d", 6351 fullname, error); 6352 } 6353 6354 error = dsl_destroy_snapshot(fullname, B_TRUE); 6355 if (error) { 6356 fatal(B_FALSE, "dsl_destroy_snapshot(%s, B_TRUE) = %d", 6357 fullname, error); 6358 } 6359 6360 error = user_release_one(fullname, tag); 6361 if (error) 6362 fatal(B_FALSE, "user_release_one(%s, %s) = %d", 6363 fullname, tag, error); 6364 6365 VERIFY3U(dmu_objset_hold(fullname, FTAG, &origin), ==, ENOENT); 6366 6367 out: 6368 (void) pthread_rwlock_unlock(&ztest_name_lock); 6369 } 6370 6371 /* 6372 * Inject random faults into the on-disk data. 
6373 */ 6374 void 6375 ztest_fault_inject(ztest_ds_t *zd, uint64_t id) 6376 { 6377 (void) zd, (void) id; 6378 ztest_shared_t *zs = ztest_shared; 6379 spa_t *spa = ztest_spa; 6380 int fd; 6381 uint64_t offset; 6382 uint64_t leaves; 6383 uint64_t bad = 0x1990c0ffeedecadeull; 6384 uint64_t top, leaf; 6385 uint64_t raidz_children; 6386 char *path0; 6387 char *pathrand; 6388 size_t fsize; 6389 int bshift = SPA_MAXBLOCKSHIFT + 2; 6390 int iters = 1000; 6391 int maxfaults; 6392 int mirror_save; 6393 vdev_t *vd0 = NULL; 6394 uint64_t guid0 = 0; 6395 boolean_t islog = B_FALSE; 6396 boolean_t injected = B_FALSE; 6397 6398 path0 = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); 6399 pathrand = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); 6400 6401 mutex_enter(&ztest_vdev_lock); 6402 6403 /* 6404 * Device removal is in progress, fault injection must be disabled 6405 * until it completes and the pool is scrubbed. The fault injection 6406 * strategy for damaging blocks does not take in to account evacuated 6407 * blocks which may have already been damaged. 6408 */ 6409 if (ztest_device_removal_active) 6410 goto out; 6411 6412 /* 6413 * The fault injection strategy for damaging blocks cannot be used 6414 * if raidz expansion is in progress. The leaves value 6415 * (attached raidz children) is variable and strategy for damaging 6416 * blocks will corrupt same data blocks on different child vdevs 6417 * because of the reflow process. 6418 */ 6419 if (spa->spa_raidz_expand != NULL) 6420 goto out; 6421 6422 maxfaults = MAXFAULTS(zs); 6423 raidz_children = ztest_get_raidz_children(spa); 6424 leaves = MAX(zs->zs_mirrors, 1) * raidz_children; 6425 mirror_save = zs->zs_mirrors; 6426 6427 ASSERT3U(leaves, >=, 1); 6428 6429 /* 6430 * While ztest is running the number of leaves will not change. This 6431 * is critical for the fault injection logic as it determines where 6432 * errors can be safely injected such that they are always repairable. 6433 * 6434 * When restarting ztest a different number of leaves may be requested 6435 * which will shift the regions to be damaged. This is fine as long 6436 * as the pool has been scrubbed prior to using the new mapping. 6437 * Failure to do can result in non-repairable damage being injected. 6438 */ 6439 if (ztest_pool_scrubbed == B_FALSE) 6440 goto out; 6441 6442 /* 6443 * Grab the name lock as reader. There are some operations 6444 * which don't like to have their vdevs changed while 6445 * they are in progress (i.e. spa_change_guid). Those 6446 * operations will have grabbed the name lock as writer. 6447 */ 6448 (void) pthread_rwlock_rdlock(&ztest_name_lock); 6449 6450 /* 6451 * We need SCL_STATE here because we're going to look at vd0->vdev_tsd. 6452 */ 6453 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 6454 6455 if (ztest_random(2) == 0) { 6456 /* 6457 * Inject errors on a normal data device or slog device. 6458 */ 6459 top = ztest_random_vdev_top(spa, B_TRUE); 6460 leaf = ztest_random(leaves) + zs->zs_splits; 6461 6462 /* 6463 * Generate paths to the first leaf in this top-level vdev, 6464 * and to the random leaf we selected. We'll induce transient 6465 * write failures and random online/offline activity on leaf 0, 6466 * and we'll write random garbage to the randomly chosen leaf. 
6467 */ 6468 (void) snprintf(path0, MAXPATHLEN, ztest_dev_template, 6469 ztest_opts.zo_dir, ztest_opts.zo_pool, 6470 top * leaves + zs->zs_splits); 6471 (void) snprintf(pathrand, MAXPATHLEN, ztest_dev_template, 6472 ztest_opts.zo_dir, ztest_opts.zo_pool, 6473 top * leaves + leaf); 6474 6475 vd0 = vdev_lookup_by_path(spa->spa_root_vdev, path0); 6476 if (vd0 != NULL && vd0->vdev_top->vdev_islog) 6477 islog = B_TRUE; 6478 6479 /* 6480 * If the top-level vdev needs to be resilvered 6481 * then we only allow faults on the device that is 6482 * resilvering. 6483 */ 6484 if (vd0 != NULL && maxfaults != 1 && 6485 (!vdev_resilver_needed(vd0->vdev_top, NULL, NULL) || 6486 vd0->vdev_resilver_txg != 0)) { 6487 /* 6488 * Make vd0 explicitly claim to be unreadable, 6489 * or unwritable, or reach behind its back 6490 * and close the underlying fd. We can do this if 6491 * maxfaults == 0 because we'll fail and reexecute, 6492 * and we can do it if maxfaults >= 2 because we'll 6493 * have enough redundancy. If maxfaults == 1, the 6494 * combination of this with injection of random data 6495 * corruption below exceeds the pool's fault tolerance. 6496 */ 6497 vdev_file_t *vf = vd0->vdev_tsd; 6498 6499 zfs_dbgmsg("injecting fault to vdev %llu; maxfaults=%d", 6500 (long long)vd0->vdev_id, (int)maxfaults); 6501 6502 if (vf != NULL && ztest_random(3) == 0) { 6503 (void) close(vf->vf_file->f_fd); 6504 vf->vf_file->f_fd = -1; 6505 } else if (ztest_random(2) == 0) { 6506 vd0->vdev_cant_read = B_TRUE; 6507 } else { 6508 vd0->vdev_cant_write = B_TRUE; 6509 } 6510 guid0 = vd0->vdev_guid; 6511 } 6512 } else { 6513 /* 6514 * Inject errors on an l2cache device. 6515 */ 6516 spa_aux_vdev_t *sav = &spa->spa_l2cache; 6517 6518 if (sav->sav_count == 0) { 6519 spa_config_exit(spa, SCL_STATE, FTAG); 6520 (void) pthread_rwlock_unlock(&ztest_name_lock); 6521 goto out; 6522 } 6523 vd0 = sav->sav_vdevs[ztest_random(sav->sav_count)]; 6524 guid0 = vd0->vdev_guid; 6525 (void) strlcpy(path0, vd0->vdev_path, MAXPATHLEN); 6526 (void) strlcpy(pathrand, vd0->vdev_path, MAXPATHLEN); 6527 6528 leaf = 0; 6529 leaves = 1; 6530 maxfaults = INT_MAX; /* no limit on cache devices */ 6531 } 6532 6533 spa_config_exit(spa, SCL_STATE, FTAG); 6534 (void) pthread_rwlock_unlock(&ztest_name_lock); 6535 6536 /* 6537 * If we can tolerate two or more faults, or we're dealing 6538 * with a slog, randomly online/offline vd0. 6539 */ 6540 if ((maxfaults >= 2 || islog) && guid0 != 0) { 6541 if (ztest_random(10) < 6) { 6542 int flags = (ztest_random(2) == 0 ? 6543 ZFS_OFFLINE_TEMPORARY : 0); 6544 6545 /* 6546 * We have to grab the zs_name_lock as writer to 6547 * prevent a race between offlining a slog and 6548 * destroying a dataset. Offlining the slog will 6549 * grab a reference on the dataset which may cause 6550 * dsl_destroy_head() to fail with EBUSY thus 6551 * leaving the dataset in an inconsistent state. 6552 */ 6553 if (islog) 6554 (void) pthread_rwlock_wrlock(&ztest_name_lock); 6555 6556 VERIFY3U(vdev_offline(spa, guid0, flags), !=, EBUSY); 6557 6558 if (islog) 6559 (void) pthread_rwlock_unlock(&ztest_name_lock); 6560 } else { 6561 /* 6562 * Ideally we would like to be able to randomly 6563 * call vdev_[on|off]line without holding locks 6564 * to force unpredictable failures but the side 6565 * effects of vdev_[on|off]line prevent us from 6566 * doing so. 
6567 */ 6568 (void) vdev_online(spa, guid0, 0, NULL); 6569 } 6570 } 6571 6572 if (maxfaults == 0) 6573 goto out; 6574 6575 /* 6576 * We have at least single-fault tolerance, so inject data corruption. 6577 */ 6578 fd = open(pathrand, O_RDWR); 6579 6580 if (fd == -1) /* we hit a gap in the device namespace */ 6581 goto out; 6582 6583 fsize = lseek(fd, 0, SEEK_END); 6584 6585 while (--iters != 0) { 6586 /* 6587 * The offset must be chosen carefully to ensure that 6588 * we do not inject a given logical block with errors 6589 * on two different leaf devices, because ZFS can not 6590 * tolerate that (if maxfaults==1). 6591 * 6592 * To achieve this we divide each leaf device into 6593 * chunks of size (# leaves * SPA_MAXBLOCKSIZE * 4). 6594 * Each chunk is further divided into error-injection 6595 * ranges (can accept errors) and clear ranges (we do 6596 * not inject errors in those). Each error-injection 6597 * range can accept errors only for a single leaf vdev. 6598 * Error-injection ranges are separated by clear ranges. 6599 * 6600 * For example, with 3 leaves, each chunk looks like: 6601 * 0 to 32M: injection range for leaf 0 6602 * 32M to 64M: clear range - no injection allowed 6603 * 64M to 96M: injection range for leaf 1 6604 * 96M to 128M: clear range - no injection allowed 6605 * 128M to 160M: injection range for leaf 2 6606 * 160M to 192M: clear range - no injection allowed 6607 * 6608 * Each clear range must be large enough such that a 6609 * single block cannot straddle it. This way a block 6610 * can't be a target in two different injection ranges 6611 * (on different leaf vdevs). 6612 */ 6613 offset = ztest_random(fsize / (leaves << bshift)) * 6614 (leaves << bshift) + (leaf << bshift) + 6615 (ztest_random(1ULL << (bshift - 1)) & -8ULL); 6616 6617 /* 6618 * Only allow damage to the labels at one end of the vdev. 6619 * 6620 * If all labels are damaged, the device will be totally 6621 * inaccessible, which will result in loss of data, 6622 * because we also damage (parts of) the other side of 6623 * the mirror/raidz. 6624 * 6625 * Additionally, we will always have both an even and an 6626 * odd label, so that we can handle crashes in the 6627 * middle of vdev_config_sync(). 6628 */ 6629 if ((leaf & 1) == 0 && offset < VDEV_LABEL_START_SIZE) 6630 continue; 6631 6632 /* 6633 * The two end labels are stored at the "end" of the disk, but 6634 * the end of the disk (vdev_psize) is aligned to 6635 * sizeof (vdev_label_t). 
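 * Compute the aligned device size and, on odd-numbered leaves, skip
 * offsets that would land in the trailing label region.  Even-numbered
 * leaves already protect the leading labels above.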
6636 */ 6637 uint64_t psize = P2ALIGN_TYPED(fsize, sizeof (vdev_label_t), 6638 uint64_t); 6639 if ((leaf & 1) == 1 && 6640 offset + sizeof (bad) > psize - VDEV_LABEL_END_SIZE) 6641 continue; 6642 6643 if (mirror_save != zs->zs_mirrors) { 6644 (void) close(fd); 6645 goto out; 6646 } 6647 6648 if (pwrite(fd, &bad, sizeof (bad), offset) != sizeof (bad)) 6649 fatal(B_TRUE, 6650 "can't inject bad word at 0x%"PRIx64" in %s", 6651 offset, pathrand); 6652 6653 if (ztest_opts.zo_verbose >= 7) 6654 (void) printf("injected bad word into %s," 6655 " offset 0x%"PRIx64"\n", pathrand, offset); 6656 6657 injected = B_TRUE; 6658 } 6659 6660 (void) close(fd); 6661 out: 6662 mutex_exit(&ztest_vdev_lock); 6663 6664 if (injected && ztest_opts.zo_raid_do_expand) { 6665 int error = spa_scan(spa, POOL_SCAN_SCRUB); 6666 if (error == 0) { 6667 while (dsl_scan_scrubbing(spa_get_dsl(spa))) 6668 txg_wait_synced(spa_get_dsl(spa), 0); 6669 } 6670 } 6671 6672 umem_free(path0, MAXPATHLEN); 6673 umem_free(pathrand, MAXPATHLEN); 6674 } 6675 6676 /* 6677 * By design ztest will never inject uncorrectable damage in to the pool. 6678 * Issue a scrub, wait for it to complete, and verify there is never any 6679 * persistent damage. 6680 * 6681 * Only after a full scrub has been completed is it safe to start injecting 6682 * data corruption. See the comment in zfs_fault_inject(). 6683 */ 6684 static int 6685 ztest_scrub_impl(spa_t *spa) 6686 { 6687 int error = spa_scan(spa, POOL_SCAN_SCRUB); 6688 if (error) 6689 return (error); 6690 6691 while (dsl_scan_scrubbing(spa_get_dsl(spa))) 6692 txg_wait_synced(spa_get_dsl(spa), 0); 6693 6694 if (spa_approx_errlog_size(spa) > 0) 6695 return (ECKSUM); 6696 6697 ztest_pool_scrubbed = B_TRUE; 6698 6699 return (0); 6700 } 6701 6702 /* 6703 * Scrub the pool. 6704 */ 6705 void 6706 ztest_scrub(ztest_ds_t *zd, uint64_t id) 6707 { 6708 (void) zd, (void) id; 6709 spa_t *spa = ztest_spa; 6710 int error; 6711 6712 /* 6713 * Scrub in progress by device removal. 6714 */ 6715 if (ztest_device_removal_active) 6716 return; 6717 6718 /* 6719 * Start a scrub, wait a moment, then force a restart. 6720 */ 6721 (void) spa_scan(spa, POOL_SCAN_SCRUB); 6722 (void) poll(NULL, 0, 100); 6723 6724 error = ztest_scrub_impl(spa); 6725 if (error == EBUSY) 6726 error = 0; 6727 ASSERT0(error); 6728 } 6729 6730 /* 6731 * Change the guid for the pool. 
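 * Only the pool guid should change; the load guid must stay the same.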
6732 */ 6733 void 6734 ztest_reguid(ztest_ds_t *zd, uint64_t id) 6735 { 6736 (void) zd, (void) id; 6737 spa_t *spa = ztest_spa; 6738 uint64_t orig, load; 6739 int error; 6740 ztest_shared_t *zs = ztest_shared; 6741 6742 if (ztest_opts.zo_mmp_test) 6743 return; 6744 6745 orig = spa_guid(spa); 6746 load = spa_load_guid(spa); 6747 6748 (void) pthread_rwlock_wrlock(&ztest_name_lock); 6749 error = spa_change_guid(spa); 6750 zs->zs_guid = spa_guid(spa); 6751 (void) pthread_rwlock_unlock(&ztest_name_lock); 6752 6753 if (error != 0) 6754 return; 6755 6756 if (ztest_opts.zo_verbose >= 4) { 6757 (void) printf("Changed guid old %"PRIu64" -> %"PRIu64"\n", 6758 orig, spa_guid(spa)); 6759 } 6760 6761 VERIFY3U(orig, !=, spa_guid(spa)); 6762 VERIFY3U(load, ==, spa_load_guid(spa)); 6763 } 6764 6765 void 6766 ztest_blake3(ztest_ds_t *zd, uint64_t id) 6767 { 6768 (void) zd, (void) id; 6769 hrtime_t end = gethrtime() + NANOSEC; 6770 zio_cksum_salt_t salt; 6771 void *salt_ptr = &salt.zcs_bytes; 6772 struct abd *abd_data, *abd_meta; 6773 void *buf, *templ; 6774 int i, *ptr; 6775 uint32_t size; 6776 BLAKE3_CTX ctx; 6777 const zfs_impl_t *blake3 = zfs_impl_get_ops("blake3"); 6778 6779 size = ztest_random_blocksize(); 6780 buf = umem_alloc(size, UMEM_NOFAIL); 6781 abd_data = abd_alloc(size, B_FALSE); 6782 abd_meta = abd_alloc(size, B_TRUE); 6783 6784 for (i = 0, ptr = buf; i < size / sizeof (*ptr); i++, ptr++) 6785 *ptr = ztest_random(UINT_MAX); 6786 memset(salt_ptr, 'A', 32); 6787 6788 abd_copy_from_buf_off(abd_data, buf, 0, size); 6789 abd_copy_from_buf_off(abd_meta, buf, 0, size); 6790 6791 while (gethrtime() <= end) { 6792 int run_count = 100; 6793 zio_cksum_t zc_ref1, zc_ref2; 6794 zio_cksum_t zc_res1, zc_res2; 6795 6796 void *ref1 = &zc_ref1; 6797 void *ref2 = &zc_ref2; 6798 void *res1 = &zc_res1; 6799 void *res2 = &zc_res2; 6800 6801 /* BLAKE3_KEY_LEN = 32 */ 6802 VERIFY0(blake3->setname("generic")); 6803 templ = abd_checksum_blake3_tmpl_init(&salt); 6804 Blake3_InitKeyed(&ctx, salt_ptr); 6805 Blake3_Update(&ctx, buf, size); 6806 Blake3_Final(&ctx, ref1); 6807 zc_ref2 = zc_ref1; 6808 ZIO_CHECKSUM_BSWAP(&zc_ref2); 6809 abd_checksum_blake3_tmpl_free(templ); 6810 6811 VERIFY0(blake3->setname("cycle")); 6812 while (run_count-- > 0) { 6813 6814 /* Test current implementation */ 6815 Blake3_InitKeyed(&ctx, salt_ptr); 6816 Blake3_Update(&ctx, buf, size); 6817 Blake3_Final(&ctx, res1); 6818 zc_res2 = zc_res1; 6819 ZIO_CHECKSUM_BSWAP(&zc_res2); 6820 6821 VERIFY0(memcmp(ref1, res1, 32)); 6822 VERIFY0(memcmp(ref2, res2, 32)); 6823 6824 /* Test ABD - data */ 6825 templ = abd_checksum_blake3_tmpl_init(&salt); 6826 abd_checksum_blake3_native(abd_data, size, 6827 templ, &zc_res1); 6828 abd_checksum_blake3_byteswap(abd_data, size, 6829 templ, &zc_res2); 6830 6831 VERIFY0(memcmp(ref1, res1, 32)); 6832 VERIFY0(memcmp(ref2, res2, 32)); 6833 6834 /* Test ABD - metadata */ 6835 abd_checksum_blake3_native(abd_meta, size, 6836 templ, &zc_res1); 6837 abd_checksum_blake3_byteswap(abd_meta, size, 6838 templ, &zc_res2); 6839 abd_checksum_blake3_tmpl_free(templ); 6840 6841 VERIFY0(memcmp(ref1, res1, 32)); 6842 VERIFY0(memcmp(ref2, res2, 32)); 6843 6844 } 6845 } 6846 6847 abd_free(abd_data); 6848 abd_free(abd_meta); 6849 umem_free(buf, size); 6850 } 6851 6852 void 6853 ztest_fletcher(ztest_ds_t *zd, uint64_t id) 6854 { 6855 (void) zd, (void) id; 6856 hrtime_t end = gethrtime() + NANOSEC; 6857 6858 while (gethrtime() <= end) { 6859 int run_count = 100; 6860 void *buf; 6861 struct abd *abd_data, *abd_meta; 6862 uint32_t size; 6863 int 
*ptr; 6864 int i; 6865 zio_cksum_t zc_ref; 6866 zio_cksum_t zc_ref_byteswap; 6867 6868 size = ztest_random_blocksize(); 6869 6870 buf = umem_alloc(size, UMEM_NOFAIL); 6871 abd_data = abd_alloc(size, B_FALSE); 6872 abd_meta = abd_alloc(size, B_TRUE); 6873 6874 for (i = 0, ptr = buf; i < size / sizeof (*ptr); i++, ptr++) 6875 *ptr = ztest_random(UINT_MAX); 6876 6877 abd_copy_from_buf_off(abd_data, buf, 0, size); 6878 abd_copy_from_buf_off(abd_meta, buf, 0, size); 6879 6880 VERIFY0(fletcher_4_impl_set("scalar")); 6881 fletcher_4_native(buf, size, NULL, &zc_ref); 6882 fletcher_4_byteswap(buf, size, NULL, &zc_ref_byteswap); 6883 6884 VERIFY0(fletcher_4_impl_set("cycle")); 6885 while (run_count-- > 0) { 6886 zio_cksum_t zc; 6887 zio_cksum_t zc_byteswap; 6888 6889 fletcher_4_byteswap(buf, size, NULL, &zc_byteswap); 6890 fletcher_4_native(buf, size, NULL, &zc); 6891 6892 VERIFY0(memcmp(&zc, &zc_ref, sizeof (zc))); 6893 VERIFY0(memcmp(&zc_byteswap, &zc_ref_byteswap, 6894 sizeof (zc_byteswap))); 6895 6896 /* Test ABD - data */ 6897 abd_fletcher_4_byteswap(abd_data, size, NULL, 6898 &zc_byteswap); 6899 abd_fletcher_4_native(abd_data, size, NULL, &zc); 6900 6901 VERIFY0(memcmp(&zc, &zc_ref, sizeof (zc))); 6902 VERIFY0(memcmp(&zc_byteswap, &zc_ref_byteswap, 6903 sizeof (zc_byteswap))); 6904 6905 /* Test ABD - metadata */ 6906 abd_fletcher_4_byteswap(abd_meta, size, NULL, 6907 &zc_byteswap); 6908 abd_fletcher_4_native(abd_meta, size, NULL, &zc); 6909 6910 VERIFY0(memcmp(&zc, &zc_ref, sizeof (zc))); 6911 VERIFY0(memcmp(&zc_byteswap, &zc_ref_byteswap, 6912 sizeof (zc_byteswap))); 6913 6914 } 6915 6916 umem_free(buf, size); 6917 abd_free(abd_data); 6918 abd_free(abd_meta); 6919 } 6920 } 6921 6922 void 6923 ztest_fletcher_incr(ztest_ds_t *zd, uint64_t id) 6924 { 6925 (void) zd, (void) id; 6926 void *buf; 6927 size_t size; 6928 int *ptr; 6929 int i; 6930 zio_cksum_t zc_ref; 6931 zio_cksum_t zc_ref_bswap; 6932 6933 hrtime_t end = gethrtime() + NANOSEC; 6934 6935 while (gethrtime() <= end) { 6936 int run_count = 100; 6937 6938 size = ztest_random_blocksize(); 6939 buf = umem_alloc(size, UMEM_NOFAIL); 6940 6941 for (i = 0, ptr = buf; i < size / sizeof (*ptr); i++, ptr++) 6942 *ptr = ztest_random(UINT_MAX); 6943 6944 VERIFY0(fletcher_4_impl_set("scalar")); 6945 fletcher_4_native(buf, size, NULL, &zc_ref); 6946 fletcher_4_byteswap(buf, size, NULL, &zc_ref_bswap); 6947 6948 VERIFY0(fletcher_4_impl_set("cycle")); 6949 6950 while (run_count-- > 0) { 6951 zio_cksum_t zc; 6952 zio_cksum_t zc_bswap; 6953 size_t pos = 0; 6954 6955 ZIO_SET_CHECKSUM(&zc, 0, 0, 0, 0); 6956 ZIO_SET_CHECKSUM(&zc_bswap, 0, 0, 0, 0); 6957 6958 while (pos < size) { 6959 size_t inc = 64 * ztest_random(size / 67); 6960 /* sometimes add few bytes to test non-simd */ 6961 if (ztest_random(100) < 10) 6962 inc += P2ALIGN_TYPED(ztest_random(64), 6963 sizeof (uint32_t), uint64_t); 6964 6965 if (inc > (size - pos)) 6966 inc = size - pos; 6967 6968 fletcher_4_incremental_native(buf + pos, inc, 6969 &zc); 6970 fletcher_4_incremental_byteswap(buf + pos, inc, 6971 &zc_bswap); 6972 6973 pos += inc; 6974 } 6975 6976 VERIFY3U(pos, ==, size); 6977 6978 VERIFY(ZIO_CHECKSUM_EQUAL(zc, zc_ref)); 6979 VERIFY(ZIO_CHECKSUM_EQUAL(zc_bswap, zc_ref_bswap)); 6980 6981 /* 6982 * verify if incremental on the whole buffer is 6983 * equivalent to non-incremental version 6984 */ 6985 ZIO_SET_CHECKSUM(&zc, 0, 0, 0, 0); 6986 ZIO_SET_CHECKSUM(&zc_bswap, 0, 0, 0, 0); 6987 6988 fletcher_4_incremental_native(buf, size, &zc); 6989 fletcher_4_incremental_byteswap(buf, size, 
&zc_bswap); 6990 6991 VERIFY(ZIO_CHECKSUM_EQUAL(zc, zc_ref)); 6992 VERIFY(ZIO_CHECKSUM_EQUAL(zc_bswap, zc_ref_bswap)); 6993 } 6994 6995 umem_free(buf, size); 6996 } 6997 } 6998 6999 void 7000 ztest_pool_prefetch_ddt(ztest_ds_t *zd, uint64_t id) 7001 { 7002 (void) zd, (void) id; 7003 spa_t *spa; 7004 7005 (void) pthread_rwlock_rdlock(&ztest_name_lock); 7006 VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG)); 7007 7008 ddt_prefetch_all(spa); 7009 7010 spa_close(spa, FTAG); 7011 (void) pthread_rwlock_unlock(&ztest_name_lock); 7012 } 7013 7014 static int 7015 ztest_set_global_vars(void) 7016 { 7017 for (size_t i = 0; i < ztest_opts.zo_gvars_count; i++) { 7018 char *kv = ztest_opts.zo_gvars[i]; 7019 VERIFY3U(strlen(kv), <=, ZO_GVARS_MAX_ARGLEN); 7020 VERIFY3U(strlen(kv), >, 0); 7021 int err = set_global_var(kv); 7022 if (ztest_opts.zo_verbose > 0) { 7023 (void) printf("setting global var %s ... %s\n", kv, 7024 err ? "failed" : "ok"); 7025 } 7026 if (err != 0) { 7027 (void) fprintf(stderr, 7028 "failed to set global var '%s'\n", kv); 7029 return (err); 7030 } 7031 } 7032 return (0); 7033 } 7034 7035 static char ** 7036 ztest_global_vars_to_zdb_args(void) 7037 { 7038 char **args = calloc(2*ztest_opts.zo_gvars_count + 1, sizeof (char *)); 7039 char **cur = args; 7040 if (args == NULL) 7041 return (NULL); 7042 for (size_t i = 0; i < ztest_opts.zo_gvars_count; i++) { 7043 *cur++ = (char *)"-o"; 7044 *cur++ = ztest_opts.zo_gvars[i]; 7045 } 7046 ASSERT3P(cur, ==, &args[2*ztest_opts.zo_gvars_count]); 7047 *cur = NULL; 7048 return (args); 7049 } 7050 7051 /* The end of strings is indicated by a NULL element */ 7052 static char * 7053 join_strings(char **strings, const char *sep) 7054 { 7055 size_t totallen = 0; 7056 for (char **sp = strings; *sp != NULL; sp++) { 7057 totallen += strlen(*sp); 7058 totallen += strlen(sep); 7059 } 7060 if (totallen > 0) { 7061 ASSERT(totallen >= strlen(sep)); 7062 totallen -= strlen(sep); 7063 } 7064 7065 size_t buflen = totallen + 1; 7066 char *o = umem_alloc(buflen, UMEM_NOFAIL); /* trailing 0 byte */ 7067 o[0] = '\0'; 7068 for (char **sp = strings; *sp != NULL; sp++) { 7069 size_t would; 7070 would = strlcat(o, *sp, buflen); 7071 VERIFY3U(would, <, buflen); 7072 if (*(sp+1) == NULL) { 7073 break; 7074 } 7075 would = strlcat(o, sep, buflen); 7076 VERIFY3U(would, <, buflen); 7077 } 7078 ASSERT3S(strlen(o), ==, totallen); 7079 return (o); 7080 } 7081 7082 static int 7083 ztest_check_path(char *path) 7084 { 7085 struct stat s; 7086 /* return true on success */ 7087 return (!stat(path, &s)); 7088 } 7089 7090 static void 7091 ztest_get_zdb_bin(char *bin, int len) 7092 { 7093 char *zdb_path; 7094 /* 7095 * Try to use $ZDB and in-tree zdb path. If not successful, just 7096 * let popen to search through PATH. 
7097 */ 7098 if ((zdb_path = getenv("ZDB"))) { 7099 strlcpy(bin, zdb_path, len); /* In env */ 7100 if (!ztest_check_path(bin)) { 7101 ztest_dump_core = 0; 7102 fatal(B_TRUE, "invalid ZDB '%s'", bin); 7103 } 7104 return; 7105 } 7106 7107 VERIFY3P(realpath(getexecname(), bin), !=, NULL); 7108 if (strstr(bin, ".libs/ztest")) { 7109 strstr(bin, ".libs/ztest")[0] = '\0'; /* In-tree */ 7110 strcat(bin, "zdb"); 7111 if (ztest_check_path(bin)) 7112 return; 7113 } 7114 strcpy(bin, "zdb"); 7115 } 7116 7117 static vdev_t * 7118 ztest_random_concrete_vdev_leaf(vdev_t *vd) 7119 { 7120 if (vd == NULL) 7121 return (NULL); 7122 7123 if (vd->vdev_children == 0) 7124 return (vd); 7125 7126 vdev_t *eligible[vd->vdev_children]; 7127 int eligible_idx = 0, i; 7128 for (i = 0; i < vd->vdev_children; i++) { 7129 vdev_t *cvd = vd->vdev_child[i]; 7130 if (cvd->vdev_top->vdev_removing) 7131 continue; 7132 if (cvd->vdev_children > 0 || 7133 (vdev_is_concrete(cvd) && !cvd->vdev_detached)) { 7134 eligible[eligible_idx++] = cvd; 7135 } 7136 } 7137 VERIFY3S(eligible_idx, >, 0); 7138 7139 uint64_t child_no = ztest_random(eligible_idx); 7140 return (ztest_random_concrete_vdev_leaf(eligible[child_no])); 7141 } 7142 7143 void 7144 ztest_initialize(ztest_ds_t *zd, uint64_t id) 7145 { 7146 (void) zd, (void) id; 7147 spa_t *spa = ztest_spa; 7148 int error = 0; 7149 7150 mutex_enter(&ztest_vdev_lock); 7151 7152 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 7153 7154 /* Random leaf vdev */ 7155 vdev_t *rand_vd = ztest_random_concrete_vdev_leaf(spa->spa_root_vdev); 7156 if (rand_vd == NULL) { 7157 spa_config_exit(spa, SCL_VDEV, FTAG); 7158 mutex_exit(&ztest_vdev_lock); 7159 return; 7160 } 7161 7162 /* 7163 * The random vdev we've selected may change as soon as we 7164 * drop the spa_config_lock. We create local copies of things 7165 * we're interested in. 
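 * Here that means the guid, the vdev path, and whether an initialize
 * is already active on the vdev.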
7166 */ 7167 uint64_t guid = rand_vd->vdev_guid; 7168 char *path = strdup(rand_vd->vdev_path); 7169 boolean_t active = rand_vd->vdev_initialize_thread != NULL; 7170 7171 zfs_dbgmsg("vd %px, guid %llu", rand_vd, (u_longlong_t)guid); 7172 spa_config_exit(spa, SCL_VDEV, FTAG); 7173 7174 uint64_t cmd = ztest_random(POOL_INITIALIZE_FUNCS); 7175 7176 nvlist_t *vdev_guids = fnvlist_alloc(); 7177 nvlist_t *vdev_errlist = fnvlist_alloc(); 7178 fnvlist_add_uint64(vdev_guids, path, guid); 7179 error = spa_vdev_initialize(spa, vdev_guids, cmd, vdev_errlist); 7180 fnvlist_free(vdev_guids); 7181 fnvlist_free(vdev_errlist); 7182 7183 switch (cmd) { 7184 case POOL_INITIALIZE_CANCEL: 7185 if (ztest_opts.zo_verbose >= 4) { 7186 (void) printf("Cancel initialize %s", path); 7187 if (!active) 7188 (void) printf(" failed (no initialize active)"); 7189 (void) printf("\n"); 7190 } 7191 break; 7192 case POOL_INITIALIZE_START: 7193 if (ztest_opts.zo_verbose >= 4) { 7194 (void) printf("Start initialize %s", path); 7195 if (active && error == 0) 7196 (void) printf(" failed (already active)"); 7197 else if (error != 0) 7198 (void) printf(" failed (error %d)", error); 7199 (void) printf("\n"); 7200 } 7201 break; 7202 case POOL_INITIALIZE_SUSPEND: 7203 if (ztest_opts.zo_verbose >= 4) { 7204 (void) printf("Suspend initialize %s", path); 7205 if (!active) 7206 (void) printf(" failed (no initialize active)"); 7207 (void) printf("\n"); 7208 } 7209 break; 7210 } 7211 free(path); 7212 mutex_exit(&ztest_vdev_lock); 7213 } 7214 7215 void 7216 ztest_trim(ztest_ds_t *zd, uint64_t id) 7217 { 7218 (void) zd, (void) id; 7219 spa_t *spa = ztest_spa; 7220 int error = 0; 7221 7222 mutex_enter(&ztest_vdev_lock); 7223 7224 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 7225 7226 /* Random leaf vdev */ 7227 vdev_t *rand_vd = ztest_random_concrete_vdev_leaf(spa->spa_root_vdev); 7228 if (rand_vd == NULL) { 7229 spa_config_exit(spa, SCL_VDEV, FTAG); 7230 mutex_exit(&ztest_vdev_lock); 7231 return; 7232 } 7233 7234 /* 7235 * The random vdev we've selected may change as soon as we 7236 * drop the spa_config_lock. We create local copies of things 7237 * we're interested in. 
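 * Here that means the guid, the vdev path, and whether a TRIM is
 * already active on the vdev.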
7238 */ 7239 uint64_t guid = rand_vd->vdev_guid; 7240 char *path = strdup(rand_vd->vdev_path); 7241 boolean_t active = rand_vd->vdev_trim_thread != NULL; 7242 7243 zfs_dbgmsg("vd %p, guid %llu", rand_vd, (u_longlong_t)guid); 7244 spa_config_exit(spa, SCL_VDEV, FTAG); 7245 7246 uint64_t cmd = ztest_random(POOL_TRIM_FUNCS); 7247 uint64_t rate = 1 << ztest_random(30); 7248 boolean_t partial = (ztest_random(5) > 0); 7249 boolean_t secure = (ztest_random(5) > 0); 7250 7251 nvlist_t *vdev_guids = fnvlist_alloc(); 7252 nvlist_t *vdev_errlist = fnvlist_alloc(); 7253 fnvlist_add_uint64(vdev_guids, path, guid); 7254 error = spa_vdev_trim(spa, vdev_guids, cmd, rate, partial, 7255 secure, vdev_errlist); 7256 fnvlist_free(vdev_guids); 7257 fnvlist_free(vdev_errlist); 7258 7259 switch (cmd) { 7260 case POOL_TRIM_CANCEL: 7261 if (ztest_opts.zo_verbose >= 4) { 7262 (void) printf("Cancel TRIM %s", path); 7263 if (!active) 7264 (void) printf(" failed (no TRIM active)"); 7265 (void) printf("\n"); 7266 } 7267 break; 7268 case POOL_TRIM_START: 7269 if (ztest_opts.zo_verbose >= 4) { 7270 (void) printf("Start TRIM %s", path); 7271 if (active && error == 0) 7272 (void) printf(" failed (already active)"); 7273 else if (error != 0) 7274 (void) printf(" failed (error %d)", error); 7275 (void) printf("\n"); 7276 } 7277 break; 7278 case POOL_TRIM_SUSPEND: 7279 if (ztest_opts.zo_verbose >= 4) { 7280 (void) printf("Suspend TRIM %s", path); 7281 if (!active) 7282 (void) printf(" failed (no TRIM active)"); 7283 (void) printf("\n"); 7284 } 7285 break; 7286 } 7287 free(path); 7288 mutex_exit(&ztest_vdev_lock); 7289 } 7290 7291 /* 7292 * Verify pool integrity by running zdb. 7293 */ 7294 static void 7295 ztest_run_zdb(uint64_t guid) 7296 { 7297 int status; 7298 char *bin; 7299 char *zdb; 7300 char *zbuf; 7301 const int len = MAXPATHLEN + MAXNAMELEN + 20; 7302 FILE *fp; 7303 7304 bin = umem_alloc(len, UMEM_NOFAIL); 7305 zdb = umem_alloc(len, UMEM_NOFAIL); 7306 zbuf = umem_alloc(1024, UMEM_NOFAIL); 7307 7308 ztest_get_zdb_bin(bin, len); 7309 7310 char **set_gvars_args = ztest_global_vars_to_zdb_args(); 7311 if (set_gvars_args == NULL) { 7312 fatal(B_FALSE, "Failed to allocate memory in " 7313 "ztest_global_vars_to_zdb_args(). Cannot run zdb.\n"); 7314 } 7315 char *set_gvars_args_joined = join_strings(set_gvars_args, " "); 7316 free(set_gvars_args); 7317 7318 size_t would = snprintf(zdb, len, 7319 "%s -bcc%s%s -G -d -Y -e -y %s -p %s %"PRIu64, 7320 bin, 7321 ztest_opts.zo_verbose >= 3 ? "s" : "", 7322 ztest_opts.zo_verbose >= 4 ? 
"v" : "", 7323 set_gvars_args_joined, 7324 ztest_opts.zo_dir, 7325 guid); 7326 ASSERT3U(would, <, len); 7327 7328 umem_free(set_gvars_args_joined, strlen(set_gvars_args_joined) + 1); 7329 7330 if (ztest_opts.zo_verbose >= 5) 7331 (void) printf("Executing %s\n", zdb); 7332 7333 fp = popen(zdb, "r"); 7334 7335 while (fgets(zbuf, 1024, fp) != NULL) 7336 if (ztest_opts.zo_verbose >= 3) 7337 (void) printf("%s", zbuf); 7338 7339 status = pclose(fp); 7340 7341 if (status == 0) 7342 goto out; 7343 7344 ztest_dump_core = 0; 7345 if (WIFEXITED(status)) 7346 fatal(B_FALSE, "'%s' exit code %d", zdb, WEXITSTATUS(status)); 7347 else 7348 fatal(B_FALSE, "'%s' died with signal %d", 7349 zdb, WTERMSIG(status)); 7350 out: 7351 umem_free(bin, len); 7352 umem_free(zdb, len); 7353 umem_free(zbuf, 1024); 7354 } 7355 7356 static void 7357 ztest_walk_pool_directory(const char *header) 7358 { 7359 spa_t *spa = NULL; 7360 7361 if (ztest_opts.zo_verbose >= 6) 7362 (void) puts(header); 7363 7364 mutex_enter(&spa_namespace_lock); 7365 while ((spa = spa_next(spa)) != NULL) 7366 if (ztest_opts.zo_verbose >= 6) 7367 (void) printf("\t%s\n", spa_name(spa)); 7368 mutex_exit(&spa_namespace_lock); 7369 } 7370 7371 static void 7372 ztest_spa_import_export(char *oldname, char *newname) 7373 { 7374 nvlist_t *config, *newconfig; 7375 uint64_t pool_guid; 7376 spa_t *spa; 7377 int error; 7378 7379 if (ztest_opts.zo_verbose >= 4) { 7380 (void) printf("import/export: old = %s, new = %s\n", 7381 oldname, newname); 7382 } 7383 7384 /* 7385 * Clean up from previous runs. 7386 */ 7387 (void) spa_destroy(newname); 7388 7389 /* 7390 * Get the pool's configuration and guid. 7391 */ 7392 VERIFY0(spa_open(oldname, &spa, FTAG)); 7393 7394 /* 7395 * Kick off a scrub to tickle scrub/export races. 7396 */ 7397 if (ztest_random(2) == 0) 7398 (void) spa_scan(spa, POOL_SCAN_SCRUB); 7399 7400 pool_guid = spa_guid(spa); 7401 spa_close(spa, FTAG); 7402 7403 ztest_walk_pool_directory("pools before export"); 7404 7405 /* 7406 * Export it. 7407 */ 7408 VERIFY0(spa_export(oldname, &config, B_FALSE, B_FALSE)); 7409 7410 ztest_walk_pool_directory("pools after export"); 7411 7412 /* 7413 * Try to import it. 7414 */ 7415 newconfig = spa_tryimport(config); 7416 ASSERT3P(newconfig, !=, NULL); 7417 fnvlist_free(newconfig); 7418 7419 /* 7420 * Import it under the new name. 7421 */ 7422 error = spa_import(newname, config, NULL, 0); 7423 if (error != 0) { 7424 dump_nvlist(config, 0); 7425 fatal(B_FALSE, "couldn't import pool %s as %s: error %u", 7426 oldname, newname, error); 7427 } 7428 7429 ztest_walk_pool_directory("pools after import"); 7430 7431 /* 7432 * Try to import it again -- should fail with EEXIST. 7433 */ 7434 VERIFY3U(EEXIST, ==, spa_import(newname, config, NULL, 0)); 7435 7436 /* 7437 * Try to import it under a different name -- should fail with EEXIST. 7438 */ 7439 VERIFY3U(EEXIST, ==, spa_import(oldname, config, NULL, 0)); 7440 7441 /* 7442 * Verify that the pool is no longer visible under the old name. 7443 */ 7444 VERIFY3U(ENOENT, ==, spa_open(oldname, &spa, FTAG)); 7445 7446 /* 7447 * Verify that we can open and close the pool using the new name. 
7448 */ 7449 VERIFY0(spa_open(newname, &spa, FTAG)); 7450 ASSERT3U(pool_guid, ==, spa_guid(spa)); 7451 spa_close(spa, FTAG); 7452 7453 fnvlist_free(config); 7454 } 7455 7456 static void 7457 ztest_resume(spa_t *spa) 7458 { 7459 if (spa_suspended(spa) && ztest_opts.zo_verbose >= 6) 7460 (void) printf("resuming from suspended state\n"); 7461 spa_vdev_state_enter(spa, SCL_NONE); 7462 vdev_clear(spa, NULL); 7463 (void) spa_vdev_state_exit(spa, NULL, 0); 7464 (void) zio_resume(spa); 7465 } 7466 7467 static __attribute__((noreturn)) void 7468 ztest_resume_thread(void *arg) 7469 { 7470 spa_t *spa = arg; 7471 7472 while (!ztest_exiting) { 7473 if (spa_suspended(spa)) 7474 ztest_resume(spa); 7475 (void) poll(NULL, 0, 100); 7476 7477 /* 7478 * Periodically change the zfs_compressed_arc_enabled setting. 7479 */ 7480 if (ztest_random(10) == 0) 7481 zfs_compressed_arc_enabled = ztest_random(2); 7482 7483 /* 7484 * Periodically change the zfs_abd_scatter_enabled setting. 7485 */ 7486 if (ztest_random(10) == 0) 7487 zfs_abd_scatter_enabled = ztest_random(2); 7488 } 7489 7490 thread_exit(); 7491 } 7492 7493 static __attribute__((noreturn)) void 7494 ztest_deadman_thread(void *arg) 7495 { 7496 ztest_shared_t *zs = arg; 7497 spa_t *spa = ztest_spa; 7498 hrtime_t delay, overdue, last_run = gethrtime(); 7499 7500 delay = (zs->zs_thread_stop - zs->zs_thread_start) + 7501 MSEC2NSEC(zfs_deadman_synctime_ms); 7502 7503 while (!ztest_exiting) { 7504 /* 7505 * Wait for the delay timer while checking occasionally 7506 * if we should stop. 7507 */ 7508 if (gethrtime() < last_run + delay) { 7509 (void) poll(NULL, 0, 1000); 7510 continue; 7511 } 7512 7513 /* 7514 * If the pool is suspended then fail immediately. Otherwise, 7515 * check to see if the pool is making any progress. If 7516 * vdev_deadman() discovers that there hasn't been any recent 7517 * I/Os then it will end up aborting the tests. 7518 */ 7519 if (spa_suspended(spa) || spa->spa_root_vdev == NULL) { 7520 fatal(B_FALSE, 7521 "aborting test after %llu seconds because " 7522 "pool has transitioned to a suspended state.", 7523 (u_longlong_t)zfs_deadman_synctime_ms / 1000); 7524 } 7525 vdev_deadman(spa->spa_root_vdev, FTAG); 7526 7527 /* 7528 * If the process doesn't complete within a grace period of 7529 * zfs_deadman_synctime_ms over the expected finish time, 7530 * then it may be hung and is terminated. 
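 * Otherwise report how long ztest has been running and re-arm the
 * timer using the shorter deadman check interval.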
7531 */ 7532 overdue = zs->zs_proc_stop + MSEC2NSEC(zfs_deadman_synctime_ms); 7533 if (gethrtime() > overdue) { 7534 fatal(B_FALSE, 7535 "aborting test after %llu seconds because " 7536 "the process is overdue for termination.", 7537 (gethrtime() - zs->zs_proc_start) / NANOSEC); 7538 } 7539 7540 (void) printf("ztest has been running for %lld seconds\n", 7541 (gethrtime() - zs->zs_proc_start) / NANOSEC); 7542 7543 last_run = gethrtime(); 7544 delay = MSEC2NSEC(zfs_deadman_checktime_ms); 7545 } 7546 7547 thread_exit(); 7548 } 7549 7550 static void 7551 ztest_execute(int test, ztest_info_t *zi, uint64_t id) 7552 { 7553 ztest_ds_t *zd = &ztest_ds[id % ztest_opts.zo_datasets]; 7554 ztest_shared_callstate_t *zc = ZTEST_GET_SHARED_CALLSTATE(test); 7555 hrtime_t functime = gethrtime(); 7556 int i; 7557 7558 for (i = 0; i < zi->zi_iters; i++) 7559 zi->zi_func(zd, id); 7560 7561 functime = gethrtime() - functime; 7562 7563 atomic_add_64(&zc->zc_count, 1); 7564 atomic_add_64(&zc->zc_time, functime); 7565 7566 if (ztest_opts.zo_verbose >= 4) 7567 (void) printf("%6.2f sec in %s\n", 7568 (double)functime / NANOSEC, zi->zi_funcname); 7569 } 7570 7571 typedef struct ztest_raidz_expand_io { 7572 uint64_t rzx_id; 7573 uint64_t rzx_amount; 7574 uint64_t rzx_bufsize; 7575 const void *rzx_buffer; 7576 uint64_t rzx_alloc_max; 7577 spa_t *rzx_spa; 7578 } ztest_expand_io_t; 7579 7580 #undef OD_ARRAY_SIZE 7581 #define OD_ARRAY_SIZE 10 7582 7583 /* 7584 * Write a request amount of data to some dataset objects. 7585 * There will be ztest_opts.zo_threads count of these running in parallel. 7586 */ 7587 static __attribute__((noreturn)) void 7588 ztest_rzx_thread(void *arg) 7589 { 7590 ztest_expand_io_t *info = (ztest_expand_io_t *)arg; 7591 ztest_od_t *od; 7592 int batchsize; 7593 int od_size; 7594 ztest_ds_t *zd = &ztest_ds[info->rzx_id % ztest_opts.zo_datasets]; 7595 spa_t *spa = info->rzx_spa; 7596 7597 od_size = sizeof (ztest_od_t) * OD_ARRAY_SIZE; 7598 od = umem_alloc(od_size, UMEM_NOFAIL); 7599 batchsize = OD_ARRAY_SIZE; 7600 7601 /* Create objects to write to */ 7602 for (int b = 0; b < batchsize; b++) { 7603 ztest_od_init(od + b, info->rzx_id, FTAG, b, 7604 DMU_OT_UINT64_OTHER, 0, 0, 0); 7605 } 7606 if (ztest_object_init(zd, od, od_size, B_FALSE) != 0) { 7607 umem_free(od, od_size); 7608 thread_exit(); 7609 } 7610 7611 for (uint64_t offset = 0, written = 0; written < info->rzx_amount; 7612 offset += info->rzx_bufsize) { 7613 /* write to 10 objects */ 7614 for (int i = 0; i < batchsize && written < info->rzx_amount; 7615 i++) { 7616 (void) pthread_rwlock_rdlock(&zd->zd_zilog_lock); 7617 ztest_write(zd, od[i].od_object, offset, 7618 info->rzx_bufsize, info->rzx_buffer); 7619 (void) pthread_rwlock_unlock(&zd->zd_zilog_lock); 7620 written += info->rzx_bufsize; 7621 } 7622 txg_wait_synced(spa_get_dsl(spa), 0); 7623 /* due to inflation, we'll typically bail here */ 7624 if (metaslab_class_get_alloc(spa_normal_class(spa)) > 7625 info->rzx_alloc_max) { 7626 break; 7627 } 7628 } 7629 7630 /* Remove a few objects to leave some holes in allocation space */ 7631 mutex_enter(&zd->zd_dirobj_lock); 7632 (void) ztest_remove(zd, od, 2); 7633 mutex_exit(&zd->zd_dirobj_lock); 7634 7635 umem_free(od, od_size); 7636 7637 thread_exit(); 7638 } 7639 7640 static __attribute__((noreturn)) void 7641 ztest_thread(void *arg) 7642 { 7643 int rand; 7644 uint64_t id = (uintptr_t)arg; 7645 ztest_shared_t *zs = ztest_shared; 7646 uint64_t call_next; 7647 hrtime_t now; 7648 ztest_info_t *zi; 7649 ztest_shared_callstate_t *zc; 7650 7651 while 
((now = gethrtime()) < zs->zs_thread_stop) { 7652 /* 7653 * See if it's time to force a crash. 7654 */ 7655 if (now > zs->zs_thread_kill && 7656 raidz_expand_pause_point == RAIDZ_EXPAND_PAUSE_NONE) { 7657 ztest_kill(zs); 7658 } 7659 7660 /* 7661 * If we're getting ENOSPC with some regularity, stop. 7662 */ 7663 if (zs->zs_enospc_count > 10) 7664 break; 7665 7666 /* 7667 * Pick a random function to execute. 7668 */ 7669 rand = ztest_random(ZTEST_FUNCS); 7670 zi = &ztest_info[rand]; 7671 zc = ZTEST_GET_SHARED_CALLSTATE(rand); 7672 call_next = zc->zc_next; 7673 7674 if (now >= call_next && 7675 atomic_cas_64(&zc->zc_next, call_next, call_next + 7676 ztest_random(2 * zi->zi_interval[0] + 1)) == call_next) { 7677 ztest_execute(rand, zi, id); 7678 } 7679 } 7680 7681 thread_exit(); 7682 } 7683 7684 static void 7685 ztest_dataset_name(char *dsname, const char *pool, int d) 7686 { 7687 (void) snprintf(dsname, ZFS_MAX_DATASET_NAME_LEN, "%s/ds_%d", pool, d); 7688 } 7689 7690 static void 7691 ztest_dataset_destroy(int d) 7692 { 7693 char name[ZFS_MAX_DATASET_NAME_LEN]; 7694 int t; 7695 7696 ztest_dataset_name(name, ztest_opts.zo_pool, d); 7697 7698 if (ztest_opts.zo_verbose >= 3) 7699 (void) printf("Destroying %s to free up space\n", name); 7700 7701 /* 7702 * Cleanup any non-standard clones and snapshots. In general, 7703 * ztest thread t operates on dataset (t % zopt_datasets), 7704 * so there may be more than one thing to clean up. 7705 */ 7706 for (t = d; t < ztest_opts.zo_threads; 7707 t += ztest_opts.zo_datasets) 7708 ztest_dsl_dataset_cleanup(name, t); 7709 7710 (void) dmu_objset_find(name, ztest_objset_destroy_cb, NULL, 7711 DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN); 7712 } 7713 7714 static void 7715 ztest_dataset_dirobj_verify(ztest_ds_t *zd) 7716 { 7717 uint64_t usedobjs, dirobjs, scratch; 7718 7719 /* 7720 * ZTEST_DIROBJ is the object directory for the entire dataset. 7721 * Therefore, the number of objects in use should equal the 7722 * number of ZTEST_DIROBJ entries, +1 for ZTEST_DIROBJ itself. 7723 * If not, we have an object leak. 7724 * 7725 * Note that we can only check this in ztest_dataset_open(), 7726 * when the open-context and syncing-context values agree. 7727 * That's because zap_count() returns the open-context value, 7728 * while dmu_objset_space() returns the rootbp fill count. 
7729 */ 7730 VERIFY0(zap_count(zd->zd_os, ZTEST_DIROBJ, &dirobjs)); 7731 dmu_objset_space(zd->zd_os, &scratch, &scratch, &usedobjs, &scratch); 7732 ASSERT3U(dirobjs + 1, ==, usedobjs); 7733 } 7734 7735 static int 7736 ztest_dataset_open(int d) 7737 { 7738 ztest_ds_t *zd = &ztest_ds[d]; 7739 uint64_t committed_seq = ZTEST_GET_SHARED_DS(d)->zd_seq; 7740 objset_t *os; 7741 zilog_t *zilog; 7742 char name[ZFS_MAX_DATASET_NAME_LEN]; 7743 int error; 7744 7745 ztest_dataset_name(name, ztest_opts.zo_pool, d); 7746 7747 (void) pthread_rwlock_rdlock(&ztest_name_lock); 7748 7749 error = ztest_dataset_create(name); 7750 if (error == ENOSPC) { 7751 (void) pthread_rwlock_unlock(&ztest_name_lock); 7752 ztest_record_enospc(FTAG); 7753 return (error); 7754 } 7755 ASSERT(error == 0 || error == EEXIST); 7756 7757 VERIFY0(ztest_dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, 7758 B_TRUE, zd, &os)); 7759 (void) pthread_rwlock_unlock(&ztest_name_lock); 7760 7761 ztest_zd_init(zd, ZTEST_GET_SHARED_DS(d), os); 7762 7763 zilog = zd->zd_zilog; 7764 7765 if (zilog->zl_header->zh_claim_lr_seq != 0 && 7766 zilog->zl_header->zh_claim_lr_seq < committed_seq) 7767 fatal(B_FALSE, "missing log records: " 7768 "claimed %"PRIu64" < committed %"PRIu64"", 7769 zilog->zl_header->zh_claim_lr_seq, committed_seq); 7770 7771 ztest_dataset_dirobj_verify(zd); 7772 7773 zil_replay(os, zd, ztest_replay_vector); 7774 7775 ztest_dataset_dirobj_verify(zd); 7776 7777 if (ztest_opts.zo_verbose >= 6) 7778 (void) printf("%s replay %"PRIu64" blocks, " 7779 "%"PRIu64" records, seq %"PRIu64"\n", 7780 zd->zd_name, 7781 zilog->zl_parse_blk_count, 7782 zilog->zl_parse_lr_count, 7783 zilog->zl_replaying_seq); 7784 7785 zilog = zil_open(os, ztest_get_data, NULL); 7786 7787 if (zilog->zl_replaying_seq != 0 && 7788 zilog->zl_replaying_seq < committed_seq) 7789 fatal(B_FALSE, "missing log records: " 7790 "replayed %"PRIu64" < committed %"PRIu64"", 7791 zilog->zl_replaying_seq, committed_seq); 7792 7793 return (0); 7794 } 7795 7796 static void 7797 ztest_dataset_close(int d) 7798 { 7799 ztest_ds_t *zd = &ztest_ds[d]; 7800 7801 zil_close(zd->zd_zilog); 7802 dmu_objset_disown(zd->zd_os, B_TRUE, zd); 7803 7804 ztest_zd_fini(zd); 7805 } 7806 7807 static int 7808 ztest_replay_zil_cb(const char *name, void *arg) 7809 { 7810 (void) arg; 7811 objset_t *os; 7812 ztest_ds_t *zdtmp; 7813 7814 VERIFY0(ztest_dmu_objset_own(name, DMU_OST_ANY, B_TRUE, 7815 B_TRUE, FTAG, &os)); 7816 7817 zdtmp = umem_alloc(sizeof (ztest_ds_t), UMEM_NOFAIL); 7818 7819 ztest_zd_init(zdtmp, NULL, os); 7820 zil_replay(os, zdtmp, ztest_replay_vector); 7821 ztest_zd_fini(zdtmp); 7822 7823 if (dmu_objset_zil(os)->zl_parse_lr_count != 0 && 7824 ztest_opts.zo_verbose >= 6) { 7825 zilog_t *zilog = dmu_objset_zil(os); 7826 7827 (void) printf("%s replay %"PRIu64" blocks, " 7828 "%"PRIu64" records, seq %"PRIu64"\n", 7829 name, 7830 zilog->zl_parse_blk_count, 7831 zilog->zl_parse_lr_count, 7832 zilog->zl_replaying_seq); 7833 } 7834 7835 umem_free(zdtmp, sizeof (ztest_ds_t)); 7836 7837 dmu_objset_disown(os, B_TRUE, FTAG); 7838 return (0); 7839 } 7840 7841 static void 7842 ztest_freeze(void) 7843 { 7844 ztest_ds_t *zd = &ztest_ds[0]; 7845 spa_t *spa; 7846 int numloops = 0; 7847 7848 /* freeze not supported during RAIDZ expansion */ 7849 if (ztest_opts.zo_raid_do_expand) 7850 return; 7851 7852 if (ztest_opts.zo_verbose >= 3) 7853 (void) printf("testing spa_freeze()...\n"); 7854 7855 raidz_scratch_verify(); 7856 kernel_init(SPA_MODE_READ | SPA_MODE_WRITE); 7857 VERIFY0(spa_open(ztest_opts.zo_pool, &spa, 
FTAG)); 7858 VERIFY0(ztest_dataset_open(0)); 7859 ztest_spa = spa; 7860 7861 /* 7862 * Force the first log block to be transactionally allocated. 7863 * We have to do this before we freeze the pool -- otherwise 7864 * the log chain won't be anchored. 7865 */ 7866 while (BP_IS_HOLE(&zd->zd_zilog->zl_header->zh_log)) { 7867 ztest_dmu_object_alloc_free(zd, 0); 7868 zil_commit(zd->zd_zilog, 0); 7869 } 7870 7871 txg_wait_synced(spa_get_dsl(spa), 0); 7872 7873 /* 7874 * Freeze the pool. This stops spa_sync() from doing anything, 7875 * so that the only way to record changes from now on is the ZIL. 7876 */ 7877 spa_freeze(spa); 7878 7879 /* 7880 * Because it is hard to predict how much space a write will actually 7881 * require beforehand, we leave ourselves some fudge space to write over 7882 * capacity. 7883 */ 7884 uint64_t capacity = metaslab_class_get_space(spa_normal_class(spa)) / 2; 7885 7886 /* 7887 * Run tests that generate log records but don't alter the pool config 7888 * or depend on DSL sync tasks (snapshots, objset create/destroy, etc). 7889 * We do a txg_wait_synced() after each iteration to force the txg 7890 * to increase well beyond the last synced value in the uberblock. 7891 * The ZIL should be OK with that. 7892 * 7893 * Run a random number of times less than zo_maxloops and ensure we do 7894 * not run out of space on the pool. 7895 */ 7896 while (ztest_random(10) != 0 && 7897 numloops++ < ztest_opts.zo_maxloops && 7898 metaslab_class_get_alloc(spa_normal_class(spa)) < capacity) { 7899 ztest_od_t od; 7900 ztest_od_init(&od, 0, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0, 0); 7901 VERIFY0(ztest_object_init(zd, &od, sizeof (od), B_FALSE)); 7902 ztest_io(zd, od.od_object, 7903 ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT); 7904 txg_wait_synced(spa_get_dsl(spa), 0); 7905 } 7906 7907 /* 7908 * Commit all of the changes we just generated. 7909 */ 7910 zil_commit(zd->zd_zilog, 0); 7911 txg_wait_synced(spa_get_dsl(spa), 0); 7912 7913 /* 7914 * Close our dataset and close the pool. 7915 */ 7916 ztest_dataset_close(0); 7917 spa_close(spa, FTAG); 7918 kernel_fini(); 7919 7920 /* 7921 * Open and close the pool and dataset to induce log replay. 7922 */ 7923 raidz_scratch_verify(); 7924 kernel_init(SPA_MODE_READ | SPA_MODE_WRITE); 7925 VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG)); 7926 ASSERT3U(spa_freeze_txg(spa), ==, UINT64_MAX); 7927 VERIFY0(ztest_dataset_open(0)); 7928 ztest_spa = spa; 7929 txg_wait_synced(spa_get_dsl(spa), 0); 7930 ztest_dataset_close(0); 7931 ztest_reguid(NULL, 0); 7932 7933 spa_close(spa, FTAG); 7934 kernel_fini(); 7935 } 7936 7937 static void 7938 ztest_import_impl(void) 7939 { 7940 importargs_t args = { 0 }; 7941 nvlist_t *cfg = NULL; 7942 int nsearch = 1; 7943 char *searchdirs[nsearch]; 7944 int flags = ZFS_IMPORT_MISSING_LOG; 7945 7946 searchdirs[0] = ztest_opts.zo_dir; 7947 args.paths = nsearch; 7948 args.path = searchdirs; 7949 args.can_be_active = B_FALSE; 7950 7951 libpc_handle_t lpch = { 7952 .lpc_lib_handle = NULL, 7953 .lpc_ops = &libzpool_config_ops, 7954 .lpc_printerr = B_TRUE 7955 }; 7956 VERIFY0(zpool_find_config(&lpch, ztest_opts.zo_pool, &cfg, &args)); 7957 VERIFY0(spa_import(ztest_opts.zo_pool, cfg, NULL, flags)); 7958 fnvlist_free(cfg); 7959 } 7960 7961 /* 7962 * Import a storage pool with the given name. 
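 * On success, record the metaslab size and pool guid in the shared
 * state, and (unless this is an MMP test) verify the pool with zdb
 * before and after the freeze test.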
7963 */ 7964 static void 7965 ztest_import(ztest_shared_t *zs) 7966 { 7967 spa_t *spa; 7968 7969 mutex_init(&ztest_vdev_lock, NULL, MUTEX_DEFAULT, NULL); 7970 mutex_init(&ztest_checkpoint_lock, NULL, MUTEX_DEFAULT, NULL); 7971 VERIFY0(pthread_rwlock_init(&ztest_name_lock, NULL)); 7972 7973 raidz_scratch_verify(); 7974 kernel_init(SPA_MODE_READ | SPA_MODE_WRITE); 7975 7976 ztest_import_impl(); 7977 7978 VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG)); 7979 zs->zs_metaslab_sz = 7980 1ULL << spa->spa_root_vdev->vdev_child[0]->vdev_ms_shift; 7981 zs->zs_guid = spa_guid(spa); 7982 spa_close(spa, FTAG); 7983 7984 kernel_fini(); 7985 7986 if (!ztest_opts.zo_mmp_test) { 7987 ztest_run_zdb(zs->zs_guid); 7988 ztest_freeze(); 7989 ztest_run_zdb(zs->zs_guid); 7990 } 7991 7992 (void) pthread_rwlock_destroy(&ztest_name_lock); 7993 mutex_destroy(&ztest_vdev_lock); 7994 mutex_destroy(&ztest_checkpoint_lock); 7995 } 7996 7997 /* 7998 * After the expansion was killed, check that the pool is healthy 7999 */ 8000 static void 8001 ztest_raidz_expand_check(spa_t *spa) 8002 { 8003 ASSERT3U(ztest_opts.zo_raidz_expand_test, ==, RAIDZ_EXPAND_KILLED); 8004 /* 8005 * Set pool check done flag, main program will run a zdb check 8006 * of the pool when we exit. 8007 */ 8008 ztest_shared_opts->zo_raidz_expand_test = RAIDZ_EXPAND_CHECKED; 8009 8010 /* Wait for reflow to finish */ 8011 if (ztest_opts.zo_verbose >= 1) { 8012 (void) printf("\nwaiting for reflow to finish ...\n"); 8013 } 8014 pool_raidz_expand_stat_t rzx_stats; 8015 pool_raidz_expand_stat_t *pres = &rzx_stats; 8016 do { 8017 txg_wait_synced(spa_get_dsl(spa), 0); 8018 (void) poll(NULL, 0, 500); /* wait 1/2 second */ 8019 8020 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 8021 (void) spa_raidz_expand_get_stats(spa, pres); 8022 spa_config_exit(spa, SCL_CONFIG, FTAG); 8023 } while (pres->pres_state != DSS_FINISHED && 8024 pres->pres_reflowed < pres->pres_to_reflow); 8025 8026 if (ztest_opts.zo_verbose >= 1) { 8027 (void) printf("verifying an interrupted raidz " 8028 "expansion using a pool scrub ...\n"); 8029 } 8030 /* Will fail here if there is non-recoverable corruption detected */ 8031 VERIFY0(ztest_scrub_impl(spa)); 8032 if (ztest_opts.zo_verbose >= 1) { 8033 (void) printf("raidz expansion scrub check complete\n"); 8034 } 8035 } 8036 8037 /* 8038 * Start a raidz expansion test. We run some I/O on the pool for a while 8039 * to get some data in the pool. Then we grow the raidz and 8040 * kill the test at the requested offset into the reflow, verifying that 8041 * doing such does not lead to pool corruption. 
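 * The sequence is: write data until roughly 25% of the allocatable
 * space (capped at 1 GiB) is used, spread across zo_threads writer
 * threads; pick a random reflow pause point at 25%, 50%, or 75% of the
 * allocated bytes; attach one more disk to the raidz vdev to start the
 * expansion; wait for the reflow to reach the pause point; and then
 * kill the child with ztest_kill(). The next pass sees the
 * RAIDZ_EXPAND_KILLED state and runs ztest_raidz_expand_check() to
 * scrub and verify the partially reflowed pool.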
8042 */ 8043 static void 8044 ztest_raidz_expand_run(ztest_shared_t *zs, spa_t *spa) 8045 { 8046 nvlist_t *root; 8047 pool_raidz_expand_stat_t rzx_stats; 8048 pool_raidz_expand_stat_t *pres = &rzx_stats; 8049 kthread_t **run_threads; 8050 vdev_t *cvd, *rzvd = spa->spa_root_vdev->vdev_child[0]; 8051 int total_disks = rzvd->vdev_children; 8052 int data_disks = total_disks - vdev_get_nparity(rzvd); 8053 uint64_t alloc_goal; 8054 uint64_t csize; 8055 int error, t; 8056 int threads = ztest_opts.zo_threads; 8057 ztest_expand_io_t *thread_args; 8058 8059 ASSERT3U(ztest_opts.zo_raidz_expand_test, !=, RAIDZ_EXPAND_NONE); 8060 ASSERT3P(rzvd->vdev_ops, ==, &vdev_raidz_ops); 8061 ztest_opts.zo_raidz_expand_test = RAIDZ_EXPAND_STARTED; 8062 8063 /* Setup a 1 MiB buffer of random data */ 8064 uint64_t bufsize = 1024 * 1024; 8065 void *buffer = umem_alloc(bufsize, UMEM_NOFAIL); 8066 8067 if (read(ztest_fd_rand, buffer, bufsize) != bufsize) { 8068 fatal(B_TRUE, "short read from /dev/urandom"); 8069 } 8070 /* 8071 * Put some data in the pool and then attach a vdev to initiate 8072 * reflow. 8073 */ 8074 run_threads = umem_zalloc(threads * sizeof (kthread_t *), UMEM_NOFAIL); 8075 thread_args = umem_zalloc(threads * sizeof (ztest_expand_io_t), 8076 UMEM_NOFAIL); 8077 /* Aim for roughly 25% of allocatable space up to 1GB */ 8078 alloc_goal = (vdev_get_min_asize(rzvd) * data_disks) / total_disks; 8079 alloc_goal = MIN(alloc_goal >> 2, 1024*1024*1024); 8080 if (ztest_opts.zo_verbose >= 1) { 8081 (void) printf("adding data to pool '%s', goal %llu bytes\n", 8082 ztest_opts.zo_pool, (u_longlong_t)alloc_goal); 8083 } 8084 8085 /* 8086 * Kick off all the I/O generators that run in parallel. 8087 */ 8088 for (t = 0; t < threads; t++) { 8089 if (t < ztest_opts.zo_datasets && ztest_dataset_open(t) != 0) { 8090 umem_free(run_threads, threads * sizeof (kthread_t *)); 8091 umem_free(buffer, bufsize); 8092 return; 8093 } 8094 thread_args[t].rzx_id = t; 8095 thread_args[t].rzx_amount = alloc_goal / threads; 8096 thread_args[t].rzx_bufsize = bufsize; 8097 thread_args[t].rzx_buffer = buffer; 8098 thread_args[t].rzx_alloc_max = alloc_goal; 8099 thread_args[t].rzx_spa = spa; 8100 run_threads[t] = thread_create(NULL, 0, ztest_rzx_thread, 8101 &thread_args[t], 0, NULL, TS_RUN | TS_JOINABLE, 8102 defclsyspri); 8103 } 8104 8105 /* 8106 * Wait for all of the writers to complete. 8107 */ 8108 for (t = 0; t < threads; t++) 8109 VERIFY0(thread_join(run_threads[t])); 8110 8111 /* 8112 * Close all datasets. This must be done after all the threads 8113 * are joined so we can be sure none of the datasets are in-use 8114 * by any of the threads. 
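 * Once everything is quiesced we wait for a txg to sync and snapshot
 * the allocated and total space into the shared state; the parent
 * process reports those numbers in the per-pass summary it prints
 * between child runs.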
8115 */ 8116 for (t = 0; t < ztest_opts.zo_threads; t++) { 8117 if (t < ztest_opts.zo_datasets) 8118 ztest_dataset_close(t); 8119 } 8120 8121 txg_wait_synced(spa_get_dsl(spa), 0); 8122 8123 zs->zs_alloc = metaslab_class_get_alloc(spa_normal_class(spa)); 8124 zs->zs_space = metaslab_class_get_space(spa_normal_class(spa)); 8125 8126 umem_free(buffer, bufsize); 8127 umem_free(run_threads, threads * sizeof (kthread_t *)); 8128 umem_free(thread_args, threads * sizeof (ztest_expand_io_t)); 8129 8130 /* Set our reflow target to 25%, 50% or 75% of allocated size */ 8131 uint_t multiple = ztest_random(3) + 1; 8132 uint64_t reflow_max = (rzvd->vdev_stat.vs_alloc * multiple) / 4; 8133 raidz_expand_max_reflow_bytes = reflow_max; 8134 8135 if (ztest_opts.zo_verbose >= 1) { 8136 (void) printf("running raidz expansion test, killing when " 8137 "reflow reaches %llu bytes (%u/4 of allocated space)\n", 8138 (u_longlong_t)reflow_max, multiple); 8139 } 8140 8141 /* XXX - do we want some I/O load during the reflow? */ 8142 8143 /* 8144 * Use a disk size that is larger than existing ones 8145 */ 8146 cvd = rzvd->vdev_child[0]; 8147 csize = vdev_get_min_asize(cvd); 8148 csize += csize / 10; 8149 /* 8150 * Path to vdev to be attached 8151 */ 8152 char *newpath = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); 8153 (void) snprintf(newpath, MAXPATHLEN, ztest_dev_template, 8154 ztest_opts.zo_dir, ztest_opts.zo_pool, rzvd->vdev_children); 8155 /* 8156 * Build the nvlist describing newpath. 8157 */ 8158 root = make_vdev_root(newpath, NULL, NULL, csize, ztest_get_ashift(), 8159 NULL, 0, 0, 1); 8160 /* 8161 * Expand the raidz vdev by attaching the new disk 8162 */ 8163 if (ztest_opts.zo_verbose >= 1) { 8164 (void) printf("expanding raidz: %d wide to %d wide with '%s'\n", 8165 (int)rzvd->vdev_children, (int)rzvd->vdev_children + 1, 8166 newpath); 8167 } 8168 error = spa_vdev_attach(spa, rzvd->vdev_guid, root, B_FALSE, B_FALSE); 8169 nvlist_free(root); 8170 if (error != 0) { 8171 fatal(0, "raidz expand: attach (%s %llu) returned %d", 8172 newpath, (long long)csize, error); 8173 } 8174 8175 /* 8176 * Wait for reflow to begin 8177 */ 8178 while (spa->spa_raidz_expand == NULL) { 8179 txg_wait_synced(spa_get_dsl(spa), 0); 8180 (void) poll(NULL, 0, 100); /* wait 1/10 second */ 8181 } 8182 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 8183 (void) spa_raidz_expand_get_stats(spa, pres); 8184 spa_config_exit(spa, SCL_CONFIG, FTAG); 8185 while (pres->pres_state != DSS_SCANNING) { 8186 txg_wait_synced(spa_get_dsl(spa), 0); 8187 (void) poll(NULL, 0, 100); /* wait 1/10 second */ 8188 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 8189 (void) spa_raidz_expand_get_stats(spa, pres); 8190 spa_config_exit(spa, SCL_CONFIG, FTAG); 8191 } 8192 8193 ASSERT3U(pres->pres_state, ==, DSS_SCANNING); 8194 ASSERT3U(pres->pres_to_reflow, !=, 0); 8195 /* 8196 * Set so when we are killed we go to raidz checking rather than 8197 * restarting test. 
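 * The flag lives in the mmap-ed shared options area (see setup_data()),
 * so it survives the SIGKILL below and the exec of the next child.
 * When that child reaches ztest_run() it sees RAIDZ_EXPAND_KILLED and
 * takes the ztest_raidz_expand_check() path instead of the generic
 * workload.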
8198 */ 8199 ztest_shared_opts->zo_raidz_expand_test = RAIDZ_EXPAND_KILLED; 8200 if (ztest_opts.zo_verbose >= 1) { 8201 (void) printf("raidz expansion reflow started, waiting for " 8202 "%llu bytes to be copied\n", (u_longlong_t)reflow_max); 8203 } 8204 8205 /* 8206 * Wait for reflow maximum to be reached and then kill the test 8207 */ 8208 while (pres->pres_reflowed < reflow_max) { 8209 txg_wait_synced(spa_get_dsl(spa), 0); 8210 (void) poll(NULL, 0, 100); /* wait 1/10 second */ 8211 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 8212 (void) spa_raidz_expand_get_stats(spa, pres); 8213 spa_config_exit(spa, SCL_CONFIG, FTAG); 8214 } 8215 8216 /* Reset the reflow pause before killing */ 8217 raidz_expand_max_reflow_bytes = 0; 8218 8219 if (ztest_opts.zo_verbose >= 1) { 8220 (void) printf("killing raidz expansion test after reflow " 8221 "reached %llu bytes\n", (u_longlong_t)pres->pres_reflowed); 8222 } 8223 8224 /* 8225 * Kill ourself to simulate a panic during a reflow. Our parent will 8226 * restart the test and the changed flag value will drive the test 8227 * through the scrub/check code to verify the pool is not corrupted. 8228 */ 8229 ztest_kill(zs); 8230 } 8231 8232 static void 8233 ztest_generic_run(ztest_shared_t *zs, spa_t *spa) 8234 { 8235 kthread_t **run_threads; 8236 int t; 8237 8238 run_threads = umem_zalloc(ztest_opts.zo_threads * sizeof (kthread_t *), 8239 UMEM_NOFAIL); 8240 8241 /* 8242 * Kick off all the tests that run in parallel. 8243 */ 8244 for (t = 0; t < ztest_opts.zo_threads; t++) { 8245 if (t < ztest_opts.zo_datasets && ztest_dataset_open(t) != 0) { 8246 umem_free(run_threads, ztest_opts.zo_threads * 8247 sizeof (kthread_t *)); 8248 return; 8249 } 8250 8251 run_threads[t] = thread_create(NULL, 0, ztest_thread, 8252 (void *)(uintptr_t)t, 0, NULL, TS_RUN | TS_JOINABLE, 8253 defclsyspri); 8254 } 8255 8256 /* 8257 * Wait for all of the tests to complete. 8258 */ 8259 for (t = 0; t < ztest_opts.zo_threads; t++) 8260 VERIFY0(thread_join(run_threads[t])); 8261 8262 /* 8263 * Close all datasets. This must be done after all the threads 8264 * are joined so we can be sure none of the datasets are in-use 8265 * by any of the threads. 8266 */ 8267 for (t = 0; t < ztest_opts.zo_threads; t++) { 8268 if (t < ztest_opts.zo_datasets) 8269 ztest_dataset_close(t); 8270 } 8271 8272 txg_wait_synced(spa_get_dsl(spa), 0); 8273 8274 zs->zs_alloc = metaslab_class_get_alloc(spa_normal_class(spa)); 8275 zs->zs_space = metaslab_class_get_space(spa_normal_class(spa)); 8276 8277 umem_free(run_threads, ztest_opts.zo_threads * sizeof (kthread_t *)); 8278 } 8279 8280 /* 8281 * Setup our test context and kick off threads to run tests on all datasets 8282 * in parallel. 8283 */ 8284 static void 8285 ztest_run(ztest_shared_t *zs) 8286 { 8287 spa_t *spa; 8288 objset_t *os; 8289 kthread_t *resume_thread, *deadman_thread; 8290 uint64_t object; 8291 int error; 8292 int t, d; 8293 8294 ztest_exiting = B_FALSE; 8295 8296 /* 8297 * Initialize parent/child shared state. 
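 * This includes the vdev, checkpoint, and name locks shared by the
 * test routines, plus the timing for this pass: zs_thread_stop is
 * zo_passtime from now (capped at the overall deadline), and with
 * probability zo_killrate percent we move zs_thread_kill earlier by a
 * random amount, which is when the workload is expected to kill the
 * child to simulate a crash.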
8298 */ 8299 mutex_init(&ztest_vdev_lock, NULL, MUTEX_DEFAULT, NULL); 8300 mutex_init(&ztest_checkpoint_lock, NULL, MUTEX_DEFAULT, NULL); 8301 VERIFY0(pthread_rwlock_init(&ztest_name_lock, NULL)); 8302 8303 zs->zs_thread_start = gethrtime(); 8304 zs->zs_thread_stop = 8305 zs->zs_thread_start + ztest_opts.zo_passtime * NANOSEC; 8306 zs->zs_thread_stop = MIN(zs->zs_thread_stop, zs->zs_proc_stop); 8307 zs->zs_thread_kill = zs->zs_thread_stop; 8308 if (ztest_random(100) < ztest_opts.zo_killrate) { 8309 zs->zs_thread_kill -= 8310 ztest_random(ztest_opts.zo_passtime * NANOSEC); 8311 } 8312 8313 mutex_init(&zcl.zcl_callbacks_lock, NULL, MUTEX_DEFAULT, NULL); 8314 8315 list_create(&zcl.zcl_callbacks, sizeof (ztest_cb_data_t), 8316 offsetof(ztest_cb_data_t, zcd_node)); 8317 8318 /* 8319 * Open our pool. It may need to be imported first depending on 8320 * what tests were running when the previous pass was terminated. 8321 */ 8322 raidz_scratch_verify(); 8323 kernel_init(SPA_MODE_READ | SPA_MODE_WRITE); 8324 error = spa_open(ztest_opts.zo_pool, &spa, FTAG); 8325 if (error) { 8326 VERIFY3S(error, ==, ENOENT); 8327 ztest_import_impl(); 8328 VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG)); 8329 zs->zs_metaslab_sz = 8330 1ULL << spa->spa_root_vdev->vdev_child[0]->vdev_ms_shift; 8331 } 8332 8333 metaslab_preload_limit = ztest_random(20) + 1; 8334 ztest_spa = spa; 8335 8336 /* 8337 * XXX - BUGBUG raidz expansion do not run this for generic for now 8338 */ 8339 if (ztest_opts.zo_raidz_expand_test != RAIDZ_EXPAND_NONE) 8340 VERIFY0(vdev_raidz_impl_set("cycle")); 8341 8342 dmu_objset_stats_t dds; 8343 VERIFY0(ztest_dmu_objset_own(ztest_opts.zo_pool, 8344 DMU_OST_ANY, B_TRUE, B_TRUE, FTAG, &os)); 8345 dsl_pool_config_enter(dmu_objset_pool(os), FTAG); 8346 dmu_objset_fast_stat(os, &dds); 8347 dsl_pool_config_exit(dmu_objset_pool(os), FTAG); 8348 dmu_objset_disown(os, B_TRUE, FTAG); 8349 8350 /* Give the dedicated raidz expansion test more grace time */ 8351 if (ztest_opts.zo_raidz_expand_test != RAIDZ_EXPAND_NONE) 8352 zfs_deadman_synctime_ms *= 2; 8353 8354 /* 8355 * Create a thread to periodically resume suspended I/O. 8356 */ 8357 resume_thread = thread_create(NULL, 0, ztest_resume_thread, 8358 spa, 0, NULL, TS_RUN | TS_JOINABLE, defclsyspri); 8359 8360 /* 8361 * Create a deadman thread and set to panic if we hang. 8362 */ 8363 deadman_thread = thread_create(NULL, 0, ztest_deadman_thread, 8364 zs, 0, NULL, TS_RUN | TS_JOINABLE, defclsyspri); 8365 8366 spa->spa_deadman_failmode = ZIO_FAILURE_MODE_PANIC; 8367 8368 /* 8369 * Verify that we can safely inquire about any object, 8370 * whether it's allocated or not. To make it interesting, 8371 * we probe a 5-wide window around each power of two. 8372 * This hits all edge cases, including zero and the max. 8373 */ 8374 for (t = 0; t < 64; t++) { 8375 for (d = -5; d <= 5; d++) { 8376 error = dmu_object_info(spa->spa_meta_objset, 8377 (1ULL << t) + d, NULL); 8378 ASSERT(error == 0 || error == ENOENT || 8379 error == EINVAL); 8380 } 8381 } 8382 8383 /* 8384 * If we got any ENOSPC errors on the previous run, destroy something. 
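 * Destroying one randomly chosen dataset usually frees enough space
 * for the next pass to make progress instead of tripping over ENOSPC
 * again right away.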
8385 */ 8386 if (zs->zs_enospc_count != 0) { 8387 /* Not expecting ENOSPC errors during raidz expansion tests */ 8388 ASSERT3U(ztest_opts.zo_raidz_expand_test, ==, 8389 RAIDZ_EXPAND_NONE); 8390 8391 int d = ztest_random(ztest_opts.zo_datasets); 8392 ztest_dataset_destroy(d); 8393 } 8394 zs->zs_enospc_count = 0; 8395 8396 /* 8397 * If we were in the middle of ztest_device_removal() and were killed 8398 * we need to ensure the removal and scrub complete before running 8399 * any tests that check ztest_device_removal_active. The removal will 8400 * be restarted automatically when the spa is opened, but we need to 8401 * initiate the scrub manually if it is not already in progress. Note 8402 * that we always run the scrub whenever an indirect vdev exists 8403 * because we have no way of knowing for sure if ztest_device_removal() 8404 * fully completed its scrub before the pool was reimported. 8405 * 8406 * Does not apply for the RAIDZ expansion specific test runs 8407 */ 8408 if (ztest_opts.zo_raidz_expand_test == RAIDZ_EXPAND_NONE && 8409 (spa->spa_removing_phys.sr_state == DSS_SCANNING || 8410 spa->spa_removing_phys.sr_prev_indirect_vdev != -1)) { 8411 while (spa->spa_removing_phys.sr_state == DSS_SCANNING) 8412 txg_wait_synced(spa_get_dsl(spa), 0); 8413 8414 error = ztest_scrub_impl(spa); 8415 if (error == EBUSY) 8416 error = 0; 8417 ASSERT0(error); 8418 } 8419 8420 if (ztest_opts.zo_verbose >= 4) 8421 (void) printf("starting main threads...\n"); 8422 8423 /* 8424 * Replay all logs of all datasets in the pool. This is primarily for 8425 * temporary datasets which wouldn't otherwise get replayed, which 8426 * can trigger failures when attempting to offline a SLOG in 8427 * ztest_fault_inject(). 8428 */ 8429 (void) dmu_objset_find(ztest_opts.zo_pool, ztest_replay_zil_cb, 8430 NULL, DS_FIND_CHILDREN); 8431 8432 if (ztest_opts.zo_raidz_expand_test == RAIDZ_EXPAND_REQUESTED) 8433 ztest_raidz_expand_run(zs, spa); 8434 else if (ztest_opts.zo_raidz_expand_test == RAIDZ_EXPAND_KILLED) 8435 ztest_raidz_expand_check(spa); 8436 else 8437 ztest_generic_run(zs, spa); 8438 8439 /* Kill the resume and deadman threads */ 8440 ztest_exiting = B_TRUE; 8441 VERIFY0(thread_join(resume_thread)); 8442 VERIFY0(thread_join(deadman_thread)); 8443 ztest_resume(spa); 8444 8445 /* 8446 * Right before closing the pool, kick off a bunch of async I/O; 8447 * spa_close() should wait for it to complete. 8448 */ 8449 for (object = 1; object < 50; object++) { 8450 dmu_prefetch(spa->spa_meta_objset, object, 0, 0, 1ULL << 20, 8451 ZIO_PRIORITY_SYNC_READ); 8452 } 8453 8454 /* Verify that at least one commit cb was called in a timely fashion */ 8455 if (zc_cb_counter >= ZTEST_COMMIT_CB_MIN_REG) 8456 VERIFY0(zc_min_txg_delay); 8457 8458 spa_close(spa, FTAG); 8459 8460 /* 8461 * Verify that we can loop over all pools. 8462 */ 8463 mutex_enter(&spa_namespace_lock); 8464 for (spa = spa_next(NULL); spa != NULL; spa = spa_next(spa)) 8465 if (ztest_opts.zo_verbose > 3) 8466 (void) printf("spa_next: found %s\n", spa_name(spa)); 8467 mutex_exit(&spa_namespace_lock); 8468 8469 /* 8470 * Verify that we can export the pool and reimport it under a 8471 * different name. 
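 * We do this on roughly half of the passes (and never during MMP
 * testing): the pool is exported and reimported as "<pool>_import",
 * then exported and reimported once more under its original name so
 * the next pass finds it where it expects.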
8472 */ 8473 if ((ztest_random(2) == 0) && !ztest_opts.zo_mmp_test) { 8474 char name[ZFS_MAX_DATASET_NAME_LEN]; 8475 (void) snprintf(name, sizeof (name), "%s_import", 8476 ztest_opts.zo_pool); 8477 ztest_spa_import_export(ztest_opts.zo_pool, name); 8478 ztest_spa_import_export(name, ztest_opts.zo_pool); 8479 } 8480 8481 kernel_fini(); 8482 8483 list_destroy(&zcl.zcl_callbacks); 8484 mutex_destroy(&zcl.zcl_callbacks_lock); 8485 (void) pthread_rwlock_destroy(&ztest_name_lock); 8486 mutex_destroy(&ztest_vdev_lock); 8487 mutex_destroy(&ztest_checkpoint_lock); 8488 } 8489 8490 static void 8491 print_time(hrtime_t t, char *timebuf) 8492 { 8493 hrtime_t s = t / NANOSEC; 8494 hrtime_t m = s / 60; 8495 hrtime_t h = m / 60; 8496 hrtime_t d = h / 24; 8497 8498 s -= m * 60; 8499 m -= h * 60; 8500 h -= d * 24; 8501 8502 timebuf[0] = '\0'; 8503 8504 if (d) 8505 (void) sprintf(timebuf, 8506 "%llud%02lluh%02llum%02llus", d, h, m, s); 8507 else if (h) 8508 (void) sprintf(timebuf, "%lluh%02llum%02llus", h, m, s); 8509 else if (m) 8510 (void) sprintf(timebuf, "%llum%02llus", m, s); 8511 else 8512 (void) sprintf(timebuf, "%llus", s); 8513 } 8514 8515 static nvlist_t * 8516 make_random_pool_props(void) 8517 { 8518 nvlist_t *props; 8519 8520 props = fnvlist_alloc(); 8521 8522 /* Twenty percent of the time enable ZPOOL_PROP_DEDUP_TABLE_QUOTA */ 8523 if (ztest_random(5) == 0) { 8524 fnvlist_add_uint64(props, 8525 zpool_prop_to_name(ZPOOL_PROP_DEDUP_TABLE_QUOTA), 8526 2 * 1024 * 1024); 8527 } 8528 8529 /* Fifty percent of the time enable ZPOOL_PROP_AUTOREPLACE */ 8530 if (ztest_random(2) == 0) { 8531 fnvlist_add_uint64(props, 8532 zpool_prop_to_name(ZPOOL_PROP_AUTOREPLACE), 1); 8533 } 8534 8535 return (props); 8536 } 8537 8538 /* 8539 * Create a storage pool with the given name and initial vdev size. 8540 * Then test spa_freeze() functionality. 8541 */ 8542 static void 8543 ztest_init(ztest_shared_t *zs) 8544 { 8545 spa_t *spa; 8546 nvlist_t *nvroot, *props; 8547 int i; 8548 8549 mutex_init(&ztest_vdev_lock, NULL, MUTEX_DEFAULT, NULL); 8550 mutex_init(&ztest_checkpoint_lock, NULL, MUTEX_DEFAULT, NULL); 8551 VERIFY0(pthread_rwlock_init(&ztest_name_lock, NULL)); 8552 8553 raidz_scratch_verify(); 8554 kernel_init(SPA_MODE_READ | SPA_MODE_WRITE); 8555 8556 /* 8557 * Create the storage pool. 8558 */ 8559 (void) spa_destroy(ztest_opts.zo_pool); 8560 ztest_shared->zs_vdev_next_leaf = 0; 8561 zs->zs_splits = 0; 8562 zs->zs_mirrors = ztest_opts.zo_mirrors; 8563 nvroot = make_vdev_root(NULL, NULL, NULL, ztest_opts.zo_vdev_size, 0, 8564 NULL, ztest_opts.zo_raid_children, zs->zs_mirrors, 1); 8565 props = make_random_pool_props(); 8566 8567 /* 8568 * We don't expect the pool to suspend unless maxfaults == 0, 8569 * in which case ztest_fault_inject() temporarily takes away 8570 * the only valid replica. 8571 */ 8572 fnvlist_add_uint64(props, 8573 zpool_prop_to_name(ZPOOL_PROP_FAILUREMODE), 8574 MAXFAULTS(zs) ? ZIO_FAILURE_MODE_PANIC : ZIO_FAILURE_MODE_WAIT); 8575 8576 for (i = 0; i < SPA_FEATURES; i++) { 8577 char *buf; 8578 8579 if (!spa_feature_table[i].fi_zfs_mod_supported) 8580 continue; 8581 8582 /* 8583 * 75% chance of using the log space map feature. We want ztest 8584 * to exercise both the code paths that use the log space map 8585 * feature and the ones that don't. 
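 * Every other supported feature is requested at pool creation by
 * adding a "feature@<name>" entry to the props nvlist handed to
 * spa_create(), so freshly created test pools start with all
 * supported features enabled.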
8586 */ 8587 if (i == SPA_FEATURE_LOG_SPACEMAP && ztest_random(4) == 0) 8588 continue; 8589 8590 VERIFY3S(-1, !=, asprintf(&buf, "feature@%s", 8591 spa_feature_table[i].fi_uname)); 8592 fnvlist_add_uint64(props, buf, 0); 8593 free(buf); 8594 } 8595 8596 VERIFY0(spa_create(ztest_opts.zo_pool, nvroot, props, NULL, NULL)); 8597 fnvlist_free(nvroot); 8598 fnvlist_free(props); 8599 8600 VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG)); 8601 zs->zs_metaslab_sz = 8602 1ULL << spa->spa_root_vdev->vdev_child[0]->vdev_ms_shift; 8603 zs->zs_guid = spa_guid(spa); 8604 spa_close(spa, FTAG); 8605 8606 kernel_fini(); 8607 8608 if (!ztest_opts.zo_mmp_test) { 8609 ztest_run_zdb(zs->zs_guid); 8610 ztest_freeze(); 8611 ztest_run_zdb(zs->zs_guid); 8612 } 8613 8614 (void) pthread_rwlock_destroy(&ztest_name_lock); 8615 mutex_destroy(&ztest_vdev_lock); 8616 mutex_destroy(&ztest_checkpoint_lock); 8617 } 8618 8619 static void 8620 setup_data_fd(void) 8621 { 8622 static char ztest_name_data[] = "/tmp/ztest.data.XXXXXX"; 8623 8624 ztest_fd_data = mkstemp(ztest_name_data); 8625 ASSERT3S(ztest_fd_data, >=, 0); 8626 (void) unlink(ztest_name_data); 8627 } 8628 8629 static int 8630 shared_data_size(ztest_shared_hdr_t *hdr) 8631 { 8632 int size; 8633 8634 size = hdr->zh_hdr_size; 8635 size += hdr->zh_opts_size; 8636 size += hdr->zh_size; 8637 size += hdr->zh_stats_size * hdr->zh_stats_count; 8638 size += hdr->zh_ds_size * hdr->zh_ds_count; 8639 size += hdr->zh_scratch_state_size; 8640 8641 return (size); 8642 } 8643 8644 static void 8645 setup_hdr(void) 8646 { 8647 int size; 8648 ztest_shared_hdr_t *hdr; 8649 8650 hdr = (void *)mmap(0, P2ROUNDUP(sizeof (*hdr), getpagesize()), 8651 PROT_READ | PROT_WRITE, MAP_SHARED, ztest_fd_data, 0); 8652 ASSERT3P(hdr, !=, MAP_FAILED); 8653 8654 VERIFY0(ftruncate(ztest_fd_data, sizeof (ztest_shared_hdr_t))); 8655 8656 hdr->zh_hdr_size = sizeof (ztest_shared_hdr_t); 8657 hdr->zh_opts_size = sizeof (ztest_shared_opts_t); 8658 hdr->zh_size = sizeof (ztest_shared_t); 8659 hdr->zh_stats_size = sizeof (ztest_shared_callstate_t); 8660 hdr->zh_stats_count = ZTEST_FUNCS; 8661 hdr->zh_ds_size = sizeof (ztest_shared_ds_t); 8662 hdr->zh_ds_count = ztest_opts.zo_datasets; 8663 hdr->zh_scratch_state_size = sizeof (ztest_shared_scratch_state_t); 8664 8665 size = shared_data_size(hdr); 8666 VERIFY0(ftruncate(ztest_fd_data, size)); 8667 8668 (void) munmap((caddr_t)hdr, P2ROUNDUP(sizeof (*hdr), getpagesize())); 8669 } 8670 8671 static void 8672 setup_data(void) 8673 { 8674 int size, offset; 8675 ztest_shared_hdr_t *hdr; 8676 uint8_t *buf; 8677 8678 hdr = (void *)mmap(0, P2ROUNDUP(sizeof (*hdr), getpagesize()), 8679 PROT_READ, MAP_SHARED, ztest_fd_data, 0); 8680 ASSERT3P(hdr, !=, MAP_FAILED); 8681 8682 size = shared_data_size(hdr); 8683 8684 (void) munmap((caddr_t)hdr, P2ROUNDUP(sizeof (*hdr), getpagesize())); 8685 hdr = ztest_shared_hdr = (void *)mmap(0, P2ROUNDUP(size, getpagesize()), 8686 PROT_READ | PROT_WRITE, MAP_SHARED, ztest_fd_data, 0); 8687 ASSERT3P(hdr, !=, MAP_FAILED); 8688 buf = (uint8_t *)hdr; 8689 8690 offset = hdr->zh_hdr_size; 8691 ztest_shared_opts = (void *)&buf[offset]; 8692 offset += hdr->zh_opts_size; 8693 ztest_shared = (void *)&buf[offset]; 8694 offset += hdr->zh_size; 8695 ztest_shared_callstate = (void *)&buf[offset]; 8696 offset += hdr->zh_stats_size * hdr->zh_stats_count; 8697 ztest_shared_ds = (void *)&buf[offset]; 8698 offset += hdr->zh_ds_size * hdr->zh_ds_count; 8699 ztest_scratch_state = (void *)&buf[offset]; 8700 } 8701 8702 static boolean_t 8703 exec_child(char *cmd, 
char *libpath, boolean_t ignorekill, int *statusp) 8704 { 8705 pid_t pid; 8706 int status; 8707 char *cmdbuf = NULL; 8708 8709 pid = fork(); 8710 8711 if (cmd == NULL) { 8712 cmdbuf = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); 8713 (void) strlcpy(cmdbuf, getexecname(), MAXPATHLEN); 8714 cmd = cmdbuf; 8715 } 8716 8717 if (pid == -1) 8718 fatal(B_TRUE, "fork failed"); 8719 8720 if (pid == 0) { /* child */ 8721 char fd_data_str[12]; 8722 8723 VERIFY3S(11, >=, 8724 snprintf(fd_data_str, 12, "%d", ztest_fd_data)); 8725 VERIFY0(setenv("ZTEST_FD_DATA", fd_data_str, 1)); 8726 8727 if (libpath != NULL) { 8728 const char *curlp = getenv("LD_LIBRARY_PATH"); 8729 if (curlp == NULL) 8730 VERIFY0(setenv("LD_LIBRARY_PATH", libpath, 1)); 8731 else { 8732 char *newlp = NULL; 8733 VERIFY3S(-1, !=, 8734 asprintf(&newlp, "%s:%s", libpath, curlp)); 8735 VERIFY0(setenv("LD_LIBRARY_PATH", newlp, 1)); 8736 free(newlp); 8737 } 8738 } 8739 (void) execl(cmd, cmd, (char *)NULL); 8740 ztest_dump_core = B_FALSE; 8741 fatal(B_TRUE, "exec failed: %s", cmd); 8742 } 8743 8744 if (cmdbuf != NULL) { 8745 umem_free(cmdbuf, MAXPATHLEN); 8746 cmd = NULL; 8747 } 8748 8749 while (waitpid(pid, &status, 0) != pid) 8750 continue; 8751 if (statusp != NULL) 8752 *statusp = status; 8753 8754 if (WIFEXITED(status)) { 8755 if (WEXITSTATUS(status) != 0) { 8756 (void) fprintf(stderr, "child exited with code %d\n", 8757 WEXITSTATUS(status)); 8758 exit(2); 8759 } 8760 return (B_FALSE); 8761 } else if (WIFSIGNALED(status)) { 8762 if (!ignorekill || WTERMSIG(status) != SIGKILL) { 8763 (void) fprintf(stderr, "child died with signal %d\n", 8764 WTERMSIG(status)); 8765 exit(3); 8766 } 8767 return (B_TRUE); 8768 } else { 8769 (void) fprintf(stderr, "something strange happened to child\n"); 8770 exit(4); 8771 } 8772 } 8773 8774 static void 8775 ztest_run_init(void) 8776 { 8777 int i; 8778 8779 ztest_shared_t *zs = ztest_shared; 8780 8781 /* 8782 * Blow away any existing copy of zpool.cache 8783 */ 8784 (void) remove(spa_config_path); 8785 8786 if (ztest_opts.zo_init == 0) { 8787 if (ztest_opts.zo_verbose >= 1) 8788 (void) printf("Importing pool %s\n", 8789 ztest_opts.zo_pool); 8790 ztest_import(zs); 8791 return; 8792 } 8793 8794 /* 8795 * Create and initialize our storage pool. 8796 */ 8797 for (i = 1; i <= ztest_opts.zo_init; i++) { 8798 memset(zs, 0, sizeof (*zs)); 8799 if (ztest_opts.zo_verbose >= 3 && 8800 ztest_opts.zo_init != 1) { 8801 (void) printf("ztest_init(), pass %d\n", i); 8802 } 8803 ztest_init(zs); 8804 } 8805 } 8806 8807 int 8808 main(int argc, char **argv) 8809 { 8810 int kills = 0; 8811 int iters = 0; 8812 int older = 0; 8813 int newer = 0; 8814 ztest_shared_t *zs; 8815 ztest_info_t *zi; 8816 ztest_shared_callstate_t *zc; 8817 char timebuf[100]; 8818 char numbuf[NN_NUMBUF_SZ]; 8819 char *cmd; 8820 boolean_t hasalt; 8821 int f, err; 8822 char *fd_data_str = getenv("ZTEST_FD_DATA"); 8823 struct sigaction action; 8824 8825 (void) setvbuf(stdout, NULL, _IOLBF, 0); 8826 8827 dprintf_setup(&argc, argv); 8828 zfs_deadman_synctime_ms = 300000; 8829 zfs_deadman_checktime_ms = 30000; 8830 /* 8831 * As two-word space map entries may not come up often (especially 8832 * if pool and vdev sizes are small) we want to force at least some 8833 * of them so the feature get tested. 8834 */ 8835 zfs_force_some_double_word_sm_entries = B_TRUE; 8836 8837 /* 8838 * Verify that even extensively damaged split blocks with many 8839 * segments can be reconstructed in a reasonable amount of time 8840 * when reconstruction is known to be possible. 
8841 * 8842 * Note: the lower this value is, the more damage we inflict, and 8843 * the more time ztest spends in recovering that damage. We chose 8844 * to induce damage 1/100th of the time so recovery is tested but 8845 * not so frequently that ztest doesn't get to test other code paths. 8846 */ 8847 zfs_reconstruct_indirect_damage_fraction = 100; 8848 8849 action.sa_handler = sig_handler; 8850 sigemptyset(&action.sa_mask); 8851 action.sa_flags = 0; 8852 8853 if (sigaction(SIGSEGV, &action, NULL) < 0) { 8854 (void) fprintf(stderr, "ztest: cannot catch SIGSEGV: %s.\n", 8855 strerror(errno)); 8856 exit(EXIT_FAILURE); 8857 } 8858 8859 if (sigaction(SIGABRT, &action, NULL) < 0) { 8860 (void) fprintf(stderr, "ztest: cannot catch SIGABRT: %s.\n", 8861 strerror(errno)); 8862 exit(EXIT_FAILURE); 8863 } 8864 8865 /* 8866 * Force random_get_bytes() to use /dev/urandom in order to prevent 8867 * ztest from needlessly depleting the system entropy pool. 8868 */ 8869 random_path = "/dev/urandom"; 8870 ztest_fd_rand = open(random_path, O_RDONLY | O_CLOEXEC); 8871 ASSERT3S(ztest_fd_rand, >=, 0); 8872 8873 if (!fd_data_str) { 8874 process_options(argc, argv); 8875 8876 setup_data_fd(); 8877 setup_hdr(); 8878 setup_data(); 8879 memcpy(ztest_shared_opts, &ztest_opts, 8880 sizeof (*ztest_shared_opts)); 8881 } else { 8882 ztest_fd_data = atoi(fd_data_str); 8883 setup_data(); 8884 memcpy(&ztest_opts, ztest_shared_opts, sizeof (ztest_opts)); 8885 } 8886 ASSERT3U(ztest_opts.zo_datasets, ==, ztest_shared_hdr->zh_ds_count); 8887 8888 err = ztest_set_global_vars(); 8889 if (err != 0 && !fd_data_str) { 8890 /* error message done by ztest_set_global_vars */ 8891 exit(EXIT_FAILURE); 8892 } else { 8893 /* children should not be spawned if setting gvars fails */ 8894 VERIFY3S(err, ==, 0); 8895 } 8896 8897 /* Override location of zpool.cache */ 8898 VERIFY3S(asprintf((char **)&spa_config_path, "%s/zpool.cache", 8899 ztest_opts.zo_dir), !=, -1); 8900 8901 ztest_ds = umem_alloc(ztest_opts.zo_datasets * sizeof (ztest_ds_t), 8902 UMEM_NOFAIL); 8903 zs = ztest_shared; 8904 8905 if (fd_data_str) { 8906 metaslab_force_ganging = ztest_opts.zo_metaslab_force_ganging; 8907 metaslab_df_alloc_threshold = 8908 zs->zs_metaslab_df_alloc_threshold; 8909 8910 if (zs->zs_do_init) 8911 ztest_run_init(); 8912 else 8913 ztest_run(zs); 8914 exit(0); 8915 } 8916 8917 hasalt = (strlen(ztest_opts.zo_alt_ztest) != 0); 8918 8919 if (ztest_opts.zo_verbose >= 1) { 8920 (void) printf("%"PRIu64" vdevs, %d datasets, %d threads, " 8921 "%d %s disks, parity %d, %"PRIu64" seconds...\n\n", 8922 ztest_opts.zo_vdevs, 8923 ztest_opts.zo_datasets, 8924 ztest_opts.zo_threads, 8925 ztest_opts.zo_raid_children, 8926 ztest_opts.zo_raid_type, 8927 ztest_opts.zo_raid_parity, 8928 ztest_opts.zo_time); 8929 } 8930 8931 cmd = umem_alloc(MAXNAMELEN, UMEM_NOFAIL); 8932 (void) strlcpy(cmd, getexecname(), MAXNAMELEN); 8933 8934 zs->zs_do_init = B_TRUE; 8935 if (strlen(ztest_opts.zo_alt_ztest) != 0) { 8936 if (ztest_opts.zo_verbose >= 1) { 8937 (void) printf("Executing older ztest for " 8938 "initialization: %s\n", ztest_opts.zo_alt_ztest); 8939 } 8940 VERIFY(!exec_child(ztest_opts.zo_alt_ztest, 8941 ztest_opts.zo_alt_libpath, B_FALSE, NULL)); 8942 } else { 8943 VERIFY(!exec_child(NULL, NULL, B_FALSE, NULL)); 8944 } 8945 zs->zs_do_init = B_FALSE; 8946 8947 zs->zs_proc_start = gethrtime(); 8948 zs->zs_proc_stop = zs->zs_proc_start + ztest_opts.zo_time * NANOSEC; 8949 8950 for (f = 0; f < ZTEST_FUNCS; f++) { 8951 zi = &ztest_info[f]; 8952 zc = ZTEST_GET_SHARED_CALLSTATE(f); 
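/*
 * Schedule the first call of each test function: a function whose
 * minimum interval does not fit before zs_proc_stop is effectively
 * disabled by pushing zc_next out to UINT64_MAX; the rest get a
 * random first-call time within twice their interval so the workload
 * does not start in lock-step.
 */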
8953 if (zs->zs_proc_start + zi->zi_interval[0] > zs->zs_proc_stop) 8954 zc->zc_next = UINT64_MAX; 8955 else 8956 zc->zc_next = zs->zs_proc_start + 8957 ztest_random(2 * zi->zi_interval[0] + 1); 8958 } 8959 8960 /* 8961 * Run the tests in a loop. These tests include fault injection 8962 * to verify that self-healing data works, and forced crashes 8963 * to verify that we never lose on-disk consistency. 8964 */ 8965 while (gethrtime() < zs->zs_proc_stop) { 8966 int status; 8967 boolean_t killed; 8968 8969 /* 8970 * Initialize the workload counters for each function. 8971 */ 8972 for (f = 0; f < ZTEST_FUNCS; f++) { 8973 zc = ZTEST_GET_SHARED_CALLSTATE(f); 8974 zc->zc_count = 0; 8975 zc->zc_time = 0; 8976 } 8977 8978 /* Set the allocation switch size */ 8979 zs->zs_metaslab_df_alloc_threshold = 8980 ztest_random(zs->zs_metaslab_sz / 4) + 1; 8981 8982 if (!hasalt || ztest_random(2) == 0) { 8983 if (hasalt && ztest_opts.zo_verbose >= 1) { 8984 (void) printf("Executing newer ztest: %s\n", 8985 cmd); 8986 } 8987 newer++; 8988 killed = exec_child(cmd, NULL, B_TRUE, &status); 8989 } else { 8990 if (hasalt && ztest_opts.zo_verbose >= 1) { 8991 (void) printf("Executing older ztest: %s\n", 8992 ztest_opts.zo_alt_ztest); 8993 } 8994 older++; 8995 killed = exec_child(ztest_opts.zo_alt_ztest, 8996 ztest_opts.zo_alt_libpath, B_TRUE, &status); 8997 } 8998 8999 if (killed) 9000 kills++; 9001 iters++; 9002 9003 if (ztest_opts.zo_verbose >= 1) { 9004 hrtime_t now = gethrtime(); 9005 9006 now = MIN(now, zs->zs_proc_stop); 9007 print_time(zs->zs_proc_stop - now, timebuf); 9008 nicenum(zs->zs_space, numbuf, sizeof (numbuf)); 9009 9010 (void) printf("Pass %3d, %8s, %3"PRIu64" ENOSPC, " 9011 "%4.1f%% of %5s used, %3.0f%% done, %8s to go\n", 9012 iters, 9013 WIFEXITED(status) ? "Complete" : "SIGKILL", 9014 zs->zs_enospc_count, 9015 100.0 * zs->zs_alloc / zs->zs_space, 9016 numbuf, 9017 100.0 * (now - zs->zs_proc_start) / 9018 (ztest_opts.zo_time * NANOSEC), timebuf); 9019 } 9020 9021 if (ztest_opts.zo_verbose >= 2) { 9022 (void) printf("\nWorkload summary:\n\n"); 9023 (void) printf("%7s %9s %s\n", 9024 "Calls", "Time", "Function"); 9025 (void) printf("%7s %9s %s\n", 9026 "-----", "----", "--------"); 9027 for (f = 0; f < ZTEST_FUNCS; f++) { 9028 zi = &ztest_info[f]; 9029 zc = ZTEST_GET_SHARED_CALLSTATE(f); 9030 print_time(zc->zc_time, timebuf); 9031 (void) printf("%7"PRIu64" %9s %s\n", 9032 zc->zc_count, timebuf, 9033 zi->zi_funcname); 9034 } 9035 (void) printf("\n"); 9036 } 9037 9038 if (!ztest_opts.zo_mmp_test) 9039 ztest_run_zdb(zs->zs_guid); 9040 if (ztest_shared_opts->zo_raidz_expand_test == 9041 RAIDZ_EXPAND_CHECKED) 9042 break; /* raidz expand test complete */ 9043 } 9044 9045 if (ztest_opts.zo_verbose >= 1) { 9046 if (hasalt) { 9047 (void) printf("%d runs of older ztest: %s\n", older, 9048 ztest_opts.zo_alt_ztest); 9049 (void) printf("%d runs of newer ztest: %s\n", newer, 9050 cmd); 9051 } 9052 (void) printf("%d killed, %d completed, %.0f%% kill rate\n", 9053 kills, iters - kills, (100.0 * kills) / MAX(1, iters)); 9054 } 9055 9056 umem_free(cmd, MAXNAMELEN); 9057 9058 return (0); 9059 } 9060