/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2011, 2024 by Delphix. All rights reserved.
 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
 * Copyright (c) 2013 Steven Hartland. All rights reserved.
 * Copyright (c) 2014 Integros [integros.com]
 * Copyright 2017 Joyent, Inc.
 * Copyright (c) 2017, Intel Corporation.
 */

/*
 * The objective of this program is to provide a DMU/ZAP/SPA stress test
 * that runs entirely in userland, is easy to use, and easy to extend.
 *
 * The overall design of the ztest program is as follows:
 *
 * (1) For each major functional area (e.g. adding vdevs to a pool,
 *     creating and destroying datasets, reading and writing objects, etc)
 *     we have a simple routine to test that functionality.  These
 *     individual routines do not have to do anything "stressful".
 *
 * (2) We turn these simple functionality tests into a stress test by
 *     running them all in parallel, with as many threads as desired,
 *     and spread across as many datasets, objects, and vdevs as desired.
 *
 * (3) While all this is happening, we inject faults into the pool to
 *     verify that self-healing data really works.
 *
 * (4) Every time we open a dataset, we change its checksum and compression
 *     functions.  Thus even individual objects vary from block to block
 *     in which checksum they use and whether they're compressed.
 *
 * (5) To verify that we never lose on-disk consistency after a crash,
 *     we run the entire test in a child of the main process.
 *     At random times, the child self-immolates with a SIGKILL.
 *     This is the software equivalent of pulling the power cord.
 *     The parent then runs the test again, using the existing
 *     storage pool, as many times as desired.  If backwards compatibility
 *     testing is enabled ztest will sometimes run the "older" version
 *     of ztest after a SIGKILL.
 *
 * (6) To verify that we don't have future leaks or temporal incursions,
 *     many of the functional tests record the transaction group number
 *     as part of their data.  When reading old data, they verify that
 *     the transaction group number is less than the current, open txg.
 *     If you add a new test, please do this if applicable.
 *
 * (7) Threads are created with a reduced stack size, for sanity checking.
 *     Therefore, it's important not to allocate huge buffers on the stack.
 *
 * When run with no arguments, ztest runs for about five minutes and
 * produces no output if successful.  To get a little bit of information,
 * specify -V.  To get more information, specify -VV, and so on.
 *
 * To turn this into an overnight stress test, use -T to specify run time.
 *
 * You can ask for more vdevs [-v], datasets [-d], or threads [-t]
 * to increase the pool capacity, fanout, and overall stress level.
 *
 * Use the -k option to set the desired frequency of kills.
 *
 * When ztest invokes itself it passes all relevant information through a
 * temporary file which is mmap-ed in the child process.  This allows shared
 * memory to survive the exec syscall.  The ztest_shared_hdr_t struct is always
 * stored at offset 0 of this file and contains information on the size and
 * number of shared structures in the file.  The information stored in this file
 * must remain backwards compatible with older versions of ztest so that
 * ztest can invoke them during backwards compatibility testing (-B).
 */
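
/*
 * Illustrative sketch (not part of ztest itself): the parent/child hand-off
 * described above can be implemented by mmap()ing a temporary file
 * MAP_SHARED and letting the re-exec'ed child re-map it through an inherited
 * file descriptor.  The helper name below is hypothetical.
 *
 *	static void *
 *	share_state_example(int fd, size_t size)
 *	{
 *		// Grow the file to the required size, then map it shared so
 *		// stores remain visible across fork() and exec().
 *		if (ftruncate(fd, size) != 0)
 *			return (NULL);
 *		return (mmap(NULL, size, PROT_READ | PROT_WRITE,
 *		    MAP_SHARED, fd, 0));
 *	}
 *
 * The first sizeof (ztest_shared_hdr_t) bytes of such a mapping would then
 * describe where the other shared structures live, as noted above.
 */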

#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/dmu.h>
#include <sys/txg.h>
#include <sys/dbuf.h>
#include <sys/zap.h>
#include <sys/dmu_objset.h>
#include <sys/poll.h>
#include <sys/stat.h>
#include <sys/time.h>
#include <sys/wait.h>
#include <sys/mman.h>
#include <sys/resource.h>
#include <sys/zio.h>
#include <sys/zil.h>
#include <sys/zil_impl.h>
#include <sys/vdev_draid.h>
#include <sys/vdev_impl.h>
#include <sys/vdev_file.h>
#include <sys/vdev_initialize.h>
#include <sys/vdev_raidz.h>
#include <sys/vdev_trim.h>
#include <sys/spa_impl.h>
#include <sys/metaslab_impl.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_destroy.h>
#include <sys/dsl_scan.h>
#include <sys/zio_checksum.h>
#include <sys/zfs_refcount.h>
#include <sys/zfeature.h>
#include <sys/dsl_userhold.h>
#include <sys/abd.h>
#include <sys/blake3.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <getopt.h>
#include <signal.h>
#include <umem.h>
#include <ctype.h>
#include <math.h>
#include <sys/fs/zfs.h>
#include <zfs_fletcher.h>
#include <libnvpair.h>
#include <libzutil.h>
#include <sys/crypto/icp.h>
#include <sys/zfs_impl.h>
#include <sys/backtrace.h>

static int ztest_fd_data = -1;
static int ztest_fd_rand = -1;

typedef struct ztest_shared_hdr {
	uint64_t	zh_hdr_size;
	uint64_t	zh_opts_size;
	uint64_t	zh_size;
	uint64_t	zh_stats_size;
	uint64_t	zh_stats_count;
	uint64_t	zh_ds_size;
	uint64_t	zh_ds_count;
	uint64_t	zh_scratch_state_size;
} ztest_shared_hdr_t;

static ztest_shared_hdr_t *ztest_shared_hdr;

enum ztest_class_state {
	ZTEST_VDEV_CLASS_OFF,
	ZTEST_VDEV_CLASS_ON,
	ZTEST_VDEV_CLASS_RND
};

/* Dedicated RAIDZ Expansion test states */
typedef enum {
	RAIDZ_EXPAND_NONE,		/* Default is none, must opt-in */
	RAIDZ_EXPAND_REQUESTED,		/* The '-X' option was used */
	RAIDZ_EXPAND_STARTED,		/* Testing has commenced */
	RAIDZ_EXPAND_KILLED,		/* Reached the process kill */
	RAIDZ_EXPAND_CHECKED,		/* Pool scrub verification done */
} raidz_expand_test_state_t;

#define	ZO_GVARS_MAX_ARGLEN	((size_t)64)
#define	ZO_GVARS_MAX_COUNT	((size_t)10)

typedef struct ztest_shared_opts {
	char zo_pool[ZFS_MAX_DATASET_NAME_LEN];
	char zo_dir[ZFS_MAX_DATASET_NAME_LEN];
	char zo_alt_ztest[MAXNAMELEN];
	char zo_alt_libpath[MAXNAMELEN];
	uint64_t zo_vdevs;
	uint64_t zo_vdevtime;
	size_t zo_vdev_size;
	int zo_ashift;
	int zo_mirrors;
	int zo_raid_do_expand;
	int zo_raid_children;
	int zo_raid_parity;
	char zo_raid_type[8];
	int zo_draid_data;
	int zo_draid_spares;
	int zo_datasets;
	int zo_threads;
	uint64_t zo_passtime;
	uint64_t zo_killrate;
	int zo_verbose;
	int zo_init;
	uint64_t zo_time;
	uint64_t zo_maxloops;
	uint64_t zo_metaslab_force_ganging;
	raidz_expand_test_state_t zo_raidz_expand_test;
	int zo_mmp_test;
	int zo_special_vdevs;
	int zo_dump_dbgmsg;
	int zo_gvars_count;
	char zo_gvars[ZO_GVARS_MAX_COUNT][ZO_GVARS_MAX_ARGLEN];
} ztest_shared_opts_t;

/* Default values for command line options. */
#define	DEFAULT_POOL "ztest"
#define	DEFAULT_VDEV_DIR "/tmp"
#define	DEFAULT_VDEV_COUNT 5
#define	DEFAULT_VDEV_SIZE (SPA_MINDEVSIZE * 4)	/* 256m default size */
#define	DEFAULT_VDEV_SIZE_STR "256M"
#define	DEFAULT_ASHIFT SPA_MINBLOCKSHIFT
#define	DEFAULT_MIRRORS 2
#define	DEFAULT_RAID_CHILDREN 4
#define	DEFAULT_RAID_PARITY 1
#define	DEFAULT_DRAID_DATA 4
#define	DEFAULT_DRAID_SPARES 1
#define	DEFAULT_DATASETS_COUNT 7
#define	DEFAULT_THREADS 23
#define	DEFAULT_RUN_TIME 300		/* 300 seconds */
#define	DEFAULT_RUN_TIME_STR "300 sec"
#define	DEFAULT_PASS_TIME 60		/* 60 seconds */
#define	DEFAULT_PASS_TIME_STR "60 sec"
#define	DEFAULT_KILL_RATE 70		/* 70% kill rate */
#define	DEFAULT_KILLRATE_STR "70%"
#define	DEFAULT_INITS 1
#define	DEFAULT_MAX_LOOPS 50		/* 5 minutes */
#define	DEFAULT_FORCE_GANGING (64 << 10)
#define	DEFAULT_FORCE_GANGING_STR "64K"

/* Simplifying assumption: -1 is not a valid default. */
#define	NO_DEFAULT -1

static const ztest_shared_opts_t ztest_opts_defaults = {
	.zo_pool = DEFAULT_POOL,
	.zo_dir = DEFAULT_VDEV_DIR,
	.zo_alt_ztest = { '\0' },
	.zo_alt_libpath = { '\0' },
	.zo_vdevs = DEFAULT_VDEV_COUNT,
	.zo_ashift = DEFAULT_ASHIFT,
	.zo_mirrors = DEFAULT_MIRRORS,
	.zo_raid_children = DEFAULT_RAID_CHILDREN,
	.zo_raid_parity = DEFAULT_RAID_PARITY,
	.zo_raid_type = VDEV_TYPE_RAIDZ,
	.zo_vdev_size = DEFAULT_VDEV_SIZE,
	.zo_draid_data = DEFAULT_DRAID_DATA,	/* data drives */
	.zo_draid_spares = DEFAULT_DRAID_SPARES,	/* distributed spares */
	.zo_datasets = DEFAULT_DATASETS_COUNT,
	.zo_threads = DEFAULT_THREADS,
	.zo_passtime = DEFAULT_PASS_TIME,
	.zo_killrate = DEFAULT_KILL_RATE,
	.zo_verbose = 0,
	.zo_mmp_test = 0,
	.zo_init = DEFAULT_INITS,
	.zo_time = DEFAULT_RUN_TIME,
	.zo_maxloops = DEFAULT_MAX_LOOPS,	/* max loops during spa_freeze() */
	.zo_metaslab_force_ganging = DEFAULT_FORCE_GANGING,
	.zo_special_vdevs = ZTEST_VDEV_CLASS_RND,
	.zo_gvars_count = 0,
	.zo_raidz_expand_test = RAIDZ_EXPAND_NONE,
};

extern uint64_t metaslab_force_ganging;
extern uint64_t metaslab_df_alloc_threshold;
extern uint64_t zfs_deadman_synctime_ms;
extern uint_t metaslab_preload_limit;
extern int zfs_compressed_arc_enabled;
extern int zfs_abd_scatter_enabled;
extern uint_t dmu_object_alloc_chunk_shift;
extern boolean_t zfs_force_some_double_word_sm_entries;
extern unsigned long zio_decompress_fail_fraction;
extern unsigned long zfs_reconstruct_indirect_damage_fraction;
extern uint64_t raidz_expand_max_reflow_bytes;
extern uint_t raidz_expand_pause_point;

static ztest_shared_opts_t *ztest_shared_opts;
static ztest_shared_opts_t ztest_opts;
static const char *const ztest_wkeydata = "abcdefghijklmnopqrstuvwxyz012345";

typedef struct ztest_shared_ds {
	uint64_t	zd_seq;
} ztest_shared_ds_t;

static ztest_shared_ds_t *ztest_shared_ds;
#define	ZTEST_GET_SHARED_DS(d) (&ztest_shared_ds[d])

typedef struct ztest_scratch_state {
	uint64_t	zs_raidz_scratch_verify_pause;
} ztest_shared_scratch_state_t;

static ztest_shared_scratch_state_t *ztest_scratch_state;

#define	BT_MAGIC	0x123456789abcdefULL
#define	MAXFAULTS(zs) \
	(MAX((zs)->zs_mirrors, 1) * (ztest_opts.zo_raid_parity + 1) - 1)

enum ztest_io_type {
	ZTEST_IO_WRITE_TAG,
	ZTEST_IO_WRITE_PATTERN,
	ZTEST_IO_WRITE_ZEROES,
	ZTEST_IO_TRUNCATE,
	ZTEST_IO_SETATTR,
	ZTEST_IO_REWRITE,
	ZTEST_IO_TYPES
};

typedef struct ztest_block_tag {
	uint64_t	bt_magic;
	uint64_t	bt_objset;
	uint64_t	bt_object;
	uint64_t	bt_dnodesize;
	uint64_t	bt_offset;
	uint64_t	bt_gen;
	uint64_t	bt_txg;
	uint64_t	bt_crtxg;
} ztest_block_tag_t;

typedef struct bufwad {
	uint64_t	bw_index;
	uint64_t	bw_txg;
	uint64_t	bw_data;
} bufwad_t;

/*
 * It would be better to use a rangelock_t per object.  Unfortunately
 * the rangelock_t is not a drop-in replacement for rl_t, because we
 * still need to map from object ID to rangelock_t.
 */
typedef enum {
	ZTRL_READER,
	ZTRL_WRITER,
	ZTRL_APPEND
} rl_type_t;

typedef struct rll {
	void		*rll_writer;
	int		rll_readers;
	kmutex_t	rll_lock;
	kcondvar_t	rll_cv;
} rll_t;

typedef struct rl {
	uint64_t	rl_object;
	uint64_t	rl_offset;
	uint64_t	rl_size;
	rll_t		*rl_lock;
} rl_t;

#define	ZTEST_RANGE_LOCKS	64
#define	ZTEST_OBJECT_LOCKS	64

/*
 * Object descriptor.  Used as a template for object lookup/create/remove.
 */
typedef struct ztest_od {
	uint64_t	od_dir;
	uint64_t	od_object;
	dmu_object_type_t od_type;
	dmu_object_type_t od_crtype;
	uint64_t	od_blocksize;
	uint64_t	od_crblocksize;
	uint64_t	od_crdnodesize;
	uint64_t	od_gen;
	uint64_t	od_crgen;
	char		od_name[ZFS_MAX_DATASET_NAME_LEN];
} ztest_od_t;

/*
 * Per-dataset state.
 */
typedef struct ztest_ds {
	ztest_shared_ds_t *zd_shared;
	objset_t	*zd_os;
	pthread_rwlock_t zd_zilog_lock;
	zilog_t		*zd_zilog;
	ztest_od_t	*zd_od;		/* debugging aid */
	char		zd_name[ZFS_MAX_DATASET_NAME_LEN];
	kmutex_t	zd_dirobj_lock;
	rll_t		zd_object_lock[ZTEST_OBJECT_LOCKS];
	rll_t		zd_range_lock[ZTEST_RANGE_LOCKS];
} ztest_ds_t;

/*
 * Per-iteration state.
 */
typedef void ztest_func_t(ztest_ds_t *zd, uint64_t id);

typedef struct ztest_info {
	ztest_func_t	*zi_func;	/* test function */
	uint64_t	zi_iters;	/* iterations per execution */
	uint64_t	*zi_interval;	/* execute every <interval> seconds */
	const char	*zi_funcname;	/* name of test function */
} ztest_info_t;

typedef struct ztest_shared_callstate {
	uint64_t	zc_count;	/* per-pass count */
	uint64_t	zc_time;	/* per-pass time */
	uint64_t	zc_next;	/* next time to call this function */
} ztest_shared_callstate_t;

static ztest_shared_callstate_t *ztest_shared_callstate;
#define	ZTEST_GET_SHARED_CALLSTATE(c) (&ztest_shared_callstate[c])

ztest_func_t ztest_dmu_read_write;
ztest_func_t ztest_dmu_write_parallel;
ztest_func_t ztest_dmu_object_alloc_free;
ztest_func_t ztest_dmu_object_next_chunk;
ztest_func_t ztest_dmu_commit_callbacks;
ztest_func_t ztest_zap;
ztest_func_t ztest_zap_parallel;
ztest_func_t ztest_zil_commit;
ztest_func_t ztest_zil_remount;
ztest_func_t ztest_dmu_read_write_zcopy;
ztest_func_t ztest_dmu_objset_create_destroy;
ztest_func_t ztest_dmu_prealloc;
ztest_func_t ztest_fzap;
ztest_func_t ztest_dmu_snapshot_create_destroy;
ztest_func_t ztest_dsl_prop_get_set;
ztest_func_t ztest_spa_prop_get_set;
ztest_func_t ztest_spa_create_destroy;
ztest_func_t ztest_fault_inject;
ztest_func_t ztest_dmu_snapshot_hold;
ztest_func_t ztest_mmp_enable_disable;
ztest_func_t ztest_scrub;
ztest_func_t ztest_dsl_dataset_promote_busy;
ztest_func_t ztest_vdev_attach_detach;
ztest_func_t ztest_vdev_raidz_attach;
ztest_func_t ztest_vdev_LUN_growth;
ztest_func_t ztest_vdev_add_remove;
ztest_func_t ztest_vdev_class_add;
ztest_func_t ztest_vdev_aux_add_remove;
ztest_func_t ztest_split_pool;
ztest_func_t ztest_reguid;
ztest_func_t ztest_spa_upgrade;
ztest_func_t ztest_device_removal;
ztest_func_t ztest_spa_checkpoint_create_discard;
ztest_func_t ztest_initialize;
ztest_func_t ztest_trim;
ztest_func_t ztest_blake3;
ztest_func_t ztest_fletcher;
ztest_func_t ztest_fletcher_incr;
ztest_func_t ztest_verify_dnode_bt;

static uint64_t zopt_always = 0ULL * NANOSEC;		/* all the time */
static uint64_t zopt_incessant = 1ULL * NANOSEC / 10;	/* every 1/10 second */
static uint64_t zopt_often = 1ULL * NANOSEC;		/* every second */
static uint64_t zopt_sometimes = 10ULL * NANOSEC;	/* every 10 seconds */
static uint64_t zopt_rarely = 60ULL * NANOSEC;		/* every 60 seconds */

#define	ZTI_INIT(func, iters, interval) \
	{   .zi_func = (func), \
	    .zi_iters = (iters), \
	    .zi_interval = (interval), \
	    .zi_funcname = # func }

static ztest_info_t ztest_info[] = {
	ZTI_INIT(ztest_dmu_read_write, 1, &zopt_always),
	ZTI_INIT(ztest_dmu_write_parallel, 10, &zopt_always),
	ZTI_INIT(ztest_dmu_object_alloc_free, 1, &zopt_always),
	ZTI_INIT(ztest_dmu_object_next_chunk, 1, &zopt_sometimes),
	ZTI_INIT(ztest_dmu_commit_callbacks, 1, &zopt_always),
	ZTI_INIT(ztest_zap, 30, &zopt_always),
	ZTI_INIT(ztest_zap_parallel, 100, &zopt_always),
	ZTI_INIT(ztest_split_pool, 1, &zopt_sometimes),
	ZTI_INIT(ztest_zil_commit, 1, &zopt_incessant),
	ZTI_INIT(ztest_zil_remount, 1, &zopt_sometimes),
	ZTI_INIT(ztest_dmu_read_write_zcopy, 1, &zopt_often),
	ZTI_INIT(ztest_dmu_objset_create_destroy, 1, &zopt_often),
	ZTI_INIT(ztest_dsl_prop_get_set, 1, &zopt_often),
	ZTI_INIT(ztest_spa_prop_get_set, 1, &zopt_sometimes),
#if 0
	ZTI_INIT(ztest_dmu_prealloc, 1, &zopt_sometimes),
#endif
	ZTI_INIT(ztest_fzap, 1, &zopt_sometimes),
	ZTI_INIT(ztest_dmu_snapshot_create_destroy, 1, &zopt_sometimes),
	ZTI_INIT(ztest_spa_create_destroy, 1, &zopt_sometimes),
	ZTI_INIT(ztest_fault_inject, 1, &zopt_sometimes),
	ZTI_INIT(ztest_dmu_snapshot_hold, 1, &zopt_sometimes),
	ZTI_INIT(ztest_mmp_enable_disable, 1, &zopt_sometimes),
	ZTI_INIT(ztest_reguid, 1, &zopt_rarely),
	ZTI_INIT(ztest_scrub, 1, &zopt_rarely),
	ZTI_INIT(ztest_spa_upgrade, 1, &zopt_rarely),
	ZTI_INIT(ztest_dsl_dataset_promote_busy, 1, &zopt_rarely),
	ZTI_INIT(ztest_vdev_attach_detach, 1, &zopt_sometimes),
	ZTI_INIT(ztest_vdev_raidz_attach, 1, &zopt_sometimes),
	ZTI_INIT(ztest_vdev_LUN_growth, 1, &zopt_rarely),
	ZTI_INIT(ztest_vdev_add_remove, 1, &ztest_opts.zo_vdevtime),
	ZTI_INIT(ztest_vdev_class_add, 1, &ztest_opts.zo_vdevtime),
	ZTI_INIT(ztest_vdev_aux_add_remove, 1, &ztest_opts.zo_vdevtime),
	ZTI_INIT(ztest_device_removal, 1, &zopt_sometimes),
	ZTI_INIT(ztest_spa_checkpoint_create_discard, 1, &zopt_rarely),
	ZTI_INIT(ztest_initialize, 1, &zopt_sometimes),
	ZTI_INIT(ztest_trim, 1, &zopt_sometimes),
	ZTI_INIT(ztest_blake3, 1, &zopt_rarely),
	ZTI_INIT(ztest_fletcher, 1, &zopt_rarely),
	ZTI_INIT(ztest_fletcher_incr, 1, &zopt_rarely),
	ZTI_INIT(ztest_verify_dnode_bt, 1, &zopt_sometimes),
};

#define	ZTEST_FUNCS (sizeof (ztest_info) / sizeof (ztest_info_t))

/*
 * The following struct is used to hold a list of uncalled commit callbacks.
 * The callbacks are ordered by txg number.
 */
typedef struct ztest_cb_list {
	kmutex_t	zcl_callbacks_lock;
	list_t		zcl_callbacks;
} ztest_cb_list_t;

/*
 * Stuff we need to share writably between parent and child.
 */
typedef struct ztest_shared {
	boolean_t	zs_do_init;
	hrtime_t	zs_proc_start;
	hrtime_t	zs_proc_stop;
	hrtime_t	zs_thread_start;
	hrtime_t	zs_thread_stop;
	hrtime_t	zs_thread_kill;
	uint64_t	zs_enospc_count;
	uint64_t	zs_vdev_next_leaf;
	uint64_t	zs_vdev_aux;
	uint64_t	zs_alloc;
	uint64_t	zs_space;
	uint64_t	zs_splits;
	uint64_t	zs_mirrors;
	uint64_t	zs_metaslab_sz;
	uint64_t	zs_metaslab_df_alloc_threshold;
	uint64_t	zs_guid;
} ztest_shared_t;

#define	ID_PARALLEL	-1ULL

static char ztest_dev_template[] = "%s/%s.%llua";
static char ztest_aux_template[] = "%s/%s.%s.%llu";
static ztest_shared_t *ztest_shared;

static spa_t *ztest_spa = NULL;
static ztest_ds_t *ztest_ds;

static kmutex_t ztest_vdev_lock;
static boolean_t ztest_device_removal_active = B_FALSE;
static boolean_t ztest_pool_scrubbed = B_FALSE;
static kmutex_t ztest_checkpoint_lock;

/*
 * The ztest_name_lock protects the pool and dataset namespace used by
 * the individual tests.  To modify the namespace, consumers must grab
 * this lock as writer.  Grabbing the lock as reader will ensure that the
 * namespace does not change while the lock is held.
 */
static pthread_rwlock_t ztest_name_lock;
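
/*
 * For instance (illustrative only), a test that merely opens datasets by
 * name would bracket the lookup with the reader side of the lock, while a
 * test that renames or destroys datasets would take it as writer:
 *
 *	(void) pthread_rwlock_rdlock(&ztest_name_lock);
 *	// ... resolve and open objects under a stable namespace ...
 *	(void) pthread_rwlock_unlock(&ztest_name_lock);
 */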

static boolean_t ztest_dump_core = B_TRUE;
static boolean_t ztest_exiting;

/* Global commit callback list */
static ztest_cb_list_t zcl;
/* Commit cb delay */
static uint64_t zc_min_txg_delay = UINT64_MAX;
static int zc_cb_counter = 0;

/*
 * Minimum number of commit callbacks that need to be registered for us to
 * check whether the minimum txg delay is acceptable.
 */
#define	ZTEST_COMMIT_CB_MIN_REG	100

/*
 * If a number of txgs equal to this threshold have been created after a commit
 * callback has been registered but not called, then we assume there is an
 * implementation bug.
 */
#define	ZTEST_COMMIT_CB_THRESH	(TXG_CONCURRENT_STATES + 1000)

enum ztest_object {
	ZTEST_META_DNODE = 0,
	ZTEST_DIROBJ,
	ZTEST_OBJECTS
};

static __attribute__((noreturn)) void usage(boolean_t requested);
static int ztest_scrub_impl(spa_t *spa);

/*
 * These libumem hooks provide a reasonable set of defaults for the allocator's
 * debugging facilities.
 */
const char *
_umem_debug_init(void)
{
	return ("default,verbose");	/* $UMEM_DEBUG setting */
}

const char *
_umem_logging_init(void)
{
	return ("fail,contents");	/* $UMEM_LOGGING setting */
}

static void
dump_debug_buffer(void)
{
	ssize_t ret __attribute__((unused));

	if (!ztest_opts.zo_dump_dbgmsg)
		return;

	/*
	 * We use write() instead of printf() so that this function
	 * is safe to call from a signal handler.
	 */
	ret = write(STDERR_FILENO, "\n", 1);
	zfs_dbgmsg_print(STDERR_FILENO, "ztest");
}

static void sig_handler(int signo)
{
	struct sigaction action;

	libspl_backtrace(STDERR_FILENO);
	dump_debug_buffer();

	/*
	 * Restore default action and re-raise signal so SIGSEGV and
	 * SIGABRT can trigger a core dump.
	 */
	action.sa_handler = SIG_DFL;
	sigemptyset(&action.sa_mask);
	action.sa_flags = 0;
	(void) sigaction(signo, &action, NULL);
	raise(signo);
}

#define	FATAL_MSG_SZ	1024

static const char *fatal_msg;

static __attribute__((format(printf, 2, 3))) __attribute__((noreturn)) void
fatal(int do_perror, const char *message, ...)
{
	va_list args;
	int save_errno = errno;
	char *buf;

	(void) fflush(stdout);
	buf = umem_alloc(FATAL_MSG_SZ, UMEM_NOFAIL);
	if (buf == NULL)
		goto out;

	va_start(args, message);
	(void) sprintf(buf, "ztest: ");
	/* LINTED */
	(void) vsprintf(buf + strlen(buf), message, args);
	va_end(args);
	if (do_perror) {
		(void) snprintf(buf + strlen(buf), FATAL_MSG_SZ - strlen(buf),
		    ": %s", strerror(save_errno));
	}
	(void) fprintf(stderr, "%s\n", buf);
	fatal_msg = buf;		/* to ease debugging */

out:
	if (ztest_dump_core)
		abort();
	else
		dump_debug_buffer();

	exit(3);
}

static int
str2shift(const char *buf)
{
	const char *ends = "BKMGTPEZ";
	int i;

	if (buf[0] == '\0')
		return (0);
	for (i = 0; i < strlen(ends); i++) {
		if (toupper(buf[0]) == ends[i])
			break;
	}
	if (i == strlen(ends)) {
		(void) fprintf(stderr, "ztest: invalid bytes suffix: %s\n",
		    buf);
		usage(B_FALSE);
	}
	if (buf[1] == '\0' || (toupper(buf[1]) == 'B' && buf[2] == '\0')) {
		return (10*i);
	}
	(void) fprintf(stderr, "ztest: invalid bytes suffix: %s\n", buf);
	usage(B_FALSE);
}

static uint64_t
nicenumtoull(const char *buf)
{
	char *end;
	uint64_t val;

	val = strtoull(buf, &end, 0);
	if (end == buf) {
		(void) fprintf(stderr, "ztest: bad numeric value: %s\n", buf);
		usage(B_FALSE);
	} else if (end[0] == '.') {
		double fval = strtod(buf, &end);
		fval *= pow(2, str2shift(end));
		/*
		 * UINT64_MAX is not exactly representable as a double.
		 * The closest representation is UINT64_MAX + 1, so we
		 * use a >= comparison instead of > for the bounds check.
		 */
		if (fval >= (double)UINT64_MAX) {
			(void) fprintf(stderr, "ztest: value too large: %s\n",
			    buf);
			usage(B_FALSE);
		}
		val = (uint64_t)fval;
	} else {
		int shift = str2shift(end);
		if (shift >= 64 || (val << shift) >> shift != val) {
			(void) fprintf(stderr, "ztest: value too large: %s\n",
			    buf);
			usage(B_FALSE);
		}
		val <<= shift;
	}
	return (val);
}
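
/*
 * A few worked examples of the accepted syntax (illustrative):
 *
 *	nicenumtoull("300")  == 300
 *	nicenumtoull("64K")  == 64ULL << 10
 *	nicenumtoull("256m") == 256ULL << 20	(suffixes are case-insensitive)
 *	nicenumtoull("1.5G") == (uint64_t)(1.5 * (1ULL << 30))
 */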

typedef struct ztest_option {
	const char	short_opt;
	const char	*long_opt;
	const char	*long_opt_param;
	const char	*comment;
	unsigned int	default_int;
	const char	*default_str;
} ztest_option_t;

/*
 * The following option_table is used for generating the usage info as well as
 * the long and short option information for calling getopt_long().
 */
static ztest_option_t option_table[] = {
	{ 'v',	"vdevs", "INTEGER", "Number of vdevs", DEFAULT_VDEV_COUNT,
	    NULL},
	{ 's',	"vdev-size", "INTEGER", "Size of each vdev",
	    NO_DEFAULT, DEFAULT_VDEV_SIZE_STR},
	{ 'a',	"alignment-shift", "INTEGER",
	    "Alignment shift; use 0 for random", DEFAULT_ASHIFT, NULL},
	{ 'm',	"mirror-copies", "INTEGER", "Number of mirror copies",
	    DEFAULT_MIRRORS, NULL},
	{ 'r',	"raid-disks", "INTEGER", "Number of raidz/draid disks",
	    DEFAULT_RAID_CHILDREN, NULL},
	{ 'R',	"raid-parity", "INTEGER", "Raid parity",
	    DEFAULT_RAID_PARITY, NULL},
	{ 'K',	"raid-kind", "raidz|eraidz|draid|random", "Raid kind",
	    NO_DEFAULT, "random"},
	{ 'D',	"draid-data", "INTEGER", "Number of draid data drives",
	    DEFAULT_DRAID_DATA, NULL},
	{ 'S',	"draid-spares", "INTEGER", "Number of draid spares",
	    DEFAULT_DRAID_SPARES, NULL},
	{ 'd',	"datasets", "INTEGER", "Number of datasets",
	    DEFAULT_DATASETS_COUNT, NULL},
	{ 't',	"threads", "INTEGER", "Number of ztest threads",
	    DEFAULT_THREADS, NULL},
	{ 'g',	"gang-block-threshold", "INTEGER",
	    "Metaslab gang block threshold",
	    NO_DEFAULT, DEFAULT_FORCE_GANGING_STR},
	{ 'i',	"init-count", "INTEGER", "Number of times to initialize pool",
	    DEFAULT_INITS, NULL},
	{ 'k',	"kill-percentage", "INTEGER", "Kill percentage",
	    NO_DEFAULT, DEFAULT_KILLRATE_STR},
	{ 'p',	"pool-name", "STRING", "Pool name",
	    NO_DEFAULT, DEFAULT_POOL},
	{ 'f',	"vdev-file-directory", "PATH", "File directory for vdev files",
	    NO_DEFAULT, DEFAULT_VDEV_DIR},
	{ 'M',	"multi-host", NULL,
	    "Multi-host; simulate pool imported on remote host",
	    NO_DEFAULT, NULL},
	{ 'E',	"use-existing-pool", NULL,
	    "Use existing pool instead of creating new one", NO_DEFAULT, NULL},
	{ 'T',	"run-time", "INTEGER", "Total run time",
	    NO_DEFAULT, DEFAULT_RUN_TIME_STR},
	{ 'P',	"pass-time", "INTEGER", "Time per pass",
	    NO_DEFAULT, DEFAULT_PASS_TIME_STR},
	{ 'F',	"freeze-loops", "INTEGER", "Max loops in spa_freeze()",
	    DEFAULT_MAX_LOOPS, NULL},
	{ 'B',	"alt-ztest", "PATH", "Alternate ztest path",
	    NO_DEFAULT, NULL},
	{ 'C',	"vdev-class-state", "on|off|random", "vdev class state",
	    NO_DEFAULT, "random"},
	{ 'X',	"raidz-expansion", NULL,
	    "Perform a dedicated raidz expansion test",
	    NO_DEFAULT, NULL},
	{ 'o',	"option", "\"OPTION=INTEGER\"",
	    "Set global variable to an unsigned 32-bit integer value",
	    NO_DEFAULT, NULL},
	{ 'G',	"dump-debug-msg", NULL,
	    "Dump zfs_dbgmsg buffer before exiting due to an error",
	    NO_DEFAULT, NULL},
	{ 'V',	"verbose", NULL,
	    "Verbose (use multiple times for ever more verbosity)",
	    NO_DEFAULT, NULL},
	{ 'h',	"help", NULL, "Show this help",
	    NO_DEFAULT, NULL},
	{0, 0, 0, 0, 0, 0}
};

static struct option *long_opts = NULL;
static char *short_opts = NULL;
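
/*
 * Each option_table entry yields both a short and a long spelling once
 * init_options() below has run; for example the 'T' row can be given as
 * either "-T 86400" or "--run-time=86400" (illustrative values).
 */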

static void
init_options(void)
{
	ASSERT3P(long_opts, ==, NULL);
	ASSERT3P(short_opts, ==, NULL);

	int count = sizeof (option_table) / sizeof (option_table[0]);
	long_opts = umem_alloc(sizeof (struct option) * count, UMEM_NOFAIL);

	short_opts = umem_alloc(sizeof (char) * 2 * count, UMEM_NOFAIL);
	int short_opt_index = 0;

	for (int i = 0; i < count; i++) {
		long_opts[i].val = option_table[i].short_opt;
		long_opts[i].name = option_table[i].long_opt;
		long_opts[i].has_arg = option_table[i].long_opt_param != NULL
		    ? required_argument : no_argument;
		long_opts[i].flag = NULL;
		short_opts[short_opt_index++] = option_table[i].short_opt;
		if (option_table[i].long_opt_param != NULL) {
			short_opts[short_opt_index++] = ':';
		}
	}
}

static void
fini_options(void)
{
	int count = sizeof (option_table) / sizeof (option_table[0]);

	umem_free(long_opts, sizeof (struct option) * count);
	umem_free(short_opts, sizeof (char) * 2 * count);

	long_opts = NULL;
	short_opts = NULL;
}

static __attribute__((noreturn)) void
usage(boolean_t requested)
{
	char option[80];
	FILE *fp = requested ? stdout : stderr;

	(void) fprintf(fp, "Usage: %s [OPTIONS...]\n", DEFAULT_POOL);
	for (int i = 0; option_table[i].short_opt != 0; i++) {
		if (option_table[i].long_opt_param != NULL) {
			(void) sprintf(option, " -%c --%s=%s",
			    option_table[i].short_opt,
			    option_table[i].long_opt,
			    option_table[i].long_opt_param);
		} else {
			(void) sprintf(option, " -%c --%s",
			    option_table[i].short_opt,
			    option_table[i].long_opt);
		}
		(void) fprintf(fp, " %-43s%s", option,
		    option_table[i].comment);

		if (option_table[i].long_opt_param != NULL) {
			if (option_table[i].default_str != NULL) {
				(void) fprintf(fp, " (default: %s)",
				    option_table[i].default_str);
			} else if (option_table[i].default_int != NO_DEFAULT) {
				(void) fprintf(fp, " (default: %u)",
				    option_table[i].default_int);
			}
		}
		(void) fprintf(fp, "\n");
	}
	exit(requested ? 0 : 1);
}

static uint64_t
ztest_random(uint64_t range)
{
	uint64_t r;

	ASSERT3S(ztest_fd_rand, >=, 0);

	if (range == 0)
		return (0);

	if (read(ztest_fd_rand, &r, sizeof (r)) != sizeof (r))
		fatal(B_TRUE, "short read from /dev/urandom");

	return (r % range);
}

static void
ztest_parse_name_value(const char *input, ztest_shared_opts_t *zo)
{
	char name[32];
	char *value;
	int state = ZTEST_VDEV_CLASS_RND;

	(void) strlcpy(name, input, sizeof (name));

	value = strchr(name, '=');
	if (value == NULL) {
		(void) fprintf(stderr, "missing value in property=value "
		    "'-C' argument (%s)\n", input);
		usage(B_FALSE);
	}
	*(value) = '\0';
	value++;

	if (strcmp(value, "on") == 0) {
		state = ZTEST_VDEV_CLASS_ON;
	} else if (strcmp(value, "off") == 0) {
		state = ZTEST_VDEV_CLASS_OFF;
	} else if (strcmp(value, "random") == 0) {
		state = ZTEST_VDEV_CLASS_RND;
	} else {
		(void) fprintf(stderr, "invalid property value '%s'\n", value);
		usage(B_FALSE);
	}

	if (strcmp(name, "special") == 0) {
		zo->zo_special_vdevs = state;
	} else {
		(void) fprintf(stderr, "invalid property name '%s'\n", name);
		usage(B_FALSE);
	}
	if (zo->zo_verbose >= 3)
		(void) printf("%s vdev state is '%s'\n", name, value);
}

static void
process_options(int argc, char **argv)
{
	char *path;
	ztest_shared_opts_t *zo = &ztest_opts;

	int opt;
	uint64_t value;
	const char *raid_kind = "random";

	memcpy(zo, &ztest_opts_defaults, sizeof (*zo));

	init_options();

	while ((opt = getopt_long(argc, argv, short_opts, long_opts,
	    NULL)) != EOF) {
		value = 0;
		switch (opt) {
		case 'v':
		case 's':
		case 'a':
		case 'm':
		case 'r':
		case 'R':
		case 'D':
		case 'S':
		case 'd':
		case 't':
		case 'g':
		case 'i':
		case 'k':
		case 'T':
		case 'P':
		case 'F':
			value = nicenumtoull(optarg);
		}
		switch (opt) {
		case 'v':
			zo->zo_vdevs = value;
			break;
		case 's':
			zo->zo_vdev_size = MAX(SPA_MINDEVSIZE, value);
			break;
		case 'a':
			zo->zo_ashift = value;
			break;
		case 'm':
			zo->zo_mirrors = value;
			break;
		case 'r':
			zo->zo_raid_children = MAX(1, value);
			break;
		case 'R':
			zo->zo_raid_parity = MIN(MAX(value, 1), 3);
			break;
		case 'K':
			raid_kind = optarg;
			break;
		case 'D':
			zo->zo_draid_data = MAX(1, value);
			break;
		case 'S':
			zo->zo_draid_spares = MAX(1, value);
			break;
		case 'd':
			zo->zo_datasets = MAX(1, value);
			break;
		case 't':
			zo->zo_threads = MAX(1, value);
			break;
		case 'g':
			zo->zo_metaslab_force_ganging =
			    MAX(SPA_MINBLOCKSIZE << 1, value);
			break;
		case 'i':
			zo->zo_init = value;
			break;
		case 'k':
			zo->zo_killrate = value;
			break;
		case 'p':
			(void) strlcpy(zo->zo_pool, optarg,
			    sizeof (zo->zo_pool));
			break;
		case 'f':
			path = realpath(optarg, NULL);
			if (path == NULL) {
				(void) fprintf(stderr, "error: %s: %s\n",
				    optarg, strerror(errno));
				usage(B_FALSE);
			} else {
				(void) strlcpy(zo->zo_dir, path,
				    sizeof (zo->zo_dir));
				free(path);
			}
			break;
		case 'M':
			zo->zo_mmp_test = 1;
			break;
		case 'V':
			zo->zo_verbose++;
			break;
		case 'X':
			zo->zo_raidz_expand_test = RAIDZ_EXPAND_REQUESTED;
			break;
		case 'E':
			zo->zo_init = 0;
			break;
		case 'T':
			zo->zo_time = value;
			break;
		case 'P':
			zo->zo_passtime = MAX(1, value);
			break;
		case 'F':
			zo->zo_maxloops = MAX(1, value);
			break;
		case 'B':
			(void) strlcpy(zo->zo_alt_ztest, optarg,
			    sizeof (zo->zo_alt_ztest));
			break;
		case 'C':
			ztest_parse_name_value(optarg, zo);
			break;
		case 'o':
			if (zo->zo_gvars_count >= ZO_GVARS_MAX_COUNT) {
				(void) fprintf(stderr,
				    "max global var count (%zu) exceeded\n",
				    ZO_GVARS_MAX_COUNT);
				usage(B_FALSE);
			}
			char *v = zo->zo_gvars[zo->zo_gvars_count];
			if (strlcpy(v, optarg, ZO_GVARS_MAX_ARGLEN) >=
			    ZO_GVARS_MAX_ARGLEN) {
				(void) fprintf(stderr,
				    "global var option '%s' is too long\n",
				    optarg);
				usage(B_FALSE);
			}
			zo->zo_gvars_count++;
			break;
		case 'G':
			zo->zo_dump_dbgmsg = 1;
			break;
		case 'h':
			usage(B_TRUE);
			break;
		case '?':
		default:
			usage(B_FALSE);
			break;
		}
	}

	fini_options();

	/* Force compatible options for raidz expansion run */
	if (zo->zo_raidz_expand_test == RAIDZ_EXPAND_REQUESTED) {
		zo->zo_mmp_test = 0;
		zo->zo_mirrors = 0;
		zo->zo_vdevs = 1;
		zo->zo_vdev_size = DEFAULT_VDEV_SIZE * 2;
		zo->zo_raid_do_expand = B_FALSE;
		raid_kind = "raidz";
	}

	if (strcmp(raid_kind, "random") == 0) {
		switch (ztest_random(3)) {
		case 0:
			raid_kind = "raidz";
			break;
		case 1:
			raid_kind = "eraidz";
			break;
		case 2:
			raid_kind = "draid";
			break;
		}

		if (ztest_opts.zo_verbose >= 3)
			(void) printf("choosing RAID type '%s'\n", raid_kind);
	}

	if (strcmp(raid_kind, "draid") == 0) {
		uint64_t min_devsize;

		/* With fewer disks use 256M, otherwise 128M is OK */
		min_devsize = (ztest_opts.zo_raid_children < 16) ?
		    (256ULL << 20) : (128ULL << 20);

		/* No top-level mirrors with dRAID for now */
		zo->zo_mirrors = 0;

		/* Use more appropriate defaults for dRAID */
		if (zo->zo_vdevs == ztest_opts_defaults.zo_vdevs)
			zo->zo_vdevs = 1;
		if (zo->zo_raid_children ==
		    ztest_opts_defaults.zo_raid_children)
			zo->zo_raid_children = 16;
		if (zo->zo_ashift < 12)
			zo->zo_ashift = 12;
		if (zo->zo_vdev_size < min_devsize)
			zo->zo_vdev_size = min_devsize;

		if (zo->zo_draid_data + zo->zo_raid_parity >
		    zo->zo_raid_children - zo->zo_draid_spares) {
			(void) fprintf(stderr, "error: too few draid "
			    "children (%d) for stripe width (%d)\n",
			    zo->zo_raid_children,
			    zo->zo_draid_data + zo->zo_raid_parity);
			usage(B_FALSE);
		}

		(void) strlcpy(zo->zo_raid_type, VDEV_TYPE_DRAID,
		    sizeof (zo->zo_raid_type));

	} else if (strcmp(raid_kind, "eraidz") == 0) {
		/* using eraidz (expandable raidz) */
		zo->zo_raid_do_expand = B_TRUE;

		/* tests expect top-level to be raidz */
		zo->zo_mirrors = 0;
		zo->zo_vdevs = 1;

		/* Make sure parity is less than data columns */
		zo->zo_raid_parity = MIN(zo->zo_raid_parity,
		    zo->zo_raid_children - 1);

	} else /* using raidz */ {
		ASSERT0(strcmp(raid_kind, "raidz"));

		zo->zo_raid_parity = MIN(zo->zo_raid_parity,
		    zo->zo_raid_children - 1);
	}

	zo->zo_vdevtime =
	    (zo->zo_vdevs > 0 ? zo->zo_time * NANOSEC / zo->zo_vdevs :
	    UINT64_MAX >> 2);

	if (*zo->zo_alt_ztest) {
		const char *invalid_what = "ztest";
		char *val = zo->zo_alt_ztest;
		if (0 != access(val, X_OK) ||
		    (strrchr(val, '/') == NULL && (errno == EINVAL)))
			goto invalid;

		int dirlen = strrchr(val, '/') - val;
		strlcpy(zo->zo_alt_libpath, val,
		    MIN(sizeof (zo->zo_alt_libpath), dirlen + 1));
		invalid_what = "library path", val = zo->zo_alt_libpath;
		if (strrchr(val, '/') == NULL && (errno == EINVAL))
			goto invalid;
		*strrchr(val, '/') = '\0';
		strlcat(val, "/lib", sizeof (zo->zo_alt_libpath));

		if (0 != access(zo->zo_alt_libpath, X_OK))
			goto invalid;
		return;

invalid:
		ztest_dump_core = B_FALSE;
		fatal(B_TRUE, "invalid alternate %s %s", invalid_what, val);
	}
}

static void
ztest_kill(ztest_shared_t *zs)
{
	zs->zs_alloc = metaslab_class_get_alloc(spa_normal_class(ztest_spa));
	zs->zs_space = metaslab_class_get_space(spa_normal_class(ztest_spa));

	/*
	 * Before we kill ourselves, make sure that the config is updated.
	 * See comment above spa_write_cachefile().
	 */
	if (raidz_expand_pause_point != RAIDZ_EXPAND_PAUSE_NONE) {
		if (mutex_tryenter(&spa_namespace_lock)) {
			spa_write_cachefile(ztest_spa, B_FALSE, B_FALSE,
			    B_FALSE);
			mutex_exit(&spa_namespace_lock);

			ztest_scratch_state->zs_raidz_scratch_verify_pause =
			    raidz_expand_pause_point;
		} else {
			/*
			 * Do not verify the scratch object if the
			 * spa_namespace_lock cannot be acquired; attempting
			 * to do so could deadlock in spa_config_update().
			 */
			raidz_expand_pause_point = RAIDZ_EXPAND_PAUSE_NONE;

			return;
		}
	} else {
		mutex_enter(&spa_namespace_lock);
		spa_write_cachefile(ztest_spa, B_FALSE, B_FALSE, B_FALSE);
		mutex_exit(&spa_namespace_lock);
	}

	(void) raise(SIGKILL);
}

static void
ztest_record_enospc(const char *s)
{
	(void) s;
	ztest_shared->zs_enospc_count++;
}

static uint64_t
ztest_get_ashift(void)
{
	if (ztest_opts.zo_ashift == 0)
		return (SPA_MINBLOCKSHIFT + ztest_random(5));
	return (ztest_opts.zo_ashift);
}

static boolean_t
ztest_is_draid_spare(const char *name)
{
	uint64_t spare_id = 0, parity = 0, vdev_id = 0;

	if (sscanf(name, VDEV_TYPE_DRAID "%"PRIu64"-%"PRIu64"-%"PRIu64"",
	    &parity, &vdev_id, &spare_id) == 3) {
		return (B_TRUE);
	}

	return (B_FALSE);
}
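
/*
 * dRAID distributed spares use synthetic names of the form
 * draid<parity>-<vdev>-<spare>; for example, "draid1-2-3" would parse in
 * ztest_is_draid_spare() above as parity 1, top-level vdev id 2, spare id 3,
 * so all three fields match and the name is treated as a spare rather than
 * a file vdev.
 */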

static nvlist_t *
make_vdev_file(const char *path, const char *aux, const char *pool,
    size_t size, uint64_t ashift)
{
	char *pathbuf = NULL;
	uint64_t vdev;
	nvlist_t *file;
	boolean_t draid_spare = B_FALSE;

	if (ashift == 0)
		ashift = ztest_get_ashift();

	if (path == NULL) {
		pathbuf = umem_alloc(MAXPATHLEN, UMEM_NOFAIL);
		path = pathbuf;

		if (aux != NULL) {
			vdev = ztest_shared->zs_vdev_aux;
			(void) snprintf(pathbuf, MAXPATHLEN,
			    ztest_aux_template, ztest_opts.zo_dir,
			    pool == NULL ? ztest_opts.zo_pool : pool,
			    aux, vdev);
		} else {
			vdev = ztest_shared->zs_vdev_next_leaf++;
			(void) snprintf(pathbuf, MAXPATHLEN,
			    ztest_dev_template, ztest_opts.zo_dir,
			    pool == NULL ? ztest_opts.zo_pool : pool, vdev);
		}
	} else {
		draid_spare = ztest_is_draid_spare(path);
	}

	if (size != 0 && !draid_spare) {
		int fd = open(path, O_RDWR | O_CREAT | O_TRUNC, 0666);
		if (fd == -1)
			fatal(B_TRUE, "can't open %s", path);
		if (ftruncate(fd, size) != 0)
			fatal(B_TRUE, "can't ftruncate %s", path);
		(void) close(fd);
	}

	file = fnvlist_alloc();
	fnvlist_add_string(file, ZPOOL_CONFIG_TYPE,
	    draid_spare ? VDEV_TYPE_DRAID_SPARE : VDEV_TYPE_FILE);
	fnvlist_add_string(file, ZPOOL_CONFIG_PATH, path);
	fnvlist_add_uint64(file, ZPOOL_CONFIG_ASHIFT, ashift);
	umem_free(pathbuf, MAXPATHLEN);

	return (file);
}

static nvlist_t *
make_vdev_raid(const char *path, const char *aux, const char *pool, size_t size,
    uint64_t ashift, int r)
{
	nvlist_t *raid, **child;
	int c;

	if (r < 2)
		return (make_vdev_file(path, aux, pool, size, ashift));
	child = umem_alloc(r * sizeof (nvlist_t *), UMEM_NOFAIL);

	for (c = 0; c < r; c++)
		child[c] = make_vdev_file(path, aux, pool, size, ashift);

	raid = fnvlist_alloc();
	fnvlist_add_string(raid, ZPOOL_CONFIG_TYPE,
	    ztest_opts.zo_raid_type);
	fnvlist_add_uint64(raid, ZPOOL_CONFIG_NPARITY,
	    ztest_opts.zo_raid_parity);
	fnvlist_add_nvlist_array(raid, ZPOOL_CONFIG_CHILDREN,
	    (const nvlist_t **)child, r);

	if (strcmp(ztest_opts.zo_raid_type, VDEV_TYPE_DRAID) == 0) {
		uint64_t ndata = ztest_opts.zo_draid_data;
		uint64_t nparity = ztest_opts.zo_raid_parity;
		uint64_t nspares = ztest_opts.zo_draid_spares;
		uint64_t children = ztest_opts.zo_raid_children;
		uint64_t ngroups = 1;

		/*
		 * Calculate the minimum number of groups required to fill a
		 * slice: ngroups is chosen so that ngroups * (data + parity)
		 * is the least common multiple of the stripe width
		 * (data + parity) and the number of data drives
		 * (children - spares).
		 */
		while (ngroups * (ndata + nparity) % (children - nspares) != 0)
			ngroups++;

		/* Store the basic dRAID configuration. */
		fnvlist_add_uint64(raid, ZPOOL_CONFIG_DRAID_NDATA, ndata);
		fnvlist_add_uint64(raid, ZPOOL_CONFIG_DRAID_NSPARES, nspares);
		fnvlist_add_uint64(raid, ZPOOL_CONFIG_DRAID_NGROUPS, ngroups);
	}

	for (c = 0; c < r; c++)
		fnvlist_free(child[c]);

	umem_free(child, r * sizeof (nvlist_t *));

	return (raid);
}
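
/*
 * For example, with the dRAID defaults selected by process_options()
 * (draid_data = 4, parity = 1, children = 16, spares = 1) the stripe width
 * is 5 and there are 15 data drives, so the ngroups loop in make_vdev_raid()
 * above settles on ngroups = 3, the smallest value for which 3 * 5 is a
 * multiple of 15.
 */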

static nvlist_t *
make_vdev_mirror(const char *path, const char *aux, const char *pool,
    size_t size, uint64_t ashift, int r, int m)
{
	nvlist_t *mirror, **child;
	int c;

	if (m < 1)
		return (make_vdev_raid(path, aux, pool, size, ashift, r));

	child = umem_alloc(m * sizeof (nvlist_t *), UMEM_NOFAIL);

	for (c = 0; c < m; c++)
		child[c] = make_vdev_raid(path, aux, pool, size, ashift, r);

	mirror = fnvlist_alloc();
	fnvlist_add_string(mirror, ZPOOL_CONFIG_TYPE, VDEV_TYPE_MIRROR);
	fnvlist_add_nvlist_array(mirror, ZPOOL_CONFIG_CHILDREN,
	    (const nvlist_t **)child, m);

	for (c = 0; c < m; c++)
		fnvlist_free(child[c]);

	umem_free(child, m * sizeof (nvlist_t *));

	return (mirror);
}

static nvlist_t *
make_vdev_root(const char *path, const char *aux, const char *pool, size_t size,
    uint64_t ashift, const char *class, int r, int m, int t)
{
	nvlist_t *root, **child;
	int c;
	boolean_t log;

	ASSERT3S(t, >, 0);

	log = (class != NULL && strcmp(class, "log") == 0);

	child = umem_alloc(t * sizeof (nvlist_t *), UMEM_NOFAIL);

	for (c = 0; c < t; c++) {
		child[c] = make_vdev_mirror(path, aux, pool, size, ashift,
		    r, m);
		fnvlist_add_uint64(child[c], ZPOOL_CONFIG_IS_LOG, log);

		if (class != NULL && class[0] != '\0') {
			ASSERT(m > 1 || log);	/* expecting a mirror */
			fnvlist_add_string(child[c],
			    ZPOOL_CONFIG_ALLOCATION_BIAS, class);
		}
	}

	root = fnvlist_alloc();
	fnvlist_add_string(root, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT);
	fnvlist_add_nvlist_array(root, aux ? aux : ZPOOL_CONFIG_CHILDREN,
	    (const nvlist_t **)child, t);

	for (c = 0; c < t; c++)
		fnvlist_free(child[c]);

	umem_free(child, t * sizeof (nvlist_t *));

	return (root);
}

/*
 * Find a random spa version.  Returns a random spa version in the
 * range [initial_version, SPA_VERSION_FEATURES].
 */
static uint64_t
ztest_random_spa_version(uint64_t initial_version)
{
	uint64_t version = initial_version;

	if (version <= SPA_VERSION_BEFORE_FEATURES) {
		version = version +
		    ztest_random(SPA_VERSION_BEFORE_FEATURES - version + 1);
	}

	if (version > SPA_VERSION_BEFORE_FEATURES)
		version = SPA_VERSION_FEATURES;

	ASSERT(SPA_VERSION_IS_SUPPORTED(version));
	return (version);
}

static int
ztest_random_blocksize(void)
{
	ASSERT3U(ztest_spa->spa_max_ashift, !=, 0);

	/*
	 * Choose a block size >= the ashift.
	 * If the SPA supports new MAXBLOCKSIZE, test up to 1MB blocks.
	 */
	int maxbs = SPA_OLD_MAXBLOCKSHIFT;
	if (spa_maxblocksize(ztest_spa) == SPA_MAXBLOCKSIZE)
		maxbs = 20;
	uint64_t block_shift =
	    ztest_random(maxbs - ztest_spa->spa_max_ashift + 1);
	return (1 << (SPA_MINBLOCKSHIFT + block_shift));
}

static int
ztest_random_dnodesize(void)
{
	int slots;
	int max_slots = spa_maxdnodesize(ztest_spa) >> DNODE_SHIFT;

	if (max_slots == DNODE_MIN_SLOTS)
		return (DNODE_MIN_SIZE);

	/*
	 * Weight the random distribution more heavily toward smaller
	 * dnode sizes since that is more likely to reflect real-world
	 * usage.
	 */
	ASSERT3U(max_slots, >, 4);
	switch (ztest_random(10)) {
	case 0:
		slots = 5 + ztest_random(max_slots - 4);
		break;
	case 1 ... 4:
		slots = 2 + ztest_random(3);
		break;
	default:
		slots = 1;
		break;
	}

	return (slots << DNODE_SHIFT);
}

static int
ztest_random_ibshift(void)
{
	return (DN_MIN_INDBLKSHIFT +
	    ztest_random(DN_MAX_INDBLKSHIFT - DN_MIN_INDBLKSHIFT + 1));
}

static uint64_t
ztest_random_vdev_top(spa_t *spa, boolean_t log_ok)
{
	uint64_t top;
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *tvd;

	ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);

	do {
		top = ztest_random(rvd->vdev_children);
		tvd = rvd->vdev_child[top];
	} while (!vdev_is_concrete(tvd) || (tvd->vdev_islog && !log_ok) ||
	    tvd->vdev_mg == NULL || tvd->vdev_mg->mg_class == NULL);

	return (top);
}

static uint64_t
ztest_random_dsl_prop(zfs_prop_t prop)
{
	uint64_t value;

	do {
		value = zfs_prop_random_value(prop, ztest_random(-1ULL));
	} while (prop == ZFS_PROP_CHECKSUM && value == ZIO_CHECKSUM_OFF);

	return (value);
}

static int
ztest_dsl_prop_set_uint64(char *osname, zfs_prop_t prop, uint64_t value,
    boolean_t inherit)
{
	const char *propname = zfs_prop_to_name(prop);
	const char *valname;
	char *setpoint;
	uint64_t curval;
	int error;

	error = dsl_prop_set_int(osname, propname,
	    (inherit ? ZPROP_SRC_NONE : ZPROP_SRC_LOCAL), value);

	if (error == ENOSPC) {
		ztest_record_enospc(FTAG);
		return (error);
	}
	ASSERT0(error);

	setpoint = umem_alloc(MAXPATHLEN, UMEM_NOFAIL);
	VERIFY0(dsl_prop_get_integer(osname, propname, &curval, setpoint));

	if (ztest_opts.zo_verbose >= 6) {
		int err;

		err = zfs_prop_index_to_string(prop, curval, &valname);
		if (err)
			(void) printf("%s %s = %llu at '%s'\n", osname,
			    propname, (unsigned long long)curval, setpoint);
		else
			(void) printf("%s %s = %s at '%s'\n",
			    osname, propname, valname, setpoint);
	}
	umem_free(setpoint, MAXPATHLEN);

	return (error);
}

static int
ztest_spa_prop_set_uint64(zpool_prop_t prop, uint64_t value)
{
	spa_t *spa = ztest_spa;
	nvlist_t *props = NULL;
	int error;

	props = fnvlist_alloc();
	fnvlist_add_uint64(props, zpool_prop_to_name(prop), value);

	error = spa_prop_set(spa, props);

	fnvlist_free(props);

	if (error == ENOSPC) {
		ztest_record_enospc(FTAG);
		return (error);
	}
	ASSERT0(error);

	return (error);
}

static int
ztest_dmu_objset_own(const char *name, dmu_objset_type_t type,
    boolean_t readonly, boolean_t decrypt, const void *tag, objset_t **osp)
{
	int err;
	char *cp = NULL;
	char ddname[ZFS_MAX_DATASET_NAME_LEN];

	strlcpy(ddname, name, sizeof (ddname));
	cp = strchr(ddname, '@');
	if (cp != NULL)
		*cp = '\0';

	err = dmu_objset_own(name, type, readonly, decrypt, tag, osp);
	while (decrypt && err == EACCES) {
		dsl_crypto_params_t *dcp;
		nvlist_t *crypto_args = fnvlist_alloc();

		fnvlist_add_uint8_array(crypto_args, "wkeydata",
		    (uint8_t *)ztest_wkeydata, WRAPPING_KEY_LEN);
		VERIFY0(dsl_crypto_params_create_nvlist(DCP_CMD_NONE, NULL,
		    crypto_args, &dcp));
		err = spa_keystore_load_wkey(ddname, dcp, B_FALSE);
		/*
		 * Note: if there was an error loading, the wkey was not
		 * consumed, and needs to be freed.
		 */
		dsl_crypto_params_free(dcp, (err != 0));
		fnvlist_free(crypto_args);

		if (err == EINVAL) {
			/*
			 * We couldn't load a key for this dataset so try
			 * the parent.  This loop will eventually hit the
			 * encryption root since ztest only makes clones
			 * as children of their origin datasets.
			 */
			cp = strrchr(ddname, '/');
			if (cp == NULL)
				return (err);

			*cp = '\0';
			err = EACCES;
			continue;
		} else if (err != 0) {
			break;
		}

		err = dmu_objset_own(name, type, readonly, decrypt, tag, osp);
		break;
	}

	return (err);
}

static void
ztest_rll_init(rll_t *rll)
{
	rll->rll_writer = NULL;
	rll->rll_readers = 0;
	mutex_init(&rll->rll_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&rll->rll_cv, NULL, CV_DEFAULT, NULL);
}

static void
ztest_rll_destroy(rll_t *rll)
{
	ASSERT3P(rll->rll_writer, ==, NULL);
	ASSERT0(rll->rll_readers);
	mutex_destroy(&rll->rll_lock);
	cv_destroy(&rll->rll_cv);
}

static void
ztest_rll_lock(rll_t *rll, rl_type_t type)
{
	mutex_enter(&rll->rll_lock);

	if (type == ZTRL_READER) {
		while (rll->rll_writer != NULL)
			(void) cv_wait(&rll->rll_cv, &rll->rll_lock);
		rll->rll_readers++;
	} else {
		while (rll->rll_writer != NULL || rll->rll_readers)
			(void) cv_wait(&rll->rll_cv, &rll->rll_lock);
		rll->rll_writer = curthread;
	}

	mutex_exit(&rll->rll_lock);
}

static void
ztest_rll_unlock(rll_t *rll)
{
	mutex_enter(&rll->rll_lock);

	if (rll->rll_writer) {
		ASSERT0(rll->rll_readers);
		rll->rll_writer = NULL;
	} else {
		ASSERT3S(rll->rll_readers, >, 0);
		ASSERT3P(rll->rll_writer, ==, NULL);
		rll->rll_readers--;
	}

	if (rll->rll_writer == NULL && rll->rll_readers == 0)
		cv_broadcast(&rll->rll_cv);

	mutex_exit(&rll->rll_lock);
}

static void
ztest_object_lock(ztest_ds_t *zd, uint64_t object, rl_type_t type)
{
	rll_t *rll = &zd->zd_object_lock[object & (ZTEST_OBJECT_LOCKS - 1)];

	ztest_rll_lock(rll, type);
}

static void
ztest_object_unlock(ztest_ds_t *zd, uint64_t object)
{
	rll_t *rll = &zd->zd_object_lock[object & (ZTEST_OBJECT_LOCKS - 1)];

	ztest_rll_unlock(rll);
}

static rl_t *
ztest_range_lock(ztest_ds_t *zd, uint64_t object, uint64_t offset,
    uint64_t size, rl_type_t type)
{
	uint64_t hash = object ^ (offset % (ZTEST_RANGE_LOCKS + 1));
	rll_t *rll = &zd->zd_range_lock[hash & (ZTEST_RANGE_LOCKS - 1)];
	rl_t *rl;

	rl = umem_alloc(sizeof (*rl), UMEM_NOFAIL);
	rl->rl_object = object;
	rl->rl_offset = offset;
	rl->rl_size = size;
	rl->rl_lock = rll;

	ztest_rll_lock(rll, type);

	return (rl);
}

static void
ztest_range_unlock(rl_t *rl)
{
	rll_t *rll = rl->rl_lock;

	ztest_rll_unlock(rll);

	umem_free(rl, sizeof (*rl));
}

static void
ztest_zd_init(ztest_ds_t *zd, ztest_shared_ds_t *szd, objset_t *os)
{
	zd->zd_os = os;
	zd->zd_zilog = dmu_objset_zil(os);
	zd->zd_shared = szd;
	dmu_objset_name(os, zd->zd_name);
	int l;

	if (zd->zd_shared != NULL)
		zd->zd_shared->zd_seq = 0;

	VERIFY0(pthread_rwlock_init(&zd->zd_zilog_lock, NULL));
	mutex_init(&zd->zd_dirobj_lock, NULL, MUTEX_DEFAULT, NULL);

	for (l = 0; l < ZTEST_OBJECT_LOCKS; l++)
		ztest_rll_init(&zd->zd_object_lock[l]);

	for (l = 0; l < ZTEST_RANGE_LOCKS; l++)
		ztest_rll_init(&zd->zd_range_lock[l]);
}

static void
ztest_zd_fini(ztest_ds_t *zd)
{
	int l;

	mutex_destroy(&zd->zd_dirobj_lock);
	(void) pthread_rwlock_destroy(&zd->zd_zilog_lock);

	for (l = 0; l < ZTEST_OBJECT_LOCKS; l++)
		ztest_rll_destroy(&zd->zd_object_lock[l]);

	for (l = 0; l < ZTEST_RANGE_LOCKS; l++)
		ztest_rll_destroy(&zd->zd_range_lock[l]);
}

#define	TXG_MIGHTWAIT	(ztest_random(10) == 0 ? TXG_NOWAIT : TXG_WAIT)

static uint64_t
ztest_tx_assign(dmu_tx_t *tx, uint64_t txg_how, const char *tag)
{
	uint64_t txg;
	int error;

	/*
	 * Attempt to assign tx to some transaction group.
	 */
	error = dmu_tx_assign(tx, txg_how);
	if (error) {
		if (error == ERESTART) {
			ASSERT3U(txg_how, ==, TXG_NOWAIT);
			dmu_tx_wait(tx);
		} else {
			ASSERT3U(error, ==, ENOSPC);
			ztest_record_enospc(tag);
		}
		dmu_tx_abort(tx);
		return (0);
	}
	txg = dmu_tx_get_txg(tx);
	ASSERT3U(txg, !=, 0);
	return (txg);
}
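
/*
 * A typical caller pattern (illustrative; the hold shown is only an example)
 * treats a zero return as "transaction already aborted, bail out":
 *
 *	tx = dmu_tx_create(os);
 *	dmu_tx_hold_write(tx, object, offset, size);
 *	txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG);
 *	if (txg == 0)
 *		return;		// ENOSPC or ERESTART; tx already aborted
 *	// ... do the dirtying work ...
 *	dmu_tx_commit(tx);
 */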

static void
ztest_bt_generate(ztest_block_tag_t *bt, objset_t *os, uint64_t object,
    uint64_t dnodesize, uint64_t offset, uint64_t gen, uint64_t txg,
    uint64_t crtxg)
{
	bt->bt_magic = BT_MAGIC;
	bt->bt_objset = dmu_objset_id(os);
	bt->bt_object = object;
	bt->bt_dnodesize = dnodesize;
	bt->bt_offset = offset;
	bt->bt_gen = gen;
	bt->bt_txg = txg;
	bt->bt_crtxg = crtxg;
}

static void
ztest_bt_verify(ztest_block_tag_t *bt, objset_t *os, uint64_t object,
    uint64_t dnodesize, uint64_t offset, uint64_t gen, uint64_t txg,
    uint64_t crtxg)
{
	ASSERT3U(bt->bt_magic, ==, BT_MAGIC);
	ASSERT3U(bt->bt_objset, ==, dmu_objset_id(os));
	ASSERT3U(bt->bt_object, ==, object);
	ASSERT3U(bt->bt_dnodesize, ==, dnodesize);
	ASSERT3U(bt->bt_offset, ==, offset);
	ASSERT3U(bt->bt_gen, <=, gen);
	ASSERT3U(bt->bt_txg, <=, txg);
	ASSERT3U(bt->bt_crtxg, ==, crtxg);
}

static ztest_block_tag_t *
ztest_bt_bonus(dmu_buf_t *db)
{
	dmu_object_info_t doi;
	ztest_block_tag_t *bt;

	dmu_object_info_from_db(db, &doi);
	ASSERT3U(doi.doi_bonus_size, <=, db->db_size);
	ASSERT3U(doi.doi_bonus_size, >=, sizeof (*bt));
	bt = (void *)((char *)db->db_data + doi.doi_bonus_size - sizeof (*bt));

	return (bt);
}

/*
 * Generate a token to fill up unused bonus buffer space.  Try to make
 * it unique to the object, generation, and offset to verify that data
 * is not getting overwritten by data from other dnodes.
 */
#define	ZTEST_BONUS_FILL_TOKEN(obj, ds, gen, offset)	\
	(((ds) << 48) | ((gen) << 32) | ((obj) << 8) | (offset))

/*
 * Fill up the unused bonus buffer region before the block tag with a
 * verifiable pattern.  Filling the whole bonus area with non-zero data
 * helps ensure that all dnode traversal code properly skips the
 * interior regions of large dnodes.
 */
static void
ztest_fill_unused_bonus(dmu_buf_t *db, void *end, uint64_t obj,
    objset_t *os, uint64_t gen)
{
	uint64_t *bonusp;

	ASSERT(IS_P2ALIGNED((char *)end - (char *)db->db_data, 8));

	for (bonusp = db->db_data; bonusp < (uint64_t *)end; bonusp++) {
		uint64_t token = ZTEST_BONUS_FILL_TOKEN(obj, dmu_objset_id(os),
		    gen, bonusp - (uint64_t *)db->db_data);
		*bonusp = token;
	}
}
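
/*
 * As a concrete example, ZTEST_BONUS_FILL_TOKEN(7, 2, 3, 5) -- object 7 in
 * objset 2, generation 3, word offset 5 -- expands to
 * (2ULL << 48) | (3ULL << 32) | (7ULL << 8) | 5 == 0x2000300000705, so a
 * stray write from another dnode is very unlikely to reproduce the exact
 * value expected at that slot.
 */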
1909 */ 1910 static void 1911 ztest_verify_unused_bonus(dmu_buf_t *db, void *end, uint64_t obj, 1912 objset_t *os, uint64_t gen) 1913 { 1914 uint64_t *bonusp; 1915 1916 for (bonusp = db->db_data; bonusp < (uint64_t *)end; bonusp++) { 1917 uint64_t token = ZTEST_BONUS_FILL_TOKEN(obj, dmu_objset_id(os), 1918 gen, bonusp - (uint64_t *)db->db_data); 1919 VERIFY3U(*bonusp, ==, token); 1920 } 1921 } 1922 1923 /* 1924 * ZIL logging ops 1925 */ 1926 1927 #define lrz_type lr_mode 1928 #define lrz_blocksize lr_uid 1929 #define lrz_ibshift lr_gid 1930 #define lrz_bonustype lr_rdev 1931 #define lrz_dnodesize lr_crtime[1] 1932 1933 static void 1934 ztest_log_create(ztest_ds_t *zd, dmu_tx_t *tx, lr_create_t *lr) 1935 { 1936 char *name = (void *)(lr + 1); /* name follows lr */ 1937 size_t namesize = strlen(name) + 1; 1938 itx_t *itx; 1939 1940 if (zil_replaying(zd->zd_zilog, tx)) 1941 return; 1942 1943 itx = zil_itx_create(TX_CREATE, sizeof (*lr) + namesize); 1944 memcpy(&itx->itx_lr + 1, &lr->lr_common + 1, 1945 sizeof (*lr) + namesize - sizeof (lr_t)); 1946 1947 zil_itx_assign(zd->zd_zilog, itx, tx); 1948 } 1949 1950 static void 1951 ztest_log_remove(ztest_ds_t *zd, dmu_tx_t *tx, lr_remove_t *lr, uint64_t object) 1952 { 1953 char *name = (void *)(lr + 1); /* name follows lr */ 1954 size_t namesize = strlen(name) + 1; 1955 itx_t *itx; 1956 1957 if (zil_replaying(zd->zd_zilog, tx)) 1958 return; 1959 1960 itx = zil_itx_create(TX_REMOVE, sizeof (*lr) + namesize); 1961 memcpy(&itx->itx_lr + 1, &lr->lr_common + 1, 1962 sizeof (*lr) + namesize - sizeof (lr_t)); 1963 1964 itx->itx_oid = object; 1965 zil_itx_assign(zd->zd_zilog, itx, tx); 1966 } 1967 1968 static void 1969 ztest_log_write(ztest_ds_t *zd, dmu_tx_t *tx, lr_write_t *lr) 1970 { 1971 itx_t *itx; 1972 itx_wr_state_t write_state = ztest_random(WR_NUM_STATES); 1973 1974 if (zil_replaying(zd->zd_zilog, tx)) 1975 return; 1976 1977 if (lr->lr_length > zil_max_log_data(zd->zd_zilog, sizeof (lr_write_t))) 1978 write_state = WR_INDIRECT; 1979 1980 itx = zil_itx_create(TX_WRITE, 1981 sizeof (*lr) + (write_state == WR_COPIED ? 
lr->lr_length : 0)); 1982 1983 if (write_state == WR_COPIED && 1984 dmu_read(zd->zd_os, lr->lr_foid, lr->lr_offset, lr->lr_length, 1985 ((lr_write_t *)&itx->itx_lr) + 1, DMU_READ_NO_PREFETCH) != 0) { 1986 zil_itx_destroy(itx); 1987 itx = zil_itx_create(TX_WRITE, sizeof (*lr)); 1988 write_state = WR_NEED_COPY; 1989 } 1990 itx->itx_private = zd; 1991 itx->itx_wr_state = write_state; 1992 itx->itx_sync = (ztest_random(8) == 0); 1993 1994 memcpy(&itx->itx_lr + 1, &lr->lr_common + 1, 1995 sizeof (*lr) - sizeof (lr_t)); 1996 1997 zil_itx_assign(zd->zd_zilog, itx, tx); 1998 } 1999 2000 static void 2001 ztest_log_truncate(ztest_ds_t *zd, dmu_tx_t *tx, lr_truncate_t *lr) 2002 { 2003 itx_t *itx; 2004 2005 if (zil_replaying(zd->zd_zilog, tx)) 2006 return; 2007 2008 itx = zil_itx_create(TX_TRUNCATE, sizeof (*lr)); 2009 memcpy(&itx->itx_lr + 1, &lr->lr_common + 1, 2010 sizeof (*lr) - sizeof (lr_t)); 2011 2012 itx->itx_sync = B_FALSE; 2013 zil_itx_assign(zd->zd_zilog, itx, tx); 2014 } 2015 2016 static void 2017 ztest_log_setattr(ztest_ds_t *zd, dmu_tx_t *tx, lr_setattr_t *lr) 2018 { 2019 itx_t *itx; 2020 2021 if (zil_replaying(zd->zd_zilog, tx)) 2022 return; 2023 2024 itx = zil_itx_create(TX_SETATTR, sizeof (*lr)); 2025 memcpy(&itx->itx_lr + 1, &lr->lr_common + 1, 2026 sizeof (*lr) - sizeof (lr_t)); 2027 2028 itx->itx_sync = B_FALSE; 2029 zil_itx_assign(zd->zd_zilog, itx, tx); 2030 } 2031 2032 /* 2033 * ZIL replay ops 2034 */ 2035 static int 2036 ztest_replay_create(void *arg1, void *arg2, boolean_t byteswap) 2037 { 2038 ztest_ds_t *zd = arg1; 2039 lr_create_t *lr = arg2; 2040 char *name = (void *)(lr + 1); /* name follows lr */ 2041 objset_t *os = zd->zd_os; 2042 ztest_block_tag_t *bbt; 2043 dmu_buf_t *db; 2044 dmu_tx_t *tx; 2045 uint64_t txg; 2046 int error = 0; 2047 int bonuslen; 2048 2049 if (byteswap) 2050 byteswap_uint64_array(lr, sizeof (*lr)); 2051 2052 ASSERT3U(lr->lr_doid, ==, ZTEST_DIROBJ); 2053 ASSERT3S(name[0], !=, '\0'); 2054 2055 tx = dmu_tx_create(os); 2056 2057 dmu_tx_hold_zap(tx, lr->lr_doid, B_TRUE, name); 2058 2059 if (lr->lrz_type == DMU_OT_ZAP_OTHER) { 2060 dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL); 2061 } else { 2062 dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); 2063 } 2064 2065 txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); 2066 if (txg == 0) 2067 return (ENOSPC); 2068 2069 ASSERT3U(dmu_objset_zil(os)->zl_replay, ==, !!lr->lr_foid); 2070 bonuslen = DN_BONUS_SIZE(lr->lrz_dnodesize); 2071 2072 if (lr->lrz_type == DMU_OT_ZAP_OTHER) { 2073 if (lr->lr_foid == 0) { 2074 lr->lr_foid = zap_create_dnsize(os, 2075 lr->lrz_type, lr->lrz_bonustype, 2076 bonuslen, lr->lrz_dnodesize, tx); 2077 } else { 2078 error = zap_create_claim_dnsize(os, lr->lr_foid, 2079 lr->lrz_type, lr->lrz_bonustype, 2080 bonuslen, lr->lrz_dnodesize, tx); 2081 } 2082 } else { 2083 if (lr->lr_foid == 0) { 2084 lr->lr_foid = dmu_object_alloc_dnsize(os, 2085 lr->lrz_type, 0, lr->lrz_bonustype, 2086 bonuslen, lr->lrz_dnodesize, tx); 2087 } else { 2088 error = dmu_object_claim_dnsize(os, lr->lr_foid, 2089 lr->lrz_type, 0, lr->lrz_bonustype, 2090 bonuslen, lr->lrz_dnodesize, tx); 2091 } 2092 } 2093 2094 if (error) { 2095 ASSERT3U(error, ==, EEXIST); 2096 ASSERT(zd->zd_zilog->zl_replay); 2097 dmu_tx_commit(tx); 2098 return (error); 2099 } 2100 2101 ASSERT3U(lr->lr_foid, !=, 0); 2102 2103 if (lr->lrz_type != DMU_OT_ZAP_OTHER) 2104 VERIFY0(dmu_object_set_blocksize(os, lr->lr_foid, 2105 lr->lrz_blocksize, lr->lrz_ibshift, tx)); 2106 2107 VERIFY0(dmu_bonus_hold(os, lr->lr_foid, FTAG, &db)); 2108 bbt = ztest_bt_bonus(db); 2109 
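	/*
	 * Dirty the bonus buffer, then stamp it with a fresh block tag and
	 * fill the remaining bonus space with the verifiable token pattern.
	 */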
dmu_buf_will_dirty(db, tx); 2110 ztest_bt_generate(bbt, os, lr->lr_foid, lr->lrz_dnodesize, -1ULL, 2111 lr->lr_gen, txg, txg); 2112 ztest_fill_unused_bonus(db, bbt, lr->lr_foid, os, lr->lr_gen); 2113 dmu_buf_rele(db, FTAG); 2114 2115 VERIFY0(zap_add(os, lr->lr_doid, name, sizeof (uint64_t), 1, 2116 &lr->lr_foid, tx)); 2117 2118 (void) ztest_log_create(zd, tx, lr); 2119 2120 dmu_tx_commit(tx); 2121 2122 return (0); 2123 } 2124 2125 static int 2126 ztest_replay_remove(void *arg1, void *arg2, boolean_t byteswap) 2127 { 2128 ztest_ds_t *zd = arg1; 2129 lr_remove_t *lr = arg2; 2130 char *name = (void *)(lr + 1); /* name follows lr */ 2131 objset_t *os = zd->zd_os; 2132 dmu_object_info_t doi; 2133 dmu_tx_t *tx; 2134 uint64_t object, txg; 2135 2136 if (byteswap) 2137 byteswap_uint64_array(lr, sizeof (*lr)); 2138 2139 ASSERT3U(lr->lr_doid, ==, ZTEST_DIROBJ); 2140 ASSERT3S(name[0], !=, '\0'); 2141 2142 VERIFY0( 2143 zap_lookup(os, lr->lr_doid, name, sizeof (object), 1, &object)); 2144 ASSERT3U(object, !=, 0); 2145 2146 ztest_object_lock(zd, object, ZTRL_WRITER); 2147 2148 VERIFY0(dmu_object_info(os, object, &doi)); 2149 2150 tx = dmu_tx_create(os); 2151 2152 dmu_tx_hold_zap(tx, lr->lr_doid, B_FALSE, name); 2153 dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END); 2154 2155 txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); 2156 if (txg == 0) { 2157 ztest_object_unlock(zd, object); 2158 return (ENOSPC); 2159 } 2160 2161 if (doi.doi_type == DMU_OT_ZAP_OTHER) { 2162 VERIFY0(zap_destroy(os, object, tx)); 2163 } else { 2164 VERIFY0(dmu_object_free(os, object, tx)); 2165 } 2166 2167 VERIFY0(zap_remove(os, lr->lr_doid, name, tx)); 2168 2169 (void) ztest_log_remove(zd, tx, lr, object); 2170 2171 dmu_tx_commit(tx); 2172 2173 ztest_object_unlock(zd, object); 2174 2175 return (0); 2176 } 2177 2178 static int 2179 ztest_replay_write(void *arg1, void *arg2, boolean_t byteswap) 2180 { 2181 ztest_ds_t *zd = arg1; 2182 lr_write_t *lr = arg2; 2183 objset_t *os = zd->zd_os; 2184 void *data = lr + 1; /* data follows lr */ 2185 uint64_t offset, length; 2186 ztest_block_tag_t *bt = data; 2187 ztest_block_tag_t *bbt; 2188 uint64_t gen, txg, lrtxg, crtxg; 2189 dmu_object_info_t doi; 2190 dmu_tx_t *tx; 2191 dmu_buf_t *db; 2192 arc_buf_t *abuf = NULL; 2193 rl_t *rl; 2194 2195 if (byteswap) 2196 byteswap_uint64_array(lr, sizeof (*lr)); 2197 2198 offset = lr->lr_offset; 2199 length = lr->lr_length; 2200 2201 /* If it's a dmu_sync() block, write the whole block */ 2202 if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) { 2203 uint64_t blocksize = BP_GET_LSIZE(&lr->lr_blkptr); 2204 if (length < blocksize) { 2205 offset -= offset % blocksize; 2206 length = blocksize; 2207 } 2208 } 2209 2210 if (bt->bt_magic == BSWAP_64(BT_MAGIC)) 2211 byteswap_uint64_array(bt, sizeof (*bt)); 2212 2213 if (bt->bt_magic != BT_MAGIC) 2214 bt = NULL; 2215 2216 ztest_object_lock(zd, lr->lr_foid, ZTRL_READER); 2217 rl = ztest_range_lock(zd, lr->lr_foid, offset, length, ZTRL_WRITER); 2218 2219 VERIFY0(dmu_bonus_hold(os, lr->lr_foid, FTAG, &db)); 2220 2221 dmu_object_info_from_db(db, &doi); 2222 2223 bbt = ztest_bt_bonus(db); 2224 ASSERT3U(bbt->bt_magic, ==, BT_MAGIC); 2225 gen = bbt->bt_gen; 2226 crtxg = bbt->bt_crtxg; 2227 lrtxg = lr->lr_common.lrc_txg; 2228 2229 tx = dmu_tx_create(os); 2230 2231 dmu_tx_hold_write(tx, lr->lr_foid, offset, length); 2232 2233 if (ztest_random(8) == 0 && length == doi.doi_data_block_size && 2234 P2PHASE(offset, length) == 0) 2235 abuf = dmu_request_arcbuf(db, length); 2236 2237 txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); 2238 if 
(txg == 0) { 2239 if (abuf != NULL) 2240 dmu_return_arcbuf(abuf); 2241 dmu_buf_rele(db, FTAG); 2242 ztest_range_unlock(rl); 2243 ztest_object_unlock(zd, lr->lr_foid); 2244 return (ENOSPC); 2245 } 2246 2247 if (bt != NULL) { 2248 /* 2249 * Usually, verify the old data before writing new data -- 2250 * but not always, because we also want to verify correct 2251 * behavior when the data was not recently read into cache. 2252 */ 2253 ASSERT(doi.doi_data_block_size); 2254 ASSERT0(offset % doi.doi_data_block_size); 2255 if (ztest_random(4) != 0) { 2256 int prefetch = ztest_random(2) ? 2257 DMU_READ_PREFETCH : DMU_READ_NO_PREFETCH; 2258 ztest_block_tag_t rbt; 2259 2260 VERIFY(dmu_read(os, lr->lr_foid, offset, 2261 sizeof (rbt), &rbt, prefetch) == 0); 2262 if (rbt.bt_magic == BT_MAGIC) { 2263 ztest_bt_verify(&rbt, os, lr->lr_foid, 0, 2264 offset, gen, txg, crtxg); 2265 } 2266 } 2267 2268 /* 2269 * Writes can appear to be newer than the bonus buffer because 2270 * the ztest_get_data() callback does a dmu_read() of the 2271 * open-context data, which may be different than the data 2272 * as it was when the write was generated. 2273 */ 2274 if (zd->zd_zilog->zl_replay) { 2275 ztest_bt_verify(bt, os, lr->lr_foid, 0, offset, 2276 MAX(gen, bt->bt_gen), MAX(txg, lrtxg), 2277 bt->bt_crtxg); 2278 } 2279 2280 /* 2281 * Set the bt's gen/txg to the bonus buffer's gen/txg 2282 * so that all of the usual ASSERTs will work. 2283 */ 2284 ztest_bt_generate(bt, os, lr->lr_foid, 0, offset, gen, txg, 2285 crtxg); 2286 } 2287 2288 if (abuf == NULL) { 2289 dmu_write(os, lr->lr_foid, offset, length, data, tx); 2290 } else { 2291 memcpy(abuf->b_data, data, length); 2292 VERIFY0(dmu_assign_arcbuf_by_dbuf(db, offset, abuf, tx)); 2293 } 2294 2295 (void) ztest_log_write(zd, tx, lr); 2296 2297 dmu_buf_rele(db, FTAG); 2298 2299 dmu_tx_commit(tx); 2300 2301 ztest_range_unlock(rl); 2302 ztest_object_unlock(zd, lr->lr_foid); 2303 2304 return (0); 2305 } 2306 2307 static int 2308 ztest_replay_truncate(void *arg1, void *arg2, boolean_t byteswap) 2309 { 2310 ztest_ds_t *zd = arg1; 2311 lr_truncate_t *lr = arg2; 2312 objset_t *os = zd->zd_os; 2313 dmu_tx_t *tx; 2314 uint64_t txg; 2315 rl_t *rl; 2316 2317 if (byteswap) 2318 byteswap_uint64_array(lr, sizeof (*lr)); 2319 2320 ztest_object_lock(zd, lr->lr_foid, ZTRL_READER); 2321 rl = ztest_range_lock(zd, lr->lr_foid, lr->lr_offset, lr->lr_length, 2322 ZTRL_WRITER); 2323 2324 tx = dmu_tx_create(os); 2325 2326 dmu_tx_hold_free(tx, lr->lr_foid, lr->lr_offset, lr->lr_length); 2327 2328 txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); 2329 if (txg == 0) { 2330 ztest_range_unlock(rl); 2331 ztest_object_unlock(zd, lr->lr_foid); 2332 return (ENOSPC); 2333 } 2334 2335 VERIFY0(dmu_free_range(os, lr->lr_foid, lr->lr_offset, 2336 lr->lr_length, tx)); 2337 2338 (void) ztest_log_truncate(zd, tx, lr); 2339 2340 dmu_tx_commit(tx); 2341 2342 ztest_range_unlock(rl); 2343 ztest_object_unlock(zd, lr->lr_foid); 2344 2345 return (0); 2346 } 2347 2348 static int 2349 ztest_replay_setattr(void *arg1, void *arg2, boolean_t byteswap) 2350 { 2351 ztest_ds_t *zd = arg1; 2352 lr_setattr_t *lr = arg2; 2353 objset_t *os = zd->zd_os; 2354 dmu_tx_t *tx; 2355 dmu_buf_t *db; 2356 ztest_block_tag_t *bbt; 2357 uint64_t txg, lrtxg, crtxg, dnodesize; 2358 2359 if (byteswap) 2360 byteswap_uint64_array(lr, sizeof (*lr)); 2361 2362 ztest_object_lock(zd, lr->lr_foid, ZTRL_WRITER); 2363 2364 VERIFY0(dmu_bonus_hold(os, lr->lr_foid, FTAG, &db)); 2365 2366 tx = dmu_tx_create(os); 2367 dmu_tx_hold_bonus(tx, lr->lr_foid); 2368 2369 txg = 
ztest_tx_assign(tx, TXG_WAIT, FTAG); 2370 if (txg == 0) { 2371 dmu_buf_rele(db, FTAG); 2372 ztest_object_unlock(zd, lr->lr_foid); 2373 return (ENOSPC); 2374 } 2375 2376 bbt = ztest_bt_bonus(db); 2377 ASSERT3U(bbt->bt_magic, ==, BT_MAGIC); 2378 crtxg = bbt->bt_crtxg; 2379 lrtxg = lr->lr_common.lrc_txg; 2380 dnodesize = bbt->bt_dnodesize; 2381 2382 if (zd->zd_zilog->zl_replay) { 2383 ASSERT3U(lr->lr_size, !=, 0); 2384 ASSERT3U(lr->lr_mode, !=, 0); 2385 ASSERT3U(lrtxg, !=, 0); 2386 } else { 2387 /* 2388 * Randomly change the size and increment the generation. 2389 */ 2390 lr->lr_size = (ztest_random(db->db_size / sizeof (*bbt)) + 1) * 2391 sizeof (*bbt); 2392 lr->lr_mode = bbt->bt_gen + 1; 2393 ASSERT0(lrtxg); 2394 } 2395 2396 /* 2397 * Verify that the current bonus buffer is not newer than our txg. 2398 */ 2399 ztest_bt_verify(bbt, os, lr->lr_foid, dnodesize, -1ULL, lr->lr_mode, 2400 MAX(txg, lrtxg), crtxg); 2401 2402 dmu_buf_will_dirty(db, tx); 2403 2404 ASSERT3U(lr->lr_size, >=, sizeof (*bbt)); 2405 ASSERT3U(lr->lr_size, <=, db->db_size); 2406 VERIFY0(dmu_set_bonus(db, lr->lr_size, tx)); 2407 bbt = ztest_bt_bonus(db); 2408 2409 ztest_bt_generate(bbt, os, lr->lr_foid, dnodesize, -1ULL, lr->lr_mode, 2410 txg, crtxg); 2411 ztest_fill_unused_bonus(db, bbt, lr->lr_foid, os, bbt->bt_gen); 2412 dmu_buf_rele(db, FTAG); 2413 2414 (void) ztest_log_setattr(zd, tx, lr); 2415 2416 dmu_tx_commit(tx); 2417 2418 ztest_object_unlock(zd, lr->lr_foid); 2419 2420 return (0); 2421 } 2422 2423 static zil_replay_func_t *ztest_replay_vector[TX_MAX_TYPE] = { 2424 NULL, /* 0 no such transaction type */ 2425 ztest_replay_create, /* TX_CREATE */ 2426 NULL, /* TX_MKDIR */ 2427 NULL, /* TX_MKXATTR */ 2428 NULL, /* TX_SYMLINK */ 2429 ztest_replay_remove, /* TX_REMOVE */ 2430 NULL, /* TX_RMDIR */ 2431 NULL, /* TX_LINK */ 2432 NULL, /* TX_RENAME */ 2433 ztest_replay_write, /* TX_WRITE */ 2434 ztest_replay_truncate, /* TX_TRUNCATE */ 2435 ztest_replay_setattr, /* TX_SETATTR */ 2436 NULL, /* TX_ACL */ 2437 NULL, /* TX_CREATE_ACL */ 2438 NULL, /* TX_CREATE_ATTR */ 2439 NULL, /* TX_CREATE_ACL_ATTR */ 2440 NULL, /* TX_MKDIR_ACL */ 2441 NULL, /* TX_MKDIR_ATTR */ 2442 NULL, /* TX_MKDIR_ACL_ATTR */ 2443 NULL, /* TX_WRITE2 */ 2444 NULL, /* TX_SETSAXATTR */ 2445 NULL, /* TX_RENAME_EXCHANGE */ 2446 NULL, /* TX_RENAME_WHITEOUT */ 2447 }; 2448 2449 /* 2450 * ZIL get_data callbacks 2451 */ 2452 2453 static void 2454 ztest_get_done(zgd_t *zgd, int error) 2455 { 2456 (void) error; 2457 ztest_ds_t *zd = zgd->zgd_private; 2458 uint64_t object = ((rl_t *)zgd->zgd_lr)->rl_object; 2459 2460 if (zgd->zgd_db) 2461 dmu_buf_rele(zgd->zgd_db, zgd); 2462 2463 ztest_range_unlock((rl_t *)zgd->zgd_lr); 2464 ztest_object_unlock(zd, object); 2465 2466 umem_free(zgd, sizeof (*zgd)); 2467 } 2468 2469 static int 2470 ztest_get_data(void *arg, uint64_t arg2, lr_write_t *lr, char *buf, 2471 struct lwb *lwb, zio_t *zio) 2472 { 2473 (void) arg2; 2474 ztest_ds_t *zd = arg; 2475 objset_t *os = zd->zd_os; 2476 uint64_t object = lr->lr_foid; 2477 uint64_t offset = lr->lr_offset; 2478 uint64_t size = lr->lr_length; 2479 uint64_t txg = lr->lr_common.lrc_txg; 2480 uint64_t crtxg; 2481 dmu_object_info_t doi; 2482 dmu_buf_t *db; 2483 zgd_t *zgd; 2484 int error; 2485 2486 ASSERT3P(lwb, !=, NULL); 2487 ASSERT3U(size, !=, 0); 2488 2489 ztest_object_lock(zd, object, ZTRL_READER); 2490 error = dmu_bonus_hold(os, object, FTAG, &db); 2491 if (error) { 2492 ztest_object_unlock(zd, object); 2493 return (error); 2494 } 2495 2496 crtxg = ztest_bt_bonus(db)->bt_crtxg; 2497 2498 if 
(crtxg == 0 || crtxg > txg) { 2499 dmu_buf_rele(db, FTAG); 2500 ztest_object_unlock(zd, object); 2501 return (ENOENT); 2502 } 2503 2504 dmu_object_info_from_db(db, &doi); 2505 dmu_buf_rele(db, FTAG); 2506 db = NULL; 2507 2508 zgd = umem_zalloc(sizeof (*zgd), UMEM_NOFAIL); 2509 zgd->zgd_lwb = lwb; 2510 zgd->zgd_private = zd; 2511 2512 if (buf != NULL) { /* immediate write */ 2513 zgd->zgd_lr = (struct zfs_locked_range *)ztest_range_lock(zd, 2514 object, offset, size, ZTRL_READER); 2515 2516 error = dmu_read(os, object, offset, size, buf, 2517 DMU_READ_NO_PREFETCH); 2518 ASSERT0(error); 2519 } else { 2520 ASSERT3P(zio, !=, NULL); 2521 size = doi.doi_data_block_size; 2522 if (ISP2(size)) { 2523 offset = P2ALIGN_TYPED(offset, size, uint64_t); 2524 } else { 2525 ASSERT3U(offset, <, size); 2526 offset = 0; 2527 } 2528 2529 zgd->zgd_lr = (struct zfs_locked_range *)ztest_range_lock(zd, 2530 object, offset, size, ZTRL_READER); 2531 2532 error = dmu_buf_hold_noread(os, object, offset, zgd, &db); 2533 2534 if (error == 0) { 2535 blkptr_t *bp = &lr->lr_blkptr; 2536 2537 zgd->zgd_db = db; 2538 zgd->zgd_bp = bp; 2539 2540 ASSERT3U(db->db_offset, ==, offset); 2541 ASSERT3U(db->db_size, ==, size); 2542 2543 error = dmu_sync(zio, lr->lr_common.lrc_txg, 2544 ztest_get_done, zgd); 2545 2546 if (error == 0) 2547 return (0); 2548 } 2549 } 2550 2551 ztest_get_done(zgd, error); 2552 2553 return (error); 2554 } 2555 2556 static void * 2557 ztest_lr_alloc(size_t lrsize, char *name) 2558 { 2559 char *lr; 2560 size_t namesize = name ? strlen(name) + 1 : 0; 2561 2562 lr = umem_zalloc(lrsize + namesize, UMEM_NOFAIL); 2563 2564 if (name) 2565 memcpy(lr + lrsize, name, namesize); 2566 2567 return (lr); 2568 } 2569 2570 static void 2571 ztest_lr_free(void *lr, size_t lrsize, char *name) 2572 { 2573 size_t namesize = name ? strlen(name) + 1 : 0; 2574 2575 umem_free(lr, lrsize + namesize); 2576 } 2577 2578 /* 2579 * Lookup a bunch of objects. Returns the number of objects not found. 
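 * The caller must hold zd_dirobj_lock. For each object that is found,
 * its type, blocksize, and generation are copied back into the template.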
2580 */ 2581 static int 2582 ztest_lookup(ztest_ds_t *zd, ztest_od_t *od, int count) 2583 { 2584 int missing = 0; 2585 int error; 2586 int i; 2587 2588 ASSERT(MUTEX_HELD(&zd->zd_dirobj_lock)); 2589 2590 for (i = 0; i < count; i++, od++) { 2591 od->od_object = 0; 2592 error = zap_lookup(zd->zd_os, od->od_dir, od->od_name, 2593 sizeof (uint64_t), 1, &od->od_object); 2594 if (error) { 2595 ASSERT3S(error, ==, ENOENT); 2596 ASSERT0(od->od_object); 2597 missing++; 2598 } else { 2599 dmu_buf_t *db; 2600 ztest_block_tag_t *bbt; 2601 dmu_object_info_t doi; 2602 2603 ASSERT3U(od->od_object, !=, 0); 2604 ASSERT0(missing); /* there should be no gaps */ 2605 2606 ztest_object_lock(zd, od->od_object, ZTRL_READER); 2607 VERIFY0(dmu_bonus_hold(zd->zd_os, od->od_object, 2608 FTAG, &db)); 2609 dmu_object_info_from_db(db, &doi); 2610 bbt = ztest_bt_bonus(db); 2611 ASSERT3U(bbt->bt_magic, ==, BT_MAGIC); 2612 od->od_type = doi.doi_type; 2613 od->od_blocksize = doi.doi_data_block_size; 2614 od->od_gen = bbt->bt_gen; 2615 dmu_buf_rele(db, FTAG); 2616 ztest_object_unlock(zd, od->od_object); 2617 } 2618 } 2619 2620 return (missing); 2621 } 2622 2623 static int 2624 ztest_create(ztest_ds_t *zd, ztest_od_t *od, int count) 2625 { 2626 int missing = 0; 2627 int i; 2628 2629 ASSERT(MUTEX_HELD(&zd->zd_dirobj_lock)); 2630 2631 for (i = 0; i < count; i++, od++) { 2632 if (missing) { 2633 od->od_object = 0; 2634 missing++; 2635 continue; 2636 } 2637 2638 lr_create_t *lr = ztest_lr_alloc(sizeof (*lr), od->od_name); 2639 2640 lr->lr_doid = od->od_dir; 2641 lr->lr_foid = 0; /* 0 to allocate, > 0 to claim */ 2642 lr->lrz_type = od->od_crtype; 2643 lr->lrz_blocksize = od->od_crblocksize; 2644 lr->lrz_ibshift = ztest_random_ibshift(); 2645 lr->lrz_bonustype = DMU_OT_UINT64_OTHER; 2646 lr->lrz_dnodesize = od->od_crdnodesize; 2647 lr->lr_gen = od->od_crgen; 2648 lr->lr_crtime[0] = time(NULL); 2649 2650 if (ztest_replay_create(zd, lr, B_FALSE) != 0) { 2651 ASSERT0(missing); 2652 od->od_object = 0; 2653 missing++; 2654 } else { 2655 od->od_object = lr->lr_foid; 2656 od->od_type = od->od_crtype; 2657 od->od_blocksize = od->od_crblocksize; 2658 od->od_gen = od->od_crgen; 2659 ASSERT3U(od->od_object, !=, 0); 2660 } 2661 2662 ztest_lr_free(lr, sizeof (*lr), od->od_name); 2663 } 2664 2665 return (missing); 2666 } 2667 2668 static int 2669 ztest_remove(ztest_ds_t *zd, ztest_od_t *od, int count) 2670 { 2671 int missing = 0; 2672 int error; 2673 int i; 2674 2675 ASSERT(MUTEX_HELD(&zd->zd_dirobj_lock)); 2676 2677 od += count - 1; 2678 2679 for (i = count - 1; i >= 0; i--, od--) { 2680 if (missing) { 2681 missing++; 2682 continue; 2683 } 2684 2685 /* 2686 * No object was found. 
2687 */ 2688 if (od->od_object == 0) 2689 continue; 2690 2691 lr_remove_t *lr = ztest_lr_alloc(sizeof (*lr), od->od_name); 2692 2693 lr->lr_doid = od->od_dir; 2694 2695 if ((error = ztest_replay_remove(zd, lr, B_FALSE)) != 0) { 2696 ASSERT3U(error, ==, ENOSPC); 2697 missing++; 2698 } else { 2699 od->od_object = 0; 2700 } 2701 ztest_lr_free(lr, sizeof (*lr), od->od_name); 2702 } 2703 2704 return (missing); 2705 } 2706 2707 static int 2708 ztest_write(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size, 2709 const void *data) 2710 { 2711 lr_write_t *lr; 2712 int error; 2713 2714 lr = ztest_lr_alloc(sizeof (*lr) + size, NULL); 2715 2716 lr->lr_foid = object; 2717 lr->lr_offset = offset; 2718 lr->lr_length = size; 2719 lr->lr_blkoff = 0; 2720 BP_ZERO(&lr->lr_blkptr); 2721 2722 memcpy(lr + 1, data, size); 2723 2724 error = ztest_replay_write(zd, lr, B_FALSE); 2725 2726 ztest_lr_free(lr, sizeof (*lr) + size, NULL); 2727 2728 return (error); 2729 } 2730 2731 static int 2732 ztest_truncate(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size) 2733 { 2734 lr_truncate_t *lr; 2735 int error; 2736 2737 lr = ztest_lr_alloc(sizeof (*lr), NULL); 2738 2739 lr->lr_foid = object; 2740 lr->lr_offset = offset; 2741 lr->lr_length = size; 2742 2743 error = ztest_replay_truncate(zd, lr, B_FALSE); 2744 2745 ztest_lr_free(lr, sizeof (*lr), NULL); 2746 2747 return (error); 2748 } 2749 2750 static int 2751 ztest_setattr(ztest_ds_t *zd, uint64_t object) 2752 { 2753 lr_setattr_t *lr; 2754 int error; 2755 2756 lr = ztest_lr_alloc(sizeof (*lr), NULL); 2757 2758 lr->lr_foid = object; 2759 lr->lr_size = 0; 2760 lr->lr_mode = 0; 2761 2762 error = ztest_replay_setattr(zd, lr, B_FALSE); 2763 2764 ztest_lr_free(lr, sizeof (*lr), NULL); 2765 2766 return (error); 2767 } 2768 2769 static void 2770 ztest_prealloc(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size) 2771 { 2772 objset_t *os = zd->zd_os; 2773 dmu_tx_t *tx; 2774 uint64_t txg; 2775 rl_t *rl; 2776 2777 txg_wait_synced(dmu_objset_pool(os), 0); 2778 2779 ztest_object_lock(zd, object, ZTRL_READER); 2780 rl = ztest_range_lock(zd, object, offset, size, ZTRL_WRITER); 2781 2782 tx = dmu_tx_create(os); 2783 2784 dmu_tx_hold_write(tx, object, offset, size); 2785 2786 txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); 2787 2788 if (txg != 0) { 2789 dmu_prealloc(os, object, offset, size, tx); 2790 dmu_tx_commit(tx); 2791 txg_wait_synced(dmu_objset_pool(os), txg); 2792 } else { 2793 (void) dmu_free_long_range(os, object, offset, size); 2794 } 2795 2796 ztest_range_unlock(rl); 2797 ztest_object_unlock(zd, object); 2798 } 2799 2800 static void 2801 ztest_io(ztest_ds_t *zd, uint64_t object, uint64_t offset) 2802 { 2803 int err; 2804 ztest_block_tag_t wbt; 2805 dmu_object_info_t doi; 2806 enum ztest_io_type io_type; 2807 uint64_t blocksize; 2808 void *data; 2809 2810 VERIFY0(dmu_object_info(zd->zd_os, object, &doi)); 2811 blocksize = doi.doi_data_block_size; 2812 data = umem_alloc(blocksize, UMEM_NOFAIL); 2813 2814 /* 2815 * Pick an i/o type at random, biased toward writing block tags. 
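 * Half of the time we force ZTEST_IO_WRITE_TAG so that block tags are
 * written frequently enough for later verification.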
2816 */ 2817 io_type = ztest_random(ZTEST_IO_TYPES); 2818 if (ztest_random(2) == 0) 2819 io_type = ZTEST_IO_WRITE_TAG; 2820 2821 (void) pthread_rwlock_rdlock(&zd->zd_zilog_lock); 2822 2823 switch (io_type) { 2824 2825 case ZTEST_IO_WRITE_TAG: 2826 ztest_bt_generate(&wbt, zd->zd_os, object, doi.doi_dnodesize, 2827 offset, 0, 0, 0); 2828 (void) ztest_write(zd, object, offset, sizeof (wbt), &wbt); 2829 break; 2830 2831 case ZTEST_IO_WRITE_PATTERN: 2832 (void) memset(data, 'a' + (object + offset) % 5, blocksize); 2833 if (ztest_random(2) == 0) { 2834 /* 2835 * Induce fletcher2 collisions to ensure that 2836 * zio_ddt_collision() detects and resolves them 2837 * when using fletcher2-verify for deduplication. 2838 */ 2839 ((uint64_t *)data)[0] ^= 1ULL << 63; 2840 ((uint64_t *)data)[4] ^= 1ULL << 63; 2841 } 2842 (void) ztest_write(zd, object, offset, blocksize, data); 2843 break; 2844 2845 case ZTEST_IO_WRITE_ZEROES: 2846 memset(data, 0, blocksize); 2847 (void) ztest_write(zd, object, offset, blocksize, data); 2848 break; 2849 2850 case ZTEST_IO_TRUNCATE: 2851 (void) ztest_truncate(zd, object, offset, blocksize); 2852 break; 2853 2854 case ZTEST_IO_SETATTR: 2855 (void) ztest_setattr(zd, object); 2856 break; 2857 default: 2858 break; 2859 2860 case ZTEST_IO_REWRITE: 2861 (void) pthread_rwlock_rdlock(&ztest_name_lock); 2862 err = ztest_dsl_prop_set_uint64(zd->zd_name, 2863 ZFS_PROP_CHECKSUM, spa_dedup_checksum(ztest_spa), 2864 B_FALSE); 2865 ASSERT(err == 0 || err == ENOSPC); 2866 err = ztest_dsl_prop_set_uint64(zd->zd_name, 2867 ZFS_PROP_COMPRESSION, 2868 ztest_random_dsl_prop(ZFS_PROP_COMPRESSION), 2869 B_FALSE); 2870 ASSERT(err == 0 || err == ENOSPC); 2871 (void) pthread_rwlock_unlock(&ztest_name_lock); 2872 2873 VERIFY0(dmu_read(zd->zd_os, object, offset, blocksize, data, 2874 DMU_READ_NO_PREFETCH)); 2875 2876 (void) ztest_write(zd, object, offset, blocksize, data); 2877 break; 2878 } 2879 2880 (void) pthread_rwlock_unlock(&zd->zd_zilog_lock); 2881 2882 umem_free(data, blocksize); 2883 } 2884 2885 /* 2886 * Initialize an object description template. 2887 */ 2888 static void 2889 ztest_od_init(ztest_od_t *od, uint64_t id, const char *tag, uint64_t index, 2890 dmu_object_type_t type, uint64_t blocksize, uint64_t dnodesize, 2891 uint64_t gen) 2892 { 2893 od->od_dir = ZTEST_DIROBJ; 2894 od->od_object = 0; 2895 2896 od->od_crtype = type; 2897 od->od_crblocksize = blocksize ? blocksize : ztest_random_blocksize(); 2898 od->od_crdnodesize = dnodesize ? dnodesize : ztest_random_dnodesize(); 2899 od->od_crgen = gen; 2900 2901 od->od_type = DMU_OT_NONE; 2902 od->od_blocksize = 0; 2903 od->od_gen = 0; 2904 2905 (void) snprintf(od->od_name, sizeof (od->od_name), 2906 "%s(%"PRId64")[%"PRIu64"]", 2907 tag, id, index); 2908 } 2909 2910 /* 2911 * Lookup or create the objects for a test using the od template. 2912 * If the objects do not all exist, or if 'remove' is specified, 2913 * remove any existing objects and create new ones. Otherwise, 2914 * use the existing objects. 
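 * Returns 0 on success, or -1 if the objects could not be created.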
2915 */ 2916 static int 2917 ztest_object_init(ztest_ds_t *zd, ztest_od_t *od, size_t size, boolean_t remove) 2918 { 2919 int count = size / sizeof (*od); 2920 int rv = 0; 2921 2922 mutex_enter(&zd->zd_dirobj_lock); 2923 if ((ztest_lookup(zd, od, count) != 0 || remove) && 2924 (ztest_remove(zd, od, count) != 0 || 2925 ztest_create(zd, od, count) != 0)) 2926 rv = -1; 2927 zd->zd_od = od; 2928 mutex_exit(&zd->zd_dirobj_lock); 2929 2930 return (rv); 2931 } 2932 2933 void 2934 ztest_zil_commit(ztest_ds_t *zd, uint64_t id) 2935 { 2936 (void) id; 2937 zilog_t *zilog = zd->zd_zilog; 2938 2939 (void) pthread_rwlock_rdlock(&zd->zd_zilog_lock); 2940 2941 zil_commit(zilog, ztest_random(ZTEST_OBJECTS)); 2942 2943 /* 2944 * Remember the committed values in zd, which is in parent/child 2945 * shared memory. If we die, the next iteration of ztest_run() 2946 * will verify that the log really does contain this record. 2947 */ 2948 mutex_enter(&zilog->zl_lock); 2949 ASSERT3P(zd->zd_shared, !=, NULL); 2950 ASSERT3U(zd->zd_shared->zd_seq, <=, zilog->zl_commit_lr_seq); 2951 zd->zd_shared->zd_seq = zilog->zl_commit_lr_seq; 2952 mutex_exit(&zilog->zl_lock); 2953 2954 (void) pthread_rwlock_unlock(&zd->zd_zilog_lock); 2955 } 2956 2957 /* 2958 * This function is designed to simulate the operations that occur during a 2959 * mount/unmount operation. We hold the dataset across these operations in an 2960 * attempt to expose any implicit assumptions about ZIL management. 2961 */ 2962 void 2963 ztest_zil_remount(ztest_ds_t *zd, uint64_t id) 2964 { 2965 (void) id; 2966 objset_t *os = zd->zd_os; 2967 2968 /* 2969 * We hold the ztest_vdev_lock so we don't cause problems with 2970 * other threads that wish to remove a log device, such as 2971 * ztest_device_removal(). 2972 */ 2973 mutex_enter(&ztest_vdev_lock); 2974 2975 /* 2976 * We grab the zd_dirobj_lock to ensure that no other thread is 2977 * updating the zil (i.e. adding in-memory log records) and the 2978 * zd_zilog_lock to block any I/O. 2979 */ 2980 mutex_enter(&zd->zd_dirobj_lock); 2981 (void) pthread_rwlock_wrlock(&zd->zd_zilog_lock); 2982 2983 /* zfsvfs_teardown() */ 2984 zil_close(zd->zd_zilog); 2985 2986 /* zfsvfs_setup() */ 2987 VERIFY3P(zil_open(os, ztest_get_data, NULL), ==, zd->zd_zilog); 2988 zil_replay(os, zd, ztest_replay_vector); 2989 2990 (void) pthread_rwlock_unlock(&zd->zd_zilog_lock); 2991 mutex_exit(&zd->zd_dirobj_lock); 2992 mutex_exit(&ztest_vdev_lock); 2993 } 2994 2995 /* 2996 * Verify that we can't destroy an active pool, create an existing pool, 2997 * or create a pool with a bad vdev spec. 2998 */ 2999 void 3000 ztest_spa_create_destroy(ztest_ds_t *zd, uint64_t id) 3001 { 3002 (void) zd, (void) id; 3003 ztest_shared_opts_t *zo = &ztest_opts; 3004 spa_t *spa; 3005 nvlist_t *nvroot; 3006 3007 if (zo->zo_mmp_test) 3008 return; 3009 3010 /* 3011 * Attempt to create using a bad file. 3012 */ 3013 nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, NULL, 0, 0, 1); 3014 VERIFY3U(ENOENT, ==, 3015 spa_create("ztest_bad_file", nvroot, NULL, NULL, NULL)); 3016 fnvlist_free(nvroot); 3017 3018 /* 3019 * Attempt to create using a bad mirror. 3020 */ 3021 nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, NULL, 0, 2, 1); 3022 VERIFY3U(ENOENT, ==, 3023 spa_create("ztest_bad_mirror", nvroot, NULL, NULL, NULL)); 3024 fnvlist_free(nvroot); 3025 3026 /* 3027 * Attempt to create an existing pool. It shouldn't matter 3028 * what's in the nvroot; we should fail with EEXIST. 
3029 */ 3030 (void) pthread_rwlock_rdlock(&ztest_name_lock); 3031 nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, NULL, 0, 0, 1); 3032 VERIFY3U(EEXIST, ==, 3033 spa_create(zo->zo_pool, nvroot, NULL, NULL, NULL)); 3034 fnvlist_free(nvroot); 3035 3036 /* 3037 * We open a reference to the spa and then we try to export it 3038 * expecting one of the following errors: 3039 * 3040 * EBUSY 3041 * Because of the reference we just opened. 3042 * 3043 * ZFS_ERR_EXPORT_IN_PROGRESS 3044 * For the case that there is another ztest thread doing 3045 * an export concurrently. 3046 */ 3047 VERIFY0(spa_open(zo->zo_pool, &spa, FTAG)); 3048 int error = spa_destroy(zo->zo_pool); 3049 if (error != EBUSY && error != ZFS_ERR_EXPORT_IN_PROGRESS) { 3050 fatal(B_FALSE, "spa_destroy(%s) returned unexpected value %d", 3051 spa->spa_name, error); 3052 } 3053 spa_close(spa, FTAG); 3054 3055 (void) pthread_rwlock_unlock(&ztest_name_lock); 3056 } 3057 3058 /* 3059 * Start and then stop the MMP threads to ensure the startup and shutdown code 3060 * works properly. Actual protection and property-related code tested via ZTS. 3061 */ 3062 void 3063 ztest_mmp_enable_disable(ztest_ds_t *zd, uint64_t id) 3064 { 3065 (void) zd, (void) id; 3066 ztest_shared_opts_t *zo = &ztest_opts; 3067 spa_t *spa = ztest_spa; 3068 3069 if (zo->zo_mmp_test) 3070 return; 3071 3072 /* 3073 * Since enabling MMP involves setting a property, it could not be done 3074 * while the pool is suspended. 3075 */ 3076 if (spa_suspended(spa)) 3077 return; 3078 3079 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 3080 mutex_enter(&spa->spa_props_lock); 3081 3082 zfs_multihost_fail_intervals = 0; 3083 3084 if (!spa_multihost(spa)) { 3085 spa->spa_multihost = B_TRUE; 3086 mmp_thread_start(spa); 3087 } 3088 3089 mutex_exit(&spa->spa_props_lock); 3090 spa_config_exit(spa, SCL_CONFIG, FTAG); 3091 3092 txg_wait_synced(spa_get_dsl(spa), 0); 3093 mmp_signal_all_threads(); 3094 txg_wait_synced(spa_get_dsl(spa), 0); 3095 3096 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 3097 mutex_enter(&spa->spa_props_lock); 3098 3099 if (spa_multihost(spa)) { 3100 mmp_thread_stop(spa); 3101 spa->spa_multihost = B_FALSE; 3102 } 3103 3104 mutex_exit(&spa->spa_props_lock); 3105 spa_config_exit(spa, SCL_CONFIG, FTAG); 3106 } 3107 3108 static int 3109 ztest_get_raidz_children(spa_t *spa) 3110 { 3111 (void) spa; 3112 vdev_t *raidvd; 3113 3114 ASSERT(MUTEX_HELD(&ztest_vdev_lock)); 3115 3116 if (ztest_opts.zo_raid_do_expand) { 3117 raidvd = ztest_spa->spa_root_vdev->vdev_child[0]; 3118 3119 ASSERT(raidvd->vdev_ops == &vdev_raidz_ops); 3120 3121 return (raidvd->vdev_children); 3122 } 3123 3124 return (ztest_opts.zo_raid_children); 3125 } 3126 3127 void 3128 ztest_spa_upgrade(ztest_ds_t *zd, uint64_t id) 3129 { 3130 (void) zd, (void) id; 3131 spa_t *spa; 3132 uint64_t initial_version = SPA_VERSION_INITIAL; 3133 uint64_t raidz_children, version, newversion; 3134 nvlist_t *nvroot, *props; 3135 char *name; 3136 3137 if (ztest_opts.zo_mmp_test) 3138 return; 3139 3140 /* dRAID added after feature flags, skip upgrade test. */ 3141 if (strcmp(ztest_opts.zo_raid_type, VDEV_TYPE_DRAID) == 0) 3142 return; 3143 3144 mutex_enter(&ztest_vdev_lock); 3145 name = kmem_asprintf("%s_upgrade", ztest_opts.zo_pool); 3146 3147 /* 3148 * Clean up from previous runs. 
3149 */ 3150 (void) spa_destroy(name); 3151 3152 raidz_children = ztest_get_raidz_children(ztest_spa); 3153 3154 nvroot = make_vdev_root(NULL, NULL, name, ztest_opts.zo_vdev_size, 0, 3155 NULL, raidz_children, ztest_opts.zo_mirrors, 1); 3156 3157 /* 3158 * If we're configuring a RAIDZ device then make sure that the 3159 * initial version is capable of supporting that feature. 3160 */ 3161 switch (ztest_opts.zo_raid_parity) { 3162 case 0: 3163 case 1: 3164 initial_version = SPA_VERSION_INITIAL; 3165 break; 3166 case 2: 3167 initial_version = SPA_VERSION_RAIDZ2; 3168 break; 3169 case 3: 3170 initial_version = SPA_VERSION_RAIDZ3; 3171 break; 3172 } 3173 3174 /* 3175 * Create a pool with a spa version that can be upgraded. Pick 3176 * a value between initial_version and SPA_VERSION_BEFORE_FEATURES. 3177 */ 3178 do { 3179 version = ztest_random_spa_version(initial_version); 3180 } while (version > SPA_VERSION_BEFORE_FEATURES); 3181 3182 props = fnvlist_alloc(); 3183 fnvlist_add_uint64(props, 3184 zpool_prop_to_name(ZPOOL_PROP_VERSION), version); 3185 VERIFY0(spa_create(name, nvroot, props, NULL, NULL)); 3186 fnvlist_free(nvroot); 3187 fnvlist_free(props); 3188 3189 VERIFY0(spa_open(name, &spa, FTAG)); 3190 VERIFY3U(spa_version(spa), ==, version); 3191 newversion = ztest_random_spa_version(version + 1); 3192 3193 if (ztest_opts.zo_verbose >= 4) { 3194 (void) printf("upgrading spa version from " 3195 "%"PRIu64" to %"PRIu64"\n", 3196 version, newversion); 3197 } 3198 3199 spa_upgrade(spa, newversion); 3200 VERIFY3U(spa_version(spa), >, version); 3201 VERIFY3U(spa_version(spa), ==, fnvlist_lookup_uint64(spa->spa_config, 3202 zpool_prop_to_name(ZPOOL_PROP_VERSION))); 3203 spa_close(spa, FTAG); 3204 3205 kmem_strfree(name); 3206 mutex_exit(&ztest_vdev_lock); 3207 } 3208 3209 static void 3210 ztest_spa_checkpoint(spa_t *spa) 3211 { 3212 ASSERT(MUTEX_HELD(&ztest_checkpoint_lock)); 3213 3214 int error = spa_checkpoint(spa->spa_name); 3215 3216 switch (error) { 3217 case 0: 3218 case ZFS_ERR_DEVRM_IN_PROGRESS: 3219 case ZFS_ERR_DISCARDING_CHECKPOINT: 3220 case ZFS_ERR_CHECKPOINT_EXISTS: 3221 case ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS: 3222 break; 3223 case ENOSPC: 3224 ztest_record_enospc(FTAG); 3225 break; 3226 default: 3227 fatal(B_FALSE, "spa_checkpoint(%s) = %d", spa->spa_name, error); 3228 } 3229 } 3230 3231 static void 3232 ztest_spa_discard_checkpoint(spa_t *spa) 3233 { 3234 ASSERT(MUTEX_HELD(&ztest_checkpoint_lock)); 3235 3236 int error = spa_checkpoint_discard(spa->spa_name); 3237 3238 switch (error) { 3239 case 0: 3240 case ZFS_ERR_DISCARDING_CHECKPOINT: 3241 case ZFS_ERR_NO_CHECKPOINT: 3242 break; 3243 default: 3244 fatal(B_FALSE, "spa_discard_checkpoint(%s) = %d", 3245 spa->spa_name, error); 3246 } 3247 3248 } 3249 3250 void 3251 ztest_spa_checkpoint_create_discard(ztest_ds_t *zd, uint64_t id) 3252 { 3253 (void) zd, (void) id; 3254 spa_t *spa = ztest_spa; 3255 3256 mutex_enter(&ztest_checkpoint_lock); 3257 if (ztest_random(2) == 0) { 3258 ztest_spa_checkpoint(spa); 3259 } else { 3260 ztest_spa_discard_checkpoint(spa); 3261 } 3262 mutex_exit(&ztest_checkpoint_lock); 3263 } 3264 3265 3266 static vdev_t * 3267 vdev_lookup_by_path(vdev_t *vd, const char *path) 3268 { 3269 vdev_t *mvd; 3270 int c; 3271 3272 if (vd->vdev_path != NULL && strcmp(path, vd->vdev_path) == 0) 3273 return (vd); 3274 3275 for (c = 0; c < vd->vdev_children; c++) 3276 if ((mvd = vdev_lookup_by_path(vd->vdev_child[c], path)) != 3277 NULL) 3278 return (mvd); 3279 3280 return (NULL); 3281 } 3282 3283 static int 3284 
spa_num_top_vdevs(spa_t *spa) 3285 { 3286 vdev_t *rvd = spa->spa_root_vdev; 3287 ASSERT3U(spa_config_held(spa, SCL_VDEV, RW_READER), ==, SCL_VDEV); 3288 return (rvd->vdev_children); 3289 } 3290 3291 /* 3292 * Verify that vdev_add() works as expected. 3293 */ 3294 void 3295 ztest_vdev_add_remove(ztest_ds_t *zd, uint64_t id) 3296 { 3297 (void) zd, (void) id; 3298 ztest_shared_t *zs = ztest_shared; 3299 spa_t *spa = ztest_spa; 3300 uint64_t leaves; 3301 uint64_t guid; 3302 uint64_t raidz_children; 3303 3304 nvlist_t *nvroot; 3305 int error; 3306 3307 if (ztest_opts.zo_mmp_test) 3308 return; 3309 3310 mutex_enter(&ztest_vdev_lock); 3311 raidz_children = ztest_get_raidz_children(spa); 3312 leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * raidz_children; 3313 3314 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 3315 3316 ztest_shared->zs_vdev_next_leaf = spa_num_top_vdevs(spa) * leaves; 3317 3318 /* 3319 * If we have slogs then remove them 1/4 of the time. 3320 */ 3321 if (spa_has_slogs(spa) && ztest_random(4) == 0) { 3322 metaslab_group_t *mg; 3323 3324 /* 3325 * find the first real slog in log allocation class 3326 */ 3327 mg = spa_log_class(spa)->mc_allocator[0].mca_rotor; 3328 while (!mg->mg_vd->vdev_islog) 3329 mg = mg->mg_next; 3330 3331 guid = mg->mg_vd->vdev_guid; 3332 3333 spa_config_exit(spa, SCL_VDEV, FTAG); 3334 3335 /* 3336 * We have to grab the zs_name_lock as writer to 3337 * prevent a race between removing a slog (dmu_objset_find) 3338 * and destroying a dataset. Removing the slog will 3339 * grab a reference on the dataset which may cause 3340 * dsl_destroy_head() to fail with EBUSY thus 3341 * leaving the dataset in an inconsistent state. 3342 */ 3343 pthread_rwlock_wrlock(&ztest_name_lock); 3344 error = spa_vdev_remove(spa, guid, B_FALSE); 3345 pthread_rwlock_unlock(&ztest_name_lock); 3346 3347 switch (error) { 3348 case 0: 3349 case EEXIST: /* Generic zil_reset() error */ 3350 case EBUSY: /* Replay required */ 3351 case EACCES: /* Crypto key not loaded */ 3352 case ZFS_ERR_CHECKPOINT_EXISTS: 3353 case ZFS_ERR_DISCARDING_CHECKPOINT: 3354 break; 3355 default: 3356 fatal(B_FALSE, "spa_vdev_remove() = %d", error); 3357 } 3358 } else { 3359 spa_config_exit(spa, SCL_VDEV, FTAG); 3360 3361 /* 3362 * Make 1/4 of the devices be log devices 3363 */ 3364 nvroot = make_vdev_root(NULL, NULL, NULL, 3365 ztest_opts.zo_vdev_size, 0, (ztest_random(4) == 0) ? 3366 "log" : NULL, raidz_children, zs->zs_mirrors, 3367 1); 3368 3369 error = spa_vdev_add(spa, nvroot, B_FALSE); 3370 fnvlist_free(nvroot); 3371 3372 switch (error) { 3373 case 0: 3374 break; 3375 case ENOSPC: 3376 ztest_record_enospc("spa_vdev_add"); 3377 break; 3378 default: 3379 fatal(B_FALSE, "spa_vdev_add() = %d", error); 3380 } 3381 } 3382 3383 mutex_exit(&ztest_vdev_lock); 3384 } 3385 3386 void 3387 ztest_vdev_class_add(ztest_ds_t *zd, uint64_t id) 3388 { 3389 (void) zd, (void) id; 3390 ztest_shared_t *zs = ztest_shared; 3391 spa_t *spa = ztest_spa; 3392 uint64_t leaves; 3393 nvlist_t *nvroot; 3394 uint64_t raidz_children; 3395 const char *class = (ztest_random(2) == 0) ? 
3396 VDEV_ALLOC_BIAS_SPECIAL : VDEV_ALLOC_BIAS_DEDUP; 3397 int error; 3398 3399 /* 3400 * By default add a special vdev 50% of the time 3401 */ 3402 if ((ztest_opts.zo_special_vdevs == ZTEST_VDEV_CLASS_OFF) || 3403 (ztest_opts.zo_special_vdevs == ZTEST_VDEV_CLASS_RND && 3404 ztest_random(2) == 0)) { 3405 return; 3406 } 3407 3408 mutex_enter(&ztest_vdev_lock); 3409 3410 /* Only test with mirrors */ 3411 if (zs->zs_mirrors < 2) { 3412 mutex_exit(&ztest_vdev_lock); 3413 return; 3414 } 3415 3416 /* requires feature@allocation_classes */ 3417 if (!spa_feature_is_enabled(spa, SPA_FEATURE_ALLOCATION_CLASSES)) { 3418 mutex_exit(&ztest_vdev_lock); 3419 return; 3420 } 3421 3422 raidz_children = ztest_get_raidz_children(spa); 3423 leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * raidz_children; 3424 3425 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 3426 ztest_shared->zs_vdev_next_leaf = spa_num_top_vdevs(spa) * leaves; 3427 spa_config_exit(spa, SCL_VDEV, FTAG); 3428 3429 nvroot = make_vdev_root(NULL, NULL, NULL, ztest_opts.zo_vdev_size, 0, 3430 class, raidz_children, zs->zs_mirrors, 1); 3431 3432 error = spa_vdev_add(spa, nvroot, B_FALSE); 3433 fnvlist_free(nvroot); 3434 3435 if (error == ENOSPC) 3436 ztest_record_enospc("spa_vdev_add"); 3437 else if (error != 0) 3438 fatal(B_FALSE, "spa_vdev_add() = %d", error); 3439 3440 /* 3441 * 50% of the time allow small blocks in the special class 3442 */ 3443 if (error == 0 && 3444 spa_special_class(spa)->mc_groups == 1 && ztest_random(2) == 0) { 3445 if (ztest_opts.zo_verbose >= 3) 3446 (void) printf("Enabling special VDEV small blocks\n"); 3447 error = ztest_dsl_prop_set_uint64(zd->zd_name, 3448 ZFS_PROP_SPECIAL_SMALL_BLOCKS, 32768, B_FALSE); 3449 ASSERT(error == 0 || error == ENOSPC); 3450 } 3451 3452 mutex_exit(&ztest_vdev_lock); 3453 3454 if (ztest_opts.zo_verbose >= 3) { 3455 metaslab_class_t *mc; 3456 3457 if (strcmp(class, VDEV_ALLOC_BIAS_SPECIAL) == 0) 3458 mc = spa_special_class(spa); 3459 else 3460 mc = spa_dedup_class(spa); 3461 (void) printf("Added a %s mirrored vdev (of %d)\n", 3462 class, (int)mc->mc_groups); 3463 } 3464 } 3465 3466 /* 3467 * Verify that adding/removing aux devices (l2arc, hot spare) works as expected. 3468 */ 3469 void 3470 ztest_vdev_aux_add_remove(ztest_ds_t *zd, uint64_t id) 3471 { 3472 (void) zd, (void) id; 3473 ztest_shared_t *zs = ztest_shared; 3474 spa_t *spa = ztest_spa; 3475 vdev_t *rvd = spa->spa_root_vdev; 3476 spa_aux_vdev_t *sav; 3477 const char *aux; 3478 char *path; 3479 uint64_t guid = 0; 3480 int error, ignore_err = 0; 3481 3482 if (ztest_opts.zo_mmp_test) 3483 return; 3484 3485 path = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); 3486 3487 if (ztest_random(2) == 0) { 3488 sav = &spa->spa_spares; 3489 aux = ZPOOL_CONFIG_SPARES; 3490 } else { 3491 sav = &spa->spa_l2cache; 3492 aux = ZPOOL_CONFIG_L2CACHE; 3493 } 3494 3495 mutex_enter(&ztest_vdev_lock); 3496 3497 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 3498 3499 if (sav->sav_count != 0 && ztest_random(4) == 0) { 3500 /* 3501 * Pick a random device to remove. 3502 */ 3503 vdev_t *svd = sav->sav_vdevs[ztest_random(sav->sav_count)]; 3504 3505 /* dRAID spares cannot be removed; try anyways to see ENOTSUP */ 3506 if (strstr(svd->vdev_path, VDEV_TYPE_DRAID) != NULL) 3507 ignore_err = ENOTSUP; 3508 3509 guid = svd->vdev_guid; 3510 } else { 3511 /* 3512 * Find an unused device we can add. 
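 * Probe successive candidate paths until we find one that is neither an
 * existing aux vdev of this type nor present in the main vdev tree.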
3513 */ 3514 zs->zs_vdev_aux = 0; 3515 for (;;) { 3516 int c; 3517 (void) snprintf(path, MAXPATHLEN, ztest_aux_template, 3518 ztest_opts.zo_dir, ztest_opts.zo_pool, aux, 3519 zs->zs_vdev_aux); 3520 for (c = 0; c < sav->sav_count; c++) 3521 if (strcmp(sav->sav_vdevs[c]->vdev_path, 3522 path) == 0) 3523 break; 3524 if (c == sav->sav_count && 3525 vdev_lookup_by_path(rvd, path) == NULL) 3526 break; 3527 zs->zs_vdev_aux++; 3528 } 3529 } 3530 3531 spa_config_exit(spa, SCL_VDEV, FTAG); 3532 3533 if (guid == 0) { 3534 /* 3535 * Add a new device. 3536 */ 3537 nvlist_t *nvroot = make_vdev_root(NULL, aux, NULL, 3538 (ztest_opts.zo_vdev_size * 5) / 4, 0, NULL, 0, 0, 1); 3539 error = spa_vdev_add(spa, nvroot, B_FALSE); 3540 3541 switch (error) { 3542 case 0: 3543 break; 3544 default: 3545 fatal(B_FALSE, "spa_vdev_add(%p) = %d", nvroot, error); 3546 } 3547 fnvlist_free(nvroot); 3548 } else { 3549 /* 3550 * Remove an existing device. Sometimes, dirty its 3551 * vdev state first to make sure we handle removal 3552 * of devices that have pending state changes. 3553 */ 3554 if (ztest_random(2) == 0) 3555 (void) vdev_online(spa, guid, 0, NULL); 3556 3557 error = spa_vdev_remove(spa, guid, B_FALSE); 3558 3559 switch (error) { 3560 case 0: 3561 case EBUSY: 3562 case ZFS_ERR_CHECKPOINT_EXISTS: 3563 case ZFS_ERR_DISCARDING_CHECKPOINT: 3564 break; 3565 default: 3566 if (error != ignore_err) 3567 fatal(B_FALSE, 3568 "spa_vdev_remove(%"PRIu64") = %d", 3569 guid, error); 3570 } 3571 } 3572 3573 mutex_exit(&ztest_vdev_lock); 3574 3575 umem_free(path, MAXPATHLEN); 3576 } 3577 3578 /* 3579 * split a pool if it has mirror tlvdevs 3580 */ 3581 void 3582 ztest_split_pool(ztest_ds_t *zd, uint64_t id) 3583 { 3584 (void) zd, (void) id; 3585 ztest_shared_t *zs = ztest_shared; 3586 spa_t *spa = ztest_spa; 3587 vdev_t *rvd = spa->spa_root_vdev; 3588 nvlist_t *tree, **child, *config, *split, **schild; 3589 uint_t c, children, schildren = 0, lastlogid = 0; 3590 int error = 0; 3591 3592 if (ztest_opts.zo_mmp_test) 3593 return; 3594 3595 mutex_enter(&ztest_vdev_lock); 3596 3597 /* ensure we have a usable config; mirrors of raidz aren't supported */ 3598 if (zs->zs_mirrors < 3 || ztest_opts.zo_raid_children > 1) { 3599 mutex_exit(&ztest_vdev_lock); 3600 return; 3601 } 3602 3603 /* clean up the old pool, if any */ 3604 (void) spa_destroy("splitp"); 3605 3606 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 3607 3608 /* generate a config from the existing config */ 3609 mutex_enter(&spa->spa_props_lock); 3610 tree = fnvlist_lookup_nvlist(spa->spa_config, ZPOOL_CONFIG_VDEV_TREE); 3611 mutex_exit(&spa->spa_props_lock); 3612 3613 VERIFY0(nvlist_lookup_nvlist_array(tree, ZPOOL_CONFIG_CHILDREN, 3614 &child, &children)); 3615 3616 schild = umem_alloc(rvd->vdev_children * sizeof (nvlist_t *), 3617 UMEM_NOFAIL); 3618 for (c = 0; c < children; c++) { 3619 vdev_t *tvd = rvd->vdev_child[c]; 3620 nvlist_t **mchild; 3621 uint_t mchildren; 3622 3623 if (tvd->vdev_islog || tvd->vdev_ops == &vdev_hole_ops) { 3624 schild[schildren] = fnvlist_alloc(); 3625 fnvlist_add_string(schild[schildren], 3626 ZPOOL_CONFIG_TYPE, VDEV_TYPE_HOLE); 3627 fnvlist_add_uint64(schild[schildren], 3628 ZPOOL_CONFIG_IS_HOLE, 1); 3629 if (lastlogid == 0) 3630 lastlogid = schildren; 3631 ++schildren; 3632 continue; 3633 } 3634 lastlogid = 0; 3635 VERIFY0(nvlist_lookup_nvlist_array(child[c], 3636 ZPOOL_CONFIG_CHILDREN, &mchild, &mchildren)); 3637 schild[schildren++] = fnvlist_dup(mchild[0]); 3638 } 3639 3640 /* OK, create a config that can be used to split */ 3641 split = 
fnvlist_alloc(); 3642 fnvlist_add_string(split, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT); 3643 fnvlist_add_nvlist_array(split, ZPOOL_CONFIG_CHILDREN, 3644 (const nvlist_t **)schild, lastlogid != 0 ? lastlogid : schildren); 3645 3646 config = fnvlist_alloc(); 3647 fnvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, split); 3648 3649 for (c = 0; c < schildren; c++) 3650 fnvlist_free(schild[c]); 3651 umem_free(schild, rvd->vdev_children * sizeof (nvlist_t *)); 3652 fnvlist_free(split); 3653 3654 spa_config_exit(spa, SCL_VDEV, FTAG); 3655 3656 (void) pthread_rwlock_wrlock(&ztest_name_lock); 3657 error = spa_vdev_split_mirror(spa, "splitp", config, NULL, B_FALSE); 3658 (void) pthread_rwlock_unlock(&ztest_name_lock); 3659 3660 fnvlist_free(config); 3661 3662 if (error == 0) { 3663 (void) printf("successful split - results:\n"); 3664 mutex_enter(&spa_namespace_lock); 3665 show_pool_stats(spa); 3666 show_pool_stats(spa_lookup("splitp")); 3667 mutex_exit(&spa_namespace_lock); 3668 ++zs->zs_splits; 3669 --zs->zs_mirrors; 3670 } 3671 mutex_exit(&ztest_vdev_lock); 3672 } 3673 3674 /* 3675 * Verify that we can attach and detach devices. 3676 */ 3677 void 3678 ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id) 3679 { 3680 (void) zd, (void) id; 3681 ztest_shared_t *zs = ztest_shared; 3682 spa_t *spa = ztest_spa; 3683 spa_aux_vdev_t *sav = &spa->spa_spares; 3684 vdev_t *rvd = spa->spa_root_vdev; 3685 vdev_t *oldvd, *newvd, *pvd; 3686 nvlist_t *root; 3687 uint64_t leaves; 3688 uint64_t leaf, top; 3689 uint64_t ashift = ztest_get_ashift(); 3690 uint64_t oldguid, pguid; 3691 uint64_t oldsize, newsize; 3692 uint64_t raidz_children; 3693 char *oldpath, *newpath; 3694 int replacing; 3695 int oldvd_has_siblings = B_FALSE; 3696 int newvd_is_spare = B_FALSE; 3697 int newvd_is_dspare = B_FALSE; 3698 int oldvd_is_log; 3699 int oldvd_is_special; 3700 int error, expected_error; 3701 3702 if (ztest_opts.zo_mmp_test) 3703 return; 3704 3705 oldpath = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); 3706 newpath = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); 3707 3708 mutex_enter(&ztest_vdev_lock); 3709 raidz_children = ztest_get_raidz_children(spa); 3710 leaves = MAX(zs->zs_mirrors, 1) * raidz_children; 3711 3712 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3713 3714 /* 3715 * If a vdev is in the process of being removed, its removal may 3716 * finish while we are in progress, leading to an unexpected error 3717 * value. Don't bother trying to attach while we are in the middle 3718 * of removal. 3719 */ 3720 if (ztest_device_removal_active) { 3721 spa_config_exit(spa, SCL_ALL, FTAG); 3722 goto out; 3723 } 3724 3725 /* 3726 * RAIDZ leaf VDEV mirrors are not currently supported while a 3727 * RAIDZ expansion is in progress. 3728 */ 3729 if (ztest_opts.zo_raid_do_expand) { 3730 spa_config_exit(spa, SCL_ALL, FTAG); 3731 goto out; 3732 } 3733 3734 /* 3735 * Decide whether to do an attach or a replace. 3736 */ 3737 replacing = ztest_random(2); 3738 3739 /* 3740 * Pick a random top-level vdev. 3741 */ 3742 top = ztest_random_vdev_top(spa, B_TRUE); 3743 3744 /* 3745 * Pick a random leaf within it. 3746 */ 3747 leaf = ztest_random(leaves); 3748 3749 /* 3750 * Locate this vdev. 
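 * Start at the chosen top-level vdev and descend through any mirror and
 * raidz layers to reach a leaf.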
3751 */ 3752 oldvd = rvd->vdev_child[top]; 3753 3754 /* pick a child from the mirror */ 3755 if (zs->zs_mirrors >= 1) { 3756 ASSERT3P(oldvd->vdev_ops, ==, &vdev_mirror_ops); 3757 ASSERT3U(oldvd->vdev_children, >=, zs->zs_mirrors); 3758 oldvd = oldvd->vdev_child[leaf / raidz_children]; 3759 } 3760 3761 /* pick a child out of the raidz group */ 3762 if (ztest_opts.zo_raid_children > 1) { 3763 if (strcmp(oldvd->vdev_ops->vdev_op_type, "raidz") == 0) 3764 ASSERT3P(oldvd->vdev_ops, ==, &vdev_raidz_ops); 3765 else 3766 ASSERT3P(oldvd->vdev_ops, ==, &vdev_draid_ops); 3767 oldvd = oldvd->vdev_child[leaf % raidz_children]; 3768 } 3769 3770 /* 3771 * If we're already doing an attach or replace, oldvd may be a 3772 * mirror vdev -- in which case, pick a random child. 3773 */ 3774 while (oldvd->vdev_children != 0) { 3775 oldvd_has_siblings = B_TRUE; 3776 ASSERT3U(oldvd->vdev_children, >=, 2); 3777 oldvd = oldvd->vdev_child[ztest_random(oldvd->vdev_children)]; 3778 } 3779 3780 oldguid = oldvd->vdev_guid; 3781 oldsize = vdev_get_min_asize(oldvd); 3782 oldvd_is_log = oldvd->vdev_top->vdev_islog; 3783 oldvd_is_special = 3784 oldvd->vdev_top->vdev_alloc_bias == VDEV_BIAS_SPECIAL || 3785 oldvd->vdev_top->vdev_alloc_bias == VDEV_BIAS_DEDUP; 3786 (void) strlcpy(oldpath, oldvd->vdev_path, MAXPATHLEN); 3787 pvd = oldvd->vdev_parent; 3788 pguid = pvd->vdev_guid; 3789 3790 /* 3791 * If oldvd has siblings, then half of the time, detach it. Prior 3792 * to the detach the pool is scrubbed in order to prevent creating 3793 * unrepairable blocks as a result of the data corruption injection. 3794 */ 3795 if (oldvd_has_siblings && ztest_random(2) == 0) { 3796 spa_config_exit(spa, SCL_ALL, FTAG); 3797 3798 error = ztest_scrub_impl(spa); 3799 if (error) 3800 goto out; 3801 3802 error = spa_vdev_detach(spa, oldguid, pguid, B_FALSE); 3803 if (error != 0 && error != ENODEV && error != EBUSY && 3804 error != ENOTSUP && error != ZFS_ERR_CHECKPOINT_EXISTS && 3805 error != ZFS_ERR_DISCARDING_CHECKPOINT) 3806 fatal(B_FALSE, "detach (%s) returned %d", 3807 oldpath, error); 3808 goto out; 3809 } 3810 3811 /* 3812 * For the new vdev, choose with equal probability between the two 3813 * standard paths (ending in either 'a' or 'b') or a random hot spare. 3814 */ 3815 if (sav->sav_count != 0 && ztest_random(3) == 0) { 3816 newvd = sav->sav_vdevs[ztest_random(sav->sav_count)]; 3817 newvd_is_spare = B_TRUE; 3818 3819 if (newvd->vdev_ops == &vdev_draid_spare_ops) 3820 newvd_is_dspare = B_TRUE; 3821 3822 (void) strlcpy(newpath, newvd->vdev_path, MAXPATHLEN); 3823 } else { 3824 (void) snprintf(newpath, MAXPATHLEN, ztest_dev_template, 3825 ztest_opts.zo_dir, ztest_opts.zo_pool, 3826 top * leaves + leaf); 3827 if (ztest_random(2) == 0) 3828 newpath[strlen(newpath) - 1] = 'b'; 3829 newvd = vdev_lookup_by_path(rvd, newpath); 3830 } 3831 3832 if (newvd) { 3833 /* 3834 * Reopen to ensure the vdev's asize field isn't stale. 3835 */ 3836 vdev_reopen(newvd); 3837 newsize = vdev_get_min_asize(newvd); 3838 } else { 3839 /* 3840 * Make newsize a little bigger or smaller than oldsize. 3841 * If it's smaller, the attach should fail. 3842 * If it's larger, and we're doing a replace, 3843 * we should get dynamic LUN growth when we're done. 3844 */ 3845 newsize = 10 * oldsize / (9 + ztest_random(3)); 3846 } 3847 3848 /* 3849 * If pvd is not a mirror or root, the attach should fail with ENOTSUP, 3850 * unless it's a replace; in that case any non-replacing parent is OK. 3851 * 3852 * If newvd is already part of the pool, it should fail with EBUSY. 
3853 * 3854 * If newvd is too small, it should fail with EOVERFLOW. 3855 * 3856 * If newvd is a distributed spare and it's being attached to a 3857 * dRAID which is not its parent it should fail with EINVAL. 3858 */ 3859 if (pvd->vdev_ops != &vdev_mirror_ops && 3860 pvd->vdev_ops != &vdev_root_ops && (!replacing || 3861 pvd->vdev_ops == &vdev_replacing_ops || 3862 pvd->vdev_ops == &vdev_spare_ops)) 3863 expected_error = ENOTSUP; 3864 else if (newvd_is_spare && 3865 (!replacing || oldvd_is_log || oldvd_is_special)) 3866 expected_error = ENOTSUP; 3867 else if (newvd == oldvd) 3868 expected_error = replacing ? 0 : EBUSY; 3869 else if (vdev_lookup_by_path(rvd, newpath) != NULL) 3870 expected_error = EBUSY; 3871 else if (!newvd_is_dspare && newsize < oldsize) 3872 expected_error = EOVERFLOW; 3873 else if (ashift > oldvd->vdev_top->vdev_ashift) 3874 expected_error = EDOM; 3875 else if (newvd_is_dspare && pvd != vdev_draid_spare_get_parent(newvd)) 3876 expected_error = EINVAL; 3877 else 3878 expected_error = 0; 3879 3880 spa_config_exit(spa, SCL_ALL, FTAG); 3881 3882 /* 3883 * Build the nvlist describing newpath. 3884 */ 3885 root = make_vdev_root(newpath, NULL, NULL, newvd == NULL ? newsize : 0, 3886 ashift, NULL, 0, 0, 1); 3887 3888 /* 3889 * When supported select either a healing or sequential resilver. 3890 */ 3891 boolean_t rebuilding = B_FALSE; 3892 if (pvd->vdev_ops == &vdev_mirror_ops || 3893 pvd->vdev_ops == &vdev_root_ops) { 3894 rebuilding = !!ztest_random(2); 3895 } 3896 3897 error = spa_vdev_attach(spa, oldguid, root, replacing, rebuilding); 3898 3899 fnvlist_free(root); 3900 3901 /* 3902 * If our parent was the replacing vdev, but the replace completed, 3903 * then instead of failing with ENOTSUP we may either succeed, 3904 * fail with ENODEV, or fail with EOVERFLOW. 3905 */ 3906 if (expected_error == ENOTSUP && 3907 (error == 0 || error == ENODEV || error == EOVERFLOW)) 3908 expected_error = error; 3909 3910 /* 3911 * If someone grew the LUN, the replacement may be too small. 
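 * Accept EOVERFLOW (or EBUSY) from the attach in that case.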
3912 */ 3913 if (error == EOVERFLOW || error == EBUSY) 3914 expected_error = error; 3915 3916 if (error == ZFS_ERR_CHECKPOINT_EXISTS || 3917 error == ZFS_ERR_DISCARDING_CHECKPOINT || 3918 error == ZFS_ERR_RESILVER_IN_PROGRESS || 3919 error == ZFS_ERR_REBUILD_IN_PROGRESS) 3920 expected_error = error; 3921 3922 if (error != expected_error && expected_error != EBUSY) { 3923 fatal(B_FALSE, "attach (%s %"PRIu64", %s %"PRIu64", %d) " 3924 "returned %d, expected %d", 3925 oldpath, oldsize, newpath, 3926 newsize, replacing, error, expected_error); 3927 } 3928 out: 3929 mutex_exit(&ztest_vdev_lock); 3930 3931 umem_free(oldpath, MAXPATHLEN); 3932 umem_free(newpath, MAXPATHLEN); 3933 } 3934 3935 static void 3936 raidz_scratch_verify(void) 3937 { 3938 spa_t *spa; 3939 uint64_t write_size, logical_size, offset; 3940 raidz_reflow_scratch_state_t state; 3941 vdev_raidz_expand_t *vre; 3942 vdev_t *raidvd; 3943 3944 ASSERT(raidz_expand_pause_point == RAIDZ_EXPAND_PAUSE_NONE); 3945 3946 if (ztest_scratch_state->zs_raidz_scratch_verify_pause == 0) 3947 return; 3948 3949 kernel_init(SPA_MODE_READ); 3950 3951 mutex_enter(&spa_namespace_lock); 3952 spa = spa_lookup(ztest_opts.zo_pool); 3953 ASSERT(spa); 3954 spa->spa_import_flags |= ZFS_IMPORT_SKIP_MMP; 3955 mutex_exit(&spa_namespace_lock); 3956 3957 VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG)); 3958 3959 ASSERT3U(RRSS_GET_OFFSET(&spa->spa_uberblock), !=, UINT64_MAX); 3960 3961 mutex_enter(&ztest_vdev_lock); 3962 3963 spa_config_enter(spa, SCL_ALL, FTAG, RW_READER); 3964 3965 vre = spa->spa_raidz_expand; 3966 if (vre == NULL) 3967 goto out; 3968 3969 raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); 3970 offset = RRSS_GET_OFFSET(&spa->spa_uberblock); 3971 state = RRSS_GET_STATE(&spa->spa_uberblock); 3972 write_size = P2ALIGN_TYPED(VDEV_BOOT_SIZE, 1 << raidvd->vdev_ashift, 3973 uint64_t); 3974 logical_size = write_size * raidvd->vdev_children; 3975 3976 switch (state) { 3977 /* 3978 * Initial state of reflow process. RAIDZ expansion was 3979 * requested by user, but scratch object was not created. 3980 */ 3981 case RRSS_SCRATCH_NOT_IN_USE: 3982 ASSERT3U(offset, ==, 0); 3983 break; 3984 3985 /* 3986 * Scratch object was synced and stored in boot area. 3987 */ 3988 case RRSS_SCRATCH_VALID: 3989 3990 /* 3991 * Scratch object was synced back to raidz start offset, 3992 * raidz is ready for sector by sector reflow process. 3993 */ 3994 case RRSS_SCRATCH_INVALID_SYNCED: 3995 3996 /* 3997 * Scratch object was synced back to raidz start offset 3998 * on zpool importing, raidz is ready for sector by sector 3999 * reflow process. 4000 */ 4001 case RRSS_SCRATCH_INVALID_SYNCED_ON_IMPORT: 4002 ASSERT3U(offset, ==, logical_size); 4003 break; 4004 4005 /* 4006 * Sector by sector reflow process started. 
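 * The recorded offset must have advanced to at least the logical size
 * of the scratch region.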
4007 */ 4008 case RRSS_SCRATCH_INVALID_SYNCED_REFLOW: 4009 ASSERT3U(offset, >=, logical_size); 4010 break; 4011 } 4012 4013 out: 4014 spa_config_exit(spa, SCL_ALL, FTAG); 4015 4016 mutex_exit(&ztest_vdev_lock); 4017 4018 ztest_scratch_state->zs_raidz_scratch_verify_pause = 0; 4019 4020 spa_close(spa, FTAG); 4021 kernel_fini(); 4022 } 4023 4024 static void 4025 ztest_scratch_thread(void *arg) 4026 { 4027 (void) arg; 4028 4029 /* wait up to 10 seconds */ 4030 for (int t = 100; t > 0; t -= 1) { 4031 if (raidz_expand_pause_point == RAIDZ_EXPAND_PAUSE_NONE) 4032 thread_exit(); 4033 4034 (void) poll(NULL, 0, 100); 4035 } 4036 4037 /* killed when the scratch area progress reached a certain point */ 4038 ztest_kill(ztest_shared); 4039 } 4040 4041 /* 4042 * Verify that we can attach raidz device. 4043 */ 4044 void 4045 ztest_vdev_raidz_attach(ztest_ds_t *zd, uint64_t id) 4046 { 4047 (void) zd, (void) id; 4048 ztest_shared_t *zs = ztest_shared; 4049 spa_t *spa = ztest_spa; 4050 uint64_t leaves, raidz_children, newsize, ashift = ztest_get_ashift(); 4051 kthread_t *scratch_thread = NULL; 4052 vdev_t *newvd, *pvd; 4053 nvlist_t *root; 4054 char *newpath = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); 4055 int error, expected_error = 0; 4056 4057 mutex_enter(&ztest_vdev_lock); 4058 4059 spa_config_enter(spa, SCL_ALL, FTAG, RW_READER); 4060 4061 /* Only allow attach when raid-kind = 'eraidz' */ 4062 if (!ztest_opts.zo_raid_do_expand) { 4063 spa_config_exit(spa, SCL_ALL, FTAG); 4064 goto out; 4065 } 4066 4067 if (ztest_opts.zo_mmp_test) { 4068 spa_config_exit(spa, SCL_ALL, FTAG); 4069 goto out; 4070 } 4071 4072 if (ztest_device_removal_active) { 4073 spa_config_exit(spa, SCL_ALL, FTAG); 4074 goto out; 4075 } 4076 4077 pvd = vdev_lookup_top(spa, 0); 4078 4079 ASSERT(pvd->vdev_ops == &vdev_raidz_ops); 4080 4081 /* 4082 * Get size of a child of the raidz group, 4083 * make sure device is a bit bigger 4084 */ 4085 newvd = pvd->vdev_child[ztest_random(pvd->vdev_children)]; 4086 newsize = 10 * vdev_get_min_asize(newvd) / (9 + ztest_random(2)); 4087 4088 /* 4089 * Get next attached leaf id 4090 */ 4091 raidz_children = ztest_get_raidz_children(spa); 4092 leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * raidz_children; 4093 zs->zs_vdev_next_leaf = spa_num_top_vdevs(spa) * leaves; 4094 4095 if (spa->spa_raidz_expand) 4096 expected_error = ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS; 4097 4098 spa_config_exit(spa, SCL_ALL, FTAG); 4099 4100 /* 4101 * Path to vdev to be attached 4102 */ 4103 (void) snprintf(newpath, MAXPATHLEN, ztest_dev_template, 4104 ztest_opts.zo_dir, ztest_opts.zo_pool, zs->zs_vdev_next_leaf); 4105 4106 /* 4107 * Build the nvlist describing newpath. 4108 */ 4109 root = make_vdev_root(newpath, NULL, NULL, newsize, ashift, NULL, 4110 0, 0, 1); 4111 4112 /* 4113 * 50% of the time, set raidz_expand_pause_point to cause 4114 * raidz_reflow_scratch_sync() to pause at a certain point and 4115 * then kill the test after 10 seconds so raidz_scratch_verify() 4116 * can confirm consistency when the pool is imported. 
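	 * The pause point is picked at random from the defined scratch
	 * pause states, and ztest_scratch_thread() performs the delayed
	 * kill.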
	 */
	if (ztest_random(2) == 0 && expected_error == 0) {
		raidz_expand_pause_point =
		    ztest_random(RAIDZ_EXPAND_PAUSE_SCRATCH_POST_REFLOW_2) + 1;
		scratch_thread = thread_create(NULL, 0, ztest_scratch_thread,
		    ztest_shared, 0, NULL, TS_RUN | TS_JOINABLE, defclsyspri);
	}

	error = spa_vdev_attach(spa, pvd->vdev_guid, root, B_FALSE, B_FALSE);

	nvlist_free(root);

	if (error == EOVERFLOW || error == ENXIO ||
	    error == ZFS_ERR_CHECKPOINT_EXISTS ||
	    error == ZFS_ERR_DISCARDING_CHECKPOINT)
		expected_error = error;

	if (error != 0 && error != expected_error) {
		fatal(0, "raidz attach (%s %"PRIu64") returned %d, expected %d",
		    newpath, newsize, error, expected_error);
	}

	if (raidz_expand_pause_point) {
		if (error != 0) {
			/*
			 * Do not verify the scratch object if the vdev
			 * attach itself failed.
			 */
			raidz_expand_pause_point = RAIDZ_EXPAND_PAUSE_NONE;
		}

		VERIFY0(thread_join(scratch_thread));
	}
out:
	mutex_exit(&ztest_vdev_lock);

	umem_free(newpath, MAXPATHLEN);
}

void
ztest_device_removal(ztest_ds_t *zd, uint64_t id)
{
	(void) zd, (void) id;
	spa_t *spa = ztest_spa;
	vdev_t *vd;
	uint64_t guid;
	int error;

	mutex_enter(&ztest_vdev_lock);

	if (ztest_device_removal_active) {
		mutex_exit(&ztest_vdev_lock);
		return;
	}

	/*
	 * Remove a random top-level vdev and wait for removal to finish.
	 */
	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
	vd = vdev_lookup_top(spa, ztest_random_vdev_top(spa, B_FALSE));
	guid = vd->vdev_guid;
	spa_config_exit(spa, SCL_VDEV, FTAG);

	error = spa_vdev_remove(spa, guid, B_FALSE);
	if (error == 0) {
		ztest_device_removal_active = B_TRUE;
		mutex_exit(&ztest_vdev_lock);

		/*
		 * spa->spa_vdev_removal is created in a sync task that
		 * is initiated via dsl_sync_task_nowait(). Since the
		 * task may not run before spa_vdev_remove() returns, we
		 * must wait at least 1 txg to ensure that the removal
		 * struct has been created.
		 */
		txg_wait_synced(spa_get_dsl(spa), 0);

		while (spa->spa_removing_phys.sr_state == DSS_SCANNING)
			txg_wait_synced(spa_get_dsl(spa), 0);
	} else {
		mutex_exit(&ztest_vdev_lock);
		return;
	}

	/*
	 * The pool needs to be scrubbed after completing device removal.
	 * Failure to do so may result in checksum errors due to the
	 * strategy employed by ztest_fault_inject() when selecting which
	 * offsets are redundant and can be damaged.
	 */
	error = spa_scan(spa, POOL_SCAN_SCRUB);
	if (error == 0) {
		while (dsl_scan_scrubbing(spa_get_dsl(spa)))
			txg_wait_synced(spa_get_dsl(spa), 0);
	}

	mutex_enter(&ztest_vdev_lock);
	ztest_device_removal_active = B_FALSE;
	mutex_exit(&ztest_vdev_lock);
}

/*
 * Callback function which expands the physical size of the vdev.
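 * Returns NULL on success. If the vdev's backing file cannot be opened,
 * the vdev itself is returned, which terminates the tree walk.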
4220 */ 4221 static vdev_t * 4222 grow_vdev(vdev_t *vd, void *arg) 4223 { 4224 spa_t *spa __maybe_unused = vd->vdev_spa; 4225 size_t *newsize = arg; 4226 size_t fsize; 4227 int fd; 4228 4229 ASSERT3S(spa_config_held(spa, SCL_STATE, RW_READER), ==, SCL_STATE); 4230 ASSERT(vd->vdev_ops->vdev_op_leaf); 4231 4232 if ((fd = open(vd->vdev_path, O_RDWR)) == -1) 4233 return (vd); 4234 4235 fsize = lseek(fd, 0, SEEK_END); 4236 VERIFY0(ftruncate(fd, *newsize)); 4237 4238 if (ztest_opts.zo_verbose >= 6) { 4239 (void) printf("%s grew from %lu to %lu bytes\n", 4240 vd->vdev_path, (ulong_t)fsize, (ulong_t)*newsize); 4241 } 4242 (void) close(fd); 4243 return (NULL); 4244 } 4245 4246 /* 4247 * Callback function which expands a given vdev by calling vdev_online(). 4248 */ 4249 static vdev_t * 4250 online_vdev(vdev_t *vd, void *arg) 4251 { 4252 (void) arg; 4253 spa_t *spa = vd->vdev_spa; 4254 vdev_t *tvd = vd->vdev_top; 4255 uint64_t guid = vd->vdev_guid; 4256 uint64_t generation = spa->spa_config_generation + 1; 4257 vdev_state_t newstate = VDEV_STATE_UNKNOWN; 4258 int error; 4259 4260 ASSERT3S(spa_config_held(spa, SCL_STATE, RW_READER), ==, SCL_STATE); 4261 ASSERT(vd->vdev_ops->vdev_op_leaf); 4262 4263 /* Calling vdev_online will initialize the new metaslabs */ 4264 spa_config_exit(spa, SCL_STATE, spa); 4265 error = vdev_online(spa, guid, ZFS_ONLINE_EXPAND, &newstate); 4266 spa_config_enter(spa, SCL_STATE, spa, RW_READER); 4267 4268 /* 4269 * If vdev_online returned an error or the underlying vdev_open 4270 * failed then we abort the expand. The only way to know that 4271 * vdev_open fails is by checking the returned newstate. 4272 */ 4273 if (error || newstate != VDEV_STATE_HEALTHY) { 4274 if (ztest_opts.zo_verbose >= 5) { 4275 (void) printf("Unable to expand vdev, state %u, " 4276 "error %d\n", newstate, error); 4277 } 4278 return (vd); 4279 } 4280 ASSERT3U(newstate, ==, VDEV_STATE_HEALTHY); 4281 4282 /* 4283 * Since we dropped the lock we need to ensure that we're 4284 * still talking to the original vdev. It's possible this 4285 * vdev may have been detached/replaced while we were 4286 * trying to online it. 4287 */ 4288 if (generation != spa->spa_config_generation) { 4289 if (ztest_opts.zo_verbose >= 5) { 4290 (void) printf("vdev configuration has changed, " 4291 "guid %"PRIu64", state %"PRIu64", " 4292 "expected gen %"PRIu64", got gen %"PRIu64"\n", 4293 guid, 4294 tvd->vdev_state, 4295 generation, 4296 spa->spa_config_generation); 4297 } 4298 return (vd); 4299 } 4300 return (NULL); 4301 } 4302 4303 /* 4304 * Traverse the vdev tree calling the supplied function. 4305 * We continue to walk the tree until we either have walked all 4306 * children or we receive a non-NULL return from the callback. 4307 * If a NULL callback is passed, then we just return back the first 4308 * leaf vdev we encounter. 4309 */ 4310 static vdev_t * 4311 vdev_walk_tree(vdev_t *vd, vdev_t *(*func)(vdev_t *, void *), void *arg) 4312 { 4313 uint_t c; 4314 4315 if (vd->vdev_ops->vdev_op_leaf) { 4316 if (func == NULL) 4317 return (vd); 4318 else 4319 return (func(vd, arg)); 4320 } 4321 4322 for (c = 0; c < vd->vdev_children; c++) { 4323 vdev_t *cvd = vd->vdev_child[c]; 4324 if ((cvd = vdev_walk_tree(cvd, func, arg)) != NULL) 4325 return (cvd); 4326 } 4327 return (NULL); 4328 } 4329 4330 /* 4331 * Verify that dynamic LUN growth works as expected. 
 */
void
ztest_vdev_LUN_growth(ztest_ds_t *zd, uint64_t id)
{
	(void) zd, (void) id;
	spa_t *spa = ztest_spa;
	vdev_t *vd, *tvd;
	metaslab_class_t *mc;
	metaslab_group_t *mg;
	size_t psize, newsize;
	uint64_t top;
	uint64_t old_class_space, new_class_space, old_ms_count, new_ms_count;

	mutex_enter(&ztest_checkpoint_lock);
	mutex_enter(&ztest_vdev_lock);
	spa_config_enter(spa, SCL_STATE, spa, RW_READER);

	/*
	 * If there is a vdev removal in progress, it could complete while
	 * we are running, in which case we would not be able to verify
	 * that the metaslab_class space increased (because it decreases
	 * when the device removal completes).
	 */
	if (ztest_device_removal_active) {
		spa_config_exit(spa, SCL_STATE, spa);
		mutex_exit(&ztest_vdev_lock);
		mutex_exit(&ztest_checkpoint_lock);
		return;
	}

	/*
	 * If a raidz expansion is in progress, the test can fail because
	 * the metaslab count will not increase immediately after the vdev
	 * is expanded; that only happens once the expansion completes.
	 */
	if (spa->spa_raidz_expand) {
		spa_config_exit(spa, SCL_STATE, spa);
		mutex_exit(&ztest_vdev_lock);
		mutex_exit(&ztest_checkpoint_lock);
		return;
	}

	top = ztest_random_vdev_top(spa, B_TRUE);

	tvd = spa->spa_root_vdev->vdev_child[top];
	mg = tvd->vdev_mg;
	mc = mg->mg_class;
	old_ms_count = tvd->vdev_ms_count;
	old_class_space = metaslab_class_get_space(mc);

	/*
	 * Determine the size of the first leaf vdev associated with
	 * our top-level device.
	 */
	vd = vdev_walk_tree(tvd, NULL, NULL);
	ASSERT3P(vd, !=, NULL);
	ASSERT(vd->vdev_ops->vdev_op_leaf);

	psize = vd->vdev_psize;

	/*
	 * We only try to expand the vdev if it's healthy, less than 4x its
	 * original size, and it has a valid psize.
	 */
	if (tvd->vdev_state != VDEV_STATE_HEALTHY ||
	    psize == 0 || psize >= 4 * ztest_opts.zo_vdev_size) {
		spa_config_exit(spa, SCL_STATE, spa);
		mutex_exit(&ztest_vdev_lock);
		mutex_exit(&ztest_checkpoint_lock);
		return;
	}
	ASSERT3U(psize, >, 0);
	newsize = psize + MAX(psize / 8, SPA_MAXBLOCKSIZE);
	ASSERT3U(newsize, >, psize);

	if (ztest_opts.zo_verbose >= 6) {
		(void) printf("Expanding LUN %s from %lu to %lu\n",
		    vd->vdev_path, (ulong_t)psize, (ulong_t)newsize);
	}

	/*
	 * Growing the vdev is a two step process:
	 *	1) expand the physical size (i.e. relabel)
	 *	2) online the vdev to create the new metaslabs
	 */
	if (vdev_walk_tree(tvd, grow_vdev, &newsize) != NULL ||
	    vdev_walk_tree(tvd, online_vdev, NULL) != NULL ||
	    tvd->vdev_state != VDEV_STATE_HEALTHY) {
		if (ztest_opts.zo_verbose >= 5) {
			(void) printf("Could not expand LUN because "
			    "the vdev configuration changed.\n");
		}
		spa_config_exit(spa, SCL_STATE, spa);
		mutex_exit(&ztest_vdev_lock);
		mutex_exit(&ztest_checkpoint_lock);
		return;
	}

	spa_config_exit(spa, SCL_STATE, spa);

	/*
	 * Expanding the LUN will update the config asynchronously,
	 * thus we must wait for the async thread to complete any
	 * pending tasks before proceeding.
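	 * The async work is considered complete once there is no async
	 * thread running and no async tasks pending; between checks we
	 * sync a txg and sleep briefly.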
4436 */ 4437 for (;;) { 4438 boolean_t done; 4439 mutex_enter(&spa->spa_async_lock); 4440 done = (spa->spa_async_thread == NULL && !spa->spa_async_tasks); 4441 mutex_exit(&spa->spa_async_lock); 4442 if (done) 4443 break; 4444 txg_wait_synced(spa_get_dsl(spa), 0); 4445 (void) poll(NULL, 0, 100); 4446 } 4447 4448 spa_config_enter(spa, SCL_STATE, spa, RW_READER); 4449 4450 tvd = spa->spa_root_vdev->vdev_child[top]; 4451 new_ms_count = tvd->vdev_ms_count; 4452 new_class_space = metaslab_class_get_space(mc); 4453 4454 if (tvd->vdev_mg != mg || mg->mg_class != mc) { 4455 if (ztest_opts.zo_verbose >= 5) { 4456 (void) printf("Could not verify LUN expansion due to " 4457 "intervening vdev offline or remove.\n"); 4458 } 4459 spa_config_exit(spa, SCL_STATE, spa); 4460 mutex_exit(&ztest_vdev_lock); 4461 mutex_exit(&ztest_checkpoint_lock); 4462 return; 4463 } 4464 4465 /* 4466 * Make sure we were able to grow the vdev. 4467 */ 4468 if (new_ms_count <= old_ms_count) { 4469 fatal(B_FALSE, 4470 "LUN expansion failed: ms_count %"PRIu64" < %"PRIu64"\n", 4471 old_ms_count, new_ms_count); 4472 } 4473 4474 /* 4475 * Make sure we were able to grow the pool. 4476 */ 4477 if (new_class_space <= old_class_space) { 4478 fatal(B_FALSE, 4479 "LUN expansion failed: class_space %"PRIu64" < %"PRIu64"\n", 4480 old_class_space, new_class_space); 4481 } 4482 4483 if (ztest_opts.zo_verbose >= 5) { 4484 char oldnumbuf[NN_NUMBUF_SZ], newnumbuf[NN_NUMBUF_SZ]; 4485 4486 nicenum(old_class_space, oldnumbuf, sizeof (oldnumbuf)); 4487 nicenum(new_class_space, newnumbuf, sizeof (newnumbuf)); 4488 (void) printf("%s grew from %s to %s\n", 4489 spa->spa_name, oldnumbuf, newnumbuf); 4490 } 4491 4492 spa_config_exit(spa, SCL_STATE, spa); 4493 mutex_exit(&ztest_vdev_lock); 4494 mutex_exit(&ztest_checkpoint_lock); 4495 } 4496 4497 /* 4498 * Verify that dmu_objset_{create,destroy,open,close} work as expected. 4499 */ 4500 static void 4501 ztest_objset_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx) 4502 { 4503 (void) arg, (void) cr; 4504 4505 /* 4506 * Create the objects common to all ztest datasets. 4507 */ 4508 VERIFY0(zap_create_claim(os, ZTEST_DIROBJ, 4509 DMU_OT_ZAP_OTHER, DMU_OT_NONE, 0, tx)); 4510 } 4511 4512 static int 4513 ztest_dataset_create(char *dsname) 4514 { 4515 int err; 4516 uint64_t rand; 4517 dsl_crypto_params_t *dcp = NULL; 4518 4519 /* 4520 * 50% of the time, we create encrypted datasets 4521 * using a random cipher suite and a hard-coded 4522 * wrapping key. 4523 */ 4524 rand = ztest_random(2); 4525 if (rand != 0) { 4526 nvlist_t *crypto_args = fnvlist_alloc(); 4527 nvlist_t *props = fnvlist_alloc(); 4528 4529 /* slight bias towards the default cipher suite */ 4530 rand = ztest_random(ZIO_CRYPT_FUNCTIONS); 4531 if (rand < ZIO_CRYPT_AES_128_CCM) 4532 rand = ZIO_CRYPT_ON; 4533 4534 fnvlist_add_uint64(props, 4535 zfs_prop_to_name(ZFS_PROP_ENCRYPTION), rand); 4536 fnvlist_add_uint8_array(crypto_args, "wkeydata", 4537 (uint8_t *)ztest_wkeydata, WRAPPING_KEY_LEN); 4538 4539 /* 4540 * These parameters aren't really used by the kernel. They 4541 * are simply stored so that userspace knows how to load 4542 * the wrapping key. 
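		 * A raw wrapping key needs no derivation, so the PBKDF2
		 * salt and iteration count are simply stored as zero.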
4543 */ 4544 fnvlist_add_uint64(props, 4545 zfs_prop_to_name(ZFS_PROP_KEYFORMAT), ZFS_KEYFORMAT_RAW); 4546 fnvlist_add_string(props, 4547 zfs_prop_to_name(ZFS_PROP_KEYLOCATION), "prompt"); 4548 fnvlist_add_uint64(props, 4549 zfs_prop_to_name(ZFS_PROP_PBKDF2_SALT), 0ULL); 4550 fnvlist_add_uint64(props, 4551 zfs_prop_to_name(ZFS_PROP_PBKDF2_ITERS), 0ULL); 4552 4553 VERIFY0(dsl_crypto_params_create_nvlist(DCP_CMD_NONE, props, 4554 crypto_args, &dcp)); 4555 4556 /* 4557 * Cycle through all available encryption implementations 4558 * to verify interoperability. 4559 */ 4560 VERIFY0(gcm_impl_set("cycle")); 4561 VERIFY0(aes_impl_set("cycle")); 4562 4563 fnvlist_free(crypto_args); 4564 fnvlist_free(props); 4565 } 4566 4567 err = dmu_objset_create(dsname, DMU_OST_OTHER, 0, dcp, 4568 ztest_objset_create_cb, NULL); 4569 dsl_crypto_params_free(dcp, !!err); 4570 4571 rand = ztest_random(100); 4572 if (err || rand < 80) 4573 return (err); 4574 4575 if (ztest_opts.zo_verbose >= 5) 4576 (void) printf("Setting dataset %s to sync always\n", dsname); 4577 return (ztest_dsl_prop_set_uint64(dsname, ZFS_PROP_SYNC, 4578 ZFS_SYNC_ALWAYS, B_FALSE)); 4579 } 4580 4581 static int 4582 ztest_objset_destroy_cb(const char *name, void *arg) 4583 { 4584 (void) arg; 4585 objset_t *os; 4586 dmu_object_info_t doi; 4587 int error; 4588 4589 /* 4590 * Verify that the dataset contains a directory object. 4591 */ 4592 VERIFY0(ztest_dmu_objset_own(name, DMU_OST_OTHER, B_TRUE, 4593 B_TRUE, FTAG, &os)); 4594 error = dmu_object_info(os, ZTEST_DIROBJ, &doi); 4595 if (error != ENOENT) { 4596 /* We could have crashed in the middle of destroying it */ 4597 ASSERT0(error); 4598 ASSERT3U(doi.doi_type, ==, DMU_OT_ZAP_OTHER); 4599 ASSERT3S(doi.doi_physical_blocks_512, >=, 0); 4600 } 4601 dmu_objset_disown(os, B_TRUE, FTAG); 4602 4603 /* 4604 * Destroy the dataset. 4605 */ 4606 if (strchr(name, '@') != NULL) { 4607 error = dsl_destroy_snapshot(name, B_TRUE); 4608 if (error != ECHRNG) { 4609 /* 4610 * The program was executed, but encountered a runtime 4611 * error, such as insufficient slop, or a hold on the 4612 * dataset. 
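			 * Such channel program runtime errors are reported
			 * as ECHRNG, which is why that error is tolerated
			 * here; any other error is unexpected.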
4613 */ 4614 ASSERT0(error); 4615 } 4616 } else { 4617 error = dsl_destroy_head(name); 4618 if (error == ENOSPC) { 4619 /* There could be checkpoint or insufficient slop */ 4620 ztest_record_enospc(FTAG); 4621 } else if (error != EBUSY) { 4622 /* There could be a hold on this dataset */ 4623 ASSERT0(error); 4624 } 4625 } 4626 return (0); 4627 } 4628 4629 static boolean_t 4630 ztest_snapshot_create(char *osname, uint64_t id) 4631 { 4632 char snapname[ZFS_MAX_DATASET_NAME_LEN]; 4633 int error; 4634 4635 (void) snprintf(snapname, sizeof (snapname), "%"PRIu64"", id); 4636 4637 error = dmu_objset_snapshot_one(osname, snapname); 4638 if (error == ENOSPC) { 4639 ztest_record_enospc(FTAG); 4640 return (B_FALSE); 4641 } 4642 if (error != 0 && error != EEXIST && error != ECHRNG) { 4643 fatal(B_FALSE, "ztest_snapshot_create(%s@%s) = %d", osname, 4644 snapname, error); 4645 } 4646 return (B_TRUE); 4647 } 4648 4649 static boolean_t 4650 ztest_snapshot_destroy(char *osname, uint64_t id) 4651 { 4652 char snapname[ZFS_MAX_DATASET_NAME_LEN]; 4653 int error; 4654 4655 (void) snprintf(snapname, sizeof (snapname), "%s@%"PRIu64"", 4656 osname, id); 4657 4658 error = dsl_destroy_snapshot(snapname, B_FALSE); 4659 if (error != 0 && error != ENOENT && error != ECHRNG) 4660 fatal(B_FALSE, "ztest_snapshot_destroy(%s) = %d", 4661 snapname, error); 4662 return (B_TRUE); 4663 } 4664 4665 void 4666 ztest_dmu_objset_create_destroy(ztest_ds_t *zd, uint64_t id) 4667 { 4668 (void) zd; 4669 ztest_ds_t *zdtmp; 4670 int iters; 4671 int error; 4672 objset_t *os, *os2; 4673 char name[ZFS_MAX_DATASET_NAME_LEN]; 4674 zilog_t *zilog; 4675 int i; 4676 4677 zdtmp = umem_alloc(sizeof (ztest_ds_t), UMEM_NOFAIL); 4678 4679 (void) pthread_rwlock_rdlock(&ztest_name_lock); 4680 4681 (void) snprintf(name, sizeof (name), "%s/temp_%"PRIu64"", 4682 ztest_opts.zo_pool, id); 4683 4684 /* 4685 * If this dataset exists from a previous run, process its replay log 4686 * half of the time. If we don't replay it, then dsl_destroy_head() 4687 * (invoked from ztest_objset_destroy_cb()) should just throw it away. 4688 */ 4689 if (ztest_random(2) == 0 && 4690 ztest_dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, 4691 B_TRUE, FTAG, &os) == 0) { 4692 ztest_zd_init(zdtmp, NULL, os); 4693 zil_replay(os, zdtmp, ztest_replay_vector); 4694 ztest_zd_fini(zdtmp); 4695 dmu_objset_disown(os, B_TRUE, FTAG); 4696 } 4697 4698 /* 4699 * There may be an old instance of the dataset we're about to 4700 * create lying around from a previous run. If so, destroy it 4701 * and all of its snapshots. 4702 */ 4703 (void) dmu_objset_find(name, ztest_objset_destroy_cb, NULL, 4704 DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS); 4705 4706 /* 4707 * Verify that the destroyed dataset is no longer in the namespace. 4708 * It may still be present if the destroy above fails with ENOSPC. 4709 */ 4710 error = ztest_dmu_objset_own(name, DMU_OST_OTHER, B_TRUE, B_TRUE, 4711 FTAG, &os); 4712 if (error == 0) { 4713 dmu_objset_disown(os, B_TRUE, FTAG); 4714 ztest_record_enospc(FTAG); 4715 goto out; 4716 } 4717 VERIFY3U(ENOENT, ==, error); 4718 4719 /* 4720 * Verify that we can create a new dataset. 4721 */ 4722 error = ztest_dataset_create(name); 4723 if (error) { 4724 if (error == ENOSPC) { 4725 ztest_record_enospc(FTAG); 4726 goto out; 4727 } 4728 fatal(B_FALSE, "dmu_objset_create(%s) = %d", name, error); 4729 } 4730 4731 VERIFY0(ztest_dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, B_TRUE, 4732 FTAG, &os)); 4733 4734 ztest_zd_init(zdtmp, NULL, os); 4735 4736 /* 4737 * Open the intent log for it. 
4738 */ 4739 zilog = zil_open(os, ztest_get_data, NULL); 4740 4741 /* 4742 * Put some objects in there, do a little I/O to them, 4743 * and randomly take a couple of snapshots along the way. 4744 */ 4745 iters = ztest_random(5); 4746 for (i = 0; i < iters; i++) { 4747 ztest_dmu_object_alloc_free(zdtmp, id); 4748 if (ztest_random(iters) == 0) 4749 (void) ztest_snapshot_create(name, i); 4750 } 4751 4752 /* 4753 * Verify that we cannot create an existing dataset. 4754 */ 4755 VERIFY3U(EEXIST, ==, 4756 dmu_objset_create(name, DMU_OST_OTHER, 0, NULL, NULL, NULL)); 4757 4758 /* 4759 * Verify that we can hold an objset that is also owned. 4760 */ 4761 VERIFY0(dmu_objset_hold(name, FTAG, &os2)); 4762 dmu_objset_rele(os2, FTAG); 4763 4764 /* 4765 * Verify that we cannot own an objset that is already owned. 4766 */ 4767 VERIFY3U(EBUSY, ==, ztest_dmu_objset_own(name, DMU_OST_OTHER, 4768 B_FALSE, B_TRUE, FTAG, &os2)); 4769 4770 zil_close(zilog); 4771 dmu_objset_disown(os, B_TRUE, FTAG); 4772 ztest_zd_fini(zdtmp); 4773 out: 4774 (void) pthread_rwlock_unlock(&ztest_name_lock); 4775 4776 umem_free(zdtmp, sizeof (ztest_ds_t)); 4777 } 4778 4779 /* 4780 * Verify that dmu_snapshot_{create,destroy,open,close} work as expected. 4781 */ 4782 void 4783 ztest_dmu_snapshot_create_destroy(ztest_ds_t *zd, uint64_t id) 4784 { 4785 (void) pthread_rwlock_rdlock(&ztest_name_lock); 4786 (void) ztest_snapshot_destroy(zd->zd_name, id); 4787 (void) ztest_snapshot_create(zd->zd_name, id); 4788 (void) pthread_rwlock_unlock(&ztest_name_lock); 4789 } 4790 4791 /* 4792 * Cleanup non-standard snapshots and clones. 4793 */ 4794 static void 4795 ztest_dsl_dataset_cleanup(char *osname, uint64_t id) 4796 { 4797 char *snap1name; 4798 char *clone1name; 4799 char *snap2name; 4800 char *clone2name; 4801 char *snap3name; 4802 int error; 4803 4804 snap1name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4805 clone1name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4806 snap2name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4807 clone2name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4808 snap3name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4809 4810 (void) snprintf(snap1name, ZFS_MAX_DATASET_NAME_LEN, "%s@s1_%"PRIu64"", 4811 osname, id); 4812 (void) snprintf(clone1name, ZFS_MAX_DATASET_NAME_LEN, "%s/c1_%"PRIu64"", 4813 osname, id); 4814 (void) snprintf(snap2name, ZFS_MAX_DATASET_NAME_LEN, "%s@s2_%"PRIu64"", 4815 clone1name, id); 4816 (void) snprintf(clone2name, ZFS_MAX_DATASET_NAME_LEN, "%s/c2_%"PRIu64"", 4817 osname, id); 4818 (void) snprintf(snap3name, ZFS_MAX_DATASET_NAME_LEN, "%s@s3_%"PRIu64"", 4819 clone1name, id); 4820 4821 error = dsl_destroy_head(clone2name); 4822 if (error && error != ENOENT) 4823 fatal(B_FALSE, "dsl_destroy_head(%s) = %d", clone2name, error); 4824 error = dsl_destroy_snapshot(snap3name, B_FALSE); 4825 if (error && error != ENOENT) 4826 fatal(B_FALSE, "dsl_destroy_snapshot(%s) = %d", 4827 snap3name, error); 4828 error = dsl_destroy_snapshot(snap2name, B_FALSE); 4829 if (error && error != ENOENT) 4830 fatal(B_FALSE, "dsl_destroy_snapshot(%s) = %d", 4831 snap2name, error); 4832 error = dsl_destroy_head(clone1name); 4833 if (error && error != ENOENT) 4834 fatal(B_FALSE, "dsl_destroy_head(%s) = %d", clone1name, error); 4835 error = dsl_destroy_snapshot(snap1name, B_FALSE); 4836 if (error && error != ENOENT) 4837 fatal(B_FALSE, "dsl_destroy_snapshot(%s) = %d", 4838 snap1name, error); 4839 4840 umem_free(snap1name, ZFS_MAX_DATASET_NAME_LEN); 4841 umem_free(clone1name, 
ZFS_MAX_DATASET_NAME_LEN); 4842 umem_free(snap2name, ZFS_MAX_DATASET_NAME_LEN); 4843 umem_free(clone2name, ZFS_MAX_DATASET_NAME_LEN); 4844 umem_free(snap3name, ZFS_MAX_DATASET_NAME_LEN); 4845 } 4846 4847 /* 4848 * Verify dsl_dataset_promote handles EBUSY 4849 */ 4850 void 4851 ztest_dsl_dataset_promote_busy(ztest_ds_t *zd, uint64_t id) 4852 { 4853 objset_t *os; 4854 char *snap1name; 4855 char *clone1name; 4856 char *snap2name; 4857 char *clone2name; 4858 char *snap3name; 4859 char *osname = zd->zd_name; 4860 int error; 4861 4862 snap1name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4863 clone1name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4864 snap2name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4865 clone2name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4866 snap3name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4867 4868 (void) pthread_rwlock_rdlock(&ztest_name_lock); 4869 4870 ztest_dsl_dataset_cleanup(osname, id); 4871 4872 (void) snprintf(snap1name, ZFS_MAX_DATASET_NAME_LEN, "%s@s1_%"PRIu64"", 4873 osname, id); 4874 (void) snprintf(clone1name, ZFS_MAX_DATASET_NAME_LEN, "%s/c1_%"PRIu64"", 4875 osname, id); 4876 (void) snprintf(snap2name, ZFS_MAX_DATASET_NAME_LEN, "%s@s2_%"PRIu64"", 4877 clone1name, id); 4878 (void) snprintf(clone2name, ZFS_MAX_DATASET_NAME_LEN, "%s/c2_%"PRIu64"", 4879 osname, id); 4880 (void) snprintf(snap3name, ZFS_MAX_DATASET_NAME_LEN, "%s@s3_%"PRIu64"", 4881 clone1name, id); 4882 4883 error = dmu_objset_snapshot_one(osname, strchr(snap1name, '@') + 1); 4884 if (error && error != EEXIST) { 4885 if (error == ENOSPC) { 4886 ztest_record_enospc(FTAG); 4887 goto out; 4888 } 4889 fatal(B_FALSE, "dmu_take_snapshot(%s) = %d", snap1name, error); 4890 } 4891 4892 error = dmu_objset_clone(clone1name, snap1name); 4893 if (error) { 4894 if (error == ENOSPC) { 4895 ztest_record_enospc(FTAG); 4896 goto out; 4897 } 4898 fatal(B_FALSE, "dmu_objset_create(%s) = %d", clone1name, error); 4899 } 4900 4901 error = dmu_objset_snapshot_one(clone1name, strchr(snap2name, '@') + 1); 4902 if (error && error != EEXIST) { 4903 if (error == ENOSPC) { 4904 ztest_record_enospc(FTAG); 4905 goto out; 4906 } 4907 fatal(B_FALSE, "dmu_open_snapshot(%s) = %d", snap2name, error); 4908 } 4909 4910 error = dmu_objset_snapshot_one(clone1name, strchr(snap3name, '@') + 1); 4911 if (error && error != EEXIST) { 4912 if (error == ENOSPC) { 4913 ztest_record_enospc(FTAG); 4914 goto out; 4915 } 4916 fatal(B_FALSE, "dmu_open_snapshot(%s) = %d", snap3name, error); 4917 } 4918 4919 error = dmu_objset_clone(clone2name, snap3name); 4920 if (error) { 4921 if (error == ENOSPC) { 4922 ztest_record_enospc(FTAG); 4923 goto out; 4924 } 4925 fatal(B_FALSE, "dmu_objset_create(%s) = %d", clone2name, error); 4926 } 4927 4928 error = ztest_dmu_objset_own(snap2name, DMU_OST_ANY, B_TRUE, B_TRUE, 4929 FTAG, &os); 4930 if (error) 4931 fatal(B_FALSE, "dmu_objset_own(%s) = %d", snap2name, error); 4932 error = dsl_dataset_promote(clone2name, NULL); 4933 if (error == ENOSPC) { 4934 dmu_objset_disown(os, B_TRUE, FTAG); 4935 ztest_record_enospc(FTAG); 4936 goto out; 4937 } 4938 if (error != EBUSY) 4939 fatal(B_FALSE, "dsl_dataset_promote(%s), %d, not EBUSY", 4940 clone2name, error); 4941 dmu_objset_disown(os, B_TRUE, FTAG); 4942 4943 out: 4944 ztest_dsl_dataset_cleanup(osname, id); 4945 4946 (void) pthread_rwlock_unlock(&ztest_name_lock); 4947 4948 umem_free(snap1name, ZFS_MAX_DATASET_NAME_LEN); 4949 umem_free(clone1name, ZFS_MAX_DATASET_NAME_LEN); 4950 umem_free(snap2name, 
ZFS_MAX_DATASET_NAME_LEN); 4951 umem_free(clone2name, ZFS_MAX_DATASET_NAME_LEN); 4952 umem_free(snap3name, ZFS_MAX_DATASET_NAME_LEN); 4953 } 4954 4955 #undef OD_ARRAY_SIZE 4956 #define OD_ARRAY_SIZE 4 4957 4958 /* 4959 * Verify that dmu_object_{alloc,free} work as expected. 4960 */ 4961 void 4962 ztest_dmu_object_alloc_free(ztest_ds_t *zd, uint64_t id) 4963 { 4964 ztest_od_t *od; 4965 int batchsize; 4966 int size; 4967 int b; 4968 4969 size = sizeof (ztest_od_t) * OD_ARRAY_SIZE; 4970 od = umem_alloc(size, UMEM_NOFAIL); 4971 batchsize = OD_ARRAY_SIZE; 4972 4973 for (b = 0; b < batchsize; b++) 4974 ztest_od_init(od + b, id, FTAG, b, DMU_OT_UINT64_OTHER, 4975 0, 0, 0); 4976 4977 /* 4978 * Destroy the previous batch of objects, create a new batch, 4979 * and do some I/O on the new objects. 4980 */ 4981 if (ztest_object_init(zd, od, size, B_TRUE) != 0) { 4982 zd->zd_od = NULL; 4983 umem_free(od, size); 4984 return; 4985 } 4986 4987 while (ztest_random(4 * batchsize) != 0) 4988 ztest_io(zd, od[ztest_random(batchsize)].od_object, 4989 ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT); 4990 4991 umem_free(od, size); 4992 } 4993 4994 /* 4995 * Rewind the global allocator to verify object allocation backfilling. 4996 */ 4997 void 4998 ztest_dmu_object_next_chunk(ztest_ds_t *zd, uint64_t id) 4999 { 5000 (void) id; 5001 objset_t *os = zd->zd_os; 5002 uint_t dnodes_per_chunk = 1 << dmu_object_alloc_chunk_shift; 5003 uint64_t object; 5004 5005 /* 5006 * Rewind the global allocator randomly back to a lower object number 5007 * to force backfilling and reclamation of recently freed dnodes. 5008 */ 5009 mutex_enter(&os->os_obj_lock); 5010 object = ztest_random(os->os_obj_next_chunk); 5011 os->os_obj_next_chunk = P2ALIGN_TYPED(object, dnodes_per_chunk, 5012 uint64_t); 5013 mutex_exit(&os->os_obj_lock); 5014 } 5015 5016 #undef OD_ARRAY_SIZE 5017 #define OD_ARRAY_SIZE 2 5018 5019 /* 5020 * Verify that dmu_{read,write} work as expected. 5021 */ 5022 void 5023 ztest_dmu_read_write(ztest_ds_t *zd, uint64_t id) 5024 { 5025 int size; 5026 ztest_od_t *od; 5027 5028 objset_t *os = zd->zd_os; 5029 size = sizeof (ztest_od_t) * OD_ARRAY_SIZE; 5030 od = umem_alloc(size, UMEM_NOFAIL); 5031 dmu_tx_t *tx; 5032 int freeit, error; 5033 uint64_t i, n, s, txg; 5034 bufwad_t *packbuf, *bigbuf, *pack, *bigH, *bigT; 5035 uint64_t packobj, packoff, packsize, bigobj, bigoff, bigsize; 5036 uint64_t chunksize = (1000 + ztest_random(1000)) * sizeof (uint64_t); 5037 uint64_t regions = 997; 5038 uint64_t stride = 123456789ULL; 5039 uint64_t width = 40; 5040 int free_percent = 5; 5041 5042 /* 5043 * This test uses two objects, packobj and bigobj, that are always 5044 * updated together (i.e. in the same tx) so that their contents are 5045 * in sync and can be compared. Their contents relate to each other 5046 * in a simple way: packobj is a dense array of 'bufwad' structures, 5047 * while bigobj is a sparse array of the same bufwads. Specifically, 5048 * for any index n, there are three bufwads that should be identical: 5049 * 5050 * packobj, at offset n * sizeof (bufwad_t) 5051 * bigobj, at the head of the nth chunk 5052 * bigobj, at the tail of the nth chunk 5053 * 5054 * The chunk size is arbitrary. It doesn't have to be a power of two, 5055 * and it doesn't have any relation to the object blocksize. 5056 * The only requirement is that it can hold at least two bufwads. 5057 * 5058 * Normally, we write the bufwad to each of these locations. 
5059 * However, free_percent of the time we instead write zeroes to 5060 * packobj and perform a dmu_free_range() on bigobj. By comparing 5061 * bigobj to packobj, we can verify that the DMU is correctly 5062 * tracking which parts of an object are allocated and free, 5063 * and that the contents of the allocated blocks are correct. 5064 */ 5065 5066 /* 5067 * Read the directory info. If it's the first time, set things up. 5068 */ 5069 ztest_od_init(od, id, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0, chunksize); 5070 ztest_od_init(od + 1, id, FTAG, 1, DMU_OT_UINT64_OTHER, 0, 0, 5071 chunksize); 5072 5073 if (ztest_object_init(zd, od, size, B_FALSE) != 0) { 5074 umem_free(od, size); 5075 return; 5076 } 5077 5078 bigobj = od[0].od_object; 5079 packobj = od[1].od_object; 5080 chunksize = od[0].od_gen; 5081 ASSERT3U(chunksize, ==, od[1].od_gen); 5082 5083 /* 5084 * Prefetch a random chunk of the big object. 5085 * Our aim here is to get some async reads in flight 5086 * for blocks that we may free below; the DMU should 5087 * handle this race correctly. 5088 */ 5089 n = ztest_random(regions) * stride + ztest_random(width); 5090 s = 1 + ztest_random(2 * width - 1); 5091 dmu_prefetch(os, bigobj, 0, n * chunksize, s * chunksize, 5092 ZIO_PRIORITY_SYNC_READ); 5093 5094 /* 5095 * Pick a random index and compute the offsets into packobj and bigobj. 5096 */ 5097 n = ztest_random(regions) * stride + ztest_random(width); 5098 s = 1 + ztest_random(width - 1); 5099 5100 packoff = n * sizeof (bufwad_t); 5101 packsize = s * sizeof (bufwad_t); 5102 5103 bigoff = n * chunksize; 5104 bigsize = s * chunksize; 5105 5106 packbuf = umem_alloc(packsize, UMEM_NOFAIL); 5107 bigbuf = umem_alloc(bigsize, UMEM_NOFAIL); 5108 5109 /* 5110 * free_percent of the time, free a range of bigobj rather than 5111 * overwriting it. 5112 */ 5113 freeit = (ztest_random(100) < free_percent); 5114 5115 /* 5116 * Read the current contents of our objects. 5117 */ 5118 error = dmu_read(os, packobj, packoff, packsize, packbuf, 5119 DMU_READ_PREFETCH); 5120 ASSERT0(error); 5121 error = dmu_read(os, bigobj, bigoff, bigsize, bigbuf, 5122 DMU_READ_PREFETCH); 5123 ASSERT0(error); 5124 5125 /* 5126 * Get a tx for the mods to both packobj and bigobj. 5127 */ 5128 tx = dmu_tx_create(os); 5129 5130 dmu_tx_hold_write(tx, packobj, packoff, packsize); 5131 5132 if (freeit) 5133 dmu_tx_hold_free(tx, bigobj, bigoff, bigsize); 5134 else 5135 dmu_tx_hold_write(tx, bigobj, bigoff, bigsize); 5136 5137 /* This accounts for setting the checksum/compression. */ 5138 dmu_tx_hold_bonus(tx, bigobj); 5139 5140 txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); 5141 if (txg == 0) { 5142 umem_free(packbuf, packsize); 5143 umem_free(bigbuf, bigsize); 5144 umem_free(od, size); 5145 return; 5146 } 5147 5148 enum zio_checksum cksum; 5149 do { 5150 cksum = (enum zio_checksum) 5151 ztest_random_dsl_prop(ZFS_PROP_CHECKSUM); 5152 } while (cksum >= ZIO_CHECKSUM_LEGACY_FUNCTIONS); 5153 dmu_object_set_checksum(os, bigobj, cksum, tx); 5154 5155 enum zio_compress comp; 5156 do { 5157 comp = (enum zio_compress) 5158 ztest_random_dsl_prop(ZFS_PROP_COMPRESSION); 5159 } while (comp >= ZIO_COMPRESS_LEGACY_FUNCTIONS); 5160 dmu_object_set_compress(os, bigobj, comp, tx); 5161 5162 /* 5163 * For each index from n to n + s, verify that the existing bufwad 5164 * in packobj matches the bufwads at the head and tail of the 5165 * corresponding chunk in bigobj. Then update all three bufwads 5166 * with the new values we want to write out. 
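	 * Each bufwad records its index, the txg of the write, and a random
	 * data word, which is what makes stale or misplaced data detectable.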
5167 */ 5168 for (i = 0; i < s; i++) { 5169 /* LINTED */ 5170 pack = (bufwad_t *)((char *)packbuf + i * sizeof (bufwad_t)); 5171 /* LINTED */ 5172 bigH = (bufwad_t *)((char *)bigbuf + i * chunksize); 5173 /* LINTED */ 5174 bigT = (bufwad_t *)((char *)bigH + chunksize) - 1; 5175 5176 ASSERT3U((uintptr_t)bigH - (uintptr_t)bigbuf, <, bigsize); 5177 ASSERT3U((uintptr_t)bigT - (uintptr_t)bigbuf, <, bigsize); 5178 5179 if (pack->bw_txg > txg) 5180 fatal(B_FALSE, 5181 "future leak: got %"PRIx64", open txg is %"PRIx64"", 5182 pack->bw_txg, txg); 5183 5184 if (pack->bw_data != 0 && pack->bw_index != n + i) 5185 fatal(B_FALSE, "wrong index: " 5186 "got %"PRIx64", wanted %"PRIx64"+%"PRIx64"", 5187 pack->bw_index, n, i); 5188 5189 if (memcmp(pack, bigH, sizeof (bufwad_t)) != 0) 5190 fatal(B_FALSE, "pack/bigH mismatch in %p/%p", 5191 pack, bigH); 5192 5193 if (memcmp(pack, bigT, sizeof (bufwad_t)) != 0) 5194 fatal(B_FALSE, "pack/bigT mismatch in %p/%p", 5195 pack, bigT); 5196 5197 if (freeit) { 5198 memset(pack, 0, sizeof (bufwad_t)); 5199 } else { 5200 pack->bw_index = n + i; 5201 pack->bw_txg = txg; 5202 pack->bw_data = 1 + ztest_random(-2ULL); 5203 } 5204 *bigH = *pack; 5205 *bigT = *pack; 5206 } 5207 5208 /* 5209 * We've verified all the old bufwads, and made new ones. 5210 * Now write them out. 5211 */ 5212 dmu_write(os, packobj, packoff, packsize, packbuf, tx); 5213 5214 if (freeit) { 5215 if (ztest_opts.zo_verbose >= 7) { 5216 (void) printf("freeing offset %"PRIx64" size %"PRIx64"" 5217 " txg %"PRIx64"\n", 5218 bigoff, bigsize, txg); 5219 } 5220 VERIFY0(dmu_free_range(os, bigobj, bigoff, bigsize, tx)); 5221 } else { 5222 if (ztest_opts.zo_verbose >= 7) { 5223 (void) printf("writing offset %"PRIx64" size %"PRIx64"" 5224 " txg %"PRIx64"\n", 5225 bigoff, bigsize, txg); 5226 } 5227 dmu_write(os, bigobj, bigoff, bigsize, bigbuf, tx); 5228 } 5229 5230 dmu_tx_commit(tx); 5231 5232 /* 5233 * Sanity check the stuff we just wrote. 5234 */ 5235 { 5236 void *packcheck = umem_alloc(packsize, UMEM_NOFAIL); 5237 void *bigcheck = umem_alloc(bigsize, UMEM_NOFAIL); 5238 5239 VERIFY0(dmu_read(os, packobj, packoff, 5240 packsize, packcheck, DMU_READ_PREFETCH)); 5241 VERIFY0(dmu_read(os, bigobj, bigoff, 5242 bigsize, bigcheck, DMU_READ_PREFETCH)); 5243 5244 ASSERT0(memcmp(packbuf, packcheck, packsize)); 5245 ASSERT0(memcmp(bigbuf, bigcheck, bigsize)); 5246 5247 umem_free(packcheck, packsize); 5248 umem_free(bigcheck, bigsize); 5249 } 5250 5251 umem_free(packbuf, packsize); 5252 umem_free(bigbuf, bigsize); 5253 umem_free(od, size); 5254 } 5255 5256 static void 5257 compare_and_update_pbbufs(uint64_t s, bufwad_t *packbuf, bufwad_t *bigbuf, 5258 uint64_t bigsize, uint64_t n, uint64_t chunksize, uint64_t txg) 5259 { 5260 uint64_t i; 5261 bufwad_t *pack; 5262 bufwad_t *bigH; 5263 bufwad_t *bigT; 5264 5265 /* 5266 * For each index from n to n + s, verify that the existing bufwad 5267 * in packobj matches the bufwads at the head and tail of the 5268 * corresponding chunk in bigobj. Then update all three bufwads 5269 * with the new values we want to write out. 
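	 * This mirrors the verification loop in ztest_dmu_read_write(),
	 * except that this path never frees the bigobj range.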
5270 */ 5271 for (i = 0; i < s; i++) { 5272 /* LINTED */ 5273 pack = (bufwad_t *)((char *)packbuf + i * sizeof (bufwad_t)); 5274 /* LINTED */ 5275 bigH = (bufwad_t *)((char *)bigbuf + i * chunksize); 5276 /* LINTED */ 5277 bigT = (bufwad_t *)((char *)bigH + chunksize) - 1; 5278 5279 ASSERT3U((uintptr_t)bigH - (uintptr_t)bigbuf, <, bigsize); 5280 ASSERT3U((uintptr_t)bigT - (uintptr_t)bigbuf, <, bigsize); 5281 5282 if (pack->bw_txg > txg) 5283 fatal(B_FALSE, 5284 "future leak: got %"PRIx64", open txg is %"PRIx64"", 5285 pack->bw_txg, txg); 5286 5287 if (pack->bw_data != 0 && pack->bw_index != n + i) 5288 fatal(B_FALSE, "wrong index: " 5289 "got %"PRIx64", wanted %"PRIx64"+%"PRIx64"", 5290 pack->bw_index, n, i); 5291 5292 if (memcmp(pack, bigH, sizeof (bufwad_t)) != 0) 5293 fatal(B_FALSE, "pack/bigH mismatch in %p/%p", 5294 pack, bigH); 5295 5296 if (memcmp(pack, bigT, sizeof (bufwad_t)) != 0) 5297 fatal(B_FALSE, "pack/bigT mismatch in %p/%p", 5298 pack, bigT); 5299 5300 pack->bw_index = n + i; 5301 pack->bw_txg = txg; 5302 pack->bw_data = 1 + ztest_random(-2ULL); 5303 5304 *bigH = *pack; 5305 *bigT = *pack; 5306 } 5307 } 5308 5309 #undef OD_ARRAY_SIZE 5310 #define OD_ARRAY_SIZE 2 5311 5312 void 5313 ztest_dmu_read_write_zcopy(ztest_ds_t *zd, uint64_t id) 5314 { 5315 objset_t *os = zd->zd_os; 5316 ztest_od_t *od; 5317 dmu_tx_t *tx; 5318 uint64_t i; 5319 int error; 5320 int size; 5321 uint64_t n, s, txg; 5322 bufwad_t *packbuf, *bigbuf; 5323 uint64_t packobj, packoff, packsize, bigobj, bigoff, bigsize; 5324 uint64_t blocksize = ztest_random_blocksize(); 5325 uint64_t chunksize = blocksize; 5326 uint64_t regions = 997; 5327 uint64_t stride = 123456789ULL; 5328 uint64_t width = 9; 5329 dmu_buf_t *bonus_db; 5330 arc_buf_t **bigbuf_arcbufs; 5331 dmu_object_info_t doi; 5332 5333 size = sizeof (ztest_od_t) * OD_ARRAY_SIZE; 5334 od = umem_alloc(size, UMEM_NOFAIL); 5335 5336 /* 5337 * This test uses two objects, packobj and bigobj, that are always 5338 * updated together (i.e. in the same tx) so that their contents are 5339 * in sync and can be compared. Their contents relate to each other 5340 * in a simple way: packobj is a dense array of 'bufwad' structures, 5341 * while bigobj is a sparse array of the same bufwads. Specifically, 5342 * for any index n, there are three bufwads that should be identical: 5343 * 5344 * packobj, at offset n * sizeof (bufwad_t) 5345 * bigobj, at the head of the nth chunk 5346 * bigobj, at the tail of the nth chunk 5347 * 5348 * The chunk size is set equal to bigobj block size so that 5349 * dmu_assign_arcbuf_by_dbuf() can be tested for object updates. 5350 */ 5351 5352 /* 5353 * Read the directory info. If it's the first time, set things up. 5354 */ 5355 ztest_od_init(od, id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, 0, 0); 5356 ztest_od_init(od + 1, id, FTAG, 1, DMU_OT_UINT64_OTHER, 0, 0, 5357 chunksize); 5358 5359 5360 if (ztest_object_init(zd, od, size, B_FALSE) != 0) { 5361 umem_free(od, size); 5362 return; 5363 } 5364 5365 bigobj = od[0].od_object; 5366 packobj = od[1].od_object; 5367 blocksize = od[0].od_blocksize; 5368 chunksize = blocksize; 5369 ASSERT3U(chunksize, ==, od[1].od_gen); 5370 5371 VERIFY0(dmu_object_info(os, bigobj, &doi)); 5372 VERIFY(ISP2(doi.doi_data_block_size)); 5373 VERIFY3U(chunksize, ==, doi.doi_data_block_size); 5374 VERIFY3U(chunksize, >=, 2 * sizeof (bufwad_t)); 5375 5376 /* 5377 * Pick a random index and compute the offsets into packobj and bigobj. 
5378 */ 5379 n = ztest_random(regions) * stride + ztest_random(width); 5380 s = 1 + ztest_random(width - 1); 5381 5382 packoff = n * sizeof (bufwad_t); 5383 packsize = s * sizeof (bufwad_t); 5384 5385 bigoff = n * chunksize; 5386 bigsize = s * chunksize; 5387 5388 packbuf = umem_zalloc(packsize, UMEM_NOFAIL); 5389 bigbuf = umem_zalloc(bigsize, UMEM_NOFAIL); 5390 5391 VERIFY0(dmu_bonus_hold(os, bigobj, FTAG, &bonus_db)); 5392 5393 bigbuf_arcbufs = umem_zalloc(2 * s * sizeof (arc_buf_t *), UMEM_NOFAIL); 5394 5395 /* 5396 * Iteration 0 test zcopy for DB_UNCACHED dbufs. 5397 * Iteration 1 test zcopy to already referenced dbufs. 5398 * Iteration 2 test zcopy to dirty dbuf in the same txg. 5399 * Iteration 3 test zcopy to dbuf dirty in previous txg. 5400 * Iteration 4 test zcopy when dbuf is no longer dirty. 5401 * Iteration 5 test zcopy when it can't be done. 5402 * Iteration 6 one more zcopy write. 5403 */ 5404 for (i = 0; i < 7; i++) { 5405 uint64_t j; 5406 uint64_t off; 5407 5408 /* 5409 * In iteration 5 (i == 5) use arcbufs 5410 * that don't match bigobj blksz to test 5411 * dmu_assign_arcbuf_by_dbuf() when it can't directly 5412 * assign an arcbuf to a dbuf. 5413 */ 5414 for (j = 0; j < s; j++) { 5415 if (i != 5 || chunksize < (SPA_MINBLOCKSIZE * 2)) { 5416 bigbuf_arcbufs[j] = 5417 dmu_request_arcbuf(bonus_db, chunksize); 5418 } else { 5419 bigbuf_arcbufs[2 * j] = 5420 dmu_request_arcbuf(bonus_db, chunksize / 2); 5421 bigbuf_arcbufs[2 * j + 1] = 5422 dmu_request_arcbuf(bonus_db, chunksize / 2); 5423 } 5424 } 5425 5426 /* 5427 * Get a tx for the mods to both packobj and bigobj. 5428 */ 5429 tx = dmu_tx_create(os); 5430 5431 dmu_tx_hold_write(tx, packobj, packoff, packsize); 5432 dmu_tx_hold_write(tx, bigobj, bigoff, bigsize); 5433 5434 txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); 5435 if (txg == 0) { 5436 umem_free(packbuf, packsize); 5437 umem_free(bigbuf, bigsize); 5438 for (j = 0; j < s; j++) { 5439 if (i != 5 || 5440 chunksize < (SPA_MINBLOCKSIZE * 2)) { 5441 dmu_return_arcbuf(bigbuf_arcbufs[j]); 5442 } else { 5443 dmu_return_arcbuf( 5444 bigbuf_arcbufs[2 * j]); 5445 dmu_return_arcbuf( 5446 bigbuf_arcbufs[2 * j + 1]); 5447 } 5448 } 5449 umem_free(bigbuf_arcbufs, 2 * s * sizeof (arc_buf_t *)); 5450 umem_free(od, size); 5451 dmu_buf_rele(bonus_db, FTAG); 5452 return; 5453 } 5454 5455 /* 5456 * 50% of the time don't read objects in the 1st iteration to 5457 * test dmu_assign_arcbuf_by_dbuf() for the case when there are 5458 * no existing dbufs for the specified offsets. 5459 */ 5460 if (i != 0 || ztest_random(2) != 0) { 5461 error = dmu_read(os, packobj, packoff, 5462 packsize, packbuf, DMU_READ_PREFETCH); 5463 ASSERT0(error); 5464 error = dmu_read(os, bigobj, bigoff, bigsize, 5465 bigbuf, DMU_READ_PREFETCH); 5466 ASSERT0(error); 5467 } 5468 compare_and_update_pbbufs(s, packbuf, bigbuf, bigsize, 5469 n, chunksize, txg); 5470 5471 /* 5472 * We've verified all the old bufwads, and made new ones. 5473 * Now write them out. 
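		 * packobj is written with an ordinary dmu_write(); bigobj is
		 * updated by assigning the loaned arc bufs directly to its
		 * dbufs.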
5474 */ 5475 dmu_write(os, packobj, packoff, packsize, packbuf, tx); 5476 if (ztest_opts.zo_verbose >= 7) { 5477 (void) printf("writing offset %"PRIx64" size %"PRIx64"" 5478 " txg %"PRIx64"\n", 5479 bigoff, bigsize, txg); 5480 } 5481 for (off = bigoff, j = 0; j < s; j++, off += chunksize) { 5482 dmu_buf_t *dbt; 5483 if (i != 5 || chunksize < (SPA_MINBLOCKSIZE * 2)) { 5484 memcpy(bigbuf_arcbufs[j]->b_data, 5485 (caddr_t)bigbuf + (off - bigoff), 5486 chunksize); 5487 } else { 5488 memcpy(bigbuf_arcbufs[2 * j]->b_data, 5489 (caddr_t)bigbuf + (off - bigoff), 5490 chunksize / 2); 5491 memcpy(bigbuf_arcbufs[2 * j + 1]->b_data, 5492 (caddr_t)bigbuf + (off - bigoff) + 5493 chunksize / 2, 5494 chunksize / 2); 5495 } 5496 5497 if (i == 1) { 5498 VERIFY(dmu_buf_hold(os, bigobj, off, 5499 FTAG, &dbt, DMU_READ_NO_PREFETCH) == 0); 5500 } 5501 if (i != 5 || chunksize < (SPA_MINBLOCKSIZE * 2)) { 5502 VERIFY0(dmu_assign_arcbuf_by_dbuf(bonus_db, 5503 off, bigbuf_arcbufs[j], tx)); 5504 } else { 5505 VERIFY0(dmu_assign_arcbuf_by_dbuf(bonus_db, 5506 off, bigbuf_arcbufs[2 * j], tx)); 5507 VERIFY0(dmu_assign_arcbuf_by_dbuf(bonus_db, 5508 off + chunksize / 2, 5509 bigbuf_arcbufs[2 * j + 1], tx)); 5510 } 5511 if (i == 1) { 5512 dmu_buf_rele(dbt, FTAG); 5513 } 5514 } 5515 dmu_tx_commit(tx); 5516 5517 /* 5518 * Sanity check the stuff we just wrote. 5519 */ 5520 { 5521 void *packcheck = umem_alloc(packsize, UMEM_NOFAIL); 5522 void *bigcheck = umem_alloc(bigsize, UMEM_NOFAIL); 5523 5524 VERIFY0(dmu_read(os, packobj, packoff, 5525 packsize, packcheck, DMU_READ_PREFETCH)); 5526 VERIFY0(dmu_read(os, bigobj, bigoff, 5527 bigsize, bigcheck, DMU_READ_PREFETCH)); 5528 5529 ASSERT0(memcmp(packbuf, packcheck, packsize)); 5530 ASSERT0(memcmp(bigbuf, bigcheck, bigsize)); 5531 5532 umem_free(packcheck, packsize); 5533 umem_free(bigcheck, bigsize); 5534 } 5535 if (i == 2) { 5536 txg_wait_open(dmu_objset_pool(os), 0, B_TRUE); 5537 } else if (i == 3) { 5538 txg_wait_synced(dmu_objset_pool(os), 0); 5539 } 5540 } 5541 5542 dmu_buf_rele(bonus_db, FTAG); 5543 umem_free(packbuf, packsize); 5544 umem_free(bigbuf, bigsize); 5545 umem_free(bigbuf_arcbufs, 2 * s * sizeof (arc_buf_t *)); 5546 umem_free(od, size); 5547 } 5548 5549 void 5550 ztest_dmu_write_parallel(ztest_ds_t *zd, uint64_t id) 5551 { 5552 (void) id; 5553 ztest_od_t *od; 5554 5555 od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL); 5556 uint64_t offset = (1ULL << (ztest_random(20) + 43)) + 5557 (ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT); 5558 5559 /* 5560 * Have multiple threads write to large offsets in an object 5561 * to verify that parallel writes to an object -- even to the 5562 * same blocks within the object -- doesn't cause any trouble. 
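	 * Every caller passes ID_PARALLEL here, so all threads operate on
	 * the same shared object.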
5563 */ 5564 ztest_od_init(od, ID_PARALLEL, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0, 0); 5565 5566 if (ztest_object_init(zd, od, sizeof (ztest_od_t), B_FALSE) != 0) 5567 return; 5568 5569 while (ztest_random(10) != 0) 5570 ztest_io(zd, od->od_object, offset); 5571 5572 umem_free(od, sizeof (ztest_od_t)); 5573 } 5574 5575 void 5576 ztest_dmu_prealloc(ztest_ds_t *zd, uint64_t id) 5577 { 5578 ztest_od_t *od; 5579 uint64_t offset = (1ULL << (ztest_random(4) + SPA_MAXBLOCKSHIFT)) + 5580 (ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT); 5581 uint64_t count = ztest_random(20) + 1; 5582 uint64_t blocksize = ztest_random_blocksize(); 5583 void *data; 5584 5585 od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL); 5586 5587 ztest_od_init(od, id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, 0, 0); 5588 5589 if (ztest_object_init(zd, od, sizeof (ztest_od_t), 5590 !ztest_random(2)) != 0) { 5591 umem_free(od, sizeof (ztest_od_t)); 5592 return; 5593 } 5594 5595 if (ztest_truncate(zd, od->od_object, offset, count * blocksize) != 0) { 5596 umem_free(od, sizeof (ztest_od_t)); 5597 return; 5598 } 5599 5600 ztest_prealloc(zd, od->od_object, offset, count * blocksize); 5601 5602 data = umem_zalloc(blocksize, UMEM_NOFAIL); 5603 5604 while (ztest_random(count) != 0) { 5605 uint64_t randoff = offset + (ztest_random(count) * blocksize); 5606 if (ztest_write(zd, od->od_object, randoff, blocksize, 5607 data) != 0) 5608 break; 5609 while (ztest_random(4) != 0) 5610 ztest_io(zd, od->od_object, randoff); 5611 } 5612 5613 umem_free(data, blocksize); 5614 umem_free(od, sizeof (ztest_od_t)); 5615 } 5616 5617 /* 5618 * Verify that zap_{create,destroy,add,remove,update} work as expected. 5619 */ 5620 #define ZTEST_ZAP_MIN_INTS 1 5621 #define ZTEST_ZAP_MAX_INTS 4 5622 #define ZTEST_ZAP_MAX_PROPS 1000 5623 5624 void 5625 ztest_zap(ztest_ds_t *zd, uint64_t id) 5626 { 5627 objset_t *os = zd->zd_os; 5628 ztest_od_t *od; 5629 uint64_t object; 5630 uint64_t txg, last_txg; 5631 uint64_t value[ZTEST_ZAP_MAX_INTS]; 5632 uint64_t zl_ints, zl_intsize, prop; 5633 int i, ints; 5634 dmu_tx_t *tx; 5635 char propname[100], txgname[100]; 5636 int error; 5637 const char *const hc[2] = { "s.acl.h", ".s.open.h.hyLZlg" }; 5638 5639 od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL); 5640 ztest_od_init(od, id, FTAG, 0, DMU_OT_ZAP_OTHER, 0, 0, 0); 5641 5642 if (ztest_object_init(zd, od, sizeof (ztest_od_t), 5643 !ztest_random(2)) != 0) 5644 goto out; 5645 5646 object = od->od_object; 5647 5648 /* 5649 * Generate a known hash collision, and verify that 5650 * we can lookup and remove both entries. 5651 */ 5652 tx = dmu_tx_create(os); 5653 dmu_tx_hold_zap(tx, object, B_TRUE, NULL); 5654 txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); 5655 if (txg == 0) 5656 goto out; 5657 for (i = 0; i < 2; i++) { 5658 value[i] = i; 5659 VERIFY0(zap_add(os, object, hc[i], sizeof (uint64_t), 5660 1, &value[i], tx)); 5661 } 5662 for (i = 0; i < 2; i++) { 5663 VERIFY3U(EEXIST, ==, zap_add(os, object, hc[i], 5664 sizeof (uint64_t), 1, &value[i], tx)); 5665 VERIFY0( 5666 zap_length(os, object, hc[i], &zl_intsize, &zl_ints)); 5667 ASSERT3U(zl_intsize, ==, sizeof (uint64_t)); 5668 ASSERT3U(zl_ints, ==, 1); 5669 } 5670 for (i = 0; i < 2; i++) { 5671 VERIFY0(zap_remove(os, object, hc[i], tx)); 5672 } 5673 dmu_tx_commit(tx); 5674 5675 /* 5676 * Generate a bunch of random entries. 
5677 */ 5678 ints = MAX(ZTEST_ZAP_MIN_INTS, object % ZTEST_ZAP_MAX_INTS); 5679 5680 prop = ztest_random(ZTEST_ZAP_MAX_PROPS); 5681 (void) sprintf(propname, "prop_%"PRIu64"", prop); 5682 (void) sprintf(txgname, "txg_%"PRIu64"", prop); 5683 memset(value, 0, sizeof (value)); 5684 last_txg = 0; 5685 5686 /* 5687 * If these zap entries already exist, validate their contents. 5688 */ 5689 error = zap_length(os, object, txgname, &zl_intsize, &zl_ints); 5690 if (error == 0) { 5691 ASSERT3U(zl_intsize, ==, sizeof (uint64_t)); 5692 ASSERT3U(zl_ints, ==, 1); 5693 5694 VERIFY0(zap_lookup(os, object, txgname, zl_intsize, 5695 zl_ints, &last_txg)); 5696 5697 VERIFY0(zap_length(os, object, propname, &zl_intsize, 5698 &zl_ints)); 5699 5700 ASSERT3U(zl_intsize, ==, sizeof (uint64_t)); 5701 ASSERT3U(zl_ints, ==, ints); 5702 5703 VERIFY0(zap_lookup(os, object, propname, zl_intsize, 5704 zl_ints, value)); 5705 5706 for (i = 0; i < ints; i++) { 5707 ASSERT3U(value[i], ==, last_txg + object + i); 5708 } 5709 } else { 5710 ASSERT3U(error, ==, ENOENT); 5711 } 5712 5713 /* 5714 * Atomically update two entries in our zap object. 5715 * The first is named txg_%llu, and contains the txg 5716 * in which the property was last updated. The second 5717 * is named prop_%llu, and the nth element of its value 5718 * should be txg + object + n. 5719 */ 5720 tx = dmu_tx_create(os); 5721 dmu_tx_hold_zap(tx, object, B_TRUE, NULL); 5722 txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); 5723 if (txg == 0) 5724 goto out; 5725 5726 if (last_txg > txg) 5727 fatal(B_FALSE, "zap future leak: old %"PRIu64" new %"PRIu64"", 5728 last_txg, txg); 5729 5730 for (i = 0; i < ints; i++) 5731 value[i] = txg + object + i; 5732 5733 VERIFY0(zap_update(os, object, txgname, sizeof (uint64_t), 5734 1, &txg, tx)); 5735 VERIFY0(zap_update(os, object, propname, sizeof (uint64_t), 5736 ints, value, tx)); 5737 5738 dmu_tx_commit(tx); 5739 5740 /* 5741 * Remove a random pair of entries. 5742 */ 5743 prop = ztest_random(ZTEST_ZAP_MAX_PROPS); 5744 (void) sprintf(propname, "prop_%"PRIu64"", prop); 5745 (void) sprintf(txgname, "txg_%"PRIu64"", prop); 5746 5747 error = zap_length(os, object, txgname, &zl_intsize, &zl_ints); 5748 5749 if (error == ENOENT) 5750 goto out; 5751 5752 ASSERT0(error); 5753 5754 tx = dmu_tx_create(os); 5755 dmu_tx_hold_zap(tx, object, B_TRUE, NULL); 5756 txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); 5757 if (txg == 0) 5758 goto out; 5759 VERIFY0(zap_remove(os, object, txgname, tx)); 5760 VERIFY0(zap_remove(os, object, propname, tx)); 5761 dmu_tx_commit(tx); 5762 out: 5763 umem_free(od, sizeof (ztest_od_t)); 5764 } 5765 5766 /* 5767 * Test case to test the upgrading of a microzap to fatzap. 5768 */ 5769 void 5770 ztest_fzap(ztest_ds_t *zd, uint64_t id) 5771 { 5772 objset_t *os = zd->zd_os; 5773 ztest_od_t *od; 5774 uint64_t object, txg, value; 5775 5776 od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL); 5777 ztest_od_init(od, id, FTAG, 0, DMU_OT_ZAP_OTHER, 0, 0, 0); 5778 5779 if (ztest_object_init(zd, od, sizeof (ztest_od_t), 5780 !ztest_random(2)) != 0) 5781 goto out; 5782 object = od->od_object; 5783 5784 /* 5785 * Add entries to this ZAP and make sure it spills over 5786 * and gets upgraded to a fatzap. Also, since we are adding 5787 * 2050 entries we should see ptrtbl growth and leaf-block split. 
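	 * (A microzap is confined to a single block, so it cannot hold
	 * this many entries.)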
5788 */ 5789 for (value = 0; value < 2050; value++) { 5790 char name[ZFS_MAX_DATASET_NAME_LEN]; 5791 dmu_tx_t *tx; 5792 int error; 5793 5794 (void) snprintf(name, sizeof (name), "fzap-%"PRIu64"-%"PRIu64"", 5795 id, value); 5796 5797 tx = dmu_tx_create(os); 5798 dmu_tx_hold_zap(tx, object, B_TRUE, name); 5799 txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); 5800 if (txg == 0) 5801 goto out; 5802 error = zap_add(os, object, name, sizeof (uint64_t), 1, 5803 &value, tx); 5804 ASSERT(error == 0 || error == EEXIST); 5805 dmu_tx_commit(tx); 5806 } 5807 out: 5808 umem_free(od, sizeof (ztest_od_t)); 5809 } 5810 5811 void 5812 ztest_zap_parallel(ztest_ds_t *zd, uint64_t id) 5813 { 5814 (void) id; 5815 objset_t *os = zd->zd_os; 5816 ztest_od_t *od; 5817 uint64_t txg, object, count, wsize, wc, zl_wsize, zl_wc; 5818 dmu_tx_t *tx; 5819 int i, namelen, error; 5820 int micro = ztest_random(2); 5821 char name[20], string_value[20]; 5822 void *data; 5823 5824 od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL); 5825 ztest_od_init(od, ID_PARALLEL, FTAG, micro, DMU_OT_ZAP_OTHER, 0, 0, 0); 5826 5827 if (ztest_object_init(zd, od, sizeof (ztest_od_t), B_FALSE) != 0) { 5828 umem_free(od, sizeof (ztest_od_t)); 5829 return; 5830 } 5831 5832 object = od->od_object; 5833 5834 /* 5835 * Generate a random name of the form 'xxx.....' where each 5836 * x is a random printable character and the dots are dots. 5837 * There are 94 such characters, and the name length goes from 5838 * 6 to 20, so there are 94^3 * 15 = 12,458,760 possible names. 5839 */ 5840 namelen = ztest_random(sizeof (name) - 5) + 5 + 1; 5841 5842 for (i = 0; i < 3; i++) 5843 name[i] = '!' + ztest_random('~' - '!' + 1); 5844 for (; i < namelen - 1; i++) 5845 name[i] = '.'; 5846 name[i] = '\0'; 5847 5848 if ((namelen & 1) || micro) { 5849 wsize = sizeof (txg); 5850 wc = 1; 5851 data = &txg; 5852 } else { 5853 wsize = 1; 5854 wc = namelen; 5855 data = string_value; 5856 } 5857 5858 count = -1ULL; 5859 VERIFY0(zap_count(os, object, &count)); 5860 ASSERT3S(count, !=, -1ULL); 5861 5862 /* 5863 * Select an operation: length, lookup, add, update, remove. 
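	 * Operations 2-4 modify the ZAP and therefore need a transaction;
	 * operations 0 and 1 are read-only.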
5864 */ 5865 i = ztest_random(5); 5866 5867 if (i >= 2) { 5868 tx = dmu_tx_create(os); 5869 dmu_tx_hold_zap(tx, object, B_TRUE, NULL); 5870 txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); 5871 if (txg == 0) { 5872 umem_free(od, sizeof (ztest_od_t)); 5873 return; 5874 } 5875 memcpy(string_value, name, namelen); 5876 } else { 5877 tx = NULL; 5878 txg = 0; 5879 memset(string_value, 0, namelen); 5880 } 5881 5882 switch (i) { 5883 5884 case 0: 5885 error = zap_length(os, object, name, &zl_wsize, &zl_wc); 5886 if (error == 0) { 5887 ASSERT3U(wsize, ==, zl_wsize); 5888 ASSERT3U(wc, ==, zl_wc); 5889 } else { 5890 ASSERT3U(error, ==, ENOENT); 5891 } 5892 break; 5893 5894 case 1: 5895 error = zap_lookup(os, object, name, wsize, wc, data); 5896 if (error == 0) { 5897 if (data == string_value && 5898 memcmp(name, data, namelen) != 0) 5899 fatal(B_FALSE, "name '%s' != val '%s' len %d", 5900 name, (char *)data, namelen); 5901 } else { 5902 ASSERT3U(error, ==, ENOENT); 5903 } 5904 break; 5905 5906 case 2: 5907 error = zap_add(os, object, name, wsize, wc, data, tx); 5908 ASSERT(error == 0 || error == EEXIST); 5909 break; 5910 5911 case 3: 5912 VERIFY0(zap_update(os, object, name, wsize, wc, data, tx)); 5913 break; 5914 5915 case 4: 5916 error = zap_remove(os, object, name, tx); 5917 ASSERT(error == 0 || error == ENOENT); 5918 break; 5919 } 5920 5921 if (tx != NULL) 5922 dmu_tx_commit(tx); 5923 5924 umem_free(od, sizeof (ztest_od_t)); 5925 } 5926 5927 /* 5928 * Commit callback data. 5929 */ 5930 typedef struct ztest_cb_data { 5931 list_node_t zcd_node; 5932 uint64_t zcd_txg; 5933 int zcd_expected_err; 5934 boolean_t zcd_added; 5935 boolean_t zcd_called; 5936 spa_t *zcd_spa; 5937 } ztest_cb_data_t; 5938 5939 /* This is the actual commit callback function */ 5940 static void 5941 ztest_commit_callback(void *arg, int error) 5942 { 5943 ztest_cb_data_t *data = arg; 5944 uint64_t synced_txg; 5945 5946 VERIFY3P(data, !=, NULL); 5947 VERIFY3S(data->zcd_expected_err, ==, error); 5948 VERIFY(!data->zcd_called); 5949 5950 synced_txg = spa_last_synced_txg(data->zcd_spa); 5951 if (data->zcd_txg > synced_txg) 5952 fatal(B_FALSE, 5953 "commit callback of txg %"PRIu64" called prematurely, " 5954 "last synced txg = %"PRIu64"\n", 5955 data->zcd_txg, synced_txg); 5956 5957 data->zcd_called = B_TRUE; 5958 5959 if (error == ECANCELED) { 5960 ASSERT0(data->zcd_txg); 5961 ASSERT(!data->zcd_added); 5962 5963 /* 5964 * The private callback data should be destroyed here, but 5965 * since we are going to check the zcd_called field after 5966 * dmu_tx_abort(), we will destroy it there. 
5967 */ 5968 return; 5969 } 5970 5971 ASSERT(data->zcd_added); 5972 ASSERT3U(data->zcd_txg, !=, 0); 5973 5974 (void) mutex_enter(&zcl.zcl_callbacks_lock); 5975 5976 /* See if this cb was called more quickly */ 5977 if ((synced_txg - data->zcd_txg) < zc_min_txg_delay) 5978 zc_min_txg_delay = synced_txg - data->zcd_txg; 5979 5980 /* Remove our callback from the list */ 5981 list_remove(&zcl.zcl_callbacks, data); 5982 5983 (void) mutex_exit(&zcl.zcl_callbacks_lock); 5984 5985 umem_free(data, sizeof (ztest_cb_data_t)); 5986 } 5987 5988 /* Allocate and initialize callback data structure */ 5989 static ztest_cb_data_t * 5990 ztest_create_cb_data(objset_t *os, uint64_t txg) 5991 { 5992 ztest_cb_data_t *cb_data; 5993 5994 cb_data = umem_zalloc(sizeof (ztest_cb_data_t), UMEM_NOFAIL); 5995 5996 cb_data->zcd_txg = txg; 5997 cb_data->zcd_spa = dmu_objset_spa(os); 5998 list_link_init(&cb_data->zcd_node); 5999 6000 return (cb_data); 6001 } 6002 6003 /* 6004 * Commit callback test. 6005 */ 6006 void 6007 ztest_dmu_commit_callbacks(ztest_ds_t *zd, uint64_t id) 6008 { 6009 objset_t *os = zd->zd_os; 6010 ztest_od_t *od; 6011 dmu_tx_t *tx; 6012 ztest_cb_data_t *cb_data[3], *tmp_cb; 6013 uint64_t old_txg, txg; 6014 int i, error = 0; 6015 6016 od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL); 6017 ztest_od_init(od, id, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0, 0); 6018 6019 if (ztest_object_init(zd, od, sizeof (ztest_od_t), B_FALSE) != 0) { 6020 umem_free(od, sizeof (ztest_od_t)); 6021 return; 6022 } 6023 6024 tx = dmu_tx_create(os); 6025 6026 cb_data[0] = ztest_create_cb_data(os, 0); 6027 dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[0]); 6028 6029 dmu_tx_hold_write(tx, od->od_object, 0, sizeof (uint64_t)); 6030 6031 /* Every once in a while, abort the transaction on purpose */ 6032 if (ztest_random(100) == 0) 6033 error = -1; 6034 6035 if (!error) 6036 error = dmu_tx_assign(tx, TXG_NOWAIT); 6037 6038 txg = error ? 0 : dmu_tx_get_txg(tx); 6039 6040 cb_data[0]->zcd_txg = txg; 6041 cb_data[1] = ztest_create_cb_data(os, txg); 6042 dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[1]); 6043 6044 if (error) { 6045 /* 6046 * It's not a strict requirement to call the registered 6047 * callbacks from inside dmu_tx_abort(), but that's what 6048 * it's supposed to happen in the current implementation 6049 * so we will check for that. 6050 */ 6051 for (i = 0; i < 2; i++) { 6052 cb_data[i]->zcd_expected_err = ECANCELED; 6053 VERIFY(!cb_data[i]->zcd_called); 6054 } 6055 6056 dmu_tx_abort(tx); 6057 6058 for (i = 0; i < 2; i++) { 6059 VERIFY(cb_data[i]->zcd_called); 6060 umem_free(cb_data[i], sizeof (ztest_cb_data_t)); 6061 } 6062 6063 umem_free(od, sizeof (ztest_od_t)); 6064 return; 6065 } 6066 6067 cb_data[2] = ztest_create_cb_data(os, txg); 6068 dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[2]); 6069 6070 /* 6071 * Read existing data to make sure there isn't a future leak. 
6072 */ 6073 VERIFY0(dmu_read(os, od->od_object, 0, sizeof (uint64_t), 6074 &old_txg, DMU_READ_PREFETCH)); 6075 6076 if (old_txg > txg) 6077 fatal(B_FALSE, 6078 "future leak: got %"PRIu64", open txg is %"PRIu64"", 6079 old_txg, txg); 6080 6081 dmu_write(os, od->od_object, 0, sizeof (uint64_t), &txg, tx); 6082 6083 (void) mutex_enter(&zcl.zcl_callbacks_lock); 6084 6085 /* 6086 * Since commit callbacks don't have any ordering requirement and since 6087 * it is theoretically possible for a commit callback to be called 6088 * after an arbitrary amount of time has elapsed since its txg has been 6089 * synced, it is difficult to reliably determine whether a commit 6090 * callback hasn't been called due to high load or due to a flawed 6091 * implementation. 6092 * 6093 * In practice, we will assume that if after a certain number of txgs a 6094 * commit callback hasn't been called, then most likely there's an 6095 * implementation bug. 6096 */ 6097 tmp_cb = list_head(&zcl.zcl_callbacks); 6098 if (tmp_cb != NULL && 6099 tmp_cb->zcd_txg + ZTEST_COMMIT_CB_THRESH < txg) { 6100 fatal(B_FALSE, 6101 "Commit callback threshold exceeded, " 6102 "oldest txg: %"PRIu64", open txg: %"PRIu64"\n", 6103 tmp_cb->zcd_txg, txg); 6104 } 6105 6106 /* 6107 * Let's find the place to insert our callbacks. 6108 * 6109 * Even though the list is ordered by txg, it is possible for the 6110 * insertion point to not be the end because our txg may already be 6111 * quiescing at this point and other callbacks in the open txg 6112 * (from other objsets) may have sneaked in. 6113 */ 6114 tmp_cb = list_tail(&zcl.zcl_callbacks); 6115 while (tmp_cb != NULL && tmp_cb->zcd_txg > txg) 6116 tmp_cb = list_prev(&zcl.zcl_callbacks, tmp_cb); 6117 6118 /* Add the 3 callbacks to the list */ 6119 for (i = 0; i < 3; i++) { 6120 if (tmp_cb == NULL) 6121 list_insert_head(&zcl.zcl_callbacks, cb_data[i]); 6122 else 6123 list_insert_after(&zcl.zcl_callbacks, tmp_cb, 6124 cb_data[i]); 6125 6126 cb_data[i]->zcd_added = B_TRUE; 6127 VERIFY(!cb_data[i]->zcd_called); 6128 6129 tmp_cb = cb_data[i]; 6130 } 6131 6132 zc_cb_counter += 3; 6133 6134 (void) mutex_exit(&zcl.zcl_callbacks_lock); 6135 6136 dmu_tx_commit(tx); 6137 6138 umem_free(od, sizeof (ztest_od_t)); 6139 } 6140 6141 /* 6142 * Visit each object in the dataset. Verify that its properties 6143 * are consistent with what was stored in the block tag when it was created, 6144 * and that its unused bonus buffer space has not been overwritten.
6145 */ 6146 void 6147 ztest_verify_dnode_bt(ztest_ds_t *zd, uint64_t id) 6148 { 6149 (void) id; 6150 objset_t *os = zd->zd_os; 6151 uint64_t obj; 6152 int err = 0; 6153 6154 for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE, 0)) { 6155 ztest_block_tag_t *bt = NULL; 6156 dmu_object_info_t doi; 6157 dmu_buf_t *db; 6158 6159 ztest_object_lock(zd, obj, ZTRL_READER); 6160 if (dmu_bonus_hold(os, obj, FTAG, &db) != 0) { 6161 ztest_object_unlock(zd, obj); 6162 continue; 6163 } 6164 6165 dmu_object_info_from_db(db, &doi); 6166 if (doi.doi_bonus_size >= sizeof (*bt)) 6167 bt = ztest_bt_bonus(db); 6168 6169 if (bt && bt->bt_magic == BT_MAGIC) { 6170 ztest_bt_verify(bt, os, obj, doi.doi_dnodesize, 6171 bt->bt_offset, bt->bt_gen, bt->bt_txg, 6172 bt->bt_crtxg); 6173 ztest_verify_unused_bonus(db, bt, obj, os, bt->bt_gen); 6174 } 6175 6176 dmu_buf_rele(db, FTAG); 6177 ztest_object_unlock(zd, obj); 6178 } 6179 } 6180 6181 void 6182 ztest_dsl_prop_get_set(ztest_ds_t *zd, uint64_t id) 6183 { 6184 (void) id; 6185 zfs_prop_t proplist[] = { 6186 ZFS_PROP_CHECKSUM, 6187 ZFS_PROP_COMPRESSION, 6188 ZFS_PROP_COPIES, 6189 ZFS_PROP_DEDUP 6190 }; 6191 6192 (void) pthread_rwlock_rdlock(&ztest_name_lock); 6193 6194 for (int p = 0; p < sizeof (proplist) / sizeof (proplist[0]); p++) { 6195 int error = ztest_dsl_prop_set_uint64(zd->zd_name, proplist[p], 6196 ztest_random_dsl_prop(proplist[p]), (int)ztest_random(2)); 6197 ASSERT(error == 0 || error == ENOSPC); 6198 } 6199 6200 int error = ztest_dsl_prop_set_uint64(zd->zd_name, ZFS_PROP_RECORDSIZE, 6201 ztest_random_blocksize(), (int)ztest_random(2)); 6202 ASSERT(error == 0 || error == ENOSPC); 6203 6204 (void) pthread_rwlock_unlock(&ztest_name_lock); 6205 } 6206 6207 void 6208 ztest_spa_prop_get_set(ztest_ds_t *zd, uint64_t id) 6209 { 6210 (void) zd, (void) id; 6211 nvlist_t *props = NULL; 6212 6213 (void) pthread_rwlock_rdlock(&ztest_name_lock); 6214 6215 (void) ztest_spa_prop_set_uint64(ZPOOL_PROP_AUTOTRIM, ztest_random(2)); 6216 6217 VERIFY0(spa_prop_get(ztest_spa, &props)); 6218 6219 if (ztest_opts.zo_verbose >= 6) 6220 dump_nvlist(props, 4); 6221 6222 fnvlist_free(props); 6223 6224 (void) pthread_rwlock_unlock(&ztest_name_lock); 6225 } 6226 6227 static int 6228 user_release_one(const char *snapname, const char *holdname) 6229 { 6230 nvlist_t *snaps, *holds; 6231 int error; 6232 6233 snaps = fnvlist_alloc(); 6234 holds = fnvlist_alloc(); 6235 fnvlist_add_boolean(holds, holdname); 6236 fnvlist_add_nvlist(snaps, snapname, holds); 6237 fnvlist_free(holds); 6238 error = dsl_dataset_user_release(snaps, NULL); 6239 fnvlist_free(snaps); 6240 return (error); 6241 } 6242 6243 /* 6244 * Test snapshot hold/release and deferred destroy. 6245 */ 6246 void 6247 ztest_dmu_snapshot_hold(ztest_ds_t *zd, uint64_t id) 6248 { 6249 int error; 6250 objset_t *os = zd->zd_os; 6251 objset_t *origin; 6252 char snapname[100]; 6253 char fullname[100]; 6254 char clonename[100]; 6255 char tag[100]; 6256 char osname[ZFS_MAX_DATASET_NAME_LEN]; 6257 nvlist_t *holds; 6258 6259 (void) pthread_rwlock_rdlock(&ztest_name_lock); 6260 6261 dmu_objset_name(os, osname); 6262 6263 (void) snprintf(snapname, sizeof (snapname), "sh1_%"PRIu64"", id); 6264 (void) snprintf(fullname, sizeof (fullname), "%s@%s", osname, snapname); 6265 (void) snprintf(clonename, sizeof (clonename), "%s/ch1_%"PRIu64"", 6266 osname, id); 6267 (void) snprintf(tag, sizeof (tag), "tag_%"PRIu64"", id); 6268 6269 /* 6270 * Clean up from any previous run. 
6271 */ 6272 error = dsl_destroy_head(clonename); 6273 if (error != ENOENT) 6274 ASSERT0(error); 6275 error = user_release_one(fullname, tag); 6276 if (error != ESRCH && error != ENOENT) 6277 ASSERT0(error); 6278 error = dsl_destroy_snapshot(fullname, B_FALSE); 6279 if (error != ENOENT) 6280 ASSERT0(error); 6281 6282 /* 6283 * Create snapshot, clone it, mark snap for deferred destroy, 6284 * destroy clone, verify snap was also destroyed. 6285 */ 6286 error = dmu_objset_snapshot_one(osname, snapname); 6287 if (error) { 6288 if (error == ENOSPC) { 6289 ztest_record_enospc("dmu_objset_snapshot"); 6290 goto out; 6291 } 6292 fatal(B_FALSE, "dmu_objset_snapshot(%s) = %d", fullname, error); 6293 } 6294 6295 error = dmu_objset_clone(clonename, fullname); 6296 if (error) { 6297 if (error == ENOSPC) { 6298 ztest_record_enospc("dmu_objset_clone"); 6299 goto out; 6300 } 6301 fatal(B_FALSE, "dmu_objset_clone(%s) = %d", clonename, error); 6302 } 6303 6304 error = dsl_destroy_snapshot(fullname, B_TRUE); 6305 if (error) { 6306 fatal(B_FALSE, "dsl_destroy_snapshot(%s, B_TRUE) = %d", 6307 fullname, error); 6308 } 6309 6310 error = dsl_destroy_head(clonename); 6311 if (error) 6312 fatal(B_FALSE, "dsl_destroy_head(%s) = %d", clonename, error); 6313 6314 error = dmu_objset_hold(fullname, FTAG, &origin); 6315 if (error != ENOENT) 6316 fatal(B_FALSE, "dmu_objset_hold(%s) = %d", fullname, error); 6317 6318 /* 6319 * Create snapshot, add temporary hold, verify that we can't 6320 * destroy a held snapshot, mark for deferred destroy, 6321 * release hold, verify snapshot was destroyed. 6322 */ 6323 error = dmu_objset_snapshot_one(osname, snapname); 6324 if (error) { 6325 if (error == ENOSPC) { 6326 ztest_record_enospc("dmu_objset_snapshot"); 6327 goto out; 6328 } 6329 fatal(B_FALSE, "dmu_objset_snapshot(%s) = %d", fullname, error); 6330 } 6331 6332 holds = fnvlist_alloc(); 6333 fnvlist_add_string(holds, fullname, tag); 6334 error = dsl_dataset_user_hold(holds, 0, NULL); 6335 fnvlist_free(holds); 6336 6337 if (error == ENOSPC) { 6338 ztest_record_enospc("dsl_dataset_user_hold"); 6339 goto out; 6340 } else if (error) { 6341 fatal(B_FALSE, "dsl_dataset_user_hold(%s, %s) = %u", 6342 fullname, tag, error); 6343 } 6344 6345 error = dsl_destroy_snapshot(fullname, B_FALSE); 6346 if (error != EBUSY) { 6347 fatal(B_FALSE, "dsl_destroy_snapshot(%s, B_FALSE) = %d", 6348 fullname, error); 6349 } 6350 6351 error = dsl_destroy_snapshot(fullname, B_TRUE); 6352 if (error) { 6353 fatal(B_FALSE, "dsl_destroy_snapshot(%s, B_TRUE) = %d", 6354 fullname, error); 6355 } 6356 6357 error = user_release_one(fullname, tag); 6358 if (error) 6359 fatal(B_FALSE, "user_release_one(%s, %s) = %d", 6360 fullname, tag, error); 6361 6362 VERIFY3U(dmu_objset_hold(fullname, FTAG, &origin), ==, ENOENT); 6363 6364 out: 6365 (void) pthread_rwlock_unlock(&ztest_name_lock); 6366 } 6367 6368 /* 6369 * Inject random faults into the on-disk data. 
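 * Depending on the random choices below, this either flags a leaf vdev
 * unreadable/unwritable (or closes its backing file descriptor), or
 * overwrites on-disk data with a known bad word; the damage is kept
 * within what the pool's redundancy (maxfaults) can repair.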
6370 */ 6371 void 6372 ztest_fault_inject(ztest_ds_t *zd, uint64_t id) 6373 { 6374 (void) zd, (void) id; 6375 ztest_shared_t *zs = ztest_shared; 6376 spa_t *spa = ztest_spa; 6377 int fd; 6378 uint64_t offset; 6379 uint64_t leaves; 6380 uint64_t bad = 0x1990c0ffeedecadeull; 6381 uint64_t top, leaf; 6382 uint64_t raidz_children; 6383 char *path0; 6384 char *pathrand; 6385 size_t fsize; 6386 int bshift = SPA_MAXBLOCKSHIFT + 2; 6387 int iters = 1000; 6388 int maxfaults; 6389 int mirror_save; 6390 vdev_t *vd0 = NULL; 6391 uint64_t guid0 = 0; 6392 boolean_t islog = B_FALSE; 6393 boolean_t injected = B_FALSE; 6394 6395 path0 = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); 6396 pathrand = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); 6397 6398 mutex_enter(&ztest_vdev_lock); 6399 6400 /* 6401 * While device removal is in progress, fault injection must be disabled 6402 * until it completes and the pool is scrubbed. The fault injection 6403 * strategy for damaging blocks does not take into account evacuated 6404 * blocks which may have already been damaged. 6405 */ 6406 if (ztest_device_removal_active) 6407 goto out; 6408 6409 /* 6410 * The fault injection strategy for damaging blocks cannot be used 6411 * if raidz expansion is in progress. The leaves value 6412 * (attached raidz children) is variable and the strategy for damaging 6413 * blocks would corrupt the same data blocks on different child vdevs 6414 * because of the reflow process. 6415 */ 6416 if (spa->spa_raidz_expand != NULL) 6417 goto out; 6418 6419 maxfaults = MAXFAULTS(zs); 6420 raidz_children = ztest_get_raidz_children(spa); 6421 leaves = MAX(zs->zs_mirrors, 1) * raidz_children; 6422 mirror_save = zs->zs_mirrors; 6423 6424 ASSERT3U(leaves, >=, 1); 6425 6426 /* 6427 * While ztest is running the number of leaves will not change. This 6428 * is critical for the fault injection logic as it determines where 6429 * errors can be safely injected such that they are always repairable. 6430 * 6431 * When restarting ztest a different number of leaves may be requested 6432 * which will shift the regions to be damaged. This is fine as long 6433 * as the pool has been scrubbed prior to using the new mapping. 6434 * Failure to do so can result in non-repairable damage being injected. 6435 */ 6436 if (ztest_pool_scrubbed == B_FALSE) 6437 goto out; 6438 6439 /* 6440 * Grab the name lock as reader. There are some operations 6441 * which don't like to have their vdevs changed while 6442 * they are in progress (e.g. spa_change_guid). Those 6443 * operations will have grabbed the name lock as writer. 6444 */ 6445 (void) pthread_rwlock_rdlock(&ztest_name_lock); 6446 6447 /* 6448 * We need SCL_STATE here because we're going to look at vd0->vdev_tsd. 6449 */ 6450 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 6451 6452 if (ztest_random(2) == 0) { 6453 /* 6454 * Inject errors on a normal data device or slog device. 6455 */ 6456 top = ztest_random_vdev_top(spa, B_TRUE); 6457 leaf = ztest_random(leaves) + zs->zs_splits; 6458 6459 /* 6460 * Generate paths to the first leaf in this top-level vdev, 6461 * and to the random leaf we selected. We'll induce transient 6462 * write failures and random online/offline activity on leaf 0, 6463 * and we'll write random garbage to the randomly chosen leaf.
6464 */ 6465 (void) snprintf(path0, MAXPATHLEN, ztest_dev_template, 6466 ztest_opts.zo_dir, ztest_opts.zo_pool, 6467 top * leaves + zs->zs_splits); 6468 (void) snprintf(pathrand, MAXPATHLEN, ztest_dev_template, 6469 ztest_opts.zo_dir, ztest_opts.zo_pool, 6470 top * leaves + leaf); 6471 6472 vd0 = vdev_lookup_by_path(spa->spa_root_vdev, path0); 6473 if (vd0 != NULL && vd0->vdev_top->vdev_islog) 6474 islog = B_TRUE; 6475 6476 /* 6477 * If the top-level vdev needs to be resilvered 6478 * then we only allow faults on the device that is 6479 * resilvering. 6480 */ 6481 if (vd0 != NULL && maxfaults != 1 && 6482 (!vdev_resilver_needed(vd0->vdev_top, NULL, NULL) || 6483 vd0->vdev_resilver_txg != 0)) { 6484 /* 6485 * Make vd0 explicitly claim to be unreadable, 6486 * or unwritable, or reach behind its back 6487 * and close the underlying fd. We can do this if 6488 * maxfaults == 0 because we'll fail and reexecute, 6489 * and we can do it if maxfaults >= 2 because we'll 6490 * have enough redundancy. If maxfaults == 1, the 6491 * combination of this with injection of random data 6492 * corruption below exceeds the pool's fault tolerance. 6493 */ 6494 vdev_file_t *vf = vd0->vdev_tsd; 6495 6496 zfs_dbgmsg("injecting fault to vdev %llu; maxfaults=%d", 6497 (long long)vd0->vdev_id, (int)maxfaults); 6498 6499 if (vf != NULL && ztest_random(3) == 0) { 6500 (void) close(vf->vf_file->f_fd); 6501 vf->vf_file->f_fd = -1; 6502 } else if (ztest_random(2) == 0) { 6503 vd0->vdev_cant_read = B_TRUE; 6504 } else { 6505 vd0->vdev_cant_write = B_TRUE; 6506 } 6507 guid0 = vd0->vdev_guid; 6508 } 6509 } else { 6510 /* 6511 * Inject errors on an l2cache device. 6512 */ 6513 spa_aux_vdev_t *sav = &spa->spa_l2cache; 6514 6515 if (sav->sav_count == 0) { 6516 spa_config_exit(spa, SCL_STATE, FTAG); 6517 (void) pthread_rwlock_unlock(&ztest_name_lock); 6518 goto out; 6519 } 6520 vd0 = sav->sav_vdevs[ztest_random(sav->sav_count)]; 6521 guid0 = vd0->vdev_guid; 6522 (void) strlcpy(path0, vd0->vdev_path, MAXPATHLEN); 6523 (void) strlcpy(pathrand, vd0->vdev_path, MAXPATHLEN); 6524 6525 leaf = 0; 6526 leaves = 1; 6527 maxfaults = INT_MAX; /* no limit on cache devices */ 6528 } 6529 6530 spa_config_exit(spa, SCL_STATE, FTAG); 6531 (void) pthread_rwlock_unlock(&ztest_name_lock); 6532 6533 /* 6534 * If we can tolerate two or more faults, or we're dealing 6535 * with a slog, randomly online/offline vd0. 6536 */ 6537 if ((maxfaults >= 2 || islog) && guid0 != 0) { 6538 if (ztest_random(10) < 6) { 6539 int flags = (ztest_random(2) == 0 ? 6540 ZFS_OFFLINE_TEMPORARY : 0); 6541 6542 /* 6543 * We have to grab the zs_name_lock as writer to 6544 * prevent a race between offlining a slog and 6545 * destroying a dataset. Offlining the slog will 6546 * grab a reference on the dataset which may cause 6547 * dsl_destroy_head() to fail with EBUSY thus 6548 * leaving the dataset in an inconsistent state. 6549 */ 6550 if (islog) 6551 (void) pthread_rwlock_wrlock(&ztest_name_lock); 6552 6553 VERIFY3U(vdev_offline(spa, guid0, flags), !=, EBUSY); 6554 6555 if (islog) 6556 (void) pthread_rwlock_unlock(&ztest_name_lock); 6557 } else { 6558 /* 6559 * Ideally we would like to be able to randomly 6560 * call vdev_[on|off]line without holding locks 6561 * to force unpredictable failures but the side 6562 * effects of vdev_[on|off]line prevent us from 6563 * doing so. 
6564 */ 6565 (void) vdev_online(spa, guid0, 0, NULL); 6566 } 6567 } 6568 6569 if (maxfaults == 0) 6570 goto out; 6571 6572 /* 6573 * We have at least single-fault tolerance, so inject data corruption. 6574 */ 6575 fd = open(pathrand, O_RDWR); 6576 6577 if (fd == -1) /* we hit a gap in the device namespace */ 6578 goto out; 6579 6580 fsize = lseek(fd, 0, SEEK_END); 6581 6582 while (--iters != 0) { 6583 /* 6584 * The offset must be chosen carefully to ensure that 6585 * we do not inject a given logical block with errors 6586 * on two different leaf devices, because ZFS can not 6587 * tolerate that (if maxfaults==1). 6588 * 6589 * To achieve this we divide each leaf device into 6590 * chunks of size (# leaves * SPA_MAXBLOCKSIZE * 4). 6591 * Each chunk is further divided into error-injection 6592 * ranges (can accept errors) and clear ranges (we do 6593 * not inject errors in those). Each error-injection 6594 * range can accept errors only for a single leaf vdev. 6595 * Error-injection ranges are separated by clear ranges. 6596 * 6597 * For example, with 3 leaves, each chunk looks like: 6598 * 0 to 32M: injection range for leaf 0 6599 * 32M to 64M: clear range - no injection allowed 6600 * 64M to 96M: injection range for leaf 1 6601 * 96M to 128M: clear range - no injection allowed 6602 * 128M to 160M: injection range for leaf 2 6603 * 160M to 192M: clear range - no injection allowed 6604 * 6605 * Each clear range must be large enough such that a 6606 * single block cannot straddle it. This way a block 6607 * can't be a target in two different injection ranges 6608 * (on different leaf vdevs). 6609 */ 6610 offset = ztest_random(fsize / (leaves << bshift)) * 6611 (leaves << bshift) + (leaf << bshift) + 6612 (ztest_random(1ULL << (bshift - 1)) & -8ULL); 6613 6614 /* 6615 * Only allow damage to the labels at one end of the vdev. 6616 * 6617 * If all labels are damaged, the device will be totally 6618 * inaccessible, which will result in loss of data, 6619 * because we also damage (parts of) the other side of 6620 * the mirror/raidz. 6621 * 6622 * Additionally, we will always have both an even and an 6623 * odd label, so that we can handle crashes in the 6624 * middle of vdev_config_sync(). 6625 */ 6626 if ((leaf & 1) == 0 && offset < VDEV_LABEL_START_SIZE) 6627 continue; 6628 6629 /* 6630 * The two end labels are stored at the "end" of the disk, but 6631 * the end of the disk (vdev_psize) is aligned to 6632 * sizeof (vdev_label_t). 
6633 */ 6634 uint64_t psize = P2ALIGN_TYPED(fsize, sizeof (vdev_label_t), 6635 uint64_t); 6636 if ((leaf & 1) == 1 && 6637 offset + sizeof (bad) > psize - VDEV_LABEL_END_SIZE) 6638 continue; 6639 6640 if (mirror_save != zs->zs_mirrors) { 6641 (void) close(fd); 6642 goto out; 6643 } 6644 6645 if (pwrite(fd, &bad, sizeof (bad), offset) != sizeof (bad)) 6646 fatal(B_TRUE, 6647 "can't inject bad word at 0x%"PRIx64" in %s", 6648 offset, pathrand); 6649 6650 if (ztest_opts.zo_verbose >= 7) 6651 (void) printf("injected bad word into %s," 6652 " offset 0x%"PRIx64"\n", pathrand, offset); 6653 6654 injected = B_TRUE; 6655 } 6656 6657 (void) close(fd); 6658 out: 6659 mutex_exit(&ztest_vdev_lock); 6660 6661 if (injected && ztest_opts.zo_raid_do_expand) { 6662 int error = spa_scan(spa, POOL_SCAN_SCRUB); 6663 if (error == 0) { 6664 while (dsl_scan_scrubbing(spa_get_dsl(spa))) 6665 txg_wait_synced(spa_get_dsl(spa), 0); 6666 } 6667 } 6668 6669 umem_free(path0, MAXPATHLEN); 6670 umem_free(pathrand, MAXPATHLEN); 6671 } 6672 6673 /* 6674 * By design ztest will never inject uncorrectable damage in to the pool. 6675 * Issue a scrub, wait for it to complete, and verify there is never any 6676 * persistent damage. 6677 * 6678 * Only after a full scrub has been completed is it safe to start injecting 6679 * data corruption. See the comment in zfs_fault_inject(). 6680 */ 6681 static int 6682 ztest_scrub_impl(spa_t *spa) 6683 { 6684 int error = spa_scan(spa, POOL_SCAN_SCRUB); 6685 if (error) 6686 return (error); 6687 6688 while (dsl_scan_scrubbing(spa_get_dsl(spa))) 6689 txg_wait_synced(spa_get_dsl(spa), 0); 6690 6691 if (spa_approx_errlog_size(spa) > 0) 6692 return (ECKSUM); 6693 6694 ztest_pool_scrubbed = B_TRUE; 6695 6696 return (0); 6697 } 6698 6699 /* 6700 * Scrub the pool. 6701 */ 6702 void 6703 ztest_scrub(ztest_ds_t *zd, uint64_t id) 6704 { 6705 (void) zd, (void) id; 6706 spa_t *spa = ztest_spa; 6707 int error; 6708 6709 /* 6710 * Scrub in progress by device removal. 6711 */ 6712 if (ztest_device_removal_active) 6713 return; 6714 6715 /* 6716 * Start a scrub, wait a moment, then force a restart. 6717 */ 6718 (void) spa_scan(spa, POOL_SCAN_SCRUB); 6719 (void) poll(NULL, 0, 100); 6720 6721 error = ztest_scrub_impl(spa); 6722 if (error == EBUSY) 6723 error = 0; 6724 ASSERT0(error); 6725 } 6726 6727 /* 6728 * Change the guid for the pool. 
6729 */ 6730 void 6731 ztest_reguid(ztest_ds_t *zd, uint64_t id) 6732 { 6733 (void) zd, (void) id; 6734 spa_t *spa = ztest_spa; 6735 uint64_t orig, load; 6736 int error; 6737 ztest_shared_t *zs = ztest_shared; 6738 6739 if (ztest_opts.zo_mmp_test) 6740 return; 6741 6742 orig = spa_guid(spa); 6743 load = spa_load_guid(spa); 6744 6745 (void) pthread_rwlock_wrlock(&ztest_name_lock); 6746 error = spa_change_guid(spa); 6747 zs->zs_guid = spa_guid(spa); 6748 (void) pthread_rwlock_unlock(&ztest_name_lock); 6749 6750 if (error != 0) 6751 return; 6752 6753 if (ztest_opts.zo_verbose >= 4) { 6754 (void) printf("Changed guid old %"PRIu64" -> %"PRIu64"\n", 6755 orig, spa_guid(spa)); 6756 } 6757 6758 VERIFY3U(orig, !=, spa_guid(spa)); 6759 VERIFY3U(load, ==, spa_load_guid(spa)); 6760 } 6761 6762 void 6763 ztest_blake3(ztest_ds_t *zd, uint64_t id) 6764 { 6765 (void) zd, (void) id; 6766 hrtime_t end = gethrtime() + NANOSEC; 6767 zio_cksum_salt_t salt; 6768 void *salt_ptr = &salt.zcs_bytes; 6769 struct abd *abd_data, *abd_meta; 6770 void *buf, *templ; 6771 int i, *ptr; 6772 uint32_t size; 6773 BLAKE3_CTX ctx; 6774 const zfs_impl_t *blake3 = zfs_impl_get_ops("blake3"); 6775 6776 size = ztest_random_blocksize(); 6777 buf = umem_alloc(size, UMEM_NOFAIL); 6778 abd_data = abd_alloc(size, B_FALSE); 6779 abd_meta = abd_alloc(size, B_TRUE); 6780 6781 for (i = 0, ptr = buf; i < size / sizeof (*ptr); i++, ptr++) 6782 *ptr = ztest_random(UINT_MAX); 6783 memset(salt_ptr, 'A', 32); 6784 6785 abd_copy_from_buf_off(abd_data, buf, 0, size); 6786 abd_copy_from_buf_off(abd_meta, buf, 0, size); 6787 6788 while (gethrtime() <= end) { 6789 int run_count = 100; 6790 zio_cksum_t zc_ref1, zc_ref2; 6791 zio_cksum_t zc_res1, zc_res2; 6792 6793 void *ref1 = &zc_ref1; 6794 void *ref2 = &zc_ref2; 6795 void *res1 = &zc_res1; 6796 void *res2 = &zc_res2; 6797 6798 /* BLAKE3_KEY_LEN = 32 */ 6799 VERIFY0(blake3->setname("generic")); 6800 templ = abd_checksum_blake3_tmpl_init(&salt); 6801 Blake3_InitKeyed(&ctx, salt_ptr); 6802 Blake3_Update(&ctx, buf, size); 6803 Blake3_Final(&ctx, ref1); 6804 zc_ref2 = zc_ref1; 6805 ZIO_CHECKSUM_BSWAP(&zc_ref2); 6806 abd_checksum_blake3_tmpl_free(templ); 6807 6808 VERIFY0(blake3->setname("cycle")); 6809 while (run_count-- > 0) { 6810 6811 /* Test current implementation */ 6812 Blake3_InitKeyed(&ctx, salt_ptr); 6813 Blake3_Update(&ctx, buf, size); 6814 Blake3_Final(&ctx, res1); 6815 zc_res2 = zc_res1; 6816 ZIO_CHECKSUM_BSWAP(&zc_res2); 6817 6818 VERIFY0(memcmp(ref1, res1, 32)); 6819 VERIFY0(memcmp(ref2, res2, 32)); 6820 6821 /* Test ABD - data */ 6822 templ = abd_checksum_blake3_tmpl_init(&salt); 6823 abd_checksum_blake3_native(abd_data, size, 6824 templ, &zc_res1); 6825 abd_checksum_blake3_byteswap(abd_data, size, 6826 templ, &zc_res2); 6827 6828 VERIFY0(memcmp(ref1, res1, 32)); 6829 VERIFY0(memcmp(ref2, res2, 32)); 6830 6831 /* Test ABD - metadata */ 6832 abd_checksum_blake3_native(abd_meta, size, 6833 templ, &zc_res1); 6834 abd_checksum_blake3_byteswap(abd_meta, size, 6835 templ, &zc_res2); 6836 abd_checksum_blake3_tmpl_free(templ); 6837 6838 VERIFY0(memcmp(ref1, res1, 32)); 6839 VERIFY0(memcmp(ref2, res2, 32)); 6840 6841 } 6842 } 6843 6844 abd_free(abd_data); 6845 abd_free(abd_meta); 6846 umem_free(buf, size); 6847 } 6848 6849 void 6850 ztest_fletcher(ztest_ds_t *zd, uint64_t id) 6851 { 6852 (void) zd, (void) id; 6853 hrtime_t end = gethrtime() + NANOSEC; 6854 6855 while (gethrtime() <= end) { 6856 int run_count = 100; 6857 void *buf; 6858 struct abd *abd_data, *abd_meta; 6859 uint32_t size; 6860 int 
*ptr; 6861 int i; 6862 zio_cksum_t zc_ref; 6863 zio_cksum_t zc_ref_byteswap; 6864 6865 size = ztest_random_blocksize(); 6866 6867 buf = umem_alloc(size, UMEM_NOFAIL); 6868 abd_data = abd_alloc(size, B_FALSE); 6869 abd_meta = abd_alloc(size, B_TRUE); 6870 6871 for (i = 0, ptr = buf; i < size / sizeof (*ptr); i++, ptr++) 6872 *ptr = ztest_random(UINT_MAX); 6873 6874 abd_copy_from_buf_off(abd_data, buf, 0, size); 6875 abd_copy_from_buf_off(abd_meta, buf, 0, size); 6876 6877 VERIFY0(fletcher_4_impl_set("scalar")); 6878 fletcher_4_native(buf, size, NULL, &zc_ref); 6879 fletcher_4_byteswap(buf, size, NULL, &zc_ref_byteswap); 6880 6881 VERIFY0(fletcher_4_impl_set("cycle")); 6882 while (run_count-- > 0) { 6883 zio_cksum_t zc; 6884 zio_cksum_t zc_byteswap; 6885 6886 fletcher_4_byteswap(buf, size, NULL, &zc_byteswap); 6887 fletcher_4_native(buf, size, NULL, &zc); 6888 6889 VERIFY0(memcmp(&zc, &zc_ref, sizeof (zc))); 6890 VERIFY0(memcmp(&zc_byteswap, &zc_ref_byteswap, 6891 sizeof (zc_byteswap))); 6892 6893 /* Test ABD - data */ 6894 abd_fletcher_4_byteswap(abd_data, size, NULL, 6895 &zc_byteswap); 6896 abd_fletcher_4_native(abd_data, size, NULL, &zc); 6897 6898 VERIFY0(memcmp(&zc, &zc_ref, sizeof (zc))); 6899 VERIFY0(memcmp(&zc_byteswap, &zc_ref_byteswap, 6900 sizeof (zc_byteswap))); 6901 6902 /* Test ABD - metadata */ 6903 abd_fletcher_4_byteswap(abd_meta, size, NULL, 6904 &zc_byteswap); 6905 abd_fletcher_4_native(abd_meta, size, NULL, &zc); 6906 6907 VERIFY0(memcmp(&zc, &zc_ref, sizeof (zc))); 6908 VERIFY0(memcmp(&zc_byteswap, &zc_ref_byteswap, 6909 sizeof (zc_byteswap))); 6910 6911 } 6912 6913 umem_free(buf, size); 6914 abd_free(abd_data); 6915 abd_free(abd_meta); 6916 } 6917 } 6918 6919 void 6920 ztest_fletcher_incr(ztest_ds_t *zd, uint64_t id) 6921 { 6922 (void) zd, (void) id; 6923 void *buf; 6924 size_t size; 6925 int *ptr; 6926 int i; 6927 zio_cksum_t zc_ref; 6928 zio_cksum_t zc_ref_bswap; 6929 6930 hrtime_t end = gethrtime() + NANOSEC; 6931 6932 while (gethrtime() <= end) { 6933 int run_count = 100; 6934 6935 size = ztest_random_blocksize(); 6936 buf = umem_alloc(size, UMEM_NOFAIL); 6937 6938 for (i = 0, ptr = buf; i < size / sizeof (*ptr); i++, ptr++) 6939 *ptr = ztest_random(UINT_MAX); 6940 6941 VERIFY0(fletcher_4_impl_set("scalar")); 6942 fletcher_4_native(buf, size, NULL, &zc_ref); 6943 fletcher_4_byteswap(buf, size, NULL, &zc_ref_bswap); 6944 6945 VERIFY0(fletcher_4_impl_set("cycle")); 6946 6947 while (run_count-- > 0) { 6948 zio_cksum_t zc; 6949 zio_cksum_t zc_bswap; 6950 size_t pos = 0; 6951 6952 ZIO_SET_CHECKSUM(&zc, 0, 0, 0, 0); 6953 ZIO_SET_CHECKSUM(&zc_bswap, 0, 0, 0, 0); 6954 6955 while (pos < size) { 6956 size_t inc = 64 * ztest_random(size / 67); 6957 /* sometimes add few bytes to test non-simd */ 6958 if (ztest_random(100) < 10) 6959 inc += P2ALIGN_TYPED(ztest_random(64), 6960 sizeof (uint32_t), uint64_t); 6961 6962 if (inc > (size - pos)) 6963 inc = size - pos; 6964 6965 fletcher_4_incremental_native(buf + pos, inc, 6966 &zc); 6967 fletcher_4_incremental_byteswap(buf + pos, inc, 6968 &zc_bswap); 6969 6970 pos += inc; 6971 } 6972 6973 VERIFY3U(pos, ==, size); 6974 6975 VERIFY(ZIO_CHECKSUM_EQUAL(zc, zc_ref)); 6976 VERIFY(ZIO_CHECKSUM_EQUAL(zc_bswap, zc_ref_bswap)); 6977 6978 /* 6979 * verify if incremental on the whole buffer is 6980 * equivalent to non-incremental version 6981 */ 6982 ZIO_SET_CHECKSUM(&zc, 0, 0, 0, 0); 6983 ZIO_SET_CHECKSUM(&zc_bswap, 0, 0, 0, 0); 6984 6985 fletcher_4_incremental_native(buf, size, &zc); 6986 fletcher_4_incremental_byteswap(buf, size, 
&zc_bswap); 6987 6988 VERIFY(ZIO_CHECKSUM_EQUAL(zc, zc_ref)); 6989 VERIFY(ZIO_CHECKSUM_EQUAL(zc_bswap, zc_ref_bswap)); 6990 } 6991 6992 umem_free(buf, size); 6993 } 6994 } 6995 6996 static int 6997 ztest_set_global_vars(void) 6998 { 6999 for (size_t i = 0; i < ztest_opts.zo_gvars_count; i++) { 7000 char *kv = ztest_opts.zo_gvars[i]; 7001 VERIFY3U(strlen(kv), <=, ZO_GVARS_MAX_ARGLEN); 7002 VERIFY3U(strlen(kv), >, 0); 7003 int err = set_global_var(kv); 7004 if (ztest_opts.zo_verbose > 0) { 7005 (void) printf("setting global var %s ... %s\n", kv, 7006 err ? "failed" : "ok"); 7007 } 7008 if (err != 0) { 7009 (void) fprintf(stderr, 7010 "failed to set global var '%s'\n", kv); 7011 return (err); 7012 } 7013 } 7014 return (0); 7015 } 7016 7017 static char ** 7018 ztest_global_vars_to_zdb_args(void) 7019 { 7020 char **args = calloc(2*ztest_opts.zo_gvars_count + 1, sizeof (char *)); 7021 char **cur = args; 7022 if (args == NULL) 7023 return (NULL); 7024 for (size_t i = 0; i < ztest_opts.zo_gvars_count; i++) { 7025 *cur++ = (char *)"-o"; 7026 *cur++ = ztest_opts.zo_gvars[i]; 7027 } 7028 ASSERT3P(cur, ==, &args[2*ztest_opts.zo_gvars_count]); 7029 *cur = NULL; 7030 return (args); 7031 } 7032 7033 /* The end of strings is indicated by a NULL element */ 7034 static char * 7035 join_strings(char **strings, const char *sep) 7036 { 7037 size_t totallen = 0; 7038 for (char **sp = strings; *sp != NULL; sp++) { 7039 totallen += strlen(*sp); 7040 totallen += strlen(sep); 7041 } 7042 if (totallen > 0) { 7043 ASSERT(totallen >= strlen(sep)); 7044 totallen -= strlen(sep); 7045 } 7046 7047 size_t buflen = totallen + 1; 7048 char *o = umem_alloc(buflen, UMEM_NOFAIL); /* trailing 0 byte */ 7049 o[0] = '\0'; 7050 for (char **sp = strings; *sp != NULL; sp++) { 7051 size_t would; 7052 would = strlcat(o, *sp, buflen); 7053 VERIFY3U(would, <, buflen); 7054 if (*(sp+1) == NULL) { 7055 break; 7056 } 7057 would = strlcat(o, sep, buflen); 7058 VERIFY3U(would, <, buflen); 7059 } 7060 ASSERT3S(strlen(o), ==, totallen); 7061 return (o); 7062 } 7063 7064 static int 7065 ztest_check_path(char *path) 7066 { 7067 struct stat s; 7068 /* return true on success */ 7069 return (!stat(path, &s)); 7070 } 7071 7072 static void 7073 ztest_get_zdb_bin(char *bin, int len) 7074 { 7075 char *zdb_path; 7076 /* 7077 * Try to use $ZDB and in-tree zdb path. If not successful, just 7078 * let popen to search through PATH. 
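 * For example (illustrative path only), a specific binary can be forced
 * from the environment:
 *
 *	ZDB=/usr/local/sbin/zdb ztest -VVV
 *
 * An unusable $ZDB path is treated as fatal below.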
7079 */ 7080 if ((zdb_path = getenv("ZDB"))) { 7081 strlcpy(bin, zdb_path, len); /* In env */ 7082 if (!ztest_check_path(bin)) { 7083 ztest_dump_core = 0; 7084 fatal(B_TRUE, "invalid ZDB '%s'", bin); 7085 } 7086 return; 7087 } 7088 7089 VERIFY3P(realpath(getexecname(), bin), !=, NULL); 7090 if (strstr(bin, ".libs/ztest")) { 7091 strstr(bin, ".libs/ztest")[0] = '\0'; /* In-tree */ 7092 strcat(bin, "zdb"); 7093 if (ztest_check_path(bin)) 7094 return; 7095 } 7096 strcpy(bin, "zdb"); 7097 } 7098 7099 static vdev_t * 7100 ztest_random_concrete_vdev_leaf(vdev_t *vd) 7101 { 7102 if (vd == NULL) 7103 return (NULL); 7104 7105 if (vd->vdev_children == 0) 7106 return (vd); 7107 7108 vdev_t *eligible[vd->vdev_children]; 7109 int eligible_idx = 0, i; 7110 for (i = 0; i < vd->vdev_children; i++) { 7111 vdev_t *cvd = vd->vdev_child[i]; 7112 if (cvd->vdev_top->vdev_removing) 7113 continue; 7114 if (cvd->vdev_children > 0 || 7115 (vdev_is_concrete(cvd) && !cvd->vdev_detached)) { 7116 eligible[eligible_idx++] = cvd; 7117 } 7118 } 7119 VERIFY3S(eligible_idx, >, 0); 7120 7121 uint64_t child_no = ztest_random(eligible_idx); 7122 return (ztest_random_concrete_vdev_leaf(eligible[child_no])); 7123 } 7124 7125 void 7126 ztest_initialize(ztest_ds_t *zd, uint64_t id) 7127 { 7128 (void) zd, (void) id; 7129 spa_t *spa = ztest_spa; 7130 int error = 0; 7131 7132 mutex_enter(&ztest_vdev_lock); 7133 7134 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 7135 7136 /* Random leaf vdev */ 7137 vdev_t *rand_vd = ztest_random_concrete_vdev_leaf(spa->spa_root_vdev); 7138 if (rand_vd == NULL) { 7139 spa_config_exit(spa, SCL_VDEV, FTAG); 7140 mutex_exit(&ztest_vdev_lock); 7141 return; 7142 } 7143 7144 /* 7145 * The random vdev we've selected may change as soon as we 7146 * drop the spa_config_lock. We create local copies of things 7147 * we're interested in. 
7148 */ 7149 uint64_t guid = rand_vd->vdev_guid; 7150 char *path = strdup(rand_vd->vdev_path); 7151 boolean_t active = rand_vd->vdev_initialize_thread != NULL; 7152 7153 zfs_dbgmsg("vd %px, guid %llu", rand_vd, (u_longlong_t)guid); 7154 spa_config_exit(spa, SCL_VDEV, FTAG); 7155 7156 uint64_t cmd = ztest_random(POOL_INITIALIZE_FUNCS); 7157 7158 nvlist_t *vdev_guids = fnvlist_alloc(); 7159 nvlist_t *vdev_errlist = fnvlist_alloc(); 7160 fnvlist_add_uint64(vdev_guids, path, guid); 7161 error = spa_vdev_initialize(spa, vdev_guids, cmd, vdev_errlist); 7162 fnvlist_free(vdev_guids); 7163 fnvlist_free(vdev_errlist); 7164 7165 switch (cmd) { 7166 case POOL_INITIALIZE_CANCEL: 7167 if (ztest_opts.zo_verbose >= 4) { 7168 (void) printf("Cancel initialize %s", path); 7169 if (!active) 7170 (void) printf(" failed (no initialize active)"); 7171 (void) printf("\n"); 7172 } 7173 break; 7174 case POOL_INITIALIZE_START: 7175 if (ztest_opts.zo_verbose >= 4) { 7176 (void) printf("Start initialize %s", path); 7177 if (active && error == 0) 7178 (void) printf(" failed (already active)"); 7179 else if (error != 0) 7180 (void) printf(" failed (error %d)", error); 7181 (void) printf("\n"); 7182 } 7183 break; 7184 case POOL_INITIALIZE_SUSPEND: 7185 if (ztest_opts.zo_verbose >= 4) { 7186 (void) printf("Suspend initialize %s", path); 7187 if (!active) 7188 (void) printf(" failed (no initialize active)"); 7189 (void) printf("\n"); 7190 } 7191 break; 7192 } 7193 free(path); 7194 mutex_exit(&ztest_vdev_lock); 7195 } 7196 7197 void 7198 ztest_trim(ztest_ds_t *zd, uint64_t id) 7199 { 7200 (void) zd, (void) id; 7201 spa_t *spa = ztest_spa; 7202 int error = 0; 7203 7204 mutex_enter(&ztest_vdev_lock); 7205 7206 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 7207 7208 /* Random leaf vdev */ 7209 vdev_t *rand_vd = ztest_random_concrete_vdev_leaf(spa->spa_root_vdev); 7210 if (rand_vd == NULL) { 7211 spa_config_exit(spa, SCL_VDEV, FTAG); 7212 mutex_exit(&ztest_vdev_lock); 7213 return; 7214 } 7215 7216 /* 7217 * The random vdev we've selected may change as soon as we 7218 * drop the spa_config_lock. We create local copies of things 7219 * we're interested in. 
7220 */ 7221 uint64_t guid = rand_vd->vdev_guid; 7222 char *path = strdup(rand_vd->vdev_path); 7223 boolean_t active = rand_vd->vdev_trim_thread != NULL; 7224 7225 zfs_dbgmsg("vd %p, guid %llu", rand_vd, (u_longlong_t)guid); 7226 spa_config_exit(spa, SCL_VDEV, FTAG); 7227 7228 uint64_t cmd = ztest_random(POOL_TRIM_FUNCS); 7229 uint64_t rate = 1 << ztest_random(30); 7230 boolean_t partial = (ztest_random(5) > 0); 7231 boolean_t secure = (ztest_random(5) > 0); 7232 7233 nvlist_t *vdev_guids = fnvlist_alloc(); 7234 nvlist_t *vdev_errlist = fnvlist_alloc(); 7235 fnvlist_add_uint64(vdev_guids, path, guid); 7236 error = spa_vdev_trim(spa, vdev_guids, cmd, rate, partial, 7237 secure, vdev_errlist); 7238 fnvlist_free(vdev_guids); 7239 fnvlist_free(vdev_errlist); 7240 7241 switch (cmd) { 7242 case POOL_TRIM_CANCEL: 7243 if (ztest_opts.zo_verbose >= 4) { 7244 (void) printf("Cancel TRIM %s", path); 7245 if (!active) 7246 (void) printf(" failed (no TRIM active)"); 7247 (void) printf("\n"); 7248 } 7249 break; 7250 case POOL_TRIM_START: 7251 if (ztest_opts.zo_verbose >= 4) { 7252 (void) printf("Start TRIM %s", path); 7253 if (active && error == 0) 7254 (void) printf(" failed (already active)"); 7255 else if (error != 0) 7256 (void) printf(" failed (error %d)", error); 7257 (void) printf("\n"); 7258 } 7259 break; 7260 case POOL_TRIM_SUSPEND: 7261 if (ztest_opts.zo_verbose >= 4) { 7262 (void) printf("Suspend TRIM %s", path); 7263 if (!active) 7264 (void) printf(" failed (no TRIM active)"); 7265 (void) printf("\n"); 7266 } 7267 break; 7268 } 7269 free(path); 7270 mutex_exit(&ztest_vdev_lock); 7271 } 7272 7273 /* 7274 * Verify pool integrity by running zdb. 7275 */ 7276 static void 7277 ztest_run_zdb(uint64_t guid) 7278 { 7279 int status; 7280 char *bin; 7281 char *zdb; 7282 char *zbuf; 7283 const int len = MAXPATHLEN + MAXNAMELEN + 20; 7284 FILE *fp; 7285 7286 bin = umem_alloc(len, UMEM_NOFAIL); 7287 zdb = umem_alloc(len, UMEM_NOFAIL); 7288 zbuf = umem_alloc(1024, UMEM_NOFAIL); 7289 7290 ztest_get_zdb_bin(bin, len); 7291 7292 char **set_gvars_args = ztest_global_vars_to_zdb_args(); 7293 if (set_gvars_args == NULL) { 7294 fatal(B_FALSE, "Failed to allocate memory in " 7295 "ztest_global_vars_to_zdb_args(). Cannot run zdb.\n"); 7296 } 7297 char *set_gvars_args_joined = join_strings(set_gvars_args, " "); 7298 free(set_gvars_args); 7299 7300 size_t would = snprintf(zdb, len, 7301 "%s -bcc%s%s -G -d -Y -e -y %s -p %s %"PRIu64, 7302 bin, 7303 ztest_opts.zo_verbose >= 3 ? "s" : "", 7304 ztest_opts.zo_verbose >= 4 ? 
"v" : "", 7305 set_gvars_args_joined, 7306 ztest_opts.zo_dir, 7307 guid); 7308 ASSERT3U(would, <, len); 7309 7310 umem_free(set_gvars_args_joined, strlen(set_gvars_args_joined) + 1); 7311 7312 if (ztest_opts.zo_verbose >= 5) 7313 (void) printf("Executing %s\n", zdb); 7314 7315 fp = popen(zdb, "r"); 7316 7317 while (fgets(zbuf, 1024, fp) != NULL) 7318 if (ztest_opts.zo_verbose >= 3) 7319 (void) printf("%s", zbuf); 7320 7321 status = pclose(fp); 7322 7323 if (status == 0) 7324 goto out; 7325 7326 ztest_dump_core = 0; 7327 if (WIFEXITED(status)) 7328 fatal(B_FALSE, "'%s' exit code %d", zdb, WEXITSTATUS(status)); 7329 else 7330 fatal(B_FALSE, "'%s' died with signal %d", 7331 zdb, WTERMSIG(status)); 7332 out: 7333 umem_free(bin, len); 7334 umem_free(zdb, len); 7335 umem_free(zbuf, 1024); 7336 } 7337 7338 static void 7339 ztest_walk_pool_directory(const char *header) 7340 { 7341 spa_t *spa = NULL; 7342 7343 if (ztest_opts.zo_verbose >= 6) 7344 (void) puts(header); 7345 7346 mutex_enter(&spa_namespace_lock); 7347 while ((spa = spa_next(spa)) != NULL) 7348 if (ztest_opts.zo_verbose >= 6) 7349 (void) printf("\t%s\n", spa_name(spa)); 7350 mutex_exit(&spa_namespace_lock); 7351 } 7352 7353 static void 7354 ztest_spa_import_export(char *oldname, char *newname) 7355 { 7356 nvlist_t *config, *newconfig; 7357 uint64_t pool_guid; 7358 spa_t *spa; 7359 int error; 7360 7361 if (ztest_opts.zo_verbose >= 4) { 7362 (void) printf("import/export: old = %s, new = %s\n", 7363 oldname, newname); 7364 } 7365 7366 /* 7367 * Clean up from previous runs. 7368 */ 7369 (void) spa_destroy(newname); 7370 7371 /* 7372 * Get the pool's configuration and guid. 7373 */ 7374 VERIFY0(spa_open(oldname, &spa, FTAG)); 7375 7376 /* 7377 * Kick off a scrub to tickle scrub/export races. 7378 */ 7379 if (ztest_random(2) == 0) 7380 (void) spa_scan(spa, POOL_SCAN_SCRUB); 7381 7382 pool_guid = spa_guid(spa); 7383 spa_close(spa, FTAG); 7384 7385 ztest_walk_pool_directory("pools before export"); 7386 7387 /* 7388 * Export it. 7389 */ 7390 VERIFY0(spa_export(oldname, &config, B_FALSE, B_FALSE)); 7391 7392 ztest_walk_pool_directory("pools after export"); 7393 7394 /* 7395 * Try to import it. 7396 */ 7397 newconfig = spa_tryimport(config); 7398 ASSERT3P(newconfig, !=, NULL); 7399 fnvlist_free(newconfig); 7400 7401 /* 7402 * Import it under the new name. 7403 */ 7404 error = spa_import(newname, config, NULL, 0); 7405 if (error != 0) { 7406 dump_nvlist(config, 0); 7407 fatal(B_FALSE, "couldn't import pool %s as %s: error %u", 7408 oldname, newname, error); 7409 } 7410 7411 ztest_walk_pool_directory("pools after import"); 7412 7413 /* 7414 * Try to import it again -- should fail with EEXIST. 7415 */ 7416 VERIFY3U(EEXIST, ==, spa_import(newname, config, NULL, 0)); 7417 7418 /* 7419 * Try to import it under a different name -- should fail with EEXIST. 7420 */ 7421 VERIFY3U(EEXIST, ==, spa_import(oldname, config, NULL, 0)); 7422 7423 /* 7424 * Verify that the pool is no longer visible under the old name. 7425 */ 7426 VERIFY3U(ENOENT, ==, spa_open(oldname, &spa, FTAG)); 7427 7428 /* 7429 * Verify that we can open and close the pool using the new name. 
7430 */ 7431 VERIFY0(spa_open(newname, &spa, FTAG)); 7432 ASSERT3U(pool_guid, ==, spa_guid(spa)); 7433 spa_close(spa, FTAG); 7434 7435 fnvlist_free(config); 7436 } 7437 7438 static void 7439 ztest_resume(spa_t *spa) 7440 { 7441 if (spa_suspended(spa) && ztest_opts.zo_verbose >= 6) 7442 (void) printf("resuming from suspended state\n"); 7443 spa_vdev_state_enter(spa, SCL_NONE); 7444 vdev_clear(spa, NULL); 7445 (void) spa_vdev_state_exit(spa, NULL, 0); 7446 (void) zio_resume(spa); 7447 } 7448 7449 static __attribute__((noreturn)) void 7450 ztest_resume_thread(void *arg) 7451 { 7452 spa_t *spa = arg; 7453 7454 while (!ztest_exiting) { 7455 if (spa_suspended(spa)) 7456 ztest_resume(spa); 7457 (void) poll(NULL, 0, 100); 7458 7459 /* 7460 * Periodically change the zfs_compressed_arc_enabled setting. 7461 */ 7462 if (ztest_random(10) == 0) 7463 zfs_compressed_arc_enabled = ztest_random(2); 7464 7465 /* 7466 * Periodically change the zfs_abd_scatter_enabled setting. 7467 */ 7468 if (ztest_random(10) == 0) 7469 zfs_abd_scatter_enabled = ztest_random(2); 7470 } 7471 7472 thread_exit(); 7473 } 7474 7475 static __attribute__((noreturn)) void 7476 ztest_deadman_thread(void *arg) 7477 { 7478 ztest_shared_t *zs = arg; 7479 spa_t *spa = ztest_spa; 7480 hrtime_t delay, overdue, last_run = gethrtime(); 7481 7482 delay = (zs->zs_thread_stop - zs->zs_thread_start) + 7483 MSEC2NSEC(zfs_deadman_synctime_ms); 7484 7485 while (!ztest_exiting) { 7486 /* 7487 * Wait for the delay timer while checking occasionally 7488 * if we should stop. 7489 */ 7490 if (gethrtime() < last_run + delay) { 7491 (void) poll(NULL, 0, 1000); 7492 continue; 7493 } 7494 7495 /* 7496 * If the pool is suspended then fail immediately. Otherwise, 7497 * check to see if the pool is making any progress. If 7498 * vdev_deadman() discovers that there hasn't been any recent 7499 * I/Os then it will end up aborting the tests. 7500 */ 7501 if (spa_suspended(spa) || spa->spa_root_vdev == NULL) { 7502 fatal(B_FALSE, 7503 "aborting test after %llu seconds because " 7504 "pool has transitioned to a suspended state.", 7505 (u_longlong_t)zfs_deadman_synctime_ms / 1000); 7506 } 7507 vdev_deadman(spa->spa_root_vdev, FTAG); 7508 7509 /* 7510 * If the process doesn't complete within a grace period of 7511 * zfs_deadman_synctime_ms over the expected finish time, 7512 * then it may be hung and is terminated. 
7513 */ 7514 overdue = zs->zs_proc_stop + MSEC2NSEC(zfs_deadman_synctime_ms); 7515 if (gethrtime() > overdue) { 7516 fatal(B_FALSE, 7517 "aborting test after %llu seconds because " 7518 "the process is overdue for termination.", 7519 (gethrtime() - zs->zs_proc_start) / NANOSEC); 7520 } 7521 7522 (void) printf("ztest has been running for %lld seconds\n", 7523 (gethrtime() - zs->zs_proc_start) / NANOSEC); 7524 7525 last_run = gethrtime(); 7526 delay = MSEC2NSEC(zfs_deadman_checktime_ms); 7527 } 7528 7529 thread_exit(); 7530 } 7531 7532 static void 7533 ztest_execute(int test, ztest_info_t *zi, uint64_t id) 7534 { 7535 ztest_ds_t *zd = &ztest_ds[id % ztest_opts.zo_datasets]; 7536 ztest_shared_callstate_t *zc = ZTEST_GET_SHARED_CALLSTATE(test); 7537 hrtime_t functime = gethrtime(); 7538 int i; 7539 7540 for (i = 0; i < zi->zi_iters; i++) 7541 zi->zi_func(zd, id); 7542 7543 functime = gethrtime() - functime; 7544 7545 atomic_add_64(&zc->zc_count, 1); 7546 atomic_add_64(&zc->zc_time, functime); 7547 7548 if (ztest_opts.zo_verbose >= 4) 7549 (void) printf("%6.2f sec in %s\n", 7550 (double)functime / NANOSEC, zi->zi_funcname); 7551 } 7552 7553 typedef struct ztest_raidz_expand_io { 7554 uint64_t rzx_id; 7555 uint64_t rzx_amount; 7556 uint64_t rzx_bufsize; 7557 const void *rzx_buffer; 7558 uint64_t rzx_alloc_max; 7559 spa_t *rzx_spa; 7560 } ztest_expand_io_t; 7561 7562 #undef OD_ARRAY_SIZE 7563 #define OD_ARRAY_SIZE 10 7564 7565 /* 7566 * Write a request amount of data to some dataset objects. 7567 * There will be ztest_opts.zo_threads count of these running in parallel. 7568 */ 7569 static __attribute__((noreturn)) void 7570 ztest_rzx_thread(void *arg) 7571 { 7572 ztest_expand_io_t *info = (ztest_expand_io_t *)arg; 7573 ztest_od_t *od; 7574 int batchsize; 7575 int od_size; 7576 ztest_ds_t *zd = &ztest_ds[info->rzx_id % ztest_opts.zo_datasets]; 7577 spa_t *spa = info->rzx_spa; 7578 7579 od_size = sizeof (ztest_od_t) * OD_ARRAY_SIZE; 7580 od = umem_alloc(od_size, UMEM_NOFAIL); 7581 batchsize = OD_ARRAY_SIZE; 7582 7583 /* Create objects to write to */ 7584 for (int b = 0; b < batchsize; b++) { 7585 ztest_od_init(od + b, info->rzx_id, FTAG, b, 7586 DMU_OT_UINT64_OTHER, 0, 0, 0); 7587 } 7588 if (ztest_object_init(zd, od, od_size, B_FALSE) != 0) { 7589 umem_free(od, od_size); 7590 thread_exit(); 7591 } 7592 7593 for (uint64_t offset = 0, written = 0; written < info->rzx_amount; 7594 offset += info->rzx_bufsize) { 7595 /* write to 10 objects */ 7596 for (int i = 0; i < batchsize && written < info->rzx_amount; 7597 i++) { 7598 (void) pthread_rwlock_rdlock(&zd->zd_zilog_lock); 7599 ztest_write(zd, od[i].od_object, offset, 7600 info->rzx_bufsize, info->rzx_buffer); 7601 (void) pthread_rwlock_unlock(&zd->zd_zilog_lock); 7602 written += info->rzx_bufsize; 7603 } 7604 txg_wait_synced(spa_get_dsl(spa), 0); 7605 /* due to inflation, we'll typically bail here */ 7606 if (metaslab_class_get_alloc(spa_normal_class(spa)) > 7607 info->rzx_alloc_max) { 7608 break; 7609 } 7610 } 7611 7612 /* Remove a few objects to leave some holes in allocation space */ 7613 mutex_enter(&zd->zd_dirobj_lock); 7614 (void) ztest_remove(zd, od, 2); 7615 mutex_exit(&zd->zd_dirobj_lock); 7616 7617 umem_free(od, od_size); 7618 7619 thread_exit(); 7620 } 7621 7622 static __attribute__((noreturn)) void 7623 ztest_thread(void *arg) 7624 { 7625 int rand; 7626 uint64_t id = (uintptr_t)arg; 7627 ztest_shared_t *zs = ztest_shared; 7628 uint64_t call_next; 7629 hrtime_t now; 7630 ztest_info_t *zi; 7631 ztest_shared_callstate_t *zc; 7632 7633 while 
((now = gethrtime()) < zs->zs_thread_stop) { 7634 /* 7635 * See if it's time to force a crash. 7636 */ 7637 if (now > zs->zs_thread_kill && 7638 raidz_expand_pause_point == RAIDZ_EXPAND_PAUSE_NONE) { 7639 ztest_kill(zs); 7640 } 7641 7642 /* 7643 * If we're getting ENOSPC with some regularity, stop. 7644 */ 7645 if (zs->zs_enospc_count > 10) 7646 break; 7647 7648 /* 7649 * Pick a random function to execute. 7650 */ 7651 rand = ztest_random(ZTEST_FUNCS); 7652 zi = &ztest_info[rand]; 7653 zc = ZTEST_GET_SHARED_CALLSTATE(rand); 7654 call_next = zc->zc_next; 7655 7656 if (now >= call_next && 7657 atomic_cas_64(&zc->zc_next, call_next, call_next + 7658 ztest_random(2 * zi->zi_interval[0] + 1)) == call_next) { 7659 ztest_execute(rand, zi, id); 7660 } 7661 } 7662 7663 thread_exit(); 7664 } 7665 7666 static void 7667 ztest_dataset_name(char *dsname, const char *pool, int d) 7668 { 7669 (void) snprintf(dsname, ZFS_MAX_DATASET_NAME_LEN, "%s/ds_%d", pool, d); 7670 } 7671 7672 static void 7673 ztest_dataset_destroy(int d) 7674 { 7675 char name[ZFS_MAX_DATASET_NAME_LEN]; 7676 int t; 7677 7678 ztest_dataset_name(name, ztest_opts.zo_pool, d); 7679 7680 if (ztest_opts.zo_verbose >= 3) 7681 (void) printf("Destroying %s to free up space\n", name); 7682 7683 /* 7684 * Cleanup any non-standard clones and snapshots. In general, 7685 * ztest thread t operates on dataset (t % zopt_datasets), 7686 * so there may be more than one thing to clean up. 7687 */ 7688 for (t = d; t < ztest_opts.zo_threads; 7689 t += ztest_opts.zo_datasets) 7690 ztest_dsl_dataset_cleanup(name, t); 7691 7692 (void) dmu_objset_find(name, ztest_objset_destroy_cb, NULL, 7693 DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN); 7694 } 7695 7696 static void 7697 ztest_dataset_dirobj_verify(ztest_ds_t *zd) 7698 { 7699 uint64_t usedobjs, dirobjs, scratch; 7700 7701 /* 7702 * ZTEST_DIROBJ is the object directory for the entire dataset. 7703 * Therefore, the number of objects in use should equal the 7704 * number of ZTEST_DIROBJ entries, +1 for ZTEST_DIROBJ itself. 7705 * If not, we have an object leak. 7706 * 7707 * Note that we can only check this in ztest_dataset_open(), 7708 * when the open-context and syncing-context values agree. 7709 * That's because zap_count() returns the open-context value, 7710 * while dmu_objset_space() returns the rootbp fill count. 
7711 */ 7712 VERIFY0(zap_count(zd->zd_os, ZTEST_DIROBJ, &dirobjs)); 7713 dmu_objset_space(zd->zd_os, &scratch, &scratch, &usedobjs, &scratch); 7714 ASSERT3U(dirobjs + 1, ==, usedobjs); 7715 } 7716 7717 static int 7718 ztest_dataset_open(int d) 7719 { 7720 ztest_ds_t *zd = &ztest_ds[d]; 7721 uint64_t committed_seq = ZTEST_GET_SHARED_DS(d)->zd_seq; 7722 objset_t *os; 7723 zilog_t *zilog; 7724 char name[ZFS_MAX_DATASET_NAME_LEN]; 7725 int error; 7726 7727 ztest_dataset_name(name, ztest_opts.zo_pool, d); 7728 7729 (void) pthread_rwlock_rdlock(&ztest_name_lock); 7730 7731 error = ztest_dataset_create(name); 7732 if (error == ENOSPC) { 7733 (void) pthread_rwlock_unlock(&ztest_name_lock); 7734 ztest_record_enospc(FTAG); 7735 return (error); 7736 } 7737 ASSERT(error == 0 || error == EEXIST); 7738 7739 VERIFY0(ztest_dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, 7740 B_TRUE, zd, &os)); 7741 (void) pthread_rwlock_unlock(&ztest_name_lock); 7742 7743 ztest_zd_init(zd, ZTEST_GET_SHARED_DS(d), os); 7744 7745 zilog = zd->zd_zilog; 7746 7747 if (zilog->zl_header->zh_claim_lr_seq != 0 && 7748 zilog->zl_header->zh_claim_lr_seq < committed_seq) 7749 fatal(B_FALSE, "missing log records: " 7750 "claimed %"PRIu64" < committed %"PRIu64"", 7751 zilog->zl_header->zh_claim_lr_seq, committed_seq); 7752 7753 ztest_dataset_dirobj_verify(zd); 7754 7755 zil_replay(os, zd, ztest_replay_vector); 7756 7757 ztest_dataset_dirobj_verify(zd); 7758 7759 if (ztest_opts.zo_verbose >= 6) 7760 (void) printf("%s replay %"PRIu64" blocks, " 7761 "%"PRIu64" records, seq %"PRIu64"\n", 7762 zd->zd_name, 7763 zilog->zl_parse_blk_count, 7764 zilog->zl_parse_lr_count, 7765 zilog->zl_replaying_seq); 7766 7767 zilog = zil_open(os, ztest_get_data, NULL); 7768 7769 if (zilog->zl_replaying_seq != 0 && 7770 zilog->zl_replaying_seq < committed_seq) 7771 fatal(B_FALSE, "missing log records: " 7772 "replayed %"PRIu64" < committed %"PRIu64"", 7773 zilog->zl_replaying_seq, committed_seq); 7774 7775 return (0); 7776 } 7777 7778 static void 7779 ztest_dataset_close(int d) 7780 { 7781 ztest_ds_t *zd = &ztest_ds[d]; 7782 7783 zil_close(zd->zd_zilog); 7784 dmu_objset_disown(zd->zd_os, B_TRUE, zd); 7785 7786 ztest_zd_fini(zd); 7787 } 7788 7789 static int 7790 ztest_replay_zil_cb(const char *name, void *arg) 7791 { 7792 (void) arg; 7793 objset_t *os; 7794 ztest_ds_t *zdtmp; 7795 7796 VERIFY0(ztest_dmu_objset_own(name, DMU_OST_ANY, B_TRUE, 7797 B_TRUE, FTAG, &os)); 7798 7799 zdtmp = umem_alloc(sizeof (ztest_ds_t), UMEM_NOFAIL); 7800 7801 ztest_zd_init(zdtmp, NULL, os); 7802 zil_replay(os, zdtmp, ztest_replay_vector); 7803 ztest_zd_fini(zdtmp); 7804 7805 if (dmu_objset_zil(os)->zl_parse_lr_count != 0 && 7806 ztest_opts.zo_verbose >= 6) { 7807 zilog_t *zilog = dmu_objset_zil(os); 7808 7809 (void) printf("%s replay %"PRIu64" blocks, " 7810 "%"PRIu64" records, seq %"PRIu64"\n", 7811 name, 7812 zilog->zl_parse_blk_count, 7813 zilog->zl_parse_lr_count, 7814 zilog->zl_replaying_seq); 7815 } 7816 7817 umem_free(zdtmp, sizeof (ztest_ds_t)); 7818 7819 dmu_objset_disown(os, B_TRUE, FTAG); 7820 return (0); 7821 } 7822 7823 static void 7824 ztest_freeze(void) 7825 { 7826 ztest_ds_t *zd = &ztest_ds[0]; 7827 spa_t *spa; 7828 int numloops = 0; 7829 7830 /* freeze not supported during RAIDZ expansion */ 7831 if (ztest_opts.zo_raid_do_expand) 7832 return; 7833 7834 if (ztest_opts.zo_verbose >= 3) 7835 (void) printf("testing spa_freeze()...\n"); 7836 7837 raidz_scratch_verify(); 7838 kernel_init(SPA_MODE_READ | SPA_MODE_WRITE); 7839 VERIFY0(spa_open(ztest_opts.zo_pool, &spa, 
FTAG)); 7840 VERIFY0(ztest_dataset_open(0)); 7841 ztest_spa = spa; 7842 7843 /* 7844 * Force the first log block to be transactionally allocated. 7845 * We have to do this before we freeze the pool -- otherwise 7846 * the log chain won't be anchored. 7847 */ 7848 while (BP_IS_HOLE(&zd->zd_zilog->zl_header->zh_log)) { 7849 ztest_dmu_object_alloc_free(zd, 0); 7850 zil_commit(zd->zd_zilog, 0); 7851 } 7852 7853 txg_wait_synced(spa_get_dsl(spa), 0); 7854 7855 /* 7856 * Freeze the pool. This stops spa_sync() from doing anything, 7857 * so that the only way to record changes from now on is the ZIL. 7858 */ 7859 spa_freeze(spa); 7860 7861 /* 7862 * Because it is hard to predict how much space a write will actually 7863 * require beforehand, we leave ourselves some fudge space to write over 7864 * capacity. 7865 */ 7866 uint64_t capacity = metaslab_class_get_space(spa_normal_class(spa)) / 2; 7867 7868 /* 7869 * Run tests that generate log records but don't alter the pool config 7870 * or depend on DSL sync tasks (snapshots, objset create/destroy, etc). 7871 * We do a txg_wait_synced() after each iteration to force the txg 7872 * to increase well beyond the last synced value in the uberblock. 7873 * The ZIL should be OK with that. 7874 * 7875 * Run a random number of times less than zo_maxloops and ensure we do 7876 * not run out of space on the pool. 7877 */ 7878 while (ztest_random(10) != 0 && 7879 numloops++ < ztest_opts.zo_maxloops && 7880 metaslab_class_get_alloc(spa_normal_class(spa)) < capacity) { 7881 ztest_od_t od; 7882 ztest_od_init(&od, 0, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0, 0); 7883 VERIFY0(ztest_object_init(zd, &od, sizeof (od), B_FALSE)); 7884 ztest_io(zd, od.od_object, 7885 ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT); 7886 txg_wait_synced(spa_get_dsl(spa), 0); 7887 } 7888 7889 /* 7890 * Commit all of the changes we just generated. 7891 */ 7892 zil_commit(zd->zd_zilog, 0); 7893 txg_wait_synced(spa_get_dsl(spa), 0); 7894 7895 /* 7896 * Close our dataset and close the pool. 7897 */ 7898 ztest_dataset_close(0); 7899 spa_close(spa, FTAG); 7900 kernel_fini(); 7901 7902 /* 7903 * Open and close the pool and dataset to induce log replay. 7904 */ 7905 raidz_scratch_verify(); 7906 kernel_init(SPA_MODE_READ | SPA_MODE_WRITE); 7907 VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG)); 7908 ASSERT3U(spa_freeze_txg(spa), ==, UINT64_MAX); 7909 VERIFY0(ztest_dataset_open(0)); 7910 ztest_spa = spa; 7911 txg_wait_synced(spa_get_dsl(spa), 0); 7912 ztest_dataset_close(0); 7913 ztest_reguid(NULL, 0); 7914 7915 spa_close(spa, FTAG); 7916 kernel_fini(); 7917 } 7918 7919 static void 7920 ztest_import_impl(void) 7921 { 7922 importargs_t args = { 0 }; 7923 nvlist_t *cfg = NULL; 7924 int nsearch = 1; 7925 char *searchdirs[nsearch]; 7926 int flags = ZFS_IMPORT_MISSING_LOG; 7927 7928 searchdirs[0] = ztest_opts.zo_dir; 7929 args.paths = nsearch; 7930 args.path = searchdirs; 7931 args.can_be_active = B_FALSE; 7932 7933 libpc_handle_t lpch = { 7934 .lpc_lib_handle = NULL, 7935 .lpc_ops = &libzpool_config_ops, 7936 .lpc_printerr = B_TRUE 7937 }; 7938 VERIFY0(zpool_find_config(&lpch, ztest_opts.zo_pool, &cfg, &args)); 7939 VERIFY0(spa_import(ztest_opts.zo_pool, cfg, NULL, flags)); 7940 fnvlist_free(cfg); 7941 } 7942 7943 /* 7944 * Import a storage pool with the given name. 
7945 */ 7946 static void 7947 ztest_import(ztest_shared_t *zs) 7948 { 7949 spa_t *spa; 7950 7951 mutex_init(&ztest_vdev_lock, NULL, MUTEX_DEFAULT, NULL); 7952 mutex_init(&ztest_checkpoint_lock, NULL, MUTEX_DEFAULT, NULL); 7953 VERIFY0(pthread_rwlock_init(&ztest_name_lock, NULL)); 7954 7955 raidz_scratch_verify(); 7956 kernel_init(SPA_MODE_READ | SPA_MODE_WRITE); 7957 7958 ztest_import_impl(); 7959 7960 VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG)); 7961 zs->zs_metaslab_sz = 7962 1ULL << spa->spa_root_vdev->vdev_child[0]->vdev_ms_shift; 7963 zs->zs_guid = spa_guid(spa); 7964 spa_close(spa, FTAG); 7965 7966 kernel_fini(); 7967 7968 if (!ztest_opts.zo_mmp_test) { 7969 ztest_run_zdb(zs->zs_guid); 7970 ztest_freeze(); 7971 ztest_run_zdb(zs->zs_guid); 7972 } 7973 7974 (void) pthread_rwlock_destroy(&ztest_name_lock); 7975 mutex_destroy(&ztest_vdev_lock); 7976 mutex_destroy(&ztest_checkpoint_lock); 7977 } 7978 7979 /* 7980 * After the expansion has been killed, check that the pool is healthy. 7981 */ 7982 static void 7983 ztest_raidz_expand_check(spa_t *spa) 7984 { 7985 ASSERT3U(ztest_opts.zo_raidz_expand_test, ==, RAIDZ_EXPAND_KILLED); 7986 /* 7987 * Set the pool-check-done flag; the main program will run a zdb check 7988 * of the pool when we exit. 7989 */ 7990 ztest_shared_opts->zo_raidz_expand_test = RAIDZ_EXPAND_CHECKED; 7991 7992 /* Wait for reflow to finish */ 7993 if (ztest_opts.zo_verbose >= 1) { 7994 (void) printf("\nwaiting for reflow to finish ...\n"); 7995 } 7996 pool_raidz_expand_stat_t rzx_stats; 7997 pool_raidz_expand_stat_t *pres = &rzx_stats; 7998 do { 7999 txg_wait_synced(spa_get_dsl(spa), 0); 8000 (void) poll(NULL, 0, 500); /* wait 1/2 second */ 8001 8002 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 8003 (void) spa_raidz_expand_get_stats(spa, pres); 8004 spa_config_exit(spa, SCL_CONFIG, FTAG); 8005 } while (pres->pres_state != DSS_FINISHED && 8006 pres->pres_reflowed < pres->pres_to_reflow); 8007 8008 if (ztest_opts.zo_verbose >= 1) { 8009 (void) printf("verifying an interrupted raidz " 8010 "expansion using a pool scrub ...\n"); 8011 } 8012 /* Will fail here if non-recoverable corruption is detected */ 8013 VERIFY0(ztest_scrub_impl(spa)); 8014 if (ztest_opts.zo_verbose >= 1) { 8015 (void) printf("raidz expansion scrub check complete\n"); 8016 } 8017 } 8018 8019 /* 8020 * Start a raidz expansion test. We run some I/O on the pool for a while 8021 * to get some data in the pool. Then we grow the raidz and 8022 * kill the test at the requested offset into the reflow, verifying that 8023 * doing so does not lead to pool corruption.
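 *
 * Roughly: (a) fill about 25% of the allocatable space (capped at 1 GiB)
 * using zo_threads parallel writers (ztest_rzx_thread); (b) attach one
 * more disk to the raidz vdev with spa_vdev_attach() to kick off the
 * reflow; (c) let the reflow copy a randomly chosen 25%, 50%, or 75% of
 * the allocated space (throttled via raidz_expand_max_reflow_bytes) and
 * then kill the process; (d) after the restart, the RAIDZ_EXPAND_KILLED
 * state routes ztest_run() into ztest_raidz_expand_check().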
8024 */ 8025 static void 8026 ztest_raidz_expand_run(ztest_shared_t *zs, spa_t *spa) 8027 { 8028 nvlist_t *root; 8029 pool_raidz_expand_stat_t rzx_stats; 8030 pool_raidz_expand_stat_t *pres = &rzx_stats; 8031 kthread_t **run_threads; 8032 vdev_t *cvd, *rzvd = spa->spa_root_vdev->vdev_child[0]; 8033 int total_disks = rzvd->vdev_children; 8034 int data_disks = total_disks - vdev_get_nparity(rzvd); 8035 uint64_t alloc_goal; 8036 uint64_t csize; 8037 int error, t; 8038 int threads = ztest_opts.zo_threads; 8039 ztest_expand_io_t *thread_args; 8040 8041 ASSERT3U(ztest_opts.zo_raidz_expand_test, !=, RAIDZ_EXPAND_NONE); 8042 ASSERT3P(rzvd->vdev_ops, ==, &vdev_raidz_ops); 8043 ztest_opts.zo_raidz_expand_test = RAIDZ_EXPAND_STARTED; 8044 8045 /* Setup a 1 MiB buffer of random data */ 8046 uint64_t bufsize = 1024 * 1024; 8047 void *buffer = umem_alloc(bufsize, UMEM_NOFAIL); 8048 8049 if (read(ztest_fd_rand, buffer, bufsize) != bufsize) { 8050 fatal(B_TRUE, "short read from /dev/urandom"); 8051 } 8052 /* 8053 * Put some data in the pool and then attach a vdev to initiate 8054 * reflow. 8055 */ 8056 run_threads = umem_zalloc(threads * sizeof (kthread_t *), UMEM_NOFAIL); 8057 thread_args = umem_zalloc(threads * sizeof (ztest_expand_io_t), 8058 UMEM_NOFAIL); 8059 /* Aim for roughly 25% of allocatable space up to 1GB */ 8060 alloc_goal = (vdev_get_min_asize(rzvd) * data_disks) / total_disks; 8061 alloc_goal = MIN(alloc_goal >> 2, 1024*1024*1024); 8062 if (ztest_opts.zo_verbose >= 1) { 8063 (void) printf("adding data to pool '%s', goal %llu bytes\n", 8064 ztest_opts.zo_pool, (u_longlong_t)alloc_goal); 8065 } 8066 8067 /* 8068 * Kick off all the I/O generators that run in parallel. 8069 */ 8070 for (t = 0; t < threads; t++) { 8071 if (t < ztest_opts.zo_datasets && ztest_dataset_open(t) != 0) { 8072 umem_free(run_threads, threads * sizeof (kthread_t *)); 8073 umem_free(buffer, bufsize); 8074 return; 8075 } 8076 thread_args[t].rzx_id = t; 8077 thread_args[t].rzx_amount = alloc_goal / threads; 8078 thread_args[t].rzx_bufsize = bufsize; 8079 thread_args[t].rzx_buffer = buffer; 8080 thread_args[t].rzx_alloc_max = alloc_goal; 8081 thread_args[t].rzx_spa = spa; 8082 run_threads[t] = thread_create(NULL, 0, ztest_rzx_thread, 8083 &thread_args[t], 0, NULL, TS_RUN | TS_JOINABLE, 8084 defclsyspri); 8085 } 8086 8087 /* 8088 * Wait for all of the writers to complete. 8089 */ 8090 for (t = 0; t < threads; t++) 8091 VERIFY0(thread_join(run_threads[t])); 8092 8093 /* 8094 * Close all datasets. This must be done after all the threads 8095 * are joined so we can be sure none of the datasets are in-use 8096 * by any of the threads. 
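 * (ztest_generic_run() below follows the same open/join/close pattern
 * before sampling the pool's allocation statistics.)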
8097 */ 8098 for (t = 0; t < ztest_opts.zo_threads; t++) { 8099 if (t < ztest_opts.zo_datasets) 8100 ztest_dataset_close(t); 8101 } 8102 8103 txg_wait_synced(spa_get_dsl(spa), 0); 8104 8105 zs->zs_alloc = metaslab_class_get_alloc(spa_normal_class(spa)); 8106 zs->zs_space = metaslab_class_get_space(spa_normal_class(spa)); 8107 8108 umem_free(buffer, bufsize); 8109 umem_free(run_threads, threads * sizeof (kthread_t *)); 8110 umem_free(thread_args, threads * sizeof (ztest_expand_io_t)); 8111 8112 /* Set our reflow target to 25%, 50% or 75% of allocated size */ 8113 uint_t multiple = ztest_random(3) + 1; 8114 uint64_t reflow_max = (rzvd->vdev_stat.vs_alloc * multiple) / 4; 8115 raidz_expand_max_reflow_bytes = reflow_max; 8116 8117 if (ztest_opts.zo_verbose >= 1) { 8118 (void) printf("running raidz expansion test, killing when " 8119 "reflow reaches %llu bytes (%u/4 of allocated space)\n", 8120 (u_longlong_t)reflow_max, multiple); 8121 } 8122 8123 /* XXX - do we want some I/O load during the reflow? */ 8124 8125 /* 8126 * Use a disk size that is larger than existing ones 8127 */ 8128 cvd = rzvd->vdev_child[0]; 8129 csize = vdev_get_min_asize(cvd); 8130 csize += csize / 10; 8131 /* 8132 * Path to vdev to be attached 8133 */ 8134 char *newpath = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); 8135 (void) snprintf(newpath, MAXPATHLEN, ztest_dev_template, 8136 ztest_opts.zo_dir, ztest_opts.zo_pool, rzvd->vdev_children); 8137 /* 8138 * Build the nvlist describing newpath. 8139 */ 8140 root = make_vdev_root(newpath, NULL, NULL, csize, ztest_get_ashift(), 8141 NULL, 0, 0, 1); 8142 /* 8143 * Expand the raidz vdev by attaching the new disk 8144 */ 8145 if (ztest_opts.zo_verbose >= 1) { 8146 (void) printf("expanding raidz: %d wide to %d wide with '%s'\n", 8147 (int)rzvd->vdev_children, (int)rzvd->vdev_children + 1, 8148 newpath); 8149 } 8150 error = spa_vdev_attach(spa, rzvd->vdev_guid, root, B_FALSE, B_FALSE); 8151 nvlist_free(root); 8152 if (error != 0) { 8153 fatal(0, "raidz expand: attach (%s %llu) returned %d", 8154 newpath, (long long)csize, error); 8155 } 8156 8157 /* 8158 * Wait for reflow to begin 8159 */ 8160 while (spa->spa_raidz_expand == NULL) { 8161 txg_wait_synced(spa_get_dsl(spa), 0); 8162 (void) poll(NULL, 0, 100); /* wait 1/10 second */ 8163 } 8164 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 8165 (void) spa_raidz_expand_get_stats(spa, pres); 8166 spa_config_exit(spa, SCL_CONFIG, FTAG); 8167 while (pres->pres_state != DSS_SCANNING) { 8168 txg_wait_synced(spa_get_dsl(spa), 0); 8169 (void) poll(NULL, 0, 100); /* wait 1/10 second */ 8170 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 8171 (void) spa_raidz_expand_get_stats(spa, pres); 8172 spa_config_exit(spa, SCL_CONFIG, FTAG); 8173 } 8174 8175 ASSERT3U(pres->pres_state, ==, DSS_SCANNING); 8176 ASSERT3U(pres->pres_to_reflow, !=, 0); 8177 /* 8178 * Set so when we are killed we go to raidz checking rather than 8179 * restarting test. 
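 *
 * zo_raidz_expand_test lives in ztest_shared_opts, which is backed by the
 * mmap-ed data file shared with the parent, so the new state survives the
 * kill and is seen by the next child; ztest_run() will then call
 * ztest_raidz_expand_check() instead of ztest_raidz_expand_run().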
8180 */ 8181 ztest_shared_opts->zo_raidz_expand_test = RAIDZ_EXPAND_KILLED; 8182 if (ztest_opts.zo_verbose >= 1) { 8183 (void) printf("raidz expansion reflow started, waiting for " 8184 "%llu bytes to be copied\n", (u_longlong_t)reflow_max); 8185 } 8186 8187 /* 8188 * Wait for reflow maximum to be reached and then kill the test 8189 */ 8190 while (pres->pres_reflowed < reflow_max) { 8191 txg_wait_synced(spa_get_dsl(spa), 0); 8192 (void) poll(NULL, 0, 100); /* wait 1/10 second */ 8193 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 8194 (void) spa_raidz_expand_get_stats(spa, pres); 8195 spa_config_exit(spa, SCL_CONFIG, FTAG); 8196 } 8197 8198 /* Reset the reflow pause before killing */ 8199 raidz_expand_max_reflow_bytes = 0; 8200 8201 if (ztest_opts.zo_verbose >= 1) { 8202 (void) printf("killing raidz expansion test after reflow " 8203 "reached %llu bytes\n", (u_longlong_t)pres->pres_reflowed); 8204 } 8205 8206 /* 8207 * Kill ourself to simulate a panic during a reflow. Our parent will 8208 * restart the test and the changed flag value will drive the test 8209 * through the scrub/check code to verify the pool is not corrupted. 8210 */ 8211 ztest_kill(zs); 8212 } 8213 8214 static void 8215 ztest_generic_run(ztest_shared_t *zs, spa_t *spa) 8216 { 8217 kthread_t **run_threads; 8218 int t; 8219 8220 run_threads = umem_zalloc(ztest_opts.zo_threads * sizeof (kthread_t *), 8221 UMEM_NOFAIL); 8222 8223 /* 8224 * Kick off all the tests that run in parallel. 8225 */ 8226 for (t = 0; t < ztest_opts.zo_threads; t++) { 8227 if (t < ztest_opts.zo_datasets && ztest_dataset_open(t) != 0) { 8228 umem_free(run_threads, ztest_opts.zo_threads * 8229 sizeof (kthread_t *)); 8230 return; 8231 } 8232 8233 run_threads[t] = thread_create(NULL, 0, ztest_thread, 8234 (void *)(uintptr_t)t, 0, NULL, TS_RUN | TS_JOINABLE, 8235 defclsyspri); 8236 } 8237 8238 /* 8239 * Wait for all of the tests to complete. 8240 */ 8241 for (t = 0; t < ztest_opts.zo_threads; t++) 8242 VERIFY0(thread_join(run_threads[t])); 8243 8244 /* 8245 * Close all datasets. This must be done after all the threads 8246 * are joined so we can be sure none of the datasets are in-use 8247 * by any of the threads. 8248 */ 8249 for (t = 0; t < ztest_opts.zo_threads; t++) { 8250 if (t < ztest_opts.zo_datasets) 8251 ztest_dataset_close(t); 8252 } 8253 8254 txg_wait_synced(spa_get_dsl(spa), 0); 8255 8256 zs->zs_alloc = metaslab_class_get_alloc(spa_normal_class(spa)); 8257 zs->zs_space = metaslab_class_get_space(spa_normal_class(spa)); 8258 8259 umem_free(run_threads, ztest_opts.zo_threads * sizeof (kthread_t *)); 8260 } 8261 8262 /* 8263 * Setup our test context and kick off threads to run tests on all datasets 8264 * in parallel. 8265 */ 8266 static void 8267 ztest_run(ztest_shared_t *zs) 8268 { 8269 spa_t *spa; 8270 objset_t *os; 8271 kthread_t *resume_thread, *deadman_thread; 8272 uint64_t object; 8273 int error; 8274 int t, d; 8275 8276 ztest_exiting = B_FALSE; 8277 8278 /* 8279 * Initialize parent/child shared state. 
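 * (ztest_vdev_lock serializes vdev-level configuration changes,
 * ztest_name_lock guards pool/dataset namespace operations, and
 * ztest_checkpoint_lock serializes pool checkpoint activity.)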
8280 */ 8281 mutex_init(&ztest_vdev_lock, NULL, MUTEX_DEFAULT, NULL); 8282 mutex_init(&ztest_checkpoint_lock, NULL, MUTEX_DEFAULT, NULL); 8283 VERIFY0(pthread_rwlock_init(&ztest_name_lock, NULL)); 8284 8285 zs->zs_thread_start = gethrtime(); 8286 zs->zs_thread_stop = 8287 zs->zs_thread_start + ztest_opts.zo_passtime * NANOSEC; 8288 zs->zs_thread_stop = MIN(zs->zs_thread_stop, zs->zs_proc_stop); 8289 zs->zs_thread_kill = zs->zs_thread_stop; 8290 if (ztest_random(100) < ztest_opts.zo_killrate) { 8291 zs->zs_thread_kill -= 8292 ztest_random(ztest_opts.zo_passtime * NANOSEC); 8293 } 8294 8295 mutex_init(&zcl.zcl_callbacks_lock, NULL, MUTEX_DEFAULT, NULL); 8296 8297 list_create(&zcl.zcl_callbacks, sizeof (ztest_cb_data_t), 8298 offsetof(ztest_cb_data_t, zcd_node)); 8299 8300 /* 8301 * Open our pool. It may need to be imported first depending on 8302 * what tests were running when the previous pass was terminated. 8303 */ 8304 raidz_scratch_verify(); 8305 kernel_init(SPA_MODE_READ | SPA_MODE_WRITE); 8306 error = spa_open(ztest_opts.zo_pool, &spa, FTAG); 8307 if (error) { 8308 VERIFY3S(error, ==, ENOENT); 8309 ztest_import_impl(); 8310 VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG)); 8311 zs->zs_metaslab_sz = 8312 1ULL << spa->spa_root_vdev->vdev_child[0]->vdev_ms_shift; 8313 } 8314 8315 metaslab_preload_limit = ztest_random(20) + 1; 8316 ztest_spa = spa; 8317 8318 /* 8319 * XXX - BUGBUG raidz expansion do not run this for generic for now 8320 */ 8321 if (ztest_opts.zo_raidz_expand_test != RAIDZ_EXPAND_NONE) 8322 VERIFY0(vdev_raidz_impl_set("cycle")); 8323 8324 dmu_objset_stats_t dds; 8325 VERIFY0(ztest_dmu_objset_own(ztest_opts.zo_pool, 8326 DMU_OST_ANY, B_TRUE, B_TRUE, FTAG, &os)); 8327 dsl_pool_config_enter(dmu_objset_pool(os), FTAG); 8328 dmu_objset_fast_stat(os, &dds); 8329 dsl_pool_config_exit(dmu_objset_pool(os), FTAG); 8330 dmu_objset_disown(os, B_TRUE, FTAG); 8331 8332 /* Give the dedicated raidz expansion test more grace time */ 8333 if (ztest_opts.zo_raidz_expand_test != RAIDZ_EXPAND_NONE) 8334 zfs_deadman_synctime_ms *= 2; 8335 8336 /* 8337 * Create a thread to periodically resume suspended I/O. 8338 */ 8339 resume_thread = thread_create(NULL, 0, ztest_resume_thread, 8340 spa, 0, NULL, TS_RUN | TS_JOINABLE, defclsyspri); 8341 8342 /* 8343 * Create a deadman thread and set to panic if we hang. 8344 */ 8345 deadman_thread = thread_create(NULL, 0, ztest_deadman_thread, 8346 zs, 0, NULL, TS_RUN | TS_JOINABLE, defclsyspri); 8347 8348 spa->spa_deadman_failmode = ZIO_FAILURE_MODE_PANIC; 8349 8350 /* 8351 * Verify that we can safely inquire about any object, 8352 * whether it's allocated or not. To make it interesting, 8353 * we probe a 5-wide window around each power of two. 8354 * This hits all edge cases, including zero and the max. 8355 */ 8356 for (t = 0; t < 64; t++) { 8357 for (d = -5; d <= 5; d++) { 8358 error = dmu_object_info(spa->spa_meta_objset, 8359 (1ULL << t) + d, NULL); 8360 ASSERT(error == 0 || error == ENOENT || 8361 error == EINVAL); 8362 } 8363 } 8364 8365 /* 8366 * If we got any ENOSPC errors on the previous run, destroy something. 
8367 */ 8368 if (zs->zs_enospc_count != 0) { 8369 /* Not expecting ENOSPC errors during raidz expansion tests */ 8370 ASSERT3U(ztest_opts.zo_raidz_expand_test, ==, 8371 RAIDZ_EXPAND_NONE); 8372 8373 int d = ztest_random(ztest_opts.zo_datasets); 8374 ztest_dataset_destroy(d); 8375 } 8376 zs->zs_enospc_count = 0; 8377 8378 /* 8379 * If we were in the middle of ztest_device_removal() and were killed 8380 * we need to ensure the removal and scrub complete before running 8381 * any tests that check ztest_device_removal_active. The removal will 8382 * be restarted automatically when the spa is opened, but we need to 8383 * initiate the scrub manually if it is not already in progress. Note 8384 * that we always run the scrub whenever an indirect vdev exists 8385 * because we have no way of knowing for sure if ztest_device_removal() 8386 * fully completed its scrub before the pool was reimported. 8387 * 8388 * Does not apply for the RAIDZ expansion specific test runs 8389 */ 8390 if (ztest_opts.zo_raidz_expand_test == RAIDZ_EXPAND_NONE && 8391 (spa->spa_removing_phys.sr_state == DSS_SCANNING || 8392 spa->spa_removing_phys.sr_prev_indirect_vdev != -1)) { 8393 while (spa->spa_removing_phys.sr_state == DSS_SCANNING) 8394 txg_wait_synced(spa_get_dsl(spa), 0); 8395 8396 error = ztest_scrub_impl(spa); 8397 if (error == EBUSY) 8398 error = 0; 8399 ASSERT0(error); 8400 } 8401 8402 if (ztest_opts.zo_verbose >= 4) 8403 (void) printf("starting main threads...\n"); 8404 8405 /* 8406 * Replay all logs of all datasets in the pool. This is primarily for 8407 * temporary datasets which wouldn't otherwise get replayed, which 8408 * can trigger failures when attempting to offline a SLOG in 8409 * ztest_fault_inject(). 8410 */ 8411 (void) dmu_objset_find(ztest_opts.zo_pool, ztest_replay_zil_cb, 8412 NULL, DS_FIND_CHILDREN); 8413 8414 if (ztest_opts.zo_raidz_expand_test == RAIDZ_EXPAND_REQUESTED) 8415 ztest_raidz_expand_run(zs, spa); 8416 else if (ztest_opts.zo_raidz_expand_test == RAIDZ_EXPAND_KILLED) 8417 ztest_raidz_expand_check(spa); 8418 else 8419 ztest_generic_run(zs, spa); 8420 8421 /* Kill the resume and deadman threads */ 8422 ztest_exiting = B_TRUE; 8423 VERIFY0(thread_join(resume_thread)); 8424 VERIFY0(thread_join(deadman_thread)); 8425 ztest_resume(spa); 8426 8427 /* 8428 * Right before closing the pool, kick off a bunch of async I/O; 8429 * spa_close() should wait for it to complete. 8430 */ 8431 for (object = 1; object < 50; object++) { 8432 dmu_prefetch(spa->spa_meta_objset, object, 0, 0, 1ULL << 20, 8433 ZIO_PRIORITY_SYNC_READ); 8434 } 8435 8436 /* Verify that at least one commit cb was called in a timely fashion */ 8437 if (zc_cb_counter >= ZTEST_COMMIT_CB_MIN_REG) 8438 VERIFY0(zc_min_txg_delay); 8439 8440 spa_close(spa, FTAG); 8441 8442 /* 8443 * Verify that we can loop over all pools. 8444 */ 8445 mutex_enter(&spa_namespace_lock); 8446 for (spa = spa_next(NULL); spa != NULL; spa = spa_next(spa)) 8447 if (ztest_opts.zo_verbose > 3) 8448 (void) printf("spa_next: found %s\n", spa_name(spa)); 8449 mutex_exit(&spa_namespace_lock); 8450 8451 /* 8452 * Verify that we can export the pool and reimport it under a 8453 * different name. 
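 *
 * This runs for roughly half of the passes and never for MMP runs: the
 * pool is exported and reimported as "<pool>_import", then moved back
 * under its original name so the next pass can still find it.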
8454 */ 8455 if ((ztest_random(2) == 0) && !ztest_opts.zo_mmp_test) { 8456 char name[ZFS_MAX_DATASET_NAME_LEN]; 8457 (void) snprintf(name, sizeof (name), "%s_import", 8458 ztest_opts.zo_pool); 8459 ztest_spa_import_export(ztest_opts.zo_pool, name); 8460 ztest_spa_import_export(name, ztest_opts.zo_pool); 8461 } 8462 8463 kernel_fini(); 8464 8465 list_destroy(&zcl.zcl_callbacks); 8466 mutex_destroy(&zcl.zcl_callbacks_lock); 8467 (void) pthread_rwlock_destroy(&ztest_name_lock); 8468 mutex_destroy(&ztest_vdev_lock); 8469 mutex_destroy(&ztest_checkpoint_lock); 8470 } 8471 8472 static void 8473 print_time(hrtime_t t, char *timebuf) 8474 { 8475 hrtime_t s = t / NANOSEC; 8476 hrtime_t m = s / 60; 8477 hrtime_t h = m / 60; 8478 hrtime_t d = h / 24; 8479 8480 s -= m * 60; 8481 m -= h * 60; 8482 h -= d * 24; 8483 8484 timebuf[0] = '\0'; 8485 8486 if (d) 8487 (void) sprintf(timebuf, 8488 "%llud%02lluh%02llum%02llus", d, h, m, s); 8489 else if (h) 8490 (void) sprintf(timebuf, "%lluh%02llum%02llus", h, m, s); 8491 else if (m) 8492 (void) sprintf(timebuf, "%llum%02llus", m, s); 8493 else 8494 (void) sprintf(timebuf, "%llus", s); 8495 } 8496 8497 static nvlist_t * 8498 make_random_props(void) 8499 { 8500 nvlist_t *props; 8501 8502 props = fnvlist_alloc(); 8503 8504 if (ztest_random(2) == 0) 8505 return (props); 8506 8507 fnvlist_add_uint64(props, 8508 zpool_prop_to_name(ZPOOL_PROP_AUTOREPLACE), 1); 8509 8510 return (props); 8511 } 8512 8513 /* 8514 * Create a storage pool with the given name and initial vdev size. 8515 * Then test spa_freeze() functionality. 8516 */ 8517 static void 8518 ztest_init(ztest_shared_t *zs) 8519 { 8520 spa_t *spa; 8521 nvlist_t *nvroot, *props; 8522 int i; 8523 8524 mutex_init(&ztest_vdev_lock, NULL, MUTEX_DEFAULT, NULL); 8525 mutex_init(&ztest_checkpoint_lock, NULL, MUTEX_DEFAULT, NULL); 8526 VERIFY0(pthread_rwlock_init(&ztest_name_lock, NULL)); 8527 8528 raidz_scratch_verify(); 8529 kernel_init(SPA_MODE_READ | SPA_MODE_WRITE); 8530 8531 /* 8532 * Create the storage pool. 8533 */ 8534 (void) spa_destroy(ztest_opts.zo_pool); 8535 ztest_shared->zs_vdev_next_leaf = 0; 8536 zs->zs_splits = 0; 8537 zs->zs_mirrors = ztest_opts.zo_mirrors; 8538 nvroot = make_vdev_root(NULL, NULL, NULL, ztest_opts.zo_vdev_size, 0, 8539 NULL, ztest_opts.zo_raid_children, zs->zs_mirrors, 1); 8540 props = make_random_props(); 8541 8542 /* 8543 * We don't expect the pool to suspend unless maxfaults == 0, 8544 * in which case ztest_fault_inject() temporarily takes away 8545 * the only valid replica. 8546 */ 8547 fnvlist_add_uint64(props, 8548 zpool_prop_to_name(ZPOOL_PROP_FAILUREMODE), 8549 MAXFAULTS(zs) ? ZIO_FAILURE_MODE_PANIC : ZIO_FAILURE_MODE_WAIT); 8550 8551 for (i = 0; i < SPA_FEATURES; i++) { 8552 char *buf; 8553 8554 if (!spa_feature_table[i].fi_zfs_mod_supported) 8555 continue; 8556 8557 /* 8558 * 75% chance of using the log space map feature. We want ztest 8559 * to exercise both the code paths that use the log space map 8560 * feature and the ones that don't. 
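 *
 * Every other supported feature is requested unconditionally by adding a
 * "feature@<name>=0" entry to the creation properties; only log_spacemap
 * is randomized, with the ztest_random(4) == 0 case (one chance in four)
 * leaving it out.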
8561 */ 8562 if (i == SPA_FEATURE_LOG_SPACEMAP && ztest_random(4) == 0) 8563 continue; 8564 8565 VERIFY3S(-1, !=, asprintf(&buf, "feature@%s", 8566 spa_feature_table[i].fi_uname)); 8567 fnvlist_add_uint64(props, buf, 0); 8568 free(buf); 8569 } 8570 8571 VERIFY0(spa_create(ztest_opts.zo_pool, nvroot, props, NULL, NULL)); 8572 fnvlist_free(nvroot); 8573 fnvlist_free(props); 8574 8575 VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG)); 8576 zs->zs_metaslab_sz = 8577 1ULL << spa->spa_root_vdev->vdev_child[0]->vdev_ms_shift; 8578 zs->zs_guid = spa_guid(spa); 8579 spa_close(spa, FTAG); 8580 8581 kernel_fini(); 8582 8583 if (!ztest_opts.zo_mmp_test) { 8584 ztest_run_zdb(zs->zs_guid); 8585 ztest_freeze(); 8586 ztest_run_zdb(zs->zs_guid); 8587 } 8588 8589 (void) pthread_rwlock_destroy(&ztest_name_lock); 8590 mutex_destroy(&ztest_vdev_lock); 8591 mutex_destroy(&ztest_checkpoint_lock); 8592 } 8593 8594 static void 8595 setup_data_fd(void) 8596 { 8597 static char ztest_name_data[] = "/tmp/ztest.data.XXXXXX"; 8598 8599 ztest_fd_data = mkstemp(ztest_name_data); 8600 ASSERT3S(ztest_fd_data, >=, 0); 8601 (void) unlink(ztest_name_data); 8602 } 8603 8604 static int 8605 shared_data_size(ztest_shared_hdr_t *hdr) 8606 { 8607 int size; 8608 8609 size = hdr->zh_hdr_size; 8610 size += hdr->zh_opts_size; 8611 size += hdr->zh_size; 8612 size += hdr->zh_stats_size * hdr->zh_stats_count; 8613 size += hdr->zh_ds_size * hdr->zh_ds_count; 8614 size += hdr->zh_scratch_state_size; 8615 8616 return (size); 8617 } 8618 8619 static void 8620 setup_hdr(void) 8621 { 8622 int size; 8623 ztest_shared_hdr_t *hdr; 8624 8625 hdr = (void *)mmap(0, P2ROUNDUP(sizeof (*hdr), getpagesize()), 8626 PROT_READ | PROT_WRITE, MAP_SHARED, ztest_fd_data, 0); 8627 ASSERT3P(hdr, !=, MAP_FAILED); 8628 8629 VERIFY0(ftruncate(ztest_fd_data, sizeof (ztest_shared_hdr_t))); 8630 8631 hdr->zh_hdr_size = sizeof (ztest_shared_hdr_t); 8632 hdr->zh_opts_size = sizeof (ztest_shared_opts_t); 8633 hdr->zh_size = sizeof (ztest_shared_t); 8634 hdr->zh_stats_size = sizeof (ztest_shared_callstate_t); 8635 hdr->zh_stats_count = ZTEST_FUNCS; 8636 hdr->zh_ds_size = sizeof (ztest_shared_ds_t); 8637 hdr->zh_ds_count = ztest_opts.zo_datasets; 8638 hdr->zh_scratch_state_size = sizeof (ztest_shared_scratch_state_t); 8639 8640 size = shared_data_size(hdr); 8641 VERIFY0(ftruncate(ztest_fd_data, size)); 8642 8643 (void) munmap((caddr_t)hdr, P2ROUNDUP(sizeof (*hdr), getpagesize())); 8644 } 8645 8646 static void 8647 setup_data(void) 8648 { 8649 int size, offset; 8650 ztest_shared_hdr_t *hdr; 8651 uint8_t *buf; 8652 8653 hdr = (void *)mmap(0, P2ROUNDUP(sizeof (*hdr), getpagesize()), 8654 PROT_READ, MAP_SHARED, ztest_fd_data, 0); 8655 ASSERT3P(hdr, !=, MAP_FAILED); 8656 8657 size = shared_data_size(hdr); 8658 8659 (void) munmap((caddr_t)hdr, P2ROUNDUP(sizeof (*hdr), getpagesize())); 8660 hdr = ztest_shared_hdr = (void *)mmap(0, P2ROUNDUP(size, getpagesize()), 8661 PROT_READ | PROT_WRITE, MAP_SHARED, ztest_fd_data, 0); 8662 ASSERT3P(hdr, !=, MAP_FAILED); 8663 buf = (uint8_t *)hdr; 8664 8665 offset = hdr->zh_hdr_size; 8666 ztest_shared_opts = (void *)&buf[offset]; 8667 offset += hdr->zh_opts_size; 8668 ztest_shared = (void *)&buf[offset]; 8669 offset += hdr->zh_size; 8670 ztest_shared_callstate = (void *)&buf[offset]; 8671 offset += hdr->zh_stats_size * hdr->zh_stats_count; 8672 ztest_shared_ds = (void *)&buf[offset]; 8673 offset += hdr->zh_ds_size * hdr->zh_ds_count; 8674 ztest_scratch_state = (void *)&buf[offset]; 8675 } 8676 8677 static boolean_t 8678 exec_child(char *cmd, 
char *libpath, boolean_t ignorekill, int *statusp) 8679 { 8680 pid_t pid; 8681 int status; 8682 char *cmdbuf = NULL; 8683 8684 pid = fork(); 8685 8686 if (cmd == NULL) { 8687 cmdbuf = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); 8688 (void) strlcpy(cmdbuf, getexecname(), MAXPATHLEN); 8689 cmd = cmdbuf; 8690 } 8691 8692 if (pid == -1) 8693 fatal(B_TRUE, "fork failed"); 8694 8695 if (pid == 0) { /* child */ 8696 char fd_data_str[12]; 8697 8698 VERIFY3S(11, >=, 8699 snprintf(fd_data_str, 12, "%d", ztest_fd_data)); 8700 VERIFY0(setenv("ZTEST_FD_DATA", fd_data_str, 1)); 8701 8702 if (libpath != NULL) { 8703 const char *curlp = getenv("LD_LIBRARY_PATH"); 8704 if (curlp == NULL) 8705 VERIFY0(setenv("LD_LIBRARY_PATH", libpath, 1)); 8706 else { 8707 char *newlp = NULL; 8708 VERIFY3S(-1, !=, 8709 asprintf(&newlp, "%s:%s", libpath, curlp)); 8710 VERIFY0(setenv("LD_LIBRARY_PATH", newlp, 1)); 8711 free(newlp); 8712 } 8713 } 8714 (void) execl(cmd, cmd, (char *)NULL); 8715 ztest_dump_core = B_FALSE; 8716 fatal(B_TRUE, "exec failed: %s", cmd); 8717 } 8718 8719 if (cmdbuf != NULL) { 8720 umem_free(cmdbuf, MAXPATHLEN); 8721 cmd = NULL; 8722 } 8723 8724 while (waitpid(pid, &status, 0) != pid) 8725 continue; 8726 if (statusp != NULL) 8727 *statusp = status; 8728 8729 if (WIFEXITED(status)) { 8730 if (WEXITSTATUS(status) != 0) { 8731 (void) fprintf(stderr, "child exited with code %d\n", 8732 WEXITSTATUS(status)); 8733 exit(2); 8734 } 8735 return (B_FALSE); 8736 } else if (WIFSIGNALED(status)) { 8737 if (!ignorekill || WTERMSIG(status) != SIGKILL) { 8738 (void) fprintf(stderr, "child died with signal %d\n", 8739 WTERMSIG(status)); 8740 exit(3); 8741 } 8742 return (B_TRUE); 8743 } else { 8744 (void) fprintf(stderr, "something strange happened to child\n"); 8745 exit(4); 8746 } 8747 } 8748 8749 static void 8750 ztest_run_init(void) 8751 { 8752 int i; 8753 8754 ztest_shared_t *zs = ztest_shared; 8755 8756 /* 8757 * Blow away any existing copy of zpool.cache 8758 */ 8759 (void) remove(spa_config_path); 8760 8761 if (ztest_opts.zo_init == 0) { 8762 if (ztest_opts.zo_verbose >= 1) 8763 (void) printf("Importing pool %s\n", 8764 ztest_opts.zo_pool); 8765 ztest_import(zs); 8766 return; 8767 } 8768 8769 /* 8770 * Create and initialize our storage pool. 8771 */ 8772 for (i = 1; i <= ztest_opts.zo_init; i++) { 8773 memset(zs, 0, sizeof (*zs)); 8774 if (ztest_opts.zo_verbose >= 3 && 8775 ztest_opts.zo_init != 1) { 8776 (void) printf("ztest_init(), pass %d\n", i); 8777 } 8778 ztest_init(zs); 8779 } 8780 } 8781 8782 int 8783 main(int argc, char **argv) 8784 { 8785 int kills = 0; 8786 int iters = 0; 8787 int older = 0; 8788 int newer = 0; 8789 ztest_shared_t *zs; 8790 ztest_info_t *zi; 8791 ztest_shared_callstate_t *zc; 8792 char timebuf[100]; 8793 char numbuf[NN_NUMBUF_SZ]; 8794 char *cmd; 8795 boolean_t hasalt; 8796 int f, err; 8797 char *fd_data_str = getenv("ZTEST_FD_DATA"); 8798 struct sigaction action; 8799 8800 (void) setvbuf(stdout, NULL, _IOLBF, 0); 8801 8802 dprintf_setup(&argc, argv); 8803 zfs_deadman_synctime_ms = 300000; 8804 zfs_deadman_checktime_ms = 30000; 8805 /* 8806 * As two-word space map entries may not come up often (especially 8807 * if pool and vdev sizes are small) we want to force at least some 8808 * of them so the feature get tested. 8809 */ 8810 zfs_force_some_double_word_sm_entries = B_TRUE; 8811 8812 /* 8813 * Verify that even extensively damaged split blocks with many 8814 * segments can be reconstructed in a reasonable amount of time 8815 * when reconstruction is known to be possible. 
8816 * 8817 * Note: the lower this value is, the more damage we inflict, and 8818 * the more time ztest spends in recovering that damage. We chose 8819 * to induce damage 1/100th of the time so recovery is tested but 8820 * not so frequently that ztest doesn't get to test other code paths. 8821 */ 8822 zfs_reconstruct_indirect_damage_fraction = 100; 8823 8824 action.sa_handler = sig_handler; 8825 sigemptyset(&action.sa_mask); 8826 action.sa_flags = 0; 8827 8828 if (sigaction(SIGSEGV, &action, NULL) < 0) { 8829 (void) fprintf(stderr, "ztest: cannot catch SIGSEGV: %s.\n", 8830 strerror(errno)); 8831 exit(EXIT_FAILURE); 8832 } 8833 8834 if (sigaction(SIGABRT, &action, NULL) < 0) { 8835 (void) fprintf(stderr, "ztest: cannot catch SIGABRT: %s.\n", 8836 strerror(errno)); 8837 exit(EXIT_FAILURE); 8838 } 8839 8840 /* 8841 * Force random_get_bytes() to use /dev/urandom in order to prevent 8842 * ztest from needlessly depleting the system entropy pool. 8843 */ 8844 random_path = "/dev/urandom"; 8845 ztest_fd_rand = open(random_path, O_RDONLY | O_CLOEXEC); 8846 ASSERT3S(ztest_fd_rand, >=, 0); 8847 8848 if (!fd_data_str) { 8849 process_options(argc, argv); 8850 8851 setup_data_fd(); 8852 setup_hdr(); 8853 setup_data(); 8854 memcpy(ztest_shared_opts, &ztest_opts, 8855 sizeof (*ztest_shared_opts)); 8856 } else { 8857 ztest_fd_data = atoi(fd_data_str); 8858 setup_data(); 8859 memcpy(&ztest_opts, ztest_shared_opts, sizeof (ztest_opts)); 8860 } 8861 ASSERT3U(ztest_opts.zo_datasets, ==, ztest_shared_hdr->zh_ds_count); 8862 8863 err = ztest_set_global_vars(); 8864 if (err != 0 && !fd_data_str) { 8865 /* error message done by ztest_set_global_vars */ 8866 exit(EXIT_FAILURE); 8867 } else { 8868 /* children should not be spawned if setting gvars fails */ 8869 VERIFY3S(err, ==, 0); 8870 } 8871 8872 /* Override location of zpool.cache */ 8873 VERIFY3S(asprintf((char **)&spa_config_path, "%s/zpool.cache", 8874 ztest_opts.zo_dir), !=, -1); 8875 8876 ztest_ds = umem_alloc(ztest_opts.zo_datasets * sizeof (ztest_ds_t), 8877 UMEM_NOFAIL); 8878 zs = ztest_shared; 8879 8880 if (fd_data_str) { 8881 metaslab_force_ganging = ztest_opts.zo_metaslab_force_ganging; 8882 metaslab_df_alloc_threshold = 8883 zs->zs_metaslab_df_alloc_threshold; 8884 8885 if (zs->zs_do_init) 8886 ztest_run_init(); 8887 else 8888 ztest_run(zs); 8889 exit(0); 8890 } 8891 8892 hasalt = (strlen(ztest_opts.zo_alt_ztest) != 0); 8893 8894 if (ztest_opts.zo_verbose >= 1) { 8895 (void) printf("%"PRIu64" vdevs, %d datasets, %d threads, " 8896 "%d %s disks, parity %d, %"PRIu64" seconds...\n\n", 8897 ztest_opts.zo_vdevs, 8898 ztest_opts.zo_datasets, 8899 ztest_opts.zo_threads, 8900 ztest_opts.zo_raid_children, 8901 ztest_opts.zo_raid_type, 8902 ztest_opts.zo_raid_parity, 8903 ztest_opts.zo_time); 8904 } 8905 8906 cmd = umem_alloc(MAXNAMELEN, UMEM_NOFAIL); 8907 (void) strlcpy(cmd, getexecname(), MAXNAMELEN); 8908 8909 zs->zs_do_init = B_TRUE; 8910 if (strlen(ztest_opts.zo_alt_ztest) != 0) { 8911 if (ztest_opts.zo_verbose >= 1) { 8912 (void) printf("Executing older ztest for " 8913 "initialization: %s\n", ztest_opts.zo_alt_ztest); 8914 } 8915 VERIFY(!exec_child(ztest_opts.zo_alt_ztest, 8916 ztest_opts.zo_alt_libpath, B_FALSE, NULL)); 8917 } else { 8918 VERIFY(!exec_child(NULL, NULL, B_FALSE, NULL)); 8919 } 8920 zs->zs_do_init = B_FALSE; 8921 8922 zs->zs_proc_start = gethrtime(); 8923 zs->zs_proc_stop = zs->zs_proc_start + ztest_opts.zo_time * NANOSEC; 8924 8925 for (f = 0; f < ZTEST_FUNCS; f++) { 8926 zi = &ztest_info[f]; 8927 zc = ZTEST_GET_SHARED_CALLSTATE(f); 
8928 if (zs->zs_proc_start + zi->zi_interval[0] > zs->zs_proc_stop) 8929 zc->zc_next = UINT64_MAX; 8930 else 8931 zc->zc_next = zs->zs_proc_start + 8932 ztest_random(2 * zi->zi_interval[0] + 1); 8933 } 8934 8935 /* 8936 * Run the tests in a loop. These tests include fault injection 8937 * to verify that self-healing data works, and forced crashes 8938 * to verify that we never lose on-disk consistency. 8939 */ 8940 while (gethrtime() < zs->zs_proc_stop) { 8941 int status; 8942 boolean_t killed; 8943 8944 /* 8945 * Initialize the workload counters for each function. 8946 */ 8947 for (f = 0; f < ZTEST_FUNCS; f++) { 8948 zc = ZTEST_GET_SHARED_CALLSTATE(f); 8949 zc->zc_count = 0; 8950 zc->zc_time = 0; 8951 } 8952 8953 /* Set the allocation switch size */ 8954 zs->zs_metaslab_df_alloc_threshold = 8955 ztest_random(zs->zs_metaslab_sz / 4) + 1; 8956 8957 if (!hasalt || ztest_random(2) == 0) { 8958 if (hasalt && ztest_opts.zo_verbose >= 1) { 8959 (void) printf("Executing newer ztest: %s\n", 8960 cmd); 8961 } 8962 newer++; 8963 killed = exec_child(cmd, NULL, B_TRUE, &status); 8964 } else { 8965 if (hasalt && ztest_opts.zo_verbose >= 1) { 8966 (void) printf("Executing older ztest: %s\n", 8967 ztest_opts.zo_alt_ztest); 8968 } 8969 older++; 8970 killed = exec_child(ztest_opts.zo_alt_ztest, 8971 ztest_opts.zo_alt_libpath, B_TRUE, &status); 8972 } 8973 8974 if (killed) 8975 kills++; 8976 iters++; 8977 8978 if (ztest_opts.zo_verbose >= 1) { 8979 hrtime_t now = gethrtime(); 8980 8981 now = MIN(now, zs->zs_proc_stop); 8982 print_time(zs->zs_proc_stop - now, timebuf); 8983 nicenum(zs->zs_space, numbuf, sizeof (numbuf)); 8984 8985 (void) printf("Pass %3d, %8s, %3"PRIu64" ENOSPC, " 8986 "%4.1f%% of %5s used, %3.0f%% done, %8s to go\n", 8987 iters, 8988 WIFEXITED(status) ? "Complete" : "SIGKILL", 8989 zs->zs_enospc_count, 8990 100.0 * zs->zs_alloc / zs->zs_space, 8991 numbuf, 8992 100.0 * (now - zs->zs_proc_start) / 8993 (ztest_opts.zo_time * NANOSEC), timebuf); 8994 } 8995 8996 if (ztest_opts.zo_verbose >= 2) { 8997 (void) printf("\nWorkload summary:\n\n"); 8998 (void) printf("%7s %9s %s\n", 8999 "Calls", "Time", "Function"); 9000 (void) printf("%7s %9s %s\n", 9001 "-----", "----", "--------"); 9002 for (f = 0; f < ZTEST_FUNCS; f++) { 9003 zi = &ztest_info[f]; 9004 zc = ZTEST_GET_SHARED_CALLSTATE(f); 9005 print_time(zc->zc_time, timebuf); 9006 (void) printf("%7"PRIu64" %9s %s\n", 9007 zc->zc_count, timebuf, 9008 zi->zi_funcname); 9009 } 9010 (void) printf("\n"); 9011 } 9012 9013 if (!ztest_opts.zo_mmp_test) 9014 ztest_run_zdb(zs->zs_guid); 9015 if (ztest_shared_opts->zo_raidz_expand_test == 9016 RAIDZ_EXPAND_CHECKED) 9017 break; /* raidz expand test complete */ 9018 } 9019 9020 if (ztest_opts.zo_verbose >= 1) { 9021 if (hasalt) { 9022 (void) printf("%d runs of older ztest: %s\n", older, 9023 ztest_opts.zo_alt_ztest); 9024 (void) printf("%d runs of newer ztest: %s\n", newer, 9025 cmd); 9026 } 9027 (void) printf("%d killed, %d completed, %.0f%% kill rate\n", 9028 kills, iters - kills, (100.0 * kills) / MAX(1, iters)); 9029 } 9030 9031 umem_free(cmd, MAXNAMELEN); 9032 9033 return (0); 9034 } 9035