/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2013, 2014, Delphix. All rights reserved.
 * Copyright (c) 2021, George Amanakis. All rights reserved.
 */

/*
 * Routines to manage the on-disk persistent error log.
 *
 * Each pool stores a log of all logical data errors seen during normal
 * operation. This is actually the union of two distinct logs: the last log,
 * and the current log. All errors seen are logged to the current log. When a
 * scrub completes, the current log becomes the last log, the last log is
 * thrown out, and the current log is reinitialized. This way, if an error is
 * somehow corrected, a new scrub will show that it no longer exists, and it
 * will be deleted from the log when the scrub completes.
 *
 * The log is stored using a ZAP object whose key is a string form of the
 * zbookmark_phys tuple (objset, object, level, blkid), and whose content is
 * an optional 'objset:object' human-readable string describing the data. When
 * an error is first logged, this string will be empty, indicating that no
 * name is known. This prevents us from having to issue a potentially large
 * amount of I/O to discover the object name during an error path. Instead, we
 * do the calculation when the data is requested, storing the result so future
 * queries will be faster.
 *
 * If the head_errlog feature is enabled, a different on-disk format is used.
 * The error log of each head dataset is stored separately in the zap object
 * and keyed by the head id. This enables listing every dataset affected in
 * userland. In order to be able to track whether an error block has been
 * modified or added to snapshots since it was marked as an error, a new tuple
 * is introduced: zbookmark_err_phys_t. It allows the storage of the birth
 * transaction group of an error block on-disk. The birth transaction group is
 * used by check_filesystem() to assess whether this block was freed,
 * re-written or added to a snapshot since its marking as an error.
 *
 * This log is then shipped into an nvlist where the key is the dataset name
 * and the value is the object name. Userland is then responsible for
 * uniquifying this list and displaying it to the user.
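 *
 * For example, in the legacy format an error at level 0, blkid 0xc of
 * object 0x5 in objset 0x36 is keyed by the string "36:5:0:c" (all four
 * fields are rendered in hexadecimal; see bookmark_to_name() below).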
 */

#include <sys/dmu_tx.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/zap.h>
#include <sys/zio.h>
#include <sys/dsl_dir.h>
#include <sys/dmu_objset.h>
#include <sys/dbuf.h>

/*
 * spa_upgrade_errlog_limit : A zfs module parameter that controls the number
 *	of on-disk error log entries that will be converted to the new
 *	format when enabling head_errlog. Defaults to 0 which converts
 *	all log entries.
 */
static uint32_t spa_upgrade_errlog_limit = 0;

/*
 * Convert a bookmark to a string.
 */
static void
bookmark_to_name(zbookmark_phys_t *zb, char *buf, size_t len)
{
	(void) snprintf(buf, len, "%llx:%llx:%llx:%llx",
	    (u_longlong_t)zb->zb_objset, (u_longlong_t)zb->zb_object,
	    (u_longlong_t)zb->zb_level, (u_longlong_t)zb->zb_blkid);
}

/*
 * Convert an err_phys to a string.
 */
static void
errphys_to_name(zbookmark_err_phys_t *zep, char *buf, size_t len)
{
	(void) snprintf(buf, len, "%llx:%llx:%llx:%llx",
	    (u_longlong_t)zep->zb_object, (u_longlong_t)zep->zb_level,
	    (u_longlong_t)zep->zb_blkid, (u_longlong_t)zep->zb_birth);
}

/*
 * Convert a string to an err_phys.
 */
static void
name_to_errphys(char *buf, zbookmark_err_phys_t *zep)
{
	zep->zb_object = zfs_strtonum(buf, &buf);
	ASSERT(*buf == ':');
	zep->zb_level = (int)zfs_strtonum(buf + 1, &buf);
	ASSERT(*buf == ':');
	zep->zb_blkid = zfs_strtonum(buf + 1, &buf);
	ASSERT(*buf == ':');
	zep->zb_birth = zfs_strtonum(buf + 1, &buf);
	ASSERT(*buf == '\0');
}

/*
 * Convert a string to a bookmark.
 */
static void
name_to_bookmark(char *buf, zbookmark_phys_t *zb)
{
	zb->zb_objset = zfs_strtonum(buf, &buf);
	ASSERT(*buf == ':');
	zb->zb_object = zfs_strtonum(buf + 1, &buf);
	ASSERT(*buf == ':');
	zb->zb_level = (int)zfs_strtonum(buf + 1, &buf);
	ASSERT(*buf == ':');
	zb->zb_blkid = zfs_strtonum(buf + 1, &buf);
	ASSERT(*buf == '\0');
}

#ifdef _KERNEL
static void
zep_to_zb(uint64_t dataset, zbookmark_err_phys_t *zep, zbookmark_phys_t *zb)
{
	zb->zb_objset = dataset;
	zb->zb_object = zep->zb_object;
	zb->zb_level = zep->zb_level;
	zb->zb_blkid = zep->zb_blkid;
}
#endif

static void
name_to_object(char *buf, uint64_t *obj)
{
	*obj = zfs_strtonum(buf, &buf);
	ASSERT(*buf == '\0');
}

static int
get_head_and_birth_txg(spa_t *spa, zbookmark_err_phys_t *zep, uint64_t ds_obj,
    uint64_t *head_dataset_id)
{
	dsl_pool_t *dp = spa->spa_dsl_pool;
	dsl_dataset_t *ds;
	objset_t *os;

	dsl_pool_config_enter(dp, FTAG);
	int error = dsl_dataset_hold_obj(dp, ds_obj, FTAG, &ds);
	if (error != 0) {
		dsl_pool_config_exit(dp, FTAG);
		return (error);
	}
	ASSERT(head_dataset_id);
	*head_dataset_id = dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj;

	error = dmu_objset_from_ds(ds, &os);
	if (error != 0) {
		dsl_dataset_rele(ds, FTAG);
		dsl_pool_config_exit(dp, FTAG);
		return (error);
	}

	dnode_t *dn;
	blkptr_t bp;

	error = dnode_hold(os, zep->zb_object, FTAG, &dn);
	if (error != 0) {
		dsl_dataset_rele(ds, FTAG);
		dsl_pool_config_exit(dp, FTAG);
		return (error);
	}

	rw_enter(&dn->dn_struct_rwlock, RW_READER);
	error = dbuf_dnode_findbp(dn, zep->zb_level, zep->zb_blkid, &bp, NULL,
	    NULL);

	if (error == 0 && BP_IS_HOLE(&bp))
		error = SET_ERROR(ENOENT);

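	/*
	 * Note that zep->zb_birth below is only meaningful if we return 0;
	 * callers must check the return value before consuming it.
	 */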
	zep->zb_birth = bp.blk_birth;
	rw_exit(&dn->dn_struct_rwlock);
	dnode_rele(dn, FTAG);
	dsl_dataset_rele(ds, FTAG);
	dsl_pool_config_exit(dp, FTAG);
	return (error);
}

/*
 * Log an uncorrectable error to the persistent error log. We add it to the
 * spa's list of pending errors. The changes are actually synced out to disk
 * during spa_errlog_sync().
 */
void
spa_log_error(spa_t *spa, const zbookmark_phys_t *zb)
{
	spa_error_entry_t search;
	spa_error_entry_t *new;
	avl_tree_t *tree;
	avl_index_t where;

	/*
	 * If we are trying to import a pool, ignore any errors, as we won't be
	 * writing to the pool any time soon.
	 */
	if (spa_load_state(spa) == SPA_LOAD_TRYIMPORT)
		return;

	mutex_enter(&spa->spa_errlist_lock);

	/*
	 * If we have had a request to rotate the log, log it to the next list
	 * instead of the current one.
	 */
	if (spa->spa_scrub_active || spa->spa_scrub_finished)
		tree = &spa->spa_errlist_scrub;
	else
		tree = &spa->spa_errlist_last;

	search.se_bookmark = *zb;
	if (avl_find(tree, &search, &where) != NULL) {
		mutex_exit(&spa->spa_errlist_lock);
		return;
	}

	new = kmem_zalloc(sizeof (spa_error_entry_t), KM_SLEEP);
	new->se_bookmark = *zb;
	avl_insert(tree, new, where);

	mutex_exit(&spa->spa_errlist_lock);
}

#ifdef _KERNEL
static int
find_birth_txg(dsl_dataset_t *ds, zbookmark_err_phys_t *zep,
    uint64_t *birth_txg)
{
	objset_t *os;
	int error = dmu_objset_from_ds(ds, &os);
	if (error != 0)
		return (error);

	dnode_t *dn;
	blkptr_t bp;

	error = dnode_hold(os, zep->zb_object, FTAG, &dn);
	if (error != 0)
		return (error);

	rw_enter(&dn->dn_struct_rwlock, RW_READER);
	error = dbuf_dnode_findbp(dn, zep->zb_level, zep->zb_blkid, &bp, NULL,
	    NULL);

	if (error == 0 && BP_IS_HOLE(&bp))
		error = SET_ERROR(ENOENT);

	*birth_txg = bp.blk_birth;
	rw_exit(&dn->dn_struct_rwlock);
	dnode_rele(dn, FTAG);
	return (error);
}

/*
 * This function serves a double role. If only_count is true, it returns
 * (in *count) how many times an error block belonging to this filesystem is
 * referenced by snapshots or clones. If only_count is false, each time the
 * error block is referenced by a snapshot or clone, it fills the userspace
 * array at uaddr with the bookmarks of the error blocks. The array is filled
 * from the back and *count is modified to be the number of unused entries at
 * the beginning of the array.
 */
static int
check_filesystem(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep,
    uint64_t *count, void *uaddr, boolean_t only_count)
{
	dsl_dataset_t *ds;
	dsl_pool_t *dp = spa->spa_dsl_pool;

	int error = dsl_dataset_hold_obj(dp, head_ds, FTAG, &ds);
	if (error != 0)
		return (error);

	uint64_t latest_txg;
	uint64_t txg_to_consider = spa->spa_syncing_txg;
	boolean_t check_snapshot = B_TRUE;
	error = find_birth_txg(ds, zep, &latest_txg);
	if (error == 0) {
		if (zep->zb_birth == latest_txg) {
			/*
			 * Block neither freed nor rewritten.
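			 * The recorded birth txg still matches the block's
			 * current birth txg, so the error is still live in
			 * the head filesystem itself.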
			 */
			if (!only_count) {
				zbookmark_phys_t zb;
				zep_to_zb(head_ds, zep, &zb);
				if (copyout(&zb, (char *)uaddr + (*count - 1)
				    * sizeof (zbookmark_phys_t),
				    sizeof (zbookmark_phys_t)) != 0) {
					dsl_dataset_rele(ds, FTAG);
					return (SET_ERROR(EFAULT));
				}
				(*count)--;
			} else {
				(*count)++;
			}
			check_snapshot = B_FALSE;
		} else {
			ASSERT3U(zep->zb_birth, <, latest_txg);
			txg_to_consider = latest_txg;
		}
	}

	/* How many snapshots reference this block. */
	uint64_t snap_count;
	error = zap_count(spa->spa_meta_objset,
	    dsl_dataset_phys(ds)->ds_snapnames_zapobj, &snap_count);
	if (error != 0) {
		dsl_dataset_rele(ds, FTAG);
		return (error);
	}

	if (snap_count == 0) {
		/* File system has no snapshots. */
		dsl_dataset_rele(ds, FTAG);
		return (0);
	}

	uint64_t *snap_obj_array = kmem_alloc(snap_count * sizeof (uint64_t),
	    KM_SLEEP);

	int aff_snap_count = 0;
	uint64_t snap_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
	uint64_t snap_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg;

	/* Check only snapshots created from this file system. */
	while (snap_obj != 0 && zep->zb_birth < snap_obj_txg &&
	    snap_obj_txg <= txg_to_consider) {

		dsl_dataset_rele(ds, FTAG);
		error = dsl_dataset_hold_obj(dp, snap_obj, FTAG, &ds);
		if (error != 0)
			goto out;

		if (dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj != head_ds)
			break;

		boolean_t affected = B_TRUE;
		if (check_snapshot) {
			uint64_t blk_txg;
			error = find_birth_txg(ds, zep, &blk_txg);
			affected = (error == 0 && zep->zb_birth == blk_txg);
		}

		if (affected) {
			snap_obj_array[aff_snap_count] = snap_obj;
			aff_snap_count++;

			if (!only_count) {
				zbookmark_phys_t zb;
				zep_to_zb(snap_obj, zep, &zb);
				if (copyout(&zb, (char *)uaddr + (*count - 1) *
				    sizeof (zbookmark_phys_t),
				    sizeof (zbookmark_phys_t)) != 0) {
					dsl_dataset_rele(ds, FTAG);
					error = SET_ERROR(EFAULT);
					goto out;
				}
				(*count)--;
			} else {
				(*count)++;
			}

			/*
			 * Only clones whose origins were affected could also
			 * have affected snapshots.
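			 * We therefore recurse into each clone of this
			 * snapshot (via ds_next_clones_obj) and repeat the
			 * same walk there.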
			 */
			zap_cursor_t zc;
			zap_attribute_t za;
			for (zap_cursor_init(&zc, spa->spa_meta_objset,
			    dsl_dataset_phys(ds)->ds_next_clones_obj);
			    zap_cursor_retrieve(&zc, &za) == 0;
			    zap_cursor_advance(&zc)) {
				error = check_filesystem(spa,
				    za.za_first_integer, zep,
				    count, uaddr, only_count);

				if (error != 0) {
					zap_cursor_fini(&zc);
					goto out;
				}
			}
			zap_cursor_fini(&zc);
		}
		snap_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg;
		snap_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
	}
	dsl_dataset_rele(ds, FTAG);

out:
	kmem_free(snap_obj_array, snap_count * sizeof (uint64_t));
	return (error);
}

static int
find_top_affected_fs(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep,
    uint64_t *top_affected_fs)
{
	uint64_t oldest_dsobj;
	int error = dsl_dataset_oldest_snapshot(spa, head_ds, zep->zb_birth,
	    &oldest_dsobj);
	if (error != 0)
		return (error);

	dsl_dataset_t *ds;
	error = dsl_dataset_hold_obj(spa->spa_dsl_pool, oldest_dsobj,
	    FTAG, &ds);
	if (error != 0)
		return (error);

	*top_affected_fs =
	    dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj;
	dsl_dataset_rele(ds, FTAG);
	return (0);
}

static int
process_error_block(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep,
    uint64_t *count, void *uaddr, boolean_t only_count)
{
	dsl_pool_t *dp = spa->spa_dsl_pool;
	dsl_pool_config_enter(dp, FTAG);
	uint64_t top_affected_fs;

	int error = find_top_affected_fs(spa, head_ds, zep, &top_affected_fs);
	if (error == 0)
		error = check_filesystem(spa, top_affected_fs, zep, count,
		    uaddr, only_count);

	dsl_pool_config_exit(dp, FTAG);
	return (error);
}

static uint64_t
get_errlog_size(spa_t *spa, uint64_t spa_err_obj)
{
	if (spa_err_obj == 0)
		return (0);
	uint64_t total = 0;

	zap_cursor_t zc;
	zap_attribute_t za;
	for (zap_cursor_init(&zc, spa->spa_meta_objset, spa_err_obj);
	    zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) {

		zap_cursor_t head_ds_cursor;
		zap_attribute_t head_ds_attr;
		zbookmark_err_phys_t head_ds_block;

		uint64_t head_ds;
		name_to_object(za.za_name, &head_ds);

		for (zap_cursor_init(&head_ds_cursor, spa->spa_meta_objset,
		    za.za_first_integer); zap_cursor_retrieve(&head_ds_cursor,
		    &head_ds_attr) == 0; zap_cursor_advance(&head_ds_cursor)) {

			name_to_errphys(head_ds_attr.za_name, &head_ds_block);
			(void) process_error_block(spa, head_ds,
			    &head_ds_block, &total, NULL, B_TRUE);
		}
		zap_cursor_fini(&head_ds_cursor);
	}
	zap_cursor_fini(&zc);
	return (total);
}

static uint64_t
get_errlist_size(spa_t *spa, avl_tree_t *tree)
{
	if (avl_numnodes(tree) == 0)
		return (0);
	uint64_t total = 0;

	spa_error_entry_t *se;
	for (se = avl_first(tree); se != NULL; se = AVL_NEXT(tree, se)) {
		zbookmark_err_phys_t zep;
		zep.zb_object = se->se_bookmark.zb_object;
		zep.zb_level = se->se_bookmark.zb_level;
		zep.zb_blkid = se->se_bookmark.zb_blkid;

		/*
		 * If we cannot find out the head dataset and birth txg of
		 * the present error block, we opt not to error out. In the
		 * next pool sync this information will be retrieved by
		 * sync_error_list() and written to the on-disk error log.
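		 * Entries we cannot resolve here simply do not contribute
		 * to the count for now.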
		 */
		uint64_t head_ds_obj;
		if (get_head_and_birth_txg(spa, &zep,
		    se->se_bookmark.zb_objset, &head_ds_obj) == 0)
			(void) process_error_block(spa, head_ds_obj, &zep,
			    &total, NULL, B_TRUE);
	}
	return (total);
}
#endif

/*
 * Return the number of errors currently in the error log. This is actually the
 * sum of both the last log and the current log, since we don't know the union
 * of these logs until we reach userland.
 */
uint64_t
spa_get_errlog_size(spa_t *spa)
{
	uint64_t total = 0;

	if (!spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) {
		mutex_enter(&spa->spa_errlog_lock);
		uint64_t count;
		if (spa->spa_errlog_scrub != 0 &&
		    zap_count(spa->spa_meta_objset, spa->spa_errlog_scrub,
		    &count) == 0)
			total += count;

		if (spa->spa_errlog_last != 0 && !spa->spa_scrub_finished &&
		    zap_count(spa->spa_meta_objset, spa->spa_errlog_last,
		    &count) == 0)
			total += count;
		mutex_exit(&spa->spa_errlog_lock);

		mutex_enter(&spa->spa_errlist_lock);
		total += avl_numnodes(&spa->spa_errlist_last);
		total += avl_numnodes(&spa->spa_errlist_scrub);
		mutex_exit(&spa->spa_errlist_lock);
	} else {
#ifdef _KERNEL
		mutex_enter(&spa->spa_errlog_lock);
		total += get_errlog_size(spa, spa->spa_errlog_last);
		total += get_errlog_size(spa, spa->spa_errlog_scrub);
		mutex_exit(&spa->spa_errlog_lock);

		mutex_enter(&spa->spa_errlist_lock);
		total += get_errlist_size(spa, &spa->spa_errlist_last);
		total += get_errlist_size(spa, &spa->spa_errlist_scrub);
		mutex_exit(&spa->spa_errlist_lock);
#endif
	}
	return (total);
}

/*
 * This function sweeps through an on-disk error log and stores all bookmarks
 * as error bookmarks in a new ZAP object. At the end we discard the old one,
 * and spa_upgrade_errlog() will set the spa's on-disk error log to the new
 * ZAP object.
 */
static void
sync_upgrade_errlog(spa_t *spa, uint64_t spa_err_obj, uint64_t *newobj,
    dmu_tx_t *tx)
{
	zap_cursor_t zc;
	zap_attribute_t za;
	zbookmark_phys_t zb;
	uint64_t count;

	*newobj = zap_create(spa->spa_meta_objset, DMU_OT_ERROR_LOG,
	    DMU_OT_NONE, 0, tx);

	/*
	 * If we cannot perform the upgrade we should clear the old on-disk
	 * error logs.
	 */
	if (zap_count(spa->spa_meta_objset, spa_err_obj, &count) != 0) {
		VERIFY0(dmu_object_free(spa->spa_meta_objset, spa_err_obj, tx));
		return;
	}

	for (zap_cursor_init(&zc, spa->spa_meta_objset, spa_err_obj);
	    zap_cursor_retrieve(&zc, &za) == 0;
	    zap_cursor_advance(&zc)) {
		if (spa_upgrade_errlog_limit != 0 &&
		    zc.zc_cd == spa_upgrade_errlog_limit)
			break;

		name_to_bookmark(za.za_name, &zb);

		zbookmark_err_phys_t zep;
		zep.zb_object = zb.zb_object;
		zep.zb_level = zb.zb_level;
		zep.zb_blkid = zb.zb_blkid;

		/*
		 * We cannot use get_head_and_birth_txg() because it will
		 * acquire the pool config lock, which we already have. In case
		 * of an error we simply continue.
		 */
		uint64_t head_dataset_obj;
		dsl_pool_t *dp = spa->spa_dsl_pool;
		dsl_dataset_t *ds;
		objset_t *os;

		int error = dsl_dataset_hold_obj(dp, zb.zb_objset, FTAG, &ds);
		if (error != 0)
			continue;

		head_dataset_obj =
		    dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj;

		/*
		 * The objset and the dnode are required for getting the block
		 * pointer, which is used to determine if BP_IS_HOLE().
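		 * The same block pointer also supplies the birth txg stored
		 * in the new entry.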
		 * If getting the objset or the dnode fails, do not create a
		 * zap entry (presuming we know the dataset) as this may create
		 * spurious errors that we cannot ever resolve. If an error is
		 * truly persistent, it should re-appear after a scan.
		 */
		if (dmu_objset_from_ds(ds, &os) != 0) {
			dsl_dataset_rele(ds, FTAG);
			continue;
		}

		dnode_t *dn;
		blkptr_t bp;

		if (dnode_hold(os, zep.zb_object, FTAG, &dn) != 0) {
			dsl_dataset_rele(ds, FTAG);
			continue;
		}

		rw_enter(&dn->dn_struct_rwlock, RW_READER);
		error = dbuf_dnode_findbp(dn, zep.zb_level, zep.zb_blkid, &bp,
		    NULL, NULL);

		zep.zb_birth = bp.blk_birth;
		rw_exit(&dn->dn_struct_rwlock);
		dnode_rele(dn, FTAG);
		dsl_dataset_rele(ds, FTAG);

		if (error != 0 || BP_IS_HOLE(&bp))
			continue;

		uint64_t err_obj;
		error = zap_lookup_int_key(spa->spa_meta_objset, *newobj,
		    head_dataset_obj, &err_obj);

		if (error == ENOENT) {
			err_obj = zap_create(spa->spa_meta_objset,
			    DMU_OT_ERROR_LOG, DMU_OT_NONE, 0, tx);

			(void) zap_update_int_key(spa->spa_meta_objset,
			    *newobj, head_dataset_obj, err_obj, tx);
		}

		char buf[64];
		errphys_to_name(&zep, buf, sizeof (buf));

		const char *name = "";
		(void) zap_update(spa->spa_meta_objset, err_obj,
		    buf, 1, strlen(name) + 1, name, tx);
	}
	zap_cursor_fini(&zc);

	VERIFY0(dmu_object_free(spa->spa_meta_objset, spa_err_obj, tx));
}

void
spa_upgrade_errlog(spa_t *spa, dmu_tx_t *tx)
{
	uint64_t newobj = 0;

	mutex_enter(&spa->spa_errlog_lock);
	if (spa->spa_errlog_last != 0) {
		sync_upgrade_errlog(spa, spa->spa_errlog_last, &newobj, tx);
		spa->spa_errlog_last = newobj;
	}

	if (spa->spa_errlog_scrub != 0) {
		sync_upgrade_errlog(spa, spa->spa_errlog_scrub, &newobj, tx);
		spa->spa_errlog_scrub = newobj;
	}
	mutex_exit(&spa->spa_errlog_lock);
}

#ifdef _KERNEL
/*
 * If an error block is shared by two datasets it will be counted twice. For
 * a detailed explanation, see spa_get_errlog_size() above.
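 * Userland is responsible for deduplicating the bookmarks it receives.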
695 */ 696 static int 697 process_error_log(spa_t *spa, uint64_t obj, void *uaddr, uint64_t *count) 698 { 699 zap_cursor_t zc; 700 zap_attribute_t za; 701 702 if (obj == 0) 703 return (0); 704 705 if (!spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) { 706 for (zap_cursor_init(&zc, spa->spa_meta_objset, obj); 707 zap_cursor_retrieve(&zc, &za) == 0; 708 zap_cursor_advance(&zc)) { 709 if (*count == 0) { 710 zap_cursor_fini(&zc); 711 return (SET_ERROR(ENOMEM)); 712 } 713 714 zbookmark_phys_t zb; 715 name_to_bookmark(za.za_name, &zb); 716 717 if (copyout(&zb, (char *)uaddr + 718 (*count - 1) * sizeof (zbookmark_phys_t), 719 sizeof (zbookmark_phys_t)) != 0) { 720 zap_cursor_fini(&zc); 721 return (SET_ERROR(EFAULT)); 722 } 723 *count -= 1; 724 725 } 726 zap_cursor_fini(&zc); 727 return (0); 728 } 729 730 for (zap_cursor_init(&zc, spa->spa_meta_objset, obj); 731 zap_cursor_retrieve(&zc, &za) == 0; 732 zap_cursor_advance(&zc)) { 733 734 zap_cursor_t head_ds_cursor; 735 zap_attribute_t head_ds_attr; 736 737 uint64_t head_ds_err_obj = za.za_first_integer; 738 uint64_t head_ds; 739 name_to_object(za.za_name, &head_ds); 740 for (zap_cursor_init(&head_ds_cursor, spa->spa_meta_objset, 741 head_ds_err_obj); zap_cursor_retrieve(&head_ds_cursor, 742 &head_ds_attr) == 0; zap_cursor_advance(&head_ds_cursor)) { 743 744 zbookmark_err_phys_t head_ds_block; 745 name_to_errphys(head_ds_attr.za_name, &head_ds_block); 746 int error = process_error_block(spa, head_ds, 747 &head_ds_block, count, uaddr, B_FALSE); 748 749 if (error != 0) { 750 zap_cursor_fini(&head_ds_cursor); 751 zap_cursor_fini(&zc); 752 return (error); 753 } 754 } 755 zap_cursor_fini(&head_ds_cursor); 756 } 757 zap_cursor_fini(&zc); 758 return (0); 759 } 760 761 static int 762 process_error_list(spa_t *spa, avl_tree_t *list, void *uaddr, uint64_t *count) 763 { 764 spa_error_entry_t *se; 765 766 if (!spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) { 767 for (se = avl_first(list); se != NULL; 768 se = AVL_NEXT(list, se)) { 769 770 if (*count == 0) 771 return (SET_ERROR(ENOMEM)); 772 773 if (copyout(&se->se_bookmark, (char *)uaddr + 774 (*count - 1) * sizeof (zbookmark_phys_t), 775 sizeof (zbookmark_phys_t)) != 0) 776 return (SET_ERROR(EFAULT)); 777 778 *count -= 1; 779 } 780 return (0); 781 } 782 783 for (se = avl_first(list); se != NULL; se = AVL_NEXT(list, se)) { 784 zbookmark_err_phys_t zep; 785 zep.zb_object = se->se_bookmark.zb_object; 786 zep.zb_level = se->se_bookmark.zb_level; 787 zep.zb_blkid = se->se_bookmark.zb_blkid; 788 789 uint64_t head_ds_obj; 790 int error = get_head_and_birth_txg(spa, &zep, 791 se->se_bookmark.zb_objset, &head_ds_obj); 792 if (error != 0) 793 return (error); 794 795 error = process_error_block(spa, head_ds_obj, &zep, count, 796 uaddr, B_FALSE); 797 if (error != 0) 798 return (error); 799 } 800 return (0); 801 } 802 #endif 803 804 /* 805 * Copy all known errors to userland as an array of bookmarks. This is 806 * actually a union of the on-disk last log and current log, as well as any 807 * pending error requests. 808 * 809 * Because the act of reading the on-disk log could cause errors to be 810 * generated, we have two separate locks: one for the error log and one for the 811 * in-core error lists. We only need the error list lock to log and error, so 812 * we grab the error log lock while we read the on-disk logs, and only pick up 813 * the error list lock when we are finished. 
814 */ 815 int 816 spa_get_errlog(spa_t *spa, void *uaddr, uint64_t *count) 817 { 818 int ret = 0; 819 820 #ifdef _KERNEL 821 mutex_enter(&spa->spa_errlog_lock); 822 823 ret = process_error_log(spa, spa->spa_errlog_scrub, uaddr, count); 824 825 if (!ret && !spa->spa_scrub_finished) 826 ret = process_error_log(spa, spa->spa_errlog_last, uaddr, 827 count); 828 829 mutex_enter(&spa->spa_errlist_lock); 830 if (!ret) 831 ret = process_error_list(spa, &spa->spa_errlist_scrub, uaddr, 832 count); 833 if (!ret) 834 ret = process_error_list(spa, &spa->spa_errlist_last, uaddr, 835 count); 836 mutex_exit(&spa->spa_errlist_lock); 837 838 mutex_exit(&spa->spa_errlog_lock); 839 #else 840 (void) spa, (void) uaddr, (void) count; 841 #endif 842 843 return (ret); 844 } 845 846 /* 847 * Called when a scrub completes. This simply set a bit which tells which AVL 848 * tree to add new errors. spa_errlog_sync() is responsible for actually 849 * syncing the changes to the underlying objects. 850 */ 851 void 852 spa_errlog_rotate(spa_t *spa) 853 { 854 mutex_enter(&spa->spa_errlist_lock); 855 spa->spa_scrub_finished = B_TRUE; 856 mutex_exit(&spa->spa_errlist_lock); 857 } 858 859 /* 860 * Discard any pending errors from the spa_t. Called when unloading a faulted 861 * pool, as the errors encountered during the open cannot be synced to disk. 862 */ 863 void 864 spa_errlog_drain(spa_t *spa) 865 { 866 spa_error_entry_t *se; 867 void *cookie; 868 869 mutex_enter(&spa->spa_errlist_lock); 870 871 cookie = NULL; 872 while ((se = avl_destroy_nodes(&spa->spa_errlist_last, 873 &cookie)) != NULL) 874 kmem_free(se, sizeof (spa_error_entry_t)); 875 cookie = NULL; 876 while ((se = avl_destroy_nodes(&spa->spa_errlist_scrub, 877 &cookie)) != NULL) 878 kmem_free(se, sizeof (spa_error_entry_t)); 879 880 mutex_exit(&spa->spa_errlist_lock); 881 } 882 883 /* 884 * Process a list of errors into the current on-disk log. 885 */ 886 void 887 sync_error_list(spa_t *spa, avl_tree_t *t, uint64_t *obj, dmu_tx_t *tx) 888 { 889 spa_error_entry_t *se; 890 char buf[64]; 891 void *cookie; 892 893 if (avl_numnodes(t) == 0) 894 return; 895 896 /* create log if necessary */ 897 if (*obj == 0) 898 *obj = zap_create(spa->spa_meta_objset, DMU_OT_ERROR_LOG, 899 DMU_OT_NONE, 0, tx); 900 901 /* add errors to the current log */ 902 if (!spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) { 903 for (se = avl_first(t); se != NULL; se = AVL_NEXT(t, se)) { 904 bookmark_to_name(&se->se_bookmark, buf, sizeof (buf)); 905 906 const char *name = se->se_name ? se->se_name : ""; 907 (void) zap_update(spa->spa_meta_objset, *obj, buf, 1, 908 strlen(name) + 1, name, tx); 909 } 910 } else { 911 for (se = avl_first(t); se != NULL; se = AVL_NEXT(t, se)) { 912 zbookmark_err_phys_t zep; 913 zep.zb_object = se->se_bookmark.zb_object; 914 zep.zb_level = se->se_bookmark.zb_level; 915 zep.zb_blkid = se->se_bookmark.zb_blkid; 916 917 /* 918 * If we cannot find out the head dataset and birth txg 919 * of the present error block, we simply continue. 920 * Reinserting that error block to the error lists, 921 * even if we are not syncing the final txg, results 922 * in duplicate posting of errors. 
923 */ 924 uint64_t head_dataset_obj; 925 int error = get_head_and_birth_txg(spa, &zep, 926 se->se_bookmark.zb_objset, &head_dataset_obj); 927 if (error != 0) 928 continue; 929 930 uint64_t err_obj; 931 error = zap_lookup_int_key(spa->spa_meta_objset, 932 *obj, head_dataset_obj, &err_obj); 933 934 if (error == ENOENT) { 935 err_obj = zap_create(spa->spa_meta_objset, 936 DMU_OT_ERROR_LOG, DMU_OT_NONE, 0, tx); 937 938 (void) zap_update_int_key(spa->spa_meta_objset, 939 *obj, head_dataset_obj, err_obj, tx); 940 } 941 errphys_to_name(&zep, buf, sizeof (buf)); 942 943 const char *name = se->se_name ? se->se_name : ""; 944 (void) zap_update(spa->spa_meta_objset, 945 err_obj, buf, 1, strlen(name) + 1, name, tx); 946 } 947 } 948 /* purge the error list */ 949 cookie = NULL; 950 while ((se = avl_destroy_nodes(t, &cookie)) != NULL) 951 kmem_free(se, sizeof (spa_error_entry_t)); 952 } 953 954 static void 955 delete_errlog(spa_t *spa, uint64_t spa_err_obj, dmu_tx_t *tx) 956 { 957 if (spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) { 958 zap_cursor_t zc; 959 zap_attribute_t za; 960 for (zap_cursor_init(&zc, spa->spa_meta_objset, spa_err_obj); 961 zap_cursor_retrieve(&zc, &za) == 0; 962 zap_cursor_advance(&zc)) { 963 VERIFY0(dmu_object_free(spa->spa_meta_objset, 964 za.za_first_integer, tx)); 965 } 966 zap_cursor_fini(&zc); 967 } 968 VERIFY0(dmu_object_free(spa->spa_meta_objset, spa_err_obj, tx)); 969 } 970 971 /* 972 * Sync the error log out to disk. This is a little tricky because the act of 973 * writing the error log requires the spa_errlist_lock. So, we need to lock the 974 * error lists, take a copy of the lists, and then reinitialize them. Then, we 975 * drop the error list lock and take the error log lock, at which point we 976 * do the errlog processing. Then, if we encounter an I/O error during this 977 * process, we can successfully add the error to the list. Note that this will 978 * result in the perpetual recycling of errors, but it is an unlikely situation 979 * and not a performance critical operation. 980 */ 981 void 982 spa_errlog_sync(spa_t *spa, uint64_t txg) 983 { 984 dmu_tx_t *tx; 985 avl_tree_t scrub, last; 986 int scrub_finished; 987 988 mutex_enter(&spa->spa_errlist_lock); 989 990 /* 991 * Bail out early under normal circumstances. 992 */ 993 if (avl_numnodes(&spa->spa_errlist_scrub) == 0 && 994 avl_numnodes(&spa->spa_errlist_last) == 0 && 995 !spa->spa_scrub_finished) { 996 mutex_exit(&spa->spa_errlist_lock); 997 return; 998 } 999 1000 spa_get_errlists(spa, &last, &scrub); 1001 scrub_finished = spa->spa_scrub_finished; 1002 spa->spa_scrub_finished = B_FALSE; 1003 1004 mutex_exit(&spa->spa_errlist_lock); 1005 mutex_enter(&spa->spa_errlog_lock); 1006 1007 tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); 1008 1009 /* 1010 * Sync out the current list of errors. 1011 */ 1012 sync_error_list(spa, &last, &spa->spa_errlog_last, tx); 1013 1014 /* 1015 * Rotate the log if necessary. 1016 */ 1017 if (scrub_finished) { 1018 if (spa->spa_errlog_last != 0) 1019 delete_errlog(spa, spa->spa_errlog_last, tx); 1020 spa->spa_errlog_last = spa->spa_errlog_scrub; 1021 spa->spa_errlog_scrub = 0; 1022 1023 sync_error_list(spa, &scrub, &spa->spa_errlog_last, tx); 1024 } 1025 1026 /* 1027 * Sync out any pending scrub errors. 1028 */ 1029 sync_error_list(spa, &scrub, &spa->spa_errlog_scrub, tx); 1030 1031 /* 1032 * Update the MOS to reflect the new values. 
1033 */ 1034 (void) zap_update(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 1035 DMU_POOL_ERRLOG_LAST, sizeof (uint64_t), 1, 1036 &spa->spa_errlog_last, tx); 1037 (void) zap_update(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 1038 DMU_POOL_ERRLOG_SCRUB, sizeof (uint64_t), 1, 1039 &spa->spa_errlog_scrub, tx); 1040 1041 dmu_tx_commit(tx); 1042 1043 mutex_exit(&spa->spa_errlog_lock); 1044 } 1045 1046 static void 1047 delete_dataset_errlog(spa_t *spa, uint64_t spa_err_obj, uint64_t ds, 1048 dmu_tx_t *tx) 1049 { 1050 if (spa_err_obj == 0) 1051 return; 1052 1053 zap_cursor_t zc; 1054 zap_attribute_t za; 1055 for (zap_cursor_init(&zc, spa->spa_meta_objset, spa_err_obj); 1056 zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) { 1057 uint64_t head_ds; 1058 name_to_object(za.za_name, &head_ds); 1059 if (head_ds == ds) { 1060 (void) zap_remove(spa->spa_meta_objset, spa_err_obj, 1061 za.za_name, tx); 1062 VERIFY0(dmu_object_free(spa->spa_meta_objset, 1063 za.za_first_integer, tx)); 1064 break; 1065 } 1066 } 1067 zap_cursor_fini(&zc); 1068 } 1069 1070 void 1071 spa_delete_dataset_errlog(spa_t *spa, uint64_t ds, dmu_tx_t *tx) 1072 { 1073 mutex_enter(&spa->spa_errlog_lock); 1074 delete_dataset_errlog(spa, spa->spa_errlog_scrub, ds, tx); 1075 delete_dataset_errlog(spa, spa->spa_errlog_last, ds, tx); 1076 mutex_exit(&spa->spa_errlog_lock); 1077 } 1078 1079 static int 1080 find_txg_ancestor_snapshot(spa_t *spa, uint64_t new_head, uint64_t old_head, 1081 uint64_t *txg) 1082 { 1083 dsl_dataset_t *ds; 1084 dsl_pool_t *dp = spa->spa_dsl_pool; 1085 1086 int error = dsl_dataset_hold_obj(dp, old_head, FTAG, &ds); 1087 if (error != 0) 1088 return (error); 1089 1090 uint64_t prev_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; 1091 uint64_t prev_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg; 1092 1093 while (prev_obj != 0) { 1094 dsl_dataset_rele(ds, FTAG); 1095 if ((error = dsl_dataset_hold_obj(dp, prev_obj, 1096 FTAG, &ds)) == 0 && 1097 dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj == new_head) 1098 break; 1099 1100 if (error != 0) 1101 return (error); 1102 1103 prev_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg; 1104 prev_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; 1105 } 1106 dsl_dataset_rele(ds, FTAG); 1107 ASSERT(prev_obj != 0); 1108 *txg = prev_obj_txg; 1109 return (0); 1110 } 1111 1112 static void 1113 swap_errlog(spa_t *spa, uint64_t spa_err_obj, uint64_t new_head, uint64_t 1114 old_head, dmu_tx_t *tx) 1115 { 1116 if (spa_err_obj == 0) 1117 return; 1118 1119 uint64_t old_head_errlog; 1120 int error = zap_lookup_int_key(spa->spa_meta_objset, spa_err_obj, 1121 old_head, &old_head_errlog); 1122 1123 /* If no error log, then there is nothing to do. */ 1124 if (error != 0) 1125 return; 1126 1127 uint64_t txg; 1128 error = find_txg_ancestor_snapshot(spa, new_head, old_head, &txg); 1129 if (error != 0) 1130 return; 1131 1132 /* 1133 * Create an error log if the file system being promoted does not 1134 * already have one. 
1135 */ 1136 uint64_t new_head_errlog; 1137 error = zap_lookup_int_key(spa->spa_meta_objset, spa_err_obj, new_head, 1138 &new_head_errlog); 1139 1140 if (error != 0) { 1141 new_head_errlog = zap_create(spa->spa_meta_objset, 1142 DMU_OT_ERROR_LOG, DMU_OT_NONE, 0, tx); 1143 1144 (void) zap_update_int_key(spa->spa_meta_objset, spa_err_obj, 1145 new_head, new_head_errlog, tx); 1146 } 1147 1148 zap_cursor_t zc; 1149 zap_attribute_t za; 1150 zbookmark_err_phys_t err_block; 1151 for (zap_cursor_init(&zc, spa->spa_meta_objset, old_head_errlog); 1152 zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) { 1153 1154 const char *name = ""; 1155 name_to_errphys(za.za_name, &err_block); 1156 if (err_block.zb_birth < txg) { 1157 (void) zap_update(spa->spa_meta_objset, new_head_errlog, 1158 za.za_name, 1, strlen(name) + 1, name, tx); 1159 1160 (void) zap_remove(spa->spa_meta_objset, old_head_errlog, 1161 za.za_name, tx); 1162 } 1163 } 1164 zap_cursor_fini(&zc); 1165 } 1166 1167 void 1168 spa_swap_errlog(spa_t *spa, uint64_t new_head_ds, uint64_t old_head_ds, 1169 dmu_tx_t *tx) 1170 { 1171 mutex_enter(&spa->spa_errlog_lock); 1172 swap_errlog(spa, spa->spa_errlog_scrub, new_head_ds, old_head_ds, tx); 1173 swap_errlog(spa, spa->spa_errlog_last, new_head_ds, old_head_ds, tx); 1174 mutex_exit(&spa->spa_errlog_lock); 1175 } 1176 1177 #if defined(_KERNEL) 1178 /* error handling */ 1179 EXPORT_SYMBOL(spa_log_error); 1180 EXPORT_SYMBOL(spa_get_errlog_size); 1181 EXPORT_SYMBOL(spa_get_errlog); 1182 EXPORT_SYMBOL(spa_errlog_rotate); 1183 EXPORT_SYMBOL(spa_errlog_drain); 1184 EXPORT_SYMBOL(spa_errlog_sync); 1185 EXPORT_SYMBOL(spa_get_errlists); 1186 EXPORT_SYMBOL(spa_delete_dataset_errlog); 1187 EXPORT_SYMBOL(spa_swap_errlog); 1188 EXPORT_SYMBOL(sync_error_list); 1189 EXPORT_SYMBOL(spa_upgrade_errlog); 1190 #endif 1191 1192 /* BEGIN CSTYLED */ 1193 ZFS_MODULE_PARAM(zfs_spa, spa_, upgrade_errlog_limit, INT, ZMOD_RW, 1194 "Limit the number of errors which will be upgraded to the new " 1195 "on-disk error log when enabling head_errlog"); 1196 /* END CSTYLED */ 1197