1 /*- 2 * Copyright (c) 2011, 2012, 2013, 2014, 2016 Spectra Logic Corporation 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions, and the following disclaimer, 10 * without modification. 11 * 2. Redistributions in binary form must reproduce at minimum a disclaimer 12 * substantially similar to the "NO WARRANTY" disclaimer below 13 * ("Disclaimer") and any redistribution must be conditioned upon 14 * including a substantially similar Disclaimer requirement for further 15 * binary redistribution. 16 * 17 * NO WARRANTY 18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR 21 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 22 * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 24 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 25 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 26 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING 27 * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 28 * POSSIBILITY OF SUCH DAMAGES. 29 * 30 * Authors: Justin T. Gibbs (Spectra Logic Corporation) 31 */ 32 33 /** 34 * \file case_file.cc 35 * 36 * We keep case files for any leaf vdev that is not in the optimal state. 37 * However, we only serialize to disk those events that need to be preserved 38 * across reboots. For now, this is just a log of soft errors which we 39 * accumulate in order to mark a device as degraded. 40 */ 41 #include <sys/cdefs.h> 42 #include <sys/byteorder.h> 43 #include <sys/time.h> 44 45 #include <sys/fs/zfs.h> 46 47 #include <dirent.h> 48 #include <fcntl.h> 49 #include <iomanip> 50 #include <fstream> 51 #include <functional> 52 #include <sstream> 53 #include <syslog.h> 54 #include <unistd.h> 55 56 #include <libzutil.h> 57 #include <libzfs.h> 58 59 #include <list> 60 #include <map> 61 #include <string> 62 63 #include <devdctl/guid.h> 64 #include <devdctl/event.h> 65 #include <devdctl/event_factory.h> 66 #include <devdctl/exception.h> 67 #include <devdctl/consumer.h> 68 69 #include "callout.h" 70 #include "vdev_iterator.h" 71 #include "zfsd_event.h" 72 #include "case_file.h" 73 #include "vdev.h" 74 #include "zfsd.h" 75 #include "zfsd_exception.h" 76 #include "zpool_list.h" 77 /*============================ Namespace Control =============================*/ 78 using std::hex; 79 using std::ifstream; 80 using std::stringstream; 81 using std::setfill; 82 using std::setw; 83 84 using DevdCtl::Event; 85 using DevdCtl::EventFactory; 86 using DevdCtl::EventList; 87 using DevdCtl::Guid; 88 using DevdCtl::ParseException; 89 90 /*--------------------------------- CaseFile ---------------------------------*/ 91 //- CaseFile Static Data ------------------------------------------------------- 92 93 CaseFileList CaseFile::s_activeCases; 94 const string CaseFile::s_caseFilePath = "/var/db/zfsd/cases"; 95 96 //- CaseFile Static Public Methods --------------------------------------------- 97 CaseFile * 98 CaseFile::Find(Guid poolGUID, Guid vdevGUID) 99 { 100 for (CaseFileList::iterator curCase = s_activeCases.begin(); 101 curCase != s_activeCases.end(); curCase++) { 102 103 if (((*curCase)->PoolGUID() != poolGUID 104 && Guid::InvalidGuid() != poolGUID) 105 || (*curCase)->VdevGUID() != vdevGUID) 106 continue; 107 108 /* 109 * We only carry one active case per-vdev. 110 */ 111 return (*curCase); 112 } 113 return (NULL); 114 } 115 116 void 117 CaseFile::Find(Guid poolGUID, Guid vdevGUID, CaseFileList &cases) 118 { 119 for (CaseFileList::iterator curCase = s_activeCases.begin(); 120 curCase != s_activeCases.end(); curCase++) { 121 if (((*curCase)->PoolGUID() != poolGUID && 122 Guid::InvalidGuid() != poolGUID) || 123 (*curCase)->VdevGUID() != vdevGUID) 124 continue; 125 126 /* 127 * We can have multiple cases for spare vdevs 128 */ 129 cases.push_back(*curCase); 130 if (!(*curCase)->IsSpare()) { 131 return; 132 } 133 } 134 } 135 136 CaseFile * 137 CaseFile::Find(const string &physPath) 138 { 139 CaseFile *result = NULL; 140 141 for (CaseFileList::iterator curCase = s_activeCases.begin(); 142 curCase != s_activeCases.end(); curCase++) { 143 144 if ((*curCase)->PhysicalPath() != physPath) 145 continue; 146 147 if (result != NULL) { 148 syslog(LOG_WARNING, "Multiple casefiles found for " 149 "physical path %s. " 150 "This is most likely a bug in zfsd", 151 physPath.c_str()); 152 } 153 result = *curCase; 154 } 155 return (result); 156 } 157 158 159 void 160 CaseFile::ReEvaluateByGuid(Guid poolGUID, const ZfsEvent &event) 161 { 162 CaseFileList::iterator casefile; 163 for (casefile = s_activeCases.begin(); casefile != s_activeCases.end();){ 164 CaseFileList::iterator next = casefile; 165 next++; 166 if (poolGUID == (*casefile)->PoolGUID()) 167 (*casefile)->ReEvaluate(event); 168 casefile = next; 169 } 170 } 171 172 CaseFile & 173 CaseFile::Create(Vdev &vdev) 174 { 175 CaseFile *activeCase; 176 177 activeCase = Find(vdev.PoolGUID(), vdev.GUID()); 178 if (activeCase == NULL) 179 activeCase = new CaseFile(vdev); 180 181 return (*activeCase); 182 } 183 184 void 185 CaseFile::DeSerialize() 186 { 187 struct dirent **caseFiles; 188 189 int numCaseFiles(scandir(s_caseFilePath.c_str(), &caseFiles, 190 DeSerializeSelector, /*compar*/NULL)); 191 192 if (numCaseFiles == -1) 193 return; 194 if (numCaseFiles == 0) { 195 free(caseFiles); 196 return; 197 } 198 199 for (int i = 0; i < numCaseFiles; i++) { 200 201 DeSerializeFile(caseFiles[i]->d_name); 202 free(caseFiles[i]); 203 } 204 free(caseFiles); 205 } 206 207 bool 208 CaseFile::Empty() 209 { 210 return (s_activeCases.empty()); 211 } 212 213 void 214 CaseFile::LogAll() 215 { 216 for (CaseFileList::iterator curCase = s_activeCases.begin(); 217 curCase != s_activeCases.end(); curCase++) 218 (*curCase)->Log(); 219 } 220 221 void 222 CaseFile::PurgeAll() 223 { 224 /* 225 * Serialize casefiles before deleting them so that they can be reread 226 * and revalidated during BuildCaseFiles. 227 * CaseFiles remove themselves from this list on destruction. 228 */ 229 while (s_activeCases.size() != 0) { 230 CaseFile *casefile = s_activeCases.front(); 231 casefile->Serialize(); 232 delete casefile; 233 } 234 235 } 236 237 int 238 CaseFile::IsSpare() 239 { 240 return (m_is_spare); 241 } 242 243 //- CaseFile Public Methods ---------------------------------------------------- 244 bool 245 CaseFile::RefreshVdevState() 246 { 247 ZpoolList zpl(ZpoolList::ZpoolByGUID, &m_poolGUID); 248 zpool_handle_t *casePool(zpl.empty() ? NULL : zpl.front()); 249 if (casePool == NULL) 250 return (false); 251 252 Vdev vd(casePool, CaseVdev(casePool)); 253 if (vd.DoesNotExist()) 254 return (false); 255 256 m_vdevState = vd.State(); 257 m_vdevPhysPath = vd.PhysicalPath(); 258 m_vdevName = vd.Name(casePool, false); 259 return (true); 260 } 261 262 bool 263 CaseFile::ReEvaluate(const string &devPath, const string &physPath, Vdev *vdev) 264 { 265 ZpoolList zpl(ZpoolList::ZpoolByGUID, &m_poolGUID); 266 zpool_handle_t *pool(zpl.empty() ? NULL : zpl.front()); 267 int flags = ZFS_ONLINE_CHECKREMOVE | ZFS_ONLINE_UNSPARE; 268 269 if (pool == NULL || !RefreshVdevState()) { 270 /* 271 * The pool or vdev for this case file is no longer 272 * part of the configuration. This can happen 273 * if we process a device arrival notification 274 * before seeing the ZFS configuration change 275 * event. 276 */ 277 syslog(LOG_INFO, 278 "CaseFile::ReEvaluate(%s,%s) Pool/Vdev unconfigured. " 279 "Closing\n", 280 PoolGUIDString().c_str(), 281 VdevGUIDString().c_str()); 282 Close(); 283 284 /* 285 * Since this event was not used to close this 286 * case, do not report it as consumed. 287 */ 288 return (/*consumed*/false); 289 } 290 291 if (VdevState() > VDEV_STATE_FAULTED) { 292 /* 293 * For now, newly discovered devices only help for 294 * devices that are missing. In the future, we might 295 * use a newly inserted spare to replace a degraded 296 * or faulted device. 297 */ 298 syslog(LOG_INFO, "CaseFile::ReEvaluate(%s,%s): Pool/Vdev ignored", 299 PoolGUIDString().c_str(), VdevGUIDString().c_str()); 300 return (/*consumed*/false); 301 } 302 if (VdevState() == VDEV_STATE_OFFLINE) { 303 /* 304 * OFFLINE is an administrative decision. No need for zfsd to 305 * do anything. 306 */ 307 syslog(LOG_INFO, "CaseFile::ReEvaluate(%s,%s): Pool/Vdev ignored", 308 PoolGUIDString().c_str(), VdevGUIDString().c_str()); 309 return (/*consumed*/false); 310 } 311 312 if (vdev != NULL 313 && ( vdev->PoolGUID() == m_poolGUID 314 || vdev->PoolGUID() == Guid::InvalidGuid()) 315 && vdev->GUID() == m_vdevGUID) { 316 317 if (IsSpare()) 318 flags |= ZFS_ONLINE_SPARE; 319 if (zpool_vdev_online(pool, vdev->GUIDString().c_str(), 320 flags, &m_vdevState) != 0) { 321 syslog(LOG_ERR, 322 "Failed to online vdev(%s/%s:%s): %s: %s\n", 323 zpool_get_name(pool), vdev->GUIDString().c_str(), 324 devPath.c_str(), libzfs_error_action(g_zfsHandle), 325 libzfs_error_description(g_zfsHandle)); 326 return (/*consumed*/false); 327 } 328 329 syslog(LOG_INFO, "Onlined vdev(%s/%s:%s). State now %s.\n", 330 zpool_get_name(pool), vdev->GUIDString().c_str(), 331 devPath.c_str(), 332 zpool_state_to_name(VdevState(), VDEV_AUX_NONE)); 333 334 /* 335 * Check the vdev state post the online action to see 336 * if we can retire this case. 337 */ 338 CloseIfSolved(); 339 340 return (/*consumed*/true); 341 } 342 343 /* 344 * If the auto-replace policy is enabled, and we have physical 345 * path information, try a physical path replacement. 346 */ 347 if (zpool_get_prop_int(pool, ZPOOL_PROP_AUTOREPLACE, NULL) == 0) { 348 syslog(LOG_INFO, 349 "CaseFile(%s:%s:%s): AutoReplace not set. " 350 "Ignoring device insertion.\n", 351 PoolGUIDString().c_str(), 352 VdevGUIDString().c_str(), 353 zpool_state_to_name(VdevState(), VDEV_AUX_NONE)); 354 return (/*consumed*/false); 355 } 356 357 if (PhysicalPath().empty()) { 358 syslog(LOG_INFO, 359 "CaseFile(%s:%s:%s): No physical path information. " 360 "Ignoring device insertion.\n", 361 PoolGUIDString().c_str(), 362 VdevGUIDString().c_str(), 363 zpool_state_to_name(VdevState(), VDEV_AUX_NONE)); 364 return (/*consumed*/false); 365 } 366 367 if (physPath != PhysicalPath()) { 368 syslog(LOG_INFO, 369 "CaseFile(%s:%s:%s): Physical path mismatch. " 370 "Ignoring device insertion.\n", 371 PoolGUIDString().c_str(), 372 VdevGUIDString().c_str(), 373 zpool_state_to_name(VdevState(), VDEV_AUX_NONE)); 374 return (/*consumed*/false); 375 } 376 377 /* Write a label on the newly inserted disk. */ 378 if (zpool_label_disk(g_zfsHandle, pool, devPath.c_str()) != 0) { 379 syslog(LOG_ERR, 380 "Replace vdev(%s/%s) by physical path (label): %s: %s\n", 381 zpool_get_name(pool), VdevGUIDString().c_str(), 382 libzfs_error_action(g_zfsHandle), 383 libzfs_error_description(g_zfsHandle)); 384 return (/*consumed*/false); 385 } 386 387 syslog(LOG_INFO, "CaseFile::ReEvaluate(%s/%s): Replacing with %s", 388 PoolGUIDString().c_str(), VdevGUIDString().c_str(), 389 devPath.c_str()); 390 return (Replace(VDEV_TYPE_DISK, devPath.c_str(), /*isspare*/false)); 391 } 392 393 bool 394 CaseFile::ReEvaluate(const ZfsEvent &event) 395 { 396 bool consumed(false); 397 398 if (event.Value("type") == "sysevent.fs.zfs.vdev_remove") { 399 /* 400 * The Vdev we represent has been removed from the 401 * configuration. This case is no longer of value. 402 */ 403 Close(); 404 405 return (/*consumed*/true); 406 } else if (event.Value("type") == "sysevent.fs.zfs.pool_destroy") { 407 /* This Pool has been destroyed. Discard the case */ 408 Close(); 409 410 return (/*consumed*/true); 411 } else if (event.Value("type") == "sysevent.fs.zfs.config_sync") { 412 RefreshVdevState(); 413 if (VdevState() < VDEV_STATE_HEALTHY && 414 VdevState() != VDEV_STATE_OFFLINE) 415 consumed = ActivateSpare(); 416 } 417 418 419 if (event.Value("class") == "resource.fs.zfs.removed") { 420 bool spare_activated; 421 422 if (!RefreshVdevState()) { 423 /* 424 * The pool or vdev for this case file is no longer 425 * part of the configuration. This can happen 426 * if we process a device arrival notification 427 * before seeing the ZFS configuration change 428 * event. 429 */ 430 syslog(LOG_INFO, 431 "CaseFile::ReEvaluate(%s,%s) Pool/Vdev " 432 "unconfigured. Closing\n", 433 PoolGUIDString().c_str(), 434 VdevGUIDString().c_str()); 435 /* 436 * Close the case now so we won't waste cycles in the 437 * system rescan 438 */ 439 Close(); 440 441 /* 442 * Since this event was not used to close this 443 * case, do not report it as consumed. 444 */ 445 return (/*consumed*/false); 446 } 447 448 /* 449 * Discard any tentative I/O error events for 450 * this case. They were most likely caused by the 451 * hot-unplug of this device. 452 */ 453 PurgeTentativeEvents(); 454 455 /* Try to activate spares if they are available */ 456 spare_activated = ActivateSpare(); 457 458 /* 459 * Rescan the drives in the system to see if a recent 460 * drive arrival can be used to solve this case. 461 */ 462 ZfsDaemon::RequestSystemRescan(); 463 464 /* 465 * Consume the event if we successfully activated a spare. 466 * Otherwise, leave it in the unconsumed events list so that the 467 * future addition of a spare to this pool might be able to 468 * close the case 469 */ 470 consumed = spare_activated; 471 } else if (event.Value("class") == "resource.fs.zfs.statechange") { 472 RefreshVdevState(); 473 /* 474 * If this vdev is DEGRADED, FAULTED, or UNAVAIL, try to 475 * activate a hotspare. Otherwise, ignore the event 476 */ 477 if (VdevState() == VDEV_STATE_FAULTED || 478 VdevState() == VDEV_STATE_DEGRADED || 479 VdevState() == VDEV_STATE_CANT_OPEN) 480 (void) ActivateSpare(); 481 consumed = true; 482 } 483 else if (event.Value("class") == "ereport.fs.zfs.io" || 484 event.Value("class") == "ereport.fs.zfs.checksum" || 485 event.Value("class") == "ereport.fs.zfs.delay") { 486 487 m_tentativeEvents.push_front(event.DeepCopy()); 488 RegisterCallout(event); 489 consumed = true; 490 } 491 492 bool closed(CloseIfSolved()); 493 494 return (consumed || closed); 495 } 496 497 /* Find a Vdev containing the vdev with the given GUID */ 498 static nvlist_t* 499 find_parent(nvlist_t *pool_config, nvlist_t *config, DevdCtl::Guid child_guid) 500 { 501 nvlist_t **vdevChildren; 502 int error; 503 unsigned ch, numChildren; 504 505 error = nvlist_lookup_nvlist_array(config, ZPOOL_CONFIG_CHILDREN, 506 &vdevChildren, &numChildren); 507 508 if (error != 0 || numChildren == 0) 509 return (NULL); 510 511 for (ch = 0; ch < numChildren; ch++) { 512 nvlist *result; 513 Vdev vdev(pool_config, vdevChildren[ch]); 514 515 if (vdev.GUID() == child_guid) 516 return (config); 517 518 result = find_parent(pool_config, vdevChildren[ch], child_guid); 519 if (result != NULL) 520 return (result); 521 } 522 523 return (NULL); 524 } 525 526 bool 527 CaseFile::ActivateSpare() { 528 nvlist_t *config, *nvroot, *parent_config; 529 nvlist_t **spares; 530 const char *devPath, *poolname, *vdev_type; 531 u_int nspares, i; 532 int error; 533 534 ZpoolList zpl(ZpoolList::ZpoolByGUID, &m_poolGUID); 535 zpool_handle_t *zhp(zpl.empty() ? NULL : zpl.front()); 536 if (zhp == NULL) { 537 syslog(LOG_ERR, "CaseFile::ActivateSpare: Could not find pool " 538 "for pool_guid %" PRIu64".", (uint64_t)m_poolGUID); 539 return (false); 540 } 541 poolname = zpool_get_name(zhp); 542 config = zpool_get_config(zhp, NULL); 543 if (config == NULL) { 544 syslog(LOG_ERR, "CaseFile::ActivateSpare: Could not find pool " 545 "config for pool %s", poolname); 546 return (false); 547 } 548 error = nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot); 549 if (error != 0){ 550 syslog(LOG_ERR, "CaseFile::ActivateSpare: Could not find vdev " 551 "tree for pool %s", poolname); 552 return (false); 553 } 554 555 parent_config = find_parent(config, nvroot, m_vdevGUID); 556 if (parent_config != NULL) { 557 const char *parent_type; 558 559 /* 560 * Don't activate spares for members of a "replacing" vdev. 561 * They're already dealt with. Sparing them will just drag out 562 * the resilver process. 563 */ 564 error = nvlist_lookup_string(parent_config, 565 ZPOOL_CONFIG_TYPE, &parent_type); 566 if (error == 0 && strcmp(parent_type, VDEV_TYPE_REPLACING) == 0) 567 return (false); 568 } 569 570 nspares = 0; 571 nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, 572 &nspares); 573 if (nspares == 0) { 574 /* The pool has no spares configured */ 575 syslog(LOG_INFO, "CaseFile::ActivateSpare: " 576 "No spares available for pool %s", poolname); 577 return (false); 578 } 579 for (i = 0; i < nspares; i++) { 580 uint64_t *nvlist_array; 581 vdev_stat_t *vs; 582 uint_t nstats; 583 584 if (nvlist_lookup_uint64_array(spares[i], 585 ZPOOL_CONFIG_VDEV_STATS, &nvlist_array, &nstats) != 0) { 586 syslog(LOG_ERR, "CaseFile::ActivateSpare: Could not " 587 "find vdev stats for pool %s, spare %d", 588 poolname, i); 589 return (false); 590 } 591 vs = reinterpret_cast<vdev_stat_t *>(nvlist_array); 592 593 if ((vs->vs_aux != VDEV_AUX_SPARED) 594 && (vs->vs_state == VDEV_STATE_HEALTHY)) { 595 /* We found a usable spare */ 596 break; 597 } 598 } 599 600 if (i == nspares) { 601 /* No available spares were found */ 602 return (false); 603 } 604 605 error = nvlist_lookup_string(spares[i], ZPOOL_CONFIG_PATH, &devPath); 606 if (error != 0) { 607 syslog(LOG_ERR, "CaseFile::ActivateSpare: Cannot determine " 608 "the path of pool %s, spare %d. Error %d", 609 poolname, i, error); 610 return (false); 611 } 612 613 error = nvlist_lookup_string(spares[i], ZPOOL_CONFIG_TYPE, &vdev_type); 614 if (error != 0) { 615 syslog(LOG_ERR, "CaseFile::ActivateSpare: Cannot determine " 616 "the vdev type of pool %s, spare %d. Error %d", 617 poolname, i, error); 618 return (false); 619 } 620 621 return (Replace(vdev_type, devPath, /*isspare*/true)); 622 } 623 624 /* Does the argument event refer to a checksum error? */ 625 static bool 626 IsChecksumEvent(const Event* const event) 627 { 628 return ("ereport.fs.zfs.checksum" == event->Value("type")); 629 } 630 631 /* Does the argument event refer to an IO error? */ 632 static bool 633 IsIOEvent(const Event* const event) 634 { 635 return ("ereport.fs.zfs.io" == event->Value("type")); 636 } 637 638 /* Does the argument event refer to an IO delay? */ 639 static bool 640 IsDelayEvent(const Event* const event) 641 { 642 return ("ereport.fs.zfs.delay" == event->Value("type")); 643 } 644 645 void 646 CaseFile::RegisterCallout(const Event &event) 647 { 648 timeval now, countdown, elapsed, timestamp, zero, remaining; 649 /** 650 * The time ZFSD waits before promoting a tentative event 651 * into a permanent event. 652 */ 653 int sec = -1; 654 if (IsChecksumEvent(&event)) 655 sec = CaseFile::GetVdevProp(VDEV_PROP_CHECKSUM_T); 656 else if (IsIOEvent(&event)) 657 sec = CaseFile::GetVdevProp(VDEV_PROP_IO_T); 658 else if (IsDelayEvent(&event)) 659 sec = CaseFile::GetVdevProp(VDEV_PROP_SLOW_IO_T); 660 661 if (sec == -1) 662 sec = 60; /* default */ 663 664 timeval removeGracePeriod = { 665 sec, /*sec*/ 666 0 /*usec*/ 667 }; 668 669 gettimeofday(&now, 0); 670 timestamp = event.GetTimestamp(); 671 timersub(&now, ×tamp, &elapsed); 672 timersub(&removeGracePeriod, &elapsed, &countdown); 673 /* 674 * If countdown is <= zero, Reset the timer to the 675 * smallest positive time value instead 676 */ 677 timerclear(&zero); 678 if (timercmp(&countdown, &zero, <=)) { 679 timerclear(&countdown); 680 countdown.tv_usec = 1; 681 } 682 683 remaining = m_tentativeTimer.TimeRemaining(); 684 685 if (!m_tentativeTimer.IsPending() 686 || timercmp(&countdown, &remaining, <)) 687 m_tentativeTimer.Reset(countdown, OnGracePeriodEnded, this); 688 } 689 690 691 bool 692 CaseFile::CloseIfSolved() 693 { 694 if (m_events.empty() 695 && m_tentativeEvents.empty()) { 696 697 /* 698 * We currently do not track or take actions on 699 * devices in the degraded or faulted state. 700 * Once we have support for spare pools, we'll 701 * retain these cases so that any spares added in 702 * the future can be applied to them. 703 */ 704 switch (VdevState()) { 705 case VDEV_STATE_HEALTHY: 706 /* No need to keep cases for healthy vdevs */ 707 case VDEV_STATE_OFFLINE: 708 /* 709 * Offline is a deliberate administrative action. zfsd 710 * doesn't need to do anything for this state. 711 */ 712 Close(); 713 return (true); 714 case VDEV_STATE_REMOVED: 715 case VDEV_STATE_CANT_OPEN: 716 /* 717 * Keep open. We may solve it with a newly inserted 718 * device. 719 */ 720 case VDEV_STATE_FAULTED: 721 case VDEV_STATE_DEGRADED: 722 /* 723 * Keep open. We may solve it with the future 724 * addition of a spare to the pool 725 */ 726 case VDEV_STATE_UNKNOWN: 727 case VDEV_STATE_CLOSED: 728 /* 729 * Keep open? This may not be the correct behavior, 730 * but it's what we've always done 731 */ 732 ; 733 } 734 735 /* 736 * Re-serialize the case in order to remove any 737 * previous event data. 738 */ 739 Serialize(); 740 } 741 742 return (false); 743 } 744 745 void 746 CaseFile::Log() 747 { 748 syslog(LOG_INFO, "CaseFile(%s,%s,%s)\n", PoolGUIDString().c_str(), 749 VdevGUIDString().c_str(), PhysicalPath().c_str()); 750 syslog(LOG_INFO, "\tVdev State = %s\n", 751 zpool_state_to_name(VdevState(), VDEV_AUX_NONE)); 752 if (m_tentativeEvents.size() != 0) { 753 syslog(LOG_INFO, "\t=== Tentative Events ===\n"); 754 for (EventList::iterator event(m_tentativeEvents.begin()); 755 event != m_tentativeEvents.end(); event++) 756 (*event)->Log(LOG_INFO); 757 } 758 if (m_events.size() != 0) { 759 syslog(LOG_INFO, "\t=== Events ===\n"); 760 for (EventList::iterator event(m_events.begin()); 761 event != m_events.end(); event++) 762 (*event)->Log(LOG_INFO); 763 } 764 } 765 766 //- CaseFile Static Protected Methods ------------------------------------------ 767 void 768 CaseFile::OnGracePeriodEnded(void *arg) 769 { 770 CaseFile &casefile(*static_cast<CaseFile *>(arg)); 771 772 casefile.OnGracePeriodEnded(); 773 } 774 775 int 776 CaseFile::DeSerializeSelector(const struct dirent *dirEntry) 777 { 778 uint64_t poolGUID; 779 uint64_t vdevGUID; 780 781 if (dirEntry->d_type == DT_REG 782 && sscanf(dirEntry->d_name, "pool_%" PRIu64 "_vdev_%" PRIu64 ".case", 783 &poolGUID, &vdevGUID) == 2) 784 return (1); 785 return (0); 786 } 787 788 void 789 CaseFile::DeSerializeFile(const char *fileName) 790 { 791 string fullName(s_caseFilePath + '/' + fileName); 792 CaseFile *existingCaseFile(NULL); 793 CaseFile *caseFile(NULL); 794 795 try { 796 uint64_t poolGUID; 797 uint64_t vdevGUID; 798 nvlist_t *vdevConf; 799 800 if (sscanf(fileName, "pool_%" PRIu64 "_vdev_%" PRIu64 ".case", 801 &poolGUID, &vdevGUID) != 2) { 802 throw ZfsdException("CaseFile::DeSerialize: " 803 "Unintelligible CaseFile filename %s.\n", fileName); 804 } 805 existingCaseFile = Find(Guid(poolGUID), Guid(vdevGUID)); 806 if (existingCaseFile != NULL) { 807 /* 808 * If the vdev is already degraded or faulted, 809 * there's no point in keeping the state around 810 * that we use to put a drive into the degraded 811 * state. However, if the vdev is simply missing, 812 * preserve the case data in the hopes that it will 813 * return. 814 */ 815 caseFile = existingCaseFile; 816 vdev_state curState(caseFile->VdevState()); 817 if (curState > VDEV_STATE_CANT_OPEN 818 && curState < VDEV_STATE_HEALTHY) { 819 unlink(fileName); 820 return; 821 } 822 } else { 823 ZpoolList zpl(ZpoolList::ZpoolByGUID, &poolGUID); 824 if (zpl.empty() 825 || (vdevConf = VdevIterator(zpl.front()) 826 .Find(vdevGUID)) == NULL) { 827 /* 828 * Either the pool no longer exists 829 * or this vdev is no longer a member of 830 * the pool. 831 */ 832 unlink(fullName.c_str()); 833 return; 834 } 835 836 /* 837 * Any vdev we find that does not have a case file 838 * must be in the healthy state and thus worthy of 839 * continued SERD data tracking. 840 */ 841 caseFile = new CaseFile(Vdev(zpl.front(), vdevConf)); 842 } 843 844 ifstream caseStream(fullName.c_str()); 845 if (!caseStream) 846 throw ZfsdException("CaseFile::DeSerialize: Unable to " 847 "read %s.\n", fileName); 848 849 caseFile->DeSerialize(caseStream); 850 } catch (const ParseException &exp) { 851 852 exp.Log(); 853 if (caseFile != existingCaseFile) 854 delete caseFile; 855 856 /* 857 * Since we can't parse the file, unlink it so we don't 858 * trip over it again. 859 */ 860 unlink(fileName); 861 } catch (const ZfsdException &zfsException) { 862 863 zfsException.Log(); 864 if (caseFile != existingCaseFile) 865 delete caseFile; 866 } 867 } 868 869 //- CaseFile Protected Methods ------------------------------------------------- 870 CaseFile::CaseFile(const Vdev &vdev) 871 : m_poolGUID(vdev.PoolGUID()), 872 m_vdevGUID(vdev.GUID()), 873 m_vdevState(vdev.State()), 874 m_vdevPhysPath(vdev.PhysicalPath()), 875 m_is_spare(vdev.IsSpare()) 876 { 877 stringstream guidString; 878 879 guidString << m_vdevGUID; 880 m_vdevGUIDString = guidString.str(); 881 guidString.str(""); 882 guidString << m_poolGUID; 883 m_poolGUIDString = guidString.str(); 884 885 ZpoolList zpl(ZpoolList::ZpoolByGUID, &m_poolGUID); 886 zpool_handle_t *zhp(zpl.empty() ? NULL : zpl.front()); 887 m_vdevName = vdev.Name(zhp, false); 888 889 s_activeCases.push_back(this); 890 891 syslog(LOG_INFO, "Creating new CaseFile:\n"); 892 Log(); 893 } 894 895 CaseFile::~CaseFile() 896 { 897 PurgeEvents(); 898 PurgeTentativeEvents(); 899 m_tentativeTimer.Stop(); 900 s_activeCases.remove(this); 901 } 902 903 void 904 CaseFile::PurgeEvents() 905 { 906 for (EventList::iterator event(m_events.begin()); 907 event != m_events.end(); event++) 908 delete *event; 909 910 m_events.clear(); 911 } 912 913 void 914 CaseFile::PurgeTentativeEvents() 915 { 916 for (EventList::iterator event(m_tentativeEvents.begin()); 917 event != m_tentativeEvents.end(); event++) 918 delete *event; 919 920 m_tentativeEvents.clear(); 921 } 922 923 void 924 CaseFile::SerializeEvList(const EventList events, int fd, 925 const char* prefix) const 926 { 927 if (events.empty()) 928 return; 929 for (EventList::const_iterator curEvent = events.begin(); 930 curEvent != events.end(); curEvent++) { 931 const string &eventString((*curEvent)->GetEventString()); 932 933 // TODO: replace many write(2) calls with a single writev(2) 934 if (prefix) 935 write(fd, prefix, strlen(prefix)); 936 write(fd, eventString.c_str(), eventString.length()); 937 } 938 } 939 940 void 941 CaseFile::Serialize() 942 { 943 stringstream saveFile; 944 945 saveFile << setfill('0') 946 << s_caseFilePath << "/" 947 << "pool_" << PoolGUIDString() 948 << "_vdev_" << VdevGUIDString() 949 << ".case"; 950 951 if (m_events.empty() && m_tentativeEvents.empty()) { 952 unlink(saveFile.str().c_str()); 953 return; 954 } 955 956 int fd(open(saveFile.str().c_str(), O_CREAT|O_TRUNC|O_WRONLY, 0644)); 957 if (fd == -1) { 958 syslog(LOG_ERR, "CaseFile::Serialize: Unable to open %s.\n", 959 saveFile.str().c_str()); 960 return; 961 } 962 SerializeEvList(m_events, fd); 963 SerializeEvList(m_tentativeEvents, fd, "tentative "); 964 close(fd); 965 } 966 967 /* 968 * XXX: This method assumes that events may not contain embedded newlines. If 969 * ever events can contain embedded newlines, then CaseFile must switch 970 * serialization formats 971 */ 972 void 973 CaseFile::DeSerialize(ifstream &caseStream) 974 { 975 string evString; 976 const EventFactory &factory(ZfsDaemon::Get().GetFactory()); 977 978 caseStream >> std::noskipws >> std::ws; 979 while (caseStream.good()) { 980 /* 981 * Outline: 982 * read the beginning of a line and check it for 983 * "tentative". If found, discard "tentative". 984 * Create a new event 985 * continue 986 */ 987 EventList* destEvents; 988 const string tentFlag("tentative "); 989 string line; 990 std::stringbuf lineBuf; 991 992 caseStream.get(lineBuf); 993 caseStream.ignore(); /*discard the newline character*/ 994 line = lineBuf.str(); 995 if (line.compare(0, tentFlag.size(), tentFlag) == 0) { 996 /* Discard "tentative" */ 997 line.erase(0, tentFlag.size()); 998 destEvents = &m_tentativeEvents; 999 } else { 1000 destEvents = &m_events; 1001 } 1002 Event *event(Event::CreateEvent(factory, line)); 1003 if (event != NULL) { 1004 destEvents->push_back(event); 1005 RegisterCallout(*event); 1006 } 1007 } 1008 } 1009 1010 void 1011 CaseFile::Close() 1012 { 1013 /* 1014 * This case is no longer relevant. Clean up our 1015 * serialization file, and delete the case. 1016 */ 1017 syslog(LOG_INFO, "CaseFile(%s,%s) closed - State %s\n", 1018 PoolGUIDString().c_str(), VdevGUIDString().c_str(), 1019 zpool_state_to_name(VdevState(), VDEV_AUX_NONE)); 1020 1021 /* 1022 * Serialization of a Case with no event data, clears the 1023 * Serialization data for that event. 1024 */ 1025 PurgeEvents(); 1026 Serialize(); 1027 1028 delete this; 1029 } 1030 1031 void 1032 CaseFile::OnGracePeriodEnded() 1033 { 1034 bool should_fault, should_degrade; 1035 ZpoolList zpl(ZpoolList::ZpoolByGUID, &m_poolGUID); 1036 zpool_handle_t *zhp(zpl.empty() ? NULL : zpl.front()); 1037 1038 m_events.splice(m_events.begin(), m_tentativeEvents); 1039 should_fault = ShouldFault(); 1040 should_degrade = ShouldDegrade(); 1041 1042 if (should_fault || should_degrade) { 1043 if (zhp == NULL 1044 || (VdevIterator(zhp).Find(m_vdevGUID)) == NULL) { 1045 /* 1046 * Either the pool no longer exists 1047 * or this vdev is no longer a member of 1048 * the pool. 1049 */ 1050 Close(); 1051 return; 1052 } 1053 1054 } 1055 1056 /* A fault condition has priority over a degrade condition */ 1057 if (ShouldFault()) { 1058 /* Fault the vdev and close the case. */ 1059 if (zpool_vdev_fault(zhp, (uint64_t)m_vdevGUID, 1060 VDEV_AUX_ERR_EXCEEDED) == 0) { 1061 syslog(LOG_INFO, "Faulting vdev(%s/%s)", 1062 PoolGUIDString().c_str(), 1063 VdevGUIDString().c_str()); 1064 Close(); 1065 return; 1066 } 1067 else { 1068 syslog(LOG_ERR, "Fault vdev(%s/%s): %s: %s\n", 1069 PoolGUIDString().c_str(), 1070 VdevGUIDString().c_str(), 1071 libzfs_error_action(g_zfsHandle), 1072 libzfs_error_description(g_zfsHandle)); 1073 } 1074 } 1075 else if (ShouldDegrade()) { 1076 /* Degrade the vdev and close the case. */ 1077 if (zpool_vdev_degrade(zhp, (uint64_t)m_vdevGUID, 1078 VDEV_AUX_ERR_EXCEEDED) == 0) { 1079 syslog(LOG_INFO, "Degrading vdev(%s/%s)", 1080 PoolGUIDString().c_str(), 1081 VdevGUIDString().c_str()); 1082 Close(); 1083 return; 1084 } 1085 else { 1086 syslog(LOG_ERR, "Degrade vdev(%s/%s): %s: %s\n", 1087 PoolGUIDString().c_str(), 1088 VdevGUIDString().c_str(), 1089 libzfs_error_action(g_zfsHandle), 1090 libzfs_error_description(g_zfsHandle)); 1091 } 1092 } 1093 Serialize(); 1094 } 1095 1096 Vdev 1097 CaseFile::BeingReplacedBy(zpool_handle_t *zhp) { 1098 Vdev vd(zhp, CaseVdev(zhp)); 1099 std::list<Vdev> children; 1100 std::list<Vdev>::iterator children_it; 1101 1102 Vdev parent(vd.Parent()); 1103 Vdev replacing(NonexistentVdev); 1104 1105 /* 1106 * To determine whether we are being replaced by another spare that 1107 * is still working, then make sure that it is currently spared and 1108 * that the spare is either resilvering or healthy. If any of these 1109 * conditions fail, then we are not being replaced by a spare. 1110 * 1111 * If the spare is healthy, then the case file should be closed very 1112 * soon after this check. 1113 */ 1114 if (parent.DoesNotExist() 1115 || parent.Name(zhp, /*verbose*/false) != "spare") 1116 return (NonexistentVdev); 1117 1118 children = parent.Children(); 1119 children_it = children.begin(); 1120 for (;children_it != children.end(); children_it++) { 1121 Vdev child = *children_it; 1122 1123 /* Skip our vdev. */ 1124 if (child.GUID() == VdevGUID()) 1125 continue; 1126 /* 1127 * Accept the first child that doesn't match our GUID, or 1128 * any resilvering/healthy device if one exists. 1129 */ 1130 if (replacing.DoesNotExist() || child.IsResilvering() 1131 || child.State() == VDEV_STATE_HEALTHY) 1132 replacing = child; 1133 } 1134 1135 return (replacing); 1136 } 1137 1138 bool 1139 CaseFile::Replace(const char* vdev_type, const char* path, bool isspare) { 1140 nvlist_t *nvroot, *newvd; 1141 const char *poolname; 1142 string oldstr(VdevGUIDString()); 1143 bool retval = true; 1144 1145 /* Figure out what pool we're working on */ 1146 ZpoolList zpl(ZpoolList::ZpoolByGUID, &m_poolGUID); 1147 zpool_handle_t *zhp(zpl.empty() ? NULL : zpl.front()); 1148 if (zhp == NULL) { 1149 syslog(LOG_ERR, "CaseFile::Replace: could not find pool for " 1150 "pool_guid %" PRIu64 ".", (uint64_t)m_poolGUID); 1151 return (false); 1152 } 1153 poolname = zpool_get_name(zhp); 1154 Vdev vd(zhp, CaseVdev(zhp)); 1155 Vdev replaced(BeingReplacedBy(zhp)); 1156 1157 if (isspare && !vd.IsSpare() && !replaced.DoesNotExist()) { 1158 /* If we are already being replaced by a working spare, pass. */ 1159 if (replaced.IsResilvering() 1160 || replaced.State() == VDEV_STATE_HEALTHY) { 1161 syslog(LOG_INFO, "CaseFile::Replace(%s->%s): already " 1162 "replaced", VdevGUIDString().c_str(), path); 1163 return (/*consumed*/false); 1164 } 1165 /* 1166 * If we have already been replaced by a spare, but that spare 1167 * is broken, we must spare the spare, not the original device. 1168 */ 1169 oldstr = replaced.GUIDString(); 1170 syslog(LOG_INFO, "CaseFile::Replace(%s->%s): sparing " 1171 "broken spare %s instead", VdevGUIDString().c_str(), 1172 path, oldstr.c_str()); 1173 } 1174 1175 /* 1176 * Build a root vdev/leaf vdev configuration suitable for 1177 * zpool_vdev_attach. Only enough data for the kernel to find 1178 * the device (i.e. type and disk device node path) are needed. 1179 */ 1180 nvroot = NULL; 1181 newvd = NULL; 1182 1183 if (nvlist_alloc(&nvroot, NV_UNIQUE_NAME, 0) != 0 1184 || nvlist_alloc(&newvd, NV_UNIQUE_NAME, 0) != 0) { 1185 syslog(LOG_ERR, "Replace vdev(%s/%s): Unable to allocate " 1186 "configuration data.", poolname, oldstr.c_str()); 1187 if (nvroot != NULL) 1188 nvlist_free(nvroot); 1189 return (false); 1190 } 1191 if (nvlist_add_string(newvd, ZPOOL_CONFIG_TYPE, vdev_type) != 0 1192 || nvlist_add_string(newvd, ZPOOL_CONFIG_PATH, path) != 0 1193 || nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT) != 0 1194 || nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, 1195 &newvd, 1) != 0) { 1196 syslog(LOG_ERR, "Replace vdev(%s/%s): Unable to initialize " 1197 "configuration data.", poolname, oldstr.c_str()); 1198 nvlist_free(newvd); 1199 nvlist_free(nvroot); 1200 return (true); 1201 } 1202 1203 /* Data was copied when added to the root vdev. */ 1204 nvlist_free(newvd); 1205 1206 retval = (zpool_vdev_attach(zhp, oldstr.c_str(), path, nvroot, 1207 /*replace*/B_TRUE, /*rebuild*/ B_FALSE) == 0); 1208 if (retval) 1209 syslog(LOG_INFO, "Replacing vdev(%s/%s) with %s\n", 1210 poolname, oldstr.c_str(), path); 1211 else 1212 syslog(LOG_ERR, "Replace vdev(%s/%s): %s: %s\n", 1213 poolname, oldstr.c_str(), libzfs_error_action(g_zfsHandle), 1214 libzfs_error_description(g_zfsHandle)); 1215 nvlist_free(nvroot); 1216 1217 return (retval); 1218 } 1219 1220 /* Lookup the vdev prop. Used for checksum, IO, or slow IO props */ 1221 int 1222 CaseFile::GetVdevProp(vdev_prop_t vdev_prop) const 1223 { 1224 char val[ZFS_MAXPROPLEN]; 1225 zprop_source_t srctype; 1226 DevdCtl::Guid poolGUID = PoolGUID(); 1227 ZpoolList zpl(ZpoolList::ZpoolByGUID, &poolGUID); 1228 zpool_handle_t *zhp(zpl.empty() ? NULL : zpl.front()); 1229 1230 char *prop_str = (char *) vdev_prop_to_name(vdev_prop); 1231 if (zhp == NULL || zpool_get_vdev_prop(zhp, m_vdevName.c_str(), 1232 vdev_prop, prop_str, val, sizeof (val), &srctype, B_FALSE) != 0) 1233 return (-1); 1234 1235 /* we'll get "-" from libzfs for a prop that is not set */ 1236 if (zfs_isnumber(val) == B_FALSE) 1237 return (-1); 1238 1239 return (atoi(val)); 1240 } 1241 1242 bool 1243 CaseFile::ShouldDegrade() const 1244 { 1245 int checksum_n = GetVdevProp(VDEV_PROP_CHECKSUM_N); 1246 if (checksum_n == -1) 1247 checksum_n = DEFAULT_ZFS_DEGRADE_IO_COUNT; 1248 return (std::count_if(m_events.begin(), m_events.end(), 1249 IsChecksumEvent) > checksum_n); 1250 } 1251 1252 bool 1253 CaseFile::ShouldFault() const 1254 { 1255 bool should_fault_for_io, should_fault_for_delay; 1256 int io_n = GetVdevProp(VDEV_PROP_IO_N); 1257 int slow_io_n = GetVdevProp(VDEV_PROP_SLOW_IO_N); 1258 1259 if (io_n == -1) 1260 io_n = DEFAULT_ZFS_DEGRADE_IO_COUNT; 1261 if (slow_io_n == -1) 1262 slow_io_n = DEFAULT_ZFS_FAULT_SLOW_IO_COUNT; 1263 1264 should_fault_for_io = std::count_if(m_events.begin(), m_events.end(), 1265 IsIOEvent) > io_n; 1266 should_fault_for_delay = std::count_if(m_events.begin(), m_events.end(), 1267 IsDelayEvent) > slow_io_n; 1268 1269 return (should_fault_for_io || should_fault_for_delay); 1270 } 1271 1272 nvlist_t * 1273 CaseFile::CaseVdev(zpool_handle_t *zhp) const 1274 { 1275 return (VdevIterator(zhp).Find(VdevGUID())); 1276 } 1277