1 /*- 2 * Copyright (c) 2011, 2012, 2013, 2014, 2016 Spectra Logic Corporation 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions, and the following disclaimer, 10 * without modification. 11 * 2. Redistributions in binary form must reproduce at minimum a disclaimer 12 * substantially similar to the "NO WARRANTY" disclaimer below 13 * ("Disclaimer") and any redistribution must be conditioned upon 14 * including a substantially similar Disclaimer requirement for further 15 * binary redistribution. 16 * 17 * NO WARRANTY 18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR 21 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 22 * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 24 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 25 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 26 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING 27 * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 28 * POSSIBILITY OF SUCH DAMAGES. 29 * 30 * Authors: Justin T. Gibbs (Spectra Logic Corporation) 31 */ 32 33 /** 34 * \file case_file.cc 35 * 36 * We keep case files for any leaf vdev that is not in the optimal state. 37 * However, we only serialize to disk those events that need to be preserved 38 * across reboots. For now, this is just a log of soft errors which we 39 * accumulate in order to mark a device as degraded. 40 */ 41 #include <sys/cdefs.h> 42 #include <sys/byteorder.h> 43 #include <sys/time.h> 44 45 #include <sys/fs/zfs.h> 46 47 #include <dirent.h> 48 #include <fcntl.h> 49 #include <iomanip> 50 #include <fstream> 51 #include <functional> 52 #include <sstream> 53 #include <syslog.h> 54 #include <unistd.h> 55 56 #include <libzfs.h> 57 58 #include <list> 59 #include <map> 60 #include <string> 61 62 #include <devdctl/guid.h> 63 #include <devdctl/event.h> 64 #include <devdctl/event_factory.h> 65 #include <devdctl/exception.h> 66 #include <devdctl/consumer.h> 67 68 #include "callout.h" 69 #include "vdev_iterator.h" 70 #include "zfsd_event.h" 71 #include "case_file.h" 72 #include "vdev.h" 73 #include "zfsd.h" 74 #include "zfsd_exception.h" 75 #include "zpool_list.h" 76 /*============================ Namespace Control =============================*/ 77 using std::hex; 78 using std::ifstream; 79 using std::stringstream; 80 using std::setfill; 81 using std::setw; 82 83 using DevdCtl::Event; 84 using DevdCtl::EventFactory; 85 using DevdCtl::EventList; 86 using DevdCtl::Guid; 87 using DevdCtl::ParseException; 88 89 /*--------------------------------- CaseFile ---------------------------------*/ 90 //- CaseFile Static Data ------------------------------------------------------- 91 92 CaseFileList CaseFile::s_activeCases; 93 const string CaseFile::s_caseFilePath = "/var/db/zfsd/cases"; 94 const timeval CaseFile::s_removeGracePeriod = { 60 /*sec*/, 0 /*usec*/}; 95 96 //- CaseFile Static Public Methods --------------------------------------------- 97 CaseFile * 98 CaseFile::Find(Guid poolGUID, Guid vdevGUID) 99 { 100 for (CaseFileList::iterator curCase = s_activeCases.begin(); 101 curCase != s_activeCases.end(); curCase++) { 102 103 if (((*curCase)->PoolGUID() != poolGUID 104 && Guid::InvalidGuid() != poolGUID) 105 || (*curCase)->VdevGUID() != vdevGUID) 106 continue; 107 108 /* 109 * We only carry one active case per-vdev. 110 */ 111 return (*curCase); 112 } 113 return (NULL); 114 } 115 116 void 117 CaseFile::Find(Guid poolGUID, Guid vdevGUID, CaseFileList &cases) 118 { 119 for (CaseFileList::iterator curCase = s_activeCases.begin(); 120 curCase != s_activeCases.end(); curCase++) { 121 if (((*curCase)->PoolGUID() != poolGUID && 122 Guid::InvalidGuid() != poolGUID) || 123 (*curCase)->VdevGUID() != vdevGUID) 124 continue; 125 126 /* 127 * We can have multiple cases for spare vdevs 128 */ 129 cases.push_back(*curCase); 130 if (!(*curCase)->IsSpare()) { 131 return; 132 } 133 } 134 } 135 136 CaseFile * 137 CaseFile::Find(const string &physPath) 138 { 139 CaseFile *result = NULL; 140 141 for (CaseFileList::iterator curCase = s_activeCases.begin(); 142 curCase != s_activeCases.end(); curCase++) { 143 144 if ((*curCase)->PhysicalPath() != physPath) 145 continue; 146 147 if (result != NULL) { 148 syslog(LOG_WARNING, "Multiple casefiles found for " 149 "physical path %s. " 150 "This is most likely a bug in zfsd", 151 physPath.c_str()); 152 } 153 result = *curCase; 154 } 155 return (result); 156 } 157 158 159 void 160 CaseFile::ReEvaluateByGuid(Guid poolGUID, const ZfsEvent &event) 161 { 162 CaseFileList::iterator casefile; 163 for (casefile = s_activeCases.begin(); casefile != s_activeCases.end();){ 164 CaseFileList::iterator next = casefile; 165 next++; 166 if (poolGUID == (*casefile)->PoolGUID()) 167 (*casefile)->ReEvaluate(event); 168 casefile = next; 169 } 170 } 171 172 CaseFile & 173 CaseFile::Create(Vdev &vdev) 174 { 175 CaseFile *activeCase; 176 177 activeCase = Find(vdev.PoolGUID(), vdev.GUID()); 178 if (activeCase == NULL) 179 activeCase = new CaseFile(vdev); 180 181 return (*activeCase); 182 } 183 184 void 185 CaseFile::DeSerialize() 186 { 187 struct dirent **caseFiles; 188 189 int numCaseFiles(scandir(s_caseFilePath.c_str(), &caseFiles, 190 DeSerializeSelector, /*compar*/NULL)); 191 192 if (numCaseFiles == -1) 193 return; 194 if (numCaseFiles == 0) { 195 free(caseFiles); 196 return; 197 } 198 199 for (int i = 0; i < numCaseFiles; i++) { 200 201 DeSerializeFile(caseFiles[i]->d_name); 202 free(caseFiles[i]); 203 } 204 free(caseFiles); 205 } 206 207 bool 208 CaseFile::Empty() 209 { 210 return (s_activeCases.empty()); 211 } 212 213 void 214 CaseFile::LogAll() 215 { 216 for (CaseFileList::iterator curCase = s_activeCases.begin(); 217 curCase != s_activeCases.end(); curCase++) 218 (*curCase)->Log(); 219 } 220 221 void 222 CaseFile::PurgeAll() 223 { 224 /* 225 * Serialize casefiles before deleting them so that they can be reread 226 * and revalidated during BuildCaseFiles. 227 * CaseFiles remove themselves from this list on destruction. 228 */ 229 while (s_activeCases.size() != 0) { 230 CaseFile *casefile = s_activeCases.front(); 231 casefile->Serialize(); 232 delete casefile; 233 } 234 235 } 236 237 int 238 CaseFile::IsSpare() 239 { 240 return (m_is_spare); 241 } 242 243 //- CaseFile Public Methods ---------------------------------------------------- 244 bool 245 CaseFile::RefreshVdevState() 246 { 247 ZpoolList zpl(ZpoolList::ZpoolByGUID, &m_poolGUID); 248 zpool_handle_t *casePool(zpl.empty() ? NULL : zpl.front()); 249 if (casePool == NULL) 250 return (false); 251 252 Vdev vd(casePool, CaseVdev(casePool)); 253 if (vd.DoesNotExist()) 254 return (false); 255 256 m_vdevState = vd.State(); 257 m_vdevPhysPath = vd.PhysicalPath(); 258 return (true); 259 } 260 261 bool 262 CaseFile::ReEvaluate(const string &devPath, const string &physPath, Vdev *vdev) 263 { 264 ZpoolList zpl(ZpoolList::ZpoolByGUID, &m_poolGUID); 265 zpool_handle_t *pool(zpl.empty() ? NULL : zpl.front()); 266 int flags = ZFS_ONLINE_CHECKREMOVE | ZFS_ONLINE_UNSPARE; 267 268 if (pool == NULL || !RefreshVdevState()) { 269 /* 270 * The pool or vdev for this case file is no longer 271 * part of the configuration. This can happen 272 * if we process a device arrival notification 273 * before seeing the ZFS configuration change 274 * event. 275 */ 276 syslog(LOG_INFO, 277 "CaseFile::ReEvaluate(%s,%s) Pool/Vdev unconfigured. " 278 "Closing\n", 279 PoolGUIDString().c_str(), 280 VdevGUIDString().c_str()); 281 Close(); 282 283 /* 284 * Since this event was not used to close this 285 * case, do not report it as consumed. 286 */ 287 return (/*consumed*/false); 288 } 289 290 if (VdevState() > VDEV_STATE_CANT_OPEN) { 291 /* 292 * For now, newly discovered devices only help for 293 * devices that are missing. In the future, we might 294 * use a newly inserted spare to replace a degraded 295 * or faulted device. 296 */ 297 syslog(LOG_INFO, "CaseFile::ReEvaluate(%s,%s): Pool/Vdev ignored", 298 PoolGUIDString().c_str(), VdevGUIDString().c_str()); 299 return (/*consumed*/false); 300 } 301 302 if (vdev != NULL 303 && ( vdev->PoolGUID() == m_poolGUID 304 || vdev->PoolGUID() == Guid::InvalidGuid()) 305 && vdev->GUID() == m_vdevGUID) { 306 307 if (IsSpare()) 308 flags |= ZFS_ONLINE_SPARE; 309 if (zpool_vdev_online(pool, vdev->GUIDString().c_str(), 310 flags, &m_vdevState) != 0) { 311 syslog(LOG_ERR, 312 "Failed to online vdev(%s/%s:%s): %s: %s\n", 313 zpool_get_name(pool), vdev->GUIDString().c_str(), 314 devPath.c_str(), libzfs_error_action(g_zfsHandle), 315 libzfs_error_description(g_zfsHandle)); 316 return (/*consumed*/false); 317 } 318 319 syslog(LOG_INFO, "Onlined vdev(%s/%s:%s). State now %s.\n", 320 zpool_get_name(pool), vdev->GUIDString().c_str(), 321 devPath.c_str(), 322 zpool_state_to_name(VdevState(), VDEV_AUX_NONE)); 323 324 /* 325 * Check the vdev state post the online action to see 326 * if we can retire this case. 327 */ 328 CloseIfSolved(); 329 330 return (/*consumed*/true); 331 } 332 333 /* 334 * If the auto-replace policy is enabled, and we have physical 335 * path information, try a physical path replacement. 336 */ 337 if (zpool_get_prop_int(pool, ZPOOL_PROP_AUTOREPLACE, NULL) == 0) { 338 syslog(LOG_INFO, 339 "CaseFile(%s:%s:%s): AutoReplace not set. " 340 "Ignoring device insertion.\n", 341 PoolGUIDString().c_str(), 342 VdevGUIDString().c_str(), 343 zpool_state_to_name(VdevState(), VDEV_AUX_NONE)); 344 return (/*consumed*/false); 345 } 346 347 if (PhysicalPath().empty()) { 348 syslog(LOG_INFO, 349 "CaseFile(%s:%s:%s): No physical path information. " 350 "Ignoring device insertion.\n", 351 PoolGUIDString().c_str(), 352 VdevGUIDString().c_str(), 353 zpool_state_to_name(VdevState(), VDEV_AUX_NONE)); 354 return (/*consumed*/false); 355 } 356 357 if (physPath != PhysicalPath()) { 358 syslog(LOG_INFO, 359 "CaseFile(%s:%s:%s): Physical path mismatch. " 360 "Ignoring device insertion.\n", 361 PoolGUIDString().c_str(), 362 VdevGUIDString().c_str(), 363 zpool_state_to_name(VdevState(), VDEV_AUX_NONE)); 364 return (/*consumed*/false); 365 } 366 367 /* Write a label on the newly inserted disk. */ 368 if (zpool_label_disk(g_zfsHandle, pool, devPath.c_str()) != 0) { 369 syslog(LOG_ERR, 370 "Replace vdev(%s/%s) by physical path (label): %s: %s\n", 371 zpool_get_name(pool), VdevGUIDString().c_str(), 372 libzfs_error_action(g_zfsHandle), 373 libzfs_error_description(g_zfsHandle)); 374 return (/*consumed*/false); 375 } 376 377 syslog(LOG_INFO, "CaseFile::ReEvaluate(%s/%s): Replacing with %s", 378 PoolGUIDString().c_str(), VdevGUIDString().c_str(), 379 devPath.c_str()); 380 return (Replace(VDEV_TYPE_DISK, devPath.c_str(), /*isspare*/false)); 381 } 382 383 bool 384 CaseFile::ReEvaluate(const ZfsEvent &event) 385 { 386 bool consumed(false); 387 388 if (event.Value("type") == "sysevent.fs.zfs.vdev_remove") { 389 /* 390 * The Vdev we represent has been removed from the 391 * configuration. This case is no longer of value. 392 */ 393 Close(); 394 395 return (/*consumed*/true); 396 } else if (event.Value("type") == "sysevent.fs.zfs.pool_destroy") { 397 /* This Pool has been destroyed. Discard the case */ 398 Close(); 399 400 return (/*consumed*/true); 401 } else if (event.Value("type") == "sysevent.fs.zfs.config_sync") { 402 RefreshVdevState(); 403 if (VdevState() < VDEV_STATE_HEALTHY) 404 consumed = ActivateSpare(); 405 } 406 407 408 if (event.Value("class") == "resource.fs.zfs.removed") { 409 bool spare_activated; 410 411 if (!RefreshVdevState()) { 412 /* 413 * The pool or vdev for this case file is no longer 414 * part of the configuration. This can happen 415 * if we process a device arrival notification 416 * before seeing the ZFS configuration change 417 * event. 418 */ 419 syslog(LOG_INFO, 420 "CaseFile::ReEvaluate(%s,%s) Pool/Vdev " 421 "unconfigured. Closing\n", 422 PoolGUIDString().c_str(), 423 VdevGUIDString().c_str()); 424 /* 425 * Close the case now so we won't waste cycles in the 426 * system rescan 427 */ 428 Close(); 429 430 /* 431 * Since this event was not used to close this 432 * case, do not report it as consumed. 433 */ 434 return (/*consumed*/false); 435 } 436 437 /* 438 * Discard any tentative I/O error events for 439 * this case. They were most likely caused by the 440 * hot-unplug of this device. 441 */ 442 PurgeTentativeEvents(); 443 444 /* Try to activate spares if they are available */ 445 spare_activated = ActivateSpare(); 446 447 /* 448 * Rescan the drives in the system to see if a recent 449 * drive arrival can be used to solve this case. 450 */ 451 ZfsDaemon::RequestSystemRescan(); 452 453 /* 454 * Consume the event if we successfully activated a spare. 455 * Otherwise, leave it in the unconsumed events list so that the 456 * future addition of a spare to this pool might be able to 457 * close the case 458 */ 459 consumed = spare_activated; 460 } else if (event.Value("class") == "resource.fs.zfs.statechange") { 461 RefreshVdevState(); 462 /* 463 * If this vdev is DEGRADED, FAULTED, or UNAVAIL, try to 464 * activate a hotspare. Otherwise, ignore the event 465 */ 466 if (VdevState() == VDEV_STATE_FAULTED || 467 VdevState() == VDEV_STATE_DEGRADED || 468 VdevState() == VDEV_STATE_CANT_OPEN) 469 (void) ActivateSpare(); 470 consumed = true; 471 } 472 else if (event.Value("class") == "ereport.fs.zfs.io" || 473 event.Value("class") == "ereport.fs.zfs.checksum" || 474 event.Value("class") == "ereport.fs.zfs.delay") { 475 476 m_tentativeEvents.push_front(event.DeepCopy()); 477 RegisterCallout(event); 478 consumed = true; 479 } 480 481 bool closed(CloseIfSolved()); 482 483 return (consumed || closed); 484 } 485 486 /* Find a Vdev containing the vdev with the given GUID */ 487 static nvlist_t* 488 find_parent(nvlist_t *pool_config, nvlist_t *config, DevdCtl::Guid child_guid) 489 { 490 nvlist_t **vdevChildren; 491 int error; 492 unsigned ch, numChildren; 493 494 error = nvlist_lookup_nvlist_array(config, ZPOOL_CONFIG_CHILDREN, 495 &vdevChildren, &numChildren); 496 497 if (error != 0 || numChildren == 0) 498 return (NULL); 499 500 for (ch = 0; ch < numChildren; ch++) { 501 nvlist *result; 502 Vdev vdev(pool_config, vdevChildren[ch]); 503 504 if (vdev.GUID() == child_guid) 505 return (config); 506 507 result = find_parent(pool_config, vdevChildren[ch], child_guid); 508 if (result != NULL) 509 return (result); 510 } 511 512 return (NULL); 513 } 514 515 bool 516 CaseFile::ActivateSpare() { 517 nvlist_t *config, *nvroot, *parent_config; 518 nvlist_t **spares; 519 const char *devPath, *poolname, *vdev_type; 520 u_int nspares, i; 521 int error; 522 523 ZpoolList zpl(ZpoolList::ZpoolByGUID, &m_poolGUID); 524 zpool_handle_t *zhp(zpl.empty() ? NULL : zpl.front()); 525 if (zhp == NULL) { 526 syslog(LOG_ERR, "CaseFile::ActivateSpare: Could not find pool " 527 "for pool_guid %" PRIu64".", (uint64_t)m_poolGUID); 528 return (false); 529 } 530 poolname = zpool_get_name(zhp); 531 config = zpool_get_config(zhp, NULL); 532 if (config == NULL) { 533 syslog(LOG_ERR, "CaseFile::ActivateSpare: Could not find pool " 534 "config for pool %s", poolname); 535 return (false); 536 } 537 error = nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot); 538 if (error != 0){ 539 syslog(LOG_ERR, "CaseFile::ActivateSpare: Could not find vdev " 540 "tree for pool %s", poolname); 541 return (false); 542 } 543 544 parent_config = find_parent(config, nvroot, m_vdevGUID); 545 if (parent_config != NULL) { 546 const char *parent_type; 547 548 /* 549 * Don't activate spares for members of a "replacing" vdev. 550 * They're already dealt with. Sparing them will just drag out 551 * the resilver process. 552 */ 553 error = nvlist_lookup_string(parent_config, 554 ZPOOL_CONFIG_TYPE, &parent_type); 555 if (error == 0 && strcmp(parent_type, VDEV_TYPE_REPLACING) == 0) 556 return (false); 557 } 558 559 nspares = 0; 560 nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, 561 &nspares); 562 if (nspares == 0) { 563 /* The pool has no spares configured */ 564 syslog(LOG_INFO, "CaseFile::ActivateSpare: " 565 "No spares available for pool %s", poolname); 566 return (false); 567 } 568 for (i = 0; i < nspares; i++) { 569 uint64_t *nvlist_array; 570 vdev_stat_t *vs; 571 uint_t nstats; 572 573 if (nvlist_lookup_uint64_array(spares[i], 574 ZPOOL_CONFIG_VDEV_STATS, &nvlist_array, &nstats) != 0) { 575 syslog(LOG_ERR, "CaseFile::ActivateSpare: Could not " 576 "find vdev stats for pool %s, spare %d", 577 poolname, i); 578 return (false); 579 } 580 vs = reinterpret_cast<vdev_stat_t *>(nvlist_array); 581 582 if ((vs->vs_aux != VDEV_AUX_SPARED) 583 && (vs->vs_state == VDEV_STATE_HEALTHY)) { 584 /* We found a usable spare */ 585 break; 586 } 587 } 588 589 if (i == nspares) { 590 /* No available spares were found */ 591 return (false); 592 } 593 594 error = nvlist_lookup_string(spares[i], ZPOOL_CONFIG_PATH, &devPath); 595 if (error != 0) { 596 syslog(LOG_ERR, "CaseFile::ActivateSpare: Cannot determine " 597 "the path of pool %s, spare %d. Error %d", 598 poolname, i, error); 599 return (false); 600 } 601 602 error = nvlist_lookup_string(spares[i], ZPOOL_CONFIG_TYPE, &vdev_type); 603 if (error != 0) { 604 syslog(LOG_ERR, "CaseFile::ActivateSpare: Cannot determine " 605 "the vdev type of pool %s, spare %d. Error %d", 606 poolname, i, error); 607 return (false); 608 } 609 610 return (Replace(vdev_type, devPath, /*isspare*/true)); 611 } 612 613 void 614 CaseFile::RegisterCallout(const Event &event) 615 { 616 timeval now, countdown, elapsed, timestamp, zero, remaining; 617 618 gettimeofday(&now, 0); 619 timestamp = event.GetTimestamp(); 620 timersub(&now, ×tamp, &elapsed); 621 timersub(&s_removeGracePeriod, &elapsed, &countdown); 622 /* 623 * If countdown is <= zero, Reset the timer to the 624 * smallest positive time value instead 625 */ 626 timerclear(&zero); 627 if (timercmp(&countdown, &zero, <=)) { 628 timerclear(&countdown); 629 countdown.tv_usec = 1; 630 } 631 632 remaining = m_tentativeTimer.TimeRemaining(); 633 634 if (!m_tentativeTimer.IsPending() 635 || timercmp(&countdown, &remaining, <)) 636 m_tentativeTimer.Reset(countdown, OnGracePeriodEnded, this); 637 } 638 639 640 bool 641 CaseFile::CloseIfSolved() 642 { 643 if (m_events.empty() 644 && m_tentativeEvents.empty()) { 645 646 /* 647 * We currently do not track or take actions on 648 * devices in the degraded or faulted state. 649 * Once we have support for spare pools, we'll 650 * retain these cases so that any spares added in 651 * the future can be applied to them. 652 */ 653 switch (VdevState()) { 654 case VDEV_STATE_HEALTHY: 655 /* No need to keep cases for healthy vdevs */ 656 Close(); 657 return (true); 658 case VDEV_STATE_REMOVED: 659 case VDEV_STATE_CANT_OPEN: 660 /* 661 * Keep open. We may solve it with a newly inserted 662 * device. 663 */ 664 case VDEV_STATE_FAULTED: 665 case VDEV_STATE_DEGRADED: 666 /* 667 * Keep open. We may solve it with the future 668 * addition of a spare to the pool 669 */ 670 case VDEV_STATE_UNKNOWN: 671 case VDEV_STATE_CLOSED: 672 case VDEV_STATE_OFFLINE: 673 /* 674 * Keep open? This may not be the correct behavior, 675 * but it's what we've always done 676 */ 677 ; 678 } 679 680 /* 681 * Re-serialize the case in order to remove any 682 * previous event data. 683 */ 684 Serialize(); 685 } 686 687 return (false); 688 } 689 690 void 691 CaseFile::Log() 692 { 693 syslog(LOG_INFO, "CaseFile(%s,%s,%s)\n", PoolGUIDString().c_str(), 694 VdevGUIDString().c_str(), PhysicalPath().c_str()); 695 syslog(LOG_INFO, "\tVdev State = %s\n", 696 zpool_state_to_name(VdevState(), VDEV_AUX_NONE)); 697 if (m_tentativeEvents.size() != 0) { 698 syslog(LOG_INFO, "\t=== Tentative Events ===\n"); 699 for (EventList::iterator event(m_tentativeEvents.begin()); 700 event != m_tentativeEvents.end(); event++) 701 (*event)->Log(LOG_INFO); 702 } 703 if (m_events.size() != 0) { 704 syslog(LOG_INFO, "\t=== Events ===\n"); 705 for (EventList::iterator event(m_events.begin()); 706 event != m_events.end(); event++) 707 (*event)->Log(LOG_INFO); 708 } 709 } 710 711 //- CaseFile Static Protected Methods ------------------------------------------ 712 void 713 CaseFile::OnGracePeriodEnded(void *arg) 714 { 715 CaseFile &casefile(*static_cast<CaseFile *>(arg)); 716 717 casefile.OnGracePeriodEnded(); 718 } 719 720 int 721 CaseFile::DeSerializeSelector(const struct dirent *dirEntry) 722 { 723 uint64_t poolGUID; 724 uint64_t vdevGUID; 725 726 if (dirEntry->d_type == DT_REG 727 && sscanf(dirEntry->d_name, "pool_%" PRIu64 "_vdev_%" PRIu64 ".case", 728 &poolGUID, &vdevGUID) == 2) 729 return (1); 730 return (0); 731 } 732 733 void 734 CaseFile::DeSerializeFile(const char *fileName) 735 { 736 string fullName(s_caseFilePath + '/' + fileName); 737 CaseFile *existingCaseFile(NULL); 738 CaseFile *caseFile(NULL); 739 740 try { 741 uint64_t poolGUID; 742 uint64_t vdevGUID; 743 nvlist_t *vdevConf; 744 745 if (sscanf(fileName, "pool_%" PRIu64 "_vdev_%" PRIu64 ".case", 746 &poolGUID, &vdevGUID) != 2) { 747 throw ZfsdException("CaseFile::DeSerialize: " 748 "Unintelligible CaseFile filename %s.\n", fileName); 749 } 750 existingCaseFile = Find(Guid(poolGUID), Guid(vdevGUID)); 751 if (existingCaseFile != NULL) { 752 /* 753 * If the vdev is already degraded or faulted, 754 * there's no point in keeping the state around 755 * that we use to put a drive into the degraded 756 * state. However, if the vdev is simply missing, 757 * preserve the case data in the hopes that it will 758 * return. 759 */ 760 caseFile = existingCaseFile; 761 vdev_state curState(caseFile->VdevState()); 762 if (curState > VDEV_STATE_CANT_OPEN 763 && curState < VDEV_STATE_HEALTHY) { 764 unlink(fileName); 765 return; 766 } 767 } else { 768 ZpoolList zpl(ZpoolList::ZpoolByGUID, &poolGUID); 769 if (zpl.empty() 770 || (vdevConf = VdevIterator(zpl.front()) 771 .Find(vdevGUID)) == NULL) { 772 /* 773 * Either the pool no longer exists 774 * or this vdev is no longer a member of 775 * the pool. 776 */ 777 unlink(fullName.c_str()); 778 return; 779 } 780 781 /* 782 * Any vdev we find that does not have a case file 783 * must be in the healthy state and thus worthy of 784 * continued SERD data tracking. 785 */ 786 caseFile = new CaseFile(Vdev(zpl.front(), vdevConf)); 787 } 788 789 ifstream caseStream(fullName.c_str()); 790 if (!caseStream) 791 throw ZfsdException("CaseFile::DeSerialize: Unable to " 792 "read %s.\n", fileName); 793 794 caseFile->DeSerialize(caseStream); 795 } catch (const ParseException &exp) { 796 797 exp.Log(); 798 if (caseFile != existingCaseFile) 799 delete caseFile; 800 801 /* 802 * Since we can't parse the file, unlink it so we don't 803 * trip over it again. 804 */ 805 unlink(fileName); 806 } catch (const ZfsdException &zfsException) { 807 808 zfsException.Log(); 809 if (caseFile != existingCaseFile) 810 delete caseFile; 811 } 812 } 813 814 //- CaseFile Protected Methods ------------------------------------------------- 815 CaseFile::CaseFile(const Vdev &vdev) 816 : m_poolGUID(vdev.PoolGUID()), 817 m_vdevGUID(vdev.GUID()), 818 m_vdevState(vdev.State()), 819 m_vdevPhysPath(vdev.PhysicalPath()), 820 m_is_spare(vdev.IsSpare()) 821 { 822 stringstream guidString; 823 824 guidString << m_vdevGUID; 825 m_vdevGUIDString = guidString.str(); 826 guidString.str(""); 827 guidString << m_poolGUID; 828 m_poolGUIDString = guidString.str(); 829 830 s_activeCases.push_back(this); 831 832 syslog(LOG_INFO, "Creating new CaseFile:\n"); 833 Log(); 834 } 835 836 CaseFile::~CaseFile() 837 { 838 PurgeEvents(); 839 PurgeTentativeEvents(); 840 m_tentativeTimer.Stop(); 841 s_activeCases.remove(this); 842 } 843 844 void 845 CaseFile::PurgeEvents() 846 { 847 for (EventList::iterator event(m_events.begin()); 848 event != m_events.end(); event++) 849 delete *event; 850 851 m_events.clear(); 852 } 853 854 void 855 CaseFile::PurgeTentativeEvents() 856 { 857 for (EventList::iterator event(m_tentativeEvents.begin()); 858 event != m_tentativeEvents.end(); event++) 859 delete *event; 860 861 m_tentativeEvents.clear(); 862 } 863 864 void 865 CaseFile::SerializeEvList(const EventList events, int fd, 866 const char* prefix) const 867 { 868 if (events.empty()) 869 return; 870 for (EventList::const_iterator curEvent = events.begin(); 871 curEvent != events.end(); curEvent++) { 872 const string &eventString((*curEvent)->GetEventString()); 873 874 // TODO: replace many write(2) calls with a single writev(2) 875 if (prefix) 876 write(fd, prefix, strlen(prefix)); 877 write(fd, eventString.c_str(), eventString.length()); 878 } 879 } 880 881 void 882 CaseFile::Serialize() 883 { 884 stringstream saveFile; 885 886 saveFile << setfill('0') 887 << s_caseFilePath << "/" 888 << "pool_" << PoolGUIDString() 889 << "_vdev_" << VdevGUIDString() 890 << ".case"; 891 892 if (m_events.empty() && m_tentativeEvents.empty()) { 893 unlink(saveFile.str().c_str()); 894 return; 895 } 896 897 int fd(open(saveFile.str().c_str(), O_CREAT|O_TRUNC|O_WRONLY, 0644)); 898 if (fd == -1) { 899 syslog(LOG_ERR, "CaseFile::Serialize: Unable to open %s.\n", 900 saveFile.str().c_str()); 901 return; 902 } 903 SerializeEvList(m_events, fd); 904 SerializeEvList(m_tentativeEvents, fd, "tentative "); 905 close(fd); 906 } 907 908 /* 909 * XXX: This method assumes that events may not contain embedded newlines. If 910 * ever events can contain embedded newlines, then CaseFile must switch 911 * serialization formats 912 */ 913 void 914 CaseFile::DeSerialize(ifstream &caseStream) 915 { 916 string evString; 917 const EventFactory &factory(ZfsDaemon::Get().GetFactory()); 918 919 caseStream >> std::noskipws >> std::ws; 920 while (caseStream.good()) { 921 /* 922 * Outline: 923 * read the beginning of a line and check it for 924 * "tentative". If found, discard "tentative". 925 * Create a new event 926 * continue 927 */ 928 EventList* destEvents; 929 const string tentFlag("tentative "); 930 string line; 931 std::stringbuf lineBuf; 932 933 caseStream.get(lineBuf); 934 caseStream.ignore(); /*discard the newline character*/ 935 line = lineBuf.str(); 936 if (line.compare(0, tentFlag.size(), tentFlag) == 0) { 937 /* Discard "tentative" */ 938 line.erase(0, tentFlag.size()); 939 destEvents = &m_tentativeEvents; 940 } else { 941 destEvents = &m_events; 942 } 943 Event *event(Event::CreateEvent(factory, line)); 944 if (event != NULL) { 945 destEvents->push_back(event); 946 RegisterCallout(*event); 947 } 948 } 949 } 950 951 void 952 CaseFile::Close() 953 { 954 /* 955 * This case is no longer relevant. Clean up our 956 * serialization file, and delete the case. 957 */ 958 syslog(LOG_INFO, "CaseFile(%s,%s) closed - State %s\n", 959 PoolGUIDString().c_str(), VdevGUIDString().c_str(), 960 zpool_state_to_name(VdevState(), VDEV_AUX_NONE)); 961 962 /* 963 * Serialization of a Case with no event data, clears the 964 * Serialization data for that event. 965 */ 966 PurgeEvents(); 967 Serialize(); 968 969 delete this; 970 } 971 972 void 973 CaseFile::OnGracePeriodEnded() 974 { 975 bool should_fault, should_degrade; 976 ZpoolList zpl(ZpoolList::ZpoolByGUID, &m_poolGUID); 977 zpool_handle_t *zhp(zpl.empty() ? NULL : zpl.front()); 978 979 m_events.splice(m_events.begin(), m_tentativeEvents); 980 should_fault = ShouldFault(); 981 should_degrade = ShouldDegrade(); 982 983 if (should_fault || should_degrade) { 984 if (zhp == NULL 985 || (VdevIterator(zhp).Find(m_vdevGUID)) == NULL) { 986 /* 987 * Either the pool no longer exists 988 * or this vdev is no longer a member of 989 * the pool. 990 */ 991 Close(); 992 return; 993 } 994 995 } 996 997 /* A fault condition has priority over a degrade condition */ 998 if (ShouldFault()) { 999 /* Fault the vdev and close the case. */ 1000 if (zpool_vdev_fault(zhp, (uint64_t)m_vdevGUID, 1001 VDEV_AUX_ERR_EXCEEDED) == 0) { 1002 syslog(LOG_INFO, "Faulting vdev(%s/%s)", 1003 PoolGUIDString().c_str(), 1004 VdevGUIDString().c_str()); 1005 Close(); 1006 return; 1007 } 1008 else { 1009 syslog(LOG_ERR, "Fault vdev(%s/%s): %s: %s\n", 1010 PoolGUIDString().c_str(), 1011 VdevGUIDString().c_str(), 1012 libzfs_error_action(g_zfsHandle), 1013 libzfs_error_description(g_zfsHandle)); 1014 } 1015 } 1016 else if (ShouldDegrade()) { 1017 /* Degrade the vdev and close the case. */ 1018 if (zpool_vdev_degrade(zhp, (uint64_t)m_vdevGUID, 1019 VDEV_AUX_ERR_EXCEEDED) == 0) { 1020 syslog(LOG_INFO, "Degrading vdev(%s/%s)", 1021 PoolGUIDString().c_str(), 1022 VdevGUIDString().c_str()); 1023 Close(); 1024 return; 1025 } 1026 else { 1027 syslog(LOG_ERR, "Degrade vdev(%s/%s): %s: %s\n", 1028 PoolGUIDString().c_str(), 1029 VdevGUIDString().c_str(), 1030 libzfs_error_action(g_zfsHandle), 1031 libzfs_error_description(g_zfsHandle)); 1032 } 1033 } 1034 Serialize(); 1035 } 1036 1037 Vdev 1038 CaseFile::BeingReplacedBy(zpool_handle_t *zhp) { 1039 Vdev vd(zhp, CaseVdev(zhp)); 1040 std::list<Vdev> children; 1041 std::list<Vdev>::iterator children_it; 1042 1043 Vdev parent(vd.Parent()); 1044 Vdev replacing(NonexistentVdev); 1045 1046 /* 1047 * To determine whether we are being replaced by another spare that 1048 * is still working, then make sure that it is currently spared and 1049 * that the spare is either resilvering or healthy. If any of these 1050 * conditions fail, then we are not being replaced by a spare. 1051 * 1052 * If the spare is healthy, then the case file should be closed very 1053 * soon after this check. 1054 */ 1055 if (parent.DoesNotExist() 1056 || parent.Name(zhp, /*verbose*/false) != "spare") 1057 return (NonexistentVdev); 1058 1059 children = parent.Children(); 1060 children_it = children.begin(); 1061 for (;children_it != children.end(); children_it++) { 1062 Vdev child = *children_it; 1063 1064 /* Skip our vdev. */ 1065 if (child.GUID() == VdevGUID()) 1066 continue; 1067 /* 1068 * Accept the first child that doesn't match our GUID, or 1069 * any resilvering/healthy device if one exists. 1070 */ 1071 if (replacing.DoesNotExist() || child.IsResilvering() 1072 || child.State() == VDEV_STATE_HEALTHY) 1073 replacing = child; 1074 } 1075 1076 return (replacing); 1077 } 1078 1079 bool 1080 CaseFile::Replace(const char* vdev_type, const char* path, bool isspare) { 1081 nvlist_t *nvroot, *newvd; 1082 const char *poolname; 1083 string oldstr(VdevGUIDString()); 1084 bool retval = true; 1085 1086 /* Figure out what pool we're working on */ 1087 ZpoolList zpl(ZpoolList::ZpoolByGUID, &m_poolGUID); 1088 zpool_handle_t *zhp(zpl.empty() ? NULL : zpl.front()); 1089 if (zhp == NULL) { 1090 syslog(LOG_ERR, "CaseFile::Replace: could not find pool for " 1091 "pool_guid %" PRIu64 ".", (uint64_t)m_poolGUID); 1092 return (false); 1093 } 1094 poolname = zpool_get_name(zhp); 1095 Vdev vd(zhp, CaseVdev(zhp)); 1096 Vdev replaced(BeingReplacedBy(zhp)); 1097 1098 if (isspare && !vd.IsSpare() && !replaced.DoesNotExist()) { 1099 /* If we are already being replaced by a working spare, pass. */ 1100 if (replaced.IsResilvering() 1101 || replaced.State() == VDEV_STATE_HEALTHY) { 1102 syslog(LOG_INFO, "CaseFile::Replace(%s->%s): already " 1103 "replaced", VdevGUIDString().c_str(), path); 1104 return (/*consumed*/false); 1105 } 1106 /* 1107 * If we have already been replaced by a spare, but that spare 1108 * is broken, we must spare the spare, not the original device. 1109 */ 1110 oldstr = replaced.GUIDString(); 1111 syslog(LOG_INFO, "CaseFile::Replace(%s->%s): sparing " 1112 "broken spare %s instead", VdevGUIDString().c_str(), 1113 path, oldstr.c_str()); 1114 } 1115 1116 /* 1117 * Build a root vdev/leaf vdev configuration suitable for 1118 * zpool_vdev_attach. Only enough data for the kernel to find 1119 * the device (i.e. type and disk device node path) are needed. 1120 */ 1121 nvroot = NULL; 1122 newvd = NULL; 1123 1124 if (nvlist_alloc(&nvroot, NV_UNIQUE_NAME, 0) != 0 1125 || nvlist_alloc(&newvd, NV_UNIQUE_NAME, 0) != 0) { 1126 syslog(LOG_ERR, "Replace vdev(%s/%s): Unable to allocate " 1127 "configuration data.", poolname, oldstr.c_str()); 1128 if (nvroot != NULL) 1129 nvlist_free(nvroot); 1130 return (false); 1131 } 1132 if (nvlist_add_string(newvd, ZPOOL_CONFIG_TYPE, vdev_type) != 0 1133 || nvlist_add_string(newvd, ZPOOL_CONFIG_PATH, path) != 0 1134 || nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT) != 0 1135 || nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, 1136 &newvd, 1) != 0) { 1137 syslog(LOG_ERR, "Replace vdev(%s/%s): Unable to initialize " 1138 "configuration data.", poolname, oldstr.c_str()); 1139 nvlist_free(newvd); 1140 nvlist_free(nvroot); 1141 return (true); 1142 } 1143 1144 /* Data was copied when added to the root vdev. */ 1145 nvlist_free(newvd); 1146 1147 retval = (zpool_vdev_attach(zhp, oldstr.c_str(), path, nvroot, 1148 /*replace*/B_TRUE, /*rebuild*/ B_FALSE) == 0); 1149 if (retval) 1150 syslog(LOG_INFO, "Replacing vdev(%s/%s) with %s\n", 1151 poolname, oldstr.c_str(), path); 1152 else 1153 syslog(LOG_ERR, "Replace vdev(%s/%s): %s: %s\n", 1154 poolname, oldstr.c_str(), libzfs_error_action(g_zfsHandle), 1155 libzfs_error_description(g_zfsHandle)); 1156 nvlist_free(nvroot); 1157 1158 return (retval); 1159 } 1160 1161 /* Does the argument event refer to a checksum error? */ 1162 static bool 1163 IsChecksumEvent(const Event* const event) 1164 { 1165 return ("ereport.fs.zfs.checksum" == event->Value("type")); 1166 } 1167 1168 /* Does the argument event refer to an IO error? */ 1169 static bool 1170 IsIOEvent(const Event* const event) 1171 { 1172 return ("ereport.fs.zfs.io" == event->Value("type")); 1173 } 1174 1175 /* Does the argument event refer to an IO delay? */ 1176 static bool 1177 IsDelayEvent(const Event* const event) 1178 { 1179 return ("ereport.fs.zfs.delay" == event->Value("type")); 1180 } 1181 1182 bool 1183 CaseFile::ShouldDegrade() const 1184 { 1185 return (std::count_if(m_events.begin(), m_events.end(), 1186 IsChecksumEvent) > ZFS_DEGRADE_IO_COUNT); 1187 } 1188 1189 bool 1190 CaseFile::ShouldFault() const 1191 { 1192 bool should_fault_for_io, should_fault_for_delay; 1193 1194 should_fault_for_io = std::count_if(m_events.begin(), m_events.end(), 1195 IsIOEvent) > ZFS_DEGRADE_IO_COUNT; 1196 should_fault_for_delay = std::count_if(m_events.begin(), m_events.end(), 1197 IsDelayEvent) > ZFS_FAULT_DELAY_COUNT; 1198 1199 return (should_fault_for_io || should_fault_for_delay); 1200 } 1201 1202 nvlist_t * 1203 CaseFile::CaseVdev(zpool_handle_t *zhp) const 1204 { 1205 return (VdevIterator(zhp).Find(VdevGUID())); 1206 } 1207