1 /*- 2 * Copyright (c) 2011, 2012, 2013, 2014, 2016 Spectra Logic Corporation 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions, and the following disclaimer, 10 * without modification. 11 * 2. Redistributions in binary form must reproduce at minimum a disclaimer 12 * substantially similar to the "NO WARRANTY" disclaimer below 13 * ("Disclaimer") and any redistribution must be conditioned upon 14 * including a substantially similar Disclaimer requirement for further 15 * binary redistribution. 16 * 17 * NO WARRANTY 18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR 21 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 22 * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 24 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 25 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 26 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING 27 * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 28 * POSSIBILITY OF SUCH DAMAGES. 29 * 30 * Authors: Justin T. Gibbs (Spectra Logic Corporation) 31 */ 32 33 /** 34 * \file case_file.cc 35 * 36 * We keep case files for any leaf vdev that is not in the optimal state. 37 * However, we only serialize to disk those events that need to be preserved 38 * across reboots. For now, this is just a log of soft errors which we 39 * accumulate in order to mark a device as degraded. 40 */ 41 #include <sys/cdefs.h> 42 #include <sys/byteorder.h> 43 #include <sys/time.h> 44 45 #include <sys/fs/zfs.h> 46 47 #include <dirent.h> 48 #include <fcntl.h> 49 #include <iomanip> 50 #include <fstream> 51 #include <functional> 52 #include <sstream> 53 #include <syslog.h> 54 #include <unistd.h> 55 56 #include <libzfs.h> 57 58 #include <list> 59 #include <map> 60 #include <string> 61 62 #include <devdctl/guid.h> 63 #include <devdctl/event.h> 64 #include <devdctl/event_factory.h> 65 #include <devdctl/exception.h> 66 #include <devdctl/consumer.h> 67 68 #include "callout.h" 69 #include "vdev_iterator.h" 70 #include "zfsd_event.h" 71 #include "case_file.h" 72 #include "vdev.h" 73 #include "zfsd.h" 74 #include "zfsd_exception.h" 75 #include "zpool_list.h" 76 77 __FBSDID("$FreeBSD$"); 78 79 /*============================ Namespace Control =============================*/ 80 using std::hex; 81 using std::ifstream; 82 using std::stringstream; 83 using std::setfill; 84 using std::setw; 85 86 using DevdCtl::Event; 87 using DevdCtl::EventFactory; 88 using DevdCtl::EventList; 89 using DevdCtl::Guid; 90 using DevdCtl::ParseException; 91 92 /*--------------------------------- CaseFile ---------------------------------*/ 93 //- CaseFile Static Data ------------------------------------------------------- 94 95 CaseFileList CaseFile::s_activeCases; 96 const string CaseFile::s_caseFilePath = "/var/db/zfsd/cases"; 97 const timeval CaseFile::s_removeGracePeriod = { 60 /*sec*/, 0 /*usec*/}; 98 99 //- CaseFile Static Public Methods --------------------------------------------- 100 CaseFile * 101 CaseFile::Find(Guid poolGUID, Guid vdevGUID) 102 { 103 for (CaseFileList::iterator curCase = s_activeCases.begin(); 104 curCase != s_activeCases.end(); curCase++) { 105 106 if (((*curCase)->PoolGUID() != poolGUID 107 && Guid::InvalidGuid() != poolGUID) 108 || (*curCase)->VdevGUID() != vdevGUID) 109 continue; 110 111 /* 112 * We only carry one active case per-vdev. 113 */ 114 return (*curCase); 115 } 116 return (NULL); 117 } 118 119 void 120 CaseFile::Find(Guid poolGUID, Guid vdevGUID, CaseFileList &cases) 121 { 122 for (CaseFileList::iterator curCase = s_activeCases.begin(); 123 curCase != s_activeCases.end(); curCase++) { 124 if (((*curCase)->PoolGUID() != poolGUID && 125 Guid::InvalidGuid() != poolGUID) || 126 (*curCase)->VdevGUID() != vdevGUID) 127 continue; 128 129 /* 130 * We can have multiple cases for spare vdevs 131 */ 132 cases.push_back(*curCase); 133 if (!(*curCase)->IsSpare()) { 134 return; 135 } 136 } 137 } 138 139 CaseFile * 140 CaseFile::Find(const string &physPath) 141 { 142 CaseFile *result = NULL; 143 144 for (CaseFileList::iterator curCase = s_activeCases.begin(); 145 curCase != s_activeCases.end(); curCase++) { 146 147 if ((*curCase)->PhysicalPath() != physPath) 148 continue; 149 150 if (result != NULL) { 151 syslog(LOG_WARNING, "Multiple casefiles found for " 152 "physical path %s. " 153 "This is most likely a bug in zfsd", 154 physPath.c_str()); 155 } 156 result = *curCase; 157 } 158 return (result); 159 } 160 161 162 void 163 CaseFile::ReEvaluateByGuid(Guid poolGUID, const ZfsEvent &event) 164 { 165 CaseFileList::iterator casefile; 166 for (casefile = s_activeCases.begin(); casefile != s_activeCases.end();){ 167 CaseFileList::iterator next = casefile; 168 next++; 169 if (poolGUID == (*casefile)->PoolGUID()) 170 (*casefile)->ReEvaluate(event); 171 casefile = next; 172 } 173 } 174 175 CaseFile & 176 CaseFile::Create(Vdev &vdev) 177 { 178 CaseFile *activeCase; 179 180 activeCase = Find(vdev.PoolGUID(), vdev.GUID()); 181 if (activeCase == NULL) 182 activeCase = new CaseFile(vdev); 183 184 return (*activeCase); 185 } 186 187 void 188 CaseFile::DeSerialize() 189 { 190 struct dirent **caseFiles; 191 192 int numCaseFiles(scandir(s_caseFilePath.c_str(), &caseFiles, 193 DeSerializeSelector, /*compar*/NULL)); 194 195 if (numCaseFiles == -1) 196 return; 197 if (numCaseFiles == 0) { 198 free(caseFiles); 199 return; 200 } 201 202 for (int i = 0; i < numCaseFiles; i++) { 203 204 DeSerializeFile(caseFiles[i]->d_name); 205 free(caseFiles[i]); 206 } 207 free(caseFiles); 208 } 209 210 bool 211 CaseFile::Empty() 212 { 213 return (s_activeCases.empty()); 214 } 215 216 void 217 CaseFile::LogAll() 218 { 219 for (CaseFileList::iterator curCase = s_activeCases.begin(); 220 curCase != s_activeCases.end(); curCase++) 221 (*curCase)->Log(); 222 } 223 224 void 225 CaseFile::PurgeAll() 226 { 227 /* 228 * Serialize casefiles before deleting them so that they can be reread 229 * and revalidated during BuildCaseFiles. 230 * CaseFiles remove themselves from this list on destruction. 231 */ 232 while (s_activeCases.size() != 0) { 233 CaseFile *casefile = s_activeCases.front(); 234 casefile->Serialize(); 235 delete casefile; 236 } 237 238 } 239 240 int 241 CaseFile::IsSpare() 242 { 243 return (m_is_spare); 244 } 245 246 //- CaseFile Public Methods ---------------------------------------------------- 247 bool 248 CaseFile::RefreshVdevState() 249 { 250 ZpoolList zpl(ZpoolList::ZpoolByGUID, &m_poolGUID); 251 zpool_handle_t *casePool(zpl.empty() ? NULL : zpl.front()); 252 if (casePool == NULL) 253 return (false); 254 255 Vdev vd(casePool, CaseVdev(casePool)); 256 if (vd.DoesNotExist()) 257 return (false); 258 259 m_vdevState = vd.State(); 260 m_vdevPhysPath = vd.PhysicalPath(); 261 return (true); 262 } 263 264 bool 265 CaseFile::ReEvaluate(const string &devPath, const string &physPath, Vdev *vdev) 266 { 267 ZpoolList zpl(ZpoolList::ZpoolByGUID, &m_poolGUID); 268 zpool_handle_t *pool(zpl.empty() ? NULL : zpl.front()); 269 int flags = ZFS_ONLINE_CHECKREMOVE | ZFS_ONLINE_UNSPARE; 270 271 if (pool == NULL || !RefreshVdevState()) { 272 /* 273 * The pool or vdev for this case file is no longer 274 * part of the configuration. This can happen 275 * if we process a device arrival notification 276 * before seeing the ZFS configuration change 277 * event. 278 */ 279 syslog(LOG_INFO, 280 "CaseFile::ReEvaluate(%s,%s) Pool/Vdev unconfigured. " 281 "Closing\n", 282 PoolGUIDString().c_str(), 283 VdevGUIDString().c_str()); 284 Close(); 285 286 /* 287 * Since this event was not used to close this 288 * case, do not report it as consumed. 289 */ 290 return (/*consumed*/false); 291 } 292 293 if (VdevState() > VDEV_STATE_CANT_OPEN) { 294 /* 295 * For now, newly discovered devices only help for 296 * devices that are missing. In the future, we might 297 * use a newly inserted spare to replace a degraded 298 * or faulted device. 299 */ 300 syslog(LOG_INFO, "CaseFile::ReEvaluate(%s,%s): Pool/Vdev ignored", 301 PoolGUIDString().c_str(), VdevGUIDString().c_str()); 302 return (/*consumed*/false); 303 } 304 305 if (vdev != NULL 306 && ( vdev->PoolGUID() == m_poolGUID 307 || vdev->PoolGUID() == Guid::InvalidGuid()) 308 && vdev->GUID() == m_vdevGUID) { 309 310 if (IsSpare()) 311 flags |= ZFS_ONLINE_SPARE; 312 if (zpool_vdev_online(pool, vdev->GUIDString().c_str(), 313 flags, &m_vdevState) != 0) { 314 syslog(LOG_ERR, 315 "Failed to online vdev(%s/%s:%s): %s: %s\n", 316 zpool_get_name(pool), vdev->GUIDString().c_str(), 317 devPath.c_str(), libzfs_error_action(g_zfsHandle), 318 libzfs_error_description(g_zfsHandle)); 319 return (/*consumed*/false); 320 } 321 322 syslog(LOG_INFO, "Onlined vdev(%s/%s:%s). State now %s.\n", 323 zpool_get_name(pool), vdev->GUIDString().c_str(), 324 devPath.c_str(), 325 zpool_state_to_name(VdevState(), VDEV_AUX_NONE)); 326 327 /* 328 * Check the vdev state post the online action to see 329 * if we can retire this case. 330 */ 331 CloseIfSolved(); 332 333 return (/*consumed*/true); 334 } 335 336 /* 337 * If the auto-replace policy is enabled, and we have physical 338 * path information, try a physical path replacement. 339 */ 340 if (zpool_get_prop_int(pool, ZPOOL_PROP_AUTOREPLACE, NULL) == 0) { 341 syslog(LOG_INFO, 342 "CaseFile(%s:%s:%s): AutoReplace not set. " 343 "Ignoring device insertion.\n", 344 PoolGUIDString().c_str(), 345 VdevGUIDString().c_str(), 346 zpool_state_to_name(VdevState(), VDEV_AUX_NONE)); 347 return (/*consumed*/false); 348 } 349 350 if (PhysicalPath().empty()) { 351 syslog(LOG_INFO, 352 "CaseFile(%s:%s:%s): No physical path information. " 353 "Ignoring device insertion.\n", 354 PoolGUIDString().c_str(), 355 VdevGUIDString().c_str(), 356 zpool_state_to_name(VdevState(), VDEV_AUX_NONE)); 357 return (/*consumed*/false); 358 } 359 360 if (physPath != PhysicalPath()) { 361 syslog(LOG_INFO, 362 "CaseFile(%s:%s:%s): Physical path mismatch. " 363 "Ignoring device insertion.\n", 364 PoolGUIDString().c_str(), 365 VdevGUIDString().c_str(), 366 zpool_state_to_name(VdevState(), VDEV_AUX_NONE)); 367 return (/*consumed*/false); 368 } 369 370 /* Write a label on the newly inserted disk. */ 371 if (zpool_label_disk(g_zfsHandle, pool, devPath.c_str()) != 0) { 372 syslog(LOG_ERR, 373 "Replace vdev(%s/%s) by physical path (label): %s: %s\n", 374 zpool_get_name(pool), VdevGUIDString().c_str(), 375 libzfs_error_action(g_zfsHandle), 376 libzfs_error_description(g_zfsHandle)); 377 return (/*consumed*/false); 378 } 379 380 syslog(LOG_INFO, "CaseFile::ReEvaluate(%s/%s): Replacing with %s", 381 PoolGUIDString().c_str(), VdevGUIDString().c_str(), 382 devPath.c_str()); 383 return (Replace(VDEV_TYPE_DISK, devPath.c_str(), /*isspare*/false)); 384 } 385 386 bool 387 CaseFile::ReEvaluate(const ZfsEvent &event) 388 { 389 bool consumed(false); 390 391 if (event.Value("type") == "sysevent.fs.zfs.vdev_remove") { 392 /* 393 * The Vdev we represent has been removed from the 394 * configuration. This case is no longer of value. 395 */ 396 Close(); 397 398 return (/*consumed*/true); 399 } else if (event.Value("type") == "sysevent.fs.zfs.pool_destroy") { 400 /* This Pool has been destroyed. Discard the case */ 401 Close(); 402 403 return (/*consumed*/true); 404 } else if (event.Value("type") == "sysevent.fs.zfs.config_sync") { 405 RefreshVdevState(); 406 if (VdevState() < VDEV_STATE_HEALTHY) 407 consumed = ActivateSpare(); 408 } 409 410 411 if (event.Value("class") == "resource.fs.zfs.removed") { 412 bool spare_activated; 413 414 if (!RefreshVdevState()) { 415 /* 416 * The pool or vdev for this case file is no longer 417 * part of the configuration. This can happen 418 * if we process a device arrival notification 419 * before seeing the ZFS configuration change 420 * event. 421 */ 422 syslog(LOG_INFO, 423 "CaseFile::ReEvaluate(%s,%s) Pool/Vdev " 424 "unconfigured. Closing\n", 425 PoolGUIDString().c_str(), 426 VdevGUIDString().c_str()); 427 /* 428 * Close the case now so we won't waste cycles in the 429 * system rescan 430 */ 431 Close(); 432 433 /* 434 * Since this event was not used to close this 435 * case, do not report it as consumed. 436 */ 437 return (/*consumed*/false); 438 } 439 440 /* 441 * Discard any tentative I/O error events for 442 * this case. They were most likely caused by the 443 * hot-unplug of this device. 444 */ 445 PurgeTentativeEvents(); 446 447 /* Try to activate spares if they are available */ 448 spare_activated = ActivateSpare(); 449 450 /* 451 * Rescan the drives in the system to see if a recent 452 * drive arrival can be used to solve this case. 453 */ 454 ZfsDaemon::RequestSystemRescan(); 455 456 /* 457 * Consume the event if we successfully activated a spare. 458 * Otherwise, leave it in the unconsumed events list so that the 459 * future addition of a spare to this pool might be able to 460 * close the case 461 */ 462 consumed = spare_activated; 463 } else if (event.Value("class") == "resource.fs.zfs.statechange") { 464 RefreshVdevState(); 465 /* 466 * If this vdev is DEGRADED, FAULTED, or UNAVAIL, try to 467 * activate a hotspare. Otherwise, ignore the event 468 */ 469 if (VdevState() == VDEV_STATE_FAULTED || 470 VdevState() == VDEV_STATE_DEGRADED || 471 VdevState() == VDEV_STATE_CANT_OPEN) 472 (void) ActivateSpare(); 473 consumed = true; 474 } 475 else if (event.Value("class") == "ereport.fs.zfs.io" || 476 event.Value("class") == "ereport.fs.zfs.checksum") { 477 478 m_tentativeEvents.push_front(event.DeepCopy()); 479 RegisterCallout(event); 480 consumed = true; 481 } 482 483 bool closed(CloseIfSolved()); 484 485 return (consumed || closed); 486 } 487 488 /* Find a Vdev containing the vdev with the given GUID */ 489 static nvlist_t* 490 find_parent(nvlist_t *pool_config, nvlist_t *config, DevdCtl::Guid child_guid) 491 { 492 nvlist_t **vdevChildren; 493 int error; 494 unsigned ch, numChildren; 495 496 error = nvlist_lookup_nvlist_array(config, ZPOOL_CONFIG_CHILDREN, 497 &vdevChildren, &numChildren); 498 499 if (error != 0 || numChildren == 0) 500 return (NULL); 501 502 for (ch = 0; ch < numChildren; ch++) { 503 nvlist *result; 504 Vdev vdev(pool_config, vdevChildren[ch]); 505 506 if (vdev.GUID() == child_guid) 507 return (config); 508 509 result = find_parent(pool_config, vdevChildren[ch], child_guid); 510 if (result != NULL) 511 return (result); 512 } 513 514 return (NULL); 515 } 516 517 bool 518 CaseFile::ActivateSpare() { 519 nvlist_t *config, *nvroot, *parent_config; 520 nvlist_t **spares; 521 const char *devPath, *poolname, *vdev_type; 522 u_int nspares, i; 523 int error; 524 525 ZpoolList zpl(ZpoolList::ZpoolByGUID, &m_poolGUID); 526 zpool_handle_t *zhp(zpl.empty() ? NULL : zpl.front()); 527 if (zhp == NULL) { 528 syslog(LOG_ERR, "CaseFile::ActivateSpare: Could not find pool " 529 "for pool_guid %" PRIu64".", (uint64_t)m_poolGUID); 530 return (false); 531 } 532 poolname = zpool_get_name(zhp); 533 config = zpool_get_config(zhp, NULL); 534 if (config == NULL) { 535 syslog(LOG_ERR, "CaseFile::ActivateSpare: Could not find pool " 536 "config for pool %s", poolname); 537 return (false); 538 } 539 error = nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot); 540 if (error != 0){ 541 syslog(LOG_ERR, "CaseFile::ActivateSpare: Could not find vdev " 542 "tree for pool %s", poolname); 543 return (false); 544 } 545 546 parent_config = find_parent(config, nvroot, m_vdevGUID); 547 if (parent_config != NULL) { 548 const char *parent_type; 549 550 /* 551 * Don't activate spares for members of a "replacing" vdev. 552 * They're already dealt with. Sparing them will just drag out 553 * the resilver process. 554 */ 555 error = nvlist_lookup_string(parent_config, 556 ZPOOL_CONFIG_TYPE, &parent_type); 557 if (error == 0 && strcmp(parent_type, VDEV_TYPE_REPLACING) == 0) 558 return (false); 559 } 560 561 nspares = 0; 562 nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, 563 &nspares); 564 if (nspares == 0) { 565 /* The pool has no spares configured */ 566 syslog(LOG_INFO, "CaseFile::ActivateSpare: " 567 "No spares available for pool %s", poolname); 568 return (false); 569 } 570 for (i = 0; i < nspares; i++) { 571 uint64_t *nvlist_array; 572 vdev_stat_t *vs; 573 uint_t nstats; 574 575 if (nvlist_lookup_uint64_array(spares[i], 576 ZPOOL_CONFIG_VDEV_STATS, &nvlist_array, &nstats) != 0) { 577 syslog(LOG_ERR, "CaseFile::ActivateSpare: Could not " 578 "find vdev stats for pool %s, spare %d", 579 poolname, i); 580 return (false); 581 } 582 vs = reinterpret_cast<vdev_stat_t *>(nvlist_array); 583 584 if ((vs->vs_aux != VDEV_AUX_SPARED) 585 && (vs->vs_state == VDEV_STATE_HEALTHY)) { 586 /* We found a usable spare */ 587 break; 588 } 589 } 590 591 if (i == nspares) { 592 /* No available spares were found */ 593 return (false); 594 } 595 596 error = nvlist_lookup_string(spares[i], ZPOOL_CONFIG_PATH, &devPath); 597 if (error != 0) { 598 syslog(LOG_ERR, "CaseFile::ActivateSpare: Cannot determine " 599 "the path of pool %s, spare %d. Error %d", 600 poolname, i, error); 601 return (false); 602 } 603 604 error = nvlist_lookup_string(spares[i], ZPOOL_CONFIG_TYPE, &vdev_type); 605 if (error != 0) { 606 syslog(LOG_ERR, "CaseFile::ActivateSpare: Cannot determine " 607 "the vdev type of pool %s, spare %d. Error %d", 608 poolname, i, error); 609 return (false); 610 } 611 612 return (Replace(vdev_type, devPath, /*isspare*/true)); 613 } 614 615 void 616 CaseFile::RegisterCallout(const Event &event) 617 { 618 timeval now, countdown, elapsed, timestamp, zero, remaining; 619 620 gettimeofday(&now, 0); 621 timestamp = event.GetTimestamp(); 622 timersub(&now, ×tamp, &elapsed); 623 timersub(&s_removeGracePeriod, &elapsed, &countdown); 624 /* 625 * If countdown is <= zero, Reset the timer to the 626 * smallest positive time value instead 627 */ 628 timerclear(&zero); 629 if (timercmp(&countdown, &zero, <=)) { 630 timerclear(&countdown); 631 countdown.tv_usec = 1; 632 } 633 634 remaining = m_tentativeTimer.TimeRemaining(); 635 636 if (!m_tentativeTimer.IsPending() 637 || timercmp(&countdown, &remaining, <)) 638 m_tentativeTimer.Reset(countdown, OnGracePeriodEnded, this); 639 } 640 641 642 bool 643 CaseFile::CloseIfSolved() 644 { 645 if (m_events.empty() 646 && m_tentativeEvents.empty()) { 647 648 /* 649 * We currently do not track or take actions on 650 * devices in the degraded or faulted state. 651 * Once we have support for spare pools, we'll 652 * retain these cases so that any spares added in 653 * the future can be applied to them. 654 */ 655 switch (VdevState()) { 656 case VDEV_STATE_HEALTHY: 657 /* No need to keep cases for healthy vdevs */ 658 Close(); 659 return (true); 660 case VDEV_STATE_REMOVED: 661 case VDEV_STATE_CANT_OPEN: 662 /* 663 * Keep open. We may solve it with a newly inserted 664 * device. 665 */ 666 case VDEV_STATE_FAULTED: 667 case VDEV_STATE_DEGRADED: 668 /* 669 * Keep open. We may solve it with the future 670 * addition of a spare to the pool 671 */ 672 case VDEV_STATE_UNKNOWN: 673 case VDEV_STATE_CLOSED: 674 case VDEV_STATE_OFFLINE: 675 /* 676 * Keep open? This may not be the correct behavior, 677 * but it's what we've always done 678 */ 679 ; 680 } 681 682 /* 683 * Re-serialize the case in order to remove any 684 * previous event data. 685 */ 686 Serialize(); 687 } 688 689 return (false); 690 } 691 692 void 693 CaseFile::Log() 694 { 695 syslog(LOG_INFO, "CaseFile(%s,%s,%s)\n", PoolGUIDString().c_str(), 696 VdevGUIDString().c_str(), PhysicalPath().c_str()); 697 syslog(LOG_INFO, "\tVdev State = %s\n", 698 zpool_state_to_name(VdevState(), VDEV_AUX_NONE)); 699 if (m_tentativeEvents.size() != 0) { 700 syslog(LOG_INFO, "\t=== Tentative Events ===\n"); 701 for (EventList::iterator event(m_tentativeEvents.begin()); 702 event != m_tentativeEvents.end(); event++) 703 (*event)->Log(LOG_INFO); 704 } 705 if (m_events.size() != 0) { 706 syslog(LOG_INFO, "\t=== Events ===\n"); 707 for (EventList::iterator event(m_events.begin()); 708 event != m_events.end(); event++) 709 (*event)->Log(LOG_INFO); 710 } 711 } 712 713 //- CaseFile Static Protected Methods ------------------------------------------ 714 void 715 CaseFile::OnGracePeriodEnded(void *arg) 716 { 717 CaseFile &casefile(*static_cast<CaseFile *>(arg)); 718 719 casefile.OnGracePeriodEnded(); 720 } 721 722 int 723 CaseFile::DeSerializeSelector(const struct dirent *dirEntry) 724 { 725 uint64_t poolGUID; 726 uint64_t vdevGUID; 727 728 if (dirEntry->d_type == DT_REG 729 && sscanf(dirEntry->d_name, "pool_%" PRIu64 "_vdev_%" PRIu64 ".case", 730 &poolGUID, &vdevGUID) == 2) 731 return (1); 732 return (0); 733 } 734 735 void 736 CaseFile::DeSerializeFile(const char *fileName) 737 { 738 string fullName(s_caseFilePath + '/' + fileName); 739 CaseFile *existingCaseFile(NULL); 740 CaseFile *caseFile(NULL); 741 742 try { 743 uint64_t poolGUID; 744 uint64_t vdevGUID; 745 nvlist_t *vdevConf; 746 747 if (sscanf(fileName, "pool_%" PRIu64 "_vdev_%" PRIu64 ".case", 748 &poolGUID, &vdevGUID) != 2) { 749 throw ZfsdException("CaseFile::DeSerialize: " 750 "Unintelligible CaseFile filename %s.\n", fileName); 751 } 752 existingCaseFile = Find(Guid(poolGUID), Guid(vdevGUID)); 753 if (existingCaseFile != NULL) { 754 /* 755 * If the vdev is already degraded or faulted, 756 * there's no point in keeping the state around 757 * that we use to put a drive into the degraded 758 * state. However, if the vdev is simply missing, 759 * preserve the case data in the hopes that it will 760 * return. 761 */ 762 caseFile = existingCaseFile; 763 vdev_state curState(caseFile->VdevState()); 764 if (curState > VDEV_STATE_CANT_OPEN 765 && curState < VDEV_STATE_HEALTHY) { 766 unlink(fileName); 767 return; 768 } 769 } else { 770 ZpoolList zpl(ZpoolList::ZpoolByGUID, &poolGUID); 771 if (zpl.empty() 772 || (vdevConf = VdevIterator(zpl.front()) 773 .Find(vdevGUID)) == NULL) { 774 /* 775 * Either the pool no longer exists 776 * or this vdev is no longer a member of 777 * the pool. 778 */ 779 unlink(fullName.c_str()); 780 return; 781 } 782 783 /* 784 * Any vdev we find that does not have a case file 785 * must be in the healthy state and thus worthy of 786 * continued SERD data tracking. 787 */ 788 caseFile = new CaseFile(Vdev(zpl.front(), vdevConf)); 789 } 790 791 ifstream caseStream(fullName.c_str()); 792 if (!caseStream) 793 throw ZfsdException("CaseFile::DeSerialize: Unable to " 794 "read %s.\n", fileName); 795 796 caseFile->DeSerialize(caseStream); 797 } catch (const ParseException &exp) { 798 799 exp.Log(); 800 if (caseFile != existingCaseFile) 801 delete caseFile; 802 803 /* 804 * Since we can't parse the file, unlink it so we don't 805 * trip over it again. 806 */ 807 unlink(fileName); 808 } catch (const ZfsdException &zfsException) { 809 810 zfsException.Log(); 811 if (caseFile != existingCaseFile) 812 delete caseFile; 813 } 814 } 815 816 //- CaseFile Protected Methods ------------------------------------------------- 817 CaseFile::CaseFile(const Vdev &vdev) 818 : m_poolGUID(vdev.PoolGUID()), 819 m_vdevGUID(vdev.GUID()), 820 m_vdevState(vdev.State()), 821 m_vdevPhysPath(vdev.PhysicalPath()), 822 m_is_spare(vdev.IsSpare()) 823 { 824 stringstream guidString; 825 826 guidString << m_vdevGUID; 827 m_vdevGUIDString = guidString.str(); 828 guidString.str(""); 829 guidString << m_poolGUID; 830 m_poolGUIDString = guidString.str(); 831 832 s_activeCases.push_back(this); 833 834 syslog(LOG_INFO, "Creating new CaseFile:\n"); 835 Log(); 836 } 837 838 CaseFile::~CaseFile() 839 { 840 PurgeEvents(); 841 PurgeTentativeEvents(); 842 m_tentativeTimer.Stop(); 843 s_activeCases.remove(this); 844 } 845 846 void 847 CaseFile::PurgeEvents() 848 { 849 for (EventList::iterator event(m_events.begin()); 850 event != m_events.end(); event++) 851 delete *event; 852 853 m_events.clear(); 854 } 855 856 void 857 CaseFile::PurgeTentativeEvents() 858 { 859 for (EventList::iterator event(m_tentativeEvents.begin()); 860 event != m_tentativeEvents.end(); event++) 861 delete *event; 862 863 m_tentativeEvents.clear(); 864 } 865 866 void 867 CaseFile::SerializeEvList(const EventList events, int fd, 868 const char* prefix) const 869 { 870 if (events.empty()) 871 return; 872 for (EventList::const_iterator curEvent = events.begin(); 873 curEvent != events.end(); curEvent++) { 874 const string &eventString((*curEvent)->GetEventString()); 875 876 // TODO: replace many write(2) calls with a single writev(2) 877 if (prefix) 878 write(fd, prefix, strlen(prefix)); 879 write(fd, eventString.c_str(), eventString.length()); 880 } 881 } 882 883 void 884 CaseFile::Serialize() 885 { 886 stringstream saveFile; 887 888 saveFile << setfill('0') 889 << s_caseFilePath << "/" 890 << "pool_" << PoolGUIDString() 891 << "_vdev_" << VdevGUIDString() 892 << ".case"; 893 894 if (m_events.empty() && m_tentativeEvents.empty()) { 895 unlink(saveFile.str().c_str()); 896 return; 897 } 898 899 int fd(open(saveFile.str().c_str(), O_CREAT|O_TRUNC|O_WRONLY, 0644)); 900 if (fd == -1) { 901 syslog(LOG_ERR, "CaseFile::Serialize: Unable to open %s.\n", 902 saveFile.str().c_str()); 903 return; 904 } 905 SerializeEvList(m_events, fd); 906 SerializeEvList(m_tentativeEvents, fd, "tentative "); 907 close(fd); 908 } 909 910 /* 911 * XXX: This method assumes that events may not contain embedded newlines. If 912 * ever events can contain embedded newlines, then CaseFile must switch 913 * serialization formats 914 */ 915 void 916 CaseFile::DeSerialize(ifstream &caseStream) 917 { 918 string evString; 919 const EventFactory &factory(ZfsDaemon::Get().GetFactory()); 920 921 caseStream >> std::noskipws >> std::ws; 922 while (caseStream.good()) { 923 /* 924 * Outline: 925 * read the beginning of a line and check it for 926 * "tentative". If found, discard "tentative". 927 * Create a new event 928 * continue 929 */ 930 EventList* destEvents; 931 const string tentFlag("tentative "); 932 string line; 933 std::stringbuf lineBuf; 934 935 caseStream.get(lineBuf); 936 caseStream.ignore(); /*discard the newline character*/ 937 line = lineBuf.str(); 938 if (line.compare(0, tentFlag.size(), tentFlag) == 0) { 939 /* Discard "tentative" */ 940 line.erase(0, tentFlag.size()); 941 destEvents = &m_tentativeEvents; 942 } else { 943 destEvents = &m_events; 944 } 945 Event *event(Event::CreateEvent(factory, line)); 946 if (event != NULL) { 947 destEvents->push_back(event); 948 RegisterCallout(*event); 949 } 950 } 951 } 952 953 void 954 CaseFile::Close() 955 { 956 /* 957 * This case is no longer relevant. Clean up our 958 * serialization file, and delete the case. 959 */ 960 syslog(LOG_INFO, "CaseFile(%s,%s) closed - State %s\n", 961 PoolGUIDString().c_str(), VdevGUIDString().c_str(), 962 zpool_state_to_name(VdevState(), VDEV_AUX_NONE)); 963 964 /* 965 * Serialization of a Case with no event data, clears the 966 * Serialization data for that event. 967 */ 968 PurgeEvents(); 969 Serialize(); 970 971 delete this; 972 } 973 974 void 975 CaseFile::OnGracePeriodEnded() 976 { 977 bool should_fault, should_degrade; 978 ZpoolList zpl(ZpoolList::ZpoolByGUID, &m_poolGUID); 979 zpool_handle_t *zhp(zpl.empty() ? NULL : zpl.front()); 980 981 m_events.splice(m_events.begin(), m_tentativeEvents); 982 should_fault = ShouldFault(); 983 should_degrade = ShouldDegrade(); 984 985 if (should_fault || should_degrade) { 986 if (zhp == NULL 987 || (VdevIterator(zhp).Find(m_vdevGUID)) == NULL) { 988 /* 989 * Either the pool no longer exists 990 * or this vdev is no longer a member of 991 * the pool. 992 */ 993 Close(); 994 return; 995 } 996 997 } 998 999 /* A fault condition has priority over a degrade condition */ 1000 if (ShouldFault()) { 1001 /* Fault the vdev and close the case. */ 1002 if (zpool_vdev_fault(zhp, (uint64_t)m_vdevGUID, 1003 VDEV_AUX_ERR_EXCEEDED) == 0) { 1004 syslog(LOG_INFO, "Faulting vdev(%s/%s)", 1005 PoolGUIDString().c_str(), 1006 VdevGUIDString().c_str()); 1007 Close(); 1008 return; 1009 } 1010 else { 1011 syslog(LOG_ERR, "Fault vdev(%s/%s): %s: %s\n", 1012 PoolGUIDString().c_str(), 1013 VdevGUIDString().c_str(), 1014 libzfs_error_action(g_zfsHandle), 1015 libzfs_error_description(g_zfsHandle)); 1016 } 1017 } 1018 else if (ShouldDegrade()) { 1019 /* Degrade the vdev and close the case. */ 1020 if (zpool_vdev_degrade(zhp, (uint64_t)m_vdevGUID, 1021 VDEV_AUX_ERR_EXCEEDED) == 0) { 1022 syslog(LOG_INFO, "Degrading vdev(%s/%s)", 1023 PoolGUIDString().c_str(), 1024 VdevGUIDString().c_str()); 1025 Close(); 1026 return; 1027 } 1028 else { 1029 syslog(LOG_ERR, "Degrade vdev(%s/%s): %s: %s\n", 1030 PoolGUIDString().c_str(), 1031 VdevGUIDString().c_str(), 1032 libzfs_error_action(g_zfsHandle), 1033 libzfs_error_description(g_zfsHandle)); 1034 } 1035 } 1036 Serialize(); 1037 } 1038 1039 Vdev 1040 CaseFile::BeingReplacedBy(zpool_handle_t *zhp) { 1041 Vdev vd(zhp, CaseVdev(zhp)); 1042 std::list<Vdev> children; 1043 std::list<Vdev>::iterator children_it; 1044 1045 Vdev parent(vd.Parent()); 1046 Vdev replacing(NonexistentVdev); 1047 1048 /* 1049 * To determine whether we are being replaced by another spare that 1050 * is still working, then make sure that it is currently spared and 1051 * that the spare is either resilvering or healthy. If any of these 1052 * conditions fail, then we are not being replaced by a spare. 1053 * 1054 * If the spare is healthy, then the case file should be closed very 1055 * soon after this check. 1056 */ 1057 if (parent.DoesNotExist() 1058 || parent.Name(zhp, /*verbose*/false) != "spare") 1059 return (NonexistentVdev); 1060 1061 children = parent.Children(); 1062 children_it = children.begin(); 1063 for (;children_it != children.end(); children_it++) { 1064 Vdev child = *children_it; 1065 1066 /* Skip our vdev. */ 1067 if (child.GUID() == VdevGUID()) 1068 continue; 1069 /* 1070 * Accept the first child that doesn't match our GUID, or 1071 * any resilvering/healthy device if one exists. 1072 */ 1073 if (replacing.DoesNotExist() || child.IsResilvering() 1074 || child.State() == VDEV_STATE_HEALTHY) 1075 replacing = child; 1076 } 1077 1078 return (replacing); 1079 } 1080 1081 bool 1082 CaseFile::Replace(const char* vdev_type, const char* path, bool isspare) { 1083 nvlist_t *nvroot, *newvd; 1084 const char *poolname; 1085 string oldstr(VdevGUIDString()); 1086 bool retval = true; 1087 1088 /* Figure out what pool we're working on */ 1089 ZpoolList zpl(ZpoolList::ZpoolByGUID, &m_poolGUID); 1090 zpool_handle_t *zhp(zpl.empty() ? NULL : zpl.front()); 1091 if (zhp == NULL) { 1092 syslog(LOG_ERR, "CaseFile::Replace: could not find pool for " 1093 "pool_guid %" PRIu64 ".", (uint64_t)m_poolGUID); 1094 return (false); 1095 } 1096 poolname = zpool_get_name(zhp); 1097 Vdev vd(zhp, CaseVdev(zhp)); 1098 Vdev replaced(BeingReplacedBy(zhp)); 1099 1100 if (isspare && !vd.IsSpare() && !replaced.DoesNotExist()) { 1101 /* If we are already being replaced by a working spare, pass. */ 1102 if (replaced.IsResilvering() 1103 || replaced.State() == VDEV_STATE_HEALTHY) { 1104 syslog(LOG_INFO, "CaseFile::Replace(%s->%s): already " 1105 "replaced", VdevGUIDString().c_str(), path); 1106 return (/*consumed*/false); 1107 } 1108 /* 1109 * If we have already been replaced by a spare, but that spare 1110 * is broken, we must spare the spare, not the original device. 1111 */ 1112 oldstr = replaced.GUIDString(); 1113 syslog(LOG_INFO, "CaseFile::Replace(%s->%s): sparing " 1114 "broken spare %s instead", VdevGUIDString().c_str(), 1115 path, oldstr.c_str()); 1116 } 1117 1118 /* 1119 * Build a root vdev/leaf vdev configuration suitable for 1120 * zpool_vdev_attach. Only enough data for the kernel to find 1121 * the device (i.e. type and disk device node path) are needed. 1122 */ 1123 nvroot = NULL; 1124 newvd = NULL; 1125 1126 if (nvlist_alloc(&nvroot, NV_UNIQUE_NAME, 0) != 0 1127 || nvlist_alloc(&newvd, NV_UNIQUE_NAME, 0) != 0) { 1128 syslog(LOG_ERR, "Replace vdev(%s/%s): Unable to allocate " 1129 "configuration data.", poolname, oldstr.c_str()); 1130 if (nvroot != NULL) 1131 nvlist_free(nvroot); 1132 return (false); 1133 } 1134 if (nvlist_add_string(newvd, ZPOOL_CONFIG_TYPE, vdev_type) != 0 1135 || nvlist_add_string(newvd, ZPOOL_CONFIG_PATH, path) != 0 1136 || nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT) != 0 1137 || nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, 1138 &newvd, 1) != 0) { 1139 syslog(LOG_ERR, "Replace vdev(%s/%s): Unable to initialize " 1140 "configuration data.", poolname, oldstr.c_str()); 1141 nvlist_free(newvd); 1142 nvlist_free(nvroot); 1143 return (true); 1144 } 1145 1146 /* Data was copied when added to the root vdev. */ 1147 nvlist_free(newvd); 1148 1149 retval = (zpool_vdev_attach(zhp, oldstr.c_str(), path, nvroot, 1150 /*replace*/B_TRUE, /*rebuild*/ B_FALSE) == 0); 1151 if (retval) 1152 syslog(LOG_INFO, "Replacing vdev(%s/%s) with %s\n", 1153 poolname, oldstr.c_str(), path); 1154 else 1155 syslog(LOG_ERR, "Replace vdev(%s/%s): %s: %s\n", 1156 poolname, oldstr.c_str(), libzfs_error_action(g_zfsHandle), 1157 libzfs_error_description(g_zfsHandle)); 1158 nvlist_free(nvroot); 1159 1160 return (retval); 1161 } 1162 1163 /* Does the argument event refer to a checksum error? */ 1164 static bool 1165 IsChecksumEvent(const Event* const event) 1166 { 1167 return ("ereport.fs.zfs.checksum" == event->Value("type")); 1168 } 1169 1170 /* Does the argument event refer to an IO error? */ 1171 static bool 1172 IsIOEvent(const Event* const event) 1173 { 1174 return ("ereport.fs.zfs.io" == event->Value("type")); 1175 } 1176 1177 bool 1178 CaseFile::ShouldDegrade() const 1179 { 1180 return (std::count_if(m_events.begin(), m_events.end(), 1181 IsChecksumEvent) > ZFS_DEGRADE_IO_COUNT); 1182 } 1183 1184 bool 1185 CaseFile::ShouldFault() const 1186 { 1187 return (std::count_if(m_events.begin(), m_events.end(), 1188 IsIOEvent) > ZFS_DEGRADE_IO_COUNT); 1189 } 1190 1191 nvlist_t * 1192 CaseFile::CaseVdev(zpool_handle_t *zhp) const 1193 { 1194 return (VdevIterator(zhp).Find(VdevGUID())); 1195 } 1196