1 /*- 2 * Copyright (c) 2011, 2012, 2013, 2014, 2016 Spectra Logic Corporation 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions, and the following disclaimer, 10 * without modification. 11 * 2. Redistributions in binary form must reproduce at minimum a disclaimer 12 * substantially similar to the "NO WARRANTY" disclaimer below 13 * ("Disclaimer") and any redistribution must be conditioned upon 14 * including a substantially similar Disclaimer requirement for further 15 * binary redistribution. 16 * 17 * NO WARRANTY 18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR 21 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 22 * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 24 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 25 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 26 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING 27 * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 28 * POSSIBILITY OF SUCH DAMAGES. 29 * 30 * Authors: Justin T. Gibbs (Spectra Logic Corporation) 31 */ 32 33 /** 34 * \file case_file.cc 35 * 36 * We keep case files for any leaf vdev that is not in the optimal state. 37 * However, we only serialize to disk those events that need to be preserved 38 * across reboots. For now, this is just a log of soft errors which we 39 * accumulate in order to mark a device as degraded. 40 */ 41 #include <sys/cdefs.h> 42 #include <sys/time.h> 43 44 #include <sys/fs/zfs.h> 45 46 #include <dirent.h> 47 #include <iomanip> 48 #include <fstream> 49 #include <functional> 50 #include <sstream> 51 #include <syslog.h> 52 #include <unistd.h> 53 54 #include <libzfs.h> 55 56 #include <list> 57 #include <map> 58 #include <string> 59 60 #include <devdctl/guid.h> 61 #include <devdctl/event.h> 62 #include <devdctl/event_factory.h> 63 #include <devdctl/exception.h> 64 #include <devdctl/consumer.h> 65 66 #include "callout.h" 67 #include "vdev_iterator.h" 68 #include "zfsd_event.h" 69 #include "case_file.h" 70 #include "vdev.h" 71 #include "zfsd.h" 72 #include "zfsd_exception.h" 73 #include "zpool_list.h" 74 75 __FBSDID("$FreeBSD$"); 76 77 /*============================ Namespace Control =============================*/ 78 using std::auto_ptr; 79 using std::hex; 80 using std::ifstream; 81 using std::stringstream; 82 using std::setfill; 83 using std::setw; 84 85 using DevdCtl::Event; 86 using DevdCtl::EventFactory; 87 using DevdCtl::EventList; 88 using DevdCtl::Guid; 89 using DevdCtl::ParseException; 90 91 /*--------------------------------- CaseFile ---------------------------------*/ 92 //- CaseFile Static Data ------------------------------------------------------- 93 94 CaseFileList CaseFile::s_activeCases; 95 const string CaseFile::s_caseFilePath = "/var/db/zfsd/cases"; 96 const timeval CaseFile::s_removeGracePeriod = { 60 /*sec*/, 0 /*usec*/}; 97 98 //- CaseFile Static Public Methods --------------------------------------------- 99 CaseFile * 100 CaseFile::Find(Guid poolGUID, Guid vdevGUID) 101 { 102 for (CaseFileList::iterator curCase = s_activeCases.begin(); 103 curCase != s_activeCases.end(); curCase++) { 104 105 if (((*curCase)->PoolGUID() != poolGUID 106 && Guid::InvalidGuid() != poolGUID) 107 || (*curCase)->VdevGUID() != vdevGUID) 108 continue; 109 110 /* 111 * We only carry one active case per-vdev. 112 */ 113 return (*curCase); 114 } 115 return (NULL); 116 } 117 118 CaseFile * 119 CaseFile::Find(const string &physPath) 120 { 121 CaseFile *result = NULL; 122 123 for (CaseFileList::iterator curCase = s_activeCases.begin(); 124 curCase != s_activeCases.end(); curCase++) { 125 126 if ((*curCase)->PhysicalPath() != physPath) 127 continue; 128 129 if (result != NULL) { 130 syslog(LOG_WARNING, "Multiple casefiles found for " 131 "physical path %s. " 132 "This is most likely a bug in zfsd", 133 physPath.c_str()); 134 } 135 result = *curCase; 136 } 137 return (result); 138 } 139 140 141 void 142 CaseFile::ReEvaluateByGuid(Guid poolGUID, const ZfsEvent &event) 143 { 144 CaseFileList::iterator casefile; 145 for (casefile = s_activeCases.begin(); casefile != s_activeCases.end();){ 146 CaseFileList::iterator next = casefile; 147 next++; 148 if (poolGUID == (*casefile)->PoolGUID()) 149 (*casefile)->ReEvaluate(event); 150 casefile = next; 151 } 152 } 153 154 CaseFile & 155 CaseFile::Create(Vdev &vdev) 156 { 157 CaseFile *activeCase; 158 159 activeCase = Find(vdev.PoolGUID(), vdev.GUID()); 160 if (activeCase == NULL) 161 activeCase = new CaseFile(vdev); 162 163 return (*activeCase); 164 } 165 166 void 167 CaseFile::DeSerialize() 168 { 169 struct dirent **caseFiles; 170 171 int numCaseFiles(scandir(s_caseFilePath.c_str(), &caseFiles, 172 DeSerializeSelector, /*compar*/NULL)); 173 174 if (numCaseFiles == -1) 175 return; 176 if (numCaseFiles == 0) { 177 free(caseFiles); 178 return; 179 } 180 181 for (int i = 0; i < numCaseFiles; i++) { 182 183 DeSerializeFile(caseFiles[i]->d_name); 184 free(caseFiles[i]); 185 } 186 free(caseFiles); 187 } 188 189 bool 190 CaseFile::Empty() 191 { 192 return (s_activeCases.empty()); 193 } 194 195 void 196 CaseFile::LogAll() 197 { 198 for (CaseFileList::iterator curCase = s_activeCases.begin(); 199 curCase != s_activeCases.end(); curCase++) 200 (*curCase)->Log(); 201 } 202 203 void 204 CaseFile::PurgeAll() 205 { 206 /* 207 * Serialize casefiles before deleting them so that they can be reread 208 * and revalidated during BuildCaseFiles. 209 * CaseFiles remove themselves from this list on destruction. 210 */ 211 while (s_activeCases.size() != 0) { 212 CaseFile *casefile = s_activeCases.front(); 213 casefile->Serialize(); 214 delete casefile; 215 } 216 217 } 218 219 //- CaseFile Public Methods ---------------------------------------------------- 220 bool 221 CaseFile::RefreshVdevState() 222 { 223 ZpoolList zpl(ZpoolList::ZpoolByGUID, &m_poolGUID); 224 zpool_handle_t *casePool(zpl.empty() ? NULL : zpl.front()); 225 if (casePool == NULL) 226 return (false); 227 228 Vdev vd(casePool, CaseVdev(casePool)); 229 if (vd.DoesNotExist()) 230 return (false); 231 232 m_vdevState = vd.State(); 233 m_vdevPhysPath = vd.PhysicalPath(); 234 return (true); 235 } 236 237 bool 238 CaseFile::ReEvaluate(const string &devPath, const string &physPath, Vdev *vdev) 239 { 240 ZpoolList zpl(ZpoolList::ZpoolByGUID, &m_poolGUID); 241 zpool_handle_t *pool(zpl.empty() ? NULL : zpl.front()); 242 zpool_boot_label_t boot_type; 243 uint64_t boot_size; 244 245 if (pool == NULL || !RefreshVdevState()) { 246 /* 247 * The pool or vdev for this case file is no longer 248 * part of the configuration. This can happen 249 * if we process a device arrival notification 250 * before seeing the ZFS configuration change 251 * event. 252 */ 253 syslog(LOG_INFO, 254 "CaseFile::ReEvaluate(%s,%s) Pool/Vdev unconfigured. " 255 "Closing\n", 256 PoolGUIDString().c_str(), 257 VdevGUIDString().c_str()); 258 Close(); 259 260 /* 261 * Since this event was not used to close this 262 * case, do not report it as consumed. 263 */ 264 return (/*consumed*/false); 265 } 266 267 if (VdevState() > VDEV_STATE_CANT_OPEN) { 268 /* 269 * For now, newly discovered devices only help for 270 * devices that are missing. In the future, we might 271 * use a newly inserted spare to replace a degraded 272 * or faulted device. 273 */ 274 syslog(LOG_INFO, "CaseFile::ReEvaluate(%s,%s): Pool/Vdev ignored", 275 PoolGUIDString().c_str(), VdevGUIDString().c_str()); 276 return (/*consumed*/false); 277 } 278 279 if (vdev != NULL 280 && ( vdev->PoolGUID() == m_poolGUID 281 || vdev->PoolGUID() == Guid::InvalidGuid()) 282 && vdev->GUID() == m_vdevGUID) { 283 284 zpool_vdev_online(pool, vdev->GUIDString().c_str(), 285 ZFS_ONLINE_CHECKREMOVE | ZFS_ONLINE_UNSPARE, 286 &m_vdevState); 287 syslog(LOG_INFO, "Onlined vdev(%s/%s:%s). State now %s.\n", 288 zpool_get_name(pool), vdev->GUIDString().c_str(), 289 devPath.c_str(), 290 zpool_state_to_name(VdevState(), VDEV_AUX_NONE)); 291 292 /* 293 * Check the vdev state post the online action to see 294 * if we can retire this case. 295 */ 296 CloseIfSolved(); 297 298 return (/*consumed*/true); 299 } 300 301 /* 302 * If the auto-replace policy is enabled, and we have physical 303 * path information, try a physical path replacement. 304 */ 305 if (zpool_get_prop_int(pool, ZPOOL_PROP_AUTOREPLACE, NULL) == 0) { 306 syslog(LOG_INFO, 307 "CaseFile(%s:%s:%s): AutoReplace not set. " 308 "Ignoring device insertion.\n", 309 PoolGUIDString().c_str(), 310 VdevGUIDString().c_str(), 311 zpool_state_to_name(VdevState(), VDEV_AUX_NONE)); 312 return (/*consumed*/false); 313 } 314 315 if (PhysicalPath().empty()) { 316 syslog(LOG_INFO, 317 "CaseFile(%s:%s:%s): No physical path information. " 318 "Ignoring device insertion.\n", 319 PoolGUIDString().c_str(), 320 VdevGUIDString().c_str(), 321 zpool_state_to_name(VdevState(), VDEV_AUX_NONE)); 322 return (/*consumed*/false); 323 } 324 325 if (physPath != PhysicalPath()) { 326 syslog(LOG_INFO, 327 "CaseFile(%s:%s:%s): Physical path mismatch. " 328 "Ignoring device insertion.\n", 329 PoolGUIDString().c_str(), 330 VdevGUIDString().c_str(), 331 zpool_state_to_name(VdevState(), VDEV_AUX_NONE)); 332 return (/*consumed*/false); 333 } 334 335 /* Write a label on the newly inserted disk. */ 336 if (zpool_is_bootable(pool)) 337 boot_type = ZPOOL_COPY_BOOT_LABEL; 338 else 339 boot_type = ZPOOL_NO_BOOT_LABEL; 340 boot_size = zpool_get_prop_int(pool, ZPOOL_PROP_BOOTSIZE, NULL); 341 if (zpool_label_disk(g_zfsHandle, pool, devPath.c_str(), 342 boot_type, boot_size, NULL) != 0) { 343 syslog(LOG_ERR, 344 "Replace vdev(%s/%s) by physical path (label): %s: %s\n", 345 zpool_get_name(pool), VdevGUIDString().c_str(), 346 libzfs_error_action(g_zfsHandle), 347 libzfs_error_description(g_zfsHandle)); 348 return (/*consumed*/false); 349 } 350 351 syslog(LOG_INFO, "CaseFile::ReEvaluate(%s/%s): Replacing with %s", 352 PoolGUIDString().c_str(), VdevGUIDString().c_str(), 353 devPath.c_str()); 354 return (Replace(VDEV_TYPE_DISK, devPath.c_str(), /*isspare*/false)); 355 } 356 357 bool 358 CaseFile::ReEvaluate(const ZfsEvent &event) 359 { 360 bool consumed(false); 361 362 if (event.Value("type") == "misc.fs.zfs.vdev_remove") { 363 /* 364 * The Vdev we represent has been removed from the 365 * configuration. This case is no longer of value. 366 */ 367 Close(); 368 369 return (/*consumed*/true); 370 } else if (event.Value("type") == "misc.fs.zfs.pool_destroy") { 371 /* This Pool has been destroyed. Discard the case */ 372 Close(); 373 374 return (/*consumed*/true); 375 } else if (event.Value("type") == "misc.fs.zfs.config_sync") { 376 RefreshVdevState(); 377 if (VdevState() < VDEV_STATE_HEALTHY) 378 consumed = ActivateSpare(); 379 } 380 381 382 if (event.Value("class") == "resource.fs.zfs.removed") { 383 bool spare_activated; 384 385 if (!RefreshVdevState()) { 386 /* 387 * The pool or vdev for this case file is no longer 388 * part of the configuration. This can happen 389 * if we process a device arrival notification 390 * before seeing the ZFS configuration change 391 * event. 392 */ 393 syslog(LOG_INFO, 394 "CaseFile::ReEvaluate(%s,%s) Pool/Vdev " 395 "unconfigured. Closing\n", 396 PoolGUIDString().c_str(), 397 VdevGUIDString().c_str()); 398 /* 399 * Close the case now so we won't waste cycles in the 400 * system rescan 401 */ 402 Close(); 403 404 /* 405 * Since this event was not used to close this 406 * case, do not report it as consumed. 407 */ 408 return (/*consumed*/false); 409 } 410 411 /* 412 * Discard any tentative I/O error events for 413 * this case. They were most likely caused by the 414 * hot-unplug of this device. 415 */ 416 PurgeTentativeEvents(); 417 418 /* Try to activate spares if they are available */ 419 spare_activated = ActivateSpare(); 420 421 /* 422 * Rescan the drives in the system to see if a recent 423 * drive arrival can be used to solve this case. 424 */ 425 ZfsDaemon::RequestSystemRescan(); 426 427 /* 428 * Consume the event if we successfully activated a spare. 429 * Otherwise, leave it in the unconsumed events list so that the 430 * future addition of a spare to this pool might be able to 431 * close the case 432 */ 433 consumed = spare_activated; 434 } else if (event.Value("class") == "resource.fs.zfs.statechange") { 435 RefreshVdevState(); 436 /* 437 * If this vdev is DEGRADED, FAULTED, or UNAVAIL, try to 438 * activate a hotspare. Otherwise, ignore the event 439 */ 440 if (VdevState() == VDEV_STATE_FAULTED || 441 VdevState() == VDEV_STATE_DEGRADED || 442 VdevState() == VDEV_STATE_CANT_OPEN) 443 (void) ActivateSpare(); 444 consumed = true; 445 } 446 else if (event.Value("class") == "ereport.fs.zfs.io" || 447 event.Value("class") == "ereport.fs.zfs.checksum") { 448 449 m_tentativeEvents.push_front(event.DeepCopy()); 450 RegisterCallout(event); 451 consumed = true; 452 } 453 454 bool closed(CloseIfSolved()); 455 456 return (consumed || closed); 457 } 458 459 /* Find a Vdev containing the vdev with the given GUID */ 460 static nvlist_t* 461 find_parent(nvlist_t *pool_config, nvlist_t *config, DevdCtl::Guid child_guid) 462 { 463 nvlist_t **vdevChildren; 464 int error; 465 unsigned ch, numChildren; 466 467 error = nvlist_lookup_nvlist_array(config, ZPOOL_CONFIG_CHILDREN, 468 &vdevChildren, &numChildren); 469 470 if (error != 0 || numChildren == 0) 471 return (NULL); 472 473 for (ch = 0; ch < numChildren; ch++) { 474 nvlist *result; 475 Vdev vdev(pool_config, vdevChildren[ch]); 476 477 if (vdev.GUID() == child_guid) 478 return (config); 479 480 result = find_parent(pool_config, vdevChildren[ch], child_guid); 481 if (result != NULL) 482 return (result); 483 } 484 485 return (NULL); 486 } 487 488 bool 489 CaseFile::ActivateSpare() { 490 nvlist_t *config, *nvroot, *parent_config; 491 nvlist_t **spares; 492 char *devPath, *vdev_type; 493 const char *poolname; 494 u_int nspares, i; 495 int error; 496 497 ZpoolList zpl(ZpoolList::ZpoolByGUID, &m_poolGUID); 498 zpool_handle_t *zhp(zpl.empty() ? NULL : zpl.front()); 499 if (zhp == NULL) { 500 syslog(LOG_ERR, "CaseFile::ActivateSpare: Could not find pool " 501 "for pool_guid %" PRIu64".", (uint64_t)m_poolGUID); 502 return (false); 503 } 504 poolname = zpool_get_name(zhp); 505 config = zpool_get_config(zhp, NULL); 506 if (config == NULL) { 507 syslog(LOG_ERR, "CaseFile::ActivateSpare: Could not find pool " 508 "config for pool %s", poolname); 509 return (false); 510 } 511 error = nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot); 512 if (error != 0){ 513 syslog(LOG_ERR, "CaseFile::ActivateSpare: Could not find vdev " 514 "tree for pool %s", poolname); 515 return (false); 516 } 517 518 parent_config = find_parent(config, nvroot, m_vdevGUID); 519 if (parent_config != NULL) { 520 char *parent_type; 521 522 /* 523 * Don't activate spares for members of a "replacing" vdev. 524 * They're already dealt with. Sparing them will just drag out 525 * the resilver process. 526 */ 527 error = nvlist_lookup_string(parent_config, 528 ZPOOL_CONFIG_TYPE, &parent_type); 529 if (error == 0 && strcmp(parent_type, VDEV_TYPE_REPLACING) == 0) 530 return (false); 531 } 532 533 nspares = 0; 534 nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, 535 &nspares); 536 if (nspares == 0) { 537 /* The pool has no spares configured */ 538 syslog(LOG_INFO, "CaseFile::ActivateSpare: " 539 "No spares available for pool %s", poolname); 540 return (false); 541 } 542 for (i = 0; i < nspares; i++) { 543 uint64_t *nvlist_array; 544 vdev_stat_t *vs; 545 uint_t nstats; 546 547 if (nvlist_lookup_uint64_array(spares[i], 548 ZPOOL_CONFIG_VDEV_STATS, &nvlist_array, &nstats) != 0) { 549 syslog(LOG_ERR, "CaseFile::ActivateSpare: Could not " 550 "find vdev stats for pool %s, spare %d", 551 poolname, i); 552 return (false); 553 } 554 vs = reinterpret_cast<vdev_stat_t *>(nvlist_array); 555 556 if ((vs->vs_aux != VDEV_AUX_SPARED) 557 && (vs->vs_state == VDEV_STATE_HEALTHY)) { 558 /* We found a usable spare */ 559 break; 560 } 561 } 562 563 if (i == nspares) { 564 /* No available spares were found */ 565 return (false); 566 } 567 568 error = nvlist_lookup_string(spares[i], ZPOOL_CONFIG_PATH, &devPath); 569 if (error != 0) { 570 syslog(LOG_ERR, "CaseFile::ActivateSpare: Cannot determine " 571 "the path of pool %s, spare %d. Error %d", 572 poolname, i, error); 573 return (false); 574 } 575 576 error = nvlist_lookup_string(spares[i], ZPOOL_CONFIG_TYPE, &vdev_type); 577 if (error != 0) { 578 syslog(LOG_ERR, "CaseFile::ActivateSpare: Cannot determine " 579 "the vdev type of pool %s, spare %d. Error %d", 580 poolname, i, error); 581 return (false); 582 } 583 584 return (Replace(vdev_type, devPath, /*isspare*/true)); 585 } 586 587 void 588 CaseFile::RegisterCallout(const Event &event) 589 { 590 timeval now, countdown, elapsed, timestamp, zero, remaining; 591 592 gettimeofday(&now, 0); 593 timestamp = event.GetTimestamp(); 594 timersub(&now, ×tamp, &elapsed); 595 timersub(&s_removeGracePeriod, &elapsed, &countdown); 596 /* 597 * If countdown is <= zero, Reset the timer to the 598 * smallest positive time value instead 599 */ 600 timerclear(&zero); 601 if (timercmp(&countdown, &zero, <=)) { 602 timerclear(&countdown); 603 countdown.tv_usec = 1; 604 } 605 606 remaining = m_tentativeTimer.TimeRemaining(); 607 608 if (!m_tentativeTimer.IsPending() 609 || timercmp(&countdown, &remaining, <)) 610 m_tentativeTimer.Reset(countdown, OnGracePeriodEnded, this); 611 } 612 613 614 bool 615 CaseFile::CloseIfSolved() 616 { 617 if (m_events.empty() 618 && m_tentativeEvents.empty()) { 619 620 /* 621 * We currently do not track or take actions on 622 * devices in the degraded or faulted state. 623 * Once we have support for spare pools, we'll 624 * retain these cases so that any spares added in 625 * the future can be applied to them. 626 */ 627 switch (VdevState()) { 628 case VDEV_STATE_HEALTHY: 629 /* No need to keep cases for healthy vdevs */ 630 Close(); 631 return (true); 632 case VDEV_STATE_REMOVED: 633 case VDEV_STATE_CANT_OPEN: 634 /* 635 * Keep open. We may solve it with a newly inserted 636 * device. 637 */ 638 case VDEV_STATE_FAULTED: 639 case VDEV_STATE_DEGRADED: 640 /* 641 * Keep open. We may solve it with the future 642 * addition of a spare to the pool 643 */ 644 case VDEV_STATE_UNKNOWN: 645 case VDEV_STATE_CLOSED: 646 case VDEV_STATE_OFFLINE: 647 /* 648 * Keep open? This may not be the correct behavior, 649 * but it's what we've always done 650 */ 651 ; 652 } 653 654 /* 655 * Re-serialize the case in order to remove any 656 * previous event data. 657 */ 658 Serialize(); 659 } 660 661 return (false); 662 } 663 664 void 665 CaseFile::Log() 666 { 667 syslog(LOG_INFO, "CaseFile(%s,%s,%s)\n", PoolGUIDString().c_str(), 668 VdevGUIDString().c_str(), PhysicalPath().c_str()); 669 syslog(LOG_INFO, "\tVdev State = %s\n", 670 zpool_state_to_name(VdevState(), VDEV_AUX_NONE)); 671 if (m_tentativeEvents.size() != 0) { 672 syslog(LOG_INFO, "\t=== Tentative Events ===\n"); 673 for (EventList::iterator event(m_tentativeEvents.begin()); 674 event != m_tentativeEvents.end(); event++) 675 (*event)->Log(LOG_INFO); 676 } 677 if (m_events.size() != 0) { 678 syslog(LOG_INFO, "\t=== Events ===\n"); 679 for (EventList::iterator event(m_events.begin()); 680 event != m_events.end(); event++) 681 (*event)->Log(LOG_INFO); 682 } 683 } 684 685 //- CaseFile Static Protected Methods ------------------------------------------ 686 void 687 CaseFile::OnGracePeriodEnded(void *arg) 688 { 689 CaseFile &casefile(*static_cast<CaseFile *>(arg)); 690 691 casefile.OnGracePeriodEnded(); 692 } 693 694 int 695 CaseFile::DeSerializeSelector(const struct dirent *dirEntry) 696 { 697 uint64_t poolGUID; 698 uint64_t vdevGUID; 699 700 if (dirEntry->d_type == DT_REG 701 && sscanf(dirEntry->d_name, "pool_%" PRIu64 "_vdev_%" PRIu64 ".case", 702 &poolGUID, &vdevGUID) == 2) 703 return (1); 704 return (0); 705 } 706 707 void 708 CaseFile::DeSerializeFile(const char *fileName) 709 { 710 string fullName(s_caseFilePath + '/' + fileName); 711 CaseFile *existingCaseFile(NULL); 712 CaseFile *caseFile(NULL); 713 714 try { 715 uint64_t poolGUID; 716 uint64_t vdevGUID; 717 nvlist_t *vdevConf; 718 719 if (sscanf(fileName, "pool_%" PRIu64 "_vdev_%" PRIu64 ".case", 720 &poolGUID, &vdevGUID) != 2) { 721 throw ZfsdException("CaseFile::DeSerialize: " 722 "Unintelligible CaseFile filename %s.\n", fileName); 723 } 724 existingCaseFile = Find(Guid(poolGUID), Guid(vdevGUID)); 725 if (existingCaseFile != NULL) { 726 /* 727 * If the vdev is already degraded or faulted, 728 * there's no point in keeping the state around 729 * that we use to put a drive into the degraded 730 * state. However, if the vdev is simply missing, 731 * preserve the case data in the hopes that it will 732 * return. 733 */ 734 caseFile = existingCaseFile; 735 vdev_state curState(caseFile->VdevState()); 736 if (curState > VDEV_STATE_CANT_OPEN 737 && curState < VDEV_STATE_HEALTHY) { 738 unlink(fileName); 739 return; 740 } 741 } else { 742 ZpoolList zpl(ZpoolList::ZpoolByGUID, &poolGUID); 743 if (zpl.empty() 744 || (vdevConf = VdevIterator(zpl.front()) 745 .Find(vdevGUID)) == NULL) { 746 /* 747 * Either the pool no longer exists 748 * or this vdev is no longer a member of 749 * the pool. 750 */ 751 unlink(fullName.c_str()); 752 return; 753 } 754 755 /* 756 * Any vdev we find that does not have a case file 757 * must be in the healthy state and thus worthy of 758 * continued SERD data tracking. 759 */ 760 caseFile = new CaseFile(Vdev(zpl.front(), vdevConf)); 761 } 762 763 ifstream caseStream(fullName.c_str()); 764 if (!caseStream) 765 throw ZfsdException("CaseFile::DeSerialize: Unable to " 766 "read %s.\n", fileName); 767 768 caseFile->DeSerialize(caseStream); 769 } catch (const ParseException &exp) { 770 771 exp.Log(); 772 if (caseFile != existingCaseFile) 773 delete caseFile; 774 775 /* 776 * Since we can't parse the file, unlink it so we don't 777 * trip over it again. 778 */ 779 unlink(fileName); 780 } catch (const ZfsdException &zfsException) { 781 782 zfsException.Log(); 783 if (caseFile != existingCaseFile) 784 delete caseFile; 785 } 786 } 787 788 //- CaseFile Protected Methods ------------------------------------------------- 789 CaseFile::CaseFile(const Vdev &vdev) 790 : m_poolGUID(vdev.PoolGUID()), 791 m_vdevGUID(vdev.GUID()), 792 m_vdevState(vdev.State()), 793 m_vdevPhysPath(vdev.PhysicalPath()) 794 { 795 stringstream guidString; 796 797 guidString << m_vdevGUID; 798 m_vdevGUIDString = guidString.str(); 799 guidString.str(""); 800 guidString << m_poolGUID; 801 m_poolGUIDString = guidString.str(); 802 803 s_activeCases.push_back(this); 804 805 syslog(LOG_INFO, "Creating new CaseFile:\n"); 806 Log(); 807 } 808 809 CaseFile::~CaseFile() 810 { 811 PurgeEvents(); 812 PurgeTentativeEvents(); 813 m_tentativeTimer.Stop(); 814 s_activeCases.remove(this); 815 } 816 817 void 818 CaseFile::PurgeEvents() 819 { 820 for (EventList::iterator event(m_events.begin()); 821 event != m_events.end(); event++) 822 delete *event; 823 824 m_events.clear(); 825 } 826 827 void 828 CaseFile::PurgeTentativeEvents() 829 { 830 for (EventList::iterator event(m_tentativeEvents.begin()); 831 event != m_tentativeEvents.end(); event++) 832 delete *event; 833 834 m_tentativeEvents.clear(); 835 } 836 837 void 838 CaseFile::SerializeEvList(const EventList events, int fd, 839 const char* prefix) const 840 { 841 if (events.empty()) 842 return; 843 for (EventList::const_iterator curEvent = events.begin(); 844 curEvent != events.end(); curEvent++) { 845 const string &eventString((*curEvent)->GetEventString()); 846 847 // TODO: replace many write(2) calls with a single writev(2) 848 if (prefix) 849 write(fd, prefix, strlen(prefix)); 850 write(fd, eventString.c_str(), eventString.length()); 851 } 852 } 853 854 void 855 CaseFile::Serialize() 856 { 857 stringstream saveFile; 858 859 saveFile << setfill('0') 860 << s_caseFilePath << "/" 861 << "pool_" << PoolGUIDString() 862 << "_vdev_" << VdevGUIDString() 863 << ".case"; 864 865 if (m_events.empty() && m_tentativeEvents.empty()) { 866 unlink(saveFile.str().c_str()); 867 return; 868 } 869 870 int fd(open(saveFile.str().c_str(), O_CREAT|O_TRUNC|O_WRONLY, 0644)); 871 if (fd == -1) { 872 syslog(LOG_ERR, "CaseFile::Serialize: Unable to open %s.\n", 873 saveFile.str().c_str()); 874 return; 875 } 876 SerializeEvList(m_events, fd); 877 SerializeEvList(m_tentativeEvents, fd, "tentative "); 878 close(fd); 879 } 880 881 /* 882 * XXX: This method assumes that events may not contain embedded newlines. If 883 * ever events can contain embedded newlines, then CaseFile must switch 884 * serialization formats 885 */ 886 void 887 CaseFile::DeSerialize(ifstream &caseStream) 888 { 889 string evString; 890 const EventFactory &factory(ZfsDaemon::Get().GetFactory()); 891 892 caseStream >> std::noskipws >> std::ws; 893 while (caseStream.good()) { 894 /* 895 * Outline: 896 * read the beginning of a line and check it for 897 * "tentative". If found, discard "tentative". 898 * Create a new event 899 * continue 900 */ 901 EventList* destEvents; 902 const string tentFlag("tentative "); 903 string line; 904 std::stringbuf lineBuf; 905 906 caseStream.get(lineBuf); 907 caseStream.ignore(); /*discard the newline character*/ 908 line = lineBuf.str(); 909 if (line.compare(0, tentFlag.size(), tentFlag) == 0) { 910 /* Discard "tentative" */ 911 line.erase(0, tentFlag.size()); 912 destEvents = &m_tentativeEvents; 913 } else { 914 destEvents = &m_events; 915 } 916 Event *event(Event::CreateEvent(factory, line)); 917 if (event != NULL) { 918 destEvents->push_back(event); 919 RegisterCallout(*event); 920 } 921 } 922 } 923 924 void 925 CaseFile::Close() 926 { 927 /* 928 * This case is no longer relevant. Clean up our 929 * serialization file, and delete the case. 930 */ 931 syslog(LOG_INFO, "CaseFile(%s,%s) closed - State %s\n", 932 PoolGUIDString().c_str(), VdevGUIDString().c_str(), 933 zpool_state_to_name(VdevState(), VDEV_AUX_NONE)); 934 935 /* 936 * Serialization of a Case with no event data, clears the 937 * Serialization data for that event. 938 */ 939 PurgeEvents(); 940 Serialize(); 941 942 delete this; 943 } 944 945 void 946 CaseFile::OnGracePeriodEnded() 947 { 948 bool should_fault, should_degrade; 949 ZpoolList zpl(ZpoolList::ZpoolByGUID, &m_poolGUID); 950 zpool_handle_t *zhp(zpl.empty() ? NULL : zpl.front()); 951 952 m_events.splice(m_events.begin(), m_tentativeEvents); 953 should_fault = ShouldFault(); 954 should_degrade = ShouldDegrade(); 955 956 if (should_fault || should_degrade) { 957 if (zhp == NULL 958 || (VdevIterator(zhp).Find(m_vdevGUID)) == NULL) { 959 /* 960 * Either the pool no longer exists 961 * or this vdev is no longer a member of 962 * the pool. 963 */ 964 Close(); 965 return; 966 } 967 968 } 969 970 /* A fault condition has priority over a degrade condition */ 971 if (ShouldFault()) { 972 /* Fault the vdev and close the case. */ 973 if (zpool_vdev_fault(zhp, (uint64_t)m_vdevGUID, 974 VDEV_AUX_ERR_EXCEEDED) == 0) { 975 syslog(LOG_INFO, "Faulting vdev(%s/%s)", 976 PoolGUIDString().c_str(), 977 VdevGUIDString().c_str()); 978 Close(); 979 return; 980 } 981 else { 982 syslog(LOG_ERR, "Fault vdev(%s/%s): %s: %s\n", 983 PoolGUIDString().c_str(), 984 VdevGUIDString().c_str(), 985 libzfs_error_action(g_zfsHandle), 986 libzfs_error_description(g_zfsHandle)); 987 } 988 } 989 else if (ShouldDegrade()) { 990 /* Degrade the vdev and close the case. */ 991 if (zpool_vdev_degrade(zhp, (uint64_t)m_vdevGUID, 992 VDEV_AUX_ERR_EXCEEDED) == 0) { 993 syslog(LOG_INFO, "Degrading vdev(%s/%s)", 994 PoolGUIDString().c_str(), 995 VdevGUIDString().c_str()); 996 Close(); 997 return; 998 } 999 else { 1000 syslog(LOG_ERR, "Degrade vdev(%s/%s): %s: %s\n", 1001 PoolGUIDString().c_str(), 1002 VdevGUIDString().c_str(), 1003 libzfs_error_action(g_zfsHandle), 1004 libzfs_error_description(g_zfsHandle)); 1005 } 1006 } 1007 Serialize(); 1008 } 1009 1010 Vdev 1011 CaseFile::BeingReplacedBy(zpool_handle_t *zhp) { 1012 Vdev vd(zhp, CaseVdev(zhp)); 1013 std::list<Vdev> children; 1014 std::list<Vdev>::iterator children_it; 1015 1016 Vdev parent(vd.Parent()); 1017 Vdev replacing(NonexistentVdev); 1018 1019 /* 1020 * To determine whether we are being replaced by another spare that 1021 * is still working, then make sure that it is currently spared and 1022 * that the spare is either resilvering or healthy. If any of these 1023 * conditions fail, then we are not being replaced by a spare. 1024 * 1025 * If the spare is healthy, then the case file should be closed very 1026 * soon after this check. 1027 */ 1028 if (parent.DoesNotExist() 1029 || parent.Name(zhp, /*verbose*/false) != "spare") 1030 return (NonexistentVdev); 1031 1032 children = parent.Children(); 1033 children_it = children.begin(); 1034 for (;children_it != children.end(); children_it++) { 1035 Vdev child = *children_it; 1036 1037 /* Skip our vdev. */ 1038 if (child.GUID() == VdevGUID()) 1039 continue; 1040 /* 1041 * Accept the first child that doesn't match our GUID, or 1042 * any resilvering/healthy device if one exists. 1043 */ 1044 if (replacing.DoesNotExist() || child.IsResilvering() 1045 || child.State() == VDEV_STATE_HEALTHY) 1046 replacing = child; 1047 } 1048 1049 return (replacing); 1050 } 1051 1052 bool 1053 CaseFile::Replace(const char* vdev_type, const char* path, bool isspare) { 1054 nvlist_t *nvroot, *newvd; 1055 const char *poolname; 1056 string oldstr(VdevGUIDString()); 1057 bool retval = true; 1058 1059 /* Figure out what pool we're working on */ 1060 ZpoolList zpl(ZpoolList::ZpoolByGUID, &m_poolGUID); 1061 zpool_handle_t *zhp(zpl.empty() ? NULL : zpl.front()); 1062 if (zhp == NULL) { 1063 syslog(LOG_ERR, "CaseFile::Replace: could not find pool for " 1064 "pool_guid %" PRIu64 ".", (uint64_t)m_poolGUID); 1065 return (false); 1066 } 1067 poolname = zpool_get_name(zhp); 1068 Vdev vd(zhp, CaseVdev(zhp)); 1069 Vdev replaced(BeingReplacedBy(zhp)); 1070 1071 if (isspare && !vd.IsSpare() && !replaced.DoesNotExist()) { 1072 /* If we are already being replaced by a working spare, pass. */ 1073 if (replaced.IsResilvering() 1074 || replaced.State() == VDEV_STATE_HEALTHY) { 1075 syslog(LOG_INFO, "CaseFile::Replace(%s->%s): already " 1076 "replaced", VdevGUIDString().c_str(), path); 1077 return (/*consumed*/false); 1078 } 1079 /* 1080 * If we have already been replaced by a spare, but that spare 1081 * is broken, we must spare the spare, not the original device. 1082 */ 1083 oldstr = replaced.GUIDString(); 1084 syslog(LOG_INFO, "CaseFile::Replace(%s->%s): sparing " 1085 "broken spare %s instead", VdevGUIDString().c_str(), 1086 path, oldstr.c_str()); 1087 } 1088 1089 /* 1090 * Build a root vdev/leaf vdev configuration suitable for 1091 * zpool_vdev_attach. Only enough data for the kernel to find 1092 * the device (i.e. type and disk device node path) are needed. 1093 */ 1094 nvroot = NULL; 1095 newvd = NULL; 1096 1097 if (nvlist_alloc(&nvroot, NV_UNIQUE_NAME, 0) != 0 1098 || nvlist_alloc(&newvd, NV_UNIQUE_NAME, 0) != 0) { 1099 syslog(LOG_ERR, "Replace vdev(%s/%s): Unable to allocate " 1100 "configuration data.", poolname, oldstr.c_str()); 1101 if (nvroot != NULL) 1102 nvlist_free(nvroot); 1103 return (false); 1104 } 1105 if (nvlist_add_string(newvd, ZPOOL_CONFIG_TYPE, vdev_type) != 0 1106 || nvlist_add_string(newvd, ZPOOL_CONFIG_PATH, path) != 0 1107 || nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT) != 0 1108 || nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, 1109 &newvd, 1) != 0) { 1110 syslog(LOG_ERR, "Replace vdev(%s/%s): Unable to initialize " 1111 "configuration data.", poolname, oldstr.c_str()); 1112 nvlist_free(newvd); 1113 nvlist_free(nvroot); 1114 return (true); 1115 } 1116 1117 /* Data was copied when added to the root vdev. */ 1118 nvlist_free(newvd); 1119 1120 retval = (zpool_vdev_attach(zhp, oldstr.c_str(), path, nvroot, 1121 /*replace*/B_TRUE) == 0); 1122 if (retval) 1123 syslog(LOG_INFO, "Replacing vdev(%s/%s) with %s\n", 1124 poolname, oldstr.c_str(), path); 1125 else 1126 syslog(LOG_ERR, "Replace vdev(%s/%s): %s: %s\n", 1127 poolname, oldstr.c_str(), libzfs_error_action(g_zfsHandle), 1128 libzfs_error_description(g_zfsHandle)); 1129 nvlist_free(nvroot); 1130 1131 return (retval); 1132 } 1133 1134 /* Does the argument event refer to a checksum error? */ 1135 static bool 1136 IsChecksumEvent(const Event* const event) 1137 { 1138 return ("ereport.fs.zfs.checksum" == event->Value("type")); 1139 } 1140 1141 /* Does the argument event refer to an IO error? */ 1142 static bool 1143 IsIOEvent(const Event* const event) 1144 { 1145 return ("ereport.fs.zfs.io" == event->Value("type")); 1146 } 1147 1148 bool 1149 CaseFile::ShouldDegrade() const 1150 { 1151 return (std::count_if(m_events.begin(), m_events.end(), 1152 IsChecksumEvent) > ZFS_DEGRADE_IO_COUNT); 1153 } 1154 1155 bool 1156 CaseFile::ShouldFault() const 1157 { 1158 return (std::count_if(m_events.begin(), m_events.end(), 1159 IsIOEvent) > ZFS_DEGRADE_IO_COUNT); 1160 } 1161 1162 nvlist_t * 1163 CaseFile::CaseVdev(zpool_handle_t *zhp) const 1164 { 1165 return (VdevIterator(zhp).Find(VdevGUID())); 1166 } 1167