1 /*- 2 * Copyright (c) 2011, 2012, 2013, 2014, 2016 Spectra Logic Corporation 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions, and the following disclaimer, 10 * without modification. 11 * 2. Redistributions in binary form must reproduce at minimum a disclaimer 12 * substantially similar to the "NO WARRANTY" disclaimer below 13 * ("Disclaimer") and any redistribution must be conditioned upon 14 * including a substantially similar Disclaimer requirement for further 15 * binary redistribution. 16 * 17 * NO WARRANTY 18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR 21 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 22 * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 24 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 25 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 26 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING 27 * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 28 * POSSIBILITY OF SUCH DAMAGES. 29 * 30 * Authors: Justin T. Gibbs (Spectra Logic Corporation) 31 */ 32 33 /** 34 * \file case_file.cc 35 * 36 * We keep case files for any leaf vdev that is not in the optimal state. 37 * However, we only serialize to disk those events that need to be preserved 38 * across reboots. For now, this is just a log of soft errors which we 39 * accumulate in order to mark a device as degraded. 40 */ 41 #include <sys/cdefs.h> 42 #include <sys/byteorder.h> 43 #include <sys/time.h> 44 45 #include <sys/fs/zfs.h> 46 47 #include <dirent.h> 48 #include <fcntl.h> 49 #include <iomanip> 50 #include <fstream> 51 #include <functional> 52 #include <sstream> 53 #include <syslog.h> 54 #include <unistd.h> 55 56 #include <libzfs.h> 57 58 #include <list> 59 #include <map> 60 #include <string> 61 62 #include <devdctl/guid.h> 63 #include <devdctl/event.h> 64 #include <devdctl/event_factory.h> 65 #include <devdctl/exception.h> 66 #include <devdctl/consumer.h> 67 68 #include "callout.h" 69 #include "vdev_iterator.h" 70 #include "zfsd_event.h" 71 #include "case_file.h" 72 #include "vdev.h" 73 #include "zfsd.h" 74 #include "zfsd_exception.h" 75 #include "zpool_list.h" 76 77 __FBSDID("$FreeBSD$"); 78 79 /*============================ Namespace Control =============================*/ 80 using std::hex; 81 using std::ifstream; 82 using std::stringstream; 83 using std::setfill; 84 using std::setw; 85 86 using DevdCtl::Event; 87 using DevdCtl::EventFactory; 88 using DevdCtl::EventList; 89 using DevdCtl::Guid; 90 using DevdCtl::ParseException; 91 92 /*--------------------------------- CaseFile ---------------------------------*/ 93 //- CaseFile Static Data ------------------------------------------------------- 94 95 CaseFileList CaseFile::s_activeCases; 96 const string CaseFile::s_caseFilePath = "/var/db/zfsd/cases"; 97 const timeval CaseFile::s_removeGracePeriod = { 60 /*sec*/, 0 /*usec*/}; 98 99 //- CaseFile Static Public Methods --------------------------------------------- 100 CaseFile * 101 CaseFile::Find(Guid poolGUID, Guid vdevGUID) 102 { 103 for (CaseFileList::iterator curCase = s_activeCases.begin(); 104 curCase != s_activeCases.end(); curCase++) { 105 106 if (((*curCase)->PoolGUID() != poolGUID 107 && Guid::InvalidGuid() != poolGUID) 108 || (*curCase)->VdevGUID() != vdevGUID) 109 continue; 110 111 /* 112 * We only carry one active case per-vdev. 113 */ 114 return (*curCase); 115 } 116 return (NULL); 117 } 118 119 CaseFile * 120 CaseFile::Find(const string &physPath) 121 { 122 CaseFile *result = NULL; 123 124 for (CaseFileList::iterator curCase = s_activeCases.begin(); 125 curCase != s_activeCases.end(); curCase++) { 126 127 if ((*curCase)->PhysicalPath() != physPath) 128 continue; 129 130 if (result != NULL) { 131 syslog(LOG_WARNING, "Multiple casefiles found for " 132 "physical path %s. " 133 "This is most likely a bug in zfsd", 134 physPath.c_str()); 135 } 136 result = *curCase; 137 } 138 return (result); 139 } 140 141 142 void 143 CaseFile::ReEvaluateByGuid(Guid poolGUID, const ZfsEvent &event) 144 { 145 CaseFileList::iterator casefile; 146 for (casefile = s_activeCases.begin(); casefile != s_activeCases.end();){ 147 CaseFileList::iterator next = casefile; 148 next++; 149 if (poolGUID == (*casefile)->PoolGUID()) 150 (*casefile)->ReEvaluate(event); 151 casefile = next; 152 } 153 } 154 155 CaseFile & 156 CaseFile::Create(Vdev &vdev) 157 { 158 CaseFile *activeCase; 159 160 activeCase = Find(vdev.PoolGUID(), vdev.GUID()); 161 if (activeCase == NULL) 162 activeCase = new CaseFile(vdev); 163 164 return (*activeCase); 165 } 166 167 void 168 CaseFile::DeSerialize() 169 { 170 struct dirent **caseFiles; 171 172 int numCaseFiles(scandir(s_caseFilePath.c_str(), &caseFiles, 173 DeSerializeSelector, /*compar*/NULL)); 174 175 if (numCaseFiles == -1) 176 return; 177 if (numCaseFiles == 0) { 178 free(caseFiles); 179 return; 180 } 181 182 for (int i = 0; i < numCaseFiles; i++) { 183 184 DeSerializeFile(caseFiles[i]->d_name); 185 free(caseFiles[i]); 186 } 187 free(caseFiles); 188 } 189 190 bool 191 CaseFile::Empty() 192 { 193 return (s_activeCases.empty()); 194 } 195 196 void 197 CaseFile::LogAll() 198 { 199 for (CaseFileList::iterator curCase = s_activeCases.begin(); 200 curCase != s_activeCases.end(); curCase++) 201 (*curCase)->Log(); 202 } 203 204 void 205 CaseFile::PurgeAll() 206 { 207 /* 208 * Serialize casefiles before deleting them so that they can be reread 209 * and revalidated during BuildCaseFiles. 210 * CaseFiles remove themselves from this list on destruction. 211 */ 212 while (s_activeCases.size() != 0) { 213 CaseFile *casefile = s_activeCases.front(); 214 casefile->Serialize(); 215 delete casefile; 216 } 217 218 } 219 220 //- CaseFile Public Methods ---------------------------------------------------- 221 bool 222 CaseFile::RefreshVdevState() 223 { 224 ZpoolList zpl(ZpoolList::ZpoolByGUID, &m_poolGUID); 225 zpool_handle_t *casePool(zpl.empty() ? NULL : zpl.front()); 226 if (casePool == NULL) 227 return (false); 228 229 Vdev vd(casePool, CaseVdev(casePool)); 230 if (vd.DoesNotExist()) 231 return (false); 232 233 m_vdevState = vd.State(); 234 m_vdevPhysPath = vd.PhysicalPath(); 235 return (true); 236 } 237 238 bool 239 CaseFile::ReEvaluate(const string &devPath, const string &physPath, Vdev *vdev) 240 { 241 ZpoolList zpl(ZpoolList::ZpoolByGUID, &m_poolGUID); 242 zpool_handle_t *pool(zpl.empty() ? NULL : zpl.front()); 243 244 if (pool == NULL || !RefreshVdevState()) { 245 /* 246 * The pool or vdev for this case file is no longer 247 * part of the configuration. This can happen 248 * if we process a device arrival notification 249 * before seeing the ZFS configuration change 250 * event. 251 */ 252 syslog(LOG_INFO, 253 "CaseFile::ReEvaluate(%s,%s) Pool/Vdev unconfigured. " 254 "Closing\n", 255 PoolGUIDString().c_str(), 256 VdevGUIDString().c_str()); 257 Close(); 258 259 /* 260 * Since this event was not used to close this 261 * case, do not report it as consumed. 262 */ 263 return (/*consumed*/false); 264 } 265 266 if (VdevState() > VDEV_STATE_CANT_OPEN) { 267 /* 268 * For now, newly discovered devices only help for 269 * devices that are missing. In the future, we might 270 * use a newly inserted spare to replace a degraded 271 * or faulted device. 272 */ 273 syslog(LOG_INFO, "CaseFile::ReEvaluate(%s,%s): Pool/Vdev ignored", 274 PoolGUIDString().c_str(), VdevGUIDString().c_str()); 275 return (/*consumed*/false); 276 } 277 278 if (vdev != NULL 279 && ( vdev->PoolGUID() == m_poolGUID 280 || vdev->PoolGUID() == Guid::InvalidGuid()) 281 && vdev->GUID() == m_vdevGUID) { 282 283 if (zpool_vdev_online(pool, vdev->GUIDString().c_str(), 284 ZFS_ONLINE_CHECKREMOVE | ZFS_ONLINE_UNSPARE, 285 &m_vdevState) != 0) { 286 syslog(LOG_ERR, 287 "Failed to online vdev(%s/%s:%s): %s: %s\n", 288 zpool_get_name(pool), vdev->GUIDString().c_str(), 289 devPath.c_str(), libzfs_error_action(g_zfsHandle), 290 libzfs_error_description(g_zfsHandle)); 291 return (/*consumed*/false); 292 } 293 294 syslog(LOG_INFO, "Onlined vdev(%s/%s:%s). State now %s.\n", 295 zpool_get_name(pool), vdev->GUIDString().c_str(), 296 devPath.c_str(), 297 zpool_state_to_name(VdevState(), VDEV_AUX_NONE)); 298 299 /* 300 * Check the vdev state post the online action to see 301 * if we can retire this case. 302 */ 303 CloseIfSolved(); 304 305 return (/*consumed*/true); 306 } 307 308 /* 309 * If the auto-replace policy is enabled, and we have physical 310 * path information, try a physical path replacement. 311 */ 312 if (zpool_get_prop_int(pool, ZPOOL_PROP_AUTOREPLACE, NULL) == 0) { 313 syslog(LOG_INFO, 314 "CaseFile(%s:%s:%s): AutoReplace not set. " 315 "Ignoring device insertion.\n", 316 PoolGUIDString().c_str(), 317 VdevGUIDString().c_str(), 318 zpool_state_to_name(VdevState(), VDEV_AUX_NONE)); 319 return (/*consumed*/false); 320 } 321 322 if (PhysicalPath().empty()) { 323 syslog(LOG_INFO, 324 "CaseFile(%s:%s:%s): No physical path information. " 325 "Ignoring device insertion.\n", 326 PoolGUIDString().c_str(), 327 VdevGUIDString().c_str(), 328 zpool_state_to_name(VdevState(), VDEV_AUX_NONE)); 329 return (/*consumed*/false); 330 } 331 332 if (physPath != PhysicalPath()) { 333 syslog(LOG_INFO, 334 "CaseFile(%s:%s:%s): Physical path mismatch. " 335 "Ignoring device insertion.\n", 336 PoolGUIDString().c_str(), 337 VdevGUIDString().c_str(), 338 zpool_state_to_name(VdevState(), VDEV_AUX_NONE)); 339 return (/*consumed*/false); 340 } 341 342 /* Write a label on the newly inserted disk. */ 343 if (zpool_label_disk(g_zfsHandle, pool, devPath.c_str()) != 0) { 344 syslog(LOG_ERR, 345 "Replace vdev(%s/%s) by physical path (label): %s: %s\n", 346 zpool_get_name(pool), VdevGUIDString().c_str(), 347 libzfs_error_action(g_zfsHandle), 348 libzfs_error_description(g_zfsHandle)); 349 return (/*consumed*/false); 350 } 351 352 syslog(LOG_INFO, "CaseFile::ReEvaluate(%s/%s): Replacing with %s", 353 PoolGUIDString().c_str(), VdevGUIDString().c_str(), 354 devPath.c_str()); 355 return (Replace(VDEV_TYPE_DISK, devPath.c_str(), /*isspare*/false)); 356 } 357 358 bool 359 CaseFile::ReEvaluate(const ZfsEvent &event) 360 { 361 bool consumed(false); 362 363 if (event.Value("type") == "misc.fs.zfs.vdev_remove") { 364 /* 365 * The Vdev we represent has been removed from the 366 * configuration. This case is no longer of value. 367 */ 368 Close(); 369 370 return (/*consumed*/true); 371 } else if (event.Value("type") == "misc.fs.zfs.pool_destroy") { 372 /* This Pool has been destroyed. Discard the case */ 373 Close(); 374 375 return (/*consumed*/true); 376 } else if (event.Value("type") == "misc.fs.zfs.config_sync") { 377 RefreshVdevState(); 378 if (VdevState() < VDEV_STATE_HEALTHY) 379 consumed = ActivateSpare(); 380 } 381 382 383 if (event.Value("class") == "resource.fs.zfs.removed") { 384 bool spare_activated; 385 386 if (!RefreshVdevState()) { 387 /* 388 * The pool or vdev for this case file is no longer 389 * part of the configuration. This can happen 390 * if we process a device arrival notification 391 * before seeing the ZFS configuration change 392 * event. 393 */ 394 syslog(LOG_INFO, 395 "CaseFile::ReEvaluate(%s,%s) Pool/Vdev " 396 "unconfigured. Closing\n", 397 PoolGUIDString().c_str(), 398 VdevGUIDString().c_str()); 399 /* 400 * Close the case now so we won't waste cycles in the 401 * system rescan 402 */ 403 Close(); 404 405 /* 406 * Since this event was not used to close this 407 * case, do not report it as consumed. 408 */ 409 return (/*consumed*/false); 410 } 411 412 /* 413 * Discard any tentative I/O error events for 414 * this case. They were most likely caused by the 415 * hot-unplug of this device. 416 */ 417 PurgeTentativeEvents(); 418 419 /* Try to activate spares if they are available */ 420 spare_activated = ActivateSpare(); 421 422 /* 423 * Rescan the drives in the system to see if a recent 424 * drive arrival can be used to solve this case. 425 */ 426 ZfsDaemon::RequestSystemRescan(); 427 428 /* 429 * Consume the event if we successfully activated a spare. 430 * Otherwise, leave it in the unconsumed events list so that the 431 * future addition of a spare to this pool might be able to 432 * close the case 433 */ 434 consumed = spare_activated; 435 } else if (event.Value("class") == "resource.fs.zfs.statechange") { 436 RefreshVdevState(); 437 /* 438 * If this vdev is DEGRADED, FAULTED, or UNAVAIL, try to 439 * activate a hotspare. Otherwise, ignore the event 440 */ 441 if (VdevState() == VDEV_STATE_FAULTED || 442 VdevState() == VDEV_STATE_DEGRADED || 443 VdevState() == VDEV_STATE_CANT_OPEN) 444 (void) ActivateSpare(); 445 consumed = true; 446 } 447 else if (event.Value("class") == "ereport.fs.zfs.io" || 448 event.Value("class") == "ereport.fs.zfs.checksum") { 449 450 m_tentativeEvents.push_front(event.DeepCopy()); 451 RegisterCallout(event); 452 consumed = true; 453 } 454 455 bool closed(CloseIfSolved()); 456 457 return (consumed || closed); 458 } 459 460 /* Find a Vdev containing the vdev with the given GUID */ 461 static nvlist_t* 462 find_parent(nvlist_t *pool_config, nvlist_t *config, DevdCtl::Guid child_guid) 463 { 464 nvlist_t **vdevChildren; 465 int error; 466 unsigned ch, numChildren; 467 468 error = nvlist_lookup_nvlist_array(config, ZPOOL_CONFIG_CHILDREN, 469 &vdevChildren, &numChildren); 470 471 if (error != 0 || numChildren == 0) 472 return (NULL); 473 474 for (ch = 0; ch < numChildren; ch++) { 475 nvlist *result; 476 Vdev vdev(pool_config, vdevChildren[ch]); 477 478 if (vdev.GUID() == child_guid) 479 return (config); 480 481 result = find_parent(pool_config, vdevChildren[ch], child_guid); 482 if (result != NULL) 483 return (result); 484 } 485 486 return (NULL); 487 } 488 489 bool 490 CaseFile::ActivateSpare() { 491 nvlist_t *config, *nvroot, *parent_config; 492 nvlist_t **spares; 493 char *devPath, *vdev_type; 494 const char *poolname; 495 u_int nspares, i; 496 int error; 497 498 ZpoolList zpl(ZpoolList::ZpoolByGUID, &m_poolGUID); 499 zpool_handle_t *zhp(zpl.empty() ? NULL : zpl.front()); 500 if (zhp == NULL) { 501 syslog(LOG_ERR, "CaseFile::ActivateSpare: Could not find pool " 502 "for pool_guid %" PRIu64".", (uint64_t)m_poolGUID); 503 return (false); 504 } 505 poolname = zpool_get_name(zhp); 506 config = zpool_get_config(zhp, NULL); 507 if (config == NULL) { 508 syslog(LOG_ERR, "CaseFile::ActivateSpare: Could not find pool " 509 "config for pool %s", poolname); 510 return (false); 511 } 512 error = nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot); 513 if (error != 0){ 514 syslog(LOG_ERR, "CaseFile::ActivateSpare: Could not find vdev " 515 "tree for pool %s", poolname); 516 return (false); 517 } 518 519 parent_config = find_parent(config, nvroot, m_vdevGUID); 520 if (parent_config != NULL) { 521 char *parent_type; 522 523 /* 524 * Don't activate spares for members of a "replacing" vdev. 525 * They're already dealt with. Sparing them will just drag out 526 * the resilver process. 527 */ 528 error = nvlist_lookup_string(parent_config, 529 ZPOOL_CONFIG_TYPE, &parent_type); 530 if (error == 0 && strcmp(parent_type, VDEV_TYPE_REPLACING) == 0) 531 return (false); 532 } 533 534 nspares = 0; 535 nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, 536 &nspares); 537 if (nspares == 0) { 538 /* The pool has no spares configured */ 539 syslog(LOG_INFO, "CaseFile::ActivateSpare: " 540 "No spares available for pool %s", poolname); 541 return (false); 542 } 543 for (i = 0; i < nspares; i++) { 544 uint64_t *nvlist_array; 545 vdev_stat_t *vs; 546 uint_t nstats; 547 548 if (nvlist_lookup_uint64_array(spares[i], 549 ZPOOL_CONFIG_VDEV_STATS, &nvlist_array, &nstats) != 0) { 550 syslog(LOG_ERR, "CaseFile::ActivateSpare: Could not " 551 "find vdev stats for pool %s, spare %d", 552 poolname, i); 553 return (false); 554 } 555 vs = reinterpret_cast<vdev_stat_t *>(nvlist_array); 556 557 if ((vs->vs_aux != VDEV_AUX_SPARED) 558 && (vs->vs_state == VDEV_STATE_HEALTHY)) { 559 /* We found a usable spare */ 560 break; 561 } 562 } 563 564 if (i == nspares) { 565 /* No available spares were found */ 566 return (false); 567 } 568 569 error = nvlist_lookup_string(spares[i], ZPOOL_CONFIG_PATH, &devPath); 570 if (error != 0) { 571 syslog(LOG_ERR, "CaseFile::ActivateSpare: Cannot determine " 572 "the path of pool %s, spare %d. Error %d", 573 poolname, i, error); 574 return (false); 575 } 576 577 error = nvlist_lookup_string(spares[i], ZPOOL_CONFIG_TYPE, &vdev_type); 578 if (error != 0) { 579 syslog(LOG_ERR, "CaseFile::ActivateSpare: Cannot determine " 580 "the vdev type of pool %s, spare %d. Error %d", 581 poolname, i, error); 582 return (false); 583 } 584 585 return (Replace(vdev_type, devPath, /*isspare*/true)); 586 } 587 588 void 589 CaseFile::RegisterCallout(const Event &event) 590 { 591 timeval now, countdown, elapsed, timestamp, zero, remaining; 592 593 gettimeofday(&now, 0); 594 timestamp = event.GetTimestamp(); 595 timersub(&now, ×tamp, &elapsed); 596 timersub(&s_removeGracePeriod, &elapsed, &countdown); 597 /* 598 * If countdown is <= zero, Reset the timer to the 599 * smallest positive time value instead 600 */ 601 timerclear(&zero); 602 if (timercmp(&countdown, &zero, <=)) { 603 timerclear(&countdown); 604 countdown.tv_usec = 1; 605 } 606 607 remaining = m_tentativeTimer.TimeRemaining(); 608 609 if (!m_tentativeTimer.IsPending() 610 || timercmp(&countdown, &remaining, <)) 611 m_tentativeTimer.Reset(countdown, OnGracePeriodEnded, this); 612 } 613 614 615 bool 616 CaseFile::CloseIfSolved() 617 { 618 if (m_events.empty() 619 && m_tentativeEvents.empty()) { 620 621 /* 622 * We currently do not track or take actions on 623 * devices in the degraded or faulted state. 624 * Once we have support for spare pools, we'll 625 * retain these cases so that any spares added in 626 * the future can be applied to them. 627 */ 628 switch (VdevState()) { 629 case VDEV_STATE_HEALTHY: 630 /* No need to keep cases for healthy vdevs */ 631 Close(); 632 return (true); 633 case VDEV_STATE_REMOVED: 634 case VDEV_STATE_CANT_OPEN: 635 /* 636 * Keep open. We may solve it with a newly inserted 637 * device. 638 */ 639 case VDEV_STATE_FAULTED: 640 case VDEV_STATE_DEGRADED: 641 /* 642 * Keep open. We may solve it with the future 643 * addition of a spare to the pool 644 */ 645 case VDEV_STATE_UNKNOWN: 646 case VDEV_STATE_CLOSED: 647 case VDEV_STATE_OFFLINE: 648 /* 649 * Keep open? This may not be the correct behavior, 650 * but it's what we've always done 651 */ 652 ; 653 } 654 655 /* 656 * Re-serialize the case in order to remove any 657 * previous event data. 658 */ 659 Serialize(); 660 } 661 662 return (false); 663 } 664 665 void 666 CaseFile::Log() 667 { 668 syslog(LOG_INFO, "CaseFile(%s,%s,%s)\n", PoolGUIDString().c_str(), 669 VdevGUIDString().c_str(), PhysicalPath().c_str()); 670 syslog(LOG_INFO, "\tVdev State = %s\n", 671 zpool_state_to_name(VdevState(), VDEV_AUX_NONE)); 672 if (m_tentativeEvents.size() != 0) { 673 syslog(LOG_INFO, "\t=== Tentative Events ===\n"); 674 for (EventList::iterator event(m_tentativeEvents.begin()); 675 event != m_tentativeEvents.end(); event++) 676 (*event)->Log(LOG_INFO); 677 } 678 if (m_events.size() != 0) { 679 syslog(LOG_INFO, "\t=== Events ===\n"); 680 for (EventList::iterator event(m_events.begin()); 681 event != m_events.end(); event++) 682 (*event)->Log(LOG_INFO); 683 } 684 } 685 686 //- CaseFile Static Protected Methods ------------------------------------------ 687 void 688 CaseFile::OnGracePeriodEnded(void *arg) 689 { 690 CaseFile &casefile(*static_cast<CaseFile *>(arg)); 691 692 casefile.OnGracePeriodEnded(); 693 } 694 695 int 696 CaseFile::DeSerializeSelector(const struct dirent *dirEntry) 697 { 698 uint64_t poolGUID; 699 uint64_t vdevGUID; 700 701 if (dirEntry->d_type == DT_REG 702 && sscanf(dirEntry->d_name, "pool_%" PRIu64 "_vdev_%" PRIu64 ".case", 703 &poolGUID, &vdevGUID) == 2) 704 return (1); 705 return (0); 706 } 707 708 void 709 CaseFile::DeSerializeFile(const char *fileName) 710 { 711 string fullName(s_caseFilePath + '/' + fileName); 712 CaseFile *existingCaseFile(NULL); 713 CaseFile *caseFile(NULL); 714 715 try { 716 uint64_t poolGUID; 717 uint64_t vdevGUID; 718 nvlist_t *vdevConf; 719 720 if (sscanf(fileName, "pool_%" PRIu64 "_vdev_%" PRIu64 ".case", 721 &poolGUID, &vdevGUID) != 2) { 722 throw ZfsdException("CaseFile::DeSerialize: " 723 "Unintelligible CaseFile filename %s.\n", fileName); 724 } 725 existingCaseFile = Find(Guid(poolGUID), Guid(vdevGUID)); 726 if (existingCaseFile != NULL) { 727 /* 728 * If the vdev is already degraded or faulted, 729 * there's no point in keeping the state around 730 * that we use to put a drive into the degraded 731 * state. However, if the vdev is simply missing, 732 * preserve the case data in the hopes that it will 733 * return. 734 */ 735 caseFile = existingCaseFile; 736 vdev_state curState(caseFile->VdevState()); 737 if (curState > VDEV_STATE_CANT_OPEN 738 && curState < VDEV_STATE_HEALTHY) { 739 unlink(fileName); 740 return; 741 } 742 } else { 743 ZpoolList zpl(ZpoolList::ZpoolByGUID, &poolGUID); 744 if (zpl.empty() 745 || (vdevConf = VdevIterator(zpl.front()) 746 .Find(vdevGUID)) == NULL) { 747 /* 748 * Either the pool no longer exists 749 * or this vdev is no longer a member of 750 * the pool. 751 */ 752 unlink(fullName.c_str()); 753 return; 754 } 755 756 /* 757 * Any vdev we find that does not have a case file 758 * must be in the healthy state and thus worthy of 759 * continued SERD data tracking. 760 */ 761 caseFile = new CaseFile(Vdev(zpl.front(), vdevConf)); 762 } 763 764 ifstream caseStream(fullName.c_str()); 765 if (!caseStream) 766 throw ZfsdException("CaseFile::DeSerialize: Unable to " 767 "read %s.\n", fileName); 768 769 caseFile->DeSerialize(caseStream); 770 } catch (const ParseException &exp) { 771 772 exp.Log(); 773 if (caseFile != existingCaseFile) 774 delete caseFile; 775 776 /* 777 * Since we can't parse the file, unlink it so we don't 778 * trip over it again. 779 */ 780 unlink(fileName); 781 } catch (const ZfsdException &zfsException) { 782 783 zfsException.Log(); 784 if (caseFile != existingCaseFile) 785 delete caseFile; 786 } 787 } 788 789 //- CaseFile Protected Methods ------------------------------------------------- 790 CaseFile::CaseFile(const Vdev &vdev) 791 : m_poolGUID(vdev.PoolGUID()), 792 m_vdevGUID(vdev.GUID()), 793 m_vdevState(vdev.State()), 794 m_vdevPhysPath(vdev.PhysicalPath()) 795 { 796 stringstream guidString; 797 798 guidString << m_vdevGUID; 799 m_vdevGUIDString = guidString.str(); 800 guidString.str(""); 801 guidString << m_poolGUID; 802 m_poolGUIDString = guidString.str(); 803 804 s_activeCases.push_back(this); 805 806 syslog(LOG_INFO, "Creating new CaseFile:\n"); 807 Log(); 808 } 809 810 CaseFile::~CaseFile() 811 { 812 PurgeEvents(); 813 PurgeTentativeEvents(); 814 m_tentativeTimer.Stop(); 815 s_activeCases.remove(this); 816 } 817 818 void 819 CaseFile::PurgeEvents() 820 { 821 for (EventList::iterator event(m_events.begin()); 822 event != m_events.end(); event++) 823 delete *event; 824 825 m_events.clear(); 826 } 827 828 void 829 CaseFile::PurgeTentativeEvents() 830 { 831 for (EventList::iterator event(m_tentativeEvents.begin()); 832 event != m_tentativeEvents.end(); event++) 833 delete *event; 834 835 m_tentativeEvents.clear(); 836 } 837 838 void 839 CaseFile::SerializeEvList(const EventList events, int fd, 840 const char* prefix) const 841 { 842 if (events.empty()) 843 return; 844 for (EventList::const_iterator curEvent = events.begin(); 845 curEvent != events.end(); curEvent++) { 846 const string &eventString((*curEvent)->GetEventString()); 847 848 // TODO: replace many write(2) calls with a single writev(2) 849 if (prefix) 850 write(fd, prefix, strlen(prefix)); 851 write(fd, eventString.c_str(), eventString.length()); 852 } 853 } 854 855 void 856 CaseFile::Serialize() 857 { 858 stringstream saveFile; 859 860 saveFile << setfill('0') 861 << s_caseFilePath << "/" 862 << "pool_" << PoolGUIDString() 863 << "_vdev_" << VdevGUIDString() 864 << ".case"; 865 866 if (m_events.empty() && m_tentativeEvents.empty()) { 867 unlink(saveFile.str().c_str()); 868 return; 869 } 870 871 int fd(open(saveFile.str().c_str(), O_CREAT|O_TRUNC|O_WRONLY, 0644)); 872 if (fd == -1) { 873 syslog(LOG_ERR, "CaseFile::Serialize: Unable to open %s.\n", 874 saveFile.str().c_str()); 875 return; 876 } 877 SerializeEvList(m_events, fd); 878 SerializeEvList(m_tentativeEvents, fd, "tentative "); 879 close(fd); 880 } 881 882 /* 883 * XXX: This method assumes that events may not contain embedded newlines. If 884 * ever events can contain embedded newlines, then CaseFile must switch 885 * serialization formats 886 */ 887 void 888 CaseFile::DeSerialize(ifstream &caseStream) 889 { 890 string evString; 891 const EventFactory &factory(ZfsDaemon::Get().GetFactory()); 892 893 caseStream >> std::noskipws >> std::ws; 894 while (caseStream.good()) { 895 /* 896 * Outline: 897 * read the beginning of a line and check it for 898 * "tentative". If found, discard "tentative". 899 * Create a new event 900 * continue 901 */ 902 EventList* destEvents; 903 const string tentFlag("tentative "); 904 string line; 905 std::stringbuf lineBuf; 906 907 caseStream.get(lineBuf); 908 caseStream.ignore(); /*discard the newline character*/ 909 line = lineBuf.str(); 910 if (line.compare(0, tentFlag.size(), tentFlag) == 0) { 911 /* Discard "tentative" */ 912 line.erase(0, tentFlag.size()); 913 destEvents = &m_tentativeEvents; 914 } else { 915 destEvents = &m_events; 916 } 917 Event *event(Event::CreateEvent(factory, line)); 918 if (event != NULL) { 919 destEvents->push_back(event); 920 RegisterCallout(*event); 921 } 922 } 923 } 924 925 void 926 CaseFile::Close() 927 { 928 /* 929 * This case is no longer relevant. Clean up our 930 * serialization file, and delete the case. 931 */ 932 syslog(LOG_INFO, "CaseFile(%s,%s) closed - State %s\n", 933 PoolGUIDString().c_str(), VdevGUIDString().c_str(), 934 zpool_state_to_name(VdevState(), VDEV_AUX_NONE)); 935 936 /* 937 * Serialization of a Case with no event data, clears the 938 * Serialization data for that event. 939 */ 940 PurgeEvents(); 941 Serialize(); 942 943 delete this; 944 } 945 946 void 947 CaseFile::OnGracePeriodEnded() 948 { 949 bool should_fault, should_degrade; 950 ZpoolList zpl(ZpoolList::ZpoolByGUID, &m_poolGUID); 951 zpool_handle_t *zhp(zpl.empty() ? NULL : zpl.front()); 952 953 m_events.splice(m_events.begin(), m_tentativeEvents); 954 should_fault = ShouldFault(); 955 should_degrade = ShouldDegrade(); 956 957 if (should_fault || should_degrade) { 958 if (zhp == NULL 959 || (VdevIterator(zhp).Find(m_vdevGUID)) == NULL) { 960 /* 961 * Either the pool no longer exists 962 * or this vdev is no longer a member of 963 * the pool. 964 */ 965 Close(); 966 return; 967 } 968 969 } 970 971 /* A fault condition has priority over a degrade condition */ 972 if (ShouldFault()) { 973 /* Fault the vdev and close the case. */ 974 if (zpool_vdev_fault(zhp, (uint64_t)m_vdevGUID, 975 VDEV_AUX_ERR_EXCEEDED) == 0) { 976 syslog(LOG_INFO, "Faulting vdev(%s/%s)", 977 PoolGUIDString().c_str(), 978 VdevGUIDString().c_str()); 979 Close(); 980 return; 981 } 982 else { 983 syslog(LOG_ERR, "Fault vdev(%s/%s): %s: %s\n", 984 PoolGUIDString().c_str(), 985 VdevGUIDString().c_str(), 986 libzfs_error_action(g_zfsHandle), 987 libzfs_error_description(g_zfsHandle)); 988 } 989 } 990 else if (ShouldDegrade()) { 991 /* Degrade the vdev and close the case. */ 992 if (zpool_vdev_degrade(zhp, (uint64_t)m_vdevGUID, 993 VDEV_AUX_ERR_EXCEEDED) == 0) { 994 syslog(LOG_INFO, "Degrading vdev(%s/%s)", 995 PoolGUIDString().c_str(), 996 VdevGUIDString().c_str()); 997 Close(); 998 return; 999 } 1000 else { 1001 syslog(LOG_ERR, "Degrade vdev(%s/%s): %s: %s\n", 1002 PoolGUIDString().c_str(), 1003 VdevGUIDString().c_str(), 1004 libzfs_error_action(g_zfsHandle), 1005 libzfs_error_description(g_zfsHandle)); 1006 } 1007 } 1008 Serialize(); 1009 } 1010 1011 Vdev 1012 CaseFile::BeingReplacedBy(zpool_handle_t *zhp) { 1013 Vdev vd(zhp, CaseVdev(zhp)); 1014 std::list<Vdev> children; 1015 std::list<Vdev>::iterator children_it; 1016 1017 Vdev parent(vd.Parent()); 1018 Vdev replacing(NonexistentVdev); 1019 1020 /* 1021 * To determine whether we are being replaced by another spare that 1022 * is still working, then make sure that it is currently spared and 1023 * that the spare is either resilvering or healthy. If any of these 1024 * conditions fail, then we are not being replaced by a spare. 1025 * 1026 * If the spare is healthy, then the case file should be closed very 1027 * soon after this check. 1028 */ 1029 if (parent.DoesNotExist() 1030 || parent.Name(zhp, /*verbose*/false) != "spare") 1031 return (NonexistentVdev); 1032 1033 children = parent.Children(); 1034 children_it = children.begin(); 1035 for (;children_it != children.end(); children_it++) { 1036 Vdev child = *children_it; 1037 1038 /* Skip our vdev. */ 1039 if (child.GUID() == VdevGUID()) 1040 continue; 1041 /* 1042 * Accept the first child that doesn't match our GUID, or 1043 * any resilvering/healthy device if one exists. 1044 */ 1045 if (replacing.DoesNotExist() || child.IsResilvering() 1046 || child.State() == VDEV_STATE_HEALTHY) 1047 replacing = child; 1048 } 1049 1050 return (replacing); 1051 } 1052 1053 bool 1054 CaseFile::Replace(const char* vdev_type, const char* path, bool isspare) { 1055 nvlist_t *nvroot, *newvd; 1056 const char *poolname; 1057 string oldstr(VdevGUIDString()); 1058 bool retval = true; 1059 1060 /* Figure out what pool we're working on */ 1061 ZpoolList zpl(ZpoolList::ZpoolByGUID, &m_poolGUID); 1062 zpool_handle_t *zhp(zpl.empty() ? NULL : zpl.front()); 1063 if (zhp == NULL) { 1064 syslog(LOG_ERR, "CaseFile::Replace: could not find pool for " 1065 "pool_guid %" PRIu64 ".", (uint64_t)m_poolGUID); 1066 return (false); 1067 } 1068 poolname = zpool_get_name(zhp); 1069 Vdev vd(zhp, CaseVdev(zhp)); 1070 Vdev replaced(BeingReplacedBy(zhp)); 1071 1072 if (isspare && !vd.IsSpare() && !replaced.DoesNotExist()) { 1073 /* If we are already being replaced by a working spare, pass. */ 1074 if (replaced.IsResilvering() 1075 || replaced.State() == VDEV_STATE_HEALTHY) { 1076 syslog(LOG_INFO, "CaseFile::Replace(%s->%s): already " 1077 "replaced", VdevGUIDString().c_str(), path); 1078 return (/*consumed*/false); 1079 } 1080 /* 1081 * If we have already been replaced by a spare, but that spare 1082 * is broken, we must spare the spare, not the original device. 1083 */ 1084 oldstr = replaced.GUIDString(); 1085 syslog(LOG_INFO, "CaseFile::Replace(%s->%s): sparing " 1086 "broken spare %s instead", VdevGUIDString().c_str(), 1087 path, oldstr.c_str()); 1088 } 1089 1090 /* 1091 * Build a root vdev/leaf vdev configuration suitable for 1092 * zpool_vdev_attach. Only enough data for the kernel to find 1093 * the device (i.e. type and disk device node path) are needed. 1094 */ 1095 nvroot = NULL; 1096 newvd = NULL; 1097 1098 if (nvlist_alloc(&nvroot, NV_UNIQUE_NAME, 0) != 0 1099 || nvlist_alloc(&newvd, NV_UNIQUE_NAME, 0) != 0) { 1100 syslog(LOG_ERR, "Replace vdev(%s/%s): Unable to allocate " 1101 "configuration data.", poolname, oldstr.c_str()); 1102 if (nvroot != NULL) 1103 nvlist_free(nvroot); 1104 return (false); 1105 } 1106 if (nvlist_add_string(newvd, ZPOOL_CONFIG_TYPE, vdev_type) != 0 1107 || nvlist_add_string(newvd, ZPOOL_CONFIG_PATH, path) != 0 1108 || nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT) != 0 1109 || nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, 1110 &newvd, 1) != 0) { 1111 syslog(LOG_ERR, "Replace vdev(%s/%s): Unable to initialize " 1112 "configuration data.", poolname, oldstr.c_str()); 1113 nvlist_free(newvd); 1114 nvlist_free(nvroot); 1115 return (true); 1116 } 1117 1118 /* Data was copied when added to the root vdev. */ 1119 nvlist_free(newvd); 1120 1121 retval = (zpool_vdev_attach(zhp, oldstr.c_str(), path, nvroot, 1122 /*replace*/B_TRUE, /*rebuild*/ B_FALSE) == 0); 1123 if (retval) 1124 syslog(LOG_INFO, "Replacing vdev(%s/%s) with %s\n", 1125 poolname, oldstr.c_str(), path); 1126 else 1127 syslog(LOG_ERR, "Replace vdev(%s/%s): %s: %s\n", 1128 poolname, oldstr.c_str(), libzfs_error_action(g_zfsHandle), 1129 libzfs_error_description(g_zfsHandle)); 1130 nvlist_free(nvroot); 1131 1132 return (retval); 1133 } 1134 1135 /* Does the argument event refer to a checksum error? */ 1136 static bool 1137 IsChecksumEvent(const Event* const event) 1138 { 1139 return ("ereport.fs.zfs.checksum" == event->Value("type")); 1140 } 1141 1142 /* Does the argument event refer to an IO error? */ 1143 static bool 1144 IsIOEvent(const Event* const event) 1145 { 1146 return ("ereport.fs.zfs.io" == event->Value("type")); 1147 } 1148 1149 bool 1150 CaseFile::ShouldDegrade() const 1151 { 1152 return (std::count_if(m_events.begin(), m_events.end(), 1153 IsChecksumEvent) > ZFS_DEGRADE_IO_COUNT); 1154 } 1155 1156 bool 1157 CaseFile::ShouldFault() const 1158 { 1159 return (std::count_if(m_events.begin(), m_events.end(), 1160 IsIOEvent) > ZFS_DEGRADE_IO_COUNT); 1161 } 1162 1163 nvlist_t * 1164 CaseFile::CaseVdev(zpool_handle_t *zhp) const 1165 { 1166 return (VdevIterator(zhp).Find(VdevGUID())); 1167 } 1168