/*-
 * Copyright (c) 2011, 2012, 2013, 2014, 2016 Spectra Logic Corporation
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions, and the following disclaimer,
 *    without modification.
 * 2. Redistributions in binary form must reproduce at minimum a disclaimer
 *    substantially similar to the "NO WARRANTY" disclaimer below
 *    ("Disclaimer") and any redistribution must be conditioned upon
 *    including a substantially similar Disclaimer requirement for further
 *    binary redistribution.
 *
 * NO WARRANTY
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
 * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGES.
 *
 * Authors: Justin T. Gibbs (Spectra Logic Corporation)
 */

/**
 * \file case_file.cc
 *
 * We keep case files for any leaf vdev that is not in the optimal state.
 * However, we only serialize to disk those events that need to be preserved
 * across reboots.  For now, this is just a log of soft errors which we
 * accumulate in order to mark a device as degraded.
 */
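
/*
 * On-disk format (see Serialize() and DeSerializeFile() below): each open
 * case is stored under /var/db/zfsd/cases as
 * "pool_<poolGUID>_vdev_<vdevGUID>.case" (GUIDs in decimal; e.g. the
 * hypothetical "pool_1234_vdev_5678.case"), holding one serialized event
 * per line, with tentative events carrying a leading "tentative " prefix.
 */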
#include <sys/cdefs.h>
#include <sys/byteorder.h>
#include <sys/time.h>

#include <sys/fs/zfs.h>

#include <dirent.h>
#include <fcntl.h>
#include <algorithm>	/* std::count_if */
#include <iomanip>
#include <fstream>
#include <functional>
#include <sstream>
#include <syslog.h>
#include <unistd.h>

#include <libzutil.h>
#include <libzfs.h>

#include <list>
#include <map>
#include <string>

#include <devdctl/guid.h>
#include <devdctl/event.h>
#include <devdctl/event_factory.h>
#include <devdctl/exception.h>
#include <devdctl/consumer.h>

#include "callout.h"
#include "vdev_iterator.h"
#include "zfsd_event.h"
#include "case_file.h"
#include "vdev.h"
#include "zfsd.h"
#include "zfsd_exception.h"
#include "zpool_list.h"

/*============================ Namespace Control =============================*/
using std::hex;
using std::ifstream;
using std::string;
using std::stringstream;
using std::setfill;
using std::setw;

using DevdCtl::Event;
using DevdCtl::EventFactory;
using DevdCtl::EventList;
using DevdCtl::Guid;
using DevdCtl::ParseException;

/*--------------------------------- CaseFile ---------------------------------*/
//- CaseFile Static Data -------------------------------------------------------

CaseFileList CaseFile::s_activeCases;
const string CaseFile::s_caseFilePath = "/var/db/zfsd/cases";

//- CaseFile Static Public Methods ---------------------------------------------
CaseFile *
CaseFile::Find(Guid poolGUID, Guid vdevGUID)
{
	for (CaseFileList::iterator curCase = s_activeCases.begin();
	     curCase != s_activeCases.end(); curCase++) {

		if (((*curCase)->PoolGUID() != poolGUID
		  && Guid::InvalidGuid() != poolGUID)
		 || (*curCase)->VdevGUID() != vdevGUID)
			continue;

		/*
		 * We only carry one active case per vdev.
		 */
		return (*curCase);
	}
	return (NULL);
}

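/*
 * Find all open cases matching poolGUID/vdevGUID and append them to "cases".
 * Spare vdevs may have several open cases, so the search only stops early
 * once a non-spare case is found.
 */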
void
CaseFile::Find(Guid poolGUID, Guid vdevGUID, CaseFileList &cases)
{
	for (CaseFileList::iterator curCase = s_activeCases.begin();
	     curCase != s_activeCases.end(); curCase++) {
		if (((*curCase)->PoolGUID() != poolGUID &&
		    Guid::InvalidGuid() != poolGUID) ||
		    (*curCase)->VdevGUID() != vdevGUID)
			continue;

		/*
		 * We can have multiple cases for spare vdevs.
		 */
		cases.push_back(*curCase);
		if (!(*curCase)->IsSpare()) {
			return;
		}
	}
}

CaseFile *
CaseFile::Find(const string &physPath)
{
	CaseFile *result = NULL;

	for (CaseFileList::iterator curCase = s_activeCases.begin();
	     curCase != s_activeCases.end(); curCase++) {

		if ((*curCase)->PhysicalPath() != physPath)
			continue;

		if (result != NULL) {
			syslog(LOG_WARNING, "Multiple casefiles found for "
			    "physical path %s.  "
			    "This is most likely a bug in zfsd.",
			    physPath.c_str());
		}
		result = *curCase;
	}
	return (result);
}

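/*
 * Re-evaluate every open case belonging to the given pool against a new ZFS
 * event.  Iteration uses a saved "next" iterator because a case may delete
 * itself during ReEvaluate().
 */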
void
CaseFile::ReEvaluateByGuid(Guid poolGUID, const ZfsEvent &event)
{
	CaseFileList::iterator casefile;

	for (casefile = s_activeCases.begin();
	     casefile != s_activeCases.end();) {
		CaseFileList::iterator next = casefile;
		next++;
		if (poolGUID == (*casefile)->PoolGUID())
			(*casefile)->ReEvaluate(event);
		casefile = next;
	}
}

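/*
 * Return the active case for this vdev, creating a new one if none exists.
 */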
CaseFile &
CaseFile::Create(Vdev &vdev)
{
	CaseFile *activeCase;

	activeCase = Find(vdev.PoolGUID(), vdev.GUID());
	if (activeCase == NULL)
		activeCase = new CaseFile(vdev);

	return (*activeCase);
}

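/*
 * Reload any cases that were serialized to s_caseFilePath by a previous
 * zfsd instance.
 */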
void
CaseFile::DeSerialize()
{
	struct dirent **caseFiles;

	int numCaseFiles(scandir(s_caseFilePath.c_str(), &caseFiles,
	    DeSerializeSelector, /*compar*/NULL));

	if (numCaseFiles == -1)
		return;
	if (numCaseFiles == 0) {
		free(caseFiles);
		return;
	}

	for (int i = 0; i < numCaseFiles; i++) {
		DeSerializeFile(caseFiles[i]->d_name);
		free(caseFiles[i]);
	}
	free(caseFiles);
}

bool
CaseFile::Empty()
{
	return (s_activeCases.empty());
}

void
CaseFile::LogAll()
{
	for (CaseFileList::iterator curCase = s_activeCases.begin();
	     curCase != s_activeCases.end(); curCase++)
		(*curCase)->Log();
}

void
CaseFile::PurgeAll()
{
	/*
	 * Serialize casefiles before deleting them so that they can be reread
	 * and revalidated during BuildCaseFiles.
	 * CaseFiles remove themselves from this list on destruction.
	 */
	while (s_activeCases.size() != 0) {
		CaseFile *casefile = s_activeCases.front();
		casefile->Serialize();
		delete casefile;
	}
}

int
CaseFile::IsSpare()
{
	return (m_is_spare);
}

//- CaseFile Public Methods ----------------------------------------------------
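/*
 * Refresh our cached copy of the vdev's state, physical path, and name from
 * the live pool configuration.  Returns false if the pool or vdev can no
 * longer be found.
 */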
bool
CaseFile::RefreshVdevState()
{
	ZpoolList zpl(ZpoolList::ZpoolByGUID, &m_poolGUID);
	zpool_handle_t *casePool(zpl.empty() ? NULL : zpl.front());
	if (casePool == NULL)
		return (false);

	Vdev vd(casePool, CaseVdev(casePool));
	if (vd.DoesNotExist())
		return (false);

	m_vdevState    = vd.State();
	m_vdevPhysPath = vd.PhysicalPath();
	m_vdevName     = vd.Name(casePool, false);
	return (true);
}

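/*
 * Re-evaluate this case against a newly arrived device.  Returns true iff
 * the event was "consumed": the device was onlined back into the pool, or a
 * physical-path replacement was initiated.
 */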
bool
CaseFile::ReEvaluate(const string &devPath, const string &physPath, Vdev *vdev)
{
	ZpoolList zpl(ZpoolList::ZpoolByGUID, &m_poolGUID);
	zpool_handle_t *pool(zpl.empty() ? NULL : zpl.front());
	int flags = ZFS_ONLINE_CHECKREMOVE | ZFS_ONLINE_UNSPARE;

	if (pool == NULL || !RefreshVdevState()) {
		/*
		 * The pool or vdev for this case file is no longer
		 * part of the configuration.  This can happen
		 * if we process a device arrival notification
		 * before seeing the ZFS configuration change
		 * event.
		 */
		syslog(LOG_INFO,
		    "CaseFile::ReEvaluate(%s,%s) Pool/Vdev unconfigured.  "
		    "Closing\n",
		    PoolGUIDString().c_str(),
		    VdevGUIDString().c_str());
		Close();

		/*
		 * Since this event was not used to close this
		 * case, do not report it as consumed.
		 */
		return (/*consumed*/false);
	}

	if (VdevState() > VDEV_STATE_FAULTED) {
		/*
		 * For now, newly discovered devices only help for
		 * devices that are missing.  In the future, we might
		 * use a newly inserted spare to replace a degraded
		 * or faulted device.
		 */
		syslog(LOG_INFO,
		    "CaseFile::ReEvaluate(%s,%s): Pool/Vdev ignored",
		    PoolGUIDString().c_str(), VdevGUIDString().c_str());
		return (/*consumed*/false);
	}

	if (vdev != NULL
	 && (vdev->PoolGUID() == m_poolGUID
	  || vdev->PoolGUID() == Guid::InvalidGuid())
	 && vdev->GUID() == m_vdevGUID) {

		if (IsSpare())
			flags |= ZFS_ONLINE_SPARE;
		if (zpool_vdev_online(pool, vdev->GUIDString().c_str(),
		    flags, &m_vdevState) != 0) {
			syslog(LOG_ERR,
			    "Failed to online vdev(%s/%s:%s): %s: %s\n",
			    zpool_get_name(pool), vdev->GUIDString().c_str(),
			    devPath.c_str(), libzfs_error_action(g_zfsHandle),
			    libzfs_error_description(g_zfsHandle));
			return (/*consumed*/false);
		}

		syslog(LOG_INFO, "Onlined vdev(%s/%s:%s).  State now %s.\n",
		    zpool_get_name(pool), vdev->GUIDString().c_str(),
		    devPath.c_str(),
		    zpool_state_to_name(VdevState(), VDEV_AUX_NONE));

		/*
		 * Check the vdev state after the online action to see
		 * if we can retire this case.
		 */
		CloseIfSolved();

		return (/*consumed*/true);
	}

	/*
	 * The arriving device did not match this case's vdev.  A physical
	 * path replacement is only attempted when the pool's auto-replace
	 * policy is enabled and we have matching physical path information.
	 */
	if (zpool_get_prop_int(pool, ZPOOL_PROP_AUTOREPLACE, NULL) == 0) {
		syslog(LOG_INFO,
		    "CaseFile(%s:%s:%s): AutoReplace not set.  "
		    "Ignoring device insertion.\n",
		    PoolGUIDString().c_str(),
		    VdevGUIDString().c_str(),
		    zpool_state_to_name(VdevState(), VDEV_AUX_NONE));
		return (/*consumed*/false);
	}

	if (PhysicalPath().empty()) {
		syslog(LOG_INFO,
		    "CaseFile(%s:%s:%s): No physical path information.  "
		    "Ignoring device insertion.\n",
		    PoolGUIDString().c_str(),
		    VdevGUIDString().c_str(),
		    zpool_state_to_name(VdevState(), VDEV_AUX_NONE));
		return (/*consumed*/false);
	}

	if (physPath != PhysicalPath()) {
		syslog(LOG_INFO,
		    "CaseFile(%s:%s:%s): Physical path mismatch.  "
		    "Ignoring device insertion.\n",
		    PoolGUIDString().c_str(),
		    VdevGUIDString().c_str(),
		    zpool_state_to_name(VdevState(), VDEV_AUX_NONE));
		return (/*consumed*/false);
	}

	/* Write a label on the newly inserted disk. */
	if (zpool_label_disk(g_zfsHandle, pool, devPath.c_str()) != 0) {
		syslog(LOG_ERR,
		    "Replace vdev(%s/%s) by physical path (label): %s: %s\n",
		    zpool_get_name(pool), VdevGUIDString().c_str(),
		    libzfs_error_action(g_zfsHandle),
		    libzfs_error_description(g_zfsHandle));
		return (/*consumed*/false);
	}

	syslog(LOG_INFO, "CaseFile::ReEvaluate(%s/%s): Replacing with %s",
	    PoolGUIDString().c_str(), VdevGUIDString().c_str(),
	    devPath.c_str());
	return (Replace(VDEV_TYPE_DISK, devPath.c_str(), /*isspare*/false));
}

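/*
 * Re-evaluate this case against a new ZFS event.  Removal and destroy events
 * close the case; config syncs and state changes may trigger hot-spare
 * activation; I/O, checksum, and delay ereports are queued as tentative
 * events until the grace period expires.
 */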
bool
CaseFile::ReEvaluate(const ZfsEvent &event)
{
	bool consumed(false);

	if (event.Value("type") == "sysevent.fs.zfs.vdev_remove") {
		/*
		 * The Vdev we represent has been removed from the
		 * configuration.  This case is no longer of value.
		 */
		Close();

		return (/*consumed*/true);
	} else if (event.Value("type") == "sysevent.fs.zfs.pool_destroy") {
		/* This Pool has been destroyed.  Discard the case. */
		Close();

		return (/*consumed*/true);
	} else if (event.Value("type") == "sysevent.fs.zfs.config_sync") {
		RefreshVdevState();
		if (VdevState() < VDEV_STATE_HEALTHY)
			consumed = ActivateSpare();
	}

	if (event.Value("class") == "resource.fs.zfs.removed") {
		bool spare_activated;

		if (!RefreshVdevState()) {
			/*
			 * The pool or vdev for this case file is no longer
			 * part of the configuration.  This can happen
			 * if we process a device arrival notification
			 * before seeing the ZFS configuration change
			 * event.
			 */
			syslog(LOG_INFO,
			    "CaseFile::ReEvaluate(%s,%s) Pool/Vdev "
			    "unconfigured.  Closing\n",
			    PoolGUIDString().c_str(),
			    VdevGUIDString().c_str());
			/*
			 * Close the case now so we won't waste cycles in the
			 * system rescan.
			 */
			Close();

			/*
			 * Since this event was not used to close this
			 * case, do not report it as consumed.
			 */
			return (/*consumed*/false);
		}

		/*
		 * Discard any tentative I/O error events for
		 * this case.  They were most likely caused by the
		 * hot-unplug of this device.
		 */
		PurgeTentativeEvents();

		/* Try to activate spares if they are available. */
		spare_activated = ActivateSpare();

		/*
		 * Rescan the drives in the system to see if a recent
		 * drive arrival can be used to solve this case.
		 */
		ZfsDaemon::RequestSystemRescan();

		/*
		 * Consume the event if we successfully activated a spare.
		 * Otherwise, leave it in the unconsumed events list so that
		 * the future addition of a spare to this pool might be able
		 * to close the case.
		 */
		consumed = spare_activated;
	} else if (event.Value("class") == "resource.fs.zfs.statechange") {
		RefreshVdevState();
		/*
		 * If this vdev is DEGRADED, FAULTED, or UNAVAIL, try to
		 * activate a hotspare.  Otherwise, ignore the event.
		 */
		if (VdevState() == VDEV_STATE_FAULTED ||
		    VdevState() == VDEV_STATE_DEGRADED ||
		    VdevState() == VDEV_STATE_CANT_OPEN)
			(void) ActivateSpare();
		consumed = true;
	} else if (event.Value("class") == "ereport.fs.zfs.io" ||
	    event.Value("class") == "ereport.fs.zfs.checksum" ||
	    event.Value("class") == "ereport.fs.zfs.delay") {
		m_tentativeEvents.push_front(event.DeepCopy());
		RegisterCallout(event);
		consumed = true;
	}

	bool closed(CloseIfSolved());

	return (consumed || closed);
}

/*
 * Find the config nvlist of the parent of the vdev with the given GUID,
 * searching recursively from "config".  Returns NULL if the child vdev is
 * not found.
 */
static nvlist_t*
find_parent(nvlist_t *pool_config, nvlist_t *config, DevdCtl::Guid child_guid)
{
	nvlist_t **vdevChildren;
	int error;
	unsigned ch, numChildren;

	error = nvlist_lookup_nvlist_array(config, ZPOOL_CONFIG_CHILDREN,
	    &vdevChildren, &numChildren);

	if (error != 0 || numChildren == 0)
		return (NULL);

	for (ch = 0; ch < numChildren; ch++) {
		nvlist_t *result;
		Vdev vdev(pool_config, vdevChildren[ch]);

		if (vdev.GUID() == child_guid)
			return (config);

		result = find_parent(pool_config, vdevChildren[ch],
		    child_guid);
		if (result != NULL)
			return (result);
	}

	return (NULL);
}

bool
CaseFile::ActivateSpare()
{
	nvlist_t *config, *nvroot, *parent_config;
	nvlist_t **spares;
	const char *devPath, *poolname, *vdev_type;
	u_int nspares, i;
	int error;

	ZpoolList zpl(ZpoolList::ZpoolByGUID, &m_poolGUID);
	zpool_handle_t *zhp(zpl.empty() ? NULL : zpl.front());
	if (zhp == NULL) {
		syslog(LOG_ERR, "CaseFile::ActivateSpare: Could not find pool "
		    "for pool_guid %" PRIu64 ".", (uint64_t)m_poolGUID);
		return (false);
	}
	poolname = zpool_get_name(zhp);
	config = zpool_get_config(zhp, NULL);
	if (config == NULL) {
		syslog(LOG_ERR, "CaseFile::ActivateSpare: Could not find pool "
		    "config for pool %s", poolname);
		return (false);
	}
	error = nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot);
	if (error != 0) {
		syslog(LOG_ERR, "CaseFile::ActivateSpare: Could not find vdev "
		    "tree for pool %s", poolname);
		return (false);
	}

	parent_config = find_parent(config, nvroot, m_vdevGUID);
	if (parent_config != NULL) {
		const char *parent_type;

		/*
		 * Don't activate spares for members of a "replacing" vdev.
		 * They're already dealt with.  Sparing them will just drag out
		 * the resilver process.
		 */
		error = nvlist_lookup_string(parent_config,
		    ZPOOL_CONFIG_TYPE, &parent_type);
		if (error == 0 && strcmp(parent_type, VDEV_TYPE_REPLACING) == 0)
			return (false);
	}

	nspares = 0;
	nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares,
	    &nspares);
	if (nspares == 0) {
		/* The pool has no spares configured. */
		syslog(LOG_INFO, "CaseFile::ActivateSpare: "
		    "No spares available for pool %s", poolname);
		return (false);
	}
	for (i = 0; i < nspares; i++) {
		uint64_t *nvlist_array;
		vdev_stat_t *vs;
		uint_t nstats;

		if (nvlist_lookup_uint64_array(spares[i],
		    ZPOOL_CONFIG_VDEV_STATS, &nvlist_array, &nstats) != 0) {
			syslog(LOG_ERR, "CaseFile::ActivateSpare: Could not "
			    "find vdev stats for pool %s, spare %d",
			    poolname, i);
			return (false);
		}
		vs = reinterpret_cast<vdev_stat_t *>(nvlist_array);

		if ((vs->vs_aux != VDEV_AUX_SPARED)
		 && (vs->vs_state == VDEV_STATE_HEALTHY)) {
			/* We found a usable spare. */
			break;
		}
	}

	if (i == nspares) {
		/* No available spares were found. */
		return (false);
	}

	error = nvlist_lookup_string(spares[i], ZPOOL_CONFIG_PATH, &devPath);
	if (error != 0) {
		syslog(LOG_ERR, "CaseFile::ActivateSpare: Cannot determine "
		    "the path of pool %s, spare %d. Error %d",
		    poolname, i, error);
		return (false);
	}

	error = nvlist_lookup_string(spares[i], ZPOOL_CONFIG_TYPE, &vdev_type);
	if (error != 0) {
		syslog(LOG_ERR, "CaseFile::ActivateSpare: Cannot determine "
		    "the vdev type of pool %s, spare %d. Error %d",
		    poolname, i, error);
		return (false);
	}

	return (Replace(vdev_type, devPath, /*isspare*/true));
}

/* Does the argument event refer to a checksum error? */
static bool
IsChecksumEvent(const Event* const event)
{
	return ("ereport.fs.zfs.checksum" == event->Value("type"));
}

/* Does the argument event refer to an IO error? */
static bool
IsIOEvent(const Event* const event)
{
	return ("ereport.fs.zfs.io" == event->Value("type"));
}

/* Does the argument event refer to an IO delay? */
static bool
IsDelayEvent(const Event* const event)
{
	return ("ereport.fs.zfs.delay" == event->Value("type"));
}

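/*
 * Schedule (or shorten) the grace-period timer for a newly received
 * tentative event.  The grace period is the vdev's checksum_t/io_t/slow_io_t
 * property when set, and 60 seconds otherwise; when it expires,
 * OnGracePeriodEnded() promotes the accumulated tentative events.
 */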
void
CaseFile::RegisterCallout(const Event &event)
{
	timeval now, countdown, elapsed, timestamp, zero, remaining;
	/**
	 * The time ZFSD waits before promoting a tentative event
	 * into a permanent event.
	 */
	int sec = -1;
	if (IsChecksumEvent(&event))
		sec = CaseFile::GetVdevProp(VDEV_PROP_CHECKSUM_T);
	else if (IsIOEvent(&event))
		sec = CaseFile::GetVdevProp(VDEV_PROP_IO_T);
	else if (IsDelayEvent(&event))
		sec = CaseFile::GetVdevProp(VDEV_PROP_SLOW_IO_T);

	if (sec == -1)
		sec = 60; /* default */

	timeval removeGracePeriod = {
		sec,	/*sec*/
		0	/*usec*/
	};

	gettimeofday(&now, 0);
	timestamp = event.GetTimestamp();
	timersub(&now, &timestamp, &elapsed);
	timersub(&removeGracePeriod, &elapsed, &countdown);

	/*
	 * If the countdown is <= zero, reset the timer to the
	 * smallest positive time value instead.
	 */
	timerclear(&zero);
	if (timercmp(&countdown, &zero, <=)) {
		timerclear(&countdown);
		countdown.tv_usec = 1;
	}

	remaining = m_tentativeTimer.TimeRemaining();

	if (!m_tentativeTimer.IsPending()
	 || timercmp(&countdown, &remaining, <))
		m_tentativeTimer.Reset(countdown, OnGracePeriodEnded, this);
}

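/*
 * Close the case if no events remain and the vdev no longer needs our
 * attention.  Returns true iff the case was closed.
 */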
bool
CaseFile::CloseIfSolved()
{
	if (m_events.empty()
	 && m_tentativeEvents.empty()) {

		/*
		 * We currently do not track or take actions on
		 * devices in the degraded or faulted state.
		 * Once we have support for spare pools, we'll
		 * retain these cases so that any spares added in
		 * the future can be applied to them.
		 */
		switch (VdevState()) {
		case VDEV_STATE_HEALTHY:
			/* No need to keep cases for healthy vdevs. */
			Close();
			return (true);
		case VDEV_STATE_REMOVED:
		case VDEV_STATE_CANT_OPEN:
			/*
			 * Keep open.  We may solve it with a newly inserted
			 * device.
			 */
		case VDEV_STATE_FAULTED:
		case VDEV_STATE_DEGRADED:
			/*
			 * Keep open.  We may solve it with the future
			 * addition of a spare to the pool.
			 */
		case VDEV_STATE_UNKNOWN:
		case VDEV_STATE_CLOSED:
		case VDEV_STATE_OFFLINE:
			/*
			 * Keep open?  This may not be the correct behavior,
			 * but it's what we've always done.
			 */
			;
		}

		/*
		 * Re-serialize the case in order to remove any
		 * previous event data.
		 */
		Serialize();
	}

	return (false);
}

void
CaseFile::Log()
{
	syslog(LOG_INFO, "CaseFile(%s,%s,%s)\n", PoolGUIDString().c_str(),
	    VdevGUIDString().c_str(), PhysicalPath().c_str());
	syslog(LOG_INFO, "\tVdev State = %s\n",
	    zpool_state_to_name(VdevState(), VDEV_AUX_NONE));
	if (m_tentativeEvents.size() != 0) {
		syslog(LOG_INFO, "\t=== Tentative Events ===\n");
		for (EventList::iterator event(m_tentativeEvents.begin());
		     event != m_tentativeEvents.end(); event++)
			(*event)->Log(LOG_INFO);
	}
	if (m_events.size() != 0) {
		syslog(LOG_INFO, "\t=== Events ===\n");
		for (EventList::iterator event(m_events.begin());
		     event != m_events.end(); event++)
			(*event)->Log(LOG_INFO);
	}
}

//- CaseFile Static Protected Methods ------------------------------------------
void
CaseFile::OnGracePeriodEnded(void *arg)
{
	CaseFile &casefile(*static_cast<CaseFile *>(arg));

	casefile.OnGracePeriodEnded();
}

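/*
 * scandir(3) selector used by DeSerialize(): accept only regular files whose
 * names match the case file pattern, e.g. the hypothetical
 * "pool_1234_vdev_5678.case".
 */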
int
CaseFile::DeSerializeSelector(const struct dirent *dirEntry)
{
	uint64_t poolGUID;
	uint64_t vdevGUID;

	if (dirEntry->d_type == DT_REG
	 && sscanf(dirEntry->d_name, "pool_%" PRIu64 "_vdev_%" PRIu64 ".case",
	    &poolGUID, &vdevGUID) == 2)
		return (1);
	return (0);
}

void
CaseFile::DeSerializeFile(const char *fileName)
{
	string fullName(s_caseFilePath + '/' + fileName);
	CaseFile *existingCaseFile(NULL);
	CaseFile *caseFile(NULL);

	try {
		uint64_t poolGUID;
		uint64_t vdevGUID;
		nvlist_t *vdevConf;

		if (sscanf(fileName, "pool_%" PRIu64 "_vdev_%" PRIu64 ".case",
		    &poolGUID, &vdevGUID) != 2) {
			throw ZfsdException("CaseFile::DeSerialize: "
			    "Unintelligible CaseFile filename %s.\n", fileName);
		}
		existingCaseFile = Find(Guid(poolGUID), Guid(vdevGUID));
		if (existingCaseFile != NULL) {
			/*
			 * If the vdev is already degraded or faulted,
			 * there's no point in keeping the state around
			 * that we use to put a drive into the degraded
			 * state.  However, if the vdev is simply missing,
			 * preserve the case data in the hopes that it will
			 * return.
			 */
			caseFile = existingCaseFile;
			vdev_state curState(caseFile->VdevState());
			if (curState > VDEV_STATE_CANT_OPEN
			 && curState < VDEV_STATE_HEALTHY) {
				unlink(fullName.c_str());
				return;
			}
		} else {
			ZpoolList zpl(ZpoolList::ZpoolByGUID, &poolGUID);
			if (zpl.empty()
			 || (vdevConf = VdevIterator(zpl.front())
			    .Find(vdevGUID)) == NULL) {
				/*
				 * Either the pool no longer exists
				 * or this vdev is no longer a member of
				 * the pool.
				 */
				unlink(fullName.c_str());
				return;
			}

			/*
			 * Any vdev we find that does not have a case file
			 * must be in the healthy state and thus worthy of
			 * continued SERD data tracking.
			 */
			caseFile = new CaseFile(Vdev(zpl.front(), vdevConf));
		}

		ifstream caseStream(fullName.c_str());
		if (!caseStream)
			throw ZfsdException("CaseFile::DeSerialize: Unable to "
			    "read %s.\n", fileName);

		caseFile->DeSerialize(caseStream);
	} catch (const ParseException &exp) {

		exp.Log();
		if (caseFile != existingCaseFile)
			delete caseFile;

		/*
		 * Since we can't parse the file, unlink it so we don't
		 * trip over it again.
		 */
		unlink(fullName.c_str());
	} catch (const ZfsdException &zfsException) {

		zfsException.Log();
		if (caseFile != existingCaseFile)
			delete caseFile;
	}
}

//- CaseFile Protected Methods -------------------------------------------------
CaseFile::CaseFile(const Vdev &vdev)
 : m_poolGUID(vdev.PoolGUID()),
   m_vdevGUID(vdev.GUID()),
   m_vdevState(vdev.State()),
   m_vdevPhysPath(vdev.PhysicalPath()),
   m_is_spare(vdev.IsSpare())
{
	stringstream guidString;

	guidString << m_vdevGUID;
	m_vdevGUIDString = guidString.str();
	guidString.str("");
	guidString << m_poolGUID;
	m_poolGUIDString = guidString.str();

	ZpoolList zpl(ZpoolList::ZpoolByGUID, &m_poolGUID);
	zpool_handle_t *zhp(zpl.empty() ? NULL : zpl.front());
	m_vdevName = vdev.Name(zhp, false);

	s_activeCases.push_back(this);

	syslog(LOG_INFO, "Creating new CaseFile:\n");
	Log();
}

CaseFile::~CaseFile()
{
	PurgeEvents();
	PurgeTentativeEvents();
	m_tentativeTimer.Stop();
	s_activeCases.remove(this);
}

void
CaseFile::PurgeEvents()
{
	for (EventList::iterator event(m_events.begin());
	     event != m_events.end(); event++)
		delete *event;

	m_events.clear();
}

void
CaseFile::PurgeTentativeEvents()
{
	for (EventList::iterator event(m_tentativeEvents.begin());
	     event != m_tentativeEvents.end(); event++)
		delete *event;

	m_tentativeEvents.clear();
}

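/*
 * Write the given event list to "fd", one event string per line, prepending
 * "prefix" (when non-NULL) to each line.
 */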
void
CaseFile::SerializeEvList(const EventList events, int fd,
    const char* prefix) const
{
	if (events.empty())
		return;
	for (EventList::const_iterator curEvent = events.begin();
	     curEvent != events.end(); curEvent++) {
		const string &eventString((*curEvent)->GetEventString());

		// TODO: replace many write(2) calls with a single writev(2)
		if (prefix)
			write(fd, prefix, strlen(prefix));
		write(fd, eventString.c_str(), eventString.length());
	}
}

void
CaseFile::Serialize()
{
	stringstream saveFile;

	saveFile << setfill('0')
	    << s_caseFilePath << "/"
	    << "pool_" << PoolGUIDString()
	    << "_vdev_" << VdevGUIDString()
	    << ".case";

	if (m_events.empty() && m_tentativeEvents.empty()) {
		unlink(saveFile.str().c_str());
		return;
	}

	int fd(open(saveFile.str().c_str(), O_CREAT|O_TRUNC|O_WRONLY, 0644));
	if (fd == -1) {
		syslog(LOG_ERR, "CaseFile::Serialize: Unable to open %s.\n",
		    saveFile.str().c_str());
		return;
	}
	SerializeEvList(m_events, fd);
	SerializeEvList(m_tentativeEvents, fd, "tentative ");
	close(fd);
}

/*
 * XXX: This method assumes that events may not contain embedded newlines.  If
 * ever events can contain embedded newlines, then CaseFile must switch
 * serialization formats.
 */
void
CaseFile::DeSerialize(ifstream &caseStream)
{
	string evString;
	const EventFactory &factory(ZfsDaemon::Get().GetFactory());

	caseStream >> std::noskipws >> std::ws;
	while (caseStream.good()) {
		/*
		 * Outline:
		 * read the beginning of a line and check it for
		 * "tentative".  If found, discard "tentative".
		 * Create a new event.
		 * Continue.
		 */
		EventList* destEvents;
		const string tentFlag("tentative ");
		string line;
		std::stringbuf lineBuf;

		caseStream.get(lineBuf);
		caseStream.ignore();  /* discard the newline character */
		line = lineBuf.str();
		if (line.compare(0, tentFlag.size(), tentFlag) == 0) {
			/* Discard "tentative". */
			line.erase(0, tentFlag.size());
			destEvents = &m_tentativeEvents;
		} else {
			destEvents = &m_events;
		}
		Event *event(Event::CreateEvent(factory, line));
		if (event != NULL) {
			destEvents->push_back(event);
			RegisterCallout(*event);
		}
	}
}

void
CaseFile::Close()
{
	/*
	 * This case is no longer relevant.  Clean up our
	 * serialization file, and delete the case.
	 */
	syslog(LOG_INFO, "CaseFile(%s,%s) closed - State %s\n",
	    PoolGUIDString().c_str(), VdevGUIDString().c_str(),
	    zpool_state_to_name(VdevState(), VDEV_AUX_NONE));

	/*
	 * Serializing a case with no event data removes its
	 * serialization file.
	 */
	PurgeEvents();
	Serialize();

	delete this;
}

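/*
 * The grace period for this case's tentative events has expired: promote
 * them to permanent events and, if the error thresholds have been crossed,
 * fault or degrade the vdev.
 */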
void
CaseFile::OnGracePeriodEnded()
{
	bool should_fault, should_degrade;
	ZpoolList zpl(ZpoolList::ZpoolByGUID, &m_poolGUID);
	zpool_handle_t *zhp(zpl.empty() ? NULL : zpl.front());

	m_events.splice(m_events.begin(), m_tentativeEvents);
	should_fault = ShouldFault();
	should_degrade = ShouldDegrade();

	if (should_fault || should_degrade) {
		if (zhp == NULL
		 || (VdevIterator(zhp).Find(m_vdevGUID)) == NULL) {
			/*
			 * Either the pool no longer exists
			 * or this vdev is no longer a member of
			 * the pool.
			 */
			Close();
			return;
		}
	}

	/* A fault condition has priority over a degrade condition. */
	if (should_fault) {
		/* Fault the vdev and close the case. */
		if (zpool_vdev_fault(zhp, (uint64_t)m_vdevGUID,
		    VDEV_AUX_ERR_EXCEEDED) == 0) {
			syslog(LOG_INFO, "Faulting vdev(%s/%s)",
			    PoolGUIDString().c_str(),
			    VdevGUIDString().c_str());
			Close();
			return;
		} else {
			syslog(LOG_ERR, "Fault vdev(%s/%s): %s: %s\n",
			    PoolGUIDString().c_str(),
			    VdevGUIDString().c_str(),
			    libzfs_error_action(g_zfsHandle),
			    libzfs_error_description(g_zfsHandle));
		}
	} else if (should_degrade) {
		/* Degrade the vdev and close the case. */
		if (zpool_vdev_degrade(zhp, (uint64_t)m_vdevGUID,
		    VDEV_AUX_ERR_EXCEEDED) == 0) {
			syslog(LOG_INFO, "Degrading vdev(%s/%s)",
			    PoolGUIDString().c_str(),
			    VdevGUIDString().c_str());
			Close();
			return;
		} else {
			syslog(LOG_ERR, "Degrade vdev(%s/%s): %s: %s\n",
			    PoolGUIDString().c_str(),
			    VdevGUIDString().c_str(),
			    libzfs_error_action(g_zfsHandle),
			    libzfs_error_description(g_zfsHandle));
		}
	}
	Serialize();
}

Vdev
CaseFile::BeingReplacedBy(zpool_handle_t *zhp)
{
	Vdev vd(zhp, CaseVdev(zhp));
	std::list<Vdev> children;
	std::list<Vdev>::iterator children_it;

	Vdev parent(vd.Parent());
	Vdev replacing(NonexistentVdev);

	/*
	 * We are being replaced by another working spare only if our parent
	 * is a "spare" vdev and the replacement is either resilvering or
	 * healthy.  If either condition fails, then we are not being
	 * replaced by a spare.
	 *
	 * If the spare is healthy, then the case file should be closed very
	 * soon after this check.
	 */
	if (parent.DoesNotExist()
	 || parent.Name(zhp, /*verbose*/false) != "spare")
		return (NonexistentVdev);

	children = parent.Children();
	children_it = children.begin();
	for (; children_it != children.end(); children_it++) {
		Vdev child = *children_it;

		/* Skip our vdev. */
		if (child.GUID() == VdevGUID())
			continue;
		/*
		 * Accept the first child that doesn't match our GUID, or
		 * any resilvering/healthy device if one exists.
		 */
		if (replacing.DoesNotExist() || child.IsResilvering()
		 || child.State() == VDEV_STATE_HEALTHY)
			replacing = child;
	}

	return (replacing);
}

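/*
 * Attach a new vdev (or a hot spare, when isspare is set) in place of this
 * case's vdev.  If we have already been replaced by a broken spare, the
 * broken spare is the device that gets replaced instead.  Returns true if
 * the attach was initiated.
 */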
bool
CaseFile::Replace(const char* vdev_type, const char* path, bool isspare)
{
	nvlist_t *nvroot, *newvd;
	const char *poolname;
	string oldstr(VdevGUIDString());
	bool retval = true;

	/* Figure out what pool we're working on. */
	ZpoolList zpl(ZpoolList::ZpoolByGUID, &m_poolGUID);
	zpool_handle_t *zhp(zpl.empty() ? NULL : zpl.front());
	if (zhp == NULL) {
		syslog(LOG_ERR, "CaseFile::Replace: could not find pool for "
		    "pool_guid %" PRIu64 ".", (uint64_t)m_poolGUID);
		return (false);
	}
	poolname = zpool_get_name(zhp);
	Vdev vd(zhp, CaseVdev(zhp));
	Vdev replaced(BeingReplacedBy(zhp));

	if (isspare && !vd.IsSpare() && !replaced.DoesNotExist()) {
		/* If we are already being replaced by a working spare, pass. */
		if (replaced.IsResilvering()
		 || replaced.State() == VDEV_STATE_HEALTHY) {
			syslog(LOG_INFO, "CaseFile::Replace(%s->%s): already "
			    "replaced", VdevGUIDString().c_str(), path);
			return (/*consumed*/false);
		}
		/*
		 * If we have already been replaced by a spare, but that spare
		 * is broken, we must spare the spare, not the original device.
		 */
		oldstr = replaced.GUIDString();
		syslog(LOG_INFO, "CaseFile::Replace(%s->%s): sparing "
		    "broken spare %s instead", VdevGUIDString().c_str(),
		    path, oldstr.c_str());
	}

	/*
	 * Build a root vdev/leaf vdev configuration suitable for
	 * zpool_vdev_attach.  Only enough data for the kernel to find
	 * the device (i.e. type and disk device node path) are needed.
	 */
	nvroot = NULL;
	newvd = NULL;

	if (nvlist_alloc(&nvroot, NV_UNIQUE_NAME, 0) != 0
	 || nvlist_alloc(&newvd, NV_UNIQUE_NAME, 0) != 0) {
		syslog(LOG_ERR, "Replace vdev(%s/%s): Unable to allocate "
		    "configuration data.", poolname, oldstr.c_str());
		if (nvroot != NULL)
			nvlist_free(nvroot);
		return (false);
	}
	if (nvlist_add_string(newvd, ZPOOL_CONFIG_TYPE, vdev_type) != 0
	 || nvlist_add_string(newvd, ZPOOL_CONFIG_PATH, path) != 0
	 || nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT) != 0
	 || nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
	    &newvd, 1) != 0) {
		syslog(LOG_ERR, "Replace vdev(%s/%s): Unable to initialize "
		    "configuration data.", poolname, oldstr.c_str());
		nvlist_free(newvd);
		nvlist_free(nvroot);
		return (false);
	}

	/* Data was copied when added to the root vdev. */
	nvlist_free(newvd);

	retval = (zpool_vdev_attach(zhp, oldstr.c_str(), path, nvroot,
	    /*replace*/B_TRUE, /*rebuild*/B_FALSE) == 0);
	if (retval)
		syslog(LOG_INFO, "Replacing vdev(%s/%s) with %s\n",
		    poolname, oldstr.c_str(), path);
	else
		syslog(LOG_ERR, "Replace vdev(%s/%s): %s: %s\n",
		    poolname, oldstr.c_str(), libzfs_error_action(g_zfsHandle),
		    libzfs_error_description(g_zfsHandle));
	nvlist_free(nvroot);

	return (retval);
}

/* Look up a vdev prop.  Used for the checksum, IO, and slow IO props. */
int
CaseFile::GetVdevProp(vdev_prop_t vdev_prop) const
{
	char val[ZFS_MAXPROPLEN];
	zprop_source_t srctype;
	DevdCtl::Guid poolGUID = PoolGUID();
	ZpoolList zpl(ZpoolList::ZpoolByGUID, &poolGUID);
	zpool_handle_t *zhp(zpl.empty() ? NULL : zpl.front());

	char *prop_str = (char *) vdev_prop_to_name(vdev_prop);
	if (zhp == NULL || zpool_get_vdev_prop(zhp, m_vdevName.c_str(),
	    vdev_prop, prop_str, val, sizeof (val), &srctype, B_FALSE) != 0)
		return (-1);

	/* We'll get "-" from libzfs for a prop that is not set. */
	if (zfs_isnumber(val) == B_FALSE)
		return (-1);

	return (atoi(val));
}

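/*
 * Decide whether the accumulated permanent events warrant degrading the
 * vdev: more than checksum_n checksum errors (or, when that vdev property
 * is unset, DEFAULT_ZFS_DEGRADE_IO_COUNT) must have survived the grace
 * period.  For example, after a hypothetical "zpool set checksum_n=5
 * <pool> <vdev>", the sixth permanent checksum event would degrade the
 * vdev.  ShouldFault() below applies the same logic to the io_n and
 * slow_io_n thresholds.
 */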
bool
CaseFile::ShouldDegrade() const
{
	int checksum_n = GetVdevProp(VDEV_PROP_CHECKSUM_N);
	if (checksum_n == -1)
		checksum_n = DEFAULT_ZFS_DEGRADE_IO_COUNT;
	return (std::count_if(m_events.begin(), m_events.end(),
	    IsChecksumEvent) > checksum_n);
}

bool
CaseFile::ShouldFault() const
{
	bool should_fault_for_io, should_fault_for_delay;
	int io_n = GetVdevProp(VDEV_PROP_IO_N);
	int slow_io_n = GetVdevProp(VDEV_PROP_SLOW_IO_N);

	if (io_n == -1)
		io_n = DEFAULT_ZFS_DEGRADE_IO_COUNT;
	if (slow_io_n == -1)
		slow_io_n = DEFAULT_ZFS_FAULT_SLOW_IO_COUNT;

	should_fault_for_io = std::count_if(m_events.begin(), m_events.end(),
	    IsIOEvent) > io_n;
	should_fault_for_delay = std::count_if(m_events.begin(), m_events.end(),
	    IsDelayEvent) > slow_io_n;

	return (should_fault_for_io || should_fault_for_delay);
}

nvlist_t *
CaseFile::CaseVdev(zpool_handle_t *zhp) const
{
	return (VdevIterator(zhp).Find(VdevGUID()));
}