xref: /freebsd/cddl/usr.sbin/zfsd/case_file.cc (revision e0c4386e7e71d93b0edc0c8fa156263fc4a8b0b6)
1 /*-
2  * Copyright (c) 2011, 2012, 2013, 2014, 2016 Spectra Logic Corporation
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions, and the following disclaimer,
10  *    without modification.
11  * 2. Redistributions in binary form must reproduce at minimum a disclaimer
12  *    substantially similar to the "NO WARRANTY" disclaimer below
13  *    ("Disclaimer") and any redistribution must be conditioned upon
14  *    including a substantially similar Disclaimer requirement for further
15  *    binary redistribution.
16  *
17  * NO WARRANTY
18  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR
21  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22  * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
26  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
27  * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
28  * POSSIBILITY OF SUCH DAMAGES.
29  *
30  * Authors: Justin T. Gibbs     (Spectra Logic Corporation)
31  */
32 
33 /**
34  * \file case_file.cc
35  *
36  * We keep case files for any leaf vdev that is not in the optimal state.
37  * However, we only serialize to disk those events that need to be preserved
38  * across reboots.  For now, this is just a log of soft errors which we
39  * accumulate in order to mark a device as degraded.
40  */
41 #include <sys/cdefs.h>
42 #include <sys/byteorder.h>
43 #include <sys/time.h>
44 
45 #include <sys/fs/zfs.h>
46 
47 #include <dirent.h>
48 #include <fcntl.h>
49 #include <iomanip>
50 #include <fstream>
51 #include <functional>
52 #include <sstream>
53 #include <syslog.h>
54 #include <unistd.h>
55 
56 #include <libzfs.h>
57 
58 #include <list>
59 #include <map>
60 #include <string>
61 
62 #include <devdctl/guid.h>
63 #include <devdctl/event.h>
64 #include <devdctl/event_factory.h>
65 #include <devdctl/exception.h>
66 #include <devdctl/consumer.h>
67 
68 #include "callout.h"
69 #include "vdev_iterator.h"
70 #include "zfsd_event.h"
71 #include "case_file.h"
72 #include "vdev.h"
73 #include "zfsd.h"
74 #include "zfsd_exception.h"
75 #include "zpool_list.h"
76 /*============================ Namespace Control =============================*/
77 using std::hex;
78 using std::ifstream;
79 using std::stringstream;
80 using std::setfill;
81 using std::setw;
82 
83 using DevdCtl::Event;
84 using DevdCtl::EventFactory;
85 using DevdCtl::EventList;
86 using DevdCtl::Guid;
87 using DevdCtl::ParseException;
88 
89 /*--------------------------------- CaseFile ---------------------------------*/
90 //- CaseFile Static Data -------------------------------------------------------
91 
92 CaseFileList  CaseFile::s_activeCases;
93 const string  CaseFile::s_caseFilePath = "/var/db/zfsd/cases";
94 const timeval CaseFile::s_removeGracePeriod = { 60 /*sec*/, 0 /*usec*/};
95 
96 //- CaseFile Static Public Methods ---------------------------------------------
97 CaseFile *
98 CaseFile::Find(Guid poolGUID, Guid vdevGUID)
99 {
100 	for (CaseFileList::iterator curCase = s_activeCases.begin();
101 	     curCase != s_activeCases.end(); curCase++) {
102 
103 		if (((*curCase)->PoolGUID() != poolGUID
104 		  && Guid::InvalidGuid() != poolGUID)
105 		 || (*curCase)->VdevGUID() != vdevGUID)
106 			continue;
107 
108 		/*
109 		 * We only carry one active case per-vdev.
110 		 */
111 		return (*curCase);
112 	}
113 	return (NULL);
114 }
115 
116 void
117 CaseFile::Find(Guid poolGUID, Guid vdevGUID, CaseFileList &cases)
118 {
119 	for (CaseFileList::iterator curCase = s_activeCases.begin();
120 	    curCase != s_activeCases.end(); curCase++) {
121 		if (((*curCase)->PoolGUID() != poolGUID &&
122 		    Guid::InvalidGuid() != poolGUID) ||
123 		    (*curCase)->VdevGUID() != vdevGUID)
124 			continue;
125 
126 		/*
127 		 * We can have multiple cases for spare vdevs
128 		 */
129 		cases.push_back(*curCase);
130 		if (!(*curCase)->IsSpare()) {
131 			return;
132 		}
133 	}
134 }
135 
136 CaseFile *
137 CaseFile::Find(const string &physPath)
138 {
139 	CaseFile *result = NULL;
140 
141 	for (CaseFileList::iterator curCase = s_activeCases.begin();
142 	     curCase != s_activeCases.end(); curCase++) {
143 
144 		if ((*curCase)->PhysicalPath() != physPath)
145 			continue;
146 
147 		if (result != NULL) {
148 			syslog(LOG_WARNING, "Multiple casefiles found for "
149 			    "physical path %s.  "
150 			    "This is most likely a bug in zfsd",
151 			    physPath.c_str());
152 		}
153 		result = *curCase;
154 	}
155 	return (result);
156 }
157 
158 
159 void
160 CaseFile::ReEvaluateByGuid(Guid poolGUID, const ZfsEvent &event)
161 {
162 	CaseFileList::iterator casefile;
163 	for (casefile = s_activeCases.begin(); casefile != s_activeCases.end();){
164 		CaseFileList::iterator next = casefile;
165 		next++;
166 		if (poolGUID == (*casefile)->PoolGUID())
167 			(*casefile)->ReEvaluate(event);
168 		casefile = next;
169 	}
170 }
171 
172 CaseFile &
173 CaseFile::Create(Vdev &vdev)
174 {
175 	CaseFile *activeCase;
176 
177 	activeCase = Find(vdev.PoolGUID(), vdev.GUID());
178 	if (activeCase == NULL)
179 		activeCase = new CaseFile(vdev);
180 
181 	return (*activeCase);
182 }
183 
184 void
185 CaseFile::DeSerialize()
186 {
187 	struct dirent **caseFiles;
188 
189 	int numCaseFiles(scandir(s_caseFilePath.c_str(), &caseFiles,
190 			 DeSerializeSelector, /*compar*/NULL));
191 
192 	if (numCaseFiles == -1)
193 		return;
194 	if (numCaseFiles == 0) {
195 		free(caseFiles);
196 		return;
197 	}
198 
199 	for (int i = 0; i < numCaseFiles; i++) {
200 
201 		DeSerializeFile(caseFiles[i]->d_name);
202 		free(caseFiles[i]);
203 	}
204 	free(caseFiles);
205 }
206 
207 bool
208 CaseFile::Empty()
209 {
210 	return (s_activeCases.empty());
211 }
212 
213 void
214 CaseFile::LogAll()
215 {
216 	for (CaseFileList::iterator curCase = s_activeCases.begin();
217 	     curCase != s_activeCases.end(); curCase++)
218 		(*curCase)->Log();
219 }
220 
221 void
222 CaseFile::PurgeAll()
223 {
224 	/*
225 	 * Serialize casefiles before deleting them so that they can be reread
226 	 * and revalidated during BuildCaseFiles.
227 	 * CaseFiles remove themselves from this list on destruction.
228 	 */
229 	while (s_activeCases.size() != 0) {
230 		CaseFile *casefile = s_activeCases.front();
231 		casefile->Serialize();
232 		delete casefile;
233 	}
234 
235 }
236 
237 int
238 CaseFile::IsSpare()
239 {
240 	return (m_is_spare);
241 }
242 
243 //- CaseFile Public Methods ----------------------------------------------------
244 bool
245 CaseFile::RefreshVdevState()
246 {
247 	ZpoolList zpl(ZpoolList::ZpoolByGUID, &m_poolGUID);
248 	zpool_handle_t *casePool(zpl.empty() ? NULL : zpl.front());
249 	if (casePool == NULL)
250 		return (false);
251 
252 	Vdev vd(casePool, CaseVdev(casePool));
253 	if (vd.DoesNotExist())
254 		return (false);
255 
256 	m_vdevState    = vd.State();
257 	m_vdevPhysPath = vd.PhysicalPath();
258 	return (true);
259 }
260 
261 bool
262 CaseFile::ReEvaluate(const string &devPath, const string &physPath, Vdev *vdev)
263 {
264 	ZpoolList zpl(ZpoolList::ZpoolByGUID, &m_poolGUID);
265 	zpool_handle_t *pool(zpl.empty() ? NULL : zpl.front());
266 	int flags = ZFS_ONLINE_CHECKREMOVE | ZFS_ONLINE_UNSPARE;
267 
268 	if (pool == NULL || !RefreshVdevState()) {
269 		/*
270 		 * The pool or vdev for this case file is no longer
271 		 * part of the configuration.  This can happen
272 		 * if we process a device arrival notification
273 		 * before seeing the ZFS configuration change
274 		 * event.
275 		 */
276 		syslog(LOG_INFO,
277 		       "CaseFile::ReEvaluate(%s,%s) Pool/Vdev unconfigured.  "
278 		       "Closing\n",
279 		       PoolGUIDString().c_str(),
280 		       VdevGUIDString().c_str());
281 		Close();
282 
283 		/*
284 		 * Since this event was not used to close this
285 		 * case, do not report it as consumed.
286 		 */
287 		return (/*consumed*/false);
288 	}
289 
290 	if (VdevState() > VDEV_STATE_CANT_OPEN) {
291 		/*
292 		 * For now, newly discovered devices only help for
293 		 * devices that are missing.  In the future, we might
294 		 * use a newly inserted spare to replace a degraded
295 		 * or faulted device.
296 		 */
297 		syslog(LOG_INFO, "CaseFile::ReEvaluate(%s,%s): Pool/Vdev ignored",
298 		    PoolGUIDString().c_str(), VdevGUIDString().c_str());
299 		return (/*consumed*/false);
300 	}
301 
302 	if (vdev != NULL
303 	 && ( vdev->PoolGUID() == m_poolGUID
304 	   || vdev->PoolGUID() == Guid::InvalidGuid())
305 	 && vdev->GUID() == m_vdevGUID) {
306 
307 		if (IsSpare())
308 			flags |= ZFS_ONLINE_SPARE;
309 		if (zpool_vdev_online(pool, vdev->GUIDString().c_str(),
310 		    flags, &m_vdevState) != 0) {
311 			syslog(LOG_ERR,
312 			    "Failed to online vdev(%s/%s:%s): %s: %s\n",
313 			    zpool_get_name(pool), vdev->GUIDString().c_str(),
314 			    devPath.c_str(), libzfs_error_action(g_zfsHandle),
315 			    libzfs_error_description(g_zfsHandle));
316 			return (/*consumed*/false);
317 		}
318 
319 		syslog(LOG_INFO, "Onlined vdev(%s/%s:%s).  State now %s.\n",
320 		       zpool_get_name(pool), vdev->GUIDString().c_str(),
321 		       devPath.c_str(),
322 		       zpool_state_to_name(VdevState(), VDEV_AUX_NONE));
323 
324 		/*
325 		 * Check the vdev state post the online action to see
326 		 * if we can retire this case.
327 		 */
328 		CloseIfSolved();
329 
330 		return (/*consumed*/true);
331 	}
332 
333 	/*
334 	 * If the auto-replace policy is enabled, and we have physical
335 	 * path information, try a physical path replacement.
336 	 */
337 	if (zpool_get_prop_int(pool, ZPOOL_PROP_AUTOREPLACE, NULL) == 0) {
338 		syslog(LOG_INFO,
339 		       "CaseFile(%s:%s:%s): AutoReplace not set.  "
340 		       "Ignoring device insertion.\n",
341 		       PoolGUIDString().c_str(),
342 		       VdevGUIDString().c_str(),
343 		       zpool_state_to_name(VdevState(), VDEV_AUX_NONE));
344 		return (/*consumed*/false);
345 	}
346 
347 	if (PhysicalPath().empty()) {
348 		syslog(LOG_INFO,
349 		       "CaseFile(%s:%s:%s): No physical path information.  "
350 		       "Ignoring device insertion.\n",
351 		       PoolGUIDString().c_str(),
352 		       VdevGUIDString().c_str(),
353 		       zpool_state_to_name(VdevState(), VDEV_AUX_NONE));
354 		return (/*consumed*/false);
355 	}
356 
357 	if (physPath != PhysicalPath()) {
358 		syslog(LOG_INFO,
359 		       "CaseFile(%s:%s:%s): Physical path mismatch.  "
360 		       "Ignoring device insertion.\n",
361 		       PoolGUIDString().c_str(),
362 		       VdevGUIDString().c_str(),
363 		       zpool_state_to_name(VdevState(), VDEV_AUX_NONE));
364 		return (/*consumed*/false);
365 	}
366 
367 	/* Write a label on the newly inserted disk. */
368 	if (zpool_label_disk(g_zfsHandle, pool, devPath.c_str()) != 0) {
369 		syslog(LOG_ERR,
370 		       "Replace vdev(%s/%s) by physical path (label): %s: %s\n",
371 		       zpool_get_name(pool), VdevGUIDString().c_str(),
372 		       libzfs_error_action(g_zfsHandle),
373 		       libzfs_error_description(g_zfsHandle));
374 		return (/*consumed*/false);
375 	}
376 
377 	syslog(LOG_INFO, "CaseFile::ReEvaluate(%s/%s): Replacing with %s",
378 	    PoolGUIDString().c_str(), VdevGUIDString().c_str(),
379 	    devPath.c_str());
380 	return (Replace(VDEV_TYPE_DISK, devPath.c_str(), /*isspare*/false));
381 }
382 
383 bool
384 CaseFile::ReEvaluate(const ZfsEvent &event)
385 {
386 	bool consumed(false);
387 
388 	if (event.Value("type") == "sysevent.fs.zfs.vdev_remove") {
389 		/*
390 		 * The Vdev we represent has been removed from the
391 		 * configuration.  This case is no longer of value.
392 		 */
393 		Close();
394 
395 		return (/*consumed*/true);
396 	} else if (event.Value("type") == "sysevent.fs.zfs.pool_destroy") {
397 		/* This Pool has been destroyed.  Discard the case */
398 		Close();
399 
400 		return (/*consumed*/true);
401 	} else if (event.Value("type") == "sysevent.fs.zfs.config_sync") {
402 		RefreshVdevState();
403 		if (VdevState() < VDEV_STATE_HEALTHY)
404 			consumed = ActivateSpare();
405 	}
406 
407 
408 	if (event.Value("class") == "resource.fs.zfs.removed") {
409 		bool spare_activated;
410 
411 		if (!RefreshVdevState()) {
412 			/*
413 			 * The pool or vdev for this case file is no longer
414 			 * part of the configuration.  This can happen
415 			 * if we process a device arrival notification
416 			 * before seeing the ZFS configuration change
417 			 * event.
418 			 */
419 			syslog(LOG_INFO,
420 			       "CaseFile::ReEvaluate(%s,%s) Pool/Vdev "
421 			       "unconfigured.  Closing\n",
422 			       PoolGUIDString().c_str(),
423 			       VdevGUIDString().c_str());
424 			/*
425 			 * Close the case now so we won't waste cycles in the
426 			 * system rescan
427 			 */
428 			Close();
429 
430 			/*
431 			 * Since this event was not used to close this
432 			 * case, do not report it as consumed.
433 			 */
434 			return (/*consumed*/false);
435 		}
436 
437 		/*
438 		 * Discard any tentative I/O error events for
439 		 * this case.  They were most likely caused by the
440 		 * hot-unplug of this device.
441 		 */
442 		PurgeTentativeEvents();
443 
444 		/* Try to activate spares if they are available */
445 		spare_activated = ActivateSpare();
446 
447 		/*
448 		 * Rescan the drives in the system to see if a recent
449 		 * drive arrival can be used to solve this case.
450 		 */
451 		ZfsDaemon::RequestSystemRescan();
452 
453 		/*
454 		 * Consume the event if we successfully activated a spare.
455 		 * Otherwise, leave it in the unconsumed events list so that the
456 		 * future addition of a spare to this pool might be able to
457 		 * close the case
458 		 */
459 		consumed = spare_activated;
460 	} else if (event.Value("class") == "resource.fs.zfs.statechange") {
461 		RefreshVdevState();
462 		/*
463 		 * If this vdev is DEGRADED, FAULTED, or UNAVAIL, try to
464 		 * activate a hotspare.  Otherwise, ignore the event
465 		 */
466 		if (VdevState() == VDEV_STATE_FAULTED ||
467 		    VdevState() == VDEV_STATE_DEGRADED ||
468 		    VdevState() == VDEV_STATE_CANT_OPEN)
469 			(void) ActivateSpare();
470 		consumed = true;
471 	}
472 	else if (event.Value("class") == "ereport.fs.zfs.io" ||
473 	         event.Value("class") == "ereport.fs.zfs.checksum" ||
474 		 event.Value("class") == "ereport.fs.zfs.delay") {
475 
476 		m_tentativeEvents.push_front(event.DeepCopy());
477 		RegisterCallout(event);
478 		consumed = true;
479 	}
480 
481 	bool closed(CloseIfSolved());
482 
483 	return (consumed || closed);
484 }
485 
486 /* Find a Vdev containing the vdev with the given GUID */
487 static nvlist_t*
488 find_parent(nvlist_t *pool_config, nvlist_t *config, DevdCtl::Guid child_guid)
489 {
490 	nvlist_t **vdevChildren;
491 	int        error;
492 	unsigned   ch, numChildren;
493 
494 	error = nvlist_lookup_nvlist_array(config, ZPOOL_CONFIG_CHILDREN,
495 					   &vdevChildren, &numChildren);
496 
497 	if (error != 0 || numChildren == 0)
498 		return (NULL);
499 
500 	for (ch = 0; ch < numChildren; ch++) {
501 		nvlist *result;
502 		Vdev vdev(pool_config, vdevChildren[ch]);
503 
504 		if (vdev.GUID() == child_guid)
505 			return (config);
506 
507 		result = find_parent(pool_config, vdevChildren[ch], child_guid);
508 		if (result != NULL)
509 			return (result);
510 	}
511 
512 	return (NULL);
513 }
514 
515 bool
516 CaseFile::ActivateSpare() {
517 	nvlist_t	*config, *nvroot, *parent_config;
518 	nvlist_t       **spares;
519 	const char	*devPath, *poolname, *vdev_type;
520 	u_int		 nspares, i;
521 	int		 error;
522 
523 	ZpoolList zpl(ZpoolList::ZpoolByGUID, &m_poolGUID);
524 	zpool_handle_t	*zhp(zpl.empty() ? NULL : zpl.front());
525 	if (zhp == NULL) {
526 		syslog(LOG_ERR, "CaseFile::ActivateSpare: Could not find pool "
527 		       "for pool_guid %" PRIu64".", (uint64_t)m_poolGUID);
528 		return (false);
529 	}
530 	poolname = zpool_get_name(zhp);
531 	config = zpool_get_config(zhp, NULL);
532 	if (config == NULL) {
533 		syslog(LOG_ERR, "CaseFile::ActivateSpare: Could not find pool "
534 		       "config for pool %s", poolname);
535 		return (false);
536 	}
537 	error = nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot);
538 	if (error != 0){
539 		syslog(LOG_ERR, "CaseFile::ActivateSpare: Could not find vdev "
540 		       "tree for pool %s", poolname);
541 		return (false);
542 	}
543 
544 	parent_config = find_parent(config, nvroot, m_vdevGUID);
545 	if (parent_config != NULL) {
546 		const char *parent_type;
547 
548 		/*
549 		 * Don't activate spares for members of a "replacing" vdev.
550 		 * They're already dealt with.  Sparing them will just drag out
551 		 * the resilver process.
552 		 */
553 		error = nvlist_lookup_string(parent_config,
554 		    ZPOOL_CONFIG_TYPE, &parent_type);
555 		if (error == 0 && strcmp(parent_type, VDEV_TYPE_REPLACING) == 0)
556 			return (false);
557 	}
558 
559 	nspares = 0;
560 	nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares,
561 				   &nspares);
562 	if (nspares == 0) {
563 		/* The pool has no spares configured */
564 		syslog(LOG_INFO, "CaseFile::ActivateSpare: "
565 		       "No spares available for pool %s", poolname);
566 		return (false);
567 	}
568 	for (i = 0; i < nspares; i++) {
569 		uint64_t    *nvlist_array;
570 		vdev_stat_t *vs;
571 		uint_t	     nstats;
572 
573 		if (nvlist_lookup_uint64_array(spares[i],
574 		    ZPOOL_CONFIG_VDEV_STATS, &nvlist_array, &nstats) != 0) {
575 			syslog(LOG_ERR, "CaseFile::ActivateSpare: Could not "
576 			       "find vdev stats for pool %s, spare %d",
577 			       poolname, i);
578 			return (false);
579 		}
580 		vs = reinterpret_cast<vdev_stat_t *>(nvlist_array);
581 
582 		if ((vs->vs_aux != VDEV_AUX_SPARED)
583 		 && (vs->vs_state == VDEV_STATE_HEALTHY)) {
584 			/* We found a usable spare */
585 			break;
586 		}
587 	}
588 
589 	if (i == nspares) {
590 		/* No available spares were found */
591 		return (false);
592 	}
593 
594 	error = nvlist_lookup_string(spares[i], ZPOOL_CONFIG_PATH, &devPath);
595 	if (error != 0) {
596 		syslog(LOG_ERR, "CaseFile::ActivateSpare: Cannot determine "
597 		       "the path of pool %s, spare %d. Error %d",
598 		       poolname, i, error);
599 		return (false);
600 	}
601 
602 	error = nvlist_lookup_string(spares[i], ZPOOL_CONFIG_TYPE, &vdev_type);
603 	if (error != 0) {
604 		syslog(LOG_ERR, "CaseFile::ActivateSpare: Cannot determine "
605 		       "the vdev type of pool %s, spare %d. Error %d",
606 		       poolname, i, error);
607 		return (false);
608 	}
609 
610 	return (Replace(vdev_type, devPath, /*isspare*/true));
611 }
612 
613 void
614 CaseFile::RegisterCallout(const Event &event)
615 {
616 	timeval now, countdown, elapsed, timestamp, zero, remaining;
617 
618 	gettimeofday(&now, 0);
619 	timestamp = event.GetTimestamp();
620 	timersub(&now, &timestamp, &elapsed);
621 	timersub(&s_removeGracePeriod, &elapsed, &countdown);
622 	/*
623 	 * If countdown is <= zero, Reset the timer to the
624 	 * smallest positive time value instead
625 	 */
626 	timerclear(&zero);
627 	if (timercmp(&countdown, &zero, <=)) {
628 		timerclear(&countdown);
629 		countdown.tv_usec = 1;
630 	}
631 
632 	remaining = m_tentativeTimer.TimeRemaining();
633 
634 	if (!m_tentativeTimer.IsPending()
635 	 || timercmp(&countdown, &remaining, <))
636 		m_tentativeTimer.Reset(countdown, OnGracePeriodEnded, this);
637 }
638 
639 
640 bool
641 CaseFile::CloseIfSolved()
642 {
643 	if (m_events.empty()
644 	 && m_tentativeEvents.empty()) {
645 
646 		/*
647 		 * We currently do not track or take actions on
648 		 * devices in the degraded or faulted state.
649 		 * Once we have support for spare pools, we'll
650 		 * retain these cases so that any spares added in
651 		 * the future can be applied to them.
652 		 */
653 		switch (VdevState()) {
654 		case VDEV_STATE_HEALTHY:
655 			/* No need to keep cases for healthy vdevs */
656 			Close();
657 			return (true);
658 		case VDEV_STATE_REMOVED:
659 		case VDEV_STATE_CANT_OPEN:
660 			/*
661 			 * Keep open.  We may solve it with a newly inserted
662 			 * device.
663 			 */
664 		case VDEV_STATE_FAULTED:
665 		case VDEV_STATE_DEGRADED:
666 			/*
667 			 * Keep open.  We may solve it with the future
668 			 * addition of a spare to the pool
669 			 */
670 		case VDEV_STATE_UNKNOWN:
671 		case VDEV_STATE_CLOSED:
672 		case VDEV_STATE_OFFLINE:
673 			/*
674 			 * Keep open?  This may not be the correct behavior,
675 			 * but it's what we've always done
676 			 */
677 			;
678 		}
679 
680 		/*
681 		 * Re-serialize the case in order to remove any
682 		 * previous event data.
683 		 */
684 		Serialize();
685 	}
686 
687 	return (false);
688 }
689 
690 void
691 CaseFile::Log()
692 {
693 	syslog(LOG_INFO, "CaseFile(%s,%s,%s)\n", PoolGUIDString().c_str(),
694 	       VdevGUIDString().c_str(), PhysicalPath().c_str());
695 	syslog(LOG_INFO, "\tVdev State = %s\n",
696 	       zpool_state_to_name(VdevState(), VDEV_AUX_NONE));
697 	if (m_tentativeEvents.size() != 0) {
698 		syslog(LOG_INFO, "\t=== Tentative Events ===\n");
699 		for (EventList::iterator event(m_tentativeEvents.begin());
700 		     event != m_tentativeEvents.end(); event++)
701 			(*event)->Log(LOG_INFO);
702 	}
703 	if (m_events.size() != 0) {
704 		syslog(LOG_INFO, "\t=== Events ===\n");
705 		for (EventList::iterator event(m_events.begin());
706 		     event != m_events.end(); event++)
707 			(*event)->Log(LOG_INFO);
708 	}
709 }
710 
711 //- CaseFile Static Protected Methods ------------------------------------------
712 void
713 CaseFile::OnGracePeriodEnded(void *arg)
714 {
715 	CaseFile &casefile(*static_cast<CaseFile *>(arg));
716 
717 	casefile.OnGracePeriodEnded();
718 }
719 
720 int
721 CaseFile::DeSerializeSelector(const struct dirent *dirEntry)
722 {
723 	uint64_t poolGUID;
724 	uint64_t vdevGUID;
725 
726 	if (dirEntry->d_type == DT_REG
727 	 && sscanf(dirEntry->d_name, "pool_%" PRIu64 "_vdev_%" PRIu64 ".case",
728 		   &poolGUID, &vdevGUID) == 2)
729 		return (1);
730 	return (0);
731 }
732 
733 void
734 CaseFile::DeSerializeFile(const char *fileName)
735 {
736 	string	  fullName(s_caseFilePath + '/' + fileName);
737 	CaseFile *existingCaseFile(NULL);
738 	CaseFile *caseFile(NULL);
739 
740 	try {
741 		uint64_t poolGUID;
742 		uint64_t vdevGUID;
743 		nvlist_t *vdevConf;
744 
745 		if (sscanf(fileName, "pool_%" PRIu64 "_vdev_%" PRIu64 ".case",
746 		       &poolGUID, &vdevGUID) != 2) {
747 			throw ZfsdException("CaseFile::DeSerialize: "
748 			    "Unintelligible CaseFile filename %s.\n", fileName);
749 		}
750 		existingCaseFile = Find(Guid(poolGUID), Guid(vdevGUID));
751 		if (existingCaseFile != NULL) {
752 			/*
753 			 * If the vdev is already degraded or faulted,
754 			 * there's no point in keeping the state around
755 			 * that we use to put a drive into the degraded
756 			 * state.  However, if the vdev is simply missing,
757 			 * preserve the case data in the hopes that it will
758 			 * return.
759 			 */
760 			caseFile = existingCaseFile;
761 			vdev_state curState(caseFile->VdevState());
762 			if (curState > VDEV_STATE_CANT_OPEN
763 			 && curState < VDEV_STATE_HEALTHY) {
764 				unlink(fileName);
765 				return;
766 			}
767 		} else {
768 			ZpoolList zpl(ZpoolList::ZpoolByGUID, &poolGUID);
769 			if (zpl.empty()
770 			 || (vdevConf = VdevIterator(zpl.front())
771 						    .Find(vdevGUID)) == NULL) {
772 				/*
773 				 * Either the pool no longer exists
774 				 * or this vdev is no longer a member of
775 				 * the pool.
776 				 */
777 				unlink(fullName.c_str());
778 				return;
779 			}
780 
781 			/*
782 			 * Any vdev we find that does not have a case file
783 			 * must be in the healthy state and thus worthy of
784 			 * continued SERD data tracking.
785 			 */
786 			caseFile = new CaseFile(Vdev(zpl.front(), vdevConf));
787 		}
788 
789 		ifstream caseStream(fullName.c_str());
790 		if (!caseStream)
791 			throw ZfsdException("CaseFile::DeSerialize: Unable to "
792 					    "read %s.\n", fileName);
793 
794 		caseFile->DeSerialize(caseStream);
795 	} catch (const ParseException &exp) {
796 
797 		exp.Log();
798 		if (caseFile != existingCaseFile)
799 			delete caseFile;
800 
801 		/*
802 		 * Since we can't parse the file, unlink it so we don't
803 		 * trip over it again.
804 		 */
805 		unlink(fileName);
806 	} catch (const ZfsdException &zfsException) {
807 
808 		zfsException.Log();
809 		if (caseFile != existingCaseFile)
810 			delete caseFile;
811 	}
812 }
813 
814 //- CaseFile Protected Methods -------------------------------------------------
815 CaseFile::CaseFile(const Vdev &vdev)
816  : m_poolGUID(vdev.PoolGUID()),
817    m_vdevGUID(vdev.GUID()),
818    m_vdevState(vdev.State()),
819    m_vdevPhysPath(vdev.PhysicalPath()),
820    m_is_spare(vdev.IsSpare())
821 {
822 	stringstream guidString;
823 
824 	guidString << m_vdevGUID;
825 	m_vdevGUIDString = guidString.str();
826 	guidString.str("");
827 	guidString << m_poolGUID;
828 	m_poolGUIDString = guidString.str();
829 
830 	s_activeCases.push_back(this);
831 
832 	syslog(LOG_INFO, "Creating new CaseFile:\n");
833 	Log();
834 }
835 
836 CaseFile::~CaseFile()
837 {
838 	PurgeEvents();
839 	PurgeTentativeEvents();
840 	m_tentativeTimer.Stop();
841 	s_activeCases.remove(this);
842 }
843 
844 void
845 CaseFile::PurgeEvents()
846 {
847 	for (EventList::iterator event(m_events.begin());
848 	     event != m_events.end(); event++)
849 		delete *event;
850 
851 	m_events.clear();
852 }
853 
854 void
855 CaseFile::PurgeTentativeEvents()
856 {
857 	for (EventList::iterator event(m_tentativeEvents.begin());
858 	     event != m_tentativeEvents.end(); event++)
859 		delete *event;
860 
861 	m_tentativeEvents.clear();
862 }
863 
864 void
865 CaseFile::SerializeEvList(const EventList events, int fd,
866 		const char* prefix) const
867 {
868 	if (events.empty())
869 		return;
870 	for (EventList::const_iterator curEvent = events.begin();
871 	     curEvent != events.end(); curEvent++) {
872 		const string &eventString((*curEvent)->GetEventString());
873 
874 		// TODO: replace many write(2) calls with a single writev(2)
875 		if (prefix)
876 			write(fd, prefix, strlen(prefix));
877 		write(fd, eventString.c_str(), eventString.length());
878 	}
879 }
880 
881 void
882 CaseFile::Serialize()
883 {
884 	stringstream saveFile;
885 
886 	saveFile << setfill('0')
887 		 << s_caseFilePath << "/"
888 		 << "pool_" << PoolGUIDString()
889 		 << "_vdev_" << VdevGUIDString()
890 		 << ".case";
891 
892 	if (m_events.empty() && m_tentativeEvents.empty()) {
893 		unlink(saveFile.str().c_str());
894 		return;
895 	}
896 
897 	int fd(open(saveFile.str().c_str(), O_CREAT|O_TRUNC|O_WRONLY, 0644));
898 	if (fd == -1) {
899 		syslog(LOG_ERR, "CaseFile::Serialize: Unable to open %s.\n",
900 		       saveFile.str().c_str());
901 		return;
902 	}
903 	SerializeEvList(m_events, fd);
904 	SerializeEvList(m_tentativeEvents, fd, "tentative ");
905 	close(fd);
906 }
907 
908 /*
909  * XXX: This method assumes that events may not contain embedded newlines.  If
910  * ever events can contain embedded newlines, then CaseFile must switch
911  * serialization formats
912  */
913 void
914 CaseFile::DeSerialize(ifstream &caseStream)
915 {
916 	string	      evString;
917 	const EventFactory &factory(ZfsDaemon::Get().GetFactory());
918 
919 	caseStream >> std::noskipws >> std::ws;
920 	while (caseStream.good()) {
921 		/*
922 		 * Outline:
923 		 * read the beginning of a line and check it for
924 		 * "tentative".  If found, discard "tentative".
925 		 * Create a new event
926 		 * continue
927 		 */
928 		EventList* destEvents;
929 		const string tentFlag("tentative ");
930 		string line;
931 		std::stringbuf lineBuf;
932 
933 		caseStream.get(lineBuf);
934 		caseStream.ignore();  /*discard the newline character*/
935 		line = lineBuf.str();
936 		if (line.compare(0, tentFlag.size(), tentFlag) == 0) {
937 			/* Discard "tentative" */
938 			line.erase(0, tentFlag.size());
939 			destEvents = &m_tentativeEvents;
940 		} else {
941 			destEvents = &m_events;
942 		}
943 		Event *event(Event::CreateEvent(factory, line));
944 		if (event != NULL) {
945 			destEvents->push_back(event);
946 			RegisterCallout(*event);
947 		}
948 	}
949 }
950 
951 void
952 CaseFile::Close()
953 {
954 	/*
955 	 * This case is no longer relevant.  Clean up our
956 	 * serialization file, and delete the case.
957 	 */
958 	syslog(LOG_INFO, "CaseFile(%s,%s) closed - State %s\n",
959 	       PoolGUIDString().c_str(), VdevGUIDString().c_str(),
960 	       zpool_state_to_name(VdevState(), VDEV_AUX_NONE));
961 
962 	/*
963 	 * Serialization of a Case with no event data, clears the
964 	 * Serialization data for that event.
965 	 */
966 	PurgeEvents();
967 	Serialize();
968 
969 	delete this;
970 }
971 
972 void
973 CaseFile::OnGracePeriodEnded()
974 {
975 	bool should_fault, should_degrade;
976 	ZpoolList zpl(ZpoolList::ZpoolByGUID, &m_poolGUID);
977 	zpool_handle_t *zhp(zpl.empty() ? NULL : zpl.front());
978 
979 	m_events.splice(m_events.begin(), m_tentativeEvents);
980 	should_fault = ShouldFault();
981 	should_degrade = ShouldDegrade();
982 
983 	if (should_fault || should_degrade) {
984 		if (zhp == NULL
985 		 || (VdevIterator(zhp).Find(m_vdevGUID)) == NULL) {
986 			/*
987 			 * Either the pool no longer exists
988 			 * or this vdev is no longer a member of
989 			 * the pool.
990 			 */
991 			Close();
992 			return;
993 		}
994 
995 	}
996 
997 	/* A fault condition has priority over a degrade condition */
998 	if (ShouldFault()) {
999 		/* Fault the vdev and close the case. */
1000 		if (zpool_vdev_fault(zhp, (uint64_t)m_vdevGUID,
1001 				       VDEV_AUX_ERR_EXCEEDED) == 0) {
1002 			syslog(LOG_INFO, "Faulting vdev(%s/%s)",
1003 			       PoolGUIDString().c_str(),
1004 			       VdevGUIDString().c_str());
1005 			Close();
1006 			return;
1007 		}
1008 		else {
1009 			syslog(LOG_ERR, "Fault vdev(%s/%s): %s: %s\n",
1010 			       PoolGUIDString().c_str(),
1011 			       VdevGUIDString().c_str(),
1012 			       libzfs_error_action(g_zfsHandle),
1013 			       libzfs_error_description(g_zfsHandle));
1014 		}
1015 	}
1016 	else if (ShouldDegrade()) {
1017 		/* Degrade the vdev and close the case. */
1018 		if (zpool_vdev_degrade(zhp, (uint64_t)m_vdevGUID,
1019 				       VDEV_AUX_ERR_EXCEEDED) == 0) {
1020 			syslog(LOG_INFO, "Degrading vdev(%s/%s)",
1021 			       PoolGUIDString().c_str(),
1022 			       VdevGUIDString().c_str());
1023 			Close();
1024 			return;
1025 		}
1026 		else {
1027 			syslog(LOG_ERR, "Degrade vdev(%s/%s): %s: %s\n",
1028 			       PoolGUIDString().c_str(),
1029 			       VdevGUIDString().c_str(),
1030 			       libzfs_error_action(g_zfsHandle),
1031 			       libzfs_error_description(g_zfsHandle));
1032 		}
1033 	}
1034 	Serialize();
1035 }
1036 
1037 Vdev
1038 CaseFile::BeingReplacedBy(zpool_handle_t *zhp) {
1039 	Vdev vd(zhp, CaseVdev(zhp));
1040 	std::list<Vdev> children;
1041 	std::list<Vdev>::iterator children_it;
1042 
1043 	Vdev parent(vd.Parent());
1044 	Vdev replacing(NonexistentVdev);
1045 
1046 	/*
1047 	 * To determine whether we are being replaced by another spare that
1048 	 * is still working, then make sure that it is currently spared and
1049 	 * that the spare is either resilvering or healthy.  If any of these
1050 	 * conditions fail, then we are not being replaced by a spare.
1051 	 *
1052 	 * If the spare is healthy, then the case file should be closed very
1053 	 * soon after this check.
1054 	 */
1055 	if (parent.DoesNotExist()
1056 	 || parent.Name(zhp, /*verbose*/false) != "spare")
1057 		return (NonexistentVdev);
1058 
1059 	children = parent.Children();
1060 	children_it = children.begin();
1061 	for (;children_it != children.end(); children_it++) {
1062 		Vdev child = *children_it;
1063 
1064 		/* Skip our vdev. */
1065 		if (child.GUID() == VdevGUID())
1066 			continue;
1067 		/*
1068 		 * Accept the first child that doesn't match our GUID, or
1069 		 * any resilvering/healthy device if one exists.
1070 		 */
1071 		if (replacing.DoesNotExist() || child.IsResilvering()
1072 		 || child.State() == VDEV_STATE_HEALTHY)
1073 			replacing = child;
1074 	}
1075 
1076 	return (replacing);
1077 }
1078 
1079 bool
1080 CaseFile::Replace(const char* vdev_type, const char* path, bool isspare) {
1081 	nvlist_t *nvroot, *newvd;
1082 	const char *poolname;
1083 	string oldstr(VdevGUIDString());
1084 	bool retval = true;
1085 
1086 	/* Figure out what pool we're working on */
1087 	ZpoolList zpl(ZpoolList::ZpoolByGUID, &m_poolGUID);
1088 	zpool_handle_t *zhp(zpl.empty() ? NULL : zpl.front());
1089 	if (zhp == NULL) {
1090 		syslog(LOG_ERR, "CaseFile::Replace: could not find pool for "
1091 		       "pool_guid %" PRIu64 ".", (uint64_t)m_poolGUID);
1092 		return (false);
1093 	}
1094 	poolname = zpool_get_name(zhp);
1095 	Vdev vd(zhp, CaseVdev(zhp));
1096 	Vdev replaced(BeingReplacedBy(zhp));
1097 
1098 	if (isspare && !vd.IsSpare() && !replaced.DoesNotExist()) {
1099 		/* If we are already being replaced by a working spare, pass. */
1100 		if (replaced.IsResilvering()
1101 		 || replaced.State() == VDEV_STATE_HEALTHY) {
1102 			syslog(LOG_INFO, "CaseFile::Replace(%s->%s): already "
1103 			    "replaced", VdevGUIDString().c_str(), path);
1104 			return (/*consumed*/false);
1105 		}
1106 		/*
1107 		 * If we have already been replaced by a spare, but that spare
1108 		 * is broken, we must spare the spare, not the original device.
1109 		 */
1110 		oldstr = replaced.GUIDString();
1111 		syslog(LOG_INFO, "CaseFile::Replace(%s->%s): sparing "
1112 		    "broken spare %s instead", VdevGUIDString().c_str(),
1113 		    path, oldstr.c_str());
1114 	}
1115 
1116 	/*
1117 	 * Build a root vdev/leaf vdev configuration suitable for
1118 	 * zpool_vdev_attach. Only enough data for the kernel to find
1119 	 * the device (i.e. type and disk device node path) are needed.
1120 	 */
1121 	nvroot = NULL;
1122 	newvd = NULL;
1123 
1124 	if (nvlist_alloc(&nvroot, NV_UNIQUE_NAME, 0) != 0
1125 	 || nvlist_alloc(&newvd, NV_UNIQUE_NAME, 0) != 0) {
1126 		syslog(LOG_ERR, "Replace vdev(%s/%s): Unable to allocate "
1127 		    "configuration data.", poolname, oldstr.c_str());
1128 		if (nvroot != NULL)
1129 			nvlist_free(nvroot);
1130 		return (false);
1131 	}
1132 	if (nvlist_add_string(newvd, ZPOOL_CONFIG_TYPE, vdev_type) != 0
1133 	 || nvlist_add_string(newvd, ZPOOL_CONFIG_PATH, path) != 0
1134 	 || nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT) != 0
1135 	 || nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
1136 				    &newvd, 1) != 0) {
1137 		syslog(LOG_ERR, "Replace vdev(%s/%s): Unable to initialize "
1138 		    "configuration data.", poolname, oldstr.c_str());
1139 		nvlist_free(newvd);
1140 		nvlist_free(nvroot);
1141 		return (true);
1142 	}
1143 
1144 	/* Data was copied when added to the root vdev. */
1145 	nvlist_free(newvd);
1146 
1147 	retval = (zpool_vdev_attach(zhp, oldstr.c_str(), path, nvroot,
1148        /*replace*/B_TRUE, /*rebuild*/ B_FALSE) == 0);
1149 	if (retval)
1150 		syslog(LOG_INFO, "Replacing vdev(%s/%s) with %s\n",
1151 		    poolname, oldstr.c_str(), path);
1152 	else
1153 		syslog(LOG_ERR, "Replace vdev(%s/%s): %s: %s\n",
1154 		    poolname, oldstr.c_str(), libzfs_error_action(g_zfsHandle),
1155 		    libzfs_error_description(g_zfsHandle));
1156 	nvlist_free(nvroot);
1157 
1158 	return (retval);
1159 }
1160 
1161 /* Does the argument event refer to a checksum error? */
1162 static bool
1163 IsChecksumEvent(const Event* const event)
1164 {
1165 	return ("ereport.fs.zfs.checksum" == event->Value("type"));
1166 }
1167 
1168 /* Does the argument event refer to an IO error? */
1169 static bool
1170 IsIOEvent(const Event* const event)
1171 {
1172 	return ("ereport.fs.zfs.io" == event->Value("type"));
1173 }
1174 
1175 /* Does the argument event refer to an IO delay? */
1176 static bool
1177 IsDelayEvent(const Event* const event)
1178 {
1179 	return ("ereport.fs.zfs.delay" == event->Value("type"));
1180 }
1181 
1182 bool
1183 CaseFile::ShouldDegrade() const
1184 {
1185 	return (std::count_if(m_events.begin(), m_events.end(),
1186 			      IsChecksumEvent) > ZFS_DEGRADE_IO_COUNT);
1187 }
1188 
1189 bool
1190 CaseFile::ShouldFault() const
1191 {
1192 	bool should_fault_for_io, should_fault_for_delay;
1193 
1194 	should_fault_for_io = std::count_if(m_events.begin(), m_events.end(),
1195 			      IsIOEvent) > ZFS_DEGRADE_IO_COUNT;
1196 	should_fault_for_delay = std::count_if(m_events.begin(), m_events.end(),
1197 			      IsDelayEvent) > ZFS_FAULT_DELAY_COUNT;
1198 
1199 	return (should_fault_for_io || should_fault_for_delay);
1200 }
1201 
1202 nvlist_t *
1203 CaseFile::CaseVdev(zpool_handle_t *zhp) const
1204 {
1205 	return (VdevIterator(zhp).Find(VdevGUID()));
1206 }
1207