xref: /freebsd/cddl/usr.sbin/zfsd/case_file.cc (revision b37f6c9805edb4b89f0a8c2b78f78a3dcfc0647b)
1 /*-
2  * Copyright (c) 2011, 2012, 2013, 2014, 2016 Spectra Logic Corporation
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions, and the following disclaimer,
10  *    without modification.
11  * 2. Redistributions in binary form must reproduce at minimum a disclaimer
12  *    substantially similar to the "NO WARRANTY" disclaimer below
13  *    ("Disclaimer") and any redistribution must be conditioned upon
14  *    including a substantially similar Disclaimer requirement for further
15  *    binary redistribution.
16  *
17  * NO WARRANTY
18  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR
21  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22  * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
26  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
27  * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
28  * POSSIBILITY OF SUCH DAMAGES.
29  *
30  * Authors: Justin T. Gibbs     (Spectra Logic Corporation)
31  */
32 
33 /**
34  * \file case_file.cc
35  *
36  * We keep case files for any leaf vdev that is not in the optimal state.
37  * However, we only serialize to disk those events that need to be preserved
38  * across reboots.  For now, this is just a log of soft errors which we
39  * accumulate in order to mark a device as degraded.
40  */
41 #include <sys/cdefs.h>
42 #include <sys/time.h>
43 
44 #include <sys/fs/zfs.h>
45 
46 #include <dirent.h>
47 #include <iomanip>
48 #include <fstream>
49 #include <functional>
50 #include <sstream>
51 #include <syslog.h>
52 #include <unistd.h>
53 
54 #include <libzfs.h>
55 
56 #include <list>
57 #include <map>
58 #include <string>
59 
60 #include <devdctl/guid.h>
61 #include <devdctl/event.h>
62 #include <devdctl/event_factory.h>
63 #include <devdctl/exception.h>
64 #include <devdctl/consumer.h>
65 
66 #include "callout.h"
67 #include "vdev_iterator.h"
68 #include "zfsd_event.h"
69 #include "case_file.h"
70 #include "vdev.h"
71 #include "zfsd.h"
72 #include "zfsd_exception.h"
73 #include "zpool_list.h"
74 
75 __FBSDID("$FreeBSD$");
76 
77 /*============================ Namespace Control =============================*/
78 using std::auto_ptr;
79 using std::hex;
80 using std::ifstream;
81 using std::stringstream;
82 using std::setfill;
83 using std::setw;
84 
85 using DevdCtl::Event;
86 using DevdCtl::EventFactory;
87 using DevdCtl::EventList;
88 using DevdCtl::Guid;
89 using DevdCtl::ParseException;
90 
91 /*--------------------------------- CaseFile ---------------------------------*/
92 //- CaseFile Static Data -------------------------------------------------------
93 
94 CaseFileList  CaseFile::s_activeCases;
95 const string  CaseFile::s_caseFilePath = "/var/db/zfsd/cases";
96 const timeval CaseFile::s_removeGracePeriod = { 60 /*sec*/, 0 /*usec*/};
97 
98 //- CaseFile Static Public Methods ---------------------------------------------
99 CaseFile *
100 CaseFile::Find(Guid poolGUID, Guid vdevGUID)
101 {
102 	for (CaseFileList::iterator curCase = s_activeCases.begin();
103 	     curCase != s_activeCases.end(); curCase++) {
104 
105 		if (((*curCase)->PoolGUID() != poolGUID
106 		  && Guid::InvalidGuid() != poolGUID)
107 		 || (*curCase)->VdevGUID() != vdevGUID)
108 			continue;
109 
110 		/*
111 		 * We only carry one active case per-vdev.
112 		 */
113 		return (*curCase);
114 	}
115 	return (NULL);
116 }
117 
118 CaseFile *
119 CaseFile::Find(const string &physPath)
120 {
121 	CaseFile *result = NULL;
122 
123 	for (CaseFileList::iterator curCase = s_activeCases.begin();
124 	     curCase != s_activeCases.end(); curCase++) {
125 
126 		if ((*curCase)->PhysicalPath() != physPath)
127 			continue;
128 
129 		if (result != NULL) {
130 			syslog(LOG_WARNING, "Multiple casefiles found for "
131 			    "physical path %s.  "
132 			    "This is most likely a bug in zfsd",
133 			    physPath.c_str());
134 		}
135 		result = *curCase;
136 	}
137 	return (result);
138 }
139 
140 
141 void
142 CaseFile::ReEvaluateByGuid(Guid poolGUID, const ZfsEvent &event)
143 {
144 	CaseFileList::iterator casefile;
145 	for (casefile = s_activeCases.begin(); casefile != s_activeCases.end();){
146 		CaseFileList::iterator next = casefile;
147 		next++;
148 		if (poolGUID == (*casefile)->PoolGUID())
149 			(*casefile)->ReEvaluate(event);
150 		casefile = next;
151 	}
152 }
153 
154 CaseFile &
155 CaseFile::Create(Vdev &vdev)
156 {
157 	CaseFile *activeCase;
158 
159 	activeCase = Find(vdev.PoolGUID(), vdev.GUID());
160 	if (activeCase == NULL)
161 		activeCase = new CaseFile(vdev);
162 
163 	return (*activeCase);
164 }
165 
166 void
167 CaseFile::DeSerialize()
168 {
169 	struct dirent **caseFiles;
170 
171 	int numCaseFiles(scandir(s_caseFilePath.c_str(), &caseFiles,
172 			 DeSerializeSelector, /*compar*/NULL));
173 
174 	if (numCaseFiles == -1)
175 		return;
176 	if (numCaseFiles == 0) {
177 		free(caseFiles);
178 		return;
179 	}
180 
181 	for (int i = 0; i < numCaseFiles; i++) {
182 
183 		DeSerializeFile(caseFiles[i]->d_name);
184 		free(caseFiles[i]);
185 	}
186 	free(caseFiles);
187 }
188 
189 void
190 CaseFile::LogAll()
191 {
192 	for (CaseFileList::iterator curCase = s_activeCases.begin();
193 	     curCase != s_activeCases.end(); curCase++)
194 		(*curCase)->Log();
195 }
196 
197 void
198 CaseFile::PurgeAll()
199 {
200 	/*
201 	 * Serialize casefiles before deleting them so that they can be reread
202 	 * and revalidated during BuildCaseFiles.
203 	 * CaseFiles remove themselves from this list on destruction.
204 	 */
205 	while (s_activeCases.size() != 0) {
206 		CaseFile *casefile = s_activeCases.front();
207 		casefile->Serialize();
208 		delete casefile;
209 	}
210 
211 }
212 
213 //- CaseFile Public Methods ----------------------------------------------------
214 bool
215 CaseFile::RefreshVdevState()
216 {
217 	ZpoolList zpl(ZpoolList::ZpoolByGUID, &m_poolGUID);
218 	zpool_handle_t *casePool(zpl.empty() ? NULL : zpl.front());
219 	if (casePool == NULL)
220 		return (false);
221 
222 	Vdev vd(casePool, CaseVdev(casePool));
223 	if (vd.DoesNotExist())
224 		return (false);
225 
226 	m_vdevState    = vd.State();
227 	m_vdevPhysPath = vd.PhysicalPath();
228 	return (true);
229 }
230 
231 bool
232 CaseFile::ReEvaluate(const string &devPath, const string &physPath, Vdev *vdev)
233 {
234 	ZpoolList zpl(ZpoolList::ZpoolByGUID, &m_poolGUID);
235 	zpool_handle_t *pool(zpl.empty() ? NULL : zpl.front());
236 
237 	if (pool == NULL || !RefreshVdevState()) {
238 		/*
239 		 * The pool or vdev for this case file is no longer
240 		 * part of the configuration.  This can happen
241 		 * if we process a device arrival notification
242 		 * before seeing the ZFS configuration change
243 		 * event.
244 		 */
245 		syslog(LOG_INFO,
246 		       "CaseFile::ReEvaluate(%s,%s) Pool/Vdev unconfigured.  "
247 		       "Closing\n",
248 		       PoolGUIDString().c_str(),
249 		       VdevGUIDString().c_str());
250 		Close();
251 
252 		/*
253 		 * Since this event was not used to close this
254 		 * case, do not report it as consumed.
255 		 */
256 		return (/*consumed*/false);
257 	}
258 
259 	if (VdevState() > VDEV_STATE_CANT_OPEN) {
260 		/*
261 		 * For now, newly discovered devices only help for
262 		 * devices that are missing.  In the future, we might
263 		 * use a newly inserted spare to replace a degraded
264 		 * or faulted device.
265 		 */
266 		syslog(LOG_INFO, "CaseFile::ReEvaluate(%s,%s): Pool/Vdev ignored",
267 		    PoolGUIDString().c_str(), VdevGUIDString().c_str());
268 		return (/*consumed*/false);
269 	}
270 
271 	if (vdev != NULL
272 	 && ( vdev->PoolGUID() == m_poolGUID
273 	   || vdev->PoolGUID() == Guid::InvalidGuid())
274 	 && vdev->GUID() == m_vdevGUID) {
275 
276 		zpool_vdev_online(pool, vdev->GUIDString().c_str(),
277 				  ZFS_ONLINE_CHECKREMOVE | ZFS_ONLINE_UNSPARE,
278 				  &m_vdevState);
279 		syslog(LOG_INFO, "Onlined vdev(%s/%s:%s).  State now %s.\n",
280 		       zpool_get_name(pool), vdev->GUIDString().c_str(),
281 		       devPath.c_str(),
282 		       zpool_state_to_name(VdevState(), VDEV_AUX_NONE));
283 
284 		/*
285 		 * Check the vdev state post the online action to see
286 		 * if we can retire this case.
287 		 */
288 		CloseIfSolved();
289 
290 		return (/*consumed*/true);
291 	}
292 
293 	/*
294 	 * If the auto-replace policy is enabled, and we have physical
295 	 * path information, try a physical path replacement.
296 	 */
297 	if (zpool_get_prop_int(pool, ZPOOL_PROP_AUTOREPLACE, NULL) == 0) {
298 		syslog(LOG_INFO,
299 		       "CaseFile(%s:%s:%s): AutoReplace not set.  "
300 		       "Ignoring device insertion.\n",
301 		       PoolGUIDString().c_str(),
302 		       VdevGUIDString().c_str(),
303 		       zpool_state_to_name(VdevState(), VDEV_AUX_NONE));
304 		return (/*consumed*/false);
305 	}
306 
307 	if (PhysicalPath().empty()) {
308 		syslog(LOG_INFO,
309 		       "CaseFile(%s:%s:%s): No physical path information.  "
310 		       "Ignoring device insertion.\n",
311 		       PoolGUIDString().c_str(),
312 		       VdevGUIDString().c_str(),
313 		       zpool_state_to_name(VdevState(), VDEV_AUX_NONE));
314 		return (/*consumed*/false);
315 	}
316 
317 	if (physPath != PhysicalPath()) {
318 		syslog(LOG_INFO,
319 		       "CaseFile(%s:%s:%s): Physical path mismatch.  "
320 		       "Ignoring device insertion.\n",
321 		       PoolGUIDString().c_str(),
322 		       VdevGUIDString().c_str(),
323 		       zpool_state_to_name(VdevState(), VDEV_AUX_NONE));
324 		return (/*consumed*/false);
325 	}
326 
327 	/* Write a label on the newly inserted disk. */
328 	if (zpool_label_disk(g_zfsHandle, pool, devPath.c_str()) != 0) {
329 		syslog(LOG_ERR,
330 		       "Replace vdev(%s/%s) by physical path (label): %s: %s\n",
331 		       zpool_get_name(pool), VdevGUIDString().c_str(),
332 		       libzfs_error_action(g_zfsHandle),
333 		       libzfs_error_description(g_zfsHandle));
334 		return (/*consumed*/false);
335 	}
336 
337 	syslog(LOG_INFO, "CaseFile::ReEvaluate(%s/%s): Replacing with %s",
338 	    PoolGUIDString().c_str(), VdevGUIDString().c_str(),
339 	    devPath.c_str());
340 	return (Replace(VDEV_TYPE_DISK, devPath.c_str(), /*isspare*/false));
341 }
342 
343 bool
344 CaseFile::ReEvaluate(const ZfsEvent &event)
345 {
346 	bool consumed(false);
347 
348 	if (event.Value("type") == "misc.fs.zfs.vdev_remove") {
349 		/*
350 		 * The Vdev we represent has been removed from the
351 		 * configuration.  This case is no longer of value.
352 		 */
353 		Close();
354 
355 		return (/*consumed*/true);
356 	} else if (event.Value("type") == "misc.fs.zfs.pool_destroy") {
357 		/* This Pool has been destroyed.  Discard the case */
358 		Close();
359 
360 		return (/*consumed*/true);
361 	} else if (event.Value("type") == "misc.fs.zfs.config_sync") {
362 		RefreshVdevState();
363 		if (VdevState() < VDEV_STATE_HEALTHY)
364 			consumed = ActivateSpare();
365 	}
366 
367 
368 	if (event.Value("class") == "resource.fs.zfs.removed") {
369 		bool spare_activated;
370 
371 		if (!RefreshVdevState()) {
372 			/*
373 			 * The pool or vdev for this case file is no longer
374 			 * part of the configuration.  This can happen
375 			 * if we process a device arrival notification
376 			 * before seeing the ZFS configuration change
377 			 * event.
378 			 */
379 			syslog(LOG_INFO,
380 			       "CaseFile::ReEvaluate(%s,%s) Pool/Vdev "
381 			       "unconfigured.  Closing\n",
382 			       PoolGUIDString().c_str(),
383 			       VdevGUIDString().c_str());
384 			/*
385 			 * Close the case now so we won't waste cycles in the
386 			 * system rescan
387 			 */
388 			Close();
389 
390 			/*
391 			 * Since this event was not used to close this
392 			 * case, do not report it as consumed.
393 			 */
394 			return (/*consumed*/false);
395 		}
396 
397 		/*
398 		 * Discard any tentative I/O error events for
399 		 * this case.  They were most likely caused by the
400 		 * hot-unplug of this device.
401 		 */
402 		PurgeTentativeEvents();
403 
404 		/* Try to activate spares if they are available */
405 		spare_activated = ActivateSpare();
406 
407 		/*
408 		 * Rescan the drives in the system to see if a recent
409 		 * drive arrival can be used to solve this case.
410 		 */
411 		ZfsDaemon::RequestSystemRescan();
412 
413 		/*
414 		 * Consume the event if we successfully activated a spare.
415 		 * Otherwise, leave it in the unconsumed events list so that the
416 		 * future addition of a spare to this pool might be able to
417 		 * close the case
418 		 */
419 		consumed = spare_activated;
420 	} else if (event.Value("class") == "resource.fs.zfs.statechange") {
421 		RefreshVdevState();
422 		/*
423 		 * If this vdev is DEGRADED, FAULTED, or UNAVAIL, try to
424 		 * activate a hotspare.  Otherwise, ignore the event
425 		 */
426 		if (VdevState() == VDEV_STATE_FAULTED ||
427 		    VdevState() == VDEV_STATE_DEGRADED ||
428 		    VdevState() == VDEV_STATE_CANT_OPEN)
429 			(void) ActivateSpare();
430 		consumed = true;
431 	}
432 	else if (event.Value("class") == "ereport.fs.zfs.io" ||
433 	         event.Value("class") == "ereport.fs.zfs.checksum") {
434 
435 		m_tentativeEvents.push_front(event.DeepCopy());
436 		RegisterCallout(event);
437 		consumed = true;
438 	}
439 
440 	bool closed(CloseIfSolved());
441 
442 	return (consumed || closed);
443 }
444 
445 
446 bool
447 CaseFile::ActivateSpare() {
448 	nvlist_t	*config, *nvroot;
449 	nvlist_t       **spares;
450 	char		*devPath, *vdev_type;
451 	const char	*poolname;
452 	u_int		 nspares, i;
453 	int		 error;
454 
455 	ZpoolList zpl(ZpoolList::ZpoolByGUID, &m_poolGUID);
456 	zpool_handle_t	*zhp(zpl.empty() ? NULL : zpl.front());
457 	if (zhp == NULL) {
458 		syslog(LOG_ERR, "CaseFile::ActivateSpare: Could not find pool "
459 		       "for pool_guid %" PRIu64".", (uint64_t)m_poolGUID);
460 		return (false);
461 	}
462 	poolname = zpool_get_name(zhp);
463 	config = zpool_get_config(zhp, NULL);
464 	if (config == NULL) {
465 		syslog(LOG_ERR, "CaseFile::ActivateSpare: Could not find pool "
466 		       "config for pool %s", poolname);
467 		return (false);
468 	}
469 	error = nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot);
470 	if (error != 0){
471 		syslog(LOG_ERR, "CaseFile::ActivateSpare: Could not find vdev "
472 		       "tree for pool %s", poolname);
473 		return (false);
474 	}
475 	nspares = 0;
476 	nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares,
477 				   &nspares);
478 	if (nspares == 0) {
479 		/* The pool has no spares configured */
480 		syslog(LOG_INFO, "CaseFile::ActivateSpare: "
481 		       "No spares available for pool %s", poolname);
482 		return (false);
483 	}
484 	for (i = 0; i < nspares; i++) {
485 		uint64_t    *nvlist_array;
486 		vdev_stat_t *vs;
487 		uint_t	     nstats;
488 
489 		if (nvlist_lookup_uint64_array(spares[i],
490 		    ZPOOL_CONFIG_VDEV_STATS, &nvlist_array, &nstats) != 0) {
491 			syslog(LOG_ERR, "CaseFile::ActivateSpare: Could not "
492 			       "find vdev stats for pool %s, spare %d",
493 			       poolname, i);
494 			return (false);
495 		}
496 		vs = reinterpret_cast<vdev_stat_t *>(nvlist_array);
497 
498 		if ((vs->vs_aux != VDEV_AUX_SPARED)
499 		 && (vs->vs_state == VDEV_STATE_HEALTHY)) {
500 			/* We found a usable spare */
501 			break;
502 		}
503 	}
504 
505 	if (i == nspares) {
506 		/* No available spares were found */
507 		return (false);
508 	}
509 
510 	error = nvlist_lookup_string(spares[i], ZPOOL_CONFIG_PATH, &devPath);
511 	if (error != 0) {
512 		syslog(LOG_ERR, "CaseFile::ActivateSpare: Cannot determine "
513 		       "the path of pool %s, spare %d. Error %d",
514 		       poolname, i, error);
515 		return (false);
516 	}
517 
518 	error = nvlist_lookup_string(spares[i], ZPOOL_CONFIG_TYPE, &vdev_type);
519 	if (error != 0) {
520 		syslog(LOG_ERR, "CaseFile::ActivateSpare: Cannot determine "
521 		       "the vdev type of pool %s, spare %d. Error %d",
522 		       poolname, i, error);
523 		return (false);
524 	}
525 
526 	return (Replace(vdev_type, devPath, /*isspare*/true));
527 }
528 
529 void
530 CaseFile::RegisterCallout(const Event &event)
531 {
532 	timeval now, countdown, elapsed, timestamp, zero, remaining;
533 
534 	gettimeofday(&now, 0);
535 	timestamp = event.GetTimestamp();
536 	timersub(&now, &timestamp, &elapsed);
537 	timersub(&s_removeGracePeriod, &elapsed, &countdown);
538 	/*
539 	 * If countdown is <= zero, Reset the timer to the
540 	 * smallest positive time value instead
541 	 */
542 	timerclear(&zero);
543 	if (timercmp(&countdown, &zero, <=)) {
544 		timerclear(&countdown);
545 		countdown.tv_usec = 1;
546 	}
547 
548 	remaining = m_tentativeTimer.TimeRemaining();
549 
550 	if (!m_tentativeTimer.IsPending()
551 	 || timercmp(&countdown, &remaining, <))
552 		m_tentativeTimer.Reset(countdown, OnGracePeriodEnded, this);
553 }
554 
555 
556 bool
557 CaseFile::CloseIfSolved()
558 {
559 	if (m_events.empty()
560 	 && m_tentativeEvents.empty()) {
561 
562 		/*
563 		 * We currently do not track or take actions on
564 		 * devices in the degraded or faulted state.
565 		 * Once we have support for spare pools, we'll
566 		 * retain these cases so that any spares added in
567 		 * the future can be applied to them.
568 		 */
569 		switch (VdevState()) {
570 		case VDEV_STATE_HEALTHY:
571 			/* No need to keep cases for healthy vdevs */
572 			Close();
573 			return (true);
574 		case VDEV_STATE_REMOVED:
575 		case VDEV_STATE_CANT_OPEN:
576 			/*
577 			 * Keep open.  We may solve it with a newly inserted
578 			 * device.
579 			 */
580 		case VDEV_STATE_FAULTED:
581 		case VDEV_STATE_DEGRADED:
582 			/*
583 			 * Keep open.  We may solve it with the future
584 			 * addition of a spare to the pool
585 			 */
586 		case VDEV_STATE_UNKNOWN:
587 		case VDEV_STATE_CLOSED:
588 		case VDEV_STATE_OFFLINE:
589 			/*
590 			 * Keep open?  This may not be the correct behavior,
591 			 * but it's what we've always done
592 			 */
593 			;
594 		}
595 
596 		/*
597 		 * Re-serialize the case in order to remove any
598 		 * previous event data.
599 		 */
600 		Serialize();
601 	}
602 
603 	return (false);
604 }
605 
606 void
607 CaseFile::Log()
608 {
609 	syslog(LOG_INFO, "CaseFile(%s,%s,%s)\n", PoolGUIDString().c_str(),
610 	       VdevGUIDString().c_str(), PhysicalPath().c_str());
611 	syslog(LOG_INFO, "\tVdev State = %s\n",
612 	       zpool_state_to_name(VdevState(), VDEV_AUX_NONE));
613 	if (m_tentativeEvents.size() != 0) {
614 		syslog(LOG_INFO, "\t=== Tentative Events ===\n");
615 		for (EventList::iterator event(m_tentativeEvents.begin());
616 		     event != m_tentativeEvents.end(); event++)
617 			(*event)->Log(LOG_INFO);
618 	}
619 	if (m_events.size() != 0) {
620 		syslog(LOG_INFO, "\t=== Events ===\n");
621 		for (EventList::iterator event(m_events.begin());
622 		     event != m_events.end(); event++)
623 			(*event)->Log(LOG_INFO);
624 	}
625 }
626 
627 //- CaseFile Static Protected Methods ------------------------------------------
628 void
629 CaseFile::OnGracePeriodEnded(void *arg)
630 {
631 	CaseFile &casefile(*static_cast<CaseFile *>(arg));
632 
633 	casefile.OnGracePeriodEnded();
634 }
635 
636 int
637 CaseFile::DeSerializeSelector(const struct dirent *dirEntry)
638 {
639 	uint64_t poolGUID;
640 	uint64_t vdevGUID;
641 
642 	if (dirEntry->d_type == DT_REG
643 	 && sscanf(dirEntry->d_name, "pool_%" PRIu64 "_vdev_%" PRIu64 ".case",
644 		   &poolGUID, &vdevGUID) == 2)
645 		return (1);
646 	return (0);
647 }
648 
649 void
650 CaseFile::DeSerializeFile(const char *fileName)
651 {
652 	string	  fullName(s_caseFilePath + '/' + fileName);
653 	CaseFile *existingCaseFile(NULL);
654 	CaseFile *caseFile(NULL);
655 
656 	try {
657 		uint64_t poolGUID;
658 		uint64_t vdevGUID;
659 		nvlist_t *vdevConf;
660 
661 		if (sscanf(fileName, "pool_%" PRIu64 "_vdev_%" PRIu64 ".case",
662 		       &poolGUID, &vdevGUID) != 2) {
663 			throw ZfsdException("CaseFile::DeSerialize: "
664 			    "Unintelligible CaseFile filename %s.\n", fileName);
665 		}
666 		existingCaseFile = Find(Guid(poolGUID), Guid(vdevGUID));
667 		if (existingCaseFile != NULL) {
668 			/*
669 			 * If the vdev is already degraded or faulted,
670 			 * there's no point in keeping the state around
671 			 * that we use to put a drive into the degraded
672 			 * state.  However, if the vdev is simply missing,
673 			 * preserve the case data in the hopes that it will
674 			 * return.
675 			 */
676 			caseFile = existingCaseFile;
677 			vdev_state curState(caseFile->VdevState());
678 			if (curState > VDEV_STATE_CANT_OPEN
679 			 && curState < VDEV_STATE_HEALTHY) {
680 				unlink(fileName);
681 				return;
682 			}
683 		} else {
684 			ZpoolList zpl(ZpoolList::ZpoolByGUID, &poolGUID);
685 			if (zpl.empty()
686 			 || (vdevConf = VdevIterator(zpl.front())
687 						    .Find(vdevGUID)) == NULL) {
688 				/*
689 				 * Either the pool no longer exists
690 				 * or this vdev is no longer a member of
691 				 * the pool.
692 				 */
693 				unlink(fullName.c_str());
694 				return;
695 			}
696 
697 			/*
698 			 * Any vdev we find that does not have a case file
699 			 * must be in the healthy state and thus worthy of
700 			 * continued SERD data tracking.
701 			 */
702 			caseFile = new CaseFile(Vdev(zpl.front(), vdevConf));
703 		}
704 
705 		ifstream caseStream(fullName.c_str());
706 		if (!caseStream)
707 			throw ZfsdException("CaseFile::DeSerialize: Unable to "
708 					    "read %s.\n", fileName);
709 
710 		caseFile->DeSerialize(caseStream);
711 	} catch (const ParseException &exp) {
712 
713 		exp.Log();
714 		if (caseFile != existingCaseFile)
715 			delete caseFile;
716 
717 		/*
718 		 * Since we can't parse the file, unlink it so we don't
719 		 * trip over it again.
720 		 */
721 		unlink(fileName);
722 	} catch (const ZfsdException &zfsException) {
723 
724 		zfsException.Log();
725 		if (caseFile != existingCaseFile)
726 			delete caseFile;
727 	}
728 }
729 
730 //- CaseFile Protected Methods -------------------------------------------------
731 CaseFile::CaseFile(const Vdev &vdev)
732  : m_poolGUID(vdev.PoolGUID()),
733    m_vdevGUID(vdev.GUID()),
734    m_vdevState(vdev.State()),
735    m_vdevPhysPath(vdev.PhysicalPath())
736 {
737 	stringstream guidString;
738 
739 	guidString << m_vdevGUID;
740 	m_vdevGUIDString = guidString.str();
741 	guidString.str("");
742 	guidString << m_poolGUID;
743 	m_poolGUIDString = guidString.str();
744 
745 	s_activeCases.push_back(this);
746 
747 	syslog(LOG_INFO, "Creating new CaseFile:\n");
748 	Log();
749 }
750 
751 CaseFile::~CaseFile()
752 {
753 	PurgeEvents();
754 	PurgeTentativeEvents();
755 	m_tentativeTimer.Stop();
756 	s_activeCases.remove(this);
757 }
758 
759 void
760 CaseFile::PurgeEvents()
761 {
762 	for (EventList::iterator event(m_events.begin());
763 	     event != m_events.end(); event++)
764 		delete *event;
765 
766 	m_events.clear();
767 }
768 
769 void
770 CaseFile::PurgeTentativeEvents()
771 {
772 	for (EventList::iterator event(m_tentativeEvents.begin());
773 	     event != m_tentativeEvents.end(); event++)
774 		delete *event;
775 
776 	m_tentativeEvents.clear();
777 }
778 
779 void
780 CaseFile::SerializeEvList(const EventList events, int fd,
781 		const char* prefix) const
782 {
783 	if (events.empty())
784 		return;
785 	for (EventList::const_iterator curEvent = events.begin();
786 	     curEvent != events.end(); curEvent++) {
787 		const string &eventString((*curEvent)->GetEventString());
788 
789 		// TODO: replace many write(2) calls with a single writev(2)
790 		if (prefix)
791 			write(fd, prefix, strlen(prefix));
792 		write(fd, eventString.c_str(), eventString.length());
793 	}
794 }
795 
796 void
797 CaseFile::Serialize()
798 {
799 	stringstream saveFile;
800 
801 	saveFile << setfill('0')
802 		 << s_caseFilePath << "/"
803 		 << "pool_" << PoolGUIDString()
804 		 << "_vdev_" << VdevGUIDString()
805 		 << ".case";
806 
807 	if (m_events.empty() && m_tentativeEvents.empty()) {
808 		unlink(saveFile.str().c_str());
809 		return;
810 	}
811 
812 	int fd(open(saveFile.str().c_str(), O_CREAT|O_TRUNC|O_WRONLY, 0644));
813 	if (fd == -1) {
814 		syslog(LOG_ERR, "CaseFile::Serialize: Unable to open %s.\n",
815 		       saveFile.str().c_str());
816 		return;
817 	}
818 	SerializeEvList(m_events, fd);
819 	SerializeEvList(m_tentativeEvents, fd, "tentative ");
820 	close(fd);
821 }
822 
823 /*
824  * XXX: This method assumes that events may not contain embedded newlines.  If
825  * ever events can contain embedded newlines, then CaseFile must switch
826  * serialization formats
827  */
828 void
829 CaseFile::DeSerialize(ifstream &caseStream)
830 {
831 	string	      evString;
832 	const EventFactory &factory(ZfsDaemon::Get().GetFactory());
833 
834 	caseStream >> std::noskipws >> std::ws;
835 	while (caseStream.good()) {
836 		/*
837 		 * Outline:
838 		 * read the beginning of a line and check it for
839 		 * "tentative".  If found, discard "tentative".
840 		 * Create a new event
841 		 * continue
842 		 */
843 		EventList* destEvents;
844 		const string tentFlag("tentative ");
845 		string line;
846 		std::stringbuf lineBuf;
847 
848 		caseStream.get(lineBuf);
849 		caseStream.ignore();  /*discard the newline character*/
850 		line = lineBuf.str();
851 		if (line.compare(0, tentFlag.size(), tentFlag) == 0) {
852 			/* Discard "tentative" */
853 			line.erase(0, tentFlag.size());
854 			destEvents = &m_tentativeEvents;
855 		} else {
856 			destEvents = &m_events;
857 		}
858 		Event *event(Event::CreateEvent(factory, line));
859 		if (event != NULL) {
860 			destEvents->push_back(event);
861 			RegisterCallout(*event);
862 		}
863 	}
864 }
865 
866 void
867 CaseFile::Close()
868 {
869 	/*
870 	 * This case is no longer relevant.  Clean up our
871 	 * serialization file, and delete the case.
872 	 */
873 	syslog(LOG_INFO, "CaseFile(%s,%s) closed - State %s\n",
874 	       PoolGUIDString().c_str(), VdevGUIDString().c_str(),
875 	       zpool_state_to_name(VdevState(), VDEV_AUX_NONE));
876 
877 	/*
878 	 * Serialization of a Case with no event data, clears the
879 	 * Serialization data for that event.
880 	 */
881 	PurgeEvents();
882 	Serialize();
883 
884 	delete this;
885 }
886 
887 void
888 CaseFile::OnGracePeriodEnded()
889 {
890 	bool should_fault, should_degrade;
891 	ZpoolList zpl(ZpoolList::ZpoolByGUID, &m_poolGUID);
892 	zpool_handle_t *zhp(zpl.empty() ? NULL : zpl.front());
893 
894 	m_events.splice(m_events.begin(), m_tentativeEvents);
895 	should_fault = ShouldFault();
896 	should_degrade = ShouldDegrade();
897 
898 	if (should_fault || should_degrade) {
899 		if (zhp == NULL
900 		 || (VdevIterator(zhp).Find(m_vdevGUID)) == NULL) {
901 			/*
902 			 * Either the pool no longer exists
903 			 * or this vdev is no longer a member of
904 			 * the pool.
905 			 */
906 			Close();
907 			return;
908 		}
909 
910 	}
911 
912 	/* A fault condition has priority over a degrade condition */
913 	if (ShouldFault()) {
914 		/* Fault the vdev and close the case. */
915 		if (zpool_vdev_fault(zhp, (uint64_t)m_vdevGUID,
916 				       VDEV_AUX_ERR_EXCEEDED) == 0) {
917 			syslog(LOG_INFO, "Faulting vdev(%s/%s)",
918 			       PoolGUIDString().c_str(),
919 			       VdevGUIDString().c_str());
920 			Close();
921 			return;
922 		}
923 		else {
924 			syslog(LOG_ERR, "Fault vdev(%s/%s): %s: %s\n",
925 			       PoolGUIDString().c_str(),
926 			       VdevGUIDString().c_str(),
927 			       libzfs_error_action(g_zfsHandle),
928 			       libzfs_error_description(g_zfsHandle));
929 		}
930 	}
931 	else if (ShouldDegrade()) {
932 		/* Degrade the vdev and close the case. */
933 		if (zpool_vdev_degrade(zhp, (uint64_t)m_vdevGUID,
934 				       VDEV_AUX_ERR_EXCEEDED) == 0) {
935 			syslog(LOG_INFO, "Degrading vdev(%s/%s)",
936 			       PoolGUIDString().c_str(),
937 			       VdevGUIDString().c_str());
938 			Close();
939 			return;
940 		}
941 		else {
942 			syslog(LOG_ERR, "Degrade vdev(%s/%s): %s: %s\n",
943 			       PoolGUIDString().c_str(),
944 			       VdevGUIDString().c_str(),
945 			       libzfs_error_action(g_zfsHandle),
946 			       libzfs_error_description(g_zfsHandle));
947 		}
948 	}
949 	Serialize();
950 }
951 
952 Vdev
953 CaseFile::BeingReplacedBy(zpool_handle_t *zhp) {
954 	Vdev vd(zhp, CaseVdev(zhp));
955 	std::list<Vdev> children;
956 	std::list<Vdev>::iterator children_it;
957 
958 	Vdev parent(vd.Parent());
959 	Vdev replacing(NonexistentVdev);
960 
961 	/*
962 	 * To determine whether we are being replaced by another spare that
963 	 * is still working, then make sure that it is currently spared and
964 	 * that the spare is either resilvering or healthy.  If any of these
965 	 * conditions fail, then we are not being replaced by a spare.
966 	 *
967 	 * If the spare is healthy, then the case file should be closed very
968 	 * soon after this check.
969 	 */
970 	if (parent.DoesNotExist()
971 	 || parent.Name(zhp, /*verbose*/false) != "spare")
972 		return (NonexistentVdev);
973 
974 	children = parent.Children();
975 	children_it = children.begin();
976 	for (;children_it != children.end(); children_it++) {
977 		Vdev child = *children_it;
978 
979 		/* Skip our vdev. */
980 		if (child.GUID() == VdevGUID())
981 			continue;
982 		/*
983 		 * Accept the first child that doesn't match our GUID, or
984 		 * any resilvering/healthy device if one exists.
985 		 */
986 		if (replacing.DoesNotExist() || child.IsResilvering()
987 		 || child.State() == VDEV_STATE_HEALTHY)
988 			replacing = child;
989 	}
990 
991 	return (replacing);
992 }
993 
994 bool
995 CaseFile::Replace(const char* vdev_type, const char* path, bool isspare) {
996 	nvlist_t *nvroot, *newvd;
997 	const char *poolname;
998 	string oldstr(VdevGUIDString());
999 	bool retval = true;
1000 
1001 	/* Figure out what pool we're working on */
1002 	ZpoolList zpl(ZpoolList::ZpoolByGUID, &m_poolGUID);
1003 	zpool_handle_t *zhp(zpl.empty() ? NULL : zpl.front());
1004 	if (zhp == NULL) {
1005 		syslog(LOG_ERR, "CaseFile::Replace: could not find pool for "
1006 		       "pool_guid %" PRIu64 ".", (uint64_t)m_poolGUID);
1007 		return (false);
1008 	}
1009 	poolname = zpool_get_name(zhp);
1010 	Vdev vd(zhp, CaseVdev(zhp));
1011 	Vdev replaced(BeingReplacedBy(zhp));
1012 
1013 	if (isspare && !vd.IsSpare() && !replaced.DoesNotExist()) {
1014 		/* If we are already being replaced by a working spare, pass. */
1015 		if (replaced.IsResilvering()
1016 		 || replaced.State() == VDEV_STATE_HEALTHY) {
1017 			syslog(LOG_INFO, "CaseFile::Replace(%s->%s): already "
1018 			    "replaced", VdevGUIDString().c_str(), path);
1019 			return (/*consumed*/false);
1020 		}
1021 		/*
1022 		 * If we have already been replaced by a spare, but that spare
1023 		 * is broken, we must spare the spare, not the original device.
1024 		 */
1025 		oldstr = replaced.GUIDString();
1026 		syslog(LOG_INFO, "CaseFile::Replace(%s->%s): sparing "
1027 		    "broken spare %s instead", VdevGUIDString().c_str(),
1028 		    path, oldstr.c_str());
1029 	}
1030 
1031 	/*
1032 	 * Build a root vdev/leaf vdev configuration suitable for
1033 	 * zpool_vdev_attach. Only enough data for the kernel to find
1034 	 * the device (i.e. type and disk device node path) are needed.
1035 	 */
1036 	nvroot = NULL;
1037 	newvd = NULL;
1038 
1039 	if (nvlist_alloc(&nvroot, NV_UNIQUE_NAME, 0) != 0
1040 	 || nvlist_alloc(&newvd, NV_UNIQUE_NAME, 0) != 0) {
1041 		syslog(LOG_ERR, "Replace vdev(%s/%s): Unable to allocate "
1042 		    "configuration data.", poolname, oldstr.c_str());
1043 		if (nvroot != NULL)
1044 			nvlist_free(nvroot);
1045 		return (false);
1046 	}
1047 	if (nvlist_add_string(newvd, ZPOOL_CONFIG_TYPE, vdev_type) != 0
1048 	 || nvlist_add_string(newvd, ZPOOL_CONFIG_PATH, path) != 0
1049 	 || nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT) != 0
1050 	 || nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
1051 				    &newvd, 1) != 0) {
1052 		syslog(LOG_ERR, "Replace vdev(%s/%s): Unable to initialize "
1053 		    "configuration data.", poolname, oldstr.c_str());
1054 		nvlist_free(newvd);
1055 		nvlist_free(nvroot);
1056 		return (true);
1057 	}
1058 
1059 	/* Data was copied when added to the root vdev. */
1060 	nvlist_free(newvd);
1061 
1062 	retval = (zpool_vdev_attach(zhp, oldstr.c_str(), path, nvroot,
1063 	    /*replace*/B_TRUE) == 0);
1064 	if (retval)
1065 		syslog(LOG_INFO, "Replacing vdev(%s/%s) with %s\n",
1066 		    poolname, oldstr.c_str(), path);
1067 	else
1068 		syslog(LOG_ERR, "Replace vdev(%s/%s): %s: %s\n",
1069 		    poolname, oldstr.c_str(), libzfs_error_action(g_zfsHandle),
1070 		    libzfs_error_description(g_zfsHandle));
1071 	nvlist_free(nvroot);
1072 
1073 	return (retval);
1074 }
1075 
1076 /* Does the argument event refer to a checksum error? */
1077 static bool
1078 IsChecksumEvent(const Event* const event)
1079 {
1080 	return ("ereport.fs.zfs.checksum" == event->Value("type"));
1081 }
1082 
1083 /* Does the argument event refer to an IO error? */
1084 static bool
1085 IsIOEvent(const Event* const event)
1086 {
1087 	return ("ereport.fs.zfs.io" == event->Value("type"));
1088 }
1089 
1090 bool
1091 CaseFile::ShouldDegrade() const
1092 {
1093 	return (std::count_if(m_events.begin(), m_events.end(),
1094 			      IsChecksumEvent) > ZFS_DEGRADE_IO_COUNT);
1095 }
1096 
1097 bool
1098 CaseFile::ShouldFault() const
1099 {
1100 	return (std::count_if(m_events.begin(), m_events.end(),
1101 			      IsIOEvent) > ZFS_DEGRADE_IO_COUNT);
1102 }
1103 
1104 nvlist_t *
1105 CaseFile::CaseVdev(zpool_handle_t *zhp) const
1106 {
1107 	return (VdevIterator(zhp).Find(VdevGUID()));
1108 }
1109