xref: /freebsd/cddl/usr.sbin/zfsd/case_file.cc (revision 13ec1e3155c7e9bf037b12af186351b7fa9b9450)
1 /*-
2  * Copyright (c) 2011, 2012, 2013, 2014, 2016 Spectra Logic Corporation
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions, and the following disclaimer,
10  *    without modification.
11  * 2. Redistributions in binary form must reproduce at minimum a disclaimer
12  *    substantially similar to the "NO WARRANTY" disclaimer below
13  *    ("Disclaimer") and any redistribution must be conditioned upon
14  *    including a substantially similar Disclaimer requirement for further
15  *    binary redistribution.
16  *
17  * NO WARRANTY
18  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR
21  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22  * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
26  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
27  * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
28  * POSSIBILITY OF SUCH DAMAGES.
29  *
30  * Authors: Justin T. Gibbs     (Spectra Logic Corporation)
31  */
32 
33 /**
34  * \file case_file.cc
35  *
36  * We keep case files for any leaf vdev that is not in the optimal state.
37  * However, we only serialize to disk those events that need to be preserved
38  * across reboots.  For now, this is just a log of soft errors which we
39  * accumulate in order to mark a device as degraded.
40  */
41 #include <sys/cdefs.h>
42 #include <sys/byteorder.h>
43 #include <sys/time.h>
44 
45 #include <sys/fs/zfs.h>
46 
47 #include <dirent.h>
48 #include <fcntl.h>
49 #include <iomanip>
50 #include <fstream>
51 #include <functional>
52 #include <sstream>
53 #include <syslog.h>
54 #include <unistd.h>
55 
56 #include <libzfs.h>
57 
58 #include <list>
59 #include <map>
60 #include <string>
61 
62 #include <devdctl/guid.h>
63 #include <devdctl/event.h>
64 #include <devdctl/event_factory.h>
65 #include <devdctl/exception.h>
66 #include <devdctl/consumer.h>
67 
68 #include "callout.h"
69 #include "vdev_iterator.h"
70 #include "zfsd_event.h"
71 #include "case_file.h"
72 #include "vdev.h"
73 #include "zfsd.h"
74 #include "zfsd_exception.h"
75 #include "zpool_list.h"
76 
77 __FBSDID("$FreeBSD$");
78 
79 /*============================ Namespace Control =============================*/
80 using std::hex;
81 using std::ifstream;
82 using std::stringstream;
83 using std::setfill;
84 using std::setw;
85 
86 using DevdCtl::Event;
87 using DevdCtl::EventFactory;
88 using DevdCtl::EventList;
89 using DevdCtl::Guid;
90 using DevdCtl::ParseException;
91 
92 /*--------------------------------- CaseFile ---------------------------------*/
93 //- CaseFile Static Data -------------------------------------------------------
94 
/* Every open case.  The constructor appends to it; the destructor removes. */
CaseFileList  CaseFile::s_activeCases;
/* Directory holding the serialized "pool_<guid>_vdev_<guid>.case" files. */
const string  CaseFile::s_caseFilePath = "/var/db/zfsd/cases";
/*
 * Interval from which RegisterCallout() subtracts an event's age to
 * compute the tentative timer's countdown.
 */
const timeval CaseFile::s_removeGracePeriod = { 60 /*sec*/, 0 /*usec*/};
98 
99 //- CaseFile Static Public Methods ---------------------------------------------
100 CaseFile *
101 CaseFile::Find(Guid poolGUID, Guid vdevGUID)
102 {
103 	for (CaseFileList::iterator curCase = s_activeCases.begin();
104 	     curCase != s_activeCases.end(); curCase++) {
105 
106 		if (((*curCase)->PoolGUID() != poolGUID
107 		  && Guid::InvalidGuid() != poolGUID)
108 		 || (*curCase)->VdevGUID() != vdevGUID)
109 			continue;
110 
111 		/*
112 		 * We only carry one active case per-vdev.
113 		 */
114 		return (*curCase);
115 	}
116 	return (NULL);
117 }
118 
119 CaseFile *
120 CaseFile::Find(const string &physPath)
121 {
122 	CaseFile *result = NULL;
123 
124 	for (CaseFileList::iterator curCase = s_activeCases.begin();
125 	     curCase != s_activeCases.end(); curCase++) {
126 
127 		if ((*curCase)->PhysicalPath() != physPath)
128 			continue;
129 
130 		if (result != NULL) {
131 			syslog(LOG_WARNING, "Multiple casefiles found for "
132 			    "physical path %s.  "
133 			    "This is most likely a bug in zfsd",
134 			    physPath.c_str());
135 		}
136 		result = *curCase;
137 	}
138 	return (result);
139 }
140 
141 
142 void
143 CaseFile::ReEvaluateByGuid(Guid poolGUID, const ZfsEvent &event)
144 {
145 	CaseFileList::iterator casefile;
146 	for (casefile = s_activeCases.begin(); casefile != s_activeCases.end();){
147 		CaseFileList::iterator next = casefile;
148 		next++;
149 		if (poolGUID == (*casefile)->PoolGUID())
150 			(*casefile)->ReEvaluate(event);
151 		casefile = next;
152 	}
153 }
154 
155 CaseFile &
156 CaseFile::Create(Vdev &vdev)
157 {
158 	CaseFile *activeCase;
159 
160 	activeCase = Find(vdev.PoolGUID(), vdev.GUID());
161 	if (activeCase == NULL)
162 		activeCase = new CaseFile(vdev);
163 
164 	return (*activeCase);
165 }
166 
167 void
168 CaseFile::DeSerialize()
169 {
170 	struct dirent **caseFiles;
171 
172 	int numCaseFiles(scandir(s_caseFilePath.c_str(), &caseFiles,
173 			 DeSerializeSelector, /*compar*/NULL));
174 
175 	if (numCaseFiles == -1)
176 		return;
177 	if (numCaseFiles == 0) {
178 		free(caseFiles);
179 		return;
180 	}
181 
182 	for (int i = 0; i < numCaseFiles; i++) {
183 
184 		DeSerializeFile(caseFiles[i]->d_name);
185 		free(caseFiles[i]);
186 	}
187 	free(caseFiles);
188 }
189 
190 bool
191 CaseFile::Empty()
192 {
193 	return (s_activeCases.empty());
194 }
195 
196 void
197 CaseFile::LogAll()
198 {
199 	for (CaseFileList::iterator curCase = s_activeCases.begin();
200 	     curCase != s_activeCases.end(); curCase++)
201 		(*curCase)->Log();
202 }
203 
204 void
205 CaseFile::PurgeAll()
206 {
207 	/*
208 	 * Serialize casefiles before deleting them so that they can be reread
209 	 * and revalidated during BuildCaseFiles.
210 	 * CaseFiles remove themselves from this list on destruction.
211 	 */
212 	while (s_activeCases.size() != 0) {
213 		CaseFile *casefile = s_activeCases.front();
214 		casefile->Serialize();
215 		delete casefile;
216 	}
217 
218 }
219 
220 //- CaseFile Public Methods ----------------------------------------------------
221 bool
222 CaseFile::RefreshVdevState()
223 {
224 	ZpoolList zpl(ZpoolList::ZpoolByGUID, &m_poolGUID);
225 	zpool_handle_t *casePool(zpl.empty() ? NULL : zpl.front());
226 	if (casePool == NULL)
227 		return (false);
228 
229 	Vdev vd(casePool, CaseVdev(casePool));
230 	if (vd.DoesNotExist())
231 		return (false);
232 
233 	m_vdevState    = vd.State();
234 	m_vdevPhysPath = vd.PhysicalPath();
235 	return (true);
236 }
237 
/**
 * Re-evaluate this case against a newly discovered device.
 *
 * The device is onlined if its vdev GUID matches this case's vdev.
 * Otherwise, if the pool's autoreplace property is set and physPath equals
 * this case's physical path, the device is labeled and used to replace the
 * case's vdev.
 *
 * \param devPath   Device path of the new device; passed to
 *                  zpool_label_disk() and Replace() and used in log
 *                  messages.
 * \param physPath  Physical path of the new device, compared against
 *                  PhysicalPath().
 * \param vdev      Vdev information for the new device, or NULL.
 *                  NOTE(review): NULL presumably means the device carried
 *                  no usable ZFS label - confirm against the callers.
 *
 * \return  true if the device was consumed: it was onlined, or Replace()
 *          reported success.  false otherwise.
 *
 * \note  This method may call Close(), which deletes this object.
 */
bool
CaseFile::ReEvaluate(const string &devPath, const string &physPath, Vdev *vdev)
{
	ZpoolList zpl(ZpoolList::ZpoolByGUID, &m_poolGUID);
	zpool_handle_t *pool(zpl.empty() ? NULL : zpl.front());

	if (pool == NULL || !RefreshVdevState()) {
		/*
		 * The pool or vdev for this case file is no longer
		 * part of the configuration.  This can happen
		 * if we process a device arrival notification
		 * before seeing the ZFS configuration change
		 * event.
		 */
		syslog(LOG_INFO,
		       "CaseFile::ReEvaluate(%s,%s) Pool/Vdev unconfigured.  "
		       "Closing\n",
		       PoolGUIDString().c_str(),
		       VdevGUIDString().c_str());
		/* Close() deletes this object; touch no members afterwards. */
		Close();

		/*
		 * Since this event was not used to close this
		 * case, do not report it as consumed.
		 */
		return (/*consumed*/false);
	}

	if (VdevState() > VDEV_STATE_CANT_OPEN) {
		/*
		 * For now, newly discovered devices only help for
		 * devices that are missing.  In the future, we might
		 * use a newly inserted spare to replace a degraded
		 * or faulted device.
		 */
		syslog(LOG_INFO, "CaseFile::ReEvaluate(%s,%s): Pool/Vdev ignored",
		    PoolGUIDString().c_str(), VdevGUIDString().c_str());
		return (/*consumed*/false);
	}

	/*
	 * The arriving device is this case's own vdev (an invalid pool GUID
	 * on the device is treated as a wildcard): bring it back online.
	 */
	if (vdev != NULL
	 && ( vdev->PoolGUID() == m_poolGUID
	   || vdev->PoolGUID() == Guid::InvalidGuid())
	 && vdev->GUID() == m_vdevGUID) {

		/* The resulting vdev state is stored into m_vdevState. */
		if (zpool_vdev_online(pool, vdev->GUIDString().c_str(),
		    ZFS_ONLINE_CHECKREMOVE | ZFS_ONLINE_UNSPARE,
		    &m_vdevState) != 0) {
			syslog(LOG_ERR,
			    "Failed to online vdev(%s/%s:%s): %s: %s\n",
			    zpool_get_name(pool), vdev->GUIDString().c_str(),
			    devPath.c_str(), libzfs_error_action(g_zfsHandle),
			    libzfs_error_description(g_zfsHandle));
			return (/*consumed*/false);
		}

		syslog(LOG_INFO, "Onlined vdev(%s/%s:%s).  State now %s.\n",
		       zpool_get_name(pool), vdev->GUIDString().c_str(),
		       devPath.c_str(),
		       zpool_state_to_name(VdevState(), VDEV_AUX_NONE));

		/*
		 * Check the vdev state post the online action to see
		 * if we can retire this case.
		 */
		CloseIfSolved();

		return (/*consumed*/true);
	}

	/*
	 * If the auto-replace policy is enabled, and we have physical
	 * path information, try a physical path replacement.
	 */
	if (zpool_get_prop_int(pool, ZPOOL_PROP_AUTOREPLACE, NULL) == 0) {
		syslog(LOG_INFO,
		       "CaseFile(%s:%s:%s): AutoReplace not set.  "
		       "Ignoring device insertion.\n",
		       PoolGUIDString().c_str(),
		       VdevGUIDString().c_str(),
		       zpool_state_to_name(VdevState(), VDEV_AUX_NONE));
		return (/*consumed*/false);
	}

	if (PhysicalPath().empty()) {
		syslog(LOG_INFO,
		       "CaseFile(%s:%s:%s): No physical path information.  "
		       "Ignoring device insertion.\n",
		       PoolGUIDString().c_str(),
		       VdevGUIDString().c_str(),
		       zpool_state_to_name(VdevState(), VDEV_AUX_NONE));
		return (/*consumed*/false);
	}

	if (physPath != PhysicalPath()) {
		syslog(LOG_INFO,
		       "CaseFile(%s:%s:%s): Physical path mismatch.  "
		       "Ignoring device insertion.\n",
		       PoolGUIDString().c_str(),
		       VdevGUIDString().c_str(),
		       zpool_state_to_name(VdevState(), VDEV_AUX_NONE));
		return (/*consumed*/false);
	}

	/* Write a label on the newly inserted disk. */
	if (zpool_label_disk(g_zfsHandle, pool, devPath.c_str()) != 0) {
		syslog(LOG_ERR,
		       "Replace vdev(%s/%s) by physical path (label): %s: %s\n",
		       zpool_get_name(pool), VdevGUIDString().c_str(),
		       libzfs_error_action(g_zfsHandle),
		       libzfs_error_description(g_zfsHandle));
		return (/*consumed*/false);
	}

	syslog(LOG_INFO, "CaseFile::ReEvaluate(%s/%s): Replacing with %s",
	    PoolGUIDString().c_str(), VdevGUIDString().c_str(),
	    devPath.c_str());
	return (Replace(VDEV_TYPE_DISK, devPath.c_str(), /*isspare*/false));
}
357 
/**
 * Re-evaluate this case in light of a ZFS event.
 *
 * The event's "type" value is examined first: vdev_remove and pool_destroy
 * close the case immediately, and config_sync may activate a spare.  The
 * event's "class" value is then examined: removed, statechange, and
 * io/checksum error reports are each handled below.  Finally the case is
 * closed if CloseIfSolved() finds it solved.
 *
 * \param event  The event to process.
 *
 * \return  true if the event was consumed or the case was closed by
 *          CloseIfSolved(); false otherwise.
 *
 * \note  This method may call Close(), directly or through
 *        CloseIfSolved(), which deletes this object.
 */
bool
CaseFile::ReEvaluate(const ZfsEvent &event)
{
	bool consumed(false);

	if (event.Value("type") == "misc.fs.zfs.vdev_remove") {
		/*
		 * The Vdev we represent has been removed from the
		 * configuration.  This case is no longer of value.
		 */
		Close();

		return (/*consumed*/true);
	} else if (event.Value("type") == "misc.fs.zfs.pool_destroy") {
		/* This Pool has been destroyed.  Discard the case */
		Close();

		return (/*consumed*/true);
	} else if (event.Value("type") == "misc.fs.zfs.config_sync") {
		/*
		 * A failed refresh is not checked here; VdevState() then
		 * reports the previously cached state.
		 */
		RefreshVdevState();
		if (VdevState() < VDEV_STATE_HEALTHY)
			consumed = ActivateSpare();
	}


	if (event.Value("class") == "resource.fs.zfs.removed") {
		bool spare_activated;

		if (!RefreshVdevState()) {
			/*
			 * The pool or vdev for this case file is no longer
			 * part of the configuration.  This can happen
			 * if we process a device arrival notification
			 * before seeing the ZFS configuration change
			 * event.
			 */
			syslog(LOG_INFO,
			       "CaseFile::ReEvaluate(%s,%s) Pool/Vdev "
			       "unconfigured.  Closing\n",
			       PoolGUIDString().c_str(),
			       VdevGUIDString().c_str());
			/*
			 * Close the case now so we won't waste cycles in the
			 * system rescan
			 */
			Close();

			/*
			 * Since this event was not used to close this
			 * case, do not report it as consumed.
			 */
			return (/*consumed*/false);
		}

		/*
		 * Discard any tentative I/O error events for
		 * this case.  They were most likely caused by the
		 * hot-unplug of this device.
		 */
		PurgeTentativeEvents();

		/* Try to activate spares if they are available */
		spare_activated = ActivateSpare();

		/*
		 * Rescan the drives in the system to see if a recent
		 * drive arrival can be used to solve this case.
		 */
		ZfsDaemon::RequestSystemRescan();

		/*
		 * Consume the event if we successfully activated a spare.
		 * Otherwise, leave it in the unconsumed events list so that the
		 * future addition of a spare to this pool might be able to
		 * close the case
		 */
		consumed = spare_activated;
	} else if (event.Value("class") == "resource.fs.zfs.statechange") {
		RefreshVdevState();
		/*
		 * If this vdev is DEGRADED, FAULTED, or UNAVAIL, try to
		 * activate a hotspare.  Otherwise, ignore the event
		 */
		if (VdevState() == VDEV_STATE_FAULTED ||
		    VdevState() == VDEV_STATE_DEGRADED ||
		    VdevState() == VDEV_STATE_CANT_OPEN)
			(void) ActivateSpare();
		/* Consumed whether or not a spare was activated. */
		consumed = true;
	}
	else if (event.Value("class") == "ereport.fs.zfs.io" ||
	         event.Value("class") == "ereport.fs.zfs.checksum") {

		/*
		 * Keep a private copy of the error report as a tentative
		 * event and make sure the tentative timer is armed for it.
		 */
		m_tentativeEvents.push_front(event.DeepCopy());
		RegisterCallout(event);
		consumed = true;
	}

	/* May delete this object; only locals are used from here on. */
	bool closed(CloseIfSolved());

	return (consumed || closed);
}
459 
460 /* Find a Vdev containing the vdev with the given GUID */
461 static nvlist_t*
462 find_parent(nvlist_t *pool_config, nvlist_t *config, DevdCtl::Guid child_guid)
463 {
464 	nvlist_t **vdevChildren;
465 	int        error;
466 	unsigned   ch, numChildren;
467 
468 	error = nvlist_lookup_nvlist_array(config, ZPOOL_CONFIG_CHILDREN,
469 					   &vdevChildren, &numChildren);
470 
471 	if (error != 0 || numChildren == 0)
472 		return (NULL);
473 
474 	for (ch = 0; ch < numChildren; ch++) {
475 		nvlist *result;
476 		Vdev vdev(pool_config, vdevChildren[ch]);
477 
478 		if (vdev.GUID() == child_guid)
479 			return (config);
480 
481 		result = find_parent(pool_config, vdevChildren[ch], child_guid);
482 		if (result != NULL)
483 			return (result);
484 	}
485 
486 	return (NULL);
487 }
488 
489 bool
490 CaseFile::ActivateSpare() {
491 	nvlist_t	*config, *nvroot, *parent_config;
492 	nvlist_t       **spares;
493 	char		*devPath, *vdev_type;
494 	const char	*poolname;
495 	u_int		 nspares, i;
496 	int		 error;
497 
498 	ZpoolList zpl(ZpoolList::ZpoolByGUID, &m_poolGUID);
499 	zpool_handle_t	*zhp(zpl.empty() ? NULL : zpl.front());
500 	if (zhp == NULL) {
501 		syslog(LOG_ERR, "CaseFile::ActivateSpare: Could not find pool "
502 		       "for pool_guid %" PRIu64".", (uint64_t)m_poolGUID);
503 		return (false);
504 	}
505 	poolname = zpool_get_name(zhp);
506 	config = zpool_get_config(zhp, NULL);
507 	if (config == NULL) {
508 		syslog(LOG_ERR, "CaseFile::ActivateSpare: Could not find pool "
509 		       "config for pool %s", poolname);
510 		return (false);
511 	}
512 	error = nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot);
513 	if (error != 0){
514 		syslog(LOG_ERR, "CaseFile::ActivateSpare: Could not find vdev "
515 		       "tree for pool %s", poolname);
516 		return (false);
517 	}
518 
519 	parent_config = find_parent(config, nvroot, m_vdevGUID);
520 	if (parent_config != NULL) {
521 		char *parent_type;
522 
523 		/*
524 		 * Don't activate spares for members of a "replacing" vdev.
525 		 * They're already dealt with.  Sparing them will just drag out
526 		 * the resilver process.
527 		 */
528 		error = nvlist_lookup_string(parent_config,
529 		    ZPOOL_CONFIG_TYPE, &parent_type);
530 		if (error == 0 && strcmp(parent_type, VDEV_TYPE_REPLACING) == 0)
531 			return (false);
532 	}
533 
534 	nspares = 0;
535 	nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares,
536 				   &nspares);
537 	if (nspares == 0) {
538 		/* The pool has no spares configured */
539 		syslog(LOG_INFO, "CaseFile::ActivateSpare: "
540 		       "No spares available for pool %s", poolname);
541 		return (false);
542 	}
543 	for (i = 0; i < nspares; i++) {
544 		uint64_t    *nvlist_array;
545 		vdev_stat_t *vs;
546 		uint_t	     nstats;
547 
548 		if (nvlist_lookup_uint64_array(spares[i],
549 		    ZPOOL_CONFIG_VDEV_STATS, &nvlist_array, &nstats) != 0) {
550 			syslog(LOG_ERR, "CaseFile::ActivateSpare: Could not "
551 			       "find vdev stats for pool %s, spare %d",
552 			       poolname, i);
553 			return (false);
554 		}
555 		vs = reinterpret_cast<vdev_stat_t *>(nvlist_array);
556 
557 		if ((vs->vs_aux != VDEV_AUX_SPARED)
558 		 && (vs->vs_state == VDEV_STATE_HEALTHY)) {
559 			/* We found a usable spare */
560 			break;
561 		}
562 	}
563 
564 	if (i == nspares) {
565 		/* No available spares were found */
566 		return (false);
567 	}
568 
569 	error = nvlist_lookup_string(spares[i], ZPOOL_CONFIG_PATH, &devPath);
570 	if (error != 0) {
571 		syslog(LOG_ERR, "CaseFile::ActivateSpare: Cannot determine "
572 		       "the path of pool %s, spare %d. Error %d",
573 		       poolname, i, error);
574 		return (false);
575 	}
576 
577 	error = nvlist_lookup_string(spares[i], ZPOOL_CONFIG_TYPE, &vdev_type);
578 	if (error != 0) {
579 		syslog(LOG_ERR, "CaseFile::ActivateSpare: Cannot determine "
580 		       "the vdev type of pool %s, spare %d. Error %d",
581 		       poolname, i, error);
582 		return (false);
583 	}
584 
585 	return (Replace(vdev_type, devPath, /*isspare*/true));
586 }
587 
588 void
589 CaseFile::RegisterCallout(const Event &event)
590 {
591 	timeval now, countdown, elapsed, timestamp, zero, remaining;
592 
593 	gettimeofday(&now, 0);
594 	timestamp = event.GetTimestamp();
595 	timersub(&now, &timestamp, &elapsed);
596 	timersub(&s_removeGracePeriod, &elapsed, &countdown);
597 	/*
598 	 * If countdown is <= zero, Reset the timer to the
599 	 * smallest positive time value instead
600 	 */
601 	timerclear(&zero);
602 	if (timercmp(&countdown, &zero, <=)) {
603 		timerclear(&countdown);
604 		countdown.tv_usec = 1;
605 	}
606 
607 	remaining = m_tentativeTimer.TimeRemaining();
608 
609 	if (!m_tentativeTimer.IsPending()
610 	 || timercmp(&countdown, &remaining, <))
611 		m_tentativeTimer.Reset(countdown, OnGracePeriodEnded, this);
612 }
613 
614 
615 bool
616 CaseFile::CloseIfSolved()
617 {
618 	if (m_events.empty()
619 	 && m_tentativeEvents.empty()) {
620 
621 		/*
622 		 * We currently do not track or take actions on
623 		 * devices in the degraded or faulted state.
624 		 * Once we have support for spare pools, we'll
625 		 * retain these cases so that any spares added in
626 		 * the future can be applied to them.
627 		 */
628 		switch (VdevState()) {
629 		case VDEV_STATE_HEALTHY:
630 			/* No need to keep cases for healthy vdevs */
631 			Close();
632 			return (true);
633 		case VDEV_STATE_REMOVED:
634 		case VDEV_STATE_CANT_OPEN:
635 			/*
636 			 * Keep open.  We may solve it with a newly inserted
637 			 * device.
638 			 */
639 		case VDEV_STATE_FAULTED:
640 		case VDEV_STATE_DEGRADED:
641 			/*
642 			 * Keep open.  We may solve it with the future
643 			 * addition of a spare to the pool
644 			 */
645 		case VDEV_STATE_UNKNOWN:
646 		case VDEV_STATE_CLOSED:
647 		case VDEV_STATE_OFFLINE:
648 			/*
649 			 * Keep open?  This may not be the correct behavior,
650 			 * but it's what we've always done
651 			 */
652 			;
653 		}
654 
655 		/*
656 		 * Re-serialize the case in order to remove any
657 		 * previous event data.
658 		 */
659 		Serialize();
660 	}
661 
662 	return (false);
663 }
664 
665 void
666 CaseFile::Log()
667 {
668 	syslog(LOG_INFO, "CaseFile(%s,%s,%s)\n", PoolGUIDString().c_str(),
669 	       VdevGUIDString().c_str(), PhysicalPath().c_str());
670 	syslog(LOG_INFO, "\tVdev State = %s\n",
671 	       zpool_state_to_name(VdevState(), VDEV_AUX_NONE));
672 	if (m_tentativeEvents.size() != 0) {
673 		syslog(LOG_INFO, "\t=== Tentative Events ===\n");
674 		for (EventList::iterator event(m_tentativeEvents.begin());
675 		     event != m_tentativeEvents.end(); event++)
676 			(*event)->Log(LOG_INFO);
677 	}
678 	if (m_events.size() != 0) {
679 		syslog(LOG_INFO, "\t=== Events ===\n");
680 		for (EventList::iterator event(m_events.begin());
681 		     event != m_events.end(); event++)
682 			(*event)->Log(LOG_INFO);
683 	}
684 }
685 
686 //- CaseFile Static Protected Methods ------------------------------------------
687 void
688 CaseFile::OnGracePeriodEnded(void *arg)
689 {
690 	CaseFile &casefile(*static_cast<CaseFile *>(arg));
691 
692 	casefile.OnGracePeriodEnded();
693 }
694 
695 int
696 CaseFile::DeSerializeSelector(const struct dirent *dirEntry)
697 {
698 	uint64_t poolGUID;
699 	uint64_t vdevGUID;
700 
701 	if (dirEntry->d_type == DT_REG
702 	 && sscanf(dirEntry->d_name, "pool_%" PRIu64 "_vdev_%" PRIu64 ".case",
703 		   &poolGUID, &vdevGUID) == 2)
704 		return (1);
705 	return (0);
706 }
707 
708 void
709 CaseFile::DeSerializeFile(const char *fileName)
710 {
711 	string	  fullName(s_caseFilePath + '/' + fileName);
712 	CaseFile *existingCaseFile(NULL);
713 	CaseFile *caseFile(NULL);
714 
715 	try {
716 		uint64_t poolGUID;
717 		uint64_t vdevGUID;
718 		nvlist_t *vdevConf;
719 
720 		if (sscanf(fileName, "pool_%" PRIu64 "_vdev_%" PRIu64 ".case",
721 		       &poolGUID, &vdevGUID) != 2) {
722 			throw ZfsdException("CaseFile::DeSerialize: "
723 			    "Unintelligible CaseFile filename %s.\n", fileName);
724 		}
725 		existingCaseFile = Find(Guid(poolGUID), Guid(vdevGUID));
726 		if (existingCaseFile != NULL) {
727 			/*
728 			 * If the vdev is already degraded or faulted,
729 			 * there's no point in keeping the state around
730 			 * that we use to put a drive into the degraded
731 			 * state.  However, if the vdev is simply missing,
732 			 * preserve the case data in the hopes that it will
733 			 * return.
734 			 */
735 			caseFile = existingCaseFile;
736 			vdev_state curState(caseFile->VdevState());
737 			if (curState > VDEV_STATE_CANT_OPEN
738 			 && curState < VDEV_STATE_HEALTHY) {
739 				unlink(fileName);
740 				return;
741 			}
742 		} else {
743 			ZpoolList zpl(ZpoolList::ZpoolByGUID, &poolGUID);
744 			if (zpl.empty()
745 			 || (vdevConf = VdevIterator(zpl.front())
746 						    .Find(vdevGUID)) == NULL) {
747 				/*
748 				 * Either the pool no longer exists
749 				 * or this vdev is no longer a member of
750 				 * the pool.
751 				 */
752 				unlink(fullName.c_str());
753 				return;
754 			}
755 
756 			/*
757 			 * Any vdev we find that does not have a case file
758 			 * must be in the healthy state and thus worthy of
759 			 * continued SERD data tracking.
760 			 */
761 			caseFile = new CaseFile(Vdev(zpl.front(), vdevConf));
762 		}
763 
764 		ifstream caseStream(fullName.c_str());
765 		if (!caseStream)
766 			throw ZfsdException("CaseFile::DeSerialize: Unable to "
767 					    "read %s.\n", fileName);
768 
769 		caseFile->DeSerialize(caseStream);
770 	} catch (const ParseException &exp) {
771 
772 		exp.Log();
773 		if (caseFile != existingCaseFile)
774 			delete caseFile;
775 
776 		/*
777 		 * Since we can't parse the file, unlink it so we don't
778 		 * trip over it again.
779 		 */
780 		unlink(fileName);
781 	} catch (const ZfsdException &zfsException) {
782 
783 		zfsException.Log();
784 		if (caseFile != existingCaseFile)
785 			delete caseFile;
786 	}
787 }
788 
789 //- CaseFile Protected Methods -------------------------------------------------
790 CaseFile::CaseFile(const Vdev &vdev)
791  : m_poolGUID(vdev.PoolGUID()),
792    m_vdevGUID(vdev.GUID()),
793    m_vdevState(vdev.State()),
794    m_vdevPhysPath(vdev.PhysicalPath())
795 {
796 	stringstream guidString;
797 
798 	guidString << m_vdevGUID;
799 	m_vdevGUIDString = guidString.str();
800 	guidString.str("");
801 	guidString << m_poolGUID;
802 	m_poolGUIDString = guidString.str();
803 
804 	s_activeCases.push_back(this);
805 
806 	syslog(LOG_INFO, "Creating new CaseFile:\n");
807 	Log();
808 }
809 
810 CaseFile::~CaseFile()
811 {
812 	PurgeEvents();
813 	PurgeTentativeEvents();
814 	m_tentativeTimer.Stop();
815 	s_activeCases.remove(this);
816 }
817 
818 void
819 CaseFile::PurgeEvents()
820 {
821 	for (EventList::iterator event(m_events.begin());
822 	     event != m_events.end(); event++)
823 		delete *event;
824 
825 	m_events.clear();
826 }
827 
828 void
829 CaseFile::PurgeTentativeEvents()
830 {
831 	for (EventList::iterator event(m_tentativeEvents.begin());
832 	     event != m_tentativeEvents.end(); event++)
833 		delete *event;
834 
835 	m_tentativeEvents.clear();
836 }
837 
838 void
839 CaseFile::SerializeEvList(const EventList events, int fd,
840 		const char* prefix) const
841 {
842 	if (events.empty())
843 		return;
844 	for (EventList::const_iterator curEvent = events.begin();
845 	     curEvent != events.end(); curEvent++) {
846 		const string &eventString((*curEvent)->GetEventString());
847 
848 		// TODO: replace many write(2) calls with a single writev(2)
849 		if (prefix)
850 			write(fd, prefix, strlen(prefix));
851 		write(fd, eventString.c_str(), eventString.length());
852 	}
853 }
854 
855 void
856 CaseFile::Serialize()
857 {
858 	stringstream saveFile;
859 
860 	saveFile << setfill('0')
861 		 << s_caseFilePath << "/"
862 		 << "pool_" << PoolGUIDString()
863 		 << "_vdev_" << VdevGUIDString()
864 		 << ".case";
865 
866 	if (m_events.empty() && m_tentativeEvents.empty()) {
867 		unlink(saveFile.str().c_str());
868 		return;
869 	}
870 
871 	int fd(open(saveFile.str().c_str(), O_CREAT|O_TRUNC|O_WRONLY, 0644));
872 	if (fd == -1) {
873 		syslog(LOG_ERR, "CaseFile::Serialize: Unable to open %s.\n",
874 		       saveFile.str().c_str());
875 		return;
876 	}
877 	SerializeEvList(m_events, fd);
878 	SerializeEvList(m_tentativeEvents, fd, "tentative ");
879 	close(fd);
880 }
881 
/**
 * Read events from an open case file into this case.
 *
 * Each line holds one event string.  A line that begins with "tentative "
 * has that prefix removed and its event is appended to m_tentativeEvents;
 * the event from any other line is appended to m_events.
 * RegisterCallout() is called for every event that is created.
 *
 * \param caseStream  Stream positioned at the start of the case data.
 *
 * XXX: This method assumes that events may not contain embedded newlines.  If
 * ever events can contain embedded newlines, then CaseFile must switch
 * serialization formats
 */
void
CaseFile::DeSerialize(ifstream &caseStream)
{
	string	      evString;
	const EventFactory &factory(ZfsDaemon::Get().GetFactory());

	/*
	 * Turn off whitespace skipping for later extractions, then discard
	 * any whitespace at the very start of the stream.
	 */
	caseStream >> std::noskipws >> std::ws;
	while (caseStream.good()) {
		/*
		 * Outline:
		 * read the beginning of a line and check it for
		 * "tentative".  If found, discard "tentative".
		 * Create a new event
		 * continue
		 */
		EventList* destEvents;
		const string tentFlag("tentative ");
		string line;
		std::stringbuf lineBuf;

		/*
		 * get() copies characters up to, but not including, the
		 * next newline.  It sets failbit when it copies nothing
		 * (an empty line or end of file), which ends the loop after
		 * this iteration.
		 */
		caseStream.get(lineBuf);
		caseStream.ignore();  /*discard the newline character*/
		line = lineBuf.str();
		if (line.compare(0, tentFlag.size(), tentFlag) == 0) {
			/* Discard "tentative" */
			line.erase(0, tentFlag.size());
			destEvents = &m_tentativeEvents;
		} else {
			destEvents = &m_events;
		}
		/* A NULL result is skipped without comment. */
		Event *event(Event::CreateEvent(factory, line));
		if (event != NULL) {
			destEvents->push_back(event);
			RegisterCallout(*event);
		}
	}
}
924 
925 void
926 CaseFile::Close()
927 {
928 	/*
929 	 * This case is no longer relevant.  Clean up our
930 	 * serialization file, and delete the case.
931 	 */
932 	syslog(LOG_INFO, "CaseFile(%s,%s) closed - State %s\n",
933 	       PoolGUIDString().c_str(), VdevGUIDString().c_str(),
934 	       zpool_state_to_name(VdevState(), VDEV_AUX_NONE));
935 
936 	/*
937 	 * Serialization of a Case with no event data, clears the
938 	 * Serialization data for that event.
939 	 */
940 	PurgeEvents();
941 	Serialize();
942 
943 	delete this;
944 }
945 
946 void
947 CaseFile::OnGracePeriodEnded()
948 {
949 	bool should_fault, should_degrade;
950 	ZpoolList zpl(ZpoolList::ZpoolByGUID, &m_poolGUID);
951 	zpool_handle_t *zhp(zpl.empty() ? NULL : zpl.front());
952 
953 	m_events.splice(m_events.begin(), m_tentativeEvents);
954 	should_fault = ShouldFault();
955 	should_degrade = ShouldDegrade();
956 
957 	if (should_fault || should_degrade) {
958 		if (zhp == NULL
959 		 || (VdevIterator(zhp).Find(m_vdevGUID)) == NULL) {
960 			/*
961 			 * Either the pool no longer exists
962 			 * or this vdev is no longer a member of
963 			 * the pool.
964 			 */
965 			Close();
966 			return;
967 		}
968 
969 	}
970 
971 	/* A fault condition has priority over a degrade condition */
972 	if (ShouldFault()) {
973 		/* Fault the vdev and close the case. */
974 		if (zpool_vdev_fault(zhp, (uint64_t)m_vdevGUID,
975 				       VDEV_AUX_ERR_EXCEEDED) == 0) {
976 			syslog(LOG_INFO, "Faulting vdev(%s/%s)",
977 			       PoolGUIDString().c_str(),
978 			       VdevGUIDString().c_str());
979 			Close();
980 			return;
981 		}
982 		else {
983 			syslog(LOG_ERR, "Fault vdev(%s/%s): %s: %s\n",
984 			       PoolGUIDString().c_str(),
985 			       VdevGUIDString().c_str(),
986 			       libzfs_error_action(g_zfsHandle),
987 			       libzfs_error_description(g_zfsHandle));
988 		}
989 	}
990 	else if (ShouldDegrade()) {
991 		/* Degrade the vdev and close the case. */
992 		if (zpool_vdev_degrade(zhp, (uint64_t)m_vdevGUID,
993 				       VDEV_AUX_ERR_EXCEEDED) == 0) {
994 			syslog(LOG_INFO, "Degrading vdev(%s/%s)",
995 			       PoolGUIDString().c_str(),
996 			       VdevGUIDString().c_str());
997 			Close();
998 			return;
999 		}
1000 		else {
1001 			syslog(LOG_ERR, "Degrade vdev(%s/%s): %s: %s\n",
1002 			       PoolGUIDString().c_str(),
1003 			       VdevGUIDString().c_str(),
1004 			       libzfs_error_action(g_zfsHandle),
1005 			       libzfs_error_description(g_zfsHandle));
1006 		}
1007 	}
1008 	Serialize();
1009 }
1010 
1011 Vdev
1012 CaseFile::BeingReplacedBy(zpool_handle_t *zhp) {
1013 	Vdev vd(zhp, CaseVdev(zhp));
1014 	std::list<Vdev> children;
1015 	std::list<Vdev>::iterator children_it;
1016 
1017 	Vdev parent(vd.Parent());
1018 	Vdev replacing(NonexistentVdev);
1019 
1020 	/*
1021 	 * To determine whether we are being replaced by another spare that
1022 	 * is still working, then make sure that it is currently spared and
1023 	 * that the spare is either resilvering or healthy.  If any of these
1024 	 * conditions fail, then we are not being replaced by a spare.
1025 	 *
1026 	 * If the spare is healthy, then the case file should be closed very
1027 	 * soon after this check.
1028 	 */
1029 	if (parent.DoesNotExist()
1030 	 || parent.Name(zhp, /*verbose*/false) != "spare")
1031 		return (NonexistentVdev);
1032 
1033 	children = parent.Children();
1034 	children_it = children.begin();
1035 	for (;children_it != children.end(); children_it++) {
1036 		Vdev child = *children_it;
1037 
1038 		/* Skip our vdev. */
1039 		if (child.GUID() == VdevGUID())
1040 			continue;
1041 		/*
1042 		 * Accept the first child that doesn't match our GUID, or
1043 		 * any resilvering/healthy device if one exists.
1044 		 */
1045 		if (replacing.DoesNotExist() || child.IsResilvering()
1046 		 || child.State() == VDEV_STATE_HEALTHY)
1047 			replacing = child;
1048 	}
1049 
1050 	return (replacing);
1051 }
1052 
1053 bool
1054 CaseFile::Replace(const char* vdev_type, const char* path, bool isspare) {
1055 	nvlist_t *nvroot, *newvd;
1056 	const char *poolname;
1057 	string oldstr(VdevGUIDString());
1058 	bool retval = true;
1059 
1060 	/* Figure out what pool we're working on */
1061 	ZpoolList zpl(ZpoolList::ZpoolByGUID, &m_poolGUID);
1062 	zpool_handle_t *zhp(zpl.empty() ? NULL : zpl.front());
1063 	if (zhp == NULL) {
1064 		syslog(LOG_ERR, "CaseFile::Replace: could not find pool for "
1065 		       "pool_guid %" PRIu64 ".", (uint64_t)m_poolGUID);
1066 		return (false);
1067 	}
1068 	poolname = zpool_get_name(zhp);
1069 	Vdev vd(zhp, CaseVdev(zhp));
1070 	Vdev replaced(BeingReplacedBy(zhp));
1071 
1072 	if (isspare && !vd.IsSpare() && !replaced.DoesNotExist()) {
1073 		/* If we are already being replaced by a working spare, pass. */
1074 		if (replaced.IsResilvering()
1075 		 || replaced.State() == VDEV_STATE_HEALTHY) {
1076 			syslog(LOG_INFO, "CaseFile::Replace(%s->%s): already "
1077 			    "replaced", VdevGUIDString().c_str(), path);
1078 			return (/*consumed*/false);
1079 		}
1080 		/*
1081 		 * If we have already been replaced by a spare, but that spare
1082 		 * is broken, we must spare the spare, not the original device.
1083 		 */
1084 		oldstr = replaced.GUIDString();
1085 		syslog(LOG_INFO, "CaseFile::Replace(%s->%s): sparing "
1086 		    "broken spare %s instead", VdevGUIDString().c_str(),
1087 		    path, oldstr.c_str());
1088 	}
1089 
1090 	/*
1091 	 * Build a root vdev/leaf vdev configuration suitable for
1092 	 * zpool_vdev_attach. Only enough data for the kernel to find
1093 	 * the device (i.e. type and disk device node path) are needed.
1094 	 */
1095 	nvroot = NULL;
1096 	newvd = NULL;
1097 
1098 	if (nvlist_alloc(&nvroot, NV_UNIQUE_NAME, 0) != 0
1099 	 || nvlist_alloc(&newvd, NV_UNIQUE_NAME, 0) != 0) {
1100 		syslog(LOG_ERR, "Replace vdev(%s/%s): Unable to allocate "
1101 		    "configuration data.", poolname, oldstr.c_str());
1102 		if (nvroot != NULL)
1103 			nvlist_free(nvroot);
1104 		return (false);
1105 	}
1106 	if (nvlist_add_string(newvd, ZPOOL_CONFIG_TYPE, vdev_type) != 0
1107 	 || nvlist_add_string(newvd, ZPOOL_CONFIG_PATH, path) != 0
1108 	 || nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT) != 0
1109 	 || nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
1110 				    &newvd, 1) != 0) {
1111 		syslog(LOG_ERR, "Replace vdev(%s/%s): Unable to initialize "
1112 		    "configuration data.", poolname, oldstr.c_str());
1113 		nvlist_free(newvd);
1114 		nvlist_free(nvroot);
1115 		return (true);
1116 	}
1117 
1118 	/* Data was copied when added to the root vdev. */
1119 	nvlist_free(newvd);
1120 
1121 	retval = (zpool_vdev_attach(zhp, oldstr.c_str(), path, nvroot,
1122        /*replace*/B_TRUE, /*rebuild*/ B_FALSE) == 0);
1123 	if (retval)
1124 		syslog(LOG_INFO, "Replacing vdev(%s/%s) with %s\n",
1125 		    poolname, oldstr.c_str(), path);
1126 	else
1127 		syslog(LOG_ERR, "Replace vdev(%s/%s): %s: %s\n",
1128 		    poolname, oldstr.c_str(), libzfs_error_action(g_zfsHandle),
1129 		    libzfs_error_description(g_zfsHandle));
1130 	nvlist_free(nvroot);
1131 
1132 	return (retval);
1133 }
1134 
1135 /* Does the argument event refer to a checksum error? */
1136 static bool
1137 IsChecksumEvent(const Event* const event)
1138 {
1139 	return ("ereport.fs.zfs.checksum" == event->Value("type"));
1140 }
1141 
1142 /* Does the argument event refer to an IO error? */
1143 static bool
1144 IsIOEvent(const Event* const event)
1145 {
1146 	return ("ereport.fs.zfs.io" == event->Value("type"));
1147 }
1148 
1149 bool
1150 CaseFile::ShouldDegrade() const
1151 {
1152 	return (std::count_if(m_events.begin(), m_events.end(),
1153 			      IsChecksumEvent) > ZFS_DEGRADE_IO_COUNT);
1154 }
1155 
1156 bool
1157 CaseFile::ShouldFault() const
1158 {
1159 	return (std::count_if(m_events.begin(), m_events.end(),
1160 			      IsIOEvent) > ZFS_DEGRADE_IO_COUNT);
1161 }
1162 
1163 nvlist_t *
1164 CaseFile::CaseVdev(zpool_handle_t *zhp) const
1165 {
1166 	return (VdevIterator(zhp).Find(VdevGUID()));
1167 }
1168