xref: /freebsd/cddl/usr.sbin/zfsd/case_file.cc (revision 63cbe8d1d95f97e93929ec66f1138693d08dd9f6)
1 /*-
2  * Copyright (c) 2011, 2012, 2013, 2014, 2016 Spectra Logic Corporation
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions, and the following disclaimer,
10  *    without modification.
11  * 2. Redistributions in binary form must reproduce at minimum a disclaimer
12  *    substantially similar to the "NO WARRANTY" disclaimer below
13  *    ("Disclaimer") and any redistribution must be conditioned upon
14  *    including a substantially similar Disclaimer requirement for further
15  *    binary redistribution.
16  *
17  * NO WARRANTY
18  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR
21  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22  * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
26  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
27  * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
28  * POSSIBILITY OF SUCH DAMAGES.
29  *
30  * Authors: Justin T. Gibbs     (Spectra Logic Corporation)
31  */
32 
33 /**
34  * \file case_file.cc
35  *
36  * We keep case files for any leaf vdev that is not in the optimal state.
37  * However, we only serialize to disk those events that need to be preserved
38  * across reboots.  For now, this is just a log of soft errors which we
39  * accumulate in order to mark a device as degraded.
40  */
41 #include <sys/cdefs.h>
42 #include <sys/time.h>
43 
44 #include <sys/fs/zfs.h>
45 
46 #include <dirent.h>
47 #include <iomanip>
48 #include <fstream>
49 #include <functional>
50 #include <sstream>
51 #include <syslog.h>
52 #include <unistd.h>
53 
54 #include <libzfs.h>
55 
56 #include <list>
57 #include <map>
58 #include <string>
59 
60 #include <devdctl/guid.h>
61 #include <devdctl/event.h>
62 #include <devdctl/event_factory.h>
63 #include <devdctl/exception.h>
64 #include <devdctl/consumer.h>
65 
66 #include "callout.h"
67 #include "vdev_iterator.h"
68 #include "zfsd_event.h"
69 #include "case_file.h"
70 #include "vdev.h"
71 #include "zfsd.h"
72 #include "zfsd_exception.h"
73 #include "zpool_list.h"
74 
75 __FBSDID("$FreeBSD$");
76 
77 /*============================ Namespace Control =============================*/
78 using std::auto_ptr;
79 using std::hex;
80 using std::ifstream;
81 using std::stringstream;
82 using std::setfill;
83 using std::setw;
84 
85 using DevdCtl::Event;
86 using DevdCtl::EventFactory;
87 using DevdCtl::EventList;
88 using DevdCtl::Guid;
89 using DevdCtl::ParseException;
90 
91 /*--------------------------------- CaseFile ---------------------------------*/
92 //- CaseFile Static Data -------------------------------------------------------
93 
94 CaseFileList  CaseFile::s_activeCases;
95 const string  CaseFile::s_caseFilePath = "/var/db/zfsd/cases";
96 const timeval CaseFile::s_removeGracePeriod = { 60 /*sec*/, 0 /*usec*/};
97 
98 //- CaseFile Static Public Methods ---------------------------------------------
99 CaseFile *
100 CaseFile::Find(Guid poolGUID, Guid vdevGUID)
101 {
102 	for (CaseFileList::iterator curCase = s_activeCases.begin();
103 	     curCase != s_activeCases.end(); curCase++) {
104 
105 		if (((*curCase)->PoolGUID() != poolGUID
106 		  && Guid::InvalidGuid() != poolGUID)
107 		 || (*curCase)->VdevGUID() != vdevGUID)
108 			continue;
109 
110 		/*
111 		 * We only carry one active case per-vdev.
112 		 */
113 		return (*curCase);
114 	}
115 	return (NULL);
116 }
117 
118 CaseFile *
119 CaseFile::Find(const string &physPath)
120 {
121 	CaseFile *result = NULL;
122 
123 	for (CaseFileList::iterator curCase = s_activeCases.begin();
124 	     curCase != s_activeCases.end(); curCase++) {
125 
126 		if ((*curCase)->PhysicalPath() != physPath)
127 			continue;
128 
129 		if (result != NULL) {
130 			syslog(LOG_WARNING, "Multiple casefiles found for "
131 			    "physical path %s.  "
132 			    "This is most likely a bug in zfsd",
133 			    physPath.c_str());
134 		}
135 		result = *curCase;
136 	}
137 	return (result);
138 }
139 
140 
141 void
142 CaseFile::ReEvaluateByGuid(Guid poolGUID, const ZfsEvent &event)
143 {
144 	CaseFileList::iterator casefile;
145 	for (casefile = s_activeCases.begin(); casefile != s_activeCases.end();){
146 		CaseFileList::iterator next = casefile;
147 		next++;
148 		if (poolGUID == (*casefile)->PoolGUID())
149 			(*casefile)->ReEvaluate(event);
150 		casefile = next;
151 	}
152 }
153 
154 CaseFile &
155 CaseFile::Create(Vdev &vdev)
156 {
157 	CaseFile *activeCase;
158 
159 	activeCase = Find(vdev.PoolGUID(), vdev.GUID());
160 	if (activeCase == NULL)
161 		activeCase = new CaseFile(vdev);
162 
163 	return (*activeCase);
164 }
165 
166 void
167 CaseFile::DeSerialize()
168 {
169 	struct dirent **caseFiles;
170 
171 	int numCaseFiles(scandir(s_caseFilePath.c_str(), &caseFiles,
172 			 DeSerializeSelector, /*compar*/NULL));
173 
174 	if (numCaseFiles == -1)
175 		return;
176 	if (numCaseFiles == 0) {
177 		free(caseFiles);
178 		return;
179 	}
180 
181 	for (int i = 0; i < numCaseFiles; i++) {
182 
183 		DeSerializeFile(caseFiles[i]->d_name);
184 		free(caseFiles[i]);
185 	}
186 	free(caseFiles);
187 }
188 
189 bool
190 CaseFile::Empty()
191 {
192 	return (s_activeCases.empty());
193 }
194 
195 void
196 CaseFile::LogAll()
197 {
198 	for (CaseFileList::iterator curCase = s_activeCases.begin();
199 	     curCase != s_activeCases.end(); curCase++)
200 		(*curCase)->Log();
201 }
202 
203 void
204 CaseFile::PurgeAll()
205 {
206 	/*
207 	 * Serialize casefiles before deleting them so that they can be reread
208 	 * and revalidated during BuildCaseFiles.
209 	 * CaseFiles remove themselves from this list on destruction.
210 	 */
211 	while (s_activeCases.size() != 0) {
212 		CaseFile *casefile = s_activeCases.front();
213 		casefile->Serialize();
214 		delete casefile;
215 	}
216 
217 }
218 
219 //- CaseFile Public Methods ----------------------------------------------------
220 bool
221 CaseFile::RefreshVdevState()
222 {
223 	ZpoolList zpl(ZpoolList::ZpoolByGUID, &m_poolGUID);
224 	zpool_handle_t *casePool(zpl.empty() ? NULL : zpl.front());
225 	if (casePool == NULL)
226 		return (false);
227 
228 	Vdev vd(casePool, CaseVdev(casePool));
229 	if (vd.DoesNotExist())
230 		return (false);
231 
232 	m_vdevState    = vd.State();
233 	m_vdevPhysPath = vd.PhysicalPath();
234 	return (true);
235 }
236 
237 bool
238 CaseFile::ReEvaluate(const string &devPath, const string &physPath, Vdev *vdev)
239 {
240 	ZpoolList zpl(ZpoolList::ZpoolByGUID, &m_poolGUID);
241 	zpool_handle_t *pool(zpl.empty() ? NULL : zpl.front());
242 	zpool_boot_label_t boot_type;
243 	uint64_t boot_size;
244 
245 	if (pool == NULL || !RefreshVdevState()) {
246 		/*
247 		 * The pool or vdev for this case file is no longer
248 		 * part of the configuration.  This can happen
249 		 * if we process a device arrival notification
250 		 * before seeing the ZFS configuration change
251 		 * event.
252 		 */
253 		syslog(LOG_INFO,
254 		       "CaseFile::ReEvaluate(%s,%s) Pool/Vdev unconfigured.  "
255 		       "Closing\n",
256 		       PoolGUIDString().c_str(),
257 		       VdevGUIDString().c_str());
258 		Close();
259 
260 		/*
261 		 * Since this event was not used to close this
262 		 * case, do not report it as consumed.
263 		 */
264 		return (/*consumed*/false);
265 	}
266 
267 	if (VdevState() > VDEV_STATE_CANT_OPEN) {
268 		/*
269 		 * For now, newly discovered devices only help for
270 		 * devices that are missing.  In the future, we might
271 		 * use a newly inserted spare to replace a degraded
272 		 * or faulted device.
273 		 */
274 		syslog(LOG_INFO, "CaseFile::ReEvaluate(%s,%s): Pool/Vdev ignored",
275 		    PoolGUIDString().c_str(), VdevGUIDString().c_str());
276 		return (/*consumed*/false);
277 	}
278 
279 	if (vdev != NULL
280 	 && ( vdev->PoolGUID() == m_poolGUID
281 	   || vdev->PoolGUID() == Guid::InvalidGuid())
282 	 && vdev->GUID() == m_vdevGUID) {
283 
284 		zpool_vdev_online(pool, vdev->GUIDString().c_str(),
285 				  ZFS_ONLINE_CHECKREMOVE | ZFS_ONLINE_UNSPARE,
286 				  &m_vdevState);
287 		syslog(LOG_INFO, "Onlined vdev(%s/%s:%s).  State now %s.\n",
288 		       zpool_get_name(pool), vdev->GUIDString().c_str(),
289 		       devPath.c_str(),
290 		       zpool_state_to_name(VdevState(), VDEV_AUX_NONE));
291 
292 		/*
293 		 * Check the vdev state post the online action to see
294 		 * if we can retire this case.
295 		 */
296 		CloseIfSolved();
297 
298 		return (/*consumed*/true);
299 	}
300 
301 	/*
302 	 * If the auto-replace policy is enabled, and we have physical
303 	 * path information, try a physical path replacement.
304 	 */
305 	if (zpool_get_prop_int(pool, ZPOOL_PROP_AUTOREPLACE, NULL) == 0) {
306 		syslog(LOG_INFO,
307 		       "CaseFile(%s:%s:%s): AutoReplace not set.  "
308 		       "Ignoring device insertion.\n",
309 		       PoolGUIDString().c_str(),
310 		       VdevGUIDString().c_str(),
311 		       zpool_state_to_name(VdevState(), VDEV_AUX_NONE));
312 		return (/*consumed*/false);
313 	}
314 
315 	if (PhysicalPath().empty()) {
316 		syslog(LOG_INFO,
317 		       "CaseFile(%s:%s:%s): No physical path information.  "
318 		       "Ignoring device insertion.\n",
319 		       PoolGUIDString().c_str(),
320 		       VdevGUIDString().c_str(),
321 		       zpool_state_to_name(VdevState(), VDEV_AUX_NONE));
322 		return (/*consumed*/false);
323 	}
324 
325 	if (physPath != PhysicalPath()) {
326 		syslog(LOG_INFO,
327 		       "CaseFile(%s:%s:%s): Physical path mismatch.  "
328 		       "Ignoring device insertion.\n",
329 		       PoolGUIDString().c_str(),
330 		       VdevGUIDString().c_str(),
331 		       zpool_state_to_name(VdevState(), VDEV_AUX_NONE));
332 		return (/*consumed*/false);
333 	}
334 
335 	/* Write a label on the newly inserted disk. */
336 	if (zpool_is_bootable(pool))
337 		boot_type = ZPOOL_COPY_BOOT_LABEL;
338 	else
339 		boot_type = ZPOOL_NO_BOOT_LABEL;
340 	boot_size = zpool_get_prop_int(pool, ZPOOL_PROP_BOOTSIZE, NULL);
341 	if (zpool_label_disk(g_zfsHandle, pool, devPath.c_str(),
342 	    boot_type, boot_size, NULL) != 0) {
343 		syslog(LOG_ERR,
344 		       "Replace vdev(%s/%s) by physical path (label): %s: %s\n",
345 		       zpool_get_name(pool), VdevGUIDString().c_str(),
346 		       libzfs_error_action(g_zfsHandle),
347 		       libzfs_error_description(g_zfsHandle));
348 		return (/*consumed*/false);
349 	}
350 
351 	syslog(LOG_INFO, "CaseFile::ReEvaluate(%s/%s): Replacing with %s",
352 	    PoolGUIDString().c_str(), VdevGUIDString().c_str(),
353 	    devPath.c_str());
354 	return (Replace(VDEV_TYPE_DISK, devPath.c_str(), /*isspare*/false));
355 }
356 
357 bool
358 CaseFile::ReEvaluate(const ZfsEvent &event)
359 {
360 	bool consumed(false);
361 
362 	if (event.Value("type") == "misc.fs.zfs.vdev_remove") {
363 		/*
364 		 * The Vdev we represent has been removed from the
365 		 * configuration.  This case is no longer of value.
366 		 */
367 		Close();
368 
369 		return (/*consumed*/true);
370 	} else if (event.Value("type") == "misc.fs.zfs.pool_destroy") {
371 		/* This Pool has been destroyed.  Discard the case */
372 		Close();
373 
374 		return (/*consumed*/true);
375 	} else if (event.Value("type") == "misc.fs.zfs.config_sync") {
376 		RefreshVdevState();
377 		if (VdevState() < VDEV_STATE_HEALTHY)
378 			consumed = ActivateSpare();
379 	}
380 
381 
382 	if (event.Value("class") == "resource.fs.zfs.removed") {
383 		bool spare_activated;
384 
385 		if (!RefreshVdevState()) {
386 			/*
387 			 * The pool or vdev for this case file is no longer
388 			 * part of the configuration.  This can happen
389 			 * if we process a device arrival notification
390 			 * before seeing the ZFS configuration change
391 			 * event.
392 			 */
393 			syslog(LOG_INFO,
394 			       "CaseFile::ReEvaluate(%s,%s) Pool/Vdev "
395 			       "unconfigured.  Closing\n",
396 			       PoolGUIDString().c_str(),
397 			       VdevGUIDString().c_str());
398 			/*
399 			 * Close the case now so we won't waste cycles in the
400 			 * system rescan
401 			 */
402 			Close();
403 
404 			/*
405 			 * Since this event was not used to close this
406 			 * case, do not report it as consumed.
407 			 */
408 			return (/*consumed*/false);
409 		}
410 
411 		/*
412 		 * Discard any tentative I/O error events for
413 		 * this case.  They were most likely caused by the
414 		 * hot-unplug of this device.
415 		 */
416 		PurgeTentativeEvents();
417 
418 		/* Try to activate spares if they are available */
419 		spare_activated = ActivateSpare();
420 
421 		/*
422 		 * Rescan the drives in the system to see if a recent
423 		 * drive arrival can be used to solve this case.
424 		 */
425 		ZfsDaemon::RequestSystemRescan();
426 
427 		/*
428 		 * Consume the event if we successfully activated a spare.
429 		 * Otherwise, leave it in the unconsumed events list so that the
430 		 * future addition of a spare to this pool might be able to
431 		 * close the case
432 		 */
433 		consumed = spare_activated;
434 	} else if (event.Value("class") == "resource.fs.zfs.statechange") {
435 		RefreshVdevState();
436 		/*
437 		 * If this vdev is DEGRADED, FAULTED, or UNAVAIL, try to
438 		 * activate a hotspare.  Otherwise, ignore the event
439 		 */
440 		if (VdevState() == VDEV_STATE_FAULTED ||
441 		    VdevState() == VDEV_STATE_DEGRADED ||
442 		    VdevState() == VDEV_STATE_CANT_OPEN)
443 			(void) ActivateSpare();
444 		consumed = true;
445 	}
446 	else if (event.Value("class") == "ereport.fs.zfs.io" ||
447 	         event.Value("class") == "ereport.fs.zfs.checksum") {
448 
449 		m_tentativeEvents.push_front(event.DeepCopy());
450 		RegisterCallout(event);
451 		consumed = true;
452 	}
453 
454 	bool closed(CloseIfSolved());
455 
456 	return (consumed || closed);
457 }
458 
459 /* Find a Vdev containing the vdev with the given GUID */
460 static nvlist_t*
461 find_parent(nvlist_t *pool_config, nvlist_t *config, DevdCtl::Guid child_guid)
462 {
463 	nvlist_t **vdevChildren;
464 	int        error;
465 	unsigned   ch, numChildren;
466 
467 	error = nvlist_lookup_nvlist_array(config, ZPOOL_CONFIG_CHILDREN,
468 					   &vdevChildren, &numChildren);
469 
470 	if (error != 0 || numChildren == 0)
471 		return (NULL);
472 
473 	for (ch = 0; ch < numChildren; ch++) {
474 		nvlist *result;
475 		Vdev vdev(pool_config, vdevChildren[ch]);
476 
477 		if (vdev.GUID() == child_guid)
478 			return (config);
479 
480 		result = find_parent(pool_config, vdevChildren[ch], child_guid);
481 		if (result != NULL)
482 			return (result);
483 	}
484 
485 	return (NULL);
486 }
487 
488 bool
489 CaseFile::ActivateSpare() {
490 	nvlist_t	*config, *nvroot, *parent_config;
491 	nvlist_t       **spares;
492 	char		*devPath, *vdev_type;
493 	const char	*poolname;
494 	u_int		 nspares, i;
495 	int		 error;
496 
497 	ZpoolList zpl(ZpoolList::ZpoolByGUID, &m_poolGUID);
498 	zpool_handle_t	*zhp(zpl.empty() ? NULL : zpl.front());
499 	if (zhp == NULL) {
500 		syslog(LOG_ERR, "CaseFile::ActivateSpare: Could not find pool "
501 		       "for pool_guid %" PRIu64".", (uint64_t)m_poolGUID);
502 		return (false);
503 	}
504 	poolname = zpool_get_name(zhp);
505 	config = zpool_get_config(zhp, NULL);
506 	if (config == NULL) {
507 		syslog(LOG_ERR, "CaseFile::ActivateSpare: Could not find pool "
508 		       "config for pool %s", poolname);
509 		return (false);
510 	}
511 	error = nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot);
512 	if (error != 0){
513 		syslog(LOG_ERR, "CaseFile::ActivateSpare: Could not find vdev "
514 		       "tree for pool %s", poolname);
515 		return (false);
516 	}
517 
518 	parent_config = find_parent(config, nvroot, m_vdevGUID);
519 	if (parent_config != NULL) {
520 		char *parent_type;
521 
522 		/*
523 		 * Don't activate spares for members of a "replacing" vdev.
524 		 * They're already dealt with.  Sparing them will just drag out
525 		 * the resilver process.
526 		 */
527 		error = nvlist_lookup_string(parent_config,
528 		    ZPOOL_CONFIG_TYPE, &parent_type);
529 		if (error == 0 && strcmp(parent_type, VDEV_TYPE_REPLACING) == 0)
530 			return (false);
531 	}
532 
533 	nspares = 0;
534 	nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares,
535 				   &nspares);
536 	if (nspares == 0) {
537 		/* The pool has no spares configured */
538 		syslog(LOG_INFO, "CaseFile::ActivateSpare: "
539 		       "No spares available for pool %s", poolname);
540 		return (false);
541 	}
542 	for (i = 0; i < nspares; i++) {
543 		uint64_t    *nvlist_array;
544 		vdev_stat_t *vs;
545 		uint_t	     nstats;
546 
547 		if (nvlist_lookup_uint64_array(spares[i],
548 		    ZPOOL_CONFIG_VDEV_STATS, &nvlist_array, &nstats) != 0) {
549 			syslog(LOG_ERR, "CaseFile::ActivateSpare: Could not "
550 			       "find vdev stats for pool %s, spare %d",
551 			       poolname, i);
552 			return (false);
553 		}
554 		vs = reinterpret_cast<vdev_stat_t *>(nvlist_array);
555 
556 		if ((vs->vs_aux != VDEV_AUX_SPARED)
557 		 && (vs->vs_state == VDEV_STATE_HEALTHY)) {
558 			/* We found a usable spare */
559 			break;
560 		}
561 	}
562 
563 	if (i == nspares) {
564 		/* No available spares were found */
565 		return (false);
566 	}
567 
568 	error = nvlist_lookup_string(spares[i], ZPOOL_CONFIG_PATH, &devPath);
569 	if (error != 0) {
570 		syslog(LOG_ERR, "CaseFile::ActivateSpare: Cannot determine "
571 		       "the path of pool %s, spare %d. Error %d",
572 		       poolname, i, error);
573 		return (false);
574 	}
575 
576 	error = nvlist_lookup_string(spares[i], ZPOOL_CONFIG_TYPE, &vdev_type);
577 	if (error != 0) {
578 		syslog(LOG_ERR, "CaseFile::ActivateSpare: Cannot determine "
579 		       "the vdev type of pool %s, spare %d. Error %d",
580 		       poolname, i, error);
581 		return (false);
582 	}
583 
584 	return (Replace(vdev_type, devPath, /*isspare*/true));
585 }
586 
587 void
588 CaseFile::RegisterCallout(const Event &event)
589 {
590 	timeval now, countdown, elapsed, timestamp, zero, remaining;
591 
592 	gettimeofday(&now, 0);
593 	timestamp = event.GetTimestamp();
594 	timersub(&now, &timestamp, &elapsed);
595 	timersub(&s_removeGracePeriod, &elapsed, &countdown);
596 	/*
597 	 * If countdown is <= zero, Reset the timer to the
598 	 * smallest positive time value instead
599 	 */
600 	timerclear(&zero);
601 	if (timercmp(&countdown, &zero, <=)) {
602 		timerclear(&countdown);
603 		countdown.tv_usec = 1;
604 	}
605 
606 	remaining = m_tentativeTimer.TimeRemaining();
607 
608 	if (!m_tentativeTimer.IsPending()
609 	 || timercmp(&countdown, &remaining, <))
610 		m_tentativeTimer.Reset(countdown, OnGracePeriodEnded, this);
611 }
612 
613 
614 bool
615 CaseFile::CloseIfSolved()
616 {
617 	if (m_events.empty()
618 	 && m_tentativeEvents.empty()) {
619 
620 		/*
621 		 * We currently do not track or take actions on
622 		 * devices in the degraded or faulted state.
623 		 * Once we have support for spare pools, we'll
624 		 * retain these cases so that any spares added in
625 		 * the future can be applied to them.
626 		 */
627 		switch (VdevState()) {
628 		case VDEV_STATE_HEALTHY:
629 			/* No need to keep cases for healthy vdevs */
630 			Close();
631 			return (true);
632 		case VDEV_STATE_REMOVED:
633 		case VDEV_STATE_CANT_OPEN:
634 			/*
635 			 * Keep open.  We may solve it with a newly inserted
636 			 * device.
637 			 */
638 		case VDEV_STATE_FAULTED:
639 		case VDEV_STATE_DEGRADED:
640 			/*
641 			 * Keep open.  We may solve it with the future
642 			 * addition of a spare to the pool
643 			 */
644 		case VDEV_STATE_UNKNOWN:
645 		case VDEV_STATE_CLOSED:
646 		case VDEV_STATE_OFFLINE:
647 			/*
648 			 * Keep open?  This may not be the correct behavior,
649 			 * but it's what we've always done
650 			 */
651 			;
652 		}
653 
654 		/*
655 		 * Re-serialize the case in order to remove any
656 		 * previous event data.
657 		 */
658 		Serialize();
659 	}
660 
661 	return (false);
662 }
663 
664 void
665 CaseFile::Log()
666 {
667 	syslog(LOG_INFO, "CaseFile(%s,%s,%s)\n", PoolGUIDString().c_str(),
668 	       VdevGUIDString().c_str(), PhysicalPath().c_str());
669 	syslog(LOG_INFO, "\tVdev State = %s\n",
670 	       zpool_state_to_name(VdevState(), VDEV_AUX_NONE));
671 	if (m_tentativeEvents.size() != 0) {
672 		syslog(LOG_INFO, "\t=== Tentative Events ===\n");
673 		for (EventList::iterator event(m_tentativeEvents.begin());
674 		     event != m_tentativeEvents.end(); event++)
675 			(*event)->Log(LOG_INFO);
676 	}
677 	if (m_events.size() != 0) {
678 		syslog(LOG_INFO, "\t=== Events ===\n");
679 		for (EventList::iterator event(m_events.begin());
680 		     event != m_events.end(); event++)
681 			(*event)->Log(LOG_INFO);
682 	}
683 }
684 
685 //- CaseFile Static Protected Methods ------------------------------------------
686 void
687 CaseFile::OnGracePeriodEnded(void *arg)
688 {
689 	CaseFile &casefile(*static_cast<CaseFile *>(arg));
690 
691 	casefile.OnGracePeriodEnded();
692 }
693 
694 int
695 CaseFile::DeSerializeSelector(const struct dirent *dirEntry)
696 {
697 	uint64_t poolGUID;
698 	uint64_t vdevGUID;
699 
700 	if (dirEntry->d_type == DT_REG
701 	 && sscanf(dirEntry->d_name, "pool_%" PRIu64 "_vdev_%" PRIu64 ".case",
702 		   &poolGUID, &vdevGUID) == 2)
703 		return (1);
704 	return (0);
705 }
706 
707 void
708 CaseFile::DeSerializeFile(const char *fileName)
709 {
710 	string	  fullName(s_caseFilePath + '/' + fileName);
711 	CaseFile *existingCaseFile(NULL);
712 	CaseFile *caseFile(NULL);
713 
714 	try {
715 		uint64_t poolGUID;
716 		uint64_t vdevGUID;
717 		nvlist_t *vdevConf;
718 
719 		if (sscanf(fileName, "pool_%" PRIu64 "_vdev_%" PRIu64 ".case",
720 		       &poolGUID, &vdevGUID) != 2) {
721 			throw ZfsdException("CaseFile::DeSerialize: "
722 			    "Unintelligible CaseFile filename %s.\n", fileName);
723 		}
724 		existingCaseFile = Find(Guid(poolGUID), Guid(vdevGUID));
725 		if (existingCaseFile != NULL) {
726 			/*
727 			 * If the vdev is already degraded or faulted,
728 			 * there's no point in keeping the state around
729 			 * that we use to put a drive into the degraded
730 			 * state.  However, if the vdev is simply missing,
731 			 * preserve the case data in the hopes that it will
732 			 * return.
733 			 */
734 			caseFile = existingCaseFile;
735 			vdev_state curState(caseFile->VdevState());
736 			if (curState > VDEV_STATE_CANT_OPEN
737 			 && curState < VDEV_STATE_HEALTHY) {
738 				unlink(fileName);
739 				return;
740 			}
741 		} else {
742 			ZpoolList zpl(ZpoolList::ZpoolByGUID, &poolGUID);
743 			if (zpl.empty()
744 			 || (vdevConf = VdevIterator(zpl.front())
745 						    .Find(vdevGUID)) == NULL) {
746 				/*
747 				 * Either the pool no longer exists
748 				 * or this vdev is no longer a member of
749 				 * the pool.
750 				 */
751 				unlink(fullName.c_str());
752 				return;
753 			}
754 
755 			/*
756 			 * Any vdev we find that does not have a case file
757 			 * must be in the healthy state and thus worthy of
758 			 * continued SERD data tracking.
759 			 */
760 			caseFile = new CaseFile(Vdev(zpl.front(), vdevConf));
761 		}
762 
763 		ifstream caseStream(fullName.c_str());
764 		if (!caseStream)
765 			throw ZfsdException("CaseFile::DeSerialize: Unable to "
766 					    "read %s.\n", fileName);
767 
768 		caseFile->DeSerialize(caseStream);
769 	} catch (const ParseException &exp) {
770 
771 		exp.Log();
772 		if (caseFile != existingCaseFile)
773 			delete caseFile;
774 
775 		/*
776 		 * Since we can't parse the file, unlink it so we don't
777 		 * trip over it again.
778 		 */
779 		unlink(fileName);
780 	} catch (const ZfsdException &zfsException) {
781 
782 		zfsException.Log();
783 		if (caseFile != existingCaseFile)
784 			delete caseFile;
785 	}
786 }
787 
788 //- CaseFile Protected Methods -------------------------------------------------
789 CaseFile::CaseFile(const Vdev &vdev)
790  : m_poolGUID(vdev.PoolGUID()),
791    m_vdevGUID(vdev.GUID()),
792    m_vdevState(vdev.State()),
793    m_vdevPhysPath(vdev.PhysicalPath())
794 {
795 	stringstream guidString;
796 
797 	guidString << m_vdevGUID;
798 	m_vdevGUIDString = guidString.str();
799 	guidString.str("");
800 	guidString << m_poolGUID;
801 	m_poolGUIDString = guidString.str();
802 
803 	s_activeCases.push_back(this);
804 
805 	syslog(LOG_INFO, "Creating new CaseFile:\n");
806 	Log();
807 }
808 
809 CaseFile::~CaseFile()
810 {
811 	PurgeEvents();
812 	PurgeTentativeEvents();
813 	m_tentativeTimer.Stop();
814 	s_activeCases.remove(this);
815 }
816 
817 void
818 CaseFile::PurgeEvents()
819 {
820 	for (EventList::iterator event(m_events.begin());
821 	     event != m_events.end(); event++)
822 		delete *event;
823 
824 	m_events.clear();
825 }
826 
827 void
828 CaseFile::PurgeTentativeEvents()
829 {
830 	for (EventList::iterator event(m_tentativeEvents.begin());
831 	     event != m_tentativeEvents.end(); event++)
832 		delete *event;
833 
834 	m_tentativeEvents.clear();
835 }
836 
837 void
838 CaseFile::SerializeEvList(const EventList events, int fd,
839 		const char* prefix) const
840 {
841 	if (events.empty())
842 		return;
843 	for (EventList::const_iterator curEvent = events.begin();
844 	     curEvent != events.end(); curEvent++) {
845 		const string &eventString((*curEvent)->GetEventString());
846 
847 		// TODO: replace many write(2) calls with a single writev(2)
848 		if (prefix)
849 			write(fd, prefix, strlen(prefix));
850 		write(fd, eventString.c_str(), eventString.length());
851 	}
852 }
853 
854 void
855 CaseFile::Serialize()
856 {
857 	stringstream saveFile;
858 
859 	saveFile << setfill('0')
860 		 << s_caseFilePath << "/"
861 		 << "pool_" << PoolGUIDString()
862 		 << "_vdev_" << VdevGUIDString()
863 		 << ".case";
864 
865 	if (m_events.empty() && m_tentativeEvents.empty()) {
866 		unlink(saveFile.str().c_str());
867 		return;
868 	}
869 
870 	int fd(open(saveFile.str().c_str(), O_CREAT|O_TRUNC|O_WRONLY, 0644));
871 	if (fd == -1) {
872 		syslog(LOG_ERR, "CaseFile::Serialize: Unable to open %s.\n",
873 		       saveFile.str().c_str());
874 		return;
875 	}
876 	SerializeEvList(m_events, fd);
877 	SerializeEvList(m_tentativeEvents, fd, "tentative ");
878 	close(fd);
879 }
880 
881 /*
882  * XXX: This method assumes that events may not contain embedded newlines.  If
883  * ever events can contain embedded newlines, then CaseFile must switch
884  * serialization formats
885  */
886 void
887 CaseFile::DeSerialize(ifstream &caseStream)
888 {
889 	string	      evString;
890 	const EventFactory &factory(ZfsDaemon::Get().GetFactory());
891 
892 	caseStream >> std::noskipws >> std::ws;
893 	while (caseStream.good()) {
894 		/*
895 		 * Outline:
896 		 * read the beginning of a line and check it for
897 		 * "tentative".  If found, discard "tentative".
898 		 * Create a new event
899 		 * continue
900 		 */
901 		EventList* destEvents;
902 		const string tentFlag("tentative ");
903 		string line;
904 		std::stringbuf lineBuf;
905 
906 		caseStream.get(lineBuf);
907 		caseStream.ignore();  /*discard the newline character*/
908 		line = lineBuf.str();
909 		if (line.compare(0, tentFlag.size(), tentFlag) == 0) {
910 			/* Discard "tentative" */
911 			line.erase(0, tentFlag.size());
912 			destEvents = &m_tentativeEvents;
913 		} else {
914 			destEvents = &m_events;
915 		}
916 		Event *event(Event::CreateEvent(factory, line));
917 		if (event != NULL) {
918 			destEvents->push_back(event);
919 			RegisterCallout(*event);
920 		}
921 	}
922 }
923 
924 void
925 CaseFile::Close()
926 {
927 	/*
928 	 * This case is no longer relevant.  Clean up our
929 	 * serialization file, and delete the case.
930 	 */
931 	syslog(LOG_INFO, "CaseFile(%s,%s) closed - State %s\n",
932 	       PoolGUIDString().c_str(), VdevGUIDString().c_str(),
933 	       zpool_state_to_name(VdevState(), VDEV_AUX_NONE));
934 
935 	/*
936 	 * Serialization of a Case with no event data, clears the
937 	 * Serialization data for that event.
938 	 */
939 	PurgeEvents();
940 	Serialize();
941 
942 	delete this;
943 }
944 
945 void
946 CaseFile::OnGracePeriodEnded()
947 {
948 	bool should_fault, should_degrade;
949 	ZpoolList zpl(ZpoolList::ZpoolByGUID, &m_poolGUID);
950 	zpool_handle_t *zhp(zpl.empty() ? NULL : zpl.front());
951 
952 	m_events.splice(m_events.begin(), m_tentativeEvents);
953 	should_fault = ShouldFault();
954 	should_degrade = ShouldDegrade();
955 
956 	if (should_fault || should_degrade) {
957 		if (zhp == NULL
958 		 || (VdevIterator(zhp).Find(m_vdevGUID)) == NULL) {
959 			/*
960 			 * Either the pool no longer exists
961 			 * or this vdev is no longer a member of
962 			 * the pool.
963 			 */
964 			Close();
965 			return;
966 		}
967 
968 	}
969 
970 	/* A fault condition has priority over a degrade condition */
971 	if (ShouldFault()) {
972 		/* Fault the vdev and close the case. */
973 		if (zpool_vdev_fault(zhp, (uint64_t)m_vdevGUID,
974 				       VDEV_AUX_ERR_EXCEEDED) == 0) {
975 			syslog(LOG_INFO, "Faulting vdev(%s/%s)",
976 			       PoolGUIDString().c_str(),
977 			       VdevGUIDString().c_str());
978 			Close();
979 			return;
980 		}
981 		else {
982 			syslog(LOG_ERR, "Fault vdev(%s/%s): %s: %s\n",
983 			       PoolGUIDString().c_str(),
984 			       VdevGUIDString().c_str(),
985 			       libzfs_error_action(g_zfsHandle),
986 			       libzfs_error_description(g_zfsHandle));
987 		}
988 	}
989 	else if (ShouldDegrade()) {
990 		/* Degrade the vdev and close the case. */
991 		if (zpool_vdev_degrade(zhp, (uint64_t)m_vdevGUID,
992 				       VDEV_AUX_ERR_EXCEEDED) == 0) {
993 			syslog(LOG_INFO, "Degrading vdev(%s/%s)",
994 			       PoolGUIDString().c_str(),
995 			       VdevGUIDString().c_str());
996 			Close();
997 			return;
998 		}
999 		else {
1000 			syslog(LOG_ERR, "Degrade vdev(%s/%s): %s: %s\n",
1001 			       PoolGUIDString().c_str(),
1002 			       VdevGUIDString().c_str(),
1003 			       libzfs_error_action(g_zfsHandle),
1004 			       libzfs_error_description(g_zfsHandle));
1005 		}
1006 	}
1007 	Serialize();
1008 }
1009 
1010 Vdev
1011 CaseFile::BeingReplacedBy(zpool_handle_t *zhp) {
1012 	Vdev vd(zhp, CaseVdev(zhp));
1013 	std::list<Vdev> children;
1014 	std::list<Vdev>::iterator children_it;
1015 
1016 	Vdev parent(vd.Parent());
1017 	Vdev replacing(NonexistentVdev);
1018 
1019 	/*
1020 	 * To determine whether we are being replaced by another spare that
1021 	 * is still working, then make sure that it is currently spared and
1022 	 * that the spare is either resilvering or healthy.  If any of these
1023 	 * conditions fail, then we are not being replaced by a spare.
1024 	 *
1025 	 * If the spare is healthy, then the case file should be closed very
1026 	 * soon after this check.
1027 	 */
1028 	if (parent.DoesNotExist()
1029 	 || parent.Name(zhp, /*verbose*/false) != "spare")
1030 		return (NonexistentVdev);
1031 
1032 	children = parent.Children();
1033 	children_it = children.begin();
1034 	for (;children_it != children.end(); children_it++) {
1035 		Vdev child = *children_it;
1036 
1037 		/* Skip our vdev. */
1038 		if (child.GUID() == VdevGUID())
1039 			continue;
1040 		/*
1041 		 * Accept the first child that doesn't match our GUID, or
1042 		 * any resilvering/healthy device if one exists.
1043 		 */
1044 		if (replacing.DoesNotExist() || child.IsResilvering()
1045 		 || child.State() == VDEV_STATE_HEALTHY)
1046 			replacing = child;
1047 	}
1048 
1049 	return (replacing);
1050 }
1051 
1052 bool
1053 CaseFile::Replace(const char* vdev_type, const char* path, bool isspare) {
1054 	nvlist_t *nvroot, *newvd;
1055 	const char *poolname;
1056 	string oldstr(VdevGUIDString());
1057 	bool retval = true;
1058 
1059 	/* Figure out what pool we're working on */
1060 	ZpoolList zpl(ZpoolList::ZpoolByGUID, &m_poolGUID);
1061 	zpool_handle_t *zhp(zpl.empty() ? NULL : zpl.front());
1062 	if (zhp == NULL) {
1063 		syslog(LOG_ERR, "CaseFile::Replace: could not find pool for "
1064 		       "pool_guid %" PRIu64 ".", (uint64_t)m_poolGUID);
1065 		return (false);
1066 	}
1067 	poolname = zpool_get_name(zhp);
1068 	Vdev vd(zhp, CaseVdev(zhp));
1069 	Vdev replaced(BeingReplacedBy(zhp));
1070 
1071 	if (isspare && !vd.IsSpare() && !replaced.DoesNotExist()) {
1072 		/* If we are already being replaced by a working spare, pass. */
1073 		if (replaced.IsResilvering()
1074 		 || replaced.State() == VDEV_STATE_HEALTHY) {
1075 			syslog(LOG_INFO, "CaseFile::Replace(%s->%s): already "
1076 			    "replaced", VdevGUIDString().c_str(), path);
1077 			return (/*consumed*/false);
1078 		}
1079 		/*
1080 		 * If we have already been replaced by a spare, but that spare
1081 		 * is broken, we must spare the spare, not the original device.
1082 		 */
1083 		oldstr = replaced.GUIDString();
1084 		syslog(LOG_INFO, "CaseFile::Replace(%s->%s): sparing "
1085 		    "broken spare %s instead", VdevGUIDString().c_str(),
1086 		    path, oldstr.c_str());
1087 	}
1088 
1089 	/*
1090 	 * Build a root vdev/leaf vdev configuration suitable for
1091 	 * zpool_vdev_attach. Only enough data for the kernel to find
1092 	 * the device (i.e. type and disk device node path) are needed.
1093 	 */
1094 	nvroot = NULL;
1095 	newvd = NULL;
1096 
1097 	if (nvlist_alloc(&nvroot, NV_UNIQUE_NAME, 0) != 0
1098 	 || nvlist_alloc(&newvd, NV_UNIQUE_NAME, 0) != 0) {
1099 		syslog(LOG_ERR, "Replace vdev(%s/%s): Unable to allocate "
1100 		    "configuration data.", poolname, oldstr.c_str());
1101 		if (nvroot != NULL)
1102 			nvlist_free(nvroot);
1103 		return (false);
1104 	}
1105 	if (nvlist_add_string(newvd, ZPOOL_CONFIG_TYPE, vdev_type) != 0
1106 	 || nvlist_add_string(newvd, ZPOOL_CONFIG_PATH, path) != 0
1107 	 || nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT) != 0
1108 	 || nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
1109 				    &newvd, 1) != 0) {
1110 		syslog(LOG_ERR, "Replace vdev(%s/%s): Unable to initialize "
1111 		    "configuration data.", poolname, oldstr.c_str());
1112 		nvlist_free(newvd);
1113 		nvlist_free(nvroot);
1114 		return (true);
1115 	}
1116 
1117 	/* Data was copied when added to the root vdev. */
1118 	nvlist_free(newvd);
1119 
1120 	retval = (zpool_vdev_attach(zhp, oldstr.c_str(), path, nvroot,
1121 	    /*replace*/B_TRUE) == 0);
1122 	if (retval)
1123 		syslog(LOG_INFO, "Replacing vdev(%s/%s) with %s\n",
1124 		    poolname, oldstr.c_str(), path);
1125 	else
1126 		syslog(LOG_ERR, "Replace vdev(%s/%s): %s: %s\n",
1127 		    poolname, oldstr.c_str(), libzfs_error_action(g_zfsHandle),
1128 		    libzfs_error_description(g_zfsHandle));
1129 	nvlist_free(nvroot);
1130 
1131 	return (retval);
1132 }
1133 
1134 /* Does the argument event refer to a checksum error? */
1135 static bool
1136 IsChecksumEvent(const Event* const event)
1137 {
1138 	return ("ereport.fs.zfs.checksum" == event->Value("type"));
1139 }
1140 
1141 /* Does the argument event refer to an IO error? */
1142 static bool
1143 IsIOEvent(const Event* const event)
1144 {
1145 	return ("ereport.fs.zfs.io" == event->Value("type"));
1146 }
1147 
1148 bool
1149 CaseFile::ShouldDegrade() const
1150 {
1151 	return (std::count_if(m_events.begin(), m_events.end(),
1152 			      IsChecksumEvent) > ZFS_DEGRADE_IO_COUNT);
1153 }
1154 
1155 bool
1156 CaseFile::ShouldFault() const
1157 {
1158 	return (std::count_if(m_events.begin(), m_events.end(),
1159 			      IsIOEvent) > ZFS_DEGRADE_IO_COUNT);
1160 }
1161 
1162 nvlist_t *
1163 CaseFile::CaseVdev(zpool_handle_t *zhp) const
1164 {
1165 	return (VdevIterator(zhp).Find(VdevGUID()));
1166 }
1167