xref: /freebsd/cddl/usr.sbin/zfsd/case_file.h (revision 069ac18495ad8fde2748bc94b0f80a50250bb01d)
1 /*-
2  * Copyright (c) 2011, 2012, 2013 Spectra Logic Corporation
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions, and the following disclaimer,
10  *    without modification.
11  * 2. Redistributions in binary form must reproduce at minimum a disclaimer
12  *    substantially similar to the "NO WARRANTY" disclaimer below
13  *    ("Disclaimer") and any redistribution must be conditioned upon
14  *    including a substantially similar Disclaimer requirement for further
15  *    binary redistribution.
16  *
17  * NO WARRANTY
18  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR
21  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22  * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
26  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
27  * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
28  * POSSIBILITY OF SUCH DAMAGES.
29  *
30  * Authors: Justin T. Gibbs     (Spectra Logic Corporation)
31  */
32 
33 /**
34  * \file case_file.h
35  *
36  * CaseFile objects aggregate vdev faults that may require ZFSD action
37  * in order to maintain the health of a ZFS pool.
38  *
39  * Header requirements:
40  *
41  *    #include <list>
42  *
43  *    #include "callout.h"
44  *    #include "zfsd_event.h"
45  */
46 #ifndef _CASE_FILE_H_
47 #define	_CASE_FILE_H_
48 
49 /*=========================== Forward Declarations ===========================*/
/*
 * CaseFile is declared ahead of its definition so that the CaseFileList
 * typedef below can name it.  Vdev is only forward declared here; this
 * header does not provide its definition.
 */
50 class CaseFile;
51 class Vdev;
52 
53 /*============================= Class Definitions ============================*/
54 /*------------------------------- CaseFileList -------------------------------*/
55 /**
56  * CaseFileList is a std::list of pointers to CaseFile objects.
 *
 * It is the type of CaseFile::s_activeCases and of the list that the
 * three-argument CaseFile::Find() overload takes by reference.
57  */
58 typedef std::list< CaseFile *> CaseFileList;
59 
60 /*--------------------------------- CaseFile ---------------------------------*/
61 /**
62  * A CaseFile object is instantiated anytime a vdev for an active pool
63  * experiences an I/O error, is faulted by ZFS, or is determined to be
64  * missing/removed.
65  *
66  * A vdev may have at most one CaseFile.
67  *
68  * CaseFiles are retired when a vdev leaves an active pool configuration
69  * or an action is taken to resolve the issues recorded in the CaseFile.
70  *
71  * Logging a case against a vdev does not imply that an immediate action
72  * to resolve a fault is required or even desired.  For example, a CaseFile
73  * must accumulate a number of I/O errors in order to flag a device as
74  * degraded.
75  *
76  * Vdev I/O errors are not recorded in ZFS label information.  For this
77  * reason, CaseFile%%s with accumulated I/O error events are serialized
78  * to the file system so that they survive across boots.  Currently all
79  * other fault types can be reconstructed from ZFS label information, so
80  * CaseFile%%s for missing, faulted, or degraded members are just recreated
81  * at ZFSD startup instead of being deserialized from the file system.
82  */
83 class CaseFile
84 {
85 public:
86 	/**
87 	 * \brief Find a CaseFile object by a vdev's pool/vdev GUID tuple.
88 	 *
89 	 * \param poolGUID  Pool GUID for the vdev of the CaseFile to find.
90 	 * 		    If InvalidGuid, then only match the vdev GUID
91 	 * 		    instead of both pool and vdev GUIDs.
92 	 * \param vdevGUID  Vdev GUID for the vdev of the CaseFile to find.
93 	 *
94 	 * \return  If found, a pointer to a valid CaseFile object.
95 	 *          Otherwise NULL.
96 	 */
97 	static CaseFile *Find(DevdCtl::Guid poolGUID, DevdCtl::Guid vdevGUID);
98 
99 	/**
100 	 * \brief Find multiple CaseFile objects by a vdev's pool/vdev
101 	 *        GUID tuple (special case for spare vdevs)
102 	 *
103 	 * \param poolGUID  Pool GUID for the vdev of the CaseFile to find.
104 	 * 		    If InvalidGuid, then only match the vdev GUID
105 	 * 		    instead of both pool and vdev GUIDs.
106 	 * \param vdevGUID  Vdev GUID for the vdev of the CaseFile to find.
107 	 * \param caseList  Output list of cases associated with the vdev.
108 	 */
109 	static void  Find(DevdCtl::Guid poolGUID, DevdCtl::Guid vdevGUID,
110 				     CaseFileList &caseList);
111 
112 	/**
113 	 * \brief Find a CaseFile object by a vdev's current/last known
114 	 *        physical path.
115 	 *
116 	 * \param physPath  Physical path of the vdev of the CaseFile to find.
117 	 *
118 	 * \return  If found, a pointer to a valid CaseFile object.
119 	 *          Otherwise NULL.
120 	 */
121 	static CaseFile *Find(const string &physPath);
122 
123 	/**
124 	 * \brief ReEvaluate all open cases whose pool guid matches the argument
125 	 *
126 	 * \param poolGUID	Only reevaluate cases for this pool
127 	 * \param event		Try to consume this event with the casefile
128 	 */
129 	static void ReEvaluateByGuid(DevdCtl::Guid poolGUID,
130 				     const ZfsEvent &event);
131 
132 	/**
133 	 * \brief Create or return an existing active CaseFile for the
134 	 *        specified vdev.
135 	 *
136 	 * \param vdev  The vdev object for which to find/create a CaseFile.
137 	 *
138 	 * \return  A reference to a valid CaseFile object.
139 	 */
140 	static CaseFile &Create(Vdev &vdev);
141 
142 	/**
143 	 * \brief Deserialize all serialized CaseFile objects found in
144 	 *        the file system.
145 	 */
146 	static void      DeSerialize();
147 
148 	/**
149 	 * \brief returns true if there are no CaseFiles
150 	 */
151 	static bool	Empty();
152 
153 	/**
154 	 * \brief Emit syslog data on all active CaseFile%%s in the system.
155 	 */
156 	static void      LogAll();
157 
158 	/**
159 	 * \brief Destroy the in-core cache of CaseFile data.
160 	 *
161 	 * This routine does not disturb the on disk, serialized, CaseFile
162 	 * data.
163 	 */
164 	static void      PurgeAll();
165 
	/**
	 * \brief Read-only accessors for the identity and state recorded
	 *        for the vdev tracked by this CaseFile.
	 *
	 * Each is defined inline after the class and returns the
	 * corresponding m_* data member (PhysicalPath() returns
	 * m_vdevPhysPath).  The GUID and state accessors return by value;
	 * the string accessors return a const reference to the member.
	 */
166 	DevdCtl::Guid PoolGUID()       const;
167 	DevdCtl::Guid VdevGUID()       const;
168 	vdev_state    VdevState()      const;
169 	const string &PoolGUIDString() const;
170 	const string &VdevGUIDString() const;
171 	const string &PhysicalPath()   const;
172 
173 	/**
174 	 * \brief Attempt to resolve this CaseFile using the disk
175 	 *        resource at the given device/physical path/vdev object
176 	 *        tuple.
177 	 *
178 	 * \param devPath   The devfs path for the disk resource.
179 	 * \param physPath  The physical path information reported by
180 	 *                  the disk resource.
181 	 * \param vdev      If the disk contains ZFS label information,
182 	 *                  a pointer to the disk label's vdev object
183 	 *                  data.  Otherwise NULL.
184 	 *
185 	 * \return  True if this event was consumed by this CaseFile.
186 	 */
187 	bool ReEvaluate(const string &devPath, const string &physPath,
188 			Vdev *vdev);
189 
190 	/**
191 	 * \brief Update this CaseFile in light of the provided ZfsEvent.
192 	 *
193 	 * Must be virtual so it can be overridden in the unit tests
194 	 *
195 	 * \param event  The ZfsEvent to evaluate.
196 	 *
197 	 * \return  True if this event was consumed by this CaseFile.
198 	 */
199 	virtual bool ReEvaluate(const ZfsEvent &event);
200 
201 	/**
202 	 * \brief Register an itimer callout for the given event, if necessary
	 *
	 * \param event  The event for which a callout may be registered.
203 	 */
204 	virtual void RegisterCallout(const DevdCtl::Event &event);
205 
206 	/**
207 	 * \brief Close a case if it is no longer relevant.
208 	 *
209 	 * This method deals with cases tracking soft errors.  Soft errors
210 	 * will be discarded should a remove event occur within a short period
211 	 * of the soft errors being reported.  We also discard the events
212 	 * if the vdev is marked degraded or failed.
213 	 *
214 	 * \return  True if the case is closed.  False otherwise.
215 	 */
216 	bool CloseIfSolved();
217 
218 	/**
219 	 * \brief Emit data about this CaseFile via syslog(3).
220 	 */
221 	void Log();
222 
223 	/**
224 	 * \brief Whether we should degrade this vdev
225 	 */
226 	bool ShouldDegrade() const;
227 
228 	/**
229 	 * \brief Whether we should fault this vdev
230 	 */
231 	bool ShouldFault() const;
232 
233 	/**
234 	 * \brief Report whether this case's vdev is a spare.
	 *
	 * NOTE(review): the return type is int rather than bool and the
	 * method is not const.  Presumably a nonzero result means "spare"
	 * and the answer is kept in m_is_spare; confirm against the
	 * implementation.
235 	 */
236 	int IsSpare();
237 
238 protected:
239 	enum {
240 		/**
241 		 * The number of soft errors on a vdev required
242 		 * to transition a vdev from healthy to degraded
243 		 * status.
244 		 */
245 		ZFS_DEGRADE_IO_COUNT = 50,
246 		/**
247 		 * The number of delay errors on a vdev required to fault it
248 		 */
249 		ZFS_FAULT_DELAY_COUNT = 8,
250 	};
251 
	/**
	 * \brief Static callout handler, declared through the
	 *        CalloutFunc_t type.
	 *
	 * This shares its name with the non-static OnGracePeriodEnded()
	 * declared further down.
	 *
	 * NOTE(review): presumably this is the function handed to
	 * m_tentativeTimer, and it forwards to the non-static overload on
	 * the affected CaseFile; confirm against the implementation.
	 */
252 	static CalloutFunc_t OnGracePeriodEnded;
253 
254 	/**
255 	 * \brief scandir(3) filter function used to find files containing
256 	 *        serialized CaseFile data.
257 	 *
258 	 * \param dirEntry  Directory entry for the file to filter.
259 	 *
260 	 * \return  Non-zero for a file to include in the selection,
261 	 *          otherwise 0.
262 	 */
263 	static int  DeSerializeSelector(const struct dirent *dirEntry);
264 
265 	/**
266 	 * \brief Given the name of a file containing serialized events from a
267 	 *        CaseFile object, create/update an in-core CaseFile object
268 	 *        representing the serialized data.
269 	 *
270 	 * \param fileName  The name of a file containing serialized events
271 	 *                  from a CaseFile object.
272 	 */
273 	static void DeSerializeFile(const char *fileName);
274 
275 	/**
	 * Constructor.
	 *
	 * Protected, so code outside the class hierarchy obtains CaseFile
	 * objects through the public static Create() instead.
	 *
	 * \param vdev  The vdev the new CaseFile describes.
	 */
276 	CaseFile(const Vdev &vdev);
277 
278 	/**
279 	 * Destructor.
280 	 * Must be virtual so it can be subclassed in the unit tests
281 	 */
282 	virtual ~CaseFile();
283 
284 	/**
285 	 * \brief Reload state for the vdev associated with this CaseFile.
286 	 *
287 	 * \return  True if the refresh was successful.  False if the system
288 	 *          has no record of the pool or vdev for this CaseFile.
289 	 */
290 	virtual bool RefreshVdevState();
291 
292 	/**
293 	 * \brief Free all events in the m_events list.
294 	 */
295 	void PurgeEvents();
296 
297 	/**
298 	 * \brief Free all events in the m_tentativeEvents list.
299 	 */
300 	void PurgeTentativeEvents();
301 
302 	/**
303 	 * \brief Commit to file system storage.
304 	 */
305 	void Serialize();
306 
307 	/**
308 	 * \brief Retrieve event data from a serialization stream.
309 	 *
310 	 * \param caseStream  The serialization stream to parse.
311 	 */
312 	void DeSerialize(std::ifstream &caseStream);
313 
314 	/**
315 	 * \brief Serializes the supplied event list and writes it to fd
316 	 *
	 * \param events  The event list to serialize.  Note that it is
	 *                passed by value.
	 * \param fd      The file descriptor the serialized events are
	 *                written to.
317 	 * \param prefix  If not NULL, this prefix will be prepended to
318 	 *                every event in the file.
319 	 */
320 	void SerializeEvList(const DevdCtl::EventList events, int fd,
321 			     const char* prefix=NULL) const;
322 
323 	/**
324 	 * \brief Unconditionally close a CaseFile.
325 	 */
326 	virtual void Close();
327 
328 	/**
329 	 * \brief Callout callback invoked when the remove timer grace
330 	 *        period expires.
331 	 *
332 	 * If no remove events are received prior to the grace period
333 	 * firing, then any tentative events are promoted and counted
334 	 * against the health of the vdev.
335 	 */
336 	void OnGracePeriodEnded();
337 
338 	/**
339 	 * \brief Attempt to activate a spare on this case's pool.
340 	 *
341 	 * Call this whenever a pool becomes degraded.  It will look for any
342 	 * spare devices and activate one to replace the casefile's vdev.  It
343 	 * will _not_ close the casefile; that should only happen when the
344 	 * missing drive is replaced or the user promotes the spare.
345 	 *
346 	 * \return True if a spare was activated
347 	 */
348 	bool ActivateSpare();
349 
350 	/**
351 	 * \brief replace a pool's vdev with another
352 	 *
353 	 * \param vdev_type   The type of the new vdev.  Usually either
354 	 *                    VDEV_TYPE_DISK or VDEV_TYPE_FILE
355 	 * \param path        The file system path to the new vdev
356 	 * \param isspare     Whether the new vdev is a spare
357 	 *
358 	 * \return            true iff the replacement was successful
359 	 */
360 	bool Replace(const char* vdev_type, const char* path, bool isspare);
361 
362 	/**
363 	 * \brief Which vdev, if any, is replacing ours.
364 	 *
365 	 * \param zhp		Pool handle state from the caller context
366 	 *
367 	 * \return		the vdev that is currently replacing ours,
368 	 *			or NonexistentVdev if there isn't one.
369 	 */
370 	Vdev BeingReplacedBy(zpool_handle_t *zhp);
371 
372 	/**
373 	 * \brief All CaseFiles being tracked by ZFSD.
374 	 */
375 	static CaseFileList  s_activeCases;
376 
377 	/**
378 	 * \brief The file system path to serialized CaseFile data.
379 	 */
380 	static const string  s_caseFilePath;
381 
382 	/**
383 	 * \brief The time ZFSD waits before promoting a tentative event
384 	 *        into a permanent event.
385 	 */
386 	static const timeval s_removeGracePeriod;
387 
388 	/**
389 	 * \brief A list of soft error events counted against the health of
390 	 *        a vdev.
391 	 */
392 	DevdCtl::EventList m_events;
393 
394 	/**
395 	 * \brief A list of soft error events waiting for a grace period
396 	 *        expiration before being counted against the health of
397 	 *        a vdev.
398 	 */
399 	DevdCtl::EventList m_tentativeEvents;
400 
	/**
	 * Identity and state recorded for this case's vdev.  In order,
	 * these back the public PoolGUID(), VdevGUID(), VdevState(),
	 * PoolGUIDString(), VdevGUIDString() and PhysicalPath() accessors.
	 */
401 	DevdCtl::Guid	   m_poolGUID;
402 	DevdCtl::Guid	   m_vdevGUID;
403 	vdev_state	   m_vdevState;
404 	string		   m_poolGUIDString;
405 	string		   m_vdevGUIDString;
406 	string		   m_vdevPhysPath;
	/**
	 * NOTE(review): presumably the stored answer for IsSpare(); held as
	 * an int rather than a bool.  Confirm against the implementation.
	 */
407 	int		   m_is_spare;
408 
409 	/**
410 	 * \brief Callout activated when a grace period expires.
	 *
	 * NOTE(review): presumably armed for s_removeGracePeriod while
	 * events sit in m_tentativeEvents; confirm against the
	 * implementation.
411 	 */
412 	Callout		  m_tentativeTimer;
413 
414 private:
	/**
	 * NOTE(review): undocumented in the original.  Presumably looks up
	 * the nvlist describing this case's vdev within the pool referred
	 * to by zhp; the meaning of the return value when no such vdev
	 * exists is not visible from this header.  Confirm against the
	 * implementation.
	 */
415 	nvlist_t	*CaseVdev(zpool_handle_t *zhp)	const;
416 };
417 
/**
 * \brief Accessor for the pool GUID recorded in this CaseFile.
 *
 * \return  The value of m_poolGUID, returned by value.
 */
418 inline DevdCtl::Guid
419 CaseFile::PoolGUID() const
420 {
421 	return (m_poolGUID);
422 }
423 
/**
 * \brief Accessor for the vdev GUID recorded in this CaseFile.
 *
 * \return  The value of m_vdevGUID, returned by value.
 */
424 inline DevdCtl::Guid
425 CaseFile::VdevGUID() const
426 {
427 	return (m_vdevGUID);
428 }
429 
/**
 * \brief Accessor for the vdev state stored in this CaseFile.
 *
 * This returns the stored m_vdevState member; it does not itself refresh
 * that value.
 *
 * \return  The value of m_vdevState.
 */
430 inline vdev_state
431 CaseFile::VdevState() const
432 {
433 	return (m_vdevState);
434 }
435 
/**
 * \brief Accessor for the string form of the pool GUID.
 *
 * \return  A const reference to the m_poolGUIDString member; the
 *          reference refers to storage owned by this CaseFile.
 */
436 inline const string &
437 CaseFile::PoolGUIDString() const
438 {
439 	return (m_poolGUIDString);
440 }
441 
/**
 * \brief Accessor for the string form of the vdev GUID.
 *
 * \return  A const reference to the m_vdevGUIDString member; the
 *          reference refers to storage owned by this CaseFile.
 */
442 inline const string &
443 CaseFile::VdevGUIDString() const
444 {
445 	return (m_vdevGUIDString);
446 }
447 
/**
 * \brief Accessor for the physical path recorded for this case's vdev.
 *
 * \return  A const reference to the m_vdevPhysPath member; the
 *          reference refers to storage owned by this CaseFile.
 */
448 inline const string &
449 CaseFile::PhysicalPath() const
450 {
451 	return (m_vdevPhysPath);
452 }
453 
454 #endif /* _CASE_FILE_H_ */
455