xref: /illumos-gate/usr/src/lib/libzfs/common/libzfs_status.c (revision d6bb6a8465e557cb946ef49d56ed3202f6218652)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 /*
29  * This file contains the functions which analyze the status of a pool.  This
30  * includes both the status of an active pool and the status of exported
31  * pools.  Returns one of the ZPOOL_STATUS_* defines describing the status of
32  * the pool.  This status is independent (to a certain degree) from the state of
33  * the pool.  A pool's state describes only whether or not it is capable of
34  * providing the necessary fault tolerance for data.  The status describes the
35  * overall status of devices.  A pool that is online can still have a device
36  * that is experiencing errors.
37  *
38  * Only a subset of the possible faults can be detected using 'zpool status',
39  * and not all possible errors correspond to an FMA message ID.  The explanation
40  * is left up to the caller, depending on whether it is a live pool or an
41  * import.
42  */
43 
44 #include <libzfs.h>
45 #include <string.h>
46 #include "libzfs_impl.h"
47 
/*
 * Message ID table.  This must be kept in sync with the ZPOOL_STATUS_* defines
 * in libzfs.h.  Note that there are some status results which go past the end
 * of this table, and hence have no associated message ID.
 */
static char *msgid_table[] = {
	"ZFS-8000-14",
	"ZFS-8000-2Q",
	"ZFS-8000-3C",
	"ZFS-8000-4J",
	"ZFS-8000-5E",
	"ZFS-8000-6X",
	"ZFS-8000-72",
	"ZFS-8000-8A",
	"ZFS-8000-9P",
	"ZFS-8000-A5"
};

/*
 * Number of status values that have an associated message ID.  Both message
 * ID tables are indexed by status value and are bounded by this count.
 */
#define	NMSGID	(sizeof (msgid_table) / sizeof (msgid_table[0]))

/*
 * If the pool is active, a certain class of static errors is overridden by the
 * faults as analyzed by FMA.  These faults have separate knowledge articles,
 * and the article referred to by 'zpool status' must match that indicated by
 * the syslog error message.  We override missing data as well as corrupt pool.
 *
 * The table is explicitly sized to NMSGID so that it can never be shorter than
 * msgid_table: an entry that is accidentally left out reads as NULL rather
 * than indexing past the end of the array, and an extra entry is diagnosed by
 * the compiler.
 */
static char *msgid_table_active[NMSGID] = {
	"ZFS-8000-14",
	"ZFS-8000-D3",		/* overridden */
	"ZFS-8000-D3",		/* overridden */
	"ZFS-8000-4J",
	"ZFS-8000-5E",
	"ZFS-8000-6X",
	"ZFS-8000-CS",		/* overridden */
	"ZFS-8000-8A",
	"ZFS-8000-9P",
	"ZFS-8000-CS",		/* overridden */
};
86 
87 /* ARGSUSED */
88 static int
89 vdev_missing(uint64_t state, uint64_t aux, uint64_t errs)
90 {
91 	return (state == VDEV_STATE_CANT_OPEN &&
92 	    aux == VDEV_AUX_OPEN_FAILED);
93 }
94 
/*
 * Leaf predicate for find_vdev_problem(): nonzero when the supplied error
 * count is nonzero.  The state and auxiliary state are not consulted.
 */
/* ARGSUSED */
static int
vdev_errors(uint64_t state, uint64_t aux, uint64_t errs)
{
	if (errs == 0)
		return (0);

	return (1);
}
101 
102 /* ARGSUSED */
103 static int
104 vdev_broken(uint64_t state, uint64_t aux, uint64_t errs)
105 {
106 	return (state == VDEV_STATE_CANT_OPEN);
107 }
108 
109 /* ARGSUSED */
110 static int
111 vdev_offlined(uint64_t state, uint64_t aux, uint64_t errs)
112 {
113 	return (state == VDEV_STATE_OFFLINE);
114 }
115 
116 /*
117  * Detect if any leaf devices that have seen errors or could not be opened.
118  */
119 static int
120 find_vdev_problem(nvlist_t *vdev, int (*func)(uint64_t, uint64_t, uint64_t))
121 {
122 	nvlist_t **child;
123 	vdev_stat_t *vs;
124 	uint_t c, children;
125 	char *type;
126 
127 	/*
128 	 * Ignore problems within a 'replacing' vdev, since we're presumably in
129 	 * the process of repairing any such errors, and don't want to call them
130 	 * out again.  We'll pick up the fact that a resilver is happening
131 	 * later.
132 	 */
133 	verify(nvlist_lookup_string(vdev, ZPOOL_CONFIG_TYPE, &type) == 0);
134 	if (strcmp(type, VDEV_TYPE_REPLACING) == 0)
135 		return (FALSE);
136 
137 	if (nvlist_lookup_nvlist_array(vdev, ZPOOL_CONFIG_CHILDREN, &child,
138 	    &children) == 0) {
139 		for (c = 0; c < children; c++)
140 			if (find_vdev_problem(child[c], func))
141 				return (TRUE);
142 	} else {
143 		verify(nvlist_lookup_uint64_array(vdev, ZPOOL_CONFIG_STATS,
144 		    (uint64_t **)&vs, &c) == 0);
145 
146 		if (func(vs->vs_state, vs->vs_aux,
147 		    vs->vs_read_errors +
148 		    vs->vs_write_errors +
149 		    vs->vs_checksum_errors))
150 			return (TRUE);
151 	}
152 
153 	return (FALSE);
154 }
155 
156 /*
157  * Active pool health status.
158  *
159  * To determine the status for a pool, we make several passes over the config,
160  * picking the most egregious error we find.  In order of importance, we do the
161  * following:
162  *
163  *	- Check for a complete and valid configuration
164  *	- Look for any missing devices in a non-replicated config
165  *	- Check for any data errors
166  *	- Check for any missing devices in a replicated config
167  *	- Look for any devices showing errors
168  *	- Check for any resilvering devices
169  *
170  * There can obviously be multiple errors within a single pool, so this routine
171  * only picks the most damaging of all the current errors to report.
172  */
173 static zpool_status_t
174 check_status(nvlist_t *config, int isimport)
175 {
176 	nvlist_t *nvroot;
177 	vdev_stat_t *vs;
178 	uint_t vsc;
179 	uint64_t nerr;
180 
181 	verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
182 	    &nvroot) == 0);
183 	verify(nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_STATS,
184 	    (uint64_t **)&vs, &vsc) == 0);
185 
186 	/*
187 	 * Check that the config is complete.
188 	 */
189 	if (vs->vs_state == VDEV_STATE_CANT_OPEN &&
190 	    vs->vs_aux == VDEV_AUX_BAD_GUID_SUM)
191 		return (ZPOOL_STATUS_BAD_GUID_SUM);
192 
193 	/*
194 	 * Missing devices in non-replicated config.
195 	 */
196 	if (vs->vs_state == VDEV_STATE_CANT_OPEN &&
197 	    find_vdev_problem(nvroot, vdev_missing))
198 		return (ZPOOL_STATUS_MISSING_DEV_NR);
199 
200 	if (vs->vs_state == VDEV_STATE_CANT_OPEN &&
201 	    find_vdev_problem(nvroot, vdev_broken))
202 		return (ZPOOL_STATUS_CORRUPT_LABEL_NR);
203 
204 	/*
205 	 * Corrupted pool metadata
206 	 */
207 	if (vs->vs_state == VDEV_STATE_CANT_OPEN &&
208 	    vs->vs_aux == VDEV_AUX_CORRUPT_DATA)
209 		return (ZPOOL_STATUS_CORRUPT_POOL);
210 
211 	/*
212 	 * Persistent data errors.
213 	 */
214 	if (!isimport) {
215 		if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_ERRCOUNT,
216 		    &nerr) == 0 && nerr != 0)
217 			return (ZPOOL_STATUS_CORRUPT_DATA);
218 	}
219 
220 	/*
221 	 * Missing devices in a replicated config.
222 	 */
223 	if (find_vdev_problem(nvroot, vdev_missing))
224 		return (ZPOOL_STATUS_MISSING_DEV_R);
225 	if (find_vdev_problem(nvroot, vdev_broken))
226 		return (ZPOOL_STATUS_CORRUPT_LABEL_R);
227 
228 	/*
229 	 * Devices with errors
230 	 */
231 	if (!isimport && find_vdev_problem(nvroot, vdev_errors))
232 		return (ZPOOL_STATUS_FAILING_DEV);
233 
234 	/*
235 	 * Offlined devices
236 	 */
237 	if (find_vdev_problem(nvroot, vdev_offlined))
238 		return (ZPOOL_STATUS_OFFLINE_DEV);
239 
240 	/*
241 	 * Currently resilvering
242 	 */
243 	if (!vs->vs_scrub_complete && vs->vs_scrub_type == POOL_SCRUB_RESILVER)
244 		return (ZPOOL_STATUS_RESILVERING);
245 
246 	/*
247 	 * We currently have no way to detect the following errors:
248 	 *
249 	 * 	CORRUPT_CACHE
250 	 * 	VERSION_MISMATCH
251 	 */
252 
253 	return (ZPOOL_STATUS_OK);
254 }
255 
256 zpool_status_t
257 zpool_get_status(zpool_handle_t *zhp, char **msgid)
258 {
259 	zpool_status_t ret = check_status(zhp->zpool_config, FALSE);
260 
261 	if (ret >= NMSGID)
262 		*msgid = NULL;
263 	else
264 		*msgid = msgid_table_active[ret];
265 
266 	return (ret);
267 }
268 
269 zpool_status_t
270 zpool_import_status(nvlist_t *config, char **msgid)
271 {
272 	zpool_status_t ret = check_status(config, TRUE);
273 
274 	if (ret >= NMSGID)
275 		*msgid = NULL;
276 	else
277 		*msgid = msgid_table[ret];
278 
279 	return (ret);
280 }
281