xref: /titanic_44/usr/src/lib/libzfs/common/libzfs_status.c (revision 22eb7cb54d8a6bcf6fe2674cb4b1f0cf2d85cfb6)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 /*
 * This file contains the functions which analyze the status of a pool.  This
 * includes both the status of an active pool and the status of exported
 * pools.  Returns one of the ZPOOL_STATUS_* defines describing the status of
32  * the pool.  This status is independent (to a certain degree) from the state of
33  * the pool.  A pool's state describes only whether or not it is capable of
34  * providing the necessary fault tolerance for data.  The status describes the
35  * overall status of devices.  A pool that is online can still have a device
36  * that is experiencing errors.
37  *
38  * Only a subset of the possible faults can be detected using 'zpool status',
39  * and not all possible errors correspond to a FMA message ID.  The explanation
40  * is left up to the caller, depending on whether it is a live pool or an
41  * import.
42  */
43 
44 #include <libzfs.h>
45 #include <string.h>
46 #include <unistd.h>
47 #include "libzfs_impl.h"
48 
/*
 * Table of message IDs, indexed by pool status value.  The order of the
 * entries must track the ZPOOL_STATUS_* defines in libzfs.h.  Status values
 * that fall beyond the last entry have no message ID associated with them;
 * callers are expected to bounds-check against NMSGID before indexing.
 */
static char *zfs_msgid_table[] = {
	"ZFS-8000-14",
	"ZFS-8000-2Q",
	"ZFS-8000-3C",
	"ZFS-8000-4J",
	"ZFS-8000-5E",
	"ZFS-8000-6X",
	"ZFS-8000-72",
	"ZFS-8000-8A",
	"ZFS-8000-9P",
	"ZFS-8000-A5",
	"ZFS-8000-EY",
	"ZFS-8000-HC",
	"ZFS-8000-JQ",
};

/* Number of entries in zfs_msgid_table. */
#define	NMSGID	(sizeof (zfs_msgid_table) / sizeof (*zfs_msgid_table))
71 
72 /* ARGSUSED */
73 static int
74 vdev_missing(uint64_t state, uint64_t aux, uint64_t errs)
75 {
76 	return (state == VDEV_STATE_CANT_OPEN &&
77 	    aux == VDEV_AUX_OPEN_FAILED);
78 }
79 
80 /* ARGSUSED */
81 static int
82 vdev_faulted(uint64_t state, uint64_t aux, uint64_t errs)
83 {
84 	return (state == VDEV_STATE_FAULTED);
85 }
86 
87 /* ARGSUSED */
88 static int
89 vdev_errors(uint64_t state, uint64_t aux, uint64_t errs)
90 {
91 	return (state == VDEV_STATE_DEGRADED || errs != 0);
92 }
93 
94 /* ARGSUSED */
95 static int
96 vdev_broken(uint64_t state, uint64_t aux, uint64_t errs)
97 {
98 	return (state == VDEV_STATE_CANT_OPEN);
99 }
100 
101 /* ARGSUSED */
102 static int
103 vdev_offlined(uint64_t state, uint64_t aux, uint64_t errs)
104 {
105 	return (state == VDEV_STATE_OFFLINE);
106 }
107 
108 /*
109  * Detect if any leaf devices that have seen errors or could not be opened.
110  */
111 static boolean_t
112 find_vdev_problem(nvlist_t *vdev, int (*func)(uint64_t, uint64_t, uint64_t))
113 {
114 	nvlist_t **child;
115 	vdev_stat_t *vs;
116 	uint_t c, children;
117 	char *type;
118 
119 	/*
120 	 * Ignore problems within a 'replacing' vdev, since we're presumably in
121 	 * the process of repairing any such errors, and don't want to call them
122 	 * out again.  We'll pick up the fact that a resilver is happening
123 	 * later.
124 	 */
125 	verify(nvlist_lookup_string(vdev, ZPOOL_CONFIG_TYPE, &type) == 0);
126 	if (strcmp(type, VDEV_TYPE_REPLACING) == 0)
127 		return (B_FALSE);
128 
129 	if (nvlist_lookup_nvlist_array(vdev, ZPOOL_CONFIG_CHILDREN, &child,
130 	    &children) == 0) {
131 		for (c = 0; c < children; c++)
132 			if (find_vdev_problem(child[c], func))
133 				return (B_TRUE);
134 	} else {
135 		verify(nvlist_lookup_uint64_array(vdev, ZPOOL_CONFIG_STATS,
136 		    (uint64_t **)&vs, &c) == 0);
137 
138 		if (func(vs->vs_state, vs->vs_aux,
139 		    vs->vs_read_errors +
140 		    vs->vs_write_errors +
141 		    vs->vs_checksum_errors))
142 			return (B_TRUE);
143 	}
144 
145 	return (B_FALSE);
146 }
147 
148 /*
149  * Active pool health status.
150  *
151  * To determine the status for a pool, we make several passes over the config,
152  * picking the most egregious error we find.  In order of importance, we do the
153  * following:
154  *
155  *	- Check for a complete and valid configuration
156  *	- Look for any faulted or missing devices in a non-replicated config
157  *	- Check for any data errors
158  *	- Check for any faulted or missing devices in a replicated config
159  *	- Look for any devices showing errors
160  *	- Check for any resilvering devices
161  *
162  * There can obviously be multiple errors within a single pool, so this routine
163  * only picks the most damaging of all the current errors to report.
164  */
165 static zpool_status_t
166 check_status(zpool_handle_t *zhp, nvlist_t *config, boolean_t isimport)
167 {
168 	nvlist_t *nvroot;
169 	vdev_stat_t *vs;
170 	uint_t vsc;
171 	uint64_t nerr;
172 	uint64_t version;
173 	uint64_t stateval;
174 	uint64_t hostid = 0;
175 
176 	verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
177 	    &version) == 0);
178 	verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
179 	    &nvroot) == 0);
180 	verify(nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_STATS,
181 	    (uint64_t **)&vs, &vsc) == 0);
182 	verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE,
183 	    &stateval) == 0);
184 	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_HOSTID, &hostid);
185 
186 	/*
187 	 * Pool last accessed by another system.
188 	 */
189 	if (hostid != 0 && (unsigned long)hostid != gethostid() &&
190 	    stateval == POOL_STATE_ACTIVE)
191 		return (ZPOOL_STATUS_HOSTID_MISMATCH);
192 
193 	/*
194 	 * Newer on-disk version.
195 	 */
196 	if (vs->vs_state == VDEV_STATE_CANT_OPEN &&
197 	    vs->vs_aux == VDEV_AUX_VERSION_NEWER)
198 		return (ZPOOL_STATUS_VERSION_NEWER);
199 
200 	/*
201 	 * Check that the config is complete.
202 	 */
203 	if (vs->vs_state == VDEV_STATE_CANT_OPEN &&
204 	    vs->vs_aux == VDEV_AUX_BAD_GUID_SUM)
205 		return (ZPOOL_STATUS_BAD_GUID_SUM);
206 
207 	/*
208 	 * Pool has experienced failed I/O.
209 	 */
210 	if (stateval == POOL_STATE_IO_FAILURE) {
211 		zpool_handle_t *tmp_zhp = NULL;
212 		libzfs_handle_t *hdl = NULL;
213 		char property[ZPOOL_MAXPROPLEN];
214 		char *failmode = NULL;
215 
216 		if (zhp == NULL) {
217 			char *poolname;
218 
219 			verify(nvlist_lookup_string(config,
220 			    ZPOOL_CONFIG_POOL_NAME, &poolname) == 0);
221 			if ((hdl = libzfs_init()) == NULL)
222 				return (ZPOOL_STATUS_IO_FAILURE_WAIT);
223 			tmp_zhp = zpool_open_canfail(hdl, poolname);
224 			if (tmp_zhp == NULL) {
225 				libzfs_fini(hdl);
226 				return (ZPOOL_STATUS_IO_FAILURE_WAIT);
227 			}
228 		}
229 		if (zpool_get_prop(zhp ? zhp : tmp_zhp, ZPOOL_PROP_FAILUREMODE,
230 		    property, sizeof (property), NULL) == 0)
231 			failmode = property;
232 		if (tmp_zhp != NULL)
233 			zpool_close(tmp_zhp);
234 		if (hdl != NULL)
235 			libzfs_fini(hdl);
236 		if (failmode == NULL)
237 			return (ZPOOL_STATUS_IO_FAILURE_WAIT);
238 
239 		if (strncmp(failmode, "continue", strlen("continue")) == 0)
240 			return (ZPOOL_STATUS_IO_FAILURE_CONTINUE);
241 		else
242 			return (ZPOOL_STATUS_IO_FAILURE_WAIT);
243 	}
244 
245 	/*
246 	 * Bad devices in non-replicated config.
247 	 */
248 	if (vs->vs_state == VDEV_STATE_CANT_OPEN &&
249 	    find_vdev_problem(nvroot, vdev_faulted))
250 		return (ZPOOL_STATUS_FAULTED_DEV_NR);
251 
252 	if (vs->vs_state == VDEV_STATE_CANT_OPEN &&
253 	    find_vdev_problem(nvroot, vdev_missing))
254 		return (ZPOOL_STATUS_MISSING_DEV_NR);
255 
256 	if (vs->vs_state == VDEV_STATE_CANT_OPEN &&
257 	    find_vdev_problem(nvroot, vdev_broken))
258 		return (ZPOOL_STATUS_CORRUPT_LABEL_NR);
259 
260 	/*
261 	 * Corrupted pool metadata
262 	 */
263 	if (vs->vs_state == VDEV_STATE_CANT_OPEN &&
264 	    vs->vs_aux == VDEV_AUX_CORRUPT_DATA)
265 		return (ZPOOL_STATUS_CORRUPT_POOL);
266 
267 	/*
268 	 * Persistent data errors.
269 	 */
270 	if (!isimport) {
271 		if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_ERRCOUNT,
272 		    &nerr) == 0 && nerr != 0)
273 			return (ZPOOL_STATUS_CORRUPT_DATA);
274 	}
275 
276 	/*
277 	 * Missing devices in a replicated config.
278 	 */
279 	if (find_vdev_problem(nvroot, vdev_faulted))
280 		return (ZPOOL_STATUS_FAULTED_DEV_R);
281 	if (find_vdev_problem(nvroot, vdev_missing))
282 		return (ZPOOL_STATUS_MISSING_DEV_R);
283 	if (find_vdev_problem(nvroot, vdev_broken))
284 		return (ZPOOL_STATUS_CORRUPT_LABEL_R);
285 
286 	/*
287 	 * Devices with errors
288 	 */
289 	if (!isimport && find_vdev_problem(nvroot, vdev_errors))
290 		return (ZPOOL_STATUS_FAILING_DEV);
291 
292 	/*
293 	 * Offlined devices
294 	 */
295 	if (find_vdev_problem(nvroot, vdev_offlined))
296 		return (ZPOOL_STATUS_OFFLINE_DEV);
297 
298 	/*
299 	 * Currently resilvering
300 	 */
301 	if (!vs->vs_scrub_complete && vs->vs_scrub_type == POOL_SCRUB_RESILVER)
302 		return (ZPOOL_STATUS_RESILVERING);
303 
304 	/*
305 	 * Outdated, but usable, version
306 	 */
307 	if (version < SPA_VERSION)
308 		return (ZPOOL_STATUS_VERSION_OLDER);
309 
310 	return (ZPOOL_STATUS_OK);
311 }
312 
313 zpool_status_t
314 zpool_get_status(zpool_handle_t *zhp, char **msgid)
315 {
316 	zpool_status_t ret = check_status(zhp, zhp->zpool_config, B_FALSE);
317 
318 	if (ret >= NMSGID)
319 		*msgid = NULL;
320 	else
321 		*msgid = zfs_msgid_table[ret];
322 
323 	return (ret);
324 }
325 
326 zpool_status_t
327 zpool_import_status(nvlist_t *config, char **msgid)
328 {
329 	zpool_status_t ret = check_status(NULL, config, B_TRUE);
330 
331 	if (ret >= NMSGID)
332 		*msgid = NULL;
333 	else
334 		*msgid = zfs_msgid_table[ret];
335 
336 	return (ret);
337 }
338