xref: /illumos-gate/usr/src/lib/libzfs/common/libzfs_status.c (revision ed5289f91b9bf164dccd6c75398362be77a4478d)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 /*
29  * This file contains the functions which analyze the status of a pool.  This
30  * includes both the status of an active pool and the status of exported
31  * pools.  Returns one of the ZPOOL_STATUS_* defines describing the status of
32  * the pool.  This status is independent (to a certain degree) from the state of
33  * the pool.  A pool's state describes only whether or not it is capable of
34  * providing the necessary fault tolerance for data.  The status describes the
35  * overall status of devices.  A pool that is online can still have a device
36  * that is experiencing errors.
37  *
38  * Only a subset of the possible faults can be detected using 'zpool status',
39  * and not all possible errors correspond to a FMA message ID.  The explanation
40  * is left up to the caller, depending on whether it is a live pool or an
41  * import.
42  */
43 
44 #include <libzfs.h>
45 #include <string.h>
46 #include <unistd.h>
47 #include "libzfs_impl.h"
48 
/*
 * Message IDs, indexed by zpool_status_t value.  The position of each entry
 * must stay in sync with the ZPOOL_STATUS_* defines in libzfs.h.  Status
 * values at or beyond NMSGID fall off the end of this table and therefore
 * have no message ID associated with them.
 */
static char *zfs_msgid_table[] = {
	"ZFS-8000-14",	/* 0 */
	"ZFS-8000-2Q",	/* 1 */
	"ZFS-8000-3C",	/* 2 */
	"ZFS-8000-4J",	/* 3 */
	"ZFS-8000-5E",	/* 4 */
	"ZFS-8000-6X",	/* 5 */
	"ZFS-8000-72",	/* 6 */
	"ZFS-8000-8A",	/* 7 */
	"ZFS-8000-9P",	/* 8 */
	"ZFS-8000-A5",	/* 9 */
	"ZFS-8000-EY",	/* 10 */
	"ZFS-8000-HC",	/* 11 */
	"ZFS-8000-JQ",	/* 12 */
	"ZFS-8000-K4",	/* 13 */
};

/* Number of entries in zfs_msgid_table. */
#define	NMSGID	(sizeof (zfs_msgid_table) / sizeof (*zfs_msgid_table))
72 
73 /* ARGSUSED */
74 static int
75 vdev_missing(uint64_t state, uint64_t aux, uint64_t errs)
76 {
77 	return (state == VDEV_STATE_CANT_OPEN &&
78 	    aux == VDEV_AUX_OPEN_FAILED);
79 }
80 
81 /* ARGSUSED */
82 static int
83 vdev_faulted(uint64_t state, uint64_t aux, uint64_t errs)
84 {
85 	return (state == VDEV_STATE_FAULTED);
86 }
87 
88 /* ARGSUSED */
89 static int
90 vdev_errors(uint64_t state, uint64_t aux, uint64_t errs)
91 {
92 	return (state == VDEV_STATE_DEGRADED || errs != 0);
93 }
94 
95 /* ARGSUSED */
96 static int
97 vdev_broken(uint64_t state, uint64_t aux, uint64_t errs)
98 {
99 	return (state == VDEV_STATE_CANT_OPEN);
100 }
101 
102 /* ARGSUSED */
103 static int
104 vdev_offlined(uint64_t state, uint64_t aux, uint64_t errs)
105 {
106 	return (state == VDEV_STATE_OFFLINE);
107 }
108 
109 /*
110  * Detect if any leaf devices that have seen errors or could not be opened.
111  */
112 static boolean_t
113 find_vdev_problem(nvlist_t *vdev, int (*func)(uint64_t, uint64_t, uint64_t))
114 {
115 	nvlist_t **child;
116 	vdev_stat_t *vs;
117 	uint_t c, children;
118 	char *type;
119 
120 	/*
121 	 * Ignore problems within a 'replacing' vdev, since we're presumably in
122 	 * the process of repairing any such errors, and don't want to call them
123 	 * out again.  We'll pick up the fact that a resilver is happening
124 	 * later.
125 	 */
126 	verify(nvlist_lookup_string(vdev, ZPOOL_CONFIG_TYPE, &type) == 0);
127 	if (strcmp(type, VDEV_TYPE_REPLACING) == 0)
128 		return (B_FALSE);
129 
130 	if (nvlist_lookup_nvlist_array(vdev, ZPOOL_CONFIG_CHILDREN, &child,
131 	    &children) == 0) {
132 		for (c = 0; c < children; c++)
133 			if (find_vdev_problem(child[c], func))
134 				return (B_TRUE);
135 	} else {
136 		verify(nvlist_lookup_uint64_array(vdev, ZPOOL_CONFIG_STATS,
137 		    (uint64_t **)&vs, &c) == 0);
138 
139 		if (func(vs->vs_state, vs->vs_aux,
140 		    vs->vs_read_errors +
141 		    vs->vs_write_errors +
142 		    vs->vs_checksum_errors))
143 			return (B_TRUE);
144 	}
145 
146 	return (B_FALSE);
147 }
148 
149 /*
150  * Active pool health status.
151  *
152  * To determine the status for a pool, we make several passes over the config,
153  * picking the most egregious error we find.  In order of importance, we do the
154  * following:
155  *
156  *	- Check for a complete and valid configuration
157  *	- Look for any faulted or missing devices in a non-replicated config
158  *	- Check for any data errors
159  *	- Check for any faulted or missing devices in a replicated config
160  *	- Look for any devices showing errors
161  *	- Check for any resilvering devices
162  *
163  * There can obviously be multiple errors within a single pool, so this routine
164  * only picks the most damaging of all the current errors to report.
165  */
166 static zpool_status_t
167 check_status(zpool_handle_t *zhp, nvlist_t *config, boolean_t isimport)
168 {
169 	nvlist_t *nvroot;
170 	vdev_stat_t *vs;
171 	uint_t vsc;
172 	uint64_t nerr;
173 	uint64_t version;
174 	uint64_t stateval;
175 	uint64_t hostid = 0;
176 
177 	verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
178 	    &version) == 0);
179 	verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
180 	    &nvroot) == 0);
181 	verify(nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_STATS,
182 	    (uint64_t **)&vs, &vsc) == 0);
183 	verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE,
184 	    &stateval) == 0);
185 	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_HOSTID, &hostid);
186 
187 	/*
188 	 * Pool last accessed by another system.
189 	 */
190 	if (hostid != 0 && (unsigned long)hostid != gethostid() &&
191 	    stateval == POOL_STATE_ACTIVE)
192 		return (ZPOOL_STATUS_HOSTID_MISMATCH);
193 
194 	/*
195 	 * Newer on-disk version.
196 	 */
197 	if (vs->vs_state == VDEV_STATE_CANT_OPEN &&
198 	    vs->vs_aux == VDEV_AUX_VERSION_NEWER)
199 		return (ZPOOL_STATUS_VERSION_NEWER);
200 
201 	/*
202 	 * Check that the config is complete.
203 	 */
204 	if (vs->vs_state == VDEV_STATE_CANT_OPEN &&
205 	    vs->vs_aux == VDEV_AUX_BAD_GUID_SUM)
206 		return (ZPOOL_STATUS_BAD_GUID_SUM);
207 
208 	/*
209 	 * Pool has experienced failed I/O.
210 	 */
211 	if (stateval == POOL_STATE_IO_FAILURE) {
212 		zpool_handle_t *tmp_zhp = NULL;
213 		libzfs_handle_t *hdl = NULL;
214 		char property[ZPOOL_MAXPROPLEN];
215 		char *failmode = NULL;
216 
217 		if (zhp == NULL) {
218 			char *poolname;
219 
220 			verify(nvlist_lookup_string(config,
221 			    ZPOOL_CONFIG_POOL_NAME, &poolname) == 0);
222 			if ((hdl = libzfs_init()) == NULL)
223 				return (ZPOOL_STATUS_IO_FAILURE_WAIT);
224 			tmp_zhp = zpool_open_canfail(hdl, poolname);
225 			if (tmp_zhp == NULL) {
226 				libzfs_fini(hdl);
227 				return (ZPOOL_STATUS_IO_FAILURE_WAIT);
228 			}
229 		}
230 		if (zpool_get_prop(zhp ? zhp : tmp_zhp, ZPOOL_PROP_FAILUREMODE,
231 		    property, sizeof (property), NULL) == 0)
232 			failmode = property;
233 		if (tmp_zhp != NULL)
234 			zpool_close(tmp_zhp);
235 		if (hdl != NULL)
236 			libzfs_fini(hdl);
237 		if (failmode == NULL)
238 			return (ZPOOL_STATUS_IO_FAILURE_WAIT);
239 
240 		if (strncmp(failmode, "continue", strlen("continue")) == 0)
241 			return (ZPOOL_STATUS_IO_FAILURE_CONTINUE);
242 		else
243 			return (ZPOOL_STATUS_IO_FAILURE_WAIT);
244 	}
245 
246 	/*
247 	 * Could not read a log.
248 	 */
249 	if (vs->vs_state == VDEV_STATE_CANT_OPEN &&
250 	    vs->vs_aux == VDEV_AUX_BAD_LOG) {
251 		return (ZPOOL_STATUS_BAD_LOG);
252 	}
253 
254 	/*
255 	 * Bad devices in non-replicated config.
256 	 */
257 	if (vs->vs_state == VDEV_STATE_CANT_OPEN &&
258 	    find_vdev_problem(nvroot, vdev_faulted))
259 		return (ZPOOL_STATUS_FAULTED_DEV_NR);
260 
261 	if (vs->vs_state == VDEV_STATE_CANT_OPEN &&
262 	    find_vdev_problem(nvroot, vdev_missing))
263 		return (ZPOOL_STATUS_MISSING_DEV_NR);
264 
265 	if (vs->vs_state == VDEV_STATE_CANT_OPEN &&
266 	    find_vdev_problem(nvroot, vdev_broken))
267 		return (ZPOOL_STATUS_CORRUPT_LABEL_NR);
268 
269 	/*
270 	 * Corrupted pool metadata
271 	 */
272 	if (vs->vs_state == VDEV_STATE_CANT_OPEN &&
273 	    vs->vs_aux == VDEV_AUX_CORRUPT_DATA)
274 		return (ZPOOL_STATUS_CORRUPT_POOL);
275 
276 	/*
277 	 * Persistent data errors.
278 	 */
279 	if (!isimport) {
280 		if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_ERRCOUNT,
281 		    &nerr) == 0 && nerr != 0)
282 			return (ZPOOL_STATUS_CORRUPT_DATA);
283 	}
284 
285 	/*
286 	 * Missing devices in a replicated config.
287 	 */
288 	if (find_vdev_problem(nvroot, vdev_faulted))
289 		return (ZPOOL_STATUS_FAULTED_DEV_R);
290 	if (find_vdev_problem(nvroot, vdev_missing))
291 		return (ZPOOL_STATUS_MISSING_DEV_R);
292 	if (find_vdev_problem(nvroot, vdev_broken))
293 		return (ZPOOL_STATUS_CORRUPT_LABEL_R);
294 
295 	/*
296 	 * Devices with errors
297 	 */
298 	if (!isimport && find_vdev_problem(nvroot, vdev_errors))
299 		return (ZPOOL_STATUS_FAILING_DEV);
300 
301 	/*
302 	 * Offlined devices
303 	 */
304 	if (find_vdev_problem(nvroot, vdev_offlined))
305 		return (ZPOOL_STATUS_OFFLINE_DEV);
306 
307 	/*
308 	 * Currently resilvering
309 	 */
310 	if (!vs->vs_scrub_complete && vs->vs_scrub_type == POOL_SCRUB_RESILVER)
311 		return (ZPOOL_STATUS_RESILVERING);
312 
313 	/*
314 	 * Outdated, but usable, version
315 	 */
316 	if (version < SPA_VERSION)
317 		return (ZPOOL_STATUS_VERSION_OLDER);
318 
319 	return (ZPOOL_STATUS_OK);
320 }
321 
322 zpool_status_t
323 zpool_get_status(zpool_handle_t *zhp, char **msgid)
324 {
325 	zpool_status_t ret = check_status(zhp, zhp->zpool_config, B_FALSE);
326 
327 	if (ret >= NMSGID)
328 		*msgid = NULL;
329 	else
330 		*msgid = zfs_msgid_table[ret];
331 
332 	return (ret);
333 }
334 
335 zpool_status_t
336 zpool_import_status(nvlist_t *config, char **msgid)
337 {
338 	zpool_status_t ret = check_status(NULL, config, B_TRUE);
339 
340 	if (ret >= NMSGID)
341 		*msgid = NULL;
342 	else
343 		*msgid = zfs_msgid_table[ret];
344 
345 	return (ret);
346 }
347