xref: /titanic_41/usr/src/lib/libzfs/common/libzfs_status.c (revision 3afe87ebb25691cb6d158edaa34a6fb9b703a691)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*
27  * This file contains the functions which analyze the status of a pool.  This
28  * includes both the status of an active pool and the status of exported
29  * pools.  Returns one of the ZPOOL_STATUS_* defines describing the status of
30  * the pool.  This status is independent (to a certain degree) from the state of
31  * the pool.  A pool's state describes only whether or not it is capable of
32  * providing the necessary fault tolerance for data.  The status describes the
33  * overall status of devices.  A pool that is online can still have a device
34  * that is experiencing errors.
35  *
36  * Only a subset of the possible faults can be detected using 'zpool status',
37  * and not all possible errors correspond to a FMA message ID.  The explanation
38  * is left up to the caller, depending on whether it is a live pool or an
39  * import.
40  */
41 
42 #include <libzfs.h>
43 #include <string.h>
44 #include <unistd.h>
45 #include "libzfs_impl.h"
46 
/*
 * Table of message IDs, indexed by zpool_status_t value.  The order of the
 * entries must match the order of the ZPOOL_STATUS_* defines in libzfs.h.
 * Status values at or beyond NMSGID have no message ID; the lookup routines
 * below report NULL for those.
 */
static char *zfs_msgid_table[] = {
	"ZFS-8000-14",		/*  0 */
	"ZFS-8000-2Q",		/*  1 */
	"ZFS-8000-3C",		/*  2 */
	"ZFS-8000-4J",		/*  3 */
	"ZFS-8000-5E",		/*  4 */
	"ZFS-8000-6X",		/*  5 */
	"ZFS-8000-72",		/*  6 */
	"ZFS-8000-8A",		/*  7 */
	"ZFS-8000-9P",		/*  8 */
	"ZFS-8000-A5",		/*  9 */
	"ZFS-8000-EY",		/* 10 */
	"ZFS-8000-HC",		/* 11 */
	"ZFS-8000-JQ",		/* 12 */
	"ZFS-8000-K4",		/* 13 */
};

/* Number of entries in zfs_msgid_table. */
#define	NMSGID	(sizeof (zfs_msgid_table) / sizeof (*zfs_msgid_table))
70 
71 /* ARGSUSED */
72 static int
73 vdev_missing(uint64_t state, uint64_t aux, uint64_t errs)
74 {
75 	return (state == VDEV_STATE_CANT_OPEN &&
76 	    aux == VDEV_AUX_OPEN_FAILED);
77 }
78 
79 /* ARGSUSED */
80 static int
81 vdev_faulted(uint64_t state, uint64_t aux, uint64_t errs)
82 {
83 	return (state == VDEV_STATE_FAULTED);
84 }
85 
86 /* ARGSUSED */
87 static int
88 vdev_errors(uint64_t state, uint64_t aux, uint64_t errs)
89 {
90 	return (state == VDEV_STATE_DEGRADED || errs != 0);
91 }
92 
93 /* ARGSUSED */
94 static int
95 vdev_broken(uint64_t state, uint64_t aux, uint64_t errs)
96 {
97 	return (state == VDEV_STATE_CANT_OPEN);
98 }
99 
100 /* ARGSUSED */
101 static int
102 vdev_offlined(uint64_t state, uint64_t aux, uint64_t errs)
103 {
104 	return (state == VDEV_STATE_OFFLINE);
105 }
106 
107 /*
108  * Detect if any leaf devices that have seen errors or could not be opened.
109  */
110 static boolean_t
111 find_vdev_problem(nvlist_t *vdev, int (*func)(uint64_t, uint64_t, uint64_t))
112 {
113 	nvlist_t **child;
114 	vdev_stat_t *vs;
115 	uint_t c, children;
116 	char *type;
117 
118 	/*
119 	 * Ignore problems within a 'replacing' vdev, since we're presumably in
120 	 * the process of repairing any such errors, and don't want to call them
121 	 * out again.  We'll pick up the fact that a resilver is happening
122 	 * later.
123 	 */
124 	verify(nvlist_lookup_string(vdev, ZPOOL_CONFIG_TYPE, &type) == 0);
125 	if (strcmp(type, VDEV_TYPE_REPLACING) == 0)
126 		return (B_FALSE);
127 
128 	if (nvlist_lookup_nvlist_array(vdev, ZPOOL_CONFIG_CHILDREN, &child,
129 	    &children) == 0) {
130 		for (c = 0; c < children; c++)
131 			if (find_vdev_problem(child[c], func))
132 				return (B_TRUE);
133 	} else {
134 		verify(nvlist_lookup_uint64_array(vdev, ZPOOL_CONFIG_STATS,
135 		    (uint64_t **)&vs, &c) == 0);
136 
137 		if (func(vs->vs_state, vs->vs_aux,
138 		    vs->vs_read_errors +
139 		    vs->vs_write_errors +
140 		    vs->vs_checksum_errors))
141 			return (B_TRUE);
142 	}
143 
144 	return (B_FALSE);
145 }
146 
147 /*
148  * Active pool health status.
149  *
150  * To determine the status for a pool, we make several passes over the config,
151  * picking the most egregious error we find.  In order of importance, we do the
152  * following:
153  *
154  *	- Check for a complete and valid configuration
155  *	- Look for any faulted or missing devices in a non-replicated config
156  *	- Check for any data errors
157  *	- Check for any faulted or missing devices in a replicated config
158  *	- Look for any devices showing errors
159  *	- Check for any resilvering devices
160  *
161  * There can obviously be multiple errors within a single pool, so this routine
162  * only picks the most damaging of all the current errors to report.
163  */
164 static zpool_status_t
165 check_status(nvlist_t *config, boolean_t isimport)
166 {
167 	nvlist_t *nvroot;
168 	vdev_stat_t *vs;
169 	uint_t vsc;
170 	uint64_t nerr;
171 	uint64_t version;
172 	uint64_t stateval;
173 	uint64_t suspended;
174 	uint64_t hostid = 0;
175 
176 	verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
177 	    &version) == 0);
178 	verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
179 	    &nvroot) == 0);
180 	verify(nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_STATS,
181 	    (uint64_t **)&vs, &vsc) == 0);
182 	verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE,
183 	    &stateval) == 0);
184 	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_HOSTID, &hostid);
185 
186 	/*
187 	 * Pool last accessed by another system.
188 	 */
189 	if (hostid != 0 && (unsigned long)hostid != gethostid() &&
190 	    stateval == POOL_STATE_ACTIVE)
191 		return (ZPOOL_STATUS_HOSTID_MISMATCH);
192 
193 	/*
194 	 * Newer on-disk version.
195 	 */
196 	if (vs->vs_state == VDEV_STATE_CANT_OPEN &&
197 	    vs->vs_aux == VDEV_AUX_VERSION_NEWER)
198 		return (ZPOOL_STATUS_VERSION_NEWER);
199 
200 	/*
201 	 * Check that the config is complete.
202 	 */
203 	if (vs->vs_state == VDEV_STATE_CANT_OPEN &&
204 	    vs->vs_aux == VDEV_AUX_BAD_GUID_SUM)
205 		return (ZPOOL_STATUS_BAD_GUID_SUM);
206 
207 	/*
208 	 * Check whether the pool has suspended due to failed I/O.
209 	 */
210 	if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_SUSPENDED,
211 	    &suspended) == 0) {
212 		if (suspended == ZIO_FAILURE_MODE_CONTINUE)
213 			return (ZPOOL_STATUS_IO_FAILURE_CONTINUE);
214 		return (ZPOOL_STATUS_IO_FAILURE_WAIT);
215 	}
216 
217 	/*
218 	 * Could not read a log.
219 	 */
220 	if (vs->vs_state == VDEV_STATE_CANT_OPEN &&
221 	    vs->vs_aux == VDEV_AUX_BAD_LOG) {
222 		return (ZPOOL_STATUS_BAD_LOG);
223 	}
224 
225 	/*
226 	 * Bad devices in non-replicated config.
227 	 */
228 	if (vs->vs_state == VDEV_STATE_CANT_OPEN &&
229 	    find_vdev_problem(nvroot, vdev_faulted))
230 		return (ZPOOL_STATUS_FAULTED_DEV_NR);
231 
232 	if (vs->vs_state == VDEV_STATE_CANT_OPEN &&
233 	    find_vdev_problem(nvroot, vdev_missing))
234 		return (ZPOOL_STATUS_MISSING_DEV_NR);
235 
236 	if (vs->vs_state == VDEV_STATE_CANT_OPEN &&
237 	    find_vdev_problem(nvroot, vdev_broken))
238 		return (ZPOOL_STATUS_CORRUPT_LABEL_NR);
239 
240 	/*
241 	 * Corrupted pool metadata
242 	 */
243 	if (vs->vs_state == VDEV_STATE_CANT_OPEN &&
244 	    vs->vs_aux == VDEV_AUX_CORRUPT_DATA)
245 		return (ZPOOL_STATUS_CORRUPT_POOL);
246 
247 	/*
248 	 * Persistent data errors.
249 	 */
250 	if (!isimport) {
251 		if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_ERRCOUNT,
252 		    &nerr) == 0 && nerr != 0)
253 			return (ZPOOL_STATUS_CORRUPT_DATA);
254 	}
255 
256 	/*
257 	 * Missing devices in a replicated config.
258 	 */
259 	if (find_vdev_problem(nvroot, vdev_faulted))
260 		return (ZPOOL_STATUS_FAULTED_DEV_R);
261 	if (find_vdev_problem(nvroot, vdev_missing))
262 		return (ZPOOL_STATUS_MISSING_DEV_R);
263 	if (find_vdev_problem(nvroot, vdev_broken))
264 		return (ZPOOL_STATUS_CORRUPT_LABEL_R);
265 
266 	/*
267 	 * Devices with errors
268 	 */
269 	if (!isimport && find_vdev_problem(nvroot, vdev_errors))
270 		return (ZPOOL_STATUS_FAILING_DEV);
271 
272 	/*
273 	 * Offlined devices
274 	 */
275 	if (find_vdev_problem(nvroot, vdev_offlined))
276 		return (ZPOOL_STATUS_OFFLINE_DEV);
277 
278 	/*
279 	 * Currently resilvering
280 	 */
281 	if (!vs->vs_scrub_complete && vs->vs_scrub_type == POOL_SCRUB_RESILVER)
282 		return (ZPOOL_STATUS_RESILVERING);
283 
284 	/*
285 	 * Outdated, but usable, version
286 	 */
287 	if (version < SPA_VERSION)
288 		return (ZPOOL_STATUS_VERSION_OLDER);
289 
290 	return (ZPOOL_STATUS_OK);
291 }
292 
293 zpool_status_t
294 zpool_get_status(zpool_handle_t *zhp, char **msgid)
295 {
296 	zpool_status_t ret = check_status(zhp->zpool_config, B_FALSE);
297 
298 	if (ret >= NMSGID)
299 		*msgid = NULL;
300 	else
301 		*msgid = zfs_msgid_table[ret];
302 
303 	return (ret);
304 }
305 
306 zpool_status_t
307 zpool_import_status(nvlist_t *config, char **msgid)
308 {
309 	zpool_status_t ret = check_status(config, B_TRUE);
310 
311 	if (ret >= NMSGID)
312 		*msgid = NULL;
313 	else
314 		*msgid = zfs_msgid_table[ret];
315 
316 	return (ret);
317 }
318