xref: /titanic_41/usr/src/lib/libzfs/common/libzfs_status.c (revision e7437265dc2a4920c197ed4337665539d358b22c)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
/*
 * This file contains the functions which analyze the status of a pool.  This
 * includes both the status of an active pool and the status of exported
 * pools.  Each analysis returns one of the ZPOOL_STATUS_* defines describing
 * the status of the pool.  This status is independent (to a certain degree)
 * of the state of the pool.  A pool's state describes only whether or not it
 * is capable of providing the necessary fault tolerance for data.  The status
 * describes the overall status of devices.  A pool that is online can still
 * have a device that is experiencing errors.
 *
 * Only a subset of the possible faults can be detected using 'zpool status',
 * and not all possible errors correspond to an FMA message ID.  The
 * explanation is left up to the caller, depending on whether it is a live
 * pool or an import.
 */
43 
44 #include <libzfs.h>
45 #include <string.h>
46 #include <unistd.h>
47 #include "libzfs_impl.h"
48 
/*
 * FMA message IDs, indexed by zpool_status_t value.  The order of the entries
 * must track the ZPOOL_STATUS_* defines in libzfs.h.  The table is
 * deliberately shorter than the full set of status values: any status whose
 * value is NMSGID or greater has no message ID associated with it.
 */
static char *zfs_msgid_table[] = {
	"ZFS-8000-14",
	"ZFS-8000-2Q",
	"ZFS-8000-3C",
	"ZFS-8000-4J",
	"ZFS-8000-5E",
	"ZFS-8000-6X",
	"ZFS-8000-72",
	"ZFS-8000-8A",
	"ZFS-8000-9P",
	"ZFS-8000-A5",
	"ZFS-8000-EY",
};

/* Number of entries in zfs_msgid_table. */
#define	NMSGID	(sizeof (zfs_msgid_table) / sizeof (*zfs_msgid_table))
69 
70 /* ARGSUSED */
71 static int
72 vdev_missing(uint64_t state, uint64_t aux, uint64_t errs)
73 {
74 	return (state == VDEV_STATE_CANT_OPEN &&
75 	    aux == VDEV_AUX_OPEN_FAILED);
76 }
77 
78 /* ARGSUSED */
79 static int
80 vdev_faulted(uint64_t state, uint64_t aux, uint64_t errs)
81 {
82 	return (state == VDEV_STATE_FAULTED);
83 }
84 
85 /* ARGSUSED */
86 static int
87 vdev_errors(uint64_t state, uint64_t aux, uint64_t errs)
88 {
89 	return (state == VDEV_STATE_DEGRADED || errs != 0);
90 }
91 
92 /* ARGSUSED */
93 static int
94 vdev_broken(uint64_t state, uint64_t aux, uint64_t errs)
95 {
96 	return (state == VDEV_STATE_CANT_OPEN);
97 }
98 
99 /* ARGSUSED */
100 static int
101 vdev_offlined(uint64_t state, uint64_t aux, uint64_t errs)
102 {
103 	return (state == VDEV_STATE_OFFLINE);
104 }
105 
106 /*
107  * Detect if any leaf devices that have seen errors or could not be opened.
108  */
109 static boolean_t
110 find_vdev_problem(nvlist_t *vdev, int (*func)(uint64_t, uint64_t, uint64_t))
111 {
112 	nvlist_t **child;
113 	vdev_stat_t *vs;
114 	uint_t c, children;
115 	char *type;
116 
117 	/*
118 	 * Ignore problems within a 'replacing' vdev, since we're presumably in
119 	 * the process of repairing any such errors, and don't want to call them
120 	 * out again.  We'll pick up the fact that a resilver is happening
121 	 * later.
122 	 */
123 	verify(nvlist_lookup_string(vdev, ZPOOL_CONFIG_TYPE, &type) == 0);
124 	if (strcmp(type, VDEV_TYPE_REPLACING) == 0)
125 		return (B_FALSE);
126 
127 	if (nvlist_lookup_nvlist_array(vdev, ZPOOL_CONFIG_CHILDREN, &child,
128 	    &children) == 0) {
129 		for (c = 0; c < children; c++)
130 			if (find_vdev_problem(child[c], func))
131 				return (B_TRUE);
132 	} else {
133 		verify(nvlist_lookup_uint64_array(vdev, ZPOOL_CONFIG_STATS,
134 		    (uint64_t **)&vs, &c) == 0);
135 
136 		if (func(vs->vs_state, vs->vs_aux,
137 		    vs->vs_read_errors +
138 		    vs->vs_write_errors +
139 		    vs->vs_checksum_errors))
140 			return (B_TRUE);
141 	}
142 
143 	return (B_FALSE);
144 }
145 
146 /*
147  * Active pool health status.
148  *
149  * To determine the status for a pool, we make several passes over the config,
150  * picking the most egregious error we find.  In order of importance, we do the
151  * following:
152  *
153  *	- Check for a complete and valid configuration
154  *	- Look for any faulted or missing devices in a non-replicated config
155  *	- Check for any data errors
156  *	- Check for any faulted or missing devices in a replicated config
157  *	- Look for any devices showing errors
158  *	- Check for any resilvering devices
159  *
160  * There can obviously be multiple errors within a single pool, so this routine
161  * only picks the most damaging of all the current errors to report.
162  */
163 static zpool_status_t
164 check_status(nvlist_t *config, boolean_t isimport)
165 {
166 	nvlist_t *nvroot;
167 	vdev_stat_t *vs;
168 	uint_t vsc;
169 	uint64_t nerr;
170 	uint64_t version;
171 	uint64_t stateval;
172 	uint64_t hostid = 0;
173 
174 	verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
175 	    &version) == 0);
176 	verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
177 	    &nvroot) == 0);
178 	verify(nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_STATS,
179 	    (uint64_t **)&vs, &vsc) == 0);
180 	verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE,
181 	    &stateval) == 0);
182 	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_HOSTID, &hostid);
183 
184 	/*
185 	 * Pool last accessed by another system.
186 	 */
187 	if (hostid != 0 && (unsigned long)hostid != gethostid() &&
188 	    stateval == POOL_STATE_ACTIVE)
189 		return (ZPOOL_STATUS_HOSTID_MISMATCH);
190 
191 	/*
192 	 * Newer on-disk version.
193 	 */
194 	if (vs->vs_state == VDEV_STATE_CANT_OPEN &&
195 	    vs->vs_aux == VDEV_AUX_VERSION_NEWER)
196 		return (ZPOOL_STATUS_VERSION_NEWER);
197 
198 	/*
199 	 * Check that the config is complete.
200 	 */
201 	if (vs->vs_state == VDEV_STATE_CANT_OPEN &&
202 	    vs->vs_aux == VDEV_AUX_BAD_GUID_SUM)
203 		return (ZPOOL_STATUS_BAD_GUID_SUM);
204 
205 	/*
206 	 * Bad devices in non-replicated config.
207 	 */
208 	if (vs->vs_state == VDEV_STATE_CANT_OPEN &&
209 	    find_vdev_problem(nvroot, vdev_faulted))
210 		return (ZPOOL_STATUS_FAULTED_DEV_NR);
211 
212 	if (vs->vs_state == VDEV_STATE_CANT_OPEN &&
213 	    find_vdev_problem(nvroot, vdev_missing))
214 		return (ZPOOL_STATUS_MISSING_DEV_NR);
215 
216 	if (vs->vs_state == VDEV_STATE_CANT_OPEN &&
217 	    find_vdev_problem(nvroot, vdev_broken))
218 		return (ZPOOL_STATUS_CORRUPT_LABEL_NR);
219 
220 	/*
221 	 * Corrupted pool metadata
222 	 */
223 	if (vs->vs_state == VDEV_STATE_CANT_OPEN &&
224 	    vs->vs_aux == VDEV_AUX_CORRUPT_DATA)
225 		return (ZPOOL_STATUS_CORRUPT_POOL);
226 
227 	/*
228 	 * Persistent data errors.
229 	 */
230 	if (!isimport) {
231 		if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_ERRCOUNT,
232 		    &nerr) == 0 && nerr != 0)
233 			return (ZPOOL_STATUS_CORRUPT_DATA);
234 	}
235 
236 	/*
237 	 * Missing devices in a replicated config.
238 	 */
239 	if (find_vdev_problem(nvroot, vdev_faulted))
240 		return (ZPOOL_STATUS_FAULTED_DEV_R);
241 	if (find_vdev_problem(nvroot, vdev_missing))
242 		return (ZPOOL_STATUS_MISSING_DEV_R);
243 	if (find_vdev_problem(nvroot, vdev_broken))
244 		return (ZPOOL_STATUS_CORRUPT_LABEL_R);
245 
246 	/*
247 	 * Devices with errors
248 	 */
249 	if (!isimport && find_vdev_problem(nvroot, vdev_errors))
250 		return (ZPOOL_STATUS_FAILING_DEV);
251 
252 	/*
253 	 * Offlined devices
254 	 */
255 	if (find_vdev_problem(nvroot, vdev_offlined))
256 		return (ZPOOL_STATUS_OFFLINE_DEV);
257 
258 	/*
259 	 * Currently resilvering
260 	 */
261 	if (!vs->vs_scrub_complete && vs->vs_scrub_type == POOL_SCRUB_RESILVER)
262 		return (ZPOOL_STATUS_RESILVERING);
263 
264 	/*
265 	 * Outdated, but usable, version
266 	 */
267 	if (version < SPA_VERSION)
268 		return (ZPOOL_STATUS_VERSION_OLDER);
269 
270 	return (ZPOOL_STATUS_OK);
271 }
272 
273 zpool_status_t
274 zpool_get_status(zpool_handle_t *zhp, char **msgid)
275 {
276 	zpool_status_t ret = check_status(zhp->zpool_config, B_FALSE);
277 
278 	if (ret >= NMSGID)
279 		*msgid = NULL;
280 	else
281 		*msgid = zfs_msgid_table[ret];
282 
283 	return (ret);
284 }
285 
286 zpool_status_t
287 zpool_import_status(nvlist_t *config, char **msgid)
288 {
289 	zpool_status_t ret = check_status(config, B_TRUE);
290 
291 	if (ret >= NMSGID)
292 		*msgid = NULL;
293 	else
294 		*msgid = zfs_msgid_table[ret];
295 
296 	return (ret);
297 }
298