xref: /titanic_41/usr/src/uts/common/io/lvm/raid/raid_hotspare.c (revision 7c478bd95313f5f23a4c958a745db2134aa03244)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 #pragma ident	"%Z%%M%	%I%	%E% SMI"
23 
24 /*
25  * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
26  * Use is subject to license terms.
27  */
28 
29 /*
30  * NAME:	raid_hotspare.c
31  * DESCRIPTION: RAID driver source file containing routines related to
32  *		hotspare operation.
33  * ROUTINES PROVIDED FOR EXTERNAL USE:
34  * raid_hs_release() - release a hotspare device
35  *  raid_hotspares() - prompt the hotspare daemon to attempt needed hotspare work
36  */
37 
38 #include <sys/param.h>
39 #include <sys/systm.h>
40 #include <sys/conf.h>
41 #include <sys/file.h>
42 #include <sys/user.h>
43 #include <sys/uio.h>
44 #include <sys/t_lock.h>
45 #include <sys/buf.h>
46 #include <sys/dkio.h>
47 #include <sys/vtoc.h>
48 #include <sys/kmem.h>
49 #include <vm/page.h>
50 #include <sys/sysmacros.h>
51 #include <sys/types.h>
52 #include <sys/mkdev.h>
53 #include <sys/stat.h>
54 #include <sys/open.h>
55 #include <sys/lvm/md_raid.h>
56 #include <sys/modctl.h>
57 #include <sys/ddi.h>
58 #include <sys/sunddi.h>
59 #include <sys/debug.h>
60 
61 #include <sys/sysevent/eventdefs.h>
62 #include <sys/sysevent/svm.h>
63 
64 extern mdq_anchor_t	md_hs_daemon;
65 static daemon_request_t hotspare_request;
66 
67 extern md_set_t		md_set[];
68 extern md_ops_t 	raid_md_ops;
69 
70 /*
71  * NAME:	raid_hs_release
72  *
73  * DESCRIPTION: Release the hotspare.
74  *
75  * PARAMETERS:	int error - indication of error on hotspare
76  *		mr_unit_t  *un - raid unit
77  *		mddb_recid_t  *recids - output records to commit revised hs info
78  *		int hs_index - component to release
79  *
80  * LOCKS:	Expects Unit Writer Lock to be held across call.
81  */
82 void
raid_hs_release(hs_cmds_t cmd,mr_unit_t * un,mddb_recid_t * recids,int hs_index)83 raid_hs_release(
84 	hs_cmds_t	cmd,
85 	mr_unit_t	*un,
86 	mddb_recid_t	*recids,
87 	int		hs_index
88 )
89 {
90 	mr_column_t	*col;
91 
92 	col = &un->un_column[hs_index];
93 
94 	/* close the hotspare device */
95 	if (col->un_devflags & MD_RAID_DEV_ISOPEN) {
96 		md_layered_close(col->un_dev, MD_OFLG_NULL);
97 		col->un_devflags &= ~MD_RAID_DEV_ISOPEN;
98 	}
99 
100 	/* return the hotspare to the pool */
101 	(void) md_hot_spare_ifc(cmd, un->un_hsp_id, 0, 0, recids,
102 	    &col->un_hs_key, NULL, NULL);
103 
104 	col->un_hs_pwstart = 0;
105 	col->un_hs_devstart = 0;
106 	col->un_hs_id = (mddb_recid_t)0;
107 	col->un_hs_key = 0;
108 }
109 
110 
111 /*
112  * NAME:	check_comp_4_hs
113  *
114  * DESCRIPTION: Check whether the input component has an error and can be
115  *		backed with a hot spare (RCS_ERRED state), and initiate
116  *		a resync if so.
117  *
118  * PARAMETERS:	mr_unit_t *un - raid unit
119  *		int hs_index	- component to check
120  *
121  * LOCKS:	Expects Unit Writer Lock to be held upon entrance.  Releases
122  *		the lock prior to calling raid_resync_unit, then reacquires
123  *		it before returning.
124  */
125 static void
check_comp_4_hs(mr_unit_t * un,int hs_index)126 check_comp_4_hs(
127 	mr_unit_t *un,
128 	int hs_index
129 )
130 {
131 	mddb_recid_t	recids[3];
132 	minor_t		mnum = MD_SID(un);
133 	mdi_unit_t	*ui;
134 	rcs_state_t	state;
135 	diskaddr_t	size;
136 	int		err;
137 	mr_column_t	*col;
138 	md_error_t	mde = mdnullerror;
139 	char		devname[MD_MAX_CTDLEN];
140 	char		hs_devname[MD_MAX_CTDLEN];
141 	set_t		setno;
142 	md_dev64_t	tmpdev;
143 	diskaddr_t	tmpdaddr;
144 
145 
146 	/* initialize */
147 	setno = MD_UN2SET(un);
148 	ui = MDI_UNIT(mnum);
149 	md_unit_readerexit(ui);
150 	(void) md_io_writerlock(ui);
151 	un = (mr_unit_t *)md_unit_writerlock(ui);
152 	col = &un->un_column[hs_index];
153 
154 	/*
155 	 * add a hotspare for erred column only if not resyncing
156 	 */
157 	if ((!(COLUMN_STATE(un, hs_index) & RCS_ERRED)) ||
158 	    (raid_state_cnt(un, (RCS_ERRED | RCS_LAST_ERRED)) != 1) ||
159 	    (raid_state_cnt(un, RCS_RESYNC) > 0)) {
160 		goto errout;
161 	}
162 
163 	recids[0] = 0;
164 	recids[1] = 0;
165 	/* if there is already a hotspare then just return */
166 	if (HOTSPARED(un, hs_index) && (col->un_devstate & RCS_ERRED)) {
167 		raid_hs_release(HS_BAD, un, &recids[0], hs_index);
168 		cmn_err(CE_WARN, "md: %s: %s hotspare errored and released",
169 		    md_shortname(mnum),
170 		    md_devname(MD_MIN2SET(mnum), col->un_dev, NULL, 0));
171 		col->un_dev = col->un_orig_dev;
172 		col->un_pwstart = col->un_orig_pwstart;
173 		col->un_devstart = col->un_orig_devstart;
174 		raid_commit(un, recids);
175 
176 		SE_NOTIFY(EC_SVM_STATE, ESC_SVM_HS_FREED, SVM_TAG_METADEVICE,
177 		    setno, MD_SID(un));
178 	}
179 	ASSERT(!HOTSPARED(un, hs_index));
180 
181 	state = col->un_devstate;
182 	size = col->un_pwstart + un->un_pwsize +
183 	    (un->un_segsize * un->un_segsincolumn);
184 
185 again:
186 	/* quit if resync is already active */
187 	col->un_devflags |= MD_RAID_REGEN_RESYNC;
188 	if (resync_request(mnum, hs_index, 0, NULL))
189 		goto errout;
190 
191 	recids[0] = 0;
192 	recids[1] = 0;
193 
194 	tmpdev = col->un_dev;
195 	tmpdaddr = col->un_hs_pwstart;
196 
197 	/* get a hotspare */
198 	if (md_hot_spare_ifc(HS_GET, un->un_hsp_id, size,
199 	    ((col->un_orig_pwstart >= 1) &&
200 	    (col->un_orig_pwstart != MD_DISKADDR_ERROR)),
201 	    &col->un_hs_id, &col->un_hs_key, &tmpdev, &tmpdaddr) != 0) {
202 		col->un_dev = tmpdev;
203 		col->un_hs_pwstart = tmpdaddr;
204 		release_resync_request(mnum);
205 		raid_set_state(un, hs_index, state, 1);
206 		goto errout;
207 	}
208 
209 	col->un_hs_pwstart = tmpdaddr;
210 
211 	/*
212 	 * record id is filled in by raid_commit, recids[0] filled in by
213 	 * md_hot_spare_ifc if needed
214 	 */
215 	recids[0] = col->un_hs_id;
216 	recids[1] = 0;
217 
218 	/*
219 	 * close the device and open the hot spare.  The device should
220 	 * never be a hotspare here.
221 	 */
222 	if (col->un_devflags & MD_RAID_DEV_ISOPEN) {
223 		md_layered_close(col->un_orig_dev, MD_OFLG_NULL);
224 		col->un_devflags &= ~MD_RAID_DEV_ISOPEN;
225 	}
226 	/*
227 	 * Try open by device id
228 	 */
229 	tmpdev = md_resolve_bydevid(mnum, tmpdev, col->un_hs_key);
230 	if (md_layered_open(mnum, &tmpdev, MD_OFLG_NULL)) {
231 		md_dev64_t hs_dev = tmpdev;
232 		/* cannot open return to orig */
233 		raid_hs_release(HS_BAD, un, &recids[0], hs_index);
234 		release_resync_request(mnum);
235 		raid_set_state(un, hs_index, state, 1);
236 		col->un_dev = col->un_orig_dev;
237 		col->un_devstart = col->un_orig_devstart;
238 		col->un_pwstart = col->un_orig_pwstart;
239 		col->un_devflags &= ~MD_RAID_DEV_ISOPEN;
240 		raid_commit(un, recids);
241 		cmn_err(CE_WARN, "md: %s: open error of hotspare %s",
242 		    md_shortname(mnum),
243 		    md_devname(MD_MIN2SET(mnum), hs_dev, NULL, 0));
244 		SE_NOTIFY(EC_SVM_STATE, ESC_SVM_HS_FREED, SVM_TAG_HS, setno,
245 		    MD_SID(un));
246 		goto again;
247 	}
248 
249 	col->un_dev = tmpdev;
250 
251 	col->un_devflags |= MD_RAID_DEV_ISOPEN;
252 
253 	/*
254 	 * move the values into the device fields.  Since in some cases
255 	 * the pwstart is not zero this must be added into the start of
256 	 * the hotspare to avoid over writting the label
257 	 */
258 	col->un_hs_pwstart += col->un_orig_pwstart;
259 	col->un_pwstart = col->un_hs_pwstart;
260 	col->un_hs_devstart = col->un_hs_pwstart + un->un_pwsize;
261 	col->un_devstart = col->un_hs_devstart;
262 
263 	/* commit unit and hotspare records and release lock */
264 	raid_commit(un, recids);
265 	md_unit_writerexit(ui);
266 	md_io_writerexit(ui);
267 
268 	err = raid_resync_unit(mnum, &mde);
269 
270 	/* if resync fails, transition back to erred state and reset */
271 	if (err) {
272 		/* reaquire unit writerr lock */
273 		un = (mr_unit_t *)md_unit_writerlock(ui);
274 
275 		raid_set_state(un, hs_index, RCS_ERRED, 0);
276 
277 		/*
278 		 * close the hotspare and return it.  Then restore the
279 		 * original device back to the original state
280 		 */
281 		raid_hs_release(HS_FREE, un, &recids[0], hs_index);
282 		col->un_dev = col->un_orig_dev;
283 		col->un_devstart = col->un_orig_devstart;
284 		col->un_pwstart = col->un_orig_pwstart;
285 		raid_commit(un, recids);
286 		md_unit_writerexit(ui);
287 		un = (mr_unit_t *)md_unit_readerlock(ui);
288 		return;
289 	}
290 
291 	setno = MD_MIN2SET(mnum);
292 
293 	(void) md_devname(setno, col->un_orig_dev, devname,
294 		sizeof (devname));
295 	(void) md_devname(setno, col->un_dev, hs_devname,
296 		sizeof (hs_devname));
297 
298 	cmn_err(CE_NOTE, "md: %s: hotspared device %s with %s",
299 	    md_shortname(mnum), devname, hs_devname);
300 	SE_NOTIFY(EC_SVM_STATE, ESC_SVM_HOTSPARED, SVM_TAG_HS, setno,
301 	    MD_SID(un));
302 	(void) md_unit_readerlock(ui);
303 	return;
304 
305 errout:
306 	md_unit_writerexit(ui);
307 	md_io_writerexit(ui);
308 	un = (mr_unit_t *)md_unit_readerlock(ui);
309 }
310 
311 /*
312  * NAME:	check_4_hs
313  *
314  * DESCRIPTION: Check every component of every raid unit for any device which
315  *		needs to be backed with a hot spare.
316  *
317  * PARAMETERS:	daemon_request_t *dr - hotspare request daemon
318  *
319  * LOCKS:	Acquires and releases the Hotspare Request Lock and the RAID
320  *		Driver Lock. Acquires the Unit Writer Lock which is released
321  *		in check_comp_4_hs.
322  */
323 static void
check_4_hs(daemon_request_t * dr)324 check_4_hs(daemon_request_t *dr)
325 {
326 	mdi_unit_t	*ui;
327 	mr_unit_t	*un;
328 	md_link_t	*next;
329 	int		i;
330 
331 	mutex_enter(&dr->dr_mx);	/* clear up front so can poke */
332 	dr->dr_pending = 0;		/* again in low level routine if */
333 	mutex_exit(&dr->dr_mx);		/* something found to do	*/
334 
335 	/*
336 	 * Scan raid unit list and call component hotspare check routine for
337 	 * each component of each unit where resync is inactive.
338 	 */
339 	rw_enter(&raid_md_ops.md_link_rw.lock, RW_READER);
340 	for (next = raid_md_ops.md_head; next != NULL; next = next->ln_next) {
341 		ui = MDI_UNIT(next->ln_id);
342 		un = (mr_unit_t *)md_unit_readerlock(ui);
343 		if (!(un->c.un_status & MD_UN_RESYNC_ACTIVE) &&
344 		    (raid_state_cnt(un, RCS_RESYNC) == 0) &&
345 		    (UNIT_STATE(un) & RUS_ERRED) &&
346 		    (un->un_hsp_id != -1) &&
347 		    (raid_state_cnt(un, RCS_ERRED) == 1)) {
348 			for (i = 0; i < un->un_totalcolumncnt; i++)
349 				if (un->un_column[i].un_devstate == RCS_ERRED)
350 					check_comp_4_hs(un, i);
351 		}
352 		md_unit_readerexit(ui);
353 	}
354 	rw_exit(&raid_md_ops.md_link_rw.lock);
355 }
356 
357 /*
358  * NAME:	raid_hotspares
359  *
360  * DESCRIPTION: Initiate a check of all RAID devices for components which
361  *		may require a hot spare, if it is not already running.
362  *
363  * PARAMETERS:	NONE
364  *
365  * LOCKS:	Acquires and releases the Hotspare Request Lock.
366  */
367 intptr_t
raid_hotspares()368 raid_hotspares()
369 {
370 	/* if available, make request for hotspare to master daemon */
371 	mutex_enter(&hotspare_request.dr_mx);
372 	if (hotspare_request.dr_pending == 0) {
373 		hotspare_request.dr_pending = 1;
374 		daemon_request(&md_hs_daemon,
375 		    check_4_hs, (daemon_queue_t *)&hotspare_request, REQ_OLD);
376 	}
377 	mutex_exit(&hotspare_request.dr_mx);
378 	return (0);
379 }
380