xref: /titanic_41/usr/src/uts/common/io/lvm/raid/raid_resync.c (revision 7c478bd95313f5f23a4c958a745db2134aa03244)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 /*
30  * NAME:	raid_resync.c
31  * DESCRIPTION: RAID driver source file containing routines related to resync
32  *		operation.
33  * ROUTINES PROVIDED FOR EXTERNAL USE:
34  *	   resync_request() - get resync lock if available
35  *	   release_resync_request() - relinquish resync lock
36  *	   erred_check_line() - provide write instruction for erred column
37  *	     init_pw_area() - initialize pre-write area
38  *	     copy_pw_area() - copy pre-write area from one device to another
39  */
40 
41 #include <sys/param.h>
42 #include <sys/systm.h>
43 #include <sys/conf.h>
44 #include <sys/file.h>
45 #include <sys/user.h>
46 #include <sys/uio.h>
47 #include <sys/t_lock.h>
48 #include <sys/buf.h>
49 #include <sys/dkio.h>
50 #include <sys/vtoc.h>
51 #include <sys/kmem.h>
52 #include <vm/page.h>
53 #include <sys/sysmacros.h>
54 #include <sys/types.h>
55 #include <sys/mkdev.h>
56 #include <sys/stat.h>
57 #include <sys/open.h>
58 #include <sys/disp.h>
59 #include <sys/modctl.h>
60 #include <sys/ddi.h>
61 #include <sys/sunddi.h>
62 #include <sys/lvm/md_raid.h>
63 
64 #include <sys/sysevent/eventdefs.h>
65 #include <sys/sysevent/svm.h>
66 
67 #define	NOCOLUMN	(-1)
68 
69 extern md_set_t		md_set[];
70 extern kmem_cache_t	*raid_child_cache;
71 extern kmem_cache_t	*raid_parent_cache;
72 extern md_resync_t	md_cpr_resync;
73 extern major_t		md_major;
74 extern void		raid_parent_init(md_raidps_t *ps);
75 extern void		raid_child_init(md_raidcs_t *ps);
76 
/*
 * NAMES:	xor
 * DESCRIPTION: Xor two chunks of data together.  The data referenced by
 *		addr1 and addr2 are xor'd together for size bytes and the
 *		result is written into addr1.
 * PARAMETERS:	caddr_t addr1 - address of first chunk of data and destination
 *		caddr_t addr2 - address of second chunk of data
 *		size_t	 size - number of bytes to xor
 */
86 static void
xor(caddr_t addr1,caddr_t addr2,size_t size)87 xor(caddr_t addr1, caddr_t addr2, size_t size)
88 {
89 	while (size--) {
90 		*addr1++ ^= *addr2++;
91 	}
92 }
93 
94 /*
95  * NAME:	release_resync_request
96  *
97  * DESCRIPTION: Release resync active flag and reset unit values accordingly.
98  *
99  * PARAMETERS:	minor_t	    mnum - minor number identity of metadevice
100  *
101  * LOCKS:	Expects Unit Writer Lock to be held across call.
102  */
103 void
release_resync_request(minor_t mnum)104 release_resync_request(
105 	minor_t		mnum
106 )
107 {
108 	mr_unit_t	*un;
109 
110 	un = MD_UNIT(mnum);
111 	ASSERT(un != NULL);
112 
113 	un->c.un_status &= ~MD_UN_RESYNC_ACTIVE;
114 
115 	un->un_column[un->un_resync_index].un_devflags &= ~MD_RAID_RESYNC;
116 	un->un_column[un->un_resync_index].un_devflags &= ~MD_RAID_RESYNC_ERRED;
117 	un->un_column[un->un_resync_index].un_devflags &=
118 	    ~(MD_RAID_COPY_RESYNC | MD_RAID_REGEN_RESYNC);
119 
120 	un->un_resync_line_index = 0;
121 	un->un_resync_index = NOCOLUMN;
122 }
123 
124 /*
125  * NAME:	resync_request
126  *
127  * DESCRIPTION: Request resync.	 If resync is available (no current active
128  *		resync), mark unit as resync active and initialize.
129  *
130  * PARAMETERS:	minor_t	    mnum - minor number identity of metadevice
131  *		int column_index - index of column to resync
132  *		int	copysize - copysize of ioctl request
133  *		md_error_t   *ep - error output parameter
134  *
135  * RETURN:	0 if resync is available, 1 otherwise.
136  *
137  * LOCKS:	Expects Unit Writer Lock to be held across call.
138  *
139  * NOTE:	Sets un_resync_copysize to the input value in copysize, the
140  *		existing value from an incomplete previous resync with an
141  *		input value in copysize, or the lesser of the unit segment
142  *		size or maxio.
143  */
144 /* ARGSUSED */
145 int
resync_request(minor_t mnum,int column_index,size_t copysize,md_error_t * mde)146 resync_request(
147 	minor_t		mnum,
148 	int		column_index,
149 	size_t		copysize,
150 	md_error_t	*mde
151 )
152 {
153 	mr_unit_t	*un;
154 
155 	un = MD_UNIT(mnum);
156 	ASSERT(un != NULL);
157 
158 	/* if resync or grow not already active, set resync active for unit */
159 	if (! (un->un_column[column_index].un_devflags & MD_RAID_RESYNC) &&
160 	    ((un->c.un_status & MD_UN_RESYNC_ACTIVE) ||
161 	    (un->c.un_status & MD_UN_GROW_PENDING) ||
162 	    (un->un_column[column_index].un_devstate & RCS_RESYNC))) {
163 		if (mde)
164 			return (mdmderror(mde, MDE_GROW_DELAYED, mnum));
165 		return (1);
166 	}
167 
168 	if (un->un_column[column_index].un_devstate &
169 	    (RCS_ERRED | RCS_LAST_ERRED))
170 		un->un_column[column_index].un_devflags |= MD_RAID_DEV_ERRED;
171 	else
172 		un->un_column[column_index].un_devflags &= ~MD_RAID_DEV_ERRED;
173 	un->c.un_status |= MD_UN_RESYNC_ACTIVE;
174 	un->un_resync_index = column_index;
175 	un->un_resync_line_index = 0;
176 	raid_set_state(un, column_index, RCS_RESYNC, 0);
177 
178 	return (0);
179 }
180 
/*
 * Name:	alloc_bufs
 *
 * DESCRIPTION: Allocate the zeroed data and parity buffers hung off the
 *		resync child save structure.
 *
 * PARAMETERS:	md_raidcs_t *cs - child save structure to hold the buffers
 *		size_t	 bsize - size in bytes of each buffer
 */
191 static void
alloc_bufs(md_raidcs_t * cs,size_t bsize)192 alloc_bufs(md_raidcs_t *cs, size_t bsize)
193 {
194 	/* allocate buffers, write uses the read_buf1 buffer */
195 	cs->cs_dbuffer = kmem_zalloc(bsize, KM_SLEEP);
196 	cs->cs_pbuffer = kmem_zalloc(bsize, KM_SLEEP);
197 }
198 
199 void
init_buf(buf_t * bp,int flags,size_t size)200 init_buf(buf_t *bp, int flags, size_t size)
201 {
202 	/* zero buf */
203 	bzero((caddr_t)bp, sizeof (buf_t));
204 
205 	/* set b_back and b_forw to point back to buf */
206 	bp->b_back = bp;
207 	bp->b_forw = bp;
208 
209 	/* set flags size */
210 	bp->b_flags = flags;
211 	bp->b_bufsize = size;
212 	bp->b_offset = -1;
213 
214 	/* setup semaphores */
215 	sema_init(&bp->b_io, 0, NULL, SEMA_DEFAULT, NULL);
216 	sema_init(&bp->b_sem, 0, NULL, SEMA_DEFAULT, NULL);
217 }
218 
219 void
destroy_buf(buf_t * bp)220 destroy_buf(buf_t *bp)
221 {
222 	sema_destroy(&bp->b_io);
223 	sema_destroy(&bp->b_sem);
224 }
225 
226 void
reset_buf(buf_t * bp,int flags,size_t size)227 reset_buf(buf_t *bp, int flags, size_t size)
228 {
229 	destroy_buf(bp);
230 	init_buf(bp, flags, size);
231 }
232 
/*
 * NAME:	free_bufs
 *
 * DESCRIPTION: Free the resync data and parity buffers allocated by
 *		alloc_bufs().
 *
 * PARAMETERS:	size_t	   bsize - size in bytes of each buffer
 *		md_raidcs_t *cs - child save structure holding the buffers
 */
243 static void
free_bufs(size_t bsize,md_raidcs_t * cs)244 free_bufs(size_t bsize, md_raidcs_t *cs)
245 {
246 	kmem_free(cs->cs_dbuffer, bsize);
247 	kmem_free(cs->cs_pbuffer, bsize);
248 }
249 
/*
 * NAME:	init_pw_area
 *
 * DESCRIPTION: Initialize pre-write area to all zeros.
 *
 * PARAMETERS:	mr_unit_t	  *un	    - raid unit structure
 *		md_dev64_t dev_to_write	    - device holding the pre-write area
 *		diskaddr_t	pwstart	    - first block of the pre-write area
 *		uint_t		col	    - column being initialized
 *
 * RETURN:	1 if write error on resync device, otherwise 0
 *
 * LOCKS:	Expects Unit Reader Lock to be held across call.
 */
int
init_pw_area(
	mr_unit_t *un,
	md_dev64_t dev_to_write,
	diskaddr_t pwstart,
	uint_t	col
)
{
	buf_t	buf;		/* one buf, reset and reused per slot */
	caddr_t	databuffer;	/* zeroed slot image with pre-write header */
	size_t	copysize;	/* bytes per pre-write slot */
	size_t	bsize;		/* disk blocks per pre-write slot */
	int	error = 0;
	int	i;

	ASSERT(un != NULL);
	ASSERT(un->un_column[col].un_devflags & MD_RAID_DEV_ISOPEN);

	/* each slot is one unit-I/O worth of blocks */
	bsize = un->un_iosize;
	copysize = dbtob(bsize);
	databuffer = kmem_zalloc(copysize, KM_SLEEP);
	init_buf(&buf, (B_BUSY | B_WRITE), copysize);

	/* write an all-zero header image into every pre-write slot */
	for (i = 0; i < un->un_pwcnt; i++) {
		/* magic field is 0 for 4.0 compatability */
		RAID_FILLIN_RPW(databuffer, un, 0, 0,
				0, 0, 0,
				0, col, 0);
		buf.b_un.b_addr = (caddr_t)databuffer;
		buf.b_edev = md_dev64_to_dev(dev_to_write);
		buf.b_bcount = dbtob(bsize);
		/* slot i starts un_iosize blocks past the previous one */
		buf.b_lblkno = pwstart + (i * un->un_iosize);

		/* write buf */
		(void) md_call_strategy(&buf, MD_STR_NOTTOP, NULL);

		/* the first failed write aborts the initialization */
		if (biowait(&buf)) {
			error = 1;
			break;
		}
		/* re-arm the buf before reusing it for the next slot */
		reset_buf(&buf, (B_BUSY | B_WRITE), copysize);
	} /* for */

	destroy_buf(&buf);
	kmem_free(databuffer, copysize);

	return (error);
}
311 
/*
 * NAME:	raid_open_alt
 *
 * DESCRIPTION: opens the alt device used during resync.
 *
 * PARAMETERS:	un    - raid unit structure
 *		index - column whose alt device is to be opened
 *
 * RETURN:	0 - successful
 *		1 - failed
 *
 * LOCKS:	requires unit writer lock
 */
324 
325 static int
raid_open_alt(mr_unit_t * un,int index)326 raid_open_alt(mr_unit_t *un, int index)
327 {
328 	mr_column_t	*column = &un->un_column[index];
329 	set_t		setno = MD_MIN2SET(MD_SID(un));
330 	side_t		side = mddb_getsidenum(setno);
331 	md_dev64_t	tmpdev = column->un_alt_dev;
332 
333 	/* correct locks */
334 	ASSERT(UNIT_WRITER_HELD(un));
335 	/* not already writing to */
336 	ASSERT(! (column->un_devflags & MD_RAID_WRITE_ALT));
337 	/* not already open */
338 	ASSERT(! (column->un_devflags & MD_RAID_ALT_ISOPEN));
339 
340 	if (tmpdev != NODEV64) {
341 		/*
342 		 * Open by device id. We use orig_key since alt_dev
343 		 * has been set by the caller to be the same as orig_dev.
344 		 */
345 		if ((md_getmajor(tmpdev) != md_major) &&
346 			md_devid_found(setno, side, column->un_orig_key) == 1) {
347 			tmpdev = md_resolve_bydevid(MD_SID(un), tmpdev,
348 				column->un_orig_key);
349 		}
350 		if (md_layered_open(MD_SID(un), &tmpdev, MD_OFLG_NULL)) {
351 			/* failed open */
352 			column->un_alt_dev = tmpdev;
353 			return (1);
354 		} else {
355 			/* open suceeded */
356 			column->un_alt_dev = tmpdev;
357 			column->un_devflags |= MD_RAID_ALT_ISOPEN;
358 			return (0);
359 		}
360 	} else
361 		/* no alt device to open */
362 		return (1);
363 }
364 
365 
/*
 * NAME:	raid_close_alt
 *
 * DESCRIPTION: closes the alt device used during resync.
 *
 * PARAMETERS:	un - raid unit structure
 *		index - raid column
 *
 * RETURN:	none
 *
 * LOCKS:	requires unit writer lock
 */
378 
379 static void
raid_close_alt(mr_unit_t * un,int index)380 raid_close_alt(mr_unit_t *un, int index)
381 {
382 	mr_column_t	*column = &un->un_column[index];
383 	md_dev64_t	tmpdev = column->un_alt_dev;
384 
385 	ASSERT(UNIT_WRITER_HELD(un));	/* correct locks */
386 	ASSERT(! (column->un_devflags & MD_RAID_WRITE_ALT)); /* not writing */
387 	ASSERT(column->un_devflags & MD_RAID_ALT_ISOPEN); /* already open */
388 	ASSERT(tmpdev != NODEV64); /* is a device */
389 
390 	md_layered_close(column->un_alt_dev, MD_OFLG_NULL);
391 	column->un_devflags &= ~MD_RAID_ALT_ISOPEN;
392 	column->un_alt_dev = NODEV64;
393 }
394 
/*
 * Fill in the child save structure with the block range covered by
 * line_count lines starting at line, and take the line reader lock
 * for that range.  Returns the line index just past the locked range.
 */
static diskaddr_t
raid_resync_fillin_cs(diskaddr_t line, uint_t line_count, md_raidcs_t *cs)
{
	mr_unit_t	*un = cs->cs_un;

	ASSERT(line < un->un_segsincolumn);

	/* describe the extent: first block, block count, last block */
	cs->cs_line = line;
	cs->cs_blkno = line * un->un_segsize;
	cs->cs_blkcnt = un->un_segsize * line_count;
	cs->cs_lastblk = cs->cs_blkno + cs->cs_blkcnt - 1;
	/* reader lock suffices; see comment in resync_comp's line loop */
	raid_line_reader_lock(cs, 1);

	return (line + line_count);
}
410 
411 /* states returned by raid_resync_line */
412 
413 #define	RAID_RESYNC_OKAY	0
414 #define	RAID_RESYNC_RDERROR	2
415 #define	RAID_RESYNC_WRERROR	3
416 #define	RAID_RESYNC_STATE	4
417 
/*
 * Resync one group of lines: regenerate (or copy) the resync column's
 * data and write it to dev_to_write.  Returns one of the RAID_RESYNC_*
 * codes defined above.
 *
 *	cs		- child save struct holding the resync buffers
 *	line		- first line of the region
 *	line_count	- number of lines in the region
 *	single_read	- in/out: nonzero while copying from a good hotspare;
 *			  cleared here if that hotspare fails
 *	hs_state	- out: set to HS_BAD when the hotspare goes bad
 *	err_col		- out: column that took a read error
 *	dev_to_write	- device being rebuilt
 *	write_dev_start	- start block of that device's data area
 */
int
raid_resync_region(
	md_raidcs_t	*cs,
	diskaddr_t	line,
	uint_t		line_count,
	int		*single_read,
	hs_cmds_t	*hs_state,
	int		*err_col,
	md_dev64_t	dev_to_write,
	diskaddr_t	write_dev_start)
{
	mr_unit_t 	*un = cs->cs_un;
	buf_t		*readb1 = &cs->cs_pbuf;
	buf_t		*readb2 = &cs->cs_dbuf;
	buf_t		*writeb = &cs->cs_hbuf;
	diskaddr_t	off;		/* block offset within the column */
	size_t		tcopysize;	/* blocks moved this iteration */
	size_t		copysize;	/* preferred blocks per iteration */
	int 		resync;		/* index of column being resynced */
	int		quit = 0;
	size_t		leftinseg;	/* blocks remaining in the region */
	int		i;

	resync = un->un_resync_index;
	off = line * un->un_segsize;
	copysize = un->un_resync_copysize;

	/* find first column to read, skip resync column */

	leftinseg = un->un_segsize * line_count;
	while (leftinseg) {

		/* truncate last chunk to end if needed */
		if (copysize > leftinseg)
			tcopysize = leftinseg;
		else
			tcopysize = copysize;
		leftinseg -= tcopysize;

		/*
		 * One of two scenarios:
		 * 1) resync device with hotspare ok.  This implies that
		 *    we are copying from a good hotspare to a new good original
		 *    device.  In this case readb1 is used as the buf for
		 *    the read from the hotspare device.
		 * 2) For all other cases, including when in case 1) and an
		 *    error is detected on the (formerly good) hotspare device,
		 *    readb1 is used for the initial read.  readb2 is used for
		 *    all other reads.	Each readb2 buffer is xor'd into the
		 *    readb1 buffer.
		 *
		 * In both cases, writeb is used for the write, using readb1's
		 * buffer.
		 *
		 * For case 2, we could alternatively perform the read for all
		 * devices concurrently to improve performance.	 However,
		 * this could diminish performance for concurrent reads and
		 * writes if low on memory.
		 *
		 * NOTE(review): the code below actually issues every read
		 * through readb1 (into cs_dbuffer for the regeneration
		 * reads); readb2 is only torn down at the end of this
		 * function -- confirm against the child-structure init.
		 */

		/* read first buffer */

		/* switch to read from good columns if single_read */
		if (*single_read) {
			/* resync column's device vanished: cannot copy */
			if (un->un_column[resync].un_dev == NODEV64)
				return (RAID_RESYNC_RDERROR);

			/* read the hotspare's data straight into cs_pbuffer */
			reset_buf(readb1, B_READ | B_BUSY,
			    dbtob(copysize));
			readb1->b_bcount = dbtob(tcopysize);
			readb1->b_un.b_addr = cs->cs_pbuffer;
			readb1->b_edev = md_dev64_to_dev(
						un->un_column[resync].un_dev);
			readb1->b_lblkno =
			    un->un_column[resync].un_devstart + off;
			(void) md_call_strategy(readb1, MD_STR_NOTTOP, NULL);
			if (biowait(readb1)) {
				/*
				 * at this point just start rebuilding the
				 * data and go on since the other column
				 * are ok.
				 */
				*single_read = 0;
				*hs_state = HS_BAD;
				un->un_column[resync].un_devflags &=
				    ~MD_RAID_COPY_RESYNC;
				un->un_column[resync].un_devflags |=
				    MD_RAID_REGEN_RESYNC;
			}
		}

		/* if reading from all non-resync columns */
		if (!*single_read) {
			/* for each column, read line and xor into write buf */
			bzero(cs->cs_pbuffer, dbtob(tcopysize));
			for (i = 0; i < un->un_totalcolumncnt; i++) {

				/* any missing device aborts the regen */
				if (un->un_column[i].un_dev == NODEV64)
					return (RAID_RESYNC_RDERROR);

				/* skip column getting resync'ed */
				if (i == resync) {
					continue;
				}
				reset_buf(readb1, B_READ | B_BUSY,
				    dbtob(copysize));
				readb1->b_bcount = dbtob(tcopysize);
				readb1->b_un.b_addr = cs->cs_dbuffer;
				readb1->b_edev = md_dev64_to_dev(
						un->un_column[i].un_dev);
				readb1->b_lblkno =
				    un->un_column[i].un_devstart + off;

				(void) md_call_strategy(readb1, MD_STR_NOTTOP,
					NULL);
				if (biowait(readb1)) {
					*err_col = i;
					quit = RAID_RESYNC_RDERROR;
				}

				if (quit)
					return (quit);

				/* xor readb2 data into readb1 */
				xor(cs->cs_pbuffer, readb1->b_un.b_addr,
				    dbtob(tcopysize));
			} /* for */
		}

		/* cs_pbuffer now holds the rebuilt (or copied) data */
		reset_buf(writeb, B_WRITE | B_BUSY,
		    dbtob(copysize));
		writeb->b_bcount = dbtob(tcopysize);
		writeb->b_un.b_addr = cs->cs_pbuffer;
		writeb->b_lblkno = off + write_dev_start;
		writeb->b_edev = md_dev64_to_dev(dev_to_write);

		/* set write block number and perform the write */
		(void) md_call_strategy(writeb, MD_STR_NOTTOP, NULL);
		if (biowait(writeb)) {
			/* regen write failed: the target column is bad */
			if (*single_read == 0) {
				*hs_state = HS_BAD;
			}
			return (RAID_RESYNC_WRERROR);
		}
		writeb->b_blkno += tcopysize;
		/* advance to the next chunk of the region */
		off += tcopysize;
	} /* while */
	/* tear down the semaphores of all three bufs before returning */
	sema_destroy(&readb1->b_io);
	sema_destroy(&readb1->b_sem);
	sema_destroy(&readb2->b_io);
	sema_destroy(&readb2->b_sem);
	sema_destroy(&writeb->b_io);
	sema_destroy(&writeb->b_sem);
	return (RAID_RESYNC_OKAY);
}
573 
/*
 * NAME:	resync_comp
 *
 * DESCRIPTION: Resync the component.  Iterate through the raid unit a line at
 *		a time, read from the good device(s) and write the resync
 *		device.
 *
 * PARAMETERS:	minor_t	   mnum - minor number identity of metadevice
 *		md_raidcs_t *cs - child save struct
 *
 * RETURN:	none - success or failure is recorded in the unit/column
 *		state and reported via sysevents
 *
 * LOCKS:	Expects Unit Reader Lock to be held across call.  Acquires and
 *		releases Unit Writer, IO Writer, IO Reader and Line Reader
 *		locks internally for per-phase and per-line work.
 */
static void
resync_comp(
	minor_t		mnum,
	md_raidcs_t	*cs
)
{
	mdi_unit_t	*ui;
	mr_unit_t	*un;
	mddb_recid_t	recids[2];	/* hotspare records to commit */
	rcs_state_t	state;		/* column state on entry, for undo */
	md_dev64_t	dev_to_write;	/* device being rebuilt */
	diskaddr_t	write_pwstart;	/* its pre-write area start */
	diskaddr_t	write_devstart;	/* its data area start */
	md_dev64_t	dev;		/* device named in error messages */
	int		resync;		/* index of column being resynced */
	int		i;
	int		single_read = 0; /* copying from a good hotspare? */
	int		err;		/* RAID_RESYNC_* result code */
	int		err_cnt;	/* erred-column count at start */
	int		last_err;
	diskaddr_t	line;
	diskaddr_t	segsincolumn;
	size_t		bsize;		/* blocks moved per pass */
	uint_t		line_count;	/* lines moved per pass */

	/*
	 * hs_state is the state of the hotspare on the column being resynced
	 * dev_state is the state of the resync target
	 */
	hs_cmds_t	hs_state;
	int		err_col = -1;
	diskaddr_t	resync_end_pos;

	ui = MDI_UNIT(mnum);
	ASSERT(ui != NULL);

	un = cs->cs_un;

	/* trade the caller's reader lock for io-writer + unit-writer */
	md_unit_readerexit(ui);
	un = (mr_unit_t *)md_io_writerlock(ui);
	un = (mr_unit_t *)md_unit_writerlock(ui);
	resync = un->un_resync_index;
	state = un->un_column[resync].un_devstate;
	line_count = un->un_maxio / un->un_segsize;
	if (line_count == 0) { /* handle the case of segsize > maxio */
		line_count = 1;
		bsize = un->un_maxio;
	} else
		bsize = line_count * un->un_segsize;

	un->un_resync_copysize = (uint_t)bsize;

	ASSERT(un->c.un_status & MD_UN_RESYNC_ACTIVE);
	ASSERT(un->un_column[resync].un_devflags &
	    (MD_RAID_COPY_RESYNC | MD_RAID_REGEN_RESYNC));

	/*
	 * if the column is not in resync then just bail out.
	 */
	if (! (un->un_column[resync].un_devstate & RCS_RESYNC)) {
		md_unit_writerexit(ui);
		md_io_writerexit(ui);
		un = (mr_unit_t *)md_unit_readerlock(ui);
		return;
	}
	SE_NOTIFY(EC_SVM_STATE, ESC_SVM_RESYNC_START, SVM_TAG_METADEVICE,
	    MD_UN2SET(un), MD_SID(un));

	/* identify device to write and its start block */

	if (un->un_column[resync].un_alt_dev != NODEV64) {
		/* replace case: write to the alt (original) device */
		if (raid_open_alt(un, resync)) {
			/* open failed: restore the entry state and abort */
			raid_set_state(un, resync, state, 0);
			md_unit_writerexit(ui);
			md_io_writerexit(ui);
			un = (mr_unit_t *)md_unit_readerlock(ui);
			cmn_err(CE_WARN, "md: %s: %s open failed replace "
				"terminated", md_shortname(MD_SID(un)),
				md_devname(MD_UN2SET(un),
					un->un_column[resync].un_alt_dev,
					NULL, 0));
			SE_NOTIFY(EC_SVM_STATE, ESC_SVM_RESYNC_FAILED,
			    SVM_TAG_METADEVICE, MD_UN2SET(un), MD_SID(un));
			return;
		}
		ASSERT(un->un_column[resync].un_devflags & MD_RAID_COPY_RESYNC);
		dev_to_write = un->un_column[resync].un_alt_dev;
		write_devstart = un->un_column[resync].un_alt_devstart;
		write_pwstart = un->un_column[resync].un_alt_pwstart;
		if (un->un_column[resync].un_devflags & MD_RAID_DEV_ERRED) {
			/* hotspare had erred: must regenerate from peers */
			single_read = 0;
			hs_state = HS_BAD;
		} else {
			/* hotspare is good: straight copy from it */
			hs_state = HS_FREE;
			single_read = 1;
		}
		un->un_column[resync].un_devflags |= MD_RAID_WRITE_ALT;
	} else {
		/* regen case: rebuild the column's own device in place */
		dev_to_write = un->un_column[resync].un_dev;
		write_devstart = un->un_column[resync].un_devstart;
		write_pwstart = un->un_column[resync].un_pwstart;
		single_read = 0;
		hs_state = HS_FREE;
		ASSERT(un->un_column[resync].un_devflags &
		    MD_RAID_REGEN_RESYNC);
	}

	alloc_bufs(cs, dbtob(bsize));
	/* initialize pre-write area */
	if (init_pw_area(un, dev_to_write, write_pwstart, resync)) {
		/* pre-write init failed: treat as a write error */
		un->un_column[resync].un_devflags &= ~MD_RAID_WRITE_ALT;
		if (un->un_column[resync].un_alt_dev != NODEV64) {
			raid_close_alt(un, resync);
		}
		md_unit_writerexit(ui);
		md_io_writerexit(ui);
		if (dev_to_write == un->un_column[resync].un_dev)
			hs_state = HS_BAD;
		err = RAID_RESYNC_WRERROR;
		goto resync_comp_error;
	}

	un->c.un_status &= ~MD_UN_RESYNC_CANCEL;
	segsincolumn = un->un_segsincolumn;
	err_cnt = raid_state_cnt(un, RCS_ERRED | RCS_LAST_ERRED);

	/* commit the record */

	md_unit_writerexit(ui);
	md_io_writerexit(ui);


	/* resync each line of the unit */
	for (line = 0; line <  segsincolumn; line += line_count) {
		/*
		 * Update address range in child struct and lock the line.
		 *
		 * The reader version of the line lock is used since only
		 * resync will use data beyond un_resync_line_index on the
		 * resync device.
		 */
		un = (mr_unit_t *)md_io_readerlock(ui);
		/* clamp the final pass to the end of the column */
		if (line + line_count > segsincolumn)
			line_count = segsincolumn - line;
		resync_end_pos = raid_resync_fillin_cs(line, line_count, cs);
		(void) md_unit_readerlock(ui);
		ASSERT(un->un_resync_line_index == resync_end_pos);
		err = raid_resync_region(cs, line, (int)line_count,
		    &single_read, &hs_state, &err_col, dev_to_write,
		    write_devstart);

		/*
		 * if the column failed to resync then stop writing directly
		 * to the column.
		 */
		if (err)
			un->un_resync_line_index = 0;

		md_unit_readerexit(ui);
		raid_line_exit(cs);
		md_io_readerexit(ui);

		if (err)
			break;

		un = (mr_unit_t *)md_unit_writerlock(ui);

		/* a new column error elsewhere aborts the resync */
		if (raid_state_cnt(un, RCS_ERRED | RCS_LAST_ERRED) != err_cnt) {
			err = RAID_RESYNC_STATE;
			md_unit_writerexit(ui);
			break;
		}
		md_unit_writerexit(ui);
	} /* for */

resync_comp_error:
	/* reacquire both writer locks for the state transition below */
	un = (mr_unit_t *)md_io_writerlock(ui);
	(void) md_unit_writerlock(ui);
	un->un_column[resync].un_devflags &= ~MD_RAID_WRITE_ALT;

	recids[0] = 0;
	recids[1] = 0;
	switch (err) {
		/*
		 * successful resync
		 */
	    case RAID_RESYNC_OKAY:
		/* initialize pre-write area */
		if ((un->un_column[resync].un_orig_dev != NODEV64) &&
		    (un->un_column[resync].un_orig_dev ==
		    un->un_column[resync].un_alt_dev)) {
			/*
			 * replacing a hot spare
			 * release the hot spare, which will close the hotspare
			 * and mark it closed.
			 */
			raid_hs_release(hs_state, un, &recids[0], resync);
			/*
			 * make the resync target the main device and
			 * mark open
			 */
			un->un_column[resync].un_hs_id = 0;
			un->un_column[resync].un_dev =
			    un->un_column[resync].un_orig_dev;
			un->un_column[resync].un_devstart =
			    un->un_column[resync].un_orig_devstart;
			un->un_column[resync].un_pwstart =
			    un->un_column[resync].un_orig_pwstart;
			un->un_column[resync].un_devflags |= MD_RAID_DEV_ISOPEN;
			/* alt becomes the device so don't close it */
			un->un_column[resync].un_devflags &= ~MD_RAID_WRITE_ALT;
			un->un_column[resync].un_devflags &=
			    ~MD_RAID_ALT_ISOPEN;
			un->un_column[resync].un_alt_dev = NODEV64;
		}
		raid_set_state(un, resync, RCS_OKAY, 0);
		break;

	    case RAID_RESYNC_WRERROR:
		if (HOTSPARED(un, resync) && single_read &&
		    (un->un_column[resync].un_devflags & MD_RAID_COPY_RESYNC)) {
			/*
			 * this is the case where the resync target is
			 * bad but there is a good hotspare.  In this
			 * case keep the hotspare, and go back to okay.
			 */
			raid_set_state(un, resync, RCS_OKAY, 0);
			cmn_err(CE_WARN, "md: %s: %s write error, replace "
				"terminated", md_shortname(MD_SID(un)),
				md_devname(MD_UN2SET(un),
					un->un_column[resync].un_orig_dev,
					NULL, 0));
			break;
		}
		if (HOTSPARED(un, resync)) {
			/* give back the hotspare; revert to the original */
			raid_hs_release(hs_state, un, &recids[0], resync);
			un->un_column[resync].un_dev =
			    un->un_column[resync].un_orig_dev;
			un->un_column[resync].un_devstart =
			    un->un_column[resync].un_orig_devstart;
			un->un_column[resync].un_pwstart =
			    un->un_column[resync].un_orig_pwstart;
		}
		raid_set_state(un, resync, RCS_ERRED, 0);
		/* name the device that actually took the failed write */
		if (un->un_column[resync].un_devflags & MD_RAID_REGEN_RESYNC)
			dev = un->un_column[resync].un_dev;
		else
			dev = un->un_column[resync].un_alt_dev;
		cmn_err(CE_WARN, "md: %s: %s write error replace terminated",
		    md_shortname(MD_SID(un)), md_devname(MD_UN2SET(un), dev,
		    NULL, 0));
		break;

	    case RAID_RESYNC_STATE:
		if (HOTSPARED(un, resync) && single_read &&
		    (un->un_column[resync].un_devflags & MD_RAID_COPY_RESYNC)) {
			/*
			 * this is the case where the resync target is
			 * bad but there is a good hotspare.  In this
			 * case keep the hotspare, and go back to okay.
			 */
			raid_set_state(un, resync, RCS_OKAY, 0);
			cmn_err(CE_WARN, "md: %s: needs maintenance, replace "
			    "terminated", md_shortname(MD_SID(un)));
			break;
		}
		if (HOTSPARED(un, resync)) {
			/* give back the hotspare; revert to the original */
			raid_hs_release(hs_state, un, &recids[0], resync);
			un->un_column[resync].un_dev =
			    un->un_column[resync].un_orig_dev;
			un->un_column[resync].un_devstart =
			    un->un_column[resync].un_orig_devstart;
			un->un_column[resync].un_pwstart =
			    un->un_column[resync].un_orig_pwstart;
		}
		break;
	    case RAID_RESYNC_RDERROR:
		if (HOTSPARED(un, resync)) {
			/* give back the hotspare; revert to the original */
			raid_hs_release(hs_state, un, &recids[0], resync);
			un->un_column[resync].un_dev =
			    un->un_column[resync].un_orig_dev;
			un->un_column[resync].un_devstart =
			    un->un_column[resync].un_orig_devstart;
			un->un_column[resync].un_pwstart =
			    un->un_column[resync].un_orig_pwstart;
		}

		/* mark the column whose read failed (if it wasn't ours) */
		if ((resync != err_col) && (err_col != NOCOLUMN))
			raid_set_state(un, err_col, RCS_ERRED, 0);
		break;

	    default:
		ASSERT(0);
	}
	if (un->un_column[resync].un_alt_dev != NODEV64) {
		raid_close_alt(un, resync);
	}

	/*
	 * an io operation may have gotten an error and placed a
	 * column in erred state.  This will abort the resync, which
	 * will end up in last erred.  This is ugly so go through
	 * the columns and do cleanup
	 */
	err_cnt = 0;
	last_err = 0;
	for (i = 0; i < un->un_totalcolumncnt; i++) {
		if (un->un_column[i].un_devstate & RCS_OKAY)
			continue;
		if (i == resync) {
			raid_set_state(un, i, RCS_ERRED, 1);
			err_cnt++;
		} else if (err == RAID_RESYNC_OKAY) {
			err_cnt++;
		} else {
			raid_set_state(un, i, RCS_LAST_ERRED, 1);
			last_err++;
		}
	}
	/* derive the overall unit state from the per-column tallies */
	if ((err_cnt == 0) && (last_err == 0))
		un->un_state = RUS_OKAY;
	else if (last_err == 0) {
		un->un_state = RUS_ERRED;
		ASSERT(err_cnt == 1);
	} else if (last_err > 0) {
		un->un_state = RUS_LAST_ERRED;
	}

	uniqtime32(&un->un_column[resync].un_devtimestamp);
	un->un_resync_copysize = 0;
	un->un_column[resync].un_devflags &=
	    ~(MD_RAID_REGEN_RESYNC | MD_RAID_COPY_RESYNC);
	raid_commit(un, recids);
	/* release unit writer lock and acquire unit reader lock */
	md_unit_writerexit(ui);
	md_io_writerexit(ui);
	(void) md_unit_readerlock(ui);
	if (err == RAID_RESYNC_OKAY) {
		SE_NOTIFY(EC_SVM_STATE, ESC_SVM_RESYNC_DONE,
		    SVM_TAG_METADEVICE, MD_UN2SET(un), MD_SID(un));
	} else {
		SE_NOTIFY(EC_SVM_STATE, ESC_SVM_RESYNC_FAILED,
		    SVM_TAG_METADEVICE, MD_UN2SET(un), MD_SID(un));
		if (raid_state_cnt(un, RCS_ERRED |
			RCS_LAST_ERRED) > 1) {
			SE_NOTIFY(EC_SVM_STATE, ESC_SVM_LASTERRED,
			    SVM_TAG_METADEVICE, MD_UN2SET(un), MD_SID(un));
		} else {
			SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED,
			    SVM_TAG_METADEVICE, MD_UN2SET(un), MD_SID(un));
		}
	}

	free_bufs(dbtob(bsize), cs);
}
946 
947 /*
948  * NAME:	resync_unit
949  *
950  * DESCRIPTION: Start of RAID resync thread.  Perform up front allocations,
951  *		initializations and consistency checking, then call
952  *		resync_comp to resync the component.
953  *
954  * PARAMETERS:	minor_t mnum - minor number identity of metadevice
955  *
956  * LOCKS:	Acquires and releases Unit Reader Lock to maintain unit
957  *		existence during resync.
958  *		Acquires and releases the resync count lock for cpr.
959  */
960 static void
resync_unit(minor_t mnum)961 resync_unit(
962 	minor_t mnum
963 )
964 {
965 	mdi_unit_t	*ui;
966 	mr_unit_t	*un;
967 	md_raidps_t	*ps = NULL;
968 	md_raidcs_t	*cs = NULL;
969 	int		resync;
970 
971 	/*
972 	 * Increment the raid resync count for cpr
973 	 */
974 	mutex_enter(&md_cpr_resync.md_resync_mutex);
975 	md_cpr_resync.md_raid_resync++;
976 	mutex_exit(&md_cpr_resync.md_resync_mutex);
977 
978 	ui = MDI_UNIT(mnum);
979 	ASSERT(ui != NULL);
980 
981 	un = (mr_unit_t *)md_unit_readerlock(ui);
982 
983 	/*
984 	 * Allocate parent and child memory pool structures.  These are
985 	 * only needed to lock raid lines, so only the minimal
986 	 * required fields for this purpose are initialized.
987 	 *
988 	 * Do not use the reserve pool for resync.
989 	 */
990 	ps = kmem_cache_alloc(raid_parent_cache, MD_ALLOCFLAGS);
991 	raid_parent_init(ps);
992 	cs = kmem_cache_alloc(raid_child_cache, MD_ALLOCFLAGS);
993 	raid_child_init(cs);
994 	resync = un->un_resync_index;
995 	ps->ps_un = un;
996 	ps->ps_ui = ui;
997 	ps->ps_flags = MD_RPS_INUSE;
998 	cs->cs_ps = ps;
999 	cs->cs_un = un;
1000 
1001 	ASSERT(!(un->un_column[resync].un_devflags & MD_RAID_WRITE_ALT));
1002 
1003 	resync_comp(mnum, cs);
1004 	release_resync_request(mnum);
1005 
1006 	kmem_cache_free(raid_child_cache, cs);
1007 	kmem_cache_free(raid_parent_cache, ps);
1008 
1009 	md_unit_readerexit(ui);
1010 
1011 	/* close raid unit */
1012 	(void) raid_internal_close(mnum, OTYP_LYR, 0, 0);
1013 
1014 	/* poke hot spare daemon */
1015 	(void) raid_hotspares();
1016 
1017 	/*
1018 	 * Decrement the raid resync count for cpr
1019 	 */
1020 	mutex_enter(&md_cpr_resync.md_resync_mutex);
1021 	md_cpr_resync.md_raid_resync--;
1022 	mutex_exit(&md_cpr_resync.md_resync_mutex);
1023 
1024 	thread_exit();
1025 }
1026 
1027 /*
1028  * NAME:	raid_resync_unit
1029  *
1030  * DESCRIPTION: RAID metadevice specific resync routine.
1031  *		Open the unit and start resync_unit as a separate thread.
1032  *
1033  * PARAMETERS:	minor_t	  mnum - minor number identity of metadevice
1034  *		md_error_t *ep - output error parameter
1035  *
1036  * RETURN:	On error return 1 or set ep to nonzero, otherwise return 0.
1037  *
1038  * LOCKS:	Acquires and releases Unit Writer Lock.
1039  */
1040 int
raid_resync_unit(minor_t mnum,md_error_t * ep)1041 raid_resync_unit(
1042 	minor_t			mnum,
1043 	md_error_t		*ep
1044 )
1045 {
1046 	mdi_unit_t	*ui;
1047 	set_t		setno = MD_MIN2SET(mnum);
1048 	mr_unit_t	*un;
1049 
1050 	ui = MDI_UNIT(mnum);
1051 	un = MD_UNIT(mnum);
1052 
1053 	if (md_get_setstatus(setno) & MD_SET_STALE)
1054 		return (mdmddberror(ep, MDE_DB_STALE, mnum, setno));
1055 
1056 	ASSERT(un->un_column[un->un_resync_index].un_devflags &
1057 	    (MD_RAID_COPY_RESYNC | MD_RAID_REGEN_RESYNC));
1058 
1059 	/* Don't start a resync if the device is not available */
1060 	if ((ui == NULL) || (ui->ui_tstate & MD_DEV_ERRORED)) {
1061 		return (mdmderror(ep, MDE_RAID_OPEN_FAILURE, mnum));
1062 	}
1063 
1064 	if (raid_internal_open(mnum, FREAD | FWRITE, OTYP_LYR, 0)) {
1065 		(void) md_unit_writerlock(ui);
1066 		release_resync_request(mnum);
1067 		md_unit_writerexit(ui);
1068 		SE_NOTIFY(EC_SVM_STATE, ESC_SVM_OPEN_FAIL, SVM_TAG_METADEVICE,
1069 		    setno, MD_SID(un));
1070 		return (mdmderror(ep, MDE_RAID_OPEN_FAILURE, mnum));
1071 	}
1072 
1073 	/* start resync_unit thread */
1074 	(void) thread_create(NULL, 0, resync_unit, (void *)(uintptr_t)mnum,
1075 	    0, &p0, TS_RUN, minclsyspri);
1076 
1077 	return (0);
1078 }
1079