xref: /titanic_41/usr/src/uts/common/io/lvm/raid/raid_replay.c (revision d7cd82522afdd890a66c7600b499590ad44e84bd)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 /*
29  * NAME:	raid_replay.c
30  *
31  * DESCRIPTION: RAID driver source file containing routines related to replay
32  *		operation.
33  *
34  * ROUTINES PROVIDED FOR EXTERNAL USE:
35  *		raid_replay() - replay all the pre write entries in the unit.
36  */
37 
38 #include <sys/param.h>
39 #include <sys/systm.h>
40 #include <sys/conf.h>
41 #include <sys/file.h>
42 #include <sys/user.h>
43 #include <sys/uio.h>
44 #include <sys/t_lock.h>
45 #include <sys/buf.h>
46 #include <sys/dkio.h>
47 #include <sys/vtoc.h>
48 #include <sys/kmem.h>
49 #include <vm/page.h>
50 #include <sys/sysmacros.h>
51 #include <sys/types.h>
52 #include <sys/mkdev.h>
53 #include <sys/stat.h>
54 #include <sys/open.h>
55 #include <sys/modctl.h>
56 #include <sys/ddi.h>
57 #include <sys/sunddi.h>
58 
59 #include <sys/lvm/md_raid.h>
60 
61 #include <sys/sysevent/eventdefs.h>
62 #include <sys/sysevent/svm.h>
63 
64 /* functions forward declarations */
65 static int	raid_replay_error(mr_unit_t *un, int column);
66 
67 int		raid_total_rply_entries = 0;
68 
/*
 * NAMES:	raid_rply_dealloc, raid_rply_alloc
 * DESCRIPTION: RAID metadevice replay buffer allocation/deallocation routines
 * PARAMETERS:	mr_unit_t *un - pointer to the unit structure
 *		raid_rplybuf_t **bufs - address of the per-column buffer array
 *		raid_rplybuf_t *rwbuf1 - first read/write replay buffer
 *		raid_rplybuf_t *rwbuf2 - second read/write replay buffer
 * RETURNS:	none
 */
76 static void
raid_rply_dealloc(mr_unit_t * un,raid_rplybuf_t ** bufs,raid_rplybuf_t * rwbuf1,raid_rplybuf_t * rwbuf2)77 raid_rply_dealloc(mr_unit_t *un,
78 		raid_rplybuf_t **bufs,
79 		raid_rplybuf_t *rwbuf1,
80 		raid_rplybuf_t *rwbuf2)
81 {
82 	int	i;
83 	raid_rplybuf_t *tmp;
84 
85 	for (i = 0, tmp = *bufs; i < un->un_totalcolumncnt; i++, tmp++) {
86 		if (tmp->rpl_data) {
87 			kmem_free(tmp->rpl_data, DEV_BSIZE);
88 			tmp->rpl_data = NULL;
89 		}
90 		if (tmp->rpl_buf) {
91 			kmem_free(tmp->rpl_buf, sizeof (buf_t));
92 			tmp->rpl_buf = NULL;
93 		}
94 	}
95 	kmem_free(*bufs, sizeof (raid_rplybuf_t) * un->un_totalcolumncnt);
96 	*bufs = NULL;
97 	if (rwbuf1->rpl_data) {
98 		kmem_free(rwbuf1->rpl_data, dbtob(un->un_iosize));
99 		rwbuf1->rpl_data = NULL;
100 	}
101 	if (rwbuf1->rpl_buf) {
102 		kmem_free((caddr_t)rwbuf1->rpl_buf, sizeof (buf_t));
103 		rwbuf1->rpl_buf = NULL;
104 	}
105 	if (rwbuf2->rpl_data) {
106 		kmem_free(rwbuf2->rpl_data, dbtob(un->un_iosize));
107 		rwbuf2->rpl_data = NULL;
108 	}
109 	if (rwbuf2->rpl_buf) {
110 		kmem_free((caddr_t)rwbuf2->rpl_buf, sizeof (buf_t));
111 		rwbuf2->rpl_buf = NULL;
112 	}
113 }
114 
115 static void
raid_rply_alloc(mr_unit_t * un,raid_rplybuf_t ** bufs,raid_rplybuf_t * rwbuf1,raid_rplybuf_t * rwbuf2)116 raid_rply_alloc(mr_unit_t *un,
117 		raid_rplybuf_t **bufs,
118 		raid_rplybuf_t *rwbuf1,
119 		raid_rplybuf_t *rwbuf2)
120 {
121 	int		i;
122 	raid_rplybuf_t *tmp;
123 	buf_t		*bp;
124 
125 	/* intialization */
126 	*bufs = kmem_zalloc(sizeof (raid_rplybuf_t) * un->un_totalcolumncnt,
127 	    KM_SLEEP);
128 	ASSERT(*bufs != NULL);
129 	bzero((caddr_t)rwbuf1, sizeof (raid_rplybuf_t));
130 	bzero((caddr_t)rwbuf2, sizeof (raid_rplybuf_t));
131 
132 	/* allocate all the buffers required for the replay processing */
133 	for (i = 0, tmp = *bufs; i < un->un_totalcolumncnt; i++, tmp++) {
134 		tmp->rpl_data = kmem_zalloc(DEV_BSIZE, KM_SLEEP);
135 		ASSERT(tmp->rpl_data != NULL);
136 		tmp->rpl_buf = kmem_zalloc(sizeof (buf_t), KM_SLEEP);
137 		ASSERT(tmp->rpl_buf != NULL);
138 		bp = (buf_t *)tmp->rpl_buf;
139 		bp->b_back = bp;
140 		bp->b_forw = bp;
141 		bp->b_flags = B_BUSY;
142 		bp->b_offset = -1;
143 		/* Initialize semaphores */
144 		sema_init(&bp->b_io, 0, NULL,
145 			SEMA_DEFAULT, NULL);
146 		sema_init(&bp->b_sem, 0, NULL,
147 			SEMA_DEFAULT, NULL);
148 	}
149 
150 	rwbuf1->rpl_data = kmem_zalloc(dbtob(un->un_iosize), KM_SLEEP);
151 	ASSERT(rwbuf1->rpl_data != NULL);
152 	rwbuf1->rpl_buf = kmem_zalloc(sizeof (buf_t), KM_SLEEP);
153 	ASSERT(rwbuf1->rpl_buf != NULL);
154 	rwbuf2->rpl_data = kmem_zalloc(dbtob(un->un_iosize), KM_SLEEP);
155 	ASSERT(rwbuf2->rpl_data != NULL);
156 	rwbuf2->rpl_buf = kmem_zalloc(sizeof (buf_t), KM_SLEEP);
157 	ASSERT(rwbuf2->rpl_buf != NULL);
158 
159 	bp = (buf_t *)rwbuf1->rpl_buf;
160 	bp->b_back = bp;
161 	bp->b_forw = bp;
162 	bp->b_flags = B_BUSY;
163 	bp->b_offset = -1;
164 	/* Initialize semaphores */
165 	sema_init(&bp->b_io, 0, NULL,
166 		SEMA_DEFAULT, NULL);
167 	sema_init(&bp->b_sem, 0, NULL,
168 		SEMA_DEFAULT, NULL);
169 	bp = (buf_t *)rwbuf2->rpl_buf;
170 	bp->b_back = bp;
171 	bp->b_forw = bp;
172 	bp->b_flags = B_BUSY;
173 	bp->b_offset = -1;
174 	/* Initialize semaphores */
175 	sema_init(&bp->b_io, 0, NULL,
176 		SEMA_DEFAULT, NULL);
177 	sema_init(&bp->b_sem, 0, NULL,
178 		SEMA_DEFAULT, NULL);
179 }
180 
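/*
 * NAMES:	rpl_insert, rpl_delete, rpl_find
 * DESCRIPTION: RAID metadevice replay list processing APIs
 * PARAMETERS:	raid_rplylst_t **listp - address of the replay list head
 *		raid_rplylst_t **prevp - address of the link pointing at oldp
 *		raid_rplylst_t *newp, *oldp - entry to insert / to delete
 *		raid_rplylst_t *list, long long pw_id - list and id to search
 * RETURNS:	rpl_find returns the matching entry, or NULL if there is none
 */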
188 static void
rpl_insert(raid_rplylst_t ** listp,raid_rplylst_t * newp)189 rpl_insert(raid_rplylst_t **listp, raid_rplylst_t *newp)
190 {
191 	raid_rplylst_t *tmp, **prevp;
192 
193 	for (prevp = listp; ((tmp = *prevp) != NULL); prevp = &tmp->rpl_next) {
194 		if (tmp->rpl_id > newp->rpl_id) {
195 			break;
196 		}
197 	}
198 	newp->rpl_next = tmp;
199 	*prevp = newp;
200 }
201 
202 static void
rpl_delete(raid_rplylst_t ** prevp,raid_rplylst_t * oldp)203 rpl_delete(raid_rplylst_t **prevp, raid_rplylst_t *oldp)
204 {
205 
206 	ASSERT((caddr_t)oldp);
207 	raid_total_rply_entries --;
208 	*prevp = oldp->rpl_next;
209 	kmem_free((caddr_t)oldp, sizeof (raid_rplylst_t));
210 }
211 
212 static raid_rplylst_t *
rpl_find(raid_rplylst_t * list,long long pw_id)213 rpl_find(raid_rplylst_t *list, long long pw_id)
214 {
215 	raid_rplylst_t *tmp;
216 
217 	for (tmp = list; tmp; tmp = tmp->rpl_next) {
218 		if (pw_id == tmp->rpl_id) {
219 			return (tmp);
220 		}
221 	}
222 	return ((raid_rplylst_t *)NULL);
223 }
224 
225 /*
226  * NAMES:	enq_rplylst
227  * DESCRIPTION: Enqueue a pre-write header into the replay list.
228  * PARAMETERS:	raid_rplylst_t *list - pointer to the replay list.
229  *		raid_pwhdr_t   *pwptr - pointer to a pre-write header.
230  * RETURNS:
231  */
232 static void
enq_rplylst(raid_rplylst_t ** listp,raid_pwhdr_t * pwhp,uint_t slot,int column)233 enq_rplylst(raid_rplylst_t **listp, raid_pwhdr_t *pwhp,
234 		uint_t slot, int column)
235 {
236 	raid_rplylst_t *newp, *oldp;
237 
238 	/* check if the pre-write existed in the list */
239 	if ((pwhp->rpw_colcount <= 2) &&
240 	    (oldp = rpl_find(*listp, pwhp->rpw_id))) {
241 		bcopy((caddr_t)pwhp, (caddr_t)&oldp->rpl_pwhdr2,
242 			sizeof (raid_pwhdr_t));
243 		oldp->rpl_slot2   = slot;
244 		oldp->rpl_column2 = column;
245 	} else {
246 		raid_total_rply_entries ++;
247 		newp = (raid_rplylst_t *)kmem_zalloc(sizeof (raid_rplylst_t),
248 		    KM_SLEEP);
249 		ASSERT(newp != NULL);
250 		bcopy((caddr_t)pwhp, (caddr_t)&newp->rpl_pwhdr1,
251 			sizeof (raid_pwhdr_t));
252 		bzero((caddr_t)&newp->rpl_pwhdr2, sizeof (raid_pwhdr_t));
253 
254 		newp->rpl_id = pwhp->rpw_id;
255 		newp->rpl_column1 = column;
256 		newp->rpl_slot1 = slot;
257 		newp->rpl_next = (raid_rplylst_t *)NULL;
258 		newp->rpl_colcnt = pwhp->rpw_colcount;
259 		rpl_insert(listp, newp);
260 	}
261 }
262 
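/*
 * NAMES:	pw_read_done and pw_write_done
 * DESCRIPTION: b_iodone routines installed on the pre-write read and write
 *		bufs.  Each marks the buf B_DONE and then posts b_sem when
 *		B_ASYNC is set, or b_io otherwise.
 * PARAMETERS:	buf_t *bp - pointer to the buf whose I/O has completed
 * RETURNS:	0
 */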
269 static int
pw_read_done(buf_t * bp)270 pw_read_done(buf_t *bp)
271 {
272 	ASSERT(SEMA_HELD(&bp->b_sem));
273 	ASSERT((bp->b_flags & B_DONE) == 0);
274 
275 	bp->b_flags |= B_DONE;
276 
277 	if (bp->b_flags & B_ASYNC)
278 		sema_v(&bp->b_sem);
279 	else
280 		/* wakeup the thread waiting on this buf */
281 		sema_v(&bp->b_io);
282 	return (0);
283 }
284 
285 static int
pw_write_done(buf_t * bp)286 pw_write_done(buf_t *bp)
287 {
288 	ASSERT(SEMA_HELD(&bp->b_sem));
289 	ASSERT((bp->b_flags & B_DONE) == 0);
290 
291 	bp->b_flags |= B_DONE;
292 
293 	if (bp->b_flags & B_ASYNC)
294 		sema_v(&bp->b_sem);
295 	else
296 		/* wakeup the thread waiting on this buf */
297 		sema_v(&bp->b_io);
298 
299 	return (0);
300 }
301 
302 /*
303  * NAMES:	raid_pwhdr_read
304  * DESCRIPTION: issue a syncronous read to read a pre-write header
305  * PARAMETERS:	mr_unit_t *un - pointer to the unit structure
306  *		int	pw_slot - pre-write entry slot number
307  *		int	column	- column number for the pre-write entry
308  *		raid_rplybuf_t *bufp - pointer to the replay buffer structure
309  * RETURNS:
310  */
311 static void
raid_pwhdr_read(mr_unit_t * un,int pw_slot,int column,raid_rplybuf_t * bufp)312 raid_pwhdr_read(mr_unit_t *un, int pw_slot, int column, raid_rplybuf_t *bufp)
313 {
314 	buf_t		*bp;
315 
316 	/* set up pointers from raid_rplybuf_t *bufp */
317 	bp = (buf_t *)bufp->rpl_buf;
318 
319 	/* calculate the data address or block number */
320 	bp->b_un.b_addr = bufp->rpl_data;
321 	bp->b_lblkno = un->un_column[column].un_pwstart +
322 		pw_slot * un->un_iosize;
323 	bp->b_edev = md_dev64_to_dev(un->un_column[column].un_dev);
324 	bp->b_bufsize = DEV_BSIZE;
325 	bp->b_bcount = DEV_BSIZE;
326 	bp->b_flags  = (B_READ | B_BUSY);
327 	bp->b_iodone = pw_read_done;
328 	(void) md_call_strategy(bp, 0, NULL);
329 }
330 
331 /*
332  * NAMES:	raid_pw_read
333  * DESCRIPTION: issue a syncronous read to read a pre-write entry
334  * PARAMETERS:	mr_unit_t	*un    - pointer to the unit structure
335  *		int		column - column number for the pre-write entry
336  *		u_int		slot   - pre-write entry slot number
337  *		raid_rplybuf_t	*bufp  - pointer to the replay buffer structure
338  * RETURNS:
339  */
340 static int
raid_pw_read(mr_unit_t * un,int column,uint_t slot,raid_rplybuf_t * bufp)341 raid_pw_read(mr_unit_t *un, int column, uint_t slot, raid_rplybuf_t *bufp)
342 {
343 	buf_t	*bp;
344 	int	error;
345 	uint_t	blkcnt  = un->un_iosize;
346 	uint_t	bytecnt = blkcnt * DEV_BSIZE;
347 
348 	/* if this column is no longer accessible, return */
349 	if (!COLUMN_ISUP(un, column))
350 		return (RAID_RPLY_COMPREPLAY);
351 
352 	/* set up pointers from raid_rplybuf_t *bufp */
353 	bp = (buf_t *)bufp->rpl_buf;
354 
355 	/* calculate the data address or block number */
356 	bp->b_un.b_addr = bufp->rpl_data;
357 	bp->b_bufsize = bytecnt;
358 	bp->b_bcount = bytecnt;
359 	bp->b_flags = (B_READ | B_BUSY);
360 	bp->b_edev = md_dev64_to_dev(un->un_column[column].un_dev);
361 	bp->b_lblkno = un->un_column[column].un_pwstart + (slot * blkcnt);
362 	bp->b_iodone = pw_read_done;
363 	(void) md_call_strategy(bp, 0, NULL);
364 	if (biowait(bp)) {
365 		error = raid_replay_error(un, column);
366 		return (error);
367 	}
368 	return (0);
369 }
370 
371 /*
372  * NAMES:	raid_pw_write
373  * DESCRIPTION: issue a syncronous write to write a pre-write entry
374  * PARAMETERS:	mr_unit_t *un - pointer to the unit structure
375  *		int	column	- column number for the pre-write entry
376  *		raid_pwhdr_t   *pwhp - needed for some infos about the pw header
377  *		raid_rplybuf_t *bufp - pointer to the replay buffer structure
378  * RETURNS:
379  */
380 static int
raid_pw_write(mr_unit_t * un,int column,raid_pwhdr_t * pwhp,raid_rplybuf_t * bufp)381 raid_pw_write(mr_unit_t *un, int column, raid_pwhdr_t *pwhp,
382     raid_rplybuf_t *bufp)
383 {
384 	buf_t	 *bp;
385 	int	 error;
386 
387 	/* if this column is no longer accessible, return */
388 	if (!COLUMN_ISUP(un, column))
389 		return (RAID_RPLY_COMPREPLAY);
390 
391 	/* set up pointers from raid_rplybuf_t *bufp */
392 	bp = (buf_t *)bufp->rpl_buf;
393 
394 	/* calculate the data address or block number */
395 	bp->b_un.b_addr = bufp->rpl_data + DEV_BSIZE;
396 	bp->b_bufsize = dbtob(pwhp->rpw_blkcnt);
397 	bp->b_bcount = dbtob(pwhp->rpw_blkcnt);
398 	bp->b_flags = (B_WRITE | B_BUSY);
399 	bp->b_edev  = md_dev64_to_dev(un->un_column[column].un_dev);
400 	bp->b_lblkno = un->un_column[column].un_devstart + pwhp->rpw_blkno;
401 	bp->b_iodone = pw_write_done;
402 	(void) md_call_strategy(bp, 0, NULL);
403 	if (biowait(bp)) {
404 		error = raid_replay_error(un, column);
405 		return (error);
406 	}
407 	return (0);
408 }
409 
410 /*
411  * NAMES:	genchecksum
412  * DESCRIPTION: generate check sum for a pre-write entry
413  * PARAMETERS:	caddr_t addr - where the data bytes are
414  *		int bcount - number of bytes in the pre-write entry
415  * RETURNS:
416  */
417 static uint_t
genchecksum(caddr_t addr,size_t bcount)418 genchecksum(caddr_t addr, size_t bcount)
419 {
420 	uint_t *dbuf;
421 	size_t wordcnt;
422 	uint_t dsum = 0;
423 
424 	wordcnt = bcount / sizeof (uint_t);
425 	dbuf = (uint_t *)(void *)(addr);
426 
427 	while (wordcnt--) {
428 		dsum ^= *dbuf;
429 		dbuf++;
430 	}
431 	return (dsum);
432 }
433 
434 /*
435  * NAMES:	raid_rply_verify
436  * DESCRIPTION: verify the pre-write entry for replay
437  * PARAMETERS:	mr_unit_t *un	- pointer to unit structure
438  *		int col1	- column number 1
439  *		int goodsum1	- flag to indicate good checksum
440  *		int *do_1	- flag to indicate whether we should replay
441  *				  the first pre-write
442  *		int col2	- column number 2
443  *		int goodsum2	- flag to indicate good checksum
444  *		int *do_2	- flag to indicate whether we should replay
445  *				  the first pre-write
446  * RETURNS:
447  */
448 static void
raid_rply_verify(mr_unit_t * un,int col1,int goodsum1,int * do_1,int col2,int goodsum2,int * do_2)449 raid_rply_verify(mr_unit_t *un, int col1, int goodsum1, int *do_1,
450     int col2, int goodsum2, int *do_2)
451 {
452 	int	good_state1 = 0;
453 	int	good_state2 = 0;
454 
455 	*do_1 = 0; *do_2 = 0;		/* prepare for the worst */
456 	if (COLUMN_ISUP(un, col1)) {
457 		good_state1 = 1;
458 	}
459 	if (COLUMN_ISUP(un, col2)) {
460 		good_state2 = 1;
461 	}
462 	if ((good_state1 & good_state2) && (goodsum1 & goodsum2)) {
463 		/* if both columns check out, do it */
464 		*do_1 = 1; *do_2 = 1;
465 	} else if ((good_state1 & goodsum1) && !good_state2) {
466 		/* if one column is okay and the other is errored, do it */
467 		*do_1 = 1; *do_2 = 0;
468 	} else if ((good_state2 & goodsum2) && !good_state1) {
469 		/* if one column is okay and the other is errored, do it */
470 		*do_2 = 1; *do_1 = 0;
471 	}
472 }
473 
474 /*
475  * NAMES:	raid_rplyeach
476  * DESCRIPTION: issue a syncronous read to read a pre-write header
477  * PARAMETERS:	mr_unit_t *un - pointer to the unit structure
478  *		raid_rplylst_t *eachp - pointer to the replay list entry
479  *		raid_rplybuf_t *rwbuf1 - pointer to the replay buffer structure
480  *		raid_rplybuf_t *rwbuf2 - pointer to the replay buffer structure
481  * RETURNS:
482  */
483 static int
raid_rplyeach(mr_unit_t * un,raid_rplylst_t * eachp,raid_rplybuf_t * rwbuf1,raid_rplybuf_t * rwbuf2)484 raid_rplyeach(
485 	mr_unit_t	*un,
486 	raid_rplylst_t	*eachp,
487 	raid_rplybuf_t	*rwbuf1,
488 	raid_rplybuf_t	*rwbuf2
489 )
490 {
491 	raid_pwhdr_t	*pwhp1;
492 	raid_pwhdr_t	*pwhp2;
493 	uint_t		dsum1 = 0;
494 	uint_t		dsum2 = 0;
495 	int		good_pw1 = 0;
496 	int		good_pw2 = 0;
497 	int		do_1 = 0;
498 	int		do_2 = 0;
499 	int		error = 0;
500 
501 	/* First verify the normal case - two pre-write entries are all good */
502 	if ((eachp->rpl_pwhdr1.rpw_magic == RAID_PWMAGIC &&
503 	    eachp->rpl_pwhdr2.rpw_magic == RAID_PWMAGIC) &&
504 	    (eachp->rpl_pwhdr1.rpw_blkcnt == eachp->rpl_pwhdr2.rpw_blkcnt)) {
505 
506 		ASSERT(eachp->rpl_pwhdr1.rpw_id == eachp->rpl_pwhdr2.rpw_id);
507 
508 		/* read the pre-write entries */
509 		error = raid_pw_read(un, eachp->rpl_column1,
510 		    eachp->rpl_slot1, rwbuf1);
511 		pwhp1 = &eachp->rpl_pwhdr1;
512 		if (error) {
513 			if (error != RAID_RPLY_COMPREPLAY)
514 				return (error);
515 			good_pw1 = FALSE;
516 		} else {
517 			/* generate checksum for each pre-write entry */
518 			dsum1 = genchecksum(rwbuf1->rpl_data + DEV_BSIZE,
519 						dbtob(pwhp1->rpw_blkcnt));
520 			good_pw1 = (dsum1 == pwhp1->rpw_sum);
521 		}
522 
523 		error = raid_pw_read(un, eachp->rpl_column2, eachp->rpl_slot2,
524 		    rwbuf2);
525 		pwhp2 = &eachp->rpl_pwhdr2;
526 		if (error) {
527 			if (error != RAID_RPLY_COMPREPLAY)
528 				return (error);
529 			good_pw2 = FALSE;
530 		} else {
531 			/* generate checksum for pre-write entry */
532 			dsum2 = genchecksum(rwbuf2->rpl_data + DEV_BSIZE,
533 						dbtob(pwhp2->rpw_blkcnt));
534 			good_pw2 = (dsum2 == pwhp2->rpw_sum);
535 		}
536 
537 		/* verify the checksums and states */
538 		raid_rply_verify(un, eachp->rpl_column1, good_pw1, &do_1,
539 			eachp->rpl_column2, good_pw2, &do_2);
540 
541 		/* write (replay) the pre-write entries */
542 		if (do_1) {
543 			error = raid_pw_write(un, eachp->rpl_column1,
544 			    &eachp->rpl_pwhdr1, rwbuf1);
545 			if (error && (error != RAID_RPLY_COMPREPLAY)) {
546 				return (error);
547 			}
548 		}
549 		if (do_2) {
550 			error = raid_pw_write(un, eachp->rpl_column2,
551 			    &eachp->rpl_pwhdr2, rwbuf2);
552 			if (error && (error != RAID_RPLY_COMPREPLAY)) {
553 				return (error);
554 			}
555 		}
556 		return (0);
557 	}
558 	if (eachp->rpl_pwhdr1.rpw_magic == RAID_PWMAGIC) {
559 		/*
560 		 * if partner was errored at time of write
561 		 * or due to open or replay, replay this entry
562 		 */
563 		if ((eachp->rpl_pwhdr1.rpw_columnnum == -1) ||
564 		    (! COLUMN_ISUP(un, eachp->rpl_pwhdr1.rpw_columnnum))) {
565 			/* read the pre-write entry */
566 			error = raid_pw_read(un, eachp->rpl_column1,
567 			    eachp->rpl_slot1, rwbuf1);
568 			if (error)
569 				return (error);
570 			/* generate checksum for the pre-write entry */
571 			pwhp1 = &eachp->rpl_pwhdr1;
572 			dsum1 = genchecksum(rwbuf1->rpl_data + DEV_BSIZE,
573 						dbtob(pwhp1->rpw_blkcnt));
574 			if (dsum1 == pwhp1->rpw_sum) {
575 				error = raid_pw_write(un, eachp->rpl_column1,
576 						&eachp->rpl_pwhdr1, rwbuf1);
577 				if (error && (error != RAID_RPLY_COMPREPLAY)) {
578 					return (error);
579 				}
580 			}
581 		}
582 		return (0);
583 	}
584 
585 	return (0);
586 }
587 
588 static int
replay_line(mr_unit_t * un,raid_rplylst_t * eachp,raid_rplybuf_t * rplybuf)589 replay_line(mr_unit_t *un, raid_rplylst_t *eachp, raid_rplybuf_t *rplybuf)
590 {
591 	raid_pwhdr_t	*pwhdr1, *pwhdr2;
592 	raid_rplylst_t	*eachpn;
593 	int		i;
594 	int		cnt;
595 	diskaddr_t	blkno;
596 	uint_t		blkcnt;
597 	long long	id;
598 	int		dsum;
599 	int		error;
600 	int		colcnt, col, col2;
601 	int		down;
602 
603 	if (eachp->rpl_id == 0)
604 		return (0);
605 	/*
606 	 * check: 1 - enough equal ids
607 	 *	  2 - all have same columncnt
608 	 *	  3 - all have same blkno
609 	 *	  4 - all have same blkcnt
610 	 *
611 	 * read each and check the checksum
612 	 * write each
613 	 */
614 
615 	cnt = eachp->rpl_colcnt;
616 	id = eachp->rpl_id;
617 	pwhdr1 = &eachp->rpl_pwhdr1;
618 	blkno = pwhdr1->rpw_blkno;
619 	blkcnt = pwhdr1->rpw_blkcnt;
620 
621 	error = raid_pw_read(un, eachp->rpl_column1, eachp->rpl_slot1, rplybuf);
622 	dsum = genchecksum(rplybuf->rpl_data + DEV_BSIZE,
623 	    dbtob(pwhdr1->rpw_blkcnt));
624 
625 	if (dsum != pwhdr1->rpw_sum)
626 		return (0);
627 
628 	if (error) {
629 		if (error == RAID_RPLY_COMPREPLAY)
630 			return (0);
631 		else
632 			return (1);
633 	}
634 
635 	eachpn = eachp->rpl_next;
636 	for (i = 1; i < cnt; i++) {
637 		if (eachpn == NULL)
638 			break;
639 		col2 = eachpn->rpl_column1;
640 		ASSERT(col2 < un->un_totalcolumncnt);
641 		pwhdr2 = &eachpn->rpl_pwhdr1;
642 		if ((pwhdr2->rpw_blkno != blkno) ||
643 		    (pwhdr2->rpw_blkcnt != blkcnt) ||
644 		    (eachpn->rpl_id != id) ||
645 		    (pwhdr2->rpw_colcount != cnt)) {
646 			return (0);
647 		}
648 
649 		error = raid_pw_read(un, col2, eachpn->rpl_slot1, rplybuf);
650 		dsum = genchecksum(rplybuf->rpl_data + DEV_BSIZE,
651 		    dbtob(pwhdr2->rpw_blkcnt));
652 		if (dsum != pwhdr2->rpw_sum)
653 			return (0);
654 		eachpn = eachpn->rpl_next;
655 	}
656 	colcnt = i;
657 
658 	if (error)
659 		return (0);
660 
661 	down = raid_state_cnt(un, RCS_ERRED);
662 	if ((i != un->un_totalcolumncnt) &&
663 	    (i != (un->un_totalcolumncnt - down)))
664 		return (0);
665 
666 	/* there ara enough columns to write correctly */
667 	eachpn = eachp;
668 	for (i = 0; i < colcnt; i++) {
669 		col = eachpn->rpl_column1;
670 		error = raid_pw_read(un, col, eachpn->rpl_slot1, rplybuf);
671 		error = raid_pw_write(un, col, &eachpn->rpl_pwhdr1, rplybuf);
672 		eachpn->rpl_id = 0;
673 		if (error && (error != RAID_RPLY_COMPREPLAY))
674 			return (1);
675 		eachpn = eachpn->rpl_next;
676 	}
677 	return (0);
678 }
679 
680 /*
681  * NAMES:	raid_replay_error
682  * DESCRIPTION: RAID metadevice replay error handling routine (TBD)
683  * PARAMETERS:
684  * RETURNS:
685  */
686 static int
raid_replay_error(mr_unit_t * un,int column)687 raid_replay_error(mr_unit_t *un, int column)
688 {
689 	int	error = RAID_RPLY_COMPREPLAY;
690 
691 	raid_set_state(un, column, RCS_ERRED, 0);
692 	raid_commit(un, NULL);
693 
694 	if (UNIT_STATE(un) == RUS_LAST_ERRED) {
695 		error = RAID_RPLY_READONLY;
696 		SE_NOTIFY(EC_SVM_STATE, ESC_SVM_LASTERRED, SVM_TAG_METADEVICE,
697 		    MD_UN2SET(un), MD_SID(un));
698 	} else if (UNIT_STATE(un) == RUS_ERRED) {
699 		SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED, SVM_TAG_METADEVICE,
700 		    MD_UN2SET(un), MD_SID(un));
701 	}
702 
703 	return (error);
704 }
705 
706 /*
707  * NAMES:	raid_replay
708  * DESCRIPTION: RAID metadevice main replay processing routine
709  * PARAMETERS:	mr_unit_t *un - pointer to an unit structure
710  * RETURNS:
711  */
712 
713 int
raid_replay(mr_unit_t * un)714 raid_replay(mr_unit_t *un)
715 {
716 	raid_rplylst_t	*rplylst = NULL;
717 	raid_rplylst_t	**prevp, *eachp;
718 	raid_rplybuf_t	*rplybuf;
719 	raid_rplybuf_t	rwbuf1;
720 	raid_rplybuf_t	rwbuf2;
721 	mr_column_t	*colptr;
722 	raid_pwhdr_t	pwhdr;
723 	raid_pwhdr_t	*pwhdrp = &pwhdr;
724 	int		error = 0;
725 	int		i, j;
726 	diskaddr_t	max_blkno = un->un_segsize * un->un_segsincolumn;
727 	int		totalcolumns = un->un_totalcolumncnt;
728 
729 	raid_rply_alloc(un, &rplybuf, &rwbuf1, &rwbuf2);
730 
731 	/* build a replay list based on the order of pre-write id */
732 	for (i = 0; i < un->un_pwcnt; i++) {
733 		/* issue a synchronous read for each column */
734 		for (j = 0; j < un->un_totalcolumncnt; j++) {
735 			if (COLUMN_ISUP(un, j)) {
736 				raid_pwhdr_read(un, i, j, &rplybuf[j]);
737 				/* wait for I/O completion for each column */
738 				if (biowait((buf_t *)rplybuf[j].rpl_buf)) {
739 					/* potential state transition */
740 					error = raid_replay_error(un, j);
741 					if (error == RAID_RPLY_COMPREPLAY)
742 						continue;
743 					else
744 						goto replay_failed;
745 				}
746 				if (un->c.un_revision & MD_64BIT_META_DEV) {
747 					pwhdrp = (raid_pwhdr_t *)
748 							rplybuf[j].rpl_data;
749 				} else {
750 					RAID_CONVERT_RPW((raid_pwhdr32_od_t *)
751 							rplybuf[j].rpl_data,
752 							pwhdrp);
753 				}
754 
755 				/* first check pre-write magic number */
756 				if (pwhdrp->rpw_magic != RAID_PWMAGIC) {
757 					continue;
758 				}
759 				if (pwhdrp->rpw_column != j) {
760 					continue;
761 				}
762 				if (pwhdrp->rpw_id == (long long) 0) {
763 					continue;
764 				}
765 				if (pwhdrp->rpw_blkcnt > (un->un_iosize - 1)) {
766 					continue;
767 				}
768 				if (pwhdrp->rpw_blkcnt == 0) {
769 					continue;
770 				}
771 				if (pwhdrp->rpw_blkno > max_blkno) {
772 					continue;
773 				}
774 				if ((pwhdrp->rpw_columnnum < 0) ||
775 				    (pwhdrp->rpw_columnnum > totalcolumns)) {
776 					continue;
777 				}
778 				if (((pwhdrp->rpw_colcount != 1) &&
779 				    (pwhdrp->rpw_colcount != 2) &&
780 				    (pwhdrp->rpw_colcount != totalcolumns))) {
781 					continue;
782 				}
783 
784 				enq_rplylst(&rplylst, pwhdrp, i, j);
785 			}
786 		}
787 	}
788 
789 	/* replay each entry in the replay list */
790 	prevp = &rplylst;
791 	while ((eachp = *prevp) != NULL) {
792 		/* zero out the pre-write headers in the buffer */
793 		bzero((caddr_t)rwbuf1.rpl_data, sizeof (raid_pwhdr_t));
794 		bzero((caddr_t)rwbuf2.rpl_data, sizeof (raid_pwhdr_t));
795 
796 		if (eachp->rpl_colcnt <= 2)
797 			error = raid_rplyeach(un, eachp, &rwbuf1, &rwbuf2);
798 		else
799 			error = replay_line(un, eachp, &rwbuf1);
800 
801 		if (error && (error != RAID_RPLY_COMPREPLAY)) {
802 			goto replay_failed;
803 		}
804 
805 		/* free the processed replay list entry */
806 		rpl_delete(prevp, eachp);
807 		prevp = &rplylst;
808 	}
809 
810 	/* zero out all pre-write entries in this unit */
811 	for (j = 0; j < un->un_totalcolumncnt; j++) {
812 		if (COLUMN_ISUP(un, j)) {
813 			colptr = &un->un_column[j];
814 			if (init_pw_area(un, colptr->un_dev,
815 						colptr->un_pwstart, j))
816 				break;
817 		}
818 	}
819 
820 	/* deallocate all the buffer resource allocated in this routine */
821 	raid_rply_dealloc(un, &rplybuf, &rwbuf1, &rwbuf2);
822 
823 	return (RAID_RPLY_SUCCESS);
824 
825 replay_failed:
826 
827 	/* first release the list */
828 	prevp = &rplylst;
829 	while ((eachp = *prevp) != NULL) {
830 		rpl_delete(prevp, eachp);
831 		prevp = &rplylst;
832 	}
833 
834 	/* then release buffers */
835 	raid_rply_dealloc(un, &rplybuf, &rwbuf1, &rwbuf2);
836 
837 	/* also reset the pre-write id variable to one */
838 	un->un_pwid = 1;
839 	raid_total_rply_entries = 0;
840 
841 	return (error);
842 }
843