1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
24 */
25
26 #pragma ident "%Z%%M% %I% %E% SMI"
27
28 /*
29 * NAME: raid_replay.c
30 *
31 * DESCRIPTION: RAID driver source file containing routines related to replay
32 * operation.
33 *
34 * ROUTINES PROVIDED FOR EXTERNAL USE:
35 * raid_replay() - replay all the pre write entries in the unit.
36 */
37
38 #include <sys/param.h>
39 #include <sys/systm.h>
40 #include <sys/conf.h>
41 #include <sys/file.h>
42 #include <sys/user.h>
43 #include <sys/uio.h>
44 #include <sys/t_lock.h>
45 #include <sys/buf.h>
46 #include <sys/dkio.h>
47 #include <sys/vtoc.h>
48 #include <sys/kmem.h>
49 #include <vm/page.h>
50 #include <sys/sysmacros.h>
51 #include <sys/types.h>
52 #include <sys/mkdev.h>
53 #include <sys/stat.h>
54 #include <sys/open.h>
55 #include <sys/modctl.h>
56 #include <sys/ddi.h>
57 #include <sys/sunddi.h>
58
59 #include <sys/lvm/md_raid.h>
60
61 #include <sys/sysevent/eventdefs.h>
62 #include <sys/sysevent/svm.h>
63
64 /* functions forward declarations */
65 static int raid_replay_error(mr_unit_t *un, int column);
66
67 int raid_total_rply_entries = 0;
68
69 /*
70 * NAMES: raid_rply_dealloc, raid_rply_alloc
71 * DESCRIPTION: RAID metadevice replay buffer allocation/deallocation routines
72 * PARAMETERS: mr_unit_t *un - pointer to the unit structure
73 * mr_unit_t *un - pointer to the unit structure
74 * RETURNS:
75 */
76 static void
raid_rply_dealloc(mr_unit_t * un,raid_rplybuf_t ** bufs,raid_rplybuf_t * rwbuf1,raid_rplybuf_t * rwbuf2)77 raid_rply_dealloc(mr_unit_t *un,
78 raid_rplybuf_t **bufs,
79 raid_rplybuf_t *rwbuf1,
80 raid_rplybuf_t *rwbuf2)
81 {
82 int i;
83 raid_rplybuf_t *tmp;
84
85 for (i = 0, tmp = *bufs; i < un->un_totalcolumncnt; i++, tmp++) {
86 if (tmp->rpl_data) {
87 kmem_free(tmp->rpl_data, DEV_BSIZE);
88 tmp->rpl_data = NULL;
89 }
90 if (tmp->rpl_buf) {
91 kmem_free(tmp->rpl_buf, sizeof (buf_t));
92 tmp->rpl_buf = NULL;
93 }
94 }
95 kmem_free(*bufs, sizeof (raid_rplybuf_t) * un->un_totalcolumncnt);
96 *bufs = NULL;
97 if (rwbuf1->rpl_data) {
98 kmem_free(rwbuf1->rpl_data, dbtob(un->un_iosize));
99 rwbuf1->rpl_data = NULL;
100 }
101 if (rwbuf1->rpl_buf) {
102 kmem_free((caddr_t)rwbuf1->rpl_buf, sizeof (buf_t));
103 rwbuf1->rpl_buf = NULL;
104 }
105 if (rwbuf2->rpl_data) {
106 kmem_free(rwbuf2->rpl_data, dbtob(un->un_iosize));
107 rwbuf2->rpl_data = NULL;
108 }
109 if (rwbuf2->rpl_buf) {
110 kmem_free((caddr_t)rwbuf2->rpl_buf, sizeof (buf_t));
111 rwbuf2->rpl_buf = NULL;
112 }
113 }
114
115 static void
raid_rply_alloc(mr_unit_t * un,raid_rplybuf_t ** bufs,raid_rplybuf_t * rwbuf1,raid_rplybuf_t * rwbuf2)116 raid_rply_alloc(mr_unit_t *un,
117 raid_rplybuf_t **bufs,
118 raid_rplybuf_t *rwbuf1,
119 raid_rplybuf_t *rwbuf2)
120 {
121 int i;
122 raid_rplybuf_t *tmp;
123 buf_t *bp;
124
125 /* intialization */
126 *bufs = kmem_zalloc(sizeof (raid_rplybuf_t) * un->un_totalcolumncnt,
127 KM_SLEEP);
128 ASSERT(*bufs != NULL);
129 bzero((caddr_t)rwbuf1, sizeof (raid_rplybuf_t));
130 bzero((caddr_t)rwbuf2, sizeof (raid_rplybuf_t));
131
132 /* allocate all the buffers required for the replay processing */
133 for (i = 0, tmp = *bufs; i < un->un_totalcolumncnt; i++, tmp++) {
134 tmp->rpl_data = kmem_zalloc(DEV_BSIZE, KM_SLEEP);
135 ASSERT(tmp->rpl_data != NULL);
136 tmp->rpl_buf = kmem_zalloc(sizeof (buf_t), KM_SLEEP);
137 ASSERT(tmp->rpl_buf != NULL);
138 bp = (buf_t *)tmp->rpl_buf;
139 bp->b_back = bp;
140 bp->b_forw = bp;
141 bp->b_flags = B_BUSY;
142 bp->b_offset = -1;
143 /* Initialize semaphores */
144 sema_init(&bp->b_io, 0, NULL,
145 SEMA_DEFAULT, NULL);
146 sema_init(&bp->b_sem, 0, NULL,
147 SEMA_DEFAULT, NULL);
148 }
149
150 rwbuf1->rpl_data = kmem_zalloc(dbtob(un->un_iosize), KM_SLEEP);
151 ASSERT(rwbuf1->rpl_data != NULL);
152 rwbuf1->rpl_buf = kmem_zalloc(sizeof (buf_t), KM_SLEEP);
153 ASSERT(rwbuf1->rpl_buf != NULL);
154 rwbuf2->rpl_data = kmem_zalloc(dbtob(un->un_iosize), KM_SLEEP);
155 ASSERT(rwbuf2->rpl_data != NULL);
156 rwbuf2->rpl_buf = kmem_zalloc(sizeof (buf_t), KM_SLEEP);
157 ASSERT(rwbuf2->rpl_buf != NULL);
158
159 bp = (buf_t *)rwbuf1->rpl_buf;
160 bp->b_back = bp;
161 bp->b_forw = bp;
162 bp->b_flags = B_BUSY;
163 bp->b_offset = -1;
164 /* Initialize semaphores */
165 sema_init(&bp->b_io, 0, NULL,
166 SEMA_DEFAULT, NULL);
167 sema_init(&bp->b_sem, 0, NULL,
168 SEMA_DEFAULT, NULL);
169 bp = (buf_t *)rwbuf2->rpl_buf;
170 bp->b_back = bp;
171 bp->b_forw = bp;
172 bp->b_flags = B_BUSY;
173 bp->b_offset = -1;
174 /* Initialize semaphores */
175 sema_init(&bp->b_io, 0, NULL,
176 SEMA_DEFAULT, NULL);
177 sema_init(&bp->b_sem, 0, NULL,
178 SEMA_DEFAULT, NULL);
179 }
180
181 /*
182 * NAMES: rpl_insert, rpl_delete, rpl_find
183 * DESCRIPTION: RAID metadevice replay list processing APIs
184 * PARAMETERS: raid_rplylst_t *list - pointer to the replay list.
185 * raid_pwhdr_t *pwptr - pointer to a pre-write header.
186 * RETURNS:
187 */
188 static void
rpl_insert(raid_rplylst_t ** listp,raid_rplylst_t * newp)189 rpl_insert(raid_rplylst_t **listp, raid_rplylst_t *newp)
190 {
191 raid_rplylst_t *tmp, **prevp;
192
193 for (prevp = listp; ((tmp = *prevp) != NULL); prevp = &tmp->rpl_next) {
194 if (tmp->rpl_id > newp->rpl_id) {
195 break;
196 }
197 }
198 newp->rpl_next = tmp;
199 *prevp = newp;
200 }
201
202 static void
rpl_delete(raid_rplylst_t ** prevp,raid_rplylst_t * oldp)203 rpl_delete(raid_rplylst_t **prevp, raid_rplylst_t *oldp)
204 {
205
206 ASSERT((caddr_t)oldp);
207 raid_total_rply_entries --;
208 *prevp = oldp->rpl_next;
209 kmem_free((caddr_t)oldp, sizeof (raid_rplylst_t));
210 }
211
212 static raid_rplylst_t *
rpl_find(raid_rplylst_t * list,long long pw_id)213 rpl_find(raid_rplylst_t *list, long long pw_id)
214 {
215 raid_rplylst_t *tmp;
216
217 for (tmp = list; tmp; tmp = tmp->rpl_next) {
218 if (pw_id == tmp->rpl_id) {
219 return (tmp);
220 }
221 }
222 return ((raid_rplylst_t *)NULL);
223 }
224
225 /*
226 * NAMES: enq_rplylst
227 * DESCRIPTION: Enqueue a pre-write header into the replay list.
228 * PARAMETERS: raid_rplylst_t *list - pointer to the replay list.
229 * raid_pwhdr_t *pwptr - pointer to a pre-write header.
230 * RETURNS:
231 */
232 static void
enq_rplylst(raid_rplylst_t ** listp,raid_pwhdr_t * pwhp,uint_t slot,int column)233 enq_rplylst(raid_rplylst_t **listp, raid_pwhdr_t *pwhp,
234 uint_t slot, int column)
235 {
236 raid_rplylst_t *newp, *oldp;
237
238 /* check if the pre-write existed in the list */
239 if ((pwhp->rpw_colcount <= 2) &&
240 (oldp = rpl_find(*listp, pwhp->rpw_id))) {
241 bcopy((caddr_t)pwhp, (caddr_t)&oldp->rpl_pwhdr2,
242 sizeof (raid_pwhdr_t));
243 oldp->rpl_slot2 = slot;
244 oldp->rpl_column2 = column;
245 } else {
246 raid_total_rply_entries ++;
247 newp = (raid_rplylst_t *)kmem_zalloc(sizeof (raid_rplylst_t),
248 KM_SLEEP);
249 ASSERT(newp != NULL);
250 bcopy((caddr_t)pwhp, (caddr_t)&newp->rpl_pwhdr1,
251 sizeof (raid_pwhdr_t));
252 bzero((caddr_t)&newp->rpl_pwhdr2, sizeof (raid_pwhdr_t));
253
254 newp->rpl_id = pwhp->rpw_id;
255 newp->rpl_column1 = column;
256 newp->rpl_slot1 = slot;
257 newp->rpl_next = (raid_rplylst_t *)NULL;
258 newp->rpl_colcnt = pwhp->rpw_colcount;
259 rpl_insert(listp, newp);
260 }
261 }
262
263 /*
264 * NAMES: pw_read_done and pw_write_done
265 * DESCRIPTION: don't know the usage yet ??? (TBD)
266 * PARAMETERS:
267 * RETURNS:
268 */
269 static int
pw_read_done(buf_t * bp)270 pw_read_done(buf_t *bp)
271 {
272 ASSERT(SEMA_HELD(&bp->b_sem));
273 ASSERT((bp->b_flags & B_DONE) == 0);
274
275 bp->b_flags |= B_DONE;
276
277 if (bp->b_flags & B_ASYNC)
278 sema_v(&bp->b_sem);
279 else
280 /* wakeup the thread waiting on this buf */
281 sema_v(&bp->b_io);
282 return (0);
283 }
284
285 static int
pw_write_done(buf_t * bp)286 pw_write_done(buf_t *bp)
287 {
288 ASSERT(SEMA_HELD(&bp->b_sem));
289 ASSERT((bp->b_flags & B_DONE) == 0);
290
291 bp->b_flags |= B_DONE;
292
293 if (bp->b_flags & B_ASYNC)
294 sema_v(&bp->b_sem);
295 else
296 /* wakeup the thread waiting on this buf */
297 sema_v(&bp->b_io);
298
299 return (0);
300 }
301
302 /*
303 * NAMES: raid_pwhdr_read
304 * DESCRIPTION: issue a syncronous read to read a pre-write header
305 * PARAMETERS: mr_unit_t *un - pointer to the unit structure
306 * int pw_slot - pre-write entry slot number
307 * int column - column number for the pre-write entry
308 * raid_rplybuf_t *bufp - pointer to the replay buffer structure
309 * RETURNS:
310 */
311 static void
raid_pwhdr_read(mr_unit_t * un,int pw_slot,int column,raid_rplybuf_t * bufp)312 raid_pwhdr_read(mr_unit_t *un, int pw_slot, int column, raid_rplybuf_t *bufp)
313 {
314 buf_t *bp;
315
316 /* set up pointers from raid_rplybuf_t *bufp */
317 bp = (buf_t *)bufp->rpl_buf;
318
319 /* calculate the data address or block number */
320 bp->b_un.b_addr = bufp->rpl_data;
321 bp->b_lblkno = un->un_column[column].un_pwstart +
322 pw_slot * un->un_iosize;
323 bp->b_edev = md_dev64_to_dev(un->un_column[column].un_dev);
324 bp->b_bufsize = DEV_BSIZE;
325 bp->b_bcount = DEV_BSIZE;
326 bp->b_flags = (B_READ | B_BUSY);
327 bp->b_iodone = pw_read_done;
328 (void) md_call_strategy(bp, 0, NULL);
329 }
330
331 /*
332 * NAMES: raid_pw_read
333 * DESCRIPTION: issue a syncronous read to read a pre-write entry
334 * PARAMETERS: mr_unit_t *un - pointer to the unit structure
335 * int column - column number for the pre-write entry
336 * u_int slot - pre-write entry slot number
337 * raid_rplybuf_t *bufp - pointer to the replay buffer structure
338 * RETURNS:
339 */
340 static int
raid_pw_read(mr_unit_t * un,int column,uint_t slot,raid_rplybuf_t * bufp)341 raid_pw_read(mr_unit_t *un, int column, uint_t slot, raid_rplybuf_t *bufp)
342 {
343 buf_t *bp;
344 int error;
345 uint_t blkcnt = un->un_iosize;
346 uint_t bytecnt = blkcnt * DEV_BSIZE;
347
348 /* if this column is no longer accessible, return */
349 if (!COLUMN_ISUP(un, column))
350 return (RAID_RPLY_COMPREPLAY);
351
352 /* set up pointers from raid_rplybuf_t *bufp */
353 bp = (buf_t *)bufp->rpl_buf;
354
355 /* calculate the data address or block number */
356 bp->b_un.b_addr = bufp->rpl_data;
357 bp->b_bufsize = bytecnt;
358 bp->b_bcount = bytecnt;
359 bp->b_flags = (B_READ | B_BUSY);
360 bp->b_edev = md_dev64_to_dev(un->un_column[column].un_dev);
361 bp->b_lblkno = un->un_column[column].un_pwstart + (slot * blkcnt);
362 bp->b_iodone = pw_read_done;
363 (void) md_call_strategy(bp, 0, NULL);
364 if (biowait(bp)) {
365 error = raid_replay_error(un, column);
366 return (error);
367 }
368 return (0);
369 }
370
371 /*
372 * NAMES: raid_pw_write
373 * DESCRIPTION: issue a syncronous write to write a pre-write entry
374 * PARAMETERS: mr_unit_t *un - pointer to the unit structure
375 * int column - column number for the pre-write entry
376 * raid_pwhdr_t *pwhp - needed for some infos about the pw header
377 * raid_rplybuf_t *bufp - pointer to the replay buffer structure
378 * RETURNS:
379 */
380 static int
raid_pw_write(mr_unit_t * un,int column,raid_pwhdr_t * pwhp,raid_rplybuf_t * bufp)381 raid_pw_write(mr_unit_t *un, int column, raid_pwhdr_t *pwhp,
382 raid_rplybuf_t *bufp)
383 {
384 buf_t *bp;
385 int error;
386
387 /* if this column is no longer accessible, return */
388 if (!COLUMN_ISUP(un, column))
389 return (RAID_RPLY_COMPREPLAY);
390
391 /* set up pointers from raid_rplybuf_t *bufp */
392 bp = (buf_t *)bufp->rpl_buf;
393
394 /* calculate the data address or block number */
395 bp->b_un.b_addr = bufp->rpl_data + DEV_BSIZE;
396 bp->b_bufsize = dbtob(pwhp->rpw_blkcnt);
397 bp->b_bcount = dbtob(pwhp->rpw_blkcnt);
398 bp->b_flags = (B_WRITE | B_BUSY);
399 bp->b_edev = md_dev64_to_dev(un->un_column[column].un_dev);
400 bp->b_lblkno = un->un_column[column].un_devstart + pwhp->rpw_blkno;
401 bp->b_iodone = pw_write_done;
402 (void) md_call_strategy(bp, 0, NULL);
403 if (biowait(bp)) {
404 error = raid_replay_error(un, column);
405 return (error);
406 }
407 return (0);
408 }
409
410 /*
411 * NAMES: genchecksum
412 * DESCRIPTION: generate check sum for a pre-write entry
413 * PARAMETERS: caddr_t addr - where the data bytes are
414 * int bcount - number of bytes in the pre-write entry
415 * RETURNS:
416 */
417 static uint_t
genchecksum(caddr_t addr,size_t bcount)418 genchecksum(caddr_t addr, size_t bcount)
419 {
420 uint_t *dbuf;
421 size_t wordcnt;
422 uint_t dsum = 0;
423
424 wordcnt = bcount / sizeof (uint_t);
425 dbuf = (uint_t *)(void *)(addr);
426
427 while (wordcnt--) {
428 dsum ^= *dbuf;
429 dbuf++;
430 }
431 return (dsum);
432 }
433
434 /*
435 * NAMES: raid_rply_verify
436 * DESCRIPTION: verify the pre-write entry for replay
437 * PARAMETERS: mr_unit_t *un - pointer to unit structure
438 * int col1 - column number 1
439 * int goodsum1 - flag to indicate good checksum
440 * int *do_1 - flag to indicate whether we should replay
441 * the first pre-write
442 * int col2 - column number 2
443 * int goodsum2 - flag to indicate good checksum
444 * int *do_2 - flag to indicate whether we should replay
445 * the first pre-write
446 * RETURNS:
447 */
448 static void
raid_rply_verify(mr_unit_t * un,int col1,int goodsum1,int * do_1,int col2,int goodsum2,int * do_2)449 raid_rply_verify(mr_unit_t *un, int col1, int goodsum1, int *do_1,
450 int col2, int goodsum2, int *do_2)
451 {
452 int good_state1 = 0;
453 int good_state2 = 0;
454
455 *do_1 = 0; *do_2 = 0; /* prepare for the worst */
456 if (COLUMN_ISUP(un, col1)) {
457 good_state1 = 1;
458 }
459 if (COLUMN_ISUP(un, col2)) {
460 good_state2 = 1;
461 }
462 if ((good_state1 & good_state2) && (goodsum1 & goodsum2)) {
463 /* if both columns check out, do it */
464 *do_1 = 1; *do_2 = 1;
465 } else if ((good_state1 & goodsum1) && !good_state2) {
466 /* if one column is okay and the other is errored, do it */
467 *do_1 = 1; *do_2 = 0;
468 } else if ((good_state2 & goodsum2) && !good_state1) {
469 /* if one column is okay and the other is errored, do it */
470 *do_2 = 1; *do_1 = 0;
471 }
472 }
473
474 /*
475 * NAMES: raid_rplyeach
476 * DESCRIPTION: issue a syncronous read to read a pre-write header
477 * PARAMETERS: mr_unit_t *un - pointer to the unit structure
478 * raid_rplylst_t *eachp - pointer to the replay list entry
479 * raid_rplybuf_t *rwbuf1 - pointer to the replay buffer structure
480 * raid_rplybuf_t *rwbuf2 - pointer to the replay buffer structure
481 * RETURNS:
482 */
483 static int
raid_rplyeach(mr_unit_t * un,raid_rplylst_t * eachp,raid_rplybuf_t * rwbuf1,raid_rplybuf_t * rwbuf2)484 raid_rplyeach(
485 mr_unit_t *un,
486 raid_rplylst_t *eachp,
487 raid_rplybuf_t *rwbuf1,
488 raid_rplybuf_t *rwbuf2
489 )
490 {
491 raid_pwhdr_t *pwhp1;
492 raid_pwhdr_t *pwhp2;
493 uint_t dsum1 = 0;
494 uint_t dsum2 = 0;
495 int good_pw1 = 0;
496 int good_pw2 = 0;
497 int do_1 = 0;
498 int do_2 = 0;
499 int error = 0;
500
501 /* First verify the normal case - two pre-write entries are all good */
502 if ((eachp->rpl_pwhdr1.rpw_magic == RAID_PWMAGIC &&
503 eachp->rpl_pwhdr2.rpw_magic == RAID_PWMAGIC) &&
504 (eachp->rpl_pwhdr1.rpw_blkcnt == eachp->rpl_pwhdr2.rpw_blkcnt)) {
505
506 ASSERT(eachp->rpl_pwhdr1.rpw_id == eachp->rpl_pwhdr2.rpw_id);
507
508 /* read the pre-write entries */
509 error = raid_pw_read(un, eachp->rpl_column1,
510 eachp->rpl_slot1, rwbuf1);
511 pwhp1 = &eachp->rpl_pwhdr1;
512 if (error) {
513 if (error != RAID_RPLY_COMPREPLAY)
514 return (error);
515 good_pw1 = FALSE;
516 } else {
517 /* generate checksum for each pre-write entry */
518 dsum1 = genchecksum(rwbuf1->rpl_data + DEV_BSIZE,
519 dbtob(pwhp1->rpw_blkcnt));
520 good_pw1 = (dsum1 == pwhp1->rpw_sum);
521 }
522
523 error = raid_pw_read(un, eachp->rpl_column2, eachp->rpl_slot2,
524 rwbuf2);
525 pwhp2 = &eachp->rpl_pwhdr2;
526 if (error) {
527 if (error != RAID_RPLY_COMPREPLAY)
528 return (error);
529 good_pw2 = FALSE;
530 } else {
531 /* generate checksum for pre-write entry */
532 dsum2 = genchecksum(rwbuf2->rpl_data + DEV_BSIZE,
533 dbtob(pwhp2->rpw_blkcnt));
534 good_pw2 = (dsum2 == pwhp2->rpw_sum);
535 }
536
537 /* verify the checksums and states */
538 raid_rply_verify(un, eachp->rpl_column1, good_pw1, &do_1,
539 eachp->rpl_column2, good_pw2, &do_2);
540
541 /* write (replay) the pre-write entries */
542 if (do_1) {
543 error = raid_pw_write(un, eachp->rpl_column1,
544 &eachp->rpl_pwhdr1, rwbuf1);
545 if (error && (error != RAID_RPLY_COMPREPLAY)) {
546 return (error);
547 }
548 }
549 if (do_2) {
550 error = raid_pw_write(un, eachp->rpl_column2,
551 &eachp->rpl_pwhdr2, rwbuf2);
552 if (error && (error != RAID_RPLY_COMPREPLAY)) {
553 return (error);
554 }
555 }
556 return (0);
557 }
558 if (eachp->rpl_pwhdr1.rpw_magic == RAID_PWMAGIC) {
559 /*
560 * if partner was errored at time of write
561 * or due to open or replay, replay this entry
562 */
563 if ((eachp->rpl_pwhdr1.rpw_columnnum == -1) ||
564 (! COLUMN_ISUP(un, eachp->rpl_pwhdr1.rpw_columnnum))) {
565 /* read the pre-write entry */
566 error = raid_pw_read(un, eachp->rpl_column1,
567 eachp->rpl_slot1, rwbuf1);
568 if (error)
569 return (error);
570 /* generate checksum for the pre-write entry */
571 pwhp1 = &eachp->rpl_pwhdr1;
572 dsum1 = genchecksum(rwbuf1->rpl_data + DEV_BSIZE,
573 dbtob(pwhp1->rpw_blkcnt));
574 if (dsum1 == pwhp1->rpw_sum) {
575 error = raid_pw_write(un, eachp->rpl_column1,
576 &eachp->rpl_pwhdr1, rwbuf1);
577 if (error && (error != RAID_RPLY_COMPREPLAY)) {
578 return (error);
579 }
580 }
581 }
582 return (0);
583 }
584
585 return (0);
586 }
587
588 static int
replay_line(mr_unit_t * un,raid_rplylst_t * eachp,raid_rplybuf_t * rplybuf)589 replay_line(mr_unit_t *un, raid_rplylst_t *eachp, raid_rplybuf_t *rplybuf)
590 {
591 raid_pwhdr_t *pwhdr1, *pwhdr2;
592 raid_rplylst_t *eachpn;
593 int i;
594 int cnt;
595 diskaddr_t blkno;
596 uint_t blkcnt;
597 long long id;
598 int dsum;
599 int error;
600 int colcnt, col, col2;
601 int down;
602
603 if (eachp->rpl_id == 0)
604 return (0);
605 /*
606 * check: 1 - enough equal ids
607 * 2 - all have same columncnt
608 * 3 - all have same blkno
609 * 4 - all have same blkcnt
610 *
611 * read each and check the checksum
612 * write each
613 */
614
615 cnt = eachp->rpl_colcnt;
616 id = eachp->rpl_id;
617 pwhdr1 = &eachp->rpl_pwhdr1;
618 blkno = pwhdr1->rpw_blkno;
619 blkcnt = pwhdr1->rpw_blkcnt;
620
621 error = raid_pw_read(un, eachp->rpl_column1, eachp->rpl_slot1, rplybuf);
622 dsum = genchecksum(rplybuf->rpl_data + DEV_BSIZE,
623 dbtob(pwhdr1->rpw_blkcnt));
624
625 if (dsum != pwhdr1->rpw_sum)
626 return (0);
627
628 if (error) {
629 if (error == RAID_RPLY_COMPREPLAY)
630 return (0);
631 else
632 return (1);
633 }
634
635 eachpn = eachp->rpl_next;
636 for (i = 1; i < cnt; i++) {
637 if (eachpn == NULL)
638 break;
639 col2 = eachpn->rpl_column1;
640 ASSERT(col2 < un->un_totalcolumncnt);
641 pwhdr2 = &eachpn->rpl_pwhdr1;
642 if ((pwhdr2->rpw_blkno != blkno) ||
643 (pwhdr2->rpw_blkcnt != blkcnt) ||
644 (eachpn->rpl_id != id) ||
645 (pwhdr2->rpw_colcount != cnt)) {
646 return (0);
647 }
648
649 error = raid_pw_read(un, col2, eachpn->rpl_slot1, rplybuf);
650 dsum = genchecksum(rplybuf->rpl_data + DEV_BSIZE,
651 dbtob(pwhdr2->rpw_blkcnt));
652 if (dsum != pwhdr2->rpw_sum)
653 return (0);
654 eachpn = eachpn->rpl_next;
655 }
656 colcnt = i;
657
658 if (error)
659 return (0);
660
661 down = raid_state_cnt(un, RCS_ERRED);
662 if ((i != un->un_totalcolumncnt) &&
663 (i != (un->un_totalcolumncnt - down)))
664 return (0);
665
666 /* there ara enough columns to write correctly */
667 eachpn = eachp;
668 for (i = 0; i < colcnt; i++) {
669 col = eachpn->rpl_column1;
670 error = raid_pw_read(un, col, eachpn->rpl_slot1, rplybuf);
671 error = raid_pw_write(un, col, &eachpn->rpl_pwhdr1, rplybuf);
672 eachpn->rpl_id = 0;
673 if (error && (error != RAID_RPLY_COMPREPLAY))
674 return (1);
675 eachpn = eachpn->rpl_next;
676 }
677 return (0);
678 }
679
680 /*
681 * NAMES: raid_replay_error
682 * DESCRIPTION: RAID metadevice replay error handling routine (TBD)
683 * PARAMETERS:
684 * RETURNS:
685 */
686 static int
raid_replay_error(mr_unit_t * un,int column)687 raid_replay_error(mr_unit_t *un, int column)
688 {
689 int error = RAID_RPLY_COMPREPLAY;
690
691 raid_set_state(un, column, RCS_ERRED, 0);
692 raid_commit(un, NULL);
693
694 if (UNIT_STATE(un) == RUS_LAST_ERRED) {
695 error = RAID_RPLY_READONLY;
696 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_LASTERRED, SVM_TAG_METADEVICE,
697 MD_UN2SET(un), MD_SID(un));
698 } else if (UNIT_STATE(un) == RUS_ERRED) {
699 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED, SVM_TAG_METADEVICE,
700 MD_UN2SET(un), MD_SID(un));
701 }
702
703 return (error);
704 }
705
706 /*
707 * NAMES: raid_replay
708 * DESCRIPTION: RAID metadevice main replay processing routine
709 * PARAMETERS: mr_unit_t *un - pointer to an unit structure
710 * RETURNS:
711 */
712
713 int
raid_replay(mr_unit_t * un)714 raid_replay(mr_unit_t *un)
715 {
716 raid_rplylst_t *rplylst = NULL;
717 raid_rplylst_t **prevp, *eachp;
718 raid_rplybuf_t *rplybuf;
719 raid_rplybuf_t rwbuf1;
720 raid_rplybuf_t rwbuf2;
721 mr_column_t *colptr;
722 raid_pwhdr_t pwhdr;
723 raid_pwhdr_t *pwhdrp = &pwhdr;
724 int error = 0;
725 int i, j;
726 diskaddr_t max_blkno = un->un_segsize * un->un_segsincolumn;
727 int totalcolumns = un->un_totalcolumncnt;
728
729 raid_rply_alloc(un, &rplybuf, &rwbuf1, &rwbuf2);
730
731 /* build a replay list based on the order of pre-write id */
732 for (i = 0; i < un->un_pwcnt; i++) {
733 /* issue a synchronous read for each column */
734 for (j = 0; j < un->un_totalcolumncnt; j++) {
735 if (COLUMN_ISUP(un, j)) {
736 raid_pwhdr_read(un, i, j, &rplybuf[j]);
737 /* wait for I/O completion for each column */
738 if (biowait((buf_t *)rplybuf[j].rpl_buf)) {
739 /* potential state transition */
740 error = raid_replay_error(un, j);
741 if (error == RAID_RPLY_COMPREPLAY)
742 continue;
743 else
744 goto replay_failed;
745 }
746 if (un->c.un_revision & MD_64BIT_META_DEV) {
747 pwhdrp = (raid_pwhdr_t *)
748 rplybuf[j].rpl_data;
749 } else {
750 RAID_CONVERT_RPW((raid_pwhdr32_od_t *)
751 rplybuf[j].rpl_data,
752 pwhdrp);
753 }
754
755 /* first check pre-write magic number */
756 if (pwhdrp->rpw_magic != RAID_PWMAGIC) {
757 continue;
758 }
759 if (pwhdrp->rpw_column != j) {
760 continue;
761 }
762 if (pwhdrp->rpw_id == (long long) 0) {
763 continue;
764 }
765 if (pwhdrp->rpw_blkcnt > (un->un_iosize - 1)) {
766 continue;
767 }
768 if (pwhdrp->rpw_blkcnt == 0) {
769 continue;
770 }
771 if (pwhdrp->rpw_blkno > max_blkno) {
772 continue;
773 }
774 if ((pwhdrp->rpw_columnnum < 0) ||
775 (pwhdrp->rpw_columnnum > totalcolumns)) {
776 continue;
777 }
778 if (((pwhdrp->rpw_colcount != 1) &&
779 (pwhdrp->rpw_colcount != 2) &&
780 (pwhdrp->rpw_colcount != totalcolumns))) {
781 continue;
782 }
783
784 enq_rplylst(&rplylst, pwhdrp, i, j);
785 }
786 }
787 }
788
789 /* replay each entry in the replay list */
790 prevp = &rplylst;
791 while ((eachp = *prevp) != NULL) {
792 /* zero out the pre-write headers in the buffer */
793 bzero((caddr_t)rwbuf1.rpl_data, sizeof (raid_pwhdr_t));
794 bzero((caddr_t)rwbuf2.rpl_data, sizeof (raid_pwhdr_t));
795
796 if (eachp->rpl_colcnt <= 2)
797 error = raid_rplyeach(un, eachp, &rwbuf1, &rwbuf2);
798 else
799 error = replay_line(un, eachp, &rwbuf1);
800
801 if (error && (error != RAID_RPLY_COMPREPLAY)) {
802 goto replay_failed;
803 }
804
805 /* free the processed replay list entry */
806 rpl_delete(prevp, eachp);
807 prevp = &rplylst;
808 }
809
810 /* zero out all pre-write entries in this unit */
811 for (j = 0; j < un->un_totalcolumncnt; j++) {
812 if (COLUMN_ISUP(un, j)) {
813 colptr = &un->un_column[j];
814 if (init_pw_area(un, colptr->un_dev,
815 colptr->un_pwstart, j))
816 break;
817 }
818 }
819
820 /* deallocate all the buffer resource allocated in this routine */
821 raid_rply_dealloc(un, &rplybuf, &rwbuf1, &rwbuf2);
822
823 return (RAID_RPLY_SUCCESS);
824
825 replay_failed:
826
827 /* first release the list */
828 prevp = &rplylst;
829 while ((eachp = *prevp) != NULL) {
830 rpl_delete(prevp, eachp);
831 prevp = &rplylst;
832 }
833
834 /* then release buffers */
835 raid_rply_dealloc(un, &rplybuf, &rwbuf1, &rwbuf2);
836
837 /* also reset the pre-write id variable to one */
838 un->un_pwid = 1;
839 raid_total_rply_entries = 0;
840
841 return (error);
842 }
843