1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License, Version 1.0 only
6 * (the "License"). You may not use this file except in compliance
7 * with the License.
8 *
9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10 * or http://www.opensolaris.org/os/licensing.
11 * See the License for the specific language governing permissions
12 * and limitations under the License.
13 *
14 * When distributing Covered Code, include this CDDL HEADER in each
15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16 * If applicable, add the following below this CDDL HEADER, with the
17 * fields enclosed by brackets "[]" replaced with your own identifying
18 * information: Portions Copyright [yyyy] [name of copyright owner]
19 *
20 * CDDL HEADER END
21 */
22 /*
23 * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
24 * Use is subject to license terms.
25 */
26
27 #pragma ident "%Z%%M% %I% %E% SMI"
28
29 #include <sys/param.h>
30 #include <sys/systm.h>
31 #include <sys/types.h>
32 #include <sys/debug.h>
33 #include <sys/errno.h>
34 #include <sys/sysmacros.h>
35 #include <sys/t_lock.h>
36 #include <sys/kmem.h>
37 #include <sys/lvm/md_trans.h>
38 #include <sys/ddi.h>
39 #include <sys/sunddi.h>
40 #include <sys/file.h>
41 #include <sys/proc.h>
42 #include <sys/disp.h>
43 #include <sys/lvm/md_notify.h>
44 #include <sys/lvm/mdvar.h>
45
46 #include <sys/sysevent/eventdefs.h>
47 #include <sys/sysevent/svm.h>
48
/* Defined outside this file. */
extern unit_t md_nunits;
extern set_t md_nsets;
extern md_set_t md_set[];

/* Its md_driver.md_drivername is used by ldl_create() to look up a type. */
extern md_ops_t trans_md_ops;
/* Compared against md_getmajor() to recognise devices that are metadevices. */
extern major_t md_major;

/* Held while ul_list is walked or modified and while md_nlogs is changed. */
static kmutex_t ml_lock;
static ml_unit_t *ul_list; /* List of all log units */
/* Incremented when a log unit goes on ul_list, decremented when it leaves. */
static int md_nlogs;
static kmutex_t ut_mutex; /* per log list of metatrans units */
static kmutex_t oc_mutex; /* single threads opens/closes */

/* Forward declaration: the buffer allocators call this before it is defined. */
static void md_free_cirbuf(cirbuf_ic_t *cb);

/* sema_p()/sema_v() on the buf's b_io semaphore. */
#define IOWAIT(bp) sema_p(&bp->b_io)
#define IODONE(bp) sema_v(&bp->b_io)
69
70 void
_init_ldl(void)71 _init_ldl(void)
72 {
73 mutex_init(&ut_mutex, NULL, MUTEX_DRIVER, NULL);
74 mutex_init(&oc_mutex, NULL, MUTEX_DRIVER, NULL);
75 mutex_init(&ml_lock, NULL, MUTEX_DRIVER, NULL);
76 }
77
78 void
_fini_ldl(void)79 _fini_ldl(void)
80 {
81 mutex_destroy(&ut_mutex);
82 mutex_destroy(&oc_mutex);
83 mutex_destroy(&ml_lock);
84 }
85
86 static void
ldl_errorstate(ml_unit_t * ul)87 ldl_errorstate(ml_unit_t *ul)
88 {
89 char *str;
90
91 if (ldl_iserror(ul))
92 str = "Error";
93 else if (ldl_isherror(ul))
94 str = "Hard Error";
95 else
96 str = "Okay";
97
98 cmn_err(CE_WARN, "md: logging device: %s changed state to %s",
99 md_devname(mddb_getsetnum(ul->un_recid), ul->un_dev, NULL, 0), str);
100 }
101
102
103 /*
104 * atomically commit the log unit struct and any underlying metadevice struct
105 */
106 static void
logcommitdb(ml_unit_t * ul)107 logcommitdb(ml_unit_t *ul)
108 {
109 mddb_recid_t recids[4];
110
111 TRANSSTATS(ts_logcommitdb);
112
113 uniqtime32(&ul->un_timestamp);
114
115 /*
116 * commit the log device and its child (if metadevice)
117 */
118 recids[0] = ul->un_recid;
119 if (ul->un_status & LDL_METADEVICE) {
120 struct mdc_unit *c = MD_UNIT(md_getminor(ul->un_dev));
121 recids[1] = c->un_record_id;
122 recids[2] = 0;
123 } else
124 recids[1] = 0;
125
126 mddb_commitrecs_wrapper(recids);
127 }
128
129 static void
md_alloc_wrbuf(cirbuf_ic_t * cb,size_t bufsize)130 md_alloc_wrbuf(cirbuf_ic_t *cb, size_t bufsize)
131 {
132 int i;
133 buf_t *bp;
134
135 /*
136 * Clear previous allocation
137 */
138 if (cb->cb_nb)
139 md_free_cirbuf(cb);
140
141 bzero((caddr_t)cb, sizeof (*cb));
142 rw_init(&cb->cb_rwlock.lock, NULL, RW_DRIVER, NULL);
143
144 rw_enter(&cb->cb_rwlock.lock, RW_WRITER);
145
146 /*
147 * preallocate 3 bp's and put them on the free list.
148 */
149 for (i = 0; i < 3; ++i) {
150 bp = md_trans_zalloc(sizeof (buf_t));
151 sema_init(&bp->b_sem, 1, NULL, SEMA_DEFAULT, NULL);
152 sema_init(&bp->b_io, 0, NULL, SEMA_DEFAULT, NULL);
153 bp->b_offset = -1;
154 bp->b_forw = cb->cb_free;
155 cb->cb_free = bp;
156
157 TRANSSTATS(ts_alloc_bp);
158 }
159
160 cb->cb_va = md_trans_alloc(bufsize);
161 cb->cb_nb = bufsize;
162
163 /*
164 * first bp claims entire write buffer
165 */
166 bp = cb->cb_free;
167 cb->cb_free = bp->b_forw;
168
169 bp->b_forw = bp;
170 bp->b_back = bp;
171 cb->cb_bp = bp;
172 bp->b_un.b_addr = cb->cb_va;
173 bp->b_bufsize = cb->cb_nb;
174
175 rw_exit(&cb->cb_rwlock.lock);
176 }
177
178 static void
md_alloc_rdbuf(cirbuf_ic_t * cb,size_t bufsize,size_t blksize)179 md_alloc_rdbuf(cirbuf_ic_t *cb, size_t bufsize, size_t blksize)
180 {
181 caddr_t va;
182 size_t nb;
183 buf_t *bp;
184
185 /*
186 * Clear previous allocation
187 */
188 if (cb->cb_nb)
189 md_free_cirbuf(cb);
190
191 bzero((caddr_t)cb, sizeof (*cb));
192 rw_init(&cb->cb_rwlock.lock, NULL, RW_DRIVER, NULL);
193
194 rw_enter(&cb->cb_rwlock.lock, RW_WRITER);
195
196 cb->cb_va = md_trans_alloc(bufsize);
197 cb->cb_nb = bufsize;
198
199 /*
200 * preallocate N bufs that are hard-sized to blksize
201 * in other words, the read buffer pool is a linked list
202 * of statically sized bufs.
203 */
204 va = cb->cb_va;
205 while ((nb = bufsize) != 0) {
206 if (nb > blksize)
207 nb = blksize;
208 bp = md_trans_alloc(sizeof (buf_t));
209 bzero((caddr_t)bp, sizeof (buf_t));
210 sema_init(&bp->b_sem, 1, NULL, SEMA_DEFAULT, NULL);
211 sema_init(&bp->b_io, 0, NULL, SEMA_DEFAULT, NULL);
212 bp->b_un.b_addr = va;
213 bp->b_bufsize = nb;
214 bp->b_offset = -1;
215 if (cb->cb_bp) {
216 bp->b_forw = cb->cb_bp->b_forw;
217 bp->b_back = cb->cb_bp;
218 cb->cb_bp->b_forw->b_back = bp;
219 cb->cb_bp->b_forw = bp;
220 } else
221 bp->b_forw = bp->b_back = bp;
222 cb->cb_bp = bp;
223
224 TRANSSTATS(ts_alloc_bp);
225
226 bufsize -= nb;
227 va += nb;
228 }
229
230 rw_exit(&cb->cb_rwlock.lock);
231 }
232
233
234 static void
md_free_cirbuf(cirbuf_ic_t * cb)235 md_free_cirbuf(cirbuf_ic_t *cb)
236 {
237 buf_t *bp;
238
239 if (cb->cb_nb == 0)
240 return;
241
242 rw_enter(&cb->cb_rwlock.lock, RW_WRITER);
243 ASSERT(cb->cb_dirty == NULL);
244
245 /*
246 * free the active bufs
247 */
248 while ((bp = cb->cb_bp) != NULL) {
249 if (bp == bp->b_forw)
250 cb->cb_bp = NULL;
251 else
252 cb->cb_bp = bp->b_forw;
253 bp->b_back->b_forw = bp->b_forw;
254 bp->b_forw->b_back = bp->b_back;
255 sema_destroy(&bp->b_sem);
256 sema_destroy(&bp->b_io);
257 md_trans_free(bp, sizeof (buf_t));
258 }
259
260 /*
261 * free the free bufs
262 */
263 while ((bp = cb->cb_free) != NULL) {
264 cb->cb_free = bp->b_forw;
265 sema_destroy(&bp->b_sem);
266 sema_destroy(&bp->b_io);
267 md_trans_free(bp, sizeof (buf_t));
268 }
269 md_trans_free(cb->cb_va, cb->cb_nb);
270 cb->cb_va = NULL;
271 cb->cb_nb = 0;
272 rw_exit(&cb->cb_rwlock.lock);
273 rw_destroy(&cb->cb_rwlock.lock);
274 }
275
276 int
ldl_build_incore(ml_unit_t * ul,int snarfing)277 ldl_build_incore(ml_unit_t *ul, int snarfing)
278 {
279 size_t bufsize;
280 set_t setno;
281
282 setno = mddb_getsetnum(ul->un_recid);
283
284 ASSERT(ul->un_head_lof >= ul->un_bol_lof);
285 ASSERT(ul->un_bol_lof);
286
287 if (ul->un_status & LDL_BEING_RESET) {
288 mddb_setrecprivate(ul->un_recid, MD_PRV_PENDCLEAN);
289 return (1);
290 }
291
292 /*
293 * If snarfing the log device,
294 * then remake the device number
295 * else (we are creating the log device)
296 * set the driver name in the shared name space.
297 */
298 if (snarfing) {
299 ul->un_dev = md_getdevnum(setno, mddb_getsidenum(setno),
300 ul->un_key, MD_NOTRUST_DEVT);
301 }
302
303 /*
304 * With the current device id implementation there is possibility
305 * that we may have NODEV if the underlying can't be resolved at
306 * snarf time. If this is the case we want to be consistent with
307 * the normal behavior and continue to allow log to be put on the list.
308 * We delay the resolve of the dev_t so we can resolve at the open
309 * time of the log device by device id
310 */
311 if ((md_getmajor(ul->un_dev) == md_major) &&
312 (md_dev_exists(ul->un_dev) == 0)) {
313 return (1);
314 }
315
316 mutex_enter(&ml_lock);
317
318 /*
319 * initialize incore structs
320 * LDL_FIND_TAIL flag indicates that all I/O must wait until the
321 * tail has been found.
322 */
323 ul->un_opencnt = 0;
324 ul->un_transcnt = 0;
325 ul->un_resv = 0;
326 ul->un_utlist = NULL;
327 ul->un_logmap = NULL;
328 ul->un_status |= LDL_FIND_TAIL;
329 ul->un_status &= ~LDL_SCAN_ACTIVE;
330 ASSERT(ul->un_devbsize == DEV_BSIZE);
331
332 mutex_init(&ul->un_log_mutex, NULL, MUTEX_DRIVER, NULL);
333
334 /*
335 * allocate some read and write buffers
336 */
337 bufsize = md_ldl_bufsize(ul);
338 ul->un_rdbuf.cb_nb = 0;
339 md_alloc_rdbuf(&ul->un_rdbuf, bufsize, MAPBLOCKSIZE);
340 ul->un_wrbuf.cb_nb = 0;
341 md_alloc_wrbuf(&ul->un_wrbuf, bufsize);
342
343 if (snarfing) {
344 if (ul->un_error & LDL_ANYERROR) {
345 ul->un_error = LDL_HERROR;
346 ldl_errorstate(ul);
347 } else
348 ul->un_error = 0;
349 }
350
351 /* Put on the unit list */
352 ul->un_next = ul_list;
353 ul_list = ul;
354 md_nlogs++;
355
356 mutex_exit(&ml_lock);
357 return (0);
358 }
359
360 ml_unit_t *
ldl_findlog(mddb_recid_t recid)361 ldl_findlog(mddb_recid_t recid)
362 {
363 ml_unit_t *ul;
364
365 /*
366 * Find a unit struct by database recid
367 */
368 mutex_enter(&ml_lock);
369 for (ul = ul_list; ul; ul = ul->un_next)
370 if (ul->un_recid == recid)
371 break;
372 mutex_exit(&ml_lock);
373 return (ul);
374 }
375
376 /*
377 * ldl_utadd adds a metatrans device to the log's list of mt devices.
378 * WARNING: top_end_sync() scans this list W/O locking for performance!!!
379 */
380 void
ldl_utadd(mt_unit_t * un)381 ldl_utadd(mt_unit_t *un)
382 {
383 ml_unit_t *ul = un->un_l_unit;
384
385 if (ul == NULL)
386 return;
387
388 mutex_enter(&ut_mutex);
389 un->un_next = ul->un_utlist;
390 ul->un_utlist = un;
391 ASSERT((ul->un_logmap == NULL) || (ul->un_logmap == un->un_logmap));
392 ul->un_logmap = un->un_logmap;
393 mutex_exit(&ut_mutex);
394 }
395
396 /*
397 * ldl_utdel removes a metatrans device to the log's list of mt devices.
398 * WARNING: top_end_sync() scans this list W/O locking for performance!!!
399 */
400 static void
ldl_utdel(mt_unit_t * un)401 ldl_utdel(mt_unit_t *un)
402 {
403 ml_unit_t *ul = un->un_l_unit;
404 mt_unit_t **utp = &ul->un_utlist;
405
406 mutex_enter(&ut_mutex);
407 for (utp = &ul->un_utlist;
408 *utp && (*utp != un);
409 utp = &(*utp)->un_next);
410 if (*utp)
411 *utp = un->un_next;
412 un->un_l_unit = NULL;
413 mutex_exit(&ut_mutex);
414 }
415
416 mddb_recid_t
ldl_create(mdkey_t key,mt_unit_t * un)417 ldl_create(mdkey_t key, mt_unit_t *un)
418 {
419 ml_unit_t *ul;
420 mddb_recid_t recid;
421 struct timeval32 tv;
422 mddb_type_t typ1;
423 set_t setno;
424
425 setno = MD_UN2SET(un);
426
427 /*
428 * Find a unit struct for this key and set
429 * If we found one then, we are done.
430 * Else create one.
431 */
432 mutex_enter(&ml_lock);
433 for (ul = ul_list; ul; ul = ul->un_next)
434 if ((ul->un_key == key) &&
435 (mddb_getsetnum(ul->un_recid) == setno))
436 break;
437 mutex_exit(&ml_lock);
438
439 if (ul)
440 return (ul->un_recid);
441
442 typ1 = (mddb_type_t)md_getshared_key(setno,
443 trans_md_ops.md_driver.md_drivername);
444 recid = mddb_createrec(ML_UNIT_ONDSZ, typ1, LOG_REC,
445 MD_CRO_32BIT | MD_CRO_TRANS_LOG, setno);
446 if (recid < 0)
447 return (recid);
448 mddb_setrecprivate(recid, MD_PRV_GOTIT);
449
450 ul = (ml_unit_t *)mddb_getrecaddr_resize(recid, sizeof (*ul), 0);
451
452 ul->un_recid = recid;
453 ul->un_key = key;
454 ul->un_dev = md_getdevnum(setno, mddb_getsidenum(setno), key,
455 MD_NOTRUST_DEVT);
456 ul->un_bol_lof = (off32_t)dbtob(un->un_l_sblk);
457 ul->un_eol_lof = ul->un_bol_lof + (off32_t)dbtob(un->un_l_nblks);
458 ul->un_pwsblk = un->un_l_pwsblk;
459 ul->un_nblks = un->un_l_nblks;
460 ul->un_tblks = un->un_l_tblks;
461 ul->un_maxresv = un->un_l_maxresv;
462 ul->un_maxtransfer = (uint_t)dbtob(un->un_l_maxtransfer);
463 ul->un_devbsize = DEV_BSIZE;
464
465 /*
466 * empty log
467 */
468 uniqtime32(&tv);
469 ul->un_head_lof = ul->un_bol_lof;
470 ul->un_tail_lof = ul->un_bol_lof;
471 ul->un_head_ident = tv.tv_sec;
472 ul->un_tail_ident = tv.tv_sec;
473
474 if (md_getmajor(ul->un_dev) == md_major)
475 ul->un_status |= LDL_METADEVICE;
476
477 md_set_parent(ul->un_dev, (int)MD_MULTI_PARENT);
478 (void) ldl_build_incore(ul, 0);
479 logcommitdb(ul);
480 return (recid);
481 }
482
483 int
ldl_open_dev(mt_unit_t * un,ml_unit_t * ul)484 ldl_open_dev(mt_unit_t *un, ml_unit_t *ul)
485 {
486 int err = 0;
487 md_dev64_t tmpdev;
488 minor_t mnum = MD_SID(un);
489 set_t setno = MD_MIN2SET(MD_SID(un));
490 side_t side = mddb_getsidenum(setno);
491
492 mutex_enter(&oc_mutex);
493
494 if (ul->un_opencnt) {
495 ul->un_opencnt++;
496 mutex_exit(&oc_mutex);
497 return (0);
498 }
499
500 tmpdev = ul->un_dev;
501 /*
502 * Do the open by device id if it is regular device
503 */
504 if ((md_getmajor(tmpdev) != md_major) &&
505 md_devid_found(setno, side, ul->un_key) == 1) {
506 tmpdev = md_resolve_bydevid(mnum, tmpdev, ul->un_key);
507 }
508 err = md_layered_open(mnum, &tmpdev, MD_OFLG_NULL);
509 ul->un_dev = tmpdev;
510
511 if (err == 0)
512 ul->un_opencnt++;
513
514 mutex_exit(&oc_mutex);
515 return (err);
516 }
517
518 void
ldl_close_dev(ml_unit_t * ul)519 ldl_close_dev(ml_unit_t *ul)
520 {
521
522 mutex_enter(&oc_mutex);
523
524 ul->un_opencnt--;
525
526 if (ul->un_opencnt) {
527 mutex_exit(&oc_mutex);
528 return;
529 }
530
531 /* Last reference to the log, close it */
532 md_layered_close(ul->un_dev, MD_OFLG_NULL);
533
534 mutex_exit(&oc_mutex);
535 }
536
537
538 /*
539 * LOGSCAN STUFF
540 */
541 int
ldl_isherror(ml_unit_t * ul)542 ldl_isherror(ml_unit_t *ul)
543 {
544 return ((ul != NULL) && (ul->un_error & LDL_HERROR));
545 }
546
547 int
ldl_iserror(ml_unit_t * ul)548 ldl_iserror(ml_unit_t *ul)
549 {
550 return ((ul != NULL) && (ul->un_error & LDL_ERROR));
551 }
552
553 size_t
md_ldl_bufsize(ml_unit_t * ul)554 md_ldl_bufsize(ml_unit_t *ul)
555 {
556 size_t bufsize;
557
558 /*
559 * initial guess is the maxtransfer value for this log device
560 * reduce by number of logs
561 * increase for sharing
562 * increase if too small
563 * decrease if too large
564 */
565 bufsize = ul->un_maxtransfer;
566 if (md_nlogs)
567 bufsize /= md_nlogs;
568 if (ul->un_transcnt)
569 bufsize *= ul->un_transcnt;
570 bufsize = dbtob(btod(bufsize));
571 if (bufsize < LDL_MINBUFSIZE)
572 bufsize = LDL_MINBUFSIZE;
573 if (bufsize > maxphys)
574 bufsize = maxphys;
575 if (bufsize > ul->un_maxtransfer)
576 bufsize = ul->un_maxtransfer;
577 return (bufsize);
578 }
579
580 /*
581 * if necessary; open all underlying devices for ul and start threads
582 * called at snarf, metainit, and open
583 */
584 void
ldl_open_underlying(mt_unit_t * un)585 ldl_open_underlying(mt_unit_t *un)
586 {
587 ml_unit_t *ul = un->un_l_unit;
588 int err = 0;
589
590
591 /*
592 * first, handle the case of detached logs
593 */
594 if (ul == NULL) {
595 err = trans_open_all_devs(un);
596 if (err == 0) {
597 un->un_flags &= ~TRANS_NEED_OPEN;
598 un->un_flags |= TRANS_OPENED;
599 }
600 }
601 }
602
603 /*
604 * remove log unit struct from global linked list
605 */
606 static void
ldl_unlist(ml_unit_t * ul)607 ldl_unlist(ml_unit_t *ul)
608 {
609 ml_unit_t **ulp;
610
611 /*
612 * remove from list
613 */
614 mutex_enter(&ml_lock);
615 for (ulp = &ul_list; *ulp && (*ulp != ul); ulp = &(*ulp)->un_next);
616 if (*ulp) {
617 *ulp = ul->un_next;
618 --md_nlogs;
619 }
620 mutex_exit(&ml_lock);
621 }
622
623 /*
624 * get rid of a log unit from the database
625 */
626 void
ldl_cleanup(ml_unit_t * ul)627 ldl_cleanup(ml_unit_t *ul)
628 {
629 sv_dev_t sv;
630
631 /* Save the log key */
632 sv.setno = mddb_getsetnum(ul->un_recid);
633 sv.key = ul->un_key;
634
635 mddb_deleterec_wrapper(ul->un_recid);
636 md_rem_names(&sv, 1);
637 }
638
639 static void
ldl_delete(ml_unit_t * ul,int removing)640 ldl_delete(ml_unit_t *ul, int removing)
641 {
642
643 /*
644 * remove from list
645 */
646 ldl_unlist(ul);
647
648 /*
649 * free up resources
650 */
651 md_free_cirbuf(&ul->un_rdbuf);
652 md_free_cirbuf(&ul->un_wrbuf);
653
654 mutex_destroy(&ul->un_log_mutex);
655
656 if (removing) {
657 md_reset_parent(ul->un_dev);
658 ul->un_status |= LDL_BEING_RESET;
659 logcommitdb(ul);
660 ldl_cleanup(ul);
661 }
662 }
663
664 /*
665 * detach log from trans device
666 * caller insures that trans device is idle and will remain idle
667 */
668 /* ARGSUSED */
669 int
ldl_reset(mt_unit_t * un,int removing,int force)670 ldl_reset(mt_unit_t *un, int removing, int force)
671 {
672 ml_unit_t *ul = un->un_l_unit;
673
674 if (ul == NULL)
675 return (0);
676
677 if (un->un_flags & TRANS_DETACHING) {
678 un->un_flags &= ~TRANS_DETACHING;
679 un->un_flags |= TRANS_DETACHED;
680 trans_commit(un, 0);
681 }
682
683 /*
684 * remove this metatrans device from the log's list of mt devices
685 */
686 ldl_utdel(un);
687
688 /*
689 * busy; do nothing
690 */
691 if (ul->un_utlist)
692 return (0);
693
694 ldl_delete(ul, removing);
695
696 return (0);
697 }
698