/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/systm.h>
#include <sys/types.h>
#include <sys/vnode.h>
#include <sys/errno.h>
#include <sys/sysmacros.h>
#include <sys/debug.h>
#include <sys/kmem.h>
#include <sys/conf.h>
#include <sys/proc.h>
#include <sys/cmn_err.h>
#include <sys/fssnap_if.h>
#include <sys/fs/ufs_inode.h>
#include <sys/fs/ufs_filio.h>
#include <sys/fs/ufs_log.h>
#include <sys/fs/ufs_bio.h>
#include <sys/inttypes.h>
#include <sys/callb.h>
#include <sys/tnf_probe.h>

/*
 * Kernel threads for logging
 * Currently only one for rolling the log (one per log).
 */

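/*
 * Bounds for the number of MAPBLOCKSIZE roll buffers; log_roll_buffers()
 * clamps the lufs_num_roll_bufs tunable to this [min, max] range.
 */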
#define	LUFS_DEFAULT_NUM_ROLL_BUFS	16
#define	LUFS_DEFAULT_MIN_ROLL_BUFS	4
#define	LUFS_DEFAULT_MAX_ROLL_BUFS	64

/*
 * Macros
 */
#define	logmap_need_roll(logmap) ((logmap)->mtm_nme > logmap_maxnme)
#define	ldl_empty(ul) ((ul)->un_head_lof == (ul)->un_tail_lof)
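
/*
 * logmap_need_roll() is true once the logmap holds more than logmap_maxnme
 * mapentries; ldl_empty() is true when the log's head offset has caught up
 * with its tail, i.e. there is nothing left to roll.
 */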

/*
 * Tunables
 */
uint32_t lufs_num_roll_bufs = LUFS_DEFAULT_NUM_ROLL_BUFS;
uint32_t lufs_min_roll_bufs = LUFS_DEFAULT_MIN_ROLL_BUFS;
uint32_t lufs_max_roll_bufs = LUFS_DEFAULT_MAX_ROLL_BUFS;
long logmap_maxnme = 1536;		/* roll when this many mapentries */
int trans_roll_tics = 0;		/* roll interval; defaults to 5*hz */
uint64_t trans_roll_new_delta = 0;	/* counts deltas hitting set-up bufs */
uint64_t lrr_wait = 0;			/* counts waits on in-use deltas */
/*
 * Key for thread specific data for the roll thread to
 * bypass snapshot throttling
 */
uint_t bypass_snapshot_throttle_key;

/*
 * externs
 */
extern kmutex_t ml_scan;
extern kcondvar_t ml_scan_cv;
extern int maxphys;

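/*
 * Wait for more work: clear the logmap reference bit, wake any threads
 * waiting on a forced roll, drop the rolling state, and sleep (CPR-safe)
 * until prodded via mtm_to_roll_cv or until trans_roll_tics ticks pass.
 */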
static void
trans_roll_wait(mt_map_t *logmap, callb_cpr_t *cprinfop)
{
	mutex_enter(&logmap->mtm_mutex);
	logmap->mtm_ref = 0;
	if (logmap->mtm_flags & MTM_FORCE_ROLL) {
		cv_broadcast(&logmap->mtm_from_roll_cv);
	}
	logmap->mtm_flags &= ~(MTM_FORCE_ROLL | MTM_ROLLING);
	CALLB_CPR_SAFE_BEGIN(cprinfop);
	(void) cv_reltimedwait(&logmap->mtm_to_roll_cv, &logmap->mtm_mutex,
	    trans_roll_tics, TR_CLOCK_TICK);
	CALLB_CPR_SAFE_END(cprinfop, &logmap->mtm_mutex);
	logmap->mtm_flags |= MTM_ROLLING;
	mutex_exit(&logmap->mtm_mutex);
}

/*
 * returns the number of 8K buffers to use for rolling the log
 */
static uint32_t
log_roll_buffers()
{
	/*
	 * sanity validate the tunable lufs_num_roll_bufs
	 */
	if (lufs_num_roll_bufs < lufs_min_roll_bufs) {
		return (lufs_min_roll_bufs);
	}
	if (lufs_num_roll_bufs > lufs_max_roll_bufs) {
		return (lufs_max_roll_bufs);
	}
	return (lufs_num_roll_bufs);
}

/*
 * Find something to roll, then if we don't have cached roll buffers
 * covering all the deltas in that MAPBLOCK then read the master
 * and overlay the deltas.
 * returns:
 *	0 if successful
 *	1 on finding nothing to roll
 *	2 on error
 */
int
log_roll_read(ml_unit_t *ul, rollbuf_t *rbs, int nmblk, caddr_t roll_bufs,
    int *retnbuf)
{
	offset_t mof;
	buf_t *bp;
	rollbuf_t *rbp;
	mt_map_t *logmap = ul->un_logmap;
	daddr_t mblkno;
	int i;
	int error;
	int nbuf;

	/*
	 * Make sure there is really something to roll
	 */
	mof = 0;
	if (!logmap_next_roll(logmap, &mof)) {
		return (1);
	}

	/*
	 * build some master blocks + deltas to roll forward
	 */
	rw_enter(&logmap->mtm_rwlock, RW_READER);
	nbuf = 0;
	do {
		mof = mof & (offset_t)MAPBLOCKMASK;
		mblkno = lbtodb(mof);

		/*
		 * Check for the case of a new delta to a set up buffer
		 */
		for (i = 0, rbp = rbs; i < nbuf; ++i, ++rbp) {
			if (P2ALIGN(rbp->rb_bh.b_blkno,
			    MAPBLOCKSIZE / DEV_BSIZE) == mblkno) {
				TNF_PROBE_0(trans_roll_new_delta, "lufs",
				    /* CSTYLED */);
				trans_roll_new_delta++;
				/* Flush out the current set of buffers */
				goto flush_bufs;
			}
		}

		/*
		 * Work out what to roll next. If it isn't cached then read
		 * it asynchronously from the master.
		 */
		bp = &rbp->rb_bh;
		bp->b_blkno = mblkno;
		bp->b_flags = B_READ;
		bp->b_un.b_addr = roll_bufs + (nbuf << MAPBLOCKSHIFT);
		bp->b_bufsize = MAPBLOCKSIZE;
		if (top_read_roll(rbp, ul)) {
			/* logmap deltas were in use */
			if (nbuf == 0) {
				/*
				 * On first buffer wait for the logmap user
				 * to finish by grabbing the logmap lock
				 * exclusively rather than spinning
				 */
				rw_exit(&logmap->mtm_rwlock);
				lrr_wait++;
				rw_enter(&logmap->mtm_rwlock, RW_WRITER);
				rw_exit(&logmap->mtm_rwlock);
				return (1);
			}
			/* we have at least one buffer - flush it */
			goto flush_bufs;
		}
		if ((bp->b_flags & B_INVAL) == 0) {
			nbuf++;
		}
		mof += MAPBLOCKSIZE;
	} while ((nbuf < nmblk) && logmap_next_roll(logmap, &mof));

	/*
	 * If there was nothing to roll, cycle back
	 */
	if (nbuf == 0) {
		rw_exit(&logmap->mtm_rwlock);
		return (1);
	}

flush_bufs:
	/*
	 * For each buffer, if it isn't cached then wait for the read to
	 * finish and overlay the deltas.
	 */
	for (error = 0, i = 0, rbp = rbs; i < nbuf; ++i, ++rbp) {
		if (!rbp->rb_crb) {
			bp = &rbp->rb_bh;
			if (trans_not_wait(bp)) {
				ldl_seterror(ul,
				    "Error reading master during ufs log roll");
				error = 1;
			}
			/*
			 * sync read the data from the log
			 */
			if (ldl_read(ul, bp->b_un.b_addr,
			    ldbtob(bp->b_blkno) & (offset_t)MAPBLOCKMASK,
			    MAPBLOCKSIZE, rbp->rb_age)) {
				error = 1;
			}
		}

		/*
		 * reset the age bit in the age list
		 */
		logmap_list_put_roll(logmap, rbp->rb_age);

		if (ul->un_flags & LDL_ERROR) {
			error = 1;
		}
	}
	rw_exit(&logmap->mtm_rwlock);
	if (error)
		return (2);
	*retnbuf = nbuf;
	return (0);
}

/*
 * Write out a cached roll buffer
 */
void
log_roll_write_crb(ufsvfs_t *ufsvfsp, rollbuf_t *rbp)
{
	crb_t *crb = rbp->rb_crb;
	buf_t *bp = &rbp->rb_bh;

	bp->b_blkno = lbtodb(crb->c_mof);
	bp->b_un.b_addr = crb->c_buf;
	bp->b_bcount = crb->c_nb;
	bp->b_bufsize = crb->c_nb;
	ASSERT((crb->c_nb & DEV_BMASK) == 0);
	bp->b_flags = B_WRITE;
	logstats.ls_rwrites.value.ui64++;

	/* if a snapshot is enabled, write through the snapshot driver */
	if (ufsvfsp->vfs_snapshot) {
		fssnap_strategy(&ufsvfsp->vfs_snapshot, bp);
	} else {
		(void) bdev_strategy(bp);
	}
}

/*
 * Write out a set of non-cached roll buffers
 */
void
log_roll_write_bufs(ufsvfs_t *ufsvfsp, rollbuf_t *rbp)
{
	buf_t *bp = &rbp->rb_bh;
	buf_t *bp2;
	rbsecmap_t secmap = rbp->rb_secmap;
	int j, k;

	ASSERT(secmap);
	ASSERT((bp->b_flags & B_INVAL) == 0);
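
	/*
	 * secmap has one bit per DEV_BSIZE sector of the MAPBLOCK (16
	 * 512-byte sectors for an 8K block); a set bit marks a sector
	 * holding deltas to write. For example, a map of 0x0387 issues
	 * two writes: sectors 0-2 first, then sectors 7-9 through a
	 * chained buf_t.
	 */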

	do { /* for each contiguous block of sectors */
		/* find start of next sector to write */
		for (j = 0; j < 16; ++j) {
			if (secmap & UINT16_C(1))
				break;
			secmap >>= 1;
		}
		bp->b_un.b_addr += (j << DEV_BSHIFT);
		bp->b_blkno += j;

		/* calculate number of sectors */
		secmap >>= 1;
		j++;
		for (k = 1; j < 16; ++j) {
			if ((secmap & UINT16_C(1)) == 0)
				break;
			secmap >>= 1;
			k++;
		}
		bp->b_bcount = k << DEV_BSHIFT;
		bp->b_flags = B_WRITE;
		logstats.ls_rwrites.value.ui64++;

		/* if a snapshot is enabled, write through the snapshot driver */
		if (ufsvfsp->vfs_snapshot)
			fssnap_strategy(&ufsvfsp->vfs_snapshot, bp);
		else
			(void) bdev_strategy(bp);
		if (secmap) {
			/*
			 * Allocate another buf_t to handle
			 * the next write in this MAPBLOCK.
			 * Chain them via b_list.
			 */
			bp2 = kmem_alloc(sizeof (buf_t), KM_SLEEP);
			bp->b_list = bp2;
			bioinit(bp2);
			bp2->b_iodone = trans_not_done;
			bp2->b_bufsize = MAPBLOCKSIZE;
			bp2->b_edev = bp->b_edev;
			bp2->b_un.b_addr =
			    bp->b_un.b_addr + bp->b_bcount;
			bp2->b_blkno = bp->b_blkno + k;
			bp = bp2;
		}
	} while (secmap);
}

/*
 * Asynchronously roll the deltas, using the sector map
 * in each rollbuf_t.
 */
int
log_roll_write(ml_unit_t *ul, rollbuf_t *rbs, int nbuf)
{
	ufsvfs_t *ufsvfsp = ul->un_ufsvfs;
	rollbuf_t *rbp;
	buf_t *bp, *bp2;
	rollbuf_t *head, *prev, *rbp2;

	/*
	 * Insertion-sort the buffers into ascending blkno order,
	 * chaining them through rb_next, so the writes are issued
	 * in device order.
	 */
	ASSERT(nbuf > 0);
#ifdef lint
	prev = rbs;
#endif
	for (head = rbs, rbp = rbs + 1; rbp < rbs + nbuf; rbp++) {
		for (rbp2 = head; rbp2; prev = rbp2, rbp2 = rbp2->rb_next) {
			if (rbp->rb_bh.b_blkno < rbp2->rb_bh.b_blkno) {
				if (rbp2 == head) {
					rbp->rb_next = head;
					head = rbp;
				} else {
					prev->rb_next = rbp;
					rbp->rb_next = rbp2;
				}
				break;
			}
		}
		if (rbp2 == NULL) {
			prev->rb_next = rbp;
			rbp->rb_next = NULL;
		}
	}
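
	/*
	 * head now points at the rollbuf with the lowest blkno; the
	 * rb_next links run in ascending block order from there.
	 */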

	/*
	 * issue the in-order writes
	 */
	for (rbp = head; rbp; rbp = rbp2) {
		if (rbp->rb_crb) {
			log_roll_write_crb(ufsvfsp, rbp);
		} else {
			log_roll_write_bufs(ufsvfsp, rbp);
		}
		/* null out the rb_next link for next set of rolling */
		rbp2 = rbp->rb_next;
		rbp->rb_next = NULL;
	}

	/*
	 * wait for all the writes to finish
	 */
	for (rbp = rbs; rbp < rbs + nbuf; rbp++) {
		bp = &rbp->rb_bh;
		if (trans_not_wait(bp)) {
			ldl_seterror(ul,
			    "Error writing master during ufs log roll");
		}

		/*
		 * Now wait for all the "cloned" buffer writes (if any)
		 * and free those headers
		 */
		bp2 = bp->b_list;
		bp->b_list = NULL;
		while (bp2) {
			if (trans_not_wait(bp2)) {
				ldl_seterror(ul,
				    "Error writing master during ufs log roll");
			}
			bp = bp2;
			bp2 = bp2->b_list;
			kmem_free(bp, sizeof (buf_t));
		}
	}

	if (ul->un_flags & LDL_ERROR)
		return (1);
	return (0);
}

void
trans_roll(ml_unit_t *ul)
{
	callb_cpr_t cprinfo;
	mt_map_t *logmap = ul->un_logmap;
	rollbuf_t *rbs;
	rollbuf_t *rbp;
	buf_t *bp;
	caddr_t roll_bufs;
	uint32_t nmblk;
	int i;
	int doingforceroll;
	int nbuf;

	CALLB_CPR_INIT(&cprinfo, &logmap->mtm_mutex, callb_generic_cpr,
	    "trans_roll");

	/*
	 * We do not want the roll thread's writes to be
	 * throttled by the snapshot.
	 * If they are throttled then we can have a deadlock
	 * between the roll thread and the snapshot taskq thread:
	 * the roll thread wants the throttling semaphore and
	 * the snapshot taskq thread cannot release the semaphore
	 * because it is writing to the log and the log is full.
	 */

	(void) tsd_set(bypass_snapshot_throttle_key, (void *)1);

	/*
	 * setup some roll parameters
	 */
	if (trans_roll_tics == 0)
		trans_roll_tics = 5 * hz;
	nmblk = log_roll_buffers();

	/*
	 * allocate the buffers and buffer headers
	 */
	roll_bufs = kmem_alloc(nmblk * MAPBLOCKSIZE, KM_SLEEP);
	rbs = kmem_alloc(nmblk * sizeof (rollbuf_t), KM_SLEEP);

	/*
	 * initialize the buffer headers
	 */
	for (i = 0, rbp = rbs; i < nmblk; ++i, ++rbp) {
		rbp->rb_next = NULL;
		bp = &rbp->rb_bh;
		bioinit(bp);
		bp->b_edev = ul->un_dev;
		bp->b_iodone = trans_not_done;	/* reaped via trans_not_wait() */
		bp->b_bufsize = MAPBLOCKSIZE;
	}

	doingforceroll = 0;

again:
	/*
	 * LOOP FOREVER
	 */
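
	/*
	 * Each pass either exits on demand, rolls a batch of deltas
	 * (a forced roll, a logmap over logmap_maxnme entries, an idle
	 * but non-empty log, or an on-disk log that is filling), or
	 * waits in trans_roll_wait() before trying again.
	 */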

	/*
	 * exit on demand
	 */
	mutex_enter(&logmap->mtm_mutex);
	if ((ul->un_flags & LDL_ERROR) || (logmap->mtm_flags & MTM_ROLL_EXIT)) {
		kmem_free(rbs, nmblk * sizeof (rollbuf_t));
		kmem_free(roll_bufs, nmblk * MAPBLOCKSIZE);
		logmap->mtm_flags &= ~(MTM_FORCE_ROLL | MTM_ROLL_RUNNING |
		    MTM_ROLL_EXIT | MTM_ROLLING);
		cv_broadcast(&logmap->mtm_from_roll_cv);
		CALLB_CPR_EXIT(&cprinfo);
		thread_exit();
		/* NOTREACHED */
	}

	/*
	 * MT_SCAN debug mode
	 * don't roll except in FORCEROLL situations
	 */
	if (logmap->mtm_debug & MT_SCAN)
		if ((logmap->mtm_flags & MTM_FORCE_ROLL) == 0) {
			mutex_exit(&logmap->mtm_mutex);
			trans_roll_wait(logmap, &cprinfo);
			goto again;
		}
	ASSERT(logmap->mtm_trimlof == 0);

	/*
	 * If we've finished a force roll cycle then wakeup any
	 * waiters.
	 */
	if (doingforceroll) {
		doingforceroll = 0;
		logmap->mtm_flags &= ~MTM_FORCE_ROLL;
		mutex_exit(&logmap->mtm_mutex);
		cv_broadcast(&logmap->mtm_from_roll_cv);
	} else {
		mutex_exit(&logmap->mtm_mutex);
	}

	/*
	 * If someone wants us to roll something, then do it
	 */
	if (logmap->mtm_flags & MTM_FORCE_ROLL) {
		doingforceroll = 1;
		goto rollsomething;
	}

	/*
	 * Log is busy, check if the logmap is getting full.
	 */
	if (logmap_need_roll(logmap)) {
		goto rollsomething;
	}

	/*
	 * Check if the log is idle and is not empty
	 */
	if (!logmap->mtm_ref && !ldl_empty(ul)) {
		goto rollsomething;
	}

	/*
	 * Log is busy, check if it's getting full
	 */
	if (ldl_need_roll(ul)) {
		goto rollsomething;
	}

	/*
	 * nothing to do; wait a bit and then start over
	 */
	trans_roll_wait(logmap, &cprinfo);
	goto again;

	/*
	 * ROLL SOMETHING
	 */

rollsomething:
	/*
	 * Use the cached roll buffers, or read the master
	 * and overlay the deltas
	 */
	switch (log_roll_read(ul, rbs, nmblk, roll_bufs, &nbuf)) {
	case 1: trans_roll_wait(logmap, &cprinfo);
		/* FALLTHROUGH */
	case 2: goto again;
		/* default case is success */
	}

	/*
	 * Asynchronously write out the deltas
	 */
	if (log_roll_write(ul, rbs, nbuf))
		goto again;

	/*
	 * free up the deltas in the logmap
	 */
	for (i = 0, rbp = rbs; i < nbuf; ++i, ++rbp) {
		bp = &rbp->rb_bh;
		logmap_remove_roll(logmap,
		    ldbtob(bp->b_blkno) & (offset_t)MAPBLOCKMASK, MAPBLOCKSIZE);
	}

	/*
	 * free up log space, if possible
	 */
	logmap_sethead(logmap, ul);

	/*
	 * LOOP
	 */
	goto again;
}