/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/systm.h>
#include <sys/types.h>
#include <sys/vnode.h>
#include <sys/errno.h>
#include <sys/sysmacros.h>
#include <sys/debug.h>
#include <sys/kmem.h>
#include <sys/conf.h>
#include <sys/proc.h>
#include <sys/cmn_err.h>
#include <sys/fssnap_if.h>
#include <sys/fs/ufs_inode.h>
#include <sys/fs/ufs_filio.h>
#include <sys/fs/ufs_log.h>
#include <sys/fs/ufs_bio.h>
#include <sys/inttypes.h>
#include <sys/callb.h>

/*
 * Kernel threads for logging
 * Currently only one for rolling the log (one per log).
 */

#define	LUFS_DEFAULT_NUM_ROLL_BUFS	16
#define	LUFS_DEFAULT_MIN_ROLL_BUFS	4
#define	LUFS_DEFAULT_MAX_ROLL_BUFS	64
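
/*
 * Each roll buffer covers one 8K (MAPBLOCKSIZE) master block; the
 * defaults above bound the lufs_num_roll_bufs tunable, which is
 * clamped to [lufs_min_roll_bufs, lufs_max_roll_bufs] by
 * log_roll_buffers() below.
 */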

/*
 * Macros
 */
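/*
 * logmap_need_roll() - the logmap holds too many cached mapentries;
 * the roll thread should run to free some up.
 * ldl_empty() - the on-disk log holds no unrolled deltas (head and
 * tail offsets are equal).
 */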
#define	logmap_need_roll(logmap) ((logmap)->mtm_nme > logmap_maxnme)
#define	ldl_empty(ul) ((ul)->un_head_lof == (ul)->un_tail_lof)

/*
 * Tunables
 */
uint32_t lufs_num_roll_bufs = LUFS_DEFAULT_NUM_ROLL_BUFS;
uint32_t lufs_min_roll_bufs = LUFS_DEFAULT_MIN_ROLL_BUFS;
uint32_t lufs_max_roll_bufs = LUFS_DEFAULT_MAX_ROLL_BUFS;
long logmap_maxnme = 1536;
int trans_roll_tics = 0;
uint64_t trans_roll_new_delta = 0;
uint64_t lrr_wait = 0;
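/*
 * A trans_roll_tics of 0 means "use the default"; trans_roll() sets
 * it to 5 * hz on startup.  trans_roll_new_delta and lrr_wait are
 * statistics counters bumped by the roll path.
 */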
/*
 * Key for thread specific data for the roll thread to
 * bypass snapshot throttling
 */
uint_t bypass_snapshot_throttle_key;

/*
 * externs
 */
extern kmutex_t ml_scan;
extern kcondvar_t ml_scan_cv;
extern int maxphys;

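/*
 * Pause between roll cycles: clear the logmap reference bit, wake any
 * thread waiting on a forced roll, and sleep (CPR-safely) for up to
 * trans_roll_tics before marking the map as rolling again.
 */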
static void
trans_roll_wait(mt_map_t *logmap, callb_cpr_t *cprinfop)
{
	mutex_enter(&logmap->mtm_mutex);
	logmap->mtm_ref = 0;
	if (logmap->mtm_flags & MTM_FORCE_ROLL) {
		cv_broadcast(&logmap->mtm_from_roll_cv);
	}
	logmap->mtm_flags &= ~(MTM_FORCE_ROLL | MTM_ROLLING);
	CALLB_CPR_SAFE_BEGIN(cprinfop);
	(void) cv_reltimedwait(&logmap->mtm_to_roll_cv, &logmap->mtm_mutex,
	    trans_roll_tics, TR_CLOCK_TICK);
	CALLB_CPR_SAFE_END(cprinfop, &logmap->mtm_mutex);
	logmap->mtm_flags |= MTM_ROLLING;
	mutex_exit(&logmap->mtm_mutex);
}

/*
 * returns the number of 8K buffers to use for rolling the log
 */
static uint32_t
log_roll_buffers()
{
	/*
	 * sanity validate the tunable lufs_num_roll_bufs
	 */
	if (lufs_num_roll_bufs < lufs_min_roll_bufs) {
		return (lufs_min_roll_bufs);
	}
	if (lufs_num_roll_bufs > lufs_max_roll_bufs) {
		return (lufs_max_roll_bufs);
	}
	return (lufs_num_roll_bufs);
}

/*
 * Find something to roll, then if we don't have cached roll buffers
 * covering all the deltas in that MAPBLOCK then read the master
 * and overlay the deltas.
 * returns:
 *	0 if successful
 *	1 on finding nothing to roll
 *	2 on error
 */
int
log_roll_read(ml_unit_t *ul, rollbuf_t *rbs, int nmblk, caddr_t roll_bufs,
    int *retnbuf)
{
	offset_t	mof;
	buf_t		*bp;
	rollbuf_t	*rbp;
	mt_map_t	*logmap = ul->un_logmap;
	daddr_t		mblkno;
	int		i;
	int		error;
	int		nbuf;

	/*
	 * Make sure there is really something to roll
	 */
	mof = 0;
	if (!logmap_next_roll(logmap, &mof)) {
		return (1);
	}

	/*
	 * build some master blocks + deltas to roll forward
	 */
	rw_enter(&logmap->mtm_rwlock, RW_READER);
	nbuf = 0;
	do {
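		/*
		 * Round the master offset down to its MAPBLOCK boundary
		 * and convert it to a disk block number.
		 */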
		mof = mof & (offset_t)MAPBLOCKMASK;
		mblkno = lbtodb(mof);

		/*
		 * Check for the case of a new delta to a buffer
		 * that is already set up
		 */
		for (i = 0, rbp = rbs; i < nbuf; ++i, ++rbp) {
			if (P2ALIGN(rbp->rb_bh.b_blkno,
			    MAPBLOCKSIZE / DEV_BSIZE) == mblkno) {
				trans_roll_new_delta++;
				/* Flush out the current set of buffers */
				goto flush_bufs;
			}
		}

		/*
		 * Work out what to roll next. If it isn't cached then read
		 * it asynchronously from the master.
		 */
		bp = &rbp->rb_bh;
		bp->b_blkno = mblkno;
		bp->b_flags = B_READ;
		bp->b_un.b_addr = roll_bufs + (nbuf << MAPBLOCKSHIFT);
		bp->b_bufsize = MAPBLOCKSIZE;
		if (top_read_roll(rbp, ul)) {
			/* logmap deltas were in use */
			if (nbuf == 0) {
				/*
				 * On the first buffer wait for the logmap
				 * user to finish by grabbing the logmap lock
				 * exclusively rather than spinning
				 */
				rw_exit(&logmap->mtm_rwlock);
				lrr_wait++;
				rw_enter(&logmap->mtm_rwlock, RW_WRITER);
				rw_exit(&logmap->mtm_rwlock);
				return (1);
			}
			/* we have at least one buffer - flush it */
			goto flush_bufs;
		}
		if ((bp->b_flags & B_INVAL) == 0) {
			nbuf++;
		}
		mof += MAPBLOCKSIZE;
	} while ((nbuf < nmblk) && logmap_next_roll(logmap, &mof));

	/*
	 * If there was nothing to roll, cycle back
	 */
	if (nbuf == 0) {
		rw_exit(&logmap->mtm_rwlock);
		return (1);
	}

flush_bufs:
	/*
	 * For each buffer, if it isn't cached then wait for the read to
	 * finish and overlay the deltas.
	 */
	for (error = 0, i = 0, rbp = rbs; i < nbuf; ++i, ++rbp) {
		if (!rbp->rb_crb) {
			bp = &rbp->rb_bh;
			if (trans_not_wait(bp)) {
				ldl_seterror(ul,
				    "Error reading master during ufs log roll");
				error = 1;
			}
			/*
			 * sync read the data from the log
			 */
			if (ldl_read(ul, bp->b_un.b_addr,
			    ldbtob(bp->b_blkno) & (offset_t)MAPBLOCKMASK,
			    MAPBLOCKSIZE, rbp->rb_age)) {
				error = 1;
			}
		}

		/*
		 * reset the age bit in the age list
		 */
		logmap_list_put_roll(logmap, rbp->rb_age);

		if (ul->un_flags & LDL_ERROR) {
			error = 1;
		}
	}
	rw_exit(&logmap->mtm_rwlock);
	if (error)
		return (2);
	*retnbuf = nbuf;
	return (0);
}

/*
 * Write out a cached roll buffer
 */
void
log_roll_write_crb(ufsvfs_t *ufsvfsp, rollbuf_t *rbp)
{
	crb_t *crb = rbp->rb_crb;
	buf_t *bp = &rbp->rb_bh;

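	/*
	 * The cached roll buffer already holds the rolled contents
	 * (see "Use the cached roll buffers" in trans_roll()), so it
	 * can be written to the master directly; no read/overlay pass
	 * is needed.
	 */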
	bp->b_blkno = lbtodb(crb->c_mof);
	bp->b_un.b_addr = crb->c_buf;
	bp->b_bcount = crb->c_nb;
	bp->b_bufsize = crb->c_nb;
	ASSERT((crb->c_nb & DEV_BMASK) == 0);
	bp->b_flags = B_WRITE;
	logstats.ls_rwrites.value.ui64++;

	/* if snapshots are enabled, send the write through fssnap */
	if (ufsvfsp->vfs_snapshot) {
		fssnap_strategy(&ufsvfsp->vfs_snapshot, bp);
	} else {
		(void) bdev_strategy(bp);
	}
}

/*
 * Write out a set of non-cached roll buffers
 */
void
log_roll_write_bufs(ufsvfs_t *ufsvfsp, rollbuf_t *rbp)
{
	buf_t		*bp = &rbp->rb_bh;
	buf_t		*bp2;
	rbsecmap_t	secmap = rbp->rb_secmap;
	int		j, k;

	ASSERT(secmap);
	ASSERT((bp->b_flags & B_INVAL) == 0);

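	/*
	 * secmap has one bit per DEV_BSIZE sector of the MAPBLOCK;
	 * each pass of the loop below finds the next run of set bits
	 * (contiguous dirty sectors) and issues a single write for the
	 * run.  E.g. a secmap of 0x00f0 writes sectors 4 through 7 of
	 * the block as one transfer.
	 */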
	do { /* for each contiguous block of sectors */
		/* find start of next sector to write */
		for (j = 0; j < 16; ++j) {
			if (secmap & UINT16_C(1))
				break;
			secmap >>= 1;
		}
		bp->b_un.b_addr += (j << DEV_BSHIFT);
		bp->b_blkno += j;

		/* calculate number of sectors */
		secmap >>= 1;
		j++;
		for (k = 1; j < 16; ++j) {
			if ((secmap & UINT16_C(1)) == 0)
				break;
			secmap >>= 1;
			k++;
		}
		bp->b_bcount = k << DEV_BSHIFT;
		bp->b_flags = B_WRITE;
		logstats.ls_rwrites.value.ui64++;

		/* if snapshots are enabled, send the write through fssnap */
		if (ufsvfsp->vfs_snapshot)
			fssnap_strategy(&ufsvfsp->vfs_snapshot, bp);
		else
			(void) bdev_strategy(bp);
		if (secmap) {
			/*
			 * Allocate another buf_t to handle
			 * the next write in this MAPBLOCK.
			 * Chain them via b_list.
			 */
			bp2 = kmem_alloc(sizeof (buf_t), KM_SLEEP);
			bp->b_list = bp2;
			bioinit(bp2);
			bp2->b_iodone = trans_not_done;
			bp2->b_bufsize = MAPBLOCKSIZE;
			bp2->b_edev = bp->b_edev;
			bp2->b_un.b_addr =
			    bp->b_un.b_addr + bp->b_bcount;
			bp2->b_blkno = bp->b_blkno + k;
			bp = bp2;
		}
	} while (secmap);
}

/*
 * Asynchronously roll the deltas, using the sector map
 * in each rollbuf_t.
 */
int
log_roll_write(ml_unit_t *ul, rollbuf_t *rbs, int nbuf)
{

	ufsvfs_t	*ufsvfsp = ul->un_ufsvfs;
	rollbuf_t	*rbp;
	buf_t		*bp, *bp2;
	rollbuf_t	*head, *prev, *rbp2;

	/*
	 * Order the buffers by blkno
	 */
	ASSERT(nbuf > 0);
#ifdef lint
	prev = rbs;
#endif
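	/*
	 * Insertion sort into a singly linked list threaded through
	 * rb_next; nbuf is small, so the O(n^2) ordering is cheap.
	 */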
	for (head = rbs, rbp = rbs + 1; rbp < rbs + nbuf; rbp++) {
		for (rbp2 = head; rbp2; prev = rbp2, rbp2 = rbp2->rb_next) {
			if (rbp->rb_bh.b_blkno < rbp2->rb_bh.b_blkno) {
				if (rbp2 == head) {
					rbp->rb_next = head;
					head = rbp;
				} else {
					prev->rb_next = rbp;
					rbp->rb_next = rbp2;
				}
				break;
			}
		}
		if (rbp2 == NULL) {
			prev->rb_next = rbp;
			rbp->rb_next = NULL;
		}
	}

	/*
	 * issue the in-order writes
	 */
	for (rbp = head; rbp; rbp = rbp2) {
		if (rbp->rb_crb) {
			log_roll_write_crb(ufsvfsp, rbp);
		} else {
			log_roll_write_bufs(ufsvfsp, rbp);
		}
		/* null out the rb_next link for the next set of rolling */
		rbp2 = rbp->rb_next;
		rbp->rb_next = NULL;
	}

	/*
	 * wait for all the writes to finish
	 */
	for (rbp = rbs; rbp < rbs + nbuf; rbp++) {
		bp = &rbp->rb_bh;
		if (trans_not_wait(bp)) {
			ldl_seterror(ul,
			    "Error writing master during ufs log roll");
		}

		/*
		 * Now wait for all the "cloned" buffer writes (if any)
		 * and free those headers
		 */
		bp2 = bp->b_list;
		bp->b_list = NULL;
		while (bp2) {
			if (trans_not_wait(bp2)) {
				ldl_seterror(ul,
				    "Error writing master during ufs log roll");
			}
			bp = bp2;
			bp2 = bp2->b_list;
			kmem_free(bp, sizeof (buf_t));
		}
	}

	if (ul->un_flags & LDL_ERROR)
		return (1);
	return (0);
}

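/*
 * Body of the roll thread (one per log).  Loop: pick deltas to roll,
 * read and overlay the master blocks (or use cached roll buffers),
 * write them back in block order, remove the rolled deltas from the
 * logmap, and advance the log head.  Exits on LDL_ERROR or when
 * MTM_ROLL_EXIT is set.
 */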
void
trans_roll(ml_unit_t *ul)
{
	callb_cpr_t	cprinfo;
	mt_map_t	*logmap = ul->un_logmap;
	rollbuf_t	*rbs;
	rollbuf_t	*rbp;
	buf_t		*bp;
	caddr_t		roll_bufs;
	uint32_t	nmblk;
	int		i;
	int		doingforceroll;
	int		nbuf;

	CALLB_CPR_INIT(&cprinfo, &logmap->mtm_mutex, callb_generic_cpr,
	    "trans_roll");

	/*
	 * We do not want the roll thread's writes to be
	 * throttled by the snapshot.
	 * If they are throttled then we can have a deadlock
	 * between the roll thread and the snapshot taskq thread:
	 * the roll thread wants the throttling semaphore and
	 * the snapshot taskq thread cannot release the semaphore
	 * because it is writing to the log and the log is full.
	 */

	(void) tsd_set(bypass_snapshot_throttle_key, (void *)1);

	/*
	 * setup some roll parameters
	 */
	if (trans_roll_tics == 0)
		trans_roll_tics = 5 * hz;
	nmblk = log_roll_buffers();

	/*
	 * allocate the buffers and buffer headers
	 */
	roll_bufs = kmem_alloc(nmblk * MAPBLOCKSIZE, KM_SLEEP);
	rbs = kmem_alloc(nmblk * sizeof (rollbuf_t), KM_SLEEP);

	/*
	 * initialize the buffer headers
	 */
	for (i = 0, rbp = rbs; i < nmblk; ++i, ++rbp) {
		rbp->rb_next = NULL;
		bp = &rbp->rb_bh;
		bioinit(bp);
		bp->b_edev = ul->un_dev;
		bp->b_iodone = trans_not_done;
		bp->b_bufsize = MAPBLOCKSIZE;
	}

	doingforceroll = 0;

again:
	/*
	 * LOOP FOREVER
	 */

	/*
	 * exit on demand
	 */
	mutex_enter(&logmap->mtm_mutex);
	if ((ul->un_flags & LDL_ERROR) || (logmap->mtm_flags & MTM_ROLL_EXIT)) {
		kmem_free(rbs, nmblk * sizeof (rollbuf_t));
		kmem_free(roll_bufs, nmblk * MAPBLOCKSIZE);
		logmap->mtm_flags &= ~(MTM_FORCE_ROLL | MTM_ROLL_RUNNING |
		    MTM_ROLL_EXIT | MTM_ROLLING);
		cv_broadcast(&logmap->mtm_from_roll_cv);
		CALLB_CPR_EXIT(&cprinfo);
		thread_exit();
		/* NOTREACHED */
	}

	/*
	 * MT_SCAN debug mode
	 * don't roll except in FORCEROLL situations
	 */
	if (logmap->mtm_debug & MT_SCAN)
		if ((logmap->mtm_flags & MTM_FORCE_ROLL) == 0) {
			mutex_exit(&logmap->mtm_mutex);
			trans_roll_wait(logmap, &cprinfo);
			goto again;
		}
	ASSERT(logmap->mtm_trimlof == 0);

	/*
	 * If we've finished a force roll cycle then wake up any
	 * waiters.
	 */
	if (doingforceroll) {
		doingforceroll = 0;
		logmap->mtm_flags &= ~MTM_FORCE_ROLL;
		mutex_exit(&logmap->mtm_mutex);
		cv_broadcast(&logmap->mtm_from_roll_cv);
	} else {
		mutex_exit(&logmap->mtm_mutex);
	}

	/*
	 * If someone wants us to roll something, then do it
	 */
	if (logmap->mtm_flags & MTM_FORCE_ROLL) {
		doingforceroll = 1;
		goto rollsomething;
	}

	/*
	 * Log is busy, check if logmap is getting full.
	 */
	if (logmap_need_roll(logmap)) {
		goto rollsomething;
	}

	/*
	 * Check if the log is idle and is not empty
	 */
	if (!logmap->mtm_ref && !ldl_empty(ul)) {
		goto rollsomething;
	}

	/*
	 * Log is busy, check if it's getting full
	 */
	if (ldl_need_roll(ul)) {
		goto rollsomething;
	}

	/*
	 * nothing to do; wait a bit and then start over
	 */
	trans_roll_wait(logmap, &cprinfo);
	goto again;

	/*
	 * ROLL SOMETHING
	 */

rollsomething:
	/*
	 * Use the cached roll buffers, or read the master
	 * and overlay the deltas
	 */
	switch (log_roll_read(ul, rbs, nmblk, roll_bufs, &nbuf)) {
	case 1: trans_roll_wait(logmap, &cprinfo);
		/* FALLTHROUGH */
	case 2: goto again;
	/* default case is success */
	}

	/*
	 * Asynchronously write out the deltas
	 */
	if (log_roll_write(ul, rbs, nbuf))
		goto again;

	/*
	 * free up the deltas in the logmap
	 */
	for (i = 0, rbp = rbs; i < nbuf; ++i, ++rbp) {
		bp = &rbp->rb_bh;
		logmap_remove_roll(logmap,
		    ldbtob(bp->b_blkno) & (offset_t)MAPBLOCKMASK, MAPBLOCKSIZE);
	}

	/*
	 * free up log space, if possible
	 */
	logmap_sethead(logmap, ul);

	/*
	 * LOOP
	 */
	goto again;
}