xref: /illumos-gate/usr/src/uts/common/fs/ufs/lufs_top.c (revision 8629b981ede6d47b0583ca2d3e62baeaa4f26e93)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*
27  * Copyright (c) 2015 by Delphix. All rights reserved.
28  */
29 
30 #include <sys/systm.h>
31 #include <sys/types.h>
32 #include <sys/vnode.h>
33 #include <sys/errno.h>
34 #include <sys/sysmacros.h>
35 #include <sys/debug.h>
36 #include <sys/kmem.h>
37 #include <sys/conf.h>
38 #include <sys/proc.h>
39 #include <sys/taskq.h>
40 #include <sys/cmn_err.h>
41 #include <sys/fs/ufs_inode.h>
42 #include <sys/fs/ufs_filio.h>
43 #include <sys/fs/ufs_log.h>
44 #include <sys/fs/ufs_bio.h>
45 
46 /*
47  * FILE SYSTEM INTERFACE TO TRANSACTION OPERATIONS (TOP; like VOP)
48  */
49 
/*
 * Thread-specific-data key under which each thread's threadtrans_t is
 * stored.  The key is created in _init_top() and looked up with tsd_get()
 * by the transaction routines in this file.
 */
50 uint_t topkey; /* tsd transaction key */
51 
52 /*
53  * declare a delta
54  */
55 void
56 top_delta(
57 	ufsvfs_t *ufsvfsp,
58 	offset_t mof,
59 	off_t nb,
60 	delta_t dtyp,
61 	int (*func)(),
62 	ulong_t arg)
63 {
64 	ml_unit_t		*ul	= ufsvfsp->vfs_log;
65 	threadtrans_t		*tp	= tsd_get(topkey);
66 
67 	ASSERT(ufsvfsp->vfs_dev == ul->un_dev);
68 	ASSERT(nb);
69 	ASSERT(((ul->un_debug & (MT_TRANSACT|MT_MATAMAP)) == 0) ||
70 	    top_delta_debug(ul, mof, nb, dtyp));
71 
72 	deltamap_add(ul->un_deltamap, mof, nb, dtyp, func, arg, tp);
73 
74 	ul->un_logmap->mtm_ref = 1; /* for roll thread's heuristic */
75 	if (tp) {
76 		tp->any_deltas = 1;
77 	}
78 }
79 
80 /*
81  * cancel a delta
82  */
83 void
84 top_cancel(ufsvfs_t *ufsvfsp, offset_t mof, off_t nb, int flags)
85 {
86 	ml_unit_t	*ul	= ufsvfsp->vfs_log;
87 	int		metadata = flags & (I_DIR|I_IBLK|I_SHAD|I_QUOTA);
88 
89 	ASSERT(ufsvfsp->vfs_dev == ul->un_dev);
90 	ASSERT(nb);
91 	ASSERT(((ul->un_debug & (MT_TRANSACT|MT_MATAMAP)) == 0) ||
92 	    (!(flags & metadata) ||
93 	    top_delta_debug(ul, mof, nb, DT_CANCEL)));
94 
95 	if (metadata)
96 		deltamap_del(ul->un_deltamap, mof, nb);
97 
98 	logmap_cancel(ul, mof, nb, metadata);
99 
100 	/*
101 	 * needed for the roll thread's heuristic
102 	 */
103 	ul->un_logmap->mtm_ref = 1;
104 }
105 
106 /*
107  * check if this delta has been canceled (metadata -> userdata)
108  */
109 int
110 top_iscancel(ufsvfs_t *ufsvfsp, offset_t mof, off_t nb)
111 {
112 	ml_unit_t	*ul	= ufsvfsp->vfs_log;
113 
114 	ASSERT(ufsvfsp->vfs_dev == ul->un_dev);
115 	ASSERT(nb);
116 	if (logmap_iscancel(ul->un_logmap, mof, nb))
117 		return (1);
118 	if (ul->un_flags & LDL_ERROR)
119 		return (1);
120 	return (0);
121 }
122 
123 /*
124  * put device into error state
125  */
126 void
127 top_seterror(ufsvfs_t *ufsvfsp)
128 {
129 	ml_unit_t	*ul	= ufsvfsp->vfs_log;
130 
131 	ASSERT(ufsvfsp->vfs_dev == ul->un_dev);
132 	ldl_seterror(ul, "ufs is forcing a ufs log error");
133 }
134 
135 /*
136  * issue a empty sync op to help empty the delta/log map or the log
137  */
138 static void
139 top_issue_sync(ufsvfs_t *ufsvfsp)
140 {
141 	int error = 0;
142 
143 	if ((curthread->t_flag & T_DONTBLOCK) == 0)
144 		curthread->t_flag |= T_DONTBLOCK;
145 	top_begin_sync(ufsvfsp, TOP_COMMIT_ASYNC, 0, &error);
146 	if (!error) {
147 		top_end_sync(ufsvfsp, &error, TOP_COMMIT_ASYNC, 0);
148 	}
149 }
150 
151 static void
152 top_issue_from_taskq(void *arg)
153 {
154 	ufsvfs_t *ufsvfsp = arg;
155 	ml_unit_t *ul = ufsvfsp->vfs_log;
156 	mt_map_t *mtm = ul->un_logmap;
157 
158 	top_issue_sync(ufsvfsp);
159 
160 	/*
161 	 * We were called from the taskq_dispatch() in top_begin_async(), so
162 	 * decrement mtm_taskq_sync_count and wake up the thread waiting
163 	 * on the mtm_cv if the mtm_taskq_sync_count hits zero.
164 	 */
165 	ASSERT(taskq_member(system_taskq, curthread));
166 
167 	mutex_enter(&mtm->mtm_lock);
168 	mtm->mtm_taskq_sync_count--;
169 	if (mtm->mtm_taskq_sync_count == 0) {
170 		cv_signal(&mtm->mtm_cv);
171 	}
172 	mutex_exit(&mtm->mtm_lock);
173 }
174 
175 /*
176  * MOBY TRANSACTION ROUTINES
177  * begin a moby transaction
178  *	sync ops enter until first sync op finishes
179  *	async ops enter until last sync op finishes
180  * end a moby transaction
181  *		outstanding deltas are pushed thru log
182  *		log buffer is committed (incore only)
183  *		next trans is open to async ops
184  *		log buffer is committed on the log
185  *		next trans is open to sync ops
186  */
187 
/*
 * Enter a synchronous transaction on ufsvfsp's log.
 *
 *	topid	operation type; TOP_FSYNC gets the early-out checks below
 *	size	log space to reserve for this operation
 *	error	must point at 0 on entry (asserted).  It is set to 1 when
 *		no transaction was entered for the caller, either because
 *		the nfs quick check found nothing left to commit or because
 *		this thread simply waited for somebody else's commit.
 *		top_issue_sync() skips top_end_sync() in that case.
 *
 * When *error is still 0 on return the caller is counted in mtm_active and
 * mtm_activesync and its size is included in un_resv; this happens either
 * directly here or when top_end_sync() folds the wantin counters into the
 * new transaction.  The routine may sleep on mtm_cv_commit.
 */
188 /*ARGSUSED*/
189 void
190 top_begin_sync(ufsvfs_t *ufsvfsp, top_t topid, ulong_t size, int *error)
191 {
192 	ml_unit_t	*ul	= ufsvfsp->vfs_log;
193 	mt_map_t	*mtm = ul->un_logmap;
194 	threadtrans_t	*tp;
195 	ushort_t	seq;
	/*
	 * NOTE: tp is deliberately left unset here.  It is assigned only on
	 * the TOP_FSYNC + T_DONTPEND path and every later read is guarded by
	 * the same T_DONTPEND test.
	 */
196 
197 	ASSERT(ufsvfsp->vfs_dev == ul->un_dev);
198 	ASSERT(error != NULL);
199 	ASSERT(*error == 0);
200 
201 	mutex_enter(&mtm->mtm_lock);
202 	if (topid == TOP_FSYNC) {
203 		/*
204 		 * Error the fsync immediately if this is an nfs thread
205 		 * and its last transaction has already been committed.
206 		 * The only transactions outstanding are those
207 		 * where no commit has even started
208 		 * (last_async_tid == mtm->mtm_tid)
209 		 * or those where a commit is in progress
210 		 * (last_async_tid == mtm->mtm_committid)
211 		 */
212 		if (curthread->t_flag & T_DONTPEND) {
213 			tp = tsd_get(topkey);
214 			if (tp && (tp->last_async_tid != mtm->mtm_tid) &&
215 			    (tp->last_async_tid != mtm->mtm_committid)) {
216 				mutex_exit(&mtm->mtm_lock);
217 				*error = 1;
218 				return;
219 			}
220 		}
221 
222 		/*
223 		 * If there's already other synchronous transactions
224 		 * and we haven't allowed async ones to start yet
225 		 * then just wait for the commit to complete.
226 		 */
227 		if (((mtm->mtm_closed & (TOP_SYNC | TOP_ASYNC)) ==
228 		    (TOP_SYNC | TOP_ASYNC)) || mtm->mtm_activesync) {
			/* loop guards against wakeups without a commit */
229 			seq = mtm->mtm_seq;
230 			do {
231 				cv_wait(&mtm->mtm_cv_commit, &mtm->mtm_lock);
232 			} while (seq == mtm->mtm_seq);
233 			mutex_exit(&mtm->mtm_lock);
234 			*error = 1;
235 			return;
236 		}
237 		if (mtm->mtm_closed & TOP_SYNC) {
238 			/*
239 			 * We know we're in the window where a thread is
240 			 * committing a transaction in top_end_sync() and
241 			 * has allowed async threads to start but hasn't
242 			 * got the completion on the commit write to
243 			 * allow sync threads to start.
244 			 * So wait for that commit completion then retest
245 			 * for the quick nfs check and if that fails
246 			 * go on to start a transaction
247 			 */
248 			seq = mtm->mtm_seq;
249 			do {
250 				cv_wait(&mtm->mtm_cv_commit, &mtm->mtm_lock);
251 			} while (seq == mtm->mtm_seq);
252 
253 			/* tp is set above if T_DONTPEND */
254 			if ((curthread->t_flag & T_DONTPEND) && tp &&
255 			    (tp->last_async_tid != mtm->mtm_tid) &&
256 			    (tp->last_async_tid != mtm->mtm_committid)) {
257 				mutex_exit(&mtm->mtm_lock);
258 				*error = 1;
259 				return;
260 			}
261 		}
262 	}
	/*
	 * Common path, entered with mtm_lock held: either join the current
	 * transaction or queue up for the next one.
	 */
263 retry:
264 	mtm->mtm_ref = 1;
265 	/*
266 	 * current transaction closed to sync ops; try for next transaction
267 	 */
268 	if ((mtm->mtm_closed & TOP_SYNC) && !panicstr) {
269 		ulong_t		resv;
270 
271 		/*
272 		 * We know a commit is in progress, if we are trying to
273 		 * commit and we haven't allowed async ones to start yet,
274 		 * then just wait for the commit completion
275 		 */
276 		if ((size == TOP_COMMIT_SIZE) &&
277 		    (((mtm->mtm_closed & (TOP_SYNC | TOP_ASYNC)) ==
278 		    (TOP_SYNC | TOP_ASYNC)) || (mtm->mtm_activesync))) {
279 			seq = mtm->mtm_seq;
280 			do {
281 				cv_wait(&mtm->mtm_cv_commit, &mtm->mtm_lock);
282 			} while (seq == mtm->mtm_seq);
283 			mutex_exit(&mtm->mtm_lock);
284 			*error = 1;
285 			return;
286 		}
287 
288 		/*
289 		 * next transaction is full; try for next transaction
290 		 */
291 		resv = size + ul->un_resv_wantin + ul->un_resv;
292 		if (resv > ul->un_maxresv) {
293 			cv_wait(&mtm->mtm_cv_commit, &mtm->mtm_lock);
294 			goto retry;
295 		}
296 		/*
297 		 * we are in the next transaction; wait for it to start
298 		 */
299 		mtm->mtm_wantin++;
300 		ul->un_resv_wantin += size;
301 		/*
302 		 * The corresponding cv_broadcast wakes up
303 		 * all threads that have been validated to go into
304 		 * the next transaction. However, because spurious
305 		 * cv_wait wakeups are possible we use a sequence
306 		 * number to check that the commit and cv_broadcast
307 		 * has really occurred. We couldn't use mtm_tid
308 		 * because on error that doesn't get incremented.
309 		 */
310 		seq = mtm->mtm_seq;
311 		do {
312 			cv_wait(&mtm->mtm_cv_commit, &mtm->mtm_lock);
313 		} while (seq == mtm->mtm_seq);
314 	} else {
315 		/*
316 		 * if the current transaction is full; try the next one
317 		 */
318 		if (size && (ul->un_resv && ((size + ul->un_resv) >
319 		    ul->un_maxresv)) && !panicstr) {
320 			/*
321 			 * log is over reserved and no one will unresv the space
322 			 *	so generate empty sync op to unresv the space
323 			 */
324 			if (mtm->mtm_activesync == 0) {
				/* top_issue_sync() takes mtm_lock itself */
325 				mutex_exit(&mtm->mtm_lock);
326 				top_issue_sync(ufsvfsp);
327 				mutex_enter(&mtm->mtm_lock);
328 				goto retry;
329 			}
330 			cv_wait(&mtm->mtm_cv_commit, &mtm->mtm_lock);
331 			goto retry;
332 		}
333 		/*
334 		 * we are in the current transaction
335 		 */
336 		mtm->mtm_active++;
337 		mtm->mtm_activesync++;
338 		ul->un_resv += size;
339 	}
340 
	/*
	 * Holds on both paths: the wantin path is counted by top_end_sync()
	 * before it bumps mtm_seq and broadcasts.
	 */
341 	ASSERT(mtm->mtm_active > 0);
342 	ASSERT(mtm->mtm_activesync > 0);
343 	mutex_exit(&mtm->mtm_lock);
344 
345 	ASSERT(((ul->un_debug & MT_TRANSACT) == 0) ||
346 	    top_begin_debug(ul, topid, size));
347 }
348 
/*
 * Number of times top_begin_async() has returned EWOULDBLOCK.  It is
 * incremented after mtm_lock has been dropped, so concurrent updates are
 * not serialized.
 */
349 int tryfail_cnt;
350 
/*
 * Enter an asynchronous transaction on ufsvfsp's log.
 *
 *	topid		operation type (only used by the debug check)
 *	size		log space to reserve for this operation
 *	tryasync	if nonzero, never sleep: return EWOULDBLOCK instead
 *
 * The calling thread's threadtrans_t is allocated on first use and stored
 * under topkey; its deltas_size and any_deltas are reset on every call.
 *
 * Returns 0 once the caller is counted in mtm_active and size has been
 * added to un_resv, or EWOULDBLOCK when tryasync is set and the transaction
 * is closed to async ops or the log is over-reserved.
 */
351 int
352 top_begin_async(ufsvfs_t *ufsvfsp, top_t topid, ulong_t size, int tryasync)
353 {
354 	ml_unit_t	*ul	= ufsvfsp->vfs_log;
355 	mt_map_t	*mtm	= ul->un_logmap;
356 	threadtrans_t   *tp;
357 
358 	ASSERT(ufsvfsp->vfs_dev == ul->un_dev);
359 
360 	tp = tsd_get(topkey);
361 	if (tp == NULL) {
362 		tp = kmem_zalloc(sizeof (threadtrans_t), KM_SLEEP);
363 		(void) tsd_set(topkey, tp);
364 	}
365 	tp->deltas_size = 0;
366 	tp->any_deltas = 0;
367 
368 	mutex_enter(&mtm->mtm_lock);
369 retry:
370 	mtm->mtm_ref = 1;
371 	/*
372 	 * current transaction closed to async ops; try for next transaction
373 	 */
374 	if ((mtm->mtm_closed & TOP_ASYNC) && !panicstr) {
375 		if (tryasync) {
376 			mutex_exit(&mtm->mtm_lock);
377 			tryfail_cnt++;
378 			return (EWOULDBLOCK);
379 		}
380 		cv_wait(&mtm->mtm_cv_next, &mtm->mtm_lock);
381 		goto retry;
382 	}
383 
384 	/*
385 	 * if the current transaction is full; try the next one
386 	 */
387 	if (((size + ul->un_resv + ul->un_resv_wantin) > ul->un_maxresv) &&
388 	    !panicstr) {
389 		/*
390 		 * log is overreserved and no one will unresv the space
391 		 *	so generate empty sync op to unresv the space
392 		 * We need TOP_SYNC_FORCED because we want to know when
393 		 * a top_end_sync is completed.
394 		 * mtm_taskq_sync_count is needed because we want to keep track
395 		 * of the pending top_issue_sync dispatches so that during
396 		 * forced umount we can wait for these to complete.
397 		 * mtm_taskq_sync_count is decremented in top_issue_from_taskq
398 		 * and can remain set even after top_end_sync completes.
399 		 * We have a window between the clearing of TOP_SYNC_FORCED
400 		 * flag and the decrementing of mtm_taskq_sync_count.
401 		 * If in this window new async transactions start consuming
402 		 * log space, the log can get overreserved.
403 		 * Subsequently a new async transaction would fail to generate
404 		 * an empty sync transaction via the taskq, since it finds
405 		 * the mtm_taskq_sync_count set. This can cause a hang.
406 		 * Hence we do not test for mtm_taskq_sync_count being zero.
407 		 * Instead, the TOP_SYNC_FORCED flag is tested here.
408 		 */
409 		if ((mtm->mtm_activesync == 0) &&
410 		    (!(mtm->mtm_closed & TOP_SYNC_FORCED))) {
411 			/*
412 			 * Set flag to stop multiple forced empty
413 			 * sync transactions. Increment mtm_taskq_sync_count.
414 			 */
415 			mtm->mtm_closed |= TOP_SYNC_FORCED;
416 			mtm->mtm_taskq_sync_count++;
417 			mutex_exit(&mtm->mtm_lock);
			/* return value ignored; dispatched with TQ_SLEEP */
418 			(void) taskq_dispatch(system_taskq,
419 			    top_issue_from_taskq, ufsvfsp, TQ_SLEEP);
			/* mtm_lock is not held at this point */
420 			if (tryasync) {
421 				tryfail_cnt++;
422 				return (EWOULDBLOCK);
423 			}
424 			mutex_enter(&mtm->mtm_lock);
425 			goto retry;
426 		}
427 		if (tryasync) {
428 			mutex_exit(&mtm->mtm_lock);
429 			tryfail_cnt++;
430 			return (EWOULDBLOCK);
431 		}
432 		cv_wait(&mtm->mtm_cv_next, &mtm->mtm_lock);
433 		goto retry;
434 	}
435 	/*
436 	 * we are in the current transaction
437 	 */
438 	mtm->mtm_active++;
439 	ul->un_resv += size;
440 
441 	ASSERT(mtm->mtm_active > 0);
442 	mutex_exit(&mtm->mtm_lock);
443 
444 	ASSERT(((ul->un_debug & MT_TRANSACT) == 0) ||
445 	    top_begin_debug(ul, topid, size));
446 	return (0);
447 }
448 
/*
 * Leave a synchronous transaction entered with top_begin_sync().
 *
 *	ep	set to EIO on return if the log unit has LDL_ERROR set;
 *		otherwise left untouched
 *	topid	operation type (only used by the debug check)
 *	size	reservation size (only used by the debug check)
 *
 * If other sync ops are still active (or the system is panicking) the
 * caller marks the transaction closed to sync ops and sleeps until mtm_seq
 * changes.  Otherwise the caller is the last sync op and performs the
 * commit: it closes the transaction to all ops, waits for mtm_active to
 * drain, pushes the deltamap, advances mtm_tid, reopens to async ops,
 * writes the commit record, and finally folds the threads queued in
 * mtm_wantin into the new transaction and wakes everybody up.
 */
449 /*ARGSUSED*/
450 void
451 top_end_sync(ufsvfs_t *ufsvfsp, int *ep, top_t topid, ulong_t size)
452 {
453 	ml_unit_t	*ul	= ufsvfsp->vfs_log;
454 	mt_map_t	*mtm	= ul->un_logmap;
455 	mapentry_t	*cancellist;
456 	uint32_t	tid;
457 
458 	ASSERT(ufsvfsp->vfs_dev == ul->un_dev);
459 	ASSERT(((ul->un_debug & MT_TRANSACT) == 0) ||
460 	    top_end_debug(ul, mtm, topid, size));
461 
462 	mutex_enter(&mtm->mtm_lock);
	/* remember the id being committed; mtm_tid is advanced below */
463 	tid = mtm->mtm_tid;
464 
465 	mtm->mtm_activesync--;
466 	mtm->mtm_active--;
467 
468 	mtm->mtm_ref = 1;
469 
470 	/*
471 	 * wait for last syncop to complete
472 	 */
473 	if (mtm->mtm_activesync || panicstr) {
474 		ushort_t seq = mtm->mtm_seq;
475 
		/* plain assignment: any other mtm_closed bits are cleared */
476 		mtm->mtm_closed = TOP_SYNC;
477 
478 		do {
479 			cv_wait(&mtm->mtm_cv_commit, &mtm->mtm_lock);
480 		} while (seq == mtm->mtm_seq);
481 		mutex_exit(&mtm->mtm_lock);
482 		goto out;
483 	}
484 	/*
485 	 * last syncop; close current transaction to all ops
486 	 */
487 	mtm->mtm_closed = TOP_SYNC|TOP_ASYNC;
488 
489 	/*
490 	 * wait for last asyncop to finish
491 	 */
	/* top_end_async() signals mtm_cv_eot when mtm_active reaches 0 */
492 	while (mtm->mtm_active) {
493 		cv_wait(&mtm->mtm_cv_eot, &mtm->mtm_lock);
494 	}
495 
496 	/*
497 	 * push dirty metadata thru the log
498 	 */
499 	deltamap_push(ul);
500 
501 	ASSERT(((ul->un_debug & MT_FORCEROLL) == 0) ||
502 	    top_roll_debug(ul));
503 
504 	mtm->mtm_tid = tid + 1;	/* can overflow to 0 */
505 
506 	/*
507 	 * Empty the cancellist, but save it for logmap_free_cancel
508 	 */
509 	mutex_enter(&mtm->mtm_mutex);
510 	cancellist = mtm->mtm_cancel;
511 	mtm->mtm_cancel = NULL;
512 	mutex_exit(&mtm->mtm_mutex);
513 
514 	/*
515 	 * allow async ops
516 	 */
517 	ASSERT(mtm->mtm_active == 0);
518 	ul->un_resv = 0; /* unreserve the log space */
519 	mtm->mtm_closed = TOP_SYNC;
520 	/*
521 	 * Hold the un_log_mutex here until we are done writing
522 	 * the commit record to prevent any more deltas to be written
523 	 * to the log after we allow async operations.
524 	 */
	/* un_log_mutex is taken before mtm_lock is dropped: no gap */
525 	mutex_enter(&ul->un_log_mutex);
526 	mutex_exit(&mtm->mtm_lock);
527 	cv_broadcast(&mtm->mtm_cv_next);
528 
529 	/*
530 	 * asynchronously write the commit record,
531 	 */
532 	logmap_commit(ul, tid);
533 
534 	/*
535 	 * wait for outstanding log writes (e.g., commits) to finish
536 	 */
537 	ldl_waito(ul);
538 
539 	/*
540 	 * Now that we are sure the commit has been written to the log
541 	 * we can free any canceled deltas.  If we free them before
542 	 * guaranteeing that the commit was written, we could panic before
543 	 * the commit, but after an async thread has allocated and written
544 	 * to canceled freed block.
545 	 */
546 
547 	logmap_free_cancel(mtm, &cancellist);
548 	mutex_exit(&ul->un_log_mutex);
549 
550 	/*
551 	 * now, allow all ops
552 	 */
553 	mutex_enter(&mtm->mtm_lock);
	/*
	 * Threads parked in top_begin_sync()'s wantin path become the
	 * active sync ops of the new transaction, and their reservations
	 * move from un_resv_wantin into un_resv.
	 */
554 	mtm->mtm_active += mtm->mtm_wantin;
555 	ul->un_resv += ul->un_resv_wantin;
556 	mtm->mtm_activesync = mtm->mtm_wantin;
557 	mtm->mtm_wantin = 0;
558 	mtm->mtm_closed = 0;
559 	ul->un_resv_wantin = 0;
560 	mtm->mtm_committid = mtm->mtm_tid;
	/* sleepers on mtm_cv_commit loop until they see this change */
561 	mtm->mtm_seq++;
562 	mutex_exit(&mtm->mtm_lock);
563 
564 	/*
565 	 * Finish any other synchronous transactions and
566 	 * start any waiting new synchronous transactions
567 	 */
568 	cv_broadcast(&mtm->mtm_cv_commit);
569 
570 	/*
571 	 * if the logmap is getting full; roll something
572 	 */
573 	if (logmap_need_roll_sync(mtm)) {
574 		logmap_forceroll_nowait(mtm);
575 	}
576 
577 out:
	/* reached by both the waiting and the committing thread */
578 	if (ul->un_flags & LDL_ERROR)
579 		*ep = EIO;
580 }
581 
582 /*ARGSUSED*/
583 void
584 top_end_async(ufsvfs_t *ufsvfsp, top_t topid, ulong_t size)
585 {
586 	ml_unit_t	*ul	= ufsvfsp->vfs_log;
587 	mt_map_t	*mtm	= ul->un_logmap;
588 	threadtrans_t	*tp	= tsd_get(topkey);
589 	int		wakeup_needed = 0;
590 
591 	ASSERT(tp);
592 	ASSERT(ufsvfsp->vfs_dev == ul->un_dev);
593 	ASSERT(((ul->un_debug & MT_TRANSACT) == 0) ||
594 	    top_end_debug(ul, mtm, topid, size));
595 
596 	mutex_enter(&mtm->mtm_lock);
597 
598 	if (size > tp->deltas_size) {
599 		ul->un_resv -= (size - tp->deltas_size);
600 	}
601 	if (tp->any_deltas) {
602 		tp->last_async_tid = mtm->mtm_tid;
603 	}
604 	mtm->mtm_ref = 1;
605 
606 	mtm->mtm_active--;
607 	if ((mtm->mtm_active == 0) &&
608 	    (mtm->mtm_closed == (TOP_SYNC|TOP_ASYNC))) {
609 		wakeup_needed = 1;
610 	}
611 	mutex_exit(&mtm->mtm_lock);
612 	if (wakeup_needed)
613 		cv_signal(&mtm->mtm_cv_eot);
614 
615 	/*
616 	 * Generate a sync op if the log, logmap, or deltamap are heavily used.
617 	 * Unless we are possibly holding any VM locks, since if we are holding
618 	 * any VM locks and we issue a top_end_sync(), we could deadlock.
619 	 */
620 	if ((mtm->mtm_activesync == 0) &&
621 	    !(mtm->mtm_closed & TOP_SYNC) &&
622 	    (deltamap_need_commit(ul->un_deltamap) ||
623 	    logmap_need_commit(mtm) ||
624 	    ldl_need_commit(ul)) &&
625 	    (topid != TOP_GETPAGE)) {
626 		top_issue_sync(ufsvfsp);
627 	}
628 	/*
629 	 * roll something from the log if the logmap is too full
630 	 */
631 	if (logmap_need_roll_async(mtm))
632 		logmap_forceroll_nowait(mtm);
633 }
634 
635 /*
636  * Called from roll thread;
637  *	buffer set for reading master
638  * Returns
639  *	0 - success, can continue with next buffer
640  *	1 - failure due to logmap deltas being in use
641  */
642 int
643 top_read_roll(rollbuf_t *rbp, ml_unit_t *ul)
644 {
645 	buf_t		*bp	= &rbp->rb_bh;
646 	offset_t	mof	= ldbtob(bp->b_blkno);
647 
648 	/*
649 	 * get a list of deltas
650 	 */
651 	if (logmap_list_get_roll(ul->un_logmap, mof, rbp)) {
652 		/* logmap deltas are in use */
653 		return (1);
654 	}
655 
656 	/*
657 	 * no deltas were found, nothing to roll
658 	 */
659 	if (rbp->rb_age == NULL) {
660 		bp->b_flags |= B_INVAL;
661 		return (0);
662 	}
663 
664 	/*
665 	 * If there is one cached roll buffer that cover all the deltas then
666 	 * we can use that instead of copying to a separate roll buffer.
667 	 */
668 	if (rbp->rb_crb) {
669 		rbp->rb_bh.b_blkno = lbtodb(rbp->rb_crb->c_mof);
670 		return (0);
671 	}
672 
673 	/*
674 	 * Set up the read.
675 	 * If no read is needed logmap_setup_read() returns 0.
676 	 */
677 	if (logmap_setup_read(rbp->rb_age, rbp)) {
678 		/*
679 		 * async read the data from master
680 		 */
681 		logstats.ls_rreads.value.ui64++;
682 		bp->b_bcount = MAPBLOCKSIZE;
683 		(void) bdev_strategy(bp);
684 		lwp_stat_update(LWP_STAT_INBLK, 1);
685 	} else {
686 		sema_v(&bp->b_io); /* mark read as complete */
687 	}
688 	return (0);
689 }
690 
/*
 * Tunable consulted by top_log(): when nonzero (the default) and the
 * caller supplies a buffer, deltas are moved to the logmap with
 * logmap_add_buf() instead of logmap_add().
 * NOTE(review): "crb" presumably stands for "cached roll buffer" - confirm.
 */
691 int ufs_crb_enable = 1;
692 
693 /*
694  * move deltas from deltamap into the log
695  */
696 void
697 top_log(ufsvfs_t *ufsvfsp, char *va, offset_t vamof, off_t nb,
698     caddr_t buf, uint32_t bufsz)
699 {
700 	ml_unit_t	*ul = ufsvfsp->vfs_log;
701 	mapentry_t	*me;
702 	offset_t	hmof;
703 	uint32_t	hnb, nb1;
704 
705 	/*
706 	 * needed for the roll thread's heuristic
707 	 */
708 	ul->un_logmap->mtm_ref = 1;
709 
710 	if (buf && ufs_crb_enable) {
711 		ASSERT((bufsz & DEV_BMASK) == 0);
712 		/*
713 		 * Move any deltas to the logmap. Split requests that
714 		 * straddle MAPBLOCKSIZE hash boundaries (i.e. summary info).
715 		 */
716 		for (hmof = vamof - (va - buf), nb1 = nb; bufsz;
717 		    bufsz -= hnb, hmof += hnb, buf += hnb, nb1 -= hnb) {
718 			hnb = MAPBLOCKSIZE - (hmof & MAPBLOCKOFF);
719 			if (hnb > bufsz)
720 				hnb = bufsz;
721 			me = deltamap_remove(ul->un_deltamap,
722 			    MAX(hmof, vamof), MIN(hnb, nb1));
723 			if (me) {
724 				logmap_add_buf(ul, va, hmof, me, buf, hnb);
725 			}
726 		}
727 	} else {
728 		/*
729 		 * if there are deltas
730 		 */
731 		me = deltamap_remove(ul->un_deltamap, vamof, nb);
732 		if (me) {
733 			/*
734 			 * move to logmap
735 			 */
736 			logmap_add(ul, va, vamof, me);
737 		}
738 	}
739 
740 	ASSERT((ul->un_matamap == NULL) ||
741 	    matamap_within(ul->un_matamap, vamof, nb));
742 }
743 
744 
745 static void
746 top_threadtrans_destroy(void *tp)
747 {
748 	kmem_free(tp, sizeof (threadtrans_t));
749 }
750 
751 void
752 _init_top(void)
753 {
754 	ASSERT(top_init_debug());
755 
756 	/*
757 	 * set up the delta layer
758 	 */
759 	_init_map();
760 
761 	/*
762 	 * Initialise the thread specific data transaction key
763 	 */
764 	tsd_create(&topkey, top_threadtrans_destroy);
765 }
766