xref: /titanic_52/usr/src/uts/common/fs/ufs/lufs_map.c (revision 554ff184129088135ad2643c1c9832174a17be88)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 #pragma ident	"%Z%%M%	%I%	%E% SMI"
23 
24 /*
25  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
26  * Use is subject to license terms.
27  */
28 
29 #include <sys/systm.h>
30 #include <sys/types.h>
31 #include <sys/vnode.h>
32 #include <sys/errno.h>
33 #include <sys/sysmacros.h>
34 #include <sys/debug.h>
35 #include <sys/kmem.h>
36 #include <sys/conf.h>
37 #include <sys/proc.h>
38 #include <sys/cmn_err.h>
39 #include <sys/fs/ufs_inode.h>
40 #include <sys/fs/ufs_filio.h>
41 #include <sys/fs/ufs_log.h>
42 #include <sys/inttypes.h>
43 #include <sys/atomic.h>
44 #include <sys/tuneable.h>
45 
46 /*
47  * externs
48  */
/* Priority handed to thread_create() when the roll thread is started. */
extern pri_t minclsyspri;
extern struct kmem_cache *lufs_bp;
/*
 * Push function whose address is compared against me_func to recognize
 * mapentries that carry quota information.
 */
extern int ufs_trans_push_quota();

/*
 * globals
 */
/* kmem cache from which every mapentry_t in this file is allocated/freed */
kmem_cache_t *mapentry_cache;

/*
 * logmap tuning constants
 *
 * logmap_maxnme_commit is compared against mtm_nmet by logmap_need_commit();
 * logmap_maxnme_async and logmap_maxnme_sync are compared against mtm_nme by
 * logmap_need_roll_async() and logmap_need_roll_sync() respectively.
 * logmap_maxcfrag_commit is copied into mtm_cfragmax by map_get().
 */
long	logmap_maxnme_commit	= 2048;
long	logmap_maxnme_async	= 4096;
long	logmap_maxnme_sync	= 6144;
long	logmap_maxcfrag_commit	= 4;	/* Max canceled fragments per moby */


/*
 * Cached roll buffer (crb) accounting.  ufs_crb_size is decremented by
 * CRB_FREE() whenever a crb is released.
 */
uint64_t ufs_crb_size = 0;		/* current size of all crb buffers */
uint64_t ufs_crb_max_size = 0;		/* highest crb buffer use so far */
size_t ufs_crb_limit;			/* max allowable size for crbs */
uint64_t ufs_crb_alloc_fails = 0;	/* crb allocation failures stat */
#define	UFS_MAX_CRB_DEFAULT_DIVISOR 10	/* max 1/10 kmem_maxavail() */
int ufs_max_crb_divisor = UFS_MAX_CRB_DEFAULT_DIVISOR; /* tunable */
/* Invoked through HANDLE_DQUOT() before a quota mapentry is replaced. */
void handle_dquot(mapentry_t *);
74 
75 /*
76  * GENERIC MAP ROUTINES
77  */
78 
/*
 * CRB_FREE(crb, me)
 *	Free the cached roll buffer `crb' (its data buffer and the crb_t
 *	itself), subtract its size from the global ufs_crb_size counter and
 *	clear the mapentry's me_crb pointer.
 *
 *	The expansion is a single braced block with every argument
 *	parenthesized, so the macro behaves as one statement (e.g. as the
 *	body of an unbraced `if') and accepts arbitrary pointer expressions.
 *	It may be followed by an optional `;'.
 */
#define	CRB_FREE(crb, me) { \
	kmem_free((crb)->c_buf, (crb)->c_nb); \
	atomic_add_64(&ufs_crb_size, -(uint64_t)(crb)->c_nb); \
	kmem_free((crb), sizeof (crb_t)); \
	(me)->me_crb = NULL; \
}

/*
 * CRB_RELE(me)
 *	Drop one reference on the cached roll buffer attached to `me', if
 *	any, and free the buffer when the reference count reaches zero.
 *
 *	The temporary has a macro-specific name so that it cannot capture an
 *	identifier appearing in the caller's `me' expression.
 */
#define	CRB_RELE(me) { \
	crb_t *crb_rele_crb = (me)->me_crb; \
	if (crb_rele_crb && (--crb_rele_crb->c_refcnt == 0)) { \
		CRB_FREE(crb_rele_crb, me) \
	} \
}
91 
/*
 * HANDLE_DQUOT(me, melist)
 *	`me' is the old delta about to be replaced and `melist' the new one.
 *	If the old delta has an argument and a push function of
 *	ufs_trans_push_quota(), and the old and new deltas differ in delta
 *	type, argument or push function, clean up with handle_dquot() before
 *	the old delta is replaced.
 *
 *	Every use of a macro argument is parenthesized so that arbitrary
 *	pointer expressions may be passed.
 */
#define	HANDLE_DQUOT(me, melist) { \
	if (((me)->me_arg) && \
	    ((me)->me_func == ufs_trans_push_quota)) { \
		if (!(((me)->me_dt == (melist)->me_dt) && \
		    ((me)->me_arg == (melist)->me_arg) && \
		    ((me)->me_func == (melist)->me_func))) { \
			handle_dquot(me); \
		} \
	} \
}
107 
108 /*
109  * free up all the mapentries for a map
110  */
111 void
112 map_free_entries(mt_map_t *mtm)
113 {
114 	int		i;
115 	mapentry_t	*me;
116 
117 	while ((me = mtm->mtm_next) != (mapentry_t *)mtm) {
118 		me->me_next->me_prev = me->me_prev;
119 		me->me_prev->me_next = me->me_next;
120 		CRB_RELE(me);
121 		kmem_cache_free(mapentry_cache, me);
122 	}
123 	for (i = 0; i < mtm->mtm_nhash; i++)
124 		mtm->mtm_hash[i] = NULL;
125 	mtm->mtm_nme = 0;
126 	mtm->mtm_nmet = 0;
127 }
128 
129 /*
130  * done with map; free if necessary
131  */
132 mt_map_t *
133 map_put(mt_map_t *mtm)
134 {
135 	/*
136 	 * free up the map's memory
137 	 */
138 	map_free_entries(mtm);
139 	ASSERT(map_put_debug(mtm));
140 	kmem_free(mtm->mtm_hash,
141 		(size_t) (sizeof (mapentry_t *) * mtm->mtm_nhash));
142 	mutex_destroy(&mtm->mtm_mutex);
143 	mutex_destroy(&mtm->mtm_scan_mutex);
144 	cv_destroy(&mtm->mtm_to_roll_cv);
145 	cv_destroy(&mtm->mtm_from_roll_cv);
146 	rw_destroy(&mtm->mtm_rwlock);
147 	mutex_destroy(&mtm->mtm_lock);
148 	cv_destroy(&mtm->mtm_cv_commit);
149 	cv_destroy(&mtm->mtm_cv_next);
150 	cv_destroy(&mtm->mtm_cv_eot);
151 	cv_destroy(&mtm->mtm_cv);
152 	kmem_free(mtm, sizeof (mt_map_t));
153 	return (NULL);
154 }
155 /*
156  * Allocate a map;
157  */
158 mt_map_t *
159 map_get(ml_unit_t *ul, enum maptypes maptype, int nh)
160 {
161 	mt_map_t	*mtm;
162 
163 	/*
164 	 * assume the map is not here and allocate the necessary structs
165 	 */
166 	mtm = kmem_zalloc(sizeof (mt_map_t), KM_SLEEP);
167 	mutex_init(&mtm->mtm_mutex, NULL, MUTEX_DEFAULT, NULL);
168 	mutex_init(&mtm->mtm_scan_mutex, NULL, MUTEX_DEFAULT, NULL);
169 	cv_init(&mtm->mtm_to_roll_cv, NULL, CV_DEFAULT, NULL);
170 	cv_init(&mtm->mtm_from_roll_cv, NULL, CV_DEFAULT, NULL);
171 	rw_init(&mtm->mtm_rwlock, NULL, RW_DEFAULT, NULL);
172 	mtm->mtm_next = (mapentry_t *)mtm;
173 	mtm->mtm_prev = (mapentry_t *)mtm;
174 	mtm->mtm_hash = kmem_zalloc((size_t) (sizeof (mapentry_t *) * nh),
175 	    KM_SLEEP);
176 	mtm->mtm_nhash = nh;
177 	mtm->mtm_debug = ul->un_debug;
178 	mtm->mtm_type = maptype;
179 
180 	mtm->mtm_cfrags = 0;
181 	mtm->mtm_cfragmax = logmap_maxcfrag_commit;
182 
183 	/*
184 	 * for scan test
185 	 */
186 	mtm->mtm_ul = ul;
187 
188 	/*
189 	 * Initialize locks
190 	 */
191 	mutex_init(&mtm->mtm_lock, NULL, MUTEX_DEFAULT, NULL);
192 	cv_init(&mtm->mtm_cv_commit, NULL, CV_DEFAULT, NULL);
193 	cv_init(&mtm->mtm_cv_next, NULL, CV_DEFAULT, NULL);
194 	cv_init(&mtm->mtm_cv_eot, NULL, CV_DEFAULT, NULL);
195 	cv_init(&mtm->mtm_cv, NULL, CV_DEFAULT, NULL);
196 	ASSERT(map_get_debug(ul, mtm));
197 
198 	return (mtm);
199 }
200 
201 /*
202  * DELTAMAP ROUTINES
203  */
204 /*
205  * deltamap tuning constants
206  */
207 long	deltamap_maxnme	= 1024;	/* global so it can be set */
208 
209 int
210 deltamap_need_commit(mt_map_t *mtm)
211 {
212 	return (mtm->mtm_nme > deltamap_maxnme);
213 }
214 
/*
 * put a delta into a deltamap; may sleep on memory
 *
 * The range (mof, nb) is split at MAPBLOCKSIZE boundaries and one mapentry
 * is appended per piece, unless an entry that already contains the piece
 * (and is not a quota entry) is found on the hash chain.
 *
 *	mtm	- map to add to; mtm_mutex is held while the map is modified
 *	mof	- starting offset of the delta
 *	nb	- length of the delta in bytes
 *	dtyp	- delta type, stored in me_dt
 *	func	- push function, stored in me_func
 *	arg	- push function argument, stored in me_arg
 *	tp	- if non-NULL and mtm is the unit's deltamap, tp->deltas_size
 *		  is increased by an upper bound on the log space needed
 */
void
deltamap_add(
	mt_map_t *mtm,
	offset_t mof,
	off_t nb,
	delta_t dtyp,
	int (*func)(),
	ulong_t arg,
	threadtrans_t *tp)
{
	int32_t		hnb;
	mapentry_t	*me;
	mapentry_t	**mep;

	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
		map_check_linkage(mtm));

	mutex_enter(&mtm->mtm_mutex);

	/* hnb is the number of bytes handled in the current map block */
	for (hnb = 0; nb; nb -= hnb, mof += hnb) {
		hnb = MAPBLOCKSIZE - (mof & MAPBLOCKOFF);
		if (hnb > nb)
			hnb = nb;
		/*
		 * Search for dup entry. We need to ensure that we don't
		 * replace a map entry which carries quota information
		 * with a map entry which doesn't. In that case we lose
		 * the reference to the dquot structure which will not be
		 * cleaned up by the push function me->me_func as this will
		 * never be called.
		 * The stray dquot would be found later by invalidatedq()
		 * causing a panic when the filesystem is unmounted.
		 */
		mep = MAP_HASH(mof, mtm);
		for (me = *mep; me; me = me->me_hash) {
			if (DATAwithinME(mof, hnb, me)) {
			    if (me->me_func == ufs_trans_push_quota) {
				/*
				 * Don't remove quota entries which have
				 * incremented the ref count (those with a
				 * ufs_trans_push_quota push function).
				 * Let logmap_add[_buf] clean them up.
				 */
				continue;
			    }
			    break;
			}
			ASSERT((dtyp == DT_CANCEL) ||
				(!DATAoverlapME(mof, hnb, me)) ||
				MEwithinDATA(me, mof, hnb));
		}

		if (me) {
			/* already in map */
			continue;
		}

		/*
		 * Add up all the delta map deltas so we can compute
		 * an upper bound on the log size used.
		 * Note, some deltas get removed from the deltamap
		 * before the deltamap_push by lufs_write_strategy
		 * and so multiple deltas to the same mof offset
		 * don't get cancelled here but in the logmap.
		 * Thus we can't easily get an accurate count of
		 * the log space used - only an upper bound.
		 */
		if (tp && (mtm->mtm_ul->un_deltamap == mtm)) {
			ASSERT(dtyp != DT_CANCEL);
			if (dtyp == DT_ABZERO) {
				tp->deltas_size += sizeof (struct delta);
			} else {
				tp->deltas_size +=
				    (hnb + sizeof (struct delta));
			}
		}

		delta_stats[dtyp]++;

		/*
		 * get a mapentry
		 * May need to drop & re-grab the mtm_mutex
		 * and then recheck for a duplicate
		 *
		 * NOTE(review): no duplicate recheck is visible after the
		 * mutex is re-acquired below; the new entry is inserted
		 * unconditionally.  Confirm whether callers guarantee that
		 * no competing insert for the same range can occur while
		 * the mutex is dropped.
		 */
		me = kmem_cache_alloc(mapentry_cache, KM_NOSLEEP);
		if (me == NULL) {
			/* don't sleep for memory while holding the mutex */
			mutex_exit(&mtm->mtm_mutex);
			me = kmem_cache_alloc(mapentry_cache, KM_SLEEP);
			mutex_enter(&mtm->mtm_mutex);
		}
		bzero(me, sizeof (mapentry_t));

		/*
		 * initialize and put in deltamap
		 */
		me->me_mof = mof;
		me->me_nb = hnb;
		me->me_func = func;
		me->me_arg = arg;
		me->me_dt = dtyp;
		me->me_flags = ME_HASH;
		me->me_tid = mtm->mtm_tid;

		/* head of the hash chain, tail of the circular list */
		me->me_hash = *mep;
		*mep = me;
		me->me_next = (mapentry_t *)mtm;
		me->me_prev = mtm->mtm_prev;
		mtm->mtm_prev->me_next = me;
		mtm->mtm_prev = me;
		mtm->mtm_nme++;
	}
	mutex_exit(&mtm->mtm_mutex);

	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
		map_check_linkage(mtm));
}
334 
335 /*
336  * remove deltas within (mof, nb) and return as linked list
337  */
338 mapentry_t *
339 deltamap_remove(mt_map_t *mtm, offset_t mof, off_t nb)
340 {
341 	off_t		hnb;
342 	mapentry_t	*me;
343 	mapentry_t	**mep;
344 	mapentry_t	*mer;
345 
346 	if (mtm == NULL)
347 		return (NULL);
348 
349 	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
350 		map_check_linkage(mtm));
351 
352 	mutex_enter(&mtm->mtm_mutex);
353 	for (mer = NULL, hnb = 0; nb; nb -= hnb, mof += hnb) {
354 		hnb = MAPBLOCKSIZE - (mof & MAPBLOCKOFF);
355 		if (hnb > nb)
356 			hnb = nb;
357 		/*
358 		 * remove entries from hash and return as a aged linked list
359 		 */
360 		mep = MAP_HASH(mof, mtm);
361 		while ((me = *mep) != 0) {
362 			if (MEwithinDATA(me, mof, hnb)) {
363 				*mep = me->me_hash;
364 				me->me_next->me_prev = me->me_prev;
365 				me->me_prev->me_next = me->me_next;
366 				me->me_hash = mer;
367 				mer = me;
368 				me->me_flags |= ME_LIST;
369 				me->me_flags &= ~ME_HASH;
370 				mtm->mtm_nme--;
371 			} else
372 				mep = &me->me_hash;
373 		}
374 	}
375 	mutex_exit(&mtm->mtm_mutex);
376 
377 	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
378 		map_check_linkage(mtm));
379 
380 	return (mer);
381 }
382 
383 /*
384  * delete entries within (mof, nb)
385  */
386 void
387 deltamap_del(mt_map_t *mtm, offset_t mof, off_t nb)
388 {
389 	mapentry_t	*me;
390 	mapentry_t	*menext;
391 
392 	menext = deltamap_remove(mtm, mof, nb);
393 	while ((me = menext) != 0) {
394 		menext = me->me_hash;
395 		kmem_cache_free(mapentry_cache, me);
396 	}
397 }
398 
399 /*
400  * Call the indicated function to cause deltas to move to the logmap.
401  * top_end_sync() is the only caller of this function and
402  * it has waited for the completion of all threads, so there can
403  * be no other activity in the deltamap. Therefore we don't need to
404  * hold the deltamap lock.
405  */
406 void
407 deltamap_push(ml_unit_t *ul)
408 {
409 	delta_t		dtyp;
410 	int		(*func)();
411 	ulong_t		arg;
412 	mapentry_t	*me;
413 	offset_t	mof;
414 	off_t		nb;
415 	mt_map_t	*mtm	= ul->un_deltamap;
416 
417 	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
418 		map_check_linkage(mtm));
419 
420 	/*
421 	 * for every entry in the deltamap
422 	 */
423 	while ((me = mtm->mtm_next) != (mapentry_t *)mtm) {
424 		ASSERT(me->me_func);
425 		func = me->me_func;
426 		dtyp = me->me_dt;
427 		arg = me->me_arg;
428 		mof = me->me_mof;
429 		nb = me->me_nb;
430 		if ((ul->un_flags & LDL_ERROR) ||
431 		    (*func)(ul->un_ufsvfs, dtyp, arg))
432 			deltamap_del(mtm, mof, nb);
433 	}
434 
435 	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
436 		map_check_linkage(mtm));
437 }
438 
439 /*
440  * LOGMAP ROUTINES
441  */
442 
443 int
444 logmap_need_commit(mt_map_t *mtm)
445 {
446 	return ((mtm->mtm_nmet > logmap_maxnme_commit) ||
447 		(mtm->mtm_cfrags >= mtm->mtm_cfragmax));
448 }
449 
450 int
451 logmap_need_roll_async(mt_map_t *mtm)
452 {
453 	return (mtm->mtm_nme > logmap_maxnme_async);
454 }
455 
456 int
457 logmap_need_roll_sync(mt_map_t *mtm)
458 {
459 	return (mtm->mtm_nme > logmap_maxnme_sync);
460 }
461 
462 void
463 logmap_start_roll(ml_unit_t *ul)
464 {
465 	mt_map_t	*logmap	= ul->un_logmap;
466 
467 	logmap_settail(logmap, ul);
468 	ASSERT(!(ul->un_flags & LDL_NOROLL));
469 	mutex_enter(&logmap->mtm_mutex);
470 	if ((logmap->mtm_flags & MTM_ROLL_RUNNING) == 0) {
471 		logmap->mtm_flags |= MTM_ROLL_RUNNING;
472 		logmap->mtm_flags &= ~(MTM_FORCE_ROLL | MTM_ROLL_EXIT);
473 		(void) thread_create(NULL, 0, trans_roll, ul, 0, &p0,
474 		    TS_RUN, minclsyspri);
475 	}
476 	mutex_exit(&logmap->mtm_mutex);
477 }
478 
479 void
480 logmap_kill_roll(ml_unit_t *ul)
481 {
482 	mt_map_t	*mtm	= ul->un_logmap;
483 
484 	if (mtm == NULL)
485 		return;
486 
487 	mutex_enter(&mtm->mtm_mutex);
488 
489 	while (mtm->mtm_flags & MTM_ROLL_RUNNING) {
490 		mtm->mtm_flags |= MTM_ROLL_EXIT;
491 		cv_signal(&mtm->mtm_to_roll_cv);
492 		cv_wait(&mtm->mtm_from_roll_cv, &mtm->mtm_mutex);
493 	}
494 	mutex_exit(&mtm->mtm_mutex);
495 }
496 
497 /*
498  * kick the roll thread if it's not doing anything
499  */
500 void
501 logmap_forceroll_nowait(mt_map_t *logmap)
502 {
503 	/*
504 	 * Don't need to lock mtm_mutex to read mtm_flags here as we
505 	 * don't care in the rare case when we get a transitional value
506 	 * of mtm_flags. Just by signalling the thread it will wakeup
507 	 * and notice it has too many logmap entries.
508 	 */
509 	ASSERT(!(logmap->mtm_ul->un_flags & LDL_NOROLL));
510 	if ((logmap->mtm_flags & MTM_ROLLING) == 0) {
511 		cv_signal(&logmap->mtm_to_roll_cv);
512 	}
513 }
514 
515 /*
516  * kick the roll thread and wait for it to finish a cycle
517  */
518 void
519 logmap_forceroll(mt_map_t *mtm)
520 {
521 	mutex_enter(&mtm->mtm_mutex);
522 	if ((mtm->mtm_flags & MTM_FORCE_ROLL) == 0) {
523 		mtm->mtm_flags |= MTM_FORCE_ROLL;
524 		cv_signal(&mtm->mtm_to_roll_cv);
525 	}
526 	do {
527 		if ((mtm->mtm_flags & MTM_ROLL_RUNNING) == 0) {
528 			mtm->mtm_flags &= ~MTM_FORCE_ROLL;
529 			goto out;
530 		}
531 		cv_wait(&mtm->mtm_from_roll_cv, &mtm->mtm_mutex);
532 	} while (mtm->mtm_flags & MTM_FORCE_ROLL);
533 out:
534 	mutex_exit(&mtm->mtm_mutex);
535 }
536 
/*
 * remove rolled deltas within (mof, nb) and free them
 *
 * Only entries flagged ME_ROLL that lie within the range are removed.
 * The first pass runs with just mtm_mutex held.  If a matching entry is
 * found with ME_AGE set, the mutex is dropped and the whole range is
 * rescanned from the start with mtm_rwlock held as writer.
 * Entries flagged ME_CANCEL are unlinked here but not freed.
 */
void
logmap_remove_roll(mt_map_t *mtm, offset_t mof, off_t nb)
{
	int		dolock = 0;	/* set once the rwlock is needed */
	off_t		hnb;
	mapentry_t	*me;
	mapentry_t	**mep;
	offset_t	savmof	= mof;	/* original range, for the restart */
	off_t		savnb	= nb;

	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
		map_check_linkage(mtm));

again:
	if (dolock)
		rw_enter(&mtm->mtm_rwlock, RW_WRITER);
	mutex_enter(&mtm->mtm_mutex);
	/* walk the range one map block at a time */
	for (hnb = 0; nb; nb -= hnb, mof += hnb) {
		hnb = MAPBLOCKSIZE - (mof & MAPBLOCKOFF);
		if (hnb > nb)
			hnb = nb;
		/*
		 * remove and free the rolled entries
		 */
		mep = MAP_HASH(mof, mtm);
		while ((me = *mep) != 0) {
			if ((me->me_flags & ME_ROLL) &&
			    (MEwithinDATA(me, mof, hnb))) {
				if (me->me_flags & ME_AGE) {
					/*
					 * Entry is on an age list; restart
					 * the scan holding the rwlock as
					 * writer.  This can happen at most
					 * once per call.
					 */
					ASSERT(dolock == 0);
					dolock = 1;
					mutex_exit(&mtm->mtm_mutex);
					mof = savmof;
					nb = savnb;
					goto again;
				}
				/* unlink from hash chain and circular list */
				*mep = me->me_hash;
				me->me_next->me_prev = me->me_prev;
				me->me_prev->me_next = me->me_next;
				me->me_flags &= ~(ME_HASH|ME_ROLL);
				ASSERT(!(me->me_flags & ME_USER));
				mtm->mtm_nme--;
				/*
				 * cancelled entries are handled by someone else
				 */
				if ((me->me_flags & ME_CANCEL) == 0) {
					roll_stats[me->me_dt]++;
					CRB_RELE(me);
					kmem_cache_free(mapentry_cache, me);
				}
			} else
				mep = &me->me_hash;
		}
	}
	mutex_exit(&mtm->mtm_mutex);

	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
		map_check_linkage(mtm));

	if (dolock)
		rw_exit(&mtm->mtm_rwlock);
}
602 
603 /*
604  * Find the disk offset of the next delta to roll.
605  * Returns 0: no more deltas to roll or a transaction is being committed
606  *	   1: a delta to roll has been found and *mofp points
607  *	      to the master file disk offset
608  */
609 int
610 logmap_next_roll(mt_map_t *logmap, offset_t *mofp)
611 {
612 	mapentry_t *me;
613 
614 	ASSERT(((logmap->mtm_debug & MT_CHECK_MAP) == 0) ||
615 		map_check_linkage(logmap));
616 
617 	mutex_enter(&logmap->mtm_mutex);
618 	for (me = logmap->mtm_next; me != (mapentry_t *)logmap;
619 	    me = me->me_next) {
620 		/* already rolled */
621 		if (me->me_flags & ME_ROLL) {
622 			continue;
623 		}
624 
625 		/* part of currently busy transaction; stop */
626 		if (me->me_tid == logmap->mtm_tid) {
627 			break;
628 		}
629 
630 		/* part of commit-in-progress transaction; stop */
631 		if (me->me_tid == logmap->mtm_committid) {
632 			break;
633 		}
634 
635 		/*
636 		 * We shouldn't see a DT_CANCEL mapentry whose
637 		 * tid != mtm_committid, or != mtm_tid since
638 		 * these are removed at the end of each committed
639 		 * transaction.
640 		 */
641 		ASSERT(!(me->me_dt == DT_CANCEL));
642 
643 		*mofp = me->me_mof;
644 		mutex_exit(&logmap->mtm_mutex);
645 		return (1);
646 	}
647 	mutex_exit(&logmap->mtm_mutex);
648 	return (0);
649 }
650 
651 /*
652  * put mapentry on sorted age list
653  */
654 static void
655 logmap_list_age(mapentry_t **age, mapentry_t *meadd)
656 {
657 	mapentry_t	*me;
658 
659 	ASSERT(!(meadd->me_flags & (ME_AGE|ME_LIST)));
660 
661 	for (me = *age; me; age = &me->me_agenext, me = *age) {
662 		if (me->me_age > meadd->me_age)
663 			break;
664 	}
665 	meadd->me_agenext = me;
666 	meadd->me_flags |= ME_AGE;
667 	*age = meadd;
668 }
669 
/*
 * get a list of deltas within <mof, mof+nb>
 *	returns with mtm_rwlock held
 *	return value says whether the entire mof range is covered by deltas
 *
 * The list is returned through *age, ordered by logmap_list_age(), and
 * every entry on it has ME_AGE set.  DT_CANCEL entries are skipped.
 * mtm_rwlock is first taken as reader; if an overlapping entry already
 * has ME_AGE set, the partial list is undone, both locks are dropped and
 * the scan restarts with the rwlock taken as writer.
 * The return value is 1 when a single collected entry (or its cached roll
 * buffer, if it has one) contains the whole original range, else 0.
 */
int
logmap_list_get(
	mt_map_t *mtm,
	offset_t mof,
	off_t nb,
	mapentry_t **age)
{
	off_t		hnb;
	mapentry_t	*me;
	mapentry_t	**mep;
	int		rwtype	= RW_READER;
	offset_t	savmof	= mof;	/* original range, for restarts */
	off_t		savnb	= nb;
	int		entire	= 0;
	crb_t		*crb;

	mtm->mtm_ref = 1;
again:

	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
		map_check_linkage(mtm));

	rw_enter(&mtm->mtm_rwlock, rwtype);
	*age = NULL;
	mutex_enter(&mtm->mtm_mutex);
	/* walk the range one map block at a time */
	for (hnb = 0; nb; nb -= hnb, mof += hnb) {
		hnb = MAPBLOCKSIZE - (mof & MAPBLOCKOFF);
		if (hnb > nb)
			hnb = nb;
		/*
		 * find overlapping entries
		 */
		mep = MAP_HASH(mof, mtm);
		for (me = *mep; me; me = me->me_hash) {
			if (me->me_dt == DT_CANCEL)
				continue;
			if (!DATAoverlapME(mof, hnb, me))
				continue;
			/*
			 * check if map entry is in use
			 * (about to be rolled).
			 */
			if (me->me_flags & ME_AGE) {
				/*
				 * reset the age bit in the list,
				 * upgrade the lock, and try again
				 */
				for (me = *age; me; me = *age) {
					*age = me->me_agenext;
					me->me_flags &= ~ME_AGE;
				}
				mutex_exit(&mtm->mtm_mutex);
				rw_exit(&mtm->mtm_rwlock);
				rwtype = RW_WRITER;
				mof = savmof;
				nb = savnb;
				entire = 0;
				goto again;
			} else {
				/* add mapentry to age ordered list */
				logmap_list_age(age, me);
				/*
				 * coverage is tested against the original
				 * range, not the current map block piece
				 */
				crb = me->me_crb;
				if (crb) {
					if (DATAwithinCRB(savmof, savnb, crb)) {
						entire = 1;
					}
				} else {
					if (DATAwithinME(savmof, savnb, me)) {
						entire = 1;
					}
				}
			}
		}
	}
	mutex_exit(&mtm->mtm_mutex);

	ASSERT(RW_LOCK_HELD(&mtm->mtm_rwlock));
	return (entire);
}
754 
/*
 * Get a list of deltas for rolling - returns success or failure.
 * Also return the cached roll buffer if all deltas point to it.
 *
 * The caller must hold mtm_rwlock and mof must be MAPBLOCKSIZE aligned.
 * Entries belonging to the current or committing transaction, and
 * DT_CANCEL entries, are not collected.
 *
 * Returns 0 on success: rbp->rb_age holds the age ordered list (possibly
 * NULL), every entry on it is flagged ME_AGE and ME_ROLL, and rbp->rb_crb
 * is the shared valid cached roll buffer or NULL.
 * Returns 1 on failure, when an overlapping entry already had ME_AGE set;
 * the ME_AGE flags set by this call are cleared again before returning.
 */
int
logmap_list_get_roll(mt_map_t *logmap, offset_t mof, rollbuf_t *rbp)
{
	mapentry_t	*me, **mep, *age = NULL;
	crb_t		*crb = NULL;

	ASSERT(RW_LOCK_HELD(&logmap->mtm_rwlock));
	ASSERT(((logmap->mtm_debug & MT_CHECK_MAP) == 0) ||
		map_check_linkage(logmap));
	ASSERT((mof & MAPBLOCKOFF) == 0);

	rbp->rb_crb = NULL;

	/*
	 * find overlapping entries
	 */
	mutex_enter(&logmap->mtm_mutex);
	mep = MAP_HASH(mof, logmap);
	for (me = *mep; me; me = me->me_hash) {
		if (!DATAoverlapME(mof, MAPBLOCKSIZE, me))
			continue;
		if (me->me_tid == logmap->mtm_tid)
			continue;
		if (me->me_tid == logmap->mtm_committid)
			continue;
		if (me->me_dt == DT_CANCEL)
			continue;

		/*
		 * Check if map entry is in use (by lufs_read_strategy())
		 * and if so reset the age bit in the list and report
		 * failure to the caller.
		 */
		if (me->me_flags & ME_AGE) {
			for (me = age; me; me = age) {
				age = me->me_agenext;
				me->me_flags &= ~ME_AGE;
			}
			mutex_exit(&logmap->mtm_mutex);
			return (1); /* failure */
		} else {
			/* add mapentry to age ordered list */
			logmap_list_age(&age, me);
		}
	}
	if (!age) {
		goto out;
	}

	/*
	 * Mark the deltas as being rolled.
	 */
	for (me = age; me; me = me->me_agenext) {
		me->me_flags |= ME_ROLL;
	}

	/*
	 * Test if all deltas are covered by one valid roll buffer
	 */
	crb = age->me_crb;
	if (crb && !(crb->c_invalid)) {
		for (me = age; me; me = me->me_agenext) {
			if (me->me_crb != crb) {
				crb = NULL;
				break;
			}
		}
		rbp->rb_crb = crb;
	}
out:
	rbp->rb_age = age;

	mutex_exit(&logmap->mtm_mutex);

	ASSERT(((logmap->mtm_debug & MT_SCAN) == 0) ||
		logmap_logscan_debug(logmap, age));
	ASSERT(RW_LOCK_HELD(&logmap->mtm_rwlock));
	return (0); /* success */
}
838 
839 void
840 logmap_list_put_roll(mt_map_t *mtm, mapentry_t *age)
841 {
842 	mapentry_t	*me;
843 
844 	ASSERT(RW_LOCK_HELD(&mtm->mtm_rwlock));
845 	mutex_enter(&mtm->mtm_mutex);
846 	for (me = age; me; me = age) {
847 		age = me->me_agenext;
848 		me->me_flags &= ~ME_AGE;
849 	}
850 	mutex_exit(&mtm->mtm_mutex);
851 }
852 
853 void
854 logmap_list_put(mt_map_t *mtm, mapentry_t *age)
855 {
856 	mapentry_t	*me;
857 
858 	ASSERT(RW_LOCK_HELD(&mtm->mtm_rwlock));
859 	mutex_enter(&mtm->mtm_mutex);
860 	for (me = age; me; me = age) {
861 		age = me->me_agenext;
862 		me->me_flags &= ~ME_AGE;
863 	}
864 	mutex_exit(&mtm->mtm_mutex);
865 	rw_exit(&mtm->mtm_rwlock);
866 }
867 
868 #define	UFS_RW_BALANCE 2
869 int ufs_rw_balance = UFS_RW_BALANCE;
870 
871 /*
872  * Check if we need to read the master.
873  * The master does not need to be read if the log deltas to the
874  * block are for one contiguous set of full disk sectors.
875  * Both cylinder group bit maps DT_CG (8K); directory entries (512B);
876  * and possibly others should not require master disk reads.
877  * Calculate the sector map for writing later.
878  */
879 int
880 logmap_setup_read(mapentry_t *age, rollbuf_t *rbp)
881 {
882 	offset_t mof;
883 	crb_t *crb;
884 	mapentry_t *me;
885 	int32_t nb;
886 	int i;
887 	int start_sec, end_sec;
888 	int read_needed = 0;
889 	int all_inodes = 1;
890 	int first_sec = INT_MAX;
891 	int last_sec = -1;
892 	rbsecmap_t secmap = 0;
893 
894 	/* LINTED: warning: logical expression always true: op "||" */
895 	ASSERT((MAPBLOCKSIZE / DEV_BSIZE) == (sizeof (secmap) * NBBY));
896 
897 	for (me = age; me; me = me->me_agenext) {
898 		crb = me->me_crb;
899 		if (crb) {
900 			nb = crb->c_nb;
901 			mof = crb->c_mof;
902 		} else {
903 			nb = me->me_nb;
904 			mof = me->me_mof;
905 		}
906 
907 		/*
908 		 * If the delta is not sector aligned then
909 		 * read the whole block.
910 		 */
911 		if ((nb & DEV_BMASK) || (mof & DEV_BMASK)) {
912 			read_needed = 1;
913 		}
914 
915 		/* Set sector map used in the MAPBLOCKSIZE block.  */
916 		start_sec = (mof & MAPBLOCKOFF) >> DEV_BSHIFT;
917 		end_sec = start_sec + ((nb - 1) >> DEV_BSHIFT);
918 		for (i = start_sec; i <= end_sec; i++) {
919 			secmap |= UINT16_C(1) << i;
920 		}
921 
922 		if (me->me_dt != DT_INODE) {
923 			all_inodes = 0;
924 		}
925 		if (start_sec < first_sec) {
926 			first_sec = start_sec;
927 		}
928 		if (end_sec > last_sec) {
929 			last_sec = end_sec;
930 		}
931 	}
932 
933 	ASSERT(secmap);
934 	ASSERT(first_sec != INT_MAX);
935 	ASSERT(last_sec != -1);
936 
937 	if (all_inodes) {
938 		/*
939 		 * Here we have a tradeoff choice. It must be better to
940 		 * do 2 writes * in the same MAPBLOCKSIZE chunk, than a
941 		 * read and a write. But what about 3 or more writes, versus
942 		 * a read+write? * Where is the cut over? It will depend on
943 		 * the track caching, scsi driver and other activity.
944 		 * A unpublished tunable is defined (ufs_rw_balance) that
945 		 * currently defaults to 2.
946 		 */
947 		if (!read_needed) {
948 			int count = 0, gap = 0;
949 			int sector_set; /* write needed to this sector */
950 
951 			/* Count the gaps (every 1 to 0 transation) */
952 			for (i = first_sec + 1; i < last_sec; i++) {
953 				sector_set = secmap & (UINT16_C(1) << i);
954 				if (!gap && !sector_set) {
955 					gap = 1;
956 					count++;
957 					if (count > ufs_rw_balance) {
958 						read_needed = 1;
959 						break;
960 					}
961 				} else if (gap && sector_set) {
962 					gap = 0;
963 				}
964 			}
965 		}
966 
967 		/*
968 		 * Inodes commonly make up the majority (~85%) of deltas.
969 		 * They cannot contain embedded user data, so its safe to
970 		 * read and write them all in one IO.
971 		 * But for directory entries, shadow inode data, and
972 		 * quota record data the user data fragments can be embedded
973 		 * betwen those metadata, and so its not safe to read, modify
974 		 * then write the entire range as user asynchronous user data
975 		 * writes could get overwritten with old data.
976 		 * Thus we have to create a segment map of meta data that
977 		 * needs to get written.
978 		 *
979 		 * If user data was logged then this issue would go away.
980 		 */
981 		if (read_needed) {
982 			for (i = first_sec + 1; i < last_sec; i++) {
983 				secmap |= (UINT16_C(1) << i);
984 			}
985 		}
986 	}
987 	rbp->rb_secmap = secmap;
988 	return (read_needed);
989 }
990 
991 /*
992  * Abort the load of a set of log map delta's.
993  * ie,
994  * Clear out all mapentries on this unit's log map
995  * which have a tid (transaction id) equal to the
996  * parameter tid.   Walk the cancel list, taking everything
997  * off it, too.
998  */
999 static void
1000 logmap_abort(ml_unit_t *ul, uint32_t tid)
1001 {
1002 	struct mt_map	*mtm = ul->un_logmap;	/* Log map */
1003 	mapentry_t	*me,
1004 			**mep;
1005 	int		i;
1006 
1007 	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
1008 		map_check_linkage(mtm));
1009 
1010 	/*
1011 	 * wait for any outstanding reads to finish; lock out future reads
1012 	 */
1013 	rw_enter(&mtm->mtm_rwlock, RW_WRITER);
1014 
1015 	mutex_enter(&mtm->mtm_mutex);
1016 	/* Take everything off cancel list */
1017 	while ((me = mtm->mtm_cancel) != NULL) {
1018 		mtm->mtm_cancel = me->me_cancel;
1019 		me->me_flags &= ~ME_CANCEL;
1020 		me->me_cancel = NULL;
1021 	}
1022 
1023 	/*
1024 	 * Now take out all mapentries with current tid, and committid
1025 	 * as this function is called from logmap_logscan and logmap_commit
1026 	 * When it is called from logmap_logscan mtm_tid == mtm_committid
1027 	 * But when logmap_abort is called from logmap_commit it is
1028 	 * because the log errored when trying to write the commit record,
1029 	 * after the async ops have been allowed to start in top_end_sync.
1030 	 * So we also need to remove all mapentries from the transaction whose
1031 	 * commit failed.
1032 	 */
1033 	for (i = 0; i < mtm->mtm_nhash; i++) {
1034 		mep = &mtm->mtm_hash[i];
1035 		while ((me = *mep) != NULL) {
1036 			if (me->me_tid == tid ||
1037 				me->me_tid == mtm->mtm_committid) {
1038 				*mep = me->me_hash;
1039 				me->me_next->me_prev = me->me_prev;
1040 				me->me_prev->me_next = me->me_next;
1041 				if (!(me->me_flags & ME_USER)) {
1042 					mtm->mtm_nme--;
1043 				}
1044 				CRB_RELE(me);
1045 				kmem_cache_free(mapentry_cache, me);
1046 				continue;
1047 			}
1048 			mep = &me->me_hash;
1049 		}
1050 	}
1051 
1052 	if (!(ul->un_flags & LDL_SCAN))
1053 		mtm->mtm_flags |= MTM_CANCELED;
1054 	mutex_exit(&mtm->mtm_mutex);
1055 	mtm->mtm_dirty = 0;
1056 	mtm->mtm_nmet = 0;
1057 	rw_exit(&mtm->mtm_rwlock);
1058 
1059 	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
1060 		map_check_linkage(mtm));
1061 }
1062 
1063 static void
1064 logmap_wait_space(mt_map_t *mtm, ml_unit_t *ul, mapentry_t *me)
1065 {
1066 	ASSERT(MUTEX_HELD(&ul->un_log_mutex));
1067 
1068 	while (!ldl_has_space(ul, me)) {
1069 		ASSERT(!(ul->un_flags & LDL_NOROLL));
1070 		mutex_exit(&ul->un_log_mutex);
1071 		logmap_forceroll(mtm);
1072 		mutex_enter(&ul->un_log_mutex);
1073 		if (ul->un_flags & LDL_ERROR)
1074 			break;
1075 	}
1076 
1077 	ASSERT(MUTEX_HELD(&ul->un_log_mutex));
1078 }
1079 
/*
 * put a list of deltas into a logmap
 * If va == NULL, don't write to the log.
 *
 * melist is a list of mapentries chained through me_hash.  For each one,
 * existing logmap entries that lie within its range are either freed (same
 * transaction and not on an age list) or put on the cancel list; the new
 * entry is then written to the log (when va != NULL) and linked into the
 * hash and the circular list.  If the unit is in LDL_ERROR state the new
 * entry is freed instead of being added.
 * un_log_mutex is held for the whole call.
 */
void
logmap_add(
	ml_unit_t *ul,
	char *va,			/* Ptr to buf w/deltas & data */
	offset_t vamof,			/* Offset on master of buf start */
	mapentry_t *melist)		/* Entries to add */
{
	offset_t	mof;
	off_t		nb;
	mapentry_t	*me;
	mapentry_t	**mep;
	mapentry_t	**savmep;
	uint32_t	tid;
	mt_map_t	*mtm	= ul->un_logmap;

	mutex_enter(&ul->un_log_mutex);
	if (va)
		logmap_wait_space(mtm, ul, melist);

	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
		map_check_linkage(mtm));

	mtm->mtm_ref = 1;
	mtm->mtm_dirty++;
	tid = mtm->mtm_tid;
	while (melist) {
		mof = melist->me_mof;
		nb  = melist->me_nb;

		/*
		 * search for overlapping entries
		 */
		/* savmep remembers the bucket head for the insert below */
		savmep = mep = MAP_HASH(mof, mtm);
		mutex_enter(&mtm->mtm_mutex);
		while ((me = *mep) != 0) {
			/*
			 * Data consumes old map entry; cancel map entry.
			 * Take care when we replace an old map entry
			 * which carries quota information with a newer entry
			 * which does not. In that case the push function
			 * would not be called to clean up the dquot structure.
			 * This would be found later by invalidatedq() causing
			 * a panic when the filesystem is unmounted.
			 * We clean up the dquot manually and then replace
			 * the map entry.
			 */
			if (MEwithinDATA(me, mof, nb) &&
			    ((me->me_flags & (ME_ROLL|ME_CANCEL)) == 0)) {
				if (tid == me->me_tid &&
				    ((me->me_flags & ME_AGE) == 0)) {
					/*
					 * same transaction and not on an age
					 * list: unlink and free right away
					 */
					*mep = me->me_hash;
					me->me_next->me_prev = me->me_prev;
					me->me_prev->me_next = me->me_next;
					ASSERT(!(me->me_flags & ME_USER));
					mtm->mtm_nme--;
					/*
					 * Special case if the mapentry
					 * carries a dquot and a push function.
					 * We have to clean up the quota info
					 * before replacing the mapentry.
					 */
					if (me->me_dt == DT_QR)
						HANDLE_DQUOT(me, melist);

					kmem_cache_free(mapentry_cache, me);
					continue;
				}
				/* otherwise leave it hashed but cancelled */
				me->me_cancel = mtm->mtm_cancel;
				mtm->mtm_cancel = me;
				me->me_flags |= ME_CANCEL;
			}
			mep = &(*mep)->me_hash;
		}
		mutex_exit(&mtm->mtm_mutex);

		/*
		 * remove from list
		 */
		me = melist;
		melist = melist->me_hash;
		me->me_flags &= ~ME_LIST;
		/*
		 * If va != NULL, put in the log.
		 */
		if (va)
			ldl_write(ul, va, vamof, me);
		if (ul->un_flags & LDL_ERROR) {
			/* log is in error; drop the entry */
			kmem_cache_free(mapentry_cache, me);
			continue;
		}
		ASSERT((va == NULL) ||
			((mtm->mtm_debug & MT_LOG_WRITE_CHECK) == 0) ||
			map_check_ldl_write(ul, va, vamof, me));

		/*
		 * put on hash
		 */
		mutex_enter(&mtm->mtm_mutex);
		me->me_hash = *savmep;
		*savmep = me;
		/* and on the tail of the circular list */
		me->me_next = (mapentry_t *)mtm;
		me->me_prev = mtm->mtm_prev;
		mtm->mtm_prev->me_next = me;
		mtm->mtm_prev = me;
		me->me_flags |= ME_HASH;
		me->me_tid = tid;
		me->me_age = mtm->mtm_age++;
		mtm->mtm_nme++;
		mtm->mtm_nmet++;
		mutex_exit(&mtm->mtm_mutex);
	}

	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
		map_check_linkage(mtm));
	mutex_exit(&ul->un_log_mutex);
}
1200 
1201 /*
1202  * Add the delta(s) into the log.
1203  * Create one cached roll buffer logmap entry, and reference count the
1204  * number of mapentries refering to it.
1205  * Cancel previous logmap entries.
1206  * logmap_add is tolerant of failure to allocate a cached roll buffer.
1207  */
1208 void
1209 logmap_add_buf(
1210 	ml_unit_t *ul,
1211 	char *va,			/* Ptr to buf w/deltas & data */
1212 	offset_t bufmof,		/* Offset on master of buf start */
1213 	mapentry_t *melist,		/* Entries to add */
1214 	caddr_t	buf,			/* Buffer containing delta(s) */
1215 	uint32_t bufsz)			/* Size of buf */
1216 {
1217 	offset_t	mof;
1218 	offset_t	vamof = bufmof + (va - buf);
1219 	off_t		nb;
1220 	mapentry_t	*me;
1221 	mapentry_t	**mep;
1222 	mapentry_t	**savmep;
1223 	uint32_t	tid;
1224 	mt_map_t	*mtm	= ul->un_logmap;
1225 	crb_t		*crb;
1226 	crb_t		*crbsav = NULL;
1227 
1228 	ASSERT((bufsz & DEV_BMASK) == 0);
1229 	mutex_enter(&ul->un_log_mutex);
1230 	logmap_wait_space(mtm, ul, melist);
1231 
1232 	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
1233 		map_check_linkage(mtm));
1234 
1235 	mtm->mtm_ref = 1;
1236 	mtm->mtm_dirty++;
1237 	tid = mtm->mtm_tid;
1238 	while (melist) {
1239 		mof = melist->me_mof;
1240 		nb  = melist->me_nb;
1241 
1242 		/*
1243 		 * search for overlapping entries
1244 		 */
1245 		savmep = mep = MAP_HASH(mof, mtm);
1246 		mutex_enter(&mtm->mtm_mutex);
1247 		while ((me = *mep) != 0) {
1248 			/*
1249 			 * Data consumes old map entry; cancel map entry.
1250 			 * Take care when we replace an old map entry
1251 			 * which carries quota information with a newer entry
1252 			 * which does not. In that case the push function
1253 			 * would not be called to clean up the dquot structure.
1254 			 * This would be found later by invalidatedq() causing
1255 			 * a panic when the filesystem in unmounted.
1256 			 * We clean up the dquot manually and then replace
1257 			 * the map entry.
1258 			 */
1259 			crb = me->me_crb;
1260 			if (MEwithinDATA(me, mof, nb) &&
1261 			    ((me->me_flags & (ME_ROLL|ME_CANCEL)) == 0)) {
1262 				if (tid == me->me_tid &&
1263 				    ((me->me_flags & ME_AGE) == 0)) {
1264 					*mep = me->me_hash;
1265 					me->me_next->me_prev = me->me_prev;
1266 					me->me_prev->me_next = me->me_next;
1267 					ASSERT(!(me->me_flags & ME_USER));
1268 					mtm->mtm_nme--;
1269 					/*
1270 					 * Special case if the mapentry
1271 					 * carries a dquot and a push function.
1272 					 * We have to clean up the quota info
1273 					 * before replacing the mapentry.
1274 					 */
1275 					if (me->me_dt == DT_QR)
1276 						HANDLE_DQUOT(me, melist);
1277 
1278 					/*
1279 					 * If this soon to be deleted mapentry
1280 					 * has a suitable roll buffer then
1281 					 * re-use it.
1282 					 */
1283 					if (crb && (--crb->c_refcnt == 0)) {
1284 						if (crbsav ||
1285 						    (crb->c_nb != bufsz)) {
1286 							CRB_FREE(crb, me);
1287 						} else {
1288 							bcopy(buf, crb->c_buf,
1289 							    bufsz);
1290 							crb->c_invalid = 0;
1291 							crb->c_mof = bufmof;
1292 							crbsav = crb;
1293 							me->me_crb = NULL;
1294 						}
1295 					}
1296 					kmem_cache_free(mapentry_cache, me);
1297 					continue;
1298 				}
1299 				me->me_cancel = mtm->mtm_cancel;
1300 				mtm->mtm_cancel = me;
1301 				me->me_flags |= ME_CANCEL;
1302 			}
1303 
1304 			/*
1305 			 * Inode deltas within the same fs block come
1306 			 * in individually as separate calls to logmap_add().
1307 			 * All others come in as one call. So check for an
1308 			 * existing entry where we can re-use the crb.
1309 			 */
1310 			if ((me->me_dt == DT_INODE) && (tid == me->me_tid) &&
1311 			    !crbsav && crb &&
1312 			    WITHIN(mof, nb, crb->c_mof, crb->c_nb)) {
1313 				ASSERT(crb->c_mof == bufmof);
1314 				ASSERT(crb->c_nb == bufsz);
1315 				bcopy(buf, crb->c_buf, bufsz);
1316 				crbsav = crb;
1317 			}
1318 			mep = &(*mep)->me_hash;
1319 		}
1320 		mutex_exit(&mtm->mtm_mutex);
1321 
1322 		/*
1323 		 * If we don't already have a crb then allocate one
1324 		 * and copy the incoming buffer. Only do this once
1325 		 * for all the incoming deltas.
1326 		 */
1327 		if ((crbsav == NULL) && (melist->me_dt != DT_ABZERO)) {
1328 			/*
1329 			 * Only use a cached roll buffer if we
1330 			 * have enough memory, and check for failures.
1331 			 */
1332 			if (((ufs_crb_size + bufsz) < ufs_crb_limit) &&
1333 			    (kmem_avail() > bufsz)) {
1334 				crbsav = kmem_alloc(sizeof (crb_t), KM_NOSLEEP);
1335 			} else {
1336 				ufs_crb_alloc_fails++;
1337 			}
1338 			if (crbsav) {
1339 				crbsav->c_buf = kmem_alloc(bufsz, KM_NOSLEEP);
1340 				if (crbsav->c_buf) {
1341 					atomic_add_64(&ufs_crb_size,
1342 					    (uint64_t)bufsz);
1343 					if (ufs_crb_size > ufs_crb_max_size) {
1344 						ufs_crb_max_size = ufs_crb_size;
1345 					}
1346 					bcopy(buf, crbsav->c_buf, bufsz);
1347 					crbsav->c_nb = bufsz;
1348 					crbsav->c_refcnt = 0;
1349 					crbsav->c_invalid = 0;
1350 					ASSERT((bufmof & DEV_BMASK) == 0);
1351 					crbsav->c_mof = bufmof;
1352 				} else {
1353 					kmem_free(crbsav, sizeof (crb_t));
1354 					crbsav = NULL;
1355 				}
1356 			}
1357 		}
1358 
1359 		/*
1360 		 * remove from list
1361 		 */
1362 		me = melist;
1363 		melist = melist->me_hash;
1364 		me->me_flags &= ~ME_LIST;
1365 		me->me_crb = crbsav;
1366 		if (crbsav) {
1367 			crbsav->c_refcnt++;
1368 		}
1369 		crbsav = NULL;
1370 
1371 		ASSERT(va);
1372 		ldl_write(ul, va, vamof, me); /* add to on-disk log */
1373 		if (ul->un_flags & LDL_ERROR) {
1374 			CRB_RELE(me);
1375 			kmem_cache_free(mapentry_cache, me);
1376 			continue;
1377 		}
1378 		ASSERT(((mtm->mtm_debug & MT_LOG_WRITE_CHECK) == 0) ||
1379 			map_check_ldl_write(ul, va, vamof, me));
1380 
1381 		/*
1382 		 * put on hash
1383 		 */
1384 		mutex_enter(&mtm->mtm_mutex);
1385 		me->me_hash = *savmep;
1386 		*savmep = me;
1387 		me->me_next = (mapentry_t *)mtm;
1388 		me->me_prev = mtm->mtm_prev;
1389 		mtm->mtm_prev->me_next = me;
1390 		mtm->mtm_prev = me;
1391 		me->me_flags |= ME_HASH;
1392 		me->me_tid = tid;
1393 		me->me_age = mtm->mtm_age++;
1394 		mtm->mtm_nme++;
1395 		mtm->mtm_nmet++;
1396 		mutex_exit(&mtm->mtm_mutex);
1397 	}
1398 
1399 	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
1400 		map_check_linkage(mtm));
1401 	mutex_exit(&ul->un_log_mutex);
1402 }
1403 
1404 /*
1405  * free up any cancelled deltas
1406  */
1407 void
1408 logmap_free_cancel(mt_map_t *mtm, mapentry_t **cancelhead)
1409 {
1410 	int		dolock	= 0;
1411 	mapentry_t	*me;
1412 	mapentry_t	**mep;
1413 
1414 	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
1415 		map_check_linkage(mtm));
1416 
1417 again:
1418 	if (dolock)
1419 		rw_enter(&mtm->mtm_rwlock, RW_WRITER);
1420 
1421 	/*
1422 	 * At EOT, cancel the indicated deltas
1423 	 */
1424 	mutex_enter(&mtm->mtm_mutex);
1425 	if (mtm->mtm_flags & MTM_CANCELED) {
1426 		mtm->mtm_flags &= ~MTM_CANCELED;
1427 		ASSERT(dolock == 0);
1428 		mutex_exit(&mtm->mtm_mutex);
1429 		return;
1430 	}
1431 
1432 	while ((me = *cancelhead) != NULL) {
1433 		/*
1434 		 * roll forward or read collision; wait and try again
1435 		 */
1436 		if (me->me_flags & ME_AGE) {
1437 			ASSERT(dolock == 0);
1438 			mutex_exit(&mtm->mtm_mutex);
1439 			dolock = 1;
1440 			goto again;
1441 		}
1442 		/*
1443 		 * remove from cancel list
1444 		 */
1445 		*cancelhead = me->me_cancel;
1446 		me->me_cancel = NULL;
1447 		me->me_flags &= ~(ME_CANCEL);
1448 
1449 		/*
1450 		 * logmap_remove_roll handles ME_ROLL entries later
1451 		 *	we leave them around for logmap_iscancel
1452 		 *	XXX is this necessary?
1453 		 */
1454 		if (me->me_flags & ME_ROLL)
1455 			continue;
1456 
1457 		/*
1458 		 * remove from hash (if necessary)
1459 		 */
1460 		if (me->me_flags & ME_HASH) {
1461 			mep = MAP_HASH(me->me_mof, mtm);
1462 			while (*mep) {
1463 				if (*mep == me) {
1464 					*mep = me->me_hash;
1465 					me->me_next->me_prev = me->me_prev;
1466 					me->me_prev->me_next = me->me_next;
1467 					me->me_flags &= ~(ME_HASH);
1468 					if (!(me->me_flags & ME_USER)) {
1469 						mtm->mtm_nme--;
1470 					}
1471 					break;
1472 				} else
1473 					mep = &(*mep)->me_hash;
1474 			}
1475 		}
1476 		/*
1477 		 * put the entry on the free list
1478 		 */
1479 		CRB_RELE(me);
1480 		kmem_cache_free(mapentry_cache, me);
1481 	}
1482 	mutex_exit(&mtm->mtm_mutex);
1483 	if (dolock)
1484 		rw_exit(&mtm->mtm_rwlock);
1485 
1486 	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
1487 		map_check_linkage(mtm));
1488 }
1489 
1490 
1491 void
1492 logmap_commit(ml_unit_t *ul, uint32_t tid)
1493 {
1494 	mapentry_t	me;
1495 	mt_map_t	*mtm	= ul->un_logmap;
1496 
1497 
1498 	ASSERT(MUTEX_HELD(&ul->un_log_mutex));
1499 
1500 	/*
1501 	 * async'ly write a commit rec into the log
1502 	 */
1503 	if (mtm->mtm_dirty) {
1504 		/*
1505 		 * put commit record into log
1506 		 */
1507 		me.me_mof = mtm->mtm_tid;
1508 		me.me_dt = DT_COMMIT;
1509 		me.me_nb = 0;
1510 		me.me_hash = NULL;
1511 		logmap_wait_space(mtm, ul, &me);
1512 		ldl_write(ul, NULL, (offset_t)0, &me);
1513 		ldl_round_commit(ul);
1514 
1515 		/*
1516 		 * abort on error; else reset dirty flag
1517 		 */
1518 		if (ul->un_flags & LDL_ERROR)
1519 			logmap_abort(ul, tid);
1520 		else {
1521 			mtm->mtm_dirty = 0;
1522 			mtm->mtm_nmet = 0;
1523 			mtm->mtm_cfrags = 0;
1524 		}
1525 		/* push commit */
1526 		ldl_push_commit(ul);
1527 	}
1528 }
1529 
1530 void
1531 logmap_sethead(mt_map_t *mtm, ml_unit_t *ul)
1532 {
1533 	off_t		lof;
1534 	uint32_t	tid;
1535 	mapentry_t	*me;
1536 
1537 	/*
1538 	 * move the head forward so the log knows how full it is
1539 	 * Make sure to skip any mapentry whose me_lof is 0, these
1540 	 * are just place holders for DT_CANCELED freed user blocks
1541 	 * for the current moby.
1542 	 */
1543 	mutex_enter(&ul->un_log_mutex);
1544 	mutex_enter(&mtm->mtm_mutex);
1545 	me = mtm->mtm_next;
1546 	while (me != (mapentry_t *)mtm && me->me_lof == 0) {
1547 		me = me->me_next;
1548 	}
1549 
1550 	if (me == (mapentry_t *)mtm)
1551 		lof = -1;
1552 	else {
1553 		lof = me->me_lof;
1554 		tid = me->me_tid;
1555 	}
1556 	mutex_exit(&mtm->mtm_mutex);
1557 	ldl_sethead(ul, lof, tid);
1558 	if (lof == -1)
1559 		mtm->mtm_age = 0;
1560 	mutex_exit(&ul->un_log_mutex);
1561 }
1562 
1563 void
1564 logmap_settail(mt_map_t *mtm, ml_unit_t *ul)
1565 {
1566 	off_t		lof;
1567 	size_t		nb;
1568 
1569 	/*
1570 	 * set the tail after the logmap_abort
1571 	 */
1572 	mutex_enter(&ul->un_log_mutex);
1573 	mutex_enter(&mtm->mtm_mutex);
1574 	if (mtm->mtm_prev == (mapentry_t *)mtm)
1575 		lof = -1;
1576 	else {
1577 		/*
1578 		 * set the tail to the end of the last commit
1579 		 */
1580 		lof = mtm->mtm_tail_lof;
1581 		nb = mtm->mtm_tail_nb;
1582 	}
1583 	mutex_exit(&mtm->mtm_mutex);
1584 	ldl_settail(ul, lof, nb);
1585 	mutex_exit(&ul->un_log_mutex);
1586 }
1587 
1588 /*
1589  * when reseting a device; roll the log until every
1590  * delta has been rolled forward
1591  */
1592 void
1593 logmap_roll_dev(ml_unit_t *ul)
1594 {
1595 	mt_map_t	*mtm	= ul->un_logmap;
1596 	mapentry_t	*me;
1597 	ufsvfs_t	*ufsvfsp = ul->un_ufsvfs;
1598 
1599 again:
1600 	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
1601 		map_check_linkage(mtm));
1602 	if (ul->un_flags & (LDL_ERROR|LDL_NOROLL))
1603 		return;
1604 
1605 	/*
1606 	 * look for deltas
1607 	 */
1608 	mutex_enter(&mtm->mtm_mutex);
1609 	for (me = mtm->mtm_next; me != (mapentry_t *)mtm; me = me->me_next) {
1610 		if (me->me_flags & ME_ROLL)
1611 			break;
1612 		if (me->me_tid == mtm->mtm_tid)
1613 			continue;
1614 		if (me->me_tid == mtm->mtm_committid)
1615 			continue;
1616 		break;
1617 	}
1618 
1619 	/*
1620 	 * found a delta; kick the roll thread
1621 	 * but only if the thread is running... (jmh)
1622 	 */
1623 	if (me != (mapentry_t *)mtm) {
1624 		mutex_exit(&mtm->mtm_mutex);
1625 		logmap_forceroll(mtm);
1626 		goto again;
1627 	}
1628 
1629 	/*
1630 	 * no more deltas, return
1631 	 */
1632 	mutex_exit(&mtm->mtm_mutex);
1633 	(void) ufs_putsummaryinfo(ul->un_dev, ufsvfsp, ufsvfsp->vfs_fs);
1634 
1635 	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
1636 		map_check_linkage(mtm));
1637 }
1638 
1639 static void
1640 logmap_cancel_delta(ml_unit_t *ul, offset_t mof, int32_t nb, int metadata)
1641 {
1642 	mapentry_t	*me;
1643 	mapentry_t	**mep;
1644 	mt_map_t	*mtm	= ul->un_logmap;
1645 	int		frags;
1646 
1647 	/*
1648 	 * map has been referenced and is dirty
1649 	 */
1650 	mtm->mtm_ref = 1;
1651 	mtm->mtm_dirty++;
1652 
1653 	/*
1654 	 * get a mapentry
1655 	 */
1656 	me = kmem_cache_alloc(mapentry_cache, KM_SLEEP);
1657 	bzero(me, sizeof (mapentry_t));
1658 
1659 	/*
1660 	 * initialize cancel record and put in logmap
1661 	 */
1662 	me->me_mof = mof;
1663 	me->me_nb = nb;
1664 	me->me_dt = DT_CANCEL;
1665 	me->me_tid = mtm->mtm_tid;
1666 	me->me_hash = NULL;
1667 
1668 	/*
1669 	 * Write delta to log if this delta is for metadata.  If this is not
1670 	 * metadata it is user data and we are just putting a cancel
1671 	 * mapentry into the hash to cancel a user block deletion
1672 	 * in which we do not want the block to be allocated
1673 	 * within this moby.  This cancel entry will prevent the block from
1674 	 * being allocated within the moby and prevent user data corruption
1675 	 * if we happen to crash before this moby is committed.
1676 	 */
1677 	mutex_enter(&ul->un_log_mutex);
1678 	if (metadata) {
1679 		logmap_wait_space(mtm, ul, me);
1680 		ldl_write(ul, NULL, (offset_t)0, me);
1681 		if (ul->un_flags & LDL_ERROR) {
1682 			kmem_cache_free(mapentry_cache, me);
1683 			mutex_exit(&ul->un_log_mutex);
1684 			return;
1685 		}
1686 	}
1687 
1688 	/*
1689 	 * put in hash and on cancel list
1690 	 */
1691 	mep = MAP_HASH(mof, mtm);
1692 	mutex_enter(&mtm->mtm_mutex);
1693 	me->me_age = mtm->mtm_age++;
1694 	me->me_hash = *mep;
1695 	*mep = me;
1696 	me->me_next = (mapentry_t *)mtm;
1697 	me->me_prev = mtm->mtm_prev;
1698 	mtm->mtm_prev->me_next = me;
1699 	mtm->mtm_prev = me;
1700 	me->me_cancel = mtm->mtm_cancel;
1701 	mtm->mtm_cancel = me;
1702 	if (metadata) {
1703 		mtm->mtm_nme++;
1704 		mtm->mtm_nmet++;
1705 	} else {
1706 		me->me_flags = ME_USER;
1707 	}
1708 	me->me_flags |= (ME_HASH|ME_CANCEL);
1709 	if (!(metadata)) {
1710 		frags = blkoff(ul->un_ufsvfs->vfs_fs, nb);
1711 		if (frags)
1712 			mtm->mtm_cfrags += numfrags(ul->un_ufsvfs->vfs_fs,
1713 				frags);
1714 	}
1715 	mutex_exit(&mtm->mtm_mutex);
1716 
1717 	mutex_exit(&ul->un_log_mutex);
1718 }
1719 
1720 /*
1721  * cancel entries in a logmap (entries are freed at EOT)
1722  */
1723 void
1724 logmap_cancel(ml_unit_t *ul, offset_t mof, off_t nb, int metadata)
1725 {
1726 	int32_t		hnb;
1727 	mapentry_t	*me;
1728 	mapentry_t	**mep;
1729 	mt_map_t	*mtm	= ul->un_logmap;
1730 	crb_t		*crb;
1731 
1732 	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
1733 		map_check_linkage(mtm));
1734 
1735 	for (hnb = 0; nb; nb -= hnb, mof += hnb) {
1736 		hnb = MAPBLOCKSIZE - (mof & MAPBLOCKOFF);
1737 		if (hnb > nb)
1738 			hnb = nb;
1739 		/*
1740 		 * find overlapping entries
1741 		 */
1742 		mep = MAP_HASH(mof, mtm);
1743 		mutex_enter(&mtm->mtm_mutex);
1744 		for (me = *mep; me; me = me->me_hash) {
1745 			if (!DATAoverlapME(mof, hnb, me))
1746 				continue;
1747 
1748 			ASSERT(MEwithinDATA(me, mof, hnb));
1749 
1750 			if ((me->me_flags & ME_CANCEL) == 0) {
1751 				me->me_cancel = mtm->mtm_cancel;
1752 				mtm->mtm_cancel = me;
1753 				me->me_flags |= ME_CANCEL;
1754 				crb = me->me_crb;
1755 				if (crb) {
1756 					crb->c_invalid = 1;
1757 				}
1758 			}
1759 		}
1760 		mutex_exit(&mtm->mtm_mutex);
1761 
1762 		/*
1763 		 * put a cancel record into the log
1764 		 */
1765 		logmap_cancel_delta(ul, mof, hnb, metadata);
1766 	}
1767 
1768 	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
1769 		map_check_linkage(mtm));
1770 }
1771 
1772 /*
1773  * check for overlap w/cancel delta
1774  */
1775 int
1776 logmap_iscancel(mt_map_t *mtm, offset_t mof, off_t nb)
1777 {
1778 	off_t		hnb;
1779 	mapentry_t	*me;
1780 	mapentry_t	**mep;
1781 
1782 	mutex_enter(&mtm->mtm_mutex);
1783 	for (hnb = 0; nb; nb -= hnb, mof += hnb) {
1784 		hnb = MAPBLOCKSIZE - (mof & MAPBLOCKOFF);
1785 		if (hnb > nb)
1786 			hnb = nb;
1787 		/*
1788 		 * search for dup entry
1789 		 */
1790 		mep = MAP_HASH(mof, mtm);
1791 		for (me = *mep; me; me = me->me_hash) {
1792 			if (((me->me_flags & ME_ROLL) == 0) &&
1793 			    (me->me_dt != DT_CANCEL))
1794 				continue;
1795 			if (DATAoverlapME(mof, hnb, me))
1796 				break;
1797 		}
1798 
1799 		/*
1800 		 * overlap detected
1801 		 */
1802 		if (me) {
1803 			mutex_exit(&mtm->mtm_mutex);
1804 			return (1);
1805 		}
1806 	}
1807 	mutex_exit(&mtm->mtm_mutex);
1808 	return (0);
1809 }
1810 
1811 static int
1812 logmap_logscan_add(ml_unit_t *ul, struct delta *dp, off_t lof, size_t *nbp)
1813 {
1814 	mapentry_t	*me;
1815 	int		error;
1816 	mt_map_t	*mtm	= ul->un_logmap;
1817 
1818 	/*
1819 	 * verify delta header; failure == mediafail
1820 	 */
1821 	error = 0;
1822 	/* delta type */
1823 	if ((dp->d_typ <= DT_NONE) || (dp->d_typ >= DT_MAX))
1824 		error = EINVAL;
1825 	if (dp->d_typ == DT_COMMIT) {
1826 		if (dp->d_nb != INT32_C(0) && dp->d_nb != INT32_C(-1))
1827 			error = EINVAL;
1828 	} else {
1829 		/* length of delta */
1830 		if ((dp->d_nb < INT32_C(0)) ||
1831 		    (dp->d_nb > INT32_C(MAPBLOCKSIZE)))
1832 			error = EINVAL;
1833 
1834 		/* offset on master device */
1835 		if (dp->d_mof < INT64_C(0))
1836 			error = EINVAL;
1837 	}
1838 
1839 	if (error) {
1840 		ldl_seterror(ul, "Error processing ufs log data during scan");
1841 		return (error);
1842 	}
1843 
1844 	/*
1845 	 * process commit record
1846 	 */
1847 	if (dp->d_typ == DT_COMMIT) {
1848 		if (mtm->mtm_dirty) {
1849 			ASSERT(dp->d_nb == INT32_C(0));
1850 			logmap_free_cancel(mtm, &mtm->mtm_cancel);
1851 			mtm->mtm_dirty = 0;
1852 			mtm->mtm_nmet = 0;
1853 			mtm->mtm_tid++;
1854 			mtm->mtm_committid = mtm->mtm_tid;
1855 			ASSERT(((mtm->mtm_debug & MT_SCAN) == 0) ||
1856 				logmap_logscan_commit_debug(lof, mtm));
1857 		}
1858 		/*
1859 		 * return #bytes to next sector (next delta header)
1860 		 */
1861 		*nbp = ldl_logscan_nbcommit(lof);
1862 		mtm->mtm_tail_lof = lof;
1863 		mtm->mtm_tail_nb = *nbp;
1864 		return (0);
1865 	}
1866 
1867 	/*
1868 	 * add delta to logmap
1869 	 */
1870 	me = kmem_cache_alloc(mapentry_cache, KM_SLEEP);
1871 	bzero(me, sizeof (mapentry_t));
1872 	me->me_lof = lof;
1873 	me->me_mof = dp->d_mof;
1874 	me->me_nb = dp->d_nb;
1875 	me->me_tid = mtm->mtm_tid;
1876 	me->me_dt = dp->d_typ;
1877 	me->me_hash = NULL;
1878 	me->me_flags = (ME_LIST | ME_SCAN);
1879 	logmap_add(ul, NULL, 0, me);
1880 	switch (dp->d_typ) {
1881 	case DT_CANCEL:
1882 		me->me_flags |= ME_CANCEL;
1883 		me->me_cancel = mtm->mtm_cancel;
1884 		mtm->mtm_cancel = me;
1885 		break;
1886 	default:
1887 		ASSERT(((mtm->mtm_debug & MT_SCAN) == 0) ||
1888 			logmap_logscan_add_debug(dp, mtm));
1889 		break;
1890 	}
1891 
1892 sizeofdelta:
1893 	/*
1894 	 * return #bytes till next delta header
1895 	 */
1896 	if ((dp->d_typ == DT_CANCEL) || (dp->d_typ == DT_ABZERO))
1897 		*nbp = 0;
1898 	else
1899 		*nbp = dp->d_nb;
1900 	return (0);
1901 }
1902 
1903 void
1904 logmap_logscan(ml_unit_t *ul)
1905 {
1906 	size_t		nb, nbd;
1907 	off_t		lof;
1908 	struct delta	delta;
1909 	mt_map_t	*logmap	= ul->un_logmap;
1910 
1911 	ASSERT(ul->un_deltamap->mtm_next == (mapentry_t *)ul->un_deltamap);
1912 
1913 	/*
1914 	 * prepare the log for a logscan
1915 	 */
1916 	ldl_logscan_begin(ul);
1917 
1918 	/*
1919 	 * prepare the logmap for a logscan
1920 	 */
1921 	(void) map_free_entries(logmap);
1922 	logmap->mtm_tid = 0;
1923 	logmap->mtm_committid = UINT32_C(0);
1924 	logmap->mtm_age = 0;
1925 	logmap->mtm_dirty = 0;
1926 	logmap->mtm_ref = 0;
1927 
1928 	/*
1929 	 * while not at end of log
1930 	 *	read delta header
1931 	 *	add to logmap
1932 	 *	seek to beginning of next delta
1933 	 */
1934 	lof = ul->un_head_lof;
1935 	nbd = sizeof (delta);
1936 	while (lof != ul->un_tail_lof) {
1937 
1938 		/* read delta header */
1939 		if (ldl_logscan_read(ul, &lof, nbd, (caddr_t)&delta))
1940 			break;
1941 
1942 		/* add to logmap */
1943 		if (logmap_logscan_add(ul, &delta, lof, &nb))
1944 			break;
1945 
1946 		/* seek to next header (skip data) */
1947 		if (ldl_logscan_read(ul, &lof, nb, NULL))
1948 			break;
1949 	}
1950 
1951 	/*
1952 	 * remove the last partial transaction from the logmap
1953 	 */
1954 	logmap_abort(ul, logmap->mtm_tid);
1955 
1956 	ldl_logscan_end(ul);
1957 }
1958 
1959 void
1960 _init_map(void)
1961 {
1962 	/*
1963 	 * Initialise the mapentry cache. No constructor or deconstructor
1964 	 * is needed. Also no reclaim function is supplied as reclaiming
1965 	 * current entries is not possible.
1966 	 */
1967 	mapentry_cache = kmem_cache_create("lufs_mapentry_cache",
1968 	    sizeof (mapentry_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
1969 }
1970 
1971 /*
1972  * Special case when we replace an old map entry which carries quota
1973  * information with a newer entry which does not.
1974  * In that case the push function would not be called to clean up the
1975  * dquot structure. This would be found later by invalidatedq() causing
1976  * a panic when the filesystem in unmounted.
1977  * We clean up the dquot manually before replacing the map entry.
1978  */
1979 void
1980 handle_dquot(mapentry_t *me)
1981 {
1982 	int dolock = 0;
1983 	int domutex = 0;
1984 	struct dquot *dqp;
1985 
1986 	dqp = (struct dquot *)me->me_arg;
1987 
1988 	/*
1989 	 * We need vfs_dqrwlock to call dqput()
1990 	 */
1991 	dolock = (!RW_LOCK_HELD(&dqp->dq_ufsvfsp->vfs_dqrwlock));
1992 	if (dolock)
1993 		rw_enter(&dqp->dq_ufsvfsp->vfs_dqrwlock, RW_READER);
1994 
1995 	domutex = (!MUTEX_HELD(&dqp->dq_lock));
1996 	if (domutex)
1997 		mutex_enter(&dqp->dq_lock);
1998 
1999 	/*
2000 	 * Only clean up if the dquot is referenced
2001 	 */
2002 	if (dqp->dq_cnt == 0) {
2003 		if (domutex)
2004 			mutex_exit(&dqp->dq_lock);
2005 		if (dolock)
2006 			rw_exit(&dqp->dq_ufsvfsp->vfs_dqrwlock);
2007 		return;
2008 	}
2009 
2010 	dqp->dq_flags &= ~(DQ_MOD|DQ_TRANS);
2011 	dqput(dqp);
2012 
2013 	if (domutex)
2014 		mutex_exit(&dqp->dq_lock);
2015 
2016 	if (dolock)
2017 		rw_exit(&dqp->dq_ufsvfsp->vfs_dqrwlock);
2018 
2019 }
2020