xref: /illumos-gate/usr/src/uts/common/fs/ufs/lufs_map.c (revision 2aeafac3612e19716bf8164f89c3c9196342979c)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  * Copyright 2012 Milan Jurik. All rights reserved.
25  */
26 
27 #include <sys/systm.h>
28 #include <sys/types.h>
29 #include <sys/vnode.h>
30 #include <sys/errno.h>
31 #include <sys/sysmacros.h>
32 #include <sys/debug.h>
33 #include <sys/kmem.h>
34 #include <sys/conf.h>
35 #include <sys/proc.h>
36 #include <sys/cmn_err.h>
37 #include <sys/fs/ufs_inode.h>
38 #include <sys/fs/ufs_filio.h>
39 #include <sys/fs/ufs_log.h>
40 #include <sys/inttypes.h>
41 #include <sys/atomic.h>
42 #include <sys/tuneable.h>
43 
44 /*
45  * externs
46  */
47 extern pri_t minclsyspri;
48 extern struct kmem_cache *lufs_bp;
49 extern int ufs_trans_push_quota(ufsvfs_t *, delta_t, struct dquot *);
50 
/*
 * globals
 */
/* kmem cache that mapentries in this file are allocated from and freed to */
kmem_cache_t *mapentry_cache;

/*
 * logmap tuning constants
 *
 * logmap_maxnme_commit:	logmap_need_commit() reports true once
 *				mtm_nmet exceeds this value.
 * logmap_maxnme_async:		logmap_need_roll_async() reports true once
 *				mtm_nme exceeds this value.
 * logmap_maxnme_sync:		logmap_need_roll_sync() reports true once
 *				mtm_nme exceeds this value.
 * logmap_maxcfrag_commit:	copied into mtm_cfragmax by map_get();
 *				logmap_need_commit() reports true once
 *				mtm_cfrags reaches mtm_cfragmax.
 */
long	logmap_maxnme_commit	= 2048;
long	logmap_maxnme_async	= 4096;
long	logmap_maxnme_sync	= 6144;
long	logmap_maxcfrag_commit	= 4;	/* Max canceled fragments per moby */


/*
 * Cached roll buffer (crb) accounting.  ufs_crb_size is decremented by
 * CRB_FREE() whenever a crb is released.
 */
uint64_t ufs_crb_size = 0;		/* current size of all crb buffers */
uint64_t ufs_crb_max_size = 0;		/* highest crb buffer use so far */
size_t ufs_crb_limit;			/* max allowable size for crbs */
uint64_t ufs_crb_alloc_fails = 0;	/* crb allocation failures stat */
#define	UFS_MAX_CRB_DEFAULT_DIVISOR 10	/* max 1/10 kmem_maxavail() */
int ufs_max_crb_divisor = UFS_MAX_CRB_DEFAULT_DIVISOR; /* tunable */
/* prototype for the helper invoked by the HANDLE_DQUOT() macro */
void handle_dquot(mapentry_t *);
72 
73 /*
74  * GENERIC MAP ROUTINES
75  */
76 
/*
 * CRB_FREE: free a cached roll buffer (crb) and its data buffer, subtract
 * the buffer size from the global ufs_crb_size counter and clear the
 * mapentry's (me) pointer to it.
 *
 * The expansion is a single brace-enclosed compound statement (the same
 * convention CRB_RELE and HANDLE_DQUOT use) so that all four steps stay
 * together even when the macro is the sole body of an unbraced "if".
 * It may be invoked with or without a trailing semicolon.
 * Both arguments are evaluated more than once and must be free of
 * side effects.
 */
#define	CRB_FREE(crb, me) { \
	kmem_free((crb)->c_buf, (crb)->c_nb); \
	atomic_add_64(&ufs_crb_size, -(uint64_t)(crb)->c_nb); \
	kmem_free((crb), sizeof (crb_t)); \
	(me)->me_crb = NULL; \
}

/*
 * CRB_RELE: drop the mapentry's reference on its cached roll buffer, if
 * it has one, and free the buffer via CRB_FREE when the reference count
 * reaches zero.  A mapentry without a crb is left untouched.
 */
#define	CRB_RELE(me) { \
	crb_t *crb = (me)->me_crb; \
	if (crb && (--crb->c_refcnt == 0)) { \
		CRB_FREE(crb, me); \
	} \
}
89 
/*
 * Quota cleanup before an old delta (me) is replaced by a new one (melist).
 *
 * Nothing is done unless the old delta carries an argument and its push
 * function is ufs_trans_push_quota().  When it does, handle_dquot() is
 * called on the old delta if it differs from the new one in delta type,
 * argument or push function.
 */
#define	HANDLE_DQUOT(me, melist) { \
	if (((me)->me_arg) && \
	    ((me)->me_func == ufs_trans_push_quota) && \
	    (((me)->me_dt != (melist)->me_dt) || \
	    ((me)->me_arg != (melist)->me_arg) || \
	    ((me)->me_func != (melist)->me_func))) { \
		handle_dquot(me); \
	} \
}
105 
106 /*
107  * free up all the mapentries for a map
108  */
109 void
110 map_free_entries(mt_map_t *mtm)
111 {
112 	int		i;
113 	mapentry_t	*me;
114 
115 	while ((me = mtm->mtm_next) != (mapentry_t *)mtm) {
116 		me->me_next->me_prev = me->me_prev;
117 		me->me_prev->me_next = me->me_next;
118 		CRB_RELE(me);
119 		kmem_cache_free(mapentry_cache, me);
120 	}
121 	for (i = 0; i < mtm->mtm_nhash; i++)
122 		mtm->mtm_hash[i] = NULL;
123 	mtm->mtm_nme = 0;
124 	mtm->mtm_nmet = 0;
125 }
126 
127 /*
128  * done with map; free if necessary
129  */
130 mt_map_t *
131 map_put(mt_map_t *mtm)
132 {
133 	/*
134 	 * free up the map's memory
135 	 */
136 	map_free_entries(mtm);
137 	ASSERT(map_put_debug(mtm));
138 	kmem_free(mtm->mtm_hash,
139 	    (size_t) (sizeof (mapentry_t *) * mtm->mtm_nhash));
140 	mutex_destroy(&mtm->mtm_mutex);
141 	mutex_destroy(&mtm->mtm_scan_mutex);
142 	cv_destroy(&mtm->mtm_to_roll_cv);
143 	cv_destroy(&mtm->mtm_from_roll_cv);
144 	rw_destroy(&mtm->mtm_rwlock);
145 	mutex_destroy(&mtm->mtm_lock);
146 	cv_destroy(&mtm->mtm_cv_commit);
147 	cv_destroy(&mtm->mtm_cv_next);
148 	cv_destroy(&mtm->mtm_cv_eot);
149 	cv_destroy(&mtm->mtm_cv);
150 	kmem_free(mtm, sizeof (mt_map_t));
151 	return (NULL);
152 }
153 /*
154  * Allocate a map;
155  */
156 mt_map_t *
157 map_get(ml_unit_t *ul, enum maptypes maptype, int nh)
158 {
159 	mt_map_t	*mtm;
160 
161 	/*
162 	 * assume the map is not here and allocate the necessary structs
163 	 */
164 	mtm = kmem_zalloc(sizeof (mt_map_t), KM_SLEEP);
165 	mutex_init(&mtm->mtm_mutex, NULL, MUTEX_DEFAULT, NULL);
166 	mutex_init(&mtm->mtm_scan_mutex, NULL, MUTEX_DEFAULT, NULL);
167 	cv_init(&mtm->mtm_to_roll_cv, NULL, CV_DEFAULT, NULL);
168 	cv_init(&mtm->mtm_from_roll_cv, NULL, CV_DEFAULT, NULL);
169 	rw_init(&mtm->mtm_rwlock, NULL, RW_DEFAULT, NULL);
170 	mtm->mtm_next = (mapentry_t *)mtm;
171 	mtm->mtm_prev = (mapentry_t *)mtm;
172 	mtm->mtm_hash = kmem_zalloc((size_t) (sizeof (mapentry_t *) * nh),
173 	    KM_SLEEP);
174 	mtm->mtm_nhash = nh;
175 	mtm->mtm_debug = ul->un_debug;
176 	mtm->mtm_type = maptype;
177 
178 	mtm->mtm_cfrags = 0;
179 	mtm->mtm_cfragmax = logmap_maxcfrag_commit;
180 
181 	/*
182 	 * for scan test
183 	 */
184 	mtm->mtm_ul = ul;
185 
186 	/*
187 	 * Initialize locks
188 	 */
189 	mutex_init(&mtm->mtm_lock, NULL, MUTEX_DEFAULT, NULL);
190 	cv_init(&mtm->mtm_cv_commit, NULL, CV_DEFAULT, NULL);
191 	cv_init(&mtm->mtm_cv_next, NULL, CV_DEFAULT, NULL);
192 	cv_init(&mtm->mtm_cv_eot, NULL, CV_DEFAULT, NULL);
193 	cv_init(&mtm->mtm_cv, NULL, CV_DEFAULT, NULL);
194 	ASSERT(map_get_debug(ul, mtm));
195 
196 	return (mtm);
197 }
198 
199 /*
200  * DELTAMAP ROUTINES
201  */
202 /*
203  * deltamap tuning constants
204  */
205 long	deltamap_maxnme	= 1024;	/* global so it can be set */
206 
207 int
208 deltamap_need_commit(mt_map_t *mtm)
209 {
210 	return (mtm->mtm_nme > deltamap_maxnme);
211 }
212 
213 /*
214  * put a delta into a deltamap; may sleep on memory
215  */
216 void
217 deltamap_add(
218 	mt_map_t *mtm,
219 	offset_t mof,
220 	off_t nb,
221 	delta_t dtyp,
222 	int (*func)(),
223 	ulong_t arg,
224 	threadtrans_t *tp)
225 {
226 	int32_t		hnb;
227 	mapentry_t	*me;
228 	mapentry_t	**mep;
229 
230 	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
231 	    map_check_linkage(mtm));
232 
233 	mutex_enter(&mtm->mtm_mutex);
234 
235 	for (hnb = 0; nb; nb -= hnb, mof += hnb) {
236 		hnb = MAPBLOCKSIZE - (mof & MAPBLOCKOFF);
237 		if (hnb > nb)
238 			hnb = nb;
239 		/*
240 		 * Search for dup entry. We need to ensure that we don't
241 		 * replace a map entry which carries quota information
242 		 * with a map entry which doesn't. In that case we lose
243 		 * reference the the dquot structure which will not be
244 		 * cleaned up by the push function me->me_func as this will
245 		 * never be called.
246 		 * The stray dquot would be found later by invalidatedq()
247 		 * causing a panic when the filesystem is unmounted.
248 		 */
249 		mep = MAP_HASH(mof, mtm);
250 		for (me = *mep; me; me = me->me_hash) {
251 			if (DATAwithinME(mof, hnb, me)) {
252 				/*
253 				 * Don't remove quota entries which have
254 				 * incremented the ref count (those with a
255 				 * ufs_trans_push_quota push function).
256 				 * Let logmap_add[_buf] clean them up.
257 				 */
258 				if (me->me_func == ufs_trans_push_quota) {
259 					continue;
260 				}
261 				break;
262 			}
263 			ASSERT((dtyp == DT_CANCEL) ||
264 			    (!DATAoverlapME(mof, hnb, me)) ||
265 			    MEwithinDATA(me, mof, hnb));
266 		}
267 
268 		if (me) {
269 			/* already in map */
270 			continue;
271 		}
272 
273 		/*
274 		 * Add up all the delta map deltas so we can compute
275 		 * an upper bound on the log size used.
276 		 * Note, some deltas get removed from the deltamap
277 		 * before the deltamap_push by lufs_write_strategy
278 		 * and so multiple deltas to the same mof offset
279 		 * don't get cancelled here but in the logmap.
280 		 * Thus we can't easily get a accurate count of
281 		 * the log space used - only an upper bound.
282 		 */
283 		if (tp && (mtm->mtm_ul->un_deltamap == mtm)) {
284 			ASSERT(dtyp != DT_CANCEL);
285 			if (dtyp == DT_ABZERO) {
286 				tp->deltas_size += sizeof (struct delta);
287 			} else {
288 				tp->deltas_size +=
289 				    (hnb + sizeof (struct delta));
290 			}
291 		}
292 
293 		delta_stats[dtyp]++;
294 
295 		/*
296 		 * get a mapentry
297 		 * May need to drop & re-grab the mtm_mutex
298 		 * and then recheck for a duplicate
299 		 */
300 		me = kmem_cache_alloc(mapentry_cache, KM_NOSLEEP);
301 		if (me == NULL) {
302 			mutex_exit(&mtm->mtm_mutex);
303 			me = kmem_cache_alloc(mapentry_cache, KM_SLEEP);
304 			mutex_enter(&mtm->mtm_mutex);
305 		}
306 		bzero(me, sizeof (mapentry_t));
307 
308 		/*
309 		 * initialize and put in deltamap
310 		 */
311 		me->me_mof = mof;
312 		me->me_nb = hnb;
313 		me->me_func = func;
314 		me->me_arg = arg;
315 		me->me_dt = dtyp;
316 		me->me_flags = ME_HASH;
317 		me->me_tid = mtm->mtm_tid;
318 
319 		me->me_hash = *mep;
320 		*mep = me;
321 		me->me_next = (mapentry_t *)mtm;
322 		me->me_prev = mtm->mtm_prev;
323 		mtm->mtm_prev->me_next = me;
324 		mtm->mtm_prev = me;
325 		mtm->mtm_nme++;
326 	}
327 	mutex_exit(&mtm->mtm_mutex);
328 
329 	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
330 	    map_check_linkage(mtm));
331 }
332 
333 /*
334  * remove deltas within (mof, nb) and return as linked list
335  */
336 mapentry_t *
337 deltamap_remove(mt_map_t *mtm, offset_t mof, off_t nb)
338 {
339 	off_t		hnb;
340 	mapentry_t	*me;
341 	mapentry_t	**mep;
342 	mapentry_t	*mer;
343 
344 	if (mtm == NULL)
345 		return (NULL);
346 
347 	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
348 	    map_check_linkage(mtm));
349 
350 	mutex_enter(&mtm->mtm_mutex);
351 	for (mer = NULL, hnb = 0; nb; nb -= hnb, mof += hnb) {
352 		hnb = MAPBLOCKSIZE - (mof & MAPBLOCKOFF);
353 		if (hnb > nb)
354 			hnb = nb;
355 		/*
356 		 * remove entries from hash and return as a aged linked list
357 		 */
358 		mep = MAP_HASH(mof, mtm);
359 		while ((me = *mep) != 0) {
360 			if (MEwithinDATA(me, mof, hnb)) {
361 				*mep = me->me_hash;
362 				me->me_next->me_prev = me->me_prev;
363 				me->me_prev->me_next = me->me_next;
364 				me->me_hash = mer;
365 				mer = me;
366 				me->me_flags |= ME_LIST;
367 				me->me_flags &= ~ME_HASH;
368 				mtm->mtm_nme--;
369 			} else
370 				mep = &me->me_hash;
371 		}
372 	}
373 	mutex_exit(&mtm->mtm_mutex);
374 
375 	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
376 	    map_check_linkage(mtm));
377 
378 	return (mer);
379 }
380 
381 /*
382  * delete entries within (mof, nb)
383  */
384 void
385 deltamap_del(mt_map_t *mtm, offset_t mof, off_t nb)
386 {
387 	mapentry_t	*me;
388 	mapentry_t	*menext;
389 
390 	menext = deltamap_remove(mtm, mof, nb);
391 	while ((me = menext) != 0) {
392 		menext = me->me_hash;
393 		kmem_cache_free(mapentry_cache, me);
394 	}
395 }
396 
397 /*
398  * Call the indicated function to cause deltas to move to the logmap.
399  * top_end_sync() is the only caller of this function and
400  * it has waited for the completion of all threads, so there can
401  * be no other activity in the deltamap. Therefore we don't need to
402  * hold the deltamap lock.
403  */
404 void
405 deltamap_push(ml_unit_t *ul)
406 {
407 	delta_t		dtyp;
408 	int		(*func)();
409 	ulong_t		arg;
410 	mapentry_t	*me;
411 	offset_t	mof;
412 	off_t		nb;
413 	mt_map_t	*mtm	= ul->un_deltamap;
414 
415 	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
416 	    map_check_linkage(mtm));
417 
418 	/*
419 	 * for every entry in the deltamap
420 	 */
421 	while ((me = mtm->mtm_next) != (mapentry_t *)mtm) {
422 		ASSERT(me->me_func);
423 		func = me->me_func;
424 		dtyp = me->me_dt;
425 		arg = me->me_arg;
426 		mof = me->me_mof;
427 		nb = me->me_nb;
428 		if ((ul->un_flags & LDL_ERROR) ||
429 		    (*func)(ul->un_ufsvfs, dtyp, arg))
430 			deltamap_del(mtm, mof, nb);
431 	}
432 
433 	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
434 	    map_check_linkage(mtm));
435 }
436 
437 /*
438  * LOGMAP ROUTINES
439  */
440 
441 int
442 logmap_need_commit(mt_map_t *mtm)
443 {
444 	return ((mtm->mtm_nmet > logmap_maxnme_commit) ||
445 	    (mtm->mtm_cfrags >= mtm->mtm_cfragmax));
446 }
447 
448 int
449 logmap_need_roll_async(mt_map_t *mtm)
450 {
451 	return (mtm->mtm_nme > logmap_maxnme_async);
452 }
453 
454 int
455 logmap_need_roll_sync(mt_map_t *mtm)
456 {
457 	return (mtm->mtm_nme > logmap_maxnme_sync);
458 }
459 
460 void
461 logmap_start_roll(ml_unit_t *ul)
462 {
463 	mt_map_t	*logmap	= ul->un_logmap;
464 
465 	logmap_settail(logmap, ul);
466 	ASSERT(!(ul->un_flags & LDL_NOROLL));
467 	mutex_enter(&logmap->mtm_mutex);
468 	if ((logmap->mtm_flags & MTM_ROLL_RUNNING) == 0) {
469 		logmap->mtm_flags |= MTM_ROLL_RUNNING;
470 		logmap->mtm_flags &= ~(MTM_FORCE_ROLL | MTM_ROLL_EXIT);
471 		(void) thread_create(NULL, 0, trans_roll, ul, 0, &p0,
472 		    TS_RUN, minclsyspri);
473 	}
474 	mutex_exit(&logmap->mtm_mutex);
475 }
476 
477 void
478 logmap_kill_roll(ml_unit_t *ul)
479 {
480 	mt_map_t	*mtm	= ul->un_logmap;
481 
482 	if (mtm == NULL)
483 		return;
484 
485 	mutex_enter(&mtm->mtm_mutex);
486 
487 	while (mtm->mtm_flags & MTM_ROLL_RUNNING) {
488 		mtm->mtm_flags |= MTM_ROLL_EXIT;
489 		cv_signal(&mtm->mtm_to_roll_cv);
490 		cv_wait(&mtm->mtm_from_roll_cv, &mtm->mtm_mutex);
491 	}
492 	mutex_exit(&mtm->mtm_mutex);
493 }
494 
495 /*
496  * kick the roll thread if it's not doing anything
497  */
498 void
499 logmap_forceroll_nowait(mt_map_t *logmap)
500 {
501 	/*
502 	 * Don't need to lock mtm_mutex to read mtm_flags here as we
503 	 * don't care in the rare case when we get a transitional value
504 	 * of mtm_flags. Just by signalling the thread it will wakeup
505 	 * and notice it has too many logmap entries.
506 	 */
507 	ASSERT(!(logmap->mtm_ul->un_flags & LDL_NOROLL));
508 	if ((logmap->mtm_flags & MTM_ROLLING) == 0) {
509 		cv_signal(&logmap->mtm_to_roll_cv);
510 	}
511 }
512 
513 /*
514  * kick the roll thread and wait for it to finish a cycle
515  */
516 void
517 logmap_forceroll(mt_map_t *mtm)
518 {
519 	mutex_enter(&mtm->mtm_mutex);
520 	if ((mtm->mtm_flags & MTM_FORCE_ROLL) == 0) {
521 		mtm->mtm_flags |= MTM_FORCE_ROLL;
522 		cv_signal(&mtm->mtm_to_roll_cv);
523 	}
524 	do {
525 		if ((mtm->mtm_flags & MTM_ROLL_RUNNING) == 0) {
526 			mtm->mtm_flags &= ~MTM_FORCE_ROLL;
527 			goto out;
528 		}
529 		cv_wait(&mtm->mtm_from_roll_cv, &mtm->mtm_mutex);
530 	} while (mtm->mtm_flags & MTM_FORCE_ROLL);
531 out:
532 	mutex_exit(&mtm->mtm_mutex);
533 }
534 
535 /*
536  * remove rolled deltas within (mof, nb) and free them
537  */
538 void
539 logmap_remove_roll(mt_map_t *mtm, offset_t mof, off_t nb)
540 {
541 	int		dolock = 0;
542 	off_t		hnb;
543 	mapentry_t	*me;
544 	mapentry_t	**mep;
545 	offset_t	savmof	= mof;
546 	off_t		savnb	= nb;
547 
548 	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
549 	    map_check_linkage(mtm));
550 
551 again:
552 	if (dolock)
553 		rw_enter(&mtm->mtm_rwlock, RW_WRITER);
554 	mutex_enter(&mtm->mtm_mutex);
555 	for (hnb = 0; nb; nb -= hnb, mof += hnb) {
556 		hnb = MAPBLOCKSIZE - (mof & MAPBLOCKOFF);
557 		if (hnb > nb)
558 			hnb = nb;
559 		/*
560 		 * remove and free the rolled entries
561 		 */
562 		mep = MAP_HASH(mof, mtm);
563 		while ((me = *mep) != 0) {
564 			if ((me->me_flags & ME_ROLL) &&
565 			    (MEwithinDATA(me, mof, hnb))) {
566 				if (me->me_flags & ME_AGE) {
567 					ASSERT(dolock == 0);
568 					dolock = 1;
569 					mutex_exit(&mtm->mtm_mutex);
570 					mof = savmof;
571 					nb = savnb;
572 					goto again;
573 				}
574 				*mep = me->me_hash;
575 				me->me_next->me_prev = me->me_prev;
576 				me->me_prev->me_next = me->me_next;
577 				me->me_flags &= ~(ME_HASH|ME_ROLL);
578 				ASSERT(!(me->me_flags & ME_USER));
579 				mtm->mtm_nme--;
580 				/*
581 				 * cancelled entries are handled by someone else
582 				 */
583 				if ((me->me_flags & ME_CANCEL) == 0) {
584 					roll_stats[me->me_dt]++;
585 					CRB_RELE(me);
586 					kmem_cache_free(mapentry_cache, me);
587 				}
588 			} else
589 				mep = &me->me_hash;
590 		}
591 	}
592 	mutex_exit(&mtm->mtm_mutex);
593 
594 	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
595 	    map_check_linkage(mtm));
596 
597 	if (dolock)
598 		rw_exit(&mtm->mtm_rwlock);
599 }
600 
601 /*
602  * Find the disk offset of the next delta to roll.
603  * Returns 0: no more deltas to roll or a transaction is being committed
604  *	   1: a delta to roll has been found and *mofp points
605  *	      to the master file disk offset
606  */
607 int
608 logmap_next_roll(mt_map_t *logmap, offset_t *mofp)
609 {
610 	mapentry_t *me;
611 
612 	ASSERT(((logmap->mtm_debug & MT_CHECK_MAP) == 0) ||
613 	    map_check_linkage(logmap));
614 
615 	mutex_enter(&logmap->mtm_mutex);
616 	for (me = logmap->mtm_next; me != (mapentry_t *)logmap;
617 	    me = me->me_next) {
618 		/* already rolled */
619 		if (me->me_flags & ME_ROLL) {
620 			continue;
621 		}
622 
623 		/* part of currently busy transaction; stop */
624 		if (me->me_tid == logmap->mtm_tid) {
625 			break;
626 		}
627 
628 		/* part of commit-in-progress transaction; stop */
629 		if (me->me_tid == logmap->mtm_committid) {
630 			break;
631 		}
632 
633 		/*
634 		 * We shouldn't see a DT_CANCEL mapentry whose
635 		 * tid != mtm_committid, or != mtm_tid since
636 		 * these are removed at the end of each committed
637 		 * transaction.
638 		 */
639 		ASSERT(!(me->me_dt == DT_CANCEL));
640 
641 		*mofp = me->me_mof;
642 		mutex_exit(&logmap->mtm_mutex);
643 		return (1);
644 	}
645 	mutex_exit(&logmap->mtm_mutex);
646 	return (0);
647 }
648 
649 /*
650  * put mapentry on sorted age list
651  */
652 static void
653 logmap_list_age(mapentry_t **age, mapentry_t *meadd)
654 {
655 	mapentry_t	*me;
656 
657 	ASSERT(!(meadd->me_flags & (ME_AGE|ME_LIST)));
658 
659 	for (me = *age; me; age = &me->me_agenext, me = *age) {
660 		if (me->me_age > meadd->me_age)
661 			break;
662 	}
663 	meadd->me_agenext = me;
664 	meadd->me_flags |= ME_AGE;
665 	*age = meadd;
666 }
667 
668 /*
669  * get a list of deltas within <mof, mof+nb>
670  *	returns with mtm_rwlock held
671  *	return value says whether the entire mof range is covered by deltas
672  */
673 int
674 logmap_list_get(
675 	mt_map_t *mtm,
676 	offset_t mof,
677 	off_t nb,
678 	mapentry_t **age)
679 {
680 	off_t		hnb;
681 	mapentry_t	*me;
682 	mapentry_t	**mep;
683 	int		rwtype	= RW_READER;
684 	offset_t	savmof	= mof;
685 	off_t		savnb	= nb;
686 	int		entire	= 0;
687 	crb_t		*crb;
688 
689 	mtm->mtm_ref = 1;
690 again:
691 
692 	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
693 	    map_check_linkage(mtm));
694 
695 	rw_enter(&mtm->mtm_rwlock, rwtype);
696 	*age = NULL;
697 	mutex_enter(&mtm->mtm_mutex);
698 	for (hnb = 0; nb; nb -= hnb, mof += hnb) {
699 		hnb = MAPBLOCKSIZE - (mof & MAPBLOCKOFF);
700 		if (hnb > nb)
701 			hnb = nb;
702 		/*
703 		 * find overlapping entries
704 		 */
705 		mep = MAP_HASH(mof, mtm);
706 		for (me = *mep; me; me = me->me_hash) {
707 			if (me->me_dt == DT_CANCEL)
708 				continue;
709 			if (!DATAoverlapME(mof, hnb, me))
710 				continue;
711 			/*
712 			 * check if map entry is in use
713 			 * (about to be rolled).
714 			 */
715 			if (me->me_flags & ME_AGE) {
716 				/*
717 				 * reset the age bit in the list,
718 				 * upgrade the lock, and try again
719 				 */
720 				for (me = *age; me; me = *age) {
721 					*age = me->me_agenext;
722 					me->me_flags &= ~ME_AGE;
723 				}
724 				mutex_exit(&mtm->mtm_mutex);
725 				rw_exit(&mtm->mtm_rwlock);
726 				rwtype = RW_WRITER;
727 				mof = savmof;
728 				nb = savnb;
729 				entire = 0;
730 				goto again;
731 			} else {
732 				/* add mapentry to age ordered list */
733 				logmap_list_age(age, me);
734 				crb = me->me_crb;
735 				if (crb) {
736 					if (DATAwithinCRB(savmof, savnb, crb)) {
737 						entire = 1;
738 					}
739 				} else {
740 					if (DATAwithinME(savmof, savnb, me)) {
741 						entire = 1;
742 					}
743 				}
744 			}
745 		}
746 	}
747 	mutex_exit(&mtm->mtm_mutex);
748 
749 	ASSERT(RW_LOCK_HELD(&mtm->mtm_rwlock));
750 	return (entire);
751 }
752 
753 /*
754  * Get a list of deltas for rolling - returns sucess or failure.
755  * Also return the cached roll buffer if all deltas point to it.
756  */
757 int
758 logmap_list_get_roll(mt_map_t *logmap, offset_t mof, rollbuf_t *rbp)
759 {
760 	mapentry_t	*me, **mep, *age = NULL;
761 	crb_t		*crb = NULL;
762 
763 	ASSERT(RW_LOCK_HELD(&logmap->mtm_rwlock));
764 	ASSERT(((logmap->mtm_debug & MT_CHECK_MAP) == 0) ||
765 	    map_check_linkage(logmap));
766 	ASSERT((mof & MAPBLOCKOFF) == 0);
767 
768 	rbp->rb_crb = NULL;
769 
770 	/*
771 	 * find overlapping entries
772 	 */
773 	mutex_enter(&logmap->mtm_mutex);
774 	mep = MAP_HASH(mof, logmap);
775 	for (me = *mep; me; me = me->me_hash) {
776 		if (!DATAoverlapME(mof, MAPBLOCKSIZE, me))
777 			continue;
778 		if (me->me_tid == logmap->mtm_tid)
779 			continue;
780 		if (me->me_tid == logmap->mtm_committid)
781 			continue;
782 		if (me->me_dt == DT_CANCEL)
783 			continue;
784 
785 		/*
786 		 * Check if map entry is in use (by lufs_read_strategy())
787 		 * and if so reset the age bit in the list,
788 		 * upgrade the lock, and try again
789 		 */
790 		if (me->me_flags & ME_AGE) {
791 			for (me = age; me; me = age) {
792 				age = me->me_agenext;
793 				me->me_flags &= ~ME_AGE;
794 			}
795 			mutex_exit(&logmap->mtm_mutex);
796 			return (1); /* failure */
797 		} else {
798 			/* add mapentry to age ordered list */
799 			logmap_list_age(&age, me);
800 		}
801 	}
802 	if (!age) {
803 		goto out;
804 	}
805 
806 	/*
807 	 * Mark the deltas as being rolled.
808 	 */
809 	for (me = age; me; me = me->me_agenext) {
810 		me->me_flags |= ME_ROLL;
811 	}
812 
813 	/*
814 	 * Test if all deltas are covered by one valid roll buffer
815 	 */
816 	crb = age->me_crb;
817 	if (crb && !(crb->c_invalid)) {
818 		for (me = age; me; me = me->me_agenext) {
819 			if (me->me_crb != crb) {
820 				crb = NULL;
821 				break;
822 			}
823 		}
824 		rbp->rb_crb = crb;
825 	}
826 out:
827 	rbp->rb_age = age;
828 
829 	mutex_exit(&logmap->mtm_mutex);
830 
831 	ASSERT(((logmap->mtm_debug & MT_SCAN) == 0) ||
832 	    logmap_logscan_debug(logmap, age));
833 	ASSERT(RW_LOCK_HELD(&logmap->mtm_rwlock));
834 	return (0); /* success */
835 }
836 
837 void
838 logmap_list_put_roll(mt_map_t *mtm, mapentry_t *age)
839 {
840 	mapentry_t	*me;
841 
842 	ASSERT(RW_LOCK_HELD(&mtm->mtm_rwlock));
843 	mutex_enter(&mtm->mtm_mutex);
844 	for (me = age; me; me = age) {
845 		age = me->me_agenext;
846 		me->me_flags &= ~ME_AGE;
847 	}
848 	mutex_exit(&mtm->mtm_mutex);
849 }
850 
851 void
852 logmap_list_put(mt_map_t *mtm, mapentry_t *age)
853 {
854 	mapentry_t	*me;
855 
856 	ASSERT(RW_LOCK_HELD(&mtm->mtm_rwlock));
857 	mutex_enter(&mtm->mtm_mutex);
858 	for (me = age; me; me = age) {
859 		age = me->me_agenext;
860 		me->me_flags &= ~ME_AGE;
861 	}
862 	mutex_exit(&mtm->mtm_mutex);
863 	rw_exit(&mtm->mtm_rwlock);
864 }
865 
866 #define	UFS_RW_BALANCE 2
867 int ufs_rw_balance = UFS_RW_BALANCE;
868 
869 /*
870  * Check if we need to read the master.
871  * The master does not need to be read if the log deltas to the
872  * block are for one contiguous set of full disk sectors.
873  * Both cylinder group bit maps DT_CG (8K); directory entries (512B);
874  * and possibly others should not require master disk reads.
875  * Calculate the sector map for writing later.
876  */
877 int
878 logmap_setup_read(mapentry_t *age, rollbuf_t *rbp)
879 {
880 	offset_t mof;
881 	crb_t *crb;
882 	mapentry_t *me;
883 	int32_t nb;
884 	int i;
885 	int start_sec, end_sec;
886 	int read_needed = 0;
887 	int all_inodes = 1;
888 	int first_sec = INT_MAX;
889 	int last_sec = -1;
890 	rbsecmap_t secmap = 0;
891 
892 	/* LINTED: warning: logical expression always true: op "||" */
893 	ASSERT((MAPBLOCKSIZE / DEV_BSIZE) == (sizeof (secmap) * NBBY));
894 
895 	for (me = age; me; me = me->me_agenext) {
896 		crb = me->me_crb;
897 		if (crb) {
898 			nb = crb->c_nb;
899 			mof = crb->c_mof;
900 		} else {
901 			nb = me->me_nb;
902 			mof = me->me_mof;
903 		}
904 
905 		/*
906 		 * If the delta is not sector aligned then
907 		 * read the whole block.
908 		 */
909 		if ((nb & DEV_BMASK) || (mof & DEV_BMASK)) {
910 			read_needed = 1;
911 		}
912 
913 		/* Set sector map used in the MAPBLOCKSIZE block.  */
914 		start_sec = (mof & MAPBLOCKOFF) >> DEV_BSHIFT;
915 		end_sec = start_sec + ((nb - 1) >> DEV_BSHIFT);
916 		for (i = start_sec; i <= end_sec; i++) {
917 			secmap |= UINT16_C(1) << i;
918 		}
919 
920 		if (me->me_dt != DT_INODE) {
921 			all_inodes = 0;
922 		}
923 		if (start_sec < first_sec) {
924 			first_sec = start_sec;
925 		}
926 		if (end_sec > last_sec) {
927 			last_sec = end_sec;
928 		}
929 	}
930 
931 	ASSERT(secmap);
932 	ASSERT(first_sec != INT_MAX);
933 	ASSERT(last_sec != -1);
934 
935 	if (all_inodes) {
936 		/*
937 		 * Here we have a tradeoff choice. It must be better to
938 		 * do 2 writes * in the same MAPBLOCKSIZE chunk, than a
939 		 * read and a write. But what about 3 or more writes, versus
940 		 * a read+write? * Where is the cut over? It will depend on
941 		 * the track caching, scsi driver and other activity.
942 		 * A unpublished tunable is defined (ufs_rw_balance) that
943 		 * currently defaults to 2.
944 		 */
945 		if (!read_needed) {
946 			int count = 0, gap = 0;
947 			int sector_set; /* write needed to this sector */
948 
949 			/* Count the gaps (every 1 to 0 transation) */
950 			for (i = first_sec + 1; i < last_sec; i++) {
951 				sector_set = secmap & (UINT16_C(1) << i);
952 				if (!gap && !sector_set) {
953 					gap = 1;
954 					count++;
955 					if (count > ufs_rw_balance) {
956 						read_needed = 1;
957 						break;
958 					}
959 				} else if (gap && sector_set) {
960 					gap = 0;
961 				}
962 			}
963 		}
964 
965 		/*
966 		 * Inodes commonly make up the majority (~85%) of deltas.
967 		 * They cannot contain embedded user data, so its safe to
968 		 * read and write them all in one IO.
969 		 * But for directory entries, shadow inode data, and
970 		 * quota record data the user data fragments can be embedded
971 		 * betwen those metadata, and so its not safe to read, modify
972 		 * then write the entire range as user asynchronous user data
973 		 * writes could get overwritten with old data.
974 		 * Thus we have to create a segment map of meta data that
975 		 * needs to get written.
976 		 *
977 		 * If user data was logged then this issue would go away.
978 		 */
979 		if (read_needed) {
980 			for (i = first_sec + 1; i < last_sec; i++) {
981 				secmap |= (UINT16_C(1) << i);
982 			}
983 		}
984 	}
985 	rbp->rb_secmap = secmap;
986 	return (read_needed);
987 }
988 
989 /*
990  * Abort the load of a set of log map delta's.
991  * ie,
992  * Clear out all mapentries on this unit's log map
993  * which have a tid (transaction id) equal to the
994  * parameter tid.   Walk the cancel list, taking everything
995  * off it, too.
996  */
997 static void
998 logmap_abort(ml_unit_t *ul, uint32_t tid)
999 {
1000 	struct mt_map	*mtm = ul->un_logmap;	/* Log map */
1001 	mapentry_t	*me, **mep;
1002 	int		i;
1003 
1004 	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
1005 	    map_check_linkage(mtm));
1006 
1007 	/*
1008 	 * wait for any outstanding reads to finish; lock out future reads
1009 	 */
1010 	rw_enter(&mtm->mtm_rwlock, RW_WRITER);
1011 
1012 	mutex_enter(&mtm->mtm_mutex);
1013 	/* Take everything off cancel list */
1014 	while ((me = mtm->mtm_cancel) != NULL) {
1015 		mtm->mtm_cancel = me->me_cancel;
1016 		me->me_flags &= ~ME_CANCEL;
1017 		me->me_cancel = NULL;
1018 	}
1019 
1020 	/*
1021 	 * Now take out all mapentries with current tid, and committid
1022 	 * as this function is called from logmap_logscan and logmap_commit
1023 	 * When it is called from logmap_logscan mtm_tid == mtm_committid
1024 	 * But when logmap_abort is called from logmap_commit it is
1025 	 * because the log errored when trying to write the commit record,
1026 	 * after the async ops have been allowed to start in top_end_sync.
1027 	 * So we also need to remove all mapentries from the transaction whose
1028 	 * commit failed.
1029 	 */
1030 	for (i = 0; i < mtm->mtm_nhash; i++) {
1031 		mep = &mtm->mtm_hash[i];
1032 		while ((me = *mep) != NULL) {
1033 			if (me->me_tid == tid ||
1034 			    me->me_tid == mtm->mtm_committid) {
1035 				*mep = me->me_hash;
1036 				me->me_next->me_prev = me->me_prev;
1037 				me->me_prev->me_next = me->me_next;
1038 				if (!(me->me_flags & ME_USER)) {
1039 					mtm->mtm_nme--;
1040 				}
1041 				CRB_RELE(me);
1042 				kmem_cache_free(mapentry_cache, me);
1043 				continue;
1044 			}
1045 			mep = &me->me_hash;
1046 		}
1047 	}
1048 
1049 	if (!(ul->un_flags & LDL_SCAN))
1050 		mtm->mtm_flags |= MTM_CANCELED;
1051 	mutex_exit(&mtm->mtm_mutex);
1052 	mtm->mtm_dirty = 0;
1053 	mtm->mtm_nmet = 0;
1054 	rw_exit(&mtm->mtm_rwlock);
1055 
1056 	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
1057 	    map_check_linkage(mtm));
1058 }
1059 
1060 static void
1061 logmap_wait_space(mt_map_t *mtm, ml_unit_t *ul, mapentry_t *me)
1062 {
1063 	ASSERT(MUTEX_HELD(&ul->un_log_mutex));
1064 
1065 	while (!ldl_has_space(ul, me)) {
1066 		ASSERT(!(ul->un_flags & LDL_NOROLL));
1067 		mutex_exit(&ul->un_log_mutex);
1068 		logmap_forceroll(mtm);
1069 		mutex_enter(&ul->un_log_mutex);
1070 		if (ul->un_flags & LDL_ERROR)
1071 			break;
1072 	}
1073 
1074 	ASSERT(MUTEX_HELD(&ul->un_log_mutex));
1075 }
1076 
1077 /*
1078  * put a list of deltas into a logmap
1079  * If va == NULL, don't write to the log.
1080  */
1081 void
1082 logmap_add(
1083 	ml_unit_t *ul,
1084 	char *va,			/* Ptr to buf w/deltas & data */
1085 	offset_t vamof,			/* Offset on master of buf start */
1086 	mapentry_t *melist)		/* Entries to add */
1087 {
1088 	offset_t	mof;
1089 	off_t		nb;
1090 	mapentry_t	*me;
1091 	mapentry_t	**mep;
1092 	mapentry_t	**savmep;
1093 	uint32_t	tid;
1094 	mt_map_t	*mtm	= ul->un_logmap;
1095 
1096 	mutex_enter(&ul->un_log_mutex);
1097 	if (va)
1098 		logmap_wait_space(mtm, ul, melist);
1099 
1100 	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
1101 	    map_check_linkage(mtm));
1102 
1103 	mtm->mtm_ref = 1;
1104 	mtm->mtm_dirty++;
1105 	tid = mtm->mtm_tid;
1106 	while (melist) {
1107 		mof = melist->me_mof;
1108 		nb  = melist->me_nb;
1109 
1110 		/*
1111 		 * search for overlaping entries
1112 		 */
1113 		savmep = mep = MAP_HASH(mof, mtm);
1114 		mutex_enter(&mtm->mtm_mutex);
1115 		while ((me = *mep) != 0) {
1116 			/*
1117 			 * Data consumes old map entry; cancel map entry.
1118 			 * Take care when we replace an old map entry
1119 			 * which carries quota information with a newer entry
1120 			 * which does not. In that case the push function
1121 			 * would not be called to clean up the dquot structure.
1122 			 * This would be found later by invalidatedq() causing
1123 			 * a panic when the filesystem in unmounted.
1124 			 * We clean up the dquot manually and then replace
1125 			 * the map entry.
1126 			 */
1127 			if (MEwithinDATA(me, mof, nb) &&
1128 			    ((me->me_flags & (ME_ROLL|ME_CANCEL)) == 0)) {
1129 				if (tid == me->me_tid &&
1130 				    ((me->me_flags & ME_AGE) == 0)) {
1131 					*mep = me->me_hash;
1132 					me->me_next->me_prev = me->me_prev;
1133 					me->me_prev->me_next = me->me_next;
1134 					ASSERT(!(me->me_flags & ME_USER));
1135 					mtm->mtm_nme--;
1136 					/*
1137 					 * Special case if the mapentry
1138 					 * carries a dquot and a push function.
1139 					 * We have to clean up the quota info
1140 					 * before replacing the mapentry.
1141 					 */
1142 					if (me->me_dt == DT_QR)
1143 						HANDLE_DQUOT(me, melist);
1144 
1145 					kmem_cache_free(mapentry_cache, me);
1146 					continue;
1147 				}
1148 				me->me_cancel = mtm->mtm_cancel;
1149 				mtm->mtm_cancel = me;
1150 				me->me_flags |= ME_CANCEL;
1151 			}
1152 			mep = &(*mep)->me_hash;
1153 		}
1154 		mutex_exit(&mtm->mtm_mutex);
1155 
1156 		/*
1157 		 * remove from list
1158 		 */
1159 		me = melist;
1160 		melist = melist->me_hash;
1161 		me->me_flags &= ~ME_LIST;
1162 		/*
1163 		 * If va != NULL, put in the log.
1164 		 */
1165 		if (va)
1166 			ldl_write(ul, va, vamof, me);
1167 		if (ul->un_flags & LDL_ERROR) {
1168 			kmem_cache_free(mapentry_cache, me);
1169 			continue;
1170 		}
1171 		ASSERT((va == NULL) ||
1172 		    ((mtm->mtm_debug & MT_LOG_WRITE_CHECK) == 0) ||
1173 		    map_check_ldl_write(ul, va, vamof, me));
1174 
1175 		/*
1176 		 * put on hash
1177 		 */
1178 		mutex_enter(&mtm->mtm_mutex);
1179 		me->me_hash = *savmep;
1180 		*savmep = me;
1181 		me->me_next = (mapentry_t *)mtm;
1182 		me->me_prev = mtm->mtm_prev;
1183 		mtm->mtm_prev->me_next = me;
1184 		mtm->mtm_prev = me;
1185 		me->me_flags |= ME_HASH;
1186 		me->me_tid = tid;
1187 		me->me_age = mtm->mtm_age++;
1188 		mtm->mtm_nme++;
1189 		mtm->mtm_nmet++;
1190 		mutex_exit(&mtm->mtm_mutex);
1191 	}
1192 
1193 	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
1194 	    map_check_linkage(mtm));
1195 	mutex_exit(&ul->un_log_mutex);
1196 }
1197 
1198 /*
1199  * Add the delta(s) into the log.
1200  * Create one cached roll buffer logmap entry, and reference count the
1201  * number of mapentries refering to it.
1202  * Cancel previous logmap entries.
1203  * logmap_add is tolerant of failure to allocate a cached roll buffer.
1204  */
1205 void
1206 logmap_add_buf(
1207 	ml_unit_t *ul,
1208 	char *va,			/* Ptr to buf w/deltas & data */
1209 	offset_t bufmof,		/* Offset on master of buf start */
1210 	mapentry_t *melist,		/* Entries to add */
1211 	caddr_t	buf,			/* Buffer containing delta(s) */
1212 	uint32_t bufsz)			/* Size of buf */
1213 {
1214 	offset_t	mof;
1215 	offset_t	vamof = bufmof + (va - buf);
1216 	off_t		nb;
1217 	mapentry_t	*me;
1218 	mapentry_t	**mep;
1219 	mapentry_t	**savmep;
1220 	uint32_t	tid;
1221 	mt_map_t	*mtm	= ul->un_logmap;
1222 	crb_t		*crb;
1223 	crb_t		*crbsav = NULL;
1224 
1225 	ASSERT((bufsz & DEV_BMASK) == 0);
1226 	mutex_enter(&ul->un_log_mutex);
1227 	logmap_wait_space(mtm, ul, melist);
1228 
1229 	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
1230 	    map_check_linkage(mtm));
1231 
1232 	mtm->mtm_ref = 1;
1233 	mtm->mtm_dirty++;
1234 	tid = mtm->mtm_tid;
1235 	while (melist) {
1236 		mof = melist->me_mof;
1237 		nb  = melist->me_nb;
1238 
1239 		/*
1240 		 * search for overlapping entries
1241 		 */
1242 		savmep = mep = MAP_HASH(mof, mtm);
1243 		mutex_enter(&mtm->mtm_mutex);
1244 		while ((me = *mep) != 0) {
1245 			/*
1246 			 * Data consumes old map entry; cancel map entry.
1247 			 * Take care when we replace an old map entry
1248 			 * which carries quota information with a newer entry
1249 			 * which does not. In that case the push function
1250 			 * would not be called to clean up the dquot structure.
1251 			 * This would be found later by invalidatedq() causing
1252 			 * a panic when the filesystem in unmounted.
1253 			 * We clean up the dquot manually and then replace
1254 			 * the map entry.
1255 			 */
1256 			crb = me->me_crb;
1257 			if (MEwithinDATA(me, mof, nb) &&
1258 			    ((me->me_flags & (ME_ROLL|ME_CANCEL)) == 0)) {
1259 				if (tid == me->me_tid &&
1260 				    ((me->me_flags & ME_AGE) == 0)) {
1261 					*mep = me->me_hash;
1262 					me->me_next->me_prev = me->me_prev;
1263 					me->me_prev->me_next = me->me_next;
1264 					ASSERT(!(me->me_flags & ME_USER));
1265 					mtm->mtm_nme--;
1266 					/*
1267 					 * Special case if the mapentry
1268 					 * carries a dquot and a push function.
1269 					 * We have to clean up the quota info
1270 					 * before replacing the mapentry.
1271 					 */
1272 					if (me->me_dt == DT_QR)
1273 						HANDLE_DQUOT(me, melist);
1274 
1275 					/*
1276 					 * If this soon to be deleted mapentry
1277 					 * has a suitable roll buffer then
1278 					 * re-use it.
1279 					 */
1280 					if (crb && (--crb->c_refcnt == 0)) {
1281 						if (crbsav ||
1282 						    (crb->c_nb != bufsz)) {
1283 							CRB_FREE(crb, me);
1284 						} else {
1285 							bcopy(buf, crb->c_buf,
1286 							    bufsz);
1287 							crb->c_invalid = 0;
1288 							crb->c_mof = bufmof;
1289 							crbsav = crb;
1290 							me->me_crb = NULL;
1291 						}
1292 					}
1293 					kmem_cache_free(mapentry_cache, me);
1294 					continue;
1295 				}
1296 				me->me_cancel = mtm->mtm_cancel;
1297 				mtm->mtm_cancel = me;
1298 				me->me_flags |= ME_CANCEL;
1299 			}
1300 
1301 			/*
1302 			 * Inode deltas within the same fs block come
1303 			 * in individually as separate calls to logmap_add().
1304 			 * All others come in as one call. So check for an
1305 			 * existing entry where we can re-use the crb.
1306 			 */
1307 			if ((me->me_dt == DT_INODE) && (tid == me->me_tid) &&
1308 			    !crbsav && crb &&
1309 			    WITHIN(mof, nb, crb->c_mof, crb->c_nb)) {
1310 				ASSERT(crb->c_mof == bufmof);
1311 				ASSERT(crb->c_nb == bufsz);
1312 				bcopy(buf, crb->c_buf, bufsz);
1313 				crbsav = crb;
1314 			}
1315 			mep = &(*mep)->me_hash;
1316 		}
1317 		mutex_exit(&mtm->mtm_mutex);
1318 
1319 		/*
1320 		 * If we don't already have a crb then allocate one
1321 		 * and copy the incoming buffer. Only do this once
1322 		 * for all the incoming deltas.
1323 		 */
1324 		if ((crbsav == NULL) && (melist->me_dt != DT_ABZERO)) {
1325 			/*
1326 			 * Only use a cached roll buffer if we
1327 			 * have enough memory, and check for failures.
1328 			 */
1329 			if (((ufs_crb_size + bufsz) < ufs_crb_limit) &&
1330 			    (kmem_avail() > bufsz)) {
1331 				crbsav = kmem_alloc(sizeof (crb_t), KM_NOSLEEP);
1332 			} else {
1333 				ufs_crb_alloc_fails++;
1334 			}
1335 			if (crbsav) {
1336 				crbsav->c_buf = kmem_alloc(bufsz, KM_NOSLEEP);
1337 				if (crbsav->c_buf) {
1338 					atomic_add_64(&ufs_crb_size,
1339 					    (uint64_t)bufsz);
1340 					if (ufs_crb_size > ufs_crb_max_size) {
1341 						ufs_crb_max_size = ufs_crb_size;
1342 					}
1343 					bcopy(buf, crbsav->c_buf, bufsz);
1344 					crbsav->c_nb = bufsz;
1345 					crbsav->c_refcnt = 0;
1346 					crbsav->c_invalid = 0;
1347 					ASSERT((bufmof & DEV_BMASK) == 0);
1348 					crbsav->c_mof = bufmof;
1349 				} else {
1350 					kmem_free(crbsav, sizeof (crb_t));
1351 					crbsav = NULL;
1352 				}
1353 			}
1354 		}
1355 
1356 		/*
1357 		 * remove from list
1358 		 */
1359 		me = melist;
1360 		melist = melist->me_hash;
1361 		me->me_flags &= ~ME_LIST;
1362 		me->me_crb = crbsav;
1363 		if (crbsav) {
1364 			crbsav->c_refcnt++;
1365 		}
1366 		crbsav = NULL;
1367 
1368 		ASSERT(va);
1369 		ldl_write(ul, va, vamof, me); /* add to on-disk log */
1370 		if (ul->un_flags & LDL_ERROR) {
1371 			CRB_RELE(me);
1372 			kmem_cache_free(mapentry_cache, me);
1373 			continue;
1374 		}
1375 		ASSERT(((mtm->mtm_debug & MT_LOG_WRITE_CHECK) == 0) ||
1376 		    map_check_ldl_write(ul, va, vamof, me));
1377 
1378 		/*
1379 		 * put on hash
1380 		 */
1381 		mutex_enter(&mtm->mtm_mutex);
1382 		me->me_hash = *savmep;
1383 		*savmep = me;
1384 		me->me_next = (mapentry_t *)mtm;
1385 		me->me_prev = mtm->mtm_prev;
1386 		mtm->mtm_prev->me_next = me;
1387 		mtm->mtm_prev = me;
1388 		me->me_flags |= ME_HASH;
1389 		me->me_tid = tid;
1390 		me->me_age = mtm->mtm_age++;
1391 		mtm->mtm_nme++;
1392 		mtm->mtm_nmet++;
1393 		mutex_exit(&mtm->mtm_mutex);
1394 	}
1395 
1396 	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
1397 	    map_check_linkage(mtm));
1398 	mutex_exit(&ul->un_log_mutex);
1399 }
1400 
1401 /*
1402  * free up any cancelled deltas
1403  */
1404 void
1405 logmap_free_cancel(mt_map_t *mtm, mapentry_t **cancelhead)
1406 {
1407 	int		dolock	= 0;
1408 	mapentry_t	*me;
1409 	mapentry_t	**mep;
1410 
1411 	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
1412 	    map_check_linkage(mtm));
1413 
1414 again:
1415 	if (dolock)
1416 		rw_enter(&mtm->mtm_rwlock, RW_WRITER);
1417 
1418 	/*
1419 	 * At EOT, cancel the indicated deltas
1420 	 */
1421 	mutex_enter(&mtm->mtm_mutex);
1422 	if (mtm->mtm_flags & MTM_CANCELED) {
1423 		mtm->mtm_flags &= ~MTM_CANCELED;
1424 		ASSERT(dolock == 0);
1425 		mutex_exit(&mtm->mtm_mutex);
1426 		return;
1427 	}
1428 
1429 	while ((me = *cancelhead) != NULL) {
1430 		/*
1431 		 * roll forward or read collision; wait and try again
1432 		 */
1433 		if (me->me_flags & ME_AGE) {
1434 			ASSERT(dolock == 0);
1435 			mutex_exit(&mtm->mtm_mutex);
1436 			dolock = 1;
1437 			goto again;
1438 		}
1439 		/*
1440 		 * remove from cancel list
1441 		 */
1442 		*cancelhead = me->me_cancel;
1443 		me->me_cancel = NULL;
1444 		me->me_flags &= ~(ME_CANCEL);
1445 
1446 		/*
1447 		 * logmap_remove_roll handles ME_ROLL entries later
1448 		 *	we leave them around for logmap_iscancel
1449 		 *	XXX is this necessary?
1450 		 */
1451 		if (me->me_flags & ME_ROLL)
1452 			continue;
1453 
1454 		/*
1455 		 * remove from hash (if necessary)
1456 		 */
1457 		if (me->me_flags & ME_HASH) {
1458 			mep = MAP_HASH(me->me_mof, mtm);
1459 			while (*mep) {
1460 				if (*mep == me) {
1461 					*mep = me->me_hash;
1462 					me->me_next->me_prev = me->me_prev;
1463 					me->me_prev->me_next = me->me_next;
1464 					me->me_flags &= ~(ME_HASH);
1465 					if (!(me->me_flags & ME_USER)) {
1466 						mtm->mtm_nme--;
1467 					}
1468 					break;
1469 				} else
1470 					mep = &(*mep)->me_hash;
1471 			}
1472 		}
1473 		/*
1474 		 * put the entry on the free list
1475 		 */
1476 		CRB_RELE(me);
1477 		kmem_cache_free(mapentry_cache, me);
1478 	}
1479 	mutex_exit(&mtm->mtm_mutex);
1480 	if (dolock)
1481 		rw_exit(&mtm->mtm_rwlock);
1482 
1483 	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
1484 	    map_check_linkage(mtm));
1485 }
1486 
1487 
1488 void
1489 logmap_commit(ml_unit_t *ul, uint32_t tid)
1490 {
1491 	mapentry_t	me;
1492 	mt_map_t	*mtm	= ul->un_logmap;
1493 
1494 
1495 	ASSERT(MUTEX_HELD(&ul->un_log_mutex));
1496 
1497 	/*
1498 	 * async'ly write a commit rec into the log
1499 	 */
1500 	if (mtm->mtm_dirty) {
1501 		/*
1502 		 * put commit record into log
1503 		 */
1504 		me.me_mof = mtm->mtm_tid;
1505 		me.me_dt = DT_COMMIT;
1506 		me.me_nb = 0;
1507 		me.me_hash = NULL;
1508 		logmap_wait_space(mtm, ul, &me);
1509 		ldl_write(ul, NULL, (offset_t)0, &me);
1510 		ldl_round_commit(ul);
1511 
1512 		/*
1513 		 * abort on error; else reset dirty flag
1514 		 */
1515 		if (ul->un_flags & LDL_ERROR)
1516 			logmap_abort(ul, tid);
1517 		else {
1518 			mtm->mtm_dirty = 0;
1519 			mtm->mtm_nmet = 0;
1520 			mtm->mtm_cfrags = 0;
1521 		}
1522 		/* push commit */
1523 		ldl_push_commit(ul);
1524 	}
1525 }
1526 
1527 void
1528 logmap_sethead(mt_map_t *mtm, ml_unit_t *ul)
1529 {
1530 	off_t		lof;
1531 	uint32_t	tid;
1532 	mapentry_t	*me;
1533 
1534 	/*
1535 	 * move the head forward so the log knows how full it is
1536 	 * Make sure to skip any mapentry whose me_lof is 0, these
1537 	 * are just place holders for DT_CANCELED freed user blocks
1538 	 * for the current moby.
1539 	 */
1540 	mutex_enter(&ul->un_log_mutex);
1541 	mutex_enter(&mtm->mtm_mutex);
1542 	me = mtm->mtm_next;
1543 	while (me != (mapentry_t *)mtm && me->me_lof == 0) {
1544 		me = me->me_next;
1545 	}
1546 
1547 	if (me == (mapentry_t *)mtm)
1548 		lof = -1;
1549 	else {
1550 		lof = me->me_lof;
1551 		tid = me->me_tid;
1552 	}
1553 	mutex_exit(&mtm->mtm_mutex);
1554 	ldl_sethead(ul, lof, tid);
1555 	if (lof == -1)
1556 		mtm->mtm_age = 0;
1557 	mutex_exit(&ul->un_log_mutex);
1558 }
1559 
1560 void
1561 logmap_settail(mt_map_t *mtm, ml_unit_t *ul)
1562 {
1563 	off_t		lof;
1564 	size_t		nb;
1565 
1566 	/*
1567 	 * set the tail after the logmap_abort
1568 	 */
1569 	mutex_enter(&ul->un_log_mutex);
1570 	mutex_enter(&mtm->mtm_mutex);
1571 	if (mtm->mtm_prev == (mapentry_t *)mtm)
1572 		lof = -1;
1573 	else {
1574 		/*
1575 		 * set the tail to the end of the last commit
1576 		 */
1577 		lof = mtm->mtm_tail_lof;
1578 		nb = mtm->mtm_tail_nb;
1579 	}
1580 	mutex_exit(&mtm->mtm_mutex);
1581 	ldl_settail(ul, lof, nb);
1582 	mutex_exit(&ul->un_log_mutex);
1583 }
1584 
1585 /*
1586  * when reseting a device; roll the log until every
1587  * delta has been rolled forward
1588  */
1589 void
1590 logmap_roll_dev(ml_unit_t *ul)
1591 {
1592 	mt_map_t	*mtm	= ul->un_logmap;
1593 	mapentry_t	*me;
1594 	ufsvfs_t	*ufsvfsp = ul->un_ufsvfs;
1595 
1596 again:
1597 	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
1598 	    map_check_linkage(mtm));
1599 	if (ul->un_flags & (LDL_ERROR|LDL_NOROLL))
1600 		return;
1601 
1602 	/*
1603 	 * look for deltas
1604 	 */
1605 	mutex_enter(&mtm->mtm_mutex);
1606 	for (me = mtm->mtm_next; me != (mapentry_t *)mtm; me = me->me_next) {
1607 		if (me->me_flags & ME_ROLL)
1608 			break;
1609 		if (me->me_tid == mtm->mtm_tid)
1610 			continue;
1611 		if (me->me_tid == mtm->mtm_committid)
1612 			continue;
1613 		break;
1614 	}
1615 
1616 	/*
1617 	 * found a delta; kick the roll thread
1618 	 * but only if the thread is running... (jmh)
1619 	 */
1620 	if (me != (mapentry_t *)mtm) {
1621 		mutex_exit(&mtm->mtm_mutex);
1622 		logmap_forceroll(mtm);
1623 		goto again;
1624 	}
1625 
1626 	/*
1627 	 * no more deltas, return
1628 	 */
1629 	mutex_exit(&mtm->mtm_mutex);
1630 	(void) ufs_putsummaryinfo(ul->un_dev, ufsvfsp, ufsvfsp->vfs_fs);
1631 
1632 	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
1633 	    map_check_linkage(mtm));
1634 }
1635 
1636 static void
1637 logmap_cancel_delta(ml_unit_t *ul, offset_t mof, int32_t nb, int metadata)
1638 {
1639 	mapentry_t	*me;
1640 	mapentry_t	**mep;
1641 	mt_map_t	*mtm	= ul->un_logmap;
1642 	int		frags;
1643 
1644 	/*
1645 	 * map has been referenced and is dirty
1646 	 */
1647 	mtm->mtm_ref = 1;
1648 	mtm->mtm_dirty++;
1649 
1650 	/*
1651 	 * get a mapentry
1652 	 */
1653 	me = kmem_cache_alloc(mapentry_cache, KM_SLEEP);
1654 	bzero(me, sizeof (mapentry_t));
1655 
1656 	/*
1657 	 * initialize cancel record and put in logmap
1658 	 */
1659 	me->me_mof = mof;
1660 	me->me_nb = nb;
1661 	me->me_dt = DT_CANCEL;
1662 	me->me_tid = mtm->mtm_tid;
1663 	me->me_hash = NULL;
1664 
1665 	/*
1666 	 * Write delta to log if this delta is for metadata.  If this is not
1667 	 * metadata it is user data and we are just putting a cancel
1668 	 * mapentry into the hash to cancel a user block deletion
1669 	 * in which we do not want the block to be allocated
1670 	 * within this moby.  This cancel entry will prevent the block from
1671 	 * being allocated within the moby and prevent user data corruption
1672 	 * if we happen to crash before this moby is committed.
1673 	 */
1674 	mutex_enter(&ul->un_log_mutex);
1675 	if (metadata) {
1676 		logmap_wait_space(mtm, ul, me);
1677 		ldl_write(ul, NULL, (offset_t)0, me);
1678 		if (ul->un_flags & LDL_ERROR) {
1679 			kmem_cache_free(mapentry_cache, me);
1680 			mutex_exit(&ul->un_log_mutex);
1681 			return;
1682 		}
1683 	}
1684 
1685 	/*
1686 	 * put in hash and on cancel list
1687 	 */
1688 	mep = MAP_HASH(mof, mtm);
1689 	mutex_enter(&mtm->mtm_mutex);
1690 	me->me_age = mtm->mtm_age++;
1691 	me->me_hash = *mep;
1692 	*mep = me;
1693 	me->me_next = (mapentry_t *)mtm;
1694 	me->me_prev = mtm->mtm_prev;
1695 	mtm->mtm_prev->me_next = me;
1696 	mtm->mtm_prev = me;
1697 	me->me_cancel = mtm->mtm_cancel;
1698 	mtm->mtm_cancel = me;
1699 	if (metadata) {
1700 		mtm->mtm_nme++;
1701 		mtm->mtm_nmet++;
1702 	} else {
1703 		me->me_flags = ME_USER;
1704 	}
1705 	me->me_flags |= (ME_HASH|ME_CANCEL);
1706 	if (!(metadata)) {
1707 		frags = blkoff(ul->un_ufsvfs->vfs_fs, nb);
1708 		if (frags)
1709 			mtm->mtm_cfrags +=
1710 			    numfrags(ul->un_ufsvfs->vfs_fs, frags);
1711 	}
1712 	mutex_exit(&mtm->mtm_mutex);
1713 
1714 	mutex_exit(&ul->un_log_mutex);
1715 }
1716 
1717 /*
1718  * cancel entries in a logmap (entries are freed at EOT)
1719  */
1720 void
1721 logmap_cancel(ml_unit_t *ul, offset_t mof, off_t nb, int metadata)
1722 {
1723 	int32_t		hnb;
1724 	mapentry_t	*me;
1725 	mapentry_t	**mep;
1726 	mt_map_t	*mtm	= ul->un_logmap;
1727 	crb_t		*crb;
1728 
1729 	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
1730 	    map_check_linkage(mtm));
1731 
1732 	for (hnb = 0; nb; nb -= hnb, mof += hnb) {
1733 		hnb = MAPBLOCKSIZE - (mof & MAPBLOCKOFF);
1734 		if (hnb > nb)
1735 			hnb = nb;
1736 		/*
1737 		 * Find overlapping metadata entries.  Don't search through
1738 		 * the hash chains if this is user data because it is only
1739 		 * possible to have overlapping map entries for metadata,
1740 		 * and the search can become expensive for large files.
1741 		 */
1742 		if (metadata) {
1743 			mep = MAP_HASH(mof, mtm);
1744 			mutex_enter(&mtm->mtm_mutex);
1745 			for (me = *mep; me; me = me->me_hash) {
1746 				if (!DATAoverlapME(mof, hnb, me))
1747 					continue;
1748 
1749 				ASSERT(MEwithinDATA(me, mof, hnb));
1750 
1751 				if ((me->me_flags & ME_CANCEL) == 0) {
1752 					me->me_cancel = mtm->mtm_cancel;
1753 					mtm->mtm_cancel = me;
1754 					me->me_flags |= ME_CANCEL;
1755 					crb = me->me_crb;
1756 					if (crb) {
1757 						crb->c_invalid = 1;
1758 					}
1759 				}
1760 			}
1761 			mutex_exit(&mtm->mtm_mutex);
1762 		}
1763 
1764 		/*
1765 		 * put a cancel record into the log
1766 		 */
1767 		logmap_cancel_delta(ul, mof, hnb, metadata);
1768 	}
1769 
1770 	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
1771 	    map_check_linkage(mtm));
1772 }
1773 
1774 /*
1775  * check for overlap w/cancel delta
1776  */
1777 int
1778 logmap_iscancel(mt_map_t *mtm, offset_t mof, off_t nb)
1779 {
1780 	off_t		hnb;
1781 	mapentry_t	*me;
1782 	mapentry_t	**mep;
1783 
1784 	mutex_enter(&mtm->mtm_mutex);
1785 	for (hnb = 0; nb; nb -= hnb, mof += hnb) {
1786 		hnb = MAPBLOCKSIZE - (mof & MAPBLOCKOFF);
1787 		if (hnb > nb)
1788 			hnb = nb;
1789 		/*
1790 		 * search for dup entry
1791 		 */
1792 		mep = MAP_HASH(mof, mtm);
1793 		for (me = *mep; me; me = me->me_hash) {
1794 			if (((me->me_flags & ME_ROLL) == 0) &&
1795 			    (me->me_dt != DT_CANCEL))
1796 				continue;
1797 			if (DATAoverlapME(mof, hnb, me))
1798 				break;
1799 		}
1800 
1801 		/*
1802 		 * overlap detected
1803 		 */
1804 		if (me) {
1805 			mutex_exit(&mtm->mtm_mutex);
1806 			return (1);
1807 		}
1808 	}
1809 	mutex_exit(&mtm->mtm_mutex);
1810 	return (0);
1811 }
1812 
1813 static int
1814 logmap_logscan_add(ml_unit_t *ul, struct delta *dp, off_t lof, size_t *nbp)
1815 {
1816 	mapentry_t	*me;
1817 	int		error;
1818 	mt_map_t	*mtm	= ul->un_logmap;
1819 
1820 	/*
1821 	 * verify delta header; failure == mediafail
1822 	 */
1823 	error = 0;
1824 	/* delta type */
1825 	if ((dp->d_typ <= DT_NONE) || (dp->d_typ >= DT_MAX))
1826 		error = EINVAL;
1827 	if (dp->d_typ == DT_COMMIT) {
1828 		if (dp->d_nb != INT32_C(0) && dp->d_nb != INT32_C(-1))
1829 			error = EINVAL;
1830 	} else {
1831 		/* length of delta */
1832 		if ((dp->d_nb < INT32_C(0)) ||
1833 		    (dp->d_nb > INT32_C(MAPBLOCKSIZE)))
1834 			error = EINVAL;
1835 
1836 		/* offset on master device */
1837 		if (dp->d_mof < INT64_C(0))
1838 			error = EINVAL;
1839 	}
1840 
1841 	if (error) {
1842 		ldl_seterror(ul, "Error processing ufs log data during scan");
1843 		return (error);
1844 	}
1845 
1846 	/*
1847 	 * process commit record
1848 	 */
1849 	if (dp->d_typ == DT_COMMIT) {
1850 		if (mtm->mtm_dirty) {
1851 			ASSERT(dp->d_nb == INT32_C(0));
1852 			logmap_free_cancel(mtm, &mtm->mtm_cancel);
1853 			mtm->mtm_dirty = 0;
1854 			mtm->mtm_nmet = 0;
1855 			mtm->mtm_tid++;
1856 			mtm->mtm_committid = mtm->mtm_tid;
1857 			ASSERT(((mtm->mtm_debug & MT_SCAN) == 0) ||
1858 			    logmap_logscan_commit_debug(lof, mtm));
1859 		}
1860 		/*
1861 		 * return #bytes to next sector (next delta header)
1862 		 */
1863 		*nbp = ldl_logscan_nbcommit(lof);
1864 		mtm->mtm_tail_lof = lof;
1865 		mtm->mtm_tail_nb = *nbp;
1866 		return (0);
1867 	}
1868 
1869 	/*
1870 	 * add delta to logmap
1871 	 */
1872 	me = kmem_cache_alloc(mapentry_cache, KM_SLEEP);
1873 	bzero(me, sizeof (mapentry_t));
1874 	me->me_lof = lof;
1875 	me->me_mof = dp->d_mof;
1876 	me->me_nb = dp->d_nb;
1877 	me->me_tid = mtm->mtm_tid;
1878 	me->me_dt = dp->d_typ;
1879 	me->me_hash = NULL;
1880 	me->me_flags = (ME_LIST | ME_SCAN);
1881 	logmap_add(ul, NULL, 0, me);
1882 	switch (dp->d_typ) {
1883 	case DT_CANCEL:
1884 		me->me_flags |= ME_CANCEL;
1885 		me->me_cancel = mtm->mtm_cancel;
1886 		mtm->mtm_cancel = me;
1887 		break;
1888 	default:
1889 		ASSERT(((mtm->mtm_debug & MT_SCAN) == 0) ||
1890 		    logmap_logscan_add_debug(dp, mtm));
1891 		break;
1892 	}
1893 
1894 sizeofdelta:
1895 	/*
1896 	 * return #bytes till next delta header
1897 	 */
1898 	if ((dp->d_typ == DT_CANCEL) || (dp->d_typ == DT_ABZERO))
1899 		*nbp = 0;
1900 	else
1901 		*nbp = dp->d_nb;
1902 	return (0);
1903 }
1904 
1905 void
1906 logmap_logscan(ml_unit_t *ul)
1907 {
1908 	size_t		nb, nbd;
1909 	off_t		lof;
1910 	struct delta	delta;
1911 	mt_map_t	*logmap	= ul->un_logmap;
1912 
1913 	ASSERT(ul->un_deltamap->mtm_next == (mapentry_t *)ul->un_deltamap);
1914 
1915 	/*
1916 	 * prepare the log for a logscan
1917 	 */
1918 	ldl_logscan_begin(ul);
1919 
1920 	/*
1921 	 * prepare the logmap for a logscan
1922 	 */
1923 	(void) map_free_entries(logmap);
1924 	logmap->mtm_tid = 0;
1925 	logmap->mtm_committid = UINT32_C(0);
1926 	logmap->mtm_age = 0;
1927 	logmap->mtm_dirty = 0;
1928 	logmap->mtm_ref = 0;
1929 
1930 	/*
1931 	 * while not at end of log
1932 	 *	read delta header
1933 	 *	add to logmap
1934 	 *	seek to beginning of next delta
1935 	 */
1936 	lof = ul->un_head_lof;
1937 	nbd = sizeof (delta);
1938 	while (lof != ul->un_tail_lof) {
1939 
1940 		/* read delta header */
1941 		if (ldl_logscan_read(ul, &lof, nbd, (caddr_t)&delta))
1942 			break;
1943 
1944 		/* add to logmap */
1945 		if (logmap_logscan_add(ul, &delta, lof, &nb))
1946 			break;
1947 
1948 		/* seek to next header (skip data) */
1949 		if (ldl_logscan_read(ul, &lof, nb, NULL))
1950 			break;
1951 	}
1952 
1953 	/*
1954 	 * remove the last partial transaction from the logmap
1955 	 */
1956 	logmap_abort(ul, logmap->mtm_tid);
1957 
1958 	ldl_logscan_end(ul);
1959 }
1960 
1961 void
1962 _init_map(void)
1963 {
1964 	/*
1965 	 * Initialise the mapentry cache. No constructor or deconstructor
1966 	 * is needed. Also no reclaim function is supplied as reclaiming
1967 	 * current entries is not possible.
1968 	 */
1969 	mapentry_cache = kmem_cache_create("lufs_mapentry_cache",
1970 	    sizeof (mapentry_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
1971 }
1972 
1973 /*
1974  * Special case when we replace an old map entry which carries quota
1975  * information with a newer entry which does not.
1976  * In that case the push function would not be called to clean up the
1977  * dquot structure. This would be found later by invalidatedq() causing
1978  * a panic when the filesystem in unmounted.
1979  * We clean up the dquot manually before replacing the map entry.
1980  */
1981 void
1982 handle_dquot(mapentry_t *me)
1983 {
1984 	int dolock = 0;
1985 	int domutex = 0;
1986 	struct dquot *dqp;
1987 
1988 	dqp = (struct dquot *)me->me_arg;
1989 
1990 	/*
1991 	 * We need vfs_dqrwlock to call dqput()
1992 	 */
1993 	dolock = (!RW_LOCK_HELD(&dqp->dq_ufsvfsp->vfs_dqrwlock));
1994 	if (dolock)
1995 		rw_enter(&dqp->dq_ufsvfsp->vfs_dqrwlock, RW_READER);
1996 
1997 	domutex = (!MUTEX_HELD(&dqp->dq_lock));
1998 	if (domutex)
1999 		mutex_enter(&dqp->dq_lock);
2000 
2001 	/*
2002 	 * Only clean up if the dquot is referenced
2003 	 */
2004 	if (dqp->dq_cnt == 0) {
2005 		if (domutex)
2006 			mutex_exit(&dqp->dq_lock);
2007 		if (dolock)
2008 			rw_exit(&dqp->dq_ufsvfsp->vfs_dqrwlock);
2009 		return;
2010 	}
2011 
2012 	dqp->dq_flags &= ~(DQ_MOD|DQ_TRANS);
2013 	dqput(dqp);
2014 
2015 	if (domutex)
2016 		mutex_exit(&dqp->dq_lock);
2017 
2018 	if (dolock)
2019 		rw_exit(&dqp->dq_ufsvfsp->vfs_dqrwlock);
2020 
2021 }
2022