xref: /illumos-gate/usr/src/uts/common/fs/ufs/lufs_map.c (revision fbd1c0dae6f4a2ccc2ce0527c7f19d3dd5ea90b8)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 #pragma ident	"%Z%%M%	%I%	%E% SMI"
22 
23 /*
24  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
25  * Use is subject to license terms.
26  */
27 
28 #include <sys/systm.h>
29 #include <sys/types.h>
30 #include <sys/vnode.h>
31 #include <sys/errno.h>
32 #include <sys/sysmacros.h>
33 #include <sys/debug.h>
34 #include <sys/kmem.h>
35 #include <sys/conf.h>
36 #include <sys/proc.h>
37 #include <sys/cmn_err.h>
38 #include <sys/fs/ufs_inode.h>
39 #include <sys/fs/ufs_filio.h>
40 #include <sys/fs/ufs_log.h>
41 #include <sys/inttypes.h>
42 #include <sys/atomic.h>
43 #include <sys/tuneable.h>
44 
45 /*
46  * externs
47  */
48 extern pri_t minclsyspri;
49 extern struct kmem_cache *lufs_bp;
50 extern int ufs_trans_push_quota();
51 
52 /*
53  * globals
54  */
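/* kmem cache for mapentry_t allocations; created in _init_map() below */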
55 kmem_cache_t *mapentry_cache;
56 
57 /*
58  * logmap tuning constants
59  */
60 long	logmap_maxnme_commit	= 2048;
61 long	logmap_maxnme_async	= 4096;
62 long	logmap_maxnme_sync	= 6144;
63 long	logmap_maxcfrag_commit	= 4;	/* Max canceled fragments per moby */
64 
65 
66 uint64_t ufs_crb_size = 0;		/* current size of all crb buffers */
67 uint64_t ufs_crb_max_size = 0;		/* highest crb buffer use so far */
68 size_t ufs_crb_limit;			/* max allowable size for crbs */
69 uint64_t ufs_crb_alloc_fails = 0;	/* crb allocation failures stat */
70 #define	UFS_MAX_CRB_DEFAULT_DIVISOR 10	/* max 1/10 kmem_maxavail() */
71 int ufs_max_crb_divisor = UFS_MAX_CRB_DEFAULT_DIVISOR; /* tunable */
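/*
 * Note: ufs_crb_limit is initialized outside this file (presumably at
 * lufs initialization) from kmem_maxavail() and ufs_max_crb_divisor,
 * per the "max 1/10 kmem_maxavail()" default noted above.
 */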
72 void handle_dquot(mapentry_t *);
73 
74 /*
75  * GENERIC MAP ROUTINES
76  */
77 
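/*
 * CRB_FREE releases a mapentry's cached roll buffer (crb): the data
 * buffer is freed, its size is subtracted from the global ufs_crb_size
 * count, the crb_t itself is freed and the mapentry's me_crb pointer is
 * cleared.  CRB_RELE drops one reference on the mapentry's crb, if any,
 * and frees it via CRB_FREE when the last reference goes away.
 */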
78 #define	CRB_FREE(crb, me) \
79 	kmem_free(crb->c_buf, crb->c_nb); \
80 	atomic_add_64(&ufs_crb_size, -(uint64_t)crb->c_nb); \
81 	kmem_free(crb, sizeof (crb_t)); \
82 	(me)->me_crb = NULL;
83 
84 #define	CRB_RELE(me) { \
85 	crb_t *crb = (me)->me_crb; \
86 	if (crb && (--crb->c_refcnt == 0)) { \
87 		CRB_FREE(crb, me) \
88 	} \
89 }
90 
91 /*
92  * Check that the old delta has an argument and a push function of
93  * ufs_trans_push_quota(), then check that the old and new deltas differ.
94  * If so we clean up with handle_dquot() before replacing the old delta.
95  */
96 #define	HANDLE_DQUOT(me, melist) { \
97 	if ((me->me_arg) && \
98 	    (me->me_func == ufs_trans_push_quota)) { \
99 		if (!((me->me_dt == melist->me_dt) && \
100 		    (me->me_arg == melist->me_arg) && \
101 		    (me->me_func == melist->me_func))) { \
102 			handle_dquot(me); \
103 		} \
104 	} \
105 }
106 
107 /*
108  * free up all the mapentries for a map
109  */
110 void
111 map_free_entries(mt_map_t *mtm)
112 {
113 	int		i;
114 	mapentry_t	*me;
115 
116 	while ((me = mtm->mtm_next) != (mapentry_t *)mtm) {
117 		me->me_next->me_prev = me->me_prev;
118 		me->me_prev->me_next = me->me_next;
119 		CRB_RELE(me);
120 		kmem_cache_free(mapentry_cache, me);
121 	}
122 	for (i = 0; i < mtm->mtm_nhash; i++)
123 		mtm->mtm_hash[i] = NULL;
124 	mtm->mtm_nme = 0;
125 	mtm->mtm_nmet = 0;
126 }
127 
128 /*
129  * done with map; free if necessary
130  */
131 mt_map_t *
132 map_put(mt_map_t *mtm)
133 {
134 	/*
135 	 * free up the map's memory
136 	 */
137 	map_free_entries(mtm);
138 	ASSERT(map_put_debug(mtm));
139 	kmem_free(mtm->mtm_hash,
140 	    (size_t) (sizeof (mapentry_t *) * mtm->mtm_nhash));
141 	mutex_destroy(&mtm->mtm_mutex);
142 	mutex_destroy(&mtm->mtm_scan_mutex);
143 	cv_destroy(&mtm->mtm_to_roll_cv);
144 	cv_destroy(&mtm->mtm_from_roll_cv);
145 	rw_destroy(&mtm->mtm_rwlock);
146 	mutex_destroy(&mtm->mtm_lock);
147 	cv_destroy(&mtm->mtm_cv_commit);
148 	cv_destroy(&mtm->mtm_cv_next);
149 	cv_destroy(&mtm->mtm_cv_eot);
150 	cv_destroy(&mtm->mtm_cv);
151 	kmem_free(mtm, sizeof (mt_map_t));
152 	return (NULL);
153 }
154 /*
155  * Allocate a map.
156  */
157 mt_map_t *
158 map_get(ml_unit_t *ul, enum maptypes maptype, int nh)
159 {
160 	mt_map_t	*mtm;
161 
162 	/*
163 	 * assume the map is not here and allocate the necessary structs
164 	 */
165 	mtm = kmem_zalloc(sizeof (mt_map_t), KM_SLEEP);
166 	mutex_init(&mtm->mtm_mutex, NULL, MUTEX_DEFAULT, NULL);
167 	mutex_init(&mtm->mtm_scan_mutex, NULL, MUTEX_DEFAULT, NULL);
168 	cv_init(&mtm->mtm_to_roll_cv, NULL, CV_DEFAULT, NULL);
169 	cv_init(&mtm->mtm_from_roll_cv, NULL, CV_DEFAULT, NULL);
170 	rw_init(&mtm->mtm_rwlock, NULL, RW_DEFAULT, NULL);
171 	mtm->mtm_next = (mapentry_t *)mtm;
172 	mtm->mtm_prev = (mapentry_t *)mtm;
173 	mtm->mtm_hash = kmem_zalloc((size_t) (sizeof (mapentry_t *) * nh),
174 	    KM_SLEEP);
175 	mtm->mtm_nhash = nh;
176 	mtm->mtm_debug = ul->un_debug;
177 	mtm->mtm_type = maptype;
178 
179 	mtm->mtm_cfrags = 0;
180 	mtm->mtm_cfragmax = logmap_maxcfrag_commit;
181 
182 	/*
183 	 * for scan test
184 	 */
185 	mtm->mtm_ul = ul;
186 
187 	/*
188 	 * Initialize locks
189 	 */
190 	mutex_init(&mtm->mtm_lock, NULL, MUTEX_DEFAULT, NULL);
191 	cv_init(&mtm->mtm_cv_commit, NULL, CV_DEFAULT, NULL);
192 	cv_init(&mtm->mtm_cv_next, NULL, CV_DEFAULT, NULL);
193 	cv_init(&mtm->mtm_cv_eot, NULL, CV_DEFAULT, NULL);
194 	cv_init(&mtm->mtm_cv, NULL, CV_DEFAULT, NULL);
195 	ASSERT(map_get_debug(ul, mtm));
196 
197 	return (mtm);
198 }
199 
200 /*
201  * DELTAMAP ROUTINES
202  */
203 /*
204  * deltamap tuning constants
205  */
206 long	deltamap_maxnme	= 1024;	/* global so it can be set */
207 
208 int
209 deltamap_need_commit(mt_map_t *mtm)
210 {
211 	return (mtm->mtm_nme > deltamap_maxnme);
212 }
213 
214 /*
215  * put a delta into a deltamap; may sleep on memory
216  */
217 void
218 deltamap_add(
219 	mt_map_t *mtm,
220 	offset_t mof,
221 	off_t nb,
222 	delta_t dtyp,
223 	int (*func)(),
224 	ulong_t arg,
225 	threadtrans_t *tp)
226 {
227 	int32_t		hnb;
228 	mapentry_t	*me;
229 	mapentry_t	**mep;
230 
231 	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
232 	    map_check_linkage(mtm));
233 
234 	mutex_enter(&mtm->mtm_mutex);
235 
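	/*
	 * Split the delta into MAPBLOCKSIZE-aligned chunks; hnb is the
	 * portion of the delta that falls within the current map block.
	 * For example, assuming MAPBLOCKSIZE is 8K, a 2K delta that
	 * starts 7K into a map block is processed as a 1K chunk in that
	 * block followed by a 1K chunk in the next one.
	 */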
236 	for (hnb = 0; nb; nb -= hnb, mof += hnb) {
237 		hnb = MAPBLOCKSIZE - (mof & MAPBLOCKOFF);
238 		if (hnb > nb)
239 			hnb = nb;
240 		/*
241 		 * Search for dup entry. We need to ensure that we don't
242 		 * replace a map entry which carries quota information
243 		 * with a map entry which doesn't. In that case we lose
244 		 * the reference to the dquot structure, which will not be
245 		 * cleaned up by the push function me->me_func as this will
246 		 * never be called.
247 		 * The stray dquot would be found later by invalidatedq()
248 		 * causing a panic when the filesystem is unmounted.
249 		 */
250 		mep = MAP_HASH(mof, mtm);
251 		for (me = *mep; me; me = me->me_hash) {
252 			if (DATAwithinME(mof, hnb, me)) {
253 				/*
254 				 * Don't remove quota entries which have
255 				 * incremented the ref count (those with a
256 				 * ufs_trans_push_quota push function).
257 				 * Let logmap_add[_buf] clean them up.
258 				 */
259 				if (me->me_func == ufs_trans_push_quota) {
260 					continue;
261 				}
262 				break;
263 			}
264 			ASSERT((dtyp == DT_CANCEL) ||
265 			    (!DATAoverlapME(mof, hnb, me)) ||
266 			    MEwithinDATA(me, mof, hnb));
267 		}
268 
269 		if (me) {
270 			/* already in map */
271 			continue;
272 		}
273 
274 		/*
275 		 * Add up all the delta map deltas so we can compute
276 		 * an upper bound on the log size used.
277 		 * Note, some deltas get removed from the deltamap
278 		 * before the deltamap_push by lufs_write_strategy
279 		 * and so multiple deltas to the same mof offset
280 		 * don't get cancelled here but in the logmap.
281 		 * Thus we can't easily get an accurate count of
282 		 * the log space used - only an upper bound.
283 		 */
284 		if (tp && (mtm->mtm_ul->un_deltamap == mtm)) {
285 			ASSERT(dtyp != DT_CANCEL);
286 			if (dtyp == DT_ABZERO) {
287 				tp->deltas_size += sizeof (struct delta);
288 			} else {
289 				tp->deltas_size +=
290 				    (hnb + sizeof (struct delta));
291 			}
292 		}
293 
294 		delta_stats[dtyp]++;
295 
296 		/*
297 		 * get a mapentry
298 		 * May need to drop & re-grab the mtm_mutex
299 		 * and then recheck for a duplicate
300 		 */
301 		me = kmem_cache_alloc(mapentry_cache, KM_NOSLEEP);
302 		if (me == NULL) {
303 			mutex_exit(&mtm->mtm_mutex);
304 			me = kmem_cache_alloc(mapentry_cache, KM_SLEEP);
305 			mutex_enter(&mtm->mtm_mutex);
306 		}
307 		bzero(me, sizeof (mapentry_t));
308 
309 		/*
310 		 * initialize and put in deltamap
311 		 */
312 		me->me_mof = mof;
313 		me->me_nb = hnb;
314 		me->me_func = func;
315 		me->me_arg = arg;
316 		me->me_dt = dtyp;
317 		me->me_flags = ME_HASH;
318 		me->me_tid = mtm->mtm_tid;
319 
320 		me->me_hash = *mep;
321 		*mep = me;
322 		me->me_next = (mapentry_t *)mtm;
323 		me->me_prev = mtm->mtm_prev;
324 		mtm->mtm_prev->me_next = me;
325 		mtm->mtm_prev = me;
326 		mtm->mtm_nme++;
327 	}
328 	mutex_exit(&mtm->mtm_mutex);
329 
330 	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
331 	    map_check_linkage(mtm));
332 }
333 
334 /*
335  * remove deltas within (mof, nb) and return as linked list
336  */
337 mapentry_t *
338 deltamap_remove(mt_map_t *mtm, offset_t mof, off_t nb)
339 {
340 	off_t		hnb;
341 	mapentry_t	*me;
342 	mapentry_t	**mep;
343 	mapentry_t	*mer;
344 
345 	if (mtm == NULL)
346 		return (NULL);
347 
348 	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
349 	    map_check_linkage(mtm));
350 
351 	mutex_enter(&mtm->mtm_mutex);
352 	for (mer = NULL, hnb = 0; nb; nb -= hnb, mof += hnb) {
353 		hnb = MAPBLOCKSIZE - (mof & MAPBLOCKOFF);
354 		if (hnb > nb)
355 			hnb = nb;
356 		/*
357 		 * remove entries from hash and return as an aged linked list
358 		 */
359 		mep = MAP_HASH(mof, mtm);
360 		while ((me = *mep) != 0) {
361 			if (MEwithinDATA(me, mof, hnb)) {
362 				*mep = me->me_hash;
363 				me->me_next->me_prev = me->me_prev;
364 				me->me_prev->me_next = me->me_next;
365 				me->me_hash = mer;
366 				mer = me;
367 				me->me_flags |= ME_LIST;
368 				me->me_flags &= ~ME_HASH;
369 				mtm->mtm_nme--;
370 			} else
371 				mep = &me->me_hash;
372 		}
373 	}
374 	mutex_exit(&mtm->mtm_mutex);
375 
376 	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
377 	    map_check_linkage(mtm));
378 
379 	return (mer);
380 }
381 
382 /*
383  * delete entries within (mof, nb)
384  */
385 void
386 deltamap_del(mt_map_t *mtm, offset_t mof, off_t nb)
387 {
388 	mapentry_t	*me;
389 	mapentry_t	*menext;
390 
391 	menext = deltamap_remove(mtm, mof, nb);
392 	while ((me = menext) != 0) {
393 		menext = me->me_hash;
394 		kmem_cache_free(mapentry_cache, me);
395 	}
396 }
397 
398 /*
399  * Call the indicated function to cause deltas to move to the logmap.
400  * top_end_sync() is the only caller of this function and
401  * it has waited for the completion of all threads, so there can
402  * be no other activity in the deltamap. Therefore we don't need to
403  * hold the deltamap lock.
404  */
405 void
406 deltamap_push(ml_unit_t *ul)
407 {
408 	delta_t		dtyp;
409 	int		(*func)();
410 	ulong_t		arg;
411 	mapentry_t	*me;
412 	offset_t	mof;
413 	off_t		nb;
414 	mt_map_t	*mtm	= ul->un_deltamap;
415 
416 	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
417 	    map_check_linkage(mtm));
418 
419 	/*
420 	 * for every entry in the deltamap
421 	 */
422 	while ((me = mtm->mtm_next) != (mapentry_t *)mtm) {
423 		ASSERT(me->me_func);
424 		func = me->me_func;
425 		dtyp = me->me_dt;
426 		arg = me->me_arg;
427 		mof = me->me_mof;
428 		nb = me->me_nb;
429 		if ((ul->un_flags & LDL_ERROR) ||
430 		    (*func)(ul->un_ufsvfs, dtyp, arg))
431 			deltamap_del(mtm, mof, nb);
432 	}
433 
434 	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
435 	    map_check_linkage(mtm));
436 }
437 
438 /*
439  * LOGMAP ROUTINES
440  */
441 
442 int
443 logmap_need_commit(mt_map_t *mtm)
444 {
445 	return ((mtm->mtm_nmet > logmap_maxnme_commit) ||
446 	    (mtm->mtm_cfrags >= mtm->mtm_cfragmax));
447 }
448 
449 int
450 logmap_need_roll_async(mt_map_t *mtm)
451 {
452 	return (mtm->mtm_nme > logmap_maxnme_async);
453 }
454 
455 int
456 logmap_need_roll_sync(mt_map_t *mtm)
457 {
458 	return (mtm->mtm_nme > logmap_maxnme_sync);
459 }
460 
461 void
462 logmap_start_roll(ml_unit_t *ul)
463 {
464 	mt_map_t	*logmap	= ul->un_logmap;
465 
466 	logmap_settail(logmap, ul);
467 	ASSERT(!(ul->un_flags & LDL_NOROLL));
468 	mutex_enter(&logmap->mtm_mutex);
469 	if ((logmap->mtm_flags & MTM_ROLL_RUNNING) == 0) {
470 		logmap->mtm_flags |= MTM_ROLL_RUNNING;
471 		logmap->mtm_flags &= ~(MTM_FORCE_ROLL | MTM_ROLL_EXIT);
472 		(void) thread_create(NULL, 0, trans_roll, ul, 0, &p0,
473 		    TS_RUN, minclsyspri);
474 	}
475 	mutex_exit(&logmap->mtm_mutex);
476 }
477 
478 void
479 logmap_kill_roll(ml_unit_t *ul)
480 {
481 	mt_map_t	*mtm	= ul->un_logmap;
482 
483 	if (mtm == NULL)
484 		return;
485 
486 	mutex_enter(&mtm->mtm_mutex);
487 
488 	while (mtm->mtm_flags & MTM_ROLL_RUNNING) {
489 		mtm->mtm_flags |= MTM_ROLL_EXIT;
490 		cv_signal(&mtm->mtm_to_roll_cv);
491 		cv_wait(&mtm->mtm_from_roll_cv, &mtm->mtm_mutex);
492 	}
493 	mutex_exit(&mtm->mtm_mutex);
494 }
495 
496 /*
497  * kick the roll thread if it's not doing anything
498  */
499 void
500 logmap_forceroll_nowait(mt_map_t *logmap)
501 {
502 	/*
503 	 * Don't need to lock mtm_mutex to read mtm_flags here as we
504 	 * don't care in the rare case when we get a transitional value
505 	 * of mtm_flags. Simply signalling the thread will wake it up
506 	 * and it will notice it has too many logmap entries.
507 	 */
508 	ASSERT(!(logmap->mtm_ul->un_flags & LDL_NOROLL));
509 	if ((logmap->mtm_flags & MTM_ROLLING) == 0) {
510 		cv_signal(&logmap->mtm_to_roll_cv);
511 	}
512 }
513 
514 /*
515  * kick the roll thread and wait for it to finish a cycle
516  */
517 void
518 logmap_forceroll(mt_map_t *mtm)
519 {
520 	mutex_enter(&mtm->mtm_mutex);
521 	if ((mtm->mtm_flags & MTM_FORCE_ROLL) == 0) {
522 		mtm->mtm_flags |= MTM_FORCE_ROLL;
523 		cv_signal(&mtm->mtm_to_roll_cv);
524 	}
525 	do {
526 		if ((mtm->mtm_flags & MTM_ROLL_RUNNING) == 0) {
527 			mtm->mtm_flags &= ~MTM_FORCE_ROLL;
528 			goto out;
529 		}
530 		cv_wait(&mtm->mtm_from_roll_cv, &mtm->mtm_mutex);
531 	} while (mtm->mtm_flags & MTM_FORCE_ROLL);
532 out:
533 	mutex_exit(&mtm->mtm_mutex);
534 }
535 
536 /*
537  * remove rolled deltas within (mof, nb) and free them
538  */
539 void
540 logmap_remove_roll(mt_map_t *mtm, offset_t mof, off_t nb)
541 {
542 	int		dolock = 0;
543 	off_t		hnb;
544 	mapentry_t	*me;
545 	mapentry_t	**mep;
546 	offset_t	savmof	= mof;
547 	off_t		savnb	= nb;
548 
549 	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
550 	    map_check_linkage(mtm));
551 
552 again:
553 	if (dolock)
554 		rw_enter(&mtm->mtm_rwlock, RW_WRITER);
555 	mutex_enter(&mtm->mtm_mutex);
556 	for (hnb = 0; nb; nb -= hnb, mof += hnb) {
557 		hnb = MAPBLOCKSIZE - (mof & MAPBLOCKOFF);
558 		if (hnb > nb)
559 			hnb = nb;
560 		/*
561 		 * remove and free the rolled entries
562 		 */
563 		mep = MAP_HASH(mof, mtm);
564 		while ((me = *mep) != 0) {
565 			if ((me->me_flags & ME_ROLL) &&
566 			    (MEwithinDATA(me, mof, hnb))) {
567 				if (me->me_flags & ME_AGE) {
568 					ASSERT(dolock == 0);
569 					dolock = 1;
570 					mutex_exit(&mtm->mtm_mutex);
571 					mof = savmof;
572 					nb = savnb;
573 					goto again;
574 				}
575 				*mep = me->me_hash;
576 				me->me_next->me_prev = me->me_prev;
577 				me->me_prev->me_next = me->me_next;
578 				me->me_flags &= ~(ME_HASH|ME_ROLL);
579 				ASSERT(!(me->me_flags & ME_USER));
580 				mtm->mtm_nme--;
581 				/*
582 				 * cancelled entries are handled by someone else
583 				 */
584 				if ((me->me_flags & ME_CANCEL) == 0) {
585 					roll_stats[me->me_dt]++;
586 					CRB_RELE(me);
587 					kmem_cache_free(mapentry_cache, me);
588 				}
589 			} else
590 				mep = &me->me_hash;
591 		}
592 	}
593 	mutex_exit(&mtm->mtm_mutex);
594 
595 	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
596 	    map_check_linkage(mtm));
597 
598 	if (dolock)
599 		rw_exit(&mtm->mtm_rwlock);
600 }
601 
602 /*
603  * Find the disk offset of the next delta to roll.
604  * Returns 0: no more deltas to roll or a transaction is being committed
605  *	   1: a delta to roll has been found and *mofp points
606  *	      to the master file disk offset
607  */
608 int
609 logmap_next_roll(mt_map_t *logmap, offset_t *mofp)
610 {
611 	mapentry_t *me;
612 
613 	ASSERT(((logmap->mtm_debug & MT_CHECK_MAP) == 0) ||
614 	    map_check_linkage(logmap));
615 
616 	mutex_enter(&logmap->mtm_mutex);
617 	for (me = logmap->mtm_next; me != (mapentry_t *)logmap;
618 	    me = me->me_next) {
619 		/* already rolled */
620 		if (me->me_flags & ME_ROLL) {
621 			continue;
622 		}
623 
624 		/* part of currently busy transaction; stop */
625 		if (me->me_tid == logmap->mtm_tid) {
626 			break;
627 		}
628 
629 		/* part of commit-in-progress transaction; stop */
630 		if (me->me_tid == logmap->mtm_committid) {
631 			break;
632 		}
633 
634 		/*
635 		 * We shouldn't see a DT_CANCEL mapentry whose
636 		 * tid != mtm_committid, or != mtm_tid since
637 		 * these are removed at the end of each committed
638 		 * transaction.
639 		 */
640 		ASSERT(!(me->me_dt == DT_CANCEL));
641 
642 		*mofp = me->me_mof;
643 		mutex_exit(&logmap->mtm_mutex);
644 		return (1);
645 	}
646 	mutex_exit(&logmap->mtm_mutex);
647 	return (0);
648 }
649 
650 /*
651  * put mapentry on sorted age list
652  */
653 static void
654 logmap_list_age(mapentry_t **age, mapentry_t *meadd)
655 {
656 	mapentry_t	*me;
657 
658 	ASSERT(!(meadd->me_flags & (ME_AGE|ME_LIST)));
659 
660 	for (me = *age; me; age = &me->me_agenext, me = *age) {
661 		if (me->me_age > meadd->me_age)
662 			break;
663 	}
664 	meadd->me_agenext = me;
665 	meadd->me_flags |= ME_AGE;
666 	*age = meadd;
667 }
668 
669 /*
670  * get a list of deltas within <mof, mof+nb>
671  *	returns with mtm_rwlock held
672  *	return value says whether the entire mof range is covered by deltas
673  */
674 int
675 logmap_list_get(
676 	mt_map_t *mtm,
677 	offset_t mof,
678 	off_t nb,
679 	mapentry_t **age)
680 {
681 	off_t		hnb;
682 	mapentry_t	*me;
683 	mapentry_t	**mep;
684 	int		rwtype	= RW_READER;
685 	offset_t	savmof	= mof;
686 	off_t		savnb	= nb;
687 	int		entire	= 0;
688 	crb_t		*crb;
689 
690 	mtm->mtm_ref = 1;
691 again:
692 
693 	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
694 	    map_check_linkage(mtm));
695 
696 	rw_enter(&mtm->mtm_rwlock, rwtype);
697 	*age = NULL;
698 	mutex_enter(&mtm->mtm_mutex);
699 	for (hnb = 0; nb; nb -= hnb, mof += hnb) {
700 		hnb = MAPBLOCKSIZE - (mof & MAPBLOCKOFF);
701 		if (hnb > nb)
702 			hnb = nb;
703 		/*
704 		 * find overlapping entries
705 		 */
706 		mep = MAP_HASH(mof, mtm);
707 		for (me = *mep; me; me = me->me_hash) {
708 			if (me->me_dt == DT_CANCEL)
709 				continue;
710 			if (!DATAoverlapME(mof, hnb, me))
711 				continue;
712 			/*
713 			 * check if map entry is in use
714 			 * (about to be rolled).
715 			 */
716 			if (me->me_flags & ME_AGE) {
717 				/*
718 				 * reset the age bit in the list,
719 				 * upgrade the lock, and try again
720 				 */
721 				for (me = *age; me; me = *age) {
722 					*age = me->me_agenext;
723 					me->me_flags &= ~ME_AGE;
724 				}
725 				mutex_exit(&mtm->mtm_mutex);
726 				rw_exit(&mtm->mtm_rwlock);
727 				rwtype = RW_WRITER;
728 				mof = savmof;
729 				nb = savnb;
730 				entire = 0;
731 				goto again;
732 			} else {
733 				/* add mapentry to age ordered list */
734 				logmap_list_age(age, me);
735 				crb = me->me_crb;
736 				if (crb) {
737 					if (DATAwithinCRB(savmof, savnb, crb)) {
738 						entire = 1;
739 					}
740 				} else {
741 					if (DATAwithinME(savmof, savnb, me)) {
742 						entire = 1;
743 					}
744 				}
745 			}
746 		}
747 	}
748 	mutex_exit(&mtm->mtm_mutex);
749 
750 	ASSERT(RW_LOCK_HELD(&mtm->mtm_rwlock));
751 	return (entire);
752 }
753 
754 /*
755  * Get a list of deltas for rolling - returns success or failure.
756  * Also return the cached roll buffer if all deltas point to it.
757  */
758 int
759 logmap_list_get_roll(mt_map_t *logmap, offset_t mof, rollbuf_t *rbp)
760 {
761 	mapentry_t	*me, **mep, *age = NULL;
762 	crb_t		*crb = NULL;
763 
764 	ASSERT(RW_LOCK_HELD(&logmap->mtm_rwlock));
765 	ASSERT(((logmap->mtm_debug & MT_CHECK_MAP) == 0) ||
766 	    map_check_linkage(logmap));
767 	ASSERT((mof & MAPBLOCKOFF) == 0);
768 
769 	rbp->rb_crb = NULL;
770 
771 	/*
772 	 * find overlapping entries
773 	 */
774 	mutex_enter(&logmap->mtm_mutex);
775 	mep = MAP_HASH(mof, logmap);
776 	for (me = *mep; me; me = me->me_hash) {
777 		if (!DATAoverlapME(mof, MAPBLOCKSIZE, me))
778 			continue;
779 		if (me->me_tid == logmap->mtm_tid)
780 			continue;
781 		if (me->me_tid == logmap->mtm_committid)
782 			continue;
783 		if (me->me_dt == DT_CANCEL)
784 			continue;
785 
786 		/*
787 		 * Check if map entry is in use (by lufs_read_strategy())
788 		 * and if so reset the age bit in the list,
789 		 * upgrade the lock, and try again
790 		 */
791 		if (me->me_flags & ME_AGE) {
792 			for (me = age; me; me = age) {
793 				age = me->me_agenext;
794 				me->me_flags &= ~ME_AGE;
795 			}
796 			mutex_exit(&logmap->mtm_mutex);
797 			return (1); /* failure */
798 		} else {
799 			/* add mapentry to age ordered list */
800 			logmap_list_age(&age, me);
801 		}
802 	}
803 	if (!age) {
804 		goto out;
805 	}
806 
807 	/*
808 	 * Mark the deltas as being rolled.
809 	 */
810 	for (me = age; me; me = me->me_agenext) {
811 		me->me_flags |= ME_ROLL;
812 	}
813 
814 	/*
815 	 * Test if all deltas are covered by one valid roll buffer
816 	 */
817 	crb = age->me_crb;
818 	if (crb && !(crb->c_invalid)) {
819 		for (me = age; me; me = me->me_agenext) {
820 			if (me->me_crb != crb) {
821 				crb = NULL;
822 				break;
823 			}
824 		}
825 		rbp->rb_crb = crb;
826 	}
827 out:
828 	rbp->rb_age = age;
829 
830 	mutex_exit(&logmap->mtm_mutex);
831 
832 	ASSERT(((logmap->mtm_debug & MT_SCAN) == 0) ||
833 	    logmap_logscan_debug(logmap, age));
834 	ASSERT(RW_LOCK_HELD(&logmap->mtm_rwlock));
835 	return (0); /* success */
836 }
837 
838 void
839 logmap_list_put_roll(mt_map_t *mtm, mapentry_t *age)
840 {
841 	mapentry_t	*me;
842 
843 	ASSERT(RW_LOCK_HELD(&mtm->mtm_rwlock));
844 	mutex_enter(&mtm->mtm_mutex);
845 	for (me = age; me; me = age) {
846 		age = me->me_agenext;
847 		me->me_flags &= ~ME_AGE;
848 	}
849 	mutex_exit(&mtm->mtm_mutex);
850 }
851 
852 void
853 logmap_list_put(mt_map_t *mtm, mapentry_t *age)
854 {
855 	mapentry_t	*me;
856 
857 	ASSERT(RW_LOCK_HELD(&mtm->mtm_rwlock));
858 	mutex_enter(&mtm->mtm_mutex);
859 	for (me = age; me; me = age) {
860 		age = me->me_agenext;
861 		me->me_flags &= ~ME_AGE;
862 	}
863 	mutex_exit(&mtm->mtm_mutex);
864 	rw_exit(&mtm->mtm_rwlock);
865 }
866 
867 #define	UFS_RW_BALANCE 2
868 int ufs_rw_balance = UFS_RW_BALANCE;
869 
870 /*
871  * Check if we need to read the master.
872  * The master does not need to be read if the log deltas to the
873  * block are for one contiguous set of full disk sectors.
874  * Cylinder group bit maps DT_CG (8K), directory entries (512B),
875  * and possibly others should not require master disk reads.
876  * Calculate the sector map for writing later.
877  */
878 int
879 logmap_setup_read(mapentry_t *age, rollbuf_t *rbp)
880 {
881 	offset_t mof;
882 	crb_t *crb;
883 	mapentry_t *me;
884 	int32_t nb;
885 	int i;
886 	int start_sec, end_sec;
887 	int read_needed = 0;
888 	int all_inodes = 1;
889 	int first_sec = INT_MAX;
890 	int last_sec = -1;
891 	rbsecmap_t secmap = 0;
892 
893 	/* LINTED: warning: logical expression always true: op "||" */
894 	ASSERT((MAPBLOCKSIZE / DEV_BSIZE) == (sizeof (secmap) * NBBY));
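	/*
	 * secmap has one bit per DEV_BSIZE sector of the MAPBLOCKSIZE
	 * block being rolled; with 512-byte sectors and an 8K map block
	 * (the usual values) that is a 16-bit mask, which the ASSERT
	 * above verifies.
	 */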
895 
896 	for (me = age; me; me = me->me_agenext) {
897 		crb = me->me_crb;
898 		if (crb) {
899 			nb = crb->c_nb;
900 			mof = crb->c_mof;
901 		} else {
902 			nb = me->me_nb;
903 			mof = me->me_mof;
904 		}
905 
906 		/*
907 		 * If the delta is not sector aligned then
908 		 * read the whole block.
909 		 */
910 		if ((nb & DEV_BMASK) || (mof & DEV_BMASK)) {
911 			read_needed = 1;
912 		}
913 
914 		/* Set sector map used in the MAPBLOCKSIZE block.  */
915 		start_sec = (mof & MAPBLOCKOFF) >> DEV_BSHIFT;
916 		end_sec = start_sec + ((nb - 1) >> DEV_BSHIFT);
917 		for (i = start_sec; i <= end_sec; i++) {
918 			secmap |= UINT16_C(1) << i;
919 		}
920 
921 		if (me->me_dt != DT_INODE) {
922 			all_inodes = 0;
923 		}
924 		if (start_sec < first_sec) {
925 			first_sec = start_sec;
926 		}
927 		if (end_sec > last_sec) {
928 			last_sec = end_sec;
929 		}
930 	}
931 
932 	ASSERT(secmap);
933 	ASSERT(first_sec != INT_MAX);
934 	ASSERT(last_sec != -1);
935 
936 	if (all_inodes) {
937 		/*
938 		 * Here we have a tradeoff choice. It must be better to
939 		 * do 2 writes in the same MAPBLOCKSIZE chunk than a
940 		 * read and a write. But what about 3 or more writes, versus
941 		 * a read+write? Where is the cutover? It will depend on
942 		 * the track caching, scsi driver and other activity.
943 		 * An unpublished tunable is defined (ufs_rw_balance) that
944 		 * currently defaults to 2.
945 		 */
946 		if (!read_needed) {
947 			int count = 0, gap = 0;
948 			int sector_set; /* write needed to this sector */
949 
950 			/* Count the gaps (every 1 to 0 transition) */
951 			for (i = first_sec + 1; i < last_sec; i++) {
952 				sector_set = secmap & (UINT16_C(1) << i);
953 				if (!gap && !sector_set) {
954 					gap = 1;
955 					count++;
956 					if (count > ufs_rw_balance) {
957 						read_needed = 1;
958 						break;
959 					}
960 				} else if (gap && sector_set) {
961 					gap = 0;
962 				}
963 			}
964 		}
965 
966 		/*
967 		 * Inodes commonly make up the majority (~85%) of deltas.
968 		 * They cannot contain embedded user data, so it's safe to
969 		 * read and write them all in one IO.
970 		 * But for directory entries, shadow inode data, and
971 		 * quota record data the user data fragments can be embedded
972 		 * between those metadata, and so it's not safe to read, modify
973 		 * then write the entire range as asynchronous user data
974 		 * writes could get overwritten with old data.
975 		 * Thus we have to create a segment map of meta data that
976 		 * needs to get written.
977 		 *
978 		 * If user data was logged then this issue would go away.
979 		 */
980 		if (read_needed) {
981 			for (i = first_sec + 1; i < last_sec; i++) {
982 				secmap |= (UINT16_C(1) << i);
983 			}
984 		}
985 	}
986 	rbp->rb_secmap = secmap;
987 	return (read_needed);
988 }
989 
990 /*
991  * Abort the load of a set of log map deltas.
992  * i.e.,
993  * Clear out all mapentries on this unit's log map
994  * which have a tid (transaction id) equal to the
995  * parameter tid.   Walk the cancel list, taking everything
996  * off it, too.
997  */
998 static void
999 logmap_abort(ml_unit_t *ul, uint32_t tid)
1000 {
1001 	struct mt_map	*mtm = ul->un_logmap;	/* Log map */
1002 	mapentry_t	*me, **mep;
1003 	int		i;
1004 
1005 	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
1006 	    map_check_linkage(mtm));
1007 
1008 	/*
1009 	 * wait for any outstanding reads to finish; lock out future reads
1010 	 */
1011 	rw_enter(&mtm->mtm_rwlock, RW_WRITER);
1012 
1013 	mutex_enter(&mtm->mtm_mutex);
1014 	/* Take everything off cancel list */
1015 	while ((me = mtm->mtm_cancel) != NULL) {
1016 		mtm->mtm_cancel = me->me_cancel;
1017 		me->me_flags &= ~ME_CANCEL;
1018 		me->me_cancel = NULL;
1019 	}
1020 
1021 	/*
1022 	 * Now take out all mapentries with the current tid and committid,
1023 	 * as this function is called from logmap_logscan and logmap_commit.
1024 	 * When it is called from logmap_logscan, mtm_tid == mtm_committid.
1025 	 * But when logmap_abort is called from logmap_commit it is
1026 	 * because the log errored when trying to write the commit record,
1027 	 * after the async ops have been allowed to start in top_end_sync.
1028 	 * So we also need to remove all mapentries from the transaction whose
1029 	 * commit failed.
1030 	 */
1031 	for (i = 0; i < mtm->mtm_nhash; i++) {
1032 		mep = &mtm->mtm_hash[i];
1033 		while ((me = *mep) != NULL) {
1034 			if (me->me_tid == tid ||
1035 			    me->me_tid == mtm->mtm_committid) {
1036 				*mep = me->me_hash;
1037 				me->me_next->me_prev = me->me_prev;
1038 				me->me_prev->me_next = me->me_next;
1039 				if (!(me->me_flags & ME_USER)) {
1040 					mtm->mtm_nme--;
1041 				}
1042 				CRB_RELE(me);
1043 				kmem_cache_free(mapentry_cache, me);
1044 				continue;
1045 			}
1046 			mep = &me->me_hash;
1047 		}
1048 	}
1049 
1050 	if (!(ul->un_flags & LDL_SCAN))
1051 		mtm->mtm_flags |= MTM_CANCELED;
1052 	mutex_exit(&mtm->mtm_mutex);
1053 	mtm->mtm_dirty = 0;
1054 	mtm->mtm_nmet = 0;
1055 	rw_exit(&mtm->mtm_rwlock);
1056 
1057 	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
1058 	    map_check_linkage(mtm));
1059 }
1060 
1061 static void
1062 logmap_wait_space(mt_map_t *mtm, ml_unit_t *ul, mapentry_t *me)
1063 {
1064 	ASSERT(MUTEX_HELD(&ul->un_log_mutex));
1065 
1066 	while (!ldl_has_space(ul, me)) {
1067 		ASSERT(!(ul->un_flags & LDL_NOROLL));
1068 		mutex_exit(&ul->un_log_mutex);
1069 		logmap_forceroll(mtm);
1070 		mutex_enter(&ul->un_log_mutex);
1071 		if (ul->un_flags & LDL_ERROR)
1072 			break;
1073 	}
1074 
1075 	ASSERT(MUTEX_HELD(&ul->un_log_mutex));
1076 }
1077 
1078 /*
1079  * put a list of deltas into a logmap
1080  * If va == NULL, don't write to the log.
1081  */
1082 void
1083 logmap_add(
1084 	ml_unit_t *ul,
1085 	char *va,			/* Ptr to buf w/deltas & data */
1086 	offset_t vamof,			/* Offset on master of buf start */
1087 	mapentry_t *melist)		/* Entries to add */
1088 {
1089 	offset_t	mof;
1090 	off_t		nb;
1091 	mapentry_t	*me;
1092 	mapentry_t	**mep;
1093 	mapentry_t	**savmep;
1094 	uint32_t	tid;
1095 	mt_map_t	*mtm	= ul->un_logmap;
1096 
1097 	mutex_enter(&ul->un_log_mutex);
1098 	if (va)
1099 		logmap_wait_space(mtm, ul, melist);
1100 
1101 	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
1102 	    map_check_linkage(mtm));
1103 
1104 	mtm->mtm_ref = 1;
1105 	mtm->mtm_dirty++;
1106 	tid = mtm->mtm_tid;
1107 	while (melist) {
1108 		mof = melist->me_mof;
1109 		nb  = melist->me_nb;
1110 
1111 		/*
1112 		 * search for overlapping entries
1113 		 */
1114 		savmep = mep = MAP_HASH(mof, mtm);
1115 		mutex_enter(&mtm->mtm_mutex);
1116 		while ((me = *mep) != 0) {
1117 			/*
1118 			 * Data consumes old map entry; cancel map entry.
1119 			 * Take care when we replace an old map entry
1120 			 * which carries quota information with a newer entry
1121 			 * which does not. In that case the push function
1122 			 * would not be called to clean up the dquot structure.
1123 			 * This would be found later by invalidatedq() causing
1124 			 * a panic when the filesystem is unmounted.
1125 			 * We clean up the dquot manually and then replace
1126 			 * the map entry.
1127 			 */
1128 			if (MEwithinDATA(me, mof, nb) &&
1129 			    ((me->me_flags & (ME_ROLL|ME_CANCEL)) == 0)) {
1130 				if (tid == me->me_tid &&
1131 				    ((me->me_flags & ME_AGE) == 0)) {
1132 					*mep = me->me_hash;
1133 					me->me_next->me_prev = me->me_prev;
1134 					me->me_prev->me_next = me->me_next;
1135 					ASSERT(!(me->me_flags & ME_USER));
1136 					mtm->mtm_nme--;
1137 					/*
1138 					 * Special case if the mapentry
1139 					 * carries a dquot and a push function.
1140 					 * We have to clean up the quota info
1141 					 * before replacing the mapentry.
1142 					 */
1143 					if (me->me_dt == DT_QR)
1144 						HANDLE_DQUOT(me, melist);
1145 
1146 					kmem_cache_free(mapentry_cache, me);
1147 					continue;
1148 				}
1149 				me->me_cancel = mtm->mtm_cancel;
1150 				mtm->mtm_cancel = me;
1151 				me->me_flags |= ME_CANCEL;
1152 			}
1153 			mep = &(*mep)->me_hash;
1154 		}
1155 		mutex_exit(&mtm->mtm_mutex);
1156 
1157 		/*
1158 		 * remove from list
1159 		 */
1160 		me = melist;
1161 		melist = melist->me_hash;
1162 		me->me_flags &= ~ME_LIST;
1163 		/*
1164 		 * If va != NULL, put in the log.
1165 		 */
1166 		if (va)
1167 			ldl_write(ul, va, vamof, me);
1168 		if (ul->un_flags & LDL_ERROR) {
1169 			kmem_cache_free(mapentry_cache, me);
1170 			continue;
1171 		}
1172 		ASSERT((va == NULL) ||
1173 		    ((mtm->mtm_debug & MT_LOG_WRITE_CHECK) == 0) ||
1174 		    map_check_ldl_write(ul, va, vamof, me));
1175 
1176 		/*
1177 		 * put on hash
1178 		 */
1179 		mutex_enter(&mtm->mtm_mutex);
1180 		me->me_hash = *savmep;
1181 		*savmep = me;
1182 		me->me_next = (mapentry_t *)mtm;
1183 		me->me_prev = mtm->mtm_prev;
1184 		mtm->mtm_prev->me_next = me;
1185 		mtm->mtm_prev = me;
1186 		me->me_flags |= ME_HASH;
1187 		me->me_tid = tid;
1188 		me->me_age = mtm->mtm_age++;
1189 		mtm->mtm_nme++;
1190 		mtm->mtm_nmet++;
1191 		mutex_exit(&mtm->mtm_mutex);
1192 	}
1193 
1194 	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
1195 	    map_check_linkage(mtm));
1196 	mutex_exit(&ul->un_log_mutex);
1197 }
1198 
1199 /*
1200  * Add the delta(s) into the log.
1201  * Create one cached roll buffer logmap entry, and reference count the
1202  * number of mapentries refering to it.
1203  * Cancel previous logmap entries.
1204  * logmap_add is tolerant of failure to allocate a cached roll buffer.
1205  */
1206 void
1207 logmap_add_buf(
1208 	ml_unit_t *ul,
1209 	char *va,			/* Ptr to buf w/deltas & data */
1210 	offset_t bufmof,		/* Offset on master of buf start */
1211 	mapentry_t *melist,		/* Entries to add */
1212 	caddr_t	buf,			/* Buffer containing delta(s) */
1213 	uint32_t bufsz)			/* Size of buf */
1214 {
1215 	offset_t	mof;
1216 	offset_t	vamof = bufmof + (va - buf);
1217 	off_t		nb;
1218 	mapentry_t	*me;
1219 	mapentry_t	**mep;
1220 	mapentry_t	**savmep;
1221 	uint32_t	tid;
1222 	mt_map_t	*mtm	= ul->un_logmap;
1223 	crb_t		*crb;
1224 	crb_t		*crbsav = NULL;
1225 
1226 	ASSERT((bufsz & DEV_BMASK) == 0);
1227 	mutex_enter(&ul->un_log_mutex);
1228 	logmap_wait_space(mtm, ul, melist);
1229 
1230 	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
1231 	    map_check_linkage(mtm));
1232 
1233 	mtm->mtm_ref = 1;
1234 	mtm->mtm_dirty++;
1235 	tid = mtm->mtm_tid;
1236 	while (melist) {
1237 		mof = melist->me_mof;
1238 		nb  = melist->me_nb;
1239 
1240 		/*
1241 		 * search for overlapping entries
1242 		 */
1243 		savmep = mep = MAP_HASH(mof, mtm);
1244 		mutex_enter(&mtm->mtm_mutex);
1245 		while ((me = *mep) != 0) {
1246 			/*
1247 			 * Data consumes old map entry; cancel map entry.
1248 			 * Take care when we replace an old map entry
1249 			 * which carries quota information with a newer entry
1250 			 * which does not. In that case the push function
1251 			 * would not be called to clean up the dquot structure.
1252 			 * This would be found later by invalidatedq() causing
1253 			 * a panic when the filesystem is unmounted.
1254 			 * We clean up the dquot manually and then replace
1255 			 * the map entry.
1256 			 */
1257 			crb = me->me_crb;
1258 			if (MEwithinDATA(me, mof, nb) &&
1259 			    ((me->me_flags & (ME_ROLL|ME_CANCEL)) == 0)) {
1260 				if (tid == me->me_tid &&
1261 				    ((me->me_flags & ME_AGE) == 0)) {
1262 					*mep = me->me_hash;
1263 					me->me_next->me_prev = me->me_prev;
1264 					me->me_prev->me_next = me->me_next;
1265 					ASSERT(!(me->me_flags & ME_USER));
1266 					mtm->mtm_nme--;
1267 					/*
1268 					 * Special case if the mapentry
1269 					 * carries a dquot and a push function.
1270 					 * We have to clean up the quota info
1271 					 * before replacing the mapentry.
1272 					 */
1273 					if (me->me_dt == DT_QR)
1274 						HANDLE_DQUOT(me, melist);
1275 
1276 					/*
1277 					 * If this soon to be deleted mapentry
1278 					 * has a suitable roll buffer then
1279 					 * re-use it.
1280 					 */
1281 					if (crb && (--crb->c_refcnt == 0)) {
1282 						if (crbsav ||
1283 						    (crb->c_nb != bufsz)) {
1284 							CRB_FREE(crb, me);
1285 						} else {
1286 							bcopy(buf, crb->c_buf,
1287 							    bufsz);
1288 							crb->c_invalid = 0;
1289 							crb->c_mof = bufmof;
1290 							crbsav = crb;
1291 							me->me_crb = NULL;
1292 						}
1293 					}
1294 					kmem_cache_free(mapentry_cache, me);
1295 					continue;
1296 				}
1297 				me->me_cancel = mtm->mtm_cancel;
1298 				mtm->mtm_cancel = me;
1299 				me->me_flags |= ME_CANCEL;
1300 			}
1301 
1302 			/*
1303 			 * Inode deltas within the same fs block come
1304 			 * in individually as separate calls to logmap_add().
1305 			 * All others come in as one call. So check for an
1306 			 * existing entry where we can re-use the crb.
1307 			 */
1308 			if ((me->me_dt == DT_INODE) && (tid == me->me_tid) &&
1309 			    !crbsav && crb &&
1310 			    WITHIN(mof, nb, crb->c_mof, crb->c_nb)) {
1311 				ASSERT(crb->c_mof == bufmof);
1312 				ASSERT(crb->c_nb == bufsz);
1313 				bcopy(buf, crb->c_buf, bufsz);
1314 				crbsav = crb;
1315 			}
1316 			mep = &(*mep)->me_hash;
1317 		}
1318 		mutex_exit(&mtm->mtm_mutex);
1319 
1320 		/*
1321 		 * If we don't already have a crb then allocate one
1322 		 * and copy the incoming buffer. Only do this once
1323 		 * for all the incoming deltas.
1324 		 */
1325 		if ((crbsav == NULL) && (melist->me_dt != DT_ABZERO)) {
1326 			/*
1327 			 * Only use a cached roll buffer if we
1328 			 * have enough memory, and check for failures.
1329 			 */
1330 			if (((ufs_crb_size + bufsz) < ufs_crb_limit) &&
1331 			    (kmem_avail() > bufsz)) {
1332 				crbsav = kmem_alloc(sizeof (crb_t), KM_NOSLEEP);
1333 			} else {
1334 				ufs_crb_alloc_fails++;
1335 			}
1336 			if (crbsav) {
1337 				crbsav->c_buf = kmem_alloc(bufsz, KM_NOSLEEP);
1338 				if (crbsav->c_buf) {
1339 					atomic_add_64(&ufs_crb_size,
1340 					    (uint64_t)bufsz);
1341 					if (ufs_crb_size > ufs_crb_max_size) {
1342 						ufs_crb_max_size = ufs_crb_size;
1343 					}
1344 					bcopy(buf, crbsav->c_buf, bufsz);
1345 					crbsav->c_nb = bufsz;
1346 					crbsav->c_refcnt = 0;
1347 					crbsav->c_invalid = 0;
1348 					ASSERT((bufmof & DEV_BMASK) == 0);
1349 					crbsav->c_mof = bufmof;
1350 				} else {
1351 					kmem_free(crbsav, sizeof (crb_t));
1352 					crbsav = NULL;
1353 				}
1354 			}
1355 		}
1356 
1357 		/*
1358 		 * remove from list
1359 		 */
1360 		me = melist;
1361 		melist = melist->me_hash;
1362 		me->me_flags &= ~ME_LIST;
1363 		me->me_crb = crbsav;
1364 		if (crbsav) {
1365 			crbsav->c_refcnt++;
1366 		}
1367 		crbsav = NULL;
1368 
1369 		ASSERT(va);
1370 		ldl_write(ul, va, vamof, me); /* add to on-disk log */
1371 		if (ul->un_flags & LDL_ERROR) {
1372 			CRB_RELE(me);
1373 			kmem_cache_free(mapentry_cache, me);
1374 			continue;
1375 		}
1376 		ASSERT(((mtm->mtm_debug & MT_LOG_WRITE_CHECK) == 0) ||
1377 		    map_check_ldl_write(ul, va, vamof, me));
1378 
1379 		/*
1380 		 * put on hash
1381 		 */
1382 		mutex_enter(&mtm->mtm_mutex);
1383 		me->me_hash = *savmep;
1384 		*savmep = me;
1385 		me->me_next = (mapentry_t *)mtm;
1386 		me->me_prev = mtm->mtm_prev;
1387 		mtm->mtm_prev->me_next = me;
1388 		mtm->mtm_prev = me;
1389 		me->me_flags |= ME_HASH;
1390 		me->me_tid = tid;
1391 		me->me_age = mtm->mtm_age++;
1392 		mtm->mtm_nme++;
1393 		mtm->mtm_nmet++;
1394 		mutex_exit(&mtm->mtm_mutex);
1395 	}
1396 
1397 	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
1398 	    map_check_linkage(mtm));
1399 	mutex_exit(&ul->un_log_mutex);
1400 }
1401 
1402 /*
1403  * free up any cancelled deltas
1404  */
1405 void
1406 logmap_free_cancel(mt_map_t *mtm, mapentry_t **cancelhead)
1407 {
1408 	int		dolock	= 0;
1409 	mapentry_t	*me;
1410 	mapentry_t	**mep;
1411 
1412 	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
1413 	    map_check_linkage(mtm));
1414 
1415 again:
1416 	if (dolock)
1417 		rw_enter(&mtm->mtm_rwlock, RW_WRITER);
1418 
1419 	/*
1420 	 * At EOT, cancel the indicated deltas
1421 	 */
1422 	mutex_enter(&mtm->mtm_mutex);
1423 	if (mtm->mtm_flags & MTM_CANCELED) {
1424 		mtm->mtm_flags &= ~MTM_CANCELED;
1425 		ASSERT(dolock == 0);
1426 		mutex_exit(&mtm->mtm_mutex);
1427 		return;
1428 	}
1429 
1430 	while ((me = *cancelhead) != NULL) {
1431 		/*
1432 		 * roll forward or read collision; wait and try again
1433 		 */
1434 		if (me->me_flags & ME_AGE) {
1435 			ASSERT(dolock == 0);
1436 			mutex_exit(&mtm->mtm_mutex);
1437 			dolock = 1;
1438 			goto again;
1439 		}
1440 		/*
1441 		 * remove from cancel list
1442 		 */
1443 		*cancelhead = me->me_cancel;
1444 		me->me_cancel = NULL;
1445 		me->me_flags &= ~(ME_CANCEL);
1446 
1447 		/*
1448 		 * logmap_remove_roll handles ME_ROLL entries later
1449 		 *	we leave them around for logmap_iscancel
1450 		 *	XXX is this necessary?
1451 		 */
1452 		if (me->me_flags & ME_ROLL)
1453 			continue;
1454 
1455 		/*
1456 		 * remove from hash (if necessary)
1457 		 */
1458 		if (me->me_flags & ME_HASH) {
1459 			mep = MAP_HASH(me->me_mof, mtm);
1460 			while (*mep) {
1461 				if (*mep == me) {
1462 					*mep = me->me_hash;
1463 					me->me_next->me_prev = me->me_prev;
1464 					me->me_prev->me_next = me->me_next;
1465 					me->me_flags &= ~(ME_HASH);
1466 					if (!(me->me_flags & ME_USER)) {
1467 						mtm->mtm_nme--;
1468 					}
1469 					break;
1470 				} else
1471 					mep = &(*mep)->me_hash;
1472 			}
1473 		}
1474 		/*
1475 		 * put the entry on the free list
1476 		 */
1477 		CRB_RELE(me);
1478 		kmem_cache_free(mapentry_cache, me);
1479 	}
1480 	mutex_exit(&mtm->mtm_mutex);
1481 	if (dolock)
1482 		rw_exit(&mtm->mtm_rwlock);
1483 
1484 	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
1485 	    map_check_linkage(mtm));
1486 }
1487 
1488 
1489 void
1490 logmap_commit(ml_unit_t *ul, uint32_t tid)
1491 {
1492 	mapentry_t	me;
1493 	mt_map_t	*mtm	= ul->un_logmap;
1494 
1495 
1496 	ASSERT(MUTEX_HELD(&ul->un_log_mutex));
1497 
1498 	/*
1499 	 * async'ly write a commit rec into the log
1500 	 */
1501 	if (mtm->mtm_dirty) {
1502 		/*
1503 		 * put commit record into log
1504 		 */
1505 		me.me_mof = mtm->mtm_tid;
1506 		me.me_dt = DT_COMMIT;
1507 		me.me_nb = 0;
1508 		me.me_hash = NULL;
1509 		logmap_wait_space(mtm, ul, &me);
1510 		ldl_write(ul, NULL, (offset_t)0, &me);
1511 		ldl_round_commit(ul);
1512 
1513 		/*
1514 		 * abort on error; else reset dirty flag
1515 		 */
1516 		if (ul->un_flags & LDL_ERROR)
1517 			logmap_abort(ul, tid);
1518 		else {
1519 			mtm->mtm_dirty = 0;
1520 			mtm->mtm_nmet = 0;
1521 			mtm->mtm_cfrags = 0;
1522 		}
1523 		/* push commit */
1524 		ldl_push_commit(ul);
1525 	}
1526 }
1527 
1528 void
1529 logmap_sethead(mt_map_t *mtm, ml_unit_t *ul)
1530 {
1531 	off_t		lof;
1532 	uint32_t	tid;
1533 	mapentry_t	*me;
1534 
1535 	/*
1536 	 * move the head forward so the log knows how full it is
1537 	 * move the head forward so the log knows how full it is.
1538 	 * Make sure to skip any mapentry whose me_lof is 0; these
1539 	 * are just placeholders for DT_CANCEL'd freed user blocks
1540 	 */
1541 	mutex_enter(&ul->un_log_mutex);
1542 	mutex_enter(&mtm->mtm_mutex);
1543 	me = mtm->mtm_next;
1544 	while (me != (mapentry_t *)mtm && me->me_lof == 0) {
1545 		me = me->me_next;
1546 	}
1547 
1548 	if (me == (mapentry_t *)mtm)
1549 		lof = -1;
1550 	else {
1551 		lof = me->me_lof;
1552 		tid = me->me_tid;
1553 	}
1554 	mutex_exit(&mtm->mtm_mutex);
1555 	ldl_sethead(ul, lof, tid);
1556 	if (lof == -1)
1557 		mtm->mtm_age = 0;
1558 	mutex_exit(&ul->un_log_mutex);
1559 }
1560 
1561 void
1562 logmap_settail(mt_map_t *mtm, ml_unit_t *ul)
1563 {
1564 	off_t		lof;
1565 	size_t		nb;
1566 
1567 	/*
1568 	 * set the tail after the logmap_abort
1569 	 */
1570 	mutex_enter(&ul->un_log_mutex);
1571 	mutex_enter(&mtm->mtm_mutex);
1572 	if (mtm->mtm_prev == (mapentry_t *)mtm)
1573 		lof = -1;
1574 	else {
1575 		/*
1576 		 * set the tail to the end of the last commit
1577 		 */
1578 		lof = mtm->mtm_tail_lof;
1579 		nb = mtm->mtm_tail_nb;
1580 	}
1581 	mutex_exit(&mtm->mtm_mutex);
1582 	ldl_settail(ul, lof, nb);
1583 	mutex_exit(&ul->un_log_mutex);
1584 }
1585 
1586 /*
1587  * when resetting a device, roll the log until every
1588  * delta has been rolled forward
1589  */
1590 void
1591 logmap_roll_dev(ml_unit_t *ul)
1592 {
1593 	mt_map_t	*mtm	= ul->un_logmap;
1594 	mapentry_t	*me;
1595 	ufsvfs_t	*ufsvfsp = ul->un_ufsvfs;
1596 
1597 again:
1598 	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
1599 	    map_check_linkage(mtm));
1600 	if (ul->un_flags & (LDL_ERROR|LDL_NOROLL))
1601 		return;
1602 
1603 	/*
1604 	 * look for deltas
1605 	 */
1606 	mutex_enter(&mtm->mtm_mutex);
1607 	for (me = mtm->mtm_next; me != (mapentry_t *)mtm; me = me->me_next) {
1608 		if (me->me_flags & ME_ROLL)
1609 			break;
1610 		if (me->me_tid == mtm->mtm_tid)
1611 			continue;
1612 		if (me->me_tid == mtm->mtm_committid)
1613 			continue;
1614 		break;
1615 	}
1616 
1617 	/*
1618 	 * found a delta; kick the roll thread
1619 	 * but only if the thread is running... (jmh)
1620 	 */
1621 	if (me != (mapentry_t *)mtm) {
1622 		mutex_exit(&mtm->mtm_mutex);
1623 		logmap_forceroll(mtm);
1624 		goto again;
1625 	}
1626 
1627 	/*
1628 	 * no more deltas, return
1629 	 */
1630 	mutex_exit(&mtm->mtm_mutex);
1631 	(void) ufs_putsummaryinfo(ul->un_dev, ufsvfsp, ufsvfsp->vfs_fs);
1632 
1633 	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
1634 	    map_check_linkage(mtm));
1635 }
1636 
1637 static void
1638 logmap_cancel_delta(ml_unit_t *ul, offset_t mof, int32_t nb, int metadata)
1639 {
1640 	mapentry_t	*me;
1641 	mapentry_t	**mep;
1642 	mt_map_t	*mtm	= ul->un_logmap;
1643 	int		frags;
1644 
1645 	/*
1646 	 * map has been referenced and is dirty
1647 	 */
1648 	mtm->mtm_ref = 1;
1649 	mtm->mtm_dirty++;
1650 
1651 	/*
1652 	 * get a mapentry
1653 	 */
1654 	me = kmem_cache_alloc(mapentry_cache, KM_SLEEP);
1655 	bzero(me, sizeof (mapentry_t));
1656 
1657 	/*
1658 	 * initialize cancel record and put in logmap
1659 	 */
1660 	me->me_mof = mof;
1661 	me->me_nb = nb;
1662 	me->me_dt = DT_CANCEL;
1663 	me->me_tid = mtm->mtm_tid;
1664 	me->me_hash = NULL;
1665 
1666 	/*
1667 	 * Write delta to log if this delta is for metadata.  If this is not
1668 	 * metadata it is user data and we are just putting a cancel
1669 	 * mapentry into the hash to record a user block deletion whose
1670 	 * block we do not want to be allocated
1671 	 * within this moby.  This cancel entry will prevent the block from
1672 	 * being allocated within the moby and prevent user data corruption
1673 	 * if we happen to crash before this moby is committed.
1674 	 */
1675 	mutex_enter(&ul->un_log_mutex);
1676 	if (metadata) {
1677 		logmap_wait_space(mtm, ul, me);
1678 		ldl_write(ul, NULL, (offset_t)0, me);
1679 		if (ul->un_flags & LDL_ERROR) {
1680 			kmem_cache_free(mapentry_cache, me);
1681 			mutex_exit(&ul->un_log_mutex);
1682 			return;
1683 		}
1684 	}
1685 
1686 	/*
1687 	 * put in hash and on cancel list
1688 	 */
1689 	mep = MAP_HASH(mof, mtm);
1690 	mutex_enter(&mtm->mtm_mutex);
1691 	me->me_age = mtm->mtm_age++;
1692 	me->me_hash = *mep;
1693 	*mep = me;
1694 	me->me_next = (mapentry_t *)mtm;
1695 	me->me_prev = mtm->mtm_prev;
1696 	mtm->mtm_prev->me_next = me;
1697 	mtm->mtm_prev = me;
1698 	me->me_cancel = mtm->mtm_cancel;
1699 	mtm->mtm_cancel = me;
1700 	if (metadata) {
1701 		mtm->mtm_nme++;
1702 		mtm->mtm_nmet++;
1703 	} else {
1704 		me->me_flags = ME_USER;
1705 	}
1706 	me->me_flags |= (ME_HASH|ME_CANCEL);
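	/*
	 * For user data cancellations, account for the canceled
	 * fragments (the sub-block remainder of nb, converted to
	 * fragments).  logmap_need_commit() signals that a commit is
	 * needed once mtm_cfrags reaches mtm_cfragmax, and
	 * logmap_commit() resets the count.
	 */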
1707 	if (!(metadata)) {
1708 		frags = blkoff(ul->un_ufsvfs->vfs_fs, nb);
1709 		if (frags)
1710 			mtm->mtm_cfrags +=
1711 			    numfrags(ul->un_ufsvfs->vfs_fs, frags);
1712 	}
1713 	mutex_exit(&mtm->mtm_mutex);
1714 
1715 	mutex_exit(&ul->un_log_mutex);
1716 }
1717 
1718 /*
1719  * cancel entries in a logmap (entries are freed at EOT)
1720  */
1721 void
1722 logmap_cancel(ml_unit_t *ul, offset_t mof, off_t nb, int metadata)
1723 {
1724 	int32_t		hnb;
1725 	mapentry_t	*me;
1726 	mapentry_t	**mep;
1727 	mt_map_t	*mtm	= ul->un_logmap;
1728 	crb_t		*crb;
1729 
1730 	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
1731 	    map_check_linkage(mtm));
1732 
1733 	for (hnb = 0; nb; nb -= hnb, mof += hnb) {
1734 		hnb = MAPBLOCKSIZE - (mof & MAPBLOCKOFF);
1735 		if (hnb > nb)
1736 			hnb = nb;
1737 		/*
1738 		 * Find overlapping metadata entries.  Don't search through
1739 		 * the hash chains if this is user data because it is only
1740 		 * possible to have overlapping map entries for metadata,
1741 		 * and the search can become expensive for large files.
1742 		 */
1743 		if (metadata) {
1744 			mep = MAP_HASH(mof, mtm);
1745 			mutex_enter(&mtm->mtm_mutex);
1746 			for (me = *mep; me; me = me->me_hash) {
1747 				if (!DATAoverlapME(mof, hnb, me))
1748 					continue;
1749 
1750 				ASSERT(MEwithinDATA(me, mof, hnb));
1751 
1752 				if ((me->me_flags & ME_CANCEL) == 0) {
1753 					me->me_cancel = mtm->mtm_cancel;
1754 					mtm->mtm_cancel = me;
1755 					me->me_flags |= ME_CANCEL;
1756 					crb = me->me_crb;
1757 					if (crb) {
1758 						crb->c_invalid = 1;
1759 					}
1760 				}
1761 			}
1762 			mutex_exit(&mtm->mtm_mutex);
1763 		}
1764 
1765 		/*
1766 		 * put a cancel record into the log
1767 		 */
1768 		logmap_cancel_delta(ul, mof, hnb, metadata);
1769 	}
1770 
1771 	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
1772 	    map_check_linkage(mtm));
1773 }
1774 
1775 /*
1776  * check for overlap w/cancel delta
1777  */
1778 int
1779 logmap_iscancel(mt_map_t *mtm, offset_t mof, off_t nb)
1780 {
1781 	off_t		hnb;
1782 	mapentry_t	*me;
1783 	mapentry_t	**mep;
1784 
1785 	mutex_enter(&mtm->mtm_mutex);
1786 	for (hnb = 0; nb; nb -= hnb, mof += hnb) {
1787 		hnb = MAPBLOCKSIZE - (mof & MAPBLOCKOFF);
1788 		if (hnb > nb)
1789 			hnb = nb;
1790 		/*
1791 		 * search for dup entry
1792 		 */
1793 		mep = MAP_HASH(mof, mtm);
1794 		for (me = *mep; me; me = me->me_hash) {
1795 			if (((me->me_flags & ME_ROLL) == 0) &&
1796 			    (me->me_dt != DT_CANCEL))
1797 				continue;
1798 			if (DATAoverlapME(mof, hnb, me))
1799 				break;
1800 		}
1801 
1802 		/*
1803 		 * overlap detected
1804 		 */
1805 		if (me) {
1806 			mutex_exit(&mtm->mtm_mutex);
1807 			return (1);
1808 		}
1809 	}
1810 	mutex_exit(&mtm->mtm_mutex);
1811 	return (0);
1812 }
1813 
1814 static int
1815 logmap_logscan_add(ml_unit_t *ul, struct delta *dp, off_t lof, size_t *nbp)
1816 {
1817 	mapentry_t	*me;
1818 	int		error;
1819 	mt_map_t	*mtm	= ul->un_logmap;
1820 
1821 	/*
1822 	 * verify delta header; failure == mediafail
1823 	 */
1824 	error = 0;
1825 	/* delta type */
1826 	if ((dp->d_typ <= DT_NONE) || (dp->d_typ >= DT_MAX))
1827 		error = EINVAL;
1828 	if (dp->d_typ == DT_COMMIT) {
1829 		if (dp->d_nb != INT32_C(0) && dp->d_nb != INT32_C(-1))
1830 			error = EINVAL;
1831 	} else {
1832 		/* length of delta */
1833 		if ((dp->d_nb < INT32_C(0)) ||
1834 		    (dp->d_nb > INT32_C(MAPBLOCKSIZE)))
1835 			error = EINVAL;
1836 
1837 		/* offset on master device */
1838 		if (dp->d_mof < INT64_C(0))
1839 			error = EINVAL;
1840 	}
1841 
1842 	if (error) {
1843 		ldl_seterror(ul, "Error processing ufs log data during scan");
1844 		return (error);
1845 	}
1846 
1847 	/*
1848 	 * process commit record
1849 	 */
1850 	if (dp->d_typ == DT_COMMIT) {
1851 		if (mtm->mtm_dirty) {
1852 			ASSERT(dp->d_nb == INT32_C(0));
1853 			logmap_free_cancel(mtm, &mtm->mtm_cancel);
1854 			mtm->mtm_dirty = 0;
1855 			mtm->mtm_nmet = 0;
1856 			mtm->mtm_tid++;
1857 			mtm->mtm_committid = mtm->mtm_tid;
1858 			ASSERT(((mtm->mtm_debug & MT_SCAN) == 0) ||
1859 			    logmap_logscan_commit_debug(lof, mtm));
1860 		}
1861 		/*
1862 		 * return #bytes to next sector (next delta header)
1863 		 */
1864 		*nbp = ldl_logscan_nbcommit(lof);
1865 		mtm->mtm_tail_lof = lof;
1866 		mtm->mtm_tail_nb = *nbp;
1867 		return (0);
1868 	}
1869 
1870 	/*
1871 	 * add delta to logmap
1872 	 */
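	/*
	 * me_lof records the log offset of this delta so that
	 * logmap_sethead() can later use it to advance the log head.
	 */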
1873 	me = kmem_cache_alloc(mapentry_cache, KM_SLEEP);
1874 	bzero(me, sizeof (mapentry_t));
1875 	me->me_lof = lof;
1876 	me->me_mof = dp->d_mof;
1877 	me->me_nb = dp->d_nb;
1878 	me->me_tid = mtm->mtm_tid;
1879 	me->me_dt = dp->d_typ;
1880 	me->me_hash = NULL;
1881 	me->me_flags = (ME_LIST | ME_SCAN);
1882 	logmap_add(ul, NULL, 0, me);
1883 	switch (dp->d_typ) {
1884 	case DT_CANCEL:
1885 		me->me_flags |= ME_CANCEL;
1886 		me->me_cancel = mtm->mtm_cancel;
1887 		mtm->mtm_cancel = me;
1888 		break;
1889 	default:
1890 		ASSERT(((mtm->mtm_debug & MT_SCAN) == 0) ||
1891 		    logmap_logscan_add_debug(dp, mtm));
1892 		break;
1893 	}
1894 
1895 sizeofdelta:
1896 	/*
1897 	 * return #bytes till next delta header
1898 	 */
1899 	if ((dp->d_typ == DT_CANCEL) || (dp->d_typ == DT_ABZERO))
1900 		*nbp = 0;
1901 	else
1902 		*nbp = dp->d_nb;
1903 	return (0);
1904 }
1905 
1906 void
1907 logmap_logscan(ml_unit_t *ul)
1908 {
1909 	size_t		nb, nbd;
1910 	off_t		lof;
1911 	struct delta	delta;
1912 	mt_map_t	*logmap	= ul->un_logmap;
1913 
1914 	ASSERT(ul->un_deltamap->mtm_next == (mapentry_t *)ul->un_deltamap);
1915 
1916 	/*
1917 	 * prepare the log for a logscan
1918 	 */
1919 	ldl_logscan_begin(ul);
1920 
1921 	/*
1922 	 * prepare the logmap for a logscan
1923 	 */
1924 	(void) map_free_entries(logmap);
1925 	logmap->mtm_tid = 0;
1926 	logmap->mtm_committid = UINT32_C(0);
1927 	logmap->mtm_age = 0;
1928 	logmap->mtm_dirty = 0;
1929 	logmap->mtm_ref = 0;
1930 
1931 	/*
1932 	 * while not at end of log
1933 	 *	read delta header
1934 	 *	add to logmap
1935 	 *	seek to beginning of next delta
1936 	 */
1937 	lof = ul->un_head_lof;
1938 	nbd = sizeof (delta);
1939 	while (lof != ul->un_tail_lof) {
1940 
1941 		/* read delta header */
1942 		if (ldl_logscan_read(ul, &lof, nbd, (caddr_t)&delta))
1943 			break;
1944 
1945 		/* add to logmap */
1946 		if (logmap_logscan_add(ul, &delta, lof, &nb))
1947 			break;
1948 
1949 		/* seek to next header (skip data) */
1950 		if (ldl_logscan_read(ul, &lof, nb, NULL))
1951 			break;
1952 	}
1953 
1954 	/*
1955 	 * remove the last partial transaction from the logmap
1956 	 */
1957 	logmap_abort(ul, logmap->mtm_tid);
1958 
1959 	ldl_logscan_end(ul);
1960 }
1961 
1962 void
1963 _init_map(void)
1964 {
1965 	/*
1966 	 * Initialise the mapentry cache. No constructor or destructor
1967 	 * is needed. Also no reclaim function is supplied as reclaiming
1968 	 * current entries is not possible.
1969 	 */
1970 	mapentry_cache = kmem_cache_create("lufs_mapentry_cache",
1971 	    sizeof (mapentry_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
1972 }
1973 
1974 /*
1975  * Special case when we replace an old map entry which carries quota
1976  * information with a newer entry which does not.
1977  * In that case the push function would not be called to clean up the
1978  * dquot structure. This would be found later by invalidatedq() causing
1979  * a panic when the filesystem in unmounted.
1980  * We clean up the dquot manually before replacing the map entry.
1981  */
1982 void
1983 handle_dquot(mapentry_t *me)
1984 {
1985 	int dolock = 0;
1986 	int domutex = 0;
1987 	struct dquot *dqp;
1988 
1989 	dqp = (struct dquot *)me->me_arg;
1990 
1991 	/*
1992 	 * We need vfs_dqrwlock to call dqput()
1993 	 */
1994 	dolock = (!RW_LOCK_HELD(&dqp->dq_ufsvfsp->vfs_dqrwlock));
1995 	if (dolock)
1996 		rw_enter(&dqp->dq_ufsvfsp->vfs_dqrwlock, RW_READER);
1997 
1998 	domutex = (!MUTEX_HELD(&dqp->dq_lock));
1999 	if (domutex)
2000 		mutex_enter(&dqp->dq_lock);
2001 
2002 	/*
2003 	 * Only clean up if the dquot is referenced
2004 	 */
2005 	if (dqp->dq_cnt == 0) {
2006 		if (domutex)
2007 			mutex_exit(&dqp->dq_lock);
2008 		if (dolock)
2009 			rw_exit(&dqp->dq_ufsvfsp->vfs_dqrwlock);
2010 		return;
2011 	}
2012 
2013 	dqp->dq_flags &= ~(DQ_MOD|DQ_TRANS);
2014 	dqput(dqp);
2015 
2016 	if (domutex)
2017 		mutex_exit(&dqp->dq_lock);
2018 
2019 	if (dolock)
2020 		rw_exit(&dqp->dq_ufsvfsp->vfs_dqrwlock);
2021 
2022 }
2023