xref: /titanic_41/usr/src/uts/common/fs/ufs/ufs_trans.c (revision bdfc6d18da790deeec2e0eb09c625902defe2498)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2003 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
28 /* All Rights Reserved */
29 
30 /*
31  * Portions of this source code were derived from Berkeley 4.3 BSD
32  * under license from the Regents of the University of California.
33  */
34 
35 #pragma ident	"%Z%%M%	%I%	%E% SMI"
36 
37 #include <sys/sysmacros.h>
38 #include <sys/param.h>
39 #include <sys/types.h>
40 #include <sys/systm.h>
41 #include <sys/t_lock.h>
42 #include <sys/uio.h>
43 #include <sys/kmem.h>
44 #include <sys/thread.h>
45 #include <sys/vfs.h>
46 #include <sys/errno.h>
47 #include <sys/buf.h>
48 #include <sys/vnode.h>
49 #include <sys/fs/ufs_trans.h>
50 #include <sys/fs/ufs_inode.h>
51 #include <sys/fs/ufs_fs.h>
52 #include <sys/fs/ufs_fsdir.h>
53 #include <sys/fs/ufs_quota.h>
54 #include <sys/fs/ufs_panic.h>
55 #include <sys/fs/ufs_bio.h>
56 #include <sys/fs/ufs_log.h>
57 #include <sys/cmn_err.h>
58 #include <sys/file.h>
59 #include <sys/debug.h>
60 
61 
62 extern kmutex_t ufsvfs_mutex;
63 extern struct ufsvfs *ufs_instances;
64 
65 /*
66  * hlock any file systems w/errored logs
67  */
68 int
69 ufs_trans_hlock()
70 {
71 	struct ufsvfs	*ufsvfsp;
72 	struct lockfs	lockfs;
73 	int		error;
74 	int		retry	= 0;
75 
76 	/*
77 	 * find fs's that paniced or have errored logging devices
78 	 */
79 	mutex_enter(&ufsvfs_mutex);
80 	for (ufsvfsp = ufs_instances; ufsvfsp; ufsvfsp = ufsvfsp->vfs_next) {
81 		/*
82 		 * not mounted; continue
83 		 */
84 		if ((ufsvfsp->vfs_vfs == NULL) ||
85 		    (ufsvfsp->vfs_validfs == UT_UNMOUNTED))
86 			continue;
87 		/*
88 		 * disallow unmounts (hlock occurs below)
89 		 */
90 		if (TRANS_ISERROR(ufsvfsp))
91 			ufsvfsp->vfs_validfs = UT_HLOCKING;
92 	}
93 	mutex_exit(&ufsvfs_mutex);
94 
95 	/*
96 	 * hlock the fs's that paniced or have errored logging devices
97 	 */
98 again:
99 	mutex_enter(&ufsvfs_mutex);
100 	for (ufsvfsp = ufs_instances; ufsvfsp; ufsvfsp = ufsvfsp->vfs_next)
101 		if (ufsvfsp->vfs_validfs == UT_HLOCKING)
102 			break;
103 	mutex_exit(&ufsvfs_mutex);
104 	if (ufsvfsp == NULL)
105 		return (retry);
106 	/*
107 	 * hlock the file system
108 	 */
109 	(void) ufs_fiolfss(ufsvfsp->vfs_root, &lockfs);
110 	if (!LOCKFS_IS_ELOCK(&lockfs)) {
111 		lockfs.lf_lock = LOCKFS_HLOCK;
112 		lockfs.lf_flags = 0;
113 		lockfs.lf_comlen = 0;
114 		lockfs.lf_comment = NULL;
115 		error = ufs_fiolfs(ufsvfsp->vfs_root, &lockfs, 0);
116 		/*
117 		 * retry after awhile; another app currently doing lockfs
118 		 */
119 		if (error == EBUSY || error == EINVAL)
120 			retry = 1;
121 	} else {
122 		if (ufsfx_get_failure_qlen() > 0) {
123 			if (mutex_tryenter(&ufs_fix.uq_mutex)) {
124 				ufs_fix.uq_lowat = ufs_fix.uq_ne;
125 				cv_broadcast(&ufs_fix.uq_cv);
126 				mutex_exit(&ufs_fix.uq_mutex);
127 			}
128 		}
129 		retry = 1;
130 	}
131 
132 	/*
133 	 * allow unmounts
134 	 */
135 	ufsvfsp->vfs_validfs = UT_MOUNTED;
136 	goto again;
137 }
138 
139 /*ARGSUSED*/
140 void
141 ufs_trans_onerror()
142 {
143 	mutex_enter(&ufs_hlock.uq_mutex);
144 	ufs_hlock.uq_ne = ufs_hlock.uq_lowat;
145 	cv_broadcast(&ufs_hlock.uq_cv);
146 	mutex_exit(&ufs_hlock.uq_mutex);
147 }
148 
149 void
150 ufs_trans_sbupdate(struct ufsvfs *ufsvfsp, struct vfs *vfsp, top_t topid)
151 {
152 	if (curthread->t_flag & T_DONTBLOCK) {
153 		sbupdate(vfsp);
154 		return;
155 	} else {
156 
157 		if (panicstr && TRANS_ISTRANS(ufsvfsp))
158 			return;
159 
160 		curthread->t_flag |= T_DONTBLOCK;
161 		TRANS_BEGIN_ASYNC(ufsvfsp, topid, TOP_SBUPDATE_SIZE);
162 		sbupdate(vfsp);
163 		TRANS_END_ASYNC(ufsvfsp, topid, TOP_SBUPDATE_SIZE);
164 		curthread->t_flag &= ~T_DONTBLOCK;
165 	}
166 }
167 
168 void
169 ufs_trans_iupdat(struct inode *ip, int waitfor)
170 {
171 	struct ufsvfs	*ufsvfsp;
172 
173 	if (curthread->t_flag & T_DONTBLOCK) {
174 		rw_enter(&ip->i_contents, RW_READER);
175 		ufs_iupdat(ip, waitfor);
176 		rw_exit(&ip->i_contents);
177 		return;
178 	} else {
179 		ufsvfsp = ip->i_ufsvfs;
180 
181 		if (panicstr && TRANS_ISTRANS(ufsvfsp))
182 			return;
183 
184 		curthread->t_flag |= T_DONTBLOCK;
185 		TRANS_BEGIN_ASYNC(ufsvfsp, TOP_IUPDAT, TOP_IUPDAT_SIZE(ip));
186 		rw_enter(&ip->i_contents, RW_READER);
187 		ufs_iupdat(ip, waitfor);
188 		rw_exit(&ip->i_contents);
189 		TRANS_END_ASYNC(ufsvfsp, TOP_IUPDAT, TOP_IUPDAT_SIZE(ip));
190 		curthread->t_flag &= ~T_DONTBLOCK;
191 	}
192 }
193 
194 void
195 ufs_trans_sbwrite(struct ufsvfs *ufsvfsp, top_t topid)
196 {
197 	if (curthread->t_flag & T_DONTBLOCK) {
198 		mutex_enter(&ufsvfsp->vfs_lock);
199 		ufs_sbwrite(ufsvfsp);
200 		mutex_exit(&ufsvfsp->vfs_lock);
201 		return;
202 	} else {
203 
204 		if (panicstr && TRANS_ISTRANS(ufsvfsp))
205 			return;
206 
207 		curthread->t_flag |= T_DONTBLOCK;
208 		TRANS_BEGIN_ASYNC(ufsvfsp, topid, TOP_SBWRITE_SIZE);
209 		mutex_enter(&ufsvfsp->vfs_lock);
210 		ufs_sbwrite(ufsvfsp);
211 		mutex_exit(&ufsvfsp->vfs_lock);
212 		TRANS_END_ASYNC(ufsvfsp, topid, TOP_SBWRITE_SIZE);
213 		curthread->t_flag &= ~T_DONTBLOCK;
214 	}
215 }
216 
217 /*ARGSUSED*/
218 int
219 ufs_trans_push_si(ufsvfs_t *ufsvfsp, delta_t dtyp, int ignore)
220 {
221 	struct fs	*fs;
222 
223 	fs = ufsvfsp->vfs_fs;
224 	mutex_enter(&ufsvfsp->vfs_lock);
225 	TRANS_LOG(ufsvfsp, (char *)fs->fs_u.fs_csp,
226 		ldbtob(fsbtodb(fs, fs->fs_csaddr)), fs->fs_cssize,
227 		(caddr_t)fs->fs_u.fs_csp, fs->fs_cssize);
228 	mutex_exit(&ufsvfsp->vfs_lock);
229 	return (0);
230 }
231 
232 /*ARGSUSED*/
233 int
234 ufs_trans_push_buf(ufsvfs_t *ufsvfsp, delta_t dtyp, daddr_t bno)
235 {
236 	struct buf	*bp;
237 
238 	bp = (struct buf *)UFS_GETBLK(ufsvfsp, ufsvfsp->vfs_dev, bno, 1);
239 	if (bp == NULL)
240 		return (ENOENT);
241 
242 	if (bp->b_flags & B_DELWRI) {
243 		/*
244 		 * Do not use brwrite() here since the buffer is already
245 		 * marked for retry or not by the code that called
246 		 * TRANS_BUF().
247 		 */
248 		UFS_BWRITE(ufsvfsp, bp);
249 		return (0);
250 	}
251 	/*
252 	 * If we did not find the real buf for this block above then
253 	 * clear the dev so the buf won't be found by mistake
254 	 * for this block later.  We had to allocate at least a 1 byte
255 	 * buffer to keep brelse happy.
256 	 */
257 	if (bp->b_bufsize == 1) {
258 		bp->b_dev = (o_dev_t)NODEV;
259 		bp->b_edev = NODEV;
260 		bp->b_flags = 0;
261 	}
262 	brelse(bp);
263 	return (ENOENT);
264 }
265 
266 /*ARGSUSED*/
267 int
268 ufs_trans_push_inode(ufsvfs_t *ufsvfsp, delta_t dtyp, ino_t ino)
269 {
270 	int		error;
271 	struct inode	*ip;
272 
273 	/*
274 	 * Grab the quota lock (if the file system has not been forcibly
275 	 * unmounted).
276 	 */
277 	if (ufsvfsp)
278 		rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
279 
280 	error = ufs_iget(ufsvfsp->vfs_vfs, ino, &ip, kcred);
281 
282 	if (ufsvfsp)
283 		rw_exit(&ufsvfsp->vfs_dqrwlock);
284 	if (error)
285 		return (ENOENT);
286 
287 	if (ip->i_flag & (IUPD|IACC|ICHG|IMOD|IMODACC|IATTCHG)) {
288 		rw_enter(&ip->i_contents, RW_READER);
289 		ufs_iupdat(ip, 1);
290 		rw_exit(&ip->i_contents);
291 		VN_RELE(ITOV(ip));
292 		return (0);
293 	}
294 	VN_RELE(ITOV(ip));
295 	return (ENOENT);
296 }
297 
298 #ifdef DEBUG
299 /*
300  *	These routines maintain the metadata map (matamap)
301  */
302 
303 /*
304  * update the metadata map at mount
305  */
306 static int
307 ufs_trans_mata_mount_scan(struct inode *ip, void *arg)
308 {
309 	/*
310 	 * wrong file system; keep looking
311 	 */
312 	if (ip->i_ufsvfs != (struct ufsvfs *)arg)
313 		return (0);
314 
315 	/*
316 	 * load the metadata map
317 	 */
318 	rw_enter(&ip->i_contents, RW_WRITER);
319 	ufs_trans_mata_iget(ip);
320 	rw_exit(&ip->i_contents);
321 	return (0);
322 }
323 
324 void
325 ufs_trans_mata_mount(struct ufsvfs *ufsvfsp)
326 {
327 	struct fs	*fs	= ufsvfsp->vfs_fs;
328 	ino_t		ino;
329 	int		i;
330 
331 	/*
332 	 * put static metadata into matamap
333 	 *	superblock
334 	 *	cylinder groups
335 	 *	inode groups
336 	 *	existing inodes
337 	 */
338 	TRANS_MATAADD(ufsvfsp, ldbtob(SBLOCK), fs->fs_sbsize);
339 
340 	for (ino = i = 0; i < fs->fs_ncg; ++i, ino += fs->fs_ipg) {
341 		TRANS_MATAADD(ufsvfsp,
342 		    ldbtob(fsbtodb(fs, cgtod(fs, i))), fs->fs_cgsize);
343 		TRANS_MATAADD(ufsvfsp,
344 		    ldbtob(fsbtodb(fs, itod(fs, ino))),
345 		    fs->fs_ipg * sizeof (struct dinode));
346 	}
347 	(void) ufs_scan_inodes(0, ufs_trans_mata_mount_scan, ufsvfsp, ufsvfsp);
348 }
349 
/*
 * clear the metadata map at umount
 *
 * Thin wrapper: hands the file system to top_mataclr().
 */
void
ufs_trans_mata_umount(struct ufsvfs *ufsvfsp)
{
	top_mataclr(ufsvfsp);
}
358 
359 /*
360  * summary info (may be extended during growfs test)
361  */
362 void
363 ufs_trans_mata_si(struct ufsvfs *ufsvfsp, struct fs *fs)
364 {
365 	TRANS_MATAADD(ufsvfsp, ldbtob(fsbtodb(fs, fs->fs_csaddr)),
366 			fs->fs_cssize);
367 }
368 
369 /*
370  * scan an allocation block (either inode or true block)
371  */
372 static void
373 ufs_trans_mata_direct(
374 	struct inode *ip,
375 	daddr_t *fragsp,
376 	daddr32_t *blkp,
377 	unsigned int nblk)
378 {
379 	int		i;
380 	daddr_t		frag;
381 	ulong_t		nb;
382 	struct ufsvfs	*ufsvfsp	= ip->i_ufsvfs;
383 	struct fs	*fs		= ufsvfsp->vfs_fs;
384 
385 	for (i = 0; i < nblk && *fragsp; ++i, ++blkp)
386 		if ((frag = *blkp) != 0) {
387 			if (*fragsp > fs->fs_frag) {
388 				nb = fs->fs_bsize;
389 				*fragsp -= fs->fs_frag;
390 			} else {
391 				nb = *fragsp * fs->fs_fsize;
392 				*fragsp = 0;
393 			}
394 			TRANS_MATAADD(ufsvfsp, ldbtob(fsbtodb(fs, frag)), nb);
395 		}
396 }
397 
398 /*
399  * scan an indirect allocation block (either inode or true block)
400  */
401 static void
402 ufs_trans_mata_indir(
403 	struct inode *ip,
404 	daddr_t *fragsp,
405 	daddr_t frag,
406 	int level)
407 {
408 	struct ufsvfs *ufsvfsp	= ip->i_ufsvfs;
409 	struct fs *fs = ufsvfsp->vfs_fs;
410 	int ne = fs->fs_bsize / (int)sizeof (daddr32_t);
411 	int i;
412 	struct buf *bp;
413 	daddr32_t *blkp;
414 	o_mode_t ifmt = ip->i_mode & IFMT;
415 
416 	bp = UFS_BREAD(ufsvfsp, ip->i_dev, fsbtodb(fs, frag), fs->fs_bsize);
417 	if (bp->b_flags & B_ERROR) {
418 		brelse(bp);
419 		return;
420 	}
421 	blkp = bp->b_un.b_daddr;
422 
423 	if (level || (ifmt == IFDIR) || (ifmt == IFSHAD) ||
424 	    (ifmt == IFATTRDIR) || (ip == ip->i_ufsvfs->vfs_qinod))
425 		ufs_trans_mata_direct(ip, fragsp, blkp, ne);
426 
427 	if (level)
428 		for (i = 0; i < ne && *fragsp; ++i, ++blkp)
429 			ufs_trans_mata_indir(ip, fragsp, *blkp, level-1);
430 	brelse(bp);
431 }
432 
433 /*
434  * put appropriate metadata into matamap for this inode
435  */
436 void
437 ufs_trans_mata_iget(struct inode *ip)
438 {
439 	int		i;
440 	daddr_t		frags	= dbtofsb(ip->i_fs, ip->i_blocks);
441 	o_mode_t	ifmt 	= ip->i_mode & IFMT;
442 
443 	if (frags && ((ifmt == IFDIR) || (ifmt == IFSHAD) ||
444 	    (ifmt == IFATTRDIR) || (ip == ip->i_ufsvfs->vfs_qinod)))
445 		ufs_trans_mata_direct(ip, &frags, &ip->i_db[0], NDADDR);
446 
447 	if (frags)
448 		ufs_trans_mata_direct(ip, &frags, &ip->i_ib[0], NIADDR);
449 
450 	for (i = 0; i < NIADDR && frags; ++i)
451 		if (ip->i_ib[i])
452 			ufs_trans_mata_indir(ip, &frags, ip->i_ib[i], i);
453 }
454 
455 /*
456  * freeing possible metadata (block of user data)
457  */
458 void
459 ufs_trans_mata_free(struct ufsvfs *ufsvfsp, offset_t mof, off_t nb)
460 {
461 	top_matadel(ufsvfsp, mof, nb);
462 
463 }
464 
465 /*
466  * allocating metadata
467  */
468 void
469 ufs_trans_mata_alloc(
470 	struct ufsvfs *ufsvfsp,
471 	struct inode *ip,
472 	daddr_t frag,
473 	ulong_t nb,
474 	int indir)
475 {
476 	struct fs	*fs	= ufsvfsp->vfs_fs;
477 	o_mode_t	ifmt 	= ip->i_mode & IFMT;
478 
479 	if (indir || ((ifmt == IFDIR) || (ifmt == IFSHAD) ||
480 	    (ifmt == IFATTRDIR) || (ip == ip->i_ufsvfs->vfs_qinod)))
481 		TRANS_MATAADD(ufsvfsp, ldbtob(fsbtodb(fs, frag)), nb);
482 }
483 
484 #endif /* DEBUG */
485 
486 /*
487  * ufs_trans_dir is used to declare a directory delta
488  */
489 int
490 ufs_trans_dir(struct inode *ip, off_t offset)
491 {
492 	daddr_t	bn;
493 	int	contig = 0, error;
494 
495 	ASSERT(ip);
496 	ASSERT(RW_WRITE_HELD(&ip->i_contents));
497 	error = bmap_read(ip, (u_offset_t)offset, &bn, &contig);
498 	if (error || (bn == UFS_HOLE)) {
499 		cmn_err(CE_WARN, "ufs_trans_dir - could not get block"
500 		    " number error = %d bn = %d\n", error, (int)bn);
501 		if (error == 0)	/* treat UFS_HOLE as an I/O error */
502 			error = EIO;
503 		return (error);
504 	}
505 	TRANS_DELTA(ip->i_ufsvfs, ldbtob(bn), DIRBLKSIZ, DT_DIR, 0, 0);
506 	return (error);
507 }
508 
509 /*ARGSUSED*/
510 int
511 ufs_trans_push_quota(ufsvfs_t *ufsvfsp, delta_t dtyp, struct dquot *dqp)
512 {
513 	/*
514 	 * Lock the quota subsystem (ufsvfsp can be NULL
515 	 * if the DQ_ERROR is set).
516 	 */
517 	if (ufsvfsp)
518 		rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
519 	mutex_enter(&dqp->dq_lock);
520 
521 	/*
522 	 * If this transaction has been cancelled by closedq_scan_inode(),
523 	 * then bail out now.  We don't call dqput() in this case because
524 	 * it has already been done.
525 	 */
526 	if ((dqp->dq_flags & DQ_TRANS) == 0) {
527 		mutex_exit(&dqp->dq_lock);
528 		if (ufsvfsp)
529 			rw_exit(&ufsvfsp->vfs_dqrwlock);
530 		return (0);
531 	}
532 
533 	if (dqp->dq_flags & DQ_ERROR) {
534 		/*
535 		 * Paranoia to make sure that there is at least one
536 		 * reference to the dquot struct.  We are done with
537 		 * the dquot (due to an error) so clear logging
538 		 * specific markers.
539 		 */
540 		ASSERT(dqp->dq_cnt >= 1);
541 		dqp->dq_flags &= ~DQ_TRANS;
542 		dqput(dqp);
543 		mutex_exit(&dqp->dq_lock);
544 		if (ufsvfsp)
545 			rw_exit(&ufsvfsp->vfs_dqrwlock);
546 		return (1);
547 	}
548 
549 	if (dqp->dq_flags & (DQ_MOD | DQ_BLKS | DQ_FILES)) {
550 		ASSERT((dqp->dq_mof != UFS_HOLE) && (dqp->dq_mof != 0));
551 		TRANS_LOG(ufsvfsp, (caddr_t)&dqp->dq_dqb,
552 		    dqp->dq_mof, (int)sizeof (struct dqblk), NULL, 0);
553 		/*
554 		 * Paranoia to make sure that there is at least one
555 		 * reference to the dquot struct.  Clear the
556 		 * modification flag because the operation is now in
557 		 * the log.  Also clear the logging specific markers
558 		 * that were set in ufs_trans_quota().
559 		 */
560 		ASSERT(dqp->dq_cnt >= 1);
561 		dqp->dq_flags &= ~(DQ_MOD | DQ_TRANS);
562 		dqput(dqp);
563 	}
564 
565 	/*
566 	 * At this point, the logging specific flag should be clear,
567 	 * but add paranoia just in case something has gone wrong.
568 	 */
569 	ASSERT((dqp->dq_flags & DQ_TRANS) == 0);
570 	mutex_exit(&dqp->dq_lock);
571 	if (ufsvfsp)
572 		rw_exit(&ufsvfsp->vfs_dqrwlock);
573 	return (0);
574 }
575 
576 /*
577  * ufs_trans_quota take in a uid, allocates the disk space, placing the
578  * quota record into the metamap, then declares the delta.
579  */
580 /*ARGSUSED*/
581 void
582 ufs_trans_quota(struct dquot *dqp)
583 {
584 
585 	struct inode	*qip = dqp->dq_ufsvfsp->vfs_qinod;
586 
587 	ASSERT(qip);
588 	ASSERT(MUTEX_HELD(&dqp->dq_lock));
589 	ASSERT(dqp->dq_flags & DQ_MOD);
590 	ASSERT(dqp->dq_mof != 0);
591 	ASSERT(dqp->dq_mof != UFS_HOLE);
592 
593 	/*
594 	 * Mark this dquot to indicate that we are starting a logging
595 	 * file system operation for this dquot.  Also increment the
596 	 * reference count so that the dquot does not get reused while
597 	 * it is on the mapentry_t list.  DQ_TRANS is cleared and the
598 	 * reference count is decremented by ufs_trans_push_quota.
599 	 *
600 	 * If the file system is force-unmounted while there is a
601 	 * pending quota transaction, then closedq_scan_inode() will
602 	 * clear the DQ_TRANS flag and decrement the reference count.
603 	 *
604 	 * Since deltamap_add() drops multiple transactions to the
605 	 * same dq_mof and ufs_trans_push_quota() won't get called,
606 	 * we use DQ_TRANS to prevent repeat transactions from
607 	 * incrementing the reference count (or calling TRANS_DELTA()).
608 	 */
609 	if ((dqp->dq_flags & DQ_TRANS) == 0) {
610 		dqp->dq_flags |= DQ_TRANS;
611 		dqp->dq_cnt++;
612 		TRANS_DELTA(qip->i_ufsvfs, dqp->dq_mof, sizeof (struct dqblk),
613 		    DT_QR, ufs_trans_push_quota, (ulong_t)dqp);
614 	}
615 }
616 
617 void
618 ufs_trans_dqrele(struct dquot *dqp)
619 {
620 	struct ufsvfs	*ufsvfsp = dqp->dq_ufsvfsp;
621 
622 	curthread->t_flag |= T_DONTBLOCK;
623 	TRANS_BEGIN_ASYNC(ufsvfsp, TOP_QUOTA, TOP_QUOTA_SIZE);
624 	rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
625 	dqrele(dqp);
626 	rw_exit(&ufsvfsp->vfs_dqrwlock);
627 	TRANS_END_ASYNC(ufsvfsp, TOP_QUOTA, TOP_QUOTA_SIZE);
628 	curthread->t_flag &= ~T_DONTBLOCK;
629 }
630 
631 int ufs_trans_max_resv = TOP_MAX_RESV;	/* will be adjusted for testing */
632 long ufs_trans_avgbfree = 0;		/* will be adjusted for testing */
633 #define	TRANS_MAX_WRITE	(1024 * 1024)
634 size_t ufs_trans_max_resid = TRANS_MAX_WRITE;
635 
636 /*
637  * Calculate the log reservation for the given write or truncate
638  */
639 static ulong_t
640 ufs_log_amt(struct inode *ip, offset_t offset, ssize_t resid, int trunc)
641 {
642 	long		ncg, last2blk;
643 	long		niblk		= 0;
644 	u_offset_t	writeend, offblk;
645 	int		resv;
646 	daddr_t		nblk, maxfblk;
647 	long		avgbfree;
648 	struct ufsvfs	*ufsvfsp	= ip->i_ufsvfs;
649 	struct fs	*fs		= ufsvfsp->vfs_fs;
650 	long		fni		= NINDIR(fs);
651 	int		bsize		= fs->fs_bsize;
652 
653 	/*
654 	 * Assume that the request will fit in 1 or 2 cg's,
655 	 * resv is the amount of log space to reserve (in bytes).
656 	 */
657 	resv = SIZECG(ip) * 2 + INODESIZE + 1024;
658 
659 	/*
660 	 * get max position of write in fs blocks
661 	 */
662 	writeend = offset + resid;
663 	maxfblk = lblkno(fs, writeend);
664 	offblk = lblkno(fs, offset);
665 	/*
666 	 * request size in fs blocks
667 	 */
668 	nblk = lblkno(fs, blkroundup(fs, resid));
669 	/*
670 	 * Adjust for sparse files
671 	 */
672 	if (trunc)
673 		nblk = MIN(nblk, ip->i_blocks);
674 
675 	/*
676 	 * Adjust avgbfree (for testing)
677 	 */
678 	avgbfree = (ufs_trans_avgbfree) ? 1 : ufsvfsp->vfs_avgbfree + 1;
679 
680 	/*
681 	 * Calculate maximum number of blocks of triple indirect
682 	 * pointers to write.
683 	 */
684 	last2blk = NDADDR + fni + fni * fni;
685 	if (maxfblk > last2blk) {
686 		long nl2ptr;
687 		long n3blk;
688 
689 		if (offblk > last2blk)
690 			n3blk = maxfblk - offblk;
691 		else
692 			n3blk = maxfblk - last2blk;
693 		niblk += roundup(n3blk * sizeof (daddr_t), bsize) / bsize + 1;
694 		nl2ptr = roundup(niblk, fni) / fni + 1;
695 		niblk += roundup(nl2ptr * sizeof (daddr_t), bsize) / bsize + 2;
696 		maxfblk -= n3blk;
697 	}
698 	/*
699 	 * calculate maximum number of blocks of double indirect
700 	 * pointers to write.
701 	 */
702 	if (maxfblk > NDADDR + fni) {
703 		long n2blk;
704 
705 		if (offblk > NDADDR + fni)
706 			n2blk = maxfblk - offblk;
707 		else
708 			n2blk = maxfblk - NDADDR + fni;
709 		niblk += roundup(n2blk * sizeof (daddr_t), bsize) / bsize + 2;
710 		maxfblk -= n2blk;
711 	}
712 	/*
713 	 * Add in indirect pointer block write
714 	 */
715 	if (maxfblk > NDADDR) {
716 		niblk += 1;
717 	}
718 	/*
719 	 * Calculate deltas for indirect pointer writes
720 	 */
721 	resv += niblk * (fs->fs_bsize + sizeof (struct delta));
722 	/*
723 	 * maximum number of cg's needed for request
724 	 */
725 	ncg = nblk / avgbfree;
726 	if (ncg > fs->fs_ncg)
727 		ncg = fs->fs_ncg;
728 
729 	/*
730 	 * maximum amount of log space needed for request
731 	 */
732 	if (ncg > 2)
733 		resv += (ncg - 2) * SIZECG(ip);
734 
735 	return (resv);
736 }
737 
738 /*
739  * Calculate the amount of log space that needs to be reserved for this
740  * trunc request.  If the amount of log space is too large, then
741  * calculate the the size that the requests needs to be split into.
742  */
743 static void
744 ufs_trans_trunc_resv(
745 	struct inode *ip,
746 	u_offset_t length,
747 	int *resvp,
748 	u_offset_t *residp)
749 {
750 	ulong_t		resv;
751 	u_offset_t	size, offset, resid;
752 	int		nchunks;
753 
754 	/*
755 	 *    *resvp is the amount of log space to reserve (in bytes).
756 	 *    when nonzero, *residp is the number of bytes to truncate.
757 	 */
758 	*residp = 0;
759 
760 	if (length < ip->i_size) {
761 		size = ip->i_size - length;
762 	} else {
763 		resv = SIZECG(ip) * 2 + INODESIZE + 1024;
764 		/*
765 		 * truncate up, doesn't really use much space,
766 		 * the default above should be sufficient.
767 		 */
768 		goto done;
769 	}
770 
771 	offset = length;
772 	resid = size;
773 	nchunks = 1;
774 	for (; (resv = ufs_log_amt(ip, offset, resid, 1)) > ufs_trans_max_resv;
775 		offset = length + (nchunks - 1) * resid) {
776 		nchunks++;
777 		resid = size / nchunks;
778 	}
779 	/*
780 	 * If this request takes too much log space, it will be split
781 	 */
782 	if (nchunks > 1) {
783 		*residp = resid;
784 	}
785 done:
786 	*resvp = resv;
787 }
788 
789 int
790 ufs_trans_itrunc(struct inode *ip, u_offset_t length, int flags, cred_t *cr)
791 {
792 	int 		err, issync, resv;
793 	u_offset_t	resid;
794 	int		do_block	= 0;
795 	struct ufsvfs	*ufsvfsp	= ip->i_ufsvfs;
796 	struct fs	*fs		= ufsvfsp->vfs_fs;
797 
798 	/*
799 	 * Not logging; just do the trunc
800 	 */
801 	if (!TRANS_ISTRANS(ufsvfsp)) {
802 		rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
803 		rw_enter(&ip->i_contents, RW_WRITER);
804 		err = ufs_itrunc(ip, length, flags, cr);
805 		rw_exit(&ip->i_contents);
806 		rw_exit(&ufsvfsp->vfs_dqrwlock);
807 		return (err);
808 	}
809 
810 	/*
811 	 * within the lockfs protocol but *not* part of a transaction
812 	 */
813 	do_block = curthread->t_flag & T_DONTBLOCK;
814 	curthread->t_flag |= T_DONTBLOCK;
815 
816 	/*
817 	 * Trunc the file (in pieces, if necessary)
818 	 */
819 again:
820 	ufs_trans_trunc_resv(ip, length, &resv, &resid);
821 	TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_ITRUNC, resv);
822 	rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
823 	rw_enter(&ip->i_contents, RW_WRITER);
824 	if (resid) {
825 		/*
826 		 * resid is only set if we have to truncate in chunks
827 		 */
828 		ASSERT(length + resid < ip->i_size);
829 
830 		/*
831 		 * Partially trunc file down to desired size (length).
832 		 * Only retain I_FREE on the last partial trunc.
833 		 * Round up size to a block boundary, to ensure the truncate
834 		 * doesn't have to allocate blocks. This is done both for
835 		 * performance and to fix a bug where if the block can't be
836 		 * allocated then the inode delete fails, but the inode
837 		 * is still freed with attached blocks and non-zero size
838 		 * (bug 4348738).
839 		 */
840 		err = ufs_itrunc(ip, blkroundup(fs, (ip->i_size - resid)),
841 		    flags & ~I_FREE, cr);
842 		ASSERT(ip->i_size != length);
843 	} else
844 		err = ufs_itrunc(ip, length, flags, cr);
845 	if (!do_block)
846 		curthread->t_flag &= ~T_DONTBLOCK;
847 	rw_exit(&ip->i_contents);
848 	rw_exit(&ufsvfsp->vfs_dqrwlock);
849 	TRANS_END_CSYNC(ufsvfsp, err, issync, TOP_ITRUNC, resv);
850 
851 	if ((err == 0) && resid) {
852 		ufsvfsp->vfs_avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg;
853 		goto again;
854 	}
855 	return (err);
856 }
857 
858 /*
859  * Fault in the pages of the first n bytes specified by the uio structure.
860  * 1 byte in each page is touched and the uio struct is unmodified.
861  * Any error will terminate the process as this is only a best
862  * attempt to get the pages resident.
863  */
864 static void
865 ufs_trans_touch(ssize_t n, struct uio *uio)
866 {
867 	struct iovec *iov;
868 	ulong_t cnt, incr;
869 	caddr_t p;
870 	uint8_t tmp;
871 
872 	iov = uio->uio_iov;
873 
874 	while (n) {
875 		cnt = MIN(iov->iov_len, n);
876 		if (cnt == 0) {
877 			/* empty iov entry */
878 			iov++;
879 			continue;
880 		}
881 		n -= cnt;
882 		/*
883 		 * touch each page in this segment.
884 		 */
885 		p = iov->iov_base;
886 		while (cnt) {
887 			switch (uio->uio_segflg) {
888 			case UIO_USERSPACE:
889 			case UIO_USERISPACE:
890 				if (fuword8(p, &tmp))
891 					return;
892 				break;
893 			case UIO_SYSSPACE:
894 				if (kcopy(p, &tmp, 1))
895 					return;
896 				break;
897 			}
898 			incr = MIN(cnt, PAGESIZE);
899 			p += incr;
900 			cnt -= incr;
901 		}
902 		/*
903 		 * touch the last byte in case it straddles a page.
904 		 */
905 		p--;
906 		switch (uio->uio_segflg) {
907 		case UIO_USERSPACE:
908 		case UIO_USERISPACE:
909 			if (fuword8(p, &tmp))
910 				return;
911 			break;
912 		case UIO_SYSSPACE:
913 			if (kcopy(p, &tmp, 1))
914 				return;
915 			break;
916 		}
917 		iov++;
918 	}
919 }
920 
921 /*
922  * Calculate the amount of log space that needs to be reserved for this
923  * write request.  If the amount of log space is too large, then
924  * calculate the size that the requests needs to be split into.
925  * First try fixed chunks of size ufs_trans_max_resid. If that
926  * is too big, iterate down to the largest size that will fit.
927  * Pagein the pages in the first chunk here, so that the pagein is
928  * avoided later when the transaction is open.
929  */
930 void
931 ufs_trans_write_resv(
932 	struct inode *ip,
933 	struct uio *uio,
934 	int *resvp,
935 	int *residp)
936 {
937 	ulong_t		resv;
938 	offset_t	offset;
939 	ssize_t		resid;
940 	int		nchunks;
941 
942 	*residp = 0;
943 	offset = uio->uio_offset;
944 	resid = MIN(uio->uio_resid, ufs_trans_max_resid);
945 	resv = ufs_log_amt(ip, offset, resid, 0);
946 	if (resv <= ufs_trans_max_resv) {
947 		ufs_trans_touch(resid, uio);
948 		if (resid != uio->uio_resid)
949 			*residp = resid;
950 		*resvp = resv;
951 		return;
952 	}
953 
954 	resid = uio->uio_resid;
955 	nchunks = 1;
956 	for (; (resv = ufs_log_amt(ip, offset, resid, 0)) > ufs_trans_max_resv;
957 		offset = uio->uio_offset + (nchunks - 1) * resid) {
958 		nchunks++;
959 		resid = uio->uio_resid / nchunks;
960 	}
961 	ufs_trans_touch(resid, uio);
962 	/*
963 	 * If this request takes too much log space, it will be split
964 	 */
965 	if (nchunks > 1)
966 		*residp = resid;
967 	*resvp = resv;
968 }
969 
970 /*
971  * Issue write request.
972  *
973  * Split a large request into smaller chunks.
974  */
975 int
976 ufs_trans_write(
977 	struct inode *ip,
978 	struct uio *uio,
979 	int ioflag,
980 	cred_t *cr,
981 	int resv,
982 	long resid)
983 {
984 	long		realresid;
985 	int		err;
986 	struct ufsvfs	*ufsvfsp = ip->i_ufsvfs;
987 
988 	/*
989 	 * since the write is too big and would "HOG THE LOG" it needs to
990 	 * be broken up and done in pieces.  NOTE, the caller will
991 	 * issue the EOT after the request has been completed
992 	 */
993 	realresid = uio->uio_resid;
994 
995 again:
996 	/*
997 	 * Perform partial request (uiomove will update uio for us)
998 	 *	Request is split up into "resid" size chunks until
999 	 *	"realresid" bytes have been transferred.
1000 	 */
1001 	uio->uio_resid = MIN(resid, realresid);
1002 	realresid -= uio->uio_resid;
1003 	err = wrip(ip, uio, ioflag, cr);
1004 
1005 	/*
1006 	 * Error or request is done; caller issues final EOT
1007 	 */
1008 	if (err || uio->uio_resid || (realresid == 0)) {
1009 		uio->uio_resid += realresid;
1010 		return (err);
1011 	}
1012 
1013 	/*
1014 	 * Generate EOT for this part of the request
1015 	 */
1016 	rw_exit(&ip->i_contents);
1017 	rw_exit(&ufsvfsp->vfs_dqrwlock);
1018 	if (ioflag & (FSYNC|FDSYNC)) {
1019 		TRANS_END_SYNC(ufsvfsp, err, TOP_WRITE_SYNC, resv);
1020 	} else {
1021 		TRANS_END_ASYNC(ufsvfsp, TOP_WRITE, resv);
1022 	}
1023 
1024 	/*
1025 	 * Make sure the input buffer is resident before starting
1026 	 * the next transaction.
1027 	 */
1028 	ufs_trans_touch(MIN(resid, realresid), uio);
1029 
1030 	/*
1031 	 * Generate BOT for next part of the request
1032 	 */
1033 	if (ioflag & (FSYNC|FDSYNC)) {
1034 		int error;
1035 		TRANS_BEGIN_SYNC(ufsvfsp, TOP_WRITE_SYNC, resv, error);
1036 		ASSERT(!error);
1037 	} else {
1038 		TRANS_BEGIN_ASYNC(ufsvfsp, TOP_WRITE, resv);
1039 	}
1040 	rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
1041 	rw_enter(&ip->i_contents, RW_WRITER);
1042 	/*
1043 	 * Error during EOT (probably device error while writing commit rec)
1044 	 */
1045 	if (err)
1046 		return (err);
1047 	goto again;
1048 }
1049