xref: /titanic_50/usr/src/uts/common/fs/ufs/ufs_trans.c (revision f0e308969424e7466232f0795ba1891698c3adf7)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
27 /* All Rights Reserved */
28 
29 /*
30  * Portions of this source code were derived from Berkeley 4.3 BSD
31  * under license from the Regents of the University of California.
32  */
33 
34 #include <sys/sysmacros.h>
35 #include <sys/param.h>
36 #include <sys/types.h>
37 #include <sys/systm.h>
38 #include <sys/t_lock.h>
39 #include <sys/uio.h>
40 #include <sys/kmem.h>
41 #include <sys/thread.h>
42 #include <sys/vfs.h>
43 #include <sys/errno.h>
44 #include <sys/buf.h>
45 #include <sys/vnode.h>
46 #include <sys/fs/ufs_trans.h>
47 #include <sys/fs/ufs_inode.h>
48 #include <sys/fs/ufs_fs.h>
49 #include <sys/fs/ufs_fsdir.h>
50 #include <sys/fs/ufs_quota.h>
51 #include <sys/fs/ufs_panic.h>
52 #include <sys/fs/ufs_bio.h>
53 #include <sys/fs/ufs_log.h>
54 #include <sys/cmn_err.h>
55 #include <sys/file.h>
56 #include <sys/debug.h>
57 
58 
59 extern kmutex_t ufsvfs_mutex;
60 extern struct ufsvfs *ufs_instances;
61 
62 /*
63  * hlock any file systems w/errored logs
64  */
65 int
66 ufs_trans_hlock()
67 {
68 	struct ufsvfs	*ufsvfsp;
69 	struct lockfs	lockfs;
70 	int		error;
71 	int		retry	= 0;
72 
73 	/*
74 	 * find fs's that paniced or have errored logging devices
75 	 */
76 	mutex_enter(&ufsvfs_mutex);
77 	for (ufsvfsp = ufs_instances; ufsvfsp; ufsvfsp = ufsvfsp->vfs_next) {
78 		/*
79 		 * not mounted; continue
80 		 */
81 		if ((ufsvfsp->vfs_vfs == NULL) ||
82 		    (ufsvfsp->vfs_validfs == UT_UNMOUNTED))
83 			continue;
84 		/*
85 		 * disallow unmounts (hlock occurs below)
86 		 */
87 		if (TRANS_ISERROR(ufsvfsp))
88 			ufsvfsp->vfs_validfs = UT_HLOCKING;
89 	}
90 	mutex_exit(&ufsvfs_mutex);
91 
92 	/*
93 	 * hlock the fs's that paniced or have errored logging devices
94 	 */
95 again:
96 	mutex_enter(&ufsvfs_mutex);
97 	for (ufsvfsp = ufs_instances; ufsvfsp; ufsvfsp = ufsvfsp->vfs_next)
98 		if (ufsvfsp->vfs_validfs == UT_HLOCKING)
99 			break;
100 	mutex_exit(&ufsvfs_mutex);
101 	if (ufsvfsp == NULL)
102 		return (retry);
103 	/*
104 	 * hlock the file system
105 	 */
106 	(void) ufs_fiolfss(ufsvfsp->vfs_root, &lockfs);
107 	if (!LOCKFS_IS_ELOCK(&lockfs)) {
108 		lockfs.lf_lock = LOCKFS_HLOCK;
109 		lockfs.lf_flags = 0;
110 		lockfs.lf_comlen = 0;
111 		lockfs.lf_comment = NULL;
112 		error = ufs_fiolfs(ufsvfsp->vfs_root, &lockfs, 0);
113 		/*
114 		 * retry after a while; another app is currently doing lockfs
115 		 */
116 		if (error == EBUSY || error == EINVAL)
117 			retry = 1;
118 	} else {
119 		if (ufsfx_get_failure_qlen() > 0) {
120 			if (mutex_tryenter(&ufs_fix.uq_mutex)) {
121 				ufs_fix.uq_lowat = ufs_fix.uq_ne;
122 				cv_broadcast(&ufs_fix.uq_cv);
123 				mutex_exit(&ufs_fix.uq_mutex);
124 			}
125 		}
126 		retry = 1;
127 	}
128 
129 	/*
130 	 * allow unmounts
131 	 */
132 	ufsvfsp->vfs_validfs = UT_MOUNTED;
133 	goto again;
134 }
135 
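/*
 * ufs_trans_onerror()
 *	Called when a logging error has been detected: raise ufs_hlock's
 *	event count to its low-water mark and broadcast on its condition
 *	variable so that the waiting hlock thread wakes up and runs
 *	ufs_trans_hlock() above.
 */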
136 /*ARGSUSED*/
137 void
138 ufs_trans_onerror()
139 {
140 	mutex_enter(&ufs_hlock.uq_mutex);
141 	ufs_hlock.uq_ne = ufs_hlock.uq_lowat;
142 	cv_broadcast(&ufs_hlock.uq_cv);
143 	mutex_exit(&ufs_hlock.uq_mutex);
144 }
145 
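/*
 * ufs_trans_sbupdate() and the similar wrappers below (ufs_trans_iupdat,
 * ufs_trans_sbwrite) share one convention: if T_DONTBLOCK is already set
 * on the current thread, the caller is inside a transaction, so just do
 * the update; otherwise open an async transaction around the update,
 * unless the system is panicking with logging enabled, in which case
 * nothing is done.
 */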
146 void
147 ufs_trans_sbupdate(struct ufsvfs *ufsvfsp, struct vfs *vfsp, top_t topid)
148 {
149 	if (curthread->t_flag & T_DONTBLOCK) {
150 		sbupdate(vfsp);
151 		return;
152 	} else {
153 
154 		if (panicstr && TRANS_ISTRANS(ufsvfsp))
155 			return;
156 
157 		curthread->t_flag |= T_DONTBLOCK;
158 		TRANS_BEGIN_ASYNC(ufsvfsp, topid, TOP_SBUPDATE_SIZE);
159 		sbupdate(vfsp);
160 		TRANS_END_ASYNC(ufsvfsp, topid, TOP_SBUPDATE_SIZE);
161 		curthread->t_flag &= ~T_DONTBLOCK;
162 	}
163 }
164 
165 void
166 ufs_trans_iupdat(struct inode *ip, int waitfor)
167 {
168 	struct ufsvfs	*ufsvfsp;
169 
170 	if (curthread->t_flag & T_DONTBLOCK) {
171 		rw_enter(&ip->i_contents, RW_READER);
172 		ufs_iupdat(ip, waitfor);
173 		rw_exit(&ip->i_contents);
174 		return;
175 	} else {
176 		ufsvfsp = ip->i_ufsvfs;
177 
178 		if (panicstr && TRANS_ISTRANS(ufsvfsp))
179 			return;
180 
181 		curthread->t_flag |= T_DONTBLOCK;
182 		TRANS_BEGIN_ASYNC(ufsvfsp, TOP_IUPDAT, TOP_IUPDAT_SIZE(ip));
183 		rw_enter(&ip->i_contents, RW_READER);
184 		ufs_iupdat(ip, waitfor);
185 		rw_exit(&ip->i_contents);
186 		TRANS_END_ASYNC(ufsvfsp, TOP_IUPDAT, TOP_IUPDAT_SIZE(ip));
187 		curthread->t_flag &= ~T_DONTBLOCK;
188 	}
189 }
190 
191 void
192 ufs_trans_sbwrite(struct ufsvfs *ufsvfsp, top_t topid)
193 {
194 	if (curthread->t_flag & T_DONTBLOCK) {
195 		mutex_enter(&ufsvfsp->vfs_lock);
196 		ufs_sbwrite(ufsvfsp);
197 		mutex_exit(&ufsvfsp->vfs_lock);
198 		return;
199 	} else {
200 
201 		if (panicstr && TRANS_ISTRANS(ufsvfsp))
202 			return;
203 
204 		curthread->t_flag |= T_DONTBLOCK;
205 		TRANS_BEGIN_ASYNC(ufsvfsp, topid, TOP_SBWRITE_SIZE);
206 		mutex_enter(&ufsvfsp->vfs_lock);
207 		ufs_sbwrite(ufsvfsp);
208 		mutex_exit(&ufsvfsp->vfs_lock);
209 		TRANS_END_ASYNC(ufsvfsp, topid, TOP_SBWRITE_SIZE);
210 		curthread->t_flag &= ~T_DONTBLOCK;
211 	}
212 }
213 
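/*
 * ufs_trans_push_si()
 *	Push routine for a summary-information delta: log the in-core
 *	cylinder group summary area (fs_csp) against its on-disk location.
 */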
214 /*ARGSUSED*/
215 int
216 ufs_trans_push_si(ufsvfs_t *ufsvfsp, delta_t dtyp, int ignore)
217 {
218 	struct fs	*fs;
219 
220 	fs = ufsvfsp->vfs_fs;
221 	mutex_enter(&ufsvfsp->vfs_lock);
222 	TRANS_LOG(ufsvfsp, (char *)fs->fs_u.fs_csp,
223 	    ldbtob(fsbtodb(fs, fs->fs_csaddr)), fs->fs_cssize,
224 	    (caddr_t)fs->fs_u.fs_csp, fs->fs_cssize);
225 	mutex_exit(&ufsvfsp->vfs_lock);
226 	return (0);
227 }
228 
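/*
 * ufs_trans_push_buf()
 *	Push routine for a cached metadata buffer: if the buffer for bno
 *	is still marked delayed-write, write it out; otherwise release the
 *	placeholder buf (clearing its dev so it is not found by mistake
 *	later) and return ENOENT to indicate there was nothing to push.
 */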
229 /*ARGSUSED*/
230 int
231 ufs_trans_push_buf(ufsvfs_t *ufsvfsp, delta_t dtyp, daddr_t bno)
232 {
233 	struct buf	*bp;
234 
235 	bp = (struct buf *)UFS_GETBLK(ufsvfsp, ufsvfsp->vfs_dev, bno, 1);
236 	if (bp == NULL)
237 		return (ENOENT);
238 
239 	if (bp->b_flags & B_DELWRI) {
240 		/*
241 		 * Do not use brwrite() here since the buffer is already
242 		 * marked for retry or not by the code that called
243 		 * TRANS_BUF().
244 		 */
245 		UFS_BWRITE(ufsvfsp, bp);
246 		return (0);
247 	}
248 	/*
249 	 * If we did not find the real buf for this block above then
250 	 * clear the dev so the buf won't be found by mistake
251 	 * for this block later.  We had to allocate at least a 1 byte
252 	 * buffer to keep brelse happy.
253 	 */
254 	if (bp->b_bufsize == 1) {
255 		bp->b_dev = (o_dev_t)NODEV;
256 		bp->b_edev = NODEV;
257 		bp->b_flags = 0;
258 	}
259 	brelse(bp);
260 	return (ENOENT);
261 }
262 
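/*
 * ufs_trans_push_inode()
 *	Push routine for an inode delta: get the inode and, if it has
 *	pending changes, write it to disk; return ENOENT when the inode
 *	cannot be gotten or is already clean.
 */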
263 /*ARGSUSED*/
264 int
265 ufs_trans_push_inode(ufsvfs_t *ufsvfsp, delta_t dtyp, ino_t ino)
266 {
267 	int		error;
268 	struct inode	*ip;
269 
270 	/*
271 	 * Grab the quota lock (if the file system has not been forcibly
272 	 * unmounted).
273 	 */
274 	if (ufsvfsp)
275 		rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
276 
277 	error = ufs_iget(ufsvfsp->vfs_vfs, ino, &ip, kcred);
278 
279 	if (ufsvfsp)
280 		rw_exit(&ufsvfsp->vfs_dqrwlock);
281 	if (error)
282 		return (ENOENT);
283 
284 	if (ip->i_flag & (IUPD|IACC|ICHG|IMOD|IMODACC|IATTCHG)) {
285 		rw_enter(&ip->i_contents, RW_READER);
286 		ufs_iupdat(ip, 1);
287 		rw_exit(&ip->i_contents);
288 		VN_RELE(ITOV(ip));
289 		return (0);
290 	}
291 	VN_RELE(ITOV(ip));
292 	return (ENOENT);
293 }
294 
295 #ifdef DEBUG
296 /*
297  *	These routines maintain the metadata map (matamap)
298  */
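
/*
 * The matamap records the ranges of the device that are known to hold
 * metadata: the superblock, cylinder groups, inode blocks, directory,
 * attribute directory, shadow and indirect blocks, and the quota inode.
 * It exists only in DEBUG kernels (note the #ifdef DEBUG above), where
 * the logging code can use it to sanity-check that deltas cover only
 * known metadata.
 */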
299 
300 /*
301  * update the metadata map at mount
302  */
303 static int
304 ufs_trans_mata_mount_scan(struct inode *ip, void *arg)
305 {
306 	/*
307 	 * wrong file system; keep looking
308 	 */
309 	if (ip->i_ufsvfs != (struct ufsvfs *)arg)
310 		return (0);
311 
312 	/*
313 	 * load the metadata map
314 	 */
315 	rw_enter(&ip->i_contents, RW_WRITER);
316 	ufs_trans_mata_iget(ip);
317 	rw_exit(&ip->i_contents);
318 	return (0);
319 }
320 
321 void
322 ufs_trans_mata_mount(struct ufsvfs *ufsvfsp)
323 {
324 	struct fs	*fs	= ufsvfsp->vfs_fs;
325 	ino_t		ino;
326 	int		i;
327 
328 	/*
329 	 * put static metadata into matamap
330 	 *	superblock
331 	 *	cylinder groups
332 	 *	inode groups
333 	 *	existing inodes
334 	 */
335 	TRANS_MATAADD(ufsvfsp, ldbtob(SBLOCK), fs->fs_sbsize);
336 
337 	for (ino = i = 0; i < fs->fs_ncg; ++i, ino += fs->fs_ipg) {
338 		TRANS_MATAADD(ufsvfsp,
339 		    ldbtob(fsbtodb(fs, cgtod(fs, i))), fs->fs_cgsize);
340 		TRANS_MATAADD(ufsvfsp,
341 		    ldbtob(fsbtodb(fs, itod(fs, ino))),
342 		    fs->fs_ipg * sizeof (struct dinode));
343 	}
344 	(void) ufs_scan_inodes(0, ufs_trans_mata_mount_scan, ufsvfsp, ufsvfsp);
345 }
346 
347 /*
348  * clear the metadata map at umount
349  */
350 void
351 ufs_trans_mata_umount(struct ufsvfs *ufsvfsp)
352 {
353 	top_mataclr(ufsvfsp);
354 }
355 
356 /*
357  * summary info (may be extended during growfs test)
358  */
359 void
360 ufs_trans_mata_si(struct ufsvfs *ufsvfsp, struct fs *fs)
361 {
362 	TRANS_MATAADD(ufsvfsp, ldbtob(fsbtodb(fs, fs->fs_csaddr)),
363 	    fs->fs_cssize);
364 }
365 
366 /*
367  * scan an allocation block (either inode or true block)
368  */
369 static void
370 ufs_trans_mata_direct(
371 	struct inode *ip,
372 	daddr_t *fragsp,
373 	daddr32_t *blkp,
374 	unsigned int nblk)
375 {
376 	int		i;
377 	daddr_t		frag;
378 	ulong_t		nb;
379 	struct ufsvfs	*ufsvfsp	= ip->i_ufsvfs;
380 	struct fs	*fs		= ufsvfsp->vfs_fs;
381 
382 	for (i = 0; i < nblk && *fragsp; ++i, ++blkp)
383 		if ((frag = *blkp) != 0) {
384 			if (*fragsp > fs->fs_frag) {
385 				nb = fs->fs_bsize;
386 				*fragsp -= fs->fs_frag;
387 			} else {
388 				nb = *fragsp * fs->fs_fsize;
389 				*fragsp = 0;
390 			}
391 			TRANS_MATAADD(ufsvfsp, ldbtob(fsbtodb(fs, frag)), nb);
392 		}
393 }
394 
395 /*
396  * scan an indirect allocation block (either inode or true block)
397  */
398 static void
399 ufs_trans_mata_indir(
400 	struct inode *ip,
401 	daddr_t *fragsp,
402 	daddr_t frag,
403 	int level)
404 {
405 	struct ufsvfs *ufsvfsp	= ip->i_ufsvfs;
406 	struct fs *fs = ufsvfsp->vfs_fs;
407 	int ne = fs->fs_bsize / (int)sizeof (daddr32_t);
408 	int i;
409 	struct buf *bp;
410 	daddr32_t *blkp;
411 	o_mode_t ifmt = ip->i_mode & IFMT;
412 
413 	bp = UFS_BREAD(ufsvfsp, ip->i_dev, fsbtodb(fs, frag), fs->fs_bsize);
414 	if (bp->b_flags & B_ERROR) {
415 		brelse(bp);
416 		return;
417 	}
418 	blkp = bp->b_un.b_daddr;
419 
420 	if (level || (ifmt == IFDIR) || (ifmt == IFSHAD) ||
421 	    (ifmt == IFATTRDIR) || (ip == ip->i_ufsvfs->vfs_qinod))
422 		ufs_trans_mata_direct(ip, fragsp, blkp, ne);
423 
424 	if (level)
425 		for (i = 0; i < ne && *fragsp; ++i, ++blkp)
426 			ufs_trans_mata_indir(ip, fragsp, *blkp, level-1);
427 	brelse(bp);
428 }
429 
430 /*
431  * put appropriate metadata into matamap for this inode
432  */
433 void
434 ufs_trans_mata_iget(struct inode *ip)
435 {
436 	int		i;
437 	daddr_t		frags	= dbtofsb(ip->i_fs, ip->i_blocks);
438 	o_mode_t	ifmt 	= ip->i_mode & IFMT;
439 
440 	if (frags && ((ifmt == IFDIR) || (ifmt == IFSHAD) ||
441 	    (ifmt == IFATTRDIR) || (ip == ip->i_ufsvfs->vfs_qinod)))
442 		ufs_trans_mata_direct(ip, &frags, &ip->i_db[0], NDADDR);
443 
444 	if (frags)
445 		ufs_trans_mata_direct(ip, &frags, &ip->i_ib[0], NIADDR);
446 
447 	for (i = 0; i < NIADDR && frags; ++i)
448 		if (ip->i_ib[i])
449 			ufs_trans_mata_indir(ip, &frags, ip->i_ib[i], i);
450 }
451 
452 /*
453  * freeing possible metadata (block of user data)
454  */
455 void
456 ufs_trans_mata_free(struct ufsvfs *ufsvfsp, offset_t mof, off_t nb)
457 {
458 	top_matadel(ufsvfsp, mof, nb);
459 
460 }
461 
462 /*
463  * allocating metadata
464  */
465 void
466 ufs_trans_mata_alloc(
467 	struct ufsvfs *ufsvfsp,
468 	struct inode *ip,
469 	daddr_t frag,
470 	ulong_t nb,
471 	int indir)
472 {
473 	struct fs	*fs	= ufsvfsp->vfs_fs;
474 	o_mode_t	ifmt 	= ip->i_mode & IFMT;
475 
476 	if (indir || ((ifmt == IFDIR) || (ifmt == IFSHAD) ||
477 	    (ifmt == IFATTRDIR) || (ip == ip->i_ufsvfs->vfs_qinod)))
478 		TRANS_MATAADD(ufsvfsp, ldbtob(fsbtodb(fs, frag)), nb);
479 }
480 
481 #endif /* DEBUG */
482 
483 /*
484  * ufs_trans_dir is used to declare a directory delta
485  */
486 int
487 ufs_trans_dir(struct inode *ip, off_t offset)
488 {
489 	daddr_t	bn;
490 	int	contig = 0, error;
491 
492 	ASSERT(ip);
493 	ASSERT(RW_WRITE_HELD(&ip->i_contents));
494 	error = bmap_read(ip, (u_offset_t)offset, &bn, &contig);
495 	if (error || (bn == UFS_HOLE)) {
496 		cmn_err(CE_WARN, "ufs_trans_dir - could not get block"
497 		    " number error = %d bn = %d\n", error, (int)bn);
498 		if (error == 0)	/* treat UFS_HOLE as an I/O error */
499 			error = EIO;
500 		return (error);
501 	}
502 	TRANS_DELTA(ip->i_ufsvfs, ldbtob(bn), DIRBLKSIZ, DT_DIR, 0, 0);
503 	return (error);
504 }
505 
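/*
 * ufs_trans_push_quota()
 *	Push routine for a quota delta declared by ufs_trans_quota() below:
 *	log the modified dqblk at dq_mof, then clear DQ_TRANS and drop the
 *	reference taken by ufs_trans_quota().  Returns nonzero only in the
 *	DQ_ERROR case, where the delta is abandoned.
 */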
506 /*ARGSUSED*/
507 int
508 ufs_trans_push_quota(ufsvfs_t *ufsvfsp, delta_t dtyp, struct dquot *dqp)
509 {
510 	/*
511 	 * Lock the quota subsystem (ufsvfsp can be NULL
512 	 * if the DQ_ERROR is set).
513 	 */
514 	if (ufsvfsp)
515 		rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
516 	mutex_enter(&dqp->dq_lock);
517 
518 	/*
519 	 * If this transaction has been cancelled by closedq_scan_inode(),
520 	 * then bail out now.  We don't call dqput() in this case because
521 	 * it has already been done.
522 	 */
523 	if ((dqp->dq_flags & DQ_TRANS) == 0) {
524 		mutex_exit(&dqp->dq_lock);
525 		if (ufsvfsp)
526 			rw_exit(&ufsvfsp->vfs_dqrwlock);
527 		return (0);
528 	}
529 
530 	if (dqp->dq_flags & DQ_ERROR) {
531 		/*
532 		 * Paranoia to make sure that there is at least one
533 		 * reference to the dquot struct.  We are done with
534 		 * the dquot (due to an error) so clear logging
535 		 * specific markers.
536 		 */
537 		ASSERT(dqp->dq_cnt >= 1);
538 		dqp->dq_flags &= ~DQ_TRANS;
539 		dqput(dqp);
540 		mutex_exit(&dqp->dq_lock);
541 		if (ufsvfsp)
542 			rw_exit(&ufsvfsp->vfs_dqrwlock);
543 		return (1);
544 	}
545 
546 	if (dqp->dq_flags & (DQ_MOD | DQ_BLKS | DQ_FILES)) {
547 		ASSERT((dqp->dq_mof != UFS_HOLE) && (dqp->dq_mof != 0));
548 		TRANS_LOG(ufsvfsp, (caddr_t)&dqp->dq_dqb,
549 		    dqp->dq_mof, (int)sizeof (struct dqblk), NULL, 0);
550 		/*
551 		 * Paranoia to make sure that there is at least one
552 		 * reference to the dquot struct.  Clear the
553 		 * modification flag because the operation is now in
554 		 * the log.  Also clear the logging specific markers
555 		 * that were set in ufs_trans_quota().
556 		 */
557 		ASSERT(dqp->dq_cnt >= 1);
558 		dqp->dq_flags &= ~(DQ_MOD | DQ_TRANS);
559 		dqput(dqp);
560 	}
561 
562 	/*
563 	 * At this point, the logging specific flag should be clear,
564 	 * but add paranoia just in case something has gone wrong.
565 	 */
566 	ASSERT((dqp->dq_flags & DQ_TRANS) == 0);
567 	mutex_exit(&dqp->dq_lock);
568 	if (ufsvfsp)
569 		rw_exit(&ufsvfsp->vfs_dqrwlock);
570 	return (0);
571 }
572 
573 /*
574  * ufs_trans_quota takes a dquot whose on-disk record is already allocated,
575  * places the quota record into the metamap, then declares the delta.
576  */
577 /*ARGSUSED*/
578 void
579 ufs_trans_quota(struct dquot *dqp)
580 {
581 
582 	struct inode	*qip = dqp->dq_ufsvfsp->vfs_qinod;
583 
584 	ASSERT(qip);
585 	ASSERT(MUTEX_HELD(&dqp->dq_lock));
586 	ASSERT(dqp->dq_flags & DQ_MOD);
587 	ASSERT(dqp->dq_mof != 0);
588 	ASSERT(dqp->dq_mof != UFS_HOLE);
589 
590 	/*
591 	 * Mark this dquot to indicate that we are starting a logging
592 	 * file system operation for this dquot.  Also increment the
593 	 * reference count so that the dquot does not get reused while
594 	 * it is on the mapentry_t list.  DQ_TRANS is cleared and the
595 	 * reference count is decremented by ufs_trans_push_quota.
596 	 *
597 	 * If the file system is force-unmounted while there is a
598 	 * pending quota transaction, then closedq_scan_inode() will
599 	 * clear the DQ_TRANS flag and decrement the reference count.
600 	 *
601 	 * Since deltamap_add() drops multiple transactions to the
602 	 * same dq_mof and ufs_trans_push_quota() won't get called,
603 	 * we use DQ_TRANS to prevent repeat transactions from
604 	 * incrementing the reference count (or calling TRANS_DELTA()).
605 	 */
606 	if ((dqp->dq_flags & DQ_TRANS) == 0) {
607 		dqp->dq_flags |= DQ_TRANS;
608 		dqp->dq_cnt++;
609 		TRANS_DELTA(qip->i_ufsvfs, dqp->dq_mof, sizeof (struct dqblk),
610 		    DT_QR, ufs_trans_push_quota, (ulong_t)dqp);
611 	}
612 }
613 
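/*
 * ufs_trans_dqrele()
 *	Release a dquot inside its own async quota transaction (holding
 *	vfs_dqrwlock as reader), so that any on-disk quota update caused
 *	by dropping the reference is covered by the log.
 */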
614 void
615 ufs_trans_dqrele(struct dquot *dqp)
616 {
617 	struct ufsvfs	*ufsvfsp = dqp->dq_ufsvfsp;
618 
619 	curthread->t_flag |= T_DONTBLOCK;
620 	TRANS_BEGIN_ASYNC(ufsvfsp, TOP_QUOTA, TOP_QUOTA_SIZE);
621 	rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
622 	dqrele(dqp);
623 	rw_exit(&ufsvfsp->vfs_dqrwlock);
624 	TRANS_END_ASYNC(ufsvfsp, TOP_QUOTA, TOP_QUOTA_SIZE);
625 	curthread->t_flag &= ~T_DONTBLOCK;
626 }
627 
628 int ufs_trans_max_resv = TOP_MAX_RESV;	/* will be adjusted for testing */
629 long ufs_trans_avgbfree = 0;		/* will be adjusted for testing */
630 #define	TRANS_MAX_WRITE	(1024 * 1024)
631 size_t ufs_trans_max_resid = TRANS_MAX_WRITE;
632 
633 /*
634  * Calculate the log reservation for the given write or truncate
635  */
636 static ulong_t
637 ufs_log_amt(struct inode *ip, offset_t offset, ssize_t resid, int trunc)
638 {
639 	long		ncg, last2blk;
640 	long		niblk		= 0;
641 	u_offset_t	writeend, offblk;
642 	int		resv;
643 	daddr_t		nblk, maxfblk;
644 	long		avgbfree;
645 	struct ufsvfs	*ufsvfsp	= ip->i_ufsvfs;
646 	struct fs	*fs		= ufsvfsp->vfs_fs;
647 	long		fni		= NINDIR(fs);
648 	int		bsize		= fs->fs_bsize;
649 
650 	/*
651 	 * Assume that the request will fit in 1 or 2 cg's,
652 	 * resv is the amount of log space to reserve (in bytes).
653 	 */
654 	resv = SIZECG(ip) * 2 + INODESIZE + 1024;
655 
656 	/*
657 	 * get max position of write in fs blocks
658 	 */
659 	writeend = offset + resid;
660 	maxfblk = lblkno(fs, writeend);
661 	offblk = lblkno(fs, offset);
662 	/*
663 	 * request size in fs blocks
664 	 */
665 	nblk = lblkno(fs, blkroundup(fs, resid));
666 	/*
667 	 * Adjust for sparse files
668 	 */
669 	if (trunc)
670 		nblk = MIN(nblk, ip->i_blocks);
671 
672 	/*
673 	 * Adjust avgbfree (for testing)
674 	 */
675 	avgbfree = (ufs_trans_avgbfree) ? 1 : ufsvfsp->vfs_avgbfree + 1;
676 
677 	/*
678 	 * Calculate maximum number of blocks of triple indirect
679 	 * pointers to write.
680 	 */
681 	last2blk = NDADDR + fni + fni * fni;
682 	if (maxfblk > last2blk) {
683 		long nl2ptr;
684 		long n3blk;
685 
686 		if (offblk > last2blk)
687 			n3blk = maxfblk - offblk;
688 		else
689 			n3blk = maxfblk - last2blk;
690 		niblk += roundup(n3blk * sizeof (daddr_t), bsize) / bsize + 1;
691 		nl2ptr = roundup(niblk, fni) / fni + 1;
692 		niblk += roundup(nl2ptr * sizeof (daddr_t), bsize) / bsize + 2;
693 		maxfblk -= n3blk;
694 	}
695 	/*
696 	 * calculate maximum number of blocks of double indirect
697 	 * pointers to write.
698 	 */
699 	if (maxfblk > NDADDR + fni) {
700 		long n2blk;
701 
702 		if (offblk > NDADDR + fni)
703 			n2blk = maxfblk - offblk;
704 		else
705 			n2blk = maxfblk - NDADDR + fni;
706 		niblk += roundup(n2blk * sizeof (daddr_t), bsize) / bsize + 2;
707 		maxfblk -= n2blk;
708 	}
709 	/*
710 	 * Add in indirect pointer block write
711 	 */
712 	if (maxfblk > NDADDR) {
713 		niblk += 1;
714 	}
715 	/*
716 	 * Calculate deltas for indirect pointer writes
717 	 */
718 	resv += niblk * (fs->fs_bsize + sizeof (struct delta));
719 	/*
720 	 * maximum number of cg's needed for request
721 	 */
722 	ncg = nblk / avgbfree;
723 	if (ncg > fs->fs_ncg)
724 		ncg = fs->fs_ncg;
725 
726 	/*
727 	 * maximum amount of log space needed for request
728 	 */
729 	if (ncg > 2)
730 		resv += (ncg - 2) * SIZECG(ip);
731 
732 	return (resv);
733 }
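
/*
 * In rough terms (this is only a summary of the code above, not a
 * separate formula), ufs_log_amt() reserves:
 *
 *	resv = 2 * SIZECG(ip) + INODESIZE + 1024
 *	     + niblk * (fs_bsize + sizeof (struct delta))
 *	     + MAX(ncg - 2, 0) * SIZECG(ip)
 *
 * where niblk is the estimated number of indirect-pointer blocks the
 * request can dirty and ncg = MIN(nblk / avgbfree, fs_ncg) is the
 * estimated number of cylinder groups the new blocks can land in.
 */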
734 
735 /*
736  * Calculate the amount of log space that needs to be reserved for this
737  * trunc request.  If the amount of log space is too large, then
738  * calculate the size that the request needs to be split into.
739  */
740 void
741 ufs_trans_trunc_resv(
742 	struct inode *ip,
743 	u_offset_t length,
744 	int *resvp,
745 	u_offset_t *residp)
746 {
747 	ulong_t		resv;
748 	u_offset_t	size, offset, resid;
749 	int		nchunks, incr;
750 	int		is_sparse = 0;
751 
752 	/*
753 	 *    *resvp is the amount of log space to reserve (in bytes).
754 	 *    when nonzero, *residp is the number of bytes to truncate.
755 	 *    When nonzero, *residp is the number of bytes to truncate.
756 	*residp = 0;
757 
758 	if (length < ip->i_size) {
759 		size = ip->i_size - length;
760 	} else {
761 		resv = SIZECG(ip) * 2 + INODESIZE + 1024;
762 		/*
763 		 * truncate up, doesn't really use much space,
764 		 * the default above should be sufficient.
765 		 */
766 		goto done;
767 	}
768 
769 	/*
770 	 * There is no need to split sparse file truncation into
771 	 * as many chunks as that of regular files.
772 	 */
773 	is_sparse = bmap_has_holes(ip);
774 
775 	offset = length;
776 	resid = size;
777 	nchunks = 1;
778 	incr = 0;
779 
780 	do {
781 		resv = ufs_log_amt(ip, offset, resid, 1);
782 		/*
783 		 * If this is the first iteration, set "incr".
784 		 */
785 		if (!incr) {
786 			/*
787 			 * If this request takes too much log space,
788 			 * it will be split into "nchunks". If this split
789 			 * is not enough, linearly increment the nchunks in
790 			 * the next iteration.
791 			 */
792 			if (resv > ufs_trans_max_resv && !is_sparse) {
793 				nchunks = MAX(size/ufs_trans_max_resv, 1);
794 				incr = nchunks;
795 			} else {
796 				incr = 1;
797 			}
798 		} else
799 			nchunks += incr;
800 		resid = size / nchunks;
801 		offset = length + (nchunks - 1) * resid;
802 	} while (resv > ufs_trans_max_resv);
803 
804 	if (nchunks > 1) {
805 		*residp = resid;
806 	}
807 done:
808 	*resvp = resv;
809 }
810 
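/*
 * ufs_trans_itrunc()
 *	Truncate an inode under logging.  When the reservation computed by
 *	ufs_trans_trunc_resv() is too large for a single transaction, the
 *	truncation is done in pieces: each pass shrinks the file by "resid"
 *	bytes in its own transaction until the target length is reached.
 */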
811 int
812 ufs_trans_itrunc(struct inode *ip, u_offset_t length, int flags, cred_t *cr)
813 {
814 	int 		err, issync, resv;
815 	u_offset_t	resid;
816 	int		do_block	= 0;
817 	struct ufsvfs	*ufsvfsp	= ip->i_ufsvfs;
818 	struct fs	*fs		= ufsvfsp->vfs_fs;
819 
820 	/*
821 	 * Not logging; just do the trunc
822 	 */
823 	if (!TRANS_ISTRANS(ufsvfsp)) {
824 		rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
825 		rw_enter(&ip->i_contents, RW_WRITER);
826 		err = ufs_itrunc(ip, length, flags, cr);
827 		rw_exit(&ip->i_contents);
828 		rw_exit(&ufsvfsp->vfs_dqrwlock);
829 		return (err);
830 	}
831 
832 	/*
833 	 * within the lockfs protocol but *not* part of a transaction
834 	 */
835 	do_block = curthread->t_flag & T_DONTBLOCK;
836 	curthread->t_flag |= T_DONTBLOCK;
837 
838 	/*
839 	 * Trunc the file (in pieces, if necessary)
840 	 */
841 again:
842 	ufs_trans_trunc_resv(ip, length, &resv, &resid);
843 	TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_ITRUNC, resv);
844 	rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
845 	rw_enter(&ip->i_contents, RW_WRITER);
846 	if (resid) {
847 		/*
848 		 * resid is only set if we have to truncate in chunks
849 		 */
850 		ASSERT(length + resid < ip->i_size);
851 
852 		/*
853 		 * Partially trunc file down to desired size (length).
854 		 * Only retain I_FREE on the last partial trunc.
855 		 * Round up size to a block boundary, to ensure the truncate
856 		 * doesn't have to allocate blocks. This is done both for
857 		 * performance and to fix a bug where if the block can't be
858 		 * allocated then the inode delete fails, but the inode
859 		 * is still freed with attached blocks and non-zero size
860 		 * (bug 4348738).
861 		 */
862 		err = ufs_itrunc(ip, blkroundup(fs, (ip->i_size - resid)),
863 		    flags & ~I_FREE, cr);
864 		ASSERT(ip->i_size != length);
865 	} else
866 		err = ufs_itrunc(ip, length, flags, cr);
867 	if (!do_block)
868 		curthread->t_flag &= ~T_DONTBLOCK;
869 	rw_exit(&ip->i_contents);
870 	rw_exit(&ufsvfsp->vfs_dqrwlock);
871 	TRANS_END_CSYNC(ufsvfsp, err, issync, TOP_ITRUNC, resv);
872 
873 	if ((err == 0) && resid) {
874 		ufsvfsp->vfs_avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg;
875 		goto again;
876 	}
877 	return (err);
878 }
879 
880 /*
881  * Calculate the amount of log space that needs to be reserved for this
882  * write request.  If the amount of log space is too large, then
883  * calculate the size that the request needs to be split into.
884  * First try fixed chunks of size ufs_trans_max_resid. If that
885  * is too big, iterate down to the largest size that will fit.
886  * Pagein the pages in the first chunk here, so that the pagein is
887  * avoided later when the transaction is open.
888  */
889 void
890 ufs_trans_write_resv(
891 	struct inode *ip,
892 	struct uio *uio,
893 	int *resvp,
894 	int *residp)
895 {
896 	ulong_t		resv;
897 	offset_t	offset;
898 	ssize_t		resid;
899 	int		nchunks;
900 
901 	*residp = 0;
902 	offset = uio->uio_offset;
903 	resid = MIN(uio->uio_resid, ufs_trans_max_resid);
904 	resv = ufs_log_amt(ip, offset, resid, 0);
905 	if (resv <= ufs_trans_max_resv) {
906 		uio_prefaultpages(resid, uio);
907 		if (resid != uio->uio_resid)
908 			*residp = resid;
909 		*resvp = resv;
910 		return;
911 	}
912 
913 	resid = uio->uio_resid;
914 	nchunks = 1;
915 	for (; (resv = ufs_log_amt(ip, offset, resid, 0)) > ufs_trans_max_resv;
916 	    offset = uio->uio_offset + (nchunks - 1) * resid) {
917 		nchunks++;
918 		resid = uio->uio_resid / nchunks;
919 	}
920 	uio_prefaultpages(resid, uio);
921 	/*
922 	 * If this request takes too much log space, it will be split
923 	 */
924 	if (nchunks > 1)
925 		*residp = resid;
926 	*resvp = resv;
927 }
928 
929 /*
930  * Issue write request.
931  *
932  * Split a large request into smaller chunks.
933  */
934 int
935 ufs_trans_write(
936 	struct inode *ip,
937 	struct uio *uio,
938 	int ioflag,
939 	cred_t *cr,
940 	int resv,
941 	long resid)
942 {
943 	long		realresid;
944 	int		err;
945 	struct ufsvfs	*ufsvfsp = ip->i_ufsvfs;
946 
947 	/*
948 	 * since the write is too big and would "HOG THE LOG" it needs to
949 	 * be broken up and done in pieces.  NOTE, the caller will
950 	 * issue the EOT after the request has been completed
951 	 */
952 	realresid = uio->uio_resid;
953 
954 again:
955 	/*
956 	 * Perform partial request (uiomove will update uio for us)
957 	 *	Request is split up into "resid" size chunks until
958 	 *	"realresid" bytes have been transferred.
959 	 */
960 	uio->uio_resid = MIN(resid, realresid);
961 	realresid -= uio->uio_resid;
962 	err = wrip(ip, uio, ioflag, cr);
963 
964 	/*
965 	 * Error or request is done; caller issues final EOT
966 	 */
967 	if (err || uio->uio_resid || (realresid == 0)) {
968 		uio->uio_resid += realresid;
969 		return (err);
970 	}
971 
972 	/*
973 	 * Generate EOT for this part of the request
974 	 */
975 	rw_exit(&ip->i_contents);
976 	rw_exit(&ufsvfsp->vfs_dqrwlock);
977 	if (ioflag & (FSYNC|FDSYNC)) {
978 		TRANS_END_SYNC(ufsvfsp, err, TOP_WRITE_SYNC, resv);
979 	} else {
980 		TRANS_END_ASYNC(ufsvfsp, TOP_WRITE, resv);
981 	}
982 
983 	/*
984 	 * Make sure the input buffer is resident before starting
985 	 * the next transaction.
986 	 */
987 	uio_prefaultpages(MIN(resid, realresid), uio);
988 
989 	/*
990 	 * Generate BOT for next part of the request
991 	 */
992 	if (ioflag & (FSYNC|FDSYNC)) {
993 		int error;
994 		TRANS_BEGIN_SYNC(ufsvfsp, TOP_WRITE_SYNC, resv, error);
995 		ASSERT(!error);
996 	} else {
997 		TRANS_BEGIN_ASYNC(ufsvfsp, TOP_WRITE, resv);
998 	}
999 	rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
1000 	rw_enter(&ip->i_contents, RW_WRITER);
1001 	/*
1002 	 * Error during EOT (probably device error while writing commit rec)
1003 	 */
1004 	if (err)
1005 		return (err);
1006 	goto again;
1007 }
1008 }
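
/*
 * A minimal sketch of the calling convention described above: the caller
 * owns the opening BOT and the final EOT; ufs_trans_write() only closes
 * and reopens transactions between chunks.  Names and ordering follow
 * this file; the synchronous (FSYNC/FDSYNC) variant and the error
 * handling of the real caller are omitted:
 *
 *	ufs_trans_write_resv(ip, uio, &resv, &resid);
 *	curthread->t_flag |= T_DONTBLOCK;
 *	TRANS_BEGIN_ASYNC(ufsvfsp, TOP_WRITE, resv);
 *	rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
 *	rw_enter(&ip->i_contents, RW_WRITER);
 *	if (resid)
 *		err = ufs_trans_write(ip, uio, ioflag, cr, resv, resid);
 *	else
 *		err = wrip(ip, uio, ioflag, cr);
 *	rw_exit(&ip->i_contents);
 *	rw_exit(&ufsvfsp->vfs_dqrwlock);
 *	TRANS_END_ASYNC(ufsvfsp, TOP_WRITE, resv);
 *	curthread->t_flag &= ~T_DONTBLOCK;
 */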