/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
/* All Rights Reserved */

/*
 * Portions of this source code were derived from Berkeley 4.3 BSD
 * under license from the Regents of the University of California.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/sysmacros.h>
#include <sys/param.h>
#include <sys/types.h>
#include <sys/systm.h>
#include <sys/t_lock.h>
#include <sys/uio.h>
#include <sys/kmem.h>
#include <sys/thread.h>
#include <sys/vfs.h>
#include <sys/errno.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/fs/ufs_trans.h>
#include <sys/fs/ufs_inode.h>
#include <sys/fs/ufs_fs.h>
#include <sys/fs/ufs_fsdir.h>
#include <sys/fs/ufs_quota.h>
#include <sys/fs/ufs_panic.h>
#include <sys/fs/ufs_bio.h>
#include <sys/fs/ufs_log.h>
#include <sys/cmn_err.h>
#include <sys/file.h>
#include <sys/debug.h>


extern kmutex_t ufsvfs_mutex;
extern struct ufsvfs *ufs_instances;

/*
 * Hard-lock (hlock) any file systems with errored logs.  Returns
 * nonzero if the caller should retry later.
 */
int
ufs_trans_hlock()
{
	struct ufsvfs	*ufsvfsp;
	struct lockfs	lockfs;
	int		error;
	int		retry	= 0;

	/*
	 * find fs's that paniced or have errored logging devices
	 */
	mutex_enter(&ufsvfs_mutex);
	for (ufsvfsp = ufs_instances; ufsvfsp; ufsvfsp = ufsvfsp->vfs_next) {
		/*
		 * not mounted; continue
		 */
		if ((ufsvfsp->vfs_vfs == NULL) ||
		    (ufsvfsp->vfs_validfs == UT_UNMOUNTED))
			continue;
		/*
		 * disallow unmounts (hlock occurs below)
		 */
		if (TRANS_ISERROR(ufsvfsp))
			ufsvfsp->vfs_validfs = UT_HLOCKING;
	}
	mutex_exit(&ufsvfs_mutex);

	/*
	 * hlock the fs's that paniced or have errored logging devices
	 */
again:
	mutex_enter(&ufsvfs_mutex);
	for (ufsvfsp = ufs_instances; ufsvfsp; ufsvfsp = ufsvfsp->vfs_next)
		if (ufsvfsp->vfs_validfs == UT_HLOCKING)
			break;
	mutex_exit(&ufsvfs_mutex);
	if (ufsvfsp == NULL)
		return (retry);
	/*
	 * hlock the file system
	 */
	(void) ufs_fiolfss(ufsvfsp->vfs_root, &lockfs);
	if (!LOCKFS_IS_ELOCK(&lockfs)) {
		lockfs.lf_lock = LOCKFS_HLOCK;
		lockfs.lf_flags = 0;
		lockfs.lf_comlen = 0;
		lockfs.lf_comment = NULL;
		error = ufs_fiolfs(ufsvfsp->vfs_root, &lockfs, 0);
		/*
		 * retry after a while; another app is currently doing lockfs
		 */
		if (error == EBUSY || error == EINVAL)
			retry = 1;
	} else {
		if (ufsfx_get_failure_qlen() > 0) {
			if (mutex_tryenter(&ufs_fix.uq_mutex)) {
				ufs_fix.uq_lowat = ufs_fix.uq_ne;
				cv_broadcast(&ufs_fix.uq_cv);
				mutex_exit(&ufs_fix.uq_mutex);
			}
		}
		retry = 1;
	}

	/*
	 * allow unmounts
	 */
	ufsvfsp->vfs_validfs = UT_MOUNTED;
	goto again;
}

/*ARGSUSED*/
void
ufs_trans_onerror()
{
	mutex_enter(&ufs_hlock.uq_mutex);
	ufs_hlock.uq_ne = ufs_hlock.uq_lowat;
	cv_broadcast(&ufs_hlock.uq_cv);
	mutex_exit(&ufs_hlock.uq_mutex);
}

void
ufs_trans_sbupdate(struct ufsvfs *ufsvfsp, struct vfs *vfsp, top_t topid)
{
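	/*
	 * T_DONTBLOCK set means this thread is already within the
	 * lockfs protocol (an enclosing transaction is open), so
	 * update the superblock without starting a nested transaction.
	 */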
	if (curthread->t_flag & T_DONTBLOCK) {
		sbupdate(vfsp);
		return;
	} else {

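		/*
		 * When panicking on a logging file system, skip the
		 * update rather than start a new transaction.
		 */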
		if (panicstr && TRANS_ISTRANS(ufsvfsp))
			return;

		curthread->t_flag |= T_DONTBLOCK;
		TRANS_BEGIN_ASYNC(ufsvfsp, topid, TOP_SBUPDATE_SIZE);
		sbupdate(vfsp);
		TRANS_END_ASYNC(ufsvfsp, topid, TOP_SBUPDATE_SIZE);
		curthread->t_flag &= ~T_DONTBLOCK;
	}
}

void
ufs_trans_iupdat(struct inode *ip, int waitfor)
{
	struct ufsvfs	*ufsvfsp;

	if (curthread->t_flag & T_DONTBLOCK) {
		rw_enter(&ip->i_contents, RW_READER);
		ufs_iupdat(ip, waitfor);
		rw_exit(&ip->i_contents);
		return;
	} else {
		ufsvfsp = ip->i_ufsvfs;

		if (panicstr && TRANS_ISTRANS(ufsvfsp))
			return;

		curthread->t_flag |= T_DONTBLOCK;
		TRANS_BEGIN_ASYNC(ufsvfsp, TOP_IUPDAT, TOP_IUPDAT_SIZE(ip));
		rw_enter(&ip->i_contents, RW_READER);
		ufs_iupdat(ip, waitfor);
		rw_exit(&ip->i_contents);
		TRANS_END_ASYNC(ufsvfsp, TOP_IUPDAT, TOP_IUPDAT_SIZE(ip));
		curthread->t_flag &= ~T_DONTBLOCK;
	}
}

void
ufs_trans_sbwrite(struct ufsvfs *ufsvfsp, top_t topid)
{
	if (curthread->t_flag & T_DONTBLOCK) {
		mutex_enter(&ufsvfsp->vfs_lock);
		ufs_sbwrite(ufsvfsp);
		mutex_exit(&ufsvfsp->vfs_lock);
		return;
	} else {

		if (panicstr && TRANS_ISTRANS(ufsvfsp))
			return;

		curthread->t_flag |= T_DONTBLOCK;
		TRANS_BEGIN_ASYNC(ufsvfsp, topid, TOP_SBWRITE_SIZE);
		mutex_enter(&ufsvfsp->vfs_lock);
		ufs_sbwrite(ufsvfsp);
		mutex_exit(&ufsvfsp->vfs_lock);
		TRANS_END_ASYNC(ufsvfsp, topid, TOP_SBWRITE_SIZE);
		curthread->t_flag &= ~T_DONTBLOCK;
	}
}

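/*
 * Push (log) the in-core cylinder group summary information at its
 * on-disk location (fs_csaddr).
 */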
/*ARGSUSED*/
int
ufs_trans_push_si(ufsvfs_t *ufsvfsp, delta_t dtyp, int ignore)
{
	struct fs	*fs;

	fs = ufsvfsp->vfs_fs;
	mutex_enter(&ufsvfsp->vfs_lock);
	TRANS_LOG(ufsvfsp, (char *)fs->fs_u.fs_csp,
	    ldbtob(fsbtodb(fs, fs->fs_csaddr)), fs->fs_cssize,
	    (caddr_t)fs->fs_u.fs_csp, fs->fs_cssize);
	mutex_exit(&ufsvfsp->vfs_lock);
	return (0);
}

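/*
 * Push the delayed-write buffer for block bno to disk, if one exists.
 */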
/*ARGSUSED*/
int
ufs_trans_push_buf(ufsvfs_t *ufsvfsp, delta_t dtyp, daddr_t bno)
{
	struct buf	*bp;

	bp = (struct buf *)UFS_GETBLK(ufsvfsp, ufsvfsp->vfs_dev, bno, 1);
	if (bp == NULL)
		return (ENOENT);

	if (bp->b_flags & B_DELWRI) {
		/*
		 * Do not use brwrite() here since the buffer is already
		 * marked for retry or not by the code that called
		 * TRANS_BUF().
		 */
		UFS_BWRITE(ufsvfsp, bp);
		return (0);
	}
	/*
	 * If we did not find the real buf for this block above then
	 * clear the dev so the buf won't be found by mistake
	 * for this block later.  We had to allocate at least a 1-byte
	 * buffer to keep brelse happy.
	 */
	if (bp->b_bufsize == 1) {
		bp->b_dev = (o_dev_t)NODEV;
		bp->b_edev = NODEV;
		bp->b_flags = 0;
	}
	brelse(bp);
	return (ENOENT);
}

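/*
 * Push a modified inode to disk: look it up and write it out if it
 * still has pending changes.
 */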
/*ARGSUSED*/
int
ufs_trans_push_inode(ufsvfs_t *ufsvfsp, delta_t dtyp, ino_t ino)
{
	int		error;
	struct inode	*ip;

	/*
	 * Grab the quota lock (if the file system has not been forcibly
	 * unmounted).
	 */
	if (ufsvfsp)
		rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);

	error = ufs_iget(ufsvfsp->vfs_vfs, ino, &ip, kcred);

	if (ufsvfsp)
		rw_exit(&ufsvfsp->vfs_dqrwlock);
	if (error)
		return (ENOENT);

	if (ip->i_flag & (IUPD|IACC|ICHG|IMOD|IMODACC|IATTCHG)) {
		rw_enter(&ip->i_contents, RW_READER);
		ufs_iupdat(ip, 1);
		rw_exit(&ip->i_contents);
		VN_RELE(ITOV(ip));
		return (0);
	}
	VN_RELE(ITOV(ip));
	return (ENOENT);
}

#ifdef DEBUG
/*
 *	These routines maintain the metadata map (matamap), a DEBUG-only
 *	record of which device offsets hold metadata; it is used to verify
 *	that deltas are declared only against metadata.
 */

/*
 * update the metadata map at mount
 */
static int
ufs_trans_mata_mount_scan(struct inode *ip, void *arg)
{
	/*
	 * wrong file system; keep looking
	 */
	if (ip->i_ufsvfs != (struct ufsvfs *)arg)
		return (0);

	/*
	 * load the metadata map
	 */
	rw_enter(&ip->i_contents, RW_WRITER);
	ufs_trans_mata_iget(ip);
	rw_exit(&ip->i_contents);
	return (0);
}

void
ufs_trans_mata_mount(struct ufsvfs *ufsvfsp)
{
	struct fs	*fs	= ufsvfsp->vfs_fs;
	ino_t		ino;
	int		i;

	/*
	 * put static metadata into matamap
	 *	superblock
	 *	cylinder groups
	 *	inode groups
	 *	existing inodes
	 */
	TRANS_MATAADD(ufsvfsp, ldbtob(SBLOCK), fs->fs_sbsize);

	for (ino = i = 0; i < fs->fs_ncg; ++i, ino += fs->fs_ipg) {
		TRANS_MATAADD(ufsvfsp,
		    ldbtob(fsbtodb(fs, cgtod(fs, i))), fs->fs_cgsize);
		TRANS_MATAADD(ufsvfsp,
		    ldbtob(fsbtodb(fs, itod(fs, ino))),
		    fs->fs_ipg * sizeof (struct dinode));
	}
	(void) ufs_scan_inodes(0, ufs_trans_mata_mount_scan, ufsvfsp, ufsvfsp);
}

/*
 * clear the metadata map at umount
 */
void
ufs_trans_mata_umount(struct ufsvfs *ufsvfsp)
{
	top_mataclr(ufsvfsp);
}

/*
 * summary info (may be extended during growfs test)
 */
void
ufs_trans_mata_si(struct ufsvfs *ufsvfsp, struct fs *fs)
{
	TRANS_MATAADD(ufsvfsp, ldbtob(fsbtodb(fs, fs->fs_csaddr)),
	    fs->fs_cssize);
}

/*
 * scan an allocation block (either inode or true block)
 */
static void
ufs_trans_mata_direct(
	struct inode *ip,
	daddr_t *fragsp,
	daddr32_t *blkp,
	unsigned int nblk)
{
	int		i;
	daddr_t		frag;
	ulong_t		nb;
	struct ufsvfs	*ufsvfsp	= ip->i_ufsvfs;
	struct fs	*fs		= ufsvfsp->vfs_fs;

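	/*
	 * Walk the block pointers, adding each allocated block to the
	 * matamap; *fragsp tracks the fragments remaining so the final
	 * (partial) block is sized correctly.
	 */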
	for (i = 0; i < nblk && *fragsp; ++i, ++blkp)
		if ((frag = *blkp) != 0) {
			if (*fragsp > fs->fs_frag) {
				nb = fs->fs_bsize;
				*fragsp -= fs->fs_frag;
			} else {
				nb = *fragsp * fs->fs_fsize;
				*fragsp = 0;
			}
			TRANS_MATAADD(ufsvfsp, ldbtob(fsbtodb(fs, frag)), nb);
		}
}

/*
 * scan an indirect allocation block (either inode or true block)
 */
static void
ufs_trans_mata_indir(
	struct inode *ip,
	daddr_t *fragsp,
	daddr_t frag,
	int level)
{
	struct ufsvfs *ufsvfsp	= ip->i_ufsvfs;
	struct fs *fs = ufsvfsp->vfs_fs;
	int ne = fs->fs_bsize / (int)sizeof (daddr32_t);
	int i;
	struct buf *bp;
	daddr32_t *blkp;
	o_mode_t ifmt = ip->i_mode & IFMT;

	bp = UFS_BREAD(ufsvfsp, ip->i_dev, fsbtodb(fs, frag), fs->fs_bsize);
	if (bp->b_flags & B_ERROR) {
		brelse(bp);
		return;
	}
	blkp = bp->b_un.b_daddr;

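	/*
	 * At level > 0 the entries here are lower-level indirect blocks
	 * and are always metadata; at level 0 they are data blocks,
	 * which count as metadata only for directories, shadows,
	 * attribute directories, and the quota inode.
	 */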
	if (level || (ifmt == IFDIR) || (ifmt == IFSHAD) ||
	    (ifmt == IFATTRDIR) || (ip == ip->i_ufsvfs->vfs_qinod))
		ufs_trans_mata_direct(ip, fragsp, blkp, ne);

	if (level)
		for (i = 0; i < ne && *fragsp; ++i, ++blkp)
			ufs_trans_mata_indir(ip, fragsp, *blkp, level-1);
	brelse(bp);
}

/*
 * put appropriate metadata into matamap for this inode
 */
void
ufs_trans_mata_iget(struct inode *ip)
{
	int		i;
	daddr_t		frags	= dbtofsb(ip->i_fs, ip->i_blocks);
	o_mode_t	ifmt	= ip->i_mode & IFMT;

	if (frags && ((ifmt == IFDIR) || (ifmt == IFSHAD) ||
	    (ifmt == IFATTRDIR) || (ip == ip->i_ufsvfs->vfs_qinod)))
		ufs_trans_mata_direct(ip, &frags, &ip->i_db[0], NDADDR);

	if (frags)
		ufs_trans_mata_direct(ip, &frags, &ip->i_ib[0], NIADDR);

	for (i = 0; i < NIADDR && frags; ++i)
		if (ip->i_ib[i])
			ufs_trans_mata_indir(ip, &frags, ip->i_ib[i], i);
}

/*
 * freeing possible metadata (block of user data)
 */
void
ufs_trans_mata_free(struct ufsvfs *ufsvfsp, offset_t mof, off_t nb)
{
	top_matadel(ufsvfsp, mof, nb);
}

/*
 * allocating metadata
 */
void
ufs_trans_mata_alloc(
	struct ufsvfs *ufsvfsp,
	struct inode *ip,
	daddr_t frag,
	ulong_t nb,
	int indir)
{
	struct fs	*fs	= ufsvfsp->vfs_fs;
	o_mode_t	ifmt	= ip->i_mode & IFMT;

	if (indir || ((ifmt == IFDIR) || (ifmt == IFSHAD) ||
	    (ifmt == IFATTRDIR) || (ip == ip->i_ufsvfs->vfs_qinod)))
		TRANS_MATAADD(ufsvfsp, ldbtob(fsbtodb(fs, frag)), nb);
}

#endif /* DEBUG */

/*
 * ufs_trans_dir is used to declare a directory delta
 */
int
ufs_trans_dir(struct inode *ip, off_t offset)
{
	daddr_t	bn;
	int	contig = 0, error;

	ASSERT(ip);
	ASSERT(RW_WRITE_HELD(&ip->i_contents));
	error = bmap_read(ip, (u_offset_t)offset, &bn, &contig);
	if (error || (bn == UFS_HOLE)) {
		cmn_err(CE_WARN, "ufs_trans_dir - could not get block"
		    " number error = %d bn = %d\n", error, (int)bn);
		if (error == 0)	/* treat UFS_HOLE as an I/O error */
			error = EIO;
		return (error);
	}
	TRANS_DELTA(ip->i_ufsvfs, ldbtob(bn), DIRBLKSIZ, DT_DIR, 0, 0);
	return (error);
}

/*ARGSUSED*/
int
ufs_trans_push_quota(ufsvfs_t *ufsvfsp, delta_t dtyp, struct dquot *dqp)
{
	/*
	 * Lock the quota subsystem (ufsvfsp can be NULL
	 * if the DQ_ERROR is set).
	 */
	if (ufsvfsp)
		rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
	mutex_enter(&dqp->dq_lock);

	/*
	 * If this transaction has been cancelled by closedq_scan_inode(),
	 * then bail out now.  We don't call dqput() in this case because
	 * it has already been done.
	 */
	if ((dqp->dq_flags & DQ_TRANS) == 0) {
		mutex_exit(&dqp->dq_lock);
		if (ufsvfsp)
			rw_exit(&ufsvfsp->vfs_dqrwlock);
		return (0);
	}

	if (dqp->dq_flags & DQ_ERROR) {
		/*
		 * Paranoia to make sure that there is at least one
		 * reference to the dquot struct.  We are done with
		 * the dquot (due to an error) so clear logging
		 * specific markers.
		 */
		ASSERT(dqp->dq_cnt >= 1);
		dqp->dq_flags &= ~DQ_TRANS;
		dqput(dqp);
		mutex_exit(&dqp->dq_lock);
		if (ufsvfsp)
			rw_exit(&ufsvfsp->vfs_dqrwlock);
		return (1);
	}

	if (dqp->dq_flags & (DQ_MOD | DQ_BLKS | DQ_FILES)) {
		ASSERT((dqp->dq_mof != UFS_HOLE) && (dqp->dq_mof != 0));
		TRANS_LOG(ufsvfsp, (caddr_t)&dqp->dq_dqb,
		    dqp->dq_mof, (int)sizeof (struct dqblk), NULL, 0);
		/*
		 * Paranoia to make sure that there is at least one
		 * reference to the dquot struct.  Clear the
		 * modification flag because the operation is now in
		 * the log.  Also clear the logging specific markers
		 * that were set in ufs_trans_quota().
		 */
		ASSERT(dqp->dq_cnt >= 1);
		dqp->dq_flags &= ~(DQ_MOD | DQ_TRANS);
		dqput(dqp);
	}

	/*
	 * At this point, the logging specific flag should be clear,
	 * but add paranoia just in case something has gone wrong.
	 */
	ASSERT((dqp->dq_flags & DQ_TRANS) == 0);
	mutex_exit(&dqp->dq_lock);
	if (ufsvfsp)
		rw_exit(&ufsvfsp->vfs_dqrwlock);
	return (0);
}

/*
 * ufs_trans_quota marks a modified dquot so that its on-disk quota
 * record gets logged: it takes a reference on the dquot and declares
 * the delta against the record's location in the quota file.
 */
/*ARGSUSED*/
void
ufs_trans_quota(struct dquot *dqp)
{
	struct inode	*qip = dqp->dq_ufsvfsp->vfs_qinod;

	ASSERT(qip);
	ASSERT(MUTEX_HELD(&dqp->dq_lock));
	ASSERT(dqp->dq_flags & DQ_MOD);
	ASSERT(dqp->dq_mof != 0);
	ASSERT(dqp->dq_mof != UFS_HOLE);

	/*
	 * Mark this dquot to indicate that we are starting a logging
	 * file system operation for this dquot.  Also increment the
	 * reference count so that the dquot does not get reused while
	 * it is on the mapentry_t list.  DQ_TRANS is cleared and the
	 * reference count is decremented by ufs_trans_push_quota.
	 *
	 * If the file system is force-unmounted while there is a
	 * pending quota transaction, then closedq_scan_inode() will
	 * clear the DQ_TRANS flag and decrement the reference count.
	 *
	 * Since deltamap_add() drops multiple transactions to the
	 * same dq_mof and ufs_trans_push_quota() won't get called,
	 * we use DQ_TRANS to prevent repeat transactions from
	 * incrementing the reference count (or calling TRANS_DELTA()).
	 */
	if ((dqp->dq_flags & DQ_TRANS) == 0) {
		dqp->dq_flags |= DQ_TRANS;
		dqp->dq_cnt++;
		TRANS_DELTA(qip->i_ufsvfs, dqp->dq_mof, sizeof (struct dqblk),
		    DT_QR, ufs_trans_push_quota, (ulong_t)dqp);
	}
}

void
ufs_trans_dqrele(struct dquot *dqp)
{
	struct ufsvfs	*ufsvfsp = dqp->dq_ufsvfsp;

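	/*
	 * Releasing the last reference may write the dquot back to the
	 * quota file, so wrap the release in its own transaction.
	 */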
	curthread->t_flag |= T_DONTBLOCK;
	TRANS_BEGIN_ASYNC(ufsvfsp, TOP_QUOTA, TOP_QUOTA_SIZE);
	rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
	dqrele(dqp);
	rw_exit(&ufsvfsp->vfs_dqrwlock);
	TRANS_END_ASYNC(ufsvfsp, TOP_QUOTA, TOP_QUOTA_SIZE);
	curthread->t_flag &= ~T_DONTBLOCK;
}

int ufs_trans_max_resv = TOP_MAX_RESV;	/* will be adjusted for testing */
long ufs_trans_avgbfree = 0;		/* will be adjusted for testing */
#define	TRANS_MAX_WRITE	(1024 * 1024)
size_t ufs_trans_max_resid = TRANS_MAX_WRITE;

/*
 * Calculate the log reservation for the given write or truncate
 */
static ulong_t
ufs_log_amt(struct inode *ip, offset_t offset, ssize_t resid, int trunc)
{
	long		ncg, last2blk;
	long		niblk		= 0;
	u_offset_t	writeend, offblk;
	int		resv;
	daddr_t		nblk, maxfblk;
	long		avgbfree;
	struct ufsvfs	*ufsvfsp	= ip->i_ufsvfs;
	struct fs	*fs		= ufsvfsp->vfs_fs;
	long		fni		= NINDIR(fs);
	int		bsize		= fs->fs_bsize;

	/*
	 * Assume that the request will fit in 1 or 2 cg's,
	 * resv is the amount of log space to reserve (in bytes).
	 */
	resv = SIZECG(ip) * 2 + INODESIZE + 1024;

	/*
	 * get max position of write in fs blocks
	 */
	writeend = offset + resid;
	maxfblk = lblkno(fs, writeend);
	offblk = lblkno(fs, offset);
	/*
	 * request size in fs blocks
	 */
	nblk = lblkno(fs, blkroundup(fs, resid));
	/*
	 * Adjust for sparse files
	 */
	if (trunc)
		nblk = MIN(nblk, ip->i_blocks);

	/*
	 * Adjust avgbfree (for testing)
	 */
	avgbfree = (ufs_trans_avgbfree) ? 1 : ufsvfsp->vfs_avgbfree + 1;

	/*
	 * Calculate maximum number of blocks of triple indirect
	 * pointers to write.
	 */
	last2blk = NDADDR + fni + fni * fni;
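	/*
	 * For example (values assumed for illustration): with
	 * fs_bsize == 8192 and 4-byte on-disk block pointers,
	 * fni == 2048, so last2blk == 12 + 2048 + 2048*2048, roughly
	 * 4.2 million file system blocks; only writes beyond that
	 * point touch triple-indirect pointer blocks.
	 */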
	if (maxfblk > last2blk) {
		long nl2ptr;
		long n3blk;

		if (offblk > last2blk)
			n3blk = maxfblk - offblk;
		else
			n3blk = maxfblk - last2blk;
		niblk += roundup(n3blk * sizeof (daddr_t), bsize) / bsize + 1;
		nl2ptr = roundup(niblk, fni) / fni + 1;
		niblk += roundup(nl2ptr * sizeof (daddr_t), bsize) / bsize + 2;
		maxfblk -= n3blk;
	}
	/*
	 * calculate maximum number of blocks of double indirect
	 * pointers to write.
	 */
	if (maxfblk > NDADDR + fni) {
		long n2blk;

		if (offblk > NDADDR + fni)
			n2blk = maxfblk - offblk;
		else
			n2blk = maxfblk - (NDADDR + fni);
		niblk += roundup(n2blk * sizeof (daddr_t), bsize) / bsize + 2;
		maxfblk -= n2blk;
	}
	/*
	 * Add in indirect pointer block write
	 */
	if (maxfblk > NDADDR) {
		niblk += 1;
	}
	/*
	 * Calculate deltas for indirect pointer writes
	 */
	resv += niblk * (fs->fs_bsize + sizeof (struct delta));
	/*
	 * maximum number of cg's needed for request
	 */
	ncg = nblk / avgbfree;
	if (ncg > fs->fs_ncg)
		ncg = fs->fs_ncg;

	/*
	 * maximum amount of log space needed for request
	 */
	if (ncg > 2)
		resv += (ncg - 2) * SIZECG(ip);

	return (resv);
}

/*
 * Calculate the amount of log space that needs to be reserved for this
 * trunc request.  If the amount of log space is too large, then
 * calculate the size that the request needs to be split into.
 */
void
ufs_trans_trunc_resv(
	struct inode *ip,
	u_offset_t length,
	int *resvp,
	u_offset_t *residp)
{
	ulong_t		resv;
	u_offset_t	size, offset, resid;
	int		nchunks;

	/*
	 * *resvp is the amount of log space to reserve (in bytes).
	 * When nonzero, *residp is the number of bytes to truncate
	 * per chunk.
	 */
	*residp = 0;

	if (length < ip->i_size) {
		size = ip->i_size - length;
	} else {
		resv = SIZECG(ip) * 2 + INODESIZE + 1024;
		/*
		 * truncating up doesn't really use much space;
		 * the default above should be sufficient.
		 */
		goto done;
	}

	offset = length;
	resid = size;
	nchunks = 1;
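	/*
	 * Split the range into progressively more (equal) chunks until
	 * a single chunk's reservation fits within ufs_trans_max_resv.
	 */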
	for (; (resv = ufs_log_amt(ip, offset, resid, 1)) > ufs_trans_max_resv;
	    offset = length + (nchunks - 1) * resid) {
		nchunks++;
		resid = size / nchunks;
	}
	/*
	 * If this request takes too much log space, it will be split
	 */
	if (nchunks > 1) {
		*residp = resid;
	}
done:
	*resvp = resv;
}

int
ufs_trans_itrunc(struct inode *ip, u_offset_t length, int flags, cred_t *cr)
{
	int		err, issync, resv;
	u_offset_t	resid;
	int		do_block	= 0;
	struct ufsvfs	*ufsvfsp	= ip->i_ufsvfs;
	struct fs	*fs		= ufsvfsp->vfs_fs;

	/*
	 * Not logging; just do the trunc
	 */
	if (!TRANS_ISTRANS(ufsvfsp)) {
		rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
		rw_enter(&ip->i_contents, RW_WRITER);
		err = ufs_itrunc(ip, length, flags, cr);
		rw_exit(&ip->i_contents);
		rw_exit(&ufsvfsp->vfs_dqrwlock);
		return (err);
	}

	/*
	 * within the lockfs protocol but *not* part of a transaction
	 */
	do_block = curthread->t_flag & T_DONTBLOCK;
	curthread->t_flag |= T_DONTBLOCK;

	/*
	 * Trunc the file (in pieces, if necessary)
	 */
again:
	ufs_trans_trunc_resv(ip, length, &resv, &resid);
	TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_ITRUNC, resv);
	rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
	rw_enter(&ip->i_contents, RW_WRITER);
	if (resid) {
		/*
		 * resid is only set if we have to truncate in chunks
		 */
		ASSERT(length + resid < ip->i_size);

		/*
		 * Partially trunc file down to desired size (length).
		 * Only retain I_FREE on the last partial trunc.
		 * Round up size to a block boundary, to ensure the truncate
		 * doesn't have to allocate blocks. This is done both for
		 * performance and to fix a bug where if the block can't be
		 * allocated then the inode delete fails, but the inode
		 * is still freed with attached blocks and non-zero size
		 * (bug 4348738).
		 */
		err = ufs_itrunc(ip, blkroundup(fs, (ip->i_size - resid)),
		    flags & ~I_FREE, cr);
		ASSERT(ip->i_size != length);
	} else
		err = ufs_itrunc(ip, length, flags, cr);
	if (!do_block)
		curthread->t_flag &= ~T_DONTBLOCK;
	rw_exit(&ip->i_contents);
	rw_exit(&ufsvfsp->vfs_dqrwlock);
	TRANS_END_CSYNC(ufsvfsp, err, issync, TOP_ITRUNC, resv);

	if ((err == 0) && resid) {
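		/*
		 * Refresh the average-free-blocks estimate used by
		 * ufs_log_amt() before reserving space for the next chunk.
		 */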
		ufsvfsp->vfs_avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg;
		goto again;
	}
	return (err);
}

/*
 * Fault in the pages of the first n bytes specified by the uio structure.
 * 1 byte in each page is touched and the uio struct is unmodified.
 * Any error terminates the loop, as this is only a best-effort
 * attempt to get the pages resident.
 */
static void
ufs_trans_touch(ssize_t n, struct uio *uio)
{
	struct iovec *iov;
	ulong_t cnt, incr;
	caddr_t p;
	uint8_t tmp;

	iov = uio->uio_iov;

	while (n) {
		cnt = MIN(iov->iov_len, n);
		if (cnt == 0) {
			/* empty iov entry */
			iov++;
			continue;
		}
		n -= cnt;
		/*
		 * touch each page in this segment.
		 */
		p = iov->iov_base;
		while (cnt) {
			switch (uio->uio_segflg) {
			case UIO_USERSPACE:
			case UIO_USERISPACE:
				if (fuword8(p, &tmp))
					return;
				break;
			case UIO_SYSSPACE:
				if (kcopy(p, &tmp, 1))
					return;
				break;
			}
			incr = MIN(cnt, PAGESIZE);
			p += incr;
			cnt -= incr;
		}
		/*
		 * touch the last byte in case it straddles a page.
		 */
		p--;
		switch (uio->uio_segflg) {
		case UIO_USERSPACE:
		case UIO_USERISPACE:
			if (fuword8(p, &tmp))
				return;
			break;
		case UIO_SYSSPACE:
			if (kcopy(p, &tmp, 1))
				return;
			break;
		}
		iov++;
	}
}

/*
 * Calculate the amount of log space that needs to be reserved for this
 * write request.  If the amount of log space is too large, then
 * calculate the size that the request needs to be split into.
 * First try fixed chunks of size ufs_trans_max_resid.  If that
 * is too big, iterate down to the largest size that will fit.
 * Pagein the pages in the first chunk here, so that the pagein is
 * avoided later when the transaction is open.
 */
void
ufs_trans_write_resv(
	struct inode *ip,
	struct uio *uio,
	int *resvp,
	int *residp)
{
	ulong_t		resv;
	offset_t	offset;
	ssize_t		resid;
	int		nchunks;

	*residp = 0;
	offset = uio->uio_offset;
	resid = MIN(uio->uio_resid, ufs_trans_max_resid);
	resv = ufs_log_amt(ip, offset, resid, 0);
	if (resv <= ufs_trans_max_resv) {
		ufs_trans_touch(resid, uio);
		if (resid != uio->uio_resid)
			*residp = resid;
		*resvp = resv;
		return;
	}

	resid = uio->uio_resid;
	nchunks = 1;
	for (; (resv = ufs_log_amt(ip, offset, resid, 0)) > ufs_trans_max_resv;
	    offset = uio->uio_offset + (nchunks - 1) * resid) {
		nchunks++;
		resid = uio->uio_resid / nchunks;
	}
	ufs_trans_touch(resid, uio);
	/*
	 * If this request takes too much log space, it will be split
	 */
	if (nchunks > 1)
		*residp = resid;
	*resvp = resv;
}

/*
 * Issue write request.
 *
 * Split a large request into smaller chunks.
 */
int
ufs_trans_write(
	struct inode *ip,
	struct uio *uio,
	int ioflag,
	cred_t *cr,
	int resv,
	long resid)
{
	long		realresid;
	int		err;
	struct ufsvfs	*ufsvfsp = ip->i_ufsvfs;

	/*
	 * Since the write is too big and would "HOG THE LOG", it needs to
	 * be broken up and done in pieces.  NOTE: the caller will
	 * issue the final EOT after the request has been completed.
	 */
	realresid = uio->uio_resid;

again:
	/*
	 * Perform partial request (uiomove will update uio for us)
	 *	Request is split up into "resid" size chunks until
	 *	"realresid" bytes have been transferred.
	 */
	uio->uio_resid = MIN(resid, realresid);
	realresid -= uio->uio_resid;
	err = wrip(ip, uio, ioflag, cr);

	/*
	 * Error or request is done; caller issues final EOT
	 */
	if (err || uio->uio_resid || (realresid == 0)) {
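		/*
		 * Restore the bytes not yet attempted so the caller
		 * sees an accurate uio_resid.
		 */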
		uio->uio_resid += realresid;
		return (err);
	}

	/*
	 * Generate EOT for this part of the request
	 */
	rw_exit(&ip->i_contents);
	rw_exit(&ufsvfsp->vfs_dqrwlock);
	if (ioflag & (FSYNC|FDSYNC)) {
		TRANS_END_SYNC(ufsvfsp, err, TOP_WRITE_SYNC, resv);
	} else {
		TRANS_END_ASYNC(ufsvfsp, TOP_WRITE, resv);
	}

	/*
	 * Make sure the input buffer is resident before starting
	 * the next transaction.
	 */
	ufs_trans_touch(MIN(resid, realresid), uio);

	/*
	 * Generate BOT for next part of the request
	 */
	if (ioflag & (FSYNC|FDSYNC)) {
		int error;
		TRANS_BEGIN_SYNC(ufsvfsp, TOP_WRITE_SYNC, resv, error);
		ASSERT(!error);
	} else {
		TRANS_BEGIN_ASYNC(ufsvfsp, TOP_WRITE, resv);
	}
	rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
	rw_enter(&ip->i_contents, RW_WRITER);
	/*
	 * Error during EOT (probably device error while writing commit rec)
	 */
	if (err)
		return (err);
	goto again;
}
1048