xref: /titanic_52/usr/src/uts/common/fs/ufs/ufs_inode.c (revision b7d62af5b42f0da2eb668e8d33d24d2f4fdd98a8)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
28 /*	  All Rights Reserved  	*/
29 
30 /*
31  * University Copyright- Copyright (c) 1982, 1986, 1988
32  * The Regents of the University of California
33  * All Rights Reserved
34  *
35  * University Acknowledgment- Portions of this document are derived from
36  * software developed by the University of California, Berkeley, and its
37  * contributors.
38  */
39 
40 
41 #pragma ident	"%Z%%M%	%I%	%E% SMI"
42 
43 #include <sys/types.h>
44 #include <sys/t_lock.h>
45 #include <sys/param.h>
46 #include <sys/systm.h>
47 #include <sys/uio.h>
48 #include <sys/bitmap.h>
49 #include <sys/signal.h>
50 #include <sys/cred.h>
51 #include <sys/user.h>
52 #include <sys/vfs.h>
53 #include <sys/stat.h>
54 #include <sys/vnode.h>
55 #include <sys/buf.h>
56 #include <sys/proc.h>
57 #include <sys/disp.h>
58 #include <sys/dnlc.h>
59 #include <sys/mode.h>
60 #include <sys/cmn_err.h>
61 #include <sys/kstat.h>
62 #include <sys/acl.h>
63 #include <sys/var.h>
64 #include <sys/fs/ufs_inode.h>
65 #include <sys/fs/ufs_fs.h>
66 #include <sys/fs/ufs_trans.h>
67 #include <sys/fs/ufs_acl.h>
68 #include <sys/fs/ufs_bio.h>
69 #include <sys/fs/ufs_quota.h>
70 #include <sys/fs/ufs_log.h>
71 #include <vm/hat.h>
72 #include <vm/as.h>
73 #include <vm/pvn.h>
74 #include <vm/seg.h>
75 #include <sys/swap.h>
76 #include <sys/cpuvar.h>
77 #include <sys/sysmacros.h>
78 #include <sys/errno.h>
79 #include <sys/kmem.h>
80 #include <sys/debug.h>
81 #include <fs/fs_subr.h>
82 #include <sys/policy.h>
83 
84 struct kmem_cache *inode_cache;		/* kmem cache that struct inodes come from */
85 
86 /*
 * UFS inode cache statistics, published through the "inode_cache" named
 * kstat.  Updates to these counters are not protected by any lock.
 */
87 struct	instats ins = {
88 	{ "size",		KSTAT_DATA_ULONG },
89 	{ "maxsize",		KSTAT_DATA_ULONG },
90 	{ "hits",		KSTAT_DATA_ULONG },
91 	{ "misses",		KSTAT_DATA_ULONG },
92 	{ "kmem allocs",	KSTAT_DATA_ULONG },
93 	{ "kmem frees",		KSTAT_DATA_ULONG },
94 	{ "maxsize reached",	KSTAT_DATA_ULONG },
95 	{ "puts at frontlist",	KSTAT_DATA_ULONG },
96 	{ "puts at backlist",	KSTAT_DATA_ULONG },
97 	{ "queues to free",	KSTAT_DATA_ULONG },
98 	{ "scans",		KSTAT_DATA_ULONG },
99 	{ "thread idles",	KSTAT_DATA_ULONG },
100 	{ "lookup idles",	KSTAT_DATA_ULONG },
101 	{ "vget idles",		KSTAT_DATA_ULONG },
102 	{ "cache allocs",	KSTAT_DATA_ULONG },
103 	{ "cache frees",	KSTAT_DATA_ULONG },
104 	{ "pushes at close",	KSTAT_DATA_ULONG }
105 };
106 
107 /* handle of the installed inode cache kstat; NULL if creation failed */
108 static kstat_t		*ufs_inode_kstat = NULL;
109 
110 union ihead *ihead;	/* inode hash table heads (LRU cache, Chris Maltby) */
111 kmutex_t *ih_lock;	/* per-bucket locks protecting the inode hash table */
112 static int ino_hashlen = 4;	/* desired average hash chain length */
113 int inohsz;		/* number of buckets in the hash table */
114 
115 kmutex_t	ufs_scan_lock;	/* stop racing multiple ufs_scan_inodes() */
116 kmutex_t	ufs_iuniqtime_lock; /* protect iuniqtime */
117 kmutex_t	ufsvfs_mutex;
118 struct ufsvfs	*oldufsvfslist, *ufsvfslist;
119 
120 /*
121  * Time to wait after ufsvfsp->vfs_iotstamp before declaring that no
122  * I/Os are going on.
123  */
124 clock_t	ufs_iowait;
125 
126 /*
127  * The threads that process idle inodes and free (deleted) inodes
128  * have high water marks that are set in ufsinit().
129  * These values can be no less than the minimum shown below.
130  */
131 int	ufs_idle_max;	/* # of allowable idle inodes */
132 ulong_t	ufs_inode_max;	/* hard limit of allowable idle inodes */
133 #define	UFS_IDLE_MAX	(16)	/* min # of allowable idle inodes */
134 
135 /*
136  * Tunables for ufs write throttling.
137  * These are validated in ufs_iinit() since improper settings
138  * can lead to filesystem hangs: ufs_HW must be greater than ufs_LW.
139  */
140 #define	UFS_HW_DEFAULT	(16 * 1024 * 1024)
141 #define	UFS_LW_DEFAULT	(8 * 1024 * 1024)
142 int	ufs_HW = UFS_HW_DEFAULT;
143 int	ufs_LW = UFS_LW_DEFAULT;
144 
145 static void ihinit(void);
146 extern int hash2ints(int, int);
147 
148 static int ufs_iget_internal(struct vfs *, ino_t, struct inode **,
149     struct cred *, int);
150 
151 /* ARGSUSED */
152 static int
153 ufs_inode_kstat_update(kstat_t *ksp, int rw)
154 {
155 	if (rw == KSTAT_WRITE)
156 		return (EACCES);
157 
158 	ins.in_malloc.value.ul	= (ulong_t)kmem_cache_stat(inode_cache,
159 	    "slab_alloc");
160 	ins.in_mfree.value.ul	= (ulong_t)kmem_cache_stat(inode_cache,
161 	    "slab_free");
162 	ins.in_kcalloc.value.ul	= (ulong_t)kmem_cache_stat(inode_cache,
163 	    "alloc");
164 	ins.in_kcfree.value.ul	= (ulong_t)kmem_cache_stat(inode_cache,
165 	    "free");
166 	ins.in_size.value.ul	= (ulong_t)kmem_cache_stat(inode_cache,
167 	    "buf_inuse");
168 	ins.in_maxreached.value.ul = (ulong_t)kmem_cache_stat(inode_cache,
169 	    "buf_max");
170 	ins.in_misses.value.ul = ins.in_kcalloc.value.ul;
171 
172 	return (0);
173 }
174 
175 void
176 ufs_iinit(void)
177 {
178 	/*
179 	 * Validate that ufs_HW > ufs_LW.
180 	 * The default values for these two tunables have been increased.
181 	 * There is now a range of values for ufs_HW that used to be
182 	 * legal on previous Solaris versions but no longer is now.
183 	 * Upgrading a machine which has an /etc/system setting for ufs_HW
184 	 * from that range can lead to filesystem hangs unless the values
185 	 * are checked here.
186 	 */
187 	if (ufs_HW <= ufs_LW) {
188 		cmn_err(CE_WARN,
189 			    "ufs_HW (%d) <= ufs_LW (%d). Check /etc/system.",
190 			    ufs_HW, ufs_LW);
191 		ufs_LW = UFS_LW_DEFAULT;
192 		ufs_HW = UFS_HW_DEFAULT;
193 		cmn_err(CE_CONT, "using defaults, ufs_HW = %d, ufs_LW = %d\n",
194 			    ufs_HW, ufs_LW);
195 	}
196 
197 	/*
198 	 * Adjust the tunable `ufs_ninode' to a reasonable value
199 	 */
200 	if (ufs_ninode <= 0)
201 		ufs_ninode = ncsize;
202 	if (ufs_inode_max == 0)
203 		ufs_inode_max = (ulong_t)((kmem_maxavail() >> 2) /
204 					sizeof (struct inode));
205 	if (ufs_ninode > ufs_inode_max || (ufs_ninode == 0 && ncsize == 0)) {
206 		cmn_err(CE_NOTE, "setting ufs_ninode to max value of %ld",
207 				ufs_inode_max);
208 		ufs_ninode = ufs_inode_max;
209 	}
210 	/*
211 	 * Wait till third call of ufs_update to declare that no I/Os are
212 	 * going on. This allows deferred access times to be flushed to disk.
213 	 */
214 	ufs_iowait = v.v_autoup * hz * 2;
215 
216 	/*
217 	 * idle thread runs when 25% of ufs_ninode entries are on the queue
218 	 */
219 	if (ufs_idle_max == 0)
220 		ufs_idle_max = ufs_ninode >> 2;
221 	if (ufs_idle_max < UFS_IDLE_MAX)
222 		ufs_idle_max = UFS_IDLE_MAX;
223 	if (ufs_idle_max > ufs_ninode)
224 		ufs_idle_max = ufs_ninode;
225 	/*
226 	 * This is really a misnomer, it is ufs_queue_init
227 	 */
228 	ufs_thread_init(&ufs_idle_q, ufs_idle_max);
229 	ufs_thread_start(&ufs_idle_q, ufs_thread_idle, NULL);
230 
231 	/*
232 	 * global hlock thread
233 	 */
234 	ufs_thread_init(&ufs_hlock, 1);
235 	ufs_thread_start(&ufs_hlock, ufs_thread_hlock, NULL);
236 
237 	ihinit();
238 	qtinit();
239 	ins.in_maxsize.value.ul = ufs_ninode;
240 	if ((ufs_inode_kstat = kstat_create("ufs", 0, "inode_cache", "ufs",
241 	    KSTAT_TYPE_NAMED, sizeof (ins) / sizeof (kstat_named_t),
242 	    KSTAT_FLAG_VIRTUAL)) != NULL) {
243 		ufs_inode_kstat->ks_data = (void *)&ins;
244 		ufs_inode_kstat->ks_update = ufs_inode_kstat_update;
245 		kstat_install(ufs_inode_kstat);
246 	}
247 	ufsfx_init();		/* fix-on-panic initialization */
248 	si_cache_init();
249 	ufs_directio_init();
250 	lufs_init();
251 	mutex_init(&ufs_iuniqtime_lock, NULL, MUTEX_DEFAULT, NULL);
252 }
253 
254 /* ARGSUSED */
255 static int
256 ufs_inode_cache_constructor(void *buf, void *cdrarg, int kmflags)
257 {
258 	struct inode *ip = buf;
259 	struct vnode *vp;
260 
261 	rw_init(&ip->i_rwlock, NULL, RW_DEFAULT, NULL);
262 	rw_init(&ip->i_contents, NULL, RW_DEFAULT, NULL);
263 	mutex_init(&ip->i_tlock, NULL, MUTEX_DEFAULT, NULL);
264 	dnlc_dir_init(&ip->i_danchor);
265 
266 	cv_init(&ip->i_wrcv, NULL, CV_DRIVER, NULL);
267 
268 	vp = vn_alloc(KM_SLEEP);
269 	ip->i_vnode = vp;
270 
271 	vn_setops(vp, ufs_vnodeops);
272 	vp->v_data = (caddr_t)ip;
273 
274 	return (0);
275 }
276 
277 /* ARGSUSED */
278 static void
279 ufs_inode_cache_destructor(void *buf, void *cdrarg)
280 {
281 	struct inode *ip = buf;
282 	struct vnode *vp;
283 
284 	vp = ITOV(ip);
285 
286 	rw_destroy(&ip->i_rwlock);
287 	rw_destroy(&ip->i_contents);
288 
289 	mutex_destroy(&ip->i_tlock);
290 	if (vp->v_type == VDIR) {
291 		dnlc_dir_fini(&ip->i_danchor);
292 	}
293 
294 	cv_destroy(&ip->i_wrcv);
295 
296 	vn_free(vp);
297 }
298 
299 /*
300  * Initialize hash links for inodes
301  * and build inode free list.
302  */
303 void
304 ihinit(void)
305 {
306 	int i;
307 	union	ihead *ih = ihead;
308 
309 	mutex_init(&ufs_scan_lock, NULL, MUTEX_DEFAULT, NULL);
310 
311 	inohsz = 1 << highbit(ufs_ninode / ino_hashlen);
312 	ihead = kmem_zalloc(inohsz * sizeof (union ihead), KM_SLEEP);
313 	ih_lock = kmem_zalloc(inohsz * sizeof (kmutex_t), KM_SLEEP);
314 
315 	for (i = 0, ih = ihead; i < inohsz; i++,  ih++) {
316 		ih->ih_head[0] = ih;
317 		ih->ih_head[1] = ih;
318 		mutex_init(&ih_lock[i], NULL, MUTEX_DEFAULT, NULL);
319 	}
320 	inode_cache = kmem_cache_create("ufs_inode_cache",
321 		sizeof (struct inode), 0, ufs_inode_cache_constructor,
322 		ufs_inode_cache_destructor, ufs_inode_cache_reclaim,
323 		NULL, NULL, 0);
324 }
325 
326 /*
327  * Free an inode structure
328  */
329 void
330 ufs_free_inode(struct inode *ip)
331 {
332 	vn_invalid(ITOV(ip));
333 	kmem_cache_free(inode_cache, ip);
334 }
335 
336 /*
337  * Allocate an inode structure
338  */
339 struct inode *
340 ufs_alloc_inode(ufsvfs_t *ufsvfsp, ino_t ino)
341 {
342 	struct inode *ip;
343 	vnode_t *vp;
344 
345 	ip = kmem_cache_alloc(inode_cache, KM_SLEEP);
346 	/*
347 	 * at this point we have a newly allocated inode
348 	 */
349 	ip->i_freef = ip;
350 	ip->i_freeb = ip;
351 	ip->i_flag = IREF;
352 	ip->i_seq = 0xFF;	/* Unique initial value */
353 	ip->i_dev = ufsvfsp->vfs_dev;
354 	ip->i_ufsvfs = ufsvfsp;
355 	ip->i_devvp = ufsvfsp->vfs_devvp;
356 	ip->i_number = ino;
357 	ip->i_diroff = 0;
358 	ip->i_nextr = 0;
359 	ip->i_map = NULL;
360 	ip->i_rdev = 0;
361 	ip->i_writes = 0;
362 	ip->i_mode = 0;
363 	ip->i_delaylen = 0;
364 	ip->i_delayoff = 0;
365 	ip->i_nextrio = 0;
366 	ip->i_ufs_acl = NULL;
367 	ip->i_cflags = 0;
368 	ip->i_mapcnt = 0;
369 	ip->i_dquot = NULL;
370 	ip->i_cachedir = 1;
371 	ip->i_writer = NULL;
372 
373 	/*
374 	 * the vnode for this inode was allocated by the constructor
375 	 */
376 	vp = ITOV(ip);
377 	vn_reinit(vp);
378 	if (ino == (ino_t)UFSROOTINO)
379 		vp->v_flag = VROOT;
380 	vp->v_vfsp = ufsvfsp->vfs_vfs;
381 	vn_exists(vp);
382 	return (ip);
383 }
384 
385 /*
386  * Look up an inode by device, inumber.  If it is in core (in the
387  * inode structure), honor the locking protocol.  If it is not in
388  * core, read it in from the specified device after freeing any pages.
389  * In all cases, a pointer to a VN_HELD inode structure is returned.
390  */
391 int
392 ufs_iget(struct vfs *vfsp, ino_t ino, struct inode **ipp, struct cred *cr)
393 {
394 	return (ufs_iget_internal(vfsp, ino, ipp, cr, 0));
395 }
396 
397 /*
398  * A version of ufs_iget which returns only allocated, linked inodes.
399  * This is appropriate for any callers who do not expect a free inode.
400  */
401 int
402 ufs_iget_alloced(struct vfs *vfsp, ino_t ino, struct inode **ipp,
403     struct cred *cr)
404 {
405 	return (ufs_iget_internal(vfsp, ino, ipp, cr, 1));
406 }
407 
408 /*
409  * Set vnode attributes based on v_type, this should be called whenever
410  * an inode's i_mode is changed.
411  */
412 void
413 ufs_reset_vnode(vnode_t *vp)
414 {
415 	/*
416 	 * an old DBE hack
417 	 */
418 	if ((VTOI(vp)->i_mode & (ISVTX | IEXEC | IFDIR)) == ISVTX)
419 		vp->v_flag |= VSWAPLIKE;
420 	else
421 		vp->v_flag &= ~VSWAPLIKE;
422 
423 	/*
424 	 * if not swap like and it's just a regular file, we want
425 	 * to maintain the vnode's pages sorted by clean/modified
426 	 * for faster sync'ing to disk
427 	 */
428 	if (vp->v_type == VREG)
429 		vp->v_flag |= VMODSORT;
430 	else
431 		vp->v_flag &= ~VMODSORT;
432 
433 	/*
434 	 * Is this an attribute hidden dir?
435 	 */
436 	if ((VTOI(vp)->i_mode & IFMT) == IFATTRDIR)
437 		vp->v_flag |= V_XATTRDIR;
438 	else
439 		vp->v_flag &= ~V_XATTRDIR;
440 }
441 
442 /*
443  * Shared implementation of ufs_iget and ufs_iget_alloced.  The 'validate'
444  * flag is used to distinguish the two; when true, we validate that the inode
445  * being retrieved looks like a linked and allocated inode.
 *
 * vfsp     - file system the inode lives on
 * ino      - inode number to look up
 * ipp      - on success, receives the held inode; not written on failure
 * cr       - credentials, handed to ufs_si_load() when a shadow is read
 * validate - non-zero to fail with EIO when the on-disk inode has no file
 *            type or a link count <= 0
 *
 * Returns 0 on success.  Otherwise returns the error from reading the
 * dinode, EIO for a failed validation, or the error from ufs_si_load().
 * On every failure after the inode has been hashed, the inode is marked
 * ISTALE, its vnode is re-pointed at EIO_vfs and the hold is released.
446  */
447 /* ARGSUSED */
448 static int
449 ufs_iget_internal(struct vfs *vfsp, ino_t ino, struct inode **ipp,
450     struct cred *cr, int validate)
451 {
452 	struct inode *ip, *sp;
453 	union ihead *ih;
454 	kmutex_t *ihm;
455 	struct buf *bp;
456 	struct dinode *dp;
457 	struct vnode *vp;
458 	extern vfs_t EIO_vfs;
459 	int error;
460 	int ftype;	/* XXX - Remove later on */
461 	dev_t vfs_dev;
462 	struct ufsvfs *ufsvfsp;
463 	struct fs *fs;
464 	int hno;
465 	daddr_t bno;
466 	ulong_t ioff;
467 
468 	CPU_STATS_ADD_K(sys, ufsiget, 1);
469 
470 	/*
471 	 * Lookup inode in cache.
472 	 */
473 	vfs_dev = vfsp->vfs_dev;
474 	hno = INOHASH(ino);
475 	ih = &ihead[hno];
476 	ihm = &ih_lock[hno];
477 
478 again:
479 	mutex_enter(ihm);
480 	for (ip = ih->ih_chain[0]; ip != (struct inode *)ih; ip = ip->i_forw) {
		/* skip entries for other inodes/devices and stale entries */
481 		if (ino != ip->i_number || vfs_dev != ip->i_dev ||
482 		    (ip->i_flag & ISTALE))
483 			continue;
484 
485 		/*
486 		 * Found the interesting inode; hold it and drop the cache lock
487 		 */
488 		vp = ITOV(ip);	/* for locknest */
489 		VN_HOLD(vp);
490 		mutex_exit(ihm);
491 		rw_enter(&ip->i_contents, RW_READER);
492 
493 		/*
494 		 * if necessary, remove from idle list
		 * (IREF clear means the inode is on an idle list).
		 * NOTE(review): a non-zero return from ufs_rmidle()
		 * presumably means the idle list owned a hold that must
		 * now be dropped -- confirm against ufs_rmidle().
495 		 */
496 		if ((ip->i_flag & IREF) == 0) {
497 			if (ufs_rmidle(ip))
498 				VN_RELE(vp);
499 		}
500 
501 		/*
502 		 * Could the inode be read from disk?
		 * It may have gone stale between dropping the hash lock
		 * and acquiring i_contents; if so, start over.
503 		 */
504 		if (ip->i_flag & ISTALE) {
505 			rw_exit(&ip->i_contents);
506 			VN_RELE(vp);
507 			goto again;
508 		}
509 
510 		ins.in_hits.value.ul++;
511 		*ipp = ip;
512 
513 		/*
514 		 * Reset the vnode's attribute flags
515 		 */
516 		mutex_enter(&vp->v_lock);
517 		ufs_reset_vnode(vp);
518 		mutex_exit(&vp->v_lock);
519 
520 		rw_exit(&ip->i_contents);
521 
522 		return (0);
523 	}
524 	mutex_exit(ihm);
525 
526 	/*
527 	 * Inode was not in cache.
528 	 *
529 	 * Allocate a new entry
530 	 */
531 	ufsvfsp = (struct ufsvfs *)vfsp->vfs_data;
532 	fs = ufsvfsp->vfs_fs;
533 
534 	ip = ufs_alloc_inode(ufsvfsp, ino);
535 	vp = ITOV(ip);
536 
	/*
	 * bno is the device block holding the inode, ioff the byte offset
	 * of the dinode within that block, and i_doff the dinode's
	 * absolute byte offset on the device.
	 */
537 	bno = fsbtodb(fs, itod(fs, ino));
538 	ioff = (sizeof (struct dinode)) * (itoo(fs, ino));
539 	ip->i_doff = (offset_t)ioff + ldbtob(bno);
540 
541 	/*
542 	 * put a place holder in the cache (if not already there)
	 *
	 * The hash lock was dropped while allocating, so another thread
	 * may have inserted the same inode; if so, discard ours and retry
	 * the lookup.
543 	 */
544 	mutex_enter(ihm);
545 	for (sp = ih->ih_chain[0]; sp != (struct inode *)ih; sp = sp->i_forw)
546 		if (ino == sp->i_number && vfs_dev == sp->i_dev &&
547 		    ((sp->i_flag & ISTALE) == 0)) {
548 			mutex_exit(ihm);
549 			ufs_free_inode(ip);
550 			goto again;
551 		}
552 	/*
553 	 * It would be nice to ASSERT(RW_READ_HELD(&ufsvfsp->vfs_dqrwlock))
554 	 * here, but if we do, then shadow inode allocations panic the
555 	 * system.  We don't have to hold vfs_dqrwlock for shadow inodes
556 	 * and the ufs_iget() parameters don't tell us what we are getting
557 	 * so we have no way of knowing this is a ufs_iget() call from
558 	 * a ufs_ialloc() call for a shadow inode.
559 	 */
	/*
	 * i_contents is taken as writer before the inode becomes visible
	 * in the hash, so lookups that find it block until it is filled in.
	 */
560 	rw_enter(&ip->i_contents, RW_WRITER);
561 	insque(ip, ih);
562 	mutex_exit(ihm);
563 	/*
564 	 * read the dinode
565 	 */
566 	bp = UFS_BREAD(ufsvfsp, ip->i_dev, bno, (int)fs->fs_bsize);
567 
568 	/*
569 	 * Check I/O errors
570 	 */
571 	error = ((bp->b_flags & B_ERROR) ? geterror(bp) : 0);
572 	if (error) {
573 		brelse(bp);
574 		ip->i_flag |= ISTALE;	/* in case someone is looking it up */
575 		rw_exit(&ip->i_contents);
576 		vp->v_vfsp = &EIO_vfs;
577 		VN_RELE(vp);
578 		return (error);
579 	}
580 	/*
581 	 * initialize the inode's dinode
582 	 */
583 	dp = (struct dinode *)(ioff + bp->b_un.b_addr);
584 	ip->i_ic = dp->di_ic;			/* structure assignment */
585 	brelse(bp);
586 
587 	/*
588 	 * Maintain compatibility with Solaris 1.x UFS
	 * (the short uid/gid fields are authoritative unless they hold
	 * the UID_LONG/GID_LONG marker)
589 	 */
590 	if (ip->i_suid != UID_LONG)
591 		ip->i_uid = ip->i_suid;
592 	if (ip->i_sgid != GID_LONG)
593 		ip->i_gid = ip->i_sgid;
594 
595 	ftype = ip->i_mode & IFMT;
596 	if (ftype == IFBLK || ftype == IFCHR) {
597 		dev_t dv;
598 		uint_t top16 = ip->i_ordev & 0xffff0000u;
599 
		/*
		 * The top 16 bits of the on-disk device number select
		 * which expansion routine is applied.
		 */
600 		if (top16 == 0 || top16 == 0xffff0000u)
601 			dv = expdev(ip->i_ordev);
602 		else
603 			dv = expldev(ip->i_ordev);
604 		vp->v_rdev = ip->i_rdev = dv;
605 	}
606 
607 	/*
608 	 * if our caller only expects allocated inodes, verify that
609 	 * this inode looks good; throw it out if it's bad.
610 	 */
611 	if (validate) {
612 		if ((ftype == 0) || (ip->i_nlink <= 0)) {
613 			ip->i_flag |= ISTALE;
614 			rw_exit(&ip->i_contents);
615 			vp->v_vfsp = &EIO_vfs;
616 			VN_RELE(vp);
617 			cmn_err(CE_NOTE,
618 			    "%s: unexpected free inode %d, run fsck(1M)%s",
619 			    fs->fs_fsmnt, (int)ino,
620 			    (TRANS_ISTRANS(ufsvfsp) ? " -o f" : ""));
621 			return (EIO);
622 		}
623 	}
624 
625 	/*
626 	 * finish initializing the vnode
627 	 */
628 	vp->v_type = IFTOVT((mode_t)ip->i_mode);
629 
630 	ufs_reset_vnode(vp);
631 
632 	/*
633 	 * read the shadow
634 	 */
635 	if (ftype != 0 && ip->i_shadow != 0) {
636 		if ((error = ufs_si_load(ip, cr)) != 0) {
637 			ip->i_flag |= ISTALE;
638 			ip->i_ufs_acl = NULL;
639 			rw_exit(&ip->i_contents);
640 			vp->v_vfsp = &EIO_vfs;
641 			VN_RELE(vp);
642 			return (error);
643 		}
644 	}
645 
646 	/*
647 	 * Only attach quota information if the inode has a type and if
648 	 * that type is not a shadow inode.
	 * (Attribute directories are excluded as well.)
649 	 */
650 	if (ip->i_mode && ((ip->i_mode & IFMT) != IFSHAD) &&
651 	    ((ip->i_mode & IFMT) != IFATTRDIR)) {
652 		ip->i_dquot = getinoquota(ip);
653 	}
654 	TRANS_MATA_IGET(ufsvfsp, ip);
655 	*ipp = ip;
656 	rw_exit(&ip->i_contents);
657 
658 	return (0);
659 }
660 
661 /*
662  * Vnode is no longer referenced, write the inode out
663  * and if necessary, truncate and deallocate the file.
 *
 * Called with the caller's hold on the vnode still in place.  Depending
 * on the inode's state this routine does one of the following:
 *   - drops the hold and returns, if someone else holds the vnode;
 *   - frees the inode outright, if i_ufsvfs is NULL (umount case);
 *   - deletes the file directly via ufs_delete(), or queues it on the
 *     file system's delete queue, if the file is unlinked and the file
 *     system is writable;
 *   - otherwise clears IREF and places the inode on the "useful" or
 *     "junk" idle queue.
664  */
665 void
666 ufs_iinactive(struct inode *ip)
667 {
668 	int		front;
669 	struct inode	*iq;
670 	struct inode	*hip;
671 	struct ufs_q	*uq;
672 	struct vnode	*vp = ITOV(ip);
673
674
675 	/*
676 	 * Because the vnode type might have been changed,
677 	 * the dnlc_dir_purge must be called unconditionally.
678 	 */
679 	dnlc_dir_purge(&ip->i_danchor);
680 
681 	/*
682 	 * Get exclusive access to inode data.
683 	 */
684 	rw_enter(&ip->i_contents, RW_WRITER);
685 	ASSERT(ip->i_flag & IREF);
686 
687 	/*
688 	 * Make sure no one reclaimed the inode before we put it on
689 	 * the freelist or destroy it. We keep our 'hold' on the vnode
690 	 * from vn_rele until we are ready to do something with the inode.
691 	 *
692 	 * Pageout may put a VN_HOLD/VN_RELE at anytime during this
693 	 * operation via an async putpage, so we must make sure
694 	 * we don't free/destroy the inode more than once. ufs_iget
695 	 * may also put a VN_HOLD on the inode before it grabs
696 	 * the i_contents lock. This is done so we don't free
697 	 * an inode that a thread is waiting on.
698 	 */
699 	mutex_enter(&vp->v_lock);
700 
701 	if (vp->v_count > 1) {
702 	    vp->v_count--;  /* release our hold from vn_rele */
703 	    mutex_exit(&vp->v_lock);
704 	    rw_exit(&ip->i_contents);
705 	    return;
706 	}
707 	mutex_exit(&vp->v_lock);
708 
709 	/*
710 	 * For umount case: if ufsvfs ptr is NULL, the inode is unhashed
711 	 * and clean.  It can be safely destroyed (cyf).
712 	 */
713 	if (ip->i_ufsvfs == NULL) {
714 		rw_exit(&ip->i_contents);
715 		ufs_si_del(ip);
716 		ASSERT((vp->v_type == VCHR) || !vn_has_cached_data(vp));
717 		ufs_free_inode(ip);
718 		return;
719 	}
720 
721 	/*
722 	 * queue idle inode to appropriate thread. Will check v_count == 1
723 	 * prior to putting this on the appropriate queue.
724 	 * Stale inodes will be unhashed and freed by the ufs idle thread
725 	 * in ufs_idle_free()
726 	 */
	/* front is always 1 here, so delete-queue inserts go to the head */
727 	front = 1;
	/*
	 * Delete path: the inode is not stale, the file system is
	 * writable, and the file has a mode but no remaining links.
	 */
728 	if ((ip->i_flag & ISTALE) == 0 && ip->i_fs->fs_ronly == 0 &&
729 	    ip->i_mode && ip->i_nlink <= 0) {
730 		/*
731 		 * Mark the i_flag to indicate that inode is being deleted.
732 		 * This flag will be cleared when the deletion is complete.
733 		 * This prevents nfs from sneaking in via ufs_vget() while
734 		 * the delete is in progress (bugid 1242481).
735 		 */
736 		ip->i_flag |= IDEL;
737 
738 		/*
739 		 * NOIDEL means that deletes are not allowed at this time;
740 		 * whoever resets NOIDEL will also send this inode back
741 		 * through ufs_iinactive.  IREF remains set.
742 		 */
743 		if (ULOCKFS_IS_NOIDEL(ITOUL(ip))) {
744 			mutex_enter(&vp->v_lock);
745 			vp->v_count--;
746 			mutex_exit(&vp->v_lock);
747 			rw_exit(&ip->i_contents);
748 			return;
749 		}
		/*
		 * When TRANS_ISTRANS() is false the delete is performed
		 * synchronously here; i_contents is dropped first.
		 */
750 		if (!TRANS_ISTRANS(ip->i_ufsvfs)) {
751 			rw_exit(&ip->i_contents);
752 			ufs_delete(ip->i_ufsvfs, ip, 0);
753 			return;
754 		}
755 
756 		/* queue to delete thread; IREF remains set */
757 		ins.in_qfree.value.ul++;
758 		uq = &ip->i_ufsvfs->vfs_delete;
759 
760 		mutex_enter(&uq->uq_mutex);
761 
762 		/* add to q */
		/*
		 * The queue is a circular doubly-linked list.  The inode is
		 * linked in just before the current head and then, because
		 * front is set, becomes the new head.
		 */
763 		if ((iq = uq->uq_ihead) != 0) {
764 			ip->i_freef = iq;
765 			ip->i_freeb = iq->i_freeb;
766 			iq->i_freeb->i_freef = ip;
767 			iq->i_freeb = ip;
768 			if (front)
769 				uq->uq_ihead = ip;
770 		} else {
			/* empty queue: the inode is the only element */
771 			uq->uq_ihead = ip;
772 			ip->i_freef = ip;
773 			ip->i_freeb = ip;
774 		}
775 	} else {
776 		/*
777 		 * queue to idle thread
778 		 *  Check the v_count == 1 again.
779 		 *
780 		 */
781 		mutex_enter(&vp->v_lock);
782 		if (vp->v_count > 1) {
783 		    vp->v_count--;  /* release our hold from vn_rele */
784 		    mutex_exit(&vp->v_lock);
785 		    rw_exit(&ip->i_contents);
786 		    return;
787 		}
788 		mutex_exit(&vp->v_lock);
789 		uq = &ufs_idle_q;
790 
791 		/*
792 		 * useful iff it has pages or is a fastsymlink; otherwise junk
793 		 */
794 		mutex_enter(&uq->uq_mutex);
795 
796 		/* clear IREF means `on idle list' */
797 		ip->i_flag &= ~(IREF | IDIRECTIO);
798 
799 		if (vn_has_cached_data(vp) || ip->i_flag & IFASTSYMLNK) {
800 			ins.in_frback.value.ul++;
801 			hip = (inode_t *)&ufs_useful_iq[IQHASH(ip)];
802 			ufs_nuseful_iq++;
803 		} else {
804 			ins.in_frfront.value.ul++;
805 			hip = (inode_t *)&ufs_junk_iq[IQHASH(ip)];
806 			ip->i_flag |= IJUNKIQ;
807 			ufs_njunk_iq++;
808 		}
		/* link the inode in just before the chosen queue head */
809 		ip->i_freef = hip;
810 		ip->i_freeb = hip->i_freeb;
811 		hip->i_freeb->i_freef = ip;
812 		hip->i_freeb = ip;
813 	}
814 
815 	/* wakeup thread(s) if q is overfull */
	/* the broadcast fires only when the count reaches uq_lowat exactly */
816 	if (++uq->uq_ne == uq->uq_lowat)
817 		cv_broadcast(&uq->uq_cv);
818 
819 	/* all done, release the q and inode */
820 	mutex_exit(&uq->uq_mutex);
821 	rw_exit(&ip->i_contents);
822 }
823 
824 /*
825  * Check accessed and update flags on an inode structure.
826  * If any are on, update the inode with the (unique) current time.
827  * If waitfor is given, insure I/O order so wait for write to complete.
 *
 * ip      - inode to update; the caller must hold i_contents (reader
 *           or writer)
 * waitfor - non-zero to request a synchronous update of the on-disk
 *           inode
 *
 * Nothing is done if the file system has been forcibly unmounted
 * (i_ufsvfs is NULL) or the inode is stale.  On a read-only file system,
 * or if the dinode block cannot be read, the pending update flags are
 * simply cleared and no error is reported to the caller.
828  */
829 void
830 ufs_iupdat(struct inode *ip, int waitfor)
831 {
832 	struct buf	*bp;
833 	struct fs	*fp;
834 	struct dinode	*dp;
835 	struct ufsvfs	*ufsvfsp 	= ip->i_ufsvfs;
836 	int 		i;
837 	int		do_trans_times;
838 	ushort_t	flag;
839 	o_uid_t		suid;
840 	o_gid_t		sgid;
841 
842 	/*
843 	 * This function is now safe to be called with either the reader
844 	 * or writer i_contents lock.
845 	 */
846 	ASSERT(RW_LOCK_HELD(&ip->i_contents));
847 
848 	/*
849 	 * Return if file system has been forcibly umounted.
850 	 */
851 	if (ufsvfsp == NULL)
852 		return;
853 
	/* later decisions use this snapshot rather than re-reading i_flag */
854 	flag = ip->i_flag;	/* Atomic read */
855 	/*
856 	 * We better not update the disk inode from a stale inode.
857 	 */
858 	if (flag & ISTALE)
859 		return;
860 
861 	fp = ip->i_fs;
862 
863 	if ((flag & (IUPD|IACC|ICHG|IMOD|IMODACC|IATTCHG)) != 0) {
		/* nothing can be written on a read-only fs; drop the flags */
864 		if (fp->fs_ronly) {
865 			mutex_enter(&ip->i_tlock);
866 			ip->i_flag &= ~(IUPD|IACC|ICHG|IMOD|IMODACC|IATTCHG);
867 			mutex_exit(&ip->i_tlock);
868 			return;
869 		}
870 		/*
871 		 * fs is active while metadata is being written
		 *
		 * NOTE(review): vfs_lock is not released anywhere in this
		 * function; presumably ufs_notclean() drops it -- confirm.
872 		 */
873 		mutex_enter(&ufsvfsp->vfs_lock);
874 		ufs_notclean(ufsvfsp);
875 		/*
876 		 * get the dinode
877 		 */
878 		bp = UFS_BREAD(ufsvfsp, ip->i_dev,
879 		    (daddr_t)fsbtodb(fp, itod(fp, ip->i_number)),
880 		    (int)fp->fs_bsize);
		/* on a read error the pending update is abandoned */
881 		if (bp->b_flags & B_ERROR) {
882 			mutex_enter(&ip->i_tlock);
883 			ip->i_flag &=
884 			    ~(IUPD|IACC|ICHG|IMOD|IMODACC|IATTCHG);
885 			mutex_exit(&ip->i_tlock);
886 			brelse(bp);
887 			return;
888 		}
889 		/*
890 		 * munge inode fields
		 *
		 * do_trans_times is set when IMODACC is pending without
		 * IMOD.
891 		 */
892 		mutex_enter(&ip->i_tlock);
893 		ITIMES_NOLOCK(ip);
894 		do_trans_times = ((ip->i_flag & (IMOD|IMODACC)) == IMODACC);
895 		ip->i_flag &= ~(IUPD|IACC|ICHG|IMOD|IMODACC|IATTCHG);
896 		mutex_exit(&ip->i_tlock);
897 
898 		/*
899 		 * For reads and concurrent re-writes, no deltas were
900 		 * entered for the access time changes - do it now.
901 		 */
902 		if (do_trans_times) {
903 			TRANS_INODE_TIMES(ufsvfsp, ip);
904 		}
905 
906 		/*
907 		 * For SunOS 5.0->5.4, these lines below read:
908 		 *
909 		 * suid = (ip->i_uid > MAXUID) ? UID_LONG : ip->i_uid;
910 		 * sgid = (ip->i_gid > MAXUID) ? GID_LONG : ip->i_gid;
911 		 *
912 		 * where MAXUID was set to 60002.  This was incorrect -
913 		 * the uids should have been constrained to what fitted into
914 		 * a 16-bit word.
915 		 *
916 		 * This means that files from 4.x filesystems that have an
917 		 * i_suid field larger than 60002 will have that field
918 		 * changed to 65535.
919 		 *
920 		 * Security note: 4.x UFS could never create a i_suid of
921 		 * UID_LONG since that would've corresponded to -1.
922 		 */
923 		suid = (ulong_t)ip->i_uid > (ulong_t)USHRT_MAX ?
924 			UID_LONG : ip->i_uid;
925 		sgid = (ulong_t)ip->i_gid > (ulong_t)USHRT_MAX ?
926 			GID_LONG : ip->i_gid;
927 
		/* only touch (and log) the short ids when they change */
928 		if ((ip->i_suid != suid) || (ip->i_sgid != sgid)) {
929 			ip->i_suid = suid;
930 			ip->i_sgid = sgid;
931 			TRANS_INODE(ufsvfsp, ip);
932 		}
933 
934 		if ((ip->i_mode & IFMT) == IFBLK ||
935 		    (ip->i_mode & IFMT) == IFCHR) {
936 			dev_t d = ip->i_rdev;
937 			dev32_t dev32;
938 
939 			/*
940 			 * load first direct block only if special device
941 			 */
942 			if (!cmpldev(&dev32, d)) {
943 				/*
944 				 * We panic here because there's "no way"
945 				 * we should have been able to create a large
946 				 * inode with a large dev_t.  Earlier layers
947 				 * should've caught this.
948 				 */
949 				panic("ip %p: i_rdev too big", (void *)ip);
950 			}
951 
			/*
			 * Store the 32-bit form if the device number has
			 * bits outside the old major/minor ranges;
			 * otherwise store the old compressed form.
			 */
952 			if (dev32 & ~((O_MAXMAJ << L_BITSMINOR32) | O_MAXMIN)) {
953 				ip->i_ordev = dev32;	/* can't use old fmt. */
954 			} else {
955 				ip->i_ordev = cmpdev(d);
956 			}
957 		}
958 
959 		/*
960 		 * copy inode to dinode (zero fastsymlnk in dinode)
		 *
		 * For a fast symlink every block pointer except di_db[0]
		 * is zeroed in the on-disk copy.
961 		 */
962 		dp = (struct dinode *)bp->b_un.b_addr + itoo(fp, ip->i_number);
963 		dp->di_ic = ip->i_ic;	/* structure assignment */
964 		if (flag & IFASTSYMLNK) {
965 			for (i = 1; i < NDADDR; i++)
966 				dp->di_db[i] = 0;
967 			for (i = 0; i < NIADDR; i++)
968 				dp->di_ib[i] = 0;
969 		}
970 		if (TRANS_ISTRANS(ufsvfsp)) {
971 			/*
972 			 * Pass only a sector size buffer containing
973 			 * the inode, otherwise when the buffer is copied
974 			 * into a cached roll buffer then too much memory
975 			 * gets consumed if 8KB inode buffers are passed.
976 			 */
977 			TRANS_LOG(ufsvfsp, (caddr_t)dp, ip->i_doff,
978 			    sizeof (struct dinode),
979 			    (caddr_t)P2ALIGN((uintptr_t)dp, DEV_BSIZE),
980 			    DEV_BSIZE);
981 
982 			brelse(bp);
983 		} else if (waitfor && (ip->i_ufsvfs->vfs_dio == 0)) {
984 			UFS_BRWRITE(ufsvfsp, bp);
985 
986 			/*
987 			 * Synchronous write has guaranteed that inode
988 			 * has been written on disk so clear the flag
989 			 */
990 			mutex_enter(&ip->i_tlock);
991 			ip->i_flag &= ~IBDWRITE;
992 			mutex_exit(&ip->i_tlock);
993 		} else {
994 			bdrwrite(bp);
995 
996 			/*
997 			 * This write hasn't guaranteed that inode has been
998 			 * written on the disk.
999 			 * Since, all updat flags on inode are cleared, we must
1000 			 * remember the condition in case inode is to be updated
1001 			 * synchronously later (e.g.- fsync()/fdatasync())
1002 			 * and inode has not been modified yet.
1003 			 */
1004 			mutex_enter(&ip->i_tlock);
1005 			ip->i_flag |= IBDWRITE;
1006 			mutex_exit(&ip->i_tlock);
1007 		}
1008 	} else {
1009 		/*
1010 		 * In case previous inode update was done asynchronously
1011 		 * (IBDWRITE) and this inode update request wants guaranteed
1012 		 * (synchronous) disk update, flush the inode.
1013 		 */
1014 		if (waitfor && (flag & IBDWRITE)) {
1015 			blkflush(ip->i_dev,
1016 				(daddr_t)fsbtodb(fp, itod(fp, ip->i_number)));
1017 			mutex_enter(&ip->i_tlock);
1018 			ip->i_flag &= ~IBDWRITE;
1019 			mutex_exit(&ip->i_tlock);
1020 		}
1021 	}
1022 }
1023 
1024 #define	SINGLE	0	/* index of single indirect block */
1025 #define	DOUBLE	1	/* index of double indirect block */
1026 #define	TRIPLE	2	/* index of triple indirect block */
1027 
1028 /*
1029  * Release blocks associated with the inode ip and
1030  * stored in the indirect block bn.  Blocks are free'd
1031  * in LIFO order up to (but not including) lastbn.  If
1032  * level is greater than SINGLE, the block is an indirect
1033  * block and recursive calls to indirtrunc must be used to
1034  * cleanse other indirect blocks.
1035  *
1036  * N.B.: triple indirect blocks are untested.
 *
 * ip     - inode being truncated; i_contents must be held as writer
 * bn     - file system block number of the indirect block to process
 * lastbn - last block, relative to this indirect block, to keep;
 *          -1 frees everything the block references
 * level  - SINGLE, DOUBLE or TRIPLE
 * flags  - I_CHEAP skips zeroing and rewriting the indirect block; the
 *          flags are also passed on to free()
 *
 * Returns the amount of space released, accumulated in units of
 * btodb(fs_bsize) per freed file system block.  Returns 0 if the
 * indirect block cannot be read.
1037  */
1038 static long
1039 indirtrunc(struct inode *ip, daddr_t bn, daddr_t lastbn, int level, int flags)
1040 {
1041 	int i;
1042 	struct buf *bp, *copy;
1043 	daddr32_t *bap;
1044 	struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
1045 	struct fs *fs = ufsvfsp->vfs_fs;
1046 	daddr_t nb, last;
1047 	long factor;
1048 	int blocksreleased = 0, nblocks;
1049 
1050 	ASSERT(RW_WRITE_HELD(&ip->i_contents));
1051 	/*
1052 	 * Calculate index in current block of last
1053 	 * block to be kept.  -1 indicates the entire
1054 	 * block so we need not calculate the index.
	 *
	 * factor is the number of data blocks addressed by one entry
	 * of this indirect block: NINDIR(fs) raised to the power `level'.
1055 	 */
1056 	factor = 1;
1057 	for (i = SINGLE; i < level; i++)
1058 		factor *= NINDIR(fs);
1059 	last = lastbn;
1060 	if (lastbn > 0)
1061 		last /= factor;
1062 	nblocks = btodb(fs->fs_bsize);
1063 	/*
1064 	 * Get buffer of block pointers, zero those
1065 	 * entries corresponding to blocks to be free'd,
1066 	 * and update on disk copy first.
1067 	 * *Unless* the root pointer has been synchronously
1068 	 * written to disk.  If nothing points to this
1069 	 * indirect block then don't bother zero'ing and
1070 	 * writing it.
1071 	 */
1072 	bp = UFS_BREAD(ufsvfsp,
1073 			ip->i_dev, (daddr_t)fsbtodb(fs, bn), (int)fs->fs_bsize);
1074 	if (bp->b_flags & B_ERROR) {
1075 		brelse(bp);
1076 		return (0);
1077 	}
1078 	bap = bp->b_un.b_daddr;
1079 	if ((flags & I_CHEAP) == 0) {
1080 		uint_t	zb;
1081 
		/* number of bytes of pointers that follow entry `last' */
1082 		zb = (uint_t)((NINDIR(fs) - (last + 1)) * sizeof (daddr32_t));
1083 
1084 		if (zb) {
1085 			/*
1086 			 * push any data into the log before we zero it
1087 			 */
1088 			if (bp->b_flags & B_DELWRI)
1089 				TRANS_LOG(ufsvfsp, (caddr_t)bap,
1090 					ldbtob(bp->b_blkno), bp->b_bcount,
1091 					bp->b_un.b_addr, bp->b_bcount);
			/*
			 * Keep a private copy of the original pointers: the
			 * zeroed block is written out now, and the copy is
			 * what the freeing loops below walk.
			 */
1092 			copy = ngeteblk(fs->fs_bsize);
1093 			bcopy((caddr_t)bap, (caddr_t)copy->b_un.b_daddr,
1094 				(uint_t)fs->fs_bsize);
1095 			bzero((caddr_t)&bap[last + 1], zb);
1096 
1097 			TRANS_BUF(ufsvfsp,
1098 				(caddr_t)&bap[last + 1] - (caddr_t)bap,
1099 				zb, bp, DT_ABZERO);
1100 
1101 			UFS_BRWRITE(ufsvfsp, bp);
1102 			bp = copy, bap = bp->b_un.b_daddr;
1103 		}
1104 	} else {
1105 		/* make sure write retries are also cleared */
1106 		bp->b_flags &= ~(B_DELWRI | B_RETRYWRI);
1107 		bp->b_flags |= B_STALE | B_AGE;
1108 	}
1109 
1110 	/*
1111 	 * Recursively free totally unused blocks.
	 *
	 * I_CHEAP is forced on for this pass, so blocks below a fully
	 * released subtree are not zeroed and rewritten.
1112 	 */
1113 	flags |= I_CHEAP;
1114 	for (i = NINDIR(fs) - 1; i > last; i--) {
1115 		nb = bap[i];
1116 		if (nb == 0)
1117 			continue;
1118 		if (level > SINGLE) {
1119 			blocksreleased +=
1120 			    indirtrunc(ip, nb, (daddr_t)-1, level - 1, flags);
1121 			free(ip, nb, (off_t)fs->fs_bsize, flags | I_IBLK);
1122 		} else
1123 			free(ip, nb, (off_t)fs->fs_bsize, flags);
1124 		blocksreleased += nblocks;
1125 	}
	/*
	 * NOTE: I_CHEAP is cleared here even if the caller passed it in.
	 */
1126 	flags &= ~I_CHEAP;
1127 
1128 	/*
1129 	 * Recursively free last partial block.
	 *
	 * The loop above leaves i at the index of the last entry to keep
	 * (the old value of `last'), so bap[i] is the partially kept
	 * subtree.
1130 	 */
1131 	if (level > SINGLE && lastbn >= 0) {
1132 		last = lastbn % factor;
1133 		nb = bap[i];
1134 		if (nb != 0)
1135 			blocksreleased += indirtrunc(ip, nb, last, level - 1,
1136 				flags);
1137 	}
1138 	brelse(bp);
1139 	return (blocksreleased);
1140 }
1141 
1142 /*
1143  * Truncate the inode ip to at most length size.
1144  * Free affected disk blocks -- the blocks of the
1145  * file are removed in reverse order.
1146  *
1147  * N.B.: triple indirect blocks are untested.
1148  */
1149 static int i_genrand = 1234;
1150 int
1151 ufs_itrunc(struct inode *oip, u_offset_t length, int flags, cred_t *cr)
1152 {
1153 	struct fs *fs = oip->i_fs;
1154 	struct ufsvfs *ufsvfsp = oip->i_ufsvfs;
1155 	struct inode *ip;
1156 	daddr_t lastblock;
1157 	off_t bsize;
1158 	int boff;
1159 	daddr_t bn, lastiblock[NIADDR];
1160 	int level;
1161 	long nblocks, blocksreleased = 0;
1162 	int i;
1163 	ushort_t mode;
1164 	struct inode tip;
1165 	int err;
1166 	u_offset_t maxoffset = (ufsvfsp->vfs_lfflags & UFS_LARGEFILES) ?
1167 	    (UFS_MAXOFFSET_T) : (MAXOFF32_T);
1168 
1169 	/*
1170 	 * Shadow inodes do not need to hold the vfs_dqrwlock lock. Most
1171 	 * other uses need the reader lock. opendq() holds the writer lock.
1172 	 */
1173 	ASSERT((oip->i_mode & IFMT) == IFSHAD ||
1174 		RW_LOCK_HELD(&ufsvfsp->vfs_dqrwlock));
1175 	ASSERT(RW_WRITE_HELD(&oip->i_contents));
1176 	/*
1177 	 * We only allow truncation of regular files and directories
1178 	 * to arbitrary lengths here.  In addition, we allow symbolic
1179 	 * links to be truncated only to zero length.  Other inode
1180 	 * types cannot have their length set here.  Disk blocks are
1181 	 * being dealt with - especially device inodes where
1182 	 * ip->i_ordev is actually being stored in ip->i_db[0]!
1183 	 */
1184 	TRANS_INODE(ufsvfsp, oip);
1185 	mode = oip->i_mode & IFMT;
1186 	if (flags & I_FREE) {
1187 		i_genrand *= 16843009;  /* turns into shift and adds */
1188 		i_genrand++;
1189 		oip->i_gen += ((i_genrand + lbolt) & 0xffff) + 1;
1190 		oip->i_flag |= ICHG |IUPD;
1191 		oip->i_seq++;
1192 		if (length == oip->i_size)
1193 			return (0);
1194 		flags |= I_CHEAP;
1195 	}
1196 	if (mode == IFIFO)
1197 		return (0);
1198 	if (mode != IFREG && mode != IFDIR && mode != IFATTRDIR &&
1199 	    !(mode == IFLNK && length == (offset_t)0) && mode != IFSHAD)
1200 		return (EINVAL);
1201 	if (length > maxoffset)
1202 		return (EFBIG);
1203 	if ((mode == IFDIR) || (mode == IFATTRDIR))
1204 		flags |= I_DIR;
1205 	if (mode == IFSHAD)
1206 		flags |= I_SHAD;
1207 	if (oip == ufsvfsp->vfs_qinod)
1208 		flags |= I_QUOTA;
1209 	if (length == oip->i_size) {
1210 		/* update ctime and mtime to please POSIX tests */
1211 		oip->i_flag |= ICHG |IUPD;
1212 		oip->i_seq++;
1213 		if (length == 0) {
1214 			/* nothing to cache so clear the flag */
1215 			oip->i_flag &= ~IFASTSYMLNK;
1216 		}
1217 		return (0);
1218 	}
1219 	/* wipe out fast symlink till next access */
1220 	if (oip->i_flag & IFASTSYMLNK) {
1221 		int j;
1222 
1223 		ASSERT(ITOV(oip)->v_type == VLNK);
1224 
1225 		oip->i_flag &= ~IFASTSYMLNK;
1226 
1227 		for (j = 1; j < NDADDR; j++)
1228 			oip->i_db[j] = 0;
1229 		for (j = 0; j < NIADDR; j++)
1230 			oip->i_ib[j] = 0;
1231 	}
1232 
1233 	boff = (int)blkoff(fs, length);
1234 
1235 	if (length > oip->i_size) {
1236 		/*
1237 		 * Trunc up case.  BMAPALLOC will insure that the right blocks
1238 		 * are allocated.  This includes extending the old frag to a
1239 		 * full block (if needed) in addition to doing any work
1240 		 * needed for allocating the last block.
1241 		 */
1242 		if (boff == 0)
1243 			err = BMAPALLOC(oip, length - 1, (int)fs->fs_bsize, cr);
1244 		else
1245 			err = BMAPALLOC(oip, length - 1, boff, cr);
1246 
1247 		if (err == 0) {
1248 			/*
1249 			 * Save old size and set inode's size now
1250 			 * so that we don't cause too much of the
1251 			 * file to be zero'd and pushed.
1252 			 */
1253 			u_offset_t osize = oip->i_size;
1254 			oip->i_size  = length;
1255 			/*
1256 			 * Make sure we zero out the remaining bytes of
1257 			 * the page in case a mmap scribbled on it. We
1258 			 * can't prevent a mmap from writing beyond EOF
1259 			 * on the last page of a file.
1260 			 *
1261 			 */
1262 			if ((boff = (int)blkoff(fs, osize)) != 0) {
1263 				bsize = (int)lblkno(fs, osize - 1) >= NDADDR ?
1264 				    fs->fs_bsize : fragroundup(fs, boff);
1265 				pvn_vpzero(ITOV(oip), osize,
1266 				    (size_t)(bsize - boff));
1267 			}
1268 			oip->i_flag |= ICHG|IATTCHG;
1269 			oip->i_seq++;
1270 			ITIMES_NOLOCK(oip);
1271 			/*
1272 			 * MAXOFF32_T is old 2GB size limit. If
1273 			 * this operation caused a large file to be
1274 			 * created, turn on the superblock flag
1275 			 * and update the superblock, if the flag
1276 			 * is not already on.
1277 			 */
1278 			if ((length > (u_offset_t)MAXOFF32_T) &&
1279 			    !(fs->fs_flags & FSLARGEFILES)) {
1280 				ASSERT(ufsvfsp->vfs_lfflags & UFS_LARGEFILES);
1281 				mutex_enter(&ufsvfsp->vfs_lock);
1282 				fs->fs_flags |= FSLARGEFILES;
1283 				ufs_sbwrite(ufsvfsp);
1284 				mutex_exit(&ufsvfsp->vfs_lock);
1285 			}
1286 		}
1287 
1288 		return (err);
1289 	}
1290 
1291 	/*
1292 	 * Update the pages of the file.  If the file is not being
1293 	 * truncated to a block boundary, the contents of the
1294 	 * pages following the end of the file must be zero'ed
1295 	 * in case it ever become accessible again because
1296 	 * of subsequent file growth.
1297 	 */
1298 	if (boff == 0) {
1299 		(void) pvn_vplist_dirty(ITOV(oip), length, ufs_putapage,
1300 		    B_INVAL | B_TRUNC, CRED());
1301 	} else {
1302 		/*
1303 		 * Make sure that the last block is properly allocated.
1304 		 * We only really have to do this if the last block is
1305 		 * actually allocated since ufs_bmap will now handle the case
1306 		 * of an fragment which has no block allocated.  Just to
1307 		 * be sure, we do it now independent of current allocation.
1308 		 */
1309 		err = BMAPALLOC(oip, length - 1, boff, cr);
1310 		if (err)
1311 			return (err);
1312 
1313 		/*
1314 		 * BMAPALLOC will call bmap_write which defers i_seq
1315 		 * processing.  If the timestamps were changed, update
1316 		 * i_seq before rdip drops i_contents or syncs the inode.
1317 		 */
1318 		if (oip->i_flag & (ICHG|IUPD))
1319 			oip->i_seq++;
1320 
1321 		/*
1322 		 * BugId 4069932
1323 		 * Make sure that the relevant partial page appears in
1324 		 * the v_pages list, so that pvn_vpzero() will do its
1325 		 * job.  Since doing this correctly requires everything
1326 		 * in rdip() except for the uiomove(), it's easier and
1327 		 * safer to do the uiomove() rather than duplicate the
1328 		 * rest of rdip() here.
1329 		 *
1330 		 * To get here, we know that length indicates a byte
1331 		 * that is not the first byte of a block.  (length - 1)
1332 		 * is the last actual byte known to exist.  Deduction
1333 		 * shows it is in the same block as byte (length).
1334 		 * Thus, this rdip() invocation should always succeed
1335 		 * except in the face of i/o errors, and give us the
1336 		 * block we care about.
1337 		 *
1338 		 * rdip() makes the same locking assertions and
1339 		 * assumptions as we do.  We do not acquire any locks
1340 		 * before calling it, so we have not changed the locking
1341 		 * situation.  Finally, there do not appear to be any
1342 		 * paths whereby rdip() ends up invoking us again.
1343 		 * Thus, infinite recursion is avoided.
1344 		 */
1345 		{
1346 			uio_t uio;
1347 			iovec_t iov[1];
1348 			char buffer;
1349 
1350 			uio.uio_iov = iov;
1351 			uio.uio_iovcnt = 1;
1352 			uio.uio_loffset = length - 1;
1353 			uio.uio_resid = 1;
1354 			uio.uio_segflg = UIO_SYSSPACE;
1355 			uio.uio_extflg = UIO_COPY_CACHED;
1356 
1357 			iov[0].iov_base = &buffer;
1358 			iov[0].iov_len = 1;
1359 
1360 			err = rdip(oip, &uio, UIO_READ, NULL);
1361 			if (err)
1362 				return (err);
1363 		}
1364 
1365 		bsize = (int)lblkno(fs, length - 1) >= NDADDR ?
1366 		    fs->fs_bsize : fragroundup(fs, boff);
1367 		pvn_vpzero(ITOV(oip), length, (size_t)(bsize - boff));
1368 		/*
1369 		 * Ensure full fs block is marked as dirty.
1370 		 */
1371 		(void) pvn_vplist_dirty(ITOV(oip), length + (bsize - boff),
1372 		    ufs_putapage, B_INVAL | B_TRUNC, CRED());
1373 	}
1374 
1375 	/*
1376 	 * Calculate index into inode's block list of
1377 	 * last direct and indirect blocks (if any)
1378 	 * which we want to keep.  Lastblock is -1 when
1379 	 * the file is truncated to 0.
1380 	 */
1381 	lastblock = lblkno(fs, length + fs->fs_bsize - 1) - 1;
1382 	lastiblock[SINGLE] = lastblock - NDADDR;
1383 	lastiblock[DOUBLE] = lastiblock[SINGLE] - NINDIR(fs);
1384 	lastiblock[TRIPLE] = lastiblock[DOUBLE] - NINDIR(fs) * NINDIR(fs);
1385 	nblocks = btodb(fs->fs_bsize);
1386 
1387 	/*
1388 	 * Update file and block pointers
1389 	 * on disk before we start freeing blocks.
1390 	 * If we crash before free'ing blocks below,
1391 	 * the blocks will be returned to the free list.
1392 	 * lastiblock values are also normalized to -1
1393 	 * for calls to indirtrunc below.
1394 	 */
1395 	tip = *oip;			/* structure copy */
1396 	ip = &tip;
1397 
1398 	for (level = TRIPLE; level >= SINGLE; level--)
1399 		if (lastiblock[level] < 0) {
1400 			oip->i_ib[level] = 0;
1401 			lastiblock[level] = -1;
1402 		}
1403 	for (i = NDADDR - 1; i > lastblock; i--) {
1404 		oip->i_db[i] = 0;
1405 		flags |= I_CHEAP;
1406 	}
1407 	oip->i_size = length;
1408 	oip->i_flag |= ICHG|IUPD|IATTCHG;
1409 	oip->i_seq++;
1410 	if (!TRANS_ISTRANS(ufsvfsp))
1411 		ufs_iupdat(oip, I_SYNC);	/* do sync inode update */
1412 
1413 	/*
1414 	 * Indirect blocks first.
1415 	 */
1416 	for (level = TRIPLE; level >= SINGLE; level--) {
1417 		bn = ip->i_ib[level];
1418 		if (bn != 0) {
1419 			blocksreleased +=
1420 			    indirtrunc(ip, bn, lastiblock[level], level, flags);
1421 			if (lastiblock[level] < 0) {
1422 				ip->i_ib[level] = 0;
1423 				free(ip, bn, (off_t)fs->fs_bsize,
1424 					flags | I_IBLK);
1425 				blocksreleased += nblocks;
1426 			}
1427 		}
1428 		if (lastiblock[level] >= 0)
1429 			goto done;
1430 	}
1431 
1432 	/*
1433 	 * All whole direct blocks or frags.
1434 	 */
1435 	for (i = NDADDR - 1; i > lastblock; i--) {
1436 		bn = ip->i_db[i];
1437 		if (bn == 0)
1438 			continue;
1439 		ip->i_db[i] = 0;
1440 		bsize = (off_t)blksize(fs, ip, i);
1441 		free(ip, bn, bsize, flags);
1442 		blocksreleased += btodb(bsize);
1443 	}
1444 	if (lastblock < 0)
1445 		goto done;
1446 
1447 	/*
1448 	 * Finally, look for a change in size of the
1449 	 * last direct block; release any frags.
1450 	 */
1451 	bn = ip->i_db[lastblock];
1452 	if (bn != 0) {
1453 		off_t oldspace, newspace;
1454 
1455 		/*
1456 		 * Calculate amount of space we're giving
1457 		 * back as old block size minus new block size.
1458 		 */
1459 		oldspace = blksize(fs, ip, lastblock);
1460 		UFS_SET_ISIZE(length, ip);
1461 		newspace = blksize(fs, ip, lastblock);
1462 		if (newspace == 0) {
1463 			err = ufs_fault(ITOV(ip), "ufs_itrunc: newspace == 0");
1464 			return (err);
1465 		}
1466 		if (oldspace - newspace > 0) {
1467 			/*
1468 			 * Block number of space to be free'd is
1469 			 * the old block # plus the number of frags
1470 			 * required for the storage we're keeping.
1471 			 */
1472 			bn += numfrags(fs, newspace);
1473 			free(ip, bn, oldspace - newspace, flags);
1474 			blocksreleased += btodb(oldspace - newspace);
1475 		}
1476 	}
1477 done:
1478 /* BEGIN PARANOIA */
1479 	for (level = SINGLE; level <= TRIPLE; level++)
1480 		if (ip->i_ib[level] != oip->i_ib[level]) {
1481 			err = ufs_fault(ITOV(ip), "ufs_itrunc: indirect block");
1482 			return (err);
1483 		}
1484 
1485 	for (i = 0; i < NDADDR; i++)
1486 		if (ip->i_db[i] != oip->i_db[i]) {
1487 			err = ufs_fault(ITOV(ip), "ufs_itrunc: direct block");
1488 			return (err);
1489 		}
1490 /* END PARANOIA */
1491 	oip->i_blocks -= blocksreleased;
1492 
1493 	if (oip->i_blocks < 0) {		/* sanity */
1494 		cmn_err(CE_NOTE,
1495 		    "ufs_itrunc: %s/%d new size = %lld, blocks = %d\n",
1496 		    fs->fs_fsmnt, (int)oip->i_number, oip->i_size,
1497 		    (int)oip->i_blocks);
1498 		oip->i_blocks = 0;
1499 	}
1500 	oip->i_flag |= ICHG|IATTCHG;
1501 	oip->i_seq++;
1502 	/* blocksreleased is >= zero, so this can not fail */
1503 	(void) chkdq(oip, -blocksreleased, 0, cr, (char **)NULL,
1504 		(size_t *)NULL);
1505 	return (0);
1506 }
1507 
1508 /*
1509  * Check mode permission on inode.  Mode is READ, WRITE or EXEC.
1510  * In the case of WRITE, the read-only status of the file system
1511  * is checked.  Depending on the calling user, the appropriate
1512  * mode bits are selected; privileges to override missing permission
1513  * bits are checked through secpolicy_vnode_access().
1514  */
1515 int
1516 ufs_iaccess(void *vip, int mode, struct cred *cr)
1517 {
1518 	struct inode *ip = vip;
1519 	int shift = 0;
1520 
1521 	if (mode & IWRITE) {
1522 		/*
1523 		 * Disallow write attempts on read-only
1524 		 * file systems, unless the file is a block
1525 		 * or character device or a FIFO.
1526 		 */
1527 		if (ip->i_fs->fs_ronly != 0) {
1528 			if ((ip->i_mode & IFMT) != IFCHR &&
1529 			    (ip->i_mode & IFMT) != IFBLK &&
1530 			    (ip->i_mode & IFMT) != IFIFO) {
1531 				return (EROFS);
1532 			}
1533 		}
1534 	}
1535 	/*
1536 	 * If there is a shadow inode check for the presence of an acl,
1537 	 * if the acl is there use the ufs_acl_access routine to check
1538 	 * the acl
1539 	 */
1540 	if (ip->i_ufs_acl && ip->i_ufs_acl->aowner)
1541 		return (ufs_acl_access(ip, mode, cr));
1542 
1543 	/*
1544 	 * Access check is based on only
1545 	 * one of owner, group, public.
1546 	 * If not owner, then check group.
1547 	 * If not a member of the group, then
1548 	 * check public access.
1549 	 */
1550 	if (crgetuid(cr) != ip->i_uid) {
1551 		shift += 3;
1552 		if (!groupmember((uid_t)ip->i_gid, cr))
1553 			shift += 3;
1554 	}
1555 
1556 	mode &= ~(ip->i_mode << shift);
1557 
1558 	if (mode == 0)
1559 		return (0);
1560 
1561 	/* test missing privilege bits */
1562 	return (secpolicy_vnode_access(cr, ITOV(ip), ip->i_uid, mode));
1563 }
1564 
1565 /*
1566  * if necessary, remove an inode from the free list
1567  *	i_contents is held except at unmount
1568  *
1569  * Return 1 if the inode is taken off of the ufs_idle_q,
1570  * and the caller is expected to call VN_RELE.
1571  *
1572  * Return 0 otherwise.
1573  */
1574 int
1575 ufs_rmidle(struct inode *ip)
1576 {
1577 	int rval = 0;
1578 
1579 	mutex_enter(&ip->i_tlock);
1580 	if ((ip->i_flag & IREF) == 0) {
1581 		mutex_enter(&ufs_idle_q.uq_mutex);
1582 		ip->i_freef->i_freeb = ip->i_freeb;
1583 		ip->i_freeb->i_freef = ip->i_freef;
1584 		ip->i_freef = ip;
1585 		ip->i_freeb = ip;
1586 		ip->i_flag |= IREF;
1587 		ufs_idle_q.uq_ne--;
1588 		if (ip->i_flag & IJUNKIQ) {
1589 			ufs_njunk_iq--;
1590 			ip->i_flag &= ~IJUNKIQ;
1591 		} else {
1592 			ufs_nuseful_iq--;
1593 		}
1594 		mutex_exit(&ufs_idle_q.uq_mutex);
1595 		rval = 1;
1596 	}
1597 	mutex_exit(&ip->i_tlock);
1598 	return (rval);
1599 }
1600 
1601 /*
1602  * scan the hash of inodes and call func with the inode locked
1603  */
1604 int
1605 ufs_scan_inodes(int rwtry, int (*func)(struct inode *, void *), void *arg,
1606 		struct ufsvfs *ufsvfsp)
1607 {
1608 	struct inode		*ip;		/* current inode */
1609 	struct inode		*lip = NULL;	/* last/previous inode */
1610 	union ihead		*ih;		/* current hash chain */
1611 	int			error, i;
1612 	int			saverror = 0;
1613 	int			lip_held;	/* lip needs a VN_RELE() */
1614 
1615 	/*
1616 	 * If ufsvfsp is NULL, then our caller should be holding
1617 	 * ufs_scan_lock to avoid conflicts between ufs_unmount() and
1618 	 * ufs_update().  Otherwise, to avoid false-positives in
1619 	 * ufs_unmount()'s v_count-based EBUSY check, we only hold
1620 	 * those inodes that are in the file system our caller cares
1621 	 * about.
1622 	 *
1623 	 * We know that ip is a valid inode in the hash chain (and thus
1624 	 * we can trust i_ufsvfs) because the inode we chained from
1625 	 * (lip) is still in the hash chain.  This is true because either:
1626 	 *
1627 	 * 1. We did not drop the hash chain lock since the last
1628 	 *    iteration (because we were not interested in the last inode),
1629 	 * or
1630 	 * 2. We maintained a hold on the last inode while we
1631 	 *    we were processing it, so it could not be removed
1632 	 *    from the hash chain.
1633 	 *
1634 	 * The whole reason we're dropping and re-grabbing the chain
1635 	 * lock on every inode is so that we don't present a major
1636 	 * choke point on throughput, particularly when we've been
1637 	 * called on behalf of fsflush.
1638 	 */
1639 
1640 	for (i = 0, ih = ihead; i < inohsz; i++, ih++) {
1641 		mutex_enter(&ih_lock[i]);
1642 		for (ip = ih->ih_chain[0], lip_held = 0;
1643 		    ip != (struct inode *)ih;
1644 		    ip = lip->i_forw) {
1645 
1646 			ins.in_scan.value.ul++;
1647 
1648 			/*
1649 			 * Undo the previous iteration's VN_HOLD(), but
1650 			 * only if one was done.
1651 			 */
1652 			if (lip_held)
1653 				VN_RELE(ITOV(lip));
1654 
1655 			lip = ip;
1656 			if (ufsvfsp != NULL && ip->i_ufsvfs != ufsvfsp) {
1657 				/*
1658 				 * We're not processing all inodes, and
1659 				 * this inode is not in the filesystem of
1660 				 * interest, so skip it.  No need to do a
1661 				 * VN_HOLD() since we're not dropping the
1662 				 * hash chain lock until after we've
1663 				 * done the i_forw traversal above.
1664 				 */
1665 				lip_held = 0;
1666 				continue;
1667 			}
1668 			VN_HOLD(ITOV(ip));
1669 			lip_held = 1;
1670 			mutex_exit(&ih_lock[i]);
1671 
1672 			/*
1673 			 * Acquire the contents lock as writer to make
1674 			 * sure that the inode has been initialized in
1675 			 * the cache or removed from the idle list by
1676 			 * ufs_iget().  This works because ufs_iget()
1677 			 * acquires the contents lock before putting
1678 			 * the inode into the cache.  If we can lock
1679 			 * it, then he's done with it.
1680 			 */
1681 
1682 			if (rwtry) {
1683 				if (!rw_tryenter(&ip->i_contents, RW_WRITER)) {
1684 					mutex_enter(&ih_lock[i]);
1685 					continue;
1686 				}
1687 			} else {
1688 				rw_enter(&ip->i_contents, RW_WRITER);
1689 			}
1690 
1691 			rw_exit(&ip->i_contents);
1692 
1693 			/*
1694 			 * ISTALE means the inode couldn't be read
1695 			 *
1696 			 * We don't have to hold the i_contents lock
1697 			 * for this check for a couple of
1698 			 * reasons. First, if ISTALE is set then the
1699 			 * flag cannot be cleared until the inode is
1700 			 * removed from the cache and that cannot
1701 			 * happen until after we VN_RELE() it.
1702 			 * Second, if ISTALE is not set, then the
1703 			 * inode is in the cache and does not need to
1704 			 * be read from disk so ISTALE cannot be set
1705 			 * while we are not looking.
1706 			 */
1707 			if ((ip->i_flag & ISTALE) == 0) {
1708 				if ((error = (*func)(ip, arg)) != 0)
1709 					saverror = error;
1710 			}
1711 
1712 			mutex_enter(&ih_lock[i]);
1713 		}
1714 		if (lip_held)
1715 			VN_RELE(ITOV(lip));
1716 		mutex_exit(&ih_lock[i]);
1717 	}
1718 	return (saverror);
1719 }
1720 
1721 /*
1722  * Mark inode with the current time, plus a unique increment.
1723  *
1724  * Since we only keep 32-bit time on disk, if UFS is still alive
1725  * beyond 2038, filesystem times will simply stick at the last
1726  * possible second of 32-bit time. Not ideal, but probably better
1727  * than going into the remote past, or confusing applications with
1728  * negative time.
1729  */
1730 void
1731 ufs_imark(struct inode *ip)
1732 {
1733 	timestruc_t now;
1734 	int32_t usec, nsec;
1735 
1736 	/*
1737 	 * The update of i_seq may have been deferred, increase i_seq here
1738 	 * to make sure it is in sync with the timestamps.
1739 	 */
1740 	if (ip->i_flag & ISEQ) {
1741 		ASSERT(ip->i_flag & (IUPD|ICHG));
1742 		ip->i_seq++;
1743 		ip->i_flag &= ~ISEQ;
1744 	}
1745 
1746 	gethrestime(&now);
1747 
1748 	/*
1749 	 * Fast algorithm to convert nsec to usec -- see hrt2ts()
1750 	 * in common/os/timers.c for a full description.
1751 	 */
1752 	nsec = now.tv_nsec;
1753 	usec = nsec + (nsec >> 2);
1754 	usec = nsec + (usec >> 1);
1755 	usec = nsec + (usec >> 2);
1756 	usec = nsec + (usec >> 4);
1757 	usec = nsec - (usec >> 3);
1758 	usec = nsec + (usec >> 2);
1759 	usec = nsec + (usec >> 3);
1760 	usec = nsec + (usec >> 4);
1761 	usec = nsec + (usec >> 1);
1762 	usec = nsec + (usec >> 6);
1763 	usec = usec >> 10;
1764 
1765 	mutex_enter(&ufs_iuniqtime_lock);
1766 	if (now.tv_sec > (time_t)iuniqtime.tv_sec ||
1767 	    usec > iuniqtime.tv_usec) {
1768 		if (now.tv_sec < TIME32_MAX) {
1769 			iuniqtime.tv_sec = (time32_t)now.tv_sec;
1770 			iuniqtime.tv_usec = usec;
1771 		}
1772 	} else {
1773 		if (iuniqtime.tv_sec < TIME32_MAX) {
1774 			iuniqtime.tv_usec++;
1775 			/* Check for usec overflow */
1776 			if (iuniqtime.tv_usec >= MICROSEC) {
1777 				iuniqtime.tv_sec++;
1778 				iuniqtime.tv_usec = 0;
1779 			}
1780 		}
1781 	}
1782 
1783 	if ((ip->i_flag & IACC) && !(ip->i_ufsvfs->vfs_noatime)) {
1784 		ip->i_atime = iuniqtime;
1785 	}
1786 	if (ip->i_flag & IUPD) {
1787 		ip->i_mtime = iuniqtime;
1788 		ip->i_flag |= IMODTIME;
1789 	}
1790 	if (ip->i_flag & ICHG) {
1791 		ip->i_diroff = 0;
1792 		ip->i_ctime = iuniqtime;
1793 	}
1794 	mutex_exit(&ufs_iuniqtime_lock);
1795 }
1796 
1797 /*
1798  * Update timestamps in inode.
1799  */
1800 void
1801 ufs_itimes_nolock(struct inode *ip)
1802 {
1803 
1804 	/*
1805 	 * if noatime is set and the inode access time is the only field that
1806 	 * must be changed, exit immediately.
1807 	 */
1808 	if (((ip->i_flag & (IUPD|IACC|ICHG)) == IACC) &&
1809 	    (ip->i_ufsvfs->vfs_noatime)) {
1810 		return;
1811 	}
1812 
1813 	if (ip->i_flag & (IUPD|IACC|ICHG)) {
1814 		if (ip->i_flag & ICHG)
1815 			ip->i_flag |= IMOD;
1816 		else
1817 			ip->i_flag |= IMODACC;
1818 		ufs_imark(ip);
1819 		ip->i_flag &= ~(IACC|IUPD|ICHG);
1820 	}
1821 }
1822