xref: /titanic_51/usr/src/uts/common/fs/ufs/ufs_inode.c (revision c39526b769298791ff5b0b6c5e761f49aabaeb4e)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
27 /*	  All Rights Reserved  	*/
28 
29 /*
30  * University Copyright- Copyright (c) 1982, 1986, 1988
31  * The Regents of the University of California
32  * All Rights Reserved
33  *
34  * University Acknowledgment- Portions of this document are derived from
35  * software developed by the University of California, Berkeley, and its
36  * contributors.
37  */
38 
39 #include <sys/types.h>
40 #include <sys/t_lock.h>
41 #include <sys/param.h>
42 #include <sys/systm.h>
43 #include <sys/uio.h>
44 #include <sys/bitmap.h>
45 #include <sys/signal.h>
46 #include <sys/cred.h>
47 #include <sys/user.h>
48 #include <sys/vfs.h>
49 #include <sys/stat.h>
50 #include <sys/vnode.h>
51 #include <sys/buf.h>
52 #include <sys/proc.h>
53 #include <sys/disp.h>
54 #include <sys/dnlc.h>
55 #include <sys/mode.h>
56 #include <sys/cmn_err.h>
57 #include <sys/kstat.h>
58 #include <sys/acl.h>
59 #include <sys/var.h>
60 #include <sys/fs/ufs_inode.h>
61 #include <sys/fs/ufs_fs.h>
62 #include <sys/fs/ufs_trans.h>
63 #include <sys/fs/ufs_acl.h>
64 #include <sys/fs/ufs_bio.h>
65 #include <sys/fs/ufs_quota.h>
66 #include <sys/fs/ufs_log.h>
67 #include <vm/hat.h>
68 #include <vm/as.h>
69 #include <vm/pvn.h>
70 #include <vm/seg.h>
71 #include <sys/swap.h>
72 #include <sys/cpuvar.h>
73 #include <sys/sysmacros.h>
74 #include <sys/errno.h>
75 #include <sys/kmem.h>
76 #include <sys/debug.h>
77 #include <fs/fs_subr.h>
78 #include <sys/policy.h>
79 
80 struct kmem_cache *inode_cache;		/* cache of free inodes */
81 
82 /* UFS Inode Cache Stats -- Not protected */
83 struct	instats ins = {
84 	{ "size",		KSTAT_DATA_ULONG },
85 	{ "maxsize",		KSTAT_DATA_ULONG },
86 	{ "hits",		KSTAT_DATA_ULONG },
87 	{ "misses",		KSTAT_DATA_ULONG },
88 	{ "kmem allocs",	KSTAT_DATA_ULONG },
89 	{ "kmem frees",		KSTAT_DATA_ULONG },
90 	{ "maxsize reached",	KSTAT_DATA_ULONG },
91 	{ "puts at frontlist",	KSTAT_DATA_ULONG },
92 	{ "puts at backlist",	KSTAT_DATA_ULONG },
93 	{ "queues to free",	KSTAT_DATA_ULONG },
94 	{ "scans",		KSTAT_DATA_ULONG },
95 	{ "thread idles",	KSTAT_DATA_ULONG },
96 	{ "lookup idles",	KSTAT_DATA_ULONG },
97 	{ "vget idles",		KSTAT_DATA_ULONG },
98 	{ "cache allocs",	KSTAT_DATA_ULONG },
99 	{ "cache frees",	KSTAT_DATA_ULONG },
100 	{ "pushes at close",	KSTAT_DATA_ULONG }
101 };
102 
103 /* kstat data */
104 static kstat_t		*ufs_inode_kstat = NULL;
105 
106 union ihead *ihead;	/* inode LRU cache, Chris Maltby */
107 kmutex_t *ih_lock;	/* protect inode cache hash table */
108 static int ino_hashlen = 4;	/* desired average hash chain length */
109 int inohsz;		/* number of buckets in the hash table */
110 
111 kmutex_t	ufs_scan_lock;	/* stop racing multiple ufs_scan_inodes() */
112 kmutex_t	ufs_iuniqtime_lock; /* protect iuniqtime */
113 kmutex_t	ufsvfs_mutex;
114 struct ufsvfs	*oldufsvfslist, *ufsvfslist;
115 
116 /*
117  * time to wait after ufsvfsp->vfs_iotstamp before declaring that no
118  * I/Os are going on.
119  */
120 clock_t	ufs_iowait;
121 
122 /*
123  * the threads that process idle inodes and free (deleted) inodes
124  * have high water marks that are set in ufsinit().
125  * These values but can be no less then the minimum shown below
126  */
127 int	ufs_idle_max;	/* # of allowable idle inodes */
128 ulong_t	ufs_inode_max;	/* hard limit of allowable idle inodes */
129 #define	UFS_IDLE_MAX	(16)	/* min # of allowable idle inodes */
130 
131 /*
132  * Tunables for ufs write throttling.
133  * These are validated in ufs_iinit() since improper settings
134  * can lead to filesystem hangs.
135  */
136 #define	UFS_HW_DEFAULT	(16 * 1024 * 1024)
137 #define	UFS_LW_DEFAULT	(8 * 1024 * 1024)
138 int	ufs_HW = UFS_HW_DEFAULT;
139 int	ufs_LW = UFS_LW_DEFAULT;
140 
141 static void ihinit(void);
142 extern int hash2ints(int, int);
143 
144 static int ufs_iget_internal(struct vfs *, ino_t, struct inode **,
145     struct cred *, int);
146 
147 /* ARGSUSED */
148 static int
149 ufs_inode_kstat_update(kstat_t *ksp, int rw)
150 {
151 	if (rw == KSTAT_WRITE)
152 		return (EACCES);
153 
154 	ins.in_malloc.value.ul	= (ulong_t)kmem_cache_stat(inode_cache,
155 	    "slab_alloc");
156 	ins.in_mfree.value.ul	= (ulong_t)kmem_cache_stat(inode_cache,
157 	    "slab_free");
158 	ins.in_kcalloc.value.ul	= (ulong_t)kmem_cache_stat(inode_cache,
159 	    "alloc");
160 	ins.in_kcfree.value.ul	= (ulong_t)kmem_cache_stat(inode_cache,
161 	    "free");
162 	ins.in_size.value.ul	= (ulong_t)kmem_cache_stat(inode_cache,
163 	    "buf_inuse");
164 	ins.in_maxreached.value.ul = (ulong_t)kmem_cache_stat(inode_cache,
165 	    "buf_max");
166 	ins.in_misses.value.ul = ins.in_kcalloc.value.ul;
167 
168 	return (0);
169 }
170 
171 void
172 ufs_iinit(void)
173 {
174 	/*
175 	 * Validate that ufs_HW > ufs_LW.
176 	 * The default values for these two tunables have been increased.
177 	 * There is now a range of values for ufs_HW that used to be
178 	 * legal on previous Solaris versions but no longer is now.
179 	 * Upgrading a machine which has an /etc/system setting for ufs_HW
180 	 * from that range can lead to filesystem hangs unless the values
181 	 * are checked here.
182 	 */
183 	if (ufs_HW <= ufs_LW) {
184 		cmn_err(CE_WARN,
185 		    "ufs_HW (%d) <= ufs_LW (%d). Check /etc/system.",
186 		    ufs_HW, ufs_LW);
187 		ufs_LW = UFS_LW_DEFAULT;
188 		ufs_HW = UFS_HW_DEFAULT;
189 		cmn_err(CE_CONT, "using defaults, ufs_HW = %d, ufs_LW = %d\n",
190 		    ufs_HW, ufs_LW);
191 	}
192 
193 	/*
194 	 * Adjust the tunable `ufs_ninode' to a reasonable value
195 	 */
196 	if (ufs_ninode <= 0)
197 		ufs_ninode = ncsize;
198 	if (ufs_inode_max == 0)
199 		ufs_inode_max =
200 		    (ulong_t)((kmem_maxavail() >> 2) / sizeof (struct inode));
201 	if (ufs_ninode > ufs_inode_max || (ufs_ninode == 0 && ncsize == 0)) {
202 		cmn_err(CE_NOTE, "setting ufs_ninode to max value of %ld",
203 		    ufs_inode_max);
204 		ufs_ninode = ufs_inode_max;
205 	}
206 	/*
207 	 * Wait till third call of ufs_update to declare that no I/Os are
208 	 * going on. This allows deferred access times to be flushed to disk.
209 	 */
210 	ufs_iowait = v.v_autoup * hz * 2;
211 
212 	/*
213 	 * idle thread runs when 25% of ufs_ninode entries are on the queue
214 	 */
215 	if (ufs_idle_max == 0)
216 		ufs_idle_max = ufs_ninode >> 2;
217 	if (ufs_idle_max < UFS_IDLE_MAX)
218 		ufs_idle_max = UFS_IDLE_MAX;
219 	if (ufs_idle_max > ufs_ninode)
220 		ufs_idle_max = ufs_ninode;
221 	/*
222 	 * This is really a misnomer, it is ufs_queue_init
223 	 */
224 	ufs_thread_init(&ufs_idle_q, ufs_idle_max);
225 	ufs_thread_start(&ufs_idle_q, ufs_thread_idle, NULL);
226 
227 	/*
228 	 * global hlock thread
229 	 */
230 	ufs_thread_init(&ufs_hlock, 1);
231 	ufs_thread_start(&ufs_hlock, ufs_thread_hlock, NULL);
232 
233 	ihinit();
234 	qtinit();
235 	ins.in_maxsize.value.ul = ufs_ninode;
236 	if ((ufs_inode_kstat = kstat_create("ufs", 0, "inode_cache", "ufs",
237 	    KSTAT_TYPE_NAMED, sizeof (ins) / sizeof (kstat_named_t),
238 	    KSTAT_FLAG_VIRTUAL)) != NULL) {
239 		ufs_inode_kstat->ks_data = (void *)&ins;
240 		ufs_inode_kstat->ks_update = ufs_inode_kstat_update;
241 		kstat_install(ufs_inode_kstat);
242 	}
243 	ufsfx_init();		/* fix-on-panic initialization */
244 	si_cache_init();
245 	ufs_directio_init();
246 	lufs_init();
247 	mutex_init(&ufs_iuniqtime_lock, NULL, MUTEX_DEFAULT, NULL);
248 }
249 
250 /* ARGSUSED */
251 static int
252 ufs_inode_cache_constructor(void *buf, void *cdrarg, int kmflags)
253 {
254 	struct inode *ip = buf;
255 	struct vnode *vp;
256 
257 	vp = ip->i_vnode = vn_alloc(kmflags);
258 	if (vp == NULL) {
259 		return (-1);
260 	}
261 	vn_setops(vp, ufs_vnodeops);
262 	vp->v_data = ip;
263 
264 	rw_init(&ip->i_rwlock, NULL, RW_DEFAULT, NULL);
265 	rw_init(&ip->i_contents, NULL, RW_DEFAULT, NULL);
266 	mutex_init(&ip->i_tlock, NULL, MUTEX_DEFAULT, NULL);
267 	dnlc_dir_init(&ip->i_danchor);
268 
269 	cv_init(&ip->i_wrcv, NULL, CV_DRIVER, NULL);
270 
271 	return (0);
272 }
273 
274 /* ARGSUSED */
275 static void
276 ufs_inode_cache_destructor(void *buf, void *cdrarg)
277 {
278 	struct inode *ip = buf;
279 	struct vnode *vp;
280 
281 	vp = ITOV(ip);
282 
283 	rw_destroy(&ip->i_rwlock);
284 	rw_destroy(&ip->i_contents);
285 	mutex_destroy(&ip->i_tlock);
286 	if (vp->v_type == VDIR) {
287 		dnlc_dir_fini(&ip->i_danchor);
288 	}
289 
290 	cv_destroy(&ip->i_wrcv);
291 
292 	vn_free(vp);
293 }
294 
295 /*
296  * Initialize hash links for inodes
297  * and build inode free list.
298  */
299 void
300 ihinit(void)
301 {
302 	int i;
303 	union	ihead *ih = ihead;
304 
305 	mutex_init(&ufs_scan_lock, NULL, MUTEX_DEFAULT, NULL);
306 
307 	inohsz = 1 << highbit(ufs_ninode / ino_hashlen);
308 	ihead = kmem_zalloc(inohsz * sizeof (union ihead), KM_SLEEP);
309 	ih_lock = kmem_zalloc(inohsz * sizeof (kmutex_t), KM_SLEEP);
310 
311 	for (i = 0, ih = ihead; i < inohsz; i++,  ih++) {
312 		ih->ih_head[0] = ih;
313 		ih->ih_head[1] = ih;
314 		mutex_init(&ih_lock[i], NULL, MUTEX_DEFAULT, NULL);
315 	}
316 	inode_cache = kmem_cache_create("ufs_inode_cache",
317 	    sizeof (struct inode), 0, ufs_inode_cache_constructor,
318 	    ufs_inode_cache_destructor, ufs_inode_cache_reclaim,
319 	    NULL, NULL, 0);
320 }
321 
322 /*
323  * Free an inode structure
324  */
325 void
326 ufs_free_inode(struct inode *ip)
327 {
328 	vn_invalid(ITOV(ip));
329 	kmem_cache_free(inode_cache, ip);
330 }
331 
332 /*
333  * Allocate an inode structure
334  */
335 struct inode *
336 ufs_alloc_inode(ufsvfs_t *ufsvfsp, ino_t ino)
337 {
338 	struct inode *ip;
339 	vnode_t *vp;
340 
341 	ip = kmem_cache_alloc(inode_cache, KM_SLEEP);
342 	/*
343 	 * at this point we have a newly allocated inode
344 	 */
345 	ip->i_freef = ip;
346 	ip->i_freeb = ip;
347 	ip->i_flag = IREF;
348 	ip->i_seq = 0xFF;	/* Unique initial value */
349 	ip->i_dev = ufsvfsp->vfs_dev;
350 	ip->i_ufsvfs = ufsvfsp;
351 	ip->i_devvp = ufsvfsp->vfs_devvp;
352 	ip->i_number = ino;
353 	ip->i_diroff = 0;
354 	ip->i_nextr = 0;
355 	ip->i_map = NULL;
356 	ip->i_rdev = 0;
357 	ip->i_writes = 0;
358 	ip->i_mode = 0;
359 	ip->i_delaylen = 0;
360 	ip->i_delayoff = 0;
361 	ip->i_nextrio = 0;
362 	ip->i_ufs_acl = NULL;
363 	ip->i_cflags = 0;
364 	ip->i_mapcnt = 0;
365 	ip->i_dquot = NULL;
366 	ip->i_cachedir = CD_ENABLED;
367 	ip->i_writer = NULL;
368 
369 	/*
370 	 * the vnode for this inode was allocated by the constructor
371 	 */
372 	vp = ITOV(ip);
373 	vn_reinit(vp);
374 	if (ino == (ino_t)UFSROOTINO)
375 		vp->v_flag = VROOT;
376 	vp->v_vfsp = ufsvfsp->vfs_vfs;
377 	vn_exists(vp);
378 	return (ip);
379 }
380 
381 /*
382  * Look up an inode by device, inumber.  If it is in core (in the
383  * inode structure), honor the locking protocol.  If it is not in
384  * core, read it in from the specified device after freeing any pages.
385  * In all cases, a pointer to a VN_HELD inode structure is returned.
386  */
387 int
388 ufs_iget(struct vfs *vfsp, ino_t ino, struct inode **ipp, struct cred *cr)
389 {
390 	return (ufs_iget_internal(vfsp, ino, ipp, cr, 0));
391 }
392 
393 /*
394  * A version of ufs_iget which returns only allocated, linked inodes.
395  * This is appropriate for any callers who do not expect a free inode.
396  */
397 int
398 ufs_iget_alloced(struct vfs *vfsp, ino_t ino, struct inode **ipp,
399     struct cred *cr)
400 {
401 	return (ufs_iget_internal(vfsp, ino, ipp, cr, 1));
402 }
403 
404 /*
405  * Set vnode attributes based on v_type, this should be called whenever
406  * an inode's i_mode is changed.
407  */
408 void
409 ufs_reset_vnode(vnode_t *vp)
410 {
411 	/*
412 	 * an old DBE hack
413 	 */
414 	if ((VTOI(vp)->i_mode & (ISVTX | IEXEC | IFDIR)) == ISVTX)
415 		vp->v_flag |= VSWAPLIKE;
416 	else
417 		vp->v_flag &= ~VSWAPLIKE;
418 
419 	/*
420 	 * if not swap like and it's just a regular file, we want
421 	 * to maintain the vnode's pages sorted by clean/modified
422 	 * for faster sync'ing to disk
423 	 */
424 	if (vp->v_type == VREG)
425 		vp->v_flag |= VMODSORT;
426 	else
427 		vp->v_flag &= ~VMODSORT;
428 
429 	/*
430 	 * Is this an attribute hidden dir?
431 	 */
432 	if ((VTOI(vp)->i_mode & IFMT) == IFATTRDIR)
433 		vp->v_flag |= V_XATTRDIR;
434 	else
435 		vp->v_flag &= ~V_XATTRDIR;
436 }
437 
438 /*
439  * Shared implementation of ufs_iget and ufs_iget_alloced.  The 'validate'
440  * flag is used to distinguish the two; when true, we validate that the inode
441  * being retrieved looks like a linked and allocated inode.
442  */
443 /* ARGSUSED */
444 static int
445 ufs_iget_internal(struct vfs *vfsp, ino_t ino, struct inode **ipp,
446     struct cred *cr, int validate)
447 {
448 	struct inode *ip, *sp;
449 	union ihead *ih;
450 	kmutex_t *ihm;
451 	struct buf *bp;
452 	struct dinode *dp;
453 	struct vnode *vp;
454 	extern vfs_t EIO_vfs;
455 	int error;
456 	int ftype;	/* XXX - Remove later on */
457 	dev_t vfs_dev;
458 	struct ufsvfs *ufsvfsp;
459 	struct fs *fs;
460 	int hno;
461 	daddr_t bno;
462 	ulong_t ioff;
463 
464 	CPU_STATS_ADD_K(sys, ufsiget, 1);
465 
466 	/*
467 	 * Lookup inode in cache.
468 	 */
469 	vfs_dev = vfsp->vfs_dev;
470 	hno = INOHASH(ino);
471 	ih = &ihead[hno];
472 	ihm = &ih_lock[hno];
473 
474 again:
475 	mutex_enter(ihm);
476 	for (ip = ih->ih_chain[0]; ip != (struct inode *)ih; ip = ip->i_forw) {
477 		if (ino != ip->i_number || vfs_dev != ip->i_dev ||
478 		    (ip->i_flag & ISTALE))
479 			continue;
480 
481 		/*
482 		 * Found the interesting inode; hold it and drop the cache lock
483 		 */
484 		vp = ITOV(ip);	/* for locknest */
485 		VN_HOLD(vp);
486 		mutex_exit(ihm);
487 		rw_enter(&ip->i_contents, RW_READER);
488 
489 		/*
490 		 * if necessary, remove from idle list
491 		 */
492 		if ((ip->i_flag & IREF) == 0) {
493 			if (ufs_rmidle(ip))
494 				VN_RELE(vp);
495 		}
496 
497 		/*
498 		 * Could the inode be read from disk?
499 		 */
500 		if (ip->i_flag & ISTALE) {
501 			rw_exit(&ip->i_contents);
502 			VN_RELE(vp);
503 			goto again;
504 		}
505 
506 		ins.in_hits.value.ul++;
507 		*ipp = ip;
508 
509 		/*
510 		 * Reset the vnode's attribute flags
511 		 */
512 		mutex_enter(&vp->v_lock);
513 		ufs_reset_vnode(vp);
514 		mutex_exit(&vp->v_lock);
515 
516 		rw_exit(&ip->i_contents);
517 
518 		return (0);
519 	}
520 	mutex_exit(ihm);
521 
522 	/*
523 	 * Inode was not in cache.
524 	 *
525 	 * Allocate a new entry
526 	 */
527 	ufsvfsp = (struct ufsvfs *)vfsp->vfs_data;
528 	fs = ufsvfsp->vfs_fs;
529 
530 	ip = ufs_alloc_inode(ufsvfsp, ino);
531 	vp = ITOV(ip);
532 
533 	bno = fsbtodb(fs, itod(fs, ino));
534 	ioff = (sizeof (struct dinode)) * (itoo(fs, ino));
535 	ip->i_doff = (offset_t)ioff + ldbtob(bno);
536 
537 	/*
538 	 * put a place holder in the cache (if not already there)
539 	 */
540 	mutex_enter(ihm);
541 	for (sp = ih->ih_chain[0]; sp != (struct inode *)ih; sp = sp->i_forw)
542 		if (ino == sp->i_number && vfs_dev == sp->i_dev &&
543 		    ((sp->i_flag & ISTALE) == 0)) {
544 			mutex_exit(ihm);
545 			ufs_free_inode(ip);
546 			goto again;
547 		}
548 	/*
549 	 * It would be nice to ASSERT(RW_READ_HELD(&ufsvfsp->vfs_dqrwlock))
550 	 * here, but if we do, then shadow inode allocations panic the
551 	 * system.  We don't have to hold vfs_dqrwlock for shadow inodes
552 	 * and the ufs_iget() parameters don't tell us what we are getting
553 	 * so we have no way of knowing this is a ufs_iget() call from
554 	 * a ufs_ialloc() call for a shadow inode.
555 	 */
556 	rw_enter(&ip->i_contents, RW_WRITER);
557 	insque(ip, ih);
558 	mutex_exit(ihm);
559 	/*
560 	 * read the dinode
561 	 */
562 	bp = UFS_BREAD(ufsvfsp, ip->i_dev, bno, (int)fs->fs_bsize);
563 
564 	/*
565 	 * Check I/O errors
566 	 */
567 	error = ((bp->b_flags & B_ERROR) ? geterror(bp) : 0);
568 	if (error) {
569 		brelse(bp);
570 		ip->i_flag |= ISTALE;	/* in case someone is looking it up */
571 		rw_exit(&ip->i_contents);
572 		vp->v_vfsp = &EIO_vfs;
573 		VN_RELE(vp);
574 		return (error);
575 	}
576 	/*
577 	 * initialize the inode's dinode
578 	 */
579 	dp = (struct dinode *)(ioff + bp->b_un.b_addr);
580 	ip->i_ic = dp->di_ic;			/* structure assignment */
581 	brelse(bp);
582 
583 	/*
584 	 * Maintain compatibility with Solaris 1.x UFS
585 	 */
586 	if (ip->i_suid != UID_LONG)
587 		ip->i_uid = ip->i_suid;
588 	if (ip->i_sgid != GID_LONG)
589 		ip->i_gid = ip->i_sgid;
590 
591 	ftype = ip->i_mode & IFMT;
592 	if (ftype == IFBLK || ftype == IFCHR) {
593 		dev_t dv;
594 		uint_t top16 = ip->i_ordev & 0xffff0000u;
595 
596 		if (top16 == 0 || top16 == 0xffff0000u)
597 			dv = expdev(ip->i_ordev);
598 		else
599 			dv = expldev(ip->i_ordev);
600 		vp->v_rdev = ip->i_rdev = dv;
601 	}
602 
603 	/*
604 	 * if our caller only expects allocated inodes, verify that
605 	 * this inode looks good; throw it out if it's bad.
606 	 */
607 	if (validate) {
608 		if ((ftype == 0) || (ip->i_nlink <= 0)) {
609 			ip->i_flag |= ISTALE;
610 			rw_exit(&ip->i_contents);
611 			vp->v_vfsp = &EIO_vfs;
612 			VN_RELE(vp);
613 			cmn_err(CE_NOTE,
614 			    "%s: unexpected free inode %d, run fsck(1M)%s",
615 			    fs->fs_fsmnt, (int)ino,
616 			    (TRANS_ISTRANS(ufsvfsp) ? " -o f" : ""));
617 			return (EIO);
618 		}
619 	}
620 
621 	/*
622 	 * Finish initializing the vnode, special handling for shadow inodes
623 	 * because IFTOVT() will produce a v_type of VNON which is not what we
624 	 * want, set v_type to VREG explicitly in that case.
625 	 */
626 	if (ftype == IFSHAD) {
627 		vp->v_type = VREG;
628 	} else {
629 		vp->v_type = IFTOVT((mode_t)ip->i_mode);
630 	}
631 
632 	ufs_reset_vnode(vp);
633 
634 	/*
635 	 * read the shadow
636 	 */
637 	if (ftype != 0 && ip->i_shadow != 0) {
638 		if ((error = ufs_si_load(ip, cr)) != 0) {
639 			ip->i_flag |= ISTALE;
640 			ip->i_ufs_acl = NULL;
641 			rw_exit(&ip->i_contents);
642 			vp->v_vfsp = &EIO_vfs;
643 			VN_RELE(vp);
644 			return (error);
645 		}
646 	}
647 
648 	/*
649 	 * Only attach quota information if the inode has a type and if
650 	 * that type is not a shadow inode.
651 	 */
652 	if (ip->i_mode && ((ip->i_mode & IFMT) != IFSHAD) &&
653 	    ((ip->i_mode & IFMT) != IFATTRDIR)) {
654 		ip->i_dquot = getinoquota(ip);
655 	}
656 	TRANS_MATA_IGET(ufsvfsp, ip);
657 	*ipp = ip;
658 	rw_exit(&ip->i_contents);
659 
660 	return (0);
661 }
662 
663 /*
664  * Vnode is no longer referenced, write the inode out
665  * and if necessary, truncate and deallocate the file.
666  */
667 void
668 ufs_iinactive(struct inode *ip)
669 {
670 	int		front;
671 	struct inode	*iq;
672 	struct inode	*hip;
673 	struct ufs_q	*uq;
674 	struct vnode	*vp = ITOV(ip);
675 	struct ufsvfs   *ufsvfsp = ip->i_ufsvfs;
676 	struct ufs_delq_info *delq_info = &ufsvfsp->vfs_delete_info;
677 
678 	/*
679 	 * Because the vnode type might have been changed,
680 	 * the dnlc_dir_purge must be called unconditionally.
681 	 */
682 	dnlc_dir_purge(&ip->i_danchor);
683 
684 	/*
685 	 * Get exclusive access to inode data.
686 	 */
687 	rw_enter(&ip->i_contents, RW_WRITER);
688 	ASSERT(ip->i_flag & IREF);
689 
690 	/*
691 	 * Make sure no one reclaimed the inode before we put it on
692 	 * the freelist or destroy it. We keep our 'hold' on the vnode
693 	 * from vn_rele until we are ready to do something with the inode.
694 	 *
695 	 * Pageout may put a VN_HOLD/VN_RELE at anytime during this
696 	 * operation via an async putpage, so we must make sure
697 	 * we don't free/destroy the inode more than once. ufs_iget
698 	 * may also put a VN_HOLD on the inode before it grabs
699 	 * the i_contents lock. This is done so we don't free
700 	 * an inode that a thread is waiting on.
701 	 */
702 	mutex_enter(&vp->v_lock);
703 
704 	if (vp->v_count > 1) {
705 		vp->v_count--;  /* release our hold from vn_rele */
706 		mutex_exit(&vp->v_lock);
707 		rw_exit(&ip->i_contents);
708 		return;
709 	}
710 	mutex_exit(&vp->v_lock);
711 
712 	/*
713 	 * For umount case: if ufsvfs ptr is NULL, the inode is unhashed
714 	 * and clean.  It can be safely destroyed (cyf).
715 	 */
716 	if (ip->i_ufsvfs == NULL) {
717 		rw_exit(&ip->i_contents);
718 		ufs_si_del(ip);
719 		ASSERT((vp->v_type == VCHR) || !vn_has_cached_data(vp));
720 		ufs_free_inode(ip);
721 		return;
722 	}
723 
724 	/*
725 	 * queue idle inode to appropriate thread. Will check v_count == 1
726 	 * prior to putting this on the appropriate queue.
727 	 * Stale inodes will be unhashed and freed by the ufs idle thread
728 	 * in ufs_idle_free()
729 	 */
730 	front = 1;
731 	if ((ip->i_flag & ISTALE) == 0 && ip->i_fs->fs_ronly == 0 &&
732 	    ip->i_mode && ip->i_nlink <= 0) {
733 		/*
734 		 * Mark the i_flag to indicate that inode is being deleted.
735 		 * This flag will be cleared when the deletion is complete.
736 		 * This prevents nfs from sneaking in via ufs_vget() while
737 		 * the delete is in progress (bugid 1242481).
738 		 */
739 		ip->i_flag |= IDEL;
740 
741 		/*
742 		 * NOIDEL means that deletes are not allowed at this time;
743 		 * whoever resets NOIDEL will also send this inode back
744 		 * through ufs_iinactive.  IREF remains set.
745 		 */
746 		if (ULOCKFS_IS_NOIDEL(ITOUL(ip))) {
747 			mutex_enter(&vp->v_lock);
748 			vp->v_count--;
749 			mutex_exit(&vp->v_lock);
750 			rw_exit(&ip->i_contents);
751 			return;
752 		}
753 		if (!TRANS_ISTRANS(ip->i_ufsvfs)) {
754 			rw_exit(&ip->i_contents);
755 			ufs_delete(ip->i_ufsvfs, ip, 0);
756 			return;
757 		}
758 
759 		/* queue to delete thread; IREF remains set */
760 		ins.in_qfree.value.ul++;
761 		uq = &ip->i_ufsvfs->vfs_delete;
762 
763 		mutex_enter(&uq->uq_mutex);
764 
765 		/* add to q */
766 		if ((iq = uq->uq_ihead) != 0) {
767 			ip->i_freef = iq;
768 			ip->i_freeb = iq->i_freeb;
769 			iq->i_freeb->i_freef = ip;
770 			iq->i_freeb = ip;
771 			if (front)
772 				uq->uq_ihead = ip;
773 		} else {
774 			uq->uq_ihead = ip;
775 			ip->i_freef = ip;
776 			ip->i_freeb = ip;
777 		}
778 
779 		delq_info->delq_unreclaimed_files += 1;
780 		delq_info->delq_unreclaimed_blocks += ip->i_blocks;
781 	} else {
782 		/*
783 		 * queue to idle thread
784 		 *  Check the v_count == 1 again.
785 		 *
786 		 */
787 		mutex_enter(&vp->v_lock);
788 		if (vp->v_count > 1) {
789 			vp->v_count--;  /* release our hold from vn_rele */
790 			mutex_exit(&vp->v_lock);
791 			rw_exit(&ip->i_contents);
792 			return;
793 		}
794 		mutex_exit(&vp->v_lock);
795 		uq = &ufs_idle_q;
796 
797 		/*
798 		 * useful iff it has pages or is a fastsymlink; otherwise junk
799 		 */
800 		mutex_enter(&uq->uq_mutex);
801 
802 		/* clear IREF means `on idle list' */
803 		ip->i_flag &= ~(IREF | IDIRECTIO);
804 
805 		if (vn_has_cached_data(vp) || ip->i_flag & IFASTSYMLNK) {
806 			ins.in_frback.value.ul++;
807 			hip = (inode_t *)&ufs_useful_iq[IQHASH(ip)];
808 			ufs_nuseful_iq++;
809 		} else {
810 			ins.in_frfront.value.ul++;
811 			hip = (inode_t *)&ufs_junk_iq[IQHASH(ip)];
812 			ip->i_flag |= IJUNKIQ;
813 			ufs_njunk_iq++;
814 		}
815 		ip->i_freef = hip;
816 		ip->i_freeb = hip->i_freeb;
817 		hip->i_freeb->i_freef = ip;
818 		hip->i_freeb = ip;
819 	}
820 
821 	/* wakeup thread(s) if q is overfull */
822 	if (++uq->uq_ne == uq->uq_lowat)
823 		cv_broadcast(&uq->uq_cv);
824 
825 	/* all done, release the q and inode */
826 	mutex_exit(&uq->uq_mutex);
827 	rw_exit(&ip->i_contents);
828 }
829 
830 /*
831  * Check accessed and update flags on an inode structure.
832  * If any are on, update the inode with the (unique) current time.
833  * If waitfor is given, insure I/O order so wait for write to complete.
834  */
835 void
836 ufs_iupdat(struct inode *ip, int waitfor)
837 {
838 	struct buf	*bp;
839 	struct fs	*fp;
840 	struct dinode	*dp;
841 	struct ufsvfs	*ufsvfsp 	= ip->i_ufsvfs;
842 	int 		i;
843 	int		do_trans_times;
844 	ushort_t	flag;
845 	o_uid_t		suid;
846 	o_gid_t		sgid;
847 
848 	/*
849 	 * This function is now safe to be called with either the reader
850 	 * or writer i_contents lock.
851 	 */
852 	ASSERT(RW_LOCK_HELD(&ip->i_contents));
853 
854 	/*
855 	 * Return if file system has been forcibly umounted.
856 	 */
857 	if (ufsvfsp == NULL)
858 		return;
859 
860 	flag = ip->i_flag;	/* Atomic read */
861 	/*
862 	 * We better not update the disk inode from a stale inode.
863 	 */
864 	if (flag & ISTALE)
865 		return;
866 
867 	fp = ip->i_fs;
868 
869 	if ((flag & (IUPD|IACC|ICHG|IMOD|IMODACC|IATTCHG)) != 0) {
870 		if (fp->fs_ronly) {
871 			mutex_enter(&ip->i_tlock);
872 			ip->i_flag &= ~(IUPD|IACC|ICHG|IMOD|IMODACC|IATTCHG);
873 			mutex_exit(&ip->i_tlock);
874 			return;
875 		}
876 		/*
877 		 * fs is active while metadata is being written
878 		 */
879 		mutex_enter(&ufsvfsp->vfs_lock);
880 		ufs_notclean(ufsvfsp);
881 		/*
882 		 * get the dinode
883 		 */
884 		bp = UFS_BREAD(ufsvfsp, ip->i_dev,
885 		    (daddr_t)fsbtodb(fp, itod(fp, ip->i_number)),
886 		    (int)fp->fs_bsize);
887 		if (bp->b_flags & B_ERROR) {
888 			mutex_enter(&ip->i_tlock);
889 			ip->i_flag &=
890 			    ~(IUPD|IACC|ICHG|IMOD|IMODACC|IATTCHG);
891 			mutex_exit(&ip->i_tlock);
892 			brelse(bp);
893 			return;
894 		}
895 		/*
896 		 * munge inode fields
897 		 */
898 		mutex_enter(&ip->i_tlock);
899 		ITIMES_NOLOCK(ip);
900 		do_trans_times = ((ip->i_flag & (IMOD|IMODACC)) == IMODACC);
901 		ip->i_flag &= ~(IUPD|IACC|ICHG|IMOD|IMODACC|IATTCHG);
902 		mutex_exit(&ip->i_tlock);
903 
904 		/*
905 		 * For reads and concurrent re-writes, no deltas were
906 		 * entered for the access time changes - do it now.
907 		 */
908 		if (do_trans_times) {
909 			TRANS_INODE_TIMES(ufsvfsp, ip);
910 		}
911 
912 		/*
913 		 * For SunOS 5.0->5.4, these lines below read:
914 		 *
915 		 * suid = (ip->i_uid > MAXUID) ? UID_LONG : ip->i_uid;
916 		 * sgid = (ip->i_gid > MAXUID) ? GID_LONG : ip->i_gid;
917 		 *
918 		 * where MAXUID was set to 60002.  This was incorrect -
919 		 * the uids should have been constrained to what fitted into
920 		 * a 16-bit word.
921 		 *
922 		 * This means that files from 4.x filesystems that have an
923 		 * i_suid field larger than 60002 will have that field
924 		 * changed to 65535.
925 		 *
926 		 * Security note: 4.x UFS could never create a i_suid of
927 		 * UID_LONG since that would've corresponded to -1.
928 		 */
929 		suid = (ulong_t)ip->i_uid > (ulong_t)USHRT_MAX ?
930 		    UID_LONG : ip->i_uid;
931 		sgid = (ulong_t)ip->i_gid > (ulong_t)USHRT_MAX ?
932 		    GID_LONG : ip->i_gid;
933 
934 		if ((ip->i_suid != suid) || (ip->i_sgid != sgid)) {
935 			ip->i_suid = suid;
936 			ip->i_sgid = sgid;
937 			TRANS_INODE(ufsvfsp, ip);
938 		}
939 
940 		if ((ip->i_mode & IFMT) == IFBLK ||
941 		    (ip->i_mode & IFMT) == IFCHR) {
942 			dev_t d = ip->i_rdev;
943 			dev32_t dev32;
944 
945 			/*
946 			 * load first direct block only if special device
947 			 */
948 			if (!cmpldev(&dev32, d)) {
949 				/*
950 				 * We panic here because there's "no way"
951 				 * we should have been able to create a large
952 				 * inode with a large dev_t.  Earlier layers
953 				 * should've caught this.
954 				 */
955 				panic("ip %p: i_rdev too big", (void *)ip);
956 			}
957 
958 			if (dev32 & ~((O_MAXMAJ << L_BITSMINOR32) | O_MAXMIN)) {
959 				ip->i_ordev = dev32;	/* can't use old fmt. */
960 			} else {
961 				ip->i_ordev = cmpdev(d);
962 			}
963 		}
964 
965 		/*
966 		 * copy inode to dinode (zero fastsymlnk in dinode)
967 		 */
968 		dp = (struct dinode *)bp->b_un.b_addr + itoo(fp, ip->i_number);
969 		dp->di_ic = ip->i_ic;	/* structure assignment */
970 		if (flag & IFASTSYMLNK) {
971 			for (i = 1; i < NDADDR; i++)
972 				dp->di_db[i] = 0;
973 			for (i = 0; i < NIADDR; i++)
974 				dp->di_ib[i] = 0;
975 		}
976 		if (TRANS_ISTRANS(ufsvfsp)) {
977 			/*
978 			 * Pass only a sector size buffer containing
979 			 * the inode, otherwise when the buffer is copied
980 			 * into a cached roll buffer then too much memory
981 			 * gets consumed if 8KB inode buffers are passed.
982 			 */
983 			TRANS_LOG(ufsvfsp, (caddr_t)dp, ip->i_doff,
984 			    sizeof (struct dinode),
985 			    (caddr_t)P2ALIGN((uintptr_t)dp, DEV_BSIZE),
986 			    DEV_BSIZE);
987 
988 			brelse(bp);
989 		} else if (waitfor && (ip->i_ufsvfs->vfs_dio == 0)) {
990 			UFS_BRWRITE(ufsvfsp, bp);
991 
992 			/*
993 			 * Synchronous write has guaranteed that inode
994 			 * has been written on disk so clear the flag
995 			 */
996 			mutex_enter(&ip->i_tlock);
997 			ip->i_flag &= ~IBDWRITE;
998 			mutex_exit(&ip->i_tlock);
999 		} else {
1000 			bdrwrite(bp);
1001 
1002 			/*
1003 			 * This write hasn't guaranteed that inode has been
1004 			 * written on the disk.
1005 			 * Since, all updat flags on inode are cleared, we must
1006 			 * remember the condition in case inode is to be updated
1007 			 * synchronously later (e.g.- fsync()/fdatasync())
1008 			 * and inode has not been modified yet.
1009 			 */
1010 			mutex_enter(&ip->i_tlock);
1011 			ip->i_flag |= IBDWRITE;
1012 			mutex_exit(&ip->i_tlock);
1013 		}
1014 	} else {
1015 		/*
1016 		 * In case previous inode update was done asynchronously
1017 		 * (IBDWRITE) and this inode update request wants guaranteed
1018 		 * (synchronous) disk update, flush the inode.
1019 		 */
1020 		if (waitfor && (flag & IBDWRITE)) {
1021 			blkflush(ip->i_dev,
1022 			    (daddr_t)fsbtodb(fp, itod(fp, ip->i_number)));
1023 			mutex_enter(&ip->i_tlock);
1024 			ip->i_flag &= ~IBDWRITE;
1025 			mutex_exit(&ip->i_tlock);
1026 		}
1027 	}
1028 }
1029 
1030 #define	SINGLE	0	/* index of single indirect block */
1031 #define	DOUBLE	1	/* index of double indirect block */
1032 #define	TRIPLE	2	/* index of triple indirect block */
1033 
1034 /*
1035  * Release blocks associated with the inode ip and
1036  * stored in the indirect block bn.  Blocks are free'd
1037  * in LIFO order up to (but not including) lastbn.  If
1038  * level is greater than SINGLE, the block is an indirect
1039  * block and recursive calls to indirtrunc must be used to
1040  * cleanse other indirect blocks.
1041  *
1042  * N.B.: triple indirect blocks are untested.
1043  */
1044 static long
1045 indirtrunc(struct inode *ip, daddr_t bn, daddr_t lastbn, int level, int flags)
1046 {
1047 	int i;
1048 	struct buf *bp, *copy;
1049 	daddr32_t *bap;
1050 	struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
1051 	struct fs *fs = ufsvfsp->vfs_fs;
1052 	daddr_t nb, last;
1053 	long factor;
1054 	int blocksreleased = 0, nblocks;
1055 
1056 	ASSERT(RW_WRITE_HELD(&ip->i_contents));
1057 	/*
1058 	 * Calculate index in current block of last
1059 	 * block to be kept.  -1 indicates the entire
1060 	 * block so we need not calculate the index.
1061 	 */
1062 	factor = 1;
1063 	for (i = SINGLE; i < level; i++)
1064 		factor *= NINDIR(fs);
1065 	last = lastbn;
1066 	if (lastbn > 0)
1067 		last /= factor;
1068 	nblocks = btodb(fs->fs_bsize);
1069 	/*
1070 	 * Get buffer of block pointers, zero those
1071 	 * entries corresponding to blocks to be free'd,
1072 	 * and update on disk copy first.
1073 	 * *Unless* the root pointer has been synchronously
1074 	 * written to disk.  If nothing points to this
1075 	 * indirect block then don't bother zero'ing and
1076 	 * writing it.
1077 	 */
1078 	bp = UFS_BREAD(ufsvfsp,
1079 	    ip->i_dev, (daddr_t)fsbtodb(fs, bn), (int)fs->fs_bsize);
1080 	if (bp->b_flags & B_ERROR) {
1081 		brelse(bp);
1082 		return (0);
1083 	}
1084 	bap = bp->b_un.b_daddr;
1085 	if ((flags & I_CHEAP) == 0) {
1086 		uint_t	zb;
1087 
1088 		zb = (uint_t)((NINDIR(fs) - (last + 1)) * sizeof (daddr32_t));
1089 
1090 		if (zb) {
1091 			/*
1092 			 * push any data into the log before we zero it
1093 			 */
1094 			if (bp->b_flags & B_DELWRI)
1095 				TRANS_LOG(ufsvfsp, (caddr_t)bap,
1096 				    ldbtob(bp->b_blkno), bp->b_bcount,
1097 				    bp->b_un.b_addr, bp->b_bcount);
1098 			copy = ngeteblk(fs->fs_bsize);
1099 			bcopy((caddr_t)bap, (caddr_t)copy->b_un.b_daddr,
1100 			    (uint_t)fs->fs_bsize);
1101 			bzero((caddr_t)&bap[last + 1], zb);
1102 
1103 			TRANS_BUF(ufsvfsp,
1104 			    (caddr_t)&bap[last + 1] - (caddr_t)bap,
1105 			    zb, bp, DT_ABZERO);
1106 
1107 			UFS_BRWRITE(ufsvfsp, bp);
1108 			bp = copy, bap = bp->b_un.b_daddr;
1109 		}
1110 	} else {
1111 		/* make sure write retries are also cleared */
1112 		bp->b_flags &= ~(B_DELWRI | B_RETRYWRI);
1113 		bp->b_flags |= B_STALE | B_AGE;
1114 	}
1115 
1116 	/*
1117 	 * Recursively free totally unused blocks.
1118 	 */
1119 	flags |= I_CHEAP;
1120 	for (i = NINDIR(fs) - 1; i > last; i--) {
1121 		nb = bap[i];
1122 		if (nb == 0)
1123 			continue;
1124 		if (level > SINGLE) {
1125 			blocksreleased +=
1126 			    indirtrunc(ip, nb, (daddr_t)-1, level - 1, flags);
1127 			free(ip, nb, (off_t)fs->fs_bsize, flags | I_IBLK);
1128 		} else
1129 			free(ip, nb, (off_t)fs->fs_bsize, flags);
1130 		blocksreleased += nblocks;
1131 	}
1132 	flags &= ~I_CHEAP;
1133 
1134 	/*
1135 	 * Recursively free last partial block.
1136 	 */
1137 	if (level > SINGLE && lastbn >= 0) {
1138 		last = lastbn % factor;
1139 		nb = bap[i];
1140 		if (nb != 0)
1141 			blocksreleased +=
1142 			    indirtrunc(ip, nb, last, level - 1, flags);
1143 	}
1144 	brelse(bp);
1145 	return (blocksreleased);
1146 }
1147 
1148 /*
1149  * Truncate the inode ip to at most length size.
1150  * Free affected disk blocks -- the blocks of the
1151  * file are removed in reverse order.
1152  *
1153  * N.B.: triple indirect blocks are untested.
1154  */
1155 static int i_genrand = 1234;
1156 int
1157 ufs_itrunc(struct inode *oip, u_offset_t length, int flags, cred_t *cr)
1158 {
1159 	struct fs *fs = oip->i_fs;
1160 	struct ufsvfs *ufsvfsp = oip->i_ufsvfs;
1161 	struct inode *ip;
1162 	daddr_t lastblock;
1163 	off_t bsize;
1164 	int boff;
1165 	daddr_t bn, lastiblock[NIADDR];
1166 	int level;
1167 	long nblocks, blocksreleased = 0;
1168 	int i;
1169 	ushort_t mode;
1170 	struct inode tip;
1171 	int err;
1172 	u_offset_t maxoffset = (ufsvfsp->vfs_lfflags & UFS_LARGEFILES) ?
1173 	    (UFS_MAXOFFSET_T) : (MAXOFF32_T);
1174 
1175 	/*
1176 	 * Shadow inodes do not need to hold the vfs_dqrwlock lock. Most
1177 	 * other uses need the reader lock. opendq() holds the writer lock.
1178 	 */
1179 	ASSERT((oip->i_mode & IFMT) == IFSHAD ||
1180 	    RW_LOCK_HELD(&ufsvfsp->vfs_dqrwlock));
1181 	ASSERT(RW_WRITE_HELD(&oip->i_contents));
1182 	/*
1183 	 * We only allow truncation of regular files and directories
1184 	 * to arbitrary lengths here.  In addition, we allow symbolic
1185 	 * links to be truncated only to zero length.  Other inode
1186 	 * types cannot have their length set here.  Disk blocks are
1187 	 * being dealt with - especially device inodes where
1188 	 * ip->i_ordev is actually being stored in ip->i_db[0]!
1189 	 */
1190 	TRANS_INODE(ufsvfsp, oip);
1191 	mode = oip->i_mode & IFMT;
1192 	if (flags & I_FREE) {
1193 		i_genrand *= 16843009;  /* turns into shift and adds */
1194 		i_genrand++;
1195 		oip->i_gen += ((i_genrand + ddi_get_lbolt()) & 0xffff) + 1;
1196 		oip->i_flag |= ICHG |IUPD;
1197 		oip->i_seq++;
1198 		if (length == oip->i_size)
1199 			return (0);
1200 		flags |= I_CHEAP;
1201 	}
1202 	if (mode == IFIFO)
1203 		return (0);
1204 	if (mode != IFREG && mode != IFDIR && mode != IFATTRDIR &&
1205 	    !(mode == IFLNK && length == (offset_t)0) && mode != IFSHAD)
1206 		return (EINVAL);
1207 	if (length > maxoffset)
1208 		return (EFBIG);
1209 	if ((mode == IFDIR) || (mode == IFATTRDIR))
1210 		flags |= I_DIR;
1211 	if (mode == IFSHAD)
1212 		flags |= I_SHAD;
1213 	if (oip == ufsvfsp->vfs_qinod)
1214 		flags |= I_QUOTA;
1215 	if (length == oip->i_size) {
1216 		/* update ctime and mtime to please POSIX tests */
1217 		oip->i_flag |= ICHG |IUPD;
1218 		oip->i_seq++;
1219 		if (length == 0) {
1220 			/* nothing to cache so clear the flag */
1221 			oip->i_flag &= ~IFASTSYMLNK;
1222 		}
1223 		return (0);
1224 	}
1225 	/* wipe out fast symlink till next access */
1226 	if (oip->i_flag & IFASTSYMLNK) {
1227 		int j;
1228 
1229 		ASSERT(ITOV(oip)->v_type == VLNK);
1230 
1231 		oip->i_flag &= ~IFASTSYMLNK;
1232 
1233 		for (j = 1; j < NDADDR; j++)
1234 			oip->i_db[j] = 0;
1235 		for (j = 0; j < NIADDR; j++)
1236 			oip->i_ib[j] = 0;
1237 	}
1238 
1239 	boff = (int)blkoff(fs, length);
1240 
1241 	if (length > oip->i_size) {
1242 		/*
1243 		 * Trunc up case.  BMAPALLOC will insure that the right blocks
1244 		 * are allocated.  This includes extending the old frag to a
1245 		 * full block (if needed) in addition to doing any work
1246 		 * needed for allocating the last block.
1247 		 */
1248 		if (boff == 0)
1249 			err = BMAPALLOC(oip, length - 1, (int)fs->fs_bsize, cr);
1250 		else
1251 			err = BMAPALLOC(oip, length - 1, boff, cr);
1252 
1253 		if (err == 0) {
1254 			/*
1255 			 * Save old size and set inode's size now
1256 			 * so that we don't cause too much of the
1257 			 * file to be zero'd and pushed.
1258 			 */
1259 			u_offset_t osize = oip->i_size;
1260 			oip->i_size  = length;
1261 			/*
1262 			 * Make sure we zero out the remaining bytes of
1263 			 * the page in case a mmap scribbled on it. We
1264 			 * can't prevent a mmap from writing beyond EOF
1265 			 * on the last page of a file.
1266 			 *
1267 			 */
1268 			if ((boff = (int)blkoff(fs, osize)) != 0) {
1269 				bsize = (int)lblkno(fs, osize - 1) >= NDADDR ?
1270 				    fs->fs_bsize : fragroundup(fs, boff);
1271 				pvn_vpzero(ITOV(oip), osize,
1272 				    (size_t)(bsize - boff));
1273 			}
1274 			oip->i_flag |= ICHG|IATTCHG;
1275 			oip->i_seq++;
1276 			ITIMES_NOLOCK(oip);
1277 			/*
1278 			 * MAXOFF32_T is old 2GB size limit. If
1279 			 * this operation caused a large file to be
1280 			 * created, turn on the superblock flag
1281 			 * and update the superblock, if the flag
1282 			 * is not already on.
1283 			 */
1284 			if ((length > (u_offset_t)MAXOFF32_T) &&
1285 			    !(fs->fs_flags & FSLARGEFILES)) {
1286 				ASSERT(ufsvfsp->vfs_lfflags & UFS_LARGEFILES);
1287 				mutex_enter(&ufsvfsp->vfs_lock);
1288 				fs->fs_flags |= FSLARGEFILES;
1289 				ufs_sbwrite(ufsvfsp);
1290 				mutex_exit(&ufsvfsp->vfs_lock);
1291 			}
1292 		}
1293 
1294 		return (err);
1295 	}
1296 
1297 	/*
1298 	 * Update the pages of the file.  If the file is not being
1299 	 * truncated to a block boundary, the contents of the
1300 	 * pages following the end of the file must be zero'ed
1301 	 * in case it ever become accessible again because
1302 	 * of subsequent file growth.
1303 	 */
1304 	if (boff == 0) {
1305 		(void) pvn_vplist_dirty(ITOV(oip), length, ufs_putapage,
1306 		    B_INVAL | B_TRUNC, CRED());
1307 	} else {
1308 		/*
1309 		 * Make sure that the last block is properly allocated.
1310 		 * We only really have to do this if the last block is
1311 		 * actually allocated since ufs_bmap will now handle the case
1312 		 * of an fragment which has no block allocated.  Just to
1313 		 * be sure, we do it now independent of current allocation.
1314 		 */
1315 		err = BMAPALLOC(oip, length - 1, boff, cr);
1316 		if (err)
1317 			return (err);
1318 
1319 		/*
1320 		 * BMAPALLOC will call bmap_write which defers i_seq
1321 		 * processing.  If the timestamps were changed, update
1322 		 * i_seq before rdip drops i_contents or syncs the inode.
1323 		 */
1324 		if (oip->i_flag & (ICHG|IUPD))
1325 			oip->i_seq++;
1326 
1327 		/*
1328 		 * BugId 4069932
1329 		 * Make sure that the relevant partial page appears in
1330 		 * the v_pages list, so that pvn_vpzero() will do its
1331 		 * job.  Since doing this correctly requires everything
1332 		 * in rdip() except for the uiomove(), it's easier and
1333 		 * safer to do the uiomove() rather than duplicate the
1334 		 * rest of rdip() here.
1335 		 *
1336 		 * To get here, we know that length indicates a byte
1337 		 * that is not the first byte of a block.  (length - 1)
1338 		 * is the last actual byte known to exist.  Deduction
1339 		 * shows it is in the same block as byte (length).
1340 		 * Thus, this rdip() invocation should always succeed
1341 		 * except in the face of i/o errors, and give us the
1342 		 * block we care about.
1343 		 *
1344 		 * rdip() makes the same locking assertions and
1345 		 * assumptions as we do.  We do not acquire any locks
1346 		 * before calling it, so we have not changed the locking
1347 		 * situation.  Finally, there do not appear to be any
1348 		 * paths whereby rdip() ends up invoking us again.
1349 		 * Thus, infinite recursion is avoided.
1350 		 */
1351 		{
1352 			uio_t uio;
1353 			iovec_t iov[1];
1354 			char buffer;
1355 
1356 			uio.uio_iov = iov;
1357 			uio.uio_iovcnt = 1;
1358 			uio.uio_loffset = length - 1;
1359 			uio.uio_resid = 1;
1360 			uio.uio_segflg = UIO_SYSSPACE;
1361 			uio.uio_extflg = UIO_COPY_CACHED;
1362 
1363 			iov[0].iov_base = &buffer;
1364 			iov[0].iov_len = 1;
1365 
1366 			err = rdip(oip, &uio, UIO_READ, NULL);
1367 			if (err)
1368 				return (err);
1369 		}
1370 
1371 		bsize = (int)lblkno(fs, length - 1) >= NDADDR ?
1372 		    fs->fs_bsize : fragroundup(fs, boff);
1373 		pvn_vpzero(ITOV(oip), length, (size_t)(bsize - boff));
1374 		/*
1375 		 * Ensure full fs block is marked as dirty.
1376 		 */
1377 		(void) pvn_vplist_dirty(ITOV(oip), length + (bsize - boff),
1378 		    ufs_putapage, B_INVAL | B_TRUNC, CRED());
1379 	}
1380 
1381 	/*
1382 	 * Calculate index into inode's block list of
1383 	 * last direct and indirect blocks (if any)
1384 	 * which we want to keep.  Lastblock is -1 when
1385 	 * the file is truncated to 0.
1386 	 */
1387 	lastblock = lblkno(fs, length + fs->fs_bsize - 1) - 1;
1388 	lastiblock[SINGLE] = lastblock - NDADDR;
1389 	lastiblock[DOUBLE] = lastiblock[SINGLE] - NINDIR(fs);
1390 	lastiblock[TRIPLE] = lastiblock[DOUBLE] - NINDIR(fs) * NINDIR(fs);
1391 	nblocks = btodb(fs->fs_bsize);
1392 
1393 	/*
1394 	 * Update file and block pointers
1395 	 * on disk before we start freeing blocks.
1396 	 * If we crash before free'ing blocks below,
1397 	 * the blocks will be returned to the free list.
1398 	 * lastiblock values are also normalized to -1
1399 	 * for calls to indirtrunc below.
1400 	 */
1401 	tip = *oip;			/* structure copy */
1402 	ip = &tip;
1403 
1404 	for (level = TRIPLE; level >= SINGLE; level--)
1405 		if (lastiblock[level] < 0) {
1406 			oip->i_ib[level] = 0;
1407 			lastiblock[level] = -1;
1408 		}
1409 	for (i = NDADDR - 1; i > lastblock; i--) {
1410 		oip->i_db[i] = 0;
1411 		flags |= I_CHEAP;
1412 	}
1413 	oip->i_size = length;
1414 	oip->i_flag |= ICHG|IUPD|IATTCHG;
1415 	oip->i_seq++;
1416 	if (!TRANS_ISTRANS(ufsvfsp))
1417 		ufs_iupdat(oip, I_SYNC);	/* do sync inode update */
1418 
1419 	/*
1420 	 * Indirect blocks first.
1421 	 */
1422 	for (level = TRIPLE; level >= SINGLE; level--) {
1423 		bn = ip->i_ib[level];
1424 		if (bn != 0) {
1425 			blocksreleased +=
1426 			    indirtrunc(ip, bn, lastiblock[level], level, flags);
1427 			if (lastiblock[level] < 0) {
1428 				ip->i_ib[level] = 0;
1429 				free(ip, bn, (off_t)fs->fs_bsize,
1430 				    flags | I_IBLK);
1431 				blocksreleased += nblocks;
1432 			}
1433 		}
1434 		if (lastiblock[level] >= 0)
1435 			goto done;
1436 	}
1437 
1438 	/*
1439 	 * All whole direct blocks or frags.
1440 	 */
1441 	for (i = NDADDR - 1; i > lastblock; i--) {
1442 		bn = ip->i_db[i];
1443 		if (bn == 0)
1444 			continue;
1445 		ip->i_db[i] = 0;
1446 		bsize = (off_t)blksize(fs, ip, i);
1447 		free(ip, bn, bsize, flags);
1448 		blocksreleased += btodb(bsize);
1449 	}
1450 	if (lastblock < 0)
1451 		goto done;
1452 
1453 	/*
1454 	 * Finally, look for a change in size of the
1455 	 * last direct block; release any frags.
1456 	 */
1457 	bn = ip->i_db[lastblock];
1458 	if (bn != 0) {
1459 		off_t oldspace, newspace;
1460 
1461 		/*
1462 		 * Calculate amount of space we're giving
1463 		 * back as old block size minus new block size.
1464 		 */
1465 		oldspace = blksize(fs, ip, lastblock);
1466 		UFS_SET_ISIZE(length, ip);
1467 		newspace = blksize(fs, ip, lastblock);
1468 		if (newspace == 0) {
1469 			err = ufs_fault(ITOV(ip), "ufs_itrunc: newspace == 0");
1470 			return (err);
1471 		}
1472 		if (oldspace - newspace > 0) {
1473 			/*
1474 			 * Block number of space to be free'd is
1475 			 * the old block # plus the number of frags
1476 			 * required for the storage we're keeping.
1477 			 */
1478 			bn += numfrags(fs, newspace);
1479 			free(ip, bn, oldspace - newspace, flags);
1480 			blocksreleased += btodb(oldspace - newspace);
1481 		}
1482 	}
1483 done:
1484 /* BEGIN PARANOIA */
1485 	for (level = SINGLE; level <= TRIPLE; level++)
1486 		if (ip->i_ib[level] != oip->i_ib[level]) {
1487 			err = ufs_fault(ITOV(ip), "ufs_itrunc: indirect block");
1488 			return (err);
1489 		}
1490 
1491 	for (i = 0; i < NDADDR; i++)
1492 		if (ip->i_db[i] != oip->i_db[i]) {
1493 			err = ufs_fault(ITOV(ip), "ufs_itrunc: direct block");
1494 			return (err);
1495 		}
1496 /* END PARANOIA */
1497 	oip->i_blocks -= blocksreleased;
1498 
1499 	if (oip->i_blocks < 0) {		/* sanity */
1500 		cmn_err(CE_NOTE,
1501 		    "ufs_itrunc: %s/%d new size = %lld, blocks = %d\n",
1502 		    fs->fs_fsmnt, (int)oip->i_number, oip->i_size,
1503 		    (int)oip->i_blocks);
1504 		oip->i_blocks = 0;
1505 	}
1506 	oip->i_flag |= ICHG|IATTCHG;
1507 	oip->i_seq++;
1508 	/* blocksreleased is >= zero, so this can not fail */
1509 	(void) chkdq(oip, -blocksreleased, 0, cr, (char **)NULL,
1510 	    (size_t *)NULL);
1511 	return (0);
1512 }
1513 
1514 /*
1515  * Check mode permission on inode.  Mode is READ, WRITE or EXEC.
1516  * In the case of WRITE, the read-only status of the file system
1517  * is checked.  Depending on the calling user, the appropriate
1518  * mode bits are selected; privileges to override missing permission
1519  * bits are checked through secpolicy_vnode_access().
1520  * The i_contens lock must be held as reader here to prevent racing with
1521  * the acl subsystem removing/setting/changing acls on this inode.
1522  * The caller is responsible for indicating whether or not the i_contents
1523  * lock needs to be acquired here or if already held.
1524  */
1525 int
1526 ufs_iaccess(struct inode  *ip, int mode, struct cred *cr, int dolock)
1527 {
1528 	int shift = 0;
1529 	int ret = 0;
1530 
1531 	if (dolock)
1532 		rw_enter(&ip->i_contents, RW_READER);
1533 	ASSERT(RW_LOCK_HELD(&ip->i_contents));
1534 
1535 	if (mode & IWRITE) {
1536 		/*
1537 		 * Disallow write attempts on read-only
1538 		 * file systems, unless the file is a block
1539 		 * or character device or a FIFO.
1540 		 */
1541 		if (ip->i_fs->fs_ronly != 0) {
1542 			if ((ip->i_mode & IFMT) != IFCHR &&
1543 			    (ip->i_mode & IFMT) != IFBLK &&
1544 			    (ip->i_mode & IFMT) != IFIFO) {
1545 				ret = EROFS;
1546 				goto out;
1547 			}
1548 		}
1549 	}
1550 	/*
1551 	 * If there is an acl, check the acl and return.
1552 	 */
1553 	if (ip->i_ufs_acl && ip->i_ufs_acl->aowner) {
1554 		ret = ufs_acl_access(ip, mode, cr);
1555 		goto out;
1556 	}
1557 
1558 	/*
1559 	 * Access check is based on only one of owner, group, public.
1560 	 * If not owner, then check group.
1561 	 * If not a member of the group, then check public access.
1562 	 */
1563 	if (crgetuid(cr) != ip->i_uid) {
1564 		shift += 3;
1565 		if (!groupmember((uid_t)ip->i_gid, cr))
1566 			shift += 3;
1567 	}
1568 
1569 	mode &= ~(ip->i_mode << shift);
1570 
1571 	if (mode == 0)
1572 		goto out;
1573 
1574 	/* test missing privilege bits */
1575 	ret = secpolicy_vnode_access(cr, ITOV(ip), ip->i_uid, mode);
1576 out:
1577 	if (dolock)
1578 		rw_exit(&ip->i_contents);
1579 	return (ret);
1580 }
1581 
1582 /*
1583  * if necessary, remove an inode from the free list
1584  *	i_contents is held except at unmount
1585  *
1586  * Return 1 if the inode is taken off of the ufs_idle_q,
1587  * and the caller is expected to call VN_RELE.
1588  *
1589  * Return 0 otherwise.
1590  */
1591 int
1592 ufs_rmidle(struct inode *ip)
1593 {
1594 	int rval = 0;
1595 
1596 	mutex_enter(&ip->i_tlock);
1597 	if ((ip->i_flag & IREF) == 0) {
1598 		mutex_enter(&ufs_idle_q.uq_mutex);
1599 		ip->i_freef->i_freeb = ip->i_freeb;
1600 		ip->i_freeb->i_freef = ip->i_freef;
1601 		ip->i_freef = ip;
1602 		ip->i_freeb = ip;
1603 		ip->i_flag |= IREF;
1604 		ufs_idle_q.uq_ne--;
1605 		if (ip->i_flag & IJUNKIQ) {
1606 			ufs_njunk_iq--;
1607 			ip->i_flag &= ~IJUNKIQ;
1608 		} else {
1609 			ufs_nuseful_iq--;
1610 		}
1611 		mutex_exit(&ufs_idle_q.uq_mutex);
1612 		rval = 1;
1613 	}
1614 	mutex_exit(&ip->i_tlock);
1615 	return (rval);
1616 }
1617 
1618 /*
1619  * scan the hash of inodes and call func with the inode locked
1620  */
1621 int
1622 ufs_scan_inodes(int rwtry, int (*func)(struct inode *, void *), void *arg,
1623 		struct ufsvfs *ufsvfsp)
1624 {
1625 	struct inode		*ip;		/* current inode */
1626 	struct inode		*lip = NULL;	/* last/previous inode */
1627 	union ihead		*ih;		/* current hash chain */
1628 	int			error, i;
1629 	int			saverror = 0;
1630 	int			lip_held;	/* lip needs a VN_RELE() */
1631 
1632 	/*
1633 	 * If ufsvfsp is NULL, then our caller should be holding
1634 	 * ufs_scan_lock to avoid conflicts between ufs_unmount() and
1635 	 * ufs_update().  Otherwise, to avoid false-positives in
1636 	 * ufs_unmount()'s v_count-based EBUSY check, we only hold
1637 	 * those inodes that are in the file system our caller cares
1638 	 * about.
1639 	 *
1640 	 * We know that ip is a valid inode in the hash chain (and thus
1641 	 * we can trust i_ufsvfs) because the inode we chained from
1642 	 * (lip) is still in the hash chain.  This is true because either:
1643 	 *
1644 	 * 1. We did not drop the hash chain lock since the last
1645 	 *    iteration (because we were not interested in the last inode),
1646 	 * or
1647 	 * 2. We maintained a hold on the last inode while we
1648 	 *    we were processing it, so it could not be removed
1649 	 *    from the hash chain.
1650 	 *
1651 	 * The whole reason we're dropping and re-grabbing the chain
1652 	 * lock on every inode is so that we don't present a major
1653 	 * choke point on throughput, particularly when we've been
1654 	 * called on behalf of fsflush.
1655 	 */
1656 
1657 	for (i = 0, ih = ihead; i < inohsz; i++, ih++) {
1658 		mutex_enter(&ih_lock[i]);
1659 		for (ip = ih->ih_chain[0], lip_held = 0;
1660 		    ip != (struct inode *)ih;
1661 		    ip = lip->i_forw) {
1662 
1663 			ins.in_scan.value.ul++;
1664 
1665 			/*
1666 			 * Undo the previous iteration's VN_HOLD(), but
1667 			 * only if one was done.
1668 			 */
1669 			if (lip_held)
1670 				VN_RELE(ITOV(lip));
1671 
1672 			lip = ip;
1673 			if (ufsvfsp != NULL && ip->i_ufsvfs != ufsvfsp) {
1674 				/*
1675 				 * We're not processing all inodes, and
1676 				 * this inode is not in the filesystem of
1677 				 * interest, so skip it.  No need to do a
1678 				 * VN_HOLD() since we're not dropping the
1679 				 * hash chain lock until after we've
1680 				 * done the i_forw traversal above.
1681 				 */
1682 				lip_held = 0;
1683 				continue;
1684 			}
1685 			VN_HOLD(ITOV(ip));
1686 			lip_held = 1;
1687 			mutex_exit(&ih_lock[i]);
1688 
1689 			/*
1690 			 * Acquire the contents lock as writer to make
1691 			 * sure that the inode has been initialized in
1692 			 * the cache or removed from the idle list by
1693 			 * ufs_iget().  This works because ufs_iget()
1694 			 * acquires the contents lock before putting
1695 			 * the inode into the cache.  If we can lock
1696 			 * it, then he's done with it.
1697 			 */
1698 
1699 			if (rwtry) {
1700 				if (!rw_tryenter(&ip->i_contents, RW_WRITER)) {
1701 					mutex_enter(&ih_lock[i]);
1702 					continue;
1703 				}
1704 			} else {
1705 				rw_enter(&ip->i_contents, RW_WRITER);
1706 			}
1707 
1708 			rw_exit(&ip->i_contents);
1709 
1710 			/*
1711 			 * ISTALE means the inode couldn't be read
1712 			 *
1713 			 * We don't have to hold the i_contents lock
1714 			 * for this check for a couple of
1715 			 * reasons. First, if ISTALE is set then the
1716 			 * flag cannot be cleared until the inode is
1717 			 * removed from the cache and that cannot
1718 			 * happen until after we VN_RELE() it.
1719 			 * Second, if ISTALE is not set, then the
1720 			 * inode is in the cache and does not need to
1721 			 * be read from disk so ISTALE cannot be set
1722 			 * while we are not looking.
1723 			 */
1724 			if ((ip->i_flag & ISTALE) == 0) {
1725 				if ((error = (*func)(ip, arg)) != 0)
1726 					saverror = error;
1727 			}
1728 
1729 			mutex_enter(&ih_lock[i]);
1730 		}
1731 		if (lip_held)
1732 			VN_RELE(ITOV(lip));
1733 		mutex_exit(&ih_lock[i]);
1734 	}
1735 	return (saverror);
1736 }
1737 
1738 /*
1739  * Mark inode with the current time, plus a unique increment.
1740  *
1741  * Since we only keep 32-bit time on disk, if UFS is still alive
1742  * beyond 2038, filesystem times will simply stick at the last
1743  * possible second of 32-bit time. Not ideal, but probably better
1744  * than going into the remote past, or confusing applications with
1745  * negative time.
1746  */
1747 void
1748 ufs_imark(struct inode *ip)
1749 {
1750 	timestruc_t now;
1751 	int32_t usec, nsec;
1752 
1753 	/*
1754 	 * The update of i_seq may have been deferred, increase i_seq here
1755 	 * to make sure it is in sync with the timestamps.
1756 	 */
1757 	if (ip->i_flag & ISEQ) {
1758 		ASSERT(ip->i_flag & (IUPD|ICHG));
1759 		ip->i_seq++;
1760 		ip->i_flag &= ~ISEQ;
1761 	}
1762 
1763 	gethrestime(&now);
1764 
1765 	/*
1766 	 * Fast algorithm to convert nsec to usec -- see hrt2ts()
1767 	 * in common/os/timers.c for a full description.
1768 	 */
1769 	nsec = now.tv_nsec;
1770 	usec = nsec + (nsec >> 2);
1771 	usec = nsec + (usec >> 1);
1772 	usec = nsec + (usec >> 2);
1773 	usec = nsec + (usec >> 4);
1774 	usec = nsec - (usec >> 3);
1775 	usec = nsec + (usec >> 2);
1776 	usec = nsec + (usec >> 3);
1777 	usec = nsec + (usec >> 4);
1778 	usec = nsec + (usec >> 1);
1779 	usec = nsec + (usec >> 6);
1780 	usec = usec >> 10;
1781 
1782 	mutex_enter(&ufs_iuniqtime_lock);
1783 	if (now.tv_sec > (time_t)iuniqtime.tv_sec ||
1784 	    usec > iuniqtime.tv_usec) {
1785 		if (now.tv_sec < TIME32_MAX) {
1786 			iuniqtime.tv_sec = (time32_t)now.tv_sec;
1787 			iuniqtime.tv_usec = usec;
1788 		}
1789 	} else {
1790 		if (iuniqtime.tv_sec < TIME32_MAX) {
1791 			iuniqtime.tv_usec++;
1792 			/* Check for usec overflow */
1793 			if (iuniqtime.tv_usec >= MICROSEC) {
1794 				iuniqtime.tv_sec++;
1795 				iuniqtime.tv_usec = 0;
1796 			}
1797 		}
1798 	}
1799 
1800 	if ((ip->i_flag & IACC) && !(ip->i_ufsvfs->vfs_noatime)) {
1801 		ip->i_atime = iuniqtime;
1802 	}
1803 	if (ip->i_flag & IUPD) {
1804 		ip->i_mtime = iuniqtime;
1805 		ip->i_flag |= IMODTIME;
1806 	}
1807 	if (ip->i_flag & ICHG) {
1808 		ip->i_diroff = 0;
1809 		ip->i_ctime = iuniqtime;
1810 	}
1811 	mutex_exit(&ufs_iuniqtime_lock);
1812 }
1813 
1814 /*
1815  * Update timestamps in inode.
1816  */
1817 void
1818 ufs_itimes_nolock(struct inode *ip)
1819 {
1820 
1821 	/*
1822 	 * if noatime is set and the inode access time is the only field that
1823 	 * must be changed, exit immediately.
1824 	 */
1825 	if (((ip->i_flag & (IUPD|IACC|ICHG)) == IACC) &&
1826 	    (ip->i_ufsvfs->vfs_noatime)) {
1827 		return;
1828 	}
1829 
1830 	if (ip->i_flag & (IUPD|IACC|ICHG)) {
1831 		if (ip->i_flag & ICHG)
1832 			ip->i_flag |= IMOD;
1833 		else
1834 			ip->i_flag |= IMODACC;
1835 		ufs_imark(ip);
1836 		ip->i_flag &= ~(IACC|IUPD|ICHG);
1837 	}
1838 }
1839