1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 1983, 2010, Oracle and/or its affiliates. All rights reserved.
23 */
24
25 /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
26 /* All Rights Reserved */
27
28 /*
29 * University Copyright- Copyright (c) 1982, 1986, 1988
30 * The Regents of the University of California
31 * All Rights Reserved
32 *
33 * University Acknowledgment- Portions of this document are derived from
34 * software developed by the University of California, Berkeley, and its
35 * contributors.
36 */
37
38 #include <sys/types.h>
39 #include <sys/t_lock.h>
40 #include <sys/param.h>
41 #include <sys/systm.h>
42 #include <sys/uio.h>
43 #include <sys/bitmap.h>
44 #include <sys/signal.h>
45 #include <sys/cred.h>
46 #include <sys/user.h>
47 #include <sys/vfs.h>
48 #include <sys/stat.h>
49 #include <sys/vnode.h>
50 #include <sys/buf.h>
51 #include <sys/proc.h>
52 #include <sys/disp.h>
53 #include <sys/dnlc.h>
54 #include <sys/mode.h>
55 #include <sys/cmn_err.h>
56 #include <sys/kstat.h>
57 #include <sys/acl.h>
58 #include <sys/var.h>
59 #include <sys/fs/ufs_inode.h>
60 #include <sys/fs/ufs_fs.h>
61 #include <sys/fs/ufs_trans.h>
62 #include <sys/fs/ufs_acl.h>
63 #include <sys/fs/ufs_bio.h>
64 #include <sys/fs/ufs_quota.h>
65 #include <sys/fs/ufs_log.h>
66 #include <vm/hat.h>
67 #include <vm/as.h>
68 #include <vm/pvn.h>
69 #include <vm/seg.h>
70 #include <sys/swap.h>
71 #include <sys/cpuvar.h>
72 #include <sys/sysmacros.h>
73 #include <sys/errno.h>
74 #include <sys/kmem.h>
75 #include <sys/debug.h>
76 #include <fs/fs_subr.h>
77 #include <sys/policy.h>
78
79 struct kmem_cache *inode_cache; /* cache of free inodes */
80
81 /* UFS Inode Cache Stats -- Not protected */
82 struct instats ins = {
83 { "size", KSTAT_DATA_ULONG },
84 { "maxsize", KSTAT_DATA_ULONG },
85 { "hits", KSTAT_DATA_ULONG },
86 { "misses", KSTAT_DATA_ULONG },
87 { "kmem allocs", KSTAT_DATA_ULONG },
88 { "kmem frees", KSTAT_DATA_ULONG },
89 { "maxsize reached", KSTAT_DATA_ULONG },
90 { "puts at frontlist", KSTAT_DATA_ULONG },
91 { "puts at backlist", KSTAT_DATA_ULONG },
92 { "queues to free", KSTAT_DATA_ULONG },
93 { "scans", KSTAT_DATA_ULONG },
94 { "thread idles", KSTAT_DATA_ULONG },
95 { "lookup idles", KSTAT_DATA_ULONG },
96 { "vget idles", KSTAT_DATA_ULONG },
97 { "cache allocs", KSTAT_DATA_ULONG },
98 { "cache frees", KSTAT_DATA_ULONG },
99 { "pushes at close", KSTAT_DATA_ULONG }
100 };
101
102 /* kstat data */
103 static kstat_t *ufs_inode_kstat = NULL;
104
105 union ihead *ihead; /* inode LRU cache, Chris Maltby */
106 kmutex_t *ih_lock; /* protect inode cache hash table */
107 static int ino_hashlen = 4; /* desired average hash chain length */
108 int inohsz; /* number of buckets in the hash table */
109
110 kmutex_t ufs_scan_lock; /* stop racing multiple ufs_scan_inodes() */
111 kmutex_t ufs_iuniqtime_lock; /* protect iuniqtime */
112 kmutex_t ufsvfs_mutex;
113 struct ufsvfs *oldufsvfslist, *ufsvfslist;
114
115 /*
116 * time to wait after ufsvfsp->vfs_iotstamp before declaring that no
117 * I/Os are going on.
118 */
119 clock_t ufs_iowait;
120
121 /*
122 * the threads that process idle inodes and free (deleted) inodes
123 * have high water marks that are set in ufsinit().
124  * These values can be no less than the minimums shown below.
125 */
126 int ufs_idle_max; /* # of allowable idle inodes */
127 ulong_t ufs_inode_max; /* hard limit of allowable idle inodes */
128 #define UFS_IDLE_MAX (16) /* min # of allowable idle inodes */
129
130 /*
131 * Tunables for ufs write throttling.
132 * These are validated in ufs_iinit() since improper settings
133 * can lead to filesystem hangs.
134 */
135 #define UFS_HW_DEFAULT (16 * 1024 * 1024)
136 #define UFS_LW_DEFAULT (8 * 1024 * 1024)
137 int ufs_HW = UFS_HW_DEFAULT;
138 int ufs_LW = UFS_LW_DEFAULT;
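/*
 * Illustrative sketch, not part of this file: the writers that consume
 * these tunables follow roughly the shape below, blocking once an
 * inode's outstanding write bytes exceed ufs_HW and waking once they
 * drain to ufs_LW. i_writes and i_wrcv are the per-inode fields this
 * file initializes for that bookkeeping; the real logic lives with the
 * write paths, so treat this as a hedged sketch only.
 *
 *	mutex_enter(&ip->i_tlock);
 *	while (ip->i_writes > ufs_HW)		... over high water: wait ...
 *		cv_wait(&ip->i_wrcv, &ip->i_tlock);
 *	mutex_exit(&ip->i_tlock);
 *	... later, as writes complete ...
 *	if (ip->i_writes <= ufs_LW)		... drained: wake writers ...
 *		cv_broadcast(&ip->i_wrcv);
 */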
139
140 static void ihinit(void);
141 extern int hash2ints(int, int);
142
143 static int ufs_iget_internal(struct vfs *, ino_t, struct inode **,
144 struct cred *, int);
145
146 /* ARGSUSED */
147 static int
148 ufs_inode_kstat_update(kstat_t *ksp, int rw)
149 {
150 if (rw == KSTAT_WRITE)
151 return (EACCES);
152
153 ins.in_malloc.value.ul = (ulong_t)kmem_cache_stat(inode_cache,
154 "slab_alloc");
155 ins.in_mfree.value.ul = (ulong_t)kmem_cache_stat(inode_cache,
156 "slab_free");
157 ins.in_kcalloc.value.ul = (ulong_t)kmem_cache_stat(inode_cache,
158 "alloc");
159 ins.in_kcfree.value.ul = (ulong_t)kmem_cache_stat(inode_cache,
160 "free");
161 ins.in_size.value.ul = (ulong_t)kmem_cache_stat(inode_cache,
162 "buf_inuse");
163 ins.in_maxreached.value.ul = (ulong_t)kmem_cache_stat(inode_cache,
164 "buf_max");
165 ins.in_misses.value.ul = ins.in_kcalloc.value.ul;
166
167 return (0);
168 }
169
170 void
171 ufs_iinit(void)
172 {
173 /*
174 * Validate that ufs_HW > ufs_LW.
175 * The default values for these two tunables have been increased.
176 * There is now a range of values for ufs_HW that used to be
177  * legal on previous Solaris versions but no longer is.
178 * Upgrading a machine which has an /etc/system setting for ufs_HW
179 * from that range can lead to filesystem hangs unless the values
180 * are checked here.
181 */
182 if (ufs_HW <= ufs_LW) {
183 cmn_err(CE_WARN,
184 "ufs_HW (%d) <= ufs_LW (%d). Check /etc/system.",
185 ufs_HW, ufs_LW);
186 ufs_LW = UFS_LW_DEFAULT;
187 ufs_HW = UFS_HW_DEFAULT;
188 cmn_err(CE_CONT, "using defaults, ufs_HW = %d, ufs_LW = %d\n",
189 ufs_HW, ufs_LW);
190 }
191
192 /*
193 * Adjust the tunable `ufs_ninode' to a reasonable value
194 */
195 if (ufs_ninode <= 0)
196 ufs_ninode = ncsize;
197 if (ufs_inode_max == 0)
198 ufs_inode_max =
199 (ulong_t)((kmem_maxavail() >> 2) / sizeof (struct inode));
200 if (ufs_ninode > ufs_inode_max || (ufs_ninode == 0 && ncsize == 0)) {
201 cmn_err(CE_NOTE, "setting ufs_ninode to max value of %ld",
202 ufs_inode_max);
203 ufs_ninode = ufs_inode_max;
204 }
205 /*
206  * Wait until the third call of ufs_update to declare that no I/Os are
207 * going on. This allows deferred access times to be flushed to disk.
208 */
209 ufs_iowait = v.v_autoup * hz * 2;
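	/*
	 * Worked example with hypothetical settings: v_autoup = 30
	 * seconds and hz = 100 give ufs_iowait = 30 * 100 * 2 = 6000
	 * ticks, i.e. two full autoup cycles (60 seconds).
	 */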
210
211 /*
212 * idle thread runs when 25% of ufs_ninode entries are on the queue
213 */
214 if (ufs_idle_max == 0)
215 ufs_idle_max = ufs_ninode >> 2;
216 if (ufs_idle_max < UFS_IDLE_MAX)
217 ufs_idle_max = UFS_IDLE_MAX;
218 if (ufs_idle_max > ufs_ninode)
219 ufs_idle_max = ufs_ninode;
220 /*
221  * This is really a misnomer; it is ufs_queue_init
222 */
223 ufs_thread_init(&ufs_idle_q, ufs_idle_max);
224 ufs_thread_start(&ufs_idle_q, ufs_thread_idle, NULL);
225
226 /*
227 * global hlock thread
228 */
229 ufs_thread_init(&ufs_hlock, 1);
230 ufs_thread_start(&ufs_hlock, ufs_thread_hlock, NULL);
231
232 ihinit();
233 qtinit();
234 ins.in_maxsize.value.ul = ufs_ninode;
235 if ((ufs_inode_kstat = kstat_create("ufs", 0, "inode_cache", "ufs",
236 KSTAT_TYPE_NAMED, sizeof (ins) / sizeof (kstat_named_t),
237 KSTAT_FLAG_VIRTUAL)) != NULL) {
238 ufs_inode_kstat->ks_data = (void *)&ins;
239 ufs_inode_kstat->ks_update = ufs_inode_kstat_update;
240 kstat_install(ufs_inode_kstat);
241 }
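	/*
	 * These counters can then be read from userland, e.g. with
	 * "kstat -m ufs -n inode_cache" (matching the module, instance
	 * and name passed to kstat_create() above).
	 */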
242 ufsfx_init(); /* fix-on-panic initialization */
243 si_cache_init();
244 ufs_directio_init();
245 lufs_init();
246 mutex_init(&ufs_iuniqtime_lock, NULL, MUTEX_DEFAULT, NULL);
247 }
248
249 /* ARGSUSED */
250 static int
251 ufs_inode_cache_constructor(void *buf, void *cdrarg, int kmflags)
252 {
253 struct inode *ip = buf;
254 struct vnode *vp;
255
256 vp = ip->i_vnode = vn_alloc(kmflags);
257 if (vp == NULL) {
258 return (-1);
259 }
260 vn_setops(vp, ufs_vnodeops);
261 vp->v_data = ip;
262
263 rw_init(&ip->i_rwlock, NULL, RW_DEFAULT, NULL);
264 rw_init(&ip->i_contents, NULL, RW_DEFAULT, NULL);
265 mutex_init(&ip->i_tlock, NULL, MUTEX_DEFAULT, NULL);
266 dnlc_dir_init(&ip->i_danchor);
267
268 cv_init(&ip->i_wrcv, NULL, CV_DRIVER, NULL);
269
270 return (0);
271 }
272
273 /* ARGSUSED */
274 static void
275 ufs_inode_cache_destructor(void *buf, void *cdrarg)
276 {
277 struct inode *ip = buf;
278 struct vnode *vp;
279
280 vp = ITOV(ip);
281
282 rw_destroy(&ip->i_rwlock);
283 rw_destroy(&ip->i_contents);
284 mutex_destroy(&ip->i_tlock);
285 if (vp->v_type == VDIR) {
286 dnlc_dir_fini(&ip->i_danchor);
287 }
288
289 cv_destroy(&ip->i_wrcv);
290
291 vn_free(vp);
292 }
293
294 /*
295 * Initialize hash links for inodes
296 * and build inode free list.
297 */
298 void
299 ihinit(void)
300 {
301 int i;
302 union ihead *ih = ihead;
303
304 mutex_init(&ufs_scan_lock, NULL, MUTEX_DEFAULT, NULL);
305
306 inohsz = 1 << highbit(ufs_ninode / ino_hashlen);
307 ihead = kmem_zalloc(inohsz * sizeof (union ihead), KM_SLEEP);
308 ih_lock = kmem_zalloc(inohsz * sizeof (kmutex_t), KM_SLEEP);
309
310 for (i = 0, ih = ihead; i < inohsz; i++, ih++) {
311 ih->ih_head[0] = ih;
312 ih->ih_head[1] = ih;
313 mutex_init(&ih_lock[i], NULL, MUTEX_DEFAULT, NULL);
314 }
315 inode_cache = kmem_cache_create("ufs_inode_cache",
316 sizeof (struct inode), 0, ufs_inode_cache_constructor,
317 ufs_inode_cache_destructor, ufs_inode_cache_reclaim,
318 NULL, NULL, 0);
319 }
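/*
 * Sizing example with illustrative numbers: for ufs_ninode = 131072 and
 * ino_hashlen = 4, the quotient is 32768 and highbit(32768) = 16 (bit
 * positions count from 1), so inohsz = 1 << 16 = 65536 buckets -- the
 * power of two just above the quotient, keeping the expected hash chain
 * length at or below ino_hashlen.
 */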
320
321 /*
322 * Free an inode structure
323 */
324 void
325 ufs_free_inode(struct inode *ip)
326 {
327 vn_invalid(ITOV(ip));
328 kmem_cache_free(inode_cache, ip);
329 }
330
331 /*
332 * Allocate an inode structure
333 */
334 struct inode *
335 ufs_alloc_inode(ufsvfs_t *ufsvfsp, ino_t ino)
336 {
337 struct inode *ip;
338 vnode_t *vp;
339
340 ip = kmem_cache_alloc(inode_cache, KM_SLEEP);
341 /*
342 * at this point we have a newly allocated inode
343 */
344 ip->i_freef = ip;
345 ip->i_freeb = ip;
346 ip->i_flag = IREF;
347 ip->i_seq = 0xFF; /* Unique initial value */
348 ip->i_dev = ufsvfsp->vfs_dev;
349 ip->i_ufsvfs = ufsvfsp;
350 ip->i_devvp = ufsvfsp->vfs_devvp;
351 ip->i_number = ino;
352 ip->i_diroff = 0;
353 ip->i_nextr = 0;
354 ip->i_map = NULL;
355 ip->i_rdev = 0;
356 ip->i_writes = 0;
357 ip->i_mode = 0;
358 ip->i_delaylen = 0;
359 ip->i_delayoff = 0;
360 ip->i_nextrio = 0;
361 ip->i_ufs_acl = NULL;
362 ip->i_cflags = 0;
363 ip->i_mapcnt = 0;
364 ip->i_dquot = NULL;
365 ip->i_cachedir = CD_ENABLED;
366 ip->i_writer = NULL;
367
368 /*
369 * the vnode for this inode was allocated by the constructor
370 */
371 vp = ITOV(ip);
372 vn_reinit(vp);
373 if (ino == (ino_t)UFSROOTINO)
374 vp->v_flag = VROOT;
375 vp->v_vfsp = ufsvfsp->vfs_vfs;
376 vn_exists(vp);
377 return (ip);
378 }
379
380 /*
381 * Look up an inode by device, inumber. If it is in core (in the
382 * inode structure), honor the locking protocol. If it is not in
383 * core, read it in from the specified device after freeing any pages.
384 * In all cases, a pointer to a VN_HELD inode structure is returned.
385 */
386 int
387 ufs_iget(struct vfs *vfsp, ino_t ino, struct inode **ipp, struct cred *cr)
388 {
389 return (ufs_iget_internal(vfsp, ino, ipp, cr, 0));
390 }
391
392 /*
393 * A version of ufs_iget which returns only allocated, linked inodes.
394 * This is appropriate for any callers who do not expect a free inode.
395 */
396 int
397 ufs_iget_alloced(struct vfs *vfsp, ino_t ino, struct inode **ipp,
398 struct cred *cr)
399 {
400 return (ufs_iget_internal(vfsp, ino, ipp, cr, 1));
401 }
402
403 /*
404  * Set vnode attributes based on v_type; this should be called whenever
405 * an inode's i_mode is changed.
406 */
407 void
408 ufs_reset_vnode(vnode_t *vp)
409 {
410 /*
411 * an old DBE hack
412 */
413 if ((VTOI(vp)->i_mode & (ISVTX | IEXEC | IFDIR)) == ISVTX)
414 vp->v_flag |= VSWAPLIKE;
415 else
416 vp->v_flag &= ~VSWAPLIKE;
417
418 /*
419 * if not swap like and it's just a regular file, we want
420 * to maintain the vnode's pages sorted by clean/modified
421 * for faster sync'ing to disk
422 */
423 if (vp->v_type == VREG)
424 vp->v_flag |= VMODSORT;
425 else
426 vp->v_flag &= ~VMODSORT;
427
428 /*
429  * Is this a hidden attribute directory?
430 */
431 if ((VTOI(vp)->i_mode & IFMT) == IFATTRDIR)
432 vp->v_flag |= V_XATTRDIR;
433 else
434 vp->v_flag &= ~V_XATTRDIR;
435 }
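/*
 * Example of the VSWAPLIKE test above: a non-directory with the sticky
 * bit set and the owner-execute bit clear (e.g. after "chmod 1000 f")
 * gets VSWAPLIKE paging; setting owner-execute, or the node being a
 * directory, clears it again on the next call here.
 */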
436
437 /*
438 * Shared implementation of ufs_iget and ufs_iget_alloced. The 'validate'
439 * flag is used to distinguish the two; when true, we validate that the inode
440 * being retrieved looks like a linked and allocated inode.
441 */
442 /* ARGSUSED */
443 static int
444 ufs_iget_internal(struct vfs *vfsp, ino_t ino, struct inode **ipp,
445 struct cred *cr, int validate)
446 {
447 struct inode *ip, *sp;
448 union ihead *ih;
449 kmutex_t *ihm;
450 struct buf *bp;
451 struct dinode *dp;
452 struct vnode *vp;
453 extern vfs_t EIO_vfs;
454 int error;
455 int ftype; /* XXX - Remove later on */
456 dev_t vfs_dev;
457 struct ufsvfs *ufsvfsp;
458 struct fs *fs;
459 int hno;
460 daddr_t bno;
461 ulong_t ioff;
462
463 CPU_STATS_ADD_K(sys, ufsiget, 1);
464
465 /*
466 * Lookup inode in cache.
467 */
468 vfs_dev = vfsp->vfs_dev;
469 hno = INOHASH(ino);
470 ih = &ihead[hno];
471 ihm = &ih_lock[hno];
472
473 again:
474 mutex_enter(ihm);
475 for (ip = ih->ih_chain[0]; ip != (struct inode *)ih; ip = ip->i_forw) {
476 if (ino != ip->i_number || vfs_dev != ip->i_dev ||
477 (ip->i_flag & ISTALE))
478 continue;
479
480 /*
481 * Found the interesting inode; hold it and drop the cache lock
482 */
483 vp = ITOV(ip); /* for locknest */
484 VN_HOLD(vp);
485 mutex_exit(ihm);
486 rw_enter(&ip->i_contents, RW_READER);
487
488 /*
489 * if necessary, remove from idle list
490 */
491 if ((ip->i_flag & IREF) == 0) {
492 if (ufs_rmidle(ip))
493 VN_RELE(vp);
494 }
495
496 /*
497 * Could the inode be read from disk?
498 */
499 if (ip->i_flag & ISTALE) {
500 rw_exit(&ip->i_contents);
501 VN_RELE(vp);
502 goto again;
503 }
504
505 ins.in_hits.value.ul++;
506 *ipp = ip;
507
508 /*
509 * Reset the vnode's attribute flags
510 */
511 mutex_enter(&vp->v_lock);
512 ufs_reset_vnode(vp);
513 mutex_exit(&vp->v_lock);
514
515 rw_exit(&ip->i_contents);
516
517 return (0);
518 }
519 mutex_exit(ihm);
520
521 /*
522 * Inode was not in cache.
523 *
524 * Allocate a new entry
525 */
526 ufsvfsp = (struct ufsvfs *)vfsp->vfs_data;
527 fs = ufsvfsp->vfs_fs;
528
529 ip = ufs_alloc_inode(ufsvfsp, ino);
530 vp = ITOV(ip);
531
532 bno = fsbtodb(fs, itod(fs, ino));
533 ioff = (sizeof (struct dinode)) * (itoo(fs, ino));
534 ip->i_doff = (offset_t)ioff + ldbtob(bno);
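	/*
	 * Worked example with illustrative geometry: with fs_bsize =
	 * 8192 and the classic 128-byte struct dinode, one block holds
	 * 64 dinodes. itod() yields the filesystem block containing
	 * ino and itoo() its index within that block, so i_doff is the
	 * dinode's byte address on disk: ldbtob() of the block's disk
	 * address plus 128 * itoo().
	 */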
535
536 /*
537 * put a place holder in the cache (if not already there)
538 */
539 mutex_enter(ihm);
540 for (sp = ih->ih_chain[0]; sp != (struct inode *)ih; sp = sp->i_forw)
541 if (ino == sp->i_number && vfs_dev == sp->i_dev &&
542 ((sp->i_flag & ISTALE) == 0)) {
543 mutex_exit(ihm);
544 ufs_free_inode(ip);
545 goto again;
546 }
547 /*
548 * It would be nice to ASSERT(RW_READ_HELD(&ufsvfsp->vfs_dqrwlock))
549 * here, but if we do, then shadow inode allocations panic the
550 * system. We don't have to hold vfs_dqrwlock for shadow inodes
551 * and the ufs_iget() parameters don't tell us what we are getting
552 * so we have no way of knowing this is a ufs_iget() call from
553 * a ufs_ialloc() call for a shadow inode.
554 */
555 rw_enter(&ip->i_contents, RW_WRITER);
556 insque(ip, ih);
557 mutex_exit(ihm);
558 /*
559 * read the dinode
560 */
561 bp = UFS_BREAD(ufsvfsp, ip->i_dev, bno, (int)fs->fs_bsize);
562
563 /*
564 * Check I/O errors
565 */
566 error = ((bp->b_flags & B_ERROR) ? geterror(bp) : 0);
567 if (error) {
568 brelse(bp);
569 ip->i_flag |= ISTALE; /* in case someone is looking it up */
570 rw_exit(&ip->i_contents);
571 vp->v_vfsp = &EIO_vfs;
572 VN_RELE(vp);
573 return (error);
574 }
575 /*
576 * initialize the inode's dinode
577 */
578 dp = (struct dinode *)(ioff + bp->b_un.b_addr);
579 ip->i_ic = dp->di_ic; /* structure assignment */
580 brelse(bp);
581
582 /*
583 * Maintain compatibility with Solaris 1.x UFS
584 */
585 if (ip->i_suid != UID_LONG)
586 ip->i_uid = ip->i_suid;
587 if (ip->i_sgid != GID_LONG)
588 ip->i_gid = ip->i_sgid;
589
590 ftype = ip->i_mode & IFMT;
591 if (ftype == IFBLK || ftype == IFCHR) {
592 dev_t dv;
593 uint_t top16 = ip->i_ordev & 0xffff0000u;
594
595 if (top16 == 0 || top16 == 0xffff0000u)
596 dv = expdev(ip->i_ordev);
597 else
598 dv = expldev(ip->i_ordev);
599 vp->v_rdev = ip->i_rdev = dv;
600 }
601
602 /*
603 * if our caller only expects allocated inodes, verify that
604 * this inode looks good; throw it out if it's bad.
605 */
606 if (validate) {
607 if ((ftype == 0) || (ip->i_nlink <= 0)) {
608 ip->i_flag |= ISTALE;
609 rw_exit(&ip->i_contents);
610 vp->v_vfsp = &EIO_vfs;
611 VN_RELE(vp);
612 cmn_err(CE_NOTE,
613 "%s: unexpected free inode %d, run fsck(1M)%s",
614 fs->fs_fsmnt, (int)ino,
615 (TRANS_ISTRANS(ufsvfsp) ? " -o f" : ""));
616 return (EIO);
617 }
618 }
619
620 /*
621  * Finish initializing the vnode. Shadow inodes need special handling:
622  * IFTOVT() would produce a v_type of VNON, which is not what we want,
623  * so we set v_type to VREG explicitly in that case.
624 */
625 if (ftype == IFSHAD) {
626 vp->v_type = VREG;
627 } else {
628 vp->v_type = IFTOVT((mode_t)ip->i_mode);
629 }
630
631 ufs_reset_vnode(vp);
632
633 /*
634 * read the shadow
635 */
636 if (ftype != 0 && ip->i_shadow != 0) {
637 if ((error = ufs_si_load(ip, cr)) != 0) {
638 ip->i_flag |= ISTALE;
639 ip->i_ufs_acl = NULL;
640 rw_exit(&ip->i_contents);
641 vp->v_vfsp = &EIO_vfs;
642 VN_RELE(vp);
643 return (error);
644 }
645 }
646
647 /*
648 * Only attach quota information if the inode has a type and if
649  * that type is not a shadow or extended attribute directory inode.
650 */
651 if (ip->i_mode && ((ip->i_mode & IFMT) != IFSHAD) &&
652 ((ip->i_mode & IFMT) != IFATTRDIR)) {
653 ip->i_dquot = getinoquota(ip);
654 }
655 TRANS_MATA_IGET(ufsvfsp, ip);
656 *ipp = ip;
657 rw_exit(&ip->i_contents);
658
659 return (0);
660 }
661
662 /*
663 * Vnode is no longer referenced, write the inode out
664 * and if necessary, truncate and deallocate the file.
665 */
666 void
667 ufs_iinactive(struct inode *ip)
668 {
669 int front;
670 struct inode *iq;
671 struct inode *hip;
672 struct ufs_q *uq;
673 struct vnode *vp = ITOV(ip);
674 struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
675 struct ufs_delq_info *delq_info = &ufsvfsp->vfs_delete_info;
676
677 /*
678 * Because the vnode type might have been changed,
679 * the dnlc_dir_purge must be called unconditionally.
680 */
681 dnlc_dir_purge(&ip->i_danchor);
682
683 /*
684 * Get exclusive access to inode data.
685 */
686 rw_enter(&ip->i_contents, RW_WRITER);
687 ASSERT(ip->i_flag & IREF);
688
689 /*
690 * Make sure no one reclaimed the inode before we put it on
691 * the freelist or destroy it. We keep our 'hold' on the vnode
692 * from vn_rele until we are ready to do something with the inode.
693 *
694 * Pageout may put a VN_HOLD/VN_RELE at anytime during this
695 * operation via an async putpage, so we must make sure
696 * we don't free/destroy the inode more than once. ufs_iget
697 * may also put a VN_HOLD on the inode before it grabs
698 * the i_contents lock. This is done so we don't free
699 * an inode that a thread is waiting on.
700 */
701 mutex_enter(&vp->v_lock);
702
703 if (vp->v_count > 1) {
704 vp->v_count--; /* release our hold from vn_rele */
705 mutex_exit(&vp->v_lock);
706 rw_exit(&ip->i_contents);
707 return;
708 }
709 mutex_exit(&vp->v_lock);
710
711 /*
712 * For umount case: if ufsvfs ptr is NULL, the inode is unhashed
713 * and clean. It can be safely destroyed (cyf).
714 */
715 if (ip->i_ufsvfs == NULL) {
716 rw_exit(&ip->i_contents);
717 ufs_si_del(ip);
718 ASSERT((vp->v_type == VCHR) || !vn_has_cached_data(vp));
719 ufs_free_inode(ip);
720 return;
721 }
722
723 /*
724 * queue idle inode to appropriate thread. Will check v_count == 1
725 * prior to putting this on the appropriate queue.
726 * Stale inodes will be unhashed and freed by the ufs idle thread
727 * in ufs_idle_free()
728 */
729 front = 1;
730 if ((ip->i_flag & ISTALE) == 0 && ip->i_fs->fs_ronly == 0 &&
731 ip->i_mode && ip->i_nlink <= 0) {
732 /*
733 * Mark the i_flag to indicate that inode is being deleted.
734 * This flag will be cleared when the deletion is complete.
735 * This prevents nfs from sneaking in via ufs_vget() while
736 * the delete is in progress (bugid 1242481).
737 */
738 ip->i_flag |= IDEL;
739
740 /*
741 * NOIDEL means that deletes are not allowed at this time;
742 * whoever resets NOIDEL will also send this inode back
743 * through ufs_iinactive. IREF remains set.
744 */
745 if (ULOCKFS_IS_NOIDEL(ITOUL(ip))) {
746 mutex_enter(&vp->v_lock);
747 vp->v_count--;
748 mutex_exit(&vp->v_lock);
749 rw_exit(&ip->i_contents);
750 return;
751 }
752 if (!TRANS_ISTRANS(ip->i_ufsvfs)) {
753 rw_exit(&ip->i_contents);
754 ufs_delete(ip->i_ufsvfs, ip, 0);
755 return;
756 }
757
758 /* queue to delete thread; IREF remains set */
759 ins.in_qfree.value.ul++;
760 uq = &ip->i_ufsvfs->vfs_delete;
761
762 mutex_enter(&uq->uq_mutex);
763
764 /* add to q */
765 if ((iq = uq->uq_ihead) != 0) {
766 ip->i_freef = iq;
767 ip->i_freeb = iq->i_freeb;
768 iq->i_freeb->i_freef = ip;
769 iq->i_freeb = ip;
770 if (front)
771 uq->uq_ihead = ip;
772 } else {
773 uq->uq_ihead = ip;
774 ip->i_freef = ip;
775 ip->i_freeb = ip;
776 }
777
778 delq_info->delq_unreclaimed_files += 1;
779 delq_info->delq_unreclaimed_blocks += ip->i_blocks;
780 } else {
781 /*
782  * queue to idle thread.
783  * Check v_count == 1 again; an async pageout may have
784  * put and released a hold in the meantime.
785 */
786 mutex_enter(&vp->v_lock);
787 if (vp->v_count > 1) {
788 vp->v_count--; /* release our hold from vn_rele */
789 mutex_exit(&vp->v_lock);
790 rw_exit(&ip->i_contents);
791 return;
792 }
793 mutex_exit(&vp->v_lock);
794 uq = &ufs_idle_q;
795
796 /*
797 * useful iff it has pages or is a fastsymlink; otherwise junk
798 */
799 mutex_enter(&uq->uq_mutex);
800
801 /* clear IREF means `on idle list' */
802 ip->i_flag &= ~(IREF | IDIRECTIO);
803
804 if (vn_has_cached_data(vp) || ip->i_flag & IFASTSYMLNK) {
805 ins.in_frback.value.ul++;
806 hip = (inode_t *)&ufs_useful_iq[IQHASH(ip)];
807 ufs_nuseful_iq++;
808 } else {
809 ins.in_frfront.value.ul++;
810 hip = (inode_t *)&ufs_junk_iq[IQHASH(ip)];
811 ip->i_flag |= IJUNKIQ;
812 ufs_njunk_iq++;
813 }
814 ip->i_freef = hip;
815 ip->i_freeb = hip->i_freeb;
816 hip->i_freeb->i_freef = ip;
817 hip->i_freeb = ip;
818 }
819
820 /* wakeup thread(s) if q is overfull */
821 if (++uq->uq_ne == uq->uq_lowat)
822 cv_broadcast(&uq->uq_cv);
823
824 /* all done, release the q and inode */
825 mutex_exit(&uq->uq_mutex);
826 rw_exit(&ip->i_contents);
827 }
828
829 /*
830 * Check accessed and update flags on an inode structure.
831 * If any are on, update the inode with the (unique) current time.
832  * If waitfor is given, ensure I/O ordering by waiting for the write to complete.
833 */
834 void
835 ufs_iupdat(struct inode *ip, int waitfor)
836 {
837 struct buf *bp;
838 struct fs *fp;
839 struct dinode *dp;
840 struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
841 int i;
842 int do_trans_times;
843 ushort_t flag;
844 o_uid_t suid;
845 o_gid_t sgid;
846
847 /*
848 * This function is now safe to be called with either the reader
849 * or writer i_contents lock.
850 */
851 ASSERT(RW_LOCK_HELD(&ip->i_contents));
852
853 /*
854 * Return if file system has been forcibly umounted.
855 */
856 if (ufsvfsp == NULL)
857 return;
858
859 flag = ip->i_flag; /* Atomic read */
860 /*
861 * We better not update the disk inode from a stale inode.
862 */
863 if (flag & ISTALE)
864 return;
865
866 fp = ip->i_fs;
867
868 if ((flag & (IUPD|IACC|ICHG|IMOD|IMODACC|IATTCHG)) != 0) {
869 if (fp->fs_ronly) {
870 mutex_enter(&ip->i_tlock);
871 ip->i_flag &= ~(IUPD|IACC|ICHG|IMOD|IMODACC|IATTCHG);
872 mutex_exit(&ip->i_tlock);
873 return;
874 }
875 /*
876 * fs is active while metadata is being written
877 */
878 mutex_enter(&ufsvfsp->vfs_lock);
879 ufs_notclean(ufsvfsp);
880 /*
881 * get the dinode
882 */
883 bp = UFS_BREAD(ufsvfsp, ip->i_dev,
884 (daddr_t)fsbtodb(fp, itod(fp, ip->i_number)),
885 (int)fp->fs_bsize);
886 if (bp->b_flags & B_ERROR) {
887 mutex_enter(&ip->i_tlock);
888 ip->i_flag &=
889 ~(IUPD|IACC|ICHG|IMOD|IMODACC|IATTCHG);
890 mutex_exit(&ip->i_tlock);
891 brelse(bp);
892 return;
893 }
894 /*
895 * munge inode fields
896 */
897 mutex_enter(&ip->i_tlock);
898 ITIMES_NOLOCK(ip);
899 do_trans_times = ((ip->i_flag & (IMOD|IMODACC)) == IMODACC);
900 ip->i_flag &= ~(IUPD|IACC|ICHG|IMOD|IMODACC|IATTCHG);
901 mutex_exit(&ip->i_tlock);
902
903 /*
904 * For reads and concurrent re-writes, no deltas were
905 * entered for the access time changes - do it now.
906 */
907 if (do_trans_times) {
908 TRANS_INODE_TIMES(ufsvfsp, ip);
909 }
910
911 /*
912 * For SunOS 5.0->5.4, these lines below read:
913 *
914 * suid = (ip->i_uid > MAXUID) ? UID_LONG : ip->i_uid;
915 * sgid = (ip->i_gid > MAXUID) ? GID_LONG : ip->i_gid;
916 *
917 * where MAXUID was set to 60002. This was incorrect -
918 * the uids should have been constrained to what fitted into
919 * a 16-bit word.
920 *
921 * This means that files from 4.x filesystems that have an
922 * i_suid field larger than 60002 will have that field
923 * changed to 65535.
924 *
925  * Security note: 4.x UFS could never create an i_suid of
926 * UID_LONG since that would've corresponded to -1.
927 */
928 suid = (ulong_t)ip->i_uid > (ulong_t)USHRT_MAX ?
929 UID_LONG : ip->i_uid;
930 sgid = (ulong_t)ip->i_gid > (ulong_t)USHRT_MAX ?
931 GID_LONG : ip->i_gid;
932
933 if ((ip->i_suid != suid) || (ip->i_sgid != sgid)) {
934 ip->i_suid = suid;
935 ip->i_sgid = sgid;
936 TRANS_INODE(ufsvfsp, ip);
937 }
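		/*
		 * Example of the clamp above: uid 70000 does not fit the
		 * 16-bit on-disk field, so i_suid becomes the UID_LONG
		 * (65535) sentinel while i_uid keeps the full value;
		 * uid 1000 is stored unchanged.
		 */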
938
939 if ((ip->i_mode & IFMT) == IFBLK ||
940 (ip->i_mode & IFMT) == IFCHR) {
941 dev_t d = ip->i_rdev;
942 dev32_t dev32;
943
944 /*
945 * load first direct block only if special device
946 */
947 if (!cmpldev(&dev32, d)) {
948 /*
949 * We panic here because there's "no way"
950 * we should have been able to create a large
951 * inode with a large dev_t. Earlier layers
952 * should've caught this.
953 */
954 panic("ip %p: i_rdev too big", (void *)ip);
955 }
956
957 if (dev32 & ~((O_MAXMAJ << L_BITSMINOR32) | O_MAXMIN)) {
958 ip->i_ordev = dev32; /* can't use old fmt. */
959 } else {
960 ip->i_ordev = cmpdev(d);
961 }
962 }
963
964 /*
965 * copy inode to dinode (zero fastsymlnk in dinode)
966 */
967 dp = (struct dinode *)bp->b_un.b_addr + itoo(fp, ip->i_number);
968 dp->di_ic = ip->i_ic; /* structure assignment */
969 if (flag & IFASTSYMLNK) {
970 for (i = 1; i < NDADDR; i++)
971 dp->di_db[i] = 0;
972 for (i = 0; i < NIADDR; i++)
973 dp->di_ib[i] = 0;
974 }
975 if (TRANS_ISTRANS(ufsvfsp)) {
976 /*
977 * Pass only a sector size buffer containing
978 * the inode, otherwise when the buffer is copied
979 * into a cached roll buffer then too much memory
980 * gets consumed if 8KB inode buffers are passed.
981 */
982 TRANS_LOG(ufsvfsp, (caddr_t)dp, ip->i_doff,
983 sizeof (struct dinode),
984 (caddr_t)P2ALIGN((uintptr_t)dp, DEV_BSIZE),
985 DEV_BSIZE);
986
987 brelse(bp);
988 } else if (waitfor && (ip->i_ufsvfs->vfs_dio == 0)) {
989 UFS_BRWRITE(ufsvfsp, bp);
990
991 /*
992 * Synchronous write has guaranteed that inode
993 * has been written on disk so clear the flag
994 */
995 mutex_enter(&ip->i_tlock);
996 ip->i_flag &= ~IBDWRITE;
997 mutex_exit(&ip->i_tlock);
998 } else {
999 bdrwrite(bp);
1000
1001 /*
1002 * This write hasn't guaranteed that inode has been
1003 * written on the disk.
1004  * Since all update flags on the inode are cleared, we must
1005  * remember the condition in case the inode is to be updated
1006  * synchronously later (e.g. fsync()/fdatasync())
1007  * and has not been modified again by then.
1008 */
1009 mutex_enter(&ip->i_tlock);
1010 ip->i_flag |= IBDWRITE;
1011 mutex_exit(&ip->i_tlock);
1012 }
1013 } else {
1014 /*
1015 * In case previous inode update was done asynchronously
1016 * (IBDWRITE) and this inode update request wants guaranteed
1017 * (synchronous) disk update, flush the inode.
1018 */
1019 if (waitfor && (flag & IBDWRITE)) {
1020 blkflush(ip->i_dev,
1021 (daddr_t)fsbtodb(fp, itod(fp, ip->i_number)));
1022 mutex_enter(&ip->i_tlock);
1023 ip->i_flag &= ~IBDWRITE;
1024 mutex_exit(&ip->i_tlock);
1025 }
1026 }
1027 }
1028
1029 #define SINGLE 0 /* index of single indirect block */
1030 #define DOUBLE 1 /* index of double indirect block */
1031 #define TRIPLE 2 /* index of triple indirect block */
1032
1033 /*
1034 * Release blocks associated with the inode ip and
1035 * stored in the indirect block bn. Blocks are free'd
1036 * in LIFO order up to (but not including) lastbn. If
1037 * level is greater than SINGLE, the block is an indirect
1038 * block and recursive calls to indirtrunc must be used to
1039 * cleanse other indirect blocks.
1040 *
1041 * N.B.: triple indirect blocks are untested.
1042 */
1043 static long
1044 indirtrunc(struct inode *ip, daddr_t bn, daddr_t lastbn, int level, int flags)
1045 {
1046 int i;
1047 struct buf *bp, *copy;
1048 daddr32_t *bap;
1049 struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
1050 struct fs *fs = ufsvfsp->vfs_fs;
1051 daddr_t nb, last;
1052 long factor;
1053 int blocksreleased = 0, nblocks;
1054
1055 ASSERT(RW_WRITE_HELD(&ip->i_contents));
1056 /*
1057 * Calculate index in current block of last
1058 * block to be kept. -1 indicates the entire
1059 * block so we need not calculate the index.
1060 */
1061 factor = 1;
1062 for (i = SINGLE; i < level; i++)
1063 factor *= NINDIR(fs);
1064 last = lastbn;
1065 if (lastbn > 0)
1066 last /= factor;
1067 nblocks = btodb(fs->fs_bsize);
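	/*
	 * Worked example with illustrative 8K geometry: block pointers
	 * are 4-byte daddr32_t's, so NINDIR(fs) = 8192 / 4 = 2048 per
	 * indirect block. A DOUBLE-level call thus uses factor = 2048
	 * to scale lastbn down to an index among this block's 2048
	 * children, and nblocks = btodb(8192) = 16 sectors is charged
	 * for each block released.
	 */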
1068 /*
1069 * Get buffer of block pointers, zero those
1070 * entries corresponding to blocks to be free'd,
1071 * and update on disk copy first.
1072 * *Unless* the root pointer has been synchronously
1073 * written to disk. If nothing points to this
1074 * indirect block then don't bother zero'ing and
1075 * writing it.
1076 */
1077 bp = UFS_BREAD(ufsvfsp,
1078 ip->i_dev, (daddr_t)fsbtodb(fs, bn), (int)fs->fs_bsize);
1079 if (bp->b_flags & B_ERROR) {
1080 brelse(bp);
1081 return (0);
1082 }
1083 bap = bp->b_un.b_daddr;
1084 if ((flags & I_CHEAP) == 0) {
1085 uint_t zb;
1086
1087 zb = (uint_t)((NINDIR(fs) - (last + 1)) * sizeof (daddr32_t));
1088
1089 if (zb) {
1090 /*
1091 * push any data into the log before we zero it
1092 */
1093 if (bp->b_flags & B_DELWRI)
1094 TRANS_LOG(ufsvfsp, (caddr_t)bap,
1095 ldbtob(bp->b_blkno), bp->b_bcount,
1096 bp->b_un.b_addr, bp->b_bcount);
1097 copy = ngeteblk(fs->fs_bsize);
1098 bcopy((caddr_t)bap, (caddr_t)copy->b_un.b_daddr,
1099 (uint_t)fs->fs_bsize);
1100 bzero((caddr_t)&bap[last + 1], zb);
1101
1102 TRANS_BUF(ufsvfsp,
1103 (caddr_t)&bap[last + 1] - (caddr_t)bap,
1104 zb, bp, DT_ABZERO);
1105
1106 UFS_BRWRITE(ufsvfsp, bp);
1107 bp = copy, bap = bp->b_un.b_daddr;
1108 }
1109 } else {
1110 /* make sure write retries are also cleared */
1111 bp->b_flags &= ~(B_DELWRI | B_RETRYWRI);
1112 bp->b_flags |= B_STALE | B_AGE;
1113 }
1114
1115 /*
1116 * Recursively free totally unused blocks.
1117 */
1118 flags |= I_CHEAP;
1119 for (i = NINDIR(fs) - 1; i > last; i--) {
1120 nb = bap[i];
1121 if (nb == 0)
1122 continue;
1123 if (level > SINGLE) {
1124 blocksreleased +=
1125 indirtrunc(ip, nb, (daddr_t)-1, level - 1, flags);
1126 free(ip, nb, (off_t)fs->fs_bsize, flags | I_IBLK);
1127 } else
1128 free(ip, nb, (off_t)fs->fs_bsize, flags);
1129 blocksreleased += nblocks;
1130 }
1131 flags &= ~I_CHEAP;
1132
1133 /*
1134 * Recursively free last partial block.
1135 */
1136 if (level > SINGLE && lastbn >= 0) {
1137 last = lastbn % factor;
1138 nb = bap[i];
1139 if (nb != 0)
1140 blocksreleased +=
1141 indirtrunc(ip, nb, last, level - 1, flags);
1142 }
1143 brelse(bp);
1144 return (blocksreleased);
1145 }
1146
1147 /*
1148 * Truncate the inode ip to at most length size.
1149 * Free affected disk blocks -- the blocks of the
1150 * file are removed in reverse order.
1151 *
1152 * N.B.: triple indirect blocks are untested.
1153 */
1154 static int i_genrand = 1234;
1155 int
1156 ufs_itrunc(struct inode *oip, u_offset_t length, int flags, cred_t *cr)
1157 {
1158 struct fs *fs = oip->i_fs;
1159 struct ufsvfs *ufsvfsp = oip->i_ufsvfs;
1160 struct inode *ip;
1161 daddr_t lastblock;
1162 off_t bsize;
1163 int boff;
1164 daddr_t bn, lastiblock[NIADDR];
1165 int level;
1166 long nblocks, blocksreleased = 0;
1167 int i;
1168 ushort_t mode;
1169 struct inode tip;
1170 int err;
1171 u_offset_t maxoffset = (ufsvfsp->vfs_lfflags & UFS_LARGEFILES) ?
1172 (UFS_MAXOFFSET_T) : (MAXOFF32_T);
1173
1174 /*
1175 * Shadow inodes do not need to hold the vfs_dqrwlock lock. Most
1176 * other uses need the reader lock. opendq() holds the writer lock.
1177 */
1178 ASSERT((oip->i_mode & IFMT) == IFSHAD ||
1179 RW_LOCK_HELD(&ufsvfsp->vfs_dqrwlock));
1180 ASSERT(RW_WRITE_HELD(&oip->i_contents));
1181 /*
1182 * We only allow truncation of regular files and directories
1183 * to arbitrary lengths here. In addition, we allow symbolic
1184 * links to be truncated only to zero length. Other inode
1185 * types cannot have their length set here. Disk blocks are
1186 * being dealt with - especially device inodes where
1187 * ip->i_ordev is actually being stored in ip->i_db[0]!
1188 */
1189 TRANS_INODE(ufsvfsp, oip);
1190 mode = oip->i_mode & IFMT;
1191 if (flags & I_FREE) {
1192 i_genrand *= 16843009; /* turns into shift and adds */
1193 i_genrand++;
1194 oip->i_gen += ((i_genrand + ddi_get_lbolt()) & 0xffff) + 1;
1195 oip->i_flag |= ICHG |IUPD;
1196 oip->i_seq++;
1197 if (length == oip->i_size)
1198 return (0);
1199 flags |= I_CHEAP;
1200 }
1201 if (mode == IFIFO)
1202 return (0);
1203 if (mode != IFREG && mode != IFDIR && mode != IFATTRDIR &&
1204 !(mode == IFLNK && length == (offset_t)0) && mode != IFSHAD)
1205 return (EINVAL);
1206 if (length > maxoffset)
1207 return (EFBIG);
1208 if ((mode == IFDIR) || (mode == IFATTRDIR))
1209 flags |= I_DIR;
1210 if (mode == IFSHAD)
1211 flags |= I_SHAD;
1212 if (oip == ufsvfsp->vfs_qinod)
1213 flags |= I_QUOTA;
1214 if (length == oip->i_size) {
1215 /* update ctime and mtime to please POSIX tests */
1216 oip->i_flag |= ICHG |IUPD;
1217 oip->i_seq++;
1218 if (length == 0) {
1219 /* nothing to cache so clear the flag */
1220 oip->i_flag &= ~IFASTSYMLNK;
1221 }
1222 return (0);
1223 }
1224 /* wipe out fast symlink till next access */
1225 if (oip->i_flag & IFASTSYMLNK) {
1226 int j;
1227
1228 ASSERT(ITOV(oip)->v_type == VLNK);
1229
1230 oip->i_flag &= ~IFASTSYMLNK;
1231
1232 for (j = 1; j < NDADDR; j++)
1233 oip->i_db[j] = 0;
1234 for (j = 0; j < NIADDR; j++)
1235 oip->i_ib[j] = 0;
1236 }
1237
1238 boff = (int)blkoff(fs, length);
1239
1240 if (length > oip->i_size) {
1241 /*
1242  * Trunc up case. BMAPALLOC will ensure that the right blocks
1243 * are allocated. This includes extending the old frag to a
1244 * full block (if needed) in addition to doing any work
1245 * needed for allocating the last block.
1246 */
1247 if (boff == 0)
1248 err = BMAPALLOC(oip, length - 1, (int)fs->fs_bsize, cr);
1249 else
1250 err = BMAPALLOC(oip, length - 1, boff, cr);
1251
1252 if (err == 0) {
1253 /*
1254 * Save old size and set inode's size now
1255 * so that we don't cause too much of the
1256 * file to be zero'd and pushed.
1257 */
1258 u_offset_t osize = oip->i_size;
1259 oip->i_size = length;
1260 /*
1261 * Make sure we zero out the remaining bytes of
1262 * the page in case a mmap scribbled on it. We
1263 * can't prevent a mmap from writing beyond EOF
1264 * on the last page of a file.
1265 *
1266 */
1267 if ((boff = (int)blkoff(fs, osize)) != 0) {
1268 bsize = (int)lblkno(fs, osize - 1) >= NDADDR ?
1269 fs->fs_bsize : fragroundup(fs, boff);
1270 pvn_vpzero(ITOV(oip), osize,
1271 (size_t)(bsize - boff));
1272 }
1273 oip->i_flag |= ICHG|IATTCHG;
1274 oip->i_seq++;
1275 ITIMES_NOLOCK(oip);
1276 /*
1277 * MAXOFF32_T is old 2GB size limit. If
1278 * this operation caused a large file to be
1279 * created, turn on the superblock flag
1280 * and update the superblock, if the flag
1281 * is not already on.
1282 */
1283 if ((length > (u_offset_t)MAXOFF32_T) &&
1284 !(fs->fs_flags & FSLARGEFILES)) {
1285 ASSERT(ufsvfsp->vfs_lfflags & UFS_LARGEFILES);
1286 mutex_enter(&ufsvfsp->vfs_lock);
1287 fs->fs_flags |= FSLARGEFILES;
1288 ufs_sbwrite(ufsvfsp);
1289 mutex_exit(&ufsvfsp->vfs_lock);
1290 }
1291 }
1292
1293 return (err);
1294 }
1295
1296 /*
1297 * Update the pages of the file. If the file is not being
1298 * truncated to a block boundary, the contents of the
1299 * pages following the end of the file must be zero'ed
1300  * in case they ever become accessible again because
1301 * of subsequent file growth.
1302 */
1303 if (boff == 0) {
1304 (void) pvn_vplist_dirty(ITOV(oip), length, ufs_putapage,
1305 B_INVAL | B_TRUNC, CRED());
1306 } else {
1307 /*
1308 * Make sure that the last block is properly allocated.
1309 * We only really have to do this if the last block is
1310 * actually allocated since ufs_bmap will now handle the case
1311  * of a fragment which has no block allocated. Just to
1312 * be sure, we do it now independent of current allocation.
1313 */
1314 err = BMAPALLOC(oip, length - 1, boff, cr);
1315 if (err)
1316 return (err);
1317
1318 /*
1319 * BMAPALLOC will call bmap_write which defers i_seq
1320 * processing. If the timestamps were changed, update
1321 * i_seq before rdip drops i_contents or syncs the inode.
1322 */
1323 if (oip->i_flag & (ICHG|IUPD))
1324 oip->i_seq++;
1325
1326 /*
1327 * BugId 4069932
1328 * Make sure that the relevant partial page appears in
1329 * the v_pages list, so that pvn_vpzero() will do its
1330 * job. Since doing this correctly requires everything
1331 * in rdip() except for the uiomove(), it's easier and
1332 * safer to do the uiomove() rather than duplicate the
1333 * rest of rdip() here.
1334 *
1335 * To get here, we know that length indicates a byte
1336 * that is not the first byte of a block. (length - 1)
1337 * is the last actual byte known to exist. Deduction
1338 * shows it is in the same block as byte (length).
1339 * Thus, this rdip() invocation should always succeed
1340 * except in the face of i/o errors, and give us the
1341 * block we care about.
1342 *
1343 * rdip() makes the same locking assertions and
1344 * assumptions as we do. We do not acquire any locks
1345 * before calling it, so we have not changed the locking
1346 * situation. Finally, there do not appear to be any
1347 * paths whereby rdip() ends up invoking us again.
1348 * Thus, infinite recursion is avoided.
1349 */
1350 {
1351 uio_t uio;
1352 iovec_t iov[1];
1353 char buffer;
1354
1355 uio.uio_iov = iov;
1356 uio.uio_iovcnt = 1;
1357 uio.uio_loffset = length - 1;
1358 uio.uio_resid = 1;
1359 uio.uio_segflg = UIO_SYSSPACE;
1360 uio.uio_extflg = UIO_COPY_CACHED;
1361
1362 iov[0].iov_base = &buffer;
1363 iov[0].iov_len = 1;
1364
1365 err = rdip(oip, &uio, UIO_READ, NULL);
1366 if (err)
1367 return (err);
1368 }
1369
1370 bsize = (int)lblkno(fs, length - 1) >= NDADDR ?
1371 fs->fs_bsize : fragroundup(fs, boff);
1372 pvn_vpzero(ITOV(oip), length, (size_t)(bsize - boff));
1373 /*
1374 * Ensure full fs block is marked as dirty.
1375 */
1376 (void) pvn_vplist_dirty(ITOV(oip), length + (bsize - boff),
1377 ufs_putapage, B_INVAL | B_TRUNC, CRED());
1378 }
1379
1380 /*
1381 * Calculate index into inode's block list of
1382 * last direct and indirect blocks (if any)
1383 * which we want to keep. Lastblock is -1 when
1384 * the file is truncated to 0.
1385 */
1386 lastblock = lblkno(fs, length + fs->fs_bsize - 1) - 1;
1387 lastiblock[SINGLE] = lastblock - NDADDR;
1388 lastiblock[DOUBLE] = lastiblock[SINGLE] - NINDIR(fs);
1389 lastiblock[TRIPLE] = lastiblock[DOUBLE] - NINDIR(fs) * NINDIR(fs);
1390 nblocks = btodb(fs->fs_bsize);
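	/*
	 * Worked example with illustrative geometry (8K blocks,
	 * NDADDR = 12, NINDIR = 2048): truncating to length 0 gives
	 * lastblock = -1, so all direct blocks and all three
	 * lastiblock levels go negative and everything is released.
	 * Truncating to keep 100 blocks gives lastblock = 99: the
	 * direct blocks stay, lastiblock[SINGLE] = 99 - 12 = 87 keeps
	 * the first 88 single-indirect entries, and the DOUBLE and
	 * TRIPLE levels go negative and are freed entirely.
	 */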
1391
1392 /*
1393 * Update file and block pointers
1394 * on disk before we start freeing blocks.
1395 * If we crash before free'ing blocks below,
1396 * the blocks will be returned to the free list.
1397 * lastiblock values are also normalized to -1
1398 * for calls to indirtrunc below.
1399 */
1400 tip = *oip; /* structure copy */
1401 ip = &tip;
1402
1403 for (level = TRIPLE; level >= SINGLE; level--)
1404 if (lastiblock[level] < 0) {
1405 oip->i_ib[level] = 0;
1406 lastiblock[level] = -1;
1407 }
1408 for (i = NDADDR - 1; i > lastblock; i--) {
1409 oip->i_db[i] = 0;
1410 flags |= I_CHEAP;
1411 }
1412 oip->i_size = length;
1413 oip->i_flag |= ICHG|IUPD|IATTCHG;
1414 oip->i_seq++;
1415 if (!TRANS_ISTRANS(ufsvfsp))
1416 ufs_iupdat(oip, I_SYNC); /* do sync inode update */
1417
1418 /*
1419 * Indirect blocks first.
1420 */
1421 for (level = TRIPLE; level >= SINGLE; level--) {
1422 bn = ip->i_ib[level];
1423 if (bn != 0) {
1424 blocksreleased +=
1425 indirtrunc(ip, bn, lastiblock[level], level, flags);
1426 if (lastiblock[level] < 0) {
1427 ip->i_ib[level] = 0;
1428 free(ip, bn, (off_t)fs->fs_bsize,
1429 flags | I_IBLK);
1430 blocksreleased += nblocks;
1431 }
1432 }
1433 if (lastiblock[level] >= 0)
1434 goto done;
1435 }
1436
1437 /*
1438 * All whole direct blocks or frags.
1439 */
1440 for (i = NDADDR - 1; i > lastblock; i--) {
1441 bn = ip->i_db[i];
1442 if (bn == 0)
1443 continue;
1444 ip->i_db[i] = 0;
1445 bsize = (off_t)blksize(fs, ip, i);
1446 free(ip, bn, bsize, flags);
1447 blocksreleased += btodb(bsize);
1448 }
1449 if (lastblock < 0)
1450 goto done;
1451
1452 /*
1453 * Finally, look for a change in size of the
1454 * last direct block; release any frags.
1455 */
1456 bn = ip->i_db[lastblock];
1457 if (bn != 0) {
1458 off_t oldspace, newspace;
1459
1460 /*
1461 * Calculate amount of space we're giving
1462 * back as old block size minus new block size.
1463 */
1464 oldspace = blksize(fs, ip, lastblock);
1465 UFS_SET_ISIZE(length, ip);
1466 newspace = blksize(fs, ip, lastblock);
1467 if (newspace == 0) {
1468 err = ufs_fault(ITOV(ip), "ufs_itrunc: newspace == 0");
1469 return (err);
1470 }
1471 if (oldspace - newspace > 0) {
1472 /*
1473 * Block number of space to be free'd is
1474 * the old block # plus the number of frags
1475 * required for the storage we're keeping.
1476 */
1477 bn += numfrags(fs, newspace);
1478 free(ip, bn, oldspace - newspace, flags);
1479 blocksreleased += btodb(oldspace - newspace);
1480 }
1481 }
1482 done:
1483 /* BEGIN PARANOIA */
1484 for (level = SINGLE; level <= TRIPLE; level++)
1485 if (ip->i_ib[level] != oip->i_ib[level]) {
1486 err = ufs_fault(ITOV(ip), "ufs_itrunc: indirect block");
1487 return (err);
1488 }
1489
1490 for (i = 0; i < NDADDR; i++)
1491 if (ip->i_db[i] != oip->i_db[i]) {
1492 err = ufs_fault(ITOV(ip), "ufs_itrunc: direct block");
1493 return (err);
1494 }
1495 /* END PARANOIA */
1496 oip->i_blocks -= blocksreleased;
1497
1498 if (oip->i_blocks < 0) { /* sanity */
1499 cmn_err(CE_NOTE,
1500 "ufs_itrunc: %s/%d new size = %lld, blocks = %d\n",
1501 fs->fs_fsmnt, (int)oip->i_number, oip->i_size,
1502 (int)oip->i_blocks);
1503 oip->i_blocks = 0;
1504 }
1505 oip->i_flag |= ICHG|IATTCHG;
1506 oip->i_seq++;
1507 /* blocksreleased is >= zero, so this can not fail */
1508 (void) chkdq(oip, -blocksreleased, 0, cr, (char **)NULL,
1509 (size_t *)NULL);
1510 return (0);
1511 }
1512
1513 /*
1514 * Check mode permission on inode. Mode is READ, WRITE or EXEC.
1515 * In the case of WRITE, the read-only status of the file system
1516 * is checked. Depending on the calling user, the appropriate
1517 * mode bits are selected; privileges to override missing permission
1518 * bits are checked through secpolicy_vnode_access().
1519  * The i_contents lock must be held as reader here to prevent racing with
1520 * the acl subsystem removing/setting/changing acls on this inode.
1521 * The caller is responsible for indicating whether or not the i_contents
1522  * lock needs to be acquired here or is already held.
1523 */
1524 int
1525 ufs_iaccess(struct inode *ip, int mode, struct cred *cr, int dolock)
1526 {
1527 int shift = 0;
1528 int ret = 0;
1529
1530 if (dolock)
1531 rw_enter(&ip->i_contents, RW_READER);
1532 ASSERT(RW_LOCK_HELD(&ip->i_contents));
1533
1534 if (mode & IWRITE) {
1535 /*
1536 * Disallow write attempts on read-only
1537 * file systems, unless the file is a block
1538 * or character device or a FIFO.
1539 */
1540 if (ip->i_fs->fs_ronly != 0) {
1541 if ((ip->i_mode & IFMT) != IFCHR &&
1542 (ip->i_mode & IFMT) != IFBLK &&
1543 (ip->i_mode & IFMT) != IFIFO) {
1544 ret = EROFS;
1545 goto out;
1546 }
1547 }
1548 }
1549 /*
1550 * If there is an acl, check the acl and return.
1551 */
1552 if (ip->i_ufs_acl && ip->i_ufs_acl->aowner) {
1553 ret = ufs_acl_access(ip, mode, cr);
1554 goto out;
1555 }
1556
1557 /*
1558 * Access check is based on only one of owner, group, public.
1559 * If not owner, then check group.
1560 * If not a member of the group, then check public access.
1561 */
1562 if (crgetuid(cr) != ip->i_uid) {
1563 shift += 3;
1564 if (!groupmember((uid_t)ip->i_gid, cr))
1565 shift += 3;
1566 }
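	/*
	 * Example: for a file with mode 0640, a caller who is neither
	 * the owner nor a group member ends up with shift = 6, so the
	 * "other" bits (here 0) are shifted into the owner position
	 * before secpolicy_vnode_access2() decides whether privileges
	 * make up for the missing permission bits.
	 */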
1567
1568 /* test missing privilege bits */
1569 ret = secpolicy_vnode_access2(cr, ITOV(ip), ip->i_uid,
1570 ip->i_mode << shift, mode);
1571 out:
1572 if (dolock)
1573 rw_exit(&ip->i_contents);
1574 return (ret);
1575 }
1576
1577 /*
1578 * if necessary, remove an inode from the free list
1579 * i_contents is held except at unmount
1580 *
1581 * Return 1 if the inode is taken off of the ufs_idle_q,
1582 * and the caller is expected to call VN_RELE.
1583 *
1584 * Return 0 otherwise.
1585 */
1586 int
1587 ufs_rmidle(struct inode *ip)
1588 {
1589 int rval = 0;
1590
1591 mutex_enter(&ip->i_tlock);
1592 if ((ip->i_flag & IREF) == 0) {
1593 mutex_enter(&ufs_idle_q.uq_mutex);
1594 ip->i_freef->i_freeb = ip->i_freeb;
1595 ip->i_freeb->i_freef = ip->i_freef;
1596 ip->i_freef = ip;
1597 ip->i_freeb = ip;
1598 ip->i_flag |= IREF;
1599 ufs_idle_q.uq_ne--;
1600 if (ip->i_flag & IJUNKIQ) {
1601 ufs_njunk_iq--;
1602 ip->i_flag &= ~IJUNKIQ;
1603 } else {
1604 ufs_nuseful_iq--;
1605 }
1606 mutex_exit(&ufs_idle_q.uq_mutex);
1607 rval = 1;
1608 }
1609 mutex_exit(&ip->i_tlock);
1610 return (rval);
1611 }
1612
1613 /*
1614 * scan the hash of inodes and call func with the inode locked
1615 */
1616 int
1617 ufs_scan_inodes(int rwtry, int (*func)(struct inode *, void *), void *arg,
1618 struct ufsvfs *ufsvfsp)
1619 {
1620 struct inode *ip; /* current inode */
1621 struct inode *lip = NULL; /* last/previous inode */
1622 union ihead *ih; /* current hash chain */
1623 int error, i;
1624 int saverror = 0;
1625 int lip_held; /* lip needs a VN_RELE() */
1626
1627 /*
1628 * If ufsvfsp is NULL, then our caller should be holding
1629 * ufs_scan_lock to avoid conflicts between ufs_unmount() and
1630 * ufs_update(). Otherwise, to avoid false-positives in
1631 * ufs_unmount()'s v_count-based EBUSY check, we only hold
1632 * those inodes that are in the file system our caller cares
1633 * about.
1634 *
1635 * We know that ip is a valid inode in the hash chain (and thus
1636 * we can trust i_ufsvfs) because the inode we chained from
1637 * (lip) is still in the hash chain. This is true because either:
1638 *
1639 * 1. We did not drop the hash chain lock since the last
1640 * iteration (because we were not interested in the last inode),
1641 * or
1642 * 2. We maintained a hold on the last inode while we
1643  * were processing it, so it could not be removed
1644 * from the hash chain.
1645 *
1646 * The whole reason we're dropping and re-grabbing the chain
1647 * lock on every inode is so that we don't present a major
1648 * choke point on throughput, particularly when we've been
1649 * called on behalf of fsflush.
1650 */
1651
1652 for (i = 0, ih = ihead; i < inohsz; i++, ih++) {
1653 mutex_enter(&ih_lock[i]);
1654 for (ip = ih->ih_chain[0], lip_held = 0;
1655 ip != (struct inode *)ih;
1656 ip = lip->i_forw) {
1657
1658 ins.in_scan.value.ul++;
1659
1660 /*
1661 * Undo the previous iteration's VN_HOLD(), but
1662 * only if one was done.
1663 */
1664 if (lip_held)
1665 VN_RELE(ITOV(lip));
1666
1667 lip = ip;
1668 if (ufsvfsp != NULL && ip->i_ufsvfs != ufsvfsp) {
1669 /*
1670 * We're not processing all inodes, and
1671 * this inode is not in the filesystem of
1672 * interest, so skip it. No need to do a
1673 * VN_HOLD() since we're not dropping the
1674 * hash chain lock until after we've
1675 * done the i_forw traversal above.
1676 */
1677 lip_held = 0;
1678 continue;
1679 }
1680 VN_HOLD(ITOV(ip));
1681 lip_held = 1;
1682 mutex_exit(&ih_lock[i]);
1683
1684 /*
1685 * Acquire the contents lock as writer to make
1686 * sure that the inode has been initialized in
1687 * the cache or removed from the idle list by
1688 * ufs_iget(). This works because ufs_iget()
1689 * acquires the contents lock before putting
1690 * the inode into the cache. If we can lock
1691  * it, then ufs_iget() is done with it.
1692 */
1693
1694 if (rwtry) {
1695 if (!rw_tryenter(&ip->i_contents, RW_WRITER)) {
1696 mutex_enter(&ih_lock[i]);
1697 continue;
1698 }
1699 } else {
1700 rw_enter(&ip->i_contents, RW_WRITER);
1701 }
1702
1703 rw_exit(&ip->i_contents);
1704
1705 /*
1706 * ISTALE means the inode couldn't be read
1707 *
1708 * We don't have to hold the i_contents lock
1709 * for this check for a couple of
1710 * reasons. First, if ISTALE is set then the
1711 * flag cannot be cleared until the inode is
1712 * removed from the cache and that cannot
1713 * happen until after we VN_RELE() it.
1714 * Second, if ISTALE is not set, then the
1715 * inode is in the cache and does not need to
1716 * be read from disk so ISTALE cannot be set
1717 * while we are not looking.
1718 */
1719 if ((ip->i_flag & ISTALE) == 0) {
1720 if ((error = (*func)(ip, arg)) != 0)
1721 saverror = error;
1722 }
1723
1724 mutex_enter(&ih_lock[i]);
1725 }
1726 if (lip_held)
1727 VN_RELE(ITOV(lip));
1728 mutex_exit(&ih_lock[i]);
1729 }
1730 return (saverror);
1731 }
1732
1733 /*
1734 * Mark inode with the current time, plus a unique increment.
1735 *
1736 * Since we only keep 32-bit time on disk, if UFS is still alive
1737 * beyond 2038, filesystem times will simply stick at the last
1738 * possible second of 32-bit time. Not ideal, but probably better
1739 * than going into the remote past, or confusing applications with
1740 * negative time.
1741 */
1742 void
1743 ufs_imark(struct inode *ip)
1744 {
1745 timestruc_t now;
1746 int32_t usec, nsec;
1747
1748 /*
1749  * The update of i_seq may have been deferred; increase i_seq here
1750 * to make sure it is in sync with the timestamps.
1751 */
1752 if (ip->i_flag & ISEQ) {
1753 ASSERT(ip->i_flag & (IUPD|ICHG));
1754 ip->i_seq++;
1755 ip->i_flag &= ~ISEQ;
1756 }
1757
1758 gethrestime(&now);
1759
1760 /*
1761 * Fast algorithm to convert nsec to usec -- see hrt2ts()
1762 * in common/os/timers.c for a full description.
1763 */
1764 nsec = now.tv_nsec;
1765 usec = nsec + (nsec >> 2);
1766 usec = nsec + (usec >> 1);
1767 usec = nsec + (usec >> 2);
1768 usec = nsec + (usec >> 4);
1769 usec = nsec - (usec >> 3);
1770 usec = nsec + (usec >> 2);
1771 usec = nsec + (usec >> 3);
1772 usec = nsec + (usec >> 4);
1773 usec = nsec + (usec >> 1);
1774 usec = nsec + (usec >> 6);
1775 usec = usec >> 10;
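	/*
	 * The shift-and-add chain above approximates usec = nsec / 1000
	 * without a division; e.g. nsec = 999999999 comes out as
	 * roughly 999999 usec.
	 */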
1776
1777 mutex_enter(&ufs_iuniqtime_lock);
1778 if (now.tv_sec > (time_t)iuniqtime.tv_sec ||
1779 usec > iuniqtime.tv_usec) {
1780 if (now.tv_sec < TIME32_MAX) {
1781 iuniqtime.tv_sec = (time32_t)now.tv_sec;
1782 iuniqtime.tv_usec = usec;
1783 }
1784 } else {
1785 if (iuniqtime.tv_sec < TIME32_MAX) {
1786 iuniqtime.tv_usec++;
1787 /* Check for usec overflow */
1788 if (iuniqtime.tv_usec >= MICROSEC) {
1789 iuniqtime.tv_sec++;
1790 iuniqtime.tv_usec = 0;
1791 }
1792 }
1793 }
1794
1795 if ((ip->i_flag & IACC) && !(ip->i_ufsvfs->vfs_noatime)) {
1796 ip->i_atime = iuniqtime;
1797 }
1798 if (ip->i_flag & IUPD) {
1799 ip->i_mtime = iuniqtime;
1800 ip->i_flag |= IMODTIME;
1801 }
1802 if (ip->i_flag & ICHG) {
1803 ip->i_diroff = 0;
1804 ip->i_ctime = iuniqtime;
1805 }
1806 mutex_exit(&ufs_iuniqtime_lock);
1807 }
1808
1809 /*
1810 * Update timestamps in inode.
1811 */
1812 void
1813 ufs_itimes_nolock(struct inode *ip)
1814 {
1815
1816 /*
1817 * if noatime is set and the inode access time is the only field that
1818 * must be changed, exit immediately.
1819 */
1820 if (((ip->i_flag & (IUPD|IACC|ICHG)) == IACC) &&
1821 (ip->i_ufsvfs->vfs_noatime)) {
1822 return;
1823 }
1824
1825 if (ip->i_flag & (IUPD|IACC|ICHG)) {
1826 if (ip->i_flag & ICHG)
1827 ip->i_flag |= IMOD;
1828 else
1829 ip->i_flag |= IMODACC;
1830 ufs_imark(ip);
1831 ip->i_flag &= ~(IACC|IUPD|ICHG);
1832 }
1833 }
1834