/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 1983, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2016, 2017 by Delphix. All rights reserved.
 */

/*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 * All Rights Reserved
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */

#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/uio.h>
#include <sys/bitmap.h>
#include <sys/signal.h>
#include <sys/cred.h>
#include <sys/user.h>
#include <sys/vfs.h>
#include <sys/stat.h>
#include <sys/vnode.h>
#include <sys/buf.h>
#include <sys/proc.h>
#include <sys/disp.h>
#include <sys/dnlc.h>
#include <sys/mode.h>
#include <sys/cmn_err.h>
#include <sys/kstat.h>
#include <sys/acl.h>
#include <sys/var.h>
#include <sys/fs/ufs_inode.h>
#include <sys/fs/ufs_fs.h>
#include <sys/fs/ufs_trans.h>
#include <sys/fs/ufs_acl.h>
#include <sys/fs/ufs_bio.h>
#include <sys/fs/ufs_quota.h>
#include <sys/fs/ufs_log.h>
#include <vm/hat.h>
#include <vm/as.h>
#include <vm/pvn.h>
#include <vm/seg.h>
#include <sys/swap.h>
#include <sys/cpuvar.h>
#include <sys/sysmacros.h>
#include <sys/errno.h>
#include <sys/kmem.h>
#include <sys/debug.h>
#include <fs/fs_subr.h>
#include <sys/policy.h>

struct kmem_cache *inode_cache;		/* cache of free inodes */

/* UFS Inode Cache Stats -- Not protected */
struct instats ins = {
	{ "size",		KSTAT_DATA_ULONG },
	{ "maxsize",		KSTAT_DATA_ULONG },
	{ "hits",		KSTAT_DATA_ULONG },
	{ "misses",		KSTAT_DATA_ULONG },
	{ "kmem allocs",	KSTAT_DATA_ULONG },
	{ "kmem frees",		KSTAT_DATA_ULONG },
	{ "maxsize reached",	KSTAT_DATA_ULONG },
	{ "puts at frontlist",	KSTAT_DATA_ULONG },
	{ "puts at backlist",	KSTAT_DATA_ULONG },
	{ "queues to free",	KSTAT_DATA_ULONG },
	{ "scans",		KSTAT_DATA_ULONG },
	{ "thread idles",	KSTAT_DATA_ULONG },
	{ "lookup idles",	KSTAT_DATA_ULONG },
	{ "vget idles",		KSTAT_DATA_ULONG },
	{ "cache allocs",	KSTAT_DATA_ULONG },
	{ "cache frees",	KSTAT_DATA_ULONG },
	{ "pushes at close",	KSTAT_DATA_ULONG }
};

/* kstat data */
static kstat_t *ufs_inode_kstat = NULL;

union ihead *ihead;	/* inode LRU cache, Chris Maltby */
kmutex_t *ih_lock;	/* protect inode cache hash table */
static int ino_hashlen = 4;	/* desired average hash chain length */
int inohsz;		/* number of buckets in the hash table */
struct timeval32 iuniqtime;

kmutex_t ufs_scan_lock;		/* stop racing multiple ufs_scan_inodes() */
kmutex_t ufs_iuniqtime_lock;	/* protect iuniqtime */
kmutex_t ufsvfs_mutex;
struct ufsvfs *oldufsvfslist, *ufsvfslist;

/*
 * time to wait after ufsvfsp->vfs_iotstamp before declaring that no
 * I/Os are going on.
 */
clock_t ufs_iowait;
/*
 * The threads that process idle inodes and free (deleted) inodes
 * have high water marks that are set in ufsinit().
 * These values can be no less than the minimum shown below.
 */
int ufs_idle_max;	/* # of allowable idle inodes */
ulong_t ufs_inode_max;	/* hard limit of allowable idle inodes */
#define	UFS_IDLE_MAX	(16)	/* min # of allowable idle inodes */

/*
 * Tunables for ufs write throttling.
 * These are validated in ufs_iinit() since improper settings
 * can lead to filesystem hangs.
 */
#define	UFS_HW_DEFAULT	(16 * 1024 * 1024)
#define	UFS_LW_DEFAULT	(8 * 1024 * 1024)
int	ufs_HW = UFS_HW_DEFAULT;
int	ufs_LW = UFS_LW_DEFAULT;
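
/*
 * (For context: ufs_HW and ufs_LW are byte counts of outstanding delayed
 * writes per inode.  Roughly, a writer that pushes i_writes above ufs_HW
 * waits on i_wrcv until enough writes complete to drain it back below
 * ufs_LW; hence the ufs_HW > ufs_LW check in ufs_iinit().)
 */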

static void ihinit(void);
extern int hash2ints(int, int);

static int ufs_iget_internal(struct vfs *, ino_t, struct inode **,
	struct cred *, int);

/* ARGSUSED */
static int
ufs_inode_kstat_update(kstat_t *ksp, int rw)
{
	if (rw == KSTAT_WRITE)
		return (EACCES);

	ins.in_malloc.value.ul	= (ulong_t)kmem_cache_stat(inode_cache,
	    "slab_alloc");
	ins.in_mfree.value.ul	= (ulong_t)kmem_cache_stat(inode_cache,
	    "slab_free");
	ins.in_kcalloc.value.ul	= (ulong_t)kmem_cache_stat(inode_cache,
	    "alloc");
	ins.in_kcfree.value.ul	= (ulong_t)kmem_cache_stat(inode_cache,
	    "free");
	ins.in_size.value.ul	= (ulong_t)kmem_cache_stat(inode_cache,
	    "buf_inuse");
	ins.in_maxreached.value.ul = (ulong_t)kmem_cache_stat(inode_cache,
	    "buf_max");
	ins.in_misses.value.ul = ins.in_kcalloc.value.ul;

	return (0);
}

void
ufs_iinit(void)
{
	/*
	 * Validate that ufs_HW > ufs_LW.
	 * The default values for these two tunables have been increased.
	 * There is now a range of values for ufs_HW that used to be
	 * legal on previous Solaris versions but no longer is now.
	 * Upgrading a machine which has an /etc/system setting for ufs_HW
	 * from that range can lead to filesystem hangs unless the values
	 * are checked here.
	 */
	if (ufs_HW <= ufs_LW) {
		cmn_err(CE_WARN,
		    "ufs_HW (%d) <= ufs_LW (%d). Check /etc/system.",
		    ufs_HW, ufs_LW);
		ufs_LW = UFS_LW_DEFAULT;
		ufs_HW = UFS_HW_DEFAULT;
		cmn_err(CE_CONT, "using defaults, ufs_HW = %d, ufs_LW = %d\n",
		    ufs_HW, ufs_LW);
	}

	/*
	 * Adjust the tunable `ufs_ninode' to a reasonable value
	 */
	if (ufs_ninode <= 0)
		ufs_ninode = ncsize;
	if (ufs_inode_max == 0)
		ufs_inode_max =
		    (ulong_t)((kmem_maxavail() >> 2) / sizeof (struct inode));
	if (ufs_ninode > ufs_inode_max || (ufs_ninode == 0 && ncsize == 0)) {
		cmn_err(CE_NOTE, "setting ufs_ninode to max value of %ld",
		    ufs_inode_max);
		ufs_ninode = ufs_inode_max;
	}
	/*
	 * Wait till third call of ufs_update to declare that no I/Os are
	 * going on. This allows deferred access times to be flushed to disk.
	 */
	ufs_iowait = v.v_autoup * hz * 2;

	/*
	 * idle thread runs when 25% of ufs_ninode entries are on the queue
	 */
	if (ufs_idle_max == 0)
		ufs_idle_max = ufs_ninode >> 2;
	if (ufs_idle_max < UFS_IDLE_MAX)
		ufs_idle_max = UFS_IDLE_MAX;
	if (ufs_idle_max > ufs_ninode)
		ufs_idle_max = ufs_ninode;
	/*
	 * This is really a misnomer, it is ufs_queue_init
	 */
	ufs_thread_init(&ufs_idle_q, ufs_idle_max);
	ufs_thread_start(&ufs_idle_q, ufs_thread_idle, NULL);

	/*
	 * global hlock thread
	 */
	ufs_thread_init(&ufs_hlock, 1);
	ufs_thread_start(&ufs_hlock, ufs_thread_hlock, NULL);

	ihinit();
	qtinit();
	ins.in_maxsize.value.ul = ufs_ninode;
	if ((ufs_inode_kstat = kstat_create("ufs", 0, "inode_cache", "ufs",
	    KSTAT_TYPE_NAMED, sizeof (ins) / sizeof (kstat_named_t),
	    KSTAT_FLAG_VIRTUAL)) != NULL) {
		ufs_inode_kstat->ks_data = (void *)&ins;
		ufs_inode_kstat->ks_update = ufs_inode_kstat_update;
		kstat_install(ufs_inode_kstat);
	}
	ufsfx_init();		/* fix-on-panic initialization */
	si_cache_init();
	ufs_directio_init();
	lufs_init();
	mutex_init(&ufs_iuniqtime_lock, NULL, MUTEX_DEFAULT, NULL);
}

/* ARGSUSED */
static int
ufs_inode_cache_constructor(void *buf, void *cdrarg, int kmflags)
{
	struct inode *ip = buf;
	struct vnode *vp;

	vp = ip->i_vnode = vn_alloc(kmflags);
	if (vp == NULL) {
		return (-1);
	}
	vn_setops(vp, ufs_vnodeops);
	vp->v_data = ip;

	rw_init(&ip->i_rwlock, NULL, RW_DEFAULT, NULL);
	rw_init(&ip->i_contents, NULL, RW_DEFAULT, NULL);
	mutex_init(&ip->i_tlock, NULL, MUTEX_DEFAULT, NULL);
	dnlc_dir_init(&ip->i_danchor);

	cv_init(&ip->i_wrcv, NULL, CV_DRIVER, NULL);

	return (0);
}

/* ARGSUSED */
static void
ufs_inode_cache_destructor(void *buf, void *cdrarg)
{
	struct inode *ip = buf;
	struct vnode *vp;

	vp = ITOV(ip);

	rw_destroy(&ip->i_rwlock);
	rw_destroy(&ip->i_contents);
	mutex_destroy(&ip->i_tlock);
	if (vp->v_type == VDIR) {
		dnlc_dir_fini(&ip->i_danchor);
	}

	cv_destroy(&ip->i_wrcv);

	vn_free(vp);
}

/*
 * Initialize hash links for inodes
 * and build inode free list.
 */
void
ihinit(void)
{
	int i;
	union ihead *ih = ihead;

	mutex_init(&ufs_scan_lock, NULL, MUTEX_DEFAULT, NULL);

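	/*
	 * (inohsz below is rounded to a power of two, which suggests the
	 * INOHASH() macro reduces its hash to a bucket index with a
	 * "& (inohsz - 1)" style mask; see ufs_inode.h for the real macro.)
	 */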
	inohsz = 1 << highbit(ufs_ninode / ino_hashlen);
	ihead = kmem_zalloc(inohsz * sizeof (union ihead), KM_SLEEP);
	ih_lock = kmem_zalloc(inohsz * sizeof (kmutex_t), KM_SLEEP);

	for (i = 0, ih = ihead; i < inohsz; i++, ih++) {
		ih->ih_head[0] = ih;
		ih->ih_head[1] = ih;
		mutex_init(&ih_lock[i], NULL, MUTEX_DEFAULT, NULL);
	}
	inode_cache = kmem_cache_create("ufs_inode_cache",
	    sizeof (struct inode), 0, ufs_inode_cache_constructor,
	    ufs_inode_cache_destructor, ufs_inode_cache_reclaim,
	    NULL, NULL, 0);
}

/*
 * Free an inode structure
 */
void
ufs_free_inode(struct inode *ip)
{
	vn_invalid(ITOV(ip));
	kmem_cache_free(inode_cache, ip);
}

/*
 * Allocate an inode structure
 */
struct inode *
ufs_alloc_inode(ufsvfs_t *ufsvfsp, ino_t ino)
{
	struct inode *ip;
	vnode_t *vp;

	ip = kmem_cache_alloc(inode_cache, KM_SLEEP);
	/*
	 * at this point we have a newly allocated inode
	 */
	ip->i_freef = ip;
	ip->i_freeb = ip;
	ip->i_flag = IREF;
	ip->i_seq = 0xFF;	/* Unique initial value */
	ip->i_dev = ufsvfsp->vfs_dev;
	ip->i_ufsvfs = ufsvfsp;
	ip->i_devvp = ufsvfsp->vfs_devvp;
	ip->i_number = ino;
	ip->i_diroff = 0;
	ip->i_nextr = 0;
	ip->i_map = NULL;
	ip->i_rdev = 0;
	ip->i_writes = 0;
	ip->i_mode = 0;
	ip->i_delaylen = 0;
	ip->i_delayoff = 0;
	ip->i_nextrio = 0;
	ip->i_ufs_acl = NULL;
	ip->i_cflags = 0;
	ip->i_mapcnt = 0;
	ip->i_dquot = NULL;
	ip->i_cachedir = CD_ENABLED;
	ip->i_writer = NULL;

	/*
	 * the vnode for this inode was allocated by the constructor
	 */
	vp = ITOV(ip);
	vn_reinit(vp);
	if (ino == (ino_t)UFSROOTINO)
		vp->v_flag = VROOT;
	vp->v_vfsp = ufsvfsp->vfs_vfs;
	vn_exists(vp);
	return (ip);
}

/*
 * Look up an inode by device, inumber.  If it is in core (in the
 * inode structure), honor the locking protocol.  If it is not in
 * core, read it in from the specified device after freeing any pages.
 * In all cases, a pointer to a VN_HELD inode structure is returned.
 */
int
ufs_iget(struct vfs *vfsp, ino_t ino, struct inode **ipp, struct cred *cr)
{
	return (ufs_iget_internal(vfsp, ino, ipp, cr, 0));
}

/*
 * A version of ufs_iget which returns only allocated, linked inodes.
 * This is appropriate for any callers who do not expect a free inode.
 */
int
ufs_iget_alloced(struct vfs *vfsp, ino_t ino, struct inode **ipp,
    struct cred *cr)
{
	return (ufs_iget_internal(vfsp, ino, ipp, cr, 1));
}
/*
 * Set vnode attributes based on v_type; this should be called whenever
 * an inode's i_mode is changed.
 */
void
ufs_reset_vnode(vnode_t *vp)
{
	/*
	 * an old DBE hack
	 */
	if ((VTOI(vp)->i_mode & (ISVTX | IEXEC | IFDIR)) == ISVTX)
		vp->v_flag |= VSWAPLIKE;
	else
		vp->v_flag &= ~VSWAPLIKE;

	/*
	 * if not swap like and it's just a regular file, we want
	 * to maintain the vnode's pages sorted by clean/modified
	 * for faster sync'ing to disk
	 */
	if (vp->v_type == VREG)
		vp->v_flag |= VMODSORT;
	else
		vp->v_flag &= ~VMODSORT;

	/*
	 * Is this an attribute hidden dir?
	 */
	if ((VTOI(vp)->i_mode & IFMT) == IFATTRDIR)
		vp->v_flag |= V_XATTRDIR;
	else
		vp->v_flag &= ~V_XATTRDIR;
}

/*
 * Shared implementation of ufs_iget and ufs_iget_alloced.  The 'validate'
 * flag is used to distinguish the two; when true, we validate that the inode
 * being retrieved looks like a linked and allocated inode.
 */
/* ARGSUSED */
static int
ufs_iget_internal(struct vfs *vfsp, ino_t ino, struct inode **ipp,
    struct cred *cr, int validate)
{
	struct inode *ip, *sp;
	union ihead *ih;
	kmutex_t *ihm;
	struct buf *bp;
	struct dinode *dp;
	struct vnode *vp;
	extern vfs_t EIO_vfs;
	int error;
	int ftype;	/* XXX - Remove later on */
	dev_t vfs_dev;
	struct ufsvfs *ufsvfsp;
	struct fs *fs;
	int hno;
	daddr_t bno;
	ulong_t ioff;

	CPU_STATS_ADD_K(sys, ufsiget, 1);

	/*
	 * Lookup inode in cache.
	 */
	vfs_dev = vfsp->vfs_dev;
	hno = INOHASH(ino);
	ih = &ihead[hno];
	ihm = &ih_lock[hno];

again:
	mutex_enter(ihm);
	for (ip = ih->ih_chain[0]; ip != (struct inode *)ih; ip = ip->i_forw) {
		if (ino != ip->i_number || vfs_dev != ip->i_dev ||
		    (ip->i_flag & ISTALE))
			continue;

		/*
		 * Found the interesting inode; hold it and drop the cache lock
		 */
		vp = ITOV(ip);	/* for locknest */
		VN_HOLD(vp);
		mutex_exit(ihm);
		rw_enter(&ip->i_contents, RW_READER);

		/*
		 * if necessary, remove from idle list
		 */
		if ((ip->i_flag & IREF) == 0) {
			if (ufs_rmidle(ip))
				VN_RELE(vp);
		}

		/*
		 * Could the inode be read from disk?
		 */
		if (ip->i_flag & ISTALE) {
			rw_exit(&ip->i_contents);
			VN_RELE(vp);
			goto again;
		}

		ins.in_hits.value.ul++;
		*ipp = ip;

		/*
		 * Reset the vnode's attribute flags
		 */
		mutex_enter(&vp->v_lock);
		ufs_reset_vnode(vp);
		mutex_exit(&vp->v_lock);

		rw_exit(&ip->i_contents);

		return (0);
	}
	mutex_exit(ihm);

	/*
	 * Inode was not in cache.
	 *
	 * Allocate a new entry
	 */
	ufsvfsp = (struct ufsvfs *)vfsp->vfs_data;
	fs = ufsvfsp->vfs_fs;

	ip = ufs_alloc_inode(ufsvfsp, ino);
	vp = ITOV(ip);

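	/*
	 * (itod() gives the filesystem block that holds this dinode and
	 * itoo() its index within that block, so i_doff below is the
	 * inode's byte offset on the device, later handed to the logging
	 * code.)
	 */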
	bno = fsbtodb(fs, itod(fs, ino));
	ioff = (sizeof (struct dinode)) * (itoo(fs, ino));
	ip->i_doff = (offset_t)ioff + ldbtob(bno);

	/*
	 * put a place holder in the cache (if not already there)
	 */
	mutex_enter(ihm);
	for (sp = ih->ih_chain[0]; sp != (struct inode *)ih; sp = sp->i_forw)
		if (ino == sp->i_number && vfs_dev == sp->i_dev &&
		    ((sp->i_flag & ISTALE) == 0)) {
			mutex_exit(ihm);
			ufs_free_inode(ip);
			goto again;
		}
	/*
	 * It would be nice to ASSERT(RW_READ_HELD(&ufsvfsp->vfs_dqrwlock))
	 * here, but if we do, then shadow inode allocations panic the
	 * system.  We don't have to hold vfs_dqrwlock for shadow inodes
	 * and the ufs_iget() parameters don't tell us what we are getting
	 * so we have no way of knowing this is a ufs_iget() call from
	 * a ufs_ialloc() call for a shadow inode.
	 */
	rw_enter(&ip->i_contents, RW_WRITER);
	insque(ip, ih);
	mutex_exit(ihm);
	/*
	 * read the dinode
	 */
	bp = UFS_BREAD(ufsvfsp, ip->i_dev, bno, (int)fs->fs_bsize);

	/*
	 * Check I/O errors
	 */
	error = ((bp->b_flags & B_ERROR) ? geterror(bp) : 0);
	if (error) {
		brelse(bp);
		ip->i_flag |= ISTALE;	/* in case someone is looking it up */
		rw_exit(&ip->i_contents);
		vp->v_vfsp = &EIO_vfs;
		VN_RELE(vp);
		return (error);
	}
	/*
	 * initialize the inode's dinode
	 */
	dp = (struct dinode *)(ioff + bp->b_un.b_addr);
	ip->i_ic = dp->di_ic;	/* structure assignment */
	brelse(bp);

	/*
	 * Maintain compatibility with Solaris 1.x UFS
	 */
	if (ip->i_suid != UID_LONG)
		ip->i_uid = ip->i_suid;
	if (ip->i_sgid != GID_LONG)
		ip->i_gid = ip->i_sgid;

	ftype = ip->i_mode & IFMT;
	if (ftype == IFBLK || ftype == IFCHR) {
		dev_t dv;
		uint_t top16 = ip->i_ordev & 0xffff0000u;

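		/*
		 * (A top half of all zeros or all ones means the on-disk
		 * value is an old-format device number, expanded with
		 * expdev(); anything else is a packed 32-bit dev32_t,
		 * expanded with expldev().)
		 */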
		if (top16 == 0 || top16 == 0xffff0000u)
			dv = expdev(ip->i_ordev);
		else
			dv = expldev(ip->i_ordev);
		vp->v_rdev = ip->i_rdev = dv;
	}

	/*
	 * if our caller only expects allocated inodes, verify that
	 * this inode looks good; throw it out if it's bad.
	 */
	if (validate) {
		if ((ftype == 0) || (ip->i_nlink <= 0)) {
			ip->i_flag |= ISTALE;
			rw_exit(&ip->i_contents);
			vp->v_vfsp = &EIO_vfs;
			VN_RELE(vp);
			cmn_err(CE_NOTE,
			    "%s: unexpected free inode %d, run fsck(8)%s",
			    fs->fs_fsmnt, (int)ino,
			    (TRANS_ISTRANS(ufsvfsp) ? " -o f" : ""));
			return (EIO);
		}
	}

	/*
	 * Finish initializing the vnode.  Special handling for shadow
	 * inodes: IFTOVT() would produce a v_type of VNON, which is not
	 * what we want, so set v_type to VREG explicitly in that case.
	 */
	if (ftype == IFSHAD) {
		vp->v_type = VREG;
	} else {
		vp->v_type = IFTOVT((mode_t)ip->i_mode);
	}

	ufs_reset_vnode(vp);

	/*
	 * read the shadow
	 */
	if (ftype != 0 && ip->i_shadow != 0) {
		if ((error = ufs_si_load(ip, cr)) != 0) {
			ip->i_flag |= ISTALE;
			ip->i_ufs_acl = NULL;
			rw_exit(&ip->i_contents);
			vp->v_vfsp = &EIO_vfs;
			VN_RELE(vp);
			return (error);
		}
	}

	/*
	 * Only attach quota information if the inode has a type and if
	 * that type is not a shadow inode.
	 */
	if (ip->i_mode && ((ip->i_mode & IFMT) != IFSHAD) &&
	    ((ip->i_mode & IFMT) != IFATTRDIR)) {
		ip->i_dquot = getinoquota(ip);
	}
	TRANS_MATA_IGET(ufsvfsp, ip);
	*ipp = ip;
	rw_exit(&ip->i_contents);

	return (0);
}

/*
 * Vnode is no longer referenced, write the inode out
 * and if necessary, truncate and deallocate the file.
 */
void
ufs_iinactive(struct inode *ip)
{
	int		front;
	struct inode	*iq;
	struct inode	*hip;
	struct ufs_q	*uq;
	struct vnode	*vp = ITOV(ip);
	struct ufsvfs	*ufsvfsp = ip->i_ufsvfs;
	struct ufs_delq_info *delq_info = &ufsvfsp->vfs_delete_info;

	/*
	 * Because the vnode type might have been changed,
	 * the dnlc_dir_purge must be called unconditionally.
	 */
	dnlc_dir_purge(&ip->i_danchor);

	/*
	 * Get exclusive access to inode data.
	 */
	rw_enter(&ip->i_contents, RW_WRITER);
	ASSERT(ip->i_flag & IREF);

	/*
	 * Make sure no one reclaimed the inode before we put it on
	 * the freelist or destroy it.  We keep our 'hold' on the vnode
	 * from vn_rele until we are ready to do something with the inode.
	 *
	 * Pageout may put a VN_HOLD/VN_RELE at any time during this
	 * operation via an async putpage, so we must make sure
	 * we don't free/destroy the inode more than once.  ufs_iget
	 * may also put a VN_HOLD on the inode before it grabs
	 * the i_contents lock.  This is done so we don't free
	 * an inode that a thread is waiting on.
	 */
	mutex_enter(&vp->v_lock);

	if (vp->v_count > 1) {
		VN_RELE_LOCKED(vp);
		mutex_exit(&vp->v_lock);
		rw_exit(&ip->i_contents);
		return;
	}
	mutex_exit(&vp->v_lock);

	/*
	 * For umount case: if ufsvfs ptr is NULL, the inode is unhashed
	 * and clean.  It can be safely destroyed (cyf).
	 */
	if (ip->i_ufsvfs == NULL) {
		rw_exit(&ip->i_contents);
		ufs_si_del(ip);
		ASSERT((vp->v_type == VCHR) || !vn_has_cached_data(vp));
		ufs_free_inode(ip);
		return;
	}

	/*
	 * queue idle inode to appropriate thread. Will check v_count == 1
	 * prior to putting this on the appropriate queue.
	 * Stale inodes will be unhashed and freed by the ufs idle thread
	 * in ufs_idle_free()
	 */
	front = 1;
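	/*
	 * (front stays 1 on this path, so the insertion below always puts
	 * the inode at the head of the delete queue.)
	 */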
	if ((ip->i_flag & ISTALE) == 0 && ip->i_fs->fs_ronly == 0 &&
	    ip->i_mode && ip->i_nlink <= 0) {
		/*
		 * Mark the i_flag to indicate that inode is being deleted.
		 * This flag will be cleared when the deletion is complete.
		 * This prevents nfs from sneaking in via ufs_vget() while
		 * the delete is in progress (bugid 1242481).
		 */
		ip->i_flag |= IDEL;

		/*
		 * NOIDEL means that deletes are not allowed at this time;
		 * whoever resets NOIDEL will also send this inode back
		 * through ufs_iinactive.  IREF remains set.
		 */
		if (ULOCKFS_IS_NOIDEL(ITOUL(ip))) {
			mutex_enter(&vp->v_lock);
			VN_RELE_LOCKED(vp);
			mutex_exit(&vp->v_lock);
			rw_exit(&ip->i_contents);
			return;
		}
		if (!TRANS_ISTRANS(ip->i_ufsvfs)) {
			rw_exit(&ip->i_contents);
			ufs_delete(ip->i_ufsvfs, ip, 0);
			return;
		}

		/* queue to delete thread; IREF remains set */
		ins.in_qfree.value.ul++;
		uq = &ip->i_ufsvfs->vfs_delete;

		mutex_enter(&uq->uq_mutex);

		/* add to q */
		if ((iq = uq->uq_ihead) != 0) {
			ip->i_freef = iq;
			ip->i_freeb = iq->i_freeb;
			iq->i_freeb->i_freef = ip;
			iq->i_freeb = ip;
			if (front)
				uq->uq_ihead = ip;
		} else {
			uq->uq_ihead = ip;
			ip->i_freef = ip;
			ip->i_freeb = ip;
		}

		delq_info->delq_unreclaimed_files += 1;
		delq_info->delq_unreclaimed_blocks += ip->i_blocks;
	} else {
		/*
		 * queue to idle thread
		 * Check the v_count == 1 again.
		 */
		mutex_enter(&vp->v_lock);
		if (vp->v_count > 1) {
			VN_RELE_LOCKED(vp);
			mutex_exit(&vp->v_lock);
			rw_exit(&ip->i_contents);
			return;
		}
		mutex_exit(&vp->v_lock);
		uq = &ufs_idle_q;

		/*
		 * useful iff it has pages or is a fastsymlink; otherwise junk
		 */
		mutex_enter(&uq->uq_mutex);

		/* clear IREF means `on idle list' */
		ip->i_flag &= ~(IREF | IDIRECTIO);

		if (vn_has_cached_data(vp) || ip->i_flag & IFASTSYMLNK) {
			ins.in_frback.value.ul++;
			hip = (inode_t *)&ufs_useful_iq[IQHASH(ip)];
			ufs_nuseful_iq++;
		} else {
			ins.in_frfront.value.ul++;
			hip = (inode_t *)&ufs_junk_iq[IQHASH(ip)];
			ip->i_flag |= IJUNKIQ;
			ufs_njunk_iq++;
		}
		ip->i_freef = hip;
		ip->i_freeb = hip->i_freeb;
		hip->i_freeb->i_freef = ip;
		hip->i_freeb = ip;
	}

	/* wakeup thread(s) if q is overfull */
	if (++uq->uq_ne == uq->uq_lowat)
		cv_broadcast(&uq->uq_cv);

	/* all done, release the q and inode */
	mutex_exit(&uq->uq_mutex);
	rw_exit(&ip->i_contents);
}

/*
 * Check accessed and update flags on an inode structure.
 * If any are on, update the inode with the (unique) current time.
 * If waitfor is given, ensure I/O ordering by waiting for the write
 * to complete.
 */
void
ufs_iupdat(struct inode *ip, int waitfor)
{
	struct buf	*bp;
	struct fs	*fp;
	struct dinode	*dp;
	struct ufsvfs	*ufsvfsp = ip->i_ufsvfs;
	int		i;
	int		do_trans_times;
	ushort_t	flag;
	o_uid_t		suid;
	o_gid_t		sgid;

	/*
	 * This function is now safe to be called with either the reader
	 * or writer i_contents lock.
	 */
	ASSERT(RW_LOCK_HELD(&ip->i_contents));

	/*
	 * Return if file system has been forcibly umounted.
	 */
	if (ufsvfsp == NULL)
		return;

	flag = ip->i_flag;	/* Atomic read */
	/*
	 * We better not update the disk inode from a stale inode.
	 */
	if (flag & ISTALE)
		return;

	fp = ip->i_fs;

	if ((flag & (IUPD|IACC|ICHG|IMOD|IMODACC|IATTCHG)) != 0) {
		if (fp->fs_ronly) {
			mutex_enter(&ip->i_tlock);
			ip->i_flag &= ~(IUPD|IACC|ICHG|IMOD|IMODACC|IATTCHG);
			mutex_exit(&ip->i_tlock);
			return;
		}
		/*
		 * fs is active while metadata is being written
		 */
		mutex_enter(&ufsvfsp->vfs_lock);
		ufs_notclean(ufsvfsp);
		/*
		 * get the dinode
		 */
		bp = UFS_BREAD(ufsvfsp, ip->i_dev,
		    (daddr_t)fsbtodb(fp, itod(fp, ip->i_number)),
		    (int)fp->fs_bsize);
		if (bp->b_flags & B_ERROR) {
			mutex_enter(&ip->i_tlock);
			ip->i_flag &=
			    ~(IUPD|IACC|ICHG|IMOD|IMODACC|IATTCHG);
			mutex_exit(&ip->i_tlock);
			brelse(bp);
			return;
		}
		/*
		 * munge inode fields
		 */
		mutex_enter(&ip->i_tlock);
		ITIMES_NOLOCK(ip);
		do_trans_times = ((ip->i_flag & (IMOD|IMODACC)) == IMODACC);
		ip->i_flag &= ~(IUPD|IACC|ICHG|IMOD|IMODACC|IATTCHG);
		mutex_exit(&ip->i_tlock);

		/*
		 * For reads and concurrent re-writes, no deltas were
		 * entered for the access time changes - do it now.
		 */
		if (do_trans_times) {
			TRANS_INODE_TIMES(ufsvfsp, ip);
		}

		/*
		 * For SunOS 5.0->5.4, these lines below read:
		 *
		 * suid = (ip->i_uid > MAXUID) ? UID_LONG : ip->i_uid;
		 * sgid = (ip->i_gid > MAXUID) ? GID_LONG : ip->i_gid;
		 *
		 * where MAXUID was set to 60002.  This was incorrect -
		 * the uids should have been constrained to what fitted into
		 * a 16-bit word.
		 *
		 * This means that files from 4.x filesystems that have an
		 * i_suid field larger than 60002 will have that field
		 * changed to 65535.
		 *
		 * Security note: 4.x UFS could never create a i_suid of
		 * UID_LONG since that would've corresponded to -1.
		 */
		suid = (ulong_t)ip->i_uid > (ulong_t)USHRT_MAX ?
		    UID_LONG : ip->i_uid;
		sgid = (ulong_t)ip->i_gid > (ulong_t)USHRT_MAX ?
		    GID_LONG : ip->i_gid;

		if ((ip->i_suid != suid) || (ip->i_sgid != sgid)) {
			ip->i_suid = suid;
			ip->i_sgid = sgid;
			TRANS_INODE(ufsvfsp, ip);
		}

		if ((ip->i_mode & IFMT) == IFBLK ||
		    (ip->i_mode & IFMT) == IFCHR) {
			dev_t d = ip->i_rdev;
			dev32_t dev32;

			/*
			 * load first direct block only if special device
			 */
			if (!cmpldev(&dev32, d)) {
				/*
				 * We panic here because there's "no way"
				 * we should have been able to create a large
				 * inode with a large dev_t.  Earlier layers
				 * should've caught this.
				 */
				panic("ip %p: i_rdev too big", (void *)ip);
			}

			if (dev32 & ~((O_MAXMAJ << L_BITSMINOR32) | O_MAXMIN)) {
				ip->i_ordev = dev32;	/* can't use old fmt. */
			} else {
				ip->i_ordev = cmpdev(d);
			}
		}

		/*
		 * copy inode to dinode (zero fastsymlnk in dinode)
		 */
		dp = (struct dinode *)bp->b_un.b_addr + itoo(fp, ip->i_number);
		dp->di_ic = ip->i_ic;	/* structure assignment */
		if (flag & IFASTSYMLNK) {
			for (i = 1; i < NDADDR; i++)
				dp->di_db[i] = 0;
			for (i = 0; i < NIADDR; i++)
				dp->di_ib[i] = 0;
		}
		if (TRANS_ISTRANS(ufsvfsp)) {
			/*
			 * Pass only a sector size buffer containing
			 * the inode, otherwise when the buffer is copied
			 * into a cached roll buffer then too much memory
			 * gets consumed if 8KB inode buffers are passed.
			 */
			TRANS_LOG(ufsvfsp, (caddr_t)dp, ip->i_doff,
			    sizeof (struct dinode),
			    (caddr_t)P2ALIGN((uintptr_t)dp, DEV_BSIZE),
			    DEV_BSIZE);

			brelse(bp);
		} else if (waitfor && (ip->i_ufsvfs->vfs_dio == 0)) {
			UFS_BRWRITE(ufsvfsp, bp);

			/*
			 * Synchronous write has guaranteed that inode
			 * has been written on disk so clear the flag
			 */
			mutex_enter(&ip->i_tlock);
			ip->i_flag &= ~IBDWRITE;
			mutex_exit(&ip->i_tlock);
		} else {
			bdrwrite(bp);

			/*
			 * This write hasn't guaranteed that the inode has
			 * been written to disk.
			 * Since all update flags on the inode are cleared,
			 * we must remember the condition in case the inode
			 * is to be updated synchronously later (e.g.
			 * fsync()/fdatasync()) and the inode has not been
			 * modified yet.
			 */
			mutex_enter(&ip->i_tlock);
			ip->i_flag |= IBDWRITE;
			mutex_exit(&ip->i_tlock);
		}
	} else {
		/*
		 * In case previous inode update was done asynchronously
		 * (IBDWRITE) and this inode update request wants guaranteed
		 * (synchronous) disk update, flush the inode.
		 */
		if (waitfor && (flag & IBDWRITE)) {
			blkflush(ip->i_dev,
			    (daddr_t)fsbtodb(fp, itod(fp, ip->i_number)));
			mutex_enter(&ip->i_tlock);
			ip->i_flag &= ~IBDWRITE;
			mutex_exit(&ip->i_tlock);
		}
	}
}

#define	SINGLE	0	/* index of single indirect block */
#define	DOUBLE	1	/* index of double indirect block */
#define	TRIPLE	2	/* index of triple indirect block */

/*
 * Release blocks associated with the inode ip and
 * stored in the indirect block bn.  Blocks are free'd
 * in LIFO order up to (but not including) lastbn.  If
 * level is greater than SINGLE, the block is an indirect
 * block and recursive calls to indirtrunc must be used to
 * cleanse other indirect blocks.
 *
 * N.B.: triple indirect blocks are untested.
 */
static long
indirtrunc(struct inode *ip, daddr_t bn, daddr_t lastbn, int level, int flags)
{
	int i;
	struct buf *bp, *copy;
	daddr32_t *bap;
	struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
	struct fs *fs = ufsvfsp->vfs_fs;
	daddr_t nb, last;
	long factor;
	int blocksreleased = 0, nblocks;

	ASSERT(RW_WRITE_HELD(&ip->i_contents));
	/*
	 * Calculate index in current block of last
	 * block to be kept.  -1 indicates the entire
	 * block so we need not calculate the index.
	 */
	factor = 1;
	for (i = SINGLE; i < level; i++)
		factor *= NINDIR(fs);
	last = lastbn;
	if (lastbn > 0)
		last /= factor;
	nblocks = btodb(fs->fs_bsize);
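	/*
	 * (nblocks is the number of DEV_BSIZE sectors per filesystem
	 * block; the i_blocks accounting below is kept in those units.)
	 */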
	/*
	 * Get buffer of block pointers, zero those
	 * entries corresponding to blocks to be free'd,
	 * and update on disk copy first.
	 * *Unless* the root pointer has been synchronously
	 * written to disk.  If nothing points to this
	 * indirect block then don't bother zero'ing and
	 * writing it.
	 */
	bp = UFS_BREAD(ufsvfsp,
	    ip->i_dev, (daddr_t)fsbtodb(fs, bn), (int)fs->fs_bsize);
	if (bp->b_flags & B_ERROR) {
		brelse(bp);
		return (0);
	}
	bap = bp->b_un.b_daddr;
	if ((flags & I_CHEAP) == 0) {
		uint_t	zb;

		zb = (uint_t)((NINDIR(fs) - (last + 1)) * sizeof (daddr32_t));

		if (zb) {
			/*
			 * push any data into the log before we zero it
			 */
			if (bp->b_flags & B_DELWRI)
				TRANS_LOG(ufsvfsp, (caddr_t)bap,
				    ldbtob(bp->b_blkno), bp->b_bcount,
				    bp->b_un.b_addr, bp->b_bcount);
			copy = ngeteblk(fs->fs_bsize);
			bcopy((caddr_t)bap, (caddr_t)copy->b_un.b_daddr,
			    (uint_t)fs->fs_bsize);
			bzero((caddr_t)&bap[last + 1], zb);

			TRANS_BUF(ufsvfsp,
			    (caddr_t)&bap[last + 1] - (caddr_t)bap,
			    zb, bp, DT_ABZERO);

			UFS_BRWRITE(ufsvfsp, bp);
			bp = copy, bap = bp->b_un.b_daddr;
		}
	} else {
		/* make sure write retries are also cleared */
		bp->b_flags &= ~(B_DELWRI | B_RETRYWRI);
		bp->b_flags |= B_STALE | B_AGE;
	}

	/*
	 * Recursively free totally unused blocks.
	 */
	flags |= I_CHEAP;
	for (i = NINDIR(fs) - 1; i > last; i--) {
		nb = bap[i];
		if (nb == 0)
			continue;
		if (level > SINGLE) {
			blocksreleased +=
			    indirtrunc(ip, nb, (daddr_t)-1, level - 1, flags);
			free(ip, nb, (off_t)fs->fs_bsize, flags | I_IBLK);
		} else
			free(ip, nb, (off_t)fs->fs_bsize, flags);
		blocksreleased += nblocks;
	}
	flags &= ~I_CHEAP;

	/*
	 * Recursively free last partial block.
	 */
	if (level > SINGLE && lastbn >= 0) {
		last = lastbn % factor;
		nb = bap[i];
		if (nb != 0)
			blocksreleased +=
			    indirtrunc(ip, nb, last, level - 1, flags);
	}
	brelse(bp);
	return (blocksreleased);
}

/*
 * Truncate the inode ip to at most length size.
 * Free affected disk blocks -- the blocks of the
 * file are removed in reverse order.
 *
 * N.B.: triple indirect blocks are untested.
 */
static int i_genrand = 1234;
int
ufs_itrunc(struct inode *oip, u_offset_t length, int flags, cred_t *cr)
{
	struct fs *fs = oip->i_fs;
	struct ufsvfs *ufsvfsp = oip->i_ufsvfs;
	struct inode *ip;
	daddr_t lastblock;
	off_t bsize;
	int boff;
	daddr_t bn, lastiblock[NIADDR];
	int level;
	long nblocks, blocksreleased = 0;
	int i;
	ushort_t mode;
	struct inode tip;
	int err;
	u_offset_t maxoffset = (ufsvfsp->vfs_lfflags & UFS_LARGEFILES) ?
	    (UFS_MAXOFFSET_T) : (MAXOFF32_T);

	/*
	 * Shadow inodes do not need to hold the vfs_dqrwlock lock.  Most
	 * other uses need the reader lock.  opendq() holds the writer lock.
	 */
	ASSERT((oip->i_mode & IFMT) == IFSHAD ||
	    RW_LOCK_HELD(&ufsvfsp->vfs_dqrwlock));
	ASSERT(RW_WRITE_HELD(&oip->i_contents));
	/*
	 * We only allow truncation of regular files and directories
	 * to arbitrary lengths here.  In addition, we allow symbolic
	 * links to be truncated only to zero length.  Other inode
	 * types cannot have their length set here.  Disk blocks are
	 * being dealt with - especially device inodes where
	 * ip->i_ordev is actually being stored in ip->i_db[0]!
	 */
	TRANS_INODE(ufsvfsp, oip);
	mode = oip->i_mode & IFMT;
	if (flags & I_FREE) {
		i_genrand *= 16843009;	/* turns into shift and adds */
		i_genrand++;
		oip->i_gen += ((i_genrand + ddi_get_lbolt()) & 0xffff) + 1;
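		/*
		 * (Bumping i_gen when the inode is freed means a later
		 * reuse of this inode number carries a new generation, so
		 * stale file handles, e.g. NFS ones, that name the old
		 * file are rejected.)
		 */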
		oip->i_flag |= ICHG |IUPD;
		oip->i_seq++;
		if (length == oip->i_size)
			return (0);
		flags |= I_CHEAP;
	}
	if (mode == IFIFO)
		return (0);
	if (mode != IFREG && mode != IFDIR && mode != IFATTRDIR &&
	    !(mode == IFLNK && length == (offset_t)0) && mode != IFSHAD)
		return (EINVAL);
	if (length > maxoffset)
		return (EFBIG);
	if ((mode == IFDIR) || (mode == IFATTRDIR))
		flags |= I_DIR;
	if (mode == IFSHAD)
		flags |= I_SHAD;
	if (oip == ufsvfsp->vfs_qinod)
		flags |= I_QUOTA;
	if (length == oip->i_size) {
		/* update ctime and mtime to please POSIX tests */
		oip->i_flag |= ICHG |IUPD;
		oip->i_seq++;
		if (length == 0) {
			/* nothing to cache so clear the flag */
			oip->i_flag &= ~IFASTSYMLNK;
		}
		return (0);
	}
	/* wipe out fast symlink till next access */
	if (oip->i_flag & IFASTSYMLNK) {
		int j;

		ASSERT(ITOV(oip)->v_type == VLNK);

		oip->i_flag &= ~IFASTSYMLNK;

		for (j = 1; j < NDADDR; j++)
			oip->i_db[j] = 0;
		for (j = 0; j < NIADDR; j++)
			oip->i_ib[j] = 0;
	}

	boff = (int)blkoff(fs, length);

	if (length > oip->i_size) {
		/*
		 * Trunc up case.  BMAPALLOC will ensure that the right blocks
		 * are allocated.  This includes extending the old frag to a
		 * full block (if needed) in addition to doing any work
		 * needed for allocating the last block.
		 */
		if (boff == 0)
			err = BMAPALLOC(oip, length - 1, (int)fs->fs_bsize, cr);
		else
			err = BMAPALLOC(oip, length - 1, boff, cr);

		if (err == 0) {
			/*
			 * Save old size and set inode's size now
			 * so that we don't cause too much of the
			 * file to be zero'd and pushed.
			 */
			u_offset_t osize = oip->i_size;
			oip->i_size = length;
			/*
			 * Make sure we zero out the remaining bytes of
			 * the page in case a mmap scribbled on it.  We
			 * can't prevent a mmap from writing beyond EOF
			 * on the last page of a file.
			 */
			if ((boff = (int)blkoff(fs, osize)) != 0) {
				bsize = (int)lblkno(fs, osize - 1) >= NDADDR ?
				    fs->fs_bsize : fragroundup(fs, boff);
				pvn_vpzero(ITOV(oip), osize,
				    (size_t)(bsize - boff));
			}
			oip->i_flag |= ICHG|IATTCHG;
			oip->i_seq++;
			ITIMES_NOLOCK(oip);
			/*
			 * MAXOFF32_T is old 2GB size limit.  If
			 * this operation caused a large file to be
			 * created, turn on the superblock flag
			 * and update the superblock, if the flag
			 * is not already on.
			 */
			if ((length > (u_offset_t)MAXOFF32_T) &&
			    !(fs->fs_flags & FSLARGEFILES)) {
				ASSERT(ufsvfsp->vfs_lfflags & UFS_LARGEFILES);
				mutex_enter(&ufsvfsp->vfs_lock);
				fs->fs_flags |= FSLARGEFILES;
				ufs_sbwrite(ufsvfsp);
				mutex_exit(&ufsvfsp->vfs_lock);
			}
		}

		return (err);
	}

	/*
	 * Update the pages of the file.  If the file is not being
	 * truncated to a block boundary, the contents of the
	 * pages following the end of the file must be zero'ed
	 * in case it ever becomes accessible again because
	 * of subsequent file growth.
	 */
	if (boff == 0) {
		(void) pvn_vplist_dirty(ITOV(oip), length, ufs_putapage,
		    B_INVAL | B_TRUNC, CRED());
	} else {
		/*
		 * Make sure that the last block is properly allocated.
		 * We only really have to do this if the last block is
		 * actually allocated since ufs_bmap will now handle the case
		 * of a fragment which has no block allocated.  Just to
		 * be sure, we do it now independent of current allocation.
		 */
		err = BMAPALLOC(oip, length - 1, boff, cr);
		if (err)
			return (err);

		/*
		 * BMAPALLOC will call bmap_write which defers i_seq
		 * processing.  If the timestamps were changed, update
		 * i_seq before rdip drops i_contents or syncs the inode.
		 */
		if (oip->i_flag & (ICHG|IUPD))
			oip->i_seq++;

		/*
		 * BugId 4069932
		 * Make sure that the relevant partial page appears in
		 * the v_pages list, so that pvn_vpzero() will do its
		 * job.  Since doing this correctly requires everything
		 * in rdip() except for the uiomove(), it's easier and
		 * safer to do the uiomove() rather than duplicate the
		 * rest of rdip() here.
		 *
		 * To get here, we know that length indicates a byte
		 * that is not the first byte of a block.  (length - 1)
		 * is the last actual byte known to exist.  Deduction
		 * shows it is in the same block as byte (length).
		 * Thus, this rdip() invocation should always succeed
		 * except in the face of i/o errors, and give us the
		 * block we care about.
		 *
		 * rdip() makes the same locking assertions and
		 * assumptions as we do.  We do not acquire any locks
		 * before calling it, so we have not changed the locking
		 * situation.  Finally, there do not appear to be any
		 * paths whereby rdip() ends up invoking us again.
		 * Thus, infinite recursion is avoided.
		 */
		{
			uio_t uio;
			iovec_t iov[1];
			char buffer;

			uio.uio_iov = iov;
			uio.uio_iovcnt = 1;
			uio.uio_loffset = length - 1;
			uio.uio_resid = 1;
			uio.uio_segflg = UIO_SYSSPACE;
			uio.uio_extflg = UIO_COPY_CACHED;

			iov[0].iov_base = &buffer;
			iov[0].iov_len = 1;

			err = rdip(oip, &uio, UIO_READ, NULL);
			if (err)
				return (err);
		}

		bsize = (int)lblkno(fs, length - 1) >= NDADDR ?
		    fs->fs_bsize : fragroundup(fs, boff);
		pvn_vpzero(ITOV(oip), length, (size_t)(bsize - boff));
		/*
		 * Ensure full fs block is marked as dirty.
		 */
		(void) pvn_vplist_dirty(ITOV(oip), length + (bsize - boff),
		    ufs_putapage, B_INVAL | B_TRUNC, CRED());
	}

	/*
	 * Calculate index into inode's block list of
	 * last direct and indirect blocks (if any)
	 * which we want to keep.  Lastblock is -1 when
	 * the file is truncated to 0.
	 */
	lastblock = lblkno(fs, length + fs->fs_bsize - 1) - 1;
	lastiblock[SINGLE] = lastblock - NDADDR;
	lastiblock[DOUBLE] = lastiblock[SINGLE] - NINDIR(fs);
	lastiblock[TRIPLE] = lastiblock[DOUBLE] - NINDIR(fs) * NINDIR(fs);
	nblocks = btodb(fs->fs_bsize);
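	/*
	 * (Worked example: with an 8K block size, NINDIR() is
	 * 8192 / sizeof (daddr32_t) = 2048, so the direct blocks cover
	 * file blocks 0..NDADDR-1, the single indirect the next 2048,
	 * the double indirect the 2048^2 after that, and so on; the
	 * lastiblock[] values above are indices into those ranges.)
	 */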

	/*
	 * Update file and block pointers
	 * on disk before we start freeing blocks.
	 * If we crash before free'ing blocks below,
	 * the blocks will be returned to the free list.
	 * lastiblock values are also normalized to -1
	 * for calls to indirtrunc below.
	 */
	tip = *oip;			/* structure copy */
	ip = &tip;

	for (level = TRIPLE; level >= SINGLE; level--)
		if (lastiblock[level] < 0) {
			oip->i_ib[level] = 0;
			lastiblock[level] = -1;
		}
	for (i = NDADDR - 1; i > lastblock; i--) {
		oip->i_db[i] = 0;
		flags |= I_CHEAP;
	}
	oip->i_size = length;
	oip->i_flag |= ICHG|IUPD|IATTCHG;
	oip->i_seq++;
	if (!TRANS_ISTRANS(ufsvfsp))
		ufs_iupdat(oip, I_SYNC);	/* do sync inode update */

	/*
	 * Indirect blocks first.
	 */
	for (level = TRIPLE; level >= SINGLE; level--) {
		bn = ip->i_ib[level];
		if (bn != 0) {
			blocksreleased +=
			    indirtrunc(ip, bn, lastiblock[level], level, flags);
			if (lastiblock[level] < 0) {
				ip->i_ib[level] = 0;
				free(ip, bn, (off_t)fs->fs_bsize,
				    flags | I_IBLK);
				blocksreleased += nblocks;
			}
		}
		if (lastiblock[level] >= 0)
			goto done;
	}

	/*
	 * All whole direct blocks or frags.
	 */
	for (i = NDADDR - 1; i > lastblock; i--) {
		bn = ip->i_db[i];
		if (bn == 0)
			continue;
		ip->i_db[i] = 0;
		bsize = (off_t)blksize(fs, ip, i);
		free(ip, bn, bsize, flags);
		blocksreleased += btodb(bsize);
	}
	if (lastblock < 0)
		goto done;

	/*
	 * Finally, look for a change in size of the
	 * last direct block; release any frags.
	 */
	bn = ip->i_db[lastblock];
	if (bn != 0) {
		off_t oldspace, newspace;

		/*
		 * Calculate amount of space we're giving
		 * back as old block size minus new block size.
		 */
		oldspace = blksize(fs, ip, lastblock);
		UFS_SET_ISIZE(length, ip);
		newspace = blksize(fs, ip, lastblock);
		if (newspace == 0) {
			err = ufs_fault(ITOV(ip), "ufs_itrunc: newspace == 0");
			return (err);
		}
		if (oldspace - newspace > 0) {
			/*
			 * Block number of space to be free'd is
			 * the old block # plus the number of frags
			 * required for the storage we're keeping.
			 */
			bn += numfrags(fs, newspace);
			free(ip, bn, oldspace - newspace, flags);
			blocksreleased += btodb(oldspace - newspace);
		}
	}
done:
	/* BEGIN PARANOIA */
	for (level = SINGLE; level <= TRIPLE; level++)
		if (ip->i_ib[level] != oip->i_ib[level]) {
			err = ufs_fault(ITOV(ip), "ufs_itrunc: indirect block");
			return (err);
		}

	for (i = 0; i < NDADDR; i++)
		if (ip->i_db[i] != oip->i_db[i]) {
			err = ufs_fault(ITOV(ip), "ufs_itrunc: direct block");
			return (err);
		}
	/* END PARANOIA */
	oip->i_blocks -= blocksreleased;

	if (oip->i_blocks < 0) {	/* sanity */
		cmn_err(CE_NOTE,
		    "ufs_itrunc: %s/%d new size = %lld, blocks = %d\n",
		    fs->fs_fsmnt, (int)oip->i_number, oip->i_size,
		    (int)oip->i_blocks);
		oip->i_blocks = 0;
	}
	oip->i_flag |= ICHG|IATTCHG;
	oip->i_seq++;
	/* blocksreleased is >= zero, so this can not fail */
	(void) chkdq(oip, -blocksreleased, 0, cr, (char **)NULL,
	    (size_t *)NULL);
	return (0);
}

/*
 * Check mode permission on inode.  Mode is READ, WRITE or EXEC.
 * In the case of WRITE, the read-only status of the file system
 * is checked.  Depending on the calling user, the appropriate
 * mode bits are selected; privileges to override missing permission
 * bits are checked through secpolicy_vnode_access().
 * The i_contents lock must be held as reader here to prevent racing with
 * the acl subsystem removing/setting/changing acls on this inode.
 * The caller is responsible for indicating whether or not the i_contents
 * lock needs to be acquired here or if already held.
 */
int
ufs_iaccess(struct inode *ip, int mode, struct cred *cr, int dolock)
{
	int shift = 0;
	int ret = 0;

	if (dolock)
		rw_enter(&ip->i_contents, RW_READER);
	ASSERT(RW_LOCK_HELD(&ip->i_contents));

	if (mode & IWRITE) {
		/*
		 * Disallow write attempts on read-only
		 * file systems, unless the file is a block
		 * or character device or a FIFO.
		 */
		if (ip->i_fs->fs_ronly != 0) {
			if ((ip->i_mode & IFMT) != IFCHR &&
			    (ip->i_mode & IFMT) != IFBLK &&
			    (ip->i_mode & IFMT) != IFIFO) {
				ret = EROFS;
				goto out;
			}
		}
	}
	/*
	 * If there is an acl, check the acl and return.
	 */
	if (ip->i_ufs_acl && ip->i_ufs_acl->aowner) {
		ret = ufs_acl_access(ip, mode, cr);
		goto out;
	}

	/*
	 * Access check is based on only one of owner, group, public.
	 * If not owner, then check group.
	 * If not a member of the group, then check public access.
	 */
	if (crgetuid(cr) != ip->i_uid) {
		shift += 3;
		if (!groupmember((uid_t)ip->i_gid, cr))
			shift += 3;
	}
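	/*
	 * (A shift of 0, 3, or 6 moves the owner, group, or other
	 * permission triplet of i_mode into the bit positions that the
	 * policy check below examines.)
	 */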

	/* test missing privilege bits */
	ret = secpolicy_vnode_access2(cr, ITOV(ip), ip->i_uid,
	    ip->i_mode << shift, mode);
out:
	if (dolock)
		rw_exit(&ip->i_contents);
	return (ret);
}

/*
 * if necessary, remove an inode from the free list
 * i_contents is held except at unmount
 *
 * Return 1 if the inode is taken off of the ufs_idle_q,
 * and the caller is expected to call VN_RELE.
 *
 * Return 0 otherwise.
 */
int
ufs_rmidle(struct inode *ip)
{
	int rval = 0;

	mutex_enter(&ip->i_tlock);
	if ((ip->i_flag & IREF) == 0) {
		mutex_enter(&ufs_idle_q.uq_mutex);
		ip->i_freef->i_freeb = ip->i_freeb;
		ip->i_freeb->i_freef = ip->i_freef;
		ip->i_freef = ip;
		ip->i_freeb = ip;
		ip->i_flag |= IREF;
		ufs_idle_q.uq_ne--;
		if (ip->i_flag & IJUNKIQ) {
			ufs_njunk_iq--;
			ip->i_flag &= ~IJUNKIQ;
		} else {
			ufs_nuseful_iq--;
		}
		mutex_exit(&ufs_idle_q.uq_mutex);
		rval = 1;
	}
	mutex_exit(&ip->i_tlock);
	return (rval);
}

/*
 * scan the hash of inodes and call func with the inode locked
 */
int
ufs_scan_inodes(int rwtry, int (*func)(struct inode *, void *), void *arg,
    struct ufsvfs *ufsvfsp)
{
	struct inode		*ip;		/* current inode */
	struct inode		*lip = NULL;	/* last/previous inode */
	union ihead		*ih;		/* current hash chain */
	int			error, i;
	int			saverror = 0;
	int			lip_held;	/* lip needs a VN_RELE() */

	/*
	 * If ufsvfsp is NULL, then our caller should be holding
	 * ufs_scan_lock to avoid conflicts between ufs_unmount() and
	 * ufs_update().  Otherwise, to avoid false-positives in
	 * ufs_unmount()'s v_count-based EBUSY check, we only hold
	 * those inodes that are in the file system our caller cares
	 * about.
	 *
	 * We know that ip is a valid inode in the hash chain (and thus
	 * we can trust i_ufsvfs) because the inode we chained from
	 * (lip) is still in the hash chain.  This is true because either:
	 *
	 * 1. We did not drop the hash chain lock since the last
	 * iteration (because we were not interested in the last inode),
	 * or
	 * 2. We maintained a hold on the last inode while we
	 * were processing it, so it could not be removed
	 * from the hash chain.
	 *
	 * The whole reason we're dropping and re-grabbing the chain
	 * lock on every inode is so that we don't present a major
	 * choke point on throughput, particularly when we've been
	 * called on behalf of fsflush.
	 */

	for (i = 0, ih = ihead; i < inohsz; i++, ih++) {
		mutex_enter(&ih_lock[i]);
		for (ip = ih->ih_chain[0], lip_held = 0;
		    ip != (struct inode *)ih;
		    ip = lip->i_forw) {

			ins.in_scan.value.ul++;

			/*
			 * Undo the previous iteration's VN_HOLD(), but
			 * only if one was done.
			 */
			if (lip_held)
				VN_RELE(ITOV(lip));

			lip = ip;
			if (ufsvfsp != NULL && ip->i_ufsvfs != ufsvfsp) {
				/*
				 * We're not processing all inodes, and
				 * this inode is not in the filesystem of
				 * interest, so skip it.  No need to do a
				 * VN_HOLD() since we're not dropping the
				 * hash chain lock until after we've
				 * done the i_forw traversal above.
				 */
				lip_held = 0;
				continue;
			}
			VN_HOLD(ITOV(ip));
			lip_held = 1;
			mutex_exit(&ih_lock[i]);

			/*
			 * Acquire the contents lock as writer to make
			 * sure that the inode has been initialized in
			 * the cache or removed from the idle list by
			 * ufs_iget().  This works because ufs_iget()
			 * acquires the contents lock before putting
			 * the inode into the cache.  If we can lock
			 * it, then ufs_iget() is done with it.
			 */

			if (rwtry) {
				if (!rw_tryenter(&ip->i_contents, RW_WRITER)) {
					mutex_enter(&ih_lock[i]);
					continue;
				}
			} else {
				rw_enter(&ip->i_contents, RW_WRITER);
			}

			rw_exit(&ip->i_contents);

			/*
			 * ISTALE means the inode couldn't be read
			 *
			 * We don't have to hold the i_contents lock
			 * for this check for a couple of
			 * reasons.  First, if ISTALE is set then the
			 * flag cannot be cleared until the inode is
			 * removed from the cache and that cannot
			 * happen until after we VN_RELE() it.
			 * Second, if ISTALE is not set, then the
			 * inode is in the cache and does not need to
			 * be read from disk so ISTALE cannot be set
			 * while we are not looking.
			 */
			if ((ip->i_flag & ISTALE) == 0) {
				if ((error = (*func)(ip, arg)) != 0)
					saverror = error;
			}

			mutex_enter(&ih_lock[i]);
		}
		if (lip_held)
			VN_RELE(ITOV(lip));
		mutex_exit(&ih_lock[i]);
	}
	return (saverror);
}

/*
 * Mark inode with the current time, plus a unique increment.
 *
 * Since we only keep 32-bit time on disk, if UFS is still alive
 * beyond 2038, filesystem times will simply stick at the last
 * possible second of 32-bit time.  Not ideal, but probably better
 * than going into the remote past, or confusing applications with
 * negative time.
 */
void
ufs_imark(struct inode *ip)
{
	timestruc_t now;
	int32_t usec, nsec;

	/*
	 * The update of i_seq may have been deferred, increase i_seq here
	 * to make sure it is in sync with the timestamps.
	 */
	if (ip->i_flag & ISEQ) {
		ASSERT(ip->i_flag & (IUPD|ICHG));
		ip->i_seq++;
		ip->i_flag &= ~ISEQ;
	}

	gethrestime(&now);

	/*
	 * Fast algorithm to convert nsec to usec -- see hrt2ts()
	 * in common/os/timers.c for a full description.
	 */
	nsec = now.tv_nsec;
	usec = nsec + (nsec >> 2);
	usec = nsec + (usec >> 1);
	usec = nsec + (usec >> 2);
	usec = nsec + (usec >> 4);
	usec = nsec - (usec >> 3);
	usec = nsec + (usec >> 2);
	usec = nsec + (usec >> 3);
	usec = nsec + (usec >> 4);
	usec = nsec + (usec >> 1);
	usec = nsec + (usec >> 6);
	usec = usec >> 10;
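	/* (net effect: usec ~= nsec / 1000, without a hardware divide) */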

	mutex_enter(&ufs_iuniqtime_lock);
	if (now.tv_sec > (time_t)iuniqtime.tv_sec ||
	    usec > iuniqtime.tv_usec) {
		if (now.tv_sec < TIME32_MAX) {
			iuniqtime.tv_sec = (time32_t)now.tv_sec;
			iuniqtime.tv_usec = usec;
		}
	} else {
		if (iuniqtime.tv_sec < TIME32_MAX) {
			iuniqtime.tv_usec++;
			/* Check for usec overflow */
			if (iuniqtime.tv_usec >= MICROSEC) {
				iuniqtime.tv_sec++;
				iuniqtime.tv_usec = 0;
			}
		}
	}

	if ((ip->i_flag & IACC) && !(ip->i_ufsvfs->vfs_noatime)) {
		ip->i_atime = iuniqtime;
	}
	if (ip->i_flag & IUPD) {
		ip->i_mtime = iuniqtime;
		ip->i_flag |= IMODTIME;
	}
	if (ip->i_flag & ICHG) {
		ip->i_diroff = 0;
		ip->i_ctime = iuniqtime;
	}
	mutex_exit(&ufs_iuniqtime_lock);
}

/*
 * Update timestamps in inode.
 */
void
ufs_itimes_nolock(struct inode *ip)
{

	/*
	 * if noatime is set and the inode access time is the only field that
	 * must be changed, exit immediately.
	 */
	if (((ip->i_flag & (IUPD|IACC|ICHG)) == IACC) &&
	    (ip->i_ufsvfs->vfs_noatime)) {
		return;
	}

	if (ip->i_flag & (IUPD|IACC|ICHG)) {
		if (ip->i_flag & ICHG)
			ip->i_flag |= IMOD;
		else
			ip->i_flag |= IMODACC;
		ufs_imark(ip);
		ip->i_flag &= ~(IACC|IUPD|ICHG);
	}
}