1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright 2024 Oxide Computer Company
24 */
25
26 #include <sys/types.h>
27 #include <sys/t_lock.h>
28 #include <sys/param.h>
29 #include <sys/time.h>
30 #include <sys/systm.h>
31 #include <sys/sysmacros.h>
32 #include <sys/resource.h>
33 #include <sys/signal.h>
34 #include <sys/cred.h>
35 #include <sys/user.h>
36 #include <sys/buf.h>
37 #include <sys/vfs.h>
38 #include <sys/vnode.h>
39 #include <sys/proc.h>
40 #include <sys/disp.h>
41 #include <sys/file.h>
42 #include <sys/fcntl.h>
43 #include <sys/flock.h>
44 #include <sys/atomic.h>
45 #include <sys/kmem.h>
46 #include <sys/uio.h>
47 #include <sys/conf.h>
48 #include <sys/mman.h>
49 #include <sys/pathname.h>
50 #include <sys/debug.h>
51 #include <sys/vmsystm.h>
52 #include <sys/cmn_err.h>
53 #include <sys/filio.h>
54 #include <sys/dnlc.h>
55
56 #include <sys/fs/ufs_filio.h>
57 #include <sys/fs/ufs_lockfs.h>
58 #include <sys/fs/ufs_fs.h>
59 #include <sys/fs/ufs_inode.h>
60 #include <sys/fs/ufs_fsdir.h>
61 #include <sys/fs/ufs_quota.h>
62 #include <sys/fs/ufs_trans.h>
63 #include <sys/fs/ufs_log.h>
64 #include <sys/dirent.h> /* must be AFTER <sys/fs/fsdir.h>! */
65 #include <sys/errno.h>
66 #include <sys/sysinfo.h>
67
68 #include <vm/hat.h>
69 #include <vm/page.h>
70 #include <vm/pvn.h>
71 #include <vm/as.h>
72 #include <vm/seg.h>
73 #include <vm/seg_map.h>
74 #include <vm/seg_vn.h>
75 #include <vm/rm.h>
76 #include <sys/swap.h>
77 #include <sys/model.h>
78 #include <sys/policy.h>
79
80 #include "fs/fs_subr.h"
81
82 /*
83 * ufs_fioio is the ufs equivalent of NFS_CNVT and is tailored to
84 * metamucil's needs. It may change at any time.
85 */
86 /* ARGSUSED */
87 int
ufs_fioio(struct vnode * vp,struct fioio * fiou,int flag,struct cred * cr)88 ufs_fioio(
89 struct vnode *vp, /* any file on the fs */
90 struct fioio *fiou, /* fioio struct in userland */
91 int flag, /* flag from VOP_IOCTL() */
92 struct cred *cr) /* credentials from ufs_ioctl */
93 {
94 int error = 0;
95 struct vnode *vpio = NULL; /* vnode for inode open */
96 struct inode *ipio = NULL; /* inode for inode open */
97 struct file *fpio = NULL; /* file for inode open */
98 struct inode *ip; /* inode for file system */
99 struct fs *fs; /* fs for file system */
100 STRUCT_DECL(fioio, fio); /* copy of user's fioio struct */
101
102 /*
103 * must be privileged
104 */
105 if (secpolicy_fs_config(cr, vp->v_vfsp) != 0)
106 return (EPERM);
107
108 STRUCT_INIT(fio, flag & DATAMODEL_MASK);
109
110 /*
111 * get user's copy of fioio struct
112 */
113 if (copyin(fiou, STRUCT_BUF(fio), STRUCT_SIZE(fio)))
114 return (EFAULT);
115
116 ip = VTOI(vp);
117 fs = ip->i_fs;
118
119 /*
120 * check the inode number against the fs's inode number bounds
121 */
122 if (STRUCT_FGET(fio, fio_ino) < UFSROOTINO)
123 return (ESRCH);
124 if (STRUCT_FGET(fio, fio_ino) >= fs->fs_ncg * fs->fs_ipg)
125 return (ESRCH);
126
127 rw_enter(&ip->i_ufsvfs->vfs_dqrwlock, RW_READER);
128
129 /*
130 * get the inode
131 */
132 error = ufs_iget(ip->i_vfs, STRUCT_FGET(fio, fio_ino), &ipio, cr);
133
134 rw_exit(&ip->i_ufsvfs->vfs_dqrwlock);
135
136 if (error)
137 return (error);
138
139 /*
140 * check the generation number
141 */
142 rw_enter(&ipio->i_contents, RW_READER);
143 if (ipio->i_gen != STRUCT_FGET(fio, fio_gen)) {
144 error = ESTALE;
145 rw_exit(&ipio->i_contents);
146 goto errout;
147 }
148
149 /*
150 * check if the inode is free
151 */
152 if (ipio->i_mode == 0) {
153 error = ENOENT;
154 rw_exit(&ipio->i_contents);
155 goto errout;
156 }
157 rw_exit(&ipio->i_contents);
158
159 /*
160 * Adapted from copen: get a file struct
161 * Large Files: We open this file descriptor with FOFFMAX flag
162 * set so that it will be like a large file open.
163 */
164 if (falloc(NULL, (FREAD|FOFFMAX), &fpio, STRUCT_FADDR(fio, fio_fd)))
165 goto errout;
166
167 /*
168 * Adapted from vn_open: check access and then open the file
169 */
170 vpio = ITOV(ipio);
171 if (error = VOP_ACCESS(vpio, VREAD, 0, cr, NULL))
172 goto errout;
173
174 if (error = VOP_OPEN(&vpio, FREAD, cr, NULL))
175 goto errout;
176
177 /*
178 * Adapted from copen: initialize the file struct
179 */
180 fpio->f_vnode = vpio;
181
182 /*
183 * return the fd
184 */
185 if (copyout(STRUCT_BUF(fio), fiou, STRUCT_SIZE(fio))) {
186 error = EFAULT;
187 goto errout;
188 }
189 setf(STRUCT_FGET(fio, fio_fd), fpio);
190 mutex_exit(&fpio->f_tlock);
191 return (0);
192 errout:
193 /*
194 * free the file struct and fd
195 */
196 if (fpio) {
197 setf(STRUCT_FGET(fio, fio_fd), NULL);
198 unfalloc(fpio);
199 }
200
201 /*
202 * release the hold on the inode
203 */
204 if (ipio)
205 VN_RELE(ITOV(ipio));
206 return (error);
207 }
208
209 /*
210 * ufs_fiosatime
211 * set access time w/o altering change time. This ioctl is tailored
212 * to metamucil's needs and may change at any time.
213 */
214 int
ufs_fiosatime(struct vnode * vp,struct timeval * tvu,int flag,struct cred * cr)215 ufs_fiosatime(
216 struct vnode *vp, /* file's vnode */
217 struct timeval *tvu, /* struct timeval in userland */
218 int flag, /* flag from VOP_IOCTL() */
219 struct cred *cr) /* credentials from ufs_ioctl */
220 {
221 struct inode *ip; /* inode for vp */
222 struct timeval32 tv; /* copy of user's timeval */
223 int now = 0;
224
225 /*
226 * must have sufficient privileges
227 */
228 if (secpolicy_fs_config(cr, vp->v_vfsp) != 0)
229 return (EPERM);
230
231 /*
232 * get user's copy of timeval struct and check values
233 * if input is NULL, will set time to now
234 */
235 if (tvu == NULL) {
236 now = 1;
237 } else {
238 if ((flag & DATAMODEL_MASK) == DATAMODEL_ILP32) {
239 if (copyin(tvu, &tv, sizeof (tv)))
240 return (EFAULT);
241 } else {
242 struct timeval tv64;
243
244 if (copyin(tvu, &tv64, sizeof (tv64)))
245 return (EFAULT);
246 if (TIMEVAL_OVERFLOW(&tv64))
247 return (EOVERFLOW);
248 TIMEVAL_TO_TIMEVAL32(&tv, &tv64);
249 }
250
251 if (tv.tv_usec < 0 || tv.tv_usec >= 1000000)
252 return (EINVAL);
253 }
254
255 /*
256 * update access time
257 */
258 ip = VTOI(vp);
259 rw_enter(&ip->i_contents, RW_WRITER);
260 ITIMES_NOLOCK(ip);
261 if (now) {
262 mutex_enter(&ufs_iuniqtime_lock);
263 ip->i_atime = iuniqtime;
264 mutex_exit(&ufs_iuniqtime_lock);
265 } else {
266 ip->i_atime = tv;
267 }
268 ip->i_flag |= IMODACC;
269 rw_exit(&ip->i_contents);
270
271 return (0);
272 }
273
274 /*
275 * ufs_fiogdio
276 * Get delayed-io state. This ioctl is tailored
277 * to metamucil's needs and may change at any time.
278 */
279 /* ARGSUSED */
280 int
ufs_fiogdio(struct vnode * vp,uint_t * diop,int flag,struct cred * cr)281 ufs_fiogdio(
282 struct vnode *vp, /* file's vnode */
283 uint_t *diop, /* dio state returned here */
284 int flag, /* flag from ufs_ioctl */
285 struct cred *cr) /* credentials from ufs_ioctl */
286 {
287 struct ufsvfs *ufsvfsp = VTOI(vp)->i_ufsvfs;
288
289 /*
290 * forcibly unmounted
291 */
292 if (ufsvfsp == NULL)
293 return (EIO);
294
295 if (suword32(diop, ufsvfsp->vfs_dio))
296 return (EFAULT);
297 return (0);
298 }
299
300 /*
301 * ufs_fiosdio
302 * Set delayed-io state. This ioctl is tailored
303 * to metamucil's needs and may change at any time.
304 */
305 int
ufs_fiosdio(struct vnode * vp,uint_t * diop,int flag,struct cred * cr)306 ufs_fiosdio(
307 struct vnode *vp, /* file's vnode */
308 uint_t *diop, /* dio flag */
309 int flag, /* flag from ufs_ioctl */
310 struct cred *cr) /* credentials from ufs_ioctl */
311 {
312 uint_t dio; /* copy of user's dio */
313 struct inode *ip; /* inode for vp */
314 struct ufsvfs *ufsvfsp;
315 struct fs *fs;
316 struct ulockfs *ulp;
317 int error = 0;
318
319 #ifdef lint
320 flag = flag;
321 #endif
322
323 /* check input conditions */
324 if (secpolicy_fs_config(cr, vp->v_vfsp) != 0)
325 return (EPERM);
326
327 if (copyin(diop, &dio, sizeof (dio)))
328 return (EFAULT);
329
330 if (dio > 1)
331 return (EINVAL);
332
333 /* file system has been forcibly unmounted */
334 if (VTOI(vp)->i_ufsvfs == NULL)
335 return (EIO);
336
337 ip = VTOI(vp);
338 ufsvfsp = ip->i_ufsvfs;
339 ulp = &ufsvfsp->vfs_ulockfs;
340
341 /* logging file system; dio ignored */
342 if (TRANS_ISTRANS(ufsvfsp))
343 return (error);
344
345 /* hold the mutex to prevent race with a lockfs request */
346 vfs_lock_wait(vp->v_vfsp);
347 mutex_enter(&ulp->ul_lock);
348 atomic_inc_ulong(&ufs_quiesce_pend);
349
350 if (ULOCKFS_IS_HLOCK(ulp)) {
351 error = EIO;
352 goto out;
353 }
354
355 if (ULOCKFS_IS_ELOCK(ulp)) {
356 error = EBUSY;
357 goto out;
358 }
359 /* wait for outstanding accesses to finish */
360 if (error = ufs_quiesce(ulp))
361 goto out;
362
363 /* flush w/invalidate */
364 if (error = ufs_flush(vp->v_vfsp))
365 goto out;
366
367 /*
368 * update dio
369 */
370 mutex_enter(&ufsvfsp->vfs_lock);
371 ufsvfsp->vfs_dio = dio;
372
373 /*
374 * enable/disable clean flag processing
375 */
376 fs = ip->i_fs;
377 if (fs->fs_ronly == 0 &&
378 fs->fs_clean != FSBAD &&
379 fs->fs_clean != FSLOG) {
380 if (dio)
381 fs->fs_clean = FSSUSPEND;
382 else
383 fs->fs_clean = FSACTIVE;
384 ufs_sbwrite(ufsvfsp);
385 mutex_exit(&ufsvfsp->vfs_lock);
386 } else
387 mutex_exit(&ufsvfsp->vfs_lock);
388 out:
389 /*
390 * we need this broadcast because of the ufs_quiesce call above
391 */
392 atomic_dec_ulong(&ufs_quiesce_pend);
393 cv_broadcast(&ulp->ul_cv);
394 mutex_exit(&ulp->ul_lock);
395 vfs_unlock(vp->v_vfsp);
396 return (error);
397 }
398
399 /*
400 * ufs_fioffs - common function for VFS and ioctl entry points to flush a single
401 * file system.
402 */
403 int
ufs_fioffs(vfs_t * vfsp,cred_t * cr)404 ufs_fioffs(vfs_t *vfsp, cred_t *cr)
405 {
406 int error;
407 struct ufsvfs *ufsvfsp;
408 struct ulockfs *ulp;
409
410 /* file system has been forcibly unmounted */
411 if ((vfsp->vfs_flag & VFS_UNMOUNTED) != 0)
412 return (EIO);
413
414 ufsvfsp = vfsp->vfs_data;
415 ulp = &ufsvfsp->vfs_ulockfs;
416
417 /*
418 * suspend the delete thread
419 * this must be done outside the lockfs locking protocol
420 */
421 vfs_lock_wait(vfsp);
422 ufs_thread_suspend(&ufsvfsp->vfs_delete);
423
424 /* hold the mutex to prevent race with a lockfs request */
425 mutex_enter(&ulp->ul_lock);
426 atomic_inc_ulong(&ufs_quiesce_pend);
427
428 if (ULOCKFS_IS_HLOCK(ulp)) {
429 error = EIO;
430 goto out;
431 }
432 if (ULOCKFS_IS_ELOCK(ulp)) {
433 error = EBUSY;
434 goto out;
435 }
436 /* wait for outstanding accesses to finish */
437 if (error = ufs_quiesce(ulp))
438 goto out;
439
440 /*
441 * If logging, and the logmap was marked as not rollable,
442 * make it rollable now, and start the trans_roll thread and
443 * the reclaim thread. The log at this point is safe to write to.
444 */
445 if (ufsvfsp->vfs_log) {
446 ml_unit_t *ul = ufsvfsp->vfs_log;
447 struct fs *fsp = ufsvfsp->vfs_fs;
448 int err;
449
450 if (ul->un_flags & LDL_NOROLL) {
451 ul->un_flags &= ~LDL_NOROLL;
452 logmap_start_roll(ul);
453 if (!fsp->fs_ronly && (fsp->fs_reclaim &
454 (FS_RECLAIM|FS_RECLAIMING))) {
455 fsp->fs_reclaim &= ~FS_RECLAIM;
456 fsp->fs_reclaim |= FS_RECLAIMING;
457 ufs_thread_start(&ufsvfsp->vfs_reclaim,
458 ufs_thread_reclaim, vfsp);
459 if (!fsp->fs_ronly) {
460 TRANS_SBWRITE(ufsvfsp,
461 TOP_SBUPDATE_UPDATE);
462 if (err =
463 geterror(ufsvfsp->vfs_bufp)) {
464 refstr_t *mntpt;
465 mntpt = vfs_getmntpoint(vfsp);
466 cmn_err(CE_NOTE,
467 "Filesystem Flush "
468 "Failed to update "
469 "Reclaim Status for "
470 " %s, Write failed to "
471 "update superblock, "
472 "error %d",
473 refstr_value(mntpt),
474 err);
475 refstr_rele(mntpt);
476 }
477 }
478 }
479 }
480 }
481
482 /* synchronously flush dirty data and metadata */
483 error = ufs_flush(vfsp);
484
485 out:
486 atomic_dec_ulong(&ufs_quiesce_pend);
487 cv_broadcast(&ulp->ul_cv);
488 mutex_exit(&ulp->ul_lock);
489 vfs_unlock(vfsp);
490
491 /*
492 * allow the delete thread to continue
493 */
494 ufs_thread_continue(&ufsvfsp->vfs_delete);
495 return (error);
496 }
497
498 /*
499 * ufs_fioisbusy
500 * Get number of references on this vnode.
501 * Contract-private interface for Legato's NetWorker product.
502 */
503 /* ARGSUSED */
504 int
ufs_fioisbusy(struct vnode * vp,int * isbusy,struct cred * cr)505 ufs_fioisbusy(struct vnode *vp, int *isbusy, struct cred *cr)
506 {
507 int is_it_busy;
508
509 /*
510 * The caller holds one reference, there may be one in the dnlc
511 * so we need to flush it.
512 */
513 if (vp->v_count > 1)
514 dnlc_purge_vp(vp);
515 /*
516 * Since we've just flushed the dnlc and we hold a reference
517 * to this vnode, then anything but 1 means busy (this had
518 * BETTER not be zero!). Also, it's possible for someone to
519 * have this file mmap'ed with no additional reference count.
520 */
521 ASSERT(vp->v_count > 0);
522 if ((vp->v_count == 1) && (VTOI(vp)->i_mapcnt == 0))
523 is_it_busy = 0;
524 else
525 is_it_busy = 1;
526
527 if (suword32(isbusy, is_it_busy))
528 return (EFAULT);
529 return (0);
530 }
531
532 /* ARGSUSED */
533 int
ufs_fiodirectio(struct vnode * vp,int cmd,struct cred * cr)534 ufs_fiodirectio(struct vnode *vp, int cmd, struct cred *cr)
535 {
536 int error = 0;
537 struct inode *ip = VTOI(vp);
538
539 /*
540 * Acquire reader lock and set/reset direct mode
541 */
542 rw_enter(&ip->i_contents, RW_READER);
543 mutex_enter(&ip->i_tlock);
544 if (cmd == DIRECTIO_ON)
545 ip->i_flag |= IDIRECTIO; /* enable direct mode */
546 else if (cmd == DIRECTIO_OFF)
547 ip->i_flag &= ~IDIRECTIO; /* disable direct mode */
548 else
549 error = EINVAL;
550 mutex_exit(&ip->i_tlock);
551 rw_exit(&ip->i_contents);
552 return (error);
553 }
554
555 /*
556 * ufs_fiotune
557 * Allow some tunables to be set on a mounted fs
558 */
559 int
ufs_fiotune(struct vnode * vp,struct fiotune * uftp,struct cred * cr)560 ufs_fiotune(struct vnode *vp, struct fiotune *uftp, struct cred *cr)
561 {
562 struct fiotune ftp;
563 struct fs *fs;
564 struct ufsvfs *ufsvfsp;
565
566 /*
567 * must have sufficient privileges
568 */
569 if (secpolicy_fs_config(cr, vp->v_vfsp) != 0)
570 return (EPERM);
571
572 /*
573 * get user's copy
574 */
575 if (copyin(uftp, &ftp, sizeof (ftp)))
576 return (EFAULT);
577
578 /*
579 * some minimal sanity checks
580 */
581 if ((ftp.maxcontig <= 0) ||
582 (ftp.rotdelay != 0) ||
583 (ftp.maxbpg <= 0) ||
584 (ftp.minfree < 0) ||
585 (ftp.minfree > 99) ||
586 ((ftp.optim != FS_OPTTIME) && (ftp.optim != FS_OPTSPACE)))
587 return (EINVAL);
588
589 /*
590 * update superblock but don't write it! If it gets out, fine.
591 */
592 fs = VTOI(vp)->i_fs;
593
594 fs->fs_maxcontig = ftp.maxcontig;
595 fs->fs_rotdelay = ftp.rotdelay;
596 fs->fs_maxbpg = ftp.maxbpg;
597 fs->fs_minfree = ftp.minfree;
598 fs->fs_optim = ftp.optim;
599
600 /*
601 * Adjust cluster based on the new maxcontig. The cluster size
602 * can be any positive value. The check for this is done above.
603 */
604 ufsvfsp = VTOI(vp)->i_ufsvfs;
605 ufsvfsp->vfs_ioclustsz = fs->fs_bsize * fs->fs_maxcontig;
606
607 /*
608 * Adjust minfrags from minfree
609 */
610 ufsvfsp->vfs_minfrags = (int)((int64_t)fs->fs_dsize *
611 fs->fs_minfree / 100);
612
613 /*
614 * Write the superblock
615 */
616 if (fs->fs_ronly == 0) {
617 TRANS_BEGIN_ASYNC(ufsvfsp, TOP_SBUPDATE_UPDATE,
618 TOP_SBWRITE_SIZE);
619 TRANS_SBWRITE(ufsvfsp, TOP_SBUPDATE_UPDATE);
620 TRANS_END_ASYNC(ufsvfsp, TOP_SBUPDATE_UPDATE, TOP_SBWRITE_SIZE);
621 }
622
623 return (0);
624 }
625
626 /*
627 * Lseek support for finding holes (cmd == _FIO_SEEK_HOLE) and
628 * data (cmd == _FIO_SEEK_DATA). "off" is an in/out parameter.
629 */
630 int
ufs_fio_holey(vnode_t * vp,int cmd,offset_t * off)631 ufs_fio_holey(vnode_t *vp, int cmd, offset_t *off)
632 {
633 inode_t *ip = VTOI(vp);
634 u_offset_t noff = (u_offset_t)*off; /* new offset */
635 u_offset_t isz;
636 int error;
637 boolean_t hole;
638
639 rw_enter(&ip->i_contents, RW_READER);
640 isz = ip->i_size;
641 if (noff >= isz) {
642 rw_exit(&ip->i_contents);
643 return (ENXIO);
644 }
645
646 /*
647 * Check for the usual case where a file has no holes.
648 * If so we can optimise to set the end of the file as the first
649 * (virtual) hole. This avoids bmap_find() searching through
650 * every block in the file for a (non-existent) hole.
651 */
652 if (!bmap_has_holes(ip)) {
653 rw_exit(&ip->i_contents);
654 if (cmd == _FIO_SEEK_HOLE) {
655 *off = isz;
656 return (0);
657 }
658 /* *off must already point to valid data (non hole) */
659 return (0);
660 }
661
662 /*
663 * Calling bmap_read() one block at a time on a 1TB file takes forever,
664 * so we use a special function to search for holes or blocks.
665 */
666 if (cmd == _FIO_SEEK_HOLE)
667 hole = B_TRUE;
668 else
669 hole = B_FALSE;
670 error = bmap_find(ip, hole, &noff);
671 rw_exit(&ip->i_contents);
672
673 /* end of file? */
674 if (error == ENXIO) {
675 /*
676 * Handle the virtual hole at the end of file.
677 */
678 if (cmd == _FIO_SEEK_HOLE) {
679 *off = isz;
680 return (0);
681 }
682 return (ENXIO);
683 }
684 if (noff < *off)
685 return (error);
686 *off = noff;
687 return (error);
688 }
689
690 int
ufs_mark_compressed(struct vnode * vp)691 ufs_mark_compressed(struct vnode *vp)
692 {
693 struct inode *ip = VTOI(vp);
694 struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
695
696 if (vp->v_type != VREG)
697 return (EINVAL);
698
699 rw_enter(&ip->i_contents, RW_WRITER);
700 ip->i_cflags |= ICOMPRESS;
701 TRANS_INODE(ufsvfsp, ip);
702 ip->i_flag |= (ICHG|ISEQ);
703 ip->i_seq++;
704 if (!TRANS_ISTRANS(ufsvfsp))
705 ufs_iupdat(ip, I_ASYNC);
706 rw_exit(&ip->i_contents);
707
708 return (0);
709 }
710