xref: /titanic_50/usr/src/uts/common/fs/ufs/ufs_filio.c (revision d8995fac6ffdcecbc6267dca23d51e588d182768)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #include <sys/types.h>
30 #include <sys/t_lock.h>
31 #include <sys/param.h>
32 #include <sys/time.h>
33 #include <sys/systm.h>
34 #include <sys/sysmacros.h>
35 #include <sys/resource.h>
36 #include <sys/signal.h>
37 #include <sys/cred.h>
38 #include <sys/user.h>
39 #include <sys/buf.h>
40 #include <sys/vfs.h>
41 #include <sys/vnode.h>
42 #include <sys/proc.h>
43 #include <sys/disp.h>
44 #include <sys/file.h>
45 #include <sys/fcntl.h>
46 #include <sys/flock.h>
47 #include <sys/kmem.h>
48 #include <sys/uio.h>
49 #include <sys/conf.h>
50 #include <sys/mman.h>
51 #include <sys/pathname.h>
52 #include <sys/debug.h>
53 #include <sys/vmmeter.h>
54 #include <sys/vmsystm.h>
55 #include <sys/cmn_err.h>
56 #include <sys/vtrace.h>
57 #include <sys/filio.h>
58 #include <sys/dnlc.h>
59 
60 #include <sys/fs/ufs_filio.h>
61 #include <sys/fs/ufs_lockfs.h>
62 #include <sys/fs/ufs_fs.h>
63 #include <sys/fs/ufs_inode.h>
64 #include <sys/fs/ufs_fsdir.h>
65 #include <sys/fs/ufs_quota.h>
66 #include <sys/fs/ufs_trans.h>
67 #include <sys/fs/ufs_log.h>
68 #include <sys/dirent.h>		/* must be AFTER <sys/fs/fsdir.h>! */
69 #include <sys/errno.h>
70 #include <sys/sysinfo.h>
71 
72 #include <vm/hat.h>
73 #include <vm/page.h>
74 #include <vm/pvn.h>
75 #include <vm/as.h>
76 #include <vm/seg.h>
77 #include <vm/seg_map.h>
78 #include <vm/seg_vn.h>
79 #include <vm/rm.h>
80 #include <sys/swap.h>
81 #include <sys/model.h>
82 #include <sys/policy.h>
83 
84 #include "fs/fs_subr.h"
85 
86 /*
87  * ufs_fioio is the ufs equivalent of NFS_CNVT and is tailored to
88  * metamucil's needs.  It may change at any time.
89  */
90 /* ARGSUSED */
91 int
92 ufs_fioio(
93 	struct vnode	*vp,		/* any file on the fs */
94 	struct fioio	*fiou,		/* fioio struct in userland */
95 	int		flag,		/* flag from VOP_IOCTL() */
96 	struct cred	*cr)		/* credentials from ufs_ioctl */
97 {
98 	int		error	= 0;
99 	struct vnode	*vpio	= NULL;	/* vnode for inode open */
100 	struct inode	*ipio	= NULL;	/* inode for inode open */
101 	struct file	*fpio	= NULL;	/* file  for inode open */
102 	struct inode	*ip;		/* inode for file system */
103 	struct fs	*fs;		/* fs    for file system */
104 	STRUCT_DECL(fioio, fio);	/* copy of user's fioio struct */
105 
106 	/*
107 	 * must be privileged
108 	 */
109 	if (secpolicy_fs_config(cr, vp->v_vfsp) != 0)
110 		return (EPERM);
111 
112 	STRUCT_INIT(fio, flag & DATAMODEL_MASK);
113 
114 	/*
115 	 * get user's copy of fioio struct
116 	 */
117 	if (copyin(fiou, STRUCT_BUF(fio), STRUCT_SIZE(fio)))
118 		return (EFAULT);
119 
120 	ip = VTOI(vp);
121 	fs = ip->i_fs;
122 
123 	/*
124 	 * check the inode number against the fs's inode number bounds
125 	 */
126 	if (STRUCT_FGET(fio, fio_ino) < UFSROOTINO)
127 		return (ESRCH);
128 	if (STRUCT_FGET(fio, fio_ino) >= fs->fs_ncg * fs->fs_ipg)
129 		return (ESRCH);
130 
131 	rw_enter(&ip->i_ufsvfs->vfs_dqrwlock, RW_READER);
132 
133 	/*
134 	 * get the inode
135 	 */
136 	error = ufs_iget(ip->i_vfs, STRUCT_FGET(fio, fio_ino), &ipio, cr);
137 
138 	rw_exit(&ip->i_ufsvfs->vfs_dqrwlock);
139 
140 	if (error)
141 		return (error);
142 
143 	/*
144 	 * check the generation number
145 	 */
146 	rw_enter(&ipio->i_contents, RW_READER);
147 	if (ipio->i_gen != STRUCT_FGET(fio, fio_gen)) {
148 		error = ESTALE;
149 		rw_exit(&ipio->i_contents);
150 		goto errout;
151 	}
152 
153 	/*
154 	 * check if the inode is free
155 	 */
156 	if (ipio->i_mode == 0) {
157 		error = ENOENT;
158 		rw_exit(&ipio->i_contents);
159 		goto errout;
160 	}
161 	rw_exit(&ipio->i_contents);
162 
163 	/*
164 	 *	Adapted from copen: get a file struct
165 	 *	Large Files: We open this file descriptor with FOFFMAX flag
166 	 *	set so that it will be like a large file open.
167 	 */
168 	if (falloc(NULL, (FREAD|FOFFMAX), &fpio, STRUCT_FADDR(fio, fio_fd)))
169 		goto errout;
170 
171 	/*
172 	 *	Adapted from vn_open: check access and then open the file
173 	 */
174 	vpio = ITOV(ipio);
175 	if (error = VOP_ACCESS(vpio, VREAD, 0, cr))
176 		goto errout;
177 
178 	if (error = VOP_OPEN(&vpio, FREAD, cr))
179 		goto errout;
180 
181 	/*
182 	 *	Adapted from copen: initialize the file struct
183 	 */
184 	fpio->f_vnode = vpio;
185 
186 	/*
187 	 * return the fd
188 	 */
189 	if (copyout(STRUCT_BUF(fio), fiou, STRUCT_SIZE(fio))) {
190 		error = EFAULT;
191 		goto errout;
192 	}
193 	setf(STRUCT_FGET(fio, fio_fd), fpio);
194 	mutex_exit(&fpio->f_tlock);
195 	return (0);
196 errout:
197 	/*
198 	 * free the file struct and fd
199 	 */
200 	if (fpio) {
201 		setf(STRUCT_FGET(fio, fio_fd), NULL);
202 		unfalloc(fpio);
203 	}
204 
205 	/*
206 	 * release the hold on the inode
207 	 */
208 	if (ipio)
209 		VN_RELE(ITOV(ipio));
210 	return (error);
211 }
212 
213 /*
214  * ufs_fiosatime
215  *	set access time w/o altering change time.  This ioctl is tailored
216  *	to metamucil's needs and may change at any time.
217  */
218 int
219 ufs_fiosatime(
220 	struct vnode	*vp,		/* file's vnode */
221 	struct timeval	*tvu,		/* struct timeval in userland */
222 	int		flag,		/* flag from VOP_IOCTL() */
223 	struct cred	*cr)		/* credentials from ufs_ioctl */
224 {
225 	struct inode	*ip;		/* inode for vp */
226 	struct timeval32 tv;		/* copy of user's timeval */
227 	int now = 0;
228 
229 	/*
230 	 * must have sufficient privileges
231 	 */
232 	if (secpolicy_fs_config(cr, vp->v_vfsp) != 0)
233 		return (EPERM);
234 
235 	/*
236 	 * get user's copy of timeval struct and check values
237 	 * if input is NULL, will set time to now
238 	 */
239 	if (tvu == NULL) {
240 		now = 1;
241 	} else {
242 		if ((flag & DATAMODEL_MASK) == DATAMODEL_ILP32) {
243 			if (copyin(tvu, &tv, sizeof (tv)))
244 				return (EFAULT);
245 		} else {
246 			struct timeval tv64;
247 
248 			if (copyin(tvu, &tv64, sizeof (tv64)))
249 				return (EFAULT);
250 			if (TIMEVAL_OVERFLOW(&tv64))
251 				return (EOVERFLOW);
252 			TIMEVAL_TO_TIMEVAL32(&tv, &tv64);
253 		}
254 
255 		if (tv.tv_usec < 0 || tv.tv_usec >= 1000000)
256 			return (EINVAL);
257 	}
258 
259 	/*
260 	 * update access time
261 	 */
262 	ip = VTOI(vp);
263 	rw_enter(&ip->i_contents, RW_WRITER);
264 	ITIMES_NOLOCK(ip);
265 	if (now) {
266 		mutex_enter(&ufs_iuniqtime_lock);
267 		ip->i_atime = iuniqtime;
268 		mutex_exit(&ufs_iuniqtime_lock);
269 	} else {
270 		ip->i_atime = tv;
271 	}
272 	ip->i_flag |= IMODACC;
273 	rw_exit(&ip->i_contents);
274 
275 	return (0);
276 }
277 
278 /*
279  * ufs_fiogdio
280  *	Get delayed-io state.  This ioctl is tailored
281  *	to metamucil's needs and may change at any time.
282  */
283 /* ARGSUSED */
284 int
285 ufs_fiogdio(
286 	struct vnode	*vp,		/* file's vnode */
287 	uint_t		*diop,		/* dio state returned here */
288 	int		flag,		/* flag from ufs_ioctl */
289 	struct cred	*cr)		/* credentials from ufs_ioctl */
290 {
291 	struct ufsvfs	*ufsvfsp	= VTOI(vp)->i_ufsvfs;
292 
293 	/*
294 	 * forcibly unmounted
295 	 */
296 	if (ufsvfsp == NULL)
297 		return (EIO);
298 
299 	if (suword32(diop, ufsvfsp->vfs_dio))
300 		return (EFAULT);
301 	return (0);
302 }
303 
304 /*
305  * ufs_fiosdio
306  *	Set delayed-io state.  This ioctl is tailored
307  *	to metamucil's needs and may change at any time.
308  */
309 int
310 ufs_fiosdio(
311 	struct vnode	*vp,		/* file's vnode */
312 	uint_t		*diop,		/* dio flag */
313 	int		flag,		/* flag from ufs_ioctl */
314 	struct cred	*cr)		/* credentials from ufs_ioctl */
315 {
316 	uint_t		dio;		/* copy of user's dio */
317 	struct inode	*ip;		/* inode for vp */
318 	struct ufsvfs	*ufsvfsp;
319 	struct fs	*fs;
320 	struct ulockfs	*ulp;
321 	int		error = 0;
322 
323 #ifdef lint
324 	flag = flag;
325 #endif
326 
327 	/* check input conditions */
328 	if (secpolicy_fs_config(cr, vp->v_vfsp) != 0)
329 		return (EPERM);
330 
331 	if (copyin(diop, &dio, sizeof (dio)))
332 		return (EFAULT);
333 
334 	if (dio > 1)
335 		return (EINVAL);
336 
337 	/* file system has been forcibly unmounted */
338 	if (VTOI(vp)->i_ufsvfs == NULL)
339 		return (EIO);
340 
341 	ip = VTOI(vp);
342 	ufsvfsp = ip->i_ufsvfs;
343 	ulp = &ufsvfsp->vfs_ulockfs;
344 
345 	/* logging file system; dio ignored */
346 	if (TRANS_ISTRANS(ufsvfsp))
347 		return (error);
348 
349 	/* hold the mutex to prevent race with a lockfs request */
350 	vfs_lock_wait(vp->v_vfsp);
351 	mutex_enter(&ulp->ul_lock);
352 
353 	if (ULOCKFS_IS_HLOCK(ulp)) {
354 		error = EIO;
355 		goto out;
356 	}
357 
358 	if (ULOCKFS_IS_ELOCK(ulp)) {
359 		error = EBUSY;
360 		goto out;
361 	}
362 	/* wait for outstanding accesses to finish */
363 	if (error = ufs_quiesce(ulp))
364 		goto out;
365 
366 	/* flush w/invalidate */
367 	if (error = ufs_flush(vp->v_vfsp))
368 		goto out;
369 
370 	/*
371 	 * update dio
372 	 */
373 	mutex_enter(&ufsvfsp->vfs_lock);
374 	ufsvfsp->vfs_dio = dio;
375 
376 	/*
377 	 * enable/disable clean flag processing
378 	 */
379 	fs = ip->i_fs;
380 	if (fs->fs_ronly == 0 &&
381 	    fs->fs_clean != FSBAD &&
382 	    fs->fs_clean != FSLOG) {
383 		if (dio)
384 			fs->fs_clean = FSSUSPEND;
385 		else
386 			fs->fs_clean = FSACTIVE;
387 		ufs_sbwrite(ufsvfsp);
388 		mutex_exit(&ufsvfsp->vfs_lock);
389 	} else
390 		mutex_exit(&ufsvfsp->vfs_lock);
391 out:
392 	/*
393 	 * we need this broadcast because of the ufs_quiesce call above
394 	 */
395 	cv_broadcast(&ulp->ul_cv);
396 	mutex_exit(&ulp->ul_lock);
397 	vfs_unlock(vp->v_vfsp);
398 	return (error);
399 }
400 
401 /*
402  * ufs_fioffs - ioctl handler for flushing file system
403  */
404 /* ARGSUSED */
405 int
406 ufs_fioffs(
407 	struct vnode	*vp,
408 	char 		*vap,		/* must be NULL - reserved */
409 	struct cred	*cr)		/* credentials from ufs_ioctl */
410 {
411 	int error;
412 	struct ufsvfs	*ufsvfsp;
413 	struct ulockfs	*ulp;
414 
415 	/* file system has been forcibly unmounted */
416 	ufsvfsp = VTOI(vp)->i_ufsvfs;
417 	if (ufsvfsp == NULL)
418 		return (EIO);
419 
420 	ulp = &ufsvfsp->vfs_ulockfs;
421 
422 	/*
423 	 * suspend the delete thread
424 	 *	this must be done outside the lockfs locking protocol
425 	 */
426 	ufs_thread_suspend(&ufsvfsp->vfs_delete);
427 
428 	vfs_lock_wait(vp->v_vfsp);
429 	/* hold the mutex to prevent race with a lockfs request */
430 	mutex_enter(&ulp->ul_lock);
431 
432 	if (ULOCKFS_IS_HLOCK(ulp)) {
433 		error = EIO;
434 		goto out;
435 	}
436 	if (ULOCKFS_IS_ELOCK(ulp)) {
437 		error = EBUSY;
438 		goto out;
439 	}
440 	/* wait for outstanding accesses to finish */
441 	if (error = ufs_quiesce(ulp))
442 		goto out;
443 
444 	/*
445 	 * If logging, and the logmap was marked as not rollable,
446 	 * make it rollable now, and start the trans_roll thread and
447 	 * the reclaim thread.  The log at this point is safe to write to.
448 	 */
449 	if (ufsvfsp->vfs_log) {
450 		ml_unit_t	*ul = ufsvfsp->vfs_log;
451 		struct fs	*fsp = ufsvfsp->vfs_fs;
452 		int		err;
453 
454 		if (ul->un_flags & LDL_NOROLL) {
455 			ul->un_flags &= ~LDL_NOROLL;
456 			logmap_start_roll(ul);
457 			if (!fsp->fs_ronly && (fsp->fs_reclaim &
458 				(FS_RECLAIM|FS_RECLAIMING))) {
459 				fsp->fs_reclaim &= ~FS_RECLAIM;
460 				fsp->fs_reclaim |= FS_RECLAIMING;
461 				ufs_thread_start(&ufsvfsp->vfs_reclaim,
462 					ufs_thread_reclaim,
463 					vp->v_vfsp);
464 				if (!fsp->fs_ronly) {
465 					TRANS_SBWRITE(ufsvfsp,
466 						TOP_SBUPDATE_UPDATE);
467 					if (err =
468 					    geterror(ufsvfsp->vfs_bufp)) {
469 						refstr_t	*mntpt;
470 						mntpt = vfs_getmntpoint(
471 							vp->v_vfsp);
472 						cmn_err(CE_NOTE,
473 							"Filesystem Flush "
474 							"Failed to update "
475 							"Reclaim Status for "
476 							" %s, Write failed to "
477 							"update superblock, "
478 							"error %d",
479 							refstr_value(mntpt),
480 							err);
481 						refstr_rele(mntpt);
482 					}
483 				}
484 			}
485 		}
486 	}
487 
488 	/* synchronously flush dirty data and metadata */
489 	error = ufs_flush(vp->v_vfsp);
490 
491 out:
492 	cv_broadcast(&ulp->ul_cv);
493 	mutex_exit(&ulp->ul_lock);
494 	vfs_unlock(vp->v_vfsp);
495 
496 	/*
497 	 * allow the delete thread to continue
498 	 */
499 	ufs_thread_continue(&ufsvfsp->vfs_delete);
500 	return (error);
501 }
502 
503 /*
504  * ufs_fioisbusy
505  *	Get number of references on this vnode.
506  *	Contract-private interface for Legato's NetWorker product.
507  */
508 /* ARGSUSED */
509 int
510 ufs_fioisbusy(struct vnode *vp, int *isbusy, struct cred *cr)
511 {
512 	int is_it_busy;
513 
514 	/*
515 	 * The caller holds one reference, there may be one in the dnlc
516 	 * so we need to flush it.
517 	 */
518 	if (vp->v_count > 1)
519 		dnlc_purge_vp(vp);
520 	/*
521 	 * Since we've just flushed the dnlc and we hold a reference
522 	 * to this vnode, then anything but 1 means busy (this had
523 	 * BETTER not be zero!). Also, it's possible for someone to
524 	 * have this file mmap'ed with no additional reference count.
525 	 */
526 	ASSERT(vp->v_count > 0);
527 	if ((vp->v_count == 1) && (VTOI(vp)->i_mapcnt == 0))
528 		is_it_busy = 0;
529 	else
530 		is_it_busy = 1;
531 
532 	if (suword32(isbusy, is_it_busy))
533 		return (EFAULT);
534 	return (0);
535 }
536 
537 /* ARGSUSED */
538 int
539 ufs_fiodirectio(struct vnode *vp, int cmd, struct cred *cr)
540 {
541 	int		error	= 0;
542 	struct inode	*ip	= VTOI(vp);
543 
544 	/*
545 	 * Acquire reader lock and set/reset direct mode
546 	 */
547 	rw_enter(&ip->i_contents, RW_READER);
548 	mutex_enter(&ip->i_tlock);
549 	if (cmd == DIRECTIO_ON)
550 		ip->i_flag |= IDIRECTIO;	/* enable direct mode */
551 	else if (cmd == DIRECTIO_OFF)
552 		ip->i_flag &= ~IDIRECTIO;	/* disable direct mode */
553 	else
554 		error = EINVAL;
555 	mutex_exit(&ip->i_tlock);
556 	rw_exit(&ip->i_contents);
557 	return (error);
558 }
559 
560 /*
561  * ufs_fiotune
562  *	Allow some tunables to be set on a mounted fs
563  */
564 int
565 ufs_fiotune(struct vnode *vp, struct fiotune *uftp, struct cred *cr)
566 {
567 	struct fiotune	ftp;
568 	struct fs	*fs;
569 	struct ufsvfs	*ufsvfsp;
570 
571 	/*
572 	 * must have sufficient privileges
573 	 */
574 	if (secpolicy_fs_config(cr, vp->v_vfsp) != 0)
575 		return (EPERM);
576 
577 	/*
578 	 * get user's copy
579 	 */
580 	if (copyin(uftp, &ftp, sizeof (ftp)))
581 		return (EFAULT);
582 
583 	/*
584 	 * some minimal sanity checks
585 	 */
586 	if ((ftp.maxcontig <= 0) ||
587 	    (ftp.rotdelay != 0) ||
588 	    (ftp.maxbpg <= 0) ||
589 	    (ftp.minfree < 0) ||
590 	    (ftp.minfree > 99) ||
591 	    ((ftp.optim != FS_OPTTIME) && (ftp.optim != FS_OPTSPACE)))
592 		return (EINVAL);
593 
594 	/*
595 	 * update superblock but don't write it!  If it gets out, fine.
596 	 */
597 	fs = VTOI(vp)->i_fs;
598 
599 	fs->fs_maxcontig = ftp.maxcontig;
600 	fs->fs_rotdelay = ftp.rotdelay;
601 	fs->fs_maxbpg = ftp.maxbpg;
602 	fs->fs_minfree = ftp.minfree;
603 	fs->fs_optim = ftp.optim;
604 
605 	/*
606 	 * Adjust cluster based on the new maxcontig. The cluster size
607 	 * can be any positive value. The check for this is done above.
608 	 */
609 	ufsvfsp = VTOI(vp)->i_ufsvfs;
610 	ufsvfsp->vfs_ioclustsz = fs->fs_bsize * fs->fs_maxcontig;
611 
612 	/*
613 	 * Adjust minfrags from minfree
614 	 */
615 	ufsvfsp->vfs_minfrags = (int)((int64_t)fs->fs_dsize *
616 							fs->fs_minfree / 100);
617 
618 	/*
619 	 * Write the superblock
620 	 */
621 	if (fs->fs_ronly == 0) {
622 		TRANS_BEGIN_ASYNC(ufsvfsp, TOP_SBUPDATE_UPDATE,
623 		    TOP_SBWRITE_SIZE);
624 		TRANS_SBWRITE(ufsvfsp, TOP_SBUPDATE_UPDATE);
625 		TRANS_END_ASYNC(ufsvfsp, TOP_SBUPDATE_UPDATE, TOP_SBWRITE_SIZE);
626 	}
627 
628 	return (0);
629 }
630 
631 /*
632  * Lseek support for finding holes (cmd == _FIO_SEEK_HOLE) and
633  * data (cmd == _FIO_SEEK_DATA). "off" is an in/out parameter.
634  */
635 int
636 ufs_fio_holey(vnode_t *vp, int cmd, offset_t *off)
637 {
638 	inode_t	*ip = VTOI(vp);
639 	u_offset_t noff = (u_offset_t)*off; /* new offset */
640 	u_offset_t isz;
641 	int error;
642 	boolean_t hole;
643 
644 	rw_enter(&ip->i_contents, RW_READER);
645 	isz = ip->i_size;
646 	if (noff >= isz)  {
647 		rw_exit(&ip->i_contents);
648 		return (ENXIO);
649 	}
650 
651 	/*
652 	 * Check for the usual case where a file has no holes.
653 	 * If so we can optimise to set the end of the file as the first
654 	 * (virtual) hole. This avoids bmap_find() searching through
655 	 * every block in the file for a (non-existent) hole.
656 	 */
657 	if (!bmap_has_holes(ip)) {
658 		rw_exit(&ip->i_contents);
659 		if (cmd == _FIO_SEEK_HOLE) {
660 			*off = isz;
661 			return (0);
662 		}
663 		/* *off must already point to valid data (non hole) */
664 		return (0);
665 	}
666 
667 	/*
668 	 * Calling bmap_read() one block at a time on a 1TB file takes forever,
669 	 * so we use a special function to search for holes or blocks.
670 	 */
671 	if (cmd == _FIO_SEEK_HOLE)
672 		hole = B_TRUE;
673 	else
674 		hole = B_FALSE;
675 	error = bmap_find(ip, hole, &noff);
676 	rw_exit(&ip->i_contents);
677 
678 	/* end of file? */
679 	if (error == ENXIO) {
680 		/*
681 		 * Handle the virtual hole at the end of file.
682 		 */
683 		if (cmd == _FIO_SEEK_HOLE) {
684 			*off = isz;
685 			return (0);
686 		}
687 		return (ENXIO);
688 	}
689 	if (noff < *off)
690 		return (error);
691 	*off = noff;
692 	return (error);
693 }
694