/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/time.h>
#include <sys/systm.h>
#include <sys/sysmacros.h>
#include <sys/resource.h>
#include <sys/signal.h>
#include <sys/cred.h>
#include <sys/user.h>
#include <sys/buf.h>
#include <sys/vfs.h>
#include <sys/vnode.h>
#include <sys/proc.h>
#include <sys/disp.h>
#include <sys/file.h>
#include <sys/fcntl.h>
#include <sys/flock.h>
#include <sys/kmem.h>
#include <sys/uio.h>
#include <sys/conf.h>
#include <sys/mman.h>
#include <sys/pathname.h>
#include <sys/debug.h>
#include <sys/vmmeter.h>
#include <sys/vmsystm.h>
#include <sys/cmn_err.h>
#include <sys/vtrace.h>
#include <sys/filio.h>
#include <sys/dnlc.h>

#include <sys/fs/ufs_filio.h>
#include <sys/fs/ufs_lockfs.h>
#include <sys/fs/ufs_fs.h>
#include <sys/fs/ufs_inode.h>
#include <sys/fs/ufs_fsdir.h>
#include <sys/fs/ufs_quota.h>
#include <sys/fs/ufs_trans.h>
#include <sys/fs/ufs_log.h>
#include <sys/dirent.h>		/* must be AFTER <sys/fs/fsdir.h>! */
#include <sys/errno.h>
#include <sys/sysinfo.h>

#include <vm/hat.h>
#include <vm/page.h>
#include <vm/pvn.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/seg_map.h>
#include <vm/seg_vn.h>
#include <vm/rm.h>
#include <sys/swap.h>
#include <sys/model.h>
#include <sys/policy.h>

#include "fs/fs_subr.h"

/*
 * ufs_fioio is the ufs equivalent of NFS_CNVT and is tailored to
 * metamucil's needs.  It may change at any time.
 */
/* ARGSUSED */
int
ufs_fioio(
	struct vnode	*vp,		/* any file on the fs */
	struct fioio	*fiou,		/* fioio struct in userland */
	int		flag,		/* flag from VOP_IOCTL() */
	struct cred	*cr)		/* credentials from ufs_ioctl */
{
	int		error	= 0;
	struct vnode	*vpio	= NULL;	/* vnode for inode open */
	struct inode	*ipio	= NULL;	/* inode for inode open */
	struct file	*fpio	= NULL;	/* file  for inode open */
	struct inode	*ip;		/* inode for file system */
	struct fs	*fs;		/* fs    for file system */
	STRUCT_DECL(fioio, fio);	/* copy of user's fioio struct */

	/*
	 * must be privileged
	 */
	if (secpolicy_fs_config(cr, vp->v_vfsp) != 0)
		return (EPERM);

	STRUCT_INIT(fio, flag & DATAMODEL_MASK);

	/*
	 * get user's copy of fioio struct
	 */
	if (copyin(fiou, STRUCT_BUF(fio), STRUCT_SIZE(fio)))
		return (EFAULT);

	ip = VTOI(vp);
	fs = ip->i_fs;

	/*
	 * check the inode number against the fs's inode number bounds
	 */
	if (STRUCT_FGET(fio, fio_ino) < UFSROOTINO)
		return (ESRCH);
	if (STRUCT_FGET(fio, fio_ino) >= fs->fs_ncg * fs->fs_ipg)
		return (ESRCH);

	rw_enter(&ip->i_ufsvfs->vfs_dqrwlock, RW_READER);

	/*
	 * get the inode
	 */
	error = ufs_iget(ip->i_vfs, STRUCT_FGET(fio, fio_ino), &ipio, cr);

	rw_exit(&ip->i_ufsvfs->vfs_dqrwlock);

	if (error)
		return (error);

	/*
	 * check the generation number
	 */
	rw_enter(&ipio->i_contents, RW_READER);
	if (ipio->i_gen != STRUCT_FGET(fio, fio_gen)) {
		error = ESTALE;
		rw_exit(&ipio->i_contents);
		goto errout;
	}

	/*
	 * check if the inode is free
	 */
	if (ipio->i_mode == 0) {
		error = ENOENT;
		rw_exit(&ipio->i_contents);
		goto errout;
	}
	rw_exit(&ipio->i_contents);

	/*
	 *	Adapted from copen: get a file struct
	 *	Large Files: We open this file descriptor with FOFFMAX flag
	 *	set so that it will be like a large file open.
	 */
	if (falloc(NULL, (FREAD|FOFFMAX), &fpio, STRUCT_FADDR(fio, fio_fd)))
		goto errout;

	/*
	 *	Adapted from vn_open: check access and then open the file
	 */
	vpio = ITOV(ipio);
	if (error = VOP_ACCESS(vpio, VREAD, 0, cr))
		goto errout;

	if (error = VOP_OPEN(&vpio, FREAD, cr))
		goto errout;

	/*
	 *	Adapted from copen: initialize the file struct
	 */
	fpio->f_vnode = vpio;

	/*
	 * return the fd
	 */
	if (copyout(STRUCT_BUF(fio), fiou, STRUCT_SIZE(fio))) {
		error = EFAULT;
		goto errout;
	}
	setf(STRUCT_FGET(fio, fio_fd), fpio);
	mutex_exit(&fpio->f_tlock);
	return (0);
errout:
	/*
	 * free the file struct and fd
	 */
	if (fpio) {
		setf(STRUCT_FGET(fio, fio_fd), NULL);
		unfalloc(fpio);
	}

	/*
	 * release the hold on the inode
	 */
	if (ipio)
		VN_RELE(ITOV(ipio));
	return (error);
}

/*
 * ufs_fiosatime
 *	set access time w/o altering change time.  This ioctl is tailored
 *	to metamucil's needs and may change at any time.
 */
int
ufs_fiosatime(
	struct vnode	*vp,		/* file's vnode */
	struct timeval	*tvu,		/* struct timeval in userland */
	int		flag,		/* flag from VOP_IOCTL() */
	struct cred	*cr)		/* credentials from ufs_ioctl */
{
	struct inode	*ip;		/* inode for vp */
	struct timeval32 tv;		/* copy of user's timeval */
	int now = 0;

	/*
	 * must have sufficient privileges
	 */
	if (secpolicy_fs_config(cr, vp->v_vfsp) != 0)
		return (EPERM);

	/*
	 * get user's copy of timeval struct and check values
	 * if input is NULL, will set time to now
	 */
	if (tvu == NULL) {
		now = 1;
	} else {
		if ((flag & DATAMODEL_MASK) == DATAMODEL_ILP32) {
			if (copyin(tvu, &tv, sizeof (tv)))
				return (EFAULT);
		} else {
			struct timeval tv64;

			if (copyin(tvu, &tv64, sizeof (tv64)))
				return (EFAULT);
			if (TIMEVAL_OVERFLOW(&tv64))
				return (EOVERFLOW);
			TIMEVAL_TO_TIMEVAL32(&tv, &tv64);
		}

		if (tv.tv_usec < 0 || tv.tv_usec >= 1000000)
			return (EINVAL);
	}

	/*
	 * update access time
	 */
	ip = VTOI(vp);
	rw_enter(&ip->i_contents, RW_WRITER);
	ITIMES_NOLOCK(ip);
	if (now) {
		mutex_enter(&ufs_iuniqtime_lock);
		ip->i_atime = iuniqtime;
		mutex_exit(&ufs_iuniqtime_lock);
	} else {
		ip->i_atime = tv;
	}
	ip->i_flag |= IMODACC;
	rw_exit(&ip->i_contents);

	return (0);
}

/*
 * ufs_fiogdio
 *	Get delayed-io state.  This ioctl is tailored
 *	to metamucil's needs and may change at any time.
 */
/* ARGSUSED */
int
ufs_fiogdio(
	struct vnode	*vp,		/* file's vnode */
	uint_t		*diop,		/* dio state returned here */
	int		flag,		/* flag from ufs_ioctl */
	struct cred	*cr)		/* credentials from ufs_ioctl */
{
	struct ufsvfs	*ufsvfsp	= VTOI(vp)->i_ufsvfs;

	/*
	 * forcibly unmounted
	 */
	if (ufsvfsp == NULL)
		return (EIO);

	if (suword32(diop, ufsvfsp->vfs_dio))
		return (EFAULT);
	return (0);
}

/*
 * ufs_fiosdio
 *	Set delayed-io state.  This ioctl is tailored
 *	to metamucil's needs and may change at any time.
 */
int
ufs_fiosdio(
	struct vnode	*vp,		/* file's vnode */
	uint_t		*diop,		/* dio flag */
	int		flag,		/* flag from ufs_ioctl */
	struct cred	*cr)		/* credentials from ufs_ioctl */
{
	uint_t		dio;		/* copy of user's dio */
	struct inode	*ip;		/* inode for vp */
	struct ufsvfs	*ufsvfsp;
	struct fs	*fs;
	struct ulockfs	*ulp;
	int		error = 0;

#ifdef lint
	flag = flag;
#endif

	/* check input conditions */
	if (secpolicy_fs_config(cr, vp->v_vfsp) != 0)
		return (EPERM);

	if (copyin(diop, &dio, sizeof (dio)))
		return (EFAULT);

	if (dio > 1)
		return (EINVAL);

	/* file system has been forcibly unmounted */
	if (VTOI(vp)->i_ufsvfs == NULL)
		return (EIO);

	ip = VTOI(vp);
	ufsvfsp = ip->i_ufsvfs;
	ulp = &ufsvfsp->vfs_ulockfs;

	/* logging file system; dio ignored */
	if (TRANS_ISTRANS(ufsvfsp))
		return (error);

	/* hold the mutex to prevent race with a lockfs request */
	vfs_lock_wait(vp->v_vfsp);
	mutex_enter(&ulp->ul_lock);

	if (ULOCKFS_IS_HLOCK(ulp)) {
		error = EIO;
		goto out;
	}

	if (ULOCKFS_IS_ELOCK(ulp)) {
		error = EBUSY;
		goto out;
	}
	/* wait for outstanding accesses to finish */
	if (error = ufs_quiesce(ulp))
		goto out;

	/* flush w/invalidate */
	if (error = ufs_flush(vp->v_vfsp))
		goto out;

	/*
	 * update dio
	 */
	mutex_enter(&ufsvfsp->vfs_lock);
	ufsvfsp->vfs_dio = dio;

	/*
	 * enable/disable clean flag processing
	 */
	fs = ip->i_fs;
	if (fs->fs_ronly == 0 &&
	    fs->fs_clean != FSBAD &&
	    fs->fs_clean != FSLOG) {
		if (dio)
			fs->fs_clean = FSSUSPEND;
		else
			fs->fs_clean = FSACTIVE;
		ufs_sbwrite(ufsvfsp);
		mutex_exit(&ufsvfsp->vfs_lock);
	} else
		mutex_exit(&ufsvfsp->vfs_lock);
out:
	/*
	 * we need this broadcast because of the ufs_quiesce call above
	 */
	cv_broadcast(&ulp->ul_cv);
	mutex_exit(&ulp->ul_lock);
	vfs_unlock(vp->v_vfsp);
	return (error);
}

/*
 * ufs_fioffs - ioctl handler for flushing file system
 */
/* ARGSUSED */
int
ufs_fioffs(
	struct vnode	*vp,
	char 		*vap,		/* must be NULL - reserved */
	struct cred	*cr)		/* credentials from ufs_ioctl */
{
	int error;
	struct ufsvfs	*ufsvfsp;
	struct ulockfs	*ulp;

	/* file system has been forcibly unmounted */
	ufsvfsp = VTOI(vp)->i_ufsvfs;
	if (ufsvfsp == NULL)
		return (EIO);

	ulp = &ufsvfsp->vfs_ulockfs;

	/*
	 * suspend the delete thread
	 *	this must be done outside the lockfs locking protocol
	 */
	ufs_thread_suspend(&ufsvfsp->vfs_delete);

	vfs_lock_wait(vp->v_vfsp);
	/* hold the mutex to prevent race with a lockfs request */
	mutex_enter(&ulp->ul_lock);

	if (ULOCKFS_IS_HLOCK(ulp)) {
		error = EIO;
		goto out;
	}
	if (ULOCKFS_IS_ELOCK(ulp)) {
		error = EBUSY;
		goto out;
	}
	/* wait for outstanding accesses to finish */
	if (error = ufs_quiesce(ulp))
		goto out;

	/*
	 * If logging, and the logmap was marked as not rollable,
	 * make it rollable now, and start the trans_roll thread and
	 * the reclaim thread.  The log at this point is safe to write to.
	 */
	if (ufsvfsp->vfs_log) {
		ml_unit_t	*ul = ufsvfsp->vfs_log;
		struct fs	*fsp = ufsvfsp->vfs_fs;
		int		err;

		if (ul->un_flags & LDL_NOROLL) {
			ul->un_flags &= ~LDL_NOROLL;
			logmap_start_roll(ul);
			if (!fsp->fs_ronly && (fsp->fs_reclaim &
				(FS_RECLAIM|FS_RECLAIMING))) {
				fsp->fs_reclaim &= ~FS_RECLAIM;
				fsp->fs_reclaim |= FS_RECLAIMING;
				ufs_thread_start(&ufsvfsp->vfs_reclaim,
					ufs_thread_reclaim,
					vp->v_vfsp);
				if (!fsp->fs_ronly) {
					TRANS_SBWRITE(ufsvfsp,
						TOP_SBUPDATE_UPDATE);
					if (err =
					    geterror(ufsvfsp->vfs_bufp)) {
						refstr_t	*mntpt;
						mntpt = vfs_getmntpoint(
							vp->v_vfsp);
						cmn_err(CE_NOTE,
							"Filesystem Flush "
							"Failed to update "
							"Reclaim Status for "
							" %s, Write failed to "
							"update superblock, "
							"error %d",
							refstr_value(mntpt),
							err);
						refstr_rele(mntpt);
					}
				}
			}
		}
	}

	/* synchronously flush dirty data and metadata */
	error = ufs_flush(vp->v_vfsp);

out:
	cv_broadcast(&ulp->ul_cv);
	mutex_exit(&ulp->ul_lock);
	vfs_unlock(vp->v_vfsp);

	/*
	 * allow the delete thread to continue
	 */
	ufs_thread_continue(&ufsvfsp->vfs_delete);
	return (error);
}

/*
 * ufs_fioisbusy
 *	Get number of references on this vnode.
 *	Contract-private interface for Legato's NetWorker product.
 */
/* ARGSUSED */
int
ufs_fioisbusy(struct vnode *vp, int *isbusy, struct cred *cr)
{
	int is_it_busy;

	/*
	 * The caller holds one reference, there may be one in the dnlc
	 * so we need to flush it.
	 */
	if (vp->v_count > 1)
		dnlc_purge_vp(vp);
	/*
	 * Since we've just flushed the dnlc and we hold a reference
	 * to this vnode, then anything but 1 means busy (this had
	 * BETTER not be zero!). Also, it's possible for someone to
	 * have this file mmap'ed with no additional reference count.
	 */
	ASSERT(vp->v_count > 0);
	if ((vp->v_count == 1) && (VTOI(vp)->i_mapcnt == 0))
		is_it_busy = 0;
	else
		is_it_busy = 1;

	if (suword32(isbusy, is_it_busy))
		return (EFAULT);
	return (0);
}

/* ARGSUSED */
int
ufs_fiodirectio(struct vnode *vp, int cmd, struct cred *cr)
{
	int		error	= 0;
	struct inode	*ip	= VTOI(vp);

	/*
	 * Acquire reader lock and set/reset direct mode
	 */
	rw_enter(&ip->i_contents, RW_READER);
	mutex_enter(&ip->i_tlock);
	if (cmd == DIRECTIO_ON)
		ip->i_flag |= IDIRECTIO;	/* enable direct mode */
	else if (cmd == DIRECTIO_OFF)
		ip->i_flag &= ~IDIRECTIO;	/* disable direct mode */
	else
		error = EINVAL;
	mutex_exit(&ip->i_tlock);
	rw_exit(&ip->i_contents);
	return (error);
}

/*
 * ufs_fiotune
 *	Allow some tunables to be set on a mounted fs
 */
int
ufs_fiotune(struct vnode *vp, struct fiotune *uftp, struct cred *cr)
{
	struct fiotune	ftp;
	struct fs	*fs;
	struct ufsvfs	*ufsvfsp;

	/*
	 * must have sufficient privileges
	 */
	if (secpolicy_fs_config(cr, vp->v_vfsp) != 0)
		return (EPERM);

	/*
	 * get user's copy
	 */
	if (copyin(uftp, &ftp, sizeof (ftp)))
		return (EFAULT);

	/*
	 * some minimal sanity checks
	 */
	if ((ftp.maxcontig <= 0) ||
	    (ftp.rotdelay != 0) ||
	    (ftp.maxbpg <= 0) ||
	    (ftp.minfree < 0) ||
	    (ftp.minfree > 99) ||
	    ((ftp.optim != FS_OPTTIME) && (ftp.optim != FS_OPTSPACE)))
		return (EINVAL);

	/*
	 * update superblock but don't write it!  If it gets out, fine.
	 */
	fs = VTOI(vp)->i_fs;

	fs->fs_maxcontig = ftp.maxcontig;
	fs->fs_rotdelay = ftp.rotdelay;
	fs->fs_maxbpg = ftp.maxbpg;
	fs->fs_minfree = ftp.minfree;
	fs->fs_optim = ftp.optim;

	/*
	 * Adjust cluster based on the new maxcontig. The cluster size
	 * can be any positive value. The check for this is done above.
	 */
	ufsvfsp = VTOI(vp)->i_ufsvfs;
	ufsvfsp->vfs_ioclustsz = fs->fs_bsize * fs->fs_maxcontig;

	/*
	 * Adjust minfrags from minfree
	 */
	ufsvfsp->vfs_minfrags = (int)((int64_t)fs->fs_dsize *
							fs->fs_minfree / 100);

	/*
	 * Write the superblock
	 */
	if (fs->fs_ronly == 0) {
		TRANS_BEGIN_ASYNC(ufsvfsp, TOP_SBUPDATE_UPDATE,
		    TOP_SBWRITE_SIZE);
		TRANS_SBWRITE(ufsvfsp, TOP_SBUPDATE_UPDATE);
		TRANS_END_ASYNC(ufsvfsp, TOP_SBUPDATE_UPDATE, TOP_SBWRITE_SIZE);
	}

	return (0);
}

/*
 * Lseek support for finding holes (cmd == _FIO_SEEK_HOLE) and
 * data (cmd == _FIO_SEEK_DATA). "off" is an in/out parameter.
 */
int
ufs_fio_holey(vnode_t *vp, int cmd, offset_t *off)
{
	inode_t	*ip = VTOI(vp);
	u_offset_t noff = (u_offset_t)*off; /* new offset */
	u_offset_t isz;
	int error;
	boolean_t hole;

	rw_enter(&ip->i_contents, RW_READER);
	isz = ip->i_size;
	if (noff >= isz)  {
		rw_exit(&ip->i_contents);
		return (ENXIO);
	}

	/*
	 * Check for the usual case where a file has no holes.
	 * If so we can optimise to set the end of the file as the first
	 * (virtual) hole. This avoids bmap_find() searching through
	 * every block in the file for a (non-existent) hole.
	 */
	if (!bmap_has_holes(ip)) {
		rw_exit(&ip->i_contents);
		if (cmd == _FIO_SEEK_HOLE) {
			*off = isz;
			return (0);
		}
		/* *off must already point to valid data (non hole) */
		return (0);
	}

	/*
	 * Calling bmap_read() one block at a time on a 1TB file takes forever,
	 * so we use a special function to search for holes or blocks.
	 */
	if (cmd == _FIO_SEEK_HOLE)
		hole = B_TRUE;
	else
		hole = B_FALSE;
	error = bmap_find(ip, hole, &noff);
	rw_exit(&ip->i_contents);

	/* end of file? */
	if (error == ENXIO) {
		/*
		 * Handle the virtual hole at the end of file.
		 */
		if (cmd == _FIO_SEEK_HOLE) {
			*off = isz;
			return (0);
		}
		return (ENXIO);
	}
	if (noff < *off)
		return (error);
	*off = noff;
	return (error);
}