xref: /freebsd/sys/kern/vfs_vnops.c (revision b52b9d56d4e96089873a75f9e29062eec19fabba)
1 /*
2  * Copyright (c) 1982, 1986, 1989, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  * (c) UNIX System Laboratories, Inc.
5  * All or some portions of this file are derived from material licensed
6  * to the University of California by American Telephone and Telegraph
7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8  * the permission of UNIX System Laboratories, Inc.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. All advertising materials mentioning features or use of this software
19  *    must display the following acknowledgement:
20  *	This product includes software developed by the University of
21  *	California, Berkeley and its contributors.
22  * 4. Neither the name of the University nor the names of its contributors
23  *    may be used to endorse or promote products derived from this software
24  *    without specific prior written permission.
25  *
26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36  * SUCH DAMAGE.
37  *
38  *	@(#)vfs_vnops.c	8.2 (Berkeley) 1/21/94
39  * $FreeBSD$
40  */
41 
42 #include <sys/param.h>
43 #include <sys/systm.h>
44 #include <sys/fcntl.h>
45 #include <sys/file.h>
46 #include <sys/stat.h>
47 #include <sys/proc.h>
48 #include <sys/lock.h>
49 #include <sys/mount.h>
50 #include <sys/mutex.h>
51 #include <sys/namei.h>
52 #include <sys/vnode.h>
53 #include <sys/bio.h>
54 #include <sys/buf.h>
55 #include <sys/filio.h>
56 #include <sys/sx.h>
57 #include <sys/ttycom.h>
58 #include <sys/conf.h>
59 #include <sys/syslog.h>
60 
61 #include <machine/limits.h>
62 
63 static int vn_closefile(struct file *fp, struct thread *td);
64 static int vn_ioctl(struct file *fp, u_long com, void *data,
65 		struct thread *td);
66 static int vn_read(struct file *fp, struct uio *uio,
67 		struct ucred *cred, int flags, struct thread *td);
68 static int vn_poll(struct file *fp, int events, struct ucred *cred,
69 		struct thread *td);
70 static int vn_kqfilter(struct file *fp, struct knote *kn);
71 static int vn_statfile(struct file *fp, struct stat *sb, struct thread *td);
72 static int vn_write(struct file *fp, struct uio *uio,
73 		struct ucred *cred, int flags, struct thread *td);
74 
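/*
 * File operations vector installed in "struct file" for vnode-backed
 * descriptors.  The entries below are positional, in struct fileops
 * member order: read, write, ioctl, poll, kqfilter, stat, close.
 */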
75 struct 	fileops vnops = {
76 	vn_read, vn_write, vn_ioctl, vn_poll, vn_kqfilter,
77 	vn_statfile, vn_closefile
78 };
79 
80 int
81 vn_open(ndp, flagp, cmode)
82 	register struct nameidata *ndp;
83 	int *flagp, cmode;
84 {
85 	struct thread *td = ndp->ni_cnd.cn_thread;
86 
87 	return (vn_open_cred(ndp, flagp, cmode, td->td_ucred));
88 }
89 
90 /*
91  * Common code for vnode open operations.
92  * Check permissions, and call the VOP_OPEN or VOP_CREATE routine.
93  *
94  * Note that this does NOT free the nameidata in the successful case,
95  * because the NDINIT was done by the caller.
96  */
97 int
98 vn_open_cred(ndp, flagp, cmode, cred)
99 	register struct nameidata *ndp;
100 	int *flagp, cmode;
101 	struct ucred *cred;
102 {
103 	struct vnode *vp;
104 	struct mount *mp;
105 	struct thread *td = ndp->ni_cnd.cn_thread;
106 	struct vattr vat;
107 	struct vattr *vap = &vat;
108 	int mode, fmode, error;
109 #ifdef LOOKUP_SHARED
110 	int exclusive;	/* The current intended lock state */
111 
112 	exclusive = 0;
113 #endif
114 
115 restart:
116 	fmode = *flagp;
117 	if (fmode & O_CREAT) {
118 		ndp->ni_cnd.cn_nameiop = CREATE;
119 		ndp->ni_cnd.cn_flags = LOCKPARENT | LOCKLEAF;
120 		if ((fmode & O_EXCL) == 0 && (fmode & O_NOFOLLOW) == 0)
121 			ndp->ni_cnd.cn_flags |= FOLLOW;
122 		bwillwrite();
123 		if ((error = namei(ndp)) != 0)
124 			return (error);
125 		if (ndp->ni_vp == NULL) {
126 			VATTR_NULL(vap);
127 			vap->va_type = VREG;
128 			vap->va_mode = cmode;
129 			if (fmode & O_EXCL)
130 				vap->va_vaflags |= VA_EXCLUSIVE;
131 			if (vn_start_write(ndp->ni_dvp, &mp, V_NOWAIT) != 0) {
132 				NDFREE(ndp, NDF_ONLY_PNBUF);
133 				vput(ndp->ni_dvp);
134 				if ((error = vn_start_write(NULL, &mp,
135 				    V_XSLEEP | PCATCH)) != 0)
136 					return (error);
137 				goto restart;
138 			}
139 			VOP_LEASE(ndp->ni_dvp, td, cred, LEASE_WRITE);
140 			error = VOP_CREATE(ndp->ni_dvp, &ndp->ni_vp,
141 					   &ndp->ni_cnd, vap);
142 			vput(ndp->ni_dvp);
143 			vn_finished_write(mp);
144 			if (error) {
145 				NDFREE(ndp, NDF_ONLY_PNBUF);
146 				return (error);
147 			}
148 			ASSERT_VOP_UNLOCKED(ndp->ni_dvp, "create");
149 			ASSERT_VOP_LOCKED(ndp->ni_vp, "create");
150 			fmode &= ~O_TRUNC;
151 			vp = ndp->ni_vp;
152 #ifdef LOOKUP_SHARED
153 			exclusive = 1;
154 #endif
155 		} else {
156 			if (ndp->ni_dvp == ndp->ni_vp)
157 				vrele(ndp->ni_dvp);
158 			else
159 				vput(ndp->ni_dvp);
160 			ndp->ni_dvp = NULL;
161 			vp = ndp->ni_vp;
162 			if (fmode & O_EXCL) {
163 				error = EEXIST;
164 				goto bad;
165 			}
166 			fmode &= ~O_CREAT;
167 		}
168 	} else {
169 		ndp->ni_cnd.cn_nameiop = LOOKUP;
170 #ifdef LOOKUP_SHARED
171 		ndp->ni_cnd.cn_flags =
172 		    ((fmode & O_NOFOLLOW) ? NOFOLLOW : FOLLOW) |
173 		    LOCKSHARED | LOCKLEAF;
174 #else
175 		ndp->ni_cnd.cn_flags =
176 		    ((fmode & O_NOFOLLOW) ? NOFOLLOW : FOLLOW) | LOCKLEAF;
177 #endif
178 		if ((error = namei(ndp)) != 0)
179 			return (error);
180 		vp = ndp->ni_vp;
181 	}
182 	if (vp->v_type == VLNK) {
183 		error = EMLINK;
184 		goto bad;
185 	}
186 	if (vp->v_type == VSOCK) {
187 		error = EOPNOTSUPP;
188 		goto bad;
189 	}
190 	if ((fmode & O_CREAT) == 0) {
191 		mode = 0;
192 		if (fmode & (FWRITE | O_TRUNC)) {
193 			if (vp->v_type == VDIR) {
194 				error = EISDIR;
195 				goto bad;
196 			}
197 			error = vn_writechk(vp);
198 			if (error)
199 				goto bad;
200 			mode |= VWRITE;
201 		}
202 		if (fmode & FREAD)
203 			mode |= VREAD;
204 		if (fmode & O_APPEND)
205 			mode |= VAPPEND;
206 		if (mode) {
207 		        error = VOP_ACCESS(vp, mode, cred, td);
208 			if (error)
209 				goto bad;
210 		}
211 	}
212 	if ((error = VOP_OPEN(vp, fmode, cred, td)) != 0)
213 		goto bad;
214 	/*
215 	 * Make sure that a VM object is created for VMIO support.
216 	 */
217 	if (vn_canvmio(vp) == TRUE) {
218 #ifdef LOOKUP_SHARED
219 		int flock;
220 
221 		if (!exclusive && VOP_GETVOBJECT(vp, NULL) != 0)
222 			VOP_LOCK(vp, LK_UPGRADE, td);
223 		/*
224 	 * In cases where the object is marked as dead, vfs_object_create()
225 	 * will unlock and relock exclusively.  It is safe to call in
226 		 * here with a shared lock because we only examine fields that
227 		 * the shared lock guarantees will be stable.  In the UPGRADE
228 		 * case it is not likely that anyone has used this vnode yet
229 		 * so there will be no contention.  The logic after this call
230 		 * restores the requested locking state.
231 		 */
232 #endif
233 		if ((error = vfs_object_create(vp, td, cred)) != 0) {
234 			VOP_UNLOCK(vp, 0, td);
235 			VOP_CLOSE(vp, fmode, cred, td);
236 			NDFREE(ndp, NDF_ONLY_PNBUF);
237 			vrele(vp);
238 			*flagp = fmode;
239 			return (error);
240 		}
241 #ifdef LOOKUP_SHARED
242 		flock = VOP_ISLOCKED(vp, td);
243 		if (!exclusive && flock == LK_EXCLUSIVE)
244 			VOP_LOCK(vp, LK_DOWNGRADE, td);
245 #endif
246 	}
247 
248 	if (fmode & FWRITE)
249 		vp->v_writecount++;
250 	*flagp = fmode;
251 	return (0);
252 bad:
253 	NDFREE(ndp, NDF_ONLY_PNBUF);
254 	vput(vp);
255 	*flagp = fmode;
256 	return (error);
257 }
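
/*
 * Illustrative sketch only (not compiled): a typical in-kernel consumer of
 * vn_open() sets up a nameidata, opens the vnode, frees the pathname buffer
 * and drops the vnode lock that vn_open() returns with.  The helper name and
 * the flags chosen below are examples, not part of this file.
 */
#if 0
static int
example_kernel_open(char *path, struct thread *td, struct vnode **vpp)
{
	struct nameidata nd;
	int flags, error;

	NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, path, td);
	flags = FREAD;
	if ((error = vn_open(&nd, &flags, 0)) != 0)
		return (error);
	NDFREE(&nd, NDF_ONLY_PNBUF);
	VOP_UNLOCK(nd.ni_vp, 0, td);	/* vn_open() returns the vnode locked */
	*vpp = nd.ni_vp;		/* released later with vn_close() */
	return (0);
}
#endif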
258 
259 /*
260  * Check for write permissions on the specified vnode.
261  * Prototype text segments cannot be written.
262  */
263 int
264 vn_writechk(vp)
265 	register struct vnode *vp;
266 {
267 
268 	/*
269 	 * If there's shared text associated with
270 	 * the vnode, try to free it up once.  If
271 	 * we fail, we can't allow writing.
272 	 */
273 	if (vp->v_flag & VTEXT)
274 		return (ETXTBSY);
275 	return (0);
276 }
277 
278 /*
279  * Vnode close call
280  */
281 int
282 vn_close(vp, flags, cred, td)
283 	register struct vnode *vp;
284 	int flags;
285 	struct ucred *cred;
286 	struct thread *td;
287 {
288 	int error;
289 
290 	if (flags & FWRITE)
291 		vp->v_writecount--;
292 	error = VOP_CLOSE(vp, flags, cred, td);
293 	/*
294 	 * XXX - In certain instances VOP_CLOSE has to do the vrele
295 	 * itself. If the vrele has been done, it will return EAGAIN
296 	 * to indicate that the vrele should not be done again. When
297 	 * this happens, we just return success. The correct thing to
298 	 * do would be to have all VOP_CLOSE instances do the vrele.
299 	 */
300 	if (error == EAGAIN)
301 		return (0);
302 	vrele(vp);
303 	return (error);
304 }
305 
306 /*
307  * Sequential heuristic - detect sequential operation
308  */
309 static __inline
310 int
311 sequential_heuristic(struct uio *uio, struct file *fp)
312 {
313 
314 	if ((uio->uio_offset == 0 && fp->f_seqcount > 0) ||
315 	    uio->uio_offset == fp->f_nextoff) {
316 		/*
317 		 * XXX we assume that the filesystem block size is
318 		 * the default.  Not true, but still gives us a pretty
319 		 * good indicator of how sequential the read operations
320 		 * are.
321 		 */
322 		fp->f_seqcount += (uio->uio_resid + BKVASIZE - 1) / BKVASIZE;
323 		if (fp->f_seqcount >= 127)
324 			fp->f_seqcount = 127;
325 		return(fp->f_seqcount << 16);
326 	}
327 
328 	/*
329 	 * Not sequential; quickly draw down the seqcount.
330 	 */
331 	if (fp->f_seqcount > 1)
332 		fp->f_seqcount = 1;
333 	else
334 		fp->f_seqcount = 0;
335 	return(0);
336 }
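
/*
 * Worked example of the heuristic above, assuming the usual BKVASIZE of
 * 16K: a process reading a file from the start in back-to-back 64K requests
 * adds 4 to f_seqcount per call, saturating at 127.  The count is returned
 * shifted left 16 bits so that filesystems can recover it from the upper
 * bits of ioflag and use it as a sequential-access (read-ahead) hint.
 */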
337 
338 /*
339  * Package up an I/O request on a vnode into a uio and do it.
340  */
341 int
342 vn_rdwr(rw, vp, base, len, offset, segflg, ioflg, cred, aresid, td)
343 	enum uio_rw rw;
344 	struct vnode *vp;
345 	caddr_t base;
346 	int len;
347 	off_t offset;
348 	enum uio_seg segflg;
349 	int ioflg;
350 	struct ucred *cred;
351 	int *aresid;
352 	struct thread *td;
353 {
354 	struct uio auio;
355 	struct iovec aiov;
356 	struct mount *mp;
357 	int error;
358 
359 	if ((ioflg & IO_NODELOCKED) == 0) {
360 		mp = NULL;
361 		if (rw == UIO_WRITE) {
362 			if (vp->v_type != VCHR &&
363 			    (error = vn_start_write(vp, &mp, V_WAIT | PCATCH))
364 			    != 0)
365 				return (error);
366 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
367 		} else {
368 			vn_lock(vp, LK_SHARED | LK_RETRY, td);
369 		}
370 
371 	}
372 	auio.uio_iov = &aiov;
373 	auio.uio_iovcnt = 1;
374 	aiov.iov_base = base;
375 	aiov.iov_len = len;
376 	auio.uio_resid = len;
377 	auio.uio_offset = offset;
378 	auio.uio_segflg = segflg;
379 	auio.uio_rw = rw;
380 	auio.uio_td = td;
381 	if (rw == UIO_READ) {
382 		error = VOP_READ(vp, &auio, ioflg, cred);
383 	} else {
384 		error = VOP_WRITE(vp, &auio, ioflg, cred);
385 	}
386 	if (aresid)
387 		*aresid = auio.uio_resid;
388 	else
389 		if (auio.uio_resid && error == 0)
390 			error = EIO;
391 	if ((ioflg & IO_NODELOCKED) == 0) {
392 		if (rw == UIO_WRITE)
393 			vn_finished_write(mp);
394 		VOP_UNLOCK(vp, 0, td);
395 	}
396 	return (error);
397 }
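
/*
 * Illustrative sketch only (not compiled): reading the first 512 bytes of a
 * referenced but unlocked vnode into a kernel buffer.  vn_rdwr() takes care
 * of the locking itself because IO_NODELOCKED is not passed; vp and td are
 * assumed to come from the surrounding context.
 */
#if 0
	char buf[512];
	int resid, error;

	error = vn_rdwr(UIO_READ, vp, buf, sizeof(buf), (off_t)0,
	    UIO_SYSSPACE, 0, td->td_ucred, &resid, td);
	if (error == 0 && resid != 0)
		error = EIO;	/* short read: treat as an error here */
#endif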
398 
399 /*
400  * Package up an I/O request on a vnode into a uio and do it.  The I/O
401  * request is split up into smaller chunks and we try to avoid saturating
402  * the buffer cache while potentially holding a vnode locked, so we
403  * call bwillwrite() before each vn_rdwr() chunk.  We also call uio_yield()
404  * to give other processes a chance to lock the vnode (either other processes
405  * core'ing the same binary, or unrelated processes scanning the directory).
406  */
407 int
408 vn_rdwr_inchunks(rw, vp, base, len, offset, segflg, ioflg, cred, aresid, td)
409 	enum uio_rw rw;
410 	struct vnode *vp;
411 	caddr_t base;
412 	int len;
413 	off_t offset;
414 	enum uio_seg segflg;
415 	int ioflg;
416 	struct ucred *cred;
417 	int *aresid;
418 	struct thread *td;
419 {
420 	int error = 0;
421 
422 	do {
423 		int chunk = (len > MAXBSIZE) ? MAXBSIZE : len;
424 
425 		if (rw != UIO_READ && vp->v_type == VREG)
426 			bwillwrite();
427 		error = vn_rdwr(rw, vp, base, chunk, offset, segflg,
428 		    ioflg, cred, aresid, td);
429 		len -= chunk;	/* aresid calc already includes length */
430 		if (error)
431 			break;
432 		offset += chunk;
433 		base += chunk;
434 		uio_yield();
435 	} while (len);
436 	if (aresid)
437 		*aresid += len;
438 	return (error);
439 }
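
/*
 * For example, with the usual MAXBSIZE of 64K, a 1MB core dump pushed
 * through vn_rdwr_inchunks() turns into sixteen vn_rdwr() calls, with a
 * bwillwrite() throttle check before and a uio_yield() after each chunk.
 */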
440 
441 /*
442  * File table vnode read routine.
443  */
444 static int
445 vn_read(fp, uio, cred, flags, td)
446 	struct file *fp;
447 	struct uio *uio;
448 	struct ucred *cred;
449 	struct thread *td;
450 	int flags;
451 {
452 	struct vnode *vp;
453 	int error, ioflag;
454 
455 	mtx_lock(&Giant);
456 	KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
457 	    uio->uio_td, td));
458 	vp = (struct vnode *)fp->f_data;
459 	ioflag = 0;
460 	if (fp->f_flag & FNONBLOCK)
461 		ioflag |= IO_NDELAY;
462 	if (fp->f_flag & O_DIRECT)
463 		ioflag |= IO_DIRECT;
464 	VOP_LEASE(vp, td, cred, LEASE_READ);
465 	vn_lock(vp, LK_SHARED | LK_NOPAUSE | LK_RETRY, td);
466 	if ((flags & FOF_OFFSET) == 0)
467 		uio->uio_offset = fp->f_offset;
468 
469 	ioflag |= sequential_heuristic(uio, fp);
470 
471 	error = VOP_READ(vp, uio, ioflag, cred);
472 	if ((flags & FOF_OFFSET) == 0)
473 		fp->f_offset = uio->uio_offset;
474 	fp->f_nextoff = uio->uio_offset;
475 	VOP_UNLOCK(vp, 0, td);
476 	mtx_unlock(&Giant);
477 	return (error);
478 }
479 
480 /*
481  * File table vnode write routine.
482  */
483 static int
484 vn_write(fp, uio, cred, flags, td)
485 	struct file *fp;
486 	struct uio *uio;
487 	struct ucred *cred;
488 	struct thread *td;
489 	int flags;
490 {
491 	struct vnode *vp;
492 	struct mount *mp;
493 	int error, ioflag;
494 
495 	mtx_lock(&Giant);
496 	KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
497 	    uio->uio_td, td));
498 	vp = (struct vnode *)fp->f_data;
499 	if (vp->v_type == VREG)
500 		bwillwrite();
501 	ioflag = IO_UNIT;
502 	if (vp->v_type == VREG && (fp->f_flag & O_APPEND))
503 		ioflag |= IO_APPEND;
504 	if (fp->f_flag & FNONBLOCK)
505 		ioflag |= IO_NDELAY;
506 	if (fp->f_flag & O_DIRECT)
507 		ioflag |= IO_DIRECT;
508 	if ((fp->f_flag & O_FSYNC) ||
509 	    (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SYNCHRONOUS)))
510 		ioflag |= IO_SYNC;
511 	mp = NULL;
512 	if (vp->v_type != VCHR &&
513 	    (error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) {
514 		mtx_unlock(&Giant);
515 		return (error);
516 	}
517 	VOP_LEASE(vp, td, cred, LEASE_WRITE);
518 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
519 	if ((flags & FOF_OFFSET) == 0)
520 		uio->uio_offset = fp->f_offset;
521 	ioflag |= sequential_heuristic(uio, fp);
522 	error = VOP_WRITE(vp, uio, ioflag, cred);
523 	if ((flags & FOF_OFFSET) == 0)
524 		fp->f_offset = uio->uio_offset;
525 	fp->f_nextoff = uio->uio_offset;
526 	VOP_UNLOCK(vp, 0, td);
527 	vn_finished_write(mp);
528 	mtx_unlock(&Giant);
529 	return (error);
530 }
531 
532 /*
533  * File table vnode stat routine.
534  */
535 static int
536 vn_statfile(fp, sb, td)
537 	struct file *fp;
538 	struct stat *sb;
539 	struct thread *td;
540 {
541 	struct vnode *vp = (struct vnode *)fp->f_data;
542 	int error;
543 
544 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
545 	error = vn_stat(vp, sb, td);
546 	VOP_UNLOCK(vp, 0, td);
547 
548 	return (error);
549 }
550 
551 /*
552  * Stat a vnode; implementation for the stat syscall
553  */
554 int
555 vn_stat(vp, sb, td)
556 	struct vnode *vp;
557 	register struct stat *sb;
558 	struct thread *td;
559 {
560 	struct vattr vattr;
561 	register struct vattr *vap;
562 	int error;
563 	u_short mode;
564 
565 	vap = &vattr;
566 	error = VOP_GETATTR(vp, vap, td->td_ucred, td);
567 	if (error)
568 		return (error);
569 
570 	/*
571 	 * Zero the spare stat fields
572 	 */
573 	bzero(sb, sizeof *sb);
574 
575 	/*
576 	 * Copy from vattr table
577 	 */
578 	if (vap->va_fsid != VNOVAL)
579 		sb->st_dev = vap->va_fsid;
580 	else
581 		sb->st_dev = vp->v_mount->mnt_stat.f_fsid.val[0];
582 	sb->st_ino = vap->va_fileid;
583 	mode = vap->va_mode;
584 	switch (vap->va_type) {
585 	case VREG:
586 		mode |= S_IFREG;
587 		break;
588 	case VDIR:
589 		mode |= S_IFDIR;
590 		break;
591 	case VBLK:
592 		mode |= S_IFBLK;
593 		break;
594 	case VCHR:
595 		mode |= S_IFCHR;
596 		break;
597 	case VLNK:
598 		mode |= S_IFLNK;
599 		/* Cosmetic only: symlinks do not have a mode, so fake one here. */
600 		if (vp->v_mount->mnt_flag & MNT_NOSYMFOLLOW)
601 			mode &= ~ACCESSPERMS;	/* report 0000 */
602 		else
603 			mode |= ACCESSPERMS;	/* report 0777 */
604 		break;
605 	case VSOCK:
606 		mode |= S_IFSOCK;
607 		break;
608 	case VFIFO:
609 		mode |= S_IFIFO;
610 		break;
611 	default:
612 		return (EBADF);
613 	}
614 	sb->st_mode = mode;
615 	sb->st_nlink = vap->va_nlink;
616 	sb->st_uid = vap->va_uid;
617 	sb->st_gid = vap->va_gid;
618 	sb->st_rdev = vap->va_rdev;
619 	if (vap->va_size > OFF_MAX)
620 		return (EOVERFLOW);
621 	sb->st_size = vap->va_size;
622 	sb->st_atimespec = vap->va_atime;
623 	sb->st_mtimespec = vap->va_mtime;
624 	sb->st_ctimespec = vap->va_ctime;
625 	sb->st_birthtimespec = vap->va_birthtime;
626 
627         /*
628 	 * According to www.opengroup.org, the meaning of st_blksize is
629 	 *   "a filesystem-specific preferred I/O block size for this
630 	 *    object.  In some filesystem types, this may vary from file
631 	 *    to file"
632 	 * Default to PAGE_SIZE after much discussion.
633 	 */
634 
635 	if (vap->va_type == VREG) {
636 		sb->st_blksize = vap->va_blocksize;
637 	} else if (vn_isdisk(vp, NULL)) {
638 		sb->st_blksize = vp->v_rdev->si_bsize_best;
639 		if (sb->st_blksize < vp->v_rdev->si_bsize_phys)
640 			sb->st_blksize = vp->v_rdev->si_bsize_phys;
641 		if (sb->st_blksize < BLKDEV_IOSIZE)
642 			sb->st_blksize = BLKDEV_IOSIZE;
643 	} else {
644 		sb->st_blksize = PAGE_SIZE;
645 	}
646 
647 	sb->st_flags = vap->va_flags;
648 	if (suser(td))
649 		sb->st_gen = 0;
650 	else
651 		sb->st_gen = vap->va_gen;
652 
653 #if (S_BLKSIZE == 512)
654 	/* Optimize this case */
655 	sb->st_blocks = vap->va_bytes >> 9;
656 #else
657 	sb->st_blocks = vap->va_bytes / S_BLKSIZE;
658 #endif
659 	return (0);
660 }
661 
662 /*
663  * File table vnode ioctl routine.
664  */
665 static int
666 vn_ioctl(fp, com, data, td)
667 	struct file *fp;
668 	u_long com;
669 	void *data;
670 	struct thread *td;
671 {
672 	register struct vnode *vp = ((struct vnode *)fp->f_data);
673 	struct vnode *vpold;
674 	struct vattr vattr;
675 	int error;
676 
677 	switch (vp->v_type) {
678 
679 	case VREG:
680 	case VDIR:
681 		if (com == FIONREAD) {
682 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
683 			error = VOP_GETATTR(vp, &vattr, td->td_ucred, td);
684 			VOP_UNLOCK(vp, 0, td);
685 			if (error)
686 				return (error);
687 			*(int *)data = vattr.va_size - fp->f_offset;
688 			return (0);
689 		}
690 		if (com == FIONBIO || com == FIOASYNC)	/* XXX */
691 			return (0);			/* XXX */
692 		/* fall into ... */
693 
694 	default:
695 #if 0
696 		return (ENOTTY);
697 #endif
698 	case VFIFO:
699 	case VCHR:
700 	case VBLK:
701 		if (com == FIODTYPE) {
702 			if (vp->v_type != VCHR && vp->v_type != VBLK)
703 				return (ENOTTY);
704 			*(int *)data = devsw(vp->v_rdev)->d_flags & D_TYPEMASK;
705 			return (0);
706 		}
707 		error = VOP_IOCTL(vp, com, data, fp->f_flag, td->td_ucred, td);
708 		if (error == 0 && com == TIOCSCTTY) {
709 
710 			/* Do nothing if reassigning same control tty */
711 			sx_slock(&proctree_lock);
712 			if (td->td_proc->p_session->s_ttyvp == vp) {
713 				sx_sunlock(&proctree_lock);
714 				return (0);
715 			}
716 
717 			vpold = td->td_proc->p_session->s_ttyvp;
718 			VREF(vp);
719 			SESS_LOCK(td->td_proc->p_session);
720 			td->td_proc->p_session->s_ttyvp = vp;
721 			SESS_UNLOCK(td->td_proc->p_session);
722 
723 			sx_sunlock(&proctree_lock);
724 
725 			/* Get rid of reference to old control tty */
726 			if (vpold)
727 				vrele(vpold);
728 		}
729 		return (error);
730 	}
731 }
732 
733 /*
734  * File table vnode poll routine.
735  */
736 static int
737 vn_poll(fp, events, cred, td)
738 	struct file *fp;
739 	int events;
740 	struct ucred *cred;
741 	struct thread *td;
742 {
743 
744 	return (VOP_POLL(((struct vnode *)fp->f_data), events, cred, td));
745 }
746 
747 /*
748  * Check that the vnode is still valid, and if so,
749  * acquire the requested lock.
750  */
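/*
 * If the vnode is being recycled (VXLOCK set by another thread), the
 * attempt below sleeps until the recycler is done and then fails with
 * ENOENT, unless LK_RETRY was passed, in which case the loop simply
 * retries the lock once the flag clears.
 */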
751 int
752 #ifndef	DEBUG_LOCKS
753 vn_lock(vp, flags, td)
754 #else
755 debug_vn_lock(vp, flags, td, filename, line)
756 #endif
757 	struct vnode *vp;
758 	int flags;
759 	struct thread *td;
760 #ifdef	DEBUG_LOCKS
761 	const char *filename;
762 	int line;
763 #endif
764 {
765 	int error;
766 
767 	do {
768 		if ((flags & LK_INTERLOCK) == 0)
769 			mtx_lock(&vp->v_interlock);
770 		if ((vp->v_flag & VXLOCK) && vp->v_vxproc != curthread) {
771 			vp->v_flag |= VXWANT;
772 			msleep(vp, &vp->v_interlock, PINOD | PDROP,
773 			    "vn_lock", 0);
774 			error = ENOENT;
775 		} else {
776 #if 0
777 			/* this can now occur in normal operation */
778 			if (vp->v_vxproc != NULL)
779 				log(LOG_INFO, "VXLOCK interlock avoided in vn_lock\n");
780 #endif
781 #ifdef	DEBUG_LOCKS
782 			vp->filename = filename;
783 			vp->line = line;
784 #endif
785 			error = VOP_LOCK(vp,
786 				    flags | LK_NOPAUSE | LK_INTERLOCK, td);
787 			if (error == 0)
788 				return (error);
789 		}
790 		flags &= ~LK_INTERLOCK;
791 	} while (flags & LK_RETRY);
792 	return (error);
793 }
794 
795 /*
796  * File table vnode close routine.
797  */
798 static int
799 vn_closefile(fp, td)
800 	struct file *fp;
801 	struct thread *td;
802 {
803 
804 	fp->f_ops = &badfileops;
805 	return (vn_close(((struct vnode *)fp->f_data), fp->f_flag,
806 		fp->f_cred, td));
807 }
808 
809 /*
810  * Prepare to start a filesystem write operation. If the operation is
811  * permitted, then we bump the count of operations in progress and
812  * proceed. If a suspend request is in progress, we wait until the
813  * suspension is over, and then proceed.
814  */
815 int
816 vn_start_write(vp, mpp, flags)
817 	struct vnode *vp;
818 	struct mount **mpp;
819 	int flags;
820 {
821 	struct mount *mp;
822 	int error;
823 
824 	/*
825 	 * If a vnode is provided, get and return the mount point to which
826 	 * it will write.
827 	 */
828 	if (vp != NULL) {
829 		if ((error = VOP_GETWRITEMOUNT(vp, mpp)) != 0) {
830 			*mpp = NULL;
831 			if (error != EOPNOTSUPP)
832 				return (error);
833 			return (0);
834 		}
835 	}
836 	if ((mp = *mpp) == NULL)
837 		return (0);
838 	/*
839 	 * Check on status of suspension.
840 	 */
841 	while ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0) {
842 		if (flags & V_NOWAIT)
843 			return (EWOULDBLOCK);
844 		error = tsleep(&mp->mnt_flag, (PUSER - 1) | (flags & PCATCH),
845 		    "suspfs", 0);
846 		if (error)
847 			return (error);
848 	}
849 	if (flags & V_XSLEEP)
850 		return (0);
851 	mp->mnt_writeopcount++;
852 	return (0);
853 }
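
/*
 * Illustrative sketch only (not compiled): the usual bracket around a
 * write-side vnode operation, mirroring vn_rdwr() and vn_write() above.
 * vn_start_write() honors any suspension in progress and bumps the
 * in-progress count that vn_finished_write() later drops; vp, td, cred,
 * vattr and error are assumed to come from the surrounding context.
 */
#if 0
	struct mount *mp;

	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
		return (error);
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
	error = VOP_SETATTR(vp, &vattr, cred, td);	/* any write-side VOP */
	VOP_UNLOCK(vp, 0, td);
	vn_finished_write(mp);
#endif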
854 
855 /*
856  * Secondary suspension. Used by operations such as vop_inactive
857  * routines that are needed by the higher level functions. These
858  * are allowed to proceed until all the higher level functions have
859  * completed (indicated by mnt_writeopcount dropping to zero). At that
860  * time, these operations are halted until the suspension is over.
861  */
862 int
863 vn_write_suspend_wait(vp, mp, flags)
864 	struct vnode *vp;
865 	struct mount *mp;
866 	int flags;
867 {
868 	int error;
869 
870 	if (vp != NULL) {
871 		if ((error = VOP_GETWRITEMOUNT(vp, &mp)) != 0) {
872 			if (error != EOPNOTSUPP)
873 				return (error);
874 			return (0);
875 		}
876 	}
877 	/*
878 	 * If we are not suspended or have not yet reached suspended
879 	 * mode, then let the operation proceed.
880 	 */
881 	if (mp == NULL || (mp->mnt_kern_flag & MNTK_SUSPENDED) == 0)
882 		return (0);
883 	if (flags & V_NOWAIT)
884 		return (EWOULDBLOCK);
885 	/*
886 	 * Wait for the suspension to finish.
887 	 */
888 	return (tsleep(&mp->mnt_flag, (PUSER - 1) | (flags & PCATCH),
889 	    "suspfs", 0));
890 }
891 
892 /*
893  * Filesystem write operation has completed. If we are suspending and this
894  * operation is the last one, notify the suspender that the suspension is
895  * now in effect.
896  */
897 void
898 vn_finished_write(mp)
899 	struct mount *mp;
900 {
901 
902 	if (mp == NULL)
903 		return;
904 	mp->mnt_writeopcount--;
905 	if (mp->mnt_writeopcount < 0)
906 		panic("vn_finished_write: neg cnt");
907 	if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0 &&
908 	    mp->mnt_writeopcount <= 0)
909 		wakeup(&mp->mnt_writeopcount);
910 }
911 
912 /*
913  * Request a filesystem to suspend write operations.
914  */
915 void
916 vfs_write_suspend(mp)
917 	struct mount *mp;
918 {
919 	struct thread *td = curthread;
920 
921 	if (mp->mnt_kern_flag & MNTK_SUSPEND)
922 		return;
923 	mp->mnt_kern_flag |= MNTK_SUSPEND;
924 	if (mp->mnt_writeopcount > 0)
925 		(void) tsleep(&mp->mnt_writeopcount, PUSER - 1, "suspwt", 0);
926 	VFS_SYNC(mp, MNT_WAIT, td->td_ucred, td);
927 	mp->mnt_kern_flag |= MNTK_SUSPENDED;
928 }
929 
930 /*
931  * Request a filesystem to resume write operations.
932  */
933 void
934 vfs_write_resume(mp)
935 	struct mount *mp;
936 {
937 
938 	if ((mp->mnt_kern_flag & MNTK_SUSPEND) == 0)
939 		return;
940 	mp->mnt_kern_flag &= ~(MNTK_SUSPEND | MNTK_SUSPENDED);
941 	wakeup(&mp->mnt_writeopcount);
942 	wakeup(&mp->mnt_flag);
943 }
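
/*
 * Taken together, the routines above implement a two-stage suspension:
 * vfs_write_suspend() first sets MNTK_SUSPEND so vn_start_write() blocks
 * new top-level writes, waits for mnt_writeopcount to drain, syncs the
 * filesystem, and only then sets MNTK_SUSPENDED, which is the flag that
 * vn_write_suspend_wait() checks for secondary writes (e.g. vop_inactive
 * processing).  vfs_write_resume() clears both flags and wakes the sleepers.
 */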
944 
945 /*
946  * Implement kqueues for files by translating it to vnode operation.
947  */
948 static int
949 vn_kqfilter(struct file *fp, struct knote *kn)
950 {
951 
952 	return (VOP_KQFILTER(((struct vnode *)fp->f_data), kn));
953 }
954 
955 /*
956  * Simplified in-kernel wrapper calls for extended attribute access.
957  * These calls pass in a NULL credential, authorizing as "kernel" access.
958  * Set IO_NODELOCKED in ioflg if the vnode is already locked.
959  */
960 int
961 vn_extattr_get(struct vnode *vp, int ioflg, int attrnamespace,
962     const char *attrname, int *buflen, char *buf, struct thread *td)
963 {
964 	struct uio	auio;
965 	struct iovec	iov;
966 	int	error;
967 
968 	iov.iov_len = *buflen;
969 	iov.iov_base = buf;
970 
971 	auio.uio_iov = &iov;
972 	auio.uio_iovcnt = 1;
973 	auio.uio_rw = UIO_READ;
974 	auio.uio_segflg = UIO_SYSSPACE;
975 	auio.uio_td = td;
976 	auio.uio_offset = 0;
977 	auio.uio_resid = *buflen;
978 
979 	if ((ioflg & IO_NODELOCKED) == 0)
980 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
981 
982 	/* authorize attribute retrieval as kernel */
983 	error = VOP_GETEXTATTR(vp, attrnamespace, attrname, &auio, NULL, NULL,
984 	    td);
985 
986 	if ((ioflg & IO_NODELOCKED) == 0)
987 		VOP_UNLOCK(vp, 0, td);
988 
989 	if (error == 0) {
990 		*buflen = *buflen - auio.uio_resid;
991 	}
992 
993 	return (error);
994 }
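
/*
 * Illustrative sketch only (not compiled): fetching an extended attribute
 * into a stack buffer with the wrapper above.  The namespace and attribute
 * name are placeholders; on success, buflen has been trimmed to the number
 * of bytes actually returned.
 */
#if 0
	char buf[128];
	int buflen, error;

	buflen = sizeof(buf);
	error = vn_extattr_get(vp, IO_NODELOCKED, EXTATTR_NAMESPACE_SYSTEM,
	    "example.attr", &buflen, buf, td);
#endif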
995 
996 /*
997  * XXX failure mode if partially written?
998  */
999 int
1000 vn_extattr_set(struct vnode *vp, int ioflg, int attrnamespace,
1001     const char *attrname, int buflen, char *buf, struct thread *td)
1002 {
1003 	struct uio	auio;
1004 	struct iovec	iov;
1005 	struct mount	*mp;
1006 	int	error;
1007 
1008 	iov.iov_len = buflen;
1009 	iov.iov_base = buf;
1010 
1011 	auio.uio_iov = &iov;
1012 	auio.uio_iovcnt = 1;
1013 	auio.uio_rw = UIO_WRITE;
1014 	auio.uio_segflg = UIO_SYSSPACE;
1015 	auio.uio_td = td;
1016 	auio.uio_offset = 0;
1017 	auio.uio_resid = buflen;
1018 
1019 	if ((ioflg & IO_NODELOCKED) == 0) {
1020 		if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0)
1021 			return (error);
1022 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
1023 	}
1024 
1025 	/* authorize attribute setting as kernel */
1026 	error = VOP_SETEXTATTR(vp, attrnamespace, attrname, &auio, NULL, td);
1027 
1028 	if ((ioflg & IO_NODELOCKED) == 0) {
1029 		vn_finished_write(mp);
1030 		VOP_UNLOCK(vp, 0, td);
1031 	}
1032 
1033 	return (error);
1034 }
1035 
1036 int
1037 vn_extattr_rm(struct vnode *vp, int ioflg, int attrnamespace,
1038     const char *attrname, struct thread *td)
1039 {
1040 	struct mount	*mp;
1041 	int	error;
1042 
1043 	if ((ioflg & IO_NODELOCKED) == 0) {
1044 		if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0)
1045 			return (error);
1046 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
1047 	}
1048 
1049 	/* authorize attribute removal as kernel */
1050 	error = VOP_SETEXTATTR(vp, attrnamespace, attrname, NULL, NULL, td);
1051 
1052 	if ((ioflg & IO_NODELOCKED) == 0) {
1053 		vn_finished_write(mp);
1054 		VOP_UNLOCK(vp, 0, td);
1055 	}
1056 
1057 	return (error);
1058 }
1059