xref: /freebsd/sys/kern/vfs_vnops.c (revision daf1cffce2e07931f27c6c6998652e90df6ba87e)
1 /*
2  * Copyright (c) 1982, 1986, 1989, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  * (c) UNIX System Laboratories, Inc.
5  * All or some portions of this file are derived from material licensed
6  * to the University of California by American Telephone and Telegraph
7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8  * the permission of UNIX System Laboratories, Inc.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. All advertising materials mentioning features or use of this software
19  *    must display the following acknowledgement:
20  *	This product includes software developed by the University of
21  *	California, Berkeley and its contributors.
22  * 4. Neither the name of the University nor the names of its contributors
23  *    may be used to endorse or promote products derived from this software
24  *    without specific prior written permission.
25  *
26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36  * SUCH DAMAGE.
37  *
38  *	@(#)vfs_vnops.c	8.2 (Berkeley) 1/21/94
39  * $FreeBSD$
40  */
41 
42 #include <sys/param.h>
43 #include <sys/systm.h>
44 #include <sys/fcntl.h>
45 #include <sys/file.h>
46 #include <sys/stat.h>
47 #include <sys/proc.h>
48 #include <sys/mount.h>
49 #include <sys/namei.h>
50 #include <sys/vnode.h>
51 #include <sys/buf.h>
52 #include <sys/filio.h>
53 #include <sys/ttycom.h>
54 #include <sys/conf.h>
55 #include <vm/vm_zone.h>
56 
57 static int vn_closefile __P((struct file *fp, struct proc *p));
58 static int vn_ioctl __P((struct file *fp, u_long com, caddr_t data,
59 		struct proc *p));
60 static int vn_read __P((struct file *fp, struct uio *uio,
61 		struct ucred *cred, int flags, struct proc *p));
62 static int vn_poll __P((struct file *fp, int events, struct ucred *cred,
63 		struct proc *p));
64 static int vn_statfile __P((struct file *fp, struct stat *sb, struct proc *p));
65 static int vn_write __P((struct file *fp, struct uio *uio,
66 		struct ucred *cred, int flags, struct proc *p));
67 
68 struct 	fileops vnops =
69 	{ vn_read, vn_write, vn_ioctl, vn_poll, vn_statfile, vn_closefile };
70 
71 /*
72  * Common code for vnode open operations.
73  * Check permissions, and call the VOP_OPEN or VOP_CREATE routine.
74  *
75  * Note that this do NOT free nameidata for the successful case,
76  * due to the NDINIT being done elsewhere.
77  */
78 int
79 vn_open(ndp, fmode, cmode)
80 	register struct nameidata *ndp;
81 	int fmode, cmode;
82 {
83 	register struct vnode *vp;
84 	register struct proc *p = ndp->ni_cnd.cn_proc;
85 	register struct ucred *cred = p->p_ucred;
86 	struct vattr vat;
87 	struct vattr *vap = &vat;
88 	int mode, error;
89 
90 	if (fmode & O_CREAT) {
91 		ndp->ni_cnd.cn_nameiop = CREATE;
92 		ndp->ni_cnd.cn_flags = LOCKPARENT | LOCKLEAF;
93 		if ((fmode & O_EXCL) == 0 && (fmode & O_NOFOLLOW) == 0)
94 			ndp->ni_cnd.cn_flags |= FOLLOW;
95 		bwillwrite();
96 		error = namei(ndp);
97 		if (error)
98 			return (error);
99 		if (ndp->ni_vp == NULL) {
100 			VATTR_NULL(vap);
101 			vap->va_type = VREG;
102 			vap->va_mode = cmode;
103 			if (fmode & O_EXCL)
104 				vap->va_vaflags |= VA_EXCLUSIVE;
105 			VOP_LEASE(ndp->ni_dvp, p, cred, LEASE_WRITE);
106 			error = VOP_CREATE(ndp->ni_dvp, &ndp->ni_vp,
107 					   &ndp->ni_cnd, vap);
108 			if (error) {
109 				NDFREE(ndp, NDF_ONLY_PNBUF);
110 				vput(ndp->ni_dvp);
111 				return (error);
112 			}
113 			vput(ndp->ni_dvp);
114 			ASSERT_VOP_UNLOCKED(ndp->ni_dvp, "create");
115 			ASSERT_VOP_LOCKED(ndp->ni_vp, "create");
116 			fmode &= ~O_TRUNC;
117 			vp = ndp->ni_vp;
118 		} else {
119 			if (ndp->ni_dvp == ndp->ni_vp)
120 				vrele(ndp->ni_dvp);
121 			else
122 				vput(ndp->ni_dvp);
123 			ndp->ni_dvp = NULL;
124 			vp = ndp->ni_vp;
125 			if (fmode & O_EXCL) {
126 				error = EEXIST;
127 				goto bad;
128 			}
129 			fmode &= ~O_CREAT;
130 		}
131 	} else {
132 		ndp->ni_cnd.cn_nameiop = LOOKUP;
133 		ndp->ni_cnd.cn_flags =
134 		    ((fmode & O_NOFOLLOW) ? NOFOLLOW : FOLLOW) | LOCKLEAF;
135 		error = namei(ndp);
136 		if (error)
137 			return (error);
138 		vp = ndp->ni_vp;
139 	}
140 	if (vp->v_type == VLNK) {
141 		error = EMLINK;
142 		goto bad;
143 	}
144 	if (vp->v_type == VSOCK) {
145 		error = EOPNOTSUPP;
146 		goto bad;
147 	}
148 	if ((fmode & O_CREAT) == 0) {
149 		mode = 0;
150 		if (fmode & (FWRITE | O_TRUNC)) {
151 			if (vp->v_type == VDIR) {
152 				error = EISDIR;
153 				goto bad;
154 			}
155 			error = vn_writechk(vp);
156 			if (error)
157 				goto bad;
158 			mode |= VWRITE;
159 		}
160 		if (fmode & FREAD)
161 			mode |= VREAD;
162 		if (mode) {
163 		        error = VOP_ACCESS(vp, mode, cred, p);
164 			if (error)
165 				goto bad;
166 		}
167 	}
168 	if (fmode & O_TRUNC) {
169 		VOP_UNLOCK(vp, 0, p);				/* XXX */
170 		VOP_LEASE(vp, p, cred, LEASE_WRITE);
171 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);	/* XXX */
172 		VATTR_NULL(vap);
173 		vap->va_size = 0;
174 		error = VOP_SETATTR(vp, vap, cred, p);
175 		if (error)
176 			goto bad;
177 	}
178 	error = VOP_OPEN(vp, fmode, cred, p);
179 	if (error)
180 		goto bad;
181 	/*
182 	 * Make sure that a VM object is created for VMIO support.
183 	 */
184 	if (vn_canvmio(vp) == TRUE) {
185 		if ((error = vfs_object_create(vp, p, cred)) != 0)
186 			goto bad;
187 	}
188 
189 	if (fmode & FWRITE)
190 		vp->v_writecount++;
191 	return (0);
192 bad:
193 	NDFREE(ndp, NDF_ONLY_PNBUF);
194 	vput(vp);
195 	return (error);
196 }
197 
198 /*
199  * Check for write permissions on the specified vnode.
200  * Prototype text segments cannot be written.
201  */
202 int
203 vn_writechk(vp)
204 	register struct vnode *vp;
205 {
206 
207 	/*
208 	 * If there's shared text associated with
209 	 * the vnode, try to free it up once.  If
210 	 * we fail, we can't allow writing.
211 	 */
212 	if (vp->v_flag & VTEXT)
213 		return (ETXTBSY);
214 	return (0);
215 }
216 
217 /*
218  * Vnode close call
219  */
220 int
221 vn_close(vp, flags, cred, p)
222 	register struct vnode *vp;
223 	int flags;
224 	struct ucred *cred;
225 	struct proc *p;
226 {
227 	int error;
228 
229 	if (flags & FWRITE)
230 		vp->v_writecount--;
231 	error = VOP_CLOSE(vp, flags, cred, p);
232 	vrele(vp);
233 	return (error);
234 }
235 
236 /*
237  * Package up an I/O request on a vnode into a uio and do it.
238  */
239 int
240 vn_rdwr(rw, vp, base, len, offset, segflg, ioflg, cred, aresid, p)
241 	enum uio_rw rw;
242 	struct vnode *vp;
243 	caddr_t base;
244 	int len;
245 	off_t offset;
246 	enum uio_seg segflg;
247 	int ioflg;
248 	struct ucred *cred;
249 	int *aresid;
250 	struct proc *p;
251 {
252 	struct uio auio;
253 	struct iovec aiov;
254 	int error;
255 
256 	if ((ioflg & IO_NODELOCKED) == 0)
257 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
258 	auio.uio_iov = &aiov;
259 	auio.uio_iovcnt = 1;
260 	aiov.iov_base = base;
261 	aiov.iov_len = len;
262 	auio.uio_resid = len;
263 	auio.uio_offset = offset;
264 	auio.uio_segflg = segflg;
265 	auio.uio_rw = rw;
266 	auio.uio_procp = p;
267 	if (rw == UIO_READ) {
268 		error = VOP_READ(vp, &auio, ioflg, cred);
269 	} else {
270 		error = VOP_WRITE(vp, &auio, ioflg, cred);
271 	}
272 	if (aresid)
273 		*aresid = auio.uio_resid;
274 	else
275 		if (auio.uio_resid && error == 0)
276 			error = EIO;
277 	if ((ioflg & IO_NODELOCKED) == 0)
278 		VOP_UNLOCK(vp, 0, p);
279 	return (error);
280 }
281 
282 /*
283  * File table vnode read routine.
284  */
285 static int
286 vn_read(fp, uio, cred, flags, p)
287 	struct file *fp;
288 	struct uio *uio;
289 	struct ucred *cred;
290 	struct proc *p;
291 	int flags;
292 {
293 	struct vnode *vp;
294 	int error, ioflag;
295 
296 	KASSERT(uio->uio_procp == p, ("uio_procp %p is not p %p",
297 	    uio->uio_procp, p));
298 	vp = (struct vnode *)fp->f_data;
299 	ioflag = 0;
300 	if (fp->f_flag & FNONBLOCK)
301 		ioflag |= IO_NDELAY;
302 	VOP_LEASE(vp, p, cred, LEASE_READ);
303 	vn_lock(vp, LK_SHARED | LK_NOPAUSE | LK_RETRY, p);
304 	if ((flags & FOF_OFFSET) == 0)
305 		uio->uio_offset = fp->f_offset;
306 
307 	/*
308 	 * Sequential read heuristic.
309 	 * If we have been doing sequential input,
310 	 * a rewind operation doesn't turn off
311 	 * sequential input mode.
312 	 */
313 	if ((uio->uio_offset == 0 && fp->f_seqcount > 0) ||
314 	    uio->uio_offset == fp->f_nextread) {
315 		int tmpseq = fp->f_seqcount;
316 		/*
317 		 * XXX we assume that the filesystem block size is
318 		 * the default.  Not true, but still gives us a pretty
319 		 * good indicator of how sequential the read operations
320 		 * are.
321 		 */
322 		tmpseq += (uio->uio_resid + BKVASIZE - 1) / BKVASIZE;
323 		if (tmpseq >= 127)
324 			tmpseq = 127;
325 		fp->f_seqcount = tmpseq;
326 		ioflag |= fp->f_seqcount << 16;
327 	} else {
328 		if (fp->f_seqcount > 1)
329 			fp->f_seqcount = 1;
330 		else
331 			fp->f_seqcount = 0;
332 	}
333 	error = VOP_READ(vp, uio, ioflag, cred);
334 	if ((flags & FOF_OFFSET) == 0)
335 		fp->f_offset = uio->uio_offset;
336 	fp->f_nextread = uio->uio_offset;
337 	VOP_UNLOCK(vp, 0, p);
338 	return (error);
339 }
340 
341 /*
342  * File table vnode write routine.
343  */
344 static int
345 vn_write(fp, uio, cred, flags, p)
346 	struct file *fp;
347 	struct uio *uio;
348 	struct ucred *cred;
349 	struct proc *p;
350 	int flags;
351 {
352 	struct vnode *vp;
353 	int error, ioflag;
354 
355 	KASSERT(uio->uio_procp == p, ("uio_procp %p is not p %p",
356 	    uio->uio_procp, p));
357 	vp = (struct vnode *)fp->f_data;
358 	if (vp->v_type == VREG)
359 		bwillwrite();
360 	vp = (struct vnode *)fp->f_data;	/* XXX needed? */
361 	ioflag = IO_UNIT;
362 	if (vp->v_type == VREG && (fp->f_flag & O_APPEND))
363 		ioflag |= IO_APPEND;
364 	if (fp->f_flag & FNONBLOCK)
365 		ioflag |= IO_NDELAY;
366 	if ((fp->f_flag & O_FSYNC) ||
367 	    (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SYNCHRONOUS)))
368 		ioflag |= IO_SYNC;
369 	VOP_LEASE(vp, p, cred, LEASE_WRITE);
370 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
371 	if ((flags & FOF_OFFSET) == 0)
372 		uio->uio_offset = fp->f_offset;
373 	error = VOP_WRITE(vp, uio, ioflag, cred);
374 	if ((flags & FOF_OFFSET) == 0)
375 		fp->f_offset = uio->uio_offset;
376 	VOP_UNLOCK(vp, 0, p);
377 	return (error);
378 }
379 
380 /*
381  * File table vnode stat routine.
382  */
383 static int
384 vn_statfile(fp, sb, p)
385 	struct file *fp;
386 	struct stat *sb;
387 	struct proc *p;
388 {
389 	struct vnode *vp = (struct vnode *)fp->f_data;
390 
391 	return vn_stat(vp, sb, p);
392 }
393 
394 int
395 vn_stat(vp, sb, p)
396 	struct vnode *vp;
397 	register struct stat *sb;
398 	struct proc *p;
399 {
400 	struct vattr vattr;
401 	register struct vattr *vap;
402 	int error;
403 	u_short mode;
404 
405 	vap = &vattr;
406 	error = VOP_GETATTR(vp, vap, p->p_ucred, p);
407 	if (error)
408 		return (error);
409 
410 	/*
411 	 * Zero the spare stat fields
412 	 */
413 	sb->st_lspare = 0;
414 	sb->st_qspare[0] = 0;
415 	sb->st_qspare[1] = 0;
416 
417 	/*
418 	 * Copy from vattr table
419 	 */
420 	if (vap->va_fsid != VNOVAL)
421 		sb->st_dev = vap->va_fsid;
422 	else
423 		sb->st_dev = vp->v_mount->mnt_stat.f_fsid.val[0];
424 	sb->st_ino = vap->va_fileid;
425 	mode = vap->va_mode;
426 	switch (vap->va_type) {
427 	case VREG:
428 		mode |= S_IFREG;
429 		break;
430 	case VDIR:
431 		mode |= S_IFDIR;
432 		break;
433 	case VBLK:
434 		mode |= S_IFBLK;
435 		break;
436 	case VCHR:
437 		mode |= S_IFCHR;
438 		break;
439 	case VLNK:
440 		mode |= S_IFLNK;
441 		/* This is a cosmetic change, symlinks do not have a mode. */
442 		if (vp->v_mount->mnt_flag & MNT_NOSYMFOLLOW)
443 			sb->st_mode &= ~ACCESSPERMS;	/* 0000 */
444 		else
445 			sb->st_mode |= ACCESSPERMS;	/* 0777 */
446 		break;
447 	case VSOCK:
448 		mode |= S_IFSOCK;
449 		break;
450 	case VFIFO:
451 		mode |= S_IFIFO;
452 		break;
453 	default:
454 		return (EBADF);
455 	};
456 	sb->st_mode = mode;
457 	sb->st_nlink = vap->va_nlink;
458 	sb->st_uid = vap->va_uid;
459 	sb->st_gid = vap->va_gid;
460 	sb->st_rdev = vap->va_rdev;
461 	sb->st_size = vap->va_size;
462 	sb->st_atimespec = vap->va_atime;
463 	sb->st_mtimespec = vap->va_mtime;
464 	sb->st_ctimespec = vap->va_ctime;
465 
466         /*
467 	 * According to www.opengroup.org, the meaning of st_blksize is
468 	 *   "a filesystem-specific preferred I/O block size for this
469 	 *    object.  In some filesystem types, this may vary from file
470 	 *    to file"
471 	 * Default to zero to catch bogus uses of this field.
472 	 */
473 
474 	if (vap->va_type == VREG) {
475 		sb->st_blksize = vap->va_blocksize;
476 	} else if (vn_isdisk(vp, NULL)) {
477 		sb->st_blksize = vp->v_rdev->si_bsize_best;
478 		if (sb->st_blksize < vp->v_rdev->si_bsize_phys)
479 			sb->st_blksize = vp->v_rdev->si_bsize_phys;
480 		if (sb->st_blksize < BLKDEV_IOSIZE)
481 			sb->st_blksize = BLKDEV_IOSIZE;
482 	} else {
483 		sb->st_blksize = 0;
484 	}
485 
486 	sb->st_flags = vap->va_flags;
487 	if (suser_xxx(p->p_ucred, 0, 0))
488 		sb->st_gen = 0;
489 	else
490 		sb->st_gen = vap->va_gen;
491 
492 #if (S_BLKSIZE == 512)
493 	/* Optimize this case */
494 	sb->st_blocks = vap->va_bytes >> 9;
495 #else
496 	sb->st_blocks = vap->va_bytes / S_BLKSIZE;
497 #endif
498 	return (0);
499 }
500 
501 /*
502  * File table vnode ioctl routine.
503  */
504 static int
505 vn_ioctl(fp, com, data, p)
506 	struct file *fp;
507 	u_long com;
508 	caddr_t data;
509 	struct proc *p;
510 {
511 	register struct vnode *vp = ((struct vnode *)fp->f_data);
512 	struct vattr vattr;
513 	int error;
514 
515 	switch (vp->v_type) {
516 
517 	case VREG:
518 	case VDIR:
519 		if (com == FIONREAD) {
520 			error = VOP_GETATTR(vp, &vattr, p->p_ucred, p);
521 			if (error)
522 				return (error);
523 			*(int *)data = vattr.va_size - fp->f_offset;
524 			return (0);
525 		}
526 		if (com == FIONBIO || com == FIOASYNC)	/* XXX */
527 			return (0);			/* XXX */
528 		/* fall into ... */
529 
530 	default:
531 #if 0
532 		return (ENOTTY);
533 #endif
534 	case VFIFO:
535 	case VCHR:
536 	case VBLK:
537 		if (com == FIODTYPE) {
538 			if (vp->v_type != VCHR && vp->v_type != VBLK)
539 				return (ENOTTY);
540 			*(int *)data = devsw(vp->v_rdev)->d_flags & D_TYPEMASK;
541 			return (0);
542 		}
543 		error = VOP_IOCTL(vp, com, data, fp->f_flag, p->p_ucred, p);
544 		if (error == 0 && com == TIOCSCTTY) {
545 
546 			/* Do nothing if reassigning same control tty */
547 			if (p->p_session->s_ttyvp == vp)
548 				return (0);
549 
550 			/* Get rid of reference to old control tty */
551 			if (p->p_session->s_ttyvp)
552 				vrele(p->p_session->s_ttyvp);
553 
554 			p->p_session->s_ttyvp = vp;
555 			VREF(vp);
556 		}
557 		return (error);
558 	}
559 }
560 
561 /*
562  * File table vnode poll routine.
563  */
564 static int
565 vn_poll(fp, events, cred, p)
566 	struct file *fp;
567 	int events;
568 	struct ucred *cred;
569 	struct proc *p;
570 {
571 
572 	return (VOP_POLL(((struct vnode *)fp->f_data), events, cred, p));
573 }
574 
575 /*
576  * Check that the vnode is still valid, and if so
577  * acquire requested lock.
578  */
579 int
580 #ifndef	DEBUG_LOCKS
581 vn_lock(vp, flags, p)
582 #else
583 debug_vn_lock(vp, flags, p, filename, line)
584 #endif
585 	struct vnode *vp;
586 	int flags;
587 	struct proc *p;
588 #ifdef	DEBUG_LOCKS
589 	const char *filename;
590 	int line;
591 #endif
592 {
593 	int error;
594 
595 	do {
596 		if ((flags & LK_INTERLOCK) == 0)
597 			simple_lock(&vp->v_interlock);
598 		if (vp->v_flag & VXLOCK) {
599 			vp->v_flag |= VXWANT;
600 			simple_unlock(&vp->v_interlock);
601 			tsleep((caddr_t)vp, PINOD, "vn_lock", 0);
602 			error = ENOENT;
603 		} else {
604 #ifdef	DEBUG_LOCKS
605 			vp->filename = filename;
606 			vp->line = line;
607 #endif
608 			error = VOP_LOCK(vp,
609 				    flags | LK_NOPAUSE | LK_INTERLOCK, p);
610 			if (error == 0)
611 				return (error);
612 		}
613 		flags &= ~LK_INTERLOCK;
614 	} while (flags & LK_RETRY);
615 	return (error);
616 }
617 
618 /*
619  * File table vnode close routine.
620  */
621 static int
622 vn_closefile(fp, p)
623 	struct file *fp;
624 	struct proc *p;
625 {
626 
627 	fp->f_ops = &badfileops;
628 	return (vn_close(((struct vnode *)fp->f_data), fp->f_flag,
629 		fp->f_cred, p));
630 }
631