xref: /titanic_50/usr/src/uts/common/fs/tmpfs/tmp_vnops.c (revision 5c9d25d25ae7531d61aca4904f76e3dae2f457bf)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #include <sys/types.h>
27 #include <sys/param.h>
28 #include <sys/t_lock.h>
29 #include <sys/systm.h>
30 #include <sys/sysmacros.h>
31 #include <sys/user.h>
32 #include <sys/time.h>
33 #include <sys/vfs.h>
34 #include <sys/vfs_opreg.h>
35 #include <sys/vnode.h>
36 #include <sys/file.h>
37 #include <sys/fcntl.h>
38 #include <sys/flock.h>
39 #include <sys/kmem.h>
40 #include <sys/uio.h>
41 #include <sys/errno.h>
42 #include <sys/stat.h>
43 #include <sys/cred.h>
44 #include <sys/dirent.h>
45 #include <sys/pathname.h>
46 #include <sys/vmsystm.h>
47 #include <sys/fs/tmp.h>
48 #include <sys/fs/tmpnode.h>
49 #include <sys/mman.h>
50 #include <vm/hat.h>
51 #include <vm/seg_vn.h>
52 #include <vm/seg_map.h>
53 #include <vm/seg.h>
54 #include <vm/anon.h>
55 #include <vm/as.h>
56 #include <vm/page.h>
57 #include <vm/pvn.h>
58 #include <sys/cmn_err.h>
59 #include <sys/debug.h>
60 #include <sys/swap.h>
61 #include <sys/buf.h>
62 #include <sys/vm.h>
63 #include <sys/vtrace.h>
64 #include <sys/policy.h>
65 #include <fs/fs_subr.h>
66 
67 static int	tmp_getapage(struct vnode *, u_offset_t, size_t, uint_t *,
68 	page_t **, size_t, struct seg *, caddr_t, enum seg_rw, struct cred *);
69 static int 	tmp_putapage(struct vnode *, page_t *, u_offset_t *, size_t *,
70 	int, struct cred *);
71 
72 /* ARGSUSED1 */
73 static int
74 tmp_open(struct vnode **vpp, int flag, struct cred *cred, caller_context_t *ct)
75 {
76 	/*
77 	 * swapon to a tmpfs file is not supported so access
78 	 * is denied on open if VISSWAP is set.
79 	 */
80 	if ((*vpp)->v_flag & VISSWAP)
81 		return (EINVAL);
82 	return (0);
83 }
84 
85 /* ARGSUSED1 */
86 static int
87 tmp_close(
88 	struct vnode *vp,
89 	int flag,
90 	int count,
91 	offset_t offset,
92 	struct cred *cred,
93 	caller_context_t *ct)
94 {
95 	cleanlocks(vp, ttoproc(curthread)->p_pid, 0);
96 	cleanshares(vp, ttoproc(curthread)->p_pid);
97 	return (0);
98 }
99 
100 /*
101  * wrtmp does the real work of write requests for tmpfs.
102  */
103 static int
104 wrtmp(
105 	struct tmount *tm,
106 	struct tmpnode *tp,
107 	struct uio *uio,
108 	struct cred *cr,
109 	struct caller_context *ct)
110 {
111 	pgcnt_t pageoffset;	/* offset in pages */
112 	ulong_t segmap_offset;	/* pagesize byte offset into segmap */
113 	caddr_t base;		/* base of segmap */
114 	ssize_t bytes;		/* bytes to uiomove */
115 	pfn_t pagenumber;	/* offset in pages into tmp file */
116 	struct vnode *vp;
117 	int error = 0;
118 	int	pagecreate;	/* == 1 if we allocated a page */
119 	int	newpage;
120 	rlim64_t limit = uio->uio_llimit;
121 	long oresid = uio->uio_resid;
122 	timestruc_t now;
123 
124 	long tn_size_changed = 0;
125 	long old_tn_size;
126 	long new_tn_size;
127 
128 	vp = TNTOV(tp);
129 	ASSERT(vp->v_type == VREG);
130 
131 	TRACE_1(TR_FAC_TMPFS, TR_TMPFS_RWTMP_START,
132 	    "tmp_wrtmp_start:vp %p", vp);
133 
134 	ASSERT(RW_WRITE_HELD(&tp->tn_contents));
135 	ASSERT(RW_WRITE_HELD(&tp->tn_rwlock));
136 
137 	if (MANDLOCK(vp, tp->tn_mode)) {
138 		rw_exit(&tp->tn_contents);
139 		/*
140 		 * tmp_getattr ends up being called by chklock
141 		 */
142 		error = chklock(vp, FWRITE, uio->uio_loffset, uio->uio_resid,
143 		    uio->uio_fmode, ct);
144 		rw_enter(&tp->tn_contents, RW_WRITER);
145 		if (error != 0) {
146 			TRACE_2(TR_FAC_TMPFS, TR_TMPFS_RWTMP_END,
147 			    "tmp_wrtmp_end:vp %p error %d", vp, error);
148 			return (error);
149 		}
150 	}
151 
152 	if (uio->uio_loffset < 0)
153 		return (EINVAL);
154 
155 	if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
156 		limit = MAXOFFSET_T;
157 
158 	if (uio->uio_loffset >= limit) {
159 		proc_t *p = ttoproc(curthread);
160 
161 		mutex_enter(&p->p_lock);
162 		(void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE], p->p_rctls,
163 		    p, RCA_UNSAFE_SIGINFO);
164 		mutex_exit(&p->p_lock);
165 		return (EFBIG);
166 	}
167 
168 	if (uio->uio_loffset >= MAXOFF_T) {
169 		TRACE_2(TR_FAC_TMPFS, TR_TMPFS_RWTMP_END,
170 		    "tmp_wrtmp_end:vp %p error %d", vp, EINVAL);
171 		return (EFBIG);
172 	}
173 
174 	if (uio->uio_resid == 0) {
175 		TRACE_2(TR_FAC_TMPFS, TR_TMPFS_RWTMP_END,
176 		    "tmp_wrtmp_end:vp %p error %d", vp, 0);
177 		return (0);
178 	}
179 
180 	if (limit > MAXOFF_T)
181 		limit = MAXOFF_T;
182 
183 	do {
184 		long	offset;
185 		long	delta;
186 
187 		offset = (long)uio->uio_offset;
188 		pageoffset = offset & PAGEOFFSET;
189 		/*
190 		 * A maximum of PAGESIZE bytes of data is transferred
191 		 * each pass through this loop
192 		 */
193 		bytes = MIN(PAGESIZE - pageoffset, uio->uio_resid);
194 
195 		if (offset + bytes >= limit) {
196 			if (offset >= limit) {
197 				error = EFBIG;
198 				goto out;
199 			}
200 			bytes = limit - offset;
201 		}
202 		pagenumber = btop(offset);
203 
204 		/*
205 		 * delta is the amount of anonymous memory
206 		 * to reserve for the file.
207 		 * We always reserve in pagesize increments so
208 		 * unless we're extending the file into a new page,
209 		 * we don't need to call tmp_resv.
210 		 */
211 		delta = offset + bytes -
212 		    P2ROUNDUP_TYPED(tp->tn_size, PAGESIZE, u_offset_t);
213 		if (delta > 0) {
214 			pagecreate = 1;
215 			if (tmp_resv(tm, tp, delta, pagecreate)) {
216 				/*
217 				 * Log file system full in the zone that owns
218 				 * the tmpfs mount, as well as in the global
219 				 * zone if necessary.
220 				 */
221 				zcmn_err(tm->tm_vfsp->vfs_zone->zone_id,
222 				    CE_WARN, "%s: File system full, "
223 				    "swap space limit exceeded",
224 				    tm->tm_mntpath);
225 
226 				if (tm->tm_vfsp->vfs_zone->zone_id !=
227 				    GLOBAL_ZONEID) {
228 
229 					vfs_t *vfs = tm->tm_vfsp;
230 
231 					zcmn_err(GLOBAL_ZONEID,
232 					    CE_WARN, "%s: File system full, "
233 					    "swap space limit exceeded",
234 					    vfs->vfs_vnodecovered->v_path);
235 				}
236 				error = ENOSPC;
237 				break;
238 			}
239 			tmpnode_growmap(tp, (ulong_t)offset + bytes);
240 		}
241 		/* grow the file to the new length */
242 		if (offset + bytes > tp->tn_size) {
243 			tn_size_changed = 1;
244 			old_tn_size = tp->tn_size;
245 			/*
246 			 * Postpone updating tp->tn_size until uiomove() is
247 			 * done.
248 			 */
249 			new_tn_size = offset + bytes;
250 		}
251 		if (bytes == PAGESIZE) {
252 			/*
253 			 * Writing whole page so reading from disk
254 			 * is a waste
255 			 */
256 			pagecreate = 1;
257 		} else {
258 			pagecreate = 0;
259 		}
260 		/*
261 		 * If writing past EOF or filling in a hole
262 		 * we need to allocate an anon slot.
263 		 */
264 		if (anon_get_ptr(tp->tn_anon, pagenumber) == NULL) {
265 			(void) anon_set_ptr(tp->tn_anon, pagenumber,
266 			    anon_alloc(vp, ptob(pagenumber)), ANON_SLEEP);
267 			pagecreate = 1;
268 			tp->tn_nblocks++;
269 		}
270 
271 		/*
272 		 * We have to drop the contents lock to allow the VM
273 		 * system to reacquire it in tmp_getpage()
274 		 */
275 		rw_exit(&tp->tn_contents);
276 
277 		/*
278 		 * Touch the page and fault it in if it is not in core
279 		 * before segmap_getmapflt or vpm_data_copy can lock it.
280 		 * This is to avoid the deadlock if the buffer is mapped
281 		 * to the same file through mmap which we want to write.
282 		 */
283 		uio_prefaultpages((long)bytes, uio);
284 
285 		newpage = 0;
286 		if (vpm_enable) {
287 			/*
288 			 * Copy data. If new pages are created, part of
289 			 * the page that is not written will be initizliazed
290 			 * with zeros.
291 			 */
292 			error = vpm_data_copy(vp, offset, bytes, uio,
293 			    !pagecreate, &newpage, 1, S_WRITE);
294 		} else {
295 			/* Get offset within the segmap mapping */
296 			segmap_offset = (offset & PAGEMASK) & MAXBOFFSET;
297 			base = segmap_getmapflt(segkmap, vp,
298 			    (offset &  MAXBMASK), PAGESIZE, !pagecreate,
299 			    S_WRITE);
300 		}
301 
302 
303 		if (!vpm_enable && pagecreate) {
304 			/*
305 			 * segmap_pagecreate() returns 1 if it calls
306 			 * page_create_va() to allocate any pages.
307 			 */
308 			newpage = segmap_pagecreate(segkmap,
309 			    base + segmap_offset, (size_t)PAGESIZE, 0);
310 			/*
311 			 * Clear from the beginning of the page to the starting
312 			 * offset of the data.
313 			 */
314 			if (pageoffset != 0)
315 				(void) kzero(base + segmap_offset,
316 				    (size_t)pageoffset);
317 		}
318 
319 		if (!vpm_enable) {
320 			error = uiomove(base + segmap_offset + pageoffset,
321 			    (long)bytes, UIO_WRITE, uio);
322 		}
323 
324 		if (!vpm_enable && pagecreate &&
325 		    uio->uio_offset < P2ROUNDUP(offset + bytes, PAGESIZE)) {
326 			long	zoffset; /* zero from offset into page */
327 			/*
328 			 * We created pages w/o initializing them completely,
329 			 * thus we need to zero the part that wasn't set up.
330 			 * This happens on most EOF write cases and if
331 			 * we had some sort of error during the uiomove.
332 			 */
333 			long nmoved;
334 
335 			nmoved = uio->uio_offset - offset;
336 			ASSERT((nmoved + pageoffset) <= PAGESIZE);
337 
338 			/*
339 			 * Zero from the end of data in the page to the
340 			 * end of the page.
341 			 */
342 			if ((zoffset = pageoffset + nmoved) < PAGESIZE)
343 				(void) kzero(base + segmap_offset + zoffset,
344 				    (size_t)PAGESIZE - zoffset);
345 		}
346 
347 		/*
348 		 * Unlock the pages which have been allocated by
349 		 * page_create_va() in segmap_pagecreate()
350 		 */
351 		if (!vpm_enable && newpage) {
352 			segmap_pageunlock(segkmap, base + segmap_offset,
353 			    (size_t)PAGESIZE, S_WRITE);
354 		}
355 
356 		if (error) {
357 			/*
358 			 * If we failed on a write, we must
359 			 * be sure to invalidate any pages that may have
360 			 * been allocated.
361 			 */
362 			if (vpm_enable) {
363 				(void) vpm_sync_pages(vp, offset, PAGESIZE,
364 				    SM_INVAL);
365 			} else {
366 				(void) segmap_release(segkmap, base, SM_INVAL);
367 			}
368 		} else {
369 			if (vpm_enable) {
370 				error = vpm_sync_pages(vp, offset, PAGESIZE,
371 				    0);
372 			} else {
373 				error = segmap_release(segkmap, base, 0);
374 			}
375 		}
376 
377 		/*
378 		 * Re-acquire contents lock.
379 		 */
380 		rw_enter(&tp->tn_contents, RW_WRITER);
381 
382 		/*
383 		 * Update tn_size.
384 		 */
385 		if (tn_size_changed)
386 			tp->tn_size = new_tn_size;
387 
388 		/*
389 		 * If the uiomove failed, fix up tn_size.
390 		 */
391 		if (error) {
392 			if (tn_size_changed) {
393 				/*
394 				 * The uiomove failed, and we
395 				 * allocated blocks,so get rid
396 				 * of them.
397 				 */
398 				(void) tmpnode_trunc(tm, tp,
399 				    (ulong_t)old_tn_size);
400 			}
401 		} else {
402 			/*
403 			 * XXX - Can this be out of the loop?
404 			 */
405 			if ((tp->tn_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) &&
406 			    (tp->tn_mode & (S_ISUID | S_ISGID)) &&
407 			    secpolicy_vnode_setid_retain(cr,
408 			    (tp->tn_mode & S_ISUID) != 0 && tp->tn_uid == 0)) {
409 				/*
410 				 * Clear Set-UID & Set-GID bits on
411 				 * successful write if not privileged
412 				 * and at least one of the execute bits
413 				 * is set.  If we always clear Set-GID,
414 				 * mandatory file and record locking is
415 				 * unuseable.
416 				 */
417 				tp->tn_mode &= ~(S_ISUID | S_ISGID);
418 			}
419 			gethrestime(&now);
420 			tp->tn_mtime = now;
421 			tp->tn_ctime = now;
422 		}
423 	} while (error == 0 && uio->uio_resid > 0 && bytes != 0);
424 
425 out:
426 	/*
427 	 * If we've already done a partial-write, terminate
428 	 * the write but return no error.
429 	 */
430 	if (oresid != uio->uio_resid)
431 		error = 0;
432 	TRACE_2(TR_FAC_TMPFS, TR_TMPFS_RWTMP_END,
433 	    "tmp_wrtmp_end:vp %p error %d", vp, error);
434 	return (error);
435 }
436 
437 /*
438  * rdtmp does the real work of read requests for tmpfs.
439  */
440 static int
441 rdtmp(
442 	struct tmount *tm,
443 	struct tmpnode *tp,
444 	struct uio *uio,
445 	struct caller_context *ct)
446 {
447 	ulong_t pageoffset;	/* offset in tmpfs file (uio_offset) */
448 	ulong_t segmap_offset;	/* pagesize byte offset into segmap */
449 	caddr_t base;		/* base of segmap */
450 	ssize_t bytes;		/* bytes to uiomove */
451 	struct vnode *vp;
452 	int error;
453 	long oresid = uio->uio_resid;
454 
455 #if defined(lint)
456 	tm = tm;
457 #endif
458 	vp = TNTOV(tp);
459 
460 	TRACE_1(TR_FAC_TMPFS, TR_TMPFS_RWTMP_START, "tmp_rdtmp_start:vp %p",
461 	    vp);
462 
463 	ASSERT(RW_LOCK_HELD(&tp->tn_contents));
464 
465 	if (MANDLOCK(vp, tp->tn_mode)) {
466 		rw_exit(&tp->tn_contents);
467 		/*
468 		 * tmp_getattr ends up being called by chklock
469 		 */
470 		error = chklock(vp, FREAD, uio->uio_loffset, uio->uio_resid,
471 		    uio->uio_fmode, ct);
472 		rw_enter(&tp->tn_contents, RW_READER);
473 		if (error != 0) {
474 			TRACE_2(TR_FAC_TMPFS, TR_TMPFS_RWTMP_END,
475 			    "tmp_rdtmp_end:vp %p error %d", vp, error);
476 			return (error);
477 		}
478 	}
479 	ASSERT(tp->tn_type == VREG);
480 
481 	if (uio->uio_loffset >= MAXOFF_T) {
482 		TRACE_2(TR_FAC_TMPFS, TR_TMPFS_RWTMP_END,
483 		    "tmp_rdtmp_end:vp %p error %d", vp, EINVAL);
484 		return (0);
485 	}
486 	if (uio->uio_loffset < 0)
487 		return (EINVAL);
488 	if (uio->uio_resid == 0) {
489 		TRACE_2(TR_FAC_TMPFS, TR_TMPFS_RWTMP_END,
490 		    "tmp_rdtmp_end:vp %p error %d", vp, 0);
491 		return (0);
492 	}
493 
494 	vp = TNTOV(tp);
495 
496 	do {
497 		long diff;
498 		long offset;
499 
500 		offset = uio->uio_offset;
501 		pageoffset = offset & PAGEOFFSET;
502 		bytes = MIN(PAGESIZE - pageoffset, uio->uio_resid);
503 
504 		diff = tp->tn_size - offset;
505 
506 		if (diff <= 0) {
507 			error = 0;
508 			goto out;
509 		}
510 		if (diff < bytes)
511 			bytes = diff;
512 
513 		/*
514 		 * We have to drop the contents lock to allow the VM system
515 		 * to reacquire it in tmp_getpage() should the uiomove cause a
516 		 * pagefault.
517 		 */
518 		rw_exit(&tp->tn_contents);
519 
520 		if (vpm_enable) {
521 			/*
522 			 * Copy data.
523 			 */
524 			error = vpm_data_copy(vp, offset, bytes, uio, 1, NULL,
525 			    0, S_READ);
526 		} else {
527 			segmap_offset = (offset & PAGEMASK) & MAXBOFFSET;
528 			base = segmap_getmapflt(segkmap, vp, offset & MAXBMASK,
529 			    bytes, 1, S_READ);
530 
531 			error = uiomove(base + segmap_offset + pageoffset,
532 			    (long)bytes, UIO_READ, uio);
533 		}
534 
535 		if (error) {
536 			if (vpm_enable) {
537 				(void) vpm_sync_pages(vp, offset, PAGESIZE, 0);
538 			} else {
539 				(void) segmap_release(segkmap, base, 0);
540 			}
541 		} else {
542 			if (vpm_enable) {
543 				error = vpm_sync_pages(vp, offset, PAGESIZE,
544 				    0);
545 			} else {
546 				error = segmap_release(segkmap, base, 0);
547 			}
548 		}
549 
550 		/*
551 		 * Re-acquire contents lock.
552 		 */
553 		rw_enter(&tp->tn_contents, RW_READER);
554 
555 	} while (error == 0 && uio->uio_resid > 0);
556 
557 out:
558 	gethrestime(&tp->tn_atime);
559 
560 	/*
561 	 * If we've already done a partial read, terminate
562 	 * the read but return no error.
563 	 */
564 	if (oresid != uio->uio_resid)
565 		error = 0;
566 
567 	TRACE_2(TR_FAC_TMPFS, TR_TMPFS_RWTMP_END,
568 	    "tmp_rdtmp_end:vp %x error %d", vp, error);
569 	return (error);
570 }
571 
572 /* ARGSUSED2 */
573 static int
574 tmp_read(struct vnode *vp, struct uio *uiop, int ioflag, cred_t *cred,
575     struct caller_context *ct)
576 {
577 	struct tmpnode *tp = (struct tmpnode *)VTOTN(vp);
578 	struct tmount *tm = (struct tmount *)VTOTM(vp);
579 	int error;
580 
581 	/*
582 	 * We don't currently support reading non-regular files
583 	 */
584 	if (vp->v_type == VDIR)
585 		return (EISDIR);
586 	if (vp->v_type != VREG)
587 		return (EINVAL);
588 	/*
589 	 * tmp_rwlock should have already been called from layers above
590 	 */
591 	ASSERT(RW_READ_HELD(&tp->tn_rwlock));
592 
593 	rw_enter(&tp->tn_contents, RW_READER);
594 
595 	error = rdtmp(tm, tp, uiop, ct);
596 
597 	rw_exit(&tp->tn_contents);
598 
599 	return (error);
600 }
601 
602 static int
603 tmp_write(struct vnode *vp, struct uio *uiop, int ioflag, struct cred *cred,
604     struct caller_context *ct)
605 {
606 	struct tmpnode *tp = (struct tmpnode *)VTOTN(vp);
607 	struct tmount *tm = (struct tmount *)VTOTM(vp);
608 	int error;
609 
610 	/*
611 	 * We don't currently support writing to non-regular files
612 	 */
613 	if (vp->v_type != VREG)
614 		return (EINVAL);	/* XXX EISDIR? */
615 
616 	/*
617 	 * tmp_rwlock should have already been called from layers above
618 	 */
619 	ASSERT(RW_WRITE_HELD(&tp->tn_rwlock));
620 
621 	rw_enter(&tp->tn_contents, RW_WRITER);
622 
623 	if (ioflag & FAPPEND) {
624 		/*
625 		 * In append mode start at end of file.
626 		 */
627 		uiop->uio_loffset = tp->tn_size;
628 	}
629 
630 	error = wrtmp(tm, tp, uiop, cred, ct);
631 
632 	rw_exit(&tp->tn_contents);
633 
634 	return (error);
635 }
636 
637 /* ARGSUSED */
638 static int
639 tmp_ioctl(
640 	struct vnode *vp,
641 	int com,
642 	intptr_t data,
643 	int flag,
644 	struct cred *cred,
645 	int *rvalp,
646 	caller_context_t *ct)
647 {
648 	return (ENOTTY);
649 }
650 
651 /* ARGSUSED2 */
652 static int
653 tmp_getattr(
654 	struct vnode *vp,
655 	struct vattr *vap,
656 	int flags,
657 	struct cred *cred,
658 	caller_context_t *ct)
659 {
660 	struct tmpnode *tp = (struct tmpnode *)VTOTN(vp);
661 	struct vnode *mvp;
662 	struct vattr va;
663 	int attrs = 1;
664 
665 	/*
666 	 * A special case to handle the root tnode on a diskless nfs
667 	 * client who may have had its uid and gid inherited
668 	 * from an nfs vnode with nobody ownership.  Likely the
669 	 * root filesystem. After nfs is fully functional the uid/gid
670 	 * may be mapable so ask again.
671 	 * vfsp can't get unmounted because we hold vp.
672 	 */
673 	if (vp->v_flag & VROOT &&
674 	    (mvp = vp->v_vfsp->vfs_vnodecovered) != NULL) {
675 		mutex_enter(&tp->tn_tlock);
676 		if (tp->tn_uid == UID_NOBODY || tp->tn_gid == GID_NOBODY) {
677 			mutex_exit(&tp->tn_tlock);
678 			bzero(&va, sizeof (struct vattr));
679 			va.va_mask = AT_UID|AT_GID;
680 			attrs = VOP_GETATTR(mvp, &va, 0, cred, ct);
681 		} else {
682 			mutex_exit(&tp->tn_tlock);
683 		}
684 	}
685 	mutex_enter(&tp->tn_tlock);
686 	if (attrs == 0) {
687 		tp->tn_uid = va.va_uid;
688 		tp->tn_gid = va.va_gid;
689 	}
690 	vap->va_type = vp->v_type;
691 	vap->va_mode = tp->tn_mode & MODEMASK;
692 	vap->va_uid = tp->tn_uid;
693 	vap->va_gid = tp->tn_gid;
694 	vap->va_fsid = tp->tn_fsid;
695 	vap->va_nodeid = (ino64_t)tp->tn_nodeid;
696 	vap->va_nlink = tp->tn_nlink;
697 	vap->va_size = (u_offset_t)tp->tn_size;
698 	vap->va_atime = tp->tn_atime;
699 	vap->va_mtime = tp->tn_mtime;
700 	vap->va_ctime = tp->tn_ctime;
701 	vap->va_blksize = PAGESIZE;
702 	vap->va_rdev = tp->tn_rdev;
703 	vap->va_seq = tp->tn_seq;
704 
705 	/*
706 	 * XXX Holes are not taken into account.  We could take the time to
707 	 * run through the anon array looking for allocated slots...
708 	 */
709 	vap->va_nblocks = (fsblkcnt64_t)btodb(ptob(btopr(vap->va_size)));
710 	mutex_exit(&tp->tn_tlock);
711 	return (0);
712 }
713 
714 /*ARGSUSED4*/
715 static int
716 tmp_setattr(
717 	struct vnode *vp,
718 	struct vattr *vap,
719 	int flags,
720 	struct cred *cred,
721 	caller_context_t *ct)
722 {
723 	struct tmount *tm = (struct tmount *)VTOTM(vp);
724 	struct tmpnode *tp = (struct tmpnode *)VTOTN(vp);
725 	int error = 0;
726 	struct vattr *get;
727 	long mask;
728 
729 	/*
730 	 * Cannot set these attributes
731 	 */
732 	if ((vap->va_mask & AT_NOSET) || (vap->va_mask & AT_XVATTR))
733 		return (EINVAL);
734 
735 	mutex_enter(&tp->tn_tlock);
736 
737 	get = &tp->tn_attr;
738 	/*
739 	 * Change file access modes. Must be owner or have sufficient
740 	 * privileges.
741 	 */
742 	error = secpolicy_vnode_setattr(cred, vp, vap, get, flags, tmp_taccess,
743 	    tp);
744 
745 	if (error)
746 		goto out;
747 
748 	mask = vap->va_mask;
749 
750 	if (mask & AT_MODE) {
751 		get->va_mode &= S_IFMT;
752 		get->va_mode |= vap->va_mode & ~S_IFMT;
753 	}
754 
755 	if (mask & AT_UID)
756 		get->va_uid = vap->va_uid;
757 	if (mask & AT_GID)
758 		get->va_gid = vap->va_gid;
759 	if (mask & AT_ATIME)
760 		get->va_atime = vap->va_atime;
761 	if (mask & AT_MTIME)
762 		get->va_mtime = vap->va_mtime;
763 
764 	if (mask & (AT_UID | AT_GID | AT_MODE | AT_MTIME))
765 		gethrestime(&tp->tn_ctime);
766 
767 	if (mask & AT_SIZE) {
768 		ASSERT(vp->v_type != VDIR);
769 
770 		/* Don't support large files. */
771 		if (vap->va_size > MAXOFF_T) {
772 			error = EFBIG;
773 			goto out;
774 		}
775 		mutex_exit(&tp->tn_tlock);
776 
777 		rw_enter(&tp->tn_rwlock, RW_WRITER);
778 		rw_enter(&tp->tn_contents, RW_WRITER);
779 		error = tmpnode_trunc(tm, tp, (ulong_t)vap->va_size);
780 		rw_exit(&tp->tn_contents);
781 		rw_exit(&tp->tn_rwlock);
782 		goto out1;
783 	}
784 out:
785 	mutex_exit(&tp->tn_tlock);
786 out1:
787 	return (error);
788 }
789 
790 /* ARGSUSED2 */
791 static int
792 tmp_access(
793 	struct vnode *vp,
794 	int mode,
795 	int flags,
796 	struct cred *cred,
797 	caller_context_t *ct)
798 {
799 	struct tmpnode *tp = (struct tmpnode *)VTOTN(vp);
800 	int error;
801 
802 	mutex_enter(&tp->tn_tlock);
803 	error = tmp_taccess(tp, mode, cred);
804 	mutex_exit(&tp->tn_tlock);
805 	return (error);
806 }
807 
808 /* ARGSUSED3 */
809 static int
810 tmp_lookup(
811 	struct vnode *dvp,
812 	char *nm,
813 	struct vnode **vpp,
814 	struct pathname *pnp,
815 	int flags,
816 	struct vnode *rdir,
817 	struct cred *cred,
818 	caller_context_t *ct,
819 	int *direntflags,
820 	pathname_t *realpnp)
821 {
822 	struct tmpnode *tp = (struct tmpnode *)VTOTN(dvp);
823 	struct tmpnode *ntp = NULL;
824 	int error;
825 
826 
827 	/* allow cd into @ dir */
828 	if (flags & LOOKUP_XATTR) {
829 		struct tmpnode *xdp;
830 		struct tmount *tm;
831 
832 		/*
833 		 * don't allow attributes if not mounted XATTR support
834 		 */
835 		if (!(dvp->v_vfsp->vfs_flag & VFS_XATTR))
836 			return (EINVAL);
837 
838 		if (tp->tn_flags & ISXATTR)
839 			/* No attributes on attributes */
840 			return (EINVAL);
841 
842 		rw_enter(&tp->tn_rwlock, RW_WRITER);
843 		if (tp->tn_xattrdp == NULL) {
844 			if (!(flags & CREATE_XATTR_DIR)) {
845 				rw_exit(&tp->tn_rwlock);
846 				return (ENOENT);
847 			}
848 
849 			/*
850 			 * No attribute directory exists for this
851 			 * node - create the attr dir as a side effect
852 			 * of this lookup.
853 			 */
854 
855 			/*
856 			 * Make sure we have adequate permission...
857 			 */
858 
859 			if ((error = tmp_taccess(tp, VWRITE, cred)) != 0) {
860 				rw_exit(&tp->tn_rwlock);
861 				return (error);
862 			}
863 
864 			xdp = tmp_memalloc(sizeof (struct tmpnode),
865 			    TMP_MUSTHAVE);
866 			tm = VTOTM(dvp);
867 			tmpnode_init(tm, xdp, &tp->tn_attr, NULL);
868 			/*
869 			 * Fix-up fields unique to attribute directories.
870 			 */
871 			xdp->tn_flags = ISXATTR;
872 			xdp->tn_type = VDIR;
873 			if (tp->tn_type == VDIR) {
874 				xdp->tn_mode = tp->tn_attr.va_mode;
875 			} else {
876 				xdp->tn_mode = 0700;
877 				if (tp->tn_attr.va_mode & 0040)
878 					xdp->tn_mode |= 0750;
879 				if (tp->tn_attr.va_mode & 0004)
880 					xdp->tn_mode |= 0705;
881 			}
882 			xdp->tn_vnode->v_type = VDIR;
883 			xdp->tn_vnode->v_flag |= V_XATTRDIR;
884 			tdirinit(tp, xdp);
885 			tp->tn_xattrdp = xdp;
886 		} else {
887 			VN_HOLD(tp->tn_xattrdp->tn_vnode);
888 		}
889 		*vpp = TNTOV(tp->tn_xattrdp);
890 		rw_exit(&tp->tn_rwlock);
891 		return (0);
892 	}
893 
894 	/*
895 	 * Null component name is a synonym for directory being searched.
896 	 */
897 	if (*nm == '\0') {
898 		VN_HOLD(dvp);
899 		*vpp = dvp;
900 		return (0);
901 	}
902 	ASSERT(tp);
903 
904 	error = tdirlookup(tp, nm, &ntp, cred);
905 
906 	if (error == 0) {
907 		ASSERT(ntp);
908 		*vpp = TNTOV(ntp);
909 		/*
910 		 * If vnode is a device return special vnode instead
911 		 */
912 		if (IS_DEVVP(*vpp)) {
913 			struct vnode *newvp;
914 
915 			newvp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type,
916 			    cred);
917 			VN_RELE(*vpp);
918 			*vpp = newvp;
919 		}
920 	}
921 	TRACE_4(TR_FAC_TMPFS, TR_TMPFS_LOOKUP,
922 	    "tmpfs lookup:vp %p name %s vpp %p error %d",
923 	    dvp, nm, vpp, error);
924 	return (error);
925 }
926 
927 /*ARGSUSED7*/
928 static int
929 tmp_create(
930 	struct vnode *dvp,
931 	char *nm,
932 	struct vattr *vap,
933 	enum vcexcl exclusive,
934 	int mode,
935 	struct vnode **vpp,
936 	struct cred *cred,
937 	int flag,
938 	caller_context_t *ct,
939 	vsecattr_t *vsecp)
940 {
941 	struct tmpnode *parent;
942 	struct tmount *tm;
943 	struct tmpnode *self;
944 	int error;
945 	struct tmpnode *oldtp;
946 
947 again:
948 	parent = (struct tmpnode *)VTOTN(dvp);
949 	tm = (struct tmount *)VTOTM(dvp);
950 	self = NULL;
951 	error = 0;
952 	oldtp = NULL;
953 
954 	/* device files not allowed in ext. attr dirs */
955 	if ((parent->tn_flags & ISXATTR) &&
956 	    (vap->va_type == VBLK || vap->va_type == VCHR ||
957 	    vap->va_type == VFIFO || vap->va_type == VDOOR ||
958 	    vap->va_type == VSOCK || vap->va_type == VPORT))
959 			return (EINVAL);
960 
961 	if (vap->va_type == VREG && (vap->va_mode & VSVTX)) {
962 		/* Must be privileged to set sticky bit */
963 		if (secpolicy_vnode_stky_modify(cred))
964 			vap->va_mode &= ~VSVTX;
965 	} else if (vap->va_type == VNON) {
966 		return (EINVAL);
967 	}
968 
969 	/*
970 	 * Null component name is a synonym for directory being searched.
971 	 */
972 	if (*nm == '\0') {
973 		VN_HOLD(dvp);
974 		oldtp = parent;
975 	} else {
976 		error = tdirlookup(parent, nm, &oldtp, cred);
977 	}
978 
979 	if (error == 0) {	/* name found */
980 		ASSERT(oldtp);
981 
982 		rw_enter(&oldtp->tn_rwlock, RW_WRITER);
983 
984 		/*
985 		 * if create/read-only an existing
986 		 * directory, allow it
987 		 */
988 		if (exclusive == EXCL)
989 			error = EEXIST;
990 		else if ((oldtp->tn_type == VDIR) && (mode & VWRITE))
991 			error = EISDIR;
992 		else {
993 			error = tmp_taccess(oldtp, mode, cred);
994 		}
995 
996 		if (error) {
997 			rw_exit(&oldtp->tn_rwlock);
998 			tmpnode_rele(oldtp);
999 			return (error);
1000 		}
1001 		*vpp = TNTOV(oldtp);
1002 		if ((*vpp)->v_type == VREG && (vap->va_mask & AT_SIZE) &&
1003 		    vap->va_size == 0) {
1004 			rw_enter(&oldtp->tn_contents, RW_WRITER);
1005 			(void) tmpnode_trunc(tm, oldtp, 0);
1006 			rw_exit(&oldtp->tn_contents);
1007 		}
1008 		rw_exit(&oldtp->tn_rwlock);
1009 		if (IS_DEVVP(*vpp)) {
1010 			struct vnode *newvp;
1011 
1012 			newvp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type,
1013 			    cred);
1014 			VN_RELE(*vpp);
1015 			if (newvp == NULL) {
1016 				return (ENOSYS);
1017 			}
1018 			*vpp = newvp;
1019 		}
1020 
1021 		if (error == 0) {
1022 			vnevent_create(*vpp, ct);
1023 		}
1024 		return (0);
1025 	}
1026 
1027 	if (error != ENOENT)
1028 		return (error);
1029 
1030 	rw_enter(&parent->tn_rwlock, RW_WRITER);
1031 	error = tdirenter(tm, parent, nm, DE_CREATE,
1032 	    (struct tmpnode *)NULL, (struct tmpnode *)NULL,
1033 	    vap, &self, cred, ct);
1034 	rw_exit(&parent->tn_rwlock);
1035 
1036 	if (error) {
1037 		if (self)
1038 			tmpnode_rele(self);
1039 
1040 		if (error == EEXIST) {
1041 			/*
1042 			 * This means that the file was created sometime
1043 			 * after we checked and did not find it and when
1044 			 * we went to create it.
1045 			 * Since creat() is supposed to truncate a file
1046 			 * that already exits go back to the begining
1047 			 * of the function. This time we will find it
1048 			 * and go down the tmp_trunc() path
1049 			 */
1050 			goto again;
1051 		}
1052 		return (error);
1053 	}
1054 
1055 	*vpp = TNTOV(self);
1056 
1057 	if (!error && IS_DEVVP(*vpp)) {
1058 		struct vnode *newvp;
1059 
1060 		newvp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cred);
1061 		VN_RELE(*vpp);
1062 		if (newvp == NULL)
1063 			return (ENOSYS);
1064 		*vpp = newvp;
1065 	}
1066 	TRACE_3(TR_FAC_TMPFS, TR_TMPFS_CREATE,
1067 	    "tmpfs create:dvp %p nm %s vpp %p", dvp, nm, vpp);
1068 	return (0);
1069 }
1070 
1071 /* ARGSUSED3 */
1072 static int
1073 tmp_remove(
1074 	struct vnode *dvp,
1075 	char *nm,
1076 	struct cred *cred,
1077 	caller_context_t *ct,
1078 	int flags)
1079 {
1080 	struct tmpnode *parent = (struct tmpnode *)VTOTN(dvp);
1081 	int error;
1082 	struct tmpnode *tp = NULL;
1083 
1084 	error = tdirlookup(parent, nm, &tp, cred);
1085 	if (error)
1086 		return (error);
1087 
1088 	ASSERT(tp);
1089 	rw_enter(&parent->tn_rwlock, RW_WRITER);
1090 	rw_enter(&tp->tn_rwlock, RW_WRITER);
1091 
1092 	if (tp->tn_type != VDIR ||
1093 	    (error = secpolicy_fs_linkdir(cred, dvp->v_vfsp)) == 0)
1094 		error = tdirdelete(parent, tp, nm, DR_REMOVE, cred);
1095 
1096 	rw_exit(&tp->tn_rwlock);
1097 	rw_exit(&parent->tn_rwlock);
1098 	vnevent_remove(TNTOV(tp), dvp, nm, ct);
1099 	tmpnode_rele(tp);
1100 
1101 	TRACE_3(TR_FAC_TMPFS, TR_TMPFS_REMOVE,
1102 	    "tmpfs remove:dvp %p nm %s error %d", dvp, nm, error);
1103 	return (error);
1104 }
1105 
1106 /* ARGSUSED4 */
1107 static int
1108 tmp_link(
1109 	struct vnode *dvp,
1110 	struct vnode *srcvp,
1111 	char *tnm,
1112 	struct cred *cred,
1113 	caller_context_t *ct,
1114 	int flags)
1115 {
1116 	struct tmpnode *parent;
1117 	struct tmpnode *from;
1118 	struct tmount *tm = (struct tmount *)VTOTM(dvp);
1119 	int error;
1120 	struct tmpnode *found = NULL;
1121 	struct vnode *realvp;
1122 
1123 	if (VOP_REALVP(srcvp, &realvp, ct) == 0)
1124 		srcvp = realvp;
1125 
1126 	parent = (struct tmpnode *)VTOTN(dvp);
1127 	from = (struct tmpnode *)VTOTN(srcvp);
1128 
1129 	if ((srcvp->v_type == VDIR &&
1130 	    secpolicy_fs_linkdir(cred, dvp->v_vfsp)) ||
1131 	    (from->tn_uid != crgetuid(cred) && secpolicy_basic_link(cred)))
1132 		return (EPERM);
1133 
1134 	/*
1135 	 * Make sure link for extended attributes is valid
1136 	 * We only support hard linking of xattr's in xattrdir to an xattrdir
1137 	 */
1138 	if ((from->tn_flags & ISXATTR) != (parent->tn_flags & ISXATTR))
1139 		return (EINVAL);
1140 
1141 	error = tdirlookup(parent, tnm, &found, cred);
1142 	if (error == 0) {
1143 		ASSERT(found);
1144 		tmpnode_rele(found);
1145 		return (EEXIST);
1146 	}
1147 
1148 	if (error != ENOENT)
1149 		return (error);
1150 
1151 	rw_enter(&parent->tn_rwlock, RW_WRITER);
1152 	error = tdirenter(tm, parent, tnm, DE_LINK, (struct tmpnode *)NULL,
1153 	    from, NULL, (struct tmpnode **)NULL, cred, ct);
1154 	rw_exit(&parent->tn_rwlock);
1155 	if (error == 0) {
1156 		vnevent_link(srcvp, ct);
1157 	}
1158 	return (error);
1159 }
1160 
1161 /* ARGSUSED5 */
1162 static int
1163 tmp_rename(
1164 	struct vnode *odvp,	/* source parent vnode */
1165 	char *onm,		/* source name */
1166 	struct vnode *ndvp,	/* destination parent vnode */
1167 	char *nnm,		/* destination name */
1168 	struct cred *cred,
1169 	caller_context_t *ct,
1170 	int flags)
1171 {
1172 	struct tmpnode *fromparent;
1173 	struct tmpnode *toparent;
1174 	struct tmpnode *fromtp = NULL;	/* source tmpnode */
1175 	struct tmount *tm = (struct tmount *)VTOTM(odvp);
1176 	int error;
1177 	int samedir = 0;	/* set if odvp == ndvp */
1178 	struct vnode *realvp;
1179 
1180 	if (VOP_REALVP(ndvp, &realvp, ct) == 0)
1181 		ndvp = realvp;
1182 
1183 	fromparent = (struct tmpnode *)VTOTN(odvp);
1184 	toparent = (struct tmpnode *)VTOTN(ndvp);
1185 
1186 	if ((fromparent->tn_flags & ISXATTR) != (toparent->tn_flags & ISXATTR))
1187 		return (EINVAL);
1188 
1189 	mutex_enter(&tm->tm_renamelck);
1190 
1191 	/*
1192 	 * Look up tmpnode of file we're supposed to rename.
1193 	 */
1194 	error = tdirlookup(fromparent, onm, &fromtp, cred);
1195 	if (error) {
1196 		mutex_exit(&tm->tm_renamelck);
1197 		return (error);
1198 	}
1199 
1200 	/*
1201 	 * Make sure we can delete the old (source) entry.  This
1202 	 * requires write permission on the containing directory.  If
1203 	 * that directory is "sticky" it requires further checks.
1204 	 */
1205 	if (((error = tmp_taccess(fromparent, VWRITE, cred)) != 0) ||
1206 	    (error = tmp_sticky_remove_access(fromparent, fromtp, cred)) != 0)
1207 		goto done;
1208 
1209 	/*
1210 	 * Check for renaming to or from '.' or '..' or that
1211 	 * fromtp == fromparent
1212 	 */
1213 	if ((onm[0] == '.' &&
1214 	    (onm[1] == '\0' || (onm[1] == '.' && onm[2] == '\0'))) ||
1215 	    (nnm[0] == '.' &&
1216 	    (nnm[1] == '\0' || (nnm[1] == '.' && nnm[2] == '\0'))) ||
1217 	    (fromparent == fromtp)) {
1218 		error = EINVAL;
1219 		goto done;
1220 	}
1221 
1222 	samedir = (fromparent == toparent);
1223 	/*
1224 	 * Make sure we can search and rename into the new
1225 	 * (destination) directory.
1226 	 */
1227 	if (!samedir) {
1228 		error = tmp_taccess(toparent, VEXEC|VWRITE, cred);
1229 		if (error)
1230 			goto done;
1231 	}
1232 
1233 	/*
1234 	 * Link source to new target
1235 	 */
1236 	rw_enter(&toparent->tn_rwlock, RW_WRITER);
1237 	error = tdirenter(tm, toparent, nnm, DE_RENAME,
1238 	    fromparent, fromtp, (struct vattr *)NULL,
1239 	    (struct tmpnode **)NULL, cred, ct);
1240 	rw_exit(&toparent->tn_rwlock);
1241 
1242 	if (error) {
1243 		/*
1244 		 * ESAME isn't really an error; it indicates that the
1245 		 * operation should not be done because the source and target
1246 		 * are the same file, but that no error should be reported.
1247 		 */
1248 		if (error == ESAME)
1249 			error = 0;
1250 		goto done;
1251 	}
1252 	vnevent_rename_src(TNTOV(fromtp), odvp, onm, ct);
1253 
1254 	/*
1255 	 * Notify the target directory if not same as
1256 	 * source directory.
1257 	 */
1258 	if (ndvp != odvp) {
1259 		vnevent_rename_dest_dir(ndvp, ct);
1260 	}
1261 
1262 	/*
1263 	 * Unlink from source.
1264 	 */
1265 	rw_enter(&fromparent->tn_rwlock, RW_WRITER);
1266 	rw_enter(&fromtp->tn_rwlock, RW_WRITER);
1267 
1268 	error = tdirdelete(fromparent, fromtp, onm, DR_RENAME, cred);
1269 
1270 	/*
1271 	 * The following handles the case where our source tmpnode was
1272 	 * removed before we got to it.
1273 	 *
1274 	 * XXX We should also cleanup properly in the case where tdirdelete
1275 	 * fails for some other reason.  Currently this case shouldn't happen.
1276 	 * (see 1184991).
1277 	 */
1278 	if (error == ENOENT)
1279 		error = 0;
1280 
1281 	rw_exit(&fromtp->tn_rwlock);
1282 	rw_exit(&fromparent->tn_rwlock);
1283 done:
1284 	tmpnode_rele(fromtp);
1285 	mutex_exit(&tm->tm_renamelck);
1286 
1287 	TRACE_5(TR_FAC_TMPFS, TR_TMPFS_RENAME,
1288 	    "tmpfs rename:ovp %p onm %s nvp %p nnm %s error %d", odvp, onm,
1289 	    ndvp, nnm, error);
1290 	return (error);
1291 }
1292 
1293 /* ARGSUSED5 */
1294 static int
1295 tmp_mkdir(
1296 	struct vnode *dvp,
1297 	char *nm,
1298 	struct vattr *va,
1299 	struct vnode **vpp,
1300 	struct cred *cred,
1301 	caller_context_t *ct,
1302 	int flags,
1303 	vsecattr_t *vsecp)
1304 {
1305 	struct tmpnode *parent = (struct tmpnode *)VTOTN(dvp);
1306 	struct tmpnode *self = NULL;
1307 	struct tmount *tm = (struct tmount *)VTOTM(dvp);
1308 	int error;
1309 
1310 	/* no new dirs allowed in xattr dirs */
1311 	if (parent->tn_flags & ISXATTR)
1312 		return (EINVAL);
1313 
1314 	/*
1315 	 * Might be dangling directory.  Catch it here,
1316 	 * because a ENOENT return from tdirlookup() is
1317 	 * an "o.k. return".
1318 	 */
1319 	if (parent->tn_nlink == 0)
1320 		return (ENOENT);
1321 
1322 	error = tdirlookup(parent, nm, &self, cred);
1323 	if (error == 0) {
1324 		ASSERT(self);
1325 		tmpnode_rele(self);
1326 		return (EEXIST);
1327 	}
1328 	if (error != ENOENT)
1329 		return (error);
1330 
1331 	rw_enter(&parent->tn_rwlock, RW_WRITER);
1332 	error = tdirenter(tm, parent, nm, DE_MKDIR, (struct tmpnode *)NULL,
1333 	    (struct tmpnode *)NULL, va, &self, cred, ct);
1334 	if (error) {
1335 		rw_exit(&parent->tn_rwlock);
1336 		if (self)
1337 			tmpnode_rele(self);
1338 		return (error);
1339 	}
1340 	rw_exit(&parent->tn_rwlock);
1341 	*vpp = TNTOV(self);
1342 	return (0);
1343 }
1344 
1345 /* ARGSUSED4 */
1346 static int
1347 tmp_rmdir(
1348 	struct vnode *dvp,
1349 	char *nm,
1350 	struct vnode *cdir,
1351 	struct cred *cred,
1352 	caller_context_t *ct,
1353 	int flags)
1354 {
1355 	struct tmpnode *parent = (struct tmpnode *)VTOTN(dvp);
1356 	struct tmpnode *self = NULL;
1357 	struct vnode *vp;
1358 	int error = 0;
1359 
1360 	/*
1361 	 * Return error when removing . and ..
1362 	 */
1363 	if (strcmp(nm, ".") == 0)
1364 		return (EINVAL);
1365 	if (strcmp(nm, "..") == 0)
1366 		return (EEXIST); /* Should be ENOTEMPTY */
1367 	error = tdirlookup(parent, nm, &self, cred);
1368 	if (error)
1369 		return (error);
1370 
1371 	rw_enter(&parent->tn_rwlock, RW_WRITER);
1372 	rw_enter(&self->tn_rwlock, RW_WRITER);
1373 
1374 	vp = TNTOV(self);
1375 	if (vp == dvp || vp == cdir) {
1376 		error = EINVAL;
1377 		goto done1;
1378 	}
1379 	if (self->tn_type != VDIR) {
1380 		error = ENOTDIR;
1381 		goto done1;
1382 	}
1383 
1384 	mutex_enter(&self->tn_tlock);
1385 	if (self->tn_nlink > 2) {
1386 		mutex_exit(&self->tn_tlock);
1387 		error = EEXIST;
1388 		goto done1;
1389 	}
1390 	mutex_exit(&self->tn_tlock);
1391 
1392 	if (vn_vfswlock(vp)) {
1393 		error = EBUSY;
1394 		goto done1;
1395 	}
1396 	if (vn_mountedvfs(vp) != NULL) {
1397 		error = EBUSY;
1398 		goto done;
1399 	}
1400 
1401 	/*
1402 	 * Check for an empty directory
1403 	 * i.e. only includes entries for "." and ".."
1404 	 */
1405 	if (self->tn_dirents > 2) {
1406 		error = EEXIST;		/* SIGH should be ENOTEMPTY */
1407 		/*
1408 		 * Update atime because checking tn_dirents is logically
1409 		 * equivalent to reading the directory
1410 		 */
1411 		gethrestime(&self->tn_atime);
1412 		goto done;
1413 	}
1414 
1415 	error = tdirdelete(parent, self, nm, DR_RMDIR, cred);
1416 done:
1417 	vn_vfsunlock(vp);
1418 done1:
1419 	rw_exit(&self->tn_rwlock);
1420 	rw_exit(&parent->tn_rwlock);
1421 	vnevent_rmdir(TNTOV(self), dvp, nm, ct);
1422 	tmpnode_rele(self);
1423 
1424 	return (error);
1425 }
1426 
1427 /* ARGSUSED2 */
1428 static int
1429 tmp_readdir(
1430 	struct vnode *vp,
1431 	struct uio *uiop,
1432 	struct cred *cred,
1433 	int *eofp,
1434 	caller_context_t *ct,
1435 	int flags)
1436 {
1437 	struct tmpnode *tp = (struct tmpnode *)VTOTN(vp);
1438 	struct tdirent *tdp;
1439 	int error = 0;
1440 	size_t namelen;
1441 	struct dirent64 *dp;
1442 	ulong_t offset;
1443 	ulong_t total_bytes_wanted;
1444 	long outcount = 0;
1445 	long bufsize;
1446 	int reclen;
1447 	caddr_t outbuf;
1448 
1449 	if (uiop->uio_loffset >= MAXOFF_T) {
1450 		if (eofp)
1451 			*eofp = 1;
1452 		return (0);
1453 	}
1454 	/*
1455 	 * assuming system call has already called tmp_rwlock
1456 	 */
1457 	ASSERT(RW_READ_HELD(&tp->tn_rwlock));
1458 
1459 	if (uiop->uio_iovcnt != 1)
1460 		return (EINVAL);
1461 
1462 	if (vp->v_type != VDIR)
1463 		return (ENOTDIR);
1464 
1465 	/*
1466 	 * There's a window here where someone could have removed
1467 	 * all the entries in the directory after we put a hold on the
1468 	 * vnode but before we grabbed the rwlock.  Just return.
1469 	 */
1470 	if (tp->tn_dir == NULL) {
1471 		if (tp->tn_nlink) {
1472 			panic("empty directory 0x%p", (void *)tp);
1473 			/*NOTREACHED*/
1474 		}
1475 		return (0);
1476 	}
1477 
1478 	/*
1479 	 * Get space for multiple directory entries
1480 	 */
1481 	total_bytes_wanted = uiop->uio_iov->iov_len;
1482 	bufsize = total_bytes_wanted + sizeof (struct dirent64);
1483 	outbuf = kmem_alloc(bufsize, KM_SLEEP);
1484 
1485 	dp = (struct dirent64 *)outbuf;
1486 
1487 
1488 	offset = 0;
1489 	tdp = tp->tn_dir;
1490 	while (tdp) {
1491 		namelen = strlen(tdp->td_name);	/* no +1 needed */
1492 		offset = tdp->td_offset;
1493 		if (offset >= uiop->uio_offset) {
1494 			reclen = (int)DIRENT64_RECLEN(namelen);
1495 			if (outcount + reclen > total_bytes_wanted) {
1496 				if (!outcount)
1497 					/*
1498 					 * Buffer too small for any entries.
1499 					 */
1500 					error = EINVAL;
1501 				break;
1502 			}
1503 			ASSERT(tdp->td_tmpnode != NULL);
1504 
1505 			/* use strncpy(9f) to zero out uninitialized bytes */
1506 
1507 			(void) strncpy(dp->d_name, tdp->td_name,
1508 			    DIRENT64_NAMELEN(reclen));
1509 			dp->d_reclen = (ushort_t)reclen;
1510 			dp->d_ino = (ino64_t)tdp->td_tmpnode->tn_nodeid;
1511 			dp->d_off = (offset_t)tdp->td_offset + 1;
1512 			dp = (struct dirent64 *)
1513 			    ((uintptr_t)dp + dp->d_reclen);
1514 			outcount += reclen;
1515 			ASSERT(outcount <= bufsize);
1516 		}
1517 		tdp = tdp->td_next;
1518 	}
1519 
1520 	if (!error)
1521 		error = uiomove(outbuf, outcount, UIO_READ, uiop);
1522 
1523 	if (!error) {
1524 		/* If we reached the end of the list our offset */
1525 		/* should now be just past the end. */
1526 		if (!tdp) {
1527 			offset += 1;
1528 			if (eofp)
1529 				*eofp = 1;
1530 		} else if (eofp)
1531 			*eofp = 0;
1532 		uiop->uio_offset = offset;
1533 	}
1534 	gethrestime(&tp->tn_atime);
1535 	kmem_free(outbuf, bufsize);
1536 	return (error);
1537 }
1538 
1539 /* ARGSUSED5 */
1540 static int
1541 tmp_symlink(
1542 	struct vnode *dvp,
1543 	char *lnm,
1544 	struct vattr *tva,
1545 	char *tnm,
1546 	struct cred *cred,
1547 	caller_context_t *ct,
1548 	int flags)
1549 {
1550 	struct tmpnode *parent = (struct tmpnode *)VTOTN(dvp);
1551 	struct tmpnode *self = (struct tmpnode *)NULL;
1552 	struct tmount *tm = (struct tmount *)VTOTM(dvp);
1553 	char *cp = NULL;
1554 	int error;
1555 	size_t len;
1556 
1557 	/* no symlinks allowed to files in xattr dirs */
1558 	if (parent->tn_flags & ISXATTR)
1559 		return (EINVAL);
1560 
1561 	error = tdirlookup(parent, lnm, &self, cred);
1562 	if (error == 0) {
1563 		/*
1564 		 * The entry already exists
1565 		 */
1566 		tmpnode_rele(self);
1567 		return (EEXIST);	/* was 0 */
1568 	}
1569 
1570 	if (error != ENOENT) {
1571 		if (self != NULL)
1572 			tmpnode_rele(self);
1573 		return (error);
1574 	}
1575 
1576 	rw_enter(&parent->tn_rwlock, RW_WRITER);
1577 	error = tdirenter(tm, parent, lnm, DE_CREATE, (struct tmpnode *)NULL,
1578 	    (struct tmpnode *)NULL, tva, &self, cred, ct);
1579 	rw_exit(&parent->tn_rwlock);
1580 
1581 	if (error) {
1582 		if (self)
1583 			tmpnode_rele(self);
1584 		return (error);
1585 	}
1586 	len = strlen(tnm) + 1;
1587 	cp = tmp_memalloc(len, 0);
1588 	if (cp == NULL) {
1589 		tmpnode_rele(self);
1590 		return (ENOSPC);
1591 	}
1592 	(void) strcpy(cp, tnm);
1593 
1594 	self->tn_symlink = cp;
1595 	self->tn_size = len - 1;
1596 	tmpnode_rele(self);
1597 	return (error);
1598 }
1599 
1600 /* ARGSUSED2 */
1601 static int
1602 tmp_readlink(
1603 	struct vnode *vp,
1604 	struct uio *uiop,
1605 	struct cred *cred,
1606 	caller_context_t *ct)
1607 {
1608 	struct tmpnode *tp = (struct tmpnode *)VTOTN(vp);
1609 	int error = 0;
1610 
1611 	if (vp->v_type != VLNK)
1612 		return (EINVAL);
1613 
1614 	rw_enter(&tp->tn_rwlock, RW_READER);
1615 	rw_enter(&tp->tn_contents, RW_READER);
1616 	error = uiomove(tp->tn_symlink, tp->tn_size, UIO_READ, uiop);
1617 	gethrestime(&tp->tn_atime);
1618 	rw_exit(&tp->tn_contents);
1619 	rw_exit(&tp->tn_rwlock);
1620 	return (error);
1621 }
1622 
1623 /* ARGSUSED */
1624 static int
1625 tmp_fsync(
1626 	struct vnode *vp,
1627 	int syncflag,
1628 	struct cred *cred,
1629 	caller_context_t *ct)
1630 {
1631 	return (0);
1632 }
1633 
1634 /* ARGSUSED */
1635 static void
1636 tmp_inactive(struct vnode *vp, struct cred *cred, caller_context_t *ct)
1637 {
1638 	struct tmpnode *tp = (struct tmpnode *)VTOTN(vp);
1639 	struct tmount *tm = (struct tmount *)VFSTOTM(vp->v_vfsp);
1640 
1641 	rw_enter(&tp->tn_rwlock, RW_WRITER);
1642 top:
1643 	mutex_enter(&tp->tn_tlock);
1644 	mutex_enter(&vp->v_lock);
1645 	ASSERT(vp->v_count >= 1);
1646 
1647 	/*
1648 	 * If we don't have the last hold or the link count is non-zero,
1649 	 * there's little to do -- just drop our hold.
1650 	 */
1651 	if (vp->v_count > 1 || tp->tn_nlink != 0) {
1652 		vp->v_count--;
1653 		mutex_exit(&vp->v_lock);
1654 		mutex_exit(&tp->tn_tlock);
1655 		rw_exit(&tp->tn_rwlock);
1656 		return;
1657 	}
1658 
1659 	/*
1660 	 * We have the last hold *and* the link count is zero, so this
1661 	 * tmpnode is dead from the filesystem's viewpoint.  However,
1662 	 * if the tmpnode has any pages associated with it (i.e. if it's
1663 	 * a normal file with non-zero size), the tmpnode can still be
1664 	 * discovered by pageout or fsflush via the page vnode pointers.
1665 	 * In this case we must drop all our locks, truncate the tmpnode,
1666 	 * and try the whole dance again.
1667 	 */
1668 	if (tp->tn_size != 0) {
1669 		if (tp->tn_type == VREG) {
1670 			mutex_exit(&vp->v_lock);
1671 			mutex_exit(&tp->tn_tlock);
1672 			rw_enter(&tp->tn_contents, RW_WRITER);
1673 			(void) tmpnode_trunc(tm, tp, 0);
1674 			rw_exit(&tp->tn_contents);
1675 			ASSERT(tp->tn_size == 0);
1676 			ASSERT(tp->tn_nblocks == 0);
1677 			goto top;
1678 		}
1679 		if (tp->tn_type == VLNK)
1680 			tmp_memfree(tp->tn_symlink, tp->tn_size + 1);
1681 	}
1682 
1683 	/*
1684 	 * Remove normal file/dir's xattr dir and xattrs.
1685 	 */
1686 	if (tp->tn_xattrdp) {
1687 		struct tmpnode *xtp = tp->tn_xattrdp;
1688 
1689 		ASSERT(xtp->tn_flags & ISXATTR);
1690 		tmpnode_hold(xtp);
1691 		rw_enter(&xtp->tn_rwlock, RW_WRITER);
1692 		tdirtrunc(xtp);
1693 		DECR_COUNT(&xtp->tn_nlink, &xtp->tn_tlock);
1694 		tp->tn_xattrdp = NULL;
1695 		rw_exit(&xtp->tn_rwlock);
1696 		tmpnode_rele(xtp);
1697 	}
1698 
1699 	mutex_exit(&vp->v_lock);
1700 	mutex_exit(&tp->tn_tlock);
1701 	/* Here's our chance to send invalid event while we're between locks */
1702 	vn_invalid(TNTOV(tp));
1703 	mutex_enter(&tm->tm_contents);
1704 	if (tp->tn_forw == NULL)
1705 		tm->tm_rootnode->tn_back = tp->tn_back;
1706 	else
1707 		tp->tn_forw->tn_back = tp->tn_back;
1708 	tp->tn_back->tn_forw = tp->tn_forw;
1709 	mutex_exit(&tm->tm_contents);
1710 	rw_exit(&tp->tn_rwlock);
1711 	rw_destroy(&tp->tn_rwlock);
1712 	mutex_destroy(&tp->tn_tlock);
1713 	vn_free(TNTOV(tp));
1714 	tmp_memfree(tp, sizeof (struct tmpnode));
1715 }
1716 
1717 /* ARGSUSED2 */
1718 static int
1719 tmp_fid(struct vnode *vp, struct fid *fidp, caller_context_t *ct)
1720 {
1721 	struct tmpnode *tp = (struct tmpnode *)VTOTN(vp);
1722 	struct tfid *tfid;
1723 
1724 	if (fidp->fid_len < (sizeof (struct tfid) - sizeof (ushort_t))) {
1725 		fidp->fid_len = sizeof (struct tfid) - sizeof (ushort_t);
1726 		return (ENOSPC);
1727 	}
1728 
1729 	tfid = (struct tfid *)fidp;
1730 	bzero(tfid, sizeof (struct tfid));
1731 	tfid->tfid_len = (int)sizeof (struct tfid) - sizeof (ushort_t);
1732 
1733 	tfid->tfid_ino = tp->tn_nodeid;
1734 	tfid->tfid_gen = tp->tn_gen;
1735 
1736 	return (0);
1737 }
1738 
1739 
1740 /*
1741  * Return all the pages from [off..off+len] in given file
1742  */
1743 /* ARGSUSED */
1744 static int
1745 tmp_getpage(
1746 	struct vnode *vp,
1747 	offset_t off,
1748 	size_t len,
1749 	uint_t *protp,
1750 	page_t *pl[],
1751 	size_t plsz,
1752 	struct seg *seg,
1753 	caddr_t addr,
1754 	enum seg_rw rw,
1755 	struct cred *cr,
1756 	caller_context_t *ct)
1757 {
1758 	int err = 0;
1759 	struct tmpnode *tp = VTOTN(vp);
1760 	anoff_t toff = (anoff_t)off;
1761 	size_t tlen = len;
1762 	u_offset_t tmpoff;
1763 	timestruc_t now;
1764 
1765 	rw_enter(&tp->tn_contents, RW_READER);
1766 
1767 	if (off + len  > tp->tn_size + PAGEOFFSET) {
1768 		err = EFAULT;
1769 		goto out;
1770 	}
1771 	/*
1772 	 * Look for holes (no anon slot) in faulting range. If there are
1773 	 * holes we have to switch to a write lock and fill them in. Swap
1774 	 * space for holes was already reserved when the file was grown.
1775 	 */
1776 	tmpoff = toff;
1777 	if (non_anon(tp->tn_anon, btop(off), &tmpoff, &tlen)) {
1778 		if (!rw_tryupgrade(&tp->tn_contents)) {
1779 			rw_exit(&tp->tn_contents);
1780 			rw_enter(&tp->tn_contents, RW_WRITER);
1781 			/* Size may have changed when lock was dropped */
1782 			if (off + len  > tp->tn_size + PAGEOFFSET) {
1783 				err = EFAULT;
1784 				goto out;
1785 			}
1786 		}
1787 		for (toff = (anoff_t)off; toff < (anoff_t)off + len;
1788 		    toff += PAGESIZE) {
1789 			if (anon_get_ptr(tp->tn_anon, btop(toff)) == NULL) {
1790 				/* XXX - may allocate mem w. write lock held */
1791 				(void) anon_set_ptr(tp->tn_anon, btop(toff),
1792 				    anon_alloc(vp, toff), ANON_SLEEP);
1793 				tp->tn_nblocks++;
1794 			}
1795 		}
1796 		rw_downgrade(&tp->tn_contents);
1797 	}
1798 
1799 
1800 	if (len <= PAGESIZE)
1801 		err = tmp_getapage(vp, (u_offset_t)off, len, protp, pl, plsz,
1802 		    seg, addr, rw, cr);
1803 	else
1804 		err = pvn_getpages(tmp_getapage, vp, (u_offset_t)off, len,
1805 		    protp, pl, plsz, seg, addr, rw, cr);
1806 
1807 	gethrestime(&now);
1808 	tp->tn_atime = now;
1809 	if (rw == S_WRITE)
1810 		tp->tn_mtime = now;
1811 
1812 out:
1813 	rw_exit(&tp->tn_contents);
1814 	return (err);
1815 }
1816 
1817 /*
1818  * Called from pvn_getpages or swap_getpage to get a particular page.
1819  */
1820 /*ARGSUSED*/
1821 static int
1822 tmp_getapage(
1823 	struct vnode *vp,
1824 	u_offset_t off,
1825 	size_t len,
1826 	uint_t *protp,
1827 	page_t *pl[],
1828 	size_t plsz,
1829 	struct seg *seg,
1830 	caddr_t addr,
1831 	enum seg_rw rw,
1832 	struct cred *cr)
1833 {
1834 	struct page *pp;
1835 	int flags;
1836 	int err = 0;
1837 	struct vnode *pvp;
1838 	u_offset_t poff;
1839 
1840 	if (protp != NULL)
1841 		*protp = PROT_ALL;
1842 again:
1843 	if (pp = page_lookup(vp, off, rw == S_CREATE ? SE_EXCL : SE_SHARED)) {
1844 		if (pl) {
1845 			pl[0] = pp;
1846 			pl[1] = NULL;
1847 		} else {
1848 			page_unlock(pp);
1849 		}
1850 	} else {
1851 		pp = page_create_va(vp, off, PAGESIZE,
1852 		    PG_WAIT | PG_EXCL, seg, addr);
1853 		/*
1854 		 * Someone raced in and created the page after we did the
1855 		 * lookup but before we did the create, so go back and
1856 		 * try to look it up again.
1857 		 */
1858 		if (pp == NULL)
1859 			goto again;
1860 		/*
1861 		 * Fill page from backing store, if any. If none, then
1862 		 * either this is a newly filled hole or page must have
1863 		 * been unmodified and freed so just zero it out.
1864 		 */
1865 		err = swap_getphysname(vp, off, &pvp, &poff);
1866 		if (err) {
1867 			panic("tmp_getapage: no anon slot vp %p "
1868 			    "off %llx pp %p\n", (void *)vp, off, (void *)pp);
1869 		}
1870 		if (pvp) {
1871 			flags = (pl == NULL ? B_ASYNC|B_READ : B_READ);
1872 			err = VOP_PAGEIO(pvp, pp, (u_offset_t)poff, PAGESIZE,
1873 			    flags, cr, NULL);
1874 			if (flags & B_ASYNC)
1875 				pp = NULL;
1876 		} else if (rw != S_CREATE) {
1877 			pagezero(pp, 0, PAGESIZE);
1878 		}
1879 		if (err && pp)
1880 			pvn_read_done(pp, B_ERROR);
1881 		if (err == 0) {
1882 			if (pl)
1883 				pvn_plist_init(pp, pl, plsz, off, PAGESIZE, rw);
1884 			else
1885 				pvn_io_done(pp);
1886 		}
1887 	}
1888 	return (err);
1889 }
1890 
1891 
1892 /*
1893  * Flags are composed of {B_INVAL, B_DIRTY B_FREE, B_DONTNEED}.
1894  * If len == 0, do from off to EOF.
1895  */
1896 static int tmp_nopage = 0;	/* Don't do tmp_putpage's if set */
1897 
1898 /* ARGSUSED */
1899 int
1900 tmp_putpage(
1901 	register struct vnode *vp,
1902 	offset_t off,
1903 	size_t len,
1904 	int flags,
1905 	struct cred *cr,
1906 	caller_context_t *ct)
1907 {
1908 	register page_t *pp;
1909 	u_offset_t io_off;
1910 	size_t io_len = 0;
1911 	int err = 0;
1912 	struct tmpnode *tp = VTOTN(vp);
1913 	int dolock;
1914 
1915 	if (tmp_nopage)
1916 		return (0);
1917 
1918 	ASSERT(vp->v_count != 0);
1919 
1920 	if (vp->v_flag & VNOMAP)
1921 		return (ENOSYS);
1922 
1923 	/*
1924 	 * This being tmpfs, we don't ever do i/o unless we really
1925 	 * have to (when we're low on memory and pageout calls us
1926 	 * with B_ASYNC | B_FREE or the user explicitly asks for it with
1927 	 * B_DONTNEED).
1928 	 * XXX to approximately track the mod time like ufs we should
1929 	 * update the times here. The problem is, once someone does a
1930 	 * store we never clear the mod bit and do i/o, thus fsflush
1931 	 * will keep calling us every 30 seconds to do the i/o and we'll
1932 	 * continually update the mod time. At least we update the mod
1933 	 * time on the first store because this results in a call to getpage.
1934 	 */
1935 	if (flags != (B_ASYNC | B_FREE) && (flags & B_INVAL) == 0 &&
1936 	    (flags & B_DONTNEED) == 0)
1937 		return (0);
1938 	/*
1939 	 * If this thread owns the lock, i.e., this thread grabbed it
1940 	 * as writer somewhere above, then we don't need to grab the
1941 	 * lock as reader in this routine.
1942 	 */
1943 	dolock = (rw_owner(&tp->tn_contents) != curthread);
1944 
1945 	/*
1946 	 * If this is pageout don't block on the lock as you could deadlock
1947 	 * when freemem == 0 (another thread has the read lock and is blocked
1948 	 * creating a page, and a third thread is waiting to get the writers
1949 	 * lock - waiting writers priority blocks us from getting the read
1950 	 * lock). Of course, if the only freeable pages are on this tmpnode
1951 	 * we're hosed anyways. A better solution might be a new lock type.
1952 	 * Note: ufs has the same problem.
1953 	 */
1954 	if (curproc == proc_pageout) {
1955 		if (!rw_tryenter(&tp->tn_contents, RW_READER))
1956 			return (ENOMEM);
1957 	} else if (dolock)
1958 		rw_enter(&tp->tn_contents, RW_READER);
1959 
1960 	if (!vn_has_cached_data(vp))
1961 		goto out;
1962 
1963 	if (len == 0) {
1964 		if (curproc == proc_pageout) {
1965 			panic("tmp: pageout can't block");
1966 			/*NOTREACHED*/
1967 		}
1968 
1969 		/* Search the entire vp list for pages >= off. */
1970 		err = pvn_vplist_dirty(vp, (u_offset_t)off, tmp_putapage,
1971 		    flags, cr);
1972 	} else {
1973 		u_offset_t eoff;
1974 
1975 		/*
1976 		 * Loop over all offsets in the range [off...off + len]
1977 		 * looking for pages to deal with.
1978 		 */
1979 		eoff = MIN(off + len, tp->tn_size);
1980 		for (io_off = off; io_off < eoff; io_off += io_len) {
1981 			/*
1982 			 * If we are not invalidating, synchronously
1983 			 * freeing or writing pages use the routine
1984 			 * page_lookup_nowait() to prevent reclaiming
1985 			 * them from the free list.
1986 			 */
1987 			if ((flags & B_INVAL) || ((flags & B_ASYNC) == 0)) {
1988 				pp = page_lookup(vp, io_off,
1989 				    (flags & (B_INVAL | B_FREE)) ?
1990 				    SE_EXCL : SE_SHARED);
1991 			} else {
1992 				pp = page_lookup_nowait(vp, io_off,
1993 				    (flags & B_FREE) ? SE_EXCL : SE_SHARED);
1994 			}
1995 
1996 			if (pp == NULL || pvn_getdirty(pp, flags) == 0)
1997 				io_len = PAGESIZE;
1998 			else {
1999 				err = tmp_putapage(vp, pp, &io_off, &io_len,
2000 				    flags, cr);
2001 				if (err != 0)
2002 					break;
2003 			}
2004 		}
2005 	}
2006 	/* If invalidating, verify all pages on vnode list are gone. */
2007 	if (err == 0 && off == 0 && len == 0 &&
2008 	    (flags & B_INVAL) && vn_has_cached_data(vp)) {
2009 		panic("tmp_putpage: B_INVAL, pages not gone");
2010 		/*NOTREACHED*/
2011 	}
2012 out:
2013 	if ((curproc == proc_pageout) || dolock)
2014 		rw_exit(&tp->tn_contents);
2015 	/*
2016 	 * Only reason putapage is going to give us SE_NOSWAP as error
2017 	 * is when we ask a page to be written to physical backing store
2018 	 * and there is none. Ignore this because we might be dealing
2019 	 * with a swap page which does not have any backing store
2020 	 * on disk. In any other case we won't get this error over here.
2021 	 */
2022 	if (err == SE_NOSWAP)
2023 		err = 0;
2024 	return (err);
2025 }
2026 
2027 long tmp_putpagecnt, tmp_pagespushed;
2028 
2029 /*
2030  * Write out a single page.
2031  * For tmpfs this means choose a physical swap slot and write the page
2032  * out using VOP_PAGEIO. For performance, we attempt to kluster; i.e.,
2033  * we try to find a bunch of other dirty pages adjacent in the file
2034  * and a bunch of contiguous swap slots, and then write all the pages
2035  * out in a single i/o.
2036  */
2037 /*ARGSUSED*/
2038 static int
2039 tmp_putapage(
2040 	struct vnode *vp,
2041 	page_t *pp,
2042 	u_offset_t *offp,
2043 	size_t *lenp,
2044 	int flags,
2045 	struct cred *cr)
2046 {
2047 	int err;
2048 	ulong_t klstart, kllen;
2049 	page_t *pplist, *npplist;
2050 	extern int klustsize;
2051 	long tmp_klustsize;
2052 	struct tmpnode *tp;
2053 	size_t pp_off, pp_len;
2054 	u_offset_t io_off;
2055 	size_t io_len;
2056 	struct vnode *pvp;
2057 	u_offset_t pstart;
2058 	u_offset_t offset;
2059 	u_offset_t tmpoff;
2060 
2061 	ASSERT(PAGE_LOCKED(pp));
2062 
2063 	/* Kluster in tmp_klustsize chunks */
2064 	tp = VTOTN(vp);
2065 	tmp_klustsize = klustsize;
2066 	offset = pp->p_offset;
2067 	klstart = (offset / tmp_klustsize) * tmp_klustsize;
2068 	kllen = MIN(tmp_klustsize, tp->tn_size - klstart);
2069 
2070 	/* Get a kluster of pages */
2071 	pplist =
2072 	    pvn_write_kluster(vp, pp, &tmpoff, &pp_len, klstart, kllen, flags);
2073 
2074 	pp_off = (size_t)tmpoff;
2075 
2076 	/*
2077 	 * Get a cluster of physical offsets for the pages; the amount we
2078 	 * get may be some subrange of what we ask for (io_off, io_len).
2079 	 */
2080 	io_off = pp_off;
2081 	io_len = pp_len;
2082 	err = swap_newphysname(vp, offset, &io_off, &io_len, &pvp, &pstart);
2083 	ASSERT(err != SE_NOANON); /* anon slot must have been filled */
2084 	if (err) {
2085 		pvn_write_done(pplist, B_ERROR | B_WRITE | flags);
2086 		/*
2087 		 * If this routine is called as a result of segvn_sync
2088 		 * operation and we have no physical swap then we can get an
2089 		 * error here. In such case we would return SE_NOSWAP as error.
2090 		 * At this point, we expect only SE_NOSWAP.
2091 		 */
2092 		ASSERT(err == SE_NOSWAP);
2093 		if (flags & B_INVAL)
2094 			err = ENOMEM;
2095 		goto out;
2096 	}
2097 	ASSERT(pp_off <= io_off && io_off + io_len <= pp_off + pp_len);
2098 	ASSERT(io_off <= offset && offset < io_off + io_len);
2099 
2100 	/* Toss pages at front/rear that we couldn't get physical backing for */
2101 	if (io_off != pp_off) {
2102 		npplist = NULL;
2103 		page_list_break(&pplist, &npplist, btop(io_off - pp_off));
2104 		ASSERT(pplist->p_offset == pp_off);
2105 		ASSERT(pplist->p_prev->p_offset == io_off - PAGESIZE);
2106 		pvn_write_done(pplist, B_ERROR | B_WRITE | flags);
2107 		pplist = npplist;
2108 	}
2109 	if (io_off + io_len < pp_off + pp_len) {
2110 		npplist = NULL;
2111 		page_list_break(&pplist, &npplist, btop(io_len));
2112 		ASSERT(npplist->p_offset == io_off + io_len);
2113 		ASSERT(npplist->p_prev->p_offset == pp_off + pp_len - PAGESIZE);
2114 		pvn_write_done(npplist, B_ERROR | B_WRITE | flags);
2115 	}
2116 
2117 	ASSERT(pplist->p_offset == io_off);
2118 	ASSERT(pplist->p_prev->p_offset == io_off + io_len - PAGESIZE);
2119 	ASSERT(btopr(io_len) <= btopr(kllen));
2120 
2121 	/* Do i/o on the remaining kluster */
2122 	err = VOP_PAGEIO(pvp, pplist, (u_offset_t)pstart, io_len,
2123 	    B_WRITE | flags, cr, NULL);
2124 
2125 	if ((flags & B_ASYNC) == 0) {
2126 		pvn_write_done(pplist, ((err) ? B_ERROR : 0) | B_WRITE | flags);
2127 	}
2128 out:
2129 	if (!err) {
2130 		if (offp)
2131 			*offp = io_off;
2132 		if (lenp)
2133 			*lenp = io_len;
2134 		tmp_putpagecnt++;
2135 		tmp_pagespushed += btop(io_len);
2136 	}
2137 	if (err && err != ENOMEM && err != SE_NOSWAP)
2138 		cmn_err(CE_WARN, "tmp_putapage: err %d\n", err);
2139 	return (err);
2140 }
2141 
2142 /* ARGSUSED */
2143 static int
2144 tmp_map(
2145 	struct vnode *vp,
2146 	offset_t off,
2147 	struct as *as,
2148 	caddr_t *addrp,
2149 	size_t len,
2150 	uchar_t prot,
2151 	uchar_t maxprot,
2152 	uint_t flags,
2153 	struct cred *cred,
2154 	caller_context_t *ct)
2155 {
2156 	struct segvn_crargs vn_a;
2157 	struct tmpnode *tp = (struct tmpnode *)VTOTN(vp);
2158 	int error;
2159 
2160 #ifdef _ILP32
2161 	if (len > MAXOFF_T)
2162 		return (ENOMEM);
2163 #endif
2164 
2165 	if (vp->v_flag & VNOMAP)
2166 		return (ENOSYS);
2167 
2168 	if (off < 0 || (offset_t)(off + len) < 0 ||
2169 	    off > MAXOFF_T || (off + len) > MAXOFF_T)
2170 		return (ENXIO);
2171 
2172 	if (vp->v_type != VREG)
2173 		return (ENODEV);
2174 
2175 	/*
2176 	 * Don't allow mapping to locked file
2177 	 */
2178 	if (vn_has_mandatory_locks(vp, tp->tn_mode)) {
2179 		return (EAGAIN);
2180 	}
2181 
2182 	as_rangelock(as);
2183 	error = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags);
2184 	if (error != 0) {
2185 		as_rangeunlock(as);
2186 		return (error);
2187 	}
2188 
2189 	vn_a.vp = vp;
2190 	vn_a.offset = (u_offset_t)off;
2191 	vn_a.type = flags & MAP_TYPE;
2192 	vn_a.prot = prot;
2193 	vn_a.maxprot = maxprot;
2194 	vn_a.flags = flags & ~MAP_TYPE;
2195 	vn_a.cred = cred;
2196 	vn_a.amp = NULL;
2197 	vn_a.szc = 0;
2198 	vn_a.lgrp_mem_policy_flags = 0;
2199 
2200 	error = as_map(as, *addrp, len, segvn_create, &vn_a);
2201 	as_rangeunlock(as);
2202 	return (error);
2203 }
2204 
2205 /*
2206  * tmp_addmap and tmp_delmap can't be called since the vp
2207  * maintained in the segvn mapping is NULL.
2208  */
2209 /* ARGSUSED */
2210 static int
2211 tmp_addmap(
2212 	struct vnode *vp,
2213 	offset_t off,
2214 	struct as *as,
2215 	caddr_t addr,
2216 	size_t len,
2217 	uchar_t prot,
2218 	uchar_t maxprot,
2219 	uint_t flags,
2220 	struct cred *cred,
2221 	caller_context_t *ct)
2222 {
2223 	return (0);
2224 }
2225 
2226 /* ARGSUSED */
2227 static int
2228 tmp_delmap(
2229 	struct vnode *vp,
2230 	offset_t off,
2231 	struct as *as,
2232 	caddr_t addr,
2233 	size_t len,
2234 	uint_t prot,
2235 	uint_t maxprot,
2236 	uint_t flags,
2237 	struct cred *cred,
2238 	caller_context_t *ct)
2239 {
2240 	return (0);
2241 }
2242 
2243 static int
2244 tmp_freesp(struct vnode *vp, struct flock64 *lp, int flag)
2245 {
2246 	register int i;
2247 	register struct tmpnode *tp = VTOTN(vp);
2248 	int error;
2249 
2250 	ASSERT(vp->v_type == VREG);
2251 	ASSERT(lp->l_start >= 0);
2252 
2253 	if (lp->l_len != 0)
2254 		return (EINVAL);
2255 
2256 	rw_enter(&tp->tn_rwlock, RW_WRITER);
2257 	if (tp->tn_size == lp->l_start) {
2258 		rw_exit(&tp->tn_rwlock);
2259 		return (0);
2260 	}
2261 
2262 	/*
2263 	 * Check for any mandatory locks on the range
2264 	 */
2265 	if (MANDLOCK(vp, tp->tn_mode)) {
2266 		long save_start;
2267 
2268 		save_start = lp->l_start;
2269 
2270 		if (tp->tn_size < lp->l_start) {
2271 			/*
2272 			 * "Truncate up" case: need to make sure there
2273 			 * is no lock beyond current end-of-file. To
2274 			 * do so, we need to set l_start to the size
2275 			 * of the file temporarily.
2276 			 */
2277 			lp->l_start = tp->tn_size;
2278 		}
2279 		lp->l_type = F_WRLCK;
2280 		lp->l_sysid = 0;
2281 		lp->l_pid = ttoproc(curthread)->p_pid;
2282 		i = (flag & (FNDELAY|FNONBLOCK)) ? 0 : SLPFLCK;
2283 		if ((i = reclock(vp, lp, i, 0, lp->l_start, NULL)) != 0 ||
2284 		    lp->l_type != F_UNLCK) {
2285 			rw_exit(&tp->tn_rwlock);
2286 			return (i ? i : EAGAIN);
2287 		}
2288 
2289 		lp->l_start = save_start;
2290 	}
2291 	VFSTOTM(vp->v_vfsp);
2292 
2293 	rw_enter(&tp->tn_contents, RW_WRITER);
2294 	error = tmpnode_trunc((struct tmount *)VFSTOTM(vp->v_vfsp),
2295 	    tp, (ulong_t)lp->l_start);
2296 	rw_exit(&tp->tn_contents);
2297 	rw_exit(&tp->tn_rwlock);
2298 	return (error);
2299 }
2300 
2301 /* ARGSUSED */
2302 static int
2303 tmp_space(
2304 	struct vnode *vp,
2305 	int cmd,
2306 	struct flock64 *bfp,
2307 	int flag,
2308 	offset_t offset,
2309 	cred_t *cred,
2310 	caller_context_t *ct)
2311 {
2312 	int error;
2313 
2314 	if (cmd != F_FREESP)
2315 		return (EINVAL);
2316 	if ((error = convoff(vp, bfp, 0, (offset_t)offset)) == 0) {
2317 		if ((bfp->l_start > MAXOFF_T) || (bfp->l_len > MAXOFF_T))
2318 			return (EFBIG);
2319 		error = tmp_freesp(vp, bfp, flag);
2320 	}
2321 	return (error);
2322 }
2323 
2324 /* ARGSUSED */
2325 static int
2326 tmp_seek(
2327 	struct vnode *vp,
2328 	offset_t ooff,
2329 	offset_t *noffp,
2330 	caller_context_t *ct)
2331 {
2332 	return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0);
2333 }
2334 
2335 /* ARGSUSED2 */
2336 static int
2337 tmp_rwlock(struct vnode *vp, int write_lock, caller_context_t *ctp)
2338 {
2339 	struct tmpnode *tp = VTOTN(vp);
2340 
2341 	if (write_lock) {
2342 		rw_enter(&tp->tn_rwlock, RW_WRITER);
2343 	} else {
2344 		rw_enter(&tp->tn_rwlock, RW_READER);
2345 	}
2346 	return (write_lock);
2347 }
2348 
2349 /* ARGSUSED1 */
2350 static void
2351 tmp_rwunlock(struct vnode *vp, int write_lock, caller_context_t *ctp)
2352 {
2353 	struct tmpnode *tp = VTOTN(vp);
2354 
2355 	rw_exit(&tp->tn_rwlock);
2356 }
2357 
2358 static int
2359 tmp_pathconf(
2360 	struct vnode *vp,
2361 	int cmd,
2362 	ulong_t *valp,
2363 	cred_t *cr,
2364 	caller_context_t *ct)
2365 {
2366 	struct tmpnode *tp = NULL;
2367 	int error;
2368 
2369 	switch (cmd) {
2370 	case _PC_XATTR_EXISTS:
2371 		if (vp->v_vfsp->vfs_flag & VFS_XATTR) {
2372 			*valp = 0;	/* assume no attributes */
2373 			error = 0;	/* okay to ask */
2374 			tp = VTOTN(vp);
2375 			rw_enter(&tp->tn_rwlock, RW_READER);
2376 			if (tp->tn_xattrdp) {
2377 				rw_enter(&tp->tn_xattrdp->tn_rwlock, RW_READER);
2378 				/* do not count "." and ".." */
2379 				if (tp->tn_xattrdp->tn_dirents > 2)
2380 					*valp = 1;
2381 				rw_exit(&tp->tn_xattrdp->tn_rwlock);
2382 			}
2383 			rw_exit(&tp->tn_rwlock);
2384 		} else {
2385 			error = EINVAL;
2386 		}
2387 		break;
2388 	case _PC_SATTR_ENABLED:
2389 	case _PC_SATTR_EXISTS:
2390 		*valp = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) &&
2391 		    (vp->v_type == VREG || vp->v_type == VDIR);
2392 		error = 0;
2393 		break;
2394 	default:
2395 		error = fs_pathconf(vp, cmd, valp, cr, ct);
2396 	}
2397 	return (error);
2398 }
2399 
2400 
/*
 * Pointer to the tmpfs vnode operations vector.  It is only declared
 * here and is not assigned anywhere in this part of the file.
 * NOTE(review): presumably filled in from tmp_vnodeops_template during
 * file system initialization -- confirm against the init code.
 */
2401 struct vnodeops *tmp_vnodeops;
2402 
/*
 * Template pairing each vnode operation name (VOPNAME_*) with the
 * tmpfs routine that implements it.  Every entry points at a tmp_*
 * function from this file except VOPNAME_VNEVENT, which uses
 * fs_vnevent_support.  The list is terminated by the NULL, NULL pair.
 */
2403 const fs_operation_def_t tmp_vnodeops_template[] = {
2404 	VOPNAME_OPEN,		{ .vop_open = tmp_open },
2405 	VOPNAME_CLOSE,		{ .vop_close = tmp_close },
2406 	VOPNAME_READ,		{ .vop_read = tmp_read },
2407 	VOPNAME_WRITE,		{ .vop_write = tmp_write },
2408 	VOPNAME_IOCTL,		{ .vop_ioctl = tmp_ioctl },
2409 	VOPNAME_GETATTR,	{ .vop_getattr = tmp_getattr },
2410 	VOPNAME_SETATTR,	{ .vop_setattr = tmp_setattr },
2411 	VOPNAME_ACCESS,		{ .vop_access = tmp_access },
2412 	VOPNAME_LOOKUP,		{ .vop_lookup = tmp_lookup },
2413 	VOPNAME_CREATE,		{ .vop_create = tmp_create },
2414 	VOPNAME_REMOVE,		{ .vop_remove = tmp_remove },
2415 	VOPNAME_LINK,		{ .vop_link = tmp_link },
2416 	VOPNAME_RENAME,		{ .vop_rename = tmp_rename },
2417 	VOPNAME_MKDIR,		{ .vop_mkdir = tmp_mkdir },
2418 	VOPNAME_RMDIR,		{ .vop_rmdir = tmp_rmdir },
2419 	VOPNAME_READDIR,	{ .vop_readdir = tmp_readdir },
2420 	VOPNAME_SYMLINK,	{ .vop_symlink = tmp_symlink },
2421 	VOPNAME_READLINK,	{ .vop_readlink = tmp_readlink },
2422 	VOPNAME_FSYNC,		{ .vop_fsync = tmp_fsync },
2423 	VOPNAME_INACTIVE,	{ .vop_inactive = tmp_inactive },
2424 	VOPNAME_FID,		{ .vop_fid = tmp_fid },
2425 	VOPNAME_RWLOCK,		{ .vop_rwlock = tmp_rwlock },
2426 	VOPNAME_RWUNLOCK,	{ .vop_rwunlock = tmp_rwunlock },
2427 	VOPNAME_SEEK,		{ .vop_seek = tmp_seek },
2428 	VOPNAME_SPACE,		{ .vop_space = tmp_space },
2429 	VOPNAME_GETPAGE,	{ .vop_getpage = tmp_getpage },
2430 	VOPNAME_PUTPAGE,	{ .vop_putpage = tmp_putpage },
2431 	VOPNAME_MAP,		{ .vop_map = tmp_map },
2432 	VOPNAME_ADDMAP,		{ .vop_addmap = tmp_addmap },
2433 	VOPNAME_DELMAP,		{ .vop_delmap = tmp_delmap },
2434 	VOPNAME_PATHCONF,	{ .vop_pathconf = tmp_pathconf },
2435 	VOPNAME_VNEVENT,	{ .vop_vnevent = fs_vnevent_support },
2436 	NULL,			NULL
2437 };
2438