xref: /titanic_44/usr/src/uts/common/fs/tmpfs/tmp_dir.c (revision 2b4a78020b9c38d1b95e2f3fefa6d6e4be382d1f)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 #include <sys/types.h>
29 #include <sys/param.h>
30 #include <sys/sysmacros.h>
31 #include <sys/systm.h>
32 #include <sys/time.h>
33 #include <sys/vfs.h>
34 #include <sys/vnode.h>
35 #include <sys/errno.h>
36 #include <sys/cmn_err.h>
37 #include <sys/cred.h>
38 #include <sys/stat.h>
39 #include <sys/debug.h>
40 #include <sys/policy.h>
41 #include <sys/fs/tmpnode.h>
42 #include <sys/fs/tmp.h>
43 #include <sys/vtrace.h>
44 
/*
 * Forward declarations for the file-local helpers defined further down.
 */
static int tdiraddentry(struct tmpnode *dir, struct tmpnode *tp, char *name,
	enum de_op op, struct tmpnode *fromtp);
static int tdircheckpath(struct tmpnode *fromtp, struct tmpnode *toparent,
	struct cred *cred);
static void tdirfixdotdot(struct tmpnode *fromtp, struct tmpnode *fromparent,
	struct tmpnode *toparent);
static int tdirmaketnode(struct tmpnode *dir, struct tmount *tm,
	struct vattr *va, enum de_op op, struct tmpnode **newnode,
	struct cred *cred);
static int tdirrename(struct tmpnode *fromparent, struct tmpnode *fromtp,
	struct tmpnode *toparent, char *nm, struct tmpnode *to,
	struct tdirent *where, struct cred *cred);
53 
54 
55 #define	T_HASH_SIZE	8192		/* must be power of 2 */
56 #define	T_MUTEX_SIZE	64
57 
58 static struct tdirent	*t_hashtable[T_HASH_SIZE];
59 static kmutex_t		 t_hashmutex[T_MUTEX_SIZE];
60 
61 #define	T_HASH_INDEX(a)		((a) & (T_HASH_SIZE-1))
62 #define	T_MUTEX_INDEX(a)	((a) & (T_MUTEX_SIZE-1))
63 
/*
 * Compute the name-hash value of entry `name' in directory `tp' and leave
 * it in `hash'.  The seed is the directory pointer truncated to uint_t and
 * shifted right by 8; each byte of the name is then folded in as
 * hash = hash * 17 + byte.  The expansion is a braced block, so invoke it
 * as a statement of its own.
 */
#define	TMPFS_HASH(tp, name, hash)				\
	{							\
		char *Xnp;					\
								\
		hash = (uint_t)(uintptr_t)(tp) >> 8;		\
		Xnp = (name);					\
		while (*Xnp != 0) {				\
			hash += (hash << 4) + (uint_t)*Xnp;	\
			Xnp++;					\
		}						\
	}
71 
72 void
73 tmpfs_hash_init(void)
74 {
75 	int	ix;
76 
77 	for (ix = 0; ix < T_MUTEX_SIZE; ix++)
78 		mutex_init(&t_hashmutex[ix], NULL, MUTEX_DEFAULT, NULL);
79 }
80 
81 /*
82  * This routine is where the rubber meets the road for identities.
83  */
84 static void
85 tmpfs_hash_in(struct tdirent *t)
86 {
87 	uint_t		hash;
88 	struct tdirent	**prevpp;
89 	kmutex_t	*t_hmtx;
90 
91 	TMPFS_HASH(t->td_parent, t->td_name, hash);
92 	t->td_hash = hash;
93 	prevpp = &t_hashtable[T_HASH_INDEX(hash)];
94 	t_hmtx = &t_hashmutex[T_MUTEX_INDEX(hash)];
95 	mutex_enter(t_hmtx);
96 	t->td_link = *prevpp;
97 	*prevpp = t;
98 	mutex_exit(t_hmtx);
99 }
100 
101 /*
102  * Remove tdirent *t from the hash list.
103  */
104 static void
105 tmpfs_hash_out(struct tdirent *t)
106 {
107 	uint_t		hash;
108 	struct tdirent	**prevpp;
109 	kmutex_t	*t_hmtx;
110 
111 	hash = t->td_hash;
112 	prevpp = &t_hashtable[T_HASH_INDEX(hash)];
113 	t_hmtx = &t_hashmutex[T_MUTEX_INDEX(hash)];
114 	mutex_enter(t_hmtx);
115 	while (*prevpp != t)
116 		prevpp = &(*prevpp)->td_link;
117 	*prevpp = t->td_link;
118 	mutex_exit(t_hmtx);
119 }
120 
121 /*
122  * Currently called by tdirrename() only.
123  * rename operation needs to be done with lock held, to ensure that
124  * no other operations can access the tmpnode at the same instance.
125  */
126 static void
127 tmpfs_hash_change(struct tdirent *tdp, struct tmpnode *fromtp)
128 {
129 	uint_t		hash;
130 	kmutex_t	*t_hmtx;
131 
132 	hash = tdp->td_hash;
133 	t_hmtx = &t_hashmutex[T_MUTEX_INDEX(hash)];
134 	mutex_enter(t_hmtx);
135 	tdp->td_tmpnode = fromtp;
136 	mutex_exit(t_hmtx);
137 }
138 
139 static struct tdirent *
140 tmpfs_hash_lookup(char *name, struct tmpnode *parent, uint_t hold,
141 	struct tmpnode **found)
142 {
143 	struct tdirent	*l;
144 	uint_t		hash;
145 	kmutex_t	*t_hmtx;
146 	struct tmpnode	*tnp;
147 
148 	TMPFS_HASH(parent, name, hash);
149 	t_hmtx = &t_hashmutex[T_MUTEX_INDEX(hash)];
150 	mutex_enter(t_hmtx);
151 	l = t_hashtable[T_HASH_INDEX(hash)];
152 	while (l) {
153 		if ((l->td_hash == hash) &&
154 		    (l->td_parent == parent) &&
155 		    (strcmp(l->td_name, name) == 0)) {
156 			/*
157 			 * We need to make sure that the tmpnode that
158 			 * we put a hold on is the same one that we pass back.
159 			 * Hence, temporary variable tnp is necessary.
160 			 */
161 			tnp = l->td_tmpnode;
162 			if (hold) {
163 				ASSERT(tnp);
164 				tmpnode_hold(tnp);
165 			}
166 			if (found)
167 				*found = tnp;
168 			mutex_exit(t_hmtx);
169 			return (l);
170 		} else {
171 			l = l->td_link;
172 		}
173 	}
174 	mutex_exit(t_hmtx);
175 	return (NULL);
176 }
177 
178 /*
179  * Search directory 'parent' for entry 'name'.
180  *
181  * The calling thread can't hold the write version
182  * of the rwlock for the directory being searched
183  *
184  * 0 is returned on success and *foundtp points
185  * to the found tmpnode with its vnode held.
186  */
187 int
188 tdirlookup(
189 	struct tmpnode *parent,
190 	char *name,
191 	struct tmpnode **foundtp,
192 	struct cred *cred)
193 {
194 	int error;
195 
196 	*foundtp = NULL;
197 	if (parent->tn_type != VDIR)
198 		return (ENOTDIR);
199 
200 	if ((error = tmp_taccess(parent, VEXEC, cred)))
201 		return (error);
202 
203 	if (*name == '\0') {
204 		tmpnode_hold(parent);
205 		*foundtp = parent;
206 		return (0);
207 	}
208 
209 	/*
210 	 * Search the directory for the matching name
211 	 * We need the lock protecting the tn_dir list
212 	 * so that it doesn't change out from underneath us.
213 	 * tmpfs_hash_lookup() will pass back the tmpnode
214 	 * with a hold on it.
215 	 */
216 
217 	if (tmpfs_hash_lookup(name, parent, 1, foundtp) != NULL) {
218 		ASSERT(*foundtp);
219 		return (0);
220 	}
221 
222 	return (ENOENT);
223 }
224 
225 /*
226  * Enter a directory entry for 'name' and 'tp' into directory 'dir'
227  *
228  * Returns 0 on success.
229  */
230 int
231 tdirenter(
232 	struct tmount	*tm,
233 	struct tmpnode	*dir,		/* target directory to make entry in */
234 	char		*name,		/* name of entry */
235 	enum de_op	op,		/* entry operation */
236 	struct tmpnode	*fromparent,	/* source directory if rename */
237 	struct tmpnode	*tp,		/* source tmpnode, if link/rename */
238 	struct vattr	*va,
239 	struct tmpnode	**tpp,		/* return tmpnode, if create/mkdir */
240 	struct cred	*cred,
241 	caller_context_t *ctp)
242 {
243 	struct tdirent *tdp;
244 	struct tmpnode *found = NULL;
245 	int error = 0;
246 	char *s;
247 
248 	/*
249 	 * tn_rwlock is held to serialize direnter and dirdeletes
250 	 */
251 	ASSERT(RW_WRITE_HELD(&dir->tn_rwlock));
252 	ASSERT(dir->tn_type == VDIR);
253 
254 	/*
255 	 * Don't allow '/' characters in pathname component
256 	 * (thus in ufs_direnter()).
257 	 */
258 	for (s = name; *s; s++)
259 		if (*s == '/')
260 			return (EACCES);
261 
262 	if (name[0] == '\0')
263 		panic("tdirenter: NULL name");
264 
265 	/*
266 	 * For link and rename lock the source entry and check the link count
267 	 * to see if it has been removed while it was unlocked.
268 	 */
269 	if (op == DE_LINK || op == DE_RENAME) {
270 		if (tp != dir)
271 			rw_enter(&tp->tn_rwlock, RW_WRITER);
272 		mutex_enter(&tp->tn_tlock);
273 		if (tp->tn_nlink == 0) {
274 			mutex_exit(&tp->tn_tlock);
275 			if (tp != dir)
276 				rw_exit(&tp->tn_rwlock);
277 			return (ENOENT);
278 		}
279 
280 		if (tp->tn_nlink == MAXLINK) {
281 			mutex_exit(&tp->tn_tlock);
282 			if (tp != dir)
283 				rw_exit(&tp->tn_rwlock);
284 			return (EMLINK);
285 		}
286 		tp->tn_nlink++;
287 		gethrestime(&tp->tn_ctime);
288 		mutex_exit(&tp->tn_tlock);
289 		if (tp != dir)
290 			rw_exit(&tp->tn_rwlock);
291 	}
292 
293 	/*
294 	 * This might be a "dangling detached directory".
295 	 * it could have been removed, but a reference
296 	 * to it kept in u_cwd.  don't bother searching
297 	 * it, and with any luck the user will get tired
298 	 * of dealing with us and cd to some absolute
299 	 * pathway.  *sigh*, thus in ufs, too.
300 	 */
301 	if (dir->tn_nlink == 0) {
302 		error = ENOENT;
303 		goto out;
304 	}
305 
306 	/*
307 	 * If this is a rename of a directory and the parent is
308 	 * different (".." must be changed), then the source
309 	 * directory must not be in the directory hierarchy
310 	 * above the target, as this would orphan everything
311 	 * below the source directory.
312 	 */
313 	if (op == DE_RENAME) {
314 		if (tp == dir) {
315 			error = EINVAL;
316 			goto out;
317 		}
318 		if (tp->tn_type == VDIR) {
319 			if ((fromparent != dir) &&
320 			    (error = tdircheckpath(tp, dir, cred))) {
321 				goto out;
322 			}
323 		}
324 	}
325 
326 	/*
327 	 * Search for the entry.  Return "found" if it exists.
328 	 */
329 	tdp = tmpfs_hash_lookup(name, dir, 1, &found);
330 
331 	if (tdp) {
332 		ASSERT(found);
333 		switch (op) {
334 		case DE_CREATE:
335 		case DE_MKDIR:
336 			if (tpp) {
337 				*tpp = found;
338 				error = EEXIST;
339 			} else {
340 				tmpnode_rele(found);
341 			}
342 			break;
343 
344 		case DE_RENAME:
345 			error = tdirrename(fromparent, tp,
346 			    dir, name, found, tdp, cred);
347 			if (error == 0) {
348 				if (found != NULL) {
349 					vnevent_rename_dest(TNTOV(found),
350 					    TNTOV(dir), name, ctp);
351 				}
352 			}
353 
354 			tmpnode_rele(found);
355 			break;
356 
357 		case DE_LINK:
358 			/*
359 			 * Can't link to an existing file.
360 			 */
361 			error = EEXIST;
362 			tmpnode_rele(found);
363 			break;
364 		}
365 	} else {
366 
367 		/*
368 		 * The entry does not exist. Check write permission in
369 		 * directory to see if entry can be created.
370 		 */
371 		if (error = tmp_taccess(dir, VWRITE, cred))
372 			goto out;
373 		if (op == DE_CREATE || op == DE_MKDIR) {
374 			/*
375 			 * Make new tmpnode and directory entry as required.
376 			 */
377 			error = tdirmaketnode(dir, tm, va, op, &tp, cred);
378 			if (error)
379 				goto out;
380 		}
381 		if (error = tdiraddentry(dir, tp, name, op, fromparent)) {
382 			if (op == DE_CREATE || op == DE_MKDIR) {
383 				/*
384 				 * Unmake the inode we just made.
385 				 */
386 				rw_enter(&tp->tn_rwlock, RW_WRITER);
387 				if ((tp->tn_type) == VDIR) {
388 					ASSERT(tdp == NULL);
389 					/*
390 					 * cleanup allocs made by tdirinit()
391 					 */
392 					tdirtrunc(tp);
393 				}
394 				mutex_enter(&tp->tn_tlock);
395 				tp->tn_nlink = 0;
396 				mutex_exit(&tp->tn_tlock);
397 				gethrestime(&tp->tn_ctime);
398 				rw_exit(&tp->tn_rwlock);
399 				tmpnode_rele(tp);
400 				tp = NULL;
401 			}
402 		} else if (tpp) {
403 			*tpp = tp;
404 		} else if (op == DE_CREATE || op == DE_MKDIR) {
405 			tmpnode_rele(tp);
406 		}
407 	}
408 
409 out:
410 	if (error && (op == DE_LINK || op == DE_RENAME)) {
411 		/*
412 		 * Undo bumped link count.
413 		 */
414 		DECR_COUNT(&tp->tn_nlink, &tp->tn_tlock);
415 		gethrestime(&tp->tn_ctime);
416 	}
417 	return (error);
418 }
419 
420 /*
421  * Delete entry tp of name "nm" from dir.
422  * Free dir entry space and decrement link count on tmpnode(s).
423  *
424  * Return 0 on success.
425  */
426 int
427 tdirdelete(
428 	struct tmpnode *dir,
429 	struct tmpnode *tp,
430 	char *nm,
431 	enum dr_op op,
432 	struct cred *cred)
433 {
434 	struct tdirent *tpdp;
435 	int error;
436 	size_t namelen;
437 	struct tmpnode *tnp;
438 	timestruc_t now;
439 
440 	ASSERT(RW_WRITE_HELD(&dir->tn_rwlock));
441 	ASSERT(RW_WRITE_HELD(&tp->tn_rwlock));
442 	ASSERT(dir->tn_type == VDIR);
443 
444 	if (nm[0] == '\0')
445 		panic("tdirdelete: NULL name for %p", (void *)tp);
446 
447 	/*
448 	 * return error when removing . and ..
449 	 */
450 	if (nm[0] == '.') {
451 		if (nm[1] == '\0')
452 			return (EINVAL);
453 		if (nm[1] == '.' && nm[2] == '\0')
454 			return (EEXIST); /* thus in ufs */
455 	}
456 
457 	if (error = tmp_taccess(dir, VEXEC|VWRITE, cred))
458 		return (error);
459 
460 	/*
461 	 * If the parent directory is "sticky", then the user must
462 	 * own the parent directory or the file in it, or else must
463 	 * have permission to write the file.  Otherwise it may not
464 	 * be deleted (except by privileged users).
465 	 * Same as ufs_dirremove.
466 	 */
467 	if ((error = tmp_sticky_remove_access(dir, tp, cred)) != 0)
468 		return (error);
469 
470 	if (dir->tn_dir == NULL)
471 		return (ENOENT);
472 
473 	tpdp = tmpfs_hash_lookup(nm, dir, 0, &tnp);
474 	if (tpdp == NULL) {
475 		/*
476 		 * If it is gone, some other thread got here first!
477 		 * Return error ENOENT.
478 		 */
479 		return (ENOENT);
480 	}
481 
482 	/*
483 	 * If the tmpnode in the tdirent changed, we were probably
484 	 * the victim of a concurrent rename operation.  The original
485 	 * is gone, so return that status (same as UFS).
486 	 */
487 	if (tp != tnp)
488 		return (ENOENT);
489 
490 	tmpfs_hash_out(tpdp);
491 
492 	/*
493 	 * Take tpdp out of the directory list.
494 	 */
495 	ASSERT(tpdp->td_next != tpdp);
496 	ASSERT(tpdp->td_prev != tpdp);
497 	if (tpdp->td_prev) {
498 		tpdp->td_prev->td_next = tpdp->td_next;
499 	}
500 	if (tpdp->td_next) {
501 		tpdp->td_next->td_prev = tpdp->td_prev;
502 	}
503 
504 	/*
505 	 * If the roving slot pointer happens to match tpdp,
506 	 * point it at the previous dirent.
507 	 */
508 	if (dir->tn_dir->td_prev == tpdp) {
509 		dir->tn_dir->td_prev = tpdp->td_prev;
510 	}
511 	ASSERT(tpdp->td_next != tpdp);
512 	ASSERT(tpdp->td_prev != tpdp);
513 
514 	/*
515 	 * tpdp points to the correct directory entry
516 	 */
517 	namelen = strlen(tpdp->td_name) + 1;
518 
519 	tmp_memfree(tpdp, sizeof (struct tdirent) + namelen);
520 	dir->tn_size -= (sizeof (struct tdirent) + namelen);
521 	dir->tn_dirents--;
522 
523 	gethrestime(&now);
524 	dir->tn_mtime = now;
525 	dir->tn_ctime = now;
526 	tp->tn_ctime = now;
527 
528 	ASSERT(tp->tn_nlink > 0);
529 	DECR_COUNT(&tp->tn_nlink, &tp->tn_tlock);
530 	if (op == DR_RMDIR && tp->tn_type == VDIR) {
531 		tdirtrunc(tp);
532 		ASSERT(tp->tn_nlink == 0);
533 	}
534 	return (0);
535 }
536 
537 /*
538  * tdirinit is used internally to initialize a directory (dir)
539  * with '.' and '..' entries without checking permissions and locking
540  */
541 void
542 tdirinit(
543 	struct tmpnode *parent,		/* parent of directory to initialize */
544 	struct tmpnode *dir)		/* the new directory */
545 {
546 	struct tdirent *dot, *dotdot;
547 	timestruc_t now;
548 
549 	ASSERT(RW_WRITE_HELD(&parent->tn_rwlock));
550 	ASSERT(dir->tn_type == VDIR);
551 
552 	dot = tmp_memalloc(sizeof (struct tdirent) + 2, TMP_MUSTHAVE);
553 	dotdot = tmp_memalloc(sizeof (struct tdirent) + 3, TMP_MUSTHAVE);
554 
555 	/*
556 	 * Initialize the entries
557 	 */
558 	dot->td_tmpnode = dir;
559 	dot->td_offset = 0;
560 	dot->td_name = (char *)dot + sizeof (struct tdirent);
561 	dot->td_name[0] = '.';
562 	dot->td_parent = dir;
563 	tmpfs_hash_in(dot);
564 
565 	dotdot->td_tmpnode = parent;
566 	dotdot->td_offset = 1;
567 	dotdot->td_name = (char *)dotdot + sizeof (struct tdirent);
568 	dotdot->td_name[0] = '.';
569 	dotdot->td_name[1] = '.';
570 	dotdot->td_parent = dir;
571 	tmpfs_hash_in(dotdot);
572 
573 	/*
574 	 * Initialize directory entry list.
575 	 */
576 	dot->td_next = dotdot;
577 	dot->td_prev = dotdot;	/* dot's td_prev holds roving slot pointer */
578 	dotdot->td_next = NULL;
579 	dotdot->td_prev = dot;
580 
581 	gethrestime(&now);
582 	dir->tn_mtime = now;
583 	dir->tn_ctime = now;
584 
585 	/*
586 	 * Link counts are special for the hidden attribute directory.
587 	 * The only explicit reference in the name space is "." and
588 	 * the reference through ".." is not counted on the parent
589 	 * file. The attrdir is created as a side effect to lookup,
590 	 * so don't change the ctime of the parent.
591 	 * Since tdirinit is called with both dir and parent being the
592 	 * same for the root vnode, we need to increment this before we set
593 	 * tn_nlink = 2 below.
594 	 */
595 	if (!(dir->tn_vnode->v_flag & V_XATTRDIR)) {
596 		INCR_COUNT(&parent->tn_nlink, &parent->tn_tlock);
597 		parent->tn_ctime = now;
598 	}
599 
600 	dir->tn_dir = dot;
601 	dir->tn_size = 2 * sizeof (struct tdirent) + 5;	/* dot and dotdot */
602 	dir->tn_dirents = 2;
603 	dir->tn_nlink = 2;
604 }
605 
606 
607 /*
608  * tdirtrunc is called to remove all directory entries under this directory.
609  */
610 void
611 tdirtrunc(struct tmpnode *dir)
612 {
613 	struct tdirent *tdp;
614 	struct tmpnode *tp;
615 	size_t namelen;
616 	timestruc_t now;
617 	int isvattrdir, isdotdot, skip_decr;
618 
619 	ASSERT(RW_WRITE_HELD(&dir->tn_rwlock));
620 	ASSERT(dir->tn_type == VDIR);
621 
622 	isvattrdir = (dir->tn_vnode->v_flag & V_XATTRDIR) ? 1 : 0;
623 	for (tdp = dir->tn_dir; tdp; tdp = dir->tn_dir) {
624 		ASSERT(tdp->td_next != tdp);
625 		ASSERT(tdp->td_prev != tdp);
626 		ASSERT(tdp->td_tmpnode);
627 
628 		dir->tn_dir = tdp->td_next;
629 		namelen = strlen(tdp->td_name) + 1;
630 
631 		/*
632 		 * Adjust the link counts to account for this directory
633 		 * entry removal. Hidden attribute directories may
634 		 * not be empty as they may be truncated as a side-
635 		 * effect of removing the parent. We do hold/rele
636 		 * operations to free up these tmpnodes.
637 		 *
638 		 * Skip the link count adjustment for parents of
639 		 * attribute directories as those link counts
640 		 * do not include the ".." reference in the hidden
641 		 * directories.
642 		 */
643 		tp = tdp->td_tmpnode;
644 		isdotdot = (strcmp("..", tdp->td_name) == 0);
645 		skip_decr = (isvattrdir && isdotdot);
646 		if (!skip_decr) {
647 			ASSERT(tp->tn_nlink > 0);
648 			DECR_COUNT(&tp->tn_nlink, &tp->tn_tlock);
649 		}
650 
651 		tmpfs_hash_out(tdp);
652 
653 		tmp_memfree(tdp, sizeof (struct tdirent) + namelen);
654 		dir->tn_size -= (sizeof (struct tdirent) + namelen);
655 		dir->tn_dirents--;
656 	}
657 
658 	gethrestime(&now);
659 	dir->tn_mtime = now;
660 	dir->tn_ctime = now;
661 
662 	ASSERT(dir->tn_dir == NULL);
663 	ASSERT(dir->tn_size == 0);
664 	ASSERT(dir->tn_dirents == 0);
665 }
666 
667 /*
668  * Check if the source directory is in the path of the target directory.
669  * The target directory is locked by the caller.
670  *
671  * XXX - The source and target's should be different upon entry.
672  */
673 static int
674 tdircheckpath(
675 	struct tmpnode *fromtp,
676 	struct tmpnode	*toparent,
677 	struct cred	*cred)
678 {
679 	int	error = 0;
680 	struct tmpnode *dir, *dotdot;
681 	struct tdirent *tdp;
682 
683 	ASSERT(RW_WRITE_HELD(&toparent->tn_rwlock));
684 
685 	tdp = tmpfs_hash_lookup("..", toparent, 1, &dotdot);
686 	if (tdp == NULL)
687 		return (ENOENT);
688 
689 	ASSERT(dotdot);
690 
691 	if (dotdot == toparent) {
692 		/* root of fs.  search trivially satisfied. */
693 		tmpnode_rele(dotdot);
694 		return (0);
695 	}
696 	for (;;) {
697 		/*
698 		 * Return error for cases like "mv c c/d",
699 		 * "mv c c/d/e" and so on.
700 		 */
701 		if (dotdot == fromtp) {
702 			tmpnode_rele(dotdot);
703 			error = EINVAL;
704 			break;
705 		}
706 		dir = dotdot;
707 		error = tdirlookup(dir, "..", &dotdot, cred);
708 		if (error) {
709 			tmpnode_rele(dir);
710 			break;
711 		}
712 		/*
713 		 * We're okay if we traverse the directory tree up to
714 		 * the root directory and don't run into the
715 		 * parent directory.
716 		 */
717 		if (dir == dotdot) {
718 			tmpnode_rele(dir);
719 			tmpnode_rele(dotdot);
720 			break;
721 		}
722 		tmpnode_rele(dir);
723 	}
724 	return (error);
725 }
726 
727 static int
728 tdirrename(
729 	struct tmpnode *fromparent,	/* parent directory of source */
730 	struct tmpnode *fromtp,		/* source tmpnode */
731 	struct tmpnode *toparent,	/* parent directory of target */
732 	char *nm,			/* entry we are trying to change */
733 	struct tmpnode *to,		/* target tmpnode */
734 	struct tdirent *where,		/* target tmpnode directory entry */
735 	struct cred *cred)		/* credentials */
736 {
737 	int error = 0;
738 	int doingdirectory;
739 	timestruc_t now;
740 
741 #if defined(lint)
742 	nm = nm;
743 #endif
744 	ASSERT(RW_WRITE_HELD(&toparent->tn_rwlock));
745 
746 	/*
747 	 * Short circuit rename of something to itself.
748 	 */
749 	if (fromtp == to)
750 		return (ESAME);		/* special KLUDGE error code */
751 
752 	rw_enter(&fromtp->tn_rwlock, RW_READER);
753 	rw_enter(&to->tn_rwlock, RW_READER);
754 
755 	/*
756 	 * Check that everything is on the same filesystem.
757 	 */
758 	if (to->tn_vnode->v_vfsp != toparent->tn_vnode->v_vfsp ||
759 	    to->tn_vnode->v_vfsp != fromtp->tn_vnode->v_vfsp) {
760 		error = EXDEV;
761 		goto out;
762 	}
763 
764 	/*
765 	 * Must have write permission to rewrite target entry.
766 	 * Check for stickyness.
767 	 */
768 	if ((error = tmp_taccess(toparent, VWRITE, cred)) != 0 ||
769 	    (error = tmp_sticky_remove_access(toparent, to, cred)) != 0)
770 		goto out;
771 
772 	/*
773 	 * Ensure source and target are compatible (both directories
774 	 * or both not directories).  If target is a directory it must
775 	 * be empty and have no links to it; in addition it must not
776 	 * be a mount point, and both the source and target must be
777 	 * writable.
778 	 */
779 	doingdirectory = (fromtp->tn_type == VDIR);
780 	if (to->tn_type == VDIR) {
781 		if (!doingdirectory) {
782 			error = EISDIR;
783 			goto out;
784 		}
785 		/*
786 		 * vn_vfswlock will prevent mounts from using the directory
787 		 * until we are done.
788 		 */
789 		if (vn_vfswlock(TNTOV(to))) {
790 			error = EBUSY;
791 			goto out;
792 		}
793 		if (vn_mountedvfs(TNTOV(to)) != NULL) {
794 			vn_vfsunlock(TNTOV(to));
795 			error = EBUSY;
796 			goto out;
797 		}
798 
799 		mutex_enter(&to->tn_tlock);
800 		if (to->tn_dirents > 2 || to->tn_nlink > 2) {
801 			mutex_exit(&to->tn_tlock);
802 			vn_vfsunlock(TNTOV(to));
803 			error = EEXIST; /* SIGH should be ENOTEMPTY */
804 			/*
805 			 * Update atime because checking tn_dirents is
806 			 * logically equivalent to reading the directory
807 			 */
808 			gethrestime(&to->tn_atime);
809 			goto out;
810 		}
811 		mutex_exit(&to->tn_tlock);
812 	} else if (doingdirectory) {
813 		error = ENOTDIR;
814 		goto out;
815 	}
816 
817 	tmpfs_hash_change(where, fromtp);
818 	gethrestime(&now);
819 	toparent->tn_mtime = now;
820 	toparent->tn_ctime = now;
821 
822 	/*
823 	 * Upgrade to write lock on "to" (i.e., the target tmpnode).
824 	 */
825 	rw_exit(&to->tn_rwlock);
826 	rw_enter(&to->tn_rwlock, RW_WRITER);
827 
828 	/*
829 	 * Decrement the link count of the target tmpnode.
830 	 */
831 	DECR_COUNT(&to->tn_nlink, &to->tn_tlock);
832 	to->tn_ctime = now;
833 
834 	if (doingdirectory) {
835 		/*
836 		 * The entry for "to" no longer exists so release the vfslock.
837 		 */
838 		vn_vfsunlock(TNTOV(to));
839 
840 		/*
841 		 * Decrement the target link count and delete all entires.
842 		 */
843 		tdirtrunc(to);
844 		ASSERT(to->tn_nlink == 0);
845 
846 		/*
847 		 * Renaming a directory with the parent different
848 		 * requires that ".." be rewritten.  The window is
849 		 * still there for ".." to be inconsistent, but this
850 		 * is unavoidable, and a lot shorter than when it was
851 		 * done in a user process.
852 		 */
853 		if (fromparent != toparent)
854 			tdirfixdotdot(fromtp, fromparent, toparent);
855 	}
856 out:
857 	rw_exit(&to->tn_rwlock);
858 	rw_exit(&fromtp->tn_rwlock);
859 	return (error);
860 }
861 
862 static void
863 tdirfixdotdot(
864 	struct tmpnode	*fromtp,	/* child directory */
865 	struct tmpnode	*fromparent,	/* old parent directory */
866 	struct tmpnode	*toparent)	/* new parent directory */
867 {
868 	struct tdirent	*dotdot;
869 
870 	ASSERT(RW_LOCK_HELD(&toparent->tn_rwlock));
871 
872 	/*
873 	 * Increment the link count in the new parent tmpnode
874 	 */
875 	INCR_COUNT(&toparent->tn_nlink, &toparent->tn_tlock);
876 	gethrestime(&toparent->tn_ctime);
877 
878 	dotdot = tmpfs_hash_lookup("..", fromtp, 0, NULL);
879 
880 	ASSERT(dotdot->td_tmpnode == fromparent);
881 	dotdot->td_tmpnode = toparent;
882 
883 	/*
884 	 * Decrement the link count of the old parent tmpnode.
885 	 * If fromparent is NULL, then this is a new directory link;
886 	 * it has no parent, so we need not do anything.
887 	 */
888 	if (fromparent != NULL) {
889 		mutex_enter(&fromparent->tn_tlock);
890 		if (fromparent->tn_nlink != 0) {
891 			fromparent->tn_nlink--;
892 			gethrestime(&fromparent->tn_ctime);
893 		}
894 		mutex_exit(&fromparent->tn_tlock);
895 	}
896 }
897 
898 static int
899 tdiraddentry(
900 	struct tmpnode	*dir,	/* target directory to make entry in */
901 	struct tmpnode	*tp,	/* new tmpnode */
902 	char		*name,
903 	enum de_op	op,
904 	struct tmpnode	*fromtp)
905 {
906 	struct tdirent *tdp, *tpdp;
907 	size_t		namelen, alloc_size;
908 	timestruc_t	now;
909 
910 	/*
911 	 * Make sure the parent directory wasn't removed from
912 	 * underneath the caller.
913 	 */
914 	if (dir->tn_dir == NULL)
915 		return (ENOENT);
916 
917 	/*
918 	 * Check that everything is on the same filesystem.
919 	 */
920 	if (tp->tn_vnode->v_vfsp != dir->tn_vnode->v_vfsp)
921 		return (EXDEV);
922 
923 	/*
924 	 * Allocate and initialize directory entry
925 	 */
926 	namelen = strlen(name) + 1;
927 	alloc_size = namelen + sizeof (struct tdirent);
928 	tdp = tmp_memalloc(alloc_size, 0);
929 	if (tdp == NULL)
930 		return (ENOSPC);
931 
932 	if ((op == DE_RENAME) && (tp->tn_type == VDIR))
933 		tdirfixdotdot(tp, fromtp, dir);
934 
935 	dir->tn_size += alloc_size;
936 	dir->tn_dirents++;
937 	tdp->td_tmpnode = tp;
938 	tdp->td_parent = dir;
939 
940 	/*
941 	 * The directory entry and its name were allocated sequentially.
942 	 */
943 	tdp->td_name = (char *)tdp + sizeof (struct tdirent);
944 	(void) strcpy(tdp->td_name, name);
945 
946 	tmpfs_hash_in(tdp);
947 
948 	/*
949 	 * Some utilities expect the size of a directory to remain
950 	 * somewhat static.  For example, a routine which unlinks
951 	 * files between calls to readdir(); the size of the
952 	 * directory changes from underneath it and so the real
953 	 * directory offset in bytes is invalid.  To circumvent
954 	 * this problem, we initialize a directory entry with an
955 	 * phony offset, and use this offset to determine end of
956 	 * file in tmp_readdir.
957 	 */
958 	tpdp = dir->tn_dir->td_prev;
959 	/*
960 	 * Install at first empty "slot" in directory list.
961 	 */
962 	while (tpdp->td_next != NULL && (tpdp->td_next->td_offset -
963 	    tpdp->td_offset) <= 1) {
964 		ASSERT(tpdp->td_next != tpdp);
965 		ASSERT(tpdp->td_prev != tpdp);
966 		ASSERT(tpdp->td_next->td_offset > tpdp->td_offset);
967 		tpdp = tpdp->td_next;
968 	}
969 	tdp->td_offset = tpdp->td_offset + 1;
970 
971 	/*
972 	 * If we're at the end of the dirent list and the offset (which
973 	 * is necessarily the largest offset in this directory) is more
974 	 * than twice the number of dirents, that means the directory is
975 	 * 50% holes.  At this point we reset the slot pointer back to
976 	 * the beginning of the directory so we start using the holes.
977 	 * The idea is that if there are N dirents, there must also be
978 	 * N holes, so we can satisfy the next N creates by walking at
979 	 * most 2N entries; thus the average cost of a create is constant.
980 	 * Note that we use the first dirent's td_prev as the roving
981 	 * slot pointer; it's ugly, but it saves a word in every dirent.
982 	 */
983 	if (tpdp->td_next == NULL && tpdp->td_offset > 2 * dir->tn_dirents)
984 		dir->tn_dir->td_prev = dir->tn_dir->td_next;
985 	else
986 		dir->tn_dir->td_prev = tdp;
987 
988 	ASSERT(tpdp->td_next != tpdp);
989 	ASSERT(tpdp->td_prev != tpdp);
990 
991 	tdp->td_next = tpdp->td_next;
992 	if (tdp->td_next) {
993 		tdp->td_next->td_prev = tdp;
994 	}
995 	tdp->td_prev = tpdp;
996 	tpdp->td_next = tdp;
997 
998 	ASSERT(tdp->td_next != tdp);
999 	ASSERT(tdp->td_prev != tdp);
1000 	ASSERT(tpdp->td_next != tpdp);
1001 	ASSERT(tpdp->td_prev != tpdp);
1002 
1003 	gethrestime(&now);
1004 	dir->tn_mtime = now;
1005 	dir->tn_ctime = now;
1006 
1007 	return (0);
1008 }
1009 
1010 static int
1011 tdirmaketnode(
1012 	struct tmpnode *dir,
1013 	struct tmount	*tm,
1014 	struct vattr	*va,
1015 	enum	de_op	op,
1016 	struct tmpnode **newnode,
1017 	struct cred	*cred)
1018 {
1019 	struct tmpnode *tp;
1020 	enum vtype	type;
1021 
1022 	ASSERT(va != NULL);
1023 	ASSERT(op == DE_CREATE || op == DE_MKDIR);
1024 	if (((va->va_mask & AT_ATIME) && TIMESPEC_OVERFLOW(&va->va_atime)) ||
1025 	    ((va->va_mask & AT_MTIME) && TIMESPEC_OVERFLOW(&va->va_mtime)))
1026 		return (EOVERFLOW);
1027 	type = va->va_type;
1028 	tp = tmp_memalloc(sizeof (struct tmpnode), TMP_MUSTHAVE);
1029 	tmpnode_init(tm, tp, va, cred);
1030 
1031 	/* setup normal file/dir's extended attribute directory */
1032 	if (dir->tn_flags & ISXATTR) {
1033 		/* parent dir is , mark file as xattr */
1034 		tp->tn_flags |= ISXATTR;
1035 	}
1036 
1037 
1038 	if (type == VBLK || type == VCHR) {
1039 		tp->tn_vnode->v_rdev = tp->tn_rdev = va->va_rdev;
1040 	} else {
1041 		tp->tn_vnode->v_rdev = tp->tn_rdev = NODEV;
1042 	}
1043 	tp->tn_vnode->v_type = type;
1044 	tp->tn_uid = crgetuid(cred);
1045 
1046 	/*
1047 	 * To determine the group-id of the created file:
1048 	 *   1) If the gid is set in the attribute list (non-Sun & pre-4.0
1049 	 *	clients are not likely to set the gid), then use it if
1050 	 *	the process is privileged, belongs to the target group,
1051 	 *	or the group is the same as the parent directory.
1052 	 *   2) If the filesystem was not mounted with the Old-BSD-compatible
1053 	 *	GRPID option, and the directory's set-gid bit is clear,
1054 	 *	then use the process's gid.
1055 	 *   3) Otherwise, set the group-id to the gid of the parent directory.
1056 	 */
1057 	if ((va->va_mask & AT_GID) &&
1058 	    ((va->va_gid == dir->tn_gid) || groupmember(va->va_gid, cred) ||
1059 	    secpolicy_vnode_create_gid(cred) == 0)) {
1060 		/*
1061 		 * XXX - is this only the case when a 4.0 NFS client, or a
1062 		 * client derived from that code, makes a call over the wire?
1063 		 */
1064 		tp->tn_gid = va->va_gid;
1065 	} else {
1066 		if (dir->tn_mode & VSGID)
1067 			tp->tn_gid = dir->tn_gid;
1068 		else
1069 			tp->tn_gid = crgetgid(cred);
1070 	}
1071 	/*
1072 	 * If we're creating a directory, and the parent directory has the
1073 	 * set-GID bit set, set it on the new directory.
1074 	 * Otherwise, if the user is neither privileged nor a member of the
1075 	 * file's new group, clear the file's set-GID bit.
1076 	 */
1077 	if (dir->tn_mode & VSGID && type == VDIR)
1078 		tp->tn_mode |= VSGID;
1079 	else {
1080 		if ((tp->tn_mode & VSGID) &&
1081 		    secpolicy_vnode_setids_setgids(cred, tp->tn_gid) != 0)
1082 			tp->tn_mode &= ~VSGID;
1083 	}
1084 
1085 	if (va->va_mask & AT_ATIME)
1086 		tp->tn_atime = va->va_atime;
1087 	if (va->va_mask & AT_MTIME)
1088 		tp->tn_mtime = va->va_mtime;
1089 
1090 	if (op == DE_MKDIR)
1091 		tdirinit(dir, tp);
1092 
1093 	*newnode = tp;
1094 	return (0);
1095 }
1096