xref: /titanic_50/usr/src/uts/common/fs/tmpfs/tmp_dir.c (revision b65731f1f612238279eb4d997f43589b535c5646)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #include <sys/types.h>
30 #include <sys/param.h>
31 #include <sys/sysmacros.h>
32 #include <sys/systm.h>
33 #include <sys/time.h>
34 #include <sys/vfs.h>
35 #include <sys/vnode.h>
36 #include <sys/errno.h>
37 #include <sys/cmn_err.h>
38 #include <sys/cred.h>
39 #include <sys/stat.h>
40 #include <sys/debug.h>
41 #include <sys/policy.h>
42 #include <sys/fs/tmpnode.h>
43 #include <sys/fs/tmp.h>
44 #include <sys/vtrace.h>
45 
46 static int tdircheckpath(struct tmpnode *, struct tmpnode *, struct cred *);
47 static int tdirrename(struct tmpnode *, struct tmpnode *, struct tmpnode *,
48 	char *, struct tmpnode *, struct tdirent *, struct cred *);
49 static void tdirfixdotdot(struct tmpnode *, struct tmpnode *, struct tmpnode *);
50 static int tdirmaketnode(struct tmpnode *, struct tmount *, struct vattr *,
51 	enum de_op, struct tmpnode **, struct cred *);
52 static int tdiraddentry(struct tmpnode *, struct tmpnode *, char *,
53 	enum de_op, struct tmpnode *);
54 
55 
/*
 * Directory entries are kept in one file-scope hash table.  Both the
 * bucket index and the mutex index are derived from the same hash value
 * by masking, so both sizes must be powers of two.  Because T_MUTEX_SIZE
 * divides T_HASH_SIZE, all entries on a given hash chain map to the same
 * mutex; that mutex is what protects the chain.
 */
#define	T_HASH_SIZE	8192		/* must be power of 2 */
#define	T_MUTEX_SIZE	64		/* must be power of 2 (masked below) */

static struct tdirent	*t_hashtable[T_HASH_SIZE];	/* chain heads */
static kmutex_t		 t_hashmutex[T_MUTEX_SIZE];	/* chain locks */

/* Map a hash value to its chain head / to the mutex guarding that chain. */
#define	T_HASH_INDEX(a)		((a) & (T_HASH_SIZE-1))
#define	T_MUTEX_INDEX(a)	((a) & (T_MUTEX_SIZE-1))
64 
/*
 * Compute the hash of a directory entry into 'hash' (an lvalue of type
 * uint_t).  The seed is the address of the parent tmpnode 'tp' shifted
 * right by 8 bits; each byte of the NUL-terminated 'name' is then folded
 * in as hash = hash * 17 + byte.
 *
 * The body is wrapped in do { } while (0) so that the macro behaves as a
 * single statement (e.g. as the unbraced body of an if/else), and 'hash'
 * is parenthesised so any lvalue expression may be passed.
 */
#define	TMPFS_HASH(tp, name, hash)					\
	do {								\
		char Xc, *Xcp;						\
		(hash) = (uint_t)(uintptr_t)(tp) >> 8;			\
		for (Xcp = (name); (Xc = *Xcp) != 0; Xcp++)		\
			(hash) = ((hash) << 4) + (hash) + (uint_t)Xc;	\
		/*CONSTCOND*/						\
	} while (0)
72 
73 void
74 tmpfs_hash_init(void)
75 {
76 	int	ix;
77 
78 	for (ix = 0; ix < T_MUTEX_SIZE; ix++)
79 		mutex_init(&t_hashmutex[ix], NULL, MUTEX_DEFAULT, NULL);
80 }
81 
82 /*
83  * This routine is where the rubber meets the road for identities.
84  */
85 static void
86 tmpfs_hash_in(struct tdirent *t)
87 {
88 	uint_t		hash;
89 	struct tdirent	**prevpp;
90 	kmutex_t	*t_hmtx;
91 
92 	TMPFS_HASH(t->td_parent, t->td_name, hash);
93 	t->td_hash = hash;
94 	prevpp = &t_hashtable[T_HASH_INDEX(hash)];
95 	t_hmtx = &t_hashmutex[T_MUTEX_INDEX(hash)];
96 	mutex_enter(t_hmtx);
97 	t->td_link = *prevpp;
98 	*prevpp = t;
99 	mutex_exit(t_hmtx);
100 }
101 
102 /*
103  * Remove tdirent *t from the hash list.
104  */
105 static void
106 tmpfs_hash_out(struct tdirent *t)
107 {
108 	uint_t		hash;
109 	struct tdirent	**prevpp;
110 	kmutex_t	*t_hmtx;
111 
112 	hash = t->td_hash;
113 	prevpp = &t_hashtable[T_HASH_INDEX(hash)];
114 	t_hmtx = &t_hashmutex[T_MUTEX_INDEX(hash)];
115 	mutex_enter(t_hmtx);
116 	while (*prevpp != t)
117 		prevpp = &(*prevpp)->td_link;
118 	*prevpp = t->td_link;
119 	mutex_exit(t_hmtx);
120 }
121 
122 /*
123  * Currently called by tdirrename() only.
124  * rename operation needs to be done with lock held, to ensure that
125  * no other operations can access the tmpnode at the same instance.
126  */
127 static void
128 tmpfs_hash_change(struct tdirent *tdp, struct tmpnode *fromtp)
129 {
130 	uint_t		hash;
131 	kmutex_t	*t_hmtx;
132 
133 	hash = tdp->td_hash;
134 	t_hmtx = &t_hashmutex[T_MUTEX_INDEX(hash)];
135 	mutex_enter(t_hmtx);
136 	tdp->td_tmpnode = fromtp;
137 	mutex_exit(t_hmtx);
138 }
139 
140 static struct tdirent *
141 tmpfs_hash_lookup(char *name, struct tmpnode *parent, uint_t hold,
142 	struct tmpnode **found)
143 {
144 	struct tdirent	*l;
145 	uint_t		hash;
146 	kmutex_t	*t_hmtx;
147 	struct tmpnode	*tnp;
148 
149 	TMPFS_HASH(parent, name, hash);
150 	t_hmtx = &t_hashmutex[T_MUTEX_INDEX(hash)];
151 	mutex_enter(t_hmtx);
152 	l = t_hashtable[T_HASH_INDEX(hash)];
153 	while (l) {
154 		if ((l->td_hash == hash) &&
155 		    (l->td_parent == parent) &&
156 		    (strcmp(l->td_name, name) == 0)) {
157 			/*
158 			 * We need to make sure that the tmpnode that
159 			 * we put a hold on is the same one that we pass back.
160 			 * Hence, temporary variable tnp is necessary.
161 			 */
162 			tnp = l->td_tmpnode;
163 			if (hold) {
164 				ASSERT(tnp);
165 				tmpnode_hold(tnp);
166 			}
167 			if (found)
168 				*found = tnp;
169 			mutex_exit(t_hmtx);
170 			return (l);
171 		} else {
172 			l = l->td_link;
173 		}
174 	}
175 	mutex_exit(t_hmtx);
176 	return (NULL);
177 }
178 
179 /*
180  * Search directory 'parent' for entry 'name'.
181  *
182  * The calling thread can't hold the write version
183  * of the rwlock for the directory being searched
184  *
185  * 0 is returned on success and *foundtp points
186  * to the found tmpnode with its vnode held.
187  */
188 int
189 tdirlookup(
190 	struct tmpnode *parent,
191 	char *name,
192 	struct tmpnode **foundtp,
193 	struct cred *cred)
194 {
195 	int error;
196 
197 	*foundtp = NULL;
198 	if (parent->tn_type != VDIR)
199 		return (ENOTDIR);
200 
201 	if ((error = tmp_taccess(parent, VEXEC, cred)))
202 		return (error);
203 
204 	if (*name == '\0') {
205 		tmpnode_hold(parent);
206 		*foundtp = parent;
207 		return (0);
208 	}
209 
210 	/*
211 	 * Search the directory for the matching name
212 	 * We need the lock protecting the tn_dir list
213 	 * so that it doesn't change out from underneath us.
214 	 * tmpfs_hash_lookup() will pass back the tmpnode
215 	 * with a hold on it.
216 	 */
217 
218 	if (tmpfs_hash_lookup(name, parent, 1, foundtp) != NULL) {
219 		ASSERT(*foundtp);
220 		return (0);
221 	}
222 
223 	return (ENOENT);
224 }
225 
/*
 * Enter a directory entry for 'name' and 'tp' into directory 'dir'
 *
 * The caller must hold dir->tn_rwlock as writer.
 *
 * What happens depends on 'op':
 *   DE_CREATE/DE_MKDIR  a new tmpnode is made from 'va' and entered.  If
 *			 the name already exists and 'tpp' is non-NULL,
 *			 EEXIST is returned with the existing (held)
 *			 tmpnode in *tpp; if 'tpp' is NULL the existing
 *			 node is released and 0 is returned.
 *   DE_LINK		 'tp' is entered under 'name'; EEXIST if the name
 *			 already exists.
 *   DE_RENAME		 'tp' is entered under 'name', replacing an
 *			 existing target via tdirrename() if there is one.
 *
 * For DE_LINK and DE_RENAME the link count of 'tp' is bumped up front and
 * backed out again at "out" if anything fails.
 *
 * Returns 0 on success.
 */
int
tdirenter(
	struct tmount	*tm,
	struct tmpnode	*dir,		/* target directory to make entry in */
	char		*name,		/* name of entry */
	enum de_op	op,		/* entry operation */
	struct tmpnode	*fromparent,	/* source directory if rename */
	struct tmpnode	*tp,		/* source tmpnode, if link/rename */
	struct vattr	*va,
	struct tmpnode	**tpp,		/* return tmpnode, if create/mkdir */
	struct cred	*cred)
{
	struct tdirent *tdp;
	struct tmpnode *found = NULL;
	int error = 0;
	char *s;

	/*
	 * tn_rwlock is held to serialize direnter and dirdeletes
	 */
	ASSERT(RW_WRITE_HELD(&dir->tn_rwlock));
	ASSERT(dir->tn_type == VDIR);

	/*
	 * Don't allow '/' characters in pathname component
	 * (thus in ufs_direnter()).
	 */
	for (s = name; *s; s++)
		if (*s == '/')
			return (EACCES);

	if (name[0] == '\0')
		panic("tdirenter: NULL name");

	/*
	 * For link and rename lock the source entry and check the link count
	 * to see if it has been removed while it was unlocked.
	 * (dir's rwlock is already held by the caller, so it is not taken
	 * again when tp == dir.)
	 */
	if (op == DE_LINK || op == DE_RENAME) {
		if (tp != dir)
			rw_enter(&tp->tn_rwlock, RW_WRITER);
		mutex_enter(&tp->tn_tlock);
		if (tp->tn_nlink == 0) {
			mutex_exit(&tp->tn_tlock);
			if (tp != dir)
				rw_exit(&tp->tn_rwlock);
			return (ENOENT);
		}

		if (tp->tn_nlink == MAXLINK) {
			mutex_exit(&tp->tn_tlock);
			if (tp != dir)
				rw_exit(&tp->tn_rwlock);
			return (EMLINK);
		}
		/*
		 * From here on every failure must leave through "out" so
		 * that this increment is undone.
		 */
		tp->tn_nlink++;
		gethrestime(&tp->tn_ctime);
		mutex_exit(&tp->tn_tlock);
		if (tp != dir)
			rw_exit(&tp->tn_rwlock);
	}

	/*
	 * This might be a "dangling detached directory".
	 * it could have been removed, but a reference
	 * to it kept in u_cwd.  don't bother searching
	 * it, and with any luck the user will get tired
	 * of dealing with us and cd to some absolute
	 * pathway.  *sigh*, thus in ufs, too.
	 */
	if (dir->tn_nlink == 0) {
		error = ENOENT;
		goto out;
	}

	/*
	 * If this is a rename of a directory and the parent is
	 * different (".." must be changed), then the source
	 * directory must not be in the directory hierarchy
	 * above the target, as this would orphan everything
	 * below the source directory.
	 */
	if (op == DE_RENAME) {
		if (tp == dir) {
			error = EINVAL;
			goto out;
		}
		if (tp->tn_type == VDIR) {
			if ((fromparent != dir) &&
			    (error = tdircheckpath(tp, dir, cred))) {
				goto out;
			}
		}
	}

	/*
	 * Search for the entry.  Return "found" if it exists.
	 * On a hit "found" carries a hold that each case below must either
	 * hand to the caller or release.
	 */
	tdp = tmpfs_hash_lookup(name, dir, 1, &found);

	if (tdp) {
		ASSERT(found);
		switch (op) {
		case DE_CREATE:
		case DE_MKDIR:
			if (tpp) {
				/* The hold on "found" passes to the caller. */
				*tpp = found;
				error = EEXIST;
			} else {
				tmpnode_rele(found);
			}
			break;

		case DE_RENAME:
			error = tdirrename(fromparent, tp,
			    dir, name, found, tdp, cred);
			if (error == 0) {
				vnevent_rename_dest(TNTOV(found));
			}
			tmpnode_rele(found);
			break;

		case DE_LINK:
			/*
			 * Can't link to an existing file.
			 */
			error = EEXIST;
			tmpnode_rele(found);
			break;
		}
	} else {

		/*
		 * The entry does not exist. Check write permission in
		 * directory to see if entry can be created.
		 */
		if (error = tmp_taccess(dir, VWRITE, cred))
			goto out;
		if (op == DE_CREATE || op == DE_MKDIR) {
			/*
			 * Make new tmpnode and directory entry as required.
			 */
			error = tdirmaketnode(dir, tm, va, op, &tp, cred);
			if (error)
				goto out;
		}
		if (error = tdiraddentry(dir, tp, name, op, fromparent)) {
			if (op == DE_CREATE || op == DE_MKDIR) {
				/*
				 * Unmake the inode we just made.
				 */
				rw_enter(&tp->tn_rwlock, RW_WRITER);
				if ((tp->tn_type) == VDIR) {
					ASSERT(tdp == NULL);
					/*
					 * cleanup allocs made by tdirinit()
					 */
					tdirtrunc(tp);
				}
				mutex_enter(&tp->tn_tlock);
				tp->tn_nlink = 0;
				mutex_exit(&tp->tn_tlock);
				gethrestime(&tp->tn_ctime);
				rw_exit(&tp->tn_rwlock);
				tmpnode_rele(tp);
				tp = NULL;
			}
		} else if (tpp) {
			*tpp = tp;
		} else if (op == DE_CREATE || op == DE_MKDIR) {
			/* Caller does not want the new node back. */
			tmpnode_rele(tp);
		}
	}

	if ((op == DE_RENAME) && (error == 0)) {
		vnevent_rename_src(TNTOV(tp));
	}
out:
	if (error && (op == DE_LINK || op == DE_RENAME)) {
		/*
		 * Undo bumped link count.
		 */
		DECR_COUNT(&tp->tn_nlink, &tp->tn_tlock);
		gethrestime(&tp->tn_ctime);
	}
	return (error);
}
418 
419 /*
420  * Delete entry tp of name "nm" from dir.
421  * Free dir entry space and decrement link count on tmpnode(s).
422  *
423  * Return 0 on success.
424  */
425 int
426 tdirdelete(
427 	struct tmpnode *dir,
428 	struct tmpnode *tp,
429 	char *nm,
430 	enum dr_op op,
431 	struct cred *cred)
432 {
433 	struct tdirent *tpdp;
434 	int error;
435 	size_t namelen;
436 	struct tmpnode *tnp;
437 	timestruc_t now;
438 
439 	ASSERT(RW_WRITE_HELD(&dir->tn_rwlock));
440 	ASSERT(RW_WRITE_HELD(&tp->tn_rwlock));
441 	ASSERT(dir->tn_type == VDIR);
442 
443 	if (nm[0] == '\0')
444 		panic("tdirdelete: NULL name for %p", (void *)tp);
445 
446 	/*
447 	 * return error when removing . and ..
448 	 */
449 	if (nm[0] == '.') {
450 		if (nm[1] == '\0')
451 			return (EINVAL);
452 		if (nm[1] == '.' && nm[2] == '\0')
453 			return (EEXIST); /* thus in ufs */
454 	}
455 
456 	if (error = tmp_taccess(dir, VEXEC|VWRITE, cred))
457 		return (error);
458 
459 	/*
460 	 * If the parent directory is "sticky", then the user must
461 	 * own the parent directory or the file in it, or else must
462 	 * have permission to write the file.  Otherwise it may not
463 	 * be deleted (except by privileged users).
464 	 * Same as ufs_dirremove.
465 	 */
466 	if ((error = tmp_sticky_remove_access(dir, tp, cred)) != 0)
467 		return (error);
468 
469 	if (dir->tn_dir == NULL)
470 		return (ENOENT);
471 
472 	tpdp = tmpfs_hash_lookup(nm, dir, 0, &tnp);
473 	if (tpdp == NULL) {
474 		/*
475 		 * If it is gone, some other thread got here first!
476 		 * Return error ENOENT.
477 		 */
478 		return (ENOENT);
479 	}
480 
481 	/*
482 	 * If the tmpnode in the tdirent changed, we were probably
483 	 * the victim of a concurrent rename operation.  The original
484 	 * is gone, so return that status (same as UFS).
485 	 */
486 	if (tp != tnp)
487 		return (ENOENT);
488 
489 	tmpfs_hash_out(tpdp);
490 
491 	/*
492 	 * Take tpdp out of the directory list.
493 	 */
494 	ASSERT(tpdp->td_next != tpdp);
495 	ASSERT(tpdp->td_prev != tpdp);
496 	if (tpdp->td_prev) {
497 		tpdp->td_prev->td_next = tpdp->td_next;
498 	}
499 	if (tpdp->td_next) {
500 		tpdp->td_next->td_prev = tpdp->td_prev;
501 	}
502 
503 	/*
504 	 * If the roving slot pointer happens to match tpdp,
505 	 * point it at the previous dirent.
506 	 */
507 	if (dir->tn_dir->td_prev == tpdp) {
508 		dir->tn_dir->td_prev = tpdp->td_prev;
509 	}
510 	ASSERT(tpdp->td_next != tpdp);
511 	ASSERT(tpdp->td_prev != tpdp);
512 
513 	/*
514 	 * tpdp points to the correct directory entry
515 	 */
516 	namelen = strlen(tpdp->td_name) + 1;
517 
518 	tmp_memfree(tpdp, sizeof (struct tdirent) + namelen);
519 	dir->tn_size -= (sizeof (struct tdirent) + namelen);
520 	dir->tn_dirents--;
521 
522 	gethrestime(&now);
523 	dir->tn_mtime = now;
524 	dir->tn_ctime = now;
525 	tp->tn_ctime = now;
526 
527 	ASSERT(tp->tn_nlink > 0);
528 	DECR_COUNT(&tp->tn_nlink, &tp->tn_tlock);
529 	if (op == DR_RMDIR && tp->tn_type == VDIR) {
530 		tdirtrunc(tp);
531 		ASSERT(tp->tn_nlink == 0);
532 	}
533 	return (0);
534 }
535 
536 /*
537  * tdirinit is used internally to initialize a directory (dir)
538  * with '.' and '..' entries without checking permissions and locking
539  */
540 void
541 tdirinit(
542 	struct tmpnode *parent,		/* parent of directory to initialize */
543 	struct tmpnode *dir)		/* the new directory */
544 {
545 	struct tdirent *dot, *dotdot;
546 	timestruc_t now;
547 
548 	ASSERT(RW_WRITE_HELD(&parent->tn_rwlock));
549 	ASSERT(dir->tn_type == VDIR);
550 
551 	dot = tmp_memalloc(sizeof (struct tdirent) + 2, TMP_MUSTHAVE);
552 	dotdot = tmp_memalloc(sizeof (struct tdirent) + 3, TMP_MUSTHAVE);
553 
554 	/*
555 	 * Initialize the entries
556 	 */
557 	dot->td_tmpnode = dir;
558 	dot->td_offset = 0;
559 	dot->td_name = (char *)dot + sizeof (struct tdirent);
560 	dot->td_name[0] = '.';
561 	dot->td_parent = dir;
562 	tmpfs_hash_in(dot);
563 
564 	dotdot->td_tmpnode = parent;
565 	dotdot->td_offset = 1;
566 	dotdot->td_name = (char *)dotdot + sizeof (struct tdirent);
567 	dotdot->td_name[0] = '.';
568 	dotdot->td_name[1] = '.';
569 	dotdot->td_parent = dir;
570 	tmpfs_hash_in(dotdot);
571 
572 	/*
573 	 * Initialize directory entry list.
574 	 */
575 	dot->td_next = dotdot;
576 	dot->td_prev = dotdot;	/* dot's td_prev holds roving slot pointer */
577 	dotdot->td_next = NULL;
578 	dotdot->td_prev = dot;
579 
580 	gethrestime(&now);
581 	dir->tn_mtime = now;
582 	dir->tn_ctime = now;
583 
584 	/*
585 	 * Link counts are special for the hidden attribute directory.
586 	 * The only explicit reference in the name space is "." and
587 	 * the reference through ".." is not counted on the parent
588 	 * file. The attrdir is created as a side effect to lookup,
589 	 * so don't change the ctime of the parent.
590 	 * Since tdirinit is called with both dir and parent being the
591 	 * same for the root vnode, we need to increment this before we set
592 	 * tn_nlink = 2 below.
593 	 */
594 	if (!(dir->tn_vnode->v_flag & V_XATTRDIR)) {
595 		INCR_COUNT(&parent->tn_nlink, &parent->tn_tlock);
596 		parent->tn_ctime = now;
597 	}
598 
599 	dir->tn_dir = dot;
600 	dir->tn_size = 2 * sizeof (struct tdirent) + 5;	/* dot and dotdot */
601 	dir->tn_dirents = 2;
602 	dir->tn_nlink = 2;
603 }
604 
605 
606 /*
607  * tdirtrunc is called to remove all directory entries under this directory.
608  */
609 void
610 tdirtrunc(struct tmpnode *dir)
611 {
612 	struct tdirent *tdp;
613 	struct tmpnode *tp;
614 	size_t namelen;
615 	timestruc_t now;
616 	int isvattrdir, isdotdot, skip_decr;
617 
618 	ASSERT(RW_WRITE_HELD(&dir->tn_rwlock));
619 	ASSERT(dir->tn_type == VDIR);
620 
621 	isvattrdir = (dir->tn_vnode->v_flag & V_XATTRDIR) ? 1 : 0;
622 	for (tdp = dir->tn_dir; tdp; tdp = dir->tn_dir) {
623 		ASSERT(tdp->td_next != tdp);
624 		ASSERT(tdp->td_prev != tdp);
625 		ASSERT(tdp->td_tmpnode);
626 
627 		dir->tn_dir = tdp->td_next;
628 		namelen = strlen(tdp->td_name) + 1;
629 
630 		/*
631 		 * Adjust the link counts to account for this directory
632 		 * entry removal. Hidden attribute directories may
633 		 * not be empty as they may be truncated as a side-
634 		 * effect of removing the parent. We do hold/rele
635 		 * operations to free up these tmpnodes.
636 		 *
637 		 * Skip the link count adjustment for parents of
638 		 * attribute directories as those link counts
639 		 * do not include the ".." reference in the hidden
640 		 * directories.
641 		 */
642 		tp = tdp->td_tmpnode;
643 		isdotdot = (strcmp("..", tdp->td_name) == 0);
644 		skip_decr = (isvattrdir && isdotdot);
645 		if (!skip_decr) {
646 			ASSERT(tp->tn_nlink > 0);
647 			DECR_COUNT(&tp->tn_nlink, &tp->tn_tlock);
648 		}
649 
650 		tmpfs_hash_out(tdp);
651 
652 		tmp_memfree(tdp, sizeof (struct tdirent) + namelen);
653 		dir->tn_size -= (sizeof (struct tdirent) + namelen);
654 		dir->tn_dirents--;
655 	}
656 
657 	gethrestime(&now);
658 	dir->tn_mtime = now;
659 	dir->tn_ctime = now;
660 
661 	ASSERT(dir->tn_dir == NULL);
662 	ASSERT(dir->tn_size == 0);
663 	ASSERT(dir->tn_dirents == 0);
664 }
665 
666 /*
667  * Check if the source directory is in the path of the target directory.
668  * The target directory is locked by the caller.
669  *
670  * XXX - The source and target's should be different upon entry.
671  */
672 static int
673 tdircheckpath(
674 	struct tmpnode *fromtp,
675 	struct tmpnode	*toparent,
676 	struct cred	*cred)
677 {
678 	int	error = 0;
679 	struct tmpnode *dir, *dotdot;
680 	struct tdirent *tdp;
681 
682 	ASSERT(RW_WRITE_HELD(&toparent->tn_rwlock));
683 
684 	tdp = tmpfs_hash_lookup("..", toparent, 1, &dotdot);
685 	if (tdp == NULL)
686 		return (ENOENT);
687 
688 	ASSERT(dotdot);
689 
690 	if (dotdot == toparent) {
691 		/* root of fs.  search trivially satisfied. */
692 		tmpnode_rele(dotdot);
693 		return (0);
694 	}
695 	for (;;) {
696 		/*
697 		 * Return error for cases like "mv c c/d",
698 		 * "mv c c/d/e" and so on.
699 		 */
700 		if (dotdot == fromtp) {
701 			tmpnode_rele(dotdot);
702 			error = EINVAL;
703 			break;
704 		}
705 		dir = dotdot;
706 		error = tdirlookup(dir, "..", &dotdot, cred);
707 		if (error) {
708 			tmpnode_rele(dir);
709 			break;
710 		}
711 		/*
712 		 * We're okay if we traverse the directory tree up to
713 		 * the root directory and don't run into the
714 		 * parent directory.
715 		 */
716 		if (dir == dotdot) {
717 			tmpnode_rele(dir);
718 			tmpnode_rele(dotdot);
719 			break;
720 		}
721 		tmpnode_rele(dir);
722 	}
723 	return (error);
724 }
725 
/*
 * Rename onto an existing entry: make the directory entry 'where' in
 * 'toparent', which currently refers to 'to', refer to 'fromtp' instead.
 *
 * The caller must hold toparent->tn_rwlock as writer.  fromtp and to are
 * locked here and unlocked again before returning.
 *
 * Returns 0 on success.  Errors: ESAME if source and target are the same
 * tmpnode, EXDEV for a cross-filesystem rename, the error from the access
 * or sticky-bit checks, EISDIR/ENOTDIR if exactly one of source and target
 * is a directory, EBUSY if the target directory is a mount point or its
 * vfs lock cannot be taken, and EEXIST if the target directory is not
 * empty.
 */
static int
tdirrename(
	struct tmpnode *fromparent,	/* parent directory of source */
	struct tmpnode *fromtp,		/* source tmpnode */
	struct tmpnode *toparent,	/* parent directory of target */
	char *nm,			/* entry we are trying to change */
	struct tmpnode *to,		/* target tmpnode */
	struct tdirent *where,		/* target tmpnode directory entry */
	struct cred *cred)		/* credentials */
{
	int error = 0;
	int doingdirectory;
	timestruc_t now;

	/* nm is otherwise unused; keep lint quiet about it. */
#if defined(lint)
	nm = nm;
#endif
	ASSERT(RW_WRITE_HELD(&toparent->tn_rwlock));

	/*
	 * Short circuit rename of something to itself.
	 */
	if (fromtp == to)
		return (ESAME);		/* special KLUDGE error code */

	rw_enter(&fromtp->tn_rwlock, RW_READER);
	rw_enter(&to->tn_rwlock, RW_READER);

	/*
	 * Check that everything is on the same filesystem.
	 */
	if (to->tn_vnode->v_vfsp != toparent->tn_vnode->v_vfsp ||
	    to->tn_vnode->v_vfsp != fromtp->tn_vnode->v_vfsp) {
		error = EXDEV;
		goto out;
	}

	/*
	 * Must have write permission to rewrite target entry.
	 * Check for stickyness.
	 */
	if ((error = tmp_taccess(toparent, VWRITE, cred)) != 0 ||
	    (error = tmp_sticky_remove_access(toparent, to, cred)) != 0)
		goto out;

	/*
	 * Ensure source and target are compatible (both directories
	 * or both not directories).  If target is a directory it must
	 * be empty and have no links to it; in addition it must not
	 * be a mount point, and both the source and target must be
	 * writable.
	 */
	doingdirectory = (fromtp->tn_type == VDIR);
	if (to->tn_type == VDIR) {
		if (!doingdirectory) {
			error = EISDIR;
			goto out;
		}
		/*
		 * vn_vfswlock will prevent mounts from using the directory
		 * until we are done.
		 * Every error exit below releases it again; on success it
		 * is released once the entry has been switched over.
		 */
		if (vn_vfswlock(TNTOV(to))) {
			error = EBUSY;
			goto out;
		}
		if (vn_mountedvfs(TNTOV(to)) != NULL) {
			vn_vfsunlock(TNTOV(to));
			error = EBUSY;
			goto out;
		}

		mutex_enter(&to->tn_tlock);
		if (to->tn_dirents > 2 || to->tn_nlink > 2) {
			mutex_exit(&to->tn_tlock);
			vn_vfsunlock(TNTOV(to));
			error = EEXIST; /* SIGH should be ENOTEMPTY */
			/*
			 * Update atime because checking tn_dirents is
			 * logically equivalent to reading the directory
			 */
			gethrestime(&to->tn_atime);
			goto out;
		}
		mutex_exit(&to->tn_tlock);
	} else if (doingdirectory) {
		error = ENOTDIR;
		goto out;
	}

	/*
	 * All checks passed: switch the directory entry over to the
	 * source tmpnode.
	 */
	tmpfs_hash_change(where, fromtp);
	gethrestime(&now);
	toparent->tn_mtime = now;
	toparent->tn_ctime = now;

	/*
	 * Upgrade to write lock on "to" (i.e., the target tmpnode).
	 * The lock is dropped and re-acquired, so "to" is briefly
	 * unlocked in between.
	 */
	rw_exit(&to->tn_rwlock);
	rw_enter(&to->tn_rwlock, RW_WRITER);

	/*
	 * Decrement the link count of the target tmpnode.
	 */
	DECR_COUNT(&to->tn_nlink, &to->tn_tlock);
	to->tn_ctime = now;

	if (doingdirectory) {
		/*
		 * The entry for "to" no longer exists so release the vfslock.
		 */
		vn_vfsunlock(TNTOV(to));

		/*
		 * Decrement the target link count and delete all entries.
		 */
		tdirtrunc(to);
		ASSERT(to->tn_nlink == 0);

		/*
		 * Renaming a directory with the parent different
		 * requires that ".." be rewritten.  The window is
		 * still there for ".." to be inconsistent, but this
		 * is unavoidable, and a lot shorter than when it was
		 * done in a user process.
		 */
		if (fromparent != toparent)
			tdirfixdotdot(fromtp, fromparent, toparent);
	}
out:
	rw_exit(&to->tn_rwlock);
	rw_exit(&fromtp->tn_rwlock);
	return (error);
}
860 
861 static void
862 tdirfixdotdot(
863 	struct tmpnode	*fromtp,	/* child directory */
864 	struct tmpnode	*fromparent,	/* old parent directory */
865 	struct tmpnode	*toparent)	/* new parent directory */
866 {
867 	struct tdirent	*dotdot;
868 
869 	ASSERT(RW_LOCK_HELD(&toparent->tn_rwlock));
870 
871 	/*
872 	 * Increment the link count in the new parent tmpnode
873 	 */
874 	INCR_COUNT(&toparent->tn_nlink, &toparent->tn_tlock);
875 	gethrestime(&toparent->tn_ctime);
876 
877 	dotdot = tmpfs_hash_lookup("..", fromtp, 0, NULL);
878 
879 	ASSERT(dotdot->td_tmpnode == fromparent);
880 	dotdot->td_tmpnode = toparent;
881 
882 	/*
883 	 * Decrement the link count of the old parent tmpnode.
884 	 * If fromparent is NULL, then this is a new directory link;
885 	 * it has no parent, so we need not do anything.
886 	 */
887 	if (fromparent != NULL) {
888 		mutex_enter(&fromparent->tn_tlock);
889 		if (fromparent->tn_nlink != 0) {
890 			fromparent->tn_nlink--;
891 			gethrestime(&fromparent->tn_ctime);
892 		}
893 		mutex_exit(&fromparent->tn_tlock);
894 	}
895 }
896 
897 static int
898 tdiraddentry(
899 	struct tmpnode	*dir,	/* target directory to make entry in */
900 	struct tmpnode	*tp,	/* new tmpnode */
901 	char		*name,
902 	enum de_op	op,
903 	struct tmpnode	*fromtp)
904 {
905 	struct tdirent *tdp, *tpdp;
906 	size_t		namelen, alloc_size;
907 	timestruc_t	now;
908 
909 	/*
910 	 * Make sure the parent directory wasn't removed from
911 	 * underneath the caller.
912 	 */
913 	if (dir->tn_dir == NULL)
914 		return (ENOENT);
915 
916 	/*
917 	 * Check that everything is on the same filesystem.
918 	 */
919 	if (tp->tn_vnode->v_vfsp != dir->tn_vnode->v_vfsp)
920 		return (EXDEV);
921 
922 	/*
923 	 * Allocate and initialize directory entry
924 	 */
925 	namelen = strlen(name) + 1;
926 	alloc_size = namelen + sizeof (struct tdirent);
927 	tdp = tmp_memalloc(alloc_size, 0);
928 	if (tdp == NULL)
929 		return (ENOSPC);
930 
931 	if ((op == DE_RENAME) && (tp->tn_type == VDIR))
932 		tdirfixdotdot(tp, fromtp, dir);
933 
934 	dir->tn_size += alloc_size;
935 	dir->tn_dirents++;
936 	tdp->td_tmpnode = tp;
937 	tdp->td_parent = dir;
938 
939 	/*
940 	 * The directory entry and its name were allocated sequentially.
941 	 */
942 	tdp->td_name = (char *)tdp + sizeof (struct tdirent);
943 	(void) strcpy(tdp->td_name, name);
944 
945 	tmpfs_hash_in(tdp);
946 
947 	/*
948 	 * Some utilities expect the size of a directory to remain
949 	 * somewhat static.  For example, a routine which unlinks
950 	 * files between calls to readdir(); the size of the
951 	 * directory changes from underneath it and so the real
952 	 * directory offset in bytes is invalid.  To circumvent
953 	 * this problem, we initialize a directory entry with an
954 	 * phony offset, and use this offset to determine end of
955 	 * file in tmp_readdir.
956 	 */
957 	tpdp = dir->tn_dir->td_prev;
958 	/*
959 	 * Install at first empty "slot" in directory list.
960 	 */
961 	while (tpdp->td_next != NULL && (tpdp->td_next->td_offset -
962 	    tpdp->td_offset) <= 1) {
963 		ASSERT(tpdp->td_next != tpdp);
964 		ASSERT(tpdp->td_prev != tpdp);
965 		ASSERT(tpdp->td_next->td_offset > tpdp->td_offset);
966 		tpdp = tpdp->td_next;
967 	}
968 	tdp->td_offset = tpdp->td_offset + 1;
969 
970 	/*
971 	 * If we're at the end of the dirent list and the offset (which
972 	 * is necessarily the largest offset in this directory) is more
973 	 * than twice the number of dirents, that means the directory is
974 	 * 50% holes.  At this point we reset the slot pointer back to
975 	 * the beginning of the directory so we start using the holes.
976 	 * The idea is that if there are N dirents, there must also be
977 	 * N holes, so we can satisfy the next N creates by walking at
978 	 * most 2N entries; thus the average cost of a create is constant.
979 	 * Note that we use the first dirent's td_prev as the roving
980 	 * slot pointer; it's ugly, but it saves a word in every dirent.
981 	 */
982 	if (tpdp->td_next == NULL && tpdp->td_offset > 2 * dir->tn_dirents)
983 		dir->tn_dir->td_prev = dir->tn_dir->td_next;
984 	else
985 		dir->tn_dir->td_prev = tdp;
986 
987 	ASSERT(tpdp->td_next != tpdp);
988 	ASSERT(tpdp->td_prev != tpdp);
989 
990 	tdp->td_next = tpdp->td_next;
991 	if (tdp->td_next) {
992 		tdp->td_next->td_prev = tdp;
993 	}
994 	tdp->td_prev = tpdp;
995 	tpdp->td_next = tdp;
996 
997 	ASSERT(tdp->td_next != tdp);
998 	ASSERT(tdp->td_prev != tdp);
999 	ASSERT(tpdp->td_next != tpdp);
1000 	ASSERT(tpdp->td_prev != tpdp);
1001 
1002 	gethrestime(&now);
1003 	dir->tn_mtime = now;
1004 	dir->tn_ctime = now;
1005 
1006 	return (0);
1007 }
1008 
1009 static int
1010 tdirmaketnode(
1011 	struct tmpnode *dir,
1012 	struct tmount	*tm,
1013 	struct vattr	*va,
1014 	enum	de_op	op,
1015 	struct tmpnode **newnode,
1016 	struct cred	*cred)
1017 {
1018 	struct tmpnode *tp;
1019 	enum vtype	type;
1020 
1021 	ASSERT(va != NULL);
1022 	ASSERT(op == DE_CREATE || op == DE_MKDIR);
1023 	if (((va->va_mask & AT_ATIME) && TIMESPEC_OVERFLOW(&va->va_atime)) ||
1024 	    ((va->va_mask & AT_MTIME) && TIMESPEC_OVERFLOW(&va->va_mtime)))
1025 		return (EOVERFLOW);
1026 	type = va->va_type;
1027 	tp = tmp_memalloc(sizeof (struct tmpnode), TMP_MUSTHAVE);
1028 	tmpnode_init(tm, tp, va, cred);
1029 
1030 	/* setup normal file/dir's extended attribute directory */
1031 	if (dir->tn_flags & ISXATTR) {
1032 		/* parent dir is , mark file as xattr */
1033 		tp->tn_flags |= ISXATTR;
1034 	}
1035 
1036 
1037 	if (type == VBLK || type == VCHR) {
1038 		tp->tn_vnode->v_rdev = tp->tn_rdev = va->va_rdev;
1039 	} else {
1040 		tp->tn_vnode->v_rdev = tp->tn_rdev = NODEV;
1041 	}
1042 	tp->tn_vnode->v_type = type;
1043 	tp->tn_uid = crgetuid(cred);
1044 
1045 	/*
1046 	 * To determine the group-id of the created file:
1047 	 *   1) If the gid is set in the attribute list (non-Sun & pre-4.0
1048 	 *	clients are not likely to set the gid), then use it if
1049 	 *	the process is privileged, belongs to the target group,
1050 	 *	or the group is the same as the parent directory.
1051 	 *   2) If the filesystem was not mounted with the Old-BSD-compatible
1052 	 *	GRPID option, and the directory's set-gid bit is clear,
1053 	 *	then use the process's gid.
1054 	 *   3) Otherwise, set the group-id to the gid of the parent directory.
1055 	 */
1056 	if ((va->va_mask & AT_GID) &&
1057 	    ((va->va_gid == dir->tn_gid) || groupmember(va->va_gid, cred) ||
1058 	    secpolicy_vnode_create_gid(cred) == 0)) {
1059 		/*
1060 		 * XXX - is this only the case when a 4.0 NFS client, or a
1061 		 * client derived from that code, makes a call over the wire?
1062 		 */
1063 		tp->tn_gid = va->va_gid;
1064 	} else {
1065 		if (dir->tn_mode & VSGID)
1066 			tp->tn_gid = dir->tn_gid;
1067 		else
1068 			tp->tn_gid = crgetgid(cred);
1069 	}
1070 	/*
1071 	 * If we're creating a directory, and the parent directory has the
1072 	 * set-GID bit set, set it on the new directory.
1073 	 * Otherwise, if the user is neither privileged nor a member of the
1074 	 * file's new group, clear the file's set-GID bit.
1075 	 */
1076 	if (dir->tn_mode & VSGID && type == VDIR)
1077 		tp->tn_mode |= VSGID;
1078 	else {
1079 		if ((tp->tn_mode & VSGID) &&
1080 		    secpolicy_vnode_setids_setgids(cred, tp->tn_gid) != 0)
1081 			tp->tn_mode &= ~VSGID;
1082 	}
1083 
1084 	if (va->va_mask & AT_ATIME)
1085 		tp->tn_atime = va->va_atime;
1086 	if (va->va_mask & AT_MTIME)
1087 		tp->tn_mtime = va->va_mtime;
1088 
1089 	if (op == DE_MKDIR)
1090 		tdirinit(dir, tp);
1091 
1092 	*newnode = tp;
1093 	return (0);
1094 }
1095