xref: /illumos-gate/usr/src/uts/common/fs/tmpfs/tmp_dir.c (revision 20a7641f9918de8574b8b3b47dbe35c4bfc78df1)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #include <sys/types.h>
27 #include <sys/param.h>
28 #include <sys/sysmacros.h>
29 #include <sys/systm.h>
30 #include <sys/time.h>
31 #include <sys/vfs.h>
32 #include <sys/vnode.h>
33 #include <sys/errno.h>
34 #include <sys/cmn_err.h>
35 #include <sys/cred.h>
36 #include <sys/stat.h>
37 #include <sys/debug.h>
38 #include <sys/policy.h>
39 #include <sys/fs/tmpnode.h>
40 #include <sys/fs/tmp.h>
41 #include <sys/vtrace.h>
42 
/*
 * Forward declarations of the file-local helpers used by tdirenter()
 * and friends; the definitions appear further down in this file.
 */
static int tdircheckpath(struct tmpnode *, struct tmpnode *, struct cred *);
static int tdirrename(struct tmpnode *, struct tmpnode *, struct tmpnode *,
	char *, struct tmpnode *, struct tdirent *, struct cred *);
static void tdirfixdotdot(struct tmpnode *, struct tmpnode *, struct tmpnode *);
static int tdirmaketnode(struct tmpnode *, struct tmount *, struct vattr *,
	enum de_op, struct tmpnode **, struct cred *);
static int tdiraddentry(struct tmpnode *, struct tmpnode *, char *,
	enum de_op, struct tmpnode *);
51 
52 
/*
 * Sizing of the directory-entry name hash.  Both values are used as
 * bit masks (size - 1) by T_HASH_INDEX()/T_MUTEX_INDEX() below, so
 * both must be powers of 2.
 */
#define	T_HASH_SIZE	8192		/* must be power of 2 */
#define	T_MUTEX_SIZE	64

/*
 * Knobs and a diagnostic counter for the rw_tryenter() backoff loop in
 * tdirenter():
 *   tmpfs_rename_backoff_delay - argument passed to delay() between tries.
 *   tmpfs_rename_backoff_tries - if nonzero, bounds the number of retries
 *				  before EBUSY is returned; 0 retries forever.
 *   tmpfs_rename_loops	        - incremented (non-atomically) on every
 *				  failed rw_tryenter().
 */
/* Non-static so compilers won't constant-fold these away. */
clock_t tmpfs_rename_backoff_delay = 1;
unsigned int tmpfs_rename_backoff_tries = 0;
unsigned long tmpfs_rename_loops = 0;

/*
 * The hash chains themselves (linked through td_link) and the mutexes
 * that protect them.  Several chains share one mutex because there are
 * fewer mutexes than chains.
 */
static struct tdirent	*t_hashtable[T_HASH_SIZE];
static kmutex_t		 t_hashmutex[T_MUTEX_SIZE];

/* Map a hash value to its chain / mutex slot. */
#define	T_HASH_INDEX(a)		((a) & (T_HASH_SIZE-1))
#define	T_MUTEX_INDEX(a)	((a) & (T_MUTEX_SIZE-1))
66 
/*
 * Compute the hash of directory entry 'name' within parent 'tp' and store
 * it in the lvalue 'hash'.  The seed is the parent pointer, truncated to
 * uint_t and shifted right by 8; each name byte is then folded in as
 * hash = hash * 17 + byte.
 *
 * Wrapped in do { } while (0) so that "TMPFS_HASH(...);" is a single
 * statement and can be used safely in an unbraced if/else.  'hash' is
 * evaluated several times, so it must be free of side effects.
 */
#define	TMPFS_HASH(tp, name, hash)					\
	do {								\
		char Xc, *Xcp;						\
		(hash) = (uint_t)(uintptr_t)(tp) >> 8;			\
		for (Xcp = (name); (Xc = *Xcp) != 0; Xcp++)		\
			(hash) = ((hash) << 4) + (hash) + (uint_t)Xc;	\
	} while (0)
74 
75 void
76 tmpfs_hash_init(void)
77 {
78 	int	ix;
79 
80 	for (ix = 0; ix < T_MUTEX_SIZE; ix++)
81 		mutex_init(&t_hashmutex[ix], NULL, MUTEX_DEFAULT, NULL);
82 }
83 
84 /*
85  * This routine is where the rubber meets the road for identities.
86  */
87 static void
88 tmpfs_hash_in(struct tdirent *t)
89 {
90 	uint_t		hash;
91 	struct tdirent	**prevpp;
92 	kmutex_t	*t_hmtx;
93 
94 	TMPFS_HASH(t->td_parent, t->td_name, hash);
95 	t->td_hash = hash;
96 	prevpp = &t_hashtable[T_HASH_INDEX(hash)];
97 	t_hmtx = &t_hashmutex[T_MUTEX_INDEX(hash)];
98 	mutex_enter(t_hmtx);
99 	t->td_link = *prevpp;
100 	*prevpp = t;
101 	mutex_exit(t_hmtx);
102 }
103 
104 /*
105  * Remove tdirent *t from the hash list.
106  */
107 static void
108 tmpfs_hash_out(struct tdirent *t)
109 {
110 	uint_t		hash;
111 	struct tdirent	**prevpp;
112 	kmutex_t	*t_hmtx;
113 
114 	hash = t->td_hash;
115 	prevpp = &t_hashtable[T_HASH_INDEX(hash)];
116 	t_hmtx = &t_hashmutex[T_MUTEX_INDEX(hash)];
117 	mutex_enter(t_hmtx);
118 	while (*prevpp != t)
119 		prevpp = &(*prevpp)->td_link;
120 	*prevpp = t->td_link;
121 	mutex_exit(t_hmtx);
122 }
123 
124 /*
125  * Currently called by tdirrename() only.
126  * rename operation needs to be done with lock held, to ensure that
127  * no other operations can access the tmpnode at the same instance.
128  */
129 static void
130 tmpfs_hash_change(struct tdirent *tdp, struct tmpnode *fromtp)
131 {
132 	uint_t		hash;
133 	kmutex_t	*t_hmtx;
134 
135 	hash = tdp->td_hash;
136 	t_hmtx = &t_hashmutex[T_MUTEX_INDEX(hash)];
137 	mutex_enter(t_hmtx);
138 	tdp->td_tmpnode = fromtp;
139 	mutex_exit(t_hmtx);
140 }
141 
142 static struct tdirent *
143 tmpfs_hash_lookup(char *name, struct tmpnode *parent, uint_t hold,
144 	struct tmpnode **found)
145 {
146 	struct tdirent	*l;
147 	uint_t		hash;
148 	kmutex_t	*t_hmtx;
149 	struct tmpnode	*tnp;
150 
151 	TMPFS_HASH(parent, name, hash);
152 	t_hmtx = &t_hashmutex[T_MUTEX_INDEX(hash)];
153 	mutex_enter(t_hmtx);
154 	l = t_hashtable[T_HASH_INDEX(hash)];
155 	while (l) {
156 		if ((l->td_hash == hash) &&
157 		    (l->td_parent == parent) &&
158 		    (strcmp(l->td_name, name) == 0)) {
159 			/*
160 			 * We need to make sure that the tmpnode that
161 			 * we put a hold on is the same one that we pass back.
162 			 * Hence, temporary variable tnp is necessary.
163 			 */
164 			tnp = l->td_tmpnode;
165 			if (hold) {
166 				ASSERT(tnp);
167 				tmpnode_hold(tnp);
168 			}
169 			if (found)
170 				*found = tnp;
171 			mutex_exit(t_hmtx);
172 			return (l);
173 		} else {
174 			l = l->td_link;
175 		}
176 	}
177 	mutex_exit(t_hmtx);
178 	return (NULL);
179 }
180 
181 /*
182  * Search directory 'parent' for entry 'name'.
183  *
184  * The calling thread can't hold the write version
185  * of the rwlock for the directory being searched
186  *
187  * 0 is returned on success and *foundtp points
188  * to the found tmpnode with its vnode held.
189  */
190 int
191 tdirlookup(
192 	struct tmpnode *parent,
193 	char *name,
194 	struct tmpnode **foundtp,
195 	struct cred *cred)
196 {
197 	int error;
198 
199 	*foundtp = NULL;
200 	if (parent->tn_type != VDIR)
201 		return (ENOTDIR);
202 
203 	if ((error = tmp_taccess(parent, VEXEC, cred)))
204 		return (error);
205 
206 	if (*name == '\0') {
207 		tmpnode_hold(parent);
208 		*foundtp = parent;
209 		return (0);
210 	}
211 
212 	/*
213 	 * Search the directory for the matching name
214 	 * We need the lock protecting the tn_dir list
215 	 * so that it doesn't change out from underneath us.
216 	 * tmpfs_hash_lookup() will pass back the tmpnode
217 	 * with a hold on it.
218 	 */
219 
220 	if (tmpfs_hash_lookup(name, parent, 1, foundtp) != NULL) {
221 		ASSERT(*foundtp);
222 		return (0);
223 	}
224 
225 	return (ENOENT);
226 }
227 
/*
 * Enter a directory entry for 'name' and 'tp' into directory 'dir'
 *
 * 'op' selects what is being done:
 *   DE_CREATE, DE_MKDIR - a new tmpnode is made with tdirmaketnode() and
 *	entered under 'name'.  If 'name' already exists and 'tpp' is
 *	non-NULL, the existing tmpnode is returned held in *tpp together
 *	with EEXIST; if 'tpp' is NULL the existing tmpnode is released and
 *	0 is returned.
 *   DE_LINK - 'tp' gets an additional name; EEXIST if 'name' exists.
 *   DE_RENAME - 'tp' (currently in 'fromparent') is entered as 'name';
 *	an existing target is replaced through tdirrename().
 *
 * The caller must hold dir->tn_rwlock as writer.  For DE_LINK/DE_RENAME
 * that lock may be dropped and re-acquired while backing off on
 * tp->tn_rwlock.
 *
 * For DE_LINK/DE_RENAME the link count of 'tp' is bumped up front and
 * the bump is undone at "out:" if the operation fails.
 *
 * Returns 0 on success.
 */
int
tdirenter(
	struct tmount	*tm,
	struct tmpnode	*dir,		/* target directory to make entry in */
	char		*name,		/* name of entry */
	enum de_op	op,		/* entry operation */
	struct tmpnode	*fromparent,	/* source directory if rename */
	struct tmpnode	*tp,		/* source tmpnode, if link/rename */
	struct vattr	*va,
	struct tmpnode	**tpp,		/* return tmpnode, if create/mkdir */
	struct cred	*cred,
	caller_context_t *ctp)
{
	struct tdirent *tdp;
	struct tmpnode *found = NULL;
	int error = 0;
	char *s;

	/*
	 * tn_rwlock is held to serialize direnter and dirdeletes
	 */
	ASSERT(RW_WRITE_HELD(&dir->tn_rwlock));
	ASSERT(dir->tn_type == VDIR);

	/*
	 * Don't allow '/' characters in pathname component
	 * (thus in ufs_direnter()).
	 */
	for (s = name; *s; s++)
		if (*s == '/')
			return (EACCES);

	if (name[0] == '\0')
		panic("tdirenter: NULL name");

	/*
	 * For link and rename lock the source entry and check the link count
	 * to see if it has been removed while it was unlocked.
	 */
	if (op == DE_LINK || op == DE_RENAME) {
		if (tp != dir) {
			unsigned int tries = 0;

			/*
			 * If we are acquiring tp->tn_rwlock (for SOURCE)
			 * inside here, we must consider the following:
			 *
			 * - dir->tn_rwlock (TARGET) is already HELD (see
			 * above ASSERT()).
			 *
			 * - It is possible our SOURCE is a parent of our
			 * TARGET. Yes it's unusual, but it will return an
			 * error below via tdircheckpath().
			 *
			 * - It is also possible that another thread,
			 * concurrent to this one, is performing
			 * rmdir(TARGET), which means it will first acquire
			 * SOURCE's lock, THEN acquire TARGET's lock, which
			 * could result in this thread holding TARGET and
			 * trying for SOURCE, but the other thread holding
			 * SOURCE and trying for TARGET.  This is deadlock,
			 * and it's inducible.
			 *
			 * To prevent this, we borrow some techniques from UFS
			 * and rw_tryenter(), delaying if we fail, and
			 * if someone tweaks the number of backoff tries to be
			 * nonzero, return EBUSY after that number of tries.
			 */
			while (!rw_tryenter(&tp->tn_rwlock, RW_WRITER)) {
				/*
				 * Sloppy, but this is a diagnostic so atomic
				 * increment would be overkill.
				 */
				tmpfs_rename_loops++;

				/*
				 * Nothing has been modified yet (the link
				 * count is bumped only after the lock is
				 * obtained), so a plain return is enough.
				 */
				if (tmpfs_rename_backoff_tries != 0) {
					if (tries > tmpfs_rename_backoff_tries)
						return (EBUSY);
					tries++;
				}
				/*
				 * NOTE: We're still holding dir->tn_rwlock,
				 * so drop it over the delay, so any other
				 * thread can get its business done.
				 *
				 * No state change or state inspection happens
				 * prior to here, so it is not wholly dangerous
				 * to release-and-reacquire dir->tn_rwlock.
				 *
				 * Hold the vnode of dir in case it gets
				 * released by another thread, though.
				 */
				VN_HOLD(TNTOV(dir));
				rw_exit(&dir->tn_rwlock);
				delay(tmpfs_rename_backoff_delay);
				rw_enter(&dir->tn_rwlock, RW_WRITER);
				VN_RELE(TNTOV(dir));
			}
		}
		mutex_enter(&tp->tn_tlock);
		if (tp->tn_nlink == 0) {
			mutex_exit(&tp->tn_tlock);
			if (tp != dir)
				rw_exit(&tp->tn_rwlock);
			return (ENOENT);
		}

		if (tp->tn_nlink == MAXLINK) {
			mutex_exit(&tp->tn_tlock);
			if (tp != dir)
				rw_exit(&tp->tn_rwlock);
			return (EMLINK);
		}
		/*
		 * Account for the new name now; "out:" takes this back if
		 * anything below fails.
		 */
		tp->tn_nlink++;
		gethrestime(&tp->tn_ctime);
		mutex_exit(&tp->tn_tlock);
		if (tp != dir)
			rw_exit(&tp->tn_rwlock);
	}

	/*
	 * This might be a "dangling detached directory".
	 * it could have been removed, but a reference
	 * to it kept in u_cwd.  don't bother searching
	 * it, and with any luck the user will get tired
	 * of dealing with us and cd to some absolute
	 * pathway.  *sigh*, thus in ufs, too.
	 */
	if (dir->tn_nlink == 0) {
		error = ENOENT;
		goto out;
	}

	/*
	 * If this is a rename of a directory and the parent is
	 * different (".." must be changed), then the source
	 * directory must not be in the directory hierarchy
	 * above the target, as this would orphan everything
	 * below the source directory.
	 */
	if (op == DE_RENAME) {
		if (tp == dir) {
			error = EINVAL;
			goto out;
		}
		if (tp->tn_type == VDIR) {
			if ((fromparent != dir) &&
			    (error = tdircheckpath(tp, dir, cred))) {
				goto out;
			}
		}
	}

	/*
	 * Search for the entry.  Return "found" if it exists.
	 * On a hit, "found" comes back held and every case below either
	 * releases it or hands it to the caller through *tpp.
	 */
	tdp = tmpfs_hash_lookup(name, dir, 1, &found);

	if (tdp) {
		ASSERT(found);
		switch (op) {
		case DE_CREATE:
		case DE_MKDIR:
			if (tpp) {
				*tpp = found;
				error = EEXIST;
			} else {
				tmpnode_rele(found);
			}
			break;

		case DE_RENAME:
			error = tdirrename(fromparent, tp,
			    dir, name, found, tdp, cred);
			if (error == 0) {
				if (found != NULL) {
					vnevent_rename_dest(TNTOV(found),
					    TNTOV(dir), name, ctp);
				}
			}

			tmpnode_rele(found);
			break;

		case DE_LINK:
			/*
			 * Can't link to an existing file.
			 */
			error = EEXIST;
			tmpnode_rele(found);
			break;
		}
	} else {

		/*
		 * The entry does not exist. Check write permission in
		 * directory to see if entry can be created.
		 */
		if (error = tmp_taccess(dir, VWRITE, cred))
			goto out;
		if (op == DE_CREATE || op == DE_MKDIR) {
			/*
			 * Make new tmpnode and directory entry as required.
			 */
			error = tdirmaketnode(dir, tm, va, op, &tp, cred);
			if (error)
				goto out;
		}
		if (error = tdiraddentry(dir, tp, name, op, fromparent)) {
			if (op == DE_CREATE || op == DE_MKDIR) {
				/*
				 * Unmake the inode we just made.
				 */
				rw_enter(&tp->tn_rwlock, RW_WRITER);
				if ((tp->tn_type) == VDIR) {
					ASSERT(tdp == NULL);
					/*
					 * cleanup allocs made by tdirinit()
					 */
					tdirtrunc(tp);
				}
				mutex_enter(&tp->tn_tlock);
				tp->tn_nlink = 0;
				mutex_exit(&tp->tn_tlock);
				gethrestime(&tp->tn_ctime);
				rw_exit(&tp->tn_rwlock);
				tmpnode_rele(tp);
				tp = NULL;
			}
		} else if (tpp) {
			*tpp = tp;
		} else if (op == DE_CREATE || op == DE_MKDIR) {
			/*
			 * Caller does not want the new tmpnode back.
			 */
			tmpnode_rele(tp);
		}
	}

out:
	if (error && (op == DE_LINK || op == DE_RENAME)) {
		/*
		 * Undo bumped link count.
		 */
		DECR_COUNT(&tp->tn_nlink, &tp->tn_tlock);
		gethrestime(&tp->tn_ctime);
	}
	return (error);
}
479 
480 /*
481  * Delete entry tp of name "nm" from dir.
482  * Free dir entry space and decrement link count on tmpnode(s).
483  *
484  * Return 0 on success.
485  */
486 int
487 tdirdelete(
488 	struct tmpnode *dir,
489 	struct tmpnode *tp,
490 	char *nm,
491 	enum dr_op op,
492 	struct cred *cred)
493 {
494 	struct tdirent *tpdp;
495 	int error;
496 	size_t namelen;
497 	struct tmpnode *tnp;
498 	timestruc_t now;
499 
500 	ASSERT(RW_WRITE_HELD(&dir->tn_rwlock));
501 	ASSERT(RW_WRITE_HELD(&tp->tn_rwlock));
502 	ASSERT(dir->tn_type == VDIR);
503 
504 	if (nm[0] == '\0')
505 		panic("tdirdelete: NULL name for %p", (void *)tp);
506 
507 	/*
508 	 * return error when removing . and ..
509 	 */
510 	if (nm[0] == '.') {
511 		if (nm[1] == '\0')
512 			return (EINVAL);
513 		if (nm[1] == '.' && nm[2] == '\0')
514 			return (EEXIST); /* thus in ufs */
515 	}
516 
517 	if (error = tmp_taccess(dir, VEXEC|VWRITE, cred))
518 		return (error);
519 
520 	/*
521 	 * If the parent directory is "sticky", then the user must
522 	 * own the parent directory or the file in it, or else must
523 	 * have permission to write the file.  Otherwise it may not
524 	 * be deleted (except by privileged users).
525 	 * Same as ufs_dirremove.
526 	 */
527 	if ((error = tmp_sticky_remove_access(dir, tp, cred)) != 0)
528 		return (error);
529 
530 	if (dir->tn_dir == NULL)
531 		return (ENOENT);
532 
533 	tpdp = tmpfs_hash_lookup(nm, dir, 0, &tnp);
534 	if (tpdp == NULL) {
535 		/*
536 		 * If it is gone, some other thread got here first!
537 		 * Return error ENOENT.
538 		 */
539 		return (ENOENT);
540 	}
541 
542 	/*
543 	 * If the tmpnode in the tdirent changed, we were probably
544 	 * the victim of a concurrent rename operation.  The original
545 	 * is gone, so return that status (same as UFS).
546 	 */
547 	if (tp != tnp)
548 		return (ENOENT);
549 
550 	tmpfs_hash_out(tpdp);
551 
552 	/*
553 	 * Take tpdp out of the directory list.
554 	 */
555 	ASSERT(tpdp->td_next != tpdp);
556 	ASSERT(tpdp->td_prev != tpdp);
557 	if (tpdp->td_prev) {
558 		tpdp->td_prev->td_next = tpdp->td_next;
559 	}
560 	if (tpdp->td_next) {
561 		tpdp->td_next->td_prev = tpdp->td_prev;
562 	}
563 
564 	/*
565 	 * If the roving slot pointer happens to match tpdp,
566 	 * point it at the previous dirent.
567 	 */
568 	if (dir->tn_dir->td_prev == tpdp) {
569 		dir->tn_dir->td_prev = tpdp->td_prev;
570 	}
571 	ASSERT(tpdp->td_next != tpdp);
572 	ASSERT(tpdp->td_prev != tpdp);
573 
574 	/*
575 	 * tpdp points to the correct directory entry
576 	 */
577 	namelen = strlen(tpdp->td_name) + 1;
578 
579 	tmp_memfree(tpdp, sizeof (struct tdirent) + namelen);
580 	dir->tn_size -= (sizeof (struct tdirent) + namelen);
581 	dir->tn_dirents--;
582 
583 	gethrestime(&now);
584 	dir->tn_mtime = now;
585 	dir->tn_ctime = now;
586 	tp->tn_ctime = now;
587 
588 	ASSERT(tp->tn_nlink > 0);
589 	DECR_COUNT(&tp->tn_nlink, &tp->tn_tlock);
590 	if (op == DR_RMDIR && tp->tn_type == VDIR) {
591 		tdirtrunc(tp);
592 		ASSERT(tp->tn_nlink == 0);
593 	}
594 	return (0);
595 }
596 
597 /*
598  * tdirinit is used internally to initialize a directory (dir)
599  * with '.' and '..' entries without checking permissions and locking
600  */
601 void
602 tdirinit(
603 	struct tmpnode *parent,		/* parent of directory to initialize */
604 	struct tmpnode *dir)		/* the new directory */
605 {
606 	struct tdirent *dot, *dotdot;
607 	timestruc_t now;
608 
609 	ASSERT(RW_WRITE_HELD(&parent->tn_rwlock));
610 	ASSERT(dir->tn_type == VDIR);
611 
612 	dot = tmp_memalloc(sizeof (struct tdirent) + 2, TMP_MUSTHAVE);
613 	dotdot = tmp_memalloc(sizeof (struct tdirent) + 3, TMP_MUSTHAVE);
614 
615 	/*
616 	 * Initialize the entries
617 	 */
618 	dot->td_tmpnode = dir;
619 	dot->td_offset = 0;
620 	dot->td_name = (char *)dot + sizeof (struct tdirent);
621 	dot->td_name[0] = '.';
622 	dot->td_parent = dir;
623 	tmpfs_hash_in(dot);
624 
625 	dotdot->td_tmpnode = parent;
626 	dotdot->td_offset = 1;
627 	dotdot->td_name = (char *)dotdot + sizeof (struct tdirent);
628 	dotdot->td_name[0] = '.';
629 	dotdot->td_name[1] = '.';
630 	dotdot->td_parent = dir;
631 	tmpfs_hash_in(dotdot);
632 
633 	/*
634 	 * Initialize directory entry list.
635 	 */
636 	dot->td_next = dotdot;
637 	dot->td_prev = dotdot;	/* dot's td_prev holds roving slot pointer */
638 	dotdot->td_next = NULL;
639 	dotdot->td_prev = dot;
640 
641 	gethrestime(&now);
642 	dir->tn_mtime = now;
643 	dir->tn_ctime = now;
644 
645 	/*
646 	 * Link counts are special for the hidden attribute directory.
647 	 * The only explicit reference in the name space is "." and
648 	 * the reference through ".." is not counted on the parent
649 	 * file. The attrdir is created as a side effect to lookup,
650 	 * so don't change the ctime of the parent.
651 	 * Since tdirinit is called with both dir and parent being the
652 	 * same for the root vnode, we need to increment this before we set
653 	 * tn_nlink = 2 below.
654 	 */
655 	if (!(dir->tn_vnode->v_flag & V_XATTRDIR)) {
656 		INCR_COUNT(&parent->tn_nlink, &parent->tn_tlock);
657 		parent->tn_ctime = now;
658 	}
659 
660 	dir->tn_dir = dot;
661 	dir->tn_size = 2 * sizeof (struct tdirent) + 5;	/* dot and dotdot */
662 	dir->tn_dirents = 2;
663 	dir->tn_nlink = 2;
664 }
665 
666 
667 /*
668  * tdirtrunc is called to remove all directory entries under this directory.
669  */
670 void
671 tdirtrunc(struct tmpnode *dir)
672 {
673 	struct tdirent *tdp;
674 	struct tmpnode *tp;
675 	size_t namelen;
676 	timestruc_t now;
677 	int isvattrdir, isdotdot, skip_decr;
678 
679 	ASSERT(RW_WRITE_HELD(&dir->tn_rwlock));
680 	ASSERT(dir->tn_type == VDIR);
681 
682 	isvattrdir = (dir->tn_vnode->v_flag & V_XATTRDIR) ? 1 : 0;
683 	for (tdp = dir->tn_dir; tdp; tdp = dir->tn_dir) {
684 		ASSERT(tdp->td_next != tdp);
685 		ASSERT(tdp->td_prev != tdp);
686 		ASSERT(tdp->td_tmpnode);
687 
688 		dir->tn_dir = tdp->td_next;
689 		namelen = strlen(tdp->td_name) + 1;
690 
691 		/*
692 		 * Adjust the link counts to account for this directory
693 		 * entry removal. Hidden attribute directories may
694 		 * not be empty as they may be truncated as a side-
695 		 * effect of removing the parent. We do hold/rele
696 		 * operations to free up these tmpnodes.
697 		 *
698 		 * Skip the link count adjustment for parents of
699 		 * attribute directories as those link counts
700 		 * do not include the ".." reference in the hidden
701 		 * directories.
702 		 */
703 		tp = tdp->td_tmpnode;
704 		isdotdot = (strcmp("..", tdp->td_name) == 0);
705 		skip_decr = (isvattrdir && isdotdot);
706 		if (!skip_decr) {
707 			ASSERT(tp->tn_nlink > 0);
708 			DECR_COUNT(&tp->tn_nlink, &tp->tn_tlock);
709 		}
710 
711 		tmpfs_hash_out(tdp);
712 
713 		tmp_memfree(tdp, sizeof (struct tdirent) + namelen);
714 		dir->tn_size -= (sizeof (struct tdirent) + namelen);
715 		dir->tn_dirents--;
716 	}
717 
718 	gethrestime(&now);
719 	dir->tn_mtime = now;
720 	dir->tn_ctime = now;
721 
722 	ASSERT(dir->tn_dir == NULL);
723 	ASSERT(dir->tn_size == 0);
724 	ASSERT(dir->tn_dirents == 0);
725 }
726 
727 /*
728  * Check if the source directory is in the path of the target directory.
729  * The target directory is locked by the caller.
730  *
731  * XXX - The source and target's should be different upon entry.
732  */
733 static int
734 tdircheckpath(
735 	struct tmpnode *fromtp,
736 	struct tmpnode	*toparent,
737 	struct cred	*cred)
738 {
739 	int	error = 0;
740 	struct tmpnode *dir, *dotdot;
741 	struct tdirent *tdp;
742 
743 	ASSERT(RW_WRITE_HELD(&toparent->tn_rwlock));
744 
745 	tdp = tmpfs_hash_lookup("..", toparent, 1, &dotdot);
746 	if (tdp == NULL)
747 		return (ENOENT);
748 
749 	ASSERT(dotdot);
750 
751 	if (dotdot == toparent) {
752 		/* root of fs.  search trivially satisfied. */
753 		tmpnode_rele(dotdot);
754 		return (0);
755 	}
756 	for (;;) {
757 		/*
758 		 * Return error for cases like "mv c c/d",
759 		 * "mv c c/d/e" and so on.
760 		 */
761 		if (dotdot == fromtp) {
762 			tmpnode_rele(dotdot);
763 			error = EINVAL;
764 			break;
765 		}
766 		dir = dotdot;
767 		error = tdirlookup(dir, "..", &dotdot, cred);
768 		if (error) {
769 			tmpnode_rele(dir);
770 			break;
771 		}
772 		/*
773 		 * We're okay if we traverse the directory tree up to
774 		 * the root directory and don't run into the
775 		 * parent directory.
776 		 */
777 		if (dir == dotdot) {
778 			tmpnode_rele(dir);
779 			tmpnode_rele(dotdot);
780 			break;
781 		}
782 		tmpnode_rele(dir);
783 	}
784 	return (error);
785 }
786 
/*
 * Make the existing directory entry 'where' in 'toparent' (which names
 * the tmpnode 'to') refer to the source tmpnode 'fromtp' instead.
 * Called from tdirenter() for DE_RENAME when the target name exists.
 *
 * The caller holds toparent->tn_rwlock as writer.  fromtp and to are
 * locked as readers here; to's lock is dropped and re-taken as writer
 * before its link count is decremented.
 *
 * Returns 0 on success, or:
 *   ESAME	source and target are the same tmpnode
 *   EXDEV	source, target and target parent are not on one vfs
 *   EISDIR	target is a directory but source is not
 *   ENOTDIR	source is a directory but target is not
 *   EBUSY	target directory could not be vfs-locked or is mounted on
 *   EEXIST	target directory is not empty
 * or the error from tmp_taccess()/tmp_sticky_remove_access().
 */
static int
tdirrename(
	struct tmpnode *fromparent,	/* parent directory of source */
	struct tmpnode *fromtp,		/* source tmpnode */
	struct tmpnode *toparent,	/* parent directory of target */
	char *nm,			/* entry we are trying to change */
	struct tmpnode *to,		/* target tmpnode */
	struct tdirent *where,		/* target tmpnode directory entry */
	struct cred *cred)		/* credentials */
{
	int error = 0;
	int doingdirectory;
	timestruc_t now;

#if defined(lint)
	nm = nm;
#endif
	ASSERT(RW_WRITE_HELD(&toparent->tn_rwlock));

	/*
	 * Short circuit rename of something to itself.
	 */
	if (fromtp == to)
		return (ESAME);		/* special KLUDGE error code */

	rw_enter(&fromtp->tn_rwlock, RW_READER);
	rw_enter(&to->tn_rwlock, RW_READER);

	/*
	 * Check that everything is on the same filesystem.
	 */
	if (to->tn_vnode->v_vfsp != toparent->tn_vnode->v_vfsp ||
	    to->tn_vnode->v_vfsp != fromtp->tn_vnode->v_vfsp) {
		error = EXDEV;
		goto out;
	}

	/*
	 * Must have write permission to rewrite target entry.
	 * Check for stickiness.
	 */
	if ((error = tmp_taccess(toparent, VWRITE, cred)) != 0 ||
	    (error = tmp_sticky_remove_access(toparent, to, cred)) != 0)
		goto out;

	/*
	 * Ensure source and target are compatible (both directories
	 * or both not directories).  If target is a directory it must
	 * be empty and have no links to it; in addition it must not
	 * be a mount point, and both the source and target must be
	 * writable.
	 */
	doingdirectory = (fromtp->tn_type == VDIR);
	if (to->tn_type == VDIR) {
		if (!doingdirectory) {
			error = EISDIR;
			goto out;
		}
		/*
		 * vn_vfswlock will prevent mounts from using the directory
		 * until we are done.
		 */
		if (vn_vfswlock(TNTOV(to))) {
			error = EBUSY;
			goto out;
		}
		if (vn_mountedvfs(TNTOV(to)) != NULL) {
			vn_vfsunlock(TNTOV(to));
			error = EBUSY;
			goto out;
		}

		mutex_enter(&to->tn_tlock);
		if (to->tn_dirents > 2 || to->tn_nlink > 2) {
			mutex_exit(&to->tn_tlock);
			vn_vfsunlock(TNTOV(to));
			error = EEXIST; /* SIGH should be ENOTEMPTY */
			/*
			 * Update atime because checking tn_dirents is
			 * logically equivalent to reading the directory
			 */
			gethrestime(&to->tn_atime);
			goto out;
		}
		mutex_exit(&to->tn_tlock);
		/*
		 * The vfs lock on "to" stays held from here; it is
		 * released in the doingdirectory block below.
		 */
	} else if (doingdirectory) {
		error = ENOTDIR;
		goto out;
	}

	/*
	 * All checks passed: repoint the target's directory entry at the
	 * source tmpnode.
	 */
	tmpfs_hash_change(where, fromtp);
	gethrestime(&now);
	toparent->tn_mtime = now;
	toparent->tn_ctime = now;

	/*
	 * Upgrade to write lock on "to" (i.e., the target tmpnode).
	 * This is a drop followed by a re-acquire, not an atomic upgrade.
	 */
	rw_exit(&to->tn_rwlock);
	rw_enter(&to->tn_rwlock, RW_WRITER);

	/*
	 * Decrement the link count of the target tmpnode.
	 */
	DECR_COUNT(&to->tn_nlink, &to->tn_tlock);
	to->tn_ctime = now;

	if (doingdirectory) {
		/*
		 * The entry for "to" no longer exists so release the vfslock.
		 */
		vn_vfsunlock(TNTOV(to));

		/*
		 * Decrement the target link count and delete all entries.
		 */
		tdirtrunc(to);
		ASSERT(to->tn_nlink == 0);

		/*
		 * Renaming a directory with the parent different
		 * requires that ".." be rewritten.  The window is
		 * still there for ".." to be inconsistent, but this
		 * is unavoidable, and a lot shorter than when it was
		 * done in a user process.
		 */
		if (fromparent != toparent)
			tdirfixdotdot(fromtp, fromparent, toparent);
	}
out:
	rw_exit(&to->tn_rwlock);
	rw_exit(&fromtp->tn_rwlock);
	return (error);
}
921 
922 static void
923 tdirfixdotdot(
924 	struct tmpnode	*fromtp,	/* child directory */
925 	struct tmpnode	*fromparent,	/* old parent directory */
926 	struct tmpnode	*toparent)	/* new parent directory */
927 {
928 	struct tdirent	*dotdot;
929 
930 	ASSERT(RW_LOCK_HELD(&toparent->tn_rwlock));
931 
932 	/*
933 	 * Increment the link count in the new parent tmpnode
934 	 */
935 	INCR_COUNT(&toparent->tn_nlink, &toparent->tn_tlock);
936 	gethrestime(&toparent->tn_ctime);
937 
938 	dotdot = tmpfs_hash_lookup("..", fromtp, 0, NULL);
939 
940 	ASSERT(dotdot->td_tmpnode == fromparent);
941 	dotdot->td_tmpnode = toparent;
942 
943 	/*
944 	 * Decrement the link count of the old parent tmpnode.
945 	 * If fromparent is NULL, then this is a new directory link;
946 	 * it has no parent, so we need not do anything.
947 	 */
948 	if (fromparent != NULL) {
949 		mutex_enter(&fromparent->tn_tlock);
950 		if (fromparent->tn_nlink != 0) {
951 			fromparent->tn_nlink--;
952 			gethrestime(&fromparent->tn_ctime);
953 		}
954 		mutex_exit(&fromparent->tn_tlock);
955 	}
956 }
957 
/*
 * Allocate a directory entry for 'name' referring to 'tp' and insert it
 * into directory 'dir': into the name hash and into the offset-ordered
 * dirent list at the first free offset "slot" found from the roving
 * slot pointer.
 *
 * For a DE_RENAME of a directory, ".." of 'tp' is first rewritten via
 * tdirfixdotdot().  Despite its name, 'fromtp' is the source's old
 * parent directory: tdirenter() passes its 'fromparent' here and it is
 * handed on as tdirfixdotdot()'s old-parent argument.
 *
 * Returns 0 on success, ENOENT if dir has no entries left (it was
 * removed), EXDEV if tp and dir are on different file systems, or
 * ENOSPC if the entry cannot be allocated.
 */
static int
tdiraddentry(
	struct tmpnode	*dir,	/* target directory to make entry in */
	struct tmpnode	*tp,	/* new tmpnode */
	char		*name,
	enum de_op	op,
	struct tmpnode	*fromtp)
{
	struct tdirent *tdp, *tpdp;
	size_t		namelen, alloc_size;
	timestruc_t	now;

	/*
	 * Make sure the parent directory wasn't removed from
	 * underneath the caller.
	 */
	if (dir->tn_dir == NULL)
		return (ENOENT);

	/*
	 * Check that everything is on the same filesystem.
	 */
	if (tp->tn_vnode->v_vfsp != dir->tn_vnode->v_vfsp)
		return (EXDEV);

	/*
	 * Allocate and initialize directory entry
	 */
	namelen = strlen(name) + 1;
	alloc_size = namelen + sizeof (struct tdirent);
	tdp = tmp_memalloc(alloc_size, 0);
	if (tdp == NULL)
		return (ENOSPC);

	/*
	 * Nothing below can fail, so it is now safe to touch ".." and
	 * the parents' link counts.
	 */
	if ((op == DE_RENAME) && (tp->tn_type == VDIR))
		tdirfixdotdot(tp, fromtp, dir);

	dir->tn_size += alloc_size;
	dir->tn_dirents++;
	tdp->td_tmpnode = tp;
	tdp->td_parent = dir;

	/*
	 * The directory entry and its name were allocated sequentially.
	 */
	tdp->td_name = (char *)tdp + sizeof (struct tdirent);
	(void) strcpy(tdp->td_name, name);

	tmpfs_hash_in(tdp);

	/*
	 * Some utilities expect the size of a directory to remain
	 * somewhat static.  For example, a routine which unlinks
	 * files between calls to readdir(); the size of the
	 * directory changes from underneath it and so the real
	 * directory offset in bytes is invalid.  To circumvent
	 * this problem, we initialize a directory entry with an
	 * phony offset, and use this offset to determine end of
	 * file in tmp_readdir.
	 */
	tpdp = dir->tn_dir->td_prev;
	/*
	 * Install at first empty "slot" in directory list.
	 * A slot is empty when consecutive entries' offsets differ by
	 * more than one; tpdp ends up as the entry to insert after.
	 */
	while (tpdp->td_next != NULL && (tpdp->td_next->td_offset -
	    tpdp->td_offset) <= 1) {
		ASSERT(tpdp->td_next != tpdp);
		ASSERT(tpdp->td_prev != tpdp);
		ASSERT(tpdp->td_next->td_offset > tpdp->td_offset);
		tpdp = tpdp->td_next;
	}
	tdp->td_offset = tpdp->td_offset + 1;

	/*
	 * If we're at the end of the dirent list and the offset (which
	 * is necessarily the largest offset in this directory) is more
	 * than twice the number of dirents, that means the directory is
	 * 50% holes.  At this point we reset the slot pointer back to
	 * the beginning of the directory so we start using the holes.
	 * The idea is that if there are N dirents, there must also be
	 * N holes, so we can satisfy the next N creates by walking at
	 * most 2N entries; thus the average cost of a create is constant.
	 * Note that we use the first dirent's td_prev as the roving
	 * slot pointer; it's ugly, but it saves a word in every dirent.
	 */
	if (tpdp->td_next == NULL && tpdp->td_offset > 2 * dir->tn_dirents)
		dir->tn_dir->td_prev = dir->tn_dir->td_next;
	else
		dir->tn_dir->td_prev = tdp;

	ASSERT(tpdp->td_next != tpdp);
	ASSERT(tpdp->td_prev != tpdp);

	/*
	 * Splice the new entry in right after tpdp.
	 */
	tdp->td_next = tpdp->td_next;
	if (tdp->td_next) {
		tdp->td_next->td_prev = tdp;
	}
	tdp->td_prev = tpdp;
	tpdp->td_next = tdp;

	ASSERT(tdp->td_next != tdp);
	ASSERT(tdp->td_prev != tdp);
	ASSERT(tpdp->td_next != tpdp);
	ASSERT(tpdp->td_prev != tpdp);

	gethrestime(&now);
	dir->tn_mtime = now;
	dir->tn_ctime = now;

	return (0);
}
1069 
1070 static int
1071 tdirmaketnode(
1072 	struct tmpnode *dir,
1073 	struct tmount	*tm,
1074 	struct vattr	*va,
1075 	enum	de_op	op,
1076 	struct tmpnode **newnode,
1077 	struct cred	*cred)
1078 {
1079 	struct tmpnode *tp;
1080 	enum vtype	type;
1081 
1082 	ASSERT(va != NULL);
1083 	ASSERT(op == DE_CREATE || op == DE_MKDIR);
1084 	if (((va->va_mask & AT_ATIME) && TIMESPEC_OVERFLOW(&va->va_atime)) ||
1085 	    ((va->va_mask & AT_MTIME) && TIMESPEC_OVERFLOW(&va->va_mtime)))
1086 		return (EOVERFLOW);
1087 	type = va->va_type;
1088 	tp = tmp_memalloc(sizeof (struct tmpnode), TMP_MUSTHAVE);
1089 	tmpnode_init(tm, tp, va, cred);
1090 
1091 	/* setup normal file/dir's extended attribute directory */
1092 	if (dir->tn_flags & ISXATTR) {
1093 		/* parent dir is , mark file as xattr */
1094 		tp->tn_flags |= ISXATTR;
1095 	}
1096 
1097 
1098 	if (type == VBLK || type == VCHR) {
1099 		tp->tn_vnode->v_rdev = tp->tn_rdev = va->va_rdev;
1100 	} else {
1101 		tp->tn_vnode->v_rdev = tp->tn_rdev = NODEV;
1102 	}
1103 	tp->tn_vnode->v_type = type;
1104 	tp->tn_uid = crgetuid(cred);
1105 
1106 	/*
1107 	 * To determine the group-id of the created file:
1108 	 *   1) If the gid is set in the attribute list (non-Sun & pre-4.0
1109 	 *	clients are not likely to set the gid), then use it if
1110 	 *	the process is privileged, belongs to the target group,
1111 	 *	or the group is the same as the parent directory.
1112 	 *   2) If the filesystem was not mounted with the Old-BSD-compatible
1113 	 *	GRPID option, and the directory's set-gid bit is clear,
1114 	 *	then use the process's gid.
1115 	 *   3) Otherwise, set the group-id to the gid of the parent directory.
1116 	 */
1117 	if ((va->va_mask & AT_GID) &&
1118 	    ((va->va_gid == dir->tn_gid) || groupmember(va->va_gid, cred) ||
1119 	    secpolicy_vnode_create_gid(cred) == 0)) {
1120 		/*
1121 		 * XXX - is this only the case when a 4.0 NFS client, or a
1122 		 * client derived from that code, makes a call over the wire?
1123 		 */
1124 		tp->tn_gid = va->va_gid;
1125 	} else {
1126 		if (dir->tn_mode & VSGID)
1127 			tp->tn_gid = dir->tn_gid;
1128 		else
1129 			tp->tn_gid = crgetgid(cred);
1130 	}
1131 	/*
1132 	 * If we're creating a directory, and the parent directory has the
1133 	 * set-GID bit set, set it on the new directory.
1134 	 * Otherwise, if the user is neither privileged nor a member of the
1135 	 * file's new group, clear the file's set-GID bit.
1136 	 */
1137 	if (dir->tn_mode & VSGID && type == VDIR)
1138 		tp->tn_mode |= VSGID;
1139 	else {
1140 		if ((tp->tn_mode & VSGID) &&
1141 		    secpolicy_vnode_setids_setgids(cred, tp->tn_gid) != 0)
1142 			tp->tn_mode &= ~VSGID;
1143 	}
1144 
1145 	if (va->va_mask & AT_ATIME)
1146 		tp->tn_atime = va->va_atime;
1147 	if (va->va_mask & AT_MTIME)
1148 		tp->tn_mtime = va->va_mtime;
1149 
1150 	if (op == DE_MKDIR)
1151 		tdirinit(dir, tp);
1152 
1153 	*newnode = tp;
1154 	return (0);
1155 }
1156