xref: /titanic_44/usr/src/uts/common/fs/ufs/ufs_dir.c (revision 7663b81667fda05833f609eceac713f0a83c2347)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
28 /*	  All Rights Reserved  	*/
29 
30 /*
31  * University Copyright- Copyright (c) 1982, 1986, 1988
32  * The Regents of the University of California
33  * All Rights Reserved
34  *
35  * University Acknowledgment- Portions of this document are derived from
36  * software developed by the University of California, Berkeley, and its
37  * contributors.
38  */
39 
40 
41 #pragma ident	"%Z%%M%	%I%	%E% SMI"
42 
43 /*
44  * Directory manipulation routines.
45  *
46  * When manipulating directories, the i_rwlock provides serialization
47  * since directories cannot be mmapped. The i_contents lock is redundant.
48  */
49 
50 #include <sys/types.h>
51 #include <sys/t_lock.h>
52 #include <sys/param.h>
53 #include <sys/systm.h>
54 #include <sys/signal.h>
55 #include <sys/cred.h>
56 #include <sys/proc.h>
57 #include <sys/disp.h>
58 #include <sys/user.h>
59 #include <sys/vfs.h>
60 #include <sys/vnode.h>
61 #include <sys/stat.h>
62 #include <sys/mode.h>
63 #include <sys/buf.h>
64 #include <sys/uio.h>
65 #include <sys/dnlc.h>
66 #include <sys/fs/ufs_inode.h>
67 #include <sys/fs/ufs_fs.h>
68 #include <sys/mount.h>
69 #include <sys/fs/ufs_fsdir.h>
70 #include <sys/fs/ufs_trans.h>
71 #include <sys/fs/ufs_panic.h>
72 #include <sys/fs/ufs_quota.h>
73 #include <sys/errno.h>
74 #include <sys/debug.h>
75 #include <vm/seg.h>
76 #include <sys/sysmacros.h>
77 #include <sys/cmn_err.h>
78 #include <sys/cpuvar.h>
79 #include <sys/unistd.h>
80 #include <sys/policy.h>
81 
82 /*
83  * This is required since we're using P2ROUNDUP_TYPED on DIRBLKSIZ
 * (ufs_dirlook() rounds i_size up to a DIRBLKSIZ multiple with it).
 * The check is done at compile time so a bad DIRBLKSIZ stops the build.
84  */
85 #if !ISP2(DIRBLKSIZ)
86 #error	"DIRBLKSIZ not a power of 2"
87 #endif
88 
89 /*
90  * A virgin directory.
 *
 * Template holding just the "." and ".." entries.  The first record is
 * 12 bytes long and the second claims the remaining DIRBLKSIZ - 12 bytes,
 * so together they cover exactly one DIRBLKSIZ block.
 *
 * NOTE(review): the initializer order presumably follows struct
 * dirtemplate as { inode number, record length, name length, name } for
 * each of the two entries, with the inode numbers left 0 to be filled in
 * by whoever copies the template -- confirm against ufs_fsdir.h.
91  */
92 static struct dirtemplate mastertemplate = {
93 	0, 12, 1, ".",
94 	0, DIRBLKSIZ - 12, 2, ".."
95 };
96 
/*
 * LDIRSIZ(len): number of bytes a directory record needs for a name of
 * len characters: the size of struct direct minus its MAXNAMLEN + 1 name
 * bytes, plus the name and its terminating NUL rounded up to a multiple
 * of 4.
 *
 * MAX_DIR_NAME_LEN(len): the inverse bound, i.e. the longest name that
 * fits in a record of len bytes once the fixed part and the terminating
 * NUL are subtracted.
 *
 * Every use of the argument is parenthesized so that expression
 * arguments (e.g. LDIRSIZ(a ? b : c)) expand with the intended
 * precedence.
 */
#define	LDIRSIZ(len) \
	((sizeof (struct direct) - (MAXNAMLEN + 1)) + (((len) + 1 + 3) &~ 3))
#define	MAX_DIR_NAME_LEN(len) \
	(((len) - (sizeof (struct direct) - (MAXNAMLEN + 1))) - 1)
101 
102 /*
103  * The dnlc directory cache allows a 64 bit handle for directory entries.
104  * For ufs we squeeze both the 32 bit inumber and a 32 bit disk offset
105  * into the handle. Note, a 32 bit offset allows a 4GB directory, which
106  * is way beyond what could be cached in memory by the directory
107  * caching routines. So we are quite safe with this limit.
108  * The macros below pack and unpack the handle.
109  */
/*
 * Handle layout: low 32 bits hold the inumber, high 32 bits hold the
 * directory offset.
 *
 * Each expansion is wrapped in its own parentheses so it behaves as a
 * single operand wherever it is used.  INO_OFF_TO_H() truncates ino to
 * 32 bits before merging it, so a wider or sign-extended inumber value
 * cannot spill into the offset half of the handle.
 */
#define	H_TO_INO(h) ((uint32_t)((h) & UINT_MAX))
#define	H_TO_OFF(h) ((off_t)((h) >> 32))
#define	INO_OFF_TO_H(ino, off) \
	((uint64_t)(((uint64_t)(off) << 32) | (uint64_t)(uint32_t)(ino)))
113 
114 /*
115  * The average size of a typical on disk directory entry is about 16 bytes
116  * and so defines AV_DIRECT_SHIFT : log2(16)
117  * This define is only used to approximate the number of entries
118  * in a directory. This is needed for dnlc_dir_start() which will immediately
119  * return an error if the value is not within its acceptable range of
120  * number of files in a directory.
121  */
122 #define	AV_DIRECT_SHIFT 4
123 /*
124  * If the directory size (from i_size) is greater than the ufs_min_dir_cache
125  * tunable then we request dnlc directory caching.
126  * This has found to be profitable after 1024 file names.
 * The default is therefore 1024 entries times the assumed average entry
 * size (1 << AV_DIRECT_SHIFT bytes).
127  */
128 int ufs_min_dir_cache = 1024 << AV_DIRECT_SHIFT;
129 
/*
 * dirchk: when non-zero, ufs_dirlook() runs dirmangled() on every entry
 * it scans rather than only on entries whose d_reclen is misaligned.
 * Enabled by default in DEBUG builds only.
 */
130 #ifdef DEBUG
131 int dirchk = 1;
132 #else /* !DEBUG */
133 int dirchk = 0;
134 #endif /* DEBUG */
/*
 * ufs_negative_cache: when non-zero, ufs_dirlook() enters DNLC_NO_VNODE
 * in the dnlc for a name it failed to find, provided the directory still
 * has a non-zero link count.
 */
135 int ufs_negative_cache = 1;
/*
 * NOTE(review): not referenced in the part of the file visible here;
 * presumably a count of retries taken in ufs_dirremove() -- confirm.
 */
136 uint64_t ufs_dirremove_retry_cnt;
137 
/*
 * Forward declarations of file-local helpers.  They are declared with
 * empty parameter lists (no prototypes), so the compiler does not check
 * argument counts or types at the call sites in this file.
 */
138 static void dirbad();
139 static int ufs_dirrename();
140 static int ufs_diraddentry();
141 static int ufs_dirempty();
142 static int ufs_dirscan();
143 static int ufs_dirclrdotdot();
144 static int ufs_dirfixdotdot();
145 static int ufs_dirpurgedotdot();
146 static int dirprepareentry();
147 static int ufs_dirmakedirect();
148 static int dirbadname();
149 static int dirmangled();
150 
151 /*
152  * Look for a given name in a directory.  On successful return, *ipp
153  * will point to the VN_HELD inode.
 *
 * The lookup proceeds in three stages:
 *   1. unless skipdnlc is set, the per-name dnlc (dnlc_lookup());
 *   2. the per-directory cache anchored at dp->i_danchor
 *	(dnlc_dir_lookup());
 *   3. a scan of the on-disk directory blocks, which may also load the
 *	per-directory cache when the directory is large enough.
 *
 * dp->i_rwlock is taken as reader from stage 2 onwards and is dropped
 * before every return.  It is briefly released while the inode for ".."
 * is fetched, after which the result is re-validated.
 *
 * Returns 0 with *ipp set, ENOTDIR if dp is neither IFDIR nor IFATTRDIR,
 * ENOENT if the name is not present, or an error from ufs_iaccess(),
 * ufs_iget_alloced() or blkatoff().  *ipp should only be relied upon
 * when 0 is returned.
154  */
155 int
156 ufs_dirlook(
157 	struct inode *dp,
158 	char *namep,
159 	struct inode **ipp,
160 	struct cred *cr,
161 	int skipdnlc)			/* skip the 1st level dnlc */
162 {
163 	uint64_t handle;
164 	struct fbuf *fbp;		/* a buffer of directory entries */
165 	struct direct *ep;		/* the current directory entry */
166 	struct vnode *vp;
167 	struct vnode *dvp;		/* directory vnode ptr */
168 	dcanchor_t *dcap;
169 	off_t endsearch;		/* offset to end directory search */
170 	off_t offset;
171 	off_t start_off;		/* starting offset from middle search */
172 	off_t last_offset;		/* last offset */
173 	int entryoffsetinblock;		/* offset of ep in addr's buffer */
174 	int numdirpasses;		/* strategy for directory search */
175 	int namlen;			/* length of name */
176 	int err;
177 	int doingchk;
178 	int i;
179 	int caching;
180 	ino_t ep_ino;			/* entry i number */
181 	ino_t chkino;
182 	ushort_t ep_reclen;		/* direct local d_reclen */
183 
184 	ASSERT(*namep != '\0'); /* All callers ensure *namep is non null */
185 
186 	/*
187 	 * Check accessibility of directory.
188 	 */
189 	if (((dp->i_mode & IFMT) != IFDIR) &&
190 	    ((dp->i_mode & IFMT) != IFATTRDIR))
191 		return (ENOTDIR);
192 
	/* Assignment intended: a non-zero access error is returned as-is. */
193 	if (err = ufs_iaccess(dp, IEXEC, cr))
194 		return (err);
195 
196 	/*
197 	 * Check the directory name lookup cache, first for individual files
198 	 * then for complete directories.
199 	 */
200 	dvp = ITOV(dp);
201 	if (!skipdnlc && (vp = dnlc_lookup(dvp, namep))) {
202 		/* vp is already held from dnlc_lookup */
203 		if (vp == DNLC_NO_VNODE) {
			/* negative cache hit: drop the hold and fail */
204 			VN_RELE(vp);
205 			return (ENOENT);
206 		}
207 		*ipp = VTOI(vp);
208 		return (0);
209 	}
210 
211 	dcap = &dp->i_danchor;
212 
213 	/*
214 	 * Grab the reader lock on the directory data before checking
215 	 * the dnlc to avoid a race with ufs_dirremove() & friends.
216 	 */
217 	rw_enter(&dp->i_rwlock, RW_READER);
218 
219 	switch (dnlc_dir_lookup(dcap, namep, &handle)) {
220 	case DFOUND:
221 		ep_ino = (ino_t)H_TO_INO(handle);
222 		if (dp->i_number == ep_ino) {
223 			VN_HOLD(dvp);	/* want ourself, "." */
224 			*ipp = dp;
225 			rw_exit(&dp->i_rwlock);
226 			return (0);
227 		}
228 		if (namep[0] == '.' && namep[1] == '.' && namep[2] == 0) {
229 			uint64_t handle2;
230 			/*
231 			 * release the lock on the dir we are searching
232 			 * to avoid a deadlock when grabbing the
233 			 * i_contents lock in ufs_iget_alloced().
234 			 */
235 			rw_exit(&dp->i_rwlock);
236 			rw_enter(&dp->i_ufsvfs->vfs_dqrwlock, RW_READER);
237 			err = ufs_iget_alloced(dp->i_vfs, ep_ino, ipp, cr);
238 			rw_exit(&dp->i_ufsvfs->vfs_dqrwlock);
239 			/*
240 			 * must recheck as we dropped dp->i_rwlock
			 * (the cached handle for ".." must be unchanged)
241 			 */
242 			rw_enter(&dp->i_rwlock, RW_READER);
243 			if (!err && (dnlc_dir_lookup(dcap, namep, &handle2)
244 			    == DFOUND) && (handle == handle2)) {
245 				dnlc_update(dvp, namep, ITOV(*ipp));
246 				rw_exit(&dp->i_rwlock);
247 				return (0);
248 			}
249 			/* check failed, read the actual directory */
250 			if (!err) {
251 				VN_RELE(ITOV(*ipp));
252 			}
253 			goto restart;
254 		}
255 		/* usual case of not "." nor ".." */
256 		rw_enter(&dp->i_ufsvfs->vfs_dqrwlock, RW_READER);
257 		err = ufs_iget_alloced(dp->i_vfs, ep_ino, ipp, cr);
258 		rw_exit(&dp->i_ufsvfs->vfs_dqrwlock);
259 		if (err) {
260 			rw_exit(&dp->i_rwlock);
261 			return (err);
262 		}
263 		dnlc_update(dvp, namep, ITOV(*ipp));
264 		rw_exit(&dp->i_rwlock);
265 		return (0);
266 	case DNOENT:
		/* remember the miss in the per-name dnlc if enabled */
267 		if (ufs_negative_cache && (dp->i_nlink > 0)) {
268 			dnlc_enter(dvp, namep, DNLC_NO_VNODE);
269 		}
270 		rw_exit(&dp->i_rwlock);
271 		return (ENOENT);
272 	default:
		/* no answer from the directory cache; scan the directory */
273 		break;
274 	}
	/*
	 * Scan the on-disk directory.  Reached when the directory cache
	 * gave no answer, and re-entered when a ".." recheck fails.
	 * dp->i_rwlock is held as reader here.
	 */
275 restart:
276 
277 	fbp = NULL;
278 	doingchk = 0;
279 	chkino = 0;
280 	caching = 0;
281 
282 	/*
283 	 * Attempt to cache any directories greater than
284 	 * the tunable ufs_min_dir_cache.
285 	 */
286 	if ((dp->i_size >= ufs_min_dir_cache) && (dp->i_cachedir)) {
287 		switch (dnlc_dir_start(dcap, dp->i_size >> AV_DIRECT_SHIFT)) {
288 		case DNOMEM:
289 		case DTOOBIG:
			/* do not try to cache this directory again */
290 			dp->i_cachedir = 0;
291 			break;
292 		case DOK:
293 			caching = 1;
294 			break;
295 		default:
296 			break;
297 		}
298 	}
299 	/*
300 	 * If caching we don't stop when the file has been
301 	 * found, but need to know later, so clear *ipp now
302 	 */
303 	*ipp = NULL;
304 
	/*
	 * Also re-entered with doingchk set to verify ".." after
	 * i_rwlock was dropped; *ipp is deliberately not cleared then.
	 */
305 recheck:
306 	if (caching) {
		/* loading the cache requires a full scan from the start */
307 		offset = 0;
308 		entryoffsetinblock = 0;
309 		numdirpasses = 1;
310 	} else {
311 		/*
312 		 * Take care to look at dp->i_diroff only once, as it
313 		 * may be changing due to other threads/cpus.
314 		 */
315 		offset = dp->i_diroff;
316 		if (offset > dp->i_size) {
317 			offset = 0;
318 		}
319 		if (offset == 0) {
320 			entryoffsetinblock = 0;
321 			numdirpasses = 1;
322 		} else {
			/*
			 * Start where the last successful lookup ended;
			 * a second pass covers [0, start_off) if needed.
			 */
323 			start_off = offset;
324 
325 			entryoffsetinblock = blkoff(dp->i_fs, offset);
326 			if (entryoffsetinblock != 0) {
327 				err = blkatoff(dp, offset, (char **)0, &fbp);
328 				if (err)
329 					goto bad;
330 			}
331 			numdirpasses = 2;
332 		}
333 	}
334 	endsearch = P2ROUNDUP_TYPED(dp->i_size, DIRBLKSIZ, u_offset_t);
335 	namlen = strlen(namep);
336 	last_offset = 0;
337 
338 searchloop:
339 	while (offset < endsearch) {
340 		/*
341 		 * If offset is on a block boundary,
342 		 * read the next directory block.
343 		 * Release previous if it exists.
344 		 */
345 		if (blkoff(dp->i_fs, offset) == 0) {
346 			if (fbp != NULL) {
347 				fbrelse(fbp, S_OTHER);
348 			}
349 			err = blkatoff(dp, offset, (char **)0, &fbp);
350 			if (err)
351 				goto bad;
352 			entryoffsetinblock = 0;
353 		}
354 
355 		/*
356 		 * If the offset to the next entry is invalid or if the
357 		 * next entry is a zero length record or if the record
358 		 * length is invalid, then skip to the next directory
359 		 * block.  Complete validation checks are done if the
360 		 * record length is invalid.
361 		 *
362 		 * Full validation checks are slow so they are disabled
363 		 * by default.  Complete checks can be run by patching
364 		 * "dirchk" to be true.
365 		 *
366 		 * We have to check the validity of entryoffsetinblock
367 		 * here because it can be set to i_diroff above.
		 *
		 * Note the precedence in the test below: && binds tighter
		 * than ||, so dirmangled() is only consulted when dirchk
		 * is set or d_reclen is not a multiple of 4.
368 		 */
369 		ep = (struct direct *)(fbp->fb_addr + entryoffsetinblock);
370 		if ((entryoffsetinblock & 0x3) || ep->d_reclen == 0 ||
371 		    (dirchk || (ep->d_reclen & 0x3)) &&
372 		    dirmangled(dp, ep, entryoffsetinblock, offset)) {
			/* advance to the start of the next DIRBLKSIZ block */
373 			i = DIRBLKSIZ - (entryoffsetinblock & (DIRBLKSIZ - 1));
374 			offset += i;
375 			entryoffsetinblock += i;
376 			if (caching) {
377 				dnlc_dir_purge(dcap);
378 				caching = 0;
379 			}
380 			continue;
381 		}
382 
383 		ep_reclen = ep->d_reclen;
384 
385 		/*
386 		 * Add named entries and free space into the directory cache
387 		 */
388 		if (caching) {
389 			ushort_t extra;
390 			off_t off2;
391 
392 			if (ep->d_ino == 0) {
				/* unused record: all of it is free space */
393 				extra = ep_reclen;
394 				if (offset & (DIRBLKSIZ - 1)) {
					/*
					 * An unused record that does not
					 * start a DIRBLKSIZ block: give up
					 * caching this directory.
					 */
395 					dnlc_dir_purge(dcap);
396 					dp->i_cachedir = 0;
397 					caching = 0;
398 				}
399 			} else {
400 				/*
401 				 * entries hold the previous offset except the
402 				 * 1st which holds the offset + 1
403 				 */
404 				if (offset & (DIRBLKSIZ - 1)) {
405 					off2 = last_offset;
406 				} else {
407 					off2 = offset + 1;
408 				}
409 				caching = (dnlc_dir_add_entry(dcap, ep->d_name,
410 				    INO_OFF_TO_H(ep->d_ino, off2)) == DOK);
				/* slack left in the record after the name */
411 				extra = ep_reclen - DIRSIZ(ep);
412 			}
			/* only record space big enough for a 1-char name */
413 			if (caching && (extra >= LDIRSIZ(1))) {
414 				caching = (dnlc_dir_add_space(dcap, extra,
415 				    (uint64_t)offset) == DOK);
416 			}
417 		}
418 
419 		/*
420 		 * Check for a name match.
421 		 * We have the parent inode read locked with i_rwlock.
422 		 */
423 		if (ep->d_ino && ep->d_namlen == namlen &&
424 		    *namep == *ep->d_name &&	/* fast chk 1st chr */
425 		    bcmp(namep, ep->d_name, (int)ep->d_namlen) == 0) {
426 
427 			/*
428 			 * We have to release the fbp early here to avoid
429 			 * a possible deadlock situation where we have the
430 			 * fbp and want the directory inode and someone doing
431 			 * a ufs_direnter_* has the directory inode and wants
432 			 * the fbp.  XXX - is this still needed?
433 			 */
434 			ep_ino = (ino_t)ep->d_ino;
435 			ASSERT(fbp != NULL);
436 			fbrelse(fbp, S_OTHER);
437 			fbp = NULL;
438 
439 			/*
440 			 * Atomic update (read lock held)
441 			 */
442 			dp->i_diroff = offset;
443 
444 			if (namlen == 2 && namep[0] == '.' && namep[1] == '.') {
445 				struct timeval32 omtime;
446 
447 				if (caching) {
448 					dnlc_dir_purge(dcap);
449 					caching = 0;
450 				}
451 				if (doingchk) {
452 					/*
453 					 * if the inumber didn't change
454 					 * continue with already found inode.
455 					 */
456 					if (ep_ino == chkino)
457 						goto checkok;
458 					else {
459 						VN_RELE(ITOV(*ipp));
460 						/* *ipp is nulled at restart */
461 						goto restart;
462 					}
463 				}
464 				/*
465 				 * release the lock on the dir we are searching
466 				 * to avoid a deadlock when grabbing the
467 				 * i_contents lock in ufs_iget_alloced().
468 				 */
469 				omtime = dp->i_mtime;
470 				rw_exit(&dp->i_rwlock);
471 				rw_enter(&dp->i_ufsvfs->vfs_dqrwlock,
472 						RW_READER);
473 				err = ufs_iget_alloced(dp->i_vfs, ep_ino, ipp,
474 				    cr);
475 				rw_exit(&dp->i_ufsvfs->vfs_dqrwlock);
476 				rw_enter(&dp->i_rwlock, RW_READER);
477 				if (err)
478 					goto bad;
479 				/*
480 				 * Since we released the lock on the directory,
481 				 * we must check that the same inode is still
482 				 * the ".." entry for this directory.
483 				 */
484 				/*CSTYLED*/
485 				if (timercmp(&omtime, &dp->i_mtime, !=)) {
486 					/*
487 					 * Modification time changed on the
488 					 * directory, we must go check if
489 					 * the inumber changed for ".."
490 					 */
491 					doingchk = 1;
492 					chkino = ep_ino;
493 					entryoffsetinblock = 0;
494 					if (caching) {
495 						/*
496 						 * Forget directory caching
497 						 * for this rare case
498 						 */
499 						dnlc_dir_purge(dcap);
500 						caching = 0;
501 					}
502 					goto recheck;
503 				}
504 			} else if (dp->i_number == ep_ino) {
505 				VN_HOLD(dvp);	/* want ourself, "." */
506 				*ipp = dp;
507 				if (caching) {
508 					dnlc_dir_purge(dcap);
509 					caching = 0;
510 				}
511 			} else {
512 				rw_enter(&dp->i_ufsvfs->vfs_dqrwlock,
513 						RW_READER);
514 				err = ufs_iget_alloced(dp->i_vfs, ep_ino, ipp,
515 				    cr);
516 				rw_exit(&dp->i_ufsvfs->vfs_dqrwlock);
517 				if (err)
518 					goto bad;
519 			}
520 checkok:
521 			ASSERT(*ipp);
522 			dnlc_update(dvp, namep, ITOV(*ipp));
523 			/*
524 			 * If we are not caching then just return the entry
525 			 * otherwise complete loading up the cache
526 			 */
527 			if (!caching) {
528 				rw_exit(&dp->i_rwlock);
529 				return (0);
530 			}
			/* re-acquire the block released above and carry on */
531 			err = blkatoff(dp, offset, (char **)0, &fbp);
532 			if (err)
533 				goto bad;
534 		}
535 		last_offset = offset;
536 		offset += ep_reclen;
537 		entryoffsetinblock += ep_reclen;
538 	}
539 	/*
540 	 * If we started in the middle of the directory and failed
541 	 * to find our target, we must check the beginning as well.
542 	 */
543 	if (numdirpasses == 2) {
544 		numdirpasses--;
545 		offset = 0;
546 		endsearch = start_off;
547 		goto searchloop;
548 	}
549 
550 	/*
551 	 * If whole directory caching is on (or was originally on) then
552 	 * the entry may have been found.
553 	 */
554 	if (*ipp == NULL) {
555 		err = ENOENT;
556 		if (ufs_negative_cache && (dp->i_nlink > 0)) {
557 			dnlc_enter(dvp, namep, DNLC_NO_VNODE);
558 		}
559 	}
560 	if (caching) {
		/* the whole directory was scanned: mark the cache complete */
561 		dnlc_dir_complete(dcap);
562 		caching = 0;
563 	}
564 
	/* Common exit; also reached by falling through on success. */
565 bad:
566 	if (err && *ipp) {
567 		/*
568 		 * err and *ipp can both be set if we were attempting to
569 		 * cache the directory, and we found the entry, then later
570 		 * while trying to complete the directory cache encountered
571 		 * a error (eg reading a directory sector).
572 		 */
573 		VN_RELE(ITOV(*ipp));
574 		*ipp = NULL;
575 	}
576 
577 	if (fbp)
578 		fbrelse(fbp, S_OTHER);
579 	rw_exit(&dp->i_rwlock);
	/* caching still set means the load was cut short by an error */
580 	if (caching)
581 		dnlc_dir_purge(dcap);
582 	return (err);
583 }
584 
585 /*
586  * Write a new directory entry for DE_CREATE or DE_MKDIR operations.
 *
 * The caller must hold tdp->i_rwlock as writer (asserted below).  For the
 * names "." and ".." the lock is dropped around a ufs_dirlook() call and
 * re-taken as writer before returning.
 *
 * flags carries IQUIET plus the "noentry" indication: any bits other than
 * IQUIET are passed to ufs_dircheckforname() as its noentry argument.
 *
 * Returns 0 with *ipp set to the newly made inode, EEXIST with *ipp set
 * to the existing inode when the name is already present, EINVAL for
 * entry types not permitted in an IFATTRDIR directory, EACCES if the
 * name contains '/', ENOENT if the directory has no links left, ENOTDIR
 * if tdp is not a directory, or an error from the helpers called below.
587  */
588 int
589 ufs_direnter_cm(
590 	struct inode *tdp,	/* target directory to make entry in */
591 	char *namep,		/* name of entry */
592 	enum de_op op,		/* entry operation */
593 	struct vattr *vap,	/* attributes if new inode needed */
594 	struct inode **ipp,	/* return entered inode here */
595 	struct cred *cr,	/* user credentials */
596 	int flags)		/* no entry exists */
597 {
598 	struct inode *tip;	/* inode of (existing) target file */
599 	char *s;
600 	struct slot slot;	/* slot info to pass around */
601 	int namlen;		/* length of name */
602 	int err;		/* error number */
603 	struct inode *nip;	/* new inode */
604 	int do_rele_nip = 0;	/* release nip */
605 	int noentry = flags & ~IQUIET;
606 	int quiet = flags & IQUIET;	/* Suppress out of inodes message */
607 
608 	ASSERT(RW_WRITE_HELD(&tdp->i_rwlock));
609 
	/*
	 * In an IFATTRDIR directory, refuse DE_MKDIR and refuse new
	 * character, block, door, socket and fifo nodes.
	 */
610 	if (((tdp->i_mode & IFMT) == IFATTRDIR) && ((op == DE_MKDIR) ||
611 	    ((vap->va_type == VCHR) || (vap->va_type == VBLK) ||
612 	    (vap->va_type == VDOOR) || (vap->va_type == VSOCK) ||
613 	    (vap->va_type == VFIFO))))
614 		return (EINVAL);
615 
616 	/* don't allow '/' characters in pathname component */
	/* (the same pass computes namlen) */
617 	for (s = namep, namlen = 0; *s; s++, namlen++)
618 		if (*s == '/')
619 			return (EACCES);
620 	ASSERT(namlen);
621 
622 	/*
623 	 * If name is "." or ".." then if this is a create look it up
624 	 * and return EEXIST.
625 	 */
626 	if (namep[0] == '.' &&
627 	    (namlen == 1 || (namlen == 2 && namep[1] == '.'))) {
628 		/*
629 		 * ufs_dirlook will acquire the i_rwlock
		 * as reader, so our writer hold must be dropped first
		 * and re-taken on both the error and success paths.
630 		 */
631 		rw_exit(&tdp->i_rwlock);
632 		if (err = ufs_dirlook(tdp, namep, ipp, cr, 0)) {
633 			rw_enter(&tdp->i_rwlock, RW_WRITER);
634 			return (err);
635 		}
636 		rw_enter(&tdp->i_rwlock, RW_WRITER);
637 		return (EEXIST);
638 	}
639 
640 	/*
641 	 * If target directory has not been removed, then we can consider
642 	 * allowing file to be created.
643 	 */
644 	if (tdp->i_nlink <= 0) {
645 		return (ENOENT);
646 	}
647 
648 	/*
649 	 * Check accessibility of directory.
650 	 */
651 	if (((tdp->i_mode & IFMT) != IFDIR) &&
652 	    ((tdp->i_mode & IFMT) != IFATTRDIR)) {
653 		return (ENOTDIR);
654 	}
655 
656 	/*
657 	 * Execute access is required to search the directory.
658 	 */
659 	if (err = ufs_iaccess(tdp, IEXEC, cr)) {
660 		return (err);
661 	}
662 
663 	/*
664 	 * Search for the entry. Return VN_HELD tip if found.
	 * From here on every exit goes through "out" so that
	 * vfs_dqrwlock and i_contents are released.
665 	 */
666 	tip = NULL;
667 	slot.fbp = NULL;
668 	slot.status = NONE;
669 	rw_enter(&tdp->i_ufsvfs->vfs_dqrwlock, RW_READER);
670 	rw_enter(&tdp->i_contents, RW_WRITER);
671 	err = ufs_dircheckforname(tdp, namep, namlen, &slot, &tip, cr, noentry);
672 	if (err)
673 		goto out;
674 	if (tip) {
		/* name already present: hand the held inode back */
675 		ASSERT(!noentry);
676 		*ipp = tip;
677 		err = EEXIST;
678 	} else {
679 		/*
680 		 * The entry does not exist. Check write permission in
681 		 * directory to see if entry can be created.
682 		 */
683 		if (err = ufs_iaccess(tdp, IWRITE, cr))
684 			goto out;
685 		/*
686 		 * Make new inode and directory entry.
		 * IQUIET (if requested) is set on the directory inode for
		 * the duration and cleared again at "out".
		 *
		 * NOTE(review): nip is read right after a failed
		 * ufs_dirmakeinode(); this relies on the callee always
		 * storing to nip, which cannot be confirmed from the
		 * code visible here.
687 		 */
688 		tdp->i_flag |= quiet;
689 		if (err = ufs_dirmakeinode(tdp, &nip, vap, op, cr)) {
690 			if (nip != NULL)
691 				do_rele_nip = 1;
692 			goto out;
693 		}
694 		if (err = ufs_diraddentry(tdp, namep, op,
695 		    namlen, &slot, nip, NULL, cr)) {
696 			/*
697 			 * Unmake the inode we just made.
			 * For a new directory the parent's link count is
			 * also dropped again.
698 			 */
699 			rw_enter(&nip->i_contents, RW_WRITER);
700 			if (((nip->i_mode & IFMT) == IFDIR) ||
701 			    ((nip->i_mode & IFMT) == IFATTRDIR)) {
702 				tdp->i_nlink--;
703 				ufs_setreclaim(tdp);
704 				tdp->i_flag |= ICHG;
705 				tdp->i_seq++;
706 				TRANS_INODE(tdp->i_ufsvfs, tdp);
707 				ITIMES_NOLOCK(tdp);
708 			}
709 			nip->i_nlink = 0;
710 			ufs_setreclaim(nip);
711 			TRANS_INODE(nip->i_ufsvfs, nip);
712 			nip->i_flag |= ICHG;
713 			nip->i_seq++;
714 			ITIMES_NOLOCK(nip);
715 			rw_exit(&nip->i_contents);
716 			do_rele_nip = 1;
717 		} else {
718 			*ipp = nip;
719 		}
720 	}
721 
722 out:
723 	if (slot.fbp)
724 		fbrelse(slot.fbp, S_OTHER);
725 
726 	tdp->i_flag &= ~quiet;
727 	rw_exit(&tdp->i_contents);
728 
729 	/*
730 	 * Drop vfs_dqrwlock before calling VN_RELE() on nip to
731 	 * avoid deadlock since ufs_delete() grabs vfs_dqrwlock as reader.
732 	 */
733 	rw_exit(&tdp->i_ufsvfs->vfs_dqrwlock);
734 
735 	if (do_rele_nip) {
736 		VN_RELE(ITOV(nip));
737 	}
738 
739 	return (err);
740 }
741 
742 /*
743  * Write a new directory entry for DE_LINK, DE_SYMLINK or DE_RENAME operations.
744  * If tvpp is non-null, return with the pointer to the target vnode.
 *
 * The caller must hold tdp->i_rwlock as writer (asserted below).
 *
 * For DE_LINK and DE_RENAME the source inode's link count is bumped and
 * pushed to disk before the directory is touched; if anything fails after
 * that point the bump is undone at "out2".
 *
 * *tvpp is only written when the call succeeds and the name already
 * existed in tdp; the caller then owns the hold on that vnode.
 *
 * Returns 0 on success, EACCES if the name contains '/', EINVAL for a
 * rename onto "." or "..", EEXIST for a link or symlink onto an existing
 * name (including "." and ".."), ENOENT if sip or tdp has no links left,
 * EMLINK if sip is already at MAXLINK, ENOTDIR if tdp is not a directory,
 * or an error from the helpers called below.
745  */
746 int
747 ufs_direnter_lr(
748 	struct inode *tdp,	/* target directory to make entry in */
749 	char *namep,		/* name of entry */
750 	enum de_op op,		/* entry operation */
751 	struct inode *sdp,	/* source inode parent if rename */
752 	struct inode *sip,	/* source inode */
753 	struct cred *cr,	/* user credentials */
754 	vnode_t **tvpp)		/* Return: (held) vnode of (existing) target */
755 {
756 	struct inode *tip;	/* inode of (existing) target file */
757 	char *s;
758 	struct slot slot;	/* slot info to pass around */
759 	int namlen;		/* length of name */
760 	int err;		/* error number */
761 
762 	/* don't allow '/' characters in pathname component */
	/* (the same pass computes namlen) */
763 	for (s = namep, namlen = 0; *s; s++, namlen++)
764 		if (*s == '/')
765 			return (EACCES);
766 	ASSERT(namlen);
767 	ASSERT(RW_WRITE_HELD(&tdp->i_rwlock));
768 
769 	/*
770 	 * The names "." and ".." can never be a link, symlink or rename
771 	 * target: fail with EINVAL for a rename and EEXIST otherwise.
772 	 */
773 	if (namep[0] == '.' &&
774 	    (namlen == 1 || (namlen == 2 && namep[1] == '.'))) {
775 		if (op == DE_RENAME) {
776 			return (EINVAL);	/* *SIGH* should be ENOTEMPTY */
777 		}
778 		return (EEXIST);
779 	}
780 	/*
781 	 * For link and rename lock the source entry and check the link count
782 	 * to see if it has been removed while it was unlocked.  If not, we
783 	 * increment the link count and force the inode to disk to make sure
784 	 * that it is there before any directory entry that points to it.
785 	 *
786 	 * In the case of a symbolic link, we are dealing with a new inode
787 	 * which does not yet have any links.  We've created it with a link
788 	 * count of 1, and we don't want to increment it since this will be
789 	 * its first link.
790 	 *
791 	 * We are about to push the inode to disk. We make sure
792 	 * that the inode's data blocks are flushed first so the
793 	 * inode and its data blocks are always in sync.  This
794 	 * adds some robustness in the event of a power failure
795 	 * or panic where sync fails. If we panic before the
796 	 * inode is updated, then the inode still refers to the
797 	 * old data blocks (or none for a new file). If we panic
798 	 * after the inode is updated, then the inode refers to
799 	 * the new data blocks.
800 	 *
801 	 * We do this before grabbing the i_contents lock because
802 	 * ufs_syncip() will want that lock. We could do the data
803 	 * syncing after the removal checks, but upon return from
804 	 * the data sync we would have to repeat the removal
805 	 * checks.
806 	 */
807 	if (err = TRANS_SYNCIP(sip, 0, I_DSYNC, TOP_FSYNC)) {
808 		return (err);
809 	}
810 
811 	rw_enter(&sip->i_contents, RW_WRITER);
812 	if (sip->i_nlink <= 0) {
813 		rw_exit(&sip->i_contents);
814 		return (ENOENT);
815 	}
816 	if (sip->i_nlink == MAXLINK) {
817 		rw_exit(&sip->i_contents);
818 		return (EMLINK);
819 	}
820 
821 	/*
822 	 * Sync the indirect blocks associated with the file
823 	 * for the same reasons as described above.  Since this
824 	 * call wants the i_contents lock held for it we can do
825 	 * this here with no extra work.
826 	 */
827 	if (err = ufs_sync_indir(sip)) {
828 		rw_exit(&sip->i_contents);
829 		return (err);
830 	}
831 
	/*
	 * Bump the link count (not for a symlink, see above) and write
	 * the inode synchronously.  Failures from here on must leave
	 * through "out" or "out2" so the bump is undone.
	 */
832 	if (op != DE_SYMLINK)
833 		sip->i_nlink++;
834 	TRANS_INODE(sip->i_ufsvfs, sip);
835 	sip->i_flag |= ICHG;
836 	sip->i_seq++;
837 	ufs_iupdat(sip, I_SYNC);
838 	rw_exit(&sip->i_contents);
839 
840 	/*
841 	 * If target directory has not been removed, then we can consider
842 	 * allowing file to be created.
843 	 */
844 	if (tdp->i_nlink <= 0) {
845 		err = ENOENT;
846 		goto out2;
847 	}
848 	/*
849 	 * Check accessibility of directory.
850 	 */
851 	if (((tdp->i_mode & IFMT) != IFDIR) &&
852 	    (tdp->i_mode & IFMT) != IFATTRDIR) {
853 		err = ENOTDIR;
854 		goto out2;
855 	}
856 	/*
857 	 * Execute access is required to search the directory.
858 	 */
859 	if (err = ufs_iaccess(tdp, IEXEC, cr)) {
860 		goto out2;
861 	}
862 
863 	/*
864 	 * Search for the entry. Return VN_HELD tip if found.
	 * vfs_dqrwlock and i_contents are held from here until "out".
865 	 */
866 	tip = NULL;
867 	slot.status = NONE;
868 	slot.fbp = NULL;
869 	rw_enter(&tdp->i_ufsvfs->vfs_dqrwlock, RW_READER);
870 	rw_enter(&tdp->i_contents, RW_WRITER);
871 	err = ufs_dircheckforname(tdp, namep, namlen, &slot, &tip, cr, 0);
872 	if (err)
873 		goto out;
874 
875 	if (tip) {
		/* the name already exists in the target directory */
876 		switch (op) {
877 		case DE_RENAME:
878 			err = ufs_dirrename(sdp, sip, tdp, namep,
879 			    tip, &slot, cr);
880 			break;
881 
882 		case DE_LINK:
883 		case DE_SYMLINK:
884 			/*
885 			 * Can't link to an existing file.
886 			 */
887 			err = EEXIST;
888 			break;
889 		default:
890 			break;
891 		}
892 	} else {
893 		/*
894 		 * The entry does not exist. Check write permission in
895 		 * directory to see if entry can be created.
896 		 */
897 		if (err = ufs_iaccess(tdp, IWRITE, cr))
898 			goto out;
899 		err = ufs_diraddentry(tdp, namep, op, namlen, &slot, sip, sdp,
900 		    cr);
901 	}
902 
903 out:
904 	if (slot.fbp)
905 		fbrelse(slot.fbp, S_OTHER);
906 
907 	rw_exit(&tdp->i_contents);
908 
909 	/*
910 	 * Drop vfs_dqrwlock before calling VN_RELE() on tip to
911 	 * avoid deadlock since ufs_delete() grabs vfs_dqrwlock as reader.
912 	 */
913 	rw_exit(&tdp->i_ufsvfs->vfs_dqrwlock);
914 
915 	/*
916 	 * If we renamed a file over the top of an existing file,
917 	 * or linked a file to an existing file (or tried to),
918 	 * then set *tvpp to the target vnode, if tvpp is non-null
919 	 * otherwise, release and delete (or just release) the inode.
920 	 *
921 	 * N.B., by returning the target's vnode pointer to the caller,
922 	 * that caller becomes responsible for doing the VN_RELE.
923 	 */
924 	if (tip) {
925 		if ((err == 0) && (tvpp != NULL)) {
926 			*tvpp = ITOV(tip);
927 		} else {
928 			VN_RELE(ITOV(tip));
929 		}
930 	}
931 
	/*
	 * Entered directly by the early failures above, before tip and
	 * slot were set up and before any tdp lock was taken here.
	 */
932 out2:
933 	if (err) {
934 		/*
935 		 * Undo bumped link count.
936 		 */
937 		if (op != DE_SYMLINK) {
938 			rw_enter(&sip->i_contents, RW_WRITER);
939 			sip->i_nlink--;
940 			ufs_setreclaim(sip);
941 			TRANS_INODE(sip->i_ufsvfs, sip);
942 			sip->i_flag |= ICHG;
943 			sip->i_seq++;
944 			ITIMES_NOLOCK(sip);
945 			rw_exit(&sip->i_contents);
946 		}
947 	}
948 	return (err);
949 }
950 
951 /*
952  * Check for the existence of a name in a directory (unless noentry
953  * is set), or else of an empty
954  * slot in which an entry may be made.  If the requested name is found,
955  * then on return *ipp points at the inode and *offp contains
956  * its offset in the directory.  If the name is not found, then *ipp
957  * will be NULL and *slotp will contain information about a directory slot in
958  * which an entry may be made (either an empty slot, or the first position
959  * past the end of the directory).
960  * The target directory inode (tdp) is supplied write locked (i_rwlock).
961  *
962  * This may not be used on "." or "..", but aliases of "." are ok.
963  */
964 int
965 ufs_dircheckforname(
966 	struct inode *tdp,	/* inode of directory being checked */
967 	char *namep,		/* name we're checking for */
968 	int namlen,		/* length of name, excluding null */
969 	struct slot *slotp,	/* slot structure */
970 	struct inode **ipp,	/* return inode if we find one */
971 	struct cred *cr,
972 	int noentry)		/* noentry - just look for space */
973 {
974 	uint64_t handle;
975 	struct fbuf *fbp;	/* pointer to directory block */
976 	struct direct *ep;	/* directory entry */
977 	struct direct *nep;	/* next directory entry */
978 	dcanchor_t *dcap;
979 	vnode_t *dvp;		/* directory vnode ptr */
980 	off_t dirsize;		/* size of the directory */
981 	off_t offset;		/* offset in the directory */
982 	off_t last_offset;	/* last offset */
983 	off_t enduseful;	/* pointer past last used dir slot */
984 	int entryoffsetinblk;	/* offset of ep in fbp's buffer */
985 	int i;			/* length of mangled entry */
986 	int needed;
987 	int err;
988 	int first;
989 	int caching;
990 	int stat;
991 	ino_t ep_ino;
992 	slotstat_t initstat = slotp->status;
993 
994 	ASSERT(RW_WRITE_HELD(&tdp->i_rwlock));
995 	ASSERT(RW_WRITE_HELD(&tdp->i_contents));
996 	ASSERT(*ipp == NULL);
997 	fbp = NULL;
998 
999 	/*
1000 	 * First check if there is a complete cache of the directory.
1001 	 */
1002 	dvp = ITOV(tdp);
1003 
1004 	dcap = &tdp->i_danchor;
1005 	if (noentry) {
1006 		/*
1007 		 * We know from the 1st level dnlc cache that the entry
1008 		 * doesn't exist, so don't bother searching the directory
1009 		 * cache, but just look for space (possibly in the directory
1010 		 * cache).
1011 		 */
1012 		stat = DNOENT;
1013 	} else {
1014 		stat = dnlc_dir_lookup(dcap, namep, &handle);
1015 	}
1016 	switch (stat) {
1017 	case DFOUND:
1018 		ep_ino = (ino_t)H_TO_INO(handle);
1019 		if (tdp->i_number == ep_ino) {
1020 			*ipp = tdp;	/* we want ourself, ie "." */
1021 			VN_HOLD(dvp);
1022 		} else {
1023 			err = ufs_iget_alloced(tdp->i_vfs, ep_ino, ipp, cr);
1024 			if (err)
1025 				return (err);
1026 		}
1027 		offset = H_TO_OFF(handle);
1028 		first = 0;
1029 		if (offset & 1) {
1030 			/* This is the first entry in the block */
1031 			first = 1;
1032 			offset -= 1;
1033 			ASSERT((offset & (DIRBLKSIZ - 1)) == 0);
1034 		}
1035 		err = blkatoff(tdp, offset, (char **)&ep, &fbp);
1036 		if (err) {
1037 			VN_RELE(ITOV(*ipp));
1038 			*ipp = NULL;
1039 			return (err);
1040 		}
1041 		/*
1042 		 * Check the validity of the entry.
1043 		 * If it's bad, then throw away the cache and
1044 		 * continue without it. The dirmangled() routine
1045 		 * will then be called upon it.
1046 		 */
1047 		if ((ep->d_reclen == 0) || (ep->d_reclen & 0x3)) {
1048 			VN_RELE(ITOV(*ipp));
1049 			*ipp = NULL;
1050 			dnlc_dir_purge(dcap);
1051 			break;
1052 		}
1053 		/*
1054 		 * Remember the returned offset is the offset of the
1055 		 * preceding record (unless this is the 1st record
1056 		 * in the DIRBLKSIZ sized block (disk sector)), then it's
1057 		 * offset + 1. Note, no real offsets are on odd boundaries.
1058 		 */
1059 		if (first) {
1060 			ASSERT((offset & (DIRBLKSIZ - 1)) == 0);
1061 			slotp->offset = offset;
1062 			slotp->size = 0;
1063 			slotp->ep = ep;
1064 		} else {
1065 			/* get the next entry */
1066 			nep = (struct direct *)((char *)ep + ep->d_reclen);
1067 			/*
1068 			 * Check the validity of this entry as well
1069 			 * If it's bad, then throw away the cache and
1070 			 * continue without it. The dirmangled() routine
1071 			 * will then be called upon it.
1072 			 */
1073 			if ((nep->d_reclen == 0) || (nep->d_reclen & 0x3) ||
1074 			    (nep->d_ino != ep_ino)) {
1075 				VN_RELE(ITOV(*ipp));
1076 				*ipp = NULL;
1077 				dnlc_dir_purge(dcap);
1078 				break;
1079 			}
1080 			slotp->offset = offset + ep->d_reclen;
1081 			slotp->size = ep->d_reclen;
1082 			slotp->ep = nep;
1083 		}
1084 		slotp->status = EXIST;
1085 		slotp->fbp = fbp;
1086 		slotp->endoff = 0;
1087 		slotp->cached = 1;
1088 		dnlc_update(dvp, namep, ITOV(*ipp));
1089 		return (0);
1090 	case DNOENT:
1091 		/*
1092 		 * The caller gets to set the initial slot status to
1093 		 * indicate whether it's interested in getting a
1094 		 * empty slot. For example, the status can be set
1095 		 * to FOUND when an entry is being deleted.
1096 		 */
1097 		ASSERT(slotp->fbp == NULL);
1098 		if (slotp->status == FOUND) {
1099 			return (0);
1100 		}
1101 		switch (dnlc_dir_rem_space_by_len(dcap, LDIRSIZ(namlen),
1102 		    &handle)) {
1103 		case DFOUND:
1104 			offset = (off_t)handle;
1105 			err = blkatoff(tdp, offset, (char **)&ep, &fbp);
1106 			if (err) {
1107 				dnlc_dir_purge(dcap);
1108 				ASSERT(*ipp == NULL);
1109 				return (err);
1110 			}
1111 			/*
1112 			 * Check the validity of the entry.
1113 			 * If it's bad, then throw away the cache and
1114 			 * continue without it. The dirmangled() routine
1115 			 * will then be called upon it.
1116 			 */
1117 			if ((ep->d_reclen == 0) || (ep->d_reclen & 0x3)) {
1118 				dnlc_dir_purge(dcap);
1119 				break;
1120 			}
1121 			/*
1122 			 * Remember the returned offset is the offset of the
1123 			 * containing record.
1124 			 */
1125 			slotp->status = FOUND;
1126 			slotp->ep = ep;
1127 			slotp->offset = offset;
1128 			slotp->fbp = fbp;
1129 			slotp->size = ep->d_reclen;
1130 			/*
1131 			 * Set end offset to 0. Truncation is handled
1132 			 * because the dnlc cache will blow away the
1133 			 * cached directory when an entry is removed
1134 			 * that drops the entries left to less than half
1135 			 * the minumum number (dnlc_min_dir_cache).
1136 			 */
1137 			slotp->endoff = 0;
1138 			slotp->cached = 1;
1139 			return (0);
1140 		case DNOENT:
1141 			slotp->status = NONE;
1142 			slotp->offset = P2ROUNDUP_TYPED(tdp->i_size,
1143 			    DIRBLKSIZ, u_offset_t);
1144 			slotp->size = DIRBLKSIZ;
1145 			slotp->endoff = 0;
1146 			slotp->cached = 1;
1147 			return (0);
1148 		default:
1149 			break;
1150 		}
1151 		break;
1152 	}
1153 	slotp->cached = 0;
1154 	caching = NULL;
1155 	if (tdp->i_cachedir && !noentry) {
1156 		/*
1157 		 * Attempt to cache any directories greater than
1158 		 * the tunable ufs_min_cache_dir.
1159 		 */
1160 		if (tdp->i_size >= ufs_min_dir_cache) {
1161 			switch (dnlc_dir_start(dcap,
1162 			    tdp->i_size >> AV_DIRECT_SHIFT)) {
1163 			case DNOMEM:
1164 			case DTOOBIG:
1165 				tdp->i_cachedir = 0;
1166 				break;
1167 			case DOK:
1168 				caching = 1;
1169 				break;
1170 			default:
1171 				break;
1172 			}
1173 		}
1174 	}
1175 
1176 	/*
1177 	 * No point in using i_diroff since we must search whole directory
1178 	 */
1179 	dirsize = P2ROUNDUP_TYPED(tdp->i_size, DIRBLKSIZ, u_offset_t);
1180 	enduseful = 0;
1181 	offset = last_offset = 0;
1182 	entryoffsetinblk = 0;
1183 	needed = (int)LDIRSIZ(namlen);
1184 	while (offset < dirsize) {
1185 		/*
1186 		 * If offset is on a block boundary,
1187 		 * read the next directory block.
1188 		 * Release previous if it exists.
1189 		 */
1190 		if (blkoff(tdp->i_fs, offset) == 0) {
1191 			if (fbp != NULL)
1192 				fbrelse(fbp, S_OTHER);
1193 
1194 			err = blkatoff(tdp, offset, (char **)0, &fbp);
1195 			if (err) {
1196 				ASSERT(*ipp == NULL);
1197 				if (caching) {
1198 					dnlc_dir_purge(dcap);
1199 				}
1200 				return (err);
1201 			}
1202 			entryoffsetinblk = 0;
1203 		}
1204 		/*
1205 		 * If still looking for a slot, and at a DIRBLKSIZ
1206 		 * boundary, have to start looking for free space
1207 		 * again.
1208 		 */
1209 		if (slotp->status == NONE &&
1210 		    (entryoffsetinblk & (DIRBLKSIZ - 1)) == 0) {
1211 			slotp->offset = -1;
1212 		}
1213 		/*
1214 		 * If the next entry is a zero length record or if the
1215 		 * record length is invalid, then skip to the next
1216 		 * directory block.  Complete validation checks are
1217 		 * done if the record length is invalid.
1218 		 *
1219 		 * Full validation checks are slow so they are disabled
1220 		 * by default.  Complete checks can be run by patching
1221 		 * "dirchk" to be true.
1222 		 *
1223 		 * We do not have to check the validity of
1224 		 * entryoffsetinblk here because it starts out as zero
1225 		 * and is only incremented by d_reclen values that we
1226 		 * validate here.
1227 		 */
1228 		ep = (struct direct *)(fbp->fb_addr + entryoffsetinblk);
1229 		if (ep->d_reclen == 0 ||
1230 		    (dirchk || (ep->d_reclen & 0x3)) &&
1231 		    dirmangled(tdp, ep, entryoffsetinblk, offset)) {
1232 			i = DIRBLKSIZ - (entryoffsetinblk & (DIRBLKSIZ - 1));
1233 			offset += i;
1234 			entryoffsetinblk += i;
1235 			if (caching) {
1236 				dnlc_dir_purge(dcap);
1237 				caching = 0;
1238 			}
1239 			continue;
1240 		}
1241 
1242 		/*
1243 		 * Add named entries and free space into the directory cache
1244 		 */
1245 		if (caching) {
1246 			ushort_t extra;
1247 			off_t off2;
1248 
1249 			if (ep->d_ino == 0) {
1250 				extra = ep->d_reclen;
1251 				if (offset & (DIRBLKSIZ - 1)) {
1252 					dnlc_dir_purge(dcap);
1253 					caching = 0;
1254 				}
1255 			} else {
1256 				/*
1257 				 * entries hold the previous offset if
1258 				 * not the 1st one
1259 				 */
1260 				if (offset & (DIRBLKSIZ - 1)) {
1261 					off2 = last_offset;
1262 				} else {
1263 					off2 = offset + 1;
1264 				}
1265 				caching = (dnlc_dir_add_entry(dcap, ep->d_name,
1266 				    INO_OFF_TO_H(ep->d_ino, off2)) == DOK);
1267 				extra = ep->d_reclen - DIRSIZ(ep);
1268 			}
1269 			if (caching && (extra >= LDIRSIZ(1))) {
1270 				caching = (dnlc_dir_add_space(dcap, extra,
1271 				    (uint64_t)offset) == DOK);
1272 			}
1273 		}
1274 
1275 		/*
1276 		 * If an appropriate sized slot has not yet been found,
1277 		 * check to see if one is available.
1278 		 */
1279 		if ((slotp->status != FOUND) && (slotp->status != EXIST)) {
1280 			int size = ep->d_reclen;
1281 
1282 			if (ep->d_ino != 0)
1283 				size -= DIRSIZ(ep);
1284 			if (size > 0) {
1285 				if (size >= needed) {
1286 					slotp->offset = offset;
1287 					slotp->size = ep->d_reclen;
1288 					if (noentry) {
1289 						slotp->ep = ep;
1290 						slotp->fbp = fbp;
1291 						slotp->status = FOUND;
1292 						slotp->endoff = 0;
1293 						return (0);
1294 					}
1295 					slotp->status = FOUND;
1296 				} else if (slotp->status == NONE) {
1297 					if (slotp->offset == -1)
1298 						slotp->offset = offset;
1299 				}
1300 			}
1301 		}
1302 		/*
1303 		 * Check for a name match.
1304 		 */
1305 		if (ep->d_ino && ep->d_namlen == namlen &&
1306 		    *namep == *ep->d_name &&	/* fast chk 1st char */
1307 		    bcmp(namep, ep->d_name, namlen) == 0) {
1308 
1309 			tdp->i_diroff = offset;
1310 
1311 			if (tdp->i_number == ep->d_ino) {
1312 				*ipp = tdp;	/* we want ourself, ie "." */
1313 				VN_HOLD(dvp);
1314 			} else {
1315 				err = ufs_iget_alloced(tdp->i_vfs,
1316 				    (ino_t)ep->d_ino, ipp, cr);
1317 				if (err) {
1318 					fbrelse(fbp, S_OTHER);
1319 					if (caching)
1320 						dnlc_dir_purge(dcap);
1321 					return (err);
1322 				}
1323 			}
1324 			slotp->status = EXIST;
1325 			slotp->offset = offset;
1326 			slotp->size = (int)(offset - last_offset);
1327 			slotp->fbp = fbp;
1328 			slotp->ep = ep;
1329 			slotp->endoff = 0;
1330 			if (caching)
1331 				dnlc_dir_purge(dcap);
1332 			return (0);
1333 		}
1334 		last_offset = offset;
1335 		offset += ep->d_reclen;
1336 		entryoffsetinblk += ep->d_reclen;
1337 		if (ep->d_ino)
1338 			enduseful = offset;
1339 	}
1340 	if (fbp) {
1341 		fbrelse(fbp, S_OTHER);
1342 	}
1343 
1344 	if (caching) {
1345 		dnlc_dir_complete(dcap);
1346 		slotp->cached = 1;
1347 		if (slotp->status == FOUND) {
1348 			if (initstat == FOUND) {
1349 				return (0);
1350 			}
1351 			(void) dnlc_dir_rem_space_by_handle(dcap,
1352 			    slotp->offset);
1353 			slotp->endoff = 0;
1354 			return (0);
1355 		}
1356 	}
1357 
1358 	if (slotp->status == NONE) {
1359 		/*
1360 		 * We didn't find a slot; the new directory entry should be put
1361 		 * at the end of the directory.  Return an indication of where
1362 		 * this is, and set "endoff" to zero; since we're going to have
1363 		 * to extend the directory, we're certainly not going to
1364 		 * truncate it.
1365 		 */
1366 		slotp->offset = dirsize;
1367 		slotp->size = DIRBLKSIZ;
1368 		slotp->endoff = 0;
1369 	} else {
1370 		/*
1371 		 * We found a slot, and will return an indication of where that
1372 		 * slot is, as any new directory entry will be put there.
1373 		 * Since that slot will become a useful entry, if the last
1374 		 * useful entry we found was before this one, update the offset
1375 		 * of the last useful entry.
1376 		 */
1377 		if (enduseful < slotp->offset + slotp->size)
1378 			enduseful = slotp->offset + slotp->size;
1379 		slotp->endoff = P2ROUNDUP_TYPED(enduseful, DIRBLKSIZ, off_t);
1380 	}
1381 	*ipp = NULL;
1382 	return (0);
1383 }
1384 
/*
 * Number of times ufs_dirrename() failed its second (reversed) try-lock,
 * dropped the i_contents lock it was holding and restarted its lock
 * acquisition sequence from the top.
 */
uint64_t ufs_dirrename_retry_cnt;
1386 
1387 /*
1388  * Rename the entry in the directory tdp so that it points to
1389  * sip instead of tip.
1390  */
1391 static int
1392 ufs_dirrename(
1393 	struct inode *sdp,	/* parent directory of source */
1394 	struct inode *sip,	/* source inode */
1395 	struct inode *tdp,	/* parent directory of target */
1396 	char *namep,		/* entry we are trying to change */
1397 	struct inode *tip,	/* target inode */
1398 	struct slot *slotp,	/* slot for entry */
1399 	struct cred *cr)	/* credentials */
1400 {
1401 	vnode_t *tdvp;
1402 	off_t offset;
1403 	int err;
1404 	int doingdirectory;
1405 
1406 	ASSERT(sdp->i_ufsvfs != NULL);
1407 	ASSERT(RW_WRITE_HELD(&tdp->i_rwlock));
1408 	ASSERT(RW_WRITE_HELD(&tdp->i_contents));
1409 	/*
1410 	 * Short circuit rename of something to itself.
1411 	 */
1412 	if (sip->i_number == tip->i_number) {
1413 		return (ESAME); /* special KLUDGE error code */
1414 	}
1415 
1416 	/*
1417 	 * We're locking 2 peer level locks, so must use tryenter
1418 	 * on the 2nd to avoid deadlocks that would occur
1419 	 * if we renamed a->b and b->a concurrently.
1420 	 */
1421 retry:
1422 	rw_enter(&tip->i_contents, RW_WRITER);
1423 	if (!rw_tryenter(&sip->i_contents, RW_READER)) {
1424 		/*
1425 		 * drop tip and wait (sleep) until we stand a chance
1426 		 * of holding sip
1427 		 */
1428 		rw_exit(&tip->i_contents);
1429 		rw_enter(&sip->i_contents, RW_READER);
1430 		/*
1431 		 * Reverse the lock grabs in case we have heavy
1432 		 * contention on the 2nd lock.
1433 		 */
1434 		if (!rw_tryenter(&tip->i_contents, RW_WRITER)) {
1435 			ufs_dirrename_retry_cnt++;
1436 			rw_exit(&sip->i_contents);
1437 			goto retry;
1438 		}
1439 	}
1440 
1441 	/*
1442 	 * Check that everything is on the same filesystem.
1443 	 */
1444 	if ((ITOV(tip)->v_vfsp != ITOV(tdp)->v_vfsp) ||
1445 	    (ITOV(tip)->v_vfsp != ITOV(sip)->v_vfsp)) {
1446 		err = EXDEV;		/* XXX archaic */
1447 		goto out;
1448 	}
1449 	/*
1450 	 * Must have write permission to rewrite target entry.
1451 	 * Perform additional checks for sticky directories.
1452 	 */
1453 	if ((err = ufs_iaccess(tdp, IWRITE, cr)) != 0 ||
1454 	    (err = ufs_sticky_remove_access(tdp, tip, cr)) != 0)
1455 		goto out;
1456 
1457 	/*
1458 	 * Ensure source and target are compatible (both directories
1459 	 * or both not directories).  If target is a directory it must
1460 	 * be empty and have no links to it; in addition it must not
1461 	 * be a mount point, and both the source and target must be
1462 	 * writable.
1463 	 */
1464 	doingdirectory = (((sip->i_mode & IFMT) == IFDIR) ||
1465 	    ((sip->i_mode & IFMT) == IFATTRDIR));
1466 	if (((tip->i_mode & IFMT) == IFDIR) ||
1467 	    ((tip->i_mode & IFMT) == IFATTRDIR)) {
1468 		if (!doingdirectory) {
1469 			err = EISDIR;
1470 			goto out;
1471 		}
1472 		/*
1473 		 * vn_vfswlock will prevent mounts from using the directory
1474 		 * until we are done.
1475 		 */
1476 		if (vn_vfswlock(ITOV(tip))) {
1477 			err = EBUSY;
1478 			goto out;
1479 		}
1480 		if (vn_mountedvfs(ITOV(tip)) != NULL) {
1481 			vn_vfsunlock(ITOV(tip));
1482 			err = EBUSY;
1483 			goto out;
1484 		}
1485 		if (!ufs_dirempty(tip, tdp->i_number, cr) || tip->i_nlink > 2) {
1486 			vn_vfsunlock(ITOV(tip));
1487 			err = EEXIST;	/* SIGH should be ENOTEMPTY */
1488 			goto out;
1489 		}
1490 	} else if (doingdirectory) {
1491 		err = ENOTDIR;
1492 		goto out;
1493 	}
1494 
1495 	/*
1496 	 * Rewrite the inode pointer for target name entry
1497 	 * from the target inode (ip) to the source inode (sip).
1498 	 * This prevents the target entry from disappearing
1499 	 * during a crash. Mark the directory inode to reflect the changes.
1500 	 */
1501 	tdvp = ITOV(tdp);
1502 	slotp->ep->d_ino = (int32_t)sip->i_number;
1503 	dnlc_update(tdvp, namep, ITOV(sip));
1504 	if (slotp->size) {
1505 		offset = slotp->offset - slotp->size;
1506 	} else {
1507 		offset = slotp->offset + 1;
1508 	}
1509 	if (slotp->cached) {
1510 		(void) dnlc_dir_update(&tdp->i_danchor, namep,
1511 		    INO_OFF_TO_H(slotp->ep->d_ino, offset));
1512 	}
1513 
1514 	err = TRANS_DIR(tdp, slotp->offset);
1515 	if (err)
1516 		fbrelse(slotp->fbp, S_OTHER);
1517 	else
1518 		err = ufs_fbwrite(slotp->fbp, tdp);
1519 
1520 	slotp->fbp = NULL;
1521 	if (err) {
1522 		if (doingdirectory)
1523 			vn_vfsunlock(ITOV(tip));
1524 		goto out;
1525 	}
1526 
1527 	TRANS_INODE(tdp->i_ufsvfs, tdp);
1528 	tdp->i_flag |= IUPD|ICHG;
1529 	tdp->i_seq++;
1530 	ITIMES_NOLOCK(tdp);
1531 
1532 	/*
1533 	 * Decrement the link count of the target inode.
1534 	 * Fix the ".." entry in sip to point to dp.
1535 	 * This is done after the new entry is on the disk.
1536 	 */
1537 	tip->i_nlink--;
1538 	TRANS_INODE(tip->i_ufsvfs, tip);
1539 	tip->i_flag |= ICHG;
1540 	tip->i_seq++;
1541 	ITIMES_NOLOCK(tip);
1542 	if (doingdirectory) {
1543 		/*
1544 		 * The entry for tip no longer exists so I can unlock the
1545 		 * vfslock.
1546 		 */
1547 		vn_vfsunlock(ITOV(tip));
1548 		/*
1549 		 * Decrement target link count once more if it was a directory.
1550 		 */
1551 		if (--tip->i_nlink != 0) {
1552 			err = ufs_fault(ITOV(tip),
1553 		    "ufs_dirrename: target directory link count != 0 (%s)",
1554 			    tip->i_fs->fs_fsmnt);
1555 			rw_exit(&tip->i_contents);
1556 			return (err);
1557 		}
1558 		TRANS_INODE(tip->i_ufsvfs, tip);
1559 		ufs_setreclaim(tip);
1560 		/*
1561 		 * Renaming a directory with the parent different
1562 		 * requires that ".." be rewritten.  The window is
1563 		 * still there for ".." to be inconsistent, but this
1564 		 * is unavoidable, and a lot shorter than when it was
1565 		 * done in a user process.  We decrement the link
1566 		 * count in the new parent as appropriate to reflect
1567 		 * the just-removed target.  If the parent is the
1568 		 * same, this is appropriate since the original
1569 		 * directory is going away.  If the new parent is
1570 		 * different, ufs_dirfixdotdot() will bump the link count
1571 		 * back.
1572 		 */
1573 		tdp->i_nlink--;
1574 		ufs_setreclaim(tdp);
1575 		TRANS_INODE(tdp->i_ufsvfs, tdp);
1576 		tdp->i_flag |= ICHG;
1577 		tdp->i_seq++;
1578 		ITIMES_NOLOCK(tdp);
1579 		if (sdp != tdp) {
1580 			rw_exit(&tip->i_contents);
1581 			rw_exit(&sip->i_contents);
1582 			err = ufs_dirfixdotdot(sip, sdp, tdp);
1583 			return (err);
1584 		}
1585 	} else
1586 		ufs_setreclaim(tip);
1587 out:
1588 	rw_exit(&tip->i_contents);
1589 	rw_exit(&sip->i_contents);
1590 	return (err);
1591 }
1592 
1593 /*
1594  * Fix the ".." entry of the child directory so that it points
1595  * to the new parent directory instead of the old one.  Routine
1596  * assumes that dp is a directory and that all the inodes are on
1597  * the same file system.
1598  */
1599 static int
1600 ufs_dirfixdotdot(
1601 	struct inode *dp,	/* child directory */
1602 	struct inode *opdp,	/* old parent directory */
1603 	struct inode *npdp)	/* new parent directory */
1604 {
1605 	struct fbuf *fbp;
1606 	struct dirtemplate *dirp;
1607 	vnode_t *dvp;
1608 	int err;
1609 
1610 	ASSERT(RW_WRITE_HELD(&npdp->i_rwlock));
1611 	ASSERT(RW_WRITE_HELD(&npdp->i_contents));
1612 
1613 	/*
1614 	 * We hold the child directory's i_contents lock before calling
1615 	 * blkatoff so that we honor correct locking protocol which is
1616 	 * i_contents lock and then page lock. (blkatoff will call
1617 	 * ufs_getpage where we want the page lock)
1618 	 * We hold the child directory's i_rwlock before i_contents (as
1619 	 * per the locking protocol) since we are modifying the ".." entry
1620 	 * of the child directory.
1621 	 * We hold the i_rwlock and i_contents lock until we record
1622 	 * this directory delta to the log (via ufs_trans_dir) and have
1623 	 * done fbrelse.
1624 	 */
1625 	rw_enter(&dp->i_rwlock, RW_WRITER);
1626 	rw_enter(&dp->i_contents, RW_WRITER);
1627 	err = blkatoff(dp, (off_t)0, (char **)&dirp, &fbp);
1628 	if (err)
1629 		goto bad;
1630 
1631 	if (dp->i_nlink <= 0 ||
1632 	    dp->i_size < sizeof (struct dirtemplate)) {
1633 		err = ENOENT;
1634 		goto bad;
1635 	}
1636 
1637 	if (dirp->dotdot_namlen != 2 ||
1638 	    dirp->dotdot_name[0] != '.' ||
1639 	    dirp->dotdot_name[1] != '.') {	/* Sanity check. */
1640 		dirbad(dp, "mangled .. entry", (off_t)0);
1641 		err = ENOTDIR;
1642 		goto bad;
1643 	}
1644 
1645 	/*
1646 	 * Increment the link count in the new parent inode and force it out.
1647 	 */
1648 	if (npdp->i_nlink == MAXLINK) {
1649 		err = EMLINK;
1650 		goto bad;
1651 	}
1652 	npdp->i_nlink++;
1653 	TRANS_INODE(npdp->i_ufsvfs, npdp);
1654 	npdp->i_flag |= ICHG;
1655 	npdp->i_seq++;
1656 	ufs_iupdat(npdp, I_SYNC);
1657 
1658 	/*
1659 	 * Rewrite the child ".." entry and force it out.
1660 	 */
1661 	dvp = ITOV(dp);
1662 	dirp->dotdot_ino = (uint32_t)npdp->i_number;
1663 	dnlc_update(dvp, "..", ITOV(npdp));
1664 	(void) dnlc_dir_update(&dp->i_danchor, "..",
1665 	    INO_OFF_TO_H(dirp->dotdot_ino, 0));
1666 
1667 	err = TRANS_DIR(dp, 0);
1668 	if (err)
1669 		fbrelse(fbp, S_OTHER);
1670 	else
1671 		err = ufs_fbwrite(fbp, dp);
1672 
1673 	fbp = NULL;
1674 	if (err)
1675 		goto bad;
1676 
1677 	rw_exit(&dp->i_contents);
1678 	rw_exit(&dp->i_rwlock);
1679 
1680 	/*
1681 	 * Decrement the link count of the old parent inode and force it out.
1682 	 */
1683 	ASSERT(opdp);
1684 	rw_enter(&opdp->i_contents, RW_WRITER);
1685 	ASSERT(opdp->i_nlink > 0);
1686 	opdp->i_nlink--;
1687 	ufs_setreclaim(opdp);
1688 	TRANS_INODE(opdp->i_ufsvfs, opdp);
1689 	opdp->i_flag |= ICHG;
1690 	opdp->i_seq++;
1691 	ufs_iupdat(opdp, I_SYNC);
1692 	rw_exit(&opdp->i_contents);
1693 	return (0);
1694 
1695 bad:
1696 	if (fbp)
1697 		fbrelse(fbp, S_OTHER);
1698 	rw_exit(&dp->i_contents);
1699 	rw_exit(&dp->i_rwlock);
1700 	return (err);
1701 }
1702 
1703 /*
1704  * Enter the file sip in the directory tdp with name namep.
1705  */
1706 static int
1707 ufs_diraddentry(
1708 	struct inode *tdp,
1709 	char *namep,
1710 	enum de_op op,
1711 	int namlen,
1712 	struct slot *slotp,
1713 	struct inode *sip,
1714 	struct inode *sdp,
1715 	struct cred *cr)
1716 {
1717 	struct direct *ep, *nep;
1718 	vnode_t *tdvp;
1719 	dcanchor_t *dcap = &tdp->i_danchor;
1720 	off_t offset;
1721 	int err;
1722 	ushort_t extra;
1723 
1724 	ASSERT(RW_WRITE_HELD(&tdp->i_rwlock));
1725 	ASSERT(RW_WRITE_HELD(&tdp->i_contents));
1726 	/*
1727 	 * Prepare a new entry.  If the caller has not supplied an
1728 	 * existing inode, make a new one.
1729 	 */
1730 	err = dirprepareentry(tdp, slotp, cr);
1731 	if (err) {
1732 		if (slotp->fbp) {
1733 			fbrelse(slotp->fbp, S_OTHER);
1734 			slotp->fbp = NULL;
1735 		}
1736 		return (err);
1737 	}
1738 	/*
1739 	 * Check inode to be linked to see if it is in the
1740 	 * same filesystem.
1741 	 */
1742 	if (ITOV(tdp)->v_vfsp != ITOV(sip)->v_vfsp) {
1743 		err = EXDEV;
1744 		goto bad;
1745 	}
1746 
1747 	/*
1748 	 * If renaming a directory then fix up the ".." entry in the
1749 	 * directory to point to the new parent.
1750 	 */
1751 	if ((op == DE_RENAME) && (((sip->i_mode & IFMT) == IFDIR) ||
1752 	    ((sip->i_mode & IFMT) == IFATTRDIR)) && (sdp != tdp)) {
1753 		err = ufs_dirfixdotdot(sip, sdp, tdp);
1754 		if (err)
1755 			goto bad;
1756 	}
1757 
1758 	/*
1759 	 * Fill in entry data.
1760 	 */
1761 	ep = slotp->ep;
1762 	ep->d_namlen = (ushort_t)namlen;
1763 	(void) strncpy(ep->d_name, namep, (size_t)((namlen + 4) & ~3));
1764 	ep->d_ino = (uint32_t)sip->i_number;
1765 	tdvp = ITOV(tdp);
1766 	dnlc_update(tdvp, namep, ITOV(sip));
1767 	/*
1768 	 * Note the offset supplied for any named entry is
1769 	 * the offset of the previous one, unless it's the 1st.
1770 	 * slotp->size is used to pass the length to
1771 	 * the previous entry.
1772 	 */
1773 	if (slotp->size) {
1774 		offset = slotp->offset - slotp->size;
1775 	} else {
1776 		offset = slotp->offset + 1;
1777 	}
1778 
1779 	if (slotp->cached) {
1780 		/*
1781 		 * Add back any usable unused space to the dnlc directory
1782 		 * cache.
1783 		 */
1784 		extra = ep->d_reclen - DIRSIZ(ep);
1785 		if (extra >= LDIRSIZ(1)) {
1786 			(void) dnlc_dir_add_space(dcap, extra,
1787 			    (uint64_t)slotp->offset);
1788 		}
1789 
1790 		(void) dnlc_dir_add_entry(dcap, namep,
1791 		    INO_OFF_TO_H(ep->d_ino, offset));
1792 
1793 		/* adjust the previous offset of the next entry */
1794 		nep = (struct direct *)((char *)ep + ep->d_reclen);
1795 		if ((uintptr_t)nep & (DIRBLKSIZ - 1)) {
1796 			/*
1797 			 * Not a new block.
1798 			 *
1799 			 * Check the validity of the next entry.
1800 			 * If it's bad, then throw away the cache, and
1801 			 * continue as before directory caching.
1802 			 */
1803 			if ((nep->d_reclen == 0) || (nep->d_reclen & 0x3) ||
1804 			    dnlc_dir_update(dcap, nep->d_name,
1805 			    INO_OFF_TO_H(nep->d_ino, slotp->offset))
1806 			    == DNOENT) {
1807 				dnlc_dir_purge(dcap);
1808 				slotp->cached = 0;
1809 			}
1810 		}
1811 	}
1812 
1813 	/*
1814 	 * Write out the directory block.
1815 	 */
1816 	err = TRANS_DIR(tdp, slotp->offset);
1817 	if (err)
1818 		fbrelse(slotp->fbp, S_OTHER);
1819 	else
1820 		err = ufs_fbwrite(slotp->fbp, tdp);
1821 
1822 	slotp->fbp = NULL;
1823 	/*
1824 	 * If this is a rename of a directory, then we have already
1825 	 * fixed the ".." entry to refer to the new parent. If err
1826 	 * is true at this point, we have failed to update the new
1827 	 * parent to refer to the renamed directory.
1828 	 * XXX - we need to unwind the ".." fix.
1829 	 */
1830 	if (err)
1831 		return (err);
1832 
1833 	/*
1834 	 * Mark the directory inode to reflect the changes.
1835 	 * Truncate the directory to chop off blocks of empty entries.
1836 	 */
1837 
1838 	TRANS_INODE(tdp->i_ufsvfs, tdp);
1839 	tdp->i_flag |= IUPD|ICHG;
1840 	tdp->i_seq++;
1841 	tdp->i_diroff = 0;
1842 	ITIMES_NOLOCK(tdp);
1843 	/*
1844 	 * If the directory grew then dirprepareentry() will have
1845 	 * set IATTCHG in tdp->i_flag, then the directory inode must
1846 	 * be flushed out. This is because if fsync() is used later
1847 	 * the directory size must be correct, otherwise a crash would
1848 	 * cause fsck to move the file to lost+found. Also because later
1849 	 * a file may be linked in more than one directory, then there
1850 	 * is no way to flush the original directory. So it must be
1851 	 * flushed out on creation. See bug 4293809.
1852 	 */
1853 	if (tdp->i_flag & IATTCHG) {
1854 		ufs_iupdat(tdp, I_SYNC);
1855 	}
1856 
1857 	if (slotp->endoff && (slotp->endoff < tdp->i_size)) {
1858 		if (!TRANS_ISTRANS(tdp->i_ufsvfs)) {
1859 			(void) ufs_itrunc(tdp, (u_offset_t)slotp->endoff, 0,
1860 						cr);
1861 		}
1862 	}
1863 
1864 
1865 	return (0);
1866 
1867 bad:
1868 	if (slotp->cached) {
1869 		dnlc_dir_purge(dcap);
1870 		fbrelse(slotp->fbp, S_OTHER);
1871 		slotp->cached = 0;
1872 		slotp->fbp = NULL;
1873 		return (err);
1874 	}
1875 
1876 	/*
1877 	 * Clear out entry prepared by dirprepareent.
1878 	 */
1879 	slotp->ep->d_ino = 0;
1880 	slotp->ep->d_namlen = 0;
1881 
1882 	/*
1883 	 * Don't touch err so we don't clobber the real error that got us here.
1884 	 */
1885 	if (TRANS_DIR(tdp, slotp->offset))
1886 		fbrelse(slotp->fbp, S_OTHER);
1887 	else
1888 		(void) ufs_fbwrite(slotp->fbp, tdp);
1889 	slotp->fbp = NULL;
1890 	return (err);
1891 }
1892 
1893 /*
1894  * Prepare a directory slot to receive an entry.
1895  */
1896 static int
1897 dirprepareentry(
1898 	struct inode *dp,	/* directory we are working in */
1899 	struct slot *slotp,	/* available slot info */
1900 	struct cred *cr)
1901 {
1902 	struct direct *ep, *nep;
1903 	off_t entryend;
1904 	int err;
1905 	slotstat_t status = slotp->status;
1906 	ushort_t dsize;
1907 
1908 	ASSERT((status == NONE) || (status == FOUND));
1909 	ASSERT(RW_WRITE_HELD(&dp->i_rwlock));
1910 	ASSERT(RW_WRITE_HELD(&dp->i_contents));
1911 	/*
1912 	 * If we didn't find a slot, then indicate that the
1913 	 * new slot belongs at the end of the directory.
1914 	 * If we found a slot, then the new entry can be
1915 	 * put at slotp->offset.
1916 	 */
1917 	entryend = slotp->offset + slotp->size;
1918 	if (status == NONE) {
1919 		ASSERT((slotp->offset & (DIRBLKSIZ - 1)) == 0);
1920 		if (DIRBLKSIZ > dp->i_fs->fs_fsize) {
1921 			err = ufs_fault(ITOV(dp),
1922 			    "dirprepareentry: bad fs_fsize, DIRBLKSIZ: %d"
1923 			    " > dp->i_fs->fs_fsize: %d (%s)",
1924 			    DIRBLKSIZ, dp->i_fs->fs_fsize, dp->i_fs->fs_fsmnt);
1925 			return (err);
1926 		}
1927 		/*
1928 		 * Allocate the new block.
1929 		 */
1930 		err = BMAPALLOC(dp, (u_offset_t)slotp->offset,
1931 		    (int)(blkoff(dp->i_fs, slotp->offset) + DIRBLKSIZ), cr);
1932 		if (err) {
1933 			return (err);
1934 		}
1935 		dp->i_size = entryend;
1936 		TRANS_INODE(dp->i_ufsvfs, dp);
1937 		dp->i_flag |= IUPD|ICHG|IATTCHG;
1938 		dp->i_seq++;
1939 		ITIMES_NOLOCK(dp);
1940 	} else if (entryend > dp->i_size) {
1941 		/*
1942 		 * Adjust directory size, if needed. This should never
1943 		 * push the size past a new multiple of DIRBLKSIZ.
1944 		 * This is an artifact of the old (4.2BSD) way of initializing
1945 		 * directory sizes to be less than DIRBLKSIZ.
1946 		 */
1947 		dp->i_size = P2ROUNDUP_TYPED(entryend, DIRBLKSIZ, off_t);
1948 		TRANS_INODE(dp->i_ufsvfs, dp);
1949 		dp->i_flag |= IUPD|ICHG|IATTCHG;
1950 		dp->i_seq++;
1951 		ITIMES_NOLOCK(dp);
1952 	}
1953 
1954 	/*
1955 	 * Get the block containing the space for the new directory entry.
1956 	 */
1957 	if (slotp->fbp == NULL) {
1958 		err = blkatoff(dp, slotp->offset, (char **)&slotp->ep,
1959 		    &slotp->fbp);
1960 		if (err) {
1961 			return (err);
1962 		}
1963 	}
1964 	ep = slotp->ep;
1965 
1966 	switch (status) {
1967 	case NONE:
1968 		/*
1969 		 * No space in the directory. slotp->offset will be on a
1970 		 * directory block boundary and we will write the new entry
1971 		 * into a fresh block.
1972 		 */
1973 		ep->d_reclen = DIRBLKSIZ;
1974 		slotp->size = 0; /* length of previous entry */
1975 		break;
1976 	case FOUND:
1977 		/*
1978 		 * An entry of the required size has been found. Use it.
1979 		 */
1980 		if (ep->d_ino == 0) {
1981 			/* this is the 1st record in a block */
1982 			slotp->size = 0; /* length of previous entry */
1983 		} else {
1984 			dsize = DIRSIZ(ep);
1985 			nep = (struct direct *)((char *)ep + dsize);
1986 			nep->d_reclen = ep->d_reclen - dsize;
1987 			ep->d_reclen = dsize;
1988 			slotp->ep = nep;
1989 			slotp->offset += dsize;
1990 			slotp->size = dsize; /* length of previous entry */
1991 		}
1992 		break;
1993 	default:
1994 		break;
1995 	}
1996 	return (0);
1997 }
1998 
1999 /*
2000  * Allocate and initialize a new inode that will go into directory tdp.
2001  * This routine is called from ufs_symlink(), as well as within this file.
2002  */
2003 int
2004 ufs_dirmakeinode(
2005 	struct inode *tdp,
2006 	struct inode **ipp,
2007 	struct vattr *vap,
2008 	enum de_op op,
2009 	struct cred *cr)
2010 {
2011 	struct inode *ip;
2012 	enum vtype type;
2013 	int imode;			/* mode and format as in inode */
2014 	ino_t ipref;
2015 	int err;
2016 	timestruc_t now;
2017 
2018 	ASSERT(vap != NULL);
2019 	ASSERT(op == DE_CREATE || op == DE_MKDIR || op == DE_ATTRDIR ||
2020 		op == DE_SYMLINK);
2021 	ASSERT((vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE));
2022 	ASSERT(RW_WRITE_HELD(&tdp->i_rwlock));
2023 	ASSERT(RW_WRITE_HELD(&tdp->i_contents));
2024 	/*
2025 	 * Allocate a new inode.
2026 	 */
2027 	type = vap->va_type;
2028 	if (type == VDIR) {
2029 		ipref = dirpref(tdp);
2030 	} else {
2031 		ipref = tdp->i_number;
2032 	}
2033 	if (op == DE_ATTRDIR)
2034 		imode = vap->va_mode;
2035 	else
2036 		imode = MAKEIMODE(type, vap->va_mode);
2037 	*ipp = NULL;
2038 	err = ufs_ialloc(tdp, ipref, imode, &ip, cr);
2039 	if (err)
2040 		return (err);
2041 
2042 	/*
2043 	 * We don't need to grab vfs_dqrwlock here because it is held
2044 	 * in ufs_direnter_*() above us.
2045 	 */
2046 	ASSERT(RW_READ_HELD(&ip->i_ufsvfs->vfs_dqrwlock));
2047 	rw_enter(&ip->i_contents, RW_WRITER);
2048 	if (ip->i_dquot != NULL) {
2049 		err = ufs_fault(ITOV(ip),
2050 		    "ufs_dirmakeinode, ip->i_dquot != NULL: dquot (%s)",
2051 				    tdp->i_fs->fs_fsmnt);
2052 		rw_exit(&ip->i_contents);
2053 		return (err);
2054 	}
2055 	*ipp = ip;
2056 	ip->i_mode = (o_mode_t)imode;
2057 	if (type == VBLK || type == VCHR) {
2058 		dev_t d = vap->va_rdev;
2059 		dev32_t dev32;
2060 
2061 		/*
2062 		 * Don't allow a special file to be created with a
2063 		 * dev_t that cannot be represented by this filesystem
2064 		 * format on disk.
2065 		 */
2066 		if (!cmpldev(&dev32, d)) {
2067 			err = EOVERFLOW;
2068 			goto fail;
2069 		}
2070 
2071 		ITOV(ip)->v_rdev = ip->i_rdev = d;
2072 
2073 		if (dev32 & ~((O_MAXMAJ << L_BITSMINOR32) | O_MAXMIN)) {
2074 			ip->i_ordev = dev32; /* can't use old format */
2075 		} else {
2076 			ip->i_ordev = cmpdev(d);
2077 		}
2078 	}
2079 	ITOV(ip)->v_type = type;
2080 	ufs_reset_vnode(ip->i_vnode);
2081 	if (type == VDIR) {
2082 		ip->i_nlink = 2; /* anticipating a call to dirmakedirect */
2083 	} else {
2084 		ip->i_nlink = 1;
2085 	}
2086 
2087 	if (op == DE_ATTRDIR) {
2088 		ip->i_uid = vap->va_uid;
2089 		ip->i_gid = vap->va_gid;
2090 	} else
2091 		ip->i_uid = crgetuid(cr);
2092 	/*
2093 	 * To determine the group-id of the created file:
2094 	 *   1) If the gid is set in the attribute list (non-Sun & pre-4.0
2095 	 *	clients are not likely to set the gid), then use it if
2096 	 *	the process is privileged, belongs to the target group,
2097 	 *	or the group is the same as the parent directory.
2098 	 *   2) If the filesystem was not mounted with the Old-BSD-compatible
2099 	 *	GRPID option, and the directory's set-gid bit is clear,
2100 	 *	then use the process's gid.
2101 	 *   3) Otherwise, set the group-id to the gid of the parent directory.
2102 	 */
2103 	if (op != DE_ATTRDIR && (vap->va_mask & AT_GID) &&
2104 	    ((vap->va_gid == tdp->i_gid) || groupmember(vap->va_gid, cr) ||
2105 	    secpolicy_vnode_create_gid(cr) == 0)) {
2106 		/*
2107 		 * XXX - is this only the case when a 4.0 NFS client, or a
2108 		 * client derived from that code, makes a call over the wire?
2109 		 */
2110 		ip->i_gid = vap->va_gid;
2111 	} else
2112 		ip->i_gid = (tdp->i_mode & ISGID) ? tdp->i_gid : crgetgid(cr);
2113 
2114 	/*
2115 	 * For SunOS 5.0->5.4, the lines below read:
2116 	 *
2117 	 * ip->i_suid = (ip->i_uid > MAXUID) ? UID_LONG : ip->i_uid;
2118 	 * ip->i_sgid = (ip->i_gid > MAXUID) ? GID_LONG : ip->i_gid;
2119 	 *
2120 	 * where MAXUID was set to 60002.  See notes on this in ufs_inode.c
2121 	 */
2122 	ip->i_suid = (ulong_t)ip->i_uid > (ulong_t)USHRT_MAX ?
2123 		UID_LONG : ip->i_uid;
2124 	ip->i_sgid = (ulong_t)ip->i_gid > (ulong_t)USHRT_MAX ?
2125 		GID_LONG : ip->i_gid;
2126 
2127 	/*
2128 	 * If we're creating a directory, and the parent directory has the
2129 	 * set-GID bit set, set it on the new directory.
2130 	 * Otherwise, if the user is neither privileged nor a member of the
2131 	 * file's new group, clear the file's set-GID bit.
2132 	 */
2133 	if ((tdp->i_mode & ISGID) && (type == VDIR))
2134 		ip->i_mode |= ISGID;
2135 	else {
2136 		if ((ip->i_mode & ISGID) &&
2137 		    secpolicy_vnode_setids_setgids(cr, ip->i_gid) != 0)
2138 			ip->i_mode &= ~ISGID;
2139 	}
2140 
2141 	if (((vap->va_mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) ||
2142 	    ((vap->va_mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) {
2143 		err = EOVERFLOW;
2144 		goto fail;
2145 	}
2146 
2147 	/*
2148 	 * Extended attribute directories are not subject to quotas.
2149 	 */
2150 	if (op != DE_ATTRDIR)
2151 		ip->i_dquot = getinoquota(ip);
2152 	else
2153 		ip->i_dquot = NULL;
2154 
2155 	if (op == DE_MKDIR || op == DE_ATTRDIR) {
2156 		err = ufs_dirmakedirect(ip, tdp, (op == DE_MKDIR) ? 0 : 1, cr);
2157 		if (err)
2158 			goto fail;
2159 	}
2160 
2161 	/*
2162 	 * generate the shadow inode and attach it to the new object
2163 	 */
2164 	ASSERT((tdp->i_shadow && tdp->i_ufs_acl) ||
2165 	    (!tdp->i_shadow && !tdp->i_ufs_acl));
2166 	if (tdp->i_shadow && tdp->i_ufs_acl &&
2167 	    (((tdp->i_mode & IFMT) == IFDIR) ||
2168 	    ((tdp->i_mode & IFMT) == IFATTRDIR))) {
2169 		err = ufs_si_inherit(ip, tdp, ip->i_mode, cr);
2170 		if (err) {
2171 			if (op == DE_MKDIR) {
2172 				/*
2173 				 * clean up parent directory
2174 				 *
2175 				 * tdp->i_contents already locked from
2176 				 * ufs_direnter_*()
2177 				 */
2178 				tdp->i_nlink--;
2179 				TRANS_INODE(tdp->i_ufsvfs, tdp);
2180 				tdp->i_flag |= ICHG;
2181 				tdp->i_seq++;
2182 				ufs_iupdat(tdp, I_SYNC);
2183 			}
2184 			goto fail;
2185 		}
2186 	}
2187 
2188 	/*
2189 	 * If the passed in attributes contain atime and/or mtime
2190 	 * settings, then use them instead of using the current
2191 	 * high resolution time.
2192 	 */
2193 	if (vap->va_mask & (AT_MTIME|AT_ATIME)) {
2194 		if (vap->va_mask & AT_ATIME) {
2195 			ip->i_atime.tv_sec = vap->va_atime.tv_sec;
2196 			ip->i_atime.tv_usec = vap->va_atime.tv_nsec / 1000;
2197 			ip->i_flag &= ~IACC;
2198 		} else
2199 			ip->i_flag |= IACC;
2200 		if (vap->va_mask & AT_MTIME) {
2201 			ip->i_mtime.tv_sec = vap->va_mtime.tv_sec;
2202 			ip->i_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000;
2203 			gethrestime(&now);
2204 			if (now.tv_sec > TIME32_MAX) {
2205 				/*
2206 				 * In 2038, ctime sticks forever..
2207 				 */
2208 				ip->i_ctime.tv_sec = TIME32_MAX;
2209 				ip->i_ctime.tv_usec = 0;
2210 			} else {
2211 				ip->i_ctime.tv_sec = now.tv_sec;
2212 				ip->i_ctime.tv_usec = now.tv_nsec / 1000;
2213 			}
2214 			ip->i_flag &= ~(IUPD|ICHG);
2215 			ip->i_flag |= IMODTIME;
2216 		} else
2217 			ip->i_flag |= IUPD|ICHG;
2218 		ip->i_flag |= IMOD;
2219 	} else
2220 		ip->i_flag |= IACC|IUPD|ICHG;
2221 	ip->i_seq++;
2222 
2223 	/*
2224 	 * If this is an attribute tag it as one.
2225 	 */
2226 	if ((tdp->i_mode & IFMT) == IFATTRDIR) {
2227 		ip->i_cflags |= IXATTR;
2228 	}
2229 
2230 	/*
2231 	 * push inode before it's name appears in a directory
2232 	 */
2233 	TRANS_INODE(ip->i_ufsvfs, ip);
2234 	ufs_iupdat(ip, I_SYNC);
2235 	rw_exit(&ip->i_contents);
2236 	return (0);
2237 
2238 fail:
2239 	/* Throw away inode we just allocated. */
2240 	ip->i_nlink = 0;
2241 	ufs_setreclaim(ip);
2242 	TRANS_INODE(ip->i_ufsvfs, ip);
2243 	ip->i_flag |= ICHG;
2244 	ip->i_seq++;
2245 	ITIMES_NOLOCK(ip);
2246 	rw_exit(&ip->i_contents);
2247 	return (err);
2248 }
2249 
2250 /*
2251  * Write a prototype directory into the empty inode ip, whose parent is dp.
2252  */
2253 static int
2254 ufs_dirmakedirect(
2255 	struct inode *ip,		/* new directory */
2256 	struct inode *dp,		/* parent directory */
2257 	int	attrdir,
2258 	struct cred *cr)
2259 {
2260 	struct dirtemplate *dirp;
2261 	struct fbuf *fbp;
2262 	int err;
2263 
2264 	ASSERT(RW_WRITE_HELD(&ip->i_contents));
2265 	ASSERT(RW_WRITE_HELD(&dp->i_rwlock));
2266 	ASSERT(RW_WRITE_HELD(&dp->i_contents));
2267 	/*
2268 	 * Allocate space for the directory we're creating.
2269 	 */
2270 	err = BMAPALLOC(ip, (u_offset_t)0, DIRBLKSIZ, cr);
2271 	if (err)
2272 		return (err);
2273 	if (DIRBLKSIZ > dp->i_fs->fs_fsize) {
2274 		err = ufs_fault(ITOV(dp),
2275 "ufs_dirmakedirect: bad fs_fsize, DIRBLKSIZ: %d > dp->i_fs->fs_fsize: %d (%s)",
2276 					DIRBLKSIZ, dp->i_fs->fs_fsize,
2277 					dp->i_fs->fs_fsmnt);
2278 		return (err);
2279 	}
2280 	ip->i_size = DIRBLKSIZ;
2281 	TRANS_INODE(ip->i_ufsvfs, ip);
2282 	ip->i_flag |= IUPD|ICHG|IATTCHG;
2283 	ip->i_seq++;
2284 	ITIMES_NOLOCK(ip);
2285 	/*
2286 	 * Update the tdp link count and write out the change.
2287 	 * This reflects the ".." entry we'll soon write.
2288 	 */
2289 	if (dp->i_nlink == MAXLINK)
2290 		return (EMLINK);
2291 	if (attrdir == 0)
2292 		dp->i_nlink++;
2293 	TRANS_INODE(dp->i_ufsvfs, dp);
2294 	dp->i_flag |= ICHG;
2295 	dp->i_seq++;
2296 	ufs_iupdat(dp, I_SYNC);
2297 	/*
2298 	 * Initialize directory with "."
2299 	 * and ".." from static template.
2300 	 *
2301 	 * Since the parent directory is locked, we don't have to
2302 	 * worry about anything changing when we drop the write
2303 	 * lock on (ip).
2304 	 *
2305 	 */
2306 	err = fbread(ITOV(ip), (offset_t)0, (uint_t)ip->i_fs->fs_fsize,
2307 	    S_READ, &fbp);
2308 
2309 	if (err) {
2310 		goto fail;
2311 	}
2312 	dirp = (struct dirtemplate *)fbp->fb_addr;
2313 	/*
2314 	 * Now initialize the directory we're creating
2315 	 * with the "." and ".." entries.
2316 	 */
2317 	*dirp = mastertemplate;			/* structure assignment */
2318 	dirp->dot_ino = (uint32_t)ip->i_number;
2319 	dirp->dotdot_ino = (uint32_t)dp->i_number;
2320 
2321 	err = TRANS_DIR(ip, 0);
2322 	if (err) {
2323 		fbrelse(fbp, S_OTHER);
2324 		goto fail;
2325 	}
2326 
2327 	err = ufs_fbwrite(fbp, ip);
2328 	if (err) {
2329 		goto fail;
2330 	}
2331 
2332 	return (0);
2333 
2334 fail:
2335 	if (attrdir == 0)
2336 		dp->i_nlink--;
2337 	TRANS_INODE(dp->i_ufsvfs, dp);
2338 	dp->i_flag |= ICHG;
2339 	dp->i_seq++;
2340 	ufs_iupdat(dp, I_SYNC);
2341 	return (err);
2342 }
2343 
/*
 * Delete a directory entry.  If oip is nonzero the entry is checked
 * to make sure it still reflects oip.
 *
 * If vpp is non-null, return the ptr of the (held) vnode associated with
 * the removed name.  The caller is responsible for doing the VN_RELE().
 *
 * Arguments:
 *	dp	- directory the name is removed from
 *	namep	- NUL-terminated name to remove; "." yields EINVAL and
 *		  ".." yields EEXIST
 *	oip	- if non-NULL, the inode the name is expected to refer to;
 *		  ENOENT is returned when it refers to something else
 *	cdir	- vnode compared against the victim for DR_RMDIR; a match
 *		  yields EINVAL
 *	op	- DR_REMOVE, DR_RMDIR or DR_RENAME
 *	cr	- caller's credentials
 *	vpp	- optional out parameter, see above
 *
 * Locking: the caller holds dp->i_rwlock as writer (asserted below).
 * vfs_dqrwlock (reader) and dp->i_contents (writer) are taken here and
 * released before returning.  When the victim is a directory whose
 * i_rwlock cannot be obtained immediately, every lock - including the
 * caller's dp->i_rwlock - is dropped, the thread sleeps for two ticks,
 * dp->i_rwlock is reacquired and the whole operation restarts at "retry".
 *
 * Returns 0 on success or an errno value (including whatever
 * ufs_iaccess(), ufs_dircheckforname(), ufs_sticky_remove_access(),
 * TRANS_DIR() or ufs_fbwrite() report).
 */
int
ufs_dirremove(
	struct inode *dp,
	char *namep,
	struct inode *oip,
	struct vnode *cdir,
	enum dr_op op,
	struct cred *cr,
	vnode_t **vpp)	/* Return (held) vnode ptr of removed file/dir */
{
	struct direct *ep, *pep, *nep;
	struct inode *ip;
	vnode_t *dvp, *vp;
	struct slot slot;
	int namlen;
	int err;
	int mode;
	ushort_t extra;

	namlen = (int)strlen(namep);
	if (namlen == 0)
		return (ufs_fault(ITOV(dp), "ufs_dirremove: namlen == 0"));
	/*
	 * return error when removing . and ..
	 */
	if (namep[0] == '.') {
		if (namlen == 1)
			return (EINVAL);
		else if (namlen == 2 && namep[1] == '.') {
			return (EEXIST);	/* SIGH should be ENOTEMPTY */
		}
	}

	ASSERT(RW_WRITE_HELD(&dp->i_rwlock));
	/*
	 * Check accessibility of directory.
	 *
	 * Everything from here on is redone whenever the back-off path
	 * further down jumps to "retry".
	 */
retry:
	if (((dp->i_mode & IFMT) != IFDIR) &&
	    ((dp->i_mode & IFMT) != IFATTRDIR)) {
		return (ENOTDIR);
	}

	/*
	 * Execute access is required to search the directory.
	 * Access for write is interpreted as allowing
	 * deletion of files in the directory.
	 */
	if (err = ufs_iaccess(dp, IEXEC|IWRITE, cr)) {
		return (err);
	}

	/*
	 * Look the name up.  ip and slot.fbp start out NULL so that the
	 * exit code can tell whether there is anything to release.
	 */
	ip = NULL;
	slot.fbp = NULL;
	slot.status = FOUND;	/* don't need to look for empty slot */
	rw_enter(&dp->i_ufsvfs->vfs_dqrwlock, RW_READER);
	rw_enter(&dp->i_contents, RW_WRITER);
	err = ufs_dircheckforname(dp, namep, namlen, &slot, &ip, cr, 0);
	if (err)
		goto out_novfs;
	if (ip == NULL) {
		err = ENOENT;
		goto out_novfs;
	}
	vp = ITOV(ip);
	if (oip && oip != ip) {
		/* the name no longer refers to the inode the caller meant */
		err = ENOENT;
		goto out_novfs;
	}

	/*
	 * Remember the file type now; the exit paths use "mode" to decide
	 * whether ip->i_rwlock and the vfs lock on vp have to be released.
	 */
	mode = ip->i_mode & IFMT;
	if (mode == IFDIR || mode == IFATTRDIR) {

		/*
		 * vn_vfswlock() prevents races between mount and rmdir.
		 */
		if (vn_vfswlock(vp)) {
			err = EBUSY;
			goto out_novfs;
		}
		if (vn_mountedvfs(vp) != NULL && op != DR_RENAME) {
			err = EBUSY;
			goto out;
		}
		/*
		 * If we are removing a directory, get a lock on it.
		 * Taking a writer lock prevents a parallel ufs_dirlook from
		 * incorrectly entering a negative cache vnode entry in the dnlc
		 * If the directory is empty, it will stay empty until
		 * we can remove it.
		 */
		if (!rw_tryenter(&ip->i_rwlock, RW_WRITER)) {
			/*
			 * It is possible that a thread in rename would have
			 * acquired this rwlock. To prevent a deadlock we
			 * do a rw_tryenter. If we fail to get the lock
			 * we drop all the locks we have acquired, wait
			 * for 2 ticks and reacquire the
			 * directory's (dp) i_rwlock and try again.
			 * If we don't drop dp's i_rwlock then we will panic
			 * with a "Deadlock: cycle in blocking chain"
			 * since in ufs_dircheckpath we want dp's i_rwlock.
			 * dp is guaranteed to exist since ufs_dirremove is
			 * called after a VN_HOLD(dp) has been done.
			 */
			ufs_dirremove_retry_cnt++;
			vn_vfsunlock(vp);
			if (slot.fbp)
				fbrelse(slot.fbp, S_OTHER);
			rw_exit(&dp->i_contents);
			rw_exit(&dp->i_ufsvfs->vfs_dqrwlock);
			rw_exit(&dp->i_rwlock);
			VN_RELE(vp);
			delay(2);
			rw_enter(&dp->i_rwlock, RW_WRITER);
			goto retry;
		}
	}
	rw_enter(&ip->i_contents, RW_READER);

	/*
	 * Now check the restrictions that apply on sticky directories.
	 */
	if ((err = ufs_sticky_remove_access(dp, ip, cr)) != 0) {
		rw_exit(&ip->i_contents);
		if (mode == IFDIR || mode == IFATTRDIR)
			rw_exit(&ip->i_rwlock);
		goto out;
	}

	if (op == DR_RMDIR) {
		/*
		 * For rmdir(2), some special checks are required.
		 * (a) Don't remove any alias of the parent (e.g. ".").
		 * (b) Don't remove the current directory.
		 * (c) Make sure the entry is (still) a directory.
		 * (d) Make sure the directory is empty.
		 */

		if (dp == ip || vp == cdir)
			err = EINVAL;
		else if (((ip->i_mode & IFMT) != IFDIR) &&
		    ((ip->i_mode & IFMT) != IFATTRDIR))
			err = ENOTDIR;
		else if ((ip->i_nlink > 2) ||
		    !ufs_dirempty(ip, dp->i_number, cr)) {
			err = EEXIST;	/* SIGH should be ENOTEMPTY */
		}

		if (err) {
			rw_exit(&ip->i_contents);
			if (mode == IFDIR || mode == IFATTRDIR)
				rw_exit(&ip->i_rwlock);
			goto out;
		}
	} else if (op == DR_REMOVE)  {
		/*
		 * unlink(2) requires a different check: allow only
		 * privileged users to unlink a directory.
		 */
		if (vp->v_type == VDIR &&
		    secpolicy_fs_linkdir(cr, vp->v_vfsp)) {
			err = EPERM;
			rw_exit(&ip->i_contents);
			/*
			 * NOTE(review): unconditional, unlike the other
			 * error paths; presumably v_type == VDIR implies
			 * the i_rwlock was taken above - confirm.
			 */
			rw_exit(&ip->i_rwlock);
			goto out;
		}
	}

	rw_exit(&ip->i_contents);

	/*
	 * Remove the cache'd entry, if any.
	 */
	dvp = ITOV(dp);
	dnlc_remove(dvp, namep);
	/* Zeroing d_ino is what marks the on-disk entry as free. */
	ep = slot.ep;
	ep->d_ino = 0;

	if (slot.cached) {
		/*
		 * The directory has a dnlc directory cache: keep its name
		 * and free-space records in step with the change made to
		 * the block.
		 */
		dcanchor_t *dcap = &dp->i_danchor;

		(void) dnlc_dir_rem_entry(dcap, namep, NULL);
		if (((int)ep->d_reclen - (int)DIRSIZ(ep)) >= LDIRSIZ(1)) {
			(void) dnlc_dir_rem_space_by_handle(dcap, slot.offset);
		}
		if (slot.offset & (DIRBLKSIZ - 1)) {
			/*
			 * Collapse new free space into previous entry.
			 * Note, the previous entry has already been
			 * validated in ufs_dircheckforname().
			 */
			ASSERT(slot.size);
			pep = (struct direct *)((char *)ep - slot.size);
			if ((pep->d_ino == 0) &&
			    ((uintptr_t)pep & (DIRBLKSIZ - 1))) {
				/*
				 * The previous entry is itself free and does
				 * not start a block: discard the cache and
				 * skip the collapse.
				 */
				dnlc_dir_purge(dcap);
				slot.cached = 0;
				goto nocache;
			}
			/*
			 * "extra" is the slack the previous entry already
			 * carried before absorbing the freed record.
			 */
			if (pep->d_ino) {
				extra = pep->d_reclen - DIRSIZ(pep);
			} else {
				extra = pep->d_reclen;
			}
			if (extra >= LDIRSIZ(1)) {
				(void) dnlc_dir_rem_space_by_handle(dcap,
				    (uint64_t)(slot.offset - slot.size));
			}
			pep->d_reclen += ep->d_reclen;
			(void) dnlc_dir_add_space(dcap, extra + ep->d_reclen,
				(uint64_t)(slot.offset - slot.size));
			/* adjust the previous pointer in the next entry */
			nep = (struct direct *)((char *)ep + ep->d_reclen);
			if ((uintptr_t)nep & (DIRBLKSIZ - 1)) {
				/*
				 * Not a new block.
				 *
				 * Check the validity of the entry.
				 * If it's bad, then throw away the cache and
				 * continue.
				 */
				if ((nep->d_reclen == 0) ||
				    (nep->d_reclen & 0x3) ||
				    (dnlc_dir_update(dcap, nep->d_name,
				    INO_OFF_TO_H(nep->d_ino,
				    slot.offset - slot.size)) == DNOENT)) {
					dnlc_dir_purge(dcap);
					slot.cached = 0;
				}
			}
		} else {
			/*
			 * First entry of a directory block: the record
			 * stays in place and is recorded as free space.
			 */
			(void) dnlc_dir_add_space(dcap, ep->d_reclen,
			(uint64_t)slot.offset);
		}
	} else {
		/*
		 * If the entry isn't the first in the directory, we must
		 * reclaim the space of the now empty record by adding
		 * the record size to the size of the previous entry.
		 */
		if (slot.offset & (DIRBLKSIZ - 1)) {
			/*
			 * Collapse new free space into previous entry.
			 */
			pep = (struct direct *)((char *)ep - slot.size);
			pep->d_reclen += ep->d_reclen;
		}
	}
nocache:


	/*
	 * Write the modified directory block back.  In both branches
	 * slot.fbp is consumed, so it is cleared to keep the exit code
	 * from releasing it a second time.
	 */
	err = TRANS_DIR(dp, slot.offset);
	if (err)
		fbrelse(slot.fbp, S_OTHER);
	else
		err = ufs_fbwrite(slot.fbp, dp);
	slot.fbp = NULL;

	/*
	 * If we were removing a directory, it is 'gone' now, but we cannot
	 * unlock it as a thread may be waiting for the lock in ufs_create. If
	 * we did, it could then create a file in a deleted directory.
	 */

	if (err) {
		if (mode == IFDIR || mode == IFATTRDIR)
			rw_exit(&ip->i_rwlock);
		goto out;
	}

	rw_enter(&ip->i_contents, RW_WRITER);

	dp->i_flag |= IUPD|ICHG;
	dp->i_seq++;
	ip->i_flag |= ICHG;
	ip->i_seq++;

	TRANS_INODE(dp->i_ufsvfs, dp);
	TRANS_INODE(ip->i_ufsvfs, ip);
	/*
	 * Now dispose of the inode.
	 */
	if (ip->i_nlink > 0) {
		/*
		 * This is not done for IFATTRDIR's because they don't
		 * have entries in the dnlc and the link counts are
		 * not incremented when they are created.
		 */
		if (op == DR_RMDIR && (ip->i_mode & IFMT) == IFDIR) {
			/*
			 * Decrement by 2 because we're trashing the "."
			 * entry as well as removing the entry in dp.
			 * Clear the directory entry, but there may be
			 * other hard links so don't free the inode.
			 * Decrement the dp linkcount because we're
			 * trashing the ".." entry.
			 */
			ip->i_nlink -= 2;
			dp->i_nlink--;
			ufs_setreclaim(dp);
			/*
			 * XXX need to discard negative cache entries
			 * for vp.  See comment in ufs_delete().
			 */
			dnlc_remove(vp, ".");
			dnlc_remove(vp, "..");
			/*
			 * The return value is ignored here because if
			 * the directory purge fails we don't want to
			 * stop the delete. If ufs_dirpurgedotdot fails
			 * the delete will continue with the preexisting
			 * behavior.
			 */
			(void) ufs_dirpurgedotdot(ip, dp->i_number, cr);
		} else {
			ip->i_nlink--;
		}
		ufs_setreclaim(ip);
	}
	ITIMES_NOLOCK(dp);
	ITIMES_NOLOCK(ip);

	/*
	 * Inodes are written out synchronously here only when
	 * TRANS_ISTRANS() is false for the file system.
	 */
	if (!TRANS_ISTRANS(dp->i_ufsvfs))
		ufs_iupdat(dp, I_SYNC);
	if (!TRANS_ISTRANS(ip->i_ufsvfs))
		ufs_iupdat(ip, I_SYNC);

	rw_exit(&ip->i_contents);
	if (mode == IFDIR || mode == IFATTRDIR)
		rw_exit(&ip->i_rwlock);
out:
	/* Reached only after "mode" is set and, for directories, vp locked. */
	if (mode == IFDIR || mode == IFATTRDIR) {
		vn_vfsunlock(vp);
	}
out_novfs:
	ASSERT(RW_WRITE_HELD(&dp->i_contents));

	if (slot.fbp)
		fbrelse(slot.fbp, S_OTHER);

	rw_exit(&dp->i_contents);
	rw_exit(&dp->i_ufsvfs->vfs_dqrwlock);

	/*
	 * If no error and vpp is non-NULL, return the vnode ptr to the caller.
	 * The caller becomes responsible for the VN_RELE().  Otherwise,
	 * Release (and delete) the inode after we drop vfs_dqrwlock to
	 * avoid deadlock since ufs_delete() grabs vfs_dqrwlock as reader.
	 */
	if (ip) {
		if ((err == 0) && (vpp != NULL)) {
			*vpp = ITOV(ip);
		} else {
			VN_RELE(vp);
		}
	}

	return (err);
}
2711 
2712 /*
2713  * Return buffer with contents of block "offset"
2714  * from the beginning of directory "ip".  If "res"
2715  * is non-zero, fill it in with a pointer to the
2716  * remaining space in the directory.
2717  *
2718  */
2719 
2720 int
2721 blkatoff(
2722 	struct inode *ip,
2723 	off_t offset,
2724 	char **res,
2725 	struct fbuf **fbpp)
2726 {
2727 	struct fs *fs;
2728 	struct fbuf *fbp;
2729 	daddr_t lbn;
2730 	uint_t bsize;
2731 	int err;
2732 
2733 	CPU_STATS_ADD_K(sys, ufsdirblk, 1);
2734 	fs = ip->i_fs;
2735 	lbn = (daddr_t)lblkno(fs, offset);
2736 	bsize = (uint_t)blksize(fs, ip, lbn);
2737 	err = fbread(ITOV(ip), (offset_t)(offset & fs->fs_bmask),
2738 			bsize, S_READ, &fbp);
2739 	if (err) {
2740 		*fbpp = (struct fbuf *)NULL;
2741 		return (err);
2742 	}
2743 	if (res)
2744 		*res = fbp->fb_addr + blkoff(fs, offset);
2745 	*fbpp = fbp;
2746 	return (0);
2747 }
2748 
2749 /*
2750  * Do consistency checking:
2751  *	record length must be multiple of 4
2752  *	entry must fit in rest of its DIRBLKSIZ block
2753  *	record must be large enough to contain entry
2754  *	name is not longer than MAXNAMLEN
2755  *	name must be as long as advertised, and null terminated
2756  * NOTE: record length must not be zero (should be checked previously).
2757  *       This routine is only called if dirchk is true.
2758  *       It would be nice to set the FSBAD flag in the super-block when
2759  *       this routine fails so that a fsck is forced on next reboot,
2760  *       but locking is a problem.
2761  */
2762 static int
2763 dirmangled(
2764 	struct inode *dp,
2765 	struct direct *ep,
2766 	int entryoffsetinblock,
2767 	off_t offset)
2768 {
2769 	int i;
2770 
2771 	i = DIRBLKSIZ - (entryoffsetinblock & (DIRBLKSIZ - 1));
2772 	if ((ep->d_reclen & 0x3) != 0 || (int)ep->d_reclen > i ||
2773 	    (uint_t)ep->d_reclen < DIRSIZ(ep) || ep->d_namlen > MAXNAMLEN ||
2774 	    ep->d_ino && dirbadname(ep->d_name, (int)ep->d_namlen)) {
2775 		dirbad(dp, "mangled entry", offset);
2776 		return (1);
2777 	}
2778 	return (0);
2779 }
2780 
2781 static void
2782 dirbad(struct inode *ip, char *how, off_t offset)
2783 {
2784 	cmn_err(CE_NOTE, "%s: bad dir ino %d at offset %ld: %s",
2785 	    ip->i_fs->fs_fsmnt, (int)ip->i_number, offset, how);
2786 }
2787 
/*
 * Validate a directory entry name of advertised length "l".
 *
 * Returns 1 if a NUL byte appears within the first "l" characters.
 * Otherwise returns the character stored right after those "l"
 * characters, which is 0 exactly when the name is properly terminated.
 * A non-zero result therefore means the name is bad.
 */
static int
dirbadname(char *sp, int l)
{
	char *cp = sp;
	int remaining;

	/* an embedded NUL makes the name shorter than advertised */
	for (remaining = l; remaining != 0; remaining--, cp++) {
		if (*cp == '\0') {
			return (1);
		}
	}

	/* whatever follows the name must be the terminating NUL */
	return (*cp);
}
2798 
/*
 * Report whether directory ip is empty.  This is ufs_dirscan() run with
 * its clr_dotdot flag off, so the directory is only examined; the
 * result of the scan is returned unchanged.
 */
static int
ufs_dirempty(
	struct inode *ip,
	ino_t parentino,
	struct cred *cr)
{
	const int clr_dotdot = 0;	/* scan only */

	return (ufs_dirscan(ip, parentino, cr, clr_dotdot));
}
2810 
/*
 * Clear the ".." entry of directory ip.  This is ufs_dirscan() run with
 * its clr_dotdot flag on; the result of the scan is returned unchanged.
 */
static int
ufs_dirpurgedotdot(
	struct inode *ip,
	ino_t parentino,
	struct cred *cr)
{
	const int clr_dotdot = 1;	/* zap ".." when it is found */

	return (ufs_dirscan(ip, parentino, cr, clr_dotdot));
}
2822 
2823 /*
2824  * Scan the directoy. If clr_dotdot is true clear the ..
2825  * directory else check to see if the directory is empty.
2826  *
2827  * Using a struct dirtemplate here is not precisely
2828  * what we want, but better than using a struct direct.
2829  *
2830  * clr_dotdot is used as a flag to tell us if we need
2831  * to clear the dotdot entry
2832  *
2833  * N.B.: does not handle corrupted directories.
2834  */
2835 static int
2836 ufs_dirscan(
2837 	struct inode *ip,
2838 	ino_t parentino,
2839 	struct cred *cr,
2840 	int clr_dotdot)
2841 {
2842 	offset_t off;
2843 	struct dirtemplate dbuf;
2844 	struct direct *dp = (struct direct *)&dbuf;
2845 	int err, count;
2846 	int empty = 1;	/* Assume it's empty */
2847 #define	MINDIRSIZ (sizeof (struct dirtemplate) / 2)
2848 
2849 	ASSERT(RW_LOCK_HELD(&ip->i_contents));
2850 
2851 	ASSERT(ip->i_size <= (offset_t)MAXOFF_T);
2852 	for (off = 0; off < ip->i_size; off += dp->d_reclen) {
2853 		err = ufs_rdwri(UIO_READ, FREAD, ip, (caddr_t)dp,
2854 		    (ssize_t)MINDIRSIZ, off, UIO_SYSSPACE, &count, cr);
2855 		/*
2856 		 * Since we read MINDIRSIZ, residual must
2857 		 * be 0 unless we're at end of file.
2858 		 */
2859 		if (err || count != 0 || dp->d_reclen == 0) {
2860 			empty = 0;
2861 			break;
2862 		}
2863 		/* skip empty entries */
2864 		if (dp->d_ino == 0)
2865 			continue;
2866 		/* accept only "." and ".." */
2867 		if (dp->d_namlen > 2 || dp->d_name[0] != '.') {
2868 			empty = 0;
2869 			break;
2870 		}
2871 		/*
2872 		 * At this point d_namlen must be 1 or 2.
2873 		 * 1 implies ".", 2 implies ".." if second
2874 		 * char is also "."
2875 		 */
2876 		if (dp->d_namlen == 1)
2877 			continue;
2878 		if (dp->d_name[1] == '.' &&
2879 		    (ino_t)dp->d_ino == parentino) {
2880 			/*
2881 			 * If we're doing a purge we need to check for
2882 			 * the . and .. entries and clear the d_ino for ..
2883 			 *
2884 			 * if clr_dotdot is set ufs_dirscan does not
2885 			 * check for an empty directory.
2886 			 */
2887 			if (clr_dotdot) {
2888 				/*
2889 				 * Have to actually zap the ..
2890 				 * entry in the directory, as
2891 				 * otherwise someone might have
2892 				 * dp as its cwd and try to
2893 				 * open .., which now points to
2894 				 * an unallocated inode.
2895 				 */
2896 				empty = ufs_dirclrdotdot(ip, parentino);
2897 				break;
2898 			} else {
2899 				continue;
2900 			}
2901 		}
2902 		empty = 0;
2903 		break;
2904 	}
2905 	return (empty);
2906 }
2907 
clock_t retry_backoff_delay = 1; /* delay before retrying the i_rwlock */
uint64_t dircheck_retry_cnt;	/* times ufs_dircheckpath() backed off */
/*
 * Check if source directory inode is in the path of the target directory.
 * Target is supplied locked.
 *
 * The source and target inode's should be different upon entry.
 *
 * Arguments:
 *	source_ino - inode number of the source directory
 *	target	   - directory whose ancestry is searched; its i_rwlock
 *		     is held by the caller (asserted)
 *	sdp	   - directory containing the source; its i_rwlock is
 *		     held by the caller (asserted)
 *	cr	   - credentials passed to ufs_iget_alloced()
 *
 * The walk starts at target and follows ".." upwards until it meets
 * source_ino, the root inode or sdp.
 *
 * Returns:
 *	0	- source_ino was not met before the walk ended
 *	EINVAL	- target is, or lies beneath, source_ino
 *	ENOTDIR	- an inode on the way up is not a usable directory or
 *		  has a mangled ".." entry
 *	EAGAIN	- an ancestor's i_rwlock could not be taken and the lock
 *		  state suggested a possible deadlock (see below)
 *	other	- error from blkatoff() or ufs_iget_alloced()
 *
 * Every intermediate inode picked up during the walk is unlocked and
 * released before returning; target itself is left locked and held.
 */
int
ufs_dircheckpath(
	ino_t source_ino,
	struct inode *target,
	struct inode *sdp,
	struct cred *cr)
{
	struct fbuf *fbp;
	struct dirtemplate *dirp;
	struct inode *ip;
	struct ufsvfs *ufsvfsp;
	struct inode *tip;
	ino_t dotdotino;
	int err;

	ASSERT(target->i_ufsvfs != NULL);
	ASSERT(RW_LOCK_HELD(&target->i_rwlock));
	ASSERT(RW_LOCK_HELD(&sdp->i_rwlock));

	ip = target;
	if (ip->i_number == source_ino) {
		err = EINVAL;
		goto out;
	}
	if (ip->i_number == UFSROOTINO) {
		err = 0;
		goto out;
	}
	/*
	 * Search back through the directory tree, using the ".." entries.
	 * Fail any attempt to move a directory into an ancestor directory.
	 */
	fbp = NULL;
	for (;;) {
		struct vfs	*vfs;

		/* Map the first block of ip; it starts with "." and "..". */
		err = blkatoff(ip, (off_t)0, (char **)&dirp, &fbp);
		if (err)
			break;
		if (((ip->i_mode & IFMT) != IFDIR) || ip->i_nlink == 0 ||
		    ip->i_size < sizeof (struct dirtemplate)) {
			dirbad(ip, "bad size, unlinked or not dir", (off_t)0);
			err = ENOTDIR;
			break;
		}
		if (dirp->dotdot_namlen != 2 ||
		    dirp->dotdot_name[0] != '.' ||
		    dirp->dotdot_name[1] != '.') {
			dirbad(ip, "mangled .. entry", (off_t)0);
			err = ENOTDIR;		/* Sanity check */
			break;
		}
		dotdotino = (ino_t)dirp->dotdot_ino;
		if (dotdotino == source_ino) {
			err = EINVAL;
			break;
		}
		/* Reached the root without meeting the source; err is 0. */
		if (dotdotino == UFSROOTINO)
			break;
		if (fbp) {
			fbrelse(fbp, S_OTHER);
			fbp = NULL;
		}
		/* Save these before ip is possibly released just below. */
		vfs = ip->i_vfs;
		ufsvfsp = ip->i_ufsvfs;

		if (ip != target) {
			rw_exit(&ip->i_rwlock);
			VN_RELE(ITOV(ip));
		}
		/*
		 * Race to get the inode.
		 */
		rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
		if (err = ufs_iget_alloced(vfs, dotdotino, &tip, cr)) {
			rw_exit(&ufsvfsp->vfs_dqrwlock);
			/* nothing is held any more; skip the exit cleanup */
			ip = NULL;
			break;
		}
		rw_exit(&ufsvfsp->vfs_dqrwlock);
		/*
		 * If the directory of the source inode (also a directory)
		 * is the same as this next entry up the chain, then
		 * we know the source directory itself can't be in the
		 * chain. This also prevents a panic because we already
		 * have sdp->i_rwlock locked.
		 */
		if (tip == sdp) {
			VN_RELE(ITOV(tip));
			ip = NULL;
			break;
		}
		ip = tip;

		/*
		 * If someone has set the WRITE_WANTED bit in this lock and if
		 * this happens to be a sdp or tdp of another parallel rename
		 * which is executing  the same code and in similar situation
		 * we end up in a 4 way deadlock. We need to make sure that
		 * the WRITE_WANTED bit is not  set.
		 */
retry_lock:
		if (!rw_tryenter(&ip->i_rwlock, RW_READER)) {
			/*
			 * If the lock held as WRITER thats fine but if it
			 * has WRITE_WANTED bit set we might end up in a
			 * deadlock. If WRITE_WANTED is set we return
			 * with EAGAIN else we just go back and try.
			 *
			 * NOTE(review): the test below is RW_ISWRITER()
			 * together with !RW_WRITE_HELD(); how that maps
			 * onto the WRITE_WANTED bit described above
			 * cannot be seen from here - confirm.
			 */
			if (RW_ISWRITER(&ip->i_rwlock) &&
					!(RW_WRITE_HELD(&ip->i_rwlock))) {
				err = EAGAIN;
				if (fbp) {
					fbrelse(fbp, S_OTHER);
				}
				/* ip was never locked here: only drop the hold */
				VN_RELE(ITOV(ip));
				return (err);
			} else {
				/*
				 * The lock is being write held. We could
				 * just do a rw_enter here but there is a
				 * window between the check and now, where
				 * the status could have changed, so to
				 * avoid looping we backoff and go back to
				 * try for the lock.
				 */
				delay(retry_backoff_delay);
				dircheck_retry_cnt++;
				goto retry_lock;
			}
		}
	}
	if (fbp) {
		fbrelse(fbp, S_OTHER);
	}
out:
	/*
	 * Drop the lock and hold on the last intermediate inode, if any.
	 * target belongs to the caller and is left alone.
	 */
	if (ip) {
		if (ip != target) {
			rw_exit(&ip->i_rwlock);
			VN_RELE(ITOV(ip));
		}
	}
	return (err);
}
3060 
3061 int
3062 ufs_xattrdirempty(struct inode *ip, ino_t parentino, struct cred *cr)
3063 {
3064 	offset_t off;
3065 	struct dirtemplate dbuf;
3066 	struct direct *dp = (struct direct *)&dbuf;
3067 	int err, count;
3068 	int empty = 1;	/* Assume it's empty */
3069 #define	MINDIRSIZ (sizeof (struct dirtemplate) / 2)
3070 
3071 	ASSERT(RW_LOCK_HELD(&ip->i_contents));
3072 
3073 	ASSERT(ip->i_size <= (offset_t)MAXOFF_T);
3074 	for (off = 0; off < ip->i_size; off += dp->d_reclen) {
3075 		err = ufs_rdwri(UIO_READ, FREAD, ip, (caddr_t)dp,
3076 		    (ssize_t)MINDIRSIZ, off, UIO_SYSSPACE, &count, cr);
3077 		/*
3078 		 * Since we read MINDIRSIZ, residual must
3079 		 * be 0 unless we're at end of file.
3080 		 */
3081 
3082 		if (err || count != 0 || dp->d_reclen == 0) {
3083 			empty = 0;
3084 			break;
3085 		}
3086 		/* skip empty entries */
3087 		if (dp->d_ino == 0)
3088 			continue;
3089 		/*
3090 		 * At this point d_namlen must be 1 or 2.
3091 		 * 1 implies ".", 2 implies ".." if second
3092 		 * char is also "."
3093 		 */
3094 
3095 		if (dp->d_namlen == 1 && dp->d_name[0] == '.' &&
3096 				(ino_t)dp->d_ino == parentino)
3097 			continue;
3098 
3099 		if (dp->d_namlen == 2 && dp->d_name[0] == '.' &&
3100 			dp->d_name[1] == '.') {
3101 			continue;
3102 		}
3103 		empty = 0;
3104 		break;
3105 	}
3106 	return (empty);
3107 }
3108 
3109 
/*
 * Allocate and initialize a new shadow inode to contain extended attributes.
 *
 * Arguments:
 *	tdp	- inode the attribute directory is created for
 *	ipp	- on the success path, receives the new attribute
 *		  directory inode
 *	flags	- when non-zero, the new inode's number is recorded in
 *		  tdp->i_oeftflag
 *	cr	- caller's credentials
 *
 * The new directory is owned by tdp's uid and gid.  When tdp is a
 * directory its low nine permission bits are copied; otherwise the mode
 * starts at 0700 and gains 0750 if tdp has bit 0040 set and 0705 if tdp
 * has bit 0004 set.
 *
 * Returns 0 on success or an errno value: the result of ufs_iaccess(),
 * EROFS for a read-only vnode, or an error from ufs_lockfs_begin() or
 * ufs_dirmakeinode().  If the first attempt fails with ENOSPC and
 * TRANS_ISTRANS() is true, pending deletions are drained and the
 * creation is attempted once more.
 */
int
ufs_xattrmkdir(
	struct inode *tdp,
	struct inode **ipp,
	int flags,
	struct cred *cr)
{
	struct inode *ip;
	struct vattr va;
	int err;
	int retry = 1;		/* cleared once the single retry is used */
	struct ufsvfs *ufsvfsp;
	struct ulockfs *ulp;
	int issync;
	int trans_size;
	int dorwlock;		/* 0 = not yet taken, */
				/* 1 = taken outside the transaction, */
				/* 2 = taken inside the transaction */

	/*
	 * Validate permission to create attribute directory
	 */

	if ((err = ufs_iaccess(tdp, IWRITE, cr)) != 0) {
		return (err);
	}

	if (vn_is_readonly(ITOV(tdp)))
		return (EROFS);

	/*
	 * No need to re-init err after again:, since it's set before
	 * the next use of it.
	 */
again:
	dorwlock = 0;
	va.va_type = VDIR;
	va.va_uid = tdp->i_uid;
	va.va_gid = tdp->i_gid;

	if ((tdp->i_mode & IFMT) == IFDIR) {
		va.va_mode = (o_mode_t)IFATTRDIR;
		va.va_mode |= tdp->i_mode & 0777;
	} else {
		va.va_mode = (o_mode_t)IFATTRDIR|0700;
		if (tdp->i_mode & 0040)
			va.va_mode |= 0750;
		if (tdp->i_mode & 0004)
			va.va_mode |= 0705;
	}
	va.va_mask = AT_TYPE|AT_MODE;

	ufsvfsp = tdp->i_ufsvfs;

	err = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_MKDIR_MASK);
	if (err)
		return (err);

	/*
	 * Acquire i_rwlock before TRANS_BEGIN_CSYNC() if this is a file.
	 * This follows the protocol for read()/write().
	 */
	if (ITOV(tdp)->v_type != VDIR) {
		rw_enter(&tdp->i_rwlock, RW_WRITER);
		dorwlock = 1;
	}

	if (ulp) {
		trans_size = (int)TOP_MKDIR_SIZE(tdp);
		TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_MKDIR, trans_size);
	}

	/*
	 * Acquire i_rwlock after TRANS_BEGIN_CSYNC() if this is a directory.
	 * This follows the protocol established by
	 * ufs_link/create/remove/rename/mkdir/rmdir/symlink.
	 */
	if (dorwlock == 0) {
		rw_enter(&tdp->i_rwlock, RW_WRITER);
		dorwlock = 2;
	}
	rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
	rw_enter(&tdp->i_contents, RW_WRITER);

	/*
	 * Suppress out of inodes messages if we will retry.
	 */
	if (retry)
		tdp->i_flag |= IQUIET;
	err = ufs_dirmakeinode(tdp, &ip, &va, DE_ATTRDIR, cr);
	tdp->i_flag &= ~IQUIET;

	if (err)
		goto fail;

	if (flags) {

		/*
		 * Now attach it to src file.
		 */

		tdp->i_oeftflag = ip->i_number;
	}

	/*
	 * Mark the new inode and its vnode as an attribute directory,
	 * then push the updated parent out.
	 */
	ip->i_cflags |= IXATTR;
	ITOV(ip)->v_flag |= V_XATTRDIR;
	TRANS_INODE(ufsvfsp, tdp);
	tdp->i_flag |= ICHG | IUPD;
	tdp->i_seq++;
	ufs_iupdat(tdp, I_SYNC);
	rw_exit(&tdp->i_contents);
	rw_exit(&ufsvfsp->vfs_dqrwlock);

	/* Push the new attribute directory inode out as well. */
	rw_enter(&ip->i_rwlock, RW_WRITER);
	rw_enter(&ip->i_contents, RW_WRITER);
	TRANS_INODE(ufsvfsp, ip);
	ip->i_flag |= ICHG| IUPD;
	ip->i_seq++;
	ufs_iupdat(ip, I_SYNC);
	rw_exit(&ip->i_contents);
	rw_exit(&ip->i_rwlock);
	/*
	 * Release tdp->i_rwlock in the reverse of the order it was taken
	 * relative to the transaction: inside (2) before ending the
	 * transaction, outside (1) after.
	 */
	if (dorwlock == 2)
		rw_exit(&tdp->i_rwlock);
	if (ulp) {
		/* NOTE(review): terr is never assigned a non-zero value. */
		int terr = 0;

		TRANS_END_CSYNC(ufsvfsp, err, issync, TOP_MKDIR, trans_size);
		ufs_lockfs_end(ulp);
		if (err == 0)
			err = terr;
	}
	if (dorwlock == 1)
		rw_exit(&tdp->i_rwlock);
	*ipp = ip;
	return (err);

fail:
	rw_exit(&tdp->i_contents);
	rw_exit(&ufsvfsp->vfs_dqrwlock);
	if (dorwlock == 2)
		rw_exit(&tdp->i_rwlock);
	if (ulp) {
		TRANS_END_CSYNC(ufsvfsp, err, issync, TOP_MKDIR, trans_size);
		ufs_lockfs_end(ulp);
	}
	if (dorwlock == 1)
		rw_exit(&tdp->i_rwlock);
	/*
	 * NOTE(review): ip is written only by ufs_dirmakeinode() through
	 * &ip; this test presumes the callee stores to it (NULL or a held
	 * inode) on every failure path - confirm.
	 */
	if (ip != NULL)
		VN_RELE(ITOV(ip));

	/*
	 * No inodes?  See if any are tied up in pending deletions.
	 * This has to be done outside of any of the above, because
	 * the draining operation can't be done from inside a transaction.
	 */
	if ((err == ENOSPC) && retry && TRANS_ISTRANS(ufsvfsp)) {
		ufs_delete_drain_wait(ufsvfsp, 1);
		retry = 0;
		goto again;
	}

	return (err);
}
3276 
3277 /*
3278  * clear the dotdot directory entry.
3279  * Used by ufs_dirscan when clr_dotdot
3280  * flag is set and we're deleting a
3281  * directory.
3282  */
3283 static int
3284 ufs_dirclrdotdot(struct inode *ip, ino_t parentino)
3285 {
3286 	struct fbuf *fbp;
3287 	struct direct *dotp, *dotdotp;
3288 	int err = 0;
3289 
3290 	ASSERT(RW_WRITE_HELD(&ip->i_rwlock));
3291 	ASSERT(RW_LOCK_HELD(&ip->i_contents));
3292 	err = blkatoff(ip, 0, NULL, &fbp);
3293 	if (err) {
3294 		return (err);
3295 	}
3296 
3297 	dotp = (struct direct *)fbp->fb_addr;
3298 	if ((dotp->d_namlen < (MAXNAMLEN + 1)) &&
3299 	    ((DIRBLKSIZ - DIRSIZ(dotp)) >= (sizeof (struct dirtemplate) / 2))) {
3300 		dotdotp = (struct direct *)((char *)dotp + dotp->d_reclen);
3301 		if ((dotdotp->d_namlen < (MAXNAMLEN + 1)) &&
3302 		    ((DIRBLKSIZ - DIRSIZ(dotp)) >= dotdotp->d_reclen)) {
3303 
3304 			dotp->d_reclen += dotdotp->d_reclen;
3305 			if (parentino == dotdotp->d_ino) {
3306 				dotdotp->d_ino = 0;
3307 				dotdotp->d_namlen = 0;
3308 				dotdotp->d_reclen = 0;
3309 			}
3310 
3311 			err = TRANS_DIR(ip, 0);
3312 			if (err) {
3313 				fbrelse(fbp, S_OTHER);
3314 			} else {
3315 				err = ufs_fbwrite(fbp, ip);
3316 			}
3317 		}
3318 	} else {
3319 		err = -1;
3320 	}
3321 	return (err);
3322 }
3323