xref: /titanic_50/usr/src/uts/common/fs/ufs/ufs_dir.c (revision 5f9e250aa611c12bbaccc0be612e5b97ccca2762)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
27 /*	  All Rights Reserved  	*/
28 
29 /*
30  * University Copyright- Copyright (c) 1982, 1986, 1988
31  * The Regents of the University of California
32  * All Rights Reserved
33  *
34  * University Acknowledgment- Portions of this document are derived from
35  * software developed by the University of California, Berkeley, and its
36  * contributors.
37  */
38 
39 
40 #pragma ident	"%Z%%M%	%I%	%E% SMI"
41 
42 /*
43  * Directory manipulation routines.
44  *
45  * When manipulating directories, the i_rwlock provides serialization
46  * since directories cannot be mmapped. The i_contents lock is redundant.
47  */
48 
49 #include <sys/types.h>
50 #include <sys/t_lock.h>
51 #include <sys/param.h>
52 #include <sys/systm.h>
53 #include <sys/signal.h>
54 #include <sys/cred.h>
55 #include <sys/proc.h>
56 #include <sys/disp.h>
57 #include <sys/user.h>
58 #include <sys/vfs.h>
59 #include <sys/vnode.h>
60 #include <sys/stat.h>
61 #include <sys/mode.h>
62 #include <sys/buf.h>
63 #include <sys/uio.h>
64 #include <sys/dnlc.h>
65 #include <sys/fs/ufs_inode.h>
66 #include <sys/fs/ufs_fs.h>
67 #include <sys/mount.h>
68 #include <sys/fs/ufs_fsdir.h>
69 #include <sys/fs/ufs_trans.h>
70 #include <sys/fs/ufs_panic.h>
71 #include <sys/fs/ufs_quota.h>
72 #include <sys/errno.h>
73 #include <sys/debug.h>
74 #include <vm/seg.h>
75 #include <sys/sysmacros.h>
76 #include <sys/cmn_err.h>
77 #include <sys/cpuvar.h>
78 #include <sys/unistd.h>
79 #include <sys/policy.h>
80 
81 /*
82  * This is required since we're using P2ROUNDUP_TYPED on DIRBLKSIZ
83  */
84 #if !ISP2(DIRBLKSIZ)
85 #error	"DIRBLKSIZ not a power of 2"
86 #endif
87 
88 /*
89  * A virgin directory.
 *
 * Template holding two directory records, one named "." and one named
 * "..".  Each row of the initializer presumably supplies, in order, the
 * inode number, the record length, the name length and the name --
 * confirm against the declaration of struct dirtemplate, which is not
 * part of this file.  The two record lengths (12 and DIRBLKSIZ - 12)
 * add up to exactly one DIRBLKSIZ block.
 *
 * NOTE(review): both inode numbers are 0 here; presumably the code that
 * copies this template fills them in -- confirm at the point of use.
90  */
91 static struct dirtemplate mastertemplate = {
92 	0, 12, 1, ".",
93 	0, DIRBLKSIZ - 12, 2, ".."
94 };
95 
/*
 * LDIRSIZ(len) is the number of bytes a struct direct needs to hold a
 * name of len characters: the fixed part of the structure (its size less
 * the MAXNAMLEN + 1 bytes reserved for the name), plus the name and its
 * terminating NUL, rounded up to a multiple of 4.
 *
 * MAX_DIR_NAME_LEN(len) goes the other way: given len bytes of record
 * space it yields what is left for the name once the fixed part and the
 * terminating NUL have been subtracted.
 *
 * The argument is parenthesized at every use so that an expression
 * containing low-precedence operators (?:, |, &) can be passed safely.
 */
#define	LDIRSIZ(len) \
	((sizeof (struct direct) - (MAXNAMLEN + 1)) + (((len) + 1 + 3) &~ 3))
#define	MAX_DIR_NAME_LEN(len) \
	(((len) - (sizeof (struct direct) - (MAXNAMLEN + 1))) - 1)
100 
/*
 * The dnlc directory cache allows a 64 bit handle for directory entries.
 * For ufs we squeeze both the 32 bit inumber and a 32 bit disk offset
 * into the handle: the inumber occupies the low 32 bits and the offset
 * the high 32 bits.  Note, a 32 bit offset allows a 4GB directory, which
 * is way beyond what could be cached in memory by the directory
 * caching routines. So we are quite safe with this limit.
 * The macros below pack and unpack the handle.
 *
 * Each expansion is wrapped in parentheses so it behaves as a single
 * operand wherever it is used, and INO_OFF_TO_H() confines the inumber
 * to its 32 bit field so that a wider (or sign-extended) argument can
 * never spill into the offset half of the handle.
 */
#define	H_TO_INO(h) ((uint32_t)((h) & UINT_MAX))
#define	H_TO_OFF(h) ((off_t)((h) >> 32))
#define	INO_OFF_TO_H(ino, off) \
	((uint64_t)(((uint64_t)(off) << 32) | ((uint64_t)(ino) & UINT_MAX)))
112 
113 /*
114  * The average size of a typical on disk directory entry is about 16 bytes
115  * and so defines AV_DIRECT_SHIFT : log2(16)
116  * This define is only used to approximate the number of entries
117  * in a directory. This is needed for dnlc_dir_start() which will immediately
118  * return an error if the value is not within its acceptable range of
119  * number of files in a directory.
120  */
121 #define	AV_DIRECT_SHIFT 4
122 /*
123  * If the directory size (from i_size) is greater than or equal to the
 * ufs_min_dir_cache
124  * tunable then we request dnlc directory caching.
125  * This has been found to be profitable after 1024 file names.
 * The default is therefore 1024 average-sized entries expressed in bytes.
126  */
127 int ufs_min_dir_cache = 1024 << AV_DIRECT_SHIFT;
128 
/*
 * The two values below implement a back-off after dnlc_dir_start()
 * reports DNOMEM in ufs_dirlook().  Note that the timestamp is a single
 * file-wide variable, not one per directory.
 */
129 /* The time point the dnlc directory caching was disabled */
130 static hrtime_t ufs_dc_disable_at;
131 /* directory caching disable duration */
/* 5 * NANOSEC; presumably five seconds in gethrtime() units -- confirm */
132 static hrtime_t ufs_dc_disable_duration = (hrtime_t)NANOSEC * 5;
133 
/*
 * dirchk: when nonzero, ufs_dirlook() runs dirmangled() on every entry
 * it scans; when zero, only entries whose record length is not a
 * multiple of 4 are handed to dirmangled().  Defaults to on for DEBUG
 * builds and off otherwise.
 */
134 #ifdef DEBUG
135 int dirchk = 1;
136 #else /* !DEBUG */
137 int dirchk = 0;
138 #endif /* DEBUG */
/*
 * ufs_negative_cache: when nonzero, a failed lookup in a directory that
 * still has links (i_nlink > 0) is recorded in the dnlc as DNLC_NO_VNODE.
 */
139 int ufs_negative_cache = 1;
/*
 * NOTE(review): not referenced in the part of the file visible here;
 * presumably a count of retries taken by ufs_dirremove() -- confirm.
 */
140 uint64_t ufs_dirremove_retry_cnt;
141 
/*
 * Forward declarations of the file-local helpers.  They are declared
 * without parameter lists, so the compiler does not check the number or
 * types of the arguments at the call sites in this file.
 */
142 static void dirbad();
143 static int ufs_dirrename();
144 static int ufs_diraddentry();
145 static int ufs_dirempty();
146 static int ufs_dirscan();
147 static int ufs_dirclrdotdot();
148 static int ufs_dirfixdotdot();
149 static int ufs_dirpurgedotdot();
150 static int dirprepareentry();
151 static int ufs_dirmakedirect();
152 static int dirbadname();
153 static int dirmangled();
154 
155 /*
156  * Look for a given name in a directory.  On successful return, *ipp
157  * will point to the VN_HELD inode.
 *
 *	dp	 - directory to search; anything other than IFDIR or
 *		   IFATTRDIR is rejected with ENOTDIR.
 *	namep	 - NUL-terminated, non-empty name to look up.
 *	ipp	 - out: the held inode of the matching entry.
 *	cr	 - credentials for the IEXEC check on dp; also handed to
 *		   ufs_iget_alloced().
 *	skipdnlc - nonzero to bypass the initial dnlc_lookup().
 *
 * Returns 0 on success, ENOENT when no entry matches, or the error
 * reported by ufs_iaccess(), blkatoff() or ufs_iget_alloced().
 *
 * The name is searched for in three places, in this order: the dnlc
 * (unless skipdnlc), the per-directory cache anchored at dp->i_danchor,
 * and finally the directory blocks themselves.  When the directory is
 * large enough the block scan also (re)builds the per-directory cache,
 * in which case the scan does not stop at the first match.
 *
 * dp->i_rwlock is acquired as reader in here and is always released
 * again before returning; it is dropped temporarily around the inode
 * fetch for "..".
158  */
159 int
160 ufs_dirlook(
161 	struct inode *dp,
162 	char *namep,
163 	struct inode **ipp,
164 	struct cred *cr,
165 	int skipdnlc)			/* skip the 1st level dnlc */
166 {
167 	uint64_t handle;
168 	struct fbuf *fbp;		/* a buffer of directory entries */
169 	struct direct *ep;		/* the current directory entry */
170 	struct vnode *vp;
171 	struct vnode *dvp;		/* directory vnode ptr */
172 	dcanchor_t *dcap;
173 	off_t endsearch;		/* offset to end directory search */
174 	off_t offset;
175 	off_t start_off;		/* starting offset from middle search */
176 	off_t last_offset;		/* last offset */
177 	int entryoffsetinblock;		/* offset of ep in addr's buffer */
178 	int numdirpasses;		/* strategy for directory search */
179 	int namlen;			/* length of name */
180 	int err;
181 	int doingchk;
182 	int i;
183 	int caching;
184 	ino_t ep_ino;			/* entry i number */
185 	ino_t chkino;
186 	ushort_t ep_reclen;		/* direct local d_reclen */
187 
188 	ASSERT(*namep != '\0'); /* All callers ensure *namep is non null */
189 
190 	/*
191 	 * Check accessibility of directory.
192 	 */
193 	if (((dp->i_mode & IFMT) != IFDIR) &&
194 	    ((dp->i_mode & IFMT) != IFATTRDIR))
195 		return (ENOTDIR);
196 
197 	if (err = ufs_iaccess(dp, IEXEC, cr))
198 		return (err);
199 
200 	/*
201 	 * Check the directory name lookup cache, first for individual files
202 	 * then for complete directories.
203 	 */
204 	dvp = ITOV(dp);
205 	if (!skipdnlc && (vp = dnlc_lookup(dvp, namep))) {
206 		/* vp is already held from dnlc_lookup */
207 		if (vp == DNLC_NO_VNODE) {
			/* negative entry: give back the hold and fail */
208 			VN_RELE(vp);
209 			return (ENOENT);
210 		}
211 		*ipp = VTOI(vp);
212 		return (0);
213 	}
214 
215 	dcap = &dp->i_danchor;
216 
217 	/*
218 	 * Grab the reader lock on the directory data before checking
219 	 * the dnlc to avoid a race with ufs_dirremove() & friends.
220 	 */
221 	rw_enter(&dp->i_rwlock, RW_READER);
222 
	/*
	 * Consult the per-directory cache.  DFOUND yields a handle whose
	 * low 32 bits are the inumber; DNOENT is answered with ENOENT
	 * without reading the directory; any other status falls through
	 * to the block scan below.
	 */
223 	switch (dnlc_dir_lookup(dcap, namep, &handle)) {
224 	case DFOUND:
225 		ep_ino = (ino_t)H_TO_INO(handle);
226 		if (dp->i_number == ep_ino) {
227 			VN_HOLD(dvp);	/* want ourself, "." */
228 			*ipp = dp;
229 			rw_exit(&dp->i_rwlock);
230 			return (0);
231 		}
232 		if (namep[0] == '.' && namep[1] == '.' && namep[2] == 0) {
233 			uint64_t handle2;
234 			/*
235 			 * release the lock on the dir we are searching
236 			 * to avoid a deadlock when grabbing the
237 			 * i_contents lock in ufs_iget_alloced().
238 			 */
239 			rw_exit(&dp->i_rwlock);
240 			rw_enter(&dp->i_ufsvfs->vfs_dqrwlock, RW_READER);
241 			err = ufs_iget_alloced(dp->i_vfs, ep_ino, ipp, cr);
242 			rw_exit(&dp->i_ufsvfs->vfs_dqrwlock);
243 			/*
244 			 * must recheck as we dropped dp->i_rwlock
245 			 */
246 			rw_enter(&dp->i_rwlock, RW_READER);
247 			if (!err && (dnlc_dir_lookup(dcap, namep, &handle2)
248 			    == DFOUND) && (handle == handle2)) {
249 				dnlc_update(dvp, namep, ITOV(*ipp));
250 				rw_exit(&dp->i_rwlock);
251 				return (0);
252 			}
253 			/* check failed, read the actual directory */
254 			if (!err) {
255 				VN_RELE(ITOV(*ipp));
256 			}
257 			goto restart;
258 		}
259 		/* usual case of not "." nor ".." */
260 		rw_enter(&dp->i_ufsvfs->vfs_dqrwlock, RW_READER);
261 		err = ufs_iget_alloced(dp->i_vfs, ep_ino, ipp, cr);
262 		rw_exit(&dp->i_ufsvfs->vfs_dqrwlock);
263 		if (err) {
264 			rw_exit(&dp->i_rwlock);
265 			return (err);
266 		}
267 		dnlc_update(dvp, namep, ITOV(*ipp));
268 		rw_exit(&dp->i_rwlock);
269 		return (0);
270 	case DNOENT:
271 		if (ufs_negative_cache && (dp->i_nlink > 0)) {
272 			dnlc_enter(dvp, namep, DNLC_NO_VNODE);
273 		}
274 		rw_exit(&dp->i_rwlock);
275 		return (ENOENT);
276 	default:
277 		break;
278 	}
	/*
	 * From here on the directory blocks are scanned.  Control also
	 * comes back to this label when a ".." result could not be
	 * confirmed; dp->i_rwlock is held as reader at this point.
	 */
279 restart:
280 
281 	fbp = NULL;
282 	doingchk = 0;
283 	chkino = 0;
284 	caching = 0;
285 
286 	/*
287 	 * Attempt to cache any directories greater than the tunable
288 	 * ufs_min_dir_cache. If it fails due to memory shortage (DNOMEM),
289 	 * disable caching for this directory and record the system time.
290 	 * Any attempt after the disable time has expired will enable
291 	 * the caching again.
292 	 */
293 	if (dp->i_size >= ufs_min_dir_cache) {
294 		/*
295 		 * if the directory caching disable time has expired
296 		 * enable the caching again.
297 		 */
298 		if (dp->i_cachedir == CD_DISABLED_NOMEM &&
299 		    gethrtime() - ufs_dc_disable_at > ufs_dc_disable_duration) {
300 			ufs_dc_disable_at = 0;
301 			dp->i_cachedir = CD_ENABLED;
302 		}
303 		if (dp->i_cachedir == CD_ENABLED) {
			/* the entry count is estimated from the size */
304 			switch (dnlc_dir_start(dcap, dp->i_size >>
305 				AV_DIRECT_SHIFT)) {
306 			case DNOMEM:
307 				dp->i_cachedir = CD_DISABLED_NOMEM;
308 				ufs_dc_disable_at = gethrtime();
309 				break;
310 			case DTOOBIG:
311 				dp->i_cachedir = CD_DISABLED_TOOBIG;
312 				break;
313 			case DOK:
314 				caching = 1;
315 				break;
316 			default:
317 				break;
318 			}
319 		}
320 	}
321 	/*
322 	 * If caching we don't stop when the file has been
323 	 * found, but need to know later, so clear *ipp now
324 	 */
325 	*ipp = NULL;
326 
	/*
	 * Choose where the scan starts.  When caching, the whole directory
	 * is read from offset 0 in one pass.  Otherwise the scan starts at
	 * the saved hint dp->i_diroff and, if that is not 0, a second pass
	 * covers the part of the directory before the hint.
	 */
327 recheck:
328 	if (caching) {
329 		offset = 0;
330 		entryoffsetinblock = 0;
331 		numdirpasses = 1;
332 	} else {
333 		/*
334 		 * Take care to look at dp->i_diroff only once, as it
335 		 * may be changing due to other threads/cpus.
336 		 */
337 		offset = dp->i_diroff;
338 		if (offset > dp->i_size) {
339 			offset = 0;
340 		}
341 		if (offset == 0) {
342 			entryoffsetinblock = 0;
343 			numdirpasses = 1;
344 		} else {
345 			start_off = offset;
346 
347 			entryoffsetinblock = blkoff(dp->i_fs, offset);
348 			if (entryoffsetinblock != 0) {
				/*
				 * Starting in mid-block: the loop below only
				 * reads a block at a block boundary, so map
				 * this one now.
				 */
349 				err = blkatoff(dp, offset, (char **)0, &fbp);
350 				if (err)
351 					goto bad;
352 			}
353 			numdirpasses = 2;
354 		}
355 	}
	/* scan up to i_size rounded up to a multiple of DIRBLKSIZ */
356 	endsearch = P2ROUNDUP_TYPED(dp->i_size, DIRBLKSIZ, u_offset_t);
357 	namlen = strlen(namep);
358 	last_offset = 0;
359 
360 searchloop:
361 	while (offset < endsearch) {
362 		/*
363 		 * If offset is on a block boundary,
364 		 * read the next directory block.
365 		 * Release previous if it exists.
366 		 */
367 		if (blkoff(dp->i_fs, offset) == 0) {
368 			if (fbp != NULL) {
369 				fbrelse(fbp, S_OTHER);
370 			}
371 			err = blkatoff(dp, offset, (char **)0, &fbp);
372 			if (err)
373 				goto bad;
374 			entryoffsetinblock = 0;
375 		}
376 
377 		/*
378 		 * If the offset to the next entry is invalid or if the
379 		 * next entry is a zero length record or if the record
380 		 * length is invalid, then skip to the next directory
381 		 * block.  Complete validation checks are done if the
382 		 * record length is invalid.
383 		 *
384 		 * Full validation checks are slow so they are disabled
385 		 * by default.  Complete checks can be run by patching
386 		 * "dirchk" to be true.
387 		 *
388 		 * We have to check the validity of entryoffsetinblock
389 		 * here because it can be set to i_diroff above.
		 *
		 * Note the grouping of the condition: && binds tighter
		 * than ||, so dirmangled() is only consulted when dirchk
		 * is set or d_reclen is not a multiple of 4.
390 		 */
391 		ep = (struct direct *)(fbp->fb_addr + entryoffsetinblock);
392 		if ((entryoffsetinblock & 0x3) || ep->d_reclen == 0 ||
393 		    (dirchk || (ep->d_reclen & 0x3)) &&
394 		    dirmangled(dp, ep, entryoffsetinblock, offset)) {
			/*
			 * i is the distance to the next DIRBLKSIZ boundary.
			 * Any cache being built is abandoned.
			 */
395 			i = DIRBLKSIZ - (entryoffsetinblock & (DIRBLKSIZ - 1));
396 			offset += i;
397 			entryoffsetinblock += i;
398 			if (caching) {
399 				dnlc_dir_purge(dcap);
400 				caching = 0;
401 			}
402 			continue;
403 		}
404 
405 		ep_reclen = ep->d_reclen;
406 
407 		/*
408 		 * Add named entries and free space into the directory cache
409 		 */
410 		if (caching) {
411 			ushort_t extra;
412 			off_t off2;
413 
414 			if (ep->d_ino == 0) {
				/*
				 * An unused record: all of it is free space.
				 * One that does not start a DIRBLKSIZ block
				 * turns caching off for this directory.
				 */
415 				extra = ep_reclen;
416 				if (offset & (DIRBLKSIZ - 1)) {
417 					dnlc_dir_purge(dcap);
418 					dp->i_cachedir = CD_DISABLED;
419 					caching = 0;
420 				}
421 			} else {
422 				/*
423 				 * entries hold the previous offset except the
424 				 * 1st which holds the offset + 1
425 				 */
426 				if (offset & (DIRBLKSIZ - 1)) {
427 					off2 = last_offset;
428 				} else {
429 					off2 = offset + 1;
430 				}
431 				caching = (dnlc_dir_add_entry(dcap, ep->d_name,
432 				    INO_OFF_TO_H(ep->d_ino, off2)) == DOK);
				/* slack left in the record after this entry */
433 				extra = ep_reclen - DIRSIZ(ep);
434 			}
			/* only record space that can hold a 1 character name */
435 			if (caching && (extra >= LDIRSIZ(1))) {
436 				caching = (dnlc_dir_add_space(dcap, extra,
437 				    (uint64_t)offset) == DOK);
438 			}
439 		}
440 
441 		/*
442 		 * Check for a name match.
443 		 * We have the parent inode read locked with i_rwlock.
444 		 */
445 		if (ep->d_ino && ep->d_namlen == namlen &&
446 		    *namep == *ep->d_name &&	/* fast chk 1st chr */
447 		    bcmp(namep, ep->d_name, (int)ep->d_namlen) == 0) {
448 
449 			/*
450 			 * We have to release the fbp early here to avoid
451 			 * a possible deadlock situation where we have the
452 			 * fbp and want the directory inode and someone doing
453 			 * a ufs_direnter_* has the directory inode and wants
454 			 * the fbp.  XXX - is this still needed?
455 			 */
456 			ep_ino = (ino_t)ep->d_ino;
457 			ASSERT(fbp != NULL);
458 			fbrelse(fbp, S_OTHER);
459 			fbp = NULL;
460 
461 			/*
462 			 * Atomic update (read lock held)
463 			 */
464 			dp->i_diroff = offset;
465 
466 			if (namlen == 2 && namep[0] == '.' && namep[1] == '.') {
467 				struct timeval32 omtime;
468 
469 				if (caching) {
470 					dnlc_dir_purge(dcap);
471 					caching = 0;
472 				}
473 				if (doingchk) {
474 					/*
475 					 * if the inumber didn't change
476 					 * continue with already found inode.
477 					 */
478 					if (ep_ino == chkino)
479 						goto checkok;
480 					else {
481 						VN_RELE(ITOV(*ipp));
482 						/* *ipp is nulled at restart */
483 						goto restart;
484 					}
485 				}
486 				/*
487 				 * release the lock on the dir we are searching
488 				 * to avoid a deadlock when grabbing the
489 				 * i_contents lock in ufs_iget_alloced().
490 				 */
491 				omtime = dp->i_mtime;
492 				rw_exit(&dp->i_rwlock);
493 				rw_enter(&dp->i_ufsvfs->vfs_dqrwlock,
494 						RW_READER);
495 				err = ufs_iget_alloced(dp->i_vfs, ep_ino, ipp,
496 				    cr);
497 				rw_exit(&dp->i_ufsvfs->vfs_dqrwlock);
498 				rw_enter(&dp->i_rwlock, RW_READER);
499 				if (err)
500 					goto bad;
501 				/*
502 				 * Since we released the lock on the directory,
503 				 * we must check that the same inode is still
504 				 * the ".." entry for this directory.
505 				 */
506 				/*CSTYLED*/
507 				if (timercmp(&omtime, &dp->i_mtime, !=)) {
508 					/*
509 					 * Modification time changed on the
510 					 * directory, we must go check if
511 					 * the inumber changed for ".."
512 					 */
513 					doingchk = 1;
514 					chkino = ep_ino;
515 					entryoffsetinblock = 0;
516 					if (caching) {
517 						/*
518 						 * Forget directory caching
519 						 * for this rare case
520 						 */
521 						dnlc_dir_purge(dcap);
522 						caching = 0;
523 					}
524 					goto recheck;
525 				}
526 			} else if (dp->i_number == ep_ino) {
527 				VN_HOLD(dvp);	/* want ourself, "." */
528 				*ipp = dp;
529 				if (caching) {
530 					dnlc_dir_purge(dcap);
531 					caching = 0;
532 				}
533 			} else {
534 				rw_enter(&dp->i_ufsvfs->vfs_dqrwlock,
535 						RW_READER);
536 				err = ufs_iget_alloced(dp->i_vfs, ep_ino, ipp,
537 				    cr);
538 				rw_exit(&dp->i_ufsvfs->vfs_dqrwlock);
539 				if (err)
540 					goto bad;
541 			}
542 checkok:
543 			ASSERT(*ipp);
544 			dnlc_update(dvp, namep, ITOV(*ipp));
545 			/*
546 			 * If we are not caching then just return the entry
547 			 * otherwise complete loading up the cache
548 			 */
549 			if (!caching) {
550 				rw_exit(&dp->i_rwlock);
551 				return (0);
552 			}
			/*
			 * The block was released above; map it again so the
			 * scan can carry on from the current entry.
			 */
553 			err = blkatoff(dp, offset, (char **)0, &fbp);
554 			if (err)
555 				goto bad;
556 		}
557 		last_offset = offset;
558 		offset += ep_reclen;
559 		entryoffsetinblock += ep_reclen;
560 	}
561 	/*
562 	 * If we started in the middle of the directory and failed
563 	 * to find our target, we must check the beginning as well.
	 * The second pass covers offsets 0 up to start_off.
564 	 */
565 	if (numdirpasses == 2) {
566 		numdirpasses--;
567 		offset = 0;
568 		endsearch = start_off;
569 		goto searchloop;
570 	}
571 
572 	/*
573 	 * If whole directory caching is on (or was originally on) then
574 	 * the entry may have been found.
575 	 */
576 	if (*ipp == NULL) {
577 		err = ENOENT;
578 		if (ufs_negative_cache && (dp->i_nlink > 0)) {
579 			dnlc_enter(dvp, namep, DNLC_NO_VNODE);
580 		}
581 	}
582 	if (caching) {
583 		dnlc_dir_complete(dcap);
584 		caching = 0;
585 	}
586 
	/*
	 * Common exit, also reached by falling through from above (possibly
	 * with err == 0).  caching can still be set here only when an error
	 * jumped to this label, in which case the partly built cache is
	 * purged after the lock is dropped.
	 */
587 bad:
588 	if (err && *ipp) {
589 		/*
590 		 * err and *ipp can both be set if we were attempting to
591 		 * cache the directory, and we found the entry, then later
592 		 * while trying to complete the directory cache encountered
593 		 * a error (eg reading a directory sector).
594 		 */
595 		VN_RELE(ITOV(*ipp));
596 		*ipp = NULL;
597 	}
598 
599 	if (fbp)
600 		fbrelse(fbp, S_OTHER);
601 	rw_exit(&dp->i_rwlock);
602 	if (caching)
603 		dnlc_dir_purge(dcap);
604 	return (err);
605 }
606 
607 /*
608  * Write a new directory entry for DE_CREATE or DE_MKDIR operations.
 *
 * The caller holds tdp->i_rwlock as writer.  It is still held on return,
 * but it is dropped and re-acquired when namep is "." or "..".
 *
 * flags may carry IQUIET, which is set in tdp->i_flag for the duration
 * of the inode creation; any other bit set in flags is passed to
 * ufs_dircheckforname() as its "noentry" argument.
 *
 * Returns:
 *	0	the entry was created; *ipp is the new inode.
 *	EEXIST	the name already exists (or is "." / ".."); *ipp is the
 *		held inode of the existing entry.
 *	EINVAL	tdp is an attribute directory and either op is DE_MKDIR
 *		or vap asks for a VCHR, VBLK, VDOOR, VSOCK or VFIFO.
 *	EACCES	namep contains a '/'.
 *	ENOENT	tdp has no links left.
 *	ENOTDIR	tdp is neither IFDIR nor IFATTRDIR.
 *	or whatever ufs_dirlook(), ufs_iaccess(), ufs_dircheckforname(),
 *	ufs_dirmakeinode() or ufs_diraddentry() reported.
609  */
610 int
611 ufs_direnter_cm(
612 	struct inode *tdp,	/* target directory to make entry in */
613 	char *namep,		/* name of entry */
614 	enum de_op op,		/* entry operation */
615 	struct vattr *vap,	/* attributes if new inode needed */
616 	struct inode **ipp,	/* return entered inode here */
617 	struct cred *cr,	/* user credentials */
618 	int flags)		/* no entry exists */
619 {
620 	struct inode *tip;	/* inode of (existing) target file */
621 	char *s;
622 	struct ufs_slot slot;	/* slot info to pass around */
623 	int namlen;		/* length of name */
624 	int err;		/* error number */
625 	struct inode *nip;	/* new inode */
626 	int do_rele_nip = 0;	/* release nip */
627 	int noentry = flags & ~IQUIET;
628 	int quiet = flags & IQUIET;	/* Suppress out of inodes message */
629 
630 	ASSERT(RW_WRITE_HELD(&tdp->i_rwlock));
631 
	/*
	 * In an attribute directory, reject DE_MKDIR and the creation of
	 * device, door, socket and fifo nodes.
	 */
632 	if (((tdp->i_mode & IFMT) == IFATTRDIR) && ((op == DE_MKDIR) ||
633 	    ((vap->va_type == VCHR) || (vap->va_type == VBLK) ||
634 	    (vap->va_type == VDOOR) || (vap->va_type == VSOCK) ||
635 	    (vap->va_type == VFIFO))))
636 		return (EINVAL);
637 
638 	/* don't allow '/' characters in pathname component */
	/* the same pass also computes namlen */
639 	for (s = namep, namlen = 0; *s; s++, namlen++)
640 		if (*s == '/')
641 			return (EACCES);
642 	ASSERT(namlen);
643 
644 	/*
645 	 * If name is "." or ".." then if this is a create look it up
646 	 * and return EEXIST.
647 	 */
648 	if (namep[0] == '.' &&
649 	    (namlen == 1 || (namlen == 2 && namep[1] == '.'))) {
650 		/*
651 		 * ufs_dirlook will acquire the i_rwlock
		 * as reader, so our writer hold has to be given up
		 * first and taken again afterwards.
652 		 */
653 		rw_exit(&tdp->i_rwlock);
654 		if (err = ufs_dirlook(tdp, namep, ipp, cr, 0)) {
655 			rw_enter(&tdp->i_rwlock, RW_WRITER);
656 			return (err);
657 		}
658 		rw_enter(&tdp->i_rwlock, RW_WRITER);
659 		return (EEXIST);
660 	}
661 
662 	/*
663 	 * If target directory has not been removed, then we can consider
664 	 * allowing file to be created.
665 	 */
666 	if (tdp->i_nlink <= 0) {
667 		return (ENOENT);
668 	}
669 
670 	/*
671 	 * Check accessibility of directory.
672 	 */
673 	if (((tdp->i_mode & IFMT) != IFDIR) &&
674 	    ((tdp->i_mode & IFMT) != IFATTRDIR)) {
675 		return (ENOTDIR);
676 	}
677 
678 	/*
679 	 * Execute access is required to search the directory.
680 	 */
681 	if (err = ufs_iaccess(tdp, IEXEC, cr)) {
682 		return (err);
683 	}
684 
685 	/*
686 	 * Search for the entry. Return VN_HELD tip if found.
	 * vfs_dqrwlock (reader) is taken before tdp->i_contents (writer);
	 * both are released at "out" below.
687 	 */
688 	tip = NULL;
689 	slot.fbp = NULL;
690 	slot.status = NONE;
691 	rw_enter(&tdp->i_ufsvfs->vfs_dqrwlock, RW_READER);
692 	rw_enter(&tdp->i_contents, RW_WRITER);
693 	err = ufs_dircheckforname(tdp, namep, namlen, &slot, &tip, cr, noentry);
694 	if (err)
695 		goto out;
696 	if (tip) {
697 		ASSERT(!noentry);
698 		*ipp = tip;
699 		err = EEXIST;
700 	} else {
701 		/*
702 		 * The entry does not exist. Check write permission in
703 		 * directory to see if entry can be created.
704 		 */
705 		if (err = ufs_iaccess(tdp, IWRITE, cr))
706 			goto out;
707 		/*
708 		 * Make new inode and directory entry.
709 		 */
710 		tdp->i_flag |= quiet;
		/*
		 * NOTE(review): nip has no initial value, so the test below
		 * relies on ufs_dirmakeinode() storing into it even when it
		 * fails -- confirm against ufs_dirmakeinode().
		 */
711 		if (err = ufs_dirmakeinode(tdp, &nip, vap, op, cr)) {
712 			if (nip != NULL)
713 				do_rele_nip = 1;
714 			goto out;
715 		}
716 		if (err = ufs_diraddentry(tdp, namep, op,
717 		    namlen, &slot, nip, NULL, cr)) {
718 			/*
719 			 * Unmake the inode we just made.
			 * For a directory the parent's link count is
			 * taken back down as well.
720 			 */
721 			rw_enter(&nip->i_contents, RW_WRITER);
722 			if (((nip->i_mode & IFMT) == IFDIR) ||
723 			    ((nip->i_mode & IFMT) == IFATTRDIR)) {
724 				tdp->i_nlink--;
725 				ufs_setreclaim(tdp);
726 				tdp->i_flag |= ICHG;
727 				tdp->i_seq++;
728 				TRANS_INODE(tdp->i_ufsvfs, tdp);
729 				ITIMES_NOLOCK(tdp);
730 			}
731 			nip->i_nlink = 0;
732 			ufs_setreclaim(nip);
733 			TRANS_INODE(nip->i_ufsvfs, nip);
734 			nip->i_flag |= ICHG;
735 			nip->i_seq++;
736 			ITIMES_NOLOCK(nip);
737 			rw_exit(&nip->i_contents);
738 			do_rele_nip = 1;
739 		} else {
740 			*ipp = nip;
741 		}
742 	}
743 
744 out:
745 	if (slot.fbp)
746 		fbrelse(slot.fbp, S_OTHER);
747 
	/* undo the IQUIET bit that may have been set above */
748 	tdp->i_flag &= ~quiet;
749 	rw_exit(&tdp->i_contents);
750 
751 	/*
752 	 * Drop vfs_dqrwlock before calling VN_RELE() on nip to
753 	 * avoid deadlock since ufs_delete() grabs vfs_dqrwlock as reader.
754 	 */
755 	rw_exit(&tdp->i_ufsvfs->vfs_dqrwlock);
756 
757 	if (do_rele_nip) {
758 		VN_RELE(ITOV(nip));
759 	}
760 
761 	return (err);
762 }
763 
764 /*
765  * Write a new directory entry for DE_LINK, DE_SYMLINK or DE_RENAME operations.
766  * If tvpp is non-null, return with the pointer to the target vnode.
 *
 * The caller holds tdp->i_rwlock as writer.
 *
 * Except for DE_SYMLINK, the link count of sip is incremented and the
 * inode is written out synchronously before the directory is touched;
 * if anything fails afterwards the increment is undone again.
 *
 * *tvpp is written only when the name already existed and the operation
 * succeeded; it is left untouched otherwise.
 *
 * Returns 0 on success, otherwise:
 *	EACCES	namep contains a '/'.
 *	EINVAL	op is DE_RENAME and namep is "." or "..".
 *	EEXIST	namep is "." or ".." (link, symlink), or op is DE_LINK
 *		or DE_SYMLINK and the name already exists.
 *	ENOENT	sip or tdp has no links left.
 *	EMLINK	sip already has MAXLINK links.
 *	ENOTDIR	tdp is neither IFDIR nor IFATTRDIR.
 *	or whatever TRANS_SYNCIP(), ufs_sync_indir(), ufs_iaccess(),
 *	ufs_dircheckforname(), ufs_dirrename() or ufs_diraddentry()
 *	reported.
767  */
768 int
769 ufs_direnter_lr(
770 	struct inode *tdp,	/* target directory to make entry in */
771 	char *namep,		/* name of entry */
772 	enum de_op op,		/* entry operation */
773 	struct inode *sdp,	/* source inode parent if rename */
774 	struct inode *sip,	/* source inode */
775 	struct cred *cr,	/* user credentials */
776 	vnode_t **tvpp)		/* Return: (held) vnode of (existing) target */
777 {
778 	struct inode *tip;	/* inode of (existing) target file */
779 	char *s;
780 	struct ufs_slot slot;	/* slot info to pass around */
781 	int namlen;		/* length of name */
782 	int err;		/* error number */
783 
784 	/* don't allow '/' characters in pathname component */
	/* the same pass also computes namlen */
785 	for (s = namep, namlen = 0; *s; s++, namlen++)
786 		if (*s == '/')
787 			return (EACCES);
788 	ASSERT(namlen);
789 	ASSERT(RW_WRITE_HELD(&tdp->i_rwlock));
790 
791 	/*
792 	 * If name is "." or ".." fail without looking it up: EINVAL for
793 	 * a rename, EEXIST for a link or symlink.
794 	 */
795 	if (namep[0] == '.' &&
796 	    (namlen == 1 || (namlen == 2 && namep[1] == '.'))) {
797 		if (op == DE_RENAME) {
798 			return (EINVAL);	/* *SIGH* should be ENOTEMPTY */
799 		}
800 		return (EEXIST);
801 	}
802 	/*
803 	 * For link and rename lock the source entry and check the link count
804 	 * to see if it has been removed while it was unlocked.  If not, we
805 	 * increment the link count and force the inode to disk to make sure
806 	 * that it is there before any directory entry that points to it.
807 	 *
808 	 * In the case of a symbolic link, we are dealing with a new inode
809 	 * which does not yet have any links.  We've created it with a link
810 	 * count of 1, and we don't want to increment it since this will be
811 	 * its first link.
812 	 *
813 	 * We are about to push the inode to disk. We make sure
814 	 * that the inode's data blocks are flushed first so the
815 	 * inode and its data blocks are always in sync.  This
816 	 * adds some robustness in the event of a power failure
817 	 * or panic where sync fails. If we panic before the
818 	 * inode is updated, then the inode still refers to the
819 	 * old data blocks (or none for a new file). If we panic
820 	 * after the inode is updated, then the inode refers to
821 	 * the new data blocks.
822 	 *
823 	 * We do this before grabbing the i_contents lock because
824 	 * ufs_syncip() will want that lock. We could do the data
825 	 * syncing after the removal checks, but upon return from
826 	 * the data sync we would have to repeat the removal
827 	 * checks.
828 	 */
829 	if (err = TRANS_SYNCIP(sip, 0, I_DSYNC, TOP_FSYNC)) {
830 		return (err);
831 	}
832 
833 	rw_enter(&sip->i_contents, RW_WRITER);
834 	if (sip->i_nlink <= 0) {
835 		rw_exit(&sip->i_contents);
836 		return (ENOENT);
837 	}
838 	if (sip->i_nlink == MAXLINK) {
839 		rw_exit(&sip->i_contents);
840 		return (EMLINK);
841 	}
842 
843 	/*
844 	 * Sync the indirect blocks associated with the file
845 	 * for the same reasons as described above.  Since this
846 	 * call wants the i_contents lock held for it we can do
847 	 * this here with no extra work.
848 	 */
849 	if (err = ufs_sync_indir(sip)) {
850 		rw_exit(&sip->i_contents);
851 		return (err);
852 	}
853 
	/*
	 * From here on every failure must leave through "out" or "out2"
	 * so that the link count bumped below is taken back.
	 */
854 	if (op != DE_SYMLINK)
855 		sip->i_nlink++;
856 	TRANS_INODE(sip->i_ufsvfs, sip);
857 	sip->i_flag |= ICHG;
858 	sip->i_seq++;
859 	ufs_iupdat(sip, I_SYNC);
860 	rw_exit(&sip->i_contents);
861 
862 	/*
863 	 * If target directory has not been removed, then we can consider
864 	 * allowing file to be created.
865 	 */
866 	if (tdp->i_nlink <= 0) {
867 		err = ENOENT;
868 		goto out2;
869 	}
870 	/*
871 	 * Check accessibility of directory.
872 	 */
873 	if (((tdp->i_mode & IFMT) != IFDIR) &&
874 	    (tdp->i_mode & IFMT) != IFATTRDIR) {
875 		err = ENOTDIR;
876 		goto out2;
877 	}
878 	/*
879 	 * Execute access is required to search the directory.
880 	 */
881 	if (err = ufs_iaccess(tdp, IEXEC, cr)) {
882 		goto out2;
883 	}
884 
885 	/*
886 	 * Search for the entry. Return VN_HELD tip if found.
	 * vfs_dqrwlock (reader) is taken before tdp->i_contents (writer);
	 * both are released at "out" below.
887 	 */
888 	tip = NULL;
889 	slot.status = NONE;
890 	slot.fbp = NULL;
891 	rw_enter(&tdp->i_ufsvfs->vfs_dqrwlock, RW_READER);
892 	rw_enter(&tdp->i_contents, RW_WRITER);
893 	err = ufs_dircheckforname(tdp, namep, namlen, &slot, &tip, cr, 0);
894 	if (err)
895 		goto out;
896 
897 	if (tip) {
898 		switch (op) {
899 		case DE_RENAME:
900 			err = ufs_dirrename(sdp, sip, tdp, namep,
901 			    tip, &slot, cr);
902 			break;
903 
904 		case DE_LINK:
905 		case DE_SYMLINK:
906 			/*
907 			 * Can't link to an existing file.
908 			 */
909 			err = EEXIST;
910 			break;
911 		default:
912 			break;
913 		}
914 	} else {
915 		/*
916 		 * The entry does not exist. Check write permission in
917 		 * directory to see if entry can be created.
918 		 */
919 		if (err = ufs_iaccess(tdp, IWRITE, cr))
920 			goto out;
921 		err = ufs_diraddentry(tdp, namep, op, namlen, &slot, sip, sdp,
922 		    cr);
923 	}
924 
925 out:
926 	if (slot.fbp)
927 		fbrelse(slot.fbp, S_OTHER);
928 
929 	rw_exit(&tdp->i_contents);
930 
931 	/*
932 	 * Drop vfs_dqrwlock before calling VN_RELE() on tip to
933 	 * avoid deadlock since ufs_delete() grabs vfs_dqrwlock as reader.
934 	 */
935 	rw_exit(&tdp->i_ufsvfs->vfs_dqrwlock);
936 
937 	/*
938 	 * If we renamed a file over the top of an existing file,
939 	 * or linked a file to an existing file (or tried to),
940 	 * then set *tvpp to the target vnode, if tvpp is non-null
941 	 * otherwise, release and delete (or just release) the inode.
	 * Note that *tvpp is only set when err is 0; on any error the
	 * target is released here instead.
942 	 *
943 	 * N.B., by returning the target's vnode pointer to the caller,
944 	 * that caller becomes responsible for doing the VN_RELE.
945 	 */
946 	if (tip) {
947 		if ((err == 0) && (tvpp != NULL)) {
948 			*tvpp = ITOV(tip);
949 		} else {
950 			VN_RELE(ITOV(tip));
951 		}
952 	}
953 
	/*
	 * Reached directly (without any tdp lock taken here) when one of
	 * the checks on tdp failed, and by falling through from above.
	 */
954 out2:
955 	if (err) {
956 		/*
957 		 * Undo bumped link count.
958 		 */
959 		if (op != DE_SYMLINK) {
960 			rw_enter(&sip->i_contents, RW_WRITER);
961 			sip->i_nlink--;
962 			ufs_setreclaim(sip);
963 			TRANS_INODE(sip->i_ufsvfs, sip);
964 			sip->i_flag |= ICHG;
965 			sip->i_seq++;
966 			ITIMES_NOLOCK(sip);
967 			rw_exit(&sip->i_contents);
968 		}
969 	}
970 	return (err);
971 }
972 
973 /*
974  * Check for the existence of a name in a directory (unless noentry
975  * is set), or else for an empty
976  * slot in which an entry may be made.  If the requested name is found,
977  * then on return *ipp points at the inode and *slotp describes
978  * its location in the directory.  If the name is not found, then *ipp
979  * will be NULL and *slotp will contain information about a directory slot in
980  * which an entry may be made (either an empty slot, or the first position
981  * past the end of the directory).
982  * The target directory inode (tdp) is supplied write locked (i_rwlock).
983  *
984  * This may not be used on "." or "..", but aliases of "." are ok.
985  */
986 int
987 ufs_dircheckforname(
988 	struct inode *tdp,	/* inode of directory being checked */
989 	char *namep,		/* name we're checking for */
990 	int namlen,		/* length of name, excluding null */
991 	struct ufs_slot *slotp,	/* slot structure */
992 	struct inode **ipp,	/* return inode if we find one */
993 	struct cred *cr,
994 	int noentry)		/* noentry - just look for space */
995 {
996 	uint64_t handle;
997 	struct fbuf *fbp;	/* pointer to directory block */
998 	struct direct *ep;	/* directory entry */
999 	struct direct *nep;	/* next directory entry */
1000 	dcanchor_t *dcap;
1001 	vnode_t *dvp;		/* directory vnode ptr */
1002 	off_t dirsize;		/* size of the directory */
1003 	off_t offset;		/* offset in the directory */
1004 	off_t last_offset;	/* last offset */
1005 	off_t enduseful;	/* pointer past last used dir slot */
1006 	int entryoffsetinblk;	/* offset of ep in fbp's buffer */
1007 	int i;			/* length of mangled entry */
1008 	int needed;
1009 	int err;
1010 	int first;
1011 	int caching;
1012 	int stat;
1013 	ino_t ep_ino;
1014 	slotstat_t initstat = slotp->status;
1015 
1016 	ASSERT(RW_WRITE_HELD(&tdp->i_rwlock));
1017 	ASSERT(RW_WRITE_HELD(&tdp->i_contents));
1018 	ASSERT(*ipp == NULL);
1019 	fbp = NULL;
1020 
1021 	/*
1022 	 * First check if there is a complete cache of the directory.
1023 	 */
1024 	dvp = ITOV(tdp);
1025 
1026 	dcap = &tdp->i_danchor;
1027 	if (noentry) {
1028 		/*
1029 		 * We know from the 1st level dnlc cache that the entry
1030 		 * doesn't exist, so don't bother searching the directory
1031 		 * cache, but just look for space (possibly in the directory
1032 		 * cache).
1033 		 */
1034 		stat = DNOENT;
1035 	} else {
1036 		stat = dnlc_dir_lookup(dcap, namep, &handle);
1037 	}
1038 	switch (stat) {
1039 	case DFOUND:
1040 		ep_ino = (ino_t)H_TO_INO(handle);
1041 		if (tdp->i_number == ep_ino) {
1042 			*ipp = tdp;	/* we want ourself, ie "." */
1043 			VN_HOLD(dvp);
1044 		} else {
1045 			err = ufs_iget_alloced(tdp->i_vfs, ep_ino, ipp, cr);
1046 			if (err)
1047 				return (err);
1048 		}
1049 		offset = H_TO_OFF(handle);
1050 		first = 0;
1051 		if (offset & 1) {
1052 			/* This is the first entry in the block */
1053 			first = 1;
1054 			offset -= 1;
1055 			ASSERT((offset & (DIRBLKSIZ - 1)) == 0);
1056 		}
1057 		err = blkatoff(tdp, offset, (char **)&ep, &fbp);
1058 		if (err) {
1059 			VN_RELE(ITOV(*ipp));
1060 			*ipp = NULL;
1061 			return (err);
1062 		}
1063 		/*
1064 		 * Check the validity of the entry.
1065 		 * If it's bad, then throw away the cache and
1066 		 * continue without it. The dirmangled() routine
1067 		 * will then be called upon it.
1068 		 */
1069 		if ((ep->d_reclen == 0) || (ep->d_reclen & 0x3)) {
1070 			VN_RELE(ITOV(*ipp));
1071 			*ipp = NULL;
1072 			dnlc_dir_purge(dcap);
1073 			break;
1074 		}
1075 		/*
1076 		 * Remember the returned offset is the offset of the
1077 		 * preceding record (unless this is the 1st record
1078 		 * in the DIRBLKSIZ sized block (disk sector)), then it's
1079 		 * offset + 1. Note, no real offsets are on odd boundaries.
1080 		 */
1081 		if (first) {
1082 			ASSERT((offset & (DIRBLKSIZ - 1)) == 0);
1083 			slotp->offset = offset;
1084 			slotp->size = 0;
1085 			slotp->ep = ep;
1086 		} else {
1087 			/* get the next entry */
1088 			nep = (struct direct *)((char *)ep + ep->d_reclen);
1089 			/*
1090 			 * Check the validity of this entry as well
1091 			 * If it's bad, then throw away the cache and
1092 			 * continue without it. The dirmangled() routine
1093 			 * will then be called upon it.
1094 			 */
1095 			if ((nep->d_reclen == 0) || (nep->d_reclen & 0x3) ||
1096 			    (nep->d_ino != ep_ino)) {
1097 				VN_RELE(ITOV(*ipp));
1098 				*ipp = NULL;
1099 				dnlc_dir_purge(dcap);
1100 				break;
1101 			}
1102 			slotp->offset = offset + ep->d_reclen;
1103 			slotp->size = ep->d_reclen;
1104 			slotp->ep = nep;
1105 		}
1106 		slotp->status = EXIST;
1107 		slotp->fbp = fbp;
1108 		slotp->endoff = 0;
1109 		slotp->cached = 1;
1110 		dnlc_update(dvp, namep, ITOV(*ipp));
1111 		return (0);
1112 	case DNOENT:
1113 		/*
1114 		 * The caller gets to set the initial slot status to
1115 		 * indicate whether it's interested in getting an
1116 		 * empty slot. For example, the status can be set
1117 		 * to FOUND when an entry is being deleted.
1118 		 */
1119 		ASSERT(slotp->fbp == NULL);
1120 		if (slotp->status == FOUND) {
1121 			return (0);
1122 		}
1123 		switch (dnlc_dir_rem_space_by_len(dcap, LDIRSIZ(namlen),
1124 		    &handle)) {
1125 		case DFOUND:
1126 			offset = (off_t)handle;
1127 			err = blkatoff(tdp, offset, (char **)&ep, &fbp);
1128 			if (err) {
1129 				dnlc_dir_purge(dcap);
1130 				ASSERT(*ipp == NULL);
1131 				return (err);
1132 			}
1133 			/*
1134 			 * Check the validity of the entry.
1135 			 * If it's bad, then throw away the cache and
1136 			 * continue without it. The dirmangled() routine
1137 			 * will then be called upon it.
1138 			 */
1139 			if ((ep->d_reclen == 0) || (ep->d_reclen & 0x3)) {
1140 				dnlc_dir_purge(dcap);
1141 				break;
1142 			}
1143 			/*
1144 			 * Remember the returned offset is the offset of the
1145 			 * containing record.
1146 			 */
1147 			slotp->status = FOUND;
1148 			slotp->ep = ep;
1149 			slotp->offset = offset;
1150 			slotp->fbp = fbp;
1151 			slotp->size = ep->d_reclen;
1152 			/*
1153 			 * Set end offset to 0. Truncation is handled
1154 			 * because the dnlc cache will blow away the
1155 			 * cached directory when an entry is removed
1156 			 * that drops the entries left to less than half
1157 			 * the minimum number (dnlc_min_dir_cache).
1158 			 */
1159 			slotp->endoff = 0;
1160 			slotp->cached = 1;
1161 			return (0);
1162 		case DNOENT:
1163 			slotp->status = NONE;
1164 			slotp->offset = P2ROUNDUP_TYPED(tdp->i_size,
1165 			    DIRBLKSIZ, u_offset_t);
1166 			slotp->size = DIRBLKSIZ;
1167 			slotp->endoff = 0;
1168 			slotp->cached = 1;
1169 			return (0);
1170 		default:
1171 			break;
1172 		}
1173 		break;
1174 	}
1175 	slotp->cached = 0;
1176 	caching = NULL;
1177 	if (!noentry && tdp->i_size >= ufs_min_dir_cache) {
1178 		/*
1179 		 * if the directory caching disable time has expired
1180 		 * enable caching again.
1181 		 */
1182 		if (tdp->i_cachedir == CD_DISABLED_NOMEM &&
1183 		    gethrtime() - ufs_dc_disable_at > ufs_dc_disable_duration) {
1184 			ufs_dc_disable_at = 0;
1185 			tdp->i_cachedir = CD_ENABLED;
1186 		}
1187 		/*
1188 		 * Attempt to cache any directories greater than the tunable
1189 		 * ufs_min_dir_cache. If it fails due to memory shortage
1190 		 * (DNOMEM), disable caching for this directory and record
1191 		 * the system time. Any attempt after the disable time has
1192 		 * expired will enable the caching again.
1193 		 */
1194 		if (tdp->i_cachedir == CD_ENABLED) {
1195 			switch (dnlc_dir_start(dcap,
1196 			    tdp->i_size >> AV_DIRECT_SHIFT)) {
1197 			case DNOMEM:
1198 				tdp->i_cachedir = CD_DISABLED_NOMEM;
1199 				ufs_dc_disable_at = gethrtime();
1200 				break;
1201 			case DTOOBIG:
1202 				tdp->i_cachedir = CD_DISABLED_TOOBIG;
1203 				break;
1204 			case DOK:
1205 				caching = 1;
1206 				break;
1207 			default:
1208 				break;
1209 			}
1210 		}
1211 	}
1212 
1213 	/*
1214 	 * No point in using i_diroff since we must search whole directory
1215 	 */
1216 	dirsize = P2ROUNDUP_TYPED(tdp->i_size, DIRBLKSIZ, u_offset_t);
1217 	enduseful = 0;
1218 	offset = last_offset = 0;
1219 	entryoffsetinblk = 0;
1220 	needed = (int)LDIRSIZ(namlen);
1221 	while (offset < dirsize) {
1222 		/*
1223 		 * If offset is on a block boundary,
1224 		 * read the next directory block.
1225 		 * Release previous if it exists.
1226 		 */
1227 		if (blkoff(tdp->i_fs, offset) == 0) {
1228 			if (fbp != NULL)
1229 				fbrelse(fbp, S_OTHER);
1230 
1231 			err = blkatoff(tdp, offset, (char **)0, &fbp);
1232 			if (err) {
1233 				ASSERT(*ipp == NULL);
1234 				if (caching) {
1235 					dnlc_dir_purge(dcap);
1236 				}
1237 				return (err);
1238 			}
1239 			entryoffsetinblk = 0;
1240 		}
1241 		/*
1242 		 * If still looking for a slot, and at a DIRBLKSIZ
1243 		 * boundary, have to start looking for free space
1244 		 * again.
1245 		 */
1246 		if (slotp->status == NONE &&
1247 		    (entryoffsetinblk & (DIRBLKSIZ - 1)) == 0) {
1248 			slotp->offset = -1;
1249 		}
1250 		/*
1251 		 * If the next entry is a zero length record or if the
1252 		 * record length is invalid, then skip to the next
1253 		 * directory block.  Complete validation checks are
1254 		 * done if the record length is invalid.
1255 		 *
1256 		 * Full validation checks are slow so they are disabled
1257 		 * by default.  Complete checks can be run by patching
1258 		 * "dirchk" to be true.
1259 		 *
1260 		 * We do not have to check the validity of
1261 		 * entryoffsetinblk here because it starts out as zero
1262 		 * and is only incremented by d_reclen values that we
1263 		 * validate here.
1264 		 */
1265 		ep = (struct direct *)(fbp->fb_addr + entryoffsetinblk);
1266 		if (ep->d_reclen == 0 ||
1267 		    (dirchk || (ep->d_reclen & 0x3)) &&
1268 		    dirmangled(tdp, ep, entryoffsetinblk, offset)) {
1269 			i = DIRBLKSIZ - (entryoffsetinblk & (DIRBLKSIZ - 1));
1270 			offset += i;
1271 			entryoffsetinblk += i;
1272 			if (caching) {
1273 				dnlc_dir_purge(dcap);
1274 				caching = 0;
1275 			}
1276 			continue;
1277 		}
1278 
1279 		/*
1280 		 * Add named entries and free space into the directory cache
1281 		 */
1282 		if (caching) {
1283 			ushort_t extra;
1284 			off_t off2;
1285 
1286 			if (ep->d_ino == 0) {
1287 				extra = ep->d_reclen;
1288 				if (offset & (DIRBLKSIZ - 1)) {
1289 					dnlc_dir_purge(dcap);
1290 					caching = 0;
1291 				}
1292 			} else {
1293 				/*
1294 				 * entries hold the previous offset if
1295 				 * not the 1st one
1296 				 */
1297 				if (offset & (DIRBLKSIZ - 1)) {
1298 					off2 = last_offset;
1299 				} else {
1300 					off2 = offset + 1;
1301 				}
1302 				caching = (dnlc_dir_add_entry(dcap, ep->d_name,
1303 				    INO_OFF_TO_H(ep->d_ino, off2)) == DOK);
1304 				extra = ep->d_reclen - DIRSIZ(ep);
1305 			}
1306 			if (caching && (extra >= LDIRSIZ(1))) {
1307 				caching = (dnlc_dir_add_space(dcap, extra,
1308 				    (uint64_t)offset) == DOK);
1309 			}
1310 		}
1311 
1312 		/*
1313 		 * If an appropriate sized slot has not yet been found,
1314 		 * check to see if one is available.
1315 		 */
1316 		if ((slotp->status != FOUND) && (slotp->status != EXIST)) {
1317 			int size = ep->d_reclen;
1318 
1319 			if (ep->d_ino != 0)
1320 				size -= DIRSIZ(ep);
1321 			if (size > 0) {
1322 				if (size >= needed) {
1323 					slotp->offset = offset;
1324 					slotp->size = ep->d_reclen;
1325 					if (noentry) {
1326 						slotp->ep = ep;
1327 						slotp->fbp = fbp;
1328 						slotp->status = FOUND;
1329 						slotp->endoff = 0;
1330 						return (0);
1331 					}
1332 					slotp->status = FOUND;
1333 				} else if (slotp->status == NONE) {
1334 					if (slotp->offset == -1)
1335 						slotp->offset = offset;
1336 				}
1337 			}
1338 		}
1339 		/*
1340 		 * Check for a name match.
1341 		 */
1342 		if (ep->d_ino && ep->d_namlen == namlen &&
1343 		    *namep == *ep->d_name &&	/* fast chk 1st char */
1344 		    bcmp(namep, ep->d_name, namlen) == 0) {
1345 
1346 			tdp->i_diroff = offset;
1347 
1348 			if (tdp->i_number == ep->d_ino) {
1349 				*ipp = tdp;	/* we want ourself, ie "." */
1350 				VN_HOLD(dvp);
1351 			} else {
1352 				err = ufs_iget_alloced(tdp->i_vfs,
1353 				    (ino_t)ep->d_ino, ipp, cr);
1354 				if (err) {
1355 					fbrelse(fbp, S_OTHER);
1356 					if (caching)
1357 						dnlc_dir_purge(dcap);
1358 					return (err);
1359 				}
1360 			}
1361 			slotp->status = EXIST;
1362 			slotp->offset = offset;
1363 			slotp->size = (int)(offset - last_offset);
1364 			slotp->fbp = fbp;
1365 			slotp->ep = ep;
1366 			slotp->endoff = 0;
1367 			if (caching)
1368 				dnlc_dir_purge(dcap);
1369 			return (0);
1370 		}
1371 		last_offset = offset;
1372 		offset += ep->d_reclen;
1373 		entryoffsetinblk += ep->d_reclen;
1374 		if (ep->d_ino)
1375 			enduseful = offset;
1376 	}
1377 	if (fbp) {
1378 		fbrelse(fbp, S_OTHER);
1379 	}
1380 
1381 	if (caching) {
1382 		dnlc_dir_complete(dcap);
1383 		slotp->cached = 1;
1384 		if (slotp->status == FOUND) {
1385 			if (initstat == FOUND) {
1386 				return (0);
1387 			}
1388 			(void) dnlc_dir_rem_space_by_handle(dcap,
1389 			    slotp->offset);
1390 			slotp->endoff = 0;
1391 			return (0);
1392 		}
1393 	}
1394 
1395 	if (slotp->status == NONE) {
1396 		/*
1397 		 * We didn't find a slot; the new directory entry should be put
1398 		 * at the end of the directory.  Return an indication of where
1399 		 * this is, and set "endoff" to zero; since we're going to have
1400 		 * to extend the directory, we're certainly not going to
1401 		 * truncate it.
1402 		 */
1403 		slotp->offset = dirsize;
1404 		slotp->size = DIRBLKSIZ;
1405 		slotp->endoff = 0;
1406 	} else {
1407 		/*
1408 		 * We found a slot, and will return an indication of where that
1409 		 * slot is, as any new directory entry will be put there.
1410 		 * Since that slot will become a useful entry, if the last
1411 		 * useful entry we found was before this one, update the offset
1412 		 * of the last useful entry.
1413 		 */
1414 		if (enduseful < slotp->offset + slotp->size)
1415 			enduseful = slotp->offset + slotp->size;
1416 		slotp->endoff = P2ROUNDUP_TYPED(enduseful, DIRBLKSIZ, off_t);
1417 	}
1418 	*ipp = NULL;
1419 	return (0);
1420 }
1421 
1422 uint64_t ufs_dirrename_retry_cnt;	/* times ufs_dirrename() restarted its lock grabs */
1423 
1424 /*
1425  * Rename the entry in the directory tdp so that it points to
1426  * sip instead of tip.
      *
      * The caller holds tdp's i_rwlock and i_contents as writer and passes
      * in slotp describing the existing entry for namep (slotp->ep,
      * slotp->fbp, slotp->offset, slotp->size and slotp->cached are used).
      * This routine itself takes tip->i_contents as writer and
      * sip->i_contents as reader; both are dropped again on every return
      * that follows their acquisition.
      *
      * slotp->fbp is written out (or released) and set to NULL once the
      * entry has been rewritten; on the earlier error exits it is left
      * untouched.
      *
      * Returns 0 on success, ESAME when sip and tip are the same inode,
      * EXDEV, EISDIR, ENOTDIR, EBUSY or EEXIST for the checks below, or
      * the error produced by the access checks, TRANS_DIR(), ufs_fbwrite(),
      * ufs_fault() or ufs_dirfixdotdot().
1427  */
1428 static int
1429 ufs_dirrename(
1430 	struct inode *sdp,	/* parent directory of source */
1431 	struct inode *sip,	/* source inode */
1432 	struct inode *tdp,	/* parent directory of target */
1433 	char *namep,		/* entry we are trying to change */
1434 	struct inode *tip,	/* target inode */
1435 	struct ufs_slot *slotp,	/* slot for entry */
1436 	struct cred *cr)	/* credentials */
1437 {
1438 	vnode_t *tdvp;
1439 	off_t offset;
1440 	int err;
1441 	int doingdirectory;
1442 
1443 	ASSERT(sdp->i_ufsvfs != NULL);
1444 	ASSERT(RW_WRITE_HELD(&tdp->i_rwlock));
1445 	ASSERT(RW_WRITE_HELD(&tdp->i_contents));
1446 	/*
1447 	 * Short circuit rename of something to itself.
     	 * No inode locks have been taken yet, so just return.
1448 	 */
1449 	if (sip->i_number == tip->i_number) {
1450 		return (ESAME); /* special KLUDGE error code */
1451 	}
1452 
1453 	/*
1454 	 * We're locking 2 peer level locks, so must use tryenter
1455 	 * on the 2nd to avoid deadlocks that would occur
1456 	 * if we renamed a->b and b->a concurrently.
1457 	 */
1458 retry:
1459 	rw_enter(&tip->i_contents, RW_WRITER);
1460 	if (!rw_tryenter(&sip->i_contents, RW_READER)) {
1461 		/*
1462 		 * drop tip and wait (sleep) until we stand a chance
1463 		 * of holding sip
1464 		 */
1465 		rw_exit(&tip->i_contents);
1466 		rw_enter(&sip->i_contents, RW_READER);
1467 		/*
1468 		 * Reverse the lock grabs in case we have heavy
1469 		 * contention on the 2nd lock.
1470 		 */
1471 		if (!rw_tryenter(&tip->i_contents, RW_WRITER)) {
1472 			ufs_dirrename_retry_cnt++;
1473 			rw_exit(&sip->i_contents);
1474 			goto retry;
1475 		}
1476 	}
1477 
     	/*
     	 * From here on tip->i_contents (writer) and sip->i_contents
     	 * (reader) are both held; every exit must release both.
     	 */
1478 	/*
1479 	 * Check that everything is on the same filesystem.
1480 	 */
1481 	if ((ITOV(tip)->v_vfsp != ITOV(tdp)->v_vfsp) ||
1482 	    (ITOV(tip)->v_vfsp != ITOV(sip)->v_vfsp)) {
1483 		err = EXDEV;		/* XXX archaic */
1484 		goto out;
1485 	}
1486 	/*
1487 	 * Must have write permission to rewrite target entry.
1488 	 * Perform additional checks for sticky directories.
1489 	 */
1490 	if ((err = ufs_iaccess(tdp, IWRITE, cr)) != 0 ||
1491 	    (err = ufs_sticky_remove_access(tdp, tip, cr)) != 0)
1492 		goto out;
1493 
1494 	/*
1495 	 * Ensure source and target are compatible (both directories
1496 	 * or both not directories).  If target is a directory it must
1497 	 * be empty and have no links to it; in addition it must not
1498 	 * be a mount point, and both the source and target must be
1499 	 * writable.
1500 	 */
1501 	doingdirectory = (((sip->i_mode & IFMT) == IFDIR) ||
1502 	    ((sip->i_mode & IFMT) == IFATTRDIR));
1503 	if (((tip->i_mode & IFMT) == IFDIR) ||
1504 	    ((tip->i_mode & IFMT) == IFATTRDIR)) {
1505 		if (!doingdirectory) {
1506 			err = EISDIR;
1507 			goto out;
1508 		}
1509 		/*
1510 		 * vn_vfsrlock will prevent mounts from using the directory
1511 		 * until we are done.
     		 * Once taken, it must be dropped with vn_vfsunlock()
     		 * before any exit from this routine.
1512 		 */
1513 		if (vn_vfsrlock(ITOV(tip))) {
1514 			err = EBUSY;
1515 			goto out;
1516 		}
1517 		if (vn_mountedvfs(ITOV(tip)) != NULL) {
1518 			vn_vfsunlock(ITOV(tip));
1519 			err = EBUSY;
1520 			goto out;
1521 		}
1522 		if (!ufs_dirempty(tip, tdp->i_number, cr) || tip->i_nlink > 2) {
1523 			vn_vfsunlock(ITOV(tip));
1524 			err = EEXIST;	/* SIGH should be ENOTEMPTY */
1525 			goto out;
1526 		}
1527 	} else if (doingdirectory) {
1528 		err = ENOTDIR;
1529 		goto out;
1530 	}
1531 
1532 	/*
1533 	 * Rewrite the inode pointer for target name entry
1534 	 * from the target inode (tip) to the source inode (sip).
1535 	 * This prevents the target entry from disappearing
1536 	 * during a crash. Mark the directory inode to reflect the changes.
1537 	 */
1538 	tdvp = ITOV(tdp);
1539 	slotp->ep->d_ino = (int32_t)sip->i_number;
1540 	dnlc_update(tdvp, namep, ITOV(sip));
     	/*
     	 * The directory cache handle for a name carries the offset of
     	 * the preceding entry (slotp->size is that entry's length), or
     	 * the entry's own offset + 1 when slotp->size is zero, i.e. it
     	 * is the first entry in its block.
     	 */
1541 	if (slotp->size) {
1542 		offset = slotp->offset - slotp->size;
1543 	} else {
1544 		offset = slotp->offset + 1;
1545 	}
1546 	if (slotp->cached) {
1547 		(void) dnlc_dir_update(&tdp->i_danchor, namep,
1548 		    INO_OFF_TO_H(slotp->ep->d_ino, offset));
1549 	}
1550 
     	/*
     	 * Either way the buffer is consumed here: released if the
     	 * TRANS_DIR() call fails, otherwise handed to ufs_fbwrite().
     	 */
1551 	err = TRANS_DIR(tdp, slotp->offset);
1552 	if (err)
1553 		fbrelse(slotp->fbp, S_OTHER);
1554 	else
1555 		err = ufs_fbwrite(slotp->fbp, tdp);
1556 
1557 	slotp->fbp = NULL;
1558 	if (err) {
     		/*
     		 * doingdirectory implies tip is a directory here, so
     		 * the vfs lock taken above is still held.
     		 */
1559 		if (doingdirectory)
1560 			vn_vfsunlock(ITOV(tip));
1561 		goto out;
1562 	}
1563 
1564 	TRANS_INODE(tdp->i_ufsvfs, tdp);
1565 	tdp->i_flag |= IUPD|ICHG;
1566 	tdp->i_seq++;
1567 	ITIMES_NOLOCK(tdp);
1568 
1569 	/*
1570 	 * Decrement the link count of the target inode.
1571 	 * Fix the ".." entry in sip to point to tdp.
1572 	 * This is done after the new entry is on the disk.
1573 	 */
1574 	tip->i_nlink--;
1575 	TRANS_INODE(tip->i_ufsvfs, tip);
1576 	tip->i_flag |= ICHG;
1577 	tip->i_seq++;
1578 	ITIMES_NOLOCK(tip);
1579 	if (doingdirectory) {
1580 		/*
1581 		 * The entry for tip no longer exists so I can unlock the
1582 		 * vfslock.
1583 		 */
1584 		vn_vfsunlock(ITOV(tip));
1585 		/*
1586 		 * Decrement target link count once more if it was a directory.
1587 		 */
1588 		if (--tip->i_nlink != 0) {
1589 			err = ufs_fault(ITOV(tip),
1590 		    "ufs_dirrename: target directory link count != 0 (%s)",
1591 			    tip->i_fs->fs_fsmnt);
     			/*
     			 * Both inode locks are held at this point; drop
     			 * sip's reader hold as well as tip's so this
     			 * error exit does not leave sip->i_contents
     			 * locked.
     			 */
1592 			rw_exit(&tip->i_contents);
     			rw_exit(&sip->i_contents);
1593 			return (err);
1594 		}
1595 		TRANS_INODE(tip->i_ufsvfs, tip);
1596 		ufs_setreclaim(tip);
1597 		/*
1598 		 * Renaming a directory with the parent different
1599 		 * requires that ".." be rewritten.  The window is
1600 		 * still there for ".." to be inconsistent, but this
1601 		 * is unavoidable, and a lot shorter than when it was
1602 		 * done in a user process.  We decrement the link
1603 		 * count in the new parent as appropriate to reflect
1604 		 * the just-removed target.  If the parent is the
1605 		 * same, this is appropriate since the original
1606 		 * directory is going away.  If the new parent is
1607 		 * different, ufs_dirfixdotdot() will bump the link count
1608 		 * back.
1609 		 */
1610 		tdp->i_nlink--;
1611 		ufs_setreclaim(tdp);
1612 		TRANS_INODE(tdp->i_ufsvfs, tdp);
1613 		tdp->i_flag |= ICHG;
1614 		tdp->i_seq++;
1615 		ITIMES_NOLOCK(tdp);
1616 		if (sdp != tdp) {
     			/*
     			 * ufs_dirfixdotdot() takes sip's i_rwlock and
     			 * i_contents as writer itself, so our holds are
     			 * released before calling it.
     			 */
1617 			rw_exit(&tip->i_contents);
1618 			rw_exit(&sip->i_contents);
1619 			err = ufs_dirfixdotdot(sip, sdp, tdp);
1620 			return (err);
1621 		}
1622 	} else
1623 		ufs_setreclaim(tip);
     	/* Common exit: err is 0 on the success paths that fall through. */
1624 out:
1625 	rw_exit(&tip->i_contents);
1626 	rw_exit(&sip->i_contents);
1627 	return (err);
1628 }
1629 
1630 /*
1631  * Fix the ".." entry of the child directory so that it points
1632  * to the new parent directory instead of the old one.  Routine
1633  * assumes that dp is a directory and that all the inodes are on
1634  * the same file system.
      *
      * The caller holds npdp's i_rwlock and i_contents as writer.  This
      * routine takes dp's i_rwlock and i_contents as writer for the
      * rewrite, and afterwards opdp's i_contents as writer to drop the old
      * parent's link count.
      *
      * Returns 0 on success; ENOENT if dp has no links or is smaller than
      * a dirtemplate, ENOTDIR if the ".." entry is mangled, EMLINK if the
      * new parent already has MAXLINK links, or the error returned by
      * blkatoff(), TRANS_DIR() or ufs_fbwrite().
1635  */
1636 static int
1637 ufs_dirfixdotdot(
1638 	struct inode *dp,	/* child directory */
1639 	struct inode *opdp,	/* old parent directory */
1640 	struct inode *npdp)	/* new parent directory */
1641 {
     	/*
     	 * fbp starts out NULL so that the "bad" exit can test it safely
     	 * even when blkatoff() fails without storing a buffer pointer.
     	 */
1642 	struct fbuf *fbp = NULL;
1643 	struct dirtemplate *dirp;
1644 	vnode_t *dvp;
1645 	int err;
1646 
1647 	ASSERT(RW_WRITE_HELD(&npdp->i_rwlock));
1648 	ASSERT(RW_WRITE_HELD(&npdp->i_contents));
1649 
1650 	/*
1651 	 * We hold the child directory's i_contents lock before calling
1652 	 * blkatoff so that we honor correct locking protocol which is
1653 	 * i_contents lock and then page lock. (blkatoff will call
1654 	 * ufs_getpage where we want the page lock)
1655 	 * We hold the child directory's i_rwlock before i_contents (as
1656 	 * per the locking protocol) since we are modifying the ".." entry
1657 	 * of the child directory.
1658 	 * We hold the i_rwlock and i_contents lock until we record
1659 	 * this directory delta to the log (via ufs_trans_dir) and have
1660 	 * done fbrelse.
1661 	 */
1662 	rw_enter(&dp->i_rwlock, RW_WRITER);
1663 	rw_enter(&dp->i_contents, RW_WRITER);
1664 	err = blkatoff(dp, (off_t)0, (char **)&dirp, &fbp);
1665 	if (err)
1666 		goto bad;
1667 
     	/* The child must still be linked and big enough to hold "." and "..". */
1668 	if (dp->i_nlink <= 0 ||
1669 	    dp->i_size < sizeof (struct dirtemplate)) {
1670 		err = ENOENT;
1671 		goto bad;
1672 	}
1673 
1674 	if (dirp->dotdot_namlen != 2 ||
1675 	    dirp->dotdot_name[0] != '.' ||
1676 	    dirp->dotdot_name[1] != '.') {	/* Sanity check. */
1677 		dirbad(dp, "mangled .. entry", (off_t)0);
1678 		err = ENOTDIR;
1679 		goto bad;
1680 	}
1681 
1682 	/*
1683 	 * Increment the link count in the new parent inode and force it out.
1684 	 */
1685 	if (npdp->i_nlink == MAXLINK) {
1686 		err = EMLINK;
1687 		goto bad;
1688 	}
1689 	npdp->i_nlink++;
1690 	TRANS_INODE(npdp->i_ufsvfs, npdp);
1691 	npdp->i_flag |= ICHG;
1692 	npdp->i_seq++;
1693 	ufs_iupdat(npdp, I_SYNC);
1694 
1695 	/*
1696 	 * Rewrite the child ".." entry and force it out.
     	 * Both name caches (dnlc and the per-directory cache) are
     	 * updated to the new parent as well.
1697 	 */
1698 	dvp = ITOV(dp);
1699 	dirp->dotdot_ino = (uint32_t)npdp->i_number;
1700 	dnlc_update(dvp, "..", ITOV(npdp));
1701 	(void) dnlc_dir_update(&dp->i_danchor, "..",
1702 	    INO_OFF_TO_H(dirp->dotdot_ino, 0));
1703 
     	/*
     	 * The buffer is consumed below (released or written), so fbp is
     	 * cleared to keep the "bad" exit from releasing it a second time.
     	 */
1704 	err = TRANS_DIR(dp, 0);
1705 	if (err)
1706 		fbrelse(fbp, S_OTHER);
1707 	else
1708 		err = ufs_fbwrite(fbp, dp);
1709 
1710 	fbp = NULL;
1711 	if (err)
1712 		goto bad;
1713 
1714 	rw_exit(&dp->i_contents);
1715 	rw_exit(&dp->i_rwlock);
1716 
1717 	/*
1718 	 * Decrement the link count of the old parent inode and force it out.
1719 	 */
1720 	ASSERT(opdp);
1721 	rw_enter(&opdp->i_contents, RW_WRITER);
1722 	ASSERT(opdp->i_nlink > 0);
1723 	opdp->i_nlink--;
1724 	ufs_setreclaim(opdp);
1725 	TRANS_INODE(opdp->i_ufsvfs, opdp);
1726 	opdp->i_flag |= ICHG;
1727 	opdp->i_seq++;
1728 	ufs_iupdat(opdp, I_SYNC);
1729 	rw_exit(&opdp->i_contents);
1730 	return (0);
1731 
     	/* Error exit: release the block if still held, then dp's locks. */
1732 bad:
1733 	if (fbp)
1734 		fbrelse(fbp, S_OTHER);
1735 	rw_exit(&dp->i_contents);
1736 	rw_exit(&dp->i_rwlock);
1737 	return (err);
1738 }
1739 
1740 /*
1741  * Enter the file sip in the directory tdp with name namep.
      *
      *	tdp	directory receiving the entry; the caller holds its
      *		i_rwlock and i_contents as writer
      *	namep	name of the new entry
      *	op	operation in progress; only DE_RENAME is examined here
      *	namlen	length of namep
      *	slotp	slot description supplied by the caller and finalised
      *		by dirprepareentry()
      *	sip	inode the new entry will refer to
      *	sdp	parent directory of the source (used for DE_RENAME)
      *	cr	credentials
      *
      * Returns 0 on success or an error number.  Every return path leaves
      * slotp->fbp NULL: the directory block is either written out or
      * released here.
1742  */
1743 static int
1744 ufs_diraddentry(
1745 	struct inode *tdp,
1746 	char *namep,
1747 	enum de_op op,
1748 	int namlen,
1749 	struct ufs_slot *slotp,
1750 	struct inode *sip,
1751 	struct inode *sdp,
1752 	struct cred *cr)
1753 {
1754 	struct direct *ep, *nep;
1755 	vnode_t *tdvp;
1756 	dcanchor_t *dcap = &tdp->i_danchor;
1757 	off_t offset;
1758 	int err;
1759 	ushort_t extra;
1760 
1761 	ASSERT(RW_WRITE_HELD(&tdp->i_rwlock));
1762 	ASSERT(RW_WRITE_HELD(&tdp->i_contents));
1763 	/*
1764 	 * Prepare the slot to receive the new entry.  On failure,
1765 	 * release any directory block the slot still holds.
1766 	 */
1767 	err = dirprepareentry(tdp, slotp, cr);
1768 	if (err) {
1769 		if (slotp->fbp) {
1770 			fbrelse(slotp->fbp, S_OTHER);
1771 			slotp->fbp = NULL;
1772 		}
1773 		return (err);
1774 	}
1775 	/*
1776 	 * Check inode to be linked to see if it is in the
1777 	 * same filesystem.
1778 	 */
1779 	if (ITOV(tdp)->v_vfsp != ITOV(sip)->v_vfsp) {
1780 		err = EXDEV;
1781 		goto bad;
1782 	}
1783 
1784 	/*
1785 	 * If renaming a directory then fix up the ".." entry in the
1786 	 * directory to point to the new parent.
1787 	 */
1788 	if ((op == DE_RENAME) && (((sip->i_mode & IFMT) == IFDIR) ||
1789 	    ((sip->i_mode & IFMT) == IFATTRDIR)) && (sdp != tdp)) {
1790 		err = ufs_dirfixdotdot(sip, sdp, tdp);
1791 		if (err)
1792 			goto bad;
1793 	}
1794 
1795 	/*
1796 	 * Fill in entry data.
     	 * The copy length is namlen + 1 rounded up to a multiple of 4;
     	 * strncpy() NUL-fills whatever follows the name within that
     	 * length (assumes namep is a NUL-terminated string of namlen
     	 * characters -- confirm against callers).
1797 	 */
1798 	ep = slotp->ep;
1799 	ep->d_namlen = (ushort_t)namlen;
1800 	(void) strncpy(ep->d_name, namep, (size_t)((namlen + 4) & ~3));
1801 	ep->d_ino = (uint32_t)sip->i_number;
1802 	tdvp = ITOV(tdp);
1803 	dnlc_update(tdvp, namep, ITOV(sip));
1804 	/*
1805 	 * Note the offset supplied for any named entry is
1806 	 * the offset of the previous one, unless it's the 1st.
1807 	 * slotp->size is used to pass the length to
1808 	 * the previous entry.
1809 	 */
1810 	if (slotp->size) {
1811 		offset = slotp->offset - slotp->size;
1812 	} else {
1813 		offset = slotp->offset + 1;
1814 	}
1815 
1816 	if (slotp->cached) {
1817 		/*
1818 		 * Add back any usable unused space to the dnlc directory
1819 		 * cache.
1820 		 */
1821 		extra = ep->d_reclen - DIRSIZ(ep);
1822 		if (extra >= LDIRSIZ(1)) {
1823 			(void) dnlc_dir_add_space(dcap, extra,
1824 			    (uint64_t)slotp->offset);
1825 		}
1826 
1827 		(void) dnlc_dir_add_entry(dcap, namep,
1828 		    INO_OFF_TO_H(ep->d_ino, offset));
1829 
1830 		/* adjust the previous offset of the next entry */
1831 		nep = (struct direct *)((char *)ep + ep->d_reclen);
1832 		if ((uintptr_t)nep & (DIRBLKSIZ - 1)) {
1833 			/*
1834 			 * Not a new block.
1835 			 *
1836 			 * Check the validity of the next entry.
1837 			 * If it's bad, then throw away the cache, and
1838 			 * continue as before directory caching.
1839 			 */
1840 			if ((nep->d_reclen == 0) || (nep->d_reclen & 0x3) ||
1841 			    dnlc_dir_update(dcap, nep->d_name,
1842 			    INO_OFF_TO_H(nep->d_ino, slotp->offset))
1843 			    == DNOENT) {
1844 				dnlc_dir_purge(dcap);
1845 				slotp->cached = 0;
1846 			}
1847 		}
1848 	}
1849 
1850 	/*
1851 	 * Write out the directory block.
     	 * The buffer is released instead if TRANS_DIR() fails.
1852 	 */
1853 	err = TRANS_DIR(tdp, slotp->offset);
1854 	if (err)
1855 		fbrelse(slotp->fbp, S_OTHER);
1856 	else
1857 		err = ufs_fbwrite(slotp->fbp, tdp);
1858 
1859 	slotp->fbp = NULL;
1860 	/*
1861 	 * If this is a rename of a directory, then we have already
1862 	 * fixed the ".." entry to refer to the new parent. If err
1863 	 * is true at this point, we have failed to update the new
1864 	 * parent to refer to the renamed directory.
1865 	 * XXX - we need to unwind the ".." fix.
1866 	 */
1867 	if (err)
1868 		return (err);
1869 
1870 	/*
1871 	 * Mark the directory inode to reflect the changes.
1872 	 * Truncate the directory to chop off blocks of empty entries.
1873 	 */
1874 
1875 	TRANS_INODE(tdp->i_ufsvfs, tdp);
1876 	tdp->i_flag |= IUPD|ICHG;
1877 	tdp->i_seq++;
1878 	tdp->i_diroff = 0;
1879 	ITIMES_NOLOCK(tdp);
1880 	/*
1881 	 * If the directory grew then dirprepareentry() will have
1882 	 * set IATTCHG in tdp->i_flag, then the directory inode must
1883 	 * be flushed out. This is because if fsync() is used later
1884 	 * the directory size must be correct, otherwise a crash would
1885 	 * cause fsck to move the file to lost+found. Also because later
1886 	 * a file may be linked in more than one directory, then there
1887 	 * is no way to flush the original directory. So it must be
1888 	 * flushed out on creation. See bug 4293809.
1889 	 */
1890 	if (tdp->i_flag & IATTCHG) {
1891 		ufs_iupdat(tdp, I_SYNC);
1892 	}
1893 
     	/*
     	 * A non-zero endoff below the current size marks trailing
     	 * space that can be trimmed; this is skipped when
     	 * TRANS_ISTRANS() is true for the file system.
     	 */
1894 	if (slotp->endoff && (slotp->endoff < tdp->i_size)) {
1895 		if (!TRANS_ISTRANS(tdp->i_ufsvfs)) {
1896 			(void) ufs_itrunc(tdp, (u_offset_t)slotp->endoff, 0,
1897 						cr);
1898 		}
1899 	}
1900 
1901 
1902 	return (0);
1903 
1904 bad:
     	/*
     	 * With a cached slot the directory cache is discarded and the
     	 * block is released without being written back.
     	 */
1905 	if (slotp->cached) {
1906 		dnlc_dir_purge(dcap);
1907 		fbrelse(slotp->fbp, S_OTHER);
1908 		slotp->cached = 0;
1909 		slotp->fbp = NULL;
1910 		return (err);
1911 	}
1912 
1913 	/*
1914 	 * Clear out entry prepared by dirprepareentry().
1915 	 */
1916 	slotp->ep->d_ino = 0;
1917 	slotp->ep->d_namlen = 0;
1918 
1919 	/*
1920 	 * Don't touch err so we don't clobber the real error that got us here.
1921 	 */
1922 	if (TRANS_DIR(tdp, slotp->offset))
1923 		fbrelse(slotp->fbp, S_OTHER);
1924 	else
1925 		(void) ufs_fbwrite(slotp->fbp, tdp);
1926 	slotp->fbp = NULL;
1927 	return (err);
1928 }
1929 
1930 /*
1931  * Prepare a directory slot to receive an entry.
      *
      * slotp->status must be NONE (no space was found; a fresh DIRBLKSIZ
      * block is allocated at slotp->offset) or FOUND (slotp->offset and
      * slotp->size describe an existing record with enough room).  The
      * caller holds dp's i_rwlock and i_contents as writer.
      *
      * On success slotp->fbp holds the directory block, slotp->ep points
      * at the record to fill in, slotp->offset is that record's offset and
      * slotp->size is the length of the record preceding it (0 when the
      * new record is used from its start rather than carved out behind an
      * existing entry).
      *
      * Returns 0, or the error from ufs_fault(), BMAPALLOC() or blkatoff().
1932  */
1933 static int
1934 dirprepareentry(
1935 	struct inode *dp,	/* directory we are working in */
1936 	struct ufs_slot *slotp,	/* available slot info */
1937 	struct cred *cr)
1938 {
1939 	struct direct *ep, *nep;
1940 	off_t entryend;
1941 	int err;
1942 	slotstat_t status = slotp->status;
1943 	ushort_t dsize;
1944 
1945 	ASSERT((status == NONE) || (status == FOUND));
1946 	ASSERT(RW_WRITE_HELD(&dp->i_rwlock));
1947 	ASSERT(RW_WRITE_HELD(&dp->i_contents));
1948 	/*
1949 	 * If we didn't find a slot, then indicate that the
1950 	 * new slot belongs at the end of the directory.
1951 	 * If we found a slot, then the new entry can be
1952 	 * put at slotp->offset.
1953 	 */
1954 	entryend = slotp->offset + slotp->size;
1955 	if (status == NONE) {
1956 		ASSERT((slotp->offset & (DIRBLKSIZ - 1)) == 0);
     		/* A directory block must fit within one fragment. */
1957 		if (DIRBLKSIZ > dp->i_fs->fs_fsize) {
1958 			err = ufs_fault(ITOV(dp),
1959 			    "dirprepareentry: bad fs_fsize, DIRBLKSIZ: %d"
1960 			    " > dp->i_fs->fs_fsize: %d (%s)",
1961 			    DIRBLKSIZ, dp->i_fs->fs_fsize, dp->i_fs->fs_fsmnt);
1962 			return (err);
1963 		}
1964 		/*
1965 		 * Allocate the new block.
1966 		 */
1967 		err = BMAPALLOC(dp, (u_offset_t)slotp->offset,
1968 		    (int)(blkoff(dp->i_fs, slotp->offset) + DIRBLKSIZ), cr);
1969 		if (err) {
1970 			return (err);
1971 		}
     		/*
     		 * Grow the directory to cover the new block.  IATTCHG
     		 * records that the size changed.
     		 */
1972 		dp->i_size = entryend;
1973 		TRANS_INODE(dp->i_ufsvfs, dp);
1974 		dp->i_flag |= IUPD|ICHG|IATTCHG;
1975 		dp->i_seq++;
1976 		ITIMES_NOLOCK(dp);
1977 	} else if (entryend > dp->i_size) {
1978 		/*
1979 		 * Adjust directory size, if needed. This should never
1980 		 * push the size past a new multiple of DIRBLKSIZ.
1981 		 * This is an artifact of the old (4.2BSD) way of initializing
1982 		 * directory sizes to be less than DIRBLKSIZ.
1983 		 */
1984 		dp->i_size = P2ROUNDUP_TYPED(entryend, DIRBLKSIZ, off_t);
1985 		TRANS_INODE(dp->i_ufsvfs, dp);
1986 		dp->i_flag |= IUPD|ICHG|IATTCHG;
1987 		dp->i_seq++;
1988 		ITIMES_NOLOCK(dp);
1989 	}
1990 
1991 	/*
1992 	 * Get the block containing the space for the new directory entry.
     	 * A slot that already carries a buffer keeps its existing
     	 * slotp->fbp and slotp->ep.
1993 	 */
1994 	if (slotp->fbp == NULL) {
1995 		err = blkatoff(dp, slotp->offset, (char **)&slotp->ep,
1996 		    &slotp->fbp);
1997 		if (err) {
1998 			return (err);
1999 		}
2000 	}
2001 	ep = slotp->ep;
2002 
2003 	switch (status) {
2004 	case NONE:
2005 		/*
2006 		 * No space in the directory. slotp->offset will be on a
2007 		 * directory block boundary and we will write the new entry
2008 		 * into a fresh block.
2009 		 */
2010 		ep->d_reclen = DIRBLKSIZ;
2011 		slotp->size = 0; /* length of previous entry */
2012 		break;
2013 	case FOUND:
2014 		/*
2015 		 * An entry of the required size has been found. Use it.
2016 		 */
2017 		if (ep->d_ino == 0) {
2018 			/* this is the 1st record in a block */
2019 			slotp->size = 0; /* length of previous entry */
2020 		} else {
     			/*
     			 * The record is in use: shrink it to the size
     			 * its own name needs (DIRSIZ) and hand the
     			 * remainder to a new record that starts right
     			 * behind it.
     			 */
2021 			dsize = DIRSIZ(ep);
2022 			nep = (struct direct *)((char *)ep + dsize);
2023 			nep->d_reclen = ep->d_reclen - dsize;
2024 			ep->d_reclen = dsize;
2025 			slotp->ep = nep;
2026 			slotp->offset += dsize;
2027 			slotp->size = dsize; /* length of previous entry */
2028 		}
2029 		break;
2030 	default:
2031 		break;
2032 	}
2033 	return (0);
2034 }
2035 
2036 /*
2037  * Allocate and initialize a new inode that will go into directory tdp.
2038  * This routine is called from ufs_symlink(), as well as within this file.
2039  */
/*
 * Allocate and initialize the inode for a new object that is about to be
 * entered into directory tdp.
 *
 *	tdp	directory (or attribute directory) the object is created in;
 *		asserted to be write-locked on both i_rwlock and i_contents.
 *	ipp	out: set to NULL first, then to the new inode as soon as it
 *		has been allocated.  It is NOT reset to NULL on the "fail"
 *		path, so callers must test the return value.
 *	vap	requested attributes; AT_TYPE and AT_MODE must be present.
 *	op	DE_CREATE, DE_MKDIR, DE_ATTRDIR or DE_SYMLINK.
 *	cr	credentials of the creating process.
 *
 * Returns 0 on success, otherwise the error from ufs_ialloc(),
 * ufs_fault(), ufs_dirmakedirect(), ufs_si_inherit(), or EOVERFLOW when
 * the device number or the supplied times cannot be represented.
 * The new inode's i_contents lock is taken and released inside this
 * function on every path after allocation.
 */
2040 int
2041 ufs_dirmakeinode(
2042 	struct inode *tdp,
2043 	struct inode **ipp,
2044 	struct vattr *vap,
2045 	enum de_op op,
2046 	struct cred *cr)
2047 {
2048 	struct inode *ip;
2049 	enum vtype type;
2050 	int imode;			/* mode and format as in inode */
2051 	ino_t ipref;
2052 	int err;
2053 	timestruc_t now;
2054 
2055 	ASSERT(vap != NULL);
2056 	ASSERT(op == DE_CREATE || op == DE_MKDIR || op == DE_ATTRDIR ||
2057 		op == DE_SYMLINK);
2058 	ASSERT((vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE));
2059 	ASSERT(RW_WRITE_HELD(&tdp->i_rwlock));
2060 	ASSERT(RW_WRITE_HELD(&tdp->i_contents));
2061 	/*
2062 	 * Allocate a new inode.
2063 	 */
2064 	type = vap->va_type;
2065 	if (type == VDIR) {
2066 		ipref = dirpref(tdp);
2067 	} else {
2068 		ipref = tdp->i_number;
2069 	}
	/*
	 * DE_ATTRDIR callers pass the complete on-disk mode in va_mode
	 * (ufs_xattrmkdir() builds it with IFATTRDIR already set); every
	 * other op combines the vnode type with the permission bits.
	 */
2070 	if (op == DE_ATTRDIR)
2071 		imode = vap->va_mode;
2072 	else
2073 		imode = MAKEIMODE(type, vap->va_mode);
2074 	*ipp = NULL;
2075 	err = ufs_ialloc(tdp, ipref, imode, &ip, cr);
2076 	if (err)
2077 		return (err);
2078 
2079 	/*
2080 	 * We don't need to grab vfs_dqrwlock here because it is held
2081 	 * in ufs_direnter_*() above us.
2082 	 */
2083 	ASSERT(RW_READ_HELD(&ip->i_ufsvfs->vfs_dqrwlock));
2084 	rw_enter(&ip->i_contents, RW_WRITER);
	/* A freshly allocated inode must not already carry a quota record. */
2085 	if (ip->i_dquot != NULL) {
2086 		err = ufs_fault(ITOV(ip),
2087 		    "ufs_dirmakeinode, ip->i_dquot != NULL: dquot (%s)",
2088 				    tdp->i_fs->fs_fsmnt);
2089 		rw_exit(&ip->i_contents);
2090 		return (err);
2091 	}
	/*
	 * From here on the caller can see the new inode through *ipp, even
	 * if we later leave through "fail".
	 */
2092 	*ipp = ip;
2093 	ip->i_mode = (o_mode_t)imode;
2094 	if (type == VBLK || type == VCHR) {
2095 		dev_t d = vap->va_rdev;
2096 		dev32_t dev32;
2097 
2098 		/*
2099 		 * Don't allow a special file to be created with a
2100 		 * dev_t that cannot be represented by this filesystem
2101 		 * format on disk.
2102 		 */
2103 		if (!cmpldev(&dev32, d)) {
2104 			err = EOVERFLOW;
2105 			goto fail;
2106 		}
2107 
2108 		ITOV(ip)->v_rdev = ip->i_rdev = d;
2109 
		/*
		 * i_ordev holds the compressed device number: the 32-bit
		 * form when it has bits outside the old major/minor
		 * ranges, otherwise the old-format value from cmpdev().
		 */
2110 		if (dev32 & ~((O_MAXMAJ << L_BITSMINOR32) | O_MAXMIN)) {
2111 			ip->i_ordev = dev32; /* can't use old format */
2112 		} else {
2113 			ip->i_ordev = cmpdev(d);
2114 		}
2115 	}
2116 	ITOV(ip)->v_type = type;
2117 	ufs_reset_vnode(ip->i_vnode);
2118 	if (type == VDIR) {
2119 		ip->i_nlink = 2; /* anticipating a call to dirmakedirect */
2120 	} else {
2121 		ip->i_nlink = 1;
2122 	}
2123 
	/*
	 * Ownership: attribute directories take the uid from vap, anything
	 * else is owned by the creating credential.
	 *
	 * NOTE(review): the i_gid stored for DE_ATTRDIR here is always
	 * overwritten by the else-branch of the group-id selection below
	 * (its condition requires op != DE_ATTRDIR) -- confirm whether an
	 * attribute directory was meant to keep vap->va_gid.
	 */
2124 	if (op == DE_ATTRDIR) {
2125 		ip->i_uid = vap->va_uid;
2126 		ip->i_gid = vap->va_gid;
2127 	} else
2128 		ip->i_uid = crgetuid(cr);
2129 	/*
2130 	 * To determine the group-id of the created file:
2131 	 *   1) If the gid is set in the attribute list (non-Sun & pre-4.0
2132 	 *	clients are not likely to set the gid), then use it if
2133 	 *	the process is privileged, belongs to the target group,
2134 	 *	or the group is the same as the parent directory.
2135 	 *   2) If the filesystem was not mounted with the Old-BSD-compatible
2136 	 *	GRPID option, and the directory's set-gid bit is clear,
2137 	 *	then use the process's gid.
2138 	 *   3) Otherwise, set the group-id to the gid of the parent directory.
2139 	 */
2140 	if (op != DE_ATTRDIR && (vap->va_mask & AT_GID) &&
2141 	    ((vap->va_gid == tdp->i_gid) || groupmember(vap->va_gid, cr) ||
2142 	    secpolicy_vnode_create_gid(cr) == 0)) {
2143 		/*
2144 		 * XXX - is this only the case when a 4.0 NFS client, or a
2145 		 * client derived from that code, makes a call over the wire?
2146 		 */
2147 		ip->i_gid = vap->va_gid;
2148 	} else
2149 		ip->i_gid = (tdp->i_mode & ISGID) ? tdp->i_gid : crgetgid(cr);
2150 
2151 	/*
2152 	 * For SunOS 5.0->5.4, the lines below read:
2153 	 *
2154 	 * ip->i_suid = (ip->i_uid > MAXUID) ? UID_LONG : ip->i_uid;
2155 	 * ip->i_sgid = (ip->i_gid > MAXUID) ? GID_LONG : ip->i_gid;
2156 	 *
2157 	 * where MAXUID was set to 60002.  See notes on this in ufs_inode.c
2158 	 */
2159 	ip->i_suid = (ulong_t)ip->i_uid > (ulong_t)USHRT_MAX ?
2160 		UID_LONG : ip->i_uid;
2161 	ip->i_sgid = (ulong_t)ip->i_gid > (ulong_t)USHRT_MAX ?
2162 		GID_LONG : ip->i_gid;
2163 
2164 	/*
2165 	 * If we're creating a directory, and the parent directory has the
2166 	 * set-GID bit set, set it on the new directory.
2167 	 * Otherwise, if the user is neither privileged nor a member of the
2168 	 * file's new group, clear the file's set-GID bit.
2169 	 */
2170 	if ((tdp->i_mode & ISGID) && (type == VDIR))
2171 		ip->i_mode |= ISGID;
2172 	else {
2173 		if ((ip->i_mode & ISGID) &&
2174 		    secpolicy_vnode_setids_setgids(cr, ip->i_gid) != 0)
2175 			ip->i_mode &= ~ISGID;
2176 	}
2177 
	/* Reject caller-supplied times that trip TIMESPEC_OVERFLOW(). */
2178 	if (((vap->va_mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) ||
2179 	    ((vap->va_mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) {
2180 		err = EOVERFLOW;
2181 		goto fail;
2182 	}
2183 
2184 	/*
2185 	 * Extended attribute directories are not subject to quotas.
2186 	 */
2187 	if (op != DE_ATTRDIR)
2188 		ip->i_dquot = getinoquota(ip);
2189 	else
2190 		ip->i_dquot = NULL;
2191 
	/*
	 * Directories get their initial "." / ".." block now; the third
	 * argument tells ufs_dirmakedirect() whether this is an attribute
	 * directory (1) or an ordinary one (0).
	 */
2192 	if (op == DE_MKDIR || op == DE_ATTRDIR) {
2193 		err = ufs_dirmakedirect(ip, tdp, (op == DE_MKDIR) ? 0 : 1, cr);
2194 		if (err)
2195 			goto fail;
2196 	}
2197 
2198 	/*
2199 	 * generate the shadow inode and attach it to the new object
2200 	 */
2201 	ASSERT((tdp->i_shadow && tdp->i_ufs_acl) ||
2202 	    (!tdp->i_shadow && !tdp->i_ufs_acl));
2203 	if (tdp->i_shadow && tdp->i_ufs_acl &&
2204 	    (((tdp->i_mode & IFMT) == IFDIR) ||
2205 	    ((tdp->i_mode & IFMT) == IFATTRDIR))) {
2206 		err = ufs_si_inherit(ip, tdp, ip->i_mode, cr);
2207 		if (err) {
			/*
			 * For DE_MKDIR, ufs_dirmakedirect() has already
			 * bumped the parent's link count for ".."; take
			 * that back before discarding the new inode.
			 */
2208 			if (op == DE_MKDIR) {
2209 				/*
2210 				 * clean up parent directory
2211 				 *
2212 				 * tdp->i_contents already locked from
2213 				 * ufs_direnter_*()
2214 				 */
2215 				tdp->i_nlink--;
2216 				TRANS_INODE(tdp->i_ufsvfs, tdp);
2217 				tdp->i_flag |= ICHG;
2218 				tdp->i_seq++;
2219 				ufs_iupdat(tdp, I_SYNC);
2220 			}
2221 			goto fail;
2222 		}
2223 	}
2224 
2225 	/*
2226 	 * If the passed in attributes contain atime and/or mtime
2227 	 * settings, then use them instead of using the current
2228 	 * high resolution time.
2229 	 */
	/*
	 * Each time that is NOT supplied has its IACC / IUPD|ICHG flag set
	 * instead (presumably so it is stamped with the current time when
	 * the inode times are next updated -- confirm).
	 */
2230 	if (vap->va_mask & (AT_MTIME|AT_ATIME)) {
2231 		if (vap->va_mask & AT_ATIME) {
2232 			ip->i_atime.tv_sec = vap->va_atime.tv_sec;
2233 			ip->i_atime.tv_usec = vap->va_atime.tv_nsec / 1000;
2234 			ip->i_flag &= ~IACC;
2235 		} else
2236 			ip->i_flag |= IACC;
2237 		if (vap->va_mask & AT_MTIME) {
2238 			ip->i_mtime.tv_sec = vap->va_mtime.tv_sec;
2239 			ip->i_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000;
			/* ctime is always "now", clamped to TIME32_MAX */
2240 			gethrestime(&now);
2241 			if (now.tv_sec > TIME32_MAX) {
2242 				/*
2243 				 * In 2038, ctime sticks forever..
2244 				 */
2245 				ip->i_ctime.tv_sec = TIME32_MAX;
2246 				ip->i_ctime.tv_usec = 0;
2247 			} else {
2248 				ip->i_ctime.tv_sec = now.tv_sec;
2249 				ip->i_ctime.tv_usec = now.tv_nsec / 1000;
2250 			}
2251 			ip->i_flag &= ~(IUPD|ICHG);
2252 			ip->i_flag |= IMODTIME;
2253 		} else
2254 			ip->i_flag |= IUPD|ICHG;
2255 		ip->i_flag |= IMOD;
2256 	} else
2257 		ip->i_flag |= IACC|IUPD|ICHG;
2258 	ip->i_seq++;
2259 
2260 	/*
2261 	 * If this is an attribute tag it as one.
2262 	 */
2263 	if ((tdp->i_mode & IFMT) == IFATTRDIR) {
2264 		ip->i_cflags |= IXATTR;
2265 	}
2266 
2267 	/*
2268 	 * push inode before it's name appears in a directory
2269 	 */
2270 	TRANS_INODE(ip->i_ufsvfs, ip);
2271 	ufs_iupdat(ip, I_SYNC);
2272 	rw_exit(&ip->i_contents);
2273 	return (0);
2274 
2275 fail:
2276 	/* Throw away inode we just allocated. */
	/*
	 * Zero the link count and call ufs_setreclaim().  The inode itself
	 * is not released here and *ipp still points at it.
	 */
2277 	ip->i_nlink = 0;
2278 	ufs_setreclaim(ip);
2279 	TRANS_INODE(ip->i_ufsvfs, ip);
2280 	ip->i_flag |= ICHG;
2281 	ip->i_seq++;
2282 	ITIMES_NOLOCK(ip);
2283 	rw_exit(&ip->i_contents);
2284 	return (err);
2285 }
2286 
2287 /*
2288  * Write a prototype directory into the empty inode ip, whose parent is dp.
2289  */
/*
 *	ip	new, still empty directory inode; i_contents asserted
 *		write-held.
 *	dp	parent; i_rwlock and i_contents asserted write-held.
 *	attrdir	non-zero when ip is an attribute directory; in that case
 *		the parent's link count is left alone.
 *	cr	credentials handed to BMAPALLOC().
 *
 * Returns 0 on success.  Failures: the BMAPALLOC() error, the result of
 * ufs_fault() when DIRBLKSIZ exceeds the fragment size, EMLINK when the
 * parent already has MAXLINK links, or the error from fbread(),
 * TRANS_DIR() or ufs_fbwrite().  The three early returns leave the
 * space allocated to ip in place; cleaning up ip is left to the caller
 * (ufs_dirmakeinode() discards the inode on error).
 */
2290 static int
2291 ufs_dirmakedirect(
2292 	struct inode *ip,		/* new directory */
2293 	struct inode *dp,		/* parent directory */
2294 	int	attrdir,
2295 	struct cred *cr)
2296 {
2297 	struct dirtemplate *dirp;
2298 	struct fbuf *fbp;
2299 	int err;
2300 
2301 	ASSERT(RW_WRITE_HELD(&ip->i_contents));
2302 	ASSERT(RW_WRITE_HELD(&dp->i_rwlock));
2303 	ASSERT(RW_WRITE_HELD(&dp->i_contents));
2304 	/*
2305 	 * Allocate space for the directory we're creating.
2306 	 */
2307 	err = BMAPALLOC(ip, (u_offset_t)0, DIRBLKSIZ, cr);
2308 	if (err)
2309 		return (err);
	/* A directory block must fit inside one fragment. */
2310 	if (DIRBLKSIZ > dp->i_fs->fs_fsize) {
2311 		err = ufs_fault(ITOV(dp),
2312 "ufs_dirmakedirect: bad fs_fsize, DIRBLKSIZ: %d > dp->i_fs->fs_fsize: %d (%s)",
2313 					DIRBLKSIZ, dp->i_fs->fs_fsize,
2314 					dp->i_fs->fs_fsmnt);
2315 		return (err);
2316 	}
2317 	ip->i_size = DIRBLKSIZ;
2318 	TRANS_INODE(ip->i_ufsvfs, ip);
2319 	ip->i_flag |= IUPD|ICHG|IATTCHG;
2320 	ip->i_seq++;
2321 	ITIMES_NOLOCK(ip);
2322 	/*
2323 	 * Update the tdp link count and write out the change.
2324 	 * This reflects the ".." entry we'll soon write.
2325 	 */
	/*
	 * NOTE(review): the MAXLINK test is applied even when attrdir is
	 * non-zero, although the link count is only incremented for
	 * attrdir == 0 -- confirm this is intended.
	 */
2326 	if (dp->i_nlink == MAXLINK)
2327 		return (EMLINK);
2328 	if (attrdir == 0)
2329 		dp->i_nlink++;
2330 	TRANS_INODE(dp->i_ufsvfs, dp);
2331 	dp->i_flag |= ICHG;
2332 	dp->i_seq++;
2333 	ufs_iupdat(dp, I_SYNC);
2334 	/*
2335 	 * Initialize directory with "."
2336 	 * and ".." from static template.
2337 	 *
2338 	 * Since the parent directory is locked, we don't have to
2339 	 * worry about anything changing when we drop the write
2340 	 * lock on (ip).
2341 	 *
2342 	 */
2343 	err = fbread(ITOV(ip), (offset_t)0, (uint_t)ip->i_fs->fs_fsize,
2344 	    S_READ, &fbp);
2345 
2346 	if (err) {
2347 		goto fail;
2348 	}
2349 	dirp = (struct dirtemplate *)fbp->fb_addr;
2350 	/*
2351 	 * Now initialize the directory we're creating
2352 	 * with the "." and ".." entries.
2353 	 */
2354 	*dirp = mastertemplate;			/* structure assignment */
2355 	dirp->dot_ino = (uint32_t)ip->i_number;
2356 	dirp->dotdot_ino = (uint32_t)dp->i_number;
2357 
	/* Log the directory change; the mapping is ours to release on error. */
2358 	err = TRANS_DIR(ip, 0);
2359 	if (err) {
2360 		fbrelse(fbp, S_OTHER);
2361 		goto fail;
2362 	}
2363 
	/*
	 * No fbrelse() after a failed ufs_fbwrite(): presumably it disposes
	 * of fbp itself on every path -- confirm against ufs_fbwrite().
	 */
2364 	err = ufs_fbwrite(fbp, ip);
2365 	if (err) {
2366 		goto fail;
2367 	}
2368 
2369 	return (0);
2370 
2371 fail:
	/* Undo the ".." link-count bump made above and push the parent. */
2372 	if (attrdir == 0)
2373 		dp->i_nlink--;
2374 	TRANS_INODE(dp->i_ufsvfs, dp);
2375 	dp->i_flag |= ICHG;
2376 	dp->i_seq++;
2377 	ufs_iupdat(dp, I_SYNC);
2378 	return (err);
2379 }
2380 
2381 /*
2382  * Delete a directory entry.  If oip is nonzero the entry is checked
2383  * to make sure it still reflects oip.
2384  *
2385  * If vpp is non-null, return the ptr of the (held) vnode associated with
2386  * the removed name.  The caller is responsible for doing the VN_RELE().
2387  */
/*
 *	dp	directory to remove the name from; i_rwlock asserted
 *		write-held on entry.  The lock may be dropped and taken
 *		again internally (see the retry path) but is held on return.
 *	namep	name to remove; "." yields EINVAL and ".." yields EEXIST.
 *	oip	if non-NULL, the inode the name is expected to refer to;
 *		a mismatch yields ENOENT.
 *	cdir	vnode compared against the victim for DR_RMDIR (EINVAL
 *		when equal).
 *	op	DR_RMDIR, DR_REMOVE or DR_RENAME; selects the extra checks
 *		and how link counts are adjusted.
 *	cr	credentials used for access and policy checks.
 *	vpp	optional out parameter, see above; only written on success.
 *
 * Returns 0 or an errno value (among them ENOTDIR, ENOENT, EBUSY, EPERM,
 * EEXIST, EINVAL and whatever the lookup, access check or block write
 * report).  vfs_dqrwlock (reader) and dp->i_contents (writer) are taken
 * and released inside this function.
 */
2388 int
2389 ufs_dirremove(
2390 	struct inode *dp,
2391 	char *namep,
2392 	struct inode *oip,
2393 	struct vnode *cdir,
2394 	enum dr_op op,
2395 	struct cred *cr,
2396 	vnode_t **vpp)	/* Return (held) vnode ptr of removed file/dir */
2397 {
2398 	struct direct *ep, *pep, *nep;
2399 	struct inode *ip;
2400 	vnode_t *dvp, *vp;
2401 	struct ufs_slot slot;
2402 	int namlen;
2403 	int err;
2404 	int mode;
2405 	ushort_t extra;
2406 
2407 	namlen = (int)strlen(namep);
2408 	if (namlen == 0)
2409 		return (ufs_fault(ITOV(dp), "ufs_dirremove: namlen == 0"));
2410 	/*
2411 	 * return error when removing . and ..
2412 	 */
2413 	if (namep[0] == '.') {
2414 		if (namlen == 1)
2415 			return (EINVAL);
2416 		else if (namlen == 2 && namep[1] == '.') {
2417 			return (EEXIST);	/* SIGH should be ENOTEMPTY */
2418 		}
2419 	}
2420 
2421 	ASSERT(RW_WRITE_HELD(&dp->i_rwlock));
2422 	/*
2423 	 * Check accessibility of directory.
2424 	 */
	/*
	 * "retry" re-runs every check from here on, because dp->i_rwlock
	 * was dropped while backing off.
	 */
2425 retry:
2426 	if (((dp->i_mode & IFMT) != IFDIR) &&
2427 	    ((dp->i_mode & IFMT) != IFATTRDIR)) {
2428 		return (ENOTDIR);
2429 	}
2430 
2431 	/*
2432 	 * Execute access is required to search the directory.
2433 	 * Access for write is interpreted as allowing
2434 	 * deletion of files in the directory.
2435 	 */
2436 	if (err = ufs_iaccess(dp, IEXEC|IWRITE, cr)) {
2437 		return (err);
2438 	}
2439 
	/*
	 * Look the name up with vfs_dqrwlock (reader) and dp->i_contents
	 * (writer) held; both are released at out_novfs.  On a hit, ip is
	 * the victim and slot describes where its entry lives.
	 */
2440 	ip = NULL;
2441 	slot.fbp = NULL;
2442 	slot.status = FOUND;	/* don't need to look for empty slot */
2443 	rw_enter(&dp->i_ufsvfs->vfs_dqrwlock, RW_READER);
2444 	rw_enter(&dp->i_contents, RW_WRITER);
2445 	err = ufs_dircheckforname(dp, namep, namlen, &slot, &ip, cr, 0);
2446 	if (err)
2447 		goto out_novfs;
2448 	if (ip == NULL) {
2449 		err = ENOENT;
2450 		goto out_novfs;
2451 	}
2452 	vp = ITOV(ip);
2453 	if (oip && oip != ip) {
2454 		err = ENOENT;
2455 		goto out_novfs;
2456 	}
2457 
	/*
	 * "mode" is latched here and used for all later "is the victim a
	 * directory" decisions, including which locks to drop.
	 */
2458 	mode = ip->i_mode & IFMT;
2459 	if (mode == IFDIR || mode == IFATTRDIR) {
2460 
2461 		/*
2462 		 * vn_vfsrlock() prevents races between mount and rmdir.
2463 		 */
2464 		if (vn_vfsrlock(vp)) {
2465 			err = EBUSY;
2466 			goto out_novfs;
2467 		}
2468 		if (vn_mountedvfs(vp) != NULL && op != DR_RENAME) {
2469 			err = EBUSY;
2470 			goto out;
2471 		}
2472 		/*
2473 		 * If we are removing a directory, get a lock on it.
2474 		 * Taking a writer lock prevents a parallel ufs_dirlook from
2475 		 * incorrectly entering a negative cache vnode entry in the dnlc
2476 		 * If the directory is empty, it will stay empty until
2477 		 * we can remove it.
2478 		 */
2479 		if (!rw_tryenter(&ip->i_rwlock, RW_WRITER)) {
2480 			/*
2481 			 * It is possible that a thread in rename would have
2482 			 * acquired this rwlock. To prevent a deadlock we
2483 			 * do a rw_tryenter. If we fail to get the lock
2484 			 * we drop all the locks we have acquired, wait
2485 			 * for 2 ticks and reacquire the
2486 			 * directory's (dp) i_rwlock and try again.
2487 			 * If we dont drop dp's i_rwlock then we will panic
2488 			 * with a "Deadlock: cycle in blocking chain"
2489 			 * since in ufs_dircheckpath we want dp's i_rwlock.
2490 			 * dp is guaranteed to exist since ufs_dirremove is
2491 			 * called after a VN_HOLD(dp) has been done.
2492 			 */
2493 			ufs_dirremove_retry_cnt++;
2494 			vn_vfsunlock(vp);
2495 			if (slot.fbp)
2496 				fbrelse(slot.fbp, S_OTHER);
2497 			rw_exit(&dp->i_contents);
2498 			rw_exit(&dp->i_ufsvfs->vfs_dqrwlock);
2499 			rw_exit(&dp->i_rwlock);
			/*
			 * Give back the reference on the victim too; the
			 * lookup is repeated after the retry.
			 */
2500 			VN_RELE(vp);
2501 			delay(2);
2502 			rw_enter(&dp->i_rwlock, RW_WRITER);
2503 			goto retry;
2504 		}
2505 	}
	/*
	 * Reader lock on the victim's contents covers the permission and
	 * emptiness checks below; it is dropped before the entry is edited.
	 */
2506 	rw_enter(&ip->i_contents, RW_READER);
2507 
2508 	/*
2509 	 * Now check the restrictions that apply on sticky directories.
2510 	 */
2511 	if ((err = ufs_sticky_remove_access(dp, ip, cr)) != 0) {
2512 		rw_exit(&ip->i_contents);
2513 		if (mode == IFDIR || mode == IFATTRDIR)
2514 			rw_exit(&ip->i_rwlock);
2515 		goto out;
2516 	}
2517 
2518 	if (op == DR_RMDIR) {
2519 		/*
2520 		 * For rmdir(2), some special checks are required.
2521 		 * (a) Don't remove any alias of the parent (e.g. ".").
2522 		 * (b) Don't remove the current directory.
2523 		 * (c) Make sure the entry is (still) a directory.
2524 		 * (d) Make sure the directory is empty.
2525 		 */
2526 
2527 		if (dp == ip || vp == cdir)
2528 			err = EINVAL;
2529 		else if (((ip->i_mode & IFMT) != IFDIR) &&
2530 		    ((ip->i_mode & IFMT) != IFATTRDIR))
2531 			err = ENOTDIR;
2532 		else if ((ip->i_nlink > 2) ||
2533 		    !ufs_dirempty(ip, dp->i_number, cr)) {
2534 			err = EEXIST;	/* SIGH should be ENOTEMPTY */
2535 		}
2536 
2537 		if (err) {
2538 			rw_exit(&ip->i_contents);
2539 			if (mode == IFDIR || mode == IFATTRDIR)
2540 				rw_exit(&ip->i_rwlock);
2541 			goto out;
2542 		}
2543 	} else if (op == DR_REMOVE)  {
2544 		/*
2545 		 * unlink(2) requires a different check: allow only
2546 		 * privileged users to unlink a directory.
2547 		 */
		/*
		 * NOTE(review): i_rwlock is released unconditionally here.
		 * That relies on v_type == VDIR implying mode was IFDIR or
		 * IFATTRDIR, i.e. that the lock was taken above -- confirm.
		 */
2548 		if (vp->v_type == VDIR &&
2549 		    secpolicy_fs_linkdir(cr, vp->v_vfsp)) {
2550 			err = EPERM;
2551 			rw_exit(&ip->i_contents);
2552 			rw_exit(&ip->i_rwlock);
2553 			goto out;
2554 		}
2555 	}
2556 
2557 	rw_exit(&ip->i_contents);
2558 
2559 	/*
2560 	 * Remove the cache'd entry, if any.
2561 	 */
2562 	dvp = ITOV(dp);
2563 	dnlc_remove(dvp, namep);
	/* Zeroing d_ino turns the on-disk record into a free entry. */
2564 	ep = slot.ep;
2565 	ep->d_ino = 0;
2566 
	/*
	 * When the directory cache is in use (slot.cached), mirror the
	 * removal and the free-space merge into it; any inconsistency makes
	 * us purge the cache rather than try to repair it.
	 */
2567 	if (slot.cached) {
2568 		dcanchor_t *dcap = &dp->i_danchor;
2569 
2570 		(void) dnlc_dir_rem_entry(dcap, namep, NULL);
2571 		if (((int)ep->d_reclen - (int)DIRSIZ(ep)) >= LDIRSIZ(1)) {
2572 			(void) dnlc_dir_rem_space_by_handle(dcap, slot.offset);
2573 		}
2574 		if (slot.offset & (DIRBLKSIZ - 1)) {
2575 			/*
2576 			 * Collapse new free space into previous entry.
2577 			 * Note, the previous entry has already been
2578 			 * validated in ufs_dircheckforname().
2579 			 */
2580 			ASSERT(slot.size);
2581 			pep = (struct direct *)((char *)ep - slot.size);
			/*
			 * NOTE(review): this bail-out jumps past the
			 * "pep->d_reclen += ep->d_reclen" merge, so the
			 * removed record stays behind as a separate free
			 * (d_ino == 0) entry -- confirm that is intended.
			 */
2582 			if ((pep->d_ino == 0) &&
2583 			    ((uintptr_t)pep & (DIRBLKSIZ - 1))) {
2584 				dnlc_dir_purge(dcap);
2585 				slot.cached = 0;
2586 				goto nocache;
2587 			}
			/* "extra" is the slack pep had before the merge */
2588 			if (pep->d_ino) {
2589 				extra = pep->d_reclen - DIRSIZ(pep);
2590 			} else {
2591 				extra = pep->d_reclen;
2592 			}
2593 			if (extra >= LDIRSIZ(1)) {
2594 				(void) dnlc_dir_rem_space_by_handle(dcap,
2595 				    (uint64_t)(slot.offset - slot.size));
2596 			}
2597 			pep->d_reclen += ep->d_reclen;
2598 			(void) dnlc_dir_add_space(dcap, extra + ep->d_reclen,
2599 				(uint64_t)(slot.offset - slot.size));
2600 			/* adjust the previous pointer in the next entry */
2601 			nep = (struct direct *)((char *)ep + ep->d_reclen);
2602 			if ((uintptr_t)nep & (DIRBLKSIZ - 1)) {
2603 				/*
2604 				 * Not a new block.
2605 				 *
2606 				 * Check the validity of the entry.
2607 				 * If it's bad, then throw away the cache and
2608 				 * continue.
2609 				 */
2610 				if ((nep->d_reclen == 0) ||
2611 				    (nep->d_reclen & 0x3) ||
2612 				    (dnlc_dir_update(dcap, nep->d_name,
2613 				    INO_OFF_TO_H(nep->d_ino,
2614 				    slot.offset - slot.size)) == DNOENT)) {
2615 					dnlc_dir_purge(dcap);
2616 					slot.cached = 0;
2617 				}
2618 			}
2619 		} else {
			/* first record of a block: it simply becomes free */
2620 			(void) dnlc_dir_add_space(dcap, ep->d_reclen,
2621 			(uint64_t)slot.offset);
2622 		}
2623 	} else {
2624 		/*
2625 		 * If the entry isn't the first in the directory, we must
2626 		 * reclaim the space of the now empty record by adding
2627 		 * the record size to the size of the previous entry.
2628 		 */
2629 		if (slot.offset & (DIRBLKSIZ - 1)) {
2630 			/*
2631 			 * Collapse new free space into previous entry.
2632 			 */
2633 			pep = (struct direct *)((char *)ep - slot.size);
2634 			pep->d_reclen += ep->d_reclen;
2635 		}
2636 	}
2637 nocache:
2638 
2639 
	/*
	 * Log and write back the modified directory block.  slot.fbp is
	 * consumed on both branches (released here or handed to
	 * ufs_fbwrite()), so it is cleared to keep out_novfs from
	 * releasing it a second time.
	 */
2640 	err = TRANS_DIR(dp, slot.offset);
2641 	if (err)
2642 		fbrelse(slot.fbp, S_OTHER);
2643 	else
2644 		err = ufs_fbwrite(slot.fbp, dp);
2645 	slot.fbp = NULL;
2646 
2647 	/*
2648 	 * If we were removing a directory, it is 'gone' now, but we cannot
2649 	 * unlock it as a thread may be waiting for the lock in ufs_create. If
2650 	 * we did, it could then create a file in a deleted directory.
2651 	 */
2652 
2653 	if (err) {
2654 		if (mode == IFDIR || mode == IFATTRDIR)
2655 			rw_exit(&ip->i_rwlock);
2656 		goto out;
2657 	}
2658 
2659 	rw_enter(&ip->i_contents, RW_WRITER);
2660 
2661 	dp->i_flag |= IUPD|ICHG;
2662 	dp->i_seq++;
2663 	ip->i_flag |= ICHG;
2664 	ip->i_seq++;
2665 
2666 	TRANS_INODE(dp->i_ufsvfs, dp);
2667 	TRANS_INODE(ip->i_ufsvfs, ip);
2668 	/*
2669 	 * Now dispose of the inode.
2670 	 */
2671 	if (ip->i_nlink > 0) {
2672 		/*
2673 		 * This is not done for IFATTRDIR's because they don't
2674 		 * have entries in the dnlc and the link counts are
2675 		 * not incremented when they are created.
2676 		 */
2677 		if (op == DR_RMDIR && (ip->i_mode & IFMT) == IFDIR) {
2678 			/*
2679 			 * Decrement by 2 because we're trashing the "."
2680 			 * entry as well as removing the entry in dp.
2681 			 * Clear the directory entry, but there may be
2682 			 * other hard links so don't free the inode.
2683 			 * Decrement the dp linkcount because we're
2684 			 * trashing the ".." entry.
2685 			 */
2686 			ip->i_nlink -= 2;
2687 			dp->i_nlink--;
2688 			ufs_setreclaim(dp);
2689 			/*
2690 			 * XXX need to discard negative cache entries
2691 			 * for vp.  See comment in ufs_delete().
2692 			 */
2693 			dnlc_remove(vp, ".");
2694 			dnlc_remove(vp, "..");
2695 			/*
2696 			 * The return value is ignored here bacause if
2697 			 * the directory purge fails we don't want to
2698 			 * stop the delete. If ufs_dirpurgedotdot fails
2699 			 * the delete will continue with the preexiting
2700 			 * behavior.
2701 			 */
2702 			(void) ufs_dirpurgedotdot(ip, dp->i_number, cr);
2703 		} else {
2704 			ip->i_nlink--;
2705 		}
2706 		ufs_setreclaim(ip);
2707 	}
2708 	ITIMES_NOLOCK(dp);
2709 	ITIMES_NOLOCK(ip);
2710 
	/* The synchronous inode pushes are skipped when TRANS_ISTRANS() is true. */
2711 	if (!TRANS_ISTRANS(dp->i_ufsvfs))
2712 		ufs_iupdat(dp, I_SYNC);
2713 	if (!TRANS_ISTRANS(ip->i_ufsvfs))
2714 		ufs_iupdat(ip, I_SYNC);
2715 
2716 	rw_exit(&ip->i_contents);
2717 	if (mode == IFDIR || mode == IFATTRDIR)
2718 		rw_exit(&ip->i_rwlock);
	/*
	 * out:       reached with the vn_vfsrlock() taken iff the victim is
	 *            a directory; ip->i_rwlock has already been dropped.
	 * out_novfs: reached with only dp's locks (and possibly slot.fbp)
	 *            still to be released.
	 */
2719 out:
2720 	if (mode == IFDIR || mode == IFATTRDIR) {
2721 		vn_vfsunlock(vp);
2722 	}
2723 out_novfs:
2724 	ASSERT(RW_WRITE_HELD(&dp->i_contents));
2725 
2726 	if (slot.fbp)
2727 		fbrelse(slot.fbp, S_OTHER);
2728 
2729 	rw_exit(&dp->i_contents);
2730 	rw_exit(&dp->i_ufsvfs->vfs_dqrwlock);
2731 
2732 	/*
2733 	 * If no error and vpp is non-NULL, return the vnode ptr to the caller.
2734 	 * The caller becomes responsible for the VN_RELE().  Otherwise,
2735 	 * Release (and delete) the inode after we drop vfs_dqrwlock to
2736 	 * avoid deadlock since ufs_delete() grabs vfs_dqrwlock as reader.
2737 	 */
2738 	if (ip) {
2739 		if ((err == 0) && (vpp != NULL)) {
2740 			*vpp = ITOV(ip);
2741 		} else {
2742 			VN_RELE(vp);
2743 		}
2744 	}
2745 
2746 	return (err);
2747 }
2748 
2749 /*
2750  * Return buffer with contents of block "offset"
2751  * from the beginning of directory "ip".  If "res"
2752  * is non-zero, fill it in with a pointer to the
2753  * remaining space in the directory.
2754  *
2755  */
2756 
2757 int
2758 blkatoff(
2759 	struct inode *ip,
2760 	off_t offset,
2761 	char **res,
2762 	struct fbuf **fbpp)
2763 {
2764 	struct fs *fs;
2765 	struct fbuf *fbp;
2766 	daddr_t lbn;
2767 	uint_t bsize;
2768 	int err;
2769 
2770 	CPU_STATS_ADD_K(sys, ufsdirblk, 1);
2771 	fs = ip->i_fs;
2772 	lbn = (daddr_t)lblkno(fs, offset);
2773 	bsize = (uint_t)blksize(fs, ip, lbn);
2774 	err = fbread(ITOV(ip), (offset_t)(offset & fs->fs_bmask),
2775 			bsize, S_READ, &fbp);
2776 	if (err) {
2777 		*fbpp = (struct fbuf *)NULL;
2778 		return (err);
2779 	}
2780 	if (res)
2781 		*res = fbp->fb_addr + blkoff(fs, offset);
2782 	*fbpp = fbp;
2783 	return (0);
2784 }
2785 
2786 /*
2787  * Do consistency checking:
2788  *	record length must be multiple of 4
2789  *	entry must fit in rest of its DIRBLKSIZ block
2790  *	record must be large enough to contain entry
2791  *	name is not longer than MAXNAMLEN
2792  *	name must be as long as advertised, and null terminated
2793  * NOTE: record length must not be zero (should be checked previously).
2794  *       This routine is only called if dirchk is true.
2795  *       It would be nice to set the FSBAD flag in the super-block when
2796  *       this routine fails so that a fsck is forced on next reboot,
2797  *       but locking is a problem.
2798  */
2799 static int
2800 dirmangled(
2801 	struct inode *dp,
2802 	struct direct *ep,
2803 	int entryoffsetinblock,
2804 	off_t offset)
2805 {
2806 	int i;
2807 
2808 	i = DIRBLKSIZ - (entryoffsetinblock & (DIRBLKSIZ - 1));
2809 	if ((ep->d_reclen & 0x3) != 0 || (int)ep->d_reclen > i ||
2810 	    (uint_t)ep->d_reclen < DIRSIZ(ep) || ep->d_namlen > MAXNAMLEN ||
2811 	    ep->d_ino && dirbadname(ep->d_name, (int)ep->d_namlen)) {
2812 		dirbad(dp, "mangled entry", offset);
2813 		return (1);
2814 	}
2815 	return (0);
2816 }
2817 
2818 static void
2819 dirbad(struct inode *ip, char *how, off_t offset)
2820 {
2821 	cmn_err(CE_NOTE, "%s: bad dir ino %d at offset %ld: %s",
2822 	    ip->i_fs->fs_fsmnt, (int)ip->i_number, offset, how);
2823 }
2824 
/*
 * Validate a directory entry name of advertised length "l".
 *
 * Returns 1 if a NUL byte occurs within the first "l" characters.
 * Otherwise returns the character found at position "l", which is 0
 * exactly when the name is properly NUL-terminated.  A non-zero result
 * therefore always means "bad name".
 */
static int
dirbadname(char *sp, int l)
{
	int idx;

	/* an embedded NUL means the name is shorter than advertised */
	for (idx = 0; idx != l; idx++) {
		if (sp[idx] == '\0')
			return (1);
	}

	/* the byte right after the name must be the terminator */
	return (sp[l]);
}
2835 
2836 /*
2837  * Check if a directory is empty or not.
2838  */
2839 static int
2840 ufs_dirempty(
2841 	struct inode *ip,
2842 	ino_t parentino,
2843 	struct cred *cr)
2844 {
2845 	return (ufs_dirscan(ip, parentino, cr, 0));
2846 }
2847 
2848 /*
2849  * clear the .. directory entry.
2850  */
2851 static int
2852 ufs_dirpurgedotdot(
2853 	struct inode *ip,
2854 	ino_t parentino,
2855 	struct cred *cr)
2856 {
2857 	return (ufs_dirscan(ip, parentino, cr, 1));
2858 }
2859 
2860 /*
2861  * Scan the directoy. If clr_dotdot is true clear the ..
2862  * directory else check to see if the directory is empty.
2863  *
2864  * Using a struct dirtemplate here is not precisely
2865  * what we want, but better than using a struct direct.
2866  *
2867  * clr_dotdot is used as a flag to tell us if we need
2868  * to clear the dotdot entry
2869  *
2870  * N.B.: does not handle corrupted directories.
2871  */
2872 static int
2873 ufs_dirscan(
2874 	struct inode *ip,
2875 	ino_t parentino,
2876 	struct cred *cr,
2877 	int clr_dotdot)
2878 {
2879 	offset_t off;
2880 	struct dirtemplate dbuf;
2881 	struct direct *dp = (struct direct *)&dbuf;
2882 	int err, count;
2883 	int empty = 1;	/* Assume it's empty */
2884 #define	MINDIRSIZ (sizeof (struct dirtemplate) / 2)
2885 
2886 	ASSERT(RW_LOCK_HELD(&ip->i_contents));
2887 
2888 	ASSERT(ip->i_size <= (offset_t)MAXOFF_T);
2889 	for (off = 0; off < ip->i_size; off += dp->d_reclen) {
2890 		err = ufs_rdwri(UIO_READ, FREAD, ip, (caddr_t)dp,
2891 		    (ssize_t)MINDIRSIZ, off, UIO_SYSSPACE, &count, cr);
2892 		/*
2893 		 * Since we read MINDIRSIZ, residual must
2894 		 * be 0 unless we're at end of file.
2895 		 */
2896 		if (err || count != 0 || dp->d_reclen == 0) {
2897 			empty = 0;
2898 			break;
2899 		}
2900 		/* skip empty entries */
2901 		if (dp->d_ino == 0)
2902 			continue;
2903 		/* accept only "." and ".." */
2904 		if (dp->d_namlen > 2 || dp->d_name[0] != '.') {
2905 			empty = 0;
2906 			break;
2907 		}
2908 		/*
2909 		 * At this point d_namlen must be 1 or 2.
2910 		 * 1 implies ".", 2 implies ".." if second
2911 		 * char is also "."
2912 		 */
2913 		if (dp->d_namlen == 1)
2914 			continue;
2915 		if (dp->d_name[1] == '.' &&
2916 		    (ino_t)dp->d_ino == parentino) {
2917 			/*
2918 			 * If we're doing a purge we need to check for
2919 			 * the . and .. entries and clear the d_ino for ..
2920 			 *
2921 			 * if clr_dotdot is set ufs_dirscan does not
2922 			 * check for an empty directory.
2923 			 */
2924 			if (clr_dotdot) {
2925 				/*
2926 				 * Have to actually zap the ..
2927 				 * entry in the directory, as
2928 				 * otherwise someone might have
2929 				 * dp as its cwd and try to
2930 				 * open .., which now points to
2931 				 * an unallocated inode.
2932 				 */
2933 				empty = ufs_dirclrdotdot(ip, parentino);
2934 				break;
2935 			} else {
2936 				continue;
2937 			}
2938 		}
2939 		empty = 0;
2940 		break;
2941 	}
2942 	return (empty);
2943 }
2944 
/*
 * Tunables/counters for the i_rwlock backoff loop in ufs_dircheckpath():
 * retry_backoff_delay is the argument handed to delay() before each new
 * rw_tryenter() attempt, and dircheck_retry_cnt counts how many times
 * that backoff has been taken.
 */
2945 clock_t retry_backoff_delay = 1; /* delay before retrying the i_rwlock */
2946 uint64_t dircheck_retry_cnt;
2947 /*
2948  * Check if source directory inode is in the path of the target directory.
2949  * Target is supplied locked.
2950  *
2951  * The source and target inode's should be different upon entry.
2952  */
/*
 *	source_ino	inode number of the directory being moved.
 *	target		directory it is to be moved into; i_rwlock is
 *			asserted held and is never released here.
 *	sdp		parent directory of the source; i_rwlock is asserted
 *			held.  Reaching sdp while walking up ends the search
 *			successfully.
 *	cr		credentials handed to ufs_iget_alloced().
 *
 * Walks from target towards the root through the ".." entries, taking
 * each ancestor's i_rwlock as reader and dropping the previous one.
 *
 * Returns 0 when the source is not an ancestor of target, EINVAL when it
 * is, ENOTDIR when an ancestor is not a sane directory, EAGAIN when an
 * ancestor's lock could not be taken without risking a deadlock (the
 * caller is expected to back out and retry -- confirm against callers),
 * or the error from blkatoff() / ufs_iget_alloced().
 */
2953 int
2954 ufs_dircheckpath(
2955 	ino_t source_ino,
2956 	struct inode *target,
2957 	struct inode *sdp,
2958 	struct cred *cr)
2959 {
2960 	struct fbuf *fbp;
2961 	struct dirtemplate *dirp;
2962 	struct inode *ip;
2963 	struct ufsvfs *ufsvfsp;
2964 	struct inode *tip;
2965 	ino_t dotdotino;
2966 	int err;
2967 
2968 	ASSERT(target->i_ufsvfs != NULL);
2969 	ASSERT(RW_LOCK_HELD(&target->i_rwlock));
2970 	ASSERT(RW_LOCK_HELD(&sdp->i_rwlock));
2971 
	/*
	 * Trivial cases first.  Both jump to "out" with ip == target, so
	 * nothing is unlocked or released there.
	 */
2972 	ip = target;
2973 	if (ip->i_number == source_ino) {
2974 		err = EINVAL;
2975 		goto out;
2976 	}
2977 	if (ip->i_number == UFSROOTINO) {
2978 		err = 0;
2979 		goto out;
2980 	}
2981 	/*
2982 	 * Search back through the directory tree, using the ".." entries.
2983 	 * Fail any attempt to move a directory into an ancestor directory.
2984 	 */
2985 	fbp = NULL;
2986 	for (;;) {
2987 		struct vfs	*vfs;
2988 
		/* map the first block of ip; it starts with "." and ".." */
2989 		err = blkatoff(ip, (off_t)0, (char **)&dirp, &fbp);
2990 		if (err)
2991 			break;
2992 		if (((ip->i_mode & IFMT) != IFDIR) || ip->i_nlink == 0 ||
2993 		    ip->i_size < sizeof (struct dirtemplate)) {
2994 			dirbad(ip, "bad size, unlinked or not dir", (off_t)0);
2995 			err = ENOTDIR;
2996 			break;
2997 		}
2998 		if (dirp->dotdot_namlen != 2 ||
2999 		    dirp->dotdot_name[0] != '.' ||
3000 		    dirp->dotdot_name[1] != '.') {
3001 			dirbad(ip, "mangled .. entry", (off_t)0);
3002 			err = ENOTDIR;		/* Sanity check */
3003 			break;
3004 		}
3005 		dotdotino = (ino_t)dirp->dotdot_ino;
3006 		if (dotdotino == source_ino) {
3007 			err = EINVAL;
3008 			break;
3009 		}
		/* reached the root without meeting the source: err is 0 */
3010 		if (dotdotino == UFSROOTINO)
3011 			break;
3012 		if (fbp) {
3013 			fbrelse(fbp, S_OTHER);
3014 			fbp = NULL;
3015 		}
		/* remember these before ip is let go of */
3016 		vfs = ip->i_vfs;
3017 		ufsvfsp = ip->i_ufsvfs;
3018 
		/* target's lock and hold belong to the caller; leave them */
3019 		if (ip != target) {
3020 			rw_exit(&ip->i_rwlock);
3021 			VN_RELE(ITOV(ip));
3022 		}
3023 		/*
3024 		 * Race to get the inode.
3025 		 */
3026 		rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
3027 		if (err = ufs_iget_alloced(vfs, dotdotino, &tip, cr)) {
3028 			rw_exit(&ufsvfsp->vfs_dqrwlock);
			/* nothing is held any more; keep "out" from touching ip */
3029 			ip = NULL;
3030 			break;
3031 		}
3032 		rw_exit(&ufsvfsp->vfs_dqrwlock);
3033 		/*
3034 		 * If the directory of the source inode (also a directory)
3035 		 * is the same as this next entry up the chain, then
3036 		 * we know the source directory itself can't be in the
3037 		 * chain. This also prevents a panic because we already
3038 		 * have sdp->i_rwlock locked.
3039 		 */
3040 		if (tip == sdp) {
3041 			VN_RELE(ITOV(tip));
3042 			ip = NULL;
3043 			break;
3044 		}
3045 		ip = tip;
3046 
3047 		/*
3048 		 * If someone has set the WRITE_WANTED bit in this lock and if
3049 		 * this happens to be a sdp or tdp of another parallel rename
3050 		 * which is executing  the same code and in similar situation
3051 		 * we end up in a 4 way deadlock. We need to make sure that
3052 		 * the WRITE_WANTED bit is not  set.
3053 		 */
3054 retry_lock:
3055 		if (!rw_tryenter(&ip->i_rwlock, RW_READER)) {
3056 			/*
3057 			 * If the lock held as WRITER thats fine but if it
3058 			 * has WRITE_WANTED bit set we might end up in a
3059 			 * deadlock. If WRITE_WANTED is set we return
3060 			 * with EAGAIN else we just go back and try.
3061 			 */
3062 			if (RW_ISWRITER(&ip->i_rwlock) &&
3063 					!(RW_WRITE_HELD(&ip->i_rwlock))) {
				/*
				 * Direct return: ip's lock was never
				 * obtained, so only its hold is dropped.
				 * fbp was cleared above, so the fbrelse()
				 * is purely defensive.
				 */
3064 				err = EAGAIN;
3065 				if (fbp) {
3066 					fbrelse(fbp, S_OTHER);
3067 				}
3068 				VN_RELE(ITOV(ip));
3069 				return (err);
3070 			} else {
3071 				/*
3072 				 * The lock is being write held. We could
3073 				 * just do a rw_enter here but there is a
3074 				 * window between the check and now, where
3075 				 * the status could have changed, so to
3076 				 * avoid looping we backoff and go back to
3077 				 * try for the lock.
3078 				 */
3079 				delay(retry_backoff_delay);
3080 				dircheck_retry_cnt++;
3081 				goto retry_lock;
3082 			}
3083 		}
3084 	}
	/* release the mapping left over from the iteration that broke out */
3085 	if (fbp) {
3086 		fbrelse(fbp, S_OTHER);
3087 	}
3088 out:
	/*
	 * Drop the lock and hold on the last ancestor visited, unless it is
	 * the caller's own target or has already been let go (ip == NULL).
	 */
3089 	if (ip) {
3090 		if (ip != target) {
3091 			rw_exit(&ip->i_rwlock);
3092 			VN_RELE(ITOV(ip));
3093 		}
3094 	}
3095 	return (err);
3096 }
3097 
3098 int
3099 ufs_xattrdirempty(struct inode *ip, ino_t parentino, struct cred *cr)
3100 {
3101 	offset_t off;
3102 	struct dirtemplate dbuf;
3103 	struct direct *dp = (struct direct *)&dbuf;
3104 	int err, count;
3105 	int empty = 1;	/* Assume it's empty */
3106 #define	MINDIRSIZ (sizeof (struct dirtemplate) / 2)
3107 
3108 	ASSERT(RW_LOCK_HELD(&ip->i_contents));
3109 
3110 	ASSERT(ip->i_size <= (offset_t)MAXOFF_T);
3111 	for (off = 0; off < ip->i_size; off += dp->d_reclen) {
3112 		err = ufs_rdwri(UIO_READ, FREAD, ip, (caddr_t)dp,
3113 		    (ssize_t)MINDIRSIZ, off, UIO_SYSSPACE, &count, cr);
3114 		/*
3115 		 * Since we read MINDIRSIZ, residual must
3116 		 * be 0 unless we're at end of file.
3117 		 */
3118 
3119 		if (err || count != 0 || dp->d_reclen == 0) {
3120 			empty = 0;
3121 			break;
3122 		}
3123 		/* skip empty entries */
3124 		if (dp->d_ino == 0)
3125 			continue;
3126 		/*
3127 		 * At this point d_namlen must be 1 or 2.
3128 		 * 1 implies ".", 2 implies ".." if second
3129 		 * char is also "."
3130 		 */
3131 
3132 		if (dp->d_namlen == 1 && dp->d_name[0] == '.' &&
3133 				(ino_t)dp->d_ino == parentino)
3134 			continue;
3135 
3136 		if (dp->d_namlen == 2 && dp->d_name[0] == '.' &&
3137 			dp->d_name[1] == '.') {
3138 			continue;
3139 		}
3140 		empty = 0;
3141 		break;
3142 	}
3143 	return (empty);
3144 }
3145 
3146 
3147 /*
3148  * Allocate and initialize a new shadow inode to contain extended attributes.
3149  */
3150 int
3151 ufs_xattrmkdir(
3152 	struct inode *tdp,
3153 	struct inode **ipp,
3154 	int flags,
3155 	struct cred *cr)
3156 {
3157 	struct inode *ip;
3158 	struct vattr va;
3159 	int err;
3160 	int retry = 1;
3161 	struct ufsvfs *ufsvfsp;
3162 	struct ulockfs *ulp;
3163 	int issync;
3164 	int trans_size;
3165 	int dorwlock;		/* 0 = not yet taken, */
3166 				/* 1 = taken outside the transaction, */
3167 				/* 2 = taken inside the transaction */
3168 
3169 	/*
3170 	 * Validate permission to create attribute directory
3171 	 */
3172 
3173 	if ((err = ufs_iaccess(tdp, IWRITE, cr)) != 0) {
3174 		return (err);
3175 	}
3176 
3177 	if (vn_is_readonly(ITOV(tdp)))
3178 		return (EROFS);
3179 
3180 	/*
3181 	 * No need to re-init err after again:, since it's set before
3182 	 * the next use of it.
3183 	 */
3184 again:
3185 	dorwlock = 0;
3186 	va.va_type = VDIR;
3187 	va.va_uid = tdp->i_uid;
3188 	va.va_gid = tdp->i_gid;
3189 
3190 	if ((tdp->i_mode & IFMT) == IFDIR) {
3191 		va.va_mode = (o_mode_t)IFATTRDIR;
3192 		va.va_mode |= tdp->i_mode & 0777;
3193 	} else {
3194 		va.va_mode = (o_mode_t)IFATTRDIR|0700;
3195 		if (tdp->i_mode & 0040)
3196 			va.va_mode |= 0750;
3197 		if (tdp->i_mode & 0004)
3198 			va.va_mode |= 0705;
3199 	}
3200 	va.va_mask = AT_TYPE|AT_MODE;
3201 
3202 	ufsvfsp = tdp->i_ufsvfs;
3203 
3204 	err = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_MKDIR_MASK);
3205 	if (err)
3206 		return (err);
3207 
3208 	/*
3209 	 * Acquire i_rwlock before TRANS_BEGIN_CSYNC() if this is a file.
3210 	 * This follows the protocol for read()/write().
3211 	 */
3212 	if (ITOV(tdp)->v_type != VDIR) {
3213 		rw_enter(&tdp->i_rwlock, RW_WRITER);
3214 		dorwlock = 1;
3215 	}
3216 
3217 	if (ulp) {
3218 		trans_size = (int)TOP_MKDIR_SIZE(tdp);
3219 		TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_MKDIR, trans_size);
3220 	}
3221 
3222 	/*
3223 	 * Acquire i_rwlock after TRANS_BEGIN_CSYNC() if this is a directory.
3224 	 * This follows the protocol established by
3225 	 * ufs_link/create/remove/rename/mkdir/rmdir/symlink.
3226 	 */
3227 	if (dorwlock == 0) {
3228 		rw_enter(&tdp->i_rwlock, RW_WRITER);
3229 		dorwlock = 2;
3230 	}
3231 	rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
3232 	rw_enter(&tdp->i_contents, RW_WRITER);
3233 
3234 	/*
3235 	 * Suppress out of inodes messages if we will retry.
3236 	 */
3237 	if (retry)
3238 		tdp->i_flag |= IQUIET;
3239 	err = ufs_dirmakeinode(tdp, &ip, &va, DE_ATTRDIR, cr);
3240 	tdp->i_flag &= ~IQUIET;
3241 
3242 	if (err)
3243 		goto fail;
3244 
3245 	if (flags) {
3246 
3247 		/*
3248 		 * Now attach it to src file.
3249 		 */
3250 
3251 		tdp->i_oeftflag = ip->i_number;
3252 	}
3253 
3254 	ip->i_cflags |= IXATTR;
3255 	ITOV(ip)->v_flag |= V_XATTRDIR;
3256 	TRANS_INODE(ufsvfsp, tdp);
3257 	tdp->i_flag |= ICHG | IUPD;
3258 	tdp->i_seq++;
3259 	ufs_iupdat(tdp, I_SYNC);
3260 	rw_exit(&tdp->i_contents);
3261 	rw_exit(&ufsvfsp->vfs_dqrwlock);
3262 
3263 	rw_enter(&ip->i_rwlock, RW_WRITER);
3264 	rw_enter(&ip->i_contents, RW_WRITER);
3265 	TRANS_INODE(ufsvfsp, ip);
3266 	ip->i_flag |= ICHG| IUPD;
3267 	ip->i_seq++;
3268 	ufs_iupdat(ip, I_SYNC);
3269 	rw_exit(&ip->i_contents);
3270 	rw_exit(&ip->i_rwlock);
3271 	if (dorwlock == 2)
3272 		rw_exit(&tdp->i_rwlock);
3273 	if (ulp) {
3274 		int terr = 0;
3275 
3276 		TRANS_END_CSYNC(ufsvfsp, err, issync, TOP_MKDIR, trans_size);
3277 		ufs_lockfs_end(ulp);
3278 		if (err == 0)
3279 			err = terr;
3280 	}
3281 	if (dorwlock == 1)
3282 		rw_exit(&tdp->i_rwlock);
3283 	*ipp = ip;
3284 	return (err);
3285 
3286 fail:
3287 	rw_exit(&tdp->i_contents);
3288 	rw_exit(&ufsvfsp->vfs_dqrwlock);
3289 	if (dorwlock == 2)
3290 		rw_exit(&tdp->i_rwlock);
3291 	if (ulp) {
3292 		TRANS_END_CSYNC(ufsvfsp, err, issync, TOP_MKDIR, trans_size);
3293 		ufs_lockfs_end(ulp);
3294 	}
3295 	if (dorwlock == 1)
3296 		rw_exit(&tdp->i_rwlock);
3297 	if (ip != NULL)
3298 		VN_RELE(ITOV(ip));
3299 
3300 	/*
3301 	 * No inodes?  See if any are tied up in pending deletions.
3302 	 * This has to be done outside of any of the above, because
3303 	 * the draining operation can't be done from inside a transaction.
3304 	 */
3305 	if ((err == ENOSPC) && retry && TRANS_ISTRANS(ufsvfsp)) {
3306 		ufs_delete_drain_wait(ufsvfsp, 1);
3307 		retry = 0;
3308 		goto again;
3309 	}
3310 
3311 	return (err);
3312 }
3313 
3314 /*
3315  * clear the dotdot directory entry.
3316  * Used by ufs_dirscan when clr_dotdot
3317  * flag is set and we're deleting a
3318  * directory.
3319  */
3320 static int
3321 ufs_dirclrdotdot(struct inode *ip, ino_t parentino)
3322 {
3323 	struct fbuf *fbp;
3324 	struct direct *dotp, *dotdotp;
3325 	int err = 0;
3326 
3327 	ASSERT(RW_WRITE_HELD(&ip->i_rwlock));
3328 	ASSERT(RW_LOCK_HELD(&ip->i_contents));
3329 	err = blkatoff(ip, 0, NULL, &fbp);
3330 	if (err) {
3331 		return (err);
3332 	}
3333 
3334 	dotp = (struct direct *)fbp->fb_addr;
3335 	if ((dotp->d_namlen < (MAXNAMLEN + 1)) &&
3336 	    ((DIRBLKSIZ - DIRSIZ(dotp)) >= (sizeof (struct dirtemplate) / 2))) {
3337 		dotdotp = (struct direct *)((char *)dotp + dotp->d_reclen);
3338 		if ((dotdotp->d_namlen < (MAXNAMLEN + 1)) &&
3339 		    ((DIRBLKSIZ - DIRSIZ(dotp)) >= dotdotp->d_reclen)) {
3340 
3341 			dotp->d_reclen += dotdotp->d_reclen;
3342 			if (parentino == dotdotp->d_ino) {
3343 				dotdotp->d_ino = 0;
3344 				dotdotp->d_namlen = 0;
3345 				dotdotp->d_reclen = 0;
3346 			}
3347 
3348 			err = TRANS_DIR(ip, 0);
3349 			if (err) {
3350 				fbrelse(fbp, S_OTHER);
3351 			} else {
3352 				err = ufs_fbwrite(fbp, ip);
3353 			}
3354 		}
3355 	} else {
3356 		err = -1;
3357 	}
3358 	return (err);
3359 }
3360