xref: /titanic_52/usr/src/uts/common/fs/ufs/ufs_dir.c (revision d583b39bfb4e2571d3e41097c5c357ffe353ad45)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 1984, 2010, Oracle and/or its affiliates. All rights reserved.
23  */
24 
25 /*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
26 /*	  All Rights Reserved  	*/
27 
28 /*
29  * University Copyright- Copyright (c) 1982, 1986, 1988
30  * The Regents of the University of California
31  * All Rights Reserved
32  *
33  * University Acknowledgment- Portions of this document are derived from
34  * software developed by the University of California, Berkeley, and its
35  * contributors.
36  */
37 
38 /*
39  * Directory manipulation routines.
40  *
41  * When manipulating directories, the i_rwlock provides serialization
42  * since directories cannot be mmapped. The i_contents lock is redundant.
43  */
44 
45 #include <sys/types.h>
46 #include <sys/t_lock.h>
47 #include <sys/param.h>
48 #include <sys/systm.h>
49 #include <sys/signal.h>
50 #include <sys/cred.h>
51 #include <sys/proc.h>
52 #include <sys/disp.h>
53 #include <sys/user.h>
54 #include <sys/vfs.h>
55 #include <sys/vnode.h>
56 #include <sys/stat.h>
57 #include <sys/mode.h>
58 #include <sys/buf.h>
59 #include <sys/uio.h>
60 #include <sys/dnlc.h>
61 #include <sys/fs/ufs_inode.h>
62 #include <sys/fs/ufs_fs.h>
63 #include <sys/mount.h>
64 #include <sys/fs/ufs_fsdir.h>
65 #include <sys/fs/ufs_trans.h>
66 #include <sys/fs/ufs_panic.h>
67 #include <sys/fs/ufs_quota.h>
68 #include <sys/errno.h>
69 #include <sys/debug.h>
70 #include <vm/seg.h>
71 #include <sys/sysmacros.h>
72 #include <sys/cmn_err.h>
73 #include <sys/cpuvar.h>
74 #include <sys/unistd.h>
75 #include <sys/policy.h>
76 
77 /*
78  * This is required since we're using P2ROUNDUP_TYPED on DIRBLKSIZ
 * (ufs_dirlook() rounds i_size up to a DIRBLKSIZ multiple with it).
 * Fail the build outright if DIRBLKSIZ is ever not a power of two.
79  */
80 #if !ISP2(DIRBLKSIZ)
81 #error	"DIRBLKSIZ not a power of 2"
82 #endif
83 
84 /*
85  * A virgin directory.
 *
 * Template holding the two entries "." and "..".  Each initializer row
 * is presumably (inode number, record length, name length, name) --
 * TODO confirm against struct dirtemplate.  The inode numbers are 0
 * here.  The two record lengths, 12 and DIRBLKSIZ - 12, add up to
 * exactly one DIRBLKSIZ block, and the name lengths 1 and 2 match the
 * strings "." and "..".
86  */
87 static struct dirtemplate mastertemplate = {
88 	0, 12, 1, ".",
89 	0, DIRBLKSIZ - 12, 2, ".."
90 };
91 
/*
 * LDIRSIZ(len) is the size in bytes of a directory record able to hold a
 * name of len characters: the fixed part of struct direct (everything but
 * its MAXNAMLEN + 1 byte name buffer) plus the name and its terminating
 * NUL, rounded up to a multiple of 4.
 *
 * MAX_DIR_NAME_LEN(len) is the inverse bound: the longest name (excluding
 * the NUL) that fits in a record of len bytes.
 *
 * Both macros parenthesize their argument so that an expression argument
 * (for example a conditional expression) is evaluated as a unit.
 */
#define	LDIRSIZ(len) \
	((sizeof (struct direct) - (MAXNAMLEN + 1)) + (((len) + 1 + 3) &~ 3))
#define	MAX_DIR_NAME_LEN(len) \
	(((len) - (sizeof (struct direct) - (MAXNAMLEN + 1))) - 1)
96 
/*
 * The dnlc directory cache allows a 64 bit handle for directory entries.
 * For ufs we squeeze both the 32 bit inumber and a 32 bit disk offset
 * into the handle: the inumber occupies the low 32 bits and the offset
 * the high 32 bits. Note, a 32 bit offset allows a 4GB directory, which
 * is way beyond what could be cached in memory by the directory
 * caching routines. So we are quite safe with this limit.
 * The macros below pack and unpack the handle.
 *
 * Each expansion is fully parenthesized so it behaves as a single
 * operand wherever it is used, and INO_OFF_TO_H() truncates ino to
 * 32 bits so that a wider or sign-extended argument cannot spill into
 * the offset half of the handle.
 */
#define	H_TO_INO(h) ((uint32_t)((h) & UINT_MAX))
#define	H_TO_OFF(h) ((off_t)((h) >> 32))
#define	INO_OFF_TO_H(ino, off) \
	((uint64_t)(((uint64_t)(off) << 32) | (uint32_t)(ino)))
108 
109 /*
110  * The average size of a typical on disk directory entry is about 16 bytes
111  * and so defines AV_DIRECT_SHIFT : log2(16)
112  * This define is only used to approximate the number of entries
113  * in a directory. This is needed for dnlc_dir_start() which will immediately
114  * return an error if the value is not within its acceptable range of
115  * number of files in a directory.
116  */
117 #define	AV_DIRECT_SHIFT 4
118 /*
119  * If the directory size (from i_size) is at least the ufs_min_dir_cache
120  * tunable then we request dnlc directory caching.
121  * This has been found to be profitable after 1024 file names.
 * The default is therefore 1024 entries times the average entry size.
122  */
123 int ufs_min_dir_cache = 1024 << AV_DIRECT_SHIFT;
124 
125 /*
 * The time point the dnlc directory caching was disabled.  This is one
 * file-scope timestamp shared by every directory: ufs_dirlook() stores
 * gethrtime() here whenever dnlc_dir_start() returns DNOMEM.
 */
126 static hrtime_t ufs_dc_disable_at;
127 /* directory caching disable duration: NANOSEC * 5 (hrtime units) */
128 static hrtime_t ufs_dc_disable_duration = (hrtime_t)NANOSEC * 5;
129 
/*
 * dirchk: when non-zero ufs_dirlook() runs dirmangled() on every entry
 * it scans, not only on entries whose record length is misaligned.
 * It defaults to on for DEBUG builds only.
 */
130 #ifdef DEBUG
131 int dirchk = 1;
132 #else /* !DEBUG */
133 int dirchk = 0;
134 #endif /* DEBUG */
/*
 * When non-zero, ufs_dirlook() enters DNLC_NO_VNODE into the dnlc for
 * names it failed to find (provided the directory still has links).
 */
135 int ufs_negative_cache = 1;
/*
 * Not referenced in the part of the file shown here; presumably counts
 * retries in ufs_dirremove() -- TODO confirm.
 */
136 uint64_t ufs_dirremove_retry_cnt;
137 
/*
 * Forward declarations of file-local helpers.  They are written with
 * empty parameter lists, so they declare only the return type and the
 * compiler does not check argument types or counts at the call sites.
 */
138 static void dirbad();
139 static int ufs_dirrename();
140 static int ufs_diraddentry();
141 static int ufs_dirempty();
142 static int ufs_dirscan();
143 static int ufs_dirclrdotdot();
144 static int ufs_dirfixdotdot();
145 static int ufs_dirpurgedotdot();
146 static int dirprepareentry();
147 static int ufs_dirmakedirect();
148 static int dirbadname();
149 static int dirmangled();
150 
151 /*
152  * Verify that ip is a directory (IFDIR) or an attribute directory
153  * (IFATTRDIR) and that cr may access it in the requested mode.
154  * Searching a directory needs execute permission; write permission
155  * is what allows entries to be deleted from it.
156  * Returns ENOTDIR for anything that is not a directory, otherwise
157  * the result of ufs_iaccess(), which takes i_contents as reader.
158  */
159 int
160 ufs_diraccess(struct inode *ip, int mode, struct cred *cr)
161 {
162 	int fmt = ip->i_mode & IFMT;	/* file-type bits only */
163 
164 	if (fmt == IFDIR || fmt == IFATTRDIR)
165 		return (ufs_iaccess(ip, mode, cr, 1));
166 	return (ENOTDIR);
167 }
168 
169 /*
170  * Look for a given name in a directory.  On successful return, *ipp
171  * will point to the VN_HELD inode.
172  * The caller is responsible for checking accessibility upfront
173  * via ufs_diraccess().
 *
 * Lookup order: the per-name dnlc (unless skipdnlc is set), then the
 * directory cache anchored at dp->i_danchor, then a scan of the
 * directory blocks.  A plain scan starts at the dp->i_diroff hint and,
 * if that was not the start of the directory, makes a second pass over
 * the part before the hint.  When the directory cache is being loaded
 * (directory at least ufs_min_dir_cache bytes, skipcaching clear and
 * caching enabled on dp) the scan starts at offset 0 and continues to
 * the end of the directory even after the name has been found.
 *
 * dp->i_rwlock is acquired as reader through ufs_tryirwlock() and is
 * released with rw_exit() before every return that follows a
 * successful acquisition.  For ".." the lock is dropped around
 * ufs_iget_alloced() and the result is re-validated afterwards.
 *
 * Returns 0 with *ipp held, ENOENT if the name is not present, EAGAIN
 * if ufs_tryirwlock() reported a possible deadlock (indeadlock set),
 * or the error from blkatoff() / ufs_iget_alloced().
174  */
175 int
176 ufs_dirlook(
177 	struct inode *dp,
178 	char *namep,
179 	struct inode **ipp,
180 	struct cred *cr,
181 	int skipdnlc,			/* skip the 1st level dnlc */
182 	int skipcaching)		/* force directory caching off */
183 {
184 	uint64_t handle;
185 	struct fbuf *fbp;		/* a buffer of directory entries */
186 	struct direct *ep;		/* the current directory entry */
187 	struct vnode *vp;
188 	struct vnode *dvp;		/* directory vnode ptr */
	/*
	 * ulp is not referenced directly in this function; it is
	 * presumably consumed by the ufs_tryirwlock() macro (confirm in
	 * ufs_inode.h).  Start from NULL so that it is never read with
	 * an indeterminate value when dp->i_ufsvfs is NULL.
	 */
189 	struct ulockfs *ulp = NULL;
190 	dcanchor_t *dcap;
191 	off_t endsearch;		/* offset to end directory search */
192 	off_t offset;
193 	off_t start_off;		/* starting offset from middle search */
194 	off_t last_offset;		/* last offset */
195 	int entryoffsetinblock;		/* offset of ep in addr's buffer */
196 	int numdirpasses;		/* strategy for directory search */
197 	int namlen;			/* length of name */
198 	int err;
199 	int doingchk;			/* re-checking ".." after mtime change */
200 	int i;
201 	int caching;			/* loading the directory cache */
202 	int indeadlock;			/* tested after each ufs_tryirwlock() */
203 	ino_t ep_ino;			/* entry i number */
204 	ino_t chkino;			/* ".." inumber seen before recheck */
205 	ushort_t ep_reclen;		/* direct local d_reclen */
206 
207 	ASSERT(*namep != '\0'); /* All callers ensure *namep is non null */
208 
209 	if (dp->i_ufsvfs)
210 		ulp = &dp->i_ufsvfs->vfs_ulockfs;
211 
212 	/*
213 	 * Check the directory name lookup cache, first for individual files
214 	 * then for complete directories.
215 	 */
216 	dvp = ITOV(dp);
217 	if (!skipdnlc && (vp = dnlc_lookup(dvp, namep))) {
218 		/* vp is already held from dnlc_lookup */
219 		if (vp == DNLC_NO_VNODE) {
220 			VN_RELE(vp);
221 			return (ENOENT);
222 		}
223 		*ipp = VTOI(vp);
224 		return (0);
225 	}
226 
227 	dcap = &dp->i_danchor;
228 
229 	/*
230 	 * Grab the reader lock on the directory data before checking
231 	 * the dnlc to avoid a race with ufs_dirremove() & friends.
232 	 *
233 	 * ufs_tryirwlock uses rw_tryenter and checks for SLOCK to
234 	 * avoid i_rwlock, ufs_lockfs_begin deadlock. If deadlock
235 	 * possible, retries the operation.
236 	 */
237 	ufs_tryirwlock((&dp->i_rwlock), RW_READER, retry_dircache);
238 	if (indeadlock)
239 		return (EAGAIN);
240 
241 	switch (dnlc_dir_lookup(dcap, namep, &handle)) {
242 	case DFOUND:
243 		ep_ino = (ino_t)H_TO_INO(handle);
244 		if (dp->i_number == ep_ino) {
245 			VN_HOLD(dvp);	/* want ourself, "." */
246 			*ipp = dp;
247 			rw_exit(&dp->i_rwlock);
248 			return (0);
249 		}
250 		if (namep[0] == '.' && namep[1] == '.' && namep[2] == 0) {
251 			uint64_t handle2;
252 			/*
253 			 * release the lock on the dir we are searching
254 			 * to avoid a deadlock when grabbing the
255 			 * i_contents lock in ufs_iget_alloced().
256 			 */
257 			rw_exit(&dp->i_rwlock);
258 			rw_enter(&dp->i_ufsvfs->vfs_dqrwlock, RW_READER);
259 			err = ufs_iget_alloced(dp->i_vfs, ep_ino, ipp, cr);
260 			rw_exit(&dp->i_ufsvfs->vfs_dqrwlock);
261 			/*
262 			 * must recheck as we dropped dp->i_rwlock
263 			 */
264 			ufs_tryirwlock(&dp->i_rwlock, RW_READER, retry_parent);
265 			if (indeadlock) {
266 				if (!err)
267 					VN_RELE(ITOV(*ipp));
268 				return (EAGAIN);
269 			}
			/* accept only if the cache still maps ".." to the same handle */
270 			if (!err && (dnlc_dir_lookup(dcap, namep, &handle2)
271 			    == DFOUND) && (handle == handle2)) {
272 				dnlc_update(dvp, namep, ITOV(*ipp));
273 				rw_exit(&dp->i_rwlock);
274 				return (0);
275 			}
276 			/* check failed, read the actual directory */
277 			if (!err) {
278 				VN_RELE(ITOV(*ipp));
279 			}
280 			goto restart;
281 		}
282 		/* usual case of not "." nor ".." */
283 		rw_enter(&dp->i_ufsvfs->vfs_dqrwlock, RW_READER);
284 		err = ufs_iget_alloced(dp->i_vfs, ep_ino, ipp, cr);
285 		rw_exit(&dp->i_ufsvfs->vfs_dqrwlock);
286 		if (err) {
287 			rw_exit(&dp->i_rwlock);
288 			return (err);
289 		}
290 		dnlc_update(dvp, namep, ITOV(*ipp));
291 		rw_exit(&dp->i_rwlock);
292 		return (0);
293 	case DNOENT:
294 		if (ufs_negative_cache && (dp->i_nlink > 0)) {
295 			dnlc_enter(dvp, namep, DNLC_NO_VNODE);
296 		}
297 		rw_exit(&dp->i_rwlock);
298 		return (ENOENT);
299 	default:
		/* any other status: fall through to scanning the directory */
300 		break;
301 	}
302 restart:
303 
304 	fbp = NULL;
305 	doingchk = 0;
306 	chkino = 0;
307 	caching = 0;
308 
309 	/*
310 	 * Attempt to cache any directories greater than the tunable
311 	 * ufs_min_dir_cache. If it fails due to memory shortage (DNOMEM),
312 	 * disable caching for this directory and record the system time.
313 	 * Any attempt after the disable time has expired will enable
314 	 * the caching again.
315 	 */
316 	if (!skipcaching && (dp->i_size >= ufs_min_dir_cache)) {
317 		/*
318 		 * if the directory caching disable time has expired
319 		 * enable the caching again.
320 		 */
321 		if (dp->i_cachedir == CD_DISABLED_NOMEM &&
322 		    gethrtime() - ufs_dc_disable_at > ufs_dc_disable_duration) {
323 			ufs_dc_disable_at = 0;
324 			dp->i_cachedir = CD_ENABLED;
325 		}
326 		if (dp->i_cachedir == CD_ENABLED) {
			/* size hint: i_size divided by the average entry size */
327 			switch (dnlc_dir_start(dcap, dp->i_size >>
328 			    AV_DIRECT_SHIFT)) {
329 			case DNOMEM:
330 				dp->i_cachedir = CD_DISABLED_NOMEM;
331 				ufs_dc_disable_at = gethrtime();
332 				break;
333 			case DTOOBIG:
334 				dp->i_cachedir = CD_DISABLED_TOOBIG;
335 				break;
336 			case DOK:
337 				caching = 1;
338 				break;
339 			default:
340 				break;
341 			}
342 		}
343 	}
344 	/*
345 	 * If caching we don't stop when the file has been
346 	 * found, but need to know later, so clear *ipp now
347 	 */
348 	*ipp = NULL;
349 
350 recheck:
351 	if (caching) {
352 		offset = 0;
353 		entryoffsetinblock = 0;
354 		numdirpasses = 1;
355 	} else {
356 		/*
357 		 * Take care to look at dp->i_diroff only once, as it
358 		 * may be changing due to other threads/cpus.
359 		 */
360 		offset = dp->i_diroff;
361 		if (offset > dp->i_size) {
362 			offset = 0;
363 		}
364 		if (offset == 0) {
365 			entryoffsetinblock = 0;
366 			numdirpasses = 1;
367 		} else {
368 			start_off = offset;
369 
370 			entryoffsetinblock = blkoff(dp->i_fs, offset);
			/*
			 * Starting mid-block: the search loop only maps a
			 * block at a block boundary, so map this one now.
			 */
371 			if (entryoffsetinblock != 0) {
372 				err = blkatoff(dp, offset, (char **)0, &fbp);
373 				if (err)
374 					goto bad;
375 			}
376 			numdirpasses = 2;
377 		}
378 	}
379 	endsearch = P2ROUNDUP_TYPED(dp->i_size, DIRBLKSIZ, u_offset_t);
380 	namlen = strlen(namep);
381 	last_offset = 0;
382 
383 searchloop:
384 	while (offset < endsearch) {
385 		/*
386 		 * If offset is on a block boundary,
387 		 * read the next directory block.
388 		 * Release previous if it exists.
389 		 */
390 		if (blkoff(dp->i_fs, offset) == 0) {
391 			if (fbp != NULL) {
392 				fbrelse(fbp, S_OTHER);
393 			}
394 			err = blkatoff(dp, offset, (char **)0, &fbp);
395 			if (err)
396 				goto bad;
397 			entryoffsetinblock = 0;
398 		}
399 
400 		/*
401 		 * If the offset to the next entry is invalid or if the
402 		 * next entry is a zero length record or if the record
403 		 * length is invalid, then skip to the next directory
404 		 * block.  Complete validation checks are done if the
405 		 * record length is invalid.
406 		 *
407 		 * Full validation checks are slow so they are disabled
408 		 * by default.  Complete checks can be run by patching
409 		 * "dirchk" to be true.
410 		 *
411 		 * We have to check the validity of entryoffsetinblock
412 		 * here because it can be set to i_diroff above.
		 *
		 * Note that && binds tighter than ||: dirmangled() is only
		 * consulted when dirchk is set or d_reclen is not a
		 * multiple of 4.
413 		 */
414 		ep = (struct direct *)(fbp->fb_addr + entryoffsetinblock);
415 		if ((entryoffsetinblock & 0x3) || ep->d_reclen == 0 ||
416 		    (dirchk || (ep->d_reclen & 0x3)) &&
417 		    dirmangled(dp, ep, entryoffsetinblock, offset)) {
			/* advance to the next DIRBLKSIZ boundary */
418 			i = DIRBLKSIZ - (entryoffsetinblock & (DIRBLKSIZ - 1));
419 			offset += i;
420 			entryoffsetinblock += i;
421 			if (caching) {
422 				dnlc_dir_purge(dcap);
423 				caching = 0;
424 			}
425 			continue;
426 		}
427 
428 		ep_reclen = ep->d_reclen;
429 
430 		/*
431 		 * Add named entries and free space into the directory cache
432 		 */
433 		if (caching) {
434 			ushort_t extra;
435 			off_t off2;
436 
437 			if (ep->d_ino == 0) {
				/* unused entry: the whole record is free space */
438 				extra = ep_reclen;
439 				if (offset & (DIRBLKSIZ - 1)) {
					/* unused entry not at a DIRBLKSIZ boundary */
440 					dnlc_dir_purge(dcap);
441 					dp->i_cachedir = CD_DISABLED;
442 					caching = 0;
443 				}
444 			} else {
445 				/*
446 				 * entries hold the previous offset except the
447 				 * 1st which holds the offset + 1
448 				 */
449 				if (offset & (DIRBLKSIZ - 1)) {
450 					off2 = last_offset;
451 				} else {
452 					off2 = offset + 1;
453 				}
454 				caching = (dnlc_dir_add_entry(dcap, ep->d_name,
455 				    INO_OFF_TO_H(ep->d_ino, off2)) == DOK);
				/* slack left in the record after this entry */
456 				extra = ep_reclen - DIRSIZ(ep);
457 			}
			/* only record space that could hold a 1-character name */
458 			if (caching && (extra >= LDIRSIZ(1))) {
459 				caching = (dnlc_dir_add_space(dcap, extra,
460 				    (uint64_t)offset) == DOK);
461 			}
462 		}
463 
464 		/*
465 		 * Check for a name match.
466 		 * We have the parent inode read locked with i_rwlock.
467 		 */
468 		if (ep->d_ino && ep->d_namlen == namlen &&
469 		    *namep == *ep->d_name &&	/* fast chk 1st chr */
470 		    bcmp(namep, ep->d_name, (int)ep->d_namlen) == 0) {
471 
472 			/*
473 			 * We have to release the fbp early here to avoid
474 			 * a possible deadlock situation where we have the
475 			 * fbp and want the directory inode and someone doing
476 			 * a ufs_direnter_* has the directory inode and wants
477 			 * the fbp.  XXX - is this still needed?
478 			 */
479 			ep_ino = (ino_t)ep->d_ino;
480 			ASSERT(fbp != NULL);
481 			fbrelse(fbp, S_OTHER);
482 			fbp = NULL;
483 
484 			/*
485 			 * Atomic update (read lock held)
486 			 */
487 			dp->i_diroff = offset;
488 
489 			if (namlen == 2 && namep[0] == '.' && namep[1] == '.') {
490 				struct timeval32 omtime;
491 
492 				if (caching) {
493 					dnlc_dir_purge(dcap);
494 					caching = 0;
495 				}
496 				if (doingchk) {
497 					/*
498 					 * if the inumber didn't change
499 					 * continue with already found inode.
500 					 */
501 					if (ep_ino == chkino)
502 						goto checkok;
503 					else {
504 						VN_RELE(ITOV(*ipp));
505 						/* *ipp is nulled at restart */
506 						goto restart;
507 					}
508 				}
509 				/*
510 				 * release the lock on the dir we are searching
511 				 * to avoid a deadlock when grabbing the
512 				 * i_contents lock in ufs_iget_alloced().
513 				 */
514 				omtime = dp->i_mtime;
515 				rw_exit(&dp->i_rwlock);
516 				rw_enter(&dp->i_ufsvfs->vfs_dqrwlock,
517 				    RW_READER);
518 				err = ufs_iget_alloced(dp->i_vfs, ep_ino, ipp,
519 				    cr);
520 				rw_exit(&dp->i_ufsvfs->vfs_dqrwlock);
521 				ufs_tryirwlock(&dp->i_rwlock, RW_READER,
522 				    retry_disk);
523 				if (indeadlock) {
524 					if (!err)
525 						VN_RELE(ITOV(*ipp));
526 					return (EAGAIN);
527 				}
528 				if (err)
529 					goto bad;
530 				/*
531 				 * Since we released the lock on the directory,
532 				 * we must check that the same inode is still
533 				 * the ".." entry for this directory.
534 				 */
535 				/*CSTYLED*/
536 				if (timercmp(&omtime, &dp->i_mtime, !=)) {
537 					/*
538 					 * Modification time changed on the
539 					 * directory, we must go check if
540 					 * the inumber changed for ".."
541 					 */
542 					doingchk = 1;
543 					chkino = ep_ino;
544 					entryoffsetinblock = 0;
545 					if (caching) {
546 						/*
547 						 * Forget directory caching
548 						 * for this rare case
549 						 */
550 						dnlc_dir_purge(dcap);
551 						caching = 0;
552 					}
553 					goto recheck;
554 				}
555 			} else if (dp->i_number == ep_ino) {
556 				VN_HOLD(dvp);	/* want ourself, "." */
557 				*ipp = dp;
558 				if (caching) {
559 					dnlc_dir_purge(dcap);
560 					caching = 0;
561 				}
562 			} else {
563 				rw_enter(&dp->i_ufsvfs->vfs_dqrwlock,
564 				    RW_READER);
565 				err = ufs_iget_alloced(dp->i_vfs, ep_ino, ipp,
566 				    cr);
567 				rw_exit(&dp->i_ufsvfs->vfs_dqrwlock);
568 				if (err)
569 					goto bad;
570 			}
571 checkok:
572 			ASSERT(*ipp);
573 			dnlc_update(dvp, namep, ITOV(*ipp));
574 			/*
575 			 * If we are not caching then just return the entry
576 			 * otherwise complete loading up the cache
577 			 */
578 			if (!caching) {
579 				rw_exit(&dp->i_rwlock);
580 				return (0);
581 			}
			/* fbp was released above; map the block again to go on */
582 			err = blkatoff(dp, offset, (char **)0, &fbp);
583 			if (err)
584 				goto bad;
585 		}
586 		last_offset = offset;
587 		offset += ep_reclen;
588 		entryoffsetinblock += ep_reclen;
589 	}
590 	/*
591 	 * If we started in the middle of the directory and failed
592 	 * to find our target, we must check the beginning as well.
593 	 */
594 	if (numdirpasses == 2) {
595 		numdirpasses--;
596 		offset = 0;
597 		endsearch = start_off;
598 		goto searchloop;
599 	}
600 
601 	/*
602 	 * If whole directory caching is on (or was originally on) then
603 	 * the entry may have been found.
604 	 */
605 	if (*ipp == NULL) {
606 		err = ENOENT;
607 		if (ufs_negative_cache && (dp->i_nlink > 0)) {
608 			dnlc_enter(dvp, namep, DNLC_NO_VNODE);
609 		}
610 	}
611 	if (caching) {
612 		dnlc_dir_complete(dcap);
613 		caching = 0;
614 	}
615 
616 bad:
617 	if (err && *ipp) {
618 		/*
619 		 * err and *ipp can both be set if we were attempting to
620 		 * cache the directory, and we found the entry, then later
621 		 * while trying to complete the directory cache encountered
622 		 * a error (eg reading a directory sector).
623 		 */
624 		VN_RELE(ITOV(*ipp));
625 		*ipp = NULL;
626 	}
627 
628 	if (fbp)
629 		fbrelse(fbp, S_OTHER);
630 	rw_exit(&dp->i_rwlock);
	/* still set only when we got here by an error; drop the partial cache */
631 	if (caching)
632 		dnlc_dir_purge(dcap);
633 	return (err);
634 }
635 
636 /*
637  * Write a new directory entry for DE_CREATE or DE_MKDIR operations.
 *
 * The caller holds tdp->i_rwlock as writer (asserted).  For the names
 * "." and ".." that lock is dropped around ufs_dirlook() and re-taken
 * with ufs_tryirwlock(); any return of EAGAIN from that path is made
 * without tdp->i_rwlock held.
 *
 * flags carries IQUIET (suppress the out-of-inodes message) and, in its
 * remaining bits, the "no entry exists" hint that is passed on to
 * ufs_dircheckforname().
 *
 * Returns 0 with *ipp set to the newly made inode; EEXIST with *ipp
 * set to the held existing inode when the name is already present;
 * EINVAL for a directory or device/door/socket/fifo node inside an
 * attribute directory; EACCES if the name contains '/'; ENOENT if tdp
 * has no links left; EAGAIN on a possible lock deadlock; or the error
 * from the access checks and helper routines.
638  */
639 int
640 ufs_direnter_cm(
641 	struct inode *tdp,	/* target directory to make entry in */
642 	char *namep,		/* name of entry */
643 	enum de_op op,		/* entry operation */
644 	struct vattr *vap,	/* attributes if new inode needed */
645 	struct inode **ipp,	/* return entered inode here */
646 	struct cred *cr,	/* user credentials */
647 	int flags)		/* no entry exists */
648 {
649 	struct inode *tip;	/* inode of (existing) target file */
650 	char *s;
651 	struct ufs_slot slot;	/* slot info to pass around */
652 	int namlen;		/* length of name */
653 	int err;		/* error number */
	/*
	 * nip is tested after a failed ufs_dirmakeinode(); start from NULL
	 * so that test is well defined even if the callee returns before
	 * storing through &nip.
	 */
654 	struct inode *nip = NULL;	/* new inode */
655 	int do_rele_nip = 0;	/* release nip */
656 	int noentry = flags & ~IQUIET;
657 	int quiet = flags & IQUIET;	/* Suppress out of inodes message */
658 	int indeadlock;
	/*
	 * ulp is only assigned below when tdp->i_ufsvfs is non-NULL and is
	 * presumably consumed by the ufs_tryirwlock() macro (confirm in
	 * ufs_inode.h); start from NULL so it is never indeterminate.
	 */
659 	struct ulockfs *ulp = NULL;
660 
661 	ASSERT(RW_WRITE_HELD(&tdp->i_rwlock));
662 
	/* attribute directories may not contain directories or special files */
663 	if (((tdp->i_mode & IFMT) == IFATTRDIR) && ((op == DE_MKDIR) ||
664 	    ((vap->va_type == VCHR) || (vap->va_type == VBLK) ||
665 	    (vap->va_type == VDOOR) || (vap->va_type == VSOCK) ||
666 	    (vap->va_type == VFIFO))))
667 		return (EINVAL);
668 
669 	/* don't allow '/' characters in pathname component */
670 	for (s = namep, namlen = 0; *s; s++, namlen++)
671 		if (*s == '/')
672 			return (EACCES);
673 	ASSERT(namlen);
674 
675 	/*
676 	 * Check accessibility of target directory.
677 	 */
678 	if (err = ufs_diraccess(tdp, IEXEC, cr))
679 		return (err);
680 
681 	/*
682 	 * If name is "." or ".." then if this is a create look it up
683 	 * and return EEXIST.
684 	 */
685 	if (namep[0] == '.' &&
686 	    (namlen == 1 || (namlen == 2 && namep[1] == '.'))) {
687 		/*
688 		 * ufs_dirlook will acquire the i_rwlock
689 		 */
690 		if (tdp->i_ufsvfs)
691 			ulp = &tdp->i_ufsvfs->vfs_ulockfs;
692 		rw_exit(&tdp->i_rwlock);
693 		if (err = ufs_dirlook(tdp, namep, ipp, cr, 0, 0)) {
694 			if (err == EAGAIN)
695 				return (err);
696 
697 			/*
698 			 * ufs_tryirwlock uses rw_tryenter and checks for
699 			 * SLOCK to avoid i_rwlock, ufs_lockfs_begin deadlock.
700 			 * If deadlock possible, retries the operation.
701 			 */
702 			ufs_tryirwlock(&tdp->i_rwlock, RW_WRITER, retry_err);
703 			if (indeadlock)
704 				return (EAGAIN);
705 
706 			return (err);
707 		}
708 		ufs_tryirwlock(&tdp->i_rwlock, RW_WRITER, retry);
709 		if (indeadlock) {
			/* drop the hold ufs_dirlook() returned in *ipp */
710 			VN_RELE(ITOV(*ipp));
711 			return (EAGAIN);
712 		}
713 		return (EEXIST);
714 	}
715 
716 	/*
717 	 * If target directory has not been removed, then we can consider
718 	 * allowing file to be created.
719 	 */
720 	if (tdp->i_nlink <= 0) {
721 		return (ENOENT);
722 	}
723 
724 	/*
725 	 * Search for the entry. Return VN_HELD tip if found.
726 	 */
727 	tip = NULL;
728 	slot.fbp = NULL;
729 	slot.status = NONE;
730 	rw_enter(&tdp->i_ufsvfs->vfs_dqrwlock, RW_READER);
731 	rw_enter(&tdp->i_contents, RW_WRITER);
732 	err = ufs_dircheckforname(tdp, namep, namlen, &slot, &tip, cr, noentry);
733 	if (err)
734 		goto out;
735 	if (tip) {
736 		ASSERT(!noentry);
737 		*ipp = tip;
738 		err = EEXIST;
739 	} else {
740 		/*
741 		 * The entry does not exist. Check write permission in
742 		 * directory to see if entry can be created.
743 		 */
744 		if (err = ufs_iaccess(tdp, IWRITE, cr, 0))
745 			goto out;
746 		/*
747 		 * Make new inode and directory entry.
748 		 */
		/* IQUIET stays set on tdp until it is cleared again at out: */
749 		tdp->i_flag |= quiet;
750 		if (err = ufs_dirmakeinode(tdp, &nip, vap, op, cr)) {
751 			if (nip != NULL)
752 				do_rele_nip = 1;
753 			goto out;
754 		}
755 		if (err = ufs_diraddentry(tdp, namep, op,
756 		    namlen, &slot, nip, NULL, cr)) {
757 			/*
758 			 * Unmake the inode we just made.
759 			 */
760 			rw_enter(&nip->i_contents, RW_WRITER);
761 			if (((nip->i_mode & IFMT) == IFDIR) ||
762 			    ((nip->i_mode & IFMT) == IFATTRDIR)) {
				/* a new directory had raised tdp's link count */
763 				tdp->i_nlink--;
764 				ufs_setreclaim(tdp);
765 				tdp->i_flag |= ICHG;
766 				tdp->i_seq++;
767 				TRANS_INODE(tdp->i_ufsvfs, tdp);
768 				ITIMES_NOLOCK(tdp);
769 			}
770 			nip->i_nlink = 0;
771 			ufs_setreclaim(nip);
772 			TRANS_INODE(nip->i_ufsvfs, nip);
773 			nip->i_flag |= ICHG;
774 			nip->i_seq++;
775 			ITIMES_NOLOCK(nip);
776 			rw_exit(&nip->i_contents);
777 			do_rele_nip = 1;
778 		} else {
779 			*ipp = nip;
780 		}
781 	}
782 
783 out:
784 	if (slot.fbp)
785 		fbrelse(slot.fbp, S_OTHER);
786 
787 	tdp->i_flag &= ~quiet;
788 	rw_exit(&tdp->i_contents);
789 
790 	/*
791 	 * Drop vfs_dqrwlock before calling VN_RELE() on nip to
792 	 * avoid deadlock since ufs_delete() grabs vfs_dqrwlock as reader.
793 	 */
794 	rw_exit(&tdp->i_ufsvfs->vfs_dqrwlock);
795 
796 	if (do_rele_nip) {
797 		VN_RELE(ITOV(nip));
798 	}
799 
800 	return (err);
801 }
802 
803 /*
804  * Write a new directory entry for DE_LINK, DE_SYMLINK or DE_RENAME operations.
 *
 * The caller holds tdp->i_rwlock as writer (asserted).  For link and
 * rename the link count of sip is raised and the inode pushed to disk
 * before the directory is touched; if anything later fails the raise
 * is undone at out2.
 *
 * Returns 0 on success; EACCES if the name contains '/'; EINVAL for a
 * rename to "." or ".."; EEXIST for a link or symlink whose name is
 * "." or ".." or already exists; ENOENT if sip or tdp has no links
 * left; EMLINK if sip is already at MAXLINK; or the error from the
 * sync, access-check and directory helper routines.
805  */
806 int
807 ufs_direnter_lr(
808 	struct inode *tdp,	/* target directory to make entry in */
809 	char *namep,		/* name of entry */
810 	enum de_op op,		/* entry operation */
811 	struct inode *sdp,	/* source inode parent if rename */
812 	struct inode *sip,	/* source inode */
813 	struct cred *cr)	/* user credentials */
814 {
815 	struct inode *tip;	/* inode of (existing) target file */
816 	char *s;
817 	struct ufs_slot slot;	/* slot info to pass around */
818 	int namlen;		/* length of name */
819 	int err;		/* error number */
820 
821 	/* don't allow '/' characters in pathname component */
822 	for (s = namep, namlen = 0; *s; s++, namlen++)
823 		if (*s == '/')
824 			return (EACCES);
825 	ASSERT(namlen);
826 	ASSERT(RW_WRITE_HELD(&tdp->i_rwlock));
827 
828 	/*
829 	 * "." and ".." are rejected here without a lookup: a link or
830 	 * symlink gets EEXIST.  Rename or link TO "." or ".." is forbidden.
831 	 */
832 	if (namep[0] == '.' &&
833 	    (namlen == 1 || (namlen == 2 && namep[1] == '.'))) {
834 		if (op == DE_RENAME) {
835 			return (EINVAL);	/* *SIGH* should be ENOTEMPTY */
836 		}
837 		return (EEXIST);
838 	}
839 	/*
840 	 * For link and rename lock the source entry and check the link count
841 	 * to see if it has been removed while it was unlocked.  If not, we
842 	 * increment the link count and force the inode to disk to make sure
843 	 * that it is there before any directory entry that points to it.
844 	 *
845 	 * In the case of a symbolic link, we are dealing with a new inode
846 	 * which does not yet have any links.  We've created it with a link
847 	 * count of 1, and we don't want to increment it since this will be
848 	 * its first link.
849 	 *
850 	 * We are about to push the inode to disk. We make sure
851 	 * that the inode's data blocks are flushed first so the
852 	 * inode and its data blocks are always in sync.  This
853 	 * adds some robustness in the event of a power failure
854 	 * or panic where sync fails. If we panic before the
855 	 * inode is updated, then the inode still refers to the
856 	 * old data blocks (or none for a new file). If we panic
857 	 * after the inode is updated, then the inode refers to
858 	 * the new data blocks.
859 	 *
860 	 * We do this before grabbing the i_contents lock because
861 	 * ufs_syncip() will want that lock. We could do the data
862 	 * syncing after the removal checks, but upon return from
863 	 * the data sync we would have to repeat the removal
864 	 * checks.
865 	 */
866 	if (err = TRANS_SYNCIP(sip, 0, I_DSYNC, TOP_FSYNC)) {
867 		return (err);
868 	}
869 
870 	rw_enter(&sip->i_contents, RW_WRITER);
871 	if (sip->i_nlink <= 0) {
872 		rw_exit(&sip->i_contents);
873 		return (ENOENT);
874 	}
875 	if (sip->i_nlink == MAXLINK) {
876 		rw_exit(&sip->i_contents);
877 		return (EMLINK);
878 	}
879 
880 	/*
881 	 * Sync the indirect blocks associated with the file
882 	 * for the same reasons as described above.  Since this
883 	 * call wants the i_contents lock held for it we can do
884 	 * this here with no extra work.
885 	 */
886 	if (err = ufs_sync_indir(sip)) {
887 		rw_exit(&sip->i_contents);
888 		return (err);
889 	}
890 
	/* every failure from here on must leave through out/out2 to undo this */
891 	if (op != DE_SYMLINK)
892 		sip->i_nlink++;
893 	TRANS_INODE(sip->i_ufsvfs, sip);
894 	sip->i_flag |= ICHG;
895 	sip->i_seq++;
896 	ufs_iupdat(sip, I_SYNC);
897 	rw_exit(&sip->i_contents);
898 
899 	/*
900 	 * If target directory has not been removed, then we can consider
901 	 * allowing file to be created.
902 	 */
903 	if (tdp->i_nlink <= 0) {
904 		err = ENOENT;
905 		goto out2;
906 	}
907 
908 	/*
909 	 * Check accessibility of target directory.
910 	 */
911 	if (err = ufs_diraccess(tdp, IEXEC, cr))
912 		goto out2;
913 
914 	/*
915 	 * Search for the entry. Return VN_HELD tip if found.
916 	 */
917 	tip = NULL;
918 	slot.status = NONE;
919 	slot.fbp = NULL;
920 	rw_enter(&tdp->i_ufsvfs->vfs_dqrwlock, RW_READER);
921 	rw_enter(&tdp->i_contents, RW_WRITER);
922 	err = ufs_dircheckforname(tdp, namep, namlen, &slot, &tip, cr, 0);
923 	if (err)
924 		goto out;
925 
926 	if (tip) {
927 		switch (op) {
928 		case DE_RENAME:
929 			err = ufs_dirrename(sdp, sip, tdp, namep,
930 			    tip, &slot, cr);
931 			break;
932 
933 		case DE_LINK:
934 		case DE_SYMLINK:
935 			/*
936 			 * Can't link to an existing file.
937 			 */
938 			err = EEXIST;
939 			break;
940 		default:
941 			break;
942 		}
943 	} else {
944 		/*
945 		 * The entry does not exist. Check write permission in
946 		 * directory to see if entry can be created.
947 		 */
948 		if (err = ufs_iaccess(tdp, IWRITE, cr, 0))
949 			goto out;
950 		err = ufs_diraddentry(tdp, namep, op, namlen, &slot, sip, sdp,
951 		    cr);
952 	}
953 
954 out:
955 	if (slot.fbp)
956 		fbrelse(slot.fbp, S_OTHER);
957 
958 	rw_exit(&tdp->i_contents);
959 
960 	/*
961 	 * Drop vfs_dqrwlock before calling VN_RELE() on tip to
962 	 * avoid deadlock since ufs_delete() grabs vfs_dqrwlock as reader.
963 	 */
964 	rw_exit(&tdp->i_ufsvfs->vfs_dqrwlock);
965 
966 	/*
967 	 * If we renamed a file over the top of an existing file,
968 	 * or linked a file to an existing file (or tried to),
969 	 * then release and delete (or just release) the inode.
970 	 */
971 	if (tip)
972 		VN_RELE(ITOV(tip));
973 
	/* reached directly (skipping out:) when no tdp locks were taken */
974 out2:
975 	if (err) {
976 		/*
977 		 * Undo bumped link count.
978 		 */
979 		if (op != DE_SYMLINK) {
980 			rw_enter(&sip->i_contents, RW_WRITER);
981 			sip->i_nlink--;
982 			ufs_setreclaim(sip);
983 			TRANS_INODE(sip->i_ufsvfs, sip);
984 			sip->i_flag |= ICHG;
985 			sip->i_seq++;
986 			ITIMES_NOLOCK(sip);
987 			rw_exit(&sip->i_contents);
988 		}
989 	}
990 	return (err);
991 }
992 
993 /*
994  * Check for the existence of a name in a directory (unless noentry
995  * is set), or else of an empty
996  * slot in which an entry may be made.  If the requested name is found,
997  * then on return *ipp points at the inode and *offp contains
998  * its offset in the directory.  If the name is not found, then *ipp
999  * will be NULL and *slotp will contain information about a directory slot in
1000  * which an entry may be made (either an empty slot, or the first position
1001  * past the end of the directory).
1002  * The target directory inode (tdp) is supplied write locked (i_rwlock).
1003  *
1004  * This may not be used on "." or "..", but aliases of "." are ok.
1005  */
/*
 * Returns 0 on success, whether or not the name was found; otherwise the
 * error from ufs_iget_alloced() or blkatoff().  On a successful return
 * slotp->status is one of:
 *	EXIST	the name was found; *ipp is set and the directory block
 *		holding the entry is left held in slotp->fbp.
 *	FOUND	a slot big enough for the name was located (or the caller
 *		passed FOUND in to say it does not want a slot).
 *	NONE	no slot was found; slotp->offset is the rounded-up end of
 *		the directory.
 * slotp->cached is set to 1 when the answer came from the dnlc directory
 * cache, or when this scan completed building that cache, and 0 otherwise.
 */
1006 int
1007 ufs_dircheckforname(
1008 	struct inode *tdp,	/* inode of directory being checked */
1009 	char *namep,		/* name we're checking for */
1010 	int namlen,		/* length of name, excluding null */
1011 	struct ufs_slot *slotp,	/* slot structure */
1012 	struct inode **ipp,	/* return inode if we find one */
1013 	struct cred *cr,
1014 	int noentry)		/* noentry - just look for space */
1015 {
1016 	uint64_t handle;
1017 	struct fbuf *fbp;	/* pointer to directory block */
1018 	struct direct *ep;	/* directory entry */
1019 	struct direct *nep;	/* next directory entry */
1020 	dcanchor_t *dcap;
1021 	vnode_t *dvp;		/* directory vnode ptr */
1022 	off_t dirsize;		/* size of the directory */
1023 	off_t offset;		/* offset in the directory */
1024 	off_t last_offset;	/* last offset */
1025 	off_t enduseful;	/* pointer past last used dir slot */
1026 	int entryoffsetinblk;	/* offset of ep in fbp's buffer */
1027 	int i;			/* length of mangled entry */
1028 	int needed;
1029 	int err;
1030 	int first;
1031 	int caching;
1032 	int stat;
1033 	ino_t ep_ino;
1034 	slotstat_t initstat = slotp->status;
1035
1036 	ASSERT(RW_WRITE_HELD(&tdp->i_rwlock));
1037 	ASSERT(RW_WRITE_HELD(&tdp->i_contents));
1038 	ASSERT(*ipp == NULL);
1039 	fbp = NULL;
1040
1041 	/*
1042 	 * First check if there is a complete cache of the directory.
1043 	 */
1044 	dvp = ITOV(tdp);
1045
1046 	dcap = &tdp->i_danchor;
1047 	if (noentry) {
1048 		/*
1049 		 * We know from the 1st level dnlc cache that the entry
1050 		 * doesn't exist, so don't bother searching the directory
1051 		 * cache, but just look for space (possibly in the directory
1052 		 * cache).
1053 		 */
1054 		stat = DNOENT;
1055 	} else {
1056 		stat = dnlc_dir_lookup(dcap, namep, &handle);
1057 	}
	/*
	 * Any status other than DFOUND or DNOENT, and every "break" below,
	 * falls through to the full directory scan after the switch.
	 */
1058 	switch (stat) {
1059 	case DFOUND:
		/* The handle packs the entry's inode number and an offset. */
1060 		ep_ino = (ino_t)H_TO_INO(handle);
1061 		if (tdp->i_number == ep_ino) {
1062 			*ipp = tdp;	/* we want ourself, ie "." */
1063 			VN_HOLD(dvp);
1064 		} else {
1065 			err = ufs_iget_alloced(tdp->i_vfs, ep_ino, ipp, cr);
1066 			if (err)
1067 				return (err);
1068 		}
1069 		offset = H_TO_OFF(handle);
1070 		first = 0;
1071 		if (offset & 1) {
1072 			/* This is the first entry in the block */
1073 			first = 1;
1074 			offset -= 1;
1075 			ASSERT((offset & (DIRBLKSIZ - 1)) == 0);
1076 		}
1077 		err = blkatoff(tdp, offset, (char **)&ep, &fbp);
1078 		if (err) {
1079 			VN_RELE(ITOV(*ipp));
1080 			*ipp = NULL;
1081 			return (err);
1082 		}
1083 		/*
1084 		 * Check the validity of the entry.
1085 		 * If it's bad, then throw away the cache and
1086 		 * continue without it. The dirmangled() routine
1087 		 * will then be called upon it.
1088 		 */
1089 		if ((ep->d_reclen == 0) || (ep->d_reclen & 0x3)) {
1090 			VN_RELE(ITOV(*ipp));
1091 			*ipp = NULL;
1092 			dnlc_dir_purge(dcap);
			/*
			 * fbp is still held here; the scan loop below
			 * releases it before reading its first block.
			 */
1093 			break;
1094 		}
1095 		/*
1096 		 * Remember the returned offset is the offset of the
1097 		 * preceding record (unless this is the 1st record
1098 		 * in the DIRBLKSIZ sized block (disk sector)), then it's
1099 		 * offset + 1. Note, no real offsets are on odd boundaries.
1100 		 */
1101 		if (first) {
1102 			ASSERT((offset & (DIRBLKSIZ - 1)) == 0);
1103 			slotp->offset = offset;
1104 			slotp->size = 0;
1105 			slotp->ep = ep;
1106 		} else {
1107 			/* get the next entry */
1108 			nep = (struct direct *)((char *)ep + ep->d_reclen);
1109 			/*
1110 			 * Check the validity of this entry as well
1111 			 * If it's bad, then throw away the cache and
1112 			 * continue without it. The dirmangled() routine
1113 			 * will then be called upon it.
1114 			 */
1115 			if ((nep->d_reclen == 0) || (nep->d_reclen & 0x3) ||
1116 			    (nep->d_ino != ep_ino)) {
1117 				VN_RELE(ITOV(*ipp));
1118 				*ipp = NULL;
1119 				dnlc_dir_purge(dcap);
1120 				break;
1121 			}
			/*
			 * slotp->size carries the length of the preceding
			 * record; slotp->offset is the entry's own offset.
			 */
1122 			slotp->offset = offset + ep->d_reclen;
1123 			slotp->size = ep->d_reclen;
1124 			slotp->ep = nep;
1125 		}
1126 		slotp->status = EXIST;
1127 		slotp->fbp = fbp;
1128 		slotp->endoff = 0;
1129 		slotp->cached = 1;
1130 		dnlc_update(dvp, namep, ITOV(*ipp));
1131 		return (0);
1132 	case DNOENT:
1133 		/*
1134 		 * The caller gets to set the initial slot status to
1135 		 * indicate whether it's interested in getting a
1136 		 * empty slot. For example, the status can be set
1137 		 * to FOUND when an entry is being deleted.
1138 		 */
1139 		ASSERT(slotp->fbp == NULL);
1140 		if (slotp->status == FOUND) {
1141 			return (0);
1142 		}
1143 		switch (dnlc_dir_rem_space_by_len(dcap, LDIRSIZ(namlen),
1144 		    &handle)) {
1145 		case DFOUND:
1146 			offset = (off_t)handle;
1147 			err = blkatoff(tdp, offset, (char **)&ep, &fbp);
1148 			if (err) {
1149 				dnlc_dir_purge(dcap);
1150 				ASSERT(*ipp == NULL);
1151 				return (err);
1152 			}
1153 			/*
1154 			 * Check the validity of the entry.
1155 			 * If it's bad, then throw away the cache and
1156 			 * continue without it. The dirmangled() routine
1157 			 * will then be called upon it.
1158 			 */
1159 			if ((ep->d_reclen == 0) || (ep->d_reclen & 0x3)) {
1160 				dnlc_dir_purge(dcap);
1161 				break;
1162 			}
1163 			/*
1164 			 * Remember the returned offset is the offset of the
1165 			 * containing record.
1166 			 */
1167 			slotp->status = FOUND;
1168 			slotp->ep = ep;
1169 			slotp->offset = offset;
1170 			slotp->fbp = fbp;
1171 			slotp->size = ep->d_reclen;
1172 			/*
1173 			 * Set end offset to 0. Truncation is handled
1174 			 * because the dnlc cache will blow away the
1175 			 * cached directory when an entry is removed
1176 			 * that drops the entries left to less than half
1177 			 * the minimum number (dnlc_min_dir_cache).
1178 			 */
1179 			slotp->endoff = 0;
1180 			slotp->cached = 1;
1181 			return (0);
1182 		case DNOENT:
			/* No cached free space: the entry goes at the end. */
1183 			slotp->status = NONE;
1184 			slotp->offset = P2ROUNDUP_TYPED(tdp->i_size,
1185 			    DIRBLKSIZ, u_offset_t);
1186 			slotp->size = DIRBLKSIZ;
1187 			slotp->endoff = 0;
1188 			slotp->cached = 1;
1189 			return (0);
1190 		default:
1191 			break;
1192 		}
1193 		break;
1194 	}
	/*
	 * The directory cache could not answer; scan the directory itself.
	 * "caching" is a plain int flag: non-zero while this scan is also
	 * building a new directory cache.
	 */
1195 	slotp->cached = 0;
1196 	caching = 0;
1197 	if (!noentry && tdp->i_size >= ufs_min_dir_cache) {
1198 		/*
1199 		 * if the directory caching disable time has expired
1200 		 * enable caching again.
1201 		 */
1202 		if (tdp->i_cachedir == CD_DISABLED_NOMEM &&
1203 		    gethrtime() - ufs_dc_disable_at > ufs_dc_disable_duration) {
1204 			ufs_dc_disable_at = 0;
1205 			tdp->i_cachedir = CD_ENABLED;
1206 		}
1207 		/*
1208 		 * Attempt to cache any directories greater than the tunable
1209 		 * ufs_min_cache_dir. If it fails due to memory shortage
1210 		 * (DNOMEM), disable caching for this directory and record
1211 		 * the system time. Any attempt after the disable time has
1212 		 * expired will enable the caching again.
1213 		 */
1214 		if (tdp->i_cachedir == CD_ENABLED) {
1215 			switch (dnlc_dir_start(dcap,
1216 			    tdp->i_size >> AV_DIRECT_SHIFT)) {
1217 			case DNOMEM:
1218 				tdp->i_cachedir = CD_DISABLED_NOMEM;
1219 				ufs_dc_disable_at = gethrtime();
1220 				break;
1221 			case DTOOBIG:
1222 				tdp->i_cachedir = CD_DISABLED_TOOBIG;
1223 				break;
1224 			case DOK:
1225 				caching = 1;
1226 				break;
1227 			default:
1228 				break;
1229 			}
1230 		}
1231 	}
1232
1233 	/*
1234 	 * No point in using i_diroff since we must search whole directory
1235 	 */
1236 	dirsize = P2ROUNDUP_TYPED(tdp->i_size, DIRBLKSIZ, u_offset_t);
1237 	enduseful = 0;
1238 	offset = last_offset = 0;
1239 	entryoffsetinblk = 0;
1240 	needed = (int)LDIRSIZ(namlen);
1241 	while (offset < dirsize) {
1242 		/*
1243 		 * If offset is on a block boundary,
1244 		 * read the next directory block.
1245 		 * Release previous if it exists.
1246 		 */
1247 		if (blkoff(tdp->i_fs, offset) == 0) {
1248 			if (fbp != NULL)
1249 				fbrelse(fbp, S_OTHER);
1250
1251 			err = blkatoff(tdp, offset, (char **)0, &fbp);
1252 			if (err) {
1253 				ASSERT(*ipp == NULL);
1254 				if (caching) {
1255 					dnlc_dir_purge(dcap);
1256 				}
1257 				return (err);
1258 			}
1259 			entryoffsetinblk = 0;
1260 		}
1261 		/*
1262 		 * If still looking for a slot, and at a DIRBLKSIZ
1263 		 * boundary, have to start looking for free space
1264 		 * again.
1265 		 */
1266 		if (slotp->status == NONE &&
1267 		    (entryoffsetinblk & (DIRBLKSIZ - 1)) == 0) {
1268 			slotp->offset = -1;
1269 		}
1270 		/*
1271 		 * If the next entry is a zero length record or if the
1272 		 * record length is invalid, then skip to the next
1273 		 * directory block.  Complete validation checks are
1274 		 * done if the record length is invalid.
1275 		 *
1276 		 * Full validation checks are slow so they are disabled
1277 		 * by default.  Complete checks can be run by patching
1278 		 * "dirchk" to be true.
1279 		 *
1280 		 * We do not have to check the validity of
1281 		 * entryoffsetinblk here because it starts out as zero
1282 		 * and is only incremented by d_reclen values that we
1283 		 * validate here.
1284 		 */
1285 		ep = (struct direct *)(fbp->fb_addr + entryoffsetinblk);
		/*
		 * The && group is parenthesized explicitly; it binds the
		 * same way the unparenthesized form did.
		 */
1286 		if (ep->d_reclen == 0 ||
1287 		    ((dirchk || (ep->d_reclen & 0x3)) &&
1288 		    dirmangled(tdp, ep, entryoffsetinblk, offset))) {
			/* Skip ahead to the next DIRBLKSIZ boundary. */
1289 			i = DIRBLKSIZ - (entryoffsetinblk & (DIRBLKSIZ - 1));
1290 			offset += i;
1291 			entryoffsetinblk += i;
1292 			if (caching) {
1293 				dnlc_dir_purge(dcap);
1294 				caching = 0;
1295 			}
1296 			continue;
1297 		}
1298
1299 		/*
1300 		 * Add named entries and free space into the directory cache
1301 		 */
1302 		if (caching) {
1303 			ushort_t extra;
1304 			off_t off2;
1305
1306 			if (ep->d_ino == 0) {
1307 				extra = ep->d_reclen;
				/*
				 * An unused entry that is not at the start
				 * of a DIRBLKSIZ block stops the caching.
				 */
1308 				if (offset & (DIRBLKSIZ - 1)) {
1309 					dnlc_dir_purge(dcap);
1310 					caching = 0;
1311 				}
1312 			} else {
1313 				/*
1314 				 * entries hold the previous offset if
1315 				 * not the 1st one
1316 				 */
1317 				if (offset & (DIRBLKSIZ - 1)) {
1318 					off2 = last_offset;
1319 				} else {
1320 					off2 = offset + 1;
1321 				}
1322 				caching = (dnlc_dir_add_entry(dcap, ep->d_name,
1323 				    INO_OFF_TO_H(ep->d_ino, off2)) == DOK);
1324 				extra = ep->d_reclen - DIRSIZ(ep);
1325 			}
1326 			if (caching && (extra >= LDIRSIZ(1))) {
1327 				caching = (dnlc_dir_add_space(dcap, extra,
1328 				    (uint64_t)offset) == DOK);
1329 			}
1330 		}
1331
1332 		/*
1333 		 * If an appropriate sized slot has not yet been found,
1334 		 * check to see if one is available.
1335 		 */
1336 		if ((slotp->status != FOUND) && (slotp->status != EXIST)) {
1337 			int size = ep->d_reclen;
1338
1339 			if (ep->d_ino != 0)
1340 				size -= DIRSIZ(ep);
1341 			if (size > 0) {
1342 				if (size >= needed) {
1343 					slotp->offset = offset;
1344 					slotp->size = ep->d_reclen;
1345 					if (noentry) {
						/*
						 * Only space was wanted:
						 * return at once with the
						 * block held in slotp->fbp.
						 */
1346 						slotp->ep = ep;
1347 						slotp->fbp = fbp;
1348 						slotp->status = FOUND;
1349 						slotp->endoff = 0;
1350 						return (0);
1351 					}
1352 					slotp->status = FOUND;
1353 				} else if (slotp->status == NONE) {
1354 					if (slotp->offset == -1)
1355 						slotp->offset = offset;
1356 				}
1357 			}
1358 		}
1359 		/*
1360 		 * Check for a name match.
1361 		 */
1362 		if (ep->d_ino && ep->d_namlen == namlen &&
1363 		    *namep == *ep->d_name &&	/* fast chk 1st char */
1364 		    bcmp(namep, ep->d_name, namlen) == 0) {
1365
1366 			tdp->i_diroff = offset;
1367
1368 			if (tdp->i_number == ep->d_ino) {
1369 				*ipp = tdp;	/* we want ourself, ie "." */
1370 				VN_HOLD(dvp);
1371 			} else {
1372 				err = ufs_iget_alloced(tdp->i_vfs,
1373 				    (ino_t)ep->d_ino, ipp, cr);
1374 				if (err) {
1375 					fbrelse(fbp, S_OTHER);
1376 					if (caching)
1377 						dnlc_dir_purge(dcap);
1378 					return (err);
1379 				}
1380 			}
1381 			slotp->status = EXIST;
1382 			slotp->offset = offset;
1383 			slotp->size = (int)(offset - last_offset);
1384 			slotp->fbp = fbp;
1385 			slotp->ep = ep;
1386 			slotp->endoff = 0;
			/*
			 * The scan stops here without reaching the end of
			 * the directory, so the cache being built is purged.
			 */
1387 			if (caching)
1388 				dnlc_dir_purge(dcap);
1389 			return (0);
1390 		}
1391 		last_offset = offset;
1392 		offset += ep->d_reclen;
1393 		entryoffsetinblk += ep->d_reclen;
1394 		if (ep->d_ino)
1395 			enduseful = offset;
1396 	}
	/*
	 * Name not found.  The last block is released here, so on the
	 * returns below slotp->fbp has not been set by this scan.
	 */
1397 	if (fbp) {
1398 		fbrelse(fbp, S_OTHER);
1399 	}
1400
1401 	if (caching) {
1402 		dnlc_dir_complete(dcap);
1403 		slotp->cached = 1;
1404 		if (slotp->status == FOUND) {
1405 			if (initstat == FOUND) {
1406 				return (0);
1407 			}
			/*
			 * The slot found by this scan was also entered in
			 * the new cache as free space; take it back out.
			 * NOTE(review): presumably because the caller is
			 * about to use the slot -- confirm against callers.
			 */
1408 			(void) dnlc_dir_rem_space_by_handle(dcap,
1409 			    slotp->offset);
1410 			slotp->endoff = 0;
1411 			return (0);
1412 		}
1413 	}
1414
1415 	if (slotp->status == NONE) {
1416 		/*
1417 		 * We didn't find a slot; the new directory entry should be put
1418 		 * at the end of the directory.  Return an indication of where
1419 		 * this is, and set "endoff" to zero; since we're going to have
1420 		 * to extend the directory, we're certainly not going to
1421 		 * truncate it.
1422 		 */
1423 		slotp->offset = dirsize;
1424 		slotp->size = DIRBLKSIZ;
1425 		slotp->endoff = 0;
1426 	} else {
1427 		/*
1428 		 * We found a slot, and will return an indication of where that
1429 		 * slot is, as any new directory entry will be put there.
1430 		 * Since that slot will become a useful entry, if the last
1431 		 * useful entry we found was before this one, update the offset
1432 		 * of the last useful entry.
1433 		 */
1434 		if (enduseful < slotp->offset + slotp->size)
1435 			enduseful = slotp->offset + slotp->size;
1436 		slotp->endoff = P2ROUNDUP_TYPED(enduseful, DIRBLKSIZ, off_t);
1437 	}
1438 	*ipp = NULL;
1439 	return (0);
1440 }
1441 
/*
 * Counts how many times ufs_dirrename() had to drop both inode locks and
 * restart its lock acquisition.  Incremented without any lock of its own.
 */
1442 uint64_t ufs_dirrename_retry_cnt;
1443
1444 /*
1445  * Rename the entry in the directory tdp so that it points to
1446  * sip instead of tip.
 *
 * The caller must hold tdp's i_rwlock and i_contents as writer (asserted).
 * This routine takes tip->i_contents as writer and sip->i_contents as
 * reader, and drops both before returning on every path past the ESAME
 * check.
 *
 * Returns 0 on success, or:
 *	ESAME	sip and tip have the same inode number
 *	EXDEV	tip is not on the same vfs as tdp and sip
 *	EISDIR	tip is a directory but sip is not
 *	ENOTDIR	sip is a directory but tip is not
 *	EBUSY	tip could not be vfs-locked, or is a mount point
 *	EEXIST	tip is a directory that is not empty or has extra links
 * or the error from ufs_iaccess(), ufs_sticky_remove_access(),
 * TRANS_DIR(), ufs_fbwrite(), ufs_fault() or ufs_dirfixdotdot().
 *
 * slotp->fbp is written or released, and set to NULL, once the entry has
 * been rewritten; on the earlier failure paths it is left untouched.
1447  */
1448 static int
1449 ufs_dirrename(
1450 	struct inode *sdp,	/* parent directory of source */
1451 	struct inode *sip,	/* source inode */
1452 	struct inode *tdp,	/* parent directory of target */
1453 	char *namep,		/* entry we are trying to change */
1454 	struct inode *tip,	/* target inode */
1455 	struct ufs_slot *slotp,	/* slot for entry */
1456 	struct cred *cr)	/* credentials */
1457 {
1458 	vnode_t *tdvp;
1459 	off_t offset;
1460 	int err;
1461 	int doingdirectory;
1462
1463 	ASSERT(sdp->i_ufsvfs != NULL);
1464 	ASSERT(RW_WRITE_HELD(&tdp->i_rwlock));
1465 	ASSERT(RW_WRITE_HELD(&tdp->i_contents));
1466 	/*
1467 	 * Short circuit rename of something to itself.
1468 	 */
1469 	if (sip->i_number == tip->i_number) {
1470 		return (ESAME); /* special KLUDGE error code */
1471 	}
1472
1473 	/*
1474 	 * We're locking 2 peer level locks, so must use tryenter
1475 	 * on the 2nd to avoid deadlocks that would occur
1476 	 * if we renamed a->b and b->a concurrently.
1477 	 */
1478 retry:
1479 	rw_enter(&tip->i_contents, RW_WRITER);
1480 	if (!rw_tryenter(&sip->i_contents, RW_READER)) {
1481 		/*
1482 		 * drop tip and wait (sleep) until we stand a chance
1483 		 * of holding sip
1484 		 */
1485 		rw_exit(&tip->i_contents);
1486 		rw_enter(&sip->i_contents, RW_READER);
1487 		/*
1488 		 * Reverse the lock grabs in case we have heavy
1489 		 * contention on the 2nd lock.
1490 		 */
1491 		if (!rw_tryenter(&tip->i_contents, RW_WRITER)) {
1492 			ufs_dirrename_retry_cnt++;
1493 			rw_exit(&sip->i_contents);
1494 			goto retry;
1495 		}
1496 	}
	/*
	 * From here on both tip->i_contents (writer) and sip->i_contents
	 * (reader) are held; every exit must drop both.
	 */
1497
1498 	/*
1499 	 * Check that everything is on the same filesystem.
1500 	 */
1501 	if ((ITOV(tip)->v_vfsp != ITOV(tdp)->v_vfsp) ||
1502 	    (ITOV(tip)->v_vfsp != ITOV(sip)->v_vfsp)) {
1503 		err = EXDEV;		/* XXX archaic */
1504 		goto out;
1505 	}
1506 	/*
1507 	 * Must have write permission to rewrite target entry.
1508 	 * Perform additional checks for sticky directories.
1509 	 */
1510 	if ((err = ufs_iaccess(tdp, IWRITE, cr, 0)) != 0 ||
1511 	    (err = ufs_sticky_remove_access(tdp, tip, cr)) != 0)
1512 		goto out;
1513
1514 	/*
1515 	 * Ensure source and target are compatible (both directories
1516 	 * or both not directories).  If target is a directory it must
1517 	 * be empty and have no links to it; in addition it must not
1518 	 * be a mount point, and both the source and target must be
1519 	 * writable.
1520 	 */
1521 	doingdirectory = (((sip->i_mode & IFMT) == IFDIR) ||
1522 	    ((sip->i_mode & IFMT) == IFATTRDIR));
1523 	if (((tip->i_mode & IFMT) == IFDIR) ||
1524 	    ((tip->i_mode & IFMT) == IFATTRDIR)) {
1525 		if (!doingdirectory) {
1526 			err = EISDIR;
1527 			goto out;
1528 		}
1529 		/*
1530 		 * vn_vfsrlock will prevent mounts from using the directory
1531 		 * until we are done.
1532 		 */
1533 		if (vn_vfsrlock(ITOV(tip))) {
1534 			err = EBUSY;
1535 			goto out;
1536 		}
		/*
		 * The vfs lock on tip is now held whenever doingdirectory
		 * is set; each failure below releases it before leaving.
		 */
1537 		if (vn_mountedvfs(ITOV(tip)) != NULL) {
1538 			vn_vfsunlock(ITOV(tip));
1539 			err = EBUSY;
1540 			goto out;
1541 		}
1542 		if (!ufs_dirempty(tip, tdp->i_number, cr) || tip->i_nlink > 2) {
1543 			vn_vfsunlock(ITOV(tip));
1544 			err = EEXIST;	/* SIGH should be ENOTEMPTY */
1545 			goto out;
1546 		}
1547 	} else if (doingdirectory) {
1548 		err = ENOTDIR;
1549 		goto out;
1550 	}
1551
1552 	/*
1553 	 * Rewrite the inode pointer for target name entry
1554 	 * from the target inode (tip) to the source inode (sip).
1555 	 * This prevents the target entry from disappearing
1556 	 * during a crash. Mark the directory inode to reflect the changes.
1557 	 */
1558 	tdvp = ITOV(tdp);
1559 	slotp->ep->d_ino = (int32_t)sip->i_number;
1560 	dnlc_update(tdvp, namep, ITOV(sip));
	/*
	 * Compute the offset stored in the directory cache handle: the
	 * offset of the preceding record when slotp->size gives its
	 * length, otherwise the entry's own offset plus one.
	 */
1561 	if (slotp->size) {
1562 		offset = slotp->offset - slotp->size;
1563 	} else {
1564 		offset = slotp->offset + 1;
1565 	}
1566 	if (slotp->cached) {
1567 		(void) dnlc_dir_update(&tdp->i_danchor, namep,
1568 		    INO_OFF_TO_H(slotp->ep->d_ino, offset));
1569 	}
1570
1571 	err = TRANS_DIR(tdp, slotp->offset);
1572 	if (err)
1573 		fbrelse(slotp->fbp, S_OTHER);
1574 	else
1575 		err = ufs_fbwrite(slotp->fbp, tdp);
1576
1577 	slotp->fbp = NULL;
1578 	if (err) {
1579 		if (doingdirectory)
1580 			vn_vfsunlock(ITOV(tip));
1581 		goto out;
1582 	}
1583
1584 	TRANS_INODE(tdp->i_ufsvfs, tdp);
1585 	tdp->i_flag |= IUPD|ICHG;
1586 	tdp->i_seq++;
1587 	ITIMES_NOLOCK(tdp);
1588
1589 	/*
1590 	 * Decrement the link count of the target inode.
1591 	 * Fix the ".." entry in sip to point to dp.
1592 	 * This is done after the new entry is on the disk.
1593 	 */
1594 	tip->i_nlink--;
1595 	TRANS_INODE(tip->i_ufsvfs, tip);
1596 	tip->i_flag |= ICHG;
1597 	tip->i_seq++;
1598 	ITIMES_NOLOCK(tip);
1599 	if (doingdirectory) {
1600 		/*
1601 		 * The entry for tip no longer exists so I can unlock the
1602 		 * vfslock.
1603 		 */
1604 		vn_vfsunlock(ITOV(tip));
1605 		/*
1606 		 * Decrement target link count once more if it was a directory.
1607 		 */
1608 		if (--tip->i_nlink != 0) {
1609 			err = ufs_fault(ITOV(tip),
1610 		    "ufs_dirrename: target directory link count != 0 (%s)",
1611 			    tip->i_fs->fs_fsmnt);
			/*
			 * Leave through "out" so that sip->i_contents is
			 * dropped as well as tip->i_contents; returning
			 * directly here would leave the reader hold on sip
			 * in place.
			 */
1612 			goto out;
1614 		}
1615 		TRANS_INODE(tip->i_ufsvfs, tip);
1616 		ufs_setreclaim(tip);
1617 		/*
1618 		 * Renaming a directory with the parent different
1619 		 * requires that ".." be rewritten.  The window is
1620 		 * still there for ".." to be inconsistent, but this
1621 		 * is unavoidable, and a lot shorter than when it was
1622 		 * done in a user process.  We decrement the link
1623 		 * count in the new parent as appropriate to reflect
1624 		 * the just-removed target.  If the parent is the
1625 		 * same, this is appropriate since the original
1626 		 * directory is going away.  If the new parent is
1627 		 * different, ufs_dirfixdotdot() will bump the link count
1628 		 * back.
1629 		 */
1630 		tdp->i_nlink--;
1631 		ufs_setreclaim(tdp);
1632 		TRANS_INODE(tdp->i_ufsvfs, tdp);
1633 		tdp->i_flag |= ICHG;
1634 		tdp->i_seq++;
1635 		ITIMES_NOLOCK(tdp);
1636 		if (sdp != tdp) {
			/*
			 * Both inode locks are dropped before the call;
			 * ufs_dirfixdotdot() takes sip's locks itself.
			 */
1637 			rw_exit(&tip->i_contents);
1638 			rw_exit(&sip->i_contents);
1639 			err = ufs_dirfixdotdot(sip, sdp, tdp);
1640 			return (err);
1641 		}
1642 	} else
1643 		ufs_setreclaim(tip);
1644 out:
1645 	rw_exit(&tip->i_contents);
1646 	rw_exit(&sip->i_contents);
1647 	return (err);
1648 }
1649 
1650 /*
1651  * Fix the ".." entry of the child directory so that it points
1652  * to the new parent directory instead of the old one.  Routine
1653  * assumes that dp is a directory and that all the inodes are on
1654  * the same file system.
 *
 * The caller must hold npdp's i_rwlock and i_contents as writer (asserted).
 * This routine takes dp's i_rwlock and i_contents as writer while ".." is
 * rewritten, and afterwards opdp's i_contents as writer; all are dropped
 * before returning.
 *
 * Returns 0 on success, or:
 *	ENOENT	dp has no links left or is smaller than a dirtemplate
 *	ENOTDIR	the ".." entry in dp's first block is mangled
 *	EMLINK	npdp already has MAXLINK links
 * or the error from blkatoff(), TRANS_DIR() or ufs_fbwrite().
 *
 * Once npdp->i_nlink has been incremented it is not decremented again on
 * the later failure paths (TRANS_DIR() or ufs_fbwrite() error).
1655  */
1656 static int
1657 ufs_dirfixdotdot(
1658 	struct inode *dp,	/* child directory */
1659 	struct inode *opdp,	/* old parent directory */
1660 	struct inode *npdp)	/* new parent directory */
1661 {
1662 	struct fbuf *fbp;
1663 	struct dirtemplate *dirp;
1664 	vnode_t *dvp;
1665 	int err;
1666
1667 	ASSERT(RW_WRITE_HELD(&npdp->i_rwlock));
1668 	ASSERT(RW_WRITE_HELD(&npdp->i_contents));
1669
1670 	/*
1671 	 * We hold the child directory's i_contents lock before calling
1672 	 * blkatoff so that we honor correct locking protocol which is
1673 	 * i_contents lock and then page lock. (blkatoff will call
1674 	 * ufs_getpage where we want the page lock)
1675 	 * We hold the child directory's i_rwlock before i_contents (as
1676 	 * per the locking protocol) since we are modifying the ".." entry
1677 	 * of the child directory.
1678 	 * We hold the i_rwlock and i_contents lock until we record
1679 	 * this directory delta to the log (via ufs_trans_dir) and have
1680 	 * done fbrelse.
1681 	 */
1682 	rw_enter(&dp->i_rwlock, RW_WRITER);
1683 	rw_enter(&dp->i_contents, RW_WRITER);
	/* Map the first directory block, where "." and ".." live. */
1684 	err = blkatoff(dp, (off_t)0, (char **)&dirp, &fbp);
1685 	if (err)
1686 		goto bad;
1687
1688 	if (dp->i_nlink <= 0 ||
1689 	    dp->i_size < sizeof (struct dirtemplate)) {
1690 		err = ENOENT;
1691 		goto bad;
1692 	}
1693
1694 	if (dirp->dotdot_namlen != 2 ||
1695 	    dirp->dotdot_name[0] != '.' ||
1696 	    dirp->dotdot_name[1] != '.') {	/* Sanity check. */
1697 		dirbad(dp, "mangled .. entry", (off_t)0);
1698 		err = ENOTDIR;
1699 		goto bad;
1700 	}
1701
1702 	/*
1703 	 * Increment the link count in the new parent inode and force it out.
1704 	 */
1705 	if (npdp->i_nlink == MAXLINK) {
1706 		err = EMLINK;
1707 		goto bad;
1708 	}
1709 	npdp->i_nlink++;
1710 	TRANS_INODE(npdp->i_ufsvfs, npdp);
1711 	npdp->i_flag |= ICHG;
1712 	npdp->i_seq++;
1713 	ufs_iupdat(npdp, I_SYNC);
1714
1715 	/*
1716 	 * Rewrite the child ".." entry and force it out.
1717 	 */
1718 	dvp = ITOV(dp);
1719 	dirp->dotdot_ino = (uint32_t)npdp->i_number;
	/* Keep the name cache and dp's directory cache in step. */
1720 	dnlc_update(dvp, "..", ITOV(npdp));
1721 	(void) dnlc_dir_update(&dp->i_danchor, "..",
1722 	    INO_OFF_TO_H(dirp->dotdot_ino, 0));
1723
	/*
	 * Log the directory delta, then write the block.  On either path
	 * the buffer is consumed here, so fbp is cleared to keep the
	 * "bad" path from releasing it a second time.
	 */
1724 	err = TRANS_DIR(dp, 0);
1725 	if (err)
1726 		fbrelse(fbp, S_OTHER);
1727 	else
1728 		err = ufs_fbwrite(fbp, dp);
1729
1730 	fbp = NULL;
1731 	if (err)
1732 		goto bad;
1733
1734 	rw_exit(&dp->i_contents);
1735 	rw_exit(&dp->i_rwlock);
1736
1737 	/*
1738 	 * Decrement the link count of the old parent inode and force it out.
1739 	 */
1740 	ASSERT(opdp);
1741 	rw_enter(&opdp->i_contents, RW_WRITER);
1742 	ASSERT(opdp->i_nlink > 0);
1743 	opdp->i_nlink--;
1744 	ufs_setreclaim(opdp);
1745 	TRANS_INODE(opdp->i_ufsvfs, opdp);
1746 	opdp->i_flag |= ICHG;
1747 	opdp->i_seq++;
1748 	ufs_iupdat(opdp, I_SYNC);
1749 	rw_exit(&opdp->i_contents);
1750 	return (0);
1751
1752 bad:
	/*
	 * NOTE(review): fbp has no initializer, so when blkatoff() itself
	 * fails this test relies on blkatoff() having stored NULL through
	 * its last argument -- confirm against blkatoff().
	 */
1753 	if (fbp)
1754 		fbrelse(fbp, S_OTHER);
1755 	rw_exit(&dp->i_contents);
1756 	rw_exit(&dp->i_rwlock);
1757 	return (err);
1758 }
1759 
1760 /*
1761  * Enter the file sip in the directory tdp with name namep.
 *
 *	tdp	directory receiving the entry; the caller must hold its
 *		i_rwlock and i_contents as writer (asserted)
 *	namep	name of the new entry
 *	op	operation being performed; only DE_RENAME is treated
 *		specially here
 *	namlen	length of namep
 *	slotp	slot description; its status must be NONE or FOUND (see
 *		dirprepareentry())
 *	sip	inode the new entry will refer to
 *	sdp	parent directory of sip, used when a directory is renamed
 *	cr	credentials
 *
 * Returns 0 on success, EXDEV if sip is on a different vfs than tdp, or
 * the error from dirprepareentry(), ufs_dirfixdotdot(), TRANS_DIR() or
 * ufs_fbwrite().  On every return slotp->fbp has been released or written
 * and set to NULL, except that it is left as it was when
 * dirprepareentry() fails without a block being held.
1762  */
1763 static int
1764 ufs_diraddentry(
1765 	struct inode *tdp,
1766 	char *namep,
1767 	enum de_op op,
1768 	int namlen,
1769 	struct ufs_slot *slotp,
1770 	struct inode *sip,
1771 	struct inode *sdp,
1772 	struct cred *cr)
1773 {
1774 	struct direct *ep, *nep;
1775 	vnode_t *tdvp;
1776 	dcanchor_t *dcap = &tdp->i_danchor;
1777 	off_t offset;
1778 	int err;
1779 	ushort_t extra;
1780
1781 	ASSERT(RW_WRITE_HELD(&tdp->i_rwlock));
1782 	ASSERT(RW_WRITE_HELD(&tdp->i_contents));
1783 	/*
1784 	 * Prepare a new entry.  If the caller has not supplied an
1785 	 * existing inode, make a new one.
1786 	 */
1787 	err = dirprepareentry(tdp, slotp, cr);
1788 	if (err) {
1789 		if (slotp->fbp) {
1790 			fbrelse(slotp->fbp, S_OTHER);
1791 			slotp->fbp = NULL;
1792 		}
1793 		return (err);
1794 	}
1795 	/*
1796 	 * Check inode to be linked to see if it is in the
1797 	 * same filesystem.
1798 	 */
1799 	if (ITOV(tdp)->v_vfsp != ITOV(sip)->v_vfsp) {
1800 		err = EXDEV;
1801 		goto bad;
1802 	}
1803
1804 	/*
1805 	 * If renaming a directory then fix up the ".." entry in the
1806 	 * directory to point to the new parent.
1807 	 */
1808 	if ((op == DE_RENAME) && (((sip->i_mode & IFMT) == IFDIR) ||
1809 	    ((sip->i_mode & IFMT) == IFATTRDIR)) && (sdp != tdp)) {
1810 		err = ufs_dirfixdotdot(sip, sdp, tdp);
1811 		if (err)
1812 			goto bad;
1813 	}
1814
1815 	/*
1816 	 * Fill in entry data.
1817 	 */
1818 	ep = slotp->ep;
1819 	ep->d_namlen = (ushort_t)namlen;
	/*
	 * The copy length is the smallest multiple of 4 greater than
	 * namlen.  Assuming namep is NUL-terminated at namlen, strncpy()
	 * zero-fills the remainder of that span.
	 */
1820 	(void) strncpy(ep->d_name, namep, (size_t)((namlen + 4) & ~3));
1821 	ep->d_ino = (uint32_t)sip->i_number;
1822 	tdvp = ITOV(tdp);
1823 	dnlc_update(tdvp, namep, ITOV(sip));
1824 	/*
1825 	 * Note the offset supplied for any named entry is
1826 	 * the offset of the previous one, unless it's the 1st.
1827 	 * slotp->size is used to pass the length to
1828 	 * the previous entry.
1829 	 */
1830 	if (slotp->size) {
1831 		offset = slotp->offset - slotp->size;
1832 	} else {
1833 		offset = slotp->offset + 1;
1834 	}
1835
1836 	if (slotp->cached) {
1837 		/*
1838 		 * Add back any usable unused space to the dnlc directory
1839 		 * cache.
1840 		 */
1841 		extra = ep->d_reclen - DIRSIZ(ep);
1842 		if (extra >= LDIRSIZ(1)) {
1843 			(void) dnlc_dir_add_space(dcap, extra,
1844 			    (uint64_t)slotp->offset);
1845 		}
1846
1847 		(void) dnlc_dir_add_entry(dcap, namep,
1848 		    INO_OFF_TO_H(ep->d_ino, offset));
1849
1850 		/* adjust the previous offset of the next entry */
1851 		nep = (struct direct *)((char *)ep + ep->d_reclen);
		/*
		 * The address of nep is tested rather than a file offset.
		 * NOTE(review): this presumes the mapped block is aligned
		 * to DIRBLKSIZ in memory -- confirm.
		 */
1852 		if ((uintptr_t)nep & (DIRBLKSIZ - 1)) {
1853 			/*
1854 			 * Not a new block.
1855 			 *
1856 			 * Check the validity of the next entry.
1857 			 * If it's bad, then throw away the cache, and
1858 			 * continue as before directory caching.
1859 			 */
1860 			if ((nep->d_reclen == 0) || (nep->d_reclen & 0x3) ||
1861 			    dnlc_dir_update(dcap, nep->d_name,
1862 			    INO_OFF_TO_H(nep->d_ino, slotp->offset))
1863 			    == DNOENT) {
1864 				dnlc_dir_purge(dcap);
1865 				slotp->cached = 0;
1866 			}
1867 		}
1868 	}
1869
1870 	/*
1871 	 * Write out the directory block.
1872 	 */
1873 	err = TRANS_DIR(tdp, slotp->offset);
1874 	if (err)
1875 		fbrelse(slotp->fbp, S_OTHER);
1876 	else
1877 		err = ufs_fbwrite(slotp->fbp, tdp);
1878
1879 	slotp->fbp = NULL;
1880 	/*
1881 	 * If this is a rename of a directory, then we have already
1882 	 * fixed the ".." entry to refer to the new parent. If err
1883 	 * is true at this point, we have failed to update the new
1884 	 * parent to refer to the renamed directory.
1885 	 * XXX - we need to unwind the ".." fix.
1886 	 */
1887 	if (err)
1888 		return (err);
1889
1890 	/*
1891 	 * Mark the directory inode to reflect the changes.
1892 	 * Truncate the directory to chop off blocks of empty entries.
1893 	 */
1894
1895 	TRANS_INODE(tdp->i_ufsvfs, tdp);
1896 	tdp->i_flag |= IUPD|ICHG;
1897 	tdp->i_seq++;
1898 	tdp->i_diroff = 0;
1899 	ITIMES_NOLOCK(tdp);
1900 	/*
1901 	 * If the directory grew then dirprepareentry() will have
1902 	 * set IATTCHG in tdp->i_flag, then the directory inode must
1903 	 * be flushed out. This is because if fsync() is used later
1904 	 * the directory size must be correct, otherwise a crash would
1905 	 * cause fsck to move the file to lost+found. Also because later
1906 	 * a file may be linked in more than one directory, then there
1907 	 * is no way to flush the original directory. So it must be
1908 	 * flushed out on creation. See bug 4293809.
1909 	 */
1910 	if (tdp->i_flag & IATTCHG) {
1911 		ufs_iupdat(tdp, I_SYNC);
1912 	}
1913
	/*
	 * Truncate only when a useful end offset short of the current
	 * size was recorded and TRANS_ISTRANS() is false for this file
	 * system; the result of ufs_itrunc() is deliberately ignored.
	 */
1914 	if (slotp->endoff && (slotp->endoff < tdp->i_size)) {
1915 		if (!TRANS_ISTRANS(tdp->i_ufsvfs)) {
1916 			(void) ufs_itrunc(tdp, (u_offset_t)slotp->endoff, 0,
1917 			    cr);
1918 		}
1919 	}
1920
1921
1922 	return (0);
1923
1924 bad:
	/*
	 * Reached only after dirprepareentry() succeeded.  When a
	 * directory cache is in use it is purged and the block released
	 * without being written; the prepared entry is not cleared on
	 * this path.
	 */
1925 	if (slotp->cached) {
1926 		dnlc_dir_purge(dcap);
1927 		fbrelse(slotp->fbp, S_OTHER);
1928 		slotp->cached = 0;
1929 		slotp->fbp = NULL;
1930 		return (err);
1931 	}
1932
1933 	/*
1934 	 * Clear out entry prepared by dirprepareent.
1935 	 */
1936 	slotp->ep->d_ino = 0;
1937 	slotp->ep->d_namlen = 0;
1938
1939 	/*
1940 	 * Don't touch err so we don't clobber the real error that got us here.
1941 	 */
1942 	if (TRANS_DIR(tdp, slotp->offset))
1943 		fbrelse(slotp->fbp, S_OTHER);
1944 	else
1945 		(void) ufs_fbwrite(slotp->fbp, tdp);
1946 	slotp->fbp = NULL;
1947 	return (err);
1948 }
1949 
1950 /*
1951  * Prepare a directory slot to receive an entry.
 *
 * On entry slotp->status must be NONE or FOUND (asserted) and the caller
 * must hold dp's i_rwlock and i_contents as writer (asserted).  On
 * success slotp->ep points at the record the new entry will occupy,
 * slotp->fbp holds its directory block, slotp->offset is that record's
 * offset, and slotp->size is the length of the record preceding it in the
 * block (0 when it is the first).
 *
 * Returns 0 on success, or the error from ufs_fault(), BMAPALLOC() or
 * blkatoff().  A size change already made to dp is not undone when
 * blkatoff() fails.
1952  */
1953 static int
1954 dirprepareentry(
1955 	struct inode *dp,	/* directory we are working in */
1956 	struct ufs_slot *slotp,	/* available slot info */
1957 	struct cred *cr)
1958 {
1959 	struct direct *ep, *nep;
1960 	off_t entryend;
1961 	int err;
1962 	slotstat_t status = slotp->status;
1963 	ushort_t dsize;
1964
1965 	ASSERT((status == NONE) || (status == FOUND));
1966 	ASSERT(RW_WRITE_HELD(&dp->i_rwlock));
1967 	ASSERT(RW_WRITE_HELD(&dp->i_contents));
1968 	/*
1969 	 * If we didn't find a slot, then indicate that the
1970 	 * new slot belongs at the end of the directory.
1971 	 * If we found a slot, then the new entry can be
1972 	 * put at slotp->offset.
1973 	 */
1974 	entryend = slotp->offset + slotp->size;
1975 	if (status == NONE) {
1976 		ASSERT((slotp->offset & (DIRBLKSIZ - 1)) == 0);
		/* A directory block must fit within one fragment. */
1977 		if (DIRBLKSIZ > dp->i_fs->fs_fsize) {
1978 			err = ufs_fault(ITOV(dp),
1979 			    "dirprepareentry: bad fs_fsize, DIRBLKSIZ: %d"
1980 			    " > dp->i_fs->fs_fsize: %d (%s)",
1981 			    DIRBLKSIZ, dp->i_fs->fs_fsize, dp->i_fs->fs_fsmnt);
1982 			return (err);
1983 		}
1984 		/*
1985 		 * Allocate the new block.
1986 		 */
1987 		err = BMAPALLOC(dp, (u_offset_t)slotp->offset,
1988 		    (int)(blkoff(dp->i_fs, slotp->offset) + DIRBLKSIZ), cr);
1989 		if (err) {
1990 			return (err);
1991 		}
		/*
		 * The directory now ends at the end of the new record.
		 * IATTCHG is set along with the size change; a comment in
		 * ufs_diraddentry() says it makes the inode get flushed.
		 */
1992 		dp->i_size = entryend;
1993 		TRANS_INODE(dp->i_ufsvfs, dp);
1994 		dp->i_flag |= IUPD|ICHG|IATTCHG;
1995 		dp->i_seq++;
1996 		ITIMES_NOLOCK(dp);
1997 	} else if (entryend > dp->i_size) {
1998 		/*
1999 		 * Adjust directory size, if needed. This should never
2000 		 * push the size past a new multiple of DIRBLKSIZ.
2001 		 * This is an artifact of the old (4.2BSD) way of initializing
2002 		 * directory sizes to be less than DIRBLKSIZ.
2003 		 */
2004 		dp->i_size = P2ROUNDUP_TYPED(entryend, DIRBLKSIZ, off_t);
2005 		TRANS_INODE(dp->i_ufsvfs, dp);
2006 		dp->i_flag |= IUPD|ICHG|IATTCHG;
2007 		dp->i_seq++;
2008 		ITIMES_NOLOCK(dp);
2009 	}
2010
2011 	/*
2012 	 * Get the block containing the space for the new directory entry.
2013 	 */
	/* Skipped when the caller already passed the block in slotp->fbp. */
2014 	if (slotp->fbp == NULL) {
2015 		err = blkatoff(dp, slotp->offset, (char **)&slotp->ep,
2016 		    &slotp->fbp);
2017 		if (err) {
2018 			return (err);
2019 		}
2020 	}
2021 	ep = slotp->ep;
2022
2023 	switch (status) {
2024 	case NONE:
2025 		/*
2026 		 * No space in the directory. slotp->offset will be on a
2027 		 * directory block boundary and we will write the new entry
2028 		 * into a fresh block.
2029 		 */
2030 		ep->d_reclen = DIRBLKSIZ;
2031 		slotp->size = 0; /* length of previous entry */
2032 		break;
2033 	case FOUND:
2034 		/*
2035 		 * An entry of the required size has been found. Use it.
2036 		 */
2037 		if (ep->d_ino == 0) {
2038 			/* this is the 1st record in a block */
2039 			slotp->size = 0; /* length of previous entry */
2040 		} else {
			/*
			 * Split the record in place: the existing entry
			 * shrinks to DIRSIZ(ep) and the rest of its space
			 * becomes a new record (nep) right after it, which
			 * the slot is moved to.
			 */
2041 			dsize = DIRSIZ(ep);
2042 			nep = (struct direct *)((char *)ep + dsize);
2043 			nep->d_reclen = ep->d_reclen - dsize;
2044 			ep->d_reclen = dsize;
2045 			slotp->ep = nep;
2046 			slotp->offset += dsize;
2047 			slotp->size = dsize; /* length of previous entry */
2048 		}
2049 		break;
2050 	default:
2051 		break;
2052 	}
2053 	return (0);
2054 }
2055 
/*
 * Allocate and initialize a new inode that will go into directory tdp.
 * This routine is called from ufs_symlink(), as well as within this file.
 *
 * Arguments:
 *	tdp	directory the new object is created in; the caller must
 *		hold both tdp->i_rwlock and tdp->i_contents as writer.
 *	ipp	out: set to NULL before the allocation is attempted and
 *		to the new inode once ufs_ialloc() has succeeded.  It is
 *		not reset when a later step fails and the inode is
 *		thrown away again.
 *	vap	attributes for the new object; AT_TYPE and AT_MODE must
 *		be set in va_mask.
 *	op	DE_CREATE, DE_MKDIR, DE_ATTRDIR or DE_SYMLINK.
 *	cr	credentials used for ownership and policy decisions.
 *
 * Returns 0 on success, otherwise an error number.  The new inode's
 * i_contents lock is taken in here and dropped again before returning.
 */
int
ufs_dirmakeinode(
	struct inode *tdp,
	struct inode **ipp,
	struct vattr *vap,
	enum de_op op,
	struct cred *cr)
{
	struct inode *ip;
	enum vtype type;
	int imode;			/* mode and format as in inode */
	ino_t ipref;
	int err;
	timestruc_t now;

	ASSERT(vap != NULL);
	ASSERT(op == DE_CREATE || op == DE_MKDIR || op == DE_ATTRDIR ||
	    op == DE_SYMLINK);
	ASSERT((vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE));
	ASSERT(RW_WRITE_HELD(&tdp->i_rwlock));
	ASSERT(RW_WRITE_HELD(&tdp->i_contents));
	/*
	 * Allocate a new inode.
	 *
	 * ipref is handed to ufs_ialloc(): dirpref()'s choice for a new
	 * directory, the parent's inode number for everything else.
	 */
	type = vap->va_type;
	if (type == VDIR) {
		ipref = dirpref(tdp);
	} else {
		ipref = tdp->i_number;
	}
	/*
	 * For DE_ATTRDIR va_mode is used as is; otherwise the file type
	 * is folded in with MAKEIMODE().
	 */
	if (op == DE_ATTRDIR)
		imode = vap->va_mode;
	else
		imode = MAKEIMODE(type, vap->va_mode);
	*ipp = NULL;
	err = ufs_ialloc(tdp, ipref, imode, &ip, cr);
	if (err)
		return (err);

	/*
	 * We don't need to grab vfs_dqrwlock here because it is held
	 * in ufs_direnter_*() above us.
	 */
	ASSERT(RW_READ_HELD(&ip->i_ufsvfs->vfs_dqrwlock));
	rw_enter(&ip->i_contents, RW_WRITER);
	/* A freshly allocated inode must not have a dquot attached yet. */
	if (ip->i_dquot != NULL) {
		err = ufs_fault(ITOV(ip),
		    "ufs_dirmakeinode, ip->i_dquot != NULL: dquot (%s)",
		    tdp->i_fs->fs_fsmnt);
		rw_exit(&ip->i_contents);
		return (err);
	}
	*ipp = ip;
	ip->i_mode = (o_mode_t)imode;
	if (type == VBLK || type == VCHR) {
		dev_t d = vap->va_rdev;
		dev32_t dev32;

		/*
		 * Don't allow a special file to be created with a
		 * dev_t that cannot be represented by this filesystem
		 * format on disk.
		 */
		if (!cmpldev(&dev32, d)) {
			err = EOVERFLOW;
			goto fail;
		}

		ITOV(ip)->v_rdev = ip->i_rdev = d;

		/*
		 * i_ordev gets the 32-bit form when it has bits outside
		 * the old major/minor ranges, else the cmpdev() form.
		 */
		if (dev32 & ~((O_MAXMAJ << L_BITSMINOR32) | O_MAXMIN)) {
			ip->i_ordev = dev32; /* can't use old format */
		} else {
			ip->i_ordev = cmpdev(d);
		}
	}
	ITOV(ip)->v_type = type;
	ufs_reset_vnode(ip->i_vnode);
	if (type == VDIR) {
		ip->i_nlink = 2; /* anticipating a call to dirmakedirect */
	} else {
		ip->i_nlink = 1;
	}

	/*
	 * NOTE(review): for DE_ATTRDIR the i_gid assigned here is
	 * overwritten by the else branch of the group-id selection
	 * below, because that test requires op != DE_ATTRDIR.  Confirm
	 * whether that is intended.
	 */
	if (op == DE_ATTRDIR) {
		ip->i_uid = vap->va_uid;
		ip->i_gid = vap->va_gid;
	} else
		ip->i_uid = crgetuid(cr);
	/*
	 * To determine the group-id of the created file:
	 *   1) If the gid is set in the attribute list (non-Sun & pre-4.0
	 *	clients are not likely to set the gid), then use it if
	 *	the process is privileged, belongs to the target group,
	 *	or the group is the same as the parent directory.
	 *   2) If the filesystem was not mounted with the Old-BSD-compatible
	 *	GRPID option, and the directory's set-gid bit is clear,
	 *	then use the process's gid.
	 *   3) Otherwise, set the group-id to the gid of the parent directory.
	 */
	if (op != DE_ATTRDIR && (vap->va_mask & AT_GID) &&
	    ((vap->va_gid == tdp->i_gid) || groupmember(vap->va_gid, cr) ||
	    secpolicy_vnode_create_gid(cr) == 0)) {
		/*
		 * XXX - is this only the case when a 4.0 NFS client, or a
		 * client derived from that code, makes a call over the wire?
		 */
		ip->i_gid = vap->va_gid;
	} else
		ip->i_gid = (tdp->i_mode & ISGID) ? tdp->i_gid : crgetgid(cr);

	/*
	 * For SunOS 5.0->5.4, the lines below read:
	 *
	 * ip->i_suid = (ip->i_uid > MAXUID) ? UID_LONG : ip->i_uid;
	 * ip->i_sgid = (ip->i_gid > MAXUID) ? GID_LONG : ip->i_gid;
	 *
	 * where MAXUID was set to 60002.  See notes on this in ufs_inode.c
	 */
	ip->i_suid =
	    (ulong_t)ip->i_uid > (ulong_t)USHRT_MAX ? UID_LONG : ip->i_uid;
	ip->i_sgid =
	    (ulong_t)ip->i_gid > (ulong_t)USHRT_MAX ? GID_LONG : ip->i_gid;

	/*
	 * If we're creating a directory, and the parent directory has the
	 * set-GID bit set, set it on the new directory.
	 * Otherwise, if the user is neither privileged nor a member of the
	 * file's new group, clear the file's set-GID bit.
	 */
	if ((tdp->i_mode & ISGID) && (type == VDIR))
		ip->i_mode |= ISGID;
	else {
		if ((ip->i_mode & ISGID) &&
		    secpolicy_vnode_setids_setgids(cr, ip->i_gid) != 0)
			ip->i_mode &= ~ISGID;
	}

	/* Reject caller-supplied times that overflow, before using them. */
	if (((vap->va_mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) ||
	    ((vap->va_mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) {
		err = EOVERFLOW;
		goto fail;
	}

	/*
	 * Extended attribute directories are not subject to quotas.
	 */
	if (op != DE_ATTRDIR)
		ip->i_dquot = getinoquota(ip);
	else
		ip->i_dquot = NULL;

	/*
	 * Directories get their initial block written now; the last
	 * argument of ufs_dirmakedirect() is its attrdir flag.
	 */
	if (op == DE_MKDIR || op == DE_ATTRDIR) {
		err = ufs_dirmakedirect(ip, tdp, (op == DE_MKDIR) ? 0 : 1, cr);
		if (err)
			goto fail;
	}

	/*
	 * generate the shadow inode and attach it to the new object
	 */
	ASSERT((tdp->i_shadow && tdp->i_ufs_acl) ||
	    (!tdp->i_shadow && !tdp->i_ufs_acl));
	if (tdp->i_shadow && tdp->i_ufs_acl &&
	    (((tdp->i_mode & IFMT) == IFDIR) ||
	    ((tdp->i_mode & IFMT) == IFATTRDIR))) {
		err = ufs_si_inherit(ip, tdp, ip->i_mode, cr);
		if (err) {
			if (op == DE_MKDIR) {
				/*
				 * clean up parent directory
				 *
				 * tdp->i_contents already locked from
				 * ufs_direnter_*()
				 */
				tdp->i_nlink--;
				TRANS_INODE(tdp->i_ufsvfs, tdp);
				tdp->i_flag |= ICHG;
				tdp->i_seq++;
				ufs_iupdat(tdp, I_SYNC);
			}
			goto fail;
		}
	}

	/*
	 * If the passed in attributes contain atime and/or mtime
	 * settings, then use them instead of using the current
	 * high resolution time.
	 */
	if (vap->va_mask & (AT_MTIME|AT_ATIME)) {
		if (vap->va_mask & AT_ATIME) {
			ip->i_atime.tv_sec = vap->va_atime.tv_sec;
			ip->i_atime.tv_usec = vap->va_atime.tv_nsec / 1000;
			ip->i_flag &= ~IACC;
		} else
			ip->i_flag |= IACC;
		if (vap->va_mask & AT_MTIME) {
			ip->i_mtime.tv_sec = vap->va_mtime.tv_sec;
			ip->i_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000;
			/* ctime is still set from the current time. */
			gethrestime(&now);
			if (now.tv_sec > TIME32_MAX) {
				/*
				 * In 2038, ctime sticks forever..
				 */
				ip->i_ctime.tv_sec = TIME32_MAX;
				ip->i_ctime.tv_usec = 0;
			} else {
				ip->i_ctime.tv_sec = now.tv_sec;
				ip->i_ctime.tv_usec = now.tv_nsec / 1000;
			}
			ip->i_flag &= ~(IUPD|ICHG);
			ip->i_flag |= IMODTIME;
		} else
			ip->i_flag |= IUPD|ICHG;
		ip->i_flag |= IMOD;
	} else
		ip->i_flag |= IACC|IUPD|ICHG;
	ip->i_seq++;

	/*
	 * If this is an attribute tag it as one.
	 */
	if ((tdp->i_mode & IFMT) == IFATTRDIR) {
		ip->i_cflags |= IXATTR;
	}

	/*
	 * push inode before its name appears in a directory
	 */
	TRANS_INODE(ip->i_ufsvfs, ip);
	ufs_iupdat(ip, I_SYNC);
	rw_exit(&ip->i_contents);
	return (0);

fail:
	/*
	 * Throw away inode we just allocated.
	 * Note that *ipp still refers to it at this point.
	 */
	ip->i_nlink = 0;
	ufs_setreclaim(ip);
	TRANS_INODE(ip->i_ufsvfs, ip);
	ip->i_flag |= ICHG;
	ip->i_seq++;
	ITIMES_NOLOCK(ip);
	rw_exit(&ip->i_contents);
	return (err);
}
2306 
2307 /*
2308  * Write a prototype directory into the empty inode ip, whose parent is dp.
2309  */
2310 static int
2311 ufs_dirmakedirect(
2312 	struct inode *ip,		/* new directory */
2313 	struct inode *dp,		/* parent directory */
2314 	int	attrdir,
2315 	struct cred *cr)
2316 {
2317 	struct dirtemplate *dirp;
2318 	struct fbuf *fbp;
2319 	int err;
2320 
2321 	ASSERT(RW_WRITE_HELD(&ip->i_contents));
2322 	ASSERT(RW_WRITE_HELD(&dp->i_rwlock));
2323 	ASSERT(RW_WRITE_HELD(&dp->i_contents));
2324 	/*
2325 	 * Allocate space for the directory we're creating.
2326 	 */
2327 	err = BMAPALLOC(ip, (u_offset_t)0, DIRBLKSIZ, cr);
2328 	if (err)
2329 		return (err);
2330 	if (DIRBLKSIZ > dp->i_fs->fs_fsize) {
2331 		err = ufs_fault(ITOV(dp),
2332 "ufs_dirmakedirect: bad fs_fsize, DIRBLKSIZ: %d > dp->i_fs->fs_fsize: %d (%s)",
2333 		    DIRBLKSIZ, dp->i_fs->fs_fsize,
2334 		    dp->i_fs->fs_fsmnt);
2335 		return (err);
2336 	}
2337 	ip->i_size = DIRBLKSIZ;
2338 	TRANS_INODE(ip->i_ufsvfs, ip);
2339 	ip->i_flag |= IUPD|ICHG|IATTCHG;
2340 	ip->i_seq++;
2341 	ITIMES_NOLOCK(ip);
2342 	/*
2343 	 * Update the tdp link count and write out the change.
2344 	 * This reflects the ".." entry we'll soon write.
2345 	 */
2346 	if (dp->i_nlink == MAXLINK)
2347 		return (EMLINK);
2348 	if (attrdir == 0)
2349 		dp->i_nlink++;
2350 	TRANS_INODE(dp->i_ufsvfs, dp);
2351 	dp->i_flag |= ICHG;
2352 	dp->i_seq++;
2353 	ufs_iupdat(dp, I_SYNC);
2354 	/*
2355 	 * Initialize directory with "."
2356 	 * and ".." from static template.
2357 	 *
2358 	 * Since the parent directory is locked, we don't have to
2359 	 * worry about anything changing when we drop the write
2360 	 * lock on (ip).
2361 	 *
2362 	 */
2363 	err = fbread(ITOV(ip), (offset_t)0, (uint_t)ip->i_fs->fs_fsize,
2364 	    S_READ, &fbp);
2365 
2366 	if (err) {
2367 		goto fail;
2368 	}
2369 	dirp = (struct dirtemplate *)fbp->fb_addr;
2370 	/*
2371 	 * Now initialize the directory we're creating
2372 	 * with the "." and ".." entries.
2373 	 */
2374 	*dirp = mastertemplate;			/* structure assignment */
2375 	dirp->dot_ino = (uint32_t)ip->i_number;
2376 	dirp->dotdot_ino = (uint32_t)dp->i_number;
2377 
2378 	err = TRANS_DIR(ip, 0);
2379 	if (err) {
2380 		fbrelse(fbp, S_OTHER);
2381 		goto fail;
2382 	}
2383 
2384 	err = ufs_fbwrite(fbp, ip);
2385 	if (err) {
2386 		goto fail;
2387 	}
2388 
2389 	return (0);
2390 
2391 fail:
2392 	if (attrdir == 0)
2393 		dp->i_nlink--;
2394 	TRANS_INODE(dp->i_ufsvfs, dp);
2395 	dp->i_flag |= ICHG;
2396 	dp->i_seq++;
2397 	ufs_iupdat(dp, I_SYNC);
2398 	return (err);
2399 }
2400 
/*
 * Delete a directory entry.  If oip is nonzero the entry is checked
 * to make sure it still reflects oip.
 *
 * Arguments:
 *	dp	directory the entry is removed from; the caller must
 *		hold dp->i_rwlock as writer.
 *	namep	name of the entry.  An empty name yields ENOENT, "."
 *		yields EINVAL and ".." yields EEXIST.
 *	oip	if non-NULL, the inode the entry is expected to name;
 *		ENOENT is returned if it names a different inode.
 *	cdir	vnode that DR_RMDIR refuses to remove, see check (b)
 *		below.
 *	op	kind of removal; DR_RMDIR, DR_REMOVE and DR_RENAME are
 *		the values tested for below.
 *	cr	credentials for the access and policy checks.
 *
 * Returns 0 on success, otherwise an error number.  dp->i_rwlock is
 * held as writer again on return, but it is dropped and reacquired
 * each time the retry path is taken.
 */
int
ufs_dirremove(
	struct inode *dp,
	char *namep,
	struct inode *oip,
	struct vnode *cdir,
	enum dr_op op,
	struct cred *cr)
{
	struct direct *ep, *pep, *nep;
	struct inode *ip;
	vnode_t *dvp, *vp;
	struct ufs_slot slot;
	int namlen;
	int err;
	int mode;
	ushort_t extra;

	namlen = (int)strlen(namep);
	if (namlen == 0) {
		struct fs	*fs = dp->i_fs;

		cmn_err(CE_WARN, "%s: ufs_dirremove: attempted to remove"
		    " nameless file in directory (directory inode %llu)",
		    fs->fs_fsmnt, (u_longlong_t)dp->i_number);
		ASSERT(namlen != 0);

		return (ENOENT);
	}

	/*
	 * return error when removing . and ..
	 */
	if (namep[0] == '.') {
		if (namlen == 1)
			return (EINVAL);
		else if (namlen == 2 && namep[1] == '.') {
			return (EEXIST);	/* SIGH should be ENOTEMPTY */
		}
	}

	ASSERT(RW_WRITE_HELD(&dp->i_rwlock));

retry:
	/*
	 * Check accessibility of directory.
	 */
	if (err = ufs_diraccess(dp, IEXEC|IWRITE, cr))
		return (err);

	/* Per-attempt state; the retry path comes back through here. */
	ip = NULL;
	slot.fbp = NULL;
	slot.status = FOUND;	/* don't need to look for empty slot */
	rw_enter(&dp->i_ufsvfs->vfs_dqrwlock, RW_READER);
	rw_enter(&dp->i_contents, RW_WRITER);

	err = ufs_dircheckforname(dp, namep, namlen, &slot, &ip, cr, 0);
	if (err)
		goto out_novfs;
	if (ip == NULL) {
		err = ENOENT;
		goto out_novfs;
	}
	vp = ITOV(ip);
	if (oip && oip != ip) {
		err = ENOENT;
		goto out_novfs;
	}

	/* mode is tested again on every exit path to undo the locking. */
	mode = ip->i_mode & IFMT;
	if (mode == IFDIR || mode == IFATTRDIR) {

		/*
		 * vn_vfsrlock() prevents races between mount and rmdir.
		 */
		if (vn_vfsrlock(vp)) {
			err = EBUSY;
			goto out_novfs;
		}
		if (vn_mountedvfs(vp) != NULL && op != DR_RENAME) {
			err = EBUSY;
			goto out;
		}
		/*
		 * If we are removing a directory, get a lock on it.
		 * Taking a writer lock prevents a parallel ufs_dirlook from
		 * incorrectly entering a negative cache vnode entry in the dnlc
		 * If the directory is empty, it will stay empty until
		 * we can remove it.
		 */
		if (!rw_tryenter(&ip->i_rwlock, RW_WRITER)) {
			/*
			 * It is possible that a thread in rename would have
			 * acquired this rwlock. To prevent a deadlock we
			 * do a rw_tryenter. If we fail to get the lock
			 * we drop all the locks we have acquired, wait
			 * for 2 ticks and reacquire the
			 * directory's (dp) i_rwlock and try again.
			 * If we don't drop dp's i_rwlock then we will panic
			 * with a "Deadlock: cycle in blocking chain"
			 * since in ufs_dircheckpath we want dp's i_rwlock.
			 * dp is guaranteed to exist since ufs_dirremove is
			 * called after a VN_HOLD(dp) has been done.
			 */
			ufs_dirremove_retry_cnt++;
			vn_vfsunlock(vp);
			if (slot.fbp)
				fbrelse(slot.fbp, S_OTHER);
			rw_exit(&dp->i_contents);
			rw_exit(&dp->i_ufsvfs->vfs_dqrwlock);
			rw_exit(&dp->i_rwlock);
			VN_RELE(vp);
			delay(2);
			rw_enter(&dp->i_rwlock, RW_WRITER);
			goto retry;
		}
	}
	rw_enter(&ip->i_contents, RW_READER);

	/*
	 * Now check the restrictions that apply on sticky directories.
	 */
	if ((err = ufs_sticky_remove_access(dp, ip, cr)) != 0) {
		rw_exit(&ip->i_contents);
		if (mode == IFDIR || mode == IFATTRDIR)
			rw_exit(&ip->i_rwlock);
		goto out;
	}

	if (op == DR_RMDIR) {
		/*
		 * For rmdir(2), some special checks are required.
		 * (a) Don't remove any alias of the parent (e.g. ".").
		 * (b) Don't remove the current directory.
		 * (c) Make sure the entry is (still) a directory.
		 * (d) Make sure the directory is empty.
		 */

		if (dp == ip || vp == cdir)
			err = EINVAL;
		else if (((ip->i_mode & IFMT) != IFDIR) &&
		    ((ip->i_mode & IFMT) != IFATTRDIR))
			err = ENOTDIR;
		else if ((ip->i_nlink > 2) ||
		    !ufs_dirempty(ip, dp->i_number, cr)) {
			err = EEXIST;	/* SIGH should be ENOTEMPTY */
		}

		if (err) {
			rw_exit(&ip->i_contents);
			if (mode == IFDIR || mode == IFATTRDIR)
				rw_exit(&ip->i_rwlock);
			goto out;
		}
	} else if (op == DR_REMOVE)  {
		/*
		 * unlink(2) requires a different check: allow only
		 * privileged users to unlink a directory.
		 */
		if (vp->v_type == VDIR &&
		    secpolicy_fs_linkdir(cr, vp->v_vfsp)) {
			err = EPERM;
			rw_exit(&ip->i_contents);
			rw_exit(&ip->i_rwlock);
			goto out;
		}
	}

	rw_exit(&ip->i_contents);

	/*
	 * Remove the cache'd entry, if any.
	 */
	dvp = ITOV(dp);
	dnlc_remove(dvp, namep);
	/* Zeroing d_ino marks the on-disk slot as free. */
	ep = slot.ep;
	ep->d_ino = 0;

	if (slot.cached) {
		dcanchor_t *dcap = &dp->i_danchor;

		(void) dnlc_dir_rem_entry(dcap, namep, NULL);
		if (((int)ep->d_reclen - (int)DIRSIZ(ep)) >= LDIRSIZ(1)) {
			(void) dnlc_dir_rem_space_by_handle(dcap, slot.offset);
		}
		if (slot.offset & (DIRBLKSIZ - 1)) {
			/*
			 * Collapse new free space into previous entry.
			 * Note, the previous entry has already been
			 * validated in ufs_dircheckforname().
			 */
			ASSERT(slot.size);
			pep = (struct direct *)((char *)ep - slot.size);
			if ((pep->d_ino == 0) &&
			    ((uintptr_t)pep & (DIRBLKSIZ - 1))) {
				dnlc_dir_purge(dcap);
				slot.cached = 0;
				goto nocache;
			}
			if (pep->d_ino) {
				extra = pep->d_reclen - DIRSIZ(pep);
			} else {
				extra = pep->d_reclen;
			}
			if (extra >= LDIRSIZ(1)) {
				(void) dnlc_dir_rem_space_by_handle(dcap,
				    (uint64_t)(slot.offset - slot.size));
			}
			pep->d_reclen += ep->d_reclen;
			(void) dnlc_dir_add_space(dcap, extra + ep->d_reclen,
			    (uint64_t)(slot.offset - slot.size));
			/* adjust the previous pointer in the next entry */
			nep = (struct direct *)((char *)ep + ep->d_reclen);
			if ((uintptr_t)nep & (DIRBLKSIZ - 1)) {
				/*
				 * Not a new block.
				 *
				 * Check the validity of the entry.
				 * If it's bad, then throw away the cache and
				 * continue.
				 */
				if ((nep->d_reclen == 0) ||
				    (nep->d_reclen & 0x3) ||
				    (dnlc_dir_update(dcap, nep->d_name,
				    INO_OFF_TO_H(nep->d_ino,
				    slot.offset - slot.size)) == DNOENT)) {
					dnlc_dir_purge(dcap);
					slot.cached = 0;
				}
			}
		} else {
			(void) dnlc_dir_add_space(dcap, ep->d_reclen,
			    (uint64_t)slot.offset);
		}
	} else {
		/*
		 * If the entry isn't the first in the directory, we must
		 * reclaim the space of the now empty record by adding
		 * the record size to the size of the previous entry.
		 */
		if (slot.offset & (DIRBLKSIZ - 1)) {
			/*
			 * Collapse new free space into previous entry.
			 */
			pep = (struct direct *)((char *)ep - slot.size);
			pep->d_reclen += ep->d_reclen;
		}
	}
nocache:


	/*
	 * Write the modified directory block back.  Either way the
	 * buffer is consumed here, so slot.fbp is cleared to keep the
	 * exit code from releasing it a second time.
	 */
	err = TRANS_DIR(dp, slot.offset);
	if (err)
		fbrelse(slot.fbp, S_OTHER);
	else
		err = ufs_fbwrite(slot.fbp, dp);
	slot.fbp = NULL;

	/*
	 * If we were removing a directory, it is 'gone' now, but we cannot
	 * unlock it as a thread may be waiting for the lock in ufs_create. If
	 * we did, it could then create a file in a deleted directory.
	 */

	if (err) {
		if (mode == IFDIR || mode == IFATTRDIR)
			rw_exit(&ip->i_rwlock);
		goto out;
	}

	rw_enter(&ip->i_contents, RW_WRITER);

	dp->i_flag |= IUPD|ICHG;
	dp->i_seq++;
	ip->i_flag |= ICHG;
	ip->i_seq++;

	TRANS_INODE(dp->i_ufsvfs, dp);
	TRANS_INODE(ip->i_ufsvfs, ip);
	/*
	 * Now dispose of the inode.
	 */
	if (ip->i_nlink > 0) {
		/*
		 * This is not done for IFATTRDIR's because they don't
		 * have entries in the dnlc and the link counts are
		 * not incremented when they are created.
		 */
		if (op == DR_RMDIR && (ip->i_mode & IFMT) == IFDIR) {
			/*
			 * Decrement by 2 because we're trashing the "."
			 * entry as well as removing the entry in dp.
			 * Clear the directory entry, but there may be
			 * other hard links so don't free the inode.
			 * Decrement the dp linkcount because we're
			 * trashing the ".." entry.
			 */
			ip->i_nlink -= 2;
			dp->i_nlink--;
			ufs_setreclaim(dp);
			/*
			 * XXX need to discard negative cache entries
			 * for vp.  See comment in ufs_delete().
			 */
			dnlc_remove(vp, ".");
			dnlc_remove(vp, "..");
			/*
			 * The return value is ignored here because if
			 * the directory purge fails we don't want to
			 * stop the delete. If ufs_dirpurgedotdot fails
			 * the delete will continue with the preexisting
			 * behavior.
			 */
			(void) ufs_dirpurgedotdot(ip, dp->i_number, cr);
		} else {
			ip->i_nlink--;
		}
		ufs_setreclaim(ip);
	}
	ITIMES_NOLOCK(dp);
	ITIMES_NOLOCK(ip);

	if (!TRANS_ISTRANS(dp->i_ufsvfs))
		ufs_iupdat(dp, I_SYNC);
	if (!TRANS_ISTRANS(ip->i_ufsvfs))
		ufs_iupdat(ip, I_SYNC);

	rw_exit(&ip->i_contents);
	if (mode == IFDIR || mode == IFATTRDIR)
		rw_exit(&ip->i_rwlock);
out:
	/*
	 * Every jump to "out" happens after vn_vfsrlock() has succeeded
	 * for a directory victim, so undo it here.
	 */
	if (mode == IFDIR || mode == IFATTRDIR) {
		vn_vfsunlock(vp);
	}
out_novfs:
	/* Common exit: release what was taken at "retry". */
	ASSERT(RW_WRITE_HELD(&dp->i_contents));

	if (slot.fbp)
		fbrelse(slot.fbp, S_OTHER);

	rw_exit(&dp->i_contents);
	rw_exit(&dp->i_ufsvfs->vfs_dqrwlock);

	/*
	 * Release (and delete) the inode after we drop vfs_dqrwlock to
	 * avoid deadlock since ufs_delete() grabs vfs_dqrwlock as reader.
	 */
	if (ip)
		VN_RELE(vp);

	return (err);
}
2757 
2758 /*
2759  * Return buffer with contents of block "offset"
2760  * from the beginning of directory "ip".  If "res"
2761  * is non-zero, fill it in with a pointer to the
2762  * remaining space in the directory.
2763  *
2764  */
2765 
2766 int
2767 blkatoff(
2768 	struct inode *ip,
2769 	off_t offset,
2770 	char **res,
2771 	struct fbuf **fbpp)
2772 {
2773 	struct fs *fs;
2774 	struct fbuf *fbp;
2775 	daddr_t lbn;
2776 	uint_t bsize;
2777 	int err;
2778 
2779 	CPU_STATS_ADD_K(sys, ufsdirblk, 1);
2780 	fs = ip->i_fs;
2781 	lbn = (daddr_t)lblkno(fs, offset);
2782 	bsize = (uint_t)blksize(fs, ip, lbn);
2783 	err = fbread(ITOV(ip), (offset_t)(offset & fs->fs_bmask),
2784 	    bsize, S_READ, &fbp);
2785 	if (err) {
2786 		*fbpp = (struct fbuf *)NULL;
2787 		return (err);
2788 	}
2789 	if (res)
2790 		*res = fbp->fb_addr + blkoff(fs, offset);
2791 	*fbpp = fbp;
2792 	return (0);
2793 }
2794 
2795 /*
2796  * Do consistency checking:
2797  *	record length must be multiple of 4
2798  *	entry must fit in rest of its DIRBLKSIZ block
2799  *	record must be large enough to contain entry
2800  *	name is not longer than MAXNAMLEN
2801  *	name must be as long as advertised, and null terminated
2802  * NOTE: record length must not be zero (should be checked previously).
2803  *       This routine is only called if dirchk is true.
2804  *       It would be nice to set the FSBAD flag in the super-block when
2805  *       this routine fails so that a fsck is forced on next reboot,
2806  *       but locking is a problem.
2807  */
2808 static int
2809 dirmangled(
2810 	struct inode *dp,
2811 	struct direct *ep,
2812 	int entryoffsetinblock,
2813 	off_t offset)
2814 {
2815 	int i;
2816 
2817 	i = DIRBLKSIZ - (entryoffsetinblock & (DIRBLKSIZ - 1));
2818 	if ((ep->d_reclen & 0x3) != 0 || (int)ep->d_reclen > i ||
2819 	    (uint_t)ep->d_reclen < DIRSIZ(ep) || ep->d_namlen > MAXNAMLEN ||
2820 	    ep->d_ino && dirbadname(ep->d_name, (int)ep->d_namlen)) {
2821 		dirbad(dp, "mangled entry", offset);
2822 		return (1);
2823 	}
2824 	return (0);
2825 }
2826 
2827 static void
2828 dirbad(struct inode *ip, char *how, off_t offset)
2829 {
2830 	cmn_err(CE_NOTE, "%s: bad dir ino %d at offset %ld: %s",
2831 	    ip->i_fs->fs_fsmnt, (int)ip->i_number, offset, how);
2832 }
2833 
/*
 * Check that the name at sp is exactly l characters long.
 *
 * Returns 1 if a null byte shows up within the first l characters,
 * otherwise the character that follows them: 0 when the name is
 * properly terminated, non-zero when it is not.
 */
static int
dirbadname(char *sp, int l)
{
	char *cp;

	for (cp = sp; l != 0; l--, cp++) {
		if (*cp == '\0')	/* name ends too early */
			return (1);
	}
	return (*cp);			/* must be the terminating null */
}
2844 
2845 /*
2846  * Check if a directory is empty or not.
2847  */
2848 static int
2849 ufs_dirempty(
2850 	struct inode *ip,
2851 	ino_t parentino,
2852 	struct cred *cr)
2853 {
2854 	return (ufs_dirscan(ip, parentino, cr, 0));
2855 }
2856 
2857 /*
2858  * clear the .. directory entry.
2859  */
2860 static int
2861 ufs_dirpurgedotdot(
2862 	struct inode *ip,
2863 	ino_t parentino,
2864 	struct cred *cr)
2865 {
2866 	return (ufs_dirscan(ip, parentino, cr, 1));
2867 }
2868 
2869 /*
2870  * Scan the directoy. If clr_dotdot is true clear the ..
2871  * directory else check to see if the directory is empty.
2872  *
2873  * Using a struct dirtemplate here is not precisely
2874  * what we want, but better than using a struct direct.
2875  *
2876  * clr_dotdot is used as a flag to tell us if we need
2877  * to clear the dotdot entry
2878  *
2879  * N.B.: does not handle corrupted directories.
2880  */
2881 static int
2882 ufs_dirscan(
2883 	struct inode *ip,
2884 	ino_t parentino,
2885 	struct cred *cr,
2886 	int clr_dotdot)
2887 {
2888 	offset_t off;
2889 	struct dirtemplate dbuf;
2890 	struct direct *dp = (struct direct *)&dbuf;
2891 	int err, count;
2892 	int empty = 1;	/* Assume it's empty */
2893 #define	MINDIRSIZ (sizeof (struct dirtemplate) / 2)
2894 
2895 	ASSERT(RW_LOCK_HELD(&ip->i_contents));
2896 
2897 	ASSERT(ip->i_size <= (offset_t)MAXOFF_T);
2898 	for (off = 0; off < ip->i_size; off += dp->d_reclen) {
2899 		err = ufs_rdwri(UIO_READ, FREAD, ip, (caddr_t)dp,
2900 		    (ssize_t)MINDIRSIZ, off, UIO_SYSSPACE, &count, cr);
2901 		/*
2902 		 * Since we read MINDIRSIZ, residual must
2903 		 * be 0 unless we're at end of file.
2904 		 */
2905 		if (err || count != 0 || dp->d_reclen == 0) {
2906 			empty = 0;
2907 			break;
2908 		}
2909 		/* skip empty entries */
2910 		if (dp->d_ino == 0)
2911 			continue;
2912 		/* accept only "." and ".." */
2913 		if (dp->d_namlen > 2 || dp->d_name[0] != '.') {
2914 			empty = 0;
2915 			break;
2916 		}
2917 		/*
2918 		 * At this point d_namlen must be 1 or 2.
2919 		 * 1 implies ".", 2 implies ".." if second
2920 		 * char is also "."
2921 		 */
2922 		if (dp->d_namlen == 1)
2923 			continue;
2924 		if (dp->d_name[1] == '.' &&
2925 		    (ino_t)dp->d_ino == parentino) {
2926 			/*
2927 			 * If we're doing a purge we need to check for
2928 			 * the . and .. entries and clear the d_ino for ..
2929 			 *
2930 			 * if clr_dotdot is set ufs_dirscan does not
2931 			 * check for an empty directory.
2932 			 */
2933 			if (clr_dotdot) {
2934 				/*
2935 				 * Have to actually zap the ..
2936 				 * entry in the directory, as
2937 				 * otherwise someone might have
2938 				 * dp as its cwd and try to
2939 				 * open .., which now points to
2940 				 * an unallocated inode.
2941 				 */
2942 				empty = ufs_dirclrdotdot(ip, parentino);
2943 				break;
2944 			} else {
2945 				continue;
2946 			}
2947 		}
2948 		empty = 0;
2949 		break;
2950 	}
2951 	return (empty);
2952 }
2953 
/*
 * State used by ufs_dircheckpath() when it fails to get an ancestor's
 * i_rwlock: retry_backoff_delay is the argument it passes to delay()
 * before trying again, dircheck_retry_cnt counts those retries.
 */
clock_t retry_backoff_delay = 1; /* delay before retrying the i_rwlock */
uint64_t dircheck_retry_cnt;
/*
 * Check if source directory inode is in the path of the target directory.
 * Target is supplied locked.
 *
 * The source and target inode's should be different upon entry.
 *
 * Arguments:
 *	source_ino	inode number looked for among target's ancestors.
 *	target		directory the upward walk starts from; its
 *			i_rwlock must be held by the caller and is never
 *			released here.
 *	sdp		directory whose i_rwlock the caller also holds;
 *			the walk stops successfully when it reaches it.
 *	cr		credentials passed on to ufs_iget_alloced().
 *
 * Returns:
 *	0	the walk reached UFSROOTINO or sdp without meeting
 *		source_ino
 *	EINVAL	source_ino is target itself or one of the ".." inodes
 *		found on the way up
 *	ENOTDIR	an inode on the way is not a usable directory or has a
 *		mangled ".." entry
 *	EAGAIN	an ancestor's i_rwlock could not be taken, see the
 *		comment at retry_lock
 *	other	error from blkatoff() or ufs_iget_alloced()
 *
 * Every intermediate inode obtained here is unlocked and released
 * again before returning.
 */
int
ufs_dircheckpath(
	ino_t source_ino,
	struct inode *target,
	struct inode *sdp,
	struct cred *cr)
{
	struct fbuf *fbp;
	struct dirtemplate *dirp;
	struct inode *ip;
	struct ufsvfs *ufsvfsp;
	struct inode *tip;
	ino_t dotdotino;
	int err;

	ASSERT(target->i_ufsvfs != NULL);
	ASSERT(RW_LOCK_HELD(&target->i_rwlock));
	ASSERT(RW_LOCK_HELD(&sdp->i_rwlock));

	ip = target;
	if (ip->i_number == source_ino) {
		err = EINVAL;
		goto out;
	}
	if (ip->i_number == UFSROOTINO) {
		err = 0;
		goto out;
	}
	/*
	 * Search back through the directory tree, using the ".." entries.
	 * Fail any attempt to move a directory into an ancestor directory.
	 */
	fbp = NULL;
	for (;;) {
		struct vfs	*vfs;

		/* The first block of ip holds its "." and ".." entries. */
		err = blkatoff(ip, (off_t)0, (char **)&dirp, &fbp);
		if (err)
			break;
		if (((ip->i_mode & IFMT) != IFDIR) || ip->i_nlink == 0 ||
		    ip->i_size < sizeof (struct dirtemplate)) {
			dirbad(ip, "bad size, unlinked or not dir", (off_t)0);
			err = ENOTDIR;
			break;
		}
		if (dirp->dotdot_namlen != 2 ||
		    dirp->dotdot_name[0] != '.' ||
		    dirp->dotdot_name[1] != '.') {
			dirbad(ip, "mangled .. entry", (off_t)0);
			err = ENOTDIR;		/* Sanity check */
			break;
		}
		dotdotino = (ino_t)dirp->dotdot_ino;
		if (dotdotino == source_ino) {
			err = EINVAL;
			break;
		}
		/* Reached the root: err is still 0 from blkatoff(). */
		if (dotdotino == UFSROOTINO)
			break;
		if (fbp) {
			fbrelse(fbp, S_OTHER);
			fbp = NULL;
		}
		/* Remember these before ip may be released below. */
		vfs = ip->i_vfs;
		ufsvfsp = ip->i_ufsvfs;

		if (ip != target) {
			rw_exit(&ip->i_rwlock);
			VN_RELE(ITOV(ip));
		}
		/*
		 * Race to get the inode.
		 */
		rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
		if (err = ufs_iget_alloced(vfs, dotdotino, &tip, cr)) {
			rw_exit(&ufsvfsp->vfs_dqrwlock);
			/* nothing is held any more; skip the cleanup at out */
			ip = NULL;
			break;
		}
		rw_exit(&ufsvfsp->vfs_dqrwlock);
		/*
		 * If the directory of the source inode (also a directory)
		 * is the same as this next entry up the chain, then
		 * we know the source directory itself can't be in the
		 * chain. This also prevents a panic because we already
		 * have sdp->i_rwlock locked.
		 */
		if (tip == sdp) {
			VN_RELE(ITOV(tip));
			ip = NULL;
			break;
		}
		ip = tip;

		/*
		 * If someone has set the WRITE_WANTED bit in this lock and if
		 * this happens to be a sdp or tdp of another parallel rename
		 * which is executing  the same code and in similar situation
		 * we end up in a 4 way deadlock. We need to make sure that
		 * the WRITE_WANTED bit is not  set.
		 */
retry_lock:
		if (!rw_tryenter(&ip->i_rwlock, RW_READER)) {
			/*
			 * If the lock held as WRITER that's fine but if it
			 * has WRITE_WANTED bit set we might end up in a
			 * deadlock. If WRITE_WANTED is set we return
			 * with EAGAIN else we just go back and try.
			 */
			if (RW_ISWRITER(&ip->i_rwlock) &&
			    !(RW_WRITE_HELD(&ip->i_rwlock))) {
				err = EAGAIN;
				if (fbp) {
					fbrelse(fbp, S_OTHER);
				}
				/* ip is not locked here, only held */
				VN_RELE(ITOV(ip));
				return (err);
			} else {
				/*
				 * The lock is being write held. We could
				 * just do a rw_enter here but there is a
				 * window between the check and now, where
				 * the status could have changed, so to
				 * avoid looping we backoff and go back to
				 * try for the lock.
				 */
				delay(retry_backoff_delay);
				dircheck_retry_cnt++;
				goto retry_lock;
			}
		}
	}
	if (fbp) {
		fbrelse(fbp, S_OTHER);
	}
out:
	/*
	 * Drop the lock and hold on the last inode visited, unless it
	 * is the caller's target or has been given up already (NULL).
	 */
	if (ip) {
		if (ip != target) {
			rw_exit(&ip->i_rwlock);
			VN_RELE(ITOV(ip));
		}
	}
	return (err);
}
3106 
3107 int
3108 ufs_xattrdirempty(struct inode *ip, ino_t parentino, struct cred *cr)
3109 {
3110 	offset_t off;
3111 	struct dirtemplate dbuf;
3112 	struct direct *dp = (struct direct *)&dbuf;
3113 	int err, count;
3114 	int empty = 1;	/* Assume it's empty */
3115 #define	MINDIRSIZ (sizeof (struct dirtemplate) / 2)
3116 
3117 	ASSERT(RW_LOCK_HELD(&ip->i_contents));
3118 
3119 	ASSERT(ip->i_size <= (offset_t)MAXOFF_T);
3120 	for (off = 0; off < ip->i_size; off += dp->d_reclen) {
3121 		err = ufs_rdwri(UIO_READ, FREAD, ip, (caddr_t)dp,
3122 		    (ssize_t)MINDIRSIZ, off, UIO_SYSSPACE, &count, cr);
3123 		/*
3124 		 * Since we read MINDIRSIZ, residual must
3125 		 * be 0 unless we're at end of file.
3126 		 */
3127 
3128 		if (err || count != 0 || dp->d_reclen == 0) {
3129 			empty = 0;
3130 			break;
3131 		}
3132 		/* skip empty entries */
3133 		if (dp->d_ino == 0)
3134 			continue;
3135 		/*
3136 		 * At this point d_namlen must be 1 or 2.
3137 		 * 1 implies ".", 2 implies ".." if second
3138 		 * char is also "."
3139 		 */
3140 
3141 		if (dp->d_namlen == 1 && dp->d_name[0] == '.' &&
3142 		    (ino_t)dp->d_ino == parentino)
3143 			continue;
3144 
3145 		if (dp->d_namlen == 2 && dp->d_name[0] == '.' &&
3146 		    dp->d_name[1] == '.') {
3147 			continue;
3148 		}
3149 		empty = 0;
3150 		break;
3151 	}
3152 	return (empty);
3153 }
3154 
3155 
3156 /*
3157  * Allocate and initialize a new shadow inode to contain extended attributes.
3158  */
3159 int
3160 ufs_xattrmkdir(
3161 	struct inode *tdp,
3162 	struct inode **ipp,
3163 	int flags,
3164 	struct cred *cr)
3165 {
3166 	struct inode *ip;
3167 	struct vattr va;
3168 	int err;
3169 	int retry = 1;
3170 	struct ufsvfs *ufsvfsp;
3171 	struct ulockfs *ulp;
3172 	int issync;
3173 	int trans_size;
3174 	int dorwlock;		/* 0 = not yet taken, */
3175 				/* 1 = taken outside the transaction, */
3176 				/* 2 = taken inside the transaction */
3177 
3178 	/*
3179 	 * Validate permission to create attribute directory
3180 	 */
3181 
3182 	if ((err = ufs_iaccess(tdp, IWRITE, cr, 1)) != 0) {
3183 		return (err);
3184 	}
3185 
3186 	if (vn_is_readonly(ITOV(tdp)))
3187 		return (EROFS);
3188 
3189 	/*
3190 	 * No need to re-init err after again:, since it's set before
3191 	 * the next use of it.
3192 	 */
3193 again:
3194 	dorwlock = 0;
3195 	va.va_type = VDIR;
3196 	va.va_uid = tdp->i_uid;
3197 	va.va_gid = tdp->i_gid;
3198 
3199 	if ((tdp->i_mode & IFMT) == IFDIR) {
3200 		va.va_mode = (o_mode_t)IFATTRDIR;
3201 		va.va_mode |= tdp->i_mode & 0777;
3202 	} else {
3203 		va.va_mode = (o_mode_t)IFATTRDIR|0700;
3204 		if (tdp->i_mode & 0040)
3205 			va.va_mode |= 0750;
3206 		if (tdp->i_mode & 0004)
3207 			va.va_mode |= 0705;
3208 	}
3209 	va.va_mask = AT_TYPE|AT_MODE;
3210 
3211 	ufsvfsp = tdp->i_ufsvfs;
3212 
3213 	err = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_MKDIR_MASK);
3214 	if (err)
3215 		return (err);
3216 
3217 	/*
3218 	 * Acquire i_rwlock before TRANS_BEGIN_CSYNC() if this is a file.
3219 	 * This follows the protocol for read()/write().
3220 	 */
3221 	if (ITOV(tdp)->v_type != VDIR) {
3222 		rw_enter(&tdp->i_rwlock, RW_WRITER);
3223 		dorwlock = 1;
3224 	}
3225 
3226 	if (ulp) {
3227 		trans_size = (int)TOP_MKDIR_SIZE(tdp);
3228 		TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_MKDIR, trans_size);
3229 	}
3230 
3231 	/*
3232 	 * Acquire i_rwlock after TRANS_BEGIN_CSYNC() if this is a directory.
3233 	 * This follows the protocol established by
3234 	 * ufs_link/create/remove/rename/mkdir/rmdir/symlink.
3235 	 */
3236 	if (dorwlock == 0) {
3237 		rw_enter(&tdp->i_rwlock, RW_WRITER);
3238 		dorwlock = 2;
3239 	}
3240 	rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
3241 	rw_enter(&tdp->i_contents, RW_WRITER);
3242 
3243 	/*
3244 	 * Suppress out of inodes messages if we will retry.
3245 	 */
3246 	if (retry)
3247 		tdp->i_flag |= IQUIET;
3248 	err = ufs_dirmakeinode(tdp, &ip, &va, DE_ATTRDIR, cr);
3249 	tdp->i_flag &= ~IQUIET;
3250 
3251 	if (err)
3252 		goto fail;
3253 
3254 	if (flags) {
3255 
3256 		/*
3257 		 * Now attach it to src file.
3258 		 */
3259 
3260 		tdp->i_oeftflag = ip->i_number;
3261 	}
3262 
3263 	ip->i_cflags |= IXATTR;
3264 	ITOV(ip)->v_flag |= V_XATTRDIR;
3265 	TRANS_INODE(ufsvfsp, tdp);
3266 	tdp->i_flag |= ICHG | IUPD;
3267 	tdp->i_seq++;
3268 	ufs_iupdat(tdp, I_SYNC);
3269 	rw_exit(&tdp->i_contents);
3270 	rw_exit(&ufsvfsp->vfs_dqrwlock);
3271 
3272 	rw_enter(&ip->i_rwlock, RW_WRITER);
3273 	rw_enter(&ip->i_contents, RW_WRITER);
3274 	TRANS_INODE(ufsvfsp, ip);
3275 	ip->i_flag |= ICHG| IUPD;
3276 	ip->i_seq++;
3277 	ufs_iupdat(ip, I_SYNC);
3278 	rw_exit(&ip->i_contents);
3279 	rw_exit(&ip->i_rwlock);
3280 	if (dorwlock == 2)
3281 		rw_exit(&tdp->i_rwlock);
3282 	if (ulp) {
3283 		int terr = 0;
3284 
3285 		TRANS_END_CSYNC(ufsvfsp, err, issync, TOP_MKDIR, trans_size);
3286 		ufs_lockfs_end(ulp);
3287 		if (err == 0)
3288 			err = terr;
3289 	}
3290 	if (dorwlock == 1)
3291 		rw_exit(&tdp->i_rwlock);
3292 	*ipp = ip;
3293 	return (err);
3294 
3295 fail:
3296 	rw_exit(&tdp->i_contents);
3297 	rw_exit(&ufsvfsp->vfs_dqrwlock);
3298 	if (dorwlock == 2)
3299 		rw_exit(&tdp->i_rwlock);
3300 	if (ulp) {
3301 		TRANS_END_CSYNC(ufsvfsp, err, issync, TOP_MKDIR, trans_size);
3302 		ufs_lockfs_end(ulp);
3303 	}
3304 	if (dorwlock == 1)
3305 		rw_exit(&tdp->i_rwlock);
3306 	if (ip != NULL)
3307 		VN_RELE(ITOV(ip));
3308 
3309 	/*
3310 	 * No inodes?  See if any are tied up in pending deletions.
3311 	 * This has to be done outside of any of the above, because
3312 	 * the draining operation can't be done from inside a transaction.
3313 	 */
3314 	if ((err == ENOSPC) && retry && TRANS_ISTRANS(ufsvfsp)) {
3315 		ufs_delete_drain_wait(ufsvfsp, 1);
3316 		retry = 0;
3317 		goto again;
3318 	}
3319 
3320 	return (err);
3321 }
3322 
3323 /*
3324  * clear the dotdot directory entry.
3325  * Used by ufs_dirscan when clr_dotdot
3326  * flag is set and we're deleting a
3327  * directory.
3328  */
3329 static int
3330 ufs_dirclrdotdot(struct inode *ip, ino_t parentino)
3331 {
3332 	struct fbuf *fbp;
3333 	struct direct *dotp, *dotdotp;
3334 	int err = 0;
3335 
3336 	ASSERT(RW_WRITE_HELD(&ip->i_rwlock));
3337 	ASSERT(RW_LOCK_HELD(&ip->i_contents));
3338 	err = blkatoff(ip, 0, NULL, &fbp);
3339 	if (err) {
3340 		return (err);
3341 	}
3342 
3343 	dotp = (struct direct *)fbp->fb_addr;
3344 	if ((dotp->d_namlen < (MAXNAMLEN + 1)) &&
3345 	    ((DIRBLKSIZ - DIRSIZ(dotp)) >= (sizeof (struct dirtemplate) / 2))) {
3346 		dotdotp = (struct direct *)((char *)dotp + dotp->d_reclen);
3347 		if ((dotdotp->d_namlen < (MAXNAMLEN + 1)) &&
3348 		    ((DIRBLKSIZ - DIRSIZ(dotp)) >= dotdotp->d_reclen)) {
3349 
3350 			dotp->d_reclen += dotdotp->d_reclen;
3351 			if (parentino == dotdotp->d_ino) {
3352 				dotdotp->d_ino = 0;
3353 				dotdotp->d_namlen = 0;
3354 				dotdotp->d_reclen = 0;
3355 			}
3356 
3357 			err = TRANS_DIR(ip, 0);
3358 			if (err) {
3359 				fbrelse(fbp, S_OTHER);
3360 			} else {
3361 				err = ufs_fbwrite(fbp, ip);
3362 			}
3363 		}
3364 	} else {
3365 		err = -1;
3366 	}
3367 	return (err);
3368 }
3369